Example #1
#####################################################################
# Next, we'll set up the block reader to work on short segments of 
# audio at a time.

# We'll generate 16 frames at a time, each frame having 4096 samples
# and 50% overlap.
#

n_fft = 4096
hop_length = n_fft // 2

# fill_value pads out the last frame with zeros so that we have a
# full frame at the end of the signal, even if the signal doesn't
# divide evenly into full frames.
sr = librosa.get_samplerate(filename)

stream = librosa.stream(filename, block_length=16,
                        frame_length=n_fft,
                        hop_length=hop_length,
                        mono=True,
                        fill_value=0)
#####################################################################
# For this example, we'll compute PCEN on each block, average over
# frequency, and store the results in a list.

# Make an array to store the frequency-averaged PCEN values
pcen_blocks = []

# Initialize the PCEN filter delays to steady state
zi = None
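#####################################################################
# A minimal sketch of the processing loop implied by the comments above
# (assuming the setup from this example): iterate over the stream, compute
# PCEN block by block while carrying the filter state `zi` forward, and
# collect the frequency-averaged values.

import numpy as np

for y_block in stream:
    # STFT without centering, since the streamer already delivers full,
    # padded frames.
    D = librosa.stft(y_block, n_fft=n_fft, hop_length=hop_length,
                     center=False)

    # PCEN on the magnitude spectrogram; return_zf=True returns the filter
    # delays so they can be passed on to the next block.
    P, zi = librosa.pcen(np.abs(D), sr=sr, hop_length=hop_length,
                         zi=zi, return_zf=True)

    # Average over frequency and store the per-frame values for this block.
    pcen_blocks.extend(np.mean(P, axis=0))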
Example #2
#####################################################################
# Next, we'll set up the block reader to work on short segments of
# audio at a time.

# We'll generate 16 frames at a time, each frame having 4096 samples
# and 50% overlap.
#

n_fft = 4096
hop_length = n_fft // 2

# fill_value pads out the last frame with zeros so that we have a
# full frame at the end of the signal, even if the signal doesn't
# divide evenly into full frames.
sr = librosa.get_samplerate(filename)

stream = librosa.stream(filename,
                        block_length=16,
                        frame_length=n_fft,
                        hop_length=hop_length,
                        mono=True,
                        fill_value=0)
#####################################################################
# For this example, we'll compute PCEN on each block, average over
# frequency, and store the results in a list.

# Make an array to store the frequency-averaged PCEN values
pcen_blocks = []

# Initialize the PCEN filter delays to steady state
zi = None
Example #3
def main(pathIN,
         VADthresh,
         probThresh,
         nopREQ,
         del_temp=True,
         model='pcen_rnn4_cl2_RMED_allARUs_run0.hdf5',
         recursive=False):
    #%% Parameters
    maxDur = 400  #in seconds
    nop = multiprocessing.cpu_count()
    print(str(int(nop)) + ' CPUs found')
    nopREC = np.max([1, nop])
    if nopREQ < nopREC:
        nopUSE = nopREQ
    else:
        nopUSE = nopREC
    print(str(nopUSE) + ' CPUs will be used')

    inputWavPath = pathIN
    outputDataPath = pathIN
    durations = []
    wavFileNames = []

    #%% Specify classification model and number of classes
    # modelfileName ='pcen_rnn4_cl2_RMED_allARUs_run0.hdf5';
    modelfileName = model

    Nclasses = 2

    # %%
    if not os.path.exists(inputWavPath + '/' + 'features'):
        os.mkdir(inputWavPath + '/' + 'features')

    if not os.path.exists(inputWavPath + '/' + 'extracted_segments'):
        os.mkdir(inputWavPath + '/' + 'extracted_segments')

#%% do the job
    folder_with_recordings = (inputWavPath + '/*.wav')
    if recursive:
        wavs = glob.glob(folder_with_recordings.replace('*', '**/*'),
                         recursive=recursive)
    else:
        wavs = glob.glob(folder_with_recordings)

    for wavName in wavs:
        pool = multiprocessing.Pool(nopUSE)
        fileDuration = librosa.get_duration(filename=wavName)
        sr = librosa.get_samplerate(wavName)
        durations.append(fileDuration)

        if sr == 8000:
            wavFileNames.append(wavName)
            if fileDuration < maxDur:
                timeBorders = np.array((0, fileDuration))
            else:
                timeBorders = np.arange(0, fileDuration, maxDur)
                timeBorders = np.delete(timeBorders, -1, axis=None)
                timeBorders = np.append(timeBorders, fileDuration)

            Nsegm = timeBorders.size

            for segmIdx in range(Nsegm - 1):
                pool.apply_async(extract_features,
                                 args=(wavName, outputDataPath, durations,
                                       timeBorders, segmIdx, VADthresh))

            pool.close()
            pool.join()

        else:
            print(
                wavName.split(os.sep)[-1] +
                ' has a sampling rate other than 8000 Hz; this audio file will not be processed'
            )
#%% Run classifier and extract positive .wav segments
    classify_features(outputDataPath, f"models/{modelfileName}", Nclasses,
                      probThresh)

    if del_temp:
        import shutil
        ftrs = inputWavPath + '/' + 'features'
        if os.path.exists(ftrs):
            try:
                shutil.rmtree(ftrs)
            except Exception as e:
                print('Failed to delete %s. Reason: %s' % (ftrs, e))
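
# A minimal usage sketch (the folder path is a placeholder and the threshold
# values are illustrative assumptions):
if __name__ == '__main__':
    main('/path/to/recordings', VADthresh=0.078, probThresh=0.75, nopREQ=4)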
Example #4
import math
import sys

import librosa

sys.path.append("./../")
from configs import config

# `wav_files` is assumed to be the list of input .wav paths to process; a
# placeholder definition using glob:
import glob
wav_files = sorted(glob.glob('./*.wav'))


for wav_file in wav_files:
    print(wav_file)
    sr = librosa.get_samplerate(wav_file)
    frame_length = int(math.pow(2, math.ceil(math.log2((sr * config.frame_size_in_ms * 0.001)))))
    hop_length = int(config.percentage_overlap * frame_length / 100)
    print(sr, frame_length, hop_length)

    stream = librosa.stream(wav_file, block_length=1, frame_length=frame_length, hop_length=hop_length)
    for frame in stream:
        print(list(frame))
        break
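
# To make the frame-length computation above concrete, a small worked example
# (the sample rate and config values are assumptions for illustration, not
# values read from `configs.config`):
example_sr = 16000             # assumed sample rate
frame_size_in_ms = 25          # assumed config.frame_size_in_ms
percentage_overlap = 50        # assumed config.percentage_overlap

# 16000 samples/s * 0.025 s = 400 samples; the next power of two is 2**9 = 512
example_frame_length = int(math.pow(2, math.ceil(math.log2(example_sr * frame_size_in_ms * 0.001))))
# 50% overlap of 512 samples gives a hop of 256 samples
example_hop_length = int(percentage_overlap * example_frame_length / 100)
print(example_frame_length, example_hop_length)  # 512 256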
Example #5
import sys

import librosa

def invert_spectrogram(spectrogram):
    '''
    spectrogram: [f, t]
    '''
    return librosa.istft(spectrogram,
                         hop_length=hparams['hop_length'],
                         win_length=hparams['win_length'],
                         window="hann")


if __name__ == "__main__":

    fpath = sys.argv[1]

    hparams = {}
    hparams['sr'] = librosa.get_samplerate(fpath)
    hparams['n_fft'] = 512  # fft points (samples)
    hparams['frame_shift'] = 0.0025  # seconds
    hparams['frame_length'] = 0.01  # seconds
    hparams['hop_length'] = int(hparams['sr'] *
                                hparams['frame_shift'])  # samples.
    hparams['win_length'] = int(hparams['sr'] *
                                hparams['frame_length'])  # samples.
    hparams['n_mels'] = 80  # Number of Mel banks to generate
    hparams['power'] = 1.2  # Exponent for amplifying the predicted magnitude
    hparams['n_iter'] = 100  # Number of inversion iterations
    hparams['preemphasis'] = .97  # or None
    hparams['max_db'] = 100
    hparams['ref_db'] = 20
    hparams['top_db'] = 15
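
    # A minimal sketch of how these settings might be used (assumed usage, not
    # part of the original script): load the file, take an STFT with the
    # parameters above, and invert it back to audio with the helper defined
    # earlier.
    y, _ = librosa.load(fpath, sr=hparams['sr'])
    S = librosa.stft(y,
                     n_fft=hparams['n_fft'],
                     hop_length=hparams['hop_length'],
                     win_length=hparams['win_length'])
    y_inv = invert_spectrogram(S)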
Example #6
 def __init__(self, file_path, balance=True, balance_size=1000, augment=0):
     #self.sounds = []
     self.ffts = []
     self.labels = []
     heavy_files = os.listdir(file_path + 'bass/')
     high_files = os.listdir(file_path + 'high/')
     # light_files = os.listdir(file_path + 'beats_light/')
     # ----------------------------------------------------------------------------------
     # Balance: for training and intermediate validation. Balance the numbers
     # of positive and negative samples to be equal.
     if balance:
         # Heavy
         for i in range(balance_size):
             index = random.randint(0, len(heavy_files) - 1)
             beat_file = file_path + '{}/{}'.format('bass/',
                                                    heavy_files[index])
             if '.wav' not in beat_file:
                 continue
             sr = librosa.get_samplerate(beat_file)
             sound, sr = librosa.load(beat_file, sr=sr)
             duration = librosa.get_duration(y=sound, sr=sr)
             # print(beat_file, sr, duration, len(sound))
             #
             if sr != 48000:
                 # print('resampling:', beat_file, sr)
                 sound = signal.resample(sound,
                                         int(len(sound) * 48000 / sr))
             #
             # FFT
             freq, fft_abs, fft_beat = utils.fft(sound, len(sound),
                                                 len(sound))
             fft_padding = np.zeros(FFT_SIZE)
             fft_padding[:len(fft_beat)] = fft_beat[:np.min(
                 [FFT_SIZE, len(fft_beat)])]
             #
             # self.sounds.append(sound_padding.reshape(len(sound), 1))
             self.ffts.append(fft_padding.reshape(FFT_SIZE, 1))
             self.labels.append([1, 0])
         #
         # High
         for i in range(balance_size):
             index = random.randint(0, len(high_files) - 1)
             beat_file = file_path + '{}/{}'.format('high/',
                                                    high_files[index])
             if '.wav' not in beat_file:
                 continue
             sr = librosa.get_samplerate(beat_file)
             sound, sr = librosa.load(beat_file, sr=sr)
             duration = librosa.get_duration(y=sound, sr=sr)
             # print(beat_file, sr, duration, len(sound))
             #
             if sr != 48000:
                 # print('resampling:', beat_file, sr)
                 sound = signal.resample(sound,
                                         int(len(sound) * 48000 / sr))
             #
             # FFT
             freq, fft_abs, fft_beat = utils.fft(sound, len(sound),
                                                 len(sound))
             fft_padding = np.zeros(FFT_SIZE)
             fft_padding[:len(fft_beat)] = fft_beat[:np.min(
                 [FFT_SIZE, len(fft_beat)])]
             #
             self.ffts.append(fft_padding.reshape(FFT_SIZE, 1))
             self.labels.append([0, 1])
     #
     # No-balance: for final validation or testing. Use the dataset as-is.
     else:
         # Heavy
         for index in range(0, len(heavy_files)):
             beat_file = file_path + '{}/{}'.format('bass/',
                                                    heavy_files[index])
             if '.wav' not in beat_file:
                 continue
             sr = librosa.get_samplerate(beat_file)
             sound, sr = librosa.load(beat_file, sr=sr)
             duration = librosa.get_duration(y=sound, sr=sr)
             # print(beat_file, sr, duration, len(sound))
             #
             if sr != 48000:
                 # print('resampling:', beat_file, sr)
                 sound = signal.resample(sound,
                                         int(len(sound) * 48000 / sr))
             #
             # FFT
             freq, fft_abs, fft_beat = utils.fft(sound, len(sound),
                                                 len(sound))
             fft_padding = np.zeros(FFT_SIZE)
             fft_padding[:len(fft_beat)] = fft_beat[:np.min(
                 [FFT_SIZE, len(fft_beat)])]
             #
             self.ffts.append(fft_padding.reshape(FFT_SIZE, 1))
             self.labels.append([1, 0])
         #
         # High
         for index in range(0, len(high_files)):
             beat_file = file_path + '{}/{}'.format('high/',
                                                    high_files[index])
             if '.wav' not in beat_file:
                 continue
             sr = librosa.get_samplerate(beat_file)
             sound, sr = librosa.load(beat_file, sr=sr)
             duration = librosa.get_duration(y=sound, sr=sr)
             # print(beat_file, sr, duration, len(sound))
             #
             if sr != 48000:
                 # print('resampling:', beat_file, sr)
                 sound = signal.resample(sound,
                                         int(len(sound) * 48000 / sr))
             #
             # FFT
             freq, fft_abs, fft_beat = utils.fft(sound, len(sound),
                                                 len(sound))
             fft_padding = np.zeros(FFT_SIZE)
             fft_padding[:len(fft_beat)] = fft_beat[:np.min(
                 [FFT_SIZE, len(fft_beat)])]
             #
             self.ffts.append(fft_padding.reshape(FFT_SIZE, 1))
             self.labels.append([0, 1])
     #
     print('data len:', np.sum(self.labels, axis=0))
     # -------------------------------------------------------------------------------------
     self.ffts = np.array(self.ffts)
     self.labels = np.array(self.labels)
Example #7
def create_dataset(sample_rate, input_dir, tmp_dir, output_dir):
    video_files = glob(f'{input_dir}/**/*.mp4', recursive=True)

    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(f'{output_dir}/wavs')
    filelist = open(f'{output_dir}/filelist.txt', 'w', encoding='utf-8')
    total_duration = 0

    for video_path in video_files:
        # Load annotation file
        file_name = os.path.splitext(os.path.basename(video_path))[0]
        json_path = video_path.replace('mp4', 'json')
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                annotation = json.load(f)
        except UnicodeDecodeError:
            continue

        # Load video clip
        audio_path = video_path.replace(input_dir, tmp_dir,
                                        1).replace('mp4', 'wav')
        orig_sr = librosa.get_samplerate(audio_path)
        y, sr = librosa.load(audio_path, sr=orig_sr)
        duration = librosa.get_duration(y=y, sr=sr)
        new_sr = sample_rate
        new_y = librosa.resample(y, orig_sr=sr, target_sr=new_sr)

        # Metadata
        n_frames = float(annotation['nr_frame'])
        fps = n_frames / duration

        for frame, frame_data in annotation['data'].items():
            for sub_id, info_data in frame_data.items():
                if 'text' not in info_data.keys():
                    continue

                # Extract data
                text_data = info_data['text']
                speaker_id = info_data['person_id']
                start_frame = text_data['script_start']
                end_frame = text_data['script_end']
                script = refine_text(text_data['script'])

                start_idx = int(float(start_frame) / fps * new_sr)
                end_idx = int(float(end_frame) / fps * new_sr)

                # Write wav
                y_part = new_y[start_idx:end_idx]
                wav_path = os.path.join(
                    os.path.dirname(audio_path).replace(
                        tmp_dir, f'{output_dir}/wavs'),
                    f'{file_name}_{speaker_id}_{start_frame}_{end_frame}.wav')
                wav_path_text = f'/path_to_speech_dataset/wavs/{file_name}_{speaker_id}_{start_frame}_{end_frame}.wav'
                if not os.path.exists(wav_path):
                    os.makedirs(os.path.dirname(wav_path), exist_ok=True)
                    soundfile.write(wav_path, y_part, new_sr)

                    # Write filelist
                    filelist.write(f'{wav_path_text}|{script}|{speaker_id}\n')
                    total_duration += (end_idx - start_idx) / float(new_sr)

    filelist.close()
    print(f'End parsing, total duration: {total_duration}')
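
# A hypothetical invocation (the paths are placeholders; the .wav track for
# each .mp4 is assumed to have been extracted into tmp_dir beforehand,
# mirroring the input directory layout):
#
#   create_dataset(22050, 'videos', 'tmp_audio', 'speech_dataset')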
Example #8
def file_to_vector_array_stream_test_data(file_name,
                                          n_mels=128,
                                          frames=5,
                                          n_fft=1024,
                                          hop_length=512,
                                          power=1):
    """
    convert file_name to a vector array.

    file_name : str
        target .wav file

    return : numpy.array( numpy.array( float ) )
        vector array
        * dataset.shape = (dataset_size, feature_vector_length)
    """
    # 01 calculate the number of dimensions

    dims = n_mels * frames
    global tamanhoVec
    global contador
    # 02 generate melspectrogram using librosa
    stream = file_load_stream(file_name)
    sr = librosa.get_samplerate(file_name)
    lista = []
    for n, y in enumerate(stream):
        mel_spectrogram = librosa.feature.melspectrogram(y=y,
                                                         sr=sr,
                                                         n_fft=n_fft,
                                                         hop_length=hop_length,
                                                         n_mels=n_mels,
                                                         power=power)

        # 03 convert melspectrogram

        log_mel_spectrogram = 20.0 / power * numpy.log10(mel_spectrogram + sys.float_info.epsilon)

        # 04 calculate total vector size
        vector_array_size = len(log_mel_spectrogram[0, :]) - frames + 1  # here this will be
        # len(log_mel_spectrogram[0, :])

        # times 2, since we will have both the "normal" and the warped version!

        tamanhoVec.append(vector_array_size)
        # 05 skip too short clips
        if vector_array_size < 1:
            return numpy.empty((0, dims))

        # 06 generate feature vectors by concatenating multiframes
        vector_array = numpy.zeros((vector_array_size, dims))

        for t in range(frames):
            vector_array[:, n_mels * t: n_mels * (t + 1)] = log_mel_spectrogram[:, t: t + vector_array_size].T

        lista.append(vector_array)

        contador += 1

    lista2 = numpy.asarray(lista)
    lista2 = lista2.reshape(lista2.shape[0] * lista2.shape[1], lista2.shape[2])
    return lista2
Example #9
 def refresh_newsounds(self, new_names):
     print('new sounds!')
     new_sounds = []
     true_names = []
     for name in new_names:
         try:
             new_sounds.append(hk.Soundo(name))
             true_names.append(name)
         except Exception:
             print(get_samplerate(name))
             pass
     '''data format:
         [Envelope,FFT,Punch,melspec]
         PCA is pretty fast, so it's no big deal to run it again.
     '''
     if len(true_names) == 0:
         self.refresh_nonew()
         return
     new_envs = []
     new_spectrums = []
     new_punches = []
     new_melspecs = []
     for sound in new_sounds:
         new_envs.append(sound.envelope())
         new_spectrums.append(sound.spectrum())
         new_punches.append(sound.punch())
         new_melspecs.append(sound.melspectrogram())
     new_data = [
         np.array(fresh)
         for fresh in [new_envs, new_spectrums, new_punches]
     ]
     self.new_data = new_data
     new_melspecs = np.array(new_melspecs)
     print(new_melspecs)
     self.PCAaay(new_melspecs)
     with open(os.getcwd() + '\\HnKdata.pickle', 'rb') as f:
         data = pickle.load(f)
         f.close()
     with open(os.getcwd() + '\\HnKsounds.pickle', 'rb') as f:
         old_sounds = pickle.load(f)
         f.close()
     self.sounds = old_sounds + new_sounds
     # print(new_data.shape)
     # print(data.shape)
     for i, dat in enumerate(data):
         data[i] = np.concatenate([data[i], new_data[i]])
     self.data = data
     with open(os.getcwd() + '\\HnK_melspec.pickle', 'rb') as g:
         old_melspec = pickle.load(g)
         g.close()
     all_specs = np.concatenate([old_melspec, new_melspecs])
     self.PCAaay(all_specs)
     with open(os.getcwd() + '\\HnK_melspec.pickle', 'wb') as f1:
         pickle.dump(self.melspecs, f1, pickle.HIGHEST_PROTOCOL)
         f1.close()
     with open(os.getcwd() + '\\HnKdata.pickle', 'wb') as f2:
         pickle.dump(self.data, f2, pickle.HIGHEST_PROTOCOL)
         f2.close()
     with open(os.getcwd() + '\\HnKfile_list.dat', 'w') as f3:
         for file in true_names:
             f3.write(file + '\n')
         f3.close()
     with open(os.getcwd() + '\\HnKsounds.pickle', 'wb') as f4:
         # print('should be writing...')
         pickle.dump(self.sounds, f4, pickle.HIGHEST_PROTOCOL)
         f4.close()
     self.files = true_names
     self.indicies = np.arange(len(self.files))
Example #10
def main():

    parser = argparse.ArgumentParser(
        description='batch_processor',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('pathIN', help='folder containing audio data')
    parser.add_argument('-u',
                        '--nopREQ',
                        type=int,
                        default=6,
                        help='number of processing units employed')
    parser.add_argument('-t',
                        '--VADthresh',
                        type=float,
                        default=0.078,
                        help='Threshold for VAD detection, default=0.078')
    parser.add_argument('-p',
                        '--probThresh',
                        type=float,
                        default=0.75,
                        help='Threshold for RNN classifier, default=0.75')
    args = parser.parse_args()

    #%% Parameters
    maxDur = 400  #in seconds
    nop = multiprocessing.cpu_count()
    print(str(int(nop)) + ' CPUs found')
    nopREC = np.max([1, nop - 1])
    if args.nopREQ < nopREC:
        nopUSE = args.nopREQ
    else:
        nopUSE = nopREC
    print(str(nopUSE) + ' CPUs will be used')
    if args.VADthresh >= 0.0779:
        VADthresh = args.VADthresh
        print('Setting VAD threshold value to the requested value')
    else:
        VADthresh = 0.078
        print('VAD threshold value is too small, setting it to 0.078')

    if args.probThresh > 1:
        print(
            'Probability threshold should be lower than one; a value greater than one will never admit a positive decision'
        )
    probThresh = args.probThresh

    inputWavPath = args.pathIN
    outputDataPath = args.pathIN
    wavFileNames = []

    #%% Specify classification model and number of classes
    modelfileName = 'pcen_rnn4_cl2_RMED_allARUs_run0.hdf5'
    Nclasses = 2

    # %%
    if not os.path.exists(inputWavPath + '/' + 'features'):
        os.mkdir(inputWavPath + '/' + 'features')

    if not os.path.exists(inputWavPath + '/' + 'extracted_segments'):
        os.mkdir(inputWavPath + '/' + 'extracted_segments')

#%% do the job
    folder_with_recordings = (inputWavPath + '/*.wav')
    for wavName in glob.glob(folder_with_recordings):
        pool = multiprocessing.Pool(nopUSE)
        fileDuration = librosa.get_duration(filename=wavName)
        sr = librosa.get_samplerate(wavName)
        if sr == 8000:
            wavFileNames.append(wavName)
            if fileDuration < maxDur:
                timeBorders = np.array((0, fileDuration))
            else:
                timeBorders = np.arange(0, fileDuration, maxDur)
                timeBorders = np.delete(timeBorders, -1, axis=None)
                timeBorders = np.append(timeBorders, fileDuration)

            Nsegm = timeBorders.size

            for segmIdx in range(Nsegm - 1):  #range(0,Nsegm-1):
                pool.apply_async(extract_features,
                                 args=(wavName, outputDataPath, timeBorders,
                                       segmIdx, VADthresh))

            pool.close()
            pool.join()
        else:
            print(
                wavName.split(os.sep)[-1] +
                ' has a sampling rate other than 8000 Hz; this audio file will not be processed'
            )

#%% Run classifier and extract positive .wav segments
    classify_features(outputDataPath, f"Models/{modelfileName}", Nclasses,
                      probThresh)
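
# A sketch of a possible entry point and command line (the script name and the
# recordings folder are placeholders; the option defaults match the argparse
# setup above):
#
#   python batch_processor.py /path/to/recordings -u 6 -t 0.078 -p 0.75
if __name__ == '__main__':
    main()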