#####################################################################
# Next, we'll set up the block reader to work on short segments of
# audio at a time.

# We'll generate 16 frames at a time, each frame having 4096 samples
# and 50% overlap.
#

n_fft = 4096
hop_length = n_fft // 2

# fill_value pads out the last frame with zeros so that we have a
# full frame at the end of the signal, even if the signal doesn't
# divide evenly into full frames.
sr = librosa.get_samplerate(filename)

stream = librosa.stream(filename, block_length=16,
                        frame_length=n_fft,
                        hop_length=hop_length,
                        mono=True,
                        fill_value=0)

#####################################################################
# For this example, we'll compute PCEN on each block, average over
# frequency, and store the results in a list.

# Make an array to store the frequency-averaged PCEN values
pcen_blocks = []

# Initialize the PCEN filter delays to steady state
zi = None
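#####################################################################
# Now we can iterate over the stream and process each block in turn.
# The loop below is a minimal sketch of that step (assuming numpy is
# imported as np): it relies on librosa.pcen's zi / return_zf
# arguments to carry the filter state across blocks, and it disables
# STFT padding (center=False) so that frames line up with the blocks
# produced by the stream.

for y_block in stream:
    # Compute the STFT (without padding, so that the number of
    # frames matches the block length)
    D = librosa.stft(y_block, n_fft=n_fft, hop_length=hop_length,
                     center=False)

    # Compute PCEN on the magnitude spectrum, carrying the filter
    # delays over from the previous block
    P, zi = librosa.pcen(np.abs(D), sr=sr, hop_length=hop_length,
                         zi=zi, return_zf=True)

    # Average over frequency and store the result
    pcen_blocks.extend(np.mean(P, axis=0))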
def main(pathIN, VADthresh, probThresh, nopREQ, del_temp=True,
         model='pcen_rnn4_cl2_RMED_allARUs_run0.hdf5', recursive=False):
    #%% Parameters
    maxDur = 400  # in seconds

    nop = multiprocessing.cpu_count()
    print(str(int(nop)) + ' cpus found')
    nopREC = np.max([1, nop])
    if nopREQ < nopREC:
        nopUSE = nopREQ
    else:
        nopUSE = nopREC
    print(str(nopUSE) + ' cpus will be used')

    inputWavPath = pathIN
    outputDataPath = pathIN
    durations = []
    wavFileNames = []

    #%% Specify classification model and number of classes
    modelfileName = model
    Nclasses = 2

    #%% Create output folders if needed
    if not os.path.exists(inputWavPath + '/' + 'features'):
        os.mkdir(inputWavPath + '/' + 'features')
    if not os.path.exists(inputWavPath + '/' + 'extracted_segments'):
        os.mkdir(inputWavPath + '/' + 'extracted_segments')

    #%% do the job
    folder_with_recordings = inputWavPath + '/*.wav'
    if recursive:
        wavs = glob.glob(folder_with_recordings.replace('*', '**/*'),
                         recursive=recursive)
    else:
        wavs = glob.glob(folder_with_recordings)

    for wavName in wavs:
        pool = multiprocessing.Pool(nopUSE)
        fileDuration = librosa.get_duration(filename=wavName)
        sr = librosa.get_samplerate(wavName)
        durations.append(fileDuration)
        if sr == 8000:
            wavFileNames.append(wavName)
            if fileDuration < maxDur:
                timeBorders = np.array((0, fileDuration))
            else:
                timeBorders = np.arange(0, fileDuration, maxDur)
                timeBorders = np.delete(timeBorders, -1, axis=None)
                timeBorders = np.append(timeBorders, fileDuration)

            Nsegm = timeBorders.size
            for segmIdx in range(Nsegm - 1):
                pool.apply_async(extract_features,
                                 args=(wavName, outputDataPath, durations,
                                       timeBorders, segmIdx, VADthresh))
            pool.close()
            pool.join()
        else:
            print(wavName.split(os.sep)[-1] +
                  ' has a sampling rate different than 8000 Hz,'
                  ' will not process this audio file')

    #%% Run classifier and extract positive .wav segments
    classify_features(outputDataPath, f"models/{modelfileName}",
                      Nclasses, probThresh)

    if del_temp:
        import shutil
        ftrs = inputWavPath + '/' + 'features'
        if os.path.exists(ftrs):
            try:
                shutil.rmtree(ftrs)
            except Exception as e:
                print('Failed to delete %s. Reason: %s' % (ftrs, e))
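A minimal usage sketch; the folder path and argument values below are illustrative, not taken from the source:

# Hypothetical call: process a folder of 8 kHz recordings and keep
# the temporary feature files for inspection.
main('/data/recordings', VADthresh=0.078, probThresh=0.75, nopREQ=4,
     del_temp=False, recursive=True)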
import math
import sys

import librosa

sys.path.append("./../")
from configs import config

# NOTE: wav_files is assumed to be an iterable of .wav paths collected
# elsewhere in the project; a simple glob over the data folder would do.
for wav_file in wav_files:
    print(wav_file)
    sr = librosa.get_samplerate(wav_file)
    # Round the window length up to the next power of two so the FFT
    # size stays efficient for any sampling rate.
    frame_length = int(math.pow(2, math.ceil(
        math.log2(sr * config.frame_size_in_ms * 0.001))))
    hop_length = int(config.percentage_overlap * frame_length / 100)
    print(sr, frame_length, hop_length)
    stream = librosa.stream(wav_file,
                            block_length=1,
                            frame_length=frame_length,
                            hop_length=hop_length)
    for frame in stream:
        print(list(frame))
        break
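As a concrete check of the power-of-two rounding above (illustrative config values): with sr = 16000 and frame_size_in_ms = 25, the raw window is 16000 * 0.025 = 400 samples, which rounds up to frame_length = 512; with percentage_overlap = 50, hop_length = 512 * 50 / 100 = 256.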
def invert_spectrogram(spectrogram):
    '''
    spectrogram: [f, t]
    '''
    return librosa.istft(spectrogram,
                         hop_length=hparams['hop_length'],
                         win_length=hparams['win_length'],
                         window="hann")


if __name__ == "__main__":
    fpath = sys.argv[1]

    hparams = {}
    hparams['sr'] = librosa.get_samplerate(fpath)
    hparams['n_fft'] = 512                   # FFT points (samples)
    hparams['frame_shift'] = 0.0025          # seconds
    hparams['frame_length'] = 0.01           # seconds
    hparams['hop_length'] = int(hparams['sr'] * hparams['frame_shift'])   # samples
    hparams['win_length'] = int(hparams['sr'] * hparams['frame_length'])  # samples
    hparams['n_mels'] = 80        # number of mel banks to generate
    hparams['power'] = 1.2        # exponent for amplifying the predicted magnitude
    hparams['n_iter'] = 100       # number of inversion iterations
    hparams['preemphasis'] = .97  # or None
    hparams['max_db'] = 100
    hparams['ref_db'] = 20
    hparams['top_db'] = 15
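Since a predicted spectrogram carries only magnitude, phase has to be estimated iteratively before the signal can be reconstructed. The function below is a minimal sketch of that step using librosa's built-in Griffin-Lim; the helper name griffin_lim_invert and the use of hparams['power'] to sharpen the magnitude are assumptions, not part of the original snippet.

def griffin_lim_invert(magnitude):
    # Sharpen the predicted magnitude, then estimate phase with
    # Griffin-Lim and return the time-domain signal.
    return librosa.griffinlim(magnitude ** hparams['power'],
                              n_iter=hparams['n_iter'],
                              hop_length=hparams['hop_length'],
                              win_length=hparams['win_length'])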
def __init__(self, file_path, balance=True, balance_size=1000, augment=0):
    self.ffts = []
    self.labels = []

    heavy_files = os.listdir(file_path + 'bass/')
    high_files = os.listdir(file_path + 'high/')

    # ----------------------------------------------------------------------
    # Balance: for training and intermediate validation; sample so that the
    # positive and negative classes contain the same number of examples.
    if balance:
        # Heavy
        for i in range(balance_size):
            index = random.randint(0, len(heavy_files) - 1)
            beat_file = file_path + '{}/{}'.format('bass/', heavy_files[index])
            if '.wav' not in beat_file:
                continue

            sr = librosa.get_samplerate(beat_file)
            sound, sr = librosa.load(beat_file, sr=sr)
            duration = librosa.get_duration(y=sound, sr=sr)

            # Resample everything to 48 kHz
            sound = signal.resample(sound, int(len(sound) * 48000 / sr))

            # FFT
            freq, fft_abs, fft_beat = utils.fft(sound, len(sound), len(sound))
            fft_padding = np.zeros(FFT_SIZE)
            fft_padding[:len(fft_beat)] = fft_beat[:np.min([FFT_SIZE, len(fft_beat)])]

            self.ffts.append(fft_padding.reshape(FFT_SIZE, 1))
            self.labels.append([1, 0])

        # High
        for i in range(balance_size):
            index = random.randint(0, len(high_files) - 1)
            beat_file = file_path + '{}/{}'.format('high/', high_files[index])
            if '.wav' not in beat_file:
                continue

            sr = librosa.get_samplerate(beat_file)
            sound, sr = librosa.load(beat_file, sr=sr)
            duration = librosa.get_duration(y=sound, sr=sr)

            # Resample everything to 48 kHz
            sound = signal.resample(sound, int(len(sound) * 48000 / sr))

            # FFT
            freq, fft_abs, fft_beat = utils.fft(sound, len(sound), len(sound))
            fft_padding = np.zeros(FFT_SIZE)
            fft_padding[:len(fft_beat)] = fft_beat[:np.min([FFT_SIZE, len(fft_beat)])]

            self.ffts.append(fft_padding.reshape(FFT_SIZE, 1))
            self.labels.append([0, 1])

    # ----------------------------------------------------------------------
    # No balance: for final validation or test; use the dataset as-is.
    else:
        # Heavy
        for index in range(0, len(heavy_files)):
            beat_file = file_path + '{}/{}'.format('bass/', heavy_files[index])
            if '.wav' not in beat_file:
                continue

            sr = librosa.get_samplerate(beat_file)
            sound, sr = librosa.load(beat_file, sr=sr)
            duration = librosa.get_duration(y=sound, sr=sr)

            # Resample everything to 48 kHz
            sound = signal.resample(sound, int(len(sound) * 48000 / sr))

            # FFT
            freq, fft_abs, fft_beat = utils.fft(sound, len(sound), len(sound))
            fft_padding = np.zeros(FFT_SIZE)
            fft_padding[:len(fft_beat)] = fft_beat[:np.min([FFT_SIZE, len(fft_beat)])]

            self.ffts.append(fft_padding.reshape(FFT_SIZE, 1))
            self.labels.append([1, 0])

        # High
        for index in range(0, len(high_files)):
            beat_file = file_path + '{}/{}'.format('high/', high_files[index])
            if '.wav' not in beat_file:
                continue

            sr = librosa.get_samplerate(beat_file)
            sound, sr = librosa.load(beat_file, sr=sr)
            duration = librosa.get_duration(y=sound, sr=sr)

            # Resample everything to 48 kHz
            sound = signal.resample(sound, int(len(sound) * 48000 / sr))

            # FFT
            freq, fft_abs, fft_beat = utils.fft(sound, len(sound), len(sound))
            fft_padding = np.zeros(FFT_SIZE)
            fft_padding[:len(fft_beat)] = fft_beat[:np.min([FFT_SIZE, len(fft_beat)])]

            self.ffts.append(fft_padding.reshape(FFT_SIZE, 1))
            self.labels.append([0, 1])

    print('data len:', np.sum(self.labels, axis=0))
    # ----------------------------------------------------------------------
    self.ffts = np.array(self.ffts)
    self.labels = np.array(self.labels)
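A hypothetical usage sketch; the original snippet shows only the __init__ method, so the class name BeatDataset is assumed here:

# Balanced sampling for training, the raw dataset for final evaluation.
train_set = BeatDataset('./data/', balance=True, balance_size=1000)
test_set = BeatDataset('./data/', balance=False)
print(train_set.ffts.shape, train_set.labels.shape)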
def create_dataset(sample_rate, input_dir, tmp_dir, output_dir):
    video_files = glob(f'{input_dir}/**/*.mp4', recursive=True)

    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(f'{output_dir}/wavs')

    filelist = open(f'{output_dir}/filelist.txt', 'w', encoding='utf-8')
    total_duration = 0

    for video_path in video_files:
        # Load annotation file
        file_name = os.path.splitext(os.path.basename(video_path))[0]
        json_path = video_path.replace('mp4', 'json')
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                annotation = json.load(f)
        except UnicodeDecodeError:
            continue

        # Load the audio track extracted from the video clip
        audio_path = video_path.replace(input_dir, tmp_dir, 1).replace('mp4', 'wav')
        orig_sr = librosa.get_samplerate(audio_path)
        y, sr = librosa.load(audio_path, sr=orig_sr)
        duration = librosa.get_duration(y=y, sr=sr)

        new_sr = sample_rate
        new_y = librosa.resample(y, orig_sr=sr, target_sr=new_sr)

        # Metadata
        n_frames = float(annotation['nr_frame'])
        fps = n_frames / duration

        for frame, frame_data in annotation['data'].items():
            for sub_id, info_data in frame_data.items():
                if 'text' not in info_data.keys():
                    continue

                # Extract data
                text_data = info_data['text']
                speaker_id = info_data['person_id']
                start_frame = text_data['script_start']
                end_frame = text_data['script_end']
                script = refine_text(text_data['script'])
                start_idx = int(float(start_frame) / fps * new_sr)
                end_idx = int(float(end_frame) / fps * new_sr)

                # Write wav
                y_part = new_y[start_idx:end_idx]
                wav_path = os.path.join(
                    os.path.dirname(audio_path).replace(tmp_dir, f'{output_dir}/wavs'),
                    f'{file_name}_{speaker_id}_{start_frame}_{end_frame}.wav')
                wav_path_text = f'/path_to_speech_dataset/wavs/{file_name}_{speaker_id}_{start_frame}_{end_frame}.wav'
                if not os.path.exists(wav_path):
                    os.makedirs(os.path.dirname(wav_path), exist_ok=True)
                    soundfile.write(wav_path, y_part, new_sr)

                # Write filelist
                filelist.write(f'{wav_path_text}|{script}|{speaker_id}\n')
                total_duration += (end_idx - start_idx) / float(new_sr)

    filelist.close()
    print(f'End parsing, total duration: {total_duration}')
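To make the frame-to-sample conversion concrete (illustrative numbers): if an annotation spans frames 150 to 300 of a 30 fps clip, the segment covers 5.0 s to 10.0 s, so with sample_rate = 22050 the slice is new_y[110250:220500].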
def file_to_vector_array_stream_test_data(file_name,
                                          n_mels=128,
                                          frames=5,
                                          n_fft=1024,
                                          hop_length=512,
                                          power=1):
    """
    convert file_name to a vector array.

    file_name : str
        target .wav file

    return : numpy.array( numpy.array( float ) )
        vector array
        * dataset.shape = (dataset_size, feature_vector_length)
    """
    # 01 calculate the number of dimensions
    dims = n_mels * frames

    global tamanhoVec
    global contador

    # 02 generate melspectrogram using librosa, one block at a time
    stream = file_load_stream(file_name)
    sr = librosa.get_samplerate(file_name)
    lista = []
    for n, y in enumerate(stream):
        mel_spectrogram = librosa.feature.melspectrogram(y=y,
                                                         sr=sr,
                                                         n_fft=n_fft,
                                                         hop_length=hop_length,
                                                         n_mels=n_mels,
                                                         power=power)

        # 03 convert melspectrogram to a log scale
        log_mel_spectrogram = 20.0 / power * numpy.log10(mel_spectrogram + sys.float_info.epsilon)

        # 04 calculate total vector size
        # (here this will be len(log_mel_spectrogram[0, :]) times 2, since we
        # will have both the "normal" and the warped version!)
        vector_array_size = len(log_mel_spectrogram[0, :]) - frames + 1
        tamanhoVec.append(vector_array_size)

        # 05 skip blocks that are too short
        if vector_array_size < 1:
            continue

        # 06 generate feature vectors by concatenating multiple frames
        vector_array = numpy.zeros((vector_array_size, dims))
        for t in range(frames):
            vector_array[:, n_mels * t: n_mels * (t + 1)] = \
                log_mel_spectrogram[:, t: t + vector_array_size].T
        lista.append(vector_array)
        contador += 1

    if not lista:
        return numpy.empty((0, dims))

    lista2 = numpy.asarray(lista)
    lista2 = lista2.reshape(lista2.shape[0] * lista2.shape[1], lista2.shape[2])
    return lista2
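A quick shape check of the framing step (illustrative numbers): with n_mels = 128 and frames = 5, dims = 640; a block whose log-mel spectrogram has 32 time steps gives vector_array_size = 32 - 5 + 1 = 28, i.e. a (28, 640) array whose row t concatenates mel columns t through t + 4.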
def refresh_newsounds(self, new_names):
    print('new sounds!')
    new_sounds = []
    true_names = []
    for name in new_names:
        try:
            new_sounds.append(hk.Soundo(name))
            true_names.append(name)
        except Exception:
            print(get_samplerate(name))

    '''data format: [Envelope, FFT, Punch, melspec]
    PCA is pretty fast, so it's no big deal to run it again.
    '''
    if len(true_names) == 0:
        self.refresh_nonew()
        return

    new_envs = []
    new_spectrums = []
    new_punches = []
    new_melspecs = []
    for sound in new_sounds:
        new_envs.append(sound.envelope())
        new_spectrums.append(sound.spectrum())
        new_punches.append(sound.punch())
        new_melspecs.append(sound.melspectrogram())

    new_data = [np.array(fresh)
                for fresh in [new_envs, new_spectrums, new_punches]]
    self.new_data = new_data

    new_melspecs = np.array(new_melspecs)
    print(new_melspecs)
    self.PCAaay(new_melspecs)

    with open(os.getcwd() + '\\HnKdata.pickle', 'rb') as f:
        data = pickle.load(f)
    with open(os.getcwd() + '\\HnKsounds.pickle', 'rb') as f:
        old_sounds = pickle.load(f)

    self.sounds = old_sounds + new_sounds
    for i, dat in enumerate(data):
        data[i] = np.concatenate([data[i], new_data[i]])
    self.data = data

    with open(os.getcwd() + '\\HnK_melspec.pickle', 'rb') as g:
        old_melspec = pickle.load(g)
    all_specs = np.concatenate([old_melspec, new_melspecs])
    self.PCAaay(all_specs)

    with open(os.getcwd() + '\\HnK_melspec.pickle', 'wb') as f1:
        pickle.dump(self.melspecs, f1, pickle.HIGHEST_PROTOCOL)
    with open(os.getcwd() + '\\HnKdata.pickle', 'wb') as f2:
        pickle.dump(self.data, f2, pickle.HIGHEST_PROTOCOL)
    with open(os.getcwd() + '\\HnKfile_list.dat', 'w') as f3:
        for file in true_names:
            f3.write(file + '\n')
    with open(os.getcwd() + '\\HnKsounds.pickle', 'wb') as f4:
        pickle.dump(self.sounds, f4, pickle.HIGHEST_PROTOCOL)

    self.files = true_names
    self.indicies = np.arange(len(self.files))
def main():
    parser = argparse.ArgumentParser(
        description='batch_processor',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('pathIN', help='folder containing audio data')
    parser.add_argument('-u', '--nopREQ', type=int, default=6,
                        help='number of processing units employed')
    parser.add_argument('-t', '--VADthresh', type=float, default=0.078,
                        help='Threshold for VAD detection, default=0.078')
    parser.add_argument('-p', '--probThresh', type=float, default=0.75,
                        help='Threshold for RNN classifier, default=0.75')
    args = parser.parse_args()

    #%% Parameters
    maxDur = 400  # in seconds

    nop = multiprocessing.cpu_count()
    print(str(int(nop)) + ' cpus found')
    nopREC = np.max([1, nop - 1])
    if args.nopREQ < nopREC:
        nopUSE = args.nopREQ
    else:
        nopUSE = nopREC
    print(str(nopUSE) + ' cpus will be used')

    if args.VADthresh >= 0.0779:
        VADthresh = args.VADthresh
        print('Setting VAD threshold to the requested value')
    else:
        VADthresh = 0.078
        print('VAD threshold value is too small, setting it to 0.078')

    if args.probThresh > 1:
        print('Probability threshold should be lower than one; a value '
              'greater than one will not admit any positive decisions')
    probThresh = args.probThresh

    inputWavPath = args.pathIN
    outputDataPath = args.pathIN
    wavFileNames = []

    #%% Specify classification model and number of classes
    modelfileName = 'pcen_rnn4_cl2_RMED_allARUs_run0.hdf5'
    Nclasses = 2

    #%% Create output folders if needed
    if not os.path.exists(inputWavPath + '/' + 'features'):
        os.mkdir(inputWavPath + '/' + 'features')
    if not os.path.exists(inputWavPath + '/' + 'extracted_segments'):
        os.mkdir(inputWavPath + '/' + 'extracted_segments')

    #%% do the job
    folder_with_recordings = inputWavPath + '/*.wav'
    for wavName in glob.glob(folder_with_recordings):
        pool = multiprocessing.Pool(nopUSE)
        fileDuration = librosa.get_duration(filename=wavName)
        sr = librosa.get_samplerate(wavName)
        if sr == 8000:
            wavFileNames.append(wavName)
            if fileDuration < maxDur:
                timeBorders = np.array((0, fileDuration))
            else:
                timeBorders = np.arange(0, fileDuration, maxDur)
                timeBorders = np.delete(timeBorders, -1, axis=None)
                timeBorders = np.append(timeBorders, fileDuration)

            Nsegm = timeBorders.size
            for segmIdx in range(Nsegm - 1):
                pool.apply_async(extract_features,
                                 args=(wavName, outputDataPath,
                                       timeBorders, segmIdx, VADthresh))
            pool.close()
            pool.join()
        else:
            print(wavName.split(os.sep)[-1] +
                  ' has a sampling rate different than 8000 Hz,'
                  ' will not process this audio file')

    #%% Run classifier and extract positive .wav segments
    classify_features(outputDataPath, f"Models/{modelfileName}",
                      Nclasses, probThresh)
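A hypothetical command line, assuming the script is saved as batch_processor.py:

python batch_processor.py /data/recordings -u 4 -t 0.08 -p 0.8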