def __getitem__(self, index):
    while True:
        notnoise = 1
        # Clean files
        if len(self.clean_files) != 0:
            # Randomly sample a clean file
            f = random.choice(self.clean_files)
            fs, audio = read('{}{}/{}_{}.wav'.format(self.clean_root_dir, 'clean', f, 'clean'))
            audio = audio.astype('float32')
            # Randomly sample a clean clip
            r = random.random()
            if r < self.pure_noise and self.flag == 'train':
                normalized_clean = torch.zeros(LEN*self.sr).float()
                notnoise = 0
            else:
                start = random.randint(START*fs, len(audio)-LEN*fs)
                clip = resample(audio[start:start+LEN*fs], fs, self.sr) / 1e5
                if r >= self.pure_noise and np.sum(clip**2) < self.threshold and self.flag == 'train':
                    continue
                mu, sigma = np.mean(clip), np.std(clip)
                normalized_clean = torch.from_numpy((clip-mu)/sigma)
        # Noise files
        if len(self.noise_files) != 0:
            nf = random.choice(self.noise_files)
            audio_noise, fs = sf.read(nf)
            if len(audio_noise.shape) > 1:
                audio_noise = np.mean(audio_noise, axis=1)
            audio_noise = audio_noise.astype('float32')
            # Randomly sample a clip of noise
            if len(audio_noise) < LEN*fs:
                continue
            start = random.randint(0, len(audio_noise)-LEN*fs)
            clip_noise = resample(audio_noise[start:start+LEN*fs], fs, self.sr)
            mu_noise, sigma_noise = np.mean(clip_noise), np.std(clip_noise)
            normalized_noise = torch.from_numpy((clip_noise-mu_noise)/(sigma_noise+EPS))
            # Mix the noise with the clean audio clip at given SNR level
            interference = 10**(-self.snr/20) * normalized_noise
            if r < self.pure_noise and self.flag == 'train':
                mixture = interference
            else:
                mixture = normalized_clean + interference
            mu_mixture, sigma_mixture = torch.mean(mixture), torch.std(mixture)
            mixture = (mixture - mu_mixture) / sigma_mixture
        if len(self.noise_files) != 0:
            if self.flag == 'train':
                return mixture, normalized_clean, notnoise
            if self.flag == 'test':
                return mixture, normalized_clean
        return normalized_clean

def SaveSpectrogram(y_mix, y_vocal, y_inst, fname, original_sr=44100): """extract features and save""" y_mix = resample(y_mix, original_sr, C.SR) y_vocal = resample(y_vocal, original_sr, C.SR) y_inst = resample(y_inst, original_sr, C.SR) S_mix = np.abs(stft(y_mix, n_fft=C.FFT_SIZE, hop_length=C.H)).astype(np.float32) S_vocal = np.abs(stft(y_vocal, n_fft=C.FFT_SIZE, hop_length=C.H)).astype(np.float32) S_inst = np.abs(stft(y_inst, n_fft=C.FFT_SIZE, hop_length=C.H)).astype(np.float32) norm = S_mix.max() S_mix /= norm S_vocal /= norm S_inst /= norm # np.savez(os.path.join(C.PATH_FFT, fname+".npz"), mix=S_mix, vocal=S_vocal, inst=S_inst) # Generate sequence (1,512,128) and save cnt = 1 i = 0 while i + C.PATCH_LENGTH < S_mix.shape[1]: mix_spec = np.zeros((1, 512, C.PATCH_LENGTH), dtype=np.float32) #vocal_spec = np.zeros((1, 512, C.PATCH_LENGTH), dtype=np.float32) inst_spec = np.zeros((1, 512, C.PATCH_LENGTH), dtype=np.float32) mix_spec[0, :, :] = S_mix[1:, i:i + C.PATCH_LENGTH] #vocal_spec[0, :, :] = S_vocal[1:, i:i + C.PATCH_LENGTH] inst_spec[0, :, :] = S_inst[1:, i:i + C.PATCH_LENGTH] np.savez(os.path.join(C.VAL_PATH_FFT, fname + str(cnt) + ".npz"), data=mix_spec, label=inst_spec) i += C.PATCH_LENGTH cnt += 1 if S_mix.shape[1] >= 128: mix_spec = np.zeros((1, 512, C.PATCH_LENGTH), dtype=np.float32) #vocal_spec = np.zeros((1, 512, C.PATCH_LENGTH), dtype=np.float32) inst_spec = np.zeros((1, 512, C.PATCH_LENGTH), dtype=np.float32) mix_spec[0, :, :] = S_mix[1:, S_mix.shape[1] - C.PATCH_LENGTH:S_mix.shape[1]] #vocal_spec[0, :, :] = S_vocal[1:, S_mix.shape[1] - C.PATCH_LENGTH:S_mix.shape[1]] inst_spec[0, :, :] = S_inst[1:, S_mix.shape[1] - C.PATCH_LENGTH:S_mix.shape[1]] np.savez(os.path.join(C.VAL_PATH_FFT, fname + str(cnt) + ".npz"), data=mix_spec, label=inst_spec) cnt += 1
def apply(self, sample, clock=0.0):
    # late binding librosa and its dependencies
    # pre-importing sklearn fixes https://github.com/scikit-learn/scikit-learn/issues/14485
    import sklearn  # pylint: disable=import-outside-toplevel
    from librosa.core import resample  # pylint: disable=import-outside-toplevel
    sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
    rate = pick_value_from_range(self.rate, clock=clock)
    audio = sample.audio
    orig_len = len(audio)
    audio = np.swapaxes(audio, 0, 1)
    audio = resample(audio, sample.audio_format.rate, rate)
    audio = resample(audio, rate, sample.audio_format.rate)
    audio = np.swapaxes(audio, 0, 1)[0:orig_len]
    sample.audio = audio

def validate(hp, args, generator, discriminator, valloader, writer, step):
    generator.eval()
    discriminator.eval()
    torch.backends.cudnn.benchmark = False

    loader = tqdm.tqdm(valloader, desc='Validation loop')
    loss_g_sum = 0.0
    loss_d_sum = 0.0
    for mel, audio in loader:
        mel = mel.cuda()
        audio = audio.cuda()

        # generator
        fake_audio = generator(mel)
        disc_fake = discriminator(fake_audio[:, :, :audio.size(2)])
        disc_real = discriminator(audio)

        loss_g = 0.0
        loss_d = 0.0
        for (feats_fake, score_fake), (feats_real, score_real) in zip(disc_fake, disc_real):
            loss_g += torch.mean(
                torch.sum(torch.pow(score_fake - 1.0, 2), dim=[1, 2]))
            for feat_f, feat_r in zip(feats_fake, feats_real):
                loss_g += hp.model.feat_match * torch.mean(
                    torch.abs(feat_f - feat_r))
            loss_d += torch.mean(
                torch.sum(torch.pow(score_real - 1.0, 2), dim=[1, 2]))
            loss_d += torch.mean(
                torch.sum(torch.pow(score_fake, 2), dim=[1, 2]))

        loss_g_sum += loss_g.item()
        loss_d_sum += loss_d.item()

    loss_g_avg = loss_g_sum / len(valloader.dataset)
    loss_d_avg = loss_d_sum / len(valloader.dataset)

    audio = audio[0][0].cpu().detach().numpy()
    fake_audio = fake_audio[0][0].cpu().detach().numpy()
    audio_16k = resample(audio, hp.audio.sampling_rate, 16000)
    fake_audio_16k = resample(fake_audio, hp.audio.sampling_rate, 16000)
    pesq_score = pesq(16000, audio_16k, fake_audio_16k, 'wb')

    writer.log_validation(loss_g_avg, loss_d_avg, pesq_score,
                          generator, discriminator, audio, fake_audio, step)

    torch.backends.cudnn.benchmark = True

def preprocess_audio(self, x, audio_fps):
    x = ap.to_mono(x.numpy())
    if audio_fps != self.audio_fps:
        x = ap.resample(x, audio_fps, self.audio_fps)
    if x.shape[0] < self.audio_fps:
        x = np.pad(x, (0, self.audio_fps - x.shape[0]))
    return x.reshape((1, -1))

def generate_cqt(file_path, st_status):
    st_status.text('Opening {}'.format(file_path))
    data, sample_rate = auto_load(file_path, sr=None)
    print('Sample Rate:', sample_rate, 'shape:', data.shape)

    if len(data.shape) == 2:
        print('Converting to mono channel...')
        data = to_mono(data)

    st_status.text('Resampling to {} Hz...'.format(TARGET_SAMPLE_RATE))
    downsampled_data = resample(data, orig_sr=sample_rate, target_sr=TARGET_SAMPLE_RATE)
    # downsampled_data = data
    st_status.text('Downsampled to {} Hz, shape is now {}'.format(
        TARGET_SAMPLE_RATE, downsampled_data.shape))

    st_status.text('Generating CQT...')
    cqt_result = np.abs(
        cqt(downsampled_data,
            sr=TARGET_SAMPLE_RATE,
            hop_length=HOP_LENGTH,
            n_bins=TOTAL_BINS,
            bins_per_octave=BINS_PER_OCTAVE))
    return cqt_result

def generate_cqt(i, file_path, offset=0, duration=None):
    print('[{}] Opening'.format(i), file_path)
    data, sample_rate = load(file_path, sr=None, offset=offset, duration=duration)
    print('[{}] Sample Rate:'.format(i), sample_rate, 'shape:', data.shape)

    if len(data.shape) == 2:
        with Timer('[{}] Converted to mono'.format(i)):
            print('[{}] Converting to mono channel...'.format(i))
            data = to_mono(data)

    with Timer('[{}] Resampling'.format(i)):
        print('[{}] Resampling to'.format(i), TARGET_SAMPLE_RATE, 'Hz...')
        downsampled_data = resample(data, orig_sr=sample_rate, target_sr=TARGET_SAMPLE_RATE)
        # downsampled_data = data
        print('[{}] Downsampled to'.format(i), TARGET_SAMPLE_RATE,
              'Hz shape is now', downsampled_data.shape)

    with Timer('[{}] CQT'.format(i)):
        print('[{}] Generating CQT...'.format(i))
        cqt_result = np.abs(
            cqt(downsampled_data,
                sr=TARGET_SAMPLE_RATE,
                hop_length=HOP_LENGTH,
                n_bins=TOTAL_BINS,
                bins_per_octave=BINS_PER_OCTAVE))
    return cqt_result

def separate(PATH_INPUT, PATH_OUTPUT, MODEL, SR=16000, FFT_SIZE=1024, H=512):
    if os.path.isdir(PATH_INPUT):
        # If the input is a directory, build a list of files
        filelist_mixdown = find_files(PATH_INPUT, ext="wav", case_sensitive=True)
    else:
        # If the input is a single file
        filelist_mixdown = [PATH_INPUT]
    print('number of mixdown file', len(filelist_mixdown))

    # Create the output directory if it does not exist.
    _, path_output_ext = os.path.splitext(PATH_OUTPUT)
    print('path_output_ext', path_output_ext)
    if len(path_output_ext) == 0 and not os.path.exists(PATH_OUTPUT):
        os.mkdir(PATH_OUTPUT)

    # Load the model
    unet = train.UNet()
    chainer.serializers.load_npz(MODEL, unet)
    config.train = False
    config.enable_backprop = False

    # Load each mixdown and try to separate the vocal (speech) part
    for fmixdown in filelist_mixdown:
        # If audioread raises an error, fall back to scipy.
        try:
            y_mixdown, _ = load(fmixdown, sr=SR, mono=True)
        except:
            sr_mixdown, y_mixdown = read(fmixdown)
            if not sr_mixdown == SR:
                y_mixdown = resample(y_mixdown, sr_mixdown, SR)

        # Compute the short-time spectrum of the input and normalize it.
        spec = stft(y_mixdown, n_fft=FFT_SIZE, hop_length=H, win_length=FFT_SIZE)
        mag = np.abs(spec)
        mag /= np.max(mag)
        phase = np.exp(1.j*np.angle(spec))
        print('mag.shape', mag.shape)
        start = 0
        # Choose a value no larger than the number of input frames; the right value depends on the network definition.
        end = 128 * (mag.shape[1] // 128)

        # Estimate the mask used to separate the speech (vocal)
        mask = unet(mag[:, start:end][np.newaxis, np.newaxis, 1:, :]).data[0, 0, :, :]
        mask = np.vstack((np.zeros(mask.shape[1], dtype="float32"), mask))

        # Apply the mask to the input short-time spectrum and synthesize the waveform with the inverse FFT.
        mag2 = mag[:, start:end]*mask
        phase2 = phase[:, start:end]
        y = istft(mag2*phase2, hop_length=H, win_length=FFT_SIZE)

        # Save the separated speech (vocal) to an output file.
        if len(path_output_ext) == 0:
            # Output to a directory
            foutname, _ = os.path.splitext(os.path.basename(fmixdown))
            fname = os.path.join(PATH_OUTPUT, (foutname + '.wav'))
        else:
            # Output to the specified file
            fname = PATH_OUTPUT
        print('saving... ', fname)
        write_wav(fname, y, SR, norm=True)

def LoadAudio(fname):
    y, sr = load(fname, sr=None)
    if sr != C.SR:
        y = resample(y, sr, C.SR)
    spec = stft(y, n_fft=C.FFT_SIZE, hop_length=C.H, win_length=C.FFT_SIZE)
    mag = np.abs(spec)
    mag /= np.max(mag)
    phase = np.exp(1.j * np.angle(spec))
    return mag, phase

def transpose(audio_data, rate=1.0, mul=1.0, **kwargs):
    """
    see https://librosa.github.io/librosa/generated/librosa.core.resample.html
    """
    return resample(
        y=audio_data,
        orig_sr=44100.0,  # dummy value for resampler
        target_sr=44100.0 / rate,
        **kwargs) * mul

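# Usage sketch for transpose(), not from the original source: only the
# orig_sr/target_sr ratio matters to the resampler, so rate=2.0 halves the
# number of samples. Played back at the unchanged nominal rate of 44.1 kHz,
# the clip therefore runs twice as fast and sounds one octave higher.
# `y` is assumed to be a mono float array.
octave_up = transpose(y, rate=2.0)
octave_down = transpose(y, rate=0.5)  # twice as long, one octave lower
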
def apply(self, sample, clock=0.0):
    # late binding librosa and its dependencies
    # pre-importing sklearn fixes https://github.com/scikit-learn/scikit-learn/issues/14485
    from librosa.core import resample  # pylint: disable=import-outside-toplevel
    sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
    rate = pick_value_from_range(self.rate, clock=clock)
    audio = sample.audio
    orig_len = len(audio)
    audio = np.swapaxes(audio, 0, 1)
    if audio.shape[0] < 2:
        # since v0.8 librosa enforces a shape of (samples,) instead of (channels, samples) for mono samples
        resampled = resample(audio[0], sample.audio_format.rate, rate)
        audio[0] = resample(resampled, rate, sample.audio_format.rate)[:orig_len]
    else:
        audio = resample(audio, sample.audio_format.rate, rate)
        audio = resample(audio, rate, sample.audio_format.rate)
    audio = np.swapaxes(audio, 0, 1)[0:orig_len]
    sample.audio = audio

def SaveSpectrogram(y_mix, y_vocal, y_inst, fname, original_sr=44100):
    y_mix = resample(y_mix, original_sr, C.SR)
    y_vocal = resample(y_vocal, original_sr, C.SR)
    y_inst = resample(y_inst, original_sr, C.SR)

    S_mix = np.abs(
        stft(y_mix, n_fft=C.FFT_SIZE, hop_length=C.H)).astype(np.float32)
    S_vocal = np.abs(
        stft(y_vocal, n_fft=C.FFT_SIZE, hop_length=C.H)).astype(np.float32)
    S_inst = np.abs(
        stft(y_inst, n_fft=C.FFT_SIZE, hop_length=C.H)).astype(np.float32)

    norm = S_mix.max()
    S_mix /= norm
    S_vocal /= norm
    S_inst /= norm

    np.savez(os.path.join(C.PATH_FFT, fname+".npz"),
             mix=S_mix, vocal=S_vocal, inst=S_inst)

def __getitem__(self, index):
    while True:
        notnoise = 1
        # Randomly sample a file
        f = random.choice(self.files)
        fs, audio = read('{}{}/{}_{}.wav'.format(self.root_dir, self.version[0], f, self.version[0]))
        audio = audio.astype('float32')
        # Randomly sample a clip
        r = random.random()
        is_silence = False
        if r < self.pure_noise and self.flag == 'train':
            start = random.randint(0, START*fs-LEN*fs)
            is_silence = True
            notnoise = 0
        else:
            start = random.randint(START*fs, len(audio)-LEN*fs)
        # Resample the clip
        clip = resample(audio[start:start+LEN*fs], fs, self.sr) / 1e5
        # Thresholding: discard the clip if it contains too much silence
        if not is_silence and np.sum(clip**2) < self.threshold:
            continue
        # Normalize the clip
        mu, sigma = np.mean(clip), np.std(clip)
        normalized = torch.from_numpy((clip-mu)/sigma)
        if len(self.version) > 1:
            fs, audio_clean = read('{}{}/{}_{}.wav'.format(self.root_dir, self.version[1], f, self.version[1]))
            audio_clean = audio_clean.astype('float32')
            # Extract the corresponding clean clip
            if is_silence:
                normalized_clean = torch.zeros(LEN*self.sr).float()
            else:
                clip_clean = resample(audio_clean[start:start+LEN*fs], fs, self.sr)
                mu_clean, sigma_clean = np.mean(clip_clean), np.std(clip_clean)
                normalized_clean = torch.from_numpy((clip_clean-mu_clean)/sigma_clean)
            if self.flag == 'train':
                return normalized, normalized_clean, notnoise
            else:
                return normalized, normalized_clean
        return normalized

def LoadAudio_Arg(fname, n_steps, stretch_rate):
    # Arguments renamed from `pitch_shift`/`time_stretch` so they no longer
    # shadow the librosa functions of the same names called below.
    y, sr = load(fname, sr=C.SR)
    if sr != C.SR:
        y = resample(y, sr, C.SR)
    y = pitch_shift(y, C.SR, n_steps)
    y = time_stretch(y, stretch_rate)
    spec = stft(y, n_fft=C.FFT_SIZE, hop_length=C.H, win_length=C.FFT_SIZE)
    mag = np.abs(spec)
    mag /= np.max(mag)
    phase = np.exp(1.j * np.angle(spec))
    return mag, phase

def downsample_mono(path, sr):
    rate, wav = wavfile.read(path)
    wav = resample(wav.astype(np.float32), rate, sr)
    wav = wav.astype(np.int16)
    # checks stereo and converts to mono if necessary
    try:
        tmp = wav.shape[1]
        wav = (wav[:, 0] + wav[:, 1]) / 2  # average the two channels
    except:
        pass
    return sr, wav

def downsample_mono(path, sr):
    rate, wav = wavfile.read(path)
    wav = wav.astype(np.float32, order='F')
    try:
        tmp = wav.shape[1]
        wav = to_mono(wav.T)
    except:
        pass
    wav = resample(wav, rate, sr)
    wav = wav.astype(np.int16)
    return sr, wav

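# Hypothetical usage of the downsample_mono() helper above (the path and rate
# are illustrative, not from the original source): read a wav file, collapse
# it to mono if it is stereo, and return an int16 signal resampled to 16 kHz.
target_sr, signal = downsample_mono('example.wav', 16000)
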
def SaveSpectrogram(y_mix, y_vocal, y_inst, fname, original_sr=44100):
    y_mix = resample(y_mix, original_sr, C.SR)
    y_vocal = resample(y_vocal, original_sr, C.SR)
    y_inst = resample(y_inst, original_sr, C.SR)

    S_mix = np.abs(stft(y_mix, n_fft=C.FFT_SIZE, hop_length=C.H)).astype(np.float32)
    S_vocal = np.abs(stft(y_vocal, n_fft=C.FFT_SIZE, hop_length=C.H)).astype(np.float32)
    S_inst = np.abs(stft(y_inst, n_fft=C.FFT_SIZE, hop_length=C.H)).astype(np.float32)

    norm = S_mix.max()
    S_mix /= norm
    S_vocal /= norm
    S_inst /= norm

    np.savez(os.path.join(C.PATH_FFT, fname + ".npz"),
             mix=S_mix, vocal=S_vocal, inst=S_inst)

def from_flac_to_tfrecords(train_r=0.8, valid_test_r=0.2):
    # Extract the information about this subset (speakers, chapters)
    # Dictionary with the following shape:
    # {speaker_key: {chapters: [...], sex:'M/F', ... } }
    folder = config.data_root + '/' + config.data_subset
    speakers_info = data_tools.read_metadata(config.data_subset)

    keys_to_index = {}
    for i, key in enumerate(speakers_info.keys()):
        keys_to_index[key] = i

    sex = ['M' for i in range(len(speakers_info))]
    for k, v in speakers_info.items():
        i = keys_to_index[k]
        sex[i] = v['sex']
    np.save('genders_index.arr', sex)
    # exit()

    allfiles = np.array([os.path.join(r, f)
                         for r, dirs, files in os.walk(folder)
                         for f in files if f.endswith(".flac")])
    L = len(allfiles)
    np.random.shuffle(allfiles)
    train = allfiles[:int(L*train_r)]
    valid = allfiles[int(L*train_r):int(L*(train_r+valid_test_r/2))]
    test = allfiles[int(L*(train_r+valid_test_r/2)):]
    print(len(train), len(valid), len(test))

    for group_name, data_split in [("train", train), ("test", test), ("valid", valid)]:
        for s in ['M', 'F']:
            writer = tf.python_io.TFRecordWriter(group_name + '_' + s + '.tfrecords')
            for file in data_split:
                splits = file.split('/')
                key = splits[-3]
                sex = speakers_info[key]['sex']
                if sex == s:
                    raw_audio, sr = load(file, sr=16000)
                    raw_audio = resample(raw_audio, sr, config.fs)
                    raw_audio = raw_audio.astype(np.float32).tostring()
                    feature = tf.train.Example(features=tf.train.Features(
                        feature={
                            'audio': tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw_audio])),
                            'key': tf.train.Feature(int64_list=tf.train.Int64List(value=[keys_to_index[key]]))
                        }))
                    print(group_name, s, key, keys_to_index[key])
                    writer.write(feature.SerializeToString())
            writer.close()

def save_mp3(path, y, sr):
    if sr != 44100:
        y = core.resample(y, sr, 44100)
    yint = (y / np.max(y) * 0.49 * (2**16)).astype(np.int16)
    audio = AudioSegment(data=yint, frame_rate=44100, sample_width=2, channels=1)
    audio.export(path + '.mp3', format='mp3')
    return

def downsample_mono(path, sr):
    obj = wavio.read(path)
    wav = obj.data.astype(np.float32, order='F')
    rate = obj.rate
    try:
        # tmp = wav.shape[1]
        wav = to_mono(wav.T)
    except:
        pass
    wav = resample(wav, rate, sr)
    wav = wav.astype(np.int16)
    return sr, wav

def preprocess_noise(noise_audio, fs_noise, fs):
    """Resample a noise signal from fs_noise to the target sampling rate fs."""
    # Downsample to 16kHz
    if fs != fs_noise:
        noise_audio_resamp = resample(noise_audio, fs_noise, fs)
    else:
        # Already at the target rate; return the signal unchanged
        noise_audio_resamp = noise_audio
    return noise_audio_resamp

def set_samplerate(datadir, words, samplerate):
    # only check samplerate for files in word list
    wav_files = [f for word in words
                 for f in glob.glob(os.path.join('data', word, '*.wav'))]
    print(f'Verifying data is at {samplerate} Hz')
    for wav_file in tqdm(wav_files):
        samples, sr = sf.read(wav_file)
        if sr != samplerate:
            samples = resample(samples, sr, samplerate)
            sf.write(wav_file, samples, samplerate)
    print(f'All Data sampled at {samplerate} Hz')

def downsample_mono(path, sr):
    rate, wav = wavfile.read(path)
    # print(type(wav))
    # np.asfortranarray(wav[0,0:Length-1,:].copy())
    wav = resample(wav.astype(np.float32), rate, sr)
    # print(type(wav))
    wav = wav.astype(np.int16)
    # checks stereo and converts to mono if necessary
    try:
        tmp = wav.shape[1]
        wav = (wav[:, 0] + wav[:, 1]) / 2  # average the two channels
    except:
        pass
    return sr, wav

def SaveSpectrogramA(y_mix, y_target, fname, original_sr=44100, generate_high_data=False):
    if original_sr != C.SR:
        y_mix = resample(y_mix, original_sr, C.SR)
        y_target = resample(y_target, original_sr, C.SR)

    S_mix = np.abs(
        stft(y_mix, n_fft=C.FFT_SIZE, hop_length=C.H, window=C.WINDOW)).astype(np.float32)
    S_target = np.abs(
        stft(y_target, n_fft=C.FFT_SIZE, hop_length=C.H, window=C.WINDOW)).astype(np.float32)

    S_mix_low = S_mix[:C.SP]
    S_target_low = S_target[:C.SP]
    norm = S_mix_low.max()
    S_mix_low /= norm
    S_target_low /= norm
    np.savez(C.PATH_TRAINDATA / (fname + "-low.npz"),
             mix=S_mix_low, target=S_target_low)
    del S_mix_low, S_target_low

    if generate_high_data:
        S_mix_high = S_mix[C.SP:C.SP + C.SP]
        S_target_high = S_target[C.SP:C.SP + C.SP]
        norm = S_mix_high.max()
        S_mix_high /= norm
        S_target_high /= norm
        np.savez(C.PATH_TRAINDATA / (fname + "-high.npz"),
                 mix=S_mix_high, target=S_target_high)

def save_as_torch_file(self):
    """Download the yesno data if it doesn't exist in processed_folder already."""
    import tarfile

    if self._check_exists():
        return

    raw_abs_dir = os.path.join(self.root, self.raw_folder)
    processed_abs_dir = os.path.join(self.root, self.processed_folder)
    # dset_abs_path = os.path.join(
    #     self.root, self.raw_folder)
    dset_abs_path = YOUTUBE_PIANOS_RAW

    # process and save as torch files
    print('Processing...')
    # shutil.copyfile(
    #     os.path.join(dset_abs_path, "README"),
    #     os.path.join(processed_abs_dir, "YESNO_README")
    # )
    audios = [x for x in os.listdir(dset_abs_path) if ".wav" in x]
    print("Found {} audio files".format(len(audios)))

    tensors = []
    labels = []
    lengths = []
    for i, f in enumerate(audios):
        if i >= self.dataset_size:
            break
        print("Reading: {0}".format(f))
        full_path = os.path.join(dset_abs_path, f)
        sig, sr = read_audio(full_path, 44100)
        sig = resample(sig.numpy(), sr, self.sample_rate)
        sig = sig.reshape((1, -1))
        sig = torch.FloatTensor(sig)
        tensors.append(sig)
        lengths.append(sig.size(1))
        labels.append(os.path.basename(f).split(".", 1)[0].split("_"))

    # sort sigs/labels: longest -> shortest
    tensors, labels = zip(*[(b, c) for (a, b, c) in sorted(
        zip(lengths, tensors, labels), key=lambda x: x[0], reverse=True)])
    self.max_len = tensors[0].size(1)
    torch.save((tensors, labels),
               os.path.join(self.processed_folder, self.processed_file))
    print('Done!')

def create_raw_audio_dataset(output_fn, subset=config.data_subset, data_root=config.data_root):
    """
    Create a H5 file from the LibriSpeech dataset and the subset given:

    Inputs:
        output_fn: filename for the created file
        subset: LibriSpeech subset : 'dev-clean' , ...
        data_root: LibriSpeech folder path
    """
    from librosa.core import resample, load

    # Extract the information about this subset (speakers, chapters)
    # Dictionary with the following shape:
    # {speaker_key: {chapters: [...], sex:'M/F', ... } }
    speakers_info = data_tools.read_metadata(subset)
    with h5py.File(output_fn, 'w') as data_file:
        for key, elements in tqdm(speakers_info.items(), total=len(speakers_info), desc='Speakers'):
            if key not in data_file:
                # Create an H5 Group for each key/speaker
                data_file.create_group(key)

            # Current speaker folder path
            folder = data_root + '/' + subset + '/' + key
            # For all the chapters read by this speaker
            for i, chapter in enumerate(tqdm(elements['chapters'], desc='Chapters')):
                # Find all .flac audio
                for root, dirs, files in os.walk(folder + '/' + chapter):
                    for file in tqdm(files, desc='Files'):
                        if file.endswith(".flac"):
                            path = os.path.join(root, file)
                            raw_audio, sr = load(path, sr=16000)
                            raw_audio = resample(raw_audio, sr, config.fs)
                            data_file[key].create_dataset(
                                file,
                                shape=raw_audio.shape,
                                data=raw_audio,
                                chunks=raw_audio.shape,
                                maxshape=raw_audio.shape,
                                compression="gzip",
                                compression_opts=9)

    print('Dataset for the subset: ' + subset + ' has been built')

def downsample(path, down_sample):
    sample_rate, wave = wavfile.read(path)
    wave = wave.astype(np.float32, order='F')
    ## wave, sample_rate = librosa.load(path, sr=args.down_sample, mono=True)
    try:
        tmp = wave.shape[1]
        wave = to_mono(wave.T)
    except:
        pass
    wave = resample(wave, sample_rate, down_sample)
    wave = wave.astype(np.int16)
    return wave, down_sample

def read_wav(wav_path, fs):
    """Read a single-channel wav file from given path. Perform resampling and amp normalization

    :param wav_path: Path where the single-channel wav file is located
    :param fs: Desired sampling rate
    :return: Amp normalized wav at specified sampling rate
    """
    fs_wav, wav = wavfile.read(wav_path)
    wav = wav / np.max(np.abs(wav))
    if fs_wav != fs:
        warnings.warn("Sampling rate of wav file is not {}. Will be resampled".format(fs))
        wav = resample(wav, fs_wav, fs)
    # the returned signal is now at the desired sampling rate fs
    return fs, wav / np.max(np.abs(wav))

def read_test_data():
    data = np.empty((4512, 862, 40))
    for i in np.arange(4512):
        audio = np.load('audio/' + str(i) + '.npy')
        # Resampling to 44100
        audio = core.resample(audio, orig_sr=48000, target_sr=44100)
        audio = audio * 1 / np.max(np.abs(audio))
        spec = melspectrogram(y=audio, sr=44100, n_fft=1024,
                              hop_length=512, n_mels=40)
        spec = spec.T
        data[i, :, :] = spec
    return data

def vad(data, fs, fs_vad=16000, hop_length=30, vad_mode=0):
    # # check data
    # if data.dtype.kind == 'i':
    #     if data.max() > 2**15 - 1 or data.min() < -2**15:
    #         raise ValueError(
    #             'When data.type is int, data must be -32768 < data < 32767.')
    #     data = data.astype('f') / 2.0**15
    # elif data.dtype.kind == 'f':
    #     if np.abs(data).max() > 1:
    #         raise ValueError(
    #             'When data.type is float, data must be -1.0 <= data <= 1.0.')
    #     data = data.astype('f')
    # else:
    #     raise ValueError('data.dtype must be int or float.')

    data = data.squeeze()
    if not data.ndim == 1:
        raise ValueError('data must be mono (1 ch).')

    # resampling
    if fs != fs_vad:
        resampled = resample(data, fs, fs_vad)
        if np.abs(resampled).max() > 1.0:
            resampled *= (0.99 / np.abs(resampled).max())
            # warn('Resampling causes data clipping. data was rescaled.')
    else:
        resampled = data

    resampled = (resampled * 2.0**15).astype('int16')

    hop = fs_vad * hop_length // 1000
    framelen = resampled.size // hop + 1
    padlen = framelen * hop - resampled.size
    paded = np.lib.pad(resampled, (0, padlen), 'constant', constant_values=0)
    framed = frame(paded, frame_length=hop, hop_length=hop).T

    vad = webrtcvad.Vad()
    vad.set_mode(vad_mode)
    valist = [vad.is_speech(tmp.tobytes(), fs_vad) for tmp in framed]

    hop_origin = fs * hop_length // 1000
    va_framed = np.zeros([len(valist), hop_origin])
    va_framed[valist] = 1

    return va_framed.reshape(-1)[:data.size]

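# Hedged usage sketch for the vad() helper above (variable names are
# illustrative, not from the original source): `y` is a mono float signal in
# [-1, 1] loaded at rate `sr`. The function resamples it to 16 kHz for
# webrtcvad and returns a per-sample 0/1 voice-activity mask of the same
# length as `y`, so speech samples can be selected by boolean indexing.
va_mask = vad(y, sr, fs_vad=16000, hop_length=30, vad_mode=2)
speech_only = y[va_mask.astype(bool)]
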
def load_audio_file(file_path):
    data_l = os.listdir(file_path)
    input_length = 16000
    data_ap = []  # resampled mono clips
    x = 1
    for i in data_l:
        rate, data_in = wavfile.read(file_path + i)
        data_in = data_in.astype(np.float32, order='F')
        try:
            tmp = data_in.shape[1]
            data_in = to_mono(data_in.T)
        except:
            pass
        data_in = resample(data_in, rate, 16000)
        data_in = data_in.astype(np.float32)
        data_ap.append(data_in)
        x += 1
    return data_ap