def get_audios_embeds(self, sess, request_id, audio_body_dic):
    """
    Extract embedding features for a batch of audio files.
    :param audio_body_dic: dict, formatted as {1: (wav, sr), 2: (wav, sr), ...}
    :return: embedding vectors for all audios
    """
    embed_result = {}
    for audio_id in audio_body_dic.keys():
        wav, sr = audio_body_dic[audio_id]
        wav = np.array(wav)
        # Preprocess the audio
        wav = audio_ops.preprocess_wav(wav, source_sr=hp.sampling_rate)
        if len(wav) < min_second_utterance * hp.sampling_rate:
            logger.info(
                'request_id:{} audio_id:{} effective audio length ({}) is shorter than the specified minimum ({})'.format(
                    request_id, audio_id, len(wav) // hp.sampling_rate, min_second_utterance))
        # Split the utterance into partial slices; shape = [batch_size, n_frames, n_channels]
        frames_batch = slice_utterance_mel(wav)
        [partial_embeds] = sess.run(
            [self.svf_model.embeds],
            feed_dict={self.svf_model.inpt_inference: frames_batch})
        # Average the partial embeddings and L2-normalize the result
        raw_embed = np.mean(partial_embeds, axis=0)
        embed = raw_embed / np.linalg.norm(raw_embed, 2)
        embed_result[audio_id] = embed.tolist()
    return embed_result
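# Hedged usage sketch (not from the original source): because the embeddings
# returned above are L2-normalized, speaker similarity between two audios can
# be scored with a plain dot product (equal to cosine similarity). The names
# `service`, `sess`, `wav_a` and `wav_b` below are hypothetical placeholders.
import numpy as np

embeds = service.get_audios_embeds(sess, request_id='req-001',
                                   audio_body_dic={1: (wav_a, 16000),
                                                   2: (wav_b, 16000)})
similarity = float(np.dot(np.array(embeds[1]), np.array(embeds[2])))
print('cosine similarity between audio 1 and audio 2: {:.4f}'.format(similarity))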
def generate_new_conbined_utters(n_speakers, max_speakers, n_spkrs_utters,
                                 path_speakers_audio, path_to_save):
    # Make new wavs by combining utterances from different speakers
    speakers_list = [
        x for x in os.listdir(path_speakers_audio)
        if os.path.isdir(path_speakers_audio + x)
    ]
    np.random.shuffle(speakers_list)
    for speaker in tqdm(speakers_list[:n_speakers]):
        speaker_path = path_speakers_audio + speaker
        if not os.path.isdir(speaker_path):
            continue
        curr_n_speakers = np.random.randint(1, max_speakers + 1)
        rndm_spkrs = np.random.choice(speakers_list, curr_n_speakers)
        wavs = []
        combined_labels = []
        times_between = []
        # Take up to n_spkrs_utters utterances from the main speaker
        for j, speaker_file in enumerate(os.listdir(speaker_path)):
            if j == n_spkrs_utters:
                break
            speaker_file_path = speaker_path + '/' + speaker_file
            spkr_wav = preprocess_wav(speaker_file_path,
                                      sampling_rate=sr,
                                      trim_silence=True)
            wavs.append(spkr_wav)
            combined_labels.append(speaker)
        # Add one random utterance from each randomly chosen extra speaker
        for random_speaker in rndm_spkrs:
            random_speaker_path = path_speakers_audio + random_speaker
            random_spkr_uttr_name = np.random.choice([
                x for x in os.listdir(random_speaker_path) if x[-3:] == 'wav'
            ])
            random_speaker_file_path = random_speaker_path + '/' + random_spkr_uttr_name
            random_spkr_wav = preprocess_wav(random_speaker_file_path,
                                             sampling_rate=sr,
                                             trim_silence=True)
            wavs.append(random_spkr_wav)
            combined_labels.append(random_speaker)
        combined_utters, labels_encoded = combine_utters(
            wavs, combined_labels, sr)
        filename = path_to_save + speaker + '_' + str(len(rndm_spkrs))
        filename += '_' + str(np.random.randint(0, 1e+5)) + '.wav'
        # np.save(filename, list([combined_utters, labels_encoded]))
        save_pickle((combined_utters, labels_encoded), filename)
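# Hedged usage sketch (not from the original source): the paths, sample rate
# and counts below are hypothetical. Note that the function expects directory
# paths that already end with a path separator, since it concatenates strings
# directly instead of using os.path.join.
sr = 16000
generate_new_conbined_utters(n_speakers=50,
                             max_speakers=3,
                             n_spkrs_utters=2,
                             path_speakers_audio='audio_data/speakers/',
                             path_to_save='audio_data/combined/')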
def get_embeds(file_path, slice_len, encoder, sr):
    wav = preprocess_wav(file_path, sampling_rate=sr)
    embedds = []
    slice_len *= sr
    # Ceiling division via double negation, so the final partial slice is kept
    n_slices = int(-np.floor(-wav.shape[0] / slice_len))
    prev_ind = 0
    for i in range(n_slices):
        curr_index = int(prev_ind + slice_len)
        emb = encoder.embed_utterance(wav[prev_ind:curr_index],
                                      return_partials=False,
                                      rate=1.5)
        embedds.append(emb)
        prev_ind = curr_index
    return embedds
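# Hedged usage sketch (assumption, not confirmed by the original source):
# `encoder` is expected to behave like resemblyzer's VoiceEncoder, and
# `preprocess_wav` like the project's own preprocessing helper. The file path
# below is a hypothetical example.
from resemblyzer import VoiceEncoder

encoder = VoiceEncoder()  # loads the pretrained speaker encoder
slice_embeds = get_embeds('audio_data/example.wav',
                          slice_len=0.5,  # seconds per slice
                          encoder=encoder,
                          sr=16000)
print('number of 0.5 s slices embedded:', len(slice_embeds))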
def sample_speaker_wav(speaker_dir, save_dir, sample_num, min_second):
    speaker_dir_wav_cl = [fp for fp in speaker_dir.glob("*.wav")]
    random.shuffle(speaker_dir_wav_cl)
    i = 0
    for fp in speaker_dir_wav_cl:
        wav = audio.preprocess_wav(fp, hp.sampling_rate)
        if i < sample_num and len(wav) >= min_second * hp.sampling_rate:
            save_speaker_dir = Path(save_dir).joinpath(speaker_dir.name)
            save_speaker_dir.mkdir(exist_ok=True)
            shutil.copy(str(fp), str(save_speaker_dir))
            i += 1
    return 1
def get_audios_embeds(self, audio_file):
    wav, sr = audio_ops.load_wav(audio_file)
    wav = np.array(wav)
    # Preprocess the audio
    wav = audio_ops.preprocess_wav(wav, source_sr=hp.sampling_rate)
    if len(wav) < min_second_utterance * hp.sampling_rate:
        print('Effective audio length ({}) is shorter than the specified minimum ({})'.format(
            len(wav) // hp.sampling_rate, min_second_utterance))
    # Split the utterance into partial slices; shape = [batch_size, n_frames, n_channels]
    frames_batch = slice_utterance_mel(wav)
    [partial_embeds] = self.sess.run(
        [self.svf_model.embeds],
        feed_dict={self.svf_model.inpt_inference: frames_batch})
    # Average the partial embeddings and L2-normalize the result
    raw_embed = np.mean(partial_embeds, axis=0)
    embed = raw_embed / np.linalg.norm(raw_embed, 2)
    return embed
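# Hedged sketch (not from the original source): a typical way to use the
# single-file embedding above for speaker verification. `SpeakerModel`, the
# file paths and the 0.75 threshold are hypothetical placeholders.
import numpy as np

model = SpeakerModel()
# Enroll: average, then re-normalize, embeddings from several enrollment files.
enroll = np.mean([model.get_audios_embeds(p)
                  for p in ['enroll_1.wav', 'enroll_2.wav']], axis=0)
enroll = enroll / np.linalg.norm(enroll, 2)
# Verify: the dot product of unit vectors is their cosine similarity.
test = model.get_audios_embeds('test.wav')
is_same_speaker = float(np.dot(enroll, test)) >= 0.75
print('same speaker:', is_same_speaker)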
def combine_utters_from_noisy_dataset(n_speakers, max_speakers, n_spkrs_utters,
                                      path_speakers_audio, path_to_save):
    utters_list = os.listdir(path_speakers_audio)
    nunique_speakers = np.unique([x[:4] for x in utters_list]).shape[0]
    for n in range(nunique_speakers):
        current_n_speakers = np.random.randint(2, max_speakers + 1)
        wavs = []
        combined_labels = []
        curent_utters = np.random.choice(utters_list, size=current_n_speakers)
        for i, utter_path in enumerate(curent_utters):
            spkr_wav = preprocess_wav(path_speakers_audio + utter_path,
                                      sampling_rate=sr,
                                      trim_silence=True)
            wavs.append(spkr_wav)
            speaker = utter_path[:4]
            combined_labels.append(speaker)
        combined_utters, labels_encoded = combine_utters(
            wavs, combined_labels, sr)
        filename = path_to_save + speaker + '_' + str(current_n_speakers)
        filename += '_' + str(np.random.randint(0, 1e+5)) + '.wav'
        # np.save(filename, list([combined_utters, labels_encoded]))
        save_pickle((combined_utters, labels_encoded), filename)
        print('saved', filename)
# Recreate the directory that stores the embedding vectors
shutil.rmtree(save_dir, ignore_errors=True)
Path(save_dir).mkdir(exist_ok=True)
i = 0
for speaker_dir in Path(read_dir).glob("*"):
    # Only speakers absent from the training set are used for testing
    if speaker_dir.name in train_speaker_set:
        print('{} appears in the training set, discarded!'.format(speaker_dir.name))
        continue
    for wav_path in speaker_dir.glob("*.wav"):
        save_speaker_dir = Path(save_dir).joinpath(speaker_dir.name)
        save_speaker_dir.mkdir(exist_ok=True)
        # Preprocess the audio
        wav = audio.preprocess_wav(wav_path, source_sr=hp.sampling_rate)
        if len(wav) < min_second_utterances * hp.sampling_rate:
            continue
        # Split the utterance into partial slices; shape = [batch_size, n_frames, n_channels]
        frames_batch = slice_utterance_mel(wav)
        save_wav_path = str(
            save_speaker_dir.joinpath(
                wav_path.name.replace(".wav", "_{}.npy".format(len(wav)))))
        # Compute the embedding vector for the audio, then save it
        embedding_wav_and_save_vector(frames_batch, save_wav_path)
        i += 1
        if i % 100 == 0:
            print(i)
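# Hedged sketch (assumption, not from the original source): assuming
# embedding_wav_and_save_vector above writes np.save-compatible .npy files
# under <save_dir>/<speaker>/, this loader rebuilds a
# {speaker: [embedding, ...]} mapping for later evaluation.
import numpy as np
from pathlib import Path

speaker_embeds = {}
for speaker_dir in Path(save_dir).glob("*"):
    if not speaker_dir.is_dir():
        continue
    speaker_embeds[speaker_dir.name] = [
        np.load(str(npy_path)) for npy_path in speaker_dir.glob("*.npy")
    ]
print({name: len(vecs) for name, vecs in speaker_embeds.items()})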
save_pickle(data, '../data/data_embeds.dat')

labels_all = []
for i in os.listdir(PATH_TO_SAVE):
    embedds, labels = load_pickle(PATH_TO_SAVE + i)
    labels_all.append(labels)
plt.hist(np.concatenate(labels_all).flatten())

# wav = load_pickle('audio_data/combined/p225_1_592.wav')[0]
wav = preprocess_wav('audio_data/test.m4a', sampling_rate=sr)
play_wav_file(wav, fs=sr)

# !mkdir data/my_test
start_time = time.time()
emb = get_embeds('audio_data/test.m4a',
                 sr=sampling_rate,
                 slice_len=0.5,
                 encoder=encoder)
end_time = time.time()
total_time = end_time - start_time
print(f'embeddings computed in {total_time:.2f} seconds')
save_pickle(emb, 'data/my_test/test_voice_embeddings.dat')
def finalize_dataset(self, min_audio_length=7):
    """
    Trim silence from the concatenated audios, drop clips shorter than
    min_audio_length seconds, and write the remaining wavs, transcripts and
    metadata (csv or json) to the destination directory.

    Parameters:
        min_audio_length: minimum duration in seconds an audio clip must have
            after silence removal to be kept in the dataset.
    """
    tqdm.write(f"Trimming silence from audios in '{self.concat_dir}'.")
    concat_audios = [
        wav for wav in os.listdir(self.concat_dir) if wav.endswith(".wav")
    ]
    concat_txt = [wav.replace(".wav", ".txt") for wav in concat_audios]
    filtered_audios = []
    filtered_txts = []
    for ix in tqdm(range(len(concat_audios))):
        audio = concat_audios[ix]
        wav, sr = librosa.load(os.path.join(self.concat_dir, audio))
        silence_removed = preprocess_wav(wav)
        trimmed_length = silence_removed.shape[0] / sr
        if trimmed_length >= min_audio_length:
            self.len_dataset += trimmed_length
            librosa.output.write_wav(
                os.path.join(self.dest_dir, "wavs", audio), silence_removed, sr
            )
            filtered_audios.append(audio)
            filtered_txts.append(audio.replace(".wav", ".txt"))
    for text in filtered_txts:
        shutil.copyfile(
            os.path.join(self.concat_dir, text),
            os.path.join(self.dest_dir, "txts", text),
        )
    trimmed = []
    for wav, trans in zip(filtered_audios, filtered_txts):
        with open(os.path.join(self.concat_dir, trans)) as f:
            text = f.read().strip()
        trimmed.append([wav, text])
    trimmed = pd.DataFrame(trimmed, columns=["wav_file_name", "transcription"])
    if not self.keep_audio_extension:
        trimmed["wav_file_name"] = trimmed["wav_file_name"].apply(
            lambda x: x.replace(".wav", "")
        )
    if self.output_type == "csv":
        trimmed["transcription_utf"] = trimmed["transcription"]
        trimmed.to_csv(
            os.path.join(self.dest_dir, "metadata.csv"),
            sep="|",
            index=None,
            header=None,
        )
        tqdm.write(
            f"Dataset '{self.name}' has been generated. Wav files are placed in '{self.dest_dir}/wavs'. Transcription files are placed in '{self.dest_dir}/txts'."
        )
        tqdm.write(f"Metadata is placed in '{self.dest_dir}' as 'metadata.csv'.")
    elif self.output_type == "json":
        data = {}
        for ix in range(trimmed.shape[0]):
            name = trimmed.iloc[ix][0]
            text = trimmed.iloc[ix][1]
            data[name] = text
        with open(os.path.join(self.dest_dir, "alignment.json"), "w") as f:
            json.dump(data, f)
        tqdm.write(
            f"Dataset '{self.name}' has been generated. Wav files are placed in '{self.dest_dir}/wavs'. Transcription files are placed in '{self.dest_dir}/txts'."
        )
        tqdm.write(f"Metadata is placed in '{self.dest_dir}' as 'alignment.json'.")
    tqdm.write(
        f"Collected {round(self.len_dataset/3600, 2)} hours ({int(self.len_dataset)} seconds) of audio."
    )
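# Hedged sketch (not from the original source): how the generated metadata can
# be read back. For output_type == "csv" the file is pipe-separated with no
# header, in the column order written above:
# <wav_file_name>|<transcription>|<transcription_utf>.
# The dataset directory below is a hypothetical example path.
import json
import os
import pandas as pd

dest_dir = "datasets/my_dataset"
meta = pd.read_csv(os.path.join(dest_dir, "metadata.csv"), sep="|",
                   header=None,
                   names=["wav_file_name", "transcription", "transcription_utf"])
print(meta.head())

# For output_type == "json", alignment.json maps wav name -> transcription.
with open(os.path.join(dest_dir, "alignment.json")) as f:
    alignment = json.load(f)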