Example #1
    def get_audios_embeds(self, sess, request_id, audio_body_dic):
        """
        获取音频文件的特征
        :param audio_body_dic: dic,格式为:{1:(wav,sr),2:{wav,sr},..}
        :return: 所有音频的embed向量
        """
        embed_result = {}

        for audio_id in audio_body_dic.keys():
            wav, sr = audio_body_dic[audio_id]
            wav = np.array(wav)
            # Preprocess the audio
            wav = audio_ops.preprocess_wav(wav, source_sr=hp.sampling_rate)

            if len(wav) < min_second_utterance * hp.sampling_rate:
                logger.info(
                    'request_id:{} audio_id:{} effective audio length ({}s) is shorter '
                    'than the required minimum ({}s)'.format(
                        request_id, audio_id,
                        len(wav) // hp.sampling_rate, min_second_utterance))

            # Split the utterance into mel slices; shape = [batch_size, n_frames, n_channels]
            frames_batch = slice_utterance_mel(wav)
            [partial_embeds] = sess.run(
                [self.svf_model.embeds],
                feed_dict={self.svf_model.inpt_inference: frames_batch})
            raw_embed = np.mean(partial_embeds, axis=0)
            embed = raw_embed / np.linalg.norm(raw_embed, 2)
            embed_result[audio_id] = embed.tolist()

        return embed_result
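
A minimal usage sketch for this method. The running TensorFlow session `sess`, the instance name `embedder`, and the wav paths are assumptions; only the dict format comes from the docstring:

import librosa

# Load two utterances at the model's sampling rate (placeholder paths).
wav1, sr1 = librosa.load('spk1.wav', sr=hp.sampling_rate)
wav2, sr2 = librosa.load('spk2.wav', sr=hp.sampling_rate)

# Keys are arbitrary audio ids; values are (wav, sr) tuples as the docstring describes.
audio_body_dic = {1: (wav1, sr1), 2: (wav2, sr2)}
embeds = embedder.get_audios_embeds(sess, 'req-001', audio_body_dic)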
Example #2
def generate_new_conbined_utters(n_speakers, max_speakers, n_spkrs_utters,
                                 path_speakers_audio, path_to_save):
    # Build new wavs by combining utterances from different speakers
    speakers_list = [
        x for x in os.listdir(path_speakers_audio)
        if os.path.isdir(path_speakers_audio + x)
    ]
    np.random.shuffle(speakers_list)
    for speaker in tqdm(speakers_list[:n_speakers]):
        speaker_path = path_speakers_audio + speaker
        if not os.path.isdir(speaker_path):
            continue

        curr_n_speakers = np.random.randint(1, max_speakers + 1)
        rndm_spkrs = np.random.choice(speakers_list, curr_n_speakers)
        wavs = []
        combined_labels = []
        times_between = []
        for j, speaker_file in enumerate(os.listdir(speaker_path)):
            if j == n_spkrs_utters:
                break
            speaker_file_path = speaker_path + '/' + speaker_file
            spkr_wav = preprocess_wav(speaker_file_path,
                                      sampling_rate=sr,
                                      trim_silence=True)
            wavs.append(spkr_wav)
            combined_labels.append(speaker)
            for random_speaker in rndm_spkrs:
                random_speaker_path = path_speakers_audio + random_speaker
                random_spkr_uttr_name = np.random.choice([
                    x for x in os.listdir(random_speaker_path)
                    if x[-3:] == 'wav'
                ])
                random_speaker_file_path = random_speaker_path + '/' + random_spkr_uttr_name
                random_spkr_wav = preprocess_wav(random_speaker_file_path,
                                                 sampling_rate=sr,
                                                 trim_silence=True)
                wavs.append(random_spkr_wav)
                combined_labels.append(random_speaker)
        combined_utters, labels_encoded = combine_utters(
            wavs, combined_labels, sr)
        filename = path_to_save + speaker + '_' + str(len(rndm_spkrs))
        filename += '_' + str(np.random.randint(0, 1e+5)) + '.wav'
        # np.save(filename, list([combined_utters, labels_encoded]))
        save_pickle((combined_utters, labels_encoded), filename)
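
A usage sketch with placeholder paths, assuming `sr`, `preprocess_wav`, `combine_utters`, and `save_pickle` are in scope as elsewhere in this listing. Note the function joins paths with '+', so both directories must end with '/':

generate_new_conbined_utters(
    n_speakers=20, max_speakers=3, n_spkrs_utters=5,
    path_speakers_audio='audio_data/speakers/',
    path_to_save='audio_data/combined/')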
Example #3
def get_embeds(file_path, slice_len, encoder, sr):
    wav = preprocess_wav(file_path, sampling_rate=sr)
    embedds = []
    slice_len *= sr
    n_slices = int(-np.floor(-wav.shape[0] / slice_len))  # ceil via the identity ceil(x) = -floor(-x)
    prev_ind = 0
    for i in range(n_slices):
        curr_index = int(prev_ind + slice_len)
        emb = encoder.embed_utterance(wav[prev_ind: curr_index], return_partials=False, rate=1.5)
        embedds.append(emb)
        prev_ind = curr_index
    return embedds
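
The `n_slices` line relies on the identity ceil(x) = -floor(-x), so a trailing partial slice still gets its own embedding. A quick sanity check of that identity:

import numpy as np

x = 3.5  # e.g. 3.5 slices' worth of samples
assert int(-np.floor(-x)) == int(np.ceil(x)) == 4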
Example #4
def sample_speaker_wav(speaker_dir, save_dir, sample_num, min_second):
    speaker_dir_wav_cl = list(speaker_dir.glob("*.wav"))

    random.shuffle(speaker_dir_wav_cl)

    i = 0
    for fp in speaker_dir_wav_cl:
        wav = audio.preprocess_wav(fp, hp.sampling_rate)

        if i < sample_num and len(wav) >= min_second * hp.sampling_rate:
            save_speaker_dir = Path(save_dir).joinpath(speaker_dir.name)
            save_speaker_dir.mkdir(exist_ok=True)

            shutil.copy(str(fp), str(save_speaker_dir))

            i += 1
    return 1
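
A usage sketch, assuming `hp.sampling_rate` is defined as in the other examples; the directory paths are placeholders:

from pathlib import Path

for speaker_dir in Path('data/speakers').glob('*'):
    if speaker_dir.is_dir():
        # Keep at most 5 utterances of at least 3 seconds per speaker.
        sample_speaker_wav(speaker_dir, 'data/sampled', sample_num=5, min_second=3)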
Example #5
    def get_audios_embeds(self, audio_file):

        wav, sr = audio_ops.load_wav(audio_file)
        wav = np.array(wav)
        # Preprocess the audio
        wav = audio_ops.preprocess_wav(wav, source_sr=hp.sampling_rate)

        if len(wav) < min_second_utterance * hp.sampling_rate:
            print('effective audio length ({}s) is shorter than the required '
                  'minimum ({}s)'.format(
                      len(wav) // hp.sampling_rate, min_second_utterance))

        # Split the utterance into mel slices; shape = [batch_size, n_frames, n_channels]
        frames_batch = slice_utterance_mel(wav)
        [partial_embeds] = self.sess.run(
            [self.svf_model.embeds],
            feed_dict={self.svf_model.inpt_inference: frames_batch})
        raw_embed = np.mean(partial_embeds, axis=0)
        embed = raw_embed / np.linalg.norm(raw_embed, 2)

        return embed
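
A one-call sketch (the instance name is hypothetical; `audio_file` can be any path readable by `audio_ops.load_wav`):

embed = embedder.get_audios_embeds('audio_data/test.wav')
print(embed.shape)  # embedding dimensionality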
Example #6
def combine_utters_from_noisy_dataset(n_speakers, max_speakers, n_spkrs_utters,
                                      path_speakers_audio, path_to_save):
    utters_list = os.listdir(path_speakers_audio)
    nunique_speakers = np.unique([x[:4] for x in utters_list]).shape[0]
    for n in range(nunique_speakers):
        current_n_speakers = np.random.randint(2, max_speakers + 1)
        wavs = []
        combined_labels = []
        current_utters = np.random.choice(utters_list, size=current_n_speakers)
        for i, utter_path in enumerate(current_utters):
            spkr_wav = preprocess_wav(path_speakers_audio + utter_path,
                                      sampling_rate=sr,
                                      trim_silence=True)
            wavs.append(spkr_wav)
            speaker = utter_path[:4]  # speaker id is the first 4 chars of the file name
            combined_labels.append(speaker)
        combined_utters, labels_encoded = combine_utters(
            wavs, combined_labels, sr)
        # `speaker` here is the last speaker added in the loop above
        filename = path_to_save + speaker + '_' + str(current_n_speakers)
        filename += '_' + str(np.random.randint(0, 1e+5)) + '.wav'
        # np.save(filename, list([combined_utters, labels_encoded]))
        save_pickle((combined_utters, labels_encoded), filename)
        print('saved', filename)
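
A usage sketch under the same assumptions as Example #2. Note that `n_speakers` and `n_spkrs_utters` are accepted but unused by the body above:

combine_utters_from_noisy_dataset(
    n_speakers=50, max_speakers=3, n_spkrs_utters=10,
    path_speakers_audio='audio_data/noisy/',
    path_to_save='audio_data/combined/')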
Example #7
    # Recreate the directory that stores the embedding vectors
    shutil.rmtree(save_dir, ignore_errors=True)
    Path(save_dir).mkdir(exist_ok=True)

    i = 0
    for speaker_dir in Path(read_dir).glob("*"):
        if speaker_dir.name in train_speaker_set:  # only speakers absent from the training set are used for testing
            print('{} appears in the training set, skipping!'.format(speaker_dir.name))
            continue

        for wav_path in speaker_dir.glob("*.wav"):
            save_speaker_dir = Path(save_dir).joinpath(speaker_dir.name)
            save_speaker_dir.mkdir(exist_ok=True)

            # Preprocess the audio
            wav = audio.preprocess_wav(wav_path, source_sr=hp.sampling_rate)
            if len(wav) < min_second_utterances * hp.sampling_rate:
                continue

            # Split the utterance into mel slices; shape = [batch_size, n_frames, n_channels]
            frames_batch = slice_utterance_mel(wav)

            save_wav_path = str(
                save_speaker_dir.joinpath(
                    wav_path.name.replace(".wav", "_{}.npy".format(len(wav)))))
            # Compute the audio's embedding vector, then save it
            embedding_wav_and_save_vector(frames_batch, save_wav_path)

        i += 1
        if i % 100 == 0:
            print(i)
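
`embedding_wav_and_save_vector` is not shown in this fragment; a minimal sketch of what it might look like, assuming the same `sess`/`svf_model` setup as Examples #1 and #5 (all of these names are assumptions):

def embedding_wav_and_save_vector(frames_batch, save_wav_path):
    # Run the speaker-verification model on the mel slices, then average the
    # partial embeddings into a single L2-normalized utterance embedding.
    [partial_embeds] = sess.run(
        [svf_model.embeds],
        feed_dict={svf_model.inpt_inference: frames_batch})
    raw_embed = np.mean(partial_embeds, axis=0)
    embed = raw_embed / np.linalg.norm(raw_embed, 2)
    np.save(save_wav_path, embed)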
Example #8
        
    save_pickle(data, '../data/data_embeds.dat')
    
    
    labels_all = []
    for i in os.listdir(PATH_TO_SAVE):
        embedds, labels = load_pickle(PATH_TO_SAVE + i)
        labels_all.append(labels)
    
    
    plt.hist(np.concatenate(labels_all).flatten())
    
    
    # wav = load_pickle('audio_data/combined/p225_1_592.wav')[0]
    
    wav = preprocess_wav('audio_data/test.m4a', sampling_rate=sr)
    play_wav_file(wav, fs=sr)
    
    # !mkdir data/my_test
    
    start_time = time.time()
    emb = get_embeds('audio_data/test.m4a', sr=sampling_rate, slice_len=0.5, encoder=encoder)
    end_time = time.time()
    total_time = end_time - start_time 
    print(f'embeddings computed in {total_time:.2f} seconds')
    
    
    save_pickle(emb, 'data/my_test/test_voice_embeddings.dat')
    

    def finalize_dataset(self, min_audio_length=7):
        """
        Trim silence from the concatenated audio files, keep those at least
        `min_audio_length` seconds long, and write the filtered wavs,
        transcripts, and metadata to the destination directory.
        """
        tqdm.write(f"Trimming silence from audios in '{self.concat_dir}'.")

        concat_audios = [
            wav for wav in os.listdir(self.concat_dir) if wav.endswith(".wav")
        ]
        concat_txt = [wav.replace(".wav", ".txt") for wav in concat_audios]

        filtered_audios = []
        filtered_txts = []

        for ix in tqdm(range(len(concat_audios))):
            audio = concat_audios[ix]
            wav, sr = librosa.load(os.path.join(self.concat_dir, audio))
            silence_removed = preprocess_wav(wav)
            trimmed_length = silence_removed.shape[0] / sr
            if trimmed_length >= min_audio_length:
                self.len_dataset += trimmed_length
                librosa.output.write_wav(
                    os.path.join(self.dest_dir, "wavs", audio), silence_removed, sr
                )
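                # Note: librosa.output.write_wav was removed in librosa 0.8;
                # on newer librosa versions an equivalent write is (assuming
                # the soundfile package is installed):
                #   import soundfile as sf
                #   sf.write(os.path.join(self.dest_dir, "wavs", audio),
                #            silence_removed, sr)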
                filtered_audios.append(audio)
                filtered_txts.append(audio.replace(".wav", ".txt"))

        for text in filtered_txts:
            shutil.copyfile(
                os.path.join(self.concat_dir, text),
                os.path.join(self.dest_dir, "txts", text),
            )

        trimmed = []

        for wav, trans in zip(filtered_audios, filtered_txts):
            with open(os.path.join(self.concat_dir, trans)) as f:
                text = f.read().strip()
            trimmed.append([wav, text])

        trimmed = pd.DataFrame(trimmed, columns=["wav_file_name", "transcription"])

        if not self.keep_audio_extension:
            trimmed["wav_file_name"] = trimmed["wav_file_name"].apply(
                lambda x: x.replace(".wav", "")
            )

        if self.output_type == "csv":
            trimmed["transcription_utf"] = trimmed["transcription"]
            trimmed.to_csv(
                os.path.join(self.dest_dir, "metadata.csv"),
                sep="|",
                index=None,
                header=None,
            )
            tqdm.write(
                f"Dataset '{self.name}' has been generated. Wav files are placed in '{self.dest_dir}/wavs'. Transcription files are placed in '{self.dest_dir}/txts'."
            )
            tqdm.write(f"Metadata is placed in '{self.dest_dir}' as 'metadata.csv'.")
        elif self.output_type == "json":
            data = {}
            for ix in range(trimmed.shape[0]):
                name = trimmed.iloc[ix][0]
                text = trimmed.iloc[ix][1]
                data[name] = text
            with open(os.path.join(self.dest_dir, "alignment.json"), "w") as f:
                json.dump(data, f)
            tqdm.write(
                f"Dataset '{self.name}' has been generated. Wav files are placed in '{self.dest_dir}/wavs'. Transcription files are placed in '{self.dest_dir}/txts'."
            )
            tqdm.write(f"Metadata is placed in '{self.dest_dir}' as 'alignment.json'.")

        tqdm.write(
            f"Collected {round(self.len_dataset / 3600, 2)} hours ({int(self.len_dataset)} seconds) of audio."
        )
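
A usage sketch, assuming `builder` is an already-configured instance of the dataset-builder class this method belongs to (with `concat_dir`, `dest_dir`, `name`, `output_type`, and `keep_audio_extension` set):

builder.finalize_dataset(min_audio_length=7)
# <dest_dir>/wavs now holds the silence-trimmed audio, <dest_dir>/txts the
# transcripts, and metadata.csv or alignment.json the index.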