def add_real_utterance(self, wav, name, speaker_name):
    # Compute the mel spectrogram
    spec = Synthesizer.make_spectrogram(wav)
    self.ui.draw_spec(spec, "current")

    # Compute the embedding
    if not encoder.is_loaded():
        self.init_encoder()
    encoder_wav = encoder.preprocess_wav(wav)
    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
    np.save(self._out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save

    # Add the utterance
    utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, False)
    self.utterances.add(utterance)
    self.ui.register_utterance(utterance)

    # Plot it
    self.ui.draw_embed(embed, name, "current")
    self.ui.draw_umap_projections(self.utterances)

def vocode(self):
    speaker_name, spec, breaks, _ = self.current_generated
    assert spec is not None

    # Synthesize the waveform
    if not vocoder.is_loaded():
        self.init_vocoder()

    def vocoder_progress(i, seq_len, b_size, gen_rate):
        real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
        line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
               % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
        self.ui.log(line, "overwrite")
        self.ui.set_loading(i, seq_len)

    if self.ui.current_vocoder_fpath is not None:
        self.ui.log("")
        wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
    else:
        self.ui.log("Waveform generation with Griffin-Lim... ")
        wav = Synthesizer.griffin_lim(spec)
    self.ui.set_loading(0)
    self.ui.log(" Done!", "append")

    # Add breaks
    b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
    wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

    # Play it
    wav = wav / np.abs(wav).max() * 0.97
    self.ui.play(wav, Synthesizer.sample_rate)
    self.ui.save_button.setDisabled(False)

    # Compute the embedding
    # TODO: this is problematic with different sampling rates, gotta fix it
    if not encoder.is_loaded():
        self.init_encoder()
    encoder_wav = encoder.preprocess_wav(wav)
    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    # Add the utterance
    name = speaker_name if speaker_name is not None else "unknown"
    name = name + "_gen_%05d" % np.random.randint(100000)
    utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)
    self.utterances.add(utterance)
    self.ui.register_utterance(utterance)

    # Plot it
    self.ui.draw_embed(embed, name, "generated")
    self.ui.draw_umap_projections(self.utterances)

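# Worked illustration (not from the source) of the break-splicing arithmetic in
# vocode() above, with toy values for hop_size and breaks. Each (start, end)
# pair slices one synthesized segment out of the concatenated waveform, and a
# 0.15 s silence is inserted after every segment.
import numpy as np

hop_size = 200                                         # samples per spectrogram frame (toy value)
breaks_frames = [10, 8]                                # frames per synthesized segment
b_ends = np.cumsum(np.array(breaks_frames) * hop_size)  # -> [2000, 3600]
b_starts = np.concatenate(([0], b_ends[:-1]))           # -> [0, 2000]
print(list(zip(b_starts, b_ends)))                      # [(0, 2000), (2000, 3600)]
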
def vocode(self):
    speaker_name, spec, breaks, _ = self.current_generated
    assert spec is not None

    # Synthesize the waveform
    if not vocoder.is_loaded():
        self.init_vocoder()

    def vocoder_progress(i, seq_len, b_size, gen_rate):
        real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
        line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
               % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
        self.ui.log(line, "overwrite")
        self.ui.set_loading(i, seq_len)

    if self.ui.current_vocoder_fpath is not None:
        self.ui.log("")
        wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
    else:
        self.ui.log("Waveform generation with Griffin-Lim... ")
        wav = Synthesizer.griffin_lim(spec)
    self.ui.set_loading(0)
    self.ui.log(" Done!", "append")

    # Add breaks
    b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
    wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

    # Play it
    wav = wav / np.abs(wav).max() * 0.97
    self.ui.play(wav, Synthesizer.sample_rate)

    # Save the waveform to disk
    fref = '-'.join([self.ui.current_dataset_name, self.ui.current_speaker_name, self.ui.current_utterance_name])
    ftime = '{}'.format(int(time.time()))
    ftext = self.ui.text_prompt.toPlainText()
    fms = int(len(wav) * 1000 / Synthesizer.sample_rate)
    fname = filename_formatter('{}_{}_{}ms_{}.wav'.format(fref, ftime, fms, ftext))
    audio.save_wav(wav, self._out_wav_dir.joinpath(fname), Synthesizer.sample_rate)  # save

    # Compute the embedding
    # TODO: this is problematic with different sampling rates, gotta fix it
    if not encoder.is_loaded():
        self.init_encoder()
    encoder_wav = encoder.preprocess_wav(wav)
    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    # Add the utterance
    name = speaker_name + "_gen_%05d" % int(time.time())
    utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)
    np.save(self._out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save
    self.utterances.add(utterance)

    # Plot it
    self.ui.draw_embed(embed, name, "generated")
    self.ui.draw_umap_projections(self.utterances)

def transform_embed(wav, encoder_model_fpath=Path()):
    from encoder import inference as encoder

    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    return embed

def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath = fpaths
    wav, rate = librosa.load(wav_fpath)
    wav = encoder.preprocess_wav(wav, rate)
    return encoder.embed_utterance(wav)

def extract_utterance_feats_spkr(self, data_utterance_path, is_full_ppg=False):
    """Get PPG and Mel (+ optional F0) for an utterance.

    Args:
        data_utterance_path: The path to the data utterance protocol buffer.
        is_full_ppg: If True, will use the full PPGs.

    Returns:
        A [ppg, mel, dvec (speaker embedding)] list for the utterance.
    """
    utt = Utterance()
    fs, wav = wavfile.read(data_utterance_path)
    utt.fs = fs
    utt.wav = wav
    utt.ppg = get_ppg(data_utterance_path, self.ppg_deps)

    audio = torch.FloatTensor(utt.wav.astype(np.float32))
    fs = utt.fs
    if fs != self.stft.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            fs, self.stft.sampling_rate))
    audio_norm = audio / self.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    # (1, n_mel_channels, T)
    acoustic_feats = self.stft.mel_spectrogram(audio_norm)
    # (n_mel_channels, T)
    acoustic_feats = torch.squeeze(acoustic_feats, 0)
    # (T, n_mel_channels)
    acoustic_feats = acoustic_feats.transpose(0, 1)

    # Compute the speaker embedding (d-vector) from the raw audio file
    from encoder import inference as encoder
    if not encoder.is_loaded():
        encoder.load_model(self.encoder_model_fpath)
    wav = encoder.preprocess_wav(data_utterance_path)
    embed = encoder.embed_utterance(wav)

    if is_full_ppg:
        if self.is_append_f0:
            ppg_f0 = append_ppg(utt.ppg, utt.f0)
            return [ppg_f0, acoustic_feats, embed]
        else:
            return [utt.ppg, acoustic_feats, embed]
    else:
        if self.is_append_f0:
            ppg_f0 = append_ppg(utt.monophone_ppg, utt.f0)
            return [ppg_f0, acoustic_feats, embed]
        else:
            return [utt.monophone_ppg, acoustic_feats, embed]

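# A hedged sketch (not from the source) of how the [ppg, mel, embed] triplets
# above might be served to a PyTorch DataLoader. `utterance_paths` and the
# `extractor` object exposing extract_utterance_feats_spkr are assumptions.
import torch
from torch.utils.data import Dataset

class PPGMelSpkrDataset(Dataset):
    def __init__(self, utterance_paths, extractor, is_full_ppg=False):
        self.paths = utterance_paths
        self.extractor = extractor
        self.is_full_ppg = is_full_ppg

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        ppg, mel, embed = self.extractor.extract_utterance_feats_spkr(
            self.paths[idx], is_full_ppg=self.is_full_ppg)
        # mel is already a (T, n_mel_channels) tensor; ppg and embed are numpy arrays
        return torch.as_tensor(ppg), mel, torch.from_numpy(embed)
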
def get_embed(self, wav): # from encoder import inference as encoder if not encoder.is_loaded(): encoder.load_model(self.encoder_model_fpath, device='cpu') # 用cpu避免以下报错。 # "RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the ‘spawn’ start method" wav = encoder.preprocess_wav(wav) embed = encoder.embed_utterance(wav) return embed
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)

def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance; the embedding is saved
    # next to the wav with the extension swapped
    wav_fpath = fpaths
    embed_fpath = str(wav_fpath).replace(".wav", ".npy")
    wav, rate = librosa.load(wav_fpath)
    wav = encoder.preprocess_wav(wav, rate)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)

def embed_utterance(src, skip_existing=True, encoder_model_fpath=Path()):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav_fpath, embed_fpath = src
    if skip_existing and embed_fpath.is_file():
        return
    wav = aukit.load_wav(wav_fpath, sr=hp.sampling_rate)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)

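# A minimal driver sketch (an assumption, not from the source) for the
# per-utterance embed functions above, given a list of (wav_fpath, embed_fpath)
# Path pairs. Because encoder.is_loaded() is checked inside the worker, the
# model is loaded once per process rather than once per utterance.
from functools import partial
from multiprocessing import Pool

def embed_all(fpath_pairs, encoder_model_fpath, n_processes=4):
    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
    with Pool(n_processes) as pool:
        pool.map(func, fpath_pairs)
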
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance directly from the wav file
    wav_fpath, embed_fpath, _ = fpaths
    wav = encoder.preprocess_wav(wav_fpath)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)

def signup(wav_fpath: Path, username, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    embed_fpath = signup_dir.joinpath(username + ".npy")
    wav = encoder.preprocess_wav(str(wav_fpath))
    embed = encoder.embed_utterance(wav)

    # If the user is already enrolled, merge the new embedding into the old one:
    # sum the unit-length embeddings and renormalize to unit length
    if os.path.exists(embed_fpath):
        old_embed = np.load(embed_fpath)
        embed = old_embed + embed
        embed /= np.linalg.norm(embed, 2)
        os.remove(embed_fpath)
    np.save(embed_fpath, embed, allow_pickle=False)
    print(username + " signed up.")

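# Toy check (not from the source) of the merge rule in signup(): summing
# unit-length embeddings and renormalizing keeps the result unit-length, so
# repeated signups accumulate an averaged voice direction per user.
import numpy as np

a = np.array([1.0, 0.0])
b = np.array([0.6, 0.8])                  # both already unit-length
merged = (a + b) / np.linalg.norm(a + b, 2)
print(np.linalg.norm(merged, 2))          # 1.0
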
def transform_embed(wav, encoder_model_fpath=Path()):
    # from encoder import inference as encoder
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav_ = encoder.preprocess_wav(wav)
    # Take a random 2-second segment to compute the speaker embedding
    segment_length = 2 * encoder.sampling_rate
    if len(wav_) > segment_length:
        max_audio_start = len(wav_) - segment_length
        audio_start = random.randint(0, max_audio_start)
        wav_ = wav_[audio_start:audio_start + segment_length]
    embed = encoder.embed_utterance(wav_)
    return embed

def embed_utterance(fpaths, encoder_model_fpath, hparams):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    if embed_fpath.exists():
        return
    wav, _ = librosa.load(wav_fpath, sr=hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)

def get_embed(self, wav): # from encoder import inference as encoder if not encoder.is_loaded(): encoder.load_model(self.encoder_model_fpath, device='cpu') # 用cpu避免以下报错。 # "RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the ‘spawn’ start method" wav_ = encoder.preprocess_wav(wav) # Take segment segment_length = 2 * encoder.sampling_rate # 随机选取2秒语音生成语音表示向量 if len(wav_) > segment_length: max_audio_start = len(wav_) - segment_length audio_start = random.randint(0, max_audio_start) wav_ = wav_[audio_start:audio_start + segment_length] embed = encoder.embed_utterance(wav_) return embed
def embed_utterance(fpaths, encoder_model_fpath, module_name, reject_list_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath, module_name=module_name)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    if embed is None:
        # Record utterances for which no embedding could be computed
        with open(reject_list_fpath, 'a') as reject_file:
            reject_file.write(str(os.path.basename(embed_fpath)) + '\n')
    else:
        # Duplicate 128-dim embeddings to match the expected 256-dim layout
        if embed.shape[0] == 128:
            embed = np.concatenate((embed, embed), axis=0)
        np.save(embed_fpath, embed, allow_pickle=False)

def signin(wav_or_wavpath, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav = encoder.preprocess_wav(wav_or_wavpath)
    embed = encoder.embed_utterance(wav)
    embed = np.reshape(embed, [np.shape(embed)[0], 1])  # [emb_dim, 1]

    signed_spk_embs = list(signup_dir.glob("*.npy"))
    signed_spk_name = [_dir.stem for _dir in signed_spk_embs]
    signed_spk_embs = [np.load(str(_dir)) for _dir in signed_spk_embs]
    signed_spk_embs = np.array(signed_spk_embs)  # [n, emb_dim]
    print(signed_spk_name)
    print(np.shape(signed_spk_embs), np.shape(embed))

    # Score every enrolled speaker against the query embedding, pick the best match
    similar_score = np.matmul(signed_spk_embs, embed)
    similar_score = np.reshape(similar_score, [-1])
    sim_id = np.argmax(similar_score)
    sim_name = signed_spk_name[sim_id]
    for name, score in zip(signed_spk_name, similar_score):
        print(name, score)
    print("\nMatching name: ", sim_name)

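# A hedged extension of signin() above: because the encoder's embeddings are
# L2-normalized (signup() renormalizes explicitly), the dot product is cosine
# similarity, so open-set accept/reject only needs a threshold. The 0.75 value
# is illustrative, not taken from the source.
import numpy as np

def verify(query_embed, enrolled_embed, threshold=0.75):
    score = float(np.dot(enrolled_embed, query_embed))
    return score >= threshold, score
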
def vocode(self):
    speaker_name, spec, breaks, _ = self.current_generated
    assert spec is not None

    # Initialize the vocoder model and make it deterministic, if user provides a seed
    if self.ui.random_seed_checkbox.isChecked():
        seed = int(self.ui.seed_textbox.text())
        self.ui.populate_gen_options(seed, self.trim_silences)
    else:
        seed = None

    if seed is not None:
        torch.manual_seed(seed)

    # Synthesize the waveform
    if not vocoder.is_loaded() or seed is not None:
        self.init_vocoder()

    def vocoder_progress(i, seq_len, b_size, gen_rate):
        real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
        line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
               % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
        self.ui.log(line, "overwrite")
        self.ui.set_loading(i, seq_len)

    if self.ui.current_vocoder_fpath is not None:
        self.ui.log("")
        wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
    else:
        self.ui.log("Waveform generation with Griffin-Lim... ")
        wav = Synthesizer.griffin_lim(spec)
    self.ui.set_loading(0)
    self.ui.log(" Done!", "append")

    # Add breaks
    b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
    wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

    # Trim excessive silences
    if self.ui.trim_silences_checkbox.isChecked():
        wav = encoder.preprocess_wav(wav)

    # Play it
    wav = wav / np.abs(wav).max() * 0.97
    self.ui.play(wav, Synthesizer.sample_rate)

    # Name it (history displayed in combobox)
    # TODO: better naming for the combobox items?
    wav_name = str(self.waves_count + 1)

    # Update waves combobox
    self.waves_count += 1
    if self.waves_count > MAX_WAVES:
        self.waves_list.pop()
        self.waves_namelist.pop()
    self.waves_list.insert(0, wav)
    self.waves_namelist.insert(0, wav_name)

    self.ui.waves_cb.disconnect()
    self.ui.waves_cb_model.setStringList(self.waves_namelist)
    self.ui.waves_cb.setCurrentIndex(0)
    self.ui.waves_cb.currentIndexChanged.connect(self.set_current_wav)

    # Update current wav
    self.set_current_wav(0)

    # Enable replay and save buttons
    self.ui.replay_wav_button.setDisabled(False)
    self.ui.export_wav_button.setDisabled(False)

    # Compute the embedding
    # TODO: this is problematic with different sampling rates, gotta fix it
    if not encoder.is_loaded():
        self.init_encoder()
    encoder_wav = encoder.preprocess_wav(wav)
    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    # Add the utterance
    name = speaker_name + "_gen_%05d" % np.random.randint(100000)
    utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)
    self.utterances.add(utterance)

    # Plot it
    self.ui.draw_embed(embed, name, "generated")
    self.ui.draw_umap_projections(self.utterances)

def vocode(self): speaker_name, spec, breaks, _ = self.current_generated assert spec is not None # Synthesize the waveform if not vocoder.is_loaded(): self.init_vocoder() def vocoder_progress(i, seq_len, b_size, gen_rate): real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000 line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \ % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor) self.ui.log(line, "overwrite") self.ui.set_loading(i, seq_len) wav = None vocname = "" if self.ui.current_vocoder_fpath is not None: model_fpath = self.ui.current_vocoder_fpath vocname = Path(model_fpath).parent.stem if Path(model_fpath).parent.stem == "melgan": self.ui.log("Waveform generation with MelGAN... ") wav = vocoder_melgan.infer_waveform_melgan(spec, model_fpath) elif Path(model_fpath).parent.stem == "wavernn": self.ui.log("Waveform generation with WaveRNN... ") wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress) if wav is None: vocname = "griffinlim" self.ui.log("Waveform generation with Griffin-Lim... ") wav = Synthesizer.griffin_lim(spec) self.ui.set_loading(0) self.ui.log(" Done!", "append") # Play it wav = wav / np.abs(wav).max() * 0.97 self.ui.play(wav, Synthesizer.sample_rate) fref = self.ui.selected_utterance.name ftime = '{}'.format(time_formatter()) ftext = self.ui.text_prompt.toPlainText() fms = int(len(wav) * 1000 / Synthesizer.sample_rate) fvoc = vocname fname = filename_formatter('{}_{}_{}_{}ms_{}.wav'.format(fref, ftime, fvoc, fms, ftext)) audio.save_wav(wav, self._out_wav_dir.joinpath(fname), Synthesizer.sample_rate) # save # Compute the embedding # TODO: this is problematic with different sampling rates, gotta fix it if not encoder.is_loaded(): self.init_encoder() encoder_wav = encoder.preprocess_wav(wav) embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True) # Add the utterance name = speaker_name + "_gen_{}".format(time_formatter()) utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True) np.save(self._out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False) # save self.utterances.add(utterance) # Plot it self.ui.draw_embed(embed, name, "generated") self.ui.draw_umap_projections(self.utterances)
def embed_utterance(wav, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath, device="cpu")

    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    return embed

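# Example use of the helper above; "encoder.pt" and "speaker.wav" are
# hypothetical paths. The wav is loaded at 16 kHz, the encoder's expected rate
# in the usual configuration, and the result is a fixed-size float32 vector
# (256-dimensional with the default model hyperparameters).
from pathlib import Path
import librosa

wav, _ = librosa.load("speaker.wav", sr=16000)
embed = embed_utterance(wav, Path("encoder.pt"))
print(embed.shape)  # e.g. (256,)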