def DeepTalk_encoder(file_path, model_save_path, module_name, preprocess=True, normalize=True, sampling_rate=8000, duration=None):
    """Compute a speaker embedding for a single audio file.

    The encoder weights are (re)loaded from ``model_save_path`` on every call.
    With ``preprocess`` true the audio goes through
    ``Synthesizer.load_preprocess_wav`` followed by ``encoder.preprocess_wav``;
    otherwise it is read directly with ``librosa.load`` at ``sampling_rate``.
    When ``duration`` is given, only the first ``duration * sampling_rate``
    samples are kept. The embedding is computed with partial utterances
    enabled and, when ``normalize`` is true, divided by its L2 norm.

    Returns the (optionally normalized) utterance embedding.
    """
    encoder.load_model(model_save_path, module_name=module_name)

    if preprocess:
        loaded = Synthesizer.load_preprocess_wav(file_path)
        ref_audio = encoder.preprocess_wav(loaded)
    else:
        ref_audio, _ = librosa.load(file_path, sr=sampling_rate)

    # NOTE(review): the flattened source does not show whether this truncation
    # was nested under the `else` branch; it is applied to both paths here.
    if duration is not None:
        n_keep = int(duration * sampling_rate)
        ref_audio = ref_audio[0:n_keep]

    embed, _partials, _ = encoder.embed_utterance(
        ref_audio, using_partials=True, return_partials=True)

    if not normalize:
        return embed
    return embed / np.linalg.norm(embed)
def synth(text, audio_file):
    """
    Synthesize ``text`` in the voice of a reference recording.

    Parameters
    ----------
    text : string
        text to be said in synthesized voice
    audio_file : filepath
        filepath for audio file in wav format; when empty/None the historical
        default "audio.wav" is used

    Returns
    -------
    generated_wav : numpy.ndarray
        Numpy padded array of synthesized audio signal
    """
    # The reference path used to be hard-coded to "audio.wav", so the
    # `audio_file` argument was silently ignored. Honour it, keeping the old
    # file name only as a fallback.
    in_fpath = Path(audio_file) if audio_file else Path("audio.wav")

    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    embed = encoder.embed_utterance(preprocessed_wav)

    print("Synthesizing new audio...")
    # The synthesizer works on batches, hence the single-element lists.
    specs = synthesizer.synthesize_spectrograms([text], [embed])
    generated_wav = vocoder.infer_waveform(specs[0])
    # Append `synthesizer.sample_rate` zero samples at the end of the signal.
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
    return generated_wav
def change_mode(character: str = "Human_Man", tone: str = "neutral"):
    """Switch the module-level reference voice to ``character``/``tone``.

    Looks up the reference ``.flac`` file for the requested voice in
    ``voices_dict``; if it differs from the currently loaded ``in_fpath`` the
    globals ``in_fpath``, ``preprocessed_wav`` and ``embed`` are refreshed,
    the torch seed is reset, the vocoder is reloaded and a short silent
    synthesis (``text_to_speech('Tea.', play_sound=False)``) is run.

    A ``None`` character leaves the current mode untouched; a ``None`` tone
    falls back to ``"neutral"``.
    """
    global in_fpath, filenum, preprocessed_wav, embed, torch, vocoder

    # These guards used to sit *after* the voices_dict lookups, where a None
    # value would already have failed; they are applied up front instead.
    if character is None:
        print('Mode is already correct. No need to change.')
        return
    if tone is None:
        tone = "neutral"

    training_dir = voices_dict[character]['ID']
    tone_file = voices_dict[character]['tone'][tone] + '.flac'
    # The second dash-separated field of the file name is the sub-directory.
    tone_dir = tone_file.split("-")[1]
    local_infpath = Path(f'{data_path}/{training_dir}/{tone_dir}/{tone_file}')

    if local_infpath == in_fpath:
        print('Mode is already correct. No need to change.')
        return

    print(
        f'Reference sound has changed; now loading {character}:{tone}...')
    # NOTE(review): the extent of the `with` body is inferred from the
    # flattened source — confirm against the original layout.
    with nostdout():
        in_fpath = local_infpath
        # The file was previously also passed straight to
        # encoder.preprocess_wav and that result immediately overwritten;
        # it is now decoded and preprocessed only once.
        original_wav, sampling_rate = librosa.load(str(in_fpath))
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        embed = encoder.embed_utterance(preprocessed_wav)
        torch.manual_seed(seed)
        vocoder.load_model(vocoder_path)
        text_to_speech('Tea.', play_sound=False)
def __init__(self):
    """Load the text-to-speech backend selected by ``Text2SpeechModel``.

    * ``"dc_tts"``: builds the synthesis ``Graph`` and stores it in ``self.g``.
    * ``"RTVC"``: loads encoder, synthesizer and vocoder from paths under
      ``root_file_path`` and stores the embedding of the reference wav in
      ``self.embeds``.
    * ``"AudioSynth"``: loads the Tacotron2 and MelGAN-STFT models plus the
      text processor, then runs one synthesis whose outputs are discarded.

    Any other value leaves the instance without a backend.
    """
    if Text2SpeechModel == "dc_tts":
        self.g = Graph(mode="synthesize")
        print("Text2Speech Tensorflow Graph loaded")

    elif Text2SpeechModel == "RTVC":
        enc_model_fpath = os.path.join(
            root_file_path, "RTVC", "encoder/saved_models/pretrained.pt")
        syn_model_dir = os.path.join(
            root_file_path, "RTVC", "synthesizer/saved_models/logs-pretrained")
        voc_model_fpath = os.path.join(
            root_file_path, "RTVC", "vocoder/saved_models/pretrained/pretrained.pt")

        encoder.load_model(enc_model_fpath)
        self.synthesizer = Synthesizer(os.path.join(
            syn_model_dir, "taco_pretrained"), low_mem=False)
        vocoder.load_model(voc_model_fpath)

        # The reference audio lives next to (one level above) root_file_path.
        in_fpath = os.path.join("/", *root_file_path.split("/")[:-1],
                                "REF/refaudioRTVC/ref.wav")
        # The file used to be preprocessed twice (once from the path, once
        # from the loaded samples) with the first result thrown away; only
        # the pass whose result is actually used is kept.
        original_wav, sampling_rate = librosa.load(in_fpath)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        embed = encoder.embed_utterance(preprocessed_wav)
        self.embeds = [embed]

    elif Text2SpeechModel == "AudioSynth":
        taco_pretrained_config_path = os.path.join(
            root_file_path,
            'AudioSynth/TensorFlowTTS/examples/tacotron2/conf/tacotron2.v1.yaml'
        )
        tacotron2_config = AutoConfig.from_pretrained(
            taco_pretrained_config_path)
        taco_path = os.path.join(root_file_path,
                                 "AudioSynth/tacotron2-120k.h5")
        self.tacotron2 = TFAutoModel.from_pretrained(
            config=tacotron2_config,
            pretrained_path=taco_path,
            training=False,
            name="tacotron2")

        melgan_stft_pretrained_config_path = os.path.join(
            root_file_path,
            'AudioSynth/TensorFlowTTS/examples/melgan.stft/conf/melgan.stft.v1.yaml'
        )
        melgan_stft_config = AutoConfig.from_pretrained(
            melgan_stft_pretrained_config_path)
        melgan_stft_path = os.path.join(root_file_path,
                                        "AudioSynth/melgan.stft-2M.h5")
        self.melgan_stft = TFAutoModel.from_pretrained(
            config=melgan_stft_config,
            pretrained_path=melgan_stft_path,
            name="melgan_stft")

        self.processor = AutoProcessor.from_pretrained(
            pretrained_path=os.path.join(
                root_file_path, "AudioSynth/ljspeech_mapper.json"))

        # The outputs are not used; presumably this is a warm-up run so the
        # first real request is not slowed down — confirm.
        do_synthesis(
            "Hello, how can I help you today?", self.tacotron2,
            self.melgan_stft, "TACOTRON", "MELGAN-STFT", self.processor)
def preprocess_embeddings(self, path, ext_audio, ext_embed):
    """Compute and store a speaker embedding for the walker entries.

    Each file id in ``self._walker`` from index 1 onwards (index 0 is not
    visited) is split into ``speaker-chapter-utterance``. The audio at
    ``path/<speaker>/<chapter>/<fileid><ext_audio>`` is loaded with
    torchaudio, embedded with ``styleEncoder`` and the embedding is written
    as comma-separated text to ``<fileid><ext_embed>`` in the same folder.
    """
    # NOTE(review): starting at 1 skips the first walker entry — confirm
    # that this is intended.
    for position in range(1, len(self._walker)):
        fileid = self._walker[position]
        speaker_id, chapter_id, utterance_id = fileid.split("-")
        stem = "-".join((speaker_id, chapter_id, utterance_id))

        utterance_dir = os.path.join(path, speaker_id, chapter_id)
        file_audio = os.path.join(utterance_dir, stem + ext_audio)
        file_embed = os.path.join(utterance_dir, stem + ext_embed)

        # Load audio
        waveform, sample_rate = torchaudio.load(file_audio)
        print("Loaded file: ", fileid)

        # Calculate speaker embedding
        samples = waveform.transpose(0, 1).detach().numpy().squeeze()
        embedding = styleEncoder.embed_utterance(
            styleEncoder.preprocess_wav(samples, sample_rate))

        # Save embeddings to corresponding csv files
        savetxt(file_embed, asarray(embedding), delimiter=',')
        print("Saved embedding: ", file_embed)
def generate_voice(self, in_fpath, text, out_fpath):
    """Clone the voice in ``in_fpath`` to speak ``text`` and save it.

    The reference audio is embedded with the speaker encoder, a mel
    spectrogram is synthesized for ``text``, the vocoder turns it into a
    waveform, and the result (padded with ``sample_rate`` trailing zeros) is
    written to ``out_fpath``. Any exception is caught and reported on stdout;
    nothing is returned.
    """
    try:
        # The file was previously preprocessed twice (from the path and from
        # the loaded samples), discarding the first result; decode it once.
        original_wav, sampling_rate = librosa.load(in_fpath)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        print("Loaded file successfully")

        embed = encoder.embed_utterance(preprocessed_wav)
        print("Created the embedding")

        # The synthesizer works in batch, so inputs are wrapped in lists.
        # If you know what the attention layer alignments are, you can retrieve them here by
        # passing return_alignments=True
        specs = self.synthesizer.synthesize_spectrograms([text], [embed])
        spec = specs[0]
        print("Created the mel spectrogram")

        ## Generating the waveform
        print("Synthesizing the waveform:")
        generated_wav = vocoder.infer_waveform(spec)
        generated_wav = np.pad(generated_wav,
                               (0, self.synthesizer.sample_rate),
                               mode="constant")

        # NOTE(review): librosa.output was removed in librosa 0.8; this call
        # only works with older versions — consider migrating the writer.
        librosa.output.write_wav(out_fpath,
                                 generated_wav.astype(np.float32),
                                 self.synthesizer.sample_rate)
        print("\nSaved output as %s\n\n" % out_fpath)

    except Exception as e:
        # Deliberate best-effort: report and carry on.
        print("Caught exception: %s" % repr(e))
        print("Restarting\n")
def vocode(self):
    """Turn the current generated spectrogram into audio, play and save it.

    Uses the loaded vocoder when ``self.ui.current_vocoder_fpath`` is set and
    Griffin-Lim otherwise, inserts 0.15 s of silence after every segment
    described by ``breaks``, normalizes the peak to 0.97, plays the result,
    writes the wav to ``_out_wav_dir`` and the embedding to ``_out_embed_dir``
    and registers the new utterance with the UI plots.
    """
    speaker_name, spec, breaks, _ = self.current_generated
    assert spec is not None

    # Synthesize the waveform
    if not vocoder.is_loaded():
        self.init_vocoder()

    def vocoder_progress(i, seq_len, b_size, gen_rate):
        real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
        line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
               % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
        self.ui.log(line, "overwrite")
        self.ui.set_loading(i, seq_len)

    if self.ui.current_vocoder_fpath is not None:
        self.ui.log("")
        wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
    else:
        self.ui.log("Waveform generation with Griffin-Lim... ")
        wav = Synthesizer.griffin_lim(spec)
    self.ui.set_loading(0)
    self.ui.log(" Done!", "append")

    # Add breaks: `breaks` holds per-segment frame counts, converted to
    # sample offsets via the hop size.
    b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
    wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

    # Play it
    wav = wav / np.abs(wav).max() * 0.97
    self.ui.play(wav, Synthesizer.sample_rate)

    # Save the wav under a name built from dataset/speaker/utterance, the
    # current time, the duration in ms and the prompt text.
    fref = '-'.join([self.ui.current_dataset_name,
                     self.ui.current_speaker_name,
                     self.ui.current_utterance_name])
    ftime = '{}'.format(int(time.time()))
    ftext = self.ui.text_prompt.toPlainText()
    fms = int(len(wav) * 1000 / Synthesizer.sample_rate)
    fname = filename_formatter('{}_{}_{}ms_{}.wav'.format(fref, ftime, fms, ftext))
    audio.save_wav(wav, _out_wav_dir.joinpath(fname), Synthesizer.sample_rate)  # save

    # Compute the embedding
    # TODO: this is problematic with different sampling rates, gotta fix it
    if not encoder.is_loaded():
        self.init_encoder()
    encoder_wav = encoder.preprocess_wav(wav)
    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    # Add the utterance. A missing speaker name used to raise TypeError on
    # the string concatenation; fall back to "unknown" instead.
    base_name = speaker_name if speaker_name is not None else "unknown"
    name = base_name + "_gen_%05d" % int(time.time())
    utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)

    np.save(_out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save

    self.utterances.add(utterance)

    # Plot it
    self.ui.draw_embed(embed, name, "generated")
    self.ui.draw_umap_projections(self.utterances)
def predict(self, path):
    """Cluster the ``*.wav`` files in ``path`` by speaker.

    Every wav is loaded at 16 kHz and embedded; the embeddings are pickled
    to ``<path>/embedingsDict.pickle`` (keyed by file name without ``.wav``),
    projected with UMAP (cosine metric, ``ceil(sqrt(n))`` neighbours) and
    clustered with ``hcluster.fclusterdata`` at distance threshold 1.

    Returns a dict mapping the cluster label (as a string) to the sorted list
    of integer audio ids, taken from each file name up to its first dot.
    File names are therefore expected to start with an integer.
    """
    fpaths = glob(path + "/*.wav")

    embedings = []
    embedingsDict = {}
    for fpath in fpaths:
        # `sr` is keyword-only in recent librosa releases, so pass it by name.
        wav = librosa.load(fpath, sr=16000)[0]
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
        embed = np.array(embed).reshape(-1)
        embedings.append(embed)
        embedingsDict[fpath.split("/")[-1].split(".wav")[0]] = embed

    # Use a context manager so the pickle file is flushed and closed.
    with open(path + "/embedingsDict.pickle", 'wb') as pickle_file:
        pickle.dump(embedingsDict, pickle_file)

    reducer = umap.UMAP(int(np.ceil(np.sqrt(len(embedings)))), metric="cosine")
    projections = reducer.fit_transform(embedings)

    thresh = 1
    clusters = hcluster.fclusterdata(projections, thresh, criterion="distance")

    speakerSlices = {}
    for fpath, speaker in zip(fpaths, clusters):
        audioId = fpath.split('/')[-1].split('.')[0]
        speakerSlices.setdefault(str(speaker), []).append(int(audioId))

    for audio_ids in speakerSlices.values():
        audio_ids.sort()
    return speakerSlices
def add_real_utterance(self, wav, name, speaker_name):
    """Register a recorded (non-generated) utterance with the toolbox.

    Draws its mel spectrogram, computes the speaker embedding (loading the
    encoder on first use), saves the embedding as ``<name>.npy`` in
    ``self._out_embed_dir``, stores the utterance and refreshes the
    embedding and UMAP plots.
    """
    # Mel spectrogram, shown in the "current" slot
    mel = Synthesizer.make_spectrogram(wav)
    self.ui.draw_spec(mel, "current")

    # Speaker embedding (lazy-load the encoder)
    if not encoder.is_loaded():
        self.init_encoder()
    embed, partial_embeds, _ = encoder.embed_utterance(
        encoder.preprocess_wav(wav), return_partials=True)

    embed_path = self._out_embed_dir.joinpath(name + '.npy')
    np.save(embed_path, embed, allow_pickle=False)

    # Store and register the utterance (last flag False: not generated)
    new_utterance = Utterance(name, speaker_name, wav, mel, embed,
                              partial_embeds, False)
    self.utterances.add(new_utterance)
    self.ui.register_utterance(new_utterance)

    # Refresh the plots
    self.ui.draw_embed(embed, name, "current")
    self.ui.draw_umap_projections(self.utterances)
def vocode(self):
    """Convert the current generated spectrogram to audio and register it.

    The waveform comes from the vocoder when ``self.ui.current_vocoder_fpath``
    is set, otherwise from Griffin-Lim. After every segment given by
    ``breaks`` 0.15 s of silence is inserted, the peak is scaled to 0.97,
    the audio is played, the save button is enabled, and the utterance (with
    its embedding) is added to the collection and the UI plots.
    """
    speaker_name, spec, breaks, _ = self.current_generated
    assert spec is not None

    sample_rate = Synthesizer.sample_rate

    # Synthesize the waveform
    if not vocoder.is_loaded():
        self.init_vocoder()

    def vocoder_progress(i, seq_len, b_size, gen_rate):
        real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
        line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
               % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
        self.ui.log(line, "overwrite")
        self.ui.set_loading(i, seq_len)

    if self.ui.current_vocoder_fpath is None:
        self.ui.log("Waveform generation with Griffin-Lim... ")
        wav = Synthesizer.griffin_lim(spec)
    else:
        self.ui.log("")
        wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
    self.ui.set_loading(0)
    self.ui.log(" Done!", "append")

    # Add breaks: cut the waveform at the segment boundaries and follow
    # every segment with the same block of silence.
    segment_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
    segment_starts = np.concatenate(([0], segment_ends[:-1]))
    segments = [wav[lo:hi] for lo, hi in zip(segment_starts, segment_ends)]
    silence = np.zeros(int(0.15 * sample_rate))
    pieces = []
    for segment in segments:
        pieces.append(segment)
        pieces.append(silence)
    wav = np.concatenate(pieces)

    # Play it
    wav = wav / np.abs(wav).max() * 0.97
    self.ui.play(wav, sample_rate)
    self.ui.save_button.setDisabled(False)

    # Compute the embedding
    # TODO: this is problematic with different sampling rates, gotta fix it
    if not encoder.is_loaded():
        self.init_encoder()
    embed, partial_embeds, _ = encoder.embed_utterance(
        encoder.preprocess_wav(wav), return_partials=True)

    # Add the utterance under "<speaker or unknown>_gen_<random 5 digits>"
    prefix = "unknown" if speaker_name is None else speaker_name
    name = prefix + "_gen_%05d" % np.random.randint(100000)
    generated = Utterance(name, speaker_name, wav, spec, embed,
                          partial_embeds, True)
    self.utterances.add(generated)
    self.ui.register_utterance(generated)

    # Plot it
    self.ui.draw_embed(embed, name, "generated")
    self.ui.draw_umap_projections(self.utterances)
def load_model(in_fpath, parser):
    """Parse model options, load all three models and embed ``in_fpath``.

    Registers the encoder/synthesizer/vocoder path options plus ``--low_mem``
    and ``--no_sound`` on ``parser``, parses the command line, loads the
    models and computes the speaker embedding of the reference audio.

    Returns ``(synthesizer, sampling_rate, embed)`` where ``sampling_rate``
    is the rate reported by ``librosa.load`` for ``in_fpath``.
    """
    parser.add_argument("-e", "--enc_model_fpath", type=Path,
                        default="encoder/saved_models/pretrained.pt",
                        help="Path to a saved encoder")
    parser.add_argument("-s", "--syn_model_dir", type=Path,
                        default="synthesizer/saved_models/logs-pretrained/",
                        help="Directory containing the synthesizer model")
    parser.add_argument("-v", "--voc_model_fpath", type=Path,
                        default="vocoder/saved_models/pretrained/pretrained.pt",
                        help="Path to a saved vocoder")
    parser.add_argument("--low_mem", action="store_true", help=\
        "If True, the memory used by the synthesizer will be freed after each use. Adds large "
        "overhead but allows to save some GPU memory for lower-end GPUs.")
    parser.add_argument("--no_sound", action="store_true", help=\
        "If True, audio won't be played.")
    args = parser.parse_args()

    encoder.load_model(args.enc_model_fpath)
    synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"),
                              low_mem=args.low_mem)
    vocoder.load_model(args.voc_model_fpath)

    # The reference file used to be preprocessed twice, with the first
    # result overwritten immediately; decode and preprocess it once.
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    embed = encoder.embed_utterance(preprocessed_wav)

    return synthesizer, sampling_rate, embed
def initialize(self):
    """Check the CUDA setup, load encoder and vocoder, and smoke-test them.

    Exits the process with status -1 when CUDA is unavailable. Otherwise
    prints the GPU in use, loads the encoder and vocoder from
    ``self.enc_model_fpath`` / ``self.voc_model_fpath`` and pushes small
    dummy inputs through the encoder, ``self.synthesizer`` and the vocoder.
    """
    print("Running a test of your configuration...\n")

    cuda_ok = torch.cuda.is_available()
    if not cuda_ok:
        print(
            "Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
            "for deep learning, ensure that the drivers are properly installed, and that your "
            "CUDA version matches your PyTorch installation. CPU-only inference is currently "
            "not supported.")
        quit(-1)
    print("PyTorch is available and working...")

    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    gpu_summary = (
        torch.cuda.device_count(),
        device_id,
        gpu_properties.name,
        gpu_properties.major,
        gpu_properties.minor,
        gpu_properties.total_memory / 1e9,
    )
    print(
        "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
        "%.1fGb total memory.\n" % gpu_summary)

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(self.enc_model_fpath)
    vocoder.load_model(self.voc_model_fpath)

    ## Run a test
    print("Testing your configuration with small inputs.")

    print("\tTesting the encoder...")
    # `encoder.sampling_rate` zero samples as a dummy utterance.
    silent_input = np.zeros(encoder.sampling_rate)
    encoder.embed_utterance(silent_input)

    # One random unit-norm embedding and one all-zero embedding.
    random_embed = np.random.rand(speaker_embedding_size)
    random_embed /= np.linalg.norm(random_embed)
    test_embeds = [random_embed, np.zeros(speaker_embedding_size)]
    test_texts = ["test 1", "test 2"]
    print(
        "\tTesting the synthesizer... (loading the model will output a lot of text)"
    )
    mels = self.synthesizer.synthesize_spectrograms(test_texts, test_embeds)

    # Feed both test spectrograms to the vocoder as one input.
    mel = np.concatenate(mels, axis=1)

    def no_action(*args):
        return None

    print("\tTesting the vocoder...")
    vocoder.infer_waveform(mel,
                           target=200,
                           overlap=50,
                           progress_callback=no_action)

    print("All test passed! You can now synthesize speech.\n\n")
def transform_embed(wav, encoder_model_fpath=Path()):
    """Return the speaker embedding of ``wav``.

    The encoder inference module is imported locally and its model is loaded
    from ``encoder_model_fpath`` only if no model is loaded yet.
    """
    from encoder import inference as encoder

    model_ready = encoder.is_loaded()
    if not model_ready:
        encoder.load_model(encoder_model_fpath)

    preprocessed = encoder.preprocess_wav(wav)
    return encoder.embed_utterance(preprocessed)
def embed_utterance(fpaths, encoder_model_fpath):
    """Load the audio file ``fpaths`` and return its speaker embedding.

    Despite the plural name, ``fpaths`` is used as a single path. The encoder
    model is loaded from ``encoder_model_fpath`` on first use only.
    """
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Load with librosa, then hand samples and their rate to the encoder's
    # own preprocessing.
    samples, sample_rate = librosa.load(fpaths)
    preprocessed = encoder.preprocess_wav(samples, sample_rate)
    embed = encoder.embed_utterance(preprocessed)
    return embed
def extract_utterance_feats_spkr(self, data_utterance_path, is_full_ppg=False):
    """Get PPG, mel (+ optional F0) and speaker embedding for an utterance.

    Args:
        data_utterance_path: Path of the wav file; it is read with
            ``wavfile.read`` and also passed to ``get_ppg`` and the encoder.
        is_full_ppg: If True, use the full PPGs, otherwise the monophone PPGs.

    Returns:
        A list ``[ppg, mel, dvec]`` where ``ppg`` has F0 appended when
        ``self.is_append_f0`` is set and ``dvec`` is the speaker embedding.

    Raises:
        ValueError: If the file's sampling rate differs from
            ``self.stft.sampling_rate``.
    """
    utt = Utterance()
    sample_rate, samples = wavfile.read(data_utterance_path)
    utt.fs = sample_rate
    utt.wav = samples
    utt.ppg = get_ppg(data_utterance_path, self.ppg_deps)

    audio = torch.FloatTensor(utt.wav.astype(np.float32))
    if utt.fs != self.stft.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            utt.fs, self.stft.sampling_rate))

    # Scale by max_wav_value and add a leading batch axis.
    audio_norm = (audio / self.max_wav_value).unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)

    # (1, n_mel_channels, T) -> (n_mel_channels, T) -> (T, n_mel_channels)
    acoustic_feats = self.stft.mel_spectrogram(audio_norm)
    acoustic_feats = torch.squeeze(acoustic_feats, 0).transpose(0, 1)

    # Speaker embedding, computed by the encoder directly from the file path.
    from encoder import inference as encoder
    if not encoder.is_loaded():
        encoder.load_model(self.encoder_model_fpath)
    embed = encoder.embed_utterance(encoder.preprocess_wav(data_utterance_path))

    # Pick the PPG flavour first, then optionally append F0.
    if is_full_ppg:
        ppg_feats = utt.ppg
    else:
        ppg_feats = utt.monophone_ppg
    if self.is_append_f0:
        ppg_feats = append_ppg(ppg_feats, utt.f0)
    return [ppg_feats, acoustic_feats, embed]
def embed_utterance(fpaths, encoder_model_fpath):
    """Embed a waveform stored as ``.npy`` and save the embedding.

    ``fpaths`` is a ``(wav_fpath, embed_fpath)`` pair: the waveform is read
    with ``np.load`` from the first path and the embedding is written with
    ``np.save`` to the second. The encoder model is loaded on first use only.
    """
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    source_path, target_path = fpaths
    samples = np.load(source_path)
    embedding = encoder.embed_utterance(encoder.preprocess_wav(samples))
    np.save(target_path, embedding, allow_pickle=False)
def get_embed(self, wav):
    """Return the speaker embedding of ``wav``.

    The encoder is loaded lazily from ``self.encoder_model_fpath`` on the CPU.
    """
    encoder_ready = encoder.is_loaded()
    if not encoder_ready:
        # Use the CPU to avoid the following error:
        # "RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with
        # multiprocessing, you must use the 'spawn' start method"
        encoder.load_model(self.encoder_model_fpath, device='cpu')

    preprocessed = encoder.preprocess_wav(wav)
    return encoder.embed_utterance(preprocessed)
def get_spk_embed(load_path, enc_model_fpath):
    """Return the speaker embedding of an audio file as a batched tensor.

    Args:
        load_path: Path of the audio file, as a ``/``-separated string.
        enc_model_fpath: Path of the encoder model; it is (re)loaded on
            every call.

    Returns:
        ``(spk_embd, file_name)``: the embedding as a torch tensor with a
        leading batch axis of size 1, and the last path component of
        ``load_path``.
    """
    file_name = load_path.split('/')[-1]

    # The file used to be decoded an extra time through load_wav() and that
    # result was never used; the encoder reads the file itself below.
    encoder.load_model(enc_model_fpath)
    preprocessed_wav = encoder.preprocess_wav(load_path)
    embed = encoder.embed_utterance(preprocessed_wav)

    spk_embd = torch.tensor(embed).unsqueeze(0)
    return spk_embd, file_name
def embed_utterance(fpaths, encoder_model_fpath):
    """Embed the audio file ``fpaths`` and save the embedding beside it.

    ``fpaths`` is a single path string. The output path is the same string
    with every occurrence of ".wav" replaced by ".npy".
    """
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # NOTE(review): str.replace substitutes ".wav" anywhere in the path, not
    # only the extension — confirm that inputs never contain it elsewhere.
    output_path = fpaths.replace(".wav", ".npy")

    samples, sample_rate = librosa.load(fpaths)
    preprocessed = encoder.preprocess_wav(samples, sample_rate)
    np.save(output_path, encoder.embed_utterance(preprocessed),
            allow_pickle=False)
def run_voiceCloning(filename):
    """Convert ``<dataPath>/<filename>.mp3`` to wav and clone its voice.

    Runs ffmpeg to produce the ``.wav`` next to the ``.mp3``, embeds the
    resulting audio with the speaker encoder and returns whatever
    ``audioFromEmbeds(filename, embed)`` returns.
    """
    in_fpath = dataPath + "/" + filename

    # transforming mp3 into wav
    # subprocess.call blocks until ffmpeg has exited, so the fixed 5-second
    # sleep that used to follow it only added latency and has been removed.
    return_code = subprocess.call(
        ['ffmpeg', '-i', in_fpath + '.mp3', in_fpath + '.wav'])
    if return_code != 0:
        # Keep going as before (an existing .wav may still be usable) but
        # no longer hide the failure.
        print("ffmpeg exited with status %d while converting %s.mp3"
              % (return_code, in_fpath))

    # running the encoder on the audio input
    original_wav, sampling_rate = librosa.load(Path(in_fpath + '.wav'))
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)

    # getting the embeds from the encoder
    embed = encoder.embed_utterance(preprocessed_wav)

    return audioFromEmbeds(filename, embed)
def embed_utterance(src, skip_existing=True, encoder_model_fpath=Path()):
    """Embed one wav file and save the embedding as ``.npy``.

    ``src`` is a ``(wav_fpath, embed_fpath)`` pair. With ``skip_existing``
    set, nothing is computed when ``embed_fpath`` already exists. The audio
    is loaded with ``aukit.load_wav`` at ``hp.sampling_rate``.
    """
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav_fpath, embed_fpath = src
    already_done = skip_existing and embed_fpath.is_file()
    if already_done:
        return

    samples = aukit.load_wav(wav_fpath, sr=hp.sampling_rate)
    embedding = encoder.embed_utterance(encoder.preprocess_wav(samples))
    np.save(embed_fpath, embedding, allow_pickle=False)
def clone_voice(sentence, results_file):
    """Speak ``sentence`` in the voice of 'utterance.wav' (adapted from 'demo_cli.py').

    The reference recording is always read from ``utterance.wav`` in the
    working directory; the synthesized audio, padded with
    ``synthesizer.sample_rate`` trailing zeros, is written to ``results_file``.
    """
    reference_path = Path('utterance.wav')
    output_path = Path(results_file)

    # Speaker embedding of the reference recording
    embed = encoder.embed_utterance(encoder.preprocess_wav(reference_path))

    # Text -> mel spectrogram -> waveform (batch of one)
    spectrogram = synthesizer.synthesize_spectrograms([sentence], [embed])[0]
    waveform = vocoder.infer_waveform(spectrogram)
    waveform = np.pad(waveform, (0, synthesizer.sample_rate), mode="constant")

    librosa.output.write_wav(output_path, waveform.astype(np.float32),
                             synthesizer.sample_rate)
def embed_utterance(fpaths, encoder_model_fpath):
    """Embed the audio at ``fpaths[0]`` and save the result to ``fpaths[1]``.

    ``fpaths`` must be a 3-tuple ``(wav_fpath, embed_fpath, _)``; the third
    element is ignored. The wav path is handed directly to
    ``encoder.preprocess_wav``. The encoder model is loaded on first use only.
    """
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    source_path, target_path, _ = fpaths
    preprocessed = encoder.preprocess_wav(source_path)
    embedding = encoder.embed_utterance(preprocessed)
    np.save(target_path, embedding, allow_pickle=False)
def gen_audio(self, ref_audio, text):
    """Synthesize ``text`` in the voice of the recording at ``ref_audio``.

    Quote characters are stripped from ``ref_audio`` before it is used as a
    path. Returns the generated waveform (not padded), or ``None`` when any
    step raises: the traceback and the exception are printed instead.
    """
    try:
        cleaned_path = ref_audio.replace("\"", "").replace("\'", "")
        in_fpath = Path(cleaned_path)

        ## Computing the embedding
        # Load at the file's native rate (sr=None) and let the encoder's own
        # preprocessing handle the samples.
        original_wav, sampling_rate = librosa.load(in_fpath, sr=None)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        print("Loaded file succesfully, rate=%s" % sampling_rate)

        embed = encoder.embed_utterance(preprocessed_wav)
        print("Created the embedding")

        ## Generating the spectrogram
        # The synthesizer is batch-based: pass one text and one embedding.
        spec = self.synthesizer.synthesize_spectrograms([text], [embed])[0]
        print("Created the mel spectrogram")

        ## Generating the waveform
        print("Synthesizing the waveform:")
        generated_wav = vocoder.infer_waveform(spec)

        ## Post-generation
        # The one-second padding used for sounddevice playback is
        # intentionally not applied here.
        print("\n samples = %s @ %s" % (len(generated_wav), self.synthesizer.sample_rate))
        return generated_wav

    except Exception as e:
        traceback.print_exc()
        print("Caught exception: %s" % repr(e))
        print("Restarting\n")
def create_embedding(path):
    """Return an embedding of the audio file at ``path``.

    With ``EMBEDDING_TYPE == "ge2e"`` the speaker encoder is used. Otherwise
    the file is read with ``read`` and fed to ``module``; that route requires
    a 16 kHz file and terminates the process via ``sys.exit`` if the rate
    differs. Quote characters are stripped from ``path`` before use.
    """
    # ge2e route: speaker-encoder embedding
    if EMBEDDING_TYPE == "ge2e":
        cleaned = path.replace("\"", "").replace("\'", "")
        preprocessed_wav = encoder.preprocess_wav(Path(cleaned))
        return encoder.embed_utterance(preprocessed_wav)

    # Alternative route: `module` applied to the raw samples
    samplerate, data = read(path.replace("\"", "").replace("\'", ""))
    if samplerate != 16000:
        sys.exit(f"{path} does not have sample rate of 16000")

    samples = np.array(data, dtype=float)
    emb = module(samples=samples, sample_rate=16000)['embedding']
    # Shape check: any batch size, 2048 features.
    emb.shape.assert_is_compatible_with([None, 2048])
    return emb.numpy()
def signup(wav_fpath: Path, username, encoder_model_fpath):
    """Enroll ``username`` with the voice recorded in ``wav_fpath``.

    The embedding is stored as ``<username>.npy`` in ``signup_dir``. If a
    file for that user already exists, the old and new embeddings are summed,
    L2-normalized, and the old file is replaced.
    """
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    target = signup_dir.joinpath(username + ".npy")

    # Compute the speaker embedding of the utterance
    preprocessed = encoder.preprocess_wav(str(wav_fpath))
    embed = encoder.embed_utterance(preprocessed)

    # NOTE(review): nesting inferred from the flattened source — the merge,
    # renormalization and removal are taken to happen only for existing users.
    if os.path.exists(target):
        previous = np.load(target)
        embed = previous + embed
        embed /= np.linalg.norm(embed, 2)
        os.remove(target)

    np.save(target, embed, allow_pickle=False)
    print(username + " signed up.")
def transform_embed(wav, encoder_model_fpath=Path()):
    """Return the speaker embedding of a random 2-second excerpt of ``wav``.

    After the encoder's preprocessing, audio longer than two seconds (at
    ``encoder.sampling_rate``) is cropped to a randomly positioned 2-second
    window; shorter audio is embedded as a whole.
    """
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    processed = encoder.preprocess_wav(wav)

    # Randomly pick 2 seconds of speech to build the voice representation.
    window = 2 * encoder.sampling_rate
    total = len(processed)
    if total > window:
        start = random.randint(0, total - window)
        processed = processed[start:start + window]

    return encoder.embed_utterance(processed)
def _compute_embedding(audio):
    '''
    Description
        Compute the embedding of the audio to clone and store it in the
        module-level ``embedding`` variable.
    Input:
        audio: audio passed to encoder.preprocess_wav together with SAMPLE_RATE
    Output
        None; the result is published through the global ``embedding``
    '''
    global embedding
    # Reset first so the global holds None if the computation below raises.
    embedding = None
    preprocessed = encoder.preprocess_wav(audio, SAMPLE_RATE)
    embedding = encoder.embed_utterance(preprocessed)
def tts(input_dict):
    '''
    Clone each input voice and speak the given text with it.

    Flow:
    0) Check if audio has embeddings (Not yet)
    1) Encode the audio
    2) Synthesizer the text with embeddings
    3) Vocoder out the fake wav

    Input:
        input_dict: {"data": {audio_name: raw audio bytes, ...}, "text": str}.
        The text is split on "." and every non-empty piece becomes one
        output wav.
    Output:
        {"data": {sentence_index: path of the written wav}}. Files are
        written to <cwd>/data/output as "<audio stem>_tts_<index>.wav".
        Because the keys are sentence indices, entries of a later audio
        overwrite those of an earlier one in the returned dict.
    Raises:
        Exception("Empty text field") when the text has no non-empty sentence.
    '''
    # init
    output_dict = {"data": {}}

    # loop the input
    for audio_name, raw_audio in input_dict["data"].items():
        wav_name_no_ext = Path(audio_name).stem
        saved_path_obj = Path.cwd() / "data/output"

        # step 1
        print("Step 1")
        raw_audio_np, sample_rate = librosa.load(io.BytesIO(raw_audio))
        preprocessed_wav = encoder.preprocess_wav(raw_audio_np, sample_rate)
        embeddings = encoder.embed_utterance(preprocessed_wav)

        # step 2
        print("Step 2")
        splitted_text = input_dict["text"].split(".")
        clean_text_list = [text for text in splitted_text if text]
        if not clean_text_list:
            raise Exception("Empty text field")
        # One copy of the speaker embedding per sentence.
        embeddings_list = [embeddings] * len(clean_text_list)
        specs = synthesizer.synthesize_spectrograms(clean_text_list, embeddings_list)

        # step 3
        print("Step 3")
        # Writing used to fail when the output folder had not been created
        # beforehand; make sure it exists.
        saved_path_obj.mkdir(parents=True, exist_ok=True)
        for index, spec in enumerate(specs):
            generated_wav = vocoder.infer_waveform(spec)

            # needed to 1 second for playback capability
            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

            file_name = "{}_tts_{}.wav".format(wav_name_no_ext, index)
            file_path = saved_path_obj / file_name
            sf.write(str(file_path), generated_wav.astype(np.float32), synthesizer.sample_rate, 'PCM_16')
            output_dict["data"][index] = file_path

    return output_dict
def embed_utterance(fpaths, encoder_model_fpath, hparams):
    """Embed one audio file and save the embedding, unless it already exists.

    Args:
        fpaths: ``(wav_fpath, embed_fpath)`` pair; ``embed_fpath`` must offer
            ``.exists()``.
        encoder_model_fpath: Encoder model path, loaded on first use only.
        hparams: Provides ``sample_rate`` (load rate), ``rescale`` and
            ``rescaling_max`` (optional peak rescaling before embedding).
    """
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    if embed_fpath.exists():
        return

    # `sr` is keyword-only in recent librosa releases; passing it by name
    # works with old and new versions alike.
    wav, _ = librosa.load(wav_fpath, sr=hparams.sample_rate)
    if hparams.rescale:
        # Scale so that the peak amplitude equals rescaling_max.
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)