def create_infer_dags(neural_factory, neural_modules, tacotron2_params, infer_dataset, infer_batch_size, cpu_per_dl=1):
    """Wire up the Tacotron 2 inference DAG and return its output tensors.

    Args:
        neural_factory: NeMo neural factory (unused here; kept for the
            caller's signature).
        neural_modules: 7-tuple of instantiated modules; only the text
            embedding, encoder, decoder and postnet slots are used.
        tacotron2_params: Tacotron 2 config dict; 'labels' supplies the
            character vocabulary.
        infer_dataset: path handed to the transcript data layer.
        infer_batch_size: batch size for inference.
        cpu_per_dl: data-loader worker count.

    Returns:
        [mel_postnet, gate, alignments, mel_len] output tensors.

    Raises:
        ValueError: if the decoder module is not a Tacotron2DecoderInfer.
    """
    _, text_embedding, t2_enc, t2_dec, t2_postnet, _, _ = neural_modules

    # Special tokens are assigned ids just past the label vocabulary.
    labels = tacotron2_params['labels']
    vocab_size = len(labels)
    data_layer = nemo_asr.TranscriptDataLayer(
        path=infer_dataset,
        labels=labels,
        batch_size=infer_batch_size,
        num_workers=cpu_per_dl,
        load_audio=False,
        bos_id=vocab_size,
        eos_id=vocab_size + 1,
        pad_id=vocab_size + 2,
        shuffle=False,
    )

    transcript, transcript_len = data_layer()
    embedded = text_embedding(char_phone=transcript)
    encoded = t2_enc(
        char_phone_embeddings=embedded, embedding_length=transcript_len)

    # Only the inference-mode decoder is supported; reject anything else
    # before wiring it into the graph.
    if not isinstance(t2_dec, nemo_tts.Tacotron2DecoderInfer):
        raise ValueError(
            "The Neural Module for tacotron2 decoder was not understood")
    mel_decoder, gate, alignments, mel_len = t2_dec(
        char_phone_encoded=encoded, encoded_length=transcript_len)

    mel_postnet = t2_postnet(mel_input=mel_decoder)
    return [mel_postnet, gate, alignments, mel_len]
def create_dag(args, cfg, num_gpus):
    """Build the training and evaluation DAGs for the RNN decoder LM.

    Args:
        args: parsed CLI arguments; uses train_dataset, eval_datasets,
            checkpoint_dir and checkpoint_save_freq.
        cfg: configuration dict. Mutated in place: parameter counts,
            steps_per_epoch and total_steps are written back into it.
        num_gpus: number of GPUs (currently unused in this function; kept
            for the caller's signature).

    Returns:
        ((train_loss, evals), cfg, callbacks) where `evals` pairs the eval
        dataset path with its (eval_loss, log_probs, texts) tensors and
        `callbacks` holds the teacher-forcing and checkpoint-saver callbacks.
    """
    # Defining nodes
    data = nemo_asr.TranscriptDataLayer(
        path=args.train_dataset,
        labels=cfg['target']['labels'],
        eos_id=cfg['target']['eos_id'],
        pad_id=cfg['target']['pad_id'],
        batch_size=cfg['optimization']['batch_size'],
        drop_last=True,
    )
    data_eval = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.eval_datasets,
        labels=cfg['target']['labels'],
        eos_id=cfg['target']['eos_id'],
        batch_size=cfg['inference']['batch_size'],
        load_audio=False,
    )
    decoder = nemo.backends.pytorch.DecoderRNN(
        voc_size=len(cfg['target']['labels']),
        bos_id=cfg['target']['bos_id'],
        **cfg['DecoderRNN'],
    )

    num_data = len(data)
    batch_size = cfg['optimization']['batch_size']
    num_epochs = cfg['optimization']['params']['num_epochs']
    # Fix: int(num_data / batch_size) goes through float division and can
    # lose precision for very large counts. Floor division is exact and
    # matches drop_last=True (the trailing partial batch is discarded).
    steps_per_epoch = num_data // batch_size
    total_steps = num_epochs * steps_per_epoch

    # Constant teacher forcing of 1.0 over the whole schedule.
    vsc = ValueSetterCallback
    tf_callback = ValueSetterCallback(
        decoder, 'teacher_forcing',
        policies=[
            vsc.Policy(vsc.Method.Const(1.0), start=0.0, end=1.0),
        ],
        total_steps=total_steps,
    )
    seq_loss = nemo.backends.pytorch.SequenceLoss(
        pad_id=cfg['target']['pad_id'],
        smoothing_coef=cfg['optimization']['smoothing_coef'],
    )
    saver_callback = nemo.core.ModuleSaverCallback(
        save_modules_list=[decoder],
        folder=args.checkpoint_dir,
        step_freq=args.checkpoint_save_freq,
    )

    # Creating DAG — training branch.
    texts, _ = data()
    log_probs, _ = decoder(targets=texts)
    train_loss = seq_loss(log_probs=log_probs, targets=texts)

    # Evaluation branch; the audio outputs of data_eval are ignored
    # (load_audio=False), only the transcripts feed the decoder.
    evals = []
    _, _, texts, _ = data_eval()
    log_probs, _ = decoder(targets=texts)
    eval_loss = seq_loss(log_probs=log_probs, targets=texts)
    evals.append((args.eval_datasets, (eval_loss, log_probs, texts)))

    # Update config with bookkeeping the caller logs/saves.
    cfg['num_params'] = {'decoder': decoder.num_weights}
    cfg['num_params']['total'] = sum(cfg['num_params'].values())
    cfg['input']['train'] = {'num_data': num_data}
    cfg['optimization']['steps_per_epoch'] = steps_per_epoch
    cfg['optimization']['total_steps'] = total_steps

    return (train_loss, evals), cfg, [tf_callback, saver_callback]
def synthesis(self, text):
    """Reads text and returns the audio signal in a wav file

    Runs the Tacotron 2 inference DAG over *text*, then vocodes the
    predicted mel spectrogram with either Griffin-Lim or WaveGlow
    (selected by self.tts_conf["vocoder"]) and writes the result to a
    .wav file in the working directory.
    """
    self.nf.logger.info('Starting speech synthesis')
    # create inference DAGs
    # Special-token ids (bos/eos/pad) are placed just past the label
    # vocabulary, mirroring the training-time layout.
    data_layer = nemo_asr.TranscriptDataLayer(
        path = build_text_path(text),
        labels = self.tacotron2_params['labels'],
        batch_size = 1,
        num_workers = 1,
        load_audio=False,
        bos_id = len(self.tacotron2_params['labels']),
        eos_id = len(self.tacotron2_params['labels']) + 1,
        pad_id = len(self.tacotron2_params['labels']) + 2,
        shuffle = False
    )
    # NOTE(review): presumably build_text_path() wrote "text.json" and the
    # data layer has already consumed it — confirm; this hardcoded name
    # must match what build_text_path produces.
    os.remove("text.json")
    self.nf.logger.info("Running Tacotron 2")
    transcript, transcript_len = data_layer()
    transcript_embedded = self.text_embedding(char_phone=transcript)
    transcript_encoded = self.t2_enc(
        char_phone_embeddings=transcript_embedded,
        embedding_length=transcript_len)
    mel_decoder, gate, alignments, mel_len = self.t2_dec(
        char_phone_encoded=transcript_encoded,
        encoded_length=transcript_len)
    mel_postnet = self.t2_postnet(mel_input=mel_decoder)
    infer_tensors = [mel_postnet, gate, alignments, mel_len]
    # Run tacotron 2
    # cache=True keeps intermediate results so the WaveGlow pass below can
    # reuse them (it passes use_cache=True).
    evaluated_tensors = self.nf.infer(
        tensors = infer_tensors,
        cache = True,
        offload_to_cpu = True)
    # Last entry corresponds to mel_len in infer_tensors.
    mel_len = evaluated_tensors[-1]
    # creating vocoder
    if self.tts_conf["vocoder"] == "griffin-lim":
        self.nf.logger.info("Running Griffin-Lim as a vocoder")
        mel_spec = evaluated_tensors[0][0]
        # Model outputs log-mel; transpose assumes (batch, mels, frames)
        # -> (batch, frames, mels) — TODO confirm layout against t2_postnet.
        log_mel = mel_spec.cpu().numpy().transpose(0, 2, 1)
        mel = np.exp(log_mel)
        # Project mel back to a linear-frequency magnitude spectrogram via
        # the (pseudo-inverse-like) mel filterbank, scaled by mag_scale.
        filterbank = librosa.filters.mel(
            sr = self.tacotron2_params["sample_rate"],
            n_fft = self.tacotron2_params["n_fft"],
            n_mels = self.tacotron2_params["n_mels"],
            fmax = self.tacotron2_params["fmax"])
        sample = np.dot(mel, filterbank) * self.tts_conf["mag_scale"]
        # Trim padding frames beyond the predicted mel length.
        sample = sample[0][:mel_len[0][0], :]
        # convert magnitude spectrograms to audio signal
        magnitudes = sample.T ** self.tts_conf["power"]
        # Start Griffin-Lim from a random phase estimate.
        phase = np.exp(2j * np.pi * np.random.rand(*magnitudes.shape))
        complex_spec = magnitudes * phase
        signal = librosa.istft(complex_spec)
        if np.isfinite(signal).all():
            # Iteratively refine phase: keep the magnitudes, take the
            # phase from the STFT of the current signal estimate.
            for _ in range(self.tts_conf["n_iters"]):
                _, phase = librosa.magphase(
                    librosa.stft(signal, n_fft = self.tts_conf["n_fft"]))
                complex_spec = magnitudes * phase
                signal = librosa.istft(complex_spec)
        else:
            # Degenerate spectrogram — emit a silent one-sample signal
            # rather than propagating NaN/inf into the wav file.
            self.nf.logger.warn("audio was not finite")
            signal = np.array([0])
        outwav = "griffin_sample.wav"
        wavfile.write(outwav, self.tacotron2_params["sample_rate"], signal)
        self.nf.logger.info("Wav file was generated and named: " + outwav)
    elif self.tts_conf["vocoder"] == "waveglow":
        self.nf.logger.info("Running Waveglow as a vocoder")
        audio_pred = self.waveglow(mel_spectrogram=mel_postnet)
        # Run waveglow
        # NOTE(review): this call spells the cache flag `use_cache` while
        # the Tacotron pass above used `cache` — looks like differing NeMo
        # API signatures; verify both are valid for this NeMo version.
        evaluated_tensors = self.nf.infer(
            tensors = [audio_pred],
            modules_to_restore = [self.waveglow],
            use_cache = True)
        # Despite the name, this is the predicted audio tensor from
        # WaveGlow, not a mel spectrogram.
        mel_spec = evaluated_tensors[0][0]
        sample = mel_spec.cpu().numpy()[0]
        # Convert mel-frame length to sample count via the hop/stride.
        sample_len = mel_len[0][0] * self.tacotron2_params["n_stride"]
        sample = sample[:sample_len]
        # apply denoiser
        waveglow_denoiser_strength = self.tts_conf["denoising_strength"]
        if waveglow_denoiser_strength > 0:
            sample, spec = self.waveglow.denoise(
                sample, strength = waveglow_denoiser_strength)
        else:
            # spec is computed but unused when no denoising is applied.
            spec, _ = librosa.core.magphase(librosa.core.stft(
                sample, n_fft = self.waveglow_params["n_fft"]))
        outwav = "waveglow_sample.wav"
        wavfile.write(outwav, self.waveglow_params["sample_rate"], sample)
        self.nf.logger.info("Wav file was generated and named: " + outwav)
    # NOTE(review): any other vocoder value falls through silently with no
    # output file — consider logging or raising in a follow-up.