def tts(model, text, p=0, speaker_id=None, fast=False, wavenet=None):
    """Convert text to speech waveform given a deepvoice3 model.

    Args:
        text (str): Input text to be synthesized.
        p (float): Replace a word with its pronunciation with probability p if p > 0. Default is 0.
    """
    model = model.to(device)
    model.eval()
    if fast:
        model.make_generation_fast_()

    sequence = np.array(_frontend.text_to_sequence(text, p=p))
    print('sequence to synthesize: ', sequence)
    sequence = torch.from_numpy(sequence).unsqueeze(0).long().to(device)
    text_positions = torch.arange(
        1, sequence.size(-1) + 1).unsqueeze(0).long().to(device)
    speaker_ids = None if speaker_id is None else torch.LongTensor(
        [speaker_id]).to(device)

    # Greedy decoding
    with torch.no_grad():
        mel_outputs, linear_outputs, alignments, done = model(
            sequence, text_positions=text_positions, speaker_ids=speaker_ids)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()
    mel = mel_outputs[0].cpu().data.numpy()
    mel = audio._denormalize(mel)

    # Predicted audio signal
    if wavenet is not None:
        wavenet = wavenet.to(device)
        wavenet.eval()
        if fast:
            wavenet.make_generation_fast_()

        # TODO: assuming scalar input
        initial_value = 0.0
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value).to(device)

        # (B, T, C) -> (B, C, T)
        c = mel_outputs.transpose(1, 2).contiguous()
        g = None
        Tc = c.size(-1)
        length = Tc * 256

        initial_input = initial_input.to(device)
        c = c.to(device)

        waveform = wavenet.incremental_forward(
            initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True,
            quantize=True, log_scale_min=float(np.log(1e-14)))
        waveform = waveform.view(-1).cpu().data.numpy()
    else:
        waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel
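def _example_synthesize_torch(model, text, wavenet=None, out_path="sample.wav"):
    """Hedged usage sketch (not part of the original source).

    Shows one way the PyTorch tts() above might be called. `model` (and
    optionally `wavenet`) are assumed to be already built and loaded from
    checkpoints elsewhere; when `wavenet` is None, tts() falls back to
    Griffin-Lim via audio.inv_spectrogram(). audio.save_wav() is the helper
    used by the training code in this repository.
    """
    waveform, alignment, spectrogram, mel = tts(
        model, text, p=0., speaker_id=None, fast=True, wavenet=wavenet)
    audio.save_wav(waveform, out_path)
    return alignment, spectrogram, mel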
def tts(model, text, p=0., speaker_id=None):
    """Convert text to speech waveform given a deepvoice3 model.

    Args:
        model (DeepVoiceTTS): Model used to synthesize the waveform.
        text (str): Input text to be synthesized.
        p (float): Replace a word with its pronunciation with probability p if p > 0. Default is 0.

    Returns:
        waveform (numpy.ndarray): Shape (T_wav, ), predicted waveform, where
            T_wav is the length of the synthesized waveform.
        alignment (numpy.ndarray): Shape (T_dec, T_enc), predicted alignment
            matrix, where T_dec is the number of decoder time steps and T_enc
            is the number of encoder time steps.
        spectrogram (numpy.ndarray): Shape (T_lin, C_lin), predicted linear
            spectrogram, where T_lin is the number of linear spectrogram
            frames and C_lin is the number of linear spectrogram channels.
        mel (numpy.ndarray): Shape (T_mel, C_mel), predicted mel spectrogram,
            where T_mel is the number of mel spectrogram frames and C_mel is
            the number of mel spectrogram channels.
    """
    model.eval()

    sequence = np.array(_frontend.text_to_sequence(text, p=p)).astype("int64")
    sequence = np.reshape(sequence, (1, -1))
    text_positions = np.arange(1, sequence.shape[1] + 1, dtype="int64")
    text_positions = np.reshape(text_positions, (1, -1))

    sequence = dg.to_variable(sequence)
    text_positions = dg.to_variable(text_positions)
    speaker_ids = None if speaker_id is None else fluid.layers.fill_constant(
        shape=[1, 1], dtype="int64", value=speaker_id)

    # sequence: shape(1, input_length)
    # text_positions: shape(1, input_length)
    # Greedy decoding
    mel_outputs, linear_outputs, alignments, done = model.transduce(
        sequence, text_positions, speaker_ids)

    # reshape to the desired shape
    linear_output = linear_outputs.numpy().squeeze().T
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments.numpy()[0]
    mel = mel_outputs.numpy().squeeze().T
    mel = audio._denormalize(mel)

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel
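def _example_synthesize_paddle(model, text, out_path="sample.wav"):
    """Hedged usage sketch (not part of the original source).

    Assumes the caller is already running in dygraph mode (the fluid-dygraph
    tts() above relies on dg.to_variable), that `model` is a constructed,
    checkpoint-loaded DeepVoiceTTS instance, and that audio.save_wav()
    behaves as it does in the training code in this repository.
    """
    waveform, alignment, spectrogram, mel = tts(model, text, p=0.)
    audio.save_wav(waveform, out_path)
    return alignment, spectrogram, mel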
def save_spectrogram(path, linear_output):
    """Denormalize a linear spectrogram and save it as a png image."""
    spectrogram = audio._denormalize(linear_output)
    plt.figure(figsize=(16, 10))
    plt.imshow(spectrogram.T, aspect="auto", origin="lower")
    plt.colorbar()
    plt.tight_layout()
    plt.savefig(path, format="png")
    plt.close()
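def _example_save_spectrogram(path, linear_outputs):
    """Hedged usage sketch (not part of the original source).

    save_spectrogram() expects a *normalized* linear spectrogram and calls
    audio._denormalize itself, so pass the raw model output (e.g. one item of
    `linear_outputs`), not the already denormalized `spectrogram` returned by
    tts(). `linear_outputs` here is assumed to be the dygraph variable
    returned by the model, as in the fluid-dygraph tts() above.
    """
    linear_output = linear_outputs.numpy().squeeze().T  # shape (T_lin, C_lin)
    save_spectrogram(path, linear_output)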
def tts(model, text, speaker_id=None, fast=False):
    """Convert text to speech waveform given a deepvoice3 model."""
    model = model.to(device)
    model.eval()
    if fast:
        model.make_generation_fast_()

    if cfg.frontend == "en":
        sequence = np.array(english.text_to_sequence(text))
    else:
        raise NotImplementedError

    sequence = torch.from_numpy(sequence).unsqueeze(0).long().to(device)
    text_positions = torch.arange(
        1, sequence.size(-1) + 1).unsqueeze(0).long().to(device)
    speaker_ids = None if speaker_id is None else torch.LongTensor(
        [speaker_id]).to(device)

    # Greedy decoding
    with torch.no_grad():
        mel_outputs, linear_outputs, alignments, done = model(
            sequence, text_positions=text_positions, speaker_ids=speaker_ids)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()
    mel = mel_outputs[0].cpu().data.numpy()
    mel = audio._denormalize(mel)

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel
def save_states(global_step, writer, mel_outputs, linear_outputs, attn, mel, y,
                input_lengths, checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))

    # idx = np.random.randint(0, len(input_lengths))
    idx = min(1, len(input_lengths) - 1)
    input_length = input_lengths[idx]

    # Alignment
    # Multi-hop attention
    if attn is not None and attn.dim() == 4:
        for i, alignment in enumerate(attn):
            alignment = alignment[idx].cpu().data.numpy()
            tag = "alignment_layer{}".format(i + 1)
            writer.add_image(
                tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),
                global_step)

            # save files as well for now
            alignment_dir = join(checkpoint_dir,
                                 "alignment_layer{}".format(i + 1))
            os.makedirs(alignment_dir, exist_ok=True)
            path = join(alignment_dir,
                        "step{:09d}_layer_{}_alignment.png".format(
                            global_step, i + 1))
            save_alignment(path, alignment)

        # Save averaged alignment
        alignment_dir = join(checkpoint_dir, "alignment_ave")
        os.makedirs(alignment_dir, exist_ok=True)
        path = join(alignment_dir,
                    "step{:09d}_alignment.png".format(global_step))
        alignment = attn.mean(0)[idx].cpu().data.numpy()
        save_alignment(path, alignment)

        tag = "averaged_alignment"
        writer.add_image(
            tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),
            global_step)

    # Predicted mel spectrogram
    if mel_outputs is not None:
        mel_output = mel_outputs[idx].cpu().data.numpy()
        if hparams.vocoder != "world":
            mel_output = prepare_spec_image(audio._denormalize(mel_output))
            writer.add_image("Predicted mel spectrogram", mel_output,
                             global_step)
        else:
            mel_output_prep = mel_output
            try:
                writer.add_image("Predicted WORLD output", mel_output_prep,
                                 global_step)
            except Exception:
                pass

            # Synthesize the predicted WORLD features back to a waveform
            mel_output = denormalize(mel_output)
            nfft = pw.get_cheaptrick_fft_size(hparams.sample_rate)
            f0 = mel_output[:, 0].astype(np.float64)
            sp = pw.decode_spectral_envelope(
                mel_output[:, 1:(hparams.coded_env_dim + 1)].astype(np.float64),
                hparams.sample_rate, nfft)
            ap = pw.decode_aperiodicity(
                mel_output[:, (hparams.coded_env_dim + 1):hparams.num_mels].astype(np.float64),
                hparams.sample_rate, nfft)
            signal = pw.synthesize(f0, sp, ap, hparams.sample_rate,
                                   pw.default_frame_period)

            path = join(checkpoint_dir,
                        "step{:09d}_out.wav".format(global_step))
            audio.save_wav(signal, path)

            try:
                signal /= np.max(np.abs(signal))
                writer.add_audio("Predicted audio signal", signal, global_step,
                                 sample_rate=fs)
            except Exception:
                print("Unexpected error:", sys.exc_info())

            # Synthesize the target WORLD features as a reference
            mel_tgt = mel[idx].cpu().data.numpy()
            mel_tgt = denormalize(mel_tgt)
            f0 = mel_tgt[:, 0].astype(np.float64)
            sp = pw.decode_spectral_envelope(
                mel_tgt[:, 1:(hparams.coded_env_dim + 1)].astype(np.float64),
                hparams.sample_rate, nfft)
            ap = pw.decode_aperiodicity(
                mel_tgt[:, (hparams.coded_env_dim + 1):hparams.num_mels].astype(np.float64),
                hparams.sample_rate, nfft)
            signal = pw.synthesize(f0, sp, ap, hparams.sample_rate,
                                   pw.default_frame_period)

            try:
                signal /= np.max(np.abs(signal))
                writer.add_audio("Target audio signal", signal, global_step,
                                 sample_rate=hparams.sample_rate)
            except Exception:
                print("Unexpected error:", sys.exc_info())

    # Predicted spectrogram
    if linear_outputs is not None:
        linear_output = linear_outputs[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image("Predicted linear spectrogram", spectrogram,
                         global_step)

        # Predicted audio signal
        signal = audio.inv_spectrogram(linear_output.T)
        signal /= np.max(np.abs(signal))
        path = join(checkpoint_dir,
                    "step{:09d}_predicted.wav".format(global_step))
        try:
            writer.add_audio("Predicted audio signal", signal, global_step,
                             sample_rate=fs)
        except Exception as e:
            warn(str(e))
        audio.save_wav(signal, path)

    # Target mel spectrogram
    if mel_outputs is not None:
        mel_output = mel[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        writer.add_image("Target mel spectrogram", mel_output, global_step)

    # Target spectrogram
    if linear_outputs is not None:
        linear_output = y[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image("Target linear spectrogram", spectrogram, global_step)

    # Dump target and predicted mel features as .npy files
    path = join(checkpoint_dir,
                "step{:09d}_mel_target.npy".format(global_step))
    mel_output = mel[idx].cpu().data.numpy()
    np.save(path, denormalize(mel_output))

    path = join(checkpoint_dir,
                "step{:09d}_mel_out.npy".format(global_step))
    mel_output = denormalize(mel_outputs[idx].cpu().data.numpy())
    np.save(path, mel_output)
def eval_model(global_step, writer, device, model, checkpoint_dir,
               ismultispeaker):
    # hard coded
    texts = [
        "And debtors might practically have as much as they liked%if they could only pay for it.",
        "There's a way to measure the acute emotional intelligence that has never gone out of style.",
        "President trump met with other leaders at the group of 20 conference.",
        "Generative adversarial network or variational auto encoder.",
        "Please call stella.",
        "Some have accepted this as a miracle without any physical explanation.",
    ]
    import synthesis
    synthesis._frontend = _frontend

    eval_output_dir = join(checkpoint_dir, "eval")
    os.makedirs(eval_output_dir, exist_ok=True)

    # Prepare model for evaluation
    model_eval = tm.build_model().to(device)
    model_eval.load_state_dict(model.state_dict())

    # hard coded
    speaker_ids = [0, 1, 10] if ismultispeaker else [None]
    for speaker_id in speaker_ids:
        speaker_str = "multispeaker{}".format(
            speaker_id) if speaker_id is not None else "single"

        for idx, text in enumerate(texts, 1):
            model_eval.eval()
            model_eval.make_generation_fast_()
            sequence = np.array(_frontend.text_to_sequence(text, p=0.5))
            sequence = torch.from_numpy(sequence).unsqueeze(0).long().to(
                device)
            text_positions = torch.arange(
                1, sequence.size(-1) + 1).unsqueeze(0).long().to(device)
            # use a separate name so the outer speaker_ids list is not clobbered
            speaker_ids_tensor = None if speaker_id is None else torch.LongTensor(
                [speaker_id]).to(device)

            # Greedy decoding
            with torch.no_grad():
                mel, alignments, done = model_eval(
                    sequence, text_positions=text_positions,
                    speaker_ids=speaker_ids_tensor)

            alignments = alignments[0].cpu().data.numpy()
            mel = mel[0].cpu().data.numpy()
            mel = audio._denormalize(mel)

            # Alignment
            for i, alignment in enumerate(alignments, 1):
                alignment_dir = join(eval_output_dir,
                                     "alignment_layer{}".format(i))
                os.makedirs(alignment_dir, exist_ok=True)
                path = join(
                    alignment_dir,
                    "step{:09d}_text{}_{}_layer{}_alignment.png".format(
                        global_step, idx, speaker_str, i))
                tm.save_alignment(path, alignment, global_step)

                tag = "eval_text_{}_alignment_layer{}_{}".format(
                    idx, i, speaker_str)
                writer.add_image(
                    tag,
                    np.uint8(cm.viridis(np.flip(alignment, 1)) * 255).T,
                    global_step)

            # Mel
            writer.add_image(
                "(Eval) Predicted mel spectrogram text{}_{}".format(
                    idx, speaker_str),
                tm.prepare_spec_image(mel).transpose(2, 0, 1), global_step)
def save_states(global_step, writer, mel_outputs, linear_outputs, attn, mel, y,
                input_lengths, checkpoint_dir=None):
    """Save intermediate states for the training process."""
    print("[train] Saving intermediate states at step {}".format(global_step))

    idx = min(1, len(input_lengths) - 1)
    input_length = input_lengths[idx]

    # Alignment, Multi-hop attention
    if attn is not None and len(attn.shape) == 4:
        attn = attn.numpy()
        for i in range(attn.shape[0]):
            alignment = attn[i]
            alignment = alignment[idx]

            tag = "alignment_layer{}".format(i + 1)
            writer.add_image(
                tag,
                np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),
                global_step,
                dataformats='HWC')

            alignment_dir = join(checkpoint_dir,
                                 "alignment_layer{}".format(i + 1))
            if not os.path.exists(alignment_dir):
                os.makedirs(alignment_dir)
            path = join(
                alignment_dir,
                "step{:09d}_layer_{}_alignment.png".format(global_step, i + 1))
            save_alignment(global_step, path, alignment)

        alignment_dir = join(checkpoint_dir, "alignment_ave")
        if not os.path.exists(alignment_dir):
            os.makedirs(alignment_dir)
        path = join(alignment_dir,
                    "step{:09d}_alignment.png".format(global_step))
        alignment = np.mean(attn, axis=0)[idx]
        save_alignment(global_step, path, alignment)

        tag = "averaged_alignment"
        writer.add_image(
            tag,
            np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),
            global_step,
            dataformats="HWC")

    if mel_outputs is not None:
        mel_output = mel_outputs[idx].numpy().squeeze().T
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        writer.add_image(
            "Predicted_mel_spectrogram",
            mel_output,
            global_step,
            dataformats="HWC")

    if linear_outputs is not None:
        linear_output = linear_outputs[idx].numpy().squeeze().T
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image(
            "Predicted_linear_spectrogram",
            spectrogram,
            global_step,
            dataformats="HWC")

        signal = audio.inv_spectrogram(linear_output.T)
        signal /= np.max(np.abs(signal))
        path = join(checkpoint_dir,
                    "step{:09d}_predicted.wav".format(global_step))
        try:
            writer.add_audio(
                "Predicted_audio_signal",
                signal,
                global_step,
                sample_rate=hparams.sample_rate)
        except Exception as e:
            warn(str(e))
        audio.save_wav(signal, path)

    if mel_outputs is not None:
        mel_output = mel[idx].numpy().squeeze().T
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        writer.add_image(
            "Target_mel_spectrogram",
            mel_output,
            global_step,
            dataformats="HWC")

    if linear_outputs is not None:
        linear_output = y[idx].numpy().squeeze().T
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image(
            "Target_linear_spectrogram",
            spectrogram,
            global_step,
            dataformats="HWC")