def tts(model, text):
    """Synthesize speech for ``text`` using a Tacotron ``model``.

    Args:
        model: Tacotron model. It is called with the encoded input batch and
            must return a 3-tuple whose second and third items are the
            linear outputs and the alignments. It must expose ``encoder``
            and ``postnet`` submodules.
        text: Input text, encoded with ``text_to_sequence`` using
            ``hparams.cleaners``.

    Returns:
        A tuple ``(waveform, alignment, spectrogram)`` where ``waveform`` is
        ``audio.inv_spectrogram`` applied to the transposed linear output of
        the first batch item, ``alignment`` is the first batch item of the
        alignments as a NumPy array, and ``spectrogram`` is the same linear
        output passed through ``audio._denormalize``.
    """
    if use_cuda:
        model = model.cuda()

    # Only the encoder and the postnet are switched to eval mode.
    # TODO: Turning off dropout of decoder's prenet causes serious performance
    # regression, not sure why.
    # model.decoder.eval()
    model.encoder.eval()
    model.postnet.eval()

    # Encode the text and add a leading batch dimension of size one.
    token_ids = np.array(text_to_sequence(text, [hparams.cleaners]))
    batch = Variable(torch.from_numpy(token_ids)).unsqueeze(0)
    if use_cuda:
        batch = batch.cuda()

    # Greedy decoding; the first (mel) output is not needed here.
    _, linear_batch, alignment_batch = model(batch)

    # Pull the single batch item back to host memory as NumPy arrays.
    linear = linear_batch[0].cpu().data.numpy()
    denormalized = audio._denormalize(linear)
    attention = alignment_batch[0].cpu().data.numpy()

    # Predicted audio signal
    signal = audio.inv_spectrogram(linear.T)

    return signal, attention, denormalized
def save_spectrogram(path, linear_output):
    """Render a linear spectrogram to a PNG image.

    Args:
        path: Destination passed to ``plt.savefig``; the image is always
            written in PNG format regardless of the file extension.
        linear_output: Normalized linear spectrogram; it is passed through
            ``audio._denormalize`` before plotting.

    The figure is always closed, even if plotting or saving fails, so that
    repeated calls do not accumulate open figures in pyplot.
    """
    spectrogram = audio._denormalize(linear_output)

    # Keep the handle so that exactly this figure is closed afterwards,
    # rather than whatever pyplot considers "current" at that point.
    fig = plt.figure(figsize=(16, 10))
    try:
        # Transposed for display; presumably this puts time on the x-axis --
        # TODO confirm against the layout produced by the model.
        plt.imshow(spectrogram.T, aspect="auto", origin="lower")
        plt.colorbar()
        plt.tight_layout()
        plt.savefig(path, format="png")
    finally:
        plt.close(fig)