Exemplo n.º 1
0
    def generate_plots(self, model: ForwardTacotron,
                       session: ForwardSession) -> None:
        model.eval()
        device = next(model.parameters()).device
        x, m, ids, x_lens, mel_lens, dur = session.val_sample
        x, m, dur, mel_lens = x.to(device), m.to(device), dur.to(
            device), mel_lens.to(device)

        m1_hat, m2_hat, dur_hat = model(x, m, dur, mel_lens)
        m1_hat = np_now(m1_hat)[0, :600, :]
        m2_hat = np_now(m2_hat)[0, :600, :]
        m = np_now(m)[0, :600, :]

        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)
        m_fig = plot_mel(m)
        # pitch_fig = plot_pitch(np_now(pitch[0]))
        # pitch_gta_fig = plot_pitch(np_now(pitch_hat.squeeze()[0]))

        # self.writer.add_figure('Pitch/target', pitch_fig, model.step)
        # self.writer.add_figure('Pitch/ground_truth_aligned', pitch_gta_fig, model.step)
        self.writer.add_figure('Ground_Truth_Aligned/target', m_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig,
                               model.step)

        m2_hat_wav = reconstruct_waveform(m2_hat)
        target_wav = reconstruct_waveform(m)

        self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)

        m1_hat, m2_hat, dur_hat = model.generate(x[0, :x_lens[0]].tolist())
        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)

        # pitch_gen_fig = plot_pitch(np_now(pitch_hat.squeeze()))

        # self.writer.add_figure('Pitch/generated', pitch_gen_fig, model.step)
        self.writer.add_figure('Generated/target', m_fig, model.step)
        self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
        self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

        m2_hat_wav = reconstruct_waveform(m2_hat)
        self.writer.add_audio(tag='Generated/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Generated/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
Exemplo n.º 2
0
 def evaluate(self, model: ForwardTacotron, val_set: Dataset) -> Tuple[float, float,float]:
     model.eval()
     m_val_loss = 0
     dur_val_loss = 0
     pitch_val_loss = 0
     device = next(model.parameters()).device
     for i, (x, m, ids, x_lens, mel_lens, dur, pitch, puncts) in enumerate(
         val_set, 1
     ):
         x, m, dur, x_lens, mel_lens, pitch, puncts = (
             x.to(device),
             m.to(device),
             dur.to(device),
             x_lens.to(device),
             mel_lens.to(device),
             pitch.to(device),
             puncts.to(device),
         )
         with torch.no_grad():
             m1_hat, m2_hat, dur_hat, pitch_hat = model(
                 x, m, dur, mel_lens, pitch, puncts
             )
             m1_loss = self.l1_loss(m1_hat, m, mel_lens)
             m2_loss = self.l1_loss(m2_hat, m, mel_lens)
             dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1), x_lens)
             pitch_val_loss += self.l1_loss(pitch_hat, pitch.unsqueeze(1), x_lens)
             m_val_loss += m1_loss.item() + m2_loss.item()
             dur_val_loss += dur_loss.item()
     m_val_loss /= len(val_set)
     dur_val_loss /= len(val_set)
     pitch_val_loss /= len(val_set)
     return m_val_loss, dur_val_loss, pitch_val_loss
Exemplo n.º 3
0
 def evaluate(self, model: ForwardTacotron,
              val_set: DataLoader) -> Dict[str, float]:
     model.eval()
     m_val_loss = 0
     dur_val_loss = 0
     pitch_val_loss = 0
     energy_val_loss = 0
     device = next(model.parameters()).device
     for i, batch in enumerate(val_set, 1):
         batch = to_device(batch, device=device)
         with torch.no_grad():
             pred = model(batch)
             m1_loss = self.l1_loss(pred['mel'], batch['mel'],
                                    batch['mel_len'])
             m2_loss = self.l1_loss(pred['mel_post'], batch['mel'],
                                    batch['mel_len'])
             dur_loss = self.l1_loss(pred['dur'].unsqueeze(1),
                                     batch['dur'].unsqueeze(1),
                                     batch['x_len'])
             pitch_loss = self.l1_loss(pred['pitch'],
                                       batch['pitch'].unsqueeze(1),
                                       batch['x_len'])
             energy_loss = self.l1_loss(pred['energy'],
                                        batch['energy'].unsqueeze(1),
                                        batch['x_len'])
             pitch_val_loss += pitch_loss
             energy_val_loss += energy_loss
             m_val_loss += m1_loss.item() + m2_loss.item()
             dur_val_loss += dur_loss.item()
     return {
         'mel_loss': m_val_loss / len(val_set),
         'dur_loss': dur_val_loss / len(val_set),
         'pitch_loss': pitch_val_loss / len(val_set),
         'energy_loss': energy_val_loss / len(val_set)
     }
Exemplo n.º 4
0
    def generate_plots(self, model: ForwardTacotron,
                       session: TTSSession) -> None:
        model.eval()
        device = next(model.parameters()).device
        x, m, ids, lens, dur = session.val_sample
        x, m, dur = x.to(device), m.to(device), dur.to(device)

        m1_hat, m2_hat, dur_hat = model(x, m, dur)
        m1_hat = np_now(m1_hat)[0, :600, :]
        m2_hat = np_now(m2_hat)[0, :600, :]
        m = np_now(m)[0, :600, :]

        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)
        m_fig = plot_mel(m)

        self.writer.add_figure('Ground_Truth_Aligned/target', m_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig,
                               model.step)

        m1_hat, m2_hat, m = rescale_mel(m1_hat), rescale_mel(
            m2_hat), rescale_mel(m)
        m2_hat_wav = reconstruct_waveform(m2_hat)
        target_wav = reconstruct_waveform(m)

        self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)

        m1_hat, m2_hat, dur_hat = model.generate(x[0].tolist())
        m1_hat, m2_hat = rescale_mel(m1_hat), rescale_mel(m2_hat)
        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)

        self.writer.add_figure('Generated/target', m_fig, model.step)
        self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
        self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

        m2_hat_wav = reconstruct_waveform(m2_hat)

        self.writer.add_audio(tag='Generated/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Generated/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
Exemplo n.º 5
0
 def evaluate(self, model: ForwardTacotron, val_set: Dataset) -> Tuple[float, float]:
     model.eval()
     m_val_loss = 0
     dur_val_loss = 0
     device = next(model.parameters()).device
     for i, (x, m, ids, lens, dur) in enumerate(val_set, 1):
         x, m, dur, lens = x.to(device), m.to(device), dur.to(device), lens.to(device)
         with torch.no_grad():
             m1_hat, m2_hat, dur_hat = model(x, m, dur)
             m1_loss = self.l1_loss(m1_hat, m, lens)
             m2_loss = self.l1_loss(m2_hat, m, lens)
             dur_loss = F.l1_loss(dur_hat, dur)
             m_val_loss += m1_loss.item() + m2_loss.item()
             dur_val_loss += dur_loss.item()
     return m_val_loss / len(val_set), dur_val_loss / len(val_set)
Exemplo n.º 6
0
    def generate_plots(self, model: ForwardTacotron,
                       session: TTSSession) -> None:
        model.eval()
        device = next(model.parameters()).device
        batch = session.val_sample
        batch = to_device(batch, device=device)

        pred = model(batch)
        m1_hat = np_now(pred['mel'])[0, :600, :]
        m2_hat = np_now(pred['mel_post'])[0, :600, :]
        m_target = np_now(batch['mel'])[0, :600, :]

        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)
        m_target_fig = plot_mel(m_target)
        pitch_fig = plot_pitch(np_now(batch['pitch'][0]))
        pitch_gta_fig = plot_pitch(np_now(pred['pitch'].squeeze()[0]))
        energy_fig = plot_pitch(np_now(batch['energy'][0]))
        energy_gta_fig = plot_pitch(np_now(pred['energy'].squeeze()[0]))

        self.writer.add_figure('Pitch/target', pitch_fig, model.step)
        self.writer.add_figure('Pitch/ground_truth_aligned', pitch_gta_fig,
                               model.step)
        self.writer.add_figure('Energy/target', energy_fig, model.step)
        self.writer.add_figure('Energy/ground_truth_aligned', energy_gta_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/target', m_target_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig,
                               model.step)

        m2_hat_wav = self.dsp.griffinlim(m2_hat)
        target_wav = self.dsp.griffinlim(m_target)

        self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=self.dsp.sample_rate)
        self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=self.dsp.sample_rate)

        gen = model.generate(batch['x'][0:1, :batch['x_len'][0]])

        m1_hat_fig = plot_mel(np_now(gen['mel']))
        m2_hat_fig = plot_mel(np_now(gen['mel_post']))

        pitch_gen_fig = plot_pitch(np_now(gen['pitch'].squeeze()))
        energy_gen_fig = plot_pitch(np_now(gen['energy'].squeeze()))

        self.writer.add_figure('Pitch/generated', pitch_gen_fig, model.step)
        self.writer.add_figure('Energy/generated', energy_gen_fig, model.step)
        self.writer.add_figure('Generated/target', m_target_fig, model.step)
        self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
        self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

        m2_hat_wav = self.dsp.griffinlim(m2_hat)

        self.writer.add_audio(tag='Generated/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=self.dsp.sample_rate)
        self.writer.add_audio(tag='Generated/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=self.dsp.sample_rate)
Exemplo n.º 7
0
def main():
    # Parse Arguments
    parser = argparse.ArgumentParser(description='TTS Generator')

    parser.add_argument(
        '--tts_weights',
        type=str,
        help='[string/path] Load in different FastSpeech weights')

    parser.add_argument('--hp_file',
                        metavar='FILE',
                        default='hparams.py',
                        help='The file to use for the hyperparameters')
    parser.add_argument(
        '--alpha',
        type=float,
        default=1.,
        help='Parameter for controlling length regulator for speedup '
        'or slow-down of generated speech, e.g. alpha=2.0 is double-time')

    if not os.path.exists('onnx'):
        os.mkdir('onnx')

    args = parser.parse_args()

    hp.configure(args.hp_file)

    input_text = "the forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."
    tts_weights = args.tts_weights

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising Forward TTS Model...\n')
    tts_model = ForwardTacotron(embed_dims=hp.forward_embed_dims,
                                num_chars=len(symbols),
                                durpred_rnn_dims=hp.forward_durpred_rnn_dims,
                                durpred_conv_dims=hp.forward_durpred_conv_dims,
                                rnn_dim=hp.forward_rnn_dims,
                                postnet_k=hp.forward_postnet_K,
                                postnet_dims=hp.forward_postnet_dims,
                                prenet_k=hp.forward_prenet_K,
                                prenet_dims=hp.forward_prenet_dims,
                                highways=hp.forward_num_highways,
                                dropout=hp.forward_dropout,
                                n_mels=hp.num_mels).to(device)

    tts_load_path = tts_weights or paths.forward_latest_weights
    tts_model.load(tts_load_path)

    encoder = DurationPredictor(tts_model)
    decoder = Tacotron(tts_model)

    tts_model.eval()
    encoder.eval()
    decoder.eval()

    opset_version = 10

    with torch.no_grad():
        input_seq = text_to_sequence(input_text.strip(), hp.tts_cleaner_names)
        input_seq = torch.as_tensor(input_seq, dtype=torch.long,
                                    device=device).unsqueeze(0)
        '''
        FIRST STEP: predict symbols duration
        '''
        torch.onnx.export(encoder,
                          input_seq,
                          "./onnx/forward_tacotron_duration_prediction.onnx",
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["input_seq"],
                          output_names=["embeddings", "duration"])

        x, durations = encoder(input_seq)
        '''
        SECOND STEP: expand symbols by durations
        '''
        x = encoder.lr(x, durations)
        '''
        THIRD STEP: generate mel
        '''
        torch.onnx.export(decoder,
                          x,
                          "./onnx/forward_tacotron_regression.onnx",
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["data"],
                          output_names=["mel"])

    print('Done!')