Example #1
0
    # Restore trained weights: the checkpoint file stores a whole model
    # object under 'model', so we copy its state_dict into the live model.
    # NOTE(review): checkpoint path is hard-coded — consider making it a CLI arg.
    checkpoint = torch.load('test/TTSglow_130000')
    model.load_state_dict(checkpoint['model'].state_dict())

    # Evaluation loader: one utterance per batch, original dataset order.
    dataset = FastSpeechDataset()
    testing_loader = DataLoader(dataset,
                                batch_size=1,
                                shuffle=False,
                                collate_fn=collate_fn,
                                drop_last=True,
                                num_workers=4)
    model = model.eval()  # inference mode: disables dropout / BN updates

    # NOTE(review): this loop runs inference without torch.no_grad(), so
    # autograd buffers are kept needlessly — confirm and wrap if intended.
    for i, data_of_batch in enumerate(testing_loader):
        # collate_fn evidently yields numpy arrays (converted below);
        # "texts" are token ids and "pos" positional indices — TODO confirm.
        src_seq = data_of_batch["texts"]
        src_pos = data_of_batch["pos"]

        src_seq = torch.from_numpy(src_seq).long().to(device)
        src_pos = torch.from_numpy(src_pos).long().to(device)

        # Synthesize a mel-spectrogram; sigma/alpha presumably control
        # sampling temperature and duration scale (Glow-style API — verify).
        mel = model.inference(src_seq, src_pos, sigma=1.0, alpha=1.0)
        mel = mel.squeeze()
        print(mel.size())
        # Save the raw mel tensor and a plot for offline inspection.
        mel_path = os.path.join("results", "{}_synthesis.pt".format(i))
        torch.save(mel, mel_path)
        plot_data([mel.cpu().numpy().T], i)
        if i > 10:  # only synthesize the first dozen samples
            break
        # Commented-out vocoder step left by the author (string literal no-op):
        ''' glow = get_waveglow()
        synthesis_waveglow(mel, glow, i, alpha=1.0)
        print("Synthesized by Waveglow.")'''
Example #2
0
        # Unpack one batch from the (unseen) enclosing loop; values are numpy
        # arrays produced by the collate function.
        src_seq = data_of_batch["texts"]
        src_pos = data_of_batch["pos"]
        mel_tgt = data_of_batch["mels"]
        alignment_target = data_of_batch["alignment"]

        # Move inputs to the compute device with the dtypes the model expects
        # (long for token/position ids, float for mel and alignment targets).
        src_seq = torch.from_numpy(src_seq).long().to(device)
        src_pos = torch.from_numpy(src_pos).long().to(device)
        mel_tgt = torch.from_numpy(mel_tgt).float().to(device)
        alignment_target = torch.from_numpy(alignment_target).float().to(
            device)
        # assumes mel_tgt is (batch, time, n_mels) so dim 1 is the frame
        # count — TODO confirm axis order against the dataset.
        mel_max_len = mel_tgt.size(1)

        # End-to-end synthesis without gradient tracking.
        with torch.no_grad():
            audio = model.inference(src_seq,
                                    src_pos,
                                    mel_max_len,
                                    alignment_target,
                                    sigma=1.0,
                                    alpha=1.0)
            # Scale model output up to the integer PCM amplitude range.
            audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze()
        print(torch.mean(audio))  # quick sanity check of output level
        audio = audio.cpu().numpy()

        # Ground-truth audio: audio_tgt is not bound anywhere in this visible
        # span — presumably a list of tensors accumulated earlier in the loop
        # (torch.cat over a sequence); verify against the enclosing code.
        audio_tgt = torch.cat(audio_tgt)
        audio_tgt = audio_tgt * MAX_WAV_VALUE
        print(torch.mean(audio_tgt))
        #print (audio_tgt)
        audio_tgt = audio_tgt.squeeze()
        audio_tgt = audio_tgt.cpu().numpy()

        audio = audio.astype('int16')  # cast to 16-bit PCM for WAV output