# Inference script: load a trained TTSglow checkpoint, synthesize mel
# spectrograms for the first few test utterances, then save and plot each one.
# NOTE(review): `model`, `device`, `collate_fn`, `FastSpeechDataset`,
# `DataLoader`, and `plot_data` are expected to be defined elsewhere in
# this file — confirm against the surrounding module.

# map_location keeps the load robust when the checkpoint was saved on a
# GPU machine but inference runs on CPU (or a different device).
checkpoint = torch.load('test/TTSglow_130000', map_location=device)
model.load_state_dict(checkpoint['model'].state_dict())

dataset = FastSpeechDataset()
testing_loader = DataLoader(dataset,
                            batch_size=1,
                            shuffle=False,
                            collate_fn=collate_fn,
                            drop_last=True,
                            num_workers=4)
model = model.eval()

# Create the output directory up front so the first torch.save cannot fail.
os.makedirs("results", exist_ok=True)

for i, data_of_batch in enumerate(testing_loader):
    src_seq = data_of_batch["texts"]
    src_pos = data_of_batch["pos"]
    src_seq = torch.from_numpy(src_seq).long().to(device)
    src_pos = torch.from_numpy(src_pos).long().to(device)

    # Inference only — no_grad avoids building the autograd graph
    # (consistent with the other inference path in this file).
    with torch.no_grad():
        mel = model.inference(src_seq, src_pos, sigma=1.0, alpha=1.0)
    mel = mel.squeeze()
    print(mel.size())

    mel_path = os.path.join("results", "{}_synthesis.pt".format(i))
    torch.save(mel, mel_path)
    plot_data([mel.cpu().numpy().T], i)

    # Only synthesize a handful of samples, then stop.
    if i > 10:
        break
# Audio-inference fragment: run the model on one batch, scale the output to
# 16-bit PCM range, and prepare both the synthesized and the target audio
# as numpy arrays for comparison/writing.
# NOTE(review): this fragment reads `data_of_batch`, `model`, `device`,
# `MAX_WAV_VALUE`, and `audio_tgt` (apparently a list of target tensors)
# from an enclosing scope that is not visible here — presumably a loop
# over a test DataLoader; confirm against the surrounding code.
src_seq = data_of_batch["texts"]
src_pos = data_of_batch["pos"]
mel_tgt = data_of_batch["mels"]
alignment_target = data_of_batch["alignment"]

src_seq = torch.from_numpy(src_seq).long().to(device)
src_pos = torch.from_numpy(src_pos).long().to(device)
mel_tgt = torch.from_numpy(mel_tgt).float().to(device)
alignment_target = torch.from_numpy(alignment_target).float().to(device)
mel_max_len = mel_tgt.size(1)

# Inference only — skip autograd bookkeeping.
with torch.no_grad():
    audio = model.inference(src_seq,
                            src_pos,
                            mel_max_len,
                            alignment_target,
                            sigma=1.0,
                            alpha=1.0)

audio = audio * MAX_WAV_VALUE
audio = audio.squeeze()
print(torch.mean(audio))
# Clamp to the int16 range BEFORE the astype below: float samples outside
# [-32768, 32767] would otherwise wrap around on the numpy cast and
# produce loud click artifacts in the written audio.
audio = audio.clamp(-MAX_WAV_VALUE, MAX_WAV_VALUE - 1)
audio = audio.cpu().numpy()

# Concatenate the collected target chunks and scale them the same way.
audio_tgt = torch.cat(audio_tgt)
audio_tgt = audio_tgt * MAX_WAV_VALUE
print(torch.mean(audio_tgt))
#print (audio_tgt)
audio_tgt = audio_tgt.squeeze()
audio_tgt = audio_tgt.cpu().numpy()

audio = audio.astype('int16')