def test_nyanko_basics():
    """Smoke-test a full forward pass of the ``nyanko`` model.

    A model is built once for each value of
    ``use_decoder_state_for_postnet_input`` and called on the pair returned
    by ``_test_data``. Nothing is asserted beyond the forward call
    returning four values; the test passes when no call raises.
    """
    x, y = _test_data()

    # Settings shared by both model variants.
    common_kwargs = dict(
        mel_dim=num_mels,
        linear_dim=num_freq,
        r=1,
        downsample_step=4,
    )

    for use_decoder_state in (False, True):
        model = nyanko(
            n_vocab,
            use_decoder_state_for_postnet_input=use_decoder_state,
            **common_kwargs
        )
        # Unpacking doubles as a check that exactly four outputs come back.
        mel_outputs, linear_outputs, alignments, done = model(x, y)
def test_incremental_path_multiple_times():
    """Check that two identical target-free forward calls agree.

    The model is switched to eval mode and called twice with the same text
    sequence and text positions (no mel targets, no speaker ids). The mel
    outputs of the two calls must match within ``atol=1e-5``.
    Presumably this exercises the incremental decoding path, as the test
    name suggests -- confirm against the model implementation.
    """
    r = 1
    mel_dim = 80

    sentences = ["ni2 hao3 , wo3 shi4 jing3 cha2 ."]
    encoded = np.array([text_to_sequence(s) for s in sentences])
    n_symbols = len(encoded[0])
    # 1-based position indices, shaped (1, n_symbols).
    positions = np.arange(1, n_symbols + 1).reshape(1, n_symbols)

    sequence = torch.LongTensor(encoded)
    text_positions = torch.LongTensor(positions)

    model = nyanko(
        n_vocab,
        mel_dim=mel_dim,
        linear_dim=513,
        downsample_step=4,
        r=r,
        force_monotonic_attention=False,
    )
    model.eval()

    # First pass.
    mel_outputs, linear_outputs, alignments, done = model(
        sequence, text_positions=text_positions, speaker_ids=None)

    # Second pass with exactly the same inputs.
    mel_outputs2, linear_outputs2, alignments2, done2 = model(
        sequence, text_positions=text_positions, speaker_ids=None)

    # Report the absolute difference to ease debugging on failure.
    c = (mel_outputs - mel_outputs2).abs()
    print(c.mean(), c.max())

    first_mel = mel_outputs.cpu().data.numpy()
    second_mel = mel_outputs2.cpu().data.numpy()
    assert np.allclose(first_mel, second_mel, atol=1e-5)
def test_incremental_correctness():
    """Compare offline decoding with incremental decoding on an LJSpeech mel.

    The encoder output for one English sentence is fed to the decoder twice:
    once through the regular call with the whole mel sequence and explicit
    frame positions ("offline"), and once through ``incremental_forward``
    with the same mel frames supplied as ``test_inputs`` ("online"). Both
    mel outputs must be equal under ``np.allclose``.

    The mel fixture lives at a developer-local absolute path; when that file
    is absent the test is skipped instead of erroring out.

    NOTE(review): a second function with this exact name is defined later in
    this file. At import time the later definition replaces this one, so
    this body only runs if one of the two is renamed.

    Raises:
        unittest.SkipTest: if the mel fixture file does not exist.
    """
    # Function-scope imports so this block does not depend on the file header.
    import os
    import unittest

    mel_path = (
        "/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/"
        "ljspeech-mel-00035.npy"
    )
    if not os.path.exists(mel_path):
        raise unittest.SkipTest(
            "mel fixture not available: {}".format(mel_path))

    texts = [
        "they discarded this for a more completely Roman and far less beautiful letter."
    ]
    seqs = np.array([text_to_sequence(t) for t in texts])
    # 1-based position indices, shaped (1, sequence length).
    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    mel = np.load(mel_path)
    max_target_len = mel.shape[0]
    r = 1
    mel_dim = 80
    # Round the target length up to a multiple of r (a no-op while r == 1).
    if max_target_len % r != 0:
        max_target_len += r - max_target_len % r
    assert max_target_len % r == 0
    mel = _pad_2d(mel, max_target_len)

    # Plain tensors are used directly, matching the other tests in this
    # file; the deprecated ``Variable`` wrapper is no longer needed.
    mel = torch.from_numpy(mel)
    mel_reshaped = mel.view(1, -1, mel_dim * r)
    frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(
        1, mel_reshaped.size(1))

    x = torch.LongTensor(seqs)
    text_positions = torch.LongTensor(text_positions)
    frame_positions = torch.LongTensor(frame_positions)

    model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, downsample_step=4,
                   r=r, force_monotonic_attention=False)
    model.eval()

    # Encoder
    encoder_outs = model.seq2seq.encoder(x)

    # Offline decoding: the whole mel sequence is passed in one call.
    mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder(
        encoder_outs, mel_reshaped,
        text_positions=text_positions, frame_positions=frame_positions)

    # Online decoding with the same mel frames given as test inputs.
    model.seq2seq.decoder.start_fresh_sequence()
    mel_outputs_online, alignments, dones_online, _ = \
        model.seq2seq.decoder.incremental_forward(
            encoder_outs, text_positions, test_inputs=mel_reshaped)

    # Both decoding paths should produce the same result.
    assert np.allclose(mel_outputs_offline.cpu().data.numpy(),
                       mel_outputs_online.cpu().data.numpy())
def test_incremental_correctness():
    """Verify that offline and incremental decoding give the same mel output.

    A mel fixture stored next to this file (every 4th frame of
    ``data/ljspeech-mel-00001.npy``) is fed to the decoder in two ways: via
    the regular call with explicit frame positions, and via
    ``incremental_forward`` with the frames supplied as ``test_inputs``.
    The two mel outputs must be equal under ``np.allclose``.
    """
    r = 1
    mel_dim = 80

    # --- Text inputs ------------------------------------------------------
    sentences = ["ni2 hao3 , wo3 shi4 jing3 cha2 ."]
    encoded = np.array([text_to_sequence(s) for s in sentences])
    n_symbols = len(encoded[0])
    x = torch.LongTensor(encoded)
    # 1-based position indices, shaped (1, n_symbols).
    text_positions = torch.LongTensor(
        np.arange(1, n_symbols + 1).reshape(1, n_symbols))

    # --- Mel inputs -------------------------------------------------------
    fixture = join(dirname(__file__), "data", "ljspeech-mel-00001.npy")
    mel = np.load(fixture)[::4]  # keep every 4th frame
    max_target_len = mel.shape[0]
    # Round the target length up to a multiple of r (a no-op while r == 1).
    remainder = max_target_len % r
    if remainder != 0:
        max_target_len += r - remainder
    assert max_target_len % r == 0
    mel = torch.from_numpy(_pad_2d(mel, max_target_len))
    mel_reshaped = mel.view(1, -1, mel_dim * r)

    n_frames = mel_reshaped.size(1)
    frame_positions = torch.LongTensor(
        np.arange(1, n_frames + 1).reshape(1, n_frames))

    # --- Model ------------------------------------------------------------
    model = nyanko(
        n_vocab,
        mel_dim=mel_dim,
        linear_dim=513,
        downsample_step=4,
        r=r,
        force_monotonic_attention=False,
    )
    model.eval()

    encoder_outs = model.seq2seq.encoder(x)

    # Offline pass: the whole mel sequence is handed over in one call.
    mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder(
        encoder_outs,
        mel_reshaped,
        text_positions=text_positions,
        frame_positions=frame_positions,
    )

    # Online pass: reset decoder state, then decode incrementally with the
    # same mel frames given as test inputs.
    model.seq2seq.decoder.start_fresh_sequence()
    online_result = model.seq2seq.decoder.incremental_forward(
        encoder_outs, text_positions, test_inputs=mel_reshaped)
    mel_outputs_online, alignments, dones_online, _ = online_result

    # Both decoding paths should produce the same result.
    offline_np = mel_outputs_offline.cpu().data.numpy()
    online_np = mel_outputs_online.cpu().data.numpy()
    assert np.allclose(offline_np, online_np)
def test_nyanko():
    """Interactive check of offline vs. incremental decoding, with plots.

    Decodes one English sentence against an LJSpeech mel fixture in two
    ways -- the regular decoder call ("offline") and ``incremental_forward``
    with the mel frames as ``test_inputs`` ("online") -- plots the results
    with matplotlib, asserts that both mel outputs are equal under
    ``np.allclose``, and finally runs the postnet on the offline mel output
    and prints the size of its result.

    The mel fixture lives at a developer-local absolute path; when that file
    is absent the test is skipped instead of erroring out. ``plt.show()`` is
    called for every figure, so the test needs a working matplotlib backend.

    Raises:
        unittest.SkipTest: if the mel fixture file does not exist.
    """
    # Function-scope imports so this block does not depend on the file header.
    import os
    import unittest

    mel_path = (
        "/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/"
        "ljspeech-mel-00035.npy"
    )
    if not os.path.exists(mel_path):
        raise unittest.SkipTest(
            "mel fixture not available: {}".format(mel_path))

    texts = [
        "they discarded this for a more completely Roman and far less beautiful letter."
    ]
    seqs = np.array([text_to_sequence(t) for t in texts])
    # 1-based position indices, shaped (1, sequence length).
    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    mel = np.load(mel_path)
    max_target_len = mel.shape[0]
    r = 1
    mel_dim = 80
    # Round the target length up to a multiple of r (a no-op while r == 1).
    if max_target_len % r != 0:
        max_target_len += r - max_target_len % r
    assert max_target_len % r == 0
    mel = _pad_2d(mel, max_target_len)
    mel = torch.from_numpy(mel)
    mel_reshaped = mel.view(1, -1, mel_dim * r)
    frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(
        1, mel_reshaped.size(1))

    x = torch.LongTensor(seqs)
    text_positions = torch.LongTensor(text_positions)
    frame_positions = torch.LongTensor(frame_positions)

    model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, downsample_step=4,
                   r=r, force_monotonic_attention=False)
    model.eval()

    def _plot(mel, mel_predicted, alignments):
        """Show reference mel, predicted mel and alignment in one figure."""
        from matplotlib import pylab as plt

        # ``origin`` must be "upper" or "lower"; the former value
        # "lower bottom" is rejected by recent matplotlib releases.
        plt.figure(figsize=(16, 10))

        plt.subplot(3, 1, 1)
        plt.imshow(mel.data.cpu().numpy().T,
                   origin="lower", aspect="auto", cmap="magma")
        plt.colorbar()

        plt.subplot(3, 1, 2)
        plt.imshow(mel_predicted.view(-1, mel_dim).data.cpu().numpy().T,
                   origin="lower", aspect="auto", cmap="magma")
        plt.colorbar()

        plt.subplot(3, 1, 3)
        # A 4-D alignment tensor is averaged over its first dimension
        # before the first remaining entry is drawn.
        if alignments.dim() == 4:
            alignments = alignments.mean(0)
        plt.imshow(alignments[0].data.cpu().numpy().T,
                   origin="lower", aspect="auto")
        plt.colorbar()
        plt.show()

    seq2seq = model.seq2seq

    # Encoder
    encoder_outs = seq2seq.encoder(x)

    # Offline decoding: the whole mel sequence is passed in one call.
    print("Offline decoding")
    mel_outputs_offline, alignments_offline, done, _ = seq2seq.decoder(
        encoder_outs, mel_reshaped,
        text_positions=text_positions, frame_positions=frame_positions)

    _plot(mel, mel_outputs_offline, alignments_offline)

    # Online decoding with the same mel frames given as test inputs.
    print("Online decoding")
    seq2seq.decoder.start_fresh_sequence()
    mel_outputs_online, alignments, dones_online, _ = \
        seq2seq.decoder.incremental_forward(
            encoder_outs, text_positions, test_inputs=mel_reshaped)

    a = mel_outputs_offline.cpu().data.numpy()
    b = mel_outputs_online.cpu().data.numpy()
    # Absolute difference between the two decoding paths.
    c = (mel_outputs_offline - mel_outputs_online).abs()
    print(c.mean(), c.max())

    _plot(mel, mel_outputs_offline, alignments_offline)
    _plot(mel, mel_outputs_online, alignments)
    _plot(mel, c, alignments)

    # Both decoding paths should produce the same result.
    assert np.allclose(a, b)

    # Run the postnet on the offline mel output and report the output size.
    postnet = model.postnet
    linear_outputs = postnet(mel_outputs_offline)
    print(linear_outputs.size())