def test_incremental_path_multiple_times(): texts = ["they discarded this for a more completely Roman and far less beautiful letter."] seqs = np.array([text_to_sequence(t) for t in texts]) text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) r = 4 mel_dim = 80 sequence = Variable(torch.LongTensor(seqs)) text_positions = Variable(torch.LongTensor(text_positions)) for model, speaker_ids in [ (_get_model(force_monotonic_attention=False), None), (_get_model(force_monotonic_attention=False, n_speakers=32, speaker_embed_dim=16), Variable(torch.LongTensor([1])))]: model.eval() # first call mel_outputs, linear_outputs, alignments, done = model( sequence, text_positions=text_positions, speaker_ids=speaker_ids) # second call mel_outputs2, linear_outputs2, alignments2, done2 = model( sequence, text_positions=text_positions, speaker_ids=speaker_ids) # Should get same result c = (mel_outputs - mel_outputs2).abs() print(c.mean(), c.max()) assert np.allclose(mel_outputs.cpu().data.numpy(), mel_outputs2.cpu().data.numpy(), atol=1e-5)
def test_incremental_path_multiple_times(): texts = ["ni2 hao3 , wo3 shi4 jing3 cha2 ."] seqs = np.array([text_to_sequence(t) for t in texts]) text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) r = 1 mel_dim = 80 sequence = torch.LongTensor(seqs) text_positions = torch.LongTensor(text_positions) model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, downsample_step=4, r=r, force_monotonic_attention=False) model.eval() # first call mel_outputs, linear_outputs, alignments, done = model( sequence, text_positions=text_positions, speaker_ids=None) # second call mel_outputs2, linear_outputs2, alignments2, done2 = model( sequence, text_positions=text_positions, speaker_ids=None) # Should get same result c = (mel_outputs - mel_outputs2).abs() print(c.mean(), c.max()) assert np.allclose(mel_outputs.cpu().data.numpy(), mel_outputs2.cpu().data.numpy(), atol=1e-5)
def test_incremental_path_multiple_times(): texts = ["ni2 hao3 , wo3 shi4 jing3 cha2 ."] seqs = np.array([text_to_sequence(t) for t in texts]) text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) r = 4 mel_dim = 80 sequence = torch.LongTensor(seqs) text_positions = torch.LongTensor(text_positions) for model, speaker_ids in [ (_get_model(force_monotonic_attention=False), None), (_get_model(force_monotonic_attention=False, n_speakers=32, speaker_embed_dim=16), torch.LongTensor([1])) ]: model.eval() # first call mel_outputs, linear_outputs, alignments, done = model( sequence, text_positions=text_positions, speaker_ids=speaker_ids) # second call mel_outputs2, linear_outputs2, alignments2, done2 = model( sequence, text_positions=text_positions, speaker_ids=speaker_ids) # Should get same result c = (mel_outputs - mel_outputs2).abs() print(c.mean(), c.max()) assert np.allclose(mel_outputs.cpu().data.numpy(), mel_outputs2.cpu().data.numpy(), atol=1e-5)
def _test_data(): texts = ["Thank you very much.", "Hello.", "Deep voice 3."] seqs = [np.array(text_to_sequence(t), dtype=np.int) for t in texts] input_lengths = np.array([len(s) for s in seqs]) max_len = np.max(input_lengths) seqs = np.array([_pad(s, max_len) for s in seqs]) # Test encoder x = Variable(torch.LongTensor(seqs)) y = Variable(torch.rand(x.size(0), 12, 80)) return x, y
def _test_data(): texts = ["ni2 hao3 , wo3 shi4 jing3 cha2 .", "jin1 tian1 tian1 qi4 zhen1 bu2 cuo4 . "] seqs = [np.array(text_to_sequence(t), dtype=np.int) for t in texts] input_lengths = np.array([len(s) for s in seqs]) max_len = np.max(input_lengths) seqs = np.array([_pad(s, max_len) for s in seqs]) # Test encoder x = torch.LongTensor(seqs) y = torch.rand(x.size(0), 12, 80) return x, y
def test_incremental_correctness(): texts = [ "they discarded this for a more completely Roman and far less beautiful letter." ] seqs = np.array([text_to_sequence(t) for t in texts]) text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) mel = np.load( "/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy" ) max_target_len = mel.shape[0] r = 1 mel_dim = 80 if max_target_len % r != 0: max_target_len += r - max_target_len % r assert max_target_len % r == 0 mel = _pad_2d(mel, max_target_len) mel = Variable(torch.from_numpy(mel)) mel_reshaped = mel.view(1, -1, mel_dim * r) frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape( 1, mel_reshaped.size(1)) x = Variable(torch.LongTensor(seqs)) text_positions = Variable(torch.LongTensor(text_positions)) frame_positions = Variable(torch.LongTensor(frame_positions)) model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, downsample_step=4, r=r, force_monotonic_attention=False) model.eval() # Encoder encoder_outs = model.seq2seq.encoder(x) # Off line decoding mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder( encoder_outs, mel_reshaped, text_positions=text_positions, frame_positions=frame_positions) # Online decoding with test inputs model.seq2seq.decoder.start_fresh_sequence() mel_outputs_online, alignments, dones_online, _ = model.seq2seq.decoder.incremental_forward( encoder_outs, text_positions, test_inputs=mel_reshaped) # Should get same result assert np.allclose(mel_outputs_offline.cpu().data.numpy(), mel_outputs_online.cpu().data.numpy())
def test_incremental_correctness(): texts = ["they discarded this for a more completely Roman and far less beautiful letter."] seqs = np.array([text_to_sequence(t) for t in texts]) text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) mel_path = join(dirname(__file__), "data", "ljspeech-mel-00001.npy") mel = np.load(mel_path) max_target_len = mel.shape[0] r = 4 mel_dim = 80 if max_target_len % r != 0: max_target_len += r - max_target_len % r assert max_target_len % r == 0 mel = _pad_2d(mel, max_target_len) mel = Variable(torch.from_numpy(mel)) mel_reshaped = mel.view(1, -1, mel_dim * r) frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1)) x = Variable(torch.LongTensor(seqs)) text_positions = Variable(torch.LongTensor(text_positions)) frame_positions = Variable(torch.LongTensor(frame_positions)) for model, speaker_ids in [ (_get_model(force_monotonic_attention=False), None), (_get_model(force_monotonic_attention=False, n_speakers=32, speaker_embed_dim=16), Variable(torch.LongTensor([1])))]: model.eval() if speaker_ids is not None: speaker_embed = model.embed_speakers(speaker_ids) else: speaker_embed = None # Encoder encoder_outs = model.seq2seq.encoder(x, speaker_embed=speaker_embed) # Off line decoding mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder( encoder_outs, mel_reshaped, speaker_embed=speaker_embed, text_positions=text_positions, frame_positions=frame_positions) # Online decoding with test inputs model.seq2seq.decoder.start_fresh_sequence() mel_outputs_online, alignments, dones_online, _ = model.seq2seq.decoder.incremental_forward( encoder_outs, text_positions, speaker_embed=speaker_embed, test_inputs=mel_reshaped) # Should get same result c = (mel_outputs_offline - mel_outputs_online).abs() print(c.mean(), c.max()) assert np.allclose(mel_outputs_offline.cpu().data.numpy(), mel_outputs_online.cpu().data.numpy(), atol=1e-5)
def test_incremental_correctness(): texts = ["ni2 hao3 , wo3 shi4 jing3 cha2 ."] seqs = np.array([text_to_sequence(t) for t in texts]) text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) mel_path = join(dirname(__file__), "data", "ljspeech-mel-00001.npy") mel = np.load(mel_path)[::4] max_target_len = mel.shape[0] r = 1 mel_dim = 80 if max_target_len % r != 0: max_target_len += r - max_target_len % r assert max_target_len % r == 0 mel = _pad_2d(mel, max_target_len) mel = torch.from_numpy(mel) mel_reshaped = mel.view(1, -1, mel_dim * r) frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1)) x = torch.LongTensor(seqs) text_positions = torch.LongTensor(text_positions) frame_positions = torch.LongTensor(frame_positions) model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, downsample_step=4, r=r, force_monotonic_attention=False) model.eval() # Encoder encoder_outs = model.seq2seq.encoder(x) # Off line decoding mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder( encoder_outs, mel_reshaped, text_positions=text_positions, frame_positions=frame_positions) # Online decoding with test inputs model.seq2seq.decoder.start_fresh_sequence() mel_outputs_online, alignments, dones_online, _ = model.seq2seq.decoder.incremental_forward( encoder_outs, text_positions, test_inputs=mel_reshaped) # Should get same result assert np.allclose(mel_outputs_offline.cpu().data.numpy(), mel_outputs_online.cpu().data.numpy())
def test_multi_speaker_deepvoice3(): texts = ["Thank you very much.", "Hello.", "Deep voice 3."] seqs = [np.array(text_to_sequence(t), dtype=np.int) for t in texts] input_lengths = np.array([len(s) for s in seqs]) max_len = np.max(input_lengths) seqs = np.array([_pad(s, max_len) for s in seqs]) # Test encoder x = Variable(torch.LongTensor(seqs)) y = Variable(torch.rand(x.size(0), 4 * 33, 80)) model = _get_model(n_speakers=32, speaker_embed_dim=16) speaker_ids = Variable(torch.LongTensor([1, 2, 3])) mel_outputs, linear_outputs, alignments, done = model(x, y, speaker_ids=speaker_ids) print("Input text:", x.size()) print("Input mel:", y.size()) print("Mel:", mel_outputs.size()) print("Linear:", linear_outputs.size()) print("Alignments:", alignments.size()) print("Done:", done.size())
def test_multi_speaker_deepvoice3(): texts = [ "ni2 hao3 , wo3 shi4 jing3 cha2 .", "jin1 tian1 tian1 qi4 zhen1 bu2 cuo4 . " ] seqs = [np.array(text_to_sequence(t), dtype=np.int) for t in texts] input_lengths = np.array([len(s) for s in seqs]) max_len = np.max(input_lengths) seqs = np.array([_pad(s, max_len) for s in seqs]) # Test encoder x = torch.LongTensor(seqs) y = torch.rand(x.size(0), 4 * 33, 80) model = _get_model(n_speakers=32, speaker_embed_dim=16) speaker_ids = torch.LongTensor([1, 2, 3]) mel_outputs, linear_outputs, alignments, done = model( x, y, speaker_ids=speaker_ids) print("Input text:", x.size()) print("Input mel:", y.size()) print("Mel:", mel_outputs.size()) print("Linear:", linear_outputs.size()) print("Alignments:", alignments.size()) print("Done:", done.size())
def test_incremental_forward(): checkpoint_path = join(dirname(__file__), "../checkpoints/checkpoint_step000140000.pth") if not exists(checkpoint_path): return model = _get_model() checkpoint = torch.load(checkpoint_path) model.load_state_dict(checkpoint["state_dict"]) model = model.cuda() if use_cuda else model texts = [ "they discarded this for a more completely Roman and far less beautiful letter." ] seqs = np.array([text_to_sequence(t) for t in texts]) input_lengths = [len(s) for s in seqs] use_manual_padding = False if use_manual_padding: max_input_len = np.max(input_lengths) + 10 # manuall padding seqs = np.array([_pad(x, max_input_len) for x in seqs], dtype=np.int) input_lengths = torch.LongTensor(input_lengths) input_lengths = input_lengths.cuda() if use_cuda else input_lenghts else: input_lengths = None text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) mel = np.load("/home/ryuichi/tacotron/training/ljspeech-mel-00035.npy") max_target_len = mel.shape[0] r = 4 mel_dim = 80 if max_target_len % r != 0: max_target_len += r - max_target_len % r assert max_target_len % r == 0 mel = _pad_2d(mel, max_target_len) mel = Variable(torch.from_numpy(mel)) mel_reshaped = mel.view(1, -1, mel_dim * r) frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape( 1, mel_reshaped.size(1)) x = Variable(torch.LongTensor(seqs)) text_positions = Variable(torch.LongTensor(text_positions)) frame_positions = Variable(torch.LongTensor(frame_positions)) if use_cuda: x = x.cuda() text_positions = text_positions.cuda() frame_positions = frame_positions.cuda() mel_reshaped = mel_reshaped.cuda() # model.make_generation_fast_() model.eval() encoder_outs = model.encoder(x, lengths=input_lengths) # Off line decoding mel_output_offline, alignments_offline, done, decoder_states = model.decoder( encoder_outs, mel_reshaped, text_positions=text_positions, frame_positions=frame_positions, lengths=input_lengths) from matplotlib import pylab as plt def _plot(mel, mel_predicted, alignments): plt.figure(figsize=(16, 10)) plt.subplot(3, 1, 1) plt.imshow(mel.data.cpu().numpy().T, origin="lower bottom", aspect="auto") plt.colorbar() plt.subplot(3, 1, 2) plt.imshow(mel_predicted.view(-1, mel_dim).data.cpu().numpy().T, origin="lower bottom", aspect="auto") plt.colorbar() plt.subplot(3, 1, 3) if alignments.dim() == 4: alignments = alignments.mean(0) plt.imshow(alignments[0].data.cpu().numpy().T, origin="lower bottom", aspect="auto") plt.colorbar() plt.show() _plot(mel, mel_output_offline, alignments_offline) # Online decoding model.decoder._start_incremental_inference() mel_outputs, alignments, dones_online, decoder_states_online = model.decoder._incremental_forward( encoder_outs, text_positions, # initial_input=mel_reshaped[:, :1, :], test_inputs=None) # test_inputs=mel_reshaped) model.decoder._stop_incremental_inference() _plot(mel, mel_outputs, alignments)
def test_incremental_forward(): checkpoint_path = join(dirname(__file__), "../test_whole/checkpoint_step000265000.pth") if not exists(checkpoint_path): return model = _get_model() use_cuda = False checkpoint = torch.load(checkpoint_path) model.load_state_dict(checkpoint["state_dict"]) model.make_generation_fast_() model = model.cuda() if use_cuda else model texts = ["ni2 hao3 , wo3 shi4 jing3 cha2 ."] seqs = np.array([text_to_sequence(t) for t in texts]) input_lengths = [len(s) for s in seqs] use_manual_padding = False if use_manual_padding: max_input_len = np.max(input_lengths) + 10 # manuall padding seqs = np.array([_pad(x, max_input_len) for x in seqs], dtype=np.int) input_lengths = torch.LongTensor(input_lengths) input_lengths = input_lengths.cuda() if use_cuda else input_lengths else: input_lengths = None text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) mel = np.load( "/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy" ) max_target_len = mel.shape[0] r = 4 mel_dim = 80 if max_target_len % r != 0: max_target_len += r - max_target_len % r assert max_target_len % r == 0 mel = _pad_2d(mel, max_target_len) mel = torch.from_numpy(mel) mel_reshaped = mel.view(1, -1, mel_dim * r) frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape( 1, mel_reshaped.size(1)) x = torch.LongTensor(seqs) text_positions = torch.LongTensor(text_positions) frame_positions = torch.LongTensor(frame_positions) if use_cuda: x = x.cuda() text_positions = text_positions.cuda() frame_positions = frame_positions.cuda() mel_reshaped = mel_reshaped.cuda() model.eval() def _plot(mel, mel_predicted, alignments): from matplotlib import pylab as plt plt.figure(figsize=(16, 10)) plt.subplot(3, 1, 1) plt.imshow(mel.data.cpu().numpy().T, origin="lower bottom", aspect="auto", cmap="magma") plt.colorbar() plt.subplot(3, 1, 2) plt.imshow(mel_predicted.view(-1, mel_dim).data.cpu().numpy().T, origin="lower bottom", aspect="auto", cmap="magma") plt.colorbar() plt.subplot(3, 1, 3) if alignments.dim() == 4: alignments = alignments.mean(0) plt.imshow(alignments[0].data.cpu().numpy().T, origin="lower bottom", aspect="auto") plt.colorbar() plt.show() # Encoder encoder_outs = model.seq2seq.encoder(x, lengths=input_lengths) # Off line decoding mel_output_offline, alignments_offline, done = model.seq2seq.decoder( encoder_outs, mel_reshaped, text_positions=text_positions, frame_positions=frame_positions, lengths=input_lengths) _plot(mel, mel_output_offline, alignments_offline) # Online decoding test_inputs = None # test_inputs = mel_reshaped model.seq2seq.decoder.start_fresh_sequence() mel_outputs, alignments, dones_online = model.seq2seq.decoder.incremental_forward( encoder_outs, text_positions, # initial_input=mel_reshaped[:, :1, :], test_inputs=test_inputs) if test_inputs is not None: c = (mel_output_offline - mel_outputs).abs() print(c.mean(), c.max()) _plot(mel, c, alignments) _plot(mel, mel_outputs, alignments)
def test_nyanko():
    texts = ["they discarded this for a more completely Roman and far less beautiful letter."]
    seqs = np.array([text_to_sequence(t) for t in texts])
    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    mel = np.load("/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy")
    max_target_len = mel.shape[0]
    r = 1
    mel_dim = 80
    if max_target_len % r != 0:
        max_target_len += r - max_target_len % r
        assert max_target_len % r == 0
    mel = _pad_2d(mel, max_target_len)
    mel = torch.from_numpy(mel)
    mel_reshaped = mel.view(1, -1, mel_dim * r)
    frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1))

    x = torch.LongTensor(seqs)
    text_positions = torch.LongTensor(text_positions)
    frame_positions = torch.LongTensor(frame_positions)

    model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, downsample_step=4,
                   r=r, force_monotonic_attention=False)
    model.eval()

    def _plot(mel, mel_predicted, alignments):
        from matplotlib import pylab as plt
        plt.figure(figsize=(16, 10))
        plt.subplot(3, 1, 1)
        plt.imshow(mel.data.cpu().numpy().T, origin="lower", aspect="auto", cmap="magma")
        plt.colorbar()
        plt.subplot(3, 1, 2)
        plt.imshow(mel_predicted.view(-1, mel_dim).data.cpu().numpy().T,
                   origin="lower", aspect="auto", cmap="magma")
        plt.colorbar()
        plt.subplot(3, 1, 3)
        if alignments.dim() == 4:
            alignments = alignments.mean(0)
        plt.imshow(alignments[0].data.cpu().numpy().T, origin="lower", aspect="auto")
        plt.colorbar()
        plt.show()

    seq2seq = model.seq2seq

    # Encoder
    encoder_outs = seq2seq.encoder(x)

    # Off line decoding
    print("Offline decoding")
    mel_outputs_offline, alignments_offline, done, _ = seq2seq.decoder(
        encoder_outs, mel_reshaped,
        text_positions=text_positions, frame_positions=frame_positions)
    _plot(mel, mel_outputs_offline, alignments_offline)

    # Online decoding with test inputs
    print("Online decoding")
    seq2seq.decoder.start_fresh_sequence()
    mel_outputs_online, alignments, dones_online, _ = seq2seq.decoder.incremental_forward(
        encoder_outs, text_positions, test_inputs=mel_reshaped)

    a = mel_outputs_offline.cpu().data.numpy()
    b = mel_outputs_online.cpu().data.numpy()
    c = (mel_outputs_offline - mel_outputs_online).abs()
    print(c.mean(), c.max())

    _plot(mel, mel_outputs_offline, alignments_offline)
    _plot(mel, mel_outputs_online, alignments)
    _plot(mel, c, alignments)

    # Should get same result
    assert np.allclose(a, b)

    postnet = model.postnet
    linear_outputs = postnet(mel_outputs_offline)
    print(linear_outputs.size())
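# None of the tests above are self-contained: they assume a module preamble along
# the lines of the commented sketch below. The exact frontend and builder import
# paths are assumptions inferred from how text_to_sequence, n_vocab, nyanko and
# use_cuda are used here, not a verbatim copy of the original test file.
#
# import numpy as np
# import torch
# from torch.autograd import Variable
# from os.path import join, dirname, exists
#
# from deepvoice3_pytorch import builder
# from deepvoice3_pytorch.builder import nyanko
# from deepvoice3_pytorch import frontend
#
# text_to_sequence = frontend.text_to_sequence  # assumed frontend entry point
# n_vocab = frontend.n_vocab                    # assumed vocabulary size
# use_cuda = torch.cuda.is_available()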