Example #1
def test_incremental_path_multiple_times():
    texts = ["they discarded this for a more completely Roman and far less beautiful letter."]
    seqs = np.array([text_to_sequence(t) for t in texts])
    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    r = 4
    mel_dim = 80
    sequence = Variable(torch.LongTensor(seqs))
    text_positions = Variable(torch.LongTensor(text_positions))

    for model, speaker_ids in [
            (_get_model(force_monotonic_attention=False), None),
            (_get_model(force_monotonic_attention=False,
                        n_speakers=32,
                        speaker_embed_dim=16),
             Variable(torch.LongTensor([1])))]:
        model.eval()

        # first call
        mel_outputs, linear_outputs, alignments, done = model(
            sequence, text_positions=text_positions, speaker_ids=speaker_ids)

        # second call
        mel_outputs2, linear_outputs2, alignments2, done2 = model(
            sequence, text_positions=text_positions, speaker_ids=speaker_ids)

        # Should get same result
        c = (mel_outputs - mel_outputs2).abs()
        print(c.mean(), c.max())

        assert np.allclose(mel_outputs.cpu().data.numpy(),
                           mel_outputs2.cpu().data.numpy(), atol=1e-5)
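The check above relies on model.eval() disabling dropout, which is what makes two forward passes deterministic. On recent PyTorch versions the deprecated Variable wrapper is unnecessary; a minimal sketch of the same determinism check, assuming the model, sequence, and text_positions prepared above:

model.eval()
with torch.no_grad():  # inference only; skip building the autograd graph
    out1 = model(sequence, text_positions=text_positions)[0]
    out2 = model(sequence, text_positions=text_positions)[0]
# With dropout off, repeated passes should agree to numerical tolerance.
assert torch.allclose(out1, out2, atol=1e-5)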
Example #2
def test_incremental_path_multiple_times():
    texts = ["ni2 hao3 , wo3 shi4 jing3 cha2 ."]
    seqs = np.array([text_to_sequence(t) for t in texts])
    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    r = 1
    mel_dim = 80

    sequence = torch.LongTensor(seqs)
    text_positions = torch.LongTensor(text_positions)

    model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, downsample_step=4,
                   r=r, force_monotonic_attention=False)
    model.eval()

    # first call
    mel_outputs, linear_outputs, alignments, done = model(
        sequence, text_positions=text_positions, speaker_ids=None)

    # second call
    mel_outputs2, linear_outputs2, alignments2, done2 = model(
        sequence, text_positions=text_positions, speaker_ids=None)

    # Should get same result
    c = (mel_outputs - mel_outputs2).abs()
    print(c.mean(), c.max())

    assert np.allclose(mel_outputs.cpu().data.numpy(),
                       mel_outputs2.cpu().data.numpy(), atol=1e-5)
Example #3
def test_incremental_path_multiple_times():
    texts = ["ni2 hao3 , wo3 shi4 jing3 cha2 ."]
    seqs = np.array([text_to_sequence(t) for t in texts])
    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    r = 4
    mel_dim = 80
    sequence = torch.LongTensor(seqs)
    text_positions = torch.LongTensor(text_positions)

    for model, speaker_ids in [
        (_get_model(force_monotonic_attention=False), None),
        (_get_model(force_monotonic_attention=False,
                    n_speakers=32,
                    speaker_embed_dim=16), torch.LongTensor([1]))
    ]:
        model.eval()

        # first call
        mel_outputs, linear_outputs, alignments, done = model(
            sequence, text_positions=text_positions, speaker_ids=speaker_ids)

        # second call
        mel_outputs2, linear_outputs2, alignments2, done2 = model(
            sequence, text_positions=text_positions, speaker_ids=speaker_ids)

        # Should get same result
        c = (mel_outputs - mel_outputs2).abs()
        print(c.mean(), c.max())

        assert np.allclose(mel_outputs.cpu().data.numpy(),
                           mel_outputs2.cpu().data.numpy(),
                           atol=1e-5)
Example #4
def _test_data():
    texts = ["Thank you very much.", "Hello.", "Deep voice 3."]
    seqs = [np.array(text_to_sequence(t), dtype=np.int64) for t in texts]
    input_lengths = np.array([len(s) for s in seqs])
    max_len = np.max(input_lengths)
    seqs = np.array([_pad(s, max_len) for s in seqs])

    # Test encoder
    x = Variable(torch.LongTensor(seqs))
    y = Variable(torch.rand(x.size(0), 12, 80))

    return x, y
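The _pad helper (and the _pad_2d used in later examples) comes from the surrounding test module and is not shown in these excerpts. A minimal sketch consistent with how the helpers are called here, assuming zero is the padding value:

import numpy as np

def _pad(seq, max_len):
    # Right-pad a 1-D token sequence with zeros up to max_len.
    return np.pad(seq, (0, max_len - len(seq)),
                  mode="constant", constant_values=0)

def _pad_2d(x, max_len):
    # Right-pad a (time, mel_dim) array with zero rows up to max_len frames.
    return np.pad(x, ((0, max_len - x.shape[0]), (0, 0)),
                  mode="constant", constant_values=0)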
Example #5
def _test_data():
    texts = ["ni2 hao3 , wo3 shi4 jing3 cha2 .", "jin1 tian1 tian1 qi4 zhen1 bu2 cuo4 . "]
    seqs = [np.array(text_to_sequence(t), dtype=np.int64) for t in texts]
    input_lengths = np.array([len(s) for s in seqs])
    max_len = np.max(input_lengths)
    seqs = np.array([_pad(s, max_len) for s in seqs])

    # Test encoder
    x = torch.LongTensor(seqs)
    y = torch.rand(x.size(0), 12, 80)

    return x, y
Example #6
def test_incremental_correctness():
    texts = [
        "they discarded this for a more completely Roman and far less beautiful letter."
    ]
    seqs = np.array([text_to_sequence(t) for t in texts])
    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    mel = np.load(
        "/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy"
    )
    max_target_len = mel.shape[0]
    r = 1
    mel_dim = 80
    if max_target_len % r != 0:
        max_target_len += r - max_target_len % r
        assert max_target_len % r == 0
    mel = _pad_2d(mel, max_target_len)
    mel = Variable(torch.from_numpy(mel))
    mel_reshaped = mel.view(1, -1, mel_dim * r)
    frame_positions = np.arange(1,
                                mel_reshaped.size(1) + 1).reshape(
                                    1, mel_reshaped.size(1))

    x = Variable(torch.LongTensor(seqs))
    text_positions = Variable(torch.LongTensor(text_positions))
    frame_positions = Variable(torch.LongTensor(frame_positions))

    model = nyanko(n_vocab,
                   mel_dim=mel_dim,
                   linear_dim=513,
                   downsample_step=4,
                   r=r,
                   force_monotonic_attention=False)
    model.eval()

    # Encoder
    encoder_outs = model.seq2seq.encoder(x)

    # Offline decoding
    mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder(
        encoder_outs,
        mel_reshaped,
        text_positions=text_positions,
        frame_positions=frame_positions)

    # Online decoding with test inputs
    model.seq2seq.decoder.start_fresh_sequence()
    mel_outputs_online, alignments, dones_online, _ = model.seq2seq.decoder.incremental_forward(
        encoder_outs, text_positions, test_inputs=mel_reshaped)

    # Should get same result
    assert np.allclose(mel_outputs_offline.cpu().data.numpy(),
                       mel_outputs_online.cpu().data.numpy())
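The max_target_len adjustment rounds the frame count up to the next multiple of the reduction factor r, since mel.view(1, -1, mel_dim * r) only works when the frame count divides evenly into groups of r. A worked instance of the rounding, with a hypothetical frame count:

r = 4
max_target_len = 102                           # hypothetical frame count
if max_target_len % r != 0:
    max_target_len += r - max_target_len % r   # 102 + (4 - 102 % 4) = 104
assert max_target_len % r == 0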
Example #7
def test_incremental_correctness():
    texts = ["they discarded this for a more completely Roman and far less beautiful letter."]
    seqs = np.array([text_to_sequence(t) for t in texts])
    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    mel_path = join(dirname(__file__), "data", "ljspeech-mel-00001.npy")
    mel = np.load(mel_path)
    max_target_len = mel.shape[0]
    r = 4
    mel_dim = 80
    if max_target_len % r != 0:
        max_target_len += r - max_target_len % r
        assert max_target_len % r == 0
    mel = _pad_2d(mel, max_target_len)
    mel = Variable(torch.from_numpy(mel))
    mel_reshaped = mel.view(1, -1, mel_dim * r)
    frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1))

    x = Variable(torch.LongTensor(seqs))
    text_positions = Variable(torch.LongTensor(text_positions))
    frame_positions = Variable(torch.LongTensor(frame_positions))

    for model, speaker_ids in [
            (_get_model(force_monotonic_attention=False), None),
            (_get_model(force_monotonic_attention=False,
                        n_speakers=32,
                        speaker_embed_dim=16),
             Variable(torch.LongTensor([1])))]:
        model.eval()

        if speaker_ids is not None:
            speaker_embed = model.embed_speakers(speaker_ids)
        else:
            speaker_embed = None

        # Encoder
        encoder_outs = model.seq2seq.encoder(x, speaker_embed=speaker_embed)

        # Offline decoding
        mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder(
            encoder_outs, mel_reshaped, speaker_embed=speaker_embed,
            text_positions=text_positions, frame_positions=frame_positions)

        # Online decoding with test inputs
        model.seq2seq.decoder.start_fresh_sequence()
        mel_outputs_online, alignments, dones_online, _ = model.seq2seq.decoder.incremental_forward(
            encoder_outs, text_positions, speaker_embed=speaker_embed,
            test_inputs=mel_reshaped)

        # Should get same result
        c = (mel_outputs_offline - mel_outputs_online).abs()
        print(c.mean(), c.max())

        assert np.allclose(mel_outputs_offline.cpu().data.numpy(),
                           mel_outputs_online.cpu().data.numpy(), atol=1e-5)
Example #8
def test_incremental_correctness():
    texts = ["ni2 hao3 , wo3 shi4 jing3 cha2 ."]
    seqs = np.array([text_to_sequence(t) for t in texts])
    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    mel_path = join(dirname(__file__), "data", "ljspeech-mel-00001.npy")
    mel = np.load(mel_path)[::4]
    max_target_len = mel.shape[0]
    r = 1
    mel_dim = 80
    if max_target_len % r != 0:
        max_target_len += r - max_target_len % r
        assert max_target_len % r == 0
    mel = _pad_2d(mel, max_target_len)
    mel = torch.from_numpy(mel)
    mel_reshaped = mel.view(1, -1, mel_dim * r)
    frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1))

    x = torch.LongTensor(seqs)
    text_positions = torch.LongTensor(text_positions)
    frame_positions = torch.LongTensor(frame_positions)

    model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, downsample_step=4,
                   r=r, force_monotonic_attention=False)
    model.eval()

    # Encoder
    encoder_outs = model.seq2seq.encoder(x)

    # Offline decoding
    mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder(
        encoder_outs, mel_reshaped,
        text_positions=text_positions, frame_positions=frame_positions)

    # Online decoding with test inputs
    model.seq2seq.decoder.start_fresh_sequence()
    mel_outputs_online, alignments, dones_online, _ = model.seq2seq.decoder.incremental_forward(
        encoder_outs, text_positions,
        test_inputs=mel_reshaped)

    # Should get same result
    assert np.allclose(mel_outputs_offline.cpu().data.numpy(),
                       mel_outputs_online.cpu().data.numpy())
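The [::4] stride when loading the mel matches the downsample_step=4 passed to the nyanko builder: the decoder here operates on every fourth mel frame. A small check of that downsampling, with a stand-in array:

import numpy as np

mel = np.zeros((100, 80), dtype=np.float32)  # stand-in for a loaded mel spectrogram
mel_ds = mel[::4]                            # keep every 4th frame (downsample_step = 4)
assert mel_ds.shape == (25, 80)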
Example #9
def test_multi_speaker_deepvoice3():
    texts = ["Thank you very much.", "Hello.", "Deep voice 3."]
    seqs = [np.array(text_to_sequence(t), dtype=np.int64) for t in texts]
    input_lengths = np.array([len(s) for s in seqs])
    max_len = np.max(input_lengths)
    seqs = np.array([_pad(s, max_len) for s in seqs])

    # Test encoder
    x = Variable(torch.LongTensor(seqs))
    y = Variable(torch.rand(x.size(0), 4 * 33, 80))
    model = _get_model(n_speakers=32, speaker_embed_dim=16)
    speaker_ids = Variable(torch.LongTensor([1, 2, 3]))

    mel_outputs, linear_outputs, alignments, done = model(x, y, speaker_ids=speaker_ids)
    print("Input text:", x.size())
    print("Input mel:", y.size())
    print("Mel:", mel_outputs.size())
    print("Linear:", linear_outputs.size())
    print("Alignments:", alignments.size())
    print("Done:", done.size())
Example #10
def test_multi_speaker_deepvoice3():
    texts = [
        "ni2 hao3 , wo3 shi4 jing3 cha2 .",
        "jin1 tian1 tian1 qi4 zhen1 bu2 cuo4 . "
    ]
    seqs = [np.array(text_to_sequence(t), dtype=np.int64) for t in texts]
    input_lengths = np.array([len(s) for s in seqs])
    max_len = np.max(input_lengths)
    seqs = np.array([_pad(s, max_len) for s in seqs])

    # Test encoder
    x = torch.LongTensor(seqs)
    y = torch.rand(x.size(0), 4 * 33, 80)
    model = _get_model(n_speakers=32, speaker_embed_dim=16)
    speaker_ids = torch.LongTensor([1, 2])  # one speaker id per utterance in the two-text batch

    mel_outputs, linear_outputs, alignments, done = model(
        x, y, speaker_ids=speaker_ids)
    print("Input text:", x.size())
    print("Input mel:", y.size())
    print("Mel:", mel_outputs.size())
    print("Linear:", linear_outputs.size())
    print("Alignments:", alignments.size())
    print("Done:", done.size())
Example #11
def test_incremental_forward():
    checkpoint_path = join(dirname(__file__),
                           "../checkpoints/checkpoint_step000140000.pth")
    if not exists(checkpoint_path):
        return
    model = _get_model()

    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint["state_dict"])
    model = model.cuda() if use_cuda else model

    texts = [
        "they discarded this for a more completely Roman and far less beautiful letter."
    ]
    seqs = np.array([text_to_sequence(t) for t in texts])
    input_lengths = [len(s) for s in seqs]

    use_manual_padding = False
    if use_manual_padding:
        max_input_len = np.max(input_lengths) + 10  # manual padding
        seqs = np.array([_pad(x, max_input_len) for x in seqs], dtype=np.int64)
        input_lengths = torch.LongTensor(input_lengths)
        input_lengths = input_lengths.cuda() if use_cuda else input_lengths
    else:
        input_lengths = None

    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    mel = np.load("/home/ryuichi/tacotron/training/ljspeech-mel-00035.npy")
    max_target_len = mel.shape[0]
    r = 4
    mel_dim = 80
    if max_target_len % r != 0:
        max_target_len += r - max_target_len % r
        assert max_target_len % r == 0
    mel = _pad_2d(mel, max_target_len)
    mel = Variable(torch.from_numpy(mel))
    mel_reshaped = mel.view(1, -1, mel_dim * r)

    frame_positions = np.arange(1,
                                mel_reshaped.size(1) + 1).reshape(
                                    1, mel_reshaped.size(1))

    x = Variable(torch.LongTensor(seqs))
    text_positions = Variable(torch.LongTensor(text_positions))
    frame_positions = Variable(torch.LongTensor(frame_positions))

    if use_cuda:
        x = x.cuda()
        text_positions = text_positions.cuda()
        frame_positions = frame_positions.cuda()
        mel_reshaped = mel_reshaped.cuda()

    # model.make_generation_fast_()
    model.eval()

    encoder_outs = model.encoder(x, lengths=input_lengths)

    # Offline decoding
    mel_output_offline, alignments_offline, done, decoder_states = model.decoder(
        encoder_outs,
        mel_reshaped,
        text_positions=text_positions,
        frame_positions=frame_positions,
        lengths=input_lengths)

    from matplotlib import pyplot as plt

    def _plot(mel, mel_predicted, alignments):
        plt.figure(figsize=(16, 10))
        plt.subplot(3, 1, 1)
        plt.imshow(mel.data.cpu().numpy().T,
                   origin="lower bottom",
                   aspect="auto")
        plt.colorbar()

        plt.subplot(3, 1, 2)
        plt.imshow(mel_predicted.view(-1, mel_dim).data.cpu().numpy().T,
                   origin="lower bottom",
                   aspect="auto")
        plt.colorbar()

        plt.subplot(3, 1, 3)
        if alignments.dim() == 4:
            alignments = alignments.mean(0)
        plt.imshow(alignments[0].data.cpu().numpy().T,
                   origin="lower bottom",
                   aspect="auto")
        plt.colorbar()
        plt.show()

    _plot(mel, mel_output_offline, alignments_offline)

    # Online decoding
    model.decoder._start_incremental_inference()
    mel_outputs, alignments, dones_online, decoder_states_online = model.decoder._incremental_forward(
        encoder_outs,
        text_positions,
        # initial_input=mel_reshaped[:, :1, :],
        test_inputs=None)
    # test_inputs=mel_reshaped)
    model.decoder._stop_incremental_inference()

    _plot(mel, mel_outputs, alignments)
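Checkpoints saved from a CUDA run store GPU tensors, so loading them on a CPU-only machine fails unless torch.load remaps devices. A minimal sketch of the loading step above under that assumption:

checkpoint = torch.load(checkpoint_path, map_location="cpu")  # remap GPU tensors to CPU
model.load_state_dict(checkpoint["state_dict"])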
Example #12
def test_incremental_forward():
    checkpoint_path = join(dirname(__file__),
                           "../test_whole/checkpoint_step000265000.pth")
    if not exists(checkpoint_path):
        return
    model = _get_model()

    use_cuda = False

    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint["state_dict"])
    model.make_generation_fast_()
    model = model.cuda() if use_cuda else model

    texts = ["ni2 hao3 , wo3 shi4 jing3 cha2 ."]
    seqs = np.array([text_to_sequence(t) for t in texts])
    input_lengths = [len(s) for s in seqs]

    use_manual_padding = False
    if use_manual_padding:
        max_input_len = np.max(input_lengths) + 10  # manual padding
        seqs = np.array([_pad(x, max_input_len) for x in seqs], dtype=np.int64)
        input_lengths = torch.LongTensor(input_lengths)
        input_lengths = input_lengths.cuda() if use_cuda else input_lengths
    else:
        input_lengths = None

    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    mel = np.load(
        "/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy"
    )
    max_target_len = mel.shape[0]
    r = 4
    mel_dim = 80
    if max_target_len % r != 0:
        max_target_len += r - max_target_len % r
        assert max_target_len % r == 0
    mel = _pad_2d(mel, max_target_len)
    mel = torch.from_numpy(mel)
    mel_reshaped = mel.view(1, -1, mel_dim * r)

    frame_positions = np.arange(1,
                                mel_reshaped.size(1) + 1).reshape(
                                    1, mel_reshaped.size(1))

    x = torch.LongTensor(seqs)
    text_positions = torch.LongTensor(text_positions)
    frame_positions = torch.LongTensor(frame_positions)

    if use_cuda:
        x = x.cuda()
        text_positions = text_positions.cuda()
        frame_positions = frame_positions.cuda()
        mel_reshaped = mel_reshaped.cuda()

    model.eval()

    def _plot(mel, mel_predicted, alignments):
        from matplotlib import pyplot as plt
        plt.figure(figsize=(16, 10))
        plt.subplot(3, 1, 1)
        plt.imshow(mel.data.cpu().numpy().T,
                   origin="lower bottom",
                   aspect="auto",
                   cmap="magma")
        plt.colorbar()

        plt.subplot(3, 1, 2)
        plt.imshow(mel_predicted.view(-1, mel_dim).data.cpu().numpy().T,
                   origin="lower bottom",
                   aspect="auto",
                   cmap="magma")
        plt.colorbar()

        plt.subplot(3, 1, 3)
        if alignments.dim() == 4:
            alignments = alignments.mean(0)
        plt.imshow(alignments[0].data.cpu().numpy().T,
                   origin="lower bottom",
                   aspect="auto")
        plt.colorbar()
        plt.show()

    # Encoder
    encoder_outs = model.seq2seq.encoder(x, lengths=input_lengths)

    # Offline decoding
    mel_output_offline, alignments_offline, done = model.seq2seq.decoder(
        encoder_outs,
        mel_reshaped,
        text_positions=text_positions,
        frame_positions=frame_positions,
        lengths=input_lengths)

    _plot(mel, mel_output_offline, alignments_offline)

    # Online decoding
    test_inputs = None
    # test_inputs = mel_reshaped
    model.seq2seq.decoder.start_fresh_sequence()
    mel_outputs, alignments, dones_online = model.seq2seq.decoder.incremental_forward(
        encoder_outs,
        text_positions,
        # initial_input=mel_reshaped[:, :1, :],
        test_inputs=test_inputs)

    if test_inputs is not None:
        c = (mel_output_offline - mel_outputs).abs()
        print(c.mean(), c.max())
        _plot(mel, c, alignments)

    _plot(mel, mel_outputs, alignments)
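plt.show() blocks and needs a display, which makes these plotting tests awkward in headless CI. One common workaround, assuming no display is available, is to select a non-interactive backend and write the figure to disk instead:

import matplotlib
matplotlib.use("Agg")           # non-interactive backend; select before importing pyplot
from matplotlib import pyplot as plt

plt.figure(figsize=(16, 10))
plt.imshow([[0.0, 1.0]], origin="lower", aspect="auto")
plt.colorbar()
plt.savefig("alignment.png")    # save instead of plt.show()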
Example #13
def test_nyanko():
    texts = [
        "they discarded this for a more completely Roman and far less beautiful letter."
    ]
    seqs = np.array([text_to_sequence(t) for t in texts])
    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    mel = np.load(
        "/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy"
    )
    max_target_len = mel.shape[0]
    r = 1
    mel_dim = 80
    if max_target_len % r != 0:
        max_target_len += r - max_target_len % r
        assert max_target_len % r == 0
    mel = _pad_2d(mel, max_target_len)
    mel = torch.from_numpy(mel)
    mel_reshaped = mel.view(1, -1, mel_dim * r)
    frame_positions = np.arange(1,
                                mel_reshaped.size(1) + 1).reshape(
                                    1, mel_reshaped.size(1))

    x = torch.LongTensor(seqs)
    text_positions = torch.LongTensor(text_positions)
    frame_positions = torch.LongTensor(frame_positions)

    model = nyanko(n_vocab,
                   mel_dim=mel_dim,
                   linear_dim=513,
                   downsample_step=4,
                   r=r,
                   force_monotonic_attention=False)
    model.eval()

    def _plot(mel, mel_predicted, alignments):
        from matplotlib import pyplot as plt
        plt.figure(figsize=(16, 10))
        plt.subplot(3, 1, 1)
        plt.imshow(mel.data.cpu().numpy().T,
                   origin="lower bottom",
                   aspect="auto",
                   cmap="magma")
        plt.colorbar()

        plt.subplot(3, 1, 2)
        plt.imshow(mel_predicted.view(-1, mel_dim).data.cpu().numpy().T,
                   origin="lower bottom",
                   aspect="auto",
                   cmap="magma")
        plt.colorbar()

        plt.subplot(3, 1, 3)
        if alignments.dim() == 4:
            alignments = alignments.mean(0)
        plt.imshow(alignments[0].data.cpu().numpy().T,
                   origin="lower bottom",
                   aspect="auto")
        plt.colorbar()
        plt.show()

    seq2seq = model.seq2seq

    # Encoder
    encoder_outs = seq2seq.encoder(x)

    # Offline decoding
    print("Offline decoding")
    mel_outputs_offline, alignments_offline, done, _ = seq2seq.decoder(
        encoder_outs,
        mel_reshaped,
        text_positions=text_positions,
        frame_positions=frame_positions)

    _plot(mel, mel_outputs_offline, alignments_offline)

    # Online decoding with test inputs
    print("Online decoding")
    seq2seq.decoder.start_fresh_sequence()
    mel_outputs_online, alignments, dones_online, _ = seq2seq.decoder.incremental_forward(
        encoder_outs, text_positions, test_inputs=mel_reshaped)

    a = mel_outputs_offline.cpu().data.numpy()
    b = mel_outputs_online.cpu().data.numpy()
    c = (mel_outputs_offline - mel_outputs_online).abs()
    print(c.mean(), c.max())

    _plot(mel, mel_outputs_offline, alignments_offline)
    _plot(mel, mel_outputs_online, alignments)
    _plot(mel, c, alignments)

    # Should get same result
    assert np.allclose(a, b)

    postnet = model.postnet

    linear_outputs = postnet(mel_outputs_offline)
    print(linear_outputs.size())
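The postnet maps the predicted mels to linear-scale spectrogram frames. linear_dim=513 is consistent with a real-valued STFT using a 1024-point FFT (an assumption here), since such a transform yields n_fft // 2 + 1 frequency bins:

n_fft = 1024                  # assumed analysis FFT size
linear_dim = n_fft // 2 + 1   # 513 frequency bins for a real-valued signal
assert linear_dim == 513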