Example #1
def _deepvoice3(n_vocab, embed_dim=256, mel_dim=80,
                linear_dim=4096, r=5,
                n_speakers=1, speaker_embed_dim=16,
                padding_idx=None,
                dropout=(1 - 0.95), dilation=1):

    from deepvoice3_pytorch import MultiSpeakerTTSModel, AttentionSeq2Seq
    from deepvoice3_pytorch.deepvoice3 import Encoder, Decoder, Converter
    h = 128
    encoder = Encoder(
        n_vocab, embed_dim, padding_idx=padding_idx,
        n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
        dropout=dropout,
        convolutions=[(h, 3, dilation), (h, 3, dilation), (h, 3, dilation),
                      (h, 3, dilation), (h, 3, dilation)],
    )

    h = 256
    decoder = Decoder(
        embed_dim, in_dim=mel_dim, r=r, padding_idx=padding_idx,
        n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
        dropout=dropout,
        preattention=[(h, 3, 1)],
        convolutions=[(h, 3, dilation), (h, 3, dilation), (h, 3, dilation),
                      (h, 3, dilation), (h, 3, dilation)],
        attention=[True, False, False, False, True],
        force_monotonic_attention=False)

    seq2seq = AttentionSeq2Seq(encoder, decoder)

    in_dim = mel_dim
    h = 256
    converter = Converter(n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
                          in_dim=in_dim, out_dim=linear_dim, dropout=dropout,
                          convolutions=[(h, 3, dilation), (h, 3, dilation), (h, 3, dilation),
                                        (h, 3, dilation), (h, 3, dilation)])

    model = MultiSpeakerTTSModel(
        seq2seq, converter, padding_idx=padding_idx,
        mel_dim=mel_dim, linear_dim=linear_dim,
        n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim)

    return model
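
A minimal usage sketch for the builder above (n_vocab=100 is a hypothetical vocabulary size; assumes the deepvoice3_pytorch package is installed so that MultiSpeakerTTSModel and AttentionSeq2Seq resolve):

model = _deepvoice3(n_vocab=100)
n_params = sum(p.numel() for p in model.parameters())  # model is a torch.nn.Module
print("parameters: {:,}".format(n_params))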
Example #2
def nyanko(n_vocab,
           embed_dim=128,
           mel_dim=80,
           linear_dim=513,
           r=1,
           n_speakers=1,
           speaker_embed_dim=16,
           padding_idx=0,
           dropout=(1 - 0.95),
           kernel_size=3,
           encoder_channels=256,
           decoder_channels=256,
           converter_channels=512,
           query_position_rate=1.0,
           key_position_rate=1.29,
           use_memory_mask=False,
           trainable_positional_encodings=False,
           force_monotonic_attention=True,
           use_decoder_state_for_postnet_input=False,
           max_positions=512):
    from deepvoice3_pytorch import MultiSpeakerTTSModel, AttentionSeq2Seq
    from deepvoice3_pytorch.nyanko import Encoder, Decoder, Converter
    assert encoder_channels == decoder_channels

    # Seq2seq
    encoder = Encoder(
        n_vocab,
        embed_dim,
        channels=encoder_channels,
        kernel_size=kernel_size,
        padding_idx=padding_idx,
        n_speakers=n_speakers,
        speaker_embed_dim=speaker_embed_dim,
        dropout=dropout,
    )

    decoder = Decoder(embed_dim,
                      in_dim=mel_dim,
                      r=r,
                      channels=decoder_channels,
                      kernel_size=kernel_size,
                      padding_idx=padding_idx,
                      n_speakers=n_speakers,
                      speaker_embed_dim=speaker_embed_dim,
                      dropout=dropout,
                      max_positions=max_positions,
                      force_monotonic_attention=force_monotonic_attention,
                      query_position_rate=query_position_rate,
                      key_position_rate=key_position_rate,
                      use_memory_mask=use_memory_mask)

    seq2seq = AttentionSeq2Seq(encoder, decoder)

    if use_decoder_state_for_postnet_input:
        in_dim = decoder_channels // r
    else:
        in_dim = mel_dim

    converter = Converter(in_dim=in_dim,
                          out_dim=linear_dim,
                          channels=converter_channels,
                          kernel_size=kernel_size,
                          dropout=dropout)

    # Seq2seq + post net
    model = MultiSpeakerTTSModel(
        seq2seq,
        converter,
        padding_idx=padding_idx,
        mel_dim=mel_dim,
        linear_dim=linear_dim,
        n_speakers=n_speakers,
        speaker_embed_dim=speaker_embed_dim,
        trainable_positional_encodings=trainable_positional_encodings,
        use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input
    )

    return model
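
The assert at the top of nyanko() ties the two channel counts together. A hedged usage sketch (hypothetical vocabulary size):

model = nyanko(n_vocab=100, encoder_channels=256, decoder_channels=256)  # OK
# nyanko(n_vocab=100, encoder_channels=128, decoder_channels=256)  # AssertionError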
Example #3
def deepvoice3(
    n_vocab,
    embed_dim=256,
    mel_dim=80,
    linear_dim=513,
    r=4,
    n_speakers=1,
    speaker_embed_dim=16,
    padding_idx=0,
    dropout=(1 - 0.95),
    kernel_size=5,
    encoder_channels=128,
    decoder_channels=256,
    converter_channels=256,
    query_position_rate=1.0,
    key_position_rate=1.29,
    use_memory_mask=False,
    trainable_positional_encodings=False,
    force_monotonic_attention=True,
    use_decoder_state_for_postnet_input=True,
    max_positions=512,
):
    """Build deepvoice3
    """
    from deepvoice3_pytorch import MultiSpeakerTTSModel, AttentionSeq2Seq
    from deepvoice3_pytorch.deepvoice3 import Encoder, Decoder, Converter

    # Seq2seq
    h = encoder_channels  # hidden dim (channels)
    k = kernel_size  # kernel size
    encoder = Encoder(
        n_vocab,
        embed_dim,
        padding_idx=padding_idx,
        n_speakers=n_speakers,
        speaker_embed_dim=speaker_embed_dim,
        dropout=dropout,
        max_positions=max_positions,
        # (channels, kernel_size, dilation)
        convolutions=[(h, k, 1), (h, k, 1), (h, k, 1), (h, k, 1), (h, k, 2),
                      (h, k, 4), (h, k, 8)],
    )

    h = decoder_channels
    decoder = Decoder(embed_dim,
                      in_dim=mel_dim,
                      r=r,
                      padding_idx=padding_idx,
                      n_speakers=n_speakers,
                      speaker_embed_dim=speaker_embed_dim,
                      dropout=dropout,
                      max_positions=max_positions,
                      convolutions=[(h, k, 1), (h, k, 1), (h, k, 2), (h, k, 4),
                                    (h, k, 8)],
                      attention=[True, False, False, False, True],
                      force_monotonic_attention=force_monotonic_attention,
                      query_position_rate=query_position_rate,
                      key_position_rate=key_position_rate,
                      use_memory_mask=use_memory_mask)

    seq2seq = AttentionSeq2Seq(encoder, decoder)

    # Post net
    if use_decoder_state_for_postnet_input:
        in_dim = h // r
    else:
        in_dim = mel_dim
    h = converter_channels
    converter = Converter(in_dim=in_dim,
                          out_dim=linear_dim,
                          dropout=dropout,
                          convolutions=[(h, k, 1), (h, k, 1), (h, k, 2),
                                        (h, k, 4), (h, k, 8)])

    # Seq2seq + post net
    model = MultiSpeakerTTSModel(
        seq2seq,
        converter,
        padding_idx=padding_idx,
        mel_dim=mel_dim,
        linear_dim=linear_dim,
        n_speakers=n_speakers,
        speaker_embed_dim=speaker_embed_dim,
        trainable_positional_encodings=trainable_positional_encodings,
        use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input
    )

    return model
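
The encoder's dilation schedule [1, 1, 1, 1, 2, 4, 8] widens its receptive field without adding depth. A self-contained sketch of the arithmetic, assuming a stack of non-strided 1-D convolutions (receptive field = 1 + sum of (kernel_size - 1) * dilation over layers):

def receptive_field(convolutions):
    """Receptive field of a stack of non-strided 1-D convolutions."""
    return 1 + sum((k - 1) * d for _, k, d in convolutions)

h, k = 128, 5  # the encoder_channels and kernel_size defaults above
print(receptive_field([(h, k, 1)] * 4 + [(h, k, 2), (h, k, 4), (h, k, 8)]))
# -> 1 + 4*4 + 8 + 16 + 32 = 73 input positions per output frame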
Example #4
def nyanko(n_vocab, embed_dim=128, mel_dim=80, linear_dim=513, r=1,
           downsample_step=4,
           n_speakers=1, speaker_embed_dim=16, padding_idx=0,
           dropout=(1 - 0.95), kernel_size=3,
           encoder_channels=256,
           decoder_channels=256,
           converter_channels=512,
           query_position_rate=1.0,
           key_position_rate=1.29,
           use_memory_mask=False,
           trainable_positional_encodings=False,
           force_monotonic_attention=True,
           use_decoder_state_for_postnet_input=False,
           max_positions=512, embedding_weight_std=0.01,
           speaker_embedding_weight_std=0.01,
           freeze_embedding=False,
           window_ahead=3,
           window_backward=1,
           key_projection=False,
           value_projection=False,
           ):
    from deepvoice3_pytorch import MultiSpeakerTTSModel, AttentionSeq2Seq
    from deepvoice3_pytorch.nyanko import Encoder, Decoder, Converter
    assert encoder_channels == decoder_channels

    if n_speakers != 1:
        raise ValueError("Multi-speaker is not supported")
    if not (downsample_step == 4 and r == 1):
        raise ValueError("Not supported. You need to change hardcoded parameters")

    # Seq2seq
    encoder = Encoder(
        n_vocab, embed_dim, channels=encoder_channels, kernel_size=kernel_size,
        padding_idx=padding_idx,
        n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
        dropout=dropout, embedding_weight_std=embedding_weight_std,
    )

    decoder = Decoder(
        embed_dim, in_dim=mel_dim, r=r, channels=decoder_channels,
        kernel_size=kernel_size, padding_idx=padding_idx,
        n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
        dropout=dropout, max_positions=max_positions,
        force_monotonic_attention=force_monotonic_attention,
        query_position_rate=query_position_rate,
        key_position_rate=key_position_rate,
        use_memory_mask=use_memory_mask,
        window_ahead=window_ahead,
        window_backward=window_backward,
        key_projection=key_projection,
        value_projection=value_projection,
    )

    seq2seq = AttentionSeq2Seq(encoder, decoder)

    if use_decoder_state_for_postnet_input:
        in_dim = decoder_channels // r
    else:
        in_dim = mel_dim

    converter = Converter(
        in_dim=in_dim, out_dim=linear_dim, channels=converter_channels,
        kernel_size=kernel_size, dropout=dropout)

    # Seq2seq + post net
    model = MultiSpeakerTTSModel(
        seq2seq, converter, padding_idx=padding_idx,
        mel_dim=mel_dim, linear_dim=linear_dim,
        n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
        trainable_positional_encodings=trainable_positional_encodings,
        use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input,
        speaker_embedding_weight_std=speaker_embedding_weight_std,
        freeze_embedding=freeze_embedding)

    return model
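
This variant guards its hard-coded assumptions explicitly instead of failing later. A quick sketch of the two failure modes (hypothetical vocabulary size):

try:
    nyanko(n_vocab=100, n_speakers=2)
except ValueError as e:
    print(e)  # Multi-speaker is not supported

try:
    nyanko(n_vocab=100, downsample_step=1)  # requires downsample_step == 4 and r == 1
except ValueError as e:
    print(e)  # Not supported. You need to change hardcoded parameters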
Example #5
def deepvoice3(n_vocab, embed_dim=256, mel_dim=80, linear_dim=513, r=4,
               downsample_step=1,
               n_speakers=1, speaker_embed_dim=16, padding_idx=0,
               dropout=(1 - 0.95), kernel_size=5,
               encoder_channels=128,
               decoder_channels=256,
               converter_channels=256,
               query_position_rate=1.0,
               key_position_rate=1.29,
               use_memory_mask=False,
               trainable_positional_encodings=False,
               force_monotonic_attention=True,
               use_decoder_state_for_postnet_input=True,
               max_positions=512,
               embedding_weight_std=0.1,
               speaker_embedding_weight_std=0.01,
               freeze_embedding=False,
               window_ahead=3,
               window_backward=1,
               key_projection=False,
               value_projection=False,
               ):
    """Build deepvoice3
    """
    from deepvoice3_pytorch import MultiSpeakerTTSModel, AttentionSeq2Seq
    from deepvoice3_pytorch.deepvoice3 import Encoder, Decoder, Converter

    time_upsampling = max(downsample_step // r, 1)

    # Seq2seq
    h = encoder_channels  # hidden dim (channels)
    k = kernel_size   # kernel size
    encoder = Encoder(
        n_vocab, embed_dim, padding_idx=padding_idx,
        n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
        dropout=dropout, max_positions=max_positions,
        embedding_weight_std=embedding_weight_std,
        # (channels, kernel_size, dilation)
        convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27),
                      (h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27),
                      (h, k, 1), (h, k, 3)],
    )

    h = decoder_channels
    decoder = Decoder(
        embed_dim, in_dim=mel_dim, r=r, padding_idx=padding_idx,
        n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
        dropout=dropout, max_positions=max_positions,
        preattention=[(h, k, 1), (h, k, 3)],
        convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27),
                      (h, k, 1)],
        attention=[True, False, False, False, True],
        force_monotonic_attention=force_monotonic_attention,
        query_position_rate=query_position_rate,
        key_position_rate=key_position_rate,
        use_memory_mask=use_memory_mask,
        window_ahead=window_ahead,
        window_backward=window_backward,
        key_projection=key_projection,
        value_projection=value_projection,
    )

    seq2seq = AttentionSeq2Seq(encoder, decoder)

    # Post net
    if use_decoder_state_for_postnet_input:
        in_dim = h // r
    else:
        in_dim = mel_dim
    h = converter_channels
    converter = Converter(
        n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
        in_dim=in_dim, out_dim=linear_dim, dropout=dropout,
        time_upsampling=time_upsampling,
        convolutions=[(h, k, 1), (h, k, 3), (2 * h, k, 1), (2 * h, k, 3)],
    )

    # Seq2seq + post net
    model = MultiSpeakerTTSModel(
        seq2seq, converter, padding_idx=padding_idx,
        mel_dim=mel_dim, linear_dim=linear_dim,
        n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
        trainable_positional_encodings=trainable_positional_encodings,
        use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input,
        speaker_embedding_weight_std=speaker_embedding_weight_std,
        freeze_embedding=freeze_embedding)

    return model
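
The converter's time_upsampling factor is derived as max(downsample_step // r, 1), i.e. integer division floored at 1. A tiny self-contained sketch of the arithmetic:

for downsample_step, r in [(1, 4), (4, 4), (4, 1), (8, 2)]:
    print((downsample_step, r), "->", max(downsample_step // r, 1))
# (1, 4) -> 1, (4, 4) -> 1, (4, 1) -> 4, (8, 2) -> 4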
Example #6
def deepvoice3(n_vocab,
               embed_dim=256,
               mel_dim=80,
               linear_dim=513,
               r=4,
               n_speakers=1,
               speaker_embed_dim=16,
               padding_idx=0,
               dropout=(1 - 0.95),
               kernel_size=5,
               encoder_channels=128,
               num_encoder_layer=7,
               decoder_channels=256,
               num_decoder_layer=4,
               attention_hidden=128,
               converter_channels=256,
               num_converter_layer=5,
               query_position_rate=1.0,
               key_position_rate=1.29,
               position_weight=1.0,
               use_memory_mask=False,
               trainable_positional_encodings=False,
               force_monotonic_attention=True,
               use_decoder_state_for_postnet_input=True,
               max_positions=512,
               embedding_weight_std=0.1,
               speaker_embedding_weight_std=0.01,
               freeze_embedding=False,
               window_ahead=3,
               window_backward=1,
               world_upsample=1,
               sp_fft_size=1025,
               training_type='seq2seq'):
    """Build deepvoice3
    """
    from deepvoice3_pytorch.deepvoice3 import Encoder, Decoder, LinearConverter, WorldConverter
    # Assumption: this fork exposes the seq2seq wrappers at the package top
    # level, as upstream does for MultiSpeakerTTSModel / AttentionSeq2Seq.
    from deepvoice3_pytorch import AttentionSeq2Seq, MultiSpeakerTTSModel, MultispeakerSeq2seq

    # Seq2seq
    h = encoder_channels  # hidden dim (channels)
    k = kernel_size  # kernel size
    encoder = Encoder(
        n_vocab,
        embed_dim,
        padding_idx=padding_idx,
        n_speakers=n_speakers,
        speaker_embed_dim=speaker_embed_dim,
        dropout=dropout,
        max_positions=max_positions,
        embedding_weight_std=embedding_weight_std,
        # (channels, kernel_size, dilation)
        convolutions=[
            (h, k, 1),
        ] * num_encoder_layer,
    )

    h = decoder_channels
    k = kernel_size
    att_hid = attention_hidden
    decoder = Decoder(
        embed_dim,
        attention_hidden=att_hid,
        in_dim=mel_dim,
        r=r,
        padding_idx=padding_idx,
        n_speakers=n_speakers,
        speaker_embed_dim=speaker_embed_dim,
        dropout=dropout,
        max_positions=max_positions,
        preattention=[(mel_dim * r, h // 2), (h // 2, h)],
        convolutions=[
            (h, k, 1),
        ] * num_decoder_layer,
        force_monotonic_attention=force_monotonic_attention,
        query_position_rate=query_position_rate,
        key_position_rate=key_position_rate,
        position_weight=position_weight,
        use_memory_mask=use_memory_mask,
        window_ahead=window_ahead,
        window_backward=window_backward,
    )

    seq2seq = AttentionSeq2Seq(encoder, decoder)

    if training_type == 'seq2seq':
        scale_speaker_embed = num_encoder_layer + 2 + num_decoder_layer * 2 + 2  # TODO: confirm whether this is actually needed
        model = MultispeakerSeq2seq(
            seq2seq,
            padding_idx=padding_idx,
            mel_dim=mel_dim,
            n_speakers=n_speakers,
            speaker_embed_dim=speaker_embed_dim,
            trainable_positional_encodings=trainable_positional_encodings,
            use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input,
            speaker_embedding_weight_std=speaker_embedding_weight_std,
            freeze_embedding=freeze_embedding,
            scale_speaker_embed=scale_speaker_embed)
        model.training_type = training_type
        return model

    # Post net
    if use_decoder_state_for_postnet_input:
        in_dim = h
    else:
        in_dim = mel_dim
    h = converter_channels
    k = kernel_size

    # Linear-spectrogram or WORLD-vocoder parameter converter
    if training_type == 'linear':
        converter = LinearConverter(n_speakers=n_speakers,
                                    speaker_embed_dim=speaker_embed_dim,
                                    in_dim=in_dim,
                                    out_dim=linear_dim,
                                    dropout=dropout,
                                    r=r,
                                    convolutions=[
                                        (h, k, 1),
                                    ] * num_converter_layer)
    elif training_type == 'world':
        converter = WorldConverter(n_speakers=n_speakers,
                                   speaker_embed_dim=speaker_embed_dim,
                                   in_dim=in_dim,
                                   out_dim=sp_fft_size,
                                   dropout=dropout,
                                   r=r,
                                   time_upsampling=world_upsample,
                                   convolutions=[
                                       (h, k, 1),
                                   ] * num_converter_layer)
    else:
        raise ValueError("Unknown training_type: {}".format(training_type))

    scale_speaker_embed = num_encoder_layer + 2 + num_decoder_layer * 2 + 2 + num_converter_layer + 1

    # Seq2seq + post net
    model = MultiSpeakerTTSModel(
        seq2seq,
        converter,
        padding_idx=padding_idx,
        mel_dim=mel_dim,
        linear_dim=linear_dim,
        n_speakers=n_speakers,
        speaker_embed_dim=speaker_embed_dim,
        trainable_positional_encodings=trainable_positional_encodings,
        use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input,
        speaker_embedding_weight_std=speaker_embedding_weight_std,
        freeze_embedding=freeze_embedding,
        scale_speaker_embed=scale_speaker_embed)
    model.training_type = training_type

    return model
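
A hedged sketch of the three-way dispatch (hypothetical vocabulary size; assumes the fork's MultispeakerSeq2seq and converter classes are importable as noted in the function body). 'seq2seq' returns the encoder/decoder alone; 'linear' and 'world' attach the corresponding post net:

for mode in ("seq2seq", "linear", "world"):
    m = deepvoice3(n_vocab=100, training_type=mode)
    print(mode, "->", type(m).__name__)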