Example No. 1
    def __init__(self,
                 n_src_vocab,
                 len_max_seq,
                 d_word_vec,
                 n_layers,
                 n_head,
                 d_k,
                 d_v,
                 d_model,
                 d_inner,
                 dropout=0.1):

        super().__init__()

        n_position = len_max_seq + 1

        self.src_word_emb = nn.Embedding(n_src_vocab,
                                         d_word_vec,
                                         padding_idx=Constants.PAD)

        self.position_enc = nn.Embedding.from_pretrained(
            get_sinusoid_encoding_table(n_position,
                                        d_word_vec,
                                        padding_idx=Constants.PAD),
            freeze=True)

        self.layer_stack = nn.ModuleList([
            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)
        ])
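All of the examples on this page assume a get_sinusoid_encoding_table helper that builds the fixed positional table passed to nn.Embedding.from_pretrained. The helper itself is not shown in any snippet; below is a minimal sketch consistent with the sinusoid formula from "Attention Is All You Need" (the exact helper in each repository may differ, e.g. in how the padding row is handled):

import numpy as np
import torch

def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
    # Returns an (n_position, d_hid) float tensor of sinusoid position encodings.
    def cal_angle(position, hid_idx):
        return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)

    sinusoid_table = np.array(
        [[cal_angle(pos, j) for j in range(d_hid)] for pos in range(n_position)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # even dims: sin
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # odd dims: cos

    if padding_idx is not None:
        sinusoid_table[padding_idx] = 0.  # all-zero vector for the padding position

    return torch.FloatTensor(sinusoid_table)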
Example No. 2
    def __init__(self, hparams):
        super(Encoder, self).__init__()

        convolutions = []
        for _ in range(hparams.encoder_n_convolutions):
            conv_layer = nn.Sequential(
                ConvNorm(hparams.encoder_embedding_dim,
                         hparams.encoder_embedding_dim,
                         kernel_size=hparams.encoder_kernel_size, stride=1,
                         padding=int((hparams.encoder_kernel_size - 1) / 2),
                         dilation=1, w_init_gain='relu'),
                nn.BatchNorm1d(hparams.encoder_embedding_dim))
            convolutions.append(conv_layer)
        self.convolutions = nn.ModuleList(convolutions)

        self.lstm = nn.LSTM(hparams.encoder_embedding_dim,
                            int(hparams.encoder_embedding_dim / 2), 1,
                            batch_first=True, bidirectional=True)

        # Transformer-TTS
        self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(1024, 512, padding_idx=0),
                                                    freeze=True)
        self.pos_dropout = nn.Dropout(p=0.1)
        self.alpha = nn.Parameter(torch.ones(1))
        self.layers = clones(SelfAttention(hparams.encoder_embedding_dim), hparams.n_attention)
        self.ffns = clones(FFN(hparams.encoder_embedding_dim), hparams.n_attention)
        self.norm = nn.LayerNorm(hparams.encoder_embedding_dim)
        self.concat_after = LinearNorm(hparams.encoder_embedding_dim + hparams.encoder_embedding_dim,
                                       hparams.encoder_embedding_dim)
        self.linear_norm = LinearNorm(hparams.encoder_embedding_dim, hparams.encoder_embedding_dim)
        self.pos_linear = Linear(hparams.encoder_embedding_dim, hparams.encoder_embedding_dim)
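Examples 2 through 10 also call a clones helper to stack identical attention and FFN sub-layers. It is likewise not defined in these snippets; the common definition (as in The Annotated Transformer) is a deep copy into an nn.ModuleList, sketched below:

import copy
import torch.nn as nn

def clones(module, N):
    # N independent deep copies of a module, registered in a ModuleList.
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])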
Example No. 3
    def __init__(self, num_hidden, seq_len):
        '''
        Args:
           num_hidden: dimension of hidden
           seq_len: number of positions in the sinusoid table
        '''
        super(MelDecoder, self).__init__()
        self.pos_emb = nn.Embedding.from_pretrained(
            get_sinusoid_encoding_table(seq_len, num_hidden, padding_idx=0),
            freeze=True)
        # table shape: (seq_len, num_hidden), i.e. vocab_size = seq_len
        self.pos_dropout = nn.Dropout(p=0.1)
        self.alpha = nn.Parameter(torch.ones(1))
        self.decoder_prenet = Prenet(hp.num_mels,
                                     num_hidden * 2,
                                     num_hidden,
                                     p=0.2)
        self.norm = Linear(num_hidden, num_hidden)

        self.selfattn_layers = clones(Attention(num_hidden), 3)
        self.dotattn_layers = clones(Attention(num_hidden), 3)
        self.ffns = clones(FFN(num_hidden), 3)
        self.mel_linear = Linear(num_hidden, hp.num_mels * hp.outputs_per_step)
        self.stop_linear = Linear(num_hidden, 1, w_init='sigmoid')

        self.postconvnet = PostConvNet(num_hidden)
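The constructors above only register the positional pieces. In Transformer-TTS style models they are typically used at the start of forward() like this: look up the frozen sinusoid table with 1-based position ids (0 is the padding id), scale the result by the learnable alpha, add it to the prenet output, and apply pos_dropout. A minimal, self-contained sketch of that step, reusing the get_sinusoid_encoding_table sketch above (tensor names and shapes here are illustrative assumptions, not taken from these snippets):

import torch
import torch.nn as nn

num_hidden, max_len, batch, T = 256, 1024, 2, 10

pos_emb = nn.Embedding.from_pretrained(
    get_sinusoid_encoding_table(max_len, num_hidden, padding_idx=0),
    freeze=True)
alpha = nn.Parameter(torch.ones(1))
pos_dropout = nn.Dropout(p=0.1)

x = torch.randn(batch, T, num_hidden)                       # prenet output (B, T, H)
pos = torch.arange(1, T + 1).unsqueeze(0).expand(batch, T)  # 1-based ids; 0 is reserved for padding
x = pos_dropout(x + alpha * pos_emb(pos))                   # scaled positional injection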
Example No. 4
    def __init__(self, kv_num_hidden, style_num_hidden, q_num_hidden,
                 num_hidden):
        """
        :param num_hidden: dimension of hidden
        """
        super(MelDecoder, self).__init__()
        self.pos_emb = nn.Embedding.from_pretrained(
            get_sinusoid_encoding_table(1024, num_hidden, padding_idx=0),
            freeze=True)
        self.pos_dropout = nn.Dropout(p=0.1)
        self.alpha = nn.Parameter(t.ones(1))
        self.decoder_prenet = Prenet(hp.num_mels,
                                     num_hidden * 2,
                                     num_hidden,
                                     p=0.2)
        self.norm = Linear(q_num_hidden, q_num_hidden)

        self.selfattn_layers = clones(
            Attention(q_num_hidden, q_num_hidden, num_hidden, hp.n_heads),
            hp.n_layers)
        self.styleattn_layers = clones(
            Attention(style_num_hidden, q_num_hidden, num_hidden, hp.n_heads),
            hp.n_layers)
        self.dotattn_layers = clones(
            Attention(kv_num_hidden, q_num_hidden, num_hidden, hp.n_heads),
            hp.n_layers)
        self.ffns = clones(FFN(q_num_hidden), hp.n_layers)
        self.mel_linear = Linear(q_num_hidden,
                                 hp.num_mels * hp.outputs_per_step)

        self.postconvnet = PostConvNet(q_num_hidden)
Example No. 5
    def __init__(self, embedding_size, num_hidden):
        """
        :param embedding_size: dimension of embedding (512)
        :param num_hidden: dimension of hidden (256)
        """
        super(Encoder, self).__init__()
        self.alpha = nn.Parameter(t.ones(1))  # scalar scale, initialized to 1
        self.pos_emb = nn.Embedding.from_pretrained(
            get_sinusoid_encoding_table(1024, num_hidden, padding_idx=0),
            freeze=True)  # table shape: [1024, 256]
        self.pos_dropout = nn.Dropout(p=0.1)
        self.encoder_prenet = EncoderPrenet(embedding_size, num_hidden)  # output dim: 256
        self.layers = clones(Attention(num_hidden, num_hidden, num_hidden, hp.n_heads), hp.n_layers)
        self.ffns = clones(FFN(num_hidden), hp.n_layers)
Example No. 6
    def __init__(self, embedding_size, num_hidden):
        """
        :param embedding_size: dimension of embedding
        :param num_hidden: dimension of hidden
        """
        super(Encoder, self).__init__()
        self.alpha = nn.Parameter(t.ones(1))
        self.pos_emb = nn.Embedding.from_pretrained(
            get_sinusoid_encoding_table(1024, num_hidden, padding_idx=0),
            freeze=True)
        self.pos_dropout = nn.Dropout(p=0.1)
        self.encoder_prenet = EncoderPrenet(embedding_size, num_hidden)
        self.layers = clones(Attention(num_hidden, hp.n_encoder_attention_heads), hp.n_encoder_layers)
        self.ffns = clones(FFN(num_hidden=num_hidden, filter_size=hp.encoder_conv1d_filter_size,
                               kernel_size=hp.encoder_conv1d_kernel), hp.n_encoder_layers)
Example No. 7
    def __init__(self, embedding_size, num_hidden, seq_len):
        '''
        Args:
           embedding_size: dimension of embedding
           num_hidden: dimension of hidden
           seq_len: number of positions in the sinusoid table
        '''
        super(Encoder, self).__init__()
        self.alpha = nn.Parameter(torch.ones(1))
        self.pos_emb = nn.Embedding.from_pretrained(
            get_sinusoid_encoding_table(seq_len, num_hidden, padding_idx=0),
            freeze=True)
        self.pos_dropout = nn.Dropout(p=0.1)
        self.encoder_prenet = EncoderPrenet(embedding_size, num_hidden)
        self.layers = clones(Attention(num_hidden), 3)
        self.ffns = clones(FFN(num_hidden), 3)
Example No. 8
    def __init__(self, embedding_size, num_hidden):
        """
        :param embedding_size: dimension of embedding
        :param num_hidden: dimension of hidden
        """
        super(Encoder, self).__init__()
        self.alpha = nn.Parameter(t.ones(1))  # declare alpha as a trainable parameter
        # the input pos_text is first passed through the sinusoid transform and then embedded
        self.pos_emb = nn.Embedding.from_pretrained(
            get_sinusoid_encoding_table(1024, num_hidden, padding_idx=0),
            freeze=True)
        self.pos_dropout = nn.Dropout(p=0.1)
        self.encoder_prenet = EncoderPrenet(embedding_size, num_hidden)
        self.layers = clones(Attention(num_hidden), 3)
        self.ffns = clones(FFN(num_hidden), 3)
Example No. 9
    def __init__(self, hparams):
        super(Decoder, self).__init__()
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        self.encoder_embedding_dim = hparams.encoder_embedding_dim
        self.attention_rnn_dim = hparams.attention_rnn_dim
        self.decoder_rnn_dim = hparams.decoder_rnn_dim
        self.prenet_dim = hparams.prenet_dim
        self.max_decoder_steps = hparams.max_decoder_steps
        self.gate_threshold = hparams.gate_threshold
        self.p_attention_dropout = hparams.p_attention_dropout
        self.p_decoder_dropout = hparams.p_decoder_dropout

        self.prenet = Prenet(
            hparams.n_mel_channels * hparams.n_frames_per_step,
            [hparams.prenet_dim, hparams.prenet_dim])


        self.attention_rnn = nn.LSTMCell(
            hparams.prenet_dim + hparams.encoder_embedding_dim,
            hparams.attention_rnn_dim)

        self.attention_layer = Attention(
            hparams.attention_rnn_dim, hparams.encoder_embedding_dim,
            hparams.attention_dim, hparams.attention_location_n_filters,
            hparams.attention_location_kernel_size)

        self.decoder_rnn = nn.LSTMCell(
            hparams.attention_rnn_dim + hparams.encoder_embedding_dim,
            hparams.decoder_rnn_dim, 1)

        self.linear_projection = LinearNorm(
            hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
            hparams.n_mel_channels * hparams.n_frames_per_step)

        self.gate_layer = LinearNorm(
            hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
            bias=True, w_init_gain='sigmoid')

        # Transformer TTS
        self.norm = LinearNorm(hparams.prenet_dim, hparams.prenet_dim)
        self.pos_emb = nn.Embedding.from_pretrained(
            get_sinusoid_encoding_table(1024, hparams.prenet_dim, padding_idx=0),
            freeze=True)
        self.alpha = nn.Parameter(torch.ones(1))
        self.pos_dropout = nn.Dropout(p=0.1)
        self.pos_linear = Linear(hparams.prenet_dim, hparams.prenet_dim)
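LinearNorm is not defined in any of these snippets. In Tacotron 2-derived code it is usually just a linear layer whose weights get Xavier-uniform initialization scaled by the gain of the intended nonlinearity (hence arguments like w_init_gain='sigmoid' above). A sketch under that assumption:

import torch

class LinearNorm(torch.nn.Module):
    # Linear layer with Xavier-uniform init scaled by a nonlinearity gain.
    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super(LinearNorm, self).__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
        torch.nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        return self.linear_layer(x)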
Example No. 10
    def __init__(self, embedding_size, num_hidden):
        """
        :param embedding_size: dimension of embedding
        :param num_hidden: dimension of hidden
        """
        super(Encoder, self).__init__()
        self.alpha = nn.Parameter(t.ones(1))
        self.pos_emb = nn.Embedding.from_pretrained(
            get_sinusoid_encoding_table(4096, num_hidden, padding_idx=0),
            freeze=True)
        self.pos_dropout = nn.Dropout(p=0.1)
        self.embed = nn.Embedding(256, num_hidden, padding_idx=0)
        # self.enc_prenet = Linear(embedding_size, num_hidden)

        self.layers = clones(Attention(num_hidden, hp.num_heads),
                             hp.num_layers)
        self.ffns = clones(FFN(num_hidden), hp.num_layers)