def __init__(
        self, n_src_vocab, len_max_seq, d_word_vec,
        n_layers, n_head, d_k, d_v,
        d_model, d_inner, dropout=0.1):
    super().__init__()

    n_position = len_max_seq + 1

    self.src_word_emb = nn.Embedding(
        n_src_vocab, d_word_vec, padding_idx=Constants.PAD)

    self.position_enc = nn.Embedding.from_pretrained(
        get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=Constants.PAD),
        freeze=True)

    self.layer_stack = nn.ModuleList([
        EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
        for _ in range(n_layers)])
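# Hedged sketch (the helper is not shown in this section): a plausible form of
# get_sinusoid_encoding_table, which every nn.Embedding.from_pretrained(...)
# call here appears to assume, following the common open-source implementation.
# Row `pos` holds the "Attention Is All You Need" sin/cos position encoding;
# the padding row is zeroed so PAD positions contribute no positional signal.
import numpy as np
import torch

def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
    def cal_angle(position, hid_idx):
        return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)

    def get_posi_angle_vec(position):
        return [cal_angle(position, hid_j) for hid_j in range(d_hid)]

    table = np.array([get_posi_angle_vec(pos) for pos in range(n_position)])
    table[:, 0::2] = np.sin(table[:, 0::2])   # even dims: sin
    table[:, 1::2] = np.cos(table[:, 1::2])   # odd dims: cos

    if padding_idx is not None:
        table[padding_idx] = 0.               # zero vector for the padding index

    return torch.FloatTensor(table)           # shape (n_position, d_hid)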
def __init__(self, hparams):
    super(Encoder, self).__init__()

    convolutions = []
    for _ in range(hparams.encoder_n_convolutions):
        conv_layer = nn.Sequential(
            ConvNorm(hparams.encoder_embedding_dim,
                     hparams.encoder_embedding_dim,
                     kernel_size=hparams.encoder_kernel_size, stride=1,
                     padding=int((hparams.encoder_kernel_size - 1) / 2),
                     dilation=1, w_init_gain='relu'),
            nn.BatchNorm1d(hparams.encoder_embedding_dim))
        convolutions.append(conv_layer)
    self.convolutions = nn.ModuleList(convolutions)

    self.lstm = nn.LSTM(hparams.encoder_embedding_dim,
                        int(hparams.encoder_embedding_dim / 2), 1,
                        batch_first=True, bidirectional=True)

    # Transformer-TTS
    self.pos_emb = nn.Embedding.from_pretrained(
        get_sinusoid_encoding_table(1024, 512, padding_idx=0), freeze=True)
    self.pos_dropout = nn.Dropout(p=0.1)
    self.alpha = nn.Parameter(torch.ones(1))
    self.layers = clones(SelfAttention(hparams.encoder_embedding_dim), hparams.n_attention)
    self.ffns = clones(FFN(hparams.encoder_embedding_dim), hparams.n_attention)
    self.norm = nn.LayerNorm(hparams.encoder_embedding_dim)
    self.concat_after = LinearNorm(hparams.encoder_embedding_dim + hparams.encoder_embedding_dim,
                                   hparams.encoder_embedding_dim)
    self.linear_norm = LinearNorm(hparams.encoder_embedding_dim, hparams.encoder_embedding_dim)
    self.pos_linear = Linear(hparams.encoder_embedding_dim, hparams.encoder_embedding_dim)
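# Hedged sketch: `clones`, used by the layer stacks here and in the snippets
# below, is not defined in this section. The standard helper (as in The
# Annotated Transformer) deep-copies a module N times into an nn.ModuleList so
# that each copy has its own parameters.
import copy
import torch.nn as nn

def clones(module, N):
    """Return an nn.ModuleList of N independent deep copies of `module`."""
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])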
def __init__(self, num_hidden, seq_len):
    '''
    Args:
        num_hidden: dimension of hidden
    '''
    super(MelDecoder, self).__init__()
    self.pos_emb = nn.Embedding.from_pretrained(
        get_sinusoid_encoding_table(seq_len, num_hidden, padding_idx=0),
        freeze=True)  # shape (seq_len, num_hidden), i.e. vocab_size = seq_len
    self.pos_dropout = nn.Dropout(p=0.1)
    self.alpha = nn.Parameter(torch.ones(1))
    self.decoder_prenet = Prenet(hp.num_mels, num_hidden * 2, num_hidden, p=0.2)
    self.norm = Linear(num_hidden, num_hidden)

    self.selfattn_layers = clones(Attention(num_hidden), 3)
    self.dotattn_layers = clones(Attention(num_hidden), 3)
    self.ffns = clones(FFN(num_hidden), 3)
    self.mel_linear = Linear(num_hidden, hp.num_mels * hp.outputs_per_step)
    self.stop_linear = Linear(num_hidden, 1, w_init='sigmoid')

    self.postconvnet = PostConvNet(num_hidden)
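# Hedged sketch (not from this file): the usual way the MelDecoder output heads
# declared above are applied once the stacked self-/dot-attention and FFN layers
# have produced a hidden sequence `x` of shape (B, T, num_hidden). The method
# name `project_outputs` and PostConvNet's channel-first input convention are
# assumptions for illustration only.
def project_outputs(self, x):
    mel_out = self.mel_linear(x)                       # (B, T, num_mels * outputs_per_step)
    stop_tokens = self.stop_linear(x)                  # (B, T, 1) stop-token logits
    residual = self.postconvnet(mel_out.transpose(1, 2))
    mel_out_post = mel_out + residual.transpose(1, 2)  # post-net residual refinement
    return mel_out, mel_out_post, stop_tokens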
def __init__(self, kv_num_hidden, style_num_hidden, q_num_hidden, num_hidden):
    """
    :param num_hidden: dimension of hidden
    """
    super(MelDecoder, self).__init__()
    self.pos_emb = nn.Embedding.from_pretrained(
        get_sinusoid_encoding_table(1024, num_hidden, padding_idx=0),
        freeze=True)
    self.pos_dropout = nn.Dropout(p=0.1)
    self.alpha = nn.Parameter(t.ones(1))
    self.decoder_prenet = Prenet(hp.num_mels, num_hidden * 2, num_hidden, p=0.2)
    self.norm = Linear(q_num_hidden, q_num_hidden)

    self.selfattn_layers = clones(
        Attention(q_num_hidden, q_num_hidden, num_hidden, hp.n_heads), hp.n_layers)
    self.styleattn_layers = clones(
        Attention(style_num_hidden, q_num_hidden, num_hidden, hp.n_heads), hp.n_layers)
    self.dotattn_layers = clones(
        Attention(kv_num_hidden, q_num_hidden, num_hidden, hp.n_heads), hp.n_layers)
    self.ffns = clones(FFN(q_num_hidden), hp.n_layers)
    self.mel_linear = Linear(q_num_hidden, hp.num_mels * hp.outputs_per_step)
    self.postconvnet = PostConvNet(q_num_hidden)
def __init__(self, embedding_size, num_hidden):
    """
    :param embedding_size: dimension of embedding 512
    :param num_hidden: dimension of hidden 256
    """
    super(Encoder, self).__init__()
    self.alpha = nn.Parameter(t.ones(1))  # 1
    self.pos_emb = nn.Embedding.from_pretrained(
        get_sinusoid_encoding_table(1024, num_hidden, padding_idx=0),
        freeze=True)  # [1024, 256]
    self.pos_dropout = nn.Dropout(p=0.1)
    self.encoder_prenet = EncoderPrenet(embedding_size, num_hidden)  # output: 256
    self.layers = clones(Attention(num_hidden, num_hidden, num_hidden, hp.n_heads), hp.n_layers)
    self.ffns = clones(FFN(num_hidden), hp.n_layers)
def __init__(self, embedding_size, num_hidden):
    """
    :param embedding_size: dimension of embedding
    :param num_hidden: dimension of hidden
    """
    super(Encoder, self).__init__()
    self.alpha = nn.Parameter(t.ones(1))
    self.pos_emb = nn.Embedding.from_pretrained(
        get_sinusoid_encoding_table(1024, num_hidden, padding_idx=0),
        freeze=True)
    self.pos_dropout = nn.Dropout(p=0.1)
    self.encoder_prenet = EncoderPrenet(embedding_size, num_hidden)
    self.layers = clones(
        Attention(num_hidden, hp.n_encoder_attention_heads), hp.n_encoder_layers)
    self.ffns = clones(
        FFN(num_hidden=num_hidden,
            filter_size=hp.encoder_conv1d_filter_size,
            kernel_size=hp.encoder_conv1d_kernel),
        hp.n_encoder_layers)
def __init__(self, embedding_size, num_hidden, seq_len):
    '''
    Args:
        embedding_size: dimension of embedding
        num_hidden: dimension of hidden
    '''
    super(Encoder, self).__init__()
    self.alpha = nn.Parameter(torch.ones(1))
    self.pos_emb = nn.Embedding.from_pretrained(
        get_sinusoid_encoding_table(seq_len, num_hidden, padding_idx=0),
        freeze=True)
    self.pos_dropout = nn.Dropout(p=0.1)
    self.encoder_prenet = EncoderPrenet(embedding_size, num_hidden)
    self.layers = clones(Attention(num_hidden), 3)
    self.ffns = clones(FFN(num_hidden), 3)
def __init__(self, embedding_size, num_hidden):
    """
    :param embedding_size: dimension of embedding
    :param num_hidden: dimension of hidden
    """
    super(Encoder, self).__init__()
    self.alpha = nn.Parameter(t.ones(1))  # declare alpha as a trainable parameter
    # the input pos_text is first mapped through the sinusoid table, then embedded
    self.pos_emb = nn.Embedding.from_pretrained(
        get_sinusoid_encoding_table(1024, num_hidden, padding_idx=0),
        freeze=True)
    self.pos_dropout = nn.Dropout(p=0.1)
    self.encoder_prenet = EncoderPrenet(embedding_size, num_hidden)
    self.layers = clones(Attention(num_hidden), 3)
    self.ffns = clones(FFN(num_hidden), 3)
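# Hedged sketch (not from this file): how a Transformer-TTS style encoder built
# from the modules above is typically run. The Attention call signature and the
# omission of padding masks are assumptions; real forwards usually also build
# key/query masks from the zero (padding) positions in `pos`.
def forward(self, x, pos):
    x = self.encoder_prenet(x)                  # (B, T, num_hidden)
    x = x + self.alpha * self.pos_emb(pos)      # frozen sinusoid table, learnable scale alpha
    x = self.pos_dropout(x)
    for attn, ffn in zip(self.layers, self.ffns):
        x, _ = attn(x, x)                       # self-attention over the sequence
        x = ffn(x)                              # position-wise feed-forward
    return x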
def __init__(self, hparams):
    super(Decoder, self).__init__()
    self.n_mel_channels = hparams.n_mel_channels
    self.n_frames_per_step = hparams.n_frames_per_step
    self.encoder_embedding_dim = hparams.encoder_embedding_dim
    self.attention_rnn_dim = hparams.attention_rnn_dim
    self.decoder_rnn_dim = hparams.decoder_rnn_dim
    self.prenet_dim = hparams.prenet_dim
    self.max_decoder_steps = hparams.max_decoder_steps
    self.gate_threshold = hparams.gate_threshold
    self.p_attention_dropout = hparams.p_attention_dropout
    self.p_decoder_dropout = hparams.p_decoder_dropout

    self.prenet = Prenet(
        hparams.n_mel_channels * hparams.n_frames_per_step,
        [hparams.prenet_dim, hparams.prenet_dim])

    self.attention_rnn = nn.LSTMCell(
        hparams.prenet_dim + hparams.encoder_embedding_dim,
        hparams.attention_rnn_dim)

    self.attention_layer = Attention(
        hparams.attention_rnn_dim, hparams.encoder_embedding_dim,
        hparams.attention_dim, hparams.attention_location_n_filters,
        hparams.attention_location_kernel_size)

    self.decoder_rnn = nn.LSTMCell(
        hparams.attention_rnn_dim + hparams.encoder_embedding_dim,
        hparams.decoder_rnn_dim, 1)

    self.linear_projection = LinearNorm(
        hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
        hparams.n_mel_channels * hparams.n_frames_per_step)

    self.gate_layer = LinearNorm(
        hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
        bias=True, w_init_gain='sigmoid')

    # Transformer TTS
    self.norm = LinearNorm(hparams.prenet_dim, hparams.prenet_dim)
    self.pos_emb = nn.Embedding.from_pretrained(
        get_sinusoid_encoding_table(1024, hparams.prenet_dim, padding_idx=0),
        freeze=True)
    self.alpha = nn.Parameter(torch.ones(1))
    self.pos_dropout = nn.Dropout(p=0.1)
    self.pos_linear = Linear(hparams.prenet_dim, hparams.prenet_dim)
def __init__(self, embedding_size, num_hidden):
    """
    :param embedding_size: dimension of embedding
    :param num_hidden: dimension of hidden
    """
    super(Encoder, self).__init__()
    self.alpha = nn.Parameter(t.ones(1))
    self.pos_emb = nn.Embedding.from_pretrained(
        get_sinusoid_encoding_table(4096, num_hidden, padding_idx=0),
        freeze=True)
    self.pos_dropout = nn.Dropout(p=0.1)
    self.embed = nn.Embedding(256, num_hidden, padding_idx=0)
    # self.enc_prenet = Linear(embedding_size, num_hidden)
    self.layers = clones(Attention(num_hidden, hp.num_heads), hp.num_layers)
    self.ffns = clones(FFN(num_hidden), hp.num_layers)