Example #1
File: model.py Project: ishine/UniNet
 def __init__(self, linguistic_dim, phone_level_rnn_dim, join_model_dim,
              join_model_hidden_dim):
     super(JoinModol, self).__init__()
     self.layers1 = LinearNorm(linguistic_dim + phone_level_rnn_dim,
                               join_model_hidden_dim)
     self.layers2 = LinearNorm(join_model_hidden_dim, join_model_hidden_dim)
     self.layers3 = LinearNorm(join_model_hidden_dim, join_model_dim)
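All of the examples on this page build layers out of LinearNorm, as found in Tacotron 2-style repositories. For reference, here is a minimal sketch of the layer as it commonly appears in these codebases, assuming the usual NVIDIA-style definition: an nn.Linear wrapped with Xavier-uniform initialization whose gain is chosen to match the downstream nonlinearity.

import torch

class LinearNorm(torch.nn.Module):
    """Linear layer with Xavier-uniform init scaled for the following nonlinearity."""
    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super(LinearNorm, self).__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
        torch.nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        return self.linear_layer(x)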
Example #2
 def __init__(self, vocab_size, decoder_dim, hidden_size, dropout=0.5):
     super(MIEsitmator, self).__init__()
     self.proj = nn.Sequential(
         LinearNorm(decoder_dim, hidden_size, bias=True,
                    w_init_gain='relu'), nn.ReLU(), nn.Dropout(p=dropout))
     self.ctc_proj = LinearNorm(hidden_size, vocab_size + 1, bias=True)
     self.ctc = nn.CTCLoss(blank=vocab_size, reduction='none')
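The CTC head above pairs a LinearNorm projection with nn.CTCLoss. Below is a small, self-contained sketch of how such a projection is typically fed to CTCLoss; the shapes, lengths, and variable names are illustrative, not taken from the repo.

import torch
import torch.nn.functional as F

vocab_size, hidden_size, T, B = 40, 128, 50, 4
ctc_proj = torch.nn.Linear(hidden_size, vocab_size + 1)      # stand-in for LinearNorm
ctc = torch.nn.CTCLoss(blank=vocab_size, reduction='none')

decoder_states = torch.randn(B, T, hidden_size)
log_probs = F.log_softmax(ctc_proj(decoder_states), dim=-1)  # (B, T, vocab+1)
log_probs = log_probs.transpose(0, 1)                        # CTCLoss expects (T, B, C)

targets = torch.randint(0, vocab_size, (B, 20))
input_lengths = torch.full((B,), T, dtype=torch.long)
target_lengths = torch.full((B,), 20, dtype=torch.long)
loss = ctc(log_probs, targets, input_lengths, target_lengths)  # per-utterance losses, shape (B,)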
Example #3
    def __init__(self,
                 input_size,
                 decoder_hidden_size,
                 audio_encoder_size,
                 spectral_size,
                 hparams,
                 num_layers=1,
                 bidirectional=False,
                 drop_prob=0):
        super(RecurrentDecoder, self).__init__()
        self.batch_size = hparams.batch_size
        self.decoder_hidden_size = decoder_hidden_size
        self.rnn_dropout = hparams.rnn_dropout
        self.n_mel_channels = hparams.n_mel_channels

        self.attention = AttentionLoop(audio_encoder_size,
                                       decoder_hidden_size,
                                       method="concat")
        self.rnn = nn.GRUCell(input_size + audio_encoder_size,
                              decoder_hidden_size,
                              bias=False)
        self.spectral_linear_projection = LinearNorm(audio_encoder_size,
                                                     audio_encoder_size)
        self.gate_linear_projection = LinearNorm(audio_encoder_size,
                                                 1,
                                                 bias=True,
                                                 w_init_gain='sigmoid')
Example #4
    def __init__(self, hparams):
        super(VAE, self).__init__()

        self.fc_r_mu = LinearNorm(hparams.reference_dim + 3,
                                  hparams.z_residual_dim)
        self.fc_r_lv = LinearNorm(hparams.reference_dim + 3,
                                  hparams.z_residual_dim)
Example #5
    def __init__(self, hparams):
        super(Encoder, self).__init__()

        convolutions = []
        for _ in range(hparams.encoder_n_convolutions):
            conv_layer = nn.Sequential(
                ConvNorm(hparams.encoder_embedding_dim,
                         hparams.encoder_embedding_dim,
                         kernel_size=hparams.encoder_kernel_size, stride=1,
                         padding=int((hparams.encoder_kernel_size - 1) / 2),
                         dilation=1, w_init_gain='relu'),
                nn.BatchNorm1d(hparams.encoder_embedding_dim))
            convolutions.append(conv_layer)
        self.convolutions = nn.ModuleList(convolutions)

        self.lstm = nn.LSTM(hparams.encoder_embedding_dim,
                            int(hparams.encoder_embedding_dim / 2), 1,
                            batch_first=True, bidirectional=True)

        # Transformer-TTS
        self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(1024, 512, padding_idx=0),
                                                    freeze=True)
        self.pos_dropout = nn.Dropout(p=0.1)
        self.alpha = nn.Parameter(torch.ones(1))
        self.layers = clones(SelfAttention(hparams.encoder_embedding_dim), hparams.n_attention)
        self.ffns = clones(FFN(hparams.encoder_embedding_dim), hparams.n_attention)
        self.norm = nn.LayerNorm(hparams.encoder_embedding_dim)
        self.concat_after = LinearNorm(hparams.encoder_embedding_dim + hparams.encoder_embedding_dim,
                                       hparams.encoder_embedding_dim)
        self.linear_norm = LinearNorm(hparams.encoder_embedding_dim, hparams.encoder_embedding_dim)
        self.pos_linear = Linear(hparams.encoder_embedding_dim, hparams.encoder_embedding_dim)
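The Transformer-TTS additions above (pos_emb, alpha, pos_dropout) are usually combined with the convolution/LSTM output through a scaled sinusoidal positional encoding. The method below is an illustrative sketch of that pattern, not the repo's own forward code.

# Illustrative only: add position information scaled by the learnable alpha.
def add_positional_encoding(self, x):
    # x: (B, T, encoder_embedding_dim)
    positions = torch.arange(1, x.size(1) + 1, device=x.device).unsqueeze(0)  # (1, T); index 0 is padding_idx
    x = x + self.alpha * self.pos_emb(positions)
    return self.pos_dropout(x)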
Example #6
    def __init__(self, enc_hidden_size, dec_hidden_size):
        super(Attention, self).__init__()
        # self.model = LinearNorm(dec_hidden_size, enc_hidden_size, bias=False)

        self.fc_1 = LinearNorm(enc_hidden_size, dec_hidden_size, bias=True)
        self.fc_2 = LinearNorm(dec_hidden_size, dec_hidden_size, bias=True)
        self.weight = nn.Parameter(torch.zeros(dec_hidden_size, 1))
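This attention variant only defines its parameters (fc_1, fc_2, and a learned weight vector). One plausible way they combine into alignment scores is the "concat" scoring below, written as a self-contained helper; the scoring form and tensor shapes are assumptions for illustration, not taken from the repo.

import torch

def concat_attention_scores(enc_outputs, dec_hidden, fc_1, fc_2, weight):
    # enc_outputs: (B, T_enc, enc_hidden_size), dec_hidden: (B, dec_hidden_size)
    energies = torch.tanh(fc_1(enc_outputs) + fc_2(dec_hidden).unsqueeze(1))  # (B, T_enc, dec_hidden_size)
    scores = energies.matmul(weight).squeeze(-1)                              # weight: (dec_hidden_size, 1) -> (B, T_enc)
    return torch.softmax(scores, dim=1)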
Example #7
    def __init__(self, hparams):
        super(Decoder, self).__init__()
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        self.encoder_embedding_dim = hparams.encoder_embedding_dim
        self.attention_rnn_dim = hparams.attention_rnn_dim
        self.decoder_rnn_dim = hparams.decoder_rnn_dim
        self.prenet_dim = hparams.prenet_dim
        self.max_decoder_steps = hparams.max_decoder_steps
        self.gate_threshold = hparams.gate_threshold

        self.prenet = Prenet(
            hparams.n_mel_channels * hparams.n_frames_per_step,
            [hparams.prenet_dim, hparams.prenet_dim])

        self.attention_rnn = nn.LSTMCell(
            hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
            hparams.attention_rnn_dim)

        self.attention_layer = Attention(
            hparams.attention_rnn_dim, hparams.encoder_embedding_dim,
            hparams.attention_dim, hparams.attention_location_n_filters,
            hparams.attention_location_kernel_size)

        self.decoder_rnn = nn.LSTMCell(
            hparams.prenet_dim + hparams.encoder_embedding_dim,
            hparams.decoder_rnn_dim, 1)

        self.linear_projection = LinearNorm(
            hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
            hparams.n_mel_channels*hparams.n_frames_per_step)

        self.gate_layer = LinearNorm(
            hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
            bias=True, w_init_gain='sigmoid')
Example #8
    def __init__(self, hparams):
        super(Discriminator, self).__init__()

        self.aae_hidden1 = LinearNorm(hparams.z_residual_dim+3, 256)
        self.aae_hidden2 = LinearNorm(256, 256)
        self.aae_output = LinearNorm(256, 1)
        self.batchs = hparams.batch_size
        self.z_r_dim = hparams.z_residual_dim
Example #9
 def __init__(self, enc_hidden_size, dec_hidden_size, method="concat"):
     super(AttentionLoop, self).__init__()
     self.method = method
     if method == "general":
         self.model = LinearNorm(dec_hidden_size,
                                 enc_hidden_size,
                                 bias=True)
     elif method == "concat":
         self.fc = LinearNorm(dec_hidden_size, enc_hidden_size, bias=True)
         self.weight = nn.Parameter(torch.zeros(enc_hidden_size, 1))
Example #10
    def __init__(self, hparams, supervised=False):
        super(GMVAE_revised, self).__init__()
        self.latent_embedding_dim = hparams.latent_embedding_dim
        self.supervised = supervised
        convolutions = []
        conv_layer_1 = nn.Sequential(
            ConvNorm(hparams.n_mel_channels,
                     hparams.latent_embedding_dim,
                     kernel_size=hparams.latent_kernel_size,
                     stride=1,
                     padding=int((hparams.latent_kernel_size - 1) / 2),
                     dilation=1,
                     w_init_gain='relu'),
            nn.BatchNorm1d(hparams.latent_embedding_dim))
        convolutions.append(conv_layer_1)

        conv_layer_2 = nn.Sequential(
            ConvNorm(hparams.latent_embedding_dim,
                     hparams.latent_embedding_dim,
                     kernel_size=hparams.latent_kernel_size,
                     stride=1,
                     padding=int((hparams.latent_kernel_size - 1) / 2),
                     dilation=1,
                     w_init_gain='relu'),
            nn.BatchNorm1d(hparams.latent_embedding_dim))
        convolutions.append(conv_layer_2)

        self.convolutions = nn.ModuleList(convolutions)

        self.lstm = nn.LSTM(hparams.latent_embedding_dim,
                            int(hparams.latent_embedding_dim / 2),
                            1,
                            batch_first=True,
                            bidirectional=True)

        # self.mean_pool = nn.AvgPool1d(hparams.latent_kernel_size, stride=1)
        #
        # self.mean_pool_out_size = hparams.latent_embedding_dim - hparams.latent_kernel_size + 1

        self.linear_projection = LinearNorm(
            hparams.latent_embedding_dim,
            int(hparams.latent_embedding_dim / 2))

        self.linear_projection_mean = LinearNorm(
            int(hparams.latent_embedding_dim / 2), hparams.latent_out_dim)

        self.linear_projection_variance = LinearNorm(
            int(hparams.latent_embedding_dim / 2), hparams.latent_out_dim)

        self.fc3 = nn.Linear(hparams.latent_out_dim,
                             int(hparams.latent_embedding_dim / 2))

        self.fc4 = nn.Linear(int(hparams.latent_embedding_dim / 2),
                             hparams.latent_embedding_dim)
Example #11
File: model.py Project: zuiwanting/zhrtvc
 def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
              attention_location_n_filters, attention_location_kernel_size):
     super(Attention, self).__init__()
     self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
                                   bias=False, w_init_gain='tanh')
     self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False,
                                    w_init_gain='tanh')
     self.v = LinearNorm(attention_dim, 1, bias=False)
     self.location_layer = LocationLayer(attention_location_n_filters,
                                         attention_location_kernel_size,
                                         attention_dim)
     self.score_mask_value = -float("inf")
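This is the standard Tacotron 2 location-sensitive attention. In NVIDIA's reference implementation, which this class mirrors, the scoring step combines the three projections as sketched below; the surrounding forward/masking code is omitted.

def get_alignment_energies(self, query, processed_memory, attention_weights_cat):
    # query: attention RNN hidden state, (B, attention_rnn_dim)
    # processed_memory: memory_layer(encoder outputs), (B, T_in, attention_dim)
    # attention_weights_cat: previous and cumulative attention weights, (B, 2, T_in)
    processed_query = self.query_layer(query.unsqueeze(1))
    processed_attention_weights = self.location_layer(attention_weights_cat)
    energies = self.v(torch.tanh(
        processed_query + processed_attention_weights + processed_memory))
    return energies.squeeze(-1)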
Example #12
    def __init__(self, hparams):
        super(Decoder, self).__init__()
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        self.encoder_embedding_dim = hparams.encoder_embedding_dim + hparams.token_embedding_size
        #+ hparams.speaker_embedding_dim # no speaker embedding: bigger token embedding size
        self.attention_rnn_dim = hparams.attention_rnn_dim
        self.decoder_rnn_dim = hparams.decoder_rnn_dim
        self.prenet_dim = hparams.prenet_dim
        self.max_decoder_steps = hparams.max_decoder_steps
        self.gate_threshold = hparams.gate_threshold
        self.p_attention_dropout = hparams.p_attention_dropout
        self.p_decoder_dropout = hparams.p_decoder_dropout
        self.p_teacher_forcing = hparams.p_teacher_forcing

        #        self.prenet_f0_dim = hparams.prenet_f0_dim
        #        self.prenet_f0 = ConvNorm(
        #            1, hparams.prenet_f0_dim,
        #            kernel_size=hparams.prenet_f0_kernel_size,
        #            padding=max(0, int(hparams.prenet_f0_kernel_size/2)),
        #            bias=False, stride=1, dilation=1) # no f0

        self.prenet = Prenet(
            hparams.n_mel_channels * hparams.n_frames_per_step,
            [hparams.prenet_dim, hparams.prenet_dim])

        #        self.attention_rnn = nn.LSTMCell(
        #            hparams.prenet_dim + hparams.prenet_f0_dim + self.encoder_embedding_dim,
        #            hparams.attention_rnn_dim)

        self.attention_rnn = nn.LSTMCell(
            hparams.prenet_dim + self.encoder_embedding_dim,
            hparams.attention_rnn_dim)

        self.attention_layer = Attention(
            hparams.attention_rnn_dim, self.encoder_embedding_dim,
            hparams.attention_dim, hparams.attention_location_n_filters,
            hparams.attention_location_kernel_size)

        self.decoder_rnn = nn.LSTMCell(
            hparams.attention_rnn_dim + self.encoder_embedding_dim,
            hparams.decoder_rnn_dim, 1)

        self.linear_projection = LinearNorm(
            hparams.decoder_rnn_dim + self.encoder_embedding_dim,
            hparams.n_mel_channels * hparams.n_frames_per_step)

        self.gate_layer = LinearNorm(hparams.decoder_rnn_dim +
                                     self.encoder_embedding_dim,
                                     1,
                                     bias=True,
                                     w_init_gain='sigmoid')
Example #13
    def __init__(self, hparams):
        super(Decoder, self).__init__()
        self.mellotron = hparams.mellotron
        self.disable_f0 = hparams.disable_f0
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        self.encoder_embedding_dim = (
            hparams.encoder_embedding_dim + hparams.token_embedding_size +
            hparams.speaker_embedding_dim
            if self.mellotron else hparams.encoder_embedding_dim)
        self.attention_rnn_dim = hparams.attention_rnn_dim
        self.decoder_rnn_dim = hparams.decoder_rnn_dim
        self.prenet_dim = hparams.prenet_dim
        self.max_decoder_steps = hparams.max_decoder_steps
        self.gate_threshold = hparams.gate_threshold
        self.p_attention_dropout = hparams.p_attention_dropout
        self.p_decoder_dropout = hparams.p_decoder_dropout
        self.p_teacher_forcing = hparams.p_teacher_forcing
        
        if self.mellotron and not self.disable_f0:
            self.prenet_f0 = ConvNorm(
                1, hparams.prenet_f0_dim,
                kernel_size=hparams.prenet_f0_kernel_size,
                padding=max(0, int(hparams.prenet_f0_kernel_size/2)),
                bias=False, stride=1, dilation=1)
            attention_rnn_in_dim = hparams.prenet_dim + self.encoder_embedding_dim + hparams.prenet_f0_dim
        else:
            attention_rnn_in_dim = hparams.prenet_dim + self.encoder_embedding_dim
        
        self.prenet = Prenet(
            hparams.n_mel_channels * hparams.n_frames_per_step,
            [hparams.prenet_dim, hparams.prenet_dim])

        self.attention_rnn = nn.LSTMCell(
            attention_rnn_in_dim,
            hparams.attention_rnn_dim)

        self.attention_layer = Attention(
            hparams.attention_rnn_dim, self.encoder_embedding_dim,
            hparams.attention_dim, hparams.attention_location_n_filters,
            hparams.attention_location_kernel_size)

        self.decoder_rnn = nn.LSTMCell(
            hparams.attention_rnn_dim + self.encoder_embedding_dim,
            hparams.decoder_rnn_dim, 1)

        self.linear_projection = LinearNorm(
            hparams.decoder_rnn_dim + self.encoder_embedding_dim,
            hparams.n_mel_channels * hparams.n_frames_per_step)

        self.gate_layer = LinearNorm(
            hparams.decoder_rnn_dim + self.encoder_embedding_dim, 1,
            bias=True, w_init_gain='sigmoid')
Example #14
    def __init__(self, hparams):
        super(Decoder, self).__init__()
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        self.encoder_embedding_dim = hparams.encoder_embedding_dim
        self.attention_rnn_dim = hparams.attention_rnn_dim
        self.decoder_rnn_dim = hparams.decoder_rnn_dim
        self.prenet_dim = hparams.prenet_dim
        self.max_decoder_steps = hparams.max_decoder_steps
        self.gate_threshold = hparams.gate_threshold
        self.p_attention_dropout = hparams.p_attention_dropout
        self.p_decoder_dropout = hparams.p_decoder_dropout
        self.p_teacher_forcing = hparams.p_teacher_forcing

        self.prenet = Prenet(
            hparams.n_mel_channels * hparams.n_frames_per_step,
            [hparams.prenet_dim, hparams.prenet_dim])

        self.attention_rnn = nn.LSTMCell(
            hparams.prenet_dim + hparams.encoder_embedding_dim,
            hparams.attention_rnn_dim)

        self.attention_layer = Attention(
            hparams.attention_rnn_dim, hparams.encoder_embedding_dim,
            hparams.attention_dim, hparams.attention_location_n_filters,
            hparams.attention_location_kernel_size)

        self.decoder_rnn = nn.LSTMCell(
            hparams.attention_rnn_dim + hparams.encoder_embedding_dim,
            hparams.decoder_rnn_dim, 1)

        self.linear_projection = nn.Sequential(
            LinearNorm(hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
                       hparams.decoder_rnn_dim,
                       bias=True,
                       w_init_gain='relu'), nn.ReLU(), nn.Dropout(p=0.2))

        self.mel_layer = nn.Sequential(
            LinearNorm(hparams.decoder_rnn_dim,
                       hparams.decoder_rnn_dim,
                       bias=True,
                       w_init_gain='relu'), nn.ReLU(), nn.Dropout(p=0.2),
            LinearNorm(hparams.decoder_rnn_dim,
                       hparams.n_mel_channels * hparams.n_frames_per_step))

        self.gate_layer = LinearNorm(hparams.decoder_rnn_dim,
                                     1,
                                     bias=True,
                                     w_init_gain='sigmoid')
Example #15
File: model.py Project: ishine/UniNet
 def __init__(self, query_dim, keys_dim, attention_dim):
     super(SimpleAttention, self).__init__()
     # Conventional attention linearly transforms the query and keys, then computes v^T * tanh(W * query + V * keys)
     # query_layer and memory_layer produce W * query and V * keys, respectively
     # w_init_gain='tanh' is used because both projections sit inside the tanh(W * query + V * keys) nonlinearity
     self.query_layer = LinearNorm(query_dim,
                                   attention_dim,
                                   bias=False,
                                   w_init_gain='tanh')
     self.memory_layer = LinearNorm(keys_dim,
                                    attention_dim,
                                    bias=False,
                                    w_init_gain='tanh')
     self.v = LinearNorm(attention_dim, 1, bias=False)
     self.score_mask_value = -float("inf")
Example #16
    def __init__(self, hparams):
        super(Residual_Encoder, self).__init__()

        self.z_r = LinearNorm(hparams.reference_dim, hparams.z_residual_dim)

        self.n_mels = hparams.n_mel_channels
        self.batchs = hparams.batch_size
        self.z_r_dim = hparams.z_residual_dim

        convolutions = []
        for i in range(hparams.n_convolutions):
            conv_layer = nn.Sequential(
                ConvNorm2d(hparams.conv_dim_in[i],
                           hparams.conv_dim_out[i],
                           kernel_size=3, stride=2,
                           padding=1,
                           dilation=1, w_init_gain='relu'),
                nn.BatchNorm2d(hparams.conv_dim_out[i]))
            convolutions.append(conv_layer)
        self.convolutions = nn.ModuleList(convolutions)
        out_channels = self.calculate_channels(hparams.n_mel_channels, 3, 2, 1, 2)
        self.ceil_n_mel_64 = int(ceil(hparams.n_mel_channels / 64))
        self.lstm = nn.LSTM(hparams.reference_dim*self.ceil_n_mel_64, hparams.reference_dim//2, 1, batch_first=True, bidirectional=True)

        for name, param in self.lstm.named_parameters():
            if 'weight' in name:
                torch.nn.init.xavier_uniform_(param)
Example #17
    def __init__(self, hparams):
        super(NeuralConcatenativeSpeechSynthesis, self).__init__()
        self.n_mel_channels = hparams.n_mel_channels
        self.audio_prenet = Prenet(hparams.n_mel_channels,
                                   [hparams.prenet_dim, hparams.prenet_dim],
                                   hparams)
        self.target_audio_prenet = TargetPrenet(
            hparams.n_mel_channels, [hparams.prenet_dim, hparams.prenet_dim])
        self.text_prenet = ConvNorm(hparams.symbols_embedding_dim,
                                    hparams.symbols_embedding_dim,
                                    kernel_size=hparams.decoder_kernel_size,
                                    stride=hparams.text_stride)

        self.embedding = nn.Embedding(hparams.n_symbols,
                                      hparams.symbols_embedding_dim)
        # Text to audio seq2seq(alignment 1 module)
        self.glued_mel_encoder = AudioEncoder(hparams.prenet_dim,
                                              hparams.encoder_rnn_dim)
        self.glued_text_decoder = AttentionDecoder(
            hparams.symbols_embedding_dim, hparams.decoder_rnn_dim,
            hparams.encoder_rnn_dim)
        # Text to text seq2seq(Pseudo alignment 2)
        self.target_text_decoder = AttentionDecoder(
            hparams.symbols_embedding_dim, hparams.decoder_rnn_dim,
            hparams.decoder_rnn_dim)
        # Decoder
        self.decoder = RecurrentDecoder(hparams.prenet_dim,
                                        hparams.mel_decoder_rnn_dim,
                                        hparams.prenet_dim,
                                        hparams.n_mel_channels, hparams)
        self.postnet = LinearNorm(hparams.prenet_dim, hparams.n_mel_channels)
Example #18
 def __init__(self, in_dim, sizes):
     super(Prenet, self).__init__()
     in_sizes = [in_dim] + sizes[:-1]
     self.layers = nn.ModuleList([
         LinearNorm(in_size, out_size, bias=False)
         for (in_size, out_size) in zip(in_sizes, sizes)
     ])
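This is the Tacotron 2 Prenet. In the reference implementation these linear layers are followed by ReLU and dropout that stays active even at inference time; a method sketch assuming the usual NVIDIA-style forward:

import torch.nn.functional as F

def forward(self, x):
    for linear in self.layers:
        # training=True keeps dropout on at inference time, as in Tacotron 2
        x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
    return x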
Example #19
    def __init__(self, hparams):
        super(Decoder, self).__init__()
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        self.encoder_embedding_dim = hparams.encoder_embedding_dim
        self.attention_rnn_dim = hparams.attention_rnn_dim
        self.decoder_rnn_dim = hparams.decoder_rnn_dim
        self.prenet_dim = hparams.prenet_dim
        self.max_decoder_steps = hparams.max_decoder_steps
        self.gate_threshold = hparams.gate_threshold
        self.p_attention_dropout = hparams.p_attention_dropout
        self.p_decoder_dropout = hparams.p_decoder_dropout

        self.prenet = Prenet(
            hparams.n_mel_channels * hparams.n_frames_per_step,
            [hparams.prenet_dim, hparams.prenet_dim])


        self.attention_rnn = nn.LSTMCell(
            hparams.prenet_dim + hparams.encoder_embedding_dim,
            hparams.attention_rnn_dim)

        self.attention_layer = Attention(
            hparams.attention_rnn_dim, hparams.encoder_embedding_dim,
            hparams.attention_dim, hparams.attention_location_n_filters,
            hparams.attention_location_kernel_size)

        self.decoder_rnn = nn.LSTMCell(
            hparams.attention_rnn_dim + hparams.encoder_embedding_dim,
            hparams.decoder_rnn_dim, 1)

        self.linear_projection = LinearNorm(
            hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
            hparams.n_mel_channels * hparams.n_frames_per_step)

        self.gate_layer = LinearNorm(
            hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
            bias=True, w_init_gain='sigmoid')

        # Transformer TTS
        self.norm = LinearNorm(hparams.prenet_dim, hparams.prenet_dim)
        self.pos_emb = nn.Embedding.from_pretrained(
            get_sinusoid_encoding_table(1024, hparams.prenet_dim, padding_idx=0),
            freeze=True)
        self.alpha = nn.Parameter(torch.ones(1))
        self.pos_dropout = nn.Dropout(p=0.1)
        self.pos_linear = Linear(hparams.prenet_dim, hparams.prenet_dim)
Example #20
    def __init__(self, hparams):
        super(Decoder, self).__init__()
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        self.encoder_embedding_dim = hparams.encoder_embedding_dim
        self.attention_rnn_dim = hparams.attention_rnn_dim
        self.decoder_rnn_dim = hparams.decoder_rnn_dim
        self.prenet_dim = hparams.prenet_dim
        self.max_decoder_steps = hparams.max_decoder_steps
        self.gate_threshold = hparams.gate_threshold
        self.p_attention_dropout = hparams.p_attention_dropout
        self.p_decoder_dropout = hparams.p_decoder_dropout

        self.prenet = Prenet(
            hparams.n_mel_channels * hparams.n_frames_per_step,  # 80*n
            [hparams.prenet_dim, hparams.prenet_dim])  # [256,256]

        # 256+512->1024
        self.attention_rnn = nn.LSTMCell(
            hparams.prenet_dim + hparams.encoder_embedding_dim,  # 256+512
            hparams.attention_rnn_dim)  # 1024

        self.attention_layer = Attention(
            hparams.attention_rnn_dim,  # 1024
            hparams.encoder_embedding_dim,  # 512
            hparams.attention_dim,  # 128
            hparams.attention_location_n_filters,  # 32
            hparams.attention_location_kernel_size)  # 31
        # 1024+512->1024
        self.decoder_rnn = nn.LSTMCell(
            hparams.attention_rnn_dim +
            hparams.encoder_embedding_dim,  # 1024+512
            hparams.decoder_rnn_dim,
            1)  # 1024 (single layer)
        # 1024+512->n*mels
        self.linear_projection = LinearNorm(
            hparams.decoder_rnn_dim +
            hparams.encoder_embedding_dim,  # 1024+512
            hparams.n_mel_channels * hparams.n_frames_per_step)  # n*mels
        # 1024+512->1
        self.gate_layer = LinearNorm(hparams.decoder_rnn_dim +
                                     hparams.encoder_embedding_dim,
                                     1,
                                     bias=True,
                                     w_init_gain='sigmoid')
Example #21
File: model.py Project: ishine/UniNet
    def __init__(self, hparams):
        super(Decoder, self).__init__()
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        self.encoder_embedding_dim = hparams.encoder_embedding_dim
        self.frame_level_rnn_dim = hparams.frame_level_rnn_dim
        self.phone_level_rnn_dim = hparams.phone_level_rnn_dim
        self.join_model_dim = hparams.frame_level_rnn_dim
        self.decoder_rnn_dim = hparams.decoder_rnn_dim
        self.prenet_dim = hparams.prenet_dim
        self.p_decoder_dropout = hparams.p_decoder_dropout
        self.decoder_training_mode = hparams.decoder_training_mode
        if self.decoder_training_mode == 'random annealing':
            self.annealing = annealing(0.9999)

        self.prenet = Prenet(
            hparams.n_mel_channels * hparams.n_frames_per_step,
            [hparams.prenet_dim, hparams.prenet_dim])

        self.frame_level_rnn = nn.LSTMCell(hparams.prenet_dim,
                                           hparams.frame_level_rnn_dim)

        self.self_attention = vectorBased_selfAttention(
            hparams.frame_level_rnn_dim, hparams.self_attention_dim)

        self.text_attention_layer = SimpleAttention(
            hparams.frame_level_rnn_dim, hparams.encoder_embedding_dim,
            hparams.attention_dim)

        self.phone_level_rnn = nn.LSTMCell(hparams.frame_level_rnn_dim,
                                           hparams.phone_level_rnn_dim)

        self.join_model_layer = JoinModol(hparams.encoder_embedding_dim,
                                          hparams.phone_level_rnn_dim,
                                          self.join_model_dim,
                                          hparams.join_model_hidden_dim)

        self.decoder_rnn = nn.LSTMCell(
            hparams.frame_level_rnn_dim + self.join_model_dim,
            hparams.decoder_rnn_dim)

        self.linear_projection = LinearNorm(
            hparams.decoder_rnn_dim + self.join_model_dim,
            hparams.n_mel_channels * hparams.n_frames_per_step)

        self.attention_layer = Attention(
            hparams.decoder_rnn_dim + self.join_model_dim,
            hparams.encoder_embedding_dim, hparams.attention_dim,
            hparams.attention_location_n_filters,
            hparams.attention_location_kernel_size)

        self.max_decoder_steps = hparams.max_decoder_steps

        if hparams.more_information:
            self.more_information = True
        else:
            self.more_information = False
Example #22
 def __init__(self, attention_n_filters, attention_kernel_size,
              attention_dim):
     super(LocationLayer, self).__init__()
     padding = int((attention_kernel_size - 1) / 2)
     self.location_conv = ConvNorm(2, attention_n_filters,
                                   kernel_size=attention_kernel_size,
                                   padding=padding, bias=False)
     self.location_dense = LinearNorm(attention_n_filters, attention_dim,
                                      bias=False, w_init_gain='tanh')
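LocationLayer convolves the stacked (previous, cumulative) attention weights and projects them into the attention space. Its forward pass, as it typically appears alongside this constructor, is sketched here:

def forward(self, attention_weights_cat):
    # attention_weights_cat: (B, 2, T_in)
    processed_attention = self.location_conv(attention_weights_cat)  # (B, n_filters, T_in)
    processed_attention = processed_attention.transpose(1, 2)        # (B, T_in, n_filters)
    processed_attention = self.location_dense(processed_attention)   # (B, T_in, attention_dim)
    return processed_attention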
Example #23
 def __init__(self, hparams):
     super().__init__()
     self.dropout = nn.Dropout(hparams.ref_attention_dropout)
     self.d_q = hparams.encoder_embedding_dim
     self.d_k = hparams.prosody_embedding_dim
     self.linears = nn.ModuleList([
         LinearNorm(in_dim, hparams.ref_attention_dim, bias=False, w_init_gain='tanh') \
             for in_dim in (self.d_q, self.d_k)
     ])
     self.score_mask_value = 1e-9
Example #24
File: model.py Project: CookiePPP/codedump
    def __init__(self, num_mixtures, attention_layers, attention_rnn_dim,
                 embedding_dim, attention_dim, attention_location_n_filters,
                 attention_location_kernel_size, hparams):
        super(GMMAttention, self).__init__()
        self.num_mixtures = num_mixtures
        self.normalize_attention_input = hparams.normalize_attention_input
        self.delta_min_limit = hparams.delta_min_limit
        self.delta_offset = hparams.delta_offset
        self.lin_bias = hparams.lin_bias
        self.initial_gain = hparams.initial_gain
        lin = nn.Linear(attention_dim, 3 * num_mixtures, bias=self.lin_bias)
        lin.weight.data.mul_(0.01)
        if self.lin_bias:
            lin.bias.data.mul_(0.008)
            lin.bias.data.sub_(2.0)

        if attention_layers == 1:
            self.F = nn.Sequential(
                LinearNorm(attention_rnn_dim,
                           attention_dim,
                           bias=True,
                           w_init_gain=self.initial_gain), nn.Tanh(), lin)
        elif attention_layers == 2:
            self.F = nn.Sequential(
                LinearNorm(attention_rnn_dim,
                           attention_dim,
                           bias=True,
                           w_init_gain=self.initial_gain),
                LinearNorm(attention_dim,
                           attention_dim,
                           bias=False,
                           w_init_gain='tanh'), nn.Tanh(), lin)
        else:
            raise ValueError(
                "attention_layers invalid, valid values are 1 or 2; "
                f"got {attention_layers}")

        self.score_mask_value = 0  # -float("inf")
        self.register_buffer(
            'pos',
            torch.arange(0, 2000, dtype=torch.float).view(1, -1, 1).data)
Example #25
    def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
                 attention_location_n_filters, attention_location_kernel_size):
        super(Attention, self).__init__()
        # Conventional attention linearly transforms the query and keys, then computes v^T * tanh(W * query + V * keys)
        # query_layer and memory_layer produce W * query and V * keys, respectively
        # w_init_gain='tanh' is used because both projections sit inside the tanh(W * query + V * keys) nonlinearity
        self.query_layer = LinearNorm(attention_rnn_dim,
                                      attention_dim,
                                      bias=False,
                                      w_init_gain='tanh')
        self.memory_layer = LinearNorm(embedding_dim,
                                       attention_dim,
                                       bias=False,
                                       w_init_gain='tanh')
        # In addition to the conventional parameters, this attention also convolves the attention weights (location layer)
        self.location_layer = LocationLayer(attention_location_n_filters,
                                            attention_location_kernel_size,
                                            attention_dim)
        self.v = LinearNorm(attention_dim, 1, bias=False)

        self.score_mask_value = -float("inf")
Example #26
    def __init__(self, config):
        super(Decoder, self).__init__()
        self.n_mel_channels = config["n_mel_channels"]
        self.n_frames_per_step = config["n_frames_per_step"]
        self.encoder_embedding_dim = config["encoder_embedding_dim"]
        self.attention_rnn_dim = config["attention_rnn_dim"]
        self.decoder_rnn_dim = config["decoder_rnn_dim"]
        self.prenet_dim = config["prenet_dim"]
        self.max_decoder_steps = config["max_decoder_steps"]
        self.gate_threshold = config["gate_threshold"]
        self.p_attention_dropout = config["p_attention_dropout"]
        self.p_decoder_dropout = config["p_decoder_dropout"]

        self.prenet = Prenet(
            config["n_mel_channels"] * config["n_frames_per_step"],
            [config["prenet_dim"], config["prenet_dim"]])

        self.attention_rnn = nn.LSTMCell(
            config["prenet_dim"] + config["encoder_embedding_dim"],
            config["attention_rnn_dim"])

        self.attention_layer = Attention(
            config["attention_rnn_dim"], config["encoder_embedding_dim"],
            config["attention_dim"], config["attention_location_n_filters"],
            config["attention_location_kernel_size"])

        self.decoder_rnn = nn.LSTMCell(
            config["attention_rnn_dim"] + config["encoder_embedding_dim"],
            config["decoder_rnn_dim"], 1)

        self.linear_projection = LinearNorm(
            config["decoder_rnn_dim"] + config["encoder_embedding_dim"],
            config["n_mel_channels"] * config["n_frames_per_step"])

        self.gate_layer = LinearNorm(config["decoder_rnn_dim"] +
                                     config["encoder_embedding_dim"],
                                     1,
                                     bias=True,
                                     w_init_gain='sigmoid')
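Example #26 is the stock Tacotron 2 decoder. For context, here is a condensed sketch of how one decode step typically wires these submodules together in NVIDIA-style implementations; it assumes the recurrent states, attention weights, memory, and mask were initialized elsewhere, and dropout is omitted.

def decode(self, decoder_input):
    # decoder_input: prenet output for the previous mel frame(s)
    cell_input = torch.cat((decoder_input, self.attention_context), -1)
    self.attention_hidden, self.attention_cell = self.attention_rnn(
        cell_input, (self.attention_hidden, self.attention_cell))

    attention_weights_cat = torch.cat(
        (self.attention_weights.unsqueeze(1),
         self.attention_weights_cum.unsqueeze(1)), dim=1)
    self.attention_context, self.attention_weights = self.attention_layer(
        self.attention_hidden, self.memory, self.processed_memory,
        attention_weights_cat, self.mask)
    self.attention_weights_cum += self.attention_weights

    decoder_rnn_input = torch.cat((self.attention_hidden, self.attention_context), -1)
    self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
        decoder_rnn_input, (self.decoder_hidden, self.decoder_cell))

    hidden_and_context = torch.cat((self.decoder_hidden, self.attention_context), dim=1)
    decoder_output = self.linear_projection(hidden_and_context)   # next mel frame(s)
    gate_prediction = self.gate_layer(hidden_and_context)         # stop-token logit
    return decoder_output, gate_prediction, self.attention_weights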
Example #27
File: model.py Project: zge/tacotron2-vae
 def __init__(self, hparams):
     super(Tacotron2, self).__init__()
     self.mask_padding = hparams.mask_padding
     self.fp16_run = hparams.fp16_run
     self.use_vae = hparams.use_vae
     self.embedding_variation = hparams.embedding_variation
     self.label_type = hparams.label_type
     self.n_mel_channels = hparams.n_mel_channels
     self.n_frames_per_step = hparams.n_frames_per_step
     self.symbols_embedding_dim = hparams.symbols_embedding_dim
     self.speaker_embedding_dim = hparams.speaker_embedding_dim
     self.emotion_embedding_dim = hparams.emotion_embedding_dim
     self.transcript_embedding = nn.Embedding(hparams.n_symbols,
                                              hparams.symbols_embedding_dim)
     if self.use_vae:
         if self.label_type == 'one-hot':
             self.speaker_embedding = LinearNorm(
                 hparams.n_speakers,
                 hparams.speaker_embedding_dim,
                 bias=True,
                 w_init_gain='tanh')
             self.emotion_embedding = LinearNorm(
                 hparams.n_emotions,
                 hparams.emotion_embedding_dim,
                 bias=True,
                 w_init_gain='tanh')
         elif self.label_type == 'id':
             self.speaker_embedding = nn.Embedding(
                 hparams.n_speakers, hparams.speaker_embedding_dim)
             self.emotion_embedding = nn.Embedding(
                 hparams.n_emotions, hparams.emotion_embedding_dim)
     self.vae_input_type = hparams.vae_input_type
     std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
     val = sqrt(3.0) * std  # uniform bounds for std
     self.transcript_embedding.weight.data.uniform_(-val, val)
     self.encoder = Encoder(hparams)
     self.decoder = Decoder(hparams)
     self.postnet = Postnet(hparams)
     self.vae_gst = VAE_GST(hparams)
Example #28
 def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
              attention_location_n_filters, attention_location_kernel_size):
     super(Attention, self).__init__()
     self.query_layer = LinearNorm(attention_rnn_dim,
                                   attention_dim,
                                   bias=False,
                                   w_init_gain='tanh')
     # if hparams.style == 'speaker_encoder':
     #     embedding_dim += 256
     # elif hparams.style == 'style_embedding':
     #     embedding_dim += 128
     # elif hparams.style == 'both':
     #     embedding_dim += 256 + 128
     self.memory_layer = LinearNorm(embedding_dim,
                                    attention_dim,
                                    bias=False,
                                    w_init_gain='tanh')
     self.v = LinearNorm(attention_dim, 1, bias=False)
     self.location_layer = LocationLayer(attention_location_n_filters,
                                         attention_location_kernel_size,
                                         attention_dim)
     self.score_mask_value = -float("inf")
Example #29
    def __init__(self, in_dim, sizes, hparams):
        super(Prenet, self).__init__()
        in_sizes = [in_dim] + sizes[:-1]
        self.layers = nn.ModuleList([
            LinearNorm(in_size, out_size, bias=False)
            for (in_size, out_size) in zip(in_sizes, sizes)
        ])

        self.convolutions = nn.Sequential(
            ConvNorm(hparams.prenet_dim,
                     hparams.prenet_dim,
                     kernel_size=hparams.audio_kernel_size,
                     stride=hparams.audio_stride,
                     w_init_gain='relu'), nn.BatchNorm1d(hparams.prenet_dim))
Example #30
File: model.py Project: CookiePPP/codedump
    def __init__(self, in_dim, sizes, p_prenet_dropout, prenet_batchnorm):
        super(Prenet, self).__init__()
        in_sizes = [in_dim] + sizes[:-1]
        self.layers = nn.ModuleList([
            LinearNorm(in_size, out_size, bias=False)
            for (in_size, out_size) in zip(in_sizes, sizes)
        ])
        self.p_prenet_dropout = p_prenet_dropout
        self.prenet_batchnorm = prenet_batchnorm
        self.p_prenet_input_dropout = 0

        if self.prenet_batchnorm:
            self.batchnorms = nn.ModuleList(
                [nn.BatchNorm1d(size) for size in sizes])
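This variant adds optional batch normalization and configurable dropout to the standard Prenet. The forward pass below is a hypothetical illustration of how those switches could be applied, not the CookiePPP/codedump code; it assumes the input is a 2D (batch, features) tensor so BatchNorm1d can be applied directly.

import torch.nn.functional as F

def forward(self, x):
    # x: (B, in_dim); optionally drop raw inputs first
    if self.p_prenet_input_dropout:
        x = F.dropout(x, p=self.p_prenet_input_dropout, training=True)
    for i, linear in enumerate(self.layers):
        x = F.relu(linear(x))
        if self.p_prenet_dropout > 0:
            x = F.dropout(x, p=self.p_prenet_dropout, training=True)
        if self.prenet_batchnorm:
            x = self.batchnorms[i](x)
    return x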