Example #1
    def __init__(self, hparams):
        super(Decoder, self).__init__()
        self.n_acoustic_feat_dims = hparams.n_acoustic_feat_dims
        self.encoder_embedding_dim = hparams.encoder_embedding_dim
        self.attention_rnn_dim = hparams.attention_rnn_dim
        self.decoder_rnn_dim = hparams.decoder_rnn_dim
        self.prenet_dim = hparams.prenet_dim
        self.max_decoder_steps = hparams.max_decoder_steps
        self.gate_threshold = hparams.gate_threshold
        self.p_attention_dropout = hparams.p_attention_dropout
        self.p_decoder_dropout = hparams.p_decoder_dropout
        self.attention_window_size = hparams.attention_window_size
        self.prenet = Prenet(hparams.n_acoustic_feat_dims,
                             [hparams.prenet_dim, hparams.prenet_dim])

        self.attention_rnn = nn.LSTMCell(
            hparams.prenet_dim + hparams.encoder_embedding_dim,
            hparams.attention_rnn_dim)

        self.attention_layer = Attention(
            hparams.attention_rnn_dim, hparams.encoder_embedding_dim,
            hparams.attention_dim, hparams.attention_location_n_filters,
            hparams.attention_location_kernel_size)

        self.decoder_rnn = nn.LSTMCell(
            hparams.attention_rnn_dim + hparams.encoder_embedding_dim,
            hparams.decoder_rnn_dim)  # note: LSTMCell has no num_layers argument; a trailing 1 here would be silently taken as bias

        self.linear_projection = LinearNorm(
            hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
            hparams.n_acoustic_feat_dims)

        self.gate_layer = LinearNorm(
            hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
            bias=True, w_init_gain='sigmoid')
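
All of these snippets lean on a LinearNorm helper that none of them define. Below is a minimal sketch, assuming the usual Tacotron 2 formulation (a plain nn.Linear whose weight gets Xavier-uniform initialization with a gain picked by w_init_gain); the signature is inferred from the calls above, not taken from the source:

from torch import nn

class LinearNorm(nn.Module):
    # nn.Linear plus Xavier-uniform init; the gain follows w_init_gain.
    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super(LinearNorm, self).__init__()
        self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias)
        nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        return self.linear_layer(x)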
Example #2
 def __init__(self, vocab_size, decoder_dim, hidden_size, dropout=0.5):
     super(MIEsitmator, self).__init__()
     self.proj = nn.Sequential(
         LinearNorm(decoder_dim, hidden_size, bias=True,
                    w_init_gain='relu'), nn.ReLU(), nn.Dropout(p=dropout))
     self.ctc_proj = LinearNorm(hidden_size, vocab_size + 1, bias=True)
     self.ctc = nn.CTCLoss(blank=vocab_size, reduction='none')
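
A hedged usage sketch for this estimator (the shapes and the length normalization are assumptions, not taken from the source): nn.CTCLoss expects log-probabilities of shape (T, N, C), so the projected decoder states are log-softmaxed and transposed before the loss call.

import torch.nn.functional as F

# Hypothetical shapes: decoder_outputs is (N, T, decoder_dim),
# text_targets is (N, S) with entries in [0, vocab_size).
def mi_ctc_loss(estimator, decoder_outputs, text_targets,
                output_lengths, text_lengths):
    h = estimator.proj(decoder_outputs)            # (N, T, hidden_size)
    log_probs = F.log_softmax(estimator.ctc_proj(h), dim=-1)
    log_probs = log_probs.transpose(0, 1)          # CTC wants (T, N, C)
    losses = estimator.ctc(log_probs, text_targets,
                           output_lengths, text_lengths)  # (N,) with reduction='none'
    return (losses / output_lengths.float()).mean()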
Example #3
    def __init__(self, n_mel_channels, n_frames_per_step,
                 encoder_embedding_dim, attention_dim,
                 attention_location_n_filters, attention_location_kernel_size,
                 prenet_dim, decoder_rnn_dim, max_decoder_steps,
                 gate_threshold, decoder_n_lstms, p_decoder_dropout):
        super(Decoder, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.n_frames_per_step = n_frames_per_step
        self.encoder_embedding_dim = encoder_embedding_dim
        self.decoder_rnn_dim = decoder_rnn_dim
        self.prenet_dim = prenet_dim
        self.max_decoder_steps = max_decoder_steps
        self.gate_threshold = gate_threshold
        self.decoder_n_lstms = decoder_n_lstms
        self.p_decoder_dropout = p_decoder_dropout

        self.prenet = Prenet(n_mel_channels, [prenet_dim, prenet_dim])

        self.lstm0 = nn.LSTMCell(prenet_dim + encoder_embedding_dim,
                                 decoder_rnn_dim)
        self.lstm1 = nn.LSTMCell(decoder_rnn_dim + encoder_embedding_dim,
                                 decoder_rnn_dim)

        self.attention_layer = Attention(decoder_rnn_dim,
                                         encoder_embedding_dim, attention_dim,
                                         attention_location_n_filters,
                                         attention_location_kernel_size)

        self.linear_projection = LinearNorm(
            decoder_rnn_dim + encoder_embedding_dim,
            n_mel_channels * n_frames_per_step)

        self.gate_layer = LinearNorm(decoder_rnn_dim + encoder_embedding_dim,
                                     n_frames_per_step,
                                     w_init_gain='sigmoid')
Example #4
 def __init__(self, query_dim, memory_dim, attention_dim,
              attention_location_n_filters, attention_location_kernel_size):
     super(Attention, self).__init__()
     self.query_layer = LinearNorm(query_dim, attention_dim, w_init_gain='tanh')
     self.memory_layer = LinearNorm(memory_dim, attention_dim, w_init_gain='tanh')
     self.v = LinearNorm(attention_dim, 1)
     self.location_layer = LocationLayer(attention_location_n_filters,
                                         attention_location_kernel_size,
                                         attention_dim)
     self.score_mask_value = -float("inf")
Example #5
 def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
              attention_location_n_filters, attention_location_kernel_size):
     super(Attention, self).__init__()
     self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
                                   bias=False, w_init_gain='tanh')
     self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False,
                                    w_init_gain='tanh')
     self.v = LinearNorm(attention_dim, 1, bias=False)
     self.location_layer = LocationLayer(attention_location_n_filters,
                                         attention_location_kernel_size,
                                         attention_dim)
     self.score_mask_value = -float("inf")
     self.not_so_small_mask = -1000
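
Examples #4 and #5 only show the constructor. The score they set up is the standard location-sensitive additive energy; here is a minimal sketch of that computation, assuming the usual shapes (query (B, attention_rnn_dim), processed_memory (B, T, attention_dim), attention_weights_cat (B, 2, T)):

import torch

def get_alignment_energies(attn, query, processed_memory,
                           attention_weights_cat):
    # (B, attention_rnn_dim) -> (B, 1, attention_dim)
    processed_query = attn.query_layer(query.unsqueeze(1))
    # (B, 2, T) -> (B, T, attention_dim)
    processed_location = attn.location_layer(attention_weights_cat)
    # Additive (Bahdanau-style) energy with a location term: (B, T)
    energies = attn.v(torch.tanh(
        processed_query + processed_location + processed_memory))
    return energies.squeeze(-1)

Masked positions would then be filled with score_mask_value (or the softer not_so_small_mask from Example #5) before the softmax.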
Example #6
    def __init__(self, n_mel_channels, n_frames_per_step,
                 encoder_embedding_dim, attention_dim,
                 attention_location_n_filters, attention_location_kernel_size,
                 attention_rnn_dim, decoder_rnn_dim, prenet_dim,
                 max_decoder_steps, gate_threshold, p_attention_dropout,
                 p_decoder_dropout, early_stopping):
        super(Decoder, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.n_frames_per_step = n_frames_per_step
        self.encoder_embedding_dim = encoder_embedding_dim
        self.attention_rnn_dim = attention_rnn_dim
        self.decoder_rnn_dim = decoder_rnn_dim
        self.prenet_dim = prenet_dim
        self.max_decoder_steps = max_decoder_steps
        self.gate_threshold = gate_threshold
        self.p_attention_dropout = p_attention_dropout
        self.p_decoder_dropout = p_decoder_dropout

        self.prenet = Prenet(n_mel_channels * n_frames_per_step,
                             [prenet_dim, prenet_dim])

        self.attention_rnn = nn.LSTMCell(prenet_dim + encoder_embedding_dim,
                                         attention_rnn_dim)

        self.attention_layer = Attention(attention_rnn_dim,
                                         encoder_embedding_dim, attention_dim,
                                         attention_location_n_filters,
                                         attention_location_kernel_size)

        self.decoder_rnn = nn.LSTMCell(
            attention_rnn_dim + encoder_embedding_dim,
            decoder_rnn_dim)  # LSTMCell takes (input_size, hidden_size, bias); there is no num_layers

        self.linear_projection = nn.Sequential(
            LinearNorm(decoder_rnn_dim + encoder_embedding_dim,
                       decoder_rnn_dim,
                       bias=True,
                       w_init_gain='relu'), nn.ReLU(), nn.Dropout(p=0.5))

        self.mel_layer = nn.Sequential(
            LinearNorm(decoder_rnn_dim,
                       decoder_rnn_dim,
                       bias=True,
                       w_init_gain='relu'), nn.ReLU(), nn.Dropout(p=0.5),
            LinearNorm(decoder_rnn_dim, n_mel_channels * n_frames_per_step))

        self.gate_layer = LinearNorm(decoder_rnn_dim,
                                     1,
                                     bias=True,
                                     w_init_gain='sigmoid')
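
Example #6 routes the output through three heads (linear_projection, mel_layer, gate_layer) instead of one. A sketch of how a single decode step would plausibly wire them together; the recurrent state fields and the Attention.forward signature are assumptions based on the usual Tacotron 2 layout:

import torch
import torch.nn.functional as F

def decode_step(dec, prenet_output, attention_context,
                attention_weights_cat, memory, processed_memory, mask):
    # Attention RNN consumes the prenet output plus the previous context.
    cell_input = torch.cat((prenet_output, attention_context), dim=-1)
    dec.attention_hidden, dec.attention_cell = dec.attention_rnn(
        cell_input, (dec.attention_hidden, dec.attention_cell))
    dec.attention_hidden = F.dropout(
        dec.attention_hidden, dec.p_attention_dropout, dec.training)

    # Location-sensitive attention yields a fresh context vector.
    attention_context, attention_weights = dec.attention_layer(
        dec.attention_hidden, memory, processed_memory,
        attention_weights_cat, mask)

    # Decoder RNN, then the three heads from the constructor above.
    decoder_input = torch.cat((dec.attention_hidden, attention_context), dim=-1)
    dec.decoder_hidden, dec.decoder_cell = dec.decoder_rnn(
        decoder_input, (dec.decoder_hidden, dec.decoder_cell))
    dec.decoder_hidden = F.dropout(
        dec.decoder_hidden, dec.p_decoder_dropout, dec.training)
    hidden = dec.linear_projection(
        torch.cat((dec.decoder_hidden, attention_context), dim=-1))
    mel_output = dec.mel_layer(hidden)
    gate_prediction = dec.gate_layer(hidden)
    return mel_output, gate_prediction, attention_context, attention_weights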
Example #7
 def __init__(self, in_dim, sizes):
     super(Prenet, self).__init__()
     in_sizes = [in_dim] + sizes[:-1]
     self.layers = nn.ModuleList([
         LinearNorm(in_size, out_size, bias=False)
         for (in_size, out_size) in zip(in_sizes, sizes)
     ])
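
The Prenet constructor above omits its forward pass. In the reference Tacotron 2, prenet dropout is deliberately kept active at inference time as well; a minimal sketch, assuming this code follows that convention:

import torch.nn.functional as F

def prenet_forward(self, x):
    for linear in self.layers:
        # training=True on purpose: the dropout stays on even at inference.
        x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
    return x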
Example #8
 def __init__(self, attention_n_filters, attention_kernel_size, attention_dim):
     super(LocationLayer, self).__init__()
     self.location_conv = ConvNorm(1, attention_n_filters,
                                   kernel_size=attention_kernel_size,
                                   padding=int((attention_kernel_size - 1) / 2),
                                   stride=1, dilation=1)
     self.location_dense = LinearNorm(attention_n_filters, attention_dim,
                                      bias=False, w_init_gain='tanh')
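
For completeness, the forward pass this constructor implies. Note that location_conv is built with a single input channel here, so this variant apparently processes only the current attention weights rather than the usual (current, cumulative) pair; the shapes below are assumptions:

def location_layer_forward(self, attention_weights):
    # attention_weights: (B, 1, T) -- one channel, matching ConvNorm(1, ...)
    processed = self.location_conv(attention_weights)   # (B, n_filters, T)
    processed = processed.transpose(1, 2)               # (B, T, n_filters)
    return self.location_dense(processed)               # (B, T, attention_dim)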
Example #9
    def __init__(self, n_mel_channels, n_frames_per_step,
                 encoder_embedding_dim, attention_dim,
                 attention_location_n_filters, attention_location_kernel_size,
                 attention_rnn_dim, decoder_rnn_dim, prenet_dim,
                 max_decoder_steps, gate_threshold, p_attention_dropout,
                 p_decoder_dropout, early_stopping, E, speaker_embedding_dim):
        super(Decoder, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.n_frames_per_step = n_frames_per_step
        self.encoder_embedding_dim = encoder_embedding_dim + E  # + speaker_embedding_dim  # add this term only when using speaker_id!
        self.attention_rnn_dim = attention_rnn_dim
        self.decoder_rnn_dim = decoder_rnn_dim
        self.prenet_dim = prenet_dim
        self.max_decoder_steps = max_decoder_steps
        self.gate_threshold = gate_threshold
        self.p_attention_dropout = p_attention_dropout
        self.p_decoder_dropout = p_decoder_dropout
        self.early_stopping = early_stopping

        self.prenet = Prenet(n_mel_channels * n_frames_per_step,
                             [prenet_dim, prenet_dim])

        self.attention_rnn = nn.LSTMCell(
            prenet_dim + self.encoder_embedding_dim, attention_rnn_dim)

        self.attention_layer = Attention(attention_rnn_dim,
                                         self.encoder_embedding_dim,
                                         attention_dim,
                                         attention_location_n_filters,
                                         attention_location_kernel_size)

        self.decoder_rnn = nn.LSTMCell(
            attention_rnn_dim + self.encoder_embedding_dim,
            decoder_rnn_dim)  # no num_layers argument on LSTMCell

        self.linear_projection = LinearNorm(
            decoder_rnn_dim + self.encoder_embedding_dim,
            n_mel_channels * n_frames_per_step)

        self.gate_layer = LinearNorm(decoder_rnn_dim +
                                     self.encoder_embedding_dim,
                                     1,
                                     bias=True,
                                     w_init_gain='sigmoid')
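
Example #9 widens encoder_embedding_dim by E (presumably a reference/GST embedding size, with the speaker embedding handled the same way per the comment). The memory the decoder attends over must be widened to match; a hypothetical preparation step:

import torch

def add_conditioning(encoder_outputs, style_embedding):
    # encoder_outputs: (B, T, encoder_embedding_dim)
    # style_embedding: (B, E), broadcast along the time axis before concat
    expanded = style_embedding.unsqueeze(1).expand(
        -1, encoder_outputs.size(1), -1)
    return torch.cat((encoder_outputs, expanded), dim=-1)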
Example #10
 def __init__(self, in_dim, sizes):
     """
     Args:
         in_dim ([type]): n_mel_channels * n_frames_per_step
         sizes ([type]): [prenet_dim, prenet_dim]
     """
     super(Prenet, self).__init__()
     in_sizes = [in_dim] + sizes[:-1]
     self.layers = nn.ModuleList([
         LinearNorm(in_size, out_size, bias=False)
         for (in_size, out_size) in zip(in_sizes, sizes)
     ])