def __init__(self, hparams):
    super(Decoder, self).__init__()
    self.n_acoustic_feat_dims = hparams.n_acoustic_feat_dims
    self.encoder_embedding_dim = hparams.encoder_embedding_dim
    self.attention_rnn_dim = hparams.attention_rnn_dim
    self.decoder_rnn_dim = hparams.decoder_rnn_dim
    self.prenet_dim = hparams.prenet_dim
    self.max_decoder_steps = hparams.max_decoder_steps
    self.gate_threshold = hparams.gate_threshold
    self.p_attention_dropout = hparams.p_attention_dropout
    self.p_decoder_dropout = hparams.p_decoder_dropout
    self.attention_window_size = hparams.attention_window_size

    self.prenet = Prenet(hparams.n_acoustic_feat_dims,
                         [hparams.prenet_dim, hparams.prenet_dim])
    self.attention_rnn = nn.LSTMCell(
        hparams.prenet_dim + hparams.encoder_embedding_dim,
        hparams.attention_rnn_dim)
    self.attention_layer = Attention(
        hparams.attention_rnn_dim, hparams.encoder_embedding_dim,
        hparams.attention_dim, hparams.attention_location_n_filters,
        hparams.attention_location_kernel_size)
    # nn.LSTMCell has no num_layers parameter; its third argument is bias
    self.decoder_rnn = nn.LSTMCell(
        hparams.attention_rnn_dim + hparams.encoder_embedding_dim,
        hparams.decoder_rnn_dim, bias=True)
    self.linear_projection = LinearNorm(
        hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
        hparams.n_acoustic_feat_dims)
    self.gate_layer = LinearNorm(
        hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
        bias=True, w_init_gain='sigmoid')

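# A hedged sketch of how `attention_window_size` is typically used at
# inference time: attention energies outside a window around the previous
# alignment peak are masked before the softmax, which keeps the alignment
# roughly monotonic. The function name and arguments below are
# illustrative, not taken from this source; assumes `import torch`.
def apply_attention_window(energies, prev_peak, window_size,
                           mask_value=-float("inf")):
    # energies: (B, T_enc) raw attention scores, modified in place
    T_enc = energies.size(1)
    for b in range(energies.size(0)):
        center = int(prev_peak[b])
        lo = max(0, center - window_size)
        hi = min(T_enc, center + window_size + 1)
        energies[b, :lo] = mask_value
        energies[b, hi:] = mask_value
    return energies
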
def __init__(self, vocab_size, decoder_dim, hidden_size, dropout=0.5):
    super(MIEsitmator, self).__init__()
    self.proj = nn.Sequential(
        LinearNorm(decoder_dim, hidden_size, bias=True, w_init_gain='relu'),
        nn.ReLU(),
        nn.Dropout(p=dropout))
    # one extra output class for the CTC blank symbol, indexed as vocab_size
    self.ctc_proj = LinearNorm(hidden_size, vocab_size + 1, bias=True)
    self.ctc = nn.CTCLoss(blank=vocab_size, reduction='none')

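# A minimal sketch of how this estimator's forward pass might apply the
# CTC loss to decoder hidden states; the argument names and tensor shapes
# here are assumptions, not part of the source above.
def forward(self, decoder_outputs, target_phones, decoder_lengths,
            target_lengths):
    # decoder_outputs: (B, T_dec, decoder_dim) decoder hidden states
    out = self.proj(decoder_outputs)
    log_probs = self.ctc_proj(out).log_softmax(dim=2)
    # nn.CTCLoss expects (T, B, C) log-probabilities
    log_probs = log_probs.transpose(0, 1)
    ctc_loss = self.ctc(log_probs, target_phones,
                        decoder_lengths, target_lengths)
    # with reduction='none', normalize each utterance by its target
    # length before averaging over the batch
    ctc_loss = (ctc_loss / target_lengths.float()).mean()
    return ctc_loss
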
def __init__(self, n_mel_channels, n_frames_per_step, encoder_embedding_dim,
             attention_dim, attention_location_n_filters,
             attention_location_kernel_size, prenet_dim, decoder_rnn_dim,
             max_decoder_steps, gate_threshold, decoder_n_lstms,
             p_decoder_dropout):
    super(Decoder, self).__init__()
    self.n_mel_channels = n_mel_channels
    self.n_frames_per_step = n_frames_per_step
    self.encoder_embedding_dim = encoder_embedding_dim
    self.decoder_rnn_dim = decoder_rnn_dim
    self.prenet_dim = prenet_dim
    self.max_decoder_steps = max_decoder_steps
    self.gate_threshold = gate_threshold
    self.decoder_n_lstms = decoder_n_lstms
    self.p_decoder_dropout = p_decoder_dropout

    self.prenet = Prenet(n_mel_channels, [prenet_dim, prenet_dim])
    self.lstm0 = nn.LSTMCell(prenet_dim + encoder_embedding_dim,
                             decoder_rnn_dim)
    self.lstm1 = nn.LSTMCell(decoder_rnn_dim + encoder_embedding_dim,
                             decoder_rnn_dim)
    self.attention_layer = Attention(decoder_rnn_dim, encoder_embedding_dim,
                                     attention_dim,
                                     attention_location_n_filters,
                                     attention_location_kernel_size)
    self.linear_projection = LinearNorm(
        decoder_rnn_dim + encoder_embedding_dim,
        n_mel_channels * n_frames_per_step)
    self.gate_layer = LinearNorm(decoder_rnn_dim + encoder_embedding_dim,
                                 n_frames_per_step, w_init_gain='sigmoid')

def __init__(self, query_dim, memory_dim, attention_dim,
             attention_location_n_filters, attention_location_kernel_size):
    super(Attention, self).__init__()
    self.query_layer = LinearNorm(query_dim, attention_dim,
                                  w_init_gain='tanh')
    self.memory_layer = LinearNorm(memory_dim, attention_dim,
                                   w_init_gain='tanh')
    self.v = LinearNorm(attention_dim, 1)
    self.location_layer = LocationLayer(attention_location_n_filters,
                                        attention_location_kernel_size,
                                        attention_dim)
    self.score_mask_value = -float("inf")

def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
             attention_location_n_filters, attention_location_kernel_size):
    super(Attention, self).__init__()
    self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
                                  bias=False, w_init_gain='tanh')
    self.memory_layer = LinearNorm(embedding_dim, attention_dim,
                                   bias=False, w_init_gain='tanh')
    self.v = LinearNorm(attention_dim, 1, bias=False)
    self.location_layer = LocationLayer(attention_location_n_filters,
                                        attention_location_kernel_size,
                                        attention_dim)
    self.score_mask_value = -float("inf")
    # finite alternative to -inf: large enough to vanish after softmax,
    # but avoids NaNs if every position in a row ends up masked
    self.not_so_small_mask = -1000

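# A sketch of the location-sensitive score these layers support, in the
# usual Tacotron 2 arrangement; the method and argument names below are
# illustrative rather than taken from this source, and `import torch` is
# assumed.
def get_alignment_energies(self, query, processed_memory,
                           attention_weights_cat):
    # query: (B, query_dim) current attention-RNN state
    # processed_memory: (B, T_enc, attention_dim), memory_layer applied
    #     once per utterance rather than once per step
    # attention_weights_cat: (B, C, T_enc) stacked attention weights
    processed_query = self.query_layer(query.unsqueeze(1))  # (B, 1, attn)
    processed_location = self.location_layer(attention_weights_cat)
    energies = self.v(torch.tanh(
        processed_query + processed_location + processed_memory))
    return energies.squeeze(-1)  # (B, T_enc)
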
def __init__(self, n_mel_channels, n_frames_per_step, encoder_embedding_dim,
             attention_dim, attention_location_n_filters,
             attention_location_kernel_size, attention_rnn_dim,
             decoder_rnn_dim, prenet_dim, max_decoder_steps, gate_threshold,
             p_attention_dropout, p_decoder_dropout, early_stopping):
    super(Decoder, self).__init__()
    self.n_mel_channels = n_mel_channels
    self.n_frames_per_step = n_frames_per_step
    self.encoder_embedding_dim = encoder_embedding_dim
    self.attention_rnn_dim = attention_rnn_dim
    self.decoder_rnn_dim = decoder_rnn_dim
    self.prenet_dim = prenet_dim
    self.max_decoder_steps = max_decoder_steps
    self.gate_threshold = gate_threshold
    self.p_attention_dropout = p_attention_dropout
    self.p_decoder_dropout = p_decoder_dropout
    self.early_stopping = early_stopping

    self.prenet = Prenet(n_mel_channels * n_frames_per_step,
                         [prenet_dim, prenet_dim])
    self.attention_rnn = nn.LSTMCell(prenet_dim + encoder_embedding_dim,
                                     attention_rnn_dim)
    self.attention_layer = Attention(attention_rnn_dim,
                                     encoder_embedding_dim, attention_dim,
                                     attention_location_n_filters,
                                     attention_location_kernel_size)
    # nn.LSTMCell has no num_layers parameter; its third argument is bias
    self.decoder_rnn = nn.LSTMCell(
        attention_rnn_dim + encoder_embedding_dim,
        decoder_rnn_dim, bias=True)
    self.linear_projection = nn.Sequential(
        LinearNorm(decoder_rnn_dim + encoder_embedding_dim, decoder_rnn_dim,
                   bias=True, w_init_gain='relu'),
        nn.ReLU(),
        nn.Dropout(p=0.5))
    self.mel_layer = nn.Sequential(
        LinearNorm(decoder_rnn_dim, decoder_rnn_dim,
                   bias=True, w_init_gain='relu'),
        nn.ReLU(),
        nn.Dropout(p=0.5),
        LinearNorm(decoder_rnn_dim, n_mel_channels * n_frames_per_step))
    self.gate_layer = LinearNorm(decoder_rnn_dim, 1,
                                 bias=True, w_init_gain='sigmoid')

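# A rough sketch of one decode step wired through the modules above.
# The state attributes (attention_hidden, attention_context, memory,
# processed_memory, mask, ...) are assumed to be initialized elsewhere,
# the Attention forward is assumed to follow the usual Tacotron 2
# signature, and `import torch; import torch.nn.functional as F` is
# assumed; none of this is defined in the source above.
def decode(self, decoder_input):
    cell_input = torch.cat((decoder_input, self.attention_context), -1)
    self.attention_hidden, self.attention_cell = self.attention_rnn(
        cell_input, (self.attention_hidden, self.attention_cell))
    self.attention_hidden = F.dropout(
        self.attention_hidden, self.p_attention_dropout, self.training)

    # stack current and cumulative alignments for the location layer
    attention_weights_cat = torch.cat(
        (self.attention_weights.unsqueeze(1),
         self.attention_weights_cum.unsqueeze(1)), dim=1)
    self.attention_context, self.attention_weights = self.attention_layer(
        self.attention_hidden, self.memory, self.processed_memory,
        attention_weights_cat, self.mask)
    self.attention_weights_cum += self.attention_weights

    decoder_input = torch.cat(
        (self.attention_hidden, self.attention_context), -1)
    self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
        decoder_input, (self.decoder_hidden, self.decoder_cell))
    self.decoder_hidden = F.dropout(
        self.decoder_hidden, self.p_decoder_dropout, self.training)

    # note the split in this variant: linear_projection produces a shared
    # hidden vector, mel_layer predicts frames, gate_layer predicts stop
    hidden = self.linear_projection(
        torch.cat((self.decoder_hidden, self.attention_context), dim=1))
    decoder_output = self.mel_layer(hidden)
    gate_prediction = self.gate_layer(hidden)
    return decoder_output, gate_prediction, self.attention_weights
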
def __init__(self, in_dim, sizes):
    super(Prenet, self).__init__()
    in_sizes = [in_dim] + sizes[:-1]
    self.layers = nn.ModuleList([
        LinearNorm(in_size, out_size, bias=False)
        for (in_size, out_size) in zip(in_sizes, sizes)
    ])

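# A short sketch of the corresponding forward pass. In Tacotron 2 the
# prenet dropout stays active even at inference (training=True), which is
# why functional dropout is used instead of an nn.Dropout module; assumes
# `import torch.nn.functional as F`.
def forward(self, x):
    for linear in self.layers:
        # dropout is applied unconditionally, including at inference
        x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
    return x
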
def __init__(self, attention_n_filters, attention_kernel_size,
             attention_dim):
    super(LocationLayer, self).__init__()
    self.location_conv = ConvNorm(1, attention_n_filters,
                                  kernel_size=attention_kernel_size,
                                  padding=int((attention_kernel_size - 1) / 2),
                                  stride=1, dilation=1)
    self.location_dense = LinearNorm(attention_n_filters, attention_dim,
                                     bias=False, w_init_gain='tanh')

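# A minimal sketch of how these two layers are typically composed: the
# 1-D conv slides over the time axis of the attention weights (a single
# channel in this variant, given ConvNorm(1, ...)), and the output is
# transposed so the dense layer maps each encoder step to attention_dim.
# Shapes in the comments are assumptions based on that reading.
def forward(self, attention_weights):
    # attention_weights: (B, 1, T_enc)
    processed = self.location_conv(attention_weights)  # (B, n_filters, T_enc)
    processed = processed.transpose(1, 2)              # (B, T_enc, n_filters)
    return self.location_dense(processed)              # (B, T_enc, attention_dim)
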
def __init__(self, n_mel_channels, n_frames_per_step, encoder_embedding_dim,
             attention_dim, attention_location_n_filters,
             attention_location_kernel_size, attention_rnn_dim,
             decoder_rnn_dim, prenet_dim, max_decoder_steps, gate_threshold,
             p_attention_dropout, p_decoder_dropout, early_stopping, E,
             speaker_embedding_dim):
    super(Decoder, self).__init__()
    self.n_mel_channels = n_mel_channels
    self.n_frames_per_step = n_frames_per_step
    # + speaker_embedding_dim: add that term only when using speaker_id!
    self.encoder_embedding_dim = encoder_embedding_dim + E
    self.attention_rnn_dim = attention_rnn_dim
    self.decoder_rnn_dim = decoder_rnn_dim
    self.prenet_dim = prenet_dim
    self.max_decoder_steps = max_decoder_steps
    self.gate_threshold = gate_threshold
    self.p_attention_dropout = p_attention_dropout
    self.p_decoder_dropout = p_decoder_dropout
    self.early_stopping = early_stopping

    self.prenet = Prenet(n_mel_channels * n_frames_per_step,
                         [prenet_dim, prenet_dim])
    self.attention_rnn = nn.LSTMCell(
        prenet_dim + self.encoder_embedding_dim, attention_rnn_dim)
    self.attention_layer = Attention(attention_rnn_dim,
                                     self.encoder_embedding_dim,
                                     attention_dim,
                                     attention_location_n_filters,
                                     attention_location_kernel_size)
    # nn.LSTMCell has no num_layers parameter; its third argument is bias
    self.decoder_rnn = nn.LSTMCell(
        attention_rnn_dim + self.encoder_embedding_dim,
        decoder_rnn_dim, bias=True)
    self.linear_projection = LinearNorm(
        decoder_rnn_dim + self.encoder_embedding_dim,
        n_mel_channels * n_frames_per_step)
    self.gate_layer = LinearNorm(
        decoder_rnn_dim + self.encoder_embedding_dim, 1,
        bias=True, w_init_gain='sigmoid')

def __init__(self, in_dim, sizes):
    """
    Args:
        in_dim (int): n_mel_channels * n_frames_per_step
        sizes (list[int]): [prenet_dim, prenet_dim]
    """
    super(Prenet, self).__init__()
    in_sizes = [in_dim] + sizes[:-1]
    self.layers = nn.ModuleList([
        LinearNorm(in_size, out_size, bias=False)
        for (in_size, out_size) in zip(in_sizes, sizes)
    ])

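# Hypothetical usage of the Prenet above, with typical Tacotron 2 values
# (80 mel channels, 1 frame per step, prenet_dim=256) chosen purely for
# illustration; assumes `import torch` and a forward pass like the sketch
# after the earlier Prenet definition.
prenet = Prenet(in_dim=80 * 1, sizes=[256, 256])
frames = torch.zeros(16, 80)  # (batch, n_mel_channels * n_frames_per_step)
out = prenet(frames)          # (batch, prenet_dim)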