def __init__(self, linguistic_dim, phone_level_rnn_dim, join_model_dim,
             join_model_hidden_dim):
    super(JoinModol, self).__init__()
    self.layers1 = LinearNorm(linguistic_dim + phone_level_rnn_dim,
                              join_model_hidden_dim)
    self.layers2 = LinearNorm(join_model_hidden_dim, join_model_hidden_dim)
    self.layers3 = LinearNorm(join_model_hidden_dim, join_model_dim)
def __init__(self, vocab_size, decoder_dim, hidden_size, dropout=0.5):
    super(MIEsitmator, self).__init__()
    self.proj = nn.Sequential(
        LinearNorm(decoder_dim, hidden_size, bias=True, w_init_gain='relu'),
        nn.ReLU(),
        nn.Dropout(p=dropout))
    # vocab_size + 1 output classes: index vocab_size is reserved for the CTC blank
    self.ctc_proj = LinearNorm(hidden_size, vocab_size + 1, bias=True)
    self.ctc = nn.CTCLoss(blank=vocab_size, reduction='none')
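# A hedged sketch of how this estimator is typically applied, assuming
# decoder_outputs is [B, T_dec, decoder_dim]; the argument names are
# assumptions, not from the original source (F is torch.nn.functional).
def forward(self, decoder_outputs, target_phones, decoder_lengths, target_lengths):
    log_probs = self.ctc_proj(self.proj(decoder_outputs))     # [B, T_dec, vocab+1]
    log_probs = log_probs.log_softmax(dim=2).transpose(0, 1)  # CTC expects [T, B, C]
    # The CTC loss lower-bounds the mutual information between text and decoder states
    ctc_loss = self.ctc(log_probs, target_phones, decoder_lengths, target_lengths)
    return (ctc_loss / decoder_lengths.float()).mean()        # normalise per frame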
def __init__(self, input_size, decoder_hidden_size, audio_encoder_size,
             spectral_size, hparams, num_layers=1, bidirectional=False,
             drop_prob=0):
    super(RecurrentDecoder, self).__init__()
    self.batch_size = hparams.batch_size
    self.decoder_hidden_size = decoder_hidden_size
    self.rnn_dropout = hparams.rnn_dropout
    self.n_mel_channels = hparams.n_mel_channels
    self.attention = AttentionLoop(audio_encoder_size, decoder_hidden_size,
                                   method="concat")
    self.rnn = nn.GRUCell(input_size + audio_encoder_size,
                          decoder_hidden_size, bias=False)
    self.spectral_linear_projection = LinearNorm(audio_encoder_size,
                                                 audio_encoder_size)
    self.gate_linear_projection = LinearNorm(audio_encoder_size, 1, bias=True,
                                             w_init_gain='sigmoid')
def __init__(self, hparams):
    super(VAE, self).__init__()
    # separate heads for the posterior mean (mu) and log-variance (lv)
    self.fc_r_mu = LinearNorm(hparams.reference_dim + 3, hparams.z_residual_dim)
    self.fc_r_lv = LinearNorm(hparams.reference_dim + 3, hparams.z_residual_dim)
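# A hedged sketch of the usual reparameterisation step these two heads feed,
# assuming `reference` is [B, reference_dim + 3]; the method name and return
# convention are assumptions, not from the original source.
def reparameterize(self, reference):
    mu = self.fc_r_mu(reference)       # posterior mean
    logvar = self.fc_r_lv(reference)   # posterior log-variance
    std = torch.exp(0.5 * logvar)
    eps = torch.randn_like(std)        # sample z ~ N(mu, sigma^2)
    return mu + eps * std, mu, logvar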
def __init__(self, hparams):
    super(Encoder, self).__init__()
    convolutions = []
    for _ in range(hparams.encoder_n_convolutions):
        conv_layer = nn.Sequential(
            ConvNorm(hparams.encoder_embedding_dim,
                     hparams.encoder_embedding_dim,
                     kernel_size=hparams.encoder_kernel_size, stride=1,
                     padding=int((hparams.encoder_kernel_size - 1) / 2),
                     dilation=1, w_init_gain='relu'),
            nn.BatchNorm1d(hparams.encoder_embedding_dim))
        convolutions.append(conv_layer)
    self.convolutions = nn.ModuleList(convolutions)
    self.lstm = nn.LSTM(hparams.encoder_embedding_dim,
                        int(hparams.encoder_embedding_dim / 2), 1,
                        batch_first=True, bidirectional=True)

    # Transformer-TTS
    self.pos_emb = nn.Embedding.from_pretrained(
        get_sinusoid_encoding_table(1024, 512, padding_idx=0), freeze=True)
    self.pos_dropout = nn.Dropout(p=0.1)
    self.alpha = nn.Parameter(torch.ones(1))
    self.layers = clones(SelfAttention(hparams.encoder_embedding_dim),
                         hparams.n_attention)
    self.ffns = clones(FFN(hparams.encoder_embedding_dim), hparams.n_attention)
    # nn.LayerNorm(normalized_shape); its second positional argument is eps, not a dim
    self.norm = nn.LayerNorm(hparams.encoder_embedding_dim)
    self.concat_after = LinearNorm(
        hparams.encoder_embedding_dim + hparams.encoder_embedding_dim,
        hparams.encoder_embedding_dim)
    self.linear_norm = LinearNorm(hparams.encoder_embedding_dim,
                                  hparams.encoder_embedding_dim)
    self.pos_linear = Linear(hparams.encoder_embedding_dim,
                             hparams.encoder_embedding_dim)
def __init__(self, enc_hidden_size, dec_hidden_size):
    super(Attention, self).__init__()
    # self.model = LinearNorm(dec_hidden_size, enc_hidden_size, bias=False)
    self.fc_1 = LinearNorm(enc_hidden_size, dec_hidden_size, bias=True)
    self.fc_2 = LinearNorm(dec_hidden_size, dec_hidden_size, bias=True)
    self.weight = nn.Parameter(torch.zeros(dec_hidden_size, 1))
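# A hedged sketch of a plausible additive (Bahdanau-style) scoring pass for the
# layer above; the method name `score` and all tensor shapes are assumptions.
def score(self, encoder_outputs, decoder_hidden):
    # encoder_outputs: [B, T, enc_hidden]; decoder_hidden: [B, dec_hidden]
    energies = torch.tanh(self.fc_1(encoder_outputs)
                          + self.fc_2(decoder_hidden).unsqueeze(1))
    energies = energies.matmul(self.weight).squeeze(-1)  # [B, T]
    return torch.softmax(energies, dim=1)                # attention weights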
def __init__(self, hparams):
    super(Decoder, self).__init__()
    self.n_mel_channels = hparams.n_mel_channels
    self.n_frames_per_step = hparams.n_frames_per_step
    self.encoder_embedding_dim = hparams.encoder_embedding_dim
    self.attention_rnn_dim = hparams.attention_rnn_dim
    self.decoder_rnn_dim = hparams.decoder_rnn_dim
    self.prenet_dim = hparams.prenet_dim
    self.max_decoder_steps = hparams.max_decoder_steps
    self.gate_threshold = hparams.gate_threshold

    self.prenet = Prenet(
        hparams.n_mel_channels * hparams.n_frames_per_step,
        [hparams.prenet_dim, hparams.prenet_dim])
    self.attention_rnn = nn.LSTMCell(
        hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
        hparams.attention_rnn_dim)
    self.attention_layer = Attention(
        hparams.attention_rnn_dim, hparams.encoder_embedding_dim,
        hparams.attention_dim, hparams.attention_location_n_filters,
        hparams.attention_location_kernel_size)
    self.decoder_rnn = nn.LSTMCell(
        hparams.prenet_dim + hparams.encoder_embedding_dim,
        hparams.decoder_rnn_dim, bias=True)  # nn.LSTMCell takes no num_layers argument
    self.linear_projection = LinearNorm(
        hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
        hparams.n_mel_channels * hparams.n_frames_per_step)
    self.gate_layer = LinearNorm(
        hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
        bias=True, w_init_gain='sigmoid')
def __init__(self, hparams):
    super(Discriminator, self).__init__()
    self.aae_hidden1 = LinearNorm(hparams.z_residual_dim + 3, 256)
    self.aae_hidden2 = LinearNorm(256, 256)
    self.aae_output = LinearNorm(256, 1)
    self.batchs = hparams.batch_size
    self.z_r_dim = hparams.z_residual_dim
def __init__(self, enc_hidden_size, dec_hidden_size, method="concat"):
    super(AttentionLoop, self).__init__()
    self.method = method
    if method == "general":
        self.model = LinearNorm(dec_hidden_size, enc_hidden_size, bias=True)
    elif method == "concat":
        self.fc = LinearNorm(dec_hidden_size, enc_hidden_size, bias=True)
        self.weight = nn.Parameter(torch.zeros(enc_hidden_size, 1))
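# A hedged sketch of the two Luong-style scoring variants the constructor above
# prepares ("general" vs. "concat"); the method name and shapes are assumptions,
# not from the original source.
def score(self, decoder_hidden, encoder_outputs):
    # decoder_hidden: [B, dec_hidden]; encoder_outputs: [B, T, enc_hidden]
    if self.method == "general":
        query = self.model(decoder_hidden).unsqueeze(2)  # [B, enc_hidden, 1]
        return encoder_outputs.bmm(query).squeeze(-1)    # [B, T]
    # "concat": project the query, add it to every encoder step, then reduce
    energies = torch.tanh(self.fc(decoder_hidden).unsqueeze(1) + encoder_outputs)
    return energies.matmul(self.weight).squeeze(-1)      # [B, T]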
def __init__(self, hparams, supervised=False):
    super(GMVAE_revised, self).__init__()
    self.latent_embedding_dim = hparams.latent_embedding_dim
    self.supervised = supervised

    convolutions = []
    conv_layer_1 = nn.Sequential(
        ConvNorm(hparams.n_mel_channels, hparams.latent_embedding_dim,
                 kernel_size=hparams.latent_kernel_size, stride=1,
                 padding=int((hparams.latent_kernel_size - 1) / 2),
                 dilation=1, w_init_gain='relu'),
        nn.BatchNorm1d(hparams.latent_embedding_dim))
    convolutions.append(conv_layer_1)
    conv_layer_2 = nn.Sequential(
        ConvNorm(hparams.latent_embedding_dim, hparams.latent_embedding_dim,
                 kernel_size=hparams.latent_kernel_size, stride=1,
                 padding=int((hparams.latent_kernel_size - 1) / 2),
                 dilation=1, w_init_gain='relu'),
        nn.BatchNorm1d(hparams.latent_embedding_dim))
    convolutions.append(conv_layer_2)
    self.convolutions = nn.ModuleList(convolutions)

    self.lstm = nn.LSTM(hparams.latent_embedding_dim,
                        int(hparams.latent_embedding_dim / 2), 1,
                        batch_first=True, bidirectional=True)
    # self.mean_pool = nn.AvgPool1d(hparams.latent_kernel_size, stride=1)
    # self.mean_pool_out_size = hparams.latent_embedding_dim - hparams.latent_kernel_size + 1
    self.linear_projection = LinearNorm(
        hparams.latent_embedding_dim, int(hparams.latent_embedding_dim / 2))
    self.linear_projection_mean = LinearNorm(
        int(hparams.latent_embedding_dim / 2), hparams.latent_out_dim)
    self.linear_projection_variance = LinearNorm(
        int(hparams.latent_embedding_dim / 2), hparams.latent_out_dim)
    self.fc3 = nn.Linear(hparams.latent_out_dim,
                         int(hparams.latent_embedding_dim / 2))
    self.fc4 = nn.Linear(int(hparams.latent_embedding_dim / 2),
                         hparams.latent_embedding_dim)
def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
             attention_location_n_filters, attention_location_kernel_size):
    super(Attention, self).__init__()
    self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
                                  bias=False, w_init_gain='tanh')
    self.memory_layer = LinearNorm(embedding_dim, attention_dim,
                                   bias=False, w_init_gain='tanh')
    self.v = LinearNorm(attention_dim, 1, bias=False)
    self.location_layer = LocationLayer(attention_location_n_filters,
                                        attention_location_kernel_size,
                                        attention_dim)
    self.score_mask_value = -float("inf")
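# A minimal sketch of the standard location-sensitive energy computation these
# layers implement (per the Tacotron 2 recipe); `processed_memory` is assumed
# to be memory_layer(memory) precomputed, and attention_weights_cat is [B, 2, T_in].
def get_alignment_energies(self, query, processed_memory, attention_weights_cat):
    processed_query = self.query_layer(query.unsqueeze(1))        # [B, 1, attn_dim]
    processed_location = self.location_layer(attention_weights_cat)
    energies = self.v(torch.tanh(
        processed_query + processed_location + processed_memory))
    return energies.squeeze(-1)                                   # [B, T_in]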
def __init__(self, hparams):
    super(Decoder, self).__init__()
    self.n_mel_channels = hparams.n_mel_channels
    self.n_frames_per_step = hparams.n_frames_per_step
    # no speaker embedding: the token embedding is made bigger instead
    self.encoder_embedding_dim = (hparams.encoder_embedding_dim
                                  + hparams.token_embedding_size)  # + hparams.speaker_embedding_dim
    self.attention_rnn_dim = hparams.attention_rnn_dim
    self.decoder_rnn_dim = hparams.decoder_rnn_dim
    self.prenet_dim = hparams.prenet_dim
    self.max_decoder_steps = hparams.max_decoder_steps
    self.gate_threshold = hparams.gate_threshold
    self.p_attention_dropout = hparams.p_attention_dropout
    self.p_decoder_dropout = hparams.p_decoder_dropout
    self.p_teacher_forcing = hparams.p_teacher_forcing

    # f0 prenet disabled:
    # self.prenet_f0_dim = hparams.prenet_f0_dim
    # self.prenet_f0 = ConvNorm(
    #     1, hparams.prenet_f0_dim,
    #     kernel_size=hparams.prenet_f0_kernel_size,
    #     padding=max(0, int(hparams.prenet_f0_kernel_size/2)),
    #     bias=False, stride=1, dilation=1)

    self.prenet = Prenet(
        hparams.n_mel_channels * hparams.n_frames_per_step,
        [hparams.prenet_dim, hparams.prenet_dim])
    # self.attention_rnn = nn.LSTMCell(
    #     hparams.prenet_dim + hparams.prenet_f0_dim + self.encoder_embedding_dim,
    #     hparams.attention_rnn_dim)
    self.attention_rnn = nn.LSTMCell(
        hparams.prenet_dim + self.encoder_embedding_dim,
        hparams.attention_rnn_dim)
    self.attention_layer = Attention(
        hparams.attention_rnn_dim, self.encoder_embedding_dim,
        hparams.attention_dim, hparams.attention_location_n_filters,
        hparams.attention_location_kernel_size)
    self.decoder_rnn = nn.LSTMCell(
        hparams.attention_rnn_dim + self.encoder_embedding_dim,
        hparams.decoder_rnn_dim, bias=True)  # nn.LSTMCell takes no num_layers argument
    self.linear_projection = LinearNorm(
        hparams.decoder_rnn_dim + self.encoder_embedding_dim,
        hparams.n_mel_channels * hparams.n_frames_per_step)
    self.gate_layer = LinearNorm(
        hparams.decoder_rnn_dim + self.encoder_embedding_dim, 1,
        bias=True, w_init_gain='sigmoid')
def __init__(self, hparams):
    super(Decoder, self).__init__()
    self.mellotron = hparams.mellotron
    self.disable_f0 = hparams.disable_f0
    self.n_mel_channels = hparams.n_mel_channels
    self.n_frames_per_step = hparams.n_frames_per_step
    self.encoder_embedding_dim = (
        hparams.encoder_embedding_dim + hparams.token_embedding_size
        + hparams.speaker_embedding_dim
        if self.mellotron else hparams.encoder_embedding_dim)
    self.attention_rnn_dim = hparams.attention_rnn_dim
    self.decoder_rnn_dim = hparams.decoder_rnn_dim
    self.prenet_dim = hparams.prenet_dim
    self.max_decoder_steps = hparams.max_decoder_steps
    self.gate_threshold = hparams.gate_threshold
    self.p_attention_dropout = hparams.p_attention_dropout
    self.p_decoder_dropout = hparams.p_decoder_dropout
    self.p_teacher_forcing = hparams.p_teacher_forcing

    if self.mellotron and not self.disable_f0:
        self.prenet_f0 = ConvNorm(
            1, hparams.prenet_f0_dim,
            kernel_size=hparams.prenet_f0_kernel_size,
            padding=max(0, int(hparams.prenet_f0_kernel_size / 2)),
            bias=False, stride=1, dilation=1)
        attention_rnn_in_dim = (hparams.prenet_dim + self.encoder_embedding_dim
                                + hparams.prenet_f0_dim)
    else:
        attention_rnn_in_dim = hparams.prenet_dim + self.encoder_embedding_dim

    self.prenet = Prenet(
        hparams.n_mel_channels * hparams.n_frames_per_step,
        [hparams.prenet_dim, hparams.prenet_dim])
    self.attention_rnn = nn.LSTMCell(attention_rnn_in_dim,
                                     hparams.attention_rnn_dim)
    self.attention_layer = Attention(
        hparams.attention_rnn_dim, self.encoder_embedding_dim,
        hparams.attention_dim, hparams.attention_location_n_filters,
        hparams.attention_location_kernel_size)
    self.decoder_rnn = nn.LSTMCell(
        hparams.attention_rnn_dim + self.encoder_embedding_dim,
        hparams.decoder_rnn_dim, bias=True)
    self.linear_projection = LinearNorm(
        hparams.decoder_rnn_dim + self.encoder_embedding_dim,
        hparams.n_mel_channels * hparams.n_frames_per_step)
    self.gate_layer = LinearNorm(
        hparams.decoder_rnn_dim + self.encoder_embedding_dim, 1,
        bias=True, w_init_gain='sigmoid')
def __init__(self, hparams):
    super(Decoder, self).__init__()
    self.n_mel_channels = hparams.n_mel_channels
    self.n_frames_per_step = hparams.n_frames_per_step
    self.encoder_embedding_dim = hparams.encoder_embedding_dim
    self.attention_rnn_dim = hparams.attention_rnn_dim
    self.decoder_rnn_dim = hparams.decoder_rnn_dim
    self.prenet_dim = hparams.prenet_dim
    self.max_decoder_steps = hparams.max_decoder_steps
    self.gate_threshold = hparams.gate_threshold
    self.p_attention_dropout = hparams.p_attention_dropout
    self.p_decoder_dropout = hparams.p_decoder_dropout
    self.p_teacher_forcing = hparams.p_teacher_forcing

    self.prenet = Prenet(
        hparams.n_mel_channels * hparams.n_frames_per_step,
        [hparams.prenet_dim, hparams.prenet_dim])
    self.attention_rnn = nn.LSTMCell(
        hparams.prenet_dim + hparams.encoder_embedding_dim,
        hparams.attention_rnn_dim)
    self.attention_layer = Attention(
        hparams.attention_rnn_dim, hparams.encoder_embedding_dim,
        hparams.attention_dim, hparams.attention_location_n_filters,
        hparams.attention_location_kernel_size)
    self.decoder_rnn = nn.LSTMCell(
        hparams.attention_rnn_dim + hparams.encoder_embedding_dim,
        hparams.decoder_rnn_dim, bias=True)
    self.linear_projection = nn.Sequential(
        LinearNorm(hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
                   hparams.decoder_rnn_dim, bias=True, w_init_gain='relu'),
        nn.ReLU(),
        nn.Dropout(p=0.2))
    self.mel_layer = nn.Sequential(
        LinearNorm(hparams.decoder_rnn_dim, hparams.decoder_rnn_dim,
                   bias=True, w_init_gain='relu'),
        nn.ReLU(),
        nn.Dropout(p=0.2),
        LinearNorm(hparams.decoder_rnn_dim,
                   hparams.n_mel_channels * hparams.n_frames_per_step))
    self.gate_layer = LinearNorm(hparams.decoder_rnn_dim, 1, bias=True,
                                 w_init_gain='sigmoid')
def __init__(self, query_dim, keys_dim, attention_dim):
    super(SimpleAttention, self).__init__()
    # Classic additive attention projects the query and keys, then scores with
    # v^T · tanh(W * query + V * keys).
    # query_layer and memory_layer compute W * query and V * keys respectively;
    # w_init_gain='tanh' because both are wrapped in tanh(W * query + V * keys).
    self.query_layer = LinearNorm(query_dim, attention_dim, bias=False,
                                  w_init_gain='tanh')
    self.memory_layer = LinearNorm(keys_dim, attention_dim, bias=False,
                                   w_init_gain='tanh')
    self.v = LinearNorm(attention_dim, 1, bias=False)
    self.score_mask_value = -float("inf")
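# A minimal sketch of the scoring formula described in the comments above,
# assuming query is [B, query_dim] and keys is [B, T, keys_dim]; the method
# name and the optional mask argument are assumptions.
def get_attention_weights(self, query, keys, mask=None):
    energies = self.v(torch.tanh(
        self.query_layer(query.unsqueeze(1)) + self.memory_layer(keys)))
    energies = energies.squeeze(-1)                        # [B, T]
    if mask is not None:
        energies = energies.masked_fill(mask, self.score_mask_value)
    return torch.softmax(energies, dim=1)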
def __init__(self, hparams):
    super(Residual_Encoder, self).__init__()
    self.z_r = LinearNorm(hparams.reference_dim, hparams.z_residual_dim)
    self.n_mels = hparams.n_mel_channels
    self.batchs = hparams.batch_size
    self.z_r_dim = hparams.z_residual_dim

    convolutions = []
    for i in range(hparams.n_convolutions):
        conv_layer = nn.Sequential(
            ConvNorm2d(hparams.conv_dim_in[i], hparams.conv_dim_out[i],
                       kernel_size=3, stride=2, padding=1, dilation=1,
                       w_init_gain='relu'),
            nn.BatchNorm2d(hparams.conv_dim_out[i]))
        convolutions.append(conv_layer)
    self.convolutions = nn.ModuleList(convolutions)

    out_channels = self.calculate_channels(hparams.n_mel_channels, 3, 2, 1, 2)
    self.ceil_n_mel_64 = int(ceil(hparams.n_mel_channels / 64))
    self.lstm = nn.LSTM(hparams.reference_dim * self.ceil_n_mel_64,
                        hparams.reference_dim // 2, 1,
                        batch_first=True, bidirectional=True)
    for name, param in self.lstm.named_parameters():
        if 'weight' in name:
            torch.nn.init.xavier_uniform_(param)
def __init__(self, hparams):
    super(NeuralConcatenativeSpeechSynthesis, self).__init__()
    self.n_mel_channels = hparams.n_mel_channels
    self.audio_prenet = Prenet(hparams.n_mel_channels,
                               [hparams.prenet_dim, hparams.prenet_dim],
                               hparams)
    self.target_audio_prenet = TargetPrenet(
        hparams.n_mel_channels, [hparams.prenet_dim, hparams.prenet_dim])
    self.text_prenet = ConvNorm(hparams.symbols_embedding_dim,
                                hparams.symbols_embedding_dim,
                                kernel_size=hparams.decoder_kernel_size,
                                stride=hparams.text_stride)
    self.embedding = nn.Embedding(hparams.n_symbols,
                                  hparams.symbols_embedding_dim)
    # Text-to-audio seq2seq (alignment module 1)
    self.glued_mel_encoder = AudioEncoder(hparams.prenet_dim,
                                          hparams.encoder_rnn_dim)
    self.glued_text_decoder = AttentionDecoder(
        hparams.symbols_embedding_dim, hparams.decoder_rnn_dim,
        hparams.encoder_rnn_dim)
    # Text-to-text seq2seq (pseudo alignment module 2)
    self.target_text_decoder = AttentionDecoder(
        hparams.symbols_embedding_dim, hparams.decoder_rnn_dim,
        hparams.decoder_rnn_dim)
    # Decoder
    self.decoder = RecurrentDecoder(hparams.prenet_dim,
                                    hparams.mel_decoder_rnn_dim,
                                    hparams.prenet_dim,
                                    hparams.n_mel_channels, hparams)
    self.postnet = LinearNorm(hparams.prenet_dim, hparams.n_mel_channels)
def __init__(self, in_dim, sizes):
    super(Prenet, self).__init__()
    in_sizes = [in_dim] + sizes[:-1]
    self.layers = nn.ModuleList([
        LinearNorm(in_size, out_size, bias=False)
        for (in_size, out_size) in zip(in_sizes, sizes)])
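# A minimal sketch of the usual Tacotron 2 prenet forward pass, assuming this
# Prenet follows that recipe: dropout runs with training=True even at
# inference, which keeps output variation (F is torch.nn.functional).
def forward(self, x):
    for linear in self.layers:
        x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
    return x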
def __init__(self, hparams):
    super(Decoder, self).__init__()
    self.n_mel_channels = hparams.n_mel_channels
    self.n_frames_per_step = hparams.n_frames_per_step
    self.encoder_embedding_dim = hparams.encoder_embedding_dim
    self.attention_rnn_dim = hparams.attention_rnn_dim
    self.decoder_rnn_dim = hparams.decoder_rnn_dim
    self.prenet_dim = hparams.prenet_dim
    self.max_decoder_steps = hparams.max_decoder_steps
    self.gate_threshold = hparams.gate_threshold
    self.p_attention_dropout = hparams.p_attention_dropout
    self.p_decoder_dropout = hparams.p_decoder_dropout

    self.prenet = Prenet(
        hparams.n_mel_channels * hparams.n_frames_per_step,
        [hparams.prenet_dim, hparams.prenet_dim])
    self.attention_rnn = nn.LSTMCell(
        hparams.prenet_dim + hparams.encoder_embedding_dim,
        hparams.attention_rnn_dim)
    self.attention_layer = Attention(
        hparams.attention_rnn_dim, hparams.encoder_embedding_dim,
        hparams.attention_dim, hparams.attention_location_n_filters,
        hparams.attention_location_kernel_size)
    self.decoder_rnn = nn.LSTMCell(
        hparams.attention_rnn_dim + hparams.encoder_embedding_dim,
        hparams.decoder_rnn_dim, bias=True)
    self.linear_projection = LinearNorm(
        hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
        hparams.n_mel_channels * hparams.n_frames_per_step)
    self.gate_layer = LinearNorm(
        hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
        bias=True, w_init_gain='sigmoid')

    # Transformer TTS
    self.norm = LinearNorm(hparams.prenet_dim, hparams.prenet_dim)
    self.pos_emb = nn.Embedding.from_pretrained(
        get_sinusoid_encoding_table(1024, hparams.prenet_dim, padding_idx=0),
        freeze=True)
    self.alpha = nn.Parameter(torch.ones(1))
    self.pos_dropout = nn.Dropout(p=0.1)
    self.pos_linear = Linear(hparams.prenet_dim, hparams.prenet_dim)
def __init__(self, hparams):
    super(Decoder, self).__init__()
    self.n_mel_channels = hparams.n_mel_channels
    self.n_frames_per_step = hparams.n_frames_per_step
    self.encoder_embedding_dim = hparams.encoder_embedding_dim
    self.attention_rnn_dim = hparams.attention_rnn_dim
    self.decoder_rnn_dim = hparams.decoder_rnn_dim
    self.prenet_dim = hparams.prenet_dim
    self.max_decoder_steps = hparams.max_decoder_steps
    self.gate_threshold = hparams.gate_threshold
    self.p_attention_dropout = hparams.p_attention_dropout
    self.p_decoder_dropout = hparams.p_decoder_dropout

    self.prenet = Prenet(
        hparams.n_mel_channels * hparams.n_frames_per_step,  # 80*n
        [hparams.prenet_dim, hparams.prenet_dim])            # [256, 256]

    # 256+512 -> 1024
    self.attention_rnn = nn.LSTMCell(
        hparams.prenet_dim + hparams.encoder_embedding_dim,  # 256+512
        hparams.attention_rnn_dim)                           # 1024

    self.attention_layer = Attention(
        hparams.attention_rnn_dim,               # 1024
        hparams.encoder_embedding_dim,           # 512
        hparams.attention_dim,                   # 128
        hparams.attention_location_n_filters,    # 32
        hparams.attention_location_kernel_size)  # 31

    # 1024+512 -> 1024
    self.decoder_rnn = nn.LSTMCell(
        hparams.attention_rnn_dim + hparams.encoder_embedding_dim,  # 1024+512
        hparams.decoder_rnn_dim, bias=True)  # 1024 (a single cell; LSTMCell has no num_layers)

    # 1024+512 -> n*mels
    self.linear_projection = LinearNorm(
        hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,  # 1024+512
        hparams.n_mel_channels * hparams.n_frames_per_step)       # n*mels

    # 1024+512 -> 1
    self.gate_layer = LinearNorm(
        hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
        bias=True, w_init_gain='sigmoid')
def __init__(self, hparams):
    super(Decoder, self).__init__()
    self.n_mel_channels = hparams.n_mel_channels
    self.n_frames_per_step = hparams.n_frames_per_step
    self.encoder_embedding_dim = hparams.encoder_embedding_dim
    self.frame_level_rnn_dim = hparams.frame_level_rnn_dim
    self.phone_level_rnn_dim = hparams.phone_level_rnn_dim
    self.join_model_dim = hparams.frame_level_rnn_dim
    self.decoder_rnn_dim = hparams.decoder_rnn_dim
    self.prenet_dim = hparams.prenet_dim
    self.p_decoder_dropout = hparams.p_decoder_dropout
    self.decoder_training_mode = hparams.decoder_training_mode
    if self.decoder_training_mode == 'random annealing':
        self.annealing = annealing(0.9999)

    self.prenet = Prenet(
        hparams.n_mel_channels * hparams.n_frames_per_step,
        [hparams.prenet_dim, hparams.prenet_dim])
    self.frame_level_rnn = nn.LSTMCell(hparams.prenet_dim,
                                       hparams.frame_level_rnn_dim)
    self.self_attention = vectorBased_selfAttention(
        hparams.frame_level_rnn_dim, hparams.self_attention_dim)
    self.text_attention_layer = SimpleAttention(
        hparams.frame_level_rnn_dim, hparams.encoder_embedding_dim,
        hparams.attention_dim)
    self.phone_level_rnn = nn.LSTMCell(hparams.frame_level_rnn_dim,
                                       hparams.phone_level_rnn_dim)
    self.join_model_layer = JoinModol(hparams.encoder_embedding_dim,
                                      hparams.phone_level_rnn_dim,
                                      self.join_model_dim,
                                      hparams.join_model_hidden_dim)
    self.decoder_rnn = nn.LSTMCell(
        hparams.frame_level_rnn_dim + self.join_model_dim,
        hparams.decoder_rnn_dim)
    self.linear_projection = LinearNorm(
        hparams.decoder_rnn_dim + self.join_model_dim,
        hparams.n_mel_channels * hparams.n_frames_per_step)
    self.attention_layer = Attention(
        hparams.decoder_rnn_dim + self.join_model_dim,
        hparams.encoder_embedding_dim, hparams.attention_dim,
        hparams.attention_location_n_filters,
        hparams.attention_location_kernel_size)
    self.max_decoder_steps = hparams.max_decoder_steps
    self.more_information = bool(hparams.more_information)
def __init__(self, attention_n_filters, attention_kernel_size, attention_dim):
    super(LocationLayer, self).__init__()
    padding = int((attention_kernel_size - 1) / 2)
    # 2 input channels: the previous and the cumulative attention weights
    self.location_conv = ConvNorm(2, attention_n_filters,
                                  kernel_size=attention_kernel_size,
                                  padding=padding, bias=False)
    self.location_dense = LinearNorm(attention_n_filters, attention_dim,
                                     bias=False, w_init_gain='tanh')
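# A minimal sketch of the canonical LocationLayer forward pass (as in the
# Tacotron 2 recipe); attention_weights_cat is [B, 2, T_in].
def forward(self, attention_weights_cat):
    processed = self.location_conv(attention_weights_cat)  # [B, n_filters, T_in]
    processed = processed.transpose(1, 2)                  # [B, T_in, n_filters]
    return self.location_dense(processed)                  # [B, T_in, attn_dim]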
def __init__(self, hparams):
    super().__init__()
    self.dropout = nn.Dropout(hparams.ref_attention_dropout)
    self.d_q = hparams.encoder_embedding_dim
    self.d_k = hparams.prosody_embedding_dim
    # one projection for the queries (d_q) and one for the keys (d_k)
    self.linears = nn.ModuleList([
        LinearNorm(in_dim, hparams.ref_attention_dim, bias=False,
                   w_init_gain='tanh')
        for in_dim in (self.d_q, self.d_k)])
    self.score_mask_value = 1e-9
def __init__(self, num_mixtures, attention_layers, attention_rnn_dim,
             embedding_dim, attention_dim, attention_location_n_filters,
             attention_location_kernel_size, hparams):
    super(GMMAttention, self).__init__()
    self.num_mixtures = num_mixtures
    self.normalize_attention_input = hparams.normalize_attention_input
    self.delta_min_limit = hparams.delta_min_limit
    self.delta_offset = hparams.delta_offset
    self.lin_bias = hparams.lin_bias
    self.initial_gain = hparams.initial_gain

    # final projection to 3 parameters (weight, delta, scale) per mixture
    lin = nn.Linear(attention_dim, 3 * num_mixtures, bias=self.lin_bias)
    lin.weight.data.mul_(0.01)
    if self.lin_bias:
        lin.bias.data.mul_(0.008)
        lin.bias.data.sub_(2.0)

    if attention_layers == 1:
        self.F = nn.Sequential(
            LinearNorm(attention_rnn_dim, attention_dim, bias=True,
                       w_init_gain=self.initial_gain),
            nn.Tanh(),
            lin)
    elif attention_layers == 2:
        self.F = nn.Sequential(
            LinearNorm(attention_rnn_dim, attention_dim, bias=True,
                       w_init_gain=self.initial_gain),
            LinearNorm(attention_dim, attention_dim, bias=False,
                       w_init_gain='tanh'),
            nn.Tanh(),
            lin)
    else:
        raise ValueError(
            f"attention_layers invalid, valid values are 1 or 2; "
            f"got {attention_layers}")

    self.score_mask_value = 0  # -float("inf")
    self.register_buffer(
        'pos', torch.arange(0, 2000, dtype=torch.float).view(1, -1, 1).data)
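# A hedged sketch of how the mixture parameters above are typically consumed
# in a GMMv2-style attention step; the method name, the use of softplus, and
# the offset/clamp ordering are assumptions rather than the original code.
def get_alignment_energies(self, query, prev_mu, mask):
    # query: [B, attention_rnn_dim]; prev_mu: [B, 1, num_mixtures]
    w, delta, scale = self.F(query.unsqueeze(1)).chunk(3, dim=-1)
    delta = torch.nn.functional.softplus(delta) + self.delta_offset
    if self.delta_min_limit > 0:
        delta = delta.clamp(min=self.delta_min_limit)
    mu = prev_mu + delta                      # means only move forward (monotonic)
    scale = torch.nn.functional.softplus(scale)
    pos = self.pos[:, :mask.size(1)]          # [1, T_in, 1] encoder positions
    energies = (torch.softmax(w, dim=-1)
                * torch.exp(-scale * (pos - mu) ** 2)).sum(-1)  # [B, T_in]
    return energies.masked_fill(mask, self.score_mask_value), mu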
def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
             attention_location_n_filters, attention_location_kernel_size):
    super(Attention, self).__init__()
    # Classic additive attention projects the query and keys, then scores with
    # v^T · tanh(W * query + V * keys).
    # query_layer and memory_layer compute W * query and V * keys respectively;
    # w_init_gain='tanh' because both are wrapped in tanh(W * query + V * keys).
    self.query_layer = LinearNorm(attention_rnn_dim, attention_dim, bias=False,
                                  w_init_gain='tanh')
    self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False,
                                   w_init_gain='tanh')
    # On top of the classic parameters, this attention also convolves the
    # previous attention weights (location-sensitive attention).
    self.location_layer = LocationLayer(attention_location_n_filters,
                                        attention_location_kernel_size,
                                        attention_dim)
    self.v = LinearNorm(attention_dim, 1, bias=False)
    self.score_mask_value = -float("inf")
def __init__(self, config):
    super(Decoder, self).__init__()
    self.n_mel_channels = config["n_mel_channels"]
    self.n_frames_per_step = config["n_frames_per_step"]
    self.encoder_embedding_dim = config["encoder_embedding_dim"]
    self.attention_rnn_dim = config["attention_rnn_dim"]
    self.decoder_rnn_dim = config["decoder_rnn_dim"]
    self.prenet_dim = config["prenet_dim"]
    self.max_decoder_steps = config["max_decoder_steps"]
    self.gate_threshold = config["gate_threshold"]
    self.p_attention_dropout = config["p_attention_dropout"]
    self.p_decoder_dropout = config["p_decoder_dropout"]

    self.prenet = Prenet(
        config["n_mel_channels"] * config["n_frames_per_step"],
        [config["prenet_dim"], config["prenet_dim"]])
    self.attention_rnn = nn.LSTMCell(
        config["prenet_dim"] + config["encoder_embedding_dim"],
        config["attention_rnn_dim"])
    self.attention_layer = Attention(
        config["attention_rnn_dim"], config["encoder_embedding_dim"],
        config["attention_dim"], config["attention_location_n_filters"],
        config["attention_location_kernel_size"])
    self.decoder_rnn = nn.LSTMCell(
        config["attention_rnn_dim"] + config["encoder_embedding_dim"],
        config["decoder_rnn_dim"], bias=True)
    self.linear_projection = LinearNorm(
        config["decoder_rnn_dim"] + config["encoder_embedding_dim"],
        config["n_mel_channels"] * config["n_frames_per_step"])
    self.gate_layer = LinearNorm(
        config["decoder_rnn_dim"] + config["encoder_embedding_dim"], 1,
        bias=True, w_init_gain='sigmoid')
def __init__(self, hparams):
    super(Tacotron2, self).__init__()
    self.mask_padding = hparams.mask_padding
    self.fp16_run = hparams.fp16_run
    self.use_vae = hparams.use_vae
    self.embedding_variation = hparams.embedding_variation
    self.label_type = hparams.label_type
    self.n_mel_channels = hparams.n_mel_channels
    self.n_frames_per_step = hparams.n_frames_per_step
    self.symbols_embedding_dim = hparams.symbols_embedding_dim
    self.speaker_embedding_dim = hparams.speaker_embedding_dim
    self.emotion_embedding_dim = hparams.emotion_embedding_dim

    self.transcript_embedding = nn.Embedding(hparams.n_symbols,
                                             hparams.symbols_embedding_dim)
    if self.use_vae:
        if self.label_type == 'one-hot':
            self.speaker_embedding = LinearNorm(
                hparams.n_speakers, hparams.speaker_embedding_dim,
                bias=True, w_init_gain='tanh')
            self.emotion_embedding = LinearNorm(
                hparams.n_emotions, hparams.emotion_embedding_dim,
                bias=True, w_init_gain='tanh')
        elif self.label_type == 'id':
            self.speaker_embedding = nn.Embedding(
                hparams.n_speakers, hparams.speaker_embedding_dim)
            self.emotion_embedding = nn.Embedding(
                hparams.n_emotions, hparams.emotion_embedding_dim)
        self.vae_input_type = hparams.vae_input_type

    std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
    val = sqrt(3.0) * std  # uniform bounds for std
    self.transcript_embedding.weight.data.uniform_(-val, val)

    self.encoder = Encoder(hparams)
    self.decoder = Decoder(hparams)
    self.postnet = Postnet(hparams)
    self.vae_gst = VAE_GST(hparams)
def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
             attention_location_n_filters, attention_location_kernel_size):
    super(Attention, self).__init__()
    self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
                                  bias=False, w_init_gain='tanh')
    # if hparams.style == 'speaker_encoder':
    #     embedding_dim += 256
    # elif hparams.style == 'style_embedding':
    #     embedding_dim += 128
    # elif hparams.style == 'both':
    #     embedding_dim += 256 + 128
    self.memory_layer = LinearNorm(embedding_dim, attention_dim,
                                   bias=False, w_init_gain='tanh')
    self.v = LinearNorm(attention_dim, 1, bias=False)
    self.location_layer = LocationLayer(attention_location_n_filters,
                                        attention_location_kernel_size,
                                        attention_dim)
    self.score_mask_value = -float("inf")
def __init__(self, in_dim, sizes, hparams):
    super(Prenet, self).__init__()
    in_sizes = [in_dim] + sizes[:-1]
    self.layers = nn.ModuleList([
        LinearNorm(in_size, out_size, bias=False)
        for (in_size, out_size) in zip(in_sizes, sizes)])
    self.convolutions = nn.Sequential(
        ConvNorm(hparams.prenet_dim, hparams.prenet_dim,
                 kernel_size=hparams.audio_kernel_size,
                 stride=hparams.audio_stride, w_init_gain='relu'),
        nn.BatchNorm1d(hparams.prenet_dim))
def __init__(self, in_dim, sizes, p_prenet_dropout, prenet_batchnorm):
    super(Prenet, self).__init__()
    in_sizes = [in_dim] + sizes[:-1]
    self.layers = nn.ModuleList([
        LinearNorm(in_size, out_size, bias=False)
        for (in_size, out_size) in zip(in_sizes, sizes)])
    self.p_prenet_dropout = p_prenet_dropout
    self.prenet_batchnorm = prenet_batchnorm
    self.p_prenet_input_dropout = 0
    if self.prenet_batchnorm:
        self.batchnorms = nn.ModuleList(
            [nn.BatchNorm1d(size) for size in sizes])
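# A hedged sketch of how this configurable prenet's forward pass might look,
# given the dropout/batchnorm options above; the exact control flow is an
# assumption, not from the original source (F is torch.nn.functional).
def forward(self, x):
    if self.p_prenet_input_dropout > 0:  # optional dropout on the raw input
        x = F.dropout(x, p=self.p_prenet_input_dropout, training=True)
    for i, linear in enumerate(self.layers):
        x = F.relu(linear(x))
        if self.p_prenet_dropout > 0:    # dropout kept on at inference as usual
            x = F.dropout(x, p=self.p_prenet_dropout, training=True)
        if self.prenet_batchnorm:
            x = self.batchnorms[i](x)
    return x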