def __init__(self, embed_dim=512 // 2, frame_dim=80, dropout=0.5):
    super().__init__()
    self.conv = nn.ModuleList()
    # use the `dropout` argument instead of a hard-coded 0.5
    self.dropout = nn.Dropout(dropout)

    self.conv.append(
        nn.Sequential(
            ConvNorm(frame_dim, embed_dim,
                     kernel_size=5, stride=1,
                     padding=int((5 - 1) / 2),
                     dilation=1, w_init_gain='tanh'),
            nn.BatchNorm1d(embed_dim)))

    for i in range(1, 4):
        self.conv.append(
            nn.Sequential(
                ConvNorm(embed_dim, embed_dim,
                         kernel_size=5, stride=1,
                         padding=int((5 - 1) / 2),
                         dilation=1, w_init_gain='tanh'),
                nn.BatchNorm1d(embed_dim)))

    self.conv.append(
        nn.Sequential(
            ConvNorm(embed_dim, frame_dim,
                     kernel_size=5, stride=1,
                     padding=int((5 - 1) / 2),
                     dilation=1, w_init_gain='tanh'),
            nn.BatchNorm1d(frame_dim)))
def __init__(self, hparams):
    super(Postnet, self).__init__()
    self.convolutions = nn.ModuleList()

    self.convolutions.append(
        nn.Sequential(
            ConvNorm(hparams.n_mel_channels, hparams.postnet_embedding_dim,
                     kernel_size=hparams.postnet_kernel_size, stride=1,
                     padding=int((hparams.postnet_kernel_size - 1) / 2),
                     dilation=1, w_init_gain='tanh'),
            nn.BatchNorm1d(hparams.postnet_embedding_dim)))

    for i in range(1, hparams.postnet_n_convolutions - 1):
        self.convolutions.append(
            nn.Sequential(
                ConvNorm(hparams.postnet_embedding_dim,
                         hparams.postnet_embedding_dim,
                         kernel_size=hparams.postnet_kernel_size, stride=1,
                         padding=int((hparams.postnet_kernel_size - 1) / 2),
                         dilation=1, w_init_gain='tanh'),
                nn.BatchNorm1d(hparams.postnet_embedding_dim)))

    self.convolutions.append(
        nn.Sequential(
            ConvNorm(hparams.postnet_embedding_dim, hparams.n_mel_channels,
                     kernel_size=hparams.postnet_kernel_size, stride=1,
                     padding=int((hparams.postnet_kernel_size - 1) / 2),
                     dilation=1, w_init_gain='linear'),
            nn.BatchNorm1d(hparams.n_mel_channels)))
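# A minimal sketch of how a postnet stack like the one above is usually
# applied (this mirrors the reference Tacotron 2 forward pass, not
# necessarily this repo's): tanh after every convolution except the
# linear-initialized last one, with dropout throughout. Assumes
# `import torch` and `import torch.nn.functional as F`.
def forward(self, x):
    # x: (batch, n_mel_channels, time)
    for conv in self.convolutions[:-1]:
        x = F.dropout(torch.tanh(conv(x)), 0.5, self.training)
    # final layer has w_init_gain='linear', so no tanh
    x = F.dropout(self.convolutions[-1](x), 0.5, self.training)
    return x  # residual, typically added to the decoder's mel output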
def __init__(self, hparams, supervised=False):
    super(GMVAE_revised, self).__init__()
    self.latent_embedding_dim = hparams.latent_embedding_dim
    self.supervised = supervised

    convolutions = []
    conv_layer_1 = nn.Sequential(
        ConvNorm(hparams.n_mel_channels, hparams.latent_embedding_dim,
                 kernel_size=hparams.latent_kernel_size, stride=1,
                 padding=int((hparams.latent_kernel_size - 1) / 2),
                 dilation=1, w_init_gain='relu'),
        nn.BatchNorm1d(hparams.latent_embedding_dim))
    convolutions.append(conv_layer_1)
    conv_layer_2 = nn.Sequential(
        ConvNorm(hparams.latent_embedding_dim, hparams.latent_embedding_dim,
                 kernel_size=hparams.latent_kernel_size, stride=1,
                 padding=int((hparams.latent_kernel_size - 1) / 2),
                 dilation=1, w_init_gain='relu'),
        nn.BatchNorm1d(hparams.latent_embedding_dim))
    convolutions.append(conv_layer_2)
    self.convolutions = nn.ModuleList(convolutions)

    self.lstm = nn.LSTM(hparams.latent_embedding_dim,
                        int(hparams.latent_embedding_dim / 2), 1,
                        batch_first=True, bidirectional=True)

    # self.mean_pool = nn.AvgPool1d(hparams.latent_kernel_size, stride=1)
    # self.mean_pool_out_size = hparams.latent_embedding_dim - hparams.latent_kernel_size + 1

    self.linear_projection = LinearNorm(
        hparams.latent_embedding_dim, int(hparams.latent_embedding_dim / 2))
    self.linear_projection_mean = LinearNorm(
        int(hparams.latent_embedding_dim / 2), hparams.latent_out_dim)
    self.linear_projection_variance = LinearNorm(
        int(hparams.latent_embedding_dim / 2), hparams.latent_out_dim)
    self.fc3 = nn.Linear(hparams.latent_out_dim,
                         int(hparams.latent_embedding_dim / 2))
    self.fc4 = nn.Linear(int(hparams.latent_embedding_dim / 2),
                         hparams.latent_embedding_dim)
def __init__(self, hparams):
    super(Encoder, self).__init__()

    convolutions = []
    for _ in range(hparams.encoder_n_convolutions):
        conv_layer = nn.Sequential(
            ConvNorm(hparams.encoder_embedding_dim,
                     hparams.encoder_embedding_dim,
                     kernel_size=hparams.encoder_kernel_size, stride=1,
                     padding=int((hparams.encoder_kernel_size - 1) / 2),
                     dilation=1, w_init_gain='relu'),
            nn.BatchNorm1d(hparams.encoder_embedding_dim))
        convolutions.append(conv_layer)
    self.convolutions = nn.ModuleList(convolutions)

    self.lstm = nn.LSTM(hparams.encoder_embedding_dim,
                        int(hparams.encoder_embedding_dim / 2), 1,
                        batch_first=True, bidirectional=True)
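# Hedged sketch of the matching forward pass (as in the reference Tacotron 2
# encoder; this repo's actual forward is not shown): ReLU + dropout after each
# conv block, then the bidirectional LSTM over the time axis. Assumes
# `import torch.nn.functional as F` and
# `from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence`.
def forward(self, x, input_lengths):
    # x: (batch, encoder_embedding_dim, time)
    for conv in self.convolutions:
        x = F.dropout(F.relu(conv(x)), 0.5, self.training)
    x = x.transpose(1, 2)  # LSTM expects (batch, time, channels)
    x = pack_padded_sequence(x, input_lengths.cpu(), batch_first=True)
    self.lstm.flatten_parameters()
    outputs, _ = self.lstm(x)
    outputs, _ = pad_packed_sequence(outputs, batch_first=True)
    return outputs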
def __init__(self, hparams):
    super(Encoder, self).__init__()
    self.encoder_speaker_embed_dim = hparams.encoder_speaker_embed_dim
    self.encoder_concat_speaker_embed = hparams.encoder_concat_speaker_embed
    if self.encoder_concat_speaker_embed == 'before':
        self.conv_dim = hparams.encoder_embedding_dim
    elif self.encoder_concat_speaker_embed == 'inside':
        self.conv_dim = hparams.symbols_embedding_dim
    else:
        # a bare `raise` is only valid inside an exception handler,
        # so raise an explicit error with the message instead
        raise ValueError(
            f'encoder_concat_speaker_embed has invalid value '
            f'{hparams.encoder_concat_speaker_embed}, '
            f'valid values are "before", "inside".')

    convolutions = []
    for _ in range(hparams.encoder_n_convolutions):
        conv_layer = nn.Sequential(
            ConvNorm(self.conv_dim, self.conv_dim,
                     kernel_size=hparams.encoder_kernel_size, stride=1,
                     padding=int((hparams.encoder_kernel_size - 1) / 2),
                     dilation=1, w_init_gain='relu'),
            nn.BatchNorm1d(self.conv_dim))
        convolutions.append(conv_layer)
    self.convolutions = nn.ModuleList(convolutions)

    self.lstm = nn.LSTM(hparams.encoder_embedding_dim,
                        int(hparams.encoder_embedding_dim / 2), 1,
                        batch_first=True, bidirectional=True)
    self.LReLU = nn.LeakyReLU(negative_slope=0.01)  # LeakyReLU
def __init__(self, hparams):
    super(NeuralConcatenativeSpeechSynthesis, self).__init__()
    self.n_mel_channels = hparams.n_mel_channels
    self.audio_prenet = Prenet(hparams.n_mel_channels,
                               [hparams.prenet_dim, hparams.prenet_dim],
                               hparams)
    self.target_audio_prenet = TargetPrenet(
        hparams.n_mel_channels, [hparams.prenet_dim, hparams.prenet_dim])
    self.text_prenet = ConvNorm(hparams.symbols_embedding_dim,
                                hparams.symbols_embedding_dim,
                                kernel_size=hparams.decoder_kernel_size,
                                stride=hparams.text_stride)
    self.embedding = nn.Embedding(hparams.n_symbols,
                                  hparams.symbols_embedding_dim)

    # Text-to-audio seq2seq (alignment module 1)
    self.glued_mel_encoder = AudioEncoder(hparams.prenet_dim,
                                          hparams.encoder_rnn_dim)
    self.glued_text_decoder = AttentionDecoder(hparams.symbols_embedding_dim,
                                               hparams.decoder_rnn_dim,
                                               hparams.encoder_rnn_dim)

    # Text-to-text seq2seq (pseudo alignment module 2)
    self.target_text_decoder = AttentionDecoder(hparams.symbols_embedding_dim,
                                                hparams.decoder_rnn_dim,
                                                hparams.decoder_rnn_dim)

    # Decoder
    self.decoder = RecurrentDecoder(hparams.prenet_dim,
                                    hparams.mel_decoder_rnn_dim,
                                    hparams.prenet_dim,
                                    hparams.n_mel_channels, hparams)
    self.postnet = LinearNorm(hparams.prenet_dim, hparams.n_mel_channels)
def __init__(self, hparams):
    super(Encoder, self).__init__()

    convolutions = []
    for _ in range(hparams.encoder_n_convolutions):
        conv_layer = nn.Sequential(
            ConvNorm(hparams.encoder_embedding_dim,
                     hparams.encoder_embedding_dim,
                     kernel_size=hparams.encoder_kernel_size, stride=1,
                     padding=int((hparams.encoder_kernel_size - 1) / 2),
                     dilation=1, w_init_gain='relu'),
            nn.BatchNorm1d(hparams.encoder_embedding_dim))
        convolutions.append(conv_layer)
    self.convolutions = nn.ModuleList(convolutions)

    self.lstm = nn.LSTM(hparams.encoder_embedding_dim,
                        int(hparams.encoder_embedding_dim / 2), 1,
                        batch_first=True, bidirectional=True)

    # Transformer-TTS
    self.pos_emb = nn.Embedding.from_pretrained(
        get_sinusoid_encoding_table(1024, 512, padding_idx=0), freeze=True)
    self.pos_dropout = nn.Dropout(p=0.1)
    self.alpha = nn.Parameter(torch.ones(1))
    self.layers = clones(SelfAttention(hparams.encoder_embedding_dim),
                         hparams.n_attention)
    self.ffns = clones(FFN(hparams.encoder_embedding_dim),
                       hparams.n_attention)
    # nn.LayerNorm takes only the normalized shape here; the original second
    # positional argument would have been silently interpreted as `eps`
    self.norm = nn.LayerNorm(hparams.encoder_embedding_dim)
    self.concat_after = LinearNorm(
        hparams.encoder_embedding_dim + hparams.encoder_embedding_dim,
        hparams.encoder_embedding_dim)
    self.linear_norm = LinearNorm(hparams.encoder_embedding_dim,
                                  hparams.encoder_embedding_dim)
    self.pos_linear = Linear(hparams.encoder_embedding_dim,
                             hparams.encoder_embedding_dim)
def __init__(self, attention_n_filters, attention_kernel_size,
             attention_dim):
    super(LocationLayer, self).__init__()
    padding = int((attention_kernel_size - 1) / 2)
    self.location_conv = ConvNorm(2, attention_n_filters,
                                  kernel_size=attention_kernel_size,
                                  padding=padding, bias=False)
    self.location_dense = LinearNorm(attention_n_filters, attention_dim,
                                     bias=False, w_init_gain='tanh')
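# Sketch of how this layer is typically used inside location-sensitive
# attention (as in Tacotron 2): the stacked previous and cumulative attention
# weights are convolved over time, then projected to the attention space.
def forward(self, attention_weights_cat):
    # attention_weights_cat: (batch, 2, max_time)
    processed_attention = self.location_conv(attention_weights_cat)
    processed_attention = processed_attention.transpose(1, 2)  # (batch, max_time, n_filters)
    processed_attention = self.location_dense(processed_attention)
    return processed_attention  # (batch, max_time, attention_dim)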
def __init__(self, hparams):
    super(Decoder, self).__init__()
    self.n_mel_channels = hparams.n_mel_channels
    self.n_frames_per_step = hparams.n_frames_per_step
    self.encoder_embedding_dim = hparams.encoder_embedding_dim \
        + hparams.emotion_embedding_dim + hparams.speaker_embedding_dim
    self.attention_rnn_dim = hparams.attention_rnn_dim
    self.decoder_rnn_dim = hparams.decoder_rnn_dim
    self.prenet_dim = hparams.prenet_dim
    self.max_decoder_steps = hparams.max_decoder_steps
    self.gate_threshold = hparams.gate_threshold
    self.p_attention_dropout = hparams.p_attention_dropout
    self.p_decoder_dropout = hparams.p_decoder_dropout
    self.p_teacher_forcing = hparams.p_teacher_forcing

    self.prenet_f0 = ConvNorm(
        1, hparams.prenet_f0_dim,
        kernel_size=hparams.prenet_f0_kernel_size,
        padding=max(0, int(hparams.prenet_f0_kernel_size / 2)),
        bias=False, stride=1, dilation=1)

    self.prenet = Prenet(
        hparams.n_mel_channels * hparams.n_frames_per_step,
        [hparams.prenet_dim, hparams.prenet_dim])

    self.attention_rnn = nn.LSTMCell(
        hparams.prenet_dim + self.encoder_embedding_dim,
        hparams.attention_rnn_dim)

    self.attention_layer = Attention(
        hparams.attention_rnn_dim, self.encoder_embedding_dim,
        hparams.attention_dim, hparams.attention_location_n_filters,
        hparams.attention_location_kernel_size)

    self.decoder_rnn = nn.LSTMCell(
        hparams.attention_rnn_dim + self.encoder_embedding_dim,
        hparams.decoder_rnn_dim, 1)

    self.linear_projection = LinearNorm(
        hparams.decoder_rnn_dim + self.encoder_embedding_dim,
        hparams.n_mel_channels * hparams.n_frames_per_step)

    self.gate_layer = LinearNorm(
        hparams.decoder_rnn_dim + self.encoder_embedding_dim, 1,
        bias=True, w_init_gain='sigmoid')
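# In Mellotron-style decoders, the `prenet_f0` convolution above runs with a
# ReLU over the whole framewise pitch contour before per-step decoding. A
# minimal shape sketch with hypothetical sizes, assuming `decoder` is an
# instance of the class above and `import torch; import torch.nn.functional as F`:
batch_size, n_frames = 4, 200                # hypothetical sizes
f0s = torch.zeros(batch_size, 1, n_frames)   # (B, 1, T) pitch contour
f0_feats = F.relu(decoder.prenet_f0(f0s))    # (B, prenet_f0_dim, T)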
def __init__(self, config):
    super(Postnet, self).__init__()
    self.convolutions = nn.ModuleList()

    self.convolutions.append(
        nn.Sequential(
            ConvNorm(config["n_mel_channels"],
                     config["postnet_embedding_dim"],
                     kernel_size=config["postnet_kernel_size"], stride=1,
                     padding=int((config["postnet_kernel_size"] - 1) / 2),
                     dilation=1, w_init_gain='tanh'),
            nn.BatchNorm1d(config["postnet_embedding_dim"])))

    for i in range(1, config["postnet_n_convolutions"] - 1):
        self.convolutions.append(
            nn.Sequential(
                ConvNorm(config["postnet_embedding_dim"],
                         config["postnet_embedding_dim"],
                         kernel_size=config["postnet_kernel_size"], stride=1,
                         padding=int((config["postnet_kernel_size"] - 1) / 2),
                         dilation=1, w_init_gain='tanh'),
                nn.BatchNorm1d(config["postnet_embedding_dim"])))

    self.convolutions.append(
        nn.Sequential(
            ConvNorm(config["postnet_embedding_dim"],
                     config["n_mel_channels"],
                     kernel_size=config["postnet_kernel_size"], stride=1,
                     padding=int((config["postnet_kernel_size"] - 1) / 2),
                     dilation=1, w_init_gain='linear'),
            nn.BatchNorm1d(config["n_mel_channels"])))
def __init__(self, in_dim, sizes, hparams):
    super(Prenet, self).__init__()
    in_sizes = [in_dim] + sizes[:-1]
    self.layers = nn.ModuleList([
        LinearNorm(in_size, out_size, bias=False)
        for (in_size, out_size) in zip(in_sizes, sizes)
    ])
    self.convolutions = nn.Sequential(
        ConvNorm(hparams.prenet_dim, hparams.prenet_dim,
                 kernel_size=hparams.audio_kernel_size,
                 stride=hparams.audio_stride,
                 w_init_gain='relu'),
        nn.BatchNorm1d(hparams.prenet_dim))
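# The linear half of this prenet is typically applied as in Tacotron 2, with
# ReLU and dropout that stays active even at inference time. How the
# convolutional branch is combined with it is repo-specific and not shown in
# the snippet above, so this sketches only the linear path; assumes
# `import torch.nn.functional as F`.
def forward(self, x):
    for linear in self.layers:
        x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
    return x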
def __init__(self, hparams):
    super(Encoder, self).__init__()
    self.encoder_speaker_embed_dim = hparams.encoder_speaker_embed_dim
    if self.encoder_speaker_embed_dim:
        self.encoder_speaker_embedding = nn.Embedding(
            hparams.n_speakers, self.encoder_speaker_embed_dim)
    self.encoder_concat_speaker_embed = hparams.encoder_concat_speaker_embed
    self.encoder_conv_hidden_dim = hparams.encoder_conv_hidden_dim

    convolutions = []
    for _ in range(hparams.encoder_n_convolutions):
        if _ == 0:  # first conv
            if self.encoder_concat_speaker_embed == 'before_conv':
                input_dim = hparams.symbols_embedding_dim + self.encoder_speaker_embed_dim
            elif self.encoder_concat_speaker_embed == 'before_lstm':
                input_dim = hparams.symbols_embedding_dim
            else:
                raise NotImplementedError(
                    f'encoder_concat_speaker_embed has invalid value '
                    f'{hparams.encoder_concat_speaker_embed}, valid values '
                    f'are "before_conv", "before_lstm".')
        else:
            input_dim = self.encoder_conv_hidden_dim

        if _ == hparams.encoder_n_convolutions - 1:  # last conv
            if self.encoder_concat_speaker_embed == 'before_conv':
                output_dim = hparams.encoder_LSTM_dim
            elif self.encoder_concat_speaker_embed == 'before_lstm':
                output_dim = hparams.encoder_LSTM_dim - self.encoder_speaker_embed_dim
        else:
            output_dim = self.encoder_conv_hidden_dim

        conv_layer = nn.Sequential(
            ConvNorm(input_dim, output_dim,
                     kernel_size=hparams.encoder_kernel_size, stride=1,
                     padding=int((hparams.encoder_kernel_size - 1) / 2),
                     dilation=1, w_init_gain='relu'),
            nn.BatchNorm1d(output_dim))
        convolutions.append(conv_layer)
    self.convolutions = nn.ModuleList(convolutions)

    self.lstm = nn.LSTM(hparams.encoder_LSTM_dim,
                        int(hparams.encoder_LSTM_dim / 2), 1,
                        batch_first=True, bidirectional=True)
    self.LReLU = nn.LeakyReLU(negative_slope=0.01)  # LeakyReLU
def __init__(self, attention_n_filters, attention_kernel_size,
             attention_dim):
    super(LocationLayer, self).__init__()  # does every class's __init__ need this line?
    padding = int((attention_kernel_size - 1) / 2)
    self.location_conv = ConvNorm(
        2, attention_n_filters,  # ConvNorm is a class defined in layers.py
        kernel_size=attention_kernel_size,
        padding=padding, bias=False,
        stride=1, dilation=1)
    self.location_dense = LinearNorm(
        attention_n_filters, attention_dim,  # LinearNorm is a class defined in layers.py
        bias=False, w_init_gain='tanh')
def __init__(self, embed_dim=512 // 2, pre_layers=3, kernel_size=5,
             dropout=0.5):
    super().__init__()
    self.conv = nn.ModuleList()
    self.dropout = nn.Dropout(dropout)
    for i in range(pre_layers):
        self.conv.append(
            nn.Sequential(
                ConvNorm(embed_dim, embed_dim,
                         kernel_size=kernel_size, stride=1,
                         padding=int((kernel_size - 1) / 2),
                         dilation=1, w_init_gain='relu'),
                nn.BatchNorm1d(embed_dim)))
def __init__(self, config):
    super(Encoder, self).__init__()

    convolutions = []
    for _ in range(config["encoder_n_convolutions"]):
        conv_layer = nn.Sequential(
            ConvNorm(config["encoder_embedding_dim"],
                     config["encoder_embedding_dim"],
                     kernel_size=config["encoder_kernel_size"], stride=1,
                     padding=int((config["encoder_kernel_size"] - 1) / 2),
                     dilation=1, w_init_gain='relu'),
            nn.BatchNorm1d(config["encoder_embedding_dim"]))
        convolutions.append(conv_layer)
    self.convolutions = nn.ModuleList(convolutions)

    self.lstm = nn.LSTM(config["encoder_embedding_dim"],
                        int(config["encoder_embedding_dim"] / 2), 1,
                        batch_first=True, bidirectional=True)
def __init__(self, hparams):
    super(Structure_CNN, self).__init__()
    # V = args.embed_num
    D = 1401
    # C = args.class_num
    C = 512
    Ci = 1
    Co = 100
    Ks = [3, 5, 7]

    convolutions = []
    for K in Ks:
        conv_layer = nn.Sequential(
            # ConvNorm(D, Co, kernel_size=K, stride=1,
            #          padding=int((K - 1) / 2), dilation=1,
            #          w_init_gain='relu'),
            ConvNorm(D, Co, kernel_size=K, stride=1, dilation=1,
                     w_init_gain='relu'),
            nn.BatchNorm1d(Co))
        convolutions.append(conv_layer)
    self.convolutions = nn.ModuleList(convolutions)
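# Because the padded ConvNorm call is commented out above, each branch shrinks
# the time axis by K - 1 frames, so the three branches produce different
# lengths. One hypothetical way to combine them is max-pooling over time
# before concatenation (a common Kim-2014-style text-CNN readout); this is an
# illustrative sketch, not this repo's forward. Assumes `import torch` and
# `import torch.nn.functional as F`.
def forward(self, x):
    # x: (batch, D=1401, time)
    feats = [F.relu(conv(x)) for conv in self.convolutions]  # each (batch, Co, time - K + 1)
    feats = [f.max(dim=2)[0] for f in feats]                 # (batch, Co) per branch
    return torch.cat(feats, dim=1)                           # (batch, Co * len(Ks))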
def __init__(self, hparams):
    super(Decoder, self).__init__()
    self.mellotron = hparams.mellotron
    self.disable_f0 = hparams.disable_f0
    self.n_mel_channels = hparams.n_mel_channels
    self.n_frames_per_step = hparams.n_frames_per_step
    self.encoder_dim = hparams.encoder_LSTM_dim \
        + hparams.token_embedding_size + hparams.speaker_embedding_dim
    self.attention_rnn_dim = hparams.attention_rnn_dim
    self.decoder_rnn_dim = hparams.decoder_rnn_dim
    self.prenet_dim = hparams.prenet_dim
    self.prenet_layers = hparams.prenet_layers
    self.prenet_batchnorm = hparams.prenet_batchnorm
    self.p_prenet_dropout = hparams.p_prenet_dropout
    self.max_decoder_steps = hparams.max_decoder_steps
    self.gate_threshold = hparams.gate_threshold
    self.AttRNN_extra_decoder_input = hparams.AttRNN_extra_decoder_input
    self.AttRNN_hidden_dropout_type = hparams.AttRNN_hidden_dropout_type
    self.p_AttRNN_hidden_dropout = hparams.p_AttRNN_hidden_dropout
    self.p_AttRNN_cell_dropout = hparams.p_AttRNN_cell_dropout
    self.DecRNN_hidden_dropout_type = hparams.DecRNN_hidden_dropout_type
    self.p_DecRNN_hidden_dropout = hparams.p_DecRNN_hidden_dropout
    self.p_DecRNN_cell_dropout = hparams.p_DecRNN_cell_dropout
    self.p_teacher_forcing = hparams.p_teacher_forcing
    self.teacher_force_till = hparams.teacher_force_till
    self.num_att_mixtures = hparams.num_att_mixtures
    self.extra_projection = hparams.extra_projection
    self.normalize_attention_input = hparams.normalize_attention_input
    self.normalize_AttRNN_output = hparams.normalize_AttRNN_output
    self.attention_type = hparams.attention_type
    self.attention_layers = hparams.attention_layers
    self.low_vram_inference = hparams.low_vram_inference
    self.context_frames = hparams.context_frames
    self.hide_startstop_tokens = hparams.hide_startstop_tokens

    attention_rnn_in_dim = hparams.prenet_dim + self.encoder_dim
    if not self.disable_f0:
        self.prenet_f0 = ConvNorm(
            1, hparams.prenet_f0_dim,
            kernel_size=hparams.prenet_f0_kernel_size,
            padding=max(0, int(hparams.prenet_f0_kernel_size / 2)),
            bias=False, stride=1, dilation=1)
        attention_rnn_in_dim += hparams.prenet_f0_dim

    self.prenet = Prenet(
        hparams.n_mel_channels * hparams.n_frames_per_step * self.context_frames,
        [hparams.prenet_dim] * hparams.prenet_layers,
        self.p_prenet_dropout, self.prenet_batchnorm)

    if self.AttRNN_extra_decoder_input:
        attention_rnn_in_dim += hparams.decoder_rnn_dim

    if self.AttRNN_hidden_dropout_type == 'dropout':
        self.attention_rnn = nn.LSTMCell(
            attention_rnn_in_dim,        # input_size
            hparams.attention_rnn_dim)   # hidden_size
    elif self.AttRNN_hidden_dropout_type == 'zoneout':
        self.attention_rnn = LSTMCellWithZoneout(
            attention_rnn_in_dim,        # input_size
            hparams.attention_rnn_dim,   # hidden_size
            zoneout_prob=self.p_DecRNN_hidden_dropout)
        # zoneout is applied inside LSTMCellWithZoneout, so no extra dropout
        self.p_AttRNN_hidden_dropout = 0.0

    if self.attention_type == 0:
        self.attention_layer = Attention(
            hparams.attention_rnn_dim, self.encoder_dim,
            hparams.attention_dim, hparams.attention_location_n_filters,
            hparams.attention_location_kernel_size)
    elif self.attention_type == 1:
        self.attention_layer = GMMAttention(
            hparams.num_att_mixtures, hparams.attention_layers,
            hparams.attention_rnn_dim, self.encoder_dim,
            hparams.attention_dim, hparams.attention_location_n_filters,
            hparams.attention_location_kernel_size, hparams)
    else:
        # Python has no NotImplementedException; use NotImplementedError
        raise NotImplementedError(
            "attention_type invalid, valid values are 0 and 1")

    if self.DecRNN_hidden_dropout_type == 'dropout':
        self.decoder_rnn = nn.LSTMCell(
            hparams.attention_rnn_dim + self.encoder_dim,  # input_size
            hparams.decoder_rnn_dim, 1)                    # hidden_size, bias
    elif self.DecRNN_hidden_dropout_type == 'zoneout':
        self.decoder_rnn = LSTMCellWithZoneout(
            hparams.attention_rnn_dim + self.encoder_dim,  # input_size
            hparams.decoder_rnn_dim, 1,                    # hidden_size, bias
            zoneout_prob=self.p_DecRNN_hidden_dropout)
        # zoneout is applied inside LSTMCellWithZoneout, so no extra dropout
        self.p_DecRNN_hidden_dropout = 0.0

    if self.extra_projection:
        self.linear_projection_pre = LinearNorm(
            hparams.decoder_rnn_dim + self.encoder_dim,
            hparams.decoder_rnn_dim + self.encoder_dim)

    self.linear_projection = LinearNorm(
        hparams.decoder_rnn_dim + self.encoder_dim,
        hparams.n_mel_channels * hparams.n_frames_per_step)

    self.gate_layer = LinearNorm(
        hparams.decoder_rnn_dim + self.encoder_dim, 1,
        bias=True, w_init_gain='sigmoid')