def __init__(
    self,
    stats_file,
    stats_mel_file,
    phone_size,
    embed_size,
    hidden_size,
    glu_num_layers,
    dropout,
    dec_num_block,
    dec_nhead,
    output_dim,
    n_mels=80,
    double_mel_loss=True,
    local_gaussian=False,
    device="cuda",
):
    """Build the encoder, normalizers, decoder and post-nets.

    Args:
        stats_file: Passed to ``GlobalMVN`` to build ``self.normalizer``.
        stats_mel_file: Passed to ``GlobalMVN`` to build
            ``self.mel_normalizer``.
        phone_size: Forwarded to ``SA_Encoder``.
        embed_size: Forwarded to ``SA_Encoder``, ``Encoder_Postnet`` and
            ``Decoder_noGLU``.
        hidden_size: Forwarded to ``SA_Encoder``.
        glu_num_layers: Accepted but not referenced in this constructor.
        dropout: Forwarded to ``SA_Encoder`` and ``Decoder_noGLU``.
        dec_num_block: Forwarded to ``Decoder_noGLU``.
        dec_nhead: Forwarded to ``Decoder_noGLU``.
        output_dim: Output size of the final post-net (and of the decoder
            when no mel stage is used).
        n_mels: When positive, the decoder emits ``n_mels`` features and
            the post-net maps them to ``output_dim``; otherwise the decoder
            emits ``output_dim`` features directly.
        double_mel_loss: When true *and* ``n_mels`` is positive, an extra
            ``n_mels -> n_mels`` post-net (``self.double_mel``) is created.
        local_gaussian: Forwarded to ``Decoder_noGLU``.
        device: Forwarded to ``Decoder_noGLU``.
    """
    super(Transformer_noGLUSVS_norm, self).__init__()

    self.encoder = SA_Encoder(phone_size, embed_size, hidden_size, dropout)
    # FIXME: add an utterance-level normalizer.
    self.normalizer = GlobalMVN(stats_file)
    self.mel_normalizer = GlobalMVN(stats_mel_file)
    self.enc_postnet = Encoder_Postnet(embed_size)

    self.use_mel = n_mels > 0
    # The second mel loss only makes sense when a mel stage exists.
    self.double_mel_loss = double_mel_loss if self.use_mel else False

    # The decoder feeds the post-net, so both share this feature size.
    decoder_dim = n_mels if self.use_mel else output_dim
    self.decoder = Decoder_noGLU(
        dec_num_block,
        embed_size,
        decoder_dim,
        dec_nhead,
        dropout,
        local_gaussian=local_gaussian,
        device=device,
    )
    if self.double_mel_loss:
        self.double_mel = module.PostNet(n_mels, n_mels, n_mels)
    # Third argument is output_dim rounded down to an even number.
    self.postnet = module.PostNet(decoder_dim, output_dim, output_dim // 2 * 2)
def __init__(
    self,
    embed_size=512,
    d_model=512,
    d_output=1324,
    num_layers=2,
    phone_size=87,
    n_mels=-1,
    dropout=0.1,
    device="cuda",
    use_asr_post=False,
):
    """Create the encoder, attention and decoder layers of the GRU model.

    Args:
        embed_size: Width of the phone embedding.
        d_model: Hidden size of the encoder and decoder GRUs.
        d_output: Size of the final output features.
        num_layers: Accepted but not referenced in this constructor.
        phone_size: Number of entries in the phone embedding table.
        n_mels: When positive, a mel projection plus post-net is built
            instead of the direct ``fc_out`` projection.
        dropout: Probability used by both dropout layers.
        device: Accepted but not referenced in this constructor.
        use_asr_post: Accepted but not referenced in this constructor.
    """
    super(GRUSVS_gs, self).__init__()

    # Width of a bidirectional GRU output (forward + backward states).
    bi_dim = d_model * 2

    # --- Encoder ---
    self.embedding_phone = nn.Embedding(phone_size, embed_size)
    # The GRU input is the embedding widened by two extra features
    # (presumably pitch and beat -- confirm against forward()).
    self.rnnEncoder = nn.GRU(embed_size + 2, d_model, bidirectional=True)
    self.fcEncoder = nn.Linear(bi_dim, d_model)
    self.dropoutEncoder = nn.Dropout(dropout)

    # --- Attention ---
    self.attn = nn.Linear(bi_dim + d_model, d_model)
    self.v = nn.Linear(d_model, 1, bias=False)

    # --- Decoder ---
    self.rnnDecoder = nn.GRU(bi_dim + bi_dim, d_model)
    self.fc_hid1 = nn.Linear(bi_dim + bi_dim + d_model, bi_dim)
    self.dropoutDecoder = nn.Dropout(dropout)

    # --- Output head ---
    self.use_mel = n_mels > 0
    self.n_mels = n_mels
    if not self.use_mel:
        self.fc_out = nn.Linear(bi_dim, d_output)
    else:
        self.output_mel = nn.Linear(bi_dim, n_mels)
        # Third argument is d_output rounded down to an even number.
        self.postnet = module.PostNet(n_mels, d_output, d_output // 2 * 2)

    self._reset_parameters()

    self.d_model = d_model
    self.d_output = d_output
def __init__(
    self,
    embed_size=512,
    d_model=512,
    d_output=1324,
    num_layers=2,
    phone_size=87,
    n_mels=-1,
    dropout=0.1,
    device="cuda",
    use_asr_post=False,
):
    """Create the phone, position, pitch and beat LSTM stacks and the head.

    Args:
        embed_size: Width of the phone embedding; must be even (asserted).
        d_model: Hidden size shared by every LSTM and linear wrapper.
        d_output: Size of the final output features.
        num_layers: Number of layers in each LSTM.
        phone_size: Size of the phone vocabulary. With ``use_asr_post`` the
            input linear layer takes ``phone_size - 1`` features instead.
        n_mels: When positive, a mel projection plus post-net is built.
        dropout: Dropout passed to each LSTM.
        device: Accepted but not referenced in this constructor.
        use_asr_post: Selects a linear input layer (``input_fc``) instead
            of an embedding table (``input_embed``).
    """
    super(LSTMSVS, self).__init__()

    # Every LSTM in this model shares the same configuration.
    lstm_kwargs = dict(
        input_size=d_model,
        hidden_size=d_model,
        num_layers=num_layers,
        batch_first=True,
        bidirectional=True,
        dropout=dropout,
    )
    # Width of a bidirectional LSTM output.
    bi_dim = d_model * 2

    # --- Phone branch ---
    if use_asr_post:
        self.input_fc = nn.Linear(phone_size - 1, d_model)
    else:
        self.input_embed = nn.Embedding(phone_size, embed_size)
    self.linear_wrapper = nn.Linear(embed_size, d_model)
    self.phone_lstm = nn.LSTM(**lstm_kwargs)

    # --- Position branch ---
    self.linear_wrapper2 = nn.Linear(bi_dim, d_model)
    self.pos = module.PositionalEncoding(d_model)
    # Reminder from the original author: embed_size must be even.
    assert embed_size % 2 == 0
    self.fc_pos = nn.Linear(d_model, d_model)
    self.pos_lstm = nn.LSTM(**lstm_kwargs)

    # --- Pitch branch ---
    self.fc_pitch = nn.Linear(1, d_model)
    self.linear_wrapper3 = nn.Linear(bi_dim, d_model)
    self.pitch_lstm = nn.LSTM(**lstm_kwargs)

    # --- Beat branch ---
    # Beats take only two values (0 and 1), hence a two-row table.
    self.emb_beats = nn.Embedding(2, d_model)
    self.linear_wrapper4 = nn.Linear(bi_dim, d_model)
    self.beats_lstm = nn.LSTM(**lstm_kwargs)

    # --- Output head ---
    self.output_fc = nn.Linear(bi_dim, d_output)
    self.use_mel = n_mels > 0
    self.n_mels = n_mels
    if not self.use_mel:
        # NOTE(review): output_fc was already built above; it is rebuilt
        # here exactly as the original code did.
        self.output_fc = nn.Linear(bi_dim, d_output)
    else:
        self.output_mel = nn.Linear(bi_dim, n_mels)
        # Third argument is d_output rounded down to an even number.
        self.postnet = module.PostNet(n_mels, d_output, d_output // 2 * 2)

    self._reset_parameters()

    self.use_asr_post = use_asr_post
    self.d_model = d_model
def __init__(
    self,
    phone_size,
    embed_size,
    hidden_size,
    glu_num_layers,
    dropout,
    dec_num_block,
    dec_nhead,
    output_dim,
    n_mels=80,
    double_mel_loss=True,
    local_gaussian=False,
    device="cuda",
):
    """Initialise the parent model, then replace its encoder and decoder.

    The parent constructor is run first; ``self.encoder``,
    ``self.decoder`` and ``self.postnet`` are then assigned here with an
    ``SA_Encoder`` / ``Decoder`` pair sized according to ``n_mels``.

    Args:
        phone_size: Forwarded to the parent and to ``SA_Encoder``.
        embed_size: Forwarded to the parent, ``SA_Encoder`` and ``Decoder``.
        hidden_size: Forwarded to the parent and to ``SA_Encoder``.
        glu_num_layers: Forwarded to the parent only.
        dropout: Forwarded to the parent, ``SA_Encoder`` and ``Decoder``.
        dec_num_block: Forwarded to the parent and to ``Decoder``.
        dec_nhead: Forwarded to the parent and to ``Decoder``.
        output_dim: Output size of the final post-net (and of the decoder
            when no mel stage is used).
        n_mels: When positive, the decoder emits ``n_mels`` features and
            the post-net maps them to ``output_dim``. Not forwarded to the
            parent constructor.
        double_mel_loss: When true *and* ``n_mels`` is positive, an extra
            ``n_mels -> n_mels`` post-net (``self.double_mel``) is created.
            Not forwarded to the parent constructor.
        local_gaussian: Forwarded to the parent and to ``Decoder``.
        device: Forwarded to the parent and to ``Decoder``.
    """
    # Pass the caller's device through; it used to be hard-coded to "cuda",
    # so the parent ignored any other device requested here.
    super(TransformerSVS, self).__init__(
        phone_size,
        embed_size,
        hidden_size,
        glu_num_layers,
        dropout,
        dec_num_block,
        dec_nhead,
        output_dim,
        local_gaussian=local_gaussian,
        device=device,
    )

    self.encoder = SA_Encoder(phone_size, embed_size, hidden_size, dropout)

    self.use_mel = n_mels > 0
    # The second mel loss only makes sense when a mel stage exists.
    self.double_mel_loss = double_mel_loss if self.use_mel else False

    # The decoder feeds the post-net, so both share this feature size.
    decoder_dim = n_mels if self.use_mel else output_dim
    self.decoder = Decoder(
        dec_num_block,
        embed_size,
        decoder_dim,
        dec_nhead,
        dropout,
        local_gaussian=local_gaussian,
        device=device,
    )
    if self.double_mel_loss:
        self.double_mel = module.PostNet(n_mels, n_mels, n_mels)
    # Third argument is output_dim rounded down to an even number.
    self.postnet = module.PostNet(decoder_dim, output_dim, output_dim // 2 * 2)