Пример #1
0
    def __init__(self, stats_file, stats_mel_file, phone_size, embed_size,
                 hidden_size, glu_num_layers, dropout,
                 dec_num_block, dec_nhead, output_dim, n_mels=80,
                 double_mel_loss=True, local_gaussian=False, device="cuda"):
        """Build the encoder, the two normalizers, the decoder and the post-nets.

        Args:
            stats_file: handed to ``GlobalMVN`` to create ``self.normalizer``.
            stats_mel_file: handed to ``GlobalMVN`` to create
                ``self.mel_normalizer``.
            phone_size, embed_size, hidden_size, dropout: forwarded to
                ``SA_Encoder``; ``embed_size`` and ``dropout`` are also
                forwarded to ``Decoder_noGLU`` and ``embed_size`` to
                ``Encoder_Postnet``.
            glu_num_layers: accepted but not used by this constructor.
            dec_num_block, dec_nhead: forwarded to ``Decoder_noGLU``.
            output_dim: second argument of the final ``module.PostNet``; also
                the decoder's third argument when the mel path is disabled.
            n_mels: the mel path is enabled when this is positive; it is then
                passed as the decoder's third argument (presumably its output
                width -- confirm against ``Decoder_noGLU``).
            double_mel_loss: only honoured when the mel path is enabled; if
                truthy an extra ``module.PostNet`` is stored as
                ``self.double_mel``.
            local_gaussian, device: forwarded to ``Decoder_noGLU``.
        """
        super(Transformer_noGLUSVS_norm, self).__init__()

        self.encoder = SA_Encoder(phone_size, embed_size, hidden_size, dropout)
        # FIXME: add an utterance-level normalizer.
        self.normalizer = GlobalMVN(stats_file)
        self.mel_normalizer = GlobalMVN(stats_mel_file)
        self.enc_postnet = Encoder_Postnet(embed_size)

        self.use_mel = n_mels > 0
        # The double-mel option is meaningless without a mel path.
        self.double_mel_loss = double_mel_loss if self.use_mel else False

        # Third decoder argument / first post-net argument: n_mels on the mel
        # path, output_dim otherwise.
        decoder_dim = n_mels if self.use_mel else output_dim

        # Construction order (decoder, optional double_mel, postnet) is kept
        # so that parameter registration order does not change.
        self.decoder = Decoder_noGLU(
            dec_num_block, embed_size, decoder_dim, dec_nhead, dropout,
            local_gaussian=local_gaussian, device=device,
        )
        if self.double_mel_loss:
            self.double_mel = module.PostNet(n_mels, n_mels, n_mels)
        self.postnet = module.PostNet(decoder_dim, output_dim, output_dim // 2 * 2)
Пример #2
0
    def __init__(self, embed_size=512, d_model=512, d_output=1324,
                 num_layers=2, phone_size=87, n_mels=-1,
                 dropout=0.1, device="cuda", use_asr_post=False):
        """Create the GRU encoder, attention layers, GRU decoder and output head.

        Args:
            embed_size: width of the phone embedding; the encoder GRU input is
                ``embed_size + 2``.
            d_model: hidden size of both GRUs and base width of the linear
                layers.
            d_output: width of the final output (``fc_out`` or ``postnet``).
            num_layers: accepted but not used by this constructor.
            phone_size: number of entries in the phone embedding table.
            n_mels: when positive, a mel projection plus ``module.PostNet`` is
                built instead of the plain ``fc_out`` layer.
            dropout: probability for the encoder and decoder dropout layers.
            device: accepted but not used by this constructor.
            use_asr_post: accepted but not used by this constructor.
        """
        super(GRUSVS_gs, self).__init__()

        two_d = d_model * 2  # width of a bidirectional GRU output

        # --- encoder ---
        self.embedding_phone = nn.Embedding(phone_size, embed_size)
        self.rnnEncoder = nn.GRU(embed_size + 2, d_model, bidirectional=True)
        self.fcEncoder = nn.Linear(two_d, d_model)
        self.dropoutEncoder = nn.Dropout(dropout)

        # --- attention ---
        self.attn = nn.Linear(two_d + d_model, d_model)
        self.v = nn.Linear(d_model, 1, bias=False)

        # --- decoder ---
        self.rnnDecoder = nn.GRU(two_d + two_d, d_model)
        self.fc_hid1 = nn.Linear(two_d + two_d + d_model, two_d)
        self.dropoutDecoder = nn.Dropout(dropout)

        # --- output head ---
        self.use_mel = n_mels > 0
        self.n_mels = n_mels
        if not self.use_mel:
            self.fc_out = nn.Linear(two_d, d_output)
        else:
            self.output_mel = nn.Linear(two_d, n_mels)
            self.postnet = module.PostNet(n_mels, d_output, d_output // 2 * 2)

        # Parameters are re-initialised before the bookkeeping attributes
        # below are stored, exactly as in the original ordering.
        self._reset_parameters()
        self.d_model = d_model
        self.d_output = d_output
Пример #3
0
    def __init__(self, embed_size=512, d_model=512, d_output=1324,
                 num_layers=2, phone_size=87, n_mels=-1,
                 dropout=0.1, device="cuda", use_asr_post=False):
        """Create the phone / position / pitch / beat LSTM stacks and output head.

        Args:
            embed_size: width of the phone embedding and input width of
                ``linear_wrapper``; asserted to be even.
            d_model: hidden size of every LSTM and base width of the linear
                layers.
            d_output: width of ``output_fc`` and second argument of the
                post-net.
            num_layers: number of layers in each bidirectional LSTM.
            phone_size: embedding table size; with ``use_asr_post`` the input
                linear layer takes ``phone_size - 1`` features instead.
            n_mels: when positive, ``output_mel`` and ``postnet`` are built.
            dropout: dropout value passed to every LSTM.
            device: accepted but not used by this constructor.
            use_asr_post: choose a linear input layer (``input_fc``) rather
                than an embedding (``input_embed``).

        Raises:
            AssertionError: if ``embed_size`` is odd.
        """
        super(LSTMSVS, self).__init__()

        def bi_lstm():
            # Every recurrent stack in this model uses the same configuration.
            return nn.LSTM(input_size=d_model, hidden_size=d_model,
                           num_layers=num_layers, batch_first=True,
                           bidirectional=True, dropout=dropout)

        two_d = d_model * 2  # width of a bidirectional LSTM output

        # Phone input branch.
        if not use_asr_post:
            self.input_embed = nn.Embedding(phone_size, embed_size)
        else:
            self.input_fc = nn.Linear(phone_size - 1, d_model)
        self.linear_wrapper = nn.Linear(embed_size, d_model)
        self.phone_lstm = bi_lstm()
        self.linear_wrapper2 = nn.Linear(two_d, d_model)

        # Position branch.
        self.pos = module.PositionalEncoding(d_model)
        # embed_size has to be even.
        assert embed_size % 2 == 0
        self.fc_pos = nn.Linear(d_model, d_model)
        self.pos_lstm = bi_lstm()

        # Pitch branch.
        self.fc_pitch = nn.Linear(1, d_model)
        self.linear_wrapper3 = nn.Linear(two_d, d_model)
        self.pitch_lstm = bi_lstm()

        # Beat branch: beats take only the two values 0 and 1.
        self.emb_beats = nn.Embedding(2, d_model)
        self.linear_wrapper4 = nn.Linear(two_d, d_model)
        self.beats_lstm = bi_lstm()

        # Output head.
        self.output_fc = nn.Linear(two_d, d_output)

        self.use_mel = n_mels > 0
        self.n_mels = n_mels
        if self.use_mel:
            self.output_mel = nn.Linear(two_d, n_mels)
            self.postnet = module.PostNet(n_mels, d_output, d_output // 2 * 2)
        else:
            # NOTE(review): output_fc is built a second time on this path;
            # kept as-is so random-number consumption during construction
            # matches the original.
            self.output_fc = nn.Linear(two_d, d_output)

        # Re-initialise parameters before storing the bookkeeping attributes,
        # in the same order as the original.
        self._reset_parameters()

        self.use_asr_post = use_asr_post
        self.d_model = d_model
Пример #4
0
    def __init__(self, phone_size, embed_size, hidden_size, glu_num_layers, dropout, dec_num_block,
            dec_nhead, output_dim, n_mels=80, double_mel_loss=True, local_gaussian=False, device="cuda"):
        """Initialise the parent model, then install this class's encoder,
        decoder and post-nets.

        Args:
            phone_size, embed_size, hidden_size, dropout: forwarded to the
                parent constructor and to ``SA_Encoder``; ``embed_size`` and
                ``dropout`` are also forwarded to ``Decoder``.
            glu_num_layers: forwarded to the parent constructor only.
            dec_num_block, dec_nhead: forwarded to the parent constructor and
                to ``Decoder``.
            output_dim: forwarded to the parent constructor; second argument
                of the final ``module.PostNet`` and, when the mel path is
                disabled, third argument of ``Decoder``.
            n_mels: the mel path is enabled when this is positive; it is then
                passed as the decoder's third argument (presumably its output
                width -- confirm against ``Decoder``).
            double_mel_loss: only honoured when the mel path is enabled; if
                truthy an extra ``module.PostNet`` is stored as
                ``self.double_mel``.
            local_gaussian: forwarded to the parent constructor and to
                ``Decoder``.
            device: forwarded to the parent constructor and to ``Decoder``.
        """
        # Forward the caller's device instead of a hard-coded "cuda", so that
        # e.g. device="cpu" is respected by the parent as well. Behaviour with
        # the default argument is unchanged.
        super(TransformerSVS, self).__init__(phone_size, embed_size, hidden_size,
                glu_num_layers, dropout, dec_num_block, dec_nhead, output_dim,
                local_gaussian=local_gaussian, device=device)

        # Assigned after the parent constructor, so this takes precedence over
        # any ``encoder`` attribute the parent may have set.
        self.encoder = SA_Encoder(phone_size, embed_size, hidden_size, dropout)

        self.use_mel = (n_mels > 0)
        # The double-mel option is meaningless without a mel path.
        self.double_mel_loss = double_mel_loss if self.use_mel else False

        if self.use_mel:
            self.decoder = Decoder(dec_num_block, embed_size, n_mels, dec_nhead, dropout,
                                   local_gaussian=local_gaussian, device=device)
            if self.double_mel_loss:
                self.double_mel = module.PostNet(n_mels, n_mels, n_mels)
            self.postnet = module.PostNet(n_mels, output_dim, (output_dim // 2 * 2))
        else:
            self.decoder = Decoder(dec_num_block, embed_size, output_dim, dec_nhead, dropout,
                                   local_gaussian=local_gaussian, device=device)
            self.postnet = module.PostNet(output_dim, output_dim, (output_dim // 2 * 2))