Example #1
    def encode(self, input, src_len=None):
        """
        input : (batch x max_src_len) token ids
        src_len : list of true source lengths, one per batch item
        """
        batch, max_src_len = input.size()

        if src_len is None:
            src_len = [max_src_len] * batch
        res = self.enc_emb_lyr(input)  # batch x max_src_len x emb_dim #
        res = F.dropout(res, self.enc_emb_do, training=self.training)
        res = res.view(batch * max_src_len, -1)
        for ii in range(len(self.enc_prenet_lyr)):
            res = self.enc_prenet_lyr[ii](res)
            res = generator_act_fn(self.enc_prenet_fn)(res)
            res = F.dropout(res,
                            p=self.enc_prenet_do[ii],
                            training=self.training)
        res = res.view(batch, max_src_len, -1)
        res = self.enc_core_lyr(res, src_len)

        ctx = res

        # src_len is always set above, so the mask can be built directly #
        ctx_mask = Variable(
            generate_seq_mask(src_len, self, max_len=ctx.size(1)))

        self.ctx = ctx
        self.ctx_mask = ctx_mask
        self.src_len = src_len

        self.dec_att_lyr.set_ctx(ctx, ctx_mask)
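
Every snippet on this page leans on a `generate_seq_mask` helper that is not shown. A minimal sketch of what it presumably returns, inferred from the call sites above (the real implementation in the source repository may differ, e.g. in how the device argument is handled):

    import torch

    def generate_seq_mask_sketch(seq_len, device=None, max_len=None):
        # (batch x max_len) float mask: 1.0 on valid steps, 0.0 on padding
        max_len = max_len if max_len is not None else max(seq_len)
        mask = torch.zeros(len(seq_len), max_len)
        for ii, length in enumerate(seq_len):
            mask[ii, :length] = 1.0
        return mask

    # generate_seq_mask_sketch([3, 1], max_len=4)
    # -> tensor([[1., 1., 1., 0.],
    #            [1., 0., 0., 0.]])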
Example #2
    def fn_batch_ce(feat_mat, feat_len, speaker_list, train_step=True):
        feat_mat = Variable(feat_mat)
        feat_mask = Variable(generate_seq_mask(feat_len, opts['gpu']))
        speaker_list_id = [map_spk2id[x] for x in speaker_list]
        speaker_list_id = Variable(
            torchauto(model).LongTensor(speaker_list_id))

        model.reset()
        model.train(train_step)
        batch, dec_len, _ = feat_mat.size()

        pred_emb = model(feat_mat, feat_len)
        pred_softmax = model.forward_softmax(pred_emb)

        loss = criterion_ce(pred_softmax, speaker_list_id) * opts['coeff_ce']
        acc = torch.max(pred_softmax, 1)[1].data.eq(
            speaker_list_id.data).sum() / batch

        if train_step:
            opt.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm(model.parameters(),
                                          opts['grad_clip'])
            opt.step()
        return loss.data.sum(), acc
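
`torch.max(pred_softmax, 1)[1]` is the argmax over the class dimension; comparing it against the gold speaker IDs and dividing by the batch size gives the batch accuracy. The same computation on a toy batch:

    import torch

    pred = torch.tensor([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]])  # batch x classes
    gold = torch.tensor([1, 0, 0])
    pred_id = torch.max(pred, 1)[1]  # argmax over classes -> tensor([1, 0, 1])
    acc = pred_id.eq(gold).sum().item() / pred.size(0)
    print(acc)  # 2 of 3 correct -> 0.666...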
Example #3
    def encode(self, input, src_len=None):
        """
        input : (batch x max_src_len x in_size)
        src_len : list of true source lengths, one per batch item
        """
        batch, max_src_len, in_size = input.size()

        if src_len is None:
            src_len = [max_src_len] * batch
        res = input.view(batch * max_src_len, in_size)
        enc_fnn_act = getattr(F, self.enc_fnn_act)
        for ii in range(len(self.enc_fnn)):
            res = F.dropout(enc_fnn_act(self.enc_fnn[ii](res)),
                            self.enc_fnn_do[ii], self.training)
        res = res.view(batch, max_src_len, -1)
        for ii in range(len(self.enc_rnn)):
            res = pack(res, src_len, batch_first=True)
            res = self.enc_rnn[ii](res)[0]  # get h only #
            res, _ = unpack(res, batch_first=True)
            res = F.dropout(res, self.enc_rnn_do[ii], self.training)
            if self.downsampling[ii]:
                # halve the time axis and track the new lengths #
                res = res[:, 1::2]
                src_len = [x // 2 for x in src_len]
        ctx = res
        # src_len is always set above, so the mask can be built directly #
        ctx_mask = Variable(
            generate_seq_mask(src_len, self, max_len=ctx.size(1)))
        self.dec.set_ctx(ctx, ctx_mask)
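
The `res[:, 1::2]` slice keeps every other frame, halving the time axis after each downsampling layer (a pyramidal encoder in the Listen, Attend and Spell style), and the length list must be halved in step. A minimal shape check:

    import torch

    res = torch.randn(2, 10, 8)          # batch x time x dim
    src_len = [10, 7]
    res = res[:, 1::2]                   # every other frame -> batch x 5 x dim
    src_len = [x // 2 for x in src_len]  # lengths follow the halved time axis
    print(res.shape, src_len)            # torch.Size([2, 5, 8]) [5, 3]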
Example #4
    def forward(self, input, seq_len=None):
        if seq_len is not None:
            mask_input = Variable(
                generate_seq_mask(
                    seq_len=seq_len,
                    device=self).unsqueeze(-1))  # batch x seq_len x 1 #
            mask_input_conv = mask_input.transpose(1, 2)  # batch x 1 x seq_len
        else:
            mask_input = None

        if mask_input is not None:
            input = input * mask_input

        res = input
        res = res.transpose(1, 2)
        for ii in range(len(self.conv_bank_lyrs)):
            res = self.conv_bank_lyrs[ii](res)
            res = generator_act_fn(self.conv_fn_act)(res)
            if self.conv_do[ii] > 0.0:
                res = F.dropout(res,
                                p=self.conv_do[ii],
                                training=self.training)
            if mask_input is not None:
                res = res * mask_input_conv
        res = res.transpose(1, 2)  # batch x seq_len x ndim
        # apply linear layer #
        res = self.lin_pred_lyr(res)
        if mask_input is not None:
            res = res * mask_input
        return res
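
Re-applying `mask_input_conv` after every convolution is not redundant: a kernel wider than one mixes padded positions into their neighbours, so the zeros have to be restored at each layer. A toy demonstration:

    import torch
    import torch.nn as nn

    x = torch.zeros(1, 1, 6)
    x[:, :, :3] = 1.0  # three valid frames followed by three padded zeros
    conv = nn.Conv1d(1, 1, kernel_size=3, padding=1, bias=False)
    y = conv(x)
    print(y[0, 0, 3:])  # the padded region is generally no longer zero
    mask = torch.tensor([[[1., 1., 1., 0., 0., 0.]]])
    print((y * mask)[0, 0, 3:])  # re-masking restores the zeros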
Example #5
def decode_greedy_tf_torch(model,
                           text_mat,
                           text_len,
                           feat_mat,
                           feat_len,
                           group,
                           feat_sil,
                           max_target=1000,
                           aux_info=None):
    """
    decode with teacher forcing, feeding the ground-truth features as decoder input
    """
    assert isinstance(model, AVAILABLE_MODEL), "model is not supported"
    if not isinstance(text_mat, Variable):
        text_mat = Variable(text_mat)
    batch = text_mat.size(0)
    model.reset()
    model.eval()
    model.encode(text_mat, text_len)
    if aux_info is not None:
        if isinstance(aux_info['speaker_vector'], list):
            aux_info['speaker_vector'] = Variable(
                tensorauto(
                    model,
                    torch.from_numpy(
                        np.stack(
                            aux_info['speaker_vector']).astype('float32'))))
        model.set_aux_info(aux_info)

    feats_core = []
    feats_att = []
    feat_sil = np.tile(feat_sil, group).astype('float32')
    feat_sil = tensorauto(
        model,
        torch.from_numpy(feat_sil).unsqueeze(0).expand(batch,
                                                       feat_sil.shape[0]))
    feat_sil_var = Variable(feat_sil)
    feat_mat_input = feat_mat[:, 0:-1]
    feat_mask = Variable(generate_seq_mask([x - 1 for x in feat_len], model))

    dec_len = feat_mat_input.size(1)
    for ii in range(dec_len):
        curr_feat, curr_decatt_res, _ = model.decode(
            feat_mat_input[:, ii], feat_mask[:, ii])

        feats_core.append(curr_feat)
        feats_att.append(curr_decatt_res['att_output']['p_ctx'])

    feats_core = torch.stack(feats_core, dim=1)
    feats_core = feats_core * feat_mask.unsqueeze(-1)
    feats_core = feats_core.view(batch, feats_core.shape[1] * group, -1)
    feats_att = torch.stack(feats_att, dim=1)
    return feats_core, feat_len, feats_att
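
The one-frame shift between `feat_mat_input` and the prediction targets is the core of teacher forcing: the ground-truth frame at step t is fed in, and the frame at step t+1 is predicted. The shift in isolation:

    import torch

    feat_mat = torch.randn(2, 6, 80)  # batch x frames x feat-dim
    feat_input = feat_mat[:, 0:-1]    # frames 0..T-2 are fed to the decoder
    feat_target = feat_mat[:, 1:]     # frames 1..T-1 are the targets
    assert feat_input.size(1) == feat_target.size(1) == feat_mat.size(1) - 1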
Example #6
    def fn_batch(feat_in_mat,
                 feat_in_len,
                 feat_out_mat,
                 feat_out_len,
                 train_step=True):

        feat_in_mat = Variable(feat_in_mat)
        feat_out_mat = Variable(feat_out_mat)

        feat_in_mask = Variable(generate_seq_mask(feat_in_len, opts['gpu']))
        feat_out_mask = Variable(generate_seq_mask(feat_out_len, opts['gpu']))
        model.train(train_step)
        pred_mat_output = model(feat_in_mat, feat_in_len)

        # main : loss mel spectrogram #
        loss_core = criterion(pred_mat_output, feat_out_mat, feat_out_mask)

        # optional : aux loss for lower frequency #
        loss_core_freq = criterion_freq(pred_mat_output, feat_out_mat,
                                        feat_out_mask)

        loss_feat = loss_core + loss_core_freq

        # combine all loss #
        loss = loss_feat

        if train_step:
            opt.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm(model.parameters(),
                                          opts['grad_clip'])
            opt.step()

            # write report #
            tf_writer.add_scalar('loss',
                                 loss.data[0],
                                 global_step=tf_writer._n_iter)
            tf_writer._n_iter += 1
        return loss.data.sum()
Example #7
    def encode(self, input, src_len=None):
        """
        input : (batch x max_src_len) token ids
        src_len : list of true source lengths, one per batch item
        """
        batch, max_src_len = input.size()

        if src_len is None:
            src_len = [max_src_len] * batch

        mask_input = Variable(
            generate_seq_mask(src_len, self,
                              max_len=input.shape[1]).unsqueeze(-1))

        res = self.enc_emb_lyr(input)  # batch x src_len x emb_dim #
        res = res * mask_input
        res = res.transpose(1, 2)  # batch x emb_dim x src_len #
        # apply enc conv
        res = self.enc_conv_lyr(res)  # batch x filter x src_len #
        res = res.transpose(1, 2)  # batch x src_len x filter #
        res = res * mask_input

        # apply enc rnn
        for ii in range(len(self.enc_rnn_lyr)):
            res = pack(res, src_len, batch_first=True)
            res = self.enc_rnn_lyr[ii](res)[0]
            res, _ = unpack(res, batch_first=True)
            if ii != len(self.enc_rnn_lyr) - 1:
                res = F.dropout(res,
                                p=self.enc_rnn_do[ii],
                                training=self.training)

        # save as context
        ctx = res

        # src_len is always set above, so reuse the input mask #
        ctx_mask = mask_input.squeeze(-1)

        self.ctx = ctx
        self.ctx_mask = ctx_mask
        self.src_len = src_len

        self.dec_att_lyr.set_ctx(ctx, ctx_mask)
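
`pack` and `unpack` here are presumably `pack_padded_sequence` and `pad_packed_sequence`; packing lets the RNN skip padded time steps entirely instead of relying on a mask. A minimal round trip (lengths must be sorted in descending order in older PyTorch):

    import torch
    from torch.nn.utils.rnn import pack_padded_sequence as pack
    from torch.nn.utils.rnn import pad_packed_sequence as unpack

    rnn = torch.nn.GRU(8, 16, batch_first=True)
    res = torch.randn(2, 5, 8)
    src_len = [5, 3]
    packed = pack(res, src_len, batch_first=True)
    out, _ = rnn(packed)
    out, out_len = unpack(out, batch_first=True)
    print(out.shape, out_len)  # torch.Size([2, 5, 16]) tensor([5, 3])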
Example #8
    def encode(self, input, input_aux, src_len=None):
        """
        input : (batch x max_src_len)
        input_aux : dict with additional info (speaker embedding ID)
        """
        batch, max_src_len = input.size()
        self.input_spk_emb = self.get_speaker_emb(input_aux['spk'])
        assert self.input_spk_emb.size(0) == batch

        if src_len is None:
            src_len = [max_src_len] * batch
        res = self.enc_emb_lyr(input)  # batch x max_src_len x emb_dim #
        res = F.dropout(res, self.enc_emb_do, self.training)
        res = res.view(batch * max_src_len, -1)
        for ii in range(len(self.enc_prenet_lyr)):
            res = self.enc_prenet_lyr[ii](res)
            res = generator_act_fn(self.enc_prenet_fn)(res)
            res = F.dropout(res,
                            p=self.enc_prenet_do[ii],
                            training=self.training)
        res = res.view(batch, max_src_len, -1)

        ### SPK ###
        # res_spk = self.spk_enc_lin_prenet_lyr(input_spk_emb).unsqueeze(1).expand_as(
        # batch, max_src_len, self.spk_emb_lyr.embedding_dim)
        # res_spk = self.spk_act_fn(res_spk)
        # res = res + res_spk
        ###########

        res = self.enc_core_lyr(res, src_len)

        ctx = res

        # src_len is always set above, so the mask can be built directly #
        ctx_mask = Variable(
            generate_seq_mask(src_len, self, max_len=ctx.size(1)))

        self.ctx = ctx
        self.ctx_mask = ctx_mask
        self.src_len = src_len

        self.dec_att_lyr.set_ctx(ctx, ctx_mask)
Example #9
    def encode(self, input, src_len=None):
        """
        input : (batch x max_src_len x in_size)
        src_len : list of true source lengths, one per batch item
        """
        batch, max_src_len, in_size = input.size()
        if src_len is None:
            src_len = [max_src_len] * batch
        res = self.enc_lyr(input, src_len)
        ctx = res['enc_output']
        ctx_len = res['enc_len']
        # src_len is always set above, so the mask can be built directly #
        ctx_mask = Variable(
            generate_seq_mask(seq_len=src_len,
                              device=self,
                              max_len=ctx.size(1)))
        self.dec_lyr.set_ctx(ctx, ctx_mask)
Example #10
    def fn_generate_dynamic_supervised(model_gen, feat_source, feat_source_len,
            feat_target, feat_target_len, train_step=True,
            tfboard_writer=None, niter=0, opt=None, model_name=''):
        if not isinstance(feat_source, Variable):
            feat_source = Variable(feat_source)

        if not isinstance(feat_target, Variable):
            feat_target = Variable(feat_target)

        feat_target_input = feat_target[:, 0:-1]
        feat_target_output = feat_target[:, 1:]
        feat_target_input_len = [x - 1 for x in feat_target_len]
        feat_target_mask = Variable(generate_seq_mask(
            seq_len=feat_target_input_len, device=opts['gpu']))
        batch, dec_len, _ = feat_target_input.size()

        model_gen.reset()
        model_gen.train(train_step)
        model_gen.encode(feat_source, feat_source_len)
        list_dec_core = []
        for ii in range(dec_len):
            _dec_core_ii, _ = model_gen.decode(
                feat_target_input[:, ii],
                feat_target_mask[:, ii] if opts['mask_dec'] else None)
            list_dec_core.append(_dec_core_ii)
        dec_core = torch.stack(list_dec_core, 1)

        # calculate loss and update #
        loss_sup = criterion_recon(feat_target_output, dec_core,
                mask=None)  # TODO : decide use mask or not #
        if train_step:
            opt.zero_grad()
            loss_sup.backward()
            # clip only after backward(), once the gradients exist #
            torch.nn.utils.clip_grad_norm(model_gen.parameters(),
                                          opts['grad_clip'])
            opt.step()

        # log #
        if tfboard_writer is not None:
            tfboard_writer.add_scalar('loss/sup {}'.format(model_name),
                    loss_sup.data.cpu().numpy(), niter)
Example #11
    def forward(self, input, input_len=None):
        batch, max_input_len, in_size = input.size()

        # apply masking #
        if input_len is not None:
            mask_input = Variable(
                generate_seq_mask(input_len,
                                  device=self,
                                  max_len=max_input_len).unsqueeze(-1))
            input = input * mask_input

        # convert to [batch, channel, seq_len, n_dim] with a singleton channel #
        res = input.unsqueeze(1)

        # apply conv
        for ii in range(self.num_layers):
            res = self.conv_lyr[ii](res)
            res = generator_act_fn(self.conv_fn_act)(res)
            res = self.resblock_lyr[ii](res)

        # res = [batch, out_channel, seq_len, n_dim] #
        # pool across seq_len #
        if self.pool_fn == 'avg':
            res = F.avg_pool2d(res, kernel_size=[res.size(2), 1], stride=1)
        elif self.pool_fn == 'max':
            res = F.max_pool2d(res, kernel_size=[res.size(2), 1], stride=1)
        else:
            raise ValueError("pool_fn {} is not implemented".format(
                self.pool_fn))

        # res = [batch, out_channel, 1, n_dim] #
        # pool across n_dim #
        res = F.avg_pool2d(res, kernel_size=[1, res.size(-1)], stride=1)
        # res = [batch, out_channel, 1, 1] #
        res = res.squeeze(-1).squeeze(-1)  # res = [batch, out_channel]
        # affine transform into the embedding space #
        res = self.lin_emb_lyr(res)
        # normalize to unit-norm #
        res = res / torch.norm(res, p=2, dim=1, keepdim=True)
        return res
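
Dividing by the L2 norm projects each embedding onto the unit hypersphere, so similarity between speakers reduces to a dot product. `F.normalize` computes the same thing in one call, with an epsilon guard against zero vectors:

    import torch
    import torch.nn.functional as F

    emb = torch.randn(4, 128)
    unit_a = emb / torch.norm(emb, p=2, dim=1, keepdim=True)
    unit_b = F.normalize(emb, p=2, dim=1)  # equivalent, numerically safer
    assert torch.allclose(unit_a, unit_b)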
Example #12
    def fn_batch(text_mat,
                 text_len,
                 feat_mat,
                 feat_len,
                 aux_info=None,
                 train_step=True):
        text_mat = Variable(text_mat)
        feat_mat_input = Variable(feat_mat[:, 0:-1])
        feat_mat_output = Variable(feat_mat[:, 1:])

        feat_mask = Variable(
            generate_seq_mask([x - 1 for x in feat_len], opts['gpu']))

        feat_label_end = Variable(
            1. - generate_seq_mask([x - 1 - opts['pad_sil'] for x in feat_len],
                                   opts['gpu'],
                                   max_len=feat_mask.size(1)))
        model.reset()
        model.train(train_step)
        model.encode(text_mat, text_len)

        # additional input condition
        if model.TYPE == TacotronType.MULTI_SPEAKER:
            aux_info['speaker_vector'] = Variable(
                tensorauto(
                    opts['gpu'],
                    torch.from_numpy(
                        np.stack(
                            aux_info['speaker_vector']).astype('float32'))))
            model.set_aux_info(aux_info)

        batch, dec_len, _ = feat_mat_input.size()
        list_dec_core = []
        list_dec_core_bernoulli_end = []
        list_dec_att = []
        for ii in range(dec_len):
            _dec_core_ii, _dec_att_ii, _dec_core_bernoulli_end = model.decode(
                feat_mat_input[:, ii],
                feat_mask[:, ii] if opts['mask_dec'] else None)
            list_dec_core.append(_dec_core_ii)
            list_dec_core_bernoulli_end.append(_dec_core_bernoulli_end)
            list_dec_att.append(_dec_att_ii['att_output']['p_ctx'])

        dec_core = torch.stack(list_dec_core, 1)
        dec_core_bernoulli_end = torch.cat(list_dec_core_bernoulli_end, 1)
        dec_att = torch.stack(list_dec_att, dim=1)

        # main : loss mel spectrogram #
        loss_core = criterion(dec_core, feat_mat_output, feat_mask)

        # optional : aux loss for lower frequency #
        loss_core_freq = 1 * criterion_freq(dec_core, feat_mat_output,
                                            feat_mask)

        loss_feat = loss_core + loss_core_freq

        # main : frame ending prediction #
        loss_core_bernoulli_end = F.binary_cross_entropy_with_logits(
            dec_core_bernoulli_end, feat_label_end) * opts['coeff_bern']
        acc_core_bernoulli_end = ((dec_core_bernoulli_end > 0.0) == (
            feat_label_end > 0.5)).float().mean()

        # optional : aux loss for encourage diagonal attention #
        loss_diag_att = 1 * criterion_diag_att(
            dec_att, dec_len=[x - 1 for x in feat_len], enc_len=text_len)

        # combine all loss #
        loss = loss_feat + loss_core_bernoulli_end + loss_diag_att

        if train_step:
            opt.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm(model.parameters(),
                                          opts['grad_clip'])
            opt.step()

            # write report #
            tf_writer.add_scalar('loss',
                                 loss.data[0],
                                 global_step=tf_writer._n_iter)
            tf_writer.add_scalar('loss_feat',
                                 loss_feat.data[0],
                                 global_step=tf_writer._n_iter)
            tf_writer.add_scalar('loss_bern_end',
                                 loss_core_bernoulli_end.data[0],
                                 global_step=tf_writer._n_iter)
            if opts['loss_diag_att_cfg'] is not None:
                tf_writer.add_scalar('loss_diag_att',
                                     loss_diag_att.data[0],
                                     global_step=tf_writer._n_iter)
            tf_writer._n_iter += 1

        return (loss.data.sum(), loss_feat.data.sum(),
                loss_core_bernoulli_end.data.sum(),
                acc_core_bernoulli_end.data.sum())
Example #13
    def fn_batch_tts(model,
                     text_mat,
                     text_len,
                     feat_mat,
                     feat_len,
                     aux_info=None,
                     train_step=True,
                     coeff_loss=1):
        # refit data #
        if max(feat_len) != feat_mat.shape[1]:
            feat_mat = feat_mat[:, 0:max(feat_len)]
        if max(text_len) != text_mat.shape[1]:
            text_mat = text_mat[:, 0:max(text_len)]
        batch_size = text_mat.shape[0]
        if not isinstance(text_mat, Variable):
            text_mat = Variable(text_mat)
        if not isinstance(feat_mat, Variable):
            feat_mat = Variable(feat_mat)
        feat_mat_input = feat_mat[:, 0:-1]
        feat_mat_output = feat_mat[:, 1:]

        feat_mask = Variable(
            generate_seq_mask([x - 1 for x in feat_len], opts['gpu']))

        feat_label_end = Variable(
            1. -
            generate_seq_mask([x - 1 - opts['tts_pad_sil'] for x in feat_len],
                              opts['gpu'],
                              max_len=feat_mask.size(1)))
        model.reset()
        model.train(train_step)
        model.encode(text_mat, text_len)

        # additional input condition
        if model.TYPE == TacotronType.MULTI_SPEAKER:
            aux_info['speaker_vector'] = Variable(
                tensorauto(
                    opts['gpu'],
                    torch.from_numpy(
                        np.stack(
                            aux_info['speaker_vector']).astype('float32'))))
            model.set_aux_info(aux_info)

        batch, dec_len, _ = feat_mat_input.size()
        list_dec_core = []
        list_dec_core_bernoulli_end = []
        list_dec_att = []
        for ii in range(dec_len):
            _dec_core_ii, _dec_att_ii, _dec_core_bernoulli_end = model.decode(
                feat_mat_input[:, ii],
                feat_mask[:, ii] if opts['tts_mask_dec'] else None)
            list_dec_core.append(_dec_core_ii)
            list_dec_core_bernoulli_end.append(_dec_core_bernoulli_end)
            list_dec_att.append(_dec_att_ii['att_output']['p_ctx'])

        dec_core = torch.stack(list_dec_core, 1)
        dec_core_bernoulli_end = torch.cat(list_dec_core_bernoulli_end, 1)
        dec_att = torch.stack(list_dec_att, dim=1)

        # main : loss mel spectrogram #
        loss_core = tts_loss(dec_core, feat_mat_output, feat_mask)

        # optional : aux loss for lower frequency #
        loss_core_freq = 1 * tts_loss_freq(dec_core, feat_mat_output,
                                           feat_mask)

        loss_feat = loss_core + loss_core_freq

        # optional : aux loss for speaker embedding reconstruction #
        if model.TYPE == TacotronType.MULTI_SPEAKER:
            loss_spk_emb = tts_loss_spk_emb(
                dec_core.view(batch_size, -1, NDIM_FEAT),
                [x * opts['tts_group'] for x in feat_len],
                aux_info['speaker_vector'])
        else:
            loss_spk_emb = Variable(torchauto(opts['gpu']).FloatTensor([0.0]))

        # main : frame ending prediction #
        loss_core_bernoulli_end = F.binary_cross_entropy_with_logits(
            dec_core_bernoulli_end, feat_label_end) * opts['tts_coeff_bern']
        acc_core_bernoulli_end = ((dec_core_bernoulli_end > 0.0) == (
            feat_label_end > 0.5)).float().mean()

        # combine all loss #
        loss = loss_feat + loss_core_bernoulli_end + loss_spk_emb
        loss = loss * coeff_loss

        if train_step:
            model.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm(model.parameters(),
                                          opts['tts_grad_clip'])
            tts_opt.step()

        return (loss.data.sum(), loss_feat.data.sum(),
                loss_core_bernoulli_end.data.sum(),
                loss_spk_emb.data.sum(), acc_core_bernoulli_end.data.sum())