Exemplo n.º 1
0
def decode_greedy_tf_torch(model,
                           text_mat,
                           text_len,
                           feat_mat,
                           feat_len,
                           group,
                           feat_sil,
                           max_target=1000,
                           aux_info=None):
    """
    Decode with teacher forcing: the ground-truth feature frame at step ii
    is fed as the decoder input for step ii (no free-running feedback).

    Args:
        model: Tacotron-style model (must be an AVAILABLE_MODEL instance).
        text_mat: (batch x max_text_len) input token ids.
        text_len: list of per-sample text lengths.
        feat_mat: (batch x max_feat_len x group*ndim) ground-truth frames.
        feat_len: list of per-sample (grouped) feature lengths.
        group: number of raw frames merged into one decoder step.
        feat_sil: unused here (kept for signature parity with the
            free-running decoder, which bootstraps from silence).
        max_target: unused here (teacher forcing runs exactly len-1 steps).
        aux_info: optional dict with 'speaker_vector' for multi-speaker
            models; modified in place when given as a list.

    Returns:
        (feats_core, feat_len, feats_att): predicted features ungrouped to
        raw frames (batch x dec_len*group x ndim), the input feat_len
        returned unchanged (NOTE(review): the free-running counterpart
        scales feat_len by `group` after the same reshape -- confirm which
        convention callers expect), and the stacked per-step attention.
    """
    assert isinstance(model, AVAILABLE_MODEL), "model is not supported"
    if not isinstance(text_mat, Variable):
        text_mat = Variable(text_mat)
    batch = text_mat.size(0)
    model.reset()
    model.eval()
    model.encode(text_mat, text_len)
    if aux_info is not None:
        # speaker vectors may arrive as a list of numpy arrays; stack them
        # into one (batch x spk_dim) float tensor on the model's device
        if isinstance(aux_info['speaker_vector'], list):
            aux_info['speaker_vector'] = Variable(
                tensorauto(
                    model,
                    torch.from_numpy(
                        np.stack(
                            aux_info['speaker_vector']).astype('float32'))))
        model.set_aux_info(aux_info)

    feats_core = []
    feats_att = []
    # NOTE: the original built a silence tensor here that was never used in
    # teacher-forcing mode; that dead work has been removed.
    # teacher forcing consumes frames 0..T-2 as inputs (targets are 1..T-1)
    feat_mat_input = feat_mat[:, 0:-1]
    feat_mask = Variable(generate_seq_mask([x - 1 for x in feat_len], model))

    dec_len = feat_mat_input.size(1)
    for ii in range(dec_len):
        # third decode output (frame-end logit) is irrelevant here: the
        # number of steps is fixed by the ground-truth length
        curr_feat, curr_decatt_res, _ = model.decode(
            feat_mat_input[:, ii], feat_mask[:, ii])
        feats_core.append(curr_feat)
        feats_att.append(curr_decatt_res['att_output']['p_ctx'])

    feats_core = torch.stack(feats_core, dim=1)
    # zero out predictions past each sample's true length
    feats_core = feats_core * feat_mask.unsqueeze(-1)
    # ungroup: (batch x dec_len x group*ndim) -> (batch x dec_len*group x ndim)
    feats_core = feats_core.view(batch, feats_core.shape[1] * group, -1)
    feats_att = torch.stack(feats_att, dim=1)
    return feats_core, feat_len, feats_att
Exemplo n.º 2
0
    def criterion_diag_att(att_mat,
                           dec_len,
                           enc_len,
                           std_dev=0.2,
                           size_average=True):
        """
        Penalize attention mass that falls far from the diagonal alignment.

        att_mat : (batch x max_dec_len x max_enc_len) attention weights.
        dec_len / enc_len : per-sample valid decoder / encoder lengths.
        std_dev : width of the gaussian band around the diagonal.
        Returns the loss scaled by the diagonal-attention decay schedule,
        or 0 when the loss is disabled in the config.
        """
        # disabled via config -> contributes nothing to the total loss
        if opts['loss_diag_att_cfg'] is None:
            return 0
        batch = att_mat.size(0)
        total = 0
        for idx in range(batch):
            dlen = dec_len[idx]
            elen = enc_len[idx]
            pos_dec = torch.arange(0, dlen)
            pos_enc = torch.arange(0, elen)
            # 1 - gaussian bump: ~0 near the relative diagonal, ~1 far away,
            # so off-diagonal attention weight is what gets penalized
            penalty = 1.0 - torch.exp(-(
                (pos_dec / dlen)[:, None] -
                (pos_enc / elen)[None, :])**2 / (2 * std_dev**2))
            penalty = Variable(tensorauto(model, penalty))  # convert to device
            total = total + (att_mat[idx, 0:dlen, 0:elen] * penalty).sum()

        if size_average:
            total = total / batch

        return total * scheduler_decay_diag_att.value
Exemplo n.º 3
0
def batch_speech(device,
                 feat_list,
                 feat_sil=None,
                 group=None,
                 start_sil=0,
                 end_sil=0):
    """
    Pad a list of variable-length feature matrices into one batch tensor.

    Args:
        device: target device selector (forwarded to tensorauto).
        feat_list: list of (len_i x ndim) numpy feature matrices.
        feat_sil: optional silence frame used as the padding value.
        group: optional frame-grouping factor; consecutive frames are
            merged group-wise and feat_sil is widened to match.
        start_sil / end_sil: number of silence frames reserved before /
            after each utterance.

    Returns:
        (feat_mat, feat_len): float tensor (batch x max_len x ndim) on
        `device` and the list of per-sample lengths (incl. added silence).
    """
    if group is not None:
        # grouping feat per x frame into 1 frame #
        feat_list = [
            group_feat_timestep(feat_ii, group) for feat_ii in feat_list
        ]
        if feat_sil is not None:
            # widen the silence frame to the grouped feature width.
            # (bugfix: tiling was previously unconditional and crashed with
            # np.tile(feat_sil, None) when group was omitted)
            feat_sil = np.tile(feat_sil, group)
    feat_len = [len(x) + start_sil + end_sil for x in feat_list]
    batch = len(feat_list)
    max_feat_len = max(feat_len)
    ndim = feat_list[0].shape[-1]

    # initialize the whole batch with silence (or zeros) so padding regions
    # and the reserved start/end slots are well-defined
    feat_mat = np.zeros((batch, max_feat_len, ndim), dtype='float32') + \
            (feat_sil if feat_sil is not None else 0)
    for ii in range(batch):
        feat_mat[ii, start_sil:start_sil + len(feat_list[ii])] = feat_list[ii]

    feat_mat = torch.from_numpy(feat_mat).float()
    feat_mat = tensorauto(device, feat_mat)
    return feat_mat, feat_len
Exemplo n.º 4
0
    def encode(self, input, src_len=None):
        """
        Encode a source sequence through FNN layers, strided CNN layers and
        an additive relative-position embedding, then register the context
        on the decoder via self.dec.set_ctx (nothing is returned).

        input : (batch x max_src_len x in_size)
        src_len : optional list of per-sample source lengths; when None,
            every sample is assumed to span max_src_len.
        """
        batch, max_src_len, in_size = input.size()

        if src_len is None:
            src_len = [max_src_len] * batch
        # flatten time into the batch dim so the FNN processes all frames at once
        res = input.view(batch * max_src_len, in_size)
        enc_fnn_act = getattr(F, self.enc_fnn_act)
        for ii in range(len(self.enc_fnn)):
            res = F.dropout(enc_fnn_act(self.enc_fnn[ii](res)),
                            self.enc_fnn_do[ii], self.training)
            pass
        # res = batch * max_src_len x ndim #
        res = res.view(batch, max_src_len,
                       res.size(1)).transpose(1, 2).unsqueeze(3)
        # res = batch x ndim x src_len x 1 #
        enc_cnn_act = getattr(F, self.enc_cnn_act)
        for ii in range(len(self.enc_cnn)):
            if self.use_pad1[ii]:
                # pad one frame at the end so the strided conv covers odd lengths
                res = F.pad(res, (0, 0, 0, 1))
            res = self.enc_cnn[ii](res)
            res = enc_cnn_act(res)
            # valid lengths shrink with each strided convolution
            src_len = [x // self.enc_cnn_strides[ii] for x in src_len]
            pass
        res = res.squeeze(3).transpose(1, 2)  # batch x src_len x ndim #
        # add position embedding #
        _pos_arr = np.arange(0, res.size(1)).astype('float32')  # src_len #
        _pos_arr = np.repeat(_pos_arr[np.newaxis, :], batch,
                             0)  # batch x src_len #
        _pos_arr /= np.array(
            src_len)[:, np.newaxis]  # divide for relative position #
        _pos_arr = tensorauto(self, torch.from_numpy(_pos_arr))
        _pos_var = Variable(_pos_arr.view(batch * _pos_arr.size(1), 1))
        # TODO : absolute or relative position #
        res_pos = self.pos_emb(_pos_var)
        res_pos = res_pos.view(batch, _pos_arr.size(1), -1)
        ctx = res + res_pos  # TODO : sum or concat ? #
        # create mask if required #
        # NOTE(review): src_len can never be None here (defaulted above),
        # so the else branch below is unreachable
        if src_len is not None:
            ctx_mask = torchauto(self).FloatTensor(batch, ctx.size(1)).zero_()
            for ii in range(batch):
                ctx_mask[ii, 0:src_len[ii]] = 1.0
            ctx_mask = Variable(ctx_mask)
        else:
            ctx_mask = None
        self.dec.set_ctx(ctx, ctx_mask)
Exemplo n.º 5
0
def batch_text(device, text_list, add_bos=True, add_eos=True):
    """
    Pack a list of token-id lists into a single PAD-filled int64 tensor.

    Args:
        device: target device selector (forwarded to tensorauto).
        text_list: list of lists of int token ids.
        add_bos / add_eos: prepend constant.BOS / append constant.EOS.

    Returns:
        (text_mat, text_len): (batch x max_len) tensor on `device` and the
        per-sample lengths after BOS/EOS insertion.
    """
    assert all(isinstance(x, list) for x in text_list)
    if add_bos:
        text_list = [[constant.BOS] + seq for seq in text_list]
    if add_eos:
        text_list = [seq + [constant.EOS] for seq in text_list]
    text_len = [len(seq) for seq in text_list]
    n_samples = len(text_list)
    text_mat = np.full((n_samples, max(text_len)), constant.PAD, dtype='int64')
    for row, seq in enumerate(text_list):
        text_mat[row, 0:len(seq)] = seq
    text_mat = tensorauto(device, torch.from_numpy(text_mat))
    return text_mat, text_len
Exemplo n.º 6
0
def greedy_decoder_torch(model, feat_source_mat, feat_source_len, feat_sil,
                         group=1):
    """
    Free-running greedy decoding for a speech-to-speech model: feed the
    model's own predicted frame back as the next input until every sample
    emits a frame close to silence, or MAX_DURATION steps elapse.

    Args:
        model: model exposing reset / eval / encode / decode.
        feat_source_mat: (batch x src_len x ndim) source features.
        feat_source_len: list of per-sample source lengths.
        feat_sil: silence feature vector for one raw frame.
        group: frame-grouping factor used to widen the silence frame.
            (Bugfix: the original read an undefined name `group` from the
            enclosing scope; it is now an explicit parameter defaulting
            to 1 -- confirm against any module-level `group` it relied on.)

    Returns:
        (feats_pred_core, alignments, feat_pred_len). Entries of
        feat_pred_len stay -1 for samples that never reached silence
        within MAX_DURATION steps.
    """
    MAX_DURATION = 400  # hardcoded decoding budget
    EOS_THRESHOLD = 0.1  # L1 distance to silence that counts as "finished"
    batch = feat_source_mat.size(0)
    model.reset()
    model.eval()
    model.encode(feat_source_mat, feat_source_len)
    # widen the one-frame silence vector to the grouped frame width and
    # replicate it across the batch
    feat_sil = np.tile(feat_sil, group).astype('float32')
    feat_sil = tensorauto(
        model,
        torch.from_numpy(feat_sil).unsqueeze(0).repeat(batch, 1))
    feat_sil_var = Variable(feat_sil)
    prev_feat = feat_sil_var  # bootstrap decoding with a silence frame
    idx = 0
    feat_pred_len = [-1 for _ in range(batch)]
    feats_pred_core = []
    alignments = []
    while True:
        curr_feat, curr_decatt_res = model.decode(prev_feat)
        feats_pred_core.append(curr_feat)
        alignments.append(curr_decatt_res['att_output']['p_ctx'])
        prev_feat = curr_feat

        # mark samples whose predicted frame is (almost) silence as finished
        # (debug print of per-step distances removed)
        dist_to_sil = (torch.abs(curr_feat - feat_sil_var)).sum(1).data
        for bb in range(batch):
            if feat_pred_len[bb] == -1 and dist_to_sil[bb] < EOS_THRESHOLD:
                feat_pred_len[bb] = idx
        idx += 1

        if idx > MAX_DURATION or all([x != -1 for x in feat_pred_len]):
            break

    feats_pred_core = torch.stack(feats_pred_core, dim=1)
    alignments = torch.stack(alignments, dim=1)
    return feats_pred_core, alignments, feat_pred_len
Exemplo n.º 7
0
    def fn_batch(text_mat,
                 text_len,
                 feat_mat,
                 feat_len,
                 aux_info=None,
                 train_step=True):
        """
        Run one teacher-forced training/eval step of the TTS model.

        text_mat : (batch x max_text_len) token ids.
        text_len : list of per-sample text lengths.
        feat_mat : (batch x max_feat_len x ndim) target feature frames.
        feat_len : list of per-sample feature lengths.
        aux_info : dict holding 'speaker_vector' (required when the model
            is multi-speaker); modified in place.
        train_step : when True, also backprops, clips gradients, steps the
            optimizer and logs scalars to tf_writer.

        Returns:
            summed scalars (loss, loss_feat, loss_bernoulli_end,
            acc_bernoulli_end).
        """
        text_mat = Variable(text_mat)
        # teacher forcing: inputs are frames 0..T-2, targets are frames 1..T-1
        feat_mat_input = Variable(feat_mat[:, 0:-1])
        feat_mat_output = Variable(feat_mat[:, 1:])

        feat_mask = Variable(
            generate_seq_mask([x - 1 for x in feat_len], opts['gpu']))

        # 1 inside the trailing (pad_sil) region, 0 elsewhere: the target
        # for the frame-end (stop token) predictor
        feat_label_end = Variable(
            1. - generate_seq_mask([x - 1 - opts['pad_sil'] for x in feat_len],
                                   opts['gpu'],
                                   max_len=feat_mask.size(1)))
        model.reset()
        model.train(train_step)
        model.encode(text_mat, text_len)

        # additional input condition
        if model.TYPE == TacotronType.MULTI_SPEAKER:
            # stack per-utterance speaker vectors into one device tensor
            aux_info['speaker_vector'] = Variable(
                tensorauto(
                    opts['gpu'],
                    torch.from_numpy(
                        np.stack(
                            aux_info['speaker_vector']).astype('float32'))))
            model.set_aux_info(aux_info)

        batch, dec_len, _ = feat_mat_input.size()
        list_dec_core = []
        list_dec_core_bernoulli_end = []
        list_dec_att = []
        # step-by-step decoding with ground-truth frames as decoder inputs
        for ii in range(dec_len):
            _dec_core_ii, _dec_att_ii, _dec_core_bernoulli_end = model.decode(
                feat_mat_input[:, ii],
                feat_mask[:, ii] if opts['mask_dec'] else None)
            list_dec_core.append(_dec_core_ii)
            list_dec_core_bernoulli_end.append(_dec_core_bernoulli_end)
            list_dec_att.append(_dec_att_ii['att_output']['p_ctx'])
            pass

        dec_core = torch.stack(list_dec_core, 1)
        dec_core_bernoulli_end = torch.cat(list_dec_core_bernoulli_end, 1)
        dec_att = torch.stack(list_dec_att, dim=1)

        # main : loss mel spectrogram #
        loss_core = criterion(dec_core, feat_mat_output, feat_mask)

        # optional : aux loss for lower frequency #
        loss_core_freq = 1 * criterion_freq(dec_core, feat_mat_output,
                                            feat_mask)

        loss_feat = loss_core + loss_core_freq

        # main : frame ending prediction #
        loss_core_bernoulli_end = F.binary_cross_entropy_with_logits(
            dec_core_bernoulli_end, feat_label_end) * opts['coeff_bern']
        # outputs are logits: > 0 is equivalent to predicted p(end) > 0.5
        acc_core_bernoulli_end = ((dec_core_bernoulli_end > 0.0) == (
            feat_label_end > 0.5)).float().mean()

        # optional : aux loss for encourage diagonal attention #
        loss_diag_att = 1 * criterion_diag_att(
            dec_att, dec_len=[x - 1 for x in feat_len], enc_len=text_len)

        # combine all loss #
        loss = loss_feat + loss_core_bernoulli_end + loss_diag_att

        if train_step:
            opt.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm(model.parameters(),
                                          opts['grad_clip'])
            opt.step()

            # write report #
            tf_writer.add_scalar('loss',
                                 loss.data[0],
                                 global_step=tf_writer._n_iter)
            tf_writer.add_scalar('loss_feat',
                                 loss_feat.data[0],
                                 global_step=tf_writer._n_iter)
            tf_writer.add_scalar('loss_bern_end',
                                 loss_core_bernoulli_end.data[0],
                                 global_step=tf_writer._n_iter)
            if opts['loss_diag_att_cfg'] is not None:
                tf_writer.add_scalar('loss_diag_att',
                                     loss_diag_att.data[0],
                                     global_step=tf_writer._n_iter)
            tf_writer._n_iter += 1

        return loss.data.sum(), loss_feat.data.sum(
        ), loss_core_bernoulli_end.data.sum(), acc_core_bernoulli_end.data.sum(
        )
Exemplo n.º 8
0
def decode_greedy_pred_torch(model,
                             text_mat,
                             text_len,
                             group,
                             feat_sil,
                             max_target=1000,
                             aux_info=None):
    """
    Free-running greedy decoding: feed the model's own predicted frame
    back as the next decoder input until every sample predicts frame-end
    (positive bernoulli-end logit) or max_target steps are reached.

    Args:
        model: Tacotron-style model (must be an AVAILABLE_MODEL instance).
        text_mat: (batch x max_text_len) input token ids.
        text_len: list of per-sample text lengths.
        group: number of raw frames merged into one decoder step.
        feat_sil: silence feature vector for one raw frame (decoder
            bootstrap input).
        max_target: hard cap on the number of decoder steps.
        aux_info: optional dict with 'speaker_vector' for multi-speaker
            models; modified in place when given as a list.

    Returns:
        (feats_core, feat_len, feats_att): predicted features ungrouped to
        raw frames, per-sample raw-frame lengths (NOTE(review): stays at
        -1*group for samples that never emitted an end flag), and the
        stacked per-step attention weights.
    """
    assert isinstance(model, AVAILABLE_MODEL), "model is not supported"
    if not isinstance(text_mat, Variable):
        text_mat = Variable(text_mat)
    batch = text_mat.size(0)
    model.reset()
    model.eval()
    model.encode(text_mat, text_len)
    if aux_info is not None:
        # stack per-utterance speaker vectors into one device tensor
        if isinstance(aux_info['speaker_vector'], list):
            aux_info['speaker_vector'] = Variable(
                tensorauto(
                    model,
                    torch.from_numpy(
                        np.stack(
                            aux_info['speaker_vector']).astype('float32'))))
        model.set_aux_info(aux_info)

    feats_core = []
    feats_att = []
    # widen one-frame silence to the grouped frame width; only used to
    # bootstrap the first decoder input
    feat_sil = np.tile(feat_sil, group).astype('float32')
    feat_sil = tensorauto(
        model,
        torch.from_numpy(feat_sil).unsqueeze(0).expand(batch,
                                                       feat_sil.shape[0]))
    prev_feat = Variable(feat_sil)  # 1 dim #
    idx = 0
    feat_len = [-1 for _ in range(batch)]
    while True:
        curr_feat, curr_decatt_res, curr_bern_end = model.decode(prev_feat)
        feats_core.append(curr_feat)
        feats_att.append(curr_decatt_res['att_output']['p_ctx'])
        idx += 1  # increase index #
        prev_feat = curr_feat

        # frame-end output is a logit (not a sigmoid): > 0 means p > 0.5.
        # record the first step at which each sample signals its end.
        # (bugfix: a per-step distance-to-silence tensor was computed here
        # and never used -- dead work removed)
        curr_bern_end = curr_bern_end[:, 0].data
        for bb in range(batch):
            if feat_len[bb] == -1 and curr_bern_end[bb] > 0.0:
                feat_len[bb] = idx

        if idx >= max_target or all([x != -1 for x in feat_len]):
            # too long or all samples already STOP
            break

    feats_core = torch.stack(feats_core, dim=1)

    # TODO : masking #

    # ungroup: (batch x steps x group*ndim) -> (batch x steps*group x ndim)
    feats_core = feats_core.view(batch, feats_core.shape[1] * group, -1)
    feat_len = [x * group for x in feat_len]
    feats_att = torch.stack(feats_att, dim=1)
    return feats_core, feat_len, feats_att
Exemplo n.º 9
0
        print('[info] load pretrained model')

        # additional #
        if opts['result'].startswith('+'):
            opts['result'] = os.path.dirname(
                opts['model_pt']) + opts['result'][1:]
            print('[info] append pretrained folder name to result')
    else:
        _model_cfg = opts['model_cfg']
        _model_cfg['enc_in_size'] = NDIM
        _model_cfg['dec_in_size'] = NVOCAB
        _model_cfg['dec_out_size'] = NVOCAB
        model = ModelSerializer.load_config(_model_cfg)

    crit_weight = tensorauto(opts['gpu'], torch.ones(NVOCAB))
    crit_weight[constant.PAD] = 0
    crit_weight = Variable(crit_weight, requires_grad=False)
    criterion = ElementwiseCrossEntropy(weight=crit_weight,
                                        label_smoothing=opts['lbl_smooth'])

    if opts['gpu'] >= 0:
        model.cuda(opts['gpu'])
        pass

    # setting optimizer #
    opt = getattr(torch.optim, opts['opt'])(model.parameters(),
                                            lr=opts['lrate'])
    scheduler = ReduceLROnPlateauEv(opt,
                                    factor=opts['reducelr']['factor'],
                                    patience=opts['reducelr']['patience'],
Exemplo n.º 10
0
                           excludes=[])
 list_saved = []
 for rr in tqdm(list(data_rr), ascii=True, ncols=50):
     # optional #
     aux_info = None
     # TODO REMOVE THIS
     # case prediction mode
     if opts['mode'] == 'pred':
         curr_key_list = text_iterator.get_key_by_index(rr)
         if model.TYPE == TacotronType.MULTI_SPEAKER:
             if opts['spkvec'] is None:
                 _spk_vec = np.stack(
                     feat_spkvec_iterator.get_feat_by_key(
                         curr_key_list)).astype('float32')
                 _spk_vec = Variable(
                     tensorauto(opts['gpu'], torch.from_numpy(_spk_vec)))
             elif os.path.exists(opts['spkvec']):
                 _spk_vec = np.load(
                     opts['spkvec'])['feat'][None, :].astype('float32')
                 _spk_vec = np.repeat(_spk_vec, len(rr), axis=0)
                 _spk_vec = Variable(
                     tensorauto(opts['gpu'], torch.from_numpy(_spk_vec)))
             else:
                 _spk_vec = feat_spkvec_iterator.get_feat_by_key(
                     opts['spkvec'])
                 _spk_vec = _spk_vec[None, :].astype('float32')
                 _spk_vec = np.repeat(_spk_vec, len(rr), axis=0)
                 _spk_vec = Variable(
                     tensorauto(opts['gpu'], torch.from_numpy(_spk_vec)))
             aux_info = {'speaker_vector': _spk_vec}
         else:
Exemplo n.º 11
0
    def fn_batch_tts(model,
                     text_mat,
                     text_len,
                     feat_mat,
                     feat_len,
                     aux_info=None,
                     train_step=True,
                     coeff_loss=1):
        """
        Run one teacher-forced training/eval step for the TTS sub-model:
        feature reconstruction loss, frame-end prediction loss, and (for
        multi-speaker models) a speaker-embedding reconstruction loss.

        coeff_loss : scalar multiplier applied to the combined loss before
            backprop (used to weight this task against others).

        Returns:
            summed scalars (loss, loss_feat, loss_bernoulli_end,
            loss_spk_emb, acc_bernoulli_end).
        """
        # refit data #
        # trim padding so matrices match the true max lengths of this batch
        if max(feat_len) != feat_mat.shape[1]:
            feat_mat = feat_mat[:, 0:max(feat_len)]
        if max(text_len) != text_mat.shape[1]:
            text_mat = text_mat[:, 0:max(text_len)]
        batch_size = text_mat.shape[0]
        if not isinstance(text_mat, Variable):
            text_mat = Variable(text_mat)
        if not isinstance(feat_mat, Variable):
            feat_mat = Variable(feat_mat)
        # teacher forcing: inputs are frames 0..T-2, targets frames 1..T-1
        feat_mat_input = feat_mat[:, 0:-1]
        feat_mat_output = feat_mat[:, 1:]

        feat_mask = Variable(
            generate_seq_mask([x - 1 for x in feat_len], opts['gpu']))

        # 1 inside the trailing (tts_pad_sil) region, 0 elsewhere: target
        # for the frame-end (stop token) predictor
        feat_label_end = Variable(
            1. -
            generate_seq_mask([x - 1 - opts['tts_pad_sil'] for x in feat_len],
                              opts['gpu'],
                              max_len=feat_mask.size(1)))
        model.reset()
        model.train(train_step)
        model.encode(text_mat, text_len)

        # additional input condition
        if model.TYPE == TacotronType.MULTI_SPEAKER:
            # stack per-utterance speaker vectors into one device tensor
            aux_info['speaker_vector'] = Variable(
                tensorauto(
                    opts['gpu'],
                    torch.from_numpy(
                        np.stack(
                            aux_info['speaker_vector']).astype('float32'))))
            model.set_aux_info(aux_info)

        batch, dec_len, _ = feat_mat_input.size()
        list_dec_core = []
        list_dec_core_bernoulli_end = []
        list_dec_att = []
        # step-by-step decoding with ground-truth frames as decoder inputs
        for ii in range(dec_len):
            _dec_core_ii, _dec_att_ii, _dec_core_bernoulli_end = model.decode(
                feat_mat_input[:, ii],
                feat_mask[:, ii] if opts['tts_mask_dec'] else None)
            list_dec_core.append(_dec_core_ii)
            list_dec_core_bernoulli_end.append(_dec_core_bernoulli_end)
            list_dec_att.append(_dec_att_ii['att_output']['p_ctx'])
            pass

        dec_core = torch.stack(list_dec_core, 1)
        dec_core_bernoulli_end = torch.cat(list_dec_core_bernoulli_end, 1)
        dec_att = torch.stack(list_dec_att, dim=1)

        # main : loss mel spectrogram #
        loss_core = tts_loss(dec_core, feat_mat_output, feat_mask)

        # optional : aux loss for lower frequency #
        loss_core_freq = 1 * tts_loss_freq(dec_core, feat_mat_output,
                                           feat_mask)

        loss_feat = loss_core + loss_core_freq

        # optional : aux loss for speaker embedding reconstruction #
        if model_tts.TYPE == TacotronType.MULTI_SPEAKER:
            # predictions are ungrouped back to raw frames before feeding
            # the speaker-embedding reconstruction loss
            loss_spk_emb = tts_loss_spk_emb(
                dec_core.view(batch_size, -1, NDIM_FEAT),
                [x * opts['tts_group'] for x in feat_len],
                aux_info['speaker_vector'])
        else:
            loss_spk_emb = Variable(torchauto(opts['gpu']).FloatTensor([0.0]))

        # main : frame ending prediction #
        loss_core_bernoulli_end = F.binary_cross_entropy_with_logits(
            dec_core_bernoulli_end, feat_label_end) * opts['tts_coeff_bern']
        # outputs are logits: > 0 is equivalent to predicted p(end) > 0.5
        acc_core_bernoulli_end = ((dec_core_bernoulli_end > 0.0) == (
            feat_label_end > 0.5)).float().mean()

        # combine all loss #
        loss = loss_feat + loss_core_bernoulli_end + loss_spk_emb
        loss = loss * coeff_loss

        # if train_step :
        # NOTE(review): zeroes grads via model.zero_grad() but steps
        # tts_opt -- fine only if tts_opt holds exactly model's params;
        # confirm against the optimizer construction site
        if train_step == True:
            model.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm(model.parameters(),
                                          opts['tts_grad_clip'])
            tts_opt.step()

        return loss.data.sum(), loss_feat.data.sum(), loss_core_bernoulli_end.data.sum(), \
                loss_spk_emb.data.sum(), acc_core_bernoulli_end.data.sum()