Пример #1
0
    def gpu_decode(feat_list,
                   gpu,
                   cvlist=None,
                   lsd_cvlist=None,
                   lsdstd_cvlist=None,
                   cvlist_dv=None,
                   lsd_cvlist_dv=None,
                   lsdstd_cvlist_dv=None,
                   f0rmse_cvlist=None,
                   f0corr_cvlist=None,
                   caprmse_cvlist=None,
                   f0rmse_cvlist_dv=None,
                   f0corr_cvlist_dv=None,
                   caprmse_cvlist_dv=None,
                   cvlist_cyc=None,
                   lsd_cvlist_cyc=None,
                   lsdstd_cvlist_cyc=None,
                   cvlist_cyc_dv=None,
                   lsd_cvlist_cyc_dv=None,
                   lsdstd_cvlist_cyc_dv=None,
                   f0rmse_cvlist_cyc=None,
                   f0corr_cvlist_cyc=None,
                   caprmse_cvlist_cyc=None,
                   f0rmse_cvlist_cyc_dv=None,
                   f0corr_cvlist_cyc_dv=None,
                   caprmse_cvlist_cyc_dv=None):
        with torch.cuda.device(gpu):
            # define model and load parameters
            with torch.no_grad():
                model_encoder_melsp = GRU_VAE_ENCODER(
                    in_dim=config.mel_dim,
                    n_spk=n_spk,
                    lat_dim=config.lat_dim,
                    hidden_layers=config.hidden_layers_enc,
                    hidden_units=config.hidden_units_enc,
                    kernel_size=config.kernel_size_enc,
                    dilation_size=config.dilation_size_enc,
                    causal_conv=config.causal_conv_enc,
                    bi=False,
                    ar=False,
                    pad_first=True,
                    right_size=config.right_size_enc)
                logging.info(model_encoder_melsp)
                model_decoder_melsp = GRU_SPEC_DECODER(
                    feat_dim=config.lat_dim + config.lat_dim_e,
                    excit_dim=config.excit_dim,
                    out_dim=config.mel_dim,
                    n_spk=n_spk,
                    hidden_layers=config.hidden_layers_dec,
                    hidden_units=config.hidden_units_dec,
                    kernel_size=config.kernel_size_dec,
                    dilation_size=config.dilation_size_dec,
                    causal_conv=config.causal_conv_dec,
                    bi=False,
                    ar=False,
                    pad_first=True,
                    right_size=config.right_size_dec)
                logging.info(model_decoder_melsp)
                model_encoder_excit = GRU_VAE_ENCODER(
                    in_dim=config.mel_dim,
                    n_spk=n_spk,
                    lat_dim=config.lat_dim_e,
                    hidden_layers=config.hidden_layers_enc,
                    hidden_units=config.hidden_units_enc,
                    kernel_size=config.kernel_size_enc,
                    dilation_size=config.dilation_size_enc,
                    causal_conv=config.causal_conv_enc,
                    bi=False,
                    ar=False,
                    pad_first=True,
                    right_size=config.right_size_enc)
                logging.info(model_encoder_excit)
                model_decoder_excit = GRU_EXCIT_DECODER(
                    feat_dim=config.lat_dim_e,
                    cap_dim=config.cap_dim,
                    n_spk=n_spk,
                    hidden_layers=config.hidden_layers_lf0,
                    hidden_units=config.hidden_units_lf0,
                    kernel_size=config.kernel_size_lf0,
                    dilation_size=config.dilation_size_lf0,
                    causal_conv=config.causal_conv_lf0,
                    bi=False,
                    ar=False,
                    pad_first=True,
                    right_size=config.right_size_lf0)
                logging.info(model_decoder_excit)
                if (config.spkidtr_dim > 0):
                    model_spkidtr = SPKID_TRANSFORM_LAYER(
                        n_spk=n_spk, spkidtr_dim=config.spkidtr_dim)
                    logging.info(model_spkidtr)
                model_encoder_melsp.load_state_dict(
                    torch.load(args.model)["model_encoder_melsp"])
                model_decoder_melsp.load_state_dict(
                    torch.load(args.model)["model_decoder_melsp"])
                model_encoder_excit.load_state_dict(
                    torch.load(args.model)["model_encoder_excit"])
                model_decoder_excit.load_state_dict(
                    torch.load(args.model)["model_decoder_excit"])
                if (config.spkidtr_dim > 0):
                    model_spkidtr.load_state_dict(
                        torch.load(args.model)["model_spkidtr"])
                model_encoder_melsp.cuda()
                model_decoder_melsp.cuda()
                model_encoder_excit.cuda()
                model_decoder_excit.cuda()
                if (config.spkidtr_dim > 0):
                    model_spkidtr.cuda()
                model_encoder_melsp.eval()
                model_decoder_melsp.eval()
                model_encoder_excit.eval()
                model_decoder_excit.eval()
                if (config.spkidtr_dim > 0):
                    model_spkidtr.eval()
                for param in model_encoder_melsp.parameters():
                    param.requires_grad = False
                for param in model_decoder_melsp.parameters():
                    param.requires_grad = False
                for param in model_encoder_excit.parameters():
                    param.requires_grad = False
                for param in model_decoder_excit.parameters():
                    param.requires_grad = False
                if (config.spkidtr_dim > 0):
                    for param in model_spkidtr.parameters():
                        param.requires_grad = False
            count = 0
            pad_left = (model_encoder_melsp.pad_left +
                        model_decoder_excit.pad_left +
                        model_decoder_melsp.pad_left) * 2
            pad_right = (model_encoder_melsp.pad_right +
                         model_decoder_excit.pad_right +
                         model_decoder_melsp.pad_right) * 2
            outpad_lefts = [None] * 5
            outpad_rights = [None] * 5
            outpad_lefts[0] = pad_left - model_encoder_melsp.pad_left
            outpad_rights[0] = pad_right - model_encoder_melsp.pad_right
            outpad_lefts[1] = outpad_lefts[0] - model_decoder_excit.pad_left
            outpad_rights[1] = outpad_rights[0] - model_decoder_excit.pad_right
            outpad_lefts[2] = outpad_lefts[1] - model_decoder_melsp.pad_left
            outpad_rights[2] = outpad_rights[1] - model_decoder_melsp.pad_right
            outpad_lefts[3] = outpad_lefts[2] - model_encoder_melsp.pad_left
            outpad_rights[3] = outpad_rights[2] - model_encoder_melsp.pad_right
            outpad_lefts[4] = outpad_lefts[3] - model_decoder_excit.pad_left
            outpad_rights[4] = outpad_rights[3] - model_decoder_excit.pad_right
            for feat_file in feat_list:
                # reconst. melsp
                logging.info("recmelsp " + feat_file)

                feat_org = read_hdf5(feat_file, "/log_1pmelmagsp")
                logging.info(feat_org.shape)

                with torch.no_grad():
                    feat = F.pad(
                        torch.FloatTensor(feat_org).cuda().unsqueeze(
                            0).transpose(1, 2), (pad_left, pad_right),
                        "replicate").transpose(1, 2)

                    spk_logits, _, lat_src, _ = model_encoder_melsp(
                        feat, sampling=False)
                    spk_logits_e, _, lat_src_e, _ = model_encoder_excit(
                        feat, sampling=False)
                    logging.info('input spkpost')
                    if outpad_rights[0] > 0:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[0]:
                                                     -outpad_rights[0]],
                                          dim=-1), 1))
                    else:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[0]:],
                                          dim=-1), 1))
                    logging.info('input spkpost_e')
                    if outpad_rights[0] > 0:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits_e[:, outpad_lefts[0]:
                                                       -outpad_rights[0]],
                                          dim=-1), 1))
                    else:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits_e[:, outpad_lefts[0]:],
                                          dim=-1), 1))

                    if config.spkidtr_dim > 0:
                        src_code = model_spkidtr((torch.ones(
                            (1, lat_src_e.shape[1])) * spk_idx).cuda().long())
                    else:
                        src_code = (torch.ones(
                            (1, lat_src_e.shape[1])) * spk_idx).cuda().long()
                    cvlf0_src, _ = model_decoder_excit(src_code, lat_src_e)

                    if model_decoder_excit.pad_right > 0:
                        lat_cat = torch.cat((
                            lat_src_e[:, model_decoder_excit.
                                      pad_left:-model_decoder_excit.pad_right],
                            lat_src[:, model_decoder_excit.
                                    pad_left:-model_decoder_excit.pad_right]),
                                            2)
                    else:
                        lat_cat = torch.cat(
                            (lat_src_e[:, model_decoder_excit.pad_left:],
                             lat_src[:, model_decoder_excit.pad_left:]), 2)
                    if config.spkidtr_dim > 0:
                        src_code = model_spkidtr((torch.ones(
                            (1, lat_cat.shape[1])) * spk_idx).cuda().long())
                    else:
                        src_code = (torch.ones(
                            (1, lat_cat.shape[1])) * spk_idx).cuda().long()
                    cvmelsp_src, _ = model_decoder_melsp(
                        lat_cat,
                        y=src_code,
                        e=cvlf0_src[:, :, :config.excit_dim])

                    spk_logits, _, lat_rec, _ = model_encoder_melsp(
                        cvmelsp_src, sampling=False)
                    spk_logits_e, _, lat_rec_e, _ = model_encoder_excit(
                        cvmelsp_src, sampling=False)
                    logging.info('rec spkpost')
                    if outpad_rights[3] > 0:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[3]:
                                                     -outpad_rights[3]],
                                          dim=-1), 1))
                    else:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[3]:],
                                          dim=-1), 1))
                    logging.info('rec spkpost_e')
                    if outpad_rights[3] > 0:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits_e[:, outpad_lefts[3]:
                                                       -outpad_rights[3]],
                                          dim=-1), 1))
                    else:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits_e[:, outpad_lefts[3]:],
                                          dim=-1), 1))

                    if config.spkidtr_dim > 0:
                        src_code = model_spkidtr((torch.ones(
                            (1, lat_rec_e.shape[1])) * spk_idx).cuda().long())
                    else:
                        src_code = (torch.ones(
                            (1, lat_rec_e.shape[1])) * spk_idx).cuda().long()
                    cvlf0_cyc, _ = model_decoder_excit(src_code, lat_rec_e)

                    if model_decoder_excit.pad_right > 0:
                        lat_cat = torch.cat((
                            lat_rec_e[:, model_decoder_excit.
                                      pad_left:-model_decoder_excit.pad_right],
                            lat_rec[:, model_decoder_excit.
                                    pad_left:-model_decoder_excit.pad_right]),
                                            2)
                    else:
                        lat_cat = torch.cat(
                            (lat_rec_e[:, model_decoder_excit.pad_left:],
                             lat_rec[:, model_decoder_excit.pad_left:]), 2)
                    if config.spkidtr_dim > 0:
                        src_code = model_spkidtr((torch.ones(
                            (1, lat_cat.shape[1])) * spk_idx).cuda().long())
                    else:
                        src_code = (torch.ones(
                            (1, lat_cat.shape[1])) * spk_idx).cuda().long()
                    cvmelsp_cyc, _ = model_decoder_melsp(
                        lat_cat,
                        y=src_code,
                        e=cvlf0_cyc[:, :, :config.excit_dim])

                    if outpad_rights[1] > 0:
                        cvlf0_src = cvlf0_src[:, outpad_lefts[1]:
                                              -outpad_rights[1]]
                    else:
                        cvlf0_src = cvlf0_src[:, outpad_lefts[1]:]
                    if outpad_rights[2] > 0:
                        cvmelsp_src = cvmelsp_src[:, outpad_lefts[2]:
                                                  -outpad_rights[2]]
                    else:
                        cvmelsp_src = cvmelsp_src[:, outpad_lefts[2]:]
                    if outpad_rights[4] > 0:
                        cvlf0_cyc = cvlf0_cyc[:, outpad_lefts[4]:
                                              -outpad_rights[4]]
                    else:
                        cvlf0_cyc = cvlf0_cyc[:, outpad_lefts[4]:]

                    feat_rec = cvmelsp_src[0].cpu().data.numpy()
                    feat_cyc = cvmelsp_cyc[0].cpu().data.numpy()

                    cvmelsp_src = np.array(cvmelsp_src[0].cpu().data.numpy(),
                                           dtype=np.float64)
                    cvlf0_src = np.array(cvlf0_src[0].cpu().data.numpy(),
                                         dtype=np.float64)

                    cvmelsp_cyc = np.array(cvmelsp_cyc[0].cpu().data.numpy(),
                                           dtype=np.float64)
                    cvlf0_cyc = np.array(cvlf0_cyc[0].cpu().data.numpy(),
                                         dtype=np.float64)

                logging.info(cvlf0_src.shape)
                logging.info(cvmelsp_src.shape)

                logging.info(cvlf0_cyc.shape)
                logging.info(cvmelsp_cyc.shape)

                melsp = np.array(feat_org)

                feat_world = read_hdf5(feat_file, "/feat_mceplf0cap")
                f0 = np.array(
                    np.rint(feat_world[:, 0]) * np.exp(feat_world[:, 1]))
                codeap = np.array(
                    np.rint(feat_world[:, 2:3]) *
                    (-np.exp(feat_world[:, 3:config.full_excit_dim])))

                cvf0_src = np.array(
                    np.rint(cvlf0_src[:, 0]) * np.exp(cvlf0_src[:, 1]))
                cvcodeap_src = np.array(
                    np.rint(cvlf0_src[:, 2:3]) * (-np.exp(cvlf0_src[:, 3:])))
                f0_rmse = np.sqrt(np.mean((cvf0_src - f0)**2))
                logging.info('F0_rmse_rec: %lf Hz' % (f0_rmse))
                cvf0_src_mean = np.mean(cvf0_src)
                f0_mean = np.mean(f0)
                f0_corr = np.sum((cvf0_src-cvf0_src_mean)*(f0-f0_mean))/\
                            (np.sqrt(np.sum((cvf0_src-cvf0_src_mean)**2))*np.sqrt(np.sum((f0-f0_mean)**2)))
                logging.info('F0_corr_rec: %lf' % (f0_corr))

                codeap_rmse = np.sqrt(
                    np.mean((cvcodeap_src - codeap)**2, axis=0))
                for i in range(codeap_rmse.shape[-1]):
                    logging.info('codeap-%d_rmse_rec: %lf dB' %
                                 (i + 1, codeap_rmse[i]))

                cvf0_cyc = np.array(
                    np.rint(cvlf0_cyc[:, 0]) * np.exp(cvlf0_cyc[:, 1]))
                cvcodeap_cyc = np.array(
                    np.rint(cvlf0_cyc[:, 2:3]) * (-np.exp(cvlf0_cyc[:, 3:])))
                f0_rmse_cyc = np.sqrt(np.mean((cvf0_cyc - f0)**2))
                logging.info('F0_rmse_cyc: %lf Hz' % (f0_rmse_cyc))
                cvf0_cyc_mean = np.mean(cvf0_cyc)
                f0_mean = np.mean(f0)
                f0_corr_cyc = np.sum((cvf0_cyc-cvf0_cyc_mean)*(f0-f0_mean))/\
                            (np.sqrt(np.sum((cvf0_cyc-cvf0_cyc_mean)**2))*np.sqrt(np.sum((f0-f0_mean)**2)))
                logging.info('F0_corr_cyc: %lf' % (f0_corr_cyc))

                codeap_rmse_cyc = np.sqrt(
                    np.mean((cvcodeap_cyc - codeap)**2, axis=0))
                for i in range(codeap_rmse_cyc.shape[-1]):
                    logging.info('codeap-%d_rmse_cyc: %lf dB' %
                                 (i + 1, codeap_rmse_cyc[i]))

                spcidx = np.array(read_hdf5(feat_file, "/spcidx_range")[0])

                melsp_rest = (np.exp(melsp) - 1) / 10000
                melsp_src_rest = (np.exp(cvmelsp_src) - 1) / 10000
                melsp_cyc_rest = (np.exp(cvmelsp_cyc) - 1) / 10000

                lsd_arr = np.sqrt(np.mean((20*(np.log10(np.clip(melsp_src_rest[spcidx], a_min=1e-16, a_max=None))\
                                                         -np.log10(np.clip(melsp_rest[spcidx], a_min=1e-16, a_max=None))))**2, axis=-1))
                lsd_mean = np.mean(lsd_arr)
                lsd_std = np.std(lsd_arr)
                logging.info("lsd_rec: %.6f dB +- %.6f" % (lsd_mean, lsd_std))

                lsd_arr = np.sqrt(np.mean((20*(np.log10(np.clip(melsp_cyc_rest[spcidx], a_min=1e-16, a_max=None))\
                                                         -np.log10(np.clip(melsp_rest[spcidx], a_min=1e-16, a_max=None))))**2, axis=-1))
                lsd_mean_cyc = np.mean(lsd_arr)
                lsd_std_cyc = np.std(lsd_arr)
                logging.info("lsd_cyc: %.6f dB +- %.6f" %
                             (lsd_mean_cyc, lsd_std_cyc))

                logging.info('org f0')
                logging.info(f0[10:15])
                logging.info('rec f0')
                logging.info(cvf0_src[10:15])
                logging.info('cyc f0')
                logging.info(cvf0_cyc[10:15])
                logging.info('org cap')
                logging.info(codeap[10:15])
                logging.info('rec cap')
                logging.info(cvcodeap_src[10:15])
                logging.info('cyc cap')
                logging.info(cvcodeap_cyc[10:15])

                dataset = feat_file.split('/')[1].split('_')[0]
                if 'tr' in dataset:
                    logging.info('trn')
                    f0rmse_cvlist.append(f0_rmse)
                    f0corr_cvlist.append(f0_corr)
                    caprmse_cvlist.append(codeap_rmse)
                    lsd_cvlist.append(lsd_mean)
                    lsdstd_cvlist.append(lsd_std)
                    cvlist.append(np.var(melsp_src_rest, axis=0))
                    logging.info(len(cvlist))
                    f0rmse_cvlist_cyc.append(f0_rmse_cyc)
                    f0corr_cvlist_cyc.append(f0_corr_cyc)
                    caprmse_cvlist_cyc.append(codeap_rmse_cyc)
                    lsd_cvlist_cyc.append(lsd_mean_cyc)
                    lsdstd_cvlist_cyc.append(lsd_std_cyc)
                    cvlist_cyc.append(np.var(melsp_cyc_rest, axis=0))
                elif 'dv' in dataset:
                    logging.info('dev')
                    f0rmse_cvlist_dv.append(f0_rmse)
                    f0corr_cvlist_dv.append(f0_corr)
                    caprmse_cvlist_dv.append(codeap_rmse)
                    lsd_cvlist_dv.append(lsd_mean)
                    lsdstd_cvlist_dv.append(lsd_std)
                    cvlist_dv.append(np.var(melsp_src_rest, axis=0))
                    logging.info(len(cvlist_dv))
                    f0rmse_cvlist_cyc_dv.append(f0_rmse_cyc)
                    f0corr_cvlist_cyc_dv.append(f0_corr_cyc)
                    caprmse_cvlist_cyc_dv.append(codeap_rmse_cyc)
                    lsd_cvlist_cyc_dv.append(lsd_mean_cyc)
                    lsdstd_cvlist_cyc_dv.append(lsd_std_cyc)
                    cvlist_cyc_dv.append(np.var(melsp_cyc_rest, axis=0))

                logging.info('write rec to h5')
                outh5dir = os.path.join(
                    os.path.dirname(os.path.dirname(feat_file)),
                    args.spk + "-" + args.spk)
                if not os.path.exists(outh5dir):
                    os.makedirs(outh5dir)
                feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
                logging.info(feat_file + ' ' + args.string_path)
                logging.info(feat_rec.shape)
                write_hdf5(feat_file, args.string_path, feat_rec)

                logging.info('write cyc to h5')
                outh5dir = os.path.join(
                    os.path.dirname(os.path.dirname(feat_file)),
                    args.spk + "-" + args.spk + "-" + args.spk)
                if not os.path.exists(outh5dir):
                    os.makedirs(outh5dir)
                feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
                logging.info(feat_file + ' ' + args.string_path)
                logging.info(feat_cyc.shape)
                write_hdf5(feat_file, args.string_path, feat_cyc)

                count += 1
Пример #2
0
    def decode_RNN(feat_list,
                   gpu,
                   cvlist=None,
                   lsd_cvlist_src=None,
                   lsdstd_cvlist_src=None,
                   lsd_cvlist_cyc=None,
                   lsdstd_cvlist_cyc=None,
                   lsd_cvlist=None,
                   lsdstd_cvlist=None,
                   lat_dist_rmse_list=None,
                   lat_dist_cosim_list=None):
        with torch.cuda.device(gpu):
            # define model and load parameters
            with torch.no_grad():
                model_encoder_melsp = GRU_VAE_ENCODER(
                    in_dim=config.mel_dim,
                    n_spk=n_spk,
                    lat_dim=config.lat_dim,
                    hidden_layers=config.hidden_layers_enc,
                    hidden_units=config.hidden_units_enc,
                    kernel_size=config.kernel_size_enc,
                    dilation_size=config.dilation_size_enc,
                    causal_conv=config.causal_conv_enc,
                    pad_first=True,
                    right_size=config.right_size_enc)
                logging.info(model_encoder_melsp)
                model_decoder_melsp = GRU_SPEC_DECODER(
                    feat_dim=config.lat_dim + config.lat_dim_e,
                    out_dim=config.mel_dim,
                    n_spk=(config.emb_spk_dim // config.n_weight_emb) *
                    config.n_weight_emb,
                    hidden_layers=config.hidden_layers_dec,
                    hidden_units=config.hidden_units_dec,
                    kernel_size=config.kernel_size_dec,
                    dilation_size=config.dilation_size_dec,
                    causal_conv=config.causal_conv_dec,
                    pad_first=True,
                    right_size=config.right_size_dec,
                    red_dim_upd=config.mel_dim,
                    pdf_gauss=True)
                logging.info(model_decoder_melsp)
                model_encoder_excit = GRU_VAE_ENCODER(
                    in_dim=config.mel_dim,
                    n_spk=n_spk,
                    lat_dim=config.lat_dim_e,
                    hidden_layers=config.hidden_layers_enc,
                    hidden_units=config.hidden_units_enc,
                    kernel_size=config.kernel_size_enc,
                    dilation_size=config.dilation_size_enc,
                    causal_conv=config.causal_conv_enc,
                    pad_first=True,
                    right_size=config.right_size_enc)
                logging.info(model_encoder_excit)
                model_spkidtr = SPKID_TRANSFORM_LAYER(
                    n_spk=n_spk,
                    emb_dim=config.emb_spk_dim,
                    n_weight_emb=config.n_weight_emb,
                    conv_emb_flag=True,
                    spkidtr_dim=config.spkidtr_dim)
                logging.info(model_spkidtr)
                model_encoder_melsp.load_state_dict(
                    torch.load(args.model)["model_encoder_melsp"])
                model_decoder_melsp.load_state_dict(
                    torch.load(args.model)["model_decoder_melsp"])
                model_encoder_excit.load_state_dict(
                    torch.load(args.model)["model_encoder_excit"])
                model_spkidtr.load_state_dict(
                    torch.load(args.model)["model_spkidtr"])
                model_encoder_melsp.cuda()
                model_decoder_melsp.cuda()
                model_encoder_excit.cuda()
                model_spkidtr.cuda()
                model_encoder_melsp.eval()
                model_decoder_melsp.eval()
                model_encoder_excit.eval()
                model_spkidtr.eval()
                model_encoder_melsp.remove_weight_norm()
                model_decoder_melsp.remove_weight_norm()
                model_encoder_excit.remove_weight_norm()
                model_spkidtr.remove_weight_norm()
                for param in model_encoder_melsp.parameters():
                    param.requires_grad = False
                for param in model_decoder_melsp.parameters():
                    param.requires_grad = False
                for param in model_encoder_excit.parameters():
                    param.requires_grad = False
                for param in model_spkidtr.parameters():
                    param.requires_grad = False
            count = 0
            pad_left = (model_encoder_melsp.pad_left +
                        model_decoder_melsp.pad_left) * 2
            pad_right = (model_encoder_melsp.pad_right +
                         model_decoder_melsp.pad_right) * 2
            outpad_lefts = [None] * 3
            outpad_rights = [None] * 3
            outpad_lefts[0] = pad_left - model_encoder_melsp.pad_left
            outpad_rights[0] = pad_right - model_encoder_melsp.pad_right
            outpad_lefts[1] = outpad_lefts[0] - model_decoder_melsp.pad_left
            outpad_rights[1] = outpad_rights[0] - model_decoder_melsp.pad_right
            outpad_lefts[2] = outpad_lefts[1] - model_encoder_melsp.pad_left
            outpad_rights[2] = outpad_rights[1] - model_encoder_melsp.pad_right
            melfb_t = np.linalg.pinv(
                librosa.filters.mel(args.fs, args.fftl, n_mels=config.mel_dim))
            temp = 0.675
            logging.info(f'temp: {temp}')
            for feat_file in feat_list:
                # convert melsp
                spk_src = os.path.basename(os.path.dirname(feat_file))
                logging.info('%s --> %s' % (spk_src, args.spk_trg))

                file_trg = os.path.join(
                    os.path.dirname(os.path.dirname(feat_file)), args.spk_trg,
                    os.path.basename(feat_file))
                trg_exist = False
                if os.path.exists(file_trg):
                    logging.info('exist: %s' % (file_trg))
                    feat_trg = read_hdf5(file_trg, "/log_1pmelmagsp")
                    logging.info(feat_trg.shape)
                    trg_exist = True

                feat_org = read_hdf5(feat_file, "/log_1pmelmagsp")
                logging.info(feat_org.shape)

                logging.info("generate")
                with torch.no_grad():
                    feat = F.pad(
                        torch.FloatTensor(feat_org).cuda().unsqueeze(
                            0).transpose(1, 2), (pad_left, pad_right),
                        "replicate").transpose(1, 2)

                    spk_logits, _, lat_src, _ = model_encoder_melsp(
                        feat, sampling=False)
                    spk_logits_e, _, lat_src_e, _ = model_encoder_excit(
                        feat, sampling=False)
                    logging.info('input spkpost')
                    if outpad_rights[0] > 0:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[0]:
                                                     -outpad_rights[0]],
                                          dim=-1), 1))
                    else:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[0]:],
                                          dim=-1), 1))
                    logging.info('input spkpost_e')
                    if outpad_rights[0] > 0:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits_e[:, outpad_lefts[0]:
                                                       -outpad_rights[0]],
                                          dim=-1), 1))
                    else:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits_e[:, outpad_lefts[0]:],
                                          dim=-1), 1))

                    if trg_exist:
                        spk_trg_logits, _, lat_trg, _ = model_encoder_melsp(
                            F.pad(
                                torch.FloatTensor(feat_trg).cuda().unsqueeze(
                                    0).transpose(1, 2),
                                (model_encoder_melsp.pad_left,
                                 model_encoder_melsp.pad_right),
                                "replicate").transpose(1, 2),
                            sampling=False)
                        spk_trg_logits_e, _, lat_trg_e, _ = model_encoder_excit(
                            F.pad(
                                torch.FloatTensor(feat_trg).cuda().unsqueeze(
                                    0).transpose(1, 2),
                                (model_encoder_excit.pad_left,
                                 model_encoder_excit.pad_right),
                                "replicate").transpose(1, 2),
                            sampling=False)
                        logging.info('target spkpost')
                        logging.info(
                            torch.mean(F.softmax(spk_trg_logits, dim=-1), 1))
                        logging.info('target spkpost_e')
                        logging.info(
                            torch.mean(F.softmax(spk_trg_logits_e, dim=-1), 1))

                    _, src_code = model_spkidtr((torch.ones(
                        (1, lat_src_e.shape[1])) * src_idx).cuda().long())
                    _, trg_code = model_spkidtr((torch.ones(
                        (1, lat_src_e.shape[1])) * trg_idx).cuda().long())
                    lat_cat = torch.cat((lat_src_e, lat_src), 2)

                    _, cvmelsp_src, _ = model_decoder_melsp(lat_cat,
                                                            y=src_code,
                                                            temp=temp)
                    _, cvmelsp, _ = model_decoder_melsp(lat_cat,
                                                        y=trg_code,
                                                        temp=temp)
                    trj_lat_cat = lat_cat_src = lat_cat

                    spk_logits, _, lat_cv, _ = model_encoder_melsp(
                        cvmelsp, sampling=False)
                    spk_logits_e, _, lat_cv_e, _ = model_encoder_excit(
                        cvmelsp, sampling=False)
                    logging.info('cv spkpost')
                    if outpad_rights[2] > 0:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[2]:
                                                     -outpad_rights[2]],
                                          dim=-1), 1))
                    else:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[2]:],
                                          dim=-1), 1))
                    logging.info('cv spkpost_e')
                    if outpad_rights[2] > 0:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits_e[:, outpad_lefts[2]:
                                                       -outpad_rights[2]],
                                          dim=-1), 1))
                    else:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits_e[:, outpad_lefts[2]:],
                                          dim=-1), 1))

                    _, src_code = model_spkidtr((torch.ones(
                        (1, lat_cv_e.shape[1])) * src_idx).cuda().long())
                    lat_cat = torch.cat((lat_cv_e, lat_cv), 2)

                    _, cvmelsp_cyc, _ = model_decoder_melsp(lat_cat,
                                                            y=src_code,
                                                            temp=temp)

                    #if outpad_rights[0] > 0:
                    #    trj_lat_cat = trj_lat_cat[:,outpad_lefts[0]:-outpad_rights[0]]
                    #else:
                    #    trj_lat_cat = trj_lat_cat[:,outpad_lefts[0]:-outpad_rights[0]]
                    if outpad_rights[1] > 0:
                        cvmelsp_src = cvmelsp_src[:, outpad_lefts[1]:
                                                  -outpad_rights[1]]
                        cvmelsp = cvmelsp[:, outpad_lefts[1]:-outpad_rights[1]]
                    else:
                        cvmelsp_src = cvmelsp_src[:, outpad_lefts[1]:]
                        cvmelsp = cvmelsp[:, outpad_lefts[1]:]

                    feat_cv = cvmelsp[0].cpu().data.numpy()
                    #feat_lat = trj_lat_cat[0].cpu().data.numpy()

                    cvmelsp_src = np.array(cvmelsp_src[0].cpu().data.numpy(),
                                           dtype=np.float64)
                    cvmelsp = np.array(cvmelsp[0].cpu().data.numpy(),
                                       dtype=np.float64)
                    cvmelsp_cyc = np.array(cvmelsp_cyc[0].cpu().data.numpy(),
                                           dtype=np.float64)

                    if trg_exist:
                        if outpad_rights[1] > 0:
                            lat_src = lat_cat_src[:, outpad_lefts[0]:
                                                  -outpad_rights[0]]
                        else:
                            lat_src = lat_cat_src[:, outpad_lefts[0]:]
                        lat_trg = torch.cat((lat_trg_e, lat_trg), 2)

                logging.info(cvmelsp_src.shape)
                logging.info(cvmelsp.shape)
                logging.info(cvmelsp_cyc.shape)

                melsp = np.array(feat_org)

                if trg_exist:
                    logging.info(lat_src.shape)
                    logging.info(lat_trg.shape)
                    melsp_trg = np.array(feat_trg)

                spcidx = np.array(read_hdf5(feat_file, "/spcidx_range")[0])

                melsp_rest = (np.exp(melsp) - 1) / 10000
                melsp_cv_rest = (np.exp(cvmelsp) - 1) / 10000
                melsp_src_rest = (np.exp(cvmelsp_src) - 1) / 10000
                melsp_cyc_rest = (np.exp(cvmelsp_cyc) - 1) / 10000

                cvlist.append(np.var(melsp_cv_rest, axis=0))

                lsd_arr = np.sqrt(np.mean((20*(np.log10(np.clip(melsp_src_rest[spcidx], a_min=1e-16, a_max=None))\
                                                         -np.log10(np.clip(melsp_rest[spcidx], a_min=1e-16, a_max=None))))**2, axis=-1))
                lsd_mean = np.mean(lsd_arr)
                lsd_std = np.std(lsd_arr)
                logging.info("lsd_src_cv: %.6f dB +- %.6f" %
                             (lsd_mean, lsd_std))
                lsd_cvlist_src.append(lsd_mean)
                lsdstd_cvlist_src.append(lsd_std)

                if trg_exist:
                    melsp_trg_rest = (np.exp(melsp_trg) - 1) / 10000

                    spcidx_trg = np.array(
                        read_hdf5(file_trg, "/spcidx_range")[0])

                    _, twf_melsp, _, _ = dtw.dtw_org_to_trg(np.array(melsp_cv_rest[spcidx], \
                                                dtype=np.float64), np.array(melsp_trg_rest[spcidx_trg], dtype=np.float64), mcd=-1)
                    twf_melsp = np.array(twf_melsp[:, 0])
                    lsd_arr = np.sqrt(np.mean((20*(np.log10(np.clip(melsp_cv_rest[twf_melsp], a_min=1e-16, a_max=None))\
                                                             -np.log10(np.clip(melsp_rest[twf_melsp], a_min=1e-16, a_max=None))))**2, axis=-1))
                    lsd_mean = np.mean(lsd_arr)
                    lsd_std = np.std(lsd_arr)
                    logging.info("lsd_trg: %.6f dB +- %.6f" %
                                 (lsd_mean, lsd_std))
                    lsd_cvlist.append(lsd_mean)
                    lsdstd_cvlist.append(lsd_std)

                    spcidx_src = torch.LongTensor(spcidx).cuda()
                    spcidx_trg = torch.LongTensor(spcidx_trg).cuda()

                    trj_lat_src = np.array(torch.index_select(
                        lat_src[0], 0, spcidx_src).cpu().data.numpy(),
                                           dtype=np.float64)
                    trj_lat_trg = np.array(torch.index_select(
                        lat_trg[0], 0, spcidx_trg).cpu().data.numpy(),
                                           dtype=np.float64)
                    aligned_lat_srctrg, _, _, _ = dtw.dtw_org_to_trg(
                        trj_lat_src, trj_lat_trg)
                    lat_dist_srctrg = np.mean(
                        np.sqrt(
                            np.mean((aligned_lat_srctrg - trj_lat_trg)**2,
                                    axis=0)))
                    _, _, lat_cdist_srctrg, _ = dtw.dtw_org_to_trg(trj_lat_trg,
                                                                   trj_lat_src,
                                                                   mcd=0)
                    aligned_lat_trgsrc, _, _, _ = dtw.dtw_org_to_trg(
                        trj_lat_trg, trj_lat_src)
                    lat_dist_trgsrc = np.mean(
                        np.sqrt(
                            np.mean((aligned_lat_trgsrc - trj_lat_src)**2,
                                    axis=0)))
                    _, _, lat_cdist_trgsrc, _ = dtw.dtw_org_to_trg(trj_lat_src,
                                                                   trj_lat_trg,
                                                                   mcd=0)
                    logging.info("%lf %lf %lf %lf" %
                                 (lat_dist_srctrg, lat_cdist_srctrg,
                                  lat_dist_trgsrc, lat_cdist_trgsrc))
                    lat_dist_rmse = (lat_dist_srctrg + lat_dist_trgsrc) / 2
                    lat_dist_cosim = (lat_cdist_srctrg + lat_cdist_trgsrc) / 2
                    lat_dist_rmse_list.append(lat_dist_rmse)
                    lat_dist_cosim_list.append(lat_dist_cosim)
                    logging.info("lat_dist: %.6f %.6f" %
                                 (lat_dist_rmse, lat_dist_cosim))

                lsd_arr = np.sqrt(np.mean((20*(np.log10(np.clip(melsp_cyc_rest[spcidx], a_min=1e-16, a_max=None))\
                                                         -np.log10(np.clip(melsp_rest[spcidx], a_min=1e-16, a_max=None))))**2, axis=-1))
                lsd_mean_cyc = np.mean(lsd_arr)
                lsd_std_cyc = np.std(lsd_arr)
                logging.info("lsd_cyc: %.6f dB +- %.6f" %
                             (lsd_mean_cyc, lsd_std_cyc))
                lsd_cvlist_cyc.append(lsd_mean_cyc)
                lsdstd_cvlist_cyc.append(lsd_std_cyc)

                logging.info("synth anasyn")
                magsp = np.matmul(melfb_t, melsp_rest.T)
                logging.info(magsp.shape)
                hop_length = int((args.fs / 1000) * args.shiftms)
                win_length = int((args.fs / 1000) * args.winms)
                wav = np.clip(
                    librosa.core.griffinlim(magsp,
                                            hop_length=hop_length,
                                            win_length=win_length,
                                            window='hann'), -1,
                    0.999969482421875)
                wavpath = os.path.join(
                    args.outdir,
                    os.path.basename(feat_file).replace(".h5", "_anasyn.wav"))
                logging.info(wavpath)
                sf.write(wavpath, wav, args.fs, 'PCM_16')

                #if trg_exist:
                #    logging.info("synth anasyn_trg")
                #    wav = np.clip(pw.synthesize(f0_trg, sp_trg, ap_trg, fs, frame_period=args.shiftms), -1, 1)
                #    wavpath = os.path.join(args.outdir,os.path.basename(feat_file).replace(".h5","_anasyn_trg.wav"))
                #    sf.write(wavpath, wav, fs, 'PCM_16')
                #    logging.info(wavpath)

                logging.info("synth gf rec")
                recmagsp = np.matmul(melfb_t, melsp_src_rest.T)
                logging.info(recmagsp.shape)
                wav = np.clip(
                    librosa.core.griffinlim(recmagsp,
                                            hop_length=hop_length,
                                            win_length=win_length,
                                            window='hann'), -1,
                    0.999969482421875)
                wavpath = os.path.join(
                    args.outdir,
                    os.path.basename(feat_file).replace(".h5", "_rec.wav"))
                logging.info(wavpath)
                sf.write(wavpath, wav, args.fs, 'PCM_16')

                logging.info("synth gf cv")
                cvmagsp = np.matmul(melfb_t, melsp_cv_rest.T)
                logging.info(cvmagsp.shape)
                wav = np.clip(
                    librosa.core.griffinlim(cvmagsp,
                                            hop_length=hop_length,
                                            win_length=win_length,
                                            window='hann'), -1,
                    0.999969482421875)
                wavpath = os.path.join(
                    args.outdir,
                    os.path.basename(feat_file).replace(".h5", "_cv.wav"))
                logging.info(wavpath)
                sf.write(wavpath, wav, args.fs, 'PCM_16')

                logging.info('write to h5')
                outh5dir = os.path.join(
                    os.path.dirname(os.path.dirname(feat_file)),
                    spk_src + "-" + args.spk_trg)
                if not os.path.exists(outh5dir):
                    os.makedirs(outh5dir)
                feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
                # cv
                write_path = args.string_path
                logging.info(feat_file + ' ' + write_path)
                logging.info(feat_cv.shape)
                write_hdf5(feat_file, write_path, feat_cv)

                #logging.info('write lat to h5')
                #logging.info(feat_file + ' ' + args.string_path+'_lat')
                #logging.info(feat_lat.shape)
                #write_hdf5(feat_file, args.string_path+'_lat', feat_lat)

                count += 1