Exemplo n.º 1
0
def test_real_metrics():
    """Smoke-test the metric functions on features loaded from example files.

    The targets are built from the loaded features themselves: mgc, lf0 and
    bap are shifted by 0.01 and the V/UV flags are copied unchanged, so every
    distortion must be strictly positive while the V/UV error must be zero.
    """
    _, source = example_file_data_sources_for_acoustic_model()
    X = FileSourceDataset(source)
    lengths = [len(x) for x in X]
    X = X.asarray()

    mgc = X[:, :, :source.mgc_dim // 3]
    lf0 = X[:, :, source.lf0_start_idx]
    # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` selects the
    # same default integer dtype on every NumPy version.
    vuv = (X[:, :, source.vuv_start_idx] > 0).astype(int)
    bap = X[:, :, source.bap_start_idx]

    mgc_tgt = mgc + 0.01
    lf0_tgt = lf0 + 0.01
    vuv_tgt = vuv.copy()
    bap_tgt = bap + 0.01

    mcd = metrics.melcd(mgc, mgc_tgt, lengths)
    bap_mcd = metrics.melcd(bap, bap_tgt, lengths)
    lf0_mse = metrics.lf0_mean_squared_error(lf0, vuv, lf0_tgt, vuv_tgt,
                                             lengths)
    vuv_err = metrics.vuv_error(vuv, vuv_tgt)
    assert mcd > 0
    assert bap_mcd > 0
    assert lf0_mse > 0
    assert vuv_err == 0.0
Exemplo n.º 2
0
def compute_distortions(y_static,
                        y_hat_static,
                        Y_data_mean,
                        Y_data_std,
                        lengths=None):
    """Compute objective distortion measures for the task named by ``hp.name``.

    Args:
        y_static: reference static features (normalised).
        y_hat_static: predicted static features (normalised).
        Y_data_mean: mean used to undo the normalisation.
        Y_data_std: standard deviation used to undo the normalisation.
        lengths: optional sequence lengths forwarded to the metric functions.

    Returns:
        dict: for ``"acoustic"`` the keys ``mcd``, ``bap_mcd``, ``f0_rmse``
        and ``vuv_err``; for ``"duration"`` the key ``dur_rmse``; for
        ``"vc"`` the key ``mcd``.

    Raises:
        AssertionError: if ``hp.name`` is none of the supported task names.
    """
    if hp.name == "acoustic":
        mgc, lf0, vuv, bap = split_streams(y_static, Y_data_mean, Y_data_std)
        mgc_hat, lf0_hat, vuv_hat, bap_hat = split_streams(
            y_hat_static, Y_data_mean, Y_data_std)
        try:
            f0_mse = metrics.lf0_mean_squared_error(lf0,
                                                    vuv,
                                                    lf0_hat,
                                                    vuv_hat,
                                                    lengths=lengths,
                                                    linear_domain=True)
        except ZeroDivisionError:
            # The metric could not be normalised; report NaN instead of
            # aborting the whole evaluation.
            f0_mse = np.nan

        distortions = {
            # The coefficient at index 0 is excluded from the MCD.
            "mcd": metrics.melcd(mgc[:, :, 1:],
                                 mgc_hat[:, :, 1:],
                                 lengths=lengths),
            "bap_mcd": metrics.melcd(bap, bap_hat, lengths=lengths) / 10.0,
            "f0_rmse": np.sqrt(f0_mse),
            "vuv_err": metrics.vuv_error(vuv, vuv_hat, lengths=lengths),
        }
    elif hp.name == "duration":
        y_static_invscale = P.inv_scale(y_static, Y_data_mean, Y_data_std)
        y_hat_static_invscale = P.inv_scale(y_hat_static, Y_data_mean,
                                            Y_data_std)
        distortions = {
            "dur_rmse":
            math.sqrt(
                metrics.mean_squared_error(y_static_invscale,
                                           y_hat_static_invscale,
                                           lengths=lengths))
        }
    elif hp.name == "vc":
        # Only the first ``hp.order`` statistics belong to the static part.
        static_dim = hp.order
        y_static_invscale = P.inv_scale(y_static, Y_data_mean[:static_dim],
                                        Y_data_std[:static_dim])
        y_hat_static_invscale = P.inv_scale(y_hat_static,
                                            Y_data_mean[:static_dim],
                                            Y_data_std[:static_dim])
        distortions = {
            "mcd":
            metrics.melcd(y_static_invscale,
                          y_hat_static_invscale,
                          lengths=lengths)
        }
    else:
        # Raised explicitly instead of ``assert False`` so the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        raise AssertionError("Unsupported hp.name: {!r}".format(hp.name))

    return distortions
Exemplo n.º 3
0
    def mcd_k(org_cep, output_cep, k=None, start_bin=1):
        """Return the Mel-cepstral distortion over bins ``start_bin`` up to ``k``.

        The reference ``org_cep`` is trimmed to the number of frames in
        ``output_cep`` before the comparison. With the default
        ``start_bin=1`` the bin at index 0 is skipped; ``k=None`` means
        "up to the last bin".
        """
        reference = org_cep[:len(output_cep)]
        # A slice whose stop is None is open-ended, so a single slice object
        # covers both the bounded and the unbounded case.
        bins = slice(start_bin, k)
        return nnmnkwii_metrics.melcd(output_cep[:, bins],
                                      reference[:, bins])  # TODO: Aligned mcd?
Exemplo n.º 4
0
def mcd(filename1, filename2, sr=22050):
    """Return the Mel-cepstral distortion between two audio files.

    MFCCs are extracted from both files, aligned with dynamic time warping,
    and the frames selected by the warping path are compared with
    ``metrics.melcd``. The MFCC shapes are printed along the way.
    """
    waves = [get_wave(path, sr) for path in (filename1, filename2)]
    feats_a, feats_b = [librosa.feature.mfcc(y=wave, sr=sr) for wave in waves]
    _, warp_path = librosa.core.dtw(feats_a, feats_b)
    print(feats_a.shape)
    print(feats_b.shape)

    # Pick, for every step of the warping path, the matching frame from
    # each (time-major) feature matrix.
    frames_a = feats_a.T
    frames_b = feats_b.T
    aligned_a = np.array([frames_a[step[0]] for step in warp_path])
    aligned_b = np.array([frames_b[step[1]] for step in warp_path])
    return metrics.melcd(aligned_a, aligned_b)
def do_convert(args, logdir1, logdir2):
    """Run one conversion batch and write audio/PPG summaries to ``logdir2``.

    Checkpoints are restored from both log directories (``args.ckpt``
    selects a specific checkpoint inside ``logdir2`` when set), one batch is
    pulled from the data flow, converted, and its Mel-cepstral distortion
    against the target audio is printed.
    """
    # Build the model and the data flow.
    model = Net2()
    df = Net2DataFlow(hp.convert.data_path, hp.convert.batch_size)

    # Resolve the checkpoints to restore.
    ckpt1 = tf.train.latest_checkpoint(logdir1)
    if args.ckpt:
        ckpt2 = '{}/{}'.format(logdir2, args.ckpt)
    else:
        ckpt2 = tf.train.latest_checkpoint(logdir2)

    restorers = []
    if ckpt2:
        restorers.append(SaverRestore(ckpt2))
    if ckpt1:
        restorers.append(SaverRestore(ckpt1, ignore=['global_step']))

    predictor = OfflinePredictor(
        PredictConfig(model=model,
                      input_names=get_eval_input_names(),
                      output_names=get_eval_output_names(),
                      session_init=ChainInit(restorers)))

    # Predict on a single batch and convert it to audio.
    p_r, y_s, pp = next(df().get_data())
    pred_spec, y_spec, ppgs = predictor(p_r, y_s, pp)
    audio, y_audio, ppgs = convert(predictor, df, pred_spec, y_spec, ppgs)

    db = melcd(audio, y_audio)
    print('Mel Cepstral Distortion ', db)
    print('This should be printed')

    # Register the audio summaries: 'A' is the target, 'B' the conversion.
    for tag, waveform in (('A', y_audio), ('B', audio)):
        tf.compat.v1.summary.audio(tag,
                                   waveform,
                                   hp.default.sr,
                                   max_outputs=hp.convert.batch_size)

    # Register the PPG heatmap; a trailing channel axis of size 1 is added.
    heatmap = np.expand_dims(ppgs, 3)
    tf.compat.v1.summary.image('PPG', heatmap, max_outputs=ppgs.shape[0])

    # Evaluate all registered summaries and write them out.
    writer = tf.compat.v1.summary.FileWriter(logdir2)
    with tf.compat.v1.Session() as sess:
        summ = sess.run(tf.compat.v1.summary.merge_all())
    writer.add_summary(summ)
    writer.close()
Exemplo n.º 6
0
    def on_epoch_end(self, epoch, logs=None):
        """Print the mean Mel-cepstral distortion on the validation data.

        Args:
            epoch: index of the finished epoch (unused).
            logs: metric dict passed by the training loop (unused). The
                default is ``None`` rather than a shared mutable ``{}``.
        """
        y_pred = self.model.predict(
            [self.validation_data[0], self.validation_data[1]])
        pred_mel = y_pred[0]
        actual_mel = self.validation_data[2]

        # Denormalize: clip to [0, 1], then rescale with MAX_DB and REF_DB.
        pred_mel = (np.clip(pred_mel, 0, 1) * MAX_DB) - MAX_DB + REF_DB
        actual_mel = (np.clip(actual_mel, 0, 1) * MAX_DB) - MAX_DB + REF_DB

        # One MCD value per (prediction, target) pair.
        mcd = [
            melcd(pred, actual) for pred, actual in zip(pred_mel, actual_mel)
        ]

        print(f"Validation Mean MCD: {np.mean(mcd)}")
Exemplo n.º 7
0
def compute_distortions(y_static,
                        y_hat_static,
                        Y_data_mean,
                        Y_data_std,
                        lengths=None):
    """Compute the Mel-cepstral distortion for the voice-conversion task.

    Args:
        y_static: reference static features (normalised).
        y_hat_static: predicted static features (normalised).
        Y_data_mean: mean used to undo the normalisation.
        Y_data_std: standard deviation used to undo the normalisation.
        lengths: optional sequence lengths forwarded to ``metrics.melcd``.

    Returns:
        dict: a single entry ``"mcd"``.

    Raises:
        AssertionError: if ``hp.name`` is not ``"vc"``.
    """
    if hp.name == "vc":
        # Only the first ``hp.order`` statistics belong to the static part.
        static_dim = hp.order
        y_static_invscale = P.inv_scale(y_static, Y_data_mean[:static_dim],
                                        Y_data_std[:static_dim])
        y_hat_static_invscale = P.inv_scale(y_hat_static,
                                            Y_data_mean[:static_dim],
                                            Y_data_std[:static_dim])
        distortions = {
            "mcd":
            metrics.melcd(y_static_invscale,
                          y_hat_static_invscale,
                          lengths=lengths)
        }
    else:
        # Raised explicitly instead of ``assert False`` so the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        raise AssertionError("Unsupported hp.name: {!r}".format(hp.name))

    return distortions
Exemplo n.º 8
0
def main():
    """Create samples with artificial alpha for each phoneme.

    The voice name is read from ``sys.argv[1]``. A random warping factor
    (alpha) within ``+-alpha_range`` is drawn per phoneme with a fixed seed.
    Every file id of the hard-coded speaker in the adaptation list is then
    loaded, its coded spectral features are warped frame by frame with the
    alpha of the current phoneme, and the result is written to
    ``experiments/<voice>/WORLD_artificially_warped``. The first few files
    are also synthesised, the timing and the mean MCD introduced by the
    warping are printed, and the normalisation files are copied next to the
    warped features.
    """
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = sys.argv[1]
    hparams.model_name = "WarpingLayerTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    alpha_range = 0.2
    num_phonemes = 70

    num_random_alphas = 7
    # num_random_alphas = 53

    # Randomly pick alphas for each phoneme.
    np.random.seed(42)
    # phonemes_to_alpha_tensor = ((np.random.choice(np.random.rand(num_random_alphas), num_phonemes) - 0.5) * 2 * alpha_range)
    phonemes_to_alpha_tensor = ((np.random.rand(num_phonemes) - 0.5) * 2 *
                                alpha_range)

    # hparams.num_questions = 505
    hparams.num_questions = 609
    # hparams.num_questions = 425

    hparams.out_dir = os.path.join("experiments", hparams.voice,
                                   "WORLD_artificially_warped")
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "warping_layer_test"
    hparams.synth_dir = hparams.out_dir
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    print(
        "Create artificially warped MGCs for {} in {} for {} questions, {} random alphas, and an alpha range of {}."
        .format(hparams.voice, hparams.out_dir, hparams.num_questions,
                len(np.unique(phonemes_to_alpha_tensor)), alpha_range))

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer(
        os.path.join("experiments", hparams.voice, "WORLD"),
        os.path.join("experiments", hparams.voice, "questions"), "ignored",
        hparams.num_questions, hparams)

    hparams.num_speakers = 1
    speaker = "p276"
    num_synth_files = 5  # Number of files to synthesise to check warping manually.

    # Normalisation parameters restricted to the coded spectral features
    # (three times as many entries when deltas are included).
    sp_mean = gen_in.norm_params[0][:hparams.num_coded_sps *
                                    (3 if hparams.add_deltas else 1)]
    sp_std_dev = gen_in.norm_params[1][:hparams.num_coded_sps *
                                       (3 if hparams.add_deltas else 1)]
    wl = WarpingLayer((hparams.num_coded_sps, ), (hparams.num_coded_sps, ),
                      hparams)
    wl.set_norm_params(sp_mean, sp_std_dev)

    def _question_to_phoneme_index(questions):
        """Helper function to convert questions to their current phoneme index."""
        # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` selects
        # the same default integer dtype.
        if questions.shape[-1] == 505:  # German question set.
            indices = np.arange(86, 347, 5, dtype=int)
        elif questions.shape[-1] == 425:  # English radio question set.
            indices = np.arange(58, 107, dtype=int)
        elif questions.shape[-1] == 609:  # English unilex question set.
            indices = np.arange(92, 162, dtype=int)
        else:
            raise NotImplementedError(
                "Unknown question set with {} questions.".format(
                    questions.shape[-1]))
        return QuestionLabelGen.questions_to_phoneme_indices(
            questions, indices)

    # with open(os.path.join(hparams.data_dir, "file_id_list_{}_train.txt".format(hparams.voice))) as f:
    with open(
            os.path.join(hparams.data_dir, "file_id_list_{}_adapt.txt".format(
                hparams.voice))) as f:
        id_list = f.readlines()
    # Keep only the ids of the selected speaker and trim line endings in-place.
    id_list[:] = [s.strip(' \t\n\r') for s in id_list if speaker in s]

    out_dir = hparams.out_dir
    makedirs_safe(out_dir)
    makedirs_safe(os.path.join(out_dir,
                               "cmp_mgc" + str(hparams.num_coded_sps)))
    t_benchmark = 0
    org_to_warped_mcd = 0.0
    for idx, id_name in enumerate(id_list):

        sample = WorldFeatLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "WORLD"),
            add_deltas=True,
            num_coded_sps=hparams.num_coded_sps)
        sample_pre = gen_in.preprocess_sample(sample)
        coded_sps = sample_pre[:, :hparams.num_coded_sps *
                               (3 if hparams.add_deltas else 1)]

        questions = QuestionLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "questions"),
            num_questions=hparams.num_questions)
        questions = questions[:len(coded_sps)]
        phoneme_indices = _question_to_phoneme_index(questions)
        # Look up the alpha of each frame's phoneme; the modulo keeps the
        # index inside the alpha table.
        alpha_vec = phonemes_to_alpha_tensor[phoneme_indices %
                                             len(phonemes_to_alpha_tensor),
                                             None]

        coded_sps = coded_sps[:len(alpha_vec), None,
                              ...]  # Create a batch dimension.
        alpha_vec = alpha_vec[:, None,
                              None]  # Create a batch and feature dimension.

        # Only the warping itself is timed.
        t_start = timer()
        mfcc_warped, (_, nn_alpha) = wl(torch.from_numpy(coded_sps),
                                        None, (len(coded_sps), ),
                                        (len(coded_sps), ),
                                        alphas=torch.from_numpy(alpha_vec))
        t_benchmark += timer() - t_start
        # Write the warped features back over the original ones.
        sample_pre[:len(mfcc_warped), :hparams.num_coded_sps *
                   (3 if hparams.add_deltas else 1)] = mfcc_warped[:,
                                                                   0].detach()

        sample_post = gen_in.postprocess_sample(sample_pre)
        # Manually create samples without normalisation but with deltas.
        sample_pre = (sample_pre * gen_in.norm_params[1] +
                      gen_in.norm_params[0]).astype(np.float32)

        if np.isnan(sample_pre).any():
            raise ValueError(
                "Detected nan values in output features for {}.".format(
                    id_name))

        # Compute error between warped version and original one.
        org_to_warped_mcd += metrics.melcd(
            sample[:, 0:hparams.num_coded_sps],
            sample_pre[:, 0:hparams.num_coded_sps])

        # Save warped features.
        sample_pre.tofile(
            os.path.join(
                out_dir, "cmp_mgc" + str(hparams.num_coded_sps),
                os.path.basename(id_name + WorldFeatLabelGen.ext_deltas)))

        hparams.synth_dir = out_dir
        if idx < num_synth_files:  # Only synthesize a few of samples.
            trainer.run_world_synth({id_name: sample_post}, hparams)

    # Report NaN instead of raising ZeroDivisionError when no id matched
    # the speaker.
    num_files = len(id_list)
    mean_mcd = org_to_warped_mcd / num_files if num_files else float("nan")
    print("Process time for {} warpings: {}. MCD caused by warping: {:.2f}".
          format(num_files, timedelta(seconds=t_benchmark), mean_mcd))

    # Copy normalisation files which are necessary for training.
    for feature in ["_bap", "_lf0", "_mgc{}".format(hparams.num_coded_sps)]:
        shutil.copyfile(
            os.path.join(
                gen_in.dir_labels, gen_in.dir_deltas,
                MeanCovarianceExtractor.file_name_appendix + feature + ".bin"),
            os.path.join(
                out_dir, "cmp_mgc" + str(hparams.num_coded_sps),
                MeanCovarianceExtractor.file_name_appendix + feature + ".bin"))
    def compute_score(self, dict_outputs_post, dict_hiddens, hparams):
        """Log MCDs of the network output and of its un-warped version.

        The scores of the parent class are computed first and returned
        unchanged. In addition, for several ranges of cepstral coefficients,
        the MCD between the stored reference features and (a) the network
        output and (b) the network output warped with the negated predicted
        alphas is averaged over all ids and logged.
        """
        mcd, f0_rmse, vuv_error_rate, bap_mcd = super().compute_score(
            dict_outputs_post, dict_hiddens, hparams)

        # Load the reference features of every id once.
        dict_original_post = {
            id_name: WorldFeatLabelGen.load_sample(
                id_name,
                dir_out=self.OutputGen.dir_labels,
                add_deltas=True,
                num_coded_sps=hparams.num_coded_sps)
            for id_name in dict_outputs_post.keys()
        }

        # Warping layer used to undo the network's warping, together with
        # the normalisation parameters of the static coefficients only.
        wl = self._get_dummy_warping_layer(hparams)
        mean_no_deltas = self.OutputGen.norm_params[0][:hparams.num_coded_sps]
        std_no_deltas = self.OutputGen.norm_params[1][:hparams.num_coded_sps]

        batch_size = len(dict_outputs_post)
        coef_ranges = itertools.product(
            [1], itertools.chain(range(10, 19), [-1]))
        for cep_coef_start, cep_coef_end in coef_ranges:
            org_to_output_mcd = 0.0
            org_to_pre_net_output_mcd = 0.0

            for id_name, labels in dict_outputs_post.items():
                # The predicted alphas are the second entry of the hiddens.
                _, output_alphas = dict_hiddens[id_name]
                output_mgc_post, *_ = self.OutputGen.convert_to_world_features(
                    labels, False, num_coded_sps=hparams.num_coded_sps)

                # Undo the warping by applying the negated alphas.
                pre_net_output, _ = wl.forward_sample(labels, -output_alphas)

                # De-normalise the static coefficients by hand.
                pre_net_output = pre_net_output.detach().cpu().numpy()
                pre_net_static = pre_net_output[:, 0, :hparams.num_coded_sps]
                pre_net_mgc = pre_net_static * std_no_deltas + mean_no_deltas

                # Reference features trimmed to the output length.
                org_mgc_post = dict_original_post[
                    id_name][:len(output_mgc_post), :hparams.num_coded_sps]

                org_slice = org_mgc_post[:, cep_coef_start:cep_coef_end]
                org_to_output_mcd += metrics.melcd(
                    org_slice,
                    output_mgc_post[:, cep_coef_start:cep_coef_end])
                org_to_pre_net_output_mcd += metrics.melcd(
                    org_mgc_post[:, cep_coef_start:cep_coef_end],
                    pre_net_mgc[:, cep_coef_start:cep_coef_end])

            org_to_pre_net_output_mcd /= batch_size
            org_to_output_mcd /= batch_size

            self.logger.info("MCep from {} to {}:".format(
                cep_coef_start, cep_coef_end))
            self.logger.info(
                "Original mgc to pre-net mgc error: {:4.2f}dB".format(
                    org_to_pre_net_output_mcd))
            self.logger.info(
                "Original mgc to nn mgc error: {:4.2f}dB".format(
                    org_to_output_mcd))

        return mcd, f0_rmse, vuv_error_rate, bap_mcd
Exemplo n.º 10
0
    def compute_score(self, dict_outputs_post, dict_hiddens, hparams):
        """Log how well the network reproduces an artificial warping.

        The scores of the parent class are computed first and returned
        unchanged. For several ranges of cepstral coefficients the following
        values are averaged over all ids and logged: the alpha error, the MCD
        between unwarped and warped reference features, the MCD between the
        warped reference and the unwarped reference warped with the network's
        alphas, and the MCD between the warped reference and the network
        output.
        """
        mcd, f0_rmse, vuv_error_rate, bap_mcd = super().compute_score(
            dict_outputs_post, dict_hiddens, hparams)

        # Get data for comparison.
        dict_original_post = dict()
        for id_name in dict_outputs_post.keys():
            dict_original_post[id_name] = WorldFeatLabelGen.load_sample(
                id_name,
                self.OutputGen.dir_labels,
                True,
                num_coded_sps=hparams.num_coded_sps)

        # Create a warping layer for manual warping.
        wl = WarpingLayer((hparams.num_coded_sps, ), (hparams.num_coded_sps, ),
                          hparams)
        if hparams.use_gpu:
            wl = wl.cuda()
        wl.set_norm_params(*self.OutputGen.norm_params)
        batch_size = len(dict_outputs_post)

        # Start 0 is paired only with end -1; start 1 with ends 10..18.
        for cep_coef_start in [0, 1]:
            for cep_coef_end in (range(10, 19)
                                 if cep_coef_start == 1 else [-1]):
                alphas_rmse = 0.0
                org_to_warped_mcd = 0.0
                org_to_nn_warping_mcd = 0.0
                output_to_warped_mcd = 0.0

                for id_name, labels in dict_outputs_post.items():
                    # Split NN output.
                    _, output_alphas = dict_hiddens[id_name]
                    output_mgc_post, *_ = self.OutputGen.convert_to_world_features(
                        labels, False, num_coded_sps=hparams.num_coded_sps)

                    # Load the original sample without warping.
                    org_output = self.OutputGen.load_sample(
                        id_name,
                        os.path.join("experiments", hparams.voice, "WORLD"),
                        add_deltas=True,
                        num_coded_sps=hparams.num_coded_sps)
                    org_output = org_output[:len(output_mgc_post)]
                    org_mgc_post = org_output[:, :hparams.num_coded_sps]
                    org_output_pre = self.OutputGen.preprocess_sample(
                        org_output)  # Preprocess the sample.
                    org_mgc_pre = org_output_pre[:, :hparams.num_coded_sps * (
                        3 if hparams.add_deltas else 1)]

                    # Load the original warped sample.
                    org_mgc_warped_post = dict_original_post[
                        id_name][:len(output_mgc_post), :hparams.num_coded_sps]
                    # org_mgc_warped_post = self.OutputGen.load_sample(
                    #                                         id_name,
                    #                                         os.path.join("experiments",
                    #                                                      hparams.voice,
                    #                                                      "vtln_speaker_static",
                    #                                                      "alpha_1.10"),
                    #                                         add_deltas=True,
                    #                                         num_coded_sps=hparams.num_coded_sps)[:len(output_mgc_post), :hparams.num_coded_sps]

                    # Compute error between warped version and NN output.
                    output_to_warped_mcd += metrics.melcd(
                        org_mgc_warped_post[:, cep_coef_start:cep_coef_end],
                        output_mgc_post[:, cep_coef_start:cep_coef_end])
                    # Compute error between warped version and original one.
                    org_to_warped_mcd += metrics.melcd(
                        org_mgc_warped_post[:, cep_coef_start:cep_coef_end],
                        org_mgc_post[:, cep_coef_start:cep_coef_end])

                    # Get original alphas from phonemes.
                    questions = QuestionLabelGen.load_sample(
                        id_name,
                        os.path.join("experiments", hparams.voice,
                                     "questions"),
                        num_questions=hparams.num_questions)[:len(output_alphas
                                                                  )]
                    phoneme_indices = QuestionLabelGen.questions_to_phoneme_indices(
                        questions, hparams.phoneme_indices)
                    # The modulo keeps the index inside the alpha table.
                    org_alphas = self.phonemes_to_alpha_tensor[
                        phoneme_indices % len(self.phonemes_to_alpha_tensor),
                        None]

                    # Compute RMSE of alphas.
                    # NOTE(review): this is the square root of the SUM of
                    # squared differences, not of their mean, so it grows with
                    # the number of frames - confirm whether a mean was meant.
                    alphas_rmse += math.sqrt(
                        ((org_alphas - output_alphas)**2).sum())

                    # Warp the original mgcs with the alpha predicted by the network.
                    org_mgc_nn_warped, _ = wl.forward_sample(
                        org_mgc_pre, output_alphas)  # Warp with the NN alphas.
                    org_output_pre[:, :hparams.num_coded_sps * (3 if hparams.add_deltas else 1)]\
                        = org_mgc_nn_warped[:, 0, ...].detach()  # Write warped mgcs back.
                    org_mgc_nn_warped_post = self.OutputGen.postprocess_sample(
                        org_output_pre,
                        apply_mlpg=False)[:, :hparams.num_coded_sps]

                    # Compute error between correctly warped version and original mgcs warped with NN alpha.
                    org_to_nn_warping_mcd += metrics.melcd(
                        org_mgc_warped_post[:, cep_coef_start:cep_coef_end],
                        org_mgc_nn_warped_post[:, cep_coef_start:cep_coef_end])

                # Average the accumulated values over the number of ids.
                alphas_rmse /= batch_size
                output_to_warped_mcd /= batch_size
                org_to_warped_mcd /= batch_size
                org_to_nn_warping_mcd /= batch_size

                self.logger.info("MCep from {} to {}:".format(
                    cep_coef_start, cep_coef_end))
                self.logger.info("RMSE alphas: {:4.2f}".format(alphas_rmse))
                self.logger.info(
                    "Original mgc to warped mgc error: {:4.2f}dB".format(
                        org_to_warped_mcd))
                # The percentage is the relative reduction of the MCD compared
                # to not warping at all.
                # NOTE(review): divides by org_to_warped_mcd - presumably never
                # zero for warped data; confirm.
                self.logger.info(
                    "Original mgc warped by network alpha to warped mgc error: {:4.2f}dB ({:2.2f}%)"
                    .format(org_to_nn_warping_mcd,
                            (1 - org_to_nn_warping_mcd / org_to_warped_mcd) *
                            100))
                self.logger.info(
                    "Network output to original warped mgc error: {:4.2f}dB".
                    format(output_to_warped_mcd))

        return mcd, f0_rmse, vuv_error_rate, bap_mcd
Exemplo n.º 11
0
    def compute_score(self, dict_outputs_post, dict_hiddens, hparams):
        """Compute and log MCD, F0 RMSE, V/UV error rate and BAP error.

        Every output in ``dict_outputs_post`` is compared with the reference
        features loaded from ``self.OutputGen.dir_labels``. Per metric the
        mean over all ids and the worst id are logged. ``dict_hiddens`` is
        not used.

        Returns:
            tuple: ``(mcd, f0_rmse, vuv_error_rate, bap_error)`` averages.
        """
        # Load the reference features of every id.
        dict_original_post = {
            id_name: WorldFeatLabelGen.load_sample(
                id_name,
                dir_out=self.OutputGen.dir_labels,
                add_deltas=True,
                num_coded_sps=hparams.num_coded_sps)
            for id_name in dict_outputs_post.keys()
        }

        # Per metric: running sum, id of the worst file, worst value, and
        # the list of all individual values.
        f0_rmse, f0_rmse_max_id, f0_rmse_max, all_rmse = 0.0, "None", 0.0, []
        vuv_error_rate, vuv_error_max_id, vuv_error_max, all_vuv = 0.0, "None", 0.0, []
        mcd, mcd_max_id, mcd_max, all_mcd = 0.0, "None", 0.0, []
        bap_error, bap_error_max_id, bap_error_max, all_bap_error = 0.0, "None", 0.0, []

        for id_name, labels in dict_outputs_post.items():
            output_coded_sp, output_lf0, output_vuv, output_bap = self.OutputGen.convert_to_world_features(
                sample=labels,
                contains_deltas=False,
                num_coded_sps=hparams.num_coded_sps)
            output_vuv = output_vuv.astype(bool)

            # Reference features of the same id.
            org_coded_sp, org_lf0, org_vuv, org_bap = self.OutputGen.convert_to_world_features(
                sample=dict_original_post[id_name],
                contains_deltas=self.OutputGen.add_deltas,
                num_coded_sps=hparams.num_coded_sps)

            # F0 from log-F0; the reference is trimmed to the output length
            # to fix a minor negligible length mismatch.
            num_frames = len(output_lf0)
            org_f0 = np.exp(org_lf0.squeeze())[:num_frames]
            output_f0 = np.exp(output_lf0)

            # MCD, excluding the coefficient at index 0.
            org_coded_sp = org_coded_sp[:len(output_coded_sp)]
            current_mcd = metrics.melcd(
                output_coded_sp[:, 1:],
                org_coded_sp[:, 1:])  # TODO: Use aligned mcd.
            if current_mcd > mcd_max:
                mcd_max_id, mcd_max = id_name, current_mcd
            mcd += current_mcd
            all_mcd.append(current_mcd)

            # F0 RMSE, weighted by the reference voiced flags.
            org_vuv_trimmed = org_vuv[:num_frames]
            squared_f0_error = (org_f0 - output_f0)**2
            current_f0_rmse = math.sqrt(
                (squared_f0_error * org_vuv_trimmed).sum() /
                org_vuv_trimmed.sum())
            if math.isnan(current_f0_rmse):
                logging.error(
                    "Computed NaN for F0 RMSE for {}.".format(id_name))
            else:
                if current_f0_rmse > f0_rmse_max:
                    f0_rmse_max_id, f0_rmse_max = id_name, current_f0_rmse
                f0_rmse += current_f0_rmse
                all_rmse.append(current_f0_rmse)

            # Fraction of frames whose V/UV decision differs.
            vuv_mismatch = (org_vuv_trimmed != output_vuv)
            vuv_error_rate_tmp = float(vuv_mismatch.sum()) / num_frames
            if vuv_error_rate_tmp > vuv_error_max:
                vuv_error_max_id, vuv_error_max = id_name, vuv_error_rate_tmp
            vuv_error_rate += vuv_error_rate_tmp
            all_vuv.append(vuv_error_rate_tmp)

            # Aperiodicity distortion: melcd for multi-column BAP, otherwise
            # a scaled RMSE.
            org_bap = org_bap[:len(output_bap)]
            has_multiple_bands = (len(output_bap.shape) > 1
                                  and output_bap.shape[1] > 1)
            if has_multiple_bands:
                current_bap_error = metrics.melcd(
                    output_bap, org_bap)  # TODO: Use aligned mcd?
            else:
                bap_rmse = math.sqrt(((org_bap - output_bap)**2).mean())
                current_bap_error = bap_rmse * (10.0 / np.log(10) *
                                                np.sqrt(2.0))
            if current_bap_error > bap_error_max:
                bap_error_max_id, bap_error_max = id_name, current_bap_error
            bap_error += current_bap_error
            all_bap_error.append(current_bap_error)

        # Turn the sums into means.
        f0_rmse /= len(dict_outputs_post)
        vuv_error_rate /= len(dict_outputs_post)
        mcd /= len(dict_original_post)
        bap_error /= len(dict_original_post)

        self.logger.info("Worst MCD: {} {:4.2f}dB".format(mcd_max_id, mcd_max))
        self.logger.info("Worst F0 RMSE: {} {:4.2f}Hz".format(
            f0_rmse_max_id, f0_rmse_max))
        self.logger.info("Worst VUV error: {} {:2.2f}%".format(
            vuv_error_max_id, vuv_error_max * 100))
        self.logger.info("Worst BAP error: {} {:4.2f}db".format(
            bap_error_max_id, bap_error_max))
        self.logger.info(
            "Benchmark score: MCD {:4.2f}dB, F0 RMSE {:4.2f}Hz, VUV {:2.2f}%, BAP error {:4.2f}db"
            .format(mcd, f0_rmse, vuv_error_rate * 100, bap_error))

        return mcd, f0_rmse, vuv_error_rate, bap_error
Exemplo n.º 12
0
def compute_distortions(pred_out_feats, out_feats, lengths, out_scaler,
                        model_config):
    """Compute distortion measures between predicted and ground-truth acoustic features

    Both feature tensors are de-normalised with ``out_scaler`` and reduced to
    their static streams; the first four streams are taken as mgc, lf0, vuv
    and bap.

    Args:
        pred_out_feats (nn.Tensor): predicted acoustic features
        out_feats (nn.Tensor): ground-truth acoustic features
        lengths (nn.Tensor): lengths of the sequences
        out_scaler (nn.Module): scaler to denormalize features
        model_config (dict): model configuration

    Returns:
        dict: MCD for mgc/bap and V/UV error; F0 RMSE is included unless its
        computation raised ``ZeroDivisionError``.
    """

    def _static_streams(feats):
        # Split de-normalised features into their static streams.
        return get_static_features(
            feats,
            model_config.num_windows,
            model_config.stream_sizes,
            model_config.has_dynamic_features,
        )

    out_feats = out_scaler.inverse_transform(out_feats)
    pred_out_feats = out_scaler.inverse_transform(pred_out_feats)
    out_streams = _static_streams(out_feats)
    pred_out_streams = _static_streams(pred_out_feats)

    assert len(out_streams) >= 4
    mgc, lf0, vuv, bap = [out_streams[i] for i in range(4)]
    pred_mgc, pred_lf0, pred_vuv, pred_bap = [
        pred_out_streams[i] for i in range(4)
    ]

    # Binarize the V/UV streams at 0.5.
    vuv = (vuv > 0.5).float()
    pred_vuv = (pred_vuv > 0.5).float()

    dist = {}
    # The coefficient at index 0 is excluded from the MGC distortion.
    dist["ObjEval_MGC_MCD"] = metrics.melcd(mgc[:, :, 1:],
                                            pred_mgc[:, :, 1:],
                                            lengths=lengths)
    dist["ObjEval_BAP_MCD"] = metrics.melcd(bap, pred_bap,
                                            lengths=lengths) / 10.0
    dist["ObjEval_VUV_ERR"] = metrics.vuv_error(vuv, pred_vuv,
                                                lengths=lengths)

    try:
        f0_mse = metrics.lf0_mean_squared_error(lf0,
                                                vuv,
                                                pred_lf0,
                                                pred_vuv,
                                                lengths=lengths,
                                                linear_domain=True)
        dist["ObjEval_F0_RMSE"] = np.sqrt(f0_mse)
    except ZeroDivisionError:
        # Leave the F0 entry out when the metric cannot be normalised.
        pass

    return dist