def test_real_metrics():
    _, source = example_file_data_sources_for_acoustic_model()
    X = FileSourceDataset(source)
    lengths = [len(x) for x in X]
    X = X.asarray()
    mgc = X[:, :, :source.mgc_dim // 3]
    lf0 = X[:, :, source.lf0_start_idx]
    vuv = (X[:, :, source.vuv_start_idx] > 0).astype(int)  # np.int was removed in NumPy >= 1.24
    bap = X[:, :, source.bap_start_idx]

    # Slightly perturbed targets: every metric should be non-zero except V/UV.
    mgc_tgt = mgc + 0.01
    lf0_tgt = lf0 + 0.01
    vuv_tgt = vuv.copy()
    bap_tgt = bap + 0.01

    mcd = metrics.melcd(mgc, mgc_tgt, lengths)
    bap_mcd = metrics.melcd(bap, bap_tgt, lengths)
    lf0_mse = metrics.lf0_mean_squared_error(lf0, vuv, lf0_tgt, vuv_tgt, lengths)
    vuv_err = metrics.vuv_error(vuv, vuv_tgt)

    assert mcd > 0
    assert bap_mcd > 0
    assert lf0_mse > 0
    assert vuv_err == 0.0
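For reference, `melcd` computes the standard Mel-cepstral distortion in dB. A minimal NumPy sketch of that formula, assuming (frames, dims) inputs (nnmnkwii's implementation additionally supports padded 3-D batches via `lengths`):

import numpy as np

def melcd_sketch(X, Y):
    """MCD in dB between two (frames, dims) cepstrum arrays.

    Per frame: (10 / ln 10) * sqrt(2 * sum_d (x_d - y_d)^2), averaged over frames.
    """
    diff = X - Y
    return float(np.mean(
        (10.0 / np.log(10)) * np.sqrt(2.0 * np.sum(diff ** 2, axis=-1))))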
def compute_distortions(y_static, y_hat_static, Y_data_mean, Y_data_std, lengths=None):
    if hp.name == "acoustic":
        mgc, lf0, vuv, bap = split_streams(y_static, Y_data_mean, Y_data_std)
        mgc_hat, lf0_hat, vuv_hat, bap_hat = split_streams(
            y_hat_static, Y_data_mean, Y_data_std)
        try:
            f0_mse = metrics.lf0_mean_squared_error(
                lf0, vuv, lf0_hat, vuv_hat, lengths=lengths, linear_domain=True)
        except ZeroDivisionError:
            f0_mse = np.nan
        distortions = {
            # Skip the 0-th (energy) coefficient for MCD.
            "mcd": metrics.melcd(mgc[:, :, 1:], mgc_hat[:, :, 1:], lengths=lengths),
            "bap_mcd": metrics.melcd(bap, bap_hat, lengths=lengths) / 10.0,
            "f0_rmse": np.sqrt(f0_mse),
            "vuv_err": metrics.vuv_error(vuv, vuv_hat, lengths=lengths),
        }
    elif hp.name == "duration":
        y_static_invscale = P.inv_scale(y_static, Y_data_mean, Y_data_std)
        y_hat_static_invscale = P.inv_scale(y_hat_static, Y_data_mean, Y_data_std)
        distortions = {
            "dur_rmse": math.sqrt(metrics.mean_squared_error(
                y_static_invscale, y_hat_static_invscale, lengths=lengths))
        }
    elif hp.name == "vc":
        static_dim = hp.order
        y_static_invscale = P.inv_scale(
            y_static, Y_data_mean[:static_dim], Y_data_std[:static_dim])
        y_hat_static_invscale = P.inv_scale(
            y_hat_static, Y_data_mean[:static_dim], Y_data_std[:static_dim])
        distortions = {
            "mcd": metrics.melcd(y_static_invscale, y_hat_static_invscale,
                                 lengths=lengths)
        }
    else:
        raise ValueError("Unknown task: {}".format(hp.name))
    return distortions
def mcd_k(org_cep, output_cep, k=None, start_bin=1):
    """Compute the Mel-cepstrum distortion over bins start_bin up to the k-th bin.

    Ignores c_0 (energy) by default.
    """
    # Trim the reference to the length of the output.
    org_cep = org_cep[:len(output_cep)]
    if k is None:
        mcd = nnmnkwii_metrics.melcd(output_cep[:, start_bin:],
                                     org_cep[:, start_bin:])  # TODO: Aligned MCD?
    else:
        mcd = nnmnkwii_metrics.melcd(output_cep[:, start_bin:k],
                                     org_cep[:, start_bin:k])
    return mcd
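A quick usage sketch given the definition above (the import alias matches the snippet; the shapes are made up):

import numpy as np
from nnmnkwii import metrics as nnmnkwii_metrics

ref = np.random.randn(100, 25)  # (frames, cepstral bins), hypothetical
out = np.random.randn(90, 25)   # shorter output; the reference is trimmed to match
print(mcd_k(ref, out))          # all bins except c_0
print(mcd_k(ref, out, k=13))    # bins 1..12 only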
def mcd(filename1, filename2, sr=22050):
    wav1 = get_wave(filename1, sr)
    wav2 = get_wave(filename2, sr)
    # librosa returns MFCCs with shape (n_mfcc, n_frames).
    mfcc1 = librosa.feature.mfcc(y=wav1, sr=sr)
    mfcc2 = librosa.feature.mfcc(y=wav2, sr=sr)
    # Align the two sequences with DTW before computing the distortion.
    D, wp = librosa.core.dtw(mfcc1, mfcc2)
    mfcc1 = np.array([mfcc1.T[i[0]] for i in wp])
    mfcc2 = np.array([mfcc2.T[i[1]] for i in wp])
    return metrics.melcd(mfcc1, mfcc2)
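`librosa.core.dtw` lives at `librosa.sequence.dtw` in current librosa releases. A sketch of the same alignment step with that API, assuming MFCC matrices shaped (n_mfcc, n_frames) as above:

import numpy as np
import librosa

def align_mfcc(mfcc1, mfcc2):
    """Align two (n_mfcc, n_frames) MFCC matrices frame-by-frame via DTW."""
    _, wp = librosa.sequence.dtw(X=mfcc1, Y=mfcc2)
    wp = wp[::-1]  # the warping path is returned from end to start
    return mfcc1.T[wp[:, 0]], mfcc2.T[wp[:, 1]]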
def do_convert(args, logdir1, logdir2):
    # Load graph
    model = Net2()
    df = Net2DataFlow(hp.convert.data_path, hp.convert.batch_size)

    ckpt1 = tf.train.latest_checkpoint(logdir1)
    ckpt2 = '{}/{}'.format(logdir2, args.ckpt) if args.ckpt \
        else tf.train.latest_checkpoint(logdir2)
    session_inits = []
    if ckpt2:
        session_inits.append(SaverRestore(ckpt2))
    if ckpt1:
        session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))
    pred_conf = PredictConfig(model=model,
                              input_names=get_eval_input_names(),
                              output_names=get_eval_output_names(),
                              session_init=ChainInit(session_inits))
    predictor = OfflinePredictor(pred_conf)

    p_r, y_s, pp = next(df().get_data())
    pred_spec, y_spec, ppgs = predictor(p_r, y_s, pp)
    audio, y_audio, ppgs = convert(predictor, df, pred_spec, y_spec, ppgs)

    # Note: melcd applies the MCD distance formula to whatever arrays it is
    # given; here it is applied to waveforms rather than cepstral features.
    db = melcd(audio, y_audio)
    print('Mel Cepstral Distortion:', db)

    # Write the result
    tf.compat.v1.summary.audio('A', y_audio, hp.default.sr,
                               max_outputs=hp.convert.batch_size)
    tf.compat.v1.summary.audio('B', audio, hp.default.sr,
                               max_outputs=hp.convert.batch_size)

    # Visualize PPGs
    heatmap = np.expand_dims(ppgs, 3)  # channel=1
    tf.compat.v1.summary.image('PPG', heatmap, max_outputs=ppgs.shape[0])

    writer = tf.compat.v1.summary.FileWriter(logdir2)
    with tf.compat.v1.Session() as sess:
        summ = sess.run(tf.compat.v1.summary.merge_all())
    writer.add_summary(summ)
    writer.close()
def on_epoch_end(self, epoch, logs=None):
    y_pred = self.model.predict(
        [self.validation_data[0], self.validation_data[1]])
    pred_mel = y_pred[0]
    actual_mel = self.validation_data[2]

    # Denormalize from [0, 1] back to dB.
    pred_mel = (np.clip(pred_mel, 0, 1) * MAX_DB) - MAX_DB + REF_DB
    actual_mel = (np.clip(actual_mel, 0, 1) * MAX_DB) - MAX_DB + REF_DB

    mcd = []
    for pred, actual in zip(pred_mel, actual_mel):
        mcd.append(melcd(pred, actual))
    print(f"Validation Mean MCD: {np.mean(mcd)}")
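The denormalization line inverts a [0, 1] scaling of dB mel spectrograms. A self-contained round-trip check (the MAX_DB and REF_DB values are assumptions, not taken from the snippet):

import numpy as np

MAX_DB, REF_DB = 100.0, 20.0  # assumed values

def normalize(mel_db):
    # Map [REF_DB - MAX_DB, REF_DB] dB into [0, 1].
    return np.clip((mel_db - REF_DB + MAX_DB) / MAX_DB, 0, 1)

def denormalize(mel01):
    return (np.clip(mel01, 0, 1) * MAX_DB) - MAX_DB + REF_DB

mel_db = np.random.uniform(REF_DB - MAX_DB, REF_DB, size=(80, 50))
print(np.allclose(denormalize(normalize(mel_db)), mel_db))  # True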
def compute_distortions(y_static, y_hat_static, Y_data_mean, Y_data_std, lengths=None):
    if hp.name == "vc":
        static_dim = hp.order
        y_static_invscale = P.inv_scale(
            y_static, Y_data_mean[:static_dim], Y_data_std[:static_dim])
        y_hat_static_invscale = P.inv_scale(
            y_hat_static, Y_data_mean[:static_dim], Y_data_std[:static_dim])
        distortions = {
            "mcd": metrics.melcd(y_static_invscale, y_hat_static_invscale,
                                 lengths=lengths)
        }
    else:
        raise ValueError("Unknown task: {}".format(hp.name))
    return distortions
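The `lengths` argument lets `melcd` average only over the valid frames of zero-padded batches. A self-contained example (the shapes are made up):

import numpy as np
from nnmnkwii import metrics

# Two padded utterances of true lengths 60 and 40, with 25-dim cepstra.
x = np.random.randn(2, 60, 25)
y = np.random.randn(2, 60, 25)
x[1, 40:] = y[1, 40:] = 0.0  # padding region, excluded via lengths
print(metrics.melcd(x, y, lengths=[60, 40]))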
def main():
    """Create samples with artificial alpha for each phoneme."""
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = sys.argv[1]
    hparams.model_name = "WarpingLayerTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30

    alpha_range = 0.2
    num_phonemes = 70
    num_random_alphas = 7
    # num_random_alphas = 53

    # Randomly pick alphas for each phoneme.
    np.random.seed(42)
    # phonemes_to_alpha_tensor = ((np.random.choice(np.random.rand(num_random_alphas), num_phonemes) - 0.5) * 2 * alpha_range)
    phonemes_to_alpha_tensor = ((np.random.rand(num_phonemes) - 0.5) * 2 * alpha_range)

    # hparams.num_questions = 505
    hparams.num_questions = 609
    # hparams.num_questions = 425

    hparams.out_dir = os.path.join("experiments", hparams.voice,
                                   "WORLD_artificially_warped")
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "warping_layer_test"
    hparams.synth_dir = hparams.out_dir
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    print("Create artificially warped MGCs for {} in {} for {} questions, "
          "{} random alphas, and an alpha range of {}.".format(
              hparams.voice, hparams.out_dir, hparams.num_questions,
              len(np.unique(phonemes_to_alpha_tensor)), alpha_range))

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer(
        os.path.join("experiments", hparams.voice, "WORLD"),
        os.path.join("experiments", hparams.voice, "questions"), "ignored",
        hparams.num_questions, hparams)

    hparams.num_speakers = 1
    speaker = "p276"
    num_synth_files = 5  # Number of files to synthesise to check warping manually.

    sp_mean = gen_in.norm_params[0][:hparams.num_coded_sps *
                                    (3 if hparams.add_deltas else 1)]
    sp_std_dev = gen_in.norm_params[1][:hparams.num_coded_sps *
                                       (3 if hparams.add_deltas else 1)]
    wl = WarpingLayer((hparams.num_coded_sps,), (hparams.num_coded_sps,),
                      hparams)
    wl.set_norm_params(sp_mean, sp_std_dev)

    def _question_to_phoneme_index(questions):
        """Helper function to convert questions to their current phoneme index."""
        if questions.shape[-1] == 505:  # German question set.
            indices = np.arange(86, 347, 5, dtype=int)
        elif questions.shape[-1] == 425:  # English radio question set.
            indices = np.arange(58, 107, dtype=int)
        elif questions.shape[-1] == 609:  # English unilex question set.
            indices = np.arange(92, 162, dtype=int)
        else:
            raise NotImplementedError(
                "Unknown question set with {} questions.".format(
                    questions.shape[-1]))
        return QuestionLabelGen.questions_to_phoneme_indices(questions, indices)

    # with open(os.path.join(hparams.data_dir, "file_id_list_{}_train.txt".format(hparams.voice))) as f:
    with open(os.path.join(hparams.data_dir,
                           "file_id_list_{}_adapt.txt".format(hparams.voice))) as f:
        id_list = f.readlines()
    # Trim line endings in-place and keep only the target speaker's files.
    id_list[:] = [s.strip(' \t\n\r') for s in id_list if speaker in s]
    out_dir = hparams.out_dir
    makedirs_safe(out_dir)
    makedirs_safe(os.path.join(out_dir, "cmp_mgc" + str(hparams.num_coded_sps)))

    t_benchmark = 0
    org_to_warped_mcd = 0.0
    for idx, id_name in enumerate(id_list):
        sample = WorldFeatLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "WORLD"),
            add_deltas=True,
            num_coded_sps=hparams.num_coded_sps)
        sample_pre = gen_in.preprocess_sample(sample)
        coded_sps = sample_pre[:, :hparams.num_coded_sps *
                               (3 if hparams.add_deltas else 1)]

        questions = QuestionLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "questions"),
            num_questions=hparams.num_questions)
        questions = questions[:len(coded_sps)]
        phoneme_indices = _question_to_phoneme_index(questions)
        alpha_vec = phonemes_to_alpha_tensor[phoneme_indices %
                                             len(phonemes_to_alpha_tensor), None]

        coded_sps = coded_sps[:len(alpha_vec), None, ...]  # Create a batch dimension.
        alpha_vec = alpha_vec[:, None, None]  # Create a batch and feature dimension.

        t_start = timer()
        mfcc_warped, (_, nn_alpha) = wl(torch.from_numpy(coded_sps), None,
                                        (len(coded_sps),), (len(coded_sps),),
                                        alphas=torch.from_numpy(alpha_vec))
        t_benchmark += timer() - t_start
        sample_pre[:len(mfcc_warped), :hparams.num_coded_sps *
                   (3 if hparams.add_deltas else 1)] = mfcc_warped[:, 0].detach()

        sample_post = gen_in.postprocess_sample(sample_pre)
        # Manually create samples without normalisation but with deltas.
        sample_pre = (sample_pre * gen_in.norm_params[1] +
                      gen_in.norm_params[0]).astype(np.float32)

        if np.isnan(sample_pre).any():
            raise ValueError(
                "Detected nan values in output features for {}.".format(id_name))

        # Compute error between the warped version and the original one.
        org_to_warped_mcd += metrics.melcd(
            sample[:, 0:hparams.num_coded_sps],
            sample_pre[:, 0:hparams.num_coded_sps])

        # Save warped features.
        sample_pre.tofile(
            os.path.join(out_dir, "cmp_mgc" + str(hparams.num_coded_sps),
                         os.path.basename(id_name + WorldFeatLabelGen.ext_deltas)))

        hparams.synth_dir = out_dir
        if idx < num_synth_files:  # Only synthesise a few of the samples.
            trainer.run_world_synth({id_name: sample_post}, hparams)

    print("Process time for {} warpings: {}. MCD caused by warping: {:.2f}".format(
        len(id_list), timedelta(seconds=t_benchmark),
        org_to_warped_mcd / len(id_list)))

    # Copy normalisation files which are necessary for training.
    for feature in ["_bap", "_lf0", "_mgc{}".format(hparams.num_coded_sps)]:
        shutil.copyfile(
            os.path.join(gen_in.dir_labels, gen_in.dir_deltas,
                         MeanCovarianceExtractor.file_name_appendix + feature + ".bin"),
            os.path.join(out_dir, "cmp_mgc" + str(hparams.num_coded_sps),
                         MeanCovarianceExtractor.file_name_appendix + feature + ".bin"))
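The per-frame alpha lookup above is plain NumPy fancy indexing. A self-contained sketch (the phoneme indices are made up):

import numpy as np

alpha_range, num_phonemes = 0.2, 70
rng = np.random.default_rng(42)
phonemes_to_alpha_tensor = (rng.random(num_phonemes) - 0.5) * 2 * alpha_range

phoneme_indices = np.array([3, 3, 17, 17, 17, 42])  # hypothetical frame-wise indices
alpha_vec = phonemes_to_alpha_tensor[phoneme_indices % num_phonemes, None]
print(alpha_vec.shape)  # (6, 1): one warping factor per frame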
def compute_score(self, dict_outputs_post, dict_hiddens, hparams):
    mcd, f0_rmse, vuv_error_rate, bap_mcd = super().compute_score(
        dict_outputs_post, dict_hiddens, hparams)

    # Get data for comparison.
    dict_original_post = dict()
    for id_name in dict_outputs_post.keys():
        dict_original_post[id_name] = WorldFeatLabelGen.load_sample(
            id_name,
            dir_out=self.OutputGen.dir_labels,
            add_deltas=True,
            num_coded_sps=hparams.num_coded_sps)

    # Create a warping layer for manual warping.
    wl = self._get_dummy_warping_layer(hparams)
    norm_params_no_deltas = (
        self.OutputGen.norm_params[0][:hparams.num_coded_sps],
        self.OutputGen.norm_params[1][:hparams.num_coded_sps])

    # Compute MCD for different sets of coefficients.
    batch_size = len(dict_outputs_post)
    for cep_coef_start in [1]:
        for cep_coef_end in itertools.chain(range(10, 19), [-1]):
            org_to_output_mcd = 0.0
            org_to_pre_net_output_mcd = 0.0
            for id_name, labels in dict_outputs_post.items():
                # Split the NN output.
                _, output_alphas = dict_hiddens[id_name]
                output_mgc_post, *_ = self.OutputGen.convert_to_world_features(
                    labels, False, num_coded_sps=hparams.num_coded_sps)
                # Reverse the warping.
                pre_net_output, _ = wl.forward_sample(labels, -output_alphas)
                # Postprocess the sample manually.
                pre_net_output = pre_net_output.detach().cpu().numpy()
                pre_net_mgc = (pre_net_output[:, 0, :hparams.num_coded_sps] *
                               norm_params_no_deltas[1] +
                               norm_params_no_deltas[0])
                # Load the original warped sample.
                org_mgc_post = dict_original_post[id_name][
                    :len(output_mgc_post), :hparams.num_coded_sps]

                # Compute the MCD differences.
                org_to_output_mcd += metrics.melcd(
                    org_mgc_post[:, cep_coef_start:cep_coef_end],
                    output_mgc_post[:, cep_coef_start:cep_coef_end])
                org_to_pre_net_output_mcd += metrics.melcd(
                    org_mgc_post[:, cep_coef_start:cep_coef_end],
                    pre_net_mgc[:, cep_coef_start:cep_coef_end])
            org_to_pre_net_output_mcd /= batch_size
            org_to_output_mcd /= batch_size

            self.logger.info("MCep from {} to {}:".format(
                cep_coef_start, cep_coef_end))
            self.logger.info("Original mgc to pre-net mgc error: {:4.2f}dB".format(
                org_to_pre_net_output_mcd))
            self.logger.info("Original mgc to nn mgc error: {:4.2f}dB".format(
                org_to_output_mcd))

    return mcd, f0_rmse, vuv_error_rate, bap_mcd
def compute_score(self, dict_outputs_post, dict_hiddens, hparams):
    mcd, f0_rmse, vuv_error_rate, bap_mcd = super().compute_score(
        dict_outputs_post, dict_hiddens, hparams)

    # Get data for comparison.
    dict_original_post = dict()
    for id_name in dict_outputs_post.keys():
        dict_original_post[id_name] = WorldFeatLabelGen.load_sample(
            id_name,
            self.OutputGen.dir_labels,
            True,
            num_coded_sps=hparams.num_coded_sps)

    # Create a warping layer for manual warping.
    wl = WarpingLayer((hparams.num_coded_sps,), (hparams.num_coded_sps,),
                      hparams)
    if hparams.use_gpu:
        wl = wl.cuda()
    wl.set_norm_params(*self.OutputGen.norm_params)

    batch_size = len(dict_outputs_post)
    for cep_coef_start in [0, 1]:
        for cep_coef_end in (range(10, 19) if cep_coef_start == 1 else [-1]):
            alphas_rmse = 0.0
            org_to_warped_mcd = 0.0
            org_to_nn_warping_mcd = 0.0
            output_to_warped_mcd = 0.0

            for id_name, labels in dict_outputs_post.items():
                # Split the NN output.
                _, output_alphas = dict_hiddens[id_name]
                output_mgc_post, *_ = self.OutputGen.convert_to_world_features(
                    labels, False, num_coded_sps=hparams.num_coded_sps)

                # Load the original sample without warping.
                org_output = self.OutputGen.load_sample(
                    id_name,
                    os.path.join("experiments", hparams.voice, "WORLD"),
                    add_deltas=True,
                    num_coded_sps=hparams.num_coded_sps)
                org_output = org_output[:len(output_mgc_post)]
                org_mgc_post = org_output[:, :hparams.num_coded_sps]
                org_output_pre = self.OutputGen.preprocess_sample(org_output)
                org_mgc_pre = org_output_pre[:, :hparams.num_coded_sps *
                                             (3 if hparams.add_deltas else 1)]

                # Load the original warped sample. (Alternatively, a statically
                # warped version, e.g. from "vtln_speaker_static/alpha_1.10",
                # could be loaded here.)
                org_mgc_warped_post = dict_original_post[id_name][
                    :len(output_mgc_post), :hparams.num_coded_sps]

                # Compute error between the warped version and the NN output.
                output_to_warped_mcd += metrics.melcd(
                    org_mgc_warped_post[:, cep_coef_start:cep_coef_end],
                    output_mgc_post[:, cep_coef_start:cep_coef_end])
                # Compute error between the warped version and the original one.
                org_to_warped_mcd += metrics.melcd(
                    org_mgc_warped_post[:, cep_coef_start:cep_coef_end],
                    org_mgc_post[:, cep_coef_start:cep_coef_end])

                # Get original alphas from phonemes.
                questions = QuestionLabelGen.load_sample(
                    id_name,
                    os.path.join("experiments", hparams.voice, "questions"),
                    num_questions=hparams.num_questions)[:len(output_alphas)]
                phoneme_indices = QuestionLabelGen.questions_to_phoneme_indices(
                    questions, hparams.phoneme_indices)
                org_alphas = self.phonemes_to_alpha_tensor[
                    phoneme_indices % len(self.phonemes_to_alpha_tensor), None]

                # Compute RMSE of alphas.
                alphas_rmse += math.sqrt(((org_alphas - output_alphas)**2).sum())

                # Warp the original mgcs with the alpha predicted by the network.
                org_mgc_nn_warped, _ = wl.forward_sample(
                    org_mgc_pre, output_alphas)  # Warp with the NN alphas.
                org_output_pre[:, :hparams.num_coded_sps *
                               (3 if hparams.add_deltas else 1)] \
                    = org_mgc_nn_warped[:, 0, ...].detach()  # Write warped mgcs back.
                org_mgc_nn_warped_post = self.OutputGen.postprocess_sample(
                    org_output_pre, apply_mlpg=False)[:, :hparams.num_coded_sps]

                # Compute error between the correctly warped version and the
                # original mgcs warped with the NN alpha.
                org_to_nn_warping_mcd += metrics.melcd(
                    org_mgc_warped_post[:, cep_coef_start:cep_coef_end],
                    org_mgc_nn_warped_post[:, cep_coef_start:cep_coef_end])

            alphas_rmse /= batch_size
            output_to_warped_mcd /= batch_size
            org_to_warped_mcd /= batch_size
            org_to_nn_warping_mcd /= batch_size

            self.logger.info("MCep from {} to {}:".format(
                cep_coef_start, cep_coef_end))
            self.logger.info("RMSE alphas: {:4.2f}".format(alphas_rmse))
            self.logger.info("Original mgc to warped mgc error: {:4.2f}dB".format(
                org_to_warped_mcd))
            self.logger.info(
                "Original mgc warped by network alpha to warped mgc error: "
                "{:4.2f}dB ({:2.2f}%)".format(
                    org_to_nn_warping_mcd,
                    (1 - org_to_nn_warping_mcd / org_to_warped_mcd) * 100))
            self.logger.info(
                "Network output to original warped mgc error: {:4.2f}dB".format(
                    output_to_warped_mcd))

    return mcd, f0_rmse, vuv_error_rate, bap_mcd
def compute_score(self, dict_outputs_post, dict_hiddens, hparams):
    # Get data for comparison.
    dict_original_post = dict()
    for id_name in dict_outputs_post.keys():
        dict_original_post[id_name] = WorldFeatLabelGen.load_sample(
            id_name,
            dir_out=self.OutputGen.dir_labels,
            add_deltas=True,
            num_coded_sps=hparams.num_coded_sps)

    f0_rmse = 0.0
    f0_rmse_max_id = "None"
    f0_rmse_max = 0.0
    all_rmse = []
    vuv_error_rate = 0.0
    vuv_error_max_id = "None"
    vuv_error_max = 0.0
    all_vuv = []
    mcd = 0.0
    mcd_max_id = "None"
    mcd_max = 0.0
    all_mcd = []
    bap_error = 0.0
    bap_error_max_id = "None"
    bap_error_max = 0.0
    all_bap_error = []

    for id_name, labels in dict_outputs_post.items():
        output_coded_sp, output_lf0, output_vuv, output_bap = \
            self.OutputGen.convert_to_world_features(
                sample=labels,
                contains_deltas=False,
                num_coded_sps=hparams.num_coded_sps)
        output_vuv = output_vuv.astype(bool)

        # Get data for comparison.
        org_coded_sp, org_lf0, org_vuv, org_bap = \
            self.OutputGen.convert_to_world_features(
                sample=dict_original_post[id_name],
                contains_deltas=self.OutputGen.add_deltas,
                num_coded_sps=hparams.num_coded_sps)

        # Compute f0 from lf0; trim to fix a minor, negligible length mismatch.
        org_f0 = np.exp(org_lf0.squeeze())[:len(output_lf0)]
        output_f0 = np.exp(output_lf0)

        # Compute MCD, skipping the 0-th (energy) coefficient.
        org_coded_sp = org_coded_sp[:len(output_coded_sp)]
        current_mcd = metrics.melcd(output_coded_sp[:, 1:],
                                    org_coded_sp[:, 1:])  # TODO: Use aligned MCD.
        if current_mcd > mcd_max:
            mcd_max_id = id_name
            mcd_max = current_mcd
        mcd += current_mcd
        all_mcd.append(current_mcd)

        # Compute F0 RMSE over voiced frames only.
        f0_mse = (org_f0 - output_f0)**2
        current_f0_rmse = math.sqrt(
            (f0_mse * org_vuv[:len(output_lf0)]).sum() /
            org_vuv[:len(output_lf0)].sum())
        if math.isnan(current_f0_rmse):
            logging.error("Computed NaN for F0 RMSE for {}.".format(id_name))
        else:
            if current_f0_rmse > f0_rmse_max:
                f0_rmse_max_id = id_name
                f0_rmse_max = current_f0_rmse
            f0_rmse += current_f0_rmse
            all_rmse.append(current_f0_rmse)

        # Compute V/UV error rate.
        num_errors = (org_vuv[:len(output_lf0)] != output_vuv)
        vuv_error_rate_tmp = float(num_errors.sum()) / len(output_lf0)
        if vuv_error_rate_tmp > vuv_error_max:
            vuv_error_max_id = id_name
            vuv_error_max = vuv_error_rate_tmp
        vuv_error_rate += vuv_error_rate_tmp
        all_vuv.append(vuv_error_rate_tmp)

        # Compute aperiodicity distortion.
        org_bap = org_bap[:len(output_bap)]
        if len(output_bap.shape) > 1 and output_bap.shape[1] > 1:
            current_bap_error = metrics.melcd(output_bap,
                                              org_bap)  # TODO: Use aligned MCD?
        else:
            # One-dimensional BAP: RMSE in dB using the MCD scaling constant.
            current_bap_error = math.sqrt(
                ((org_bap - output_bap)**2).mean()) * (10.0 / np.log(10) *
                                                       np.sqrt(2.0))
        if current_bap_error > bap_error_max:
            bap_error_max_id = id_name
            bap_error_max = current_bap_error
        bap_error += current_bap_error
        all_bap_error.append(current_bap_error)

    f0_rmse /= len(dict_outputs_post)
    vuv_error_rate /= len(dict_outputs_post)
    mcd /= len(dict_original_post)
    bap_error /= len(dict_original_post)

    self.logger.info("Worst MCD: {} {:4.2f}dB".format(mcd_max_id, mcd_max))
    self.logger.info("Worst F0 RMSE: {} {:4.2f}Hz".format(
        f0_rmse_max_id, f0_rmse_max))
    self.logger.info("Worst VUV error: {} {:2.2f}%".format(
        vuv_error_max_id, vuv_error_max * 100))
    self.logger.info("Worst BAP error: {} {:4.2f}dB".format(
        bap_error_max_id, bap_error_max))
    self.logger.info(
        "Benchmark score: MCD {:4.2f}dB, F0 RMSE {:4.2f}Hz, "
        "VUV {:2.2f}%, BAP error {:4.2f}dB".format(
            mcd, f0_rmse, vuv_error_rate * 100, bap_error))

    return mcd, f0_rmse, vuv_error_rate, bap_error
def compute_distortions(pred_out_feats, out_feats, lengths, out_scaler, model_config):
    """Compute distortion measures between predicted and ground-truth acoustic features.

    Args:
        pred_out_feats (torch.Tensor): predicted acoustic features
        out_feats (torch.Tensor): ground-truth acoustic features
        lengths (torch.Tensor): lengths of the sequences
        out_scaler (nn.Module): scaler to denormalize features
        model_config (dict): model configuration

    Returns:
        dict: a dict that includes MCD for mgc/bap, V/UV error and F0 RMSE
    """
    out_feats = out_scaler.inverse_transform(out_feats)
    pred_out_feats = out_scaler.inverse_transform(pred_out_feats)

    out_streams = get_static_features(
        out_feats,
        model_config.num_windows,
        model_config.stream_sizes,
        model_config.has_dynamic_features,
    )
    pred_out_streams = get_static_features(
        pred_out_feats,
        model_config.num_windows,
        model_config.stream_sizes,
        model_config.has_dynamic_features,
    )

    assert len(out_streams) >= 4
    mgc, lf0, vuv, bap = (out_streams[0], out_streams[1], out_streams[2],
                          out_streams[3])
    pred_mgc, pred_lf0, pred_vuv, pred_bap = (
        pred_out_streams[0],
        pred_out_streams[1],
        pred_out_streams[2],
        pred_out_streams[3],
    )

    # Binarize V/UV.
    vuv, pred_vuv = (vuv > 0.5).float(), (pred_vuv > 0.5).float()

    dist = {
        # Skip the 0-th (energy) coefficient for MCD.
        "ObjEval_MGC_MCD": metrics.melcd(mgc[:, :, 1:], pred_mgc[:, :, 1:],
                                         lengths=lengths),
        "ObjEval_BAP_MCD": metrics.melcd(bap, pred_bap, lengths=lengths) / 10.0,
        "ObjEval_VUV_ERR": metrics.vuv_error(vuv, pred_vuv, lengths=lengths),
    }

    try:
        f0_mse = metrics.lf0_mean_squared_error(
            lf0, vuv, pred_lf0, pred_vuv, lengths=lengths, linear_domain=True)
        dist["ObjEval_F0_RMSE"] = np.sqrt(f0_mse)
    except ZeroDivisionError:
        # No frames where both streams are voiced; skip the F0 metric.
        pass

    return dist
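The `ZeroDivisionError` handling above reflects that the F0 metric averages only over frames that both streams mark as voiced. A sketch of the equivalent computation (a reading of the metric's documented behaviour, not nnmnkwii's actual code):

import numpy as np

def lf0_rmse_sketch(lf0_ref, vuv_ref, lf0_pred, vuv_pred, linear_domain=True):
    """RMSE between F0 trajectories, evaluated on mutually voiced frames."""
    voiced = (vuv_ref > 0.5) & (vuv_pred > 0.5)
    if not voiced.any():
        raise ZeroDivisionError("no mutually voiced frames")
    a, b = lf0_ref[voiced], lf0_pred[voiced]
    if linear_domain:
        a, b = np.exp(a), np.exp(b)  # log-F0 back to Hz
    return float(np.sqrt(np.mean((a - b) ** 2)))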