def main():
    """Warp the coded spectra of test utterances with a range of constant alphas.

    For every id in the hard-coded id list and every alpha in
    np.arange(-0.15, 0.2, 0.05), the normalised coded spectral features are
    duplicated along a batch dimension and fed through a WarpingLayer together
    with a constant alpha vector. The sum of the output is back-propagated,
    the warped features are written to a per-alpha output directory, and the
    sample is synthesised with WORLD. The accumulated forward/backward time
    is printed at the end.

    :raises ValueError: If the de-normalised warped features contain NaN values.
    """
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = "English"
    hparams.model_name = "WarpingLayerTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    # hparams.num_questions = 505
    hparams.num_questions = 425
    hparams.out_dir = "experiments/" + hparams.voice + "/VTLNArtificiallyWarped/"
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "warping_layer_test"
    hparams.synth_dir = hparams.out_dir
    batch_size = 2
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    # NOTE(review): trainer is not used below; the construction is kept because
    # it may have side effects on hparams - TODO confirm.
    trainer = AcousticModelTrainer("experiments/" + hparams.voice + "/WORLD",
                                   "experiments/" + hparams.voice + "/questions",
                                   "ignored",
                                   hparams.num_questions,
                                   hparams)

    # Number of leading feature columns which belong to the coded spectrum
    # (static features plus deltas and double deltas when add_deltas is set).
    num_sp_feats = hparams.num_coded_sps * (3 if hparams.add_deltas else 1)
    sp_mean = gen_in.norm_params[0][:num_sp_feats]
    sp_std_dev = gen_in.norm_params[1][:num_sp_feats]
    wl = WarpingLayer((hparams.num_coded_sps, ), (hparams.num_coded_sps, ), hparams)
    wl.set_norm_params(sp_mean, sp_std_dev)

    # id_list = ["dorian/doriangray_16_00199"]
    id_list = ["p225/p225_051"]
    hparams.num_speakers = 1

    t_benchmark = 0
    num_runs = 0  # Count the timed runs explicitly instead of deriving it from a loop index.
    for id_name in id_list:
        for alpha in np.arange(-0.15, 0.2, 0.05):
            out_dir = hparams.out_dir + "alpha_{0:0.2f}/".format(alpha)
            makedirs_safe(out_dir)

            sample = WorldFeatLabelGen.load_sample(
                id_name,
                os.path.join("experiments", hparams.voice, "WORLD"),
                add_deltas=True,
                num_coded_sps=hparams.num_coded_sps)
            sample_pre = gen_in.preprocess_sample(sample)
            coded_sps = sample_pre[:, :num_sp_feats]
            alpha_vec = np.ones((coded_sps.shape[0], 1)) * alpha

            coded_sps = coded_sps[:len(alpha_vec), None, ...].repeat(batch_size, 1)  # Copy data in batch dimension.
            alpha_vec = alpha_vec[:, None, None].repeat(batch_size, 1)  # Copy data in batch dimension.

            t_start = timer()
            mfcc_warped, (_, nn_alpha) = wl(torch.from_numpy(coded_sps),
                                            None,
                                            (len(coded_sps), ),
                                            (len(coded_sps), ),
                                            alphas=torch.from_numpy(alpha_vec))
            mfcc_warped.sum().backward()
            t_benchmark += timer() - t_start
            num_runs += 1

            assert ((mfcc_warped[:, 0] == mfcc_warped[:, 1]).all())  # Compare results for cloned coded_sps within batch.
            # NOTE(review): alpha comes from np.arange with a float step, so it
            # is likely never exactly 0 and this check is probably dead code.
            if alpha == 0:
                assert ((mfcc_warped == coded_sps).all())  # Compare results for no warping.

            sample_pre[:len(mfcc_warped), :num_sp_feats] = mfcc_warped[:, 0].detach()

            sample_post = gen_in.postprocess_sample(sample_pre)
            # Manually create samples without normalisation but with deltas.
            sample_pre = (sample_pre * gen_in.norm_params[1] + gen_in.norm_params[0]).astype(np.float32)

            if np.isnan(sample_pre).any():
                raise ValueError("Detected nan values in output features for {}.".format(id_name))

            # Save warped features.
            makedirs_safe(os.path.dirname(os.path.join(out_dir, id_name)))
            sample_pre.tofile(os.path.join(out_dir, id_name + WorldFeatLabelGen.ext_deltas))

            hparams.synth_dir = out_dir
            Synthesiser.run_world_synth({id_name: sample_post}, hparams)

    print("Process time for {} runs: {}".format(num_runs, timedelta(seconds=t_benchmark)))
class AcousticModelTrainer(ModelTrainer):
    """
    Implementation of a ModelTrainer for the generation of acoustic data.

    Use question labels as input and WORLD features w/o deltas/double deltas (specified in hparams.add_deltas) as output.
    Synthesize audio from model output with MLPG smoothing.
    """
    logger = logging.getLogger(__name__)

    #########################
    # Default constructor
    #
    def __init__(self, dir_world_features, dir_question_labels, id_list, num_questions, hparams=None):
        """Default constructor.

        :param dir_world_features:    Path to the directory containing the world features.
        :param dir_question_labels:   Path to the directory containing the question labels.
        :param id_list:               List of ids, can contain a speaker directory.
        :param num_questions:         Number of questions in question file.
        :param hparams:               Set of hyper parameters.
        """
        if hparams is None:
            hparams = self.create_hparams()
            hparams.out_dir = os.path.curdir

        # Write missing default parameters.
        if hparams.variable_sequence_length_train is None:
            hparams.variable_sequence_length_train = hparams.batch_size_train > 1
        if hparams.variable_sequence_length_test is None:
            hparams.variable_sequence_length_test = hparams.batch_size_test > 1
        if hparams.synth_dir is None:
            hparams.synth_dir = os.path.join(hparams.out_dir, "synth")

        super(AcousticModelTrainer, self).__init__(id_list, hparams)

        self.InputGen = QuestionLabelGen(dir_question_labels, num_questions)
        self.InputGen.get_normalisation_params(dir_question_labels, hparams.input_norm_params_file_prefix)

        self.OutputGen = WorldFeatLabelGen(dir_world_features,
                                           add_deltas=hparams.add_deltas,
                                           num_coded_sps=hparams.num_coded_sps,
                                           sp_type=hparams.sp_type)
        self.OutputGen.get_normalisation_params(dir_world_features, hparams.output_norm_params_file_prefix)

        self.dataset_train = LabelGensDataset(self.id_list_train,
                                              self.InputGen,
                                              self.OutputGen,
                                              hparams,
                                              match_lengths=True)
        self.dataset_val = LabelGensDataset(self.id_list_val,
                                            self.InputGen,
                                            self.OutputGen,
                                            hparams,
                                            match_lengths=True)

        # Only install defaults where nothing was configured before.
        if self.loss_function is None:
            self.loss_function = torch.nn.MSELoss(reduction='none')

        if hparams.scheduler_type == "default":
            hparams.scheduler_type = "Plateau"
            hparams.add_hparams(plateau_verbose=True)

    @staticmethod
    def create_hparams(hparams_string=None, verbose=False):
        """Create model hyper parameter container. Parse non default from given string.

        :param hparams_string:   String with non-default hyper parameters, passed on to ModelTrainer.create_hparams.
        :param verbose:          If True, log the debug string of the created container.
        :return:                 Hyper parameter container extended by the acoustic model parameters.
        """
        hparams = ModelTrainer.create_hparams(hparams_string, verbose=False)

        hparams.add_hparams(
            num_questions=None,
            question_file=None,  # Used to add labels in plot.
            num_coded_sps=60,
            sp_type="mcep",
            add_deltas=True,
            synth_load_org_sp=False,
            synth_load_org_lf0=False,
            synth_load_org_vuv=False,
            synth_load_org_bap=False)

        if verbose:
            logging.info(hparams.get_debug_string())

        return hparams

    def gen_figure_from_output(self, id_name, label, hidden, hparams):
        """Plot original vs. predicted lf0 and spectrograms for one sample and save the figure.

        :param id_name:   Id of the sample, used to load the original features and to name the figure file.
        :param label:     Network output, post-processed with self.OutputGen before plotting.
        :param hidden:    Unused here.
        :param hparams:   Hyper parameter container.
        """
        labels_post = self.OutputGen.postprocess_sample(label)
        coded_sp, lf0, vuv, bap = WorldFeatLabelGen.convert_to_world_features(
            labels_post,
            contains_deltas=False,
            num_coded_sps=hparams.num_coded_sps)
        lf0, _ = interpolate_lin(lf0)

        # Load original lf0.
        org_labels_post = WorldFeatLabelGen.load_sample(
            id_name,
            self.OutputGen.dir_labels,
            add_deltas=self.OutputGen.add_deltas,
            num_coded_sps=hparams.num_coded_sps)
        original_mgc, original_lf0, original_vuv, *_ = WorldFeatLabelGen.convert_to_world_features(
            org_labels_post,
            contains_deltas=self.OutputGen.add_deltas,
            num_coded_sps=hparams.num_coded_sps)
        original_lf0, _ = interpolate_lin(original_lf0)

        # Get a data plotter.
        grid_idx = 0
        plotter = DataPlotter()
        net_name = os.path.basename(hparams.model_name)
        filename = str(os.path.join(hparams.out_dir, id_name + '.' + net_name))
        plotter.set_title(id_name + ' - ' + net_name)
        plotter.set_num_colors(3)
        # plotter.set_lim(grid_idx=0, ymin=math.log(60), ymax=math.log(250))
        plotter.set_label(grid_idx=grid_idx,
                          xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]',
                          ylabel='log(f0)')

        graphs = list()
        graphs.append((original_lf0, 'Original lf0'))
        graphs.append((lf0, 'PyTorch lf0'))
        plotter.set_data_list(grid_idx=grid_idx, data_list=graphs)
        # Shade the frames where the inverted (predicted / original) V/UV flag is set.
        plotter.set_area_list(grid_idx=grid_idx,
                              area_list=[(np.invert(vuv.astype(bool)), '0.8', 1.0),
                                         (np.invert(original_vuv.astype(bool)), 'red', 0.2)])

        grid_idx += 1
        import librosa
        plotter.set_label(grid_idx=grid_idx,
                          xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]',
                          ylabel='Original spectrogram')
        plotter.set_specshow(grid_idx=grid_idx,
                             spec=librosa.amplitude_to_db(
                                 np.absolute(WorldFeatLabelGen.mcep_to_amp_sp(original_mgc, hparams.synth_fs)),
                                 top_db=None))

        grid_idx += 1
        plotter.set_label(grid_idx=grid_idx,
                          xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]',
                          ylabel='NN spectrogram')
        plotter.set_specshow(grid_idx=grid_idx,
                             spec=librosa.amplitude_to_db(
                                 np.absolute(WorldFeatLabelGen.mcep_to_amp_sp(coded_sp, hparams.synth_fs)),
                                 top_db=None))

        # Annotate the last grid with phonemes when the required information is available.
        if hasattr(hparams, "phoneme_indices") and hparams.phoneme_indices is not None \
                and hasattr(hparams, "question_file") and hparams.question_file is not None:
            questions = QuestionLabelGen.load_sample(
                id_name,
                "experiments/" + hparams.voice + "/questions/",
                num_questions=hparams.num_questions)[:len(lf0)]
            np_phonemes = QuestionLabelGen.questions_to_phonemes(questions,
                                                                 hparams.phoneme_indices,
                                                                 hparams.question_file)
            plotter.set_annotations(grid_idx, np_phonemes)

        plotter.gen_plot()
        plotter.save_to_file(filename + '.Org-PyTorch' + hparams.gen_figure_ext)

    def compute_score(self, dict_outputs_post, dict_hiddens, hparams):
        """Compare post-processed network outputs with the original features loaded from disk.

        Computes per sample the MCD of the coded spectrum (first coefficient
        excluded), the F0 RMSE on frames marked voiced in the original, the V/UV
        error rate, and a BAP distortion. The worst sample of each metric and
        the averages are logged. Samples with a NaN F0 RMSE are reported and
        excluded from the F0 sum, but the sum is still divided by the total
        number of samples.

        :param dict_outputs_post:   Dictionary mapping sample ids to post-processed network outputs.
        :param dict_hiddens:        Unused here.
        :param hparams:             Hyper parameter container, num_coded_sps is read.
        :return:                    Tuple of averages (mcd, f0_rmse, vuv_error_rate, bap_error).
        :raises ZeroDivisionError:  If dict_outputs_post is empty.
        """
        # Get data for comparision.
        # Load with the same delta setting which is used to decode the sample below.
        dict_original_post = dict()
        for id_name in dict_outputs_post.keys():
            dict_original_post[id_name] = WorldFeatLabelGen.load_sample(
                id_name,
                dir_out=self.OutputGen.dir_labels,
                add_deltas=self.OutputGen.add_deltas,
                num_coded_sps=hparams.num_coded_sps)

        f0_rmse = 0.0
        f0_rmse_max_id = "None"
        f0_rmse_max = 0.0
        vuv_error_rate = 0.0
        vuv_error_max_id = "None"
        vuv_error_max = 0.0
        mcd = 0.0
        mcd_max_id = "None"
        mcd_max = 0.0
        bap_error = 0.0
        bap_error_max_id = "None"
        bap_error_max = 0.0

        for id_name, labels in dict_outputs_post.items():
            output_coded_sp, output_lf0, output_vuv, output_bap = self.OutputGen.convert_to_world_features(
                sample=labels,
                contains_deltas=False,
                num_coded_sps=hparams.num_coded_sps)
            output_vuv = output_vuv.astype(bool)

            # Get data for comparision.
            org_coded_sp, org_lf0, org_vuv, org_bap = self.OutputGen.convert_to_world_features(
                sample=dict_original_post[id_name],
                contains_deltas=self.OutputGen.add_deltas,
                num_coded_sps=hparams.num_coded_sps)

            # Compute f0 from lf0.
            org_f0 = np.exp(org_lf0.squeeze())[:len(output_lf0)]  # Fix minor negligible length mismatch.
            output_f0 = np.exp(output_lf0)

            # Compute MCD.
            org_coded_sp = org_coded_sp[:len(output_coded_sp)]
            current_mcd = metrics.melcd(output_coded_sp[:, 1:], org_coded_sp[:, 1:])  # TODO: Use aligned mcd.
            if current_mcd > mcd_max:
                mcd_max_id = id_name
                mcd_max = current_mcd
            mcd += current_mcd

            # Compute RMSE.
            f0_mse = (org_f0 - output_f0)**2
            current_f0_rmse = math.sqrt(
                (f0_mse * org_vuv[:len(output_lf0)]).sum() / org_vuv[:len(output_lf0)].sum())
            if current_f0_rmse != current_f0_rmse:  # NaN is the only value unequal to itself.
                self.logger.error("Computed NaN for F0 RMSE for {}.".format(id_name))
            else:
                if current_f0_rmse > f0_rmse_max:
                    f0_rmse_max_id = id_name
                    f0_rmse_max = current_f0_rmse
                f0_rmse += current_f0_rmse

            # Compute error of VUV in percentage.
            num_errors = (org_vuv[:len(output_lf0)] != output_vuv)
            vuv_error_rate_tmp = float(num_errors.sum()) / len(output_lf0)
            if vuv_error_rate_tmp > vuv_error_max:
                vuv_error_max_id = id_name
                vuv_error_max = vuv_error_rate_tmp
            vuv_error_rate += vuv_error_rate_tmp

            # Compute aperiodicity distortion.
            org_bap = org_bap[:len(output_bap)]
            if len(output_bap.shape) > 1 and output_bap.shape[1] > 1:
                current_bap_error = metrics.melcd(output_bap, org_bap)  # TODO: Use aligned mcd?
            else:
                current_bap_error = math.sqrt(
                    ((org_bap - output_bap)**2).mean()) * (10.0 / np.log(10) * np.sqrt(2.0))
            if current_bap_error > bap_error_max:
                bap_error_max_id = id_name
                bap_error_max = current_bap_error
            bap_error += current_bap_error

        f0_rmse /= len(dict_outputs_post)
        vuv_error_rate /= len(dict_outputs_post)
        mcd /= len(dict_original_post)
        bap_error /= len(dict_original_post)

        self.logger.info("Worst MCD: {} {:4.2f}dB".format(mcd_max_id, mcd_max))
        self.logger.info("Worst F0 RMSE: {} {:4.2f}Hz".format(f0_rmse_max_id, f0_rmse_max))
        self.logger.info("Worst VUV error: {} {:2.2f}%".format(vuv_error_max_id, vuv_error_max * 100))
        self.logger.info("Worst BAP error: {} {:4.2f}db".format(bap_error_max_id, bap_error_max))
        self.logger.info(
            "Benchmark score: MCD {:4.2f}dB, F0 RMSE {:4.2f}Hz, VUV {:2.2f}%, BAP error {:4.2f}db"
            .format(mcd, f0_rmse, vuv_error_rate * 100, bap_error))

        return mcd, f0_rmse, vuv_error_rate, bap_error

    def synthesize(self, id_list, synth_output, hparams):
        """
        Depending on hparams override the network output with the extracted features,
        then continue with normal synthesis pipeline.

        :param id_list:        Ids of the samples to synthesise.
        :param synth_output:   Dictionary mapping ids to network outputs, modified in-place when a
                               hparams.synth_load_org_* flag is set.
        :param hparams:        Hyper parameter container.
        """
        if hparams.synth_load_org_sp\
                or hparams.synth_load_org_lf0\
                or hparams.synth_load_org_vuv\
                or hparams.synth_load_org_bap:
            for id_name in id_list:
                world_dir = hparams.world_dir if hasattr(hparams, "world_dir") and hparams.world_dir is not None\
                    else os.path.join(self.OutputGen.dir_labels, self.dir_extracted_acoustic_features)
                labels = WorldFeatLabelGen.load_sample(id_name,
                                                       world_dir,
                                                       num_coded_sps=hparams.num_coded_sps)
                # Trim surplus original frames, split between the two calls
                # (the first with reverse=True, the second without).
                len_diff = len(labels) - len(synth_output[id_name])
                if len_diff > 0:
                    labels = WorldFeatLabelGen.trim_end_sample(labels, int(len_diff / 2), reverse=True)
                    labels = WorldFeatLabelGen.trim_end_sample(labels, len_diff - int(len_diff / 2))

                if hparams.synth_load_org_sp:
                    synth_output[id_name][:len(labels), :self.OutputGen.num_coded_sps] = \
                        labels[:, :self.OutputGen.num_coded_sps]

                # The last three columns are overridden as lf0, vuv, bap
                # (assumes a single bap coefficient - TODO confirm).
                if hparams.synth_load_org_lf0:
                    synth_output[id_name][:len(labels), -3] = labels[:, -3]

                if hparams.synth_load_org_vuv:
                    synth_output[id_name][:len(labels), -2] = labels[:, -2]

                if hparams.synth_load_org_bap:
                    synth_output[id_name][:len(labels), -1] = labels[:, -1]

        # Run the vocoder.
        ModelTrainer.synthesize(self, id_list, synth_output, hparams)
def main():
    """Create samples with artificial alpha for each phoneme.

    The voice name is read from sys.argv[1]. A random alpha in
    [-alpha_range, alpha_range) is drawn per phoneme (fixed seed). For every
    id of the selected speaker in the adapt id list, the normalised coded
    spectrum is warped frame-wise with the alpha of the frame's phoneme, the
    de-normalised result is written to the "cmp_mgc<num_coded_sps>" sub
    directory of the output directory, and the first few samples are
    synthesised. Finally the normalisation files are copied next to the
    warped features.

    :raises ValueError:           If no id of the speaker is found or the warped features contain NaN values.
    :raises NotImplementedError:  If the number of questions matches no known question set.
    """
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = sys.argv[1]
    hparams.model_name = "WarpingLayerTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    alpha_range = 0.2
    num_phonemes = 70
    num_random_alphas = 7
    # num_random_alphas = 53

    # Randomly pick alphas for each phoneme.
    np.random.seed(42)
    # phonemes_to_alpha_tensor = ((np.random.choice(np.random.rand(num_random_alphas), num_phonemes) - 0.5) * 2 * alpha_range)
    phonemes_to_alpha_tensor = ((np.random.rand(num_phonemes) - 0.5) * 2 * alpha_range)

    # hparams.num_questions = 505
    hparams.num_questions = 609
    # hparams.num_questions = 425

    hparams.out_dir = os.path.join("experiments", hparams.voice, "WORLD_artificially_warped")
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "warping_layer_test"
    hparams.synth_dir = hparams.out_dir
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    print(
        "Create artificially warped MGCs for {} in {} for {} questions, {} random alphas, and an alpha range of {}."
        .format(hparams.voice,
                hparams.out_dir,
                hparams.num_questions,
                len(np.unique(phonemes_to_alpha_tensor)),
                alpha_range))

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer(os.path.join("experiments", hparams.voice, "WORLD"),
                                   os.path.join("experiments", hparams.voice, "questions"),
                                   "ignored",
                                   hparams.num_questions,
                                   hparams)

    hparams.num_speakers = 1
    speaker = "p276"
    num_synth_files = 5  # Number of files to synthesise to check warping manually.

    # Number of leading feature columns which belong to the coded spectrum
    # (static features plus deltas and double deltas when add_deltas is set).
    num_sp_feats = hparams.num_coded_sps * (3 if hparams.add_deltas else 1)
    sp_mean = gen_in.norm_params[0][:num_sp_feats]
    sp_std_dev = gen_in.norm_params[1][:num_sp_feats]
    wl = WarpingLayer((hparams.num_coded_sps, ), (hparams.num_coded_sps, ), hparams)
    wl.set_norm_params(sp_mean, sp_std_dev)

    def _question_to_phoneme_index(questions):
        """Helper function to convert questions to their current phoneme index."""
        # The builtin int replaces the np.int alias, which was removed from NumPy.
        if questions.shape[-1] == 505:  # German question set.
            indices = np.arange(86, 347, 5, dtype=int)
        elif questions.shape[-1] == 425:  # English radio question set.
            indices = np.arange(58, 107, dtype=int)
        elif questions.shape[-1] == 609:  # English unilex question set.
            indices = np.arange(92, 162, dtype=int)
        else:
            raise NotImplementedError("Unknown question set with {} questions.".format(questions.shape[-1]))
        return QuestionLabelGen.questions_to_phoneme_indices(questions, indices)

    # with open(os.path.join(hparams.data_dir, "file_id_list_{}_train.txt".format(hparams.voice))) as f:
    id_list_file = os.path.join(hparams.data_dir, "file_id_list_{}_adapt.txt".format(hparams.voice))
    with open(id_list_file) as f:
        # Keep only the ids of the selected speaker and trim line endings.
        id_list = [line.strip(' \t\n\r') for line in f if speaker in line]
    if not id_list:
        # Fail with a clear message instead of a division by zero in the final report.
        raise ValueError("No ids of speaker {} found in {}.".format(speaker, id_list_file))

    out_dir = hparams.out_dir
    dir_cmp = os.path.join(out_dir, "cmp_mgc" + str(hparams.num_coded_sps))
    makedirs_safe(out_dir)
    makedirs_safe(dir_cmp)

    t_benchmark = 0
    org_to_warped_mcd = 0.0
    for idx, id_name in enumerate(id_list):
        sample = WorldFeatLabelGen.load_sample(id_name,
                                               os.path.join("experiments", hparams.voice, "WORLD"),
                                               add_deltas=True,
                                               num_coded_sps=hparams.num_coded_sps)
        sample_pre = gen_in.preprocess_sample(sample)
        coded_sps = sample_pre[:, :num_sp_feats]

        questions = QuestionLabelGen.load_sample(id_name,
                                                 os.path.join("experiments", hparams.voice, "questions"),
                                                 num_questions=hparams.num_questions)
        questions = questions[:len(coded_sps)]
        phoneme_indices = _question_to_phoneme_index(questions)
        # The modulo keeps the lookup within the alpha table.
        alpha_vec = phonemes_to_alpha_tensor[phoneme_indices % len(phonemes_to_alpha_tensor), None]

        coded_sps = coded_sps[:len(alpha_vec), None, ...]  # Create a batch dimension.
        alpha_vec = alpha_vec[:, None, None]  # Create a batch and feature dimension.

        t_start = timer()
        mfcc_warped, (_, nn_alpha) = wl(torch.from_numpy(coded_sps),
                                        None,
                                        (len(coded_sps), ),
                                        (len(coded_sps), ),
                                        alphas=torch.from_numpy(alpha_vec))
        t_benchmark += timer() - t_start

        sample_pre[:len(mfcc_warped), :num_sp_feats] = mfcc_warped[:, 0].detach()

        sample_post = gen_in.postprocess_sample(sample_pre)
        # Manually create samples without normalisation but with deltas.
        sample_pre = (sample_pre * gen_in.norm_params[1] + gen_in.norm_params[0]).astype(np.float32)

        if np.isnan(sample_pre).any():
            raise ValueError("Detected nan values in output features for {}.".format(id_name))

        # Compute error between warped version and original one.
        org_to_warped_mcd += metrics.melcd(sample[:, 0:hparams.num_coded_sps],
                                           sample_pre[:, 0:hparams.num_coded_sps])

        # Save warped features.
        sample_pre.tofile(os.path.join(dir_cmp, os.path.basename(id_name + WorldFeatLabelGen.ext_deltas)))

        hparams.synth_dir = out_dir
        if idx < num_synth_files:  # Only synthesize a few of samples.
            trainer.run_world_synth({id_name: sample_post}, hparams)

    print("Process time for {} warpings: {}. MCD caused by warping: {:.2f}".
          format(len(id_list), timedelta(seconds=t_benchmark), org_to_warped_mcd / len(id_list)))

    # Copy normalisation files which are necessary for training.
    for feature in ["_bap", "_lf0", "_mgc{}".format(hparams.num_coded_sps)]:
        norm_file_name = MeanCovarianceExtractor.file_name_appendix + feature + ".bin"
        shutil.copyfile(os.path.join(gen_in.dir_labels, gen_in.dir_deltas, norm_file_name),
                        os.path.join(dir_cmp, norm_file_name))
def main():
    """Warp the coded spectra of test utterances with an AllPassWarpModel.

    For every id in the hard-coded id list, the normalised coded spectral
    features are duplicated along a batch dimension. For every alpha in
    np.arange(-0.2, 0.21, 0.05) they are fed through an AllPassWarpModel
    together with a constant alpha vector, the sum of the output is
    back-propagated, the warped features are written to a per-alpha output
    directory, and the sample is synthesised with WORLD. The accumulated and
    the average forward/backward time are printed at the end.

    :raises ValueError: If the de-normalised warped features contain NaN values.
    """
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = "English"
    hparams.model_name = "AllPassWarpModelTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    # hparams.num_questions = 505
    hparams.num_questions = 425
    hparams.out_dir = os.path.join("experiments", hparams.voice, "VTLNArtificiallyWarped")
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "all_pass_warp_test"
    hparams.synth_dir = hparams.out_dir
    batch_size = 2
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    # hparams.add_hparam("warp_matrix_size", hparams.num_coded_sps)
    hparams.alpha_ranges = [
        0.2,
    ]

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps,
                               num_bap=hparams.num_bap)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    # NOTE(review): trainer is not used below; the construction is kept because
    # it may have side effects on hparams - TODO confirm.
    trainer = AcousticModelTrainer("experiments/" + hparams.voice + "/WORLD",
                                   "experiments/" + hparams.voice + "/questions",
                                   "ignored",
                                   hparams.num_questions,
                                   hparams)

    # Number of leading feature columns which belong to the coded spectrum
    # (static features plus deltas and double deltas when add_deltas is set).
    num_sp_feats = hparams.num_coded_sps * (3 if hparams.add_deltas else 1)
    sp_mean = gen_in.norm_params[0][:num_sp_feats]
    sp_std_dev = gen_in.norm_params[1][:num_sp_feats]
    all_pass_warp_model = AllPassWarpModel((hparams.num_coded_sps, ), (hparams.num_coded_sps, ), hparams)
    all_pass_warp_model.set_norm_params(sp_mean, sp_std_dev)

    # id_list = ["dorian/doriangray_16_00199"]
    # id_list = ["p225/p225_051", "p277/p277_012", "p278/p278_012", "p279/p279_012"]
    id_list = ["p225/p225_051"]

    t_benchmark = 0
    num_runs = 0  # Count the timed runs explicitly instead of deriving it from a loop index.
    for id_name in id_list:
        sample = WorldFeatLabelGen.load_sample(id_name,
                                               os.path.join("experiments", hparams.voice, "WORLD"),
                                               add_deltas=True,
                                               num_coded_sps=hparams.num_coded_sps,
                                               num_bap=hparams.num_bap,
                                               sp_type=hparams.sp_type)
        sample_pre = gen_in.preprocess_sample(sample)
        # Work on a copy because sample_pre is overwritten with the warped features below.
        coded_sps = sample_pre[:, :num_sp_feats].copy()
        coded_sps = coded_sps[:, None, ...].repeat(batch_size, 1)  # Copy data in batch dimension.

        for alpha in np.arange(-0.2, 0.21, 0.05):
            out_dir = os.path.join(hparams.out_dir, "alpha_{0:0.2f}".format(alpha))
            makedirs_safe(out_dir)

            alpha_vec = np.ones((coded_sps.shape[0], 1)) * alpha
            alpha_vec = alpha_vec[:, None].repeat(batch_size, 1)  # Copy data in batch dimension.

            t_start = timer()
            sp_warped, (_, nn_alpha) = all_pass_warp_model(torch.from_numpy(coded_sps.copy()),
                                                           None,
                                                           (len(coded_sps), ),
                                                           (len(coded_sps), ),
                                                           alphas=torch.tensor(alpha_vec, requires_grad=True))
            sp_warped.sum().backward()
            t_benchmark += timer() - t_start
            num_runs += 1
            # assert((mfcc_warped[:, 0] == mfcc_warped[:, 1]).all())  # Compare results for cloned coded_sps within batch.
            if np.isclose(alpha, 0):
                assert np.isclose(sp_warped.detach().cpu().numpy(), coded_sps).all()  # Compare no warping results.

            sample_pre[:len(sp_warped), :num_sp_feats] = sp_warped[:, 0].detach()

            sample_post = gen_in.postprocess_sample(sample_pre, apply_mlpg=False)
            # Manually create samples without normalisation but with deltas.
            sample_pre_with_deltas = (sample_pre * gen_in.norm_params[1] + gen_in.norm_params[0]).astype(np.float32)

            if np.isnan(sample_pre_with_deltas).any():
                raise ValueError("Detected nan values in output features for {}.".format(id_name))

            # Save warped features.
            makedirs_safe(os.path.dirname(os.path.join(out_dir, id_name)))
            sample_pre_with_deltas.tofile(os.path.join(out_dir, id_name + "." + WorldFeatLabelGen.ext_deltas))

            hparams.synth_dir = out_dir
            # sample_no_deltas = WorldFeatLabelGen.convert_from_world_features(*WorldFeatLabelGen.convert_to_world_features(sample, contains_deltas=hparams.add_deltas, num_coded_sps=hparams.num_coded_sps, num_bap=hparams.num_bap))
            Synthesiser.run_world_synth({id_name: sample_post}, hparams)

    total_time = timedelta(seconds=t_benchmark)
    # Guard the average against a division by zero when nothing was processed.
    print("Process time for {} runs: {}, average: {}".format(num_runs,
                                                             total_time,
                                                             total_time / max(num_runs, 1)))