def _get_acoustic_trainer(self, hparams):
    dir_world_features = os.path.join("integration", "fixtures", "WORLD")
    dir_question_labels = os.path.join("integration", "fixtures", "questions")
    return AcousticModelTrainer(dir_world_features, dir_question_labels,
                                self.id_list, hparams.num_questions, hparams)
def test_gen_figure(self):
    num_test_files = 2

    hparams = self._get_hparams()
    hparams.out_dir = os.path.join(hparams.out_dir,
                                   "test_gen_figure")  # Add function name to path.
    hparams.model_name = "test_model_in409_out67.nn"
    hparams.model_path = os.path.join("integration", "fixtures",
                                      hparams.model_name)

    trainer = AcousticModelTrainer(self.dir_world_features,
                                   self.dir_question_labels, self.id_list,
                                   hparams.num_questions, hparams)
    trainer.init(hparams)

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning,
                                module="matplotlib")
        trainer.gen_figure(hparams, self.id_list[:num_test_files])

    # Check number of created files.
    found_files = [
        name for name in os.listdir(hparams.out_dir)
        if os.path.isfile(os.path.join(hparams.out_dir, name))
        and name.endswith(hparams.model_name + ".Org-PyTorch"
                          + hparams.gen_figure_ext)
    ]
    self.assertEqual(
        len(self.id_list[:num_test_files]), len(found_files),
        msg="Number of {} files in out_dir directory does not match.".format(
            hparams.gen_figure_ext))

    shutil.rmtree(hparams.out_dir)
def test_synth_wav(self):
    num_test_files = 2

    hparams = self._get_hparams()
    hparams.out_dir = os.path.join(hparams.out_dir,
                                   "test_synth_wav")  # Add function name to path.
    hparams.model_name = "test_model_in409_out67"
    hparams.model_path = os.path.join("integration", "fixtures",
                                      hparams.model_name, hparams.networks_dir)
    hparams.synth_fs = 16000
    hparams.frame_size_ms = 5
    hparams.synth_ext = "wav"
    hparams.synth_load_org_sp = True
    hparams.synth_load_org_lf0 = True
    hparams.synth_load_org_vuv = True
    hparams.synth_load_org_bap = True

    trainer = AcousticModelTrainer(**AcousticModelTrainer.legacy_support_init(
        self.dir_world_features, self.dir_question_labels, self.id_list,
        hparams.num_questions, hparams))
    trainer.init(hparams)

    hparams.synth_dir = os.path.join(hparams.out_dir, hparams.model_name)
    trainer.synth(hparams, self.id_list[:num_test_files])

    # Check number of created files.
    found_files = [
        name for name in os.listdir(hparams.synth_dir)
        if os.path.isfile(os.path.join(hparams.synth_dir, name))
        and name.endswith("_WORLD." + hparams.synth_ext)
    ]
    self.assertEqual(
        len(self.id_list[:num_test_files]), len(found_files),
        msg="Number of {} files in synth_dir directory does not match.".format(
            hparams.synth_ext))

    # Check readability and length of one created file.
    raw, fs = soundfile.read(os.path.join(hparams.synth_dir, found_files[0]))
    self.assertEqual(hparams.synth_fs, fs,
                     msg="Desired sampling frequency of output does not match.")
    labels = trainer.datareaders["acoustic_features"][
        [id_name for id_name in self.id_list[:num_test_files]
         if id_name in found_files[0]][0]]
    # Audio samples -> seconds -> number of frames at frame_size_ms.
    expected_length = len(raw) / hparams.synth_fs / hparams.frame_size_ms * 1000
    self.assertTrue(
        abs(expected_length - len(labels["acoustic_features"])) < 10,
        msg="Saved raw audio file length does not roughly match the length "
            "of the labels.")

    shutil.rmtree(hparams.out_dir)
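# Hedged side note (not part of the original tests): expected_length above
# converts the audio length to a frame count: samples / fs gives seconds,
# and seconds * 1000 / frame_size_ms gives frames. A minimal, self-contained
# sanity check of that arithmetic with assumed example values:
def _expected_num_frames(num_samples, fs, frame_size_ms):
    """Return the number of fixed-size frames covering num_samples of audio."""
    return num_samples / fs / frame_size_ms * 1000

# One second of 16 kHz audio at 5 ms frames corresponds to 200 frames.
assert _expected_num_frames(num_samples=16000, fs=16000, frame_size_ms=5) == 200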
def test_init(self):
    hparams = self._get_hparams()
    hparams.out_dir = os.path.join(hparams.out_dir,
                                   "test_init")  # Add function name to path.

    trainer = AcousticModelTrainer(self.dir_world_features,
                                   self.dir_question_labels, self.id_list,
                                   hparams.num_questions, hparams)
    trainer.init(hparams)

    shutil.rmtree(hparams.out_dir)
def test_benchmark(self):
    hparams = self._get_hparams()
    hparams.out_dir = os.path.join(hparams.out_dir,
                                   "test_benchmark")  # Add function name to path.
    hparams.seed = 1

    trainer = AcousticModelTrainer(self.dir_world_features,
                                   self.dir_question_labels, self.id_list,
                                   hparams.num_questions, hparams)
    trainer.init(hparams)

    scores = trainer.benchmark(hparams)
    numpy.testing.assert_almost_equal((8.616, 78.4, 0.609, 37.352), scores, 3,
                                      "Wrong benchmark score.")

    shutil.rmtree(hparams.out_dir)
def generate_audio_features(self, id_list, hparams):
    # TODO: This function is untested.
    """
    Generate mgc, vuv and bap data with an acoustic model.
    The path to the acoustic model is saved in
    hparams.synth_acoustic_model_path, which is given in the constructor.
    If hparams.synth_acoustic_model_path is None, this method is not
    called; instead the load_extracted_audio_features method is used,
    which reloads the original features extracted from the audio. If you
    want to generate audio directly from the wcad atom extraction,
    uncomment the first block in the get_recon_from_synth_output method.

    Detailed execution process:
    This method reuses the synth method of the ModelTrainer base class.
    It overwrites the internal f_synthesize method and the OutputGen to
    accomplish the audio generation; both are restored after the
    generation finishes. The base class synth method loads the acoustic
    model network by its name and forwards the question labels of each
    utterance in the id_list through it. At the end, synth calls the
    f_synthesize method. Here, f_synthesize is overwritten by
    save_audio_features, which saves the generated mgc, vuv and bap files
    in the self.synth_dir folder.
    """
    self.logger.info("Generate mgc, vuv and bap with "
                     + hparams.synth_acoustic_model_path)

    acoustic_model_hparams = AcousticModelTrainer.create_hparams()
    acoustic_model_hparams.model_name = os.path.basename(
        hparams.synth_acoustic_model_path)
    acoustic_model_hparams.model_path = hparams.synth_acoustic_model_path
    acoustic_model_handler = AcousticModelTrainer(acoustic_model_hparams)

    org_model_handler = self.model_handler
    self.model_handler = acoustic_model_handler

    # Switch the f_synthesize method and the OutputGen for mgc, vuv and
    # bap creation. f_synthesize is called at the end of synth.
    self.f_synthesize = self.save_audio_features
    org_output_gen = self.OutputGen
    self.OutputGen = self.AudioGen

    # Explicitly synthesise with the acoustic model. synth calls
    # f_synthesize at the end, which saves the mgc, vuv and bap.
    self.synth(hparams, id_list)

    # Switch back to atom creation.
    self.f_synthesize = self.synthesize
    self.OutputGen = org_output_gen
    self.model_handler = org_model_handler
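# Hedged sketch (not part of the original code base): the swap-and-restore
# of f_synthesize/OutputGen/model_handler above never restores the originals
# if self.synth raises. A generic try/finally helper makes the restore
# unconditional; swapped_attributes is a hypothetical name for illustration.
from contextlib import contextmanager

@contextmanager
def swapped_attributes(obj, **temporary):
    """Temporarily set attributes on obj and restore the originals on exit."""
    originals = {name: getattr(obj, name) for name in temporary}
    for name, value in temporary.items():
        setattr(obj, name, value)
    try:
        yield obj
    finally:
        for name, value in originals.items():
            setattr(obj, name, value)

# Usage in the spirit of generate_audio_features:
# with swapped_attributes(self, f_synthesize=self.save_audio_features,
#                         OutputGen=self.AudioGen,
#                         model_handler=acoustic_model_handler):
#     self.synth(hparams, id_list)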
def test_train(self):
    hparams = self._get_hparams()
    hparams.out_dir = os.path.join(hparams.out_dir,
                                   "test_train")  # Add function name to path.
    hparams.seed = 1234
    hparams.use_best_as_final_model = False

    trainer = AcousticModelTrainer(self.dir_world_features,
                                   self.dir_question_labels, self.id_list,
                                   hparams.num_questions, hparams)
    trainer.init(hparams)
    _, all_loss_train, _ = trainer.train(hparams)

    # Training loss decreases?
    self.assertLess(all_loss_train[-1],
                    all_loss_train[1 if hparams.start_with_test else 0],
                    msg="Loss did not decrease over {} epochs".format(
                        hparams.epochs))

    shutil.rmtree(hparams.out_dir)
def main():
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer \
        import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = "English"
    hparams.model_name = "WarpingLayerTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    # hparams.num_questions = 505
    hparams.num_questions = 425
    hparams.out_dir = "experiments/" + hparams.voice + "/VTLNArtificiallyWarped/"
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "warping_layer_test"
    hparams.synth_dir = hparams.out_dir
    batch_size = 2
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer("experiments/" + hparams.voice + "/WORLD",
                                   "experiments/" + hparams.voice + "/questions",
                                   "ignored", hparams.num_questions, hparams)

    sp_mean = gen_in.norm_params[0][:hparams.num_coded_sps
                                    * (3 if hparams.add_deltas else 1)]
    sp_std_dev = gen_in.norm_params[1][:hparams.num_coded_sps
                                       * (3 if hparams.add_deltas else 1)]
    wl = WarpingLayer((hparams.num_coded_sps,), (hparams.num_coded_sps,),
                      hparams)
    wl.set_norm_params(sp_mean, sp_std_dev)

    # id_list = ["dorian/doriangray_16_00199"]
    id_list = ["p225/p225_051"]
    hparams.num_speakers = 1

    t_benchmark = 0
    for id_name in id_list:
        for idx, alpha in enumerate(np.arange(-0.15, 0.2, 0.05)):
            out_dir = hparams.out_dir + "alpha_{0:0.2f}/".format(alpha)
            makedirs_safe(out_dir)

            sample = WorldFeatLabelGen.load_sample(
                id_name,
                os.path.join("experiments", hparams.voice, "WORLD"),
                add_deltas=True,
                num_coded_sps=hparams.num_coded_sps)
            sample_pre = gen_in.preprocess_sample(sample)
            coded_sps = sample_pre[:, :hparams.num_coded_sps
                                   * (3 if hparams.add_deltas else 1)]

            alpha_vec = np.ones((coded_sps.shape[0], 1)) * alpha

            coded_sps = coded_sps[:len(alpha_vec), None, ...].repeat(
                batch_size, 1)  # Copy data in batch dimension.
            alpha_vec = alpha_vec[:, None, None].repeat(
                batch_size, 1)  # Copy data in batch dimension.

            t_start = timer()
            mfcc_warped, (_, nn_alpha) = wl(torch.from_numpy(coded_sps),
                                            None,
                                            (len(coded_sps),),
                                            (len(coded_sps),),
                                            alphas=torch.from_numpy(alpha_vec))
            mfcc_warped.sum().backward()
            t_benchmark += timer() - t_start

            # Compare results for cloned coded_sps within the batch.
            assert (mfcc_warped[:, 0] == mfcc_warped[:, 1]).all()
            # np.isclose guards against float accumulation in np.arange,
            # which never yields an exact 0.0 here.
            if np.isclose(alpha, 0):
                # Compare results for no warping.
                assert (mfcc_warped == coded_sps).all()

            sample_pre[:len(mfcc_warped), :hparams.num_coded_sps
                       * (3 if hparams.add_deltas else 1)] \
                = mfcc_warped[:, 0].detach()

            sample_post = gen_in.postprocess_sample(sample_pre)
            # Manually create samples without normalisation but with deltas.
            sample_pre = (sample_pre * gen_in.norm_params[1]
                          + gen_in.norm_params[0]).astype(np.float32)

            if np.isnan(sample_pre).any():
                raise ValueError(
                    "Detected nan values in output features for {}.".format(
                        id_name))

            # Save warped features.
            makedirs_safe(os.path.dirname(os.path.join(out_dir, id_name)))
            sample_pre.tofile(os.path.join(
                out_dir, id_name + WorldFeatLabelGen.ext_deltas))

            hparams.synth_dir = out_dir
            Synthesiser.run_world_synth({id_name: sample_post}, hparams)

    print("Process time for {} runs: {}".format(
        len(id_list) * (idx + 1),  # idx is 0-based, so idx + 1 alphas per file.
        timedelta(seconds=t_benchmark)))
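# Hedged side note (toy shapes, not from the script above): the batch
# duplication via [:, None, ...].repeat(batch_size, 1) inserts a batch axis
# and clones the utterance along it, which is what the equality assert on
# mfcc_warped[:, 0] and mfcc_warped[:, 1] relies on.
import numpy as np

frames = np.arange(12, dtype=np.float32).reshape(4, 3)  # (T=4, D=3)
batched = frames[:, None, ...].repeat(2, 1)             # (T=4, B=2, D=3)
assert batched.shape == (4, 2, 3)
assert (batched[:, 0] == batched[:, 1]).all()           # Both batch entries identical.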
def main():
    """Create samples with an artificial alpha for each phoneme."""
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer \
        import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = sys.argv[1]
    hparams.model_name = "WarpingLayerTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    alpha_range = 0.2
    num_phonemes = 70
    num_random_alphas = 7
    # num_random_alphas = 53

    # Randomly pick alphas for each phoneme.
    np.random.seed(42)
    # phonemes_to_alpha_tensor = ((np.random.choice(np.random.rand(num_random_alphas), num_phonemes) - 0.5) * 2 * alpha_range)
    phonemes_to_alpha_tensor = (np.random.rand(num_phonemes) - 0.5) * 2 * alpha_range

    # hparams.num_questions = 505
    hparams.num_questions = 609
    # hparams.num_questions = 425

    hparams.out_dir = os.path.join("experiments", hparams.voice,
                                   "WORLD_artificially_warped")
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "warping_layer_test"
    hparams.synth_dir = hparams.out_dir
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    print("Create artificially warped MGCs for {} in {} for {} questions, "
          "{} random alphas, and an alpha range of {}.".format(
              hparams.voice, hparams.out_dir, hparams.num_questions,
              len(np.unique(phonemes_to_alpha_tensor)), alpha_range))

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer(
        os.path.join("experiments", hparams.voice, "WORLD"),
        os.path.join("experiments", hparams.voice, "questions"),
        "ignored", hparams.num_questions, hparams)

    hparams.num_speakers = 1
    speaker = "p276"
    num_synth_files = 5  # Number of files to synthesise to check the warping manually.

    sp_mean = gen_in.norm_params[0][:hparams.num_coded_sps
                                    * (3 if hparams.add_deltas else 1)]
    sp_std_dev = gen_in.norm_params[1][:hparams.num_coded_sps
                                       * (3 if hparams.add_deltas else 1)]
    wl = WarpingLayer((hparams.num_coded_sps,), (hparams.num_coded_sps,),
                      hparams)
    wl.set_norm_params(sp_mean, sp_std_dev)

    def _question_to_phoneme_index(questions):
        """Helper function to convert questions to their current phoneme index."""
        # np.int was removed from NumPy; the built-in int is used instead.
        if questions.shape[-1] == 505:  # German question set.
            indices = np.arange(86, 347, 5, dtype=int)
        elif questions.shape[-1] == 425:  # English radio question set.
            indices = np.arange(58, 107, dtype=int)
        elif questions.shape[-1] == 609:  # English unilex question set.
            indices = np.arange(92, 162, dtype=int)
        else:
            raise NotImplementedError(
                "Unknown question set with {} questions.".format(
                    questions.shape[-1]))
        return QuestionLabelGen.questions_to_phoneme_indices(questions,
                                                             indices)

    # with open(os.path.join(hparams.data_dir, "file_id_list_{}_train.txt".format(hparams.voice))) as f:
    with open(os.path.join(hparams.data_dir,
                           "file_id_list_{}_adapt.txt".format(
                               hparams.voice))) as f:
        id_list = f.readlines()
    # Trim line endings in-place and keep only the selected speaker's utterances.
    id_list[:] = [s.strip(' \t\n\r') for s in id_list if speaker in s]
    out_dir = hparams.out_dir
    makedirs_safe(out_dir)
    makedirs_safe(os.path.join(out_dir, "cmp_mgc" + str(hparams.num_coded_sps)))

    t_benchmark = 0
    org_to_warped_mcd = 0.0
    for idx, id_name in enumerate(id_list):
        sample = WorldFeatLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "WORLD"),
            add_deltas=True,
            num_coded_sps=hparams.num_coded_sps)
        sample_pre = gen_in.preprocess_sample(sample)
        coded_sps = sample_pre[:, :hparams.num_coded_sps
                               * (3 if hparams.add_deltas else 1)]

        questions = QuestionLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "questions"),
            num_questions=hparams.num_questions)
        questions = questions[:len(coded_sps)]
        phoneme_indices = _question_to_phoneme_index(questions)
        alpha_vec = phonemes_to_alpha_tensor[
            phoneme_indices % len(phonemes_to_alpha_tensor), None]

        coded_sps = coded_sps[:len(alpha_vec), None, ...]  # Create a batch dimension.
        alpha_vec = alpha_vec[:, None, None]  # Create a batch and feature dimension.

        t_start = timer()
        mfcc_warped, (_, nn_alpha) = wl(torch.from_numpy(coded_sps),
                                        None,
                                        (len(coded_sps),),
                                        (len(coded_sps),),
                                        alphas=torch.from_numpy(alpha_vec))
        t_benchmark += timer() - t_start

        sample_pre[:len(mfcc_warped), :hparams.num_coded_sps
                   * (3 if hparams.add_deltas else 1)] \
            = mfcc_warped[:, 0].detach()

        sample_post = gen_in.postprocess_sample(sample_pre)
        # Manually create samples without normalisation but with deltas.
        sample_pre = (sample_pre * gen_in.norm_params[1]
                      + gen_in.norm_params[0]).astype(np.float32)

        if np.isnan(sample_pre).any():
            raise ValueError(
                "Detected nan values in output features for {}.".format(
                    id_name))

        # Compute the error between the warped version and the original one.
        org_to_warped_mcd += metrics.melcd(
            sample[:, 0:hparams.num_coded_sps],
            sample_pre[:, 0:hparams.num_coded_sps])

        # Save warped features.
        sample_pre.tofile(os.path.join(
            out_dir, "cmp_mgc" + str(hparams.num_coded_sps),
            os.path.basename(id_name + WorldFeatLabelGen.ext_deltas)))

        hparams.synth_dir = out_dir
        if idx < num_synth_files:  # Only synthesise a few of the samples.
            trainer.run_world_synth({id_name: sample_post}, hparams)

    print("Process time for {} warpings: {}. MCD caused by warping: {:.2f}"
          .format(len(id_list), timedelta(seconds=t_benchmark),
                  org_to_warped_mcd / len(id_list)))

    # Copy the normalisation files which are necessary for training.
    for feature in ["_bap", "_lf0", "_mgc{}".format(hparams.num_coded_sps)]:
        shutil.copyfile(
            os.path.join(gen_in.dir_labels, gen_in.dir_deltas,
                         MeanCovarianceExtractor.file_name_appendix
                         + feature + ".bin"),
            os.path.join(out_dir, "cmp_mgc" + str(hparams.num_coded_sps),
                         MeanCovarianceExtractor.file_name_appendix
                         + feature + ".bin"))
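# Hedged side note (toy numbers, not from the script above): the per-frame
# alpha lookup boils down to NumPy fancy indexing into a fixed table, with a
# modulo guard in case a phoneme index exceeds the table size.
import numpy as np

phoneme_alphas = np.array([0.1, -0.05, 0.2])   # One alpha per phoneme class.
frame_phonemes = np.array([0, 0, 1, 2, 2, 5])  # Per-frame phoneme indices.
alpha_vec = phoneme_alphas[frame_phonemes % len(phoneme_alphas), None]
assert alpha_vec.shape == (6, 1)               # One alpha column per frame.
assert alpha_vec[-1, 0] == 0.2                 # 5 % 3 == 2 wraps into the table.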
def run_DM_AM(hparams, input_strings):
    """
    A function for TTS with a pre-trained duration and acoustic model.

    :param hparams: Hyper-parameter container. The following parameters are used:
        front_end: Full path to the makeLabels.sh script in
            scripts/tts_frontend; depends on the language.
        festival_dir: Full path to the directory with the festival bin/ folder.
        front_end_accent (optional): Accent passed to the front end, used in
            tts_frontend.
        duration_labels_dir: Full path to the folder containing the
            normalisation parameters used to train the duration model.
        file_symbol_dict: A file containing all the used phonemes (was used
            to train the duration model, usually mono_phone.list).
        duration_model: Full path to the pre-trained duration model.
        num_phoneme_states: Number of states per phoneme; the duration model
            predicts a duration for each of them.
        question_file: Full path to the question file used to train the
            acoustic model.
        question_labels_norm_file: Full path to the normalisation file of the
            questions used to train the acoustic model.
        num_questions: Number of questions, which is the input dimension of
            the acoustic model.
        acoustic_model: Full path to the acoustic model.
    :param input_strings: List of strings to synthesise.
    :return:
    """
    # Create a temporary directory to store all files.
    with tempfile.TemporaryDirectory() as tmp_dir_name:
        # tmp_dir_name = os.path.realpath("TMP")
        # makedirs_safe(tmp_dir_name)
        hparams.out_dir = tmp_dir_name
        print("Created temporary directory", tmp_dir_name)
        id_list = ["synth" + str(idx) for idx in range(len(input_strings))]

        # Write the text to synthesise into a single synth.txt file with ids.
        utts_file = os.path.join(tmp_dir_name, "synth.txt")
        with open(utts_file, "w") as text_file:
            for idx, text in enumerate(input_strings):
                # TODO: Remove parentheses etc.
                text_file.write("synth{}\t{}\n".format(idx, text))

        # Call the front end on the synth.txt file.
        front_end_arguments = [hparams.front_end, hparams.festival_dir,
                               utts_file]
        if hasattr(hparams, "front_end_accent") \
                and hparams.front_end_accent is not None:
            front_end_arguments.append(hparams.front_end_accent)
        front_end_arguments.append(tmp_dir_name)
        subprocess.check_call(front_end_arguments)

        # Remove durations from the mono labels.
        dir_mono_no_align = os.path.join(tmp_dir_name, "mono_no_align")
        dir_mono = os.path.join(tmp_dir_name, "labels", "mono")
        if os.path.isdir(dir_mono_no_align):
            shutil.rmtree(dir_mono_no_align)
        os.rename(dir_mono, dir_mono_no_align)
        for id_name in id_list:
            with open(os.path.join(dir_mono_no_align, id_name + ".lab"),
                      "r") as f:
                old = f.read()
                monophones = old.split()[2::3]
            with open(os.path.join(dir_mono_no_align, id_name + ".lab"),
                      "w") as f:
                f.write("\n".join(monophones))

        # Run the duration model.
        hparams.batch_size_test = len(input_strings)
        hparams.test_set_perc = 0.0
        hparams.val_set_perc = 0.0
        hparams.phoneme_label_type = "mono_no_align"
        hparams.output_norm_params_file_prefix = \
            hparams.duration_norm_file_name \
            if hasattr(hparams, "duration_norm_file_name") else None
        duration_model_trainer = DurationModelTrainer(
            os.path.join(tmp_dir_name, "mono_no_align"),
            hparams.duration_labels_dir, id_list,
            hparams.file_symbol_dict, hparams)
        assert hparams.duration_model is not None, \
            "Path to the duration model in hparams.duration_model is needed."
        hparams.model_path = hparams.duration_model
        hparams.model_name = os.path.basename(hparams.duration_model)

        # Predict durations. Durations are already converted to multiples
        # of hparams.min_phoneme_length.
        hparams.load_from_checkpoint = True
        duration_model_trainer.init(hparams)
        _, output_dict_post = duration_model_trainer.forward(hparams, id_list)
        hparams.output_norm_params_file_prefix = None  # Reset again.

        # Write durations to the full labels.
        dir_full = os.path.join(tmp_dir_name, "labels", "full")
        dir_label_state_align = os.path.join(tmp_dir_name, "labels",
                                             "label_state_align")
        makedirs_safe(dir_label_state_align)

        for id_name in id_list:
            with open(os.path.join(dir_full, id_name + ".lab"), "r") as f:
                full = f.read().split()[2::3]
            with open(os.path.join(dir_label_state_align, id_name + ".lab"),
                      "w") as f:
                current_time = 0
                timings = output_dict_post[id_name]
                for idx, monophone in enumerate(full):
                    for state in range(hparams.num_phoneme_states):
                        next_time = current_time + int(timings[idx, state])
                        f.write("{}\t{}\t{}[{}]\n".format(
                            current_time, next_time, monophone, state + 2))
                        current_time = next_time

        # Generate questions from the HTK full labels.
        QuestionLabelGen.gen_data(dir_label_state_align,
                                  hparams.question_file,
                                  dir_out=tmp_dir_name,
                                  file_id_list="synth",
                                  id_list=id_list,
                                  return_dict=False)

        # Run the acoustic model and synthesise.
        # Get the normalisation parameters into the same directory.
        shutil.copy2(hparams.question_labels_norm_file,
                     os.path.join(tmp_dir_name, "min-max.bin"))
        acoustic_model_trainer = AcousticModelTrainer(
            hparams.world_features_dir, tmp_dir_name, id_list,
            hparams.num_questions, hparams)
        assert hparams.acoustic_model is not None, \
            "Path to the acoustic model in hparams.acoustic_model is needed."
        hparams.model_path = hparams.acoustic_model
        hparams.model_name = os.path.basename(hparams.acoustic_model)
        hparams.load_from_checkpoint = True
        acoustic_model_trainer.init(hparams)
        hparams.model_name = ""  # No suffix in synthesised files.
        _, output_dict_post = acoustic_model_trainer.synth(hparams, id_list)

        logging.info("Synthesized files are in {}.".format(hparams.synth_dir))

    return 0
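# Hedged usage sketch for run_DM_AM. All paths and values below are
# placeholders, not from the repository, and creating the container via
# DurationModelTrainer.create_hparams() is an assumption based on the
# create_hparams() calls of the other trainers; only the attribute names
# come from the docstring above.
def _example_run_DM_AM():
    hparams = DurationModelTrainer.create_hparams()  # Assumed factory method.
    hparams.front_end = "scripts/tts_frontend/makeLabels.sh"  # Placeholder.
    hparams.festival_dir = "/opt/festival"  # Placeholder.
    hparams.duration_labels_dir = "experiments/English/durations"  # Placeholder.
    hparams.file_symbol_dict = "experiments/English/mono_phone.list"  # Placeholder.
    hparams.duration_model = "experiments/English/nn/DM.nn"  # Placeholder.
    hparams.num_phoneme_states = 5  # One predicted duration per state.
    hparams.question_file = "questions/questions.hed"  # Placeholder.
    hparams.question_labels_norm_file = "experiments/English/min-max.bin"  # Placeholder.
    hparams.num_questions = 425  # Must match the acoustic model input dimension.
    hparams.world_features_dir = "experiments/English/WORLD"  # Placeholder.
    hparams.acoustic_model = "experiments/English/nn/AM.nn"  # Placeholder.
    return run_DM_AM(hparams, ["This is a test sentence."])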
def main():
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer \
        import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = "English"
    hparams.model_name = "AllPassWarpModelTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    # hparams.num_questions = 505
    hparams.num_questions = 425
    hparams.out_dir = os.path.join("experiments", hparams.voice,
                                   "VTLNArtificiallyWarped")
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "all_pass_warp_test"
    hparams.synth_dir = hparams.out_dir
    batch_size = 2
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    # hparams.add_hparam("warp_matrix_size", hparams.num_coded_sps)
    hparams.alpha_ranges = [0.2, ]

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps,
                               num_bap=hparams.num_bap)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer("experiments/" + hparams.voice + "/WORLD",
                                   "experiments/" + hparams.voice + "/questions",
                                   "ignored", hparams.num_questions, hparams)

    sp_mean = gen_in.norm_params[0][:hparams.num_coded_sps
                                    * (3 if hparams.add_deltas else 1)]
    sp_std_dev = gen_in.norm_params[1][:hparams.num_coded_sps
                                       * (3 if hparams.add_deltas else 1)]
    all_pass_warp_model = AllPassWarpModel((hparams.num_coded_sps,),
                                           (hparams.num_coded_sps,),
                                           hparams)
    all_pass_warp_model.set_norm_params(sp_mean, sp_std_dev)

    # id_list = ["dorian/doriangray_16_00199"]
    # id_list = ["p225/p225_051", "p277/p277_012", "p278/p278_012", "p279/p279_012"]
    id_list = ["p225/p225_051"]

    t_benchmark = 0
    for id_name in id_list:
        sample = WorldFeatLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "WORLD"),
            add_deltas=True,
            num_coded_sps=hparams.num_coded_sps,
            num_bap=hparams.num_bap,
            sp_type=hparams.sp_type)
        sample_pre = gen_in.preprocess_sample(sample)
        coded_sps = sample_pre[:, :hparams.num_coded_sps
                               * (3 if hparams.add_deltas else 1)].copy()
        coded_sps = coded_sps[:, None, ...].repeat(batch_size, 1)  # Copy data in batch dimension.

        for idx, alpha in enumerate(np.arange(-0.2, 0.21, 0.05)):
            out_dir = os.path.join(hparams.out_dir,
                                   "alpha_{0:0.2f}".format(alpha))
            makedirs_safe(out_dir)

            alpha_vec = np.ones((coded_sps.shape[0], 1)) * alpha
            alpha_vec = alpha_vec[:, None].repeat(batch_size, 1)  # Copy data in batch dimension.

            t_start = timer()
            sp_warped, (_, nn_alpha) = all_pass_warp_model(
                torch.from_numpy(coded_sps.copy()),
                None,
                (len(coded_sps),),
                (len(coded_sps),),
                alphas=torch.tensor(alpha_vec, requires_grad=True))
            sp_warped.sum().backward()
            t_benchmark += timer() - t_start

            # assert((sp_warped[:, 0] == sp_warped[:, 1]).all())  # Compare results for cloned coded_sps within batch.
            if np.isclose(alpha, 0):
                # Compare results for no warping.
                assert np.isclose(sp_warped.detach().cpu().numpy(),
                                  coded_sps).all()

            sample_pre[:len(sp_warped), :hparams.num_coded_sps
                       * (3 if hparams.add_deltas else 1)] \
                = sp_warped[:, 0].detach()

            sample_post = gen_in.postprocess_sample(sample_pre,
                                                    apply_mlpg=False)
            # Manually create samples without normalisation but with deltas.
            sample_pre_with_deltas = (sample_pre * gen_in.norm_params[1]
                                      + gen_in.norm_params[0]).astype(np.float32)

            if np.isnan(sample_pre_with_deltas).any():
                raise ValueError(
                    "Detected nan values in output features for {}.".format(
                        id_name))

            # Save warped features.
            makedirs_safe(os.path.dirname(os.path.join(out_dir, id_name)))
            sample_pre_with_deltas.tofile(
                os.path.join(out_dir,
                             id_name + "." + WorldFeatLabelGen.ext_deltas))

            hparams.synth_dir = out_dir
            # sample_no_deltas = WorldFeatLabelGen.convert_from_world_features(*WorldFeatLabelGen.convert_to_world_features(sample, contains_deltas=hparams.add_deltas, num_coded_sps=hparams.num_coded_sps, num_bap=hparams.num_bap))
            Synthesiser.run_world_synth({id_name: sample_post}, hparams)

    num_runs = len(id_list) * (idx + 1)  # idx is 0-based, so idx + 1 alphas per file.
    print("Process time for {} runs: {}, average: {}".format(
        num_runs, timedelta(seconds=t_benchmark),
        timedelta(seconds=t_benchmark) / num_runs))
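# Hedged side note (generic PyTorch, not the AllPassWarpModel API): the
# sum().backward() call above checks that gradients flow back through the
# warp to the alphas passed in with requires_grad=True. The same pattern on
# a toy stand-in function:
import torch

alphas = torch.zeros(10, 1, requires_grad=True)  # One alpha per frame.
features = torch.randn(10, 1, 30)                # (T, B, D) toy features.
warped = features * (1.0 + alphas)[..., None]    # Toy stand-in for the warp.
warped.sum().backward()
assert alphas.grad is not None                   # Gradients reached the alphas.
assert alphas.grad.shape == alphas.shape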