def __init__(self, dir_world_features, dir_question_labels, id_list, num_questions, hparams=None):
    """Set up label generators, datasets and loss for acoustic model training.

    :param dir_world_features:   Directory holding the WORLD feature files (network targets).
    :param dir_question_labels:  Directory holding the question label files (network inputs).
    :param id_list:              Ids to work on; entries may include a speaker directory.
    :param num_questions:        Number of questions in the question file.
    :param hparams:              Hyper-parameter container. When omitted, one is created via
                                 ``create_hparams()`` with ``out_dir`` set to the current directory.
    """
    if hparams is None:
        hparams = self.create_hparams()
        hparams.out_dir = os.path.curdir

    # Derive every default that was left unset: variable-length sequences are
    # switched on exactly when the respective batch size exceeds one.
    for split in ("train", "test"):
        flag_name = "variable_sequence_length_" + split
        if getattr(hparams, flag_name) is None:
            setattr(hparams, flag_name, getattr(hparams, "batch_size_" + split) > 1)
    if hparams.synth_dir is None:
        hparams.synth_dir = os.path.join(hparams.out_dir, "synth")

    super(AcousticModelTrainer, self).__init__(id_list, hparams)

    # Inputs: question labels.
    input_gen = QuestionLabelGen(dir_question_labels, num_questions)
    self.InputGen = input_gen
    input_gen.get_normalisation_params(dir_question_labels,
                                       hparams.input_norm_params_file_prefix)

    # Targets: WORLD features.
    output_gen = WorldFeatLabelGen(dir_world_features,
                                   add_deltas=hparams.add_deltas,
                                   num_coded_sps=hparams.num_coded_sps,
                                   sp_type=hparams.sp_type)
    self.OutputGen = output_gen
    output_gen.get_normalisation_params(dir_world_features,
                                        hparams.output_norm_params_file_prefix)

    def _build_dataset(ids):
        # Both splits share generators and hyper-parameters; only the ids differ.
        return LabelGensDataset(ids,
                                self.InputGen,
                                self.OutputGen,
                                hparams,
                                match_lengths=True)

    self.dataset_train = _build_dataset(self.id_list_train)
    self.dataset_val = _build_dataset(self.id_list_val)

    # Keep a loss that was already set on the instance; otherwise use an
    # element-wise (unreduced) MSE.
    if self.loss_function is None:
        self.loss_function = torch.nn.MSELoss(reduction='none')

    if hparams.scheduler_type == "default":
        hparams.scheduler_type = "Plateau"
        hparams.add_hparams(plateau_verbose=True)
def _get_trainer(self, hparams):
    """Create a ``ModelTrainer`` that works on the integration fixtures.

    :param hparams:  Hyper-parameter container; ``num_questions`` and ``num_coded_sps`` are read.
    :return:         The trainer with input/output generators, train/val datasets and an
                     unreduced MSE loss attached.
    """
    question_dir = "integration/fixtures/questions"
    world_dir = "integration/fixtures/WORLD"

    trainer = ModelTrainer(self.id_list, hparams)

    # Inputs: question labels with their normalisation parameters.
    trainer.InputGen = QuestionLabelGen(question_dir, hparams.num_questions)
    trainer.InputGen.get_normalisation_params(question_dir)

    # Targets: WORLD features including deltas.
    trainer.OutputGen = WorldFeatLabelGen(world_dir,
                                          num_coded_sps=hparams.num_coded_sps,
                                          add_deltas=True)
    trainer.OutputGen.get_normalisation_params(world_dir)

    # The two datasets differ only in the id list they iterate over.
    dataset_kwargs = dict(match_lengths=True)
    trainer.dataset_train = LabelGensDataset(trainer.id_list_train,
                                             trainer.InputGen,
                                             trainer.OutputGen,
                                             hparams,
                                             **dataset_kwargs)
    trainer.dataset_val = LabelGensDataset(trainer.id_list_val,
                                           trainer.InputGen,
                                           trainer.OutputGen,
                                           hparams,
                                           **dataset_kwargs)

    trainer.loss_function = torch.nn.MSELoss(reduction='none')
    return trainer
def plot_phoneme_annotations(plotter: DataPlotter, id_name: str, hparams: ExtendedHParams, num_questions: int, phoneme_indices: np.ndarray, question_dir: os.PathLike, question_file: os.PathLike, grid_indices: List[int] = None):
    """Annotate grids of a plotter with the phonemes of one utterance.

    The question labels of ``id_name`` are loaded from ``question_dir`` and
    converted to phonemes, which are then set as annotations on each selected grid.

    :param plotter:          Plotter that receives the annotations.
    :param id_name:          Id of the sample whose question labels are loaded.
    :param hparams:          Not used by this function.
    :param num_questions:    Number of questions per frame in the stored labels.
    :param phoneme_indices:  Passed on to ``QuestionLabelGen.questions_to_phonemes``.
    :param question_dir:     Directory the question labels are loaded from.
    :param question_file:    Passed on to ``QuestionLabelGen.questions_to_phonemes``.
    :param grid_indices:     Grids to annotate; all grids of the plotter when None.
    """
    sample_questions = QuestionLabelGen.load_sample(id_name=id_name,
                                                    dir_out=question_dir,
                                                    num_questions=num_questions)
    phonemes = QuestionLabelGen.questions_to_phonemes(sample_questions,
                                                      phoneme_indices,
                                                      question_file)

    targets = plotter.get_all_grid_indices() if grid_indices is None else grid_indices
    for target in targets:
        plotter.set_annotations(target, phonemes)
def test_save_load(self):
    """Check that generated question labels survive a save/load round trip.

    Labels are generated into a test directory, a ``QuestionLabelGen`` is then
    pointed at that directory, and the test verifies that
      * the normalisation parameters read back equal those returned by ``gen_data``,
      * indexing the generator yields the pre-processed version of the generated label,
      * post-processing undoes pre-processing.
    The test directory is removed even when an assertion fails.
    """
    dir_out = self._get_test_dir()

    try:
        label_dict, *extracted_norm_params = QuestionLabelGen.gen_data(
            dir_in=self.dir_labels,
            file_questions=self.file_questions,
            dir_out=dir_out,
            id_list=self.id_list,
            return_dict=True)

        question_gen = QuestionLabelGen(dir_out, num_questions=409)
        norm_params = question_gen.get_normalisation_params(dir_out)
        self.assertTrue((extracted_norm_params[0] == norm_params[0]).all())
        self.assertTrue((extracted_norm_params[1] == norm_params[1]).all())

        test_id = self.id_list[1]
        test_label = label_dict[test_id]

        test_label_pre = question_gen.preprocess_sample(test_label)
        self.assertTrue(
            np.isclose(test_label_pre, question_gen[test_id]).all())

        test_label_post = question_gen.postprocess_sample(test_label_pre)
        self.assertTrue(np.isclose(test_label, test_label_post).all())
    finally:
        # Clean up in every case so a failing assertion does not leave files behind;
        # ignore_errors keeps a cleanup problem from masking the real test failure.
        shutil.rmtree(dir_out, ignore_errors=True)
def _question_to_phoneme_index(questions): """Helper function to convert questions to their current phoneme index.""" if questions.shape[-1] == 505: # German question set. indices = np.arange(86, 347, 5, dtype=np.int) elif questions.shape[-1] == 425: # English radio question set. indices = np.arange(58, 107, dtype=np.int) elif questions.shape[-1] == 609: # English unilex question set. indices = np.arange(92, 162, dtype=np.int) else: raise NotImplementedError( "Unknown question set with {} questions.".format( questions.shape[-1])) return QuestionLabelGen.questions_to_phoneme_indices( questions, indices)
def __init__(self, wcad_root, dir_audio, dir_atom_labels, dir_lf0_labels, dir_question_labels, id_list, thetas, k, num_questions, dist_window_size=51, hparams_phrase=None):
    """Set up generators, datasets, the nested flat trainer and the loss.

    :param wcad_root:            Path to main directory of wcad.
    :param dir_audio:            Directory containing the .wav files.
    :param dir_atom_labels:      Directory containing the .atoms files.
    :param dir_lf0_labels:       Directory containing the .lf0 files.
    :param dir_question_labels:  Directory containing the .lab files.
    :param id_list:              All ids; a subset is taken as test set.
    :param thetas:               List of used theta values.
    :param k:                    k-order of each atom.
    :param num_questions:        Expected number of questions in the question labels.
    :param dist_window_size:     Width of the distribution surrounding each atom spike.
                                 The window is only used for amps; thetas are surrounded
                                 by a window of 5.
    :param hparams_phrase:       Hyper-parameter container; created with defaults when None.
    """
    if hparams_phrase is None:
        hparams_phrase = self.create_hparams()
        hparams_phrase.out_dir = os.path.curdir

    # The flat trainer gets its own container. The fallback copy is taken
    # BEFORE any defaults are written to hparams_phrase below.
    flat_hparams = hparams_phrase.hparams_flat
    if flat_hparams is None:
        flat_hparams = copy.deepcopy(hparams_phrase)

    def _pretrained_path(suffix):
        # <out_dir>/<networks_dir>/<model_name><suffix>
        return os.path.join(hparams_phrase.out_dir,
                            hparams_phrase.networks_dir,
                            hparams_phrase.model_name + suffix)

    # Default locations of the pre-trained models.
    if hparams_phrase.atom_model_path is None:
        hparams_phrase.atom_model_path = _pretrained_path("_flat_atoms")
    if hparams_phrase.flat_model_path is None:
        hparams_phrase.flat_model_path = _pretrained_path("_flat")

    # Remaining defaults.
    if hparams_phrase.synth_dir is None:
        hparams_phrase.synth_dir = os.path.join(hparams_phrase.out_dir, "synth")

    super().__init__(id_list, hparams_phrase)

    # Inputs: question labels.
    self.InputGen = QuestionLabelGen(dir_question_labels, num_questions)
    self.InputGen.get_normalisation_params(
        dir_question_labels, hparams_phrase.input_norm_params_file_prefix)

    # Targets: LF0 with the phrase curve kept in.
    self.OutputGen = FlatLF0LabelGen(dir_lf0_labels,
                                     dir_atom_labels,
                                     remove_phrase=False)
    self.OutputGen.get_normalisation_params(
        dir_atom_labels, hparams_phrase.output_norm_params_file_prefix)

    def _build_dataset(ids):
        return PyTorchLabelGensDataset(ids,
                                       self.InputGen,
                                       self.OutputGen,
                                       hparams_phrase,
                                       match_lengths=True)

    self.dataset_train = _build_dataset(self.id_list_train)
    self.dataset_val = _build_dataset(self.id_list_val)

    self.flat_trainer = AtomNeuralFilterModelTrainer(
        wcad_root, dir_audio, dir_atom_labels, dir_lf0_labels,
        dir_question_labels, id_list, thetas, k, num_questions,
        dist_window_size, flat_hparams)

    # Keep a loss that is already set on the instance.
    if self.loss_function is None:
        self.loss_function = L1WeightedVUVMSELoss(
            weight_unvoiced=hparams_phrase.weight_unvoiced,
            vuv_loss_weight=hparams_phrase.vuv_loss_weight,
            L1_loss_weight=hparams_phrase.L1_loss_weight,
            reduce=False)

    if hparams_phrase.scheduler_type == "default":
        hparams_phrase.scheduler_type = "None"

    # Batches are collated and decollated by this class' own static methods.
    self.batch_collate_fn = self.prepare_batch
    self.batch_decollate_fn = self.decollate_network_output
class PhraseAtomNeuralFilterModelTrainer(ModelTrainer):
    """
    Implementation of a ModelTrainer for the generation of intonation curves with an end-to-end system.
    The first part of the architecture runs atom position prediction, and the output layer contains neural filters.
    Output curves have dimension: T x 2 (amp, theta).

    Use question labels as input and extracted lf0 as output.
    """
    logger = logging.getLogger(__name__)

    def __init__(self, wcad_root, dir_audio, dir_atom_labels, dir_lf0_labels, dir_question_labels, id_list, thetas, k, num_questions, dist_window_size=51, hparams_phrase=None):
        """Default constructor.

        :param wcad_root:            Path to main directory of wcad.
        :param dir_audio:            Path to directory that contains the .wav files.
        :param dir_lf0_labels:       Path to directory that contains the .lf0 files.
        :param dir_atom_labels:      Path to directory that contains the .atoms files.
        :param dir_question_labels:  Path to directory that contains the .lab files.
        :param id_list:              List containing all ids. Subset is taken as test set.
        :param thetas:               List of used theta values.
        :param k:                    k-order of each atom.
        :param num_questions:        Expected number of questions in question labels.
        :param dist_window_size:     Width of the distribution surrounding each atom spike.
                                     The window is only used for amps. Thetas are surrounded by a window of 5.
        :param hparams_phrase:       Hyper-parameter container. Created by create_hparams() when None.
        """
        if hparams_phrase is None:
            hparams_phrase = self.create_hparams()
            hparams_phrase.out_dir = os.path.curdir

        # The fallback copy for the flat trainer is taken before defaults are written below.
        hparams_flat = hparams_phrase.hparams_flat
        if hparams_flat is None:
            hparams_flat = copy.deepcopy(hparams_phrase)

        # Set default paths to pre-trained models.
        if hparams_phrase.atom_model_path is None:
            hparams_phrase.atom_model_path = os.path.join(
                hparams_phrase.out_dir, hparams_phrase.networks_dir,
                hparams_phrase.model_name + "_flat_atoms")
        if hparams_phrase.flat_model_path is None:
            hparams_phrase.flat_model_path = os.path.join(
                hparams_phrase.out_dir, hparams_phrase.networks_dir,
                hparams_phrase.model_name + "_flat")

        # Write missing default parameters.
        if hparams_phrase.synth_dir is None:
            hparams_phrase.synth_dir = os.path.join(hparams_phrase.out_dir, "synth")

        super().__init__(id_list, hparams_phrase)

        self.InputGen = QuestionLabelGen(dir_question_labels, num_questions)
        self.InputGen.get_normalisation_params(
            dir_question_labels, hparams_phrase.input_norm_params_file_prefix)

        self.OutputGen = FlatLF0LabelGen(dir_lf0_labels,
                                         dir_atom_labels,
                                         remove_phrase=False)
        self.OutputGen.get_normalisation_params(
            dir_atom_labels, hparams_phrase.output_norm_params_file_prefix)

        self.dataset_train = PyTorchLabelGensDataset(self.id_list_train,
                                                     self.InputGen,
                                                     self.OutputGen,
                                                     hparams_phrase,
                                                     match_lengths=True)
        self.dataset_val = PyTorchLabelGensDataset(self.id_list_val,
                                                   self.InputGen,
                                                   self.OutputGen,
                                                   hparams_phrase,
                                                   match_lengths=True)

        self.flat_trainer = AtomNeuralFilterModelTrainer(
            wcad_root, dir_audio, dir_atom_labels, dir_lf0_labels,
            dir_question_labels, id_list, thetas, k, num_questions,
            dist_window_size, hparams_flat)

        if self.loss_function is None:
            self.loss_function = L1WeightedVUVMSELoss(
                weight_unvoiced=hparams_phrase.weight_unvoiced,
                vuv_loss_weight=hparams_phrase.vuv_loss_weight,
                L1_loss_weight=hparams_phrase.L1_loss_weight,
                reduce=False)

        if hparams_phrase.scheduler_type == "default":
            hparams_phrase.scheduler_type = "None"

        # Override the collate and decollate methods of batches.
        self.batch_collate_fn = self.prepare_batch
        self.batch_decollate_fn = self.decollate_network_output

    @staticmethod
    def create_hparams(hparams_string=None, verbose=False):
        """Create the hyper-parameter container of ModelTrainer extended by the parameters of this trainer.

        :param hparams_string:  Forwarded to ModelTrainer.create_hparams.
        :param verbose:         If True, log the debug string of the final container.
        :return:                The extended hyper-parameter container.
        """
        # The parent is always called non-verbose; logging happens once below.
        hparams = ModelTrainer.create_hparams(hparams_string, verbose=False)

        hparams.add_hparams(
            thetas=None,  # One initial theta value per filter.
            k=2,  # Order of the impulse response of the atoms.
            min_atom_amp=0.25,  # Post-processing removes atoms with an absolute amplitude smaller than this.
            complex_poles=True,  # Complex poles possible.
            phase_init=0.0,  # Initial phase of the filters.
            vuv_loss_weight=1.0,  # Weight of the VUV RMSE.
            L1_loss_weight=1.0,  # Weight of the L1 loss on the spiking inputs.
            weight_unvoiced=0.5,  # Weight on unvoiced frames.
            num_questions=None,  # Dimension of the input questions.
            dist_window_size=51,  # Size of distribution around spikes when training the AtomModel.
            phrase_bias_init=0.0,  # Initial bias of neural filter, should be estimated mean of speaker's LF0.
            atom_model_path=None,  # Path to load a pre-trained atom model from.
            hparams_atom=None,  # Hyper-parameter container used in the AtomModelTrainer.
            flat_model_path=None,  # Path to load a pre-trained atom neural filter model from (without phrase curve).
            hparams_flat=None,  # Hyper-parameter container used in the AtomNeuralFilterModelTrainer.
        )

        if verbose:
            logging.info(hparams.get_debug_string())

        return hparams

    @staticmethod
    def prepare_batch(batch, common_divisor=1, batch_first=False):
        """Collate a batch with ModelHandler.prepare_batch and adapt the mask to the loss of this trainer.

        When targets exist, the mask is expanded to two channels, a third channel is appended
        by repeating the last one, and channel 0 is multiplied by the output sequence lengths.

        :return:  inputs, targets, seq_lengths_input, seq_lengths_output, mask, permutation
        """
        inputs, targets, seq_lengths_input, seq_lengths_output, mask, permutation = ModelHandler.prepare_batch(
            batch, common_divisor=common_divisor, batch_first=batch_first)

        if targets is not None:
            if mask is None:
                mask = torch.ones((seq_lengths_output[0], 1, 1))
            mask = mask.expand(*mask.shape[:2], 2)

            # mask: T x B x 2 (lf0, vuv), add L1 error dimension.
            mask = torch.cat((mask, mask[..., -1:]), dim=-1).contiguous()

            # TODO this is a dirty hack, it works but only for VUV weight of 0 (it completes the loss function Weighted)
            mask[..., 0] = mask[..., 0] * seq_lengths_output.float()

        return inputs, targets, seq_lengths_input, seq_lengths_output, mask, permutation

    @staticmethod
    def decollate_network_output(output, _, seq_lengths=None, permutation=None, batch_first=True):
        """Split output into LF0, V/UV and command signals. Return command signals as hidden state."""

        # Split pre-net output (command signals).
        intern_amps, _ = ModelTrainer.split_batch(output[:, :, 2:], None, seq_lengths, permutation, batch_first)
        # Split final LF0, V/UV.
        output, _ = ModelTrainer.split_batch(output[:, :, :2], None, seq_lengths, permutation, batch_first)

        return output, intern_amps

    def init_flat(self, hparams):
        """
        Initialize the neural filters model without phrase bias.
        If the model_type_filters is None, the old model will be loaded, which already contains the atom model.

        :param hparams:  Hyper-parameter container; hparams.hparams_flat is handed to the flat trainer.
        :return:         Nothing
        """
        # NOTE(review): the condition reads hparams.model_type while the message talks about
        # hparams_flat.model_type - confirm which container is meant.
        if hparams.model_type is None and hparams.hparams_flat.epochs != 0:
            logging.warning(
                "When hparams_flat.model_type=None the old model is loaded. This means that training "
                "the atom model by hparams_flat.epochs={} has no effect, so we set it to zero."
                .format(hparams.hparams_flat.epochs))
            hparams.hparams_flat.epochs = 0

        self.logger.info("Create flat neural filter model.")
        self.flat_trainer.init(hparams.hparams_flat)

    def init_atom(self, hparams):
        """
        Initialize the atom model.
        If the model_type_filters is None, the old model will be loaded, which already contains the atom model.

        :param hparams:  Hyper-parameter container; hparams.hparams_flat is handed to the flat trainer.
        :return:         Nothing
        """
        self.flat_trainer.init_atom(hparams.hparams_flat)

    def init(self, hparams):
        """Initialize the phrase end-to-end model via ModelTrainer.init.

        A warning is logged when the flat model was trained (hparams_flat.epochs > 0) but
        hparams.flat_model_path points somewhere other than where the flat trainer saves its model.
        """
        self.logger.info("Create phrase E2E model.")

        flat_trainer_model_path = os.path.join(
            hparams.hparams_flat.out_dir, hparams.hparams_flat.networks_dir,
            hparams.hparams_flat.model_name)
        if hparams.hparams_flat.epochs > 0 and hparams.flat_model_path != flat_trainer_model_path:
            logging.warning(
                "Flat model has been trained for {} epochs and saved in {}, "
                "but you will use hparams.flat_model_path = {} to create a new model."
                .format(hparams.hparams_flat.epochs, flat_trainer_model_path,
                        hparams.flat_model_path))

        super().init(hparams)

    def train_flat(self, hparams):
        """Train the flat trainer and benchmark it if it was trained for at least one epoch.

        :return:  Whatever self.flat_trainer.train returns.
        """
        output = self.flat_trainer.train(hparams.hparams_flat)
        if hparams.hparams_flat.epochs > 0:
            self.flat_trainer.benchmark(hparams.hparams_flat)

        return output

    def train_atom(self, hparams):
        """Delegate atom model training to the flat trainer using hparams.hparams_flat."""
        return self.flat_trainer.train_atom(hparams.hparams_flat)

    def filters_forward(self, in_tensor, hparams, batch_seq_lengths=None, max_seq_length=None):
        """Get output of each filter without their superposition.

        :param in_tensor:          Network input, numpy array or torch tensor.
        :param hparams:            Hyper-parameter container; use_gpu is read.
        :param batch_seq_lengths:  Sequence lengths in the batch; defaults to (len(in_tensor),).
        :param max_seq_length:     Defaults to the maximum of batch_seq_lengths.
        :return:                   Output of the model's filters_forward as numpy array.
        """
        self.model_handler.model.eval()

        # If input is numpy array convert it to torch tensor.
        if isinstance(in_tensor, np.ndarray):
            in_tensor = torch.from_numpy(in_tensor)

        if hparams.use_gpu:
            in_tensor = in_tensor.cuda()

        if batch_seq_lengths is None:
            batch_seq_lengths = (len(in_tensor), )

        if max_seq_length is None:
            max_seq_length = max(batch_seq_lengths)

        hidden = self.model_handler.model.init_hidden(len(batch_seq_lengths))
        output = self.model_handler.model.filters_forward(
            in_tensor, hidden, batch_seq_lengths, max_seq_length)

        return output.detach().cpu().numpy()

    # FIXME: A disabled (fully commented-out) gen_animation(self, id_name, labels=None) used to
    #        follow here. Its body accessed self.atom_trainer, whereas the live code of this class
    #        reaches the atom trainer through self.flat_trainer.atom_trainer, so it needs porting
    #        before it can be re-enabled.

    def gen_figure_from_output(self, id_name, labels, hidden, hparams, clustering=None, filters_out=None):
        """Plot command signals, filter outputs and LF0 of one sample and save the figure(s).

        A "<id_name>.<net_name>.PHRASE<ext>" figure is always written to hparams.out_dir; a second
        ".CLUSTERS" figure is written when clustering is given.

        :param id_name:      Id of the sample.
        :param labels:       Network output for the sample; columns 0-1 are post-processed as LF0 and V/UV,
                             the remaining columns are plotted as command signals. Recomputed when
                             labels or filters_out is None.
        :param hidden:       Not used by this method.
        :param hparams:      Hyper-parameter container.
        :param clustering:   Iterable of index groups along the last axis which are merged for the second figure.
        :param filters_out:  Per-filter outputs; recomputed when labels or filters_out is None.
        """
        if labels is None or filters_out is None:
            input_labels = self.InputGen[id_name][:, None, ...]
            labels = self.model_handler.forward(input_labels, hparams)[0][:, 0]
            filters_out = self.filters_forward(input_labels, hparams)[:, 0, ...]

        intern_amps = labels[:, 2:]
        labels = labels[:, :2]

        # Retrieve data from label.
        labels_post = self.OutputGen.postprocess_sample(labels)
        output_vuv = labels_post[:, 1]
        output_vuv[output_vuv < 0.5] = 0.0
        output_vuv[output_vuv >= 0.5] = 1.0
        output_vuv = output_vuv.astype(bool)

        output_lf0 = labels_post[:, 0]

        # Load original lf0 and vuv.
        org_labels = self.OutputGen.load_sample(id_name, self.OutputGen.dir_labels)
        original_lf0, original_vuv = self.OutputGen.convert_to_world_features(org_labels)
        original_lf0 = original_lf0[:len(output_lf0)]

        # RMSE in Hz over the frames marked voiced in the reference.
        f0_mse = (np.exp(original_lf0) - np.exp(output_lf0))**2
        f0_rmse = math.sqrt((f0_mse * original_vuv[:len(output_lf0)]).sum() / original_vuv[:len(output_lf0)].sum())
        self.logger.info("RMSE of {}: {} Hz.".format(id_name, f0_rmse))

        atom_output_gen = self.flat_trainer.atom_trainer.OutputGen
        org_labels = atom_output_gen.load_sample(
            id_name,
            atom_output_gen.dir_labels,
            len(atom_output_gen.theta_interval),
            atom_output_gen.dir_world_labels)

        org_vuv = org_labels[:, 0, 0]
        org_vuv = org_vuv.astype(bool)

        thetas = self.model_handler.model.thetas_approx()

        # Get a data plotter
        net_name = os.path.basename(hparams.model_name)
        filename = str(os.path.join(hparams.out_dir, id_name + '.' + net_name))
        plotter = DataPlotter()

        plot_id = 0

        # Grid 0: command signals.
        graphs_intern = list()
        for idx in reversed(range(intern_amps.shape[1])):
            graphs_intern.append(
                (intern_amps[:, idx], r'$\theta$={0:.3f}'.format(thetas[idx])))
        plotter.set_data_list(grid_idx=plot_id, data_list=graphs_intern)
        plotter.set_area_list(grid_idx=plot_id,
                              area_list=[(np.invert(output_vuv), '0.75', 1.0)])
        plotter.set_label(grid_idx=plot_id, ylabel='command')
        amp_max = 0.04
        amp_min = -amp_max
        plotter.set_lim(grid_idx=plot_id, ymin=amp_min, ymax=amp_max)
        plot_id += 1

        # Grid 1: output of each filter.
        graphs_filters = list()
        for idx in reversed(range(filters_out.shape[1])):
            graphs_filters.append((filters_out[:, idx], ))
        plotter.set_data_list(grid_idx=plot_id, data_list=graphs_filters)
        plotter.set_area_list(grid_idx=plot_id,
                              area_list=[(np.invert(output_vuv), '0.75', 1.0, 'Unvoiced')])
        plotter.set_label(grid_idx=plot_id, ylabel='filtered')
        amp_max = 0.1
        amp_min = -amp_max
        plotter.set_lim(grid_idx=plot_id, ymin=amp_min, ymax=amp_max)
        plot_id += 1

        # Grid 2: original vs. predicted LF0.
        graphs_lf0 = list()
        graphs_lf0.append((original_lf0, "Original"))
        graphs_lf0.append((output_lf0, "Predicted"))
        plotter.set_data_list(grid_idx=plot_id, data_list=graphs_lf0)
        plotter.set_hatchstyles(grid_idx=plot_id, hatchstyles=['\\\\'])
        plotter.set_area_list(grid_idx=plot_id,
                              area_list=[(np.invert(org_vuv.astype(bool)), '0.75', 1.0, 'Reference unvoiced')])
        plotter.set_label(grid_idx=plot_id,
                          xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]',
                          ylabel='LF0')
        plotter.set_lim(grid_idx=plot_id, ymin=3, ymax=6)
        plotter.set_linestyles(grid_idx=plot_id, linestyles=['-.', '-'])
        plotter.set_colors(grid_idx=plot_id, colors=['C3', 'C2', 'C0'], alpha=1)

        plotter.gen_plot()
        plotter.save_to_file(filename + ".PHRASE" + hparams.gen_figure_ext)

        if clustering is None:
            return

        plotter = DataPlotter()

        def cluster(array, mean=False):
            """Merge the entries of each index group along the last axis (mean, or sum over that axis)."""
            if mean:
                return np.array([
                    np.take(array, i, axis=-1).mean() for i in clustering
                ]).transpose()
            return np.array([
                np.take(array, i, axis=-1).sum(-1) for i in clustering
            ]).transpose()

        clustered_amps = cluster(intern_amps)
        clustered_thetas = cluster(thetas, True)
        clustered_filters = cluster(filters_out)

        plot_id = 0

        # Grid 0: clustered command signals.
        graphs_intern = list()
        for idx in reversed(range(clustered_amps.shape[1])):
            graphs_intern.append(
                (clustered_amps[:, idx],
                 r'$\theta$={0:.3f}'.format(clustered_thetas[idx])))
        plotter.set_data_list(grid_idx=plot_id, data_list=graphs_intern)
        plotter.set_area_list(grid_idx=plot_id,
                              area_list=[(np.invert(output_vuv), '0.75', 1.0, 'Unvoiced')])
        plotter.set_label(grid_idx=plot_id, ylabel='cluster command')
        amp_max = 0.04
        amp_min = -amp_max
        plotter.set_lim(grid_idx=plot_id, ymin=amp_min, ymax=amp_max)
        plot_id += 1

        # Grid 1: clustered filter outputs.
        graphs_filters = list()
        for idx in reversed(range(clustered_filters.shape[1])):
            graphs_filters.append((clustered_filters[:, idx], ))
        plotter.set_data_list(grid_idx=plot_id, data_list=graphs_filters)
        plotter.set_area_list(grid_idx=plot_id,
                              area_list=[(np.invert(output_vuv), '0.75', 1.0)])
        plotter.set_label(grid_idx=plot_id, ylabel='filtered')
        amp_max = 0.175
        amp_min = -amp_max
        plotter.set_lim(grid_idx=plot_id, ymin=amp_min, ymax=amp_max)
        plot_id += 1

        # Grid 2: original vs. predicted LF0.
        graphs_lf0 = list()
        graphs_lf0.append((original_lf0, "Original"))
        graphs_lf0.append((output_lf0, "Predicted"))
        plotter.set_data_list(grid_idx=plot_id, data_list=graphs_lf0)
        plotter.set_hatchstyles(grid_idx=plot_id, hatchstyles=['\\\\'])
        plotter.set_area_list(grid_idx=plot_id,
                              area_list=[(np.invert(org_vuv.astype(bool)), '0.75', 1.0, 'Reference unvoiced')])
        plotter.set_label(grid_idx=plot_id,
                          xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]',
                          ylabel='lf0')
        amp_lim = 1
        plotter.set_lim(grid_idx=plot_id, ymin=-amp_lim, ymax=amp_lim)
        plotter.set_linestyles(grid_idx=plot_id, linestyles=['-.', '-'])
        plotter.set_colors(grid_idx=plot_id, colors=['C3', 'C2', 'C0'], alpha=1)

        plotter.gen_plot()
        plotter.save_to_file(filename + ".CLUSTERS" + hparams.gen_figure_ext)

    def gen_figure_atoms(self, hparams, ids_input):
        """Delegate to gen_figure_atoms of the flat trainer."""
        self.flat_trainer.gen_figure_atoms(hparams, ids_input)

    def gen_figure_flat(self, hparams, ids_input):
        """Delegate to gen_figure of the flat trainer."""
        self.flat_trainer.gen_figure(hparams, ids_input)

    def gen_figure_phrase(self, hparams, ids_input):
        """Plot the original LF0 against the stored phrase curve for each given id.

        For every id the RMSE (in Hz, over reference-voiced frames) between original F0 and the
        phrase curve is logged, and "<id_name>.<net_name>.PHRASE<ext>" is saved to hparams.out_dir.

        :param hparams:    Hyper-parameter container.
        :param ids_input:  Ids to plot, converted by ModelTrainer._input_to_str_list.
        """
        id_list = ModelTrainer._input_to_str_list(ids_input)
        model_output, model_output_post = self._forward_batched(
            hparams,
            id_list,
            hparams.batch_size_gen_figure,
            synth=False,
            benchmark=False,
            gen_figure=False)

        for id_name, outputs_post in model_output_post.items():

            if outputs_post.ndim < 2:
                outputs_post = np.expand_dims(outputs_post, axis=1)

            lf0 = outputs_post[:, 0]
            output_lf0, _ = interpolate_lin(lf0)
            output_vuv = outputs_post[:, 1]
            output_vuv[output_vuv < 0.5] = 0.0
            output_vuv[output_vuv >= 0.5] = 1.0
            # Builtin bool replaces the np.bool alias, which was removed in NumPy 1.24.
            output_vuv = output_vuv.astype(bool)

            # Load original lf0 and vuv.
            world_dir = getattr(hparams, "world_dir", None)
            if world_dir is None:
                world_dir = os.path.join(hparams.out_dir, self.dir_extracted_acoustic_features)
            org_labels = WorldFeatLabelGen.load_sample(
                id_name,
                world_dir,
                num_coded_sps=hparams.num_coded_sps,
                num_bap=hparams.num_bap)[:len(output_lf0)]
            _, original_lf0, original_vuv, _ = WorldFeatLabelGen.convert_to_world_features(
                org_labels,
                num_coded_sps=hparams.num_coded_sps,
                num_bap=hparams.num_bap)
            original_lf0, _ = interpolate_lin(original_lf0)
            original_vuv = original_vuv.astype(bool)

            phrase_curve = np.fromfile(os.path.join(
                self.flat_trainer.atom_trainer.OutputGen.dir_labels,
                id_name + self.OutputGen.ext_phrase),
                                       dtype=np.float32).reshape(-1, 1)[:len(original_lf0)]

            f0_mse = (np.exp(original_lf0.squeeze(-1)) - np.exp(phrase_curve.squeeze(-1)))**2
            f0_rmse = math.sqrt(
                (f0_mse * original_vuv[:len(output_lf0)]).sum() / original_vuv[:len(output_lf0)].sum())
            self.logger.info("RMSE of {} phrase curve: {} Hz.".format(id_name, f0_rmse))

            # NOTE(review): this trims 2 * int(len_diff / 2) + 1 frames in total, whereas
            # synthesize() splits len_diff exactly between both ends - confirm this is intended.
            len_diff = len(original_lf0) - len(lf0)
            original_lf0 = WorldFeatLabelGen.trim_end_sample(
                original_lf0, int(len_diff / 2.0))
            original_lf0 = WorldFeatLabelGen.trim_end_sample(
                original_lf0, int(len_diff / 2.0) + 1, reverse=True)

            # Get a data plotter.
            net_name = os.path.basename(hparams.model_name)
            filename = str(os.path.join(hparams.out_dir, id_name + '.' + net_name))
            plotter = DataPlotter()

            grid_idx = 0
            graphs_lf0 = list()
            graphs_lf0.append((original_lf0, "Original"))
            graphs_lf0.append((phrase_curve, "Predicted"))
            plotter.set_data_list(grid_idx=grid_idx, data_list=graphs_lf0)
            plotter.set_area_list(grid_idx=grid_idx,
                                  area_list=[(np.invert(original_vuv), '0.8', 1.0, 'Reference unvoiced')])
            plotter.set_label(grid_idx=grid_idx,
                              xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]',
                              ylabel='LF0')
            plotter.set_lim(grid_idx=grid_idx, ymin=4.2, ymax=5.4)

            plotter.gen_plot()
            plotter.save_to_file(filename + ".PHRASE" + hparams.gen_figure_ext)

    def synthesize(self, id_list, synth_output, hparams):
        """Save output of model to .lf0 and (.vuv) files and call Merlin synth which reads those files.

        The remaining acoustic features are loaded from the extracted WORLD features; their LF0
        and V/UV columns are overwritten by the (thresholded) network output.

        :raises NotImplementedError:  If hparams.synth_acoustic_model_path is set.
        """
        full_output = dict()

        for id_name, labels in synth_output.items():
            # Take lf0 and vuv from network output.
            lf0 = labels[:, 0]
            vuv = labels[:, 1]
            vuv[vuv < 0.5] = 0.0
            vuv[vuv >= 0.5] = 1.0

            # Get mgc, vuv and bap data either through a trained acoustic model or from data extracted from the audio.
            if hparams.synth_acoustic_model_path is None:
                world_dir = hparams.world_dir if hasattr(hparams, "world_dir") and hparams.world_dir is not None\
                    else os.path.realpath(os.path.join(hparams.out_dir, self.dir_extracted_acoustic_features))
                # Load extracted data.
                full_sample: np.ndarray = WorldFeatLabelGen.load_sample(
                    id_name,
                    world_dir,
                    add_deltas=False,
                    num_coded_sps=hparams.num_coded_sps,
                    num_bap=hparams.num_bap)

                # Split the length difference between both ends of the loaded sample.
                len_diff = len(full_sample) - len(lf0)
                trim_front = len_diff // 2
                trim_end = len_diff - trim_front
                full_sample = WorldFeatLabelGen.trim_end_sample(full_sample, trim_end)
                full_sample = WorldFeatLabelGen.trim_end_sample(full_sample, trim_front, reverse=True)
            else:
                raise NotImplementedError()

            # Overwrite lf0 and vuv by network output.
            full_sample[:, hparams.num_coded_sps] = lf0
            full_sample[:, hparams.num_coded_sps + 1] = vuv

            # Fill a dictionary with the samples.
            full_output[id_name + "_E2E_Phrase"] = full_sample

        # Run the vocoder.
        ModelTrainer.synthesize(self, id_list, full_output, hparams)

    def compute_score(self, dict_outputs_post, dict_hiddens, hparams):
        """Compute mean F0 RMSE and V/UV error rate of the outputs against the stored labels.

        The worst sample of each metric and the averaged scores are logged.

        :param dict_outputs_post:  Dict of id name to post-processed output (column 0: LF0, column 1: V/UV).
        :param dict_hiddens:       Not used by this method.
        :param hparams:            Not used by this method.
        :return:                   Tuple (mean F0 RMSE in Hz, mean V/UV error rate).
        """
        # Get data for comparison.
        dict_original_post = dict()
        for id_name in dict_outputs_post.keys():
            dict_original_post[id_name] = self.OutputGen.load_sample(
                id_name, self.OutputGen.dir_labels)

        f0_rmse = 0.0
        vuv_error_rate = 0.0
        f0_rmse_max_id = "None"
        f0_rmse_max = 0.0
        vuv_error_max_id = "None"
        vuv_error_max = 0.0

        for id_name, labels in dict_outputs_post.items():
            output_lf0 = labels[:, 0]
            output_vuv = labels[:, 1]
            output_vuv[output_vuv < 0.5] = 0.0
            output_vuv[output_vuv >= 0.5] = 1.0
            output_vuv = output_vuv.astype(bool)

            # Get data for comparison.
            org_lf0 = dict_original_post[id_name][:, 0]
            org_vuv = dict_original_post[id_name][:, 1]

            # Compute f0 from lf0.
            org_f0 = np.exp(org_lf0.squeeze())[:len(output_lf0)]  # Fix minor negligible length mismatch.
            output_f0 = np.exp(output_lf0)

            # Compute RMSE, keep track of worst RMSE.
            f0_mse = (org_f0 - output_f0)**2
            current_f0_rmse = math.sqrt(
                (f0_mse * org_vuv[:len(output_lf0)]).sum() / org_vuv[:len(output_lf0)].sum())
            if current_f0_rmse > f0_rmse_max:
                f0_rmse_max_id = id_name
                f0_rmse_max = current_f0_rmse
            f0_rmse += current_f0_rmse

            # Compute V/UV error rate, keep track of worst one.
            num_errors = (org_vuv[:len(output_lf0)] != output_vuv)
            vuv_error_rate_tmp = float(num_errors.sum()) / len(output_lf0)
            if vuv_error_rate_tmp > vuv_error_max:
                vuv_error_max_id = id_name
                vuv_error_max = vuv_error_rate_tmp
            vuv_error_rate += vuv_error_rate_tmp

        f0_rmse /= len(dict_outputs_post)
        vuv_error_rate /= len(dict_outputs_post)

        self.logger.info("Worst F0 RMSE: " + f0_rmse_max_id + " {:4.2f}Hz".format(f0_rmse_max))
        self.logger.info("Worst VUV error: " + vuv_error_max_id + " {:2.2f}%".format(vuv_error_max * 100))
        self.logger.info("Benchmark score: F0 RMSE " + "{:4.2f}Hz".format(f0_rmse)
                         + ", VUV " + "{:2.2f}%".format(vuv_error_rate * 100))

        return f0_rmse, vuv_error_rate
def main():
    """Create samples with artificial alpha for each phoneme.

    Usage: the voice name is read from ``sys.argv[1]``.

    For every utterance of the hard-coded speaker in the voice's adapt id list,
    the normalised spectral features are warped with a fixed random alpha per
    phoneme, written to ``experiments/<voice>/WORLD_artificially_warped`` and,
    for the first few utterances, synthesised for a manual check. Finally the
    normalisation files are copied next to the warped features.
    """
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = sys.argv[1]
    hparams.model_name = "WarpingLayerTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    alpha_range = 0.2
    num_phonemes = 70
    num_random_alphas = 7
    # num_random_alphas = 53

    # Randomly pick alphas for each phoneme.
    np.random.seed(42)  # Fixed seed so repeated runs create the same alphas.
    # phonemes_to_alpha_tensor = ((np.random.choice(np.random.rand(num_random_alphas), num_phonemes) - 0.5) * 2 * alpha_range)
    phonemes_to_alpha_tensor = ((np.random.rand(num_phonemes) - 0.5) * 2 * alpha_range)

    # hparams.num_questions = 505
    hparams.num_questions = 609
    # hparams.num_questions = 425

    hparams.out_dir = os.path.join("experiments", hparams.voice, "WORLD_artificially_warped")
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "warping_layer_test"
    hparams.synth_dir = hparams.out_dir
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")
    dir_questions = os.path.join("experiments", hparams.voice, "questions")

    print(
        "Create artificially warped MGCs for {} in {} for {} questions, {} random alphas, and an alpha range of {}."
        .format(hparams.voice,
                hparams.out_dir,
                hparams.num_questions,
                len(np.unique(phonemes_to_alpha_tensor)),
                alpha_range))

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer(dir_world_labels,
                                   dir_questions,
                                   "ignored",
                                   hparams.num_questions,
                                   hparams)

    hparams.num_speakers = 1
    speaker = "p276"
    num_synth_files = 5  # Number of files to synthesise to check warping manually.

    # Width of the spectral block in the feature matrix (static + deltas if enabled).
    num_sp_feats = hparams.num_coded_sps * (3 if hparams.add_deltas else 1)
    sp_mean = gen_in.norm_params[0][:num_sp_feats]
    sp_std_dev = gen_in.norm_params[1][:num_sp_feats]
    wl = WarpingLayer((hparams.num_coded_sps,), (hparams.num_coded_sps,), hparams)
    wl.set_norm_params(sp_mean, sp_std_dev)

    def _question_to_phoneme_index(questions):
        """Helper function to convert questions to their current phoneme index."""
        # The builtin ``int`` replaces the ``np.int`` alias, which was removed
        # from NumPy and raises AttributeError on current releases.
        if questions.shape[-1] == 505:  # German question set.
            indices = np.arange(86, 347, 5, dtype=int)
        elif questions.shape[-1] == 425:  # English radio question set.
            indices = np.arange(58, 107, dtype=int)
        elif questions.shape[-1] == 609:  # English unilex question set.
            indices = np.arange(92, 162, dtype=int)
        else:
            raise NotImplementedError(
                "Unknown question set with {} questions.".format(questions.shape[-1]))
        return QuestionLabelGen.questions_to_phoneme_indices(questions, indices)

    # with open(os.path.join(hparams.data_dir, "file_id_list_{}_train.txt".format(hparams.voice))) as f:
    with open(os.path.join(hparams.data_dir,
                           "file_id_list_{}_adapt.txt".format(hparams.voice))) as f:
        id_list = f.readlines()
    # Keep only the ids of the selected speaker and trim line endings in-place.
    id_list[:] = [s.strip(' \t\n\r') for s in id_list if speaker in s]

    out_dir = hparams.out_dir
    dir_warped = os.path.join(out_dir, "cmp_mgc" + str(hparams.num_coded_sps))
    makedirs_safe(out_dir)
    makedirs_safe(dir_warped)

    t_benchmark = 0
    org_to_warped_mcd = 0.0
    for idx, id_name in enumerate(id_list):
        sample = WorldFeatLabelGen.load_sample(id_name,
                                               dir_world_labels,
                                               add_deltas=True,
                                               num_coded_sps=hparams.num_coded_sps)
        sample_pre = gen_in.preprocess_sample(sample)
        coded_sps = sample_pre[:, :num_sp_feats]

        questions = QuestionLabelGen.load_sample(id_name,
                                                 dir_questions,
                                                 num_questions=hparams.num_questions)
        questions = questions[:len(coded_sps)]
        phoneme_indices = _question_to_phoneme_index(questions)
        # Look up the artificial alpha of the phoneme active in each frame.
        alpha_vec = phonemes_to_alpha_tensor[phoneme_indices % len(phonemes_to_alpha_tensor), None]

        coded_sps = coded_sps[:len(alpha_vec), None, ...]  # Create a batch dimension.
        alpha_vec = alpha_vec[:, None, None]  # Create a batch and feature dimension.

        # Only the warping itself is timed.
        t_start = timer()
        mfcc_warped, (_, _nn_alpha) = wl(torch.from_numpy(coded_sps),
                                         None,
                                         (len(coded_sps),),
                                         (len(coded_sps),),
                                         alphas=torch.from_numpy(alpha_vec))
        t_benchmark += timer() - t_start

        # Write the warped spectral features back into the normalised sample.
        sample_pre[:len(mfcc_warped), :num_sp_feats] = mfcc_warped[:, 0].detach()

        sample_post = gen_in.postprocess_sample(sample_pre)
        # Manually create samples without normalisation but with deltas.
        sample_pre = (sample_pre * gen_in.norm_params[1] + gen_in.norm_params[0]).astype(np.float32)

        if np.isnan(sample_pre).any():
            raise ValueError(
                "Detected nan values in output features for {}.".format(id_name))

        # Compute error between warped version and original one.
        org_to_warped_mcd += metrics.melcd(sample[:, 0:hparams.num_coded_sps],
                                           sample_pre[:, 0:hparams.num_coded_sps])

        # Save warped features.
        sample_pre.tofile(os.path.join(dir_warped,
                                       os.path.basename(id_name + WorldFeatLabelGen.ext_deltas)))

        hparams.synth_dir = out_dir
        if idx < num_synth_files:  # Only synthesize a few of samples.
            trainer.run_world_synth({id_name: sample_post}, hparams)

    print("Process time for {} warpings: {}. MCD caused by warping: {:.2f}".
          format(len(id_list), timedelta(seconds=t_benchmark), org_to_warped_mcd / len(id_list)))

    # Copy normalisation files which are necessary for training.
    for feature in ["_bap", "_lf0", "_mgc{}".format(hparams.num_coded_sps)]:
        norm_file_name = MeanCovarianceExtractor.file_name_appendix + feature + ".bin"
        shutil.copyfile(
            os.path.join(gen_in.dir_labels, gen_in.dir_deltas, norm_file_name),
            os.path.join(dir_warped, norm_file_name))
def gen_figure_from_output(self, id_name, label, hidden, hparams):
    """Save a four-panel VTLN figure for one utterance.

    Panels, top to bottom: the spectrum obtained by warping the network output
    with the negated alphas ('Pre-network'), the spectrum of the network output
    ('VTLN'), the alphas taken from ``hidden``, and the spectrum of the sample
    loaded from ``self.OutputGen.dir_labels``. The LF0 panel is disabled.

    :param id_name: Id of the utterance, used for loading and for the file name.
    :param label:   Network output, passed to ``self.OutputGen.postprocess_sample``.
    :param hidden:  Pair whose second element holds the alphas.
    :param hparams: Hyper-parameter container.
    """
    _, alphas = hidden

    # Convert the network output to WORLD features.
    predicted_post = self.OutputGen.postprocess_sample(label)
    coded_sp, lf0, vuv, bap = WorldFeatLabelGen.convert_to_world_features(
        predicted_post,
        contains_deltas=False,
        num_coded_sps=hparams.num_coded_sps)
    sp = WorldFeatLabelGen.mcep_to_amp_sp(coded_sp, hparams.synth_fs)
    lf0, _ = interpolate_lin(lf0)

    # Load the reference sample and split it the same way.
    reference_post = WorldFeatLabelGen.load_sample(
        id_name,
        dir_out=self.OutputGen.dir_labels,
        add_deltas=self.OutputGen.add_deltas,
        num_coded_sps=hparams.num_coded_sps)
    original_mgc, original_lf0, original_vuv, *_ = WorldFeatLabelGen.convert_to_world_features(
        sample=reference_post,
        contains_deltas=self.OutputGen.add_deltas,
        num_coded_sps=hparams.num_coded_sps)
    original_lf0, _ = interpolate_lin(original_lf0)

    # Zoom into spectral features: keep the first 150 columns only.
    sp = sp[:, :150]

    # Set up the plotter.
    plotter = DataPlotter()
    net_name = os.path.basename(hparams.model_name)
    filename = str(os.path.join(hparams.out_dir, id_name + '.' + net_name))
    plotter.set_title(id_name + ' - ' + net_name)
    plotter.set_num_colors(3)

    # Reverse the warping by applying the negated alphas to the network output.
    wl = self._get_dummy_warping_layer(hparams)
    mean_no_deltas = self.OutputGen.norm_params[0][:hparams.num_coded_sps]
    std_dev_no_deltas = self.OutputGen.norm_params[1][:hparams.num_coded_sps]
    pre_net_output, _ = wl.forward_sample(label, -alphas)

    # Undo the normalisation manually on the static coefficients.
    pre_net_output = pre_net_output.detach().cpu().numpy()
    pre_net_mgc = pre_net_output[:, 0, :hparams.num_coded_sps] * std_dev_no_deltas + mean_no_deltas

    pre_net_row, vtln_row, alpha_row, reference_row = range(4)

    # Spectral features predicted by the pre-network.
    plotter.set_label(grid_idx=pre_net_row,
                      xlabel='frames [{}] ms'.format(hparams.frame_size_ms),
                      ylabel='Pre-network')
    pre_net_sp = WorldFeatLabelGen.mcep_to_amp_sp(pre_net_mgc, hparams.synth_fs)
    plotter.set_specshow(grid_idx=pre_net_row, spec=np.abs(pre_net_sp[:, :sp.shape[1]]))

    # Final predicted spectral features.
    plotter.set_label(grid_idx=vtln_row,
                      xlabel='frames [{}] ms'.format(hparams.frame_size_ms),
                      ylabel='VTLN')
    plotter.set_specshow(grid_idx=vtln_row, spec=np.abs(sp))

    # Predicted alpha values with the unvoiced regions of prediction and reference.
    plotter.set_label(grid_idx=alpha_row,
                      xlabel='frames [{}] ms'.format(hparams.frame_size_ms),
                      ylabel='alpha')
    plotter.set_data_list(grid_idx=alpha_row, data_list=[(alphas, 'NN alpha')])
    unvoiced_areas = [
        (np.invert(vuv.astype(bool)), '0.8', 1.0),
        (np.invert(original_vuv.astype(bool)), 'red', 0.2),
    ]
    plotter.set_area_list(grid_idx=alpha_row, area_list=unvoiced_areas)

    # Phoneme annotations are added only when indices and question file are both given.
    if getattr(hparams, "phoneme_indices", None) is not None \
            and getattr(hparams, "question_file", None) is not None:
        dir_questions = os.path.join("experiments", hparams.voice, "questions")
        questions = QuestionLabelGen.load_sample(
            id_name, dir_questions, num_questions=hparams.num_questions)[:len(lf0)]
        np_phonemes = QuestionLabelGen.questions_to_phonemes(
            questions, hparams.phoneme_indices, hparams.question_file)
        plotter.set_annotations(alpha_row, np_phonemes)

    # Reference spectral features.
    plotter.set_label(grid_idx=reference_row,
                      xlabel='frames [{}] ms'.format(hparams.frame_size_ms),
                      ylabel='Original spectrogram')
    reference_sp = WorldFeatLabelGen.mcep_to_amp_sp(original_mgc, hparams.synth_fs)
    plotter.set_specshow(grid_idx=reference_row, spec=np.abs(reference_sp[:, :sp.shape[1]]))

    plotter.gen_plot()
    plotter.save_to_file(filename + '.VTLN' + hparams.gen_figure_ext)
def gen_figure_from_output(self, id_name, label, hidden, hparams):
    """Save a two-panel figure comparing LF0 and alphas for one utterance.

    The first panel shows the reference and predicted LF0, the second one the
    alphas looked up in ``self.phonemes_to_alpha_tensor`` next to the alphas
    taken from ``hidden``. Both panels shade the unvoiced regions of the
    prediction and of the reference.

    :param id_name: Id of the utterance, used for loading and for the file name.
    :param label:   Network output, passed to ``self.OutputGen.postprocess_sample``.
    :param hidden:  Pair whose second element holds the predicted alphas.
    :param hparams: Hyper-parameter container.
    """
    _, alphas = hidden

    # Convert the network output to WORLD features.
    predicted_post = self.OutputGen.postprocess_sample(label)
    coded_sp, lf0, vuv, bap = WorldFeatLabelGen.convert_to_world_features(
        predicted_post,
        contains_deltas=False,
        num_coded_sps=hparams.num_coded_sps)
    sp = WorldFeatLabelGen.mcep_to_amp_sp(coded_sp, hparams.synth_fs)
    lf0, _ = interpolate_lin(lf0)

    # Load the reference sample and split it the same way.
    reference_post = WorldFeatLabelGen.load_sample(
        id_name,
        self.OutputGen.dir_labels,
        add_deltas=self.OutputGen.add_deltas,
        num_coded_sps=hparams.num_coded_sps)
    original_mgc, original_lf0, original_vuv, *_ = WorldFeatLabelGen.convert_to_world_features(
        reference_post,
        contains_deltas=self.OutputGen.add_deltas,
        num_coded_sps=hparams.num_coded_sps)
    original_lf0, _ = interpolate_lin(original_lf0)

    # Look up the reference alpha of the phoneme active in each frame.
    dir_questions = os.path.join("experiments", hparams.voice, "questions")
    questions = QuestionLabelGen.load_sample(
        id_name, dir_questions, num_questions=hparams.num_questions)[:len(alphas)]
    phoneme_indices = QuestionLabelGen.questions_to_phoneme_indices(
        questions, hparams.phoneme_indices)
    alpha_vec = self.phonemes_to_alpha_tensor[
        phoneme_indices % len(self.phonemes_to_alpha_tensor)]

    def _unvoiced_areas():
        # Build a fresh list per panel, exactly like the inline literals did.
        return [(np.invert(vuv.astype(bool)), '0.8', 1.0),
                (np.invert(original_vuv.astype(bool)), 'red', 0.2)]

    # Set up the plotter.
    lf0_row, alpha_row = 0, 1
    plotter = DataPlotter()
    net_name = os.path.basename(hparams.model_name)
    filename = str(os.path.join(hparams.out_dir, id_name + '.' + net_name))
    plotter.set_title(id_name + ' - ' + net_name)
    plotter.set_num_colors(3)

    # Panel 1: reference against predicted LF0.
    plotter.set_label(grid_idx=lf0_row,
                      xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]',
                      ylabel='log(f0)')
    lf0_graphs = [(original_lf0, 'Original LF0'), (lf0, 'NN LF0')]
    plotter.set_data_list(grid_idx=lf0_row, data_list=lf0_graphs)
    plotter.set_area_list(grid_idx=lf0_row, area_list=_unvoiced_areas())

    # Panel 2: reference against predicted alpha.
    plotter.set_label(grid_idx=alpha_row,
                      xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]',
                      ylabel='alpha')
    alpha_graphs = [(alpha_vec, 'Original alpha'), (alphas, 'NN alpha')]
    plotter.set_data_list(grid_idx=alpha_row, data_list=alpha_graphs)
    plotter.set_area_list(grid_idx=alpha_row, area_list=_unvoiced_areas())

    # Phoneme annotations are added only when indices and question file are both given.
    if getattr(hparams, "phoneme_indices", None) is not None \
            and getattr(hparams, "question_file", None) is not None:
        questions = QuestionLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "questions"),
            num_questions=hparams.num_questions)[:len(lf0)]
        np_phonemes = QuestionLabelGen.questions_to_phonemes(
            questions, hparams.phoneme_indices, hparams.question_file)
        plotter.set_annotations(alpha_row, np_phonemes)

    plotter.gen_plot()
    plotter.save_to_file(filename + '.VTLN' + hparams.gen_figure_ext)
def compute_score(self, dict_outputs_post, dict_hiddens, hparams):
    """Log warping-specific scores in addition to the scores of the parent class.

    The return value is exactly what ``super().compute_score`` returns; the
    additional mel-cepstral distortions and the alpha error computed here are
    only written to the logger. They are computed for several ranges of
    cepstral coefficients ``[cep_coef_start:cep_coef_end]``.

    :param dict_outputs_post: Dictionary mapping id names to the post-processed network output.
    :param dict_hiddens:      Dictionary mapping id names to a pair whose second element holds the predicted alphas.
    :param hparams:           Hyper-parameter container.
    :return:                  Tuple (mcd, f0_rmse, vuv_error_rate, bap_mcd) of the parent class.
    """
    mcd, f0_rmse, vuv_error_rate, bap_mcd = super().compute_score(
        dict_outputs_post, dict_hiddens, hparams)

    # Get data for comparison.
    # These are the samples in self.OutputGen.dir_labels, used below as the
    # correctly warped reference.
    dict_original_post = dict()
    for id_name in dict_outputs_post.keys():
        dict_original_post[id_name] = WorldFeatLabelGen.load_sample(
            id_name,
            self.OutputGen.dir_labels,
            True,
            num_coded_sps=hparams.num_coded_sps)

    # Create a warping layer for manual warping.
    wl = WarpingLayer((hparams.num_coded_sps,), (hparams.num_coded_sps,), hparams)
    if hparams.use_gpu:
        wl = wl.cuda()
    wl.set_norm_params(*self.OutputGen.norm_params)
    batch_size = len(dict_outputs_post)

    # NOTE(review): with cep_coef_start == 0 the end index is -1, so the slice
    # [0:-1] leaves out the last coefficient - confirm this is intended.
    # NOTE(review): loading and warping below does not depend on the coefficient
    # range but is repeated for every range.
    for cep_coef_start in [0, 1]:
        for cep_coef_end in (range(10, 19) if cep_coef_start == 1 else [-1]):
            alphas_rmse = 0.0
            org_to_warped_mcd = 0.0
            org_to_nn_warping_mcd = 0.0
            output_to_warped_mcd = 0.0

            for id_name, labels in dict_outputs_post.items():
                # Split NN output.
                _, output_alphas = dict_hiddens[id_name]
                output_mgc_post, *_ = self.OutputGen.convert_to_world_features(
                    labels, False, num_coded_sps=hparams.num_coded_sps)

                # Load the original sample without warping.
                org_output = self.OutputGen.load_sample(
                    id_name,
                    os.path.join("experiments", hparams.voice, "WORLD"),
                    add_deltas=True,
                    num_coded_sps=hparams.num_coded_sps)
                org_output = org_output[:len(output_mgc_post)]
                org_mgc_post = org_output[:, :hparams.num_coded_sps]
                org_output_pre = self.OutputGen.preprocess_sample(
                    org_output)  # Preprocess the sample.
                org_mgc_pre = org_output_pre[:, :hparams.num_coded_sps * (
                    3 if hparams.add_deltas else 1)]

                # Load the original warped sample.
                org_mgc_warped_post = dict_original_post[
                    id_name][:len(output_mgc_post), :hparams.num_coded_sps]
                # org_mgc_warped_post = self.OutputGen.load_sample(
                #     id_name,
                #     os.path.join("experiments",
                #                  hparams.voice,
                #                  "vtln_speaker_static",
                #                  "alpha_1.10"),
                #     add_deltas=True,
                #     num_coded_sps=hparams.num_coded_sps)[:len(output_mgc_post), :hparams.num_coded_sps]

                # Compute error between warped version and NN output.
                output_to_warped_mcd += metrics.melcd(
                    org_mgc_warped_post[:, cep_coef_start:cep_coef_end],
                    output_mgc_post[:, cep_coef_start:cep_coef_end])

                # Compute error between warped version and original one.
                org_to_warped_mcd += metrics.melcd(
                    org_mgc_warped_post[:, cep_coef_start:cep_coef_end],
                    org_mgc_post[:, cep_coef_start:cep_coef_end])

                # Get original alphas from phonemes.
                questions = QuestionLabelGen.load_sample(
                    id_name,
                    os.path.join("experiments", hparams.voice, "questions"),
                    num_questions=hparams.num_questions)[:len(output_alphas)]
                phoneme_indices = QuestionLabelGen.questions_to_phoneme_indices(
                    questions, hparams.phoneme_indices)
                org_alphas = self.phonemes_to_alpha_tensor[
                    phoneme_indices % len(self.phonemes_to_alpha_tensor), None]

                # Compute RMSE of alphas.
                # NOTE(review): this is the root of the *sum* of squared errors,
                # there is no mean over frames - confirm whether a mean is intended.
                # NOTE(review): org_alphas gets a trailing axis from the None
                # index; presumably output_alphas has the same shape - verify.
                alphas_rmse += math.sqrt(
                    ((org_alphas - output_alphas)**2).sum())

                # Warp the original mgcs with the alpha predicted by the network.
                org_mgc_nn_warped, _ = wl.forward_sample(
                    org_mgc_pre, output_alphas)  # Warp with the NN alphas.
                # The warped features overwrite the spectral block of
                # org_output_pre in place, so the statement order matters here.
                org_output_pre[:, :hparams.num_coded_sps * (3 if hparams.add_deltas else 1)]\
                    = org_mgc_nn_warped[:, 0, ...].detach()  # Write warped mgcs back.
                org_mgc_nn_warped_post = self.OutputGen.postprocess_sample(
                    org_output_pre, apply_mlpg=False)[:, :hparams.num_coded_sps]

                # Compute error between correctly warped version and original mgcs warped with NN alpha.
                org_to_nn_warping_mcd += metrics.melcd(
                    org_mgc_warped_post[:, cep_coef_start:cep_coef_end],
                    org_mgc_nn_warped_post[:, cep_coef_start:cep_coef_end])

            # Average the accumulated scores over all utterances.
            alphas_rmse /= batch_size
            output_to_warped_mcd /= batch_size
            org_to_warped_mcd /= batch_size
            org_to_nn_warping_mcd /= batch_size

            self.logger.info("MCep from {} to {}:".format(
                cep_coef_start, cep_coef_end))
            self.logger.info("RMSE alphas: {:4.2f}".format(alphas_rmse))
            self.logger.info(
                "Original mgc to warped mgc error: {:4.2f}dB".format(
                    org_to_warped_mcd))
            # The percentage is the relative reduction of the distortion
            # achieved by warping with the network alphas.
            self.logger.info(
                "Original mgc warped by network alpha to warped mgc error: {:4.2f}dB ({:2.2f}%)"
                .format(org_to_nn_warping_mcd,
                        (1 - org_to_nn_warping_mcd / org_to_warped_mcd) * 100))
            self.logger.info(
                "Network output to original warped mgc error: {:4.2f}dB".
                format(output_to_warped_mcd))

    return mcd, f0_rmse, vuv_error_rate, bap_mcd
# makedirs_safe(dir_labels) logging.warning("Label files are not recreated.") # TODO: Possible implementation at TTSModel.run_DM_AM(). # Generate durations logging.info("Create duration files.") shutil.rmtree(dir_dur) makedirs_safe(dir_dur) PhonemeDurationLabelGen.gen_data(dir_labels, dir_dur, id_list=id_list) # Generate questions. logging.info("Create question files.") shutil.rmtree(dir_questions) makedirs_safe(dir_questions) QuestionLabelGen.gen_data(dir_labels, "questions-en-radio_dnn_400.hed", dir_questions, id_list=id_list) # Generate WORLD features. logging.info("Create WORLD files.") shutil.rmtree(dir_world) makedirs_safe(dir_world) world_generator = WorldFeatLabelGen(dir_world, add_deltas=False, num_coded_sps=20, sp_type="mcep") world_generator.gen_data(dir_wav, dir_world, id_list=id_list) world_generator = WorldFeatLabelGen(dir_world, add_deltas=True, num_coded_sps=20, sp_type="mcep")
class AtomModelTrainer(ModelTrainer):
    """
    Implementation of a ModelTrainer for the generation of acoustic data through atom prediction.
    Output labels for atoms have dimension: T x |thetas| x 2 (amp, theta).

    Use question labels as input and extracted wcad atoms as output. Synthesize audio from model output
    by generating F0 from atoms. MGC and BAP is either generated by a pre-trained acoustic model or loaded
    from the original extracted files.
    """
    logger = logging.getLogger(__name__)

    def __init__(self, wcad_root, dir_atom_labels, dir_question_labels, id_list, thetas, k, num_questions,
                 hparams=None):
        """Default constructor.

        :param wcad_root:               Path to main directory of wcad.
        :param dir_atom_labels:         Path to directory that contains the .atom files.
        :param dir_question_labels:     Path to directory that contains the .questions files.
        :param id_list:                 List containing all ids. Subset is taken as test set.
        :param thetas:                  List of theta values.
        :param k:                       K value of atoms.
        :param num_questions:           Expected number of questions in question labels.
        :param hparams:                 Hyper-parameter container.
        """
        if hparams is None:
            hparams = self.create_hparams()
            hparams.out_dir = os.path.curdir

        # Write missing default parameters.
        if hparams.variable_sequence_length_train is None:
            hparams.variable_sequence_length_train = hparams.batch_size_train > 1
        if hparams.variable_sequence_length_test is None:
            hparams.variable_sequence_length_test = hparams.batch_size_test > 1
        if hparams.synth_dir is None:
            hparams.synth_dir = os.path.join(hparams.out_dir, "synth")

        # If the weight for unvoiced frames is not given, compute it to get equal weights.
        non_zero_occurrence = min(0.99, 0.02 / len(thetas))
        zero_occurrence = 1 - non_zero_occurrence
        if not hasattr(hparams, "weight_zero"):
            hparams.add_hparam("weight_non_zero", 1 / non_zero_occurrence)
            hparams.add_hparam("weight_zero", 1 / zero_occurrence)
        elif hparams.weight_zero is None:
            hparams.weight_non_zero = 1 / non_zero_occurrence
            hparams.weight_zero = 1 / zero_occurrence

        super().__init__(id_list, hparams)

        self.InputGen = QuestionLabelGen(dir_question_labels, num_questions)
        self.InputGen.get_normalisation_params(dir_question_labels, hparams.input_norm_params_file_prefix)

        self.OutputGen = AtomLabelGen(wcad_root, dir_atom_labels, thetas, k, hparams.frame_size_ms)
        self.OutputGen.get_normalisation_params(dir_atom_labels, hparams.output_norm_params_file_prefix)

        self.dataset_train = PyTorchLabelGensDataset(self.id_list_train, self.InputGen, self.OutputGen,
                                                     hparams, match_lengths=True)
        self.dataset_val = PyTorchLabelGensDataset(self.id_list_val, self.InputGen, self.OutputGen,
                                                   hparams, match_lengths=True)

        # Only install the default loss when none has been set before.
        if self.loss_function is None:
            self.loss_function = WeightedNonzeroMSELoss(hparams.use_gpu,
                                                        hparams.weight_zero,
                                                        hparams.weight_non_zero,
                                                        size_average=False,
                                                        reduce=False)

        if hparams.scheduler_type == "default":
            hparams.scheduler_type = "Plateau"
            hparams.add_hparams(plateau_patience=10,
                                plateau_factor=0.5,
                                plateau_verbose=True)

    @staticmethod
    def create_hparams(hparams_string=None, verbose=False):
        """Create the hyper-parameter container of the base trainer and add the atom specific defaults."""
        hparams = ModelTrainer.create_hparams(hparams_string, verbose=False)

        hparams.add_hparams(thetas=None,
                            k=None,
                            min_atom_amp=0.3,
                            num_questions=None)

        if verbose:
            logging.info(hparams.get_debug_string())

        return hparams

    def gen_figure_from_output(self, id_name, labels, hidden, hparams):
        """Save a figure with network output, post-processed output, target atoms and the LF0 curves.

        :param id_name: Id of the utterance, used for loading and for the file name.
        :param labels:  Network output; a 1-D array is expanded by a second axis.
        :param hidden:  Not used by this implementation.
        :param hparams: Hyper-parameter container.
        """

        if labels.ndim < 2:
            labels = np.expand_dims(labels, axis=1)
        labels_post = self.OutputGen.postprocess_sample(labels, identify_peaks=True, peak_range=100)
        lf0 = self.OutputGen.labels_to_lf0(labels_post, hparams.k)
        lf0, vuv = interpolate_lin(lf0)
        # Builtin bool: the np.bool alias was removed from NumPy.
        vuv = vuv.astype(bool)

        # Load original lf0 and vuv.
        world_dir = hparams.world_dir if hasattr(hparams, "world_dir") and hparams.world_dir is not None\
            else os.path.join(self.OutputGen.dir_labels, self.dir_extracted_acoustic_features)
        org_labels = WorldFeatLabelGen.load_sample(id_name, world_dir, num_coded_sps=hparams.num_coded_sps)
        _, original_lf0, original_vuv, _ = WorldFeatLabelGen.convert_to_world_features(
            org_labels, num_coded_sps=hparams.num_coded_sps)
        original_lf0, _ = interpolate_lin(original_lf0)
        original_vuv = original_vuv.astype(bool)

        # Remove the phrase component so that only the atom part of the LF0 remains.
        phrase_curve = self.get_phrase_curve(id_name)
        original_lf0 -= phrase_curve
        len_diff = len(original_lf0) - len(lf0)
        # NOTE(review): front and end trims add up to 2 * int(len_diff / 2) + 1,
        # which equals len_diff only for odd differences - confirm intended.
        original_lf0 = WorldFeatLabelGen.trim_end_sample(original_lf0, int(len_diff / 2.0))
        original_lf0 = WorldFeatLabelGen.trim_end_sample(original_lf0, int(len_diff / 2.0) + 1, reverse=True)

        org_labels = self.OutputGen.load_sample(id_name, self.OutputGen.dir_labels, len(hparams.thetas))
        org_labels = self.OutputGen.trim_end_sample(org_labels, int(len_diff / 2.0))
        org_labels = self.OutputGen.trim_end_sample(org_labels, int(len_diff / 2.0) + 1, reverse=True)
        org_atoms = self.OutputGen.labels_to_atoms(org_labels, k=hparams.k, frame_size=hparams.frame_size_ms)

        # Get a data plotter.
        net_name = os.path.basename(hparams.model_name)
        filename = str(os.path.join(hparams.out_dir, id_name + '.' + net_name))
        plotter = DataPlotter()
        plotter.set_title(id_name + " - " + net_name)

        # Panel 1: raw network output, one graph per theta.
        grid_idx = 0
        graphs_output = [(labels[:, idx], r'$\theta$=' + "{0:.3f}".format(hparams.thetas[idx]))
                         for idx in reversed(range(labels.shape[1]))]
        plotter.set_label(grid_idx=grid_idx,
                          xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]',
                          ylabel='NN output')
        plotter.set_data_list(grid_idx=grid_idx, data_list=graphs_output)
        # plotter.set_lim(grid_idx=0, ymin=-1.8, ymax=1.8)

        # Panel 2: post-processed network output.
        grid_idx += 1
        graphs_peaks = [(labels_post[:, idx, 0], ) for idx in reversed(range(labels_post.shape[1]))]
        plotter.set_label(grid_idx=grid_idx,
                          xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]',
                          ylabel='NN post-processed')
        plotter.set_data_list(grid_idx=grid_idx, data_list=graphs_peaks)
        plotter.set_area_list(grid_idx=grid_idx, area_list=[(np.invert(vuv), '0.8', 1.0)])
        plotter.set_lim(grid_idx=grid_idx, ymin=-1.8, ymax=1.8)

        # Panel 3: target atoms.
        grid_idx += 1
        graphs_target = [(org_labels[:, idx, 0], ) for idx in reversed(range(org_labels.shape[1]))]
        plotter.set_label(grid_idx=grid_idx,
                          xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]',
                          ylabel='target')
        plotter.set_data_list(grid_idx=grid_idx, data_list=graphs_target)
        plotter.set_area_list(grid_idx=grid_idx, area_list=[(np.invert(original_vuv), '0.8', 1.0)])
        plotter.set_lim(grid_idx=grid_idx, ymin=-1.8, ymax=1.8)

        # Panel 4: LF0 from target atoms, extracted LF0 and LF0 from predicted atoms.
        grid_idx += 1
        output_atoms = AtomLabelGen.labels_to_atoms(labels_post,
                                                    hparams.k,
                                                    hparams.frame_size_ms,
                                                    amp_threshold=hparams.min_atom_amp)
        wcad_lf0 = AtomLabelGen.atoms_to_lf0(org_atoms, len(labels))
        output_lf0 = AtomLabelGen.atoms_to_lf0(output_atoms, len(labels))
        graphs_lf0 = [(wcad_lf0, "wcad lf0"),
                      (original_lf0, "org lf0"),
                      (output_lf0, "predicted lf0")]
        plotter.set_data_list(grid_idx=grid_idx, data_list=graphs_lf0)
        plotter.set_area_list(grid_idx=grid_idx, area_list=[(np.invert(original_vuv), '0.8', 1.0)])
        plotter.set_label(grid_idx=grid_idx,
                          xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]',
                          ylabel='lf0')
        # Symmetric y-limits with 10% head room around the largest amplitude.
        amp_lim = max(np.max(np.abs(wcad_lf0)), np.max(np.abs(output_lf0))) * 1.1
        plotter.set_lim(grid_idx=grid_idx, ymin=-amp_lim, ymax=amp_lim)
        plotter.set_linestyles(grid_idx=grid_idx, linestyles=[':', '--', '-'])

        # plotter.set_lim(xmin=300, xmax=1100)
        plotter.gen_plot()
        plotter.save_to_file(filename + ".BASE" + hparams.gen_figure_ext)

    def get_recon_from_synth_output(self, synth_output, hparams):
        """Reconstruct LF0 from atoms.

        :param synth_output: Dictionary mapping id names to atom labels.
        :param hparams:      Hyper-parameter container.
        :return:             Dictionary mapping id names to the reconstructed LF0 including the phrase curve.
        """
        # Transform output to GammaAtoms.
        recon_dict = dict()
        for id_name, label in synth_output.items():
            if len(label.shape) == 2:
                label = np.expand_dims(label, axis=1)
            atoms = self.OutputGen.labels_to_atoms(label,
                                                   k=hparams.k,
                                                   frame_size=hparams.frame_size_ms,
                                                   amp_threshold=hparams.min_atom_amp)
            reconstruction = self.OutputGen.atoms_to_lf0(atoms, num_frames=len(label))

            # Add extracted phrase.
            phrase_curve = np.fromfile(os.path.join(self.OutputGen.dir_labels,
                                                    id_name + self.OutputGen.ext_phrase),
                                       dtype=np.float32)[:len(reconstruction)]
            reconstruction[:len(phrase_curve)] += phrase_curve
            # Frames at or below the silence threshold are set to the lf0 zero value.
            reconstruction[reconstruction <= math.log(WorldFeatLabelGen.f0_silence_threshold)] \
                = WorldFeatLabelGen.lf0_zero
            recon_dict[id_name] = reconstruction

        return recon_dict

    def get_phrase_curve(self, id_name):
        """Load the phrase curve of the given id as a float32 column vector."""
        return np.fromfile(os.path.join(self.OutputGen.dir_labels,
                                        id_name + self.OutputGen.ext_phrase),
                           dtype=np.float32).reshape(-1, 1)

    def compute_score(self, dict_outputs_post, dict_hiddens, hparams):
        """Compute the mean F0 RMSE between the atom based LF0 and the extracted LF0.

        :param dict_outputs_post: Dictionary mapping id names to the post-processed atom labels.
        :param dict_hiddens:      Not used by this implementation.
        :param hparams:           Hyper-parameter container.
        :return:                  F0 RMSE averaged over all utterances.
        """

        # Get data for comparison.
        dict_original_post = self.load_extracted_audio_features(dict_outputs_post, hparams)

        f0_rmse = 0.0
        f0_rmse_max_id = "None"
        f0_rmse_max = 0.0

        for id_name, labels in dict_outputs_post.items():
            output_lf0 = AtomLabelGen.labels_to_lf0(labels,
                                                    k=hparams.k,
                                                    frame_size=hparams.frame_size_ms,
                                                    amp_threshold=hparams.min_atom_amp)

            # Get data for comparison.
            org_lf0 = dict_original_post[id_name][:, hparams.num_coded_sps]
            org_vuv = dict_original_post[id_name][:, hparams.num_coded_sps + 1]
            phrase_curve = self.get_phrase_curve(id_name)

            # Restrict the reference V/UV flag to the frames covered by the
            # prediction, so numerator and denominator use the same frames.
            voiced = org_vuv[:len(output_lf0)]

            # Compute f0 from lf0.
            org_f0 = (np.exp(org_lf0.squeeze()) * org_vuv)[:len(output_lf0)]  # Fix minor negligible length mismatch.
            output_f0 = np.exp(output_lf0 + phrase_curve[:len(output_lf0)].squeeze()) * voiced

            # Compute RMSE, keep track of worst RMSE.
            f0_mse = (org_f0 - output_f0)**2
            current_f0_rmse = math.sqrt(f0_mse.sum() / voiced.sum())
            if current_f0_rmse > f0_rmse_max:
                f0_rmse_max_id = id_name
                f0_rmse_max = current_f0_rmse
            f0_rmse += current_f0_rmse

        f0_rmse /= len(dict_outputs_post)

        self.logger.info("Worst F0 RMSE: " + f0_rmse_max_id + " {:4.2f}Hz".format(f0_rmse_max))
        self.logger.info("Benchmark score: F0 RMSE " + "{:4.2f}Hz".format(f0_rmse))

        return f0_rmse

    def load_extracted_audio_features(self, synth_output, hparams):
        """Load the audio features extracted from audio.

        :param synth_output: Dictionary whose keys are the ids to load; the values are not read.
        :param hparams:      Hyper-parameter container.
        :return:             Dictionary mapping id names to the loaded WORLD features without deltas.
        """
        self.logger.info("Load extracted mgc, lf0, vuv, bap data.")

        org_output = dict()
        for id_name in synth_output.keys():
            world_dir = hparams.world_dir if hasattr(hparams, "world_dir") and hparams.world_dir is not None\
                else os.path.realpath(os.path.join(self.OutputGen.dir_labels,
                                                   self.dir_extracted_acoustic_features))
            org_output[id_name] = WorldFeatLabelGen.load_sample(
                id_name,
                world_dir,
                add_deltas=False,
                num_coded_sps=hparams.num_coded_sps)  # Load extracted data.

        return org_output

    def generate_audio_features(self, id_list, hparams):
        # TODO: This function is untested.
        """
        Generate mgc, vuv and bap data with an acoustic model.
        The name of the acoustic model is saved in hparams.synth_acoustic_model_path and given in the constructor.
        If the synth_acoustic_model_path is 'None' this method will not be called but the method
        load_extracted_audio_features, which reloads the original data extracted from the audio.
        If you want to generate audio directly from wcad atom extraction, uncomment the first block
        in the get_recon_from_synth_output method.

        Detailed execution process:
        This method reuses the synth method of the ModelTrainer base class. It overwrites the internal
        f_synthesize method and the OutputGen to accomplish the audio generation. Both are restored after
        finishing the generation. The base class synth method loads the acoustic model network by its name
        and forwards the question labels for each utterance in the id_list. At the end the method calls
        the f_synthesize method. Therefore the f_synthesize method is overwritten by the save_audio_features
        which saves the generated output mgc, vuv and bap files in the self.synth_dir folder.
        """
        self.logger.info("Generate mgc, vuv and bap with " + hparams.synth_acoustic_model_path)

        acoustic_model_hparams = AcousticModelTrainer.create_hparams()
        acoustic_model_hparams.model_name = os.path.basename(hparams.synth_acoustic_model_path)
        acoustic_model_hparams.model_path = hparams.synth_acoustic_model_path
        # NOTE(review): the AcousticModelTrainer constructor in this file expects
        # (dir_world_features, dir_question_labels, id_list, num_questions, hparams);
        # this single-argument call looks outdated - confirm before using this path.
        acoustic_model_handler = AcousticModelTrainer(acoustic_model_hparams)

        org_model_handler = self.model_handler
        self.model_handler = acoustic_model_handler

        # Switch f_synthesize method and OutputGen for mgc, vuv and bap creation.
        # f_synthesize is called at the end of synth.
        # NOTE(review): save_audio_features and AudioGen are not defined in this
        # class; presumably they come from the base class - verify.
        self.f_synthesize = self.save_audio_features
        org_output_gen = self.OutputGen
        self.OutputGen = self.AudioGen

        # Explicitly synthesize with acoustic_model_name.
        # This method calls f_synthesize at the end which will save the mgc, vuv and bap.
        self.synth(hparams, id_list)

        # Switch back to atom creation.
        self.f_synthesize = self.synthesize
        self.OutputGen = org_output_gen
        self.model_handler = org_model_handler

    def synthesize(self, id_list, synth_output, hparams):
        """This method should be overwritten by sub classes."""
        # Create lf0 from atoms of output and get other acoustic features either by loading the original labels or by
        # generating them with the model at hparams.synth_acoustic_model_path.
        full_output = self.run_atom_synth(id_list, synth_output, hparams)
        # Run the WORLD synthesizer.
        Synthesiser.run_world_synth(full_output, hparams)

    def synth_ref_wcad(self, file_id_list, hparams):
        """Synthesise the extracted (reference) atoms; files get the additional suffix '_wcad_ref'."""
        synth_output = dict()
        # Load extracted atoms.
        for id_name in file_id_list:
            synth_output[id_name] = AtomLabelGen.load_sample(id_name,
                                                             self.OutputGen.dir_labels,
                                                             len(hparams.thetas))

        full_output = self.run_atom_synth(file_id_list, synth_output, hparams)

        # Add identifier to suffix.
        old_synth_file_suffix = hparams.synth_file_suffix
        hparams.synth_file_suffix += "_wcad_ref"

        # Run the WORLD synthesizer.
        Synthesiser.run_world_synth(full_output, hparams)

        # Restore identifier.
        hparams.synth_file_suffix = old_synth_file_suffix

    def synth_phrase(self, file_id_list, hparams):
        """Synthesise the extracted features with the lf0 replaced by the phrase curve (suffix '_phrase')."""
        # Create reference audio files containing only the vocoder degradation.
        self.logger.info("Synthesise phrase curve for [{0}].".format(", ".join(
            [id_name for id_name in file_id_list])))

        # Create an empty dictionary which can be filled with extracted audio features.
        synth_output = dict()
        for id_name in file_id_list:
            synth_output[id_name] = None
        # Fill dictionary with extracted audio features.
        full_output = self.load_extracted_audio_features(synth_output, hparams)

        # Override the lf0 component by the phrase curve.
        for id_name in file_id_list:
            labels = full_output[id_name]
            phrase_curve = np.fromfile(os.path.join(self.OutputGen.dir_labels,
                                                    id_name + self.OutputGen.ext_phrase),
                                       dtype=np.float32)[:len(full_output[id_name])]
            # NOTE(review): column -3 is assumed to hold the lf0, whereas the other
            # methods address it as hparams.num_coded_sps - presumably the same column; verify.
            labels[:, -3] = phrase_curve[:len(labels)]

        # Add identifier to suffix.
        old_synth_file_suffix = hparams.synth_file_suffix
        hparams.synth_file_suffix += '_phrase'

        # Run the vocoder.
        ModelTrainer.synthesize(self, file_id_list, full_output, hparams)

        # Restore identifier.
        hparams.synth_file_suffix = old_synth_file_suffix

    def run_atom_synth(self, file_id_list, synth_output, hparams):
        """
        Reconstruct lf0, get mgc and bap data, and store all in files in self.synth_dir.

        :param file_id_list: List of ids, only used when features are generated by an acoustic model.
        :param synth_output: Dictionary mapping id names to atom labels.
        :param hparams:      Hyper-parameter container.
        :return:             Dictionary mapping id names to the full acoustic features.
        """

        # Get mgc, vuv and bap data either through a trained acoustic model or from data extracted from the audio.
        if hparams.synth_acoustic_model_path is None:
            full_output = self.load_extracted_audio_features(synth_output, hparams)
        else:
            self.logger.warning("This method is untested.")
            # NOTE(review): generate_audio_features has no return statement, so
            # full_output is None on this path - confirm before using it.
            full_output = self.generate_audio_features(file_id_list, hparams)

        # Reconstruct lf0 from generated atoms and write it to synth output.
        recon_dict = self.get_recon_from_synth_output(synth_output, hparams)
        for id_name, lf0 in recon_dict.items():
            full_sample = full_output[id_name]
            len_diff = len(full_sample) - len(lf0)
            full_sample = WorldFeatLabelGen.trim_end_sample(full_sample, int(len_diff / 2), reverse=True)
            full_sample = WorldFeatLabelGen.trim_end_sample(full_sample, len_diff - int(len_diff / 2))
            vuv = np.ones(lf0.shape)
            vuv[lf0 <= math.log(WorldFeatLabelGen.f0_silence_threshold)] = 0.0
            # NOTE(review): the trimmed sample is not stored back into full_output;
            # the writes below reach it only if trim_end_sample returns a view - verify.
            full_sample[:, hparams.num_coded_sps] = lf0
            full_sample[:, hparams.num_coded_sps + 1] = vuv

        return full_output
class AcousticModelTrainer(ModelTrainer):
    """
    Implementation of a ModelTrainer for the generation of acoustic data.

    Use question labels as input and WORLD features w/o deltas/double deltas (specified in hparams.add_deltas) as output.
    Synthesize audio from model output with MLPG smoothing.
    """
    logger = logging.getLogger(__name__)

    #########################
    # Default constructor
    #
    def __init__(self, dir_world_features, dir_question_labels, id_list, num_questions, hparams=None):
        """Default constructor.

        :param dir_world_features:      Path to the directory containing the world features.
        :param dir_question_labels:     Path to the directory containing the question labels.
        :param id_list:                 List of ids, can contain a speaker directory.
        :param num_questions:           Number of questions in question file.
        :param hparams:                 Set of hyper parameters.
        """
        if hparams is None:
            hparams = self.create_hparams()
            hparams.out_dir = os.path.curdir

        # Write missing default parameters.
        if hparams.variable_sequence_length_train is None:
            hparams.variable_sequence_length_train = hparams.batch_size_train > 1
        if hparams.variable_sequence_length_test is None:
            hparams.variable_sequence_length_test = hparams.batch_size_test > 1
        if hparams.synth_dir is None:
            hparams.synth_dir = os.path.join(hparams.out_dir, "synth")

        super(AcousticModelTrainer, self).__init__(id_list, hparams)

        self.InputGen = QuestionLabelGen(dir_question_labels, num_questions)
        self.InputGen.get_normalisation_params(dir_question_labels, hparams.input_norm_params_file_prefix)

        self.OutputGen = WorldFeatLabelGen(dir_world_features,
                                           add_deltas=hparams.add_deltas,
                                           num_coded_sps=hparams.num_coded_sps,
                                           sp_type=hparams.sp_type)
        self.OutputGen.get_normalisation_params(dir_world_features, hparams.output_norm_params_file_prefix)

        self.dataset_train = LabelGensDataset(self.id_list_train, self.InputGen, self.OutputGen, hparams,
                                              match_lengths=True)
        self.dataset_val = LabelGensDataset(self.id_list_val, self.InputGen, self.OutputGen, hparams,
                                            match_lengths=True)

        # Keep a loss function that was already set, otherwise use an unreduced MSE.
        if self.loss_function is None:
            self.loss_function = torch.nn.MSELoss(reduction='none')

        if hparams.scheduler_type == "default":
            hparams.scheduler_type = "Plateau"
            hparams.add_hparams(plateau_verbose=True)

    @staticmethod
    def create_hparams(hparams_string=None, verbose=False):
        """Create model hyper parameter container. Parse non default from given string.

        :param hparams_string:  String with non-default values, forwarded to ModelTrainer.create_hparams.
        :param verbose:         If True, log the debug string of the created container.
        :return:                Hyper-parameter container extended by the acoustic model parameters.
        """
        hparams = ModelTrainer.create_hparams(hparams_string, verbose=False)

        hparams.add_hparams(
            num_questions=None,
            question_file=None,  # Used to add labels in plot.
            num_coded_sps=60,
            sp_type="mcep",
            add_deltas=True,
            synth_load_org_sp=False,
            synth_load_org_lf0=False,
            synth_load_org_vuv=False,
            synth_load_org_bap=False)

        if verbose:
            logging.info(hparams.get_debug_string())

        return hparams

    def gen_figure_from_output(self, id_name, label, hidden, hparams):
        """Plot lf0 and spectrograms of the reference and the network output and save the figure.

        The figure is saved in hparams.out_dir as <id_name>.<net_name>.Org-PyTorch<hparams.gen_figure_ext>.

        :param id_name:  Id of the sample.
        :param label:    Network output of the sample, post-processed here with self.OutputGen.
        :param hidden:   Not used here.
        :param hparams:  Hyper-parameter container.
        """
        labels_post = self.OutputGen.postprocess_sample(label)
        coded_sp, lf0, vuv, _ = WorldFeatLabelGen.convert_to_world_features(
            labels_post, contains_deltas=False, num_coded_sps=hparams.num_coded_sps)
        lf0, _ = interpolate_lin(lf0)

        # Load original lf0.
        org_labels_post = WorldFeatLabelGen.load_sample(id_name,
                                                        self.OutputGen.dir_labels,
                                                        add_deltas=self.OutputGen.add_deltas,
                                                        num_coded_sps=hparams.num_coded_sps)
        original_mgc, original_lf0, original_vuv, *_ = WorldFeatLabelGen.convert_to_world_features(
            org_labels_post, contains_deltas=self.OutputGen.add_deltas, num_coded_sps=hparams.num_coded_sps)
        original_lf0, _ = interpolate_lin(original_lf0)

        # Get a data plotter.
        grid_idx = 0
        plotter = DataPlotter()
        net_name = os.path.basename(hparams.model_name)
        filename = str(os.path.join(hparams.out_dir, id_name + '.' + net_name))
        plotter.set_title(id_name + ' - ' + net_name)
        plotter.set_num_colors(3)
        # plotter.set_lim(grid_idx=0, ymin=math.log(60), ymax=math.log(250))
        plotter.set_label(grid_idx=grid_idx, xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]', ylabel='log(f0)')

        graphs = list()
        graphs.append((original_lf0, 'Original lf0'))
        graphs.append((lf0, 'PyTorch lf0'))
        plotter.set_data_list(grid_idx=grid_idx, data_list=graphs)
        # Mark the unvoiced regions of the prediction and of the reference.
        plotter.set_area_list(grid_idx=grid_idx, area_list=[(np.invert(vuv.astype(bool)), '0.8', 1.0),
                                                            (np.invert(original_vuv.astype(bool)), 'red', 0.2)])

        grid_idx += 1
        import librosa
        plotter.set_label(grid_idx=grid_idx, xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]',
                          ylabel='Original spectrogram')
        plotter.set_specshow(grid_idx=grid_idx,
                             spec=librosa.amplitude_to_db(np.absolute(
                                 WorldFeatLabelGen.mcep_to_amp_sp(original_mgc, hparams.synth_fs)), top_db=None))

        grid_idx += 1
        plotter.set_label(grid_idx=grid_idx, xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]',
                          ylabel='NN spectrogram')
        plotter.set_specshow(grid_idx=grid_idx,
                             spec=librosa.amplitude_to_db(np.absolute(
                                 WorldFeatLabelGen.mcep_to_amp_sp(coded_sp, hparams.synth_fs)), top_db=None))

        # Annotate phonemes only when the required information is given.
        if hasattr(hparams, "phoneme_indices") and hparams.phoneme_indices is not None \
                and hasattr(hparams, "question_file") and hparams.question_file is not None:
            questions = QuestionLabelGen.load_sample(id_name,
                                                     "experiments/" + hparams.voice + "/questions/",
                                                     num_questions=hparams.num_questions)[:len(lf0)]
            np_phonemes = QuestionLabelGen.questions_to_phonemes(questions,
                                                                 hparams.phoneme_indices,
                                                                 hparams.question_file)
            plotter.set_annotations(grid_idx, np_phonemes)

        plotter.gen_plot()
        plotter.save_to_file(filename + '.Org-PyTorch' + hparams.gen_figure_ext)

    def compute_score(self, dict_outputs_post, dict_hiddens, hparams):
        """Compute MCD, F0 RMSE, VUV error rate and BAP error of post-processed outputs against the references.

        Samples whose F0 RMSE is NaN are logged and excluded from the F0 RMSE mean, all other scores are averaged
        over all samples. The worst sample of each score is logged.

        :param dict_outputs_post:  Dictionary mapping each id to its post-processed network output.
        :param dict_hiddens:       Not used here.
        :param hparams:            Hyper-parameter container, num_coded_sps is used here.
        :return:                   Tuple (mcd, f0_rmse, vuv_error_rate, bap_error) of mean scores.
        """
        # Get data for comparision. Load it with the same delta setting it is decoded with below.
        dict_original_post = dict()
        for id_name in dict_outputs_post.keys():
            dict_original_post[id_name] = WorldFeatLabelGen.load_sample(id_name,
                                                                        dir_out=self.OutputGen.dir_labels,
                                                                        add_deltas=self.OutputGen.add_deltas,
                                                                        num_coded_sps=hparams.num_coded_sps)

        f0_rmse = 0.0
        f0_rmse_max_id = "None"
        f0_rmse_max = 0.0
        num_valid_f0_rmse = 0

        vuv_error_rate = 0.0
        vuv_error_max_id = "None"
        vuv_error_max = 0.0

        mcd = 0.0
        mcd_max_id = "None"
        mcd_max = 0.0

        bap_error = 0.0
        bap_error_max_id = "None"
        bap_error_max = 0.0

        for id_name, labels in dict_outputs_post.items():
            output_coded_sp, output_lf0, output_vuv, output_bap = self.OutputGen.convert_to_world_features(
                sample=labels,
                contains_deltas=False,
                num_coded_sps=hparams.num_coded_sps)
            output_vuv = output_vuv.astype(bool)

            # Get data for comparision.
            org_coded_sp, org_lf0, org_vuv, org_bap = self.OutputGen.convert_to_world_features(
                sample=dict_original_post[id_name],
                contains_deltas=self.OutputGen.add_deltas,
                num_coded_sps=hparams.num_coded_sps)

            # Compute f0 from lf0.
            org_f0 = np.exp(org_lf0.squeeze())[:len(output_lf0)]  # Fix minor negligible length mismatch.
            output_f0 = np.exp(output_lf0)

            # Compute MCD.
            org_coded_sp = org_coded_sp[:len(output_coded_sp)]
            current_mcd = metrics.melcd(output_coded_sp[:, 1:], org_coded_sp[:, 1:])  # TODO: Use aligned mcd.
            if current_mcd > mcd_max:
                mcd_max_id = id_name
                mcd_max = current_mcd
            mcd += current_mcd

            # Compute RMSE on the frames that are voiced in the reference.
            f0_mse = (org_f0 - output_f0)**2
            org_vuv_trimmed = org_vuv[:len(output_lf0)]
            current_f0_rmse = math.sqrt((f0_mse * org_vuv_trimmed).sum() / org_vuv_trimmed.sum())
            if math.isnan(current_f0_rmse):
                self.logger.error("Computed NaN for F0 RMSE for {}.".format(id_name))
            else:
                if current_f0_rmse > f0_rmse_max:
                    f0_rmse_max_id = id_name
                    f0_rmse_max = current_f0_rmse
                f0_rmse += current_f0_rmse
                num_valid_f0_rmse += 1

            # Compute error of VUV in percentage.
            num_errors = (org_vuv_trimmed != output_vuv)
            vuv_error_rate_tmp = float(num_errors.sum()) / len(output_lf0)
            if vuv_error_rate_tmp > vuv_error_max:
                vuv_error_max_id = id_name
                vuv_error_max = vuv_error_rate_tmp
            vuv_error_rate += vuv_error_rate_tmp

            # Compute aperiodicity distortion.
            org_bap = org_bap[:len(output_bap)]
            if len(output_bap.shape) > 1 and output_bap.shape[1] > 1:
                current_bap_error = metrics.melcd(output_bap, org_bap)  # TODO: Use aligned mcd?
            else:
                current_bap_error = math.sqrt(((org_bap - output_bap)**2).mean()) * (10.0 / np.log(10) * np.sqrt(2.0))
            if current_bap_error > bap_error_max:
                bap_error_max_id = id_name
                bap_error_max = current_bap_error
            bap_error += current_bap_error

        # Samples with NaN F0 RMSE did not contribute to the sum, so they must not count in the mean either.
        f0_rmse /= max(num_valid_f0_rmse, 1)
        vuv_error_rate /= len(dict_outputs_post)
        mcd /= len(dict_original_post)
        bap_error /= len(dict_original_post)

        self.logger.info("Worst MCD: {} {:4.2f}dB".format(mcd_max_id, mcd_max))
        self.logger.info("Worst F0 RMSE: {} {:4.2f}Hz".format(f0_rmse_max_id, f0_rmse_max))
        self.logger.info("Worst VUV error: {} {:2.2f}%".format(vuv_error_max_id, vuv_error_max * 100))
        self.logger.info("Worst BAP error: {} {:4.2f}db".format(bap_error_max_id, bap_error_max))
        self.logger.info("Benchmark score: MCD {:4.2f}dB, F0 RMSE {:4.2f}Hz, VUV {:2.2f}%, BAP error {:4.2f}db"
                         .format(mcd, f0_rmse, vuv_error_rate * 100, bap_error))

        return mcd, f0_rmse, vuv_error_rate, bap_error

    def synthesize(self, id_list, synth_output, hparams):
        """
        Depending on hparams override the network output with the extracted features,
        then continue with normal synthesis pipeline.

        :param id_list:       Ids of the samples to synthesise.
        :param synth_output:  Dictionary mapping each id to its output features, modified in-place.
        :param hparams:       Hyper-parameter container, the synth_load_org_* flags select the overridden features.
        """
        if hparams.synth_load_org_sp\
                or hparams.synth_load_org_lf0\
                or hparams.synth_load_org_vuv\
                or hparams.synth_load_org_bap:

            # The directory does not depend on the sample, so determine it once.
            world_dir = hparams.world_dir if hasattr(hparams, "world_dir") and hparams.world_dir is not None\
                else os.path.join(self.OutputGen.dir_labels, self.dir_extracted_acoustic_features)
            num_coded_sps = self.OutputGen.num_coded_sps

            for id_name in id_list:
                labels = WorldFeatLabelGen.load_sample(id_name, world_dir, num_coded_sps=hparams.num_coded_sps)
                # Trim longer references at both ends, shorter ones are handled by the slices below.
                len_diff = len(labels) - len(synth_output[id_name])
                if len_diff > 0:
                    labels = WorldFeatLabelGen.trim_end_sample(labels, int(len_diff / 2), reverse=True)
                    labels = WorldFeatLabelGen.trim_end_sample(labels, len_diff - int(len_diff / 2))

                if hparams.synth_load_org_sp:
                    synth_output[id_name][:len(labels), :num_coded_sps] = labels[:, :num_coded_sps]

                if hparams.synth_load_org_lf0:
                    synth_output[id_name][:len(labels), -3] = labels[:, -3]

                if hparams.synth_load_org_vuv:
                    synth_output[id_name][:len(labels), -2] = labels[:, -2]

                if hparams.synth_load_org_bap:
                    synth_output[id_name][:len(labels), -1] = labels[:, -1]

        # Run the vocoder.
        ModelTrainer.synthesize(self, id_list, synth_output, hparams)
def __init__(self, wcad_root, dir_audio, dir_atom_labels, dir_lf0_labels, dir_question_labels, id_list,
             thetas, k, num_questions, dist_window_size=51, hparams=None):
    """Set up label generators, datasets, the nested atom trainer, and the loss.

    :param wcad_root:            Main directory of wcad.
    :param dir_audio:            Directory with the .wav files (not used in this constructor).
    :param dir_atom_labels:      Directory with the .atoms files.
    :param dir_lf0_labels:       Directory with the .lf0 files.
    :param dir_question_labels:  Directory with the .lab files.
    :param id_list:              All ids, the base class splits them.
    :param thetas:               Theta values of the used atoms.
    :param k:                    K-value of atoms.
    :param num_questions:        Expected number of questions in question labels.
    :param dist_window_size:     Distribution size around atom amplitudes, forwarded to the atom trainer.
    :param hparams:              Hyper-parameter container, created with defaults when None.
    """
    if hparams is None:
        hparams = self.create_hparams()
        hparams.out_dir = os.path.curdir

    # Hyper-parameters of the nested atom trainer, copy the own ones when none are given.
    # NOTE(review): the two overrides below are applied to given and copied containers alike - confirm.
    hparams_atom = hparams.hparams_atom
    if hparams_atom is None:
        hparams_atom = copy.deepcopy(hparams)
    hparams_atom.synth_gen_figure = False
    hparams_atom.synth_acoustic_model_path = None

    if hparams.atom_model_path is None:
        atom_model_name = hparams.model_name + "_atoms"
        hparams.atom_model_path = os.path.join(hparams.out_dir, hparams.networks_dir, atom_model_name)

    # Fill in the default synthesis directory.
    if hparams.synth_dir is None:
        hparams.synth_dir = os.path.join(hparams.out_dir, "synth")

    super().__init__(id_list, hparams)

    # Input: normalised question labels.
    input_gen = QuestionLabelGen(dir_question_labels, num_questions)
    self.InputGen = input_gen
    input_gen.get_normalisation_params(dir_question_labels, hparams.input_norm_params_file_prefix)

    # Output: flat LF0 labels.
    output_gen = FlatLF0LabelGen(dir_lf0_labels, dir_atom_labels)
    self.OutputGen = output_gen
    output_gen.get_normalisation_params(dir_atom_labels, hparams.output_norm_params_file_prefix)

    self.dataset_train = PyTorchLabelGensDataset(
        self.id_list_train, self.InputGen, self.OutputGen, hparams, match_lengths=True)
    self.dataset_val = PyTorchLabelGensDataset(
        self.id_list_val, self.InputGen, self.OutputGen, hparams, match_lengths=True)

    self.atom_trainer = AtomVUVDistPosModelTrainer(
        wcad_root, dir_atom_labels, dir_lf0_labels, dir_question_labels,
        id_list, thetas, k, num_questions, dist_window_size, hparams_atom)

    # Only install the default loss when none was set before.
    if self.loss_function is None:
        self.loss_function = L1WeightedVUVMSELoss(weight_unvoiced=hparams.weight_unvoiced,
                                                  vuv_loss_weight=hparams.vuv_loss_weight,
                                                  L1_loss_weight=hparams.L1_loss_weight,
                                                  reduce=False)

    if hparams.scheduler_type == "default":
        hparams.scheduler_type = "None"

    # Override the collate and decollate methods of batches.
    self.batch_collate_fn = self.prepare_batch
    self.batch_decollate_fn = self.decollate_network_output
def test_load(self):
    """A loaded question sample has as many columns as questions were requested."""
    first_id = self.id_list[0]
    loaded = QuestionLabelGen.load_sample(first_id, self.dir_questions, num_questions=409)
    num_columns = loaded.shape[1]
    self.assertEqual(409, num_columns)
def __init__(self, wcad_root, dir_atom_labels, dir_lf0_labels, dir_question_labels, id_list,
             thetas, k, num_questions, dist_window_size=51, hparams=None):
    """Fill in default hyper-parameters and create label generators, datasets, and the loss.

    :param wcad_root:            Main directory of wcad.
    :param dir_atom_labels:      Directory with the atom labels.
    :param dir_lf0_labels:       Directory with the .lf0 files.
    :param dir_question_labels:  Directory with the .lab files.
    :param id_list:              All ids, the base class splits them.
    :param thetas:               Theta values of atoms.
    :param k:                    K-value of atoms.
    :param num_questions:        Expected number of questions in question labels.
    :param dist_window_size:     Window size handed to the output label generator.
    :param hparams:              Hyper-parameter container, created with defaults when None.
    """
    if hparams is None:
        hparams = self.create_hparams()
        hparams.out_dir = os.path.curdir

    # Fill in defaults of unset parameters.
    if hparams.variable_sequence_length_train is None:
        hparams.variable_sequence_length_train = hparams.batch_size_train > 1
    if hparams.variable_sequence_length_test is None:
        hparams.variable_sequence_length_test = hparams.batch_size_test > 1
    if hparams.synth_dir is None:
        hparams.synth_dir = os.path.join(hparams.out_dir, "synth")

    # Without a given weight for zero frames derive both weights from the expected occurrence of atoms.
    if getattr(hparams, "weight_zero", None) is None:
        non_zero_occurrence = min(0.99, 0.015 / len(thetas))
        zero_occurrence = 1 - non_zero_occurrence
        hparams.add_hparam("weight_non_zero", 1 / non_zero_occurrence)
        hparams.add_hparam("weight_zero", 1 / zero_occurrence)
    if getattr(hparams, "weight_vuv", None) is None:
        hparams.add_hparam("weight_vuv", 0.5)
    if getattr(hparams, "atom_loss_theta", None) is None:
        hparams.add_hparam("atom_loss_theta", 0.01)

    # Explicitly call only the constructor of the baseclass of AtomModelTrainer.
    super(AtomModelTrainer, self).__init__(id_list, hparams)

    window_is_even = hparams.dist_window_size % 2 == 0
    if window_is_even:
        hparams.dist_window_size += 1
        self.logger.warning("hparams.dist_window_size should be odd, changed it to "
                            + str(hparams.dist_window_size))

    question_gen = QuestionLabelGen(dir_question_labels, num_questions)
    self.InputGen = question_gen
    question_gen.get_normalisation_params(dir_question_labels, hparams.input_norm_params_file_prefix)

    # Overwrite OutputGen by the one with beta distribution.
    # NOTE(review): the generator gets the dist_window_size argument, not hparams.dist_window_size - confirm.
    atom_gen = AtomVUVDistPosLabelGen(wcad_root, dir_atom_labels, dir_lf0_labels, thetas, k,
                                      hparams.frame_size_ms, window_size=dist_window_size)
    self.OutputGen = atom_gen
    atom_gen.get_normalisation_params(dir_atom_labels, hparams.output_norm_params_file_prefix)

    self.dataset_train = PyTorchLabelGensDataset(
        self.id_list_train, self.InputGen, self.OutputGen, hparams, match_lengths=True)
    self.dataset_val = PyTorchLabelGensDataset(
        self.id_list_val, self.InputGen, self.OutputGen, hparams, match_lengths=True)

    # Only install the default loss when none was set before.
    if self.loss_function is None:
        self.loss_function = WeightedNonzeroWMSEAtomLoss(use_gpu=hparams.use_gpu,
                                                         theta=hparams.atom_loss_theta,
                                                         weights_vuv=hparams.weight_vuv,
                                                         weights_zero=hparams.weight_zero,
                                                         weights_non_zero=hparams.weight_non_zero,
                                                         reduce=False)

    if hparams.scheduler_type == "default":
        hparams.scheduler_type = "None"
class AtomVUVDistPosModelTrainer(AtomModelTrainer):
    """
    Subclass of AtomModelTrainer, which uses one amplitude per theta plus position flag, format is T x (|thetas| + 1).
    Each amplitude in the target labels is surrounded by a distribution. Positions of atoms are identified
    by finding the peaks of the position flag prediction. For positive peaks the theta with the highest amplitude
    is used, for negative peaks the theta with the lowest amplitude.
    Acoustic data is generated from these atoms. MGC and BAP is either generated by a pre-trained acoustic model
    or loaded from the original extracted files. Question labels are used as input.
    """
    logger = logging.getLogger(__name__)

    def __init__(self, wcad_root, dir_atom_labels, dir_lf0_labels, dir_question_labels, id_list,
                 thetas, k, num_questions, dist_window_size=51, hparams=None):
        """Default constructor.

        :param wcad_root:               Path to main directory of wcad.
        :param dir_atom_labels:         Path to directory that contains the atom labels.
        :param dir_lf0_labels:          Path to directory that contains the .lf0 files.
        :param dir_question_labels:     Path to directory that contains the .lab files.
        :param id_list:                 List containing all ids. Subset is taken as test set.
        :param thetas:                  List of theta values of atoms.
        :param k:                       K-value of atoms.
        :param num_questions:           Expected number of questions in question labels.
        :param dist_window_size:        Width of the distribution surrounding each atom spike
                                        The window is only used for amps. Thetas are surrounded by a window of 5.
        :param hparams:                 Hyper-parameter container.
        """
        if hparams is None:
            hparams = self.create_hparams()
            hparams.out_dir = os.path.curdir

        # Write missing default parameters.
        if hparams.variable_sequence_length_train is None:
            hparams.variable_sequence_length_train = hparams.batch_size_train > 1
        if hparams.variable_sequence_length_test is None:
            hparams.variable_sequence_length_test = hparams.batch_size_test > 1
        if hparams.synth_dir is None:
            hparams.synth_dir = os.path.join(hparams.out_dir, "synth")

        # If the weight for unvoiced frames is not given, compute it to get equal weights.
        if not hasattr(hparams, "weight_zero") or hparams.weight_zero is None:
            non_zero_occurrence = min(0.99, 0.015 / len(thetas))
            zero_occurrence = 1 - non_zero_occurrence
            hparams.add_hparam("weight_non_zero", 1 / non_zero_occurrence)
            hparams.add_hparam("weight_zero", 1 / zero_occurrence)
        if not hasattr(hparams, "weight_vuv") or hparams.weight_vuv is None:
            hparams.add_hparam("weight_vuv", 0.5)
        if not hasattr(hparams, "atom_loss_theta") or hparams.atom_loss_theta is None:
            hparams.add_hparam("atom_loss_theta", 0.01)

        # Explicitly call only the constructor of the baseclass of AtomModelTrainer.
        super(AtomModelTrainer, self).__init__(id_list, hparams)

        if hparams.dist_window_size % 2 == 0:
            hparams.dist_window_size += 1
            self.logger.warning("hparams.dist_window_size should be odd, changed it to "
                                + str(hparams.dist_window_size))

        self.InputGen = QuestionLabelGen(dir_question_labels, num_questions)
        self.InputGen.get_normalisation_params(dir_question_labels, hparams.input_norm_params_file_prefix)

        # Overwrite OutputGen by the one with beta distribution.
        # NOTE(review): the generator receives the dist_window_size argument, not the (possibly adjusted)
        # hparams.dist_window_size - confirm which one is intended.
        self.OutputGen = AtomVUVDistPosLabelGen(wcad_root, dir_atom_labels, dir_lf0_labels, thetas, k,
                                                hparams.frame_size_ms, window_size=dist_window_size)
        self.OutputGen.get_normalisation_params(dir_atom_labels, hparams.output_norm_params_file_prefix)

        self.dataset_train = PyTorchLabelGensDataset(self.id_list_train, self.InputGen, self.OutputGen, hparams,
                                                     match_lengths=True)
        self.dataset_val = PyTorchLabelGensDataset(self.id_list_val, self.InputGen, self.OutputGen, hparams,
                                                   match_lengths=True)

        if self.loss_function is None:
            self.loss_function = WeightedNonzeroWMSEAtomLoss(use_gpu=hparams.use_gpu,
                                                             theta=hparams.atom_loss_theta,
                                                             weights_vuv=hparams.weight_vuv,
                                                             weights_zero=hparams.weight_zero,
                                                             weights_non_zero=hparams.weight_non_zero,
                                                             reduce=False)

        if hparams.scheduler_type == "default":
            hparams.scheduler_type = "None"

    @staticmethod
    def create_hparams(hparams_string=None, verbose=False):
        """Create the hyper-parameter container of AtomModelTrainer extended by dist_window_size (51) and
        synth_acoustic_model (None).

        :param hparams_string:  String with non-default values, forwarded to AtomModelTrainer.create_hparams.
        :param verbose:         If True, log the debug string of the created container.
        :return:                Hyper-parameter container.
        """
        hparams = AtomModelTrainer.create_hparams(hparams_string, verbose=False)
        hparams.add_hparam("dist_window_size", 51)
        hparams.add_hparam("synth_acoustic_model", None)

        if verbose:
            logging.info(hparams.get_debug_string())

        return hparams

    def gen_figure_from_output(self, id_name, label, hidden, hparams):
        """Plot amplitudes, position flag, peaks, targets and lf0 curves of one sample and save the figure.

        The figure is saved in hparams.out_dir as <id_name>.<net_name>.VUV_DIST_POS<hparams.gen_figure_ext>.

        :param id_name:  Id of the sample.
        :param label:    Network output of the sample, post-processed here with self.OutputGen.
        :param hidden:   Not used here.
        :param hparams:  Hyper-parameter container.
        """
        # Retrieve data from label.
        output_amps = label[:, 1:-1]
        output_pos = label[:, -1]
        labels_post = self.OutputGen.postprocess_sample(label)
        output_vuv = labels_post[:, 0, 1].astype(bool)
        output_atoms = self.OutputGen.labels_to_atoms(labels_post, k=hparams.k, amp_threshold=hparams.min_atom_amp)
        output_lf0 = self.OutputGen.atoms_to_lf0(output_atoms, len(label))

        # Load original lf0 and vuv.
        world_dir = hparams.world_dir if hasattr(hparams, "world_dir") and hparams.world_dir is not None\
            else os.path.join(self.OutputGen.dir_labels, self.dir_extracted_acoustic_features)
        org_labels = LF0LabelGen.load_sample(id_name, world_dir)
        original_lf0, _ = LF0LabelGen.convert_to_world_features(org_labels)
        original_lf0, _ = interpolate_lin(original_lf0)

        # Remove the phrase curve from the original lf0.
        phrase_curve = np.fromfile(os.path.join(self.OutputGen.dir_labels, id_name + self.OutputGen.ext_phrase),
                                   dtype=np.float32).reshape(-1, 1)
        original_lf0[:len(phrase_curve)] -= phrase_curve[:len(original_lf0)]
        original_lf0 = original_lf0[:len(output_lf0)]

        org_labels = self.OutputGen.load_sample(id_name,
                                                self.OutputGen.dir_labels,
                                                len(hparams.thetas),
                                                self.OutputGen.dir_world_labels)
        org_vuv = org_labels[:, 0, 0].astype(bool)
        org_labels = org_labels[:, 1:]
        len_diff = len(org_labels) - len(labels_post)
        # NOTE(review): both calls trim without reverse=True and the second one uses "+ 1", which differs from
        # the front/end trimming in synthesize - confirm that this is intended.
        org_labels = self.OutputGen.trim_end_sample(org_labels, int(len_diff / 2.0))
        org_labels = self.OutputGen.trim_end_sample(org_labels, int(len_diff / 2.0) + 1)
        org_atoms = AtomLabelGen.labels_to_atoms(org_labels, k=hparams.k, frame_size=hparams.frame_size_ms)
        wcad_lf0 = self.OutputGen.atoms_to_lf0(org_atoms, len(org_labels))

        # Get a data plotter
        net_name = os.path.basename(hparams.model_name)
        filename = str(os.path.join(hparams.out_dir, id_name + '.' + net_name))
        plotter = DataPlotter()
        plotter.set_title(id_name + " - " + net_name)

        # Grid 0: predicted amplitude of each theta.
        grid_idx = 0
        graphs_output = list()
        for idx in reversed(range(output_amps.shape[1])):
            graphs_output.append((output_amps[:, idx], r'$\theta$={0:.3f}'.format(hparams.thetas[idx])))
        plotter.set_data_list(grid_idx=grid_idx, data_list=graphs_output)
        plotter.set_label(grid_idx=grid_idx, ylabel='NN amps')
        amp_max = np.max(output_amps) * 1.1
        amp_min = np.min(output_amps) * 1.1
        plotter.set_lim(grid_idx=grid_idx, ymin=amp_min, ymax=amp_max)

        # Grid 1: predicted position flag.
        grid_idx += 1
        graphs_pos_flag = list()
        graphs_pos_flag.append((output_pos,))
        plotter.set_data_list(grid_idx=grid_idx, data_list=graphs_pos_flag)
        plotter.set_label(grid_idx=grid_idx, ylabel='NN pos')

        # Grid 2: peaks of the post-processed output.
        grid_idx += 1
        graphs_peaks = list()
        for idx in reversed(range(label.shape[1] - 2)):
            graphs_peaks.append((labels_post[:, 1 + idx, 0],))
        plotter.set_data_list(grid_idx=grid_idx, data_list=graphs_peaks)
        plotter.set_area_list(grid_idx=grid_idx, area_list=[(np.invert(output_vuv), '0.75', 1.0, 'Unvoiced')])
        plotter.set_label(grid_idx=grid_idx, ylabel='NN peaks')
        plotter.set_lim(grid_idx=grid_idx, ymin=-1.8, ymax=1.8)

        # Grid 3: target atoms.
        grid_idx += 1
        graphs_target = list()
        for idx in reversed(range(org_labels.shape[1])):
            graphs_target.append((org_labels[:, idx, 0],))
        plotter.set_data_list(grid_idx=grid_idx, data_list=graphs_target)
        plotter.set_hatchstyles(grid_idx=grid_idx, hatchstyles=['\\\\'])
        plotter.set_area_list(grid_idx=grid_idx, area_list=[(np.invert(org_vuv.astype(bool)), '0.75', 1.0,
                                                             'Reference unvoiced')])
        plotter.set_label(grid_idx=grid_idx, ylabel='target')
        plotter.set_lim(grid_idx=grid_idx, ymin=-1.8, ymax=1.8)

        # Grid 4: lf0 curves.
        grid_idx += 1
        graphs_lf0 = list()
        graphs_lf0.append((wcad_lf0, "wcad lf0"))
        graphs_lf0.append((original_lf0, "org lf0"))
        graphs_lf0.append((output_lf0, "predicted lf0"))
        plotter.set_data_list(grid_idx=grid_idx, data_list=graphs_lf0)
        plotter.set_area_list(grid_idx=grid_idx, area_list=[(np.invert(org_vuv.astype(bool)), '0.75', 1.0)])
        plotter.set_hatchstyles(grid_idx=grid_idx, hatchstyles=['\\\\'])
        plotter.set_label(grid_idx=grid_idx, xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]', ylabel='lf0')
        amp_lim = max(np.max(np.abs(wcad_lf0)), np.max(np.abs(output_lf0))) * 1.1
        plotter.set_lim(grid_idx=grid_idx, ymin=-amp_lim, ymax=amp_lim)
        plotter.set_linestyles(grid_idx=grid_idx, linestyles=[':', '--', '-'])

        # # Compute F0 RMSE for sample and add it to title.
        # org_f0 = (np.exp(lf0.squeeze() + phrase_curve[:len(lf0)].squeeze()) * vuv)[:len(output_lf0)]  # Fix minor negligible length mismatch.
        # output_f0 = np.exp(output_lf0 + phrase_curve[:len(output_lf0)].squeeze()) * output_vuv[:len(output_lf0)]
        # f0_mse = (org_f0 - output_f0) ** 2
        # # non_zero_count = np.logical_and(vuv[:len(output_lf0)], output_vuv).sum()
        # f0_rmse = math.sqrt(f0_mse.sum() / (np.logical_and(vuv[:len(output_lf0)], output_vuv).sum()))

        # # Compute vuv error rate.
        # num_errors = (vuv[:len(output_lf0)] != output_vuv)
        # vuv_error_rate = float(num_errors.sum()) / len(output_lf0)
        # plotter.set_title(id_name + " - " + net_name + " - F0_RMSE_" + "{:4.2f}Hz".format(f0_rmse) + " - VUV_" + "{:2.2f}%".format(vuv_error_rate * 100))
        # plotter.set_lim(xmin=300, xmax=1100)

        # NOTE(review): the plot is generated twice, once monochrome and once with defaults - confirm whether
        # the first call is still needed.
        plotter.gen_plot(monochrome=True)
        plotter.gen_plot()
        plotter.save_to_file(filename + ".VUV_DIST_POS" + hparams.gen_figure_ext)

    def compute_score(self, dict_outputs_post, dict_hiddens, hparams):
        """Compute the score of a dictionary with post-processes labels.

        Samples whose F0 RMSE is NaN are logged and excluded from the F0 RMSE mean, so that a single
        such sample does not turn the benchmark score into NaN. The VUV error is averaged over all samples.

        :param dict_outputs_post:  Dictionary mapping each id to its post-processed network output.
        :param dict_hiddens:       Not used here.
        :param hparams:            Hyper-parameter container.
        :return:                   Tuple (f0_rmse, vuv_error_rate) of mean scores.
        """
        # Get data for comparision.
        dict_original_post = self.load_extracted_audio_features(dict_outputs_post, hparams)

        f0_rmse = 0.0
        vuv_error_rate = 0.0
        f0_rmse_max_id = "None"
        f0_rmse_max = 0.0
        num_valid_f0_rmse = 0
        vuv_error_max_id = "None"
        vuv_error_max = 0.0

        for id_name, label in dict_outputs_post.items():
            output_vuv = label[:, 0, 1].astype(bool)
            output_atom_labels = label[:, 1:]
            output_lf0 = self.OutputGen.labels_to_lf0(output_atom_labels,
                                                      k=hparams.k,
                                                      frame_size=hparams.frame_size_ms,
                                                      amp_threshold=hparams.min_atom_amp)

            # Get data for comparision.
            org_lf0 = dict_original_post[id_name][:, hparams.num_coded_sps]
            org_vuv = dict_original_post[id_name][:, hparams.num_coded_sps + 1]
            phrase_curve = self.get_phrase_curve(id_name)

            # Compute f0 from lf0.
            org_f0 = np.exp(org_lf0.squeeze())[:len(output_lf0)]  # Fix minor negligible length mismatch.
            output_f0 = np.exp(output_lf0 + phrase_curve[:len(output_lf0)].squeeze())

            # Compute RMSE on the frames that are voiced in the reference, keep track of worst RMSE.
            f0_mse = (org_f0 - output_f0) ** 2
            org_vuv_trimmed = org_vuv[:len(output_lf0)]
            current_f0_rmse = math.sqrt((f0_mse * org_vuv_trimmed).sum() / org_vuv_trimmed.sum())
            if math.isnan(current_f0_rmse):
                self.logger.error("Computed NaN for F0 RMSE for {}.".format(id_name))
            else:
                if current_f0_rmse > f0_rmse_max:
                    f0_rmse_max_id = id_name
                    f0_rmse_max = current_f0_rmse
                f0_rmse += current_f0_rmse
                num_valid_f0_rmse += 1

            # Compute vuv error rate.
            num_errors = (org_vuv_trimmed != output_vuv)
            vuv_error_rate_tmp = float(num_errors.sum()) / len(output_lf0)
            if vuv_error_rate_tmp > vuv_error_max:
                vuv_error_max_id = id_name
                vuv_error_max = vuv_error_rate_tmp
            vuv_error_rate += vuv_error_rate_tmp

        # Samples with NaN F0 RMSE did not contribute to the sum, so they must not count in the mean either.
        f0_rmse /= max(num_valid_f0_rmse, 1)
        vuv_error_rate /= len(dict_outputs_post)

        self.logger.info("Worst F0 RMSE: " + f0_rmse_max_id + " {:4.2f}Hz".format(f0_rmse_max))
        self.logger.info("Worst VUV error: " + vuv_error_max_id + " {:2.2f}%".format(vuv_error_max * 100))
        self.logger.info("Benchmark score: F0 RMSE " + "{:4.2f}Hz".format(f0_rmse)
                         + ", VUV " + "{:2.2f}%".format(vuv_error_rate * 100))

        return f0_rmse, vuv_error_rate

    def synthesize(self, id_list, synth_output, hparams):
        """
        Synthesise LF0 from atoms. The run_atom_synth function either loads the original acoustic features or uses an
        acoustic model to predict them.

        :param id_list:       Ids of the samples to synthesise.
        :param synth_output:  Dictionary mapping each id to its post-processed network output.
        :param hparams:       Hyper-parameter container.
        """
        full_output = self.run_atom_synth(id_list, synth_output, hparams)

        for id_name, labels in full_output.items():
            # NOTE(review): the interpolated lf0 is never written back to labels - confirm whether
            # labels[:, -3] should be updated.
            lf0 = labels[:, -3]
            lf0, _ = interpolate_lin(lf0)
            vuv = synth_output[id_name][:, 0, 1]
            # Split the length difference between front and end of the sample.
            len_diff = len(labels) - len(vuv)
            labels = WorldFeatLabelGen.trim_end_sample(labels, int(len_diff / 2), reverse=True)
            labels = WorldFeatLabelGen.trim_end_sample(labels, len_diff - int(len_diff / 2))
            # Use the predicted vuv flag.
            labels[:, -2] = vuv

        # Run the vocoder.
        ModelTrainer.synthesize(self, id_list, full_output, hparams)
def __init__(self, wcad_root, dir_atom_labels, dir_question_labels, id_list, thetas, k, num_questions,
             hparams=None):
    """Fill in default hyper-parameters and create label generators, datasets, and the loss.

    :param wcad_root:            Main directory of wcad.
    :param dir_atom_labels:      Directory with the .atom files.
    :param dir_question_labels:  Directory with the .questions files.
    :param id_list:              All ids, the base class splits them.
    :param thetas:               Theta values.
    :param k:                    K value of atoms.
    :param num_questions:        Expected number of questions in question labels.
    :param hparams:              Hyper-parameter container, created with defaults when None.
    """
    if hparams is None:
        hparams = self.create_hparams()
        hparams.out_dir = os.path.curdir

    # Fill in defaults of unset parameters.
    if hparams.variable_sequence_length_train is None:
        hparams.variable_sequence_length_train = hparams.batch_size_train > 1
    if hparams.variable_sequence_length_test is None:
        hparams.variable_sequence_length_test = hparams.batch_size_test > 1
    if hparams.synth_dir is None:
        hparams.synth_dir = os.path.join(hparams.out_dir, "synth")

    # Default weights are the inverse of the expected occurrence of non-zero and zero frames.
    non_zero_occurrence = min(0.99, 0.02 / len(thetas))
    zero_occurrence = 1 - non_zero_occurrence
    default_weight_non_zero = 1 / non_zero_occurrence
    default_weight_zero = 1 / zero_occurrence
    if not hasattr(hparams, "weight_zero"):
        # Weights are unknown to the container, register them.
        hparams.add_hparam("weight_non_zero", default_weight_non_zero)
        hparams.add_hparam("weight_zero", default_weight_zero)
    elif hparams.weight_zero is None:
        # Weights are known but unset, overwrite them.
        hparams.weight_non_zero = default_weight_non_zero
        hparams.weight_zero = default_weight_zero

    super().__init__(id_list, hparams)

    question_gen = QuestionLabelGen(dir_question_labels, num_questions)
    self.InputGen = question_gen
    question_gen.get_normalisation_params(dir_question_labels, hparams.input_norm_params_file_prefix)

    atom_gen = AtomLabelGen(wcad_root, dir_atom_labels, thetas, k, hparams.frame_size_ms)
    self.OutputGen = atom_gen
    atom_gen.get_normalisation_params(dir_atom_labels, hparams.output_norm_params_file_prefix)

    self.dataset_train = PyTorchLabelGensDataset(
        self.id_list_train, self.InputGen, self.OutputGen, hparams, match_lengths=True)
    self.dataset_val = PyTorchLabelGensDataset(
        self.id_list_val, self.InputGen, self.OutputGen, hparams, match_lengths=True)

    # Only install the default loss when none was set before.
    if self.loss_function is None:
        self.loss_function = WeightedNonzeroMSELoss(hparams.use_gpu,
                                                    hparams.weight_zero,
                                                    hparams.weight_non_zero,
                                                    size_average=False,
                                                    reduce=False)

    if hparams.scheduler_type == "default":
        hparams.scheduler_type = "Plateau"
        hparams.add_hparams(plateau_patience=10,
                            plateau_factor=0.5,
                            plateau_verbose=True)
def run_DM_AM(hparams, input_strings):
    """
    A function for TTS with a pre-trained duration and acoustic model.

    The texts are passed through the front end, durations are predicted by the
    duration model and written into state-aligned full labels, questions are
    generated from those labels, and the acoustic model synthesises the result.
    All intermediate files live in a temporary directory that is removed when
    the function finishes. Several fields of hparams are overwritten on the way
    (e.g. out_dir, batch_size_test, test_set_perc, val_set_perc,
    phoneme_label_type, model_path, model_name, load_from_checkpoint,
    output_norm_params_file_prefix).

    NOTE(review): hparams.out_dir points to the temporary directory, and
    AcousticModelTrainer defaults an unset synth_dir to <out_dir>/synth, so
    hparams.synth_dir presumably has to be set by the caller for the output to
    survive - confirm.

    :param hparams:          Hyper-parameter container. The following parameters are used:
                             front_end:                  Full path to the makeLabels.sh script in scripts/tts_frontend, depends on the language.
                             festival_dir:               Full path to the directory with the festival bin/ folder.
                             front_end_accent (optional): Give an accent to the front_end, used in tts_frontend.
                             duration_labels_dir:        Full path to the folder containing the normalisation parameters used to train the duration model.
                             duration_norm_file_name (optional): Used as output_norm_params_file_prefix while the duration model runs.
                             file_symbol_dict:           A file containing all the used phonemes (has been used to train the duration model, usually mono_phone.list).
                             duration_model:             Full path to the pre-trained duration model.
                             num_phoneme_states:         Number of states per phoneme, for each a duration is predicted by the duration model.
                             question_file:              Full path to question file used to train the acoustic model.
                             question_labels_norm_file:  Full path to normalisation file of questions used to train the acoustic model.
                             num_questions:              Number of questions which form the input dimension to the acoustic model.
                             world_features_dir:         Directory handed to the AcousticModelTrainer as its world features directory.
                             acoustic_model:             Full path to acoustic model.
                             synth_dir:                  Directory reported in the final log message as location of the synthesised files.
    :param input_strings:    Sequence of texts to synthesise; the i-th text gets the id "synth<i>".
    :return:                 Always 0.
    """
    # Everything is produced inside a throw-away working directory.
    # (For debugging, a persistent directory such as os.path.realpath("TMP")
    # can be used here instead.)
    with tempfile.TemporaryDirectory() as work_dir:
        hparams.out_dir = work_dir
        print("Created temporary directory", work_dir)
        num_utterances = len(input_strings)
        id_list = ["synth" + str(idx) for idx in range(num_utterances)]

        # Collect all texts with their ids in one synth.txt file.
        utts_file = os.path.join(work_dir, "synth.txt")
        with open(utts_file, "w") as utts_out:
            for idx, text in enumerate(input_strings):
                # TODO: Remove parenthesis etc.
                utts_out.write("synth{}\t{}\n".format(idx, text))

        # Run the front end on synth.txt; the accent is an optional argument
        # placed before the output directory.
        front_end_cmd = [hparams.front_end, hparams.festival_dir, utts_file]
        accent = getattr(hparams, "front_end_accent", None)
        if accent is not None:
            front_end_cmd.append(accent)
        front_end_cmd.append(work_dir)
        subprocess.check_call(front_end_cmd)

        # Move the mono labels to mono_no_align and strip their durations.
        dir_mono_no_align = os.path.join(work_dir, "mono_no_align")
        dir_mono = os.path.join(work_dir, "labels", "mono")
        if os.path.isdir(dir_mono_no_align):
            shutil.rmtree(dir_mono_no_align)
        os.rename(dir_mono, dir_mono_no_align)
        for utt_id in id_list:
            lab_path = os.path.join(dir_mono_no_align, utt_id + ".lab")
            with open(lab_path, "r") as lab_in:
                tokens = lab_in.read().split()
            # Keep every third token starting at index 2
            # (presumably lines are "<start> <end> <phoneme>" - TODO confirm).
            with open(lab_path, "w") as lab_out:
                lab_out.write("\n".join(tokens[2::3]))

        # Configure and run the duration model on all utterances at once.
        hparams.batch_size_test = num_utterances
        hparams.test_set_perc = 0.0
        hparams.val_set_perc = 0.0
        hparams.phoneme_label_type = "mono_no_align"
        hparams.output_norm_params_file_prefix = getattr(
            hparams, "duration_norm_file_name", None)
        duration_model_trainer = DurationModelTrainer(
            dir_mono_no_align,
            hparams.duration_labels_dir,
            id_list,
            hparams.file_symbol_dict,
            hparams)
        assert hparams.duration_model is not None, "Path to duration model in hparams.duration_model is needed."
        hparams.model_path = hparams.duration_model
        hparams.model_name = os.path.basename(hparams.duration_model)

        # Predict durations. Durations are already converted to multiples of hparams.min_phoneme_length.
        hparams.load_from_checkpoint = True
        duration_model_trainer.init(hparams)
        _, output_dict_post = duration_model_trainer.forward(hparams, id_list)
        hparams.output_norm_params_file_prefix = None  # Reset again.

        # Combine the predicted durations with the full labels into
        # state-aligned labels.
        dir_full = os.path.join(work_dir, "labels", "full")
        dir_label_state_align = os.path.join(
            work_dir, "labels", "label_state_align")
        makedirs_safe(dir_label_state_align)
        for utt_id in id_list:
            lab_name = utt_id + ".lab"
            with open(os.path.join(dir_full, lab_name), "r") as lab_in:
                full_labels = lab_in.read().split()[2::3]

            with open(os.path.join(dir_label_state_align, lab_name), "w") as lab_out:
                start_time = 0
                durations = output_dict_post[utt_id]
                for label_idx, label in enumerate(full_labels):
                    for state in range(hparams.num_phoneme_states):
                        end_time = start_time + int(durations[label_idx, state])
                        # State indices in the label file start at 2.
                        lab_out.write("{}\t{}\t{}[{}]\n".format(
                            start_time, end_time, label, state + 2))
                        start_time = end_time

        # Generate questions from HTK full labels.
        QuestionLabelGen.gen_data(dir_label_state_align,
                                  hparams.question_file,
                                  dir_out=work_dir,
                                  file_id_list="synth",
                                  id_list=id_list,
                                  return_dict=False)

        # Run acoustic model and synthesise. The question normalisation
        # parameters are copied next to the generated questions first.
        shutil.copy2(hparams.question_labels_norm_file,
                     work_dir + "/min-max.bin")
        acoustic_model_trainer = AcousticModelTrainer(
            hparams.world_features_dir,
            work_dir,
            id_list,
            hparams.num_questions,
            hparams)
        assert hparams.acoustic_model is not None, "Path to acoustic model in hparams.acoustic_model is needed."
        hparams.model_path = hparams.acoustic_model
        hparams.model_name = os.path.basename(hparams.acoustic_model)
        hparams.load_from_checkpoint = True
        acoustic_model_trainer.init(hparams)
        hparams.model_name = ""  # No suffix in synthesised files.
        _, _synth_output = acoustic_model_trainer.synth(hparams, id_list)

        logging.info("Synthesized files are in {}.".format(
            hparams.synth_dir))

    return 0