def __init__(self, model_dir, question_file_name, silence_pattern='/2:sil/'):
    """Initialise an acoustic model wrapper from a trained model directory.

    Loads stream layout information, builds the HTS label expander and the
    MLPG parameter generator, records which binary-label feature indices
    correspond to silence, and pre-splits the output std-dev vector into
    per-stream pieces for later MLPG use.

    :param model_dir: directory holding the trained network and stream_info.txt
    :param question_file_name: HTS question file used to expand input labels
    :param silence_pattern: substring identifying silence questions
    """
    ## TODO: where to handle silence pattern? Currently fragile
    super(NNAcousticModel, self).__init__(model_dir)
    self.load_stream_info()
    self.label_expander = HTSLabelNormalisation(question_file_name=question_file_name)
    self.param_generator = MLParameterGenerationFast()  # ParameterGeneration()
    self.silent_feature_indices = self.get_silent_feature_indices(
        question_file_name, silence_pattern)
    # Reshape the flat output std-dev vector to (1, outdim) and split it
    # per output stream (removed an unused `m = numpy.shape(std)` local).
    std = self.output_std
    std = std.reshape((1, self.outdim))
    self.stream_std = self.split_into_streams(std)
def load_nnets_models(cfg_list):
    """Load one pickled trained network per configuration in *cfg_list*.

    The model filename is reconstructed from each config's hyper-parameters
    (the run_lstm naming scheme: ...rnn.model). Exactly two configurations
    are expected; an AssertionError is raised otherwise.

    :param cfg_list: iterable of two Merlin configuration objects
    :return: list of two unpickled network objects
    """
    nnets_model_list = []
    for cfg in cfg_list:
        # load two nnets_models into memory
        model_dir = os.path.join(cfg.work_dir, 'nnets_model')
        hidden_layer_size = cfg.hyper_params['hidden_layer_size']
        combined_model_arch = str(len(hidden_layer_size))
        for hid_size in hidden_layer_size:
            combined_model_arch += '_' + str(hid_size)
        label_normaliser = HTSLabelNormalisation(
            question_file_name=cfg.question_file_name,
            add_frame_features=cfg.add_frame_features,
            subphone_feats=cfg.subphone_feats)
        add_feat_dim = sum(cfg.additional_features.values())
        lab_dim = label_normaliser.dimension + add_feat_dim + cfg.appended_input_dim
        nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.%f.rnn.model' \
            % (model_dir, cfg.combined_model_name, cfg.combined_feature_name,
               int(cfg.multistream_switch), combined_model_arch, lab_dim,
               cfg.cmp_dim, cfg.train_file_number, cfg.hyper_params['learning_rate'])
        # Use a context manager so the pickle file is closed deterministically
        # (the original `cPickle.load(open(...))` leaked the handle).
        with open(nnets_file_name, 'rb') as fid:
            nnets_model = cPickle.load(fid)
        print(nnets_file_name)
        print("__________________________")
        nnets_model_list.append(nnets_model)
    assert (len(nnets_model_list) == 2)
    return nnets_model_list
def main_function(cfg, outdir, model_pickle_file=None):
    """Locate the trained DNN model file implied by *cfg* and export it.

    Reconstructs the pickled-model filename from the configuration (the
    run_dnn naming scheme), optionally overridden by *model_pickle_file*,
    then stores the network parameters into *outdir* via store_network().

    :param cfg: Merlin configuration object
    :param outdir: directory to write the exported network to
    :param model_pickle_file: explicit model path; overrides the computed name
    """
    hidden_layer_size = cfg.hyper_params['hidden_layer_size']
    data_dir = cfg.data_dir
    model_dir = os.path.join(cfg.work_dir, 'nnets_model')
    # norm_info_file = os.path.join(data_dir, 'norm_info' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim) + '_' + cfg.output_feature_normalisation + '.dat')

    ### normalise input full context label
    if cfg.label_style == 'HTS':
        label_normaliser = HTSLabelNormalisation(
            question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension + cfg.appended_input_dim
        print('Input label dimension is %d' % lab_dim)
        suffix = str(lab_dim)
    elif cfg.label_style == 'HTS_duration':
        label_normaliser = HTSDurationLabelNormalisation(
            question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension  ## + cfg.appended_input_dim
        print('Input label dimension is %d' % lab_dim)
        suffix = str(lab_dim)
    # no longer supported - use new "composed" style labels instead
    elif cfg.label_style == 'composed':
        # label_normaliser = XMLLabelNormalisation(xpath_file_name=cfg.xpath_file_name)
        # NOTE(review): this branch leaves lab_dim undefined, so the filename
        # construction below would raise NameError — 'composed' appears
        # unsupported by this exporter. Preserved as-is; confirm upstream.
        suffix = 'composed'

    combined_model_arch = str(len(hidden_layer_size))
    for hid_size in hidden_layer_size:
        combined_model_arch += '_' + str(hid_size)

    ## if made with run_lstm:--
    '''
    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.%f.rnn.model' \
        %(model_dir, cfg.combined_model_name, cfg.combined_feature_name,
          int(cfg.multistream_switch), combined_model_arch, lab_dim,
          cfg.cmp_dim, cfg.train_file_number, cfg.hyper_params['learning_rate'])
    '''

    ## if made with run_dnn:--
    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.model' \
        % (model_dir, cfg.model_type, cfg.combined_feature_name,
           int(cfg.multistream_switch), combined_model_arch, lab_dim,
           cfg.cmp_dim, cfg.train_file_number)

    ## override the name computed from config variables if model_pickle_file specified:
    if model_pickle_file is not None:
        nnets_file_name = model_pickle_file

    print('store DNN')
    store_network(nnets_file_name, outdir)
def main_function(cfg, outdir, model_pickle_file=None):
    """Locate the trained RNN model file implied by *cfg* and export it.

    Only HTS-style labels are supported. Reconstructs the pickled-model
    filename from the configuration (the run_merlin / run_lstm naming
    scheme), optionally overridden by *model_pickle_file*, then stores the
    network into *outdir* via store_network().

    :param cfg: Merlin configuration object (cfg.label_style must be 'HTS')
    :param outdir: directory to write the exported network to
    :param model_pickle_file: explicit model path; overrides the computed name
    """
    hidden_layer_size = cfg.hyper_params['hidden_layer_size']
    data_dir = cfg.data_dir
    model_dir = os.path.join(cfg.work_dir, 'nnets_model')
    # norm_info_file = os.path.join(data_dir, 'norm_info' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim) + '_' + cfg.output_feature_normalisation + '.dat')

    ### normalise input full context label
    assert cfg.label_style == 'HTS'  # only standard HTS labels are handled here
    label_normaliser = HTSLabelNormalisation(
        question_file_name=cfg.question_file_name,
        add_frame_features=cfg.add_frame_features,
        subphone_feats=cfg.subphone_feats)
    lab_dim = label_normaliser.dimension + cfg.appended_input_dim
    print('Input label dimension is %d' % lab_dim)
    suffix = str(lab_dim)

    combined_model_arch = str(len(hidden_layer_size))
    for hid_size in hidden_layer_size:
        combined_model_arch += '_' + str(hid_size)

    ## if made with run_lstm:--
    '''
    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.%f.rnn.model' \
        %(model_dir, cfg.combined_model_name, cfg.combined_feature_name,
          int(cfg.multistream_switch), combined_model_arch, lab_dim,
          cfg.cmp_dim, cfg.train_file_number, cfg.hyper_params['learning_rate'])

    ## if made with run_dnn:--
    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.model' \
        %(model_dir, cfg.model_type, cfg.combined_feature_name,
          int(cfg.multistream_switch), combined_model_arch, lab_dim,
          cfg.cmp_dim, cfg.train_file_number)
    '''

    ## if made with run_merlin:--
    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.%f.rnn.model' \
        % (model_dir, cfg.combined_model_name, cfg.combined_feature_name,
           int(cfg.multistream_switch), combined_model_arch, lab_dim,
           cfg.cmp_dim, cfg.train_file_number, cfg.hyper_params['learning_rate'])

    ## override the name computed from config variables if model_pickle_file specified:
    if model_pickle_file is not None:
        nnets_file_name = model_pickle_file

    print('store DNN')
    store_network(nnets_file_name, lab_dim, outdir)
def main_function(cfg):
    """Run the full Merlin acoustic-model pipeline driven by *cfg*.

    Depending on the boolean switches on the configuration this will:
    NORMLAB  - binarise + normalise input linguistic labels,
    MAKECMP  - compose output acoustic (cmp) features,
    NORMCMP  - normalise acoustic features (MVN or MINMAX),
    TRAINDNN - train the DNN,
    DNNGEN   - generate + denormalise parameters from the trained DNN,
    GENWAV   - reconstruct waveforms,
    CALMCD   - compute MCD / BAP / F0 distortion metrics.

    :param cfg: Merlin configuration object
    """
    # get a logger for this main function
    logger = logging.getLogger("main")
    # get another logger to handle plotting duties
    plotlogger = logging.getLogger("plotting")
    # later, we might do this via a handler that is created, attached and configured
    # using the standard config mechanism of the logging module
    # but for now we need to do it manually
    plotlogger.set_plot_path(cfg.plot_dir)

    #### parameter setting########
    hidden_layers_sizes = cfg.hyper_params['hidden_layer_size']

    ####prepare environment
    try:
        file_id_list = read_file_list(cfg.file_id_scp)
        logger.debug('Loaded file id list from %s' % cfg.file_id_scp)
    except IOError:
        # this means that open(...) threw an error
        logger.critical('Could not load file id list from %s' % cfg.file_id_scp)
        raise

    ###total file number including training, development, and testing
    total_file_number = len(file_id_list)

    data_dir = cfg.data_dir

    nn_cmp_dir = os.path.join(data_dir, 'nn' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))
    nn_cmp_nosil_dir = os.path.join(data_dir, 'nn_nosil' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))
    nn_cmp_norm_dir = os.path.join(data_dir, 'nn_norm' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))

    model_dir = os.path.join(cfg.work_dir, 'nnets_model')
    gen_dir = os.path.join(cfg.work_dir, 'gen')

    in_file_list_dict = {}
    for feature_name in list(cfg.in_dir_dict.keys()):
        in_file_list_dict[feature_name] = prepare_file_path_list(file_id_list, cfg.in_dir_dict[feature_name],
                                                                 cfg.file_extension_dict[feature_name], False)

    nn_cmp_file_list = prepare_file_path_list(file_id_list, nn_cmp_dir, cfg.cmp_ext)
    nn_cmp_nosil_file_list = prepare_file_path_list(file_id_list, nn_cmp_nosil_dir, cfg.cmp_ext)
    nn_cmp_norm_file_list = prepare_file_path_list(file_id_list, nn_cmp_norm_dir, cfg.cmp_ext)

    ###normalisation information
    norm_info_file = os.path.join(data_dir, 'norm_info' + cfg.combined_feature_name + '_' + str(
        cfg.cmp_dim) + '_' + cfg.output_feature_normalisation + '.dat')

    ### normalise input full context label
    # currently supporting two different forms of lingustic features
    # later, we should generalise this
    if cfg.label_style == 'HTS':
        label_normaliser = HTSLabelNormalisation(question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension + cfg.appended_input_dim
        logger.info('Input label dimension is %d' % lab_dim)
        suffix = str(lab_dim)
    elif cfg.label_style == 'HTS_duration':
        label_normaliser = HTSDurationLabelNormalisation(question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension  ## + cfg.appended_input_dim
        logger.info('Input label dimension is %d' % lab_dim)
        suffix = str(lab_dim)
    # no longer supported - use new "composed" style labels instead
    elif cfg.label_style == 'composed':
        # label_normaliser = XMLLabelNormalisation(xpath_file_name=cfg.xpath_file_name)
        suffix = 'composed'

    if cfg.process_labels_in_work_dir:
        label_data_dir = cfg.work_dir
    else:
        label_data_dir = data_dir

    # the number can be removed
    binary_label_dir = os.path.join(label_data_dir, 'binary_label_' + suffix)
    nn_label_dir = os.path.join(label_data_dir, 'nn_no_silence_lab_' + suffix)
    nn_label_norm_dir = os.path.join(label_data_dir, 'nn_no_silence_lab_norm_' + suffix)
    # nn_label_norm_mvn_dir = os.path.join(data_dir, 'nn_no_silence_lab_norm_'+suffix)

    in_label_align_file_list = prepare_file_path_list(file_id_list, cfg.in_label_align_dir, cfg.lab_ext, False)
    binary_label_file_list = prepare_file_path_list(file_id_list, binary_label_dir, cfg.lab_ext)
    nn_label_file_list = prepare_file_path_list(file_id_list, nn_label_dir, cfg.lab_ext)
    nn_label_norm_file_list = prepare_file_path_list(file_id_list, nn_label_norm_dir, cfg.lab_ext)

    # to do - sanity check the label dimension here?

    min_max_normaliser = None
    label_norm_file = 'label_norm_%s.dat' % (cfg.label_style)
    label_norm_file = os.path.join(label_data_dir, label_norm_file)
    normaliser = math_statis.Statis(feature_dimension=lab_dim, read_func=file_util.load_binary_file_frame,
                                    writer_func=file_util.array_to_binary_file, min_value=0.01, max_value=0.99)

    if cfg.NORMLAB and (cfg.label_style in ['HTS', 'HTS_duration']):
        # simple HTS labels
        logger.info('preparing label data (input) using standard HTS style labels')
        label_normaliser.perform_normalisation(in_label_align_file_list, binary_label_file_list)
        if cfg.label_style == 'HTS':
            remover = SilenceRemover(n_cmp=lab_dim, silence_pattern=cfg.silence_pattern)
            remover.remove_silence(binary_label_file_list, in_label_align_file_list, nn_label_file_list)
        elif cfg.label_style == 'HTS_duration':
            ## don't remove silences for duration
            nn_label_file_list = binary_label_file_list
        ###use only training data to find min-max information, then apply on the whole dataset
        normaliser.find_min_max_values(nn_label_file_list[0:cfg.train_file_number])
        normaliser.normalise_data(nn_label_file_list, nn_label_norm_file_list)

    if cfg.NORMLAB and (cfg.label_style == 'composed'):
        # new flexible label preprocessor
        logger.info('preparing label data (input) using "composed" style labels')
        label_composer = LabelComposer()
        label_composer.load_label_configuration(cfg.label_config_file)
        logger.info('Loaded label configuration')
        # logger.info('%s' % label_composer.configuration.labels )
        lab_dim = label_composer.compute_label_dimension()
        logger.info('label dimension will be %d' % lab_dim)
        if cfg.precompile_xpaths:
            label_composer.precompile_xpaths()
        # there are now a set of parallel input label files (e.g, one set of HTS and another set of Ossian trees)
        # create all the lists of these, ready to pass to the label composer
        in_label_align_file_list = {}
        for label_style, label_style_required in label_composer.label_styles.items():
            if label_style_required:
                logger.info('labels of style %s are required - constructing file paths for them' % label_style)
                if label_style == 'xpath':
                    in_label_align_file_list['xpath'] = prepare_file_path_list(file_id_list, cfg.xpath_label_align_dir,
                                                                               cfg.utt_ext, False)
                elif label_style == 'hts':
                    in_label_align_file_list['hts'] = prepare_file_path_list(file_id_list, cfg.hts_label_align_dir,
                                                                             cfg.lab_ext, False)
                else:
                    logger.critical('unsupported label style %s specified in label configuration' % label_style)
                    raise Exception

        # now iterate through the files, one at a time, constructing the labels for them
        num_files = len(file_id_list)
        logger.info('the label styles required are %s' % label_composer.label_styles)
        for i in range(num_files):
            logger.info('making input label features for %4d of %4d' % (i + 1, num_files))
            # iterate through the required label styles and open each corresponding label file
            # a dictionary of file descriptors, pointing at the required files
            required_labels = {}
            for label_style, label_style_required in label_composer.label_styles.items():
                # the files will be a parallel set of files for a single utterance
                # e.g., the XML tree and an HTS label file
                if label_style_required:
                    required_labels[label_style] = open(in_label_align_file_list[label_style][i], 'r')
                    logger.debug(' opening label file %s' % in_label_align_file_list[label_style][i])
            logger.debug('label styles with open files: %s' % required_labels)
            label_composer.make_labels(required_labels, out_file_name=binary_label_file_list[i],
                                       fill_missing_values=cfg.fill_missing_values,
                                       iterate_over_frames=cfg.iterate_over_frames)
            # now close all opened files
            for fd in required_labels.values():
                fd.close()

        # silence removal
        if cfg.remove_silence_using_binary_labels:
            silence_feature = 0  ## use first feature in label -- hardcoded for now
            logger.info('Silence removal from label using silence feature: %s' % (
                label_composer.configuration.labels[silence_feature]))
            logger.info('Silence will be removed from CMP files in same way')
            ## Binary labels have 2 roles: both the thing trimmed and the instructions for trimming:
            trim_silence(binary_label_file_list, nn_label_file_list, lab_dim, \
                         binary_label_file_list, lab_dim, silence_feature, percent_to_keep=5)
        else:
            logger.info('No silence removal done')
            # start from the labels we have just produced, not trimmed versions
            nn_label_file_list = binary_label_file_list

        ###use only training data to find min-max information, then apply on the whole dataset
        normaliser.find_min_max_values(nn_label_file_list[0:cfg.train_file_number])
        normaliser.normalise_data(nn_label_file_list, nn_label_norm_file_list)

    if normaliser is not None:
        ### save label normalisation information for unseen testing labels
        label_min_vector = normaliser.min_vector
        label_max_vector = normaliser.max_vector
        label_norm_info = numpy.concatenate((label_min_vector, label_max_vector), axis=0)
        label_norm_info = numpy.array(label_norm_info, 'float32')
        fid = open(label_norm_file, 'wb')
        label_norm_info.tofile(fid)
        fid.close()
        logger.info('saved %s vectors to %s' % (label_min_vector.size, label_norm_file))

    ### make output acoustic data
    if cfg.MAKECMP:
        logger.info('creating acoustic (output) features')
        delta_win = cfg.delta_win  # [-0.5, 0.0, 0.5]
        acc_win = cfg.acc_win  # [1.0, -2.0, 1.0]
        acoustic_worker = AcousticComposition(delta_win=delta_win, acc_win=acc_win)
        acoustic_worker.prepare_nn_data(in_file_list_dict, nn_cmp_file_list,
                                        cfg.in_dimension_dict, cfg.out_dimension_dict)

        if cfg.label_style == 'HTS':
            if cfg.remove_silence_using_binary_labels:
                ## do this to get lab_dim:
                label_composer = LabelComposer()
                label_composer.load_label_configuration(cfg.label_config_file)
                lab_dim = label_composer.compute_label_dimension()
                silence_feature = 0  ## use first feature in label -- hardcoded for now
                logger.info('Silence removal from CMP using binary label file')
                ## overwrite the untrimmed audio with the trimmed version:
                trim_silence(nn_cmp_file_list, nn_cmp_nosil_file_list, cfg.cmp_dim,
                             binary_label_file_list, lab_dim, silence_feature)
            else:
                ## back off to previous method using HTS labels:
                remover = SilenceRemover(n_cmp=cfg.cmp_dim, silence_pattern=cfg.silence_pattern)
                remover.remove_silence(nn_cmp_file_list[0:cfg.train_file_number + cfg.valid_file_number],
                                       in_label_align_file_list[0:cfg.train_file_number + cfg.valid_file_number],
                                       nn_cmp_nosil_file_list[
                                           0:cfg.train_file_number + cfg.valid_file_number])  # save to itself
        elif cfg.label_style == 'HTS_duration':
            ## don't remove silences for duration
            nn_cmp_nosil_file_list = nn_cmp_file_list
            pass

    ### save acoustic normalisation information for normalising the features back
    var_dir = os.path.join(data_dir, 'var')
    if not os.path.exists(var_dir):
        os.makedirs(var_dir)

    var_file_dict = {}
    for feature_name in list(cfg.out_dimension_dict.keys()):
        var_file_dict[feature_name] = os.path.join(var_dir,
                                                   feature_name + '_' + str(cfg.out_dimension_dict[feature_name]))

    ### normalise output acoustic data
    if cfg.NORMCMP:
        logger.info('normalising acoustic (output) features using method %s' % cfg.output_feature_normalisation)
        cmp_norm_info = None
        normaliser = math_statis.Statis(feature_dimension=cfg.cmp_dim, read_func=file_util.load_binary_file_frame,
                                        writer_func=file_util.array_to_binary_file)
        if cfg.output_feature_normalisation == 'MVN':
            ###calculate mean and std vectors on the training data, and apply on the whole dataset
            global_mean_vector = normaliser.compute_mean(nn_cmp_nosil_file_list[0:cfg.train_file_number], 0, cfg.cmp_dim)
            global_std_vector = normaliser.compute_std(nn_cmp_nosil_file_list[0:cfg.train_file_number],
                                                       global_mean_vector, 0, cfg.cmp_dim)
            normaliser.feature_normalisation(nn_cmp_nosil_file_list[0:cfg.train_file_number + cfg.valid_file_number],
                                             nn_cmp_norm_file_list[0:cfg.train_file_number + cfg.valid_file_number])
            cmp_norm_info = numpy.concatenate((global_mean_vector, global_std_vector), axis=0)
        elif cfg.output_feature_normalisation == 'MINMAX':
            global_mean_vector = normaliser.compute_mean(nn_cmp_nosil_file_list[0:cfg.train_file_number])
            global_std_vector = normaliser.compute_std(nn_cmp_nosil_file_list[0:cfg.train_file_number],
                                                       global_mean_vector)
            normaliser = math_statis.Statis(feature_dimension=cfg.cmp_dim, read_func=file_util.load_binary_file_frame,
                                            writer_func=file_util.array_to_binary_file, min_value=0.01, max_value=0.99)
            # min_max_normaliser = MinMaxNormalisation(feature_dimension=cfg.cmp_dim, min_value=0.01, max_value=0.99)
            normaliser.find_min_max_values(nn_cmp_nosil_file_list[0:cfg.train_file_number])
            normaliser.normalise_data(nn_cmp_nosil_file_list, nn_cmp_norm_file_list)
            cmp_min_vector = normaliser.min_vector
            cmp_max_vector = normaliser.max_vector
            cmp_norm_info = numpy.concatenate((cmp_min_vector, cmp_max_vector), axis=0)
        else:
            logger.critical('Normalisation type %s is not supported!\n' % (cfg.output_feature_normalisation))
            # NOTE(review): bare `raise` with no active exception — errors out either way
            raise

        cmp_norm_info = numpy.array(cmp_norm_info, 'float32')
        fid = open(norm_info_file, 'wb')
        cmp_norm_info.tofile(fid)
        fid.close()
        logger.info('saved %s vectors to %s' % (cfg.output_feature_normalisation, norm_info_file))
        # logger.debug(' value was\n%s' % cmp_norm_info)

        feature_index = 0
        for feature_name in list(cfg.out_dimension_dict.keys()):
            feature_std_vector = numpy.array(
                global_std_vector[:, feature_index:feature_index + cfg.out_dimension_dict[feature_name]], 'float32')
            # 'wb' (was 'w'): ndarray.tofile writes raw bytes and needs a binary-mode file
            fid = open(var_file_dict[feature_name], 'wb')
            feature_std_vector.tofile(fid)
            fid.close()
            logger.info('saved %s variance vector to %s' % (feature_name, var_file_dict[feature_name]))
            # logger.debug(' value was\n%s' % feature_std_vector)
            feature_index += cfg.out_dimension_dict[feature_name]

    train_x_file_list = nn_label_norm_file_list[0:cfg.train_file_number]
    train_y_file_list = nn_cmp_norm_file_list[0:cfg.train_file_number]
    valid_x_file_list = nn_label_norm_file_list[cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number]
    valid_y_file_list = nn_cmp_norm_file_list[cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number]
    test_x_file_list = nn_label_norm_file_list[
        cfg.train_file_number + cfg.valid_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]
    test_y_file_list = nn_cmp_norm_file_list[
        cfg.train_file_number + cfg.valid_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]

    # we need to know the label dimension before training the DNN
    # computing that requires us to look at the labels
    #
    # currently, there are two ways to do this
    if cfg.label_style == 'HTS':
        label_normaliser = HTSLabelNormalisation(question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension + cfg.appended_input_dim
    elif cfg.label_style == 'composed':
        label_composer = LabelComposer()
        label_composer.load_label_configuration(cfg.label_config_file)
        lab_dim = label_composer.compute_label_dimension()

    logger.info('label dimension is %d' % lab_dim)

    combined_model_arch = str(len(hidden_layers_sizes))
    for hid_size in hidden_layers_sizes:
        combined_model_arch += '_' + str(hid_size)

    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.model' \
                      % (model_dir, cfg.model_type, cfg.combined_feature_name, int(cfg.multistream_switch),
                         combined_model_arch, lab_dim, cfg.cmp_dim, cfg.train_file_number)

    ### DNN model training
    if cfg.TRAINDNN:
        logger.info('training DNN')
        try:
            os.makedirs(model_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                # not an error - just means directory already exists
                pass
            else:
                logger.critical('Failed to create model directory %s' % model_dir)
                logger.critical(' OS error was: %s' % e.strerror)
                raise
        try:
            # print 'start DNN'
            train_DNN(train_xy_file_list=(train_x_file_list, train_y_file_list), \
                      valid_xy_file_list=(valid_x_file_list, valid_y_file_list), \
                      nnets_file_name=nnets_file_name, \
                      n_ins=lab_dim, n_outs=cfg.cmp_dim, ms_outs=cfg.multistream_outs, \
                      hyper_params=cfg.hyper_params, buffer_size=cfg.buffer_size, plot=cfg.plot)
        except KeyboardInterrupt:
            logger.critical('train_DNN interrupted via keyboard')
            # Could 'raise' the exception further, but that causes a deep traceback to be printed
            # which we don't care about for a keyboard interrupt. So, just bail out immediately
            sys.exit(1)
        except:
            logger.critical('train_DNN threw an exception')
            raise

    ### generate parameters from DNN
    temp_dir_name = '%s_%s_%d_%d_%d_%d_%d_%d' \
                    % (cfg.model_type, cfg.combined_feature_name, int(cfg.do_post_filtering), \
                       cfg.train_file_number, lab_dim, cfg.cmp_dim, \
                       len(hidden_layers_sizes), hidden_layers_sizes[0])
    gen_dir = os.path.join(gen_dir, temp_dir_name)

    gen_file_id_list = file_id_list[
        cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]
    test_x_file_list = nn_label_norm_file_list[
        cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]

    if cfg.DNNGEN:
        logger.info('generating from DNN')
        try:
            os.makedirs(gen_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                # not an error - just means directory already exists
                pass
            else:
                logger.critical('Failed to create generation directory %s' % gen_dir)
                logger.critical(' OS error was: %s' % e.strerror)
                raise

        gen_file_list = prepare_file_path_list(gen_file_id_list, gen_dir, cfg.cmp_ext)
        # dnn_generation(valid_x_file_list, nnets_file_name, lab_dim, cfg.cmp_dim, gen_file_list)
        dnn_generation(test_x_file_list, nnets_file_name, lab_dim, cfg.cmp_dim, gen_file_list)

        logger.debug('denormalising generated output using method %s' % cfg.output_feature_normalisation)

        fid = open(norm_info_file, 'rb')
        cmp_min_max = numpy.fromfile(fid, dtype=numpy.float32)
        fid.close()
        cmp_min_max = cmp_min_max.reshape((2, -1))
        cmp_min_vector = cmp_min_max[0,]
        cmp_max_vector = cmp_min_max[1,]
        denormaliser = math_statis.Statis(feature_dimension=cfg.cmp_dim, read_func=file_util.load_binary_file_frame,
                                          writer_func=file_util.array_to_binary_file)
        # fixed: the original was missing the ':' on this if, a syntax error
        if cfg.output_feature_normalisation == 'MVN':
            denormaliser.feature_denormalisation(gen_file_list, gen_file_list, cmp_min_vector, cmp_max_vector)
        elif cfg.output_feature_normalisation == 'MINMAX':
            denormaliser = math_statis.Statis(feature_dimension=cfg.cmp_dim,
                                              read_func=file_util.load_binary_file_frame,
                                              writer_func=file_util.array_to_binary_file,
                                              min_value=0.01, max_value=0.99,
                                              min_vector=cmp_min_vector, max_vector=cmp_max_vector)
            denormaliser.denormalise_data(gen_file_list, gen_file_list)
        else:
            logger.critical('denormalising method %s is not supported!\n' % (cfg.output_feature_normalisation))
            # NOTE(review): bare `raise` with no active exception — errors out either way
            raise

        ##perform MLPG to smooth parameter trajectory
        ## lf0 is included, the output features much have vuv.
        generator = ParameterGeneration(gen_wav_features=cfg.gen_wav_features)
        generator.acoustic_decomposition(gen_file_list, cfg.cmp_dim, cfg.out_dimension_dict,
                                         cfg.file_extension_dict, var_file_dict)

    ### generate wav
    if cfg.GENWAV:
        logger.info('reconstructing waveform(s)')
        generate_wav(gen_dir, gen_file_id_list, cfg)  # generated speech
        # generate_wav(nn_cmp_dir, gen_file_id_list)  # reference copy synthesis speech

    ### evaluation: calculate distortion
    if cfg.CALMCD:
        logger.info('calculating MCD')

        ref_data_dir = os.path.join(data_dir, 'ref_data')

        ref_mgc_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.mgc_ext)
        ref_bap_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.bap_ext)
        ref_lf0_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.lf0_ext)

        in_gen_label_align_file_list = in_label_align_file_list[
            cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]
        calculator = IndividualDistortionComp()

        spectral_distortion = 0.0
        bap_mse = 0.0
        f0_mse = 0.0
        vuv_error = 0.0

        valid_file_id_list = file_id_list[cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number]
        test_file_id_list = file_id_list[
            cfg.train_file_number + cfg.valid_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]

        if cfg.remove_silence_using_binary_labels:
            ## get lab_dim:
            label_composer = LabelComposer()
            label_composer.load_label_configuration(cfg.label_config_file)
            lab_dim = label_composer.compute_label_dimension()

            ## use first feature in label -- hardcoded for now
            silence_feature = 0

            ## Use these to trim silence:
            untrimmed_test_labels = binary_label_file_list[
                cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]

        if 'mgc' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['mgc'][
                    cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_mgc_list, cfg.mgc_dim, \
                             untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp=cfg.mgc_dim, silence_pattern=cfg.silence_pattern)
                remover.remove_silence(in_file_list_dict['mgc'][
                                           cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number],
                                       in_gen_label_align_file_list, ref_mgc_list)
            valid_spectral_distortion = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir,
                                                                      cfg.mgc_ext, cfg.mgc_dim)
            test_spectral_distortion = calculator.compute_distortion(test_file_id_list, ref_data_dir, gen_dir,
                                                                     cfg.mgc_ext, cfg.mgc_dim)
            valid_spectral_distortion *= (10 / numpy.log(10)) * numpy.sqrt(2.0)  ##MCD
            test_spectral_distortion *= (10 / numpy.log(10)) * numpy.sqrt(2.0)  ##MCD

        if 'bap' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['bap'][
                    cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_bap_list, cfg.bap_dim, \
                             untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp=cfg.bap_dim, silence_pattern=cfg.silence_pattern)
                remover.remove_silence(in_file_list_dict['bap'][
                                           cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number],
                                       in_gen_label_align_file_list, ref_bap_list)
            valid_bap_mse = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir,
                                                          cfg.bap_ext, cfg.bap_dim)
            test_bap_mse = calculator.compute_distortion(test_file_id_list, ref_data_dir, gen_dir,
                                                         cfg.bap_ext, cfg.bap_dim)
            valid_bap_mse = valid_bap_mse / 10.0
            test_bap_mse = test_bap_mse / 10.0

        if 'lf0' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['lf0'][
                    cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_lf0_list, cfg.lf0_dim, \
                             untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp=cfg.lf0_dim, silence_pattern=['*-#+*'])
                remover.remove_silence(in_file_list_dict['lf0'][
                                           cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number + cfg.test_file_number],
                                       in_gen_label_align_file_list, ref_lf0_list)
            valid_f0_mse, valid_f0_corr, valid_vuv_error = calculator.compute_distortion(valid_file_id_list,
                                                                                         ref_data_dir, gen_dir,
                                                                                         cfg.lf0_ext, cfg.lf0_dim)
            test_f0_mse, test_f0_corr, test_vuv_error = calculator.compute_distortion(test_file_id_list,
                                                                                      ref_data_dir, gen_dir,
                                                                                      cfg.lf0_ext, cfg.lf0_dim)

        logger.info('Develop: DNN -- MCD: %.3f dB; BAP: %.3f dB; F0:- RMSE: %.3f Hz; CORR: %.3f; VUV: %.3f%%' \
                    % (valid_spectral_distortion, valid_bap_mse, valid_f0_mse, valid_f0_corr, valid_vuv_error * 100.))
        logger.info('Test   : DNN -- MCD: %.3f dB; BAP: %.3f dB; F0:- RMSE: %.3f Hz; CORR: %.3f; VUV: %.3f%%' \
                    % (test_spectral_distortion, test_bap_mse, test_f0_mse, test_f0_corr, test_vuv_error * 100.))
class NNAcousticModel(NN): ## add speech specific stuff, like splitting into streams and param gen def __init__( self, model_dir, question_file_name, silence_pattern='/2:sil/' ): ## TODO: where to handle silence pattern? Currently fragile super(NNAcousticModel, self).__init__(model_dir) self.load_stream_info() self.label_expander = HTSLabelNormalisation( question_file_name=question_file_name) self.param_generator = MLParameterGenerationFast( ) # ParameterGeneration() self.silent_feature_indices = self.get_silent_feature_indices( question_file_name, silence_pattern) std = self.output_std m = numpy.shape(std) std = std.reshape((1, self.outdim)) self.stream_std = self.split_into_streams(std) def get_silent_feature_indices(self, question_file_name, silence_pattern): print 'get_silent_feature_indices' indices = [] questions = [q for q in readlist(question_file_name) if q != ''] questions = [q for q in questions if 'CQS' not in q] for (i, question) in enumerate(questions): if silence_pattern in question: indices.append(i) print 'silence question found:' print question return indices def load_stream_info(self): stream_info_fname = os.path.join(self.model_dir, 'stream_info.txt') assert os.path.isfile(stream_info_fname) stream_data = readlist(stream_info_fname) stream_data = [line.split(' ') for line in stream_data] assert len(stream_data) == 4 (self.instreams, indims, self.outstreams, outdims) = stream_data indims = [int(val) for val in indims] outdims = [int(val) for val in outdims] ## note that indims are not network input, but input to acoustic preprocessing of data! 
assert self.outdim == sum(outdims) self.indims = dict(zip(self.instreams, indims)) self.outdims = dict(zip(self.outstreams, outdims)) ## FOR DEBUGGING:- def generate_from_norm_binary_lab(self, bin_label_file, labdim, outwave, enforce_silence=False, mlpg=True, vuv_thresh=0.5, fzero_scale=1.0): input = get_speech(bin_label_file, labdim) #input = input[:500,:] output = self.predict(input, input_normalisation=True) put_speech( output, '/afs/inf.ed.ac.uk/user/o/owatts/temp/cpu_gen/undenorm_66_015_from_norm_lab.cmp' ) sys.exit('vliadnviadnvdvn stoped early') streams = self.split_into_streams(output) if mlpg: mlpged = {} for (stream, data) in streams.items(): if stream in self.indims: mlpg_data = self.param_generator.generation( data, self.stream_std[stream], self.indims[stream]) else: mlpg_data = data mlpged[stream] = mlpg_data streams = mlpged else: # take statics only! statics = {} for (stream, data) in streams.items(): if stream in self.indims: statics[stream] = data[:, :self.indims[stream]] else: ## for e.g. 
vuv statics[stream] = data streams = statics if enforce_silence: for (stream, data) in streams.items(): print input[:, self.silent_feature_indices] sys.exit('ntfbdfbsfrbsfbr') silent_frames = numpy.sum(input[:, self.silent_feature_indices], axis=1) data[silent_frames == 1.0, :] = 0.0 streams[stream] = data if 'lf0' in streams: fzero = numpy.exp(streams['lf0']) if 'vuv' in streams: vuv = streams['vuv'] lf0 = streams['lf0'] fzero[vuv <= vuv_thresh] = 0.0 fzero *= fzero_scale streams['lf0'] = fzero self.world_resynth(streams, outwave) def generate(self, htk_label_file, enforce_silence=True, mlpg=True, fill_unvoiced_gaps=0, \ variance_expansion=1.0, vuv_thresh=0.5, fzero_scale=1.0): input = self.label_expander.load_labels_with_state_alignment( htk_label_file) output = self.predict(input) streams = self.split_into_streams(output) if mlpg: mlpged = {} for (stream, data) in streams.items(): if stream in self.indims: mlpg_data = self.param_generator.generation( data, self.stream_std[stream], self.indims[stream]) else: mlpg_data = data mlpged[stream] = mlpg_data streams = mlpged else: # take statics only! statics = {} for (stream, data) in streams.items(): if stream in self.indims: statics[stream] = data[:, :self.indims[stream]] else: ## for e.g. 
vuv statics[stream] = data streams = statics ## TODO: handle F0 separately if variance_expansion > 0.0: new_streams = {} for (stream, data) in streams.items(): new_streams[stream] = self.simple_scale_variance_wrapper_p0( streams[stream], stream) streams = new_streams # impose 0 ceiling on baps, else we get artifacts:- # (I think this was the problem I was trying to fix by not scaling f0 and energy previously) streams['bap'] = np.minimum(streams['bap'], np.zeros(np.shape(streams['bap']))) # if fill_unvoiced_gaps > 0: # vuv = streams['vuv'] # ## turn from soft to binary: # binary_vuv = np.zeros(np.shape(vuv)) # binary_vuv[vuv > vuv_thresh] = 1 # vuv = binary_vuv # gaplength = fill_unvoiced_gaps # vuv = fill_short_unvoiced_gaps(vuv, gaplength) # print vuv # streams['vuv'] = vuv # if enforce_silence: for (stream, data) in streams.items(): silent_frames = numpy.sum(input[:, self.silent_feature_indices], axis=1) data[silent_frames == 1.0, :] = 0.0 streams[stream] = data if 'lf0' in streams: fzero = numpy.exp(streams['lf0']) if 'vuv' in streams: vuv = streams['vuv'] lf0 = streams['lf0'] fzero[vuv <= vuv_thresh] = 0.0 fzero *= fzero_scale streams['lf0'] = fzero #self.world_resynth(streams, outwave) return streams def split_into_streams(self, input): nframe, ndim = numpy.shape(input) assert ndim == self.outdim, (ndim, self.outdim) start = 0 outputs = {} for stream in self.outstreams: end = start + self.outdims[stream] print stream outputs[stream] = input[:, start:end] start = end return outputs #def enforce_silence(streams): # def expand_label(): def simple_scale_variance_wrapper_0(self, speech, stream): return speech def simple_scale_variance_wrapper_p0(self, speech, stream): if stream == 'mgc': cep_speech = self.simple_scale_variance(speech, stream, gv_weight=1.0) ene_speech = self.simple_scale_variance(speech, stream, gv_weight=0.0) scaled_speech = np.hstack([ene_speech[:, :1], cep_speech[:, 1:]]) else: scaled_speech = speech return scaled_speech def 
simple_scale_variance_wrapper_p2(self, speech, stream): if stream == 'mgc': cep_speech = self.simple_scale_variance(speech, stream, gv_weight=1.0) ene_speech = self.simple_scale_variance(speech, stream, gv_weight=0.2) scaled_speech = np.hstack([ene_speech[:, :1], cep_speech[:, 1:]]) if stream == 'lf0': scaled_speech = self.simple_scale_variance(speech, stream, gv_weight=0.2) else: scaled_speech = speech return scaled_speech def simple_scale_variance_wrapper_p5(self, speech, stream): if stream == 'mgc': cep_speech = self.simple_scale_variance(speech, stream, gv_weight=1.0) ene_speech = self.simple_scale_variance(speech, stream, gv_weight=0.5) scaled_speech = np.hstack([ene_speech[:, :1], cep_speech[:, 1:]]) if stream == 'lf0': scaled_speech = self.simple_scale_variance(speech, stream, gv_weight=0.5) else: scaled_speech = speech return scaled_speech def simple_scale_variance_wrapper_1(self, speech, stream): if stream == 'mgc': cep_speech = self.simple_scale_variance(speech, stream, gv_weight=1.0) ene_speech = self.simple_scale_variance(speech, stream, gv_weight=1.0) scaled_speech = np.hstack([ene_speech[:, :1], cep_speech[:, 1:]]) if stream == 'lf0': scaled_speech = self.simple_scale_variance(speech, stream, gv_weight=1.0) else: scaled_speech = speech return scaled_speech def simple_scale_variance_wrapper_m2(self, speech, stream): if stream == 'mgc': cep_speech = self.simple_scale_variance(speech, stream, gv_weight=1.0) ene_speech = self.simple_scale_variance(speech, stream, gv_weight=0.0) scaled_speech = np.hstack([ene_speech[:, :1], cep_speech[:, 1:]]) if stream == 'lf0': scaled_speech = self.simple_scale_variance(speech, stream, gv_weight=0.2) if stream == 'bap': scaled_speech = self.simple_scale_variance(speech, stream, gv_weight=1.0) else: scaled_speech = speech return scaled_speech def simple_scale_variance_wrapper_n2(self, speech, stream): if stream == 'mgc': cep_speech = self.simple_scale_variance(speech, stream, gv_weight=1.0) ene_speech = 
self.simple_scale_variance(speech, stream, gv_weight=0.2) scaled_speech = np.hstack([ene_speech[:, :1], cep_speech[:, 1:]]) else: scaled_speech = speech return scaled_speech def simple_scale_variance_wrapper_nfull(self, speech, stream): if stream == 'mgc': scaled_speech = self.simple_scale_variance(speech, stream, gv_weight=1.0) else: scaled_speech = speech return scaled_speech def simple_scale_variance(self, speech, stream, gv_weight=1.0): stream_std = self.stream_std[stream][0, :] static_std = stream_std[:self.indims[stream]] assert gv_weight <= 1.0 and gv_weight >= 0.0 local_weight = 1.0 - gv_weight utt_mean = numpy.mean(speech, axis=0) utt_std = numpy.std(speech, axis=0) global_std = numpy.transpose(static_std) weighted_global_std = (gv_weight * global_std) + (local_weight * utt_std) std_ratio = weighted_global_std / utt_std nframes, ndim = numpy.shape(speech) utt_mean_matrix = numpy.tile(utt_mean, (nframes, 1)) std_ratio_matrix = numpy.tile(std_ratio, (nframes, 1)) scaled_speech = ( (speech - utt_mean_matrix) * std_ratio_matrix) + utt_mean_matrix return scaled_speech
def main_function(cfg):
    """Top-level Merlin pipeline driver.

    Runs the stages selected by boolean flags on `cfg`, in order:
    NORMLAB (binarise + normalise input labels), MAKEDUR (duration targets),
    MAKECMP (compose acoustic targets), NORMCMP (normalise acoustic targets),
    TRAINDNN (train the network), GENBNFEA (bottleneck features), DNNGEN
    (forward pass + denormalise + parameter generation), GENWAV (vocoder
    resynthesis), CALMCD (objective evaluation).
    """
    file_paths = FilePaths(cfg)

    # get a logger for this main function
    logger = logging.getLogger("main")

    # get another logger to handle plotting duties
    plotlogger = logging.getLogger("plotting")

    # later, we might do this via a handler that is created, attached and configured
    # using the standard config mechanism of the logging module
    # but for now we need to do it manually
    plotlogger.set_plot_path(cfg.plot_dir)

    # create plot dir if set to True
    if not os.path.exists(cfg.plot_dir) and cfg.plot:
        os.makedirs(cfg.plot_dir)

    #### parameter setting########
    hidden_layer_size = cfg.hyper_params['hidden_layer_size']

    ####prepare environment
    try:
        file_id_list = read_file_list(cfg.file_id_scp)
        logger.debug('Loaded file id list from %s' % cfg.file_id_scp)
    except IOError:
        # this means that open(...) threw an error
        logger.critical('Could not load file id list from %s' % cfg.file_id_scp)
        raise

    ###total file number including training, development, and testing
    total_file_number = len(file_id_list)
    assert cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number == total_file_number, 'check train, valid, test file number'

    data_dir = cfg.data_dir
    inter_data_dir = cfg.inter_data_dir
    nn_cmp_dir = file_paths.nn_cmp_dir
    nn_cmp_norm_dir = file_paths.nn_cmp_norm_dir
    model_dir = file_paths.model_dir
    gen_dir = file_paths.gen_dir

    # one path list per input acoustic feature (mgc/lf0/bap/...), in corpus order
    in_file_list_dict = {}
    for feature_name in list(cfg.in_dir_dict.keys()):
        in_file_list_dict[feature_name] = prepare_file_path_list(file_id_list, cfg.in_dir_dict[feature_name], cfg.file_extension_dict[feature_name], False)

    nn_cmp_file_list = file_paths.get_nn_cmp_file_list()
    nn_cmp_norm_file_list = file_paths.get_nn_cmp_norm_file_list()

    ###normalisation information
    norm_info_file = file_paths.norm_info_file

    ### normalise input full context label
    # currently supporting two different forms of lingustic features
    # later, we should generalise this
    assert cfg.label_style == 'HTS', 'Only HTS-style labels are now supported as input to Merlin'

    label_normaliser = HTSLabelNormalisation(question_file_name=cfg.question_file_name, add_frame_features=cfg.add_frame_features, subphone_feats=cfg.subphone_feats)
    add_feat_dim = sum(cfg.additional_features.values())
    lab_dim = label_normaliser.dimension + add_feat_dim + cfg.appended_input_dim
    if cfg.VoiceConversion:
        # voice conversion maps acoustics to acoustics, so input dim == cmp dim
        lab_dim = cfg.cmp_dim
    logger.info('Input label dimension is %d' % lab_dim)
    suffix=str(lab_dim)

    if cfg.process_labels_in_work_dir:
        inter_data_dir = cfg.work_dir

    # the number can be removed
    file_paths.set_label_dir(label_normaliser.dimension, suffix, lab_dim)
    file_paths.set_label_file_list()

    binary_label_dir = file_paths.binary_label_dir
    nn_label_dir = file_paths.nn_label_dir
    nn_label_norm_dir = file_paths.nn_label_norm_dir

    in_label_align_file_list = file_paths.in_label_align_file_list
    binary_label_file_list = file_paths.binary_label_file_list
    nn_label_file_list = file_paths.nn_label_file_list
    nn_label_norm_file_list = file_paths.nn_label_norm_file_list

    min_max_normaliser = None
    label_norm_file = file_paths.label_norm_file

    test_id_list = file_paths.test_id_list

    if cfg.NORMLAB:
        # simple HTS labels
        logger.info('preparing label data (input) using standard HTS style labels')
        label_normaliser.perform_normalisation(in_label_align_file_list, binary_label_file_list, label_type=cfg.label_type)

        if cfg.additional_features:
            # append extra per-file features onto the binary labels, one at a time
            out_feat_file_list = file_paths.out_feat_file_list
            in_dim = label_normaliser.dimension
            for new_feature, new_feature_dim in cfg.additional_features.items():
                new_feat_dir = os.path.join(data_dir, new_feature)
                new_feat_file_list = prepare_file_path_list(file_id_list, new_feat_dir, '.'+new_feature)
                merger = MergeFeat(lab_dim = in_dim, feat_dim = new_feature_dim)
                merger.merge_data(binary_label_file_list, new_feat_file_list, out_feat_file_list)
                in_dim += new_feature_dim
                binary_label_file_list = out_feat_file_list

        remover = SilenceRemover(n_cmp = lab_dim, silence_pattern = cfg.silence_pattern, label_type=cfg.label_type, remove_frame_features = cfg.add_frame_features, subphone_feats = cfg.subphone_feats)
        remover.remove_silence(binary_label_file_list, in_label_align_file_list, nn_label_file_list)

        min_max_normaliser = MinMaxNormalisation(feature_dimension = lab_dim, min_value = 0.01, max_value = 0.99)

        ###use only training data to find min-max information, then apply on the whole dataset
        if cfg.GenTestList:
            min_max_normaliser.load_min_max_values(label_norm_file)
        else:
            min_max_normaliser.find_min_max_values(nn_label_file_list[0:cfg.train_file_number])

        ### enforce silence such that the normalization runs without removing silence: only for final synthesis
        if cfg.GenTestList and cfg.enforce_silence:
            min_max_normaliser.normalise_data(binary_label_file_list, nn_label_norm_file_list)
        else:
            min_max_normaliser.normalise_data(nn_label_file_list, nn_label_norm_file_list)

    if min_max_normaliser != None and not cfg.GenTestList:
        ### save label normalisation information for unseen testing labels
        label_min_vector = min_max_normaliser.min_vector
        label_max_vector = min_max_normaliser.max_vector
        label_norm_info = numpy.concatenate((label_min_vector, label_max_vector), axis=0)
        label_norm_info = numpy.array(label_norm_info, 'float32')
        fid = open(label_norm_file, 'wb')
        label_norm_info.tofile(fid)
        fid.close()
        logger.info('saved %s vectors to %s' %(label_min_vector.size, label_norm_file))

    ### make output duration data
    if cfg.MAKEDUR:
        logger.info('creating duration (output) features')
        label_normaliser.prepare_dur_data(in_label_align_file_list, file_paths.dur_file_list, cfg.label_type, cfg.dur_feature_type)

    ### make output acoustic data
    if cfg.MAKECMP:
        logger.info('creating acoustic (output) features')
        delta_win = cfg.delta_win #[-0.5, 0.0, 0.5]
        acc_win = cfg.acc_win #[1.0, -2.0, 1.0]

        if cfg.GenTestList:
            # at synthesis time operate only on the test sentences
            for feature_name in list(cfg.in_dir_dict.keys()):
                in_file_list_dict[feature_name] = prepare_file_path_list(test_id_list, cfg.in_dir_dict[feature_name], cfg.file_extension_dict[feature_name], False)
            nn_cmp_file_list = prepare_file_path_list(test_id_list, nn_cmp_dir, cfg.cmp_ext)
            nn_cmp_norm_file_list = prepare_file_path_list(test_id_list, nn_cmp_norm_dir, cfg.cmp_ext)

        acoustic_worker = AcousticComposition(delta_win = delta_win, acc_win = acc_win)

        if 'dur' in list(cfg.in_dir_dict.keys()) and cfg.AcousticModel:
            lf0_file_list = file_paths.get_lf0_file_list()
            # NOTE(review): `dur_file_list` is not defined in this function
            # (file_paths.dur_file_list exists) -- this branch would raise
            # NameError if taken; confirm and fix upstream.
            acoustic_worker.make_equal_frames(dur_file_list, lf0_file_list, cfg.in_dimension_dict)
        acoustic_worker.prepare_nn_data(in_file_list_dict, nn_cmp_file_list, cfg.in_dimension_dict, cfg.out_dimension_dict)

        if cfg.remove_silence_using_binary_labels:
            ## do this to get lab_dim:
            label_composer = LabelComposer()
            label_composer.load_label_configuration(cfg.label_config_file)
            lab_dim=label_composer.compute_label_dimension()

            silence_feature = 0 ## use first feature in label -- hardcoded for now
            logger.info('Silence removal from CMP using binary label file')

            ## overwrite the untrimmed audio with the trimmed version:
            trim_silence(nn_cmp_file_list, nn_cmp_file_list, cfg.cmp_dim, binary_label_file_list, lab_dim, silence_feature)
        elif cfg.remove_silence_using_hts_labels:
            ## back off to previous method using HTS labels:
            remover = SilenceRemover(n_cmp = cfg.cmp_dim, silence_pattern = cfg.silence_pattern, label_type=cfg.label_type, remove_frame_features = cfg.add_frame_features, subphone_feats = cfg.subphone_feats)
            remover.remove_silence(nn_cmp_file_list, in_label_align_file_list, nn_cmp_file_list) # save to itself

    ### save acoustic normalisation information for normalising the features back
    var_dir = file_paths.var_dir
    var_file_dict = file_paths.get_var_dic()

    ### normalise output acoustic data
    if cfg.NORMCMP:
        logger.info('normalising acoustic (output) features using method %s' % cfg.output_feature_normalisation)
        cmp_norm_info = None
        if cfg.output_feature_normalisation == 'MVN':
            normaliser = MeanVarianceNorm(feature_dimension=cfg.cmp_dim)
            if cfg.GenTestList:
                # load mean std values
                global_mean_vector, global_std_vector = normaliser.load_mean_std_values(norm_info_file)
            else:
                ###calculate mean and std vectors on the training data, and apply on the whole dataset
                global_mean_vector = normaliser.compute_mean(nn_cmp_file_list[0:cfg.train_file_number], 0, cfg.cmp_dim)
                global_std_vector = normaliser.compute_std(nn_cmp_file_list[0:cfg.train_file_number], global_mean_vector, 0, cfg.cmp_dim)
            normaliser.feature_normalisation(nn_cmp_file_list, nn_cmp_norm_file_list)
            cmp_norm_info = numpy.concatenate((global_mean_vector, global_std_vector), axis=0)

        elif cfg.output_feature_normalisation == 'MINMAX':
            min_max_normaliser = MinMaxNormalisation(feature_dimension = cfg.cmp_dim, min_value = 0.01, max_value = 0.99)
            if cfg.GenTestList:
                min_max_normaliser.load_min_max_values(norm_info_file)
            else:
                min_max_normaliser.find_min_max_values(nn_cmp_file_list[0:cfg.train_file_number])
            min_max_normaliser.normalise_data(nn_cmp_file_list, nn_cmp_norm_file_list)

            cmp_min_vector = min_max_normaliser.min_vector
            cmp_max_vector = min_max_normaliser.max_vector
            cmp_norm_info = numpy.concatenate((cmp_min_vector, cmp_max_vector), axis=0)
        else:
            logger.critical('Normalisation type %s is not supported!\n' %(cfg.output_feature_normalisation))
            # NOTE(review): bare `raise` with no active exception raises
            # "RuntimeError: No active exception to re-raise" -- probably
            # intended to raise an explicit error type.
            raise

        if not cfg.GenTestList:
            cmp_norm_info = numpy.array(cmp_norm_info, 'float32')
            fid = open(norm_info_file, 'wb')
            cmp_norm_info.tofile(fid)
            fid.close()
            logger.info('saved %s vectors to %s' %(cfg.output_feature_normalisation, norm_info_file))

            # slice the global std vector into per-feature variance files
            # NOTE(review): `global_std_vector` is only defined on the MVN
            # path -- MINMAX + not GenTestList would hit NameError here.
            feature_index = 0
            for feature_name in list(cfg.out_dimension_dict.keys()):
                feature_std_vector = numpy.array(global_std_vector[:,feature_index:feature_index+cfg.out_dimension_dict[feature_name]], 'float32')
                fid = open(var_file_dict[feature_name], 'w')
                feature_var_vector = feature_std_vector**2
                feature_var_vector.tofile(fid)
                fid.close()
                logger.info('saved %s variance vector to %s' %(feature_name, var_file_dict[feature_name]))
                feature_index += cfg.out_dimension_dict[feature_name]

    train_x_file_list, train_y_file_list = file_paths.get_train_list_x_y()
    valid_x_file_list, valid_y_file_list = file_paths.get_valid_list_x_y()
    test_x_file_list, test_y_file_list = file_paths.get_test_list_x_y()

    # we need to know the label dimension before training the DNN
    # computing that requires us to look at the labels
    label_normaliser = HTSLabelNormalisation(question_file_name=cfg.question_file_name, add_frame_features=cfg.add_frame_features, subphone_feats=cfg.subphone_feats)
    add_feat_dim = sum(cfg.additional_features.values())
    lab_dim = label_normaliser.dimension + add_feat_dim + cfg.appended_input_dim
    if cfg.VoiceConversion:
        lab_dim = cfg.cmp_dim
    logger.info('label dimension is %d' % lab_dim)

    # encode the architecture as e.g. "3_1024_1024_1024" for file naming
    combined_model_arch = str(len(hidden_layer_size))
    for hid_size in hidden_layer_size:
        combined_model_arch += '_' + str(hid_size)

    nnets_file_name = file_paths.get_nnets_file_name()
    temp_dir_name = file_paths.get_temp_nn_dir_name()

    gen_dir = os.path.join(gen_dir, temp_dir_name)

    if cfg.switch_to_keras:
        ### set configuration variables ###
        cfg.inp_dim = lab_dim
        cfg.out_dim = cfg.cmp_dim
        cfg.inp_feat_dir = nn_label_norm_dir
        cfg.out_feat_dir = nn_cmp_norm_dir
        cfg.pred_feat_dir = gen_dir
        if cfg.GenTestList and cfg.test_synth_dir!="None":
            cfg.inp_feat_dir = cfg.test_synth_dir
            cfg.pred_feat_dir = cfg.test_synth_dir

        ### call kerasclass and use an instance ###
        keras_instance = KerasClass(cfg)

    ### DNN model training
    if cfg.TRAINDNN:
        var_dict = load_covariance(var_file_dict, cfg.out_dimension_dict)

        logger.info('training DNN')

        # norm_info_file holds two stacked vectors: mean/min then std/max
        fid = open(norm_info_file, 'rb')
        cmp_min_max = numpy.fromfile(fid, dtype=numpy.float32)
        fid.close()
        cmp_min_max = cmp_min_max.reshape((2, -1))
        cmp_mean_vector = cmp_min_max[0, ]
        cmp_std_vector = cmp_min_max[1, ]

        try:
            os.makedirs(model_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                # not an error - just means directory already exists
                pass
            else:
                logger.critical('Failed to create model directory %s' % model_dir)
                logger.critical(' OS error was: %s' % e.strerror)
                raise

        try:
            if cfg.switch_to_keras:
                keras_instance.train_keras_model()
            else:
                train_DNN(train_xy_file_list = (train_x_file_list, train_y_file_list), \
                          valid_xy_file_list = (valid_x_file_list, valid_y_file_list), \
                          nnets_file_name = nnets_file_name, \
                          n_ins = lab_dim, n_outs = cfg.cmp_dim, ms_outs = cfg.multistream_outs, \
                          hyper_params = cfg.hyper_params, buffer_size = cfg.buffer_size, plot = cfg.plot, var_dict = var_dict,
                          cmp_mean_vector = cmp_mean_vector, cmp_std_vector = cmp_std_vector)
        except KeyboardInterrupt:
            logger.critical('train_DNN interrupted via keyboard')
            # Could 'raise' the exception further, but that causes a deep traceback to be printed
            # which we don't care about for a keyboard interrupt. So, just bail out immediately
            sys.exit(1)
        except:
            logger.critical('train_DNN threw an exception')
            raise

    if cfg.GENBNFEA:
        # Please only tune on this step when you want to generate bottleneck features from DNN
        gen_dir = file_paths.bottleneck_features

        # the bottleneck is the narrowest hidden layer; remember its index
        bottleneck_size = min(hidden_layer_size)
        bottleneck_index = 0
        for i in range(len(hidden_layer_size)):
            if hidden_layer_size[i] == bottleneck_size:
                bottleneck_index = i

        logger.info('generating bottleneck features from DNN')

        try:
            os.makedirs(gen_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                # not an error - just means directory already exists
                pass
            else:
                logger.critical('Failed to create generation directory %s' % gen_dir)
                logger.critical(' OS error was: %s' % e.strerror)
                raise

        gen_file_id_list = file_id_list[0:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
        test_x_file_list = nn_label_norm_file_list[0:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]

        gen_file_list = prepare_file_path_list(gen_file_id_list, gen_dir, cfg.cmp_ext)

        dnn_hidden_generation(test_x_file_list, nnets_file_name, lab_dim, cfg.cmp_dim, gen_file_list, bottleneck_index)

    ### generate parameters from DNN
    # default to the valid+test portion of the corpus
    gen_file_id_list = file_id_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
    test_x_file_list = nn_label_norm_file_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]

    if cfg.GenTestList:
        gen_file_id_list = test_id_list
        test_x_file_list = nn_label_norm_file_list
        if cfg.test_synth_dir!="None":
            gen_dir = cfg.test_synth_dir

    if cfg.DNNGEN:
        logger.info('generating from DNN')

        try:
            os.makedirs(gen_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                # not an error - just means directory already exists
                pass
            else:
                logger.critical('Failed to create generation directory %s' % gen_dir)
                logger.critical(' OS error was: %s' % e.strerror)
                raise

        gen_file_list = prepare_file_path_list(gen_file_id_list, gen_dir, cfg.cmp_ext)

        if cfg.switch_to_keras:
            keras_instance.test_keras_model()
        else:
            reshape_io = True if cfg.rnn_batch_training else False
            dnn_generation(test_x_file_list, nnets_file_name, lab_dim, cfg.cmp_dim, gen_file_list, reshape_io)

        logger.debug('denormalising generated output using method %s' % cfg.output_feature_normalisation)

        fid = open(norm_info_file, 'rb')
        cmp_min_max = numpy.fromfile(fid, dtype=numpy.float32)
        fid.close()
        cmp_min_max = cmp_min_max.reshape((2, -1))
        cmp_min_vector = cmp_min_max[0, ]
        cmp_max_vector = cmp_min_max[1, ]

        if cfg.output_feature_normalisation == 'MVN':
            denormaliser = MeanVarianceNorm(feature_dimension = cfg.cmp_dim)
            denormaliser.feature_denormalisation(gen_file_list, gen_file_list, cmp_min_vector, cmp_max_vector)
        elif cfg.output_feature_normalisation == 'MINMAX':
            denormaliser = MinMaxNormalisation(cfg.cmp_dim, min_value = 0.01, max_value = 0.99, min_vector = cmp_min_vector, max_vector = cmp_max_vector)
            denormaliser.denormalise_data(gen_file_list, gen_file_list)
        else:
            logger.critical('denormalising method %s is not supported!\n' %(cfg.output_feature_normalisation))
            # NOTE(review): bare `raise` with no active exception (see above)
            raise

        if cfg.AcousticModel:
            ##perform MLPG to smooth parameter trajectory
            ## lf0 is included, the output features much have vuv.
            generator = ParameterGeneration(gen_wav_features = cfg.gen_wav_features, enforce_silence = cfg.enforce_silence)
            generator.acoustic_decomposition(gen_file_list, cfg.cmp_dim, cfg.out_dimension_dict, cfg.file_extension_dict, var_file_dict, do_MLPG=cfg.do_MLPG, cfg=cfg)

        if cfg.DurationModel:
            ### Perform duration normalization(min. state dur set to 1) ###
            gen_dur_list = prepare_file_path_list(gen_file_id_list, gen_dir, cfg.dur_ext)
            gen_label_list = prepare_file_path_list(gen_file_id_list, gen_dir, cfg.lab_ext)
            in_gen_label_align_file_list = prepare_file_path_list(gen_file_id_list, cfg.in_label_align_dir, cfg.lab_ext, False)

            generator = ParameterGeneration(gen_wav_features = cfg.gen_wav_features)
            generator.duration_decomposition(gen_file_list, cfg.cmp_dim, cfg.out_dimension_dict, cfg.file_extension_dict)

            label_modifier = HTSLabelModification(silence_pattern = cfg.silence_pattern, label_type = cfg.label_type)
            label_modifier.modify_duration_labels(in_gen_label_align_file_list, gen_dur_list, gen_label_list)

    ### generate wav
    if cfg.GENWAV:
        logger.info('reconstructing waveform(s)')
        generate_wav(gen_dir, gen_file_id_list, cfg)     # generated speech
        # generate_wav(nn_cmp_dir, gen_file_id_list, cfg)  # reference copy synthesis speech

    ### setting back to original conditions before calculating objective scores ###
    if cfg.GenTestList:
        in_label_align_file_list = prepare_file_path_list(file_id_list, cfg.in_label_align_dir, cfg.lab_ext, False)
        binary_label_file_list = prepare_file_path_list(file_id_list, binary_label_dir, cfg.lab_ext)
        gen_file_id_list = file_id_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]

    ### evaluation: RMSE and CORR for duration
    if cfg.CALMCD and cfg.DurationModel:
        logger.info('calculating MCD')

        ref_data_dir = os.path.join(inter_data_dir, 'ref_data')

        ref_dur_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.dur_ext)

        in_gen_label_align_file_list = in_label_align_file_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
        calculator = IndividualDistortionComp()

        valid_file_id_list = file_id_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number]
        test_file_id_list = file_id_list[cfg.train_file_number+cfg.valid_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]

        if cfg.remove_silence_using_binary_labels:
            # NOTE(review): `untrimmed_test_labels` and `silence_feature` are
            # only defined in the acoustic-evaluation branch below -- taking
            # this path here would raise NameError; confirm and fix upstream.
            untrimmed_reference_data = in_file_list_dict['dur'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
            trim_silence(untrimmed_reference_data, ref_dur_list, cfg.dur_dim, \
                         untrimmed_test_labels, lab_dim, silence_feature)
        else:
            remover = SilenceRemover(n_cmp = cfg.dur_dim, silence_pattern = cfg.silence_pattern, label_type=cfg.label_type, remove_frame_features = cfg.add_frame_features)
            remover.remove_silence(in_file_list_dict['dur'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number], in_gen_label_align_file_list, ref_dur_list)

        valid_dur_rmse, valid_dur_corr = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir, cfg.dur_ext, cfg.dur_dim)
        test_dur_rmse, test_dur_corr = calculator.compute_distortion(test_file_id_list , ref_data_dir, gen_dir, cfg.dur_ext, cfg.dur_dim)

        logger.info('Develop: DNN -- RMSE: %.3f frames/phoneme; CORR: %.3f; ' \
                    %(valid_dur_rmse, valid_dur_corr))
        logger.info('Test: DNN -- RMSE: %.3f frames/phoneme; CORR: %.3f; ' \
                    %(test_dur_rmse, test_dur_corr))

    ### evaluation: calculate distortion
    if cfg.CALMCD and cfg.AcousticModel:
        logger.info('calculating MCD')

        ref_data_dir = os.path.join(inter_data_dir, 'ref_data')

        ref_mgc_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.mgc_ext)
        ref_bap_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.bap_ext)
        ref_lf0_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.lf0_ext)

        in_gen_label_align_file_list = in_label_align_file_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
        calculator = IndividualDistortionComp()

        spectral_distortion = 0.0
        bap_mse = 0.0
        f0_mse = 0.0
        vuv_error = 0.0

        valid_file_id_list = file_id_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number]
        test_file_id_list = file_id_list[cfg.train_file_number+cfg.valid_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]

        if cfg.remove_silence_using_binary_labels:
            ## get lab_dim:
            label_composer = LabelComposer()
            label_composer.load_label_configuration(cfg.label_config_file)
            lab_dim=label_composer.compute_label_dimension()

            ## use first feature in label -- hardcoded for now
            silence_feature = 0

            ## Use these to trim silence:
            untrimmed_test_labels = binary_label_file_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]

        if 'mgc' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['mgc'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_mgc_list, cfg.mgc_dim, \
                             untrimmed_test_labels, lab_dim, silence_feature)
            elif cfg.remove_silence_using_hts_labels:
                remover = SilenceRemover(n_cmp = cfg.mgc_dim, silence_pattern = cfg.silence_pattern, label_type=cfg.label_type)
                remover.remove_silence(in_file_list_dict['mgc'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number], in_gen_label_align_file_list, ref_mgc_list)
            else:
                # no silence removal: compare against the raw reference mgc
                ref_data_dir = os.path.join(data_dir, 'mgc')

            valid_spectral_distortion = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir, cfg.mgc_ext, cfg.mgc_dim)
            test_spectral_distortion = calculator.compute_distortion(test_file_id_list , ref_data_dir, gen_dir, cfg.mgc_ext, cfg.mgc_dim)
            # convert mean squared log-spectral error to mel-cepstral distortion in dB
            valid_spectral_distortion *= (10 /numpy.log(10)) * numpy.sqrt(2.0)    ##MCD
            test_spectral_distortion *= (10 /numpy.log(10)) * numpy.sqrt(2.0)    ##MCD

        if 'bap' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['bap'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_bap_list, cfg.bap_dim, \
                             untrimmed_test_labels, lab_dim, silence_feature)
            elif cfg.remove_silence_using_hts_labels:
                remover = SilenceRemover(n_cmp = cfg.bap_dim, silence_pattern = cfg.silence_pattern, label_type=cfg.label_type)
                remover.remove_silence(in_file_list_dict['bap'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number], in_gen_label_align_file_list, ref_bap_list)
            else:
                ref_data_dir = os.path.join(data_dir, 'bap')

            valid_bap_mse = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir, cfg.bap_ext, cfg.bap_dim)
            test_bap_mse = calculator.compute_distortion(test_file_id_list , ref_data_dir, gen_dir, cfg.bap_ext, cfg.bap_dim)
            valid_bap_mse = valid_bap_mse / 10.0    ##Cassia's bap is computed from 10*log|S(w)|. if use HTS/SPTK style, do the same as MGC
            test_bap_mse = test_bap_mse / 10.0    ##Cassia's bap is computed from 10*log|S(w)|. if use HTS/SPTK style, do the same as MGC

        if 'lf0' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['lf0'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_lf0_list, cfg.lf0_dim, \
                             untrimmed_test_labels, lab_dim, silence_feature)
            elif cfg.remove_silence_using_hts_labels:
                remover = SilenceRemover(n_cmp = cfg.lf0_dim, silence_pattern = cfg.silence_pattern, label_type=cfg.label_type)
                remover.remove_silence(in_file_list_dict['lf0'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number], in_gen_label_align_file_list, ref_lf0_list)
            else:
                ref_data_dir = os.path.join(data_dir, 'lf0')

            valid_f0_mse, valid_f0_corr, valid_vuv_error = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir, cfg.lf0_ext, cfg.lf0_dim)
            test_f0_mse , test_f0_corr, test_vuv_error = calculator.compute_distortion(test_file_id_list , ref_data_dir, gen_dir, cfg.lf0_ext, cfg.lf0_dim)

        logger.info('Develop: DNN -- MCD: %.3f dB; BAP: %.3f dB; F0:- RMSE: %.3f Hz; CORR: %.3f; VUV: %.3f%%' \
                    %(valid_spectral_distortion, valid_bap_mse, valid_f0_mse, valid_f0_corr, valid_vuv_error*100.))
        logger.info('Test : DNN -- MCD: %.3f dB; BAP: %.3f dB; F0:- RMSE: %.3f Hz; CORR: %.3f; VUV: %.3f%%' \
                    %(test_spectral_distortion , test_bap_mse , test_f0_mse , test_f0_corr, test_vuv_error*100.))
# Script: binarise HTS labels for a sentence list, strip silence frames, then
# compute and save min/max normalisation vectors for the binary labels.
# `p` (the ArgumentParser) is created earlier in this script, outside this view.
p.add_argument('-s', '--senlst', dest='senlst', required=True)
p.add_argument('-c', '--config', dest='config', required=True)
a = p.parse_args()
load_config(a.config)
# pull the configured directory constants (HTS2DIR, LAB1DIR, ...) into scope:
from __init__ import *

with open(a.senlst) as f:
    sentences = [l.rstrip() for l in f if l]
# per-sentence paths for each processing stage:
hts2 = [path.join(HTS2DIR, s + '.lab') for s in sentences]   # input HTS labels
lab1 = [path.join(LAB1DIR, s + '.lab') for s in sentences]   # binarised labels
lab2 = [path.join(LAB2DIR, s + '.lab') for s in sentences]   # silence-stripped labels
lab3 = [path.join(LAB3DIR, s + '.lab') for s in sentences]   # NOTE(review): built but never used below

# turn HTS full-context labels into binary question-answer vectors
binarizer = HTSLabelNormalisation(question_file_name=path.join(RESDIR, '600.hed'))
binarizer.perform_normalisation(hts2, lab1)

# drop frames aligned with the silence phone ('#')
remover = SilenceRemover(n_cmp=binarizer.dimension, silence_pattern=['*-#+*'])
remover.remove_silence(lab1, hts2, lab2)

# compute min/max over the silence-stripped data and persist for later use
normalizer = MinMaxNormalisation(feature_dimension=binarizer.dimension, min_value=0.01, max_value=0.99)
normalizer.find_min_max_values(lab2)
print1(normalizer.min_vector)
print1(normalizer.max_vector)
lu.write_binfile(normalizer.min_vector, path.join(LABSDIR, 'min'))
lu.write_binfile(normalizer.max_vector, path.join(LABSDIR, 'max'))
def main_function(cfg):
    """Run the full DNN training pipeline for one configuration.

    Every stage is gated by a boolean switch on ``cfg`` (NORMLAB, MAKECMP,
    NORMCMP, TRAINDNN, DNNGEN, GENWAV, CALMCD): input-label binarisation and
    normalisation, acoustic (CMP) feature composition and normalisation,
    DNN training, parameter generation, waveform reconstruction and
    objective evaluation (MCD / BAP / F0 / VUV).

    :param cfg: project configuration object (paths, dimensions, switches).
    :returns: None -- all results are written under ``cfg.work_dir`` /
        ``cfg.data_dir`` and reported through the "main" logger.
    """
    # get a logger for this main function
    logger = logging.getLogger("main")

    # get another logger to handle plotting duties
    plotlogger = logging.getLogger("plotting")
    # later, we might do this via a handler that is created, attached and configured
    # using the standard config mechanism of the logging module
    # but for now we need to do it manually
    plotlogger.set_plot_path(cfg.plot_dir)

    #### parameter setting########
    hidden_layer_size = cfg.hyper_params['hidden_layer_size']

    ####prepare environment
    try:
        file_id_list = read_file_list(cfg.file_id_scp)
        logger.debug('Loaded file id list from %s' % cfg.file_id_scp)
    except IOError:
        # this means that open(...) threw an error
        logger.critical('Could not load file id list from %s' % cfg.file_id_scp)
        raise

    ###total file number including training, development, and testing
    total_file_number = len(file_id_list)

    data_dir = cfg.data_dir

    # directories for the composed acoustic features and their normalised form
    nn_cmp_dir = os.path.join(data_dir, 'nn' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))
    nn_cmp_norm_dir = os.path.join(data_dir, 'nn_norm' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))

    model_dir = os.path.join(cfg.work_dir, 'nnets_model')
    gen_dir = os.path.join(cfg.work_dir, 'gen')

    # one parallel file list per input acoustic stream (e.g. mgc/bap/lf0)
    in_file_list_dict = {}
    for feature_name in list(cfg.in_dir_dict.keys()):
        in_file_list_dict[feature_name] = prepare_file_path_list(file_id_list, cfg.in_dir_dict[feature_name], cfg.file_extension_dict[feature_name], False)

    nn_cmp_file_list = prepare_file_path_list(file_id_list, nn_cmp_dir, cfg.cmp_ext)
    nn_cmp_norm_file_list = prepare_file_path_list(file_id_list, nn_cmp_norm_dir, cfg.cmp_ext)

    ###normalisation information
    norm_info_file = os.path.join(data_dir, 'norm_info' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim) + '_' + cfg.output_feature_normalisation + '.dat')

    ### normalise input full context label
    # currently supporting two different forms of lingustic features
    # later, we should generalise this
    if cfg.label_style == 'HTS':
        label_normaliser = HTSLabelNormalisation(question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension + cfg.appended_input_dim
        logger.info('Input label dimension is %d' % lab_dim)
        suffix=str(lab_dim)
    # no longer supported - use new "composed" style labels instead
    elif cfg.label_style == 'composed':
        # label_normaliser = XMLLabelNormalisation(xpath_file_name=cfg.xpath_file_name)
        suffix='composed'
        # NOTE(review): in this branch lab_dim is only set later (inside the
        # NORMLAB block); if NORMLAB is off, lab_dim stays unbound until the
        # recomputation further below -- fragile, verify the switch combinations.

    if cfg.process_labels_in_work_dir:
        label_data_dir = cfg.work_dir
    else:
        label_data_dir = data_dir

    # the number can be removed
    binary_label_dir = os.path.join(label_data_dir, 'binary_label_'+suffix)
    nn_label_dir = os.path.join(label_data_dir, 'nn_no_silence_lab_'+suffix)
    nn_label_norm_dir = os.path.join(label_data_dir, 'nn_no_silence_lab_norm_'+suffix)

    in_label_align_file_list = prepare_file_path_list(file_id_list, cfg.in_label_align_dir, cfg.lab_ext, False)
    binary_label_file_list = prepare_file_path_list(file_id_list, binary_label_dir, cfg.lab_ext)
    nn_label_file_list = prepare_file_path_list(file_id_list, nn_label_dir, cfg.lab_ext)
    nn_label_norm_file_list = prepare_file_path_list(file_id_list, nn_label_norm_dir, cfg.lab_ext)

    # to do - sanity check the label dimension here?

    min_max_normaliser = None
    label_norm_file = 'label_norm_%s.dat' %(cfg.label_style)
    label_norm_file = os.path.join(label_data_dir, label_norm_file)

    if cfg.NORMLAB and (cfg.label_style == 'HTS'):
        # simple HTS labels
        logger.info('preparing label data (input) using standard HTS style labels')
        # binarise the aligned HTS labels, strip silence frames, then min-max scale
        label_normaliser.perform_normalisation(in_label_align_file_list, binary_label_file_list)

        remover = SilenceRemover(n_cmp = lab_dim, silence_pattern = cfg.silence_pattern)
        remover.remove_silence(binary_label_file_list, in_label_align_file_list, nn_label_file_list)

        min_max_normaliser = MinMaxNormalisation(feature_dimension = lab_dim, min_value = 0.01, max_value = 0.99)
        ###use only training data to find min-max information, then apply on the whole dataset
        min_max_normaliser.find_min_max_values(nn_label_file_list[0:cfg.train_file_number])
        min_max_normaliser.normalise_data(nn_label_file_list, nn_label_norm_file_list)

    if cfg.NORMLAB and (cfg.label_style == 'composed'):
        # new flexible label preprocessor
        logger.info('preparing label data (input) using "composed" style labels')
        label_composer = LabelComposer()
        label_composer.load_label_configuration(cfg.label_config_file)

        logger.info('Loaded label configuration')
        # logger.info('%s' % label_composer.configuration.labels )

        lab_dim=label_composer.compute_label_dimension()
        logger.info('label dimension will be %d' % lab_dim)

        if cfg.precompile_xpaths:
            label_composer.precompile_xpaths()

        # there are now a set of parallel input label files (e.g, one set of HTS and another set of Ossian trees)
        # create all the lists of these, ready to pass to the label composer
        in_label_align_file_list = {}
        for label_style, label_style_required in label_composer.label_styles.items():
            if label_style_required:
                logger.info('labels of style %s are required - constructing file paths for them' % label_style)
                if label_style == 'xpath':
                    in_label_align_file_list['xpath'] = prepare_file_path_list(file_id_list, cfg.xpath_label_align_dir, cfg.utt_ext, False)
                elif label_style == 'hts':
                    in_label_align_file_list['hts'] = prepare_file_path_list(file_id_list, cfg.hts_label_align_dir, cfg.lab_ext, False)
                else:
                    logger.critical('unsupported label style %s specified in label configuration' % label_style)
                    raise Exception

        # now iterate through the files, one at a time, constructing the labels for them
        num_files=len(file_id_list)
        logger.info('the label styles required are %s' % label_composer.label_styles)

        for i in range(num_files):
            logger.info('making input label features for %4d of %4d' % (i+1,num_files))

            # iterate through the required label styles and open each corresponding label file
            # a dictionary of file descriptors, pointing at the required files
            required_labels={}

            for label_style, label_style_required in label_composer.label_styles.items():
                # the files will be a parallel set of files for a single utterance
                # e.g., the XML tree and an HTS label file
                if label_style_required:
                    required_labels[label_style] = open(in_label_align_file_list[label_style][i] , 'r')
                    logger.debug(' opening label file %s' % in_label_align_file_list[label_style][i])

            logger.debug('label styles with open files: %s' % required_labels)
            label_composer.make_labels(required_labels,out_file_name=binary_label_file_list[i],fill_missing_values=cfg.fill_missing_values,iterate_over_frames=cfg.iterate_over_frames)

            # now close all opened files
            for fd in required_labels.values():
                fd.close()

        # silence removal
        if cfg.remove_silence_using_binary_labels:
            silence_feature = 0 ## use first feature in label -- hardcoded for now
            logger.info('Silence removal from label using silence feature: %s'%(label_composer.configuration.labels[silence_feature]))
            logger.info('Silence will be removed from CMP files in same way')
            ## Binary labels have 2 roles: both the thing trimmed and the instructions for trimming:
            trim_silence(binary_label_file_list, nn_label_file_list, lab_dim, \
                                binary_label_file_list, lab_dim, silence_feature)
        else:
            logger.info('No silence removal done')
            # start from the labels we have just produced, not trimmed versions
            nn_label_file_list = binary_label_file_list

        min_max_normaliser = MinMaxNormalisation(feature_dimension = lab_dim, min_value = 0.01, max_value = 0.99)
        ###use only training data to find min-max information, then apply on the whole dataset
        min_max_normaliser.find_min_max_values(nn_label_file_list[0:cfg.train_file_number])
        min_max_normaliser.normalise_data(nn_label_file_list, nn_label_norm_file_list)

    if min_max_normaliser != None:
        ### save label normalisation information for unseen testing labels
        label_min_vector = min_max_normaliser.min_vector
        label_max_vector = min_max_normaliser.max_vector
        label_norm_info = numpy.concatenate((label_min_vector, label_max_vector), axis=0)

        label_norm_info = numpy.array(label_norm_info, 'float32')
        fid = open(label_norm_file, 'wb')
        label_norm_info.tofile(fid)
        fid.close()
        logger.info('saved %s vectors to %s' %(label_min_vector.size, label_norm_file))

    ### make output acoustic data
    if cfg.MAKECMP:
        logger.info('creating acoustic (output) features')
        delta_win = cfg.delta_win #[-0.5, 0.0, 0.5]
        acc_win = cfg.acc_win #[1.0, -2.0, 1.0]

        # compose statics + deltas + delta-deltas into one CMP vector per frame
        acoustic_worker = AcousticComposition(delta_win = delta_win, acc_win = acc_win)
        acoustic_worker.prepare_nn_data(in_file_list_dict, nn_cmp_file_list, cfg.in_dimension_dict, cfg.out_dimension_dict)

        if cfg.remove_silence_using_binary_labels:
            ## do this to get lab_dim:
            label_composer = LabelComposer()
            label_composer.load_label_configuration(cfg.label_config_file)
            lab_dim=label_composer.compute_label_dimension()

            silence_feature = 0 ## use first feature in label -- hardcoded for now
            logger.info('Silence removal from CMP using binary label file')

            ## overwrite the untrimmed audio with the trimmed version:
            trim_silence(nn_cmp_file_list, nn_cmp_file_list, cfg.cmp_dim,
                                binary_label_file_list, lab_dim, silence_feature)

        else:
            ## back off to previous method using HTS labels:
            remover = SilenceRemover(n_cmp = cfg.cmp_dim, silence_pattern = cfg.silence_pattern)
            remover.remove_silence(nn_cmp_file_list[0:cfg.train_file_number+cfg.valid_file_number],
                                   in_label_align_file_list[0:cfg.train_file_number+cfg.valid_file_number],
                                   nn_cmp_file_list[0:cfg.train_file_number+cfg.valid_file_number]) # save to itself

    ### save acoustic normalisation information for normalising the features back
    var_dir = os.path.join(data_dir, 'var')
    if not os.path.exists(var_dir):
        os.makedirs(var_dir)

    var_file_dict = {}
    for feature_name in list(cfg.out_dimension_dict.keys()):
        var_file_dict[feature_name] = os.path.join(var_dir, feature_name + '_' + str(cfg.out_dimension_dict[feature_name]))

    ### normalise output acoustic data
    if cfg.NORMCMP:
        logger.info('normalising acoustic (output) features using method %s' % cfg.output_feature_normalisation)
        cmp_norm_info = None
        if cfg.output_feature_normalisation == 'MVN':
            normaliser = MeanVarianceNorm(feature_dimension=cfg.cmp_dim)
            ###calculate mean and std vectors on the training data, and apply on the whole dataset
            global_mean_vector = normaliser.compute_mean(nn_cmp_file_list[0:cfg.train_file_number], 0, cfg.cmp_dim)
            global_std_vector = normaliser.compute_std(nn_cmp_file_list[0:cfg.train_file_number], global_mean_vector, 0, cfg.cmp_dim)

            normaliser.feature_normalisation(nn_cmp_file_list[0:cfg.train_file_number+cfg.valid_file_number],
                                             nn_cmp_norm_file_list[0:cfg.train_file_number+cfg.valid_file_number])
            cmp_norm_info = numpy.concatenate((global_mean_vector, global_std_vector), axis=0)

        elif cfg.output_feature_normalisation == 'MINMAX':
            # first pass only to obtain per-feature std (saved below for MLPG)
            min_max_normaliser = MinMaxNormalisation(feature_dimension = cfg.cmp_dim)
            global_mean_vector = min_max_normaliser.compute_mean(nn_cmp_file_list[0:cfg.train_file_number])
            global_std_vector = min_max_normaliser.compute_std(nn_cmp_file_list[0:cfg.train_file_number], global_mean_vector)

            min_max_normaliser = MinMaxNormalisation(feature_dimension = cfg.cmp_dim, min_value = 0.01, max_value = 0.99)
            min_max_normaliser.find_min_max_values(nn_cmp_file_list[0:cfg.train_file_number])
            min_max_normaliser.normalise_data(nn_cmp_file_list, nn_cmp_norm_file_list)

            cmp_min_vector = min_max_normaliser.min_vector
            cmp_max_vector = min_max_normaliser.max_vector
            cmp_norm_info = numpy.concatenate((cmp_min_vector, cmp_max_vector), axis=0)

        else:
            logger.critical('Normalisation type %s is not supported!\n' %(cfg.output_feature_normalisation))
            # NOTE(review): bare `raise` outside an except block raises
            # "RuntimeError: No active exception to re-raise" -- should raise
            # an explicit exception type.
            raise

        cmp_norm_info = numpy.array(cmp_norm_info, 'float32')
        fid = open(norm_info_file, 'wb')
        cmp_norm_info.tofile(fid)
        fid.close()
        logger.info('saved %s vectors to %s' %(cfg.output_feature_normalisation, norm_info_file))

        # save one per-stream variance vector, used later by parameter generation
        feature_index = 0
        for feature_name in list(cfg.out_dimension_dict.keys()):
            feature_std_vector = numpy.array(global_std_vector[:,feature_index:feature_index+cfg.out_dimension_dict[feature_name]], 'float32')

            # NOTE(review): binary tofile() through a text-mode handle ('w');
            # works on POSIX but should arguably be 'wb' -- confirm.
            fid = open(var_file_dict[feature_name], 'w')
            feature_std_vector.tofile(fid)
            fid.close()

            logger.info('saved %s variance vector to %s' %(feature_name, var_file_dict[feature_name]))

            feature_index += cfg.out_dimension_dict[feature_name]

    # partition the (already-ordered) file lists into train / valid / test
    train_x_file_list = nn_label_norm_file_list[0:cfg.train_file_number]
    train_y_file_list = nn_cmp_norm_file_list[0:cfg.train_file_number]
    valid_x_file_list = nn_label_norm_file_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number]
    valid_y_file_list = nn_cmp_norm_file_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number]
    test_x_file_list = nn_label_norm_file_list[cfg.train_file_number+cfg.valid_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
    test_y_file_list = nn_cmp_norm_file_list[cfg.train_file_number+cfg.valid_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]

    # we need to know the label dimension before training the DNN
    # computing that requires us to look at the labels
    #
    # currently, there are two ways to do this
    if cfg.label_style == 'HTS':
        label_normaliser = HTSLabelNormalisation(question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension + cfg.appended_input_dim

    elif cfg.label_style == 'composed':
        label_composer = LabelComposer()
        label_composer.load_label_configuration(cfg.label_config_file)
        lab_dim=label_composer.compute_label_dimension()

    logger.info('label dimension is %d' % lab_dim)

    combined_model_arch = str(len(hidden_layer_size))
    for hid_size in hidden_layer_size:
        combined_model_arch += '_' + str(hid_size)

    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.%f.nn.model' \
                      %(model_dir, cfg.combined_model_name, cfg.combined_feature_name, int(cfg.multistream_switch),
                        combined_model_arch, lab_dim, cfg.cmp_dim, cfg.train_file_number, cfg.hyper_params['learning_rate'])

    ### DNN model training
    if cfg.TRAINDNN:

        var_dict = load_covariance(var_file_dict, cfg.out_dimension_dict)

        logger.info('training DNN')

        fid = open(norm_info_file, 'rb')
        cmp_min_max = numpy.fromfile(fid, dtype=numpy.float32)
        fid.close()
        cmp_min_max = cmp_min_max.reshape((2, -1))
        # for MVN these two rows hold mean and std respectively
        cmp_mean_vector = cmp_min_max[0, ]
        cmp_std_vector = cmp_min_max[1, ]

        try:
            os.makedirs(model_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                # not an error - just means directory already exists
                pass
            else:
                logger.critical('Failed to create model directory %s' % model_dir)
                logger.critical(' OS error was: %s' % e.strerror)
                raise

        try:
            train_DNN(train_xy_file_list = (train_x_file_list, train_y_file_list), \
                      valid_xy_file_list = (valid_x_file_list, valid_y_file_list), \
                      nnets_file_name = nnets_file_name, \
                      n_ins = lab_dim, n_outs = cfg.cmp_dim, ms_outs = cfg.multistream_outs, \
                      hyper_params = cfg.hyper_params, buffer_size = cfg.buffer_size, plot = cfg.plot, var_dict = var_dict,
                      cmp_mean_vector = cmp_mean_vector, cmp_std_vector = cmp_std_vector, init_dnn_model_file = cfg.start_from_trained_model)
        except KeyboardInterrupt:
            logger.critical('train_DNN interrupted via keyboard')
            # Could 'raise' the exception further, but that causes a deep traceback to be printed
            # which we don't care about for a keyboard interrupt. So, just bail out immediately
            sys.exit(1)
        except:
            logger.critical('train_DNN threw an exception')
            raise

    ### generate parameters from DNN
    temp_dir_name = '%s_%s_%d_%d_%d_%d_%d_%d' \
                    %(cfg.combined_model_name, cfg.combined_feature_name, int(cfg.do_post_filtering), \
                      cfg.train_file_number, lab_dim, cfg.cmp_dim, \
                      len(hidden_layer_size), hidden_layer_size[0])
    gen_dir = os.path.join(gen_dir, temp_dir_name)

    # generation covers the held-out (valid + test) portion of the corpus
    gen_file_id_list = file_id_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
    test_x_file_list = nn_label_norm_file_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]

    if cfg.DNNGEN:
        logger.info('generating from DNN')

        try:
            os.makedirs(gen_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                # not an error - just means directory already exists
                pass
            else:
                logger.critical('Failed to create generation directory %s' % gen_dir)
                logger.critical(' OS error was: %s' % e.strerror)
                raise

        gen_file_list = prepare_file_path_list(gen_file_id_list, gen_dir, cfg.cmp_ext)

        dnn_generation(test_x_file_list, nnets_file_name, lab_dim, cfg.cmp_dim, gen_file_list)

        logger.debug('denormalising generated output using method %s' % cfg.output_feature_normalisation)

        fid = open(norm_info_file, 'rb')
        cmp_min_max = numpy.fromfile(fid, dtype=numpy.float32)
        fid.close()
        cmp_min_max = cmp_min_max.reshape((2, -1))
        # rows are min/max for MINMAX, mean/std for MVN (names reflect MINMAX)
        cmp_min_vector = cmp_min_max[0, ]
        cmp_max_vector = cmp_min_max[1, ]

        if cfg.output_feature_normalisation == 'MVN':
            denormaliser = MeanVarianceNorm(feature_dimension = cfg.cmp_dim)
            denormaliser.feature_denormalisation(gen_file_list, gen_file_list, cmp_min_vector, cmp_max_vector)

        elif cfg.output_feature_normalisation == 'MINMAX':
            denormaliser = MinMaxNormalisation(cfg.cmp_dim, min_value = 0.01, max_value = 0.99, min_vector = cmp_min_vector, max_vector = cmp_max_vector)
            denormaliser.denormalise_data(gen_file_list, gen_file_list)
        else:
            logger.critical('denormalising method %s is not supported!\n' %(cfg.output_feature_normalisation))
            # NOTE(review): bare `raise` outside an except block -- same issue as above.
            raise

        ##perform MLPG to smooth parameter trajectory
        ## lf0 is included, the output features much have vuv.
        generator = ParameterGeneration(gen_wav_features = cfg.gen_wav_features)
        generator.acoustic_decomposition(gen_file_list, cfg.cmp_dim, cfg.out_dimension_dict, cfg.file_extension_dict, var_file_dict)

    ### generate wav
    if cfg.GENWAV:
        logger.info('reconstructing waveform(s)')
        print(len(gen_file_id_list))
        generate_wav(gen_dir, gen_file_id_list[cfg.valid_file_number:cfg.valid_file_number+cfg.test_file_number], cfg)     # generated speech
        # generate_wav(nn_cmp_dir, gen_file_id_list)  # reference copy synthesis speech

    ### evaluation: calculate distortion
    if cfg.CALMCD:
        logger.info('calculating MCD')

        ref_data_dir = os.path.join(data_dir, 'ref_data')

        ref_mgc_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.mgc_ext)
        ref_bap_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.bap_ext)
        ref_lf0_list = prepare_file_path_list(gen_file_id_list, ref_data_dir, cfg.lf0_ext)

        in_gen_label_align_file_list = in_label_align_file_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
        calculator = IndividualDistortionComp()

        spectral_distortion = 0.0
        bap_mse = 0.0
        f0_mse = 0.0
        vuv_error = 0.0

        valid_file_id_list = file_id_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number]
        test_file_id_list = file_id_list[cfg.train_file_number+cfg.valid_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]

        if cfg.remove_silence_using_binary_labels:
            ## get lab_dim:
            label_composer = LabelComposer()
            label_composer.load_label_configuration(cfg.label_config_file)
            lab_dim=label_composer.compute_label_dimension()

            ## use first feature in label -- hardcoded for now
            silence_feature = 0

            ## Use these to trim silence:
            untrimmed_test_labels = binary_label_file_list[cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]

        if 'mgc' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['mgc'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_mgc_list, cfg.mgc_dim, \
                                untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp = cfg.mgc_dim, silence_pattern = cfg.silence_pattern)
                remover.remove_silence(in_file_list_dict['mgc'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number], in_gen_label_align_file_list, ref_mgc_list)
            valid_spectral_distortion = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir, cfg.mgc_ext, cfg.mgc_dim)
            test_spectral_distortion = calculator.compute_distortion(test_file_id_list , ref_data_dir, gen_dir, cfg.mgc_ext, cfg.mgc_dim)
            # convert mean-squared log-spectral distance to mel-cepstral distortion in dB
            valid_spectral_distortion *= (10 /numpy.log(10)) * numpy.sqrt(2.0)    ##MCD
            test_spectral_distortion *= (10 /numpy.log(10)) * numpy.sqrt(2.0)    ##MCD

        if 'bap' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['bap'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_bap_list, cfg.bap_dim, \
                                untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp = cfg.bap_dim, silence_pattern = cfg.silence_pattern)
                remover.remove_silence(in_file_list_dict['bap'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number], in_gen_label_align_file_list, ref_bap_list)
            valid_bap_mse = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir, cfg.bap_ext, cfg.bap_dim)
            test_bap_mse = calculator.compute_distortion(test_file_id_list , ref_data_dir, gen_dir, cfg.bap_ext, cfg.bap_dim)
            valid_bap_mse = valid_bap_mse / 10.0    ##Cassia's bap is computed from 10*log|S(w)|. if use HTS/SPTK style, do the same as MGC
            test_bap_mse = test_bap_mse / 10.0    ##Cassia's bap is computed from 10*log|S(w)|. if use HTS/SPTK style, do the same as MGC

        if 'lf0' in cfg.in_dimension_dict:
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['lf0'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_lf0_list, cfg.lf0_dim, \
                                untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp = cfg.lf0_dim, silence_pattern = cfg.silence_pattern)
                remover.remove_silence(in_file_list_dict['lf0'][cfg.train_file_number:cfg.train_file_number+cfg.valid_file_number+cfg.test_file_number], in_gen_label_align_file_list, ref_lf0_list)
            valid_f0_mse, valid_vuv_error = calculator.compute_distortion(valid_file_id_list, ref_data_dir, gen_dir, cfg.lf0_ext, cfg.lf0_dim)
            test_f0_mse , test_vuv_error = calculator.compute_distortion(test_file_id_list , ref_data_dir, gen_dir, cfg.lf0_ext, cfg.lf0_dim)

        logger.info('Develop: DNN -- MCD: %.3f dB; BAP: %.3f dB; F0: %.3f Hz; VUV: %.3f%%' \
                    %(valid_spectral_distortion, valid_bap_mse, valid_f0_mse, valid_vuv_error*100.))
        logger.info('Test   : DNN -- MCD: %.3f dB; BAP: %.3f dB; F0: %.3f Hz; VUV: %.3f%%' \
                    %(test_spectral_distortion , test_bap_mse , test_f0_mse , test_vuv_error*100.))
def main_function(cfg, in_dir, out_dir):
    """Synthesise speech for every label/utt file found in *in_dir*.

    Unlike the training-time pipeline, this variant discovers its file list
    by globbing *in_dir*, reloads previously-saved label min-max and acoustic
    normalisation statistics, runs the trained DNN, applies MLPG and simple
    variance scaling, and writes waveforms under *out_dir*.

    :param cfg: project configuration object (paths, dimensions, switches).
    :param in_dir: directory of input label (.lab) or utterance (.utt) files.
    :param out_dir: output directory (created if missing); waveforms go to
        ``out_dir/gen_scaled``.
    :returns: None.

    NOTE(review): this function uses Python-2-only constructs (``iteritems``,
    ``itervalues``, ``xrange``, integer ``/`` division for ``nbytes / 4``);
    it will not run unchanged under Python 3 -- confirm the intended
    interpreter before porting.
    """
    # get a logger for this main function
    logger = logging.getLogger("main")

    # get another logger to handle plotting duties
    plotlogger = logging.getLogger("plotting")

    #### parameter setting########
    hidden_layers_sizes = cfg.hyper_params['hidden_layer_size']

    # build the file id list from whatever label/utt files are present
    file_id_list = []

    if cfg.label_style == 'HTS':
        ext = '.lab'
    else:
        ext = '.utt'

    synth_utts = glob.glob(in_dir + '/*' + ext)
    for fname in synth_utts:
        junk, name = os.path.split(fname)
        file_id_list.append(name.replace(ext, ''))

    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    ###total file number including training, development, and testing
    #total_file_number = len(file_id_list)

    data_dir = cfg.data_dir

    #nn_cmp_dir = os.path.join(data_dir, 'nn' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))
    #nn_cmp_norm_dir = os.path.join(data_dir, 'nn_norm' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))

    model_dir = os.path.join(cfg.work_dir, 'nnets_model')
    gen_dir = os.path.join(out_dir, 'gen')

    #in_file_list_dict = {}
    #for feature_name in cfg.in_dir_dict.keys():
    #    in_file_list_dict[feature_name] = prepare_file_path_list(file_id_list, cfg.in_dir_dict[feature_name], cfg.file_extension_dict[feature_name], False)

    #nn_cmp_file_list = prepare_file_path_list(file_id_list, nn_cmp_dir, cfg.cmp_ext)
    #nn_cmp_norm_file_list = prepare_file_path_list(file_id_list, nn_cmp_norm_dir, cfg.cmp_ext)

    ###normalisation information
    norm_info_file = os.path.join(
        data_dir, 'norm_info' + cfg.combined_feature_name + '_' +
        str(cfg.cmp_dim) + '_' + cfg.output_feature_normalisation + '.dat')

    ### normalise input full context label
    # currently supporting two different forms of lingustic features
    # later, we should generalise this
    if cfg.label_style == 'HTS':
        label_normaliser = HTSLabelNormalisation(
            question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension
        logger.info('Input label dimension is %d' % lab_dim)
        suffix = str(lab_dim)
    # no longer supported - use new "composed" style labels instead
    elif cfg.label_style == 'composed':
        # label_normaliser = XMLLabelNormalisation(xpath_file_name=cfg.xpath_file_name)
        suffix = 'composed'

    # the number can be removed
    binary_label_dir = os.path.join(out_dir, 'lab_bin')
    nn_label_norm_dir = os.path.join(out_dir, 'lab_bin_norm')

    in_label_align_file_list = prepare_file_path_list(file_id_list, in_dir, cfg.lab_ext)
    binary_label_file_list = prepare_file_path_list(file_id_list, binary_label_dir, cfg.lab_ext)
    nn_label_norm_file_list = prepare_file_path_list(file_id_list, nn_label_norm_dir, cfg.lab_ext)

    ## need this to find normalisation info:
    if cfg.process_labels_in_work_dir:
        label_data_dir = cfg.work_dir
    else:
        label_data_dir = data_dir

    min_max_normaliser = None
    label_norm_file = 'label_norm_%s.dat' % (cfg.label_style)
    label_norm_file = os.path.join(label_data_dir, label_norm_file)

    if cfg.label_style == 'HTS':
        # simple HTS labels
        logger.info(
            'preparing label data (input) using standard HTS style labels')
        label_normaliser.perform_normalisation(in_label_align_file_list,
                                               binary_label_file_list)

    else:
        logger.info(
            'preparing label data (input) using "composed" style labels')
        label_composer = LabelComposer()
        label_composer.load_label_configuration(cfg.label_config_file)

        logger.info('Loaded label configuration')
        # logger.info('%s' % label_composer.configuration.labels )

        lab_dim = label_composer.compute_label_dimension()
        logger.info('label dimension will be %d' % lab_dim)

        if cfg.precompile_xpaths:
            label_composer.precompile_xpaths()

        # there are now a set of parallel input label files (e.g, one set of HTS and another set of Ossian trees)
        # create all the lists of these, ready to pass to the label composer
        in_label_align_file_list = {}
        for label_style, label_style_required in label_composer.label_styles.iteritems():
            if label_style_required:
                logger.info(
                    'labels of style %s are required - constructing file paths for them' % label_style)
                if label_style == 'xpath':
                    in_label_align_file_list['xpath'] = prepare_file_path_list(
                        file_id_list, in_dir, cfg.utt_ext, False)
                elif label_style == 'hts':
                    logger.critical('script not tested with HTS labels')
                else:
                    logger.critical(
                        'unsupported label style %s specified in label configuration' % label_style)
                    raise Exception

        # now iterate through the files, one at a time, constructing the labels for them
        num_files = len(file_id_list)
        logger.info('the label styles required are %s' % label_composer.label_styles)

        for i in xrange(num_files):
            logger.info('making input label features for %4d of %4d' % (i + 1, num_files))

            # iterate through the required label styles and open each corresponding label file
            # a dictionary of file descriptors, pointing at the required files
            required_labels = {}

            for label_style, label_style_required in label_composer.label_styles.iteritems():
                # the files will be a parallel set of files for a single utterance
                # e.g., the XML tree and an HTS label file
                if label_style_required:
                    required_labels[label_style] = open(
                        in_label_align_file_list[label_style][i], 'r')
                    logger.debug(' opening label file %s' % in_label_align_file_list[label_style][i])

            logger.debug('label styles with open files: %s' % required_labels)
            label_composer.make_labels(
                required_labels,
                out_file_name=binary_label_file_list[i],
                fill_missing_values=cfg.fill_missing_values,
                iterate_over_frames=cfg.iterate_over_frames)

            # now close all opened files
            for fd in required_labels.itervalues():
                fd.close()

    # no silence removal for synthesis ...

    ## minmax norm:
    min_max_normaliser = MinMaxNormalisation(feature_dimension=lab_dim,
                                             min_value=0.01, max_value=0.99)

    # reload stored minmax values: (TODO -- move reading and writing into MinMaxNormalisation class)
    fid = open(label_norm_file, 'rb')

    ## This doesn't work -- precision is lost -- reads in as float64
    #label_norm_info = numpy.fromfile(fid)
    #label_norm_info = numpy.array(label_norm_info, 'float32')

    ## use struct to enforce float32:
    nbytes = os.stat(label_norm_file)[6]  # length in bytes
    data = fid.read(nbytes)  # = read until bytes run out
    fid.close()
    # NOTE(review): under Python 3, `/` is true division so m would be a float
    # and break both the struct format and the slicing below (py2 assumed).
    m = nbytes / 4  ## number 32 bit floats
    format = str(m) + "f"
    label_norm_info = struct.unpack(format, data)
    label_norm_info = numpy.array(label_norm_info)
    # first half of the saved vector is the per-dim minimum, second the maximum
    min_max_normaliser.min_vector = label_norm_info[:m / 2]
    min_max_normaliser.max_vector = label_norm_info[m / 2:]

    ### apply precompuated min-max to the whole dataset
    min_max_normaliser.normalise_data(binary_label_file_list, nn_label_norm_file_list)

    ### make output acoustic data
    # if cfg.MAKECMP:

    ### retrieve acoustic normalisation information for normalising the features back
    var_dir = os.path.join(data_dir, 'var')
    var_file_dict = {}
    for feature_name in cfg.out_dimension_dict.keys():
        var_file_dict[feature_name] = os.path.join(
            var_dir, feature_name + '_' + str(cfg.out_dimension_dict[feature_name]))

    ### normalise output acoustic data
    # if cfg.NORMCMP:

    combined_model_arch = str(len(hidden_layers_sizes))
    for hid_size in hidden_layers_sizes:
        combined_model_arch += '_' + str(hid_size)

    # must match the name the training run saved the model under
    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.model' \
                      %(model_dir, cfg.model_type, cfg.combined_feature_name, int(cfg.multistream_switch),
                        combined_model_arch, lab_dim, cfg.cmp_dim, cfg.train_file_number)

    ### DNN model training
    # if cfg.TRAINDNN:

    ##if cfg.DNNGEN:
    logger.info('generating from DNN')

    try:
        os.makedirs(gen_dir)
    except OSError as e:
        if e.errno == errno.EEXIST:
            # not an error - just means directory already exists
            pass
        else:
            logger.critical('Failed to create generation directory %s' % gen_dir)
            logger.critical(' OS error was: %s' % e.strerror)
            raise

    gen_file_list = prepare_file_path_list(file_id_list, gen_dir, cfg.cmp_ext)

    dnn_generation(nn_label_norm_file_list, nnets_file_name, lab_dim, cfg.cmp_dim, gen_file_list)

    logger.debug('denormalising generated output using method %s' % cfg.output_feature_normalisation)

    fid = open(norm_info_file, 'rb')
    cmp_min_max = numpy.fromfile(fid, dtype=numpy.float32)
    fid.close()
    cmp_min_max = cmp_min_max.reshape((2, -1))
    # rows are min/max for MINMAX, mean/std for MVN (names reflect MINMAX)
    cmp_min_vector = cmp_min_max[0, ]
    cmp_max_vector = cmp_min_max[1, ]

    if cfg.output_feature_normalisation == 'MVN':
        denormaliser = MeanVarianceNorm(feature_dimension=cfg.cmp_dim)
        denormaliser.feature_denormalisation(gen_file_list, gen_file_list, cmp_min_vector, cmp_max_vector)

    elif cfg.output_feature_normalisation == 'MINMAX':
        denormaliser = MinMaxNormalisation(cfg.cmp_dim, min_value=0.01, max_value=0.99,
                                           min_vector=cmp_min_vector, max_vector=cmp_max_vector)
        denormaliser.denormalise_data(gen_file_list, gen_file_list)
    else:
        logger.critical('denormalising method %s is not supported!\n' % (cfg.output_feature_normalisation))
        # NOTE(review): bare `raise` outside an except block raises
        # "RuntimeError: No active exception to re-raise".
        raise

    ##perform MLPG to smooth parameter trajectory
    ## lf0 is included, the output features much have vuv.
    generator = ParameterGeneration(gen_wav_features=cfg.gen_wav_features)
    generator.acoustic_decomposition(gen_file_list, cfg.cmp_dim, cfg.out_dimension_dict, cfg.file_extension_dict, var_file_dict)

    logger.info('Simple variance expansion')
    test_var_scaling = False
    scaled_dir = gen_dir + '_scaled'
    if test_var_scaling:
        file_id_list = simple_scale_variance_CONTINUUM(gen_dir, scaled_dir, var_file_dict, cfg.out_dimension_dict, file_id_list)
    else:
        simple_scale_variance(gen_dir, scaled_dir, var_file_dict, cfg.out_dimension_dict, file_id_list, gv_weight=1.0)  ## gv_weight hard coded here!

    ### generate wav ----
    #if cfg.GENWAV:
    logger.info('reconstructing waveform(s)')
    #generate_wav_glottHMM(scaled_dir, file_id_list)
    generate_wav(scaled_dir, file_id_list, cfg)
def main_function_synth(cfg, dnn_model):
    """End-to-end synthesis pipeline driven by a config object and an in-memory DNN.

    Depending on flags on `cfg` (NORMLAB, MAKEDUR, MAKECMP, NORMCMP, DNNGEN,
    GENWAV, CALMCD, GenTestList, ...) this: binarises and normalises input
    labels, composes/normalises output acoustic features, generates parameters
    from `dnn_model`, denormalises them, runs MLPG / duration decomposition,
    reconstructs waveforms and finally computes objective scores.

    :param cfg: pipeline configuration object (Merlin-style) holding paths,
        dimensions, feature dictionaries and boolean stage switches.
    :param dnn_model: trained network instance passed straight to
        dnn_generation_yuhao() -- no model file is loaded here.
    """
    # get a logger for this main function
    logger = logging.getLogger("main")

    # get another logger to handle plotting duties
    plotlogger = logging.getLogger("plotting")
    # later, we might do this via a handler that is created, attached and configured
    # using the standard config mechanism of the logging module
    # but for now we need to do it manually
    plotlogger.set_plot_path(cfg.plot_dir)

    #### parameter setting########
    hidden_layer_size = cfg.hyper_params['hidden_layer_size']

    ####prepare environment
    try:
        file_id_list = read_file_list(cfg.file_id_scp)
        logger.debug('Loaded file id list from %s' % cfg.file_id_scp)
    except IOError:
        # this means that open(...) threw an error
        logger.critical('Could not load file id list from %s' % cfg.file_id_scp)
        raise

    ###total file number including training, development, and testing
    total_file_number = len(file_id_list)

    data_dir = cfg.data_dir

    nn_cmp_dir = os.path.join(
        data_dir, 'nn' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))
    nn_cmp_norm_dir = os.path.join(
        data_dir, 'nn_norm' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))

    #model_dir = os.path.join(cfg.work_dir, 'nnets_model')
    gen_dir = os.path.join(cfg.work_dir, 'gen')

    in_file_list_dict = {}
    for feature_name in cfg.in_dir_dict.keys():
        in_file_list_dict[feature_name] = prepare_file_path_list(
            file_id_list, cfg.in_dir_dict[feature_name],
            cfg.file_extension_dict[feature_name], False)

    nn_cmp_file_list = prepare_file_path_list(file_id_list, nn_cmp_dir,
                                              cfg.cmp_ext)
    nn_cmp_norm_file_list = prepare_file_path_list(file_id_list,
                                                   nn_cmp_norm_dir,
                                                   cfg.cmp_ext)

    ###normalisation information
    norm_info_file = os.path.join(
        data_dir, 'norm_info' + cfg.combined_feature_name + '_' +
        str(cfg.cmp_dim) + '_' + cfg.output_feature_normalisation + '.dat')

    ### normalise input full context label
    # currently supporting two different forms of lingustic features
    # later, we should generalise this
    if cfg.label_style == 'HTS':
        label_normaliser = HTSLabelNormalisation(
            question_file_name=cfg.question_file_name,
            add_frame_features=cfg.add_frame_features,
            subphone_feats=cfg.subphone_feats)
        add_feat_dim = sum(cfg.additional_features.values())
        lab_dim = label_normaliser.dimension + add_feat_dim + cfg.appended_input_dim
        logger.info('Input label dimension is %d' % lab_dim)
        suffix = str(lab_dim)
    # no longer supported - use new "composed" style labels instead
    elif cfg.label_style == 'composed':
        # label_normaliser = XMLLabelNormalisation(xpath_file_name=cfg.xpath_file_name)
        suffix = 'composed'
    # NOTE(review): a stray bare-name statement `dnn_generation` stood here in
    # the original -- a no-op artefact, removed.

    if cfg.process_labels_in_work_dir:
        label_data_dir = cfg.work_dir
    else:
        label_data_dir = data_dir

    # the number can be removed
    binary_label_dir = os.path.join(
        label_data_dir, 'binary_label_' + str(label_normaliser.dimension))
    nn_label_dir = os.path.join(label_data_dir, 'nn_no_silence_lab_' + suffix)
    nn_label_norm_dir = os.path.join(label_data_dir,
                                     'nn_no_silence_lab_norm_' + suffix)

    in_label_align_file_list = prepare_file_path_list(file_id_list,
                                                      cfg.in_label_align_dir,
                                                      cfg.lab_ext, False)
    binary_label_file_list = prepare_file_path_list(file_id_list,
                                                    binary_label_dir,
                                                    cfg.lab_ext)
    nn_label_file_list = prepare_file_path_list(file_id_list, nn_label_dir,
                                                cfg.lab_ext)
    nn_label_norm_file_list = prepare_file_path_list(file_id_list,
                                                     nn_label_norm_dir,
                                                     cfg.lab_ext)
    dur_file_list = prepare_file_path_list(file_id_list, cfg.in_dur_dir,
                                           cfg.dur_ext)
    lf0_file_list = prepare_file_path_list(file_id_list, cfg.in_lf0_dir,
                                           cfg.lf0_ext)

    # to do - sanity check the label dimension here?

    min_max_normaliser = None
    label_norm_file = 'label_norm_%s_%d.dat' % (cfg.label_style, lab_dim)
    label_norm_file = os.path.join(label_data_dir, label_norm_file)

    if cfg.GenTestList:
        # synthesis for unseen test utterances: swap in the test file lists
        try:
            test_id_list = read_file_list(cfg.test_id_scp)
            logger.debug('Loaded file id list from %s' % cfg.test_id_scp)
        except IOError:
            # this means that open(...) threw an error
            logger.critical('Could not load file id list from %s' % cfg.test_id_scp)
            raise

        in_label_align_file_list = prepare_file_path_list(
            test_id_list, cfg.in_label_align_dir, cfg.lab_ext, False)
        binary_label_file_list = prepare_file_path_list(
            test_id_list, binary_label_dir, cfg.lab_ext)
        nn_label_file_list = prepare_file_path_list(test_id_list, nn_label_dir,
                                                    cfg.lab_ext)
        nn_label_norm_file_list = prepare_file_path_list(
            test_id_list, nn_label_norm_dir, cfg.lab_ext)

    if cfg.NORMLAB and (cfg.label_style == 'HTS'):
        # simple HTS labels
        logger.info(
            'preparing label data (input) using standard HTS style labels')
        label_normaliser.perform_normalisation(in_label_align_file_list,
                                               binary_label_file_list,
                                               label_type=cfg.label_type)

        if cfg.additional_features:
            # append each extra feature stream onto the binary labels in turn
            out_feat_dir = os.path.join(data_dir, 'binary_label_' + suffix)
            out_feat_file_list = prepare_file_path_list(
                file_id_list, out_feat_dir, cfg.lab_ext)
            in_dim = label_normaliser.dimension
            for new_feature, new_feature_dim in cfg.additional_features.iteritems():
                new_feat_dir = os.path.join(data_dir, new_feature)
                new_feat_file_list = prepare_file_path_list(
                    file_id_list, new_feat_dir, '.' + new_feature)
                merger = MergeFeat(lab_dim=in_dim, feat_dim=new_feature_dim)
                merger.merge_data(binary_label_file_list, new_feat_file_list,
                                  out_feat_file_list)
                in_dim += new_feature_dim
                binary_label_file_list = out_feat_file_list

        remover = SilenceRemover(n_cmp=lab_dim,
                                 silence_pattern=cfg.silence_pattern,
                                 label_type=cfg.label_type,
                                 remove_frame_features=cfg.add_frame_features,
                                 subphone_feats=cfg.subphone_feats)
        remover.remove_silence(binary_label_file_list,
                               in_label_align_file_list, nn_label_file_list)

        min_max_normaliser = MinMaxNormalisation(feature_dimension=lab_dim,
                                                 min_value=0.01,
                                                 max_value=0.99)
        ###use only training data to find min-max information, then apply on the whole dataset
        if cfg.GenTestList:
            # at synthesis time, reuse the stats saved during training
            min_max_normaliser.load_min_max_values(label_norm_file)
        else:
            min_max_normaliser.find_min_max_values(
                nn_label_file_list[0:cfg.train_file_number])

        ### enforce silence such that the normalization runs without removing silence: only for final synthesis
        if cfg.GenTestList and cfg.enforce_silence:
            min_max_normaliser.normalise_data(binary_label_file_list,
                                              nn_label_norm_file_list)
        else:
            min_max_normaliser.normalise_data(nn_label_file_list,
                                              nn_label_norm_file_list)

    if cfg.NORMLAB and (cfg.label_style == 'composed'):
        # new flexible label preprocessor
        logger.info(
            'preparing label data (input) using "composed" style labels')
        label_composer = LabelComposer()
        label_composer.load_label_configuration(cfg.label_config_file)
        logger.info('Loaded label configuration')
        # logger.info('%s' % label_composer.configuration.labels )

        lab_dim = label_composer.compute_label_dimension()
        logger.info('label dimension will be %d' % lab_dim)

        if cfg.precompile_xpaths:
            label_composer.precompile_xpaths()

        # there are now a set of parallel input label files (e.g, one set of HTS and another set of Ossian trees)
        # create all the lists of these, ready to pass to the label composer
        in_label_align_file_list = {}
        for label_style, label_style_required in label_composer.label_styles.iteritems():
            if label_style_required:
                logger.info(
                    'labels of style %s are required - constructing file paths for them'
                    % label_style)
                if label_style == 'xpath':
                    in_label_align_file_list['xpath'] = prepare_file_path_list(
                        file_id_list, cfg.xpath_label_align_dir, cfg.utt_ext,
                        False)
                elif label_style == 'hts':
                    in_label_align_file_list['hts'] = prepare_file_path_list(
                        file_id_list, cfg.hts_label_align_dir, cfg.lab_ext,
                        False)
                else:
                    logger.critical(
                        'unsupported label style %s specified in label configuration'
                        % label_style)
                    raise Exception

        # now iterate through the files, one at a time, constructing the labels for them
        num_files = len(file_id_list)
        logger.info('the label styles required are %s' %
                    label_composer.label_styles)

        for i in xrange(num_files):
            logger.info('making input label features for %4d of %4d' %
                        (i + 1, num_files))

            # iterate through the required label styles and open each corresponding label file
            # a dictionary of file descriptors, pointing at the required files
            required_labels = {}
            for label_style, label_style_required in label_composer.label_styles.iteritems():
                # the files will be a parallel set of files for a single utterance
                # e.g., the XML tree and an HTS label file
                if label_style_required:
                    required_labels[label_style] = open(
                        in_label_align_file_list[label_style][i], 'r')
                    logger.debug(' opening label file %s' %
                                 in_label_align_file_list[label_style][i])

            logger.debug('label styles with open files: %s' % required_labels)
            label_composer.make_labels(
                required_labels,
                out_file_name=binary_label_file_list[i],
                fill_missing_values=cfg.fill_missing_values,
                iterate_over_frames=cfg.iterate_over_frames)

            # now close all opened files
            for fd in required_labels.itervalues():
                fd.close()

        # silence removal
        if cfg.remove_silence_using_binary_labels:
            silence_feature = 0  ## use first feature in label -- hardcoded for now
            logger.info('Silence removal from label using silence feature: %s' %
                        (label_composer.configuration.labels[silence_feature]))
            logger.info('Silence will be removed from CMP files in same way')
            ## Binary labels have 2 roles: both the thing trimmed and the instructions for trimming:
            trim_silence(binary_label_file_list, nn_label_file_list, lab_dim, \
                         binary_label_file_list, lab_dim, silence_feature)
        else:
            logger.info('No silence removal done')
            # start from the labels we have just produced, not trimmed versions
            nn_label_file_list = binary_label_file_list

        min_max_normaliser = MinMaxNormalisation(feature_dimension=lab_dim,
                                                 min_value=0.01,
                                                 max_value=0.99)
        ###use only training data to find min-max information, then apply on the whole dataset
        min_max_normaliser.find_min_max_values(
            nn_label_file_list[0:cfg.train_file_number])
        min_max_normaliser.normalise_data(nn_label_file_list,
                                          nn_label_norm_file_list)

    if min_max_normaliser != None and not cfg.GenTestList:
        ### save label normalisation information for unseen testing labels
        label_min_vector = min_max_normaliser.min_vector
        label_max_vector = min_max_normaliser.max_vector
        label_norm_info = numpy.concatenate(
            (label_min_vector, label_max_vector), axis=0)
        label_norm_info = numpy.array(label_norm_info, 'float32')
        fid = open(label_norm_file, 'wb')
        label_norm_info.tofile(fid)
        fid.close()
        logger.info('saved %s vectors to %s' %
                    (label_min_vector.size, label_norm_file))

    ### make output duration data
    if cfg.MAKEDUR:
        logger.info('creating duration (output) features')
        label_type = cfg.label_type
        feature_type = cfg.dur_feature_type
        label_normaliser.prepare_dur_data(in_label_align_file_list,
                                          dur_file_list, label_type,
                                          feature_type)

    ### make output acoustic data
    if cfg.MAKECMP:
        logger.info('creating acoustic (output) features')
        delta_win = cfg.delta_win  #[-0.5, 0.0, 0.5]
        acc_win = cfg.acc_win  #[1.0, -2.0, 1.0]

        acoustic_worker = AcousticComposition(delta_win=delta_win,
                                              acc_win=acc_win)
        if 'dur' in cfg.in_dir_dict.keys() and cfg.AcousticModel:
            acoustic_worker.make_equal_frames(dur_file_list, lf0_file_list,
                                              cfg.in_dimension_dict)
        acoustic_worker.prepare_nn_data(in_file_list_dict, nn_cmp_file_list,
                                        cfg.in_dimension_dict,
                                        cfg.out_dimension_dict)

        if cfg.remove_silence_using_binary_labels:
            ## do this to get lab_dim:
            label_composer = LabelComposer()
            label_composer.load_label_configuration(cfg.label_config_file)
            lab_dim = label_composer.compute_label_dimension()

            silence_feature = 0  ## use first feature in label -- hardcoded for now
            logger.info('Silence removal from CMP using binary label file')

            ## overwrite the untrimmed audio with the trimmed version:
            trim_silence(nn_cmp_file_list, nn_cmp_file_list, cfg.cmp_dim,
                         binary_label_file_list, lab_dim, silence_feature)
        else:
            ## back off to previous method using HTS labels:
            remover = SilenceRemover(
                n_cmp=cfg.cmp_dim,
                silence_pattern=cfg.silence_pattern,
                label_type=cfg.label_type,
                remove_frame_features=cfg.add_frame_features,
                subphone_feats=cfg.subphone_feats)
            remover.remove_silence(
                nn_cmp_file_list[0:cfg.train_file_number +
                                 cfg.valid_file_number],
                in_label_align_file_list[0:cfg.train_file_number +
                                         cfg.valid_file_number],
                nn_cmp_file_list[0:cfg.train_file_number +
                                 cfg.valid_file_number])  # save to itself

    ### save acoustic normalisation information for normalising the features back
    var_dir = os.path.join(data_dir, 'var')
    if not os.path.exists(var_dir):
        os.makedirs(var_dir)

    var_file_dict = {}
    for feature_name in cfg.out_dimension_dict.keys():
        var_file_dict[feature_name] = os.path.join(
            var_dir,
            feature_name + '_' + str(cfg.out_dimension_dict[feature_name]))

    ### normalise output acoustic data
    if cfg.NORMCMP:
        logger.info('normalising acoustic (output) features using method %s' %
                    cfg.output_feature_normalisation)
        cmp_norm_info = None
        if cfg.output_feature_normalisation == 'MVN':
            normaliser = MeanVarianceNorm(feature_dimension=cfg.cmp_dim)
            ###calculate mean and std vectors on the training data, and apply on the whole dataset
            global_mean_vector = normaliser.compute_mean(
                nn_cmp_file_list[0:cfg.train_file_number], 0, cfg.cmp_dim)
            global_std_vector = normaliser.compute_std(
                nn_cmp_file_list[0:cfg.train_file_number], global_mean_vector,
                0, cfg.cmp_dim)
            normaliser.feature_normalisation(
                nn_cmp_file_list[0:cfg.train_file_number +
                                 cfg.valid_file_number],
                nn_cmp_norm_file_list[0:cfg.train_file_number +
                                      cfg.valid_file_number])
            cmp_norm_info = numpy.concatenate(
                (global_mean_vector, global_std_vector), axis=0)
        elif cfg.output_feature_normalisation == 'MINMAX':
            # mean/std are still computed here because the per-stream variance
            # files written below need them
            min_max_normaliser = MinMaxNormalisation(
                feature_dimension=cfg.cmp_dim)
            global_mean_vector = min_max_normaliser.compute_mean(
                nn_cmp_file_list[0:cfg.train_file_number])
            global_std_vector = min_max_normaliser.compute_std(
                nn_cmp_file_list[0:cfg.train_file_number], global_mean_vector)

            min_max_normaliser = MinMaxNormalisation(
                feature_dimension=cfg.cmp_dim, min_value=0.01, max_value=0.99)
            min_max_normaliser.find_min_max_values(
                nn_cmp_file_list[0:cfg.train_file_number])
            min_max_normaliser.normalise_data(nn_cmp_file_list,
                                              nn_cmp_norm_file_list)

            cmp_min_vector = min_max_normaliser.min_vector
            cmp_max_vector = min_max_normaliser.max_vector
            cmp_norm_info = numpy.concatenate((cmp_min_vector, cmp_max_vector),
                                              axis=0)
        else:
            logger.critical('Normalisation type %s is not supported!\n' %
                            (cfg.output_feature_normalisation))
            raise

        cmp_norm_info = numpy.array(cmp_norm_info, 'float32')
        fid = open(norm_info_file, 'wb')
        cmp_norm_info.tofile(fid)
        fid.close()
        logger.info('saved %s vectors to %s' %
                    (cfg.output_feature_normalisation, norm_info_file))

        # write one variance file per output feature stream (used by MLPG later)
        feature_index = 0
        for feature_name in cfg.out_dimension_dict.keys():
            feature_std_vector = numpy.array(
                global_std_vector[:, feature_index:feature_index +
                                  cfg.out_dimension_dict[feature_name]],
                'float32')
            fid = open(var_file_dict[feature_name], 'w')
            feature_var_vector = feature_std_vector**2
            feature_var_vector.tofile(fid)
            fid.close()
            logger.info('saved %s variance vector to %s' %
                        (feature_name, var_file_dict[feature_name]))
            feature_index += cfg.out_dimension_dict[feature_name]

    train_x_file_list = nn_label_norm_file_list[0:cfg.train_file_number]
    train_y_file_list = nn_cmp_norm_file_list[0:cfg.train_file_number]
    valid_x_file_list = nn_label_norm_file_list[
        cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number]
    valid_y_file_list = nn_cmp_norm_file_list[
        cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number]
    test_x_file_list = nn_label_norm_file_list[
        cfg.train_file_number + cfg.valid_file_number:cfg.train_file_number +
        cfg.valid_file_number + cfg.test_file_number]
    test_y_file_list = nn_cmp_norm_file_list[
        cfg.train_file_number + cfg.valid_file_number:cfg.train_file_number +
        cfg.valid_file_number + cfg.test_file_number]

    ### generate parameters from DNN
    temp_dir_name = '%s_%s_%d_%d_%d_%d_%d_%d_%d' \
                    %(cfg.combined_model_name, cfg.combined_feature_name, int(cfg.do_post_filtering), \
                      cfg.train_file_number, lab_dim, cfg.cmp_dim, \
                      len(hidden_layer_size), hidden_layer_size[0], hidden_layer_size[-1])
    gen_dir = os.path.join(gen_dir, temp_dir_name)

    gen_file_id_list = file_id_list[
        cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number +
        cfg.test_file_number]
    test_x_file_list = nn_label_norm_file_list[
        cfg.train_file_number:cfg.train_file_number + cfg.valid_file_number +
        cfg.test_file_number]

    if cfg.GenTestList:
        gen_file_id_list = test_id_list
        test_x_file_list = nn_label_norm_file_list
        ### comment the below line if you don't want the files in a separate folder
        gen_dir = cfg.test_synth_dir

    if cfg.DNNGEN:
        logger.info('generating from DNN')

        try:
            os.makedirs(gen_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                # not an error - just means directory already exists
                pass
            else:
                logger.critical('Failed to create generation directory %s' %
                                gen_dir)
                logger.critical(' OS error was: %s' % e.strerror)
                raise

        gen_file_list = prepare_file_path_list(gen_file_id_list, gen_dir,
                                               cfg.cmp_ext)

        #dnn_generation(test_x_file_list, nnets_file_name, lab_dim, cfg.cmp_dim, gen_file_list)
        dnn_generation_yuhao(test_x_file_list, dnn_model, lab_dim, cfg.cmp_dim,
                             gen_file_list)

        logger.debug('denormalising generated output using method %s' %
                     cfg.output_feature_normalisation)

        fid = open(norm_info_file, 'rb')
        cmp_min_max = numpy.fromfile(fid, dtype=numpy.float32)
        fid.close()
        cmp_min_max = cmp_min_max.reshape((2, -1))
        cmp_min_vector = cmp_min_max[0, ]
        cmp_max_vector = cmp_min_max[1, ]

        if cfg.output_feature_normalisation == 'MVN':
            denormaliser = MeanVarianceNorm(feature_dimension=cfg.cmp_dim)
            denormaliser.feature_denormalisation(gen_file_list, gen_file_list,
                                                 cmp_min_vector,
                                                 cmp_max_vector)
        elif cfg.output_feature_normalisation == 'MINMAX':
            denormaliser = MinMaxNormalisation(cfg.cmp_dim,
                                               min_value=0.01,
                                               max_value=0.99,
                                               min_vector=cmp_min_vector,
                                               max_vector=cmp_max_vector)
            denormaliser.denormalise_data(gen_file_list, gen_file_list)
        else:
            logger.critical('denormalising method %s is not supported!\n' %
                            (cfg.output_feature_normalisation))
            raise

        if cfg.AcousticModel:
            ##perform MLPG to smooth parameter trajectory
            ## lf0 is included, the output features much have vuv.
            generator = ParameterGeneration(
                gen_wav_features=cfg.gen_wav_features,
                enforce_silence=cfg.enforce_silence)
            generator.acoustic_decomposition(gen_file_list,
                                             cfg.cmp_dim,
                                             cfg.out_dimension_dict,
                                             cfg.file_extension_dict,
                                             var_file_dict,
                                             do_MLPG=cfg.do_MLPG,
                                             cfg=cfg)

        if cfg.DurationModel:
            ### Perform duration normalization(min. state dur set to 1) ###
            gen_dur_list = prepare_file_path_list(gen_file_id_list, gen_dir,
                                                  cfg.dur_ext)
            gen_label_list = prepare_file_path_list(gen_file_id_list, gen_dir,
                                                    cfg.lab_ext)
            in_gen_label_align_file_list = prepare_file_path_list(
                gen_file_id_list, cfg.in_label_align_dir, cfg.lab_ext, False)

            generator = ParameterGeneration(
                gen_wav_features=cfg.gen_wav_features)
            generator.duration_decomposition(gen_file_list, cfg.cmp_dim,
                                             cfg.out_dimension_dict,
                                             cfg.file_extension_dict)

            label_modifier = HTSLabelModification(
                silence_pattern=cfg.silence_pattern, label_type=cfg.label_type)
            label_modifier.modify_duration_labels(in_gen_label_align_file_list,
                                                  gen_dur_list, gen_label_list)

    ### generate wav
    if cfg.GENWAV:
        logger.info('reconstructing waveform(s)')
        generate_wav(gen_dir, gen_file_id_list, cfg)  # generated speech
        # generate_wav(nn_cmp_dir, gen_file_id_list, cfg)     # reference copy synthesis speech

    ### setting back to original conditions before calculating objective scores ###
    if cfg.GenTestList:
        in_label_align_file_list = prepare_file_path_list(
            file_id_list, cfg.in_label_align_dir, cfg.lab_ext, False)
        binary_label_file_list = prepare_file_path_list(
            file_id_list, binary_label_dir, cfg.lab_ext)
        gen_file_id_list = file_id_list[
            cfg.train_file_number:cfg.train_file_number +
            cfg.valid_file_number + cfg.test_file_number]

    ### evaluation: RMSE and CORR for duration
    if cfg.CALMCD and cfg.DurationModel:
        logger.info('calculating MCD')

        ref_data_dir = os.path.join(data_dir, 'ref_data')

        ref_dur_list = prepare_file_path_list(gen_file_id_list, ref_data_dir,
                                              cfg.dur_ext)

        in_gen_label_align_file_list = in_label_align_file_list[
            cfg.train_file_number:cfg.train_file_number +
            cfg.valid_file_number + cfg.test_file_number]
        calculator = IndividualDistortionComp()

        valid_file_id_list = file_id_list[
            cfg.train_file_number:cfg.train_file_number +
            cfg.valid_file_number]
        test_file_id_list = file_id_list[
            cfg.train_file_number + cfg.valid_file_number:cfg.
            train_file_number + cfg.valid_file_number + cfg.test_file_number]

        if cfg.remove_silence_using_binary_labels:
            # NOTE(review): `untrimmed_test_labels`, `lab_dim` and
            # `silence_feature` are only defined later in the acoustic-model
            # evaluation branch; taking this path for a duration model alone
            # would raise NameError -- present in the original code, kept as-is.
            untrimmed_reference_data = in_file_list_dict['dur'][
                cfg.train_file_number:cfg.train_file_number +
                cfg.valid_file_number + cfg.test_file_number]
            trim_silence(untrimmed_reference_data, ref_dur_list, cfg.dur_dim, \
                         untrimmed_test_labels, lab_dim, silence_feature)
        else:
            remover = SilenceRemover(
                n_cmp=cfg.dur_dim,
                silence_pattern=cfg.silence_pattern,
                label_type=cfg.label_type,
                remove_frame_features=cfg.add_frame_features)
            remover.remove_silence(
                in_file_list_dict['dur'][cfg.train_file_number:cfg.
                                         train_file_number +
                                         cfg.valid_file_number +
                                         cfg.test_file_number],
                in_gen_label_align_file_list, ref_dur_list)

        valid_dur_rmse, valid_dur_corr = calculator.compute_distortion(
            valid_file_id_list, ref_data_dir, gen_dir, cfg.dur_ext,
            cfg.dur_dim)
        test_dur_rmse, test_dur_corr = calculator.compute_distortion(
            test_file_id_list, ref_data_dir, gen_dir, cfg.dur_ext, cfg.dur_dim)

        logger.info('Develop: DNN -- RMSE: %.3f frames/phoneme; CORR: %.3f; ' \
                    %(valid_dur_rmse, valid_dur_corr))
        logger.info('Test: DNN -- RMSE: %.3f frames/phoneme; CORR: %.3f; ' \
                    %(test_dur_rmse, test_dur_corr))

    ### evaluation: calculate distortion
    if cfg.CALMCD and cfg.AcousticModel:
        logger.info('calculating MCD')

        ref_data_dir = os.path.join(data_dir, 'ref_data')

        ref_mgc_list = prepare_file_path_list(gen_file_id_list, ref_data_dir,
                                              cfg.mgc_ext)
        ref_bap_list = prepare_file_path_list(gen_file_id_list, ref_data_dir,
                                              cfg.bap_ext)
        ref_lf0_list = prepare_file_path_list(gen_file_id_list, ref_data_dir,
                                              cfg.lf0_ext)

        in_gen_label_align_file_list = in_label_align_file_list[
            cfg.train_file_number:cfg.train_file_number +
            cfg.valid_file_number + cfg.test_file_number]
        calculator = IndividualDistortionComp()

        spectral_distortion = 0.0
        bap_mse = 0.0
        f0_mse = 0.0
        vuv_error = 0.0

        valid_file_id_list = file_id_list[
            cfg.train_file_number:cfg.train_file_number +
            cfg.valid_file_number]
        test_file_id_list = file_id_list[
            cfg.train_file_number + cfg.valid_file_number:cfg.
            train_file_number + cfg.valid_file_number + cfg.test_file_number]

        if cfg.remove_silence_using_binary_labels:
            ## get lab_dim:
            label_composer = LabelComposer()
            label_composer.load_label_configuration(cfg.label_config_file)
            lab_dim = label_composer.compute_label_dimension()

            ## use first feature in label -- hardcoded for now
            silence_feature = 0

            ## Use these to trim silence:
            untrimmed_test_labels = binary_label_file_list[
                cfg.train_file_number:cfg.train_file_number +
                cfg.valid_file_number + cfg.test_file_number]

        if cfg.in_dimension_dict.has_key('mgc'):
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['mgc'][
                    cfg.train_file_number:cfg.train_file_number +
                    cfg.valid_file_number + cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_mgc_list, cfg.mgc_dim, \
                             untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp=cfg.mgc_dim,
                                         silence_pattern=cfg.silence_pattern,
                                         label_type=cfg.label_type)
                remover.remove_silence(
                    in_file_list_dict['mgc'][cfg.train_file_number:cfg.
                                             train_file_number +
                                             cfg.valid_file_number +
                                             cfg.test_file_number],
                    in_gen_label_align_file_list, ref_mgc_list)
            valid_spectral_distortion = calculator.compute_distortion(
                valid_file_id_list, ref_data_dir, gen_dir, cfg.mgc_ext,
                cfg.mgc_dim)
            test_spectral_distortion = calculator.compute_distortion(
                test_file_id_list, ref_data_dir, gen_dir, cfg.mgc_ext,
                cfg.mgc_dim)
            valid_spectral_distortion *= (10 / numpy.log(10)) * numpy.sqrt(
                2.0)  ##MCD
            test_spectral_distortion *= (10 / numpy.log(10)) * numpy.sqrt(
                2.0)  ##MCD

        if cfg.in_dimension_dict.has_key('bap'):
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['bap'][
                    cfg.train_file_number:cfg.train_file_number +
                    cfg.valid_file_number + cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_bap_list, cfg.bap_dim, \
                             untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp=cfg.bap_dim,
                                         silence_pattern=cfg.silence_pattern,
                                         label_type=cfg.label_type)
                remover.remove_silence(
                    in_file_list_dict['bap'][cfg.train_file_number:cfg.
                                             train_file_number +
                                             cfg.valid_file_number +
                                             cfg.test_file_number],
                    in_gen_label_align_file_list, ref_bap_list)
            valid_bap_mse = calculator.compute_distortion(
                valid_file_id_list, ref_data_dir, gen_dir, cfg.bap_ext,
                cfg.bap_dim)
            test_bap_mse = calculator.compute_distortion(
                test_file_id_list, ref_data_dir, gen_dir, cfg.bap_ext,
                cfg.bap_dim)
            valid_bap_mse = valid_bap_mse / 10.0  ##Cassia's bap is computed from 10*log|S(w)|. if use HTS/SPTK style, do the same as MGC
            test_bap_mse = test_bap_mse / 10.0  ##Cassia's bap is computed from 10*log|S(w)|. if use HTS/SPTK style, do the same as MGC

        if cfg.in_dimension_dict.has_key('lf0'):
            if cfg.remove_silence_using_binary_labels:
                untrimmed_reference_data = in_file_list_dict['lf0'][
                    cfg.train_file_number:cfg.train_file_number +
                    cfg.valid_file_number + cfg.test_file_number]
                trim_silence(untrimmed_reference_data, ref_lf0_list, cfg.lf0_dim, \
                             untrimmed_test_labels, lab_dim, silence_feature)
            else:
                remover = SilenceRemover(n_cmp=cfg.lf0_dim,
                                         silence_pattern=cfg.silence_pattern,
                                         label_type=cfg.label_type)
                remover.remove_silence(
                    in_file_list_dict['lf0'][cfg.train_file_number:cfg.
                                             train_file_number +
                                             cfg.valid_file_number +
                                             cfg.test_file_number],
                    in_gen_label_align_file_list, ref_lf0_list)
            # FIX(review): the original text had a commented-out gnp GPU-lock
            # block spliced into the middle of the identifier
            # `valid_vuv_error`, splitting this assignment in two and leaving
            # `valid_vuv_error` undefined.  Restored to a single assignment.
            valid_f0_mse, valid_f0_corr, valid_vuv_error = calculator.compute_distortion(
                valid_file_id_list, ref_data_dir, gen_dir, cfg.lf0_ext,
                cfg.lf0_dim)
            test_f0_mse, test_f0_corr, test_vuv_error = calculator.compute_distortion(
                test_file_id_list, ref_data_dir, gen_dir, cfg.lf0_ext,
                cfg.lf0_dim)

        # NOTE(review): these summaries assume mgc/bap/lf0 were all present in
        # cfg.in_dimension_dict; otherwise the referenced metrics are unset
        # (NameError) -- behaviour inherited from the original.
        logger.info('Develop: DNN -- MCD: %.3f dB; BAP: %.3f dB; F0:- RMSE: %.3f Hz; CORR: %.3f; VUV: %.3f%%' \
                    %(valid_spectral_distortion, valid_bap_mse, valid_f0_mse, valid_f0_corr, valid_vuv_error*100.))
        logger.info('Test   : DNN -- MCD: %.3f dB; BAP: %.3f dB; F0:- RMSE: %.3f Hz; CORR: %.3f; VUV: %.3f%%' \
                    %(test_spectral_distortion , test_bap_mse , test_f0_mse , test_f0_corr, test_vuv_error*100.))
def main_function(cfg, in_dir, out_dir):
    """Stand-alone synthesis driver: binarise and normalise the labels found in
    `in_dir`, generate acoustic parameters with a stored DNN, apply simple
    variance scaling, and write waveforms under `out_dir`.

    :param cfg: pipeline configuration object holding paths, dimensions and
        feature dictionaries.
    :param in_dir: directory of input label files (.lab for HTS style,
        .utt otherwise).
    :param out_dir: output directory; created if missing, receives 'gen'
        and '<gen>_scaled' subdirectories.
    """
    # get a logger for this main function
    logger = logging.getLogger("main")

    # get another logger to handle plotting duties
    plotlogger = logging.getLogger("plotting")

    #### parameter setting########
    hidden_layers_sizes = cfg.hyper_params['hidden_layer_size']

    # build the utterance id list from whatever label files exist in in_dir
    file_id_list = []

    if cfg.label_style == 'HTS':
        ext = '.lab'
    else:
        ext = '.utt'

    synth_utts = glob.glob(in_dir + '/*' + ext)
    for fname in synth_utts:
        junk,name = os.path.split(fname)
        file_id_list.append(name.replace(ext,''))

    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    ###total file number including training, development, and testing
    #total_file_number = len(file_id_list)

    data_dir = cfg.data_dir

    #nn_cmp_dir = os.path.join(data_dir, 'nn' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))
    #nn_cmp_norm_dir = os.path.join(data_dir, 'nn_norm' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim))

    model_dir = os.path.join(cfg.work_dir, 'nnets_model')
    gen_dir = os.path.join(out_dir, 'gen')

    #in_file_list_dict = {}
    #for feature_name in cfg.in_dir_dict.keys():
    #    in_file_list_dict[feature_name] = prepare_file_path_list(file_id_list, cfg.in_dir_dict[feature_name], cfg.file_extension_dict[feature_name], False)

    #nn_cmp_file_list = prepare_file_path_list(file_id_list, nn_cmp_dir, cfg.cmp_ext)
    #nn_cmp_norm_file_list = prepare_file_path_list(file_id_list, nn_cmp_norm_dir, cfg.cmp_ext)

    ###normalisation information
    norm_info_file = os.path.join(data_dir, 'norm_info' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim) + '_' + cfg.output_feature_normalisation + '.dat')

    ### normalise input full context label
    # currently supporting two different forms of lingustic features
    # later, we should generalise this
    if cfg.label_style == 'HTS':
        label_normaliser = HTSLabelNormalisation(question_file_name=cfg.question_file_name)
        lab_dim = label_normaliser.dimension
        logger.info('Input label dimension is %d' % lab_dim)
        suffix=str(lab_dim)
    # no longer supported - use new "composed" style labels instead
    elif cfg.label_style == 'composed':
        # label_normaliser = XMLLabelNormalisation(xpath_file_name=cfg.xpath_file_name)
        suffix='composed'

    # the number can be removed
    binary_label_dir = os.path.join(out_dir, 'lab_bin')
    nn_label_norm_dir = os.path.join(out_dir, 'lab_bin_norm')

    in_label_align_file_list = prepare_file_path_list(file_id_list, in_dir, cfg.lab_ext)
    binary_label_file_list = prepare_file_path_list(file_id_list, binary_label_dir, cfg.lab_ext)
    nn_label_norm_file_list = prepare_file_path_list(file_id_list, nn_label_norm_dir, cfg.lab_ext)

    ## need this to find normalisation info:
    if cfg.process_labels_in_work_dir:
        label_data_dir = cfg.work_dir
    else:
        label_data_dir = data_dir

    min_max_normaliser = None
    label_norm_file = 'label_norm_%s.dat' %(cfg.label_style)
    label_norm_file = os.path.join(label_data_dir, label_norm_file)

    if cfg.label_style == 'HTS':
        # simple HTS labels
        logger.info('preparing label data (input) using standard HTS style labels')
        label_normaliser.perform_normalisation(in_label_align_file_list, binary_label_file_list)
    else:
        # new flexible label preprocessor
        logger.info('preparing label data (input) using "composed" style labels')
        label_composer = LabelComposer()
        label_composer.load_label_configuration(cfg.label_config_file)
        logger.info('Loaded label configuration')
        # logger.info('%s' % label_composer.configuration.labels )

        lab_dim=label_composer.compute_label_dimension()
        logger.info('label dimension will be %d' % lab_dim)

        if cfg.precompile_xpaths:
            label_composer.precompile_xpaths()

        # there are now a set of parallel input label files (e.g, one set of HTS and another set of Ossian trees)
        # create all the lists of these, ready to pass to the label composer
        in_label_align_file_list = {}
        for label_style, label_style_required in label_composer.label_styles.iteritems():
            if label_style_required:
                logger.info('labels of style %s are required - constructing file paths for them' % label_style)
                if label_style == 'xpath':
                    in_label_align_file_list['xpath'] = prepare_file_path_list(file_id_list, in_dir, cfg.utt_ext, False)
                elif label_style == 'hts':
                    logger.critical('script not tested with HTS labels')
                else:
                    logger.critical('unsupported label style %s specified in label configuration' % label_style)
                    raise Exception

        # now iterate through the files, one at a time, constructing the labels for them
        num_files=len(file_id_list)
        logger.info('the label styles required are %s' % label_composer.label_styles)

        for i in xrange(num_files):
            logger.info('making input label features for %4d of %4d' % (i+1,num_files))

            # iterate through the required label styles and open each corresponding label file
            # a dictionary of file descriptors, pointing at the required files
            required_labels={}

            for label_style, label_style_required in label_composer.label_styles.iteritems():
                # the files will be a parallel set of files for a single utterance
                # e.g., the XML tree and an HTS label file
                if label_style_required:
                    required_labels[label_style] = open(in_label_align_file_list[label_style][i] , 'r')
                    logger.debug(' opening label file %s' % in_label_align_file_list[label_style][i])

            logger.debug('label styles with open files: %s' % required_labels)
            label_composer.make_labels(required_labels,out_file_name=binary_label_file_list[i],fill_missing_values=cfg.fill_missing_values,iterate_over_frames=cfg.iterate_over_frames)

            # now close all opened files
            for fd in required_labels.itervalues():
                fd.close()

    # no silence removal for synthesis ...

    ## minmax norm:
    min_max_normaliser = MinMaxNormalisation(feature_dimension = lab_dim, min_value = 0.01, max_value = 0.99)

    # reload stored minmax values: (TODO -- move reading and writing into MinMaxNormalisation class)
    fid = open(label_norm_file, 'rb')

    ## This doesn't work -- precision is lost -- reads in as float64
    #label_norm_info = numpy.fromfile(fid)  ##
    #label_norm_info = numpy.array(label_norm_info, 'float32')

    ## use struct to enforce float32:
    nbytes = os.stat(label_norm_file)[6]  # length in bytes
    data = fid.read(nbytes)               # = read until bytes run out
    fid.close()
    # NOTE: Python-2 integer division -- nbytes is a multiple of 4 (float32s)
    m = nbytes / 4   ## number 32 bit floats
    format = str(m)+"f"
    label_norm_info = struct.unpack(format, data)
    label_norm_info = numpy.array(label_norm_info)
    # first half of the file holds the min vector, second half the max vector
    min_max_normaliser.min_vector = label_norm_info[:m/2]
    min_max_normaliser.max_vector = label_norm_info[m/2:]

    ### apply precompuated min-max to the whole dataset
    min_max_normaliser.normalise_data(binary_label_file_list, nn_label_norm_file_list)

    ### make output acoustic data
    # if cfg.MAKECMP:

    ### retrieve acoustic normalisation information for normalising the features back
    var_dir = os.path.join(data_dir, 'var')
    var_file_dict = {}
    for feature_name in cfg.out_dimension_dict.keys():
        var_file_dict[feature_name] = os.path.join(var_dir, feature_name + '_' + str(cfg.out_dimension_dict[feature_name]))

    ### normalise output acoustic data
    # if cfg.NORMCMP:

    # reconstruct the stored model's file name from the config hyperparameters
    combined_model_arch = str(len(hidden_layers_sizes))
    for hid_size in hidden_layers_sizes:
        combined_model_arch += '_' + str(hid_size)
    nnets_file_name = '%s/%s_%s_%d_%s_%d.%d.train.%d.model' \
                      %(model_dir, cfg.model_type, cfg.combined_feature_name, int(cfg.multistream_switch),
                        combined_model_arch, lab_dim, cfg.cmp_dim, cfg.train_file_number)

    ### DNN model training
    # if cfg.TRAINDNN:

    ##if cfg.DNNGEN:
    logger.info('generating from DNN')

    try:
        os.makedirs(gen_dir)
    except OSError as e:
        if e.errno == errno.EEXIST:
            # not an error - just means directory already exists
            pass
        else:
            logger.critical('Failed to create generation directory %s' % gen_dir)
            logger.critical(' OS error was: %s' % e.strerror)
            raise

    gen_file_list = prepare_file_path_list(file_id_list, gen_dir, cfg.cmp_ext)

    dnn_generation(nn_label_norm_file_list, nnets_file_name, lab_dim, cfg.cmp_dim, gen_file_list)

    logger.debug('denormalising generated output using method %s' % cfg.output_feature_normalisation)

    # norm_info_file stores two stacked float32 vectors: min (or mean) then
    # max (or std), depending on the normalisation method
    fid = open(norm_info_file, 'rb')
    cmp_min_max = numpy.fromfile(fid, dtype=numpy.float32)
    fid.close()
    cmp_min_max = cmp_min_max.reshape((2, -1))
    cmp_min_vector = cmp_min_max[0, ]
    cmp_max_vector = cmp_min_max[1, ]

    if cfg.output_feature_normalisation == 'MVN':
        denormaliser = MeanVarianceNorm(feature_dimension = cfg.cmp_dim)
        denormaliser.feature_denormalisation(gen_file_list, gen_file_list, cmp_min_vector, cmp_max_vector)
    elif cfg.output_feature_normalisation == 'MINMAX':
        denormaliser = MinMaxNormalisation(cfg.cmp_dim, min_value = 0.01, max_value = 0.99, min_vector = cmp_min_vector, max_vector = cmp_max_vector)
        denormaliser.denormalise_data(gen_file_list, gen_file_list)
    else:
        logger.critical('denormalising method %s is not supported!\n' %(cfg.output_feature_normalisation))
        raise

    ##perform MLPG to smooth parameter trajectory
    ## lf0 is included, the output features much have vuv.
    generator = ParameterGeneration(gen_wav_features = cfg.gen_wav_features)
    generator.acoustic_decomposition(gen_file_list, cfg.cmp_dim, cfg.out_dimension_dict, cfg.file_extension_dict, var_file_dict)

    logger.info('Simple variance expansion')
    test_var_scaling=False
    scaled_dir = gen_dir + '_scaled'
    if test_var_scaling:
        file_id_list = simple_scale_variance_CONTINUUM(gen_dir, scaled_dir, var_file_dict, cfg.out_dimension_dict, file_id_list)
    else:
        simple_scale_variance(gen_dir, scaled_dir, var_file_dict, cfg.out_dimension_dict, file_id_list, gv_weight=1.0)  ## gv_weight hard coded here!

    ### generate wav ----
    #if cfg.GENWAV:
    logger.info('reconstructing waveform(s)')
    #generate_wav_glottHMM(scaled_dir, file_id_list)
    generate_wav(scaled_dir, file_id_list, cfg)
def __init__(self, cfg):
    """Build all configuration-derived state for one Keras training run.

    Copies the relevant fields from *cfg* (the parsed configuration object),
    writes a ``stream_info.txt`` file used later at synthesis time, builds the
    train/valid/test file-id splits and all derived file-path lists, and
    constructs the ``TrainKerasModels`` wrapper.

    Side effects: writes ``<cfg.model_dir>/stream_info.txt``; if
    ``cfg.shuffle_data`` is set, shuffles the file-id list in place
    (seeded, so reproducible).

    :param cfg: configuration object exposing the attributes read below
        (see ``user_configuration``).
    """
    # model type (duration or acoustic)
    self.model_output_type = cfg.model_output_type

    # ----------------------------------------------------
    # ------------------- Input-Output -------------------
    # ----------------------------------------------------
    self.label_type = cfg.label_type
    self.cmp_ext = cfg.cmp_ext
    inp_file_ext = cfg.inp_file_ext
    out_file_ext = cfg.out_file_ext

    # NOTE(review): add_frame_features arrives as a *string* from the config
    # and is converted to bool here by string comparison.
    self.label_normaliser = HTSLabelNormalisation(
        question_file_name=cfg.question_file_name,
        add_frame_features=cfg.add_frame_features == 'True',  # must be bool
        subphone_feats=cfg.subphone_feats)

    # Create streams files (they store data from dimension dictionaries for synthesis)
    in_streams = sorted(cfg.in_dimension_dict.keys())
    indims = [str(cfg.in_dimension_dict[s]) for s in in_streams]
    self.out_streams = sorted(cfg.out_dimension_dict.keys())
    self.outdims = [
        str(cfg.out_dimension_dict[s]) for s in self.out_streams
    ]
    # Four lines: input stream names, input dims, output stream names, output dims.
    with open(os.path.join(cfg.model_dir, 'stream_info.txt'), 'w') as f:
        f.write(' '.join(in_streams) + '\n')
        f.write(' '.join(indims) + '\n')
        f.write(' '.join(self.out_streams) + '\n')
        f.write(' '.join(self.outdims) + '\n')

    # Input output dimensions
    self.inp_dim = cfg.inp_dim
    # NOTE(review): no else branch -- an unknown model_output_type leaves
    # self.out_dim unset here; it is only rejected further below.
    if self.model_output_type == 'duration':
        self.out_dim = cfg.dur_dim
    elif self.model_output_type == 'acoustic':
        self.out_dim = cfg.cmp_dim

    # Data normalization method
    self.inp_norm = cfg.inp_norm
    self.out_norm = cfg.out_norm

    # Norm stats files
    self.inp_stats_file = cfg.inp_stats_file
    self.out_stats_file_list = cfg.out_stats_file_list

    self.speaker_id = cfg.speaker_id
    self.shared_layer_flag = cfg.shared_layer_flag

    # Scalers are created/loaded later (see normalize_data in the owning class).
    self.inp_scaler = None
    self.out_scaler = None

    # ---------------------------------------------------
    # ------------------- Directories -------------------
    # ---------------------------------------------------
    self.plot_dir = os.path.join(cfg.plot_dir, cfg.nnets_file_name)

    # Select data directories based on model input-output type
    if self.model_output_type == 'duration':
        # Input
        self.inp_feat_dir = cfg.inp_feat_dir_dur
        self.bin_lab_dir = cfg.bin_lab_dir_dur
        self.bin_lab_dir_nosilence = cfg.bin_lab_dir_dur_nosilence
        self.bin_lab_dir_nosilence_norm = cfg.bin_lab_dir_dur_nosilence_norm
        # Output
        self.out_feat_dir = cfg.out_feat_dir_dur
        self.out_feat_dir_norm = cfg.out_feat_dir_dur_norm
    elif self.model_output_type == 'acoustic':
        # Input
        self.inp_feat_dir = cfg.inp_feat_dir_cmp
        self.bin_lab_dir = cfg.bin_lab_dir_cmp
        self.bin_lab_dir_nosilence = cfg.bin_lab_dir_cmp_nosilence
        self.bin_lab_dir_nosilence_norm = cfg.bin_lab_dir_cmp_nosilence_norm
        # Output
        self.out_feat_dir = cfg.nn_cmp_dir
        self.out_feat_dir_norm = cfg.nn_cmp_norm_dir
    else:
        print("invalid model output type")
        # NOTE(review): a bare raise with no active exception produces a
        # RuntimeError; an explicit exception type would be clearer.
        raise

    # --------------------------------------------------------
    # ------------------- Model Parameters -------------------
    # --------------------------------------------------------
    self.sequential_training = cfg.sequential_training
    self.stateful = cfg.stateful
    self.json_model_file = cfg.json_model_file
    self.h5_model_file = cfg.h5_model_file
    self.model_params_file = cfg.model_params_file

    # -----------------------------------------------------------
    # ------------------- Generate file lists -------------------
    # -----------------------------------------------------------
    train_file_number = cfg.train_file_number
    valid_file_number = cfg.valid_file_number
    test_file_number = cfg.test_file_number

    # List of file ids
    self.file_id_scp = cfg.file_id_scp

    # Create train, valid and test file lists
    self.file_id_list = data_utils.read_file_list(self.file_id_scp)
    if cfg.shuffle_data:
        # Fixed seed keeps the split reproducible across runs.
        random.seed(1)
        random.shuffle(self.file_id_list
                       )  # Shuffle to get random valid and test utterances
    self.train_id_list = self.file_id_list[0:train_file_number]
    self.valid_id_list = self.file_id_list[
        train_file_number:train_file_number + valid_file_number]
    self.test_id_list = self.file_id_list[
        train_file_number + valid_file_number:train_file_number +
        valid_file_number + test_file_number]

    # Intermediate file lists
    self.inp_feat_file_list = data_utils.prepare_file_path_list(
        self.file_id_list, self.inp_feat_dir, inp_file_ext)
    self.bin_lab_file_list = data_utils.prepare_file_path_list(
        self.file_id_list, self.bin_lab_dir, inp_file_ext)
    self.bin_lab_nosilence_file_list = data_utils.prepare_file_path_list(
        self.file_id_list, self.bin_lab_dir_nosilence, inp_file_ext)

    # Train, test, validation file lists
    self.inp_train_file_list = data_utils.prepare_file_path_list(
        self.train_id_list, self.bin_lab_dir_nosilence, inp_file_ext)
    self.out_train_file_list = data_utils.prepare_file_path_list(
        self.train_id_list, self.out_feat_dir, out_file_ext)
    self.inp_valid_file_list = data_utils.prepare_file_path_list(
        self.valid_id_list, self.bin_lab_dir_nosilence, inp_file_ext)
    self.out_valid_file_list = data_utils.prepare_file_path_list(
        self.valid_id_list, self.out_feat_dir, out_file_ext)
    self.inp_test_file_list = data_utils.prepare_file_path_list(
        self.test_id_list, self.bin_lab_dir_nosilence, inp_file_ext)
    self.out_test_file_list = data_utils.prepare_file_path_list(
        self.test_id_list, self.out_feat_dir, out_file_ext)

    # For cmp files generated as targets (applies to acoustic model only)
    self.nn_cmp_file_list = []
    self.nn_cmp_norm_file_list = []

    self.in_file_list_dict = {}
    for feature_name in list(cfg.in_dir_dict.keys()):
        self.in_file_list_dict[
            feature_name] = data_utils.prepare_file_path_list(
                self.file_id_list, cfg.in_dir_dict[feature_name],
                cfg.file_extension_dict[feature_name], False)

    # self.gen_test_file_list = data_utils.prepare_file_path_list(self.test_id_list, pred_feat_dir, out_file_ext)
    # if self.GenTestList:
    #     test_id_list = data_utils.read_file_list(test_id_scp)
    #     self.inp_test_file_list = data_utils.prepare_file_path_list(test_id_list, inp_feat_dir, inp_file_ext)
    #     self.gen_test_file_list = data_utils.prepare_file_path_list(test_id_list, pred_feat_dir, out_file_ext)

    # ------------------------------------------------------
    # ------------------- Main Processes -------------------
    # ------------------------------------------------------
    self.MAKELAB = cfg.MAKELAB  # make binary labels (required step before normalization and training)
    self.MAKECMP = cfg.MAKECMP
    self.NORMDATA = cfg.NORMDATA  # normalizes input and output data, creates data scaling objects
    self.TRAINDNN = cfg.TRAINDNN  # train the Keras model
    self.TESTDNN = cfg.TESTDNN  # test the Keras model

    # ----------------------------------------------------------
    # ------------------- Define Keras Model -------------------
    # ----------------------------------------------------------
    self.batch_size = cfg.batch_size
    model_params = {
        'inp_dim': self.inp_dim,
        'hidden_layer_size': cfg.hidden_layer_size,
        'shared_layer_flag': cfg.shared_layer_flag,
        'speaker_id': cfg.speaker_id,
        'out_dim': self.out_dim,
        'hidden_layer_type': cfg.hidden_layer_type,
        'output_layer_type': cfg.output_layer_type,
        'dropout_rate': cfg.dropout_rate,
        'loss_function': cfg.loss_function,
        'optimizer': cfg.optimizer,
        'l1': cfg.l1_reg,
        'l2': cfg.l2_reg,
        'gpu_num': cfg.gpu_num
    }
    rnn_params = {
        'merge_size': cfg.merge_size,
        'seq_length': cfg.seq_length,
        'bucket_range': cfg.bucket_range,
        'stateful': cfg.stateful,
        'training_algo': cfg.training_algo
    }
    training_params = {
        'batch_size': cfg.batch_size,
        'num_of_epochs': cfg.num_of_epochs,
        'shuffle_data': cfg.shuffle_data,
        'tensorboard_dir': os.path.join(cfg.plot_dir, cfg.nnets_file_name),
        'stopping_patience': cfg.stopping_patience,
        'restore_best_weights': cfg.restore_best_weights
    }
    self.keras_models = TrainKerasModels(model_params=model_params,
                                         rnn_params=rnn_params,
                                         training_params=training_params)
class KerasClass(object):
    """Driver for one Keras duration- or acoustic-model training run.

    Wraps the full pipeline: binary label preparation (``make_labels``),
    acoustic target composition (``make_cmp``), normalisation statistics
    (``normalize_data``), training (``train_keras_model``) and testing
    (``test_keras_model``), each gated by a Processes flag in
    ``main_function``.

    NOTE(review): several methods below reference a bare module-level
    ``cfg`` instead of attributes copied in ``__init__`` -- they only work
    when a global ``cfg`` exists in this module. Worth unifying.
    """

    def __init__(self, cfg):
        """Copy configuration into instance state and build file lists.

        Side effects: writes ``<cfg.model_dir>/stream_info.txt``; if
        ``cfg.shuffle_data`` is set, shuffles the file-id list in place
        (seeded, so reproducible).

        :param cfg: configuration object (see ``user_configuration``).
        """
        # model type (duration or acoustic)
        self.model_output_type = cfg.model_output_type

        # ----------------------------------------------------
        # ------------------- Input-Output -------------------
        # ----------------------------------------------------
        self.label_type = cfg.label_type
        self.cmp_ext = cfg.cmp_ext
        inp_file_ext = cfg.inp_file_ext
        out_file_ext = cfg.out_file_ext

        self.label_normaliser = HTSLabelNormalisation(
            question_file_name=cfg.question_file_name,
            add_frame_features=cfg.add_frame_features == 'True',  # must be bool
            subphone_feats=cfg.subphone_feats)

        # Create streams files (they store data from dimension dictionaries for synthesis)
        in_streams = sorted(cfg.in_dimension_dict.keys())
        indims = [str(cfg.in_dimension_dict[s]) for s in in_streams]
        self.out_streams = sorted(cfg.out_dimension_dict.keys())
        self.outdims = [
            str(cfg.out_dimension_dict[s]) for s in self.out_streams
        ]
        # Four lines: input stream names, input dims, output stream names, output dims.
        with open(os.path.join(cfg.model_dir, 'stream_info.txt'), 'w') as f:
            f.write(' '.join(in_streams) + '\n')
            f.write(' '.join(indims) + '\n')
            f.write(' '.join(self.out_streams) + '\n')
            f.write(' '.join(self.outdims) + '\n')

        # Input output dimensions
        self.inp_dim = cfg.inp_dim
        # NOTE(review): no else branch -- an unknown model_output_type leaves
        # self.out_dim unset here; it is only rejected further below.
        if self.model_output_type == 'duration':
            self.out_dim = cfg.dur_dim
        elif self.model_output_type == 'acoustic':
            self.out_dim = cfg.cmp_dim

        # Data normalization method
        self.inp_norm = cfg.inp_norm
        self.out_norm = cfg.out_norm

        # Norm stats files
        self.inp_stats_file = cfg.inp_stats_file
        self.out_stats_file_list = cfg.out_stats_file_list

        self.speaker_id = cfg.speaker_id
        self.shared_layer_flag = cfg.shared_layer_flag

        # Scalers are created/loaded later by normalize_data().
        self.inp_scaler = None
        self.out_scaler = None

        # ---------------------------------------------------
        # ------------------- Directories -------------------
        # ---------------------------------------------------
        self.plot_dir = os.path.join(cfg.plot_dir, cfg.nnets_file_name)

        # Select data directories based on model input-output type
        if self.model_output_type == 'duration':
            # Input
            self.inp_feat_dir = cfg.inp_feat_dir_dur
            self.bin_lab_dir = cfg.bin_lab_dir_dur
            self.bin_lab_dir_nosilence = cfg.bin_lab_dir_dur_nosilence
            self.bin_lab_dir_nosilence_norm = cfg.bin_lab_dir_dur_nosilence_norm
            # Output
            self.out_feat_dir = cfg.out_feat_dir_dur
            self.out_feat_dir_norm = cfg.out_feat_dir_dur_norm
        elif self.model_output_type == 'acoustic':
            # Input
            self.inp_feat_dir = cfg.inp_feat_dir_cmp
            self.bin_lab_dir = cfg.bin_lab_dir_cmp
            self.bin_lab_dir_nosilence = cfg.bin_lab_dir_cmp_nosilence
            self.bin_lab_dir_nosilence_norm = cfg.bin_lab_dir_cmp_nosilence_norm
            # Output
            self.out_feat_dir = cfg.nn_cmp_dir
            self.out_feat_dir_norm = cfg.nn_cmp_norm_dir
        else:
            print("invalid model output type")
            # NOTE(review): bare raise with no active exception -> RuntimeError.
            raise

        # --------------------------------------------------------
        # ------------------- Model Parameters -------------------
        # --------------------------------------------------------
        self.sequential_training = cfg.sequential_training
        self.stateful = cfg.stateful
        self.json_model_file = cfg.json_model_file
        self.h5_model_file = cfg.h5_model_file
        self.model_params_file = cfg.model_params_file

        # -----------------------------------------------------------
        # ------------------- Generate file lists -------------------
        # -----------------------------------------------------------
        train_file_number = cfg.train_file_number
        valid_file_number = cfg.valid_file_number
        test_file_number = cfg.test_file_number

        # List of file ids
        self.file_id_scp = cfg.file_id_scp

        # Create train, valid and test file lists
        self.file_id_list = data_utils.read_file_list(self.file_id_scp)
        if cfg.shuffle_data:
            # Fixed seed keeps the split reproducible across runs.
            random.seed(1)
            random.shuffle(self.file_id_list
                           )  # Shuffle to get random valid and test utterances
        self.train_id_list = self.file_id_list[0:train_file_number]
        self.valid_id_list = self.file_id_list[
            train_file_number:train_file_number + valid_file_number]
        self.test_id_list = self.file_id_list[
            train_file_number + valid_file_number:train_file_number +
            valid_file_number + test_file_number]

        # Intermediate file lists
        self.inp_feat_file_list = data_utils.prepare_file_path_list(
            self.file_id_list, self.inp_feat_dir, inp_file_ext)
        self.bin_lab_file_list = data_utils.prepare_file_path_list(
            self.file_id_list, self.bin_lab_dir, inp_file_ext)
        self.bin_lab_nosilence_file_list = data_utils.prepare_file_path_list(
            self.file_id_list, self.bin_lab_dir_nosilence, inp_file_ext)

        # Train, test, validation file lists
        self.inp_train_file_list = data_utils.prepare_file_path_list(
            self.train_id_list, self.bin_lab_dir_nosilence, inp_file_ext)
        self.out_train_file_list = data_utils.prepare_file_path_list(
            self.train_id_list, self.out_feat_dir, out_file_ext)
        self.inp_valid_file_list = data_utils.prepare_file_path_list(
            self.valid_id_list, self.bin_lab_dir_nosilence, inp_file_ext)
        self.out_valid_file_list = data_utils.prepare_file_path_list(
            self.valid_id_list, self.out_feat_dir, out_file_ext)
        self.inp_test_file_list = data_utils.prepare_file_path_list(
            self.test_id_list, self.bin_lab_dir_nosilence, inp_file_ext)
        self.out_test_file_list = data_utils.prepare_file_path_list(
            self.test_id_list, self.out_feat_dir, out_file_ext)

        # For cmp files generated as targets (applies to acoustic model only)
        self.nn_cmp_file_list = []
        self.nn_cmp_norm_file_list = []

        self.in_file_list_dict = {}
        for feature_name in list(cfg.in_dir_dict.keys()):
            self.in_file_list_dict[
                feature_name] = data_utils.prepare_file_path_list(
                    self.file_id_list, cfg.in_dir_dict[feature_name],
                    cfg.file_extension_dict[feature_name], False)

        # self.gen_test_file_list = data_utils.prepare_file_path_list(self.test_id_list, pred_feat_dir, out_file_ext)
        # if self.GenTestList:
        #     test_id_list = data_utils.read_file_list(test_id_scp)
        #     self.inp_test_file_list = data_utils.prepare_file_path_list(test_id_list, inp_feat_dir, inp_file_ext)
        #     self.gen_test_file_list = data_utils.prepare_file_path_list(test_id_list, pred_feat_dir, out_file_ext)

        # ------------------------------------------------------
        # ------------------- Main Processes -------------------
        # ------------------------------------------------------
        self.MAKELAB = cfg.MAKELAB  # make binary labels (required step before normalization and training)
        self.MAKECMP = cfg.MAKECMP
        self.NORMDATA = cfg.NORMDATA  # normalizes input and output data, creates data scaling objects
        self.TRAINDNN = cfg.TRAINDNN  # train the Keras model
        self.TESTDNN = cfg.TESTDNN  # test the Keras model

        # ----------------------------------------------------------
        # ------------------- Define Keras Model -------------------
        # ----------------------------------------------------------
        self.batch_size = cfg.batch_size
        model_params = {
            'inp_dim': self.inp_dim,
            'hidden_layer_size': cfg.hidden_layer_size,
            'shared_layer_flag': cfg.shared_layer_flag,
            'speaker_id': cfg.speaker_id,
            'out_dim': self.out_dim,
            'hidden_layer_type': cfg.hidden_layer_type,
            'output_layer_type': cfg.output_layer_type,
            'dropout_rate': cfg.dropout_rate,
            'loss_function': cfg.loss_function,
            'optimizer': cfg.optimizer,
            'l1': cfg.l1_reg,
            'l2': cfg.l2_reg,
            'gpu_num': cfg.gpu_num
        }
        rnn_params = {
            'merge_size': cfg.merge_size,
            'seq_length': cfg.seq_length,
            'bucket_range': cfg.bucket_range,
            'stateful': cfg.stateful,
            'training_algo': cfg.training_algo
        }
        training_params = {
            'batch_size': cfg.batch_size,
            'num_of_epochs': cfg.num_of_epochs,
            'shuffle_data': cfg.shuffle_data,
            'tensorboard_dir': os.path.join(cfg.plot_dir, cfg.nnets_file_name),
            'stopping_patience': cfg.stopping_patience,
            'restore_best_weights': cfg.restore_best_weights
        }
        self.keras_models = TrainKerasModels(model_params=model_params,
                                             rnn_params=rnn_params,
                                             training_params=training_params)

    def make_labels(self):
        """Convert HTS label files to binary feature files and strip silence.

        Skips each step when its last output file already exists (cheap
        resume heuristic based on the final list entry only).
        """
        # simple HTS labels
        print('preparing label data (input) using standard HTS style labels')
        if not os.path.isfile(self.bin_lab_file_list[-1]):
            # This does not normalize the data as the name suggests, rather translates it to binary
            self.label_normaliser.perform_normalisation(
                self.inp_feat_file_list,
                self.bin_lab_file_list,
                label_type=self.label_type)

        # TODO: Additional features may be added in the future... parts of speech? Some context for intonation?
        # if cfg.additional_features:
        #     out_feat_dir = os.path.join(cfg.data_dir, 'binary_label_%s_%s' % (cfg.label_type, str(self.inp_dim)))
        #     out_feat_file_list = data_utils.prepare_file_path_list(file_id_list, out_feat_dir, cfg.lab_ext)
        #     in_dim = self.label_normaliser.dimension
        #     for new_feature, new_feature_dim in cfg.additional_features.items():
        #         new_feat_dir = os.path.join(cfg.data_dir, new_feature)
        #         new_feat_file_list = data_utils.prepare_file_path_list(file_id_list, new_feat_dir, '.' + new_feature)
        #
        #         merger = MergeFeat(lab_dim=in_dim, feat_dim=new_feature_dim)
        #         merger.merge_data(binary_label_file_list, new_feat_file_list, out_feat_file_list)
        #         in_dim += new_feature_dim
        #
        #         binary_label_file_list = out_feat_file_list

        # This silence remover has little to no effect, no change in file 1
        if not os.path.isfile(self.bin_lab_nosilence_file_list[-1]):
            # NOTE(review): reads module-global cfg here, not self/__init__ state.
            remover = SilenceRemover(
                n_cmp=self.inp_dim,
                silence_pattern=cfg.silence_pattern,
                label_type=cfg.label_type,
                remove_frame_features=cfg.add_frame_features,
                subphone_feats=cfg.subphone_feats)
            remover.remove_silence(self.bin_lab_file_list,
                                   self.inp_feat_file_list,
                                   self.bin_lab_nosilence_file_list)

    def make_cmp(self):
        """Compose cmp target files (statics + deltas) and remove silence.

        Only the train+valid portion of the lists is silence-stripped;
        results overwrite the cmp files in place.
        """
        # File lists for the final cmp files (these are re-generated below to fit a precise numpy data array)
        self.nn_cmp_file_list = data_utils.prepare_file_path_list(
            self.file_id_list, self.out_feat_dir, self.cmp_ext)
        # self.nn_cmp_norm_file_list = data_utils.prepare_file_path_list(self.file_id_list, self.out_feat_dir_norm,
        #                                                                self.cmp_ext)

        # TODO: Get the delta and acceleration windows from the recipe file.
        acoustic_worker = AcousticComposition(delta_win=[-0.5, 0.0, 0.5],
                                              acc_win=[1.0, -2.0, 1.0])
        # TODO: Lets try this at some point
        # if 'dur' in list(cfg.in_dir_dict.keys()) and cfg.AcousticModel:
        #     acoustic_worker.make_equal_frames(dur_file_list, lf0_file_list, cfg.in_dimension_dict)
        acoustic_worker.prepare_nn_data(self.in_file_list_dict,
                                        self.nn_cmp_file_list,
                                        cfg.in_dimension_dict,
                                        cfg.out_dimension_dict)

        # NOTE(review): module-global cfg again.
        remover = SilenceRemover(n_cmp=cfg.cmp_dim,
                                 silence_pattern=cfg.silence_pattern,
                                 label_type=cfg.label_type,
                                 remove_frame_features=cfg.add_frame_features,
                                 subphone_feats=cfg.subphone_feats)
        remover.remove_silence(
            self.nn_cmp_file_list[0:cfg.train_file_number +
                                  cfg.valid_file_number],
            self.inp_feat_file_list[0:cfg.train_file_number +
                                    cfg.valid_file_number],
            self.nn_cmp_file_list[0:cfg.train_file_number +
                                  cfg.valid_file_number])  # save to itself

    def normalize_data(self):
        """Load normalisation statistics if cached, else compute from train data.

        Populates self.inp_scaler and self.out_scaler_list (one scaler per
        speaker stats file). Binary input columns (discrete_dict) and the
        vuv output column are excluded from scaling.
        """
        # What type of normalization? -- its given as "method" in compute_norm_stats
        # Check if normalization stat files already exist
        if os.path.isfile(self.inp_stats_file) and os.path.isfile(
                self.out_stats_file_list[0]):
            self.inp_scaler = data_utils.load_norm_stats(self.inp_stats_file,
                                                         self.inp_dim,
                                                         method=self.inp_norm)
            self.out_scaler_list = []
            for speaker_norm_file in self.out_stats_file_list:
                self.out_scaler_list.append(
                    data_utils.load_norm_stats(speaker_norm_file,
                                               self.out_dim,
                                               method=self.out_norm))
        else:
            # Create the scaler objects
            # Data must be in an a numpy array for normalization, therefore set sequential_training to false
            print(
                'preparing train_x, train_y from input and output feature files...'
            )
            if len(self.speaker_id) > 1:
                train_x, train_y_list, train_flen = data_utils.read_data_from_file_list_shared_2(
                    self.speaker_id,
                    self.inp_train_file_list,
                    self.out_train_file_list,
                    self.inp_dim,
                    self.out_dim,
                    sequential_training=False)
            else:
                train_x, train_y_list, train_flen = data_utils.read_data_from_file_list(
                    self.inp_train_file_list,
                    self.out_train_file_list,
                    self.inp_dim,
                    self.out_dim,
                    sequential_training=False)
            print('computing norm stats for train_x...')
            # I have removed scaling from binary variables (discrete_dict columns are all binary)
            ind = [int(i) for i in self.label_normaliser.discrete_dict.keys()]
            self.inp_scaler = data_utils.compute_norm_stats(
                train_x,
                self.inp_stats_file,
                method=self.inp_norm,
                no_scaling_ind=ind)

            # The output values should all be continuous except vuv (in acoustic model)
            print('computing norm stats for train_y...')
            if self.model_output_type == 'acoustic':
                # vuv column offset = sum of the dims of the streams sorted before it.
                vuv_index = self.out_streams.index('vuv')
                index = [sum([int(num) for num in self.outdims[0:vuv_index]])]
            else:
                index = []
            if type(train_y_list) != list:
                train_y_list = [train_y_list]
            self.out_scaler_list = []
            for train_y, speaker in zip(train_y_list, self.speaker_id):
                # Match this speaker to its stats file by substring search.
                ind = np.where([
                    speaker in file_name
                    for file_name in self.out_stats_file_list
                ])[0][0]
                out_scaler = data_utils.compute_norm_stats(
                    train_y,
                    self.out_stats_file_list[ind],
                    method=self.out_norm,
                    no_scaling_ind=index)  # For vuv (the first column)
                self.out_scaler_list.append(out_scaler)

    def train_keras_model(self):
        """Load, normalise, define and train the configured Keras model.

        Model topology is selected from sequential_training / stateful /
        shared_layer_flag; the trained weights are saved to the configured
        json/h5/params files.
        """
        # TODO: for large datasets, I might have to batch load the data to memory... I will cross that bridge when it comes
        #### load the data ####
        print(
            'preparing train_x, train_y from input and output feature files...'
        )
        train_x, train_y, train_flen = data_utils.read_data_from_file_list(
            self.inp_train_file_list,
            self.out_train_file_list,
            self.inp_dim,
            self.out_dim,
            sequential_training=self.sequential_training)
        print(
            'preparing valid_x, valid_y from input and output feature files...'
        )
        valid_x, valid_y, valid_flen = data_utils.read_data_from_file_list(
            self.inp_valid_file_list,
            self.out_valid_file_list,
            self.inp_dim,
            self.out_dim,
            sequential_training=self.sequential_training)

        #### normalize the data (the input and output scalers need to be already created) ####
        train_x = data_utils.norm_data(
            train_x,
            self.inp_scaler,
            sequential_training=self.sequential_training)
        valid_x = data_utils.norm_data(
            valid_x,
            self.inp_scaler,
            sequential_training=self.sequential_training)

        # For each speaker:
        if self.sequential_training:
            # Cycle through all utterances once
            for utt_key in train_y.keys():
                # Pick the scaler whose speaker id occurs in the utterance key.
                i = np.where(
                    [speaker in utt_key for speaker in self.speaker_id])[0][0]
                # Sequential training false because we are normalizing one utterance at a time
                train_y[utt_key] = data_utils.norm_data(
                    train_y[utt_key],
                    self.out_scaler_list[i],
                    sequential_training=False)
            for utt_key in valid_y.keys():
                i = np.where(
                    [speaker in utt_key for speaker in self.speaker_id])[0][0]
                valid_y[utt_key] = data_utils.norm_data(
                    valid_y[utt_key],
                    self.out_scaler_list[i],
                    sequential_training=False)
        else:
            # NOTE(review): with more than one scaler this re-normalises the
            # same arrays repeatedly -- presumably single-speaker only here.
            for i, scaler in enumerate(self.out_scaler_list):
                train_y = data_utils.norm_data(train_y,
                                               scaler,
                                               sequential_training=False)
                valid_y = data_utils.norm_data(valid_y,
                                               scaler,
                                               sequential_training=False)

        #### define the model ####
        if not self.sequential_training:
            self.keras_models.define_feedforward_model()
        elif self.sequential_training and not self.stateful and sum(
                self.shared_layer_flag) == 0:
            self.keras_models.define_sequence_model()
        elif self.sequential_training and not self.stateful and sum(
                self.shared_layer_flag) > 0:
            self.keras_models.define_shared_model()
        elif self.sequential_training and self.stateful and sum(
                self.shared_layer_flag) == 0:
            # NOTE(review): self.seq_length is not set in __init__ -- this
            # branch would raise AttributeError; verify before enabling stateful.
            self.keras_models.define_stateful_model(batch_size=self.batch_size,
                                                    seq_length=self.seq_length)
        else:
            raise Exception('Model can not be defined with given settings.')

        #### train the model ####
        print('training...')
        shared = sum(self.shared_layer_flag)
        if not self.sequential_training:
            # Train feedforward model
            self.keras_models.train_feedforward_model(train_x, train_y,
                                                      valid_x, valid_y)
            self.keras_models.save_model(self.json_model_file,
                                         self.h5_model_file,
                                         self.model_params_file)
        elif self.sequential_training and self.batch_size == 1 and sum(
                self.shared_layer_flag) == 0:
            # Train recurrent model of batch size one
            self.keras_models.train_recurrent_model_batchsize_one(
                train_x, train_y, valid_x, valid_y)
            self.keras_models.save_model(self.json_model_file,
                                         self.h5_model_file,
                                         self.model_params_file)
        elif self.sequential_training and self.batch_size == 1 and sum(
                self.shared_layer_flag) > 0:
            self.keras_models.train_shared_model(train_x, train_y, valid_x,
                                                 valid_y)
            self.keras_models.save_models(self.json_model_file,
                                          self.h5_model_file,
                                          self.model_params_file)
        elif self.sequential_training and self.stateful:
            # Train recurrent model of many batches, should it be stateful?
            self.keras_models.train_recurrent_model(train_x,
                                                    train_y,
                                                    valid_x,
                                                    valid_y,
                                                    train_flen,
                                                    training_algo=1)
            self.keras_models.save_model(self.json_model_file,
                                         self.h5_model_file,
                                         self.model_params_file)

    def test_keras_model(self):
        """Load the trained model and generate predictions for the test set."""
        # TODO: Overhaul this function
        #### load the model ####
        self.keras_models.load_model(self.json_model_file, self.h5_model_file)

        #### load the data ####
        print('preparing test_x from input feature files...')
        test_x, test_flen = data_utils.read_test_data_from_file_list(
            self.inp_test_file_list, self.inp_dim)

        #### normalize the data ####
        # NOTE(review): return value is discarded -- if norm_data is not
        # in-place, test_x is fed to predict() un-normalised; confirm.
        data_utils.norm_data(test_x, self.inp_scaler)

        #### compute predictions ####
        # NOTE(review): self.out_scaler stays None and self.gen_test_file_list
        # is never assigned (only in commented-out code) -- verify.
        self.keras_models.predict(test_x, self.out_scaler,
                                  self.gen_test_file_list,
                                  self.sequential_training)

    def main_function(self):
        """Run each pipeline stage whose Processes flag is enabled."""
        ### Implement each module ###
        if self.MAKELAB:
            self.make_labels()
        if self.MAKECMP:
            self.make_cmp()
        if self.NORMDATA:
            self.normalize_data()
        if self.TRAINDNN:
            self.train_keras_model()
        if self.TESTDNN:
            self.test_keras_model()
def user_configuration(self, configFile=None):
    """Parse a user configuration file and populate self with settings.

    Reads *configFile* with configparser, derives the directory layout from
    the mandatory ``Paths``/``Input-Output`` entries, then walks a table of
    (attribute, default, section, option) entries, taking the user's value
    when present and the default otherwise, coercing to the default's type.

    Fixes in this revision: the config file is opened via a context manager
    and parsed with ``read_file`` (``readfp`` leaked the handle and was
    removed in Python 3.12); a duplicated ``impossible_int`` assignment is
    dropped; the ``plot_dir`` table default now uses ``self.plot_dir``
    (was a copy-paste of ``self.model_dir``; unreachable in practice since
    'Paths:plot' is read unconditionally above, but corrected for safety).

    :param configFile: path to the .cfg file; if None, only built-in
        defaults are used and the method returns immediately.
    :raises Exception: when the file cannot be parsed or a mandatory
        option has no value.
    """
    # get a logger
    logger = logging.getLogger("configuration")

    # load and parse the provided configFile, if provided
    if not configFile:
        logger.warning(
            'no user configuration file provided; using only built-in default settings'
        )
        return

    # load the config file
    try:
        cfgparser = configparser.ConfigParser()
        # read_file + context manager: no leaked handle, works on Python 3.12+
        # (readfp was deprecated since 3.2 and removed in 3.12).
        with open(configFile) as cfg_fp:
            cfgparser.read_file(cfg_fp)
        logger.debug(
            'successfully read and parsed user configuration file %s' %
            configFile)
    except Exception:
        logger.fatal('error reading user configuration file %s' % configFile)
        raise

    # work_dir must be provided before initialising other directories
    try:
        self.work_dir = cfgparser.get('Paths', 'work')
        self.data_dir = cfgparser.get('Paths', 'data')
        self.plot_dir = cfgparser.get('Paths', 'plot')
        self.model_output_type = cfgparser.get('Input-Output',
                                               'model_output_type')
    except (configparser.NoSectionError, configparser.NoOptionError):
        self.work_dir = None
        self.data_dir = None
        self.plot_dir = None
        logger.critical('Paths:work has no value!')
        raise Exception

    # The model must be placed in the processors folder which is copied to the voice folder for synthesis
    if self.model_output_type == 'duration':
        self.model_dir = os.path.join(self.data_dir, 'processors',
                                      'duration_predictor')
    elif self.model_output_type == 'acoustic':
        self.model_dir = os.path.join(self.data_dir, 'processors',
                                      'acoustic_predictor')

    # default place for some data
    self.keras_dir = os.path.join(self.work_dir, 'keras')
    self.gen_dir = os.path.join(self.keras_dir, 'gen')
    self.stats_dir = os.path.join(self.keras_dir, 'stats')

    self.question_file_name = cfgparser.get('Labels', 'question_file_name')
    self.add_frame_features = cfgparser.get('Labels', 'add_frame_features')
    self.subphone_feats = cfgparser.get('Labels', 'subphone_feats')
    # NOTE(review): copy-paste bug kept for backward compatibility --
    # model_type is read from the 'subphone_feats' option, so
    # self.model_type == self.subphone_feats. Probably intended to read a
    # 'model_type' option; fixing the key could break existing configs
    # that lack it, so only flagging here.
    self.model_type = cfgparser.get('Labels', 'subphone_feats')

    # TODO: the configuration is inflexible, it has hard coded elements and is designed only for the acoustic model
    # TODO: improve flexibility and incorporate duration model elements
    # TODO: I am going to perform all data normalization tasks in KerasClass
    # Set up file paths to ossian defaults
    # NOTE(review): add_frame_features is passed as the raw config *string*
    # here, unlike KerasClass.__init__ which converts it to bool -- confirm
    # HTSLabelNormalisation tolerates this.
    label_normaliser = HTSLabelNormalisation(
        question_file_name=self.question_file_name,
        add_frame_features=self.add_frame_features,
        subphone_feats=self.subphone_feats)
    self.inp_dim = label_normaliser.dimension
    # lab_dim = label_normaliser.dimension
    # logger.info('Input label dimension is %d' % lab_dim)
    # suffix = str(lab_dim)
    # the number can be removed
    # binary_label_dir = os.path.join(self.work_dir, 'binary_label_' + str(label_normaliser.dimension))
    # nn_label_dir = os.path.join(self.work_dir, 'nn_no_silence_lab_' + suffix)
    # self.def_inp_dir = os.path.join(self.work_dir, 'nn_no_silence_lab_norm_' + suffix)
    # self.def_inp_dir = os.path.join(self.work_dir, 'nn_no_silence_lab_norm_%s' % 1)
    # self.def_out_dir = os.path.join(self.work_dir, 'nn_norm_mgc_lf0_vuv_bap_%s' % 1)

    # ---------------------------------------------------
    # ------------------- Output data -------------------
    # ---------------------------------------------------
    # Binary data (already generated by ossian)
    self.out_feat_dir_dur = os.path.join(self.data_dir, 'dur')
    self.out_feat_dir_cmp = os.path.join(self.data_dir, 'cmp')
    self.out_feat_dir_dur_norm = os.path.join(self.data_dir, 'dur_norm')
    self.out_feat_dir_cmp_norm = os.path.join(self.data_dir, 'cmp_norm')
    self.nn_cmp_dir = os.path.join(self.data_dir, 'nn_cmp')
    self.nn_cmp_norm_dir = os.path.join(self.data_dir, 'nn_norm_cmp')

    # ---------------------------------------------------
    # ------------------- Input data --------------------
    # ---------------------------------------------------
    # Raw text data
    self.inp_feat_dir_dur = os.path.join(self.data_dir, 'lab_dur')
    self.inp_feat_dir_cmp = os.path.join(self.data_dir, 'lab_dnn')
    # Binary data
    self.bin_lab_dir_dur = os.path.join(
        self.data_dir, 'bin_lab_phone_%s' % str(self.inp_dim))
    self.bin_lab_dir_cmp = os.path.join(
        self.data_dir, 'bin_lab_state_%s' % str(self.inp_dim))
    # Binary data silence removed
    self.bin_lab_dir_dur_nosilence = os.path.join(
        self.data_dir, 'bin_lab_phone_no_sil_%s' % str(self.inp_dim))
    self.bin_lab_dir_cmp_nosilence = os.path.join(
        self.data_dir, 'bin_lab_state_no_sil_%s' % str(self.inp_dim))
    # Binary data silence removed and normalized
    self.bin_lab_dir_dur_nosilence_norm = os.path.join(
        self.data_dir, 'bin_lab_phone_no_sil_norm_%s' % str(self.inp_dim))
    self.bin_lab_dir_cmp_nosilence_norm = os.path.join(
        self.data_dir, 'bin_lab_state_no_sil_norm_%s' % str(self.inp_dim))

    # self.inter_data_dir = os.path.join(self.work_dir, 'inter_module')
    # self.def_inp_dir = os.path.join(self.inter_data_dir, 'nn_no_silence_lab_norm_425')
    # self.def_out_dir = os.path.join(self.inter_data_dir, 'nn_norm_mgc_lf0_vuv_bap_187')

    # Sentinels marking "no usable default" in the option table below.
    impossible_int = int(-99999)
    impossible_float = float(-99999.0)

    # (attribute name, default value, config section, config option)
    user_options = [
        # General paths
        ('work_dir', self.work_dir, 'Paths', 'work'),
        ('data_dir', self.data_dir, 'Paths', 'data'),
        ('plot_dir', self.plot_dir, 'Paths', 'plot'),
        ('model_dir', self.model_dir, 'Paths', 'models'),
        ('stats_dir', self.stats_dir, 'Paths', 'stats'),
        ('gen_dir', self.gen_dir, 'Paths', 'gen'),

        # Output data paths
        ('out_feat_dir_dur', self.out_feat_dir_dur, 'Paths', 'out_feat'),
        ('out_feat_dir_cmp', self.out_feat_dir_cmp, 'Paths', 'out_feat'),
        ('out_feat_dir_dur_norm', self.out_feat_dir_dur_norm, 'Paths',
         'out_feat'),
        ('out_feat_dir_cmp_norm', self.out_feat_dir_cmp_norm, 'Paths',
         'out_feat'),

        # Input data paths
        ('inp_feat_dir_dur', self.inp_feat_dir_dur, 'Paths', 'inp_feat'),
        ('inp_feat_dir_cmp', self.inp_feat_dir_cmp, 'Paths', 'inp_feat'),
        ('bin_lab_dir_dur', self.bin_lab_dir_dur, 'Paths', 'inp_feat'),
        ('bin_lab_dir_cmp', self.bin_lab_dir_cmp, 'Paths', 'inp_feat'),
        ('bin_lab_dir_dur_nosilence', self.bin_lab_dir_dur_nosilence, 'Paths',
         'inp_feat'),
        ('bin_lab_dir_cmp_nosilence', self.bin_lab_dir_cmp_nosilence, 'Paths',
         'inp_feat'),
        ('bin_lab_dir_dur_nosilence_norm',
         self.bin_lab_dir_dur_nosilence_norm, 'Paths', 'inp_feat'),
        ('bin_lab_dir_cmp_nosilence_norm',
         self.bin_lab_dir_cmp_nosilence_norm, 'Paths', 'inp_feat'),

        # TODO: Where is the actual file list? Fix these variables -- I believe this is fixed
        ('file_id_scp',
         os.path.join(self.data_dir,
                      'processors/duration_predictor/filelist.txt'), 'Paths',
         'file_id_list'),
        # ('test_id_scp', os.path.join(self.data_dir, 'test_id_list.scp'), 'Paths', 'test_id_list'),

        # Labels
        ('label_type', 'phone_align', 'Labels', 'label_type'),
        ('silence_pattern', ['*-#+*'], 'Labels', 'silence_pattern'),

        # Input-Output
        # TODO: I can ad dur to this list to combine duration and acoustic modeling
        ('output_features', ['mgc', 'lf0', 'vuv',
                             'bap'], 'Input-Output', 'output_features'),
        ('model_output_type', 'acoustic', 'Input-Output',
         'model_output_type'),
        ('inp_dim', self.inp_dim, 'Input-Output', 'inp_dim'),
        # ('out_dim', 187, 'Input-Output', 'out_dim'),
        ('mgc_dim', 60, 'Input-Output', 'mgc'),
        ('lf0_dim', 1, 'Input-Output', 'lf0'),
        ('bap_dim', 5, 'Input-Output', 'bap'),
        ('dmgc_dim', 180, 'Input-Output', 'mgc'),
        ('dlf0_dim', 3, 'Input-Output', 'lf0'),
        ('dbap_dim', 15, 'Input-Output', 'bap'),
        ('dur_dim', 5, 'Input-Output', 'cmp'),
        ('cmp_dim', 60 * 3 + 1 * 3 + 5 * 3, 'Input-Output', 'cmp'),
        ('inp_file_ext', '.lab', 'Input-Output', 'inp_file_ext'),
        ('out_file_ext', '.cmp', 'Input-Output', 'out_file_ext'),
        ('mgc_ext', '.mgc', 'Input-Output', 'mgc_ext'),
        ('bap_ext', '.bap', 'Input-Output', 'bap_ext'),
        ('lf0_ext', '.lf0', 'Input-Output', 'lf0_ext'),
        ('cmp_ext', '.cmp', 'Input-Output', 'cmp_ext'),
        ('lab_ext', '.lab', 'Input-Output', 'lab_ext'),
        ('utt_ext', '.utt', 'Input-Output', 'utt_ext'),
        ('stepw_ext', '.stepw', 'Input-Output', 'stepw_ext'),
        ('sp_ext', '.sp', 'Input-Output', 'sp_ext'),
        ('dur_ext', '.dur', 'Input-Output', 'dur_ext'),
        ('inp_norm', 'MVN', 'Input-Output', 'inp_norm'),
        ('out_norm', 'MVN', 'Input-Output', 'out_norm'),

        # Architecture
        ('hidden_layer_type', ['tanh', 'tanh', 'tanh',
                               'tanh'], 'Architecture', 'hidden_layer_type'),
        ('hidden_layer_size', [1024, 1024, 1024,
                               1024], 'Architecture', 'hidden_layer_size'),
        ('shared_layer_flag', [0, 0, 0,
                               0], 'Architecture', 'shared_layer_flag'),
        ('speaker_id', ['placeholder'], 'Architecture', 'speaker_id'),
        ('batch_size', 256, 'Architecture', 'batch_size'),
        ('num_of_epochs', 1, 'Architecture', 'training_epochs'),
        ('stopping_patience', 10, 'Architecture', 'stopping_patience'),
        ('restore_best_weights', True, 'Architecture',
         'restore_best_weights'),
        ('dropout_rate', 0.0, 'Architecture', 'dropout_rate'),
        ('l1_reg', 0.0, 'Architecture', 'l1_reg'),
        ('l2_reg', 0.0, 'Architecture', 'l2_reg'),
        ('output_layer_type', 'linear', 'Architecture', 'output_layer_type'),
        ('optimizer', 'adam', 'Architecture', 'optimizer'),
        ('loss_function', 'mse', 'Architecture', 'loss_function'),
        # RNN
        ('sequential_training', False, 'Architecture', 'sequential_training'),
        ('stateful', False, 'Architecture', 'stateful'),
        ('use_high_batch_size', False, 'Architecture', 'use_high_batch_size'),
        ('training_algo', 1, 'Architecture', 'training_algo'),
        ('merge_size', 1, 'Architecture', 'merge_size'),
        ('seq_length', 200, 'Architecture', 'seq_length'),
        ('bucket_range', 100, 'Architecture', 'bucket_range'),
        ('gpu_num', 1, 'Architecture', 'gpu_num'),

        # Data
        ('shuffle_data', True, 'Data', 'shuffle_data'),
        ('train_file_number', impossible_int, 'Data', 'train_file_number'),
        ('valid_file_number', impossible_int, 'Data', 'valid_file_number'),
        ('test_file_number', impossible_int, 'Data', 'test_file_number'),

        # Processes
        ('GenTestList', False, 'Processes', 'GenTestList'),
        ('NORMDATA', False, 'Processes', 'NORMDATA'),
        ('TRAINDNN', False, 'Processes', 'TRAINDNN'),
        ('TESTDNN', False, 'Processes', 'TESTDNN'),
        ('MAKELAB', False, 'Processes', 'MAKELAB'),
        ('MAKECMP', False, 'Processes', 'MAKECMP'),
    ]

    # this uses exec(...) which is potentially dangerous since arbitrary code could be executed
    for (variable, default, section, option) in user_options:
        # default value
        value = None

        try:
            # first, look for a user-set value for this variable in the config file
            value = cfgparser.get(section, option)
            user_or_default = 'user'
        except (configparser.NoSectionError, configparser.NoOptionError):
            # use default value, if there is one
            if (default == None) or \
               (default == '') or \
               ((type(default) == int) and (default == impossible_int)) or \
               ((type(default) == float) and (default == impossible_float)):
                logger.critical('%20s has no value!' %
                                (section + ":" + option))
                raise Exception
            else:
                value = default
                user_or_default = 'default'

        # Coerce to the type of the default before assigning the attribute.
        if type(default) == str:
            exec('self.%s = "%s"' % (variable, value))
        elif type(default) == int:
            exec('self.%s = int(%s)' % (variable, value))
        elif type(default) == float:
            exec('self.%s = float(%s)' % (variable, value))
        elif type(default) == bool:
            exec('self.%s = bool(%s)' % (variable, value))
        elif type(default) == list:
            exec('self.%s = list(%s)' % (variable, value))
        elif type(default) == dict:
            exec('self.%s = dict(%s)' % (variable, value))
        else:
            logger.critical(
                'Variable %s has default value of unsupported type %s',
                variable, type(default))
            raise Exception(
                'Internal error in configuration settings: unsupported default type'
            )
        logger.info('%20s has %7s value %s' %
                    (section + ":" + option, user_or_default, value))
# Configuration object cfg from config argument cfg = configuration.cfg cfg.configure(sys.argv[1]) # Get training file id list file_id_list = read_file_list(cfg.file_id_scp) # Y data file lists # nn_cmp_dir = os.path.join(cfg.data_dir, 'nn' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim)) nn_cmp_norm_dir = os.path.join(cfg.data_dir, 'nn_norm' + cfg.combined_feature_name + '_' + str(cfg.cmp_dim)) # nn_cmp_file_list = prepare_file_path_list(file_id_list, nn_cmp_dir, cfg.cmp_ext) # nn_cmp_norm_file_list = prepare_file_path_list(file_id_list, nn_cmp_norm_dir, cfg.cmp_ext) # Get label dimensions label_normaliser = HTSLabelNormalisation(question_file_name=cfg.question_file_name, add_frame_features=cfg.add_frame_features, subphone_feats=cfg.subphone_feats) add_feat_dim = sum(cfg.additional_features.values()) inp_dim = label_normaliser.dimension + add_feat_dim + cfg.appended_input_dim out_dim = cfg.cmp_dim # X data file lists # binary_label_dir = os.path.join(cfg.data_dir, 'binary_label_'+str(label_normaliser.dimension)) # nn_label_dir = os.path.join(cfg.data_dir, 'nn_no_silence_lab_'+str(lab_dim)) nn_label_norm_dir = os.path.join(cfg.data_dir, 'nn_no_silence_lab_norm_'+str(inp_dim)) # binary_label_file_list = prepare_file_path_list(file_id_list, binary_label_dir, cfg.lab_ext) # nn_label_file_list = prepare_file_path_list(file_id_list, nn_label_dir, cfg.lab_ext) # nn_label_norm_file_list = prepare_file_path_list(file_id_list, nn_label_norm_dir, cfg.lab_ext) # Split files into train and test sets train_file_number = cfg.train_file_number
def main_function(cfg):
    """Prepare normalised input label features for the test utterances.

    Pipeline: convert HTS-style label alignments into binary question-based
    features, strip silence frames, then min-max normalise the result into
    ``<cfg.work_dir>/output`` using the normalisation statistics saved at
    training time (``label_norm_<style>_<dim>.dat`` in ``cfg.data_dir``).

    Args:
        cfg: Merlin-style configuration object. Reads (among others)
            question_file_name, add_frame_features, subphone_feats,
            additional_features, appended_input_dim, work_dir, data_dir,
            plot_dir, test_id_scp, in_label_align_dir, lab_ext, label_type,
            label_style, silence_pattern.

    Raises:
        IOError: if the test file-id list cannot be read (logged as critical
            and re-raised).
    """
    logger = logging.getLogger("main")
    plotlogger = logging.getLogger("plotting")
    plotlogger.set_plot_path(cfg.plot_dir)

    data_dir = cfg.data_dir

    # Input dimensionality = question-derived features + extra features + appended dims.
    label_normaliser = HTSLabelNormalisation(
        question_file_name=cfg.question_file_name,
        add_frame_features=cfg.add_frame_features,
        subphone_feats=cfg.subphone_feats)
    add_feat_dim = sum(cfg.additional_features.values())
    lab_dim = label_normaliser.dimension + add_feat_dim + cfg.appended_input_dim
    logger.info('Input label dimension is %d' % lab_dim)
    suffix = str(lab_dim)

    # Intermediate and final output directories for the label features.
    label_data_dir = cfg.work_dir
    binary_label_dir = os.path.join(
        label_data_dir, 'binary_label_' + str(label_normaliser.dimension))
    nn_label_dir = os.path.join(label_data_dir, 'nn_no_silence_lab_' + suffix)
    nn_label_norm_dir = os.path.join(label_data_dir, 'output')

    # Min/max statistics computed during training; loaded (not recomputed) here.
    # (Removed a dead `min_max_normaliser = None` store — the normaliser is
    # constructed unconditionally below.)
    label_norm_file = 'label_norm_%s_%d.dat' % (cfg.label_style, lab_dim)
    label_norm_file = os.path.join(data_dir, label_norm_file)

    try:
        test_id_list = read_file_list(cfg.test_id_scp)
        logger.debug('Loaded file id list from %s' % cfg.test_id_scp)
    except IOError:
        # file not found, or other problem opening/reading it
        logger.critical('Could not load file id list from %s' % cfg.test_id_scp)
        raise

    in_label_align_file_list = prepare_file_path_list(test_id_list,
                                                      cfg.in_label_align_dir,
                                                      cfg.lab_ext, False)
    binary_label_file_list = prepare_file_path_list(test_id_list,
                                                    binary_label_dir,
                                                    cfg.lab_ext)
    nn_label_file_list = prepare_file_path_list(test_id_list, nn_label_dir,
                                                cfg.lab_ext)
    nn_label_norm_file_list = prepare_file_path_list(test_id_list,
                                                     nn_label_norm_dir,
                                                     cfg.lab_ext)

    logger.info('preparing label data (input) using standard HTS style labels')
    # Labels -> binary question-based feature vectors.
    label_normaliser.perform_normalisation(in_label_align_file_list,
                                           binary_label_file_list,
                                           label_type=cfg.label_type)

    # Drop silence frames so the network never sees them.
    remover = SilenceRemover(n_cmp=lab_dim,
                             silence_pattern=cfg.silence_pattern,
                             label_type=cfg.label_type,
                             remove_frame_features=cfg.add_frame_features,
                             subphone_feats=cfg.subphone_feats)
    remover.remove_silence(binary_label_file_list, in_label_align_file_list,
                           nn_label_file_list)

    # Scale features into [0.01, 0.99] using the training-time min/max values.
    # NOTE(review): normalise_data reads binary_label_file_list (with silence),
    # not nn_label_file_list (silence-removed) — confirm this is intended.
    min_max_normaliser = MinMaxNormalisation(feature_dimension=lab_dim,
                                             min_value=0.01,
                                             max_value=0.99)
    min_max_normaliser.load_min_max_values(label_norm_file)
    min_max_normaliser.normalise_data(binary_label_file_list,
                                      nn_label_norm_file_list)