# Module-level imports assumed by the snippets below (they sit at the top of
# the original source files, alongside project-local helpers such as
# data_utils, load_norm_stats and TrainKerasModels):
import json
import os
import time
from glob import glob

import numpy as np


def load_from_files(self, model_dir):
    # Find the model files
    h5_file = glob(model_dir + '/*.h5')[0]
    json_file = glob(model_dir + '/*.json')[0]
    params_file = glob(model_dir + '/*.pickle')[0]
    norm_stats_inp_file = glob(model_dir + '/input*.norm')[0]
    norm_stats_out_file = glob(model_dir + '/output*.norm')[0]

    # Load the trained model from files
    self.keras_wrapper.load_model(json_file, h5_file, params_file)

    # Get the dimensions of the input and output from the model topology
    with open(json_file, 'r') as f:
        model_dict = json.load(f)
    self.input_shape = model_dict['config']['layers'][0]['config']['batch_input_shape']
    self.input_dim = self.input_shape[-1]
    self.output_dim = model_dict['config']['layers'][-1]['config']['units']

    # Load the input and output normalization objects
    # TODO: how do I deal with the normalization method? May need to pass this into the class
    self.scaler_inp = load_norm_stats(norm_stats_inp_file, self.input_dim, method="MVN")
    self.scaler_out = load_norm_stats(norm_stats_out_file, self.output_dim, method="MVN")
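# Usage sketch for load_from_files (hypothetical: the owning class, here
# called KerasSynthesizer, and its zero-argument constructor are assumptions
# and not part of the original source):
#
#   synthesizer = KerasSynthesizer()
#   synthesizer.load_from_files('experiments/acoustic_model/keras_models')
#   print(synthesizer.input_dim, synthesizer.output_dim)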
def normalize_data(self):
    ### normalize train data ###
    if os.path.isfile(self.inp_stats_file) and os.path.isfile(self.out_stats_file):
        self.inp_scaler = data_utils.load_norm_stats(self.inp_stats_file,
                                                     self.inp_dim,
                                                     method=self.inp_norm)
        self.out_scaler = data_utils.load_norm_stats(self.out_stats_file,
                                                     self.out_dim,
                                                     method=self.out_norm)
    else:
        print('preparing train_x, train_y from input and output feature files...')
        train_x, train_y, train_flen = data_utils.read_data_from_file_list(
            self.inp_train_file_list, self.out_train_file_list,
            self.inp_dim, self.out_dim,
            sequential_training=self.sequential_training)

        print('computing norm stats for train_x...')
        self.inp_scaler = data_utils.compute_norm_stats(train_x,
                                                        self.inp_stats_file,
                                                        method=self.inp_norm)

        print('computing norm stats for train_y...')
        self.out_scaler = data_utils.compute_norm_stats(train_y,
                                                        self.out_stats_file,
                                                        method=self.out_norm)
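# A minimal sketch of what data_utils.compute_norm_stats is assumed to do for
# method="MVN" (the real helper also writes the stats file to disk):
# mean-variance normalization keeps a per-feature mean and standard deviation
# and maps each feature to zero mean, unit variance.

def mvn_stats_sketch(data):
    """data: (num_frames, feature_dim) float array."""
    mean = data.mean(axis=0)
    std = data.std(axis=0)
    std[std == 0.0] = 1.0   # guard against constant columns
    return mean, std

def mvn_apply_sketch(data, mean, std):
    return (data - mean) / std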
def main(cfg):

    ###################################################
    ########## User configurable variables ############
    ###################################################

    work_dir = cfg.work_dir
    data_dir = cfg.data_dir
    inp_feat_dir = cfg.inp_feat_dir
    out_feat_dir = cfg.out_feat_dir
    model_dir = cfg.model_dir
    stats_dir = cfg.stats_dir
    gen_dir = cfg.gen_dir

    ### Input-Output ###
    inp_dim = cfg.inp_dim
    out_dim = cfg.out_dim
    inp_file_ext = cfg.inp_file_ext
    out_file_ext = cfg.out_file_ext
    inp_norm = cfg.inp_norm
    out_norm = cfg.out_norm

    ### define train, valid, test ###
    train_file_number = cfg.train_file_number
    valid_file_number = cfg.valid_file_number
    test_file_number = cfg.test_file_number

    #### Train, valid and test file lists ####
    file_id_list = data_utils.read_file_list(cfg.file_id_scp)

    train_id_list = file_id_list[0:train_file_number]
    valid_id_list = file_id_list[train_file_number:train_file_number + valid_file_number]
    test_id_list = file_id_list[train_file_number + valid_file_number:
                                train_file_number + valid_file_number + test_file_number]

    inp_train_file_list = data_utils.prepare_file_path_list(train_id_list, inp_feat_dir, inp_file_ext)
    out_train_file_list = data_utils.prepare_file_path_list(train_id_list, out_feat_dir, out_file_ext)
    inp_test_file_list = data_utils.prepare_file_path_list(test_id_list, inp_feat_dir, inp_file_ext)
    out_test_file_list = data_utils.prepare_file_path_list(test_id_list, out_feat_dir, out_file_ext)
    gen_test_file_list = data_utils.prepare_file_path_list(test_id_list, cfg.pred_feat_dir, out_file_ext)

    #### define model params ####
    inp_scaler = None
    out_scaler = None

    hidden_layer_type = cfg.hidden_layer_type
    hidden_layer_size = cfg.hidden_layer_size

    batch_size = cfg.batch_size
    training_algo = cfg.training_algo
    output_layer_type = cfg.output_layer_type
    loss_function = cfg.loss_function
    optimizer = cfg.optimizer

    num_of_epochs = cfg.num_of_epochs
    dropout_rate = cfg.dropout_rate

    json_model_file = cfg.json_model_file
    h5_model_file = cfg.h5_model_file

    ###################################################
    ########## End of user-defined variables ##########
    ###################################################

    #### Define keras models class ####
    keras_models = TrainKerasModels(inp_dim, hidden_layer_size, out_dim,
                                    hidden_layer_type, output_layer_type,
                                    dropout_rate, loss_function, optimizer)

    if cfg.NORMDATA:
        ### normalize train data ###
        if os.path.isfile(cfg.inp_stats_file) and os.path.isfile(cfg.out_stats_file):
            inp_scaler = data_utils.load_norm_stats(cfg.inp_stats_file, inp_dim, method=inp_norm)
            out_scaler = data_utils.load_norm_stats(cfg.out_stats_file, out_dim, method=out_norm)
        else:
            print('preparing train_x, train_y from input and output feature files...')
            train_x, train_y, train_flen = data_utils.read_data_from_file_list(
                inp_train_file_list, out_train_file_list, inp_dim, out_dim,
                sequential_training=cfg.sequential_training)

            print('computing norm stats for train_x...')
            inp_scaler = data_utils.compute_norm_stats(train_x, cfg.inp_stats_file, method=inp_norm)

            print('computing norm stats for train_y...')
            out_scaler = data_utils.compute_norm_stats(train_y, cfg.out_stats_file, method=out_norm)

    if cfg.TRAINMODEL:
        #### define the model ####
        if not cfg.sequential_training:
            keras_models.define_feedforward_model()
        elif cfg.stateful:
            keras_models.define_stateful_model(batch_size=batch_size)
        else:
            keras_models.define_sequence_model()

        #### load the data ####
        print('preparing train_x, train_y from input and output feature files...')
        train_x, train_y, train_flen = data_utils.read_data_from_file_list(
            inp_train_file_list, out_train_file_list, inp_dim, out_dim,
            sequential_training=cfg.sequential_training)

        #### norm the data ####
        data_utils.norm_data(train_x, inp_scaler, sequential_training=cfg.sequential_training)
        data_utils.norm_data(train_y, out_scaler, sequential_training=cfg.sequential_training)

        #### train the model ####
        print('training...')
        if not cfg.sequential_training:
            ### Train feedforward model ###
            keras_models.train_feedforward_model(train_x, train_y,
                                                 batch_size=batch_size,
                                                 num_of_epochs=num_of_epochs,
                                                 shuffle_data=cfg.shuffle_data)
        else:
            ### Train recurrent model ###
            keras_models.train_sequence_model(train_x, train_y, train_flen,
                                              batch_size=batch_size,
                                              num_of_epochs=num_of_epochs,
                                              shuffle_data=cfg.shuffle_data,
                                              training_algo=training_algo)

        #### store the model ####
        keras_models.save_model(json_model_file, h5_model_file)

    if cfg.TESTMODEL:
        #### load the model ####
        keras_models.load_model(json_model_file, h5_model_file)

        #### load the data ####
        print('preparing test_x from input feature files...')
        test_x, test_flen = data_utils.read_test_data_from_file_list(inp_test_file_list, inp_dim)

        #### norm the data ####
        data_utils.norm_data(test_x, inp_scaler)

        #### compute predictions ####
        keras_models.predict(test_x, out_scaler, gen_test_file_list, cfg.sequential_training)
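# A minimal sketch of a configuration object for main(cfg). The original
# config class is not shown in this file, so a SimpleNamespace stands in for
# it; the attribute names are exactly those main() reads, while every value
# below is an illustrative assumption (mostly mirroring the demo defaults in
# the standalone main() further down).
from types import SimpleNamespace

cfg_sketch = SimpleNamespace(
    # directories (assumed layout)
    work_dir='exp', data_dir='exp/data',
    inp_feat_dir='exp/data/nn_no_silence_lab_norm_425',
    out_feat_dir='exp/data/nn_norm_mgc_lf0_vuv_bap_187',
    model_dir='exp/keras_models', stats_dir='exp/keras_stats',
    gen_dir='exp/gen', pred_feat_dir='exp/gen/pred',
    # input/output features
    inp_dim=425, out_dim=187,
    inp_file_ext='.lab', out_file_ext='.cmp',
    inp_norm='MINMAX', out_norm='MINMAX',
    inp_stats_file='exp/keras_stats/input_50_MINMAX_425.norm',
    out_stats_file='exp/keras_stats/output_50_MINMAX_187.norm',
    # train/valid/test split
    file_id_scp='exp/data/file_id_list.scp',
    train_file_number=50, valid_file_number=5, test_file_number=5,
    # model architecture and training
    hidden_layer_type=['tanh'] * 6, hidden_layer_size=[1024] * 6,
    output_layer_type='linear', loss_function='mse', optimizer='adam',
    batch_size=256, num_of_epochs=25, dropout_rate=0.0,
    sequential_training=False, stateful=False, shuffle_data=True,
    training_algo=1,
    json_model_file='exp/keras_models/model.json',
    h5_model_file='exp/keras_models/model.h5',
    # pipeline switches
    NORMDATA=True, TRAINMODEL=True, TESTMODEL=True,
)
# main(cfg_sketch)   # would run the full pipeline, given the data files exist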
def main():

    start_time = time.time()

    ###################################################
    ########## User configurable variables ############
    ###################################################

    merlin_dir = "/group/project/cstr1/srikanth/test/merlin"
    exp_dir = os.path.join(merlin_dir, "egs/slt_arctic/s1/experiments/slt_arctic_demo/acoustic_model/")

    inp_dim = 425
    out_dim = 187

    data_dir = os.path.join(exp_dir, "data")
    inp_feat_dir = os.path.join(data_dir, 'nn_no_silence_lab_norm_' + str(inp_dim))
    out_feat_dir = os.path.join(data_dir, 'nn_norm_mgc_lf0_vuv_bap_' + str(out_dim))

    inp_file_ext = '.lab'
    out_file_ext = '.cmp'

    model_dir = os.path.join(exp_dir, 'keras_models')
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    inp_norm = "MINMAX"
    out_norm = "MINMAX"

    stats_dir = os.path.join(exp_dir, 'keras_stats')
    if not os.path.exists(stats_dir):
        os.makedirs(stats_dir)

    #### Main switch variables ####
    NormData = False
    TrainModel = True
    TestModel = True

    demo_mode = True
    if demo_mode:
        train_file_number = 50
        valid_file_number = 5
        test_file_number = 5
    else:
        train_file_number = 1000
        valid_file_number = 66
        test_file_number = 66

    #### Train, valid and test file lists ####
    file_id_scp = os.path.join(data_dir, 'file_id_list_demo.scp')
    file_id_list = data_utils.read_file_list(file_id_scp)

    train_id_list = file_id_list[0:train_file_number]
    valid_id_list = file_id_list[train_file_number:train_file_number + valid_file_number]
    test_id_list = file_id_list[train_file_number + valid_file_number:
                                train_file_number + valid_file_number + test_file_number]

    inp_train_file_list = data_utils.prepare_file_path_list(train_id_list, inp_feat_dir, inp_file_ext)
    out_train_file_list = data_utils.prepare_file_path_list(train_id_list, out_feat_dir, out_file_ext)
    inp_test_file_list = data_utils.prepare_file_path_list(test_id_list, inp_feat_dir, inp_file_ext)
    out_test_file_list = data_utils.prepare_file_path_list(test_id_list, out_feat_dir, out_file_ext)

    ### set to True if training recurrent models ###
    sequential_training = False
    stateful = False

    ### set to True if data to be shuffled ###
    shuffle_data = True

    #### define Model, train and evaluate ####
    if sequential_training:
        if demo_mode:
            hidden_layer_type = ['tanh', 'lstm']
            hidden_layer_size = [512, 512]
        else:
            hidden_layer_type = ['tanh', 'tanh', 'tanh', 'tanh', 'lstm', 'lstm']
            hidden_layer_size = [1024, 1024, 1024, 1024, 512, 512]

        ### batch size: sentences
        batch_size = 25
        training_algo = 1
    else:
        hidden_layer_type = ['tanh', 'tanh', 'tanh', 'tanh', 'tanh', 'tanh']
        hidden_layer_size = [1024, 1024, 1024, 1024, 1024, 1024]

        ### batch size: frames
        batch_size = 256

    optimizer = 'adam'
    output_type = 'linear'
    loss_function = 'mse'
    num_of_epochs = 25
    dropout_rate = 0.0

    if sequential_training:
        combined_model_arch = 'RNN' + str(training_algo)
    else:
        combined_model_arch = 'DNN'
    combined_model_arch += '_' + str(len(hidden_layer_size))
    combined_model_arch += '_' + '_'.join(map(str, hidden_layer_size))
    combined_model_arch += '_' + '_'.join(map(str, hidden_layer_type))

    nnets_file_name = '%s_%d_train_%d_%d_%d_%d_%d_model' \
        % (combined_model_arch, int(shuffle_data), inp_dim, out_dim,
           train_file_number, batch_size, num_of_epochs)

    print('model file : ' + nnets_file_name)

    json_model_file = os.path.join(model_dir, nnets_file_name + '.json')
    h5_model_file = os.path.join(model_dir, nnets_file_name + '.h5')

    inp_stats_file = os.path.join(stats_dir, "input_%d_%s_%d.norm"
                                  % (int(train_file_number), inp_norm, inp_dim))
    out_stats_file = os.path.join(stats_dir, "output_%d_%s_%d.norm"
                                  % (int(train_file_number), out_norm, out_dim))

    inp_scaler = None
    out_scaler = None

    gen_dir = os.path.join(exp_dir, 'gen')
    pred_feat_dir = os.path.join(gen_dir, nnets_file_name)
    if not os.path.exists(pred_feat_dir):
        os.makedirs(pred_feat_dir)

    gen_test_file_list = data_utils.prepare_file_path_list(test_id_list, pred_feat_dir, out_file_ext)
    gen_wav_file_list = data_utils.prepare_file_path_list(test_id_list, pred_feat_dir, '.wav')

    ###################################################
    ########## End of user-defined variables ##########
    ###################################################

    #### Define keras models class ####
    keras_models = TrainKerasModels(inp_dim, hidden_layer_size, out_dim,
                                    hidden_layer_type, output_type,
                                    dropout_rate, loss_function, optimizer)

    if NormData:
        ### normalize train data ###
        if os.path.isfile(inp_stats_file):
            inp_scaler = data_utils.load_norm_stats(inp_stats_file, inp_dim, method=inp_norm)
        else:
            print('preparing train_x from input feature files...')
            train_x, train_flen_x = data_utils.read_data_from_file_list(inp_train_file_list, inp_dim, False)

            print('computing norm stats for train_x...')
            inp_scaler = data_utils.compute_norm_stats(train_x, inp_stats_file, method=inp_norm)

        if os.path.isfile(out_stats_file):
            out_scaler = data_utils.load_norm_stats(out_stats_file, out_dim, method=out_norm)
        else:
            print('preparing train_y from output feature files...')
            train_y, train_flen_y = data_utils.read_data_from_file_list(out_train_file_list, out_dim, False)

            print('computing norm stats for train_y...')
            out_scaler = data_utils.compute_norm_stats(train_y, out_stats_file, method=out_norm)

    if TrainModel:
        #### define the model ####
        if not sequential_training:
            keras_models.define_feedforward_model()
        elif stateful:
            keras_models.define_stateful_model()
        else:
            keras_models.define_sequence_model()

        #### load the data ####
        print('preparing train_x, train_y from input and output feature files...')
        train_x, train_y, train_flen = data_utils.read_data_from_file_list(
            inp_train_file_list, out_train_file_list, inp_dim, out_dim,
            sequential_training=sequential_training)

        #### norm the data ####
        print('normalising the data...')
        data_utils.norm_data(train_x, inp_scaler, sequential_training=sequential_training)
        data_utils.norm_data(train_y, out_scaler, sequential_training=sequential_training)

        #### train the model ####
        if not sequential_training:
            ### Train feedforward model ###
            keras_models.train_feedforward_model(train_x, train_y,
                                                 batch_size=batch_size,
                                                 num_of_epochs=num_of_epochs,
                                                 shuffle_data=shuffle_data)
        else:
            ### Train recurrent model ###
            keras_models.train_sequence_model(train_x, train_y, train_flen,
                                              batch_size=batch_size,
                                              num_of_epochs=num_of_epochs,
                                              shuffle_data=shuffle_data,
                                              training_algo=training_algo)

        #### store the model ####
        keras_models.save_model(json_model_file, h5_model_file)

    if TestModel:
        #### load the model ####
        keras_models.load_model(json_model_file, h5_model_file)

        #### load the data ####
        print('preparing test_x from input feature files...')
        test_x, test_flen = data_utils.read_test_data_from_file_list(inp_test_file_list, inp_dim)

        #### norm the data ####
        data_utils.norm_data(test_x, inp_scaler)

        #### compute predictions ####
        keras_models.predict(test_x, out_scaler, gen_test_file_list, sequential_training)

    (m, s) = divmod(int(time.time() - start_time), 60)
    print("--- Job completion time: %d min. %d sec ---" % (m, s))
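# Standard module entry point (an assumption: the guard is not part of the
# excerpt above, but the script is clearly meant to be run directly):
if __name__ == '__main__':
    main()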
def normalize_data(self):
    # What type of normalization? -- it is given as "method" in compute_norm_stats
    # Check if normalization stat files already exist
    if os.path.isfile(self.inp_stats_file) and os.path.isfile(self.out_stats_file_list[0]):
        self.inp_scaler = data_utils.load_norm_stats(self.inp_stats_file,
                                                     self.inp_dim,
                                                     method=self.inp_norm)
        self.out_scaler_list = []
        for speaker_norm_file in self.out_stats_file_list:
            self.out_scaler_list.append(
                data_utils.load_norm_stats(speaker_norm_file,
                                           self.out_dim,
                                           method=self.out_norm))
    else:
        # Create the scaler objects
        # Data must be in a numpy array for normalization, therefore set sequential_training to False
        print('preparing train_x, train_y from input and output feature files...')
        if len(self.speaker_id) > 1:
            train_x, train_y_list, train_flen = data_utils.read_data_from_file_list_shared_2(
                self.speaker_id,
                self.inp_train_file_list, self.out_train_file_list,
                self.inp_dim, self.out_dim,
                sequential_training=False)
        else:
            train_x, train_y_list, train_flen = data_utils.read_data_from_file_list(
                self.inp_train_file_list, self.out_train_file_list,
                self.inp_dim, self.out_dim,
                sequential_training=False)

        print('computing norm stats for train_x...')
        # Binary input variables are excluded from scaling (the discrete_dict columns are all binary)
        ind = [int(i) for i in self.label_normaliser.discrete_dict.keys()]
        self.inp_scaler = data_utils.compute_norm_stats(train_x,
                                                        self.inp_stats_file,
                                                        method=self.inp_norm,
                                                        no_scaling_ind=ind)

        # The output values should all be continuous except vuv (in the acoustic model)
        print('computing norm stats for train_y...')
        if self.model_output_type == 'acoustic':
            vuv_index = self.out_streams.index('vuv')
            index = [sum([int(num) for num in self.outdims[0:vuv_index]])]
        else:
            index = []

        if not isinstance(train_y_list, list):
            train_y_list = [train_y_list]

        self.out_scaler_list = []
        for train_y, speaker in zip(train_y_list, self.speaker_id):
            # Find the stats file belonging to this speaker
            spk_ind = np.where([speaker in file_name for file_name in self.out_stats_file_list])[0][0]
            out_scaler = data_utils.compute_norm_stats(train_y,
                                                       self.out_stats_file_list[spk_ind],
                                                       method=self.out_norm,
                                                       no_scaling_ind=index)  # exclude the vuv column
            self.out_scaler_list.append(out_scaler)
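# Sketch of the no_scaling_ind idea used above (an assumption about what
# data_utils.compute_norm_stats does with that argument: columns listed in
# no_scaling_ind, e.g. binary label features or the vuv flag, keep identity
# statistics so normalization leaves them untouched). This extends the MVN
# sketch shown after normalize_data earlier in this file.

def mvn_stats_excluding(data, no_scaling_ind):
    """data: (num_frames, feature_dim); no_scaling_ind: column indices to skip."""
    mean = data.mean(axis=0)
    std = data.std(axis=0)
    std[std == 0.0] = 1.0
    # Identity statistics for excluded columns: (x - 0) / 1 == x
    mean[no_scaling_ind] = 0.0
    std[no_scaling_ind] = 1.0
    return mean, std

# Example: column 2 is a binary vuv flag and stays untouched
feats = np.array([[0.5, 3.0, 1.0],
                  [1.5, 5.0, 0.0]])
mean, std = mvn_stats_excluding(feats, no_scaling_ind=[2])
normed = (feats - mean) / std
assert np.array_equal(normed[:, 2], feats[:, 2])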