def train_one_iteration(dir, iter, srand, egs_dir,
                        num_jobs, num_archives_processed, num_archives,
                        learning_rate, shrinkage_value,
                        num_chunk_per_minibatch,
                        num_hidden_layers, add_layers_period,
                        left_context, right_context,
                        apply_deriv_weights, min_deriv_time, max_deriv_time,
                        l2_regularize, xent_regularize, leaky_hmm_coefficient,
                        momentum, max_param_change, shuffle_buffer_size,
                        frame_subsampling_factor, truncate_deriv_weights,
                        run_opts, dropout_edit_string="",
                        background_process_handler=None):
    """ Called from steps/nnet3/chain/train.py for one iteration of neural
    network training with the LF-MMI objective.
    """

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics.
    logger.info("Training neural net (pass {0})".format(iter))

    # check if different iterations use the same random seed
    if os.path.exists('{0}/srand'.format(dir)):
        try:
            saved_srand = int(open('{0}/srand'.format(dir)).readline().strip())
        except (IOError, ValueError):
            logger.error("Exception while reading the random seed "
                         "for training")
            raise
        if srand != saved_srand:
            logger.warning("The random seed provided to this iteration "
                           "(srand={0}) is different from the one saved last "
                           "time (srand={1}). Using srand={0}.".format(
                               srand, saved_srand))
    else:
        with open('{0}/srand'.format(dir), 'w') as f:
            f.write(str(srand))

    # Sets off some background jobs to compute train and
    # validation set objectives
    compute_train_cv_probabilities(
        dir=dir, iter=iter, egs_dir=egs_dir,
        left_context=left_context, right_context=right_context,
        l2_regularize=l2_regularize, xent_regularize=xent_regularize,
        leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts,
        background_process_handler=background_process_handler)

    if iter > 0:
        # Runs in the background
        compute_progress(dir, iter, run_opts,
                         background_process_handler=background_process_handler)

    if (iter > 0 and (iter <= (num_hidden_layers - 1) * add_layers_period)
            and iter % add_layers_period == 0):
        # if we've just added a new hidden layer, don't do averaging but take
        # the best.
        do_average = False

        # integer division so the layer index stays an int under Python 3
        cur_num_hidden_layers = 1 + iter // add_layers_period
        config_file = "{0}/configs/layer{1}.config".format(
            dir, cur_num_hidden_layers)
        raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={lr} "
                            "{dir}/{iter}.mdl - | nnet3-init --srand={srand} "
                            "- {config} - |".format(lr=learning_rate,
                                                    dir=dir, iter=iter,
                                                    srand=iter + srand,
                                                    config=config_file))
        cache_io_opts = ""
    else:
        do_average = True
        if iter == 0:
            # on iteration 0, pick the best, don't average.
            do_average = False
        raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} "
                            "{1}/{2}.mdl - |".format(learning_rate, dir, iter))
        cache_io_opts = "--read-cache={dir}/cache.{iter}".format(dir=dir,
                                                                 iter=iter)

    if do_average:
        cur_num_chunk_per_minibatch = num_chunk_per_minibatch
        cur_max_param_change = max_param_change
    else:
        # on iteration zero or when we just added a layer, use a smaller
        # minibatch size (and we will later choose the output of just one of
        # the jobs): the model-averaging isn't always helpful when the model
        # is changing too fast (i.e. it can worsen the objective function),
        # and the smaller minibatch size will help to keep the update stable.
        cur_num_chunk_per_minibatch = num_chunk_per_minibatch // 2
        cur_max_param_change = float(max_param_change) / math.sqrt(2)

    raw_model_string = '{0} {1}'.format(raw_model_string, dropout_edit_string)

    shrink_info_str = ''
    if shrinkage_value != 1.0:
        shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value)

    logger.info("On iteration {0}, learning rate is {1}"
                "{shrink_info}.".format(
                    iter, learning_rate, shrink_info=shrink_info_str))

    train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs,
                     num_archives_processed=num_archives_processed,
                     num_archives=num_archives,
                     raw_model_string=raw_model_string, egs_dir=egs_dir,
                     left_context=left_context, right_context=right_context,
                     apply_deriv_weights=apply_deriv_weights,
                     min_deriv_time=min_deriv_time,
                     max_deriv_time=max_deriv_time,
                     l2_regularize=l2_regularize,
                     xent_regularize=xent_regularize,
                     leaky_hmm_coefficient=leaky_hmm_coefficient,
                     momentum=momentum,
                     max_param_change=cur_max_param_change,
                     shuffle_buffer_size=shuffle_buffer_size,
                     num_chunk_per_minibatch=cur_num_chunk_per_minibatch,
                     frame_subsampling_factor=frame_subsampling_factor,
                     truncate_deriv_weights=truncate_deriv_weights,
                     cache_io_opts=cache_io_opts, run_opts=run_opts)

    [models_to_average, best_model] = common_train_lib.get_successful_models(
        num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
    nnets_list = []
    for n in models_to_average:
        nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n))

    if do_average:
        # average the output of the different jobs.
        common_train_lib.get_average_nnet_model(
            dir=dir, iter=iter,
            nnets_list=" ".join(nnets_list),
            run_opts=run_opts,
            shrink=shrinkage_value)
    else:
        # choose the best model from different jobs
        common_train_lib.get_best_nnet_model(
            dir=dir, iter=iter,
            best_model_index=best_model,
            run_opts=run_opts,
            shrink=shrinkage_value)

    try:
        for i in range(1, num_jobs + 1):
            os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i))
    except OSError:
        raise Exception("Error while trying to delete the raw models")

    new_model = "{0}/{1}.mdl".format(dir, iter + 1)

    if not os.path.isfile(new_model):
        raise Exception("Could not find {0}, at the end of "
                        "iteration {1}".format(new_model, iter))
    elif os.stat(new_model).st_size == 0:
        raise Exception("{0} has size 0. Something went wrong in "
                        "iteration {1}".format(new_model, iter))
    if os.path.exists("{0}/cache.{1}".format(dir, iter)):
        os.remove("{0}/cache.{1}".format(dir, iter))
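

# Illustrative usage sketch (not part of this library): roughly how the outer
# loop in steps/nnet3/chain/train.py might drive the variant of
# train_one_iteration() defined directly above.  All concrete values below
# (number of jobs, learning rate, context widths, regularization constants,
# etc.) are invented for the example; the real script computes them from its
# command-line options and the egs directory.
def _example_chain_training_loop(dir, egs_dir, num_archives, num_iters,
                                 run_opts):
    num_jobs = 2
    num_archives_processed = 0
    for it in range(num_iters):
        train_one_iteration(
            dir=dir, iter=it, srand=it, egs_dir=egs_dir,
            num_jobs=num_jobs,
            num_archives_processed=num_archives_processed,
            num_archives=num_archives,
            learning_rate=0.001, shrinkage_value=1.0,
            num_chunk_per_minibatch=128,
            num_hidden_layers=5, add_layers_period=2,
            left_context=20, right_context=20,
            apply_deriv_weights=False,
            min_deriv_time=None, max_deriv_time=None,
            l2_regularize=0.00005, xent_regularize=0.1,
            leaky_hmm_coefficient=0.1,
            momentum=0.0, max_param_change=2.0,
            shuffle_buffer_size=5000,
            frame_subsampling_factor=3,
            truncate_deriv_weights=10,
            run_opts=run_opts)
        num_archives_processed += num_jobs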


def train_one_iteration(dir, iter, srand, egs_dir,
                        num_jobs, num_archives_processed, num_archives,
                        learning_rate, minibatch_size_str,
                        momentum, max_param_change, shuffle_buffer_size,
                        run_opts, image_augmentation_opts=None,
                        frames_per_eg=-1,
                        min_deriv_time=None, max_deriv_time_relative=None,
                        shrinkage_value=1.0, dropout_edit_string="",
                        get_raw_nnet_from_am=True,
                        use_multitask_egs=False,
                        backstitch_training_scale=0.0,
                        backstitch_training_interval=1):
    """ Called from steps/nnet3/train_*.py scripts for one iteration of neural
    network training.

    Selected args:
        frames_per_eg: The default value -1 implies chunk_level_training,
            which is particularly applicable to RNN training. If it is > 0, it
            implies frame-level training, which is applicable for DNN
            training. In that case, for each parallel SGE job created, a
            different frame numbered 0..frames_per_eg-1 is used.
        shrinkage_value: If 1.0, no shrinkage is done; otherwise the parameter
            values are scaled by this value.
        get_raw_nnet_from_am: If True, the network is read and stored as an
            acoustic model, i.e. along with the transition model (e.g.
            10.mdl), as opposed to a raw network (e.g. 10.raw) when the value
            is False.
    """

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics.
    logger.info("Training neural net (pass {0})".format(iter))

    # check if different iterations use the same random seed
    if os.path.exists('{0}/srand'.format(dir)):
        try:
            saved_srand = int(open('{0}/srand'.format(dir)).readline().strip())
        except (IOError, ValueError):
            logger.error("Exception while reading the random seed "
                         "for training")
            raise
        if srand != saved_srand:
            logger.warning("The random seed provided to this iteration "
                           "(srand={0}) is different from the one saved last "
                           "time (srand={1}). Using srand={0}.".format(
                               srand, saved_srand))
    else:
        with open('{0}/srand'.format(dir), 'w') as f:
            f.write(str(srand))

    # Sets off some background jobs to compute train and
    # validation set objectives
    compute_train_cv_probabilities(
        dir=dir, iter=iter, egs_dir=egs_dir,
        run_opts=run_opts,
        get_raw_nnet_from_am=get_raw_nnet_from_am,
        use_multitask_egs=use_multitask_egs)

    if iter > 0:
        # Runs in the background
        compute_progress(dir=dir, iter=iter, egs_dir=egs_dir,
                         run_opts=run_opts,
                         get_raw_nnet_from_am=get_raw_nnet_from_am,
                         use_multitask_egs=use_multitask_egs)

    do_average = (iter > 0)

    if get_raw_nnet_from_am:
        raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} "
                            "--scale={1} {2}/{3}.mdl - |".format(
                                learning_rate, shrinkage_value, dir, iter))
    else:
        raw_model_string = ("nnet3-copy --learning-rate={lr} --scale={s} "
                            "{dir}/{iter}.raw - |".format(
                                lr=learning_rate, s=shrinkage_value,
                                dir=dir, iter=iter))

    raw_model_string = raw_model_string + dropout_edit_string

    if do_average:
        cur_minibatch_size_str = minibatch_size_str
        cur_max_param_change = max_param_change
    else:
        # on iteration zero, use a smaller minibatch size (and we will later
        # choose the output of just one of the jobs): the model-averaging
        # isn't always helpful when the model is changing too fast (i.e. it
        # can worsen the objective function), and the smaller minibatch size
        # will help to keep the update stable.
        cur_minibatch_size_str = common_train_lib.halve_minibatch_size_str(
            minibatch_size_str)
        cur_max_param_change = float(max_param_change) / math.sqrt(2)

    shrink_info_str = ''
    if shrinkage_value != 1.0:
        shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value)

    logger.info("On iteration {0}, learning rate is {1}"
                "{shrink_info}.".format(
                    iter, learning_rate, shrink_info=shrink_info_str))

    train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs,
                     num_archives_processed=num_archives_processed,
                     num_archives=num_archives,
                     raw_model_string=raw_model_string, egs_dir=egs_dir,
                     momentum=momentum,
                     max_param_change=cur_max_param_change,
                     shuffle_buffer_size=shuffle_buffer_size,
                     minibatch_size_str=cur_minibatch_size_str,
                     run_opts=run_opts,
                     frames_per_eg=frames_per_eg,
                     min_deriv_time=min_deriv_time,
                     max_deriv_time_relative=max_deriv_time_relative,
                     image_augmentation_opts=image_augmentation_opts,
                     use_multitask_egs=use_multitask_egs,
                     backstitch_training_scale=backstitch_training_scale,
                     backstitch_training_interval=backstitch_training_interval)

    [models_to_average, best_model] = common_train_lib.get_successful_models(
        num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
    nnets_list = []
    for n in models_to_average:
        nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n))

    if do_average:
        # average the output of the different jobs.
        common_train_lib.get_average_nnet_model(
            dir=dir, iter=iter,
            nnets_list=" ".join(nnets_list),
            run_opts=run_opts,
            get_raw_nnet_from_am=get_raw_nnet_from_am)
    else:
        # choose the best model from different jobs
        common_train_lib.get_best_nnet_model(
            dir=dir, iter=iter,
            best_model_index=best_model,
            run_opts=run_opts,
            get_raw_nnet_from_am=get_raw_nnet_from_am)

    try:
        for i in range(1, num_jobs + 1):
            os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i))
    except OSError:
        logger.error("Error while trying to delete the raw models")
        raise

    if get_raw_nnet_from_am:
        new_model = "{0}/{1}.mdl".format(dir, iter + 1)
    else:
        new_model = "{0}/{1}.raw".format(dir, iter + 1)

    if not os.path.isfile(new_model):
        raise Exception("Could not find {0}, at the end of "
                        "iteration {1}".format(new_model, iter))
    elif os.stat(new_model).st_size == 0:
        raise Exception("{0} has size 0. Something went wrong in "
                        "iteration {1}".format(new_model, iter))
    if os.path.exists("{0}/cache.{1}".format(dir, iter)):
        os.remove("{0}/cache.{1}".format(dir, iter))


def train_one_iteration(dir, iter, srand, egs_dir,
                        num_jobs, num_archives_processed, num_archives,
                        learning_rate, shrinkage_value,
                        num_chunk_per_minibatch_str,
                        apply_deriv_weights, min_deriv_time,
                        max_deriv_time_relative,
                        l2_regularize, xent_regularize, leaky_hmm_coefficient,
                        momentum, max_param_change, shuffle_buffer_size,
                        frame_subsampling_factor, run_opts,
                        dropout_edit_string="", train_opts="",
                        backstitch_training_scale=0.0,
                        backstitch_training_interval=1,
                        use_multitask_egs=False):
    """ Called from steps/nnet3/chain/train.py for one iteration for
    neural network training with LF-MMI objective
    """

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics

    # check if different iterations use the same random seed
    if os.path.exists('{0}/srand'.format(dir)):
        try:
            saved_srand = int(open('{0}/srand'.format(dir)).readline().strip())
        except (IOError, ValueError):
            logger.error("Exception while reading the random seed "
                         "for training")
            raise
        if srand != saved_srand:
            logger.warning("The random seed provided to this iteration "
                           "(srand={0}) is different from the one saved last "
                           "time (srand={1}). Using srand={0}.".format(
                               srand, saved_srand))
    else:
        with open('{0}/srand'.format(dir), 'w') as f:
            f.write(str(srand))

    # Sets off some background jobs to compute train and
    # validation set objectives
    compute_train_cv_probabilities(
        dir=dir, iter=iter, egs_dir=egs_dir,
        l2_regularize=l2_regularize, xent_regularize=xent_regularize,
        leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts,
        use_multitask_egs=use_multitask_egs)

    if iter > 0:
        # Runs in the background
        compute_progress(dir, iter, run_opts)

    do_average = (iter > 0)

    raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} "
                        "--scale={1} {2}/{3}.mdl - |".format(
                            learning_rate, shrinkage_value, dir, iter))

    if do_average:
        cur_num_chunk_per_minibatch_str = num_chunk_per_minibatch_str
        cur_max_param_change = max_param_change
    else:
        # on iteration zero, use a smaller minibatch size (and we will later
        # choose the output of just one of the jobs): the model-averaging
        # isn't always helpful when the model is changing too fast (i.e. it
        # can worsen the objective function), and the smaller minibatch size
        # will help to keep the update stable.
        cur_num_chunk_per_minibatch_str = (
            common_train_lib.halve_minibatch_size_str(
                num_chunk_per_minibatch_str))
        cur_max_param_change = float(max_param_change) / math.sqrt(2)

    raw_model_string = raw_model_string + dropout_edit_string

    train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs,
                     num_archives_processed=num_archives_processed,
                     num_archives=num_archives,
                     raw_model_string=raw_model_string, egs_dir=egs_dir,
                     apply_deriv_weights=apply_deriv_weights,
                     min_deriv_time=min_deriv_time,
                     max_deriv_time_relative=max_deriv_time_relative,
                     l2_regularize=l2_regularize,
                     xent_regularize=xent_regularize,
                     leaky_hmm_coefficient=leaky_hmm_coefficient,
                     momentum=momentum,
                     max_param_change=cur_max_param_change,
                     shuffle_buffer_size=shuffle_buffer_size,
                     num_chunk_per_minibatch_str=cur_num_chunk_per_minibatch_str,
                     frame_subsampling_factor=frame_subsampling_factor,
                     run_opts=run_opts, train_opts=train_opts,
                     # linearly increase backstitch_training_scale during the
                     # first few iterations (hard-coded as 15)
                     backstitch_training_scale=(
                         backstitch_training_scale * iter / 15
                         if iter < 15 else backstitch_training_scale),
                     backstitch_training_interval=backstitch_training_interval,
                     use_multitask_egs=use_multitask_egs)

    [models_to_average, best_model] = common_train_lib.get_successful_models(
        num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
    nnets_list = []
    for n in models_to_average:
        nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n))

    if do_average:
        # average the output of the different jobs.
        common_train_lib.get_average_nnet_model(
            dir=dir, iter=iter,
            nnets_list=" ".join(nnets_list),
            run_opts=run_opts)
    else:
        # choose the best model from different jobs
        common_train_lib.get_best_nnet_model(
            dir=dir, iter=iter,
            best_model_index=best_model,
            run_opts=run_opts)

    try:
        for i in range(1, num_jobs + 1):
            os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i))
    except OSError:
        raise Exception("Error while trying to delete the raw models")

    new_model = "{0}/{1}.mdl".format(dir, iter + 1)

    if not os.path.isfile(new_model):
        raise Exception("Could not find {0}, at the end of "
                        "iteration {1}".format(new_model, iter))
    elif os.stat(new_model).st_size == 0:
        raise Exception("{0} has size 0. Something went wrong in "
                        "iteration {1}".format(new_model, iter))
    if os.path.exists("{0}/cache.{1}".format(dir, iter)):
        os.remove("{0}/cache.{1}".format(dir, iter))


def train_one_iteration(dir, iter, srand, egs_dir,
                        num_jobs, num_archives_processed, num_archives,
                        learning_rate, minibatch_size_str,
                        momentum, max_param_change, shuffle_buffer_size,
                        run_opts, image_augmentation_opts=None,
                        frames_per_eg=-1,
                        min_deriv_time=None, max_deriv_time_relative=None,
                        shrinkage_value=1.0, dropout_edit_string="",
                        get_raw_nnet_from_am=True,
                        use_multitask_egs=False,
                        backstitch_training_scale=0.0,
                        backstitch_training_interval=1,
                        compute_per_dim_accuracy=False):
    """ Called from steps/nnet3/train_*.py scripts for one iteration of neural
    network training.

    Selected args:
        frames_per_eg: The default value -1 implies chunk_level_training,
            which is particularly applicable to RNN training. If it is > 0, it
            implies frame-level training, which is applicable for DNN
            training. In that case, for each parallel SGE job created, a
            different frame numbered 0..frames_per_eg-1 is used.
        shrinkage_value: If 1.0, no shrinkage is done; otherwise the parameter
            values are scaled by this value.
        get_raw_nnet_from_am: If True, the network is read and stored as an
            acoustic model, i.e. along with the transition model (e.g.
            10.mdl), as opposed to a raw network (e.g. 10.raw) when the value
            is False.
    """

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics.
    logger.info("Training neural net (pass {0})".format(iter))

    # check if different iterations use the same random seed
    if os.path.exists('{0}/srand'.format(dir)):
        try:
            saved_srand = int(open('{0}/srand'.format(dir)).readline().strip())
        except (IOError, ValueError):
            logger.error("Exception while reading the random seed "
                         "for training")
            raise
        if srand != saved_srand:
            logger.warning("The random seed provided to this iteration "
                           "(srand={0}) is different from the one saved last "
                           "time (srand={1}). Using srand={0}.".format(
                               srand, saved_srand))
    else:
        with open('{0}/srand'.format(dir), 'w') as f:
            f.write(str(srand))

    # Sets off some background jobs to compute train and
    # validation set objectives
    compute_train_cv_probabilities(
        dir=dir, iter=iter, egs_dir=egs_dir,
        run_opts=run_opts,
        get_raw_nnet_from_am=get_raw_nnet_from_am,
        use_multitask_egs=use_multitask_egs,
        compute_per_dim_accuracy=compute_per_dim_accuracy)

    if iter > 0:
        # Runs in the background
        compute_progress(dir=dir, iter=iter, egs_dir=egs_dir,
                         run_opts=run_opts,
                         get_raw_nnet_from_am=get_raw_nnet_from_am,
                         use_multitask_egs=use_multitask_egs)

    do_average = (iter > 0)

    if get_raw_nnet_from_am:
        raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} "
                            "--scale={1} {2}/{3}.mdl - |".format(
                                learning_rate, shrinkage_value, dir, iter))
    else:
        raw_model_string = ("nnet3-copy --learning-rate={lr} --scale={s} "
                            "{dir}/{iter}.raw - |".format(
                                lr=learning_rate, s=shrinkage_value,
                                dir=dir, iter=iter))

    raw_model_string = raw_model_string + dropout_edit_string

    if do_average:
        cur_minibatch_size_str = minibatch_size_str
        cur_max_param_change = max_param_change
    else:
        # on iteration zero, use a smaller minibatch size (and we will later
        # choose the output of just one of the jobs): the model-averaging
        # isn't always helpful when the model is changing too fast (i.e. it
        # can worsen the objective function), and the smaller minibatch size
        # will help to keep the update stable.
        cur_minibatch_size_str = common_train_lib.halve_minibatch_size_str(
            minibatch_size_str)
        cur_max_param_change = float(max_param_change) / math.sqrt(2)

    shrink_info_str = ''
    if shrinkage_value != 1.0:
        shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value)

    logger.info("On iteration {0}, learning rate is {1}"
                "{shrink_info}.".format(
                    iter, learning_rate, shrink_info=shrink_info_str))

    train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs,
                     num_archives_processed=num_archives_processed,
                     num_archives=num_archives,
                     raw_model_string=raw_model_string, egs_dir=egs_dir,
                     momentum=momentum,
                     max_param_change=cur_max_param_change,
                     shuffle_buffer_size=shuffle_buffer_size,
                     minibatch_size_str=cur_minibatch_size_str,
                     run_opts=run_opts,
                     frames_per_eg=frames_per_eg,
                     min_deriv_time=min_deriv_time,
                     max_deriv_time_relative=max_deriv_time_relative,
                     image_augmentation_opts=image_augmentation_opts,
                     use_multitask_egs=use_multitask_egs,
                     backstitch_training_scale=backstitch_training_scale,
                     backstitch_training_interval=backstitch_training_interval)

    [models_to_average, best_model] = common_train_lib.get_successful_models(
        num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
    nnets_list = []
    for n in models_to_average:
        nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n))

    if do_average:
        # average the output of the different jobs.
        common_train_lib.get_average_nnet_model(
            dir=dir, iter=iter,
            nnets_list=" ".join(nnets_list),
            run_opts=run_opts,
            get_raw_nnet_from_am=get_raw_nnet_from_am)
    else:
        # choose the best model from different jobs
        common_train_lib.get_best_nnet_model(
            dir=dir, iter=iter,
            best_model_index=best_model,
            run_opts=run_opts,
            get_raw_nnet_from_am=get_raw_nnet_from_am)

    try:
        for i in range(1, num_jobs + 1):
            os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i))
    except OSError:
        logger.error("Error while trying to delete the raw models")
        raise

    if get_raw_nnet_from_am:
        new_model = "{0}/{1}.mdl".format(dir, iter + 1)
    else:
        new_model = "{0}/{1}.raw".format(dir, iter + 1)

    if not os.path.isfile(new_model):
        raise Exception("Could not find {0}, at the end of "
                        "iteration {1}".format(new_model, iter))
    elif os.stat(new_model).st_size == 0:
        raise Exception("{0} has size 0. Something went wrong in "
                        "iteration {1}".format(new_model, iter))
    if os.path.exists("{0}/cache.{1}".format(dir, iter)):
        os.remove("{0}/cache.{1}".format(dir, iter))
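

# A minimal sketch of the iteration-zero adjustment used above, assuming
# common_train_lib.halve_minibatch_size_str behaves as its name suggests
# (halving the sizes in a minibatch-size string).  The helper name
# _adjust_for_no_averaging is made up for this example; the training code
# above performs the same computation inline.
def _adjust_for_no_averaging(minibatch_size_str, max_param_change, do_average):
    """Return the (minibatch_size_str, max_param_change) to use this
    iteration: unchanged when job outputs will be averaged, otherwise a halved
    minibatch size and a max-param-change reduced by sqrt(2)."""
    if do_average:
        return minibatch_size_str, max_param_change
    return (common_train_lib.halve_minibatch_size_str(minibatch_size_str),
            float(max_param_change) / math.sqrt(2))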


def train_one_iteration(dir, iter, srand, egs_dir,
                        num_jobs, num_archives_processed, num_archives,
                        learning_rate, minibatch_size,
                        num_hidden_layers, add_layers_period,
                        left_context, right_context,
                        momentum, max_param_change, shuffle_buffer_size,
                        run_opts, cv_minibatch_size=256, frames_per_eg=-1,
                        min_deriv_time=None, max_deriv_time=None,
                        shrinkage_value=1.0, dropout_edit_string="",
                        get_raw_nnet_from_am=True,
                        background_process_handler=None):
    """ Called from steps/nnet3/train_*.py scripts for one iteration of neural
    network training.

    Args:
        frames_per_eg: The default value -1 implies chunk_level_training,
            which is particularly applicable to RNN training. If it is > 0, it
            implies frame-level training, which is applicable for DNN
            training. In that case, for each parallel SGE job created, a
            different frame numbered 0..frames_per_eg-1 is used.
        min_deriv_time: Applicable for RNN training. A default value of None
            implies a min_deriv_time of 0 is used. During RNN training, its
            value is set to chunk_width - num_bptt_steps in the training
            script.
        shrinkage_value: If 1.0, no shrinkage is done; otherwise the parameter
            values are scaled by this value.
        get_raw_nnet_from_am: If True, the network is read and stored as an
            acoustic model, i.e. along with the transition model (e.g.
            10.mdl), as opposed to a raw network (e.g. 10.raw) when the value
            is False.
    """

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics.
    logger.info("Training neural net (pass {0})".format(iter))

    # check if different iterations use the same random seed
    if os.path.exists('{0}/srand'.format(dir)):
        try:
            saved_srand = int(open('{0}/srand'.format(dir)).readline().strip())
        except (IOError, ValueError):
            logger.error("Exception while reading the random seed "
                         "for training")
            raise
        if srand != saved_srand:
            logger.warning("The random seed provided to this iteration "
                           "(srand={0}) is different from the one saved last "
                           "time (srand={1}). Using srand={0}.".format(
                               srand, saved_srand))
    else:
        with open('{0}/srand'.format(dir), 'w') as f:
            f.write(str(srand))

    # Sets off some background jobs to compute train and
    # validation set objectives
    compute_train_cv_probabilities(
        dir=dir, iter=iter, egs_dir=egs_dir,
        left_context=left_context, right_context=right_context,
        run_opts=run_opts, mb_size=cv_minibatch_size,
        get_raw_nnet_from_am=get_raw_nnet_from_am, wait=False,
        background_process_handler=background_process_handler)

    if iter > 0:
        # Runs in the background
        compute_progress(
            dir=dir, iter=iter, egs_dir=egs_dir,
            left_context=left_context, right_context=right_context,
            run_opts=run_opts, mb_size=cv_minibatch_size, wait=False,
            get_raw_nnet_from_am=get_raw_nnet_from_am,
            background_process_handler=background_process_handler)

    # an option for writing cache (storing pairs of nnet-computations
    # and computation-requests) during training.
    cache_read_opt = ""
    if (iter > 0 and (iter <= (num_hidden_layers - 1) * add_layers_period)
            and iter % add_layers_period == 0):
        # if we've just added a new hidden layer, don't do averaging but take
        # the best.
        do_average = False

        # integer division so the layer index stays an int under Python 3
        cur_num_hidden_layers = 1 + iter // add_layers_period
        config_file = "{0}/configs/layer{1}.config".format(
            dir, cur_num_hidden_layers)
        if get_raw_nnet_from_am:
            raw_model_string = ("nnet3-am-copy --raw=true "
                                "--learning-rate={lr} {dir}/{iter}.mdl - | "
                                "nnet3-init --srand={srand} - "
                                "{config} - |".format(lr=learning_rate,
                                                      dir=dir, iter=iter,
                                                      srand=iter + srand,
                                                      config=config_file))
        else:
            raw_model_string = ("nnet3-copy --learning-rate={lr} "
                                "{dir}/{iter}.raw - | "
                                "nnet3-init --srand={srand} - "
                                "{config} - |".format(lr=learning_rate,
                                                      dir=dir, iter=iter,
                                                      srand=iter + srand,
                                                      config=config_file))
    else:
        do_average = True
        if iter == 0:
            # on iteration 0, pick the best, don't average.
            do_average = False
        else:
            cache_read_opt = "--read-cache={dir}/cache.{iter}".format(
                dir=dir, iter=iter)
        if get_raw_nnet_from_am:
            raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} "
                                "{1}/{2}.mdl - |".format(
                                    learning_rate, dir, iter))
        else:
            raw_model_string = ("nnet3-copy --learning-rate={lr} "
                                "{dir}/{iter}.raw - |".format(
                                    lr=learning_rate, dir=dir, iter=iter))

    raw_model_string = raw_model_string + dropout_edit_string

    if do_average:
        cur_minibatch_size = minibatch_size
        cur_max_param_change = max_param_change
    else:
        # on iteration zero or when we just added a layer, use a smaller
        # minibatch size (and we will later choose the output of just one of
        # the jobs): the model-averaging isn't always helpful when the model
        # is changing too fast (i.e. it can worsen the objective function),
        # and the smaller minibatch size will help to keep the update stable.
        cur_minibatch_size = minibatch_size // 2
        cur_max_param_change = float(max_param_change) / math.sqrt(2)

    try:
        os.remove("{0}/.error".format(dir))
    except OSError:
        pass

    shrink_info_str = ''
    if shrinkage_value != 1.0:
        shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value)

    logger.info("On iteration {0}, learning rate is {1}"
                "{shrink_info}.".format(iter, learning_rate,
                                        shrink_info=shrink_info_str))

    train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs,
                     num_archives_processed=num_archives_processed,
                     num_archives=num_archives,
                     raw_model_string=raw_model_string, egs_dir=egs_dir,
                     left_context=left_context, right_context=right_context,
                     momentum=momentum,
                     max_param_change=cur_max_param_change,
                     shuffle_buffer_size=shuffle_buffer_size,
                     minibatch_size=cur_minibatch_size,
                     cache_read_opt=cache_read_opt, run_opts=run_opts,
                     frames_per_eg=frames_per_eg,
                     min_deriv_time=min_deriv_time,
                     max_deriv_time=max_deriv_time)

    [models_to_average, best_model] = common_train_lib.get_successful_models(
        num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
    nnets_list = []
    for n in models_to_average:
        nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n))

    if do_average:
        # average the output of the different jobs.
        common_train_lib.get_average_nnet_model(
            dir=dir, iter=iter,
            nnets_list=" ".join(nnets_list),
            run_opts=run_opts,
            get_raw_nnet_from_am=get_raw_nnet_from_am,
            shrink=shrinkage_value)
    else:
        # choose the best model from different jobs
        common_train_lib.get_best_nnet_model(
            dir=dir, iter=iter,
            best_model_index=best_model,
            run_opts=run_opts,
            get_raw_nnet_from_am=get_raw_nnet_from_am,
            shrink=shrinkage_value)

    try:
        for i in range(1, num_jobs + 1):
            os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i))
    except OSError:
        logger.error("Error while trying to delete the raw models")
        raise

    if get_raw_nnet_from_am:
        new_model = "{0}/{1}.mdl".format(dir, iter + 1)
    else:
        new_model = "{0}/{1}.raw".format(dir, iter + 1)

    if not os.path.isfile(new_model):
        raise Exception("Could not find {0}, at the end of "
                        "iteration {1}".format(new_model, iter))
    elif os.stat(new_model).st_size == 0:
        raise Exception("{0} has size 0. Something went wrong in "
                        "iteration {1}".format(new_model, iter))
    if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)):
        os.remove("{0}/cache.{1}".format(dir, iter))


def train_one_iteration(dir, iter, srand, egs_dir,
                        num_jobs, num_archives_processed, num_archives,
                        learning_rate, minibatch_size,
                        num_hidden_layers, add_layers_period,
                        left_context, right_context,
                        momentum, max_param_change, shuffle_buffer_size,
                        run_opts, cv_minibatch_size=256, frames_per_eg=-1,
                        min_deriv_time=None, max_deriv_time=None,
                        shrinkage_value=1.0,
                        get_raw_nnet_from_am=True,
                        background_process_handler=None):
    """ Called from steps/nnet3/train_*.py scripts for one iteration of neural
    network training.

    Args:
        frames_per_eg: The default value -1 implies chunk_level_training,
            which is particularly applicable to RNN training. If it is > 0, it
            implies frame-level training, which is applicable for DNN
            training. In that case, for each parallel SGE job created, a
            different frame numbered 0..frames_per_eg-1 is used.
        min_deriv_time: Applicable for RNN training. A default value of None
            implies a min_deriv_time of 0 is used. During RNN training, its
            value is set to chunk_width - num_bptt_steps in the training
            script.
        shrinkage_value: If 1.0, no shrinkage is done; otherwise the parameter
            values are scaled by this value.
        get_raw_nnet_from_am: If True, the network is read and stored as an
            acoustic model, i.e. along with the transition model (e.g.
            10.mdl), as opposed to a raw network (e.g. 10.raw) when the value
            is False.
    """

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics.
    logger.info("Training neural net (pass {0})".format(iter))

    # check if different iterations use the same random seed
    if os.path.exists('{0}/srand'.format(dir)):
        try:
            saved_srand = int(open('{0}/srand'.format(dir)).readline().strip())
        except (IOError, ValueError) as e:
            raise Exception("Exception while reading the random seed "
                            "for training: {0}".format(str(e)))
        if srand != saved_srand:
            logger.warning("The random seed provided to this iteration "
                           "(srand={0}) is different from the one saved last "
                           "time (srand={1}). Using srand={0}.".format(
                               srand, saved_srand))
    else:
        with open('{0}/srand'.format(dir), 'w') as f:
            f.write(str(srand))

    # Sets off some background jobs to compute train and
    # validation set objectives
    compute_train_cv_probabilities(
        dir=dir, iter=iter, egs_dir=egs_dir,
        left_context=left_context, right_context=right_context,
        run_opts=run_opts, mb_size=cv_minibatch_size,
        get_raw_nnet_from_am=get_raw_nnet_from_am, wait=False,
        background_process_handler=background_process_handler)

    if iter > 0:
        # Runs in the background
        compute_progress(
            dir=dir, iter=iter, egs_dir=egs_dir,
            left_context=left_context, right_context=right_context,
            run_opts=run_opts, mb_size=cv_minibatch_size, wait=False,
            get_raw_nnet_from_am=get_raw_nnet_from_am,
            background_process_handler=background_process_handler)

    # an option for writing cache (storing pairs of nnet-computations
    # and computation-requests) during training.
    cache_read_opt = ""
    if (iter > 0 and (iter <= (num_hidden_layers - 1) * add_layers_period)
            and iter % add_layers_period == 0):
        # if we've just added a new hidden layer, don't do averaging but take
        # the best.
        do_average = False

        # integer division so the layer index stays an int under Python 3
        cur_num_hidden_layers = 1 + iter // add_layers_period
        config_file = "{0}/configs/layer{1}.config".format(
            dir, cur_num_hidden_layers)
        if get_raw_nnet_from_am:
            raw_model_string = ("nnet3-am-copy --raw=true "
                                "--learning-rate={lr} {dir}/{iter}.mdl - | "
                                "nnet3-init --srand={srand} - "
                                "{config} - |".format(
                                    lr=learning_rate, dir=dir, iter=iter,
                                    srand=iter + srand, config=config_file))
        else:
            raw_model_string = ("nnet3-copy --learning-rate={lr} "
                                "{dir}/{iter}.raw - | "
                                "nnet3-init --srand={srand} - "
                                "{config} - |".format(
                                    lr=learning_rate, dir=dir, iter=iter,
                                    srand=iter + srand, config=config_file))
    else:
        do_average = True
        if iter == 0:
            # on iteration 0, pick the best, don't average.
            do_average = False
        else:
            cache_read_opt = "--read-cache={dir}/cache.{iter}".format(
                dir=dir, iter=iter)
        if get_raw_nnet_from_am:
            raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} "
                                "{1}/{2}.mdl - |".format(learning_rate,
                                                         dir, iter))
        else:
            raw_model_string = ("nnet3-copy --learning-rate={lr} "
                                "{dir}/{iter}.raw - |".format(
                                    lr=learning_rate, dir=dir, iter=iter))

    if do_average:
        cur_minibatch_size = minibatch_size
        cur_max_param_change = max_param_change
    else:
        # on iteration zero or when we just added a layer, use a smaller
        # minibatch size (and we will later choose the output of just one of
        # the jobs): the model-averaging isn't always helpful when the model
        # is changing too fast (i.e. it can worsen the objective function),
        # and the smaller minibatch size will help to keep the update stable.
        cur_minibatch_size = minibatch_size // 2
        cur_max_param_change = float(max_param_change) / math.sqrt(2)

    try:
        os.remove("{0}/.error".format(dir))
    except OSError:
        pass

    train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs,
                     num_archives_processed=num_archives_processed,
                     num_archives=num_archives,
                     raw_model_string=raw_model_string, egs_dir=egs_dir,
                     left_context=left_context, right_context=right_context,
                     momentum=momentum,
                     max_param_change=cur_max_param_change,
                     shuffle_buffer_size=shuffle_buffer_size,
                     minibatch_size=cur_minibatch_size,
                     cache_read_opt=cache_read_opt, run_opts=run_opts,
                     frames_per_eg=frames_per_eg,
                     min_deriv_time=min_deriv_time,
                     max_deriv_time=max_deriv_time)

    [models_to_average, best_model] = common_train_lib.get_successful_models(
        num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
    nnets_list = []
    for n in models_to_average:
        nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n))

    if do_average:
        # average the output of the different jobs.
        common_train_lib.get_average_nnet_model(
            dir=dir, iter=iter,
            nnets_list=" ".join(nnets_list),
            run_opts=run_opts,
            get_raw_nnet_from_am=get_raw_nnet_from_am,
            shrink=shrinkage_value)
    else:
        # choose the best model from different jobs
        common_train_lib.get_best_nnet_model(
            dir=dir, iter=iter,
            best_model_index=best_model,
            run_opts=run_opts,
            get_raw_nnet_from_am=get_raw_nnet_from_am,
            shrink=shrinkage_value)

    try:
        for i in range(1, num_jobs + 1):
            os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i))
    except OSError:
        raise Exception("Error while trying to delete the raw models")

    if get_raw_nnet_from_am:
        new_model = "{0}/{1}.mdl".format(dir, iter + 1)
    else:
        new_model = "{0}/{1}.raw".format(dir, iter + 1)

    if not os.path.isfile(new_model):
        raise Exception("Could not find {0}, at the end of "
                        "iteration {1}".format(new_model, iter))
    elif os.stat(new_model).st_size == 0:
        raise Exception("{0} has size 0. Something went wrong in "
                        "iteration {1}".format(new_model, iter))
    if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)):
        os.remove("{0}/cache.{1}".format(dir, iter))


def train_one_iteration(dir, iter, srand, egs_dir,
                        num_jobs, num_archives_processed, num_archives,
                        learning_rate, shrinkage_value,
                        num_chunk_per_minibatch_str,
                        apply_deriv_weights, min_deriv_time,
                        max_deriv_time_relative,
                        l2_regularize, xent_regularize, leaky_hmm_coefficient,
                        momentum, max_param_change, shuffle_buffer_size,
                        frame_subsampling_factor, run_opts,
                        dropout_edit_string="", train_opts="", chain_opts="",
                        backstitch_training_scale=0.0,
                        backstitch_training_interval=1,
                        use_multitask_egs=False):
    """ Called from steps/nnet3/chain/train.py for one iteration for
    neural network training with LF-MMI objective
    """

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics

    # check if different iterations use the same random seed
    if os.path.exists('{0}/srand'.format(dir)):
        try:
            saved_srand = int(open('{0}/srand'.format(dir)).readline().strip())
        except (IOError, ValueError):
            logger.error("Exception while reading the random seed "
                         "for training")
            raise
        if srand != saved_srand:
            logger.warning("The random seed provided to this iteration "
                           "(srand={0}) is different from the one saved last "
                           "time (srand={1}). Using srand={0}.".format(
                               srand, saved_srand))
    else:
        with open('{0}/srand'.format(dir), 'w') as f:
            f.write(str(srand))

    # Sets off some background jobs to compute train and
    # validation set objectives
    compute_train_cv_probabilities(
        dir=dir, iter=iter, egs_dir=egs_dir,
        l2_regularize=l2_regularize, xent_regularize=xent_regularize,
        leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts,
        use_multitask_egs=use_multitask_egs, chain_opts=chain_opts)

    if iter > 0:
        # Runs in the background
        compute_progress(dir, iter, run_opts)

    do_average = (iter > 0)

    raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} "
                        "--scale={1} {2}/{3}.mdl - |".format(
                            learning_rate, shrinkage_value, dir, iter))

    if do_average:
        cur_num_chunk_per_minibatch_str = num_chunk_per_minibatch_str
        cur_max_param_change = max_param_change
    else:
        # on iteration zero, use a smaller minibatch size (and we will later
        # choose the output of just one of the jobs): the model-averaging
        # isn't always helpful when the model is changing too fast (i.e. it
        # can worsen the objective function), and the smaller minibatch size
        # will help to keep the update stable.
        cur_num_chunk_per_minibatch_str = (
            common_train_lib.halve_minibatch_size_str(
                num_chunk_per_minibatch_str))
        cur_max_param_change = float(max_param_change) / math.sqrt(2)

    raw_model_string = raw_model_string + dropout_edit_string

    train_new_models(
        dir=dir, iter=iter, srand=srand, num_jobs=num_jobs,
        num_archives_processed=num_archives_processed,
        num_archives=num_archives,
        raw_model_string=raw_model_string, egs_dir=egs_dir,
        apply_deriv_weights=apply_deriv_weights,
        min_deriv_time=min_deriv_time,
        max_deriv_time_relative=max_deriv_time_relative,
        l2_regularize=l2_regularize, xent_regularize=xent_regularize,
        leaky_hmm_coefficient=leaky_hmm_coefficient,
        momentum=momentum,
        max_param_change=cur_max_param_change,
        shuffle_buffer_size=shuffle_buffer_size,
        num_chunk_per_minibatch_str=cur_num_chunk_per_minibatch_str,
        frame_subsampling_factor=frame_subsampling_factor,
        run_opts=run_opts, train_opts=train_opts, chain_opts=chain_opts,
        # linearly increase backstitch_training_scale during the
        # first few iterations (hard-coded as 15)
        backstitch_training_scale=(backstitch_training_scale * iter / 15
                                   if iter < 15
                                   else backstitch_training_scale),
        backstitch_training_interval=backstitch_training_interval,
        use_multitask_egs=use_multitask_egs)

    [models_to_average, best_model] = common_train_lib.get_successful_models(
        num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
    nnets_list = []
    for n in models_to_average:
        nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n))

    if do_average:
        # average the output of the different jobs.
        common_train_lib.get_average_nnet_model(
            dir=dir, iter=iter,
            nnets_list=" ".join(nnets_list),
            run_opts=run_opts)
    else:
        # choose the best model from different jobs
        common_train_lib.get_best_nnet_model(dir=dir, iter=iter,
                                             best_model_index=best_model,
                                             run_opts=run_opts)

    try:
        for i in range(1, num_jobs + 1):
            os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i))
    except OSError:
        raise Exception("Error while trying to delete the raw models")

    new_model = "{0}/{1}.mdl".format(dir, iter + 1)

    if not os.path.isfile(new_model):
        raise Exception("Could not find {0}, at the end of "
                        "iteration {1}".format(new_model, iter))
    elif os.stat(new_model).st_size == 0:
        raise Exception("{0} has size 0. Something went wrong in "
                        "iteration {1}".format(new_model, iter))
    if os.path.exists("{0}/cache.{1}".format(dir, iter)):
        os.remove("{0}/cache.{1}".format(dir, iter))
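

# The backstitch scale passed to train_new_models() above is ramped up
# linearly over the first 15 iterations.  A minimal sketch of that schedule as
# a standalone function; the name _backstitch_scale_for_iter is made up for
# this example and is not used by the code above.
def _backstitch_scale_for_iter(backstitch_training_scale, iter, ramp_iters=15):
    """Return the backstitch scale for this iteration, increasing linearly
    from 0 to backstitch_training_scale over the first ramp_iters
    iterations."""
    if iter < ramp_iters:
        return backstitch_training_scale * iter / float(ramp_iters)
    return backstitch_training_scale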


def train_one_iteration(dir, iter, srand, egs_dir,
                        num_jobs, num_archives_processed, num_archives,
                        learning_rate, shrinkage_value,
                        num_chunk_per_minibatch,
                        num_hidden_layers, add_layers_period,
                        left_context, right_context,
                        apply_deriv_weights, min_deriv_time, max_deriv_time,
                        l2_regularize, xent_regularize, leaky_hmm_coefficient,
                        momentum, max_param_change, shuffle_buffer_size,
                        frame_subsampling_factor, truncate_deriv_weights,
                        run_opts, background_process_handler=None):
    """ Called from steps/nnet3/chain/train.py for one iteration of neural
    network training with the LF-MMI objective.
    """

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics.
    logger.info("Training neural net (pass {0})".format(iter))

    # check if different iterations use the same random seed
    if os.path.exists('{0}/srand'.format(dir)):
        try:
            saved_srand = int(open('{0}/srand'.format(dir)).readline().strip())
        except (IOError, ValueError) as e:
            raise Exception("Exception while reading the random seed "
                            "for training: {0}".format(str(e)))
        if srand != saved_srand:
            logger.warning("The random seed provided to this iteration "
                           "(srand={0}) is different from the one saved last "
                           "time (srand={1}). Using srand={0}.".format(
                               srand, saved_srand))
    else:
        with open('{0}/srand'.format(dir), 'w') as f:
            f.write(str(srand))

    # Sets off some background jobs to compute train and
    # validation set objectives
    compute_train_cv_probabilities(
        dir=dir, iter=iter, egs_dir=egs_dir,
        left_context=left_context, right_context=right_context,
        l2_regularize=l2_regularize, xent_regularize=xent_regularize,
        leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts,
        background_process_handler=background_process_handler)

    if iter > 0:
        # Runs in the background
        compute_progress(dir, iter, run_opts,
                         background_process_handler=background_process_handler)

    if (iter > 0 and (iter <= (num_hidden_layers - 1) * add_layers_period)
            and iter % add_layers_period == 0):
        # if we've just added a new hidden layer, don't do averaging but take
        # the best.
        do_average = False

        # integer division so the layer index stays an int under Python 3
        cur_num_hidden_layers = 1 + iter // add_layers_period
        config_file = "{0}/configs/layer{1}.config".format(
            dir, cur_num_hidden_layers)
        raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={lr} "
                            "{dir}/{iter}.mdl - | nnet3-init --srand={srand} "
                            "- {config} - |".format(lr=learning_rate,
                                                    dir=dir, iter=iter,
                                                    srand=iter + srand,
                                                    config=config_file))
        cache_io_opts = ""
    else:
        do_average = True
        if iter == 0:
            # on iteration 0, pick the best, don't average.
            do_average = False
        raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} "
                            "{1}/{2}.mdl - |".format(learning_rate, dir, iter))
        cache_io_opts = "--read-cache={dir}/cache.{iter}".format(dir=dir,
                                                                 iter=iter)

    if do_average:
        cur_num_chunk_per_minibatch = num_chunk_per_minibatch
        cur_max_param_change = max_param_change
    else:
        # on iteration zero or when we just added a layer, use a smaller
        # minibatch size (and we will later choose the output of just one of
        # the jobs): the model-averaging isn't always helpful when the model
        # is changing too fast (i.e. it can worsen the objective function),
        # and the smaller minibatch size will help to keep the update stable.
        cur_num_chunk_per_minibatch = num_chunk_per_minibatch // 2
        cur_max_param_change = float(max_param_change) / math.sqrt(2)

    train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs,
                     num_archives_processed=num_archives_processed,
                     num_archives=num_archives,
                     raw_model_string=raw_model_string, egs_dir=egs_dir,
                     left_context=left_context, right_context=right_context,
                     apply_deriv_weights=apply_deriv_weights,
                     min_deriv_time=min_deriv_time,
                     max_deriv_time=max_deriv_time,
                     l2_regularize=l2_regularize,
                     xent_regularize=xent_regularize,
                     leaky_hmm_coefficient=leaky_hmm_coefficient,
                     momentum=momentum,
                     max_param_change=cur_max_param_change,
                     shuffle_buffer_size=shuffle_buffer_size,
                     num_chunk_per_minibatch=cur_num_chunk_per_minibatch,
                     frame_subsampling_factor=frame_subsampling_factor,
                     truncate_deriv_weights=truncate_deriv_weights,
                     cache_io_opts=cache_io_opts, run_opts=run_opts)

    [models_to_average, best_model] = common_train_lib.get_successful_models(
        num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
    nnets_list = []
    for n in models_to_average:
        nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n))

    if do_average:
        # average the output of the different jobs.
        common_train_lib.get_average_nnet_model(
            dir=dir, iter=iter,
            nnets_list=" ".join(nnets_list),
            run_opts=run_opts,
            shrink=shrinkage_value)
    else:
        # choose the best model from different jobs
        common_train_lib.get_best_nnet_model(dir=dir, iter=iter,
                                             best_model_index=best_model,
                                             run_opts=run_opts,
                                             shrink=shrinkage_value)

    try:
        for i in range(1, num_jobs + 1):
            os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i))
    except OSError:
        raise Exception("Error while trying to delete the raw models")

    new_model = "{0}/{1}.mdl".format(dir, iter + 1)

    if not os.path.isfile(new_model):
        raise Exception("Could not find {0}, at the end of "
                        "iteration {1}".format(new_model, iter))
    elif os.stat(new_model).st_size == 0:
        raise Exception("{0} has size 0. Something went wrong in "
                        "iteration {1}".format(new_model, iter))
    if os.path.exists("{0}/cache.{1}".format(dir, iter)):
        os.remove("{0}/cache.{1}".format(dir, iter))