def prepare_initial_acoustic_model(dir, run_opts, srand=-1, input_model=None):
    """Create the initial 'chain' acoustic model {dir}/0.mdl.

    The transition model {dir}/0.trans_mdl and a raw network are handed to
    nnet3-am-init, launched through run_opts.command, to produce
    {dir}/0.mdl.

    If 'input_model' is None, the initial network is prepared first with
    common_train_lib.prepare_initial_network() (the step that adds the first
    layer) and {dir}/0.raw is used as the raw model.  If 'input_model' is
    specified, that preparation is skipped and 'input_model' is used as the
    initial 'raw' model instead of '0.raw'.
    """
    if input_model is not None:
        raw_mdl = input_model
    else:
        common_train_lib.prepare_initial_network(dir, run_opts, srand=srand)
        raw_mdl = "{0}/0.raw".format(dir)

    # nnet3-am-init receives the transition model followed by the raw nnet
    # and the output path of the acoustic model.
    init_mdl_command = """{command} {dir}/log/init_mdl.log \
                nnet3-am-init {dir}/0.trans_mdl {raw_mdl} \
                {dir}/0.mdl""".format(
        command=run_opts.command,
        dir=dir,
        raw_mdl=raw_mdl,
    )
    common_lib.execute_command(init_mdl_command)
def prepare_initial_acoustic_model(dir, alidir, run_opts, srand=-3,
                                   input_model=None):
    """Prepare the initial acoustic model {dir}/0.mdl.

    Unless 'input_model' is given, the initial network is first prepared
    with common_train_lib.prepare_initial_network() (per the original notes
    this adds the first layer and also adds in the lda.mat and
    presoftmax_prior_scale.vec).  The raw model is then combined with
    {alidir}/final.mdl by nnet3-am-init and piped into
    nnet3-am-train-transitions, which reads the alignments
    {alidir}/ali.*.gz and writes {dir}/0.mdl.

    Args:
        dir: experiment directory; {dir}/log/init_mdl.log and {dir}/0.mdl
            are written, {dir}/0.raw is read when 'input_model' is None.
        alidir: alignment directory providing final.mdl and ali.*.gz.
        run_opts: object whose 'command' attribute is the job-launcher
            prefix placed in front of the log file and the pipeline.
        srand: seed forwarded to prepare_initial_network().
        input_model: if specified, no initial network preparation is done
            and this model is used as the initial 'raw' model instead of
            '0.raw'.
    """
    if input_model is None:
        common_train_lib.prepare_initial_network(dir, run_opts, srand=srand)
        raw_mdl = "{0}/0.raw".format(dir)
    else:
        raw_mdl = input_model

    # Convert to .mdl, train the transitions, set the priors.
    # The pipe is spelled '\\|' so that the command text contains a literal
    # backslash-pipe without relying on the invalid escape sequence '\|'
    # (a SyntaxWarning on recent Python versions).  Presumably the backslash
    # is what lets the pipe reach {command} instead of the invoking shell.
    common_lib.execute_command(
        """{command} {dir}/log/init_mdl.log \
                nnet3-am-init {alidir}/final.mdl {raw_mdl} - \\| \
                nnet3-am-train-transitions - \
                "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl
        """.format(command=run_opts.command,
                   dir=dir, alidir=alidir,
                   raw_mdl=raw_mdl))
def prepare_initial_acoustic_model(dir, alidir, run_opts, srand=-3):
    """Prepare the initial acoustic model {dir}/0.mdl.

    The initial network is prepared with
    common_train_lib.prepare_initial_network() (per the original notes this
    adds the first layer and also adds in the lda.mat and
    presoftmax_prior_scale.vec).  {dir}/0.raw is then combined with
    {alidir}/final.mdl by nnet3-am-init and piped into
    nnet3-am-train-transitions, which reads the alignments
    {alidir}/ali.*.gz and writes {dir}/0.mdl.

    Args:
        dir: experiment directory; {dir}/0.raw is read, and
            {dir}/log/init_mdl.log and {dir}/0.mdl are written.
        alidir: alignment directory providing final.mdl and ali.*.gz.
        run_opts: object whose 'command' attribute is the job-launcher
            prefix placed in front of the log file and the pipeline.
        srand: seed forwarded to prepare_initial_network().
    """
    common_train_lib.prepare_initial_network(dir, run_opts, srand=srand)

    # Convert to .mdl, train the transitions, set the priors.
    # The pipe is spelled '\\|' so that the command text contains a literal
    # backslash-pipe without relying on the invalid escape sequence '\|'
    # (a SyntaxWarning on recent Python versions).
    common_lib.execute_command(
        """{command} {dir}/log/init_mdl.log \
                nnet3-am-init {alidir}/final.mdl {dir}/0.raw - \\| \
                nnet3-am-train-transitions - \
                "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl
        """.format(command=run_opts.command,
                   dir=dir, alidir=alidir))
def prepare_initial_acoustic_model(dir, run_opts, srand=-1):
    """Create the initial 'chain' acoustic model {dir}/0.mdl.

    First the initial network is prepared through
    common_train_lib.prepare_initial_network() (per the original notes this
    adds the first layer and also adds in the lda.mat and
    presoftmax_prior_scale.vec).  Then nnet3-am-init is launched through
    run_opts.command with the transition model {dir}/0.trans_mdl and the raw
    network {dir}/0.raw to produce {dir}/0.mdl.
    """
    common_train_lib.prepare_initial_network(dir, run_opts, srand=srand)

    # Build the command first, then run it: transition model + raw nnet in,
    # acoustic model out.
    substitutions = {"command": run_opts.command, "dir": dir}
    init_mdl_command = """{command} {dir}/log/init_mdl.log \
                nnet3-am-init {dir}/0.trans_mdl {dir}/0.raw \
                {dir}/0.mdl""".format(**substitutions)
    common_lib.execute_command(init_mdl_command)
def prepare_initial_acoustic_model(dir, alidir, run_opts, srand=-3):
    """Prepare the initial acoustic model {dir}/0.mdl.

    The initial network is prepared with
    common_train_lib.prepare_initial_network() (per the original notes this
    adds the first layer and also adds in the lda.mat and
    presoftmax_prior_scale.vec).  {dir}/0.raw is then combined with
    {alidir}/final.mdl by nnet3-am-init and piped into
    nnet3-am-train-transitions, which reads the alignments
    {alidir}/ali.*.gz and writes {dir}/0.mdl.  The command is submitted
    with common_lib.run_job().

    Args:
        dir: experiment directory; {dir}/0.raw is read, and
            {dir}/log/init_mdl.log and {dir}/0.mdl are written.
        alidir: alignment directory providing final.mdl and ali.*.gz.
        run_opts: object whose 'command' attribute is the job-launcher
            prefix placed in front of the log file and the pipeline.
        srand: seed forwarded to prepare_initial_network().
    """
    common_train_lib.prepare_initial_network(dir, run_opts, srand=srand)

    # Convert to .mdl, train the transitions, set the priors.
    # The pipe is spelled '\\|' so that the command text contains a literal
    # backslash-pipe without relying on the invalid escape sequence '\|'
    # (a SyntaxWarning on recent Python versions).
    common_lib.run_job(
        """{command} {dir}/log/init_mdl.log \
                nnet3-am-init {alidir}/final.mdl {dir}/0.raw - \\| \
                nnet3-am-train-transitions - \
                "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl
        """.format(command=run_opts.command,
                   dir=dir, alidir=alidir))
def prepare_initial_acoustic_model(dir, run_opts, srand=-1):
    """Create the initial 'chain' acoustic model {dir}/0.mdl.

    The initial network is prepared via
    common_train_lib.prepare_initial_network() (per the original notes this
    adds the first layer and also adds in the lda.mat and
    presoftmax_prior_scale.vec).  Afterwards a job is submitted with
    common_lib.run_job() that runs nnet3-am-init on the transition model
    {dir}/0.trans_mdl and the raw network {dir}/0.raw, writing {dir}/0.mdl.
    """
    common_train_lib.prepare_initial_network(dir, run_opts, srand=srand)

    # Transition model first, raw nnet second, resulting acoustic model last.
    launcher = run_opts.command
    job_command = """{command} {dir}/log/init_mdl.log \
                nnet3-am-init {dir}/0.trans_mdl {dir}/0.raw \
                {dir}/0.mdl""".format(command=launcher, dir=dir)
    common_lib.run_job(job_command)
def prepare_initial_acoustic_model(dir, run_opts, srand=-1, input_model=None):
    """Prepare '0.mdl', the initial 'chain' acoustic model, in 'dir'.

    With no 'input_model', common_train_lib.prepare_initial_network() is run
    first (adding the first layer) and {dir}/0.raw serves as the raw model.
    When 'input_model' is specified, no initial network preparation is done
    and that model is used as the initial 'raw' model instead of '0.raw'.

    In both cases the raw model and the transition model {dir}/0.trans_mdl
    are passed to nnet3-am-init, launched through run_opts.command, which
    writes {dir}/0.mdl.
    """
    needs_initial_network = input_model is None
    if needs_initial_network:
        common_train_lib.prepare_initial_network(dir, run_opts, srand=srand)

    if needs_initial_network:
        raw_model_path = '{0}/0.raw'.format(dir)
    else:
        raw_model_path = input_model

    # The acoustic model is created from the transition model and the raw
    # nnet by nnet3-am-init.
    command_template = """{command} {dir}/log/init_mdl.log \
                nnet3-am-init {dir}/0.trans_mdl {raw_mdl} \
                {dir}/0.mdl"""
    common_lib.execute_command(
        command_template.format(command=run_opts.command, dir=dir,
                                raw_mdl=raw_model_path))
def train(args, run_opts):
    """ The main function for training.

    Runs the joint training driven by two egs directories (args.am_egs_dir
    and args.xvec_egs_dir), stage by stage:

      * stage <= -5: initialise {dir}/init.raw from configs/init.config
        (only if that config exists);
      * stage <= -3: compute the preconditioning matrix (only if
        configs/init.config exists);
      * stage <= -1: prepare the initial network;
      * iterations 0 .. num_iters-1: one call to
        train_lib.common.train_cvector_one_iteration() each;
      * stage <= num_iters: final model combination on the xvector egs, or
        a symlink of the last model to final.raw;
      * optional cleanup and a final accuracy report.

    Args:
        args: a Namespace object with the required parameters
            obtained from the function process_args()
        run_opts: RunOpts object obtained from the process_args()

    Raises:
        Exception: if the vars files lack the model context entries, if the
            two egs directories do not match, if num_jobs_final exceeds the
            number of archives, or if proportional-shrink is too large.
    """

    arg_string = pprint.pformat(vars(args))
    logger.info("Arguments for the experiment\n{0}".format(arg_string))

    # Set some variables.
    config_dir = '{0}/configs'.format(args.dir)
    am_var_file = '{0}/vars_am'.format(config_dir)
    xvec_var_file = '{0}/vars_xvec'.format(config_dir)

    am_variables = common_train_lib.parse_generic_config_vars_file(am_var_file)
    xvec_variables = common_train_lib.parse_generic_config_vars_file(
        xvec_var_file)

    # The model contexts are not used further down, but both vars files are
    # still required to define them, so only the lookups are kept.
    try:
        for config_variables in (am_variables, xvec_variables):
            for key in ('model_left_context', 'model_right_context'):
                config_variables[key]
    except KeyError as e:
        raise Exception("KeyError {0}: Variables need to be defined in "
                        "{1}".format(str(e), '{0}/configs'.format(args.dir)))

    # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
    # matrix.  This first config just does any initial splicing that we do;
    # we do this as it's a convenient way to get the stats for the 'lda-like'
    # transform.
    if (args.stage <= -5) and os.path.exists(args.dir+"/configs/init.config"):
        logger.info("Initializing a basic network for estimating "
                    "preconditioning matrix")
        common_lib.execute_command(
            """{command} {dir}/log/nnet_init.log \
                    nnet3-init --srand=-2 {dir}/configs/init.config \
                    {dir}/init.raw""".format(command=run_opts.command,
                                             dir=args.dir))

    am_egs_dir = args.am_egs_dir
    xvec_egs_dir = args.xvec_egs_dir
    am_output_name = args.am_output_name
    xvec_output_name = args.xvec_output_name
    am_weight = args.am_weight
    xvec_weight = args.xvec_weight

    feat_dim = int(common_lib.get_command_stdout(
        "cat {0}/info/feat_dim".format(am_egs_dir)))
    num_archives = int(common_lib.get_command_stdout(
        "cat {0}/info/num_archives".format(am_egs_dir)))

    tmp_feat_dim = int(common_lib.get_command_stdout(
        "cat {0}/info/feat_dim".format(xvec_egs_dir)))
    tmp_num_archives = int(common_lib.get_command_stdout(
        "cat {0}/info/num_archives".format(xvec_egs_dir)))

    # frames_per_eg is no longer a parameter but load from
    # am_egs/info/frames_per_eg
    am_frames_per_eg = int(common_lib.get_command_stdout(
        "cat {0}/info/frames_per_eg".format(am_egs_dir)))

    # The xvector egs must have the same feature dimension and exactly
    # num_archives * am_frames_per_eg archives.
    if (feat_dim != tmp_feat_dim
            or num_archives * am_frames_per_eg != tmp_num_archives):
        raise Exception('The am egs and xvec egs do not match')

    if args.num_jobs_final > num_archives:
        raise Exception('num_jobs_final cannot exceed the number of archives '
                        'in the egs directory')

    # The egs properties are deliberately not copied to the experiment
    # directory here (noted as not needed for decoding).

    if args.stage <= -3 and os.path.exists(args.dir+"/configs/init.config"):
        logger.info('Computing the preconditioning matrix for input features')

        # 'num_archives' was read from am_egs_dir, so that is the egs
        # directory passed along with it (the previous 'egs_dir' name was
        # never defined in this function).
        train_lib.common.compute_preconditioning_matrix(
            args.dir, am_egs_dir, num_archives, run_opts,
            max_lda_jobs=args.max_lda_jobs,
            rand_prune=args.rand_prune)

    if args.stage <= -1:
        logger.info("Preparing the initial network.")
        common_train_lib.prepare_initial_network(args.dir, run_opts)

    # set num_iters so that as close as possible, we process the data
    # $num_epochs times, i.e. $num_iters*$avg_num_jobs) ==
    # $num_epochs*$num_archives, where
    # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
    num_archives_expanded = num_archives * am_frames_per_eg
    num_archives_to_process = int(args.num_epochs * num_archives_expanded)
    num_archives_processed = 0
    # Floor division keeps num_iters an integer on Python 3 as well; it is
    # used by range() below.
    num_iters = ((num_archives_to_process * 2)
                 // (args.num_jobs_initial + args.num_jobs_final))

    # If do_final_combination is True, compute the set of models_to_combine.
    # Otherwise, models_to_combine will be none.
    if args.do_final_combination:
        models_to_combine = common_train_lib.get_model_combine_iters(
            num_iters, args.num_epochs,
            num_archives_expanded, args.max_models_combine,
            args.num_jobs_final)
    else:
        models_to_combine = None

    logger.info("Training will run for {0} epochs = "
                "{1} iterations".format(args.num_epochs, num_iters))

    for iter in range(num_iters):
        if (args.exit_stage is not None) and (iter == args.exit_stage):
            logger.info("Exiting early due to --exit-stage {0}".format(iter))
            return

        # Number of jobs moves linearly from num_jobs_initial to
        # num_jobs_final over the iterations (rounded to nearest).
        current_num_jobs = int(0.5 + args.num_jobs_initial
                               + (args.num_jobs_final - args.num_jobs_initial)
                               * float(iter) / num_iters)

        if args.stage <= iter:
            lrate = common_train_lib.get_learning_rate(
                iter, current_num_jobs, num_iters,
                num_archives_processed, num_archives_to_process,
                args.initial_effective_lrate,
                args.final_effective_lrate)

            shrinkage_value = 1.0 - (args.proportional_shrink * lrate)
            if shrinkage_value <= 0.5:
                raise Exception("proportional-shrink={0} is too large, it gives "
                                "shrink-value={1}".format(
                                    args.proportional_shrink,
                                    shrinkage_value))

            percent = num_archives_processed * 100.0 / num_archives_to_process
            epoch = (num_archives_processed * args.num_epochs
                     / num_archives_to_process)
            shrink_info_str = ''
            if shrinkage_value != 1.0:
                shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value)
            logger.info("Iter: {0}/{1}    "
                        "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete)    "
                        "lr: {5:0.6f}    {6}".format(iter, num_iters - 1,
                                                     epoch, args.num_epochs,
                                                     percent,
                                                     lrate, shrink_info_str))

            train_lib.common.train_cvector_one_iteration(
                dir=args.dir,
                iter=iter,
                srand=args.srand,
                am_output_name=am_output_name,
                am_weight=am_weight,
                am_egs_dir=am_egs_dir,
                xvec_output_name=xvec_output_name,
                xvec_weight=xvec_weight,
                xvec_egs_dir=xvec_egs_dir,
                num_jobs=current_num_jobs,
                num_archives_processed=num_archives_processed,
                num_archives=num_archives,
                learning_rate=lrate,
                minibatch_size_str=args.minibatch_size,
                momentum=args.momentum,
                max_param_change=args.max_param_change,
                shuffle_buffer_size=args.shuffle_buffer_size,
                run_opts=run_opts,
                am_frames_per_eg=am_frames_per_eg,
                dropout_edit_string=common_train_lib.get_dropout_edit_string(
                    args.dropout_schedule,
                    float(num_archives_processed) / num_archives_to_process,
                    iter),
                shrinkage_value=shrinkage_value,
                get_raw_nnet_from_am=False,
                backstitch_training_scale=args.backstitch_training_scale,
                backstitch_training_interval=args.backstitch_training_interval)

            if args.cleanup:
                # clean up everything but the last 2 models, under certain
                # conditions
                common_train_lib.remove_model(
                    args.dir, iter-2, num_iters, models_to_combine,
                    args.preserve_model_interval,
                    get_raw_nnet_from_am=False)

            if args.email is not None:
                reporting_iter_interval = num_iters * args.reporting_interval
                if iter % reporting_iter_interval == 0:
                    # lets do some reporting
                    [report, _times, _data] = (
                        nnet3_log_parse.generate_acc_logprob_report(args.dir))
                    subject = ("Update : Expt {dir} : "
                               "Iter {iter}".format(dir=args.dir, iter=iter))
                    common_lib.send_mail(report, subject, args.email)

        num_archives_processed = num_archives_processed + current_num_jobs

    # when we do final combination, just use the xvector egs
    if args.stage <= num_iters:
        if args.do_final_combination:
            logger.info("Doing final combination to produce final.mdl")
            train_lib.common.combine_models(
                dir=args.dir, num_iters=num_iters,
                models_to_combine=models_to_combine, egs_dir=xvec_egs_dir,
                minibatch_size_str="64", run_opts=run_opts,
                get_raw_nnet_from_am=False,
                max_objective_evaluations=args.max_objective_evaluations,
                use_egs=True)
        else:
            common_lib.force_symlink("{0}.raw".format(num_iters),
                                     "{0}/final.raw".format(args.dir))

    if args.cleanup:
        logger.info("Cleaning up the experiment directory "
                    "{0}".format(args.dir))
        # The egs are never removed by this script.
        remove_egs = False

        common_train_lib.clean_nnet_dir(
            nnet_dir=args.dir, num_iters=num_iters, egs_dir=am_egs_dir,
            preserve_model_interval=args.preserve_model_interval,
            remove_egs=remove_egs,
            get_raw_nnet_from_am=False)

    # TODO: we may trace other output nodes except for "output"
    # do some reporting
    outputs_list = common_train_lib.get_outputs_list(
        "{0}/final.raw".format(args.dir), get_raw_nnet_from_am=False)
    if 'output' in outputs_list:
        [report, _times, _data] = (
            nnet3_log_parse.generate_acc_logprob_report(args.dir))
        if args.email is not None:
            common_lib.send_mail(report, "Update : Expt {0} : "
                                         "complete".format(args.dir),
                                 args.email)

        with open("{dir}/accuracy.{output_name}.report".format(
                dir=args.dir, output_name="output"), "w") as f:
            f.write(report)

    common_lib.execute_command("subtools/kaldi/steps/info/nnet3_dir_info.pl "
                               "{0}".format(args.dir))
def train(args, run_opts):
    """ The main function for training.

    Runs the raw-nnet training pipeline stage by stage:

      * stage <= -5: initialise {dir}/init.raw from configs/init.config
        (only if that config exists);
      * stage <= -4: generate egs from --targets-scp / --feat-dir, unless
        --egs-dir was supplied;
      * stage <= -3: compute the preconditioning matrix (only if
        configs/init.config exists);
      * stage <= -1: prepare the initial network;
      * iterations 0 .. num_iters-1: one call to
        train_lib.common.train_one_iteration() each;
      * stage <= num_iters: final model combination, or a symlink of the
        last model to final.raw;
      * stage <= num_iters + 1: optional average-posterior computation;
      * optional cleanup and a final accuracy report.

    Args:
        args: a Namespace object with the required parameters
            obtained from the function process_args()
        run_opts: RunOpts object obtained from the process_args()

    Raises:
        Exception: on missing config variables, missing egs options,
            a num-targets mismatch, num_jobs_final exceeding the number of
            archives, an ambiguous or missing valid_diagnostic file, or a
            too large proportional-shrink.
        AssertionError: if args.frames_per_eg disagrees with the egs dir.
    """

    arg_string = pprint.pformat(vars(args))
    logger.info("Arguments for the experiment\n{0}".format(arg_string))

    # Set some variables.

    # note, feat_dim gets set to 0 if args.feat_dir is unset (None).
    feat_dim = common_lib.get_feat_dim(args.feat_dir)
    ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir)
    ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir)

    config_dir = '{0}/configs'.format(args.dir)
    var_file = '{0}/vars'.format(config_dir)

    variables = common_train_lib.parse_generic_config_vars_file(var_file)

    # Set some variables.
    try:
        model_left_context = variables['model_left_context']
        model_right_context = variables['model_right_context']
    except KeyError as e:
        raise Exception("KeyError {0}: Variables need to be defined in "
                        "{1}".format(str(e), '{0}/configs'.format(args.dir)))

    left_context = model_left_context
    right_context = model_right_context

    # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
    # matrix.  This first config just does any initial splicing that we do;
    # we do this as it's a convenient way to get the stats for the 'lda-like'
    # transform.
    if (args.stage <= -5) and os.path.exists(args.dir + "/configs/init.config"):
        logger.info("Initializing the network for computing the LDA stats")
        common_lib.execute_command(
            """{command} {dir}/log/nnet_init.log \
                    nnet3-init --srand=-2 {dir}/configs/init.config \
                    {dir}/init.raw""".format(command=run_opts.command,
                                             dir=args.dir))

    default_egs_dir = '{0}/egs'.format(args.dir)
    if (args.stage <= -4) and args.egs_dir is None:
        if args.targets_scp is None or args.feat_dir is None:
            raise Exception(
                "If you don't supply the --egs-dir option, the "
                "--targets-scp and --feat-dir options are required.")

        logger.info("Generating egs")

        if args.use_dense_targets:
            target_type = "dense"
            try:
                num_targets = int(variables['num_targets'])
                if (common_lib.get_feat_dim_from_scp(args.targets_scp)
                        != num_targets):
                    raise Exception("Mismatch between num-targets provided to "
                                    "script vs configs")
            except KeyError:
                # num_targets is optional for dense targets.
                num_targets = -1
        else:
            target_type = "sparse"
            try:
                num_targets = int(variables['num_targets'])
            except KeyError as e:
                raise Exception("KeyError {0}: Variables need to be defined "
                                "in {1}".format(
                                    str(e), '{0}/configs'.format(args.dir)))

        train_lib.raw_model.generate_egs_using_targets(
            data=args.feat_dir, targets_scp=args.targets_scp,
            egs_dir=default_egs_dir,
            left_context=left_context, right_context=right_context,
            run_opts=run_opts,
            frames_per_eg_str=str(args.frames_per_eg),
            srand=args.srand,
            egs_opts=args.egs_opts,
            cmvn_opts=args.cmvn_opts,
            online_ivector_dir=args.online_ivector_dir,
            samples_per_iter=args.samples_per_iter,
            transform_dir=args.transform_dir,
            stage=args.egs_stage,
            target_type=target_type,
            num_targets=num_targets)

    if args.egs_dir is None:
        egs_dir = default_egs_dir
    else:
        egs_dir = args.egs_dir

    [_egs_left_context, _egs_right_context,
     frames_per_eg_str, num_archives] = (
         common_train_lib.verify_egs_dir(egs_dir, feat_dim,
                                         ivector_dim, ivector_id,
                                         left_context, right_context))
    assert str(args.frames_per_eg) == frames_per_eg_str, (
        "frames_per_eg mismatch between the arguments and the egs dir: "
        "{0} vs {1}".format(args.frames_per_eg, frames_per_eg_str))

    if args.num_jobs_final > num_archives:
        raise Exception('num_jobs_final cannot exceed the number of archives '
                        'in the egs directory')

    # copy the properties of the egs to dir for
    # use during decoding
    common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir)

    if args.stage <= -3 and os.path.exists(args.dir + "/configs/init.config"):
        logger.info('Computing the preconditioning matrix for input features')

        train_lib.common.compute_preconditioning_matrix(
            args.dir, egs_dir, num_archives, run_opts,
            max_lda_jobs=args.max_lda_jobs,
            rand_prune=args.rand_prune)

    if args.stage <= -1:
        logger.info("Preparing the initial network.")
        common_train_lib.prepare_initial_network(args.dir, run_opts)

    # set num_iters so that as close as possible, we process the data
    # $num_epochs times, i.e. $num_iters*$avg_num_jobs) ==
    # $num_epochs*$num_archives, where
    # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
    num_archives_expanded = num_archives * args.frames_per_eg
    num_archives_to_process = int(args.num_epochs * num_archives_expanded)
    num_archives_processed = 0
    # Floor division keeps num_iters an integer on Python 3 as well; it is
    # used by range() below.
    num_iters = ((num_archives_to_process * 2)
                 // (args.num_jobs_initial + args.num_jobs_final))

    # If do_final_combination is True, compute the set of models_to_combine.
    # Otherwise, models_to_combine will be none.
    if args.do_final_combination:
        models_to_combine = common_train_lib.get_model_combine_iters(
            num_iters, args.num_epochs,
            num_archives_expanded, args.max_models_combine,
            args.num_jobs_final)
    else:
        models_to_combine = None

    # Exactly one of valid_diagnostic.scp / valid_diagnostic.egs must exist;
    # the .scp form selects multitask egs.
    if os.path.exists('{0}/valid_diagnostic.scp'.format(egs_dir)):
        if os.path.exists('{0}/valid_diagnostic.egs'.format(egs_dir)):
            raise Exception('both {0}/valid_diagnostic.egs and '
                            '{0}/valid_diagnostic.scp exist.'
                            'This script expects only one of them to exist.'
                            ''.format(egs_dir))
        use_multitask_egs = True
    else:
        if not os.path.exists('{0}/valid_diagnostic.egs'.format(egs_dir)):
            raise Exception('neither {0}/valid_diagnostic.egs nor '
                            '{0}/valid_diagnostic.scp exist.'
                            'This script expects one of them.'
                            ''.format(egs_dir))
        use_multitask_egs = False

    logger.info("Training will run for {0} epochs = "
                "{1} iterations".format(args.num_epochs, num_iters))

    for iter in range(num_iters):
        if (args.exit_stage is not None) and (iter == args.exit_stage):
            logger.info("Exiting early due to --exit-stage {0}".format(iter))
            return

        # Number of jobs moves linearly from num_jobs_initial to
        # num_jobs_final over the iterations (rounded to nearest).
        current_num_jobs = int(0.5 + args.num_jobs_initial
                               + (args.num_jobs_final - args.num_jobs_initial)
                               * float(iter) / num_iters)

        if args.stage <= iter:
            lrate = common_train_lib.get_learning_rate(
                iter, current_num_jobs, num_iters,
                num_archives_processed, num_archives_to_process,
                args.initial_effective_lrate,
                args.final_effective_lrate)

            shrinkage_value = 1.0 - (args.proportional_shrink * lrate)
            if shrinkage_value <= 0.5:
                raise Exception(
                    "proportional-shrink={0} is too large, it gives "
                    "shrink-value={1}".format(args.proportional_shrink,
                                              shrinkage_value))

            train_lib.common.train_one_iteration(
                dir=args.dir,
                iter=iter,
                srand=args.srand,
                egs_dir=egs_dir,
                num_jobs=current_num_jobs,
                num_archives_processed=num_archives_processed,
                num_archives=num_archives,
                learning_rate=lrate,
                dropout_edit_string=common_train_lib.get_dropout_edit_string(
                    args.dropout_schedule,
                    float(num_archives_processed) / num_archives_to_process,
                    iter),
                minibatch_size_str=args.minibatch_size,
                frames_per_eg=args.frames_per_eg,
                momentum=args.momentum,
                max_param_change=args.max_param_change,
                shrinkage_value=shrinkage_value,
                shuffle_buffer_size=args.shuffle_buffer_size,
                run_opts=run_opts,
                get_raw_nnet_from_am=False,
                image_augmentation_opts=args.image_augmentation_opts,
                use_multitask_egs=use_multitask_egs,
                backstitch_training_scale=args.backstitch_training_scale,
                backstitch_training_interval=args.backstitch_training_interval)

            if args.cleanup:
                # do a clean up everything but the last 2 models, under certain
                # conditions
                common_train_lib.remove_model(
                    args.dir, iter - 2, num_iters, models_to_combine,
                    args.preserve_model_interval,
                    get_raw_nnet_from_am=False)

            if args.email is not None:
                reporting_iter_interval = num_iters * args.reporting_interval
                if iter % reporting_iter_interval == 0:
                    # lets do some reporting
                    [report, _times, _data] = (
                        nnet3_log_parse.generate_acc_logprob_report(args.dir))
                    subject = ("Update : Expt {dir} : "
                               "Iter {iter}".format(dir=args.dir, iter=iter))
                    common_lib.send_mail(report, subject, args.email)

        num_archives_processed = num_archives_processed + current_num_jobs

    if args.stage <= num_iters:
        if args.do_final_combination:
            logger.info("Doing final combination to produce final.raw")
            train_lib.common.combine_models(
                dir=args.dir, num_iters=num_iters,
                models_to_combine=models_to_combine, egs_dir=egs_dir,
                minibatch_size_str=args.minibatch_size, run_opts=run_opts,
                get_raw_nnet_from_am=False,
                max_objective_evaluations=args.max_objective_evaluations,
                use_multitask_egs=use_multitask_egs)
        else:
            common_lib.force_symlink("{0}.raw".format(num_iters),
                                     "{0}/final.raw".format(args.dir))

    if args.compute_average_posteriors and args.stage <= num_iters + 1:
        logger.info("Getting average posterior for output-node 'output'.")
        train_lib.common.compute_average_posterior(
            dir=args.dir, iter='final', egs_dir=egs_dir,
            num_archives=num_archives,
            prior_subset_size=args.prior_subset_size, run_opts=run_opts,
            get_raw_nnet_from_am=False)

    if args.cleanup:
        logger.info("Cleaning up the experiment directory "
                    "{0}".format(args.dir))
        remove_egs = args.remove_egs
        if args.egs_dir is not None:
            # this egs_dir was not created by this experiment so we will not
            # delete it
            remove_egs = False

        common_train_lib.clean_nnet_dir(
            nnet_dir=args.dir, num_iters=num_iters, egs_dir=egs_dir,
            preserve_model_interval=args.preserve_model_interval,
            remove_egs=remove_egs,
            get_raw_nnet_from_am=False)

    # do some reporting
    outputs_list = common_train_lib.get_outputs_list(
        "{0}/final.raw".format(args.dir), get_raw_nnet_from_am=False)
    if 'output' in outputs_list:
        [report, _times, _data] = (
            nnet3_log_parse.generate_acc_logprob_report(args.dir))
        if args.email is not None:
            common_lib.send_mail(
                report, "Update : Expt {0} : "
                        "complete".format(args.dir), args.email)

        with open("{dir}/accuracy.{output_name}.report".format(
                dir=args.dir, output_name="output"), "w") as f:
            f.write(report)

    common_lib.execute_command("steps/info/nnet3_dir_info.pl "
                               "{0}".format(args.dir))
def train(args, run_opts):
    """ The main function for training.

    Runs the raw-nnet (chunk based) training pipeline stage by stage:

      * stage <= -4: initialise {dir}/init.raw from configs/init.config
        (only if that config exists and no input model was given);
      * stage <= -3: generate egs, unless --egs-dir was supplied;
      * stage <= -2: compute the preconditioning matrix (only if
        configs/init.config exists and no input model was given);
      * stage <= -1: prepare the initial network;
      * iterations 0 .. num_iters-1: one call to
        train_lib.common.train_one_iteration() each;
      * stage <= num_iters: final model combination, or a symlink of the
        last model to final.raw;
      * stage <= num_iters + 1: optional average-posterior computation;
      * optional cleanup and a final accuracy report.

    Args:
        args: a Namespace object with the required parameters
            obtained from the function process_args()
        run_opts: RunOpts object obtained from the process_args()

    Raises:
        Exception: on missing config variables, a num-targets mismatch,
            a chunk-width mismatch with the egs dir, num_jobs_final
            exceeding the number of archives, an ambiguous or missing
            valid_diagnostic file, or a too large proportional-shrink.
    """

    arg_string = pprint.pformat(vars(args))
    logger.info("Arguments for the experiment\n{0}".format(arg_string))

    # Set some variables.
    feat_dim = common_lib.get_feat_dim(args.feat_dir)
    ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir)
    ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir)

    if args.input_model is None:
        config_dir = '{0}/configs'.format(args.dir)
        var_file = '{0}/vars'.format(config_dir)

        variables = common_train_lib.parse_generic_config_vars_file(var_file)
    else:
        # If args.input_model is specified, the model left and right contexts
        # are computed using input_model.
        variables = common_train_lib.get_input_model_info(args.input_model)

    # Set some variables.
    try:
        model_left_context = variables['model_left_context']
        model_right_context = variables['model_right_context']
    except KeyError as e:
        raise Exception("KeyError {0}: Variables need to be defined in "
                        "{1}".format(str(e), '{0}/configs'.format(args.dir)))

    # Total context = extra chunk context + the model's own context.  The
    # "initial"/"final" variants are -1 when the corresponding option is
    # negative.
    left_context = args.chunk_left_context + model_left_context
    right_context = args.chunk_right_context + model_right_context
    left_context_initial = (args.chunk_left_context_initial + model_left_context if
                            args.chunk_left_context_initial >= 0 else -1)
    right_context_final = (args.chunk_right_context_final + model_right_context if
                           args.chunk_right_context_final >= 0 else -1)

    # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
    # matrix.  This first config just does any initial splicing that we do;
    # we do this as it's a convenient way to get the stats for the 'lda-like'
    # transform.
    if (args.stage <= -4) and os.path.exists(args.dir+"/configs/init.config") and \
       (args.input_model is None):
        logger.info("Initializing the network for computing the LDA stats")
        common_lib.execute_command(
            """{command} {dir}/log/nnet_init.log \
                    nnet3-init --srand=-2 {dir}/configs/init.config \
                    {dir}/init.raw""".format(command=run_opts.command,
                                             dir=args.dir))

    # Egs are generated into {dir}/egs only when no --egs-dir was supplied.
    default_egs_dir = '{0}/egs'.format(args.dir)
    if (args.stage <= -3) and args.egs_dir is None:
        logger.info("Generating egs")

        if args.use_dense_targets:
            target_type = "dense"
            try:
                num_targets = int(variables['num_targets'])
                if (common_lib.get_feat_dim_from_scp(args.targets_scp)
                        != num_targets):
                    raise Exception("Mismatch between num-targets provided to "
                                    "script vs configs")
            except KeyError as e:
                # num_targets is optional for dense targets; -1 is passed on
                # when it is not defined.
                num_targets = -1
        else:
            target_type = "sparse"
            try:
                num_targets = int(variables['num_targets'])
            except KeyError as e:
                raise Exception("KeyError {0}: Variables need to be defined "
                                "in {1}".format(
                                    str(e), '{0}/configs'.format(args.dir)))

        train_lib.raw_model.generate_egs_using_targets(
            data=args.feat_dir, targets_scp=args.targets_scp,
            egs_dir=default_egs_dir,
            left_context=left_context,
            right_context=right_context,
            left_context_initial=left_context_initial,
            right_context_final=right_context_final,
            run_opts=run_opts,
            frames_per_eg_str=args.chunk_width,
            srand=args.srand,
            egs_opts=args.egs_opts,
            cmvn_opts=args.cmvn_opts,
            online_ivector_dir=args.online_ivector_dir,
            samples_per_iter=args.samples_per_iter,
            stage=args.egs_stage,
            target_type=target_type,
            num_targets=num_targets)

    if args.egs_dir is None:
        egs_dir = default_egs_dir
    else:
        egs_dir = args.egs_dir

    # verify_egs_dir returns the contexts, frames-per-eg string and number
    # of archives of the egs directory; only the last two are used below.
    [egs_left_context, egs_right_context,
     frames_per_eg_str, num_archives] = (
         common_train_lib.verify_egs_dir(egs_dir, feat_dim,
                                         ivector_dim, ivector_id,
                                         left_context, right_context,
                                         left_context_initial,
                                         right_context_final))
    if args.chunk_width != frames_per_eg_str:
        raise Exception("mismatch between --egs.chunk-width and the frames_per_eg "
                        "in the egs dir {0} vs {1}".format(args.chunk_width,
                                                           frames_per_eg_str))

    if args.num_jobs_final > num_archives:
        raise Exception('num_jobs_final cannot exceed the number of archives '
                        'in the egs directory')

    # copy the properties of the egs to dir for
    # use during decoding
    common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir)

    if args.stage <= -2 and os.path.exists(args.dir+"/configs/init.config") and \
       (args.input_model is None):
        logger.info('Computing the preconditioning matrix for input features')

        train_lib.common.compute_preconditioning_matrix(
            args.dir, egs_dir, num_archives, run_opts,
            max_lda_jobs=args.max_lda_jobs,
            rand_prune=args.rand_prune)

    if args.stage <= -1:
        logger.info("Preparing the initial network.")
        common_train_lib.prepare_initial_network(args.dir, run_opts, args.srand, args.input_model)

    # set num_iters so that as close as possible, we process the data
    # $num_epochs times, i.e. $num_iters*$avg_num_jobs) ==
    # $num_epochs*$num_archives, where
    # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
    num_archives_to_process = int(args.num_epochs * num_archives)
    num_archives_processed = 0
    num_iters = int((num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final))

    # If do_final_combination is True, compute the set of models_to_combine.
    # Otherwise, models_to_combine will be none.
    if args.do_final_combination:
        models_to_combine = common_train_lib.get_model_combine_iters(
            num_iters, args.num_epochs,
            num_archives, args.max_models_combine,
            args.num_jobs_final)
    else:
        models_to_combine = None

    # Exactly one of valid_diagnostic.scp / valid_diagnostic.egs must exist;
    # the .scp form selects multitask egs.
    if (os.path.exists('{0}/valid_diagnostic.scp'.format(egs_dir))):
        if (os.path.exists('{0}/valid_diagnostic.egs'.format(egs_dir))):
            raise Exception('both {0}/valid_diagnostic.egs and '
                            '{0}/valid_diagnostic.scp exist.'
                            'This script expects only one of them to exist.'
                            ''.format(egs_dir))
        use_multitask_egs = True
    else:
        if (not os.path.exists('{0}/valid_diagnostic.egs'
                               ''.format(egs_dir))):
            raise Exception('neither {0}/valid_diagnostic.egs nor '
                            '{0}/valid_diagnostic.scp exist.'
                            'This script expects one of them.'
                            ''.format(egs_dir))
        use_multitask_egs = False

    # Derivative-time limits are only set when --deriv-truncate-margin is
    # given; they are widened by the model's own context.
    min_deriv_time = None
    max_deriv_time_relative = None
    if args.deriv_truncate_margin is not None:
        min_deriv_time = -args.deriv_truncate_margin - model_left_context
        max_deriv_time_relative = \
           args.deriv_truncate_margin + model_right_context

    logger.info("Training will run for {0} epochs = "
                "{1} iterations".format(args.num_epochs, num_iters))

    for iter in range(num_iters):
        if (args.exit_stage is not None) and (iter == args.exit_stage):
            logger.info("Exiting early due to --exit-stage {0}".format(iter))
            return

        current_num_jobs = common_train_lib.get_current_num_jobs(
            iter, num_iters,
            args.num_jobs_initial, args.num_jobs_step, args.num_jobs_final)

        # Iterations below args.stage are skipped, but still counted in
        # num_archives_processed at the bottom of the loop.
        if args.stage <= iter:
            model_file = "{dir}/{iter}.raw".format(dir=args.dir, iter=iter)

            lrate = common_train_lib.get_learning_rate(iter, current_num_jobs,
                                                       num_iters,
                                                       num_archives_processed,
                                                       num_archives_to_process,
                                                       args.initial_effective_lrate,
                                                       args.final_effective_lrate)

            # shrinkage_value is a scale on the parameters.
            shrinkage_value = 1.0 - (args.proportional_shrink * lrate)
            if shrinkage_value <= 0.5:
                raise Exception("proportional-shrink={0} is too large, it gives "
                                "shrink-value={1}".format(args.proportional_shrink,
                                                          shrinkage_value))
            # The fixed --shrink-value only replaces the proportional one
            # when it is smaller and should_do_shrinkage() agrees for the
            # current model.
            if args.shrink_value < shrinkage_value:
                shrinkage_value = (args.shrink_value
                                   if common_train_lib.should_do_shrinkage(
                                       iter, model_file,
                                       args.shrink_saturation_threshold,
                                       get_raw_nnet_from_am=False)
                                   else shrinkage_value)

            # Progress figures for the log line below.
            percent = num_archives_processed * 100.0 / num_archives_to_process
            epoch = (num_archives_processed * args.num_epochs
                     / num_archives_to_process)
            shrink_info_str = ''
            if shrinkage_value != 1.0:
                shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value)
            logger.info("Iter: {0}/{1}   Jobs: {2}   "
                        "Epoch: {3:0.2f}/{4:0.1f} ({5:0.1f}% complete)   "
                        "lr: {6:0.6f}   {7}".format(iter, num_iters - 1,
                                                    current_num_jobs,
                                                    epoch, args.num_epochs,
                                                    percent,
                                                    lrate, shrink_info_str))

            train_lib.common.train_one_iteration(
                dir=args.dir,
                iter=iter,
                srand=args.srand,
                egs_dir=egs_dir,
                num_jobs=current_num_jobs,
                num_archives_processed=num_archives_processed,
                num_archives=num_archives,
                learning_rate=lrate,
                dropout_edit_string=common_train_lib.get_dropout_edit_string(
                    args.dropout_schedule,
                    float(num_archives_processed) / num_archives_to_process,
                    iter),
                train_opts=' '.join(args.train_opts),
                shrinkage_value=shrinkage_value,
                minibatch_size_str=args.num_chunk_per_minibatch,
                min_deriv_time=min_deriv_time,
                max_deriv_time_relative=max_deriv_time_relative,
                momentum=args.momentum,
                max_param_change=args.max_param_change,
                shuffle_buffer_size=args.shuffle_buffer_size,
                run_opts=run_opts,
                get_raw_nnet_from_am=False,
                use_multitask_egs=use_multitask_egs,
                compute_per_dim_accuracy=args.compute_per_dim_accuracy)

            if args.cleanup:
                # clean up everything but the last 2 models, under certain
                # conditions
                common_train_lib.remove_model(
                    args.dir, iter-2, num_iters, models_to_combine,
                    args.preserve_model_interval,
                    get_raw_nnet_from_am=False)

            if args.email is not None:
                # A progress mail is sent on iterations that are a multiple
                # of num_iters * reporting_interval.
                reporting_iter_interval = num_iters * args.reporting_interval
                if iter % reporting_iter_interval == 0:
                    # lets do some reporting
                    [report, times, data] = (
                        nnet3_log_parse.generate_acc_logprob_report(args.dir))
                    message = report
                    subject = ("Update : Expt {dir} : "
                               "Iter {iter}".format(dir=args.dir, iter=iter))
                    common_lib.send_mail(message, subject, args.email)

        num_archives_processed = num_archives_processed + current_num_jobs

    # Produce final.raw: either by combining models or by linking to the
    # model named after the last iteration count.
    if args.stage <= num_iters:
        if args.do_final_combination:
            logger.info("Doing final combination to produce final.raw")
            train_lib.common.combine_models(
                dir=args.dir, num_iters=num_iters,
                models_to_combine=models_to_combine, egs_dir=egs_dir,
                minibatch_size_str=args.num_chunk_per_minibatch,
                run_opts=run_opts, chunk_width=args.chunk_width,
                get_raw_nnet_from_am=False,
                compute_per_dim_accuracy=args.compute_per_dim_accuracy,
                max_objective_evaluations=args.max_objective_evaluations,
                use_multitask_egs=use_multitask_egs)
        else:
            common_lib.force_symlink("{0}.raw".format(num_iters),
                                     "{0}/final.raw".format(args.dir))

    if args.compute_average_posteriors and args.stage <= num_iters + 1:
        logger.info("Getting average posterior for purposes of "
                    "adjusting the priors.")
        train_lib.common.compute_average_posterior(
            dir=args.dir, iter='final', egs_dir=egs_dir,
            num_archives=num_archives,
            prior_subset_size=args.prior_subset_size, run_opts=run_opts,
            get_raw_nnet_from_am=False)

    if args.cleanup:
        logger.info("Cleaning up the experiment directory "
                    "{0}".format(args.dir))
        remove_egs = args.remove_egs
        if args.egs_dir is not None:
            # this egs_dir was not created by this experiment so we will not
            # delete it
            remove_egs = False

        common_train_lib.clean_nnet_dir(
            nnet_dir=args.dir, num_iters=num_iters, egs_dir=egs_dir,
            preserve_model_interval=args.preserve_model_interval,
            remove_egs=remove_egs,
            get_raw_nnet_from_am=False)

    # do some reporting
    [report, times, data] = nnet3_log_parse.generate_acc_logprob_report(args.dir)
    if args.email is not None:
        common_lib.send_mail(report, "Update : Expt {0} : "
                                     "complete".format(args.dir), args.email)

    # The report is always written to {dir}/accuracy.report, mail or not.
    with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f:
        f.write(report)

    common_lib.execute_command("subtools/kaldi/steps/info/nnet3_dir_info.pl "
                               "{0}".format(args.dir))
def train(args, run_opts):
    """Run the complete training pipeline for a 'raw' nnet3 model.

    The stages run in order and are gated by ``args.stage``: network
    initialisation for LDA stats (-4), egs generation (-3), computation of
    the preconditioning matrix (-2), preparation of the initial network
    (-1), the training iterations (0 .. num_iters - 1), the final model
    combination (or a symlink to the last model), the optional average
    posterior computation, the optional cleanup and the final report.

    Args:
        args: a Namespace object with the required parameters
            obtained from the function process_args()
        run_opts: RunOpts object obtained from the process_args()

    Returns:
        None.  The function returns early, before the final combination,
        when the iteration index reaches ``args.exit_stage``.

    Raises:
        Exception: if a required variable is missing from the vars file,
            if the egs directory is inconsistent with the options, or if
            the proportional shrink value is too large.
    """
    arg_string = pprint.pformat(vars(args))
    logger.info("Arguments for the experiment\n{0}".format(arg_string))

    # Dimensions and identifiers of the input features.
    feat_dim = common_lib.get_feat_dim(args.feat_dir)
    ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir)
    ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir)

    config_dir = '{0}/configs'.format(args.dir)
    var_file = '{0}/vars'.format(config_dir)

    variables = common_train_lib.parse_generic_config_vars_file(var_file)

    # Model contexts come from the vars file written with the configs.
    try:
        model_left_context = variables['model_left_context']
        model_right_context = variables['model_right_context']
    except KeyError as e:
        raise Exception("KeyError {0}: Variables need to be defined in "
                        "{1}".format(str(e), '{0}/configs'.format(args.dir)))

    left_context = args.chunk_left_context + model_left_context
    right_context = args.chunk_right_context + model_right_context
    # A negative initial/final chunk context is passed on as -1.
    left_context_initial = (
        (args.chunk_left_context_initial + model_left_context)
        if args.chunk_left_context_initial >= 0 else -1)
    right_context_final = (
        (args.chunk_right_context_final + model_right_context)
        if args.chunk_right_context_final >= 0 else -1)

    # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
    # matrix.  This first config just does any initial splicing that we do;
    # we do this as it's a convenient way to get the stats for the 'lda-like'
    # transform.
    if (args.stage <= -4) and os.path.exists(args.dir+"/configs/init.config"):
        logger.info("Initializing the network for computing the LDA stats")
        common_lib.execute_command(
            """{command} {dir}/log/nnet_init.log \
                    nnet3-init --srand=-2 {dir}/configs/init.config \
                    {dir}/init.raw""".format(command=run_opts.command,
                                             dir=args.dir))

    default_egs_dir = '{0}/egs'.format(args.dir)
    if (args.stage <= -3) and args.egs_dir is None:
        logger.info("Generating egs")

        if args.use_dense_targets:
            target_type = "dense"
            try:
                num_targets = int(variables['num_targets'])
                if (common_lib.get_feat_dim_from_scp(args.targets_scp)
                        != num_targets):
                    raise Exception("Mismatch between num-targets provided to "
                                    "script vs configs")
            except KeyError:
                # For dense targets num_targets is optional; -1 is passed on
                # when the vars file does not define it.
                num_targets = -1
        else:
            target_type = "sparse"
            try:
                num_targets = int(variables['num_targets'])
            except KeyError as e:
                raise Exception("KeyError {0}: Variables need to be defined "
                                "in {1}".format(
                                    str(e), '{0}/configs'.format(args.dir)))

        train_lib.raw_model.generate_egs_using_targets(
            data=args.feat_dir, targets_scp=args.targets_scp,
            egs_dir=default_egs_dir,
            left_context=left_context,
            right_context=right_context,
            left_context_initial=left_context_initial,
            right_context_final=right_context_final,
            run_opts=run_opts,
            frames_per_eg_str=args.chunk_width,
            srand=args.srand,
            egs_opts=args.egs_opts,
            cmvn_opts=args.cmvn_opts,
            online_ivector_dir=args.online_ivector_dir,
            samples_per_iter=args.samples_per_iter,
            stage=args.egs_stage,
            target_type=target_type,
            num_targets=num_targets)

    if args.egs_dir is None:
        egs_dir = default_egs_dir
    else:
        egs_dir = args.egs_dir

    [egs_left_context, egs_right_context,
     frames_per_eg_str, num_archives] = (
         common_train_lib.verify_egs_dir(egs_dir, feat_dim,
                                         ivector_dim, ivector_id,
                                         left_context, right_context,
                                         left_context_initial,
                                         right_context_final))
    if args.chunk_width != frames_per_eg_str:
        raise Exception("mismatch between --egs.chunk-width and the frames_per_eg "
                        "in the egs dir {0} vs {1}".format(args.chunk_width,
                                                           frames_per_eg_str))

    if args.num_jobs_final > num_archives:
        raise Exception('num_jobs_final cannot exceed the number of archives '
                        'in the egs directory')

    # copy the properties of the egs to dir for
    # use during decoding
    common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir)

    if args.stage <= -2 and os.path.exists(args.dir+"/configs/init.config"):
        logger.info('Computing the preconditioning matrix for input features')

        train_lib.common.compute_preconditioning_matrix(
            args.dir, egs_dir, num_archives, run_opts,
            max_lda_jobs=args.max_lda_jobs,
            rand_prune=args.rand_prune)

    if args.stage <= -1:
        logger.info("Preparing the initial network.")
        common_train_lib.prepare_initial_network(args.dir, run_opts)

    # set num_iters so that as close as possible, we process the data
    # $num_epochs times, i.e. $num_iters*$avg_num_jobs) ==
    # $num_epochs*$num_archives, where
    # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
    num_archives_to_process = int(args.num_epochs * num_archives)
    num_archives_processed = 0
    # Explicit integer division: num_iters is used with range() and in model
    # file names, so it must be an int under Python 3 as well as Python 2.
    num_iters = int((num_archives_to_process * 2)
                    // (args.num_jobs_initial + args.num_jobs_final))

    # If do_final_combination is True, compute the set of models_to_combine.
    # Otherwise, models_to_combine will be none.
    if args.do_final_combination:
        models_to_combine = common_train_lib.get_model_combine_iters(
            num_iters, args.num_epochs,
            num_archives, args.max_models_combine,
            args.num_jobs_final)
    else:
        models_to_combine = None

    # Exactly one of valid_diagnostic.scp / valid_diagnostic.egs must exist;
    # the .scp form selects multitask egs.
    if (os.path.exists('{0}/valid_diagnostic.scp'.format(egs_dir))):
        if (os.path.exists('{0}/valid_diagnostic.egs'.format(egs_dir))):
            raise Exception('both {0}/valid_diagnostic.egs and '
                            '{0}/valid_diagnostic.scp exist.'
                            'This script expects only one of them to exist.'
                            ''.format(egs_dir))
        use_multitask_egs = True
    else:
        if (not os.path.exists('{0}/valid_diagnostic.egs'
                               ''.format(egs_dir))):
            raise Exception('neither {0}/valid_diagnostic.egs nor '
                            '{0}/valid_diagnostic.scp exist.'
                            'This script expects one of them.'
                            ''.format(egs_dir))
        use_multitask_egs = False

    min_deriv_time = None
    max_deriv_time_relative = None
    if args.deriv_truncate_margin is not None:
        min_deriv_time = -args.deriv_truncate_margin - model_left_context
        max_deriv_time_relative = \
            args.deriv_truncate_margin + model_right_context

    logger.info("Training will run for {0} epochs = "
                "{1} iterations".format(args.num_epochs, num_iters))

    for iter in range(num_iters):
        if (args.exit_stage is not None) and (iter == args.exit_stage):
            logger.info("Exiting early due to --exit-stage {0}".format(iter))
            return

        # Interpolate the number of jobs linearly from num_jobs_initial to
        # num_jobs_final over the iterations, rounding to the nearest int.
        current_num_jobs = int(0.5 + args.num_jobs_initial
                               + (args.num_jobs_final - args.num_jobs_initial)
                               * float(iter) / num_iters)

        if args.stage <= iter:
            model_file = "{dir}/{iter}.raw".format(dir=args.dir, iter=iter)

            lrate = common_train_lib.get_learning_rate(
                iter, current_num_jobs,
                num_iters,
                num_archives_processed,
                num_archives_to_process,
                args.initial_effective_lrate,
                args.final_effective_lrate)

            # shrinkage_value is a scale on the parameters.
            shrinkage_value = 1.0 - (args.proportional_shrink * lrate)
            if shrinkage_value <= 0.5:
                raise Exception("proportional-shrink={0} is too large, it gives "
                                "shrink-value={1}".format(args.proportional_shrink,
                                                          shrinkage_value))
            if args.shrink_value < shrinkage_value:
                shrinkage_value = (args.shrink_value
                                   if common_train_lib.should_do_shrinkage(
                                       iter, model_file,
                                       args.shrink_saturation_threshold,
                                       get_raw_nnet_from_am=False)
                                   else shrinkage_value)

            percent = num_archives_processed * 100.0 / num_archives_to_process
            epoch = (num_archives_processed * args.num_epochs
                     / num_archives_to_process)
            shrink_info_str = ''
            if shrinkage_value != 1.0:
                shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value)
            logger.info("Iter: {0}/{1} "
                        "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete) "
                        "lr: {5:0.6f} {6}".format(iter, num_iters - 1,
                                                  epoch, args.num_epochs,
                                                  percent,
                                                  lrate, shrink_info_str))

            train_lib.common.train_one_iteration(
                dir=args.dir,
                iter=iter,
                srand=args.srand,
                egs_dir=egs_dir,
                num_jobs=current_num_jobs,
                num_archives_processed=num_archives_processed,
                num_archives=num_archives,
                learning_rate=lrate,
                dropout_edit_string=common_train_lib.get_dropout_edit_string(
                    args.dropout_schedule,
                    float(num_archives_processed) / num_archives_to_process,
                    iter),
                train_opts=' '.join(args.train_opts),
                shrinkage_value=shrinkage_value,
                minibatch_size_str=args.num_chunk_per_minibatch,
                min_deriv_time=min_deriv_time,
                max_deriv_time_relative=max_deriv_time_relative,
                momentum=args.momentum,
                max_param_change=args.max_param_change,
                shuffle_buffer_size=args.shuffle_buffer_size,
                run_opts=run_opts,
                get_raw_nnet_from_am=False,
                use_multitask_egs=use_multitask_egs,
                compute_per_dim_accuracy=args.compute_per_dim_accuracy)

            if args.cleanup:
                # clean up everything but the last 2 models, under certain
                # conditions
                common_train_lib.remove_model(
                    args.dir, iter-2, num_iters, models_to_combine,
                    args.preserve_model_interval,
                    get_raw_nnet_from_am=False)

            if args.email is not None:
                reporting_iter_interval = num_iters * args.reporting_interval
                if iter % reporting_iter_interval == 0:
                    # lets do some reporting
                    [report, times, data] = (
                        nnet3_log_parse.generate_acc_logprob_report(args.dir))
                    message = report
                    subject = ("Update : Expt {dir} : "
                               "Iter {iter}".format(dir=args.dir, iter=iter))
                    common_lib.send_mail(message, subject, args.email)

        # Counted even for skipped iterations, so that a restart from a
        # later stage sees the same amount of processed data.
        num_archives_processed = num_archives_processed + current_num_jobs

    if args.stage <= num_iters:
        if args.do_final_combination:
            logger.info("Doing final combination to produce final.raw")
            # NOTE(review): use_multitask_egs is passed to
            # train_one_iteration above but not to combine_models here --
            # confirm whether combine_models needs it.
            train_lib.common.combine_models(
                dir=args.dir, num_iters=num_iters,
                models_to_combine=models_to_combine, egs_dir=egs_dir,
                minibatch_size_str=args.num_chunk_per_minibatch,
                run_opts=run_opts, chunk_width=args.chunk_width,
                get_raw_nnet_from_am=False,
                compute_per_dim_accuracy=args.compute_per_dim_accuracy,
                max_objective_evaluations=args.max_objective_evaluations)
        else:
            # Without combination the last model simply becomes final.raw.
            common_lib.force_symlink("{0}.raw".format(num_iters),
                                     "{0}/final.raw".format(args.dir))

    if args.compute_average_posteriors and args.stage <= num_iters + 1:
        logger.info("Getting average posterior for purposes of "
                    "adjusting the priors.")
        train_lib.common.compute_average_posterior(
            dir=args.dir, iter='final', egs_dir=egs_dir,
            num_archives=num_archives,
            prior_subset_size=args.prior_subset_size, run_opts=run_opts,
            get_raw_nnet_from_am=False)

    if args.cleanup:
        logger.info("Cleaning up the experiment directory "
                    "{0}".format(args.dir))
        remove_egs = args.remove_egs
        if args.egs_dir is not None:
            # this egs_dir was not created by this experiment so we will not
            # delete it
            remove_egs = False

        common_train_lib.clean_nnet_dir(
            nnet_dir=args.dir, num_iters=num_iters, egs_dir=egs_dir,
            preserve_model_interval=args.preserve_model_interval,
            remove_egs=remove_egs,
            get_raw_nnet_from_am=False)

    # do some reporting
    [report, times, data] = nnet3_log_parse.generate_acc_logprob_report(args.dir)
    if args.email is not None:
        common_lib.send_mail(report, "Update : Expt {0} : "
                                     "complete".format(args.dir), args.email)

    with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f:
        f.write(report)

    common_lib.execute_command("steps/info/nnet3_dir_info.pl "
                               "{0}".format(args.dir))
def train(args, run_opts, background_process_handler):
    """Run the complete training pipeline for a 'raw' nnet3 model.

    The stages run in order and are gated by ``args.stage``: creation of a
    basic network (-4), egs generation (-3), computation of the
    preconditioning matrix when 'add_lda' is set (-2), preparation of the
    initial network (-1), the training iterations (0 .. num_iters - 1), the
    final model combination, the average posterior computation when
    'include_log_softmax' is set, the optional cleanup and the final report.

    Args:
        args: a Namespace object with the required parameters
            obtained from the function process_args()
        run_opts: RunOpts object obtained from the process_args()
        background_process_handler: handler object that is forwarded
            unchanged to train_one_iteration() and combine_models().

    Returns:
        None.  The function returns early, before the final combination,
        when the iteration index reaches ``args.exit_stage``.

    Raises:
        Exception: if a required variable is missing from the vars file or
            if the egs directory is inconsistent with the options.
    """
    arg_string = pprint.pformat(vars(args))
    logger.info("Arguments for the experiment\n{0}".format(arg_string))

    # Dimensions and identifiers of the input features.
    feat_dim = common_lib.get_feat_dim(args.feat_dir)
    ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir)
    ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir)

    config_dir = '{0}/configs'.format(args.dir)
    var_file = '{0}/vars'.format(config_dir)

    variables = common_train_lib.parse_generic_config_vars_file(var_file)

    # Model properties come from the vars file written with the configs.
    try:
        model_left_context = variables['model_left_context']
        model_right_context = variables['model_right_context']
        # this is really the number of times we add layers to the network for
        # discriminative pretraining
        num_hidden_layers = variables['num_hidden_layers']
        add_lda = common_lib.str_to_bool(variables['add_lda'])
        include_log_softmax = common_lib.str_to_bool(
            variables['include_log_softmax'])
    except KeyError as e:
        raise Exception("KeyError {0}: Variables need to be defined in "
                        "{1}".format(str(e), '{0}/configs'.format(args.dir)))

    left_context = args.chunk_left_context + model_left_context
    right_context = args.chunk_right_context + model_right_context
    # A negative initial/final chunk context is passed on as -1.
    left_context_initial = (
        (args.chunk_left_context_initial + model_left_context)
        if args.chunk_left_context_initial >= 0 else -1)
    right_context_final = (
        (args.chunk_right_context_final + model_right_context)
        if args.chunk_right_context_final >= 0 else -1)

    # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
    # matrix.  This first config just does any initial splicing that we do;
    # we do this as it's a convenient way to get the stats for the 'lda-like'
    # transform.
    if (args.stage <= -4):
        logger.info("Initializing a basic network")
        common_lib.run_job(
            """{command} {dir}/log/nnet_init.log \
                    nnet3-init --srand=-2 {dir}/configs/init.config \
                    {dir}/init.raw""".format(command=run_opts.command,
                                             dir=args.dir))

    default_egs_dir = '{0}/egs'.format(args.dir)
    if (args.stage <= -3) and args.egs_dir is None:
        logger.info("Generating egs")

        if args.use_dense_targets:
            target_type = "dense"
            try:
                num_targets = int(variables['num_targets'])
                if (common_lib.get_feat_dim_from_scp(args.targets_scp)
                        != num_targets):
                    raise Exception("Mismatch between num-targets provided to "
                                    "script vs configs")
            except KeyError:
                # For dense targets num_targets is optional; -1 is passed on
                # when the vars file does not define it.
                num_targets = -1
        else:
            target_type = "sparse"
            try:
                num_targets = int(variables['num_targets'])
            except KeyError as e:
                raise Exception("KeyError {0}: Variables need to be defined "
                                "in {1}".format(
                                    str(e), '{0}/configs'.format(args.dir)))

        train_lib.raw_model.generate_egs_using_targets(
            data=args.feat_dir, targets_scp=args.targets_scp,
            egs_dir=default_egs_dir,
            left_context=left_context,
            right_context=right_context,
            left_context_initial=left_context_initial,
            right_context_final=right_context_final,
            run_opts=run_opts,
            frames_per_eg_str=args.chunk_width,
            srand=args.srand,
            egs_opts=args.egs_opts,
            cmvn_opts=args.cmvn_opts,
            online_ivector_dir=args.online_ivector_dir,
            samples_per_iter=args.samples_per_iter,
            transform_dir=args.transform_dir,
            stage=args.egs_stage,
            target_type=target_type,
            num_targets=num_targets)

    if args.egs_dir is None:
        egs_dir = default_egs_dir
    else:
        egs_dir = args.egs_dir

    [egs_left_context, egs_right_context,
     frames_per_eg_str, num_archives] = (
         common_train_lib.verify_egs_dir(egs_dir, feat_dim,
                                         ivector_dim, ivector_id,
                                         left_context, right_context))
    if args.chunk_width != frames_per_eg_str:
        raise Exception("mismatch between --egs.chunk-width and the frames_per_eg "
                        "in the egs dir {0} vs {1}".format(args.chunk_width,
                                                           frames_per_eg_str))

    if (args.num_jobs_final > num_archives):
        raise Exception('num_jobs_final cannot exceed the number of archives '
                        'in the egs directory')

    # copy the properties of the egs to dir for
    # use during decoding
    common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir)

    if (add_lda and args.stage <= -2):
        logger.info('Computing the preconditioning matrix for input features')

        train_lib.common.compute_preconditioning_matrix(
            args.dir, egs_dir, num_archives, run_opts,
            max_lda_jobs=args.max_lda_jobs,
            rand_prune=args.rand_prune)

    if (args.stage <= -1):
        logger.info("Preparing the initial network.")
        common_train_lib.prepare_initial_network(args.dir, run_opts)

    # set num_iters so that as close as possible, we process the data
    # $num_epochs times, i.e. $num_iters*$avg_num_jobs) ==
    # $num_epochs*$num_archives, where
    # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
    num_archives_to_process = int(args.num_epochs * num_archives)
    num_archives_processed = 0
    # Explicit integer division: num_iters is used with range(), so it must
    # be an int under Python 3 as well as Python 2.
    num_iters = int((num_archives_to_process * 2)
                    // (args.num_jobs_initial + args.num_jobs_final))

    models_to_combine = common_train_lib.verify_iterations(
        num_iters, args.num_epochs,
        num_hidden_layers, num_archives,
        args.max_models_combine, args.add_layers_period,
        args.num_jobs_final)

    def learning_rate(iter, current_num_jobs, num_archives_processed):
        # Thin wrapper that binds the schedule constants of this run.
        return common_train_lib.get_learning_rate(iter, current_num_jobs,
                                                  num_iters,
                                                  num_archives_processed,
                                                  num_archives_to_process,
                                                  args.initial_effective_lrate,
                                                  args.final_effective_lrate)

    min_deriv_time = None
    max_deriv_time_relative = None
    if args.deriv_truncate_margin is not None:
        min_deriv_time = -args.deriv_truncate_margin - model_left_context
        max_deriv_time_relative = \
            args.deriv_truncate_margin + model_right_context

    logger.info("Training will run for {0} epochs = "
                "{1} iterations".format(args.num_epochs, num_iters))

    for iter in range(num_iters):
        if (args.exit_stage is not None) and (iter == args.exit_stage):
            logger.info("Exiting early due to --exit-stage {0}".format(iter))
            return

        # Interpolate the number of jobs linearly from num_jobs_initial to
        # num_jobs_final over the iterations, rounding to the nearest int.
        current_num_jobs = int(0.5 + args.num_jobs_initial
                               + (args.num_jobs_final - args.num_jobs_initial)
                               * float(iter) / num_iters)

        if args.stage <= iter:
            model_file = "{dir}/{iter}.raw".format(dir=args.dir, iter=iter)

            # Shrinkage is only considered when a shrink value other than
            # 1.0 was requested; do_shrinkage() decides per iteration.
            shrinkage_value = 1.0
            if args.shrink_value != 1.0:
                shrinkage_value = (args.shrink_value
                                   if common_train_lib.do_shrinkage(
                                       iter, model_file,
                                       args.shrink_saturation_threshold,
                                       get_raw_nnet_from_am=False)
                                   else 1
                                   )

            train_lib.common.train_one_iteration(
                dir=args.dir,
                iter=iter,
                srand=args.srand,
                egs_dir=egs_dir,
                num_jobs=current_num_jobs,
                num_archives_processed=num_archives_processed,
                num_archives=num_archives,
                learning_rate=learning_rate(iter, current_num_jobs,
                                            num_archives_processed),
                dropout_edit_string=common_train_lib.get_dropout_edit_string(
                    args.dropout_schedule,
                    float(num_archives_processed) / num_archives_to_process,
                    iter),
                shrinkage_value=shrinkage_value,
                minibatch_size_str=args.num_chunk_per_minibatch,
                num_hidden_layers=num_hidden_layers,
                add_layers_period=args.add_layers_period,
                left_context=left_context,
                right_context=right_context,
                min_deriv_time=min_deriv_time,
                max_deriv_time_relative=max_deriv_time_relative,
                momentum=args.momentum,
                max_param_change=args.max_param_change,
                shuffle_buffer_size=args.shuffle_buffer_size,
                run_opts=run_opts,
                get_raw_nnet_from_am=False,
                background_process_handler=background_process_handler)

            if args.cleanup:
                # clean up everything but the last 2 models, under certain
                # conditions
                common_train_lib.remove_model(
                    args.dir, iter-2, num_iters, models_to_combine,
                    args.preserve_model_interval,
                    get_raw_nnet_from_am=False)

            if args.email is not None:
                reporting_iter_interval = num_iters * args.reporting_interval
                if iter % reporting_iter_interval == 0:
                    # lets do some reporting
                    [report, times, data] = (
                        nnet3_log_parse.generate_acc_logprob_report(args.dir))
                    message = report
                    subject = ("Update : Expt {dir} : "
                               "Iter {iter}".format(dir=args.dir, iter=iter))
                    common_lib.send_mail(message, subject, args.email)

        # Counted even for skipped iterations, so that a restart from a
        # later stage sees the same amount of processed data.
        num_archives_processed = num_archives_processed + current_num_jobs

    if args.stage <= num_iters:
        logger.info("Doing final combination to produce final.raw")
        train_lib.common.combine_models(
            dir=args.dir, num_iters=num_iters,
            models_to_combine=models_to_combine, egs_dir=egs_dir,
            left_context=left_context, right_context=right_context,
            minibatch_size_str=args.num_chunk_per_minibatch,
            run_opts=run_opts, chunk_width=args.chunk_width,
            background_process_handler=background_process_handler,
            get_raw_nnet_from_am=False,
            sum_to_one_penalty=args.combine_sum_to_one_penalty)

    if include_log_softmax and args.stage <= num_iters + 1:
        logger.info("Getting average posterior for purposes of "
                    "adjusting the priors.")
        train_lib.common.compute_average_posterior(
            dir=args.dir, iter='final', egs_dir=egs_dir,
            num_archives=num_archives,
            left_context=left_context, right_context=right_context,
            prior_subset_size=args.prior_subset_size, run_opts=run_opts,
            get_raw_nnet_from_am=False)

    if args.cleanup:
        logger.info("Cleaning up the experiment directory "
                    "{0}".format(args.dir))
        remove_egs = args.remove_egs
        if args.egs_dir is not None:
            # this egs_dir was not created by this experiment so we will not
            # delete it
            remove_egs = False

        common_train_lib.clean_nnet_dir(
            nnet_dir=args.dir, num_iters=num_iters, egs_dir=egs_dir,
            preserve_model_interval=args.preserve_model_interval,
            remove_egs=remove_egs,
            get_raw_nnet_from_am=False)

    # do some reporting
    [report, times, data] = nnet3_log_parse.generate_acc_logprob_report(args.dir)
    if args.email is not None:
        common_lib.send_mail(report, "Update : Expt {0} : "
                                     "complete".format(args.dir), args.email)

    with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f:
        f.write(report)

    common_lib.run_job("steps/info/nnet3_dir_info.pl "
                       "{0}".format(args.dir))
def train(args, run_opts, background_process_handler):
    """Run the complete training pipeline for a 'raw' nnet3 model.

    The stages run in order and are gated by ``args.stage``: creation of a
    basic network (-5), egs generation (-4), computation of the
    preconditioning matrix when 'add_lda' is set (-3), preparation of the
    initial network (-1), the training iterations (0 .. num_iters - 1), the
    final model combination, the average posterior computation when
    'include_log_softmax' is set, the optional cleanup and the final report.

    Args:
        args: a Namespace object with the required parameters
            obtained from the function process_args()
        run_opts: RunOpts object obtained from the process_args()
        background_process_handler: handler object that is forwarded
            unchanged to train_one_iteration() and combine_models().

    Returns:
        None.  The function returns early, before the final combination,
        when the iteration index reaches ``args.exit_stage``.

    Raises:
        Exception: if a required variable is missing from the vars file or
            if the egs directory is inconsistent with the options.
        AssertionError: if args.frames_per_eg differs from the value
            reported for the egs directory.
    """
    arg_string = pprint.pformat(vars(args))
    logger.info("Arguments for the experiment\n{0}".format(arg_string))

    # Dimensions of the input features.
    feat_dim = common_lib.get_feat_dim(args.feat_dir)
    ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir)

    config_dir = '{0}/configs'.format(args.dir)
    var_file = '{0}/vars'.format(config_dir)

    variables = common_train_lib.parse_generic_config_vars_file(var_file)

    # Model properties come from the vars file written with the configs.
    try:
        model_left_context = variables['model_left_context']
        model_right_context = variables['model_right_context']
        # this is really the number of times we add layers to the network for
        # discriminative pretraining
        num_hidden_layers = variables['num_hidden_layers']
        add_lda = common_lib.str_to_bool(variables['add_lda'])
        include_log_softmax = common_lib.str_to_bool(
            variables['include_log_softmax'])
    except KeyError as e:
        raise Exception("KeyError {0}: Variables need to be defined in "
                        "{1}".format(str(e), '{0}/configs'.format(args.dir)))

    left_context = args.chunk_left_context + model_left_context
    right_context = args.chunk_right_context + model_right_context

    # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
    # matrix.  This first config just does any initial splicing that we do;
    # we do this as it's a convenient way to get the stats for the 'lda-like'
    # transform.
    if (args.stage <= -5):
        logger.info("Initializing a basic network")
        common_lib.run_job(
            """{command} {dir}/log/nnet_init.log \
                    nnet3-init --srand=-2 {dir}/configs/init.config \
                    {dir}/init.raw""".format(command=run_opts.command,
                                             dir=args.dir))

    default_egs_dir = '{0}/egs'.format(args.dir)
    if (args.stage <= -4) and args.egs_dir is None:
        logger.info("Generating egs")

        if args.use_dense_targets:
            target_type = "dense"
            try:
                num_targets = int(variables['num_targets'])
                if (common_lib.get_feat_dim_from_scp(args.targets_scp)
                        != num_targets):
                    raise Exception("Mismatch between num-targets provided to "
                                    "script vs configs")
            except KeyError:
                # For dense targets num_targets is optional; -1 is passed on
                # when the vars file does not define it.
                num_targets = -1
        else:
            target_type = "sparse"
            try:
                num_targets = int(variables['num_targets'])
            except KeyError as e:
                raise Exception("KeyError {0}: Variables need to be defined "
                                "in {1}".format(
                                    str(e), '{0}/configs'.format(args.dir)))

        train_lib.raw_model.generate_egs_using_targets(
            data=args.feat_dir, targets_scp=args.targets_scp,
            egs_dir=default_egs_dir,
            left_context=left_context, right_context=right_context,
            valid_left_context=left_context,
            valid_right_context=right_context,
            run_opts=run_opts,
            frames_per_eg=args.frames_per_eg,
            srand=args.srand,
            egs_opts=args.egs_opts,
            cmvn_opts=args.cmvn_opts,
            online_ivector_dir=args.online_ivector_dir,
            samples_per_iter=args.samples_per_iter,
            transform_dir=args.transform_dir,
            stage=args.egs_stage,
            target_type=target_type,
            num_targets=num_targets)

    if args.egs_dir is None:
        egs_dir = default_egs_dir
    else:
        egs_dir = args.egs_dir

    [egs_left_context, egs_right_context,
     frames_per_eg, num_archives] = (
         common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim,
                                         left_context, right_context))
    assert(args.frames_per_eg == frames_per_eg)

    if (args.num_jobs_final > num_archives):
        raise Exception('num_jobs_final cannot exceed the number of archives '
                        'in the egs directory')

    # copy the properties of the egs to dir for
    # use during decoding
    common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir)

    if (add_lda and args.stage <= -3):
        logger.info('Computing the preconditioning matrix for input features')

        train_lib.common.compute_preconditioning_matrix(
            args.dir, egs_dir, num_archives, run_opts,
            max_lda_jobs=args.max_lda_jobs,
            rand_prune=args.rand_prune)

    if (args.stage <= -1):
        logger.info("Preparing the initial network.")
        common_train_lib.prepare_initial_network(args.dir, run_opts)

    # set num_iters so that as close as possible, we process the data
    # $num_epochs times, i.e. $num_iters*$avg_num_jobs) ==
    # $num_epochs*$num_archives, where
    # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
    num_archives_expanded = num_archives * args.frames_per_eg
    num_archives_to_process = args.num_epochs * num_archives_expanded
    num_archives_processed = 0
    # num_archives_to_process may be a float (e.g. a fractional num_epochs)
    # and '/' is true division under Python 3, so floor and convert
    # explicitly: num_iters is used with range() and must be an int.
    num_iters = int((num_archives_to_process * 2)
                    // (args.num_jobs_initial + args.num_jobs_final))

    models_to_combine = common_train_lib.verify_iterations(
        num_iters, args.num_epochs,
        num_hidden_layers, num_archives_expanded,
        args.max_models_combine, args.add_layers_period,
        args.num_jobs_final)

    def learning_rate(iter, current_num_jobs, num_archives_processed):
        # Thin wrapper that binds the schedule constants of this run.
        return common_train_lib.get_learning_rate(iter, current_num_jobs,
                                                  num_iters,
                                                  num_archives_processed,
                                                  num_archives_to_process,
                                                  args.initial_effective_lrate,
                                                  args.final_effective_lrate)

    logger.info("Training will run for {0} epochs = "
                "{1} iterations".format(args.num_epochs, num_iters))

    for iter in range(num_iters):
        if (args.exit_stage is not None) and (iter == args.exit_stage):
            logger.info("Exiting early due to --exit-stage {0}".format(iter))
            return

        # Interpolate the number of jobs linearly from num_jobs_initial to
        # num_jobs_final over the iterations, rounding to the nearest int.
        current_num_jobs = int(0.5 + args.num_jobs_initial
                               + (args.num_jobs_final - args.num_jobs_initial)
                               * float(iter) / num_iters)

        if args.stage <= iter:
            train_lib.common.train_one_iteration(
                dir=args.dir,
                iter=iter,
                srand=args.srand,
                egs_dir=egs_dir,
                num_jobs=current_num_jobs,
                num_archives_processed=num_archives_processed,
                num_archives=num_archives,
                learning_rate=learning_rate(iter, current_num_jobs,
                                            num_archives_processed),
                dropout_edit_string=common_lib.get_dropout_edit_string(
                    args.dropout_schedule,
                    float(num_archives_processed) / num_archives_to_process,
                    iter),
                minibatch_size=args.minibatch_size,
                frames_per_eg=args.frames_per_eg,
                num_hidden_layers=num_hidden_layers,
                add_layers_period=args.add_layers_period,
                left_context=left_context,
                right_context=right_context,
                momentum=args.momentum,
                max_param_change=args.max_param_change,
                shuffle_buffer_size=args.shuffle_buffer_size,
                run_opts=run_opts,
                get_raw_nnet_from_am=False,
                background_process_handler=background_process_handler)

            if args.cleanup:
                # clean up everything but the last 2 models, under certain
                # conditions
                common_train_lib.remove_model(
                    args.dir, iter-2, num_iters, models_to_combine,
                    args.preserve_model_interval,
                    get_raw_nnet_from_am=False)

            if args.email is not None:
                reporting_iter_interval = num_iters * args.reporting_interval
                if iter % reporting_iter_interval == 0:
                    # lets do some reporting
                    [report, times, data] = (
                        nnet3_log_parse.generate_accuracy_report(args.dir))
                    message = report
                    subject = ("Update : Expt {dir} : "
                               "Iter {iter}".format(dir=args.dir, iter=iter))
                    common_lib.send_mail(message, subject, args.email)

        # Counted even for skipped iterations, so that a restart from a
        # later stage sees the same amount of processed data.
        num_archives_processed = num_archives_processed + current_num_jobs

    if args.stage <= num_iters:
        logger.info("Doing final combination to produce final.raw")
        train_lib.common.combine_models(
            dir=args.dir, num_iters=num_iters,
            models_to_combine=models_to_combine, egs_dir=egs_dir,
            left_context=left_context, right_context=right_context,
            run_opts=run_opts,
            background_process_handler=background_process_handler,
            get_raw_nnet_from_am=False)

    if include_log_softmax and args.stage <= num_iters + 1:
        logger.info("Getting average posterior for purposes of "
                    "adjusting the priors.")
        train_lib.common.compute_average_posterior(
            dir=args.dir, iter='final', egs_dir=egs_dir,
            num_archives=num_archives,
            left_context=left_context, right_context=right_context,
            prior_subset_size=args.prior_subset_size, run_opts=run_opts,
            get_raw_nnet_from_am=False)

    if args.cleanup:
        logger.info("Cleaning up the experiment directory "
                    "{0}".format(args.dir))
        remove_egs = args.remove_egs
        if args.egs_dir is not None:
            # this egs_dir was not created by this experiment so we will not
            # delete it
            remove_egs = False

        common_train_lib.clean_nnet_dir(
            nnet_dir=args.dir, num_iters=num_iters, egs_dir=egs_dir,
            preserve_model_interval=args.preserve_model_interval,
            remove_egs=remove_egs,
            get_raw_nnet_from_am=False)

    # do some reporting
    [report, times, data] = nnet3_log_parse.generate_accuracy_report(args.dir)
    if args.email is not None:
        common_lib.send_mail(report, "Update : Expt {0} : "
                                     "complete".format(args.dir), args.email)

    with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f:
        f.write(report)

    common_lib.run_job("steps/info/nnet3_dir_info.pl "
                       "{0}".format(args.dir))