def add_back_compatibility_info(config_dir):
    """This will be removed when python script refactoring is done."""

    common_lib.run_kaldi_command("nnet3-init {0}/ref.config "
                                 "{0}/ref.raw".format(config_dir))
    out, err = common_lib.run_kaldi_command("nnet3-info {0}/ref.raw | "
                                            "head -4".format(config_dir))
    # out looks like this
    # left-context: 7
    # right-context: 0
    # num-parameters: 90543902
    # modulus: 1
    info = {}
    for line in out.split("\n"):
        parts = line.split(":")
        if len(parts) != 2:
            continue
        info[parts[0].strip()] = int(parts[1].strip())

    # Writing the back-compatible vars file, e.g.
    #   model_left_context=0
    #   model_right_context=7
    #   num_hidden_layers=3
    with open('{0}/vars'.format(config_dir), 'w') as vf:
        vf.write('model_left_context={0}\n'.format(info['left-context']))
        vf.write('model_right_context={0}\n'.format(info['right-context']))
        vf.write('num_hidden_layers=1\n')

    # note: the symlink target is a fixed name, so no .format() is needed
    common_lib.force_symlink("final.config",
                             "{0}/layer1.config".format(config_dir))
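
# --- Illustrative sketch (not part of the original library) ----------------
# For reference, this is how the key/value parsing above behaves on the
# sample `nnet3-info` output quoted in the comments; it is standalone and
# does not require Kaldi.
def _demo_nnet3_info_parsing():
    sample_out = ("left-context: 7\n"
                  "right-context: 0\n"
                  "num-parameters: 90543902\n"
                  "modulus: 1\n")
    info = {}
    for line in sample_out.split("\n"):
        parts = line.split(":")
        if len(parts) != 2:
            continue
        info[parts[0].strip()] = int(parts[1].strip())
    print(info['left-context'], info['right-context'])  # -> 7 0
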
def do_shrinkage(iter, model_file, shrink_saturation_threshold,
                 get_raw_nnet_from_am=True):

    if iter == 0:
        return True

    if get_raw_nnet_from_am:
        output, error = common_lib.run_kaldi_command(
            "nnet3-am-info --print-args=false {0} | "
            "steps/nnet3/get_saturation.pl".format(model_file))
    else:
        output, error = common_lib.run_kaldi_command(
            "nnet3-info --print-args=false {0} | "
            "steps/nnet3/get_saturation.pl".format(model_file))
    output = output.strip().split("\n")
    try:
        assert len(output) == 1
        saturation = float(output[0])
        assert 0 <= saturation <= 1
    except (AssertionError, ValueError):
        raise Exception("Something went wrong, could not get "
                        "saturation from the output '{0}' of "
                        "get_saturation.pl on the info of "
                        "model {1}".format(output, model_file))
    return saturation > shrink_saturation_threshold
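
# --- Illustrative sketch (not part of the original library) ----------------
# The decision above reduces to one comparison: get_saturation.pl prints a
# single number in [0, 1], and shrinkage is applied only when it exceeds
# the configured threshold.  The values here are made up.
def _demo_do_shrinkage_decision():
    saturation = float("0.72\n".strip())   # stands in for the script output
    shrink_saturation_threshold = 0.40
    print(saturation > shrink_saturation_threshold)  # -> True: shrink
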
def parse_prob_logs(exp_dir, key='accuracy', output="output"):
    train_prob_files = "%s/log/compute_prob_train.*.log" % (exp_dir)
    valid_prob_files = "%s/log/compute_prob_valid.*.log" % (exp_dir)
    train_prob_strings = common_lib.run_kaldi_command(
        'grep -e {0} {1}'.format(key, train_prob_files), wait=True)[0]
    valid_prob_strings = common_lib.run_kaldi_command(
        'grep -e {0} {1}'.format(key, valid_prob_files))[0]

    # LOG
    # (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:149)
    # Overall log-probability for 'output' is -0.399395 + -0.013437 = -0.412832
    # per frame, over 20000 frames.

    # LOG
    # (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:144)
    # Overall log-probability for 'output' is -0.307255 per frame, over 20000
    # frames.

    parse_regex = re.compile(
        r".*compute_prob_.*\.([0-9]+).log:LOG "
        r".nnet3.*compute-prob.*:PrintTotalStats..:"
        r"nnet.*diagnostics.cc:[0-9]+. Overall ([a-zA-Z\-]+) for "
        r"'{output}'.*is ([0-9.\-e]+) .*per frame".format(output=output))

    train_loss = {}
    valid_loss = {}

    for line in train_prob_strings.split('\n'):
        mat_obj = parse_regex.search(line)
        if mat_obj is not None:
            groups = mat_obj.groups()
            if groups[1] == key:
                train_loss[int(groups[0])] = groups[2]
    if not train_loss:
        raise KaldiLogParseException("Could not find any lines with {k} in "
                                     "{l}".format(k=key, l=train_prob_files))

    for line in valid_prob_strings.split('\n'):
        mat_obj = parse_regex.search(line)
        if mat_obj is not None:
            groups = mat_obj.groups()
            if groups[1] == key:
                valid_loss[int(groups[0])] = groups[2]
    if not valid_loss:
        raise KaldiLogParseException("Could not find any lines with {k} in "
                                     "{l}".format(k=key, l=valid_prob_files))

    iters = list(set(valid_loss.keys()).intersection(train_loss.keys()))
    if not iters:
        raise KaldiLogParseException("Could not find any common iterations "
                                     "with key {k} in both {tl} and "
                                     "{vl}".format(k=key, tl=train_prob_files,
                                                   vl=valid_prob_files))
    iters.sort()
    return [(int(x), float(train_loss[x]), float(valid_loss[x]))
            for x in iters]
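
# --- Illustrative sketch (not part of the original library) ----------------
# As a sanity check, the regex above can be exercised on the second sample
# LOG line from the comments; the experiment path is made up.
def _demo_parse_prob_regex():
    import re
    sample = ("exp/chain/tdnn_sp/log/compute_prob_valid.123.log:LOG "
              "(nnet3-chain-compute-prob:PrintTotalStats():"
              "nnet-chain-diagnostics.cc:144) Overall log-probability for "
              "'output' is -0.307255 per frame, over 20000 frames.")
    parse_regex = re.compile(
        r".*compute_prob_.*\.([0-9]+).log:LOG "
        r".nnet3.*compute-prob.*:PrintTotalStats..:"
        r"nnet.*diagnostics.cc:[0-9]+. Overall ([a-zA-Z\-]+) for "
        r"'output'.*is ([0-9.\-e]+) .*per frame")
    print(parse_regex.search(sample).groups())
    # -> ('123', 'log-probability', '-0.307255')
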
def compile(self):
    root, ext = os.path.splitext(self.pdf_file)
    dir_name = os.path.dirname(self.pdf_file)
    latex_file = root + ".tex"
    with open(latex_file, "w") as lat_file:
        lat_file.write("\n".join(self.document))
    logger.info("Compiling the latex report.")
    try:
        common_lib.run_kaldi_command(
            "pdflatex -interaction=batchmode "
            "-output-directory={0} {1}".format(dir_name, latex_file))
    except Exception as e:
        logger.warning("There was an error compiling the latex file {0}; "
                       "please do it manually: {1}".format(latex_file, e))
        return False
    return True
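
# --- Illustrative sketch (not part of the original library) ----------------
# A minimal standalone version of the same pdflatex invocation using only
# the standard library; the helper name and paths are hypothetical, while
# -interaction=batchmode (suppress interactive error prompts) and
# -output-directory (keep .pdf/.aux/.log beside the report) are standard
# pdflatex flags.
def _demo_compile_tex(tex_path, out_dir):
    import subprocess
    return subprocess.call(
        ["pdflatex", "-interaction=batchmode",
         "-output-directory", out_dir, tex_path]) == 0
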
def parse_progress_logs_for_nonlinearity_stats(exp_dir):
    """ Parse progress logs for mean and std stats for non-linearities.

    e.g. for a line that is parsed from progress.*.log:
    exp/nnet3/lstm_self_repair_ld5_sp/log/progress.9.log:component
    name=Lstm3_i type=SigmoidComponent, dim=1280, self-repair-scale=1e-05,
    count=1.96e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90
    95,98,99,100)=(0.05,0.09,0.11,0.15 0.19,0.27,0.50,0.72,0.83
    0.88,0.92,0.94,0.99), mean=0.502, stddev=0.23],
    deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90
    95,98,99,100)=(0.009,0.04,0.05,0.06 0.08,0.10,0.14,0.17,0.18
    0.19,0.20,0.20,0.21), mean=0.134, stddev=0.0397]
    """

    progress_log_files = "%s/log/progress.*.log" % (exp_dir)
    stats_per_component_per_iter = {}

    progress_log_lines = common_lib.run_kaldi_command(
        'grep -e "value-avg.*deriv-avg" {0}'.format(progress_log_files))[0]

    parse_regex = re.compile(
        r".*progress.([0-9]+).log:component name=(.+) "
        r"type=(.*)Component,.*"
        r"value-avg=\[.*mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*"
        r"deriv-avg=\[.*mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\]")

    for line in progress_log_lines.split("\n"):
        mat_obj = parse_regex.search(line)
        if mat_obj is None:
            continue
        # groups = ('9', 'Lstm3_i', 'Sigmoid', '0.502', '0.23',
        #           '0.134', '0.0397')
        groups = mat_obj.groups()
        iteration = int(groups[0])
        component_name = groups[1]
        component_type = groups[2]
        value_mean = float(groups[3])
        value_stddev = float(groups[4])
        deriv_mean = float(groups[5])
        deriv_stddev = float(groups[6])
        try:
            stats_per_component_per_iter[component_name][
                'stats'][iteration] = [value_mean, value_stddev,
                                       deriv_mean, deriv_stddev]
        except KeyError:
            stats_per_component_per_iter[component_name] = {}
            stats_per_component_per_iter[component_name][
                'type'] = component_type
            stats_per_component_per_iter[component_name]['stats'] = {}
            stats_per_component_per_iter[component_name][
                'stats'][iteration] = [value_mean, value_stddev,
                                       deriv_mean, deriv_stddev]

    return stats_per_component_per_iter
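
# --- Illustrative sketch (not part of the original library) ----------------
# The regex above run on an abbreviated version of the docstring's sample
# line (the percentile blocks are elided and the path is made up).
def _demo_nonlinearity_regex():
    import re
    sample = ("exp/nnet3/lstm_sp/log/progress.9.log:component name=Lstm3_i "
              "type=SigmoidComponent, dim=1280, "
              "value-avg=[percentiles(...), mean=0.502, stddev=0.23], "
              "deriv-avg=[percentiles(...), mean=0.134, stddev=0.0397]")
    parse_regex = re.compile(
        r".*progress.([0-9]+).log:component name=(.+) "
        r"type=(.*)Component,.*"
        r"value-avg=\[.*mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*"
        r"deriv-avg=\[.*mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\]")
    print(parse_regex.search(sample).groups())
    # -> ('9', 'Lstm3_i', 'Sigmoid', '0.502', '0.23', '0.134', '0.0397')
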
def parse_train_logs(exp_dir):
    train_log_files = "%s/log/train.*.log" % (exp_dir)
    train_log_lines = common_lib.run_kaldi_command(
        'grep -e Accounting {0}'.format(train_log_files))[0]
    parse_regex = re.compile(r".*train\.([0-9]+)\.([0-9]+)\.log:# "
                             r"Accounting: time=([0-9]+) thread.*")

    train_times = {}
    for line in train_log_lines.split('\n'):
        mat_obj = parse_regex.search(line)
        if mat_obj is not None:
            groups = mat_obj.groups()
            try:
                train_times[int(groups[0])][int(groups[1])] = float(groups[2])
            except KeyError:
                train_times[int(groups[0])] = {}
                train_times[int(groups[0])][int(groups[1])] = float(groups[2])

    # for each iteration, keep only the slowest job's time
    for iter in list(train_times.keys()):
        values = train_times[iter].values()
        train_times[iter] = max(values)
    return train_times
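
# --- Illustrative sketch (not part of the original library) ----------------
# Standalone check of the regex above on a made-up grep line: iteration 3,
# job 2, wall-clock time 945 seconds.
def _demo_train_log_regex():
    import re
    sample = ("exp/chain/tdnn_sp/log/train.3.2.log:# "
              "Accounting: time=945 threads=1")
    parse_regex = re.compile(r".*train\.([0-9]+)\.([0-9]+)\.log:# "
                             r"Accounting: time=([0-9]+) thread.*")
    print(parse_regex.search(sample).groups())  # -> ('3', '2', '945')
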
def train(args, run_opts, background_process_handler):
    """ The main function for training.

    Args:
        args: a Namespace object with the required parameters
            obtained from the function process_args()
        run_opts: RunOpts object obtained from the process_args()
        background_process_handler: handler for background jobs such as
            diagnostics and reporting
    """

    arg_string = pprint.pformat(vars(args))
    logger.info("Arguments for the experiment\n{0}".format(arg_string))

    # Check files
    chain_lib.check_for_required_files(args.feat_dir, args.tree_dir,
                                       args.lat_dir)

    # Set some variables.
    num_jobs = common_lib.get_number_of_jobs(args.tree_dir)
    feat_dim = common_lib.get_feat_dim(args.feat_dir)
    ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir)

    # split the training data into parts for individual jobs
    # we will use the same number of jobs as that used for alignment
    common_lib.split_data(args.feat_dir, num_jobs)
    shutil.copy('{0}/tree'.format(args.tree_dir), args.dir)

    with open('{0}/num_jobs'.format(args.dir), 'w') as f:
        f.write(str(num_jobs))

    config_dir = '{0}/configs'.format(args.dir)
    var_file = '{0}/vars'.format(config_dir)

    variables = common_train_lib.parse_generic_config_vars_file(var_file)

    # Set some variables.
    try:
        model_left_context = variables['model_left_context']
        model_right_context = variables['model_right_context']
        # this is really the number of times we add layers to the network for
        # discriminative pretraining
        num_hidden_layers = variables['num_hidden_layers']
    except KeyError as e:
        raise Exception("KeyError {0}: Variables need to be defined in "
                        "{1}".format(str(e), '{0}/configs'.format(args.dir)))

    left_context = args.chunk_left_context + model_left_context
    right_context = args.chunk_right_context + model_right_context

    # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
    # matrix.  This first config just does any initial splicing that we do;
    # we do this as it's a convenient way to get the stats for the 'lda-like'
    # transform.
    if (args.stage <= -6):
        logger.info("Creating phone language-model")
        chain_lib.create_phone_lm(args.dir, args.tree_dir, run_opts,
                                  lm_opts=args.lm_opts)

    if (args.stage <= -5):
        logger.info("Creating denominator FST")
        chain_lib.create_denominator_fst(args.dir, args.tree_dir, run_opts)

    if (args.stage <= -4):
        logger.info("Initializing a basic network for estimating "
                    "preconditioning matrix")
        common_lib.run_kaldi_command(
            """{command} {dir}/log/nnet_init.log \
                    nnet3-init --srand=-2 {dir}/configs/init.config \
                    {dir}/init.raw""".format(command=run_opts.command,
                                             dir=args.dir))

    # use integer division so the contexts stay ints in Python 3 as well
    egs_left_context = left_context + args.frame_subsampling_factor // 2
    egs_right_context = right_context + args.frame_subsampling_factor // 2

    default_egs_dir = '{0}/egs'.format(args.dir)
    if (args.stage <= -3) and args.egs_dir is None:
        logger.info("Generating egs")
        # this is where get_egs.sh is called.
        chain_lib.generate_chain_egs(
            dir=args.dir, data=args.feat_dir,
            lat_dir=args.lat_dir, egs_dir=default_egs_dir,
            left_context=egs_left_context,
            right_context=egs_right_context,
            run_opts=run_opts,
            left_tolerance=args.left_tolerance,
            right_tolerance=args.right_tolerance,
            frame_subsampling_factor=args.frame_subsampling_factor,
            alignment_subsampling_factor=args.alignment_subsampling_factor,
            frames_per_eg=args.chunk_width,
            srand=args.srand,
            egs_opts=args.egs_opts,
            cmvn_opts=args.cmvn_opts,
            online_ivector_dir=args.online_ivector_dir,
            frames_per_iter=args.frames_per_iter,
            transform_dir=args.transform_dir,
            stage=args.egs_stage)

    if args.egs_dir is None:
        egs_dir = default_egs_dir
    else:
        egs_dir = args.egs_dir

    [egs_left_context, egs_right_context,
     frames_per_eg, num_archives] = (
        common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim,
                                        egs_left_context, egs_right_context))
    assert args.chunk_width == frames_per_eg

    num_archives_expanded = num_archives * args.frame_subsampling_factor

    if (args.num_jobs_final > num_archives_expanded):
        raise Exception('num_jobs_final cannot exceed the '
                        'expanded number of archives')

    # copy the properties of the egs to dir for
    # use during decoding
    common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir)

    if (args.stage <= -2):
        logger.info('Computing the preconditioning matrix for input features')

        chain_lib.compute_preconditioning_matrix(
            args.dir, egs_dir, num_archives, run_opts,
            max_lda_jobs=args.max_lda_jobs,
            rand_prune=args.rand_prune)

    if (args.stage <= -1):
        logger.info("Preparing the initial acoustic model.")
        chain_lib.prepare_initial_acoustic_model(args.dir, run_opts)

    with open("{0}/frame_subsampling_factor".format(args.dir), "w") as f:
        f.write(str(args.frame_subsampling_factor))

    # set num_iters so that as close as possible, we process the data
    # $num_epochs times, i.e. $num_iters*$avg_num_jobs ==
    # $num_epochs*$num_archives, where
    # avg_num_jobs = (num_jobs_initial + num_jobs_final) / 2.
    num_archives_to_process = args.num_epochs * num_archives_expanded
    num_archives_processed = 0
    # integer division: num_iters must be an int (it sizes range() below)
    num_iters = ((num_archives_to_process * 2)
                 // (args.num_jobs_initial + args.num_jobs_final))

    models_to_combine = common_train_lib.verify_iterations(
        num_iters, args.num_epochs,
        num_hidden_layers, num_archives_expanded,
        args.max_models_combine, args.add_layers_period,
        args.num_jobs_final)

    def learning_rate(iter, current_num_jobs, num_archives_processed):
        return common_train_lib.get_learning_rate(iter, current_num_jobs,
                                                  num_iters,
                                                  num_archives_processed,
                                                  num_archives_to_process,
                                                  args.initial_effective_lrate,
                                                  args.final_effective_lrate)

    min_deriv_time = None
    max_deriv_time = None
    if args.deriv_truncate_margin is not None:
        min_deriv_time = -args.deriv_truncate_margin - model_left_context
        max_deriv_time = (args.chunk_width - 1 + args.deriv_truncate_margin
                          + model_right_context)

    logger.info("Training will run for {0} epochs = "
                "{1} iterations".format(args.num_epochs, num_iters))

    for iter in range(num_iters):
        if (args.exit_stage is not None) and (iter == args.exit_stage):
            logger.info("Exiting early due to --exit-stage {0}".format(iter))
            return
        current_num_jobs = int(0.5 + args.num_jobs_initial
                               + (args.num_jobs_final - args.num_jobs_initial)
                               * float(iter) / num_iters)

        if args.stage <= iter:
            model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter)

            shrinkage_value = 1.0
            if args.shrink_value != 1.0:
                shrinkage_value = (args.shrink_value
                                   if common_train_lib.do_shrinkage(
                                       iter, model_file,
                                       args.shrink_saturation_threshold)
                                   else 1.0)

            logger.info("On iteration {0}, learning rate is {1} and "
                        "shrink value is {2}.".format(
                            iter, learning_rate(iter, current_num_jobs,
                                                num_archives_processed),
                            shrinkage_value))

            chain_lib.train_one_iteration(
                dir=args.dir,
                iter=iter,
                srand=args.srand,
                egs_dir=egs_dir,
                num_jobs=current_num_jobs,
                num_archives_processed=num_archives_processed,
                num_archives=num_archives,
                learning_rate=learning_rate(iter, current_num_jobs,
                                            num_archives_processed),
                shrinkage_value=shrinkage_value,
                num_chunk_per_minibatch=args.num_chunk_per_minibatch,
                num_hidden_layers=num_hidden_layers,
                add_layers_period=args.add_layers_period,
                left_context=left_context,
                right_context=right_context,
                apply_deriv_weights=args.apply_deriv_weights,
                min_deriv_time=min_deriv_time,
                max_deriv_time=max_deriv_time,
                l2_regularize=args.l2_regularize,
                xent_regularize=args.xent_regularize,
                leaky_hmm_coefficient=args.leaky_hmm_coefficient,
                momentum=args.momentum,
                max_param_change=args.max_param_change,
                shuffle_buffer_size=args.shuffle_buffer_size,
                frame_subsampling_factor=args.frame_subsampling_factor,
                truncate_deriv_weights=args.truncate_deriv_weights,
                run_opts=run_opts,
                background_process_handler=background_process_handler)

            if args.cleanup:
                # do a clean up of everything but the last 2 models, under
                # certain conditions
                common_train_lib.remove_model(
                    args.dir, iter-2, num_iters, models_to_combine,
                    args.preserve_model_interval)

            if args.email is not None:
                reporting_iter_interval = num_iters * args.reporting_interval
                if iter % reporting_iter_interval == 0:
                    # let's do some reporting
                    [report, times, data] = (
                        nnet3_log_parse.generate_accuracy_report(
                            args.dir, "log-probability"))
                    message = report
                    subject = ("Update : Expt {dir} : "
                               "Iter {iter}".format(dir=args.dir, iter=iter))
                    common_lib.send_mail(message, subject, args.email)

        num_archives_processed = num_archives_processed + current_num_jobs

    if args.stage <= num_iters:
        logger.info("Doing final combination to produce final.mdl")
        chain_lib.combine_models(
            dir=args.dir, num_iters=num_iters,
            models_to_combine=models_to_combine,
            num_chunk_per_minibatch=args.num_chunk_per_minibatch,
            egs_dir=egs_dir,
            left_context=left_context, right_context=right_context,
            leaky_hmm_coefficient=args.leaky_hmm_coefficient,
            l2_regularize=args.l2_regularize,
            xent_regularize=args.xent_regularize,
            run_opts=run_opts,
            background_process_handler=background_process_handler)

    if args.cleanup:
        logger.info("Cleaning up the experiment directory "
                    "{0}".format(args.dir))
        remove_egs = args.remove_egs
        if args.egs_dir is not None:
            # this egs_dir was not created by this experiment so we will not
            # delete it
            remove_egs = False

        common_train_lib.clean_nnet_dir(
            args.dir, num_iters, egs_dir,
            preserve_model_interval=args.preserve_model_interval,
            remove_egs=remove_egs)

    # do some reporting
    [report, times, data] = nnet3_log_parse.generate_accuracy_report(
        args.dir, "log-probability")
    if args.email is not None:
        common_lib.send_mail(report, "Update : Expt {0} : "
                                     "complete".format(args.dir), args.email)

    with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f:
        f.write(report)

    common_lib.run_kaldi_command("steps/info/nnet3_dir_info.pl "
                                 "{0}".format(args.dir))
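
# --- Illustrative sketch (not part of the original library) ----------------
# A small worked example of the num_iters formula above; all numbers are
# made up.  With 4 epochs over 120 expanded archives, ramping from 2 jobs
# up to 8 jobs, the job count averages (2 + 8) / 2 = 5, and each iteration
# processes current_num_jobs archives.
def _demo_num_iters():
    num_epochs = 4
    num_archives_expanded = 120
    num_jobs_initial, num_jobs_final = 2, 8
    num_archives_to_process = num_epochs * num_archives_expanded   # 480
    num_iters = ((num_archives_to_process * 2)
                 // (num_jobs_initial + num_jobs_final))
    print(num_iters)  # -> 96, since 96 iterations * 5 avg jobs = 480 passes
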
def parse_progress_logs_for_param_diff(exp_dir, pattern):
    """ Parse progress logs for per-component parameter differences.

    e.g. for a line that is parsed from progress.*.log:
    exp/chain/cwrnn_trial2_ld5_sp/log/progress.245.log:LOG
    (nnet3-show-progress:main():nnet3-show-progress.cc:144) Relative
    parameter differences per layer are [ Cwrnn1_T3_W_r:0.0171537
    Cwrnn1_T3_W_x:1.33338e-07 Cwrnn1_T2_W_r:0.048075
    Cwrnn1_T2_W_x:1.34088e-07 Cwrnn1_T1_W_r:0.0157277
    Cwrnn1_T1_W_x:0.0212704 Final_affine:0.0321521
    Cwrnn2_T3_W_r:0.0212082 Cwrnn2_T3_W_x:1.33691e-07
    Cwrnn2_T2_W_r:0.0212978 Cwrnn2_T2_W_x:1.33401e-07
    Cwrnn2_T1_W_r:0.014976 Cwrnn2_T1_W_x:0.0233588
    Cwrnn3_T3_W_r:0.0237165 Cwrnn3_T3_W_x:1.33184e-07
    Cwrnn3_T2_W_r:0.0239754 Cwrnn3_T2_W_x:1.3296e-07
    Cwrnn3_T1_W_r:0.0194809 Cwrnn3_T1_W_x:0.0271934 ]
    """

    if pattern not in set(["Relative parameter differences",
                           "Parameter differences"]):
        raise Exception("Unknown value for pattern : {0}".format(pattern))

    progress_log_files = "%s/log/progress.*.log" % (exp_dir)
    progress_per_iter = {}
    component_names = set([])
    progress_log_lines = common_lib.run_kaldi_command(
        'grep -e "{0}" {1}'.format(pattern, progress_log_files))[0]
    parse_regex = re.compile(r".*progress\.([0-9]+)\.log:"
                             r"LOG.*{0}.*\[(.*)\]".format(pattern))
    for line in progress_log_lines.split("\n"):
        mat_obj = parse_regex.search(line)
        if mat_obj is None:
            continue
        groups = mat_obj.groups()
        iteration = groups[0]
        differences = parse_difference_string(groups[1])
        component_names = component_names.union(differences.keys())
        progress_per_iter[int(iteration)] = differences

    component_names = list(component_names)
    component_names.sort()
    # rearranging the parameter differences available per iter
    # into parameter differences per component
    progress_per_component = {}
    for cn in component_names:
        progress_per_component[cn] = {}

    max_iter = max(progress_per_iter.keys())
    total_missing_iterations = 0
    gave_user_warning = False
    for iter in range(max_iter + 1):
        try:
            component_dict = progress_per_iter[iter]
        except KeyError:
            continue

        for component_name in component_names:
            try:
                progress_per_component[component_name][iter] = component_dict[
                    component_name]
            except KeyError:
                # the component was not found this iteration; this may be
                # because of layerwise discriminative training
                total_missing_iterations += 1

        if (total_missing_iterations / len(component_names) > 20
                and not gave_user_warning and logger is not None):
            logger.warning("On average, {0} iterations are missing per "
                           "component. Something might be wrong.".format(
                               total_missing_iterations
                               / len(component_names)))
            gave_user_warning = True

    return {'progress_per_component': progress_per_component,
            'component_names': component_names,
            'max_iter': max_iter}
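
# --- Illustrative sketch (not part of the original library) ----------------
# parse_difference_string is defined elsewhere in this module; assuming it
# maps the bracketed "name:value" payload captured by the regex above to a
# dict, its behavior presumably matches this standalone sketch.
def _demo_parse_difference_string():
    payload = ("Cwrnn1_T3_W_r:0.0171537 Cwrnn1_T3_W_x:1.33338e-07 "
               "Final_affine:0.0321521")
    differences = {}
    for pair in payload.split():
        name, value = pair.split(":")
        differences[name] = float(value)
    print(differences["Final_affine"])  # -> 0.0321521
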
def parse_progress_logs_for_clipped_proportion(exp_dir):
    """ Parse progress logs for clipped proportion stats.

    e.g. for a line that is parsed from progress.*.log:
    exp/chain/cwrnn_trial2_ld5_sp/log/progress.245.log:component
    name=BLstm1_forward_c type=ClipGradientComponent, dim=512,
    norm-based-clipping=true, clipping-threshold=30,
    clipped-proportion=0.000565527,
    self-repair-clipped-proportion-threshold=0.01, self-repair-target=0,
    self-repair-scale=1
    """

    progress_log_files = "%s/log/progress.*.log" % (exp_dir)
    progress_log_lines = common_lib.run_kaldi_command(
        'grep -e "{0}" {1}'.format("clipped-proportion",
                                   progress_log_files))[0]
    parse_regex = re.compile(r".*progress\.([0-9]+)\.log:component "
                             r"name=(.*) type=.* "
                             r"clipped-proportion=([0-9\.e\-]+)")

    cp_per_component_per_iter = {}

    max_iteration = 0
    component_names = set([])
    for line in progress_log_lines.split("\n"):
        mat_obj = parse_regex.search(line)
        if mat_obj is None:
            if line.strip() == "":
                continue
            raise MalformedClippedProportionLineException(line)
        groups = mat_obj.groups()
        iteration = int(groups[0])
        max_iteration = max(max_iteration, iteration)
        name = groups[1]
        clipped_proportion = float(groups[2])
        if clipped_proportion > 1:
            raise MalformedClippedProportionLineException(line)
        if iteration not in cp_per_component_per_iter:
            cp_per_component_per_iter[iteration] = {}
        cp_per_component_per_iter[iteration][name] = clipped_proportion
        component_names.add(name)

    component_names = list(component_names)
    component_names.sort()

    # rearranging the data into a table (rows are iterations, columns are
    # components) and into cp_per_iter_per_component
    cp_per_iter_per_component = {}
    for component_name in component_names:
        cp_per_iter_per_component[component_name] = []
    data = []
    data.append(["iteration"] + component_names)
    for iter in range(max_iteration + 1):
        if iter not in cp_per_component_per_iter:
            continue
        comp_dict = cp_per_component_per_iter[iter]
        row = [iter]
        for component in component_names:
            try:
                row.append(comp_dict[component])
                cp_per_iter_per_component[component].append(
                    [iter, comp_dict[component]])
            except KeyError:
                # if clipped proportion is not available for a particular
                # component, it is set to None; this usually happens during
                # layer-wise discriminative training
                row.append(None)
        data.append(row)

    return {'table': data,
            'cp_per_component_per_iter': cp_per_component_per_iter,
            'cp_per_iter_per_component': cp_per_iter_per_component}
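
# --- Illustrative sketch (not part of the original library) ----------------
# The docstring's own sample line, run through the same regex standalone.
def _demo_clipped_proportion_regex():
    import re
    sample = ("exp/chain/cwrnn_trial2_ld5_sp/log/progress.245.log:component "
              "name=BLstm1_forward_c type=ClipGradientComponent, dim=512, "
              "norm-based-clipping=true, clipping-threshold=30, "
              "clipped-proportion=0.000565527, "
              "self-repair-clipped-proportion-threshold=0.01, "
              "self-repair-target=0, self-repair-scale=1")
    parse_regex = re.compile(r".*progress\.([0-9]+)\.log:component "
                             r"name=(.*) type=.* "
                             r"clipped-proportion=([0-9\.e\-]+)")
    print(parse_regex.search(sample).groups())
    # -> ('245', 'BLstm1_forward_c', '0.000565527')
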
def train(args, run_opts, background_process_handler):
    """ The main function for training.

    Args:
        args: a Namespace object with the required parameters
            obtained from the function process_args()
        run_opts: RunOpts object obtained from the process_args()
        background_process_handler: handler for background jobs such as
            diagnostics and reporting
    """

    arg_string = pprint.pformat(vars(args))
    logger.info("Arguments for the experiment\n{0}".format(arg_string))

    # Check files
    chain_lib.check_for_required_files(args.feat_dir, args.tree_dir,
                                       args.lat_dir)

    # Set some variables.
    num_jobs = common_lib.get_number_of_jobs(args.tree_dir)
    feat_dim = common_lib.get_feat_dim(args.feat_dir)
    ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir)
    ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir)

    # split the training data into parts for individual jobs
    # we will use the same number of jobs as that used for alignment
    common_lib.split_data(args.feat_dir, num_jobs)
    shutil.copy('{0}/tree'.format(args.tree_dir), args.dir)

    with open('{0}/num_jobs'.format(args.dir), 'w') as f:
        f.write(str(num_jobs))

    config_dir = '{0}/configs'.format(args.dir)
    var_file = '{0}/vars'.format(config_dir)

    variables = common_train_lib.parse_generic_config_vars_file(var_file)

    # Set some variables.
    try:
        model_left_context = variables['model_left_context']
        model_right_context = variables['model_right_context']
        # this is really the number of times we add layers to the network for
        # discriminative pretraining
        num_hidden_layers = variables['num_hidden_layers']
    except KeyError as e:
        raise Exception("KeyError {0}: Variables need to be defined in "
                        "{1}".format(str(e), '{0}/configs'.format(args.dir)))

    left_context = args.chunk_left_context + model_left_context
    right_context = args.chunk_right_context + model_right_context
    left_context_initial = (args.chunk_left_context_initial
                            + model_left_context
                            if args.chunk_left_context_initial >= 0 else -1)
    right_context_final = (args.chunk_right_context_final
                           + model_right_context
                           if args.chunk_right_context_final >= 0 else -1)

    # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
    # matrix.  This first config just does any initial splicing that we do;
    # we do this as it's a convenient way to get the stats for the 'lda-like'
    # transform.
    if (args.stage <= -6):
        logger.info("Creating phone language-model")
        chain_lib.create_phone_lm(args.dir, args.tree_dir, run_opts,
                                  lm_opts=args.lm_opts)

    if (args.stage <= -5):
        logger.info("Creating denominator FST")
        chain_lib.create_denominator_fst(args.dir, args.tree_dir, run_opts)

    if (args.stage <= -4):
        logger.info("Initializing a basic network for estimating "
                    "preconditioning matrix")
        common_lib.run_kaldi_command(
            """{command} {dir}/log/nnet_init.log \
                    nnet3-init --srand=-2 {dir}/configs/init.config \
                    {dir}/init.raw""".format(command=run_opts.command,
                                             dir=args.dir))

    # use integer division so the contexts stay ints in Python 3 as well
    egs_left_context = left_context + args.frame_subsampling_factor // 2
    egs_right_context = right_context + args.frame_subsampling_factor // 2
    egs_left_context_initial = (left_context_initial
                                + args.frame_subsampling_factor // 2
                                if left_context_initial >= 0 else -1)
    egs_right_context_final = (right_context_final
                               + args.frame_subsampling_factor // 2
                               if right_context_final >= 0 else -1)

    default_egs_dir = '{0}/egs'.format(args.dir)
    if (args.stage <= -3) and args.egs_dir is None:
        logger.info("Generating egs")
        # this is where get_egs.sh is called.
        chain_lib.generate_chain_egs(
            dir=args.dir, data=args.feat_dir,
            lat_dir=args.lat_dir, egs_dir=default_egs_dir,
            left_context=egs_left_context,
            right_context=egs_right_context,
            left_context_initial=egs_left_context_initial,
            right_context_final=egs_right_context_final,
            run_opts=run_opts,
            left_tolerance=args.left_tolerance,
            right_tolerance=args.right_tolerance,
            frame_subsampling_factor=args.frame_subsampling_factor,
            alignment_subsampling_factor=args.alignment_subsampling_factor,
            frames_per_eg_str=args.chunk_width,
            srand=args.srand,
            egs_opts=args.egs_opts,
            cmvn_opts=args.cmvn_opts,
            online_ivector_dir=args.online_ivector_dir,
            frames_per_iter=args.frames_per_iter,
            transform_dir=args.transform_dir,
            stage=args.egs_stage)

    if args.egs_dir is None:
        egs_dir = default_egs_dir
    else:
        egs_dir = args.egs_dir

    [egs_left_context, egs_right_context,
     frames_per_eg_str, num_archives] = (
        common_train_lib.verify_egs_dir(
            egs_dir, feat_dim, ivector_dim, ivector_id,
            egs_left_context, egs_right_context,
            egs_left_context_initial, egs_right_context_final))
    assert args.chunk_width == frames_per_eg_str

    num_archives_expanded = num_archives * args.frame_subsampling_factor

    if (args.num_jobs_final > num_archives_expanded):
        raise Exception('num_jobs_final cannot exceed the '
                        'expanded number of archives')

    # copy the properties of the egs to dir for
    # use during decoding
    logger.info("Copying the properties from {0} to {1}".format(egs_dir,
                                                                args.dir))
    common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir)

    if (args.stage <= -2):
        logger.info('Computing the preconditioning matrix for input features')

        chain_lib.compute_preconditioning_matrix(
            args.dir, egs_dir, num_archives, run_opts,
            max_lda_jobs=args.max_lda_jobs,
            rand_prune=args.rand_prune)

    if (args.stage <= -1):
        logger.info("Preparing the initial acoustic model.")
        chain_lib.prepare_initial_acoustic_model(args.dir, run_opts)

    with open("{0}/frame_subsampling_factor".format(args.dir), "w") as f:
        f.write(str(args.frame_subsampling_factor))

    # set num_iters so that as close as possible, we process the data
    # $num_epochs times, i.e. $num_iters*$avg_num_jobs ==
    # $num_epochs*$num_archives, where
    # avg_num_jobs = (num_jobs_initial + num_jobs_final) / 2.
    num_archives_to_process = int(args.num_epochs * num_archives_expanded)
    num_archives_processed = 0
    # integer division: num_iters must be an int (it sizes range() below)
    num_iters = ((num_archives_to_process * 2)
                 // (args.num_jobs_initial + args.num_jobs_final))

    models_to_combine = common_train_lib.verify_iterations(
        num_iters, args.num_epochs,
        num_hidden_layers, num_archives_expanded,
        args.max_models_combine, args.add_layers_period,
        args.num_jobs_final)

    def learning_rate(iter, current_num_jobs, num_archives_processed):
        return common_train_lib.get_learning_rate(iter, current_num_jobs,
                                                  num_iters,
                                                  num_archives_processed,
                                                  num_archives_to_process,
                                                  args.initial_effective_lrate,
                                                  args.final_effective_lrate)

    min_deriv_time = None
    max_deriv_time_relative = None
    if args.deriv_truncate_margin is not None:
        min_deriv_time = -args.deriv_truncate_margin - model_left_context
        max_deriv_time_relative = \
            args.deriv_truncate_margin + model_right_context

    logger.info("Training will run for {0} epochs = "
                "{1} iterations".format(args.num_epochs, num_iters))

    for iter in range(num_iters):
        if (args.exit_stage is not None) and (iter == args.exit_stage):
            logger.info("Exiting early due to --exit-stage {0}".format(iter))
            return
        current_num_jobs = int(0.5 + args.num_jobs_initial
                               + (args.num_jobs_final - args.num_jobs_initial)
                               * float(iter) / num_iters)

        if args.stage <= iter:
            model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter)

            shrinkage_value = 1.0
            if args.shrink_value != 1.0:
                shrinkage_value = (args.shrink_value
                                   if common_train_lib.do_shrinkage(
                                       iter, model_file,
                                       args.shrink_saturation_threshold)
                                   else 1.0)

            chain_lib.train_one_iteration(
                dir=args.dir,
                iter=iter,
                srand=args.srand,
                egs_dir=egs_dir,
                num_jobs=current_num_jobs,
                num_archives_processed=num_archives_processed,
                num_archives=num_archives,
                learning_rate=learning_rate(iter, current_num_jobs,
                                            num_archives_processed),
                dropout_edit_string=common_train_lib.get_dropout_edit_string(
                    args.dropout_schedule,
                    float(num_archives_processed) / num_archives_to_process,
                    iter),
                shrinkage_value=shrinkage_value,
                num_chunk_per_minibatch_str=args.num_chunk_per_minibatch,
                num_hidden_layers=num_hidden_layers,
                add_layers_period=args.add_layers_period,
                left_context=left_context,
                right_context=right_context,
                apply_deriv_weights=args.apply_deriv_weights,
                min_deriv_time=min_deriv_time,
                max_deriv_time_relative=max_deriv_time_relative,
                l2_regularize=args.l2_regularize,
                xent_regularize=args.xent_regularize,
                leaky_hmm_coefficient=args.leaky_hmm_coefficient,
                momentum=args.momentum,
                max_param_change=args.max_param_change,
                shuffle_buffer_size=args.shuffle_buffer_size,
                frame_subsampling_factor=args.frame_subsampling_factor,
                run_opts=run_opts,
                background_process_handler=background_process_handler)

            if args.cleanup:
                # do a clean up of everything but the last 2 models, under
                # certain conditions
                common_train_lib.remove_model(
                    args.dir, iter - 2, num_iters, models_to_combine,
                    args.preserve_model_interval)

            if args.email is not None:
                reporting_iter_interval = num_iters * args.reporting_interval
                if iter % reporting_iter_interval == 0:
                    # let's do some reporting
                    [report, times, data] = (
                        nnet3_log_parse.generate_acc_logprob_report(
                            args.dir, "log-probability"))
                    message = report
                    subject = ("Update : Expt {dir} : "
                               "Iter {iter}".format(dir=args.dir, iter=iter))
                    common_lib.send_mail(message, subject, args.email)

        num_archives_processed = num_archives_processed + current_num_jobs

    if args.stage <= num_iters:
        logger.info("Doing final combination to produce final.mdl")
        chain_lib.combine_models(
            dir=args.dir, num_iters=num_iters,
            models_to_combine=models_to_combine,
            num_chunk_per_minibatch_str=args.num_chunk_per_minibatch,
            egs_dir=egs_dir,
            left_context=left_context, right_context=right_context,
            leaky_hmm_coefficient=args.leaky_hmm_coefficient,
            l2_regularize=args.l2_regularize,
            xent_regularize=args.xent_regularize,
            run_opts=run_opts,
            background_process_handler=background_process_handler,
            sum_to_one_penalty=args.combine_sum_to_one_penalty)

    if args.cleanup:
        logger.info("Cleaning up the experiment directory "
                    "{0}".format(args.dir))
        remove_egs = args.remove_egs
        if args.egs_dir is not None:
            # this egs_dir was not created by this experiment so we will not
            # delete it
            remove_egs = False

        common_train_lib.clean_nnet_dir(
            args.dir, num_iters, egs_dir,
            preserve_model_interval=args.preserve_model_interval,
            remove_egs=remove_egs)

    # do some reporting
    [report, times, data] = nnet3_log_parse.generate_acc_logprob_report(
        args.dir, "log-probability")
    if args.email is not None:
        common_lib.send_mail(report, "Update : Expt {0} : "
                                     "complete".format(args.dir), args.email)

    with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f:
        f.write(report)

    common_lib.run_kaldi_command("steps/info/nnet3_dir_info.pl "
                                 "{0}".format(args.dir))
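
# --- Illustrative sketch (not part of the original library) ----------------
# The learning_rate closure above delegates to
# common_train_lib.get_learning_rate.  The sketch below shows the
# exponentially decaying schedule that function is understood to implement
# in Kaldi; treat the exact formula as an assumption and check
# common_train_lib if it matters.  The *effective* rate decays from the
# initial to the final value over the run, and the per-job rate is the
# effective rate scaled by the current number of jobs.
def _sketch_learning_rate(num_archives_processed, num_archives_to_process,
                          initial_effective_lrate, final_effective_lrate,
                          current_num_jobs):
    import math
    effective_lrate = initial_effective_lrate * math.exp(
        num_archives_processed
        * math.log(final_effective_lrate / initial_effective_lrate)
        / num_archives_to_process)
    return current_num_jobs * effective_lrate

# Halfway through a run decaying from 1e-3 to 1e-4 with 4 jobs:
# _sketch_learning_rate(240, 480, 1e-3, 1e-4, 4) is roughly 1.26e-3.
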