import math
import os
import shutil
import time


def train_one_iteration(model_dir, _iter, random_seed, egs_dir, num_jobs,
                        num_archives_processed, num_archives, learning_rate,
                        minibatch_size, momentum, max_param_change, run_opts,
                        feature_dim, archives_minibatch_count,
                        shrinkage_value=1.0, current_dropout=0.0):
    """Called from train for one iteration of neural network training.

    Selected args:
        shrinkage_value: If the value is 1.0, no shrinkage is done; otherwise
            parameter values are scaled by this value.
    """
    # Check whether different iterations use the same random seed.
    random_seed_file = '{0}/random_seed'.format(model_dir)
    if os.path.exists(random_seed_file):
        try:
            with open(random_seed_file, 'r') as fid:
                saved_random_seed = int(fid.readline().strip())
        except (IOError, ValueError):
            logger.error("Exception while reading the random seed "
                         "for training")
            raise
        if random_seed != saved_random_seed:
            logger.warning(
                "The random seed provided to this iteration "
                "(random_seed={0}) is different from the one saved last "
                "time (random_seed={1}). Using random_seed={0}."
                .format(random_seed, saved_random_seed))
    else:
        with open(random_seed_file, 'w') as fid:
            fid.write(str(random_seed))

    # Set off some background jobs to compute train and validation set
    # objectives.
    eval_trained_dnn(model_dir, _iter, egs_dir, run_opts)

    new_model = "{0}/model_{1}/model.meta".format(model_dir, _iter + 1)
    if utils.is_correct_model_dir("{0}/model_{1}".format(model_dir,
                                                         _iter + 1)):
        logger.info('The output model {0} already exists, so this iteration '
                    'is skipped.'.format(new_model))
        return

    do_average = (_iter > 0)
    dropout_proportion = current_dropout

    if do_average or True:  # TODO
        cur_minibatch_size = minibatch_size
        cur_max_param_change = max_param_change
    else:
        # On iteration zero, use a smaller minibatch size (and we will later
        # choose the output of just one of the jobs): the model-averaging
        # isn't always helpful when the model is changing too fast (i.e. it
        # can worsen the objective function), and the smaller minibatch size
        # will help to keep the update stable.
        cur_minibatch_size = minibatch_size // 2  # integer division
        cur_max_param_change = float(max_param_change) / math.sqrt(2)

    try_count = 0
    training_flag = True
    while try_count < MAX_TRY_COUNT and training_flag:
        train_new_models(model_dir=model_dir,
                         _iter=_iter,
                         random_seed=random_seed,
                         num_jobs=num_jobs,
                         num_archives_processed=num_archives_processed,
                         num_archives=num_archives,
                         learning_rate=learning_rate,
                         shrinkage_value=shrinkage_value,
                         dropout_proportion=dropout_proportion,
                         egs_dir=egs_dir,
                         momentum=momentum,
                         max_param_change=cur_max_param_change,
                         minibatch_size=cur_minibatch_size,
                         feature_dim=feature_dim,
                         try_count=try_count,
                         archives_minibatch_count=archives_minibatch_count,
                         run_opts=run_opts)
        try_count += 1
        training_flag = False
        cnt = 0
        for job in range(1, num_jobs + 1):
            if not utils.is_correct_model_dir("{0}/model_{1}.{2}".format(
                    model_dir, _iter + 1, job)):
                # Move the log aside so a resubmission does not overwrite it.
                log_file = "{0}/log/train.{1}.{2}.log".format(model_dir,
                                                              _iter, job)
                if os.path.exists(log_file):
                    os.rename(log_file, "{0}.{1}".format(log_file, try_count))
                training_flag = True
                cnt += 1

        if training_flag:
            if try_count < MAX_TRY_COUNT:
                logger.warning(
                    "{0}/{1} jobs failed. Resubmitting them may solve the "
                    "problem. Resubmitting after 30 seconds ...".format(
                        cnt, num_jobs))
                # Sleep for 30 seconds before resubmitting.
                time.sleep(30)
            else:
                logger.error(
                    "{0}/{1} jobs failed and the maximum number of retries "
                    "has been reached. Stopping the training ...".format(
                        cnt, num_jobs))

    if training_flag:
        raise Exception("Some training jobs failed more than %d times. "
                        "Please check the log files." % MAX_TRY_COUNT)

    [models_to_average, best_model] = utils.get_successful_models(
        num_jobs, '{0}/log/train.{1}.%.log'.format(model_dir, _iter))

    if do_average and len(models_to_average) > 1:
        # Average the output of the different jobs.  # TODO
        nets_dirs = []
        for n in models_to_average:
            nets_dirs.append("{0}/model_{1}.{2}".format(model_dir,
                                                        _iter + 1, n))
        utils.get_average_nnet_model(dir=model_dir,
                                     iter=_iter,
                                     nnets_list=" ".join(nets_dirs),
                                     run_opts=run_opts)
    else:
        # Choose the best model from the different jobs.
        utils.copy_best_nnet_dir(_dir=model_dir, _iter=_iter,
                                 best_model_index=best_model)

    try:
        # Remove the per-job models now that they have been combined.
        for i in range(1, num_jobs + 1):
            shutil.rmtree("{0}/model_{1}.{2}".format(model_dir, _iter + 1, i))
    except OSError:
        logger.error("Error while trying to delete the per-job models.")
        raise

    if not os.path.isfile(new_model):
        raise Exception(
            "Could not find {0} at the end of iteration {1}".format(
                new_model, _iter))
    elif os.stat(new_model).st_size == 0:
        raise Exception(
            "{0} has size 0. Something went wrong in iteration {1}".format(
                new_model, _iter))
def train_new_models(model_dir, _iter, random_seed, num_jobs,
                     num_archives_processed, num_archives, learning_rate,
                     shrinkage_value, dropout_proportion, egs_dir, momentum,
                     max_param_change, minibatch_size, run_opts, feature_dim,
                     archives_minibatch_count, try_count=0, train_opts=""):
    """Called from train_one_iteration(), this function does one iteration of
    training with 'num_jobs' jobs, and writes models in dirs like
    exp/tdnn_a/model_24.{1,2,3,..<num_jobs>}

    We cannot easily use a single parallel SGE job to do the main training,
    because the computation of which archive and which --frame option to use
    for each job is a little complex, so we spawn each one separately.
    """
    threads = []
    # The GPU timing info is only printed if we use the --verbose=1 flag;
    # this slows down the computation slightly, so don't accumulate it on
    # every iteration. Don't do it on iteration 0 either, because we use a
    # smaller than normal minibatch size, and people may get confused
    # thinking it's slower for iteration 0 because of the verbose option.
    verbose_opt = ("--verbose=1" if _iter % 20 == 0 and _iter > 0 else "")

    for job in range(1, num_jobs + 1):
        # k is a zero-based index that we will derive the other indexes from.
        k = num_archives_processed + job - 1
        # Work out the 1-based archive index.
        archive_index = (k % num_archives) + 1
        minibatch_count = archives_minibatch_count[archive_index]

        # On a resubmission, skip jobs that already produced a valid model.
        if try_count > 0 and utils.is_correct_model_dir(
                '{0}/model_{1}.{2}'.format(model_dir, _iter + 1, job)):
            continue

        egs_rspecifier = \
            '--ranges-file="{egs_dir}/temp/ranges.{archive_index}" ' \
            '--scp-file="{egs_dir}/temp/feats.scp.{archive_index}" ' \
            '--shuffle=True --minibatch-size={minibatch_size}'.format(
                egs_dir=egs_dir, archive_index=archive_index,
                minibatch_size=minibatch_size)

        # Check whether the tar file exists; if it was generated, pass it to
        # the script to speed up example loading.
        tar_file = '{egs_dir}/egs.{archive_index}.tar'.format(
            egs_dir=egs_dir, archive_index=archive_index)
        if os.path.exists(tar_file):
            egs_rspecifier = '--tar-file="{0}" {1}'.format(tar_file,
                                                           egs_rspecifier)

        _command = '{command} {train_queue_opt} ' \
                   '{dir}/log/train.{iter}.{job}.log ' \
                   'local/tf/train_dnn_one_iteration.py ' \
                   '{parallel_train_opts} ' \
                   '{verbose_opt} --print-interval=10 ' \
                   '--momentum={momentum} ' \
                   '--max-param-change={max_param_change} ' \
                   '--l2-regularize-factor={l2_regularize_factor} ' \
                   '--random-seed={random_seed} {train_opts} ' \
                   '--learning-rate={learning_rate} ' \
                   '--scale={shrinkage_value} ' \
                   '--minibatch-count={minibatch_count} ' \
                   '--feature-dim={feature_dim} ' \
                   '--dropout-proportion={dropout_proportion} ' \
                   '{egs_rspecifier} ' \
                   '--input-dir={dir}/model_{iter} ' \
                   '--output-dir={dir}/model_{next_iter}.{job}' \
            .format(command=run_opts.command,
                    train_queue_opt=run_opts.train_queue_opt,
                    dir=model_dir, iter=_iter, next_iter=_iter + 1,
                    random_seed=_iter + random_seed, job=job,
                    parallel_train_opts=run_opts.parallel_train_opts,
                    verbose_opt=verbose_opt, momentum=momentum,
                    max_param_change=max_param_change,
                    l2_regularize_factor=1.0 / num_jobs,
                    train_opts=train_opts, learning_rate=learning_rate,
                    shrinkage_value=shrinkage_value,
                    minibatch_count=minibatch_count,
                    feature_dim=feature_dim,
                    dropout_proportion=dropout_proportion,
                    egs_rspecifier=egs_rspecifier)

        # Each job runs in the background; require_zero_status=False because
        # failures are detected and retried by train_one_iteration().
        thread = utils.background_command(_command, require_zero_status=False)
        threads.append(thread)

    for thread in threads:
        thread.join()
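
# Worked example (illustration only; the function below is hypothetical and
# not called anywhere): the archive-to-job mapping in train_new_models
# derives a zero-based global index k = num_archives_processed + job - 1 and
# assigns archives round-robin via the 1-based index (k % num_archives) + 1,
# so every archive is visited once per num_archives jobs.
def _example_archive_assignment(num_jobs=3, num_archives=5):
    """Print which archive each parallel job would read.

    With the defaults, num_archives_processed=0 gives jobs 1..3 archives
    1, 2, 3, and num_archives_processed=3 gives archives 4, 5, 1.
    """
    for num_archives_processed in range(0, 2 * num_archives, num_jobs):
        for job in range(1, num_jobs + 1):
            k = num_archives_processed + job - 1
            archive_index = (k % num_archives) + 1
            print(num_archives_processed, job, archive_index)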