Пример #1
0
def train_one_iteration(model_dir,
                        _iter,
                        random_seed,
                        egs_dir,
                        num_jobs,
                        num_archives_processed,
                        num_archives,
                        learning_rate,
                        minibatch_size,
                        momentum,
                        max_param_change,
                        run_opts,
                        feature_dim,
                        archives_minibatch_count,
                        shrinkage_value=1.0,
                        current_dropout=0.0):
    """Run one iteration of neural network training.

    The function records/checks the random seed, triggers evaluation of the
    current model, launches the per-job training (retrying failed jobs up to
    MAX_TRY_COUNT times), combines the per-job outputs into
    ``<model_dir>/model_<_iter + 1>`` and finally removes the per-job model
    directories.  If the output model directory already looks correct, the
    iteration is skipped (after the evaluation has been triggered).

    Args:
        model_dir: Experiment directory; holds the ``random_seed`` file, the
            ``log`` directory and the ``model_<iter>`` sub-directories.
        _iter: Index of the current iteration; the result is written to
            ``model_<_iter + 1>``.
        random_seed: Seed for this iteration.  It is written to
            ``<model_dir>/random_seed`` when that file does not exist yet and
            compared with the stored value otherwise (a mismatch only
            produces a warning).
        egs_dir: Examples directory, forwarded to eval_trained_dnn() and
            train_new_models().
        num_jobs: Number of parallel training jobs; job outputs are expected
            in ``model_<_iter + 1>.<job>`` for job = 1..num_jobs.
        num_archives_processed: Forwarded to train_new_models().
        num_archives: Forwarded to train_new_models().
        learning_rate: Forwarded to train_new_models().
        minibatch_size: Forwarded to train_new_models().
        momentum: Forwarded to train_new_models().
        max_param_change: Forwarded to train_new_models().
        run_opts: Run options object, forwarded to the evaluation, training
            and averaging helpers.
        feature_dim: Forwarded to train_new_models().
        archives_minibatch_count: Forwarded to train_new_models().
        shrinkage_value: If value is 1.0, no shrinkage is done; otherwise
            parameter values are scaled by this value.
        current_dropout: Forwarded to train_new_models() as
            ``dropout_proportion``.

    Raises:
        IOError, ValueError: If the stored random seed cannot be read/parsed.
        OSError: If a per-job model directory cannot be deleted.
        Exception: If some jobs still fail after MAX_TRY_COUNT attempts, or
            if the new ``model.meta`` file is missing or empty at the end.
    """

    # check if different iterations use the same random seed
    random_seed_file = '{0}/random_seed'.format(model_dir)
    if os.path.exists(random_seed_file):
        try:
            with open(random_seed_file, 'r') as fid:
                saved_random_seed = int(fid.readline().strip())
        except (IOError, ValueError):
            logger.error(
                "Exception while reading the random seed for training")
            raise
        if random_seed != saved_random_seed:
            logger.warning(
                "The random seed provided to this iteration (random_seed={0}) is "
                "different from the one saved last time (random_seed={1}). "
                "Using random_seed={0}.".format(random_seed,
                                                saved_random_seed))
    else:
        with open(random_seed_file, 'w') as fid:
            fid.write(str(random_seed))

    # Sets off some background jobs to compute train and validation set objectives
    eval_trained_dnn(model_dir, _iter, egs_dir, run_opts)

    new_model_dir = "{0}/model_{1}".format(model_dir, _iter + 1)
    new_model = "{0}/model.meta".format(new_model_dir)
    if utils.is_correct_model_dir(new_model_dir):
        logger.info(
            'The output model {0} was exist and so I do not continue this iteration.'
            .format(new_model))
        return

    # Averaging only makes sense once there is a previous iteration.
    do_average = (_iter > 0)

    # TODO: the original design used a halved minibatch size and
    # max_param_change / sqrt(2) on iteration zero to keep the first update
    # stable.  That special case is currently disabled, so the same values are
    # used on every iteration.
    cur_minibatch_size = minibatch_size
    cur_max_param_change = max_param_change

    try_count = 0
    training_flag = True  # True while at least one job still has to be (re)run
    while try_count < MAX_TRY_COUNT and training_flag:
        train_new_models(model_dir=model_dir,
                         _iter=_iter,
                         random_seed=random_seed,
                         num_jobs=num_jobs,
                         num_archives_processed=num_archives_processed,
                         num_archives=num_archives,
                         learning_rate=learning_rate,
                         shrinkage_value=shrinkage_value,
                         dropout_proportion=current_dropout,
                         egs_dir=egs_dir,
                         momentum=momentum,
                         max_param_change=cur_max_param_change,
                         minibatch_size=cur_minibatch_size,
                         feature_dim=feature_dim,
                         try_count=try_count,
                         archives_minibatch_count=archives_minibatch_count,
                         run_opts=run_opts)
        try_count += 1
        training_flag = False
        cnt = 0  # number of jobs without a correct output model dir
        for job in range(1, num_jobs + 1):
            if not utils.is_correct_model_dir("{0}/model_{1}.{2}".format(
                    model_dir, _iter + 1, job)):
                # move logs to prevent rewriting.
                log_file = "{0}/log/train.{1}.{2}.log".format(
                    model_dir, _iter, job)
                if os.path.exists(log_file):
                    os.rename(log_file, "{0}.{1}".format(log_file, try_count))
                training_flag = True
                cnt += 1
        if training_flag:
            if try_count < MAX_TRY_COUNT:
                # logger.warn is a deprecated alias; use logger.warning as in
                # the random-seed check above.
                logger.warning(
                    "{0}/{1} of jobs failed. Resubmitting them may solved the problem. "
                    "Start resubmitting them after 30 seconds ...".format(
                        cnt, num_jobs))
                # sleep for 30 seconds before resubmitting
                time.sleep(30)
            else:
                logger.error(
                    "{0}/{1} of jobs failed and maximum number of retrying is reached. "
                    "Stop the training ...".format(cnt, num_jobs))

    if training_flag:
        raise Exception("Some training jobs failed more than %d times. "
                        "Please check the log files." % MAX_TRY_COUNT)

    [models_to_average, best_model] = utils.get_successful_models(
        num_jobs, '{0}/log/train.{1}.%.log'.format(model_dir, _iter))

    if do_average and len(models_to_average) > 1:
        # average the output of the different jobs.
        # TODO
        nets_dirs = ["{0}/model_{1}.{2}".format(model_dir, _iter + 1, n)
                     for n in models_to_average]
        utils.get_average_nnet_model(dir=model_dir,
                                     iter=_iter,
                                     nnets_list=" ".join(nets_dirs),
                                     run_opts=run_opts)
    else:
        # choose the best model from different jobs
        utils.copy_best_nnet_dir(_dir=model_dir,
                                 _iter=_iter,
                                 best_model_index=best_model)

    # The per-job models are no longer needed once they have been combined.
    try:
        for i in range(1, num_jobs + 1):
            shutil.rmtree("{0}/model_{1}.{2}".format(model_dir, _iter + 1, i))
    except OSError:
        logger.error("Error while trying to delete the client models.")
        raise

    # Sanity-check the combined model before declaring the iteration done.
    if not os.path.isfile(new_model):
        raise Exception(
            "Could not find {0}, at the end of iteration {1}".format(
                new_model, _iter))
    if os.stat(new_model).st_size == 0:
        raise Exception(
            "{0} has size 0. Something went wrong in iteration {1}".format(
                new_model, _iter))
Пример #2
0
def train_new_models(model_dir,
                     _iter,
                     random_seed,
                     num_jobs,
                     num_archives_processed,
                     num_archives,
                     learning_rate,
                     shrinkage_value,
                     dropout_proportion,
                     egs_dir,
                     momentum,
                     max_param_change,
                     minibatch_size,
                     run_opts,
                     feature_dim,
                     archives_minibatch_count,
                     try_count=0,
                     train_opts=""):
    """Launch the 'num_jobs' training jobs of one iteration and wait for them.

    Called from train_one_iteration().  Every job runs
    local/tf/train_dnn_one_iteration.py on its own archive, reading
    <model_dir>/model_<_iter> and writing <model_dir>/model_<_iter + 1>.<job>
    (e.g. exp/tdnn_a/model_24.{1,2,3,..<num_jobs>}).

    Each job is spawned separately rather than as one parallel SGE job,
    because working out which archive and which --frame option belongs to
    each job is a little complex.

    When 'try_count' is greater than zero (a retry), jobs whose output
    directory already passes utils.is_correct_model_dir() are not re-run.
    """

    # Pieces of the command line; they are glued together with single spaces.
    command_template = " ".join([
        '{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log',
        'local/tf/train_dnn_one_iteration.py',
        '{parallel_train_opts}',
        '{verbose_opt} --print-interval=10',
        '--momentum={momentum}',
        '--max-param-change={max_param_change}',
        '--l2-regularize-factor={l2_regularize_factor}',
        '--random-seed={random_seed} {train_opts}',
        '--learning-rate={learning_rate}',
        '--scale={shrinkage_value}',
        '--minibatch-count={minibatch_count}',
        '--feature-dim={feature_dim}',
        '--dropout-proportion={dropout_proportion}',
        '{egs_rspecifier}',
        '--input-dir={dir}/model_{iter}',
        '--output-dir={dir}/model_{next_iter}.{job}',
    ])

    # GPU timing info is only printed with --verbose=1, which slows things
    # down slightly, so request it only every 20th iteration.  Iteration 0 is
    # excluded as well, so nobody attributes a slow first iteration to the
    # verbose option.
    if _iter % 20 == 0 and _iter > 0:
        verbose_opt = "--verbose=1"
    else:
        verbose_opt = ""

    running_jobs = []
    for job in range(1, num_jobs + 1):
        # Zero-based position in the overall archive sequence, turned into
        # a 1-based archive index that wraps around after 'num_archives'.
        zero_based = num_archives_processed + job - 1
        archive_index = (zero_based % num_archives) + 1
        minibatch_count = archives_minibatch_count[archive_index]

        job_output_dir = '{0}/model_{1}.{2}'.format(model_dir, _iter + 1, job)
        if try_count > 0 and utils.is_correct_model_dir(job_output_dir):
            # This job already succeeded in an earlier attempt.
            continue

        egs_rspecifier = (
            '--ranges-file="{0}/temp/ranges.{1}" '
            '--scp-file="{0}/temp/feats.scp.{1}" '
            '--shuffle=True --minibatch-size={2}'
        ).format(egs_dir, archive_index, minibatch_size)

        # If a tar file was generated for this archive, hand it to the
        # training script as well, for speedup.
        tar_file = '{0}/egs.{1}.tar'.format(egs_dir, archive_index)
        if os.path.exists(tar_file):
            egs_rspecifier = '--tar-file="{0}" {1}'.format(
                tar_file, egs_rspecifier)

        job_command = command_template.format(
            command=run_opts.command,
            train_queue_opt=run_opts.train_queue_opt,
            dir=model_dir,
            iter=_iter,
            next_iter=_iter + 1,
            job=job,
            parallel_train_opts=run_opts.parallel_train_opts,
            verbose_opt=verbose_opt,
            momentum=momentum,
            max_param_change=max_param_change,
            l2_regularize_factor=1.0 / num_jobs,
            random_seed=_iter + random_seed,
            train_opts=train_opts,
            learning_rate=learning_rate,
            shrinkage_value=shrinkage_value,
            minibatch_count=minibatch_count,
            feature_dim=feature_dim,
            dropout_proportion=dropout_proportion,
            egs_rspecifier=egs_rspecifier)

        running_jobs.append(
            utils.background_command(job_command, require_zero_status=False))

    # Block until every spawned job has finished.
    for handle in running_jobs:
        handle.join()