Exemplo n.º 1
0
def run_finetuning(config: configure_finetuning.FinetuningConfig):
  """Fine-tune and evaluate one model per trial under Horovod.

  Initializes Horovod, gives every non-zero rank its own sub-directory of
  `config.model_dir`, and divides `config.train_batch_size` by the number of
  Horovod processes. For each trial it then optionally trains, evaluates on
  the dev set, and writes test-set predictions.

  Args:
    config: fine-tuning configuration. `model_dir` and `train_batch_size` are
      modified in place. A negative `num_trials` keeps the trial loop running
      indefinitely.
  """
  hvd.init()

  # Rank 0 keeps the configured directory; every other rank gets a private
  # sub-directory named after its rank.
  if hvd.rank() != 0:
    config.model_dir = os.path.join(config.model_dir, str(hvd.rank()))
  config.train_batch_size = config.train_batch_size // hvd.size()

  # Setup for training
  results = []
  trial = 1

  def heading(msg):
    # Format at call time so the banner names the trial that is actually
    # running; a string built once before the loop always says "trial 1/N".
    utils.heading("{:}: model={:}, trial {:}/{:}".format(
        msg, config.model_name, trial, config.num_trials))

  heading("Config")
  utils.log_config(config)
  generic_model_dir = config.model_dir
  tasks = task_builder.get_tasks(config)

  # Tasks whose test predictions go through write_classification_outputs.
  classification_tasks = ("cola", "mrpc", "mnli", "sst", "rte", "qnli", "qqp",
                          "sts")

  # Train and evaluate num_trials models with different random seeds
  while config.num_trials < 0 or trial <= config.num_trials:
    config.model_dir = generic_model_dir + "_" + str(trial)
    if config.do_train:
      utils.rmkdir(config.model_dir)

    model_runner = ModelRunner(config, tasks, hvd)
    if config.do_train:
      heading("Start training")
      model_runner.train()
      utils.log()

    if config.do_eval:
      heading("Run dev set evaluation")
      results.append(model_runner.evaluate())
      write_results(config, results)
      if config.write_test_outputs and trial <= config.n_writes_test:
        heading("Running on the test set and writing the predictions")
        for task in tasks:
          # Currently only writing preds for GLUE and SQuAD 2.0 is supported
          if task.name in classification_tasks:
            for split in task.get_test_splits():
              model_runner.write_classification_outputs([task], trial, split)
          elif task.name == "squad":
            scorer = model_runner.evaluate_task(task, "test", False)
            scorer.write_predictions()
            preds = utils.load_json(config.qa_preds_file("squad"))
            null_odds = utils.load_json(config.qa_na_file("squad"))
            # Blank out every answer whose null odds exceed the threshold.
            for q in preds:
              if null_odds[q] > config.qa_na_threshold:
                preds[q] = ""
            utils.write_json(preds, config.test_predictions(
                task.name, "test", trial))
          else:
            utils.log("Skipping task", task.name,
                      "- writing predictions is not supported for this task")

    # Every trial's directory is removed except the one numbered num_trials,
    # unless keep_all_models is set.
    if trial != config.num_trials and (not config.keep_all_models):
      utils.rmrf(config.model_dir)
    trial += 1
Exemplo n.º 2
0
 def objective(params):
     """Train and evaluate once with the given hyper-parameters.

     Reads `num_epochs`, `lr` and `batch_size` from `params`, copies them onto
     the enclosing scope's `config`, trains a fresh ModelRunner in a model
     directory named after those values, and evaluates on the dev set.

     Returns:
         A dict with `loss` set to the negated `f1` of the first entry of the
         evaluation results, and `status` set to STATUS_OK.
         NOTE(review): the return shape looks like a hyperopt objective;
         confirm against the caller.
     """
     # Read every hyper-parameter before touching config, so a missing key
     # fails before anything is modified.
     epochs = params['num_epochs']
     learning_rate = params['lr']
     batch = params['batch_size']

     config.num_train_epochs = epochs
     config.learning_rate = learning_rate
     config.train_batch_size = batch

     run_tag = "{}_{:.6}_{}".format(epochs, learning_rate, batch)
     config.model_dir = generic_model_dir + "_opt_" + run_tag
     utils.rmkdir(config.model_dir)

     runner = ModelRunner(config, tasks)
     utils.heading("Start training " + run_tag)
     runner.train()
     utils.log()

     utils.heading("Run dev set evaluation " + run_tag)
     dev_scores = runner.evaluate()
     first_task_scores = list(dev_scores.values())[0]
     return {'loss': -first_task_scores['f1'], 'status': STATUS_OK}
Exemplo n.º 3
0
def predict(config: configure_finetuning.FinetuningConfig):
    """Write test-set classification outputs for each trial.

    Runs `config.num_trials` trials, or a single trial when that value is not
    positive. For each trial it points `config.model_dir` at
    "<model_dir>_<trial>", builds a ModelRunner, and writes classification
    outputs for every test split of each supported task. Tasks whose name is
    not in the supported list are skipped without a message.
    """
    if config.num_trials > 0:
        no_trials = config.num_trials
    else:
        no_trials = 1
    print(no_trials)

    base_model_dir = config.model_dir
    writable_tasks = [
        "cola", "mrpc", "mnli", "sst", "rte", "qnli", "qqp", "sts",
        "yesno", "reranker", "weighted-reranker", "gad", "chemprot"
    ]

    for trial in range(1, no_trials + 1):
        utils.log_config(config)
        tasks = task_builder.get_tasks(config)
        config.model_dir = base_model_dir + "_" + str(trial)
        print("config.model_dir:{}".format(config.model_dir))
        runner = ModelRunner(config, tasks)
        utils.heading("Running on the test set and writing the predictions")
        for task in tasks:
            if task.name not in writable_tasks:
                continue
            for split in task.get_test_splits():
                runner.write_classification_outputs([task], trial, split)
Exemplo n.º 4
0
def run_finetuning(config: configure_finetuning.FinetuningConfig):
  """Fine-tune and evaluate one model per trial on the GPU in `config.gpu`.

  Sets CUDA_DEVICE_ORDER and CUDA_VISIBLE_DEVICES from `config.gpu`, then for
  each trial optionally trains and runs the dev-set evaluation. This variant
  does not record the evaluation results and writes no test-set predictions.

  Args:
    config: fine-tuning configuration. `model_dir` is modified in place. A
      negative `num_trials` keeps the trial loop running indefinitely.
  """
  os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
  os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu)

  # Setup for training
  trial = 1

  def heading(msg):
    # Format at call time so the banner names the trial that is actually
    # running; a string built once before the loop always says "trial 1/N".
    utils.heading("{:}: model={:}, trial {:}/{:}".format(
        msg, config.model_name, trial, config.num_trials))

  heading("Config")
  utils.log_config(config)
  generic_model_dir = config.model_dir
  tasks = task_builder.get_tasks(config)

  # Train and evaluate num_trials models with different random seeds
  while config.num_trials < 0 or trial <= config.num_trials:
    config.model_dir = generic_model_dir + "_" + str(trial)
    if config.do_train:
      utils.rmkdir(config.model_dir)

    model_runner = ModelRunner(config, tasks)
    if config.do_train:
      heading("Start training")
      model_runner.train()
      utils.log()

    if config.do_eval:
      heading("Run dev set evaluation")
      # The return value is deliberately unused in this variant.
      model_runner.evaluate()

    # Every trial's directory is removed except the one numbered num_trials,
    # unless keep_all_models is set.
    if trial != config.num_trials and (not config.keep_all_models):
      utils.rmrf(config.model_dir)
    trial += 1
Exemplo n.º 5
0
def train_or_eval(config: configure_pretraining.PretrainingConfig):
    """Run pre-training or evaluate the pre-trained model under Horovod.

    Initializes Horovod, gives every non-zero rank its own sub-directory of
    `config.model_dir`, divides the train and eval batch sizes by the number
    of Horovod processes, and drives a TPUEstimator.

    Args:
        config: pre-training configuration. `model_dir`, `train_batch_size`
            and `eval_batch_size` are modified in place.

    Returns:
        The evaluation result mapping when `config.do_eval` is set, otherwise
        None.

    Raises:
        ValueError: if `do_train` and `do_eval` are equal (both or neither
            set).
    """
    # initialize horovod
    hvd.init()
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    # Reset the model directory only for training runs: an evaluation run
    # must keep the checkpoints it is about to read (assumes utils.rmkdir
    # empties the directory - TODO confirm).
    if config.debug and config.do_train:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    # Rank 0 keeps the configured directory; every other rank gets a private
    # sub-directory named after its rank.
    if hvd.rank() != 0:
        config.model_dir = os.path.join(config.model_dir, str(hvd.rank()))
    config.train_batch_size = config.train_batch_size // hvd.size()
    config.eval_batch_size = config.eval_batch_size // hvd.size()

    is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
    tpu_cluster_resolver = None
    if config.use_tpu and config.tpu_name:
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            config.tpu_name, zone=config.tpu_zone, project=config.gcp_project)
    tpu_config = tf.estimator.tpu.TPUConfig(
        iterations_per_loop=config.iterations_per_loop,
        num_shards=config.num_tpu_cores,
        tpu_job_name=config.tpu_job_name,
        per_host_input_for_training=is_per_host)

    # Restrict each process to the GPU matching its local Horovod rank.
    session_config = tf.ConfigProto()
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    run_config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=config.model_dir,
        save_checkpoints_steps=config.save_checkpoints_steps,
        session_config=session_config,
        tpu_config=tpu_config)
    model_fn = model_fn_builder(config=config)
    estimator = tf.estimator.tpu.TPUEstimator(
        use_tpu=config.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=config.train_batch_size,
        eval_batch_size=config.eval_batch_size)

    if config.do_train:
        utils.heading("Running training")
        hooks = [hvd.BroadcastGlobalVariablesHook(0)]
        estimator.train(input_fn=pretrain_data.get_input_fn(config, True, hvd),
                        max_steps=config.num_train_steps,
                        hooks=hooks)
    if config.do_eval:
        utils.heading("Running evaluation")
        result = estimator.evaluate(
            input_fn=pretrain_data.get_input_fn(config, False, hvd),
            steps=config.num_eval_steps)
        for key in sorted(result.keys()):
            utils.log("  {:} = {:}".format(key, str(result[key])))
        return result
Exemplo n.º 6
0
def train_or_eval(config: configure_pretraining.PretrainingConfig):
    """Pre-train the model or evaluate it, on TPU or via a plain Estimator.

    With `config.use_tpu` set, a TPUEstimator is built. Otherwise a regular
    Estimator is built whose model function is wrapped in
    `replicate_model_fn` and which receives the train batch size through
    `params`.

    Returns:
        The evaluation result mapping when `config.do_eval` is set, otherwise
        None.

    Raises:
        ValueError: if `do_train` and `do_eval` are equal.
    """
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug and config.do_train:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    if not config.use_tpu:
        # Non-TPU path: plain Estimator around the replicated model function.
        run_config = tf.estimator.RunConfig(
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            keep_checkpoint_max=config.keep_checkpoint_max)
        model_fn = model_fn_builder(config=config)
        replicated_fn = tensorflow.contrib.estimator.replicate_model_fn(
            model_fn)
        estimator = tf.estimator.Estimator(
            model_fn=replicated_fn,
            config=run_config,
            params={"batch_size": config.train_batch_size})
    else:
        per_host_mode = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
        # A cluster resolver is only created when a TPU name is configured.
        resolver = None
        if config.tpu_name:
            resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
                config.tpu_name,
                zone=config.tpu_zone,
                project=config.gcp_project)
        tpu_settings = tf.estimator.tpu.TPUConfig(
            iterations_per_loop=config.iterations_per_loop,
            num_shards=config.num_tpu_cores,
            tpu_job_name=config.tpu_job_name,
            per_host_input_for_training=per_host_mode)
        run_config = tf.estimator.tpu.RunConfig(
            cluster=resolver,
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            keep_checkpoint_max=config.keep_checkpoint_max,
            tpu_config=tpu_settings)
        model_fn = model_fn_builder(config=config)
        estimator = tf.estimator.tpu.TPUEstimator(
            use_tpu=config.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=config.train_batch_size,
            eval_batch_size=config.eval_batch_size)

    if config.do_train:
        utils.heading("Running training")
        train_input_fn = pretrain_data.get_input_fn(config, True)
        estimator.train(input_fn=train_input_fn,
                        max_steps=config.num_train_steps)
    if config.do_eval:
        utils.heading("Running evaluation")
        eval_input_fn = pretrain_data.get_input_fn(config, False)
        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=config.num_eval_steps)
        # Log the metrics in key order.
        for metric in sorted(result):
            utils.log("  {:} = {:}".format(metric, str(result[metric])))
        return result
Exemplo n.º 7
0
def train_or_eval(config: configure_pretraining.PretrainingConfig):
    """Pre-train the model or evaluate it through a TPUEstimator.

    A TPUEstimator is always used; `config.use_tpu` is forwarded to it, and a
    cluster resolver is only created when both `use_tpu` and `tpu_name` are
    set.

    Returns:
        The evaluation result mapping when `config.do_eval` is set, otherwise
        None.

    Raises:
        ValueError: if `do_train` and `do_eval` are equal.
    """
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug and config.do_train:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    # NOTE: warm-starting from config.init_checkpoint is not wired up in this
    # variant, and tpu_job_name is not forwarded to TPUConfig.

    per_host_mode = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
    resolver = None
    if config.use_tpu and config.tpu_name:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            config.tpu_name, zone=config.tpu_zone, project=config.gcp_project)

    tpu_settings = tf.estimator.tpu.TPUConfig(
        iterations_per_loop=config.iterations_per_loop,
        num_shards=config.num_tpu_cores,
        per_host_input_for_training=per_host_mode)
    run_config = tf.estimator.tpu.RunConfig(
        cluster=resolver,
        model_dir=config.model_dir,
        save_checkpoints_steps=config.save_checkpoints_steps,
        tpu_config=tpu_settings)
    estimator = tf.estimator.tpu.TPUEstimator(
        use_tpu=config.use_tpu,
        model_fn=model_fn_builder(config=config),
        config=run_config,
        train_batch_size=config.train_batch_size,
        eval_batch_size=config.eval_batch_size)

    if config.do_train:
        utils.heading("Running training")
        train_input_fn = pretrain_data.get_input_fn(config, True)
        estimator.train(input_fn=train_input_fn,
                        max_steps=config.num_train_steps)
    if config.do_eval:
        utils.heading("Running evaluation")
        eval_input_fn = pretrain_data.get_input_fn(config, False)
        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=config.num_eval_steps)
        # Log the metrics in key order.
        for metric in sorted(result):
            utils.log("  {:} = {:}".format(metric, str(result[metric])))
        return result
Exemplo n.º 8
0
def train_or_eval(config: configure_pretraining.PretrainingConfig):
  """Pre-train the model or evaluate it, connecting to the TPU explicitly.

  When `config.use_tpu` is set, a TPUClusterResolver is created from the zone
  and project only (no TPU name is passed), its worker list is printed, and
  the TPU system is connected and initialized before the estimator is built.

  Returns:
    The evaluation result mapping when `config.do_eval` is set, otherwise
    None.

  Raises:
    ValueError: if `do_train` and `do_eval` are equal.
  """
  if config.do_train == config.do_eval:
    raise ValueError("Exactly one of `do_train` or `do_eval` must be True.")
  if config.debug and config.do_train:
    utils.rmkdir(config.model_dir)
  utils.heading("Config:")
  utils.log_config(config)

  per_host_mode = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
  resolver = None
  if config.use_tpu:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        zone=config.tpu_zone, project=config.gcp_project)
    workers = resolver.cluster_spec().as_dict()['worker']
    print('Running on TPU ', workers)

    if resolver:
      tf.config.experimental_connect_to_cluster(resolver)
      tf.tpu.experimental.initialize_tpu_system(resolver)

  tpu_settings = tf.estimator.tpu.TPUConfig(
      iterations_per_loop=config.iterations_per_loop,
      num_shards=config.num_tpu_cores,
      tpu_job_name=config.tpu_job_name,
      per_host_input_for_training=per_host_mode)
  run_config = tf.estimator.tpu.RunConfig(
      cluster=resolver,
      model_dir=config.model_dir,
      save_checkpoints_steps=config.save_checkpoints_steps,
      keep_checkpoint_max=config.keep_checkpoint_max,
      tpu_config=tpu_settings)
  estimator = tf.estimator.tpu.TPUEstimator(
      use_tpu=config.use_tpu,
      model_fn=model_fn_builder(config=config),
      config=run_config,
      train_batch_size=config.train_batch_size,
      eval_batch_size=config.eval_batch_size)

  if config.do_train:
    utils.heading("Running training")
    train_input_fn = pretrain_data.get_input_fn(config, True)
    estimator.train(input_fn=train_input_fn,
                    max_steps=config.num_train_steps)
  if config.do_eval:
    utils.heading("Running evaluation")
    eval_input_fn = pretrain_data.get_input_fn(config, False)
    result = estimator.evaluate(input_fn=eval_input_fn,
                                steps=config.num_eval_steps)
    # Log the metrics in key order.
    for metric in sorted(result):
      utils.log("  {:} = {:}".format(metric, str(result[metric])))
    return result
 # One-line form of the `heading` helper: appends ": " and `heading_info`
 # (defined outside this fragment) to `msg` and passes it to utils.heading.
 def heading(msg): return utils.heading(msg + ": " + heading_info)
 heading("Config")
 # Same helper with the body on its own line; this definition rebinds
 # `heading` and replaces the one-line version above.
 def heading(msg):
     return utils.heading(msg + ": " + heading_info)
Exemplo n.º 11
0
def run_finetuning(config: configure_finetuning.FinetuningConfig):
    """Fine-tune, evaluate, test and/or dump predictions, one model per trial.

    For each trial this optionally trains, evaluates on the dev set (adding
    test scores when `config.do_test` is set), writes test-set predictions,
    and, when `config.do_predict` is set, pickles raw predictions for the
    splits named in `config.predict_split`.

    Args:
        config: fine-tuning configuration. `model_dir` is modified in place.
            A negative `num_trials` keeps the trial loop running indefinitely.
    """

    # Setup for training
    results = []
    trial = 1

    def heading(msg):
        # Format at call time so the banner names the trial that is actually
        # running; a string built once before the loop always says
        # "trial 1/N".
        utils.heading("{:}: model={:}, trial {:}/{:}".format(
            msg, config.model_name, trial, config.num_trials))

    heading("Config")
    utils.log_config(config)
    generic_model_dir = config.model_dir
    tasks = task_builder.get_tasks(config)

    # Tasks whose test predictions go through write_classification_outputs.
    classification_tasks = ("cola", "mrpc", "mnli", "sst", "rte", "qnli",
                            "qqp", "sts")

    # Train and evaluate num_trials models with different random seeds
    while config.num_trials < 0 or trial <= config.num_trials:
        config.model_dir = generic_model_dir + "_" + str(trial)
        if config.do_train:
            utils.rmkdir(config.model_dir)

        model_runner = ModelRunner(config, tasks)
        if config.do_train:
            heading("Start training")
            model_runner.train()
            utils.log()

        if config.do_eval:
            heading("Run dev set evaluation")
            results.append(model_runner.evaluate())
            if config.do_test:
                # Score each task on the test set, using the checkpoint path
                # recorded in that task's dev results.
                for task in tasks:
                    test_score = model_runner.evaluate_task_test(
                        task, results[-1][task.name]['checkpoint_path'])
                    results[-1][task.name]["test_results"] = test_score
            write_results(config, results)
            if config.write_test_outputs and trial <= config.n_writes_test:
                heading("Running on the test set and writing the predictions")
                for task in tasks:
                    # Currently only writing preds for GLUE and SQuAD 2.0 is
                    # supported
                    if task.name in classification_tasks:
                        for split in task.get_test_splits():
                            model_runner.write_classification_outputs(
                                [task], trial, split)
                    elif task.name == "squad":
                        scorer = model_runner.evaluate_task(
                            task, "test", False)
                        scorer.write_predictions()
                        preds = utils.load_json(config.qa_preds_file("squad"))
                        null_odds = utils.load_json(config.qa_na_file("squad"))
                        # Blank out every answer whose null odds exceed the
                        # threshold.
                        for q in preds:
                            if null_odds[q] > config.qa_na_threshold:
                                preds[q] = ""
                        utils.write_json(
                            preds,
                            config.test_predictions(task.name, "test", trial))
                    else:
                        utils.log(
                            "Skipping task", task.name,
                            "- writing predictions is not supported for this task"
                        )

        if config.do_predict:
            import pickle

            # Predictions get their own name: rebinding `results` here would
            # replace the list of per-trial dev results and break
            # `results.append(...)` / `write_results(...)` on the next trial.
            for split in ("dev", "train", "test"):
                if split not in config.predict_split:
                    continue
                predictions = model_runner.predict(
                    tasks[0], config.predict_checkpoint_path, split)
                # The file name does not include the trial number, so a later
                # trial overwrites an earlier trial's dump.
                with open("predict_{}.pickle".format(split), "wb") as outfile:
                    pickle.dump(predictions, outfile)

        # Every trial's directory is removed except the one numbered
        # num_trials, unless keep_all_models is set.
        if trial != config.num_trials and (not config.keep_all_models):
            utils.rmrf(config.model_dir)
        trial += 1
Exemplo n.º 12
0
def train_or_eval(config: configure_pretraining.PretrainingConfig):
    """Run pre-training or evaluation on one GPU or mirrored across GPUs.

    With exactly one GPU reported by `utils.get_available_gpus()`, a plain
    RunConfig is used. Otherwise training and evaluation are distributed with
    MirroredStrategy, using NCCL all-reduce for training.

    Returns:
        The evaluation result mapping when `config.do_eval` is set, otherwise
        None.

    Raises:
        ValueError: if `do_train` and `do_eval` are equal.
    """
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    # Reset the model directory only for training runs: an evaluation run
    # must keep the checkpoints it is about to read (assumes utils.rmkdir
    # empties the directory - TODO confirm).
    if config.debug and config.do_train:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    gpus = utils.get_available_gpus()
    gpu_count = len(gpus)
    utils.log("Found {} gpus".format(gpu_count))

    # Compare the COUNT with 1. `gpus` is a sized collection, so comparing it
    # directly with an int is never true and the single-GPU branch would
    # never run.
    if gpu_count == 1:
        session_config = tf.ConfigProto(
            log_device_placement=True,
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True))

        run_config = tf.estimator.RunConfig(
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            session_config=session_config,
            log_step_count_steps=100)
    else:
        train_distribution_strategy = tf.distribute.MirroredStrategy(
            devices=None,
            cross_device_ops=tensorflow.contrib.distribute.
            AllReduceCrossDeviceOps('nccl', num_packs=gpu_count))
        eval_distribution_strategy = tf.distribute.MirroredStrategy(
            devices=None)

        session_config = tf.ConfigProto(
            inter_op_parallelism_threads=0,
            intra_op_parallelism_threads=0,
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True))

        run_config = tf.estimator.RunConfig(
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            train_distribute=train_distribution_strategy,
            eval_distribute=eval_distribution_strategy,
            session_config=session_config,
            log_step_count_steps=100)

    model_fn = model_fn_builder(config=config)
    # Batch sizes reach the model function through `params`.
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params={
            'train_batch_size': config.train_batch_size,
            'eval_batch_size': config.eval_batch_size,
        })

    if config.do_train:
        utils.heading("Running training")
        estimator.train(input_fn=pretrain_data.get_input_fn(config, True),
                        max_steps=config.num_train_steps)
    if config.do_eval:
        utils.heading("Running evaluation")
        result = estimator.evaluate(
            input_fn=pretrain_data.get_input_fn(config, False),
            steps=config.num_eval_steps)
        for key in sorted(result.keys()):
            utils.log("  {:} = {:}".format(key, str(result[key])))
        return result
Exemplo n.º 13
0
def train_or_eval(config: PretrainingConfig):
    """Pre-train the model or evaluate it with a plain Estimator per process.

    Each process is restricted to the GPU matching its local Horovod rank.
    Only rank 0 gets checkpoint and summary intervals; the other ranks get
    `None` / `0` for them. `hvd.init()` is not called here.

    Returns:
        The evaluation result mapping when `config.do_eval` is set, otherwise
        None.

    Raises:
        ValueError: if `do_train` and `do_eval` are equal.
    """
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug and config.do_train:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    # Session config: grow GPU memory on demand, one GPU per process.
    gpu_session = tf.ConfigProto()
    gpu_session.gpu_options.allow_growth = True
    gpu_session.gpu_options.visible_device_list = str(hvd.local_rank())

    # Run config.
    # Following the example at
    # https://gist.github.com/alsrgv/34a32f30292f4e2c1fa29ec0d65dea26
    # model_dir would be `config.model_dir if hvd.rank() == 0 else None`.
    # UPD: if model_dir is None, Estimator by default saves checkpoints to
    # /tmp, which eats up the system disk, so every rank gets model_dir.
    is_first_rank = hvd.rank() == 0
    if is_first_rank:
        checkpoint_every = config.save_checkpoints_steps
        summary_every = 100
    else:
        checkpoint_every = None
        summary_every = 0

    run_config = tf.estimator.RunConfig(
        model_dir=config.model_dir,
        session_config=gpu_session,
        save_checkpoints_steps=checkpoint_every,
        save_summary_steps=summary_every,
        keep_checkpoint_max=config.keep_checkpoint_max,
        log_step_count_steps=10000)

    model_fn = model_fn_builder(config=config)

    # Broadcast hook is only added when more than one process takes part.
    training_hooks = []
    if hvd.size() > 1:
        training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    def build_input_fn(batch_size, is_training):
        # Train and eval pipelines differ only in batch size and mode.
        return pretrain_data.get_input_fn(
            pretrain_tfrecords=config.pretrain_tfrecords,
            max_seq_length=config.max_seq_length,
            batch_size=batch_size,
            is_training=is_training,
            hvd=hvd,
            num_cpu_threads=8)

    if config.do_train:
        utils.heading("Running training")
        estimator.train(input_fn=build_input_fn(config.train_batch_size, True),
                        hooks=training_hooks,
                        max_steps=config.num_train_steps)
    if config.do_eval:
        utils.heading("Running evaluation")
        result = estimator.evaluate(
            input_fn=build_input_fn(config.eval_batch_size, False),
            steps=config.num_eval_steps)
        # Log the metrics in key order.
        for metric in sorted(result):
            utils.log("  {:} = {:}".format(metric, str(result[metric])))
        return result
Exemplo n.º 14
0
def run_finetuning(config: configure_finetuning.FinetuningConfig):
  """Fine-tune and evaluate one model per trial, then optionally export.

  For each trial this optionally trains, evaluates on the dev set and writes
  test-set predictions. After the trial loop, when `config.export_dir` is
  set, a fresh ModelRunner is built and its estimator is exported as a
  SavedModel under "<export_dir>/saved_model" using the SQuAD serving input
  function.

  Args:
    config: fine-tuning configuration. `model_dir` is modified in place. A
      negative `num_trials` keeps the trial loop running indefinitely, in
      which case the export step is never reached.
  """
  tf.get_variable_scope().reuse_variables()

  # Setup for training
  results = []
  trial = 1

  def heading(msg):
    # Format at call time so the banner names the trial that is actually
    # running; a string built once before the loop always says "trial 1/N".
    utils.heading("{:}: model={:}, trial {:}/{:}".format(
        msg, config.model_name, trial, config.num_trials))

  heading("Config")
  utils.log_config(config)
  generic_model_dir = config.model_dir
  tasks = task_builder.get_tasks(config)

  # Tasks whose test predictions go through write_classification_outputs.
  classification_tasks = ("cola", "mrpc", "mnli", "sst", "rte", "qnli", "qqp",
                          "sts")

  # Train and evaluate num_trials models with different random seeds
  while config.num_trials < 0 or trial <= config.num_trials:
    config.model_dir = generic_model_dir + "_" + str(trial)
    if config.do_train:
      utils.rmkdir(config.model_dir)

    model_runner = ModelRunner(config, tasks)
    if config.do_train:
      heading("Start training")
      model_runner.train()
      utils.log()

    if config.do_eval:
      heading("Run dev set evaluation")
      results.append(model_runner.evaluate())
      write_results(config, results)
      if config.write_test_outputs and trial <= config.n_writes_test:
        heading("Running on the test set and writing the predictions")
        for task in tasks:
          # Currently only writing preds for GLUE and SQuAD 2.0 is supported
          if task.name in classification_tasks:
            for split in task.get_test_splits():
              model_runner.write_classification_outputs([task], trial, split)
          elif task.name == "squad":
            scorer = model_runner.evaluate_task(task, "test", False)
            scorer.write_predictions()
            preds = utils.load_json(config.qa_preds_file("squad"))
            null_odds = utils.load_json(config.qa_na_file("squad"))
            # Blank out every answer whose null odds exceed the threshold.
            for q in preds:
              if null_odds[q] > config.qa_na_threshold:
                preds[q] = ""
            utils.write_json(preds, config.test_predictions(
                task.name, "test", trial))
          else:
            utils.log("Skipping task", task.name,
                      "- writing predictions is not supported for this task")

    # Every trial's directory is removed except the one numbered num_trials,
    # unless keep_all_models is set.
    if trial != config.num_trials and (not config.keep_all_models):
      utils.rmrf(config.model_dir)
    trial += 1

  # exporting the model
  if config.export_dir:
    tf.get_variable_scope().reuse_variables()
    # config.model_dir still points at the last trial's directory here.
    model_runner = ModelRunner(config, tasks)
    tf.gfile.MakeDirs(config.export_dir)
    squad_serving_input_fn = (
        build_squad_serving_input_fn(config.max_seq_length))
    utils.log("Starting to export model.")
    # No checkpoint_path is passed, so the estimator chooses the checkpoint
    # itself (per the TF docs, the most recent one in its model directory).
    model_runner._estimator.export_saved_model(
        export_dir_base=os.path.join(config.export_dir, "saved_model"),
        serving_input_receiver_fn=squad_serving_input_fn)
Exemplo n.º 15
0
def train_or_eval(config: configure_pretraining.PretrainingConfig):
    """Run pre-training or evaluation on TPU or with a plain Estimator.

    With `config.use_tpu` set, a TPUEstimator is built. Otherwise a regular
    Estimator is built with on-demand GPU memory growth, warm-started from
    `config.saved_model` when that value is set.

    Returns:
        The evaluation result mapping when `config.do_eval` is set, otherwise
        None.

    Raises:
        ValueError: if `do_train` and `do_eval` are equal.
    """
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    # Reset the model directory only for training runs: an evaluation run
    # must keep the checkpoints it is about to read (assumes utils.rmkdir
    # empties the directory - TODO confirm).
    if config.debug and config.do_train:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    if config.use_tpu:
        is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
        # A cluster resolver is only created when a TPU name is configured.
        tpu_cluster_resolver = None
        if config.tpu_name:
            tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
                config.tpu_name,
                zone=config.tpu_zone,
                project=config.gcp_project)

        tpu_config = tf.estimator.tpu.TPUConfig(
            iterations_per_loop=config.iterations_per_loop,
            num_shards=config.num_tpu_cores,
            tpu_job_name=config.tpu_job_name,
            per_host_input_for_training=is_per_host,
        )
        run_config = tf.estimator.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            tpu_config=tpu_config,
        )
        model_fn = model_fn_builder(config=config)
        estimator = tf.estimator.tpu.TPUEstimator(
            use_tpu=config.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=config.train_batch_size,
            eval_batch_size=config.eval_batch_size,
        )
    else:
        config_proto = tf.ConfigProto()
        config_proto.gpu_options.allow_growth = True

        run_config = tf.estimator.RunConfig(
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            session_config=config_proto,
        )
        model_fn = model_fn_builder(config=config)

        # Only pass warm_start_from when a saved model is configured.
        estimator_kwargs = {"model_fn": model_fn, "config": run_config}
        if config.saved_model:
            estimator_kwargs["warm_start_from"] = config.saved_model
        estimator = tf.estimator.Estimator(**estimator_kwargs)

    if config.do_train:
        utils.heading("Running training")
        estimator.train(
            input_fn=pretrain_data.get_input_fn(config, True),
            max_steps=config.num_train_steps,
        )
    if config.do_eval:
        utils.heading("Running evaluation")
        result = estimator.evaluate(
            input_fn=pretrain_data.get_input_fn(config, False),
            steps=config.num_eval_steps,
        )
        for key in sorted(result.keys()):
            utils.log("  {:} = {:}".format(key, str(result[key])))
        return result