Example #1
    def test_mutual_exclusivity(self):
        with self.assertRaises(ValueError):
            schedule.Manager(train_steps=100,
                             steps_between_evals=100,
                             train_epochs=2,
                             epochs_between_evals=1,
                             default_train_epochs=None,
                             batch_size=2048,
                             max_length=256)
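The test above expects `schedule.Manager` to reject a configuration that mixes a step-based schedule (`train_steps`) with an epoch-based one (`train_epochs`). A minimal sketch of the kind of validation that would raise this `ValueError` (the helper below is hypothetical and only illustrates the check, it is not the actual implementation):

# Hypothetical sketch of the mutual-exclusivity check; not the real
# schedule.Manager implementation.
def check_schedule_args(train_steps, train_epochs):
    if train_steps is not None and train_epochs is not None:
        raise ValueError(
            "Specify either train_steps or train_epochs, not both.")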
Example #2
    def test_step_basis_tpu(self):
        manager = schedule.Manager(train_steps=1000,
                                   steps_between_evals=100,
                                   train_epochs=None,
                                   epochs_between_evals=None,
                                   default_train_epochs=None,
                                   batch_size=2048,
                                   max_length=256,
                                   use_tpu=True)

        self.assertEqual(manager.single_iteration_train_steps, 100)
        # num_eval_examples / (batch_size / max_length) == 3000 / (2048 / 256)
        self.assertEqual(manager.single_iteration_eval_steps, 375)
        self.assertIsNone(manager.repeat_dataset)
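The expected value of 375 follows directly from the comment in the test: with `batch_size=2048` tokens and `max_length=256`, each eval step covers 2048 / 256 = 8 sequences, so 3000 evaluation examples take 3000 / 8 = 375 steps. A quick arithmetic check (the example count of 3000 is taken from the test's own comment):

# Worked check of the eval-step arithmetic asserted above.
batch_size = 2048          # tokens per batch
max_length = 256           # tokens per sequence
num_eval_examples = 3000   # value quoted in the test comment
sequences_per_step = batch_size // max_length           # 8
eval_steps = num_eval_examples // sequences_per_step    # 375
assert eval_steps == 375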
Example #3
    def test_epoch_basis(self):
        manager = schedule.Manager(train_steps=None,
                                   steps_between_evals=None,
                                   train_epochs=10,
                                   epochs_between_evals=2,
                                   default_train_epochs=None,
                                   batch_size=2048,
                                   max_length=256)

        # For non-TPU, the estimator relies on dataset exhaustion
        self.assertIsNone(manager.single_iteration_train_steps)
        self.assertIsNone(manager.single_iteration_eval_steps)

        self.assertEqual(manager.repeat_dataset, 2)
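In the epoch-based, non-TPU case the manager leaves both per-iteration step counts as None and instead reports `repeat_dataset == epochs_between_evals`, so each train/eval iteration runs until the repeated dataset is exhausted. A hedged sketch of how such a `repeat_dataset` value is typically consumed with tf.data (the input function below is illustrative, not the model's actual pipeline):

import tensorflow as tf

def make_train_input_fn(filenames, repeat_dataset):
    """Illustrative input_fn: repeats the dataset repeat_dataset times."""
    def input_fn():
        ds = tf.data.TFRecordDataset(filenames)
        # Repeating N times and training with steps=None lets one
        # train/eval "iteration" cover N epochs via dataset exhaustion.
        return ds.repeat(repeat_dataset)
    return input_fn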
Example #4
    def test_step_basis(self):
        manager = schedule.Manager(train_steps=1000,
                                   steps_between_evals=100,
                                   train_epochs=None,
                                   epochs_between_evals=None,
                                   default_train_epochs=None,
                                   batch_size=2048,
                                   max_length=256)

        self.assertEqual(manager.single_iteration_train_steps, 100)

        # Evaluation uses the full set
        self.assertIsNone(manager.single_iteration_eval_steps)

        self.assertIsNone(manager.repeat_dataset)
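On the step basis without a TPU, `single_iteration_eval_steps` is None, which matches how `tf.estimator.Estimator.evaluate` behaves: passing `steps=None` evaluates until the eval input_fn signals end of input, i.e. over the full evaluation set. Roughly (`estimator` and `eval_input_fn` below are placeholders for the real objects):

# Sketch: how a None eval-step count maps onto Estimator.evaluate.
# `estimator` and `eval_input_fn` are placeholders, not defined here.
eval_results = estimator.evaluate(
    input_fn=eval_input_fn,
    steps=manager.single_iteration_eval_steps)  # None -> full eval set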
Example #5
    def test_epoch_basis_tpu(self):
        manager = schedule.Manager(train_steps=None,
                                   steps_between_evals=None,
                                   train_epochs=10,
                                   epochs_between_evals=2,
                                   default_train_epochs=None,
                                   batch_size=2048,
                                   max_length=256,
                                   use_tpu=True)

        self.assertEqual(
            manager.single_iteration_train_steps,
            schedule.NUM_EXAMPLES[tf.estimator.ModeKeys.TRAIN] * 2 //
            (2048 / 256))

        # num_eval_examples / (batch_size / max_length) == 3000 / (2048 / 256)
        self.assertEqual(manager.single_iteration_eval_steps, 375)

        self.assertEqual(manager.repeat_dataset, 2)
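The TPU epoch-based expectation uses the same tokens-per-batch arithmetic: with 2048 / 256 = 8 sequences per step, covering the training set twice (epochs_between_evals=2) takes `NUM_EXAMPLES[TRAIN] * 2 // 8` steps. A worked form of the expression from the test (the training-set size below is only a stand-in for illustration; the test uses whatever `schedule.NUM_EXAMPLES` defines):

# Worked form of the TPU train-step count asserted above.
NUM_TRAIN_EXAMPLES = 4500000   # illustrative placeholder, not the real value
batch_size, max_length, epochs_between_evals = 2048, 256, 2
sequences_per_step = batch_size // max_length    # 8
train_steps_per_iteration = (
    NUM_TRAIN_EXAMPLES * epochs_between_evals // sequences_per_step)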
Example #6
def run_transformer(flags_obj):
    """Create tf.Estimator to train and evaluate transformer model.

    Args:
      flags_obj: Object containing parsed flag values.

    Returns:
      Dict of results of the run. Contains the keys `eval_results`,
      `train_hooks`, `bleu_cased`, and `bleu_uncased`. `train_hooks` is a list
      of the instances of hooks used during training.
    """
    num_gpus = flags_core.get_num_gpus(flags_obj)

    # Add flag-defined parameters to params object
    params = PARAMS_MAP[flags_obj.param_set]
    if num_gpus > 1:
        if flags_obj.param_set == "big":
            params = model_params.BIG_MULTI_GPU_PARAMS
        elif flags_obj.param_set == "base":
            params = model_params.BASE_MULTI_GPU_PARAMS

    params["data_dir"] = flags_obj.data_dir
    params["model_dir"] = flags_obj.model_dir
    params["num_parallel_calls"] = flags_obj.num_parallel_calls

    params["tpu"] = flags_obj.tpu
    params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
    params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
    params["allow_ffn_pad"] = not params["use_tpu"]

    params["max_length"] = flags_obj.max_length or params["max_length"]

    params["use_synthetic_data"] = flags_obj.use_synthetic_data

    # Set batch size parameter, which depends on the availability of
    # TPU and GPU, and distribution settings.
    params["batch_size"] = (
        flags_obj.batch_size
        or (params["default_batch_size_tpu"]
            if params["use_tpu"] else params["default_batch_size"]))

    total_batch_size = params["batch_size"]
    if not params["use_tpu"]:
        params["batch_size"] = distribution_utils.per_replica_batch_size(
            params["batch_size"], num_gpus)

    schedule_manager = schedule.Manager(
        train_steps=flags_obj.train_steps,
        steps_between_evals=flags_obj.steps_between_evals,
        train_epochs=flags_obj.train_epochs,
        epochs_between_evals=flags_obj.epochs_between_evals,
        default_train_epochs=DEFAULT_TRAIN_EPOCHS,
        batch_size=params["batch_size"],
        max_length=params["max_length"],
        use_tpu=params["use_tpu"],
        num_tpu_shards=flags_obj.num_tpu_shards)

    params["repeat_dataset"] = schedule_manager.repeat_dataset

    model_helpers.apply_clean(flags.FLAGS)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks,
        model_dir=flags_obj.model_dir,
        tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
        batch_size=total_batch_size,  # for ExamplesPerSecondHook
        use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
    )
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="transformer",
                                  dataset_name="wmt_translate_ende",
                                  run_params=params,
                                  test_id=flags_obj.benchmark_test_id)

    # Train and evaluate transformer model
    estimator = construct_estimator(flags_obj, params, schedule_manager)
    stats = run_loop(
        estimator=estimator,
        # Training arguments
        schedule_manager=schedule_manager,
        train_hooks=train_hooks,
        benchmark_logger=benchmark_logger,
        # BLEU calculation arguments
        bleu_source=flags_obj.bleu_source,
        bleu_ref=flags_obj.bleu_ref,
        bleu_threshold=flags_obj.stop_threshold,
        vocab_file=flags_obj.vocab_file)

    if flags_obj.export_dir and not params["use_tpu"]:
        serving_input_fn = export.build_tensor_serving_input_receiver_fn(
            shape=[None], dtype=tf.int64, batch_size=None)
        # Export saved model, and save the vocab file as an extra asset. The vocab
        # file is saved to allow consistent input encoding and output decoding.
        # (See the "Export trained model" section in the README for an example of
        # how to use the vocab file.)
        # Since the model itself does not use the vocab file, this file is saved as
        # an extra asset rather than a core asset.
        estimator.export_savedmodel(
            flags_obj.export_dir,
            serving_input_fn,
            assets_extra={"vocab.txt": flags_obj.vocab_file},
            strip_default_attrs=True)
    return stats
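A hedged sketch of how `run_transformer` is typically driven: flags are defined and parsed with absl, and the parsed flags object is handed to `run_transformer` from a `main` function. The flag-definition helper mentioned in the comments is an assumption about the surrounding script, not something shown in the snippet above.

# Illustrative driver for run_transformer; flag definitions are assumed to be
# registered elsewhere (e.g. by a define_transformer_flags() helper, which is
# an assumption, not part of the code above).
from absl import app
from absl import flags

def main(_):
    return run_transformer(flags.FLAGS)

if __name__ == "__main__":
    app.run(main)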