def test_step_basis_tpu(self):
    manager = schedule.Manager(
        train_steps=1000, steps_between_evals=100, train_epochs=None,
        epochs_between_evals=None, default_train_epochs=None, batch_size=2048,
        max_length=256, use_tpu=True)

    self.assertEqual(manager.single_iteration_train_steps, 100)
    # num_eval_examples / (batch_size / max_length) == 3000 / (2048 / 256)
    self.assertEqual(manager.single_iteration_eval_steps, 375)
    self.assertIsNone(manager.repeat_dataset)

def test_epoch_basis(self):
    manager = schedule.Manager(
        train_steps=None, steps_between_evals=None, train_epochs=10,
        epochs_between_evals=2, default_train_epochs=None, batch_size=2048,
        max_length=256)

    # For non-TPU, the Estimator relies on dataset exhaustion
    self.assertIsNone(manager.single_iteration_train_steps)
    self.assertIsNone(manager.single_iteration_eval_steps)

    self.assertEqual(manager.repeat_dataset, 2)

def test_step_basis(self):
    manager = schedule.Manager(
        train_steps=1000, steps_between_evals=100, train_epochs=None,
        epochs_between_evals=None, default_train_epochs=None, batch_size=2048,
        max_length=256)

    self.assertEqual(manager.single_iteration_train_steps, 100)

    # Evaluation uses the full set
    self.assertIsNone(manager.single_iteration_eval_steps)

    self.assertIsNone(manager.repeat_dataset)

def test_epoch_basis_tpu(self):
    manager = schedule.Manager(
        train_steps=None, steps_between_evals=None, train_epochs=10,
        epochs_between_evals=2, default_train_epochs=None, batch_size=2048,
        max_length=256, use_tpu=True)

    self.assertEqual(
        manager.single_iteration_train_steps,
        schedule.NUM_EXAMPLES[tf.estimator.ModeKeys.TRAIN] * 2 // (2048 // 256)
    )

    # num_eval_examples / (batch_size / max_length) == 3000 / (2048 / 256)
    self.assertEqual(manager.single_iteration_eval_steps, 375)

    self.assertEqual(manager.repeat_dataset, 2)
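
The TPU assertions above all reduce to one conversion: a token-based batch of
batch_size tokens holds batch_size / max_length padded sentences. A minimal
sketch of that arithmetic (the standalone helper is illustrative, not the real
schedule.Manager internals):

def tpu_steps_per_iteration(num_examples, epochs_between_evals,
                            batch_size, max_length):
  # On TPU, batch_size counts tokens; dividing by the padded sentence length
  # gives sentences per batch.
  sentences_per_batch = batch_size // max_length
  # Steps needed to pass over every example epochs_between_evals times.
  return num_examples * epochs_between_evals // sentences_per_batch

# Matches the eval assertions above: 3000 eval examples, one pass per
# iteration, 2048 tokens per batch, max_length 256 gives 375 steps.
assert tpu_steps_per_iteration(3000, 1, 2048, 256) == 375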
Example #5
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"]))

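  # Assumption: per_device_batch_size splits the global batch size evenly
  # across the available GPUs, so each device sees batch_size / num_gpus.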
  if not params["use_tpu"]:
    params["batch_size"] = distribution_utils.per_device_batch_size(
        params["batch_size"], num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards
  )

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
      use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
  )
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # Train and evaluate transformer model
  estimator = construct_estimator(flags_obj, params, schedule_manager)
  run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)

  if flags_obj.export_dir and not params["use_tpu"]:
    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[None], dtype=tf.int64, batch_size=None)
    # Export saved model, and save the vocab file as an extra asset. The vocab
    # file is saved to allow consistent input encoding and output decoding.
    # (See the "Export trained model" section in the README for an example of
    # how to use the vocab file.)
    # Since the model itself does not use the vocab file, this file is saved as
    # an extra asset rather than a core asset.
    estimator.export_savedmodel(
        flags_obj.export_dir, serving_input_fn,
        assets_extra={"vocab.txt": flags_obj.vocab_file},
        strip_default_attrs=True)
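
A typical entry point for run_transformer is an absl app. A sketch, assuming
define_transformer_flags is defined alongside this function and registers the
flags it reads:

from absl import app as absl_app
from absl import flags

def main(_):
  run_transformer(flags.FLAGS)

if __name__ == "__main__":
  define_transformer_flags()  # assumption: registers the flags read above
  absl_app.run(main)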
Example #6
def run_transformer(flags_obj):
    """Create tf.Estimator to train and evaluate transformer model.

    Args:
      flags_obj: Object containing parsed flag values.
    """
    # num_gpus = flags_core.get_num_gpus(flags_obj)
    num_gpus = 1  # This variant hardcodes a single GPU.
    # Add flag-defined parameters to params object
    params = PARAMS_MAP[flags_obj.param_set]
    if num_gpus > 1:
        if flags_obj.param_set == "big":
            params = model_params.BIG_MULTI_GPU_PARAMS
        elif flags_obj.param_set == "base":
            params = model_params.BASE_MULTI_GPU_PARAMS

    params["data_dir"] = flags_obj.data_dir
    params["model_dir"] = flags_obj.model_dir
    params["num_parallel_calls"] = flags_obj.num_parallel_calls

    params["tpu"] = False #flags_obj.tpu
    params["use_tpu"] = False#bool(flags_obj.tpu)  # was a tpu specified.
    params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
    params["allow_ffn_pad"] = not params["use_tpu"]

    params["use_synthetic_data"] = False#flags_obj.use_synthetic_data

    # Set batch size parameter, which depends on the availability of
    # TPU and GPU, and distribution settings.
    params["batch_size"] = (flags_obj.batch_size or (
        params["default_batch_size_tpu"] if params["use_tpu"]
        else params["default_batch_size"]))

    # if not params["use_tpu"]:
    #     params["batch_size"] = distribution_utils.per_device_batch_size(
    #         params["batch_size"], num_gpus)

    schedule_manager = schedule.Manager(
        train_steps=flags_obj.train_steps,
        steps_between_evals=flags_obj.steps_between_evals,
        train_epochs=flags_obj.train_epochs,
        epochs_between_evals=flags_obj.epochs_between_evals,
        default_train_epochs=DEFAULT_TRAIN_EPOCHS,
        batch_size=params["batch_size"],
        max_length=params["max_length"],
        use_tpu=params["use_tpu"],
        num_tpu_shards=None  # flags_obj.num_tpu_shards
    )

    params["repeat_dataset"] = schedule_manager.repeat_dataset

    # Train and evaluate transformer model
    estimator = construct_estimator(flags_obj, params, schedule_manager)
    run_loop(
        estimator=estimator,
        # Training arguments
        schedule_manager=schedule_manager,
        train_hooks=None,  # train_hooks
        benchmark_logger=None,  # benchmark_logger
        # BLEU calculation arguments
        bleu_source=flags_obj.bleu_source,
        bleu_ref=flags_obj.bleu_ref,
        bleu_threshold=flags_obj.stop_threshold,
        vocab_file=flags_obj.vocab_file,
        vocab_file_target=flags_obj.vocab_file_en)

    if flags_obj.export_dir and not params["use_tpu"]:
        # serving_input_fn = export.build_tensor_serving_input_receiver_fn(
        #     shape=[None], dtype=tf.int64, batch_size=None)
        def serving_input_fn():
            serialized_tf_example = tf.placeholder(dtype=tf.string, shape=[None], name='input_tensors')
            receiver_tensors = {"predictor_inputs": serialized_tf_example}
            feature_spec = {"words": tf.FixedLenFeature([25], tf.int64)}
            features = tf.parse_example(serialized_tf_example, feature_spec)
            return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)
        # Export saved model, and save the vocab file as an extra asset. The vocab
        # file is saved to allow consistent input encoding and output decoding.
        # (See the "Export trained model" section in the README for an example of
        # how to use the vocab file.)
        # Since the model itself does not use the vocab file, this file is saved as
        # an extra asset rather than a core asset.
        estimator.export_savedmodel(
            flags_obj.export_dir, serving_input_fn,
            assets_extra={"vocab.txt": flags_obj.vocab_file,
                          "vocab_en.txt": flags_obj.vocab_file_en},
            strip_default_attrs=True)
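
A client of the serving signature above must send a serialized tf.Example
carrying exactly 25 int64 ids under "words", per the feature_spec. A
hypothetical request builder (not part of the example itself):

import tensorflow as tf

def build_request(token_ids):
  # feature_spec fixes the "words" feature at a length-25 int64 vector.
  assert len(token_ids) == 25
  example = tf.train.Example(features=tf.train.Features(feature={
      "words": tf.train.Feature(
          int64_list=tf.train.Int64List(value=token_ids))
  }))
  return example.SerializeToString()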
Example #7
def test_mutual_exclusivity(self):
    with self.assertRaises(ValueError):
      schedule.Manager(
          train_steps=100, steps_between_evals=100, train_epochs=2,
          epochs_between_evals=1, default_train_epochs=None, batch_size=2048,
          max_length=256)
Example #8
def run_transformer(flags_obj):
    """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
    num_gpus = flags_core.get_num_gpus(flags_obj)

    # Add flag-defined parameters to params object
    params = PARAMS_MAP[flags_obj.param_set]
    if num_gpus > 1:
        if flags_obj.param_set == "big":
            params = model_params.BIG_MULTI_GPU_PARAMS
        elif flags_obj.param_set == "base":
            params = model_params.BASE_MULTI_GPU_PARAMS

    params["data_dir"] = flags_obj.data_dir
    params["model_dir"] = flags_obj.model_dir
    params["num_parallel_calls"] = flags_obj.num_parallel_calls

    params["tpu"] = flags_obj.tpu
    params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
    params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
    params["allow_ffn_pad"] = not params["use_tpu"]

    params["use_synthetic_data"] = flags_obj.use_synthetic_data

    params["worker_hosts"] = flags_obj.worker_hosts
    params["task_index"] = flags_obj.task_index
    params["server_protocol"] = flags_obj.server_protocol

    # Set batch size parameter, which depends on the availability of
    # TPU and GPU, and distribution settings.
    params["batch_size"] = (
        flags_obj.batch_size
        or (params["default_batch_size_tpu"]
            if params["use_tpu"] else params["default_batch_size"]))

    if not params["use_tpu"]:
        params["batch_size"] = distribution_utils.per_device_batch_size(
            params["batch_size"], num_gpus)

    print("============== Batch Size for each GPU ==============")
    print("Batch Size for each GPU", params["batch_size"])
    print("============== Batch Size for each GPU ==============")

    schedule_manager = schedule.Manager(
        train_steps=flags_obj.train_steps,
        steps_between_evals=flags_obj.steps_between_evals,
        train_epochs=flags_obj.train_epochs,
        epochs_between_evals=flags_obj.epochs_between_evals,
        default_train_epochs=DEFAULT_TRAIN_EPOCHS,
        batch_size=params["batch_size"],
        max_length=params["max_length"],
        use_tpu=params["use_tpu"],
        num_tpu_shards=flags_obj.num_tpu_shards)

    params["repeat_dataset"] = schedule_manager.repeat_dataset

    model_helpers.apply_clean(flags.FLAGS)

    print("============== Train Hooks ==============")
    print(flags_obj.hooks)
    print("============== Train Hooks ==============")

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks,
        model_dir=flags_obj.model_dir,
        save_steps=5000,
        tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
        batch_size=schedule_manager.batch_size  # for ExamplesPerSecondHook
    )
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="transformer",
                                  dataset_name="wmt_translate_ende",
                                  run_params=params,
                                  test_id=flags_obj.benchmark_test_id)

    # Train and evaluate transformer model
    network = construct_network(num_gpus, flags_obj, params, schedule_manager)
    run_loop(
        network=network,
        # Training arguments
        schedule_manager=schedule_manager,
        train_hooks=train_hooks,
        benchmark_logger=benchmark_logger,
        # BLEU calculation arguments
        bleu_source=flags_obj.bleu_source,
        bleu_ref=flags_obj.bleu_ref,
        bleu_threshold=flags_obj.stop_threshold,
        vocab_file=flags_obj.vocab_file)

    if flags_obj.export_dir and not params["use_tpu"]:
        serving_input_fn = export.build_tensor_serving_input_receiver_fn(
            shape=[None], dtype=tf.int64, batch_size=None)