def test_mutual_exclusivity(self):
  with self.assertRaises(ValueError):
    schedule.Manager(
        train_steps=100, steps_between_evals=100, train_epochs=2,
        epochs_between_evals=1, default_train_epochs=None, batch_size=2048,
        max_length=256)
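# A minimal sketch of the kind of check the test above asserts. This is an
# illustrative assumption, not the actual schedule.Manager internals: the
# constructor must reject configurations that set both a step basis and an
# epoch basis for training.
def _validate_train_basis(train_steps, train_epochs):
  """Raise ValueError if both a step and an epoch basis are provided."""
  if train_steps is not None and train_epochs is not None:
    raise ValueError(
        "train_steps and train_epochs are mutually exclusive; set only one.")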
def test_step_basis_tpu(self):
  manager = schedule.Manager(
      train_steps=1000, steps_between_evals=100, train_epochs=None,
      epochs_between_evals=None, default_train_epochs=None, batch_size=2048,
      max_length=256, use_tpu=True)
  self.assertEqual(manager.single_iteration_train_steps, 100)

  # num_eval_examples / (batch_size / max_length) == 3000 / (2048 / 256)
  self.assertEqual(manager.single_iteration_eval_steps, 375)
  self.assertIsNone(manager.repeat_dataset)
def test_epoch_basis(self):
  manager = schedule.Manager(
      train_steps=None, steps_between_evals=None, train_epochs=10,
      epochs_between_evals=2, default_train_epochs=None, batch_size=2048,
      max_length=256)

  # For non-TPU runs, the estimator relies on dataset exhaustion rather than
  # an explicit step count, so both per-iteration step values are None.
  self.assertIsNone(manager.single_iteration_train_steps)
  self.assertIsNone(manager.single_iteration_eval_steps)
  self.assertEqual(manager.repeat_dataset, 2)
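# Sketch of how repeat_dataset is consumed downstream. The exact input
# pipeline is an assumption here, not code from this module: on an epoch
# basis without a step cap, the input function repeats the dataset
# epochs_between_evals times and lets the estimator run to exhaustion.
def _example_train_input_fn(dataset, repeat_dataset):
  # Expects a tf.data.Dataset. repeat(None) repeats indefinitely; repeat(2)
  # yields the data exactly twice, matching manager.repeat_dataset == 2 above.
  return dataset.repeat(repeat_dataset)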
def test_step_basis(self):
  manager = schedule.Manager(
      train_steps=1000, steps_between_evals=100, train_epochs=None,
      epochs_between_evals=None, default_train_epochs=None, batch_size=2048,
      max_length=256)
  self.assertEqual(manager.single_iteration_train_steps, 100)

  # On non-TPU runs, evaluation covers the full eval set, so no step count
  # is needed.
  self.assertIsNone(manager.single_iteration_eval_steps)
  self.assertIsNone(manager.repeat_dataset)
def test_epoch_basis_tpu(self):
  manager = schedule.Manager(
      train_steps=None, steps_between_evals=None, train_epochs=10,
      epochs_between_evals=2, default_train_epochs=None, batch_size=2048,
      max_length=256, use_tpu=True)
  self.assertEqual(
      manager.single_iteration_train_steps,
      schedule.NUM_EXAMPLES[tf.estimator.ModeKeys.TRAIN] * 2 // (2048 / 256))

  # num_eval_examples / (batch_size / max_length) == 3000 / (2048 / 256)
  self.assertEqual(manager.single_iteration_eval_steps, 375)
  self.assertEqual(manager.repeat_dataset, 2)
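# The TPU assertions above reduce to simple arithmetic: batch_size counts
# tokens, so batch_size / max_length approximates sentences per batch. A
# minimal sketch of that math (an illustration, not the schedule.Manager
# source):
def _steps_per_iteration(num_examples, batch_size, max_length):
  sentences_per_batch = batch_size / max_length  # 2048 / 256 == 8
  return int(num_examples // sentences_per_batch)

assert _steps_per_iteration(3000, 2048, 256) == 375  # matches eval steps above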
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or
                          (params["default_batch_size_tpu"] if params["use_tpu"]
                           else params["default_batch_size"]))
  if not params["use_tpu"]:
    params["batch_size"] = distribution_utils.per_device_batch_size(
        params["batch_size"], num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards)

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
      use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
  )
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # Train and evaluate transformer model
  estimator = construct_estimator(flags_obj, params, schedule_manager)
  run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)

  if flags_obj.export_dir and not params["use_tpu"]:
    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[None], dtype=tf.int64, batch_size=None)
    # Export saved model, and save the vocab file as an extra asset. The vocab
    # file is saved to allow consistent input encoding and output decoding.
    # (See the "Export trained model" section in the README for an example of
    # how to use the vocab file.)
    # Since the model itself does not use the vocab file, this file is saved
    # as an extra asset rather than a core asset.
    estimator.export_savedmodel(
        flags_obj.export_dir, serving_input_fn,
        assets_extra={"vocab.txt": flags_obj.vocab_file},
        strip_default_attrs=True)
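# A minimal driver sketch for invoking run_transformer from the command line.
# The flag-definition helper (define_transformer_flags) and the
# logger.benchmark_context wrapper are assumed to be provided elsewhere in
# this module; this is an illustration of the wiring, not guaranteed API.
def main(_):
  with logger.benchmark_context(flags.FLAGS):
    run_transformer(flags.FLAGS)


if __name__ == "__main__":
  tf.logging.set_verbosity(tf.logging.INFO)
  define_transformer_flags()  # assumed helper registering the flags used above
  absl_app.run(main)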