def test_config_benchmark_file_logger(self):
  # Set the benchmark_log_dir first since the benchmark_logger_type will need
  # the value to be set when it does the validation.
  with flagsaver.flagsaver(benchmark_log_dir="/tmp"):
    with flagsaver.flagsaver(benchmark_logger_type="BenchmarkFileLogger"):
      logger.config_benchmark_logger()
      self.assertIsInstance(logger.get_benchmark_logger(),
                            logger.BenchmarkFileLogger)
def run_loop(name, train_input_fn, eval_input_fn, model_column_fn,
             build_estimator_fn, flags_obj, tensors_to_log, early_stop=False):
  """Define training loop."""
  model_helpers.apply_clean(flags.FLAGS)
  model = build_estimator_fn(
      model_dir=flags_obj.model_dir, model_type=flags_obj.model_type,
      model_column_fn=model_column_fn,
      inter_op=flags_obj.inter_op_parallelism_threads,
      intra_op=flags_obj.intra_op_parallelism_threads)

  run_params = {
      'batch_size': flags_obj.batch_size,
      'train_epochs': flags_obj.train_epochs,
      'model_type': flags_obj.model_type,
  }

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('wide_deep', name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  loss_prefix = LOSS_PREFIX.get(flags_obj.model_type, '')
  tensors_to_log = {k: v.format(loss_prefix=loss_prefix)
                    for k, v in tensors_to_log.items()}
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks, model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size, tensors_to_log=tensors_to_log)

  # Train and evaluate the model every `flags.epochs_between_evals` epochs.
  for n in range(flags_obj.train_epochs // flags_obj.epochs_between_evals):
    model.train(input_fn=train_input_fn, hooks=train_hooks)

    results = model.evaluate(input_fn=eval_input_fn)

    # Display evaluation metrics
    tf.logging.info('Results at epoch %d / %d',
                    (n + 1) * flags_obj.epochs_between_evals,
                    flags_obj.train_epochs)
    tf.logging.info('-' * 60)

    for key in sorted(results):
      tf.logging.info('%s: %s' % (key, results[key]))

    benchmark_logger.log_evaluation_result(results)

    if early_stop and model_helpers.past_stop_threshold(
        flags_obj.stop_threshold, results['accuracy']):
      break

  # Export the model
  if flags_obj.export_dir is not None:
    export_model(model, flags_obj.model_type, flags_obj.export_dir,
                 model_column_fn)
def get_logging_metric_hook(tensors_to_log=None,
                            every_n_secs=600,
                            **kwargs):  # pylint: disable=unused-argument
  """Function to get LoggingMetricHook.

  Args:
    tensors_to_log: List of tensor names or dictionary mapping labels to tensor
      names. If not set, log _TENSORS_TO_LOG by default.
    every_n_secs: `int`, the frequency for logging the metric. Default to every
      10 mins.
    **kwargs: a dictionary of arguments.

  Returns:
    Returns a LoggingMetricHook that saves tensor values in a JSON format.
  """
  if tensors_to_log is None:
    tensors_to_log = _TENSORS_TO_LOG
  return metric_hook.LoggingMetricHook(
      tensors=tensors_to_log,
      metric_logger=logger.get_benchmark_logger(),
      every_n_secs=every_n_secs)
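
# A minimal usage sketch, not part of the original sources, showing how the
# factory above might be attached to an estimator's training call. The
# `estimator` and `input_fn` arguments are hypothetical placeholders, and the
# 'train_loss'/'cross_entropy' mapping is only an example label/tensor pair.
# It assumes a benchmark logger has already been configured via
# logger.config_benchmark_logger().
def _example_train_with_metric_logging(estimator, input_fn):
  """Illustrative only: train with a LoggingMetricHook attached."""
  metric_logging_hook = get_logging_metric_hook(
      tensors_to_log={'train_loss': 'cross_entropy'},  # label -> tensor name
      every_n_secs=300)  # log every 5 minutes instead of the default 10
  estimator.train(input_fn=input_fn, hooks=[metric_logging_hook])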
def get_examples_per_second_hook(every_n_steps=100,
                                 batch_size=128,
                                 warm_steps=5,
                                 **kwargs):  # pylint: disable=unused-argument
  """Function to get ExamplesPerSecondHook.

  Args:
    every_n_steps: `int`, print current and average examples per second every
      N steps.
    batch_size: `int`, total batch size used to calculate examples/second from
      global time.
    warm_steps: skip this number of steps before logging and running average.
    **kwargs: a dictionary of arguments to ExamplesPerSecondHook.

  Returns:
    Returns an ExamplesPerSecondHook that logs the current and running average
    number of examples processed per second.
  """
  return hooks.ExamplesPerSecondHook(
      batch_size=batch_size, every_n_steps=every_n_steps,
      warm_steps=warm_steps, metric_logger=logger.get_benchmark_logger())
def log_and_get_hooks(eval_batch_size):
  """Convenience function for hook and logger creation."""
  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"})
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)

  return benchmark_logger, train_hooks
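
# A minimal sketch, not part of the original sources, of how the helper above
# might be used: the returned hooks feed estimator.train and the logger records
# each evaluation. `estimator`, `train_input_fn`, and `eval_input_fn` are
# hypothetical placeholders, and the eval_batch_size value is arbitrary.
def _example_train_eval_with_logging(estimator, train_input_fn, eval_input_fn):
  """Illustrative only: one train/eval pass with benchmark logging."""
  benchmark_logger, train_hooks = log_and_get_hooks(eval_batch_size=1024)
  estimator.train(input_fn=train_input_fn, hooks=train_hooks)
  eval_results = estimator.evaluate(input_fn=eval_input_fn)
  benchmark_logger.log_evaluation_result(eval_results)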
def test_config_base_benchmark_logger(self):
  with flagsaver.flagsaver(benchmark_logger_type="BaseBenchmarkLogger"):
    logger.config_benchmark_logger()
    self.assertIsInstance(logger.get_benchmark_logger(),
                          logger.BaseBenchmarkLogger)
def test_get_default_benchmark_logger(self):
  with flagsaver.flagsaver(benchmark_logger_type="foo"):
    self.assertIsInstance(logger.get_benchmark_logger(),
                          logger.BaseBenchmarkLogger)
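
# A minimal sketch, not from the original tests, of the flow these tests
# exercise in a training script: select a logger type via flags, configure it
# once, then log run metadata through the shared accessor. The flag values,
# model/dataset names, and run_params below are illustrative placeholders only.
def _example_configure_and_log_run_info():
  """Illustrative only: configure the benchmark logger and record run info."""
  with flagsaver.flagsaver(benchmark_log_dir="/tmp",
                           benchmark_logger_type="BenchmarkFileLogger"):
    logger.config_benchmark_logger()
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info("example_model", "example_dataset",
                                  {"batch_size": 32}, test_id=None)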
def run_deep_speech(_):
  """Run deep speech training and eval loop."""
  tf.set_random_seed(flags_obj.seed)
  # Data preprocessing
  tf.logging.info("Data preprocessing...")
  train_speech_dataset = generate_dataset(flags_obj.train_data_dir)
  eval_speech_dataset = generate_dataset(flags_obj.eval_data_dir)

  # Number of label classes. Label string is "[a-z]' -"
  num_classes = len(train_speech_dataset.speech_labels)

  # Use distribution strategy for multi-gpu training
  num_gpus = flags_core.get_num_gpus(flags_obj)
  distribution_strategy = distribution_utils.get_distribution_strategy(
      num_gpus=num_gpus)
  run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      model_dir=flags_obj.model_dir,
      config=run_config,
      params={
          "num_classes": num_classes,
      })

  # Benchmark logging
  run_params = {
      "batch_size": flags_obj.batch_size,
      "train_epochs": flags_obj.train_epochs,
      "rnn_hidden_size": flags_obj.rnn_hidden_size,
      "rnn_hidden_layers": flags_obj.rnn_hidden_layers,
      "rnn_type": flags_obj.rnn_type,
      "is_bidirectional": flags_obj.is_bidirectional,
      "use_bias": flags_obj.use_bias
  }

  dataset_name = "LibriSpeech"
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info("deep_speech", dataset_name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size)

  per_replica_batch_size = per_device_batch_size(flags_obj.batch_size, num_gpus)

  def input_fn_train():
    return dataset.input_fn(per_replica_batch_size, train_speech_dataset)

  def input_fn_eval():
    return dataset.input_fn(per_replica_batch_size, eval_speech_dataset)

  total_training_cycle = (flags_obj.train_epochs //
                          flags_obj.epochs_between_evals)
  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: %d/%d",
                    cycle_index + 1, total_training_cycle)

    # Perform batch_wise dataset shuffling
    train_speech_dataset.entries = dataset.batch_wise_dataset_shuffle(
        train_speech_dataset.entries, cycle_index, flags_obj.sortagrad,
        flags_obj.batch_size)

    estimator.train(input_fn=input_fn_train, hooks=train_hooks)

    # Evaluation
    tf.logging.info("Starting to evaluate...")

    eval_results = evaluate_model(
        estimator, eval_speech_dataset.speech_labels,
        eval_speech_dataset.entries, input_fn_eval)

    # Log the WER and CER results.
    benchmark_logger.log_evaluation_result(eval_results)
    tf.logging.info("Iteration {}: WER = {:.2f}, CER = {:.2f}".format(
        cycle_index + 1, eval_results[_WER_KEY], eval_results[_CER_KEY]))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(
        flags_obj.wer_threshold, eval_results[_WER_KEY]):
      break
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.

  Returns:
    Dict of results of the run. Contains the keys `eval_results`,
    `train_hooks`, `bleu_cased`, and `bleu_uncased`. `train_hooks` is a list
    of the instances of hooks used during training.
  """
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  params["max_length"] = flags_obj.max_length or params["max_length"]

  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"]))

  total_batch_size = params["batch_size"]
  if not params["use_tpu"]:
    params["batch_size"] = per_replica_batch_size(
        params["batch_size"], num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards)

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=total_batch_size,  # for ExamplesPerSecondHook
      use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
  )
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # Train and evaluate transformer model
  estimator = construct_estimator(flags_obj, params, schedule_manager)
  stats = run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)

  if flags_obj.export_dir and not params["use_tpu"]:
    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[None], dtype=tf.int64, batch_size=None)
    # Export saved model, and save the vocab file as an extra asset. The vocab
    # file is saved to allow consistent input encoding and output decoding.
    # (See the "Export trained model" section in the README for an example of
    # how to use the vocab file.)
    # Since the model itself does not use the vocab file, this file is saved as
    # an extra asset rather than a core asset.
    estimator.export_savedmodel(
        flags_obj.export_dir, serving_input_fn,
        assets_extra={"vocab.txt": flags_obj.vocab_file},
        strip_default_attrs=True)
  return stats
def resnet_main(flags_obj, model_function, input_function, dataset_name,
                shape=None):
  """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.

  Returns:
    Dict of results of the run. Contains the keys `eval_results` and
    `train_hooks`. `eval_results` contains accuracy (top_1) and accuracy_top_5.
    `train_hooks` is a list of the instances of hooks used during training.
  """
  model_helpers.apply_clean(flags.FLAGS)

  # Ensures flag override logic is only executed if explicitly triggered.
  if flags_obj.tf_gpu_thread_mode:
    override_flags_and_set_envars_for_gpu_thread_pool(flags_obj)

  # Configures cluster spec for distribution strategy.
  num_workers = distribution_utils.configure_cluster(flags_obj.worker_hosts,
                                                     flags_obj.task_index)

  # Creates session config. allow_soft_placement = True, is required for
  # multi-GPU and is not harmful for other modes.
  session_config = tf.compat.v1.ConfigProto(
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=True)

  distribution_strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=flags_obj.distribution_strategy,
      num_gpus=flags_core.get_num_gpus(flags_obj),
      all_reduce_alg=flags_obj.all_reduce_alg,
      num_packs=flags_obj.num_packs)

  # Creates a `RunConfig` that checkpoints every 24 hours which essentially
  # results in checkpoints determined only by `epochs_between_evals`.
  run_config = tf.estimator.RunConfig(
      train_distribute=distribution_strategy,
      session_config=session_config,
      save_checkpoints_secs=60 * 60 * 24,
      save_checkpoints_steps=None)

  # Initializes model with all but the dense layer from pretrained ResNet.
  if flags_obj.pretrained_model_checkpoint_path is not None:
    warm_start_settings = tf.estimator.WarmStartSettings(
        flags_obj.pretrained_model_checkpoint_path,
        vars_to_warm_start='^(?!.*dense)')
  else:
    warm_start_settings = None

  classifier = tf.estimator.Estimator(
      model_fn=model_function,
      model_dir=flags_obj.model_dir,
      config=run_config,
      warm_start_from=warm_start_settings,
      params={
          'resnet_size': int(flags_obj.resnet_size),
          'data_format': flags_obj.data_format,
          'batch_size': flags_obj.batch_size,
          'resnet_version': int(flags_obj.resnet_version),
          'loss_scale': flags_core.get_loss_scale(flags_obj,
                                                  default_for_fp16=128),
          'dtype': flags_core.get_tf_dtype(flags_obj),
          'fine_tune': flags_obj.fine_tune,
          'num_workers': num_workers,
      })

  run_params = {
      'batch_size': flags_obj.batch_size,
      'dtype': flags_core.get_tf_dtype(flags_obj),
      'resnet_size': flags_obj.resnet_size,
      'resnet_version': flags_obj.resnet_version,
      'synthetic_data': flags_obj.use_synthetic_data,
      'train_epochs': flags_obj.train_epochs,
      'num_workers': num_workers,
  }
  if flags_obj.use_synthetic_data:
    dataset_name = dataset_name + '-synthetic'
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('resnet', dataset_name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size)

  def input_fn_train(num_epochs, input_context=None):
    return input_function(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=per_replica_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=num_epochs,
        dtype=flags_core.get_tf_dtype(flags_obj),
        datasets_num_private_threads=flags_obj.datasets_num_private_threads,
        input_context=input_context)

  def input_fn_eval():
    return input_function(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=per_replica_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1,
        dtype=flags_core.get_tf_dtype(flags_obj))

  train_epochs = (0 if flags_obj.eval_only or not flags_obj.train_epochs else
                  flags_obj.train_epochs)

  use_train_and_evaluate = flags_obj.use_train_and_evaluate or num_workers > 1
  if use_train_and_evaluate:
    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda input_context=None: input_fn_train(
            train_epochs, input_context=input_context),
        hooks=train_hooks,
        max_steps=flags_obj.max_train_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=input_fn_eval)
    logging.info('Starting to train and evaluate.')
    tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
    # tf.estimator.train_and_evaluate doesn't return anything in multi-worker
    # case.
    eval_results = {}
  else:
    if train_epochs == 0:
      # If --eval_only is set, perform a single loop with zero train epochs.
      schedule, n_loops = [0], 1
    else:
      # Compute the number of times to loop while training. All but the last
      # pass will train for `epochs_between_evals` epochs, while the last will
      # train for the number needed to reach `training_epochs`. For instance if
      #   train_epochs = 25 and epochs_between_evals = 10
      # schedule will be set to [10, 10, 5]. That is to say, the loop will:
      #   Train for 10 epochs and then evaluate.
      #   Train for another 10 epochs and then evaluate.
      #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
      n_loops = math.ceil(train_epochs / flags_obj.epochs_between_evals)
      schedule = [flags_obj.epochs_between_evals for _ in range(int(n_loops))]
      schedule[-1] = train_epochs - sum(schedule[:-1])  # over counting.

    dur_list = []
    for cycle_index, num_train_epochs in enumerate(schedule):
      logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops))
      stime = time.time()

      if num_train_epochs:
        # Since we are calling classifier.train immediately in each loop, the
        # value of num_train_epochs in the lambda function will not be changed
        # before it is used. So it is safe to ignore the pylint error here
        # pylint: disable=cell-var-from-loop
        classifier.train(
            input_fn=lambda input_context=None: input_fn_train(
                num_train_epochs, input_context=input_context),
            hooks=train_hooks,
            max_steps=flags_obj.max_train_steps)

      # flags_obj.max_train_steps is generally associated with testing and
      # profiling. As a result it is frequently called with synthetic data,
      # which will iterate forever. Passing steps=flags_obj.max_train_steps
      # allows the eval (which is generally unimportant in those circumstances)
      # to terminate. Note that eval will run for max_train_steps each loop,
      # regardless of the global_step count.
      logging.info('Starting to evaluate.')
      eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                         steps=flags_obj.max_train_steps)
      benchmark_logger.log_evaluation_result(eval_results)

      dur = time.time() - stime
      dur_list.append(dur)
      logging.info('DUR FOR EPOCH = %d', dur)

      if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                           eval_results['accuracy']):
        break

    for i in dur_list:
      logging.info('Time_stat = %d', i)

  if flags_obj.export_dir is not None:
    # Exports a saved model for the given classifier.
    export_dtype = flags_core.get_tf_dtype(flags_obj)
    if flags_obj.image_bytes_as_serving_input:
      input_receiver_fn = functools.partial(
          image_bytes_serving_input_fn, shape, dtype=export_dtype)
    else:
      input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
          shape, batch_size=flags_obj.batch_size, dtype=export_dtype)
    classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn,
                                 strip_default_attrs=True)

  stats = {}
  stats['eval_results'] = eval_results
  stats['train_hooks'] = train_hooks

  return stats