def test_log_multiple_metrics(self):
    """Log two metrics and check both JSON records read back from metric.log."""
    log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
    benchmark_log = logger.BenchmarkLogger(log_dir)
    benchmark_log.log_metric(
        "accuracy", 0.999, global_step=1e4, extras={"name": "value"})
    benchmark_log.log_metric("loss", 0.02, global_step=1e4)

    metric_log = os.path.join(log_dir, "metric.log")
    self.assertTrue(tf.gfile.Exists(metric_log))

    # One (name, value, extras) entry per expected line, in the order the
    # metrics were logged above.
    expected_records = (
        ("accuracy", 0.999, [{"name": "name", "value": "value"}]),
        ("loss", 0.02, []),
    )
    with tf.gfile.GFile(metric_log) as f:
        for expected_name, expected_value, expected_extras in expected_records:
            record = json.loads(f.readline())
            self.assertEqual(record["name"], expected_name)
            self.assertEqual(record["value"], expected_value)
            self.assertEqual(record["unit"], None)
            self.assertEqual(record["global_step"], 1e4)
            self.assertEqual(record["extras"], expected_extras)
def test_log_evaluation_result_with_invalid_type(self):
    """A string passed as the evaluation result must not create metric.log."""
    # A string that only looks like a dict; it is not an actual dict.
    not_a_dict = "{'loss': 0.46237424, 'global_step': 207082}"
    log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())

    benchmark_log = logger.BenchmarkLogger(log_dir)
    benchmark_log.log_estimator_evaluation_result(not_a_dict)

    self.assertFalse(tf.gfile.Exists(os.path.join(log_dir, "metric.log")))
def test_log_non_nubmer_value(self):
    """Logging a tensor as the metric value must not create metric.log."""
    log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
    benchmark_log = logger.BenchmarkLogger(log_dir)

    # The value is a tf.constant rather than a plain Python number.
    benchmark_log.log_metric("accuracy", tf.constant(1))

    self.assertFalse(tf.gfile.Exists(os.path.join(log_dir, "metric.log")))
def __init__(self, tensors, log_dir=None, metric_logger=None,
             every_n_iter=None, every_n_secs=None, at_end=False):
    """Set up a LoggingMetricHook and the benchmark logger it writes to.

    Args:
      tensors: `dict` that maps string-valued tags to tensors/tensor names,
        or `iterable` of tensors/tensor names.
      log_dir: `string`, directory path that the metric hook should write
        its log to.
      metric_logger: instance of `BenchmarkLogger`, the benchmark logger the
        hook should use to write the log. Exactly one of `log_dir` and
        `metric_logger` should be provided.
      every_n_iter: `int`, print the values of `tensors` once every N local
        steps taken on the current worker.
      every_n_secs: `int` or `float`, print the values of `tensors` once
        every N seconds. Exactly one of `every_n_iter` and `every_n_secs`
        should be provided.
      at_end: `bool` specifying whether to print the values of `tensors` at
        the end of the run.

    Raises:
      ValueError:
        1. `every_n_iter` is non-positive, or
        2. Exactly one of every_n_iter and every_n_secs should be provided.
        3. Exactly one of log_dir and metric_logger should be provided.
    """
    super(LoggingMetricHook, self).__init__(
        tensors=tensors,
        every_n_iter=every_n_iter,
        every_n_secs=every_n_secs,
        at_end=at_end)

    has_log_dir = log_dir is not None
    has_metric_logger = metric_logger is not None
    # Both given or both missing is an error: the hook needs exactly one
    # source for its logger.
    if has_log_dir == has_metric_logger:
        raise ValueError(
            "exactly one of log_dir and metric_logger should be provided.")

    self._logger = (
        logger.BenchmarkLogger(log_dir) if has_log_dir else metric_logger)
def test_log_evaluation_result(self):
    """Log an evaluation result dict and check the records in metric.log."""
    eval_result = {
        "loss": 0.46237424,
        "global_step": 207082,
        "accuracy": 0.9285,
    }
    log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
    benchmark_log = logger.BenchmarkLogger(log_dir)
    benchmark_log.log_estimator_evaluation_result(eval_result)

    metric_log = os.path.join(log_dir, "metric.log")
    self.assertTrue(tf.gfile.Exists(metric_log))

    # The test expects the accuracy record on the first line and the loss
    # record on the second.
    expected_records = (
        ("accuracy", 0.9285),
        ("loss", 0.46237424),
    )
    with tf.gfile.GFile(metric_log) as f:
        for expected_name, expected_value in expected_records:
            record = json.loads(f.readline())
            self.assertEqual(record["name"], expected_name)
            self.assertEqual(record["value"], expected_value)
            self.assertEqual(record["unit"], None)
            self.assertEqual(record["global_step"], 207082)
def test_create_logging_dir(self):
    """Constructing a BenchmarkLogger creates a log directory that was missing."""
    missing_dir = os.path.join(self.get_temp_dir(), "unknown_dir")

    # Precondition: the directory must not exist before the logger is built.
    self.assertFalse(tf.gfile.IsDirectory(missing_dir))

    logger.BenchmarkLogger(missing_dir)

    self.assertTrue(tf.gfile.IsDirectory(missing_dir))
def resnet_main(flags, model_function, input_function, shape=None):
    """Run the shared train / evaluate / export loop for ResNet models.

    Args:
      flags: FLAGS object that contains the params for running. See
        ResnetArgParser for created flags.
      model_function: the function that instantiates the Model and builds
        the ops for train/eval. This will be passed directly into the
        estimator.
      input_function: the function that processes the dataset and returns a
        dataset that the estimator can train on. This will be wrapped with
        all the relevant flags for running and passed to estimator.
      shape: list of ints representing the shape of the images used for
        training. This is only used if flags.export_dir is passed.
    """
    # Using the Winograd non-fused algorithms provides a small performance
    # boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    if flags.multi_gpu:
        validate_batch_size_for_multi_gpu(flags.batch_size)

        # Multi-GPU needs two wrappers: (1) the model_fn, done here, and
        # (2) the optimizer, done inside the model_fn where the optimizer is
        # defined.
        model_function = tf.contrib.estimator.replicate_model_fn(
            model_function, loss_reduction=tf.losses.Reduction.MEAN)

    # Session config built from inter_op_parallelism_threads and
    # intra_op_parallelism_threads. allow_soft_placement defaults to True,
    # which is required for multi-GPU and not harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
        allow_soft_placement=True)

    # RunConfig carrying the checkpoint setting and the session config.
    run_config = tf.estimator.RunConfig().replace(
        save_checkpoints_secs=1e9, session_config=session_config)

    model_params = {
        'resnet_size': flags.resnet_size,
        'data_format': flags.data_format,
        'batch_size': flags.batch_size,
        'multi_gpu': flags.multi_gpu,
        'version': flags.version,
        'loss_scale': flags.loss_scale,
        'dtype': flags.dtype
    }
    classifier = tf.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags.model_dir,
        config=run_config,
        params=model_params)

    # The benchmark logger is optional; it exists only when a log directory
    # was requested.
    benchmark_logger = None
    if flags.benchmark_log_dir is not None:
        benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
        benchmark_logger.log_run_info('resnet')

    # Both input closures depend only on `flags`, so they are built once
    # rather than on every cycle.
    def input_fn_train():
        return input_function(
            True, flags.data_dir, flags.batch_size,
            flags.epochs_between_evals, flags.num_parallel_calls,
            flags.multi_gpu)

    def input_fn_eval():
        return input_function(
            False, flags.data_dir, flags.batch_size, 1,
            flags.num_parallel_calls, flags.multi_gpu)

    num_cycles = flags.train_epochs // flags.epochs_between_evals
    for _ in range(num_cycles):
        train_hooks = hooks_helper.get_train_hooks(
            flags.hooks,
            batch_size=flags.batch_size,
            benchmark_log_dir=flags.benchmark_log_dir)

        print('Starting a training cycle.')
        classifier.train(
            input_fn=input_fn_train,
            hooks=train_hooks,
            max_steps=flags.max_train_steps)

        print('Starting to evaluate.')
        # flags.max_train_steps is generally associated with testing and
        # profiling, and is therefore frequently used with synthetic data,
        # which iterates forever. Passing steps=flags.max_train_steps lets
        # the eval (generally unimportant in those circumstances) terminate.
        # Note that eval runs for max_train_steps on every cycle, regardless
        # of the global_step count.
        eval_results = classifier.evaluate(
            input_fn=input_fn_eval, steps=flags.max_train_steps)
        print(eval_results)

        if benchmark_logger:
            benchmark_logger.log_estimator_evaluation_result(eval_results)

        reached_threshold = model_helpers.past_stop_threshold(
            flags.stop_threshold, eval_results['accuracy'])
        if reached_threshold:
            break

    if flags.export_dir is not None:
        warn_on_multi_gpu_export(flags.multi_gpu)

        # Export a saved model for the trained classifier.
        input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
            shape, batch_size=flags.batch_size)
        classifier.export_savedmodel(flags.export_dir, input_receiver_fn)
def resnet_main(seed, flags, model_function, input_function, shape=None):
    """Shared main loop for ResNet Models.

    Builds an Estimator from `flags`, then alternates training and
    evaluation, emitting `mlperf_log` entries along the way, until either
    all cycles have run or the evaluation accuracy passes
    `flags.stop_threshold`.

    Args:
      seed: random seed; it is logged under RUN_SET_RANDOM_SEED and passed
        to the RunConfig as `tf_random_seed`.
      flags: FLAGS object that contains the params for running. See
        ResnetArgParser for created flags.
      model_function: the function that instantiates the Model and builds
        the ops for train/eval. This will be passed directly into the
        estimator.
      input_function: the function that processes the dataset and returns a
        dataset that the estimator can train on. This will be wrapped with
        all the relevant flags for running and passed to estimator.
      shape: list of ints representing the shape of the images used for
        training. NOTE(review): not referenced anywhere in this function
        body; confirm whether export support was intentionally dropped.
    """
    mlperf_log.resnet_print(key=mlperf_log.RUN_START)

    # Using the Winograd non-fused algorithms provides a small performance
    # boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    # Create session config based on values of inter_op_parallelism_threads
    # and intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
        allow_soft_placement=True)

    # Pick the distribution strategy from the GPU count: the CPU for 0, a
    # single GPU for 1, and a mirrored strategy across all GPUs otherwise.
    if flags.num_gpus == 0:
        distribution = tf.contrib.distribute.OneDeviceStrategy('device:CPU:0')
    elif flags.num_gpus == 1:
        distribution = tf.contrib.distribute.OneDeviceStrategy('device:GPU:0')
    else:
        distribution = tf.contrib.distribute.MirroredStrategy(
            num_gpus=flags.num_gpus)

    mlperf_log.resnet_print(key=mlperf_log.RUN_SET_RANDOM_SEED, value=seed)
    run_config = tf.estimator.RunConfig(train_distribute=distribution,
                                        save_summary_steps=2000,
                                        save_checkpoints_steps=1000,
                                        session_config=session_config,
                                        tf_random_seed=seed,
                                        keep_checkpoint_max=2)

    mlperf_log.resnet_print(key=mlperf_log.INPUT_BATCH_SIZE,
                            value=flags.batch_size)

    # Every entry in `params` is forwarded to the model_fn unchanged from
    # the flag of the same name.
    classifier = tf.estimator.Estimator(model_fn=model_function,
                                        model_dir=flags.model_dir,
                                        config=run_config,
                                        params={
                                            'resnet_size': flags.resnet_size,
                                            'final_size': flags.final_size,
                                            'pickle_model': flags.pickle_model,
                                            'random_init': flags.random_init,
                                            'data_format': flags.data_format,
                                            'batch_size': flags.batch_size,
                                            'train_epochs': flags.train_epochs,
                                            'version': flags.version,
                                            'version_t': flags.version_t,
                                            'loss_scale': flags.loss_scale,
                                            'gap_train': flags.gap_train,
                                            'gap_lambda': flags.gap_lambda,
                                            'gap_ft': flags.gap_ft,
                                            'gap_start': flags.gap_start,
                                            'dtype': flags.dtype,
                                            'learn_rate': flags.learn_rate,
                                            'label_smoothing': flags.label_smoothing,
                                            'enable_lars': flags.enable_lars,
                                            'enable_cos': flags.enable_cos,
                                            'cos_alpha': flags.cos_alpha,
                                            'warm_up': flags.warm_up,
                                            'weight_decay': flags.weight_decay,
                                            'fine_tune': flags.fine_tune,
                                            'enable_kd': flags.enable_kd,
                                            'kd_size': flags.kd_size,
                                            'temp_dst': flags.temp_dst,
                                            'w_dst': flags.w_dst,
                                            'mix_up': flags.mix_up,
                                            'mx_mode': flags.mx_mode,
                                            'enable_quantize': flags.enable_quantize,
                                            'online_quantize': flags.online_quantize,
                                            'enable_at': flags.enable_at,
                                            'w_at': flags.w_at,
                                        })

    # The benchmark logger is optional; it is created only when a log
    # directory was supplied.
    if flags.benchmark_log_dir is not None:
        benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
        benchmark_logger.log_run_info('resnet')
    else:
        benchmark_logger = None

    mlperf_log.resnet_print(key=mlperf_log.TRAIN_LOOP)

    # The reference performs the first evaluation on the fourth epoch.
    # (offset eval by 3 epochs)
    mlperf_log.resnet_print(key=mlperf_log.EVAL_EPOCH_OFFSET, value=3)

    # Set to True only if the stop threshold is passed; reported with
    # RUN_STOP after the loop.
    success = False

    print('Training epochs: {}'.format(flags.train_epochs))
    iter_train_epochs = flags.train_epochs

    # Each iteration is one train + eval cycle covering
    # flags.epochs_between_evals epochs.
    for i in range(iter_train_epochs // flags.epochs_between_evals):
        # Data for epochs_between_evals (i.e. 4 epochs between evals) worth
        # of epochs is concatenated and run as a single block inside a
        # session. For this reason we declare all of the epochs that will be
        # run at the start. Submitters may report in a way which is
        # reasonable for their control flow.
        for j in range(flags.epochs_between_evals):
            mlperf_log.resnet_print(key=mlperf_log.TRAIN_EPOCH,
                                    value=i * flags.epochs_between_evals + j)

        # input functions
        # Evaluation reads a single epoch; training reads
        # epochs_between_evals epochs. Both use the batch size returned by
        # per_device_batch_size.
        def input_fn_eval():
            return input_function(is_training=False,
                                  data_dir=flags.data_dir,
                                  batch_size=per_device_batch_size(
                                      flags.batch_size, flags.num_gpus),
                                  num_epochs=1,
                                  dtype=flags.dtype,
                                  oss_load=flags.oss_load)

        def input_fn_train():
            return input_function(is_training=True,
                                  data_dir=flags.data_dir,
                                  batch_size=per_device_batch_size(
                                      flags.batch_size, flags.num_gpus),
                                  num_epochs=flags.epochs_between_evals,
                                  num_gpus=flags.num_gpus,
                                  dtype=flags.dtype,
                                  mix_up=flags.mix_up,
                                  oss_load=flags.oss_load)

        # hooks for training
        train_hooks = hooks_helper.get_train_hooks(
            flags.hooks,
            batch_size=flags.batch_size,
            benchmark_log_dir=flags.benchmark_log_dir)

        # Single-slot cache: `formatter` below replaces its content on each
        # call, so after training it holds only the most recent values.
        _log_cache = []

        def formatter(x):
            """Abuse side effects to get tensors out of the model_fn."""
            if _log_cache:
                _log_cache.pop()
            _log_cache.append(x.copy())
            return str(x)

        # The hook is configured with a very large every_n_iter plus
        # at_end=True; presumably so that it only fires at the end of
        # training (confirm against the LoggingTensorHook semantics).
        compliance_hook = tf.train.LoggingTensorHook(
            tensors={_NUM_EXAMPLES_NAME: _NUM_EXAMPLES_NAME},
            every_n_iter=int(1e10),
            at_end=True,
            formatter=formatter)
        extra_hooks = [compliance_hook]

        if flags.enable_quantize:
            if flags.online_quantize:
                # online calculate the KL scale before train-eval
                # The result of this one-step evaluation is not used; it is
                # overwritten by the real evaluation further down.
                quant_online_hook = QuantHook(bits=flags.q_bits,
                                              online=True,
                                              quant_mode=flags.q_mode)
                eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                                   steps=1,
                                                   hooks=[quant_online_hook])
            # NOTE(review): the training quantization hook is assumed to
            # apply whenever quantization is enabled, not only in online
            # mode; confirm this nesting against the original layout.
            quant_train_hook = QuantHook(bits=flags.q_bits,
                                         quant_copy_num=flags.copy_num,
                                         quant_mode=flags.q_mode)
            extra_hooks.append(quant_train_hook)

        print('Starting a training cycle.')
        classifier.train(input_fn=input_fn_train,
                         hooks=train_hooks + extra_hooks,
                         max_steps=flags.max_train_steps)

        # Retrieve the value captured by `formatter` during training. This
        # raises IndexError if the compliance hook never called it.
        train_examples = int(_log_cache.pop()[_NUM_EXAMPLES_NAME])
        mlperf_log.resnet_print(key=mlperf_log.INPUT_SIZE, value=train_examples)

        # Evaluate the model and print results
        print('Starting to evaluate.')
        mlperf_log.resnet_print(key=mlperf_log.EVAL_START)

        # flags.max_train_steps is generally associated with testing and
        # profiling. As a result it is frequently called with synthetic
        # data, which will iterate forever. Passing
        # steps=flags.max_train_steps allows the eval (which is generally
        # unimportant in those circumstances) to terminate. Note that eval
        # will run for max_train_steps each loop, regardless of the
        # global_step count.
        eval_hooks = None
        if flags.enable_quantize:
            quant_eval_hook = QuantHook(bits=flags.q_bits,
                                        quant_mode=flags.q_mode)
            eval_hooks = [quant_eval_hook]

        eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                           steps=flags.max_train_steps,
                                           hooks=eval_hooks)

        mlperf_log.resnet_print(key=mlperf_log.EVAL_STOP)
        mlperf_log.resnet_print(key=mlperf_log.EVAL_SIZE,
                                value=int(eval_results[_NUM_EXAMPLES_NAME]))
        mlperf_log.resnet_print(key=mlperf_log.EVAL_ACCURACY,
                                value=float(eval_results['accuracy']))
        mlperf_log.resnet_print(key=mlperf_log.EVAL_TARGET,
                                value=flags.stop_threshold)
        print(eval_results)

        if benchmark_logger:
            benchmark_logger.log_estimator_evaluation_result(eval_results)

        # Stop early once the accuracy target has been reached.
        if model_helpers.past_stop_threshold(flags.stop_threshold,
                                             eval_results['accuracy']):
            success = True
            break

    mlperf_log.resnet_print(key=mlperf_log.RUN_STOP, value={"success": success})
    mlperf_log.resnet_print(key=mlperf_log.RUN_FINAL)