def testShiftRatio(self):
    """Check the image preprocessor's shift_ratio for two worker task indices.

    With four worker hosts configured, the test expects a shift ratio of 0.0
    for task_index=0 and 0.75 for task_index=3.
    """
    test_util.monkey_patch_base_cluster_manager()
    fake_data_dir = os.path.join(platforms_util.get_test_data_dir(),
                                 'fake_tf_record_data')
    first_task_params = benchmark_cnn.make_params(
        data_name='imagenet',
        data_dir=fake_data_dir,
        job_name='worker',
        worker_hosts='w1,w2,w3,w4',
        ps_hosts='p1',
        task_index=0)
    first_task_bench = benchmark_cnn.BenchmarkCNN(first_task_params)
    self.assertEqual(first_task_bench.image_preprocessor.shift_ratio, 0.0)

    # Same cluster, but seen from the last of the four workers.
    last_task_params = first_task_params._replace(task_index=3)
    last_task_bench = benchmark_cnn.BenchmarkCNN(last_task_params)
    self.assertEqual(last_task_bench.image_preprocessor.shift_ratio, 0.75)
def main(params):
    """Run benchmarks for TensorFlow."""
    print("Run benchmarks for TensorFlow.")
    version = utils.get_tensorflow_version()
    print('TensorFlow: %i.%i' % (version[0], version[1]))

    # all_reduce_spec is only switched on when a worker IP list is supplied.
    params.all_reduce_spec = False
    if params.ip_list:
        worker_port = '5001'
        worker_addresses = [
            host + ":" + worker_port for host in params.ip_list.split(',')
        ]
        # Publish the cluster layout and this process's role via TF_CONFIG.
        cluster_spec = {
            'cluster': {'worker': worker_addresses},
            'task': {
                'type': 'worker',
                'index': params.job_index,
            },
        }
        os.environ["TF_CONFIG"] = json.dumps(cluster_spec)
        params.all_reduce_spec = True

    # Pick the benchmark driver matching the requested model family.
    if params.model in NLP_MODELS:
        benchmark_cls = benchmark_nlp.BenchmarkNLP
    else:
        benchmark_cls = benchmark_cnn.BenchmarkCNN
    benchmark_cls(params).run()
def main(positional_arguments):
    """Run BenchmarkCNN from flags, optionally loading the DMO file system.

    Raises:
      ValueError: if any positional argument besides the program name is given.
    """
    import sys

    # A command line such as '--distortions False' parses as
    # '--distortions=True' followed by the positional argument 'False'.
    # Rejecting positional arguments keeps that mistake from going unnoticed.
    assert len(positional_arguments) >= 1
    unexpected = positional_arguments[1:]
    if unexpected:
        raise ValueError('Received unknown positional arguments: %s'
                         % unexpected)

    params = benchmark_cnn.setup(benchmark_cnn.make_params_from_flags())

    # The explicit comparisons against True/False are retained on purpose so
    # that non-bool values behave exactly as before.
    if params.enable_dmo == True:  # pylint: disable=singleton-comparison
        if LoadFileSystem() == False:  # pylint: disable=singleton-comparison
            sys.exit(-1)
        print("\n*******DMO enabled********\n")

    bench = benchmark_cnn.BenchmarkCNN(params)
    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow: %i.%i' % (version[0], version[1]))

    bench.print_info()
    bench.run()
def main(positional_arguments):
    """Run BenchmarkCNN from flags and print the threading/dataset settings.

    Raises:
      ValueError: if any positional argument besides the program name is given.
    """
    # A command line such as '--distortions False' parses as
    # '--distortions=True' followed by the positional argument 'False'.
    # Rejecting positional arguments keeps that mistake from going unnoticed.
    assert len(positional_arguments) >= 1
    unexpected = positional_arguments[1:]
    if unexpected:
        raise ValueError('Received unknown positional arguments: %s'
                         % unexpected)

    params = benchmark_cnn.setup(benchmark_cnn.make_params_from_flags())
    bench = benchmark_cnn.BenchmarkCNN(params)

    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow: %i.%i' % (version[0], version[1]))
    bench.print_info()

    # Echo the thread-pool and input-pipeline settings used for this run.
    reported_settings = (
        'num_inter_threads',
        'num_intra_threads',
        'datasets_num_private_threads',
        'datasets_use_prefetch',
        'datasets_prefetch_buffer_size',
    )
    for setting in reported_settings:
        print(setting + ': ' + str(getattr(params, setting)))

    bench.run()
def testLearningRate(self):
    """Check the learning rate at selected global steps for several configs.

    Each stage passes a {global_step: expected_learning_rate} mapping to
    self._test_learning_rate.
    """
    # Stage 1: resnet50 with only model and batch size specified.
    resnet_params = benchmark_cnn.make_params(model='resnet50',
                                              batch_size=256)
    resnet_expected = {
        0: 0,
        150136: 0.016,
        150137: 0.0016,
        300273: 0.0016,
        300274: 0.00016,
        10000000: 0.0000016,
    }
    self._test_learning_rate(resnet_params, resnet_expected)

    # Stage 2: explicit initial learning rate.
    fixed_params = resnet_params._replace(init_learning_rate=1.)
    fixed_expected = {0: 1., 10000000: 1.}
    self._test_learning_rate(fixed_params, fixed_expected)

    # Stage 3: five warmup epochs on top of the explicit initial rate.
    warmup_params = fixed_params._replace(init_learning_rate=1.,
                                          num_learning_rate_warmup_epochs=5)
    warmup_expected = {0: 0., 12511: 0.5, 25022: 1., 10000000: 1.}
    self._test_learning_rate(warmup_params, warmup_expected)

    # Stage 4: decay settings with a minimum learning rate.
    decay_params = warmup_params._replace(
        num_learning_rate_warmup_epochs=0,
        learning_rate_decay_factor=0.5,
        num_epochs_per_decay=2,
        minimum_learning_rate=0.3750,
        batch_size=32)
    decay_expected = {
        0: 1.,
        80071: 1.,
        80072: 0.5,
        160143: 0.5,
        160144: 0.375,
        10000000: 0.375,
    }
    self._test_learning_rate(decay_params, decay_expected)

    # Stage 5: building the model is expected to raise ValueError, because
    # params.learning_rate_decay_factor cannot be nonzero when
    # params.num_epochs_per_decay is zero.
    invalid_params = decay_params._replace(num_epochs_per_decay=0.)
    with self.assertRaises(ValueError):
        with tf.Graph().as_default():
            benchmark_cnn.BenchmarkCNN(invalid_params)._build_model()

    # Stage 6: piecewise schedule given as a string.
    piecewise_params = benchmark_cnn.make_params(
        model='trivial',
        batch_size=32,
        piecewise_learning_rate_schedule='1;3;.1;5;.01')
    piecewise_expected = {
        0: 1.,
        120108: 1.,
        120109: 0.1,
        200181: 0.1,
        200182: 0.01,
        100000000: 0.01,
    }
    self._test_learning_rate(piecewise_params, piecewise_expected)
def testDistributedReplicatedSavableVars(self):
    """Check savable_variables() under 'distributed_replicated' updates.

    Every global variable, and every local variable whose name starts with
    'v0/', must appear (by port-stripped name) in the savable variables.
    """
    test_util.monkey_patch_base_cluster_manager()
    fake_data_dir = os.path.join(platforms_util.get_test_data_dir(),
                                 'fake_tf_record_data')
    params = benchmark_cnn.make_params(
        variable_update='distributed_replicated',
        model='inception4',
        data_name='imagenet',
        data_dir=fake_data_dir,
        job_name='worker',
        worker_hosts='w1,w2,w3,w4',
        ps_hosts='p1')
    bench = benchmark_cnn.BenchmarkCNN(params)
    shadow_prefix = variable_mgr_util.PS_SHADOW_VAR_PREFIX
    strip_port = bench.variable_mgr._strip_port

    with tf.Graph().as_default():
        bench._build_model()
        savable = bench.variable_mgr.savable_variables()

        # Assert all global variables are in the savable variables.
        for global_var in tf.global_variables():
            is_tower0_shadow = global_var.name.startswith(
                shadow_prefix + '/v0')
            if not is_tower0_shadow:
                # The only non-shadow global variable allowed is the step.
                self.assertEqual(global_var.name, 'global_step:0')
            key = strip_port(global_var.name)
            if key.startswith(shadow_prefix):
                # Drop the leading "<prefix>/" to get the savable name.
                key = key[len(shadow_prefix) + 1:]
            self.assertIn(key, savable)
            self.assertIn(savable[key], tf.global_variables())

        # Assert all local variables on the first tower are savable.
        for local_var in tf.local_variables():
            if not local_var.name.startswith('v0/'):
                continue
            self.assertIn(strip_port(local_var.name), savable)
def main(positional_arguments):
    """Run BenchmarkCNN from flags with the run wrapped in log_context.

    Raises:
      ValueError: if any positional argument besides the program name is given.
    """
    # A command line such as '--distortions False' parses as
    # '--distortions=True' followed by the positional argument 'False'.
    # Rejecting positional arguments keeps that mistake from going unnoticed.
    assert len(positional_arguments) >= 1
    unexpected = positional_arguments[1:]
    if unexpected:
        raise ValueError('Received unknown positional arguments: %s'
                         % unexpected)

    params = benchmark_cnn.setup(benchmark_cnn.make_params_from_flags())
    bench = benchmark_cnn.BenchmarkCNN(params)

    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow: %i.%i' % (version[0], version[1]))
    bench.print_info()

    # Only the run itself executes inside the logging context.
    logging_scope = log_context(LOGGER_URL, LOGGER_USRENAME, LOGGER_PASSWORD,
                                LOGGER_DB, LOGGER_SERIES, machine=LOGGER_VM)
    with logging_scope:
        bench.run()
def main(positional_arguments):
    """Dump the environment at debug level, then run BenchmarkCNN from flags.

    Setup, construction, and the run all happen inside the MLPerf logger
    context.

    Raises:
      ValueError: if any positional argument besides the program name is given.
    """
    # A command line such as '--distortions False' parses as
    # '--distortions=True' followed by the positional argument 'False'.
    # Rejecting positional arguments keeps that mistake from going unnoticed.
    assert len(positional_arguments) >= 1
    unexpected = positional_arguments[1:]
    if unexpected:
        raise ValueError('Received unknown positional arguments: %s'
                         % unexpected)

    params = benchmark_cnn.make_params_from_flags()

    # Print ENV Variables
    rule = '=' * 20
    tf.logging.debug(rule + ' Environment Variables ' + rule)
    for env_name, env_value in os.environ.items():
        tf.logging.debug('{}: {}'.format(env_name, env_value))

    compliance_logging = absl_flags.FLAGS.ml_perf_compliance_logging
    with mlperf.mlperf_logger(compliance_logging, params.model):
        params = benchmark_cnn.setup(params)
        bench = benchmark_cnn.BenchmarkCNN(params)

        version = cnn_util.tensorflow_version_tuple()
        log_fn('TensorFlow: %i.%i' % (version[0], version[1]))

        bench.print_info()
        bench.run()
def main(positional_arguments):
    """Train or evaluate the GTSRB model through BenchmarkCNN.

    Flag-derived Params are overridden with values from the options object,
    a GTSRB dataset and model are built, and the benchmark is run. The
    default graph is reset afterwards.

    Raises:
      ValueError: if any positional argument besides the program name is given.
    """
    # A command line such as '--distortions False' parses as
    # '--distortions=True' followed by the positional argument 'False'.
    # Rejecting positional arguments keeps that mistake from going unnoticed.
    assert len(positional_arguments) >= 1
    unexpected = positional_arguments[1:]
    if unexpected:
        raise ValueError('Received unknown positional arguments: %s'
                         % unexpected)

    options = make_options_from_flags(FLAGS)

    overrides = dict(
        batch_size=options.batch_size,
        model='MY_GTSRB',
        num_epochs=options.num_epochs,
        num_gpus=options.num_gpus,
        data_format='NHWC',
        train_dir=options.checkpoint_folder,
        allow_growth=True,
        variable_update='replicated',
        local_parameter_device='gpu',
        use_tf_layers=False,
        # Currently disabled overrides:
        #   all_reduce_spec='nccl'
        #   bottom_file=options.bottom_file
        #   affine_files=options.affine_files
        #   affine_classes=options.affine_classes
        #   print_training_accuracy=True
        optimizer=options.optimizer,
        weight_decay=options.weight_decay,
        backbone_model_path=options.backbone_model_path,
        # Summary and checkpoint settings. Currently disabled:
        #   summary_verbosity=1
        #   save_summaries_steps=10
        #   save_model_secs=3600  (save every hour)
        save_model_secs=60,  # save a checkpoint every 60 seconds
    )
    params = benchmark_cnn.make_params_from_flags()._replace(**overrides)
    params = benchmark_cnn.setup(params)

    # A data_dir containing 'test' selects the test dataset.
    if 'test' in options.data_dir:
        dataset_cls = GTSRBTestDataset
    else:
        dataset_cls = GTSRBDataset
    dataset = dataset_cls(options)
    model = Model_Builder(options.model_name, dataset.num_classes, options,
                          params)

    bench = benchmark_cnn.BenchmarkCNN(params, dataset=dataset, model=model)

    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow: %i.%i' % (version[0], version[1]))

    bench.print_info()
    bench.run()

    tf.reset_default_graph()
def main(positional_arguments):
    """Run BenchmarkCNN with behaviour supplied by benchmark_handler.Handler.

    The handler provides the params, dataset and model, and several of its
    methods are installed on the benchmark object before it runs.

    Raises:
      ValueError: if any positional argument besides the program name is given.
    """
    # A command line such as '--distortions False' parses as
    # '--distortions=True' followed by the positional argument 'False'.
    # Rejecting positional arguments keeps that mistake from going unnoticed.
    assert len(positional_arguments) >= 1
    unexpected = positional_arguments[1:]
    if unexpected:
        raise ValueError('Received unknown positional arguments: %s'
                         % unexpected)

    handler = benchmark_handler.Handler(
        benchmark_cnn.make_params_from_flags())
    params = benchmark_cnn.setup(handler.params)

    bench = benchmark_cnn.BenchmarkCNN(params, dataset=handler.dataset,
                                       model=handler.model)
    handler.set_bench(bench)

    # Only preprocessors that expose set_aug_list receive the list.
    set_aug_list = getattr(bench.input_preprocessor, 'set_aug_list', None)
    if set_aug_list:
        set_aug_list(params.aug_list)

    # Replace these benchmark hooks with the handler's implementations.
    handler_hooks = (
        'benchmark_one_step',
        'print_eval_results',
        'check_early_stop',
        'accum_grads',
        'build_fetches_forward',
    )
    for hook_name in handler_hooks:
        setattr(bench, hook_name, getattr(handler, hook_name))

    if params.memory_saving_method == 'recomputing':
        bench.memory_saving = ms.Memory_Saving(benchmark_cnn=bench)

    # The TensorFlow version banner is intentionally not logged here.
    bench.print_info()
    bench.run()
def _test_run_benchmark(self, params):
    """Tests that run_benchmark() runs successfully with the params.

    The last captured log line must report the average time per step.
    """
    captured = []
    recorder = test_util.print_and_add_to_list(captured)
    with test_util.monkey_patch(all_reduce_benchmark, log_fn=recorder):
        bench = benchmark_cnn.BenchmarkCNN(params)
        all_reduce_benchmark.run_benchmark(bench, num_iters=5)
        final_line = captured[-1]
        self.assertRegex(final_line, '^Average time per step: [0-9.]+$')
def _binary_search_batch_size(self, params, init_batch_size):
    """Find the max batch_size using binary search.

    The batch size is first doubled, starting from init_batch_size, until a
    run raises tf.errors.ResourceExhaustedError. The range between the last
    size that ran and the first that failed is then bisected. The result is
    reported through self.report_benchmark as extras['max_batch_size']; it
    is 0 when no attempted batch size ran successfully.

    Args:
      params: Params tuple for the benchmark. num_batches, num_warmup_batches
        and batch_size are overridden here.
      init_batch_size: Positive batch size at which the search starts.
    """
    assert init_batch_size > 0

    # No need to run a warmup or many batches; if it doesn't OOM after 10
    # batches, it should work in general.
    params = params._replace(num_batches=10, num_warmup_batches=0)

    def runs_without_oom(candidate):
        """Return True if a benchmark run at batch size `candidate` fits."""
        tf.logging.info('Trying batch_size %d' % candidate)
        bench = benchmark_cnn.BenchmarkCNN(
            params._replace(batch_size=candidate))
        bench.print_info()
        try:
            bench.run()
        except tf.errors.ResourceExhaustedError:
            return False
        return True

    low_batch_size = 0  # Largest size known to run (0 = none yet).
    high_batch_size = None  # Largest size that might still run.
    batch_size = init_batch_size

    # Find high_batch_size first.
    tf.logging.info(
        'Looking for upper bound to batch size, starting with %d'
        % batch_size)
    while high_batch_size is None:
        if runs_without_oom(batch_size):
            low_batch_size = batch_size
            batch_size *= 2
        else:
            high_batch_size = batch_size - 1

    # Binary search. low_batch_size ran and high_batch_size + 1 failed, so
    # the answer lies in the closed interval [low_batch_size,
    # high_batch_size].
    tf.logging.info(
        'Max batch size is in range [%d, %d]. Starting binary search to '
        'find exact max batch size.' % (low_batch_size, high_batch_size))
    while low_batch_size < high_batch_size:
        # Round up so the loop always makes progress when high == low + 1.
        batch_size = (low_batch_size + high_batch_size + 1) // 2
        if runs_without_oom(batch_size):
            low_batch_size = batch_size
        else:
            high_batch_size = batch_size - 1

    self.report_benchmark(extras={'max_batch_size': low_batch_size})
def main(_):
    """Build Params from flags, set up the environment, and run BenchmarkCNN."""
    flag_params = benchmark_cnn.make_params_from_flags()
    bench = benchmark_cnn.BenchmarkCNN(benchmark_cnn.setup(flag_params))

    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow: %i.%i' % (version[0], version[1]))

    bench.print_info()
    bench.run()
def testInvalidFlags(self):
    """Check that BenchmarkCNN rejects each invalid flag combination.

    Every combination below must make the BenchmarkCNN constructor raise
    ValueError.
    """
    auto_loss_scale = dict(use_fp16=True, fp16_vars=True,
                           fp16_enable_auto_loss_scale=True)

    invalid_flag_sets = [
        dict(device='cpu', data_format='NCHW'),
        dict(use_fp16=True, fp16_vars=True, variable_update='replicated',
             all_reduce_spec='nccl'),
    ]
    # Automatic loss scaling is only supported for 'replicated', 'ps',
    # and 'independent' variable_updates.
    for unsupported_update in ('distributed_replicated',
                               'distributed_all_reduce'):
        invalid_flag_sets.append(
            dict(auto_loss_scale, variable_update=unsupported_update))
    # Automatic loss scaling is not supported for 'nccl'.
    invalid_flag_sets.append(dict(auto_loss_scale, all_reduce_spec='nccl'))
    # Automatic loss scaling is not supported for 'staged_vars'.
    invalid_flag_sets.append(dict(auto_loss_scale, staged_vars=True))

    for flag_set in invalid_flag_sets:
        # make_params itself must succeed; only construction may raise.
        params = benchmark_cnn.make_params(**flag_set)
        with self.assertRaises(ValueError):
            benchmark_cnn.BenchmarkCNN(params)
def _test_learning_rate(self, params, global_step_to_expected_learning_rate):
    """Assert the learning rate tensor's value at the given global steps.

    Args:
      params: Params used to build the BenchmarkCNN model.
      global_step_to_expected_learning_rate: Mapping from a global step value
        to the learning rate expected when that step is fed in.
    """
    bench = benchmark_cnn.BenchmarkCNN(params)
    with tf.Graph().as_default() as graph:
        bench._build_model()
        step_tensor = graph.get_tensor_by_name('global_step:0')
        lr_tensor = graph.get_tensor_by_name('learning_rate_tensor:0')
        with self.test_session(graph=graph, use_gpu=True) as sess:
            expectations = global_step_to_expected_learning_rate.items()
            for step_value, expected_lr in expectations:
                # Feed the step directly rather than training up to it.
                actual_lr = sess.run(lr_tensor,
                                     feed_dict={step_tensor: step_value})
                self.assertAlmostEqual(actual_lr, expected_lr)
def run_with_test_model(params):
    """Runs tf_cnn_benchmarks with a test model.

    Args:
      params: Params passed to the BenchmarkCNN constructor.
    """
    fake_inputs = test_util.get_fake_var_update_inputs()
    test_model = test_util.TestCNNModel()
    # The test model does not use labels when computing loss, so the label
    # values do not matter as long as it's the right shape.
    fake_labels = np.array([1] * fake_inputs.shape[0])

    with test_util.monkey_patch(benchmark_cnn,
                                LOSS_AND_ACCURACY_DIGITS_TO_SHOW=15):
        bench = benchmark_cnn.BenchmarkCNN(
            params, dataset=test_util.TestDataSet(), model=test_model)
        bench.image_preprocessor.set_fake_data(fake_inputs, fake_labels)
        bench.run()
def run_with_real_model(params):
    """Runs tf_cnn_benchmarks with a real model.

    If get_test_image_preprocessor returns a preprocessor, it replaces the
    benchmark's own image preprocessor before the run.

    Args:
      params: Params passed to the BenchmarkCNN constructor.
    """
    bench = benchmark_cnn.BenchmarkCNN(params)
    bench.print_info()

    test_preprocessor = get_test_image_preprocessor(bench.batch_size, params)
    use_test_preprocessor = test_preprocessor is not None
    if use_test_preprocessor:
        # The test image preprocessor requires queue runners. Since this file
        # is used for testing, it is OK to access protected members.
        # pylint: disable=protected-access
        bench.dataset._queue_runner_required = True
        # pylint: enable=protected-access
        bench.image_preprocessor = test_preprocessor

    bench.run()
def _run_benchmark_cnn_with_fake_images(self, params, images, labels):
    """Run BenchmarkCNN on fake images and return the captured log lines.

    benchmark_cnn.log_fn is temporarily replaced so output can be collected.
    The previous log_fn is restored afterwards, even if the run raises, so
    the patch does not leak into other tests.

    Args:
      params: Params passed to the BenchmarkCNN constructor.
      images: Fake image data handed to the test image preprocessor.
      labels: Fake labels handed to the test image preprocessor.

    Returns:
      The list that the replacement log_fn appended to during the run.
    """
    logs = []
    previous_log_fn = benchmark_cnn.log_fn
    benchmark_cnn.log_fn = _print_and_add_to_list(logs)
    try:
        bench = benchmark_cnn.BenchmarkCNN(params)
        bench.image_preprocessor = preprocessing.TestImagePreprocessor(
            227, 227, params.batch_size * params.num_gpus, params.num_gpus,
            benchmark_cnn.get_data_type(params))
        # Protected access is acceptable here because this is test code.
        bench.dataset._queue_runner_required = True  # pylint: disable=protected-access
        bench.image_preprocessor.set_fake_data(images, labels)
        bench.image_preprocessor.expected_subset = (
            'validation' if params.eval else 'train')
        bench.run()
    finally:
        benchmark_cnn.log_fn = previous_log_fn
    return logs
def main(_):
    """Build the single-GPU benchmark graph and train it through Parallax.

    The training loop is supplied to parallax.parallel_run as the nested
    `run` callback.
    """
    # Build benchmark_cnn model
    params = benchmark_cnn.make_params_from_flags()
    params, sess_config = benchmark_cnn.setup(params)
    bench = benchmark_cnn.BenchmarkCNN(params)

    # Print information
    tfversion = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))
    bench.print_info()

    # Build single-GPU benchmark_cnn model
    with tf.Graph().as_default() as single_gpu_graph:
        bench.build_model()

    def run(sess, num_iters, tensor_or_op_name_to_replica_names,
            num_workers, worker_id, num_replicas_per_worker):
        """Run num_iters steps, logging progress every log_frequency steps.

        Args:
          sess: Session used to run the fetches.
          num_iters: Number of steps to execute.
          tensor_or_op_name_to_replica_names: Mapping from a name in the
            single-GPU graph to a sequence of replica names; the first entry
            is fetched.
          num_workers: Unused.
          worker_id: Unused.
          num_replicas_per_worker: Unused.
        """
        def first_replica(tensor_or_op):
            # Look up the first replica of a single-GPU tensor/op.
            return tensor_or_op_name_to_replica_names[tensor_or_op.name][0]

        fetches = {
            'global_step': first_replica(bench.global_step),
            'cost': first_replica(bench.cost),
            'train_op': first_replica(bench.train_op),
        }
        # bench.lr is only fetched when it is a tensor; otherwise its value
        # is logged directly.
        if isinstance(bench.lr, tf.Tensor):
            fetches['lr'] = first_replica(bench.lr)

        last_logged_iter = -1
        start = time.time()
        for i in range(num_iters):
            results = sess.run(fetches)
            if i % FLAGS.log_frequency == 0:
                end = time.time()
                # Divide by the steps actually run since the previous log.
                # The first log fires after a single step, so assuming
                # log_frequency steps would overstate the throughput.
                steps_run = i - last_logged_iter
                throughput = float(steps_run) / float(end - start)
                parallax.log.info(
                    "global step: %d, lr: %f, loss: %f, "
                    "throughput: %f steps/sec"
                    % (results['global_step'],
                       results['lr'] if 'lr' in results else bench.lr,
                       results['cost'], throughput))
                last_logged_iter = i
                start = time.time()

    config = parallax_config.build_config()
    config.sess_config = sess_config

    parallax.parallel_run(single_gpu_graph,
                          run,
                          FLAGS.resource_info_file,
                          FLAGS.max_steps,
                          sync=FLAGS.sync,
                          parallax_config=config)
def testMlPerfCompliance(self):
    """Check that a short run emits the expected MLPerf compliance log lines.

    Log lines are captured through a handler attached to mlperf_log.LOGGER
    and counted per regex in self.EXPECTED_LOG_REGEXES. Any count mismatch
    raises AssertionError listing the differing regexes.
    """
    captured_stream = six.StringIO()
    capture_handler = logging.StreamHandler(captured_stream)
    data_dir = test_util.create_black_and_white_images()
    expected_counts = self.EXPECTED_LOG_REGEXES
    try:
        mlperf_log.LOGGER.addHandler(capture_handler)
        params = benchmark_cnn.make_params(
            data_dir=data_dir,
            data_name='imagenet',
            batch_size=2,
            num_warmup_batches=0,
            num_batches=2,
            num_eval_batches=3,
            eval_during_training_every_n_steps=1,
            distortions=False,
            weight_decay=0.5,
            optimizer='momentum',
            momentum=0.5,
            stop_at_top_1_accuracy=2.0,
            tf_random_seed=9876,
            ml_perf=True)
        with mlperf.mlperf_logger(use_mlperf_logger=True,
                                  model='resnet50_v1.5'):
            benchmark_cnn.BenchmarkCNN(
                params, model=_MlPerfTestModel()).run()

        # Count, per expected regex, how many captured lines match it.
        found_counts = Counter(
            pattern
            for line in captured_stream.getvalue().splitlines()
            for pattern in expected_counts
            if pattern.search(line))

        if found_counts != expected_counts:
            # found - expected; a nonzero entry marks a mismatching regex.
            delta = Counter(found_counts)
            delta.subtract(expected_counts)
            differences = [
                ' For regex %s: Found %d lines matching but '
                'expected to find %d'
                % (pattern.pattern, found_counts[pattern],
                   expected_counts[pattern])
                for pattern in delta.keys() if delta[pattern]
            ]
            raise AssertionError(
                'Logs did not match expected logs. Differences:\n'
                '%s' % '\n'.join(differences))
    finally:
        mlperf_log.LOGGER.removeHandler(capture_handler)
def testModel(self):
    """Run one batch of the model under test, if execution tests are enabled.

    Returns early when no model name is set or when model_execution_test()
    is falsy.
    """
    _check_has_gpu()
    if not (self.get_model_name() and self.model_execution_test()):
        return

    run_params = benchmark_cnn.make_params(
        model=self.get_model_name(),
        num_batches=1,
        num_intra_threads=1,
        num_inter_threads=12,
        batch_size=2,
        distortions=False)

    # Run this one; note that this uses a non-test session.
    benchmark_cnn.BenchmarkCNN(run_params).run()
def main(positional_arguments):
    """Train a model on CifarDataset through BenchmarkCNN.

    Flag-derived Params are overridden with values from the options object
    before the benchmark is set up and run.

    Raises:
      ValueError: if any positional argument besides the program name is given.
    """
    assert len(positional_arguments) >= 1
    unexpected = positional_arguments[1:]
    if unexpected:
        raise ValueError('Received unknown positional arguments: %s'
                         % unexpected)

    options = make_options_from_flags(FLAGS)

    overrides = dict(
        batch_size=options.batch_size,
        # NOTE(review): the model name is 'MY_GTSRB' although the dataset
        # below is CifarDataset -- confirm this is intended.
        model='MY_GTSRB',
        num_epochs=options.num_epochs,
        num_gpus=options.num_gpus,
        data_format='NHWC',
        train_dir=options.checkpoint_folder,
        allow_growth=True,
        variable_update='replicated',
        local_parameter_device='gpu',
        use_tf_layers=False,
        # Currently disabled overrides:
        #   all_reduce_spec='nccl'
        #   bottom_file=options.bottom_file
        #   affine_files=options.affine_files
        #   affine_classes=options.affine_classes
        optimizer=options.optimizer,
        weight_decay=options.weight_decay,
        print_training_accuracy=True,
        backbone_model_path=options.backbone_model_path,
        # Summary and checkpoint settings. Currently disabled:
        #   summary_verbosity=1
        #   save_summaries_steps=10
        #   save_model_secs=300  (save every 5 min)
        save_model_secs=3600,  # save every 1 hour
    )
    params = benchmark_cnn.make_params_from_flags()._replace(**overrides)
    params = benchmark_cnn.setup(params)

    dataset = CifarDataset(options)
    model = Model_Builder(options.model_name, dataset.num_classes, options,
                          params)

    bench = benchmark_cnn.BenchmarkCNN(params, dataset=dataset, model=model)

    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow: %i.%i' % (version[0], version[1]))

    bench.print_info()
    bench.run()
def _get_benchmark_cnn_losses(self, inputs, params):
    """Returns the losses of BenchmarkCNN on the given inputs and params.

    Args:
      inputs: Fake input data; its first dimension sets the number of labels.
      params: Params passed to the BenchmarkCNN constructor.

    Returns:
      A list with the `loss` of each training output parsed from the logs.
    """
    captured = []
    # The test model does not use labels when computing loss, so the label
    # values do not matter as long as it's the right shape.
    fake_labels = np.array([1] * inputs.shape[0])
    test_model = test_util.TestCNNModel()

    patched = test_util.monkey_patch(
        benchmark_cnn,
        log_fn=test_util.print_and_add_to_list(captured),
        LOSS_AND_ACCURACY_DIGITS_TO_SHOW=15)
    with patched:
        bench = benchmark_cnn.BenchmarkCNN(
            params, dataset=test_util.TestDataSet(), model=test_model)
        bench.image_preprocessor.set_fake_data(inputs, fake_labels)
        bench.run()

    training_outputs = test_util.get_training_outputs_from_logs(
        captured, params.print_training_accuracy)
    return [output.loss for output in training_outputs]
def main(positional_arguments):
    """Build BenchmarkCNN from flags and pass it to run_benchmark.

    Raises:
      ValueError: if any positional argument besides the program name is given.
    """
    # A command line such as '--distortions False' parses as
    # '--distortions=True' followed by the positional argument 'False'.
    # Rejecting positional arguments keeps that mistake from going unnoticed.
    assert len(positional_arguments) >= 1
    unexpected = positional_arguments[1:]
    if unexpected:
        raise ValueError('Received unknown positional arguments: %s'
                         % unexpected)

    params = benchmark_cnn.setup(benchmark_cnn.make_params_from_flags())
    bench = benchmark_cnn.BenchmarkCNN(params)

    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow: %i.%i' % (version[0], version[1]))

    iters_per_step = absl_flags.FLAGS.iters_per_step
    run_benchmark(bench, iters_per_step)
def main(extra_flags):
    """Run BenchmarkCNN from flags, rejecting any undefined flags.

    Args:
      extra_flags: Command line arguments not defined in tf.flags.FLAGS.
        extra_flags[0] is always the program name.

    Raises:
      ValueError: if extra_flags holds anything besides the program name.
    """
    # Supplying flags that tf.flags.FLAGS does not define is an error, so
    # anything after the program name is rejected.
    assert len(extra_flags) >= 1
    unknown_flags = extra_flags[1:]
    if unknown_flags:
        raise ValueError('Received unknown flags: %s' % unknown_flags)

    params = benchmark_cnn.make_params_from_flags()
    # NOTE(review): the return value of setup() is discarded here. If setup()
    # returns an updated Params in this version of benchmark_cnn, it should
    # be rebound to `params` -- confirm.
    benchmark_cnn.setup(params)
    bench = benchmark_cnn.BenchmarkCNN(params)

    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow: %i.%i' % (version[0], version[1]))

    bench.print_info()
    bench.run()
def main(positional_arguments):
    """Run BenchmarkCNN from flags inside the MLPerf logger context.

    Raises:
      ValueError: if any positional argument besides the program name is given.
    """
    # A command line such as '--distortions False' parses as
    # '--distortions=True' followed by the positional argument 'False'.
    # Rejecting positional arguments keeps that mistake from going unnoticed.
    assert len(positional_arguments) >= 1
    unexpected = positional_arguments[1:]
    if unexpected:
        raise ValueError("Received unknown positional arguments: %s"
                         % unexpected)

    params = benchmark_cnn.make_params_from_flags()
    compliance_logging = absl_flags.FLAGS.ml_perf_compliance_logging
    with mlperf.mlperf_logger(compliance_logging, params.model):
        bench = benchmark_cnn.BenchmarkCNN(benchmark_cnn.setup(params))

        version = cnn_util.tensorflow_version_tuple()
        log_fn("TensorFlow: %i.%i" % (version[0], version[1]))

        bench.print_info()
        bench.run()
def main(_):
    """Build the single-GPU graph, hand it to Parallax, and run the steps.

    parallax.parallel_run returns the session used by the training loop
    below, which logs progress every FLAGS.log_frequency steps.
    """
    # Build benchmark_cnn model
    flag_params = benchmark_cnn.make_params_from_flags()
    params, sess_config = benchmark_cnn.setup(flag_params)
    bench = benchmark_cnn.BenchmarkCNN(params)

    # Print information
    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow: %i.%i' % (version[0], version[1]))
    bench.print_info()

    # Build single-GPU benchmark_cnn model
    single_gpu_graph = tf.Graph()
    with single_gpu_graph.as_default():
        bench.build_model()

    config = parallax_config.build_config()
    config.sess_config = sess_config

    parallel_handles = parallax.parallel_run(
        single_gpu_graph,
        FLAGS.resource_info_file,
        sync=FLAGS.sync,
        parallax_config=config)
    sess, num_workers, worker_id, num_replicas_per_worker = parallel_handles

    fetches = dict(
        global_step=bench.global_step,
        cost=bench.cost,
        train_op=bench.train_op)

    window_start = time.time()
    # Count steps from 1 so the logging check reads naturally.
    for steps_done in range(1, FLAGS.max_steps + 1):
        results = sess.run(fetches)
        if steps_done % FLAGS.log_frequency != 0:
            continue
        window_end = time.time()
        throughput = (float(FLAGS.log_frequency)
                      / float(window_end - window_start))
        parallax.log.info(
            "global step: %d, loss: %f, throughput: %f steps/sec"
            % (results['global_step'][0] + 1, results['cost'][0],
               throughput))
        window_start = time.time()
def _run_benchmark(self, params):
    """Run a CNN benchmark and report its results.

    The stats returned by the run are forwarded to self.report_benchmark;
    loss and accuracy entries are included only when present.

    Args:
      params: Params tuple, typically created by benchmark_cnn.make_params or
        benchmark_cnn.make_params_from_flags.
    """
    logging.info('Running benchmark [%s]', self._get_name())
    bench = benchmark_cnn.BenchmarkCNN(benchmark_cnn.setup(params))
    bench.print_info()
    stats = bench.run()

    extras = {'examples_per_sec': stats.get('images_per_sec')}
    optional_stats = ('last_average_loss', 'top_1_accuracy',
                      'top_5_accuracy')
    for stat_name in optional_stats:
        if stat_name in stats:
            extras[stat_name] = stats[stat_name]

    self.report_benchmark(iters=stats.get('num_steps'),
                          wall_time=stats.get('average_wall_time'),
                          extras=extras)
def testSaveLoadModel(self):
    """Check that a model is checkpointed and correctly restored.

    Runs one batch and saves, then reloads and runs again, checking that the
    global step was restored. Finally rebuilds the model, loads the
    checkpoint, and checks that batch norm moving statistics are not at
    their defaults.

    Returns early when no model name is set or when model_save_load_test()
    is falsy.
    """
    _check_has_gpu()
    if not self.get_model_name() or not self.model_save_load_test():
        return

    params = benchmark_cnn.make_params(
        model=self.get_model_name(),
        num_batches=1,
        num_intra_threads=1,
        num_inter_threads=12,
        distortions=False,
        batch_size=2,
        variable_update='replicated',
        num_warmup_batches=0,
        num_gpus=2,
        train_dir=test_util.get_temp_dir('testSaveLoadModel_' +
                                         self.get_model_name()))

    def assert_checkpoint_step_at_least(train_dir, min_step):
        """Assert the latest checkpoint in train_dir is at step >= min_step."""
        ckpt = tf.train.get_checkpoint_state(train_dir)
        # The directory is escaped so characters that are special in regular
        # expressions cannot alter the pattern.
        pattern = (re.escape(os.path.join(train_dir, 'model.ckpt-'))
                   + r'(\d+)\.index')
        match = re.match(pattern, ckpt.model_checkpoint_path + '.index')
        self.assertTrue(match)
        self.assertGreaterEqual(int(match.group(1)), min_step)

    # Run one batch and save the model.
    # Note that this uses a non-test session.
    bench = benchmark_cnn.BenchmarkCNN(params)
    bench.run()
    self.assertEqual(bench.init_global_step, 0)
    # Clear the default graph.
    tf.reset_default_graph()
    # Test if checkpoint had been saved.
    assert_checkpoint_step_at_least(params.train_dir, params.num_batches)

    params = params._replace(num_batches=2)
    # Reload the model
    bench = benchmark_cnn.BenchmarkCNN(params)
    bench.run()
    # Check if global step has been restored.
    self.assertNotEqual(bench.init_global_step, 0)
    assert_checkpoint_step_at_least(params.train_dir, params.num_batches)

    # Check that the batch norm moving averages are restored from checkpoints
    with tf.Graph().as_default():
        bench = benchmark_cnn.BenchmarkCNN(params)
        bench._build_model()
        saver = tf.train.Saver(bench.variable_mgr.savable_variables())
        with tf.Session(
                config=benchmark_cnn.create_config_proto(params)) as sess:
            benchmark_cnn.load_checkpoint(saver, sess, params.train_dir)
            sess.run(bench.variable_mgr.get_post_init_ops())
            bn_moving_vars = [
                v for v in tf.global_variables()
                if '/batchnorm' in v.name and '/moving' in v.name
            ]
            self.assertGreater(len(bn_moving_vars), 0)
            for moving_var in bn_moving_vars:
                moving_var_value = sess.run(moving_var)
                # Check that the moving means and moving variances have been
                # restored by asserting they are not their default values of
                # 0 and 1, respectively.
                if '/moving_mean' in moving_var.name:
                    default_value = np.zeros(moving_var_value.shape,
                                             moving_var_value.dtype)
                else:
                    self.assertIn('/moving_variance', moving_var.name)
                    default_value = np.ones(moving_var_value.shape,
                                            moving_var_value.dtype)
                self.assertFalse(
                    np.array_equal(moving_var_value, default_value))
def _run_benchmark_cnn(self, params):
    """Run BenchmarkCNN with the given params and return captured log lines.

    benchmark_cnn.log_fn is temporarily replaced so output can be collected.
    The previous log_fn is restored afterwards, even if the run raises, so
    the patch does not leak into other tests.

    Args:
      params: Params passed to the BenchmarkCNN constructor.

    Returns:
      The list that the replacement log_fn appended to during the run.
    """
    logs = []
    previous_log_fn = benchmark_cnn.log_fn
    benchmark_cnn.log_fn = _print_and_add_to_list(logs)
    try:
        benchmark_cnn.BenchmarkCNN(params).run()
    finally:
        benchmark_cnn.log_fn = previous_log_fn
    return logs