def main(_):
  params = benchmark_cnn.make_params_from_flags()
  benchmark_cnn.setup(params)
  bench = benchmark_cnn.BenchmarkCNN(params)
  tfversion = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))
  bench.print_info()
  bench.run()
def main(positional_arguments):
  # Command-line arguments like '--distortions False' are equivalent to
  # '--distortions=True False', where False is a positional argument. To prevent
  # this from silently running with distortions, we do not allow positional
  # arguments.
  assert len(positional_arguments) >= 1
  if len(positional_arguments) > 1:
    raise ValueError('Received unknown positional arguments: %s'
                     % positional_arguments[1:])

  params = benchmark_cnn.make_params_from_flags()
  params = benchmark_cnn.setup(params)
  bench = benchmark_cnn.BenchmarkCNN(params)
  tfversion = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))
  bench.print_info()
  with log_context(LOGGER_URL, LOGGER_USRENAME, LOGGER_PASSWORD, LOGGER_DB,
                   LOGGER_SERIES, machine=LOGGER_VM):
    bench.run()
def main(positional_arguments):
  # Command-line arguments like '--distortions False' are equivalent to
  # '--distortions=True False', where False is a positional argument. To prevent
  # this from silently running with distortions, we do not allow positional
  # arguments.
  assert len(positional_arguments) >= 1
  if len(positional_arguments) > 1:
    raise ValueError('Received unknown positional arguments: %s'
                     % positional_arguments[1:])

  params = benchmark_cnn.make_params_from_flags()

  # Print ENV Variables
  tf.logging.debug('=' * 20 + ' Environment Variables ' + '=' * 20)
  for k, v in os.environ.items():
    tf.logging.debug('{}: {}'.format(k, v))

  with mlperf.mlperf_logger(absl_flags.FLAGS.ml_perf_compliance_logging,
                            params.model):
    params = benchmark_cnn.setup(params)
    bench = benchmark_cnn.BenchmarkCNN(params)
    tfversion = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))
    bench.print_info()
    bench.run()
def main(positional_arguments):
  # Command-line arguments like '--distortions False' are equivalent to
  # '--distortions=True False', where False is a positional argument. To prevent
  # this from silently running with distortions, we do not allow positional
  # arguments.
  assert len(positional_arguments) >= 1
  if len(positional_arguments) > 1:
    raise ValueError('Received unknown positional arguments: %s'
                     % positional_arguments[1:])

  params = benchmark_cnn.make_params_from_flags()
  params = benchmark_cnn.setup(params)
  bench = benchmark_cnn.BenchmarkCNN(params)
  tfversion = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))
  bench.print_info()
  print('num_inter_threads: ' + str(params.num_inter_threads))
  print('num_intra_threads: ' + str(params.num_intra_threads))
  print('datasets_num_private_threads: ' +
        str(params.datasets_num_private_threads))
  print('datasets_use_prefetch: ' + str(params.datasets_use_prefetch))
  print('datasets_prefetch_buffer_size: ' +
        str(params.datasets_prefetch_buffer_size))
  bench.run()
def main(positional_arguments):
  # Command-line arguments like '--distortions False' are equivalent to
  # '--distortions=True False', where False is a positional argument. To prevent
  # this from silently running with distortions, we do not allow positional
  # arguments.
  assert len(positional_arguments) >= 1
  if len(positional_arguments) > 1:
    raise ValueError('Received unknown positional arguments: %s'
                     % positional_arguments[1:])

  params = benchmark_cnn.make_params_from_flags()
  params = benchmark_cnn.setup(params)

  import sys
  if params.enable_dmo:
    if not LoadFileSystem():
      sys.exit(-1)
    else:
      print("\n*******DMO enabled********\n")
      # sys.exit(0)

  bench = benchmark_cnn.BenchmarkCNN(params)
  tfversion = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))
  bench.print_info()
  bench.run()
def main(positional_arguments):
  # Command-line arguments like '--distortions False' are equivalent to
  # '--distortions=True False', where False is a positional argument. To prevent
  # this from silently running with distortions, we do not allow positional
  # arguments.
  assert len(positional_arguments) >= 1
  if len(positional_arguments) > 1:
    raise ValueError('Received unknown positional arguments: %s'
                     % positional_arguments[1:])

  params = benchmark_cnn.make_params_from_flags()
  handler = benchmark_handler.Handler(params)
  params = handler.params
  params = benchmark_cnn.setup(params)
  bench = benchmark_cnn.BenchmarkCNN(params,
                                     dataset=handler.dataset,
                                     model=handler.model)
  handler.set_bench(bench)
  if getattr(bench.input_preprocessor, 'set_aug_list', None):
    bench.input_preprocessor.set_aug_list(params.aug_list)
  bench.benchmark_one_step = handler.benchmark_one_step
  bench.print_eval_results = handler.print_eval_results
  bench.check_early_stop = handler.check_early_stop
  bench.accum_grads = handler.accum_grads
  bench.build_fetches_forward = handler.build_fetches_forward
  if params.memory_saving_method == 'recomputing':
    bench.memory_saving = ms.Memory_Saving(benchmark_cnn=bench)
  # tfversion = util.tensorflow_version_tuple()
  # logging.info('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))
  bench.print_info()
  bench.run()
def main(_):
  params = benchmark_cnn.make_params_from_flags()
  params = benchmark_cnn.setup(params)
  if params.model == 'test_model':
    run_with_test_model(params)
  else:
    run_with_real_model(params)
def main(positional_arguments):
  # Command-line arguments like '--distortions False' are equivalent to
  # '--distortions=True False', where False is a positional argument. To prevent
  # this from silently running with distortions, we do not allow positional
  # arguments.
  assert len(positional_arguments) >= 1
  if len(positional_arguments) > 1:
    raise ValueError('Received unknown positional arguments: %s'
                     % positional_arguments[1:])

  options = make_options_from_flags(FLAGS)

  params = benchmark_cnn.make_params_from_flags()
  params = params._replace(batch_size=options.batch_size)
  params = params._replace(model='MY_GTSRB')
  params = params._replace(num_epochs=options.num_epochs)
  params = params._replace(num_gpus=options.num_gpus)
  params = params._replace(data_format='NHWC')
  params = params._replace(train_dir=options.checkpoint_folder)
  params = params._replace(allow_growth=True)
  params = params._replace(variable_update='replicated')
  params = params._replace(local_parameter_device='gpu')
  params = params._replace(use_tf_layers=False)
  # params = params._replace(all_reduce_spec='nccl')
  # params = params._replace(bottom_file=options.bottom_file)
  # params = params._replace(affine_files=options.affine_files)
  # params = params._replace(affine_classes=options.affine_classes)
  params = params._replace(optimizer=options.optimizer)
  params = params._replace(weight_decay=options.weight_decay)
  # params = params._replace(print_training_accuracy=True)
  params = params._replace(backbone_model_path=options.backbone_model_path)
  # Summary and save & load checkpoints.
  # params = params._replace(summary_verbosity=1)
  # params = params._replace(save_summaries_steps=10)
  # params = params._replace(save_model_secs=3600)  # save every 1 hour
  params = params._replace(save_model_secs=60)  # save every minute
  params = benchmark_cnn.setup(params)

  # testtest(params)
  # exit(0)

  if 'test' in options.data_dir:
    dataset = GTSRBTestDataset(options)
  else:
    dataset = GTSRBDataset(options)
  model = Model_Builder(options.model_name, dataset.num_classes, options,
                        params)

  bench = benchmark_cnn.BenchmarkCNN(params, dataset=dataset, model=model)

  tfversion = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))

  bench.print_info()
  bench.run()
  tf.reset_default_graph()
def main(extra_flags):
  # extra_flags is a list of command line arguments, excluding those defined
  # in tf.flags.FLAGS. extra_flags[0] is always the program name. It is an
  # error to supply flags not defined with tf.flags.FLAGS, so we raise a
  # ValueError in that case.
  assert len(extra_flags) >= 1
  if len(extra_flags) > 1:
    raise ValueError('Received unknown flags: %s' % extra_flags[1:])

  params = benchmark_cnn.make_params_from_flags()
  benchmark_cnn.setup(params)
  bench = benchmark_cnn.BenchmarkCNN(params)
  tfversion = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))
  bench.print_info()
  bench.run()
def get_data(options, dataset=None, model_name='gtsrb', phase='train'):
  if dataset is None:
    if 'gtsrb' == model_name:
      import train_gtsrb
      if 'test' in options.data_dir:
        dataset = train_gtsrb.GTSRBTestDataset(options)
      else:
        dataset = train_gtsrb.GTSRBDataset(options)
    elif 'resnet101' in model_name:
      import train_megaface
      dataset = train_megaface.MegaFaceDataset(options)
    elif 'resnet50' == model_name:
      import train_imagenet
      dataset = train_imagenet.ImageNetDataset(options)
    elif 'cifar10' in model_name:
      import train_cifar10
      dataset = train_cifar10.CifarDataset(options)

  params = benchmark_cnn.make_params()
  params = params._replace(batch_size=options.batch_size)
  params = params._replace(model='MY_' + model_name)
  params = params._replace(num_epochs=options.num_epochs)
  params = params._replace(num_gpus=options.num_gpus)
  params = params._replace(data_format='NHWC')
  params = params._replace(allow_growth=True)
  params = params._replace(use_tf_layers=False)
  params = params._replace(forward_only=True)
  params = benchmark_cnn.setup(params)

  model = Model_Builder(model_name, dataset.num_classes, options, params)

  is_train = (phase == 'train')
  p_class = dataset.get_input_preprocessor()
  preprocessor = p_class(options.batch_size,
                         model.get_input_shapes(phase),
                         options.batch_size,
                         model.data_type,
                         is_train,
                         distortions=params.distortions,
                         resize_method='bilinear')
  ds = preprocessor.create_dataset(
      batch_size=options.batch_size,
      num_splits=1,
      batch_size_per_split=options.batch_size,
      dataset=dataset,
      subset=phase,
      train=is_train,
      # datasets_repeat_cached_sample=params.datasets_repeat_cached_sample)
      datasets_repeat_cached_sample=False)
  ds_iter = preprocessor.create_iterator(ds)
  input_list = ds_iter.get_next()
  return model, dataset, input_list
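# A minimal usage sketch for get_data() above, not part of the original code.
# It assumes `options` is any object exposing the fields read by get_data()
# and by the dataset classes (data_dir, batch_size, num_epochs, num_gpus, and
# whatever else the chosen dataset constructor needs); types.SimpleNamespace
# and the path below are hypothetical stand-ins used only for illustration.
import types

options = types.SimpleNamespace(
    data_dir='/tmp/gtsrb_train',  # hypothetical dataset location
    batch_size=32,
    num_epochs=1,
    num_gpus=1)

# Builds the forward-only model, the dataset object, and the preprocessed
# input tensors produced by the dataset iterator.
model, dataset, input_list = get_data(options, model_name='gtsrb',
                                      phase='train')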
def main(_):
  # Build the benchmark_cnn model.
  params = benchmark_cnn.make_params_from_flags()
  params, sess_config = benchmark_cnn.setup(params)
  bench = benchmark_cnn.BenchmarkCNN(params)

  # Print information.
  tfversion = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))
  bench.print_info()

  # Build the single-GPU benchmark_cnn graph.
  with tf.Graph().as_default() as single_gpu_graph:
    bench.build_model()

  def run(sess, num_iters, tensor_or_op_name_to_replica_names, num_workers,
          worker_id, num_replicas_per_worker):
    fetches = {
        'global_step':
            tensor_or_op_name_to_replica_names[bench.global_step.name][0],
        'cost':
            tensor_or_op_name_to_replica_names[bench.cost.name][0],
        'train_op':
            tensor_or_op_name_to_replica_names[bench.train_op.name][0],
    }
    if isinstance(bench.lr, tf.Tensor):
      fetches['lr'] = tensor_or_op_name_to_replica_names[bench.lr.name][0]

    start = time.time()
    for i in range(num_iters):
      results = sess.run(fetches)
      if i % FLAGS.log_frequency == 0:
        end = time.time()
        throughput = float(FLAGS.log_frequency) / float(end - start)
        parallax.log.info(
            'global step: %d, lr: %f, loss: %f, throughput: %f steps/sec' %
            (results['global_step'],
             results['lr'] if 'lr' in results else bench.lr,
             results['cost'], throughput))
        start = time.time()

  config = parallax_config.build_config()
  config.sess_config = sess_config

  parallax.parallel_run(single_gpu_graph,
                        run,
                        FLAGS.resource_info_file,
                        FLAGS.max_steps,
                        sync=FLAGS.sync,
                        parallax_config=config)
def main(positional_arguments):
  assert len(positional_arguments) >= 1
  if len(positional_arguments) > 1:
    raise ValueError('Received unknown positional arguments: %s'
                     % positional_arguments[1:])

  options = make_options_from_flags(FLAGS)

  params = benchmark_cnn.make_params_from_flags()
  params = params._replace(batch_size=options.batch_size)
  params = params._replace(model='MY_GTSRB')
  params = params._replace(num_epochs=options.num_epochs)
  params = params._replace(num_gpus=options.num_gpus)
  params = params._replace(data_format='NHWC')
  params = params._replace(train_dir=options.checkpoint_folder)
  params = params._replace(allow_growth=True)
  params = params._replace(variable_update='replicated')
  params = params._replace(local_parameter_device='gpu')
  params = params._replace(use_tf_layers=False)
  # params = params._replace(all_reduce_spec='nccl')
  # params = params._replace(bottom_file=options.bottom_file)
  # params = params._replace(affine_files=options.affine_files)
  # params = params._replace(affine_classes=options.affine_classes)
  params = params._replace(optimizer=options.optimizer)
  params = params._replace(weight_decay=options.weight_decay)
  params = params._replace(print_training_accuracy=True)
  params = params._replace(backbone_model_path=options.backbone_model_path)
  # Summary and save & load checkpoints.
  # params = params._replace(summary_verbosity=1)
  # params = params._replace(save_summaries_steps=10)
  params = params._replace(save_model_secs=3600)  # save every 1 hour
  # params = params._replace(save_model_secs=300)  # save every 5 min
  params = benchmark_cnn.setup(params)

  dataset = CifarDataset(options)
  model = Model_Builder(options.model_name, dataset.num_classes, options,
                        params)

  bench = benchmark_cnn.BenchmarkCNN(params, dataset=dataset, model=model)

  tfversion = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))

  bench.print_info()
  bench.run()
def main(positional_arguments):
  # Command-line arguments like '--distortions False' are equivalent to
  # '--distortions=True False', where False is a positional argument. To prevent
  # this from silently running with distortions, we do not allow positional
  # arguments.
  assert len(positional_arguments) >= 1
  if len(positional_arguments) > 1:
    raise ValueError('Received unknown positional arguments: %s'
                     % positional_arguments[1:])

  params = benchmark_cnn.make_params_from_flags()
  params = benchmark_cnn.setup(params)
  bench = benchmark_cnn.BenchmarkCNN(params)
  tfversion = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))
  run_benchmark(bench, absl_flags.FLAGS.iters_per_step)
def main(positional_arguments):
  # Command-line arguments like '--distortions False' are equivalent to
  # '--distortions=True False', where False is a positional argument. To prevent
  # this from silently running with distortions, we do not allow positional
  # arguments.
  assert len(positional_arguments) >= 1
  if len(positional_arguments) > 1:
    raise ValueError("Received unknown positional arguments: %s"
                     % positional_arguments[1:])

  params = benchmark_cnn.make_params_from_flags()
  with mlperf.mlperf_logger(absl_flags.FLAGS.ml_perf_compliance_logging,
                            params.model):
    params = benchmark_cnn.setup(params)
    bench = benchmark_cnn.BenchmarkCNN(params)
    tfversion = cnn_util.tensorflow_version_tuple()
    log_fn("TensorFlow: %i.%i" % (tfversion[0], tfversion[1]))
    bench.print_info()
    bench.run()
def main(_):
  # Build the benchmark_cnn model.
  params = benchmark_cnn.make_params_from_flags()
  params, sess_config = benchmark_cnn.setup(params)
  bench = benchmark_cnn.BenchmarkCNN(params)

  # Print information.
  tfversion = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))
  bench.print_info()

  # Build the single-GPU benchmark_cnn graph.
  single_gpu_graph = tf.Graph()
  with single_gpu_graph.as_default():
    bench.build_model()

  config = parallax_config.build_config()
  config.sess_config = sess_config

  sess, num_workers, worker_id, num_replicas_per_worker = \
      parallax.parallel_run(single_gpu_graph,
                            FLAGS.resource_info_file,
                            sync=FLAGS.sync,
                            parallax_config=config)

  fetches = {
      'global_step': bench.global_step,
      'cost': bench.cost,
      'train_op': bench.train_op,
  }

  start = time.time()
  for i in range(FLAGS.max_steps):
    results = sess.run(fetches)
    if (i + 1) % FLAGS.log_frequency == 0:
      end = time.time()
      throughput = float(FLAGS.log_frequency) / float(end - start)
      parallax.log.info(
          'global step: %d, loss: %f, throughput: %f steps/sec' %
          (results['global_step'][0] + 1, results['cost'][0], throughput))
      start = time.time()
def _run_benchmark(self, params):
  """Run a CNN benchmark and report its results.

  Args:
    params: Params tuple, typically created by benchmark_cnn.make_params or
      benchmark_cnn.make_params_from_flags.
  """
  logging.info('Running benchmark [%s]', self._get_name())
  params = benchmark_cnn.setup(params)
  bench = benchmark_cnn.BenchmarkCNN(params)
  bench.print_info()
  stats = bench.run()
  extras = {}
  extras['examples_per_sec'] = stats.get('images_per_sec')
  if 'last_average_loss' in stats:
    extras['last_average_loss'] = stats['last_average_loss']
  if 'top_1_accuracy' in stats:
    extras['top_1_accuracy'] = stats['top_1_accuracy']
  if 'top_5_accuracy' in stats:
    extras['top_5_accuracy'] = stats['top_5_accuracy']
  self.report_benchmark(iters=stats.get('num_steps'),
                        wall_time=stats.get('average_wall_time'),
                        extras=extras)
def setUp(self):
  super(VariableUpdateTest, self).setUp()
  _check_has_gpu()
  benchmark_cnn.setup(benchmark_cnn.make_params())
def setUp(self):
  super(TfCnnBenchmarksModelTest, self).setUp()
  benchmark_cnn.setup(benchmark_cnn.make_params())
def setUp(self):
  super(TfCnnBenchmarksTest, self).setUp()
  _check_has_gpu()
  benchmark_cnn.setup(benchmark_cnn.make_params())
def main(_):
  FLAGS.eval = True
  params = benchmark_cnn.make_params_from_flags()
  params, config = benchmark_cnn.setup(params)
  bench = benchmark_cnn.BenchmarkCNN(params)
  bench.evaluate()
def train(train_args):
  """Train network.

  train_args : dict
      Json dict with the user's configuration parameters.
      Can be loaded with json.loads() or with yaml.safe_load()
  """
  run_results = {
      "status": "ok",
      "user_args": train_args,
      "machine_config": {},
      "training": {},
      "evaluation": {}
  }

  # Remove possible existing model and log files
  for f in os.listdir(cfg.MODELS_DIR):
    file_path = os.path.join(cfg.MODELS_DIR, f)
    try:
      if os.path.isfile(file_path):
        os.unlink(file_path)
    except Exception as e:
      print(e)

  # Declare training arguments
  kwargs = {
      'model': yaml.safe_load(train_args.model).split(' ')[0],
      'num_gpus': yaml.safe_load(train_args.num_gpus),
      'num_epochs': yaml.safe_load(train_args.num_epochs),
      'batch_size': yaml.safe_load(train_args.batch_size_per_device),
      'optimizer': yaml.safe_load(train_args.optimizer),
      'local_parameter_device': 'cpu',
      'variable_update': 'parameter_server'
  }

  # Locate training data and check if the selected network fits it.
  # For real data check whether the right data was mounted to the right place
  # and if not download it (cifar10 only).
  if yaml.safe_load(train_args.dataset) != 'Synthetic data':
    data_name = yaml.safe_load(train_args.dataset)
    if data_name == 'cifar10':
      locate_cifar10()
    if data_name == 'imagenet':
      locate_imagenet()
    kwargs['data_name'] = data_name
    if data_name == 'imagenet_mini':
      locate_imagenet_mini()
      kwargs['data_name'] = 'imagenet'
    verify_selected_model(kwargs['model'], kwargs['data_name'])
    kwargs['data_dir'] = '{}/{}'.format(cfg.DATA_DIR, data_name)
  else:
    verify_selected_model(kwargs['model'], 'imagenet')

  # If no GPU is available, or the gpu option is set to 0, run in CPU mode
  if num_local_gpus == 0 or kwargs['num_gpus'] == 0:
    kwargs['device'] = 'cpu'
    kwargs['data_format'] = 'NHWC'  # cpu data format
    # Important: tensorflow uses this also to specify the number of CPUs
    kwargs['num_gpus'] = 1
  else:
    kwargs['device'] = 'gpu'
    kwargs['data_format'] = 'NCHW'

  # Add training info to run_results but not the directories
  run_results["training"].update(kwargs)
  if run_results["training"]["device"] == "cpu":
    del run_results["training"]["num_gpus"]  # avoid misleading info

  kwargs['train_dir'] = cfg.MODELS_DIR
  kwargs['benchmark_log_dir'] = cfg.MODELS_DIR

  # Setup and run the benchmark model
  params = benchmark.make_params(**kwargs)
  try:
    params = benchmark.setup(params)
    bench = benchmark.BenchmarkCNN(params)
  except ValueError as param_ex:
    raise BadRequest("ValueError in parameter setup: {}.\n"
                     "Params: {}".format(param_ex, params))

  tf_version = '.'.join(
      [str(x) for x in cnn_util.tensorflow_version_tuple()])
  run_results["training"]["tf_version"] = tf_version

  # Run benchmark and measure total execution time
  bench.print_info()
  start_time_global = datetime.datetime.now().strftime(time_fmt)
  try:
    bench.run()
  except ValueError as ve:
    raise BadRequest('ValueError in benchmark execution: {}'.format(ve))
  end_time_global = datetime.datetime.now().strftime(time_fmt)

  # Read training and metric log files and store training results
  training_file = '{}/training.log'.format(cfg.MODELS_DIR)
  os.rename('{}/benchmark_run.log'.format(cfg.MODELS_DIR), training_file)
  run_parameters, machine_config = parse_logfile_training(training_file)
  run_results['training'].update(run_parameters)
  run_results["machine_config"] = machine_config

  metric_file = '{}/metric.log'.format(cfg.MODELS_DIR)
  run_results['training']['result'] = {}
  run_results['training']['result']['global_start_time'] = start_time_global
  run_results['training']['result']['global_end_time'] = end_time_global
  start, end, avg_examples = parse_metric_file(metric_file)
  run_results["training"]["result"]["average_examples_per_sec"] = avg_examples
  run_results['training']['result']['execution_start_time'] = start
  run_results['training']['result']['execution_end_time'] = end

  ## Evaluation ##
  if yaml.safe_load(train_args.evaluation):
    run_results["evaluation"] = {}

    kwargs_eval = {
        'model': kwargs['model'],
        'num_gpus': kwargs['num_gpus'],
        'device': kwargs['device'],
        'data_format': kwargs['data_format'],
        'benchmark_log_dir': kwargs['benchmark_log_dir'],
        'train_dir': kwargs['train_dir'],
        'eval': True
        # 'eval_dir': cfg.DATA_DIR,
    }

    run_results['evaluation']['device'] = kwargs_eval['device']
    if run_results['evaluation']['device'] == 'gpu':
      # only for GPU to avoid confusion
      run_results['evaluation']['num_gpus'] = kwargs_eval['num_gpus']

    # Locate data
    if yaml.safe_load(train_args.dataset) != 'Synthetic data':
      kwargs_eval['data_name'] = kwargs['data_name']
      kwargs_eval['data_dir'] = kwargs['data_dir']

    # Setup and run the evaluation
    params_eval = benchmark.make_params(**kwargs_eval)
    try:
      params_eval = benchmark.setup(params_eval)
      evaluation = benchmark.BenchmarkCNN(params_eval)
    except ValueError as param_ex:
      raise BadRequest("ValueError: {}".format(param_ex))

    evaluation.print_info()
    start_time_global = datetime.datetime.now().strftime(time_fmt)
    evaluation.run()
    end_time_global = datetime.datetime.now().strftime(time_fmt)

    # Read log files and get evaluation results
    os.rename('{}/benchmark_run.log'.format(cfg.MODELS_DIR),
              '{}/evaluation.log'.format(cfg.MODELS_DIR))
    evaluation_file = '{}/evaluation.log'.format(cfg.MODELS_DIR)
    run_parameters = parse_logfile_evaluation(evaluation_file)
    run_results['evaluation'].update(run_parameters)

    logfile = '{}/metric.log'.format(cfg.MODELS_DIR)
    run_results['evaluation']['result'] = {}
    run_results['evaluation']['result']['global_start_time'] = start_time_global
    run_results['evaluation']['result']['global_end_time'] = end_time_global

    with open(logfile, "r") as f:
      for line in f:
        l = json.loads(line)
        if l["name"] == "eval_average_examples_per_sec":
          run_results["evaluation"]['result']["average_examples_per_sec"] = l["value"]
        if l["name"] == "eval_top_1_accuracy":
          run_results["evaluation"]['result']["top_1_accuracy"] = l["value"]
        if l["name"] == "eval_top_5_accuracy":
          run_results["evaluation"]['result']["top_5_accuracy"] = l["value"]

  return run_results
def train(train_args, kwargs, run_results):
  """Function for training and evaluation used in the "pro" flavor.

  Example of run_results, fields filled by this function:
  {
    "machine_config": {},  # filled in deep_api.py
    "benchmark": {},       # filled in deep_api.py
    "training": {
      "allow_growth": true,
      "batch_size": 64,
      "batch_size_per_device": 64,
      "data_format": "NCHW",
      "device": "gpu",
      "local_parameter_device": "cpu",
      "model": "resnet50",
      "num_batches": 100,
      "num_epochs": 0,
      "num_gpus": 1,
      "optimizer": "sgd",
      "use_fp16": false,
      "variable_update": "parameter_server",
      "weight_decay": 0.00004,
      "result": {
        "average_examples_per_sec": 124.41983172966508,
        "execution_start_time": "2021-02-10T22:59:17.434987Z",
        "execution_end_time": "2021-02-10T23:00:08.358017Z",
        "execution_time_sec": 50.92302989959717
      }
    },
    "evaluation": {
      "batch_size": 64,
      "batch_size_per_device": 64,
      "data_format": "NCHW",
      "device": "gpu",
      "model": "resnet50",
      "num_batches": 100,
      "num_gpus": 1,
      "result": {
        "average_examples_per_sec": 401.17907755615994,
        "top_1_accuracy": 0.0015625,
        "top_5_accuracy": 0.00609375
      }
    },
    ...
  }
  """
  # Add more training arguments
  kwargs['batch_size'] = train_args['batch_size_per_device']
  kwargs['model'] = train_args['model'].split(' ')[0]
  kwargs['weight_decay'] = train_args['weight_decay']

  # Log additional arguments in run_results[]
  run_results['training']['models'].append(kwargs['model'])
  run_results['training']['num_epochs'] = kwargs['num_epochs']
  run_results['training']['weight_decay'] = kwargs['weight_decay']

  # Check if the selected network fits the dataset
  dataset_name = (kwargs['data_name'] if 'data_name' in kwargs.keys()
                  else 'synthetic_data')
  if dataset_name != 'synthetic_data':
    mutils.verify_selected_model(kwargs['model'], kwargs['data_name'])
  else:
    mutils.verify_selected_model(kwargs['model'], 'imagenet')

  # Create Train_Run_Dir to store training data
  Train_Run_Dir, _ = mutils.create_train_run_dir(kwargs)
  kwargs['train_dir'] = Train_Run_Dir
  kwargs['benchmark_log_dir'] = Train_Run_Dir

  # Log training directories, if they are not deleted later
  if not train_args['if_cleanup']:
    run_results['training']['train_dir'] = kwargs['train_dir']
    run_results['training']['benchmark_log_dir'] = kwargs['benchmark_log_dir']

  # Setup and run the benchmark model
  if cfg.DEBUG_MODEL:
    print("[DEBUG] benchmark kwargs: %s" % kwargs)
  params = benchmark.make_params(**kwargs)
  try:
    params = benchmark.setup(params)
    bench = benchmark.BenchmarkCNN(params)
  except ValueError as param_ex:
    raise BadRequest("ValueError in parameter setup: {}.\n"
                     "Params: {}".format(param_ex, params))

  # Run benchmark for training
  bench.print_info()
  try:
    bench.run()
  except ValueError as ve:
    raise BadRequest('ValueError in benchmark execution: {}'.format(ve))

  # Read training and metric log files and store training results
  training_file = os.path.join(Train_Run_Dir, 'training.log')
  os.rename(os.path.join(Train_Run_Dir, 'benchmark_run.log'), training_file)
  run_parameters = mutils.parse_logfile_training(training_file)
  run_results['training'].update(run_parameters)
  # sort the dictionary alphabetically
  run_results['training'] = OrderedDict(
      sorted(run_results['training'].items(), key=lambda t: t[0]))

  metric_file = os.path.join(Train_Run_Dir, 'metric.log')
  # it seems, in the case of synthetic_data we need a delay to close metric.log
  mutils.wait_final_read(metric_file, "average_examples_per_sec")
  start, end, avg_examples = mutils.parse_metric_file(metric_file)
  run_results['training']['result'] = {}
  run_results['training']['result']['average_examples_per_sec'] = avg_examples
  run_results['training']['result']['execution_start_time'] = start
  run_results['training']['result']['execution_end_time'] = end
  start_sec = mutils.timestr_to_stamp(start, cfg.TIME_FORMAT)
  end_sec = mutils.timestr_to_stamp(end, cfg.TIME_FORMAT)
  run_results['training']['result']['execution_time_sec'] = end_sec - start_sec

  ## Evaluation ##
  if train_args['evaluation']:
    run_results['evaluation'] = {}

    kwargs_eval = {
        'model': kwargs['model'],
        'num_gpus': kwargs['num_gpus'],
        'device': kwargs['device'],
        'data_format': kwargs['data_format'],
        'benchmark_log_dir': kwargs['benchmark_log_dir'],
        'train_dir': kwargs['train_dir'],
        'eval': True
        # 'eval_dir': Eval_Dir,
    }
    if kwargs_eval['device'] == 'cpu':
      kwargs_eval['batch_size'] = cfg.BATCH_SIZE_CPU

    run_results['evaluation']['device'] = kwargs_eval['device']
    if run_results['evaluation']['device'] == 'gpu':
      # only for GPU to avoid confusion
      run_results['evaluation']['num_gpus'] = kwargs_eval['num_gpus']

    # Locate data
    if dataset_name != 'synthetic_data':
      kwargs_eval['data_name'] = kwargs['data_name']
      kwargs_eval['data_dir'] = kwargs['data_dir']

    # Setup and run the evaluation
    params_eval = benchmark.make_params(**kwargs_eval)
    try:
      params_eval = benchmark.setup(params_eval)
      evaluation = benchmark.BenchmarkCNN(params_eval)
    except ValueError as param_ex:
      raise BadRequest("ValueError: {}".format(param_ex))

    evaluation.print_info()
    evaluation.run()

    # Read log files and get evaluation results
    evaluation_file = os.path.join(Train_Run_Dir, 'evaluation.log')
    os.rename(os.path.join(Train_Run_Dir, 'benchmark_run.log'),
              evaluation_file)
    run_parameters = mutils.parse_logfile_evaluation(evaluation_file)
    run_results['evaluation'].update(run_parameters)
    # sort the dictionary alphabetically
    run_results['evaluation'] = OrderedDict(
        sorted(run_results['evaluation'].items(), key=lambda t: t[0]))

    logfile = os.path.join(Train_Run_Dir, 'metric.log')
    run_results['evaluation']['result'] = {}
    # it seems, in the case of synthetic_data we need a delay to close evaluation.log
    mutils.wait_final_read(logfile, "eval_average_examples_per_sec")
    with open(logfile, "r") as f:
      for line in f:
        l = json.loads(line)
        if l["name"] == "eval_average_examples_per_sec":
          run_results['evaluation']['result']['average_examples_per_sec'] = l["value"]
        if l["name"] == "eval_top_1_accuracy":
          run_results['evaluation']['result']['top_1_accuracy'] = l["value"]
        if l["name"] == "eval_top_5_accuracy":
          run_results['evaluation']['result']['top_5_accuracy'] = l["value"]

  if train_args['if_cleanup']:
    shutil.rmtree(Train_Run_Dir)
def setUp(self):
  super(MlPerfComplianceTest, self).setUp()
  benchmark_cnn.setup(benchmark_cnn.make_params())
def main(positional_arguments):
  # Command-line arguments like '--distortions False' are equivalent to
  # '--distortions=True False', where False is a positional argument. To prevent
  # this from silently running with distortions, we do not allow positional
  # arguments.
  # For DGX servers use the hierarchical_copy=True argument.
  assert len(positional_arguments) >= 1
  if len(positional_arguments) > 1:
    raise ValueError('Received unknown positional arguments: %s'
                     % positional_arguments[1:])

  tests_models = [
      {'num_gpus': None, 'batch_size': 64,
       'variable_update': 'parameter_server', 'model': 'inception3'},
      {'num_gpus': None, 'batch_size': 64,
       'variable_update': 'parameter_server', 'model': 'resnet50'},
      {'num_gpus': None, 'batch_size': 32,
       'variable_update': 'parameter_server', 'model': 'resnet152'},  # batch=64 crashes
      {'num_gpus': None, 'batch_size': 64,
       'variable_update': 'replicated', 'model': 'vgg16'},
      {'num_gpus': None, 'batch_size': 512,
       'variable_update': 'replicated', 'model': 'alexnet'}
  ]
  test_gpus = [1, 2, 4, 8]

  stats = []
  for test in tests_models:
    for num_gpus in test_gpus:
      test['num_gpus'] = num_gpus

      params = benchmark_cnn.make_params_from_flags()
      params = benchmark_cnn.setup(params)

      # force --hierarchical_copy to False when using 1 GPU
      if num_gpus == 1:
        params = params._replace(hierarchical_copy=False)

      params = params._replace(num_gpus=test['num_gpus'],
                               batch_size=test['batch_size'],
                               model=test['model'],
                               variable_update=test['variable_update'])

      bench = benchmark_cnn.BenchmarkCNN(params)
      tfversion = cnn_util.tensorflow_version_tuple()
      log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))
      bench.print_info()
      results = bench.run()
      # result
      # {
      #   'average_wall_time': 0.6646941304206848,
      #   'images_per_sec': 385.1395525908701,
      #   'last_average_loss': 7.256145,
      #   'num_steps': 100,
      #   'num_workers': 1
      # }
      stats.append({'test': test.copy(), 'result': results})

  # summary
  print('summary:')
  print('==========')
  pprint.pprint(stats)
  print('==========')
  s = ''
  for i in range(len(test_gpus)):
    for j in range(len(tests_models)):
      s += str(stats[i + j * len(test_gpus)]['result']['images_per_sec'])
      s += ', '
    s += '\n'
  print(s)
  print('==========')
def train(kwargs, run_results):
  """Function to perform training in the case of 'synthetic'/'dataset' flavor.

  Updates run_results{}
  """
  cnn_score = 0.

  # sort the dictionary alphabetically
  run_results['training'] = OrderedDict(
      sorted(run_results['training'].items(), key=lambda t: t[0]))

  # calculate "GPU memory scale" for the batch_size
  num_local_gpus, gpu_model, gpu_memory = mutils.get_available_gpus()
  m4gb = 4000000000.
  if kwargs['device'] == 'gpu':
    quotient = gpu_memory // m4gb
    remainder = gpu_memory % m4gb
    rest = remainder / m4gb
    if rest > 0.4 and rest <= 0.75:
      memory_scale = quotient + 0.5
    elif rest > 0.75:
      memory_scale = quotient + 1
    else:
      memory_scale = quotient
  else:
    memory_scale = 1.
  print("[DEBUG] GPU Memory scale = {}".format(memory_scale))

  # Setup and run the benchmark model
  for model, batch_size in cfg.MODELS.items():
    print()
    print("[INFO] Testing {} model ...".format(model))
    kwargs['model'] = model
    # in the case of CPU, use batch_size = 8
    if kwargs['device'] == 'gpu':
      kwargs['batch_size'] = int(batch_size * memory_scale)
    else:
      kwargs['batch_size'] = cfg.BATCH_SIZE_CPU

    # Check if the selected network fits the dataset
    if 'data_name' in kwargs.keys():
      if kwargs['data_name'] != 'synthetic_data':
        mutils.verify_selected_model(kwargs['model'], kwargs['data_name'])
    else:
      mutils.verify_selected_model(kwargs['model'], 'imagenet')

    # Create Train_Run_Dir to store training data.
    # In the 'benchmark' case, we do not log directory names.
    Train_Run_Dir, _ = mutils.create_train_run_dir(kwargs)
    kwargs['train_dir'] = Train_Run_Dir
    kwargs['benchmark_log_dir'] = Train_Run_Dir

    if cfg.DEBUG_MODEL:
      print("[DEBUG] benchmark kwargs: %s" % kwargs)
    params = benchmark.make_params(**kwargs)
    try:
      params = benchmark.setup(params)
      bench = benchmark.BenchmarkCNN(params)
    except ValueError as param_ex:
      raise BadRequest("ValueError in parameter setup: {}. Params: {}".format(
          param_ex, params))

    # Run benchmark and measure total execution time
    bench.print_info()
    try:
      bench.run()
    except ValueError as ve:
      raise BadRequest('ValueError in benchmark execution: {}'.format(ve))

    # Read training and metric log files and store training results
    training_file = os.path.join(Train_Run_Dir, 'training.log')
    os.rename(os.path.join(Train_Run_Dir, 'benchmark_run.log'), training_file)
    run_parameters = mutils.parse_logfile_training(training_file)

    metric_file = os.path.join(Train_Run_Dir, 'metric.log')
    # it seems, in the case of synthetic_data we need a delay to close metric.log
    mutils.wait_final_read(metric_file, "average_examples_per_sec")

    run_results['training']['models'].append(kwargs['model'])
    run_results['training'][model] = {}
    run_results['training'][model].update(run_parameters)
    run_results['training'][model]['num_epochs'] = kwargs['num_epochs']

    start, end, avg_examples = mutils.parse_metric_file(metric_file)
    print(start, end, avg_examples)
    cnn_score += avg_examples
    start = mutils.timestr_to_stamp(start, cfg.TIME_FORMAT)
    end = mutils.timestr_to_stamp(end, cfg.TIME_FORMAT)
    run_results['training'][model]['average_examples_per_sec'] = avg_examples
    run_results['training'][model]['execution_time_sec'] = end - start

    # if_cleanup = true: delete training directory
    if cfg.IF_CLEANUP:
      shutil.rmtree(Train_Run_Dir)

  run_results['training']['score'] = cnn_score
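# Illustration only (not part of the original module): the batch-size scaling
# heuristic from train() above, extracted into a standalone function so it can
# be checked in isolation. The 11 GB value below is a hypothetical example;
# memory values are in bytes, matching m4gb above.
def gpu_memory_scale(gpu_memory_bytes, base=4000000000.):
  quotient = gpu_memory_bytes // base
  rest = (gpu_memory_bytes % base) / base
  if 0.4 < rest <= 0.75:
    return quotient + 0.5
  elif rest > 0.75:
    return quotient + 1
  return quotient

# A hypothetical 11 GB card: 11e9 // 4e9 = 2 and rest = 0.75, so the scale is
# 2.5; a model with base batch size 64 would then run with batch_size = 160.
print(gpu_memory_scale(11e9))  # 2.5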