def Run(benchmark_spec):
  """Run MXNet on the cluster for each model specified.

  Builds a `train_imagenet.py` command per model in --mx_models, prefixes it
  with device-specific environment (GPU env vars or OMP thread count for CPU),
  runs it remotely, and converts the output into samples.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  mx_benchmark_dir = 'incubator-mxnet/example/image-classification'
  results = []
  for model in FLAGS.mx_models:
    # Layer count may be model-dependent; batch size depends on both the
    # model and the layer count (see _GetBatchSize).
    num_layers = _GetNumLayers(model)
    batch_size = _GetBatchSize(model, num_layers)
    # Record per-model settings on the spec so _MakeSamplesFromOutput can
    # attach them as metadata.
    benchmark_spec.model = model
    benchmark_spec.batch_size = batch_size
    benchmark_spec.num_layers = num_layers
    benchmark_spec.image_shape = _GetImageShape(model)
    mx_benchmark_cmd = (
        'python train_imagenet.py '
        '--benchmark=1 '
        '--network={network} '
        '--batch-size={batch_size} '
        '--image-shape={image_shape} '
        '--num-epochs={num_epochs} '
        '--dtype={precision} '
        '--kv-store={key_value_store}').format(
            network=model,
            batch_size=batch_size,
            image_shape=benchmark_spec.image_shape,
            num_epochs=benchmark_spec.num_epochs,
            precision=benchmark_spec.precision,
            key_value_store=benchmark_spec.key_value_store)
    if benchmark_spec.device == GPU:
      # Use every GPU on the VM: "--gpus 0,1,...,N-1".
      num_gpus = cuda_toolkit.QueryNumberOfGpus(vm)
      mx_benchmark_cmd = '{env} {cmd} --gpus {gpus}'.format(
          env=mxnet.GetEnvironmentVars(vm),
          cmd=mx_benchmark_cmd,
          gpus=','.join(str(n) for n in range(num_gpus)))
    elif benchmark_spec.device == CPU:
      # Specifies the number of threads to use in CPU test.
      # https://mxnet.incubator.apache.org/faq/perf.html
      mx_benchmark_cmd = 'OMP_NUM_THREADS={omp_num_threads} {cmd}'.format(
          omp_num_threads=vm.NumCpusForBenchmark() // 2,
          cmd=mx_benchmark_cmd)
    if num_layers:
      mx_benchmark_cmd = '%s --num-layers %s' % (mx_benchmark_cmd, num_layers)
    run_command = 'cd %s && %s' % (mx_benchmark_dir, mx_benchmark_cmd)
    stdout, stderr = vm.RobustRemoteCommand(run_command, should_log=True)
    # MXNet may emit its progress on stderr; fall back to it when stdout
    # is empty.
    results.append(_MakeSamplesFromOutput(benchmark_spec, stdout or stderr))
  return results
def _RunModelOnVm(vm, model, benchmark_spec, args='', job_name=''):
  """Runs a TensorFlow benchmark on a single VM.

  Args:
    vm: VM to run on
    model: string, the name of model to run
    benchmark_spec: BenchmarkSpec object
    args: string, distributed arguments
    job_name: string, distributed job name

  Returns:
    a Sample containing the TensorFlow throughput or the process
    identification number from TensorFlow parameter server.
  """
  tf_cnn_benchmark_dir = 'benchmarks/scripts/tf_cnn_benchmarks'
  batch_size = _GetBatchSize(model)
  tf_cnn_benchmark_cmd = (
      'python tf_cnn_benchmarks.py '
      '--local_parameter_device={local_parameter_device} '
      '--batch_size={batch_size} '
      '--model={model} '
      '--data_name={data_name} '
      '--variable_update={variable_update} '
      '--distortions={distortions} '
      '--device={device} '
      '--data_format={data_format} '
      '--forward_only={forward_only} '
      '--flush_stdout=true'.format(
          local_parameter_device=benchmark_spec.local_parameter_device,
          batch_size=batch_size,
          model=model,
          data_name=benchmark_spec.data_name,
          variable_update=benchmark_spec.variable_update,
          distortions=benchmark_spec.distortions,
          device=benchmark_spec.device,
          data_format=benchmark_spec.data_format,
          forward_only=benchmark_spec.forward_only))
  if benchmark_spec.device == GPU:
    # GPU runs get the CUDA environment prefix and use all available GPUs.
    num_gpus = cuda_toolkit.QueryNumberOfGpus(vm)
    tf_cnn_benchmark_cmd = '{env} {cmd} --num_gpus={gpus}'.format(
        env=tensorflow.GetEnvironmentVars(vm),
        cmd=tf_cnn_benchmark_cmd,
        gpus=num_gpus)
  else:
    num_gpus = 0
  if args:
    # Distributed mode: append the job name and worker/ps host arguments.
    tf_cnn_benchmark_cmd = '{cmd} --job_name={job} {args}'.format(
        cmd=tf_cnn_benchmark_cmd, job=job_name, args=args)
  run_command = 'cd {path} ; {cmd}'.format(path=tf_cnn_benchmark_dir,
                                           cmd=tf_cnn_benchmark_cmd)
  output, _ = vm.RobustRemoteCommand(run_command, should_log=True)
  if job_name == 'ps':
    # Parameter servers never finish on their own; return the PID so the
    # caller can terminate the process later.
    return _ExtractTfParameterServerPid(output)
  else:
    return _MakeSamplesFromOutput(benchmark_spec, output, model, batch_size,
                                  num_gpus)
def Prepare(benchmark_spec):
  """Install SHOC and push the machinefile.

  Installs and authenticates every VM in parallel, records the per-node GPU
  count on the spec, and writes an MPI machinefile granting each VM one slot
  per GPU.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.
  """
  vms = benchmark_spec.vms
  vm_util.RunThreaded(_InstallAndAuthenticateVm, vms)
  head_node = vms[0]
  benchmark_spec.num_gpus = cuda_toolkit.QueryNumberOfGpus(head_node)

  def _SlotsPerVm(_):
    # Every VM contributes as many MPI slots as the head node has GPUs.
    return benchmark_spec.num_gpus

  hpc_util.CreateMachineFile(vms, _SlotsPerVm, MACHINEFILE)
def _UpdateBenchmarkSpecWithFlags(benchmark_spec):
  """Update the benchmark_spec with supplied command line flags.

  Args:
    benchmark_spec: benchmark specification to update
  """
  vms = benchmark_spec.vms
  per_node = cuda_toolkit.QueryNumberOfGpus(vms[0])
  node_count = len(vms)
  # Derived cluster topology.
  benchmark_spec.gpus_per_node = per_node
  benchmark_spec.num_vms = node_count
  benchmark_spec.total_gpus = per_node * node_count
  # Flag-driven settings.
  benchmark_spec.model = FLAGS.horovod_model
  benchmark_spec.batch_size = FLAGS.horovod_batch_size
  benchmark_spec.deep_learning_examples_commit = (
      FLAGS.horovod_deep_learning_examples_commit)
def _UpdateBenchmarkSpecWithFlags(benchmark_spec):
  """Update the benchmark_spec with supplied command line flags.

  Derives the cluster topology (GPUs per node, CPUs per MPI rank, totals)
  and copies HPCG flag values onto the spec.

  Args:
    benchmark_spec: benchmark specification to update
  """
  # Prefer the explicit flag; fall back to probing the first VM.
  gpus_per_node = (FLAGS.hpcg_gpus_per_node or
                   cuda_toolkit.QueryNumberOfGpus(benchmark_spec.vms[0]))
  # Floor division instead of int(float-division): same result for the
  # positive integers involved, but avoids any float rounding artifacts.
  cpus_per_rank = benchmark_spec.vms[0].num_cpus // gpus_per_node
  num_vms = len(benchmark_spec.vms)
  total_gpus = gpus_per_node * num_vms
  benchmark_spec.gpus_per_node = gpus_per_node
  benchmark_spec.cpus_per_rank = cpus_per_rank
  benchmark_spec.num_vms = num_vms
  benchmark_spec.total_gpus = total_gpus
  benchmark_spec.hpcg_problem_size = FLAGS.hpcg_problem_size
  benchmark_spec.hpcg_runtime = FLAGS.hpcg_runtime
  benchmark_spec.run_as_root = FLAGS.mpirun_allow_run_as_root
def _UpdateBenchmarkSpecWithFlags(benchmark_spec):
  """Update the benchmark_spec with supplied command line flags.

  Args:
    benchmark_spec: benchmark specification to update
  """
  first_vm = benchmark_spec.vms[0]
  per_node = cuda_toolkit.QueryNumberOfGpus(first_vm)
  node_count = len(benchmark_spec.vms)
  # Cluster topology derived from the VMs.
  benchmark_spec.gpus_per_node = per_node
  benchmark_spec.num_vms = node_count
  benchmark_spec.total_gpus = per_node * node_count
  # Horovod run configuration taken from command-line flags.
  benchmark_spec.model = FLAGS.horovod_model
  benchmark_spec.batch_size = FLAGS.horovod_batch_size
  benchmark_spec.num_epochs = FLAGS.horovod_num_epochs
  benchmark_spec.precision = FLAGS.horovod_precision
  benchmark_spec.max_seq_len = int(FLAGS.horovod_max_seq_len)
  benchmark_spec.bert_finetune = FLAGS.horovod_bert_finetune
  # NOTE(review): 'horovod_timelime' looks like a typo of 'timeline', but it
  # must match the flag name as declared elsewhere — do not rename here.
  benchmark_spec.timeline = FLAGS.horovod_timelime
  benchmark_spec.nccl_net_plugin = FLAGS.nccl_net_plugin
  benchmark_spec.cuda_visible_devices = FLAGS.horovod_cuda_visible_devices
def Run(benchmark_spec):
  """Run MXNet on the cluster for each model specified.

  For every model named in --mx_models, assembles a `train_imagenet.py`
  command line, executes it on the first VM, and collects samples from
  whichever stream (stdout, else stderr) carried the output.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  worker = benchmark_spec.vms[0]
  benchmark_dir = 'incubator-mxnet/example/image-classification'
  samples = []
  for net in FLAGS.mx_models:
    layer_count = _GetNumLayers(net)
    per_batch = _GetBatchSize(net, layer_count)
    # Stash per-model settings on the spec for sample metadata.
    benchmark_spec.model = net
    benchmark_spec.batch_size = per_batch
    benchmark_spec.num_layers = layer_count
    cmd = (
        'python train_imagenet.py --benchmark 1 --network %s --batch-size %s '
        '--image-shape %s --num-epochs %s --kv-store device') % (
            net, per_batch, IMAGENET_SHAPE, benchmark_spec.num_epochs)
    if benchmark_spec.device == GPU:
      # Target every GPU on the VM: "--gpus 0,1,...,N-1".
      gpu_total = cuda_toolkit.QueryNumberOfGpus(worker)
      gpu_list = ','.join(str(idx) for idx in range(gpu_total))
      cmd = '%s %s --gpus %s' % (
          mxnet.GetEnvironmentVars(worker), cmd, gpu_list)
    if layer_count:
      cmd = '%s --num-layers %s' % (cmd, layer_count)
    stdout, stderr = worker.RobustRemoteCommand(
        'cd %s && %s' % (benchmark_dir, cmd), should_log=True)
    samples.append(_MakeSamplesFromOutput(benchmark_spec, stdout or stderr))
  return samples
def _GetTfCnnBenchmarkCommand(vm, model, batch_size, benchmark_spec, args='',
                              job_name=''):
  """Create the command used to run the tf_cnn_benchmarks script.

  The command is either formulated using flag values stored on the
  benchmark_spec, or is essentially provided outright through the
  benchmark_args flag.

  Args:
    vm: the VM to run on.
    model: name of the model to run.
    batch_size: batch size to use for training.
    benchmark_spec: the benchmark spec object.
    args: string, distributed arguments
    job_name: string, distributed job name

  Returns:
    A string that runs the tf_cnn_benchmarks.py script with the desired
    arguments.
  """
  num_gpus = (cuda_toolkit.QueryNumberOfGpus(vm)
              if cuda_toolkit.CheckNvidiaGpuExists(vm) else 0)
  if benchmark_spec.benchmark_args is not None:
    # User supplied the full argument string; pass it through verbatim.
    cmd = 'python tf_cnn_benchmarks.py ' + benchmark_spec.benchmark_args
    # If the user didn't specify num_gpus in the benchmark_args string,
    # use all the GPUs on the system.
    if '--num_gpus' not in benchmark_spec.benchmark_args and num_gpus:
      cmd = '{cmd} --num_gpus={num_gpus}'.format(cmd=cmd, num_gpus=num_gpus)
    return cmd
  # Side effect: device-related settings are written back onto the spec so
  # they appear in the run metadata.
  benchmark_spec.local_parameter_device = FLAGS.tf_local_parameter_device
  benchmark_spec.device = FLAGS.tf_device
  benchmark_spec.data_format = FLAGS.tf_data_format
  if num_gpus == 0:
    # No GPU present: force CPU execution and the CPU-friendly NHWC layout.
    benchmark_spec.local_parameter_device = CPU
    benchmark_spec.device = CPU
    benchmark_spec.data_format = NHWC
  cmd = ('python tf_cnn_benchmarks.py '
         '--local_parameter_device={local_parameter_device} '
         '--batch_size={batch_size} '
         '--model={model} '
         '--data_name={data_name} '
         '--variable_update={variable_update} '
         '--distortions={distortions} '
         '--device={device} '
         '--data_format={data_format} '
         '--forward_only={forward_only} '
         '--use_fp16={use_fp16}'.format(
             local_parameter_device=benchmark_spec.local_parameter_device,
             batch_size=batch_size,
             model=model,
             data_name=benchmark_spec.data_name,
             variable_update=benchmark_spec.variable_update,
             distortions=benchmark_spec.distortions,
             device=benchmark_spec.device,
             data_format=benchmark_spec.data_format,
             forward_only=benchmark_spec.forward_only,
             use_fp16=(benchmark_spec.precision == FP16)))
  if benchmark_spec.device == GPU:
    # Prefix the CUDA environment and use every detected GPU.
    cmd = '{env} {cmd} --num_gpus={gpus}'.format(
        env=tensorflow.GetEnvironmentVars(vm), cmd=cmd, gpus=num_gpus)
  if args:
    # Distributed mode: append job name plus worker/ps host arguments.
    cmd = '{cmd} --job_name={job} {args}'.format(cmd=cmd, job=job_name,
                                                 args=args)
  return cmd
def Run(benchmark_spec):
  """Run ResNet on the cluster.

  Builds either the TPU (`resnet_main.py`) or the GPU/CPU
  (`imagenet_main.py`) command line, then trains in increments of
  `steps_per_eval` steps, optionally evaluating after each increment, and
  converts each run's output into samples.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  if benchmark_spec.tpus:
    resnet_benchmark_script = 'resnet_main.py'
    resnet_benchmark_cmd = (
        '{env_cmd} && '
        'cd tpu/models && '
        'export PYTHONPATH=$(pwd) &&'
        'cd official/resnet && '
        'python {script} '
        '--use_tpu={use_tpu} '
        '--data_dir={data_dir} '
        '--model_dir={model_dir} '
        '--resnet_depth={depth} '
        '--train_batch_size={train_batch_size} '
        '--eval_batch_size={eval_batch_size} '
        '--iterations_per_loop={iterations} '
        '--data_format={data_format} '
        '--precision={precision} '
        '--skip_host_call={skip_host_call} '
        '--num_train_images={num_train_images} '
        '--num_eval_images={num_eval_images}'.format(
            env_cmd=benchmark_spec.env_cmd,
            script=resnet_benchmark_script,
            use_tpu=bool(benchmark_spec.tpus),
            data_dir=benchmark_spec.data_dir,
            model_dir=benchmark_spec.model_dir,
            depth=benchmark_spec.depth,
            train_batch_size=benchmark_spec.train_batch_size,
            eval_batch_size=benchmark_spec.eval_batch_size,
            iterations=benchmark_spec.iterations,
            data_format=benchmark_spec.data_format,
            precision=benchmark_spec.precision,
            skip_host_call=benchmark_spec.skip_host_call,
            num_train_images=benchmark_spec.num_train_images,
            num_eval_images=benchmark_spec.num_eval_images))
  else:
    # Non-TPU path: TF official models ImageNet trainer.
    resnet_benchmark_script = 'imagenet_main.py'
    resnet_benchmark_cmd = ('{env_cmd} && '
                            'cd models && '
                            'export PYTHONPATH=$(pwd) && '
                            'cd official/r1/resnet && '
                            'python {script} '
                            '--data_dir=/data/imagenet '
                            '--model_dir={model_dir} '
                            '--resnet_size={resnet_size} '
                            '--batch_size={batch_size} '
                            '--data_format={data_format} '.format(
                                env_cmd=benchmark_spec.env_cmd,
                                script=resnet_benchmark_script,
                                model_dir=benchmark_spec.model_dir,
                                resnet_size=benchmark_spec.depth,
                                batch_size=benchmark_spec.train_batch_size,
                                data_format=benchmark_spec.data_format))
    precision = '{precision}'.format(precision=benchmark_spec.precision)
    # NOTE(review): 'bfloat16' is mapped to --dtype=fp16 here — presumably
    # the closest reduced-precision dtype this script supports; confirm.
    if precision == 'bfloat16':
      resnet_benchmark_cmd = '{cmd} --dtype=fp16'.format(
          cmd=resnet_benchmark_cmd)
    else:
      resnet_benchmark_cmd = '{cmd} --dtype=fp32'.format(
          cmd=resnet_benchmark_cmd)
    if cuda_toolkit.CheckNvidiaGpuExists(vm):
      resnet_benchmark_cmd = '{env} {cmd} --num_gpus={num_gpus}'.format(
          env=tensorflow.GetEnvironmentVars(vm),
          cmd=resnet_benchmark_cmd,
          num_gpus=cuda_toolkit.QueryNumberOfGpus(vm))
  samples = []
  metadata = _CreateMetadataDict(benchmark_spec)
  # Wall-clock training time accumulated across all train increments.
  elapsed_seconds = 0
  steps_per_eval = benchmark_spec.steps_per_eval
  train_steps = benchmark_spec.train_steps
  # Train in increments of steps_per_eval, clamping the final increment to
  # train_steps exactly.
  for step in range(steps_per_eval, train_steps + steps_per_eval,
                    steps_per_eval):
    step = min(step, train_steps)
    resnet_benchmark_cmd_step = '{cmd} --train_steps={step}'.format(
        cmd=resnet_benchmark_cmd, step=step)
    if benchmark_spec.mode in ('train', 'train_and_eval'):
      if benchmark_spec.tpus:
        tpu = benchmark_spec.tpu_groups['train'].GetName()
        num_cores = '--num_cores={}'.format(
            benchmark_spec.tpu_groups['train'].GetNumShards())
        resnet_benchmark_train_cmd = (
            '{cmd} --tpu={tpu} --mode=train {num_cores}'.format(
                cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores))
      else:
        # NOTE(review): the non-TPU train command uses the base command (not
        # the --train_steps variant) and bounds training via
        # --max_train_steps instead — presumably intended; confirm.
        resnet_benchmark_train_cmd = (
            '{cmd} --max_train_steps={max_train_steps} '
            '--train_epochs={train_epochs} --noeval_only'.format(
                cmd=resnet_benchmark_cmd,
                train_epochs=benchmark_spec.epochs_per_eval,
                max_train_steps=step))
      start = time.time()
      stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_train_cmd,
                                              should_log=True)
      elapsed_seconds += (time.time() - start)
      samples.extend(
          mnist_benchmark.MakeSamplesFromTrainOutput(
              metadata, stdout + stderr, elapsed_seconds, step))
    if benchmark_spec.mode in ('train_and_eval', 'eval'):
      if benchmark_spec.tpus:
        tpu = benchmark_spec.tpu_groups['eval'].GetName()
        num_cores = '--num_cores={}'.format(
            benchmark_spec.tpu_groups['eval'].GetNumShards())
        resnet_benchmark_eval_cmd = (
            '{cmd} --tpu={tpu} --mode=eval {num_cores}'.format(
                cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores))
      else:
        # NOTE(review): non-TPU eval also uses the base command without
        # --train_steps — looks intentional (eval-only run); confirm.
        resnet_benchmark_eval_cmd = ('{cmd} --eval_only'.format(
            cmd=resnet_benchmark_cmd))
      stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_eval_cmd,
                                              should_log=True)
      samples.extend(
          MakeSamplesFromEvalOutput(metadata, stdout + stderr,
                                    elapsed_seconds,
                                    use_tpu=bool(benchmark_spec.tpus)))
  return samples
def testQueryNumberOfGpus(self):
  """QueryNumberOfGpus parses the GPU count from the remote command output."""
  fake_vm = mock.MagicMock()
  # The remote command yields (stdout, stderr); the count follows a header.
  fake_vm.RemoteCommand.return_value = ("count\n8", None)
  self.assertEqual(8, cuda_toolkit.QueryNumberOfGpus(fake_vm))