def Run(benchmark_spec): """Run MNIST on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] mnist_benchmark_dir = 'tpu-demos/cloud_tpu/models/mnist' mnist_benchmark_cmd = ( 'python mnist.py --master={master} --train_file={train_file} ' '--use_tpu={use_tpu} ' '--train_steps={train_steps}'.format( master=benchmark_spec.master, train_file=benchmark_spec.train_file, use_tpu=benchmark_spec.use_tpu, train_steps=benchmark_spec.train_steps)) if benchmark_spec.model_dir: mnist_benchmark_cmd = '{cmd} --model_dir {model_dir}'.format( cmd=mnist_benchmark_cmd, model_dir=benchmark_spec.model_dir) if FLAGS.tf_device == 'gpu': mnist_benchmark_cmd = '%s %s' % (tensorflow.GetEnvironmentVars(vm), mnist_benchmark_cmd) run_command = 'cd %s && %s' % (mnist_benchmark_dir, mnist_benchmark_cmd) stdout, stderr = vm.RobustRemoteCommand(run_command, should_log=True) return _MakeSamplesFromOutput(benchmark_spec, stdout + stderr)
def Run(benchmark_spec): """Run MNIST on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] mnist_benchmark_script = 'tpu/cloud_tpu/models/mnist/mnist.py' mnist_benchmark_cmd = ('python {script} ' '--master={master} ' '--train_file={train_file} ' '--use_tpu={use_tpu} ' '--train_steps={train_steps} ' '--iterations={iterations} ' '--model_dir={model_dir}'.format( script=mnist_benchmark_script, master=benchmark_spec.master, train_file=benchmark_spec.train_file, use_tpu=benchmark_spec.use_tpu, train_steps=benchmark_spec.train_steps, iterations=benchmark_spec.iterations, model_dir=benchmark_spec.model_dir)) if cuda_toolkit.CheckNvidiaGpuExists(vm): mnist_benchmark_cmd = '{env} {cmd}'.format( env=tensorflow.GetEnvironmentVars(vm), cmd=mnist_benchmark_cmd) stdout, stderr = vm.RobustRemoteCommand(mnist_benchmark_cmd, should_log=True) return MakeSamplesFromOutput(_CreateMetadataDict(benchmark_spec), stdout + stderr)
def Run(benchmark_spec): """Run MLPerf on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] if benchmark_spec.tpus: # For MLPerf v0.5, the benchmake code of different hardware are different. if benchmark_spec.tpu_groups['train'].GetNumShards() > 8: code_path = 'cloud_v2.512/resnet-tpuv2-512/code/resnet/model' elif benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v2-8': code_path = 'cloud_v2.8/resnet-tpuv2-8/code/resnet/model' elif benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-8': code_path = 'cloud_v3.8/resnet-tpuv3-8/code/resnet/model' else: raise ValueError( 'MLPerf configurations do not support the hardware in PKB. PKB may ' 'need to be updated if this is a new TPU type.') cmd = 'bash run_helper.sh 2>&1 | tee output.txt' else: code_path = 'cloud_v100x8/code/resnet' cmd = ('sudo nvidia-docker build . -t foo && ' 'sudo nvidia-docker run -v $MLP_HOST_DATA_DIR:/data -v ' '$MLP_HOST_OUTPUT_DIR:/output -v /proc:/host_proc -t ' 'foo:latest run_helper_8xV100.sh 2>&1 | tee output.txt') mlperf_benchmark_cmd = ( 'export MLP_GCS_MODEL_DIR={model_dir} && ' 'export MLP_PATH_GCS_IMAGENET={data_dir} && ' 'export MLP_TPU_NAME={tpu_train} && ' 'export MLP_PATH_GCS_EUW_IMAGENET={data_dir} && ' 'export MLP_GCS_EUW_MODEL_DIR={model_dir} && ' 'export MLP_TPU_SIDECAR_NAME={tpu_eval} && ' 'export MLP_HOST_DATA_DIR=/data && ' 'export MLP_HOST_OUTPUT_DIR=`pwd`/output && ' 'export PYTHONPATH=$PYTHONPATH:$PWD/tpu/models && ' 'cd results/v0.5.0/google/{code_path} && ' 'sed -i "s/python /python3 /g" run_helper*.sh && ' 'mkdir -p $MLP_HOST_OUTPUT_DIR && ' '{cmd}'.format(model_dir=benchmark_spec.model_dir, data_dir=benchmark_spec.data_dir, tpu_train=(benchmark_spec.tpu_groups['train'].GetName() if benchmark_spec.tpus else ''), tpu_eval=(benchmark_spec.tpu_groups['eval'].GetName() if benchmark_spec.tpus else ''), code_path=code_path, cmd=cmd)) if cuda_toolkit.CheckNvidiaGpuExists(vm): mlperf_benchmark_cmd = '{env} {cmd}'.format( env=tensorflow.GetEnvironmentVars(vm), cmd=mlperf_benchmark_cmd) samples = [] metadata = _CreateMetadataDict(benchmark_spec) stdout, _ = vm.RobustRemoteCommand(mlperf_benchmark_cmd, should_log=True) samples.extend(MakeSamplesFromOutput(metadata, stdout)) return samples
def Run(benchmark_spec): """Run MNIST on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] mnist_benchmark_script = 'mnist_tpu.py' mnist_benchmark_cmd = ( 'cd models/official/mnist && ' 'python {script} ' '--data_dir={data_dir} ' '--iterations={iterations} ' '--model_dir={model_dir} ' '--batch_size={batch_size}'.format( script=mnist_benchmark_script, data_dir=benchmark_spec.data_dir, iterations=benchmark_spec.iterations, model_dir=benchmark_spec.model_dir, batch_size=benchmark_spec.batch_size)) if cuda_toolkit.CheckNvidiaGpuExists(vm): mnist_benchmark_cmd = '{env} {cmd}'.format( env=tensorflow.GetEnvironmentVars(vm), cmd=mnist_benchmark_cmd) samples = [] metadata = CreateMetadataDict(benchmark_spec) if benchmark_spec.train_steps: if benchmark_spec.tpus: tpu = benchmark_spec.tpu_groups['train'].GetName() num_shards = '--num_shards={}'.format( benchmark_spec.tpu_groups['train'].GetNumShards()) else: tpu = num_shards = '' mnist_benchmark_train_cmd = ( '{cmd} --tpu={tpu} --use_tpu={use_tpu} --train_steps={train_steps} ' '{num_shards} --noenable_predict'.format( cmd=mnist_benchmark_cmd, tpu=tpu, use_tpu=bool(benchmark_spec.tpus), train_steps=benchmark_spec.train_steps, num_shards=num_shards)) start = time.time() stdout, stderr = vm.RobustRemoteCommand(mnist_benchmark_train_cmd, should_log=True) elapsed_seconds = (time.time() - start) samples.extend(MakeSamplesFromTrainOutput( metadata, stdout + stderr, elapsed_seconds, benchmark_spec.train_steps)) if benchmark_spec.eval_steps: mnist_benchmark_eval_cmd = ( '{cmd} --tpu="" --use_tpu=False --eval_steps={eval_steps}'.format( cmd=mnist_benchmark_cmd, eval_steps=benchmark_spec.eval_steps)) stdout, stderr = vm.RobustRemoteCommand(mnist_benchmark_eval_cmd, should_log=True) samples.extend(MakeSamplesFromEvalOutput(metadata, stdout + stderr, elapsed_seconds)) return samples
def Run(benchmark_spec): """Run ResNet on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] resnet_benchmark_script = 'resnet_main.py' resnet_benchmark_cmd = ( 'cd tpu/models/official/resnet && ' 'python {script} ' '--use_tpu={use_tpu} ' '--tpu={tpu} ' '--data_dir={data_dir} ' '--model_dir={model_dir} ' '--resnet_depth={depth} ' '--mode={mode} ' '--train_steps={train_steps} ' '--train_batch_size={train_batch_size} ' '--eval_batch_size={eval_batch_size} ' '--iterations_per_loop={iterations} ' '--num_cores={num_cores} ' '--data_format={data_format} ' '--precision={precision} ' '--skip_host_call={skip_host_call} ' '--num_train_images={num_train_images} ' '--num_eval_images={num_eval_images}'.format( script=resnet_benchmark_script, use_tpu=benchmark_spec.use_tpu, tpu=benchmark_spec.tpu, data_dir=benchmark_spec.data_dir, model_dir=benchmark_spec.model_dir, depth=benchmark_spec.depth, mode=benchmark_spec.mode, train_steps=benchmark_spec.train_steps, train_batch_size=benchmark_spec.train_batch_size, eval_batch_size=benchmark_spec.eval_batch_size, iterations=benchmark_spec.iterations, num_cores=benchmark_spec.num_shards, data_format=benchmark_spec.data_format, precision=benchmark_spec.precision, skip_host_call=benchmark_spec.skip_host_call, num_train_images=benchmark_spec.num_train_images, num_eval_images=benchmark_spec.num_eval_images )) if FLAGS.tf_device == 'gpu': resnet_benchmark_cmd = '{env} {cmd}'.format( env=tensorflow.GetEnvironmentVars(vm), cmd=resnet_benchmark_cmd) stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_cmd, should_log=True) return _MakeSamplesFromOutput(_CreateMetadataDict(benchmark_spec), stdout + stderr)
def _RunModelOnVm(vm, model, benchmark_spec, args='', job_name=''):
  """Runs a TensorFlow benchmark on a single VM.

  Args:
    vm: VM to run on
    model: string, the name of model to run
    benchmark_spec: BenchmarkSpec object
    args: string, distributed arguments
    job_name: string, distributed job name

  Returns:
    A Sample containing the TensorFlow throughput, or the process
    identification number of the TensorFlow parameter server.
  """
  tf_cnn_benchmark_dir = 'benchmarks/scripts/tf_cnn_benchmarks'
  batch_size = _GetBatchSize(model)
  tf_cnn_benchmark_cmd = (
      'python tf_cnn_benchmarks.py '
      '--local_parameter_device={local_parameter_device} '
      '--batch_size={batch_size} '
      '--model={model} '
      '--data_name={data_name} '
      '--variable_update={variable_update} '
      '--distortions={distortions} '
      '--device={device} '
      '--data_format={data_format} '
      '--forward_only={forward_only} '
      '--flush_stdout=true'.format(
          local_parameter_device=benchmark_spec.local_parameter_device,
          batch_size=batch_size,
          model=model,
          data_name=benchmark_spec.data_name,
          variable_update=benchmark_spec.variable_update,
          distortions=benchmark_spec.distortions,
          device=benchmark_spec.device,
          data_format=benchmark_spec.data_format,
          forward_only=benchmark_spec.forward_only))
  if benchmark_spec.device == GPU:
    num_gpus = cuda_toolkit.QueryNumberOfGpus(vm)
    tf_cnn_benchmark_cmd = '{env} {cmd} --num_gpus={gpus}'.format(
        env=tensorflow.GetEnvironmentVars(vm),
        cmd=tf_cnn_benchmark_cmd,
        gpus=num_gpus)
  else:
    num_gpus = 0
  if args:
    tf_cnn_benchmark_cmd = '{cmd} --job_name={job} {args}'.format(
        cmd=tf_cnn_benchmark_cmd, job=job_name, args=args)
  run_command = 'cd {path} ; {cmd}'.format(path=tf_cnn_benchmark_dir,
                                           cmd=tf_cnn_benchmark_cmd)
  output, _ = vm.RobustRemoteCommand(run_command, should_log=True)
  # Parameter-server jobs run indefinitely; return their pid so the caller
  # can terminate them. Worker jobs return throughput samples.
  if job_name == 'ps':
    return _ExtractTfParameterServerPid(output)
  else:
    return _MakeSamplesFromOutput(benchmark_spec, output, model, batch_size,
                                  num_gpus)

def Run(benchmark_spec): """Run Inception V3 on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] inception3_benchmark_script = ( 'tpu/models/experimental/inception/inception_v3.py') inception3_benchmark_cmd = ( 'python {script} ' '--tpu={tpu} ' '--learning_rate={learning_rate} ' '--train_steps={train_steps} ' '--iterations={iterations} ' '--use_tpu={use_tpu} ' '--use_data={use_data} ' '--mode={mode} ' '--train_steps_per_eval={train_steps_per_eval} ' '--data_dir={data_dir} ' '--model_dir={model_dir} ' '--save_checkpoints_secs={save_checkpoints_secs} ' '--train_batch_size={train_batch_size} ' '--eval_batch_size={eval_batch_size} ' '--num_shards={num_shards}'.format( script=inception3_benchmark_script, tpu=benchmark_spec.tpu, learning_rate=benchmark_spec.learning_rate, train_steps=benchmark_spec.train_steps, iterations=benchmark_spec.iterations, use_tpu=benchmark_spec.use_tpu, use_data=benchmark_spec.use_data, mode=benchmark_spec.mode, train_steps_per_eval=benchmark_spec.train_steps_per_eval, data_dir=benchmark_spec.data_dir, model_dir=benchmark_spec.model_dir, save_checkpoints_secs=benchmark_spec.save_checkpoints_secs, train_batch_size=benchmark_spec.train_batch_size, eval_batch_size=benchmark_spec.eval_batch_size, num_shards=benchmark_spec.num_shards)) if FLAGS.tf_device == 'gpu': inception3_benchmark_cmd = '{env} {cmd}'.format( env=tensorflow.GetEnvironmentVars(vm), cmd=inception3_benchmark_cmd) stdout, stderr = vm.RobustRemoteCommand(inception3_benchmark_cmd, should_log=True) return mnist_benchmark.MakeSamplesFromOutput( _CreateMetadataDict(benchmark_spec), stdout + stderr)
def _RunOnVm(vm, benchmark_spec):
  """Runs a TensorFlow benchmark on a single VM.

  Args:
    vm: VM to run on
    benchmark_spec: benchmark_spec object

  Returns:
    A list of samples
  """
  tf_cnn_benchmark_dir = 'benchmarks/scripts/tf_cnn_benchmarks'
  results = []
  for model in FLAGS.tf_models:
    batch_size = _GetBatchSize(model)
    tf_cnn_benchmark_cmd = (
        'python tf_cnn_benchmarks.py --local_parameter_device=%s '
        '--batch_size=%s --model=%s --data_name=%s --variable_update=%s '
        '--use_nccl=%s --distortions=%s --device=%s --data_format=%s '
        '--forward_only=%s') % (
            benchmark_spec.local_parameter_device, batch_size, model,
            benchmark_spec.data_name, benchmark_spec.variable_update,
            benchmark_spec.use_nccl, benchmark_spec.distortions,
            benchmark_spec.device, benchmark_spec.data_format,
            benchmark_spec.forward_only)
    if benchmark_spec.device == GPU:
      num_gpus = cuda_toolkit_8.QueryNumberOfGpus(vm)
      tf_cnn_benchmark_cmd = '%s %s --num_gpus=%s' % (
          tensorflow.GetEnvironmentVars(vm), tf_cnn_benchmark_cmd, num_gpus)
    else:
      num_gpus = 0
    run_command = 'cd %s && %s' % (tf_cnn_benchmark_dir,
                                   tf_cnn_benchmark_cmd)
    output, _ = vm.RobustRemoteCommand(run_command, should_log=True)
    results.extend(
        _MakeSamplesFromOutput(benchmark_spec, output, model, batch_size,
                               num_gpus))
  return results

def Run(benchmark_spec): """Run MLPerf on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vms = benchmark_spec.vms master_vm = vms[0] benchmark = benchmark_spec.benchmark env_params = {} env_params['SLURM_JOB_ID'] = r'{uri}'.format(uri=FLAGS.run_uri) env_params['PULL'] = 0 env_params['DGXSYSTEM'] = DGXSYSTEM env_params['NEXP'] = 1 env_params['LOGDIR'] = posixpath.join(vm_util.VM_TMP_DIR, benchmark) script_path = ('$HOME/training_results_{version}/NVIDIA/benchmarks/{model}' r'/implementations/{framework}'.format( version=mlperf_benchmark.MLPERF_VERSION, model='maskrcnn' if mlperf_benchmark.MASK in benchmark else benchmark, framework='mxnet' if mlperf_benchmark.RESNET in benchmark else 'pytorch')) benchmark_env_params = { mlperf_benchmark.TRANSFORMER: { 'CONT': r'"mlperf-nvidia:translation"', 'DATADIR': r'/data/wmt/utf8' }, mlperf_benchmark.SSD: { 'CONT': r'"mlperf-nvidia:single_stage_detector"', 'DATADIR': '/data' }, mlperf_benchmark.GNMT: { 'CONT': r'"mlperf-nvidia:rnn_translator"', 'DATADIR': r'/data/gnmt' }, mlperf_benchmark.MASK: {}, mlperf_benchmark.RESNET: {}, mlperf_benchmark.BERT: {}, } env_params.update(benchmark_env_params.get(benchmark, {})) if mlperf_benchmark.RESNET in benchmark: env_params['SLURM_JOB_NUM_NODES'] = benchmark_spec.num_vms env = r'' if nvidia_driver.CheckNvidiaGpuExists(master_vm): env = tensorflow.GetEnvironmentVars(master_vm) cmd = (f'cd {script_path} && ' f'{env} {_DictToString(env_params)} ' f'{FLAGS.nccl_mpi} ' '--allow-run-as-root ' '-hostfile $HOME/HOSTFILE ' '--mca pml ^cm ' '--mca btl tcp,self ' '--mca btl_tcp_if_exclude docker0,lo ' '--bind-to none ' '-N 1 ' './run_with_docker1.sh') if (mlperf_benchmark.NVPROF in FLAGS.mlperf_profiler or FLAGS.mlperf_keep_nccl_log): cmd += (r' && cp /tmp/pkb/cmd* {logdir}'.format( logdir=posixpath.join(vm_util.VM_TMP_DIR, benchmark))) samples = [] metadata = _CreateMetadataDict(benchmark_spec) stdout, _ = master_vm.RobustRemoteCommand(cmd, should_log=True) if mlperf_benchmark.NONE in FLAGS.mlperf_profiler: samples.extend(MakeSamplesFromOutput(metadata, stdout, model=benchmark)) if (mlperf_benchmark.NVPROF in FLAGS.mlperf_profiler or FLAGS.mlperf_keep_nccl_log): master_vm.RemoteCommand( r'mkdir -p /data/aggregated/{model}'.format(model=benchmark)) master_vm.RemoteCommand( r'mpirun -hostfile $HOME/{hostfile} -N 1 scp -r {logdir} ' r'{master_ip}:/data/aggregated/'.format( hostfile=HOSTFILE, logdir=posixpath.join(vm_util.VM_TMP_DIR, benchmark), master_ip=master_vm.internal_ip)) return samples
def Run(benchmark_spec): """Run MLPerf on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] if benchmark_spec.tpus: # For MLPerf v0.6, the benchmake code of different hardware are different. if (benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-32' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-128' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-256' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-512' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-1024' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-2048'): run_path = ( '$HOME/training_results_v0.6/Google/benchmarks/{model}/tpu-{tpus}' .format(model=benchmark_spec.benchmark, tpus=benchmark_spec.tpu_groups['train']. GetAcceleratorType())) code_path = ( '$HOME/training_results_v0.6/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}' .format(model=benchmark_spec.benchmark, tpus=benchmark_spec.tpu_groups['train']. GetAcceleratorType())) if 'mask' in benchmark_spec.benchmark: model = 'mask_rcnn' elif 'gnmt' in benchmark_spec.benchmark: model = 'nmt' else: model = benchmark_spec.benchmark mlperf_benchmark_cmd = ( 'cd {code_path} && ' 'export PYTHONPATH=$(pwd):$(pwd)/{model} && ' 'cd {model} && ' '{run_path}/run_and_time1.sh'.format(code_path=code_path, model=model, run_path=run_path)) if 'ssd' in benchmark_spec.benchmark: mlperf_benchmark_cmd = ( 'export ' 'MLP_GCS_RESNET_CHECKPOINT=gs://download.tensorflow.org/models/mlperf/v0.5.0/resnet34_ssd_checkpoint' ' && {cmd}'.format(cmd=mlperf_benchmark_cmd)) else: raise ValueError( 'MLPerf configurations do not support the hardware in PKB. 
PKB may ' 'need to be updated if this is a new TPU type.') else: if 'resnet' in benchmark_spec.benchmark: mlperf_benchmark_cmd = ( 'cd ' 'training_results_v0.6/NVIDIA/benchmarks/resnet/implementations/mxnet' ' && sed \'s/SYSLOGGING=1/SYSLOGGING=0/g\' ./run.sub > ./run1.sub &&' ' chmod 755 ./run1.sub && sudo DATADIR=/data/imagenet ' 'LOGDIR=/tmp/resnet PULL=0 DGXSYSTEM=DGX1 NEXP=1 ./run1.sub ') if 'transformer' in benchmark_spec.benchmark: mlperf_benchmark_cmd = ( 'cd ' 'training_results_v0.6/NVIDIA/benchmarks/transformer/implementations/pytorch' ' && sed \'s/SYSLOGGING=1/SYSLOGGING=0/g\' ./run.sub > ./run1.sub &&' ' chmod 755 ./run1.sub && sudo DATADIR=/data/wmt/utf8 ' 'LOGDIR=/tmp/transformer PULL=0 DGXSYSTEM=DGX1 NEXP=1 ./run1.sub ' ) if 'minigo' in benchmark_spec.benchmark: mlperf_benchmark_cmd = ( 'cd ' '$HOME/training_results_v0.6/NVIDIA/benchmarks/minigo/implementations/tensorflow' ' && sed \'s/SYSLOGGING=1/SYSLOGGING=0/g\' ./run.sub > run1.sub && ' 'chmod 755 ./run1.sub && sudo LOGDIR=/tmp/minigo ' 'CONT=mlperf-nvidia:minigo DGXSYSTEM=DGX1 NEXP=1 ./run1.sub ') if 'mask' in benchmark_spec.benchmark: mlperf_benchmark_cmd = ( 'cd ' '$HOME/training_results_v0.6/NVIDIA/benchmarks/maskrcnn/implementations/pytorch' ' && sed "s/SYSLOGGING=1/SYSLOGGING=0/g" ./run.sub > ./run1.sub && ' 'chmod 755 ./run1.sub && sudo LOGDIR=/tmp/mask DATADIR=/data PULL=0 ' 'DGXSYSTEM=DGX1 NEXP=1 ./run1.sub ') if 'gnmt' in benchmark_spec.benchmark: mlperf_benchmark_cmd = ( 'cd ' '$HOME/training_results_v0.6/NVIDIA/benchmarks/gnmt/implementations/pytorch' ' && sed "s/SYSLOGGING=1/SYSLOGGING=0/g" ./run.sub > ./run1.sub && ' 'chmod 755 ./run1.sub && sudo LOGDIR=/tmp/gnmt DATADIR=/data/gnmt ' 'PULL=0 DGXSYSTEM=DGX1 NEXP=1 ./run1.sub ') if 'ssd' in benchmark_spec.benchmark: mlperf_benchmark_cmd = ( 'cd ' '$HOME/training_results_v0.6/NVIDIA/benchmarks/ssd/implementations/pytorch' ' && sed "s/SYSLOGGING=1/SYSLOGGING=0/g" ./run.sub > ./run1.sub && ' 'chmod 755 ./run1.sub && sudo LOGDIR=/tmp/ssd DATADIR=/data PULL=0 ' 'DGXSYSTEM=DGX1 NEXP=1 ./run1.sub ') if cuda_toolkit.CheckNvidiaGpuExists(vm): mlperf_benchmark_cmd = '{env} {cmd}'.format( env=tensorflow.GetEnvironmentVars(vm), cmd=mlperf_benchmark_cmd) samples = [] metadata = _CreateMetadataDict(benchmark_spec) stdout, _ = vm.RobustRemoteCommand(mlperf_benchmark_cmd, should_log=True) samples.extend( MakeSamplesFromOutput(metadata, stdout, use_tpu=bool(benchmark_spec.tpus), model=benchmark_spec.benchmark)) return samples
def _GetTfCnnBenchmarkCommand(vm, model, batch_size, benchmark_spec, args='',
                              job_name=''):
  """Create the command used to run the tf_cnn_benchmarks script.

  The command is either formulated using flag values stored on the
  benchmark_spec, or is essentially provided outright through the
  benchmark_args flag.

  Args:
    vm: the VM to run on.
    model: name of the model to run.
    batch_size: batch size to use for training.
    benchmark_spec: the benchmark spec object.
    args: string, distributed arguments
    job_name: string, distributed job name

  Returns:
    A string that runs the tf_cnn_benchmarks.py script with the desired
    arguments.
  """
  num_gpus = (nvidia_driver.QueryNumberOfGpus(vm)
              if nvidia_driver.CheckNvidiaGpuExists(vm) else 0)
  benchmark_spec.num_gpus = num_gpus

  if benchmark_spec.benchmark_args is not None:
    cmd = 'python tf_cnn_benchmarks.py ' + benchmark_spec.benchmark_args
    # If the user didn't specify num_gpus in the benchmark_args string,
    # use all the GPUs on the system.
    if '--num_gpus' not in benchmark_spec.benchmark_args and num_gpus:
      cmd = '{cmd} --num_gpus={num_gpus}'.format(cmd=cmd, num_gpus=num_gpus)
    return cmd

  benchmark_spec.local_parameter_device = FLAGS.tf_local_parameter_device
  benchmark_spec.device = FLAGS.tf_device
  benchmark_spec.data_format = FLAGS.tf_data_format
  if num_gpus == 0:
    benchmark_spec.local_parameter_device = CPU
    benchmark_spec.device = CPU
    benchmark_spec.data_format = NHWC

  cmd = ('{env_vars} python tf_cnn_benchmarks.py '
         '--local_parameter_device={local_parameter_device} '
         '--batch_size={batch_size} '
         '--model={model} '
         '{data} '
         '--data_name={data_name} '
         '--variable_update={variable_update} '
         '--distortions={distortions} '
         '--device={device} '
         '--data_format={data_format} '
         '--forward_only={forward_only} '
         '--use_fp16={use_fp16} '
         '{num_gpus} '
         '{job_name}'.format(
             env_vars=tensorflow.GetEnvironmentVars(vm),
             local_parameter_device=benchmark_spec.local_parameter_device,
             batch_size=batch_size,
             model=model,
             data=('--data_dir={}'.format(benchmark_spec.data_dir)
                   if benchmark_spec.data_dir else ''),
             data_name=benchmark_spec.data_name,
             variable_update=benchmark_spec.variable_update,
             distortions=benchmark_spec.distortions,
             device=benchmark_spec.device,
             data_format=benchmark_spec.data_format,
             forward_only=benchmark_spec.forward_only,
             use_fp16=(benchmark_spec.precision == FP16),
             num_gpus='--num_gpus={}'.format(num_gpus) if num_gpus else '',
             job_name='--job_name={0} {1}'.format(job_name, args)
             if args else ''))
  return cmd

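# A self-contained sketch of the benchmark_args passthrough logic above,
# using hypothetical flag values: user-supplied args are forwarded verbatim,
# and --num_gpus is appended only when the user did not set it. The function
# name _PassthroughSketch is illustrative, not part of the benchmark.
def _PassthroughSketch(benchmark_args, num_gpus):
  cmd = 'python tf_cnn_benchmarks.py ' + benchmark_args
  # Mirror the check in _GetTfCnnBenchmarkCommand: only add --num_gpus when
  # the user omitted it and GPUs are present.
  if '--num_gpus' not in benchmark_args and num_gpus:
    cmd = '{cmd} --num_gpus={num_gpus}'.format(cmd=cmd, num_gpus=num_gpus)
  return cmd

# _PassthroughSketch('--model=resnet50 --batch_size=64', 8)
# -> 'python tf_cnn_benchmarks.py --model=resnet50 --batch_size=64 --num_gpus=8'
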
def Run(benchmark_spec): """Run ResNet on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] resnet_benchmark_script = 'resnet_main.py' resnet_benchmark_cmd = ( '{env_cmd} && cd tpu/models/official/resnet && ' 'python {script} ' '--use_tpu={use_tpu} ' '--data_dir={data_dir} ' '--model_dir={model_dir} ' '--resnet_depth={depth} ' '--train_batch_size={train_batch_size} ' '--eval_batch_size={eval_batch_size} ' '--iterations_per_loop={iterations} ' '--data_format={data_format} ' '--precision={precision} ' '--skip_host_call={skip_host_call} ' '--num_train_images={num_train_images} ' '--num_eval_images={num_eval_images}'.format( env_cmd=benchmark_spec.env_cmd, script=resnet_benchmark_script, use_tpu=bool(benchmark_spec.tpus), data_dir=benchmark_spec.data_dir, model_dir=benchmark_spec.model_dir, depth=benchmark_spec.depth, train_batch_size=benchmark_spec.train_batch_size, eval_batch_size=benchmark_spec.eval_batch_size, iterations=benchmark_spec.iterations, data_format=benchmark_spec.data_format, precision=benchmark_spec.precision, skip_host_call=benchmark_spec.skip_host_call, num_train_images=benchmark_spec.num_train_images, num_eval_images=benchmark_spec.num_eval_images )) if FLAGS.tf_device == 'gpu': resnet_benchmark_cmd = '{env} {cmd}'.format( env=tensorflow.GetEnvironmentVars(vm), cmd=resnet_benchmark_cmd) samples = [] metadata = _CreateMetadataDict(benchmark_spec) elapsed_seconds = 0 steps_per_eval = benchmark_spec.steps_per_eval train_steps = benchmark_spec.train_steps for step in range(steps_per_eval, train_steps + steps_per_eval, steps_per_eval): step = min(step, train_steps) resnet_benchmark_cmd_step = '{cmd} --train_steps={step}'.format( cmd=resnet_benchmark_cmd, step=step) if benchmark_spec.mode in ('train', 'train_and_eval'): if benchmark_spec.tpus: tpu = benchmark_spec.tpu_groups['train'].GetName() num_cores = '--num_cores={}'.format( benchmark_spec.tpu_groups['train'].GetNumShards()) else: tpu = num_cores = '' resnet_benchmark_train_cmd = ( '{cmd} --tpu={tpu} --mode=train {num_cores}'.format( cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores)) start = time.time() stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_train_cmd, should_log=True) elapsed_seconds += (time.time() - start) samples.extend(mnist_benchmark.MakeSamplesFromTrainOutput( metadata, stdout + stderr, elapsed_seconds, step)) if benchmark_spec.mode in ('train_and_eval', 'eval'): if benchmark_spec.tpus: tpu = benchmark_spec.tpu_groups['eval'].GetName() num_cores = '--num_cores={}'.format( benchmark_spec.tpu_groups['eval'].GetNumShards()) else: tpu = num_cores = '' resnet_benchmark_eval_cmd = ( '{cmd} --tpu={tpu} --mode=eval {num_cores}'.format( cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores)) stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_eval_cmd, should_log=True) samples.extend(MakeSamplesFromEvalOutput( metadata, stdout + stderr, elapsed_seconds)) return samples
def Run(benchmark_spec): """Run Inception V3 on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] inception3_benchmark_script = ( 'tpu/models/experimental/inception/inception_v3.py') inception3_benchmark_cmd = ( '{env_cmd} && python {script} ' '--learning_rate={learning_rate} ' '--iterations={iterations} ' '--use_tpu={use_tpu} ' '--use_data={use_data} ' '--train_steps_per_eval={steps_per_eval} ' '--data_dir={data_dir} ' '--model_dir={model_dir} ' '--save_checkpoints_secs={save_checkpoints_secs} ' '--train_batch_size={train_batch_size} ' '--eval_batch_size={eval_batch_size} ' '--precision={precision}'.format( env_cmd=benchmark_spec.env_cmd, script=inception3_benchmark_script, learning_rate=benchmark_spec.learning_rate, iterations=benchmark_spec.iterations, use_tpu=bool(benchmark_spec.tpus), use_data=benchmark_spec.use_data, steps_per_eval=benchmark_spec.steps_per_eval, data_dir=benchmark_spec.data_dir, model_dir=benchmark_spec.model_dir, save_checkpoints_secs=benchmark_spec.save_checkpoints_secs, train_batch_size=benchmark_spec.train_batch_size, eval_batch_size=benchmark_spec.eval_batch_size, precision=benchmark_spec.precision)) if FLAGS.tf_device == 'gpu': inception3_benchmark_cmd = '{env} {cmd}'.format( env=tensorflow.GetEnvironmentVars(vm), cmd=inception3_benchmark_cmd) samples = [] metadata = _CreateMetadataDict(benchmark_spec) elapsed_seconds = 0 steps_per_eval = benchmark_spec.steps_per_eval train_steps = benchmark_spec.train_steps for step in range(steps_per_eval, train_steps + steps_per_eval, steps_per_eval): step = min(step, train_steps) inception3_benchmark_cmd_step = '{cmd} --train_steps={step}'.format( cmd=inception3_benchmark_cmd, step=step) if benchmark_spec.mode in ('train', 'train_and_eval'): if benchmark_spec.tpus: tpu = benchmark_spec.tpu_groups['train'].GetName() num_shards = '--num_shards={}'.format( benchmark_spec.tpu_groups['train'].GetNumShards()) else: tpu = num_shards = '' inception3_benchmark_train_cmd = ( '{cmd} --tpu={tpu} --mode=train {num_shards}'.format( cmd=inception3_benchmark_cmd_step, tpu=tpu, num_shards=num_shards)) start = time.time() stdout, stderr = vm.RobustRemoteCommand( inception3_benchmark_train_cmd, should_log=True) elapsed_seconds += (time.time() - start) samples.extend( mnist_benchmark.MakeSamplesFromTrainOutput( metadata, stdout + stderr, elapsed_seconds, step)) if benchmark_spec.mode in ('train_and_eval', 'eval'): if benchmark_spec.tpus: tpu = benchmark_spec.tpu_groups['eval'].GetName() num_shards = '--num_shards={}'.format( benchmark_spec.tpu_groups['eval'].GetNumShards()) else: tpu = num_shards = '' inception3_benchmark_eval_cmd = ( '{cmd} --tpu={tpu} --mode=eval {num_shards}'.format( cmd=inception3_benchmark_cmd_step, tpu=tpu, num_shards=num_shards)) stdout, stderr = vm.RobustRemoteCommand( inception3_benchmark_eval_cmd, should_log=True) samples.extend( resnet_benchmark.MakeSamplesFromEvalOutput( metadata, stdout + stderr, elapsed_seconds)) return samples
def Run(benchmark_spec): """Run MLPerf on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] if benchmark_spec.tpus: # For MLPerf v0.6, the benchmake code of different hardware are different. if (benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-32' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-128' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-256' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-512' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-1024' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-2048'): run_path = ( '$HOME/training_results_v0.6/Google/benchmarks/{model}/tpu-{tpus}' .format( model=benchmark_spec.benchmark, tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType())) code_path = ( '$HOME/training_results_v0.6/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}' .format( model=benchmark_spec.benchmark, tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType())) if 'mask' in benchmark_spec.benchmark: model = 'mask_rcnn' elif 'gnmt' in benchmark_spec.benchmark: model = 'nmt' else: model = benchmark_spec.benchmark mlperf_benchmark_cmd = ('cd {code_path} && ' 'export PYTHONPATH=$(pwd):$(pwd)/{model} && ' 'cd {model} && ' '{run_path}/run_and_time.sh'.format( code_path=code_path, model=model, run_path=run_path)) if 'ssd' in benchmark_spec.benchmark: mlperf_benchmark_cmd = ( 'export ' 'MLP_GCS_RESNET_CHECKPOINT={checkpoint}' ' && {cmd}'.format( checkpoint=FLAGS.mlperf_gcs_resnet_checkpoint, cmd=mlperf_benchmark_cmd)) else: raise ValueError( 'MLPerf configurations do not support the hardware in PKB. 
PKB may ' 'need to be updated if this is a new TPU type.') else: benchmark_path = '$HOME/training_results_v0.6/NVIDIA/benchmarks' common_env = 'DGXSYSTEM=DGX1 NEXP=1' if 'resnet' in benchmark_spec.benchmark: run_path = posixpath.join(benchmark_path, 'resnet/implementations/mxnet') env = 'DATADIR=/data/imagenet LOGDIR=/tmp/resnet PULL=0' elif 'transformer' in benchmark_spec.benchmark: run_path = posixpath.join(benchmark_path, 'transformer/implementations/pytorch') env = 'DATADIR=/data/wmt/utf8 LOGDIR=/tmp/transformer PULL=0' elif 'minigo' in benchmark_spec.benchmark: run_path = posixpath.join(benchmark_path, 'minigo/implementations/tensorflow') env = 'LOGDIR=/tmp/minigo CONT=mlperf-nvidia:minigo' elif 'mask' in benchmark_spec.benchmark: run_path = posixpath.join(benchmark_path, 'maskrcnn/implementations/pytorch') env = 'LOGDIR=/tmp/mask DATADIR=/data PULL=0' elif 'gnmt' in benchmark_spec.benchmark: run_path = posixpath.join(benchmark_path, 'gnmt/implementations/pytorch') env = 'LOGDIR=/tmp/gnmt DATADIR=/data/gnmt PULL=0' elif 'ssd' in benchmark_spec.benchmark: run_path = posixpath.join(benchmark_path, 'ssd/implementations/pytorch') env = 'LOGDIR=/tmp/ssd DATADIR=/data PULL=0' run_script = posixpath.join(run_path, 'run.sub') vm_util.ReplaceText(vm, 'SYSLOGGING=1', 'SYSLOGGING=0', run_script) mlperf_benchmark_cmd = ( 'cd {run_path} && chmod 755 run.sub && sudo {common_env} {env} ' './run.sub'.format(run_path=run_path, common_env=common_env, env=env)) if nvidia_driver.CheckNvidiaGpuExists(vm): mlperf_benchmark_cmd = '{env} {cmd}'.format( env=tensorflow.GetEnvironmentVars(vm), cmd=mlperf_benchmark_cmd) samples = [] metadata = _CreateMetadataDict(benchmark_spec) stdout, _ = vm.RobustRemoteCommand(mlperf_benchmark_cmd, should_log=True) samples.extend( MakeSamplesFromOutput( metadata, stdout, use_tpu=bool(benchmark_spec.tpus), model=benchmark_spec.benchmark)) return samples
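# For reference, a self-contained sketch of the command the GPU branch above
# composes for the 'resnet' case; the values simply mirror the run_path /
# common_env / env mapping in the code, and _ResnetRunSubCmdSketch is an
# illustrative name, not part of the benchmark.
def _ResnetRunSubCmdSketch():
  run_path = ('$HOME/training_results_v0.6/NVIDIA/benchmarks/'
              'resnet/implementations/mxnet')
  common_env = 'DGXSYSTEM=DGX1 NEXP=1'
  env = 'DATADIR=/data/imagenet LOGDIR=/tmp/resnet PULL=0'
  return ('cd {run_path} && chmod 755 run.sub && sudo {common_env} {env} '
          './run.sub'.format(run_path=run_path, common_env=common_env,
                             env=env))
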
def Run(benchmark_spec): """Run ResNet on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] if benchmark_spec.tpus: resnet_benchmark_script = 'resnet_main.py' resnet_benchmark_cmd = ( '{env_cmd} && ' 'cd tpu/models && ' 'export PYTHONPATH=$(pwd) &&' 'cd official/resnet && ' 'python {script} ' '--use_tpu={use_tpu} ' '--data_dir={data_dir} ' '--model_dir={model_dir} ' '--resnet_depth={depth} ' '--train_batch_size={train_batch_size} ' '--eval_batch_size={eval_batch_size} ' '--iterations_per_loop={iterations} ' '--data_format={data_format} ' '--precision={precision} ' '--skip_host_call={skip_host_call} ' '--num_train_images={num_train_images} ' '--num_eval_images={num_eval_images}'.format( env_cmd=benchmark_spec.env_cmd, script=resnet_benchmark_script, use_tpu=bool(benchmark_spec.tpus), data_dir=benchmark_spec.data_dir, model_dir=benchmark_spec.model_dir, depth=benchmark_spec.depth, train_batch_size=benchmark_spec.train_batch_size, eval_batch_size=benchmark_spec.eval_batch_size, iterations=benchmark_spec.iterations, data_format=benchmark_spec.data_format, precision=benchmark_spec.precision, skip_host_call=benchmark_spec.skip_host_call, num_train_images=benchmark_spec.num_train_images, num_eval_images=benchmark_spec.num_eval_images)) else: resnet_benchmark_script = 'imagenet_main.py' resnet_benchmark_cmd = ('{env_cmd} && ' 'cd models && ' 'export PYTHONPATH=$(pwd) && ' 'cd official/r1/resnet && ' 'python {script} ' '--data_dir=/data/imagenet ' '--model_dir={model_dir} ' '--resnet_size={resnet_size} ' '--batch_size={batch_size} ' '--data_format={data_format} '.format( env_cmd=benchmark_spec.env_cmd, script=resnet_benchmark_script, model_dir=benchmark_spec.model_dir, resnet_size=benchmark_spec.depth, batch_size=benchmark_spec.train_batch_size, data_format=benchmark_spec.data_format)) precision = '{precision}'.format(precision=benchmark_spec.precision) if precision == 'bfloat16': resnet_benchmark_cmd = '{cmd} --dtype=fp16'.format( cmd=resnet_benchmark_cmd) else: resnet_benchmark_cmd = '{cmd} --dtype=fp32'.format( cmd=resnet_benchmark_cmd) if nvidia_driver.CheckNvidiaGpuExists(vm): resnet_benchmark_cmd = '{env} {cmd} --num_gpus={num_gpus}'.format( env=tensorflow.GetEnvironmentVars(vm), cmd=resnet_benchmark_cmd, num_gpus=nvidia_driver.QueryNumberOfGpus(vm)) samples = [] metadata = _CreateMetadataDict(benchmark_spec) elapsed_seconds = 0 steps_per_eval = benchmark_spec.steps_per_eval train_steps = benchmark_spec.train_steps for step in range(steps_per_eval, train_steps + steps_per_eval, steps_per_eval): step = min(step, train_steps) resnet_benchmark_cmd_step = '{cmd} --train_steps={step}'.format( cmd=resnet_benchmark_cmd, step=step) if benchmark_spec.mode in ('train', 'train_and_eval'): if benchmark_spec.tpus: tpu = benchmark_spec.tpu_groups['train'].GetName() num_cores = '--num_cores={}'.format( benchmark_spec.tpu_groups['train'].GetNumShards()) resnet_benchmark_train_cmd = ( '{cmd} --tpu={tpu} --mode=train {num_cores}'.format( cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores)) else: resnet_benchmark_train_cmd = ( '{cmd} --max_train_steps={max_train_steps} ' '--train_epochs={train_epochs} --noeval_only'.format( cmd=resnet_benchmark_cmd, train_epochs=benchmark_spec.epochs_per_eval, max_train_steps=step)) start = time.time() stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_train_cmd, 
should_log=True) elapsed_seconds += (time.time() - start) samples.extend(mnist_benchmark.MakeSamplesFromTrainOutput( metadata, stdout + stderr, elapsed_seconds, step)) if benchmark_spec.mode in ('train_and_eval', 'eval'): if benchmark_spec.tpus: tpu = benchmark_spec.tpu_groups['eval'].GetName() num_cores = '--num_cores={}'.format( benchmark_spec.tpu_groups['eval'].GetNumShards()) resnet_benchmark_eval_cmd = ( '{cmd} --tpu={tpu} --mode=eval {num_cores}'.format( cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores)) else: resnet_benchmark_eval_cmd = ('{cmd} --eval_only'.format( cmd=resnet_benchmark_cmd)) stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_eval_cmd, should_log=True) samples.extend( MakeSamplesFromEvalOutput( metadata, stdout + stderr, elapsed_seconds, use_tpu=bool(benchmark_spec.tpus))) return samples