def Prepare(bm_spec: benchmark_spec.BenchmarkSpec) -> None:
  """Install and set up MLPerf Inference on the target vm.

  Args:
    bm_spec: The benchmark specification.

  Raises:
    errors.Config.InvalidValue: if both GPUs and TPUs appear in the config.
  """
  vm = bm_spec.vms[0]
  repository = f'inference_results_{MLPERF_INFERENCE_VERSION}'
  vm.RemoteCommand(
      f'git clone https://github.com/mlcommons/{repository}.git',
      should_log=True)

  makefile = f'{repository}/closed/NVIDIA/Makefile'
  vm_util.ReplaceText(vm, 'shell uname -p', 'shell uname -m', makefile)
  requirements = f'{repository}/closed/NVIDIA/docker/requirements.1'
  vm_util.ReplaceText(vm, 'opencv-python-headless==4.5.2.52',
                      'opencv-python-headless==4.5.3.56', requirements)

  if nvidia_driver.CheckNvidiaGpuExists(vm):
    vm.Install('cuda_toolkit')
    vm.Install('nvidia_driver')
    vm.Install('nvidia_docker')

  benchmark = FLAGS.mlperf_benchmark
  bm_spec.env_cmd = ('export MLPERF_SCRATCH_PATH=/scratch && '
                     f'cd {repository}/closed/NVIDIA')
  vm.RobustRemoteCommand(
      f'{bm_spec.env_cmd} && '
      'make build_docker NO_BUILD=1 && '
      'make docker_add_user && '
      'make launch_docker DOCKER_COMMAND="echo $MLPERF_SCRATCH_PATH" && '
      'make launch_docker DOCKER_COMMAND="ls -al $MLPERF_SCRATCH_PATH" && '
      'make launch_docker DOCKER_COMMAND="make clean" && '
      'make launch_docker DOCKER_COMMAND="make link_dirs" && '
      'make launch_docker DOCKER_COMMAND="ls -al build/"',
      should_log=True)
  vm.RobustRemoteCommand(
      f'{bm_spec.env_cmd} && '
      'make launch_docker DOCKER_COMMAND='
      f'"make download_data BENCHMARKS={benchmark}"',
      should_log=True)
  vm.RobustRemoteCommand(
      f'{bm_spec.env_cmd} && '
      'make launch_docker DOCKER_COMMAND='
      f'"make download_model BENCHMARKS={benchmark}" && '
      'make launch_docker DOCKER_COMMAND='
      f'"make preprocess_data BENCHMARKS={benchmark}" && '
      'make launch_docker DOCKER_COMMAND="make build"',
      should_log=True)

def _UseMpi(vm, num_processes):
  """Configure OpenFOAM to use MPI if running with more than 1 VM."""
  runscript = _GetPath(_RUNSCRIPT)
  vm_util.ReplaceText(
      vm, 'runParallel',
      'mpirun '
      '-hostfile {machinefile} '
      '-mca btl ^openib '
      '--map-by node '
      '-np {num_processes}'.format(
          machinefile=_GetPath(_MACHINEFILE), num_processes=num_processes),
      runscript, '|')
  vm_util.ReplaceText(vm, '^mpirun.*', '& -parallel', runscript)

def _SetMeshDimensions(vm, dimensions):
  """Set the dimensions to test scalability of the motorBike tutorial."""
  pattern = 'hex (0 1 2 3 4 5 6 7) ({}) simpleGrading (1 1 1)'
  original_string = pattern.format(_MOTORBIKE_DIMENSIONS['medium'])
  new_string = pattern.format(dimensions)
  vm_util.ReplaceText(vm, original_string, new_string,
                      _GetPath(_BLOCKMESHDICT))

def BuildDockerImages(vm):
  """Builds the Docker images from source Dockerfiles for a pre-built env."""
  vm.InstallPackages('git')
  vm.RemoteHostCommand('cd {0} && git clone -b {1} '
                       'https://github.com/tensorflow/serving'.format(
                           linux_packages.INSTALL_DIR,
                           FLAGS.tf_serving_branch))

  setup_script = posixpath.join(
      linux_packages.INSTALL_DIR,
      'serving/tensorflow_serving/tools/docker/Dockerfile.devel')
  # Change the TensorFlow git branch to tf_serving_branch.
  vm_util.ReplaceText(
      vm, 'ARG TF_SERVING_VERSION_GIT_BRANCH=master',
      'ARG TF_SERVING_VERSION_GIT_BRANCH={}'.format(FLAGS.tf_serving_branch),
      setup_script)

  # Build an optimized binary for TF Serving, and keep all the build
  # artifacts.
  vm.RemoteHostCommand(
      'sudo docker build --target binary_build '
      '-t benchmarks/tensorflow-serving-devel '
      '-f {0}/tensorflow_serving/tools/docker/Dockerfile.devel '
      '{0}/tensorflow_serving/tools/docker/'.format(
          TF_SERVING_BASE_DIRECTORY))

  # Create a serving image with the optimized model_server binary.
  vm.RemoteHostCommand(
      'sudo docker build '
      '-t benchmarks/tensorflow-serving '
      '--build-arg '
      'TF_SERVING_BUILD_IMAGE=benchmarks/tensorflow-serving-devel '
      '-f {0}/tensorflow_serving/tools/docker/Dockerfile '
      '{0}/tensorflow_serving/tools/docker/'.format(
          TF_SERVING_BASE_DIRECTORY))

def testReplaceTextUsesCorrectCommand(self):
  """Test of vm_util.ReplaceText()."""
  vm_util.ReplaceText(
      self.mock_vm, 'current', 'new', 'test_file', regex_char='|')
  self.mock_vm.RemoteCommand.assert_called_with(
      'sed -i -r "s|current|new|" test_file')

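# The unit test above pins down vm_util.ReplaceText's contract: it runs
# `sed -i -r "s|current|new|" <file>` on the remote machine, with the
# delimiter set by regex_char. A minimal local sketch of those semantics,
# assuming GNU sed extended regexes; `_replace_text_locally` is a
# hypothetical stand-in for illustration, not the real implementation.
import re


def _replace_text_locally(text, current, new):
  """Emulates `sed -r "s|current|new|"`: one substitution per line."""
  return '\n'.join(
      re.sub(current, new, line, count=1) for line in text.splitlines())


assert _replace_text_locally('method scotch;', 'method.*',
                             'method simple;') == 'method simple;'
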
def _UseMpi(vm, num_processes, mapping):
  """Configure OpenFOAM to use MPI if running with more than 1 VM.

  This function looks for the word "runParallel" in the run script and
  replaces it with an mpirun command.

  Args:
    vm: The worker VM to use MPI on.
    num_processes: An integer representing the total number of processes for
      the MPI job.
    mapping: A string for the mpirun --map-by flag.
  """
  run_script = _GetPath(_RUN_SCRIPT)
  vm_util.ReplaceText(
      vm, 'runParallel',
      'mpirun '
      f'-hostfile {_MACHINE_FILE} '
      '-mca btl ^openib '
      f'--map-by {mapping} '
      f'-np {num_processes}',
      run_script, '|')
  vm_util.ReplaceText(vm, '^mpirun.*', '& -parallel', run_script)

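# Before/after sketch of the two-step edit above, emulating the generated sed
# commands locally with re.sub (GNU sed's "&" backreference corresponds to
# Python's \g<0>). The machinefile path and solver name are made up for the
# example.
import re

_script = 'runParallel simpleFoam'
# Step 1: s|runParallel|mpirun ...|
_script = re.sub('runParallel',
                 'mpirun -hostfile /tmp/machinefile -mca btl ^openib '
                 '--map-by node -np 16', _script)
# Step 2: s|^mpirun.*|& -parallel|
_script = re.sub('^mpirun.*', r'\g<0> -parallel', _script)
assert _script.endswith('simpleFoam -parallel')
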
def PrepareBenchmark(benchmark_spec, vm=None):
  """Install and set up MLPerf on the target vm.

  Args:
    benchmark_spec: The benchmark specification.
    vm: The VM to work on.

  Raises:
    errors.Config.InvalidValue: if both GPUs and TPUs appear in the config.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = vm or benchmark_spec.vms[0]

  if bool(benchmark_spec.tpus) and nvidia_driver.CheckNvidiaGpuExists(vm):
    raise errors.Config.InvalidValue(
        'Invalid configuration. GPUs and TPUs cannot both be present in the '
        'config.')

  vm.RemoteCommand(
      f'if [ ! -d "$HOME/training_results_{MLPERF_VERSION}" ]; then '
      f'  git clone https://github.com/mlcommons/training_results_{MLPERF_VERSION}.git ; '
      'fi',
      should_log=True)
  vm.Install('pip3')

  if not HYPERTHREADS.value:
    if BERT in benchmark_spec.benchmark:
      vm_util.ReplaceText(
          vm, "'bind_pyt'", "'bind_pyt' '--no_hyperthreads'",
          f'training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/bert/'
          'implementations/pytorch/run_with_docker.sh')
    elif MASK in benchmark_spec.benchmark:
      vm_util.ReplaceText(
          vm, "'bind_launch'", "'bind_launch' '--no_hyperthreads'",
          f'training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/maskrcnn/'
          'implementations/pytorch/run_and_time.sh')
    elif RESNET in benchmark_spec.benchmark:
      vm_util.ReplaceText(
          vm, '--cpu=exclusive', '--cpu=exclusive,nosmt',
          f'training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/resnet/'
          'implementations/mxnet/run_and_time.sh')

def _SetDimensions(vm, dimensions):
  """Sets the mesh dimensions in blockMeshDict.

  Replaces lines of the format:
    hex (0 1 2 3 4 5 6 7) (20 8 8) simpleGrading (1 1 1)
  with:
    hex (0 1 2 3 4 5 6 7) (dimensions) simpleGrading (1 1 1)

  Args:
    vm: The vm to make the replacement on.
    dimensions: String, new mesh dimensions to run with.
  """
  logging.info('Using dimensions (%s) in blockMeshDict', dimensions)
  vm_util.ReplaceText(
      vm, r'(hex \(.*\) \().*(\) .* \(.*\))',
      r'\1{}\2'.format(dimensions),
      _GetPath(_BLOCKMESHDICT),
      regex_char='|')

def _SetDimensions(vm, dimensions):
  """Sets the mesh dimensions in blockMeshDict.

  Replaces lines of the format:
    hex (0 1 2 3 4 5 6 7) (20 8 8) simpleGrading (1 1 1)
  with:
    hex (0 1 2 3 4 5 6 7) (dimensions) simpleGrading (1 1 1)

  The actual contents of the second set of parentheses don't matter. This
  function simply replaces whatever is inside them.

  Args:
    vm: The VM to make the replacement on.
    dimensions: String, new mesh dimensions to run with.
  """
  logging.info('Using dimensions (%s) in blockMeshDict', dimensions)
  vm_util.ReplaceText(
      vm, r'(hex \(.*\) \().*(\) .* \(.*\))',
      r'\1{}\2'.format(dimensions),
      _GetPath(_BLOCKMESHDICT),
      regex_char='|')

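# A quick local check of the capture-group pattern used by _SetDimensions:
# group 1 keeps the "hex (...) (" prefix, group 2 keeps the
# ") simpleGrading (...)" suffix, and only the dimensions between them are
# rewritten. The sample line and new dimensions are illustrative.
import re

_line = 'hex (0 1 2 3 4 5 6 7) (20 8 8) simpleGrading (1 1 1)'
_new_line = re.sub(r'(hex \(.*\) \().*(\) .* \(.*\))', r'\g<1>80 32 32\g<2>',
                   _line)
assert _new_line == 'hex (0 1 2 3 4 5 6 7) (80 32 32) simpleGrading (1 1 1)'
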
def _UpdateScripts(benchmark_spec, node_rank):
  """Update the running scripts on the target vm.

  Args:
    benchmark_spec: The benchmark specification.
    node_rank: int, the rank of the node for multi-node distributed training.
  """
  vm = benchmark_spec.vms[node_rank]
  benchmark = benchmark_spec.benchmark

  # TODO(tohaowu) Change config and script using a patch file.
  # Replacement pairs for the sed command: each pair ('str_A', 'str_B') is a
  # request to replace anything matching str_A with str_B in a specific file.
  config_sed = []
  config_sed += [(r'DGXSYSTEM=.*', fr'DGXSYSTEM=\"{DGXSYSTEM}\"')]
  config_sed += [(r'DGXNNODES=.*', r'DGXNNODES={num_vms}'.format(
      num_vms=benchmark_spec.num_vms))]
  config_sed += [(r'DGXNGPU=.*', r'DGXNGPU={gpus_per_vm}'.format(
      gpus_per_vm=benchmark_spec.gpus_per_vm))]
  config_sed += [(r'DGXNSOCKET=.*', r'DGXNSOCKET={nsockets}'.format(
      nsockets=vm.CheckLsCpu().socket_count))]
  config_sed += [(r'DGXSOCKETCORES=.*', r'DGXSOCKETCORES={ncores}'.format(
      ncores=vm.CheckLsCpu().cores_per_socket))]

  run_and_time_sed = []
  run_and_time_sed += [(r'run_training.sh', r'run_training1.sh')]
  run_and_time_sed += [(r'DGXSYSTEM=.*', fr'DGXSYSTEM=\"{DGXSYSTEM}\"')]

  if FLAGS.mlperf_keep_nccl_log:
    run_and_time_sed += [(r'#\!\/bin\/bash',
                          r'#\!\/bin\/bash\n'
                          r'export NCCL_DEBUG=INFO\n'
                          r'export NCCL_DEBUG_SUBSYS=ALL\n'
                          r'export NCCL_DEBUG_FILE=\/results\/%h.%p.nccl')]

  nccl_exports = _GetNcclParams() if FLAGS.nccl_extra_params else r''
  run_and_time_sed += [(r'#!\/bin\/bash',
                        r'#!\/bin\/bash\n'
                        fr'{nccl_exports}')]

  run_sed = []
  run_sed += [(r'SYSLOGGING=1', r'SYSLOGGING=0')]
  run_sed += [(r'env [|] grep SLURM', r'export SLURM_NNODES={num_vms}'.format(
      num_vms=benchmark_spec.num_vms))]
  run_sed += [(r'data -v \$LOGDIR',
               r'data -v \$(pwd):\/workspace\/{model}1 -v \$LOGDIR'.format(
                   model=benchmark))]
  run_sed += [(r'scontrol show hostname',
               r'mpirun -hostfile \$HOME\/{hostfile} -N 1 hostname -I '
               r'\| awk \'{{print \$1}}\' '.format(hostfile=HOSTFILE))]
  run_sed += [(r'srun --mem=0 -N 1 -n 1 -w \$hostn',
               r'mpirun -N 1 -n 1 -H \$hostn')]
  run_sed += [(r'sleep 30', r'sleep 60')]
  run_sed += [(r'docker exec -it', r'docker exec -t')]
  run_sed += [(r'run_and_time.sh', r'run_and_time1.sh')]
  run_sed += [(r'nvidia-docker', r'sudo nvidia-docker')]
  run_sed += [(r'docker exec', r'sudo docker exec')]
  run_sed += [(r'docker container', r'sudo docker container')]

  if FLAGS.aws_efa or FLAGS.azure_infiniband:
    stdout, _ = vm.RemoteCommand('ls -d /dev/infiniband/*')
    devices = [device.replace('/', '\\/') for device in stdout.split()]
    device_args = ' '.join(f'--device={device}' for device in devices)
    run_sed += [(r'nvidia-docker run', fr'nvidia-docker run {device_args}')]

  if FLAGS.azure_infiniband:
    run_sed += [
        (r'_cont_mounts=(',
         r'_cont_mounts=(\"--volume=\/opt\/microsoft:\/opt\/microsoft\" ')
    ]

  nvprof_flags = r'-f -o \/results\/%h.%p.nvprof --profile-child-processes'

  script_path = (
      r'$HOME/training_results_{version}/NVIDIA/benchmarks/{model}'
      r'/implementations/{framework}'.format(
          version=mlperf_benchmark.MLPERF_VERSION,
          model='maskrcnn' if mlperf_benchmark.MASK in benchmark
          else benchmark,
          framework='mxnet' if mlperf_benchmark.RESNET in benchmark
          else 'pytorch'))

  config_files = [CONFIG]
  if mlperf_benchmark.TRANSFORMER in benchmark:
    config_sed, run_sed, run_and_time_sed = _GetChangesForTransformer(
        benchmark_spec, vm, script_path, nvprof_flags, config_sed, run_sed,
        run_and_time_sed)
  elif mlperf_benchmark.SSD in benchmark:
    config_sed, run_sed, run_and_time_sed = _GetChangesForSSD(
        benchmark_spec, nvprof_flags, config_sed, run_sed, run_and_time_sed)
  elif mlperf_benchmark.GNMT in benchmark:
    config_sed, run_sed, run_and_time_sed = _GetChangesForGNMT(
        benchmark_spec, nvprof_flags, config_sed, run_sed, run_and_time_sed)
  elif mlperf_benchmark.MASK in benchmark:
    config_sed, run_sed, run_and_time_sed = _GetChangesForMask(
        benchmark_spec, node_rank, script_path, nvprof_flags, config_sed,
        run_sed, run_and_time_sed)
    config_files = ['config_DGXA100_multi_4x8x4.sh']
  elif mlperf_benchmark.RESNET in benchmark:
    config_sed, run_sed, run_and_time_sed = _GetChangesForResnet(
        benchmark_spec, node_rank, nvprof_flags, config_sed, run_sed,
        run_and_time_sed)
    config_files = [
        'config_DGXA100_common.sh', 'config_DGXA100_multi_8x8x204.sh'
    ]
  elif mlperf_benchmark.BERT in benchmark:
    config_sed, run_sed, run_and_time_sed = _GetChangesForBert(
        benchmark_spec, node_rank, nvprof_flags, config_sed, run_sed,
        run_and_time_sed)
    config_files = ['config_DGXA100_common.sh', 'config_DGXA100_8x8x48x1.sh']

  vm.RemoteCommand(
      f'cd {script_path} && '
      f'sed "{mlperf_benchmark.SedPairsToString(config_sed)}" '
      f'{" ".join(config_files)} > {CONFIG} && '
      f'chmod 755 {CONFIG} ')
  vm.RemoteCommand(
      f'cd {script_path} && '
      f'sed "{mlperf_benchmark.SedPairsToString(run_and_time_sed)}" '
      f'run_and_time.sh | sed "2 i source {CONFIG}" > run_and_time1.sh && '
      'chmod 755 run_and_time1.sh ')
  vm.RemoteCommand(
      f'cd {script_path} && '
      f'sed "{mlperf_benchmark.SedPairsToString(run_sed)}" run_with_docker.sh '
      f'| sed "2 i source {CONFIG}" > run_with_docker1.sh && '
      'chmod 755 run_with_docker1.sh')

  docker_file = posixpath.join(script_path, 'Dockerfile')
  if FLAGS.nccl_net_plugin:
    vm_util.ReplaceText(
        vm, 'RUN apt-get update',
        r'RUN echo \"deb https:\/\/packages.cloud.google.com\/apt '
        r'google-fast-socket main\" | '
        r'tee \/etc\/apt\/sources.list.d\/google-fast-socket.list\n'
        r'RUN curl -s -L '
        r'https:\/\/packages.cloud.google.com\/apt\/doc\/apt-key.gpg | '
        r'apt-key add -\n'
        r'RUN rm -f \/opt\/hpcx\/nccl_rdma_sharp_plugin\/lib\/libnccl-net.so\n'
        r'RUN apt-get update',
        docker_file)
    vm_util.ReplaceText(
        vm, 'apt-get install -y --no-install-recommends',
        'apt-get install -y --no-install-recommends google-fast-socket',
        docker_file)

  if FLAGS.aws_efa:
    vm.RemoteCommand(f'git clone {AWS_EFA_NCCL_BASEAMI_PIPELINE_URL}')
    vm.RemoteCommand(f'cat {NVIDIA_EFA_DOCKERFILE} >> {docker_file}')
    vm_util.ReplaceText(vm, 'FROM nvcr.*', '', docker_file)
    vm_util.ReplaceText(vm, 'yum-utils.*', '', docker_file)
    vm_util.ReplaceText(vm, 'python3-distutils.*', 'python3-distutils',
                        docker_file)
    vm_util.ReplaceText(vm, 'cmake', '', docker_file)

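# The three vm.RemoteCommand calls above rely on
# mlperf_benchmark.SedPairsToString to flatten the (pattern, replacement)
# pairs into a single sed program. A plausible sketch of that helper,
# assuming it joins the pairs as "s/A/B/g" expressions separated by ";"; the
# real implementation may differ.
def _sed_pairs_to_string(pairs):
  """Joins (pattern, replacement) pairs into one sed expression string."""
  return ';'.join(
      f's/{pattern}/{replacement}/g' for pattern, replacement in pairs)


# e.g. _sed_pairs_to_string([('sleep 30', 'sleep 60')])
# -> 's/sleep 30/sleep 60/g'
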
def Prepare(bm_spec: benchmark_spec.BenchmarkSpec) -> None:
  """Installs and sets up MLPerf Inference on the target vm.

  Args:
    bm_spec: The benchmark specification.

  Raises:
    errors.Config.InvalidValue: if both GPUs and TPUs appear in the config.
  """
  vm = bm_spec.vms[0]
  repository = f'inference_results_{MLPERF_INFERENCE_VERSION}'
  vm.RemoteCommand(f'git clone https://github.com/mlcommons/{repository}.git')

  makefile = f'{repository}/closed/NVIDIA/Makefile'
  vm_util.ReplaceText(vm, 'shell uname -p', 'shell uname -m', makefile)
  requirements1 = f'{repository}/closed/NVIDIA/docker/requirements.1'
  vm_util.ReplaceText(vm, 'opencv-python-headless==4.5.2.52',
                      'opencv-python-headless==4.5.3.56', requirements1)
  requirements2 = f'{repository}/closed/NVIDIA/docker/requirements.2'

  benchmark = FLAGS.mlperf_benchmark
  if _SERVER_TARGET_QPS.value:
    config = f'{repository}/closed/NVIDIA/configs/{benchmark}/Server/__init__.py'
    vm_util.ReplaceText(vm, 'server_target_qps = .*',
                        f'server_target_qps = {_SERVER_TARGET_QPS.value}',
                        config)

  for requirements in (requirements1, requirements2):
    vm_util.ReplaceText(vm, 'git:', 'https:', requirements)

  if nvidia_driver.CheckNvidiaGpuExists(vm):
    vm.Install('cuda_toolkit')
    vm.Install('nvidia_driver')
    vm.Install('nvidia_docker')

  bm_spec.env_cmd = (f'export MLPERF_SCRATCH_PATH={_MLPERF_SCRATCH_PATH} && '
                     f'cd {repository}/closed/NVIDIA')
  docker.AddUser(vm)
  vm.RobustRemoteCommand(
      f'{bm_spec.env_cmd} && '
      'make build_docker NO_BUILD=1 && '
      'make docker_add_user && '
      'make launch_docker DOCKER_COMMAND="make clean" && '
      'make launch_docker DOCKER_COMMAND="make link_dirs"',
      should_log=True)

  if benchmark == mlperf_benchmark.DLRM:
    # Download data.
    data_dir = posixpath.join(_MLPERF_SCRATCH_PATH, 'data', _DLRM_DATA_MODULE)
    vm.DownloadPreprovisionedData(data_dir, _DLRM_DATA_MODULE, _DLRM_DATA)
    vm.RemoteCommand(f'cd {data_dir} && gzip -d {_DLRM_DATA}')

    # Download model.
    model_dir = posixpath.join(_MLPERF_SCRATCH_PATH, 'models', benchmark)
    vm.DownloadPreprovisionedData(model_dir, benchmark, _DLRM_MODEL)
    vm.RemoteCommand(f'cd {model_dir} && '
                     f'tar -zxvf {_DLRM_MODEL} && '
                     f'rm -f {_DLRM_MODEL}')
    vm.DownloadPreprovisionedData(model_dir, benchmark, _DLRM_ROW_FREQ)

    # Preprocess data.
    preprocessed_data_dir = posixpath.join(_MLPERF_SCRATCH_PATH,
                                           'preprocessed_data',
                                           _DLRM_DATA_MODULE)
    vm.DownloadPreprovisionedData(preprocessed_data_dir, _DLRM_DATA_MODULE,
                                  _DLRM_PREPROCESSED_DATA)
    vm.RemoteCommand(f'cd {preprocessed_data_dir} && '
                     f'tar -zxvf {_DLRM_PREPROCESSED_DATA} && '
                     f'rm -f {_DLRM_PREPROCESSED_DATA}')
  elif benchmark == mlperf_benchmark.BERT:
    # Download data.
    data_dir = posixpath.join(_MLPERF_SCRATCH_PATH, 'data', 'squad')
    vm.DownloadPreprovisionedData(data_dir, benchmark, 'dev-v1.1.json')

    # Download model.
    model_dir = posixpath.join(_MLPERF_SCRATCH_PATH, 'models', benchmark)
    vm.DownloadPreprovisionedData(model_dir, benchmark, 'bert_large_v1_1.onnx')
    vm.DownloadPreprovisionedData(model_dir, benchmark,
                                  'bert_large_v1_1_fake_quant.onnx')
    vm.DownloadPreprovisionedData(model_dir, benchmark, 'vocab.txt')

    # Preprocess data.
    preprocessed_data_dir = posixpath.join(_MLPERF_SCRATCH_PATH,
                                           'preprocessed_data',
                                           'squad_tokenized')
    vm.DownloadPreprovisionedData(preprocessed_data_dir, benchmark,
                                  'input_ids.npy')
    vm.DownloadPreprovisionedData(preprocessed_data_dir, benchmark,
                                  'input_mask.npy')
    vm.DownloadPreprovisionedData(preprocessed_data_dir, benchmark,
                                  'segment_ids.npy')
  else:
    vm.RobustRemoteCommand(
        f'{bm_spec.env_cmd} && '
        'make launch_docker DOCKER_COMMAND='
        f'"make download_data BENCHMARKS={benchmark}"',
        should_log=True)
    vm.RobustRemoteCommand(
        f'{bm_spec.env_cmd} && '
        'make launch_docker DOCKER_COMMAND='
        f'"make download_model BENCHMARKS={benchmark}"',
        should_log=True)
    vm.RobustRemoteCommand(
        f'{bm_spec.env_cmd} && '
        'make launch_docker DOCKER_COMMAND='
        f'"make preprocess_data BENCHMARKS={benchmark}"',
        should_log=True)

  vm.RobustRemoteCommand(
      f'{bm_spec.env_cmd} && '
      'make launch_docker DOCKER_COMMAND='
      '"make build" && '
      'make launch_docker DOCKER_COMMAND='
      '"make generate_engines RUN_ARGS=\''
      f'--benchmarks={FLAGS.mlperf_benchmark} '
      f'--scenarios={_SCENARIOS.value}\'"',
      should_log=True)

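# The generate_engines step above nests three quoting layers: Python string
# literals -> double quotes consumed by the outer shell -> single quotes that
# survive into DOCKER_COMMAND for RUN_ARGS. A local sketch of what the
# composed string looks like (the benchmark and scenario values here are made
# up for illustration):
_cmd = ('make launch_docker DOCKER_COMMAND='
        '"make generate_engines RUN_ARGS=\''
        '--benchmarks=bert --scenarios=Offline\'"')
# _cmd ->
#   make launch_docker DOCKER_COMMAND="make generate_engines \
#       RUN_ARGS='--benchmarks=bert --scenarios=Offline'"
print(_cmd)
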
def _SetNumProcesses(vm, num_processes):
  """Configure OpenFOAM to use the correct number of processes."""
  logging.info('Decomposing into %s subdomains', num_processes)
  vm_util.ReplaceText(vm, 'numberOfSubdomains.*',
                      'numberOfSubdomains %s;' % str(num_processes),
                      _GetPath(_DECOMPOSEDICT))

def Run(benchmark_spec):
  """Run MLPerf on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  if benchmark_spec.tpus:
    # For MLPerf v0.6, the benchmark code differs per hardware type.
    if benchmark_spec.tpu_groups['train'].GetAcceleratorType() in (
        'v3-32', 'v3-128', 'v3-256', 'v3-512', 'v3-1024', 'v3-2048'):
      run_path = (
          '$HOME/training_results_v0.6/Google/benchmarks/{model}/tpu-{tpus}'
          .format(
              model=benchmark_spec.benchmark,
              tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType()))
      code_path = (
          '$HOME/training_results_v0.6/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'
          .format(
              model=benchmark_spec.benchmark,
              tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType()))

      if 'mask' in benchmark_spec.benchmark:
        model = 'mask_rcnn'
      elif 'gnmt' in benchmark_spec.benchmark:
        model = 'nmt'
      else:
        model = benchmark_spec.benchmark

      mlperf_benchmark_cmd = ('cd {code_path} && '
                              'export PYTHONPATH=$(pwd):$(pwd)/{model} && '
                              'cd {model} && '
                              '{run_path}/run_and_time.sh'.format(
                                  code_path=code_path,
                                  model=model,
                                  run_path=run_path))

      if 'ssd' in benchmark_spec.benchmark:
        mlperf_benchmark_cmd = (
            'export MLP_GCS_RESNET_CHECKPOINT={checkpoint} && {cmd}'.format(
                checkpoint=FLAGS.mlperf_gcs_resnet_checkpoint,
                cmd=mlperf_benchmark_cmd))
    else:
      raise ValueError(
          'MLPerf configurations do not support the hardware in PKB. PKB may '
          'need to be updated if this is a new TPU type.')
  else:
    benchmark_path = '$HOME/training_results_v0.6/NVIDIA/benchmarks'
    common_env = 'DGXSYSTEM=DGX1 NEXP=1'
    if 'resnet' in benchmark_spec.benchmark:
      run_path = posixpath.join(benchmark_path, 'resnet/implementations/mxnet')
      env = 'DATADIR=/data/imagenet LOGDIR=/tmp/resnet PULL=0'
    elif 'transformer' in benchmark_spec.benchmark:
      run_path = posixpath.join(benchmark_path,
                                'transformer/implementations/pytorch')
      env = 'DATADIR=/data/wmt/utf8 LOGDIR=/tmp/transformer PULL=0'
    elif 'minigo' in benchmark_spec.benchmark:
      run_path = posixpath.join(benchmark_path,
                                'minigo/implementations/tensorflow')
      env = 'LOGDIR=/tmp/minigo CONT=mlperf-nvidia:minigo'
    elif 'mask' in benchmark_spec.benchmark:
      run_path = posixpath.join(benchmark_path,
                                'maskrcnn/implementations/pytorch')
      env = 'LOGDIR=/tmp/mask DATADIR=/data PULL=0'
    elif 'gnmt' in benchmark_spec.benchmark:
      run_path = posixpath.join(benchmark_path, 'gnmt/implementations/pytorch')
      env = 'LOGDIR=/tmp/gnmt DATADIR=/data/gnmt PULL=0'
    elif 'ssd' in benchmark_spec.benchmark:
      run_path = posixpath.join(benchmark_path, 'ssd/implementations/pytorch')
      env = 'LOGDIR=/tmp/ssd DATADIR=/data PULL=0'

    run_script = posixpath.join(run_path, 'run.sub')
    vm_util.ReplaceText(vm, 'SYSLOGGING=1', 'SYSLOGGING=0', run_script)
    mlperf_benchmark_cmd = (
        'cd {run_path} && chmod 755 run.sub && sudo {common_env} {env} '
        './run.sub'.format(run_path=run_path, common_env=common_env, env=env))

  if nvidia_driver.CheckNvidiaGpuExists(vm):
    mlperf_benchmark_cmd = '{env} {cmd}'.format(
        env=tensorflow.GetEnvironmentVars(vm), cmd=mlperf_benchmark_cmd)

  samples = []
  metadata = _CreateMetadataDict(benchmark_spec)
  stdout, _ = vm.RobustRemoteCommand(mlperf_benchmark_cmd, should_log=True)
  samples.extend(
      MakeSamplesFromOutput(
          metadata,
          stdout,
          use_tpu=bool(benchmark_spec.tpus),
          model=benchmark_spec.benchmark))
  return samples

def _SetDecomposeMethod(vm, decompose_method):
  """Set the parallel decomposition method if using multiple cores."""
  logging.info('Using %s decomposition', decompose_method)
  vm_util.ReplaceText(vm, 'method.*', 'method %s;' % decompose_method,
                      _GetPath(_DECOMPOSEDICT))

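# The decomposeParDict editors above (_SetNumProcesses and
# _SetDecomposeMethod, plus _SetParallelDecompositionMethod below) each
# rewrite one keyword line in OpenFOAM's system/decomposeParDict. A local
# before/after sketch of both edits; the original dict contents are
# illustrative.
import re

decompose_dict = 'numberOfSubdomains 4;\nmethod scotch;'
decompose_dict = re.sub('numberOfSubdomains.*', 'numberOfSubdomains 16;',
                        decompose_dict)
decompose_dict = re.sub('method.*', 'method simple;', decompose_dict)
assert decompose_dict == 'numberOfSubdomains 16;\nmethod simple;'
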
def PrepareRunner(benchmark_spec, vm=None):
  """Install and set up MLPerf on the target vm.

  Args:
    benchmark_spec: The benchmark specification.
    vm: The VM to work on.

  Raises:
    errors.Config.InvalidValue: if both GPUs and TPUs appear in the config.
  """
  vm = vm or benchmark_spec.vms[0]
  if benchmark_spec.tpus:
    if vm == benchmark_spec.vms[0]:
      storage_service = gcs.GoogleCloudStorageService()
      benchmark_spec.storage_service = storage_service
      if FLAGS.mlperf_bucket:
        bucket = FLAGS.mlperf_bucket
        benchmark_spec.model_dir = f'gs://{bucket}/pkb-{FLAGS.run_uri}'
      else:
        bucket = f'pkb-{FLAGS.run_uri}'
        benchmark_spec.model_dir = f'gs://{bucket}'
      benchmark_spec.bucket = bucket
      location = benchmark_spec.tpu_groups['train'].GetZone()
      storage_service.PrepareService(util.GetRegionFromZone(location))
      storage_service.MakeBucket(bucket)
      storage_service.AclBucket(benchmark_spec.gcp_service_account, gcs.WRITER,
                                bucket)

    # For MLPerf 1.0, the benchmark code differs per hardware type.
    if benchmark_spec.tpu_groups['train'].GetAcceleratorType() in (
        'v3-32', 'v3-128', 'v3-256', 'v3-512', 'v3-1024', 'v3-2048'):
      run_path = (
          '$HOME/training_results_{version}/Google/benchmarks/{model}/tpu-{tpus}'
          .format(
              version=MLPERF_VERSION,
              model=benchmark_spec.benchmark,
              tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType()))
    else:
      raise ValueError(
          'MLPerf configurations do not support the hardware in PKB. PKB may '
          'need to be updated if this is a new TPU type.')

    if MASK in benchmark_spec.benchmark:
      model = 'mask_rcnn'
    elif GNMT in benchmark_spec.benchmark:
      model = 'nmt'
    else:
      model = benchmark_spec.benchmark

    code_path = (
        '$HOME/training_results_{version}/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'
        .format(
            version=MLPERF_VERSION,
            model=benchmark_spec.benchmark,
            tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType()))

    vm.RemoteCommand('pip3 install --upgrade pyyaml==3.13 ')
    vm.RemoteCommand('pip3 install cloud-tpu-profiler==1.12')
    if MASK in benchmark_spec.benchmark or SSD in benchmark_spec.benchmark:
      # Install the coco package, to load the coco dataset for the Mask-RCNN
      # and SSD benchmarks.
      # TODO(user): coco whl package for python 3.5
      vm.RemoteCommand(
          'cd /tmp && '
          f'wget https://storage.cloud.google.com/mlperf_artifcats/{MLPERF_VERSION}_training/coco-1.1-cp36-cp36m-linux_x86_64.whl'
      )

    setup_script = posixpath.join(run_path, 'setup.sh')
    vm_util.ReplaceText(vm, '--progress-bar off', ' ', setup_script)
    vm_util.ReplaceText(vm, 'pip ', 'pip3 ', setup_script)
    vm.RemoteCommand(
        'chmod 755 {script} && {script}'.format(script=setup_script))

    if MASK not in benchmark_spec.benchmark:
      vm.RemoteCommand(
          'pip3 uninstall -y tf-estimator-nightly && '
          'pip3 install tf-estimator-nightly==1.14.0.dev2019051801')

    if RESNET in benchmark_spec.benchmark:
      data_dir = benchmark_spec.imagenet_data_dir
    elif TRANSFORMER in benchmark_spec.benchmark:
      data_dir = benchmark_spec.wmt_data_dir
    elif MASK in benchmark_spec.benchmark:
      data_dir = benchmark_spec.coco_data_dir
    elif GNMT in benchmark_spec.benchmark:
      data_dir = benchmark_spec.gnmt_data_dir
    elif SSD in benchmark_spec.benchmark:
      data_dir = benchmark_spec.coco_data_dir
    elif BERT in benchmark_spec.benchmark:
      data_dir = benchmark_spec.bert_data_dir
    else:
      raise ValueError('Unknown operation, cannot find {} in benchmark'.format(
          benchmark_spec.benchmark))

    run_script = posixpath.join(run_path, 'run_and_time.sh')
    data_dir = data_dir.replace('/', r'\/')
    checkpoint = FLAGS.mlperf_gcs_resnet_checkpoint.replace('/', r'\/')
    decode_dir = FLAGS.mlperf_transformer_decode_dir.replace('/', r'\/')
    tpu = benchmark_spec.tpu_groups['train'].GetName()
    vm_util.ReplaceText(vm, '--model_dir=.*',
                        r'--model_dir=gs:\/\/{} \\\\'.format(bucket),
                        run_script)
    vm_util.ReplaceText(vm, '--data_dir=.*',
                        r'--data_dir={} \\\\'.format(data_dir), run_script)
    vm_util.ReplaceText(
        vm, '--training_file_pattern=.*',
        r'--training_file_pattern={}\/train-* \\\\'.format(data_dir),
        run_script)
    vm_util.ReplaceText(
        vm, '--validation_file_pattern=.*',
        r'--validation_file_pattern={}\/val-* \\\\'.format(data_dir),
        run_script)
    vm_util.ReplaceText(
        vm, '--val_json_file=.*',
        r'--val_json_file={}\/instances_val2017.json \\\\'.format(data_dir),
        run_script)
    vm_util.ReplaceText(vm, '--resnet_checkpoint=.*',
                        r'--resnet_checkpoint={} \\\\'.format(checkpoint),
                        run_script)
    vm_util.ReplaceText(
        vm, '--decode_from_file=.*',
        r'--decode_from_file={}\/wmt14-en-de.src \\\\'.format(decode_dir),
        run_script)
    vm_util.ReplaceText(
        vm, '--decode_reference=.*',
        r'--decode_reference={}\/wmt14-en-de.ref \\\\'.format(decode_dir),
        run_script)
    vm_util.ReplaceText(
        vm, '--decode_to_file=.*',
        r'--decode_to_file={}\/decode.transformer_mlperf_tpu.'
        r'translate_ende_wmt32k_packed.2x2_log_1018_2 \\\\'.format(bucket),
        run_script)
    vm_util.ReplaceText(vm, '--tpu=.*', r'--tpu={} \\\\'.format(tpu),
                        run_script)
    vm_util.ReplaceText(vm, '--output_dir=.*',
                        r'--output_dir=gs:\/\/{} \\\\'.format(bucket),
                        run_script)
    vm_util.ReplaceText(vm, '--cloud_tpu_name=.*',
                        r'--cloud_tpu_name={} \\\\'.format(tpu), run_script)
    vm_util.ReplaceText(vm, '--out_dir=.*',
                        r'--out_dir=gs:\/\/{} \\\\'.format(bucket), run_script)
    vm_util.ReplaceText(vm, '--tpu_name=.*', r'--tpu_name={} \\\\'.format(tpu),
                        run_script)
    vm.RemoteCommand('chmod 755 {}'.format(run_script))

    if GNMT in benchmark_spec.benchmark:
      metric_script = posixpath.join(code_path, model, 'metric.py')
      vm_util.ReplaceText(vm, ' sacrebleu -t', ' python3 -m sacrebleu -t',
                          metric_script)
  else:
    benchmark_spec.model_dir = '/tmp'

    has_gpu = nvidia_driver.CheckNvidiaGpuExists(vm)
    if has_gpu:
      vm.Install('cuda_toolkit')

    vm.Install('nvidia_docker')
    vm.RemoteCommand('if [ ! -d "/data" ]; then sudo ln -s /scratch /data; fi')

    if RESNET in benchmark_spec.benchmark:
      run_script = f'training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/resnet/implementations/mxnet/run_and_time.sh'
      vm.RemoteCommand(
          f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/resnet/implementations/mxnet &&'
          ' sudo docker build --network=host . -t mlperf-nvidia:image_classification',
          should_log=True)
      _DownloadData(benchmark_spec.imagenet_data_dir,
                    posixpath.join('/data', 'imagenet'), vm)

    if TRANSFORMER in benchmark_spec.benchmark:
      vm.RemoteCommand(
          f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/transformer/implementations/pytorch &&'
          ' sudo docker build --network=host . -t mlperf-nvidia:translation',
          should_log=True)
      _DownloadData(benchmark_spec.wmt_data_dir,
                    posixpath.join('/data', 'wmt'), vm)

    if MINIGO in benchmark_spec.benchmark:
      build_path = f'training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/minigo/implementations/tensorflow'
      run_script = posixpath.join(build_path, 'run_and_time.sh')
      vm_util.ReplaceText(
          vm, 'get_data.py', 'get_data.py --src_dir={}'.format(
              FLAGS.minigo_model_dir.replace('/', r'\/')), run_script)
      vm.RemoteCommand(
          'cd {} && sudo docker build --network=host -t '
          'mlperf-nvidia:minigo .'.format(build_path),
          should_log=True)

    if MASK in benchmark_spec.benchmark:
      vm.RemoteCommand(
          f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/maskrcnn/implementations/pytorch && '
          'sudo docker build --network=host -t mlperf-nvidia:object_detection . ',
          should_log=True)
      _DownloadData(benchmark_spec.coco_data_dir,
                    posixpath.join('/data', 'coco2017'), vm)

    if GNMT in benchmark_spec.benchmark:
      vm.RemoteCommand(
          f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/gnmt/implementations/pytorch && '
          'sudo docker build --network=host -t mlperf-nvidia:rnn_translator . ',
          should_log=True)
      _DownloadData(benchmark_spec.gnmt_data_dir,
                    posixpath.join('/data', 'gnmt'), vm)

    if SSD in benchmark_spec.benchmark:
      vm.RemoteCommand(
          f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/ssd/implementations/pytorch && '
          'sudo docker build --network=host -t mlperf-nvidia:single_stage_detector . ',
          should_log=True)
      _DownloadData(benchmark_spec.coco_data_dir,
                    posixpath.join('/data', 'coco2017'), vm)

    if BERT in benchmark_spec.benchmark:
      vm.RemoteCommand(
          f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/bert/implementations/pytorch && '
          'sudo docker build --network=host -t mlperf-nvidia:language_model . ',
          should_log=True)
      _DownloadData(benchmark_spec.bert_data_dir,
                    posixpath.join('/data', 'bert_data'), vm)

def _SetParallelDecompositionMethod(vm, decompose_method):
  """Set the parallel decomposition method if using multiple cores."""
  vm_util.ReplaceText(vm, 'method.*', 'method %s;' % decompose_method,
                      _GetPath(_DECOMPOSEDICT))

def _DownloadData(benchmark_spec, rank):
  """Downloads the train, valid, and test data onto the target vm.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.
    rank: integer, the node rank in distributed training.
  """
  vm = benchmark_spec.vms[rank]
  vm.InstallPackages('python3-pip')
  vm.Install('wget')
  vm.RemoteCommand(
      '[ -d $HOME/fairseq ] || git clone {git} -b {branch}'.format(
          git=FAIRSEQ_GIT, branch=FAIRSEQ_BRANCH))
  setup_script = posixpath.join('fairseq', 'setup.py')
  vm_util.ReplaceText(vm, "'torch'",
                      f"'torch >= {FLAGS.robertammlm_torch_version}'",
                      setup_script)
  env = 'PATH=/opt/conda/bin:$PATH'
  vm.RemoteCommand('{} python3 -m pip install pyarrow'.format(env))
  vm.RemoteCommand(
      'cd fairseq && {} python3 -m pip install --editable .'.format(env))
  vm.RemoteCommand('mkdir -p {}'.format(DATA_PATH))
  text_zip = posixpath.join(DATA_PATH, posixpath.basename(WIKI_TEXT))
  vm.RemoteCommand('wget -O {des} {src}'.format(des=text_zip, src=WIKI_TEXT))
  vm.RemoteCommand('unzip {text_zip} -d {data_path}'.format(
      data_path=DATA_PATH, text_zip=text_zip))
  bpe_dir = posixpath.join(DATA_PATH, 'gpt2_bpe')
  vm.RemoteCommand('mkdir -p {}'.format(bpe_dir))
  vm.RemoteCommand('wget -O {des}/encoder.json {src}'.format(
      des=bpe_dir, src=ENCODER_JSON))
  vm.RemoteCommand('wget -O {des}/vocab.bpe {src}'.format(
      des=bpe_dir, src=VOCAB_BPE))
  for phase in ('train', 'valid', 'test'):
    vm.RemoteCommand('cd {data_path} && {env} python3 -m '
                     'examples.roberta.multiprocessing_bpe_encoder '
                     '--encoder-json gpt2_bpe/encoder.json '
                     '--vocab-bpe gpt2_bpe/vocab.bpe '
                     '--inputs wikitext-103-raw/wiki.{phase}.raw '
                     '--outputs wikitext-103-raw/wiki.{phase}.bpe '
                     '--keep-empty '
                     '--workers 60 '.format(
                         env=env, data_path=DATA_PATH, phase=phase))
  vm.RemoteCommand('wget -O {des}/dict.txt {src}'.format(
      des=bpe_dir, src=FAIRSEQ_DICT))
  vm.RemoteCommand('cd {data_path} && {env} fairseq-preprocess '
                   '--only-source --srcdict gpt2_bpe/dict.txt '
                   '--trainpref wikitext-103-raw/wiki.train.bpe '
                   '--validpref wikitext-103-raw/wiki.valid.bpe '
                   '--testpref wikitext-103-raw/wiki.test.bpe '
                   '--destdir data-bin/wikitext-103 '
                   '--workers 60'.format(env=env, data_path=DATA_PATH))
  data_bin = posixpath.join(DATA_PATH, 'data-bin')
  vm.RemoteCommand('mkdir -p {}/mlm-w103'.format(data_bin))
  vm.RemoteCommand('for x in `seq 1 {word_count}`;'
                   'do echo "$x 1" >> {data_bin}/mlm-w103/dict.txt;'
                   'done'.format(word_count=WORD_COUNT, data_bin=data_bin))
  for copy in range(benchmark_spec.num_copies):
    vm.RemoteCommand(
        'cp -r {data_bin}/wikitext-103 {data_bin}/mlm-w103/{copy}'.format(
            data_bin=data_bin, copy=copy))
    vm.RemoteCommand('cp {data_bin}/mlm-w103/dict.txt {data_bin}/mlm-w103/'
                     '{copy}'.format(data_bin=data_bin, copy=copy))

def Run(benchmark_spec):
  """Run MLPerf on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  if benchmark_spec.tpus:
    # For MLPerf 1.0, the benchmark code differs per hardware type.
    if benchmark_spec.tpu_groups['train'].GetAcceleratorType() in (
        'v3-32', 'v3-128', 'v3-256', 'v3-512', 'v3-1024', 'v3-2048'):
      run_path = (
          '$HOME/training_results_{version}/Google/benchmarks/{model}/tpu-{tpus}'
          .format(
              version=MLPERF_VERSION,
              model=benchmark_spec.benchmark,
              tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType()))
      code_path = (
          '$HOME/training_results_{version}/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'
          .format(
              version=MLPERF_VERSION,
              model=benchmark_spec.benchmark,
              tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType()))

      if MASK in benchmark_spec.benchmark:
        model = 'mask_rcnn'
      elif GNMT in benchmark_spec.benchmark:
        model = 'nmt'
      else:
        model = benchmark_spec.benchmark

      mlperf_benchmark_cmd = (
          'cd {code_path} && '
          'export PYTHONPATH=$(pwd):$(pwd)/{model} && '
          'cd {model} && '
          '{run_path}/run_and_time.sh'.format(
              code_path=code_path, model=model, run_path=run_path))

      if SSD in benchmark_spec.benchmark:
        mlperf_benchmark_cmd = (
            'export MLP_GCS_RESNET_CHECKPOINT={checkpoint} && {cmd}'.format(
                checkpoint=FLAGS.mlperf_gcs_resnet_checkpoint,
                cmd=mlperf_benchmark_cmd))
    else:
      raise ValueError(
          'MLPerf configurations do not support the hardware in PKB. PKB may '
          'need to be updated if this is a new TPU type.')
  else:
    run_sub_paths = {
        RESNET: 'resnet/implementations/mxnet',
        TRANSFORMER: 'transformer/implementations/pytorch',
        MINIGO: 'minigo/implementations/tensorflow',
        MASK: 'maskrcnn/implementations/pytorch',
        GNMT: 'gnmt/implementations/pytorch',
        SSD: 'ssd/implementations/pytorch',
        BERT: 'bert/implementations/pytorch',
    }
    benchmark_path = f'$HOME/training_results_{MLPERF_VERSION}/NVIDIA/benchmarks'
    run_path = posixpath.join(benchmark_path,
                              run_sub_paths[benchmark_spec.benchmark])
    env = {
        'DGXSYSTEM': DGXSYSTEM,
        'NEXP': 1,
        'PULL': 0,
        'LOGDIR': f'/tmp/{benchmark_spec.benchmark}',
    }
    envs = {
        RESNET: {},
        TRANSFORMER: {'DATADIR': '/data/wmt/utf8'},
        MINIGO: {'CONT': 'mlperf-nvidia:minigo'},
        MASK: {},
        GNMT: {'DATADIR': '/data/gnmt'},
        SSD: {'DATADIR': '/data'},
        BERT: {},
    }
    env.update(envs[benchmark_spec.benchmark])

    run_script = posixpath.join(run_path, 'run_with_docker.sh')
    vm_util.ReplaceText(vm, 'SYSLOGGING=1', 'SYSLOGGING=0', run_script)
    vm_util.ReplaceText(vm, 'docker exec -it', 'docker exec -t', run_script)
    vm_util.ReplaceText(vm, 'nvidia-docker', 'sudo nvidia-docker', run_script)
    vm_util.ReplaceText(vm, 'docker exec', 'sudo docker exec', run_script)
    vm_util.ReplaceText(vm, 'docker container', 'sudo docker container',
                        run_script)
    if benchmark_spec.benchmark == MASK:
      vm_util.ReplaceText(
          vm, r'_cont_mounts=\(',
          r'_cont_mounts=\(\"--volume=\${PKLDIR}:\/pkl_coco\" ', run_script)

    env = ' '.join(f'{key}={value}' for key, value in env.items())
    if nvidia_driver.CheckNvidiaGpuExists(vm):
      env = f'{tensorflow.GetEnvironmentVars(vm)} {env}'
    mlperf_benchmark_cmd = (f'chmod 755 {run_script} && '
                            f'cd {run_path} && '
                            f'{env} {run_script}')

  samples = []
  metadata = _CreateMetadataDict(benchmark_spec)
  stdout, _ = vm.RobustRemoteCommand(mlperf_benchmark_cmd, should_log=True)
  if NONE in FLAGS.mlperf_profiler:
    samples.extend(
        MakeSamplesFromOutput(
            metadata,
            stdout,
            use_tpu=bool(benchmark_spec.tpus),
            model=benchmark_spec.benchmark))
  return samples

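# The GPU path above flattens the env dict into a "KEY=value ..." prefix for
# the run script. A tiny standalone sketch of that join; the values here are
# illustrative, and dict insertion order (Python 3.7+) keeps the output
# deterministic.
_env = {'DGXSYSTEM': 'DGXA100', 'NEXP': 1, 'PULL': 0, 'LOGDIR': '/tmp/bert'}
_env_str = ' '.join(f'{key}={value}' for key, value in _env.items())
assert _env_str == 'DGXSYSTEM=DGXA100 NEXP=1 PULL=0 LOGDIR=/tmp/bert'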