Exemplo n.º 1
0
def _UpdateBenchmarkSpecWithFlags(benchmark_spec):
    """Update the benchmark_spec with supplied command line flags.

    Queries the GPU count of the first VM, derives the cluster-wide GPU
    total from it, and copies the Horovod and NCCL flag values onto the
    spec.

    Args:
      benchmark_spec: benchmark specification to update
    """
    # Only the first VM is queried; the total below is that count multiplied
    # by the number of VMs.
    gpus_per_node = nvidia_driver.QueryNumberOfGpus(benchmark_spec.vms[0])
    num_vms = len(benchmark_spec.vms)
    total_gpus = gpus_per_node * num_vms

    benchmark_spec.gpus_per_node = gpus_per_node
    benchmark_spec.num_vms = num_vms
    benchmark_spec.total_gpus = total_gpus
    benchmark_spec.model = FLAGS.horovod_model
    benchmark_spec.batch_size = FLAGS.horovod_batch_size
    benchmark_spec.num_steps = FLAGS.horovod_num_steps
    benchmark_spec.precision = FLAGS.horovod_precision
    benchmark_spec.max_seq_len = int(FLAGS.horovod_max_seq_len)
    benchmark_spec.bert_finetune = FLAGS.horovod_bert_finetune
    # The flag is spelled 'horovod_timeline' (it was previously misspelled
    # here as 'horovod_timelime').
    benchmark_spec.timeline = FLAGS.horovod_timeline
    benchmark_spec.synthetic = FLAGS.horovod_synthetic
    benchmark_spec.cuda_visible_devices = FLAGS.nccl_cuda_visible_devices
    benchmark_spec.nccl_version = FLAGS.nccl_version
    benchmark_spec.nccl_net_plugin = FLAGS.nccl_net_plugin
    benchmark_spec.nccl_extra_params = FLAGS.nccl_extra_params
def _UpdateBenchmarkSpecWithFlags(benchmark_spec):
    """Populate the benchmark_spec from command line flags.

    Records GPU counts (taken from the first VM), the zones flag, and a
    storage service object on the spec, after delegating to the
    mlperf_benchmark module's updater. The bucket and model directory are
    set only when the mlperf_bucket flag is truthy; otherwise both are None.

    Args:
      benchmark_spec: benchmark specification to update
    """
    workers = benchmark_spec.vms
    per_vm_gpu_count = nvidia_driver.QueryNumberOfGpus(workers[0])
    worker_count = len(workers)

    benchmark_spec.gpus_per_vm = per_vm_gpu_count
    benchmark_spec.num_vms = worker_count
    benchmark_spec.total_num_gpus = per_vm_gpu_count * worker_count
    benchmark_spec.zones = FLAGS.zones

    # pylint: disable=protected-access
    mlperf_benchmark._UpdateBenchmarkSpecWithFlags(benchmark_spec)
    # pylint: enable=protected-access

    benchmark_spec.storage_service = gcs.GoogleCloudStorageService()

    bucket_name = FLAGS.mlperf_bucket
    if not bucket_name:
        benchmark_spec.bucket = None
        benchmark_spec.model_dir = None
        return

    benchmark_spec.bucket = bucket_name
    benchmark_spec.model_dir = 'gs://{bucket}/pkb-{uri}'.format(
        bucket=bucket_name, uri=FLAGS.run_uri)
def _CreateMetadataDict(benchmark_spec):
    """Build the metadata dict attached to run results.

    Combines spec-level fields, the CUDA toolkit metadata of the first VM,
    the total GPU count (first VM's count times the number of VMs) and,
    when the spec has TPUs, details of the 'train' TPU group.

    Args:
      benchmark_spec: The benchmark specification. Contains all data that is
          required to run the benchmark.

    Returns:
      metadata dict
    """
    uses_tpu = bool(benchmark_spec.tpus)
    metadata = dict(
        use_tpu=uses_tpu,
        model_dir=benchmark_spec.model_dir,
        model=benchmark_spec.benchmark,
        version=MLPERF_VERSION,
    )

    workers = benchmark_spec.vms
    head_vm = workers[0]
    gpu_total = nvidia_driver.QueryNumberOfGpus(head_vm) * len(workers)

    # Toolkit metadata is merged before 'total_gpus' is written, so a
    # same-named toolkit key cannot override the computed total.
    metadata.update(cuda_toolkit.GetMetadata(head_vm))
    metadata['total_gpus'] = gpu_total

    if not uses_tpu:
        return metadata

    train_group = benchmark_spec.tpu_groups['train']
    metadata['train_tpu_num_shards'] = train_group.GetNumShards()
    metadata['train_tpu_accelerator_type'] = train_group.GetAcceleratorType()
    return metadata
Exemplo n.º 4
0
def _CreateMetadataDict(vms):
    """Build the metadata dict attached to run results.

    Starts from the CUDA toolkit metadata of the first VM and layers the
    benchmark version, cluster size and Horovod/NCCL flag values on top.

    Args:
      vms: A list of worker VMs.

    Returns:
      metadata dict
    """
    head_vm = vms[0]
    node_count = len(vms)
    gpu_total = nvidia_driver.QueryNumberOfGpus(head_vm) * node_count

    metadata = dict(cuda_toolkit.GetMetadata(head_vm))
    # Written after the toolkit metadata so these values win on key clashes.
    metadata.update({
        'benchmark_version': BENCHMARK_VERSION,
        'num_nodes': node_count,
        'total_gpus': int(gpu_total),
        'model': FLAGS.horovod_model,
        'batch_size': FLAGS.horovod_batch_size,
        'num_steps': FLAGS.horovod_num_steps,
        'synthetic': FLAGS.horovod_synthetic,
        'precision': FLAGS.horovod_precision,
        'max_seq_len': int(FLAGS.horovod_max_seq_len),
        'nccl_version': FLAGS.nccl_version,
        'nccl_net_plugin': FLAGS.nccl_net_plugin,
        'cuda_visible_devices': FLAGS.nccl_cuda_visible_devices,
        'nccl_extra_params': FLAGS.nccl_extra_params,
    })
    return metadata
Exemplo n.º 5
0
def _UpdateBenchmarkSpecWithFlags(benchmark_spec):
    """Populate the benchmark_spec from the robertammlm command line flags.

    When the update-frequency flag is falsy, update_freq is derived as
    global_batch_size // (max_sentences * num_accelerators). When the
    num-copies flag is falsy, num_copies is one per 32 accelerators with a
    minimum of 1.

    Args:
      benchmark_spec: benchmark specification to update
    """
    benchmark_spec.max_sentences = FLAGS.robertammlm_max_sentences
    benchmark_spec.nproc_per_node = FLAGS.robertammlm_nproc_per_node
    benchmark_spec.log_interval = FLAGS.robertammlm_log_interval
    benchmark_spec.profiler = FLAGS.robertammlm_profiler
    benchmark_spec.max_epoch = FLAGS.robertammlm_max_epoch

    workers = benchmark_spec.vms
    worker_count = len(workers)
    benchmark_spec.num_vms = worker_count
    benchmark_spec.global_batch_size = FLAGS.robertammlm_global_batch_size

    # The accelerator total is the first VM's GPU count times the VM count.
    accelerator_total = (
        nvidia_driver.QueryNumberOfGpus(workers[0]) * worker_count)
    benchmark_spec.num_accelerators = accelerator_total

    benchmark_spec.update_freq = (
        FLAGS.robertammlm_update_freq
        if FLAGS.robertammlm_update_freq
        else benchmark_spec.global_batch_size //
        (benchmark_spec.max_sentences * accelerator_total))

    benchmark_spec.num_copies = (
        FLAGS.robertammlm_num_copies
        if FLAGS.robertammlm_num_copies
        else max(1, accelerator_total // 32))
Exemplo n.º 6
0
def Install(vm):
  """Installs XGBoost on the VM.

  Clones the XGBoost repository, configures the build with CMake, compiles
  it and installs the Python package. CUDA build/install options are added
  when the VM has an NVIDIA GPU; NCCL options are added on top of that when
  the VM has more than one GPU.

  Args:
    vm: The virtual machine to install XGBoost on.
  """
  vm.Install('build_tools')
  install_dir = posixpath.join(linux_packages.INSTALL_DIR, 'xgboost')
  vm.RemoteCommand(
      f'git clone --recursive https://github.com/dmlc/xgboost {install_dir}')
  cuda_env = ''
  cuda_make_option = ''
  cuda_install_option = ''
  nccl_make_option = ''
  nccl_install_option = ''
  # CheckNvidiaGpuExists has to be called with the VM: the bare function
  # object is always truthy, which enabled the CUDA build on every VM.
  if nvidia_driver.CheckNvidiaGpuExists(vm):
    cuda_make_option = '-DUSE_CUDA=ON'
    cuda_env = 'CUDACXX=/usr/local/cuda/bin/nvcc'
    cuda_install_option = '--use-cuda'
    # Only query the GPU count once a GPU is known to exist.
    if nvidia_driver.QueryNumberOfGpus(vm) > 1:
      nccl_make_option = '-DUSE_NCCL=ON -DNCCL_ROOT=/usr/local/nccl2'
      nccl_install_option = '--use-nccl'
  build_dir = posixpath.join(install_dir, 'build')
  package_dir = posixpath.join(install_dir, 'python-package')
  vm.RemoteCommand(f'mkdir -p {build_dir}')
  vm.RemoteCommand(f'cd {build_dir} && '
                   f'{cuda_env} cmake .. {cuda_make_option} {nccl_make_option}')
  vm.RemoteCommand(f'cd {build_dir} && make -j4')
  vm.RemoteCommand(f'cd {package_dir} && '
                   f'{_ENV.value} python3 setup.py install '
                   f'{cuda_install_option} {nccl_install_option}')
def _CollectGpuSamples(
        vm: virtual_machine.BaseVirtualMachine) -> List[sample.Sample]:
    """Run the bandwidth test binary per GPU device and parse its CSV output.

    The binary is invoked once for each GPU index and, when the VM has more
    than one GPU, once more with '--device=all'. An invocation that exits
    with a non-zero code is logged as a warning and skipped.

    Args:
      vm: The virtual machine to run the benchmark.

    Returns:
      A list of sample.Sample objects; empty when either the NVIDIA GPU
      check or the nvidia-smi check on the VM fails.
    """
    if not (nvidia_driver.CheckNvidiaGpuExists(vm)
            and nvidia_driver.CheckNvidiaSmiExists(vm)):
        return []

    shared_metadata = _MetadataFromFlags()
    shared_metadata.update(cuda_toolkit.GetMetadata(vm))

    base_args = [
        BANDWIDTH_TEST_PATH, '--csv', f'--memory={_MEMORY.value}',
        f'--mode={_MODE.value}'
    ]
    # Optional switches, appended in this fixed order when their flag is set.
    optional_switches = (
        (_HTOD, '--htod'),
        (_DTOH, '--dtoh'),
        (_DTOD, '--dtod'),
        (_WC, '--wc'),
    )
    for flag_holder, switch in optional_switches:
        if flag_holder.value:
            base_args.append(switch)

    gpu_count = nvidia_driver.QueryNumberOfGpus(vm)
    targets = list(range(gpu_count))
    if gpu_count > 1:
        targets.append('all')

    output_pattern = (r'bandwidthTest-(\S+), '
                      r'Bandwidth = ([\d\.]+) (\S+), '
                      r'Time = ([\d\.]+) s, '
                      r'Size = (\d+) bytes, '
                      r'NumDevsUsed = (\d+)')

    samples = []
    for target in targets:
        cmd = ' '.join([*base_args, f'--device={target}'])
        stdout, stderr, exit_code = vm.RemoteCommandWithReturnCode(
            cmd, ignore_failure=True)
        if exit_code:
            logging.warning('Error with getting GPU stats: %s', stderr)
            continue

        rows = regex_util.ExtractAllMatches(output_pattern, stdout)
        for metric, bandwidth, unit, seconds, nbytes, devs_used in rows:
            # Shared metadata is merged last, so it wins on key clashes.
            metadata = {
                'time': float(seconds),
                'size': int(nbytes),
                'NumDevsUsed': devs_used,
                'device': target,
                'command': cmd,
                **shared_metadata,
            }
            samples.append(
                sample.Sample(metric, float(bandwidth), unit, metadata))
    return samples
def Run(benchmark_spec):
    """Run MXNet on the cluster for each model specified.

    For every model in the mx_models flag, the per-model settings are
    written onto the spec, the train_imagenet.py command line is assembled
    (GPU list or OMP thread count depending on the spec's device) and run
    on the first VM.

    Args:
      benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

    Returns:
      A list holding one _MakeSamplesFromOutput result per model.
    """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    head_vm = benchmark_spec.vms[0]
    example_dir = 'incubator-mxnet/example/image-classification'
    command_template = ('python train_imagenet.py '
                        '--benchmark=1 '
                        '--network={network} '
                        '--batch-size={batch_size} '
                        '--image-shape={image_shape} '
                        '--num-epochs={num_epochs} '
                        '--dtype={precision} '
                        '--kv-store={key_value_store}')

    collected = []
    for network in FLAGS.mx_models:
        layer_count = _GetNumLayers(network)
        per_model_batch = _GetBatchSize(network, layer_count)
        benchmark_spec.model = network
        benchmark_spec.batch_size = per_model_batch
        benchmark_spec.num_layers = layer_count
        benchmark_spec.image_shape = _GetImageShape(network)

        train_cmd = command_template.format(
            network=network,
            batch_size=per_model_batch,
            image_shape=benchmark_spec.image_shape,
            num_epochs=benchmark_spec.num_epochs,
            precision=benchmark_spec.precision,
            key_value_store=benchmark_spec.key_value_store)

        target_device = benchmark_spec.device
        if target_device == GPU:
            gpu_ids = ','.join(
                map(str, range(nvidia_driver.QueryNumberOfGpus(head_vm))))
            train_cmd = '{env} {cmd} --gpus {gpus}'.format(
                env=mxnet.GetEnvironmentVars(head_vm),
                cmd=train_cmd,
                gpus=gpu_ids)
        elif target_device == CPU:
            # Specifies the number of threads to use in CPU test.
            # https://mxnet.incubator.apache.org/faq/perf.html
            train_cmd = 'OMP_NUM_THREADS={omp_num_threads} {cmd}'.format(
                omp_num_threads=head_vm.NumCpusForBenchmark() // 2,
                cmd=train_cmd)

        if layer_count:
            train_cmd = '%s --num-layers %s' % (train_cmd, layer_count)

        stdout, stderr = head_vm.RobustRemoteCommand(
            'cd %s && %s' % (example_dir, train_cmd), should_log=True)

        # Falls back to stderr when stdout is empty.
        collected.append(
            _MakeSamplesFromOutput(benchmark_spec, stdout or stderr))

    return collected
Exemplo n.º 9
0
def Prepare(benchmark_spec):
    """Set up every VM and write the machine file.

    Runs _InstallAndAuthenticateVm on all VMs in parallel, stores the first
    VM's GPU count on the spec as num_gpus, and creates MACHINEFILE with
    that count used for every VM.

    Args:
      benchmark_spec: The benchmark specification. Contains all data that is
          required to run the benchmark.
    """
    cluster_vms = benchmark_spec.vms
    vm_util.RunThreaded(_InstallAndAuthenticateVm, cluster_vms)

    benchmark_spec.num_gpus = nvidia_driver.QueryNumberOfGpus(cluster_vms[0])

    def _SlotsForVm(unused_vm):
        # Every VM is given the GPU count measured on the first one.
        return benchmark_spec.num_gpus

    hpc_util.CreateMachineFile(cluster_vms, _SlotsForVm, MACHINEFILE)
Exemplo n.º 10
0
def Prepare(benchmark_spec):
    """Install and set up RoBERTa mmlm on the target VMs.

    Runs _PrepareVm once per rank in parallel, then, if the first VM has an
    NVIDIA GPU, writes HOSTFILE using that VM's GPU count for every VM.

    Args:
      benchmark_spec: The benchmark specification. Contains all data that is
          required to run the benchmark.
    """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vms = benchmark_spec.vms
    benchmark_spec.always_call_cleanup = True

    # One (args, kwargs) pair per rank, as passed to RunThreaded.
    per_rank_calls = [((benchmark_spec, rank), {})
                      for rank in range(benchmark_spec.num_vms)]
    vm_util.RunThreaded(_PrepareVm, per_rank_calls)

    head_vm = vms[0]
    if not nvidia_driver.CheckNvidiaGpuExists(head_vm):
        return

    gpus_per_vm = nvidia_driver.QueryNumberOfGpus(head_vm)
    hpc_util.CreateMachineFile(vms, lambda _: gpus_per_vm, HOSTFILE)
Exemplo n.º 11
0
def _UpdateBenchmarkSpecWithFlags(benchmark_spec):
  """Populate the benchmark_spec from the HPCG command line flags.

  The per-node GPU count comes from the hpcg_gpus_per_node flag when it is
  truthy, otherwise from querying the first VM. CPUs per rank is the first
  VM's benchmark CPU count divided by that GPU count, truncated to an int.

  Args:
    benchmark_spec: benchmark specification to update
  """
  head_vm = benchmark_spec.vms[0]

  gpus_per_node = FLAGS.hpcg_gpus_per_node
  if not gpus_per_node:
    gpus_per_node = nvidia_driver.QueryNumberOfGpus(head_vm)

  # Computed before any attribute is written, so a failure here leaves the
  # spec untouched.
  cpus_per_rank = int(head_vm.NumCpusForBenchmark() / gpus_per_node)
  vm_count = len(benchmark_spec.vms)

  benchmark_spec.gpus_per_node = gpus_per_node
  benchmark_spec.cpus_per_rank = cpus_per_rank
  benchmark_spec.num_vms = vm_count
  benchmark_spec.total_gpus = gpus_per_node * vm_count
  benchmark_spec.hpcg_problem_size = FLAGS.hpcg_problem_size
  benchmark_spec.hpcg_runtime = FLAGS.hpcg_runtime
  benchmark_spec.run_as_root = FLAGS.mpirun_allow_run_as_root
def _CreateMetadataDict(
    bm_spec: benchmark_spec.BenchmarkSpec) -> Dict[str, Any]:
  """Builds the metadata dict attached to run results.

  Contains the model flag, the inference version constant, the CUDA toolkit
  metadata of the first VM and the total GPU count (first VM's count times
  the number of VMs).

  Args:
    bm_spec: The benchmark specification. Contains all data that is required to
      run the benchmark.

  Returns:
    metadata dict
  """
  metadata = dict(
      model=FLAGS.mlperf_benchmark,
      version=MLPERF_INFERENCE_VERSION,
  )
  workers = bm_spec.vms
  head_vm = workers[0]
  gpu_total = nvidia_driver.QueryNumberOfGpus(head_vm) * len(workers)

  # 'total_gpus' is written last so toolkit metadata cannot override it.
  metadata.update(cuda_toolkit.GetMetadata(head_vm))
  metadata['total_gpus'] = gpu_total
  return metadata
def _GetTfCnnBenchmarkCommand(vm,
                              model,
                              batch_size,
                              benchmark_spec,
                              args='',
                              job_name=''):
    """Build the command line that runs the tf_cnn_benchmarks script.

    When benchmark_spec.benchmark_args is set, it is used as the argument
    string almost verbatim: '--num_gpus' is appended only if the string
    does not already mention it and the VM has GPUs. Otherwise the command
    is assembled from flag values and spec attributes.

    As a side effect, num_gpus is stored on the spec, and on the assembled
    path local_parameter_device, device and data_format are too (forced to
    CPU / NHWC when the VM has no GPUs).

    Args:
      vm: the VM to run on.
      model: name of the model to run.
      batch_size: batch size to use for training.
      benchmark_spec: the benchmark spec object.
      args: string, distributed arguments
      job_name: string, distributed job name

    Returns:
      A string that runs the tf_cnn_benchmarks.py script
      with the desired arguments.
    """
    if nvidia_driver.CheckNvidiaGpuExists(vm):
        num_gpus = nvidia_driver.QueryNumberOfGpus(vm)
    else:
        num_gpus = 0
    benchmark_spec.num_gpus = num_gpus

    user_args = benchmark_spec.benchmark_args
    if user_args is not None:
        cmd = 'python tf_cnn_benchmarks.py ' + user_args
        # If the user didn't specify num_gpus in the benchmark_args string,
        # use all the GPUs on the system.
        if '--num_gpus' not in user_args and num_gpus:
            cmd = '{cmd} --num_gpus={num_gpus}'.format(cmd=cmd,
                                                       num_gpus=num_gpus)
        return cmd

    benchmark_spec.local_parameter_device = FLAGS.tf_local_parameter_device
    benchmark_spec.device = FLAGS.tf_device
    benchmark_spec.data_format = FLAGS.tf_data_format
    if num_gpus == 0:
        # No GPUs: override the flag-provided placement with CPU settings.
        benchmark_spec.local_parameter_device = CPU
        benchmark_spec.device = CPU
        benchmark_spec.data_format = NHWC

    command_template = ('{env_vars} python tf_cnn_benchmarks.py '
                        '--local_parameter_device={local_parameter_device} '
                        '--batch_size={batch_size} '
                        '--model={model} '
                        '{data} '
                        '--data_name={data_name} '
                        '--variable_update={variable_update} '
                        '--distortions={distortions} '
                        '--device={device} '
                        '--data_format={data_format} '
                        '--forward_only={forward_only} '
                        '--use_fp16={use_fp16} '
                        '{num_gpus} '
                        '{job_name}')

    env_vars = tensorflow.GetEnvironmentVars(vm)

    # Optional fragments collapse to an empty string when not applicable.
    if benchmark_spec.data_dir:
        data_fragment = '--data_dir={}'.format(benchmark_spec.data_dir)
    else:
        data_fragment = ''
    gpu_fragment = '--num_gpus={}'.format(num_gpus) if num_gpus else ''
    if args:
        job_fragment = '--job_name={0} {1}'.format(job_name, args)
    else:
        job_fragment = ''

    return command_template.format(
        env_vars=env_vars,
        local_parameter_device=benchmark_spec.local_parameter_device,
        batch_size=batch_size,
        model=model,
        data=data_fragment,
        data_name=benchmark_spec.data_name,
        variable_update=benchmark_spec.variable_update,
        distortions=benchmark_spec.distortions,
        device=benchmark_spec.device,
        data_format=benchmark_spec.data_format,
        forward_only=benchmark_spec.forward_only,
        use_fp16=(benchmark_spec.precision == FP16),
        num_gpus=gpu_fragment,
        job_name=job_fragment)
Exemplo n.º 14
0
 def testQueryNumberOfGpus(self):
     """QueryNumberOfGpus returns 8 for a stubbed 'count\\n8' output."""
     fake_vm = mock.MagicMock()
     # RemoteCommand is stubbed to return a (stdout, stderr) pair.
     fake_vm.RemoteCommand = mock.MagicMock(return_value=('count\n8', None))
     gpu_count = nvidia_driver.QueryNumberOfGpus(fake_vm)
     self.assertEqual(8, gpu_count)
Exemplo n.º 15
0
def RunWithVMs(vms, extra_envs=None):
    """Run Horovod on the cluster.

    Builds one MPI launch command on the first VM: the MPI prefix (process
    count, host file, exported environment), the Python interpreter, and a
    model-specific training script with its flags, chosen from the
    horovod_model flag. The command is run on the first VM and its output
    is handed to _MakeSamplesFromOutput.

    Args:
      vms: A list of worker VMs.
      extra_envs: A dictionary of environment variables.

    Returns:
      A list of sample.Sample objects.

    Raises:
      NotImplementedError: if a BERT model is selected without the
        horovod_bert_finetune flag.
    """

    def _RenderFlags(flag_map):
        """Render flags sorted by name; a None value gives a bare '--name'."""
        return ' '.join(
            '--{}'.format(name) if setting is None else '--{}={}'.format(
                name, setting) for name, setting in sorted(flag_map.items()))

    imagenet_dir = 'gs://cloud-ml-nas-public/classification/imagenet'
    convnets_main = (
        'DeepLearningExamples/TensorFlow/Classification/ConvNets/main.py ')

    vm_util.RunThreaded(lambda vm: vm.RemoteCommand('rm -rf /tmp/models'), vms)
    master_vm = vms[0]

    # Total process count: first VM's GPU count times the number of VMs.
    total_gpus = nvidia_driver.QueryNumberOfGpus(master_vm) * len(vms)

    # GCP should work out of the box with the deep learning image but the AWS
    # image requires us to use the correct Tensorflow Python environment.
    on_aws = FLAGS.cloud == 'AWS'
    if on_aws:
        master_vm.RobustRemoteCommand(
            '. anaconda3/bin/activate tensorflow_p37')
    python_interpreter = ('anaconda3/envs/tensorflow_p37/bin/python'
                          if on_aws else '/opt/conda/bin/python')

    # Environment exported to every MPI rank via '-x'. Later sources
    # override earlier ones: defaults, timeline, visible devices, extra NCCL
    # params, then extra_envs.
    mpi_env = {
        'TF_CPP_MIN_LOG_LEVEL': 0,
        'NCCL_SOCKET_IFNAME': '^lo,docker0',
        'NCCL_DEBUG': 'INFO',
    }
    if FLAGS.horovod_timeline:
        mpi_env['HOROVOD_TIMELINE_MARK_CYCLES'] = 1
        mpi_env['HOROVOD_TIMELINE'] = f'{vm_util.VM_TMP_DIR}/timeline.json'
    if FLAGS.nccl_cuda_visible_devices:
        mpi_env['CUDA_VISIBLE_DEVICES'] = FLAGS.nccl_cuda_visible_devices
    if FLAGS.nccl_extra_params:
        for assignment in FLAGS.nccl_extra_params:
            env_name, env_value = assignment.split('=', 1)
            mpi_env[env_name] = env_value
    if extra_envs:
        mpi_env.update(extra_envs)

    exported_env = ' '.join(
        f'-x {name}={setting}' for name, setting in mpi_env.items())

    run_command = ('{mpi} -np {num_gpus} -hostfile {host_file} '
                   '-mca plm_rsh_no_tree_spawn 1 '
                   '--allow-run-as-root '
                   '-bind-to socket -map-by slot '
                   '{nccl_params} '
                   '-mca pml ob1 -mca btl ^openib '
                   '-mca btl_tcp_if_exclude lo,docker0 '
                   '{python} ').format(mpi=FLAGS.nccl_mpi,
                                       num_gpus=total_gpus,
                                       host_file=MACHINEFILE,
                                       python=python_interpreter,
                                       nccl_params=exported_env)

    model_name = FLAGS.horovod_model
    if model_name == 'resnet-50':
        run_flags = {
            'arch': 'resnet50',
            'mode': 'training_benchmark',
            'warmup_steps': 101,
            'results_dir': '/tmp/models',
            'gpu_memory_fraction': 0.95,
            'static_loss_scale': 128,
            'lr_init': 0.016,
            'lr_warmup_epochs': 8,
            'momentum': 0.875,
            'weight_decay': 3.0517578125e-05,
            'iter_unit': 'batch',
            'batch_size': FLAGS.horovod_batch_size,
            'num_iter': FLAGS.horovod_num_steps,
        }
        if FLAGS.horovod_precision == 'fp16':
            # Rendered as a bare '--amp' switch.
            run_flags['amp'] = None

        # Load ImageNet training data from GCS if benchmark is not in synthetic mode
        if not FLAGS.horovod_synthetic:
            run_flags['data_dir'] = imagenet_dir

        run_command += convnets_main + _RenderFlags(run_flags)
    elif model_name == 'resnext-101':
        run_flags = {
            'arch': 'resnext101-32x4d',
            'mode': 'training_benchmark',
            'warmup_steps': 101,
            'results_dir': '/tmp/models',
            'gpu_memory_fraction': 0.95,
            'use_static_loss_scaling': None,
            'loss_scale': 128,
            'lr_init': 0.016,
            'lr_warmup_epochs': 8,
            'momentum': 0.875,
            'weight_decay': 3.0517578125e-05,
            'weight_init': 'fan_in',
            'iter_unit': 'batch',
            'precision': FLAGS.horovod_precision,
            'batch_size': FLAGS.horovod_batch_size,
            'num_iter': FLAGS.horovod_num_steps,
        }

        # Load ImageNet training data from GCS if benchmark is not in synthetic mode
        if not FLAGS.horovod_synthetic:
            run_flags['data_dir'] = imagenet_dir

        run_command += convnets_main + _RenderFlags(run_flags)
    elif model_name.startswith('bert'):  # bert
        if not FLAGS.horovod_bert_finetune:
            raise NotImplementedError('BERT pretraining is not supported.')
        weights_name = ('uncased_L-12_H-768_A-12' if model_name == 'bert-base'
                        else 'uncased_L-24_H-1024_A-16')
        bert_dir = (
            'DeepLearningExamples/TensorFlow/LanguageModeling/BERT/data/'
            'download/google_pretrained_weights/{}').format(weights_name)
        squad_train_file = (
            'DeepLearningExamples/TensorFlow/LanguageModeling/BERT/data/'
            'download/squad/v1.1/train-v1.1.json')
        run_flags = {
            'vocab_file': '{}/vocab.txt'.format(bert_dir),
            'bert_config_file': '{}/bert_config.json'.format(bert_dir),
            'init_checkpoint': '{}/bert_model.ckpt'.format(bert_dir),
            'do_train': None,
            'train_file': squad_train_file,
            'learning_rate': 5e-6,
            'output_dir': '/tmp/models',
            'horovod': None,
            'dllog_path': '/tmp/bert_dllog.json',
            'save_checkpoints_steps': 0,
            'precision': FLAGS.horovod_precision,
            'train_batch_size': FLAGS.horovod_batch_size,
            'num_train_epochs': FLAGS.horovod_num_steps,
            'max_seq_length': FLAGS.horovod_max_seq_len,
            'doc_stride': 64 if FLAGS.horovod_max_seq_len == 128 else 128,
            'amp': FLAGS.horovod_precision == 'fp16',
        }
        run_command += (
            'DeepLearningExamples/TensorFlow/LanguageModeling/BERT/'
            'run_squad.py ') + _RenderFlags(run_flags)
    else:
        # Any other model name runs the tensorpack FasterRCNN example.
        run_command += (
            'tensorpack/examples/FasterRCNN/train.py --config '
            'BACKBONE.WEIGHTS=ImageNet-R50-AlignPadding.npz '
            'DATA.BASEDIR=coco '
            'TRAINER=horovod '
            'TRAIN.EVAL_PERIOD=0 '
            # LR_SCHEDULE means equivalent steps when the total batch size is 8.
            'TRAIN.LR_SCHEDULE="[{step}, {step}, {step}]" '
            '--logdir {log_dir}/maskrcnn ').format(
                log_dir=vm_util.VM_TMP_DIR,
                step=FLAGS.horovod_num_steps * total_gpus // 8)

    stdout, stderr = master_vm.RobustRemoteCommand(run_command,
                                                   should_log=True)

    if FLAGS.horovod_timeline:
        master_vm.PullFile(vm_util.GetTempDir(),
                           '{}/timeline.json'.format(vm_util.VM_TMP_DIR))
    return _MakeSamplesFromOutput(vms, stdout, stderr)
def Run(benchmark_spec):
  """Run ResNet on the cluster.

  Builds a base command for either the TPU script (resnet_main.py) or the
  non-TPU script (imagenet_main.py), then walks the training schedule in
  increments of steps_per_eval up to train_steps, running a train and/or
  eval command on the first VM at each increment depending on the spec's
  mode. Only the time spent in train commands is accumulated into the
  elapsed seconds reported with the samples.

  Note that on the non-TPU path the train command carries the step count as
  --max_train_steps and the eval command carries no step count; the
  '--train_steps' variant of the command is used on the TPU path only.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  on_tpu = bool(benchmark_spec.tpus)

  if on_tpu:
    tpu_template = ('{env_cmd} && '
                    'cd tpu/models && '
                    'export PYTHONPATH=$(pwd) &&'
                    'cd official/resnet && '
                    'python {script} '
                    '--use_tpu={use_tpu} '
                    '--data_dir={data_dir} '
                    '--model_dir={model_dir} '
                    '--resnet_depth={depth} '
                    '--train_batch_size={train_batch_size} '
                    '--eval_batch_size={eval_batch_size} '
                    '--iterations_per_loop={iterations} '
                    '--data_format={data_format} '
                    '--precision={precision} '
                    '--skip_host_call={skip_host_call} '
                    '--num_train_images={num_train_images} '
                    '--num_eval_images={num_eval_images}')
    base_cmd = tpu_template.format(
        env_cmd=benchmark_spec.env_cmd,
        script='resnet_main.py',
        use_tpu=on_tpu,
        data_dir=benchmark_spec.data_dir,
        model_dir=benchmark_spec.model_dir,
        depth=benchmark_spec.depth,
        train_batch_size=benchmark_spec.train_batch_size,
        eval_batch_size=benchmark_spec.eval_batch_size,
        iterations=benchmark_spec.iterations,
        data_format=benchmark_spec.data_format,
        precision=benchmark_spec.precision,
        skip_host_call=benchmark_spec.skip_host_call,
        num_train_images=benchmark_spec.num_train_images,
        num_eval_images=benchmark_spec.num_eval_images)
  else:
    host_template = ('{env_cmd} && '
                     'cd models && '
                     'export PYTHONPATH=$(pwd) && '
                     'cd official/r1/resnet && '
                     'python {script} '
                     '--data_dir=/data/imagenet '
                     '--model_dir={model_dir} '
                     '--resnet_size={resnet_size} '
                     '--batch_size={batch_size} '
                     '--data_format={data_format} ')
    base_cmd = host_template.format(
        env_cmd=benchmark_spec.env_cmd,
        script='imagenet_main.py',
        model_dir=benchmark_spec.model_dir,
        resnet_size=benchmark_spec.depth,
        batch_size=benchmark_spec.train_batch_size,
        data_format=benchmark_spec.data_format)

    # A 'bfloat16' precision maps to --dtype=fp16; anything else to fp32.
    precision_text = '{precision}'.format(precision=benchmark_spec.precision)
    dtype_template = ('{cmd} --dtype=fp16' if precision_text == 'bfloat16'
                      else '{cmd} --dtype=fp32')
    base_cmd = dtype_template.format(cmd=base_cmd)

    if nvidia_driver.CheckNvidiaGpuExists(vm):
      base_cmd = '{env} {cmd} --num_gpus={num_gpus}'.format(
          env=tensorflow.GetEnvironmentVars(vm),
          cmd=base_cmd,
          num_gpus=nvidia_driver.QueryNumberOfGpus(vm))

  samples = []
  metadata = _CreateMetadataDict(benchmark_spec)
  elapsed_seconds = 0
  steps_per_eval = benchmark_spec.steps_per_eval
  train_steps = benchmark_spec.train_steps

  for boundary in range(steps_per_eval, train_steps + steps_per_eval,
                        steps_per_eval):
    # The final boundary may overshoot; clamp it to the training budget.
    step = min(boundary, train_steps)
    step_cmd = '{cmd} --train_steps={step}'.format(cmd=base_cmd, step=step)

    if benchmark_spec.mode in ('train', 'train_and_eval'):
      if on_tpu:
        train_group = benchmark_spec.tpu_groups['train']
        train_cmd = '{cmd} --tpu={tpu} --mode=train {num_cores}'.format(
            cmd=step_cmd,
            tpu=train_group.GetName(),
            num_cores='--num_cores={}'.format(train_group.GetNumShards()))
      else:
        train_cmd = ('{cmd} --max_train_steps={max_train_steps} '
                     '--train_epochs={train_epochs} --noeval_only').format(
                         cmd=base_cmd,
                         train_epochs=benchmark_spec.epochs_per_eval,
                         max_train_steps=step)

      started_at = time.time()
      stdout, stderr = vm.RobustRemoteCommand(train_cmd, should_log=True)
      elapsed_seconds += (time.time() - started_at)
      samples.extend(
          mnist_benchmark.MakeSamplesFromTrainOutput(
              metadata, stdout + stderr, elapsed_seconds, step))

    if benchmark_spec.mode in ('train_and_eval', 'eval'):
      if on_tpu:
        eval_group = benchmark_spec.tpu_groups['eval']
        eval_cmd = '{cmd} --tpu={tpu} --mode=eval {num_cores}'.format(
            cmd=step_cmd,
            tpu=eval_group.GetName(),
            num_cores='--num_cores={}'.format(eval_group.GetNumShards()))
      else:
        eval_cmd = '{cmd} --eval_only'.format(cmd=base_cmd)

      # Eval time is intentionally not added to elapsed_seconds.
      stdout, stderr = vm.RobustRemoteCommand(eval_cmd, should_log=True)
      samples.extend(
          MakeSamplesFromEvalOutput(
              metadata,
              stdout + stderr,
              elapsed_seconds,
              use_tpu=on_tpu))
  return samples