def AddUser(vm: virtual_machine.BaseVirtualMachine) -> None:
  """Run Docker as a non-root user.

  https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user

  Args:
    vm: The VM to work on
  """
  # The docker group may already exist (e.g. created by the package install),
  # so tolerate failure here.
  vm.RemoteCommand('sudo groupadd docker', ignore_failure=True)
  # Put the benchmark user into the docker group.
  vm.RemoteCommand(f'sudo usermod -aG docker {vm.user_name}')
  # Kill the user's sessions so group membership is re-evaluated on the
  # next login; the user may have no live sessions, hence ignore_failure.
  vm.RemoteCommand(f'pkill -KILL -u {vm.user_name}', ignore_failure=True)
def _CollectGpuSamples(
    vm: virtual_machine.BaseVirtualMachine) -> List[sample.Sample]:
  """Run XGBoost on the cluster.

  Args:
    vm: The virtual machine to run the benchmark.

  Returns:
    A list of sample.Sample objects (empty if the benchmark command failed).
  """
  cmd = [
      f'{FLAGS.xgboost_env}',
      'python3',
      f'{linux_packages.INSTALL_DIR}/xgboost/tests/benchmark/benchmark_tree.py',
      f'--tree_method={_TREE_METHOD.value}',
      f'--sparsity={_SPARSITY.value}',
      f'--rows={_ROWS.value}',
      f'--columns={_COLUMNS.value}',
      f'--iterations={_ITERATIONS.value}',
      f'--test_size={_TEST_SIZE.value}',
  ]
  if _PARAMS.value:
    cmd.append(f'--params="{_PARAMS.value}"')
  metadata = _MetadataFromFlags(vm)
  metadata.update(cuda_toolkit.GetMetadata(vm))
  metadata['command'] = ' '.join(cmd)

  stdout, stderr, exit_code = vm.RemoteCommandWithReturnCode(
      metadata['command'], ignore_failure=True)
  if exit_code:
    # The benchmark failed: there is no 'Train Time' line to parse, so
    # return no samples instead of letting ExtractFloat raise below.
    logging.warning('Error with getting XGBoost stats: %s', stderr)
    return []
  training_time = regex_util.ExtractFloat(
      r'Train Time: ([\d\.]+) seconds', stdout)
  # Wrap in a list to honor the declared List[sample.Sample] return type
  # (the original returned a bare Sample).
  return [sample.Sample('training_time', training_time, 'seconds', metadata)]
def _CollectGpuSamples(
    vm: virtual_machine.BaseVirtualMachine) -> List[sample.Sample]:
  """Run CUDA memcopy on the cluster.

  Args:
    vm: The virtual machine to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  # Nothing to measure on a VM without an NVIDIA GPU and nvidia-smi.
  if not nvidia_driver.CheckNvidiaGpuExists(vm):
    return []
  if not nvidia_driver.CheckNvidiaSmiExists(vm):
    return []

  base_metadata = _MetadataFromFlags()
  base_metadata.update(cuda_toolkit.GetMetadata(vm))

  base_cmd = [
      BANDWIDTH_TEST_PATH,
      '--csv',
      f'--memory={_MEMORY.value}',
      f'--mode={_MODE.value}',
  ]
  # Optional transfer-direction / write-combining switches.
  for flag, option in ((_HTOD, '--htod'), (_DTOH, '--dtoh'),
                       (_DTOD, '--dtod'), (_WC, '--wc')):
    if flag.value:
      base_cmd.append(option)

  gpu_count = nvidia_driver.QueryNumberOfGpus(vm)
  devices = list(range(gpu_count))
  if gpu_count > 1:
    # Also measure aggregate bandwidth across all devices.
    devices.append('all')

  samples = []
  for device in devices:
    device_cmd = ' '.join(base_cmd + [f'--device={device}'])
    stdout, stderr, exit_code = vm.RemoteCommandWithReturnCode(
        device_cmd, ignore_failure=True)
    if exit_code:
      # Skip this device but keep collecting from the others.
      logging.warning('Error with getting GPU stats: %s', stderr)
      continue
    matches = regex_util.ExtractAllMatches(
        r'bandwidthTest-(\S+), '
        r'Bandwidth = ([\d\.]+) (\S+), '
        r'Time = ([\d\.]+) s, '
        r'Size = (\d+) bytes, '
        r'NumDevsUsed = (\d+)', stdout)
    for metric, bandwidth, unit, time, size, num_devs_used in matches:
      metadata = {
          'time': float(time),
          'size': int(size),
          'NumDevsUsed': num_devs_used,
          'device': device,
          'command': device_cmd,
      }
      # Applied second so the run-wide metadata wins on any key collision,
      # matching the original update order.
      metadata.update(base_metadata)
      samples.append(sample.Sample(metric, float(bandwidth), unit, metadata))
  return samples
def _RunGpuPingpong(vm: virtual_machine.BaseVirtualMachine,
                    addr: str) -> List[Tuple[float, float]]:
  """Returns the Ping and Pong latency times."""
  stdout, stderr = vm.RemoteCommand(
      f'{_ENV.value} python {_TEST_SCRIPT} {addr}')
  # The script may report on either stream; search both together.
  output = stdout + stderr
  pings = [float(v)
           for v in regex_util.ExtractAllMatches(_TIMELINE_PING, output)]
  pongs = [float(v)
           for v in regex_util.ExtractAllMatches(_TIMELINE_PONG, output)]
  return list(zip(pings, pongs))
def ParseCsvResultsIntoMetadata(vm: virtual_machine.BaseVirtualMachine,
                                path: str) -> List[Dict[str, Any]]:
  """Loads the CSV created by cloud harmony at path in the VM into metadata.

  The CSV located by path inside of virtual machine VM will be loaded. For
  each row of results, a set of key/value pairs is created. The keys will
  all be prepended with `cloudharmony` or similar.

  Args:
    vm: the Virtual Machine that has run a cloud harmony benchmark
    path: The path inside of VM which has the CSV file which should be loaded

  Returns:
    A list of metadata outputs that should be appended to the samples that
    are produced by a cloud harmony benchmark.
  """
  # Read the whole file over the remote shell, then delegate parsing.
  csv_string, _ = vm.RemoteCommand(f'cat {path}')
  return ParseCsvResultsFromString(csv_string)