def get_allocation_info():
    """Return the CSM allocation info, querying and caching it on first use.

    When ``LSFUtils._csm_allocation_info`` is still falsy, the allocation
    named by the ``CSM_ALLOCATION_ID`` environment variable is queried and
    the YAML output is stored on ``LSFUtils``. The first compute node is then
    queried to add ``compute_node_cores`` and ``compute_node_gpus``, and the
    ``compute_nodes`` list is sorted in place.

    Raises:
        KeyError: if ``CSM_ALLOCATION_ID`` is not set in the environment.
        RuntimeError: if either query command exits with a non-zero code.
    """
    if LSFUtils._csm_allocation_info:
        return LSFUtils._csm_allocation_info

    allocation_id = os.environ["CSM_ALLOCATION_ID"].strip()
    allocation_cmd = "{cmd} -a {allocation}".format(
        cmd=LSFUtils._CSM_ALLOCATION_QUERY, allocation=allocation_id)
    captured = io.StringIO()
    status = safe_shell_exec.execute(allocation_cmd,
                                     stdout=captured, stderr=captured)
    if status != 0:
        raise RuntimeError(
            "{cmd} failed with exit code {exit_code}".format(
                cmd=LSFUtils._CSM_ALLOCATION_QUERY, exit_code=status))
    # Stored on the class before the node query, so it stays set even if the
    # node query below fails.
    LSFUtils._csm_allocation_info = yaml.safe_load(captured.getvalue())
    info = LSFUtils._csm_allocation_info

    # Fetch the total number of cores and gpus for the first host
    node_cmd = "{cmd} -n {node}".format(
        cmd=LSFUtils._CSM_NODE_QUERY, node=info["compute_nodes"][0])
    captured = io.StringIO()
    status = safe_shell_exec.execute(node_cmd,
                                     stdout=captured, stderr=captured)
    if status != 0:
        raise RuntimeError(
            "{cmd} failed with exit code {exit_code}".format(
                cmd=LSFUtils._CSM_NODE_QUERY, exit_code=status))
    record = yaml.safe_load(captured.getvalue())["Record_1"]

    # Usable cores = discovered cores minus isolated_cores per socket.
    discovered_cores = int(record["discovered_cores"])
    discovered_sockets = int(record["discovered_sockets"])
    info["compute_node_cores"] = (
        discovered_cores - discovered_sockets * info["isolated_cores"])
    info["compute_node_gpus"] = int(record["discovered_gpus"])

    # Sorting LSF hostnames
    info["compute_nodes"].sort()
    return info
def do_test_run_with_controller_failure(self, controller, mode, run):
    """Check that a failing workload is reported as a failure.

    Args:
        controller: controller name; ``'mpi'`` selects the mpirun error
            message and, combined with ``run == 'cmd'``, the exec-inspection
            path.
        mode: forwarded to ``self.horovod_args``.
        run: ``'func'`` runs a function that calls ``fn(0)``; ``'cmd'`` runs
            the shell command ``false``. Any other value fails the test.
    """
    if run == 'func':
        command, run_func = None, (lambda: fn(0))
    elif run == 'cmd':
        command, run_func = 'false', None
    else:
        self.fail('unknown run argument {}'.format(run))

    exception = (
        'mpirun failed with exit code 1'
        if controller == 'mpi' else
        'Horovod detected that one or more processes exited with non-zero status'
    )

    with self.horovod_args(mode, controller=controller,
                           run_func=run_func, command=command) as (hargs, exec_mock):
        if controller != 'mpi' or run != 'cmd':
            with pytest.raises(RuntimeError, match=exception):
                _run(hargs)
        else:
            # For mpi + cmd, _run returns None and the captured exec call is
            # inspected instead (presumably a patched os.execve - confirm
            # against horovod_args).
            self.assertIsNone(_run(hargs))
            exec_mock.assert_called_once()
            positional, _ = exec_mock.call_args
            executable, argv, env = positional
            self.assertEqual('/bin/sh', executable)
            self.assertEqual(3, len(argv))
            self.assertEqual('/bin/sh', argv[0])
            self.assertEqual('-c', argv[1])
            # Actually run the captured shell command; it must fail.
            exit_code = safe_shell_exec.execute(argv[2], env)
            self.assertEqual(1, exit_code)
def do_test_run_with_controller_success(self, controller, mode, run):
    """Check that a succeeding workload is reported as a success.

    Args:
        controller: controller name; ``'mpi'`` combined with ``run == 'cmd'``
            selects the exec-inspection path.
        mode: forwarded to ``self.horovod_args``.
        run: ``'func'`` runs ``fn``; ``'cmd'`` runs the shell command
            ``true``. Any other value fails the test.
    """
    if run == 'func':
        command, run_func = None, fn
    elif run == 'cmd':
        command, run_func = 'true', None
    else:
        self.fail('unknown run argument {}'.format(run))

    with self.horovod_args(mode, controller,
                           run_func=run_func, command=command) as (hargs, exec_mock):
        if controller != 'mpi' or run != 'cmd':
            actual = _run(hargs)
            # In function mode one (rank, np) tuple per rank is expected;
            # in command mode _run is expected to return None.
            if run == 'func':
                expected = [(rank, hargs.np) for rank in range(hargs.np)]
            else:
                expected = None
            self.assertEqual(expected, actual)
        else:
            # For mpi + cmd, _run returns None and the captured exec call is
            # inspected instead (presumably a patched os.execve - confirm
            # against horovod_args).
            self.assertIsNone(_run(hargs))
            exec_mock.assert_called_once()
            positional, _ = exec_mock.call_args
            executable, argv, env = positional
            self.assertEqual('/bin/sh', executable)
            self.assertEqual(3, len(argv))
            self.assertEqual('/bin/sh', argv[0])
            self.assertEqual('-c', argv[1])
            # Actually run the captured shell command; it must succeed.
            exit_code = safe_shell_exec.execute(argv[2], env)
            self.assertEqual(0, exit_code)
def _exec_command(command, slot_info, events):
    """Run ``command`` for one slot and report its exit code.

    If the slot's host does not resolve to a local address, the command is
    wrapped so that it changes into the current working directory and runs
    through ``get_remote_command``. When ``settings.output_filename`` is set,
    output is written both to this process's stdout/stderr and to
    ``<output_filename>/rank.<padded rank>/stdout`` and ``.../stderr``.

    Args:
        command: shell command to run.
        slot_info: object providing ``rank`` and ``hostname``.
        events: forwarded to ``safe_shell_exec.execute``.

    Returns:
        Tuple ``(exit_code, time.time())``. ``exit_code`` is 1 when
        ``safe_shell_exec.execute`` raised an exception.
    """
    rank = slot_info.rank
    host_name = slot_info.hostname

    host_address = network.resolve_host_address(host_name)
    local_addresses = network.get_local_host_addresses()
    if host_address not in local_addresses:
        in_cwd = 'cd {pwd} > /dev/null 2>&1 ; {command}'.format(
            pwd=os.getcwd(), command=command)
        command = get_remote_command(quote(in_cwd),
                                     host=host_name,
                                     port=settings.ssh_port,
                                     identity_file=settings.ssh_identity_file)

    if settings.verbose:
        print(command)

    # Redirect output if requested
    stdout = stderr = None
    stdout_file = stderr_file = None
    if settings.output_filename:
        rank_dir = os.path.join(
            settings.output_filename,
            'rank.{rank}'.format(rank=_pad_rank(rank, settings.num_proc)))
        if not os.path.exists(rank_dir):
            os.mkdir(rank_dir)

        stdout_file = open(os.path.join(rank_dir, 'stdout'), 'w')
        stderr_file = open(os.path.join(rank_dir, 'stderr'), 'w')

        # Tee: keep writing to the console while also capturing to files.
        stdout = MultiFile([sys.stdout, stdout_file])
        stderr = MultiFile([sys.stderr, stderr_file])

    try:
        exit_code = safe_shell_exec.execute(
            command,
            index=rank,
            stdout=stdout,
            stderr=stderr,
            events=events,
            prefix_output_with_timestamp=settings.prefix_output_with_timestamp)
        if exit_code != 0:
            print('Process {idx} exit with status code {ec}.'.format(
                idx=rank, ec=exit_code))
    except Exception as e:
        # Any failure to launch or run is reported as exit code 1.
        print('Exception happened during safe_shell_exec, exception '
              'message: {message}'.format(message=e))
        exit_code = 1
    finally:
        for handle in (stdout_file, stderr_file):
            if handle:
                handle.close()
    return exit_code, time.time()
def _run(self, cmd, env):
    """Run ``cmd`` with ``env`` and capture its output.

    Returns:
        Tuple ``(exit_code, stdout_text, stderr_text)``.
    """
    # Both buffers are closed when the block exits, including on error.
    with io.StringIO() as out_buf, io.StringIO() as err_buf:
        exit_code = safe_shell_exec.execute(cmd, env=env,
                                            stdout=out_buf, stderr=err_buf)
        return exit_code, out_buf.getvalue(), err_buf.getvalue()
def _execute_discovery_script(self):
    """Run ``self._discovery_script`` and return what it wrote to stdout.

    Raises:
        RuntimeError: if the script exits with a non-zero code.
    """
    captured = io.StringIO()
    status = safe_shell_exec.execute(self._discovery_script, stdout=captured)
    if status == 0:
        return captured.getvalue()

    raise RuntimeError(
        'Failed to execute discovery script: {}. Exit code: {}'.format(
            self._discovery_script, status))
def do_test_safe_shell_exec(self, cmd, expected_exit_code, expected_stdout, expected_stderr, event=None):
    """Run ``cmd`` through ``safe_shell_exec.execute`` and check the outcome.

    Args:
        cmd: command to execute.
        expected_exit_code: exit code the command must return.
        expected_stdout: exact expected stdout text, or ``None`` to skip
            the stdout check.
        expected_stderr: exact expected stderr text, or ``None`` to skip
            the stderr check.
        event: optional event; when truthy it is passed as the single
            element of ``events``.
    """
    out_buf = io.StringIO()
    err_buf = io.StringIO()
    events = [event] if event else None

    exit_code = safe_shell_exec.execute(cmd, stdout=out_buf, stderr=err_buf,
                                        events=events)
    self.assertEqual(expected_exit_code, exit_code)

    for expected, buf in ((expected_stdout, out_buf), (expected_stderr, err_buf)):
        if expected is not None:
            self.assertEqual(expected, buf.getvalue())
def get_num_threads():
    """Return the thread count reported by lscpu on the first compute host.

    The lscpu command is wrapped by ``get_ssh_command`` for the first host
    from ``LSFUtils.get_compute_hosts()``. Its combined stdout/stderr is
    parsed as YAML and the ``LSFUtils._THREAD_KEY`` entry is converted to int.

    Raises:
        RuntimeError: if the command exits with a non-zero code.
    """
    lscpu_cmd = get_ssh_command(LSFUtils._LSCPU_CMD,
                                host=LSFUtils.get_compute_hosts()[0])
    captured = io.StringIO()
    status = safe_shell_exec.execute(lscpu_cmd, stdout=captured, stderr=captured)
    if status != 0:
        message = "{cmd} failed with exit code {exit_code}".format(
            cmd=lscpu_cmd, exit_code=status)
        raise RuntimeError(message)

    parsed = yaml.safe_load(captured.getvalue())
    return int(parsed[LSFUtils._THREAD_KEY])
def get_num_threads():
    """Return the thread count reported by lscpu on the first compute host.

    The lscpu command is run over ssh, with password authentication and
    strict host key checking disabled, on the first host from
    ``LSFUtils.get_compute_hosts()``. Its combined stdout/stderr is parsed
    as YAML and the ``LSFUtils._THREAD_KEY`` entry is converted to int.

    Raises:
        RuntimeError: if the command exits with a non-zero code.
    """
    ssh_template = ('ssh -o PasswordAuthentication=no -o StrictHostKeyChecking=no '
                    '{host} {cmd}')
    lscpu_cmd = ssh_template.format(host=LSFUtils.get_compute_hosts()[0],
                                    cmd=LSFUtils._LSCPU_CMD)

    captured = io.StringIO()
    status = safe_shell_exec.execute(lscpu_cmd, stdout=captured, stderr=captured)
    if status != 0:
        message = "{cmd} failed with exit code {exit_code}".format(
            cmd=lscpu_cmd, exit_code=status)
        raise RuntimeError(message)

    parsed = yaml.safe_load(captured.getvalue())
    return int(parsed[LSFUtils._THREAD_KEY])
def _exec_command(command):
    """Run ``command``; terminate this process if it fails.

    stdout and stderr of the command are captured together. On a non-zero
    exit code the captured output is printed and the current process exits
    immediately via ``os._exit`` with that code.

    Returns:
        The exit code (reached only when it is 0).
    """
    with io.StringIO() as host_output:
        exit_code = safe_shell_exec.execute(command,
                                            stdout=host_output,
                                            stderr=host_output)
        if exit_code != 0:
            failure_report = ('Launching horovod task function was not '
                              'successful:\n{host_output}'.format(
                                  host_output=host_output.getvalue()))
            print(failure_report)
            os._exit(exit_code)
    return exit_code
def exec_command(command):
    """Run ``command``, retrying up to ``SSH_ATTEMPTS`` times until it succeeds.

    Returns:
        Tuple ``(exit_code, output_msg)``. ``exit_code`` is that of the last
        attempt (1 if no attempt ran). ``output_msg`` is the combined
        stdout/stderr of the most recent *failed* attempt, or ``''`` when no
        attempt failed.
    """
    exit_code, output_msg = 1, ''
    for _ in range(SSH_ATTEMPTS):
        # A fresh buffer per attempt, closed when the attempt finishes.
        with io.StringIO() as attempt_output:
            exit_code = safe_shell_exec.execute(command,
                                                stdout=attempt_output,
                                                stderr=attempt_output)
            if exit_code == 0:
                break
            output_msg = attempt_output.getvalue()
    return exit_code, output_msg
def find_available_hosts_and_slots(self):
    """Run the discovery script and parse its output into a host -> slots map.

    Each non-blank output line is either ``host`` (assigned
    ``self._default_slots``) or ``host:slots``. Surrounding whitespace on a
    line is ignored, and blank lines are skipped, so a script that prints
    nothing yields an empty dict rather than an entry for an empty host name.

    Returns:
        dict mapping host name to its slot count.

    Raises:
        RuntimeError: if the discovery script exits with a non-zero code.
        ValueError: if a line has more than one ``:`` or a non-integer
            slot count.
    """
    stdout = io.StringIO()
    exit_code = safe_shell_exec.execute(self._discovery_script, stdout=stdout)
    if exit_code != 0:
        raise RuntimeError('Failed to execute discovery script: {}. Exit code: {}'
                           .format(self._discovery_script, exit_code))

    # Deduplicate after stripping so trailing '\r' or spaces do not create
    # distinct entries for the same host.
    lines = {line.strip() for line in stdout.getvalue().split('\n')}
    # An empty or blank line is not a host.
    lines.discard('')

    host_slots = {}
    for line in lines:
        if ':' in line:
            host, slots = line.split(':')
            host_slots[host] = int(slots)
        else:
            host_slots[line] = self._default_slots
    return host_slots
def _run(self, cmd, env):
    """Run ``cmd`` and capture its output, keeping ``PATH`` available.

    When ``env`` is not ``None``, the current process's ``PATH`` is used as
    a default that entries in ``env`` may override.

    Returns:
        Tuple ``(exit_code, stdout_text, stderr_text)``.
    """
    # Both buffers are closed when the block exits, including on error.
    with io.StringIO() as out_buf, io.StringIO() as err_buf:
        if env is not None:
            merged = {'PATH': os.environ['PATH']}
            merged.update(env)
            env = merged
        exit_code = safe_shell_exec.execute(cmd, env=env,
                                            stdout=out_buf, stderr=err_buf)
        return exit_code, out_buf.getvalue(), err_buf.getvalue()
def _run_command(self, command, env, event, stdout=None, stderr=None, index=None, prefix_output_with_timestamp=False):
    """Run ``command`` and record its exit code on ``self._command_exit_code``.

    Args:
        command: command to execute.
        env: environment passed to ``safe_shell_exec.execute``.
        event: passed as the single element of ``events``.
        stdout: optional stream for the command's stdout; closed by this
            method once the command has finished.
        stderr: optional stream for the command's stderr; closed by this
            method once the command has finished.
        index: forwarded to ``safe_shell_exec.execute``.
        prefix_output_with_timestamp: forwarded to
            ``safe_shell_exec.execute``.
    """
    try:
        self._command_exit_code = safe_shell_exec.execute(
            command,
            env=env,
            stdout=stdout,
            stderr=stderr,
            index=index,
            prefix_output_with_timestamp=prefix_output_with_timestamp,
            events=[event])
    finally:
        # This method closes the streams it is given; do so even when
        # execute() raises so the handles are not leaked.
        if stdout:
            stdout.close()
        if stderr:
            stderr.close()
def execute(command, env=None):
    """
    Executes the command and returns its combined stdout and stderr as a
    string, together with the exit code.

    :param command: command to execute
    :param env: environment variables to use
    :return: (output, exit code) or None on failure
    """
    # The buffer is closed on every path out of this block.
    with io.StringIO() as captured:
        try:
            status = safe_shell_exec.execute(command, env=env,
                                             stdout=captured, stderr=captured)
            return captured.getvalue(), status
        except Exception:
            # Report the failure on stderr and signal it with None.
            print(traceback.format_exc(), file=sys.stderr)
            return None
def js_run(settings, nics, env, command, stdout=None, stderr=None):
    """
    Runs Horovod with jsrun.

    Builds a single ``jsrun`` shell command from the MPI implementation
    flags, the binding arguments and the user command, then either runs it
    and checks the exit code (``settings.run_func_mode``) or replaces the
    current process with it.

    Args:
        settings: Settings for running jsrun.
                  Note: settings.num_proc and settings.hosts must not be None.
        nics: Interfaces to include by jsrun.
        env: Environment dictionary to use for running jsrun.
             Modified in place: NCCL_SOCKET_IFNAME is added when nics is
             non-empty and the key is not already present.
        command: Command and arguments to run as a list of string.
        stdout: Stdout of the mpi process.
                Only used when settings.run_func_mode is True.
        stderr: Stderr of the mpi process.
                Only used when settings.run_func_mode is True.

    Raises:
        Exception: if no MPI implementation flags are found or jsrun is
                   not installed.
        RuntimeError: in run_func_mode, if jsrun exits with a non-zero code.
    """
    mpi_impl_flags, _ = _get_mpi_implementation_flags(settings.tcp_flag, env=env)
    if mpi_impl_flags is None:
        raise Exception(_MPI_NOT_FOUND_ERROR_MSG)

    if not is_jsrun_installed():
        raise Exception(
            'horovod does not find the jsrun command.\n\n'
            'Please, make sure you are running on a cluster with jsrun installed or '
            'use one of the other launchers.')

    # Do not override an NCCL_SOCKET_IFNAME the caller already set.
    if nics and 'NCCL_SOCKET_IFNAME' not in env:
        env['NCCL_SOCKET_IFNAME'] = ','.join(nics)

    # Everything passed via --smpiargs: implementation flags plus user extras.
    smpiargs = ' '.join(mpi_impl_flags)
    if settings.extra_mpi_args:
        smpiargs += ' ' + settings.extra_mpi_args

    # Explicit binding args win; otherwise a rank file is generated and used.
    if settings.binding_args:
        binding_args = settings.binding_args
    else:
        rf = generate_jsrun_rankfile(settings)
        if settings.verbose >= 2:
            # Show the generated rank file for debugging.
            safe_shell_exec.execute('cat {rf}'.format(rf=rf))
        binding_args = '--erf_input {rf}'.format(rf=rf)

    # Optional pieces collapse to '' so the template stays fixed.
    jsrun_command = (
        'jsrun {binding_args} '
        '{output_filename_arg} '
        '{smpiargs} '
        '{command}'.format(
            binding_args=binding_args,
            output_filename_arg='--stdio_stderr {file} --stdio_stdout {file}'.
            format(file=settings.output_filename)
            if settings.output_filename else '',
            smpiargs='--smpiargs {args}'.format(
                args=quote(smpiargs)) if smpiargs else '',
            command=' '.join(quote(par) for par in command)))

    if settings.verbose >= 2:
        print(jsrun_command)

    # Execute the jsrun command.
    if settings.run_func_mode:
        exit_code = safe_shell_exec.execute(jsrun_command, env=env, stdout=stdout, stderr=stderr)
        if exit_code != 0:
            raise RuntimeError(
                "jsrun failed with exit code {exit_code}".format(
                    exit_code=exit_code))
    else:
        # os.execve replaces the current process; it does not return on success.
        os.execve('/bin/sh', ['/bin/sh', '-c', jsrun_command], env)
def mpi_run(settings, nics, env, command, stdout=None, stderr=None):
    """
    Runs mpi_run.

    Builds a single ``mpirun`` shell command, choosing between two flag
    dialects depending on whether the detected implementation equals
    ``_IMPI_IMPL`` (the ``impi`` flag below), then either runs it and checks
    the exit code (``settings.run_func_mode``) or replaces the current
    process with it.

    Args:
        settings: Settings for running MPI.
                  Note: settings.num_proc and settings.hosts must not be None.
        nics: Interfaces to include by MPI.
        env: Environment dictionary to use for running command.
        command: Command and arguments to run as a list of string.
        stdout: Stdout of the mpi process.
                Only used when settings.run_func_mode is True.
        stderr: Stderr of the mpi process.
                Only used when settings.run_func_mode is True.

    Raises:
        Exception: if env is neither None nor a dict, if no MPI
                   implementation flags are found, or if hosts have differing
                   slot counts under the impi dialect.
        RuntimeError: in run_func_mode, if mpirun exits with a non-zero code.
    """
    # NOTE(review): env=None passes this check, but `var not in env` further
    # down would raise TypeError for None - confirm callers always pass a dict.
    if env is not None and not isinstance(env, dict):
        raise Exception('env argument must be a dict, not {type}: {env}'
                        .format(type=type(env), env=env))

    mpi_impl_flags, impl_binding_args, mpi = _get_mpi_implementation_flags(settings.tcp_flag, env=env)
    if mpi_impl_flags is None:
        raise Exception(_MPI_NOT_FOUND_ERROR_MSG)

    # True when the detected implementation is _IMPI_IMPL; selects the flag
    # dialect used throughout the rest of this function.
    impi = _IMPI_IMPL == mpi

    # ssh options (port, identity file) forwarded to the MPI launcher.
    ssh_args = []
    if settings.ssh_port:
        ssh_args += [f'-p {settings.ssh_port}']
    if settings.ssh_identity_file:
        ssh_args += [f'-i {settings.ssh_identity_file}']

    mpi_ssh_args = ''
    if ssh_args:
        joined_ssh_args = ' '.join(ssh_args)
        mpi_ssh_args = f'-bootstrap=ssh -bootstrap-exec-args \"{joined_ssh_args}\"' if impi else f'-mca plm_rsh_args \"{joined_ssh_args}\"'

    # The btl_tcp_if_include option is only emitted for the non-impi dialect.
    tcp_intf_arg = '-mca btl_tcp_if_include {nics}'.format(
        nics=','.join(nics)) if nics and not impi else ''
    nccl_socket_intf_arg = '-{opt} NCCL_SOCKET_IFNAME={nics}'.format(
        opt='genv' if impi else 'x',
        nics=','.join(nics)) if nics else ''

    # On large cluster runs (e.g. Summit), we need extra settings to work around OpenMPI issues
    host_names, host_to_slots = hosts.parse_hosts_and_slots(settings.hosts)
    if not impi and host_names and len(host_names) >= _LARGE_CLUSTER_THRESHOLD:
        mpi_impl_flags.append('-mca plm_rsh_no_tree_spawn true')
        mpi_impl_flags.append('-mca plm_rsh_num_concurrent {}'.format(len(host_names)))

    # if user does not specify any hosts, mpirun by default uses local host.
    # There is no need to specify localhost.
    # For impi only the host names are passed (slots go through -ppn below);
    # otherwise settings.hosts is passed through unchanged.
    hosts_arg = '-{opt} {hosts}'.format(opt='hosts' if impi else 'H',
                                        hosts=','.join(host_names) if host_names and impi else settings.hosts)

    # For impi a single processes-per-node value is derived from the first
    # host, so every host must declare the same number of slots.
    ppn_arg = ' '
    if host_to_slots and impi:
        ppn = host_to_slots[host_names[0]]
        for h_name in host_names[1:]:
            if ppn != host_to_slots[h_name]:
                raise Exception('''Different slots in -hosts parameter are not supported in Intel(R) MPI. Use -machinefile <machine_file> for this purpose.''')
        ppn_arg = ' -ppn {} '.format(ppn)

    if settings.prefix_output_with_timestamp and not impi:
        mpi_impl_flags.append('--timestamp-output')

    # User binding args are honoured only for the non-impi dialect.
    binding_args = settings.binding_args if settings.binding_args and not impi else ' '.join(impl_binding_args)

    basic_args = '-l' if impi else '--allow-run-as-root --tag-output'

    output = []
    if settings.output_filename:
        output.append('-outfile-pattern' if impi else '--output-filename')
        output.append(settings.output_filename)

    # For the non-impi dialect, exportable variables are forwarded by name (-x KEY).
    env_list = '' if impi else ' '.join(
        '-x %s' % key for key in sorted(env.keys()) if env_util.is_exportable(key))

    # Pass all the env variables to the mpirun command.
    mpirun_command = (
        'mpirun {basic_args} '
        '-np {num_proc}{ppn_arg}{hosts_arg} '
        '{binding_args} '
        '{mpi_args} '
        '{mpi_ssh_args} '
        '{tcp_intf_arg} '
        '{nccl_socket_intf_arg} '
        '{output_filename_arg} '
        '{env} {extra_mpi_args} {command}'  # expect a lot of environment variables
        .format(basic_args=basic_args,
                num_proc=settings.num_proc,
                ppn_arg=ppn_arg,
                hosts_arg=hosts_arg,
                binding_args=binding_args,
                mpi_args=' '.join(mpi_impl_flags),
                tcp_intf_arg=tcp_intf_arg,
                nccl_socket_intf_arg=nccl_socket_intf_arg,
                mpi_ssh_args=mpi_ssh_args,
                output_filename_arg=' '.join(output),
                env=env_list,
                extra_mpi_args=settings.extra_mpi_args if settings.extra_mpi_args else '',
                command=' '.join(quote(par) for par in command))
    )

    if settings.verbose >= 2:
        print(mpirun_command)

    # we need the driver's PATH and PYTHONPATH in env to run mpirun,
    # env for mpirun is different to env encoded in mpirun_command
    for var in ['PATH', 'PYTHONPATH']:
        if var not in env and var in os.environ:
            # copy env so we do not leak env modifications
            env = copy.copy(env)
            # copy var over from os.environ
            env[var] = os.environ[var]

    # Execute the mpirun command.
    if settings.run_func_mode:
        exit_code = safe_shell_exec.execute(mpirun_command, env=env, stdout=stdout, stderr=stderr)
        if exit_code != 0:
            raise RuntimeError("mpirun failed with exit code {exit_code}".format(exit_code=exit_code))
    else:
        # os.execve replaces the current process; it does not return on success.
        os.execve('/bin/sh', ['/bin/sh', '-c', mpirun_command], env)
def _run_command(self, command, env, event):
    """Run ``command`` with ``env`` and store its exit code.

    ``event`` is passed to ``safe_shell_exec.execute`` as the single element
    of ``events``. The result is saved on ``self._command_exit_code``.
    """
    exit_code = safe_shell_exec.execute(
        command,
        env=env,
        events=[event],
    )
    self._command_exit_code = exit_code
def _exec(cmd):
    """Run ``cmd`` and raise unless it exits with code 0.

    Raises:
        RuntimeError: if the exit code is ``None`` or non-zero.
    """
    exit_code = safe_shell_exec.execute(cmd)
    succeeded = exit_code is not None and exit_code == 0
    if not succeeded:
        message = 'executed command returned non-zero exit code: {}'.format(
            exit_code)
        raise RuntimeError(message)
import sys
import time

from horovod.runner.common.util import safe_shell_exec

# NOTE(review): `os` is used below but is not imported in the visible part of
# this file - confirm it is imported above this point.


class FakeEvent(object):
    """Event stand-in whose ``wait`` only sleeps."""

    def wait(self):
        time.sleep(999)


def write(filename, value):
    """Write ``str(value)`` to ``filename`` via a temporary ``.tmp`` file."""
    tmp_path = filename + '.tmp'
    with open(tmp_path, 'w') as tmp_file:
        tmp_file.write(str(value))

    # Atomic rename to prevent race conditions from reader
    os.rename(tmp_path, filename)


if __name__ == '__main__':
    # argv[1] is the file that receives this process's pid; the remaining
    # arguments are run with the current Python interpreter.
    logfile = sys.argv[1]
    write(logfile, os.getpid())

    interpreter_args = [sys.executable] + sys.argv[2:]
    cmd = ' '.join(interpreter_args)

    # Mock out the event to avoid leaking semaphores
    safe_shell_exec._create_event = lambda ctx: FakeEvent()
    safe_shell_exec.execute(cmd)
def mpi_run(settings, nics, env, command, stdout=None, stderr=None):
    """
    Runs mpi_run.

    Builds a single ``mpirun`` shell command from the MPI implementation
    flags, host list, interface options and the user command, then either
    runs it and checks the exit code (``settings.run_func_mode``) or replaces
    the current process with it.

    Args:
        settings: Settings for running MPI.
                  Note: settings.num_proc and settings.hosts must not be None.
        nics: Interfaces to include by MPI.
        env: Environment dictionary to use for running command.
        command: Command and arguments to run as a list of string.
        stdout: Stdout of the mpi process.
                Only used when settings.run_func_mode is True.
        stderr: Stderr of the mpi process.
                Only used when settings.run_func_mode is True.

    Raises:
        Exception: if env is neither None nor a dict, or if no MPI
                   implementation flags are found.
        RuntimeError: in run_func_mode, if mpirun exits with a non-zero code.
    """
    # NOTE(review): env=None passes this check, but env.keys() further down
    # would fail for None - confirm callers always pass a dict.
    if env is not None and not isinstance(env, dict):
        raise Exception(
            'env argument must be a dict, not {type}: {env}'.format(
                type=type(env), env=env))

    mpi_impl_flags, impl_binding_args = _get_mpi_implementation_flags(
        settings.tcp_flag, env=env)
    if mpi_impl_flags is None:
        raise Exception(_MPI_NOT_FOUND_ERROR_MSG)

    # Forward a custom ssh port to the rsh launcher when one is configured.
    ssh_port_arg = '-mca plm_rsh_args \"-p {ssh_port}\"'.format(
        ssh_port=settings.ssh_port) if settings.ssh_port else ''

    # if user does not specify any hosts, mpirun by default uses local host.
    # There is no need to specify localhost.
    hosts_arg = '-H {hosts}'.format(hosts=settings.hosts)

    tcp_intf_arg = '-mca btl_tcp_if_include {nics}'.format(
        nics=','.join(nics)) if nics else ''
    nccl_socket_intf_arg = '-x NCCL_SOCKET_IFNAME={nics}'.format(
        nics=','.join(nics)) if nics else ''

    # On large cluster runs (e.g. Summit), we need extra settings to work around OpenMPI issues
    host_names, _ = hosts.parse_hosts_and_slots(settings.hosts)
    if host_names and len(host_names) >= _LARGE_CLUSTER_THRESHOLD:
        mpi_impl_flags.append('-mca plm_rsh_no_tree_spawn true')
        mpi_impl_flags.append('-mca plm_rsh_num_concurrent {}'.format(
            len(host_names)))

    # Explicit binding args win over the implementation defaults.
    binding_args = settings.binding_args if settings.binding_args else ' '.join(
        impl_binding_args)

    # Pass all the env variables to the mpirun command.
    # Exportable variables are forwarded by name (-x KEY).
    mpirun_command = (
        'mpirun --allow-run-as-root --tag-output '
        '-np {num_proc} {hosts_arg} '
        '{binding_args} '
        '{mpi_args} '
        '{ssh_port_arg} '
        '{tcp_intf_arg} '
        '{nccl_socket_intf_arg} '
        '{output_filename_arg} '
        '{env} {extra_mpi_args} {command}'  # expect a lot of environment variables
        .format(num_proc=settings.num_proc,
                hosts_arg=hosts_arg,
                binding_args=binding_args,
                mpi_args=' '.join(mpi_impl_flags),
                tcp_intf_arg=tcp_intf_arg,
                nccl_socket_intf_arg=nccl_socket_intf_arg,
                ssh_port_arg=ssh_port_arg,
                output_filename_arg='--output-filename ' + settings.output_filename
                if settings.output_filename else '',
                env=' '.join('-x %s' % key for key in sorted(env.keys())
                             if env_util.is_exportable(key)),
                extra_mpi_args=settings.extra_mpi_args if settings.extra_mpi_args else '',
                command=' '.join(quote(par) for par in command)))

    if settings.verbose >= 2:
        print(mpirun_command)

    # we need the driver's PATH and PYTHONPATH in env to run mpirun,
    # env for mpirun is different to env encoded in mpirun_command
    for var in ['PATH', 'PYTHONPATH']:
        if var not in env and var in os.environ:
            # copy env so we do not leak env modifications
            env = copy.copy(env)
            # copy var over from os.environ
            env[var] = os.environ[var]

    # Execute the mpirun command.
    if settings.run_func_mode:
        exit_code = safe_shell_exec.execute(mpirun_command, env=env, stdout=stdout, stderr=stderr)
        if exit_code != 0:
            raise RuntimeError(
                "mpirun failed with exit code {exit_code}".format(
                    exit_code=exit_code))
    else:
        # os.execve replaces the current process; it does not return on success.
        os.execve('/bin/sh', ['/bin/sh', '-c', mpirun_command], env)