def slot_info_to_command(slot_info):
    """
    Build the shell command gloo uses to launch a single job on a slot.

    :param slot_info: host and slot to execute the run command on
    :return: the run command string, prefixed with the Horovod rendezvous
        environment variables for this slot and the exportable user
        environment
    """
    # Rendezvous variables describing this slot's position in the job.
    rendezvous_pairs = (
        ('HOROVOD_HOSTNAME', slot_info.hostname),
        ('HOROVOD_RANK', slot_info.rank),
        ('HOROVOD_SIZE', slot_info.size),
        ('HOROVOD_LOCAL_RANK', slot_info.local_rank),
        ('HOROVOD_LOCAL_SIZE', slot_info.local_size),
        ('HOROVOD_CROSS_RANK', slot_info.cross_rank),
        ('HOROVOD_CROSS_SIZE', slot_info.cross_size),
    )
    # Every assignment, including the last, is followed by a space — this
    # reproduces the original formatting byte-for-byte.
    horovod_rendez_env = ''.join('{}={} '.format(name, value)
                                 for name, value in rendezvous_pairs)

    # Only pass through user environment variables that are safe to export.
    exportable_env = ' '.join('%s=%s' % (key, quote(value))
                              for key, value in env.items()
                              if env_util.is_exportable(key))

    return '{} {} {}'.format(horovod_rendez_env, exportable_env, run_command)
def slot_info_to_command(slot_info):
    """
    Build the shell command gloo uses to launch a single job on a slot.

    :param slot_info: host and slot to execute the run command on
    :return: an ``env ...`` command line carrying the slot's rendezvous
        variables, the exportable user environment, and the run command
    """
    slot_env = create_slot_env_vars(slot_info)
    # str() on each value matches the original's explicit conversion.
    rendezvous = ' '.join('{}={}'.format(name, str(value))
                          for name, value in slot_env.items())

    # Only pass through user environment variables that are safe to export.
    exportable_env = ' '.join('%s=%s' % (key, quote(value))
                              for key, value in env.items()
                              if env_util.is_exportable(key))

    # Launch through `env` so the variables apply to the run command only.
    return 'env {} {} {}'.format(rendezvous, exportable_env, run_command)
def mpi_run(settings, nics, env, command, stdout=None, stderr=None):
    """
    Runs mpi_run.

    Builds an ``mpirun`` command line (handling both Intel MPI and
    Open-MPI-style flag spellings) and executes it, either via a shell
    subprocess or by replacing the current process.

    Args:
        settings: Settings for running MPI.
                  Note: settings.num_proc and settings.hosts must not be None.
        nics: Interfaces to include by MPI.
        env: Environment dictionary to use for running command.
        command: Command and arguments to run as a list of string.
        stdout: Stdout of the mpi process.
                Only used when settings.run_func_mode is True.
        stderr: Stderr of the mpi process.
                Only used when settings.run_func_mode is True.

    Raises:
        Exception: if env is not a dict, if no MPI implementation is found,
            or if Intel MPI is used with differing slot counts per host.
        RuntimeError: if the mpirun subprocess exits with a non-zero code
            (run_func_mode only).
    """
    if env is not None and not isinstance(env, dict):
        raise Exception('env argument must be a dict, not {type}: {env}'
                        .format(type=type(env), env=env))

    # Third return value identifies which MPI implementation was detected.
    mpi_impl_flags, impl_binding_args, mpi = _get_mpi_implementation_flags(settings.tcp_flag, env=env)
    if mpi_impl_flags is None:
        raise Exception(_MPI_NOT_FOUND_ERROR_MSG)

    # Intel MPI spells most options differently from Open MPI; this flag
    # selects the right spelling everywhere below.
    impi = _IMPI_IMPL == mpi

    ssh_args = []
    if settings.ssh_port:
        ssh_args += [f'-p {settings.ssh_port}']
    if settings.ssh_identity_file:
        ssh_args += [f'-i {settings.ssh_identity_file}']

    mpi_ssh_args = ''
    if ssh_args:
        joined_ssh_args = ' '.join(ssh_args)
        # IMPI routes ssh options through its Hydra bootstrap; Open MPI
        # passes them via the plm_rsh MCA parameter.
        mpi_ssh_args = f'-bootstrap=ssh -bootstrap-exec-args \"{joined_ssh_args}\"' if impi else f'-mca plm_rsh_args \"{joined_ssh_args}\"'

    # btl_tcp_if_include is an Open MPI MCA knob, so it is skipped for IMPI.
    tcp_intf_arg = '-mca btl_tcp_if_include {nics}'.format(
        nics=','.join(nics)) if nics and not impi else ''
    # NCCL interface selection: IMPI exports env vars with -genv, Open MPI with -x.
    nccl_socket_intf_arg = '-{opt} NCCL_SOCKET_IFNAME={nics}'.format(
        opt='genv' if impi else 'x',
        nics=','.join(nics)) if nics else ''

    # On large cluster runs (e.g. Summit), we need extra settings to work around OpenMPI issues
    host_names, host_to_slots = hosts.parse_hosts_and_slots(settings.hosts)
    if not impi and host_names and len(host_names) >= _LARGE_CLUSTER_THRESHOLD:
        mpi_impl_flags.append('-mca plm_rsh_no_tree_spawn true')
        mpi_impl_flags.append('-mca plm_rsh_num_concurrent {}'.format(len(host_names)))

    # if user does not specify any hosts, mpirun by default uses local host.
    # There is no need to specify localhost.
    # IMPI takes bare host names via -hosts (slots given separately through
    # -ppn below); Open MPI takes the full host:slots spec via -H.
    hosts_arg = '-{opt} {hosts}'.format(opt='hosts' if impi else 'H',
                                        hosts=','.join(host_names) if host_names and impi else settings.hosts)

    # Default is a single space so '-np {num_proc}{ppn_arg}{hosts_arg}'
    # below stays well-separated even when no -ppn flag is emitted.
    ppn_arg = ' '
    if host_to_slots and impi:
        # IMPI's -ppn applies one slot count to every host, so all hosts
        # must agree; otherwise the user needs a machinefile instead.
        ppn = host_to_slots[host_names[0]]
        for h_name in host_names[1:]:
            if ppn != host_to_slots[h_name]:
                raise Exception('''Different slots in -hosts parameter are not supported in Intel(R) MPI. Use -machinefile <machine_file> for this purpose.''')
        ppn_arg = ' -ppn {} '.format(ppn)

    # --timestamp-output is an Open MPI option; no IMPI equivalent is used here.
    if settings.prefix_output_with_timestamp and not impi:
        mpi_impl_flags.append('--timestamp-output')

    # User-supplied binding args win, except under IMPI where the
    # implementation defaults are used unconditionally.
    binding_args = settings.binding_args if settings.binding_args and not impi else ' '.join(impl_binding_args)

    # -l: IMPI per-line output labeling (presumably rank-prefixed lines —
    # confirm against IMPI docs); the Open MPI flags serve the same role.
    basic_args = '-l' if impi else '--allow-run-as-root --tag-output'

    output = []
    if settings.output_filename:
        output.append('-outfile-pattern' if impi else '--output-filename')
        output.append(settings.output_filename)

    # Open MPI exports each allowed env var with -x; for IMPI nothing is
    # added here (env is inherited by the launched process instead).
    env_list = '' if impi else ' '.join(
        '-x %s' % key for key in sorted(env.keys()) if env_util.is_exportable(key))

    # Pass all the env variables to the mpirun command.
    mpirun_command = (
        'mpirun {basic_args} '
        '-np {num_proc}{ppn_arg}{hosts_arg} '
        '{binding_args} '
        '{mpi_args} '
        '{mpi_ssh_args} '
        '{tcp_intf_arg} '
        '{nccl_socket_intf_arg} '
        '{output_filename_arg} '
        '{env} {extra_mpi_args} {command}'  # expect a lot of environment variables
        .format(basic_args=basic_args,
                num_proc=settings.num_proc,
                ppn_arg=ppn_arg,
                hosts_arg=hosts_arg,
                binding_args=binding_args,
                mpi_args=' '.join(mpi_impl_flags),
                tcp_intf_arg=tcp_intf_arg,
                nccl_socket_intf_arg=nccl_socket_intf_arg,
                mpi_ssh_args=mpi_ssh_args,
                output_filename_arg=' '.join(output),
                env=env_list,
                extra_mpi_args=settings.extra_mpi_args if settings.extra_mpi_args else '',
                command=' '.join(quote(par) for par in command))
    )

    if settings.verbose >= 2:
        print(mpirun_command)

    # we need the driver's PATH and PYTHONPATH in env to run mpirun,
    # env for mpirun is different to env encoded in mpirun_command
    for var in ['PATH', 'PYTHONPATH']:
        if var not in env and var in os.environ:
            # copy env so we do not leak env modifications
            env = copy.copy(env)
            # copy var over from os.environ
            env[var] = os.environ[var]

    # Execute the mpirun command.
    if settings.run_func_mode:
        # Run as a child shell so stdout/stderr can be captured.
        exit_code = safe_shell_exec.execute(mpirun_command, env=env, stdout=stdout, stderr=stderr)
        if exit_code != 0:
            raise RuntimeError("mpirun failed with exit code {exit_code}".format(exit_code=exit_code))
    else:
        # Replace the current process image; this call does not return.
        os.execve('/bin/sh', ['/bin/sh', '-c', mpirun_command], env)
def mpi_run(settings, nics, env, command, stdout=None, stderr=None):
    """
    Runs mpi_run.

    Builds an Open-MPI-style ``mpirun`` command line and executes it,
    either via a shell subprocess or by replacing the current process.

    Args:
        settings: Settings for running MPI.
                  Note: settings.num_proc and settings.hosts must not be None.
        nics: Interfaces to include by MPI.
        env: Environment dictionary to use for running command.
        command: Command and arguments to run as a list of string.
        stdout: Stdout of the mpi process.
                Only used when settings.run_func_mode is True.
        stderr: Stderr of the mpi process.
                Only used when settings.run_func_mode is True.

    Raises:
        Exception: if env is not a dict or no MPI implementation is found.
        RuntimeError: if the mpirun subprocess exits with a non-zero code
            (run_func_mode only).
    """
    if env is not None and not isinstance(env, dict):
        raise Exception(
            'env argument must be a dict, not {type}: {env}'.format(
                type=type(env), env=env))

    mpi_impl_flags, impl_binding_args = _get_mpi_implementation_flags(
        settings.tcp_flag, env=env)
    if mpi_impl_flags is None:
        raise Exception(_MPI_NOT_FOUND_ERROR_MSG)

    # Custom ssh port is forwarded to the remote-shell launcher via MCA.
    ssh_port_arg = '-mca plm_rsh_args \"-p {ssh_port}\"'.format(
        ssh_port=settings.ssh_port) if settings.ssh_port else ''

    # if user does not specify any hosts, mpirun by default uses local host.
    # There is no need to specify localhost.
    hosts_arg = '-H {hosts}'.format(hosts=settings.hosts)

    # Restrict MPI's TCP transport and NCCL to the requested interfaces.
    tcp_intf_arg = '-mca btl_tcp_if_include {nics}'.format(
        nics=','.join(nics)) if nics else ''
    nccl_socket_intf_arg = '-x NCCL_SOCKET_IFNAME={nics}'.format(
        nics=','.join(nics)) if nics else ''

    # On large cluster runs (e.g. Summit), we need extra settings to work around OpenMPI issues
    host_names, _ = hosts.parse_hosts_and_slots(settings.hosts)
    if host_names and len(host_names) >= _LARGE_CLUSTER_THRESHOLD:
        mpi_impl_flags.append('-mca plm_rsh_no_tree_spawn true')
        mpi_impl_flags.append('-mca plm_rsh_num_concurrent {}'.format(
            len(host_names)))

    # User-supplied binding args take precedence over implementation defaults.
    binding_args = settings.binding_args if settings.binding_args else ' '.join(
        impl_binding_args)

    # Pass all the env variables to the mpirun command.
    mpirun_command = (
        'mpirun --allow-run-as-root --tag-output '
        '-np {num_proc} {hosts_arg} '
        '{binding_args} '
        '{mpi_args} '
        '{ssh_port_arg} '
        '{tcp_intf_arg} '
        '{nccl_socket_intf_arg} '
        '{output_filename_arg} '
        '{env} {extra_mpi_args} {command}'  # expect a lot of environment variables
        .format(num_proc=settings.num_proc,
                hosts_arg=hosts_arg,
                binding_args=binding_args,
                mpi_args=' '.join(mpi_impl_flags),
                tcp_intf_arg=tcp_intf_arg,
                nccl_socket_intf_arg=nccl_socket_intf_arg,
                ssh_port_arg=ssh_port_arg,
                output_filename_arg='--output-filename ' + settings.output_filename
                if settings.output_filename else '',
                # Export only env vars that are safe to pass through, with -x.
                env=' '.join('-x %s' % key for key in sorted(env.keys())
                             if env_util.is_exportable(key)),
                extra_mpi_args=settings.extra_mpi_args if settings.extra_mpi_args else '',
                command=' '.join(quote(par) for par in command)))

    if settings.verbose >= 2:
        print(mpirun_command)

    # we need the driver's PATH and PYTHONPATH in env to run mpirun,
    # env for mpirun is different to env encoded in mpirun_command
    for var in ['PATH', 'PYTHONPATH']:
        if var not in env and var in os.environ:
            # copy env so we do not leak env modifications
            env = copy.copy(env)
            # copy var over from os.environ
            env[var] = os.environ[var]

    # Execute the mpirun command.
    if settings.run_func_mode:
        # Run as a child shell so stdout/stderr can be captured.
        exit_code = safe_shell_exec.execute(mpirun_command, env=env, stdout=stdout, stderr=stderr)
        if exit_code != 0:
            raise RuntimeError(
                "mpirun failed with exit code {exit_code}".format(
                    exit_code=exit_code))
    else:
        # Replace the current process image; this call does not return.
        os.execve('/bin/sh', ['/bin/sh', '-c', mpirun_command], env)