def _check_all_hosts_ssh_successful(host_addresses, ssh_port=None):
    """
    Checks if ssh can successfully be performed to all the hosts.
    :param host_addresses: list of addresses to ssh into. for example,
        ['worker-0','worker-1']
        ['10.11.11.11', '10.11.11.12']
    :type host_addresses: list(strings)
    :param ssh_port: optional non-default port for ssh to use on every host.
    :type ssh_port: int
    :return: Returns True if ssh was successful into all the addresses;
        terminates the process with exit status 1 otherwise.
    """

    def exec_command(command):
        exit_code = 1
        output_msg = ''
        # Retry the ssh command up to SSH_RETRIES times; keep the combined
        # stdout/stderr of the last failing attempt for error reporting.
        for _ in range(SSH_RETRIES):
            output = io.StringIO()
            try:
                exit_code = safe_shell_exec.execute(command,
                                                    stdout=output,
                                                    stderr=output)
                if exit_code == 0:
                    break
                output_msg = output.getvalue()
            finally:
                output.close()
        return exit_code, output_msg

    ssh_port_arg = '-p {ssh_port}'.format(
        ssh_port=ssh_port) if ssh_port else ''

    ssh_command_format = 'ssh -o StrictHostKeyChecking=no {host} {ssh_port_arg} date'
    args_list = [[ssh_command_format.format(host=host_address,
                                            ssh_port_arg=ssh_port_arg)]
                 for host_address in host_addresses]
    # ssh_exit_codes is a map: input index -> (exit_code, output_msg)
    ssh_exit_codes = \
        threads.execute_function_multithreaded(exec_command, args_list)

    ssh_successful_to_all_hosts = True
    for index, ssh_status in ssh_exit_codes.items():
        exit_code, output_msg = ssh_status[0], ssh_status[1]
        if exit_code != 0:
            print("ssh not successful for host {host}:\n{msg_output}".format(
                host=host_addresses[index], msg_output=output_msg))
            ssh_successful_to_all_hosts = False
    if not ssh_successful_to_all_hosts:
        # sys.exit instead of the site builtin exit(): the builtin is meant
        # for interactive sessions and is absent under `python -S`.
        sys.exit(1)
    return True
def _check_all_hosts_ssh_successful(host_addresses, ssh_port=None):
    """
    Verifies that ssh succeeds for every host in `host_addresses`.

    :param host_addresses: list of addresses to ssh into. for example,
        ['worker-0','worker-1']
        ['10.11.11.11', '10.11.11.12']
    :type host_addresses: list(strings)
    :param ssh_port: optional non-default ssh port to use for every host.
    :return: True when ssh succeeded for every address; None when at least
        one host was unreachable (None rather than False so a negative
        result is not cached by callers).
    """

    def try_ssh(command):
        # Attempt the command up to SSH_ATTEMPTS times, keeping the combined
        # stdout/stderr of the last failing attempt for reporting.
        status = 1
        captured = ''
        for _ in range(SSH_ATTEMPTS):
            buf = io.StringIO()
            try:
                status = safe_shell_exec.execute(command,
                                                 stdout=buf,
                                                 stderr=buf)
                if status == 0:
                    break
                captured = buf.getvalue()
            finally:
                buf.close()
        return status, captured

    if ssh_port:
        ssh_port_arg = '-p {ssh_port}'.format(ssh_port=ssh_port)
    else:
        ssh_port_arg = ''

    template = ('ssh -o PasswordAuthentication=no -o StrictHostKeyChecking=no'
                ' {host} {ssh_port_arg} true')
    args_list = [[template.format(host=address, ssh_port_arg=ssh_port_arg)]
                 for address in host_addresses]

    # Map: input index -> (exit_code, output_msg) for every host.
    ssh_exit_codes = \
        threads.execute_function_multithreaded(try_ssh, args_list)

    all_successful = True
    for index, (exit_code, output_msg) in ssh_exit_codes.items():
        if exit_code != 0:
            print('ssh not successful for host {host}:\n{msg_output}'.format(
                host=host_addresses[index], msg_output=output_msg))
            all_successful = False

    if not all_successful:
        return None  # we could return False here but do not want it to be cached
    return True
def launch_gloo(command, exec_command, settings, nics, env, server_ip):
    """
    Launches the given command multiple times using gloo.
    Each command is launched via exec_command.

    :param command: command to launch
    :param exec_command: means to execute a single command
    :param settings: settings for the distribution
    :param nics: common interfaces
    :param env: environment to use
    :param server_ip: ip to use for rendezvous server
    """
    # Ensure the requested output directory exists before workers write to it.
    if settings.output_filename:
        _mkdir_p(settings.output_filename)

    # Global rendezvous server through which workers discover each other.
    rendezvous = RendezvousServer(settings.verbose)

    # Map the requested number of processes onto the configured hosts.
    host_alloc_plan = get_host_assignments(parse_hosts(settings.hosts),
                                           settings.num_proc)

    # Start the rendezvous server (returns the port it listens on) and
    # hand it the allocation plan.
    global_rendezv_port = rendezvous.start_server()
    rendezvous.httpd.init(host_alloc_plan)

    run_command = get_run_command(command, server_ip, nics, global_rendezv_port)
    slot_info_to_command = _slot_info_to_command_fn(run_command, env)
    event = register_shutdown_event()

    args_list = []
    for slot_info in host_alloc_plan:
        args_list.append([slot_info_to_command(slot_info), slot_info, [event]])

    # If an error occurs in one thread, entire process will be terminated.
    # Otherwise, threads will keep running.
    res = threads.execute_function_multithreaded(exec_command,
                                                 args_list,
                                                 block_until_all_done=True)

    # Surface the earliest failing process, ordered by exit timestamp.
    for name, value in sorted(res.items(), key=lambda item: item[1][1]):
        exit_code, timestamp = value
        if exit_code != 0:
            raise RuntimeError(
                'Horovod detected that one or more processes exited with non-zero '
                'status, thus causing the job to be terminated. The first process '
                'to do so was:\nProcess name: {name}\nExit code: {code}\n'
                .format(name=name, code=exit_code))
def filter_local_addresses(all_host_names):
    """
    Filters out host names that resolve to an address of the local machine.

    :param all_host_names: list of host names to check.
    :return: list of host names, in input order, that are NOT local.
    """
    local_addresses = get_local_host_addresses()

    args_list = [[host] for host in all_host_names]
    # host_addresses is a map: input index -> resolved address (or a falsy
    # value when resolution failed).
    host_addresses = threads.execute_function_multithreaded(
        resolve_host_address, args_list)

    remote_host_names = []
    # enumerate instead of range(len(...)): the thread results are keyed by
    # input index, so pair each host name with its resolved address directly.
    for index, host_name in enumerate(all_host_names):
        host_address = host_addresses[index]
        # A host that failed to resolve is conservatively treated as remote.
        if not host_address or host_address not in local_addresses:
            remote_host_names.append(host_name)
    return remote_host_names
def filter_local_addresses(all_host_names):
    """
    Filters out host names that resolve to an address of the local machine.

    :param all_host_names: list of host names to check.
    :return: list of host names, in input order, that are NOT local.
    """
    local_addresses = _get_local_host_addresses()

    def resolve_host_name(host_name):
        # Returns the resolved IPv4 address, or None when the name does
        # not resolve (the host is then treated as remote below).
        try:
            return socket.gethostbyname(host_name)
        except socket.gaierror:
            return None

    args_list = [[host] for host in all_host_names]
    # host_addresses is a map: input index -> resolved address (or None).
    host_addresses = threads.execute_function_multithreaded(
        resolve_host_name, args_list)

    remote_host_names = []
    # enumerate instead of range(len(...)): the thread results are keyed by
    # input index, so pair each host name with its resolved address directly.
    for index, host_name in enumerate(all_host_names):
        host_address = host_addresses[index]
        if not host_address or host_address not in local_addresses:
            remote_host_names.append(host_name)
    return remote_host_names
def launch_gloo(command, exec_command, settings, nics, env, server_ip):
    """
    Launches the given command multiple times using gloo.
    Each command is launched via exec_command.

    :param command: command to launch
    :param exec_command: means to execute a single command
    :param settings: settings for the distribution
    :param nics: common interfaces
    :param env: environment to use
    :param server_ip: ip to use for rendezvous server
    """
    # Assign ranks/slots to the configured hosts.
    host_alloc_plan = _allocate(settings.hosts, settings.num_proc)

    # Create the global rendezvous server; start_server returns the port
    # it is listening on.
    global_rendezv = RendezvousServer(settings.verbose)
    global_rendezv_port = global_rendezv.start_server(host_alloc_plan)

    run_command = (
        'HOROVOD_GLOO_RENDEZVOUS_ADDR={addr} '
        'HOROVOD_GLOO_RENDEZVOUS_PORT={port} '
        'HOROVOD_CONTROLLER=gloo '
        'HOROVOD_CPU_OPERATIONS=gloo '
        'HOROVOD_GLOO_IFACE={iface} '
        'NCCL_SOCKET_IFNAME={nics} '
        '{command}'  # expect a lot of environment variables
        .format(addr=server_ip,
                port=global_rendezv_port,
                iface=list(nics)[0],  # TODO: add multiple ifaces in future
                nics=','.join(nics),
                command=' '.join(quote(par) for par in command)))

    # Event used to tell the worker threads to shut their processes down.
    event = threading.Event()

    def handle_termination_signal(signum, frame):
        event.set()

    signal.signal(signal.SIGINT, handle_termination_signal)
    signal.signal(signal.SIGTERM, handle_termination_signal)

    # TODO: Workaround for over-buffered outputs. Investigate how mpirun avoids this problem.
    env = copy.copy(env)  # copy env so we do not leak env modifications
    env['PYTHONUNBUFFERED'] = '1'

    # In case, the main thread receives a SIGINT, the event will be set so
    # the spawned threads can kill their corresponding middleman processes
    # so the jobs can be killed as well.
    alloc_info_to_command = _alloc_info_to_command_fn(run_command, env)
    args_list = []
    for alloc_info in host_alloc_plan:
        args_list.append([alloc_info_to_command(alloc_info), alloc_info, event])

    # Make the output directory if it does not exist
    if settings.output_filename:
        _mkdir_p(settings.output_filename)

    # If an error occurs in one thread, entire process will be terminated.
    # Otherwise, threads will keep running.
    res = threads.execute_function_multithreaded(exec_command,
                                                 args_list,
                                                 block_until_all_done=True)

    # Surface the earliest failing process, ordered by exit timestamp.
    for name, value in sorted(res.items(), key=lambda item: item[1][1]):
        exit_code, timestamp = value
        if exit_code != 0:
            raise RuntimeError(
                'Gloo job detected that one or more processes exited with non-zero '
                'status, thus causing the job to be terminated. The first process '
                'to do so was:\nProcess name: {name}\nExit code: {code}\n'
                .format(name=name, code=exit_code))
def _launch_task_servers(all_host_names, local_host_names, driver_addresses,
                         settings):
    """
    Executes the task server and service client task for registration on
    the hosts.
    :param all_host_names: list of addresses. for example,
        ['worker-0','worker-1']
        ['10.11.11.11', '10.11.11.12']
    :type all_host_names: list(string)
    :param local_host_names: names that are resolved to one of the addresses
        of local hosts interfaces. For example,
            set(['localhost', '127.0.0.1'])
    :type local_host_names: set
    :param driver_addresses: map of interfaces and their address and port for
        the service. For example:
            {
                'lo': [('127.0.0.1', 34588)],
                'docker0': [('172.122.10.1', 34588)],
                'eth0': [('11.111.33.73', 34588)]
            }
    :type driver_addresses: map
    :param settings: the object that contains the setting for running horovod
    :type settings: Horovod.run.common.util.settings.Settings
    :return:
    :rtype:
    """

    def _exec_command(command):
        # Launch one task server; abort the entire horovodrun process when
        # the launch fails, after printing the captured output.
        captured = six.StringIO()
        try:
            exit_code = safe_shell_exec.execute(command,
                                                stdout=captured,
                                                stderr=captured)
            if exit_code != 0:
                print('Launching horovodrun task function was not '
                      'successful:\n{host_output}'.format(
                          host_output=captured.getvalue()))
                os._exit(exit_code)
        finally:
            captured.close()
        return exit_code

    # Per-host ssh port arguments, keyed by the host's index.
    ssh_port_args = _get_ssh_port_args(all_host_names,
                                       ssh_port=settings.ssh_port,
                                       ssh_ports=settings.ssh_ports)

    args_list = []
    for index, host_name in enumerate(all_host_names):
        if host_name in local_host_names:
            # Local host: run the task function directly.
            command = \
                '{python} -m horovod.run.task_fn {index} ' \
                '{driver_addresses} {settings}'\
                .format(python=sys.executable,
                        index=codec.dumps_base64(index),
                        driver_addresses=codec.dumps_base64(driver_addresses),
                        settings=codec.dumps_base64(settings))
        else:
            # Remote host: wrap the same invocation in an ssh session.
            command = \
                'ssh -o StrictHostKeyChecking=no {host} {ssh_port_arg} ' \
                '\'{python} -m horovod.run.task_fn {index} {driver_addresses}' \
                ' {settings}\''\
                .format(host=host_name,
                        ssh_port_arg=ssh_port_args[index],
                        python=sys.executable,
                        index=codec.dumps_base64(index),
                        driver_addresses=codec.dumps_base64(driver_addresses),
                        settings=codec.dumps_base64(settings))
        args_list.append([command])

    # Each thread will use ssh command to launch the server on one task. If an
    # error occurs in one thread, entire process will be terminated. Otherwise,
    # threads will keep running and ssh session -- and the the task server --
    # will be bound to the thread. In case, the horovodrun process dies, all
    # the ssh sessions and all the task servers will die as well.
    threads.execute_function_multithreaded(_exec_command,
                                           args_list,
                                           block_until_all_done=False)
def _launch_jobs(settings, env, host_alloc_plan, remote_host_names,
                 _run_command):
    """
    executes the jobs defined by run command on hosts.
    :param hosts_alloc: list of dict indicating the allocating info.
        For example,
        [{'Hostname':'worker-0', 'Rank': 0, 'Local_rank': 0, 'Cross_rank':0,
            'Size':2, 'Local_size':1, 'Cross_size':2},
        {'Hostname':'worker-1', 'Rank': 1, 'Local_rank': 0, 'Cross_rank':1,
            'Size':2, 'Local_size':1, 'Cross_size':2}
        ]
    :type hosts_alloc: list(dict)
    :param remote_host_names: names that are resolved to one of the addresses
        of remote hosts interfaces.
    :type remote_host_names: set
    :param _run_command: command to execute
    :type _run_command: string
    :return:
    :rtype:
    """

    def _exec_command(command, index, event):
        # Runs one job command, optionally teeing its output into per-rank
        # stdout/stderr files. Always returns 0: failures are reported by
        # printing, not by propagating the exit code.
        if settings.verbose:
            print(command)
        # Redirect output if requested
        stdout = stderr = None
        stdout_file = stderr_file = None
        if settings.output_filename:
            padded_rank = _pad_rank(index, settings.num_proc)
            output_dir_rank = os.path.join(
                settings.output_filename,
                'rank.{rank}'.format(rank=padded_rank))
            if not os.path.exists(output_dir_rank):
                os.mkdir(output_dir_rank)
            stdout_file = open(os.path.join(output_dir_rank, 'stdout'), 'w')
            stderr_file = open(os.path.join(output_dir_rank, 'stderr'), 'w')
            stdout = MultiFile([sys.stdout, stdout_file])
            stderr = MultiFile([sys.stderr, stderr_file])
        try:
            exit_code = safe_shell_exec.execute(command,
                                                index=index,
                                                event=event,
                                                stdout=stdout,
                                                stderr=stderr)
            if exit_code != 0:
                print('Process {idx} exit with status code {ec}.'.format(
                    idx=index, ec=exit_code))
        except Exception as e:
            print('Exception happened during safe_shell_exec, exception '
                  'message: {message}'.format(message=e))
        finally:
            if stdout_file:
                stdout_file.close()
            if stderr_file:
                stderr_file.close()
        return 0

    ssh_port_arg = '-p {ssh_port}'.format(
        ssh_port=settings.ssh_port) if settings.ssh_port else ''

    # Create a event for communication between threads
    event = threading.Event()

    def set_event_on_sigterm(signum, frame):
        event.set()

    signal.signal(signal.SIGINT, set_event_on_sigterm)
    signal.signal(signal.SIGTERM, set_event_on_sigterm)

    # TODO: Workaround for over-buffered outputs. Investigate how mpirun
    # avoids this problem. Copy env first so we do not leak the modification
    # into the caller's dict, and set the flag once instead of mutating env
    # on every loop iteration as before.
    env = copy.copy(env)
    env['PYTHONUNBUFFERED'] = '1'

    args_list = []
    for alloc_info in host_alloc_plan:
        # generate env for rendezvous
        horovod_rendez_env = 'HOROVOD_RANK={rank} HOROVOD_SIZE={size} ' \
            'HOROVOD_LOCAL_RANK={local_rank} HOROVOD_LOCAL_SIZE={local_size} ' \
            'HOROVOD_CROSS_RANK={cross_rank} HOROVOD_CROSS_SIZE={cross_size} ' \
            .format(rank=alloc_info.rank,
                    size=alloc_info.size,
                    local_rank=alloc_info.local_rank,
                    local_size=alloc_info.local_size,
                    cross_rank=alloc_info.cross_rank,
                    cross_size=alloc_info.cross_size)

        host_name = alloc_info.hostname

        local_command = '{horovod_env} {env} {run_command}'.format(
            horovod_env=horovod_rendez_env,
            env=' '.join(['%s=%s' % (key, quote(value))
                          for key, value in env.items()
                          if env_util.is_exportable(key)]),
            run_command=_run_command)

        if host_name not in remote_host_names:
            command = local_command
        else:
            # cd into the current working directory on the remote host so
            # relative paths in the user command resolve the same way.
            command = 'ssh -o StrictHostKeyChecking=no {host} {ssh_port_arg} ' \
                '{local_command}'.format(
                    host=host_name,
                    ssh_port_arg=ssh_port_arg,
                    local_command=quote(
                        'cd {pwd} >& /dev/null ; {local_command}'.format(
                            pwd=os.getcwd(), local_command=local_command)))

        args_list.append([command, alloc_info.rank, event])

    # Make the output directory if it does not exist
    if settings.output_filename:
        _mkdir_p(settings.output_filename)

    # Each thread will use ssh command to launch the job on each remote host.
    # If an error occurs in one thread, entire process will be terminated.
    # Otherwise, threads will keep running and ssh session. In case, the main
    # thread receives a SIGINT, the event will be set and the spawned threads
    # will kill their corresponding middleman processes and thus the jobs
    # will be killed as well.
    threads.execute_function_multithreaded(_exec_command,
                                           args_list,
                                           block_until_all_done=True)