def _exec_command(command, slot_info, events):
    """
    Runs `command` for a single slot and reports how it ended.

    When the slot's host does not resolve to one of the local host addresses,
    the command is wrapped so that it first changes into the current working
    directory and is then turned into a remote command via
    `get_remote_command`.

    NOTE: `settings` is not a parameter; it is resolved from the enclosing
    scope (verbose, ssh_port, ssh_identity_file, output_filename and num_proc
    are read from it).

    :param command: the shell command to execute
    :param slot_info: object providing `rank` and `hostname` of the slot
    :param events: passed through to `safe_shell_exec.execute`
    :return: tuple of (exit code, time.time() taken right before returning);
             the exit code is 1 when `safe_shell_exec.execute` raised
    """
    index = slot_info.rank
    host_name = slot_info.hostname

    host_address = network.resolve_host_address(host_name)
    local_addresses = network.get_local_host_addresses()
    if host_address not in local_addresses:
        local_command = quote(
            'cd {pwd} > /dev/null 2>&1 ; {command}'.format(
                pwd=os.getcwd(), command=command))
        # The wrapped command must replace `command`; otherwise the remote
        # invocation is built and thrown away and the rank runs locally.
        command = get_remote_command(local_command,
                                     host=host_name,
                                     port=settings.ssh_port,
                                     identity_file=settings.ssh_identity_file)

    if settings.verbose:
        print(command)

    # Redirect output if requested
    stdout = stderr = None
    stdout_file = stderr_file = None
    try:
        if settings.output_filename:
            padded_rank = _pad_rank(index, settings.num_proc)
            output_dir_rank = os.path.join(
                settings.output_filename,
                'rank.{rank}'.format(rank=padded_rank))
            if not os.path.exists(output_dir_rank):
                os.mkdir(output_dir_rank)

            # Opened inside the outer try so that a failure while opening the
            # second file still closes the first one in the finally clause.
            stdout_file = open(os.path.join(output_dir_rank, 'stdout'), 'w')
            stderr_file = open(os.path.join(output_dir_rank, 'stderr'), 'w')

            stdout = MultiFile([sys.stdout, stdout_file])
            stderr = MultiFile([sys.stderr, stderr_file])

        try:
            exit_code = safe_shell_exec.execute(command, index=index,
                                                stdout=stdout, stderr=stderr,
                                                events=events)
            if exit_code != 0:
                print('Process {idx} exit with status code {ec}.'.format(
                    idx=index, ec=exit_code))
        except Exception as e:
            # Best effort: report the failure and surface it as exit code 1.
            print('Exception happened during safe_shell_exec, exception '
                  'message: {message}'.format(message=e))
            exit_code = 1
    finally:
        if stdout_file:
            stdout_file.close()
        if stderr_file:
            stderr_file.close()
    return exit_code, time.time()
def _check_all_hosts_ssh_successful(host_addresses, ssh_port=None, ssh_ports=None, ssh_identity_file=None):
    """
    Verifies that an ssh connection can be established to every given host.

    A trivial `true` command is executed on each host through
    `get_remote_command`, one thread per host, with up to SSH_ATTEMPTS tries
    each.

    :param host_addresses: list of addresses to ssh into. for example,
        ['worker-0','worker-1']
        ['10.11.11.11', '10.11.11.12']
    :type host_addresses: list(strings)
    :param ssh_port: forwarded to `driver_service.get_ssh_port_args`
    :param ssh_ports: forwarded to `driver_service.get_ssh_port_args`
    :param ssh_identity_file: identity file handed to `get_remote_command`
    :return: True if ssh was successful into all the addresses, None
        otherwise.
    """
    def exec_command(command):
        status = 1
        last_failure_output = ''

        # Retry until the command succeeds or the attempts are used up.
        for _ in range(SSH_ATTEMPTS):
            with io.StringIO() as buffer:
                status = safe_shell_exec.execute(command,
                                                 stdout=buffer,
                                                 stderr=buffer)
                if status == 0:
                    break
                last_failure_output = buffer.getvalue()
        return status, last_failure_output

    ports = driver_service.get_ssh_port_args(host_addresses,
                                             ssh_port=ssh_port,
                                             ssh_ports=ssh_ports)

    # One single-element argument list per host.
    args_list = []
    for address, host_port in zip(host_addresses, ports):
        remote_true = get_remote_command(local_command='true',
                                         host=address,
                                         port=host_port,
                                         identity_file=ssh_identity_file,
                                         timeout_s=SSH_CONNECT_TIMEOUT_S)
        args_list.append([remote_true])

    results = threads.execute_function_multithreaded(exec_command, args_list)

    every_host_reachable = True
    for host_index, outcome in results.items():
        status = outcome[0]
        message = outcome[1]
        if status == 0:
            continue
        print('ssh not successful for host {host}:\n{msg_output}'.format(
            host=host_addresses[host_index],
            msg_output=message))
        every_host_reachable = False

    # we could return False on failure but do not want it to be cached
    return True if every_host_reachable else None
def get_num_threads():
    """
    Returns the number of hardware threads.

    Executes `LSFUtils._LSCPU_CMD` on the first host returned by
    `LSFUtils.get_compute_hosts()`, loads the captured output as YAML and
    converts the entry stored under `LSFUtils._THREAD_KEY` to an int.

    :raises RuntimeError: if the command exits with a non-zero code
    """
    base_cmd = LSFUtils._LSCPU_CMD
    first_host = LSFUtils.get_compute_hosts()[0]
    lscpu_cmd = get_remote_command(base_cmd, host=first_host)

    # stdout and stderr are captured into the same buffer.
    captured = io.StringIO()
    status = safe_shell_exec.execute(lscpu_cmd, stdout=captured, stderr=captured)

    if status == 0:
        parsed = yaml.safe_load(captured.getvalue())
        return int(parsed[LSFUtils._THREAD_KEY])

    raise RuntimeError(
        "{cmd} failed with exit code {exit_code}".format(
            cmd=lscpu_cmd, exit_code=status))
def _launch_task_servers(all_host_names, local_host_names, driver_addresses, settings):
    """
    Starts `horovod.runner.task_fn` on every host, each in its own thread.

    :param all_host_names: list of addresses. for example,
        ['worker-0','worker-1']
        ['10.11.11.11', '10.11.11.12']
    :type all_host_names: list(string)
    :param local_host_names: names that are resolved to one of the addresses
        of local hosts interfaces. For example,
        set(['localhost', '127.0.0.1'])
    :type local_host_names: set
    :param driver_addresses: map of interfaces and their address and port for
        the service. For example:
        {
            'lo': [('127.0.0.1', 34588)],
            'docker0': [('172.122.10.1', 34588)],
            'eth0': [('11.111.33.73', 34588)]
        }
    :type driver_addresses: map
    :param settings: the object that contains the setting for running horovod
    :type settings: horovod.runner.common.util.settings.Settings
    :return: nothing; the launcher threads are started without waiting for
        them to finish
    :rtype: None
    """

    def _exec_command(command):
        with io.StringIO() as host_output:
            exit_code = safe_shell_exec.execute(command,
                                                stdout=host_output,
                                                stderr=host_output)
            if exit_code != 0:
                print('Launching horovod task function was not '
                      'successful:\n{host_output}'.format(
                          host_output=host_output.getvalue()))
                # A single failed launch terminates the whole process.
                os._exit(exit_code)
        return exit_code

    task_fn_template = (
        '{python} -m horovod.runner.task_fn {index} {num_hosts} '
        '{driver_addresses} {settings}')

    num_hosts = len(all_host_names)
    args_list = []
    for index, host_name in enumerate(all_host_names):
        command = task_fn_template.format(
            python=sys.executable,
            index=codec.dumps_base64(index),
            num_hosts=codec.dumps_base64(num_hosts),
            driver_addresses=codec.dumps_base64(driver_addresses),
            settings=codec.dumps_base64(settings))

        is_local = host_name in local_host_names
        if not is_local:
            # Non-local hosts are reached through a remote command.
            command = get_remote_command(
                command,
                host=host_name,
                port=settings.ssh_port,
                identity_file=settings.ssh_identity_file)

        if settings.verbose >= 2:
            print('Launching horovod task function: {}'.format(command))

        args_list.append([command])

    # Every thread launches the task server for one host through its command.
    # An error in any thread terminates the entire process. Otherwise the
    # threads keep running, and the ssh session -- and with it the task
    # server -- stays bound to its thread. If the horovod process dies, all
    # ssh sessions and all task servers die as well.
    threads.execute_function_multithreaded(_exec_command,
                                           args_list,
                                           block_until_all_done=False)