示例#1
0
    def _exec_command(command, slot_info, events):
        """
        Executes ``command`` for a single slot, wrapping it in a remote
        command when the slot's host is not one of the local addresses.

        :param command: shell command line to run for this slot.
        :param slot_info: slot descriptor; its ``rank`` and ``hostname``
            attributes are read.
        :param events: forwarded unchanged to ``safe_shell_exec.execute``.
        :return: tuple ``(exit_code, timestamp)`` where ``timestamp`` is
            ``time.time()`` taken after the command finished. ``exit_code``
            is 1 when ``safe_shell_exec.execute`` raised an exception.
        """
        index = slot_info.rank
        host_name = slot_info.hostname

        host_address = network.resolve_host_address(host_name)
        local_addresses = network.get_local_host_addresses()
        if host_address not in local_addresses:
            # Run from the same working directory on the remote side.
            local_command = quote(
                'cd {pwd} > /dev/null 2>&1 ; {command}'.format(
                    pwd=os.getcwd(), command=command))
            # The wrapped command has to replace `command`; discarding the
            # result would execute the original command on the local host.
            command = get_remote_command(
                local_command,
                host=host_name,
                port=settings.ssh_port,
                identity_file=settings.ssh_identity_file)

        if settings.verbose:
            print(command)

        # Redirect output if requested
        stdout = stderr = None
        stdout_file = stderr_file = None
        if settings.output_filename:
            padded_rank = _pad_rank(index, settings.num_proc)
            output_dir_rank = os.path.join(
                settings.output_filename,
                'rank.{rank}'.format(rank=padded_rank))
            if not os.path.exists(output_dir_rank):
                os.mkdir(output_dir_rank)

            stdout_file = open(os.path.join(output_dir_rank, 'stdout'), 'w')
            try:
                stderr_file = open(
                    os.path.join(output_dir_rank, 'stderr'), 'w')
            except Exception:
                # Do not leak the first handle when the second open fails.
                stdout_file.close()
                raise

            # Write to both the console stream and the per-rank file.
            stdout = MultiFile([sys.stdout, stdout_file])
            stderr = MultiFile([sys.stderr, stderr_file])

        try:
            exit_code = safe_shell_exec.execute(command,
                                                index=index,
                                                stdout=stdout,
                                                stderr=stderr,
                                                events=events)
            if exit_code != 0:
                print('Process {idx} exit with status code {ec}.'.format(
                    idx=index, ec=exit_code))
        except Exception as e:
            print('Exception happened during safe_shell_exec, exception '
                  'message: {message}'.format(message=e))
            exit_code = 1
        finally:
            if stdout_file:
                stdout_file.close()
            if stderr_file:
                stderr_file.close()
        return exit_code, time.time()
示例#2
0
def _check_all_hosts_ssh_successful(host_addresses,
                                    ssh_port=None,
                                    ssh_ports=None,
                                    ssh_identity_file=None):
    """
    Verifies that a trivial remote command (``true``) can be run on every
    host.

    :param host_addresses: list of addresses to ssh into. for example,
        ['worker-0','worker-1']
        ['10.11.11.11', '10.11.11.12']
    :type host_addresses: list(strings)
    :param ssh_port: forwarded to ``driver_service.get_ssh_port_args``.
    :param ssh_ports: forwarded to ``driver_service.get_ssh_port_args``.
    :param ssh_identity_file: forwarded to ``get_remote_command`` as
        ``identity_file``.
    :return: True when the command succeeded on every host, otherwise None
        (not False, so that a failure result does not get cached).
    """
    def exec_command(command):
        # Retry up to SSH_ATTEMPTS times; stop at the first success. The
        # message returned is the output of the most recent failed attempt.
        status = 1
        failure_output = ''
        for _ in range(SSH_ATTEMPTS):
            with io.StringIO() as buffer:
                status = safe_shell_exec.execute(command,
                                                 stdout=buffer,
                                                 stderr=buffer)
                if status == 0:
                    break
                failure_output = buffer.getvalue()
        return status, failure_output

    ports = driver_service.get_ssh_port_args(host_addresses,
                                             ssh_port=ssh_port,
                                             ssh_ports=ssh_ports)

    # One single-element argument list per host.
    args_list = []
    for address, host_port in zip(host_addresses, ports):
        probe_command = get_remote_command(local_command='true',
                                           host=address,
                                           port=host_port,
                                           identity_file=ssh_identity_file,
                                           timeout_s=SSH_CONNECT_TIMEOUT_S)
        args_list.append([probe_command])

    results = threads.execute_function_multithreaded(exec_command, args_list)

    any_failed = False
    for host_index, result in results.items():
        status = result[0]
        message = result[1]
        if status == 0:
            continue
        print('ssh not successful for host {host}:\n{msg_output}'.format(
            host=host_addresses[host_index], msg_output=message))
        any_failed = True

    if any_failed:
        # we could return False here but do not want it to be cached
        return None
    return True
示例#3
0
 def get_num_threads():
     """Returns the number of hardware threads.

     Runs ``LSFUtils._LSCPU_CMD`` on the first compute host, parses the
     captured output as YAML and reads the ``LSFUtils._THREAD_KEY`` entry.

     :raises RuntimeError: if the command exits with a non-zero code.
     """
     remote_lscpu = get_remote_command(
         LSFUtils._LSCPU_CMD,
         host=LSFUtils.get_compute_hosts()[0])

     # stdout and stderr are collected into the same buffer.
     captured = io.StringIO()
     status = safe_shell_exec.execute(remote_lscpu,
                                      stdout=captured,
                                      stderr=captured)
     if status == 0:
         parsed = yaml.safe_load(captured.getvalue())
         return int(parsed[LSFUtils._THREAD_KEY])

     message = "{cmd} failed with exit code {exit_code}".format(
         cmd=remote_lscpu, exit_code=status)
     raise RuntimeError(message)
示例#4
0
def _launch_task_servers(all_host_names, local_host_names, driver_addresses,
                         settings):
    """
    Launches ``horovod.runner.task_fn`` once per host, directly for hosts in
    ``local_host_names`` and through a remote command for all others.

    :param all_host_names: list of addresses. for example,
        ['worker-0','worker-1']
        ['10.11.11.11', '10.11.11.12']
    :type all_host_names: list(string)
    :param local_host_names: names that are resolved to one of the addresses
    of local hosts interfaces. For example,
        set(['localhost', '127.0.0.1'])
    :type local_host_names: set
    :param driver_addresses: map of interfaces and their address and port for
    the service. For example:
        {
            'lo': [('127.0.0.1', 34588)],
            'docker0': [('172.122.10.1', 34588)],
            'eth0': [('11.111.33.73', 34588)]
        }
    :type driver_addresses: map
    :param settings: the object that contains the setting for running horovod
    :type settings: horovod.runner.common.util.settings.Settings
    :return: None; the launch threads are started with
        ``block_until_all_done=False``.
    """
    def _exec_command(command):
        # Capture stdout and stderr together so a failure can be reported.
        with io.StringIO() as captured:
            status = safe_shell_exec.execute(command,
                                             stdout=captured,
                                             stderr=captured)
            if status != 0:
                print('Launching horovod task function was not '
                      'successful:\n{host_output}'.format(
                          host_output=captured.getvalue()))
                # Terminate the entire process, not just this thread.
                os._exit(status)
        return status

    task_fn_template = ('{python} -m horovod.runner.task_fn {index} '
                        '{num_hosts} {driver_addresses} {settings}')
    host_count = len(all_host_names)

    launch_args = []
    for host_index, host in enumerate(all_host_names):
        launch_command = task_fn_template.format(
            python=sys.executable,
            index=codec.dumps_base64(host_index),
            num_hosts=codec.dumps_base64(host_count),
            driver_addresses=codec.dumps_base64(driver_addresses),
            settings=codec.dumps_base64(settings))

        # Hosts that are not local get the command wrapped for remote
        # execution.
        if host not in local_host_names:
            launch_command = get_remote_command(
                launch_command,
                host=host,
                port=settings.ssh_port,
                identity_file=settings.ssh_identity_file)

        if settings.verbose >= 2:
            print('Launching horovod task function: {}'.format(
                launch_command))
        launch_args.append([launch_command])

    # Each thread will use ssh command to launch the server on one task. If an
    # error occurs in one thread, entire process will be terminated. Otherwise,
    # threads will keep running and ssh session -- and the task server --
    # will be bound to the thread. In case the horovod process dies, all
    # the ssh sessions and all the task servers will die as well.
    threads.execute_function_multithreaded(_exec_command,
                                           launch_args,
                                           block_until_all_done=False)