Exemplo n.º 1
0
    def start(self):
        """Starts the Horovod driver and services."""
        self.rendezvous = RendezvousServer(self.settings.verbose)
        self.driver = ElasticDriver(rendezvous=self.rendezvous,
                                    discovery=self.settings.discovery,
                                    min_np=self.settings.min_np,
                                    max_np=self.settings.max_np,
                                    timeout=self.settings.elastic_timeout,
                                    reset_limit=self.settings.reset_limit,
                                    verbose=self.settings.verbose)
        handler = create_rendezvous_handler(self.driver)
        global_rendezv_port = self.rendezvous.start(handler)
        self.driver.wait_for_available_slots(self.settings.num_proc)

        # Host-to-host common interface detection
        # requires at least 2 hosts in an elastic job.
        min_hosts = _get_min_start_hosts(self.settings)
        current_hosts = self.driver.wait_for_available_slots(
            self.settings.num_proc, min_hosts=min_hosts)
        nics = driver_service.get_common_interfaces(
            self.settings, current_hosts.host_assignment_order)

        server_ip = network.get_driver_ip(nics)
        self.run_env_vars = create_run_env_vars(server_ip,
                                                nics,
                                                global_rendezv_port,
                                                elastic=True)
Exemplo n.º 2
0
    def start(self):
        """Starts the Horovod driver and services."""
        self.rendezvous = RendezvousServer(self.settings.verbose)
        self.driver = ElasticDriver(rendezvous=self.rendezvous,
                                    discovery=self.settings.discovery,
                                    min_num_proc=self.settings.min_num_proc,
                                    max_num_proc=self.settings.max_num_proc,
                                    timeout=self.settings.elastic_timeout,
                                    reset_limit=self.settings.reset_limit,
                                    verbose=self.settings.verbose)
        handler = create_rendezvous_handler(self.driver)
        logger.debug("[ray] starting rendezvous")
        global_rendezv_port = self.rendezvous.start(handler)

        logger.debug(f"[ray] waiting for {self.settings.num_proc} to start.")
        self.driver.wait_for_available_slots(self.settings.num_proc)

        # Host-to-host common interface detection
        # requires at least 2 hosts in an elastic job.
        min_hosts = _get_min_start_hosts(self.settings)
        current_hosts = self.driver.wait_for_available_slots(
            self.settings.num_proc, min_hosts=min_hosts)
        logger.debug("[ray] getting common interfaces")
        nics = detect_nics(
            self.settings,
            all_host_names=current_hosts.host_assignment_order,
        )
        logger.debug("[ray] getting driver IP")
        server_ip = socket.gethostbyname(socket.gethostname())
        self.run_env_vars = create_run_env_vars(server_ip,
                                                nics,
                                                global_rendezv_port,
                                                elastic=True)
Exemplo n.º 3
0
    def start(self,
              executable_cls: type = None,
              executable_args: Optional[List] = None,
              executable_kwargs: Optional[Dict] = None,
              extra_env_vars: Optional[Dict] = None):
        """Starts the Horovod driver and services.

        Args:
            executable_cls (type): The class that will be created within
                an actor (BaseHorovodWorker). This will allow Horovod
                to establish its connections and set env vars.
            executable_args (List): Arguments to be passed into the
                worker class upon initialization.
            executable_kwargs (Dict): Keyword arguments to be passed into the
                worker class upon initialization.
            extra_env_vars (Dict): Environment variables to be set
                on the actors (worker processes) before initialization.

        """

        self.rendezvous = RendezvousServer(self.settings.verbose)
        self.driver = ElasticDriver(rendezvous=self.rendezvous,
                                    discovery=self.settings.discovery,
                                    min_np=self.min_workers,
                                    max_np=self.max_workers,
                                    timeout=self.elastic_timeout,
                                    reset_limit=self.reset_limit,
                                    cooldown_range=self.cooldown_range,
                                    verbose=self.settings.verbose)
        handler = create_rendezvous_handler(self.driver)
        logger.debug("[ray] starting rendezvous")
        global_rendezv_port = self.rendezvous.start(handler)

        logger.debug(f"[ray] waiting for {self.num_workers} to start.")
        self.driver.wait_for_available_slots(self.num_workers)

        # Host-to-host common interface detection
        # requires at least 2 hosts in an elastic job.
        min_hosts = _get_min_start_hosts(self.settings)
        current_hosts = self.driver.wait_for_available_slots(
            self.num_workers, min_hosts=min_hosts)
        logger.debug("[ray] getting common interfaces")
        nics = detect_nics(
            self.settings,
            all_host_names=current_hosts.host_assignment_order,
        )
        logger.debug("[ray] getting driver IP")
        server_ip = socket.gethostbyname(socket.gethostname())
        self.run_env_vars = create_run_env_vars(server_ip,
                                                nics,
                                                global_rendezv_port,
                                                elastic=True)

        self.executable_cls = executable_cls
        self.executable_args = executable_args
        self.executable_kwargs = executable_kwargs
        self.env_vars = extra_env_vars or {}
Exemplo n.º 4
0
def elastic_driver_fn():
    global_rendezv = RendezvousServer(verbose=1)
    discover_hosts = discovery.HostDiscoveryScript(
        "/Users/zuston/iqiyiDev/horovod-opal/dis.sh", 3)
    driver = ElasticDriver(global_rendezv, discover_hosts, min_np=2, max_np=4)
    handler = create_rendezvous_handler(driver)
    global_rendezv_port = global_rendezv.start(handler)
    print('port: ' + str(global_rendezv_port))
    print('wait for available slots: {}'.format(2))
    current_hosts = driver.wait_for_available_slots(2)
    print("current hosts:" + str(current_hosts))
    pending_slots = driver._update_host_assignments(current_hosts)
    print("pending hosts:" + str(pending_slots))
    driver._worker_registry.reset(driver.world_size())
Exemplo n.º 5
0
def launch_gloo_elastic(command, exec_command, settings, env,
                        get_common_interfaces, rendezvous):
    # Make the output directory if it does not exist
    if settings.output_filename:
        _mkdir_p(settings.output_filename)

    driver = ElasticDriver(rendezvous,
                           settings.discovery,
                           settings.min_np,
                           settings.max_np,
                           timeout=settings.elastic_timeout,
                           reset_limit=settings.reset_limit,
                           cooldown_range=settings.cooldown_range,
                           verbose=settings.verbose)

    handler = create_rendezvous_handler(driver)
    global_rendezv_port = rendezvous.start(handler)
    driver.wait_for_available_slots(settings.num_proc)

    nics = get_common_interfaces(driver)
    server_ip = network.get_driver_ip(nics)

    event = register_shutdown_event()
    run_command = get_run_command(command,
                                  server_ip,
                                  nics,
                                  global_rendezv_port,
                                  elastic=True)

    create_worker = _create_elastic_worker_fn(exec_command, run_command, env,
                                              event)

    driver.start(settings.num_proc, create_worker)
    res = driver.get_results()
    driver.stop()

    if res.error_message is not None:
        raise RuntimeError(res.error_message)

    for name, value in sorted(res.worker_results.items(),
                              key=lambda item: item[1][1]):
        exit_code, timestamp = value
        if exit_code != 0:
            raise RuntimeError(
                'Horovod detected that one or more processes exited with non-zero '
                'status, thus causing the job to be terminated. The first process '
                'to do so was:\nProcess name: {name}\nExit code: {code}\n'.
                format(name=name, code=exit_code))
Exemplo n.º 6
0
def launch_gloo_elastic(command_or_func, exec_command, settings, env, get_common_interfaces, rendezvous, executable):
    # Make the output directory if it does not exist
    if settings.output_filename:
        _mkdir_p(settings.output_filename)

    driver = ElasticDriver(rendezvous, settings.discovery,
                           settings.min_num_proc, settings.max_num_proc,
                           timeout=settings.elastic_timeout,
                           reset_limit=settings.reset_limit,
                           cooldown_range=settings.cooldown_range,
                           verbose=settings.verbose)

    handler = create_rendezvous_handler(driver)
    global_rendezv_port = rendezvous.start(handler)
    driver.wait_for_available_slots(settings.num_proc)

    nics = get_common_interfaces(driver)
    server_ip = network.get_driver_ip(nics)
    run_func_server = None
    run_func_server_port = None

    if settings.run_func_mode:
        # when running a func, we have to spin up the KVStoreServer
        # to get the func to the remote process and the result back
        run_func_server = KVStoreServer(verbose=settings.verbose)
        run_func_server_port = run_func_server.start_server()
        put_data_into_kvstore(server_ip, run_func_server_port, 'runfunc', 'func', command_or_func)

        command = [executable, '-m', 'horovod.runner.run_task', server_ip, str(run_func_server_port)]
    else:
        command = command_or_func

    try:
        event = register_shutdown_event()
        run_command = get_run_command(command, server_ip, nics, global_rendezv_port, elastic=True)

        create_worker = _create_elastic_worker_fn(exec_command, run_command, env, event)

        driver.start(settings.num_proc, create_worker)
        res = driver.get_results()
        driver.stop()

        if res.error_message is not None:
            raise RuntimeError(res.error_message)

        for name, value in sorted(res.worker_results.items(), key=lambda item: item[1][1]):
            exit_code, timestamp = value
            if exit_code != 0:
                raise RuntimeError('Horovod detected that one or more processes exited with non-zero '
                                   'status, thus causing the job to be terminated. The first process '
                                   'to do so was:\nProcess name: {name}\nExit code: {code}\n'
                                   .format(name=name, code=exit_code))

        # fetch the result if running a func
        if settings.run_func_mode:
            results = [None] * settings.min_num_proc
            # TODO: make it parallel to improve performance
            for i in range(settings.min_num_proc):
                results[i] = read_data_from_kvstore(server_ip, run_func_server_port, 'runfunc_result', str(i))
            return results

        return None
    finally:
        if run_func_server:
            run_func_server.shutdown_server()
Exemplo n.º 7
0
    def test_worker_notification_manager(self):
        """Tests that host add events are sent to the worker notification service and consumed."""
        slots = {'host-1': 2}
        discovery = FixedHosts(slots)

        rendezvous = RendezvousServer()
        driver = ElasticDriver(rendezvous, discovery, min_np=2, max_np=4)
        driver.wait_for_available_slots(min_np=2)
        handler = create_rendezvous_handler(driver)

        common_intfs = network.get_local_intfs()
        addr = network.get_driver_ip(common_intfs)
        port = rendezvous.start(handler)
        nic = list(common_intfs)[0]

        rank_results = {}

        class NotificationReceiver:
            def __init__(self):
                self.events = []

            def on_hosts_updated(self, timestamp, res):
                self.events.append((timestamp, res))

        def add_host():
            slots = {'host-1': 2, 'host-2': 2}
            discovery.set(slots)

        def remove_host():
            slots = {'host-2': 2}
            discovery.set(slots)

        def exec_command(slot_info, events):
            manager = WorkerNotificationManager()
            manager.init(rendezvous_addr=addr,
                         rendezvous_port=port,
                         nic=nic,
                         hostname=slot_info.hostname,
                         local_rank=slot_info.local_rank)

            notification_receiver = NotificationReceiver()
            manager.register_listener(notification_receiver)

            driver.record_ready(slot_info.hostname, slot_info.local_rank)

            if slot_info.rank == 0:
                add_host()
            driver.wait_for_available_slots(4)

            if slot_info.rank == 0:
                remove_host()

            # Busy wait for the number of available slots to decrease
            while driver._host_manager.current_hosts.count_available_slots(
            ) > 2:
                time.sleep(0.01)

            rank_results[slot_info.rank] = notification_receiver.events
            return 0, time.time()

        driver.start(np=2, create_worker_fn=exec_command)
        res = driver.get_results().worker_results
        driver.stop()

        assert len(res) == 2
        for name, (exit_code, timestamp) in res.items():
            assert exit_code == 0, name

        assert len(rank_results) == 2
        for rank, events in rank_results.items():
            expected = 2 if rank == 0 else 0
            assert len(events) == expected, rank
            if rank == 0:
                # First update is an add
                assert events[0][1] == HostUpdateResult.added
                # Second update is a removal
                assert events[1][1] == HostUpdateResult.removed

        rendezvous.stop()