Exemplo n.º 1
0
    def start(self):
        """Starts the rendezvous server and the elastic driver.

        Builds ``self.rendezvous`` and ``self.driver`` from ``self.settings``,
        starts the rendezvous server with a handler created from the driver,
        asks the driver to wait for ``num_proc`` slots, detects the network
        interfaces for the assigned hosts and finally stores the resulting
        environment variables in ``self.run_env_vars``.
        """
        settings = self.settings

        self.rendezvous = RendezvousServer(settings.verbose)
        driver_kwargs = dict(
            rendezvous=self.rendezvous,
            discovery=settings.discovery,
            min_num_proc=settings.min_num_proc,
            max_num_proc=settings.max_num_proc,
            timeout=settings.elastic_timeout,
            reset_limit=settings.reset_limit,
            verbose=settings.verbose,
        )
        self.driver = ElasticDriver(**driver_kwargs)

        rendezvous_handler = create_rendezvous_handler(self.driver)
        logger.debug("[ray] starting rendezvous")
        rendezvous_port = self.rendezvous.start(rendezvous_handler)

        logger.debug(f"[ray] waiting for {self.settings.num_proc} to start.")
        self.driver.wait_for_available_slots(settings.num_proc)

        # Detecting the interface shared between hosts requires at least
        # 2 hosts in an elastic job, so wait again with a host minimum.
        required_hosts = _get_min_start_hosts(settings)
        assigned_hosts = self.driver.wait_for_available_slots(
            settings.num_proc, min_hosts=required_hosts)

        logger.debug("[ray] getting common interfaces")
        common_nics = detect_nics(
            settings, all_host_names=assigned_hosts.host_assignment_order)

        logger.debug("[ray] getting driver IP")
        driver_ip = socket.gethostbyname(socket.gethostname())
        self.run_env_vars = create_run_env_vars(
            driver_ip, common_nics, rendezvous_port, elastic=True)
Exemplo n.º 2
0
    def start(self,
              executable_cls: type = None,
              executable_args: Optional[List] = None,
              executable_kwargs: Optional[Dict] = None,
              extra_env_vars: Optional[Dict] = None):
        """Creates the workers, wires up their environment and starts them.

        Workers are created with per-host resources (slots are grouped by
        node); the original authors note that the implementation does not
        quite work for imbalanced nodes and that colocated workers
        typically perform much better than non-colocated ones.

        Args:
            executable_cls (type): The class that will be created within
                an actor (BaseHorovodWorker). This will allow Horovod
                to establish its connections and set env vars.
            executable_args (List): Arguments to be passed into the
                worker class upon initialization. ``None`` is treated
                as an empty list.
            executable_kwargs (Dict): Keyword arguments to be passed into the
                worker class upon initialization.
            extra_env_vars (Dict): Environment variables to be set
                on the actors (worker processes) before initialization.
                ``None`` is treated as an empty dict.

        """
        self.coordinator = Coordinator(self.settings)

        # Resources requested for one host: every slot's CPUs, plus GPUs
        # only when GPU usage is enabled.
        slots = self.num_slots
        self.workers = self._create_workers(
            dict(num_cpus=self.cpus_per_slot * slots,
                 num_gpus=self.gpus_per_slot * slots * int(self.use_gpu)))

        def _fetch_hostname(worker):
            return worker.hostname.remote()

        # Register every worker's hostname with the coordinator; the rank
        # is the worker's position, assuming hostname ordering matches.
        worker_hostnames = map_blocking(_fetch_hostname, self.workers)
        for rank, hostname in enumerate(worker_hostnames):
            self.coordinator.register(hostname, rank)
        rank_env_vars = self.coordinator.finalize_registration()

        # Push the per-rank env vars to the matching worker.
        workers_by_rank = {
            rank: worker for rank, worker in enumerate(self.workers)
        }
        for rank, env_for_rank in rank_env_vars.items():
            workers_by_rank[rank].update_env_vars.remote(env_for_rank)

        # Env vars shared by all workers: rendezvous info, caller-supplied
        # extras, then the detected network interfaces.
        coordinator_envs = self.coordinator.establish_rendezvous()
        coordinator_envs.update(extra_env_vars or {})
        detected_nics = detect_nics(
            self.settings,
            all_host_names=list(self.coordinator.hostnames_by_rank),
            node_workers=self.colocators)
        coordinator_envs.update(nics_to_env_var(detected_nics))

        def _push_shared_env(worker):
            return worker.update_env_vars.remote(coordinator_envs)

        map_blocking(_push_shared_env, self.workers)

        self._start_executables(executable_cls, executable_args or [],
                                executable_kwargs)
Exemplo n.º 3
0
    def start(self,
              executable_cls: type = None,
              executable_args: Optional[List] = None,
              executable_kwargs: Optional[Dict] = None,
              extra_env_vars: Optional[Dict] = None):
        """Starts the rendezvous server and elastic driver, then records
        the executable configuration on this instance.

        Args:
            executable_cls (type): The class that will be created within
                an actor (BaseHorovodWorker). This will allow Horovod
                to establish its connections and set env vars.
            executable_args (List): Arguments to be passed into the
                worker class upon initialization.
            executable_kwargs (Dict): Keyword arguments to be passed into the
                worker class upon initialization.
            extra_env_vars (Dict): Environment variables to be set
                on the actors (worker processes) before initialization.
                Stored as an empty dict when not provided.

        """
        settings = self.settings

        self.rendezvous = RendezvousServer(settings.verbose)
        driver_kwargs = dict(
            rendezvous=self.rendezvous,
            discovery=settings.discovery,
            min_np=self.min_workers,
            max_np=self.max_workers,
            timeout=self.elastic_timeout,
            reset_limit=self.reset_limit,
            cooldown_range=self.cooldown_range,
            verbose=settings.verbose,
        )
        self.driver = ElasticDriver(**driver_kwargs)

        rendezvous_handler = create_rendezvous_handler(self.driver)
        logger.debug("[ray] starting rendezvous")
        rendezvous_port = self.rendezvous.start(rendezvous_handler)

        logger.debug(f"[ray] waiting for {self.num_workers} to start.")
        self.driver.wait_for_available_slots(self.num_workers)

        # Detecting the interface shared between hosts requires at least
        # 2 hosts in an elastic job, so wait again with a host minimum.
        required_hosts = _get_min_start_hosts(settings)
        assigned_hosts = self.driver.wait_for_available_slots(
            self.num_workers, min_hosts=required_hosts)

        logger.debug("[ray] getting common interfaces")
        common_nics = detect_nics(
            settings, all_host_names=assigned_hosts.host_assignment_order)

        logger.debug("[ray] getting driver IP")
        driver_ip = socket.gethostbyname(socket.gethostname())
        self.run_env_vars = create_run_env_vars(
            driver_ip, common_nics, rendezvous_port, elastic=True)

        # Keep the executable description around for later use.
        self.executable_cls = executable_cls
        self.executable_args = executable_args
        self.executable_kwargs = executable_kwargs
        self.env_vars = extra_env_vars if extra_env_vars else {}
Exemplo n.º 4
0
    def on_start(self, worker_group: WorkerGroup, backend_config: HorovodConfig):
        """Sets up Horovod environment variables on every worker in the group.

        Runs ``init_env_vars`` on each worker, registers all workers with a
        ``Coordinator`` built from ``backend_config``, pushes the per-rank
        env vars returned by the coordinator, and finally broadcasts the
        rendezvous and network-interface env vars to the whole group.

        Args:
            worker_group: The group of workers to configure.
            backend_config: Passed to ``Coordinator`` and ``detect_nics``
                as the Horovod settings object.
        """

        # TODO(matt): Implement placement group strategies in BackendExecutor.

        # Initialize workers with Horovod environment variables
        world_size = len(worker_group)
        ray.get([
            worker_group.execute_single_async(
                rank,
                init_env_vars,
                rank,
                world_size,
                worker_group.workers[rank].metadata.node_id,
            )
            for rank in range(world_size)
        ])

        # Use Horovod Ray Coordinator
        # backend_config as settings
        self.coordinator = Coordinator(backend_config)

        # Register each worker's hostname and node id with the coordinator;
        # the rank is the worker's position in the group.
        members = worker_group.workers
        node_ids = [member.metadata.node_id for member in members]
        for rank, member in enumerate(members):
            self.coordinator.register(
                member.metadata.hostname, node_ids[rank], rank)
        rank_env_vars = self.coordinator.finalize_registration()

        # Push the per-rank env vars and wait for all updates to finish.
        ray.get([
            worker_group.execute_single_async(
                rank, update_env_vars, env_for_rank)
            for rank, env_for_rank in rank_env_vars.items()
        ])

        coordinator_envs = self.coordinator.establish_rendezvous()

        # Get one worker from each host/node: the first worker listed
        # for every distinct node id.
        node_workers = [
            HorovodWorkerWrapper(
                worker_group.workers[node_ids.index(node_id)])
            for node_id in set(node_ids)
        ]
        assert len(self.coordinator.hostnames) == len(node_workers)

        detected_nics = detect_nics(
            backend_config,
            all_host_names=list(self.coordinator.hostnames),
            node_workers=node_workers,
        )
        coordinator_envs.update(nics_to_env_var(detected_nics))

        worker_group.execute(update_env_vars, coordinator_envs)