Example #1
class BackendExecutor:
    def __init__(self,
                 backend_config: BackendConfig,
                 num_workers: int = 1,
                 num_cpus_per_worker: int = 1,
                 num_gpus_per_worker: int = 0):
        self._backend_config = backend_config
        self._num_workers = num_workers
        self._num_cpus_per_worker = num_cpus_per_worker
        self._num_gpus_per_worker = num_gpus_per_worker

    def start(self):
        self.worker_group = WorkerGroup(self._num_workers,
                                        self._num_cpus_per_worker,
                                        self._num_gpus_per_worker)

    def execute(self, train_func: Callable):
        pass

    def shutdown(self):
        self.worker_group.shutdown()

    def run(self, train_func: Callable):
        """ Runs the training function.

                1. Starts the executor.
                2. Executes the function.
                3. Shuts down the executor.
                4. Returns results of the function.
        """
        pass
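The run method above is left as a stub; a minimal sketch of the four steps its docstring describes could look like the following (the try/finally cleanup is an assumption, not part of the original interface):

    def run(self, train_func: Callable):
        # 1. Start the executor.
        self.start()
        try:
            # 2. Execute the function on the workers.
            results = self.execute(train_func)
        finally:
            # 3. Shut down the executor, even if execution raised.
            self.shutdown()
        # 4. Return the results of the function.
        return results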
Example #2
def test_worker_creation_num_cpus(ray_start_2_cpus):
    assert ray.available_resources()["CPU"] == 2
    wg = WorkerGroup(num_cpus_per_worker=2)
    time.sleep(1)
    assert len(wg.workers) == 1
    # Make sure both CPUs are being used by the actor.
    assert "CPU" not in ray.available_resources()
    wg.shutdown()
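These tests rely on a ray_start_2_cpus pytest fixture that is not shown in the snippets; a minimal sketch, assuming it simply starts and stops a local two-CPU Ray cluster, is:

import pytest
import ray


@pytest.fixture
def ray_start_2_cpus():
    # Start a local Ray cluster with exactly two CPUs for the test.
    address_info = ray.init(num_cpus=2)
    yield address_info
    # Tear the cluster down so the next test starts from a clean state.
    ray.shutdown()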
Example #3
def test_bad_resources(ray_start_2_cpus):
    with pytest.raises(ValueError):
        WorkerGroup(num_workers=-1)

    with pytest.raises(ValueError):
        WorkerGroup(num_cpus_per_worker=-1)

    with pytest.raises(ValueError):
        WorkerGroup(num_gpus_per_worker=-1)
Example #4
def test_worker_restart(ray_start_2_cpus):
    wg = WorkerGroup(num_workers=2)
    with pytest.raises(RuntimeError):
        wg.start()
    # Avoid race condition.
    time.sleep(1)
    wg.shutdown(0)
    wg.start()
    wg.execute(lambda: 1)
Example #5
    def handle_failure(self, worker_group: WorkerGroup,
                       failed_worker_indexes: List[int],
                       backend_config: BackendConfig):
        """Logic for handling failures.

        By default, restart all workers.
        """
        worker_group.shutdown()
        worker_group.start()
        self.on_start(worker_group, backend_config)
Example #6
def test_execute_single(ray_start_2_cpus):
    wg = WorkerGroup(num_workers=2)

    def f():
        import os
        os.environ["TEST"] = "1"

    wg.execute_single(1, f)

    def check():
        import os
        return os.environ.get("TEST", "0")

    assert wg.execute(check) == ["0", "1"]
Example #7
    def start(self,
              initialization_hook: Optional[Callable[[], None]] = None,
              train_cls: Optional[Type] = None,
              train_cls_args: Optional[Tuple] = None,
              train_cls_kwargs: Optional[Dict] = None):
        """Starts the worker group."""
        self.worker_group = WorkerGroup(
            num_workers=self._num_workers,
            num_cpus_per_worker=self._num_cpus_per_worker,
            num_gpus_per_worker=self._num_gpus_per_worker,
            additional_resources_per_worker=self.
            _additional_resources_per_worker,
            actor_cls=train_cls,
            actor_cls_args=train_cls_args,
            actor_cls_kwargs=train_cls_kwargs)
        try:
            if initialization_hook:
                self._initialization_hook = initialization_hook
                self.worker_group.execute(initialization_hook)
            if self._num_gpus_per_worker > 0:
                self._setup_gpus()
            self._backend.on_start(self.worker_group, self._backend_config)
        except RayActorError as exc:
            logger.exception(str(exc))
            self._increment_failures()
            self._restart()
Example #8
    def start(self,
              initialization_hook: Optional[Callable[[], None]] = None,
              train_cls: Optional[Type] = None,
              train_cls_args: Optional[Tuple] = None,
              train_cls_kwargs: Optional[Dict] = None):
        """Starts the worker group."""
        self.worker_group = WorkerGroup(
            num_workers=self._num_workers,
            num_cpus_per_worker=self._num_cpus_per_worker,
            num_gpus_per_worker=self._num_gpus_per_worker,
            additional_resources_per_worker=self.
            _additional_resources_per_worker,
            actor_cls=train_cls,
            actor_cls_args=train_cls_args,
            actor_cls_kwargs=train_cls_kwargs)
        try:
            if initialization_hook:
                self._initialization_hook = initialization_hook
                self.worker_group.execute(initialization_hook)

            share_cuda_visible_devices_enabled = bool(
                env_integer(ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV,
                            self._backend.share_cuda_visible_devices))

            if (self._num_gpus_per_worker > 0
                    and share_cuda_visible_devices_enabled):
                self._share_cuda_visible_devices()
            self._backend.on_start(self.worker_group, self._backend_config)
        except RayActorError as exc:
            logger.exception(str(exc))
            self._increment_failures()
            self._restart()
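The _share_cuda_visible_devices helper referenced above is not included in these examples. A rough sketch, assuming each worker can report its node id and assigned GPU ids through ray.get_runtime_context() and ray.get_gpu_ids(), and that every worker on a node should see the union of the GPUs used by its co-located workers (os, ray and collections.defaultdict imports assumed):

    def _share_cuda_visible_devices(self):
        def get_node_id_and_gpu_ids():
            # Executed on each worker: report where it runs and which GPUs it owns.
            return ray.get_runtime_context().node_id, ray.get_gpu_ids()

        node_and_gpu_ids = self.worker_group.execute(get_node_id_and_gpu_ids)

        # Group GPU ids by node so co-located workers share the same visible set.
        gpu_ids_per_node = defaultdict(set)
        for node_id, gpu_ids in node_and_gpu_ids:
            gpu_ids_per_node[node_id].update(gpu_ids)

        def set_cuda_visible_devices(gpu_ids):
            os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
                str(gpu_id) for gpu_id in sorted(gpu_ids))

        futures = []
        for rank, (node_id, _) in enumerate(node_and_gpu_ids):
            futures.append(
                self.worker_group.execute_single_async(
                    rank, set_cuda_visible_devices,
                    gpu_ids_per_node[node_id]))
        ray.get(futures)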
Example #9
    def start(self, initialization_hook: Optional[Callable[[], None]] = None):
        """Starts the worker group."""
        self.worker_group = WorkerGroup(self._num_workers,
                                        self._num_cpus_per_worker,
                                        self._num_gpus_per_worker)
        if initialization_hook:
            self.worker_group.execute(initialization_hook)
        self._backend.on_start(self.worker_group, self._backend_config)
Example #10
    def on_start(self, worker_group: WorkerGroup, backend_config: TorchConfig):
        if len(worker_group) > 1 and dist.is_available():
            # Set the appropriate training backend.
            if backend_config.backend is None:
                if worker_group.num_gpus_per_worker > 0:
                    backend = "nccl"
                else:
                    backend = "gloo"
            else:
                backend = backend_config.backend

            master_addr, master_port = worker_group.execute_single(
                0, get_address_and_port)
            if backend_config.init_method == "env":

                def set_env_vars(addr, port):
                    os.environ["MASTER_ADDR"] = addr
                    os.environ["MASTER_PORT"] = str(port)

                worker_group.execute(set_env_vars,
                                     addr=master_addr,
                                     port=master_port)
                url = "env://"
            elif backend_config.init_method == "tcp":
                url = f"tcp://{master_addr}:{master_port}"
            else:
                raise ValueError(
                    f"The provided init_method ("
                    f"{backend_config.init_method}) is not supported. Must "
                    f"be either 'env' or 'tcp'.")

            setup_futures = []
            for i in range(len(worker_group)):
                setup_futures.append(
                    worker_group.execute_single_async(
                        i,
                        setup_torch_process_group,
                        backend=backend,
                        world_rank=i,
                        world_size=len(worker_group),
                        init_method=url,
                        timeout_s=backend_config.timeout_s))
            ray.get(setup_futures)
        else:
            logger.info("Distributed torch is not being used.")
Example #11
    def on_start(self, worker_group: WorkerGroup,
                 backend_config: HorovodConfig):

        # TODO(matt): Implement placement group strategies in BackendExecutor.

        # Initialize workers with Horovod environment variables
        setup_futures = []
        for rank in range(len(worker_group)):
            worker_node_id = worker_group.workers[rank].metadata.node_id
            setup_futures.append(
                worker_group.execute_single_async(rank, init_env_vars, rank,
                                                  len(worker_group),
                                                  worker_node_id))
        ray.get(setup_futures)

        # Use the Horovod Ray Coordinator, with backend_config as its settings.
        self.coordinator = Coordinator(backend_config)

        # Get the node ids and hostnames of all workers.
        node_ids = [w.metadata.node_id for w in worker_group.workers]
        hostnames = [w.metadata.hostname for w in worker_group.workers]
        # Register each hostname with the coordinator. Assumes the hostname
        # ordering is the same.
        for rank, (hostname, node_id) in enumerate(zip(hostnames, node_ids)):
            self.coordinator.register(hostname, node_id, rank)
        all_info = self.coordinator.finalize_registration()

        setup_futures = []
        for rank, local_cross_env_var in all_info.items():
            setup_futures.append(
                worker_group.execute_single_async(rank, update_env_vars,
                                                  local_cross_env_var))
        ray.get(setup_futures)

        coordinator_envs = self.coordinator.establish_rendezvous()

        nics = detect_nics(
            backend_config,
            all_host_names=list(self.coordinator.hostnames),
            node_workers=worker_group.workers)
        coordinator_envs.update(nics_to_env_var(nics))

        worker_group.execute(update_env_vars, coordinator_envs)
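The init_env_vars and update_env_vars helpers are not shown here. update_env_vars is assumed to do nothing more than apply a dictionary of environment variables on the worker that runs it, for example (os and typing imports assumed):

def update_env_vars(env_vars: Dict[str, Any]):
    # Hypothetical sketch: stringify the values and merge them into the
    # worker's environment.
    sanitized = {key: str(value) for key, value in env_vars.items()}
    os.environ.update(sanitized)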
Example #12
    def on_start(self, worker_group: WorkerGroup,
                 backend_config: TensorflowConfig):
        if len(worker_group) > 1:
            # Compute URL for initializing distributed setup.
            def get_url():
                address, port = get_address_and_port()
                return f"{address}:{port}"

            urls = worker_group.execute(get_url)

            # Get setup tasks in order to throw errors on failure.
            setup_futures = []
            for i in range(len(worker_group)):
                setup_futures.append(
                    worker_group.execute_single_async(
                        i,
                        setup_tensorflow_environment,
                        worker_addresses=urls,
                        index=i))
            ray.get(setup_futures)

        else:
            logger.info("Distributed Tensorflow is not being used.")
Example #13
    def start(self, initialization_hook: Optional[Callable[[], None]] = None):
        """Starts the worker group."""
        self.worker_group = WorkerGroup(self._num_workers,
                                        self._num_cpus_per_worker,
                                        self._num_gpus_per_worker,
                                        self._additional_resources_per_worker)
        try:
            if initialization_hook:
                self._initialization_hook = initialization_hook
                self.worker_group.execute(initialization_hook)
            if self._num_gpus_per_worker > 0:
                self._setup_gpus()
            self._backend.on_start(self.worker_group, self._backend_config)
        except RayActorError as exc:
            logger.exception(str(exc))
            self._increment_failures()
            self._restart()
Example #14
def test_worker_shutdown(ray_start_2_cpus):
    assert ray.available_resources()["CPU"] == 2
    wg = WorkerGroup(num_workers=2)
    time.sleep(1)
    assert "CPU" not in ray.available_resources()
    assert len(ray.state.actors()) == 2
    wg.shutdown()
    time.sleep(1)
    assert ray.available_resources()["CPU"] == 2

    with pytest.raises(RuntimeError):
        wg.execute(lambda: 1)
Example #15
    def handle_failure(self, worker_group: WorkerGroup,
                       failed_worker_indexes: List[int],
                       backend_config: BackendConfig):
        """Failure handling for Tensorflow.

        Instead of restarting all workers, the failed workers are
        removed from the ``WorkerGroup``. The backend and session are
        shut down on the remaining workers. Then new workers are added back in.
        """
        worker_group.remove_workers(failed_worker_indexes)
        if len(worker_group) > 0:
            self.on_shutdown(worker_group, backend_config)
            worker_group.execute(shutdown_session)
        worker_group.add_workers(len(failed_worker_indexes))
        self.on_start(worker_group, backend_config)
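The shutdown_session function executed on the remaining workers is assumed to reset TensorFlow's global state so those workers can rejoin a newly configured cluster; one plausible sketch:

def shutdown_session():
    # Hypothetical sketch: clear the global Keras/TensorFlow session state.
    import tensorflow as tf
    tf.keras.backend.clear_session()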
Example #16
    def on_shutdown(self, worker_group: WorkerGroup,
                    backend_config: TorchConfig):
        if len(worker_group):
            worker_group.execute(dist.destroy_process_group)
        worker_group.execute(shutdown_torch)
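shutdown_torch is assumed to release GPU resources held by the worker before it is reused or torn down; a minimal sketch:

def shutdown_torch():
    # Hypothetical sketch: free cached GPU memory if this worker was using CUDA.
    import torch
    if torch.cuda.is_available():
        torch.cuda.empty_cache()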
Example #17
    def start(self):
        self.worker_group = WorkerGroup(self._num_workers,
                                        self._num_cpus_per_worker,
                                        self._num_gpus_per_worker)
Example #18
def test_execute_args(ray_start_2_cpus):
    wg = WorkerGroup(num_workers=2)
    outputs = wg.execute(lambda x: x, 1)
    assert len(outputs) == 2
    assert all(o == 1 for o in outputs)
Example #19
def test_execute_async(ray_start_2_cpus):
    wg = WorkerGroup(num_workers=2)
    futures = wg.execute_async(lambda: 1)
    assert len(futures) == 2
    outputs = ray.get(futures)
    assert all(o == 1 for o in outputs)