class BackendExecutor:
    def __init__(self,
                 backend_config: BackendConfig,
                 num_workers: int = 1,
                 num_cpus_per_worker: int = 1,
                 num_gpus_per_worker: int = 0):
        self._backend_config = backend_config
        self._num_workers = num_workers
        self._num_cpus_per_worker = num_cpus_per_worker
        self._num_gpus_per_worker = num_gpus_per_worker

    def start(self):
        self.worker_group = WorkerGroup(self._num_workers,
                                        self._num_cpus_per_worker,
                                        self._num_gpus_per_worker)

    def execute(self, train_func: Callable):
        pass

    def shutdown(self):
        self.worker_group.shutdown()

    def run(self, train_func: Callable):
        """Runs the training function.

        1. Starts the executor.
        2. Executes the function.
        3. Shuts down the executor.
        4. Returns results of the function.
        """
        pass
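# Hedged usage sketch of how the proposed BackendExecutor API above is
# intended to be driven, following the start -> execute -> shutdown -> return
# flow documented in run(). TorchConfig and the trivial train_func are
# illustrative placeholders, not part of the proposal.
def example_backend_executor_usage():
    def train_func():
        return 1

    executor = BackendExecutor(
        TorchConfig(), num_workers=2, num_cpus_per_worker=1)
    results = executor.run(train_func)  # one result per worker
    return results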
def test_worker_creation_num_cpus(ray_start_2_cpus):
    assert ray.available_resources()["CPU"] == 2
    wg = WorkerGroup(num_cpus_per_worker=2)
    time.sleep(1)
    assert len(wg.workers) == 1
    # Make sure both CPUs are being used by the actor.
    assert "CPU" not in ray.available_resources()
    wg.shutdown()
def test_bad_resources(ray_start_2_cpus):
    with pytest.raises(ValueError):
        WorkerGroup(num_workers=-1)
    with pytest.raises(ValueError):
        WorkerGroup(num_cpus_per_worker=-1)
    with pytest.raises(ValueError):
        WorkerGroup(num_gpus_per_worker=-1)
def test_worker_restart(ray_start_2_cpus):
    wg = WorkerGroup(num_workers=2)
    with pytest.raises(RuntimeError):
        wg.start()
    # Avoid race condition.
    time.sleep(1)
    wg.shutdown(0)
    wg.start()
    wg.execute(lambda: 1)
def handle_failure(self, worker_group: WorkerGroup,
                   failed_worker_indexes: List[int],
                   backend_config: BackendConfig):
    """Logic for handling failures.

    By default, restart all workers.
    """
    worker_group.shutdown()
    worker_group.start()
    self.on_start(worker_group, backend_config)
def test_execute_single(ray_start_2_cpus):
    wg = WorkerGroup(num_workers=2)

    def f():
        import os
        os.environ["TEST"] = "1"

    wg.execute_single(1, f)

    def check():
        import os
        return os.environ.get("TEST", "0")

    assert wg.execute(check) == ["0", "1"]
def start(self,
          initialization_hook: Optional[Callable[[], None]] = None,
          train_cls: Optional[Type] = None,
          train_cls_args: Optional[Tuple] = None,
          train_cls_kwargs: Optional[Dict] = None):
    """Starts the worker group."""
    self.worker_group = WorkerGroup(
        num_workers=self._num_workers,
        num_cpus_per_worker=self._num_cpus_per_worker,
        num_gpus_per_worker=self._num_gpus_per_worker,
        additional_resources_per_worker=self._additional_resources_per_worker,
        actor_cls=train_cls,
        actor_cls_args=train_cls_args,
        actor_cls_kwargs=train_cls_kwargs)
    try:
        if initialization_hook:
            self._initialization_hook = initialization_hook
            self.worker_group.execute(initialization_hook)
        if self._num_gpus_per_worker > 0:
            self._setup_gpus()
        self._backend.on_start(self.worker_group, self._backend_config)
    except RayActorError as exc:
        logger.exception(str(exc))
        self._increment_failures()
        self._restart()
def start(self,
          initialization_hook: Optional[Callable[[], None]] = None,
          train_cls: Optional[Type] = None,
          train_cls_args: Optional[Tuple] = None,
          train_cls_kwargs: Optional[Dict] = None):
    """Starts the worker group."""
    self.worker_group = WorkerGroup(
        num_workers=self._num_workers,
        num_cpus_per_worker=self._num_cpus_per_worker,
        num_gpus_per_worker=self._num_gpus_per_worker,
        additional_resources_per_worker=self._additional_resources_per_worker,
        actor_cls=train_cls,
        actor_cls_args=train_cls_args,
        actor_cls_kwargs=train_cls_kwargs)
    try:
        if initialization_hook:
            self._initialization_hook = initialization_hook
            self.worker_group.execute(initialization_hook)

        share_cuda_visible_devices_enabled = bool(
            env_integer(ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV,
                        self._backend.share_cuda_visible_devices))

        if (self._num_gpus_per_worker > 0
                and share_cuda_visible_devices_enabled):
            self._share_cuda_visible_devices()
        self._backend.on_start(self.worker_group, self._backend_config)
    except RayActorError as exc:
        logger.exception(str(exc))
        self._increment_failures()
        self._restart()
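# Hedged sketch of the CUDA_VISIBLE_DEVICES sharing step referenced above:
# every worker on a node is shown the union of the GPU ids assigned to any
# worker on that node, so NCCL peers can address each other's devices. The
# metadata fields (node_id, gpu_ids) and the helper name are assumptions
# modeled on the worker metadata used in the Horovod snippet below.
from collections import defaultdict

import ray


def share_cuda_visible_devices_sketch(worker_group):
    # Collect the union of GPU ids per node.
    node_to_gpu_ids = defaultdict(set)
    for worker in worker_group.workers:
        node_to_gpu_ids[worker.metadata.node_id].update(
            worker.metadata.gpu_ids)

    # Set CUDA_VISIBLE_DEVICES on every worker to its node's full set.
    futures = []
    for rank, worker in enumerate(worker_group.workers):
        visible_devices = ",".join(
            str(gpu_id)
            for gpu_id in sorted(node_to_gpu_ids[worker.metadata.node_id]))

        def set_cuda_visible_devices(devices=visible_devices):
            import os
            os.environ["CUDA_VISIBLE_DEVICES"] = devices

        futures.append(
            worker_group.execute_single_async(rank, set_cuda_visible_devices))
    ray.get(futures)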
def start(self, initialization_hook: Optional[Callable[[], None]] = None):
    """Starts the worker group."""
    self.worker_group = WorkerGroup(self._num_workers,
                                    self._num_cpus_per_worker,
                                    self._num_gpus_per_worker)
    if initialization_hook:
        self.worker_group.execute(initialization_hook)
    self._backend.on_start(self.worker_group, self._backend_config)
def on_start(self, worker_group: WorkerGroup, backend_config: TorchConfig):
    if len(worker_group) > 1 and dist.is_available():
        # Set the appropriate training backend.
        if backend_config.backend is None:
            if worker_group.num_gpus_per_worker > 0:
                backend = "nccl"
            else:
                backend = "gloo"
        else:
            backend = backend_config.backend

        master_addr, master_port = worker_group.execute_single(
            0, get_address_and_port)
        if backend_config.init_method == "env":

            def set_env_vars(addr, port):
                os.environ["MASTER_ADDR"] = addr
                os.environ["MASTER_PORT"] = str(port)

            worker_group.execute(
                set_env_vars, addr=master_addr, port=master_port)
            url = "env://"
        elif backend_config.init_method == "tcp":
            url = f"tcp://{master_addr}:{master_port}"
        else:
            raise ValueError(
                f"The provided init_method ("
                f"{backend_config.init_method}) is not supported. Must "
                f"be either 'env' or 'tcp'.")

        setup_futures = []
        for i in range(len(worker_group)):
            setup_futures.append(
                worker_group.execute_single_async(
                    i,
                    setup_torch_process_group,
                    backend=backend,
                    world_rank=i,
                    world_size=len(worker_group),
                    init_method=url,
                    timeout_s=backend_config.timeout_s))
        ray.get(setup_futures)
    else:
        logger.info("Distributed torch is not being used.")
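# Hedged sketch of the per-worker setup_torch_process_group helper invoked
# above, assuming it is a thin wrapper around
# torch.distributed.init_process_group. The keyword arguments mirror the call
# site in on_start; the body itself is an assumption.
from datetime import timedelta

import torch.distributed as dist


def setup_torch_process_group_sketch(backend: str,
                                     world_rank: int,
                                     world_size: int,
                                     init_method: str,
                                     timeout_s: int = 1800):
    # Join the process group; init_method is either "env://" or a tcp:// URL
    # built from the rank-0 worker's address and port.
    dist.init_process_group(
        backend=backend,
        init_method=init_method,
        rank=world_rank,
        world_size=world_size,
        timeout=timedelta(seconds=timeout_s))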
def on_start(self, worker_group: WorkerGroup, backend_config: HorovodConfig):
    # TODO(matt): Implement placement group strategies in BackendExecutor.

    # Initialize workers with Horovod environment variables.
    setup_futures = []
    for rank in range(len(worker_group)):
        worker_node_id = worker_group.workers[rank].metadata.node_id
        setup_futures.append(
            worker_group.execute_single_async(rank, init_env_vars, rank,
                                              len(worker_group),
                                              worker_node_id))
    ray.get(setup_futures)

    # Use the Horovod Ray Coordinator, with backend_config as settings.
    self.coordinator = Coordinator(backend_config)

    # Get the node ids and hostnames of all workers.
    node_ids = [w.metadata.node_id for w in worker_group.workers]
    hostnames = [w.metadata.hostname for w in worker_group.workers]
    # Register each hostname to the coordinator. Assumes the hostname
    # ordering is the same.
    for rank, (hostname, node_id) in enumerate(zip(hostnames, node_ids)):
        self.coordinator.register(hostname, node_id, rank)
    all_info = self.coordinator.finalize_registration()

    setup_futures = []
    for rank, local_cross_env_var in all_info.items():
        setup_futures.append(
            worker_group.execute_single_async(rank, update_env_vars,
                                              local_cross_env_var))
    ray.get(setup_futures)

    coordinator_envs = self.coordinator.establish_rendezvous()

    nics = detect_nics(
        backend_config,
        all_host_names=list(self.coordinator.hostnames),
        node_workers=worker_group.workers)
    coordinator_envs.update(nics_to_env_var(nics))

    worker_group.execute(update_env_vars, coordinator_envs)
def on_start(self, worker_group: WorkerGroup,
             backend_config: TensorflowConfig):
    if len(worker_group) > 1:
        # Compute URL for initializing distributed setup.
        def get_url():
            address, port = get_address_and_port()
            return f"{address}:{port}"

        urls = worker_group.execute(get_url)

        # Get setup tasks in order to throw errors on failure.
        setup_futures = []
        for i in range(len(worker_group)):
            setup_futures.append(
                worker_group.execute_single_async(
                    i,
                    setup_tensorflow_environment,
                    worker_addresses=urls,
                    index=i))
        ray.get(setup_futures)
    else:
        logger.info("Distributed Tensorflow is not being used.")
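# Hedged sketch of the per-worker setup_tensorflow_environment helper used
# above, assuming it publishes the standard TF_CONFIG environment variable
# that tf.distribute.MultiWorkerMirroredStrategy reads. The parameter names
# follow the call site; the body is an assumption.
import json
import os
from typing import List


def setup_tensorflow_environment_sketch(worker_addresses: List[str],
                                        index: int):
    tf_config = {
        "cluster": {
            "worker": worker_addresses
        },
        "task": {
            "type": "worker",
            "index": index
        },
    }
    os.environ["TF_CONFIG"] = json.dumps(tf_config)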
def start(self, initialization_hook: Optional[Callable[[], None]] = None):
    """Starts the worker group."""
    self.worker_group = WorkerGroup(self._num_workers,
                                    self._num_cpus_per_worker,
                                    self._num_gpus_per_worker,
                                    self._additional_resources_per_worker)
    try:
        if initialization_hook:
            self._initialization_hook = initialization_hook
            self.worker_group.execute(initialization_hook)
        if self._num_gpus_per_worker > 0:
            self._setup_gpus()
        self._backend.on_start(self.worker_group, self._backend_config)
    except RayActorError as exc:
        logger.exception(str(exc))
        self._increment_failures()
        self._restart()
def test_worker_shutdown(ray_start_2_cpus):
    assert ray.available_resources()["CPU"] == 2
    wg = WorkerGroup(num_workers=2)
    time.sleep(1)
    assert "CPU" not in ray.available_resources()
    assert len(ray.state.actors()) == 2
    wg.shutdown()
    time.sleep(1)
    assert ray.available_resources()["CPU"] == 2

    with pytest.raises(RuntimeError):
        wg.execute(lambda: 1)
def handle_failure(self, worker_group: WorkerGroup,
                   failed_worker_indexes: List[int],
                   backend_config: BackendConfig):
    """Failure handling for Tensorflow.

    Instead of restarting all workers, the failed workers are removed from
    the ``WorkerGroup``. The backend and session are shut down on the
    remaining workers. Then new workers are added back in.
    """
    worker_group.remove_workers(failed_worker_indexes)
    if len(worker_group) > 0:
        self.on_shutdown(worker_group, backend_config)
        worker_group.execute(shutdown_session)
    worker_group.add_workers(len(failed_worker_indexes))
    self.on_start(worker_group, backend_config)
def on_shutdown(self, worker_group: WorkerGroup,
                backend_config: TorchConfig):
    if len(worker_group):
        worker_group.execute(dist.destroy_process_group)
    worker_group.execute(shutdown_torch)
def start(self):
    self.worker_group = WorkerGroup(self._num_workers,
                                    self._num_cpus_per_worker,
                                    self._num_gpus_per_worker)
def test_execute_args(ray_start_2_cpus):
    wg = WorkerGroup(num_workers=2)
    outputs = wg.execute(lambda x: x, 1)
    assert len(outputs) == 2
    assert all(o == 1 for o in outputs)
def test_execute_async(ray_start_2_cpus):
    wg = WorkerGroup(num_workers=2)
    futures = wg.execute_async(lambda: 1)
    assert len(futures) == 2
    outputs = ray.get(futures)
    assert all(o == 1 for o in outputs)
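# Minimal sketch (an assumption, not the actual implementation) of the
# WorkerGroup surface the tests above exercise: one Ray actor per worker,
# execute() runs a function on every worker, execute_single() on one worker,
# and execute_async() returns ObjectRefs instead of blocking. Restart and
# failure-handling behavior is omitted.
import ray


@ray.remote
class _SketchWorker:
    def run(self, func, *args, **kwargs):
        return func(*args, **kwargs)


class SketchWorkerGroup:
    def __init__(self, num_workers=1, num_cpus_per_worker=1,
                 num_gpus_per_worker=0):
        if (num_workers <= 0 or num_cpus_per_worker < 0
                or num_gpus_per_worker < 0):
            raise ValueError("num_workers must be positive and per-worker "
                             "resources must be non-negative.")
        self.workers = [
            _SketchWorker.options(
                num_cpus=num_cpus_per_worker,
                num_gpus=num_gpus_per_worker).remote()
            for _ in range(num_workers)
        ]

    def execute_async(self, func, *args, **kwargs):
        return [w.run.remote(func, *args, **kwargs) for w in self.workers]

    def execute(self, func, *args, **kwargs):
        return ray.get(self.execute_async(func, *args, **kwargs))

    def execute_single(self, worker_index, func, *args, **kwargs):
        return ray.get(
            self.workers[worker_index].run.remote(func, *args, **kwargs))

    def shutdown(self):
        for worker in self.workers:
            ray.kill(worker)
        self.workers = []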