def test_shutdown(ray_start_regular_shared):
    q = Queue()
    actor = q.actor
    q.shutdown()
    assert q.actor is None
    with pytest.raises(RayActorError):
        ray.get(actor.empty.remote())
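# A minimal usage sketch of the shutdown semantics exercised above, assuming
# a running Ray instance. `shutdown()` kills the queue's backing actor, so
# any handle still pointing at that actor will observe a RayActorError.
import ray
from ray.util.queue import Queue

ray.init(ignore_reinit_error=True)
q = Queue()
q.put("item")
q.shutdown()  # Terminates the backing actor; q.actor becomes None.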
def execution_loop(self, trainer, tune_enabled: bool = True):
    """Main execution loop for training, testing, and prediction.

    Sets up the torch.distributed process group for each worker, then
    triggers remote training/testing/evaluation via ``execute_remote`` on
    each worker. If used with Ray Tune, creates a communication queue to
    retrieve and process intermediate results. Finally, retrieves the
    training results from the rank 0 worker and returns them.
    """
    # Set environment variables for all workers.
    self._setup_env_vars()

    self.global_to_local = self.get_local_ranks()

    model = self._model
    model_ref = ray.put(model)
    # Don't pickle the model when training remotely.
    self._model = None

    queue = None
    if tune_enabled and TUNE_INSTALLED and is_session_enabled():
        # Create a communication queue and send it to all the workers.
        queue = Queue(actor_options={"num_cpus": 0})

    futures = [
        self.workers[i].execute.remote(self.execute_remote, model_ref, i,
                                       queue)
        for i in range(self.num_workers)
    ]

    results = process_results(futures, queue)

    # Get the results, checkpoint path, and model weights from worker 0.
    results, best_path, state_stream = results[0]
    state_dict = load_state_stream(state_stream, to_gpu=self.use_gpu)
    # Set the state for PTL using the output from remote training.
    self._results = results
    self._model = model
    self._model.load_state_dict(state_dict)
    if self.lightning_module.trainer.checkpoint_callback:
        self.lightning_module.trainer.checkpoint_callback \
            .best_model_path = best_path

    if queue:
        # Shut down the queue.
        queue.shutdown()

    return results
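# `process_results` is referenced above but not defined here. A hedged
# sketch of what it plausibly does: block on the per-worker futures while
# draining intermediate results from the Tune queue and reporting them.
# The queue item layout and the `tune.report` call are assumptions.
import time

import ray
from ray import tune


def process_results(result_futures, queue):
    unfinished = list(result_futures)
    while unfinished:
        if queue is not None:
            while not queue.empty():
                metrics = queue.get_nowait()
                tune.report(**metrics)  # Forward intermediate metrics to Tune.
        # Poll without blocking so the queue keeps getting drained.
        _, unfinished = ray.wait(unfinished, timeout=0)
        time.sleep(0.1)
    return ray.get(result_futures)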
def test_custom_resources(ray_start_regular_shared):
    current_resources = ray.available_resources()
    assert current_resources["CPU"] == 1.0

    # By default an actor should not reserve any resources.
    q = Queue()
    current_resources = ray.available_resources()
    assert current_resources["CPU"] == 1.0
    q.shutdown()

    # Specify a resource requirement. The queue should now reserve 1 CPU.
    q = Queue(actor_options={"num_cpus": 1})

    def no_cpu_in_resources():
        return "CPU" not in ray.available_resources()

    wait_for_condition(no_cpu_in_resources)
    q.shutdown()
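# `wait_for_condition` comes from Ray's test utilities. For reference, a
# hedged, self-contained equivalent (an assumption, not the real helper):
import time


def wait_for_condition(condition, timeout=10, retry_interval_ms=100):
    """Poll `condition` until it returns True or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if condition():
            return
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError("Condition was not met within the timeout.")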
def start_training(self, trainer):
    """Main training loop.

    Triggers remote training via ``train_remote`` on each worker. If used
    with Ray Tune, creates a communication queue to retrieve and process
    intermediate results. Finally, retrieves the training results from the
    rank 0 worker and returns them.
    """
    model = self._model
    model_ref = ray.put(model)
    # Don't pickle the model when training remotely.
    self._model = None

    queue = None
    if TUNE_INSTALLED and is_session_enabled():
        # Create a communication queue and send it to all the workers.
        queue = Queue(actor_options={"num_cpus": 0})

    result_futures = self.executor.run_remote(
        self.train_remote, args=[model_ref, queue])

    results = process_results(result_futures, queue)

    # Get the results, model weights, and checkpoint path from worker 0.
    results, state_dict, best_path = results[0]
    self._results = results
    self._model = model
    self._model.load_state_dict(state_dict)
    self._model.trainer.accelerator.training_type_plugin = self
    if self.lightning_module.trainer.checkpoint_callback:
        self.lightning_module.trainer.checkpoint_callback \
            .best_model_path = best_path

    if queue:
        # Shut down the queue.
        queue.shutdown()

    return results
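# A hedged sketch of the worker-side counterpart: during remote training,
# the rank 0 worker can push intermediate metrics onto the shared queue,
# which the driver's `process_results` loop drains. The helper name and
# item layout are assumptions for illustration.
def report_intermediate_metrics(queue, metrics: dict):
    if queue is not None:
        queue.put(metrics)  # Picked up by the driver's result-processing loop.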
def run(self,
        worker_fn: Callable,
        callbacks: Optional[List[Callable]] = None) -> List[Any]:
    """Executes the provided function on all workers.

    Args:
        worker_fn: Target elastic function that can be executed.
        callbacks: List of callables. Each callback must either be a
            callable function or a class that implements ``__call__``.
            Every callback will be invoked on every value logged by the
            rank 0 worker.

    Returns:
        List of return values from every completed worker.
    """
    return_values = []

    from ray.util.queue import Queue
    import inspect
    args = inspect.getfullargspec(Queue).args
    if "actor_options" not in args:
        # Ray 1.1 and earlier.
        _queue = Queue()
    else:
        _queue = Queue(actor_options={
            "num_cpus": 0,
            "resources": {
                ray.state.current_node_id(): 0.001
            }
        })
    self.driver.start(
        self.settings.num_proc,
        self._create_spawn_worker_fn(return_values, worker_fn, _queue))

    def _process_calls(queue, callbacks, event):
        if not callbacks:
            return
        while queue.actor:
            if not queue.empty():
                result = queue.get_nowait()
                for c in callbacks:
                    c(result)
            # Avoid a busy loop; the sleep below throttles polling.
            elif event.is_set():
                break
            time.sleep(0.1)

    try:
        event = threading.Event()
        _callback_thread = threading.Thread(
            target=_process_calls,
            args=(_queue, callbacks, event),
            daemon=True)
        _callback_thread.start()
        res = self.driver.get_results()
        event.set()
        if _callback_thread:
            _callback_thread.join(timeout=60)
    finally:
        if hasattr(_queue, "shutdown"):
            _queue.shutdown()
        else:
            done_ref = _queue.actor.__ray_terminate__.remote()
            done, not_done = ray.wait([done_ref], timeout=5)
            if not_done:
                ray.kill(_queue.actor)
        self.driver.stop()

    if res.error_message is not None:
        raise RuntimeError(res.error_message)

    for name, value in sorted(
            res.worker_results.items(), key=lambda item: item[1][1]):
        exit_code, timestamp = value
        if exit_code != 0:
            raise RuntimeError(
                'Horovod detected that one or more processes exited with '
                'non-zero status, thus causing the job to be terminated. '
                'The first process to do so was:\n'
                'Process name: {name}\nExit code: {code}\n'.format(
                    name=name, code=exit_code))

    return_values = [
        value for k, value in sorted(return_values, key=lambda kv: kv[0])
    ]
    return return_values
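# Example of a class-based callback for `run`: anything callable works,
# including an instance with `__call__`. The executor and function names
# below are hypothetical.
class MetricsCollector:
    def __init__(self):
        self.history = []

    def __call__(self, value):
        # Invoked for every value the rank 0 worker logs to the queue.
        self.history.append(value)


collector = MetricsCollector()
results = executor.run(train_fn, callbacks=[collector])  # Hypothetical names.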