def test_shutdown(ray_start_2_cpus):
    """A shut-down executor must reject further training requests.

    Starts a two-worker BackendExecutor, verifies the worker group was
    created, shuts it down, and asserts that any subsequent call to
    ``start_training`` raises ``InactiveWorkerGroupError``.
    """
    executor = BackendExecutor(TestConfig(), num_workers=2)
    executor.start()

    # Both requested workers should be present before shutdown.
    assert len(executor.worker_group) == 2

    executor.shutdown()

    # After shutdown the executor is inactive and must refuse new work.
    with pytest.raises(InactiveWorkerGroupError):
        executor.start_training(lambda: 1, dataset_spec=EMPTY_RAY_DATASET_SPEC)
def training_loop(self) -> None:
    """Run the distributed training loop for this trainer.

    Validates the scaling configuration, spins up a ``BackendExecutor``
    with the requested per-worker resources, iterates the resulting
    ``TrainingIterator`` while reporting results to Tune, and shuts the
    workers down once training completes.
    """
    scaling_cfg = self._validate_and_get_scaling_config_data_class(
        self.scaling_config
    )

    # Wrap the user-provided loop so it accepts the standard signature.
    train_fn = construct_train_func(
        self._train_loop_per_worker,
        self._train_loop_config,
        fn_arg_name="train_loop_per_worker",
    )

    # Metadata describing the Tune trial this training run belongs to.
    trial_info = TrialInfo(
        name=session.get_trial_name(),
        id=session.get_trial_id(),
        resources=session.get_trial_resources(),
        logdir=os.getcwd(),
    )

    executor = BackendExecutor(
        backend_config=self._backend_config,
        trial_info=trial_info,
        num_workers=scaling_cfg.num_workers,
        num_cpus_per_worker=scaling_cfg.num_cpus_per_worker,
        num_gpus_per_worker=scaling_cfg.num_gpus_per_worker,
        additional_resources_per_worker=(
            scaling_cfg.additional_resources_per_worker
        ),
        max_retries=0,
    )

    ckpt_manager = self._checkpoint_manager_cls(preprocessor=self.preprocessor)

    # Start the remote actors.
    executor.start(initialization_hook=None)

    iterator = TrainingIterator(
        backend_executor=executor,
        backend_config=self._backend_config,
        train_func=train_fn,
        dataset_spec=self._ingest_spec,
        checkpoint_manager=ckpt_manager,
        checkpoint=self.resume_from_checkpoint,
        checkpoint_strategy=None,
    )

    for results in iterator:
        # TODO(ml-team): add ability to report results from multiple workers.
        first_worker_results = results[0]
        tune.report(**first_worker_results)

    # Shutdown workers.
    executor.shutdown()