def test_placement_group_pack(ray_4_node_4_cpu, num_workers):
    """Tests that workers are packed on nodes."""
    config = TestConfig()
    e = BackendExecutor(config, num_workers=num_workers)
    e.start()
    node_id_set = get_node_id_set()
    assert len(node_id_set) == math.ceil(num_workers / 4)
def test_placement_group_spread(ray_4_node_4_cpu, num_workers):
    """Tests that workers are spread across nodes."""
    os.environ[TRAIN_ENABLE_WORKER_SPREAD_ENV] = "1"
    config = TestConfig()
    e = BackendExecutor(config, num_workers=num_workers)
    e.start()
    node_id_set = get_node_id_set()
    assert len(node_id_set) == min(num_workers, 4)
def test_shutdown(ray_start_2_cpus):
    """Tests that training cannot be started after the executor is shut down."""
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()
    assert len(e.worker_group) == 2
    e.shutdown()

    with pytest.raises(InactiveWorkerGroupError):
        e.start_training(lambda: 1)
def test_torch_start_shutdown(ray_start_2_cpus, init_method):
    torch_config = TorchConfig(backend="gloo", init_method=init_method)
    e = BackendExecutor(torch_config, num_workers=2)
    e.start()

    def check_process_group():
        import torch

        return (
            torch.distributed.is_initialized()
            and torch.distributed.get_world_size() == 2
        )

    e.start_training(check_process_group)
    assert all(e.finish_training())

    e._backend.on_shutdown(e.worker_group, e._backend_config)

    e.start_training(check_process_group)
    assert not any(e.finish_training())
def test_train(ray_start_2_cpus):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    e.start_training(lambda: 1)
    assert e.finish_training() == [1, 1]
def test_train(ray_start_2_cpus):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    e.start_training(lambda: 1, dataset_spec=EMPTY_RAY_DATASET_SPEC)
    assert e.finish_training() == [1, 1]
def test_mismatch_checkpoint_report(ray_start_2_cpus):
    """Tests that mismatched checkpoint/report calls across workers raise an error."""

    def train_func():
        if train.world_rank() == 0:
            train.save_checkpoint(epoch=0)
        else:
            train.report(iter=0)

    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()
    e.start_training(train_func)
    with pytest.raises(RuntimeError):
        e.get_next_results()
def test_worker_failure(ray_start_2_cpus):
    """Tests that a worker actor exiting surfaces as TrainingWorkerError."""
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    def train_fail():
        ray.actor.exit_actor()

    new_execute_func = gen_execute_special(train_fail)
    with patch.object(WorkerGroup, "execute_async", new_execute_func):
        with pytest.raises(TrainingWorkerError):
            e.start_training(lambda: 1)
            e.finish_training()
def test_local_ranks(ray_start_2_cpus):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    def train_func():
        return train.local_rank()

    e.start_training(train_func, dataset_spec=EMPTY_RAY_DATASET_SPEC)
    assert set(e.finish_training()) == {0, 1}
def test_local_ranks(ray_start_2_cpus):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    def train_func():
        return train.local_rank()

    e.start_training(train_func)
    assert set(e.finish_training()) == {0, 1}
def test_start(ray_start_2_cpus):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    with pytest.raises(InactiveWorkerGroupError):
        e.start_training(lambda: 1, dataset_spec=EMPTY_RAY_DATASET_SPEC)
    e.start()
    assert len(e.worker_group) == 2
def test_start(ray_start_2_cpus, tmp_path):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    with pytest.raises(InactiveWorkerGroupError):
        e.start_training(lambda: 1, run_dir=tmp_path)
    e.start()
    assert len(e.worker_group) == 2
def training_loop(self) -> None:
    scaling_config_dataclass = self._validate_and_get_scaling_config_data_class(
        self.scaling_config
    )

    train_loop_per_worker = construct_train_func(
        self.train_loop_per_worker,
        self.train_loop_config,
        fn_arg_name="train_loop_per_worker",
    )

    additional_resources_per_worker = (
        scaling_config_dataclass.additional_resources_per_worker
    )

    backend_executor = BackendExecutor(
        backend_config=self.backend_config,
        num_workers=scaling_config_dataclass.num_workers,
        num_cpus_per_worker=scaling_config_dataclass.num_cpus_per_worker,
        num_gpus_per_worker=scaling_config_dataclass.num_gpus_per_worker,
        additional_resources_per_worker=additional_resources_per_worker,
        max_retries=0,
    )

    checkpoint_manager = self._checkpoint_manager_cls()
    checkpoint_manager.on_init(preprocessor=self.preprocessor)

    # Start the remote actors.
    backend_executor.start(initialization_hook=None)

    if self.resume_from_checkpoint:
        resume_checkpoint_dict = self.resume_from_checkpoint.to_dict()
    else:
        resume_checkpoint_dict = None

    dataset_spec = _RayDatasetSpec(
        dataset_or_dict=self.datasets, dataset_split_fn=_default_dataset_split_fn
    )

    # TODO(amog): Have TrainingIterator also accept a checkpoint ObjectRef instead
    # of just a Dict.
    training_iterator = TrainingIterator(
        backend_executor=backend_executor,
        backend_config=self.backend_config,
        train_func=train_loop_per_worker,
        dataset_spec=dataset_spec,
        checkpoint_manager=checkpoint_manager,
        checkpoint=resume_checkpoint_dict,
        checkpoint_strategy=None,
    )

    for results in training_iterator:
        # TODO(ml-team): add ability to report results from multiple workers.
        first_worker_results = results[0]
        tune.report(**first_worker_results)

    # Shutdown workers.
    backend_executor.shutdown()
def test_initialization_hook(ray_start_2_cpus):
    """Tests that the initialization hook runs on every worker before training."""
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)

    def init_hook():
        import os

        os.environ["TEST"] = "1"

    e.start(initialization_hook=init_hook)

    def check():
        import os

        return os.getenv("TEST", "0")

    e.start_training(check)
    assert e.finish_training() == ["1", "1"]
def test_cuda_visible_devices_multiple(ray_2_node_4_gpu, worker_results):
    config = TestConfig()

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    num_workers, expected_results = worker_results

    os.environ[ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV] = "1"
    e = BackendExecutor(
        config, num_workers=num_workers, num_cpus_per_worker=0, num_gpus_per_worker=2
    )
    e.start()
    e.start_training(get_resources, dataset_spec=EMPTY_RAY_DATASET_SPEC)
    results = e.finish_training()
    results.sort()
    assert results == expected_results
def test_cuda_visible_devices_fractional(ray_2_node_2_gpu, worker_results):
    config = TestConfig()

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    num_workers, expected_results = worker_results

    os.environ[ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV] = "1"
    e = BackendExecutor(
        config, num_workers=num_workers, num_cpus_per_worker=0, num_gpus_per_worker=0.5
    )
    e.start()
    e.start_training(get_resources)
    results = e.finish_training()
    results.sort()
    assert results == expected_results
def test_tensorflow_start(ray_start_2_cpus):
    num_workers = 2
    tensorflow_config = TensorflowConfig()
    e = BackendExecutor(tensorflow_config, num_workers=num_workers)
    e.start()

    def get_tf_config():
        import json
        import os

        return json.loads(os.environ["TF_CONFIG"])

    e.start_training(get_tf_config)
    results = e.finish_training()
    assert len(results) == num_workers

    workers = [result["cluster"]["worker"] for result in results]
    assert all(worker == workers[0] for worker in workers)

    indexes = [result["task"]["index"] for result in results]
    assert len(set(indexes)) == num_workers
def training_loop(self) -> None:
    scaling_config_dataclass = ScalingConfigDataClass(**self.scaling_config)

    train_loop_per_worker = construct_train_func(
        self.train_loop_per_worker,
        self.train_loop_config,
        fn_arg_name="train_loop_per_worker",
    )

    additional_resources_per_worker = (
        scaling_config_dataclass.additional_resources_per_worker
    )

    backend_executor = BackendExecutor(
        backend_config=self.backend_config,
        num_workers=scaling_config_dataclass.num_workers,
        num_cpus_per_worker=scaling_config_dataclass.num_cpus_per_worker,
        num_gpus_per_worker=scaling_config_dataclass.num_gpus_per_worker,
        additional_resources_per_worker=additional_resources_per_worker,
        max_retries=0,
    )

    checkpoint_manager = _DataParallelCheckpointManager()
    checkpoint_manager.on_init(preprocessor=self.preprocessor)

    # Start the remote actors.
    backend_executor.start(initialization_hook=None)

    if self.resume_from_checkpoint:
        resume_checkpoint_dict = self.resume_from_checkpoint.to_dict()
    else:
        resume_checkpoint_dict = None

    # Tell Ray Train to only shard the train dataset and not the other datasets.
    # This is purely an implementation detail and users do not need to know about
    # this.
    # TODO(amog): Refactor this to remove hack and make this more modular.
    # TrainingIterator should accept a generic custom_ingest_func that contains
    # the logic for how to split the Datasets.
    updated_dataset_dict = {}
    for key, value in self.datasets.items():
        if key == TRAIN_DATASET_KEY:
            updated_dataset_dict[key] = value
        else:
            # Ray Train will strip out the added string before exposing to users.
            updated_dataset_dict[key + "_NO-SHARD"] = value

    # TODO(amog): Have TrainingIterator also accept a checkpoint ObjectRef instead
    # of just a Dict.
    training_iterator = TrainingIterator(
        backend_executor=backend_executor,
        backend_config=self.backend_config,
        train_func=train_loop_per_worker,
        dataset=updated_dataset_dict if len(updated_dataset_dict) > 0 else None,
        checkpoint_manager=checkpoint_manager,
        checkpoint=resume_checkpoint_dict,
        checkpoint_strategy=None,
    )

    for results in training_iterator:
        # TODO(ml-team): add ability to report results from multiple workers.
        first_worker_results = results[0]
        tune.report(**first_worker_results)

    # Shutdown workers.
    backend_executor.shutdown()
def test():
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()
    e.start_training(train_func)
    return e.finish_training()
def test():
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()
    e.start_training(train_func, dataset_spec=EMPTY_RAY_DATASET_SPEC)
    return e.finish_training()
def test_train_failure(ray_start_2_cpus):
    """Tests that executor methods raise TrainBackendError when called out of order."""
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    with pytest.raises(TrainBackendError):
        e.get_next_results()

    with pytest.raises(TrainBackendError):
        e.pause_reporting()

    with pytest.raises(TrainBackendError):
        e.finish_training()

    e.start_training(lambda: 1)

    with pytest.raises(TrainBackendError):
        e.start_training(lambda: 2)

    assert e.finish_training() == [1, 1]