def test_persisted_checkpoint(ray_start_2_cpus, tmp_path):
    def train_func():
        for i in range(2):
            train.save_checkpoint(epoch=i)
            time.sleep(1)

    config = TestConfig()
    e = BackendExecutor(config)
    e.start()
    e.start_training(train_func, run_dir=tmp_path)
    e.finish_training()

    assert e.latest_checkpoint_id == 2
    assert e.latest_checkpoint is not None
    assert e.latest_checkpoint["epoch"] == 1
    assert e.best_checkpoint_path is not None
    assert os.path.exists(e.best_checkpoint_path)

    def validate():
        checkpoint = train.load_checkpoint()
        assert checkpoint is not None
        assert checkpoint["epoch"] == 1

    e2 = BackendExecutor(config)
    e2.start()
    e2.start_training(
        validate, checkpoint=e.best_checkpoint_path, run_dir=tmp_path)
    e2.finish_training()

def test_mismatch_checkpoint_report(ray_start_2_cpus, tmp_path):
    def train_func():
        if train.world_rank() == 0:
            train.save_checkpoint(epoch=0)
        else:
            train.report(iter=0)

    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()
    e.start_training(train_func, run_dir=tmp_path)
    with pytest.raises(RuntimeError):
        e.finish_training()

def test_worker_failure(ray_start_2_cpus, tmp_path):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    def train_fail():
        ray.actor.exit_actor()

    new_execute_func = gen_execute_special(train_fail)
    with patch.object(WorkerGroup, "execute_async", new_execute_func):
        with pytest.raises(TrainingWorkerError):
            e.start_training(lambda: 1, run_dir=tmp_path)
            e.finish_training()

def test_checkpoint(ray_start_2_cpus, tmp_path):
    def train_func():
        for i in range(2):
            train.save_checkpoint(epoch=i)

    config = TestConfig()
    e = BackendExecutor(config, num_workers=1)
    e.start()
    e.start_training(train_func, run_dir=tmp_path)
    e.finish_training()

    assert e.latest_checkpoint is not None
    assert e.latest_checkpoint["epoch"] == 1

def test_train(ray_start_2_cpus, tmp_path):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    e.start_training(lambda: 1, run_dir=tmp_path)
    assert e.finish_training() == [1, 1]

def test_torch_start_shutdown(ray_start_2_cpus, init_method, tmp_path):
    torch_config = TorchConfig(backend="gloo", init_method=init_method)
    e = BackendExecutor(torch_config, num_workers=2)
    e.start()

    def check_process_group():
        import torch
        return (torch.distributed.is_initialized()
                and torch.distributed.get_world_size() == 2)

    e.start_training(check_process_group, run_dir=tmp_path)
    assert all(e.finish_training())

    # After the backend tears down the process group, the workers should no
    # longer report an initialized process group.
    e._backend.on_shutdown(e.worker_group, e._backend_config)

    e.start_training(check_process_group, run_dir=tmp_path)
    assert not any(e.finish_training())

def test_train_failure(ray_start_2_cpus, tmp_path):
    """Tests that results cannot be fetched before training starts and that
    training cannot be started twice without finishing."""
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    with pytest.raises(TrainBackendError):
        e.fetch_next_result()

    with pytest.raises(TrainBackendError):
        e.finish_training()

    e.start_training(lambda: 1, run_dir=tmp_path)

    with pytest.raises(TrainBackendError):
        e.start_training(lambda: 2, run_dir=tmp_path)

    assert e.finish_training() == [1, 1]

def test_persisted_checkpoint_id(ray_start_2_cpus, tmp_path):
    def train_func():
        for i in range(2):
            train.save_checkpoint(epoch=i)

    config = TestConfig()
    e = BackendExecutor(config)
    e.start()
    # Checkpoint ids should continue from the provided latest_checkpoint_id.
    e.start_training(train_func, run_dir=tmp_path, latest_checkpoint_id=100)
    e.finish_training()

    assert e.latest_checkpoint_id == 102
    assert e.latest_checkpoint is not None
    assert e.latest_checkpoint["epoch"] == 1
    assert e.latest_checkpoint_path is not None
    assert os.path.exists(e.latest_checkpoint_path)

def test_local_ranks(ray_start_2_cpus, tmp_path):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    def train_func():
        return train.local_rank()

    e.start_training(train_func, run_dir=tmp_path)
    assert set(e.finish_training()) == {0, 1}

def test_no_exhaust(ray_start_2_cpus, tmp_path):
    """Tests if training can finish even if queue is not exhausted."""

    def train_func():
        for _ in range(2):
            train.report(loss=1)
        return 2

    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    e.start_training(train_func, run_dir=tmp_path)
    output = e.finish_training()

    assert output == [2, 2]

def test_initialization_hook(ray_start_2_cpus, tmp_path):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)

    def init_hook():
        import os
        os.environ["TEST"] = "1"

    e.start(initialization_hook=init_hook)

    def check():
        import os
        return os.getenv("TEST", "0")

    e.start_training(check, run_dir=tmp_path)
    assert e.finish_training() == ["1", "1"]

def test_cuda_visible_devices(ray_2_node_2_gpu, worker_results, tmp_path):
    config = TestConfig()

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    num_workers, expected_results = worker_results

    os.environ[ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV] = "1"
    e = BackendExecutor(
        config,
        num_workers=num_workers,
        num_cpus_per_worker=0,
        num_gpus_per_worker=1)
    e.start()
    e.start_training(get_resources, tmp_path)
    results = e.finish_training()
    results.sort()
    assert results == expected_results

def test_tensorflow_start(ray_start_2_cpus, tmp_path):
    num_workers = 2
    tensorflow_config = TensorflowConfig()
    e = BackendExecutor(tensorflow_config, num_workers=num_workers)
    e.start()

    def get_tf_config():
        import json
        import os
        return json.loads(os.environ["TF_CONFIG"])

    e.start_training(get_tf_config, run_dir=tmp_path)
    results = e.finish_training()
    assert len(results) == num_workers

    workers = [result["cluster"]["worker"] for result in results]
    assert all(worker == workers[0] for worker in workers)

    indexes = [result["task"]["index"] for result in results]
    assert len(set(indexes)) == num_workers