def test_dataset_fault_tolerance(ray_start_4_cpus):
    dataset = ray.data.range(10)
    test_config = TestConfig()

    def train_func():
        return train.get_dataset_shard()

    def train_actor_failure():
        import sys
        sys.exit(0)

    new_backend_executor_cls = gen_new_backend_executor(train_actor_failure)

    class SingleGetDatasetShardsBackendExecutor(new_backend_executor_cls):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self._has_called_get_dataset_shards = False

        def _get_dataset_shards(self, dataset_or_dict):
            if self._has_called_get_dataset_shards:
                raise Exception
            self._has_called_get_dataset_shards = True
            return super()._get_dataset_shards(dataset_or_dict)

    with patch.object(ray.train.trainer, "BackendExecutor",
                      SingleGetDatasetShardsBackendExecutor):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        trainer.run(train_func, dataset=dataset)
def test_dataset_pipeline(ray_start_4_cpus):
    """Checks that Pipeline is correctly sharded even with multiple epochs."""
    num_epochs = 2
    num_data = 10

    dataset = ray.data.range(num_data).repeat()

    def get_dataset():
        pipeline_iterator = train.get_dataset_shard().iter_epochs()
        data_all_epochs = []
        for _ in range(num_epochs):
            dataset_this_epoch = next(pipeline_iterator)
            data_this_epoch = []
            for batch in dataset_this_epoch.iter_batches(
                    batch_format="native"):
                data_this_epoch.extend(batch)
            data_all_epochs.append(data_this_epoch)
        return data_all_epochs

    config = TestConfig()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(get_dataset, dataset=dataset)

    check_dataset_output(num_data, num_epochs, results)
def test_dataset_pipeline_shuffle(ray_start_4_cpus):
    num_epochs = 2
    num_data = 20

    dataset = ray.data.range(num_data).repeat().random_shuffle_each_window()

    def get_dataset():
        pipeline_iterator = train.get_dataset_shard().iter_epochs()
        data_all_epochs = []
        for _ in range(num_epochs):
            dataset_this_epoch = next(pipeline_iterator)
            data_this_epoch = []
            for batch in dataset_this_epoch.iter_batches(
                    batch_format="native"):
                data_this_epoch.extend(batch)
            if len(data_all_epochs) > 0:
                # Make sure data is shuffled per epoch.
                assert data_this_epoch != data_all_epochs[-1]
            data_all_epochs.append(data_this_epoch)
        return data_all_epochs

    config = TestConfig()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(get_dataset, dataset=dataset)

    check_dataset_output(num_data, num_epochs, results)
def test_persisted_checkpoint_strategy_failure(ray_start_2_cpus):
    logdir = "/tmp/test/trainer/test_persisted_checkpoint_strategy_failure"
    config = TestConfig()

    def train_func():
        train.save_checkpoint(epoch=0)

    trainer = Trainer(config, num_workers=2, logdir=logdir)
    trainer.start()

    with pytest.raises(ValueError):
        trainer.run(
            train_func, checkpoint_strategy=CheckpointConfig(num_to_keep=-1))

    with pytest.raises(ValueError):
        trainer.run(
            train_func,
            checkpoint_strategy=CheckpointConfig(
                checkpoint_score_order="invalid_order"),
        )

    with pytest.raises(ValueError):
        trainer.run(
            train_func,
            checkpoint_strategy=CheckpointConfig(
                checkpoint_score_attribute="missing_attribute"),
        )
def test_torch_auto_unwrap(ray_start_2_cpus):
    """Tests if underlying model from DDP is extracted when saving ckpt."""

    def train_fn():
        model = torch.nn.Linear(1, 1)

        # Wrap in DDP.
        model = train.torch.prepare_model(model)

        # Save DDP wrapped model.
        train.save_checkpoint(model=model)

        # Report DDP wrapped model.
        train.report(model=model)

    num_workers = 2
    trainer = Trainer("torch", num_workers)
    trainer.start()

    class ValidateEncodedCallback(TrainingCallback):
        def handle_result(self, results, **info):
            for result in results:
                model = result["model"]
                assert isinstance(model, torch.nn.Module) and not isinstance(
                    model, torch.nn.parallel.DistributedDataParallel)

    trainer.run(train_fn, callbacks=[ValidateEncodedCallback()])

    last_checkpoint = trainer.latest_checkpoint
    model = last_checkpoint["model"]
    assert isinstance(model, torch.nn.Module) and not isinstance(
        model, torch.nn.parallel.DistributedDataParallel)

    trainer.shutdown()
def test_retry(ray_start_2_cpus):
    def train_func():
        ckpt = train.load_checkpoint()
        restored = bool(ckpt)  # Does a previous checkpoint exist?
        itr = 0
        if ckpt:
            itr = ckpt["iter"] + 1

        for i in range(itr, 4):
            if i == 2 and not restored:
                raise Exception("try to fail me")
            train.save_checkpoint(iter=i)
            train.report(test=i, training_iteration=i)

    trainer = Trainer(TestConfig(), num_workers=1)
    TestTrainable = trainer.to_tune_trainable(train_func)

    analysis = tune.run(TestTrainable, max_failures=3)
    last_ckpt = analysis.trials[0].checkpoint.value
    checkpoint_file = os.path.join(last_ckpt, TUNE_CHECKPOINT_FILE_NAME)
    assert os.path.exists(checkpoint_file)
    with open(checkpoint_file, "rb") as f:
        checkpoint = cloudpickle.load(f)
        assert checkpoint["iter"] == 3

    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 4
def test_tf_non_distributed(ray_start_2_cpus):
    """Make sure Ray Train works without TF MultiWorkerMirroredStrategy."""
    # Uses the torch backend so that no TF_CONFIG is set up and the
    # TensorFlow training function runs non-distributed.
    trainer = Trainer(backend="torch", num_workers=1)
    trainer.start()
    trainer.run(tf_quick_start_train_func)
    trainer.shutdown()
def test_checkpoint(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        assert train.load_checkpoint() is None
        for i in range(3):
            train.save_checkpoint(epoch=i)
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    trainer.run(train_func)
    checkpoint = trainer.latest_checkpoint

    assert checkpoint is not None
    assert checkpoint["epoch"] == 2

    def train_func_checkpoint():
        checkpoint = train.load_checkpoint()
        assert checkpoint is not None
        assert checkpoint["epoch"] == 2

        for i in range(checkpoint["epoch"], 5):
            train.save_checkpoint(epoch=i)

        return 1

    trainer.run(train_func_checkpoint, checkpoint=checkpoint)
    checkpoint = trainer.latest_checkpoint

    assert checkpoint is not None
    assert checkpoint["epoch"] == 4
def test_reuse_checkpoint(ray_start_2_cpus):
    def train_func(config):
        itr = 0
        ckpt = train.load_checkpoint()
        if ckpt is not None:
            itr = ckpt["iter"] + 1

        for i in range(itr, config["max_iter"]):
            train.save_checkpoint(iter=i)
            train.report(test=i, training_iteration=i)

    trainer = Trainer(TestConfig(), num_workers=1)
    TestTrainable = trainer.to_tune_trainable(train_func)

    [trial] = tune.run(TestTrainable, config={"max_iter": 5}).trials
    last_ckpt = trial.checkpoint.value
    checkpoint_file = os.path.join(last_ckpt, TUNE_CHECKPOINT_FILE_NAME)
    assert os.path.exists(checkpoint_file)
    with open(checkpoint_file, "rb") as f:
        checkpoint = cloudpickle.load(f)
        assert checkpoint["iter"] == 4

    analysis = tune.run(
        TestTrainable, config={"max_iter": 10}, restore=last_ckpt)
    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 5
def test_mismatch_checkpoint_report(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        for i in range(2):
            train.save_checkpoint(epoch=i)
            train.report(index=i)

    def train_mismatch():
        train.save_checkpoint(epoch=0)
        train.report(index=0)
        # skip checkpoint
        train.report(index=1)

    new_backend_executor_cls = gen_new_backend_executor(train_mismatch)
    callback = TestCallback()

    with patch.object(ray.train.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        with pytest.raises(RuntimeError):
            trainer.run(train_func, callbacks=[callback])

    # validate checkpoint
    assert trainer.latest_checkpoint["epoch"] == 0

    # validate callback
    result_list = callback.result_list
    assert len(result_list) == 1  # 1 epoch succeeded
    intermediate_results = result_list[0]
    assert len(intermediate_results) == 2  # both workers reported
    for worker_result in intermediate_results:
        assert worker_result["index"] == 0
def test_persisted_checkpoint(ray_start_2_cpus, logdir):
    config = TestConfig()

    def train_func():
        for i in range(2):
            train.save_checkpoint(epoch=i)
            time.sleep(1)

    trainer = Trainer(config, num_workers=2, logdir=logdir)
    trainer.start()
    trainer.run(train_func)

    assert trainer.best_checkpoint_path is not None
    if logdir is not None:
        assert trainer.logdir == Path(logdir).expanduser().resolve()
    assert trainer.latest_checkpoint_dir.is_dir()
    assert trainer.best_checkpoint_path.is_file()
    assert trainer.best_checkpoint_path.name == f"checkpoint_{2:06d}"
    assert trainer.best_checkpoint_path.parent.name == "checkpoints"
    latest_checkpoint = trainer.latest_checkpoint

    def validate():
        checkpoint = train.load_checkpoint()
        assert checkpoint is not None
        assert checkpoint == latest_checkpoint

    trainer.run(validate, checkpoint=trainer.best_checkpoint_path)
def test_torch_non_distributed(ray_start_2_cpus):
    """Make sure Ray Train works without torch DDP."""
    trainer = Trainer(backend="torch", num_workers=1)
    trainer.start()
    trainer.run(torch_quick_start_train_func)
    trainer.shutdown()
def test_to_worker_group(ray_start_2_cpus):
    config = TestConfig()
    trainer = Trainer(config, num_workers=2)

    class Incrementer:
        def __init__(self, starting=0):
            self.count = starting

        def increment(self):
            self.count += 1

        def get_count(self):
            return self.count

    workers = trainer.to_worker_group(Incrementer, starting=2)
    assert ray.get([w.get_count.remote() for w in workers]) == [2, 2]

    ray.get([w.increment.remote() for w in workers])
    assert ray.get([w.get_count.remote() for w in workers]) == [3, 3]

    ray.get(workers[0].increment.remote())
    assert ray.get([w.get_count.remote() for w in workers]) == [4, 3]

    ray.get(workers[1].increment.remote())
    assert ray.get([w.get_count.remote() for w in workers]) == [4, 4]
def test_fast_slow(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        for i in range(2):
            train.save_checkpoint(epoch=i)
            train.report(index=i)

    def train_slow():
        for i in range(2):
            train.save_checkpoint(epoch=i)
            time.sleep(5)
            train.report(index=i)
            time.sleep(5)

    new_backend_executor_cls = gen_new_backend_executor(train_slow)
    callback = TestCallback()

    with patch.object(ray.train.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        trainer.run(train_func, callbacks=[callback])

    assert trainer.latest_checkpoint["epoch"] == 1

    result_list = callback.result_list
    assert len(result_list) == 2
    for index in range(len(result_list)):
        intermediate_results = result_list[index]
        assert len(intermediate_results) == 2
        for worker_result in intermediate_results:
            assert worker_result["index"] == index
def test_json(monkeypatch, ray_start_4_cpus, make_temp_dir, workers_to_log,
              detailed, filename):
    if detailed:
        monkeypatch.setenv(ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, "1")

    config = TestConfig()

    num_iters = 5
    num_workers = 4

    if workers_to_log is None:
        num_workers_to_log = num_workers
    elif isinstance(workers_to_log, int):
        num_workers_to_log = 1
    else:
        num_workers_to_log = len(workers_to_log)

    def train_func():
        for i in range(num_iters):
            train.report(index=i)
        return 1

    if filename is None:
        # if None, use default value
        callback = JsonLoggerCallback(workers_to_log=workers_to_log)
    else:
        callback = JsonLoggerCallback(
            filename=filename, workers_to_log=workers_to_log)
    trainer = Trainer(config, num_workers=num_workers, logdir=make_temp_dir)
    trainer.start()
    trainer.run(train_func, callbacks=[callback])

    if filename is None:
        assert str(
            callback.log_path.name) == JsonLoggerCallback._default_filename
    else:
        assert str(callback.log_path.name) == filename

    with open(callback.log_path, "r") as f:
        log = json.load(f)
    print(log)
    assert len(log) == num_iters
    assert len(log[0]) == num_workers_to_log
    assert all(len(element) == len(log[0]) for element in log)
    assert all(
        all(worker["index"] == worker[TRAINING_ITERATION] - 1
            for worker in element) for element in log)
    assert all(
        all(all(key in worker for key in BASIC_AUTOFILLED_KEYS)
            for worker in element) for element in log)
    if detailed:
        assert all(
            all(all(key in worker for key in DETAILED_AUTOFILLED_KEYS)
                for worker in element) for element in log)
    else:
        assert all(
            all(not any(key in worker for key in DETAILED_AUTOFILLED_KEYS)
                for worker in element) for element in log)
def latency(amp: bool) -> float:
    trainer = Trainer("torch", num_workers=2, use_gpu=True)
    trainer.start()
    start_time = timer()
    trainer.run(train_func, {"amp": amp})
    end_time = timer()
    trainer.shutdown()
    return end_time - start_time
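# Hypothetical usage sketch (not part of the original benchmark): compare
# end-to-end run time with and without automatic mixed precision. Assumes a
# GPU cluster is available, since `latency` requests two GPU workers.
def compare_amp_latency():
    amp_time = latency(amp=True)
    baseline_time = latency(amp=False)
    print(f"AMP: {amp_time:.2f}s, no AMP: {baseline_time:.2f}s")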
def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4):
    trainer = Trainer(
        backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    results = trainer.run(
        train_func=train_func,
        config={"lr": 1e-3, "batch_size": 64, "epochs": epochs})
    trainer.shutdown()
    print(f"Results: {results[0]}")
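# Hypothetical usage sketch (not in the original example): a quick smoke run
# on two CPU workers for a single epoch.
def run_tensorflow_mnist_smoke_test():
    ray.init(num_cpus=4)
    train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=1)
    ray.shutdown()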
def test_tune_error(ray_start_2_cpus):
    def train_func(config):
        raise RuntimeError("Error in training function!")

    trainer = Trainer(TestConfig(), num_workers=1)
    TestTrainable = trainer.to_tune_trainable(train_func)

    with pytest.raises(TuneError):
        tune.run(TestTrainable)
def test_start_max_failures(ray_start_2_cpus):
    test_config = TestConfig()
    trainer = Trainer(test_config, num_workers=2)

    def init_hook_fail():
        import sys
        sys.exit(0)

    with pytest.raises(RuntimeError):
        trainer.start(initialization_hook=init_hook_fail)
def test_world_rank(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        return train.world_rank()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func)

    assert set(results) == {0, 1}
def train_linear(num_workers=2, use_gpu=False, epochs=3):
    trainer = Trainer(
        backend="torch", num_workers=num_workers, use_gpu=use_gpu)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer.start()
    results = trainer.run(
        train_func,
        config,
        callbacks=[JsonLoggerCallback(), TBXLoggerCallback()])
    trainer.shutdown()

    print(results)
    return results
def test_horovod_simple(ray_start_2_cpus):
    def simple_fn():
        hvd_torch.init()
        return hvd_torch.rank()

    num_workers = 2
    trainer = Trainer("horovod", num_workers)
    trainer.start()
    result = trainer.run(simple_fn)
    trainer.shutdown()

    assert result == list(range(num_workers))
def test_worker_kill_checkpoint(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        checkpoint = train.load_checkpoint()
        if checkpoint:
            epoch = checkpoint["epoch"]
        else:
            epoch = 0
        print("Epoch: ", epoch)
        for i in range(epoch, 2):
            train.report(loss=1, iter=i)
            train.save_checkpoint(epoch=i + 1)

    trainer = Trainer(test_config, num_workers=2)
    trainer.start()
    kill_callback = KillCallback(
        fail_on=0, worker_group=trainer._executor.worker_group)

    trainer.run(train_func, callbacks=[kill_callback])

    # Run 1: epoch=0, counter=1, Successful
    # *Checkpoint is saved.*
    # *Worker is killed*
    # *Getting checkpoint fails. Workers are restarted from beginning*
    # Run 2: epoch=0, counter=2, Successful
    # Run 3: epoch=1, counter=3, Successful
    assert kill_callback.counter == 3
    assert trainer.latest_checkpoint["epoch"] == 2

    trainer.shutdown()
    trainer.start()

    kill_callback = KillCallback(
        fail_on=1, worker_group=trainer._executor.worker_group)
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: epoch=0, counter=1, Successful
    # *Checkpoint saved*
    # *Latest checkpoint updated, epoch=1*
    # Run 2: epoch=1, counter=2, Successful
    # *Checkpoint saved*
    # *Worker is killed*
    # *Getting checkpoint fails. Workers are restarted from last checkpoint.*
    # Run 3: epoch=1, counter=3, Successful.
    assert kill_callback.counter == 3
    assert trainer.latest_checkpoint["epoch"] == 2

    def train_func():
        return 1

    # Make sure Trainer is usable even after failure handling.
    trainer.run(train_func)
def test_run(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func)
    trainer.shutdown()

    assert len(results) == 2
    assert all(result == 1 for result in results)
def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):
    class CudaTestBackend(TestBackend):
        share_cuda_visible_devices = True

    class CudaTestConfig(TestConfig):
        @property
        def backend_cls(self):
            return CudaTestBackend

    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(
            CudaTestConfig(),
            num_workers=2,
            use_gpu=False,
            resources_per_worker={"GPU": 1})

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(
            CudaTestConfig(),
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": 0})

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(CudaTestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(CudaTestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(
        CudaTestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 0.1})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(
        CudaTestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 2})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()
def tune_linear(num_workers, num_samples):
    trainer = Trainer("torch", num_workers=num_workers)
    Trainable = trainer.to_tune_trainable(train_func)
    analysis = tune.run(
        Trainable,
        num_samples=num_samples,
        config={
            "lr": tune.loguniform(1e-4, 1e-1),
            "batch_size": tune.choice([4, 16, 32]),
            "epochs": 3
        })
    results = analysis.get_best_config(metric="loss", mode="min")
    print(results)
    return results
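# Hypothetical usage sketch (not in the original script): a small sweep with
# two training workers per trial and four sampled configurations.
def run_tune_linear_demo():
    ray.init(num_cpus=8)
    best_config = tune_linear(num_workers=2, num_samples=4)
    ray.shutdown()
    return best_config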
def train_linear(num_workers=2):
    trainer = Trainer(TorchConfig(backend="gloo"), num_workers=num_workers)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer.start()
    results = trainer.run(
        train_func,
        config,
        callbacks=[JsonLoggerCallback(), TBXLoggerCallback()])
    trainer.shutdown()

    print(results)
    return results
def test_max_failures(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        import sys
        sys.exit(0)

    trainer = Trainer(test_config, num_workers=2)
    trainer.start()
    iterator = trainer.run_iterator(train_func)
    with pytest.raises(RuntimeError):
        iterator.get_final_results(force=True)
    assert iterator._executor._num_failures == 3
def test_checkpoint_torch_model_with_amp(ray_start_4_cpus_2_gpus):
    """Test that model with AMP is serializable."""

    def train_func():
        train.torch.accelerate(amp=True)

        model = torchvision.models.resnet101()
        model = train.torch.prepare_model(model)

        train.save_checkpoint(model=model)

    trainer = Trainer("torch", num_workers=1, use_gpu=True)
    trainer.start()
    trainer.run(train_func)
    trainer.shutdown()
def test_horovod_torch_mnist_gpu(ray_start_4_cpus_2_gpus):
    num_workers = 2
    num_epochs = 2
    trainer = Trainer("horovod", num_workers, use_gpu=True)
    trainer.start()
    results = trainer.run(
        horovod_torch_train_func,
        config={"num_epochs": num_epochs, "lr": 1e-3})
    trainer.shutdown()

    assert len(results) == num_workers
    for worker_result in results:
        assert len(worker_result) == num_epochs
        assert worker_result[num_epochs - 1] < worker_result[0]