# Shared imports assumed by the snippets below (legacy Ray Train API, Ray 1.x;
# exact module paths may vary across Ray versions). Test-local helpers such as
# TestConfig, TestBackend, KillCallback, HorovodTrainClass, wait_for_condition,
# the various *_train_func functions, and the pytest fixtures are defined
# elsewhere in the test suite and example modules.
import os
import time
from collections import Counter
from timeit import default_timer as timer
from unittest.mock import patch

import pytest
import torch
import torchvision
import horovod.torch as hvd_torch
from torch.nn.parallel import DistributedDataParallel

import ray
from ray import train
from ray.train import Trainer
from ray.train.torch import TorchConfig
from ray.train.tensorflow import TensorflowConfig
from ray.train.horovod import HorovodConfig
from ray.train.callbacks import JsonLoggerCallback, TBXLoggerCallback, TrainingCallback
from ray.train.constants import ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV


def test_tf_non_distributed(ray_start_2_cpus):
    """Make sure Ray Train works without TF MultiWorkerMirroredStrategy."""
    # Note: a torch backend is used here so that no TF-specific distributed
    # setup (e.g., TF_CONFIG) is performed; the TF training function then runs
    # without MultiWorkerMirroredStrategy.
    trainer = Trainer(backend="torch", num_workers=1)
    trainer.start()
    trainer.run(tf_quick_start_train_func)
    trainer.shutdown()
def test_torch_get_device_dist(ray_2_node_4_gpu, num_gpus_per_worker):
    @patch("torch.cuda.is_available", lambda: True)
    def train_fn():
        return train.torch.get_device().index

    trainer = Trainer(
        TorchConfig(backend="gloo"),
        num_workers=int(8 / num_gpus_per_worker),
        use_gpu=True,
        resources_per_worker={"GPU": num_gpus_per_worker},
    )
    trainer.start()
    devices = trainer.run(train_fn)
    trainer.shutdown()

    count = Counter(devices)
    if num_gpus_per_worker == 0.5:
        # 16 workers across 2 nodes x 4 GPUs: each device index 0-3 is shared
        # by 2 workers per node, so it appears 4 times in total.
        for i in range(4):
            assert count[i] == 4
    elif num_gpus_per_worker == 1:
        # 8 workers with one GPU each: each device index appears once per node.
        for i in range(4):
            assert count[i] == 2
    elif num_gpus_per_worker == 2:
        # 4 workers with two GPUs each: each worker's device resolves to the
        # smaller of its two GPU ids (0 or 2), once per node.
        for i in range(2):
            assert count[2 * i] == 2
    else:
        raise RuntimeError(
            "New parameter for this test has been added without checking "
            "that the correct devices have been returned."
        )
def test_torch_auto_unwrap(ray_start_2_cpus):
    """Tests if underlying model from DDP is extracted when saving ckpt."""

    def train_fn():
        model = torch.nn.Linear(1, 1)

        # Wrap in DDP.
        model = train.torch.prepare_model(model)

        # Save DDP wrapped model.
        train.save_checkpoint(model=model)

        # Report DDP wrapped model.
        train.report(model=model)

    num_workers = 2
    trainer = Trainer("torch", num_workers)
    trainer.start()

    class ValidateEncodedCallback(TrainingCallback):
        def handle_result(self, results, **info):
            for result in results:
                model = result["model"]
                assert isinstance(model, torch.nn.Module) and not isinstance(
                    model, torch.nn.parallel.DistributedDataParallel
                )

    trainer.run(train_fn, callbacks=[ValidateEncodedCallback()])

    last_checkpoint = trainer.latest_checkpoint
    model = last_checkpoint["model"]
    assert isinstance(model, torch.nn.Module) and not isinstance(
        model, torch.nn.parallel.DistributedDataParallel
    )

    trainer.shutdown()
def test_torch_non_distributed(ray_start_2_cpus):
    """Make sure Ray Train works without torch DDP."""
    trainer = Trainer(backend="torch", num_workers=1)
    trainer.start()
    trainer.run(torch_quick_start_train_func)
    trainer.shutdown()
def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4):
    trainer = Trainer(backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    results = trainer.run(
        train_func=train_func,
        config={"lr": 1e-3, "batch_size": 64, "epochs": epochs},
    )
    trainer.shutdown()
    print(f"Results: {results[0]}")
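# Usage sketch (not part of the original example; the local-cluster setup here
# is an assumption): drive the MNIST example on a local Ray instance.
if __name__ == "__main__":
    ray.init(num_cpus=4)
    train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4)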
def latency(amp: bool) -> float:
    trainer = Trainer("torch", num_workers=2, use_gpu=True)
    trainer.start()
    start_time = timer()
    trainer.run(train_func, {"amp": amp})
    end_time = timer()
    trainer.shutdown()
    return end_time - start_time
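# Illustrative driver (a sketch, not from the original benchmark): time the
# same training function with and without automatic mixed precision.
def compare_amp_latency():
    latency_without_amp = latency(amp=False)
    latency_with_amp = latency(amp=True)
    print(f"Latency without AMP: {latency_without_amp:.2f}s")
    print(f"Latency with AMP: {latency_with_amp:.2f}s")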
def train_linear(num_workers=2, use_gpu=False, epochs=3):
    trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=use_gpu)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer.start()
    results = trainer.run(
        train_func, config, callbacks=[JsonLoggerCallback(), TBXLoggerCallback()]
    )
    trainer.shutdown()
    print(results)
    return results
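# A small helper (an illustration, not part of the original file) showing how
# the per-worker results returned above can be consumed. It assumes each
# worker reported a dict with a "loss" key per epoch, as the linear tests in
# this suite assert.
def final_losses(results):
    return [worker_result[-1]["loss"] for worker_result in results]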
def test_horovod_simple(ray_start_2_cpus):
    def simple_fn():
        hvd_torch.init()
        return hvd_torch.rank()

    num_workers = 2
    trainer = Trainer("horovod", num_workers)
    trainer.start()
    result = trainer.run(simple_fn)
    trainer.shutdown()

    assert result == list(range(num_workers))
def test_worker_kill_checkpoint(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        checkpoint = train.load_checkpoint()
        if checkpoint:
            epoch = checkpoint["epoch"]
        else:
            epoch = 0
        print("Epoch: ", epoch)
        for i in range(epoch, 2):
            train.report(loss=1, iter=i)
            train.save_checkpoint(epoch=i + 1)

    trainer = Trainer(test_config, num_workers=2)
    trainer.start()
    kill_callback = KillCallback(
        fail_on=0, worker_group=trainer._executor.worker_group
    )
    trainer.run(train_func, callbacks=[kill_callback])

    # Run 1: epoch=0, counter=1, Successful
    # *Checkpoint is saved.*
    # *Worker is killed*
    # *Getting checkpoint fails. Workers are restarted from beginning*
    # Run 2: epoch=0, counter=2, Successful
    # Run 3: epoch=1, counter=3, Successful
    assert kill_callback.counter == 3
    assert trainer.latest_checkpoint["epoch"] == 2

    trainer.shutdown()
    trainer.start()

    kill_callback = KillCallback(
        fail_on=1, worker_group=trainer._executor.worker_group
    )
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: epoch=0, counter=1, Successful
    # *Checkpoint saved*
    # *Latest checkpoint updated, epoch=1*
    # Run 2: epoch=1, counter=2, Successful
    # *Checkpoint saved*
    # *Worker is killed*
    # *Getting checkpoint fails. Workers are restarted from last checkpoint.*
    # Run 3: epoch=1, counter=3, Successful.
    assert kill_callback.counter == 3
    assert trainer.latest_checkpoint["epoch"] == 2

    def train_func():
        return 1

    # Make sure Trainer is usable even after failure handling.
    trainer.run(train_func)
def train_linear(num_workers=2):
    trainer = Trainer(TorchConfig(backend="gloo"), num_workers=num_workers)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer.start()
    results = trainer.run(
        train_func, config, callbacks=[JsonLoggerCallback(), TBXLoggerCallback()]
    )
    trainer.shutdown()
    print(results)
    return results
def test_run(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func)
    trainer.shutdown()

    assert len(results) == 2
    assert all(result == 1 for result in results)
def test_checkpoint_torch_model_with_amp(ray_start_4_cpus_2_gpus):
    """Test that model with AMP is serializable."""

    def train_func():
        train.torch.accelerate(amp=True)

        model = torchvision.models.resnet101()
        model = train.torch.prepare_model(model)

        train.save_checkpoint(model=model)

    trainer = Trainer("torch", num_workers=1, use_gpu=True)
    trainer.start()
    trainer.run(train_func)
    trainer.shutdown()
def test_horovod_torch_mnist_gpu(ray_start_4_cpus_2_gpus):
    num_workers = 2
    num_epochs = 2
    trainer = Trainer("horovod", num_workers, use_gpu=True)
    trainer.start()
    results = trainer.run(
        horovod_torch_train_func, config={"num_epochs": num_epochs, "lr": 1e-3}
    )
    trainer.shutdown()

    assert len(results) == num_workers
    for worker_result in results:
        assert len(worker_result) == num_epochs
        assert worker_result[num_epochs - 1] < worker_result[0]
def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):
    class CudaTestBackend(TestBackend):
        share_cuda_visible_devices = True

    class CudaTestConfig(TestConfig):
        @property
        def backend_cls(self):
            return CudaTestBackend

    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(
            CudaTestConfig(),
            num_workers=2,
            use_gpu=False,
            resources_per_worker={"GPU": 1},
        )

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(
            CudaTestConfig(),
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": 0},
        )

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(CudaTestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(CudaTestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(
        CudaTestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 0.1},
    )
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(
        CudaTestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 2},
    )
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()
def test_torch_fashion_mnist(ray_start_2_cpus):
    num_workers = 2
    epochs = 3
    trainer = Trainer("torch", num_workers=num_workers)
    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer.start()
    results = trainer.run(fashion_mnist_train_func, config)
    trainer.shutdown()

    assert len(results) == num_workers
    for result in results:
        assert len(result) == epochs
        assert result[-1] < result[0]
def test_torch_linear(ray_start_2_cpus, num_workers):
    epochs = 3
    trainer = Trainer("torch", num_workers=num_workers)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer.start()
    results = trainer.run(linear_train_func, config)
    trainer.shutdown()

    assert len(results) == num_workers
    for result in results:
        assert len(result) == epochs
        assert result[-1]["loss"] < result[0]["loss"]
def test_torch_amp(ray_start_2_cpus):
    def train_fn():
        train.torch.accelerate(amp=True)
        model = torch.nn.Linear(1, 1)
        model = train.torch.prepare_model(model)

        # Make sure model is serializable even with amp enabled.
        return model.module

    num_workers = 2
    trainer = Trainer("torch", num_workers)
    trainer.start()
    trainer.run(train_fn)
    trainer.shutdown()
def test_run_config(ray_start_2_cpus):
    backend_config = TestConfig()

    def train_func(config):
        return config["fruit"]

    config = {"fruit": "banana"}

    trainer = Trainer(backend_config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func, config)
    trainer.shutdown()

    assert len(results) == 2
    assert all(result == "banana" for result in results)
def train_linear(num_workers=2, use_gpu=False):
    datasets = get_datasets()

    trainer = Trainer("torch", num_workers=num_workers, use_gpu=use_gpu)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer.start()
    results = trainer.run(
        train_func,
        config,
        dataset=datasets,
        callbacks=[JsonLoggerCallback(), TBXLoggerCallback()],
    )
    trainer.shutdown()
    print(results)
    return results
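# For reference, a minimal sketch of how a training function might consume
# the dataset passed above under the legacy Ray Train API (the real
# train_func lives elsewhere in this example; the function name and body here
# are illustrative):
def sketch_train_func(config):
    # Each worker fetches its own shard of the dataset.
    dataset_shard = train.get_dataset_shard()
    for batch in dataset_shard.iter_batches(batch_size=config["batch_size"]):
        pass  # Forward/backward pass over the batch would go here.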
def test_torch_linear_failure(ray_start_4_cpus):
    num_workers = 2
    epochs = 3
    trainer = Trainer("torch", num_workers=num_workers)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer.start()
    kill_callback = KillCallback(fail_on=1, trainer=trainer)
    results = trainer.run(linear_train_func, config, callbacks=[kill_callback])
    trainer.shutdown()

    assert len(results) == num_workers
    for result in results:
        assert len(result) == epochs
        assert result[-1]["loss"] < result[0]["loss"]
def test_start_shutdown(ray_start_2_cpus, num_workers):
    config = TestConfig()
    assert ray.available_resources()["CPU"] == 2
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    time.sleep(1)

    remaining = 2 - num_workers
    if remaining == 0:
        assert "CPU" not in ray.available_resources()
    else:
        assert ray.available_resources()["CPU"] == remaining

    trainer.shutdown()
    time.sleep(1)
    assert ray.available_resources()["CPU"] == 2
def train_tensorflow_linear(num_workers=2, use_gpu=False):
    dataset_pipeline = get_dataset_pipeline()
    trainer = Trainer(backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    results = trainer.run(
        train_func=train_func,
        dataset=dataset_pipeline,
        config={"lr": 1e-3, "batch_size": 32, "epochs": 4},
    )
    trainer.shutdown()
    print(f"Results: {results[0]}")
    return results
def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):
    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=False,
            resources_per_worker={"GPU": 1},
        )

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": 0},
        )

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    # Unlike the backend-attribute variant above, this test opts into sharing
    # CUDA_VISIBLE_DEVICES across workers via the environment variable.
    os.environ[ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV] = "1"

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(), num_workers=2, use_gpu=True, resources_per_worker={"GPU": 0.1}
    )
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(), num_workers=2, use_gpu=True, resources_per_worker={"GPU": 2}
    )
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()
def test_horovod_torch_mnist_stateful(ray_start_4_cpus):
    num_workers = 2
    num_epochs = 2
    trainer = Trainer("horovod", num_workers)
    workers = trainer.to_worker_group(
        HorovodTrainClass, config={"num_epochs": num_epochs, "lr": 1e-3}
    )
    results = []
    for epoch in range(num_epochs):
        results.append(ray.get([w.train.remote(epoch=epoch) for w in workers]))
    trainer.shutdown()

    assert len(results) == num_epochs
    for i in range(num_workers):
        assert results[num_epochs - 1][i] < results[0][i]
def start_ray_train(config, num_workers=4, use_gpu=False):
    """Train the model using Ray Train.

    num_workers determines the number of processes.
    Uses the same config as local training.
    """
    trainer = Trainer(backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    start_time = time.time()
    results = trainer.run(train_epochs_remote, config=config)
    duration = time.time() - start_time
    trainer.shutdown()
    return None, results, duration
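# Usage sketch (not from the original example): compare the wall-clock
# duration returned above across worker counts. `config` is assumed to be the
# same dict used for local training.
def scaling_run(config):
    for n in (1, 2, 4):
        _, _, duration = start_ray_train(config, num_workers=n)
        print(f"num_workers={n}: {duration:.1f}s")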
def test_run_failure(ray_start_2_cpus):
    test_config = TestConfig()

    def train_invalid_signature(a, b):
        pass

    trainer = Trainer(test_config, num_workers=2)

    # Raises RuntimeError because the trainer has not been started yet.
    with pytest.raises(RuntimeError):
        trainer.run(lambda: 1)

    trainer.start()

    with pytest.raises(ValueError):
        trainer.run(train_invalid_signature)

    trainer.shutdown()
def test_torch_prepare_model(ray_start_4_cpus_2_gpus):
    """Tests if ``prepare_model`` correctly wraps in DDP."""

    def train_fn():
        model = torch.nn.Linear(1, 1)

        # Wrap in DDP.
        model = train.torch.prepare_model(model)

        # Make sure model is wrapped in DDP.
        assert isinstance(model, DistributedDataParallel)

        # Make sure model is on cuda.
        assert next(model.parameters()).is_cuda

    trainer = Trainer("torch", num_workers=2, use_gpu=True)
    trainer.start()
    trainer.run(train_fn)
    trainer.shutdown()
def test_worker_kill(ray_start_2_cpus, backend):
    if backend == "test":
        test_config = TestConfig()
    elif backend == "torch":
        test_config = TorchConfig()
    elif backend == "tf":
        test_config = TensorflowConfig()
    elif backend == "horovod":
        test_config = HorovodConfig()

    trainer = Trainer(test_config, num_workers=2)

    def train_func():
        for i in range(2):
            train.report(loss=1, iter=i)

    trainer.start()
    kill_callback = KillCallback(
        fail_on=0, worker_group=trainer._executor.worker_group
    )
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: iter=0, counter=1, Successful
    # Run 2: iter=1, counter=1, Unsuccessful, starts training from beginning
    # Run 3: iter=0, counter=2, Successful
    # Run 4: iter=1, counter=3, Successful
    assert kill_callback.counter == 3

    trainer.shutdown()
    trainer.start()

    kill_callback = KillCallback(
        fail_on=1, worker_group=trainer._executor.worker_group
    )
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: iter=0, counter=1, Successful
    # Run 2: iter=1, counter=2, Successful
    # Run 3: None, counter=2, Unsuccessful, starts training from beginning.
    # Run 4: iter=0, counter=3, Successful
    # Run 5: iter=1, counter=4, Successful
    assert kill_callback.counter == 4

    def train_func():
        return 1

    # Make sure Trainer is usable even after failure handling.
    trainer.run(train_func)
def test_enable_reproducibility(ray_start_4_cpus_2_gpus, use_gpu):
    # NOTE: Reproducible results aren't guaranteed between seeded executions,
    # even with identical hardware and software dependencies. This test should
    # be okay given that it only runs for two epochs on a small dataset.
    # NOTE: I've chosen a ResNet model over a simpler one, because
    # `enable_reproducibility` disables CUDA convolution benchmarking, and a
    # simpler model (e.g., linear) might not exercise this feature.
    def train_func():
        train.torch.enable_reproducibility()

        model = torchvision.models.resnet18()
        model = train.torch.prepare_model(model)

        dataset_length = 128
        dataset = torch.utils.data.TensorDataset(
            torch.randn(dataset_length, 3, 32, 32),
            torch.randint(low=0, high=1000, size=(dataset_length,)),
        )
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=64)
        dataloader = train.torch.prepare_data_loader(dataloader)

        optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

        model.train()
        for epoch in range(2):
            for images, targets in dataloader:
                optimizer.zero_grad()

                outputs = model(images)
                loss = torch.nn.functional.cross_entropy(outputs, targets)
                loss.backward()

                optimizer.step()

        return loss.item()

    trainer = Trainer("torch", num_workers=2, use_gpu=use_gpu)
    trainer.start()
    result1 = trainer.run(train_func)
    result2 = trainer.run(train_func)
    trainer.shutdown()

    assert result1 == result2
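# For context, a hedged sketch of the kind of settings a reproducibility
# helper like `enable_reproducibility` typically applies. This is an
# illustration of standard PyTorch determinism knobs, not Ray's actual
# implementation.
def sketch_enable_reproducibility(seed=0):
    import random

    import numpy as np

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True  # Use deterministic conv kernels.
    torch.backends.cudnn.benchmark = False  # Disable conv benchmarking.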
def test_resources(ray_start_4_cpus_4_gpus_4_extra, resource, num_requested):
    num_workers = 2
    config = TestConfig()
    original = ray.available_resources().get(resource)
    resources_per_worker = {resource: num_requested}
    use_gpu = resource == "GPU"
    trainer = Trainer(
        config,
        num_workers=num_workers,
        use_gpu=use_gpu,
        resources_per_worker=resources_per_worker,
    )

    trainer.start()
    expected = original - num_workers * num_requested
    wait_for_condition(
        lambda: ray.available_resources().get(resource, 0) == expected
    )

    trainer.shutdown()
    wait_for_condition(
        lambda: ray.available_resources().get(resource, 0) == original
    )