def test_distributed_sampler(tmpdir, ray_start_2_cpus):
    """Tests if distributed sampler is properly set."""
    model = BoringModel()
    loader = model.train_dataloader()
    # Before training starts, the model's own sampler must not be distributed.
    assert not isinstance(loader.sampler, DistributedSampler)

    class SamplerCheckCallback(Callback):
        """Asserts each stage's dataloader carries a correctly set up sampler."""

        @staticmethod
        def _verify(sampler, trainer, shuffle):
            # Checks shared by the train / validation / test hooks.
            assert isinstance(sampler, DistributedSampler)
            if shuffle:
                assert sampler.shuffle
            else:
                assert not sampler.shuffle
            assert sampler.num_replicas == 2
            assert sampler.rank == trainer.global_rank

        def on_train_start(self, trainer, pl_module):
            self._verify(
                trainer.train_dataloader.sampler, trainer, shuffle=True)

        def on_validation_start(self, trainer, pl_module):
            self._verify(
                trainer.val_dataloaders[0].sampler, trainer, shuffle=False)

        def on_test_start(self, trainer, pl_module):
            self._verify(
                trainer.test_dataloaders[0].sampler, trainer, shuffle=False)

    plugin = RayPlugin(num_workers=2)
    trainer = get_trainer(
        tmpdir, plugins=[plugin], callbacks=[SamplerCheckCallback()])
    trainer.fit(model)
def test_actor_creation(tmpdir, ray_start_2_cpus, num_workers):
    """Tests whether the appropriate number of training actors are created."""
    model = BoringModel()

    def check_num_actor():
        # While an epoch is running, one Ray actor should exist per worker.
        assert len(ray.actors()) == num_workers

    model.on_epoch_end = check_num_actor
    trainer = get_trainer(tmpdir, plugins=[RayPlugin(num_workers=num_workers)])
    trainer.fit(model)
    # After fit returns, every training actor should have been torn down.
    assert all(
        entry["State"] == ray.gcs_utils.ActorTableData.DEAD
        for entry in ray.actors().values())
def test_ddp_sharded_plugin_finetune(tmpdir, ray_start_2_cpus, seed):
    """Tests if we can save and restart training."""
    model = BoringModel()
    sharded_trainer = Trainer(
        plugins=[RayShardedPlugin(num_workers=2)], fast_dev_run=True)
    sharded_trainer.fit(model)

    ckpt_path = os.path.join(tmpdir, "model.pt")
    sharded_trainer.save_checkpoint(ckpt_path)
    restored = BoringModel.load_from_checkpoint(ckpt_path)

    # Finetune the restored model with a plain (non-sharded) trainer.
    finetune_trainer = Trainer(fast_dev_run=True)
    finetune_trainer.fit(restored)
def test_early_stop(tmpdir, ray_start_2_cpus):
    """Tests if early stopping callback works correctly."""
    model = BoringModel()
    stopper = EarlyStopping(monitor="val_loss", patience=2, verbose=True)
    trainer = get_trainer(
        tmpdir,
        max_epochs=500,
        accelerator=RayAccelerator(num_workers=1, use_gpu=False),
        callbacks=[stopper],
        limit_train_batches=1.0,
        limit_val_batches=1.0,
        progress_bar_refresh_rate=1)
    trainer.fit(model)
    best_model = BoringModel.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path)
    # With patience=2 the run should have stopped after the second val epoch.
    assert best_model.val_epoch == 2, best_model.val_epoch
def test_train_client(tmpdir, start_ray_client_server_2_cpus, seed, num_slots):
    """Tests if training modifies model weights."""
    # The fixture should have connected us through the Ray client server.
    assert ray.util.client.ray.is_connected()
    boring_model = BoringModel()
    horovod_plugin = HorovodRayPlugin(num_slots=num_slots, use_gpu=False)
    trainer = get_trainer(tmpdir, plugins=[horovod_plugin])
    train_test(trainer, boring_model)
def test_multi_node(tmpdir):
    """Tests if multi-node GPU training works.

    Connects to an existing Ray cluster ("auto") and launches one training
    worker per available GPU.
    """
    ray.init("auto")
    # ``ray.available_resources()`` reports quantities as floats; the worker
    # count must be an integer, so cast before constructing the accelerator.
    num_gpus = int(ray.available_resources()["GPU"])
    model = BoringModel()
    accelerator = RayAccelerator(num_workers=num_gpus, use_gpu=True)
    trainer = get_trainer(tmpdir, accelerator=accelerator, use_gpu=True)
    train_test(trainer, model)
def _inner_train(config):
    """Train a fresh BoringModel with the given Tune config."""
    # ``dir``, ``use_gpu``, ``callbacks`` and ``accelerator`` are free
    # variables captured from the enclosing scope.
    boring_model = BoringModel()
    tune_trainer = get_trainer(
        dir,
        use_gpu=use_gpu,
        callbacks=callbacks,
        accelerator=accelerator,
        **config)
    tune_trainer.fit(boring_model)
def test_multi_node(tmpdir):
    """Tests if multi-node GPU training works.

    Connects to an existing Ray cluster ("auto") and launches one training
    worker per available GPU via the RayPlugin.
    """
    ray.init("auto")
    # ``ray.available_resources()`` reports quantities as floats; the worker
    # count must be an integer, so cast before constructing the plugin.
    num_gpus = int(ray.available_resources()["GPU"])
    model = BoringModel()
    plugin = RayPlugin(num_workers=num_gpus, use_gpu=True)
    trainer = get_trainer(tmpdir, plugins=[plugin], use_gpu=True)
    train_test(trainer, model)
def test_ddp_sharded_plugin_checkpoint(tmpdir, ray_start_2_cpus, seed):
    """Tests if checkpoint is saved correctly."""
    model = BoringModel()
    trainer = Trainer(
        plugins=[RayShardedPlugin(num_workers=2)], fast_dev_run=True)
    trainer.fit(model)

    ckpt_path = os.path.join(tmpdir, "model.pt")
    trainer.save_checkpoint(ckpt_path)
    restored = BoringModel.load_from_checkpoint(ckpt_path)

    # Parameters of the restored model must match the trained model exactly.
    for trained_param, restored_param in zip(model.parameters(),
                                             restored.parameters()):
        assert torch.equal(trained_param, restored_param)
def test_ddp_sharded_plugin_test(tmpdir, ray_start_2_cpus, seed):
    """Tests if test works without fit."""
    boring_model = BoringModel()
    sharded_trainer = Trainer(
        fast_dev_run=True,
        plugins=[RayShardedPlugin(num_workers=2)])
    # ``test`` should run standalone, without a prior ``fit`` call.
    sharded_trainer.test(boring_model)
def _inner_train(config):
    """Train a fresh BoringModel with the given Tune config."""
    # ``dir``, ``use_gpu``, ``callbacks`` and ``plugin`` are free variables
    # captured from the enclosing scope.
    boring_model = BoringModel()
    tune_trainer = get_trainer(
        dir,
        use_gpu=use_gpu,
        callbacks=callbacks,
        plugins=[plugin],
        checkpoint_callback=False,
        **config)
    tune_trainer.fit(boring_model)
def test_ddp_sharded_plugin_resume_from_checkpoint_downsize(
        tmpdir, ray_start_2_cpus, seed):
    """Tests if we can save and resume training with less workers."""
    first_model = BoringModel()
    first_trainer = Trainer(
        plugins=[RayShardedPlugin(num_workers=2)], fast_dev_run=True)
    first_trainer.fit(first_model)

    ckpt_path = os.path.join(tmpdir, "model.pt")
    first_trainer.save_checkpoint(ckpt_path)

    # Resume from the two-worker checkpoint using only a single worker.
    second_model = BoringModel()
    second_trainer = Trainer(
        plugins=[RayShardedPlugin(num_workers=1)],
        fast_dev_run=True,
        resume_from_checkpoint=ckpt_path)
    second_trainer.fit(second_model)
def test_unused_parameters(tmpdir, ray_start_2_cpus):
    """Tests if find_unused_parameters is properly passed to model."""
    model = BoringModel()

    class UnusedParameterCallback(Callback):
        def on_train_start(self, trainer, pl_module):
            # The wrapped model should carry the flag through verbatim.
            assert trainer.model.find_unused_parameters is False

    plugin = RayPlugin(
        num_workers=2, use_gpu=False, find_unused_parameters=False)
    trainer = get_trainer(
        tmpdir, plugins=[plugin], callbacks=[UnusedParameterCallback()])
    trainer.fit(model)
def test_ddp_sharded_plugin_resume_from_checkpoint(tmpdir, ray_start_2_cpus,
                                                   seed):
    """Tests if resuming from checkpoint works."""
    first_model = BoringModel()
    first_trainer = Trainer(
        plugins=[RayShardedPlugin(num_workers=2)], fast_dev_run=True)
    first_trainer.fit(first_model)

    ckpt_path = os.path.join(tmpdir, "model.pt")
    first_trainer.save_checkpoint(ckpt_path)

    # Resume training from the saved checkpoint with the same worker count.
    second_model = BoringModel()
    second_trainer = Trainer(
        plugins=[RayShardedPlugin(num_workers=2)],
        fast_dev_run=True,
        resume_from_checkpoint=ckpt_path)
    second_trainer.fit(second_model)
def test_model_to_gpu(tmpdir, ray_start_2_gpus):
    """Tests if model is placed on CUDA device."""

    class CheckGPUCallback(Callback):
        def on_epoch_end(self, trainer, pl_module):
            # All parameters should live on the GPU once training has begun.
            assert next(pl_module.parameters()).is_cuda

    boring_model = BoringModel()
    trainer = get_trainer(
        tmpdir,
        plugins=[RayPlugin(num_workers=2, use_gpu=True)],
        use_gpu=True,
        callbacks=[CheckGPUCallback()])
    trainer.fit(boring_model)
def test_ddp_choice_sharded(tmpdir, ray_start_2_cpus, seed):
    """Tests if sharded plugin is properly recognized."""

    class PluginCheckCallback(Callback):
        def on_fit_start(self, trainer, pl_module):
            training_plugin = trainer.accelerator.training_type_plugin
            assert isinstance(training_plugin, RayShardedPlugin)
            # Abort immediately; the plugin type is all we need to verify.
            raise SystemExit()

    model = BoringModel()
    trainer = Trainer(
        fast_dev_run=True,
        plugins=[RayShardedPlugin(num_workers=2)],
        callbacks=[PluginCheckCallback()])
    with pytest.raises(SystemExit):
        trainer.fit(model)
def test_correct_devices(tmpdir, ray_start_2_gpus):
    """Tests if GPU devices are correctly set."""
    model = BoringModel()

    class CheckDevicesCallback(Callback):
        def on_epoch_end(self, trainer, pl_module):
            # Each worker sees exactly one GPU, remapped to local index 0.
            assert trainer.root_gpu == 0
            # CUDA_VISIBLE_DEVICES pins each worker to its own physical GPU.
            assert int(os.environ["CUDA_VISIBLE_DEVICES"]) == \
                trainer.local_rank
            assert trainer.root_gpu == pl_module.device.index
            assert torch.cuda.current_device() == trainer.root_gpu

    plugin = RayPlugin(num_workers=2, use_gpu=True)
    # Pass the plugin inside a list, consistent with the other tests in this
    # file (was previously passed bare).
    trainer = get_trainer(
        tmpdir,
        plugins=[plugin],
        use_gpu=True,
        callbacks=[CheckDevicesCallback()])
    trainer.fit(model)
def test_load_gpu(tmpdir, ray_start_2_gpus, seed, num_slots):
    """Tests if model checkpoint can be loaded."""
    boring_model = BoringModel()
    horovod_accelerator = HorovodRayAccelerator(
        num_slots=num_slots, use_gpu=True)
    trainer = get_trainer(
        tmpdir, accelerator=horovod_accelerator, use_gpu=True)
    load_test(trainer, boring_model)
def test_load(tmpdir, ray_start_2_cpus, num_workers):
    """Tests if model checkpoint can be loaded."""
    boring_model = BoringModel()
    ray_plugin = RayPlugin(num_workers=num_workers, use_gpu=False)
    trainer = get_trainer(tmpdir, plugins=[ray_plugin])
    load_test(trainer, boring_model)
def test_train(tmpdir, ray_start_2_cpus, num_workers):
    """Tests if training modifies model weights."""
    boring_model = BoringModel()
    ray_plugin = RayPlugin(num_workers=num_workers)
    trainer = get_trainer(tmpdir, plugins=[ray_plugin])
    train_test(trainer, boring_model)
def test_load(tmpdir, ray_start_2_cpus, num_workers):
    """Tests if model checkpoint can be loaded."""
    boring_model = BoringModel()
    ray_accelerator = RayAccelerator(num_workers=num_workers, use_gpu=False)
    trainer = get_trainer(tmpdir, accelerator=ray_accelerator)
    load_test(trainer, boring_model)
def test_train(tmpdir, ray_start_2_cpus, num_workers):
    """Tests if training modifies model weights."""
    boring_model = BoringModel()
    ray_accelerator = RayAccelerator(num_workers=num_workers)
    trainer = get_trainer(tmpdir, accelerator=ray_accelerator)
    train_test(trainer, boring_model)
def test_train_gpu(tmpdir, ray_start_2_gpus, seed, num_slots):
    """Tests if training modifies model weights."""
    boring_model = BoringModel()
    horovod_plugin = HorovodRayPlugin(num_slots=num_slots, use_gpu=True)
    trainer = get_trainer(tmpdir, plugins=[horovod_plugin], use_gpu=True)
    train_test(trainer, boring_model)
def test_load_gpu(tmpdir, ray_start_2_gpus, seed, num_slots):
    """Tests if model checkpoint can be loaded."""
    boring_model = BoringModel()
    horovod_plugin = HorovodRayPlugin(num_slots=num_slots, use_gpu=True)
    trainer = get_trainer(tmpdir, plugins=[horovod_plugin], use_gpu=True)
    load_test(trainer, boring_model)
def test_train_client(tmpdir, start_ray_client_server_2_cpus, num_workers):
    """Tests if training modifies model weights over the Ray client."""
    # The fixture should have connected us through the Ray client server.
    assert ray.util.client.ray.is_connected()
    boring_model = BoringModel()
    ray_plugin = RayPlugin(num_workers=num_workers)
    trainer = get_trainer(tmpdir, plugins=[ray_plugin])
    train_test(trainer, boring_model)
def test_train_gpu(tmpdir, ray_start_2_gpus, seed, num_slots):
    """Tests if training modifies model weights."""
    boring_model = BoringModel()
    horovod_accelerator = HorovodRayAccelerator(
        num_slots=num_slots, use_gpu=True)
    trainer = get_trainer(
        tmpdir, accelerator=horovod_accelerator, use_gpu=True)
    train_test(trainer, boring_model)