def test_collect_states():
    """Check that states are collected correctly across processes.

    Two worker processes are spawned, each running ``_test_collect_states``
    with the process count as its extra argument. Collecting dataloader
    states is one example use of this mechanism.
    """
    num_processes = 2
    tutils.set_random_main_port()
    mp.spawn(_test_collect_states, args=(num_processes,), nprocs=num_processes)
def test_result_reduce_horovod(tmpdir):
    """Verify that logged results are reduced across ranks under Horovod.

    This test mirrors tests/core/test_results.py::_ddp_test_fn
    """
    tutils.reset_seed()
    tutils.set_random_main_port()

    def hvd_test_fn():
        # Put the directory two levels above this file at the front of the
        # import path before anything else runs in the worker.
        this_dir = os.path.abspath(os.path.dirname(__file__))
        root_dir = os.path.abspath(os.path.join(this_dir, "..", ".."))
        sys.path.insert(0, os.path.abspath(root_dir))

        class TestModel(BoringModel):
            def training_step(self, batch, batch_idx):
                self.training_step_called = True
                value = torch.tensor([1.0])
                self.log(
                    "test_tensor",
                    value,
                    sync_dist=True,
                    reduce_fx="sum",
                    on_step=True,
                    on_epoch=True,
                )

                results = self._results

                # Every rank logs 1.0 with a "sum" reduction, so the synced
                # value is expected to equal the number of Horovod workers.
                assert (
                    results["test_tensor"].item() == hvd.size()
                ), "Result-Log does not work properly with Horovod and Tensors"

            def training_epoch_end(self, outputs) -> None:
                # `training_step` returns nothing, so no outputs are expected.
                assert len(outputs) == 0

        model = TestModel()
        model.val_dataloader = None

        trainer_kwargs = {
            "default_root_dir": tmpdir,
            "limit_train_batches": 2,
            "limit_val_batches": 2,
            "max_epochs": 1,
            "log_every_n_steps": 1,
            "enable_model_summary": False,
            "logger": False,
        }
        trainer = Trainer(**trainer_kwargs)
        trainer.fit(model)

    horovod.run(hvd_test_fn, np=2)
def test_multi_gpu_model_ddp_spawn(tmpdir):
    """Run the standard model test on GPUs 0 and 1 with the ``ddp_spawn`` strategy."""
    tutils.set_random_main_port()

    trainer_options = {
        "default_root_dir": tmpdir,
        "max_epochs": 1,
        "limit_train_batches": 10,
        "limit_val_batches": 10,
        "accelerator": "gpu",
        "devices": [0, 1],
        "strategy": "ddp_spawn",
        "enable_progress_bar": False,
    }

    tpipes.run_model_test(trainer_options, BoringModel())
def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
    """Run the model test with ``ddp_spawn`` on two GPUs and early stopping on ``train_acc``."""
    tutils.set_random_main_port()

    early_stopping = EarlyStopping(monitor="train_acc")
    trainer_options = {
        "default_root_dir": tmpdir,
        "callbacks": [early_stopping],
        "max_epochs": 50,
        "limit_train_batches": 10,
        "limit_val_batches": 10,
        "accelerator": "gpu",
        "devices": [0, 1],
        "strategy": "ddp_spawn",
    }

    datamodule = ClassifDataModule()
    classifier = ClassificationModel()
    tpipes.run_model_test(trainer_options, classifier, datamodule)
def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir):
    """Verify `test()` on a model reloaded from the best checkpoint (``ddp_spawn``, two GPUs)."""
    tutils.set_random_main_port()

    dm = ClassifDataModule()
    model = ClassificationModel()

    # The logger provides the experiment metadata; the checkpoint callback
    # built from it provides the saved weights.
    logger = tutils.get_default_logger(tmpdir)
    checkpoint = tutils.init_checkpoint_callback(logger)

    trainer_options = {
        "enable_progress_bar": False,
        "max_epochs": 2,
        "limit_train_batches": 2,
        "limit_val_batches": 2,
        "callbacks": [checkpoint],
        "logger": logger,
        "accelerator": "gpu",
        "devices": [0, 1],
        "strategy": "ddp_spawn",
        "default_root_dir": tmpdir,
    }

    # Train first so that a checkpoint exists.
    trainer = Trainer(**trainer_options)
    trainer.fit(model, datamodule=dm)

    data_path = tutils.get_data_path(logger, path_dir=tmpdir)
    log.info(os.listdir(data_path))

    # Training must have finished before the checkpoint is reloaded.
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    best_path = trainer.checkpoint_callback.best_model_path
    pretrained_model = ClassificationModel.load_from_checkpoint(best_path)

    # Evaluate the reloaded model on the test set with a fresh trainer.
    new_trainer = Trainer(**trainer_options)
    new_trainer.test(pretrained_model, datamodule=dm)
    pretrained_model.cpu()

    # Normalise to a list so one or several test dataloaders are handled alike.
    test_loaders = dm.test_dataloader()
    test_loaders = test_loaders if isinstance(test_loaders, list) else [test_loaders]

    for loader in test_loaders:
        tpipes.run_model_prediction(pretrained_model, loader, min_acc=0.1)
def test_multi_cpu_model_ddp(tmpdir):
    """Make sure the ``ddp_spawn`` strategy works on two CPU devices."""
    tutils.set_random_main_port()

    trainer_options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 1,
        "limit_train_batches": 0.4,
        "limit_val_batches": 0.2,
        "accelerator": "cpu",
        "devices": 2,
        "strategy": "ddp_spawn",
    }

    datamodule = ClassifDataModule()
    classifier = ClassificationModel()
    tpipes.run_model_test(trainer_options, classifier, data=datamodule)
def test_ddp_all_dataloaders_passed_to_fit(tmpdir):
    """Make sure DDP works when the dataloaders are handed directly to ``fit()``."""
    tutils.set_random_main_port()

    model = BoringModel()

    trainer_kwargs = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 1,
        "limit_train_batches": 0.2,
        "limit_val_batches": 0.2,
        "accelerator": "gpu",
        "devices": [0, 1],
        "strategy": "ddp_spawn",
    }
    trainer = Trainer(**trainer_kwargs)

    # Pass the model's own loaders explicitly instead of letting the trainer
    # fetch them from the model.
    trainer.fit(
        model,
        train_dataloaders=model.train_dataloader(),
        val_dataloaders=model.val_dataloader(),
    )

    assert trainer.state.finished, "DDP doesn't work with dataloaders passed to fit()."
def test_amp_gpu_ddp_slurm_managed(tmpdir):
    """Make sure DDP + AMP work and that SLURM root node addresses are resolved."""
    # NOTE(review): the SLURM flags are presumably simulated outside this
    # function (e.g. by a decorator not visible here) - confirm.
    tutils.set_random_main_port()

    model = AMPTestModel()

    # The logger provides the experiment metadata; the checkpoint callback
    # built from it provides the saved weights.
    logger = tutils.get_default_logger(tmpdir)
    checkpoint = tutils.init_checkpoint_callback(logger)

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        accelerator="gpu",
        devices=[0],
        strategy="ddp_spawn",
        precision=16,
        callbacks=[checkpoint],
        logger=logger,
    )
    trainer.fit(model)

    assert trainer.state.finished, "amp + ddp model failed to complete"

    # The trainer must have selected the SLURM cluster environment.
    assert isinstance(trainer.strategy.cluster_environment, SLURMEnvironment)

    # Node list specification -> expected root node address.
    address_cases = (
        ("abc", "abc"),
        ("abc[23]", "abc23"),
        ("abc[23-24]", "abc23"),
        ("abc[23-24, 45-40, 40]", "abc23"),
    )
    for node_spec, expected_address in address_cases:
        generated = trainer.strategy.cluster_environment.resolve_root_node_address(node_spec)
        assert generated == expected_address
def test_multi_gpu_early_stop_dp(tmpdir):
    """Make sure the ``dp`` strategy works on two GPUs with early stopping on ``val_acc``."""
    tutils.set_random_main_port()

    dm = ClassifDataModule()
    model = CustomClassificationModelDP()

    early_stopping = EarlyStopping(monitor="val_acc")
    trainer_options = {
        "default_root_dir": tmpdir,
        "callbacks": [early_stopping],
        "max_epochs": 50,
        "limit_train_batches": 10,
        "limit_val_batches": 10,
        "accelerator": "gpu",
        "devices": [0, 1],
        "strategy": "dp",
    }

    tpipes.run_model_test(trainer_options, model, dm)
def test_evaluate(tmpdir, trainer_kwargs):
    """Check that ``validate()`` and ``test()`` leave the trained weights untouched.

    Args:
        tmpdir: Directory used as the trainer's ``default_root_dir``.
        trainer_kwargs: Extra keyword arguments forwarded to ``Trainer``.
    """
    tutils.set_random_main_port()
    seed_everything(1)

    dm = ClassifDataModule()
    model = CustomClassificationModelDP()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=10,
        limit_val_batches=10,
        **trainer_kwargs,
    )
    trainer.fit(model, datamodule=dm)
    assert "ckpt" in trainer.checkpoint_callback.best_model_path

    def snapshot_first_layer():
        # Detached CPU copy of the first layer's weights.
        return model.layer_0.weight.clone().detach().cpu()

    weights_before = snapshot_first_layer()

    trainer.validate(datamodule=dm)
    trainer.test(datamodule=dm)

    # Evaluation must not have modified the weights.
    weights_after = snapshot_first_layer()
    torch_test_assert_close(weights_before, weights_after)
def test_model_saves_on_multi_gpu(tmpdir):
    """Test that ONNX model saves on a distributed backend."""
    tutils.set_random_main_port()

    trainer_options = {
        "default_root_dir": tmpdir,
        "max_epochs": 1,
        "limit_train_batches": 10,
        "limit_val_batches": 10,
        "accelerator": "gpu",
        "devices": [0, 1],
        "strategy": "ddp_spawn",
        "enable_progress_bar": False,
    }

    model = BoringModel()
    # Set an example input on the model before the ONNX export below.
    model.example_input_array = torch.randn(5, 32)

    tpipes.run_model_test(trainer_options, model, min_acc=0.08)

    onnx_path = os.path.join(tmpdir, "model.onnx")
    model.to_onnx(onnx_path)
    assert os.path.exists(onnx_path)
def test_result_reduce_ddp():
    """Make sure result logging works with DDP.

    Spawns two processes running ``_ddp_test_fn``, each receiving the world
    size as its extra argument.
    """
    world_size = 2
    tutils.set_random_main_port()
    mp.spawn(_ddp_test_fn, nprocs=world_size, args=(world_size,))
def run_test_from_config(trainer_options, on_gpu, check_size):
    """Train, test, reload and checkpoint the default model with the given config.

    Args:
        trainer_options: Keyword arguments for ``Trainer``. Must contain
            ``"default_root_dir"``; its ``"callbacks"`` entry is overwritten in
            place with a ``ModelCheckpoint`` writing to that directory.
        on_gpu: Whether the model is expected on ``cuda:<local_rank>`` when
            training starts (otherwise on the CPU).
        check_size: When true, assert that ``hvd.size()`` equals 2 after
            training and testing.
    """
    set_random_main_port()
    reset_seed()

    ckpt_path = trainer_options["default_root_dir"]
    trainer_options["callbacks"] = [ModelCheckpoint(dirpath=ckpt_path)]

    class TestModel(BoringModel):
        def on_train_start(self) -> None:
            if on_gpu:
                expected_device = torch.device("cuda", self.trainer.local_rank)
            else:
                expected_device = torch.device("cpu")
            assert self.device == expected_device

        def training_epoch_end(self, outputs) -> None:
            # Sum-reducing a scalar 1.0 from every process must yield the world size.
            one = torch.tensor(1.0, device=self.device)
            reduced = self.trainer.strategy.reduce(one, reduce_op="sum")
            assert reduced.sum() == self.trainer.strategy.world_size

    model = TestModel()
    trainer = Trainer(**trainer_options)

    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    trainer.test(model)
    assert model.device == torch.device("cpu")

    # Horovod should be initialized following training. If not, this will raise an exception.
    if check_size:
        assert hvd.size() == 2

    # Only global rank 0 continues with the loading / checkpoint checks.
    if trainer.global_rank > 0:
        return

    # Reload the model from the best checkpoint.
    best_path = trainer.checkpoint_callback.best_model_path
    pretrained_model = BoringModel.load_from_checkpoint(best_path)

    # Run the reloaded model on the first batch of each test dataloader.
    test_loaders = model.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    for loader in test_loaders:
        first_batch = next(iter(loader))
        pretrained_model(first_batch)

    # HPC saving: finalize the logger first to make sure we get all the metrics.
    if trainer.logger:
        trainer.logger.finalize("finished")

    connector = trainer._checkpoint_connector
    hpc_save_path = connector.hpc_save_path(ckpt_path)
    trainer.save_checkpoint(hpc_save_path)

    # HPC loading: restore from the checkpoint the connector picks out of the folder.
    # The name-mangled attribute reaches the connector's private helper.
    checkpoint_path = connector._CheckpointConnector__get_max_ckpt_path_from_folder(ckpt_path)
    connector.restore(checkpoint_path)

    if on_gpu:
        trainer = Trainer(accelerator="gpu", devices=1, strategy="horovod", max_epochs=1)
        # The root device index must match this process's Horovod local rank.
        assert trainer.strategy.root_device.index == hvd.local_rank()
def test_gather_all_tensors(backend, process):
    """Spawn two processes running ``process`` for the given ``backend``.

    Args:
        backend: Forwarded to ``process`` as its second extra argument.
        process: Callable executed in each spawned process; it receives the
            process count and ``backend`` as extra arguments.
    """
    num_processes = 2
    tutils.set_random_main_port()
    mp.spawn(process, nprocs=num_processes, args=(num_processes, backend))