def test_amp_gpu_ddp_slurm_managed(tmpdir):
    """Make sure DDP + AMP work."""
    # simulate setting slurm flags
    tutils.set_random_main_port()
    model = AMPTestModel()

    # exp file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # exp file to get weights
    checkpoint = tutils.init_checkpoint_callback(logger)

    # fit model
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        gpus=[0],
        strategy="ddp_spawn",
        precision=16,
        callbacks=[checkpoint],
        logger=logger,
    )
    trainer.fit(model)

    # correct result and ok accuracy
    assert trainer.state.finished, "amp + ddp model failed to complete"

    # test root model address
    assert isinstance(trainer.strategy.cluster_environment, SLURMEnvironment)
    assert trainer.strategy.cluster_environment.resolve_root_node_address("abc") == "abc"
    assert trainer.strategy.cluster_environment.resolve_root_node_address("abc[23]") == "abc23"
    assert trainer.strategy.cluster_environment.resolve_root_node_address("abc[23-24]") == "abc23"
    generated = trainer.strategy.cluster_environment.resolve_root_node_address("abc[23-24, 45-40, 40]")
    assert generated == "abc23"
def test_collect_states():
    """This test ensures states are properly collected across processes.

    This would be used to collect dataloader states as an example.
    """
    tutils.set_random_main_port()
    mp.spawn(_test_collect_states, args=(2,), nprocs=2)
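# A hedged sketch (not the real `_test_collect_states` helper, which lives elsewhere in
# the test suite) of what the spawned worker might look like: each rank joins a gloo
# process group and all-gathers a small per-rank state dict, mirroring how dataloader
# states could be collected across processes. The function name, env defaults, and the
# gloo backend are assumptions for illustration only.
def _collect_states_sketch(rank: int, world_size: int) -> None:
    import os

    import torch.distributed as dist

    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12910")
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    # every rank contributes its own state; all ranks receive the full list
    gathered = [None] * world_size
    dist.all_gather_object(gathered, {"rank": rank})
    assert gathered == [{"rank": i} for i in range(world_size)]
    dist.destroy_process_group()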
def test_sync_batchnorm_ddp(tmpdir):
    seed_everything(234)
    set_random_main_port()

    # define datamodule and dataloader
    dm = MNISTDataModule()
    dm.prepare_data()
    dm.setup(stage=None)

    train_dataloader = dm.train_dataloader()
    model = SyncBNModule()

    bn_outputs = []

    # shuffle is false by default
    for batch_idx, batch in enumerate(train_dataloader):
        x, _ = batch

        _, out_bn = model.forward(x, batch_idx)
        bn_outputs.append(out_bn)

        # get 3 steps
        if batch_idx == 2:
            break

    bn_outputs = [x.cuda() for x in bn_outputs]

    # reset datamodule
    # batch-size = 16 because 2 GPUs in DDP
    dm = MNISTDataModule(batch_size=16, dist_sampler=True)
    dm.prepare_data()
    dm.setup(stage=None)

    model = SyncBNModule(gpu_count=2, bn_targets=bn_outputs)
    ddp = DDPSpawnStrategy(
        parallel_devices=[torch.device("cuda", 0), torch.device("cuda", 1)],
        num_nodes=1,
        sync_batchnorm=True,
        cluster_environment=LightningEnvironment(),
        find_unused_parameters=True,
    )

    trainer = Trainer(
        default_root_dir=tmpdir,
        gpus=2,
        num_nodes=1,
        strategy=ddp,
        max_epochs=1,
        max_steps=3,
        sync_batchnorm=True,
        num_sanity_val_steps=0,
        replace_sampler_ddp=False,
    )
    trainer.fit(model, dm)

    # the strategy is responsible for tearing down the batchnorm wrappers
    assert not isinstance(model.bn_layer, torch.nn.modules.batchnorm.SyncBatchNorm)
    assert isinstance(model.bn_layer, torch.nn.modules.batchnorm._BatchNorm)
def test_result_reduce_horovod(tmpdir):
    """Make sure result logging works with Horovod.

    This test mirrors tests/core/test_results.py::_ddp_test_fn
    """
    tutils.reset_seed()
    tutils.set_random_main_port()

    def hvd_test_fn():
        path_here = os.path.abspath(os.path.dirname(__file__))
        path_root = os.path.abspath(os.path.join(path_here, "..", ".."))
        sys.path.insert(0, os.path.abspath(path_root))

        class TestModel(BoringModel):
            def training_step(self, batch, batch_idx):
                self.training_step_called = True

                tensor = torch.tensor([1.0])
                self.log("test_tensor", tensor, sync_dist=True, reduce_fx="sum", on_step=True, on_epoch=True)

                res = self._results

                # Check that `tensor` is summed across all ranks automatically
                assert (
                    res["test_tensor"].item() == hvd.size()
                ), "Result-Log does not work properly with Horovod and Tensors"

            def training_epoch_end(self, outputs) -> None:
                assert len(outputs) == 0

        model = TestModel()
        model.val_dataloader = None

        trainer = Trainer(
            default_root_dir=tmpdir,
            limit_train_batches=2,
            limit_val_batches=2,
            max_epochs=1,
            log_every_n_steps=1,
            enable_model_summary=False,
            logger=False,
        )

        trainer.fit(model)

    horovod.run(hvd_test_fn, np=2)
def test_multi_gpu_none_backend(tmpdir):
    """Make sure when using multiple GPUs the user can't use `accelerator = None`."""
    tutils.set_random_main_port()
    trainer_options = dict(
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        max_epochs=1,
        limit_train_batches=0.2,
        limit_val_batches=0.2,
        gpus=2,
    )

    dm = ClassifDataModule()
    model = ClassificationModel()
    tpipes.run_model_test(trainer_options, model, dm)
def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir):
    """Verify `test()` on pretrained model."""
    tutils.set_random_main_port()
    dm = ClassifDataModule()
    model = ClassificationModel()

    # exp file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # exp file to get weights
    checkpoint = tutils.init_checkpoint_callback(logger)

    trainer_options = dict(
        enable_progress_bar=False,
        max_epochs=2,
        limit_train_batches=2,
        limit_val_batches=2,
        callbacks=[checkpoint],
        logger=logger,
        gpus=[0, 1],
        strategy="ddp_spawn",
        default_root_dir=tmpdir,
    )

    # fit model
    trainer = Trainer(**trainer_options)
    trainer.fit(model, datamodule=dm)

    log.info(os.listdir(tutils.get_data_path(logger, path_dir=tmpdir)))

    # correct result and ok accuracy
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    pretrained_model = ClassificationModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # run test set
    new_trainer = Trainer(**trainer_options)
    new_trainer.test(pretrained_model, datamodule=dm)
    pretrained_model.cpu()

    dataloaders = dm.test_dataloader()
    if not isinstance(dataloaders, list):
        dataloaders = [dataloaders]

    for dataloader in dataloaders:
        tpipes.run_prediction_eval_model_template(pretrained_model, dataloader, min_acc=0.1)
def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
    tutils.set_random_main_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        callbacks=[EarlyStopping(monitor="train_acc")],
        max_epochs=50,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        strategy="ddp_spawn",
    )

    dm = ClassifDataModule()
    model = ClassificationModel()
    tpipes.run_model_test(trainer_options, model, dm)
def test_ddp_all_dataloaders_passed_to_fit(tmpdir):
    """Make sure DDP works with dataloaders passed to fit()."""
    tutils.set_random_main_port()

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        max_epochs=1,
        limit_train_batches=0.2,
        limit_val_batches=0.2,
        gpus=[0, 1],
        strategy="ddp_spawn",
    )
    trainer.fit(model, train_dataloaders=model.train_dataloader(), val_dataloaders=model.val_dataloader())
    assert trainer.state.finished, "DDP doesn't work with dataloaders passed to fit()."
def test_multi_gpu_model_dp(tmpdir):
    tutils.set_random_main_port()
    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=10,
        limit_val_batches=10,
        accelerator="gpu",
        devices=[0, 1],
        strategy="dp",
        enable_progress_bar=False,
    )

    model = BoringModel()

    tpipes.run_model_test(trainer_options, model)
def test_multi_cpu_model_ddp(tmpdir):
    """Make sure DDP works on CPU."""
    tutils.set_random_main_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        max_epochs=1,
        limit_train_batches=0.4,
        limit_val_batches=0.2,
        gpus=None,
        num_processes=2,
        strategy="ddp_spawn",
    )

    dm = ClassifDataModule()
    model = ClassificationModel()
    tpipes.run_model_test(trainer_options, model, data=dm, on_gpu=False)
def test_multi_gpu_model_ddp_spawn(tmpdir):
    tutils.set_random_main_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        strategy="ddp_spawn",
        enable_progress_bar=False,
    )

    model = BoringModel()

    tpipes.run_model_test(trainer_options, model)

    # test memory helper functions
    memory.get_memory_profile("min_max")
def test_multi_gpu_early_stop_dp(tmpdir):
    """Make sure DP works with early stopping."""
    tutils.set_random_main_port()

    dm = ClassifDataModule()
    model = CustomClassificationModelDP()

    trainer_options = dict(
        default_root_dir=tmpdir,
        callbacks=[EarlyStopping(monitor="val_acc")],
        max_epochs=50,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        strategy="dp",
    )

    tpipes.run_model_test(trainer_options, model, dm)
def test_evaluate(tmpdir, trainer_kwargs):
    tutils.set_random_main_port()
    seed_everything(1)
    dm = ClassifDataModule()
    model = CustomClassificationModelDP()
    trainer = Trainer(
        default_root_dir=tmpdir, max_epochs=2, limit_train_batches=10, limit_val_batches=10, **trainer_kwargs
    )

    trainer.fit(model, datamodule=dm)
    assert "ckpt" in trainer.checkpoint_callback.best_model_path

    old_weights = model.layer_0.weight.clone().detach().cpu()

    trainer.validate(datamodule=dm)
    trainer.test(datamodule=dm)

    # make sure weights didn't change
    new_weights = model.layer_0.weight.clone().detach().cpu()
    torch.testing.assert_allclose(old_weights, new_weights)
def test_model_saves_on_multi_gpu(tmpdir):
    """Test that an ONNX model saves on a distributed backend."""
    tutils.set_random_main_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        strategy="ddp_spawn",
        enable_progress_bar=False,
    )

    model = BoringModel()
    model.example_input_array = torch.randn(5, 32)

    tpipes.run_model_test(trainer_options, model, min_acc=0.08)

    file_path = os.path.join(tmpdir, "model.onnx")
    model.to_onnx(file_path)
    assert os.path.exists(file_path) is True
def run_test_from_config(trainer_options, on_gpu, check_size=True):
    """Trains the default model with the given config."""
    set_random_main_port()
    reset_seed()

    ckpt_path = trainer_options["weights_save_path"]
    trainer_options.update(callbacks=[ModelCheckpoint(dirpath=ckpt_path)])

    class TestModel(BoringModel):
        def on_train_start(self) -> None:
            expected_device = torch.device("cuda", self.trainer.local_rank) if on_gpu else torch.device("cpu")
            assert self.device == expected_device

        def training_epoch_end(self, outputs) -> None:
            res = self.trainer.strategy.reduce(torch.tensor(1.0, device=self.device), reduce_op="sum")
            assert res.sum() == self.trainer.strategy.world_size

    model = TestModel()
    trainer = Trainer(**trainer_options)

    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    trainer.test(model)

    assert model.device == torch.device("cpu")

    # Horovod should be initialized following training. If not, this will raise an exception.
    if check_size:
        assert hvd.size() == 2

    if trainer.global_rank > 0:
        return

    # test model loading
    pretrained_model = BoringModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # test new model accuracy
    test_loaders = model.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    for dataloader in test_loaders:
        batch = next(iter(dataloader))
        pretrained_model(batch)

    # test HPC saving
    # save logger to make sure we get all the metrics
    if trainer.logger:
        trainer.logger.finalize("finished")
    hpc_save_path = trainer._checkpoint_connector.hpc_save_path(ckpt_path)
    trainer.save_checkpoint(hpc_save_path)

    # test HPC loading
    checkpoint_path = trainer._checkpoint_connector._CheckpointConnector__get_max_ckpt_path_from_folder(ckpt_path)
    trainer._checkpoint_connector.restore(checkpoint_path)

    if on_gpu:
        trainer = Trainer(gpus=1, strategy="horovod", max_epochs=1)
        # Test the root_gpu property
        assert trainer.root_gpu == hvd.local_rank()
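# A hedged sketch of how this helper might be driven from the command line when the
# module is launched under `horovodrun` (hypothetical argument names; the real entry
# point may differ): the Trainer options arrive as a JSON string and are forwarded to
# `run_test_from_config`.
if __name__ == "__main__":
    import argparse
    import json

    parser = argparse.ArgumentParser()
    parser.add_argument("--trainer-options", required=True, help="JSON-encoded Trainer kwargs")
    parser.add_argument("--on-gpu", action="store_true", default=False)
    args = parser.parse_args()

    run_test_from_config(json.loads(args.trainer_options), args.on_gpu)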
def test_result_reduce_ddp():
    """Make sure result logging works with DDP."""
    tutils.set_random_main_port()

    worldsize = 2
    mp.spawn(_ddp_test_fn, args=(worldsize,), nprocs=worldsize)