def test_dp_test(tmpdir):
    """Fit with the ``dp`` accelerator on GPUs 0 and 1, then call ``trainer.test`` twice.

    The first ``test`` call passes only the datamodule; the second also passes the
    model explicitly. The ``layer_0`` weights are snapshotted between the two calls
    and compared afterwards to verify that the second test run left them untouched.
    """
    tutils.set_random_master_port()

    datamodule = ClassifDataModule()
    model = CustomClassificationModelDP()

    trainer_kwargs = dict(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        accelerator='dp',
    )
    trainer = pl.Trainer(**trainer_kwargs)
    trainer.fit(model, datamodule=datamodule)
    assert 'ckpt' in trainer.checkpoint_callback.best_model_path

    # First evaluation: no model argument is given to ``test``.
    first_results = trainer.test(datamodule=datamodule)
    assert 'test_acc' in first_results[0]

    weights_before = model.layer_0.weight.clone().detach().cpu()

    # Second evaluation: the fitted model is passed explicitly.
    second_results = trainer.test(model, datamodule=datamodule)
    assert 'test_acc' in second_results[0]

    # make sure weights didn't change
    weights_after = model.layer_0.weight.clone().detach().cpu()
    assert torch.eq(weights_before, weights_after).all()
def test_fast_forward_sampler_with_distributed_sampler_and_iterative_dataset():
    """Run the fast-forward sampler worker function in two spawned processes.

    Launches ``worldsize`` processes with ``mp.spawn``; each one executes
    ``_test_fast_forward_sampler_with_distributed_sampler_and_iterative_dataset``
    and receives ``worldsize`` through ``args``.
    """
    tutils.set_random_master_port()
    worldsize = 2
    mp.spawn(
        _test_fast_forward_sampler_with_distributed_sampler_and_iterative_dataset,
        args=(worldsize,),
        nprocs=worldsize,
    )
def test_sync_batchnorm_ddp(tmpdir):
    """Fit ``SyncBNModule`` with ``ddp_spawn`` on two CUDA devices and ``sync_batchnorm=True``.

    A first ``SyncBNModule`` is run forward on the first three training batches in
    the current process to collect batch-norm outputs. Those outputs are moved to
    CUDA and handed to a second ``SyncBNModule`` as ``bn_targets`` (how the module
    uses them is defined outside this function), which is then trained for at most
    three steps through a ``DDPSpawnPlugin``.
    """
    seed_everything(234)
    set_random_master_port()

    # define datamodule and dataloader
    datamodule = MNISTDataModule()
    datamodule.prepare_data()
    datamodule.setup(stage=None)
    loader = datamodule.train_dataloader()

    module = SyncBNModule()

    reference_bn = []
    # shuffle is false by default (original author's note)
    for step, (inputs, _) in enumerate(loader):
        _, bn_out = module.forward(inputs, step)
        reference_bn.append(bn_out)

        # get 3 steps
        if step == 2:
            break

    reference_bn = [tensor.cuda() for tensor in reference_bn]

    # reset datamodule
    # batch-size = 16 because 2 GPUs in DDP
    datamodule = MNISTDataModule(batch_size=16, dist_sampler=True)
    datamodule.prepare_data()
    datamodule.setup(stage=None)

    module = SyncBNModule(gpu_count=2, bn_targets=reference_bn)

    cuda_devices = [torch.device("cuda", 0), torch.device("cuda", 1)]
    ddp_plugin = DDPSpawnPlugin(
        parallel_devices=cuda_devices,
        num_nodes=1,
        sync_batchnorm=True,
        cluster_environment=LightningEnvironment(),
        find_unused_parameters=True,
    )

    trainer_kwargs = dict(
        default_root_dir=tmpdir,
        gpus=2,
        num_nodes=1,
        accelerator="ddp_spawn",
        max_epochs=1,
        max_steps=3,
        sync_batchnorm=True,
        num_sanity_val_steps=0,
        replace_sampler_ddp=False,
        plugins=[ddp_plugin],
    )
    trainer = Trainer(**trainer_kwargs)

    trainer.fit(module, datamodule)
    assert trainer.state.finished, "Sync batchnorm failing with DDP"
def test_result_reduce_horovod(tmpdir):
    """Make sure result logging works with Horovod.

    This test mirrors tests/core/test_results.py::_ddp_test_fn

    The inner ``hvd_test_fn`` is executed through ``horovod.run`` with ``np=2``.
    """
    tutils.reset_seed()
    tutils.set_random_master_port()

    def hvd_test_fn():
        # Put the directory two levels above this file at the front of ``sys.path``.
        this_dir = os.path.abspath(os.path.dirname(__file__))
        two_levels_up = os.path.abspath(os.path.join(this_dir, "..", ".."))
        sys.path.insert(0, two_levels_up)

        class TestModel(BoringModel):
            def training_step(self, batch, batch_idx):
                self.training_step_called = True

                one = torch.tensor([1.0])
                self.log(
                    "test_tensor",
                    one,
                    sync_dist=True,
                    reduce_fx="sum",
                    on_step=True,
                    on_epoch=True,
                )

                # Check that `tensor` is summed across all ranks automatically
                logged_value = self._results["test_tensor"].item()
                assert logged_value == hvd.size(), "Result-Log does not work properly with Horovod and Tensors"

            def training_epoch_end(self, outputs) -> None:
                # ``training_step`` returns nothing, so no outputs are expected here.
                assert len(outputs) == 0

        model = TestModel()
        model.val_dataloader = None

        trainer_kwargs = dict(
            default_root_dir=tmpdir,
            limit_train_batches=2,
            limit_val_batches=2,
            max_epochs=1,
            log_every_n_steps=1,
            weights_summary=None,
            logger=False,
        )
        Trainer(**trainer_kwargs).fit(model)

    horovod.run(hvd_test_fn, np=2)
def test_multi_gpu_none_backend(tmpdir):
    """Run ``BoringModel`` with ``gpus=2`` and no accelerator/backend specified.

    Original intent, as stated by the author: make sure when using multiple GPUs
    the user can't use `distributed_backend = None`.
    """
    tutils.set_random_master_port()

    # No ``accelerator`` key on purpose: only the GPU count is given.
    options = {
        "default_root_dir": tmpdir,
        "progress_bar_refresh_rate": 0,
        "max_epochs": 1,
        "limit_train_batches": 0.2,
        "limit_val_batches": 0.2,
        "gpus": 2,
    }

    boring_model = BoringModel()
    tpipes.run_model_test(options, boring_model, min_acc=0.20)
def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
    """Run ``EvalModelTemplate`` under ``ddp_spawn`` on GPUs 0 and 1 with a default ``EarlyStopping``.

    NOTE(review): another function with this exact name is defined further down in
    the visible source. If both live in the same module, the later definition
    replaces this one and this test is never collected -- confirm.
    """
    tutils.set_random_master_port()

    early_stopping = EarlyStopping()
    options = {
        "default_root_dir": tmpdir,
        "callbacks": [early_stopping],
        "max_epochs": 50,
        "limit_train_batches": 10,
        "limit_val_batches": 10,
        "gpus": [0, 1],
        "accelerator": "ddp_spawn",
    }

    template_model = EvalModelTemplate()
    tpipes.run_model_test(options, template_model)
def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
    """Run ``ClassificationModel`` under ``ddp_spawn`` on GPUs 0 and 1 with early stopping on ``train_acc``.

    NOTE(review): an earlier function with this exact name is visible in the
    source. If both live in the same module, this definition shadows the earlier
    one -- confirm.
    """
    tutils.set_random_master_port()

    early_stopping = EarlyStopping(monitor='train_acc')
    options = {
        "default_root_dir": tmpdir,
        "callbacks": [early_stopping],
        "max_epochs": 50,
        "limit_train_batches": 10,
        "limit_val_batches": 10,
        "gpus": [0, 1],
        "accelerator": "ddp_spawn",
    }

    datamodule = ClassifDataModule()
    classifier = ClassificationModel()
    tpipes.run_model_test(options, classifier, datamodule)
def run_test_from_config(trainer_options, on_gpu, check_size=True):
    """Trains the default model with the given config.

    Args:
        trainer_options: keyword arguments for ``Trainer``. Must contain
            ``'weights_save_path'``. It is updated in place with a ``callbacks``
            entry holding a ``ModelCheckpoint`` that writes to that path.
        on_gpu: forwarded to ``hpc_load``. When truthy, a fresh
            ``Trainer(gpus=1, accelerator='horovod')`` is also created and its
            ``root_gpu`` is compared against ``hvd.local_rank()``.
        check_size: when truthy, assert that ``hvd.size() == 2`` after fitting.

    Ranks other than 0 return right after the size check.
    """
    set_random_master_port()
    reset_seed()

    save_dir = trainer_options['weights_save_path']
    trainer_options.update(callbacks=[ModelCheckpoint(dirpath=save_dir)])

    model = BoringModel()
    trainer = Trainer(**trainer_options)
    trainer.fit(model)
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

    # Horovod should be initialized following training. If not, this will raise an exception.
    if check_size:
        assert hvd.size() == 2

    # Everything below runs on global rank 0 only.
    if trainer.global_rank > 0:
        return

    # test model loading
    best_checkpoint = trainer.checkpoint_callback.best_model_path
    pretrained_model = BoringModel.load_from_checkpoint(best_checkpoint)

    # test new model accuracy
    loaders = model.test_dataloader()
    loaders = loaders if isinstance(loaders, list) else [loaders]
    for loader in loaders:
        first_batch = next(iter(loader))
        pretrained_model(first_batch)

    connector = trainer.checkpoint_connector

    # test HPC saving
    connector.hpc_save(save_dir, trainer.logger)

    # test HPC loading
    latest_checkpoint = connector.get_max_ckpt_path_from_folder(save_dir)
    connector.hpc_load(latest_checkpoint, on_gpu=on_gpu)

    if on_gpu:
        trainer = Trainer(gpus=1, accelerator='horovod', max_epochs=1)
        # Test the root_gpu property
        assert trainer.root_gpu == hvd.local_rank()
def test_ddp_all_dataloaders_passed_to_fit(tmpdir):
    """Make sure DDP works with dataloaders passed to fit()

    The train and validation dataloaders are taken from ``BoringModel`` itself and
    handed to ``Trainer.fit`` as keyword arguments while running ``ddp_spawn`` on
    GPUs 0 and 1.
    """
    tutils.set_random_master_port()

    model = BoringModel()
    # Build both loaders up front, before the Trainer exists.
    train_loader = model.train_dataloader()
    val_loader = model.val_dataloader()

    trainer_kwargs = {
        "default_root_dir": tmpdir,
        "progress_bar_refresh_rate": 0,
        "max_epochs": 1,
        "limit_train_batches": 0.2,
        "limit_val_batches": 0.2,
        "gpus": [0, 1],
        "accelerator": "ddp_spawn",
    }
    trainer = Trainer(**trainer_kwargs)

    trainer.fit(model, train_dataloader=train_loader, val_dataloaders=val_loader)
    assert trainer.state.finished, "DDP doesn't work with dataloaders passed to fit()."
def test_model_saves_on_multi_gpu(tmpdir):
    """Test that ONNX model saves on a distributed backend.

    ``BoringModel`` is first run through ``tpipes.run_model_test`` with
    ``ddp_spawn`` on GPUs 0 and 1, then exported with ``to_onnx`` into ``tmpdir``;
    the test passes when the exported file exists.
    """
    tutils.set_random_master_port()

    options = {
        "default_root_dir": tmpdir,
        "max_epochs": 1,
        "limit_train_batches": 10,
        "limit_val_batches": 10,
        "gpus": [0, 1],
        "accelerator": "ddp_spawn",
        "progress_bar_refresh_rate": 0,
    }

    boring_model = BoringModel()
    # Set an example input before training and export.
    boring_model.example_input_array = torch.randn(5, 32)

    tpipes.run_model_test(options, boring_model, min_acc=0.08)

    onnx_path = os.path.join(tmpdir, "model.onnx")
    boring_model.to_onnx(onnx_path)
    assert os.path.exists(onnx_path)