def test_multi_gpu_none_backend(tmpdir):
    """Make sure when using multiple GPUs the user can't use `distributed_backend = None`."""
    tutils.set_random_master_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        distributed_backend=None,
        progress_bar_refresh_rate=0,
        max_epochs=1,
        limit_train_batches=0.2,
        limit_val_batches=0.2,
        gpus=2,
    )

    model = EvalModelTemplate()
    tpipes.run_model_test(trainer_options, model)


def test_running_test_pretrained_model_distrib_dp(tmpdir):
    """Verify `test()` on pretrained model."""
    tutils.set_random_master_port()
    model = EvalModelTemplate()

    # exp file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # exp file to get weights
    checkpoint = tutils.init_checkpoint_callback(logger)

    trainer_options = dict(
        progress_bar_refresh_rate=0,
        max_epochs=2,
        limit_train_batches=0.4,
        limit_val_batches=0.2,
        checkpoint_callback=checkpoint,
        logger=logger,
        gpus=[0, 1],
        distributed_backend='dp',
        default_root_dir=tmpdir,
    )

    # fit model
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)

    # correct result and ok accuracy
    assert result == 1, 'training failed to complete'

    pretrained_model = EvalModelTemplate.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # run test set
    new_trainer = Trainer(**trainer_options)
    results = new_trainer.test(pretrained_model)
    pretrained_model.cpu()

    # test we have good test accuracy
    acc = results[0]['test_acc']
    assert acc > 0.5, f"Model failed to get expected 0.5 accuracy. test_acc = {acc}"

    dataloaders = model.test_dataloader()
    if not isinstance(dataloaders, list):
        dataloaders = [dataloaders]

    for dataloader in dataloaders:
        tpipes.run_prediction(dataloader, pretrained_model)


def test_sync_batchnorm_ddp(tmpdir):
    seed_everything(234)
    set_random_master_port()

    # define datamodule and dataloader
    dm = MNISTDataModule()
    dm.prepare_data()
    dm.setup(stage=None)

    train_dataloader = dm.train_dataloader()
    model = SyncBNModule()

    bn_outputs = []

    # shuffle is false by default
    for batch_idx, batch in enumerate(train_dataloader):
        x, _ = batch

        _, out_bn = model.forward(x, batch_idx)
        bn_outputs.append(out_bn)

        # get 3 steps
        if batch_idx == 2:
            break

    bn_outputs = [x.cuda() for x in bn_outputs]

    # reset datamodule
    # batch-size = 16 because 2 GPUs in DDP
    dm = MNISTDataModule(batch_size=16, dist_sampler=True)
    dm.prepare_data()
    dm.setup(stage=None)

    model = SyncBNModule(gpu_count=2, bn_targets=bn_outputs)

    trainer = Trainer(
        gpus=2,
        num_nodes=1,
        distributed_backend='ddp_spawn',
        max_epochs=1,
        max_steps=3,
        sync_batchnorm=True,
        num_sanity_val_steps=0,
        replace_sampler_ddp=False,
    )

    result = trainer.fit(model, dm)
    assert result == 1, "Sync batchnorm failing with DDP"


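# A minimal sketch of the kind of model the test above drives (the real `SyncBNModule`
# is defined alongside the test suite; the layer shape here is an assumption): a single
# batch-norm layer whose forward also returns the raw BN output, so the DDP run can be
# compared against the single-process `bn_outputs` captured above.
from torch import nn
from pytorch_lightning import LightningModule


class _SyncBNModuleSketch(LightningModule):
    def __init__(self):
        super().__init__()
        # hypothetical shape: single-channel MNIST images
        self.bn_layer = nn.BatchNorm2d(1)

    def forward(self, x, batch_idx):
        with torch.no_grad():
            out_bn = self.bn_layer(x)
        return None, out_bn

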
def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
    """Make sure DDP works with early stopping."""
    tutils.set_random_master_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        callbacks=[EarlyStopping()],
        max_epochs=50,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        distributed_backend='ddp_spawn',
    )

    model = EvalModelTemplate()
    tpipes.run_model_test(trainer_options, model)


def test_result_reduce_horovod(enable_pl_optimizer, tmpdir):
    """Make sure result logging works with Horovod.

    This test mirrors tests/core/test_results.py::_ddp_test_fn
    """
    tutils.reset_seed()
    tutils.set_random_master_port()

    def hvd_test_fn():
        path_here = os.path.abspath(os.path.dirname(__file__))
        path_root = os.path.abspath(os.path.join(path_here, '..', '..'))
        sys.path.insert(0, os.path.abspath(path_root))

        class TestModel(BoringModel):
            def training_step(self, batch, batch_idx):
                self.training_step_called = True

                tensor = torch.tensor([1.0])
                self.log("test_tensor", tensor, sync_dist=True, sync_dist_op='sum',
                         on_step=True, on_epoch=True)

                res = self._results

                # Check that `tensor` is summed across all ranks automatically
                assert res["test_tensor"].item() == hvd.size(), \
                    "Result-Log does not work properly with Horovod and Tensors"

            def training_epoch_end(self, outputs) -> None:
                assert len(outputs) == 0

        model = TestModel()
        model.val_dataloader = None

        trainer = Trainer(
            default_root_dir=tmpdir,
            limit_train_batches=2,
            limit_val_batches=2,
            max_epochs=1,
            log_every_n_steps=1,
            weights_summary=None,
            enable_pl_optimizer=enable_pl_optimizer,
        )

        trainer.fit(model)

    horovod.run(hvd_test_fn, np=2)


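# A minimal sketch (not Lightning's internals) of the reduction that
# `sync_dist=True, sync_dist_op='sum'` performs under Horovod: a sum-allreduce
# across all ranks. Note that `hvd.allreduce` averages by default, so `op=hvd.Sum`
# is needed to reproduce the summed value the test above asserts.
def _horovod_sum_reduce_sketch():
    import horovod.torch as hvd

    hvd.init()
    local = torch.tensor([1.0])
    summed = hvd.allreduce(local, op=hvd.Sum)
    # every rank contributed 1.0, so the sum equals the world size
    assert summed.item() == hvd.size()

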
def test_multi_gpu_model_ddp_spawn(tmpdir):
    tutils.set_random_master_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        distributed_backend='ddp_spawn',
        progress_bar_refresh_rate=0,
    )

    model = EvalModelTemplate()
    tpipes.run_model_test(trainer_options, model)

    # test memory helper functions
    memory.get_memory_profile('min_max')


def test_ddp_all_dataloaders_passed_to_fit(tmpdir):
    """Make sure DDP works with dataloaders passed to fit()."""
    tutils.set_random_master_port()

    model = EvalModelTemplate()
    fit_options = dict(
        train_dataloader=model.train_dataloader(),
        val_dataloaders=model.val_dataloader(),
    )

    trainer = Trainer(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=1,
        limit_train_batches=0.2,
        limit_val_batches=0.2,
        gpus=[0, 1],
        distributed_backend='ddp_spawn',
    )
    result = trainer.fit(model, **fit_options)
    assert result == 1, "DDP doesn't work with dataloaders passed to fit()."


def run_test_from_config(trainer_options):
    """Trains the default model with the given config."""
    set_random_master_port()

    ckpt_path = trainer_options['default_root_dir']
    trainer_options.update(checkpoint_callback=ModelCheckpoint(ckpt_path))

    model = EvalModelTemplate()
    run_model_test(trainer_options, model, on_gpu=args.on_gpu, version=0, with_hpc=False)

    # Horovod should be initialized following training. If not, this will raise an exception.
    assert hvd.size() == 2

    if args.on_gpu:
        trainer = Trainer(gpus=1, distributed_backend='horovod', max_epochs=1)
        # Test the root_gpu property
        assert trainer.root_gpu == hvd.local_rank()


def test_multi_cpu_model_ddp(tmpdir):
    """Make sure DDP works."""
    tutils.set_random_master_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=1,
        limit_train_batches=0.4,
        limit_val_batches=0.2,
        gpus=None,
        num_processes=2,
        distributed_backend='ddp_cpu',
    )

    model = EvalModelTemplate()
    tpipes.run_model_test(trainer_options, model, on_gpu=False)


def test_single_gpu_test(tmpdir):
    tutils.set_random_master_port()

    model = EvalModelTemplate()
    trainer = pl.Trainer(
        default_root_dir=os.getcwd(),
        max_epochs=2,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0],
    )
    trainer.fit(model)
    assert 'ckpt' in trainer.checkpoint_callback.best_model_path
    results = trainer.test()
    assert 'test_acc' in results

    results = trainer.test(model)
    assert 'test_acc' in results


def run_test_from_config(trainer_options):
    """Trains the default model with the given config."""
    set_random_master_port()
    reset_seed()

    ckpt_path = trainer_options['weights_save_path']
    trainer_options.update(checkpoint_callback=ModelCheckpoint(ckpt_path))

    model = EvalModelTemplate()

    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result == 1

    # Horovod should be initialized following training. If not, this will raise an exception.
    assert hvd.size() == 2

    if trainer.global_rank > 0:
        # on higher ranks the checkpoint location is unknown
        # we want to test checkpointing on rank 0 only
        assert not hasattr(trainer, 'ckpt_path')
        assert not trainer.checkpoint_callback.best_model_path
        return

    # test model loading
    pretrained_model = EvalModelTemplate.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # test new model accuracy
    test_loaders = model.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    for dataloader in test_loaders:
        run_prediction(dataloader, pretrained_model)

    # test HPC loading / saving
    trainer.hpc_save(ckpt_path, trainer.logger)
    trainer.hpc_load(ckpt_path, on_gpu=args.on_gpu)

    if args.on_gpu:
        trainer = Trainer(gpus=1, distributed_backend='horovod', max_epochs=1)
        # Test the root_gpu property
        assert trainer.root_gpu == hvd.local_rank()


def test_amp_multi_gpu(tmpdir, backend):
    """Make sure DP/DDP + AMP work."""
    tutils.set_random_master_port()

    model = EvalModelTemplate()

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        # gpus=2,
        gpus='0, 1',  # test init with gpu string
        distributed_backend=backend,
        precision=16,
    )

    # tutils.run_model_test(trainer_options, model)
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result


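# Aside: a rough, illustrative sketch of how a GPU string such as '0, 1' becomes
# device indices (Lightning's real parsing lives in its GPU-parsing utilities;
# this helper is hypothetical and only shows the idea the test above exercises).
def _parse_gpu_string_sketch(gpus: str):
    # '0, 1' -> [0, 1]; int() tolerates the surrounding whitespace
    return [int(g) for g in gpus.split(',') if g.strip() != '']

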
def run_test_from_config(trainer_options):
    """Trains the default model with the given config."""
    set_random_master_port()
    reset_seed()

    ckpt_path = trainer_options['weights_save_path']
    trainer_options.update(callbacks=[ModelCheckpoint(dirpath=ckpt_path)])

    model = EvalModelTemplate()

    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result == 1

    # Horovod should be initialized following training. If not, this will raise an exception.
    assert hvd.size() == 2

    if trainer.global_rank > 0:
        return

    # test model loading
    pretrained_model = EvalModelTemplate.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # test new model accuracy
    test_loaders = model.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    for dataloader in test_loaders:
        run_prediction(dataloader, pretrained_model)

    # test HPC saving
    trainer.checkpoint_connector.hpc_save(ckpt_path, trainer.logger)
    # test HPC loading
    checkpoint_path = trainer.checkpoint_connector.get_max_ckpt_path_from_folder(ckpt_path)
    trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=args.on_gpu)

    if args.on_gpu:
        trainer = Trainer(gpus=1, accelerator='horovod', max_epochs=1)
        # Test the root_gpu property
        assert trainer.root_gpu == hvd.local_rank()


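# Note: `run_test_from_config` variants like the one above are driven by an external
# launcher rather than imported by pytest directly. A minimal sketch of the launch,
# assuming a script name `train_default_model.py` and a JSON `--trainer-options` flag
# (both hypothetical here; `horovodrun` itself is Horovod's real CLI):
#
#   horovodrun -np 2 python train_default_model.py --trainer-options '{"max_epochs": 1}'
#
# `-np 2` starts two Horovod processes, which is why the function asserts
# `hvd.size() == 2` after training.

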
def test_model_saves_on_multi_gpu(tmpdir):
    """Test that ONNX model saves on a distributed backend."""
    tutils.set_random_master_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        distributed_backend='ddp_spawn',
        progress_bar_refresh_rate=0,
    )

    model = EvalModelTemplate()
    tpipes.run_model_test(trainer_options, model)

    file_path = os.path.join(tmpdir, "model.onnx")
    model.to_onnx(file_path)
    assert os.path.exists(file_path) is True


def test_multi_gpu_early_stop(tmpdir, backend):
    """Make sure DDP works with early stopping."""
    tutils.set_random_master_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        early_stop_callback=True,
        max_epochs=50,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        distributed_backend=backend,
    )

    model = EvalModelTemplate()
    # tutils.run_model_test(trainer_options, model)
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result


def test_multi_gpu_wandb(tmpdir, backend):
    """Make sure DP/DDP + AMP work with WandbLogger."""
    from pytorch_lightning.loggers import WandbLogger

    tutils.set_random_master_port()

    model = EvalModelTemplate()
    logger = WandbLogger(name='utest')

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        gpus=2,
        distributed_backend=backend,
        precision=16,
        logger=logger,
    )

    # tutils.run_model_test(trainer_options, model)
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result
    trainer.test(model)


def test_model_pickable(tmpdir, metric: Metric):
    """Make sure that metrics are picklable by including them in a model and running in multi-gpu mode."""
    tutils.set_random_master_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=10,
        gpus=[0, 1],
        distributed_backend="ddp_spawn",
    )

    model = EvalModelTemplate()
    model.metric = metric()
    model.training_step = model.training_step__using_metrics

    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)

    # correct result and ok accuracy
    assert result == 1, "ddp model failed to complete"


def test_multi_gpu_model(tmpdir, backend):
    """Make sure DDP works."""
    tutils.set_random_master_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=0.4,
        limit_val_batches=0.2,
        gpus=[0, 1],
        distributed_backend=backend,
    )

    model = EvalModelTemplate()
    # tutils.run_model_test(trainer_options, model)
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result

    # test memory helper functions
    memory.get_memory_profile('min_max')


def test_amp_gpu_ddp_slurm_managed(tmpdir):
    """Make sure DDP + AMP work."""
    # simulate setting slurm flags
    tutils.set_random_master_port()
    os.environ['SLURM_LOCALID'] = str(0)

    model = EvalModelTemplate()

    # exp file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # exp file to get weights
    checkpoint = tutils.init_checkpoint_callback(logger)

    # fit model
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        gpus=[0],
        distributed_backend='ddp_spawn',
        precision=16,
        checkpoint_callback=checkpoint,
        logger=logger,
    )
    trainer.is_slurm_managing_tasks = True
    result = trainer.fit(model)

    # correct result and ok accuracy
    assert result == 1, 'amp + ddp model failed to complete'

    # test root model address
    assert trainer.accelerator_connector.resolve_root_node_address('abc') == 'abc'
    assert trainer.accelerator_connector.resolve_root_node_address('abc[23]') == 'abc23'
    assert trainer.accelerator_connector.resolve_root_node_address('abc[23-24]') == 'abc23'
    assert trainer.accelerator_connector.resolve_root_node_address('abc[23-24, 45-40, 40]') == 'abc23'


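# Illustrative reimplementation (not Lightning's actual code) of what the
# `resolve_root_node_address` assertions above exercise: a SLURM node list such as
# 'abc[23-24, 45-40, 40]' resolves to its first node, 'abc23'.
def _resolve_root_node_address_sketch(root_node: str) -> str:
    if '[' in root_node:
        name, ids = root_node.split('[', maxsplit=1)
        # keep the first id of the first comma-separated group, e.g. '23-24' -> '23'
        first_id = ids.split(',')[0].split('-')[0].strip(']')
        return name + first_id
    return root_node

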
def test_multi_gpu_wandb_ddp_spawn(tmpdir):
    """Make sure ddp_spawn + AMP work with WandbLogger."""
    from pytorch_lightning.loggers import WandbLogger

    tutils.set_random_master_port()

    model = EvalModelTemplate()

    wandb.run = MagicMock()
    wandb.init(name='name', project='project')

    logger = WandbLogger(name='name', offline=True)
    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        gpus=2,
        distributed_backend='ddp_spawn',
        precision=16,
        logger=logger,
    )

    # tutils.run_model_test(trainer_options, model)
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result
    trainer.test(model)


def test_numpy_metric_ddp():
    tutils.reset_seed()
    tutils.set_random_master_port()
    world_size = 2
    mp.spawn(_ddp_test_numpy_metric, args=(world_size,), nprocs=world_size)


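# A minimal sketch of what a spawned worker like `_ddp_test_numpy_metric` typically
# does (the real body lives elsewhere in the metrics tests): each process joins a
# `gloo` process group on the master port chosen above, then verifies that values
# are reduced identically across ranks.
def _ddp_worker_sketch(rank, world_size):
    import torch.distributed as dist

    # MASTER_PORT comes from set_random_master_port(); the address is assumed local
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    dist.init_process_group('gloo', rank=rank, world_size=world_size)

    tensor = torch.tensor([float(rank)])
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
    # ranks contribute 0, 1, ..., world_size - 1
    assert tensor.item() == sum(range(world_size))
    dist.destroy_process_group()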