def test_non_sequential_sampler_warning_is_raised_for_eval_dataloader(val_dl, warns):
    """Reset the validation dataloader and check the shuffling warning expectation.

    When ``warns`` is truthy the reset runs under ``pytest.warns``; otherwise it
    runs under ``no_warning_call``. Both receive the same warning class and
    match pattern.
    """
    trainer = Trainer()
    module = BoringModel()
    trainer._data_connector.attach_data(module, val_dataloaders=val_dl)

    if warns:
        expectation = pytest.warns
    else:
        expectation = no_warning_call

    pattern = "recommended .* turn shuffling off for val/test/predict"
    with expectation(PossibleUserWarning, match=pattern):
        trainer._data_connector._reset_eval_dataloader(RunningStage.VALIDATING, module)
def test_deepspeed_with_meta_device(tmpdir):
    """Fit a model created under ``init_meta_context`` with DeepSpeed stage 3.

    The layer weight must report the ``meta`` device right after construction
    and the ``cpu`` device once ``trainer.fit`` has returned.
    """
    with init_meta_context():
        module = BoringModel()
    device_before_fit = module.layer.weight.device
    assert device_before_fit.type == "meta"

    deepspeed_plugin = DeepSpeedPlugin(stage=3)
    trainer = Trainer(
        default_root_dir=tmpdir,
        plugins=[deepspeed_plugin],
        gpus=2,
        fast_dev_run=True,
        precision=16,
    )
    trainer.fit(module)

    device_after_fit = module.layer.weight.device
    assert device_after_fit.type == "cpu"
def test_deepspeed_gradient_clip_by_value(tmpdir):
    """Fitting with DeepSpeed and ``gradient_clip_algorithm="value"`` must raise."""
    module = BoringModel()
    trainer = Trainer(default_root_dir=tmpdir, gpus=1, strategy="deepspeed", gradient_clip_algorithm="value")

    expected_message = "does not support clipping gradients by value"
    with pytest.raises(MisconfigurationException, match=expected_message):
        trainer.fit(module)
def test_if_test_works_after_train(tmpdir):
    """Run ``fit`` and then ``test`` on 8 TPU cores; ``test`` must return one entry."""
    module = BoringModel()
    trainer = Trainer(
        max_epochs=1,
        tpu_cores=8,
        default_root_dir=tmpdir,
        fast_dev_run=True,
    )

    # Train first, then evaluate the same module with the same trainer.
    trainer.fit(module)
    test_results = trainer.test(module)

    assert len(test_results) == 1
def test_overfit_basic(tmpdir, overfit):
    """Smoke-test a one-epoch fit with ``overfit_batches`` set to ``overfit``."""
    module = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        overfit_batches=overfit,
        weights_summary=None,
    )
    # No assertions: the test passes as long as fitting completes without raising.
    trainer.fit(module)
def test_replication_factor(tmpdir):
    """Check the IPU plugin's ``replication_factor`` with and without custom poptorch options.

    Without user options the factor is expected to equal the ``ipus`` argument (2).
    With explicit ``training_opts`` / ``inference_opts`` the factor is expected to
    be 8 while the trainer stage is TRAINING and 7 for the validating, testing and
    predicting stages.
    """
    # Default case: no poptorch options supplied by the user.
    plugin = IPUPlugin()
    trainer = Trainer(ipus=2, default_root_dir=tmpdir, fast_dev_run=True, strategy=plugin)
    assert trainer.ipus == 2
    assert trainer.training_type_plugin.replication_factor == 2

    # Custom case: distinct replication factors for training and inference.
    model = BoringModel()
    training_opts = poptorch.Options()
    inference_opts = poptorch.Options()
    training_opts.replicationFactor(8)
    inference_opts.replicationFactor(7)
    plugin = IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
    trainer = Trainer(default_root_dir=tmpdir, ipus=1, strategy=plugin)

    # Wire trainer, plugin and model together by hand instead of calling ``fit``.
    trainer.optimizers = model.configure_optimizers()[0]
    plugin.model = model
    model.trainer = trainer
    trainer.state.fn = TrainerFn.FITTING
    trainer.training_type_plugin.pre_dispatch()

    # While fitting, the reported factor depends on the current running stage.
    trainer.state.stage = RunningStage.TRAINING
    assert trainer.training_type_plugin.replication_factor == 8
    trainer.state.stage = RunningStage.VALIDATING
    assert trainer.training_type_plugin.replication_factor == 7

    # Every evaluation-style entry point is expected to use the inference factor.
    for fn, stage in (
        (TrainerFn.VALIDATING, RunningStage.VALIDATING),
        (TrainerFn.TESTING, RunningStage.TESTING),
        (TrainerFn.PREDICTING, RunningStage.PREDICTING),
    ):
        trainer.state.fn = fn
        trainer.state.stage = stage
        trainer.training_type_plugin.pre_dispatch()
        assert trainer.training_type_plugin.replication_factor == 7
def _assert_save_model_is_equal(model, tmpdir, trainer):
    """Save a checkpoint and, on global rank 0, compare reloaded weights to ``model``.

    Both models are converted to float (and ``model`` is moved to CPU) before the
    parameters are compared pairwise with ``torch.equal``.
    """
    ckpt_file = os.path.join(tmpdir, 'model.pt')
    trainer.save_checkpoint(ckpt_file)

    # carry out the check only on rank 0
    if trainer.global_rank != 0:
        return

    reloaded = BoringModel.load_from_checkpoint(ckpt_file).float()
    model = model.float().cpu()

    # Assert model parameters are identical after loading
    param_pairs = zip(model.parameters(), reloaded.parameters())
    for expected, actual in param_pairs:
        assert torch.equal(expected, actual)
def test_ddp_barrier_non_consecutive_device_ids(barrier_mock, tmpdir):
    """Check the barrier is called with this rank's device id for GPUs ``[1, 3]``.

    The device ids neither start at 0 nor are consecutive, so the expected id is
    looked up by ``trainer.local_rank``.
    """
    module = BoringModel()
    device_ids = [1, 3]
    trainer = Trainer(default_root_dir=tmpdir, max_steps=1, gpus=device_ids, accelerator="ddp")
    trainer.fit(module)

    own_device = device_ids[trainer.local_rank]
    barrier_mock.assert_any_call(device_ids=[own_device])
def test_different_accumulate_grad_batches_fails(tmpdir):
    """A dict-valued ``accumulate_grad_batches`` combined with DeepSpeed must raise on fit."""
    module = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        accumulate_grad_batches={1: 2},
        gpus=1,
        strategy="deepspeed",
    )

    expected_message = "DeepSpeed currently does not support different `accumulate_grad_batches`"
    with pytest.raises(MisconfigurationException, match=expected_message):
        trainer.fit(module)
def test_ddp_sharded_strategy_fit_ckpt_path_gpu_to_cpu(tmpdir):
    """Resume a CPU run from a checkpoint written by a GPU run (GPU -> CPU)."""
    # Stage 1: fit on one GPU and persist a checkpoint.
    gpu_module = BoringModel()
    gpu_trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="gpu", devices=1, fast_dev_run=True)
    gpu_trainer.fit(gpu_module)

    ckpt_file = os.path.join(tmpdir, "model.pt")
    gpu_trainer.save_checkpoint(ckpt_file)

    # Stage 2: fit a fresh model on two CPU devices, resuming from that checkpoint.
    cpu_module = BoringModel()
    cpu_trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True)
    cpu_trainer.fit(cpu_module, ckpt_path=ckpt_file)
def test_tqdm_progress_bar_progress_refresh(tmpdir, refresh_rate: int):
    """Count batch-end hook calls on a TQDM bar across fit, validate and test.

    The counts are compared against the totals reported by the bar's own
    main / validation / test progress bars for the given ``refresh_rate``.
    """
    model = BoringModel()

    class CountingProgressBar(TQDMProgressBar):
        """TQDM bar that tallies how many batch-end hooks ran for each stage."""

        train_batches_seen = 0
        val_batches_seen = 0
        test_batches_seen = 0

        def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
            super().on_train_batch_end(trainer, pl_module, outputs, batch, batch_idx)
            self.train_batches_seen = self.train_batches_seen + 1

        def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
            super().on_validation_batch_end(trainer, pl_module, outputs, batch, batch_idx, dataloader_idx)
            self.val_batches_seen = self.val_batches_seen + 1

        def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
            super().on_test_batch_end(trainer, pl_module, outputs, batch, batch_idx, dataloader_idx)
            self.test_batches_seen = self.test_batches_seen + 1

    pbar = CountingProgressBar(refresh_rate=refresh_rate)
    trainer = Trainer(
        default_root_dir=tmpdir,
        callbacks=[pbar],
        limit_train_batches=1.0,
        num_sanity_val_steps=2,
        max_epochs=3,
    )
    assert trainer.progress_bar_callback.refresh_rate == refresh_rate

    def seen_train_and_val():
        # Combined number of train and validation batch-end hooks observed so far.
        return pbar.train_batches_seen + pbar.val_batches_seen

    # After fitting: three epochs of the main bar plus the sanity-check steps.
    trainer.fit(model)
    expected_after_fit = 3 * pbar.main_progress_bar.total + trainer.num_sanity_val_steps
    assert seen_train_and_val() == expected_after_fit
    assert pbar.test_batches_seen == 0

    # After a standalone validation run: one extra pass of the validation bar.
    trainer.validate(model)
    expected_after_validate = (
        3 * pbar.main_progress_bar.total + pbar.val_progress_bar.total + trainer.num_sanity_val_steps
    )
    assert seen_train_and_val() == expected_after_validate
    assert pbar.test_batches_seen == 0

    # After testing: train/val counts are unchanged and the test counter is filled.
    trainer.test(model)
    expected_after_test = (
        3 * pbar.main_progress_bar.total + pbar.val_progress_bar.total + trainer.num_sanity_val_steps
    )
    assert seen_train_and_val() == expected_after_test
    assert pbar.test_batches_seen == pbar.test_progress_bar.total
def test_if_test_works_after_train(tmpdir):
    """Run ``fit`` and then ``test()`` on 8 TPU cores; ``test()`` must compare equal to 1."""
    module = BoringModel()
    trainer = Trainer(
        checkpoint_callback=True,
        max_epochs=1,
        tpu_cores=8,
        default_root_dir=tmpdir,
    )

    # Train first; ``test`` is then called without passing the model explicitly.
    trainer.fit(module)
    outcome = trainer.test()

    assert outcome == 1
def test_invalid_apex_sharded(tmpdir):
    """Combining the sharded accelerator with the apex AMP backend must raise."""
    module = BoringModel()
    expected_message = "Sharded Plugin is not supported with Apex AMP"

    # Both construction and fit stay inside the context: either may raise.
    with pytest.raises(MisconfigurationException, match=expected_message):
        sharded_trainer = Trainer(
            fast_dev_run=True,
            accelerator="ddp_sharded_spawn",
            precision=16,
            amp_backend="apex",
        )
        sharded_trainer.fit(module)
def test_deepspeed_warn_train_dataloader_called(tmpdir):
    """Fitting with a default ``DeepSpeedPlugin`` must emit the batch-size inference warning."""
    module = BoringModel()
    deepspeed_plugin = DeepSpeedPlugin()
    trainer = Trainer(default_root_dir=tmpdir, plugins=[deepspeed_plugin], gpus=1, fast_dev_run=True)

    expected_warning = "Inferring the batch size for internal deepspeed logging"
    with pytest.warns(UserWarning, match=expected_warning):
        trainer.fit(module)
def test_ort_callback_fails_no_model(tmpdir):
    """Fitting a ``BoringModel`` with ``ORTCallback`` must raise ``MisconfigurationException``."""
    module = BoringModel()
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, callbacks=ORTCallback())

    expected_message = "Torch ORT requires to wrap a single model"
    with pytest.raises(MisconfigurationException, match=expected_message):
        # Loaders are built inside the context, exactly where the fit call happens.
        train_loader = torch.utils.data.DataLoader(DummyDataset())
        val_loader = torch.utils.data.DataLoader(DummyDataset())
        trainer.fit(module, train_dataloader=train_loader, val_dataloaders=val_loader)
def test_tuner_with_distributed_strategies():
    """``trainer.tune`` with batch-size scaling on a 2-device DDP trainer must raise."""
    trainer = Trainer(
        auto_scale_batch_size=True,
        devices=2,
        strategy="ddp",
        accelerator="cpu",
    )
    module = BoringModel()

    expected_message = r"not supported with `Trainer\(strategy='ddp'\)`"
    with pytest.raises(MisconfigurationException, match=expected_message):
        trainer.tune(module)
def test_tqdm_progress_bar_can_be_pickled():
    """Pickle a ``TQDMProgressBar`` before any run and after each of fit, test and predict."""
    progress_bar = TQDMProgressBar()
    trainer = Trainer(fast_dev_run=True, callbacks=[progress_bar], max_steps=1)
    module = BoringModel()

    # The bar must be picklable before the trainer has run anything.
    pickle.dumps(progress_bar)

    # Look each entry point up only when it is about to run, then pickle again.
    for entry_point in ("fit", "test", "predict"):
        getattr(trainer, entry_point)(module)
        pickle.dumps(progress_bar)
def test_has_len_all_rank():
    """``has_len_all_ranks`` must raise for an empty dataloader and be truthy for a non-empty one."""
    trainer = Trainer(fast_dev_run=True)
    module = BoringModel()

    expected_message = "Total length of `Dataloader` across ranks is zero."
    with pytest.raises(MisconfigurationException, match=expected_message):
        empty_loader = DataLoader(RandomDataset(0, 0))
        assert not has_len_all_ranks(empty_loader, trainer.strategy, module)

    single_item_loader = DataLoader(RandomDataset(1, 1))
    assert has_len_all_ranks(single_item_loader, trainer.strategy, module)
def test_invalid_deepspeed_defaults_no_precision(tmpdir):
    """Fitting with the default DeepSpeed plugin and no ``precision=16`` must raise."""
    module = BoringModel()
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, plugins='deepspeed')

    expected_message = 'To use DeepSpeed ZeRO Optimization, you must set precision=16.'
    with pytest.raises(MisconfigurationException, match=expected_message):
        trainer.fit(module)
def test_ddp_sharded_plugin_test_multigpu(tmpdir):
    """Call ``trainer.test`` on two GPUs with the sharded accelerator without fitting first."""
    module = BoringModel()
    sharded_trainer = Trainer(accelerator='ddp_sharded_spawn', gpus=2, fast_dev_run=True)
    # No assertions: the test passes as long as testing completes without raising.
    sharded_trainer.test(module)
def train():
    """Fit a ``BoringModel`` in a fast dev run with native 16-bit AMP.

    ``gpus``, ``num_processes``, ``ddp_backend`` and ``CB`` are not defined here;
    they are resolved from the surrounding scope when this function runs.
    """
    module = BoringModel()
    trainer_options = dict(
        fast_dev_run=True,
        precision=16,
        amp_backend='native',
        gpus=gpus,
        num_processes=num_processes,
        accelerator=ddp_backend,
        callbacks=[CB()],
    )
    Trainer(**trainer_options).fit(module)
def test_ddp_sharded_plugin_checkpoint_multi_gpu(tmpdir):
    """Fit on two GPUs with the sharded accelerator and verify the saved checkpoint.

    The checkpoint is reloaded into a new ``BoringModel`` and each parameter is
    compared with the trained model's parameter after moving the latter to CPU.
    """
    module = BoringModel()
    trainer = Trainer(gpus=2, accelerator='ddp_sharded_spawn', fast_dev_run=True)
    trainer.fit(module)

    ckpt_file = os.path.join(tmpdir, 'model.pt')
    trainer.save_checkpoint(ckpt_file)
    reloaded = BoringModel.load_from_checkpoint(ckpt_file)

    # Assert model parameters are identical after loading
    param_pairs = zip(module.parameters(), reloaded.parameters())
    for trained_param, reloaded_param in param_pairs:
        trained_on_cpu = trained_param.to("cpu")
        assert torch.equal(trained_on_cpu, reloaded_param)
def test_progress_bar_totals(tmpdir):
    """Check progress bar totals and final positions after fit, validate and test."""
    module = BoringModel()
    trainer = Trainer(default_root_dir=tmpdir, progress_bar_refresh_rate=1, max_epochs=1)
    progress = trainer.progress_bar_callback

    # Nothing has run yet, so every total starts at zero.
    assert progress.total_train_batches == 0
    assert progress.total_val_batches == 0
    assert progress.total_test_batches == 0

    trainer.fit(module)

    # check main progress bar total
    train_total = progress.total_train_batches
    val_total = progress.total_val_batches
    assert len(trainer.train_dataloader) == train_total
    assert progress.main_progress_bar.total == train_total + val_total

    # check val progress bar total
    val_loader_lengths = (len(loader) for loader in trainer.val_dataloaders)
    assert sum(val_loader_lengths) == val_total
    assert progress.val_progress_bar.total == val_total

    # main progress bar should have reached the end (train batches + val batches)
    assert progress.main_progress_bar.n == train_total + val_total
    assert progress.train_batch_idx == train_total

    # val progress bar should have reached the end
    assert progress.val_progress_bar.n == val_total
    assert progress.val_batch_idx == val_total

    # check that the test progress bar is off
    assert progress.total_test_batches == 0
    assert progress.test_progress_bar is None

    # A standalone validation run must end at the same validation totals.
    trainer.validate(module)
    assert progress.val_progress_bar.total == val_total
    assert progress.val_progress_bar.n == val_total
    assert progress.val_batch_idx == val_total

    trainer.test(module)

    # check test progress bar total
    test_total = progress.total_test_batches
    test_loader_lengths = (len(loader) for loader in trainer.test_dataloaders)
    assert sum(test_loader_lengths) == test_total
    assert progress.test_progress_bar.total == test_total

    # test progress bar should have reached the end
    assert progress.test_progress_bar.n == test_total
    assert progress.test_batch_idx == test_total
def test_poptorch_models_at_different_stages(tmpdir):
    """Check which stages appear in ``poptorch_models`` after ``setup`` for each trainer fn.

    For fitting the keys must be TRAINING and VALIDATING; for validating, testing
    and predicting the only key must be the matching running stage.
    """
    ipu_strategy = IPUStrategy()
    trainer = Trainer(default_root_dir=tmpdir, strategy=ipu_strategy, accelerator="ipu", devices=8)
    module = BoringModel()

    # Wire trainer, strategy and model together by hand instead of calling ``fit``.
    module.trainer = trainer
    ipu_strategy.model = module
    trainer.optimizers = module.configure_optimizers()[0]

    trainer.state.fn = TrainerFn.FITTING
    trainer.strategy.setup(trainer)
    fitting_stages = [RunningStage.TRAINING, RunningStage.VALIDATING]
    assert list(trainer.strategy.poptorch_models) == fitting_stages

    evaluation_pairs = (
        (TrainerFn.VALIDATING, RunningStage.VALIDATING),
        (TrainerFn.TESTING, RunningStage.TESTING),
        (TrainerFn.PREDICTING, RunningStage.PREDICTING),
    )
    for trainer_fn, running_stage in evaluation_pairs:
        trainer.state.fn = trainer_fn
        trainer.state.stage = running_stage
        trainer.strategy.setup(trainer)
        assert list(trainer.strategy.poptorch_models) == [running_stage]
def test_deepspeed_stage_3_save_warning(tmpdir):
    """Saving a checkpoint after a DeepSpeed stage 3 fit must emit the sharded-save warning."""
    module = BoringModel()
    stage_3_plugin = DeepSpeedPlugin(stage=3)
    trainer = Trainer(
        default_root_dir=tmpdir,
        plugins=[stage_3_plugin],
        gpus=2,
        fast_dev_run=True,
        precision=16,
    )
    trainer.fit(module)

    ckpt_file = os.path.join(tmpdir, "model.pt")
    expected_warning = "each worker will save a shard of the checkpoint within a directory."
    with pytest.warns(UserWarning, match=expected_warning):
        trainer.save_checkpoint(ckpt_file)
def _assert_save_model_is_equal(model, tmpdir, trainer):
    """Save a checkpoint and, on global rank 0, compare reloaded weights to ``model``.

    If ``model`` is in half precision the reloaded model is cast to half as well;
    ``model`` is moved to CPU before parameters are compared with ``torch.equal``.
    """
    ckpt_file = os.path.join(tmpdir, 'model.pt')
    trainer.save_checkpoint(ckpt_file)

    # carry out the check only on rank 0
    if trainer.global_rank != 0:
        return

    reloaded = BoringModel.load_from_checkpoint(ckpt_file)
    model_is_half = model.dtype == torch.half
    if model_is_half:
        # model is loaded in float32 as default, move it to float16
        reloaded = reloaded.half()
    model = model.cpu()

    # Assert model parameters are identical after loading
    param_pairs = zip(model.parameters(), reloaded.parameters())
    for expected, actual in param_pairs:
        assert torch.equal(expected, actual)
def test_has_len_all_rank():
    """``has_len_all_ranks`` must warn for an empty dataloader and be truthy in both cases."""
    trainer = Trainer(fast_dev_run=True)
    module = BoringModel()

    expected_warning = "Total length of `DataLoader` across ranks is zero."
    with pytest.warns(UserWarning, match=expected_warning):
        empty_loader = DataLoader(RandomDataset(0, 0))
        assert has_len_all_ranks(empty_loader, trainer.strategy, module)

    single_item_loader = DataLoader(RandomDataset(1, 1))
    assert has_len_all_ranks(single_item_loader, trainer.strategy, module)
def test_deepspeed_multigpu(tmpdir):
    """Fit and test with DeepSpeed stage 3 on two GPUs, then verify the saved checkpoint."""
    module = BoringModel()
    stage_3_plugin = DeepSpeedPlugin(stage=3)
    trainer = Trainer(
        default_root_dir=tmpdir,
        plugins=[stage_3_plugin],
        gpus=2,
        fast_dev_run=True,
        precision=16,
    )

    trainer.fit(module)
    trainer.test(module)

    # Delegates the save / reload / compare check to the shared helper.
    _assert_save_model_is_equal(module, tmpdir, trainer)
def test_checkpoint_plugin_called(tmpdir):
    """Ensure a custom checkpoint IO plugin is called when saving and loading.

    The plugin is wrapped in a ``MagicMock`` so its calls can be counted. The same
    scenario is run twice: once with the plugin passed as ``checkpoint_io`` of a
    ``SingleDevicePlugin`` and once with it listed next to the plugin in ``plugins``.
    """
    checkpoint_plugin = CustomCheckpointIO()
    # Wrap the real plugin so calls pass through to it but are also recorded.
    checkpoint_plugin = MagicMock(wraps=checkpoint_plugin, spec=CustomCheckpointIO)

    # Scenario 1: plugin handed over through ``SingleDevicePlugin(checkpoint_io=...)``.
    ck = ModelCheckpoint(dirpath=tmpdir, save_last=True)

    model = BoringModel()
    device = torch.device("cpu")
    trainer = Trainer(
        default_root_dir=tmpdir,
        plugins=SingleDevicePlugin(device, checkpoint_io=checkpoint_plugin),
        callbacks=ck,
        max_epochs=1,
    )
    trainer.fit(model)
    # This one-epoch fit with ``save_last=True`` is expected to save three times.
    assert checkpoint_plugin.save_checkpoint.call_count == 3

    trainer.test(model, ckpt_path=ck.last_model_path)
    checkpoint_plugin.load_checkpoint.assert_called_with(tmpdir / "last.ckpt")

    # Clear the recorded calls so the second scenario is counted from zero.
    checkpoint_plugin.reset_mock()

    # Scenario 2: plugin passed alongside the training plugin in a list.
    ck = ModelCheckpoint(dirpath=tmpdir, save_last=True)

    model = BoringModel()
    device = torch.device("cpu")
    trainer = Trainer(
        default_root_dir=tmpdir,
        plugins=[SingleDevicePlugin(device), checkpoint_plugin],
        callbacks=ck,
        max_epochs=1,
    )
    trainer.fit(model)
    assert checkpoint_plugin.save_checkpoint.call_count == 3

    trainer.test(model, ckpt_path=ck.last_model_path)
    checkpoint_plugin.load_checkpoint.assert_called_once()
    checkpoint_plugin.load_checkpoint.assert_called_with(tmpdir / "last.ckpt")
def test_deepspeed_lightning_module(tmpdir):
    """Check that dtype changes on ``LightningDeepSpeedModule`` reach the wrapped model.

    After ``half()`` and after ``to(torch.double)`` both the wrapper and the
    inner ``BoringModel`` must report the new dtype.
    """
    inner_model = BoringModel()
    wrapper = LightningDeepSpeedModule(inner_model, precision=16)

    wrapper.half()
    for candidate in (wrapper, inner_model):
        assert candidate.dtype == torch.half

    wrapper.to(torch.double)
    for candidate in (wrapper, inner_model):
        assert candidate.dtype == torch.double