def test_verbose_param(tmpdir, capsys):
    """Test that ONNX export with ``verbose=True`` prints the graph to stdout."""
    onnx_path = os.path.join(tmpdir, "model.onnx")

    model = BoringModel()
    model.example_input_array = torch.randn(5, 32)
    model.to_onnx(onnx_path, verbose=True)

    # The verbose export is expected to emit a textual graph starting with "graph(%".
    stdout = capsys.readouterr().out
    assert "graph(%" in stdout
def test_torchscript_device(device):
    """Test that the scripted module and its output live on the requested device."""
    model = BoringModel().to(device)
    model.example_input_array = torch.randn(5, 32)

    scripted = model.to_torchscript()
    first_param = next(scripted.parameters())
    assert first_param.device == device

    inputs_on_device = model.example_input_array.to(device)
    result = scripted(inputs_on_device)
    assert result.device == device
def test_model_saves_with_example_input_array(tmpdir):
    """Test that ONNX export driven by ``example_input_array`` writes a non-trivial file.

    The exported file must exist and be larger than 400 bytes.
    """
    min_size_bytes = 4e2

    model = BoringModel()
    model.example_input_array = torch.randn(5, 32)

    onnx_path = os.path.join(tmpdir, "model.onnx")
    model.to_onnx(onnx_path)

    assert os.path.exists(onnx_path)
    assert os.path.getsize(onnx_path) > min_size_bytes
def test_torchscript_with_no_input(tmpdir):
    """Test that tracing without any example input raises a ``ValueError``."""
    model = BoringModel()
    model.example_input_array = None

    expected_error = 'requires either `example_inputs` or `model.example_input_array`'
    with pytest.raises(ValueError, match=expected_error):
        model.to_torchscript(method='trace')
def test_error_if_no_input(tmpdir):
    """Test that ONNX export without any input sample raises a ``ValueError``."""
    model = BoringModel()
    model.example_input_array = None
    onnx_path = os.path.join(tmpdir, "model.onnx")

    expected_error = (
        r'Could not export to ONNX since neither `input_sample` nor'
        r' `model.example_input_array` attribute is set.'
    )
    with pytest.raises(ValueError, match=expected_error):
        model.to_onnx(onnx_path)
def test_model_saves_with_input_sample(tmpdir):
    """Test that a fitted model exports to ONNX from an explicit input sample.

    The exported file must exist and be larger than 400 bytes.
    """
    model = BoringModel()
    # Root the trainer in the per-test tmpdir so logs/checkpoints produced by
    # `fit` do not leak into the current working directory.
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1)
    trainer.fit(model)

    file_path = os.path.join(tmpdir, "model.onnx")
    input_sample = torch.randn((1, 32))
    model.to_onnx(file_path, input_sample)

    assert os.path.isfile(file_path)
    assert os.path.getsize(file_path) > 4e2
def test_model_saves_on_gpu(tmpdir):
    """Test that a model fitted with ``gpus=1`` exports to ONNX.

    The exported file must exist and be larger than 400 bytes.
    """
    model = BoringModel()
    # Root the trainer in the per-test tmpdir so logs/checkpoints produced by
    # `fit` do not leak into the current working directory.
    trainer = Trainer(default_root_dir=tmpdir, gpus=1, max_epochs=1)
    trainer.fit(model)

    file_path = os.path.join(tmpdir, "model.onnx")
    input_sample = torch.randn((1, 32))
    model.to_onnx(file_path, input_sample)

    assert os.path.isfile(file_path)
    assert os.path.getsize(file_path) > 4e2
def test_torchscript_input_output_trace():
    """Test that a traced LightningModule reproduces the eager forward output."""
    model = BoringModel()
    example_inputs = torch.randn(1, 32)

    traced = model.to_torchscript(example_inputs=example_inputs, method='trace')
    assert isinstance(traced, torch.jit.ScriptModule)

    # Reference output from the eager model in eval mode.
    model.eval()
    with torch.no_grad():
        expected = model(example_inputs)

    actual = traced(example_inputs)
    assert torch.allclose(actual, expected)
def test_skip_on_fast_dev_run_tuner(tmpdir, tuner_alg):
    """Test that the selected tuner algorithm is skipped with a warning under fast dev run."""
    model = BoringModel()
    model.lr = 0.1  # avoid no-lr-found exception

    # Enable exactly the tuner named by `tuner_alg`.
    scale_batch_size = tuner_alg == 'batch size scaler'
    find_lr = tuner_alg == 'learning rate finder'

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        auto_scale_batch_size=scale_batch_size,
        auto_lr_find=find_lr,
        fast_dev_run=True,
    )

    expected_message = f'Skipping {tuner_alg} since fast_dev_run is enabled.'
    with pytest.warns(UserWarning, match=expected_message):
        trainer.tune(model)
def test_checkpoint_repeated_strategy_tmpdir(tmpdir):
    """Test repeated fit/test cycles resumed from a checkpoint written into ``tmpdir``.

    After every cycle ``tmpdir`` must still hold exactly one checkpoint plus the
    ``lightning_logs`` directory, and each cycle must add one log version.
    """
    # NOTE(review): this env var is set process-wide and is not restored afterwards.
    os.environ['PL_DEV_DEBUG'] = '1'

    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss',
        filepath=os.path.join(tmpdir, "{epoch:02d}"),
    )

    class ExtendedBoringModel(BoringModel):

        def validation_step(self, batch, batch_idx):
            prediction = self.layer(batch)
            return {"val_loss": self.loss(batch, prediction)}

    # Arguments shared by the initial trainer and every resumed trainer.
    common_kwargs = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=2,
        limit_test_batches=2,
    )

    model = ExtendedBoringModel()
    model.validation_step_end = None
    model.validation_epoch_end = None

    trainer = Trainer(callbacks=[checkpoint_callback], **common_kwargs)
    trainer.fit(model)

    expected_root_entries = sorted(['epoch=00.ckpt', 'lightning_logs'])
    assert sorted(os.listdir(tmpdir)) == expected_root_entries

    path_to_lightning_logs = osp.join(tmpdir, 'lightning_logs')
    assert sorted(os.listdir(path_to_lightning_logs)) == sorted(['version_0'])

    def get_last_checkpoint():
        # Map the epoch number parsed from each "epoch=NN.ckpt" name to its path.
        epoch_to_path = {}
        for entry in os.listdir(tmpdir):
            if "epoch" not in entry:
                continue
            epoch = int(entry.split("=")[1].split('.')[0])
            epoch_to_path[epoch] = osp.join(tmpdir, entry)
        return epoch_to_path[len(epoch_to_path) - 1]

    for idx in range(1, 5):
        # load from checkpoint
        chk = get_last_checkpoint()
        model = BoringModel.load_from_checkpoint(chk)
        trainer = pl.Trainer(resume_from_checkpoint=chk, **common_kwargs)
        trainer.fit(model)
        trainer.test(model)

        assert sorted(os.listdir(tmpdir)) == sorted(['epoch=00.ckpt', 'lightning_logs'])
        expected_versions = sorted([f'version_{i}' for i in range(idx + 1)])
        assert sorted(os.listdir(path_to_lightning_logs)) == expected_versions
def test_model_checkpoint_file_already_exists(tmpdir, max_epochs, save_top_k, expected):
    """Test the checkpoint file names and stored epochs when the target file name repeats.

    The files left in ``tmpdir`` must match ``expected`` and the epochs stored in
    them must be the last ``save_top_k`` epochs of the run.
    """
    model_checkpoint = ModelCheckpoint(
        dirpath=tmpdir,
        filename='curr_epoch',
        save_top_k=save_top_k,
        monitor='epoch',
        mode='max',
    )
    trainer_kwargs = dict(
        default_root_dir=tmpdir,
        callbacks=[model_checkpoint],
        max_epochs=max_epochs,
        limit_train_batches=2,
        limit_val_batches=2,
        logger=None,
        weights_summary=None,
        progress_bar_refresh_rate=0,
    )
    trainer = Trainer(**trainer_kwargs)

    model = BoringModel()
    trainer.fit(model)

    ckpt_files = os.listdir(tmpdir)
    assert set(ckpt_files) == set(expected)

    # The stored 'epoch' value is compared after subtracting one.
    stored_epochs = []
    for name in ckpt_files:
        checkpoint = pl_load(os.path.join(tmpdir, name))
        stored_epochs.append(checkpoint['epoch'] - 1)

    expected_epochs = list(range(max_epochs - save_top_k, max_epochs))
    assert sorted(stored_epochs) == expected_epochs
def test_if_inference_output_is_valid(tmpdir):
    """Test that the output inferred from the ONNX model matches the PyTorch output."""
    model = BoringModel()
    model.example_input_array = torch.randn(5, 32)

    # Root the trainer in the per-test tmpdir so logs/checkpoints produced by
    # `fit` do not leak into the current working directory.
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=2)
    trainer.fit(model)

    # Reference prediction from the eager PyTorch model.
    model.eval()
    with torch.no_grad():
        torch_out = model(model.example_input_array)

    file_path = os.path.join(tmpdir, "model.onnx")
    model.to_onnx(file_path, model.example_input_array, export_params=True)

    ort_session = onnxruntime.InferenceSession(file_path)

    def to_numpy(tensor):
        # Tensors that require grad must be detached before conversion.
        if tensor.requires_grad:
            tensor = tensor.detach()
        return tensor.cpu().numpy()

    # compute ONNX Runtime output prediction
    input_name = ort_session.get_inputs()[0].name
    ort_inputs = {input_name: to_numpy(model.example_input_array)}
    ort_outs = ort_session.run(None, ort_inputs)

    # compare ONNX Runtime and PyTorch results
    assert np.allclose(to_numpy(torch_out), ort_outs[0], rtol=1e-03, atol=1e-05)
def test_torchscript_retain_training_state():
    """Test that torchscript export does not alter the training mode of the original model.

    The export is run once from a model in train mode and once from a model in
    eval mode. After each export the source model must keep its mode and the
    freshly exported script must not be in training mode.
    """
    model = BoringModel()

    model.train(True)
    script = model.to_torchscript()
    assert model.training
    assert not script.training

    model.train(False)
    # Re-bind `script` so the assertion below checks the module produced by
    # *this* export rather than the stale one from the train-mode export.
    script = model.to_torchscript()
    assert not model.training
    assert not script.training
def test_lr_finder_fails_fast_on_bad_config(tmpdir):
    """Test that tune fails if the model does not have a lr BEFORE running lr find."""
    # note: this did not raise an exception before #5638 because lr_find is skipped
    # during fast_dev_run and the lr attribute check was done after lr_find
    trainer_kwargs = dict(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        auto_lr_find=True,
    )
    trainer = Trainer(**trainer_kwargs)

    expected_error = 'should have one of these fields'
    with pytest.raises(MisconfigurationException, match=expected_error):
        model = BoringModel()
        trainer.tune(model)
def test_default_checkpoint_freq(save_mock, tmpdir, epochs, val_check_interval, expected):
    """Test how many times ``save_mock`` is called during a fit with default checkpointing."""
    model = BoringModel()
    trainer_kwargs = dict(
        default_root_dir=tmpdir,
        max_epochs=epochs,
        weights_summary=None,
        val_check_interval=val_check_interval,
    )
    trainer = Trainer(**trainer_kwargs)
    trainer.fit(model)

    # the mocked save must have been invoked the expected number of times
    observed_calls = save_mock.call_count
    assert observed_calls == expected
def test_checkpoint_repeated_strategy(enable_pl_optimizer, tmpdir):
    """Test repeated fit/test cycles resumed from a checkpoint saved via the callbacks list.

    ``tmpdir`` must contain exactly the single ``epoch=00.ckpt`` file after the
    first fit and after every resumed cycle.
    """
    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss',
        dirpath=tmpdir,
        filename="{epoch:02d}",
    )

    class ExtendedBoringModel(BoringModel):

        def validation_step(self, batch, batch_idx):
            prediction = self.layer(batch)
            return {"val_loss": self.loss(batch, prediction)}

    # Arguments shared by the initial trainer and every resumed trainer.
    common_kwargs = dict(
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=2,
        limit_test_batches=2,
        enable_pl_optimizer=enable_pl_optimizer,
    )

    model = ExtendedBoringModel()
    model.validation_step_end = None
    model.validation_epoch_end = None

    trainer = Trainer(callbacks=[checkpoint_callback], **common_kwargs)
    trainer.fit(model)
    assert os.listdir(tmpdir) == ['epoch=00.ckpt']

    def get_last_checkpoint():
        # Map the epoch number parsed from each "epoch=NN.ckpt" name to its path.
        epoch_to_path = {}
        for entry in os.listdir(tmpdir):
            if "epoch" not in entry:
                continue
            epoch = int(entry.split("=")[1].split('.')[0])
            epoch_to_path[epoch] = osp.join(tmpdir, entry)
        return epoch_to_path[len(epoch_to_path) - 1]

    for _ in range(1, 5):
        # load from checkpoint
        chk = get_last_checkpoint()
        model = BoringModel.load_from_checkpoint(chk)
        trainer = pl.Trainer(resume_from_checkpoint=chk, **common_kwargs)
        trainer.fit(model)
        trainer.test(model)

        assert str(os.listdir(tmpdir)) == "['epoch=00.ckpt']"
def test_torchcript_invalid_method(tmpdir):
    """Test that an unsupported torchscript ``method`` raises a ``ValueError``."""
    model = BoringModel()
    model.train(True)

    expected_error = "only supports 'script' or 'trace'"
    with pytest.raises(ValueError, match=expected_error):
        model.to_torchscript(method='temp')
def test_mc_called(tmpdir):
    """Test that no checkpoint callback history is recorded when checkpointing is disabled.

    Runs once with the validation step removed (train loop only) and once with
    the unmodified model (train + val loop).
    """
    seed_everything(1234)

    for drop_validation_step in (True, False):
        model = BoringModel()
        if drop_validation_step:
            # train loop only
            model.validation_step = None

        # no callback
        trainer = Trainer(max_epochs=3, checkpoint_callback=False)
        trainer.fit(model)

        history = trainer.dev_debugger.checkpoint_callback_history
        assert len(history) == 0
def test_model_saves_with_example_output(tmpdir):
    """Test that ONNX model saves when provided with example output."""
    model = BoringModel()
    # Root the trainer in the per-test tmpdir so logs/checkpoints produced by
    # `fit` do not leak into the current working directory.
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1)
    trainer.fit(model)

    file_path = os.path.join(tmpdir, "model.onnx")
    input_sample = torch.randn((1, 32))

    # Produce the example output from the model in eval mode.
    model.eval()
    example_outputs = model.forward(input_sample)
    model.to_onnx(file_path, input_sample, example_outputs=example_outputs)

    assert os.path.exists(file_path) is True
def test_model_torch_save(tmpdir):
    """Test that ``torch.save`` succeeds for both the fitted model and its trainer."""
    temp_path = os.path.join(tmpdir, 'temp.pt')

    model = BoringModel()
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1)
    trainer.fit(model)

    # Ensure these do not fail
    for picklable in (trainer.model, trainer):
        torch.save(picklable, temp_path)
def test_model_torch_save_ddp_cuda(tmpdir):
    """Test that ``torch.save`` succeeds for model and trainer after a 2-GPU ``ddp_spawn`` fit."""
    temp_path = os.path.join(tmpdir, 'temp.pt')

    model = BoringModel()
    trainer_kwargs = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        accelerator="ddp_spawn",
        gpus=2,
    )
    trainer = Trainer(**trainer_kwargs)
    trainer.fit(model)

    # Ensure these do not fail
    for picklable in (trainer.model, trainer):
        torch.save(picklable, temp_path)
def test_try_resume_from_non_existing_checkpoint(tmpdir):
    """Test that restoring reports success for an existing checkpoint and failure for a missing one."""
    model = BoringModel()
    checkpoint_cb = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True)
    trainer_kwargs = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        logger=False,
        callbacks=[checkpoint_cb],
        limit_train_batches=0.1,
        limit_val_batches=0.1,
    )
    trainer = Trainer(**trainer_kwargs)

    # Generate checkpoint `last.ckpt` with BoringModel
    trainer.fit(model)

    # `True` if resume/restore successfully else `False`
    existing_ckpt = str(tmpdir / "last.ckpt")
    restored = trainer.checkpoint_connector.restore(existing_ckpt, trainer.on_gpu)
    assert restored

    missing_ckpt = str(tmpdir / "last_non_existing.ckpt")
    restored = trainer.checkpoint_connector.restore(missing_ckpt, trainer.on_gpu)
    assert not restored
def test_test_progress_bar_update_amount(tmpdir, test_batches, refresh_rate, test_deltas):
    """Test that the test progress bar is updated with the expected increments."""
    model = BoringModel()
    progress_bar = MockedUpdateProgressBars(refresh_rate=refresh_rate)
    trainer_kwargs = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_test_batches=test_batches,
        callbacks=[progress_bar],
        logger=False,
        checkpoint_callback=False,
    )
    trainer = Trainer(**trainer_kwargs)
    trainer.test(model)

    expected_updates = [call(step) for step in test_deltas]
    progress_bar.test_progress_bar.update.assert_has_calls(expected_updates)
def test_model_torch_save(tmpdir, enable_pl_optimizer):
    """Test that model and trainer survive ``torch.save`` and the trainer reloads correctly.

    After reloading the trainer, its first optimizer must be a
    ``LightningOptimizer`` exactly when ``enable_pl_optimizer`` is truthy.
    """
    temp_path = os.path.join(tmpdir, 'temp.pt')

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        enable_pl_optimizer=enable_pl_optimizer,
    )
    trainer.fit(model)

    # Ensure these do not fail
    for picklable in (trainer.model, trainer):
        torch.save(picklable, temp_path)

    # The trainer was written last, so that is what the file now holds.
    trainer = torch.load(temp_path)
    is_lightning_optimizer = isinstance(trainer.optimizers[0], LightningOptimizer)
    if enable_pl_optimizer:
        assert is_lightning_optimizer
    else:
        assert not is_lightning_optimizer
def test_mc_called_on_fastdevrun(tmpdir):
    """Test that exactly one checkpoint callback entry is recorded under fast dev run.

    Checked once with the default model and once with the validation step removed.
    """
    seed_everything(1234)

    def recorded_checkpoint_calls(fitted_trainer):
        return len(fitted_trainer.dev_debugger.checkpoint_callback_history)

    train_val_step_model = BoringModel()

    # fast dev run = called once
    # train loop only, dict, eval result
    trainer = Trainer(fast_dev_run=True)
    trainer.fit(train_val_step_model)

    # checkpoint should have been called once with fast dev run
    assert recorded_checkpoint_calls(trainer) == 1

    # -----------------------
    # also called once with no val step
    # -----------------------
    class TrainingStepCalled(BoringModel):

        def __init__(self):
            super().__init__()
            self.training_step_called = False
            self.validation_step_called = False
            self.test_step_called = False

        def training_step(self, batch, batch_idx):
            self.training_step_called = True
            return super().training_step(batch, batch_idx)

    train_step_only_model = TrainingStepCalled()
    train_step_only_model.validation_step = None

    # fast dev run = called once
    # train loop only, dict, eval result
    trainer = Trainer(fast_dev_run=True)
    trainer.fit(train_step_only_model)

    # make sure only training step was called
    assert train_step_only_model.training_step_called
    assert not train_step_only_model.validation_step_called
    assert not train_step_only_model.test_step_called

    # checkpoint should have been called once with fast dev run
    assert recorded_checkpoint_calls(trainer) == 1
def test_main_progress_bar_update_amount(tmpdir, train_batches, val_batches, refresh_rate, train_deltas, val_deltas):
    """Test that the main and validation progress bars are updated with the expected increments.

    At the end of the epoch, the progress must not overshoot if the number of
    steps is not divisible by the refresh rate.
    """
    model = BoringModel()
    progress_bar = MockedUpdateProgressBars(refresh_rate=refresh_rate)
    trainer_kwargs = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=train_batches,
        limit_val_batches=val_batches,
        callbacks=[progress_bar],
        logger=False,
        checkpoint_callback=False,
    )
    trainer = Trainer(**trainer_kwargs)
    trainer.fit(model)

    expected_main_updates = [call(step) for step in train_deltas]
    progress_bar.main_progress_bar.update.assert_has_calls(expected_main_updates)

    # The validation bar is only checked when validation batches were requested.
    if val_batches > 0:
        expected_val_updates = [call(step) for step in val_deltas]
        progress_bar.val_progress_bar.update.assert_has_calls(expected_val_updates)
def test_trainer_callback_system(torch_save):
    """Test the exact sequence of callback hooks invoked during init, fit and test."""
    model = BoringModel()
    callback_mock = MagicMock()

    trainer_options = dict(
        callbacks=[callback_mock],
        max_epochs=1,
        limit_val_batches=1,
        limit_train_batches=3,
        limit_test_batches=2,
        progress_bar_refresh_rate=0,
    )

    # no call yet
    callback_mock.assert_not_called()

    # fit model
    trainer = Trainer(**trainer_options)

    # check that only the two init calls exist
    assert trainer.callbacks[0] == callback_mock
    init_calls = [
        call.on_init_start(trainer),
        call.on_init_end(trainer),
    ]
    assert callback_mock.method_calls == init_calls

    trainer.fit(model)

    # One validation pass over a single batch; expected both for the sanity
    # check and for the validation run at the end of the training epoch.
    validation_calls = [
        call.on_validation_start(trainer, model),
        call.on_validation_epoch_start(trainer, model),
        call.on_validation_batch_start(trainer, model, ANY, 0, 0),
        call.on_validation_batch_end(trainer, model, ANY, ANY, 0, 0),
        call.on_validation_epoch_end(trainer, model),
        call.on_validation_end(trainer, model),
    ]

    # Hooks expected for each of the three training batches.
    train_batch_calls = []
    for batch_idx in range(3):
        train_batch_calls.extend([
            call.on_batch_start(trainer, model),
            call.on_train_batch_start(trainer, model, ANY, batch_idx, 0),
            call.on_after_backward(trainer, model),
            call.on_before_zero_grad(trainer, model, trainer.optimizers[0]),
            call.on_batch_end(trainer, model),
            call.on_train_batch_end(trainer, model, ANY, ANY, batch_idx, 0),
        ])

    expected_fit_calls = [
        call.on_init_start(trainer),
        call.on_init_end(trainer),
        call.setup(trainer, model, 'fit'),
        call.on_fit_start(trainer, model),
        call.on_pretrain_routine_start(trainer, model),
        call.on_pretrain_routine_end(trainer, model),
        call.on_sanity_check_start(trainer, model),
        *validation_calls,
        call.on_sanity_check_end(trainer, model),
        call.on_train_start(trainer, model),
        call.on_epoch_start(trainer, model),
        call.on_train_epoch_start(trainer, model),
        *train_batch_calls,
        *validation_calls,
        call.on_save_checkpoint(trainer, model),
        call.on_epoch_end(trainer, model),
        call.on_train_epoch_end(trainer, model, ANY),
        call.on_train_end(trainer, model),
        call.on_fit_end(trainer, model),
        call.teardown(trainer, model, 'fit'),
    ]
    assert callback_mock.method_calls == expected_fit_calls

    callback_mock.reset_mock()
    trainer = Trainer(**trainer_options)
    trainer.test(model)

    # Hooks expected for each of the two test batches.
    test_batch_calls = []
    for batch_idx in range(2):
        test_batch_calls.extend([
            call.on_test_batch_start(trainer, model, ANY, batch_idx, 0),
            call.on_test_batch_end(trainer, model, ANY, ANY, batch_idx, 0),
        ])

    expected_test_calls = [
        call.on_init_start(trainer),
        call.on_init_end(trainer),
        call.setup(trainer, model, 'test'),
        call.on_fit_start(trainer, model),
        call.on_test_start(trainer, model),
        call.on_test_epoch_start(trainer, model),
        *test_batch_calls,
        call.on_test_epoch_end(trainer, model),
        call.on_test_end(trainer, model),
        call.on_fit_end(trainer, model),
        call.teardown(trainer, model, 'fit'),
        call.teardown(trainer, model, 'test'),
    ]
    assert callback_mock.method_calls == expected_test_calls
def test_checkpoint_repeated_strategy_extended(tmpdir):
    """Test that repeated resume/test/fit cycles do not advance the global step.

    The newest checkpoint must keep reporting epoch 1 and global step 2, and
    each cycle must add exactly one ``version_N`` directory under
    ``lightning_logs``.
    """
    # NOTE(review): this env var is set process-wide and is not restored afterwards.
    os.environ['PL_DEV_DEBUG'] = '1'

    class ExtendedBoringModel(BoringModel):

        def validation_step(self, batch, batch_idx):
            prediction = self.layer(batch)
            return {"val_loss": self.loss(batch, prediction)}

    # Arguments shared by the initial trainer and every resumed trainer.
    common_kwargs = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=2,
        limit_test_batches=2,
    )

    model = ExtendedBoringModel()
    model.validation_step_end = None
    model.validation_epoch_end = None

    trainer = pl.Trainer(**common_kwargs)
    assert trainer.checkpoint_connector.has_trained is not True
    assert trainer.current_epoch == 0

    trainer.fit(model)
    assert trainer.checkpoint_connector.has_trained is True
    assert trainer.global_step == 2
    assert trainer.current_epoch == 0

    trainer.test(model)
    assert trainer.current_epoch == 0
    assert str(os.listdir(osp.join(tmpdir, 'lightning_logs'))) == "['version_0']"

    def get_last_checkpoint():
        # Pick the lexicographically last version dir, then its last checkpoint file.
        logs_dir = osp.join(tmpdir, 'lightning_logs')
        last_version = sorted(os.listdir(logs_dir))[-1]
        ckpt_dir = osp.join(logs_dir, last_version, "checkpoints")
        last_ckpt = sorted(os.listdir(ckpt_dir))[-1]
        return osp.join(ckpt_dir, last_ckpt)

    def assert_checkpoint_content():
        content = pl_load(get_last_checkpoint())
        assert content["epoch"] == 1
        assert content["global_step"] == 2

    assert_checkpoint_content()

    for idx in range(1, 5):
        # load from checkpoint
        chk = get_last_checkpoint()
        assert_checkpoint_content()

        model = BoringModel.load_from_checkpoint(chk)
        trainer = pl.Trainer(resume_from_checkpoint=chk, **common_kwargs)
        assert trainer.checkpoint_connector.has_trained is not True
        assert trainer.global_step == 0

        trainer.test(model)
        assert trainer.global_step == 2

        trainer.fit(model)
        assert trainer.global_step == 2
        assert trainer.checkpoint_connector.has_trained is not True

        lightning_logs_path = osp.join(tmpdir, 'lightning_logs')
        expected_versions = [f"version_{i}" for i in range(idx + 1)]
        assert sorted(os.listdir(lightning_logs_path)) == expected_versions