def run_checkpoint_test(tmpdir: str, save_full_weights: bool, automatic_optimization: bool = True, accumulate_grad_batches: int = 2):
    seed_everything(1)
    if automatic_optimization:
        model = ModelParallelClassificationModel()
    else:
        model = ManualModelParallelClassificationModel()
    dm = ClassifDataModule()
    ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1)
    trainer = Trainer(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=10,
        plugins=[DeepSpeedPlugin(stage=3, save_full_weights=save_full_weights)],
        gpus=2,
        precision=16,
        accumulate_grad_batches=accumulate_grad_batches,
        callbacks=[ck],
    )
    trainer.fit(model, datamodule=dm)

    results = trainer.test(model, datamodule=dm)
    assert results[0]["test_acc"] > 0.7

    saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
    assert saved_results[0]["test_acc"] > 0.7
    assert saved_results == results

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=10,
        plugins=[DeepSpeedPlugin(stage=3, save_full_weights=save_full_weights)],
        gpus=2,
        precision=16,
        accumulate_grad_batches=2,
        callbacks=[ck],
        resume_from_checkpoint=ck.best_model_path,
    )

    results = trainer.test(model, datamodule=dm)
    assert results[0]["test_acc"] > 0.7

    dm.predict_dataloader = dm.test_dataloader
    results = trainer.predict(datamodule=dm)
    assert results[-1] > 0.7

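# The DeepSpeed snippets in this section reference `ModelParallelClassificationModel` (and a
# manual-optimization variant) without defining it. Below is a minimal sketch, not the original
# helper: it assumes the 32-feature / 3-class data served by `ClassifDataModule`, builds its
# layers in `configure_sharded_model` so ZeRO Stage 3 can shard them as they are created, and
# logs the "val_acc" / "test_acc" keys that the ModelCheckpoint callbacks and assertions expect.
import torch
from torch import nn
from torch.nn import functional as F
from pytorch_lightning import LightningModule


class ModelParallelClassificationModel(LightningModule):
    def __init__(self, lr: float = 0.01, num_blocks: int = 5):
        super().__init__()
        self.lr = lr
        self.num_blocks = num_blocks

    def configure_sharded_model(self):
        # instantiated here rather than in __init__ so DeepSpeed can shard the layers directly
        blocks = [nn.Sequential(nn.Linear(32, 32, bias=False), nn.ReLU()) for _ in range(self.num_blocks)]
        self.model = nn.Sequential(*blocks, nn.Linear(32, 3))

    def forward(self, x):
        return F.softmax(self.model(x).float(), dim=-1)

    @staticmethod
    def _accuracy(logits, y):
        return (logits.argmax(dim=-1) == y).float().mean()

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        self.log("train_acc", self._accuracy(logits, y), prog_bar=True, sync_dist=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        self.log("val_acc", self._accuracy(self(x), y), prog_bar=True, sync_dist=True)

    def test_step(self, batch, batch_idx):
        x, y = batch
        self.log("test_acc", self._accuracy(self(x), y), prog_bar=True, sync_dist=True)

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        # `trainer.predict` in the checkpoint tests asserts on the returned per-batch accuracy
        x, y = batch
        return self._accuracy(self(x), y)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return [optimizer], [torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)]
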
def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir):
    """Verify `test()` on pretrained model."""
    tutils.set_random_master_port()
    dm = ClassifDataModule()
    model = ClassificationModel()

    # exp file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # exp file to get weights
    checkpoint = tutils.init_checkpoint_callback(logger)

    trainer_options = dict(
        progress_bar_refresh_rate=0,
        max_epochs=2,
        limit_train_batches=2,
        limit_val_batches=2,
        callbacks=[checkpoint],
        logger=logger,
        gpus=[0, 1],
        accelerator='ddp_spawn',
        default_root_dir=tmpdir,
    )

    # fit model
    trainer = Trainer(**trainer_options)
    trainer.fit(model, datamodule=dm)

    log.info(os.listdir(tutils.get_data_path(logger, path_dir=tmpdir)))

    # correct result and ok accuracy
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
    pretrained_model = ClassificationModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # run test set
    new_trainer = Trainer(**trainer_options)
    new_trainer.test(pretrained_model)
    pretrained_model.cpu()

    dataloaders = dm.test_dataloader()
    if not isinstance(dataloaders, list):
        dataloaders = [dataloaders]

    for dataloader in dataloaders:
        tpipes.run_prediction_eval_model_template(pretrained_model, dataloader, min_acc=0.1)

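# Most snippets in this section use a `ClassifDataModule` / `ClassificationModel` pair from the
# test helpers without showing it. Below is a minimal sketch of what such helpers could look
# like, not the original code: it assumes a synthetic 32-feature / 3-class dataset, and the
# attribute and logged-metric names (`layer_0`, `lr`, "train_loss", "val_loss", "val_acc",
# "test_acc", ...) are chosen to match what the assertions and monitored keys in this section rely on.
import torch
from sklearn.datasets import make_classification
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
from torchmetrics import Accuracy  # newer torchmetrics may require Accuracy(task="multiclass", num_classes=3)
from pytorch_lightning import LightningDataModule, LightningModule


class ClassifDataModule(LightningDataModule):
    def __init__(self, length: int = 6000, batch_size: int = 128):
        super().__init__()
        x, y = make_classification(n_samples=length, n_features=32, n_classes=3, n_informative=10, random_state=1234)
        dataset = TensorDataset(torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.long))
        n_holdout = length // 5
        self.train_ds, self.val_ds, self.test_ds = random_split(dataset, [length - 2 * n_holdout, n_holdout, n_holdout])
        self.batch_size = batch_size

    def train_dataloader(self):
        return DataLoader(self.train_ds, batch_size=self.batch_size)

    def val_dataloader(self):
        return DataLoader(self.val_ds, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.test_ds, batch_size=self.batch_size)


class ClassificationModel(LightningModule):
    def __init__(self, lr: float = 0.01):
        super().__init__()
        self.lr = lr
        self.layer_0 = nn.Linear(32, 32)
        self.layer_1 = nn.Linear(32, 32)
        self.layer_2 = nn.Linear(32, 32)
        self.layer_end = nn.Linear(32, 3)
        self.train_acc = Accuracy()
        self.valid_acc = Accuracy()
        self.test_acc = Accuracy()

    def forward(self, x):
        x = F.relu(self.layer_0(x))
        x = F.relu(self.layer_1(x))
        x = F.relu(self.layer_2(x))
        return F.softmax(self.layer_end(x), dim=-1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        self.log("train_loss", loss, prog_bar=True)
        self.log("train_acc", self.train_acc(logits, y), prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        self.log("val_loss", F.cross_entropy(logits, y))
        self.log("val_acc", self.valid_acc(logits, y), prog_bar=True)

    def test_step(self, batch, batch_idx):
        x, y = batch
        self.log("test_acc", self.test_acc(self(x), y))

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=self.lr)
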
def test_lr_monitor_param_groups(tmpdir):
    """Test that learning rates are extracted and logged for each parameter group."""
    tutils.reset_seed()

    class CustomClassificationModel(ClassificationModel):
        def configure_optimizers(self):
            param_groups = [
                {"params": list(self.parameters())[:2], "lr": self.lr * 0.1},
                {"params": list(self.parameters())[2:], "lr": self.lr},
            ]
            optimizer = optim.Adam(param_groups)
            lr_scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)
            return [optimizer], [lr_scheduler]

    model = CustomClassificationModel()
    dm = ClassifDataModule()

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        default_root_dir=tmpdir, max_epochs=2, limit_val_batches=0.1, limit_train_batches=0.5, callbacks=[lr_monitor]
    )
    trainer.fit(model, datamodule=dm)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    assert lr_monitor.lrs, "No learning rates logged"
    assert len(lr_monitor.lrs) == 2 * len(
        trainer.lr_schedulers
    ), "Number of learning rates logged does not match number of param groups"
    assert lr_monitor.lr_sch_names == ["lr-Adam"]
    assert list(lr_monitor.lrs.keys()) == ["lr-Adam/pg1", "lr-Adam/pg2"], "Names of learning rates not set correctly"

def main():
    seed_everything(4321)

    parser = ArgumentParser(add_help=False)
    parser = Trainer.add_argparse_args(parser)
    parser.add_argument("--trainer_method", default="fit")
    parser.add_argument("--tmpdir")
    parser.add_argument("--workdir")
    parser.set_defaults(gpus=2)
    parser.set_defaults(accelerator="ddp")
    args = parser.parse_args()

    dm = ClassifDataModule()
    model = ClassificationModel()
    trainer = Trainer.from_argparse_args(args)

    if args.trainer_method == "fit":
        trainer.fit(model, datamodule=dm)
        result = None
    elif args.trainer_method == "test":
        result = trainer.test(model, datamodule=dm)
    elif args.trainer_method == "fit_test":
        trainer.fit(model, datamodule=dm)
        result = trainer.test(model, datamodule=dm)
    else:
        raise ValueError(f"Unsupported: {args.trainer_method}")

    result_ext = {"status": "complete", "method": args.trainer_method, "result": result}
    file_path = os.path.join(args.tmpdir, "ddp.result")
    torch.save(result_ext, file_path)

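# A calling test would typically launch the script above in separate DDP processes and then read
# the result file back. A minimal sketch of that read-back step (assuming the same `tmpdir`):
#
#     result = torch.load(os.path.join(tmpdir, "ddp.result"))
#     assert result["status"] == "complete"
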
def test_callbacks_references_resume_from_checkpoint(tmpdir):
    """Test that resuming from a checkpoint sets references as expected."""
    dm = ClassifDataModule()
    model = ClassificationModel()
    args = {'default_root_dir': tmpdir, 'max_steps': 1, 'logger': False}

    # initial training
    checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True)
    trainer = Trainer(**args, callbacks=[checkpoint])
    assert checkpoint is trainer.callbacks[-1] is trainer.checkpoint_callback
    trainer.fit(model, datamodule=dm)

    # resumed training
    new_checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True)
    # pass in a new checkpoint object, which should take
    # precedence over the one in the last.ckpt file
    trainer = Trainer(**args, callbacks=[new_checkpoint], resume_from_checkpoint=str(tmpdir / "last.ckpt"))
    assert checkpoint is not new_checkpoint
    assert new_checkpoint is trainer.callbacks[-1] is trainer.checkpoint_callback
    trainer.fit(model, datamodule=dm)

def test_optimization(tmpdir):
    seed_everything(42)

    dm = ClassifDataModule(length=1024)
    model = IPUClassificationModel()

    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="ipu", devices=2)

    # fit model
    trainer.fit(model, dm)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert dm.trainer is not None

    # validate
    result = trainer.validate(datamodule=dm)
    assert dm.trainer is not None
    assert result[0]["val_acc"] > 0.7

    # test
    result = trainer.test(model, datamodule=dm)
    assert dm.trainer is not None
    test_result = result[0]["test_acc"]
    assert test_result > 0.6

    # test saved model
    model_path = os.path.join(tmpdir, "model.pt")
    trainer.save_checkpoint(model_path)

    model = IPUClassificationModel.load_from_checkpoint(model_path)

    trainer = Trainer(default_root_dir=tmpdir, accelerator="ipu", devices=2)

    result = trainer.test(model, datamodule=dm)
    saved_result = result[0]["test_acc"]
    assert saved_result == test_result

def test_dp_test(tmpdir):
    tutils.set_random_master_port()

    dm = ClassifDataModule()
    model = CustomClassificationModelDP()
    trainer = pl.Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        accelerator='dp',
    )
    trainer.fit(model, datamodule=dm)
    assert 'ckpt' in trainer.checkpoint_callback.best_model_path
    results = trainer.test(datamodule=dm)
    assert 'test_acc' in results[0]

    old_weights = model.layer_0.weight.clone().detach().cpu()

    results = trainer.test(model, datamodule=dm)
    assert 'test_acc' in results[0]

    # make sure weights didn't change
    new_weights = model.layer_0.weight.clone().detach().cpu()
    assert torch.all(torch.eq(old_weights, new_weights))

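# Note: `CustomClassificationModelDP` is not defined in this section; it is assumed to be a
# DP-friendly `ClassificationModel` variant that returns logits and targets from each step and
# computes the metric in the matching `*_step_end` hook, similar to the class defined inline in
# the last snippet of this section.
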
def main():
    seed_everything(1234)

    parser = ArgumentParser(add_help=False)
    parser = Trainer.add_argparse_args(parser)
    parser.add_argument('--trainer_method', default='fit')
    parser.add_argument('--tmpdir')
    parser.add_argument('--workdir')
    parser.set_defaults(gpus=2)
    parser.set_defaults(accelerator="ddp")
    args = parser.parse_args()

    dm = ClassifDataModule()
    model = ClassificationModel()
    trainer = Trainer.from_argparse_args(args)

    if args.trainer_method == 'fit':
        trainer.fit(model, datamodule=dm)
        result = None
    elif args.trainer_method == 'test':
        result = trainer.test(model, datamodule=dm)
    elif args.trainer_method == 'fit_test':
        trainer.fit(model, datamodule=dm)
        result = trainer.test(model, datamodule=dm)
    else:
        raise ValueError(f'Unsupported: {args.trainer_method}')

    result_ext = {
        'status': 'complete',
        'method': args.trainer_method,
        'result': result,
    }
    file_path = os.path.join(args.tmpdir, 'ddp.result')
    torch.save(result_ext, file_path)

def test_full_loop(tmpdir):
    reset_seed()

    dm = ClassifDataModule()
    model = ClassificationModel()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        weights_summary=None,
        deterministic=True,
    )

    # fit model
    trainer.fit(model, dm)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert dm.trainer is not None

    # validate
    result = trainer.validate(datamodule=dm)
    assert dm.trainer is not None
    assert result[0]['val_acc'] > 0.7

    # test
    result = trainer.test(datamodule=dm)
    assert dm.trainer is not None
    assert result[0]['test_acc'] > 0.6

def test_deepspeed_multigpu_stage_3_warns_resume_training(tmpdir):
    """Test to ensure with Stage 3 and multiple GPUs that we can resume training, throwing a warning
    that the optimizer state and scheduler states cannot be restored."""
    dm = ClassifDataModule()
    model = BoringModel()
    checkpoint_path = os.path.join(tmpdir, "model.pt")
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
    trainer.fit(model)
    trainer.save_checkpoint(checkpoint_path)

    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        strategy=DeepSpeedStrategy(stage=3, load_full_weights=True),
        accelerator="gpu",
        devices=1,
        precision=16,
    )
    with pytest.warns(
        UserWarning,
        match="A single checkpoint file has been given. This means optimizer states cannot be restored. "
        "If you'd like to restore these states, you must "
        "provide a path to the originally saved DeepSpeed checkpoint.",
    ):
        trainer.fit(model, datamodule=dm, ckpt_path=checkpoint_path)

def test_callbacks_state_resume_from_checkpoint(tmpdir):
    """Test that resuming from a checkpoint restores callbacks that persist state."""
    dm = ClassifDataModule()
    model = ClassificationModel()
    callback_capture = CaptureCallbacksBeforeTraining()

    def get_trainer_args():
        checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True)
        trainer_args = dict(default_root_dir=tmpdir, max_steps=1, logger=False, callbacks=[checkpoint, callback_capture])
        assert checkpoint.best_model_path == ""
        assert checkpoint.best_model_score is None
        return trainer_args

    # initial training
    trainer = Trainer(**get_trainer_args())
    trainer.fit(model, datamodule=dm)
    callbacks_before_resume = deepcopy(trainer.callbacks)

    # resumed training
    trainer = Trainer(**get_trainer_args(), resume_from_checkpoint=str(tmpdir / "last.ckpt"))
    trainer.fit(model, datamodule=dm)

    assert len(callbacks_before_resume) == len(callback_capture.callbacks)

    for before, after in zip(callbacks_before_resume, callback_capture.callbacks):
        if isinstance(before, ModelCheckpoint):
            assert before.best_model_path == after.best_model_path
            assert before.best_model_score == after.best_model_score

def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer):
    """Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works."""
    seed_everything(42)

    class VerificationCallback(Callback):
        def on_train_batch_start(
            self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int
        ) -> None:
            deepspeed_engine = trainer.training_type_plugin.model
            assert trainer.global_step == deepspeed_engine.global_steps

    model = ModelParallelClassificationModel()
    dm = ClassifDataModule()
    trainer = Trainer(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=5,
        plugins=[DeepSpeedPlugin(stage=2, offload_optimizer=offload_optimizer)],
        gpus=2,
        limit_val_batches=2,
        precision=16,
        accumulate_grad_batches=2,
        callbacks=[VerificationCallback()],
    )
    trainer.fit(model, datamodule=dm)

def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer):
    """Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works."""
    seed_everything(42)

    class VerificationCallback(Callback):
        def __init__(self):
            self.on_train_batch_start_called = False

        def on_train_batch_start(self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int) -> None:
            deepspeed_engine = trainer.strategy.model
            assert trainer.global_step == deepspeed_engine.global_steps
            self.on_train_batch_start_called = True

    model = ModelParallelClassificationModel()
    dm = ClassifDataModule()
    verification_callback = VerificationCallback()
    trainer = Trainer(
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        # TODO: this test fails with max_epochs >1 as there are leftover batches per epoch.
        # there's divergence in how Lightning handles the last batch of the epoch with how DeepSpeed does it.
        # we step the optimizers on the last batch but DeepSpeed keeps the accumulation for the next epoch
        max_epochs=1,
        strategy=DeepSpeedStrategy(stage=2, offload_optimizer=offload_optimizer),
        gpus=2,
        limit_train_batches=5,
        limit_val_batches=2,
        precision=16,
        accumulate_grad_batches=2,
        callbacks=[verification_callback],
    )
    assert trainer.limit_train_batches % trainer.accumulate_grad_batches != 0, "leftover batches should be tested"
    trainer.fit(model, datamodule=dm)
    assert verification_callback.on_train_batch_start_called

def test_deepspeed_multigpu_stage_3_resume_training(tmpdir):
    """Test to ensure with Stage 3 and a single GPU that we can resume training."""
    initial_model = ModelParallelClassificationModel()
    dm = ClassifDataModule()

    ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1)
    initial_trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=2,
        limit_test_batches=2,
        strategy=DeepSpeedStrategy(stage=3),
        gpus=1,
        precision=16,
        callbacks=[ck],
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    initial_trainer.fit(initial_model, datamodule=dm)

    class TestCallback(Callback):
        def on_train_batch_start(
            self, trainer: Trainer, pl_module: LightningModule, batch: Any, batch_idx: int
        ) -> None:
            original_deepspeed_strategy = initial_trainer.strategy
            current_deepspeed_strategy = trainer.strategy

            assert isinstance(original_deepspeed_strategy, DeepSpeedStrategy)
            assert isinstance(current_deepspeed_strategy, DeepSpeedStrategy)
            # assert optimizer states are correctly loaded
            original_optimizer_dict = original_deepspeed_strategy.deepspeed_engine.optimizer.state_dict()
            current_optimizer_dict = current_deepspeed_strategy.deepspeed_engine.optimizer.state_dict()
            for orig_tensor, current_tensor in zip(
                original_optimizer_dict["fp32_flat_groups"], current_optimizer_dict["fp32_flat_groups"]
            ):
                assert torch.all(orig_tensor.eq(current_tensor))
            # assert model state is loaded correctly
            for current_param, initial_param in zip(pl_module.parameters(), initial_model.parameters()):
                assert torch.equal(current_param.cpu(), initial_param.cpu())
            # assert epoch has correctly been restored
            assert trainer.current_epoch == 1

            # assert lr-scheduler states are loaded correctly
            original_lr_scheduler = initial_trainer.lr_scheduler_configs[0].scheduler
            current_lr_scheduler = trainer.lr_scheduler_configs[0].scheduler
            assert original_lr_scheduler.state_dict() == current_lr_scheduler.state_dict()

    model = ModelParallelClassificationModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        strategy=DeepSpeedStrategy(stage=3),
        gpus=1,
        precision=16,
        callbacks=TestCallback(),
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.fit(model, datamodule=dm, ckpt_path=ck.best_model_path)

def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, automatic_optimization, accumulate_grad_batches):
    seed_everything(1)
    if automatic_optimization:
        model = ModelParallelClassificationModel()
    else:
        model = ManualModelParallelClassificationModel()
    dm = ClassifDataModule()
    ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1)

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=10,
        strategy=DeepSpeedStrategy(stage=3),
        gpus=2,
        precision=16,
        accumulate_grad_batches=accumulate_grad_batches,
        callbacks=[ck],
    )
    trainer.fit(model, datamodule=dm)

    results = trainer.test(datamodule=dm)
    assert results[0]["test_acc"] > 0.7

    saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
    assert saved_results[0]["test_acc"] > 0.7
    assert saved_results == results

    if automatic_optimization:
        model = ModelParallelClassificationModel()
    else:
        model = ManualModelParallelClassificationModel()

    trainer = Trainer(default_root_dir=tmpdir, gpus=2, strategy=DeepSpeedStrategy(stage=3), precision=16)

    results = trainer.test(model, datamodule=dm, ckpt_path=ck.best_model_path)
    assert results[0]["test_acc"] > 0.7

def test_evaluate(tmpdir, trainer_kwargs):
    tutils.set_random_master_port()

    dm = ClassifDataModule()
    model = CustomClassificationModelDP()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=10,
        limit_val_batches=10,
        deterministic=True,
        **trainer_kwargs,
    )

    trainer.fit(model, datamodule=dm)
    assert "ckpt" in trainer.checkpoint_callback.best_model_path

    old_weights = model.layer_0.weight.clone().detach().cpu()

    result = trainer.validate(datamodule=dm)
    assert result[0]["val_acc"] > 0.55

    result = trainer.test(datamodule=dm)
    assert result[0]["test_acc"] > 0.55

    # make sure weights didn't change
    new_weights = model.layer_0.weight.clone().detach().cpu()
    torch.testing.assert_allclose(old_weights, new_weights)

def test_running_test_pretrained_model_distrib_dp(tmpdir):
    """Verify `test()` on pretrained model."""
    tutils.set_random_main_port()
    dm = ClassifDataModule()
    model = CustomClassificationModelDP(lr=0.1)

    # exp file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # exp file to get weights
    checkpoint = tutils.init_checkpoint_callback(logger)

    trainer_options = dict(
        enable_progress_bar=False,
        max_epochs=2,
        limit_train_batches=5,
        limit_val_batches=5,
        callbacks=[checkpoint],
        logger=logger,
        accelerator="gpu",
        devices=[0, 1],
        strategy="dp",
        default_root_dir=tmpdir,
    )

    # fit model
    trainer = Trainer(**trainer_options)
    trainer.fit(model, datamodule=dm)

    # correct result and ok accuracy
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    pretrained_model = CustomClassificationModelDP.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # run test set
    new_trainer = Trainer(**trainer_options)
    new_trainer.test(pretrained_model, datamodule=dm)
    pretrained_model.cpu()

    dataloaders = dm.test_dataloader()
    if not isinstance(dataloaders, list):
        dataloaders = [dataloaders]

    for dataloader in dataloaders:
        tpipes.run_model_prediction(pretrained_model, dataloader)

def test_deepspeed_multigpu_stage_3_resume_training(tmpdir):
    """Test to ensure with Stage 3 and multiple GPUs that we can resume training."""
    initial_model = ModelParallelClassificationModel()
    dm = ClassifDataModule()

    ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1)
    initial_trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=2,
        limit_test_batches=2,
        plugins=DeepSpeedPlugin(stage=3),
        gpus=1,
        precision=16,
        callbacks=[ck],
    )
    initial_trainer.fit(initial_model, datamodule=dm)

    class TestCallback(Callback):
        def on_train_batch_start(
            self, trainer: Trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int
        ) -> None:
            original_deepspeed_plugin = initial_trainer.accelerator.training_type_plugin
            current_deepspeed_plugin = trainer.accelerator.training_type_plugin

            assert isinstance(original_deepspeed_plugin, DeepSpeedPlugin)
            assert isinstance(current_deepspeed_plugin, DeepSpeedPlugin)
            # assert optimizer states are correctly loaded
            original_optimizer_dict = original_deepspeed_plugin.deepspeed_engine.optimizer.state_dict()
            current_optimizer_dict = current_deepspeed_plugin.deepspeed_engine.optimizer.state_dict()
            for orig_tensor, current_tensor in zip(
                original_optimizer_dict["fp32_flat_groups"], current_optimizer_dict["fp32_flat_groups"]
            ):
                assert torch.all(orig_tensor.eq(current_tensor))
            # assert model state is loaded correctly
            for current_param, initial_param in zip(pl_module.parameters(), initial_model.parameters()):
                assert torch.equal(current_param.cpu(), initial_param.cpu())
            # assert epoch has correctly been restored
            assert trainer.current_epoch == 1

    model = ModelParallelClassificationModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        plugins=DeepSpeedPlugin(stage=3),
        gpus=1,
        precision=16,
        resume_from_checkpoint=ck.best_model_path,
        callbacks=TestCallback(),
    )
    trainer.fit(model, datamodule=dm)

def run_checkpoint_test(tmpdir, save_full_weights):
    seed_everything(42)
    model = ModelParallelClassificationModel()
    dm = ClassifDataModule()
    ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1)
    trainer = Trainer(
        max_epochs=10,
        plugins=[DeepSpeedPlugin(stage=3, save_full_weights=save_full_weights)],
        default_root_dir=tmpdir,
        gpus=2,
        precision=16,
        accumulate_grad_batches=2,
        callbacks=[ck],
    )
    trainer.fit(model, datamodule=dm)

    results = trainer.test(model, datamodule=dm)
    assert results[0]['test_acc'] > 0.7

    saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
    assert saved_results[0]['test_acc'] > 0.7
    assert saved_results == results

    trainer = Trainer(
        max_epochs=10,
        plugins=[DeepSpeedPlugin(stage=3, save_full_weights=save_full_weights)],
        default_root_dir=tmpdir,
        gpus=2,
        precision=16,
        accumulate_grad_batches=2,
        callbacks=[ck],
        resume_from_checkpoint=ck.best_model_path,
    )
    results = trainer.test(model, datamodule=dm)
    assert results[0]['test_acc'] > 0.7

    dm.predict_dataloader = dm.test_dataloader
    results = trainer.predict(datamodule=dm)
    assert results[-1] > 0.7

def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir):
    """Test to ensure with Stage 3 and multiple GPUs that we can save/load a model resuming from a
    checkpoint, and see convergence."""
    seed_everything(42)
    model = ModelParallelClassificationModel()
    dm = ClassifDataModule()
    ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1)
    trainer = Trainer(
        max_epochs=10,
        plugins=[DeepSpeedPlugin(stage=3)],
        default_root_dir=tmpdir,
        gpus=2,
        precision=16,
        accumulate_grad_batches=2,
        callbacks=[ck],
    )
    trainer.fit(model, datamodule=dm)

    results = trainer.test(model, datamodule=dm)
    assert results[0]['test_acc'] > 0.7

    saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
    assert saved_results[0]['test_acc'] > 0.7
    assert saved_results == results

    trainer = Trainer(
        max_epochs=10,
        plugins=[DeepSpeedPlugin(stage=3)],
        default_root_dir=tmpdir,
        gpus=2,
        precision=16,
        accumulate_grad_batches=2,
        callbacks=[ck],
        resume_from_checkpoint=ck.best_model_path,
    )
    results = trainer.test(model, datamodule=dm)
    assert results[0]['test_acc'] > 0.7

    dm.predict_dataloader = dm.test_dataloader
    results = trainer.predict(datamodule=dm)
    assert results[-1] > 0.7

def test_callbacks_state_fit_ckpt_path(tmpdir):
    """Test that resuming from a checkpoint restores callbacks that persist state."""
    dm = ClassifDataModule()
    model = ClassificationModel()
    callback_capture = CaptureCallbacksBeforeTraining()

    def get_trainer_args():
        checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True)
        trainer_args = dict(
            default_root_dir=tmpdir,
            limit_train_batches=1,
            limit_val_batches=2,
            max_epochs=1,
            logger=False,
            callbacks=[checkpoint, callback_capture],
        )
        assert checkpoint.best_model_path == ""
        assert checkpoint.best_model_score is None
        return trainer_args

    # initial training
    trainer = Trainer(**get_trainer_args())
    with pytest.deprecated_call(match="`Callback.on_pretrain_routine_end` hook has been deprecated in v1.6"):
        trainer.fit(model, datamodule=dm)
    callbacks_before_resume = deepcopy(trainer.callbacks)

    # resumed training
    trainer = Trainer(**get_trainer_args())
    with pytest.deprecated_call(match="`Callback.on_pretrain_routine_end` hook has been deprecated in v1.6"):
        trainer.fit(model, datamodule=dm, ckpt_path=str(tmpdir / "last.ckpt"))

    assert len(callbacks_before_resume) == len(callback_capture.callbacks)

    for before, after in zip(callbacks_before_resume, callback_capture.callbacks):
        if isinstance(before, ModelCheckpoint):
            for attribute in (
                "best_model_path",
                "best_model_score",
                "best_k_models",
                "kth_best_model_path",
                "kth_value",
                "last_model_path",
            ):
                assert getattr(before, attribute) == getattr(after, attribute)

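# `CaptureCallbacksBeforeTraining` is used by the two callback-state tests above but is not
# defined in this section. A minimal sketch, assuming it simply snapshots the trainer's callback
# list right before training starts (the deprecation warning matched above suggests the
# `on_pretrain_routine_end` hook is the one used):
from copy import deepcopy

from pytorch_lightning import Callback


class CaptureCallbacksBeforeTraining(Callback):
    callbacks = []

    def on_pretrain_routine_end(self, trainer, pl_module):
        self.callbacks = deepcopy(trainer.callbacks)
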
def test_fit_csv_logger(tmpdir):
    dm = ClassifDataModule()
    model = ClassificationModel()
    logger = CSVLogger(save_dir=tmpdir)
    trainer = Trainer(default_root_dir=tmpdir, max_steps=10, logger=logger, log_every_n_steps=1)
    trainer.fit(model, datamodule=dm)
    metrics_file = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    assert os.path.isfile(metrics_file)

def test_suggestion_parameters_work(tmpdir):
    """Test that default skipping does not alter results in basic case."""
    dm = ClassifDataModule()
    model = ClassificationModel()

    # logger file to get meta
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=3)

    lrfinder = trainer.tuner.lr_find(model, datamodule=dm)
    lr1 = lrfinder.suggestion(skip_begin=10)  # default
    lr2 = lrfinder.suggestion(skip_begin=150)  # way too high, should have an impact

    assert lr1 != lr2, "Skipping parameter did not influence learning rate"

def test_early_stopping_no_val_step(tmpdir):
    """Test that early stopping callback falls back to training metrics when no validation defined."""
    model = ClassificationModel()
    dm = ClassifDataModule()
    model.validation_step = None
    model.val_dataloader = None

    stopping = EarlyStopping(monitor="train_loss", min_delta=0.1, patience=0, check_on_train_epoch_end=True)
    trainer = Trainer(default_root_dir=tmpdir, callbacks=[stopping], overfit_batches=0.20, max_epochs=10)
    trainer.fit(model, datamodule=dm)

    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert trainer.current_epoch < trainer.max_epochs - 1

def test_multi_gpu_none_backend(tmpdir): """Make sure when using multiple GPUs the user can't use `accelerator = None`.""" tutils.set_random_main_port() trainer_options = dict( default_root_dir=tmpdir, enable_progress_bar=False, max_epochs=1, limit_train_batches=0.2, limit_val_batches=0.2, gpus=2, ) dm = ClassifDataModule() model = ClassificationModel() tpipes.run_model_test(trainer_options, model, dm)
def test_multi_gpu_none_backend(tmpdir): """Make sure when using multiple GPUs the user can't use `distributed_backend = None`.""" tutils.set_random_master_port() trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, limit_train_batches=0.2, limit_val_batches=0.2, gpus=2, ) dm = ClassifDataModule() model = ClassificationModel() tpipes.run_model_test(trainer_options, model, dm)
def test_datamodule_parameter(tmpdir):
    """Test that the datamodule parameter works."""
    seed_everything(1)

    dm = ClassifDataModule()
    model = ClassificationModel()

    before_lr = model.lr
    # logger file to get meta
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=2)

    lrfinder = trainer.tuner.lr_find(model, datamodule=dm)
    after_lr = lrfinder.suggestion()
    model.lr = after_lr

    assert before_lr != after_lr, "Learning rate was not altered after running learning rate finder"

def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
    tutils.set_random_main_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        callbacks=[EarlyStopping(monitor="train_acc")],
        max_epochs=50,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        strategy="ddp_spawn",
    )

    dm = ClassifDataModule()
    model = ClassificationModel()
    tpipes.run_model_test(trainer_options, model, dm)

def test_resume_early_stopping_from_checkpoint(tmpdir):
    """Prevent regressions to bugs:
    https://github.com/PyTorchLightning/pytorch-lightning/issues/1464
    https://github.com/PyTorchLightning/pytorch-lightning/issues/1463
    """
    seed_everything(42)
    model = ClassificationModel()
    dm = ClassifDataModule()
    checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="train_loss", save_top_k=1)
    early_stop_callback = EarlyStoppingTestRestore(None, monitor="train_loss")
    trainer = Trainer(
        default_root_dir=tmpdir,
        callbacks=[early_stop_callback, checkpoint_callback],
        num_sanity_val_steps=0,
        max_epochs=4,
    )
    trainer.fit(model, datamodule=dm)

    assert len(early_stop_callback.saved_states) == 4

    checkpoint_filepath = checkpoint_callback.kth_best_model_path
    # ensure state is persisted properly
    checkpoint = torch.load(checkpoint_filepath)
    # the checkpoint saves "epoch + 1"
    early_stop_callback_state = early_stop_callback.saved_states[checkpoint["epoch"] - 1]
    assert 4 == len(early_stop_callback.saved_states)
    es_name = "EarlyStoppingTestRestore{'monitor': 'train_loss', 'mode': 'min'}"
    assert checkpoint["callbacks"][es_name] == early_stop_callback_state

    # ensure state is reloaded properly (assertion in the callback)
    early_stop_callback = EarlyStoppingTestRestore(early_stop_callback_state, monitor="train_loss")
    new_trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        resume_from_checkpoint=checkpoint_filepath,
        callbacks=[early_stop_callback],
    )

    with pytest.raises(MisconfigurationException, match=r"You restored a checkpoint with current_epoch"):
        new_trainer.fit(model)

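# `EarlyStoppingTestRestore` is referenced above but not defined in this section. A minimal
# sketch, assuming it records the early-stopping state after every training epoch and, when
# constructed with an expected state, asserts on resume that the state was restored. The exact
# set of persisted attributes is an assumption (these are attributes EarlyStopping exposes):
from pytorch_lightning.callbacks import EarlyStopping


class EarlyStoppingTestRestore(EarlyStopping):
    def __init__(self, expected_state=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.expected_state = expected_state
        # cache the state after every epoch so the test can compare against the checkpoint
        self.saved_states = []

    def _current_state(self):
        return {
            "wait_count": self.wait_count,
            "stopped_epoch": self.stopped_epoch,
            "best_score": self.best_score,
            "patience": self.patience,
        }

    def on_train_start(self, trainer, pl_module):
        if self.expected_state is not None:
            # resuming from the checkpoint must have re-populated the callback state
            assert self._current_state() == self.expected_state

    def on_train_epoch_end(self, trainer, pl_module):
        super().on_train_epoch_end(trainer, pl_module)
        self.saved_states.append(self._current_state())
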
def test_full_loop_dp(tmpdir):
    set_random_master_port()

    class CustomClassificationModelDP(ClassificationModel):
        def _step(self, batch, batch_idx):
            x, y = batch
            logits = self(x)
            return {'logits': logits, 'y': y}

        def training_step(self, batch, batch_idx):
            _, y = batch
            out = self._step(batch, batch_idx)
            loss = F.cross_entropy(out['logits'], y)
            return loss

        def validation_step(self, batch, batch_idx):
            return self._step(batch, batch_idx)

        def test_step(self, batch, batch_idx):
            return self._step(batch, batch_idx)

        def test_step_end(self, outputs):
            self.log('test_acc', self.test_acc(outputs['logits'], outputs['y']))

    dm = ClassifDataModule()
    model = CustomClassificationModelDP()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        weights_summary=None,
        accelerator='dp',
        gpus=2,
        deterministic=True,
    )

    # fit model
    result = trainer.fit(model, datamodule=dm)
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
    assert result

    # test
    result = trainer.test(datamodule=dm)
    assert result[0]['test_acc'] > 0.6