def run_test_from_config(trainer_options):
    """Trains the default model with the given config."""
    set_random_master_port()
    reset_seed()

    ckpt_path = trainer_options['weights_save_path']
    trainer_options.update(checkpoint_callback=ModelCheckpoint(ckpt_path))

    model = EvalModelTemplate()
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result == 1

    # Horovod should be initialized following training. If not, this will raise an exception.
    assert hvd.size() == 2

    if trainer.global_rank > 0:
        # on higher ranks the checkpoint location is unknown
        # we want to test checkpointing on rank 0 only
        assert not trainer.checkpoint_callback.best_model_path
        return

    # test model loading
    pretrained_model = EvalModelTemplate.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # test new model accuracy
    test_loaders = model.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    for dataloader in test_loaders:
        run_prediction(dataloader, pretrained_model)

    # test HPC loading / saving
    trainer.hpc_save(ckpt_path, trainer.logger)
    trainer.hpc_load(ckpt_path, on_gpu=args.on_gpu)

    if args.on_gpu:
        trainer = Trainer(gpus=1, distributed_backend='horovod', max_epochs=1)
        # Test the root_gpu property
        assert trainer.root_gpu == hvd.local_rank()

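# run_test_from_config() above reads a module-level `args` namespace that is not
# shown here. A minimal sketch of the driver glue it assumes follows; the flag
# names (--trainer-options, --on-gpu) and the JSON encoding of the options are
# assumptions for illustration, not the verified interface of the real script.
if __name__ == "__main__":
    import argparse
    import json

    parser = argparse.ArgumentParser()
    parser.add_argument('--trainer-options', required=True,
                        help='JSON-encoded dict of Trainer keyword arguments')
    parser.add_argument('--on-gpu', action='store_true', default=False)
    args = parser.parse_args()

    run_test_from_config(json.loads(args.trainer_options))
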
def test_dm_checkpoint_save(tmpdir):
    reset_seed()

    dm = TrialMNISTDataModule(tmpdir)
    model = EvalModelTemplate()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
        callbacks=[ModelCheckpoint(dirpath=tmpdir, monitor='early_stop_on')],
    )

    # fit model
    result = trainer.fit(model, dm)
    checkpoint_path = list(trainer.checkpoint_callback.best_k_models.keys())[0]
    checkpoint = torch.load(checkpoint_path)
    assert dm.__class__.__name__ in checkpoint
    assert checkpoint[dm.__class__.__name__] == dm.__class__.__name__

def test_logger_reset_correctly(tmpdir, extra_params):
    """ Test that the tuners do not alter the logger reference """
    tutils.reset_seed()

    model = EvalModelTemplate()

    trainer = Trainer(
        default_root_dir=tmpdir,
        **extra_params,
    )
    logger1 = trainer.logger
    trainer.fit(model)
    logger2 = trainer.logger
    logger3 = model.logger

    assert logger1 == logger2, \
        'Finder altered the logger of trainer'
    assert logger2 == logger3, \
        'Finder altered the logger of model'

def test_val_loop_config(tmpdir):
    """ When either the val loop or the val data is missing, raise a warning """
    tutils.reset_seed()
    hparams = EvalModelTemplate.get_default_hparams()
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1)

    # has val data but no val loop
    with pytest.warns(UserWarning):
        model = EvalModelTemplate(**hparams)
        model.validation_step = None
        trainer.fit(model)

    # has val loop but no val data
    with pytest.warns(UserWarning):
        model = EvalModelTemplate(**hparams)
        model.val_dataloader = None
        trainer.fit(model)

def test_model_checkpoint_correct_score(tmpdir, save_top_k):
    """Test that when a model checkpoint is saved, it saves with the correct score appended to ckpt_path"""
    tutils.reset_seed()
    model = LogInTwoMethods()

    filename = "{val_acc:.4f}-{epoch}"
    checkpoint = ModelCheckpoint(dirpath=tmpdir, filename=filename, monitor='val_acc', save_top_k=save_top_k)

    trainer = Trainer(default_root_dir=tmpdir, callbacks=[checkpoint], overfit_batches=0.20, max_epochs=2)
    trainer.fit(model)

    ckpt_files = list(Path(tmpdir).glob('*.ckpt'))

    metrics = trainer.dev_debugger.logged_metrics
    expected_filenames = {f'val_acc={metric["val_acc"]:.4f}-epoch={metric["epoch"]}.ckpt' for metric in metrics}
    for ckpt_file in ckpt_files:
        assert os.path.basename(ckpt_file) in expected_filenames

def test_trainer_arg(tmpdir, scale_arg):
    """ Check that trainer arg works with bool input. """
    tutils.reset_seed()

    hparams = EvalModelTemplate.get_default_hparams()
    model = EvalModelTemplate(**hparams)

    before_batch_size = hparams.get('batch_size')
    # logger file to get meta
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        auto_scale_batch_size=scale_arg,
    )

    trainer.fit(model)
    after_batch_size = model.batch_size
    assert before_batch_size != after_batch_size, \
        'Batch size was not altered after running auto scaling of batch size'

def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, with_hpc: bool = True):
    reset_seed()
    save_dir = trainer_options['default_root_dir']

    # logger file to get meta
    logger = get_default_logger(save_dir, version=version)
    trainer_options.update(logger=logger)

    if 'checkpoint_callback' not in trainer_options:
        trainer_options.update(checkpoint_callback=True)

    trainer = Trainer(**trainer_options)
    initial_values = torch.tensor([torch.sum(torch.abs(x)) for x in model.parameters()])
    result = trainer.fit(model)
    post_train_values = torch.tensor([torch.sum(torch.abs(x)) for x in model.parameters()])

    assert result == 1, 'trainer failed'
    # Check that the model is actually changed post-training
    assert torch.norm(initial_values - post_train_values) > 0.1

    # test model loading
    pretrained_model = load_model_from_checkpoint(logger, trainer.checkpoint_callback.best_model_path)

    # test new model accuracy
    test_loaders = model.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    for dataloader in test_loaders:
        run_prediction(dataloader, pretrained_model)

    if with_hpc:
        if trainer.use_ddp or trainer.use_ddp2:
            # on hpc this would work fine... but need to hack it for the purpose of the test
            trainer.model = pretrained_model
            trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = \
                trainer.init_optimizers(pretrained_model)

        # test HPC loading / saving
        trainer.checkpoint_connector.hpc_save(save_dir, logger)
        trainer.checkpoint_connector.hpc_load(save_dir, on_gpu=on_gpu)

def test_result_obj_predictions(tmpdir, test_option, do_train, gpus):
    tutils.reset_seed()

    dm = TrialMNISTDataModule(tmpdir)
    prediction_file = Path(tmpdir) / 'predictions.pt'

    model = EvalModelTemplate()
    model.test_option = test_option
    model.prediction_file = prediction_file.as_posix()
    model.test_step = model.test_step_result_preds
    model.test_step_end = None
    model.test_epoch_end = None
    model.test_end = None

    if prediction_file.exists():
        prediction_file.unlink()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
        deterministic=True,
        gpus=gpus,
    )

    # Prediction file shouldn't exist yet because we haven't done anything
    assert not prediction_file.exists()

    if do_train:
        result = trainer.fit(model, dm)
        assert result == 1

        result = trainer.test(datamodule=dm)
        result = result[0]
        assert result['test_loss'] < 0.6
        assert result['test_acc'] > 0.8
    else:
        result = trainer.test(model, datamodule=dm)

    # check prediction file now exists and is of expected length
    assert prediction_file.exists()
    predictions = torch.load(prediction_file)
    assert len(predictions) == len(dm.mnist_test)

def test_full_loop_single_gpu(tmpdir):
    reset_seed()

    dm = TrialMNISTDataModule(tmpdir)
    model = EvalModelTemplate()

    trainer = Trainer(default_root_dir=tmpdir, max_epochs=3, weights_summary=None, gpus=1)

    # fit model
    result = trainer.fit(model, dm)
    assert result == 1

    # test
    result = trainer.test(datamodule=dm)
    result = result[0]
    assert result['test_acc'] > 0.8

def test_model_checkpoint_to_yaml(tmpdir, save_top_k):
    """ Test that None in checkpoint callback is valid and that ckpt_path is set correctly """
    tutils.reset_seed()
    model = LogInTwoMethods()

    checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor='early_stop_on', save_top_k=save_top_k)

    trainer = Trainer(default_root_dir=tmpdir, callbacks=[checkpoint], overfit_batches=0.20, max_epochs=2)
    trainer.fit(model)

    path_yaml = os.path.join(tmpdir, 'best_k_models.yaml')
    checkpoint.to_yaml(path_yaml)
    with open(path_yaml, 'r') as fp:
        d = yaml.full_load(fp)
    best_k = {k: v for k, v in checkpoint.best_k_models.items()}
    assert d == best_k

def test_train_val_loop_only(tmpdir):
    reset_seed()

    dm = TrialMNISTDataModule(tmpdir)
    model = EvalModelTemplate()
    model.validation_step = None
    model.validation_step_end = None
    model.validation_epoch_end = None

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
    )

    # fit model
    result = trainer.fit(model, dm)
    assert result == 1
    assert trainer.logger_connector.callback_metrics['loss'] < 0.6

def test_call_to_trainer_method(tmpdir, scale_method):
    """ Test that calling the trainer method itself works. """
    tutils.reset_seed()

    hparams = EvalModelTemplate.get_default_hparams()
    model = EvalModelTemplate(**hparams)

    before_batch_size = hparams.get('batch_size')
    # logger file to get meta
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
    )

    after_batch_size = trainer.tuner.scale_batch_size(model, mode=scale_method, max_trials=5)
    model.batch_size = after_batch_size
    trainer.fit(model)

    assert before_batch_size != after_batch_size, \
        'Batch size was not altered after running auto scaling of batch size'

def test_model_checkpoint_correct_score(tmpdir, save_top_k):
    """Test that when a model checkpoint is saved, it saves with the correct score appended to ckpt_path"""
    os.environ['PL_DEV_DEBUG'] = '1'
    tutils.reset_seed()

    model = EvalModelTemplate()

    filepath = os.path.join(tmpdir, "{val_acc:.4f}-{epoch}")
    checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_acc', save_top_k=save_top_k)

    trainer = Trainer(default_root_dir=tmpdir, checkpoint_callback=checkpoint, overfit_batches=0.20, max_epochs=2)
    trainer.fit(model)

    ckpt_files = list(Path(tmpdir).glob('*.ckpt'))

    metrics = trainer.dev_debugger.logged_metrics
    expected_filenames = {f'val_acc={metric["val_acc"]:.4f}-epoch={metric["epoch"]}.ckpt' for metric in metrics}
    for ckpt_file in ckpt_files:
        assert os.path.basename(ckpt_file) in expected_filenames

def test_model_reset_correctly(tmpdir):
    """ Check that model weights are correctly reset after scaling batch size. """
    tutils.reset_seed()

    model = EvalModelTemplate()

    # logger file to get meta
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
    )

    before_state_dict = deepcopy(model.state_dict())

    trainer.tuner.scale_batch_size(model, max_trials=5)

    after_state_dict = model.state_dict()

    for key in before_state_dict.keys():
        assert torch.all(torch.eq(before_state_dict[key], after_state_dict[key])), \
            'Model was not reset correctly after scaling batch size'

def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, with_hpc: bool = True):
    reset_seed()
    save_dir = trainer_options['default_root_dir']

    # logger file to get meta
    logger = get_default_logger(save_dir, version=version)
    trainer_options.update(logger=logger)

    if 'checkpoint_callback' not in trainer_options:
        trainer_options.update(checkpoint_callback=True)

    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)

    # correct result and ok accuracy
    assert result == 1, 'trainer failed'

    # test model loading
    pretrained_model = load_model_from_checkpoint(logger, trainer.checkpoint_callback.best_model_path)

    # test new model accuracy
    test_loaders = model.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    for dataloader in test_loaders:
        run_prediction(dataloader, pretrained_model)

    if with_hpc:
        if trainer.use_ddp or trainer.use_ddp2:
            # on hpc this would work fine... but need to hack it for the purpose of the test
            trainer.model = pretrained_model
            trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = \
                trainer.init_optimizers(pretrained_model)

        # test HPC loading / saving
        trainer.hpc_save(save_dir, logger)
        trainer.hpc_load(save_dir, on_gpu=on_gpu)

def test_model_checkpoint_with_non_string_input(tmpdir, save_top_k):
    """Test that dirpath=None in checkpoint callback is valid and that ckpt_path is set correctly"""
    tutils.reset_seed()
    model = LogInTwoMethods()

    checkpoint = ModelCheckpoint(monitor='early_stop_on', dirpath=None, filename='{epoch}', save_top_k=save_top_k)
    max_epochs = 2
    trainer = Trainer(
        default_root_dir=tmpdir,
        callbacks=[checkpoint],
        overfit_batches=0.20,
        max_epochs=max_epochs,
    )
    trainer.fit(model)
    assert checkpoint.dirpath == tmpdir / trainer.logger.name / "version_0" / "checkpoints"

    if save_top_k == -1:
        ckpt_files = os.listdir(checkpoint.dirpath)
        expected_ckpt_files = [f'epoch={i}.ckpt' for i in range(max_epochs)]
        assert len(ckpt_files) == len(expected_ckpt_files) == max_epochs
        assert set(ckpt_files) == set(expected_ckpt_files)

def test_lr_monitor_single_lr(tmpdir):
    """ Test that learning rates are extracted and logged for single lr scheduler. """
    tutils.reset_seed()

    model = EvalModelTemplate()
    model.configure_optimizers = model.configure_optimizers__single_scheduler

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=0.1,
        limit_train_batches=0.5,
        callbacks=[lr_monitor],
    )
    result = trainer.fit(model)
    assert result

    assert lr_monitor.lrs, 'No learning rates logged'
    assert len(lr_monitor.lrs) == len(trainer.lr_schedulers), \
        'Number of learning rates logged does not match number of lr schedulers'
    assert all([k in ['lr-Adam'] for k in lr_monitor.lrs.keys()]), \
        'Names of learning rates not set correctly'

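# configure_optimizers__single_scheduler is a variant defined on EvalModelTemplate
# and not shown in this file. A minimal sketch of the shape the assertions above
# rely on (one Adam optimizer plus one scheduler, hence the single 'lr-Adam' key)
# could look like the following; the optimizer, scheduler, and hyperparameter
# values are assumptions for illustration only.
import torch


def _configure_optimizers__single_scheduler_sketch(self):
    # one optimizer + one scheduler -> LearningRateMonitor logs a single 'lr-Adam' key
    optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 0.95 ** epoch)
    return [optimizer], [lr_scheduler]
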
def test_trainer_attached_to_dm(tmpdir):
    reset_seed()

    dm = TrialMNISTDataModule(tmpdir)
    model = EvalModelTemplate()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
        deterministic=True,
    )

    # fit model
    result = trainer.fit(model, dm)
    assert result == 1
    assert dm.trainer is not None

    # test
    result = trainer.test(datamodule=dm)
    result = result[0]
    assert dm.trainer is not None

def test_auto_scale_batch_size_set_model_attribute(tmpdir, use_hparams):
    """ Test that new batch size gets written to the correct hyperparameter attribute. """
    tutils.reset_seed()

    hparams = EvalModelTemplate.get_default_hparams()
    before_batch_size = hparams.get('batch_size')

    class HparamsEvalModelTemplate(EvalModelTemplate):

        def dataloader(self, *args, **kwargs):
            # artificially set batch_size so we can get a dataloader
            # remove it immediately after, because we want only self.hparams.batch_size
            setattr(self, "batch_size", before_batch_size)
            dataloader = super().dataloader(*args, **kwargs)
            del self.batch_size
            return dataloader

    datamodule_model = MNISTDataModule(data_dir=tmpdir, batch_size=111)  # this datamodule should get ignored!
    datamodule_fit = MNISTDataModule(data_dir=tmpdir, batch_size=before_batch_size)

    model_class = HparamsEvalModelTemplate if use_hparams else EvalModelTemplate
    model = model_class(**hparams)
    model.datamodule = datamodule_model  # unused when another module gets passed to .tune() / .fit()

    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, auto_scale_batch_size=True, gpus=1)
    trainer.tune(model, datamodule_fit)
    after_batch_size = model.hparams.batch_size if use_hparams else model.batch_size

    assert trainer.datamodule == datamodule_fit
    assert before_batch_size != after_batch_size
    assert after_batch_size <= len(trainer.train_dataloader.dataset)
    assert datamodule_fit.batch_size == after_batch_size
    # should be left unchanged, since it was not passed to .tune()
    assert datamodule_model.batch_size == 111

def test_loading_meta_tags(tmpdir):
    """ test for backward compatibility to meta_tags.csv """
    tutils.reset_seed()

    hparams = EvalModelTemplate.get_default_hparams()

    # save tags
    logger = tutils.get_default_logger(tmpdir)
    logger.log_hyperparams(Namespace(some_str='a_str', an_int=1, a_float=2.0))
    logger.log_hyperparams(hparams)
    logger.save()

    # load hparams
    path_expt_dir = tutils.get_data_path(logger, path_dir=tmpdir)
    hparams_path = os.path.join(path_expt_dir, TensorBoardLogger.NAME_HPARAMS_FILE)
    hparams = load_hparams_from_yaml(hparams_path)

    # save as legacy meta_tags.csv
    tags_path = os.path.join(path_expt_dir, 'meta_tags.csv')
    save_hparams_to_tags_csv(tags_path, hparams)

    tags = load_hparams_from_tags_csv(tags_path)

    assert hparams == tags

def test_auto_scale_batch_size_set_model_attribute(tmpdir, use_hparams):
    """ Test that new batch size gets written to the correct hyperparameter attribute. """
    tutils.reset_seed()

    hparams = EvalModelTemplate.get_default_hparams()
    before_batch_size = hparams.get('batch_size')

    class HparamsEvalModelTemplate(EvalModelTemplate):

        def dataloader(self, *args, **kwargs):
            # artificially set batch_size so we can get a dataloader
            # remove it immediately after, because we want only self.hparams.batch_size
            setattr(self, "batch_size", before_batch_size)
            dataloader = super().dataloader(*args, **kwargs)
            del self.batch_size
            return dataloader

    model_class = HparamsEvalModelTemplate if use_hparams else EvalModelTemplate
    model = model_class(**hparams)

    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, auto_scale_batch_size=True)
    trainer.fit(model)
    after_batch_size = model.hparams.batch_size if use_hparams else model.batch_size
    assert before_batch_size != after_batch_size

def test_full_loop_dp(tmpdir):
    reset_seed()

    dm = TrialMNISTDataModule(tmpdir)
    model = EvalModelTemplate()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
        distributed_backend='dp',
        gpus=2,
        deterministic=True,
    )

    # fit model
    result = trainer.fit(model, dm)
    assert result == 1

    # test
    result = trainer.test(datamodule=dm)
    result = result[0]
    assert result['test_acc'] > 0.8

def test_lr_monitor_single_lr_with_momentum(tmpdir):
    """ Test that learning rates and momentum are extracted and logged for single lr scheduler. """
    tutils.reset_seed()

    model = EvalModelTemplate()
    model.configure_optimizers = model.configure_optimizers__onecycle_scheduler

    lr_monitor = LearningRateMonitor(log_momentum=True)
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=0.1,
        limit_train_batches=0.5,
        callbacks=[lr_monitor],
    )
    result = trainer.fit(model)
    assert result

    assert all(v is not None for v in lr_monitor.last_momentum_values.values()), \
        'Expected momentum to be logged'
    assert len(lr_monitor.last_momentum_values) == len(trainer.lr_schedulers), \
        'Number of momentum values logged does not match number of lr schedulers'
    assert all([k in ['lr-SGD-momentum'] for k in lr_monitor.last_momentum_values.keys()]), \
        'Names of momentum values not set correctly'

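# Likewise, configure_optimizers__onecycle_scheduler is defined on EvalModelTemplate
# and not shown here. A sketch of the shape the momentum assertions above depend on
# (SGD with momentum plus a OneCycleLR scheduler, hence the 'lr-SGD-momentum' key);
# the hyperparameter values are assumptions for illustration only.
import torch


def _configure_optimizers__onecycle_scheduler_sketch(self):
    optimizer = torch.optim.SGD(self.parameters(), lr=0.1, momentum=0.9)
    # OneCycleLR cycles momentum by default, which is what log_momentum=True picks up
    lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.1, total_steps=100)
    return [optimizer], [lr_scheduler]
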
def test_full_loop_ddp_spawn(tmpdir):
    import os
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

    reset_seed()

    dm = TrialMNISTDataModule(tmpdir)
    model = EvalModelTemplate()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
        distributed_backend='ddp_spawn',
        gpus=[0, 1],
    )

    # fit model
    result = trainer.fit(model, dm)
    assert result == 1

    # test
    result = trainer.test(datamodule=dm)
    result = result[0]
    assert result['test_acc'] > 0.8

def test_numpy_metric_ddp():
    tutils.reset_seed()
    tutils.set_random_master_port()

    world_size = 2
    mp.spawn(_ddp_test_numpy_metric, args=(world_size,), nprocs=world_size)

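# _ddp_test_numpy_metric, the per-process worker spawned above, is defined elsewhere
# in the test suite. A minimal self-contained sketch of the pattern it follows
# (join the process group, reduce a value across ranks, assert the synced result)
# is given below; the worker body and the reduced value are assumptions, not the
# actual metric test.
import os

import torch
import torch.distributed as dist


def _ddp_test_numpy_metric_sketch(rank, worldsize):
    # each spawned process joins the same gloo process group
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29501')
    dist.init_process_group('gloo', rank=rank, world_size=worldsize)

    # every rank contributes (rank + 1); after all_reduce each rank sees the sum
    value = torch.tensor(float(rank + 1))
    dist.all_reduce(value, op=dist.ReduceOp.SUM)
    assert value.item() == sum(r + 1 for r in range(worldsize))

    dist.destroy_process_group()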