def test_model_saving_loading():
    """Verify a trainer-saved checkpoint can be reloaded from tags independently.

    Trains for one epoch, saves a checkpoint plus the meta-tags file, reloads
    the model via ``load_from_metrics`` and checks predictions are unchanged.

    :return: None
    """
    reset_seed()

    hparams = get_hparams()
    model = LightningTestModel(hparams)

    save_dir = init_save_dir()

    # logger file to get meta
    logger = get_test_tube_logger(False)
    logger.log_hyperparams(hparams)
    logger.save()

    trainer_options = dict(
        max_nb_epochs=1,
        logger=logger,
        checkpoint_callback=ModelCheckpoint(save_dir),
    )

    # fit model
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)

    # training complete
    # FIX: message previously said 'amp + ddp model failed to complete',
    # which was a copy-paste error — this test runs neither amp nor ddp.
    assert result == 1, 'training failed to complete'

    # grab one batch to predict on
    # NOTE: the inner `break` only exits the inner loop, so `batch` ends up
    # being the first batch of the *last* test dataloader.
    for dataloader in model.test_dataloader():
        for batch in dataloader:
            break

    x, y = batch
    x = x.view(x.size(0), -1)

    # generate preds before saving model
    model.eval()
    pred_before_saving = model(x)

    # save model
    new_weights_path = os.path.join(save_dir, 'save_test.ckpt')
    trainer.save_checkpoint(new_weights_path)

    # load new model from the saved weights + meta tags
    tags_path = logger.experiment.get_data_path(logger.experiment.name,
                                                logger.experiment.version)
    tags_path = os.path.join(tags_path, 'meta_tags.csv')
    model_2 = LightningTestModel.load_from_metrics(
        weights_path=new_weights_path,
        tags_csv=tags_path)
    model_2.eval()

    # assert that both predictions are the same
    new_pred = model_2(x)
    assert torch.all(torch.eq(pred_before_saving, new_pred)).item() == 1

    clear_save_dir()
def test_running_test_pretrained_model_ddp():
    """Verify test() on a pretrained model trained with ddp across 2 GPUs.

    Skipped when the environment cannot run GPU tests.

    NOTE(review): a second definition of this same function name exists later
    in the file; the later definition shadows this one at import time —
    confirm which variant is intended to run.
    """
    if not can_run_gpu_test():
        return

    reset_seed()
    set_random_master_port()

    hparams = get_hparams()
    model = LightningTestModel(hparams)

    save_dir = init_save_dir()

    # exp file to get meta
    logger = get_test_tube_logger(False)

    # exp file to get weights
    checkpoint = init_checkpoint_callback(logger)

    trainer_options = dict(
        show_progress_bar=False,
        max_nb_epochs=1,
        train_percent_check=0.4,
        val_percent_check=0.2,
        checkpoint_callback=checkpoint,
        logger=logger,
        gpus=[0, 1],
        distributed_backend='ddp',
    )

    # fit model
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)

    exp = logger.experiment
    print(os.listdir(exp.get_data_path(exp.name, exp.version)))

    # correct result and ok accuracy
    assert result == 1, 'training failed to complete'
    pretrained_model = load_model(logger.experiment, save_dir,
                                  module_class=LightningTestModel)

    # run test set
    new_trainer = Trainer(**trainer_options)
    new_trainer.test(pretrained_model)

    # FIX: was a side-effect list comprehension that built a throwaway list;
    # a plain loop is the idiomatic form for side effects.
    for dataloader in model.test_dataloader():
        run_prediction(dataloader, pretrained_model)

    # test we have good test accuracy
    clear_save_dir()
def test_running_test_pretrained_model_ddp(tmpdir):
    """Verify `test()` on pretrained model."""
    if not tutils.can_run_gpu_test():
        return

    tutils.reset_seed()
    tutils.set_random_master_port()

    hparams = tutils.get_hparams()
    model = LightningTestModel(hparams)

    # logger writes the experiment meta under tmpdir
    logger = tutils.get_test_tube_logger(tmpdir, False)

    # checkpoint callback provides the saved weights
    ckpt_callback = tutils.init_checkpoint_callback(logger)

    trainer_options = {
        'show_progress_bar': False,
        'max_epochs': 1,
        'train_percent_check': 0.4,
        'val_percent_check': 0.2,
        'checkpoint_callback': ckpt_callback,
        'logger': logger,
        'gpus': [0, 1],
        'distributed_backend': 'ddp',
    }

    # train for one epoch under ddp
    trainer = Trainer(**trainer_options)
    fit_result = trainer.fit(model)

    experiment = logger.experiment
    logging.info(os.listdir(experiment.get_data_path(experiment.name,
                                                     experiment.version)))

    # training must have finished successfully
    assert fit_result == 1, 'training failed to complete'

    # restore the trained weights into a fresh model
    pretrained_model = tutils.load_model(logger.experiment,
                                         trainer.checkpoint_callback.filepath,
                                         module_class=LightningTestModel)

    # run the test set on the restored model
    test_trainer = Trainer(**trainer_options)
    test_trainer.test(pretrained_model)

    for loader in model.test_dataloader():
        tutils.run_prediction(loader, pretrained_model)
def test_cpu_slurm_save_load():
    """Verify model save/load/checkpoint on CPU via a simulated SLURM snapshot.

    Trains one epoch, snapshots with ``hpc_save``, then restarts training on a
    fresh Trainer/model pair and asserts (inside an ``on_epoch_start`` hook)
    that the restored model resumes at the saved global step and predicts
    identically before any new weight updates.

    :return: None
    """
    reset_seed()

    hparams = get_hparams()
    model = LightningTestModel(hparams)

    save_dir = init_save_dir()

    # logger file to get meta
    logger = get_test_tube_logger(False)
    version = logger.version

    trainer_options = dict(
        max_nb_epochs=1,
        logger=logger,
        checkpoint_callback=ModelCheckpoint(save_dir),
    )

    # fit model
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)

    real_global_step = trainer.global_step

    # training complete
    # FIX: message previously said 'amp + ddp model failed to complete',
    # which was a copy-paste error — this is a CPU test with no amp/ddp.
    assert result == 1, 'training failed to complete'

    # predict with trained model before saving
    # NOTE: the inner `break` only exits the inner loop, so `batch` ends up
    # being the first batch of the *last* test dataloader.
    for dataloader in model.test_dataloader():
        for batch in dataloader:
            break

    x, y = batch
    x = x.view(x.size(0), -1)

    model.eval()
    pred_before_saving = model(x)

    # test HPC saving: simulate a snapshot on slurm
    saved_filepath = trainer.hpc_save(save_dir, logger)
    assert os.path.exists(saved_filepath)

    # new logger with the *same* version so the HPC weights are located
    logger = get_test_tube_logger(False, version=version)

    trainer_options = dict(
        max_nb_epochs=1,
        logger=logger,
        checkpoint_callback=ModelCheckpoint(save_dir),
    )
    trainer = Trainer(**trainer_options)
    model = LightningTestModel(hparams)

    # set the epoch start hook so we can predict before the model does the full training
    def assert_pred_same():
        assert trainer.global_step == real_global_step and trainer.global_step > 0

        # predict with loaded model to make sure answers are the same
        trainer.model.eval()
        new_pred = trainer.model(x)
        assert torch.all(torch.eq(pred_before_saving, new_pred)).item() == 1

    model.on_epoch_start = assert_pred_same

    # by calling fit again, we trigger training, loading weights from the cluster
    # and our hook to predict using current model before any more weight updates
    trainer.fit(model)

    clear_save_dir()