def test_strict_model_load_more_params(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
    """Check ``strict`` handling when the checkpoint holds more parameters than the model.

    A ``BoringModel`` is given an extra ``c_d3`` linear layer, trained briefly and
    saved. Loading that checkpoint into a plain ``BoringModel`` must succeed with
    ``strict=False`` and must raise a ``RuntimeError`` naming the unexpected
    ``c_d3`` keys with ``strict=True``.

    Args:
        monkeypatch: pytest fixture, used here to set ``TORCH_HOME``.
        tmpdir: pytest temporary directory; used as root dir, checkpoint dir and
            as the location of the saved checkpoint.
        tmpdir_server: indexed as ``(host, port)`` to build the checkpoint URL
            (presumably an HTTP server serving ``tmpdir`` -- confirm in the fixture).
        url_ckpt: when truthy the checkpoint is loaded through the URL instead of
            the local file path.
    """
    # set $TORCH_HOME, which determines torch hub's cache path, to tmpdir.
    # setenv expects a str: passing the path object itself makes pytest emit a
    # PytestWarning and convert implicitly, so convert explicitly instead.
    monkeypatch.setenv('TORCH_HOME', str(tmpdir))

    model = BoringModel()
    # Extra layer that a plain BoringModel does not have
    model.c_d3 = torch.nn.Linear(32, 32)

    # logger file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # fit model
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=2,
        logger=logger,
        callbacks=[ModelCheckpoint(dirpath=tmpdir)],
    )
    trainer.fit(model)

    # training complete
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

    # save model
    new_weights_path = os.path.join(tmpdir, 'save_test.ckpt')
    trainer.save_checkpoint(new_weights_path)

    # load new model, either from the local file or through the URL
    hparams_path = os.path.join(tutils.get_data_path(logger, path_dir=tmpdir), 'hparams.yaml')
    ckpt_url = f'http://{tmpdir_server[0]}:{tmpdir_server[1]}/{os.path.basename(new_weights_path)}'
    ckpt_path = ckpt_url if url_ckpt else new_weights_path

    # non-strict loading tolerates the extra ``c_d3`` entries
    BoringModel.load_from_checkpoint(
        checkpoint_path=ckpt_path,
        hparams_file=hparams_path,
        strict=False,
    )

    # strict loading must reject them; dots are escaped so the pattern only
    # matches the literal key names
    with pytest.raises(
        RuntimeError,
        match=r'Unexpected key\(s\) in state_dict: "c_d3\.weight", "c_d3\.bias"',
    ):
        BoringModel.load_from_checkpoint(
            checkpoint_path=ckpt_path,
            hparams_file=hparams_path,
            strict=True,
        )
def run_test_from_config(trainer_options, on_gpu, check_size=True):
    """Fit and test a small model from ``trainer_options``, then exercise checkpointing.

    Args:
        trainer_options: keyword arguments for ``Trainer``. Must contain
            ``"weights_save_path"``; it is updated in place with a
            ``ModelCheckpoint`` callback writing to that path.
        on_gpu: whether the model is expected on a CUDA device during training;
            also enables the final ``root_gpu`` check.
        check_size: when true, assert that ``hvd.size()`` equals 2.

    Ranks other than 0 return right after the size check.
    """
    set_random_master_port()
    reset_seed()

    save_dir = trainer_options["weights_save_path"]
    trainer_options.update(callbacks=[ModelCheckpoint(dirpath=save_dir)])

    class TestModel(BoringModel):
        """BoringModel with device and reduction assertions hooked into training."""

        def on_train_start(self) -> None:
            # the module must sit on this rank's GPU, or on CPU when not on_gpu
            if on_gpu:
                wanted_device = torch.device("cuda", self.trainer.local_rank)
            else:
                wanted_device = torch.device("cpu")
            assert self.device == wanted_device

        def training_epoch_end(self, outputs) -> None:
            # sum-reducing a single 1.0 is compared against the world size
            plugin = self.trainer.training_type_plugin
            one = torch.tensor(1.0, device=self.device)
            reduced = plugin.reduce(one, reduce_op="sum")
            assert reduced.sum() == plugin.world_size

    model = TestModel()
    trainer = Trainer(**trainer_options)

    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    trainer.test(model)

    cpu_device = torch.device("cpu")
    assert model.device == cpu_device

    # Horovod should be initialized following training. If not, this will raise an exception.
    if check_size:
        assert hvd.size() == 2

    # everything below only runs on global rank 0
    if trainer.global_rank > 0:
        return

    # reload the best checkpoint into a plain BoringModel
    best_ckpt = trainer.checkpoint_callback.best_model_path
    reloaded_model = BoringModel.load_from_checkpoint(best_ckpt)

    # push one batch from every test dataloader through the reloaded model
    loaders = model.test_dataloader()
    if not isinstance(loaders, list):
        loaders = [loaders]
    for loader in loaders:
        reloaded_model(next(iter(loader)))

    # HPC save, then restore from the newest checkpoint found in the folder
    connector = trainer.checkpoint_connector
    connector.hpc_save(save_dir, trainer.logger)
    latest_ckpt = connector.get_max_ckpt_path_from_folder(save_dir)
    connector.restore(latest_ckpt)

    if on_gpu:
        trainer = Trainer(gpus=1, accelerator="horovod", max_epochs=1)
        # Test the root_gpu property
        assert trainer.root_gpu == hvd.local_rank()
def test_model_saving_loading(tmpdir):
    """Round-trip a trained model through a checkpoint and compare predictions.

    The model is fitted briefly, a prediction is taken on one test batch, the
    trainer saves a checkpoint, and a second model is rebuilt from that
    checkpoint plus the logged ``hparams.yaml``. Both models must produce
    element-wise equal outputs for the same batch.
    """
    net = BoringModel()

    # logger whose data path is used further down to locate hparams.yaml
    meta_logger = tutils.get_default_logger(tmpdir)

    # short training run
    checkpoint_cb = ModelCheckpoint(dirpath=tmpdir)
    trainer = Trainer(
        default_root_dir=tmpdir,
        logger=meta_logger,
        callbacks=[checkpoint_cb],
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=2,
    )
    trainer.fit(net)

    # training complete
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

    # take one batch from the first (or only) test dataloader
    loaders = net.test_dataloader()
    first_loader = loaders[0] if isinstance(loaders, list) else loaders
    sample = next(iter(first_loader))

    # reference prediction from the in-memory model
    net.eval()
    expected = net(sample)

    # write the checkpoint
    ckpt_file = os.path.join(tmpdir, 'save_test.ckpt')
    trainer.save_checkpoint(ckpt_file)

    # rebuild a second model from checkpoint + hparams file
    hparams_file = os.path.join(tutils.get_data_path(meta_logger, path_dir=tmpdir), 'hparams.yaml')
    restored = BoringModel.load_from_checkpoint(
        checkpoint_path=ckpt_file,
        hparams_file=hparams_file,
    )
    restored.eval()

    # both models must agree on every element
    actual = restored(sample)
    elementwise_equal = torch.eq(expected, actual)
    assert torch.all(elementwise_equal).item() == 1
def run_test_from_config(trainer_options, on_gpu, check_size=True):
    """Fit a ``BoringModel`` from ``trainer_options``, then exercise checkpointing.

    Args:
        trainer_options: keyword arguments for ``Trainer``. Must contain
            ``'weights_save_path'``; it is updated in place with a
            ``ModelCheckpoint`` callback writing to that path.
        on_gpu: forwarded to the HPC load call; also enables the final
            ``root_gpu`` check.
        check_size: when true, assert that ``hvd.size()`` equals 2.

    Ranks other than 0 return right after the size check.
    """
    set_random_master_port()
    reset_seed()

    save_dir = trainer_options['weights_save_path']
    trainer_options.update(callbacks=[ModelCheckpoint(dirpath=save_dir)])

    model = BoringModel()
    trainer = Trainer(**trainer_options)
    trainer.fit(model)
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

    # Horovod should be initialized following training. If not, this will raise an exception.
    if check_size:
        assert hvd.size() == 2

    # everything below only runs on global rank 0
    if trainer.global_rank > 0:
        return

    # reload the best checkpoint into a fresh model
    best_ckpt = trainer.checkpoint_callback.best_model_path
    reloaded_model = BoringModel.load_from_checkpoint(best_ckpt)

    # push one batch from every test dataloader through the reloaded model
    loaders = model.test_dataloader()
    if not isinstance(loaders, list):
        loaders = [loaders]
    for loader in loaders:
        reloaded_model(next(iter(loader)))

    # HPC save, then load back the newest checkpoint found in the folder
    connector = trainer.checkpoint_connector
    connector.hpc_save(save_dir, trainer.logger)
    latest_ckpt = connector.get_max_ckpt_path_from_folder(save_dir)
    connector.hpc_load(latest_ckpt, on_gpu=on_gpu)

    if on_gpu:
        trainer = Trainer(gpus=1, accelerator='horovod', max_epochs=1)
        # Test the root_gpu property
        assert trainer.root_gpu == hvd.local_rank()