def test_strict_model_load_more_params(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
    """Tests use case where trainer saves the model, and user loads it from tags independently."""
    # set $TORCH_HOME, which determines torch hub's cache path, to tmpdir
    monkeypatch.setenv("TORCH_HOME", tmpdir)

    model = BoringModel()
    # Extra layer
    model.c_d3 = torch.nn.Linear(32, 32)

    # logger file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # fit model
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=2,
        logger=logger,
        callbacks=[ModelCheckpoint(dirpath=tmpdir)],
    )
    trainer.fit(model)

    # training complete
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    # save model
    new_weights_path = os.path.join(tmpdir, "save_test.ckpt")
    trainer.save_checkpoint(new_weights_path)

    # load new model
    hparams_path = os.path.join(tutils.get_data_path(logger, path_dir=tmpdir), "hparams.yaml")
    hparams_url = f"http://{tmpdir_server[0]}:{tmpdir_server[1]}/{os.path.basename(new_weights_path)}"
    ckpt_path = hparams_url if url_ckpt else new_weights_path

    BoringModel.load_from_checkpoint(checkpoint_path=ckpt_path, hparams_file=hparams_path, strict=False)

    with pytest.raises(RuntimeError, match=r'Unexpected key\(s\) in state_dict: "c_d3.weight", "c_d3.bias"'):
        BoringModel.load_from_checkpoint(checkpoint_path=ckpt_path, hparams_file=hparams_path, strict=True)
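
# A minimal sketch (not part of the original tests) of what the `strict` flag above exercises:
# Lightning forwards it to `torch.nn.Module.load_state_dict`, which either reports or rejects
# keys with no matching parameter. The helper name and the two toy models below are
# illustrative assumptions, not taken from the test suite.
def _strict_loading_sketch():
    src = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Linear(4, 4))  # checkpointed model with an extra layer
    dst = torch.nn.Sequential(torch.nn.Linear(4, 4))  # model definition without that layer

    # strict=False loads the overlapping keys and reports the leftovers
    result = dst.load_state_dict(src.state_dict(), strict=False)
    assert set(result.unexpected_keys) == {"1.weight", "1.bias"}

    # strict=True raises a RuntimeError naming the unexpected keys, which is what the
    # `pytest.raises(RuntimeError, ...)` block in the test above relies on
    with pytest.raises(RuntimeError, match=r"Unexpected key\(s\) in state_dict"):
        dst.load_state_dict(src.state_dict(), strict=True)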
def test_ddp_sharded_strategy_finetune(tmpdir):
    """Test to ensure that we can save and restart training (simulate fine-tuning)."""
    model = BoringModel()
    trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp_sharded_spawn", fast_dev_run=True)
    trainer.fit(model)

    checkpoint_path = os.path.join(tmpdir, "model.pt")
    trainer.save_checkpoint(checkpoint_path)
    saved_model = BoringModel.load_from_checkpoint(checkpoint_path)

    trainer = Trainer(fast_dev_run=True)
    trainer.fit(saved_model)
def test_model_saving_loading(tmpdir):
    """Tests use case where trainer saves the model, and user loads it from tags independently."""
    model = BoringModel()

    # logger file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # fit model
    trainer = Trainer(
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=2,
        logger=logger,
        callbacks=[ModelCheckpoint(dirpath=tmpdir)],
        default_root_dir=tmpdir,
    )
    trainer.fit(model)

    # training complete
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    # make a prediction
    dataloaders = model.test_dataloader()
    if not isinstance(dataloaders, list):
        dataloaders = [dataloaders]
    batch = next(iter(dataloaders[0]))

    # generate preds before saving model
    model.eval()
    pred_before_saving = model(batch)

    # save model
    new_weights_path = os.path.join(tmpdir, "save_test.ckpt")
    trainer.save_checkpoint(new_weights_path)

    # load new model
    hparams_path = tutils.get_data_path(logger, path_dir=tmpdir)
    hparams_path = os.path.join(hparams_path, "hparams.yaml")
    model_2 = BoringModel.load_from_checkpoint(checkpoint_path=new_weights_path, hparams_file=hparams_path)
    model_2.eval()

    # make prediction and assert that both predictions are the same
    new_pred = model_2(batch)
    assert torch.all(torch.eq(pred_before_saving, new_pred)).item() == 1
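
# A small sketch (an assumption about intent, not the original test's method) of an equivalent,
# more direct formulation of the final round-trip assertion above: the save/load cycle should
# preserve predictions exactly, so an exact comparison with a descriptive failure message works.
def _assert_round_trip_exact(pred_before: torch.Tensor, pred_after: torch.Tensor) -> None:
    # rtol=0 / atol=0 turns assert_close into an exact element-wise comparison
    torch.testing.assert_close(pred_after, pred_before, rtol=0, atol=0)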
def test_ddp_sharded_strategy_checkpoint_multi_gpu(tmpdir):
    """Test to ensure that checkpoint is saved correctly when using multiple GPUs."""
    model = BoringModel()
    trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp_sharded_spawn", fast_dev_run=True)
    trainer.fit(model)

    checkpoint_path = os.path.join(tmpdir, "model.pt")
    trainer.save_checkpoint(checkpoint_path)
    saved_model = BoringModel.load_from_checkpoint(checkpoint_path)

    # Assert model parameters are identical after loading
    for ddp_param, shard_param in zip(model.parameters(), saved_model.parameters()):
        assert torch.equal(ddp_param.to("cpu"), shard_param)
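
# The comparison above pairs tensors positionally with `zip(model.parameters(), ...)`. A sketch
# of an alternative (an assumption, not the original test's method) that compares by state-dict
# key, fails loudly on key mismatches, and also covers buffers such as BatchNorm running stats.
def _assert_state_dicts_equal(module_a: torch.nn.Module, module_b: torch.nn.Module) -> None:
    state_a, state_b = module_a.state_dict(), module_b.state_dict()
    # mismatched keys fail here instead of being silently skipped by zip
    assert state_a.keys() == state_b.keys()
    for key in state_a:
        assert torch.equal(state_a[key].cpu(), state_b[key].cpu()), f"Mismatch in {key}"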
def run_test_from_config(trainer_options, on_gpu, check_size):
    """Trains the default model with the given config."""
    set_random_main_port()
    reset_seed()

    ckpt_path = trainer_options["default_root_dir"]
    trainer_options.update(callbacks=[ModelCheckpoint(dirpath=ckpt_path)])

    class TestModel(BoringModel):
        def on_train_start(self) -> None:
            expected_device = torch.device("cuda", self.trainer.local_rank) if on_gpu else torch.device("cpu")
            assert self.device == expected_device

        def training_epoch_end(self, outputs) -> None:
            res = self.trainer.strategy.reduce(torch.tensor(1.0, device=self.device), reduce_op="sum")
            assert res.sum() == self.trainer.strategy.world_size

    model = TestModel()
    trainer = Trainer(**trainer_options)
    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    trainer.test(model)

    assert model.device == torch.device("cpu")

    # Horovod should be initialized following training. If not, this will raise an exception.
    if check_size:
        assert hvd.size() == 2

    if trainer.global_rank > 0:
        return

    # test model loading
    pretrained_model = BoringModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # test new model accuracy
    test_loaders = model.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    for dataloader in test_loaders:
        batch = next(iter(dataloader))
        pretrained_model(batch)

    # test HPC saving
    # save logger to make sure we get all the metrics
    if trainer.logger:
        trainer.logger.finalize("finished")
    hpc_save_path = trainer._checkpoint_connector.hpc_save_path(ckpt_path)
    trainer.save_checkpoint(hpc_save_path)

    # test HPC loading (private, name-mangled helper on the checkpoint connector)
    checkpoint_path = trainer._checkpoint_connector._CheckpointConnector__get_max_ckpt_path_from_folder(ckpt_path)
    trainer._checkpoint_connector.restore(checkpoint_path)

    if on_gpu:
        trainer = Trainer(accelerator="gpu", devices=1, strategy="horovod", max_epochs=1)
        # test root gpu index
        assert trainer.strategy.root_device.index == hvd.local_rank()