def test_early_stopping_cpu_model(tmpdir):
    """Train a CPU model with early stopping, then verify freeze/unfreeze work."""

    class ModelTrainVal(BoringModel):
        def validation_step(self, *args, **kwargs):
            # Log the validation metric that EarlyStopping monitors.
            out = super().validation_step(*args, **kwargs)
            self.log("val_loss", out["x"])
            return out

    tutils.reset_seed()
    early_stop = EarlyStopping(monitor="val_loss", min_delta=0.1)
    options = {
        "callbacks": [early_stop],
        "default_root_dir": tmpdir,
        "gradient_clip_val": 1.0,
        "track_grad_norm": 2,
        "enable_progress_bar": False,
        "accumulate_grad_batches": 2,
        "limit_train_batches": 0.1,
        "limit_val_batches": 0.1,
    }

    model = ModelTrainVal()
    tpipes.run_model_test(options, model)

    # Exercise freeze/unfreeze on the CPU model after training.
    model.freeze()
    model.unfreeze()
def test_tpu_precision_16_clip_gradients(mock_clip_grad_norm, clip_val, tmpdir):
    """Ensure that clip gradients is only called if the value is greater than 0.

    TODO: Fix (test fails with parametrize)
    """
    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 1,
        "accelerator": "tpu",
        "devices": 1,
        "precision": 16,
        "limit_train_batches": 4,
        "limit_val_batches": 4,
        "gradient_clip_val": clip_val,
    }

    tpipes.run_model_test(options, BoringModel(), with_hpc=False)

    # Clipping should only be invoked for a positive clip value.
    if clip_val <= 0:
        mock_clip_grad_norm.assert_not_called()
    else:
        mock_clip_grad_norm.assert_called()
def test_cpu_model(tmpdir):
    """Make sure model trains on CPU."""
    options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 1,
        "limit_train_batches": 4,
        "limit_val_batches": 4,
    }

    tpipes.run_model_test(options, BoringModel())
def test_single_gpu_model(tmpdir, devices):
    """Make sure training runs on a single (MPS) accelerator device."""
    options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 1,
        "limit_train_batches": 0.1,
        "limit_val_batches": 0.1,
        "accelerator": "mps",
        "devices": devices,
    }

    tpipes.run_model_test(options, BoringModel())
def test_model_tpu_devices_1(tmpdir):
    """Make sure model trains on TPU."""
    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 2,
        "accelerator": "tpu",
        "devices": 1,
        "limit_train_batches": 4,
        "limit_val_batches": 4,
    }

    tpipes.run_model_test(options, BoringModel(), with_hpc=False)
def test_model_tpu_devices_8(tmpdir):
    """Make sure model trains on TPU."""
    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 1,
        "accelerator": "tpu",
        "devices": 8,
        "limit_train_batches": 4,
        "limit_val_batches": 4,
    }

    # 8 cores needs a big dataset
    tpipes.run_model_test(options, SerialLoaderBoringModel(), with_hpc=False, min_acc=0.05)
def test_model_tpu_index(tmpdir, tpu_core):
    """Make sure model trains on TPU."""
    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 2,
        "accelerator": "tpu",
        "devices": [tpu_core],
        "limit_train_batches": 4,
        "limit_val_batches": 4,
    }

    tpipes.run_model_test(options, BoringModel(), with_hpc=False)
    # The requested core must have been selected as the default XLA device.
    assert torch_xla._XLAC._xla_get_default_device() == f"xla:{tpu_core}"
def test_tpu_grad_norm(tmpdir):
    """Test if grad_norm works on TPU."""
    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 4,
        "accelerator": "tpu",
        "devices": 1,
        "limit_train_batches": 0.4,
        "limit_val_batches": 0.4,
        "gradient_clip_val": 0.5,
    }

    tpipes.run_model_test(options, BoringModel(), with_hpc=False)
def test_multi_gpu_model_ddp_spawn(tmpdir):
    """Train a model on two GPUs with the ddp_spawn strategy."""
    tutils.set_random_main_port()
    options = {
        "default_root_dir": tmpdir,
        "max_epochs": 1,
        "limit_train_batches": 10,
        "limit_val_batches": 10,
        "accelerator": "gpu",
        "devices": [0, 1],
        "strategy": "ddp_spawn",
        "enable_progress_bar": False,
    }

    tpipes.run_model_test(options, BoringModel())
def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
    """Train with early stopping on two GPUs using the ddp_spawn strategy."""
    tutils.set_random_main_port()
    options = {
        "default_root_dir": tmpdir,
        "callbacks": [EarlyStopping(monitor="train_acc")],
        "max_epochs": 50,
        "limit_train_batches": 10,
        "limit_val_batches": 10,
        "accelerator": "gpu",
        "devices": [0, 1],
        "strategy": "ddp_spawn",
    }

    datamodule = ClassifDataModule()
    tpipes.run_model_test(options, ClassificationModel(), datamodule)
def test_all_features_cpu_model(tmpdir):
    """Test each of the trainer options."""
    options = {
        "default_root_dir": tmpdir,
        "gradient_clip_val": 1.0,
        "overfit_batches": 0.20,
        "track_grad_norm": 2,
        "enable_progress_bar": False,
        "accumulate_grad_batches": 2,
        "max_epochs": 1,
        "limit_train_batches": 0.4,
        "limit_val_batches": 0.4,
    }

    tpipes.run_model_test(options, BoringModel(), min_acc=0.01)
def test_multi_cpu_model_ddp(tmpdir):
    """Make sure DDP works."""
    tutils.set_random_main_port()
    options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 1,
        "limit_train_batches": 0.4,
        "limit_val_batches": 0.2,
        "accelerator": "cpu",
        "devices": 2,
        "strategy": "ddp_spawn",
    }

    datamodule = ClassifDataModule()
    tpipes.run_model_test(options, ClassificationModel(), data=datamodule)
def test_multi_gpu_early_stop_dp(tmpdir):
    """Make sure DP training works together with early stopping."""
    tutils.set_random_main_port()

    datamodule = ClassifDataModule()
    model = CustomClassificationModelDP()
    options = {
        "default_root_dir": tmpdir,
        "callbacks": [EarlyStopping(monitor="val_acc")],
        "max_epochs": 50,
        "limit_train_batches": 10,
        "limit_val_batches": 10,
        "accelerator": "gpu",
        "devices": [0, 1],
        "strategy": "dp",
    }

    tpipes.run_model_test(options, model, datamodule)
def test_tpu_host_world_size(tmpdir):
    """Test Host World size env setup on TPU."""

    class DebugModel(BoringModel):
        def on_train_start(self):
            # The env var must be set while training is running.
            assert os.environ.get("XRT_HOST_WORLD_SIZE") == str(1)

        def teardown(self, stage):
            # ...and cleaned up again afterwards.
            assert "XRT_HOST_WORLD_SIZE" not in os.environ

    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 4,
        "accelerator": "tpu",
        "devices": 8,
        "limit_train_batches": 0.4,
        "limit_val_batches": 0.4,
    }

    tpipes.run_model_test(options, DebugModel(), with_hpc=False)
def test_model_saves_on_multi_gpu(tmpdir):
    """Test that ONNX model saves on a distributed backend."""
    tutils.set_random_main_port()
    options = {
        "default_root_dir": tmpdir,
        "max_epochs": 1,
        "limit_train_batches": 10,
        "limit_val_batches": 10,
        "accelerator": "gpu",
        "devices": [0, 1],
        "strategy": "ddp_spawn",
        "enable_progress_bar": False,
    }

    model = BoringModel()
    # ONNX export requires an example input.
    model.example_input_array = torch.randn(5, 32)

    tpipes.run_model_test(options, model, min_acc=0.08)

    file_path = os.path.join(tmpdir, "model.onnx")
    model.to_onnx(file_path)
    assert os.path.exists(file_path) is True
def test_tpu_debug_mode(tmpdir):
    """Test if debug mode works on TPU."""

    class DebugModel(BoringModel):
        def on_train_start(self):
            # The strategy's debug flag must be exported during training.
            assert os.environ.get("PT_XLA_DEBUG") == str(
                1), "PT_XLA_DEBUG was not set in environment variables"

        def teardown(self, stage):
            # ...and removed again afterwards.
            assert "PT_XLA_DEBUG" not in os.environ

    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 4,
        "accelerator": "tpu",
        "devices": 8,
        "limit_train_batches": 0.4,
        "limit_val_batches": 0.4,
        "strategy": TPUSpawnStrategy(debug=True),
    }

    tpipes.run_model_test(options, DebugModel(), with_hpc=False)