def test_early_stopping_cpu_model(tmpdir):
    """Run the CPU model test with an ``EarlyStopping`` callback on ``val_loss``.

    A ``BoringModel`` subclass logs ``val_loss`` from its validation step so
    the callback has a metric to monitor. After the run, ``freeze`` and
    ``unfreeze`` are called on the model.
    """

    class ModelTrainVal(BoringModel):
        def validation_step(self, *args, **kwargs):
            # Log the parent's "x" output under the key the callback monitors.
            step_output = super().validation_step(*args, **kwargs)
            self.log("val_loss", step_output["x"])
            return step_output

    tutils.reset_seed()
    early_stop = EarlyStopping(monitor="val_loss", min_delta=0.1)
    run_kwargs = {
        "callbacks": [early_stop],
        "default_root_dir": tmpdir,
        "gradient_clip_val": 1.0,
        "track_grad_norm": 2,
        "enable_progress_bar": False,
        "accumulate_grad_batches": 2,
        "limit_train_batches": 0.1,
        "limit_val_batches": 0.1,
    }

    model = ModelTrainVal()
    tpipes.run_model_test(run_kwargs, model, on_gpu=False)

    # test freeze on cpu
    model.freeze()
    model.unfreeze()
def test_tpu_precision_16_clip_gradients(mock_clip_grad_norm, clip_val, tmpdir):
    """Check that the clip-grad-norm mock is called only for a positive ``clip_val``.

    TODO: Fix (test fails with parametrize)
    """
    tutils.reset_seed()
    run_kwargs = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 1,
        "accelerator": "tpu",
        "devices": 1,
        "precision": 16,
        "limit_train_batches": 4,
        "limit_val_batches": 4,
        "gradient_clip_val": clip_val,
    }

    model = BoringModel()
    tpipes.run_model_test(run_kwargs, model, on_gpu=False, with_hpc=False)

    # Pick the mock assertion matching whether clipping should have happened.
    check = (
        mock_clip_grad_norm.assert_called
        if clip_val > 0
        else mock_clip_grad_norm.assert_not_called
    )
    check()
def test_cpu_model(tmpdir):
    """Run the model test pipeline for a ``BoringModel`` with ``on_gpu=False``."""
    options = {
        "default_root_dir": tmpdir,
        "progress_bar_refresh_rate": 0,
        "max_epochs": 1,
        "limit_train_batches": 4,
        "limit_val_batches": 4,
    }
    tpipes.run_model_test(options, BoringModel(), on_gpu=False)
def test_single_gpu_model(tmpdir, gpus):
    """Run the model test pipeline with the ``gpus`` value supplied to the test."""
    options = {
        "default_root_dir": tmpdir,
        "progress_bar_refresh_rate": 0,
        "max_epochs": 1,
        "limit_train_batches": 0.1,
        "limit_val_batches": 0.1,
        "gpus": gpus,
    }
    tpipes.run_model_test(options, BoringModel())
def test_multi_gpu_none_backend(tmpdir):
    """Run the model test on two GPUs without naming a distributed backend."""
    tutils.set_random_master_port()
    options = {
        "default_root_dir": tmpdir,
        "progress_bar_refresh_rate": 0,
        "max_epochs": 1,
        "limit_train_batches": 0.2,
        "limit_val_batches": 0.2,
        "gpus": 2,
    }

    model = BoringModel()
    tpipes.run_model_test(options, model, min_acc=0.20)
def test_tpu_clip_grad_by_value(tmpdir):
    """Run the model test on one TPU core with value-based gradient clipping."""
    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "progress_bar_refresh_rate": 0,
        "max_epochs": 4,
        "tpu_cores": 1,
        "limit_train_batches": 10,
        "limit_val_batches": 10,
        "gradient_clip_val": 0.5,
        "gradient_clip_algorithm": "value",
    }

    model = BoringModel()
    tpipes.run_model_test(options, model, on_gpu=False, with_hpc=False)
def test_model_tpu_cores_1(tmpdir):
    """Run the model test pipeline with ``tpu_cores=1``."""
    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "progress_bar_refresh_rate": 0,
        "max_epochs": 2,
        "tpu_cores": 1,
        "limit_train_batches": 4,
        "limit_val_batches": 4,
    }

    model = BoringModel()
    tpipes.run_model_test(options, model, on_gpu=False, with_hpc=False)
def test_single_gpu_model(tmpdir, devices):
    """Run the model test pipeline on the GPU accelerator with the given ``devices``."""
    options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 1,
        "limit_train_batches": 0.1,
        "limit_val_batches": 0.1,
        "accelerator": "gpu",
        "devices": devices,
    }
    tpipes.run_model_test(options, BoringModel())
def test_model_16bit_tpu_cores_1(tmpdir):
    """Run the model test pipeline with ``precision=16`` and ``tpu_cores=1``."""
    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "precision": 16,
        "enable_progress_bar": False,
        "max_epochs": 2,
        "tpu_cores": 1,
        "limit_train_batches": 8,
        "limit_val_batches": 2,
    }

    model = BoringModel()
    tpipes.run_model_test(options, model, on_gpu=False)
def test_cpu_model_with_amp(tmpdir):
    """Expect the model test with ``precision=16`` and ``on_gpu=False`` to raise.

    Either ``MisconfigurationException`` or ``ModuleNotFoundError`` is accepted.
    """
    options = {
        "default_root_dir": tmpdir,
        "progress_bar_refresh_rate": 0,
        "max_epochs": 1,
        "limit_train_batches": 0.4,
        "limit_val_batches": 0.4,
        "precision": 16,
    }
    model = BoringModel()

    expected_errors = (MisconfigurationException, ModuleNotFoundError)
    with pytest.raises(expected_errors):
        tpipes.run_model_test(options, model, on_gpu=False)
def test_model_tpu_devices_1(tmpdir):
    """Run the model test pipeline with ``accelerator="tpu"`` and ``devices=1``."""
    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 2,
        "accelerator": "tpu",
        "devices": 1,
        "limit_train_batches": 4,
        "limit_val_batches": 4,
    }

    model = BoringModel()
    tpipes.run_model_test(options, model, on_gpu=False, with_hpc=False)
def test_model_tpu_index(tmpdir, tpu_core):
    """Run on one specific TPU core and check the default XLA device matches it."""
    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "progress_bar_refresh_rate": 0,
        "max_epochs": 2,
        "tpu_cores": [tpu_core],
        "limit_train_batches": 4,
        "limit_val_batches": 4,
    }

    model = BoringModel()
    tpipes.run_model_test(options, model, on_gpu=False, with_hpc=False)

    default_device = torch_xla._XLAC._xla_get_default_device()
    assert default_device == f"xla:{tpu_core}"
def test_multi_gpu_none_backend(tmpdir):
    """Run the classification model on two GPUs without naming a strategy."""
    tutils.set_random_main_port()
    options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 1,
        "limit_train_batches": 0.2,
        "limit_val_batches": 0.2,
        "gpus": 2,
    }

    datamodule = ClassifDataModule()
    model = ClassificationModel()
    tpipes.run_model_test(options, model, datamodule)
def test_model_tpu_cores_8(tmpdir):
    """Run the model test pipeline with ``tpu_cores=8``."""
    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "progress_bar_refresh_rate": 0,
        "max_epochs": 1,
        "tpu_cores": 8,
        "limit_train_batches": 0.4,
        "limit_val_batches": 0.4,
    }

    # 8 cores needs a big dataset
    model = SerialLoaderBoringModel()
    tpipes.run_model_test(
        options,
        model,
        on_gpu=False,
        with_hpc=False,
        min_acc=0.05,
    )
def test_tpu_grad_norm(tmpdir):
    """Run the model test on one TPU core with ``gradient_clip_val=0.1``."""
    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "progress_bar_refresh_rate": 0,
        "max_epochs": 1,
        "tpu_cores": 1,
        "limit_train_batches": 0.4,
        "limit_val_batches": 0.4,
        "gradient_clip_val": 0.1,
    }

    model = BoringModel()
    tpipes.run_model_test(options, model, on_gpu=False, with_hpc=False)
def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
    """Run ``ddp_spawn`` on GPUs 0 and 1 with a default ``EarlyStopping`` callback."""
    tutils.set_random_master_port()
    options = {
        "default_root_dir": tmpdir,
        "callbacks": [EarlyStopping()],
        "max_epochs": 50,
        "limit_train_batches": 10,
        "limit_val_batches": 10,
        "gpus": [0, 1],
        "accelerator": "ddp_spawn",
    }

    model = EvalModelTemplate()
    tpipes.run_model_test(options, model)
def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
    """Run ``ddp_spawn`` on GPUs 0 and 1 with early stopping on ``train_acc``."""
    tutils.set_random_main_port()
    early_stop = EarlyStopping(monitor="train_acc")
    options = {
        "default_root_dir": tmpdir,
        "callbacks": [early_stop],
        "max_epochs": 50,
        "limit_train_batches": 10,
        "limit_val_batches": 10,
        "gpus": [0, 1],
        "strategy": "ddp_spawn",
    }

    datamodule = ClassifDataModule()
    model = ClassificationModel()
    tpipes.run_model_test(options, model, datamodule)
def test_tpu_grad_norm(tmpdir):
    """Run the model test on one TPU device with ``gradient_clip_val=0.5``."""
    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 4,
        "accelerator": "tpu",
        "devices": 1,
        "limit_train_batches": 0.4,
        "limit_val_batches": 0.4,
        "gradient_clip_val": 0.5,
    }

    model = BoringModel()
    tpipes.run_model_test(options, model, on_gpu=False, with_hpc=False)
def test_model_16bit_tpu_cores_1(tmpdir):
    """Run with ``precision=16`` on one TPU core and check ``XLA_USE_BF16`` is "1"."""
    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "precision": 16,
        "progress_bar_refresh_rate": 0,
        "max_epochs": 1,
        "tpu_cores": 1,
        "limit_train_batches": 4,
        "limit_val_batches": 4,
    }

    model = BoringModel()
    tpipes.run_model_test(options, model, on_gpu=False)

    bf16_flag = os.environ.get("XLA_USE_BF16")
    assert bf16_flag == "1", "XLA_USE_BF16 was not set in environment variables"
def test_model_16bit_tpu_index(tmpdir, tpu_core):
    """Run with ``precision=16`` on one specific TPU device and check the default XLA device."""
    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "precision": 16,
        "enable_progress_bar": False,
        "max_epochs": 2,
        "accelerator": "tpu",
        "devices": [tpu_core],
        "limit_train_batches": 4,
        "limit_val_batches": 2,
    }

    model = BoringModel()
    tpipes.run_model_test(options, model, on_gpu=False)

    default_device = torch_xla._XLAC._xla_get_default_device()
    assert default_device == f"xla:{tpu_core}"
def test_all_features_cpu_model(tmpdir):
    """Run the CPU model test with several trainer options enabled together.

    Covers gradient clipping, ``overfit_batches``, grad-norm tracking and
    gradient accumulation in a single run.
    """
    options = {
        "default_root_dir": tmpdir,
        "gradient_clip_val": 1.0,
        "overfit_batches": 0.20,
        "track_grad_norm": 2,
        "enable_progress_bar": False,
        "accumulate_grad_batches": 2,
        "max_epochs": 1,
        "limit_train_batches": 0.4,
        "limit_val_batches": 0.4,
    }
    tpipes.run_model_test(options, BoringModel(), on_gpu=False, min_acc=0.01)
def test_model_16bit_tpu_index(tmpdir, tpu_core):
    """Run with ``precision=16`` on one specific TPU core.

    Afterwards checks the default XLA device and that ``XLA_USE_BF16`` is "1".
    """
    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "precision": 16,
        "enable_progress_bar": False,
        "max_epochs": 2,
        "tpu_cores": [tpu_core],
        "limit_train_batches": 4,
        "limit_val_batches": 2,
    }

    model = BoringModel()
    tpipes.run_model_test(options, model, on_gpu=False)

    default_device = torch_xla._XLAC._xla_get_default_device()
    assert default_device == f"xla:{tpu_core}"

    bf16_flag = os.environ.get("XLA_USE_BF16")
    assert bf16_flag == "1", "XLA_USE_BF16 was not set in environment variables"
def test_multi_gpu_model_dp(tmpdir):
    """Run the model test with the ``dp`` strategy on GPU devices 0 and 1."""
    tutils.set_random_main_port()
    options = {
        "default_root_dir": tmpdir,
        "max_epochs": 1,
        "limit_train_batches": 10,
        "limit_val_batches": 10,
        "accelerator": "gpu",
        "devices": [0, 1],
        "strategy": "dp",
        "enable_progress_bar": False,
    }

    model = BoringModel()
    tpipes.run_model_test(options, model)
def test_multi_cpu_model_ddp(tmpdir):
    """Run ``ddp_spawn`` with two processes and no GPUs on the classification model."""
    tutils.set_random_main_port()
    options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 1,
        "limit_train_batches": 0.4,
        "limit_val_batches": 0.2,
        "gpus": None,
        "num_processes": 2,
        "strategy": "ddp_spawn",
    }

    datamodule = ClassifDataModule()
    model = ClassificationModel()
    tpipes.run_model_test(options, model, data=datamodule, on_gpu=False)
def test_multi_gpu_early_stop_dp(tmpdir):
    """Run the ``dp`` accelerator on GPUs 0 and 1 with early stopping on ``val_acc``."""
    tutils.set_random_master_port()

    datamodule = ClassifDataModule()
    model = CustomClassificationModelDP()

    early_stop = EarlyStopping(monitor="val_acc")
    options = {
        "default_root_dir": tmpdir,
        "callbacks": [early_stop],
        "max_epochs": 50,
        "limit_train_batches": 10,
        "limit_val_batches": 10,
        "gpus": [0, 1],
        "accelerator": "dp",
    }
    tpipes.run_model_test(options, model, datamodule)
def test_multi_cpu_model_ddp(tmpdir):
    """Run the ``ddp_cpu`` accelerator with two processes on the classification model."""
    tutils.set_random_master_port()
    options = {
        "default_root_dir": tmpdir,
        "progress_bar_refresh_rate": 0,
        "max_epochs": 1,
        "limit_train_batches": 0.4,
        "limit_val_batches": 0.2,
        "gpus": None,
        "num_processes": 2,
        "accelerator": "ddp_cpu",
    }

    datamodule = ClassifDataModule()
    model = ClassificationModel()
    tpipes.run_model_test(options, model, data=datamodule, on_gpu=False)
def test_multi_gpu_model_ddp_spawn(tmpdir):
    """Run ``ddp_spawn`` on GPUs 0 and 1, then call the memory profile helper."""
    tutils.set_random_main_port()
    options = {
        "default_root_dir": tmpdir,
        "max_epochs": 1,
        "limit_train_batches": 10,
        "limit_val_batches": 10,
        "gpus": [0, 1],
        "strategy": "ddp_spawn",
        "enable_progress_bar": False,
    }

    model = BoringModel()
    tpipes.run_model_test(options, model)

    # test memory helper functions
    memory.get_memory_profile("min_max")
def test_multi_gpu_model_dp(tmpdir):
    """Run the ``dp`` accelerator on GPUs 0 and 1, then call the memory profile helper."""
    tutils.set_random_master_port()
    options = {
        "default_root_dir": tmpdir,
        "max_epochs": 1,
        "limit_train_batches": 10,
        "limit_val_batches": 10,
        "gpus": [0, 1],
        "accelerator": "dp",
        "progress_bar_refresh_rate": 0,
    }

    model = BoringModel()
    tpipes.run_model_test(options, model)

    # test memory helper functions
    memory.get_memory_profile("min_max")
def test_model_16bit_tpu_devices_8(tmpdir):
    """Run with ``precision=16`` on eight TPU devices."""
    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "precision": 16,
        "enable_progress_bar": False,
        "max_epochs": 1,
        "accelerator": "tpu",
        "devices": 8,
        "limit_train_batches": 4,
        "limit_val_batches": 4,
    }

    # 8 cores needs a big dataset
    model = SerialLoaderBoringModel()
    tpipes.run_model_test(
        options,
        model,
        on_gpu=False,
        with_hpc=False,
        min_acc=0.05,
    )
def test_tpu_host_world_size(tmpdir):
    """Check the ``XRT_HOST_WORLD_SIZE`` env var during and after a TPU run.

    The model's hooks assert the variable equals "1" at train start and is
    absent from the environment at teardown.
    """

    class DebugModel(BoringModel):
        def on_train_start(self):
            host_world_size = os.environ.get("XRT_HOST_WORLD_SIZE")
            assert host_world_size == "1"

        def teardown(self, stage):
            assert "XRT_HOST_WORLD_SIZE" not in os.environ

    tutils.reset_seed()
    options = {
        "default_root_dir": tmpdir,
        "progress_bar_refresh_rate": 0,
        "max_epochs": 4,
        "tpu_cores": 8,
        "limit_train_batches": 0.4,
        "limit_val_batches": 0.4,
    }

    model = DebugModel()
    tpipes.run_model_test(options, model, on_gpu=False, with_hpc=False)