예제 #1
0
def test_result_obj_on_tpu(tmpdir):
    """Run the shared model test with the ``*_result_obj`` step hooks and ``tpu_cores=8``.

    The step-end and epoch-end hooks of every stage are disabled so that only
    the result-object step implementations are exercised.
    """
    seed_everything(1234)

    num_batches = 5
    num_epochs = 2

    model = EvalModelTemplate()
    # Same wiring for each stage: point the step hook at its result-object
    # variant, then switch off the step-end and epoch-end hooks.
    for stage in ('training', 'validation', 'test'):
        setattr(model, f'{stage}_step', getattr(model, f'{stage}_step_result_obj'))
        setattr(model, f'{stage}_step_end', None)
        setattr(model, f'{stage}_epoch_end', None)

    options = {
        'default_root_dir': tmpdir,
        'max_epochs': num_epochs,
        'callbacks': [EarlyStopping()],
        'log_every_n_steps': 2,
        'limit_train_batches': num_batches,
        'weights_summary': None,
        'tpu_cores': 8,
    }

    tpipes.run_model_test(options, model, on_gpu=False, with_hpc=False)
예제 #2
0
def test_result_obj_on_tpu(tmpdir):
    """Run the shared model test with the ``*_result_obj`` step hooks and ``tpu_cores=8``.

    This variant sets ``PL_DEV_DEBUG=1`` in the process environment and
    configures early stopping / logging through the ``early_stop_callback``
    and ``row_log_interval`` options.
    """
    seed_everything(1234)
    # NOTE(review): the variable is left set after the test returns.
    os.environ['PL_DEV_DEBUG'] = '1'

    num_batches = 5
    num_epochs = 2

    model = EvalModelTemplate()
    # Same wiring for each stage: point the step hook at its result-object
    # variant, then switch off the step-end and epoch-end hooks.
    for stage in ('training', 'validation', 'test'):
        setattr(model, f'{stage}_step', getattr(model, f'{stage}_step_result_obj'))
        setattr(model, f'{stage}_step_end', None)
        setattr(model, f'{stage}_epoch_end', None)

    options = {
        'default_root_dir': tmpdir,
        'max_epochs': num_epochs,
        'early_stop_callback': True,
        'row_log_interval': 2,
        'limit_train_batches': num_batches,
        'weights_summary': None,
        'tpu_cores': 8,
    }

    tpipes.run_model_test(options, model, on_gpu=False, with_hpc=False)
예제 #3
0
def test_single_gpu_model(tmpdir, gpus):
    """Run the shared model test with the ``gpus`` value supplied to this test.

    Trains for one epoch on 10% of the train and validation batches with the
    progress bar disabled.
    """
    model = EvalModelTemplate()
    options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'limit_train_batches': 0.1,
        'limit_val_batches': 0.1,
        'gpus': gpus,
    }
    tpipes.run_model_test(options, model)
예제 #4
0
def test_cpu_model(tmpdir):
    """Run the shared model test on CPU (``on_gpu=False``) for a single epoch."""
    model = EvalModelTemplate()
    options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
    }
    tpipes.run_model_test(options, model, on_gpu=False)
예제 #5
0
def test_multi_gpu_none_backend(tmpdir):
    """Expect a ``UserWarning`` when running with ``gpus='-1'`` and no distributed backend given."""
    model = EvalModelTemplate()
    options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'limit_train_batches': 0.1,
        'limit_val_batches': 0.1,
        'gpus': '-1',
    }
    with pytest.warns(UserWarning):
        tpipes.run_model_test(options, model)
예제 #6
0
def test_cpu_model_with_amp(tmpdir):
    """Expect ``precision=16`` on CPU to be rejected.

    The run must raise either ``MisconfigurationException`` or
    ``ModuleNotFoundError``.
    """
    model = EvalModelTemplate()
    options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
        'precision': 16,
    }
    expected_errors = (MisconfigurationException, ModuleNotFoundError)

    with pytest.raises(expected_errors):
        tpipes.run_model_test(options, model, on_gpu=False)
예제 #7
0
def test_multi_gpu_none_backend(tmpdir):
    """Run the shared model test with ``gpus=2`` while ``distributed_backend`` is explicitly ``None``."""
    tutils.set_random_master_port()

    model = EvalModelTemplate()
    options = {
        'default_root_dir': tmpdir,
        'distributed_backend': None,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'limit_train_batches': 0.2,
        'limit_val_batches': 0.2,
        'gpus': 2,
    }
    tpipes.run_model_test(options, model)
예제 #8
0
def test_model_tpu_index(tmpdir, tpu_core):
    """Train with ``tpu_cores=[tpu_core]`` and check the default XLA device afterwards.

    After the run, the default XLA device reported by ``torch_xla`` must be
    ``xla:<tpu_core>``.
    """
    model = EvalModelTemplate()
    options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'tpu_cores': [tpu_core],
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
    }
    tpipes.run_model_test(options, model, on_gpu=False, with_hpc=False)

    default_device = torch_xla._XLAC._xla_get_default_device()
    assert default_device == f'xla:{tpu_core}'
예제 #9
0
def test_tpu_grad_norm(tmpdir):
    """Run the shared model test with ``tpu_cores=1`` and ``gradient_clip_val=0.1``."""
    model = EvalModelTemplate()
    options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'tpu_cores': 1,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
        'gradient_clip_val': 0.1,
    }
    tpipes.run_model_test(options, model, on_gpu=False, with_hpc=False)
예제 #10
0
def test_all_features_cpu_model(tmpdir):
    """Run the shared model test on CPU with several trainer options enabled at once.

    Combines gradient clipping, ``overfit_batches``, gradient-norm tracking
    and gradient accumulation in a single one-epoch run.
    """
    model = EvalModelTemplate()
    options = {
        'default_root_dir': tmpdir,
        'gradient_clip_val': 1.0,
        'overfit_batches': 0.20,
        'track_grad_norm': 2,
        'progress_bar_refresh_rate': 0,
        'accumulate_grad_batches': 2,
        'max_epochs': 1,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
    }
    tpipes.run_model_test(options, model, on_gpu=False)
예제 #11
0
def test_model_tpu_cores_1(tmpdir):
    """Run the shared model test with ``distributed_backend='tpu'`` and ``tpu_cores=1``."""
    model = EvalModelTemplate()
    options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'distributed_backend': 'tpu',
        'tpu_cores': 1,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
    }
    tpipes.run_model_test(options, model, on_gpu=False, with_hpc=False)
예제 #12
0
def test_model_16bit_tpu_cores_1(tmpdir):
    """Train with ``precision=16`` and ``tpu_cores=1``, then check ``XLA_USE_BF16``.

    After the run, the ``XLA_USE_BF16`` environment variable must equal ``'1'``.
    """
    model = EvalModelTemplate()
    options = {
        'default_root_dir': tmpdir,
        'precision': 16,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'tpu_cores': 1,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
    }
    tpipes.run_model_test(options, model, on_gpu=False)

    bf16_flag = os.environ.get('XLA_USE_BF16')
    assert bf16_flag == '1', "XLA_USE_BF16 was not set in environment variables"
예제 #13
0
def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
    """Run the shared model test with ``ddp_spawn`` on GPUs 0 and 1 plus an ``EarlyStopping`` callback."""
    tutils.set_random_master_port()

    model = EvalModelTemplate()
    options = {
        'default_root_dir': tmpdir,
        'callbacks': [EarlyStopping()],
        'max_epochs': 50,
        'limit_train_batches': 10,
        'limit_val_batches': 10,
        'gpus': [0, 1],
        'distributed_backend': 'ddp_spawn',
    }
    tpipes.run_model_test(options, model)
예제 #14
0
def test_multi_gpu_model_ddp_spawn(tmpdir):
    """Run the shared model test with ``ddp_spawn`` on GPUs 0 and 1, then call the memory profile helper."""
    tutils.set_random_master_port()

    model = EvalModelTemplate()
    options = {
        'default_root_dir': tmpdir,
        'max_epochs': 1,
        'limit_train_batches': 10,
        'limit_val_batches': 10,
        'gpus': [0, 1],
        'distributed_backend': 'ddp_spawn',
        'progress_bar_refresh_rate': 0,
    }
    tpipes.run_model_test(options, model)

    # Exercise the memory helper; only checks that the call completes.
    memory.get_memory_profile('min_max')
예제 #15
0
def test_model_tpu_cores_8(tmpdir):
    """Run the shared model test with ``tpu_cores=8`` using ``_serial_train_loader`` for train and val data."""
    model = EvalModelTemplate()
    # The 8-core run needs a large dataset, so both loaders are replaced.
    model.train_dataloader = _serial_train_loader
    model.val_dataloader = _serial_train_loader

    options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'tpu_cores': 8,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
    }
    tpipes.run_model_test(options, model, on_gpu=False, with_hpc=False)
def run_test_from_config(trainer_options):
    """Train an ``EvalModelTemplate`` with the given options and check the Horovod state.

    ``trainer_options`` is modified in place: a ``ModelCheckpoint`` pointing at
    its ``default_root_dir`` is stored under ``checkpoint_callback``. Whether
    the run uses the GPU is taken from the module-level ``args.on_gpu``.
    """
    set_random_master_port()

    root_dir = trainer_options['default_root_dir']
    trainer_options['checkpoint_callback'] = ModelCheckpoint(root_dir)

    run_model_test(trainer_options, EvalModelTemplate(), on_gpu=args.on_gpu, version=0, with_hpc=False)

    # Horovod should be initialized following training. If not, this will raise an exception.
    assert hvd.size() == 2

    if not args.on_gpu:
        return

    # On GPU, additionally verify the root_gpu property of a fresh Trainer.
    gpu_trainer = Trainer(gpus=1, distributed_backend='horovod', max_epochs=1)
    assert gpu_trainer.root_gpu == hvd.local_rank()
예제 #17
0
def test_multi_cpu_model_ddp(tmpdir):
    """Run the shared model test with the ``ddp_cpu`` backend and two processes."""
    tutils.set_random_master_port()

    model = EvalModelTemplate()
    options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.2,
        'gpus': None,
        'num_processes': 2,
        'distributed_backend': 'ddp_cpu',
    }
    tpipes.run_model_test(options, model, on_gpu=False)
예제 #18
0
def test_model_16bit_tpu_index_1(tmpdir):
    """Train with ``precision=16`` and ``tpu_cores=[1]``, then check device and ``XLA_USE_BF16``.

    After the run, the default XLA device must be ``'xla:1'`` and the
    ``XLA_USE_BF16`` environment variable must equal ``'1'``.
    """
    model = EvalModelTemplate()
    options = {
        'default_root_dir': tmpdir,
        'precision': 16,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'distributed_backend': 'tpu',
        'tpu_cores': [1],
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
    }
    tpipes.run_model_test(options, model, on_gpu=False)

    default_device = torch_xla._XLAC._xla_get_default_device()
    assert default_device == 'xla:1'

    bf16_flag = os.environ.get('XLA_USE_BF16')
    assert bf16_flag == '1', "XLA_USE_BF16 was not set in environment variables"
예제 #19
0
def test_model_saves_on_multi_gpu(tmpdir):
    """Train with ``ddp_spawn`` on GPUs 0 and 1, then export the model to ONNX.

    The export is written to ``model.onnx`` inside ``tmpdir`` and the test
    checks that the file exists afterwards.
    """
    tutils.set_random_master_port()

    model = EvalModelTemplate()
    options = {
        'default_root_dir': tmpdir,
        'max_epochs': 1,
        'limit_train_batches': 10,
        'limit_val_batches': 10,
        'gpus': [0, 1],
        'distributed_backend': 'ddp_spawn',
        'progress_bar_refresh_rate': 0,
    }
    tpipes.run_model_test(options, model)

    onnx_path = os.path.join(tmpdir, "model.onnx")
    model.to_onnx(onnx_path)
    assert os.path.exists(onnx_path)
예제 #20
0
def test_early_stopping_cpu_model(tmpdir):
    """Run the shared model test on CPU with an ``EarlyStopping`` callback, then freeze/unfreeze.

    The callback monitors ``'early_stop_on'`` with ``min_delta=0.1`` and is
    passed through the ``early_stop_callback`` option.
    """
    early_stop = EarlyStopping(monitor='early_stop_on', min_delta=0.1)
    options = {
        'default_root_dir': tmpdir,
        'early_stop_callback': early_stop,
        'max_epochs': 2,
        'gradient_clip_val': 1.0,
        'overfit_batches': 0.20,
        'track_grad_norm': 2,
        'limit_train_batches': 0.1,
        'limit_val_batches': 0.1,
    }

    model = EvalModelTemplate()
    tpipes.run_model_test(options, model, on_gpu=False)

    # Freeze and unfreeze on CPU; only checks that both calls complete.
    model.freeze()
    model.unfreeze()
예제 #21
0
def test_base_tpu_model_8(tmpdir):
    """Run the shared model test with ``tpu_cores=8`` on a 15000-sample ``TrialMNIST`` loader."""
    model = EvalModelTemplate()

    def _large_mnist_loader():
        # The 8-core run needs a large dataset.
        return DataLoader(TrialMNIST(download=True, num_samples=15000, digits=(0, 1, 2, 5, 8)), batch_size=32)

    # The same loader factory serves both training and validation.
    model.train_dataloader = _large_mnist_loader
    model.val_dataloader = _large_mnist_loader

    options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'tpu_cores': 8,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
    }
    tpipes.run_model_test(options, model, on_gpu=False, with_hpc=False)
예제 #22
0
def test_base_tpu_16bit_model_8_cores(tmpdir):
    """Train with ``precision=16`` and ``tpu_cores=8``, then check ``XLA_USE_BF16``.

    Uses a 15000-sample ``TrialMNIST`` loader for both training and
    validation. After the run, ``XLA_USE_BF16`` must equal ``'1'``.
    """
    model = EvalModelTemplate()

    def _large_mnist_loader():
        # The 8-core run needs a large dataset.
        return DataLoader(TrialMNIST(download=True, num_samples=15000, digits=(0, 1, 2, 5, 8)), batch_size=32)

    # The same loader factory serves both training and validation.
    model.train_dataloader = _large_mnist_loader
    model.val_dataloader = _large_mnist_loader

    options = {
        'default_root_dir': tmpdir,
        'precision': 16,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'tpu_cores': 8,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
    }
    tpipes.run_model_test(options, model, on_gpu=False)

    bf16_flag = os.environ.get('XLA_USE_BF16')
    assert bf16_flag == '1', "XLA_USE_BF16 was not set in environment variables"