Example No. 1
def test_tensorboard_hparams_reload(tmpdir):
    model = EvalModelTemplate()

    trainer = Trainer(max_epochs=1, default_root_dir=tmpdir)
    assert trainer.log_dir == trainer.logger.log_dir
    trainer.fit(model)

    assert trainer.log_dir == trainer.logger.log_dir
    folder_path = trainer.log_dir

    # make sure yaml is there
    with open(os.path.join(folder_path, "hparams.yaml")) as file:
        # safe_load converts the YAML scalar values into a plain
        # Python dictionary
        yaml_params = yaml.safe_load(file)
        assert yaml_params["b1"] == 0.5
        assert len(yaml_params.keys()) == 10

    # verify artifacts
    assert len(os.listdir(os.path.join(folder_path, "checkpoints"))) == 1

    # verify tb logs
    event_acc = EventAccumulator(folder_path)
    event_acc.Reload()

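    # Raw TensorBoard hparams-plugin payloads: the serialized summary metadata
    # differs between the summary writers bundled with torch 1.5 and torch 1.6,
    # so both encodings are kept and the matching one is selected below.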
    data_pt_1_5 = b'\x12\x93\x01"\x0b\n\tdrop_prob"\x0c\n\nbatch_size"\r\n\x0bin_features"\x0f\n\rlearning_rate"' \
                  b'\x10\n\x0eoptimizer_name"\x0b\n\tdata_root"\x0e\n\x0cout_features"\x0c\n\nhidden_dim"' \
                  b'\x04\n\x02b1"\x04\n\x02b2*\r\n\x0b\x12\thp_metric'
    data_pt_1_6 = b'\x12\xa7\x01"\r\n\tdrop_prob \x03"\x0e\n\nbatch_size \x03"\x0f\n\x0bin_features \x03"' \
                  b'\x11\n\rlearning_rate \x03"\x12\n\x0eoptimizer_name \x01"\r\n\tdata_root \x01"' \
                  b'\x10\n\x0cout_features \x03"\x0e\n\nhidden_dim \x03"\x06\n\x02b1 \x03"' \
                  b'\x06\n\x02b2 \x03*\r\n\x0b\x12\thp_metric'

    hparams_data = data_pt_1_6 if LooseVersion(
        torch.__version__) >= LooseVersion("1.6.0") else data_pt_1_5

    assert event_acc.summary_metadata[
        '_hparams_/experiment'].plugin_data.plugin_name == 'hparams'
    assert event_acc.summary_metadata[
        '_hparams_/experiment'].plugin_data.content == hparams_data
Example No. 2
    def _new_model():
        # Create a model that tracks epochs and batches seen
        model = EvalModelTemplate(**hparams)
        model.num_epochs_seen = 0
        model.num_batches_seen = 0
        model.num_on_load_checkpoint_called = 0

        def increment_epoch(self):
            self.num_epochs_seen += 1

        def increment_batch(self, batch, batch_idx, dataloader_idx):
            self.num_batches_seen += 1

        def increment_on_load_checkpoint(self, _):
            self.num_on_load_checkpoint_called += 1

        # Bind the methods so the model tracks the epochs and batches it has seen,
        # as well as how many times on_load_checkpoint() has been called
        model.on_epoch_end = types.MethodType(increment_epoch, model)
        model.on_train_batch_start = types.MethodType(increment_batch, model)
        model.on_load_checkpoint = types.MethodType(
            increment_on_load_checkpoint, model)
        return model
Example No. 3
def test_model_saving_loading(tmpdir):
    """Tests use case where trainer saves the model, and user loads it from tags independently."""
    model = EvalModelTemplate()

    # logger file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # fit model
    trainer = Trainer(
        max_epochs=1, logger=logger,
        checkpoint_callback=ModelCheckpoint(dirpath=tmpdir), default_root_dir=tmpdir,
    )
    result = trainer.fit(model)

    # training complete
    assert result == 1, 'amp + ddp model failed to complete'

    # make a prediction
    dataloaders = model.test_dataloader()
    if not isinstance(dataloaders, list):
        dataloaders = [dataloaders]

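    # keep a single batch so predictions can be compared before and after reloading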
    for dataloader in dataloaders:
        for batch in dataloader:
            break

    x, y = batch
    x = x.view(x.size(0), -1)

    # generate preds before saving model
    model.eval()
    pred_before_saving = model(x)

    # save model
    new_weights_path = os.path.join(tmpdir, 'save_test.ckpt')
    trainer.save_checkpoint(new_weights_path)

    # load new model
    hparams_path = tutils.get_data_path(logger, path_dir=tmpdir)
    hparams_path = os.path.join(hparams_path, 'hparams.yaml')
    model_2 = EvalModelTemplate.load_from_checkpoint(checkpoint_path=new_weights_path, hparams_file=hparams_path,)
    model_2.eval()

    # make prediction
    # assert that both predictions are the same
    new_pred = model_2(x)
    assert torch.all(torch.eq(pred_before_saving, new_pred)).item() == 1
Example No. 4
def test_full_loop(tmpdir):
    dm = TrialMNISTDataModule(tmpdir)
    dm.prepare_data()
    dm.setup()

    model = EvalModelTemplate()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
    )
    # fit model
    result = trainer.fit(model, dm)
    assert result == 1

    # test
    result = trainer.test(datamodule=dm)
    result = result[0]
    assert result['test_acc'] > 0.8
Example No. 5
def test_horovod_multi_optimizer(tmpdir):
    model = TestGAN(**EvalModelTemplate.get_default_hparams())

    trainer_options = dict(
        default_root_dir=str(tmpdir),
        progress_bar_refresh_rate=0,
        max_epochs=1,
        limit_train_batches=0.4,
        limit_val_batches=0.2,
        deterministic=True,
        distributed_backend='horovod',
    )

    # fit model
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result == 1, 'model failed to complete'

    assert len(trainer.optimizers) == 2
    for optimizer in trainer.optimizers:
        assert hasattr(
            optimizer, 'synchronize'
        ), 'optimizer has not been wrapped into DistributedOptimizer'

    def get_model_params(model):
        return set(model.parameters())

    def get_optimizer_params(optimizer):
        return {
            p for group in optimizer.param_groups
            for p in group.get('params', [])
        }

    assert get_model_params(model.generator) != get_model_params(
        model.discriminator)
    assert get_model_params(model.generator) == get_optimizer_params(
        trainer.optimizers[0])
    assert get_model_params(model.discriminator) == get_optimizer_params(
        trainer.optimizers[1])
Example No. 6
def test_auto_scale_batch_size_set_model_attribute(tmpdir, use_hparams):
    """ Test that new batch size gets written to the correct hyperparameter attribute. """
    tutils.reset_seed()

    hparams = EvalModelTemplate.get_default_hparams()
    before_batch_size = hparams.get('batch_size')

    class HparamsEvalModelTemplate(EvalModelTemplate):
        def dataloader(self, *args, **kwargs):
            # artificially set batch_size so we can get a dataloader
            # remove it immediately after, because we want only self.hparams.batch_size
            setattr(self, "batch_size", before_batch_size)
            dataloader = super().dataloader(*args, **kwargs)
            del self.batch_size
            return dataloader

    datamodule_model = MNISTDataModule(
        data_dir=tmpdir, batch_size=111)  # this datamodule should get ignored!
    datamodule_fit = MNISTDataModule(data_dir=tmpdir,
                                     batch_size=before_batch_size)

    model_class = HparamsEvalModelTemplate if use_hparams else EvalModelTemplate
    model = model_class(**hparams)
    model.datamodule = datamodule_model  # unused when another module gets passed to .tune() / .fit()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        auto_scale_batch_size=True,
        gpus=1,
    )
    trainer.tune(model, datamodule_fit)
    after_batch_size = model.hparams.batch_size if use_hparams else model.batch_size
    assert trainer.datamodule == datamodule_fit
    assert before_batch_size != after_batch_size
    assert after_batch_size <= len(trainer.train_dataloader.dataset)
    assert datamodule_fit.batch_size == after_batch_size
    # should be left unchanged, since it was not passed to .tune()
    assert datamodule_model.batch_size == 111
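The `use_hparams` argument above comes from pytest parametrization that this listing omits; a plausible decorator, assumed here rather than taken from the original file, would run the test once per flag value (the same pattern applies to the other examples that take extra arguments such as `period`, `ckpt_path`, or `log_every_n_steps`):

import pytest

@pytest.mark.parametrize("use_hparams", [True, False])
def test_auto_scale_batch_size_set_model_attribute(tmpdir, use_hparams):
    ...  # body as shown above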
Example No. 7
def test_model_properties_resume_from_checkpoint(tmpdir):
    """ Test that properties like `current_epoch` and `global_step`
    in model and trainer are always the same. """
    model = EvalModelTemplate()
    checkpoint_callback = ModelCheckpoint(dirpath=tmpdir,
                                          monitor="early_stop_on",
                                          save_last=True)
    trainer_args = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        logger=False,
        callbacks=[checkpoint_callback,
                   ModelTrainerPropertyParity()
                   ]  # this performs the assertions
    )
    trainer = Trainer(**trainer_args)
    trainer.fit(model)

    trainer_args.update(max_epochs=2)
    trainer = Trainer(**trainer_args,
                      resume_from_checkpoint=str(tmpdir / "last.ckpt"))
    trainer.fit(model)
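`ModelTrainerPropertyParity` is a test helper that this listing does not reproduce. A minimal sketch of what such a callback could look like, assuming it only compares the mirrored properties on the trainer and the module at a few hooks (hypothetical, not the original implementation):

from pytorch_lightning import Callback

class ModelTrainerPropertyParity(Callback):
    def _check(self, trainer, pl_module):
        # the module proxies these properties to its trainer, so they must agree
        assert trainer.current_epoch == pl_module.current_epoch
        assert trainer.global_step == pl_module.global_step

    def on_train_start(self, trainer, pl_module):
        self._check(trainer, pl_module)

    def on_train_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx):
        self._check(trainer, pl_module)

    def on_epoch_end(self, trainer, pl_module):
        self._check(trainer, pl_module)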
Example No. 8
def test_full_loop(tmpdir):
    reset_seed()

    dm = TrialMNISTDataModule(tmpdir)

    model = EvalModelTemplate()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
        deterministic=True,
    )

    # fit model
    result = trainer.fit(model, dm)
    assert result == 1

    # test
    result = trainer.test(datamodule=dm)
    result = result[0]
    assert result['test_acc'] > 0.8
Example No. 9
def test_amp_gpu_ddp_slurm_managed(tmpdir):
    """Make sure DDP + AMP work."""
    # simulate setting slurm flags
    tutils.set_random_master_port()
    os.environ['SLURM_LOCALID'] = str(0)

    model = EvalModelTemplate()

    # exp file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # exp file to get weights
    checkpoint = tutils.init_checkpoint_callback(logger)

    # fit model
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        gpus=[0],
        distributed_backend='ddp_spawn',
        precision=16,
        checkpoint_callback=checkpoint,
        logger=logger,
    )
    trainer.is_slurm_managing_tasks = True
    result = trainer.fit(model)

    # correct result and ok accuracy
    assert result == 1, 'amp + ddp model failed to complete'

    # test root model address
    assert trainer.accelerator_connector.resolve_root_node_address(
        'abc') == 'abc'
    assert trainer.accelerator_connector.resolve_root_node_address(
        'abc[23]') == 'abc23'
    assert trainer.accelerator_connector.resolve_root_node_address(
        'abc[23-24]') == 'abc23'
    assert trainer.accelerator_connector.resolve_root_node_address(
        'abc[23-24, 45-40, 40]') == 'abc23'
Example No. 10
def test_model_checkpoint_none_monitor(tmpdir):
    model = EvalModelTemplate()
    epochs = 2
    checkpoint_callback = ModelCheckpoint(monitor=None, filepath=tmpdir, save_top_k=-1)
    trainer = Trainer(
        default_root_dir=tmpdir,
        early_stop_callback=False,
        checkpoint_callback=checkpoint_callback,
        max_epochs=epochs,
    )
    trainer.fit(model)

    # these should not be set if monitor is None
    assert checkpoint_callback.best_model_path == ''
    assert checkpoint_callback.best_model_score == 0
    assert checkpoint_callback.best_k_models == {}
    assert checkpoint_callback.kth_best_model_path == ''

    # check that the correct ckpts were created
    expected = ['lightning_logs']
    expected.extend(f'epoch={e}.ckpt' for e in range(epochs))
    assert set(os.listdir(tmpdir)) == set(expected)
Example No. 11
def test_ckpt_metric_names(tmpdir):
    model = EvalModelTemplate()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        gradient_clip_val=1.0,
        overfit_batches=0.20,
        progress_bar_refresh_rate=0,
        limit_train_batches=0.01,
        limit_val_batches=0.01,
        checkpoint_callback=ModelCheckpoint(monitor='val_loss', filepath=tmpdir + "/{val_loss:.2f}"),
    )

    trainer.fit(model)

    # make sure the checkpoint we saved has the metric in the name
    ckpts = os.listdir(tmpdir)
    ckpts = [x for x in ckpts if "val_loss" in x]
    assert len(ckpts) == 1
    val = re.sub("[^0-9.]", "", ckpts[0])
    assert len(val) > 3
Example No. 12
def test_model_checkpoint_period(tmpdir, period):
    model = EvalModelTemplate()
    epochs = 5
    checkpoint_callback = ModelCheckpoint(filepath=tmpdir,
                                          save_top_k=-1,
                                          period=period)
    trainer = Trainer(
        default_root_dir=tmpdir,
        early_stop_callback=False,
        checkpoint_callback=checkpoint_callback,
        max_epochs=epochs,
        limit_train_batches=0.1,
        limit_val_batches=0.1,
        logger=False,
    )
    trainer.fit(model)

    # check that the correct ckpts were created
    expected = [
        f'epoch={e}.ckpt' for e in range(epochs) if not (e + 1) % period
    ] if period > 0 else []
    assert set(os.listdir(tmpdir)) == set(expected)
Example No. 13
def test_grad_tracking_interval(tmpdir, log_every_n_steps):
    """ Test that gradient norms get tracked in the right interval and that everytime the same keys get logged. """
    trainer = Trainer(
        default_root_dir=tmpdir,
        track_grad_norm=2,
        log_every_n_steps=log_every_n_steps,
        max_steps=10,
    )

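    # every call to the patched logger.log_metrics() is recorded so the logged
    # keys and values can be inspected after training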
    with patch.object(trainer.logger, "log_metrics") as mocked:
        model = EvalModelTemplate()
        trainer.fit(model)
        expected = trainer.global_step // log_every_n_steps
        grad_norm_dicts = []
        for _, kwargs in mocked.call_args_list:
            metrics = kwargs.get("metrics", {})
            grad_norm_dict = {k: v for k, v in metrics.items() if k.startswith("grad_")}
            if grad_norm_dict:
                grad_norm_dicts.append(grad_norm_dict)

        assert len(grad_norm_dicts) == expected
        assert all(grad_norm_dicts[0].keys() == g.keys() for g in grad_norm_dicts)
Example No. 14
def test_dataloaders_load_every_epoch(tmpdir):
    os.environ['PL_DEV_DEBUG'] = '1'

    model = EvalModelTemplate()
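    # grab each dataloader once, then null the model hooks so the trainer has to
    # use the loaders passed explicitly to fit() and test()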
    train_loader = model.train_dataloader()
    model.train_dataloader = None
    val_loader = model.val_dataloader()
    model.val_dataloader = None
    test_loader = model.test_dataloader()
    model.test_dataloader = None

    # logger file to get meta
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=0.3,
        limit_val_batches=0.3,
        reload_dataloaders_every_epoch=True,
        max_epochs=3,
    )
    result = trainer.fit(model, train_loader, val_loader)

    trainer.test(test_dataloaders=test_loader)

    assert len(trainer.dev_debugger.val_dataloader_calls) == 4
    assert len(trainer.dev_debugger.train_dataloader_calls) == 3
    assert len(trainer.dev_debugger.test_dataloader_calls) == 1

    # verify the sequence
    calls = trainer.dev_debugger.dataloader_sequence_calls
    expected_sequence = [
        'val_dataloader', 'train_dataloader', 'val_dataloader',
        'train_dataloader', 'val_dataloader', 'train_dataloader',
        'val_dataloader', 'test_dataloader'
    ]
    for call, expected in zip(calls, expected_sequence):
        assert call['name'] == expected
Example No. 15
def test_train_loop_only(tmpdir):
    dm = TrialMNISTDataModule(tmpdir)
    dm.prepare_data()
    dm.setup()

    model = EvalModelTemplate()
    model.validation_step = None
    model.validation_step_end = None
    model.validation_epoch_end = None
    model.test_step = None
    model.test_step_end = None
    model.test_epoch_end = None

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
    )
    # fit model
    result = trainer.fit(model, dm)
    assert result == 1
    assert trainer.callback_metrics['loss'] < 0.50
Example No. 16
def test_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches,
                                                limit_val_batches,
                                                limit_test_batches):
    """Verify num_batches for val & test dataloaders passed with batch limit in percent"""
    model = EvalModelTemplate()
    model.val_dataloader = model.val_dataloader__multiple_mixed_length
    model.test_dataloader = model.test_dataloader__multiple_mixed_length
    model.validation_step = model.validation_step__multiple_dataloaders
    model.validation_epoch_end = model.validation_epoch_end__multiple_dataloaders
    model.test_step = model.test_step__multiple_dataloaders
    model.test_epoch_end = model.test_epoch_end__multiple_dataloaders

    # train, multiple val and multiple test passed with percent_check
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=limit_train_batches,
        limit_val_batches=limit_val_batches,
        limit_test_batches=limit_test_batches,
    )
    trainer.fit(model)
    expected_train_batches = int(
        len(trainer.train_dataloader) * limit_train_batches)
    expected_val_batches = [
        int(len(dataloader) * limit_val_batches)
        for dataloader in trainer.val_dataloaders
    ]
    assert trainer.num_training_batches == expected_train_batches
    assert trainer.num_val_batches == expected_val_batches

    trainer.test(ckpt_path=None)
    expected_test_batches = [
        int(len(dataloader) * limit_test_batches)
        for dataloader in trainer.test_dataloaders
    ]
    assert trainer.num_test_batches == expected_test_batches
Example No. 17
def test_callbacks_state_resume_from_checkpoint(enable_pl_optimizer, tmpdir):
    """ Test that resuming from a checkpoint restores callbacks that persist state. """
    model = EvalModelTemplate()
    callback_capture = CaptureCallbacksBeforeTraining()

    def get_trainer_args():
        checkpoint = ModelCheckpoint(dirpath=tmpdir,
                                     monitor="early_stop_on",
                                     save_last=True)
        trainer_args = dict(default_root_dir=tmpdir,
                            max_steps=1,
                            logger=False,
                            enable_pl_optimizer=enable_pl_optimizer,
                            callbacks=[
                                checkpoint,
                                callback_capture,
                            ])
        assert checkpoint.best_model_path == ""
        assert checkpoint.best_model_score is None
        return trainer_args

    # initial training
    trainer = Trainer(**get_trainer_args())
    trainer.fit(model)
    callbacks_before_resume = deepcopy(trainer.callbacks)

    # resumed training
    trainer = Trainer(**get_trainer_args(),
                      resume_from_checkpoint=str(tmpdir / "last.ckpt"))
    trainer.fit(model)

    assert len(callbacks_before_resume) == len(callback_capture.callbacks)

    for before, after in zip(callbacks_before_resume,
                             callback_capture.callbacks):
        if isinstance(before, ModelCheckpoint):
            assert before.best_model_path == after.best_model_path
            assert before.best_model_score == after.best_model_score
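`CaptureCallbacksBeforeTraining` is another helper that is not shown in this listing; a minimal sketch, assuming it simply snapshots the trainer's callback list as training starts so the restored callback state can be compared later (hypothetical, not the original code):

from copy import deepcopy
from pytorch_lightning import Callback

class CaptureCallbacksBeforeTraining(Callback):
    callbacks = []

    def on_train_start(self, trainer, pl_module):
        # copy the fully configured (and possibly restored) callbacks
        self.callbacks = deepcopy(trainer.callbacks)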
Example No. 18
def test_resume_early_stopping_from_checkpoint(tmpdir):
    """
    Prevent regressions to bugs:
    https://github.com/PyTorchLightning/pytorch-lightning/issues/1464
    https://github.com/PyTorchLightning/pytorch-lightning/issues/1463
    """
    seed_everything(42)
    model = EvalModelTemplate()
    checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_top_k=1)
    early_stop_callback = EarlyStoppingTestRestore()
    trainer = Trainer(
        default_root_dir=tmpdir,
        checkpoint_callback=checkpoint_callback,
        callbacks=[early_stop_callback],
        num_sanity_val_steps=0,
        max_epochs=4,
    )
    trainer.fit(model)

    checkpoint_filepath = checkpoint_callback.kth_best_model_path
    # ensure state is persisted properly
    checkpoint = torch.load(checkpoint_filepath)
    # the checkpoint saves "epoch + 1"
    early_stop_callback_state = early_stop_callback.saved_states[checkpoint["epoch"] - 1]
    assert 4 == len(early_stop_callback.saved_states)
    assert checkpoint["callbacks"][type(early_stop_callback)] == early_stop_callback_state

    # ensure state is reloaded properly (assertion in the callback)
    early_stop_callback = EarlyStoppingTestRestore(early_stop_callback_state)
    new_trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        resume_from_checkpoint=checkpoint_filepath,
        callbacks=[early_stop_callback],
    )

    with pytest.raises(MisconfigurationException, match=r'.*you restored a checkpoint with current_epoch*'):
        new_trainer.fit(model)
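`EarlyStoppingTestRestore` is likewise a test-only helper; a minimal sketch, assuming it subclasses `EarlyStopping`, records the state it would write into each checkpoint, and verifies the restored state on the next run (hypothetical, not the original implementation):

from pytorch_lightning.callbacks import EarlyStopping

class EarlyStoppingTestRestore(EarlyStopping):
    def __init__(self, expected_state=None):
        super().__init__()
        self.expected_state = expected_state
        self.saved_states = []

    def on_train_start(self, trainer, pl_module):
        if self.expected_state is not None:
            # the state loaded from the checkpoint should equal what was saved earlier
            assert self.on_save_checkpoint(trainer, pl_module) == self.expected_state

    def on_epoch_end(self, trainer, pl_module):
        # keep a copy of the state dict that would end up in the checkpoint
        self.saved_states.append(self.on_save_checkpoint(trainer, pl_module).copy())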
Example No. 19
def test_resume_early_stopping_from_checkpoint(tmpdir):
    """
    Prevent regressions to bugs:
    https://github.com/PyTorchLightning/pytorch-lightning/issues/1464
    https://github.com/PyTorchLightning/pytorch-lightning/issues/1463
    """

    model = EvalModelTemplate()
    checkpoint_callback = ModelCheckpoint(save_top_k=1)
    early_stop_callback = EarlyStoppingTestRestore()
    trainer = Trainer(
        default_root_dir=tmpdir,
        checkpoint_callback=checkpoint_callback,
        early_stop_callback=early_stop_callback,
        num_sanity_val_steps=0,
        max_epochs=4,
    )
    trainer.fit(model)

    checkpoint_filepath = checkpoint_callback.kth_best_model
    # ensure state is persisted properly
    checkpoint = torch.load(checkpoint_filepath)
    # the checkpoint saves "epoch + 1"
    early_stop_callback_state = early_stop_callback.saved_states[
        checkpoint["epoch"] - 1]
    assert 4 == len(early_stop_callback.saved_states)
    assert checkpoint["callbacks"][type(
        early_stop_callback)] == early_stop_callback_state

    # ensure state is reloaded properly (assertion in the callback)
    early_stop_callback = EarlyStoppingTestRestore(early_stop_callback_state)
    new_trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        resume_from_checkpoint=checkpoint_filepath,
        early_stop_callback=early_stop_callback,
    )
    new_trainer.fit(model)
Example No. 20
def test_trainer_interrupted_flag(tmpdir):
    """Test the flag denoting that a user interrupted training."""

    model = EvalModelTemplate()

    class InterruptCallback(Callback):
        def __init__(self):
            super().__init__()

        def on_train_batch_start(self, trainer, pl_module, batch, batch_idx,
                                 dataloader_idx):
            raise KeyboardInterrupt

    class HandleInterruptCallback(Callback):
        def __init__(self):
            super().__init__()
            self.exc_info = None

        def on_keyboard_interrupt(self, trainer, pl_module):
            self.exc_info = sys.exc_info()

    interrupt_callback = InterruptCallback()
    handle_interrupt_callback = HandleInterruptCallback()

    trainer = Trainer(
        callbacks=[interrupt_callback, handle_interrupt_callback],
        max_epochs=1,
        limit_val_batches=0.1,
        limit_train_batches=0.2,
        progress_bar_refresh_rate=0,
        logger=False,
        default_root_dir=tmpdir,
    )
    assert not trainer.interrupted
    assert handle_interrupt_callback.exc_info is None
    trainer.fit(model)
    assert trainer.interrupted
    assert isinstance(handle_interrupt_callback.exc_info[1], KeyboardInterrupt)
Example No. 21
def test_mlflow_logger_dirs_creation(tmpdir):
    """ Test that the logger creates the folders and files in the right place. """
    assert not os.listdir(tmpdir)
    logger = MLFlowLogger('test', save_dir=tmpdir)
    assert logger.save_dir == tmpdir
    assert set(os.listdir(tmpdir)) == {'.trash'}
    run_id = logger.run_id
    exp_id = logger.experiment_id

    # multiple experiment calls should not lead to new experiment folders
    for i in range(2):
        _ = logger.experiment
        assert set(os.listdir(tmpdir)) == {'.trash', exp_id}
        assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'}

    model = EvalModelTemplate()
    trainer = Trainer(default_root_dir=tmpdir, logger=logger, max_epochs=1, limit_val_batches=3)
    trainer.fit(model)
    assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'}
    assert 'epoch' in os.listdir(tmpdir / exp_id / run_id / 'metrics')
    assert set(os.listdir(tmpdir / exp_id / run_id / 'params')) == model.hparams.keys()
    assert trainer.ckpt_path == trainer.weights_save_path == (tmpdir / exp_id / run_id / 'checkpoints')
    assert set(os.listdir(trainer.ckpt_path)) == {'epoch=0.ckpt'}
Example No. 22
def test_model_checkpoint_save_last(tmpdir):
    """Tests that save_last produces only one last checkpoint."""
    model = EvalModelTemplate()
    epochs = 3
    ModelCheckpoint.CHECKPOINT_NAME_LAST = 'last-{epoch}'
    model_checkpoint = ModelCheckpoint(filepath=tmpdir,
                                       save_top_k=-1,
                                       save_last=True)
    trainer = Trainer(
        default_root_dir=tmpdir,
        early_stop_callback=False,
        checkpoint_callback=model_checkpoint,
        max_epochs=epochs,
    )
    trainer.fit(model)
    last_filename = model_checkpoint._format_checkpoint_name(
        ModelCheckpoint.CHECKPOINT_NAME_LAST, epochs - 1, {})
    last_filename = last_filename + '.ckpt'
    assert str(tmpdir / last_filename) == model_checkpoint.last_model_path
    assert set(
        os.listdir(tmpdir)) == set([f'epoch={i}.ckpt' for i in range(epochs)] +
                                   [last_filename, 'lightning_logs'])
    ModelCheckpoint.CHECKPOINT_NAME_LAST = 'last'
Example No. 23
def test_trainer_reset_correctly(tmpdir):
    """Check that all trainer parameters are reset correctly after scaling batch size."""
    tutils.reset_seed()

    model = EvalModelTemplate()

    # logger file to get meta
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1)

    changed_attributes = [
        "callbacks",
        "checkpoint_callback",
        "current_epoch",
        "limit_train_batches",
        "logger",
        "max_steps",
        "weights_summary",
    ]
    expected = {ca: getattr(trainer, ca) for ca in changed_attributes}
    trainer.tuner.scale_batch_size(model, max_trials=5)
    actual = {ca: getattr(trainer, ca) for ca in changed_attributes}

    assert actual == expected
Example No. 24
def test_trainer_reset_correctly(tmpdir):
    """Check that all trainer parameters are reset correctly after lr_find()"""

    model = EvalModelTemplate()

    # logger file to get meta
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1)

    changed_attributes = [
        "accumulate_grad_batches",
        "auto_lr_find",
        "callbacks",
        "checkpoint_callback",
        "current_epoch",
        "logger",
        "max_steps",
    ]
    expected = {ca: getattr(trainer, ca) for ca in changed_attributes}
    trainer.tuner.lr_find(model, num_training=5)
    actual = {ca: getattr(trainer, ca) for ca in changed_attributes}

    assert actual == expected
    assert model.trainer == trainer
Example No. 25
def test_trainer_attached_to_dm(tmpdir):
    reset_seed()

    dm = TrialMNISTDataModule(tmpdir)

    model = EvalModelTemplate()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
        deterministic=True,
    )

    # fit model
    result = trainer.fit(model, dm)
    assert result == 1
    assert dm.trainer is not None

    # test
    result = trainer.test(datamodule=dm)
    result = result[0]
    assert dm.trainer is not None
Example No. 26
def test_wrong_train_setting(tmpdir):
    """
    * Test that an error is thrown when no `train_dataloader()` is defined
    * Test that an error is thrown when no `training_step()` is defined
    """
    tutils.reset_seed()
    hparams = tutils.get_default_hparams()
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1)

    with pytest.raises(MisconfigurationException):
        model = EvalModelTemplate(hparams)
        model.train_dataloader = None
        trainer.fit(model)

    with pytest.raises(MisconfigurationException):
        model = EvalModelTemplate(hparams)
        model.training_step = None
        trainer.fit(model)
Example No. 27
def test_trainer_subclassing():
    model = EvalModelTemplate()

    # First way of pulling out args from signature is to list them
    class TrainerSubclass(Trainer):
        def __init__(self, custom_arg, *args, custom_kwarg='test', **kwargs):
            super().__init__(*args, **kwargs)
            self.custom_arg = custom_arg
            self.custom_kwarg = custom_kwarg

    trainer = TrainerSubclass(123, custom_kwarg='custom', fast_dev_run=True)
    result = trainer.fit(model)
    assert result == 1
    assert trainer.custom_arg == 123
    assert trainer.custom_kwarg == 'custom'
    assert trainer.fast_dev_run

    # Second way is to pop from the dict
    # It's a special case because Trainer does not have any positional args
    class TrainerSubclass(Trainer):
        def __init__(self, **kwargs):
            self.custom_arg = kwargs.pop('custom_arg', 0)
            self.custom_kwarg = kwargs.pop('custom_kwarg', 'test')
            super().__init__(**kwargs)

    trainer = TrainerSubclass(custom_kwarg='custom', fast_dev_run=True)
    result = trainer.fit(model)
    assert result == 1
    assert trainer.custom_kwarg == 'custom'
    assert trainer.fast_dev_run

    # when we pass in an unknown arg, the base class should complain
    with pytest.raises(
            TypeError,
            match=r"__init__\(\) got an unexpected keyword argument 'abcdefg'"
    ):
        TrainerSubclass(abcdefg='unknown_arg')
Example No. 28
def test_gpu_stats_monitor(tmpdir):
    """
    Test GPU stats are logged using a logger.
    """
    model = EvalModelTemplate()
    gpu_stats = GPUStatsMonitor(intra_step_time=True)
    logger = CSVLogger(tmpdir)
    log_every_n_steps = 2

    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=2,
                      limit_train_batches=7,
                      log_every_n_steps=log_every_n_steps,
                      gpus=1,
                      callbacks=[gpu_stats],
                      logger=logger)

    results = trainer.fit(model)
    assert results

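    # read back the metrics file written by the CSVLogger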
    path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    met_data = np.genfromtxt(path_csv,
                             delimiter=',',
                             names=True,
                             deletechars='',
                             replace_space=' ')

    batch_time_data = met_data['batch_time/intra_step (ms)']
    batch_time_data = batch_time_data[~np.isnan(batch_time_data)]
    assert batch_time_data.shape[0] == trainer.global_step // log_every_n_steps

    fields = [
        'utilization.gpu', 'memory.used', 'memory.free', 'utilization.memory'
    ]

    for f in fields:
        assert any([f in h for h in met_data.dtype.names])
Example No. 29
def test_multiple_test_dataloader(tmpdir, ckpt_path):
    """Verify multiple test_dataloader."""

    model_template = EvalModelTemplate()

    class MultipleTestDataloaderModel(EvalModelTemplate):
        def test_dataloader(self):
            return [self.dataloader(train=False), self.dataloader(train=False)]

        def test_step(self, batch, batch_idx, *args, **kwargs):
            return model_template.test_step__multiple_dataloaders(
                batch, batch_idx, *args, **kwargs)

    model = MultipleTestDataloaderModel()

    # fit model
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_val_batches=10,
        limit_train_batches=100,
    )
    trainer.fit(model)
    if ckpt_path == 'specific':
        ckpt_path = trainer.checkpoint_callback.best_model_path
    trainer.test(ckpt_path=ckpt_path)

    # verify there are 2 test loaders
    assert len(trainer.test_dataloaders) == 2, \
        'Multiple test_dataloaders not initiated properly'

    # make sure predictions are good for each test set
    for dataloader in trainer.test_dataloaders:
        tpipes.run_prediction_eval_model_template(trainer.model, dataloader)

    # run the test method
    trainer.test(ckpt_path=ckpt_path)
Example No. 30
def test_default_logger_callbacks_cpu_model(tmpdir):
    """Test each of the trainer options."""
    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        gradient_clip_val=1.0,
        overfit_batches=0.20,
        progress_bar_refresh_rate=0,
        limit_train_batches=0.01,
        limit_val_batches=0.01,
    )

    model = EvalModelTemplate()
    tpipes.run_model_test_without_loggers(trainer_options, model)

    # test freeze on cpu
    model.freeze()
    model.unfreeze()