Пример #1
0
def run_test_from_config(trainer_options):
    """Fit the default model with the supplied trainer config and check the outcome.

    Args:
        trainer_options: keyword arguments forwarded to ``Trainer``. Must contain
            ``'weights_save_path'``; the dict is mutated so that a ``ModelCheckpoint``
            pointing at that path is stored under ``'checkpoint_callback'``.
    """
    set_random_master_port()
    reset_seed()

    ckpt_path = trainer_options['weights_save_path']
    trainer_options['checkpoint_callback'] = ModelCheckpoint(ckpt_path)

    model = EvalModelTemplate()
    trainer = Trainer(**trainer_options)

    fit_status = trainer.fit(model)
    assert fit_status == 1

    # Horovod must be up once training finished; hvd.size() raises otherwise.
    assert hvd.size() == 2

    if trainer.global_rank > 0:
        # non-zero ranks do not know where the checkpoint lives,
        # so checkpointing is only exercised on rank 0
        assert not trainer.checkpoint_callback.best_model_path
        return

    # reload the model from the best checkpoint
    best_ckpt = trainer.checkpoint_callback.best_model_path
    pretrained_model = EvalModelTemplate.load_from_checkpoint(best_ckpt)

    # run predictions with the reloaded model on every test dataloader
    loaders = model.test_dataloader()
    loaders = loaders if isinstance(loaders, list) else [loaders]
    for loader in loaders:
        run_prediction(loader, pretrained_model)

    # round-trip through the HPC save / load path
    trainer.hpc_save(ckpt_path, trainer.logger)
    trainer.hpc_load(ckpt_path, on_gpu=args.on_gpu)

    if not args.on_gpu:
        return

    # on GPU, additionally check the root_gpu property of a fresh trainer
    trainer = Trainer(gpus=1, distributed_backend='horovod', max_epochs=1)
    assert trainer.root_gpu == hvd.local_rank()
def test_dm_checkpoint_save(tmpdir):
    """Check that the datamodule's class name is a key of the saved checkpoint and maps to itself."""
    reset_seed()

    datamodule = TrialMNISTDataModule(tmpdir)
    model = EvalModelTemplate()

    ckpt_callback = ModelCheckpoint(dirpath=tmpdir, monitor='early_stop_on')
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
        callbacks=[ckpt_callback],
    )

    # train, then open the first checkpoint tracked by the callback
    trainer.fit(model, datamodule)
    tracked_paths = list(trainer.checkpoint_callback.best_k_models.keys())
    checkpoint = torch.load(tracked_paths[0])

    dm_name = datamodule.__class__.__name__
    assert dm_name in checkpoint
    assert checkpoint[dm_name] == dm_name
Пример #3
0
def test_logger_reset_correctly(tmpdir, extra_params):
    """Check that fitting with the given extra Trainer arguments keeps the logger reference intact.

    Args:
        tmpdir: directory used as ``default_root_dir``.
        extra_params: additional keyword arguments forwarded to ``Trainer``.
    """
    tutils.reset_seed()

    model = EvalModelTemplate()
    trainer = Trainer(default_root_dir=tmpdir, **extra_params)

    logger_before_fit = trainer.logger
    trainer.fit(model)
    trainer_logger_after = trainer.logger
    model_logger_after = model.logger

    assert logger_before_fit == trainer_logger_after, 'Finder altered the logger of trainer'
    assert trainer_logger_after == model_logger_after, 'Finder altered the logger of model'
def test_val_loop_config(tmpdir):
    """Test that a warning is raised when either the val loop or the val data is missing."""
    tutils.reset_seed()
    hparams = EvalModelTemplate.get_default_hparams()
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1)

    # val loop missing: validation_step is removed, val_dataloader is left in place
    model = EvalModelTemplate(**hparams)
    model.validation_step = None
    # only the fit call sits inside the context, so a warning emitted while
    # building the model cannot satisfy the check by accident
    with pytest.warns(UserWarning):
        trainer.fit(model)

    # val data missing: val_dataloader is removed, validation_step is left in place
    model = EvalModelTemplate(**hparams)
    model.val_dataloader = None
    with pytest.warns(UserWarning):
        trainer.fit(model)
def test_model_checkpoint_correct_score(tmpdir, save_top_k):
    """Check that every saved checkpoint file name carries a logged ``val_acc`` score and epoch."""
    tutils.reset_seed()

    model = LogInTwoMethods()

    ckpt_callback = ModelCheckpoint(
        dirpath=tmpdir,
        filename="{val_acc:.4f}-{epoch}",
        monitor='val_acc',
        save_top_k=save_top_k,
    )
    trainer = Trainer(
        default_root_dir=tmpdir,
        callbacks=[ckpt_callback],
        overfit_batches=0.20,
        max_epochs=2,
    )
    trainer.fit(model)

    saved_ckpts = list(Path(tmpdir).glob('*.ckpt'))

    # build the set of names the callback could have produced from the logged metrics
    expected_filenames = set()
    for metric in trainer.dev_debugger.logged_metrics:
        expected_filenames.add(f'val_acc={metric["val_acc"]:.4f}-epoch={metric["epoch"]}.ckpt')

    for saved in saved_ckpts:
        assert saved.name in expected_filenames
def test_trainer_arg(tmpdir, scale_arg):
    """Fit with ``auto_scale_batch_size=scale_arg`` and expect the model's batch size to change."""
    tutils.reset_seed()

    hparams = EvalModelTemplate.get_default_hparams()
    model = EvalModelTemplate(**hparams)
    original_batch_size = hparams.get('batch_size')

    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, auto_scale_batch_size=scale_arg)
    trainer.fit(model)

    scaled_batch_size = model.batch_size
    assert original_batch_size != scaled_batch_size, \
        'Batch size was not altered after running auto scaling of batch size'
Пример #7
0
def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, with_hpc: bool = True):
    """Train ``model``, verify its weights moved, reload it from checkpoint and run predictions.

    Args:
        trainer_options: keyword arguments for ``Trainer``; must contain
            ``'default_root_dir'``. Mutated: ``logger`` is always set and
            ``checkpoint_callback`` defaults to ``True`` when absent.
        model: the module to fit.
        on_gpu: forwarded to ``hpc_load``.
        version: forwarded to ``get_default_logger``.
        with_hpc: when true, also exercise the HPC save / load path.
    """
    reset_seed()
    save_dir = trainer_options['default_root_dir']

    # the logger is also needed later to reload the checkpoint
    logger = get_default_logger(save_dir, version=version)
    trainer_options['logger'] = logger
    trainer_options.setdefault('checkpoint_callback', True)

    def _abs_param_sums():
        # one scalar per parameter tensor: the sum of its absolute values
        return torch.tensor([param.abs().sum() for param in model.parameters()])

    trainer = Trainer(**trainer_options)
    sums_before = _abs_param_sums()
    fit_status = trainer.fit(model)
    sums_after = _abs_param_sums()

    assert fit_status == 1, 'trainer failed'
    # training must have moved the parameters noticeably
    assert torch.norm(sums_before - sums_after) > 0.1

    # reload the model from the best checkpoint
    pretrained_model = load_model_from_checkpoint(logger, trainer.checkpoint_callback.best_model_path)

    # run predictions with the reloaded model on every test dataloader
    loaders = model.test_dataloader()
    loaders = loaders if isinstance(loaders, list) else [loaders]
    for loader in loaders:
        run_prediction(loader, pretrained_model)

    if not with_hpc:
        return

    if trainer.use_ddp or trainer.use_ddp2:
        # on hpc this would work fine... but need to hack it for the purpose of the test
        trainer.model = pretrained_model
        optimizer_setup = trainer.init_optimizers(pretrained_model)
        trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = optimizer_setup

    # round-trip through the HPC save / load path
    trainer.checkpoint_connector.hpc_save(save_dir, logger)
    trainer.checkpoint_connector.hpc_load(save_dir, on_gpu=on_gpu)
Пример #8
0
def test_result_obj_predictions(tmpdir, test_option, do_train, gpus):
    """Check that testing writes a predictions file with one entry per test sample.

    Args:
        tmpdir: working directory; the predictions file is ``predictions.pt`` inside it.
        test_option: stored on the model as ``test_option``.
        do_train: when true, fit first and also check test loss / accuracy.
        gpus: forwarded to ``Trainer``.
    """
    tutils.reset_seed()

    datamodule = TrialMNISTDataModule(tmpdir)
    prediction_file = Path(tmpdir, 'predictions.pt')

    model = EvalModelTemplate()
    model.test_option = test_option
    model.prediction_file = prediction_file.as_posix()
    model.test_step = model.test_step_result_preds
    # disable the remaining test hooks
    for hook_name in ('test_step_end', 'test_epoch_end', 'test_end'):
        setattr(model, hook_name, None)

    # start from a clean slate
    if prediction_file.exists():
        prediction_file.unlink()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
        deterministic=True,
        gpus=gpus,
    )

    # nothing has run yet, so there must be no predictions on disk
    assert not prediction_file.exists()

    if do_train:
        fit_status = trainer.fit(model, datamodule)
        assert fit_status == 1
        first_result = trainer.test(datamodule=datamodule)[0]
        assert first_result['test_loss'] < 0.6
        assert first_result['test_acc'] > 0.8
    else:
        trainer.test(model, datamodule=datamodule)

    # the file must now exist and hold one prediction per test sample
    assert prediction_file.exists()
    predictions = torch.load(prediction_file)
    assert len(predictions) == len(datamodule.mnist_test)
Пример #9
0
def test_full_loop_single_gpu(tmpdir):
    """Run fit followed by test on one GPU with a datamodule and check the test accuracy."""
    reset_seed()

    datamodule = TrialMNISTDataModule(tmpdir)
    model = EvalModelTemplate()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
        gpus=1,
    )

    # training must report success
    fit_status = trainer.fit(model, datamodule)
    assert fit_status == 1

    # evaluate on the datamodule's test data
    test_results = trainer.test(datamodule=datamodule)
    first_result = test_results[0]
    assert first_result['test_acc'] > 0.8
def test_model_checkpoint_to_yaml(tmpdir, save_top_k):
    """Test that ``ModelCheckpoint.to_yaml`` writes ``best_k_models`` so that it loads back unchanged."""
    tutils.reset_seed()
    model = LogInTwoMethods()

    checkpoint = ModelCheckpoint(dirpath=tmpdir,
                                 monitor='early_stop_on',
                                 save_top_k=save_top_k)

    trainer = Trainer(default_root_dir=tmpdir,
                      callbacks=[checkpoint],
                      overfit_batches=0.20,
                      max_epochs=2)
    trainer.fit(model)

    path_yaml = os.path.join(tmpdir, 'best_k_models.yaml')
    checkpoint.to_yaml(path_yaml)
    # use a context manager so the file handle is closed once parsing is done
    with open(path_yaml, 'r') as yaml_file:
        d = yaml.full_load(yaml_file)
    # copy into a plain dict before comparing with the parsed YAML
    best_k = dict(checkpoint.best_k_models.items())
    assert d == best_k
Пример #11
0
def test_train_val_loop_only(tmpdir):
    """Fit with the validation hooks set to ``None`` and check the logged training loss."""
    reset_seed()

    datamodule = TrialMNISTDataModule(tmpdir)

    model = EvalModelTemplate()
    # set every validation hook to None
    for hook_name in ('validation_step', 'validation_step_end', 'validation_epoch_end'):
        setattr(model, hook_name, None)

    trainer = Trainer(default_root_dir=tmpdir, max_epochs=3, weights_summary=None)

    # training must report success and a loss below the threshold
    fit_status = trainer.fit(model, datamodule)
    assert fit_status == 1
    assert trainer.logger_connector.callback_metrics['loss'] < 0.6
def test_call_to_trainer_method(tmpdir, scale_method):
    """Call ``trainer.tuner.scale_batch_size`` directly and expect a changed batch size.

    Args:
        tmpdir: directory used as ``default_root_dir``.
        scale_method: passed to ``scale_batch_size`` as ``mode``.
    """
    tutils.reset_seed()

    hparams = EvalModelTemplate.get_default_hparams()
    model = EvalModelTemplate(**hparams)
    original_batch_size = hparams.get('batch_size')

    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1)

    # run the tuner by hand, then train with whatever size it returned
    scaled_batch_size = trainer.tuner.scale_batch_size(model, mode=scale_method, max_trials=5)
    model.batch_size = scaled_batch_size
    trainer.fit(model)

    assert original_batch_size != scaled_batch_size, \
        'Batch size was not altered after running auto scaling of batch size'
Пример #13
0
def test_model_checkpoint_correct_score(tmpdir, save_top_k):
    """Test that when a model checkpoint is saved, it saves with the correct score appended to ckpt_path"""
    # NOTE(review): presumably required for trainer.dev_debugger to record metrics - confirm.
    # The flag is written straight into os.environ, so it stays set after this test returns.
    os.environ['PL_DEV_DEBUG'] = '1'

    tutils.reset_seed()

    model = EvalModelTemplate()

    # file name template; the rendered names are checked against the logged metrics below
    filepath = os.path.join(tmpdir, "{val_acc:.4f}-{epoch}")

    checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_acc', save_top_k=save_top_k)

    trainer = Trainer(default_root_dir=tmpdir, checkpoint_callback=checkpoint, overfit_batches=0.20, max_epochs=2)
    trainer.fit(model)

    ckpt_files = list(Path(tmpdir).glob('*.ckpt'))

    # every checkpoint on disk must be named after one of the logged (val_acc, epoch) pairs
    metrics = trainer.dev_debugger.logged_metrics
    expected_filenames = {f'val_acc={metric["val_acc"]:.4f}-epoch={metric["epoch"]}.ckpt' for metric in metrics}
    for ckpt_file in ckpt_files:
        assert os.path.basename(ckpt_file) in expected_filenames
Пример #14
0
def test_model_reset_correctly(tmpdir):
    """Verify that batch-size scaling leaves every entry of the model's state dict unchanged."""
    tutils.reset_seed()

    model = EvalModelTemplate()
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1)

    # snapshot the weights, since the tuner works on the live model
    weights_before = deepcopy(model.state_dict())

    trainer.tuner.scale_batch_size(model, max_trials=5)

    weights_after = model.state_dict()

    for name, tensor_before in weights_before.items():
        unchanged = torch.all(torch.eq(tensor_before, weights_after[name]))
        assert unchanged, 'Model was not reset correctly after scaling batch size'
Пример #15
0
def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, with_hpc: bool = True):
    """Train ``model``, reload it from the best checkpoint and run predictions with it.

    Args:
        trainer_options: keyword arguments for ``Trainer``; must contain
            ``'default_root_dir'``. Mutated: ``logger`` is always set and
            ``checkpoint_callback`` defaults to ``True`` when absent.
        model: the module to fit.
        on_gpu: forwarded to ``hpc_load``.
        version: forwarded to ``get_default_logger``.
        with_hpc: when true, also exercise the HPC save / load path.
    """
    reset_seed()
    save_dir = trainer_options['default_root_dir']

    # the logger is also needed later to reload the checkpoint
    logger = get_default_logger(save_dir, version=version)
    trainer_options['logger'] = logger
    trainer_options.setdefault('checkpoint_callback', True)

    trainer = Trainer(**trainer_options)
    fit_status = trainer.fit(model)

    # training must report success
    assert fit_status == 1, 'trainer failed'

    # reload the model from the best checkpoint
    pretrained_model = load_model_from_checkpoint(logger, trainer.checkpoint_callback.best_model_path)

    # run predictions with the reloaded model on every test dataloader
    loaders = model.test_dataloader()
    loaders = loaders if isinstance(loaders, list) else [loaders]
    for loader in loaders:
        run_prediction(loader, pretrained_model)

    if not with_hpc:
        return

    if trainer.use_ddp or trainer.use_ddp2:
        # on hpc this would work fine... but need to hack it for the purpose of the test
        trainer.model = pretrained_model
        optimizer_setup = trainer.init_optimizers(pretrained_model)
        trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = optimizer_setup

    # round-trip through the HPC save / load path
    trainer.hpc_save(save_dir, logger)
    trainer.hpc_load(save_dir, on_gpu=on_gpu)
def test_model_checkpoint_with_non_string_input(tmpdir, save_top_k):
    """Check that ``dirpath=None`` is accepted and the callback resolves its directory under the logger's version folder."""
    tutils.reset_seed()
    model = LogInTwoMethods()

    max_epochs = 2
    checkpoint = ModelCheckpoint(monitor='early_stop_on', dirpath=None, filename='{epoch}', save_top_k=save_top_k)
    trainer = Trainer(
        default_root_dir=tmpdir,
        callbacks=[checkpoint],
        overfit_batches=0.20,
        max_epochs=max_epochs,
    )
    trainer.fit(model)

    expected_dir = tmpdir / trainer.logger.name / "version_0" / "checkpoints"
    assert checkpoint.dirpath == expected_dir

    if save_top_k != -1:
        return

    # with save_top_k == -1 exactly one checkpoint per epoch is expected
    ckpt_files = os.listdir(checkpoint.dirpath)
    expected_ckpt_files = [f'epoch={epoch}.ckpt' for epoch in range(max_epochs)]
    assert len(ckpt_files) == len(expected_ckpt_files) == max_epochs
    assert set(ckpt_files) == set(expected_ckpt_files)
Пример #17
0
def test_lr_monitor_single_lr(tmpdir):
    """Check that ``LearningRateMonitor`` records learning rates for a single lr scheduler."""
    tutils.reset_seed()

    model = EvalModelTemplate()
    model.configure_optimizers = model.configure_optimizers__single_scheduler

    monitor = LearningRateMonitor()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=0.1,
        limit_train_batches=0.5,
        callbacks=[monitor],
    )
    fit_status = trainer.fit(model)
    assert fit_status

    assert monitor.lrs, 'No learning rates logged'

    same_count = len(monitor.lrs) == len(trainer.lr_schedulers)
    assert same_count, 'Number of learning rates logged does not match number of lr schedulers'

    allowed_names = ['lr-Adam']
    names_ok = all(name in allowed_names for name in monitor.lrs.keys())
    assert names_ok, 'Names of learning rates not set correctly'
Пример #18
0
def test_trainer_attached_to_dm(tmpdir):
    """Check that the datamodule's ``trainer`` attribute is set after both fit and test."""
    reset_seed()

    datamodule = TrialMNISTDataModule(tmpdir)
    model = EvalModelTemplate()

    trainer = Trainer(default_root_dir=tmpdir, max_epochs=3, weights_summary=None, deterministic=True)

    # after fitting
    fit_status = trainer.fit(model, datamodule)
    assert fit_status == 1
    assert datamodule.trainer is not None

    # after testing; the first result is indexed but its contents are not inspected
    trainer.test(datamodule=datamodule)[0]
    assert datamodule.trainer is not None
Пример #19
0
def test_auto_scale_batch_size_set_model_attribute(tmpdir, use_hparams):
    """Check that the tuned batch size lands on the right attribute and on the datamodule passed to ``tune``.

    Args:
        tmpdir: directory used for data and as ``default_root_dir``.
        use_hparams: when true, use a model that only exposes ``hparams.batch_size``.
    """
    tutils.reset_seed()

    hparams = EvalModelTemplate.get_default_hparams()
    original_batch_size = hparams.get('batch_size')

    class HparamsEvalModelTemplate(EvalModelTemplate):
        def dataloader(self, *args, **kwargs):
            # temporarily expose batch_size so a dataloader can be built, then
            # drop it again so that only self.hparams.batch_size remains
            self.batch_size = original_batch_size
            loader = super().dataloader(*args, **kwargs)
            del self.batch_size
            return loader

    # attached to the model only; expected to be ignored by the tuner
    ignored_dm = MNISTDataModule(data_dir=tmpdir, batch_size=111)
    # handed to .tune(); expected to receive the new batch size
    tuned_dm = MNISTDataModule(data_dir=tmpdir, batch_size=original_batch_size)

    if use_hparams:
        model_class = HparamsEvalModelTemplate
    else:
        model_class = EvalModelTemplate
    model = model_class(**hparams)
    model.datamodule = ignored_dm  # unused when another module gets passed to .tune() / .fit()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        auto_scale_batch_size=True,
        gpus=1,
    )
    trainer.tune(model, tuned_dm)

    if use_hparams:
        scaled_batch_size = model.hparams.batch_size
    else:
        scaled_batch_size = model.batch_size

    assert trainer.datamodule == tuned_dm
    assert original_batch_size != scaled_batch_size
    assert scaled_batch_size <= len(trainer.train_dataloader.dataset)
    assert tuned_dm.batch_size == scaled_batch_size
    # must be untouched because it was never passed to .tune()
    assert ignored_dm.batch_size == 111
Пример #20
0
def test_loading_meta_tags(tmpdir):
    """Backward-compatibility check: hparams written to a legacy meta_tags.csv load back unchanged."""
    tutils.reset_seed()

    default_hparams = EvalModelTemplate.get_default_hparams()

    # log two sets of hyperparameters through the default logger and save
    logger = tutils.get_default_logger(tmpdir)
    logger.log_hyperparams(Namespace(some_str='a_str', an_int=1, a_float=2.0))
    logger.log_hyperparams(default_hparams)
    logger.save()

    # read the hparams file the logger wrote
    expt_dir = tutils.get_data_path(logger, path_dir=tmpdir)
    yaml_path = os.path.join(expt_dir, TensorBoardLogger.NAME_HPARAMS_FILE)
    yaml_hparams = load_hparams_from_yaml(yaml_path)

    # write the same values as legacy meta_tags.csv and load them again
    csv_path = os.path.join(expt_dir, 'meta_tags.csv')
    save_hparams_to_tags_csv(csv_path, yaml_hparams)
    csv_hparams = load_hparams_from_tags_csv(csv_path)

    assert yaml_hparams == csv_hparams
Пример #21
0
def test_auto_scale_batch_size_set_model_attribute(tmpdir, use_hparams):
    """Check that auto-scaling during fit writes a new batch size to the expected attribute.

    Args:
        tmpdir: directory used as ``default_root_dir``.
        use_hparams: when true, use a model that only exposes ``hparams.batch_size``.
    """
    tutils.reset_seed()

    hparams = EvalModelTemplate.get_default_hparams()
    original_batch_size = hparams.get('batch_size')

    class HparamsEvalModelTemplate(EvalModelTemplate):

        def dataloader(self, *args, **kwargs):
            # temporarily expose batch_size so a dataloader can be built, then
            # drop it again so that only self.hparams.batch_size remains
            self.batch_size = original_batch_size
            loader = super().dataloader(*args, **kwargs)
            del self.batch_size
            return loader

    if use_hparams:
        model_class = HparamsEvalModelTemplate
    else:
        model_class = EvalModelTemplate
    model = model_class(**hparams)

    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, auto_scale_batch_size=True)
    trainer.fit(model)

    if use_hparams:
        scaled_batch_size = model.hparams.batch_size
    else:
        scaled_batch_size = model.batch_size
    assert original_batch_size != scaled_batch_size
Пример #22
0
def test_full_loop_dp(tmpdir):
    """Run fit followed by test with the ``dp`` backend on two GPUs and check the test accuracy."""
    reset_seed()

    datamodule = TrialMNISTDataModule(tmpdir)
    model = EvalModelTemplate()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
        distributed_backend='dp',
        gpus=2,
        deterministic=True,
    )

    # training must report success
    fit_status = trainer.fit(model, datamodule)
    assert fit_status == 1

    # evaluate on the datamodule's test data
    test_results = trainer.test(datamodule=datamodule)
    first_result = test_results[0]
    assert first_result['test_acc'] > 0.8
def test_lr_monitor_single_lr_with_momentum(tmpdir):
    """Check that ``LearningRateMonitor(log_momentum=True)`` records momentum for a single lr scheduler."""
    tutils.reset_seed()

    model = EvalModelTemplate()
    model.configure_optimizers = model.configure_optimizers__onecycle_scheduler

    monitor = LearningRateMonitor(log_momentum=True)
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=0.1,
        limit_train_batches=0.5,
        callbacks=[monitor],
    )
    fit_status = trainer.fit(model)
    assert fit_status

    momentum_values = monitor.last_momentum_values

    all_logged = all(value is not None for value in momentum_values.values())
    assert all_logged, 'Expected momentum to be logged'

    same_count = len(momentum_values) == len(trainer.lr_schedulers)
    assert same_count, 'Number of momentum values logged does not match number of lr schedulers'

    allowed_names = ['lr-SGD-momentum']
    names_ok = all(name in allowed_names for name in momentum_values.keys())
    assert names_ok, 'Names of momentum values not set correctly'
Пример #24
0
def test_full_loop_ddp_spawn(tmpdir):
    """Run fit followed by test with the ``ddp_spawn`` backend on GPUs 0 and 1 and check the test accuracy."""
    import os

    # make devices 0 and 1 visible; the variable stays set after the test returns
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

    reset_seed()

    datamodule = TrialMNISTDataModule(tmpdir)
    model = EvalModelTemplate()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
        distributed_backend='ddp_spawn',
        gpus=[0, 1],
    )

    # training must report success
    fit_status = trainer.fit(model, datamodule)
    assert fit_status == 1

    # evaluate on the datamodule's test data
    test_results = trainer.test(datamodule=datamodule)
    first_result = test_results[0]
    assert first_result['test_acc'] > 0.8
def test_numpy_metric_ddp():
    """Spawn two processes that each run ``_ddp_test_numpy_metric`` with the world size as argument."""
    tutils.reset_seed()
    tutils.set_random_master_port()
    num_processes = 2
    mp.spawn(_ddp_test_numpy_metric, nprocs=num_processes, args=(num_processes, ))