Example #1
def test_deepspeed_run_configure_optimizers(tmpdir):
    """Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation), whilst using
    configure_optimizers for optimizers and schedulers."""
    class TestCB(Callback):
        def on_train_start(self, trainer, pl_module) -> None:
            from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer

            assert isinstance(trainer.optimizers[0],
                              FP16_DeepSpeedZeroOptimizer)
            assert isinstance(trainer.optimizers[0].optimizer, torch.optim.SGD)
            assert isinstance(trainer.lr_schedulers[0]["scheduler"],
                              torch.optim.lr_scheduler.StepLR)
            # check that the lr_scheduler config was preserved
            assert trainer.lr_schedulers[0]["name"] == "Sean"
            # Ensure DeepSpeed engine has initialized with our lr_scheduler
            assert isinstance(trainer.model.lr_scheduler,
                              torch.optim.lr_scheduler.StepLR)

    class TestModel(BoringModel):
        def configure_optimizers(self):
            [optimizer], [scheduler] = super().configure_optimizers()
            return {
                "optimizer": optimizer,
                "lr_scheduler": {
                    "scheduler": scheduler,
                    "name": "Sean"
                }
            }

    model = TestModel()
    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        plugins=DeepSpeedPlugin(),  # disable ZeRO so our optimizers are not wrapped
        default_root_dir=tmpdir,
        gpus=1,
        fast_dev_run=True,
        precision=16,
        callbacks=[TestCB(), lr_monitor],
    )
    trainer.fit(model)

    assert lr_monitor.lrs == {"Sean": [0.1]}

    _assert_save_model_is_equal(model, tmpdir, trainer)
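
The helper _assert_save_model_is_equal is defined elsewhere in the test suite and not shown here; a minimal sketch of the kind of check it performs, assuming a plain (non-sharded) Lightning checkpoint with standard state_dict keys, could look like this:

import os

import torch


def _assert_save_model_is_equal(model, tmpdir, trainer):
    # Hypothetical re-implementation for illustration: save a checkpoint and
    # verify the stored weights match the in-memory model parameters.
    checkpoint_path = os.path.join(str(tmpdir), "model.ckpt")
    trainer.save_checkpoint(checkpoint_path)
    saved_state = torch.load(checkpoint_path)["state_dict"]
    for name, param in model.state_dict().items():
        assert torch.equal(param.cpu(), saved_state[name].cpu())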
Example #2
def training_loop(train, valid, save_path, pl_module, callbacks,
        n_epochs, checkpoint_callback, use_neptune=False, resume=True, limit_train_batches=None, neptune_tags="", neptune_name=""):
    """
    Largely model/application agnostic training code.
    """
    # Train with proper resuming
    # Copy gin configs used, for reference, to the save folder

    if not limit_train_batches:
        limit_train_batches = len(train)
        
    os.system("rm " + os.path.join(save_path, "*gin"))
    for gin_config in sys.argv[2].split(";"):
        os.system("cp {} {}/base_config.gin".format(gin_config, save_path))
    with open(os.path.join(save_path, "config.gin"), "w") as f:
        f.write(gin.operative_config_str())
    hparams = parse_gin_config(os.path.join(save_path, 'config.gin'))
    if 'train.callbacks' in hparams:
        del hparams['train.callbacks']
    # TODO: What is a less messy way to pass hparams? This is only so that logging is aware of the hyperparameters
    pl_module._set_hparams(hparams)
    pl_module._hparams_initial = copy.deepcopy(hparams)
    loggers = []
    loggers.append(pl_loggers.CSVLogger(save_path))
    if use_neptune:
        from pytorch_lightning.loggers import NeptuneLogger
        loggers.append(NeptuneLogger(
            api_key=NEPTUNE_TOKEN,
            project_name=NEPTUNE_USER + "/" + NEPTUNE_PROJECT,
            experiment_name=neptune_name if len(neptune_name) else os.path.basename(save_path),
            tags=neptune_tags.split(',') if len(neptune_tags) else None,
        ))
        callbacks += [MetaSaver(), Heartbeat(), LearningRateMonitor()]
    trainer = pl.Trainer(
        default_root_dir=save_path,
        limit_train_batches=limit_train_batches,
        max_epochs=n_epochs,
        logger=loggers,
        callbacks=callbacks,
        log_every_n_steps=1,
        checkpoint_callback=checkpoint_callback,
        resume_from_checkpoint=os.path.join(save_path, 'last.ckpt')
        if resume and os.path.exists(os.path.join(save_path, 'last.ckpt')) else None)
    trainer.fit(pl_module, train, valid)
    return trainer
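
parse_gin_config is a project-specific helper that is not shown; a hypothetical sketch of the flattening it appears to perform, turning scope.param = value lines from the saved gin config into a dict (which is why keys such as 'train.callbacks' can be looked up above), might be:

def parse_gin_config(path):
    # Hypothetical helper: flatten a gin config file into {"scope.param": raw_value} pairs.
    hparams = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, value = line.split("=", 1)
            hparams[key.strip()] = value.strip()
    return hparams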
Example #3
def callback_objects(model_config, lr_logger=False):

    callback_list = model_config["callbacks"]
    callback_list = handle_config_cases(callback_list)

    model_set = model_config["set"]
    model_library = model_config["model_library"]
    callback_object_list = [
        find_model(model_set, callback, model_library)()
        for callback in callback_list
    ]

    if lr_logger:
        lr_monitor = LearningRateMonitor(logging_interval='epoch')
        callback_object_list = callback_object_list + [lr_monitor]

    logging.info("Callbacks found")
    return callback_object_list
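
handle_config_cases and find_model are helpers from the surrounding project; a hypothetical sketch of the normalisation handle_config_cases presumably performs, accepting either a single callback name or a list of names, is:

def handle_config_cases(entry):
    # Hypothetical helper: always return a list so the caller can iterate uniformly.
    if entry is None:
        return []
    if isinstance(entry, str):
        return [entry]
    return list(entry)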
Example #4
def generateModel():
    CHECKPOINT_ROOT = BASE / "checkpoints" 
    model = AlexnetFineTuning((3,224,224))
    lr_monitor = LearningRateMonitor(logging_interval='epoch')
    CHECKPOINT_PATH = CHECKPOINT_ROOT
    checkpoint_callback = ModelCheckpoint(
        dirpath = CHECKPOINT_PATH,
        filename = '{epoch}-{val_acc:.2f}',
        monitor = "val_acc",
        save_last = True,
        mode = "max",
        save_top_k = -1,
        period = 300
        )
    trainer = pl.Trainer(gpus=1, callbacks=[checkpoint_callback, lr_monitor], max_epochs=EPOCHS,
                         progress_bar_refresh_rate=0, weights_summary=None)
    trainer.fit(model, train_loader, val_loader)
    trainer.test(test_dataloaders=test_loader)
Example #5
def test_lr_monitor_no_lr_scheduler(tmpdir):
    tutils.reset_seed()

    class CustomBoringModel(BoringModel):
        def configure_optimizers(self):
            optimizer = optim.SGD(self.parameters(), lr=0.1)
            return optimizer

    model = CustomBoringModel()

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        default_root_dir=tmpdir, max_epochs=2, limit_val_batches=0.1, limit_train_batches=0.5, callbacks=[lr_monitor]
    )

    with pytest.warns(RuntimeWarning, match="have no learning rate schedulers"):
        trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
Example #6
def test_lr_monitor_no_lr_scheduler(tmpdir):
    tutils.reset_seed()

    model = EvalModelTemplate()

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=0.1,
        limit_train_batches=0.5,
        callbacks=[lr_monitor],
    )

    with pytest.warns(RuntimeWarning,
                      match='have no learning rate schedulers'):
        result = trainer.fit(model)
        assert result
Example #7
 def configure_callbacks(self):
     cwd = hydra.utils.get_original_cwd()
     filename = "{}_{}_{}_H{}_W{}".format(
         self.args.exp_name,
         self.args.arch.decoder,
         self.args.arch.encoder,
         self.args.arch.image_height,
         self.args.arch.image_width,
     )
     checkpoint_callback = ModelCheckpoint(
         monitor="val_loss",
         mode="min",
         dirpath=os.path.join(cwd, self.args.path2weight),
         filename=filename,
         save_top_k=1,
     )
     lr_monitor = LearningRateMonitor(logging_interval="step")
     return [checkpoint_callback, lr_monitor]
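
This is the LightningModule.configure_callbacks hook shown outside its class; a minimal sketch of how it sits inside a module (the surrounding class is hypothetical); Lightning merges the returned callbacks with those passed to the Trainer:

import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint


class SegmentationModule(pl.LightningModule):  # hypothetical surrounding class
    def configure_callbacks(self):
        # Callbacks returned here are combined with the Trainer's own callbacks.
        checkpoint = ModelCheckpoint(monitor="val_loss", mode="min")
        return [checkpoint, LearningRateMonitor(logging_interval="step")]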
Example #8
def main(args):
    pl.seed_everything(42)
    
    # load configs
    with open(args.cfg_path, 'r') as fp:
        cfg = json.load(fp)
    cfg_train = cfg['train']
    print('### TRAIN CONFIGS:')
    pprint(cfg_train)
    print('### MODEL CONFIGS:')
    pprint(cfg['model'])
    #os.environ['CUDA_VISIBLE_DEVICES'] = cfg_train['trainer_kwargs']['gpus']
        
    # init model
    ModelClass = {
        'cvae': CvaeInception,
        'vae': VaeInception
    }[cfg_train['type']]
    model = ModelClass(cfg['model'])
    
    # init data loader
    dm = UrbanSoundsModule(
        cfg['dataset'], 
        num_workers=cfg_train['num_workers'], 
        batch_size=cfg_train['batch_size'])
    dm.setup()
    
    # logger
    log_name = 'usnds_{}_{}'.format(ModelClass.model_name, cfg_train['descr'])
    logger = TensorBoardLogger(save_dir='logs', name=log_name)
    
    # callbacks
    early_stop = EarlyStopping(monitor='val_loss', patience=cfg_train['patience'])
    lr_monitor = LearningRateMonitor(logging_interval='epoch')

    # trainer
    trainer = pl.Trainer(
        max_epochs=cfg_train['max_epochs'],
        logger=logger,
        callbacks=[early_stop, lr_monitor],
        **cfg_train['trainer_kwargs'])
    
    # train
    trainer.fit(model=model, datamodule=dm)
Example #9
def main():
    pl.seed_everything(1234)
    # ------------
    # args
    # ------------
    parser = get_parser()
    parser = MrcSpanProposal.add_model_specific_args(parser)
    parser = pl.Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    # ------------
    # model
    # ------------
    model = MrcSpanProposal(args)

    # load pretrained_model
    if args.pretrained:
        model.load_state_dict(
            torch.load(args.pretrained, map_location=torch.device('cpu'))["state_dict"]
        )

    # call backs
    checkpoint_callback = ModelCheckpoint(
        monitor=f'val_top{MrcSpanProposal.acc_topk}_acc',
        dirpath=args.default_root_dir,
        save_top_k=10,
        save_last=True,
        mode='max',
        verbose=True
    )

    lr_monitor = LearningRateMonitor(logging_interval='step')
    print_model = ModelPrintCallback(print_modules=["model"])
    callbacks = [checkpoint_callback, lr_monitor, print_model]
    if args.freeze_bert:
        callbacks.append(EvalCallback(["model.bert"]))

    trainer = pl.Trainer.from_argparse_args(
        args,
        callbacks=callbacks,
        replace_sampler_ddp=False
    )

    trainer.fit(model)
Example #10
def test_lr_monitor_no_lr_scheduler_multi_lrs(tmpdir, logging_interval: str):
    """Test that learning rates are extracted and logged for multi optimizers but no lr scheduler."""
    tutils.reset_seed()

    class CustomBoringModel(BoringModel):
        def training_step(self, batch, batch_idx, optimizer_idx):
            return super().training_step(batch, batch_idx)

        def configure_optimizers(self):
            optimizer1 = optim.Adam(self.parameters(), lr=1e-2)
            optimizer2 = optim.Adam(self.parameters(), lr=1e-2)

            return [optimizer1, optimizer2]

    model = CustomBoringModel()
    model.training_epoch_end = None

    lr_monitor = LearningRateMonitor(logging_interval=logging_interval)
    log_every_n_steps = 2

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        log_every_n_steps=log_every_n_steps,
        limit_train_batches=7,
        limit_val_batches=0.1,
        callbacks=[lr_monitor],
    )
    trainer.fit(model)

    assert lr_monitor.lrs, "No learning rates logged"
    assert len(lr_monitor.lrs) == len(trainer.optimizers)
    assert list(lr_monitor.lrs) == [
        "lr-Adam", "lr-Adam-1"
    ], "Names of learning rates not set correctly"

    if logging_interval == "step":
        # divide by 2 because we have 2 optimizers
        expected_number_logged = trainer.global_step // 2 // log_every_n_steps
    if logging_interval == "epoch":
        expected_number_logged = trainer.max_epochs

    assert all(
        len(lr) == expected_number_logged for lr in lr_monitor.lrs.values())
Example #11
def train(cfg):
    datamodule_args = {}
    if cfg.transforms.train:
        train_transforms = [
            hydra.utils.instantiate(t) for t in cfg.transforms.train
        ]
        datamodule_args["train_transforms"] = train_transforms
    if cfg.transforms.val:
        val_transforms = [
            hydra.utils.instantiate(t) for t in cfg.transforms.val
        ]
        datamodule_args["val_transforms"] = val_transforms
    data_module = hydra.utils.instantiate(cfg.dataset, **datamodule_args)
    data_module.prepare_data()
    lr_scheduler = resolve_steps_per_epoch(cfg,
                                           len_train=data_module.len_train)

    model = FlowerClassifier(
        **cfg.model,
        optimizer_config=cfg.optimizer,
        lr_scheduler_config=lr_scheduler,
        batch_size=cfg.dataset.batch_size,
    )
    logger = hydra.utils.instantiate(cfg.trainer.logger) or False
    lr_logger = LearningRateMonitor(logging_interval="step")
    callbacks = [lr_logger]

    # checkpoint callback requires dynamic configuration
    experiment = getattr(logger, "experiment", None)
    logger_dir = getattr(experiment, "dir", "logger")
    checkpoints_dir = os.path.join(logger_dir, "{epoch}")
    checkpoint_callback = hydra.utils.instantiate(
        cfg.trainer.checkpoint_callback, filepath=checkpoints_dir) or False
    if checkpoint_callback:
        callbacks.append(checkpoint_callback)

    trainer_args = {
        **cfg.trainer,
        "logger": logger,
        "callbacks": callbacks,
    }
    trainer = Trainer(**trainer_args)

    trainer.fit(model, datamodule=data_module)
Example #12
def test_lr_monitor_custom_name(tmpdir):
    class TestModel(BoringModel):
        def configure_optimizers(self):
            optimizer, [scheduler] = super().configure_optimizers()
            lr_scheduler = {'scheduler': scheduler, 'name': 'my_logging_name'}
            return optimizer, [lr_scheduler]

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=0.1,
        limit_train_batches=0.5,
        callbacks=[lr_monitor],
        progress_bar_refresh_rate=0,
        weights_summary=None,
    )
    trainer.fit(TestModel())
    assert lr_monitor.lr_sch_names == list(lr_monitor.lrs.keys()) == ['my_logging_name']
Example #13
def train(args, trainer_args, model_args):
    # df = pd.read_csv(os.path.join(args['data_directory'], 'train.csv'))
    # train_df, val_df = model_selection.train_test_split(df, test_size=0.1, random_state=42, stratify=df.label.values)
    # train_df.reset_index(inplace=True, drop=True)
    # val_df.reset_index(inplace=True, drop=True)
    datamodule = CassavaDataModule(batch_size=args['batch_size'],
                                   data_dir=args['data_directory'],
                                   num_workers=4,
                                   sample_size=args['sample_size'])
    classifier_list = [Resnet18, Resnet50, EfficientNetB1, VisTransformer]
    classifier_names = [elem.__name__.lower() for elem in classifier_list]
    classifier_model_name = args['model_type']
    classifier = classifier_list[classifier_names.index(classifier_model_name)]
    classifier_model_dir = os.path.join('logs', classifier_model_name)
    #trainer_args = {'max_epochs' :8, 'profiler' : 'simple', 'precision' :16, 'gradient_clip_val' : 100, 'gpus':1 }
    #model_args = {'lr' : 5e-5}
    load_pretrained = False
    load_pretrained = os.path.exists(classifier_model_dir) and load_pretrained
    checkpoints = list(
        filter(lambda x: '.ckpt' in x,
               os.listdir(classifier_model_dir))) if load_pretrained else []
    load_pretrained = load_pretrained and len(checkpoints) > 0
    if load_pretrained:
        checkpoint_path = os.path.join(classifier_model_dir, checkpoints[-1])
        model = classifier.load_from_checkpoint(checkpoint_path)
    else:
        model = classifier(**model_args)
    print(model)
    logger = TensorBoardLogger("logs",
                               name=classifier_model_name,
                               log_graph=True)
    lr_monitor = LearningRateMonitor(logging_interval='step')
    model_chkpt = ModelCheckpoint(dirpath=classifier_model_dir,
                                  monitor='val_acc_epoch',
                                  filename='{epoch}-{val_acc_epoch:.2f}',
                                  verbose=True)
    early_stopper = EarlyStopping(monitor='val_acc_epoch',
                                  patience=6,
                                  verbose=True)
    trainer = pl.Trainer(logger=logger,
                         callbacks=[lr_monitor, model_chkpt, early_stopper],
                         **trainer_args)
    trainer.fit(model, datamodule)
Example #14
def test_lightning_cli_config_and_subclass_mode(tmpdir):
    input_config = {
        "fit": {
            "model": {
                "class_path": "tests.helpers.BoringModel"
            },
            "data": {
                "class_path": "tests.helpers.BoringDataModule",
                "init_args": {
                    "data_dir": str(tmpdir)
                }
            },
            "trainer": {
                "default_root_dir": str(tmpdir),
                "max_epochs": 1,
                "weights_summary": None
            },
        }
    }
    config_path = tmpdir / "config.yaml"
    with open(config_path, "w") as f:
        f.write(yaml.dump(input_config))

    with mock.patch("sys.argv", ["any.py", "--config", str(config_path)]):
        cli = LightningCLI(
            BoringModel,
            BoringDataModule,
            subclass_mode_model=True,
            subclass_mode_data=True,
            trainer_defaults={"callbacks": LearningRateMonitor()},
        )

    config_path = tmpdir / "lightning_logs" / "version_0" / "config.yaml"
    assert os.path.isfile(config_path)
    with open(config_path) as f:
        loaded_config = yaml.safe_load(f.read())

    loaded_config = loaded_config["fit"]
    cli_config = cli.config["fit"]

    assert loaded_config["model"] == cli_config["model"]
    assert loaded_config["data"] == cli_config["data"]
    assert loaded_config["trainer"] == cli_config["trainer"]
Example #15
def get_callbacks(
    logging_interval: str,
    experiment_type: str,
    save_top_k: int = 1,
    period: int = 1,
    monitor: str = "checkpoint_saving_loss",
):
    upload_comet_logs = UploadCometLogs(
        logging_interval, get_console_logger("callback"), experiment_type
    )
    lr_monitor = LearningRateMonitor(logging_interval=logging_interval)
    # saving the best model as per the validation loss.
    checkpoint_callback = UpdatedModelCheckpoint(
        save_top_k=save_top_k, period=period, monitor=monitor
    )
    return {
        "callbacks": [lr_monitor, upload_comet_logs],
        "checkpoint_callback": checkpoint_callback,
    }
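
The returned dict mirrors the pre-1.5 Trainer keyword arguments (callbacks and checkpoint_callback), so a caller could unpack it directly; a hypothetical usage sketch, assuming a Lightning version that still accepts a checkpoint callback instance for checkpoint_callback (the experiment_type value is illustrative):

import pytorch_lightning as pl

# Hypothetical usage: forward the callback bundle straight into the Trainer.
trainer_kwargs = get_callbacks(logging_interval="step", experiment_type="training")
trainer = pl.Trainer(max_epochs=10, **trainer_kwargs)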
Example #16
def test_lr_monitor_single_lr(tmpdir):
    """Test that learning rates are extracted and logged for single lr scheduler."""
    tutils.reset_seed()

    model = BoringModel()

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=2,
                      limit_val_batches=0.1,
                      limit_train_batches=0.5,
                      callbacks=[lr_monitor])
    trainer.fit(model)

    assert lr_monitor.lrs, "No learning rates logged"
    assert all(v is None for v in lr_monitor.last_momentum_values.values()
               ), "Momentum should not be logged by default"
    assert len(lr_monitor.lrs) == len(trainer.lr_scheduler_configs)
    assert list(lr_monitor.lrs) == ["lr-SGD"]
Example #17
def test_lr_monitor_custom_name(tmpdir):
    class TestModel(BoringModel):
        def configure_optimizers(self):
            optimizer, [scheduler] = super().configure_optimizers()
            lr_scheduler = {"scheduler": scheduler, "name": "my_logging_name"}
            return optimizer, [lr_scheduler]

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=0.1,
        limit_train_batches=0.5,
        callbacks=[lr_monitor],
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.fit(TestModel())
    assert list(lr_monitor.lrs) == ["my_logging_name"]
Example #18
def _get_trainer_callbacks(cfg: CfgNode) -> List[Callback]:
    """Gets the trainer callbacks based on the given D2Go Config.

    Args:
        cfg: The normalized ConfigNode for this D2Go Task.

    Returns:
        A list of configured Callbacks to be used by the Lightning Trainer.
    """
    callbacks: List[Callback] = [
        LearningRateMonitor(logging_interval="step"),
        ModelCheckpoint(
            dirpath=cfg.OUTPUT_DIR,
            save_last=True,
        ),
    ]
    if cfg.QUANTIZATION.QAT.ENABLED:
        callbacks.append(QuantizationAwareTraining.from_config(cfg))
    return callbacks
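
A hypothetical sketch of how these callbacks would be wired into a Lightning Trainer (the surrounding setup and the task module are illustrative):

import pytorch_lightning as pl

# Hypothetical wiring: the configured callbacks feed straight into the Trainer.
callbacks = _get_trainer_callbacks(cfg)
trainer = pl.Trainer(default_root_dir=cfg.OUTPUT_DIR, callbacks=callbacks)
trainer.fit(task)  # `task` would be the D2Go Lightning task module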
Example #19
def train(params: Params):
    seed_everything(params.d.seed)

    tb_logger = TensorBoardLogger(
        params.t.save_dir,
        name='011_popularity',
        version=str(int(time())),
    )

    log_dir = Path(tb_logger.log_dir)
    log_dir.mkdir(parents=True, exist_ok=True)

    logger = getLogger('lightning')
    logger.addHandler(FileHandler(log_dir / 'train.log'))
    logger.info(params.pretty())

    callbacks = [
        LearningRateMonitor(),
    ]
    if params.t.checkpoint_callback:
        callbacks.append(ModelCheckpoint(
            monitor=None,
            verbose=True,
        ), )
    trainer = pl.Trainer(
        max_epochs=params.t.epochs,
        gpus=params.t.gpus,
        tpu_cores=params.t.num_tpu_cores,
        logger=tb_logger,
        precision=params.t.precision,
        resume_from_checkpoint=params.t.resume_from_checkpoint,
        weights_save_path=params.t.weights_save_path,
        checkpoint_callback=params.t.weights_save_path is not None,
        callbacks=callbacks,
        deterministic=True,
        benchmark=True,
        accumulate_grad_batches=params.t.accumulate_grad_batches,
        val_check_interval=params.t.val_check_interval,
    )
    net = PLModule(params.m.to_dict())
    dm = PopularityDataModule(params.d)

    trainer.fit(net, datamodule=dm)
Example #20
def test_lr_monitor_single_lr_with_momentum(tmpdir, opt):
    """
    Test that learning rates and momentum are extracted and logged for single lr scheduler.
    """
    class LogMomentumModel(BoringModel):
        def __init__(self, opt):
            super().__init__()
            self.opt = opt

        def configure_optimizers(self):
            if self.opt == 'SGD':
                opt_kwargs = {'momentum': 0.9}
            elif self.opt == 'Adam':
                opt_kwargs = {'betas': (0.9, 0.999)}

            optimizer = getattr(optim, self.opt)(self.parameters(),
                                                 lr=1e-2,
                                                 **opt_kwargs)
            lr_scheduler = optim.lr_scheduler.OneCycleLR(optimizer,
                                                         max_lr=1e-2,
                                                         total_steps=10_000)
            return [optimizer], [lr_scheduler]

    model = LogMomentumModel(opt=opt)
    lr_monitor = LearningRateMonitor(log_momentum=True)
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=2,
        limit_train_batches=5,
        log_every_n_steps=1,
        callbacks=[lr_monitor],
    )
    trainer.fit(model)
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

    assert all(v is not None for v in lr_monitor.last_momentum_values.values()), \
        'Expected momentum to be logged'
    assert len(lr_monitor.last_momentum_values) == len(trainer.lr_schedulers), \
        'Number of momentum values logged does not match number of lr schedulers'
    assert all(k == f'lr-{opt}-momentum' for k in lr_monitor.last_momentum_values.keys()), \
        'Names of momentum values not set correctly'
Example #21
def train_and_test(model, dm, logger):

    ES = EarlyStopping(
        monitor="train_loss",
        min_delta=0.001,
        patience=10,
        verbose=True,
        mode="min",
        strict=True,
    )
    LRM = LearningRateMonitor("epoch")
    tag = "unsupervised_v1"
    CKPT = ModelCheckpoint(
        dirpath="/gaia/models",
        filename=tag,
        monitor="train_loss",
        mode="min",
        verbose=True,
    )
    LSS = LatentSpaceSaver()

    trainer = pl.Trainer(
        auto_lr_find=False,
        gpus=1,
        auto_select_gpus=False,
        gradient_clip_val=1.0,
        log_gpu_memory="min_max",
        reload_dataloaders_every_epoch=True,
        callbacks=[ES, LRM, CKPT, LSS, Plotter()],
        accelerator="ddp",
        log_every_n_steps=150,
        flush_logs_every_n_steps=300,
        terminate_on_nan=True,
        track_grad_norm=2,
        weights_summary="full",
        profiler="simple",
        replace_sampler_ddp=True,
        logger=logger,
    )
    # trainer.tune(model=model, datamodule=dm)
    trainer.fit(model=model, datamodule=dm)
    trainer.test(model=model, datamodule=dm, verbose=True)
Example #22
def train(args):

    print('Training arguments: ', args)

    seed_everything(args.seed)

    os.makedirs(args.log_dir, exist_ok=True)

    data = SNLIData(batch_size=args.batch_size)
    train_loader, val_loader, test_loader = data.get_iters()

    checkpoint_callback = ModelCheckpoint(monitor='val_loss')

    trainer = Trainer(
        default_root_dir=args.log_dir,
        limit_train_batches=args.limit_train_batches,  # for testing with less data
        fast_dev_run=False,  # for checking with 1 batch,
        callbacks=[
            LearningRateMonitor(logging_interval='step'), checkpoint_callback
        ],
        logger=TensorBoardLogger(args.log_dir, name=args.encoder_type),
        gpus=1 if torch.cuda.is_available() else 0,
        max_epochs=args.epochs,
        progress_bar_refresh_rate=args.refresh_rate)

    model = NLINet(encoder_type=args.encoder_type,
                   enc_hidden_dim=args.enc_hidden_dim,
                   cls_hidden_dim=args.cls_hidden_dim,
                   lr=args.lr,
                   dataset_sizes=data.sizes)

    # Training
    trainer.fit(model, train_loader, val_loader)
    print('Best checkpoint:', checkpoint_callback.best_model_path)
    # Testing
    #   model = NLINet.load_from_checkpoint(
    #       trainer.checkpoint_callback.best_model_path)
    test_result = trainer.test(model,
                               test_dataloaders=test_loader,
                               verbose=True)
    return test_result
Example #23
def test_lr_monitor_custom_pg_name(tmpdir):
    class TestModel(BoringModel):
        def configure_optimizers(self):
            optimizer = torch.optim.SGD([{"params": list(self.layer.parameters()), "name": "linear"}], lr=0.1)
            lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
            return [optimizer], [lr_scheduler]

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=2,
        limit_train_batches=2,
        callbacks=[lr_monitor],
        progress_bar_refresh_rate=0,
        weights_summary=None,
    )
    trainer.fit(TestModel())
    assert lr_monitor.lr_sch_names == ["lr-SGD"]
    assert list(lr_monitor.lrs) == ["lr-SGD/linear"]
Example #24
 def create_trainer(self, logger=None, callbacks=None, **kwargs):
     """Create a pytorch-lightning trainer by reading config files

     Args:
         logger: an optional pytorch-lightning logger
         callbacks (list): a list of pytorch-lightning callback instances
     """
     # Avoid a mutable default argument so callbacks do not accumulate across calls
     callbacks = callbacks if callbacks is not None else []

     # If val data is passed, monitor learning rate and set up classification metrics
     if self.config["validation"]["csv_file"] is not None:
         if logger is not None:
             lr_monitor = LearningRateMonitor(logging_interval='epoch')
             callbacks.append(lr_monitor)
     
     self.trainer = pl.Trainer(logger=logger,
                               max_epochs=self.config["train"]["epochs"],
                               gpus=self.config["gpus"],
                               enable_checkpointing=False,
                               accelerator=self.config["distributed_backend"],
                               fast_dev_run=self.config["train"]["fast_dev_run"],
                               callbacks=callbacks,
                               **kwargs)
Example #25
def train():
    model = SDDSegModel()
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        dirpath='checkpoints_sdd/deeplabv3_effnet-b2/',
        save_top_k=1,
        verbose=True,
        monitor='val_loss',
        mode='min',
        prefix='')
    lr_monitor = LearningRateMonitor(logging_interval='step')
    trainer = pl.Trainer(
        gpus=1,
        callbacks=[lr_monitor],
        checkpoint_callback=checkpoint_callback,
        num_sanity_val_steps=-1,
        log_every_n_steps=4,
        max_epochs=22,
        resume_from_checkpoint='checkpoints_sdd/deeplabv3_effnet-b2/epoch=21-step=2639.ckpt')
    trainer.fit(model)
Example #26
def _get_trainer(project_parameters):
    callbacks = [
        ModelCheckpoint(monitor='validation accuracy', mode='max'),
        LearningRateMonitor(logging_interval='epoch', log_momentum=True)
    ]
    if project_parameters.use_early_stopping:
        callbacks.append(
            EarlyStopping(monitor='validation loss',
                          patience=project_parameters.patience,
                          mode='min'))
    return Trainer(callbacks=callbacks,
                   gpus=project_parameters.gpus,
                   max_epochs=project_parameters.train_iter,
                   weights_summary=project_parameters.weights_summary,
                   profiler=project_parameters.profiler,
                   deterministic=True,
                   check_val_every_n_epoch=project_parameters.val_iter,
                   default_root_dir=project_parameters.save_path,
                   num_sanity_val_steps=0,
                   precision=project_parameters.precision)
Example #27
def train_regression(args_lightning_model_parameters,
                     epochs,
                     gpus=1,
                     es_patience=30):
    net = UNet(**args_lightning_model_parameters)

    torchsummary.summary(net, (12, 288, 288), device="cpu")
    # return
    default_save_path = "output/lightning/precip_regression"
    if not os.path.exists(default_save_path):
        os.makedirs(default_save_path)

    checkpoint_callback = ModelCheckpoint(
        filepath=os.getcwd() + "/" + default_save_path + "/" +
        net.__class__.__name__ + "/{epoch}-{val_loss:.6f}",
        save_top_k=-1,
        verbose=False,
        monitor='val_loss',
        mode='min',
        prefix=net.__class__.__name__ + "_rain_threshhold_50_")
    lr_logger = LearningRateMonitor()
    tb_logger = loggers.TensorBoardLogger(save_dir=default_save_path,
                                          name=net.__class__.__name__)

    earlystopping_callback = EarlyStopping(
        monitor='val_loss',
        mode='min',
        patience=es_patience,
        # is effectively half (due to a bug in pytorch-lightning)
    )
    trainer = pl.Trainer(gpus=gpus,
                         weights_summary=None,
                         max_epochs=epochs,
                         weights_save_path=default_save_path,
                         logger=tb_logger,
                         callbacks=[lr_logger, earlystopping_callback],
                         val_check_interval=0.25,
                         overfit_batches=0.1)
    # resume_from_checkpoint=resume_from_checkpoint,
    trainer.fit(net)
    return
Example #28
def test_lr_monitor_param_groups(tmpdir):
    """Test that learning rates are extracted and logged for single lr scheduler."""
    tutils.reset_seed()

    class CustomClassificationModel(ClassificationModel):
        def configure_optimizers(self):
            param_groups = [
                {
                    "params": list(self.parameters())[:2],
                    "lr": self.lr * 0.1
                },
                {
                    "params": list(self.parameters())[2:],
                    "lr": self.lr
                },
            ]

            optimizer = optim.Adam(param_groups)
            lr_scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)
            return [optimizer], [lr_scheduler]

    model = CustomClassificationModel()
    dm = ClassifDataModule()

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=2,
                      limit_val_batches=0.1,
                      limit_train_batches=0.5,
                      callbacks=[lr_monitor])
    trainer.fit(model, datamodule=dm)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    assert lr_monitor.lrs, "No learning rates logged"
    assert len(lr_monitor.lrs) == 2 * len(
        trainer.lr_schedulers
    ), "Number of learning rates logged does not match number of param groups"
    assert lr_monitor.lr_sch_names == ["lr-Adam"]
    assert list(lr_monitor.lrs.keys()) == [
        "lr-Adam/pg1", "lr-Adam/pg2"
    ], "Names of learning rates not set correctly"
Example #29
def test_lr_monitor_single_lr(tmpdir):
    """Test that learning rates are extracted and logged for single lr scheduler."""
    tutils.reset_seed()

    model = BoringModel()

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        default_root_dir=tmpdir, max_epochs=2, limit_val_batches=0.1, limit_train_batches=0.5, callbacks=[lr_monitor]
    )
    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    assert lr_monitor.lrs, "No learning rates logged"
    assert all(v is None for v in lr_monitor.last_momentum_values.values()), "Momentum should not be logged by default"
    assert len(lr_monitor.lrs) == len(
        trainer.lr_schedulers
    ), "Number of learning rates logged does not match number of lr schedulers"
    assert (
        lr_monitor.lr_sch_names == list(lr_monitor.lrs.keys()) == ["lr-SGD"]
    ), "Names of learning rates not set correctly"
Example #30
 def train(self,
           log_dir=os.path.abspath(os.path.join(os.path.dirname(__file__), '../data/logs/ocr')),
           seed: int = None
           ) -> NPOcrNet:
     """
     TODO: describe method
     """
     if seed is not None:
         aug_seed(seed)
         pl.seed_everything(seed)
     self.create_model()
     checkpoint_callback = ModelCheckpoint(dirpath=log_dir, monitor='val_loss')
     lr_monitor = LearningRateMonitor(logging_interval='step')
     self.trainer = pl.Trainer(max_epochs=self.epochs,
                               gpus=self.gpus,
                               callbacks=[checkpoint_callback, lr_monitor],
                               weights_summary=None)
     self.trainer.fit(self.model, self.dm)
     print("[INFO] best model path", checkpoint_callback.best_model_path)
     self.trainer.test()
     return self.model