Example #1
def train_model(model, model_dir):
    # Setup trainer

    cb1 = callbacks.ModelCheckpoint(filename='best-{epoch}',
                                    monitor='val_loss_mean',
                                    save_top_k=1,
                                    mode='min')
    cb2 = callbacks.ModelCheckpoint(filename='last-{epoch}', save_last=True)

    tb_logger = pl_loggers.TensorBoardLogger('{}/logs/'.format(model_dir))
    if Constants.n_gpus != 0:
        #trainer = Trainer(gpus=Constants.n_gpus, distributed_backend='ddp', logger = tb_logger, precision=16, default_root_dir=model_dir, max_epochs=n_epochs)
        trainer = Trainer(gpus=Constants.n_gpus,
                          callbacks=[cb1, cb2],
                          plugins=DDPPlugin(find_unused_parameters=False),
                          accelerator='ddp_spawn',
                          precision=16,
                          logger=tb_logger,
                          default_root_dir=model_dir,
                          max_epochs=n_epochs)
    else:
        trainer = Trainer(gpus=0,
                          default_root_dir=model_dir,
                          callbacks=[cb1, cb2],
                          logger=tb_logger,
                          distributed_backend='ddp_spawn',
                          max_epochs=n_epochs)

    trainer.fit(model)
    trainer.test()
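
Note: this snippet targets an older Lightning API (gpus=, accelerator='ddp_spawn', DDPPlugin), and mixes DDPPlugin with the spawn accelerator. A minimal sketch of the same GPU branch against the renamed 1.6+ API, reusing cb1, cb2, and tb_logger from above (an assumption, not the project's code):

# Hedged sketch, assuming pytorch_lightning >= 1.6, where devices= and
# strategy= replace gpus= and accelerator='ddp_spawn'.
trainer = Trainer(accelerator='gpu',
                  devices=Constants.n_gpus,
                  strategy='ddp_spawn',
                  precision=16,
                  callbacks=[cb1, cb2],
                  logger=tb_logger,
                  default_root_dir=model_dir,
                  max_epochs=n_epochs)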
Example #2
def build_callbacks(config):
    callback_list = []
    if config.TRAIN.CALLBACKS.LEARNING_RATE_MONITOR.ENABLE:
        callback_list.append(
            callbacks.LearningRateMonitor(
                logging_interval=config.TRAIN.CALLBACKS.LEARNING_RATE_MONITOR.LOGGING_INTERVAL
            )
        )
    if config.TRAIN.CALLBACKS.MODEL_CHECKPOINT.ENABLE:
        callback_list.append(
            callbacks.ModelCheckpoint(
                dirpath=config.OUTPUT,
                filename=config.TRAIN.CALLBACKS.MODEL_CHECKPOINT.FILE_NAME,
                monitor=config.TRAIN.CALLBACKS.MODEL_CHECKPOINT.MONITOR,
                save_top_k=config.TRAIN.CALLBACKS.MODEL_CHECKPOINT.SAVE_TOP_K,
                mode=config.TRAIN.CALLBACKS.MODEL_CHECKPOINT.MODE
            )
        )
    # if config.TRAIN.CALLBACKS.INTERVAL_STEP_VALIDATE.ENABLE:
    #     callback_list.append(
    #         IntervalStepValidate(config)
    #     )
    return callback_list

# Run validation on specified steps
# class IntervalStepValidate(Callback):
#     def __init__(self, config):
#         self.config = config
#         self.total_steps = config.TRAIN.STEPS
#         self.validation_interval = config.TRAIN.CALLBACKS.INTERVAL_STEP_VALIDATE.INTERVAL

#     def on_batch_end(self, trainer, pl_module):
#         if self.total_steps % self.validation_interval == 0:
#             trainer.validate_step()
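
The commented-out callback above would not work as written: self.total_steps never changes between batches, and trainer.validate_step() is not a Trainer method. Lightning supports step-interval validation natively through val_check_interval, so a hedged replacement (a sketch, assuming the same config schema) needs no custom callback:

# Hedged sketch: pass the assembled callbacks and let the Trainer run
# validation every INTERVAL training batches via val_check_interval.
trainer = Trainer(
    callbacks=build_callbacks(config),
    val_check_interval=config.TRAIN.CALLBACKS.INTERVAL_STEP_VALIDATE.INTERVAL,
)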
Example #3
def test_top_k(save_mock, tmpdir, k: int, epochs: int,
               val_check_interval: float, expected: int):
    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.last_coeff = 10.0

        def training_step(self, batch, batch_idx):
            loss = self.step(torch.ones(32))
            loss = loss / (loss + 0.0000001)
            loss += self.last_coeff
            self.log('my_loss', loss)
            self.last_coeff *= 0.999
            return loss

    model = TestModel()
    trainer = Trainer(callbacks=[
        callbacks.ModelCheckpoint(dirpath=tmpdir,
                                  monitor='my_loss',
                                  save_top_k=k)
    ],
                      default_root_dir=tmpdir,
                      max_epochs=epochs,
                      weights_summary=None,
                      val_check_interval=val_check_interval)
    trainer.fit(model)

    # make sure types are correct
    assert save_mock.call_count == expected
Example #4
def test_top_k(save_mock, tmpdir, k: int, epochs: int,
               val_check_interval: float, expected: int, save_last: bool):
    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.last_coeff = 10.0

        def training_step(self, batch, batch_idx):
            loss = self.step(torch.ones(32))
            loss = loss / (loss + 0.0000001)
            loss += self.last_coeff
            self.log("my_loss", loss)
            self.last_coeff *= 0.999
            return loss

    model = TestModel()
    trainer = Trainer(
        callbacks=[
            callbacks.ModelCheckpoint(dirpath=tmpdir,
                                      monitor="my_loss",
                                      save_top_k=k,
                                      save_last=save_last)
        ],
        default_root_dir=tmpdir,
        max_epochs=epochs,
        weights_summary=None,
        val_check_interval=val_check_interval,
    )
    trainer.fit(model)

    if save_last:
        # last epochs are saved every step (so double the save calls) and once `on_train_end`
        expected = expected * 2 + 1
    assert save_mock.call_count == expected
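
Both test_top_k variants reference a save_mock argument and parametrized inputs whose decorators were dropped from the listing. A plausible reconstruction, assuming pytest and unittest.mock (the parameter tuples are illustrative, not from the source):

import pytest
from unittest import mock

# Hedged sketch of the omitted decorators: torch.save is patched so the
# test can count how many checkpoint writes actually happened.
@mock.patch('torch.save')
@pytest.mark.parametrize(['k', 'epochs', 'val_check_interval', 'expected'],
                         [(1, 1, 1.0, 1), (2, 2, 0.3, 4)])
def test_top_k(save_mock, tmpdir, k: int, epochs: int,
               val_check_interval: float, expected: int):
    ...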
Example #5
    def run(config="config/base.yml"):
        config = util.load_config(config)
        now = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
        run_dir = path.join("wandb", now)
        run_dir = path.abspath(run_dir)
        os.environ['WANDB_PROJECT'] = "linear_turing"
        os.environ['TOKENIZERS_PARALLELISM'] = 'true'

        checkpoint_callback = callbacks.ModelCheckpoint(monitor='val_loss',
                                                        mode='min',
                                                        save_weights_only=True,
                                                        save_last=True,
                                                        filename='{epoch}_{val_loss:.2f}')

        other_callbacks = [
            pl.callbacks.LearningRateMonitor(),
            callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=10)
        ]

        experiment = Experiment(config)

        trainer = pl.Trainer(logger=pl.loggers.WandbLogger(log_model=True),
                             checkpoint_callback=checkpoint_callback,
                             callbacks=other_callbacks,
                             **config['trainer'])

        trainer.fit(experiment)
Example #6
def train_center_net(train_df, oof_df):
    train_dataset = centernet.WheatDataset(train_df, transforms=get_train_transforms())
    train_dataloader = DataLoader(train_dataset, batch_size=Config.Train.batch_size,
                                  shuffle=True, num_workers=4, drop_last=True, pin_memory=True)
    oof_dataset = centernet.WheatDataset(oof_df, test=True, transforms=get_valid_transforms())
    oof_dataloader = DataLoader(oof_dataset, batch_size=Config.Train.batch_size,
                                shuffle=False, num_workers=4, pin_memory=True)
    model = Resnest50CenterNet(conf=Config)
    early_stop = callbacks.EarlyStopping(monitor='val_map',
                                         patience=10,
                                         mode='max',
                                         verbose=True)
    checkpoint = callbacks.ModelCheckpoint(str(Config.Train.checkpoint_dir),
                                           monitor='val_map',
                                           verbose=True,
                                           mode='max',
                                           save_top_k=1)
    cbs = [
        callbacks.LearningRateLogger()
    ]
    trainer = Trainer(gpus=1,
                      early_stop_callback=early_stop,
                      checkpoint_callback=checkpoint,
                      callbacks=cbs,
                      benchmark=True,
                      deterministic=True,
                      max_epochs=Config.Train.epochs)
    trainer.fit(model, train_dataloader=train_dataloader,
                val_dataloaders=oof_dataloader)

    valid_dataset = centernet.WheatDataset(get_data(mode='valid'), test=True, transforms=get_test_transforms())
    valid_dataloader = DataLoader(valid_dataset, batch_size=Config.Train.batch_size,
                                  shuffle=False, num_workers=4, pin_memory=True)
    trainer.test(model, test_dataloaders=valid_dataloader)
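
early_stop_callback= and checkpoint_callback= as Trainer arguments, like callbacks.LearningRateLogger, are pre-1.0 Lightning idioms; newer releases route every callback through the single callbacks list. A hedged equivalent of the trainer above, reusing the same early_stop and checkpoint objects:

# Hedged sketch for newer Lightning: one callbacks list, and
# LearningRateLogger was renamed LearningRateMonitor.
trainer = Trainer(gpus=1,
                  callbacks=[early_stop, checkpoint,
                             callbacks.LearningRateMonitor()],
                  benchmark=True,
                  deterministic=True,
                  max_epochs=Config.Train.epochs)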
Example #7
def train_step(args, timestr='', best_ckpt=None):
    data_cfg = {
        "VIDEO_FPS": args["VIDEO_FPS"],
        "DATA_DIRECTORY": args["DATA_DIRECTORY"],
        "PRETRAIN_NUM_WORDS": args["PRETRAIN_NUM_WORDS"],
        "CHAR_TO_INDEX": args["CHAR_TO_INDEX"],
        "STEP_SIZE": args["STEP_SIZE"],
        "NUM_WORKERS": args["NUM_WORKERS"],
        "BATCH_SIZE": args["BATCH_SIZE"],
        "PRETRAIN": args["PRETRAIN"]
    }
    train_cfg = {
        "INIT_LR": args["INIT_LR"],
        "MOMENTUM1": args["MOMENTUM1"],
        "MOMENTUM2": args["MOMENTUM2"],
        "LR_SCHEDULER_FACTOR": args["LR_SCHEDULER_FACTOR"],
        "LR_SCHEDULER_WAIT": args["LR_SCHEDULER_WAIT"],
        "LR_SCHEDULER_THRESH": args["LR_SCHEDULER_THRESH"],
        "FINAL_LR": args["FINAL_LR"],
    }
    net_cfg = {
        "dModel": args["TX_NUM_FEATURES"],
        "nHeads": args["TX_ATTENTION_HEADS"],
        "numLayers": args["TX_NUM_LAYERS"],
        "peMaxLen": args["PE_MAX_LENGTH"],
        "fcHiddenSize": args["TX_FEEDFORWARD_DIM"],
        "dropout": args["TX_DROPOUT"],
        "numClasses": args["NUM_CLASSES"]
    }

    logger = pl_loggers.NeptuneLogger(
        project_name='benso/deep-avsr',
        experiment_name='video_only_curriculum',
        params=args,
        tags={'start_date': timestr}
    )

    model_checkpoint = pl_callbacks.ModelCheckpoint(
        filename=args["NUM_WORDS"] + '/{epoch:02d}-{val_wer:.2f}',
        save_weights_only=True,
        save_top_k=3,
        monitor='val_wer',
        period=1
    )

    trainer = pl.Trainer(
        logger=logger,
        checkpoint_callback=model_checkpoint,
        gpus=2,
        auto_select_gpus=False,
        max_epochs=args["NUM_STEPS"],
        accelerator=args["ACCELERATOR"],
        resume_from_checkpoint=best_ckpt
    )

    data = VideoNetDataModule(data_cfg=data_cfg)
    network = VideoNetPL(net_class=VideoNet, net_cfg=net_cfg, train_cfg=train_cfg)
    trainer.fit(model=network, datamodule=data)

    return model_checkpoint.best_model_path
Example #8
def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected):

    class TestModel(BoringModel):

        def training_step(self, batch, batch_idx):
            local_rank = int(os.getenv("LOCAL_RANK"))
            self.log('my_loss', batch_idx * (1 + local_rank), on_epoch=True)
            return super().training_step(batch, batch_idx)

        def training_epoch_end(self, outputs) -> None:
            data = str(self.global_rank)
            obj = [[data], (data, ), set(data)]
            out = self.trainer.training_type_plugin.broadcast(obj)
            assert obj == [[str(self.global_rank)], (str(self.global_rank), ), set(str(self.global_rank))]
            assert out == [['0'], ('0', ), set('0')]

    model = TestModel()
    trainer = Trainer(
        callbacks=[callbacks.ModelCheckpoint(dirpath=tmpdir, monitor='my_loss_step', save_top_k=k, mode="max")],
        default_root_dir=tmpdir,
        max_epochs=epochs,
        weights_summary=None,
        val_check_interval=val_check_interval,
        accelerator="ddp",
        gpus=2,
        limit_train_batches=64,
        limit_val_batches=32,
    )
    if os.getenv("LOCAL_RANK") == "0":
        with pytest.raises(UserWarning, match="The value associated to the key my_loss_epoch: [15.5, 31.0]"):
            trainer.fit(model)
        assert save_mock.call_count == expected
    else:
        trainer.fit(model)
Example #9
def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected):
    class TestModel(BoringModel):
        def training_step(self, batch, batch_idx):
            local_rank = int(os.getenv("LOCAL_RANK"))
            self.log("my_loss", batch_idx * (1 + local_rank), on_epoch=True)
            return super().training_step(batch, batch_idx)

        def training_epoch_end(self, outputs) -> None:
            local_rank = int(os.getenv("LOCAL_RANK"))
            if self.trainer.is_global_zero:
                self.log("my_loss_2", (1 + local_rank), on_epoch=True, rank_zero_only=True)
            data = str(self.global_rank)
            obj = [[data], (data,), set(data)]
            out = self.trainer.strategy.broadcast(obj)
            assert obj == [[str(self.global_rank)], (str(self.global_rank),), set(str(self.global_rank))]
            assert out == [["0"], ("0",), set("0")]

    model = TestModel()
    trainer = Trainer(
        callbacks=[callbacks.ModelCheckpoint(dirpath=tmpdir, monitor="my_loss_step", save_top_k=k, mode="max")],
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        max_epochs=epochs,
        enable_model_summary=False,
        val_check_interval=val_check_interval,
        strategy="ddp",
        gpus=2,
        limit_train_batches=64,
        limit_val_batches=32,
    )
    trainer.fit(model)
    if os.getenv("LOCAL_RANK") == "0":
        assert save_mock.call_count == expected
Example #10
def train(args, custom_callbacks=None):
    data_module = GwtDataModule(
        args.batch_size,
        args.num_dataset_workers,
        f'{args.dataset_base_path}/{args.split}/train.jsonl',
        f'{args.dataset_base_path}/{args.split}/validate.jsonl',
        f'{args.dataset_base_path}/{args.split}/test.jsonl',
        f'{args.dataset_base_path}/bpe_ast_vocab.txt',
    )

    if args.invalidate_line_caches:
        data_module.invalidate_caches()

    model = GwtSectionPredictionTransformer(
        data_module.vocab.get_size(),
        data_module.vocab.get_index(data_module.vocab.PAD_TOKEN),
        args.max_sequence_length,
        args.embedding_size,
        args.learning_rate,
        args.num_attention_heads,
        args.num_encoder_layers,
        args.num_decoder_layers,
        args.feedforward_dimensions,
        args.positional_encoding_dropout,
        args.transformer_dropout,
        args.lr_warmup_steps,
        args.optimize_on_smoothed_loss,
    )

    logger = loggers.TensorBoardLogger(
        args.tensorboard_dir,
        name=args.experiment_name,
        version=args.version,
    )
    logger.log_hyperparams(args)

    checkpoint_dir = os.path.join(logger.log_dir, 'checkpoints')

    loss_key = 'val_loss' if not args.optimize_on_smoothed_loss else 'label_smoothed_val_loss'

    trainer = pl.Trainer.from_argparse_args(
        args,
        resume_from_checkpoint=load_checkpoint_if_available(checkpoint_dir),
        logger=logger,
        checkpoint_callback=callbacks.ModelCheckpoint(
            filepath=f'{checkpoint_dir}/{{epoch}}-{{{loss_key}}}',
            save_top_k=5,
            monitor=loss_key,
            mode='min',
        ),
        **({
            'callbacks': custom_callbacks
        } if custom_callbacks else {}),
    )

    trainer.fit(model, data_module)
    return trainer
Example #11
def main():
    # parse the arguments
    args = config.parse_args()

    if args.ngpu == 0:
        args.device = 'cpu'

    pl.seed_everything(args.manual_seed)

    callbacks = [cbs.RichProgressBar()]
    if args.save_results:
        logger = TensorBoardLogger(save_dir=args.logs_dir,
                                   log_graph=True,
                                   name=args.project_name)
        checkpoint = cbs.ModelCheckpoint(
            dirpath=os.path.join(args.save_dir, args.project_name),
            filename=args.project_name + '-{epoch:03d}-{val_loss:.3f}',
            monitor='val_loss',
            save_top_k=args.checkpoint_max_history,
            save_weights_only=True)
        enable_checkpointing = True
        callbacks.append(checkpoint)
    else:
        logger = False
        checkpoint = None
        enable_checkpointing = False

    if args.swa:
        callbacks.append(cbs.StochasticWeightAveraging())

    dataloader = getattr(datasets, args.dataset)(args)
    model = Model(args, dataloader)

    if args.ngpu == 0:
        strategy = None
        sync_batchnorm = False
    elif args.ngpu > 1:
        strategy = 'ddp'
        sync_batchnorm = True
    else:
        strategy = 'dp'
        sync_batchnorm = False

    trainer = pl.Trainer(gpus=args.ngpu,
                         strategy=strategy,
                         sync_batchnorm=sync_batchnorm,
                         benchmark=True,
                         callbacks=callbacks,
                         enable_checkpointing=enable_checkpointing,
                         logger=logger,
                         min_epochs=1,
                         max_epochs=args.nepochs,
                         precision=args.precision)

    trainer.fit(model)
    trainer.predict(model, dataloaders=dataloader.test_dataloader())
Example #12
def test_monitor_val_epoch_end(tmpdir):
    epoch_min_loss_override = 0
    model = SimpleModule()
    checkpoint_callback = callbacks.ModelCheckpoint(dirpath=tmpdir, save_top_k=1, monitor="avg_val_loss")
    trainer = Trainer(
        max_epochs=epoch_min_loss_override + 2,
        logger=False,
        checkpoint_callback=checkpoint_callback,
    )
    trainer.fit(model)
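
For this test to save anything, SimpleModule must log the monitored avg_val_loss key. The module itself is not shown in the source; a minimal sketch of what it presumably does, assuming a BoringModel-style validation_step that returns {"x": loss}:

# Hedged sketch: log the epoch-level metric that the checkpoint monitors.
class SimpleModule(BoringModel):
    def validation_epoch_end(self, outputs):
        avg_val_loss = torch.stack([out["x"] for out in outputs]).mean()
        self.log("avg_val_loss", avg_val_loss)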
Example #13
def train(args):
    model = RNN()
    data_module = DataModule(args)

    callbacks_list = None
    if args.val_path:
        callbacks_list = []
        callbacks_list.append(callbacks.EarlyStopping(monitor='val_acc', patience=PATIENCE))
        callbacks_list.append(callbacks.ModelCheckpoint(filepath=args.out_path, monitor='val_acc', prefix='rnn'))

    gpus = N_GPU if torch.cuda.is_available() else None
    trainer = pl.Trainer(gpus=gpus, max_epochs=MAX_EPOCHS, callbacks=callbacks_list)

    trainer.fit(model, datamodule=data_module)
Example #14
File: train.py Project: cthorey/ludos
def get_callbacks(cfg, output_dir):
    cbacks = []
    checkpoint_path = os.path.join(output_dir, cfg.checkpoint.name)
    checkpoint = pl_callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                              save_last=False,
                                              monitor=cfg.checkpoint.monitor,
                                              mode=cfg.checkpoint.monitor_mode)
    cs = [
        pl.callbacks.EarlyStopping(monitor=cfg.checkpoint.monitor,
                                   mode=cfg.checkpoint.monitor_mode,
                                   **cfg.early_stopping),
        pl.callbacks.LearningRateMonitor(),
    ]
    return checkpoint, cs
Example #15
def get_callbacks(cfg, output_dir):
    cbacks = []
    checkpoint_path = os.path.join(output_dir, cfg.CHECKPOINT.NAME)
    checkpoint = pl_callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                              save_last=False,
                                              monitor=cfg.CHECKPOINT.MONITOR,
                                              mode=cfg.CHECKPOINT.MONITOR_MODE)
    cs = [
        pl_callbacks.EarlyStopping(monitor=cfg.CHECKPOINT.MONITOR,
                                   mode=cfg.CHECKPOINT.MONITOR_MODE,
                                   **cfg.EARLY_STOPPING),
        pl_callbacks.LearningRateLogger(),
        inspector.AnalysisCallback()
    ]
    return checkpoint, cs
Example #16
def load_callbacks():
    callbacks = []
    callbacks.append(
        plc.EarlyStopping(monitor='val_acc',
                          mode='max',
                          patience=10,
                          min_delta=0.001))

    callbacks.append(
        plc.ModelCheckpoint(monitor='val_acc',
                            filename='best-{epoch:02d}-{val_acc:.3f}',
                            save_top_k=1,
                            mode='max',
                            save_last=True))

    if args.lr_scheduler:
        callbacks.append(plc.LearningRateMonitor(logging_interval='epoch'))
    return callbacks
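
A hedged usage sketch for the factory above (the max_epochs value is illustrative, not from the source):

# The assembled list plugs straight into the Trainer.
trainer = Trainer(callbacks=load_callbacks(), max_epochs=100)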
Example #17
def get_loggers_callbacks(args, model=None):

    try:
        # Setup logger(s) params
        csv_logger_params = dict(
            save_dir="./experiments",
            name=os.path.join(*args.save_dir.split("/")[1:-1]),
            version=args.save_dir.split("/")[-1],
        )
        wandb_logger_params = dict(
            log_model=False,
            name=os.path.join(*args.save_dir.split("/")[1:]),
            offline=args.debug,
            project="utime",
            save_dir=args.save_dir,
        )
        loggers = [
            pl_loggers.CSVLogger(**csv_logger_params),
            pl_loggers.WandbLogger(**wandb_logger_params),
        ]
        if model:
            loggers[-1].watch(model)

        # Setup callback(s) params
        checkpoint_monitor_params = dict(
            filepath=os.path.join(args.save_dir,
                                  "{epoch:03d}-{eval_loss:.2f}"),
            monitor=args.checkpoint_monitor,
            save_last=True,
            save_top_k=1,
        )
        earlystopping_parameters = dict(
            monitor=args.earlystopping_monitor,
            patience=args.earlystopping_patience,
        )
        callbacks = [
            pl_callbacks.ModelCheckpoint(**checkpoint_monitor_params),
            pl_callbacks.EarlyStopping(**earlystopping_parameters),
            pl_callbacks.LearningRateMonitor(),
        ]

        return loggers, callbacks
    except AttributeError:
        return None, None
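
Trainer accepts a list of loggers as well as a list of callbacks, so both return values can be passed through directly. A hedged usage sketch (the surrounding script is not shown in the source):

loggers, cbs = get_loggers_callbacks(args, model)
trainer = pl.Trainer(logger=loggers, callbacks=cbs)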
Example #18
def get_checkpoint_callback(dirpath,
                            monitor='train/loss',
                            mode='min',
                            filename="{epoch}",
                            save_last=True,
                            save_top_k=2,
                            every_n_train_steps=None):
    ckpt_callback = plc.ModelCheckpoint(
        dirpath=dirpath,
        filename=filename,  # ckpt_name + "_{epoch}",
        monitor=monitor,
        save_last=save_last,
        save_top_k=save_top_k,
        mode=mode,
        every_n_train_steps=every_n_train_steps,
        # verbose=True,
    )
    ckpt_callback.CHECKPOINT_NAME_LAST = "{epoch}_last"
    return ckpt_callback
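
A hedged usage sketch for this factory, adding step-interval checkpointing on top of the monitored top-k files (the dirpath value is illustrative):

# Save every 1000 training steps as well as the best/last checkpoints.
ckpt_cb = get_checkpoint_callback(dirpath='checkpoints/run1',
                                  monitor='train/loss',
                                  every_n_train_steps=1000)
trainer = pl.Trainer(callbacks=[ckpt_cb])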
Example #19
def main():
    logger.remove()
    logger.add(sys.stdout,
               colorize=True,
               format="<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> " +
               "| <level>{level}</level> " +
               "| <light-black>{file.path}:{line}</light-black> | {message}")
    hparams = parse_args()
    if hparams.restore:
        wandb.init(project=hparams.project, tags=hparams.tags)
        model = LevelClassification.load_from_checkpoint(hparams.restore)
        logger.info("Restored model")
    else:
        # wandb.init is called in LevelClassification
        model = LevelClassification(hparams)
        experiment_logger = loggers.WandbLogger(project=hparams.project,
                                                tags=hparams.tags)
        hparams.checkpoint_dir = os.path.join(experiment_logger.experiment.dir,
                                              "checkpoints")
        checkpoint_cb = callbacks.ModelCheckpoint(hparams.checkpoint_dir,
                                                  save_top_k=1)
        trainer = pl.Trainer(logger=experiment_logger,
                             gpus=1 if hparams.device == "cuda" else 0,
                             checkpoint_callback=checkpoint_cb,
                             callbacks=[EmbeddingsCallback()],
                             early_stop_callback=callbacks.EarlyStopping(),
                             fast_dev_run=hparams.debug)
        trainer.fit(model)
    model.freeze()
    baseline_datasets = []
    logger.info("Baselines {}", os.listdir(hparams.baseline_level_dir))
    for i, baseline_level_dir in enumerate(
            sorted(os.listdir(hparams.baseline_level_dir))):
        baseline_dataset = LevelSnippetDataset(
            level_dir=os.path.join(os.getcwd(), hparams.baseline_level_dir,
                                   baseline_level_dir),
            slice_width=model.dataset.slice_width,
            token_list=model.dataset.token_list)
        baseline_datasets.append(baseline_dataset)
    visualize_embeddings(model.dataset, model, "test", hparams, None,
                         baseline_datasets)
Example #20
File: experiment.py Project: xvr-hlt/dpfk
def run(config):
    if isinstance(config, str):
        with open(config) as f:
            config = yaml.safe_load(f)

    now = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    run_dir = path.join("wandb", now)
    run_dir = path.abspath(run_dir)
    os.environ['WANDB_RUN_DIR'] = run_dir

    checkpoint_callback = callbacks.ModelCheckpoint(
        run_dir, monitor=config['early_stopping']['monitor'])
    early_stopping_callback = callbacks.EarlyStopping(
        **config['early_stopping'])

    experiment = Experiment(config)
    trainer = pl.Trainer(logger=False,
                         checkpoint_callback=checkpoint_callback,
                         early_stop_callback=early_stopping_callback,
                         **config['trainer'])
    trainer.fit(experiment)
Example #21
def main(cfg: DictConfig):
    model = pixellstm_pl(cfg)
    data = MNISTDataModule(cfg)
    logger = pl_loggers.TensorBoardLogger(save_dir=cfg.train.log_dir,
                                          version=cfg.train.version)
    checkpoint_callback = callbacks.ModelCheckpoint(
        monitor='val_loss',
        dirpath=cfg.train.checkpoint_dir,
        save_top_k=cfg.train.save_top_k)
    trainer = Trainer(
        accelerator=None if platform.system() == 'Windows' else 'ddp',
        accumulate_grad_batches=cfg.train.accumulate,
        auto_scale_batch_size=True,
        max_epochs=cfg.train.epochs,
        callbacks=[checkpoint_callback],
        default_root_dir=cfg.train.log_dir,
        fast_dev_run=cfg.train.fast_dev_run,
        gpus=cfg.train.gpus,
        logger=logger,
        terminate_on_nan=True,
        weights_save_path=cfg.train.checkpoint_dir,
        check_val_every_n_epoch=cfg.train.check_val_freq)
    trainer.fit(model, datamodule=data)
Example #22
def train_faster_rcnn(train_df, oof_df):
    train_dataset = rcnn.WheatDataset(train_df, transforms=get_train_transforms())
    train_dataloader = DataLoader(train_dataset, batch_size=Config.Train.batch_size,
                                  shuffle=True, num_workers=4, drop_last=True,
                                  collate_fn=collate_fn, pin_memory=True)
    oof_dataset = rcnn.WheatDataset(oof_df, test=True, transforms=get_valid_transforms())
    oof_dataloader = DataLoader(oof_dataset, batch_size=Config.Train.batch_size,
                                shuffle=False, num_workers=4,
                                collate_fn=collate_fn, pin_memory=True)
    # model = FasterRCNNResnet50FPN.load_from_checkpoint('checkpoints\\faster_rcnn\\epoch=9.ckpt', **Config)
    model = FasterRCNNResnet50FPN(conf=Config)
    early_stop = callbacks.EarlyStopping(monitor='val_loss',
                                         patience=20,
                                         mode='min',
                                         verbose=True)
    checkpoint = callbacks.ModelCheckpoint(str(Config.Train.checkpoint_dir),
                                           monitor='val_loss',
                                           verbose=True,
                                           save_top_k=1)
    cbs = [
        callbacks.LearningRateLogger()
    ]
    trainer = Trainer(gpus=1,
                      early_stop_callback=early_stop,
                      checkpoint_callback=checkpoint,
                      callbacks=cbs,
                      benchmark=True,
                      deterministic=True,
                      max_epochs=Config.Train.epochs)
    trainer.fit(model, train_dataloader=train_dataloader,
                val_dataloaders=oof_dataloader)

    valid_dataset = rcnn.WheatDataset(get_data(mode='valid'), test=True, transforms=get_test_transforms())
    valid_dataloader = DataLoader(valid_dataset, batch_size=Config.Train.batch_size,
                                  shuffle=False, num_workers=4,
                                  collate_fn=collate_fn, pin_memory=True)
    trainer.test(model, test_dataloaders=valid_dataloader)
Example #23
def main(cfg):
    model = e2vid(cfg)
    data = e2tensor_datamodule(cfg, hopath(cfg.dataset.dir))
    logger = pl_loggers.TensorBoardLogger(save_dir=cfg.train.log_dir,
                                          version=cfg.train.version)
    checkpoint_callback = callbacks.ModelCheckpoint(
        monitor="val_loss",
        dirpath=cfg.train.checkpoint_dir,
        save_top_k=cfg.train.save_top_k,
    )
    trainer = Trainer(
        accelerator=None if platform.system() == "Windows" else "ddp",
        auto_scale_batch_size=True,
        max_epochs=cfg.train.epochs,
        callbacks=[checkpoint_callback],
        default_root_dir=cfg.train.log_dir,
        fast_dev_run=True if cfg.runtype == "debug" else False,
        gpus=cfg.train.gpus,
        logger=logger,
        terminate_on_nan=True,
        weights_save_path=cfg.train.checkpoint_dir,
        check_val_every_n_epoch=cfg.train.check_val_freq,
    )
    trainer.fit(model, datamodule=data)
Example #24
def main():
    args = parse_args()

    if args.debug or not args.non_deterministic:
        np.random.seed(1)
        torch.manual_seed(1)
        torch.cuda.manual_seed(1)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

        # torch.set_deterministic(True) # grid_sampler_2d_backward_cuda does not have a deterministic implementation

    if args.debug:
        torch.autograd.set_detect_anomaly(True)

    dataloader_args = EasyDict(
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=0 if args.debug else args.data_workers)
    if args.dataset == 'mnist':
        args.num_classes = 10
        args.im_channels = 1
        args.image_size = (40, 40)

        from torchvision.datasets import MNIST

        t = transforms.Compose([
            transforms.RandomCrop(size=(40, 40), pad_if_needed=True),
            transforms.ToTensor(),
            # norm_1c
        ])
        train_dataloader = DataLoader(
            MNIST(data_path / 'mnist', train=True, transform=t, download=True),
            **dataloader_args)
        val_dataloader = DataLoader(
            MNIST(data_path / 'mnist', train=False, transform=t,
                  download=True), **dataloader_args)
    elif args.dataset == 'usps':
        args.num_classes = 10
        args.im_channels = 1
        args.image_size = (40, 40)

        from torchvision.datasets import USPS

        t = transforms.Compose([
            transforms.RandomCrop(size=(40, 40), pad_if_needed=True),
            transforms.ToTensor(),
            # norm_1c
        ])
        train_dataloader = DataLoader(
            USPS(data_path / 'usps', train=True, transform=t, download=True),
            **dataloader_args)
        val_dataloader = DataLoader(
            USPS(data_path / 'usps', train=False, transform=t, download=True),
            **dataloader_args)
    elif args.dataset == 'constellation':

        data_gen = create_constellation(
            batch_size=args.batch_size,
            shuffle_corners=True,
            gaussian_noise=.0,
            drop_prob=0.5,
            which_patterns=[[0], [1], [0]],
            rotation_percent=180 / 360.,
            max_scale=3.,
            min_scale=3.,
            use_scale_schedule=False,
            schedule_steps=0,
        )

        train_dataloader = DataLoader(data_gen, **dataloader_args)
        val_dataloader = DataLoader(data_gen, **dataloader_args)

    elif args.dataset == 'cifar10':
        args.num_classes = 10
        args.im_channels = 3
        args.image_size = (32, 32)

        from torchvision.datasets import CIFAR10

        t = transforms.Compose([transforms.ToTensor()])
        train_dataloader = DataLoader(
            CIFAR10(data_path / 'cifar10',
                    train=True,
                    transform=t,
                    download=True), **dataloader_args)
        val_dataloader = DataLoader(
            CIFAR10(data_path / 'cifar10',
                    train=False,
                    transform=t,
                    download=True), **dataloader_args)
    elif args.dataset == 'svhn':
        args.num_classes = 10
        args.im_channels = 3
        args.image_size = (32, 32)

        from torchvision.datasets import SVHN

        t = transforms.Compose([transforms.ToTensor()])
        train_dataloader = DataLoader(
            SVHN(data_path / 'svhn', split='train', transform=t,
                 download=True), **dataloader_args)
        val_dataloader = DataLoader(
            SVHN(data_path / 'svhn', split='test', transform=t, download=True),
            **dataloader_args)
    else:
        raise NotImplementedError()

    logger = WandbLogger(project=args.log.project,
                         name=args.log.run_name,
                         entity=args.log.team,
                         config=args,
                         offline=not args.log.upload)

    if args.model == 'ccae':
        from scae.modules.attention import SetTransformer
        from scae.modules.capsule import CapsuleLayer
        from scae.models.ccae import CCAE

        encoder = SetTransformer(2)
        decoder = CapsuleLayer(input_dims=32,
                               n_caps=3,
                               n_caps_dims=2,
                               n_votes=4,
                               n_caps_params=32,
                               n_hiddens=128,
                               learn_vote_scale=True,
                               deformations=True,
                               noise_type='uniform',
                               noise_scale=4.,
                               similarity_transform=False)

        model = CCAE(encoder, decoder, args)

        # logger.watch(encoder._encoder, log='all', log_freq=args.log_frequency)
        # logger.watch(decoder, log='all', log_freq=args.log_frequency)
    elif args.model == 'pcae':
        from scae.modules.part_capsule_ae import CapsuleImageEncoder, TemplateImageDecoder
        from scae.models.pcae import PCAE

        encoder = CapsuleImageEncoder(args)
        decoder = TemplateImageDecoder(args)
        model = PCAE(encoder, decoder, args)

        logger.watch(encoder._encoder, log='all', log_freq=args.log.frequency)
        logger.watch(decoder, log='all', log_freq=args.log.frequency)
    elif args.model == 'ocae':
        from scae.modules.object_capsule_ae import SetTransformer, ImageCapsule
        from scae.models.ocae import OCAE

        encoder = SetTransformer()
        decoder = ImageCapsule()
        model = OCAE(encoder, decoder, args)

        #  TODO: after ccae
    else:
        raise NotImplementedError()

    # Execute Experiment
    lr_logger = cb.LearningRateMonitor(logging_interval='step')
    best_checkpointer = cb.ModelCheckpoint(save_top_k=1,
                                           monitor='val_rec_ll',
                                           filepath=logger.experiment.dir)
    last_checkpointer = cb.ModelCheckpoint(save_last=True,
                                           filepath=logger.experiment.dir)
    trainer = pl.Trainer(
        max_epochs=args.num_epochs,
        logger=logger,
        callbacks=[lr_logger, best_checkpointer, last_checkpointer])
    trainer.fit(model, train_dataloader, val_dataloader)
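
cb.ModelCheckpoint(filepath=...) is the pre-1.2 signature; later releases split it into dirpath= and filename=. A hedged equivalent of the two checkpointers above:

# Hedged sketch: dirpath= replaces filepath= on newer Lightning.
best_checkpointer = cb.ModelCheckpoint(save_top_k=1,
                                       monitor='val_rec_ll',
                                       dirpath=logger.experiment.dir)
last_checkpointer = cb.ModelCheckpoint(save_last=True,
                                       dirpath=logger.experiment.dir)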
Example #25
def test_eval_logging_auto_reduce(tmpdir):
    """
    Tests that metrics logged in validation_step are auto-reduced at epoch end and exposed to callbacks
    """
    seed_everything(1234)

    os.environ['PL_DEV_DEBUG'] = '1'

    class TestModel(BoringModel):
        def on_pretrain_routine_end(self) -> None:
            self.seen_vals = []
            self.manual_epoch_end_mean = None

        def on_validation_epoch_start(self) -> None:
            self.seen_vals = []

        def validation_step(self, batch, batch_idx):
            output = self.layer(batch)
            loss = self.loss(batch, output)
            self.seen_vals.append(loss)
            self.log('val_loss',
                     loss,
                     on_epoch=True,
                     on_step=True,
                     prog_bar=True)
            return {"x": loss}

        def validation_epoch_end(self, outputs) -> None:
            for passed_in, manually_tracked in zip(outputs, self.seen_vals):
                assert passed_in['x'] == manually_tracked
            self.manual_epoch_end_mean = torch.stack([x['x'] for x in outputs
                                                      ]).mean()

    model = TestModel()

    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=3,
        limit_val_batches=3,
        max_epochs=1,
        log_every_n_steps=1,
        weights_summary=None,
        checkpoint_callback=callbacks.ModelCheckpoint(dirpath=tmpdir, monitor='val_loss'))
    trainer.fit(model)

    # make sure all the metrics are available for callbacks
    manual_mean = model.manual_epoch_end_mean
    callback_metrics = set(trainer.callback_metrics.keys())
    assert callback_metrics == {'debug_epoch', 'val_loss', 'val_loss_epoch'}

    # make sure values are correct
    assert trainer.logged_metrics['val_loss_epoch'] == manual_mean
    assert trainer.callback_metrics['val_loss'] == trainer.logged_metrics[
        'val_loss_step/epoch_0']

    # make sure correct values were logged
    logged_val = trainer.dev_debugger.logged_metrics

    # sanity check
    assert logged_val[0]['global_step'] == 0
    assert logged_val[1]['global_step'] == 0

    # 3 val batches
    assert logged_val[2]['val_loss_step/epoch_0'] == model.seen_vals[0]
    assert logged_val[3]['val_loss_step/epoch_0'] == model.seen_vals[1]
    assert logged_val[4]['val_loss_step/epoch_0'] == model.seen_vals[2]

    # epoch mean
    assert logged_val[5]['val_loss_epoch'] == model.manual_epoch_end_mean

    # only those logged
    assert len(logged_val) == 6
Example #26
                                collate_fn=_collate_fn)

        test_dataset = LanguageModelingDataset(datasets['test'])
        test_batch_sampler = BPTTBatchSampler(test_dataset, hparams.bptt,
                                              hparams.batch_size)
        test_data = DataLoader(test_dataset,
                               num_workers=8,
                               pin_memory=True,
                               batch_sampler=test_batch_sampler,
                               collate_fn=_collate_fn)

        early_stop_callback = callbacks.EarlyStopping(monitor='val_ppl',
                                                      mode='min')
        model_checkpoint_callback = callbacks.ModelCheckpoint(
            monitor='val_ppl',
            save_last=True,
            save_top_k=5,
            save_weights_only=False,
            mode='min')

        trainer = Trainer.from_argparse_args(
            hparams,
            default_root_dir=os.path.abspath(
                os.path.expanduser("~/data/awd-lstm")),
            callbacks=[
                early_stop_callback, model_checkpoint_callback,
                NNICallback()
            ])

        del hparams.tpu_cores
        model = LanguageModel(hparams)
Example #27
def test__training_step__log(tmpdir):
    """
    Tests that only training_step can be used
    """
    class TestModel(DeterministicModel):
        def training_step(self, batch, batch_idx):
            acc = self.step(batch, batch_idx)
            acc = acc + batch_idx

            # -----------
            # default
            # -----------
            self.log('default', acc)

            # -----------
            # logger
            # -----------
            # on_step T on_epoch F
            self.log('l_s',
                     acc,
                     on_step=True,
                     on_epoch=False,
                     prog_bar=False,
                     logger=True)

            # on_step F on_epoch T
            self.log('l_e',
                     acc,
                     on_step=False,
                     on_epoch=True,
                     prog_bar=False,
                     logger=True)

            # on_step T on_epoch T
            self.log('l_se',
                     acc,
                     on_step=True,
                     on_epoch=True,
                     prog_bar=False,
                     logger=True)

            # -----------
            # pbar
            # -----------
            # on_step T on_epoch F
            self.log('p_s',
                     acc,
                     on_step=True,
                     on_epoch=False,
                     prog_bar=True,
                     logger=False)

            # on_step F on_epoch T
            self.log('p_e',
                     acc,
                     on_step=False,
                     on_epoch=True,
                     prog_bar=True,
                     logger=False)

            # on_step T on_epoch T
            self.log('p_se',
                     acc,
                     on_step=True,
                     on_epoch=True,
                     prog_bar=True,
                     logger=False)

            self.training_step_called = True
            return acc

        def backward(self, loss, optimizer, optimizer_idx):
            return LightningModule.backward(self, loss, optimizer,
                                            optimizer_idx)

    model = TestModel()
    model.val_dataloader = None

    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=2,
        limit_val_batches=2,
        max_epochs=2,
        log_every_n_steps=1,
        weights_summary=None,
        checkpoint_callback=callbacks.ModelCheckpoint(monitor='l_se'))
    trainer.fit(model)

    # make sure correct steps were called
    assert model.training_step_called
    assert not model.training_step_end_called
    assert not model.training_epoch_end_called

    # make sure all the metrics are available for callbacks
    logged_metrics = set(trainer.logged_metrics.keys())
    expected_logged_metrics = {
        'epoch',
        'default',
        'l_e',
        'l_s',
        'l_se_step',
        'l_se_epoch',
    }
    assert logged_metrics == expected_logged_metrics

    pbar_metrics = set(trainer.progress_bar_metrics.keys())
    expected_pbar_metrics = {
        'p_e',
        'p_s',
        'p_se_step',
        'p_se_epoch',
    }
    assert pbar_metrics == expected_pbar_metrics

    callback_metrics = set(trainer.callback_metrics.keys())
    callback_metrics.remove('debug_epoch')
    expected_callback_metrics = set()
    expected_callback_metrics = expected_callback_metrics.union(logged_metrics)
    expected_callback_metrics = expected_callback_metrics.union(pbar_metrics)
    expected_callback_metrics.update({'p_se', 'l_se'})
    expected_callback_metrics.remove('epoch')
    assert callback_metrics == expected_callback_metrics
Example #28
        solver = FewshotSolver(classifier)

    if classifier_name.startswith("reg"):
        solver.weight_decay = 1e-4
    else:
        solver.weight_decay = 0

    tb_logger = pllog.TensorBoardLogger("logs_byte/" + classifier_name)
    trainer = pl.Trainer(
        logger=tb_logger,
        gpus=1,
        max_epochs=10,
        log_every_n_steps=1,
        precision=32,
        check_val_every_n_epoch=1,
        auto_lr_find=True,
        callbacks=[
            FewshotDatasetReplacement(datasets, every_batch=20),
            plcb.ModelCheckpoint()
        ])

    trainer.tune(solver, train_dataloader=datasets.train_dataloader())
    trainer.fit(
        solver,
        train_dataloader=datasets.train_dataloader(),
        val_dataloaders=[
            datasets.val_dataloader(seen=False, unseen=True),
            datasets.val_dataloader(seen=True, unseen=False),
        ]
    )
Example #29
File: trainer.py Project: iimuz/til
def train(config: Config):
    """学習処理の実行スクリプト."""
    pl.seed_everything(config.random_seed)

    # Settings for cases such as resuming training from a checkpoint
    cache_dir = pathlib.Path(config.cache_dir)
    cache_dir.mkdir(exist_ok=True)
    trainer_params = dict()
    lastckpt = cache_dir.joinpath("last.ckpt")
    if config.resume:
        trainer_params["resume_from_checkpoint"] = str(lastckpt)
    elif lastckpt.exists():
        lastckpt.unlink()
    for filepath in cache_dir.glob("epoch*.ckpt"):
        filepath.unlink()

    # Logging setup
    pl_logger = pl_loggers.MLFlowLogger(
        experiment_name=config.experiment_name,
        tracking_uri=os.environ.get("MLFLOW_TRACKING_URI", None),
        tags={
            "mlflow.source.name": pathlib.Path(__file__).name,
            "mlflow.source.git.commit": ut.get_commit_id(),
        },
    )

    # Build the network and dataset, then train
    network = tv_models.vgg16(pretrained=False)
    params = dc.asdict(config)
    model = Trainer(network, **params)

    callbacks: t.List[t.Any] = list()
    model_checkpoint = pl_callbacks.ModelCheckpoint(
        filepath=str(cache_dir),
        monitor="val_loss",
        save_last=True,
        save_top_k=config.save_top_k,
        save_weights_only=config.save_weights_only,
        mode="min",
        period=1,
    )
    callbacks.append(model_checkpoint)
    if config.early_stop:
        callbacks.append(
            pl_callbacks.EarlyStopping(
                monitor="val_loss",
                min_delta=0.0,
                patience=3,
                verbose=False,
                mode="auto",
            ))

    pl_trainer = pl.Trainer(
        default_root_dir=str(cache_dir),
        fast_dev_run=False,
        min_epochs=config.min_epochs,
        max_epochs=config.max_epochs,
        gpus=[0] if config.use_gpu and cuda.is_available() else None,
        progress_bar_refresh_rate=config.progress_bar_refresh_rate,
        profiler=config.profiler,
        callbacks=callbacks,
        logger=pl_logger,
        log_gpu_memory=True,
        **trainer_params,
    )
    datamodule = dataset_food101.Food101WithLableModule(
        batch_size=config.batch_size,
        num_workers=config.num_workers,
    )
    pl_trainer.fit(model, datamodule)

    # Attach additional information to the logged run
    mlf_client = mlflow.tracking.MlflowClient()
    for ckptfile in cache_dir.glob("epoch*.ckpt"):
        model = model.load_from_checkpoint(str(ckptfile), network, **params)
        with tempfile.TemporaryDirectory() as dname:
            mlf_model_path = pathlib.Path(dname).joinpath(ckptfile.stem)
            mlf_pytorch.save_model(model.network, mlf_model_path)
            mlf_client.log_artifact(pl_logger.run_id, mlf_model_path)
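
This snippet likewise uses the pre-1.2 filepath= and period= arguments; on newer Lightning the same checkpoint policy would be spelled with dirpath= and every_n_epochs=. A hedged sketch reusing the variables above:

model_checkpoint = pl_callbacks.ModelCheckpoint(
    dirpath=str(cache_dir),
    monitor="val_loss",
    save_last=True,
    save_top_k=config.save_top_k,
    save_weights_only=config.save_weights_only,
    mode="min",
    every_n_epochs=1,
)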
Example #30
from pytorch_lightning import Trainer, callbacks
from src.main.config import cfg_ds_v1, cfg_train
from src.main.feeder import NtuFeeder
from src.main.model import KhoiddNet
from torch.utils.data import DataLoader

if __name__ == "__main__":
    trainer: Trainer = Trainer(
        gpus=-1,  # -1: train on all gpus
        use_amp=True,
        max_epochs=200,
        # callback
        checkpoint_callback=callbacks.ModelCheckpoint(
            filepath=cfg_train.output_train + "/model",
            save_best_only=True,
            monitor="val_loss",
            mode="min",
        ),
        # only use when debug
        fast_dev_run=False,
        show_progress_bar=True,
        train_percent_check=1.0,  # percent of train data
        val_percent_check=1.0,  # percent of val data
        check_val_every_n_epoch=1,  # epoch per val
        val_check_interval=1.0,  # val per epoch
    )
    trainer.fit(
        model=KhoiddNet(),
        train_dataloader=DataLoader(
            dataset=NtuFeeder(
                path_data=cfg_ds_v1.path_data_preprocess +