Example No. 1
def train_mnist_tune_checkpoint(config,
                                checkpoint_dir=None,
                                num_epochs=10,
                                num_gpus=0,
                                data_dir="~/data"):
    data_dir = os.path.expanduser(data_dir)
    kwargs = {
        "max_epochs": num_epochs,
        # If fractional GPUs passed in, convert to int.
        "gpus": math.ceil(num_gpus),
        "logger": TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        "progress_bar_refresh_rate": 0,
        "callbacks": [
            TuneReportCheckpointCallback(
                metrics={
                    "loss": "ptl/val_loss",
                    "mean_accuracy": "ptl/val_accuracy"
                },
                filename="checkpoint",
                on="validation_end")
        ]
    }

    if checkpoint_dir:
        kwargs["resume_from_checkpoint"] = os.path.join(
            checkpoint_dir, "checkpoint")

    model = LightningMNISTClassifier(config=config, data_dir=data_dir)
    trainer = pl.Trainer(**kwargs)

    trainer.fit(model)
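
For reference, a trainable like this is usually handed to Ray Tune via tune.with_parameters and tune.run. The sketch below shows one possible launch; the search-space keys, resource numbers, and sample count are illustrative assumptions, not taken from the example above.

# Launch sketch (assumed search space and resources; adjust to your model).
config = {
    "layer_1_size": tune.choice([32, 64, 128]),
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([32, 64, 128]),
}
analysis = tune.run(
    tune.with_parameters(
        train_mnist_tune_checkpoint,
        num_epochs=10,
        num_gpus=0,
        data_dir="~/data"),
    resources_per_trial={"cpu": 1, "gpu": 0},
    metric="loss",
    mode="min",
    config=config,
    num_samples=10,
    name="tune_mnist")
print(analysis.best_config)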
Example No. 2
def train_mnist_tune_checkpoint(config,
                                checkpoint_dir=None,
                                data_dir=None,
                                num_epochs=10,
                                num_gpus=0):
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCheckpointCallback(
                metrics={
                    "loss": "ptl/val_loss",
                    "mean_accuracy": "ptl/val_accuracy"
                },
                filename="checkpoint",
                on="validation_end")
        ])
    if checkpoint_dir:
        # Currently, this leads to errors:
        # model = LightningMNISTClassifier.load_from_checkpoint(
        #     os.path.join(checkpoint, "checkpoint"))
        # Workaround:
        ckpt = pl_load(
            os.path.join(checkpoint_dir, "checkpoint"),
            map_location=lambda storage, loc: storage)
        model = LightningMNISTClassifier._load_model_state(ckpt, config=config, data_dir=data_dir)
        trainer.current_epoch = ckpt["epoch"]
    else:
        model = LightningMNISTClassifier(config=config, data_dir=data_dir)

    trainer.fit(model)
Example No. 3
    def train_tune(config, epochs, resources, checkpoint_dir=None):
        # viz logger
        logger = TensorBoardLogger(save_dir=tune.get_trial_dir(),
                                   name=model_name)

        # metric reporter + checkpoint callback
        callback = TuneReportCheckpointCallback(
            metrics=pbt_config['metrics_to_report'])

        # search trainer object
        trainer = pl.Trainer(
            max_epochs=epochs,
            gpus=resources['gpu'],
            logger=logger,
            callbacks=[callback],
            progress_bar_refresh_rate=50,
            precision=16,
        )

        # checkpointing system
        if checkpoint_dir:
            model = network.load_from_checkpoint(
                os.path.join(checkpoint_dir, 'checkpoint'))
        else:
            model = network(config)

        # fits model/data module with current hyperparameter set
        data_module = dm(config)
        trainer.fit(model, datamodule=data_module)
Example No. 4
def train_mult(config, checkpoint_dir=None):
    hyp_params.attn_dropout = config["attn_dropout"]
    hyp_params.attn_dropout_a = config["attn_dropout_a"]
    hyp_params.attn_dropout_v = config["attn_dropout_v"]
    hyp_params.embed_dropout = config["embed_dropout"]
    hyp_params.out_dropout = config["out_dropout"]
    hyp_params.relu_dropout = config["relu_dropout"]
    hyp_params.res_dropout = config["res_dropout"]

    # hyp_params.layers = int(config["layers"])
    # hyp_params.num_heads = int(config["num_heads"])
    # hyp_params.project_dim = int(config["num_heads"]) * int(config["head_dim"])
    hyp_params.lr = config["lr"]
    hyp_params.weight_decay = config["weight_decay"]

    comet_logger = CometLogger(
        api_key="cgss7piePhyFPXRw1J2uUEjkQ",
        workspace="transformer",
        project_name=hyp_params.project_name,
        save_dir="logs/comet_ml",
    )
    experiment_key = comet_logger.experiment.get_key()
    csv_logger = CSVLogger("logs/csv", name=experiment_key)
    early_stopping = EarlyStopping(
        monitor="valid_1mae", patience=10, verbose=True, mode="max"
    )
    checkpoint = ModelCheckpoint(save_top_k=1, monitor="valid_1mae", mode="max")
    # tune_reporter = TuneReportCallback(["valid_loss", "valid_1mae"])
    tune_checkpoint_reporter = TuneReportCheckpointCallback(
        metrics=["valid_loss", "valid_1mae"]
    )

    model = MULTModelWarpedAll(hyp_params, early_stopping=early_stopping)
    trainer = pl.Trainer(
        gpus=1,
        max_epochs=hyp_params.num_epochs,
        log_every_n_steps=1,
        callbacks=[early_stopping, checkpoint, tune_checkpoint_reporter],
        logger=[csv_logger, comet_logger],
        limit_train_batches=hyp_params.limit,
        limit_val_batches=hyp_params.limit,
        weights_summary="full",
        weights_save_path="logs/weights",
        progress_bar_refresh_rate=0,
    )

    if checkpoint_dir is not None:
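        # Manually restore the weights and epoch counter from the checkpoint
        # written by TuneReportCheckpointCallback into the trial's checkpoint dir.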
        ck = th.load(os.path.join(checkpoint_dir, "checkpoint"))
        model.load_state_dict(ck["state_dict"])
        trainer.current_epoch = ck["epoch"]

    trainer.fit(model)

    ck = th.load(checkpoint.best_model_path)
    model.load_state_dict(ck["state_dict"])

    trainer.test(model)
Example No. 5
def train(config):
    module = _MockModule(10, 20)
    trainer = pl.Trainer(max_epochs=1,
                         callbacks=[
                             TuneReportCheckpointCallback(
                                 ["avg_val_loss"],
                                 "trainer.ckpt",
                                 on="validation_end")
                         ])
    trainer.fit(module)
Example No. 6
def clip_fine_tune(
    config,
    num_epochs,
    num_gpus,
    dataset: pa.Table,
    init_config: CLIPConfig,
    init_state_dict: dict,
    processor: CLIPProcessor,
):
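    # Presumably removes SLURM variables so PyTorch Lightning does not
    # auto-detect a SLURM cluster environment inside the Ray worker process.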
    if "SLURM_NTASKS" in os.environ:
        del os.environ["SLURM_NTASKS"]

    if "SLURM_JOB_NAME" in os.environ:
        del os.environ["SLURM_JOB_NAME"]

    bird_dataset = dataset
    data_mod = MultiModalDataModule(
        dataset=bird_dataset,
        processor=processor,
        test_size=config["test_size"],
        batch_size=config["batch_size"],
        val_batch_size=config["val_batch_size"],
        num_workers=config["num_workers"],
    )

    clip_model = CLIPModel(init_config)
    clip_model.load_state_dict(init_state_dict)
    model = CLIPFineTunedModel(clip_model, **config)

    tune_cbs = [
        TuneReportCheckpointCallback(["val_loss"], on="validation_end")
    ]
    logger = TensorBoardLogger(save_dir=tune.get_trial_dir(),
                               name="",
                               version=".")

    trainer = pl.Trainer(
        logger=logger,
        num_sanity_val_steps=0,
        max_epochs=num_epochs,
        gpus=math.ceil(num_gpus),
        progress_bar_refresh_rate=0,
        log_every_n_steps=1,
        callbacks=[LearningRateMonitor(logging_interval="step")] + tune_cbs,
    )

    trainer.validate(model, data_mod)
    trainer.fit(model, data_mod)
    return trainer
Example No. 7
def trainWithTune(config,
                  checkpoint_dir=None,
                  datamodule=None,
                  num_epochs=10,
                  num_gpus=0):
    trainer = Trainer(
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        logger=TensorBoardLogger(save_dir=tune.get_trial_dir(),
                                 name="",
                                 version="."),
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCheckpointCallback(metrics={
                "loss": "val_loss",
                "mean_accuracy": "val_acc",
                "mean_iou": "val_iou",
            },
                                         filename="checkpoint",
                                         on="validation_end")
        ])

    if checkpoint_dir:
        # Currently, this leads to errors:
        # model = LightningMNISTClassifier.load_from_checkpoint(
        #     os.path.join(checkpoint, "checkpoint"))
        # Workaround:
        ckpt = pl_load(os.path.join(checkpoint_dir, "checkpoint"),
                       map_location=lambda storage, loc: storage)
        model = MMETrainingModule._load_model_state(
            ckpt,
            lr=10**config['log_lr'],
            lrRatio=10**config['log_lrRatio'],
            decay=10**config['log_decay'],
            num_cls=NUM_CLS)
        trainer.current_epoch = ckpt["epoch"]
    else:
        model = MMETrainingModule(lr=10**config['log_lr'],
                                  lrRatio=10**config['log_lrRatio'],
                                  decay=10**config['log_decay'],
                                  num_cls=NUM_CLS)

    trainer.fit(model, datamodule=datamodule)
Example No. 8
def train(config, batch_size, num_epochs=20, num_gpus=0):
    training = dl.loader(55000, batch_size, 0)
    validation = dl.loader(8250, 1, 55000)
    cae = ContractiveAutoEncoder(training_dataloader=training,
                                 val_dataloader=validation,
                                 config=config)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        auto_select_gpus=bool(num_gpus),
        logger=TensorBoardLogger(save_dir=tune.get_trial_dir(),
                                 name="",
                                 version='.'),
        stochastic_weight_avg=True,
        benchmark=True,
        callbacks=[
            TuneReportCheckpointCallback({"loss": "val_loss"},
                                         filename="checkpoint",
                                         on="validation_end")
        ])

    trainer.fit(cae)
Example No. 9
def trainable(config, train_loader, val_loader, test_loader):

    input_size = 28
    ae_arch = architecture.get_ae_architecture(input_size=input_size,
                                               latent_dim=128)

    # model = ConvAutoencoder(**{**ae_arch, 'verbose': True})
    model = ConvAutoencoder(
        **{
            **ae_arch, 'optimizer_name': config['optimizer_name'],
            'lr': config['lr']
        })
    model.logdir = 'ConvAutoencoder'

    model.set_latent(input_size)
    # print('model latent dim:', model.latent_size)

    config_str = json.dumps({
        **config, 'channels': ae_arch['encoder_channels'],
        'stride': ae_arch['encoder_stride'],
        'kernel_size': ae_arch['encoder_kernel_size'],
        'latent_dim': model.latent_size
    })

    # SET UP LOGGER
    section_name = 'ConvAutoencoder'
    save_dir = f'{os.path.expanduser("~")}/ai-core/Embedder/runs/{section_name}/'
    # save_dir =f'{os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")}/runs/{section_name}/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    # print(save_dir)
    # print(__name__)
    # print(__file__)
    experiment_name = f'ConvAutoencoder-{config_str}-{time()}'
    model.experiment_name = experiment_name
    logger = pl.loggers.TensorBoardLogger(
        save_dir=save_dir,
        name=experiment_name,
        default_hp_metric=False,
    )

    # CREATE CHECKPOINTS DIR
    checkpoint_dir = f'checkpoints/{experiment_name}'
    os.makedirs(checkpoint_dir)

    # RUN TRAINER
    trainer = pl.Trainer(
        logger=logger,
        log_every_n_steps=1,
        max_epochs=10,
        val_check_interval=0.05,  # for dev
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCallback(metrics={
                "loss": "val_loss",
            },
                               on="validation_end"),
            TuneReportCheckpointCallback(
                metrics={"loss": "val_loss"},
                filename=
                f"{checkpoint_dir}/latest_checkpoint.ckpt",  # TODO edit callback so that it saves history of checkpoints and make PR to ray[tune]
                on="validation_end"),
            SampleReconstructionCallback(loader=val_loader)
        ])
    trainer.fit(model,
                train_dataloader=train_loader,
                val_dataloaders=val_loader)
    test_result = trainer.test(model=model,
                               test_dataloaders=test_loader,
                               verbose=True)
Example No. 10
    # CREATE CHECKPOINTS DIR
    checkpoint_dir = f'checkpoints/{experiment_name}'
    os.makedirs(checkpoint_dir)

    # RUN TRAINER
    trainer = pl.Trainer(
        logger=logger,
        log_every_n_steps=1,
        max_epochs=10,
        val_check_interval=0.05,  # for dev
        progress_bar_refresh_rate=1,
        callbacks=[
            TuneReportCallback(metrics={
                "loss": "val_loss",
            },
                               on="validation_end"),
            TuneReportCheckpointCallback(
                metrics={"loss": "val_loss"},
                filename=
                f"{checkpoint_dir}/latest_checkpoint.ckpt",  # TODO edit callback so that it saves history of checkpoints and make PR to ray[tune]
                on="validation_end"),
            SampleReconstructionCallback(loader=val_loader)
        ])
    trainer.fit(model,
                train_dataloader=train_loader,
                val_dataloaders=val_loader)
    test_result = trainer.test(model=model,
                               test_dataloaders=test_loader,
                               verbose=True)
Example No. 11
def _tune(tune_param_config,
          vl_bert_config=None,
          pl_ckpt_path=None,
          checkpoint_dir=None,
          num_gpus=1):
    pickle.DEFAULT_PROTOCOL = 4

    with logger.catch(reraise=True):
        config = copy.deepcopy(vl_bert_config)

        # config.TRAIN.LR = lr
        # config.TRAIN.WD = weight_decay

        # config.TRAIN.BATCH_IMAGES = batch_size
        # config.TRAIN.END_EPOCH = max_epoch

        # config.TRAIN.WARMUP_FACTOR = warmup_factor
        # config.TRAIN.WARMUP_STEPS = warmup_steps
        logger.warning(os.path.abspath('.'))

        checkpoint = ModelCheckpoint(
            filepath=pl_ckpt_path,
            save_last=False,
            save_top_k=3,
            monitor='val_accuracy',
        )
        tune_report = TuneReportCheckpointCallback(
            {
                # "loss": "val_checkpoint_on",
                "mean_accuracy": "val_checkpoint_on"
            },
            on="validation_end")
        adhoc_logger = TensorBoardLogger(save_dir=tune.get_trial_dir(),
                                         name="",
                                         version=".")

        trainer = pl.Trainer(
            # limit_train_batches=0.1,
            # limit_val_batches=0.1,
            accumulate_grad_batches=config.TRAIN.GRAD_ACCUMULATE_STEPS,
            checkpoint_callback=None,
            callbacks=[tune_report],
            logger=adhoc_logger,
            default_root_dir=pl_ckpt_path,
            gpus=num_gpus,
            num_nodes=1,
            distributed_backend='dp',
            precision=16,
            max_epochs=config.TRAIN.END_EPOCH,
            resume_from_checkpoint=None,
        )

        # vl_bert = LitVLBERT(config)
        hateful_meme = LitHatefulMeme(config)

        if checkpoint_dir:
            # Currently, this leads to errors:
            # model = LightningMNISTClassifier.load_from_checkpoint(
            #     os.path.join(checkpoint, "checkpoint"))
            # Workaround:
            ckpt = pl_load(os.path.join(checkpoint_dir, "checkpoint"),
                           map_location=lambda storage, loc: storage)
            vl_bert = LitVLBERT._load_model_state(ckpt, config)
            trainer.current_epoch = ckpt["epoch"]
        else:
            logger.info(config)
            vl_bert = LitVLBERT(config)

        trainer.fit(vl_bert, datamodule=hateful_meme)
Example No. 12
    def run(self, args: AttributeDict):
        """Run hyperparameter search using the `tune.schedulers.ASHAScheduler`

        Args:
            args (AttributeDict): Arguments

        Side-effects:
            Saves logs to `TUNE_LOGS_PATH / args.id`
        """
        try:
            from ray import tune
            from ray.tune.integration.pytorch_lightning import (
                TuneReportCheckpointCallback,
            )
        except ModuleNotFoundError as e:  # pragma: no cover
            logger.error(
                "To use hyperparameter search, first install Ray Tune via `pip install 'ray[tune]'` or `pip install 'ride[extras]'`"
            )
            raise e

        if not hasattr(args, "id"):
            args.id = "hparamsearch"

        module_config = (
            Configs.from_file(args.from_hparam_space_file)
            if args.from_hparam_space_file
            else self.Module.configs()
        ).tune_config()

        config = {
            **dict(args),
            **module_config,
            # pl.Trainer args:
            "gpus": args.gpus_per_trial,
            "logger": False,
            "accumulate_grad_batches": (
                (8 // args.gpus_per_trial) * args.accumulate_grad_batches
                if args.gpus_per_trial
                else args.accumulate_grad_batches
            ),
        }
        scheduler = tune.schedulers.ASHAScheduler(
            metric=f"val/{args.optimization_metric}",
            mode=self.Module.metrics()[args.optimization_metric].value,
            max_t=args.max_epochs,
            grace_period=1,
            reduction_factor=2,
        )

        metric_names = [f"val/{m}" for m in self.Module.metrics().keys()]

        reporter = tune.CLIReporter(
            metric_columns=[*metric_names, "training_iteration"],
        )
        tune_callbacks = [
            TuneReportCheckpointCallback(
                metrics=metric_names,
                filename="checkpoint",
                on="validation_end",
            )
        ]
        cpus_per_trial = max(
            1,
            (
                min(10 * args.gpus_per_trial, NUM_CPU - 10)
                if args.gpus_per_trial
                else min(10, NUM_CPU - 2)
            ),
        )

        analysis = tune.run(
            partial(
                Runner.static_train_and_val,
                self.Module,
                trainer_callbacks=tune_callbacks,
            ),
            name=args.id,
            local_dir=str(TUNE_LOGS_PATH),
            resources_per_trial={"cpu": cpus_per_trial, "gpu": args.gpus_per_trial},
            config=config,
            num_samples=args.trials,
            scheduler=scheduler,
            progress_reporter=reporter,
            raise_on_failed_trial=False,
        )

        best_hparams = analysis.get_best_config(
            metric=f"val/{args.optimization_metric}",
            mode=self.Module.metrics()[args.optimization_metric].value,
            scope="all",
        )
        # Select only model parameters
        if best_hparams:
            best_hparams = {
                k: best_hparams[k]
                for k in [
                    *self.Module.configs().names,
                    # Trainer parameters that influence model hparams:
                    "accumulate_grad_batches",
                    "batch_size",
                    "gpus",
                ]
            }
        return best_hparams