Example #1
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from __init__ import *

"""
Call all scripts from __init__, define and initialize the model, set up and activate
TensorBoard logging, and run the training, validation and testing loop.
Early stopping is fixed at 10 rounds of no improvement of at least 0.001 in validation accuracy.
"""

parser = get_parser()
hparams = parser.parse_args()

pl.seed_everything(hparams.seed)

# Define Early Stopping condition
early_stop_callback = EarlyStopping(monitor='val_acc',
                                    min_delta=0.001,
                                    patience=10,
                                    verbose=False,
                                    mode='max')

# Define Model
if hparams.classifier_type == "autoregressive":
    model = Protein_GRU_Sequencer_Autoregressive()
elif hparams.encoder_type == "gru":
    model = Protein_GRU_Sequencer_CNN()
elif hparams.encoder_type == "lstm":
    model = Protein_LSTM_Sequencer_CNN()
else:
    raise Exception('Unknown encoder type: ' + hparams.encoder_type)

# Set Logging
if hparams.logger:
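    # Completion sketch: the original snippet is cut off at this point. The lines below
    # are illustrative only, assuming get_parser() also defines --gpus and --max_epochs.
    from pytorch_lightning.loggers import TensorBoardLogger
    logger = TensorBoardLogger(save_dir='runs', name=hparams.classifier_type)
else:
    logger = False

# Run the training, validation and testing loop described in the docstring above.
trainer = pl.Trainer(gpus=hparams.gpus,
                     max_epochs=hparams.max_epochs,
                     logger=logger,
                     callbacks=[early_stop_callback])
trainer.fit(model)
trainer.test(model)
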
Example #2

def cli_main():

    parser = ArgumentParser()
    parser.add_argument("--DATA_PATH",
                        type=str,
                        help="path to folders with images")
    parser.add_argument("--MODEL_PATH",
                        default=None,
                        type=str,
                        help="path to model checkpoint")
    parser.add_argument("--batch_size",
                        default=128,
                        type=int,
                        help="batch size for SSL")
    parser.add_argument("--image_size",
                        default=256,
                        type=int,
                        help="image size for SSL")
    parser.add_argument(
        "--image_type",
        default="tif",
        type=str,
        help=
        "extension of image for PIL to open and parse - i.e. jpeg, gif, tif, etc. Only put the extension name, not the dot (.)"
    )
    parser.add_argument("--num_workers",
                        default=1,
                        type=int,
                        help="number of CPU cores to use for data processing")
    parser.add_argument("--image_embedding_size",
                        default=128,
                        type=int,
                        help="size of image representation of SIMCLR")
    parser.add_argument("--epochs",
                        default=200,
                        type=int,
                        help="number of epochs to train model")
    parser.add_argument("--lr",
                        default=1e-3,
                        type=float,
                        help="learning rate for training model")
    parser.add_argument(
        "--patience",
        default=-1,
        type=int,
        help=
        "automatically cuts off training if validation does not drop for (patience) epochs. Leave blank to have no validation based early stopping."
    )
    parser.add_argument("--val_split",
                        default=0.2,
                        type=float,
                        help="percent in validation data")
    parser.add_argument(
        "--pretrain_encoder",
        default=False,
        type=bool,
        help=
        "initialize resnet encoder with pretrained imagenet weights. Cannot be true if passing previous SSL model checkpoint."
    )
    parser.add_argument(
        "--withold_train_percent",
        default=0,
        type=float,
        help=
        "decimal from 0-1 representing how much of the training data to withold during SSL training"
    )
    parser.add_argument("--version",
                        default="0",
                        type=str,
                        help="version to name checkpoint for saving")
    parser.add_argument("--gpus",
                        default=1,
                        type=int,
                        help="number of gpus to use for training")

    args = parser.parse_args()
    URL = args.DATA_PATH
    batch_size = args.batch_size
    image_size = args.image_size
    image_type = args.image_type
    num_workers = args.num_workers
    embedding_size = args.image_embedding_size
    epochs = args.epochs
    lr = args.lr
    patience = args.patience
    val_split = args.val_split
    pretrain = args.pretrain_encoder
    withold_train_percent = args.withold_train_percent
    version = args.version
    model_checkpoint = args.MODEL_PATH
    gpus = args.gpus

    # #testing
    # batch_size = 128
    # image_type = 'tif'
    # image_size = 256
    # num_workers = 4
    # URL ='/content/UCMerced_LandUse/Images'
    # embedding_size = 128
    # epochs = 2
    # lr = 1e-3
    # patience = 1
    # val_split = 0.2
    # pretrain = False
    # withold_train_percent = 0.2
    # version = "1"
    # model_checkpoint = '/content/models/SSL/SIMCLR_SSL_0.pt'
    # gpus = 1

    # get datasets; we can't combine them, since the validation data needs a different transform
    train_dataset = FolderDataset(
        URL,
        validation=False,
        val_split=val_split,
        withold_train_percent=withold_train_percent,
        transform=SimCLRTrainDataTransform(image_size),
        image_type=image_type)

    data_loader = torch.utils.data.DataLoader(train_dataset,
                                              batch_size=batch_size,
                                              num_workers=num_workers,
                                              drop_last=True)

    print('Training Data Loaded...')
    val_dataset = FolderDataset(URL,
                                validation=True,
                                val_split=val_split,
                                transform=SimCLREvalDataTransform(image_size),
                                image_type=image_type)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             num_workers=num_workers,
                                             drop_last=True)
    print('Validation Data Loaded...')

    num_samples = len(train_dataset)

    #init model with batch size, num_samples (len of data), epochs to train, and autofinds learning rate
    model = SimCLR(arch='resnet18',
                   batch_size=batch_size,
                   num_samples=num_samples,
                   gpus=gpus,
                   dataset='None',
                   max_epochs=epochs,
                   learning_rate=lr)  #

    model.encoder = resnet18(pretrained=pretrain,
                             first_conv=model.first_conv,
                             maxpool1=model.maxpool1,
                             return_all_feature_maps=False)
    model.projection = Projection(input_dim=512,
                                  hidden_dim=256,
                                  output_dim=embedding_size)  #overrides

    if patience > 0:
        cb = EarlyStopping('val_loss', patience=patience)
        trainer = Trainer(gpus=gpus,
                          max_epochs=epochs,
                          callbacks=[cb],
                          progress_bar_refresh_rate=5)
    else:
        trainer = Trainer(gpus=gpus,
                          max_epochs=epochs,
                          progress_bar_refresh_rate=5)

    if model_checkpoint is not None:
        model.load_state_dict(torch.load(model_checkpoint))
        print(
            'Successfully loaded your checkpoint. Keep in mind that this does not preserve the previous trainer states, only the model weights'
        )

    model.cuda()

    print('Model Initialized')
    trainer.fit(model, data_loader, val_loader)

    Path(f"./models/SSL/SIMCLR_SSL_{version}").mkdir(parents=True,
                                                     exist_ok=True)
    torch.save(model.state_dict(),
               f"./models/SSL/SIMCLR_SSL_{version}/SIMCLR_SSL_{version}.pt")
Example #3

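The snippet below applies an init_weights helper that is not shown in the example. A minimal
sketch of such a helper, assuming plain Xavier initialization of Linear layers (this definition
is illustrative, not part of the original source):

import torch

def init_weights(m):
    # Xavier-initialize Linear layers and zero their biases; leave other modules untouched.
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)
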
model = model.apply(init_weights)

tb_save_dir = os.path.join(os.getcwd(), 'runs')
cp_save_dir = os.path.join(os.getcwd(), "CKP", model_file_name)

logger = TensorBoardLogger(save_dir=tb_save_dir, name=model_file_name)

checkpoint_callback = ModelCheckpoint(filepath=cp_save_dir,
                                      save_top_k=1,
                                      verbose=True,
                                      monitor='loss_val',
                                      mode='min')

early_stop_callback = EarlyStopping(monitor='loss_val', verbose=True, mode='min')

trainer = pl.Trainer(gpus=1,
                     max_epochs=hparams["max_epochs"],
                     weights_summary=None,
                     logger=logger,
                     checkpoint_callback=checkpoint_callback,
                     callbacks=[early_stop_callback])

trainer.fit(model, train_dataloader, val_dataloader)

print("Best Model Path", checkpoint_callback.best_model_path)
best_model_path = checkpoint_callback.best_model_path

print(trainer.test(model, test_dataloaders=test_dataloader))
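
# best_model_path captured above is otherwise unused; to evaluate the best rather than the
# last weights, one could reload it before testing. Sketch only - it assumes the module's
# class supports the standard LightningModule.load_from_checkpoint without extra arguments.
best_model = type(model).load_from_checkpoint(best_model_path)
print(trainer.test(best_model, test_dataloaders=test_dataloader))
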
Example #4

def setup_trainer(args):
    # init model
    set_seed(args)

    # Setup distant debugging if needed

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))

    checkpoint_callback = pl.callbacks.ModelCheckpoint(filepath=os.path.join(
        args.output_dir, '{epoch}'),
                                                       monitor="val_loss",
                                                       mode="min",
                                                       verbose=True,
                                                       save_top_k=1)
    early_stop_callback = EarlyStopping(monitor='val_loss',
                                        min_delta=0.00,
                                        patience=args.early_stop_patience,
                                        verbose=True,
                                        mode='min')
    # wandb logger
    wandb_logger = WandbLogger(project="bart-qa-to-nli")
    train_params = dict(
        accumulate_grad_batches=args.gradient_accumulation_steps,
        gpus=args.n_gpu,
        max_epochs=args.num_train_epochs,
        early_stop_callback=early_stop_callback,
        gradient_clip_val=args.max_grad_norm,
        checkpoint_callback=checkpoint_callback,
        logger=wandb_logger,
        callbacks=[LoggingCallback()],
        val_check_interval=0.25,
    )

    if args.fp16:
        train_params["use_amp"] = args.fp16
        train_params["amp_level"] = args.fp16_opt_level

    if args.n_tpu_cores > 0:
        global xm
        import torch_xla.core.xla_model as xm

        train_params["num_tpu_cores"] = args.n_tpu_cores
        train_params["gpus"] = 0

    if args.n_gpu > 1:
        train_params["distributed_backend"] = "ddp"

    trainer = pl.Trainer(**train_params)

    return trainer
Example #5

def cli_main():

    parser = ArgumentParser()
    parser.add_argument("--DATA_PATH",
                        type=str,
                        help="path to folders with images")
    parser.add_argument("--MODEL_PATH",
                        default=None,
                        type=str,
                        help="path to model checkpoint")
    parser.add_argument("--batch_size",
                        default=128,
                        type=int,
                        help="batch size for SSL")
    parser.add_argument("--image_size",
                        default=256,
                        type=int,
                        help="image size for SSL")
    parser.add_argument("--image_embedding_size",
                        default=128,
                        type=int,
                        help="size of image representation of SIMCLR")
    parser.add_argument("--epochs",
                        default=200,
                        type=int,
                        help="number of epochs to train model")
    parser.add_argument("--lr",
                        default=1e-3,
                        type=float,
                        help="learning rate for training model")
    parser.add_argument(
        "--patience",
        default=-1,
        type=int,
        help=
        "automatically cuts off training if validation does not drop for (patience) epochs. Leave blank to have no validation based early stopping."
    )
    parser.add_argument("--val_split",
                        default=0.2,
                        type=float,
                        help="percent in validation data")
    parser.add_argument(
        "--pretrain_encoder",
        default=False,
        type=bool,
        help=
        "initialize resnet encoder with pretrained imagenet weights. Cannot be true if passing previous SSL model checkpoint."
    )
    parser.add_argument("--version",
                        default="0",
                        type=str,
                        help="version to name checkpoint for saving")
    parser.add_argument("--gpus",
                        default=1,
                        type=int,
                        help="number of gpus to use for training")
    parser.add_argument("--num_workers",
                        default=0,
                        type=int,
                        help="number of workers to use to fetch data")

    args = parser.parse_args()
    DATA_PATH = args.DATA_PATH
    batch_size = args.batch_size
    image_size = args.image_size
    num_workers = args.num_workers
    embedding_size = args.image_embedding_size
    epochs = args.epochs
    lr = args.lr
    patience = args.patience
    val_split = args.val_split
    pretrain = args.pretrain_encoder
    version = args.version
    model_checkpoint = args.MODEL_PATH
    gpus = args.gpus
    num_workers = args.num_workers

    dm = ImageModule(DATA_PATH,
                     val_split=val_split,
                     train_transform=SimCLRTrainDataTransform(image_size),
                     val_transform=SimCLREvalDataTransform(image_size),
                     num_workers=num_workers)
    dm.setup()

    #init model with batch size, num_samples (len of data), epochs to train, and autofinds learning rate
    model = SimCLR(arch='resnet18',
                   batch_size=batch_size,
                   num_samples=dm.num_samples,
                   gpus=gpus,
                   dataset='None',
                   max_epochs=epochs,
                   learning_rate=lr)  #

    model.encoder = resnet18(pretrained=pretrain,
                             first_conv=model.first_conv,
                             maxpool1=model.maxpool1,
                             return_all_feature_maps=False)
    model.projection = Projection(input_dim=512,
                                  hidden_dim=256,
                                  output_dim=embedding_size)  #overrides

    if patience > 0:
        cb = EarlyStopping('val_loss', patience=patience)
        trainer = Trainer(gpus=gpus,
                          max_epochs=epochs,
                          callbacks=[cb],
                          progress_bar_refresh_rate=5)
    else:
        trainer = Trainer(gpus=gpus,
                          max_epochs=epochs,
                          progress_bar_refresh_rate=5)

    if model_checkpoint is not None:
        model.load_state_dict(torch.load(model_checkpoint))
        print(
            'Successfully loaded your checkpoint. Keep in mind that this does not preserve the previous trainer states, only the model weights'
        )

    print('Model Initialized')
    trainer.fit(model, dm)

    Path(f"./models/SSL/SIMCLR_SSL_{version}").mkdir(parents=True,
                                                     exist_ok=True)
    torch.save(model.state_dict(),
               f"./models/SSL/SIMCLR_SSL_{version}/SIMCLR_SSL_{version}.pt")
Example #6
    def _active_body(self, pid=None):
        # init model 
        self.model = self.init_model(self.config.hparams)

        # init active learner and set it up for training
        self.learner = ActiveLearner(
            config      = self.config.exp.active_learning,
            model       = self.model,
            datamodule  = self.dm,
        ).setup(pid)
        
        # init dict to store results
        results = {'config': config_to_dict(self.config)}

        # init training with pl.LightningModule models
        if self.config.trainer is not None:
            # init logger (default to None so the trainer arg below is always defined)
            logger = None
            if self.config.logger is not None:
                logger = self.init_logger(pid)

            # init early stopping
            callbacks = list()
            if self.config.early_stop is not None:
                earlystop_callback = EarlyStopping(**vars(self.config.early_stop))
                callbacks.append(earlystop_callback)

            # init checkpoint callback
            if self.learner.val_ratio > 0:
                checkpoint_callback = ModelCheckpoint(
                    monitor     = self.config.early_stop.monitor,
                    save_last   = True,
                    mode        = self.config.early_stop.mode,
                )
                callbacks.append(checkpoint_callback)

            # make trainer
            trainer_args = vars(self.config.trainer)
            trainer_args.update({
                'logger': logger,
                'callbacks': callbacks
            })
            trainer = pl.Trainer(**trainer_args)

            # find optimal lr
            if self.config.exp.tune:
                trainer.auto_lr_find = True
                trainer.tune(
                    model               = self.learner.model,
                    train_dataloader    = self.learner.init_loader,
                    val_dataloaders     = self.learner.val_loader
                )

            # fit model to initial batch
            trainer.fit(
                model               = self.learner.model,
                train_dataloader    = self.learner.init_loader,
                val_dataloaders     = self.learner.val_loader
            )
            
            # test model and get results
            # self.learner.model = self.init_model(self.config.hparams).load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
            [metr] = trainer.test(
                model               = self.learner.model,
                test_dataloaders    = self.dm.test_dataloader()
            )
            counts = dict(self.learner._get_counts())
            cm = self.learner.model.cm
            # print(earlystop_callback.best_score)
            # print(trainer.checkpoint_callback.best_model_path)
            print(cm)

            # reset early stopping
            trainer.should_stop = False
            earlystop_callback.wait_count = 0
            # earlystop_callback.stopped_epoch = 0
            
            # log test results for the initial batch
            results.setdefault('metrics', list()).append(metr)
            results.setdefault('counts', list()).append(counts)
            results.setdefault('cms', list()).append(cm.tolist())

            # now receive samples one by one from a stream
            for i, (inp, tgt) in enumerate(self.learner.stream_loader):

                # infer label
                w = self.learner.infer(inp, use_torch=True).detach().cpu()  # logit odds
                p = torch.sigmoid(w)  # estimated probability that the current sample is in class 'high'

                # query if condition is met
                if self.learner.query(w, p):
                    self.learner.n_queried += 1
                    self.learner.queried.append((inp, tgt))  # add current sample to queried set
                
                # update the model coverage
                self.learner.coverage = self.learner.n_queried / (len(self.learner.init_batch) + i + 1)

                # if the number of queried samples is larger than the update size
                if len(self.learner.queried) >= self.learner.update_size:
                    self.learner.update()  # update active learner

                    # rebuild model from scratch
                    if self.learner.rebuild:
                        self.learner.model = self.init_model(self.config.hparams)
                    # or incrementally update existing model
                    # reload model from the last best checkpoint if there is a validation set
                    elif self.learner.val_loader is not None:
                        self.learner.model = self.learner.model.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

                    # update model to use a different learning rate for learning from datastream
                    if self.learner.update_lr is not None and not self.config.exp.tune:
                        self.learner.model.hparams.learning_rate = self.learner.update_lr

                    # increment the number of maximum training epochs
                    trainer.max_epochs += self.learner.update_epochs

                    # # find optimal lr
                    # if self.config.exp.tune:
                    #     trainer.tune(
                    #         model               = self.learner.model,
                    #         train_dataloader    = self.learner.train_loader,
                    #         val_dataloaders     = self.learner.val_loader
                    #     )

                    # re-fit model to the new trainset
                    trainer.fit(
                        model               = self.learner.model,
                        train_dataloader    = self.learner.train_loader,
                        val_dataloaders     = self.learner.val_loader
                    )
                    
                    # test fitted model again on test set
                    # self.learner.model = self.init_model(self.config.hparams).load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
                    [metr] = trainer.test(
                        model               = self.learner.model,
                        test_dataloaders    = self.dm.test_dataloader()
                    )
                    counts = dict(self.learner._get_counts())
                    cm = self.learner.model.cm
                    # print(earlystop_callback.best_score)
                    # print(trainer.checkpoint_callback.best_model_path)
                    print(cm)

                    # reset early stopping
                    trainer.should_stop = False
                    earlystop_callback.wait_count = 0
                    # earlystop_callback.stopped_epoch = 0
                    # earlystop_callback.based_on_eval_results = False

                    # log test results
                    results['metrics'].append(metr)
                    results['counts'].append(counts)
                    results['cms'].append(cm.tolist())

            # TODO: now that we have seen all samples from the stream, do we want to do anything else?

        # active learning with XGBoost
        else:
            # get initial batch
            X_init, y_init = map(lambda x: torch.cat(x, dim=0).numpy(), zip(self.learner.init_batch[:], self.learner.val_batch[:]))

            # fit model to initial batch
            self.learner.model.train(X_init, y_init)
            # self.model.train(X_init, y_init)

            # test model and get results
            X_test, y_test = map(lambda x: x.numpy(), self.dm.kemocon_test[:])
            metr, cm = self.learner.model.test(X_test, y_test)
            counts = dict(self.learner._get_counts())

            # save test results
            results.update({
                'metrics': [metr],
                'counts': [counts],
                'confmats': [cm.tolist()]
            })
            print(metr)
            print(cm)

            ## update xgb parameters before learning from the stream
            # vars(self.learner.model.hparams.bst).update({
            #     'process_type': 'update',
            #     'updater': 'refresh,prune',
            #     'refresh_leaf': True
            # })

            # get samples from a stream
            for i, (inp, tgt) in enumerate(self.learner.datastream):

                # infer label
                w = self.learner.infer(inp, use_torch=False)
                p = 1 / (1 + np.exp(-w))
                
                # query if condition is met
                if self.learner.query(w, p):
                    self.learner.n_queried += 1
                    self.learner.queried.append((inp.unsqueeze(0), tgt.unsqueeze(0)))

                # update the model coverage
                self.learner.coverage = self.learner.n_queried / (len(self.learner.init_batch) + i + 1)

                # if we have queried at least update_size samples
                if len(self.learner.queried) >= self.learner.update_size:
                    # update train + val & minority label
                    # & reset queried samples buffer
                    self.learner.update()

                    # update model with queried samples
                    X_train, y_train = self.learner.train_inp.numpy(), self.learner.train_tgt.numpy()

                    # rebuild model from scratch
                    if self.learner.rebuild:
                        self.learner.model = self.init_model(self.config.hparams)
                        self.learner.model.train(X_train, y_train)
                    # or incrementally update existing model
                    else:
                        self.learner.model.train(X_train, y_train, model=self.learner.model.bst)

                    # test updated model
                    metr, cm = self.learner.model.test(X_test, y_test)
                    counts = dict(self.learner._get_counts())

                    # save results
                    results['metrics'].append(metr)
                    results['counts'].append(counts)
                    results['confmats'].append(cm.tolist())
                    print(metr)
                    print(cm)

        return results
Example #7
File: train.py  Project: mlej8/ECSE552
def train(model):
    # create a logger
    logger = DictLogger()

    # create folder for each run
    folder = "models/{}".format(datetime.now().strftime("%b-%d-%H-%M-%S"))
    if not os.path.exists(folder):
        os.makedirs(folder)

    # early stopping
    early_stopping_callback = EarlyStopping(
        monitor='val_loss',  # monitor validation loss
        verbose=True,  # log early-stop events
        patience=patience,
        min_delta=0.00  # minimum change is 0
    )

    # update checkpoints based on validation loss by using ModelCheckpoint callback monitoring 'val_loss'
    checkpoint_callback = ModelCheckpoint(monitor='val_loss')

    # define trainer
    trainer = pl.Trainer(
        default_root_dir=
        folder,  # Lightning automates saving and loading checkpoints
        max_epochs=epochs,
        gpus=0,
        logger=logger,
        progress_bar_refresh_rate=30,
        callbacks=[early_stopping_callback, checkpoint_callback])

    # train
    trainer.fit(model=model,
                train_dataloader=train_loader,
                val_dataloaders=val_loader)

    # test
    result = trainer.test(test_dataloaders=test_loader, verbose=True)

    # save test result
    PATH = folder + '/result'
    with open(PATH, "w") as f:
        f.write(f"Model: {str(model)}\n")
        f.write(json.dumps(logger.metrics))
        f.write("\n")
        f.write(
            f"Lowest training loss: {str(min(logger.metrics['train_loss']))}\n"
        )
        f.write(
            f"Lowest validation loss: {str(min(logger.metrics['val_loss']))}\n"
        )
        f.write(f"Test loss: {result}")

    # plot training
    plt.plot(range(len(logger.metrics['train_loss'])),
             logger.metrics['train_loss'],
             lw=2,
             label='Training Loss')
    plt.plot(range(len(logger.metrics['val_loss'])),
             logger.metrics['val_loss'],
             lw=2,
             label='Validation Loss')
    plt.legend()
    plt.xlabel('Epoch')
    plt.ylabel('RMSE Loss')
    plt.savefig(folder +
                f"/{type(model).__name__}_training_validation_test_loss.png")
    plt.clf()

    # plot p loss
    plt.plot(range(len(logger.metrics['train_p_loss'])),
             logger.metrics['train_p_loss'],
             lw=2,
             label='Training Loss')
    plt.plot(range(len(logger.metrics['val_p_loss'])),
             logger.metrics['val_p_loss'],
             lw=2,
             label='Validation Loss')
    plt.legend()
    plt.xlabel('Epoch')
    plt.ylabel('RMSE Loss')
    plt.savefig(folder + f"/p_loss.png")
    plt.clf()

    # plot T loss
    plt.plot(range(len(logger.metrics['train_T_loss'])),
             logger.metrics['train_T_loss'],
             lw=2,
             label='Training Loss')
    plt.plot(range(len(logger.metrics['val_T_loss'])),
             logger.metrics['val_T_loss'],
             lw=2,
             label='Validation Loss')
    plt.legend()
    plt.xlabel('Epoch')
    plt.ylabel('RMSE Loss')
    plt.savefig(folder + f"/T_loss.png")
    plt.clf()

    # plot rh loss
    plt.plot(range(len(logger.metrics['train_rh_loss'])),
             logger.metrics['train_rh_loss'],
             lw=2,
             label='Training Loss')
    plt.plot(range(len(logger.metrics['val_rh_loss'])),
             logger.metrics['val_rh_loss'],
             lw=2,
             label='Validation Loss')
    plt.legend()
    plt.xlabel('Epoch')
    plt.ylabel('RMSE Loss')
    plt.savefig(folder + f"/rh_loss.png")
    plt.clf()

    # plot wv loss
    plt.plot(range(len(logger.metrics['train_wv_loss'])),
             logger.metrics['train_wv_loss'],
             lw=2,
             label='Training Loss')
    plt.plot(range(len(logger.metrics['val_wv_loss'])),
             logger.metrics['val_wv_loss'],
             lw=2,
             label='Validation Loss')
    plt.legend()
    plt.xlabel('Epoch')
    plt.ylabel('RMSE Loss')
    plt.savefig(folder + f"/wv_loss.png")
    plt.clf()
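
The four per-variable plotting blocks above differ only in the metric key and output file
name; inside train() they could be collapsed into a single loop with identical behaviour
(all keys and paths are taken from the code above, nothing new is assumed):

    for key in ("p", "T", "rh", "wv"):
        plt.plot(range(len(logger.metrics[f'train_{key}_loss'])),
                 logger.metrics[f'train_{key}_loss'],
                 lw=2,
                 label='Training Loss')
        plt.plot(range(len(logger.metrics[f'val_{key}_loss'])),
                 logger.metrics[f'val_{key}_loss'],
                 lw=2,
                 label='Validation Loss')
        plt.legend()
        plt.xlabel('Epoch')
        plt.ylabel('RMSE Loss')
        plt.savefig(folder + f"/{key}_loss.png")
        plt.clf()
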
Example #8

def train_default_zoobot_from_scratch(
        # absolutely crucial arguments
        save_dir,  # save model here
        schema,  # answer these questions
        # input data - specify *either* catalog (to be split) or the splits themselves
        catalog=None,
        train_catalog=None,
        val_catalog=None,
        test_catalog=None,
        # model training parameters
        model_architecture='efficientnet',
        batch_size=256,
        epochs=1000,
        patience=8,
        # data and augmentation parameters
        # datamodule_class=GalaxyDataModule,  # generic catalog of galaxies, will not download itself. Can replace with any datamodules from pytorch_galaxy_datasets
        color=False,
        resize_size=224,
        crop_scale_bounds=(0.7, 0.8),
        crop_ratio_bounds=(0.9, 1.1),
        # hardware parameters
        accelerator='auto',
        nodes=1,
        gpus=2,
        num_workers=4,
        prefetch_factor=4,
        mixed_precision=False,
        # replication parameters
        random_state=42,
        wandb_logger=None):

    slurm_debugging_logs()

    pl.seed_everything(random_state)

    assert save_dir is not None
    if not os.path.isdir(save_dir):
        os.mkdir(save_dir)

    if color:
        logging.warning(
            'Training on color images, not converting to greyscale')
        channels = 3
    else:
        logging.info('Converting images to greyscale before training')
        channels = 1

    strategy = None
    if (gpus is not None) and (gpus > 1):
        # only works as plugins, not strategy
        # strategy = 'ddp'
        strategy = DDPPlugin(find_unused_parameters=False)
        logging.info('Using multi-gpu training')

    if nodes > 1:
        assert gpus == 2
        logging.info('Using multi-node training')
        # this hangs silently on Manchester's slurm cluster - perhaps you will have more success?

    precision = 32
    if mixed_precision:
        logging.info(
            'Training with automatic mixed precision. Will reduce memory footprint but may cause training instability for e.g. resnet'
        )
        precision = 16

    assert num_workers > 0

    if (gpus is not None) and (num_workers * gpus > os.cpu_count()):
        logging.warning("""num_workers * gpu > num cpu.
            You may be spawning more dataloader workers than you have cpus, causing bottlenecks. 
            Suggest reducing num_workers.""")
    if num_workers > os.cpu_count():
        logging.warning("""num_workers > num cpu.
            You may be spawning more dataloader workers than you have cpus, causing bottlenecks. 
            Suggest reducing num_workers.""")

    if catalog is not None:
        assert train_catalog is None
        assert val_catalog is None
        assert test_catalog is None
        catalogs_to_use = {'catalog': catalog}
    else:
        assert catalog is None
        catalogs_to_use = {
            'train_catalog': train_catalog,
            'val_catalog': val_catalog,
            'test_catalog': test_catalog
        }

    datamodule = GalaxyDataModule(
        label_cols=schema.label_cols,
        # can take either a catalog (and split it), or a pre-split catalog
        **catalogs_to_use,
        #   augmentations parameters
        album=False,
        greyscale=not color,
        resize_size=resize_size,
        crop_scale_bounds=crop_scale_bounds,
        crop_ratio_bounds=crop_ratio_bounds,
        #   hardware parameters
        batch_size=
        batch_size,  # on 2xA100s, 256 with DDP, 512 with distributed (i.e. split batch)
        num_workers=num_workers,
        prefetch_factor=prefetch_factor)
    datamodule.setup()

    get_architecture, representation_dim = select_base_architecture_func_from_name(
        model_architecture)

    model = define_model.get_plain_pytorch_zoobot_model(
        output_dim=len(schema.answers),
        include_top=True,
        channels=channels,
        get_architecture=get_architecture,
        representation_dim=representation_dim)

    # This just adds schema.question_index_groups as an arg to the usual (labels, preds) loss arg format
    # Would use lambda but multi-gpu doesn't support as lambda can't be pickled
    def loss_func(preds, labels):  # pytorch convention is preds, labels
        return losses.calculate_multiquestion_loss(
            labels, preds, schema.question_index_groups
        )  # my and sklearn convention is labels, preds

    lightning_model = define_model.GenericLightningModule(model, loss_func)

    callbacks = [
        ModelCheckpoint(dirpath=os.path.join(save_dir, 'checkpoints'),
                        monitor="val_loss",
                        save_weights_only=True,
                        mode='min',
                        save_top_k=3),
        EarlyStopping(monitor='val_loss', patience=patience, check_finite=True)
    ]

    trainer = pl.Trainer(
        log_every_n_steps=3,
        accelerator=accelerator,
        gpus=gpus,  # per node
        num_nodes=nodes,
        strategy=strategy,
        precision=precision,
        logger=wandb_logger,
        callbacks=callbacks,
        max_epochs=epochs,
        default_root_dir=save_dir)

    logging.info((trainer.training_type_plugin, trainer.world_size,
                  trainer.local_rank, trainer.global_rank, trainer.node_rank))

    trainer.fit(lightning_model, datamodule)

    trainer.test(
        model=lightning_model,
        datamodule=datamodule,
        ckpt_path=
        'best'  # can optionally point to a specific checkpoint here e.g. "/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt"
    )
Example #9
def ml_mlp_mul_ms(station_name="종로구"):
    print("Start Multivariate MLP Mean Seasonality Decomposition (MSE) Model")
    targets = ["PM10", "PM25"]
    # targets = ["SO2", "CO", "O3", "NO2", "PM10", "PM25",
    #                   "temp", "u", "v", "pres", "humid", "prep", "snow"]
    # 24*14 = 336
    #sample_size = 336
    sample_size = 48
    output_size = 24
    # To debug, set fast_dev_run = True and use a small n_trials
    fast_dev_run = False
    n_trials = 128
    # fast_dev_run = True
    # n_trials = 1

    # Hyper parameter
    epoch_size = 500
    batch_size = 64
    learning_rate = 1e-3

    # Blocked Cross Validation
    # neglect small overlap between train_dates and valid_dates
    # 11y = ((2y, 0.5y), (2y, 0.5y), (2y, 0.5y), (2.5y, 1y))
    train_dates = [(dt.datetime(2008, 1, 4, 1).astimezone(SEOULTZ),
                    dt.datetime(2009, 12, 31, 23).astimezone(SEOULTZ)),
                   (dt.datetime(2010, 7, 1, 0).astimezone(SEOULTZ),
                    dt.datetime(2012, 6, 30, 23).astimezone(SEOULTZ)),
                   (dt.datetime(2013, 1, 1, 0).astimezone(SEOULTZ),
                    dt.datetime(2014, 12, 31, 23).astimezone(SEOULTZ)),
                   (dt.datetime(2015, 7, 1, 0).astimezone(SEOULTZ),
                    dt.datetime(2017, 12, 31, 23).astimezone(SEOULTZ))]
    valid_dates = [(dt.datetime(2010, 1, 1, 0).astimezone(SEOULTZ),
                    dt.datetime(2010, 6, 30, 23).astimezone(SEOULTZ)),
                   (dt.datetime(2012, 7, 1, 0).astimezone(SEOULTZ),
                    dt.datetime(2012, 12, 31, 23).astimezone(SEOULTZ)),
                   (dt.datetime(2015, 1, 1, 0).astimezone(SEOULTZ),
                    dt.datetime(2015, 6, 30, 23).astimezone(SEOULTZ)),
                   (dt.datetime(2018, 1, 1, 0).astimezone(SEOULTZ),
                    dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ))]
    train_valid_fdate = dt.datetime(2008, 1, 3, 1).astimezone(SEOULTZ)
    train_valid_tdate = dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ)

    # Debug
    if fast_dev_run:
        train_dates = [(dt.datetime(2015, 7, 1, 0).astimezone(SEOULTZ),
                        dt.datetime(2017, 12, 31, 23).astimezone(SEOULTZ))]
        valid_dates = [(dt.datetime(2018, 1, 1, 0).astimezone(SEOULTZ),
                        dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ))]
        train_valid_fdate = dt.datetime(2015, 7, 1, 0).astimezone(SEOULTZ)
        train_valid_tdate = dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ)

    test_fdate = dt.datetime(2019, 1, 1, 0).astimezone(SEOULTZ)
    test_tdate = dt.datetime(2020, 10, 31, 23).astimezone(SEOULTZ)

    # check date range assumption
    assert len(train_dates) == len(valid_dates)
    for i, (td, vd) in enumerate(zip(train_dates, valid_dates)):
        assert vd[0] > td[1]
    assert test_fdate > train_dates[-1][1]
    assert test_fdate > valid_dates[-1][1]

    train_features = [
        "SO2", "CO", "NO2", "PM10", "PM25", "temp", "wind_spd", "wind_cdir",
        "wind_sdir", "pres", "humid", "prep"
    ]
    train_features_periodic = [
        "SO2", "CO", "NO2", "PM10", "PM25", "temp", "wind_spd", "wind_cdir",
        "wind_sdir", "pres", "humid"
    ]
    train_features_nonperiodic = ["prep"]

    for target in targets:
        print("Training " + target + "...")
        output_dir = Path(
            f"/mnt/data/MLPMSMultivariate/{station_name}/{target}/")
        Path.mkdir(output_dir, parents=True, exist_ok=True)
        model_dir = output_dir / "models"
        Path.mkdir(model_dir, parents=True, exist_ok=True)
        log_dir = output_dir / "log"
        Path.mkdir(log_dir, parents=True, exist_ok=True)

        _df_h = data.load_imputed(HOURLY_DATA_PATH)
        df_h = _df_h.query('stationCode == "' +
                           str(SEOUL_STATIONS[station_name]) + '"')

        if station_name == '종로구' and \
            not Path("/input/python/input_jongno_imputed_hourly_pandas.csv").is_file():
            # load imputed result

            df_h.to_csv("/input/python/input_jongno_imputed_hourly_pandas.csv")

        # construct dataset for seasonality
        print("Construct Train/Validation Sets...", flush=True)
        train_valid_dataset = construct_dataset(train_valid_fdate,
                                                train_valid_tdate,
                                                filepath=HOURLY_DATA_PATH,
                                                station_name=station_name,
                                                target=target,
                                                sample_size=sample_size,
                                                output_size=output_size,
                                                transform=False)
        # compute seasonality
        train_valid_dataset.preprocess()

        # For blocked cross-validation:
        # load datasets for the given date ranges and transform them with the scaler from train_valid_dataset;
        # all datasets are stored in a tuple
        print("Construct Training Sets...", flush=True)
        train_datasets = tuple(
            construct_dataset(td[0],
                              td[1],
                              scaler_X=train_valid_dataset.scaler_X,
                              scaler_Y=train_valid_dataset.scaler_Y,
                              filepath=HOURLY_DATA_PATH,
                              station_name=station_name,
                              target=target,
                              sample_size=sample_size,
                              output_size=output_size,
                              features=train_features,
                              features_periodic=train_features_periodic,
                              features_nonperiodic=train_features_nonperiodic,
                              transform=True) for td in train_dates)

        print("Construct Validation Sets...", flush=True)
        valid_datasets = tuple(
            construct_dataset(vd[0],
                              vd[1],
                              scaler_X=train_valid_dataset.scaler_X,
                              scaler_Y=train_valid_dataset.scaler_Y,
                              filepath=HOURLY_DATA_PATH,
                              station_name=station_name,
                              target=target,
                              sample_size=sample_size,
                              output_size=output_size,
                              features=train_features,
                              features_periodic=train_features_periodic,
                              features_nonperiodic=train_features_nonperiodic,
                              transform=True) for vd in valid_dates)

        # just single test set
        print("Construct Test Sets...", flush=True)
        test_dataset = construct_dataset(
            test_fdate,
            test_tdate,
            scaler_X=train_valid_dataset.scaler_X,
            scaler_Y=train_valid_dataset.scaler_Y,
            filepath=HOURLY_DATA_PATH,
            station_name=station_name,
            target=target,
            sample_size=sample_size,
            output_size=output_size,
            features=train_features,
            features_periodic=train_features_periodic,
            features_nonperiodic=train_features_nonperiodic,
            transform=True)

        # convert tuple of datasets to ConcatDataset
        train_dataset = ConcatDataset(train_datasets)
        val_dataset = ConcatDataset(valid_datasets)

        # num_layers == number of hidden layers
        hparams = Namespace(num_layers=1,
                            layer_size=128,
                            learning_rate=learning_rate,
                            batch_size=batch_size)

        def objective(trial):
            model = BaseMLPModel(
                trial=trial,
                hparams=hparams,
                input_size=sample_size * len(train_features),
                sample_size=sample_size,
                output_size=output_size,
                station_name=station_name,
                target=target,
                features=train_features,
                features_periodic=train_features_periodic,
                features_nonperiodic=train_features_nonperiodic,
                train_dataset=train_dataset,
                val_dataset=val_dataset,
                test_dataset=test_dataset,
                scaler_X=train_valid_dataset.scaler_X,
                scaler_Y=train_valid_dataset.scaler_Y,
                output_dir=output_dir)

            # most basic trainer, uses good defaults
            trainer = Trainer(gpus=1 if torch.cuda.is_available() else None,
                              precision=32,
                              min_epochs=1,
                              max_epochs=20,
                              default_root_dir=output_dir,
                              fast_dev_run=fast_dev_run,
                              logger=True,
                              checkpoint_callback=False,
                              callbacks=[
                                  PyTorchLightningPruningCallback(
                                      trial, monitor="valid/MSE")
                              ])

            trainer.fit(model)

            # Don't Log
            # hyperparameters = model.hparams
            # trainer.logger.log_hyperparams(hyperparameters)

            return trainer.callback_metrics.get("valid/MSE")

        if n_trials > 1:
            study = optuna.create_study(direction="minimize")
            study.enqueue_trial({
                'sigma': 1.3,
                'num_layers': 4,
                'layer_size': 8,
                'learning_rate': learning_rate,
                'batch_size': batch_size
            })
            study.enqueue_trial({
                'sigma': 1.3,
                'num_layers': 4,
                'layer_size': 32,
                'learning_rate': learning_rate,
                'batch_size': batch_size
            })
            study.enqueue_trial({
                'sigma': 1.3,
                'num_layers': 4,
                'layer_size': 64,
                'learning_rate': learning_rate,
                'batch_size': batch_size
            })
            study.enqueue_trial({
                'sigma': 1.3,
                'num_layers': 4,
                'layer_size': 32,
                'learning_rate': learning_rate,
                'batch_size': batch_size
            })
            study.enqueue_trial({
                'sigma': 1.3,
                'num_layers': 8,
                'layer_size': 32,
                'learning_rate': learning_rate,
                'batch_size': batch_size
            })
            study.enqueue_trial({
                'sigma': 1.3,
                'num_layers': 12,
                'layer_size': 32,
                'learning_rate': learning_rate,
                'batch_size': batch_size
            })
            study.enqueue_trial({
                'sigma': 0.7,
                'num_layers': 4,
                'layer_size': 32,
                'learning_rate': learning_rate,
                'batch_size': batch_size
            })
            study.enqueue_trial({
                'sigma': 2.0,
                'num_layers': 4,
                'layer_size': 32,
                'learning_rate': learning_rate,
                'batch_size': batch_size
            })
            # timeout = 3600*36 = 36h
            study.optimize(objective, n_trials=n_trials, timeout=3600 * 36)

            trial = study.best_trial

            print("  Value: ", trial.value)

            print("  Params: ")
            for key, value in trial.params.items():
                print("    {}: {}".format(key, value))
            print("sample_size : ", sample_size)
            print("output_size : ", output_size)

            # plot optimization results
            fig_cont1 = optv.plot_contour(study,
                                          params=['num_layers', 'layer_size'])
            fig_cont1.write_image(
                str(output_dir / "contour_num_layers_layer_size.png"))
            fig_cont1.write_image(
                str(output_dir / "contour_num_layers_layer_size.svg"))

            fig_edf = optv.plot_edf(study)
            fig_edf.write_image(str(output_dir / "edf.png"))
            fig_edf.write_image(str(output_dir / "edf.svg"))

            fig_iv = optv.plot_intermediate_values(study)
            fig_iv.write_image(str(output_dir / "intermediate_values.png"))
            fig_iv.write_image(str(output_dir / "intermediate_values.svg"))

            fig_his = optv.plot_optimization_history(study)
            fig_his.write_image(str(output_dir / "opt_history.png"))
            fig_his.write_image(str(output_dir / "opt_history.svg"))

            fig_pcoord = optv.plot_parallel_coordinate(
                study, params=['num_layers', 'layer_size'])
            fig_pcoord.write_image(str(output_dir / "parallel_coord.png"))
            fig_pcoord.write_image(str(output_dir / "parallel_coord.svg"))

            fig_slice = optv.plot_slice(study,
                                        params=['num_layers', 'layer_size'])
            fig_slice.write_image(str(output_dir / "slice.png"))
            fig_slice.write_image(str(output_dir / "slice.svg"))

            # set hparams with optimized values
            hparams.num_layers = trial.params['num_layers']
            hparams.layer_size = trial.params['layer_size']

            dict_hparams = copy.copy(vars(hparams))
            dict_hparams["sample_size"] = sample_size
            dict_hparams["output_size"] = output_size
            with open(output_dir / 'hparams.json', 'w') as f:
                print(dict_hparams, file=f)
            with open(output_dir / 'hparams.csv', 'w') as f:
                print(pd.DataFrame.from_dict(dict_hparams, orient='index'),
                      file=f)

        model = BaseMLPModel(hparams=hparams,
                             input_size=sample_size * len(train_features),
                             sample_size=sample_size,
                             output_size=output_size,
                             station_name=station_name,
                             target=target,
                             features=train_features,
                             features_periodic=train_features_periodic,
                             features_nonperiodic=train_features_nonperiodic,
                             train_dataset=train_dataset,
                             val_dataset=val_dataset,
                             test_dataset=test_dataset,
                             scaler_X=train_valid_dataset.scaler_X,
                             scaler_Y=train_valid_dataset.scaler_Y,
                             output_dir=output_dir)

        # record input
        for i, _train_set in enumerate(train_datasets):
            _train_set.to_csv(
                model.data_dir /
                ("df_trainset_{0}_".format(str(i).zfill(2)) + target + ".csv"))
        for i, _valid_set in enumerate(valid_datasets):
            _valid_set.to_csv(
                model.data_dir /
                ("df_validset_{0}_".format(str(i).zfill(2)) + target + ".csv"))
        train_valid_dataset.to_csv(model.data_dir /
                                   ("df_trainvalidset_" + target + ".csv"))
        test_dataset.to_csv(model.data_dir / ("df_testset_" + target + ".csv"))

        checkpoint_callback = pl.callbacks.ModelCheckpoint(os.path.join(
            model_dir, "train_{epoch}_{valid/MSE:.2f}"),
                                                           monitor="valid/MSE",
                                                           period=10)

        early_stop_callback = EarlyStopping(monitor='valid/MSE',
                                            min_delta=0.001,
                                            patience=30,
                                            verbose=True,
                                            mode='min')

        # timestamped log version (date plus hour-minute)
        log_version = dt.datetime.now().strftime("%y%m%d-%H-%M")
        loggers = [
            TensorBoardLogger(log_dir, version=log_version),
            CSVLogger(log_dir, version=log_version)]

        # most basic trainer, uses good defaults
        trainer = Trainer(gpus=1 if torch.cuda.is_available() else None,
                          precision=32,
                          min_epochs=1,
                          max_epochs=epoch_size,
                          default_root_dir=output_dir,
                          fast_dev_run=fast_dev_run,
                          logger=loggers,
                          log_every_n_steps=5,
                          flush_logs_every_n_steps=10,
                          callbacks=[early_stop_callback],
                          checkpoint_callback=checkpoint_callback)

        trainer.fit(model)

        # run test set
        trainer.test(ckpt_path=None)

        shutil.rmtree(model_dir)
Example #10
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
from main import DistillBart
from preprocessing import load_multilingual_dataset
import pytorch_lightning as pl
import os

if __name__ == "__main__":
    # trainer = pl.Trainer(gpus=None)
    trainer = pl.Trainer(
        gpus=-1,
        callbacks=[
            EarlyStopping(monitor="val_loss"),
            ModelCheckpoint(
                dirpath="./drive/MyDrive/mlbart_ckpt",
                monitor="val_loss",
                filename="paraphrase_mlbart_{epoch:02d}-{val_loss:.2f}",
                save_top_k=-1,
                mode="min",
            ),
        ],
        progress_bar_refresh_rate=20,
    )
    train_dataloader, validation_dataloader = load_multilingual_dataset(
        dataset_path=f"{os.getcwd()}/drive/MyDrive/dataset", batch_size=4)
    model = DistillBart(9, 3)
    trainer.fit(model,
                train_dataloader=train_dataloader,
                val_dataloaders=validation_dataloader)
Example #11
    hparams['bands'] = len(util.get_wavelengths_for(opt.camera_type))
    hparams['augmentation_config'] = AUGMENTATION_CONFIG
    hparams['test_augmentation'] = True

    print("Hparams: %s" % hparams)

    model = DeepHsAblationStudyModule(hparams)
    logger = WandbLogger(hparams['git_id'],
                         offline=not opt.online_logging,
                         save_dir=opt.log_path,
                         project='deephs')

    early_stop_callback = EarlyStopping(monitor='val_loss',
                                        min_delta=0.00,
                                        verbose=True,
                                        mode='min',
                                        patience=20)

    checkpoint_callback = ModelCheckpoint(filepath='best.ckpt',
                                          save_top_k=1,
                                          verbose=True,
                                          monitor='val_loss',
                                          mode='min')

    trainer = lightning.Trainer(max_epochs=opt.num_epochs,
                                gpus=-1,
                                logger=logger,
                                early_stop_callback=early_stop_callback,
                                min_epochs=50,
                                checkpoint_callback=checkpoint_callback)
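
    # Completion sketch (assumption): the original snippet is truncated above and
    # presumably goes on to fit (and possibly test) the module, e.g.:
    trainer.fit(model)
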
Example #12
    def finetune(self,
                 dataset,
                 validation_split: float = 0.15,
                 epochs: int = 20,
                 batch_size: int = None,
                 optimal_batch_size: int = None,
                 early_stopping: bool = True,
                 trainer=None):
        self.batch_size = batch_size or 1

        if not torch.cuda.is_available():
            raise Exception(
                "You need a cuda capable (Nvidia) GPU for finetuning")

        len_train = int(len(dataset) * (1 - validation_split))
        len_valid = len(dataset) - len_train
        dataset_train, dataset_valid = torch.utils.data.random_split(
            dataset, [len_train, len_valid])

        self.dataset_train = dataset_train
        self.dataset_valid = dataset_valid

        if batch_size is None:
            # Find batch size
            temp_trainer = pl.Trainer(auto_scale_batch_size="power", gpus=-1)
            print("Finding the optimal batch size...")
            temp_trainer.tune(self)

            # Ensure that memory gets cleared
            del self.trainer
            del temp_trainer
            garbage_collection_cuda()

        trainer_kwargs = {}

        if optimal_batch_size:
            # Don't go over
            batch_size = min(self.batch_size, optimal_batch_size)
            accumulate_grad_batches = max(1,
                                          int(optimal_batch_size / batch_size))
            trainer_kwargs["accumulate_grad_batches"] = accumulate_grad_batches

        if early_stopping:
            # Stop when val loss stops improving
            early_stopping = EarlyStopping(monitor="val_loss", patience=1)
            trainer_kwargs["callbacks"] = [early_stopping]

        if not trainer:
            trainer = pl.Trainer(gpus=-1,
                                 max_epochs=epochs,
                                 checkpoint_callback=False,
                                 logger=False,
                                 **trainer_kwargs)

        self.model.train()
        trainer.fit(self)

        del self.dataset_train
        del self.dataset_valid
        del self.trainer

        # For some reason the model can end up on CPU after training
        self.to(self._model_device)
        self.model.eval()
        print(
            "Training finished! Save your model for later with backprop.save or upload it with backprop.upload"
        )
Example #13
                f.write(f'{model.test_preds[i]}\n\n')


if __name__ == "__main__":
    # set random seed
    pl.seed_everything(42)

    parser = ArgumentParser()
    parser = pl.Trainer.add_argparse_args(parser)
    parser.add_argument("--savename", type=str, default='no_name')
    parser.add_argument('--checkpoint', type=str, default='')
    parser.add_argument('--dataset', type=str, default='WQ')
    parser.add_argument('--batch_size', type=int, default=2)
    parser.add_argument('--pre_trained', type=str, default='t5', help='t5 or bart')

    args = parser.parse_args()


    # Define trainer
    tb_logger = pl_loggers.TensorBoardLogger('logs/')
    trainer = pl.Trainer.from_argparse_args( 
        args,  # max_epochs, gpus
        logger=tb_logger,
        callbacks=[EarlyStopping(monitor='val_loss')]
        )

    kgqg = KGQGDataModule('data/' + args.dataset, batch_size=args.batch_size, pre_trained=args.pre_trained)
    model = KGQGTuner.load_from_checkpoint(args.checkpoint, datamodule=kgqg, pre_trained=args.pre_trained)

    trainer.test(model=model, datamodule=kgqg)
    write_test_files(model, kgqg, name=args.savename)
Example #14
    def train(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        source_max_token_len: int = 512,
        target_max_token_len: int = 512,
        batch_size: int = 8,
        max_epochs: int = 5,
        use_gpu: bool = True,
        outputdir: str = "outputs",
        early_stopping_patience_epochs: int = 0,  # 0 to disable early stopping
        test_split=0.1,
        tpu_cores=None,
    ):
        """
        trains T5 model on custom dataset
        Args:
            data_df (pd.DataFrame): training datarame. Dataframe must have 2 column --> "keywords" and "text"
            source_max_token_len (int, optional): max token length of source text. Defaults to 512.
            target_max_token_len (int, optional): max token length of target text. Defaults to 512.
            batch_size (int, optional): batch size. Defaults to 8.
            max_epochs (int, optional): max number of epochs. Defaults to 5.
            use_gpu (bool, optional): if True, model uses gpu for training. Defaults to True.
            outputdir (str, optional): output directory to save model checkpoints. Defaults to "outputs".
            early_stopping_patience_epochs (int, optional): monitors val_loss on epoch end and stops training,
             if val_loss does not improve after the specied number of epochs. set 0 to disable early stopping.
             Defaults to 0 (disabled)
            :param test_df:
            :param train_df:
        """
        self.target_max_token_len = target_max_token_len
        self.max_epoch = max_epochs
        self.train_df = train_df
        self.test_df = test_df

        self.data_module = PLDataModule(
            train_df=train_df,
            test_df=test_df,
            tokenizer=self.tokenizer,
            batch_size=batch_size,
            source_max_token_len=source_max_token_len,
            target_max_token_len=target_max_token_len,
            split=test_split,
        )

        self.T5Model = LightningModel(tokenizer=self.tokenizer,
                                      model=self.model,
                                      output=outputdir)

        logger = WandbLogger(project="keytotext")

        early_stop_callback = ([
            EarlyStopping(
                monitor="val_loss",
                min_delta=0.00,
                patience=early_stopping_patience_epochs,
                verbose=True,
                mode="min",
            )
        ] if early_stopping_patience_epochs > 0 else None)

        gpus = -1 if use_gpu else 0

        trainer = Trainer(logger=logger,
                          callbacks=early_stop_callback,
                          max_epochs=max_epochs,
                          gpus=gpus,
                          progress_bar_refresh_rate=5,
                          tpu_cores=tpu_cores)

        trainer.fit(self.T5Model, self.data_module)
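# --- Added usage sketch (assumptions: a hypothetical wrapper class KeyToTextTrainer that
# owns self.tokenizer / self.model and exposes the train() method above, plus a tiny
# illustrative dataframe; none of this is from the original snippet). ---
import pandas as pd

df = pd.DataFrame({
    "keywords": ["sun beach holiday", "rain umbrella city"],
    "text": ["A sunny day at the beach on holiday.",
             "Walking through the city in the rain with an umbrella."],
})
kt = KeyToTextTrainer()  # hypothetical wrapper
kt.train(train_df=df, test_df=df,
         batch_size=2, max_epochs=1,
         use_gpu=False, early_stopping_patience_epochs=0)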
Example #15
    'precision': 16,
    'subset': 0.1,
    'test_size': 0.2,
    'seed': 42,
    'size': 256,
    'backbone': 'resnet18',
    'val_batches': 10
}

dm = DataModule(**config)

model = Resnet(config)

wandb_logger = WandbLogger(project="cassava", config=config)

es = EarlyStopping(monitor='val_acc', mode='max', patience=3)
checkpoint = ModelCheckpoint(
    dirpath='./',
    filename=f'{config["backbone"]}-{config["size"]}-{{val_acc:.5f}}',
    save_top_k=1,
    monitor='val_acc',
    mode='max')
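# Note (added): the doubled braces in the f-string above keep "{val_acc:.5f}" as a literal
# placeholder that ModelCheckpoint fills in when it saves a checkpoint, producing names
# like "resnet18-256-val_acc=0.89123.ckpt" (exact formatting depends on the
# pytorch_lightning version).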

trainer = pl.Trainer(gpus=1,
                     precision=config['precision'],
                     logger=wandb_logger,
                     max_epochs=config['max_epochs'],
                     callbacks=[es, checkpoint],
                     limit_val_batches=config['val_batches'])

trainer.fit(model, dm)
Example #16
                      checkpoint_callback=False,
                      logger=False)

    checkpoint_callback = ModelCheckpoint(
        filepath=os.getcwd(),
        save_top_k=2,
        verbose=True,
        monitor="val/loss",
        mode="min",
    )

    experiment_name = ...
    PROJECT_NAME = ...

    logger = WandbLogger(name=experiment_name, project=PROJECT_NAME)

    # And then actual training
    pl.seed_everything(42)
    trainer = Trainer(
        max_epochs=40,
        logger=logger,
        gpus=1,
        # precision=16,
        deterministic=True,
        accumulate_grad_batches=2,
        callbacks=[EarlyStopping(monitor="val/loss")],
        # resume_from_checkpoint = 'my_checkpoint.ckpt'
    )

    trainer.fit(model, dm)
Example #17
    def _body(self, pid=None):
        # init model
        self.model = self.init_model(self.config.hparams)

        # setup datamodule
        self.dm.setup(stage=None, test_id=pid)

        # init training with pl.LightningModule models
        if self.config.trainer is not None:
            # init logger
            if self.config.logger is not None:
                logger = self.init_logger(pid)

            # init lr monitor and callbacks
            callbacks = list()
            if self.config.hparams.scheduler is not None:
                callbacks.append(LearningRateMonitor(logging_interval='epoch'))

            # init early stopping
            if self.config.early_stop is not None:
                callbacks.append(EarlyStopping(**vars(self.config.early_stop)))

            # make trainer
            trainer_args = vars(self.config.trainer)
            trainer_args.update({
                'logger': logger,
                'callbacks': callbacks,
                'auto_lr_find': True if self.config.exp.tune else False
            })
            trainer = pl.Trainer(**trainer_args)

            # find optimal lr
            if self.config.exp.tune:
                trainer.tune(self.model, datamodule=self.dm)
            
            # train model
            trainer.fit(self.model, self.dm)

            # test model and get results
            [results] = trainer.test(self.model)

            # return metrics and confusion matrix
            metr = {
                'pid': pid,
                'acc': results['test_acc'],
                'ap': results['test_ap'],
                'f1': results['test_f1'],
                'auroc': results['test_auroc'],
                'num_epochs': self.model.current_epoch,
            }
            cm = self.model.cm
        
        else:
            # train model: concat train and valid inputs and labels and convert torch tensors to numpy arrays
            X_train, y_train = map(lambda x: torch.cat(x, dim=0).numpy(), zip(self.dm.kemocon_train[:], self.dm.kemocon_val[:]))
            self.model.train(X_train, y_train)

            # test model
            X_test, y_test = map(lambda x: x.numpy(), self.dm.kemocon_test[:])
            metr, cm = self.model.test(X_test, y_test)

        return metr, cm
Example #18
def cli_main():
    parser = ArgumentParser()
    parser.add_argument("--DATA_PATH",
                        type=str,
                        help="path to folders with images")
    parser.add_argument("--MODEL_PATH",
                        default=None,
                        type=str,
                        help="path to model checkpoint.")
    parser.add_argument("--encoder",
                        default=None,
                        type=str,
                        help="encoder for model found in encoders.py")
    parser.add_argument("--batch_size",
                        default=128,
                        type=int,
                        help="batch size for SSL")
    parser.add_argument("--num_workers",
                        default=0,
                        type=int,
                        help="number of workers to use to fetch data")
    parser.add_argument(
        "--hidden_dims",
        default=128,
        type=int,
        help=
        "hidden dimensions in classification layer added onto model for finetuning"
    )
    parser.add_argument("--epochs",
                        default=200,
                        type=int,
                        help="number of epochs to train model")
    parser.add_argument("--lr",
                        default=1e-3,
                        type=float,
                        help="learning rate for training model")
    parser.add_argument(
        "--patience",
        default=-1,
        type=int,
        help=
        "automatically cuts off training if validation does not drop for (patience) epochs. Leave blank to have no validation based early stopping."
    )
    parser.add_argument("--val_split",
                        default=0.2,
                        type=float,
                        help="percent in validation data")
    parser.add_argument(
        "--withhold_split",
        default=0,
        type=float,
        help=
        "decimal from 0-1 representing how much of the training data to withold from either training or validation"
    )
    parser.add_argument("--gpus",
                        default=1,
                        type=int,
                        help="number of gpus to use for training")
    parser.add_argument(
        "--eval",
        default=True,
        type=bool,
        help=
        "Eval Mode will train and evaluate the finetuned model's performance")
    parser.add_argument(
        "--pretrain_encoder",
        default=False,
        type=bool,
        help=
        "initialize resnet encoder with pretrained imagenet weights. Ignored if MODEL_PATH is specified."
    )
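    # Note (added): argparse's type=bool does not parse "False" back to False — any
    # non-empty string is truthy — so --eval and --pretrain_encoder only behave as expected
    # when left at their defaults or parsed with a custom string-to-bool converter.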
    parser.add_argument("--version",
                        default="0",
                        type=str,
                        help="version to name checkpoint for saving")

    args = parser.parse_args()
    DATA_PATH = args.DATA_PATH
    batch_size = args.batch_size
    num_workers = args.num_workers
    hidden_dims = args.hidden_dims
    epochs = args.epochs
    lr = args.lr
    patience = args.patience
    val_split = args.val_split
    withhold = args.withhold_split
    version = args.version
    MODEL_PATH = args.MODEL_PATH
    gpus = args.gpus
    eval_model = args.eval
    pretrain = args.pretrain_encoder
    encoder = args.encoder

    model = sslSIMCLR(encoder=encoder,
                      epochs=epochs,
                      pretrained=pretrain,
                      MODEL_PATH=MODEL_PATH,
                      DATA_PATH=DATA_PATH,
                      withhold=withhold,
                      batch_size=batch_size,
                      val_split=val_split,
                      hidden_dims=hidden_dims,
                      train_transform=SimCLRTrainDataTransform,
                      val_transform=SimCLRTrainDataTransform,
                      num_workers=num_workers)
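    # Note (added): the validation split reuses SimCLRTrainDataTransform here; depending on
    # the intent, SimCLREvalDataTransform is the more usual choice for val_transform.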

    online_evaluator = SSLOnlineEvaluator(drop_p=0.,
                                          hidden_dim=None,
                                          z_dim=model.embedding_size,
                                          num_classes=26,
                                          dataset='None')

    if patience > 0:
        cb = EarlyStopping('val_loss', patience=patience)
        trainer = Trainer(gpus=gpus,
                          max_epochs=epochs,
                          callbacks=[cb, online_evaluator],
                          progress_bar_refresh_rate=5)
    else:
        trainer = Trainer(gpus=gpus,
                          max_epochs=epochs,
                          callbacks=[online_evaluator],
                          progress_bar_refresh_rate=5)

    trainer.fit(model)
    Path(f"./models/SSL/SIMCLR_SSL_{version}").mkdir(parents=True,
                                                     exist_ok=True)
    torch.save(model.encoder.state_dict(),
               f"./models/SSL/SIMCLR_SSL_{version}/SIMCLR_SSL_{version}.pt")
Example #19
def train(debug=False,
          use_hdr=True,
          normalize=False,
          n_points=1280,
          num_workers=16,
          batch_size=32):
    """Train PointAR model

    Parameters
    ----------
    debug : bool
        Set debugging flag
    use_hdr : bool
        Use HDR SH coefficients data for training
    normalize : bool
        Normalize SH coefficients
    n_points : int
        Number of model input points, default 1280
    num_workers : int
        Number of workers for loading data, default 16
    batch_size : int
        Training batch size
    """

    # Specify dataset
    TestDataset = PointARTestDataset
    TrainDataset = TestDataset if debug else PointARTrainDataset

    # Get loaders ready
    loader_param = {'use_hdr': use_hdr}
    loaders, scaler = train_valid_test_split(TrainDataset,
                                             loader_param,
                                             TestDataset,
                                             loader_param,
                                             normalize=normalize,
                                             num_workers=num_workers,
                                             batch_size=batch_size)

    train_loader, valid_loader, test_loader = loaders

    # Get model ready
    model = PointAR(
        hparams={
            'n_shc': 27,
            'n_points': n_points,
            'min': torch.from_numpy(scaler.min_) if normalize else torch.zeros(27),
            'scale': torch.from_numpy(scaler.scale_) if normalize else torch.ones(27)
        })

    # Train
    sample_input = (torch.zeros((1, 3, n_points)).float().cuda(),
                    torch.zeros((1, 3, n_points)).float().cuda())

    trainer = pl.Trainer(gpus=1,
                         check_val_every_n_epoch=1,
                         callbacks=[
                             ModelSavingCallback(sample_input=sample_input),
                             EarlyStopping(monitor='valid_shc_mse')
                         ])

    # Start training
    trainer.fit(model,
                train_dataloader=train_loader,
                val_dataloaders=[valid_loader, test_loader])
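# --- Added usage sketch (hypothetical call, not part of the original snippet): a quick
# debug run using the arguments documented in the docstring above. ---
if __name__ == '__main__':
    train(debug=True,      # reuse the smaller test dataset for training
          use_hdr=True,
          normalize=False,
          n_points=1280,
          num_workers=4,
          batch_size=8)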
Example #20
                        wandb_logger = WandbLogger(project='recommender-xai',
                                                   tags=['vae', train_tag],
                                                   name=wandb_name)
                        trainer = pl.Trainer.from_argparse_args(
                            args,
                            # limit_test_batches=0.1,
                            # precision =16,
                            logger=wandb_logger,  # False
                            gradient_clip_val=0.5,
                            # accumulate_grad_batches=0,
                            gpus=0,
                            weights_summary='full',
                            checkpoint_callback=False,
                            callbacks=[
                                ProgressBar(),
                                EarlyStopping(monitor='train_loss')
                            ])

                        if (train):
                            print(
                                '<---------------------------------- VAE Training ---------------------------------->'
                            )
                            print(
                                "Running with the following configuration: \n{}"
                                .format(args))
                            if (synthetic_data):
                                model_params['synthetic_data'], model_params[
                                    'syn_y'] = data_utils.create_synthetic_data(
                                        no_generative_factors, experiment_path,
                                        expanded_user_item, continous_data,
                                        normalvariate, noise)
Example #21
File: exp.py  Project: CMUSTRUDEL/DIRTY
def train(args):
    config = json.loads(_jsonnet.evaluate_file(args["CONFIG_FILE"]))

    if args["--extra-config"]:
        extra_config = args["--extra-config"]
        extra_config = json.loads(extra_config)
        config = util.update(config, extra_config)

    # dataloaders
    batch_size = config["train"]["batch_size"]
    train_set = Dataset(
        config["data"]["train_file"],
        config["data"],
        percent=float(args["--percent"]),
    )
    dev_set = Dataset(config["data"]["dev_file"], config["data"])
    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        collate_fn=Dataset.collate_fn,
        num_workers=16,
        pin_memory=True,
    )
    val_loader = DataLoader(
        dev_set,
        batch_size=batch_size,
        collate_fn=Dataset.collate_fn,
        num_workers=8,
        pin_memory=True,
    )

    # model
    model = TypeReconstructionModel(config)

    wandb_logger = WandbLogger(name=args["--expname"],
                               project="dire",
                               log_model=True)
    wandb_logger.log_hyperparams(config)
    resume_from_checkpoint = (args["--eval-ckpt"]
                              if args["--eval-ckpt"] else args["--resume"])
    if resume_from_checkpoint == "":
        resume_from_checkpoint = None
    trainer = pl.Trainer(
        max_epochs=config["train"]["max_epoch"],
        logger=wandb_logger,
        gpus=1 if args["--cuda"] else None,
        auto_select_gpus=True,
        gradient_clip_val=1,
        callbacks=[
            EarlyStopping(
                monitor="val_retype_acc"
                if config["data"]["retype"] else "val_rename_acc",
                mode="max",
                patience=config["train"]["patience"],
            )
        ],
        check_val_every_n_epoch=config["train"]["check_val_every_n_epoch"],
        progress_bar_refresh_rate=10,
        accumulate_grad_batches=config["train"]["grad_accum_step"],
        resume_from_checkpoint=resume_from_checkpoint,
    )
    if args["--eval-ckpt"]:
        # HACK: necessary to make pl test work for IterableDataset
        Dataset.__len__ = lambda self: 1000000
        test_set = Dataset(config["data"]["test_file"], config["data"])
        test_loader = DataLoader(
            test_set,
            batch_size=config["test"]["batch_size"],
            collate_fn=Dataset.collate_fn,
            num_workers=8,
            pin_memory=True,
        )
        trainer.test(model,
                     test_dataloaders=test_loader,
                     ckpt_path=args["--eval-ckpt"])
    else:
        trainer.fit(model, train_loader, val_loader)
Example #22
def main(model_name, seed, group, save_path, save_images, baseline):
    """

    :return:
    """
    save_path = os.path.join(save_path, "experiments", group)
    os.makedirs(save_path, exist_ok=True)

    hparams = get_params(model_name)

    configuration_dict = get_configuration(model_name, hparams)

    # setup wandb pipeline
    wandb_logger = WandbLogger(
        name="{}-{}-{}".format(group, model_name, seed),
        save_dir=save_path,
        project=PROJECT,
        group=group,
        tags=group,
    )

    train, valid, test = get_loaders(hparams, configuration_dict)

    model_module = importlib.import_module(
        "src.segmentation.models.{}.model".format(model_name))

    model = model_module.Model(hparams)
    model.configuration = configuration_dict

    early_stop_callback = EarlyStopping(monitor="val_loss",
                                        min_delta=0.00,
                                        patience=30,
                                        verbose=False,
                                        mode="min")

    # Pytorch lightning trainer
    trainer = Trainer(
        gpus=1,
        weights_summary="top",
        max_epochs=50,
        logger=wandb_logger,
        early_stop_callback=early_stop_callback,
        num_sanity_val_steps=0,
        callbacks=[LearningRateLogger()]
        if hparams["scheduler_type"] != None else None,
        default_root_dir=save_path,
    )

    trainer.fit(model, train_dataloader=train, val_dataloaders=valid)

    del model
    torch.cuda.empty_cache()

    save_path = os.path.join(save_path, PROJECT,
                             wandb_logger.__getstate__()["_id"])

    model = load_best(model_module, configuration_dict, save_path)

    scores = get_results(model, valid, wandb_logger, save_path, save_images,
                         baseline)

    save_metrics(scores, save_path)

    move_best(save_path, group)
Example #23
def main():
    TRAIN_BATCH_SIZE = 32
    VAL_BATCH_SIZE = 32
    TEST_BATCH_SIZE = 32

    LEARNING_RATE = 1e-5
    MAX_EPOCHS = 20
    WEIGHT_DECAY = 0.0

    #DATA_PATH = "/bigtemp/rm5tx/nlp_project/data_cache/"
    DATA_PATH = os.path.expanduser("~/data_cache/")
    # DATA_NORM_PATH = os.path.expanduser("~/data_cache/")
    # DATA_ADJACENT_PATH = os.path.expanduser("~/data_adjacent_cache/")

    MAX_LEN = 128
    ADJACENT = True
    ADJRAT = 0.23
    ADJTOT = 2
    RATIO = 1

    data = ProjData(max_len=MAX_LEN,
                    ratio=RATIO,
                    adjacent=ADJACENT,
                    adjrat=ADJRAT,
                    adjtot=ADJTOT)
    if ADJACENT:
        # DATA_PATH = DATA_ADJACENT_PATH
        MODEL_NAME = 'nlp_proj_adjacent' + str(MAX_LEN)
    else:
        # DATA_PATH = DATA_NORM_PATH
        MODEL_NAME = 'nlp_proj_norm' + str(MAX_LEN)

    try:
        data.load(DATA_PATH)
        print("Loaded Saved Data")
    except Exception as e:
        print(e)
        data.setup()
        data.save(DATA_PATH)
    ### While working on the data pipeline, comment out the try block above and uncomment the lines below; otherwise the cached (old) data will be loaded and your changes skipped.
    # data.setup()
    # data.save(DATA_PATH)

    model = ProjModel(learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    logger = TensorBoardLogger(os.path.expanduser("~/tb_logs"),
                               name=MODEL_NAME)
    checkpoint_callback = ModelCheckpoint(
        monitor='valid_loss',
        dirpath=os.path.expanduser("~/saved_models"),
        save_last=True,
        filename=MODEL_NAME + '-{epoch:02d}-{avg_acc:.2f}',
    )
    earlystopping = EarlyStopping(monitor='avg_acc', verbose=True, patience=0)

    trainer = pl.Trainer(
        logger=logger,
        accelerator='ddp',  # jupyter can't use ddp, use dp instead
        # effective batch size is batch_size * num_gpus * num_nodes
        gpus=1,
        gradient_clip_val=1.0,
        max_epochs=MAX_EPOCHS,
        fast_dev_run=False,
        callbacks=[checkpoint_callback, earlystopping])
    trainer.fit(model, data.train_dataloader(batch_size=TRAIN_BATCH_SIZE),
                data.val_dataloader(batch_size=VAL_BATCH_SIZE))
Example #24
        filtered = []
        for token in decoded_sentence_list:
            if token not in special_tokens:
                filtered.append(token)
        return " ".join(filtered)


class MyEarlyStopping(EarlyStopping):
    def on_validation_end(self, trainer, pl_module):
        if pl_module.iter > pl_module.args.num_warmup_steps:
            self._run_early_stopping_check(trainer, pl_module)


early_stop_callback = EarlyStopping(monitor='val_qa_decode_Bleu_4',
                                    min_delta=0.00,
                                    patience=10,
                                    verbose=True,
                                    mode='max')
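# --- Added sketch (not from the original snippet): how the warmup-aware subclass above
# could be wired into a Trainer so the BLEU-based check cannot fire during warmup. ---
warmup_aware_stop = MyEarlyStopping(monitor='val_qa_decode_Bleu_4',
                                    min_delta=0.00,
                                    patience=10,
                                    verbose=True,
                                    mode='max')
# trainer = pl.Trainer(callbacks=[warmup_aware_stop], ...)  # hypothetical trainer setup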

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--hidden_dim",
                        type=int,
                        default=768,
                        help="Hidden dimensionality of the model")
    parser.add_argument("--latent_dim",
                        type=int,
                        default=768,
                        help="Hidden dimensionality of the model")
    parser.add_argument("--lr",
                        type=float,
Example #25
def optimize(trial: optuna.Trial, data_dict):
    gts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=10)
    input_size = data_dict['data'].shape[-1]
    output_size = 5
    checkpoint_callback = pl.callbacks.ModelCheckpoint(os.path.join(
        'models/', "trial_resnet_{}".format(trial.number)),
                                                       monitor="val_auc",
                                                       mode='max')
    logger = MetricsCallback()
    metrics = []
    sizes = []
    # trial_file = 'HPO/nn_hpo_2021-01-05.pkl'
    trial_file = None
    p = create_param_dict(trial, trial_file)
    p['batch_size'] = trial.suggest_int('batch_size', 8000, 15000)
    for i, (train_idx, val_idx) in enumerate(
            gts.split(data_dict['data'], groups=data_dict['date'])):
        idx = np.concatenate([train_idx, val_idx])
        data = copy.deepcopy(data_dict['data'][idx])
        target = copy.deepcopy(data_dict['target'][idx])
        date = copy.deepcopy(data_dict['date'][idx])
        train_idx = list(range(0, max(train_idx) + 1))
        val_idx = list(range(len(train_idx), len(idx)))
        data[train_idx] = calc_data_mean(data[train_idx],
                                         './cache',
                                         train=True,
                                         mode='mean')
        data[val_idx] = calc_data_mean(data[val_idx],
                                       './cache',
                                       train=False,
                                       mode='mean')
        model = Classifier(input_size, output_size, params=p)
        # model.apply(init_weights)
        dataset = FinData(data=data, target=target, date=date, multi=True)
        dataloaders = create_dataloaders(dataset,
                                         indexes={
                                             'train': train_idx,
                                             'val': val_idx
                                         },
                                         batch_size=p['batch_size'])
        es = EarlyStopping(monitor='val_loss',
                           patience=10,
                           min_delta=0.0005,
                           mode='min')
        trainer = pl.Trainer(logger=False,
                             max_epochs=500,
                             gpus=1,
                             callbacks=[
                                 checkpoint_callback, logger,
                                 PyTorchLightningPruningCallback(
                                     trial, monitor='val_loss'), es
                             ],
                             precision=16)
        trainer.fit(model,
                    train_dataloader=dataloaders['train'],
                    val_dataloaders=dataloaders['val'])
        val_loss = logger.metrics[-1]['val_loss'].item()
        metrics.append(val_loss)
        sizes.append(len(train_idx))
    metrics_mean = weighted_mean(metrics, sizes)
    return metrics_mean
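# --- Added usage sketch (assumed, not part of the original snippet): the objective above is
# typically bound to its data with functools.partial and minimized by an Optuna study; the
# MedianPruner choice and n_trials value are illustrative. ---
import functools
import optuna

study = optuna.create_study(direction='minimize',
                            pruner=optuna.pruners.MedianPruner())
study.optimize(functools.partial(optimize, data_dict=data_dict),  # data_dict prepared by the caller
               n_trials=50)
print('Best params:', study.best_params)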
Example #26
                        type=str,
                        default='t5',
                        help='t5 or bart')

    # add all the available trainer options to argparse
    parser = pl.Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    # Define trainer
    tb_logger = pl_loggers.TensorBoardLogger(args.logdir + '/')
    trainer = pl.Trainer.from_argparse_args(
        args,  # max_epochs, gpus
        logger=tb_logger,
        callbacks=[
            EarlyStopping(monitor='bleu_score',
                          verbose=True,
                          mode='max',
                          patience=5)
        ])

    # Load data and model
    kgqg = KGQGDataModule('data/' + args.dataset,
                          batch_size=args.batch_size,
                          pre_trained=args.pre_trained)
    model = KGQGTuner(kgqg,
                      learning_rate=args.learning_rate,
                      batch_size=args.batch_size,
                      optimizer=args.optimizer,
                      dataset=args.dataset,
                      pre_trained=args.pre_trained)

    # Fit model
    #     'train_batch_size': 64,  # configurable
    #     'eval_batch_size': 64  # configurable
    # })

    args = argparse.Namespace(**args_dict)

    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        filepath=args.output_dir,
        prefix="checkpoint",
        monitor="val_loss",
        mode="min",
        save_top_k=5)

    early_stop_callback = EarlyStopping(monitor='val_loss',
                                        min_delta=0.00,
                                        patience=3,
                                        verbose=False,
                                        mode='min')

    # -------------------------- sanity check conll -------------------------- #
    tokenizer = T5Tokenizer.from_pretrained(
        args.tokenizer_name_or_path)  # t5-base | t5-small

    dataset = MyDataset(tokenizer,
                        args.data_dir,
                        'val',
                        max_len=args.max_seq_length)

    print('Length of dataset is {}'.format(len(dataset)))
    data = dataset[0]
    print(tokenizer.decode(data['source_ids'], skip_special_tokens=True))
def train_model(args):
    # do not run this test for pytorch lightning below the min supported version
    import pytorch_lightning as pl
    if LooseVersion(pl.__version__) < LooseVersion(MIN_PL_VERSION):
        print("Skip test for pytorch_ligthning=={}, min support version is {}".
              format(pl.__version__, MIN_PL_VERSION))
        return

    # Initialize SparkSession
    conf = SparkConf().setAppName('pytorch_spark_mnist').set(
        'spark.sql.shuffle.partitions', '16')
    if args.master:
        conf.setMaster(args.master)
    elif args.num_proc:
        conf.setMaster('local[{}]'.format(args.num_proc))
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    # Setup our store for intermediate data
    store = Store.create(args.work_dir)

    # Download MNIST dataset
    data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
    libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
    if not os.path.exists(libsvm_path):
        subprocess.check_output(['wget', data_url, '-O', libsvm_path])

    # Load dataset into a Spark DataFrame
    df = spark.read.format('libsvm') \
        .option('numFeatures', '784') \
        .load(libsvm_path)

    # One-hot encode labels into SparseVectors
    encoder = OneHotEncoder(inputCols=['label'],
                            outputCols=['label_vec'],
                            dropLast=False)
    model = encoder.fit(df)
    train_df = model.transform(df)

    # Train/test split
    train_df, test_df = train_df.randomSplit([0.9, 0.1])

    # Define the PyTorch model without any Horovod-specific parameters
    class Net(LightningModule):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
            self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
            self.conv2_drop = nn.Dropout2d()
            self.fc1 = nn.Linear(320, 50)
            self.fc2 = nn.Linear(50, 10)

        def forward(self, x):
            x = x.float().reshape((-1, 1, 28, 28))
            x = F.relu(F.max_pool2d(self.conv1(x), 2))
            x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
            x = x.view(-1, 320)
            x = F.relu(self.fc1(x))
            x = F.dropout(x, training=self.training)
            x = self.fc2(x)
            return F.log_softmax(x, -1)

        def configure_optimizers(self):
            return optim.SGD(self.parameters(), lr=0.01, momentum=0.5)

        def training_step(self, batch, batch_idx):
            if batch_idx == 0:
                print(f"training data batch size: {batch['label'].shape}")
            x, y = batch['features'], batch['label']
            y_hat = self(x)
            loss = F.nll_loss(y_hat, y.long())
            self.log('train_loss', loss)
            return loss

        def validation_step(self, batch, batch_idx):
            if batch_idx == 0:
                print(f"validation data batch size: {batch['label'].shape}")
            x, y = batch['features'], batch['label']
            y_hat = self(x)
            loss = F.nll_loss(y_hat, y.long())
            self.log('val_loss', loss)

        def validation_epoch_end(self, outputs):
            avg_loss = torch.stack([
                x['val_loss'] for x in outputs
            ]).mean() if len(outputs) > 0 else float('inf')
            self.log('avg_val_loss', avg_loss)

    model = Net()

    # Train a Horovod Spark Estimator on the DataFrame
    backend = SparkBackend(num_proc=args.num_proc,
                           stdout=sys.stdout,
                           stderr=sys.stderr,
                           prefix_output_with_timestamp=True)

    from pytorch_lightning.callbacks import Callback

    epochs = args.epochs

    class MyDummyCallback(Callback):
        def __init__(self):
            self.epoch_end_counter = 0
            self.train_epoch_end_counter = 0
            self.validation_epoch_end_counter = 0

        def on_init_start(self, trainer):
            print('Starting to init trainer!')

        def on_init_end(self, trainer):
            print('Trainer is initialized.')

        def on_epoch_end(self, trainer, model):
            print('A train or eval epoch ended.')
            self.epoch_end_counter += 1

        def on_train_epoch_end(self, trainer, model, unused=None):
            print('A train epoch ended.')
            self.train_epoch_end_counter += 1

        def on_validation_epoch_end(self, trainer, model, unused=None):
            print('A val epoch ended.')
            self.validation_epoch_end_counter += 1

        def on_train_end(self, trainer, model):
            print(
                "Training ends: "
                f"epoch_end_counter={self.epoch_end_counter}, "
                f"train_epoch_end_counter={self.train_epoch_end_counter}, "
                f"validation_epoch_end_counter={self.validation_epoch_end_counter} \n"
            )
            assert self.train_epoch_end_counter <= epochs
            assert self.epoch_end_counter == self.train_epoch_end_counter + self.validation_epoch_end_counter

    callbacks = [MyDummyCallback()]

    # added EarlyStopping and ModelCheckpoint
    from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
    callbacks.append(ModelCheckpoint(dirpath=args.work_dir))

    from pytorch_lightning.callbacks.early_stopping import EarlyStopping
    callbacks.append(
        EarlyStopping(monitor='val_loss',
                      min_delta=0.00,
                      patience=3,
                      verbose=True,
                      mode='min'))  # val_loss should be minimized, so use mode='min'

    torch_estimator = hvd.TorchEstimator(backend=backend,
                                         store=store,
                                         model=model,
                                         input_shapes=[[-1, 1, 28, 28]],
                                         feature_cols=['features'],
                                         label_cols=['label'],
                                         batch_size=args.batch_size,
                                         epochs=args.epochs,
                                         validation=0.1,
                                         verbose=1,
                                         callbacks=callbacks,
                                         profiler="simple")

    torch_model = torch_estimator.fit(train_df).setOutputCols(['label_prob'])

    # Evaluate the model on the held-out test DataFrame
    pred_df = torch_model.transform(test_df)

    argmax = udf(lambda v: float(np.argmax(v)), returnType=T.DoubleType())
    pred_df = pred_df.withColumn('label_pred', argmax(pred_df.label_prob))
    evaluator = MulticlassClassificationEvaluator(predictionCol='label_pred',
                                                  labelCol='label',
                                                  metricName='accuracy')
    print('Test accuracy:', evaluator.evaluate(pred_df))

    spark.stop()
Example #29
def main(cfg: DictConfig):

    log.info("Arguments:\n %s", OmegaConf.to_yaml(cfg))

    seed_everything(42)

    if not cfg.dataset.fine_grained:
        target_encoding = {"negative": 0, "positive": 1}
    else:
        target_encoding = {
            "very negative": 0,
            "negative": 1,
            "neutral": 2,
            "positive": 3,
            "very positive": 4,
        }

    # hydra generates a new working directory for each run
    # want to store data in same directory each run
    root = hydra.utils.to_absolute_path(".data")

    log.info("Downloading data...")
    # 1. Get SST dataset
    train, val, test = SSTDatasetAlt(root=root,
                                     tokenizer=TokenizerSST(),
                                     **cfg.dataset)

    # 2. Setup encoder
    encoder = TransformerEncoder()
    encoder.add_vocab([train, val, test],
                      special_tokens={
                          "cls_token": "<cls>",
                          "sep_token": "<sep>"
                      },
                      **cfg.vocab)
    encoder.add_target_encoding(target_encoding)

    # 5. Setup train, val and test dataloaders
    dm = DataModule(
        train=train,
        val=val,
        test=test,
        collate_fn=encoder.collate_fn,
        batch_size=cfg.datamodule.batch_size,
    )

    # 6. Setup model
    num_class = 5 if cfg.dataset.fine_grained else 2
    model = TransformerWithClassifierHead(input_size=len(encoder.vocab),
                                          num_class=num_class,
                                          **cfg.model)
    optimizer = get_optimizer(model, **OmegaConf.to_container(cfg.optimizer))
    scheduler_args = {
        "lr_lambda": linear_schedule_with_warmup(
            num_warmup_steps=1000,
            num_training_steps=cfg.trainer.max_steps)
    }
    scheduler = get_scheduler(optimizer, name="LambdaLR", args=scheduler_args)
    classifier = TextClassifier(model,
                                optimizer=optimizer,
                                scheduler=scheduler)

    # 7. Setup trainer
    early_stop_callback = EarlyStopping(
        monitor="val_epoch_loss",
        min_delta=0.0001,
        patience=3,
        verbose=True,
        mode="min",
    )

    checkpoint_callback = ModelCheckpoint(
        filepath="./checkpoints/" + "{epoch}",
        save_top_k=1,
        verbose=True,
        monitor="val_epoch_loss",
        mode="min",
    )

    trainer = Trainer(checkpoint_callback=checkpoint_callback,
                      callbacks=[LoggingCallback(), early_stop_callback],
                      **cfg.trainer)
    log.info("Training...")
    # 8. Fit model
    trainer.fit(classifier, dm.train_dataloader(), dm.val_dataloader())

    # 9. Test model
    results = trainer.test(
        test_dataloaders=dm.test_dataloader(),
        ckpt_path=checkpoint_callback.best_model_path,
    )

    log.info(results)
Example #30
def run(cfg):
    pl.seed_everything(cfg.seed)

    output_dir = os.getcwd()
    output_dir = os.path.join(output_dir, "lightning_logs")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    os.chdir(hydra.utils.get_original_cwd())

    # lightningmodule checkpoint
    checkpoint_callback = ModelCheckpoint(
        monitor="val/acc",
        dirpath=output_dir,
        filename="epoch{epoch:02d}-val_acc{val/acc:.2f}",
        auto_insert_metric_name=False,
        save_top_k=3,
        mode="max",
    )

    # early stopping
    early_stopping_callback = EarlyStopping(
        monitor="val/acc",
        min_delta=0.0,
        patience=10,
        verbose=False,
        mode="max"
    )  # for a BNN we need more patience, since training is sensitive to the parameters

    # timer
    timer = Timer()

    callbacks = [early_stopping_callback, checkpoint_callback, timer]

    # logger
    wandb_logger = WandbLogger(
        name="conv_cnn_" +
        datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"),
        save_dir=output_dir,
        project="radial-bnn",
    )

    # initialize trainer
    trainer = pl.Trainer(logger=wandb_logger,
                         callbacks=callbacks,
                         **cfg.trainer)

    # datamodule
    dm = MNISTDataModule(cfg=cfg.datamodule)
    dm.prepare_data()
    dm.setup()

    # model
    model = ConvModule(cfg=cfg.lightningmodule)

    # train model
    trainer.fit(model=model, datamodule=dm)

    # test model
    trainer.test(datamodule=dm, ckpt_path="best")

    # training time
    logger.info("{} elapsed in training".format(timer.time_elapsed("train")))