Example #1
File: train.py  Project: jxtxwsbn/GEM-CNN
def main(config):

    with open(config) as f:
        conf = yaml.load(f, Loader=yaml.FullLoader)

    neptune_args = Namespace(**conf['logger'])
    model_args = Namespace(**conf['model'])
    trainer_args = Namespace(**conf['trainer'])

    neptune_api_key = os.environ['NEPTUNE_API_KEY']
    neptune_logger = NeptuneLogger(
        api_key=neptune_api_key,
        project_name=neptune_args.project_name,
        experiment_name=neptune_args.experiment_name,
        tags=neptune_args.tags
    )
    if model_args.task == 'regression':
        from gem_cnn.models.regression import MeshNetwork
    elif model_args.task == 'segmentation':
        from gem_cnn.models.segmentation import MeshNetwork
    else:
        raise ValueError(f'Unknown task: {model_args.task}')

    model_args.loss = loss_registry.get(model_args.loss)
    model_args.head_nonlinearity = nonlinearity_registry.get(model_args.head_nonlinearity)
    model_args.gem_nonlinearity = nonlinearity_registry.get(model_args.gem_nonlinearity)

    seed_everything()

    model = MeshNetwork(hparams=model_args)
    trainer_args.logger = neptune_logger
    trainer = Trainer.from_argparse_args(trainer_args)
    trainer.fit(model)
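
For reference, here is a minimal sketch of the YAML layout this main() expects, reconstructed from the keys the snippet reads; the values (and the registry entry names like 'mse' and 'relu') are placeholders, not taken from the GEM-CNN project:

import yaml
from argparse import Namespace

# Key names come from the snippet above; every value is a placeholder.
CONFIG = """
logger:
  project_name: my-workspace/my-project
  experiment_name: baseline
  tags: [gem-cnn, regression]
model:
  task: regression           # or: segmentation
  loss: mse                  # looked up in loss_registry
  head_nonlinearity: relu    # looked up in nonlinearity_registry
  gem_nonlinearity: relu
trainer:
  max_epochs: 100
  gpus: 1
"""

conf = yaml.load(CONFIG, Loader=yaml.FullLoader)
model_args = Namespace(**conf['model'])
print(model_args.task)  # -> regression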
Example #2
def generic_train(model: BaseTransformer, args: argparse.Namespace):
    # seed RNGs for reproducibility
    set_seed(args)

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))

    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        filepath=args.output_dir,
        prefix="checkpoint",
        monitor="val_loss",
        mode="min",
        save_top_k=5)

    train_params = dict(
        accumulate_grad_batches=args.gradient_accumulation_steps,
        gpus=args.n_gpu,
        max_epochs=args.num_train_epochs,
        early_stop_callback=False,
        gradient_clip_val=args.max_grad_norm,
        checkpoint_callback=checkpoint_callback,
        callbacks=[LoggingCallback()],
    )

    if args.fp16:
        train_params["use_amp"] = args.fp16
        train_params["amp_level"] = args.fp16_opt_level

    if args.n_tpu_cores > 0:
        global xm
        import torch_xla.core.xla_model as xm

        train_params["num_tpu_cores"] = args.n_tpu_cores
        train_params["gpus"] = 0

    if args.n_gpu > 1:
        train_params["distributed_backend"] = "dp"

    neptune_logger = NeptuneLogger(
        api_key=os.environ['NEPTUNE_API_TOKEN'],
        project_name="kevinjo/acl2020",
        experiment_name="default",  # Optional,
        params=vars(args),  # Optional,
        tags=args.tags  # Optional,
    )
    train_params.update({'logger': neptune_logger})

    trainer = pl.Trainer(**train_params)

    if args.do_train:
        trainer.fit(model)

    return trainer
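
A hedged usage sketch for generic_train: every attribute below is read somewhere inside the function (set_seed presumably reads args.seed), the concrete values are placeholders, and NEPTUNE_API_TOKEN must already be exported in the shell:

import argparse
import os

assert 'NEPTUNE_API_TOKEN' in os.environ  # read by NeptuneLogger inside generic_train

args = argparse.Namespace(
    seed=42,                        # consumed by set_seed (assumption)
    output_dir='out/',
    do_train=True,
    gradient_accumulation_steps=1,
    n_gpu=1,
    num_train_epochs=3,
    max_grad_norm=1.0,
    fp16=False,
    fp16_opt_level='O1',
    n_tpu_cores=0,
    tags=['baseline'],
)
# model = ...  # a BaseTransformer instance
# trainer = generic_train(model, args)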
Example #3
def setup_logger(config: Config, additional_tags: Optional[List[str]] = None):
    if additional_tags is None:
        additional_tags = []
    tags = additional_tags + config.experiment.tags.split()
    return NeptuneLogger(project_name="reformer-tts/reformer-tts",
                         experiment_name=config.experiment.experiment_name,
                         params={
                             **asdict(config),
                             **asdict(config.dataset),
                             **asdict(config.model),
                             **asdict(config.experiment.tts_training),
                         },
                         tags=tags)
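
setup_logger flattens the nested config by merging several asdict() calls, so nested fields also appear at the top level of the logged params. A toy illustration of that merge (the dataclasses here are stand-ins, not the real reformer-tts Config):

from dataclasses import asdict, dataclass, field

@dataclass
class Model:
    hidden_size: int = 256

@dataclass
class Config:
    run_name: str = 'demo'
    model: Model = field(default_factory=Model)

config = Config()
params = {**asdict(config), **asdict(config.model)}
# asdict(config) keeps the model nested as a dict; merging asdict(config.model)
# additionally exposes its fields at the top level
print(params)  # {'run_name': 'demo', 'model': {'hidden_size': 256}, 'hidden_size': 256}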
Example #4
def main(args):
    dict_args = vars(args)
    FLAGS = {}
    FLAGS["num_workers"] = dict_args["num_workers"]
    FLAGS["batch_size"] = dict_args["batch_size"]
    FLAGS["accumulation_steps"] = dict_args["acc_steps"]
    FLAGS["learning_rate"] = dict_args["lr"]
    FLAGS["weight_decay"] = dict_args["weight_decay"]
    FLAGS["num_epochs"] = dict_args["num_epochs"]
    FLAGS["exp_name"] = dict_args["model_name"]
    FLAGS["fold"] = dict_args["folds"]  # "0, 1, 2, 3"
    FLAGS["scheduler_pat"] = dict_args["scheduler_patience"]
    FLAGS["img_size"] = dict_args["img_size"]
    FLAGS["use_gn"] = dict_args["use_gn"]

    model = LightningWheat(model_name=dict_args["model_name"], hparams=FLAGS)
    checkpoint_callback = ModelCheckpoint(
        filepath="./" + dict_args["model_name"] + "_{epoch}-{avg_score:.5f}",
        monitor="avg_score",
        mode="max",
        save_last=True,
        save_weights_only=True,
        save_top_k=3)
    loggers = []
    tb_logger = TensorBoardLogger(save_dir="./lightning_logs")
    loggers.append(tb_logger)
    if dict_args['neptune_key'] != 'none':
        neptune_logger = NeptuneLogger(api_key=dict_args['neptune_key'],
                                       project_name="utsav/wheat-det",
                                       params=FLAGS,
                                       tags=["pytorch-lightning"])
        loggers.append(neptune_logger)
    trainer = Trainer(
        gpus=dict_args['gpus'],
        distributed_backend=dict_args['distributed_backend'],
        deterministic=True,
        benchmark=False,
        progress_bar_refresh_rate=200,
        logger=loggers,
        max_epochs=FLAGS["num_epochs"],
        accumulate_grad_batches=FLAGS["accumulation_steps"],
        weights_summary="top",
        checkpoint_callback=checkpoint_callback,
    )
    trainer.fit(model)
Example #5
def main(hparams):
    neptune_logger = NeptuneLogger(
        api_key="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vdWkubmVwdHVuZS5haSIsImFwaV91cmwiOiJodHRwczovL3VpLm5lcHR1bmUuYWkiLCJhcGlfa2V5IjoiN2I2ZWM0NmQtNjg0NS00ZjM5LTkzNTItN2I4Nzc0YTUzMmM0In0=",
        project_name="hirune924/kaggle-PANDA",
        close_after_fit=False,
        upload_source_files=['*.py','*.ipynb'],
        params=vars(hparams),
        experiment_name=hparams.experiment_name,  # Optional,
        #tags=["pytorch-lightning", "mlp"]  # Optional,
    )
    '''
    comet_logger = CometLogger(
        api_key="QCxbRVX2qhQj1t0ajIZl2nk2c",
        workspace='hirune924',  # Optional
        save_dir='.',  # Optional
        project_name="kaggle-panda",  # Optional
        #rest_api_key=os.environ.get('COMET_REST_API_KEY'),  # Optional
        #experiment_name='default'  # Optional
    )'''
    tb_logger = loggers.TensorBoardLogger(save_dir=hparams.log_dir, name='default', version=None)
 
    logger_list = [tb_logger, neptune_logger] #if hparams.distributed_backend!='ddp' else tb_logger

    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(hparams.log_dir, 'fold'+str(hparams.fold)+'-'+'{epoch}-{avg_val_loss}-{val_qwk}'),
        save_top_k=10,
        verbose=True,
        monitor='avg_val_loss',
        mode='min',
        save_weights_only=True,
        period=1
    )

    # default used by the Trainer
    early_stop_callback = EarlyStopping(
        monitor='avg_val_loss',
        patience=20,
        min_delta = 0.0,
        strict=True,
        verbose=True,
        mode='min'
    )

    if hparams.head == 'default':
        head = None
        avg_pool = 1
    elif hparams.head == 'custom':
        avg_pool = [2, 2]
        head = nn.Sequential(
            nn.Linear(2048 * 2 * 2, 512), Mish(), nn.BatchNorm1d(512),
            nn.Dropout(0.5), nn.Linear(512, 1))
    elif hparams.head == 'thin-2':
        avg_pool = [2, 2]
        head = nn.Linear(2048 * 2 * 2, 1)
    elif hparams.head == 'thin-3':
        avg_pool = [3, 3]
        head = nn.Linear(2048 * 3 * 3, 1)
    else:
        raise ValueError(f'Unknown head: {hparams.head}')
    model = get_cls_model_from_name(model_name=hparams.model_name, num_classes=1, pretrained=True, head=head, avg_pool=avg_pool)
    ckpt_pth = glob.glob(os.path.join(hparams.ckpt_dir,'fold'+str(hparams.fold)+'*.ckpt'))
    model = load_pytorch_model(ckpt_pth[0], model)

    pl_model = PLRegressionImageClassificationSystem(model, hparams)

    my_callback = MyCallback() if hparams.tile != 2 else MyCallback2()
###
    if hparams.auto_lr_find:
        trainer = Trainer()
        lr_finder = trainer.lr_find(pl_model)
        print(lr_finder.results)
        print(lr_finder.suggestion())
        pl_model.learning_rate = lr_finder.suggestion()
###

    trainer = Trainer(gpus=hparams.gpus,
                    max_epochs=hparams.max_epochs,
                    min_epochs=hparams.min_epochs,
                    max_steps=None,
                    min_steps=None,
                    checkpoint_callback=checkpoint_callback,
                    #early_stop_callback=early_stop_callback,
                    early_stop_callback=False,
                    callbacks=[my_callback],
                    logger=logger_list,
                    accumulate_grad_batches=hparams.accumulate_grad_batches,
                    precision=hparams.precision,
                    amp_level='O1',
                    auto_lr_find=False,
                    benchmark=True,
                    check_val_every_n_epoch=hparams.check_val_every_n_epoch,
                    distributed_backend=hparams.distributed_backend,
                    num_nodes=1,
                    fast_dev_run=False,
                    gradient_clip_val=0.0,
                    log_gpu_memory=False,
                    log_save_interval=100,
                    num_sanity_val_steps=5,
                    overfit_pct=0.0)

    # fit model !
    trainer.fit(pl_model)
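
Because the logger above is created with close_after_fit=False, the Neptune experiment stays open after trainer.fit() returns, so results computed afterwards can still be logged. A minimal sketch using the neptune-client Experiment API of that generation (metric name, value, and file are placeholders):

# the experiment is still open thanks to close_after_fit=False
neptune_logger.experiment.log_metric('final_qwk', 0.0)      # placeholder value
neptune_logger.experiment.log_artifact('predictions.csv')   # hypothetical file
neptune_logger.experiment.stop()                            # close it explicitly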
Example #6
# set seeds
torch.manual_seed(hparams.seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True  # note: benchmark=True trades bit-exact reproducibility for speed; set False for strict determinism
np.random.seed(hparams.seed)

# logging
# to use neptune logging, first run (no spaces around '='):
# export NEPTUNE_API_TOKEN='...'
logging.getLogger().setLevel('INFO')
source_files = [__file__]
if hparams.config:
    source_files.append(hparams.config)
neptune_logger = NeptuneLogger(project_name=hparams.neptune_project,
                               params=vars(hparams),
                               experiment_name=hparams.experiment_name,
                               tags=[hparams.experiment_name],
                               upload_source_files=source_files)
tb_logger = loggers.TensorBoardLogger(hparams.log_dir)

transform = Compose([
    BrightnessTransform(mu=0.0, sigma=0.3, data_key='data'),
    GammaTransform(gamma_range=(0.7, 1.3), data_key='data'),
    ContrastAugmentationTransform(contrast_range=(0.3, 1.7), data_key='data')
])

with open(hparams.train_set, 'r') as keyfile:
    train_keys = [l.strip() for l in keyfile.readlines()]
print(train_keys)

with open(hparams.val_set, 'r') as keyfile:
    val_keys = [l.strip() for l in keyfile.readlines()]
Example #7
def main():

    args = make_parser()
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
    seed_everything(args.seed)

    # Prepare output directory
    if not os.path.exists(os.path.join('../', args.output_dir)):
        os.mkdir(os.path.join('../', args.output_dir))
  
    args.output_dir = os.path.join('../', args.output_dir, args.exp_name)
    if os.path.exists(args.output_dir):
        flag_continue = input(f"Model name [{args.exp_name}] already exists. Do you want to overwrite? (y/n): ")
        if flag_continue.lower() == 'y' or flag_continue.lower() == 'yes':
            shutil.rmtree(args.output_dir)
            os.mkdir(args.output_dir)
        else:
            print("Exit pre-training")
            exit()
    else:
        os.mkdir(args.output_dir)

    # Setup for neptune logger
    neptune_api_key = os.environ['NEPTUNE_API_TOKEN']
    neptune_project_name = 'kevinjo/cs372'
    neptune_experiment_name = args.exp_name
    neptune_logger = NeptuneLogger(
        api_key=neptune_api_key,
        project_name=neptune_project_name,
        experiment_name=neptune_experiment_name,
        tags=["torch", "pretrain"],
        params=vars(args)
    )

    # Setup for pytorch-lightning params
    train_params = dict(
        logger=neptune_logger,
        gpus=args.n_gpu,
        gradient_clip_val=args.max_grad_norm,
        early_stop_callback=None,
        checkpoint_callback=False,
        # val_check_interval=args.validation_step,
        accumulate_grad_batches=args.grad_accum_steps,
        max_steps=args.max_steps,
        benchmark=args.benchmark,
    )

    # Setup for albert model
    albert_base_configuration = AlbertConfig(
        classifier_dropout_prob=args.classifier_dropout_prob,
        hidden_size=args.hidden_size,
        embedding_size=args.embedding_size,
        num_attention_heads=args.num_attention_heads,
        num_hidden_layers=args.num_hidden_layers,
        num_hidden_groups=args.num_hidden_groups,
        intermediate_size=args.intermediate_size,
        vocab_size=args.vocab_size,
        max_position_embeddings=args.max_position_embeddings,
        output_vocab_size=args.output_vocab_size,
        type_vocab_size=args.type_vocab_size,
    )
    model = ConsonantAlbert(args, albert_base_configuration)

    # Start model training
    trainer = pl.Trainer(auto_lr_find=False, profiler=False, amp_level='O2', precision=16, **train_params)
    if args.do_train:
        trainer.fit(model)
    return
Example #8
def main(hparams):
    neptune_logger = NeptuneLogger(
        api_key=
        "eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vdWkubmVwdHVuZS5haSIsImFwaV91cmwiOiJodHRwczovL3VpLm5lcHR1bmUuYWkiLCJhcGlfa2V5IjoiN2I2ZWM0NmQtNjg0NS00ZjM5LTkzNTItN2I4Nzc0YTUzMmM0In0=",
        project_name="hirune924/kaggle-PANDA",
        close_after_fit=False,
        upload_source_files=['*.py', '*.ipynb'],
        params=vars(hparams),
        experiment_name=hparams.experiment_name,  # Optional,
        #tags=["pytorch-lightning", "mlp"]  # Optional,
    )
    '''
    comet_logger = CometLogger(
        api_key="QCxbRVX2qhQj1t0ajIZl2nk2c",
        workspace='hirune924',  # Optional
        save_dir='.',  # Optional
        project_name="kaggle-panda",  # Optional
        #rest_api_key=os.environ.get('COMET_REST_API_KEY'),  # Optional
        #experiment_name='default'  # Optional
    )'''
    tb_logger = loggers.TensorBoardLogger(save_dir=hparams.log_dir,
                                          name='default',
                                          version=None)

    logger_list = [tb_logger, neptune_logger
                   ] if hparams.distributed_backend != 'ddp' else tb_logger

    checkpoint_callback = ModelCheckpoint(filepath=os.path.join(
        hparams.log_dir,
        'fold' + str(hparams.fold) + '-' + '{epoch}-{avg_val_loss}'),
                                          save_top_k=10,
                                          verbose=True,
                                          monitor='avg_val_loss',
                                          mode='min',
                                          save_weights_only=True,
                                          period=1)

    # default used by the Trainer
    early_stop_callback = EarlyStopping(monitor='avg_val_loss',
                                        patience=20,
                                        min_delta=0.0,
                                        strict=True,
                                        verbose=True,
                                        mode='min')

    model = get_seg_model_from_name(model_name=hparams.model_name,
                                    in_channels=3,
                                    num_classes=6,
                                    pretrained=True)
    pl_model = PLImageSegmentationRegSystem(model, hparams)

    ###
    if hparams.auto_lr_find:
        trainer = Trainer()
        lr_finder = trainer.lr_find(pl_model)
        print(lr_finder.results)
        print(lr_finder.suggestion())
        pl_model.learning_rate = lr_finder.suggestion()


###

    trainer = Trainer(gpus=hparams.gpus,
                      max_epochs=hparams.max_epochs,
                      min_epochs=hparams.min_epochs,
                      max_steps=None,
                      min_steps=None,
                      checkpoint_callback=checkpoint_callback,
                      early_stop_callback=early_stop_callback,
                      logger=logger_list,
                      accumulate_grad_batches=1,
                      precision=hparams.precision,
                      amp_level='O1',
                      auto_lr_find=False,
                      benchmark=True,
                      check_val_every_n_epoch=hparams.check_val_every_n_epoch,
                      distributed_backend=hparams.distributed_backend,
                      num_nodes=1,
                      fast_dev_run=False,
                      gradient_clip_val=0.0,
                      log_gpu_memory=None,
                      log_save_interval=100,
                      num_sanity_val_steps=5,
                      overfit_pct=0.0)

    # fit model !
    trainer.fit(pl_model)
Example #9
            os.mkdir(args.output_dir)
        else:
            print("Exit pre-training")
            exit()
    else:
        os.mkdir(args.output_dir)

    model = BaseElectra(args, config)
    
    neptune_api_key = os.environ['NEPTUNE_API_TOKEN']
    neptune_project_name = 'IRNLP/electra'
    neptune_experiment_name = 'electra_pytorch'

    neptune_logger = NeptuneLogger(
        api_key=neptune_api_key,
        project_name=neptune_project_name,
        experiment_name=neptune_experiment_name,
        tags=["torch", "pretrain"],
    )

    train_params = dict(
        gpus=args.n_gpu,
        gradient_clip_val=args.max_grad_norm,
        logger=neptune_logger,
        early_stop_callback=None,
    )

    trainer = pl.Trainer(profiler=False, **train_params)
    if args.do_train:
        trainer.fit(model)

    return
Example #10
                            default=[64, 128, 256, 512])
        parser.add_argument('--depths', type=list, default=[3, 4, 23, 3])
        parser.add_argument('--res_block', default=ResnetBottleneckBlock)

        # parser.add_argument('--backbone', default=model)

        return parser


# =========================================NEPTUNE AI===============================================================

CHECKPOINTS_DIR = '/media/backup/Arsenal/thesis_results/res_tl_usrp_intf_bpsk/'  # change this
neptune_logger = NeptuneLogger(
    api_key=
    "eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vdWkubmVwdHVuZS5haSIsImFwaV91cmwiOiJodHRwczovL3VpLm5lcHR1bmU"
    "uYWkiLCJhcGlfa2V5IjoiZjAzY2IwZjMtYzU3MS00ZmVhLWIzNmItM2QzOTY2NTIzOWNhIn0=",
    project_name="rachneet/sandbox",
    experiment_name="res_tl_usrp_intf_bpsk",  # change this  for new runs
)

# ===================================================================================================================
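
Hardcoding the token works but leaks credentials into source control; several other examples on this page read it from the environment instead. A sketch of the same logger with the token pulled from NEPTUNE_API_TOKEN:

import os

neptune_logger = NeptuneLogger(
    api_key=os.environ['NEPTUNE_API_TOKEN'],  # export NEPTUNE_API_TOKEN='...' beforehand
    project_name="rachneet/sandbox",
    experiment_name="res_tl_usrp_intf_bpsk",
)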


# function to test the model separately
def test_lightning(hparams: argparse.Namespace):
    # test on other set
    model = TransferLearningModel(hparams)
    checkpoint_path = '/media/backup/Arsenal/thesis_results/res_tl_usrp_vsg/epoch=0.ckpt'
    checkpoint = torch.load(checkpoint_path,
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint['state_dict'])
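
The snippet ends once the weights are restored; a hedged sketch of how evaluation could proceed from here, assuming TransferLearningModel implements the standard Lightning test hooks (the gpus value is a placeholder):

# run the Lightning test loop on the restored model
trainer = Trainer(gpus=1)
trainer.test(model)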