def main(config):
    """Load a YAML experiment config, build the GEM-CNN model and Neptune logger, and train.

    Args:
        config: path to a YAML file with 'logger', 'model' and 'trainer' sections.
    """
    with open(config) as f:
        # NOTE(review): FullLoader still constructs arbitrary Python-tagged
        # objects; prefer yaml.safe_load if configs can be untrusted.
        conf = yaml.load(f, Loader=yaml.FullLoader)

    neptune_args = Namespace(**conf['logger'])
    model_args = Namespace(**conf['model'])
    trainer_args = Namespace(**conf['trainer'])

    # Credentials come from the environment, never from the config file.
    neptune_api_key = os.environ['NEPTUNE_API_KEY']
    neptune_logger = NeptuneLogger(
        api_key=neptune_api_key,
        project_name=neptune_args.project_name,
        experiment_name=neptune_args.experiment_name,
        tags=neptune_args.tags,
    )

    # Task-specific import: both modules expose a MeshNetwork class.
    if model_args.task == 'regression':
        from gem_cnn.models.regression import MeshNetwork
    elif model_args.task == 'segmentation':
        from gem_cnn.models.segmentation import MeshNetwork
    else:
        # FIX: specific exception type and an informative message instead of
        # the original bare `raise Exception('Unknown task')`.
        raise ValueError(
            f"Unknown task: {model_args.task!r} (expected 'regression' or 'segmentation')")

    # Resolve registry keys to actual callables before handing them to the model.
    # NOTE(review): registry .get() presumably returns None for unknown keys —
    # confirm whether a missing key should fail loudly here.
    model_args.loss = loss_registry.get(model_args.loss)
    model_args.head_nonlinearity = nonlinearity_registry.get(model_args.head_nonlinearity)
    model_args.gem_nonlinearity = nonlinearity_registry.get(model_args.gem_nonlinearity)

    seed_everything()

    model = MeshNetwork(hparams=model_args)
    trainer_args.logger = neptune_logger
    trainer = Trainer.from_argparse_args(trainer_args)
    trainer.fit(model)
def generic_train(model: BaseTransformer, args: argparse.Namespace):
    """Assemble a pl.Trainer from parsed CLI args, attach a Neptune logger,
    optionally fit the model, and return the trainer."""
    set_seed(args)

    # Refuse to clobber an existing, non-empty output directory when training.
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(args.output_dir))

    ckpt_cb = pl.callbacks.ModelCheckpoint(
        filepath=args.output_dir,
        prefix="checkpoint",
        monitor="val_loss",
        mode="min",
        save_top_k=5,
    )

    trainer_kwargs = {
        "accumulate_grad_batches": args.gradient_accumulation_steps,
        "gpus": args.n_gpu,
        "max_epochs": args.num_train_epochs,
        "early_stop_callback": False,
        "gradient_clip_val": args.max_grad_norm,
        "checkpoint_callback": ckpt_cb,
        "callbacks": [LoggingCallback()],
    }

    if args.fp16:
        trainer_kwargs["use_amp"] = args.fp16
        trainer_kwargs["amp_level"] = args.fp16_opt_level

    if args.n_tpu_cores > 0:
        # torch_xla is imported lazily so non-TPU runs never need it.
        global xm
        import torch_xla.core.xla_model as xm

        trainer_kwargs["num_tpu_cores"] = args.n_tpu_cores
        trainer_kwargs["gpus"] = 0

    if args.n_gpu > 1:
        trainer_kwargs["distributed_backend"] = "dp"

    trainer_kwargs["logger"] = NeptuneLogger(
        api_key=os.environ['NEPTUNE_API_TOKEN'],
        project_name="kevinjo/acl2020",
        experiment_name="default",
        params=vars(args),
        tags=args.tags,
    )

    trainer = pl.Trainer(**trainer_kwargs)
    if args.do_train:
        trainer.fit(model)
    return trainer
def setup_logger(config: Config, additional_tags: Optional[List[str]] = None):
    """Create the NeptuneLogger for the reformer-tts project.

    Tags are the caller-supplied ``additional_tags`` (if any) followed by the
    whitespace-separated tags from ``config.experiment.tags``.
    """
    extra = [] if additional_tags is None else additional_tags
    experiment_params = {
        **asdict(config),
        **asdict(config.dataset),
        **asdict(config.model),
        **asdict(config.experiment.tts_training),
    }
    return NeptuneLogger(
        project_name="reformer-tts/reformer-tts",
        experiment_name=config.experiment.experiment_name,
        params=experiment_params,
        tags=extra + config.experiment.tags.split(),
    )
def main(args):
    """Train the LightningWheat detector with TensorBoard (and optionally
    Neptune) logging, checkpointing the best avg_score epochs."""
    opts = vars(args)

    # Hyperparameters handed to the LightningModule as a flat dict.
    FLAGS = {
        "num_workers": opts["num_workers"],
        "batch_size": opts["batch_size"],
        "accumulation_steps": opts["acc_steps"],
        "learning_rate": opts["lr"],
        "weight_decay": opts["weight_decay"],
        "num_epochs": opts["num_epochs"],
        "exp_name": opts["model_name"],
        "fold": opts["folds"],  # e.g. "0, 1, 2, 3"
        "scheduler_pat": opts["scheduler_patience"],
        "img_size": opts["img_size"],
        "use_gn": opts["use_gn"],
    }

    model = LightningWheat(model_name=opts["model_name"], hparams=FLAGS)

    checkpoint_callback = ModelCheckpoint(
        filepath="./" + opts["model_name"] + "_{epoch}-{avg_score:.5f}",
        monitor="avg_score",
        mode="max",
        save_last=True,
        save_weights_only=True,
        save_top_k=3,
    )

    # TensorBoard is always on; Neptune only when a key was supplied.
    active_loggers = [TensorBoardLogger(save_dir="./lightning_logs")]
    if opts['neptune_key'] != 'none':
        active_loggers.append(
            NeptuneLogger(
                api_key=opts['neptune_key'],
                project_name="utsav/wheat-det",
                params=FLAGS,
                tags=["pytorch-lightning"],
            ))

    trainer = Trainer(
        gpus=opts['gpus'],
        distributed_backend=opts['distributed_backend'],
        deterministic=True,
        benchmark=False,
        progress_bar_refresh_rate=200,
        logger=active_loggers,
        max_epochs=FLAGS["num_epochs"],
        accumulate_grad_batches=FLAGS["accumulation_steps"],
        weights_summary="top",
        checkpoint_callback=checkpoint_callback,
    )
    trainer.fit(model)
def main(hparams):
    """Fine-tune a PANDA regression classifier for one CV fold: restore fold
    weights from a checkpoint, attach TensorBoard + Neptune loggers, and fit."""
    # SECURITY(review): the Neptune API key used to be hard-coded only. Prefer
    # the NEPTUNE_API_TOKEN env var; the old literal remains as a fallback so
    # existing behavior is unchanged when the variable is unset. Rotate this key.
    neptune_logger = NeptuneLogger(
        api_key=os.environ.get(
            'NEPTUNE_API_TOKEN',
            "eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vdWkubmVwdHVuZS5haSIsImFwaV91cmwiOiJodHRwczovL3VpLm5lcHR1bmUuYWkiLCJhcGlfa2V5IjoiN2I2ZWM0NmQtNjg0NS00ZjM5LTkzNTItN2I4Nzc0YTUzMmM0In0="),
        project_name="hirune924/kaggle-PANDA",
        close_after_fit=False,
        upload_source_files=['*.py', '*.ipynb'],
        params=vars(hparams),
        experiment_name=hparams.experiment_name,
    )
    tb_logger = loggers.TensorBoardLogger(save_dir=hparams.log_dir, name='default', version=None)
    logger_list = [tb_logger, neptune_logger]

    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(
            hparams.log_dir,
            'fold' + str(hparams.fold) + '-' + '{epoch}-{avg_val_loss}-{val_qwk}'),
        save_top_k=10,
        verbose=True,
        monitor='avg_val_loss',
        mode='min',
        save_weights_only=True,
        period=1,
    )
    # Kept for easy re-enabling; early stopping is disabled in the Trainer below.
    early_stop_callback = EarlyStopping(
        monitor='avg_val_loss', patience=20, min_delta=0.0,
        strict=True, verbose=True, mode='min')

    # Select the classification head / pooling configuration.
    if hparams.head == 'default':
        head = None
        avg_pool = 1
    elif hparams.head == 'custom':
        avg_pool = [2, 2]
        head = nn.Sequential(
            nn.Linear(2048 * 2 * 2, 512), Mish(), nn.BatchNorm1d(512),
            nn.Dropout(0.5), nn.Linear(512, 1))
    elif hparams.head == 'thin-2':
        avg_pool = [2, 2]
        head = nn.Linear(2048 * 2 * 2, 1)
    elif hparams.head == 'thin-3':
        avg_pool = [3, 3]
        head = nn.Linear(2048 * 3 * 3, 1)
    else:
        # FIX: previously an unknown head fell through, leaving `head` and
        # `avg_pool` undefined and crashing later with a NameError.
        raise ValueError(f"Unknown head: {hparams.head!r}")

    model = get_cls_model_from_name(model_name=hparams.model_name, num_classes=1,
                                    pretrained=True, head=head, avg_pool=avg_pool)

    # Restore the weights for this fold from the checkpoint directory.
    ckpt_pth = glob.glob(os.path.join(hparams.ckpt_dir, 'fold' + str(hparams.fold) + '*.ckpt'))
    if not ckpt_pth:
        # FIX: previously ckpt_pth[0] raised a bare IndexError on no match.
        raise FileNotFoundError(
            f"No checkpoint matching fold{hparams.fold}*.ckpt in {hparams.ckpt_dir}")
    model = load_pytorch_model(ckpt_pth[0], model)
    pl_model = PLRegressionImageClassificationSystem(model, hparams)

    my_callback = MyCallback() if hparams.tile != 2 else MyCallback2()

    # Optional learning-rate sweep before the real run.
    if hparams.auto_lr_find:
        trainer = Trainer()
        lr_finder = trainer.lr_find(pl_model)
        print(lr_finder.results)
        print(lr_finder.suggestion())
        pl_model.learning_rate = lr_finder.suggestion()

    trainer = Trainer(gpus=hparams.gpus,
                      max_epochs=hparams.max_epochs,
                      min_epochs=hparams.min_epochs,
                      max_steps=None,
                      min_steps=None,
                      checkpoint_callback=checkpoint_callback,
                      # early_stop_callback=early_stop_callback,
                      early_stop_callback=False,
                      callbacks=[my_callback],
                      logger=logger_list,
                      accumulate_grad_batches=hparams.accumulate_grad_batches,
                      precision=hparams.precision,
                      amp_level='O1',
                      auto_lr_find=False,
                      benchmark=True,
                      check_val_every_n_epoch=hparams.check_val_every_n_epoch,
                      distributed_backend=hparams.distributed_backend,
                      num_nodes=1,
                      fast_dev_run=False,
                      gradient_clip_val=0.0,
                      log_gpu_memory=False,
                      log_save_interval=100,
                      num_sanity_val_steps=5,
                      overfit_pct=0.0)

    # fit model !
    trainer.fit(pl_model)
# set seeds torch.manual_seed(hparams.seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = True # False np.random.seed(hparams.seed) # logging # in order to use neptune logging: # export NEPTUNE_API_TOKEN = '...' !!! logging.getLogger().setLevel('INFO') source_files = [__file__] if hparams.config: source_files.append(hparams.config) neptune_logger = NeptuneLogger(project_name=hparams.neptune_project, params=vars(hparams), experiment_name=hparams.experiment_name, tags=[hparams.experiment_name], upload_source_files=source_files) tb_logger = loggers.TensorBoardLogger(hparams.log_dir) transform = Compose([ BrightnessTransform(mu=0.0, sigma=0.3, data_key='data'), GammaTransform(gamma_range=(0.7, 1.3), data_key='data'), ContrastAugmentationTransform(contrast_range=(0.3, 1.7), data_key='data') ]) with open(hparams.train_set, 'r') as keyfile: train_keys = [l.strip() for l in keyfile.readlines()] print(train_keys) with open(hparams.val_set, 'r') as keyfile:
def main():
    """Parse CLI args, prepare the experiment output directory, and pre-train
    the consonant-ALBERT model with Neptune logging and 16-bit AMP."""
    args = make_parser()
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
    seed_everything(args.seed)

    # Prepare output directory.
    # FIX: os.makedirs(..., exist_ok=True) replaces the racy exists()+mkdir()
    # pair and also succeeds when the parent directory is missing.
    os.makedirs(os.path.join('../', args.output_dir), exist_ok=True)
    args.output_dir = os.path.join('../', args.output_dir, args.exp_name)
    if os.path.exists(args.output_dir):
        flag_continue = input(
            f"Model name [{args.exp_name}] already exists. Do you want to overwrite? (y/n): ")
        # FIX: membership test replaces the duplicated .lower() comparison chain.
        if flag_continue.lower() in ('y', 'yes'):
            shutil.rmtree(args.output_dir)
            os.mkdir(args.output_dir)
        else:
            print("Exit pre-training")
            # FIX: exit() is a site-module convenience not guaranteed to exist;
            # raise SystemExit directly (same effect).
            raise SystemExit()
    else:
        os.mkdir(args.output_dir)

    # Setup for neptune logger: token from the environment, fixed project.
    neptune_api_key = os.environ['NEPTUNE_API_TOKEN']
    neptune_project_name = 'kevinjo/cs372'
    neptune_experiment_name = args.exp_name
    neptune_logger = NeptuneLogger(
        api_key=neptune_api_key,
        project_name=neptune_project_name,
        experiment_name=neptune_experiment_name,
        tags=["torch", "pretrain"],
        params=vars(args)
    )

    # Setup for pytorch-lightning params.
    train_params = dict(
        logger=neptune_logger,
        gpus=args.n_gpu,
        gradient_clip_val=args.max_grad_norm,
        early_stop_callback=None,
        checkpoint_callback=False,
        # val_check_interval=args.validation_step,
        accumulate_grad_batches=args.grad_accum_steps,
        max_steps=args.max_steps,
        benchmark=args.benchmark,
    )

    # Setup for albert model.
    albert_base_configuration = AlbertConfig(
        classifier_dropout_prob=args.classifier_dropout_prob,
        hidden_size=args.hidden_size,
        embedding_size=args.embedding_size,
        num_attention_heads=args.num_attention_heads,
        num_hidden_layers=args.num_hidden_layers,
        num_hidden_groups=args.num_hidden_groups,
        intermediate_size=args.intermediate_size,
        vocab_size=args.vocab_size,
        max_position_embeddings=args.max_position_embeddings,
        output_vocab_size=args.output_vocab_size,
        type_vocab_size=args.type_vocab_size,
    )
    model = ConsonantAlbert(args, albert_base_configuration)

    # Start model training (AMP opt level O2, fp16).
    trainer = pl.Trainer(auto_lr_find=False,
                         profiler=False,
                         amp_level='O2',
                         precision=16,
                         **train_params)
    if args.do_train:
        trainer.fit(model)
    return
def main(hparams):
    """Train the PANDA segmentation model for one CV fold with TensorBoard
    (and, outside ddp, Neptune) logging, checkpointing on avg_val_loss."""
    neptune = NeptuneLogger(
        api_key="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vdWkubmVwdHVuZS5haSIsImFwaV91cmwiOiJodHRwczovL3VpLm5lcHR1bmUuYWkiLCJhcGlfa2V5IjoiN2I2ZWM0NmQtNjg0NS00ZjM5LTkzNTItN2I4Nzc0YTUzMmM0In0=",
        project_name="hirune924/kaggle-PANDA",
        close_after_fit=False,
        upload_source_files=['*.py', '*.ipynb'],
        params=vars(hparams),
        experiment_name=hparams.experiment_name,
    )
    # (A disabled CometLogger configuration used to live here.)
    tensorboard = loggers.TensorBoardLogger(save_dir=hparams.log_dir, name='default', version=None)

    # Under ddp only the TensorBoard logger is used; otherwise log to both.
    if hparams.distributed_backend != 'ddp':
        active_loggers = [tensorboard, neptune]
    else:
        active_loggers = tensorboard

    ckpt_name = 'fold{}-'.format(hparams.fold) + '{epoch}-{avg_val_loss}'
    ckpt_cb = ModelCheckpoint(
        filepath=os.path.join(hparams.log_dir, ckpt_name),
        save_top_k=10,
        verbose=True,
        monitor='avg_val_loss',
        mode='min',
        save_weights_only=True,
        period=1,
    )
    # default used by the Trainer
    early_stop = EarlyStopping(
        monitor='avg_val_loss',
        patience=20,
        min_delta=0.0,
        strict=True,
        verbose=True,
        mode='min',
    )

    backbone = get_seg_model_from_name(
        model_name=hparams.model_name, in_channels=3, num_classes=6, pretrained=True)
    system = PLImageSegmentationRegSystem(backbone, hparams)

    # Optional learning-rate sweep before the real run.
    if hparams.auto_lr_find:
        finder = Trainer().lr_find(system)
        print(finder.results)
        print(finder.suggestion())
        system.learning_rate = finder.suggestion()

    trainer_kwargs = dict(
        gpus=hparams.gpus,
        max_epochs=hparams.max_epochs,
        min_epochs=hparams.min_epochs,
        max_steps=None,
        min_steps=None,
        checkpoint_callback=ckpt_cb,
        early_stop_callback=early_stop,
        logger=active_loggers,
        accumulate_grad_batches=1,
        precision=hparams.precision,
        amp_level='O1',
        auto_lr_find=False,
        benchmark=True,
        check_val_every_n_epoch=hparams.check_val_every_n_epoch,
        distributed_backend=hparams.distributed_backend,
        num_nodes=1,
        fast_dev_run=False,
        gradient_clip_val=0.0,
        log_gpu_memory=None,
        log_save_interval=100,
        num_sanity_val_steps=5,
        overfit_pct=0.0,
    )
    trainer = Trainer(**trainer_kwargs)

    # fit model !
    trainer.fit(system)
# NOTE(review): this chunk begins inside an if/else ladder whose opening `if`
# lies outside this view (it looks like an overwrite-or-abort prompt for an
# existing output directory, mirroring the pattern elsewhere in this file —
# confirm), and the trailing `return` shows the whole chunk sits inside a
# function. Indentation below is reconstructed from the collapsed source.
        os.mkdir(args.output_dir)
    else:
        print("Exit pre-training")
        exit()
else:
    os.mkdir(args.output_dir)

# Build the ELECTRA pre-training module from parsed args and model config
# (`config` is bound outside this view — verify against the caller).
model = BaseElectra(args, config)

# Neptune credentials come from the environment; project/experiment are fixed.
neptune_api_key = os.environ['NEPTUNE_API_TOKEN']
neptune_project_name = 'IRNLP/electra'
neptune_experiment_name = 'electra_pytorch'
neptune_logger = NeptuneLogger(
    api_key=neptune_api_key,
    project_name=neptune_project_name,
    experiment_name=neptune_experiment_name,
    tags=["torch", "pretrain"],
)

# Trainer settings: gradient clipping from args, Neptune logging, no early stop.
train_params = dict(
    gpus=args.n_gpu,
    gradient_clip_val=args.max_grad_norm,
    logger=neptune_logger,
    early_stop_callback=None,
)
trainer = pl.Trainer(profiler=False, **train_params)
if args.do_train:
    trainer.fit(model)
return
# NOTE(review): this chunk begins mid-call — the line below is the tail of a
# parser.add_argument(...) whose start lies outside this view.
                        default=[64, 128, 256, 512])
    parser.add_argument('--depths', type=list, default=[3, 4, 23, 3])
    parser.add_argument('--res_block', default=ResnetBottleneckBlock)
    # parser.add_argument('--backbone', default=model)
    return parser


# =========================================NEPTUNE AI===============================================================
CHECKPOINTS_DIR = '/media/backup/Arsenal/thesis_results/res_tl_usrp_intf_bpsk/'  # change this
# SECURITY(review): Neptune API key is hard-coded at module level — move it to
# the NEPTUNE_API_TOKEN environment variable and rotate this key.
neptune_logger = NeptuneLogger(
    api_key=
    "eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vdWkubmVwdHVuZS5haSIsImFwaV91cmwiOiJodHRwczovL3VpLm5lcHR1bmU"
    "uYWkiLCJhcGlfa2V5IjoiZjAzY2IwZjMtYzU3MS00ZmVhLWIzNmItM2QzOTY2NTIzOWNhIn0=",
    project_name="rachneet/sandbox",
    experiment_name="res_tl_usrp_intf_bpsk",  # change this for new runs
)
# ===================================================================================================================


# function to test the model separately
def test_lightning(hparams: argparse.Namespace):
    # test on other set
    model = TransferLearningModel(hparams)
    # Restore weights from a fixed checkpoint; map_location keeps tensors on CPU.
    checkpoint_path = '/media/backup/Arsenal/thesis_results/res_tl_usrp_vsg/epoch=0.ckpt'
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint['state_dict'])
    # NOTE(review): chunk ends here — the rest of test_lightning continues
    # beyond this view.