def _get_trainer_callbacks(cfg: CfgNode) -> List[Callback]:
    """Gets the trainer callbacks based on the given D2Go Config.

    Args:
        cfg: The normalized ConfigNode for this D2Go Task.

    Returns:
        A list of configured Callbacks to be used by the Lightning Trainer.
    """
    callbacks: List[Callback] = [
        LearningRateMonitor(logging_interval="step"),
        ModelCheckpoint(
            dirpath=cfg.OUTPUT_DIR,
            save_last=True,
        ),
    ]
    if cfg.QUANTIZATION.QAT.ENABLED:
        qat = cfg.QUANTIZATION.QAT
        callbacks.append(
            QuantizationAwareTraining(
                qconfig_dicts={submodule: None for submodule in cfg.QUANTIZATION.MODULES}
                if cfg.QUANTIZATION.MODULES
                else None,
                start_step=qat.START_ITER,
                enable_observer=(qat.ENABLE_OBSERVER_ITER, qat.DISABLE_OBSERVER_ITER),
                freeze_bn_step=qat.FREEZE_BN_ITER,
            )
        )
    return callbacks
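# Hedged usage sketch (not from the original snippet): the callback list built by
# _get_trainer_callbacks is meant to be handed to a Lightning Trainer. The exact
# Trainer arguments below (max_steps from cfg.SOLVER.MAX_ITER, the `task` object)
# are assumptions based on typical D2Go/detectron2 configs, not the author's code.
import pytorch_lightning as pl

def run_training_with_callbacks(task, cfg):
    trainer = pl.Trainer(
        max_steps=cfg.SOLVER.MAX_ITER,
        default_root_dir=cfg.OUTPUT_DIR,
        callbacks=_get_trainer_callbacks(cfg),
    )
    trainer.fit(task)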
def main():
    # get config and arguments
    args, config = pretrain_args()
    set_fixed_seed(args)
    system_config = {"args": args, "training_config": config}

    module_path = f'downstream.{args.downstream}'
    system = importlib.import_module(module_path + '.system')
    dataset = importlib.import_module(module_path + '.dataset')

    downstream_system = system.DownstreamSystem(**system_config)
    datamodule_config = {'data_config': config['datarc'],
                         'dataloader_config': config['dataloader']}
    downstream_dataset = dataset.DownstreamDataModule(**datamodule_config)

    wandb_logger = WandbLogger(name=args.expname, save_dir=args.expdir,
                               config=system_config)
    checkpoint_callback = ModelCheckpoint(**config['ModelCheckpoint_config'])
    # program_callback = ProgressBar()

    trainer_config = {**config['trainer_config'],
                      'default_root_dir': args.expdir,
                      'logger': wandb_logger,
                      'weights_save_path': args.expdir,
                      'callbacks': [checkpoint_callback]}
    trainer = Trainer(**trainer_config)
    trainer.fit(downstream_system, downstream_dataset)
    trainer.test()
def checkpointer(checkdir: Path, prefix: str, monitor: str = "train_loss") -> ModelCheckpoint:
    """Set up a ModelCheckpoint callback.

    Parameters
    ----------
    checkdir: Path
        Directory where checkpoints will be saved.
    prefix: str
        Identifier to prepend to checkpoint filenames.
    monitor: str
        Quantity to monitor.
    """
    suffix = "{epoch}__{train_loss:.3f}-{val_loss:.3f}"
    fullpath = Path(checkdir).resolve() / prefix
    filepath = str(fullpath) + suffix
    return ModelCheckpoint(filepath=filepath,
                           monitor=monitor,
                           verbose=True,
                           save_top_k=3,
                           mode="auto",
                           save_weights_only=False,
                           period=1)
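# Hedged usage sketch (not part of the original): the helper above targets the older
# PyTorch Lightning API (filepath/period), where the callback was passed through the
# Trainer's checkpoint_callback argument. The directory, prefix, and epoch count here
# are illustrative only.
from pathlib import Path
import pytorch_lightning as pl

ckpt_cb = checkpointer(Path("checkpoints"), prefix="resnet50__", monitor="val_loss")
trainer = pl.Trainer(max_epochs=10, checkpoint_callback=ckpt_cb)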
def main():  # run_dataloader()
    """main"""
    parser = get_parser()

    # add model specific args
    parser = BertLabeling.add_model_specific_args(parser)

    # add all the available trainer options to argparse
    # ie: now --gpus --num_nodes ... --fast_dev_run all work in the cli
    parser = Trainer.add_argparse_args(parser)

    args = parser.parse_args()
    model = BertLabeling(args)
    if args.pretrained_checkpoint:
        model.load_state_dict(
            torch.load(args.pretrained_checkpoint,
                       map_location=torch.device('cpu'))["state_dict"])

    checkpoint_callback = ModelCheckpoint(
        filepath=args.default_root_dir,
        save_top_k=2,
        verbose=True,
        monitor="coach_f1",
        period=-1,
        mode="max",
    )
    early_stop_callback = EarlyStopping(monitor="coach_f1",
                                        patience=args.early_stop,
                                        verbose=True,
                                        mode="max",
                                        min_delta=0.00)

    trainer = Trainer.from_argparse_args(
        args,
        checkpoint_callback=checkpoint_callback,
        callbacks=[early_stop_callback])

    if not args.only_test:
        trainer.fit(model)
        print(checkpoint_callback.best_model_path)

    # test
    model = BertLabeling.load_from_checkpoint(
        checkpoint_path=checkpoint_callback.best_model_path,
        map_location=None,
        batch_size=16,
        max_length=128,
        workers=0)
    trainer.test(model=model)

    # test on seen and unseen domains
    print("**********testing on unseen data**********")
    dataset_seen, dataset_unseen = get_dataloader_test(
        model.args.tgt_domain, tokenizer=model.tokenizer)
    model.dataset_test = dataset_unseen
    trainer.test(model=model)

    print("**********testing on seen data**********")
    model.dataset_test = dataset_seen
    trainer.test(model=model)
def main():
    """main"""
    parser = get_parser()

    # add model specific arguments.
    parser = BertForQA.add_model_specific_args(parser)

    # add all the available trainer options to argparse
    parser = Trainer.add_argparse_args(parser)

    args = parser.parse_args()
    model = BertForQA(args)
    if len(args.pretrained_checkpoint) > 1:
        model.load_state_dict(
            torch.load(args.pretrained_checkpoint,
                       map_location=torch.device('cpu'))["state_dict"])

    # print(args.output_dir)
    checkpoint_callback = ModelCheckpoint(dirpath=args.output_dir,
                                          filename='{epoch}-{val_loss:.2f}',
                                          verbose=True,
                                          period=-1,
                                          mode="auto")

    trainer = Trainer.from_argparse_args(
        args,
        checkpoint_callback=checkpoint_callback,
        accelerator="ddp",
        deterministic=True)

    trainer.fit(model)
def train_autoencoder():
    data = utils.load_data(root_dir='./data/', mode='train')
    data, target, features, date = utils.preprocess_data(data, nn=True)
    dataset = utils.FinData(data=data, target=target, date=date)
    p = {'batch_size': 4597,
         'dim_1': 231,
         'dim_2': 851,
         'dim_3': 777,
         'dim_4': 192,
         'hidden': 50,
         'dropout': 0.017122456592972537,
         'lr': 0.0013131268366473552,
         'activation': nn.GELU,
         'label_smoothing': 0.09401544509474698,
         'weight_decay': 0.005078413740277699,
         'amsgrad': True}
    train_idx = [i for i in range(len(data))]
    val_idx = [i for i in range(10000)]
    dataloaders = utils.create_dataloaders(dataset=dataset,
                                           indexes={'train': train_idx,
                                                    'val': val_idx},
                                           batch_size=p['batch_size'])
    checkpoint_callback = ModelCheckpoint(dirpath='logs',
                                          monitor='t_loss',
                                          mode='min',
                                          save_top_k=1,
                                          period=10)
    input_size = data.shape[-1]
    output_size = 1
    model = AutoEncoder(input_size=input_size,
                        output_size=output_size,
                        params=p)
    es = EarlyStopping(monitor='t_loss', patience=10,
                       min_delta=0.0005, mode='min')
    trainer = pl.Trainer(max_epochs=500,
                         gpus=1,
                         callbacks=[checkpoint_callback, es],
                         precision=16)
    trainer.fit(model, train_dataloader=dataloaders['train'])
def main(args):
    tt_logger = TestTubeLogger(save_dir=args.log_path,
                               name="",
                               debug=args.debug,
                               description=args.description,
                               create_git_tag=args.git_tag,
                               log_graph=True)
    tt_logger.experiment  # access the experiment property so the version directory is created

    log_dir = Path(tt_logger.save_dir) / f"version_{tt_logger.version}"
    checkpoint_dir = log_dir / "checkpoints"
    os.makedirs(checkpoint_dir, exist_ok=True)

    chkpt_callback = ModelCheckpoint(checkpoint_dir,
                                     monitor='Loss/d_loss_epoch',
                                     save_last=True,
                                     mode='auto',
                                     save_top_k=5,
                                     period=5)

    data_loader = TorchDataLoader.from_argparse_args(args)
    img_shape = data_loader.train_data[0][0].shape
    model = Engine.from_argparse_args(args, img_shape=img_shape)

    save_args(args, log_dir)

    trainer = Trainer.from_argparse_args(args,
                                         logger=tt_logger,
                                         checkpoint_callback=chkpt_callback)
    trainer.fit(model, data_loader)
def train(args):
    # if save path does not exist, create it
    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)

    model = ExplainNLP(args)

    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(args.save_path,
                              '{epoch}-{valid_loss:.4f}-{valid_acc_end:.4f}'),
        save_top_k=args.save_topk,
        save_last=True,
        monitor="valid_acc_end",
        mode="max",
    )
    logger = TensorBoardLogger(save_dir=args.save_path, name='log')

    # save args
    with open(os.path.join(args.save_path, "args.json"), 'w') as f:
        args_dict = args.__dict__
        del args_dict['tpu_cores']
        json.dump(args_dict, f, indent=4)

    trainer = Trainer.from_argparse_args(
        args,
        checkpoint_callback=checkpoint_callback,
        distributed_backend="ddp",
        logger=logger)

    trainer.fit(model)
def main(args):
    model = LightningLinearVAE(args)
    if (args.eigvectors is not None and args.eigvalues is not None):
        eigvectors = np.loadtxt(args.eigvectors)
        eigvalues = np.loadtxt(args.eigvalues)
        model.set_eigs(eigvectors, eigvalues)

    trainer = Trainer(
        max_epochs=args.epochs,
        gpus=args.gpus,
        check_val_every_n_epoch=1,
        gradient_clip_val=args.grad_clip,
    )
    ckpt_path = os.path.join(
        args.output_directory,
        trainer.logger.name,
        f"linear_vae_version_{trainer.logger.version}",
        "checkpoints",
    )
    checkpoint_callback = ModelCheckpoint(filepath=ckpt_path,
                                          period=1,
                                          monitor='val_loss',
                                          mode='min',
                                          verbose=True)
    trainer.checkpoint_callback = checkpoint_callback
    trainer.fit(model)
    torch.save(model.state_dict(), args.output_directory + '/last_ckpt.pt')
def main():
    """main"""
    parser = get_parser()
    parser = Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    model = BertClassificationTask(args)

    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(args.save_path, 'checkpoint',
                              '{epoch}-{val_loss:.4f}-{val_acc:.4f}'),
        save_top_k=1,
        save_last=False,
        monitor="val_acc",
        mode="max",
    )
    logger = TensorBoardLogger(save_dir=args.save_path, name='log')

    # save args
    with open(os.path.join(args.save_path, 'checkpoint', "args.json"), 'w') as f:
        args_dict = args.__dict__
        del args_dict['tpu_cores']
        json.dump(args_dict, f, indent=4)

    trainer = Trainer.from_argparse_args(
        args,
        checkpoint_callback=checkpoint_callback,
        distributed_backend="ddp",
        logger=logger)

    trainer.fit(model)
def train_single(self, train_dataloader, dev_dataloader):
    finetuner = LightningModel(train_dataloader=train_dataloader,
                               dev_dataloader=dev_dataloader,
                               model_type=self.model_type,
                               model_path=self.model_base_path,
                               test_dataloader=self.test_dataloader)
    early_stopping_callback = EarlyStopping(monitor='val_pearsonr',
                                            patience=10,
                                            mode="max")
    lr_logger_callback = LearningRateLogger()
    makedirs(self.model_base_path, exist_ok=True)
    model_checkpoint_callback = ModelCheckpoint(
        filepath=self.model_base_path,
        monitor="val_pearsonr",
        save_top_k=1)
    trainer = pl.Trainer(gpus=self.gpus,
                         max_epochs=100,
                         default_root_dir=self.model_base_path,
                         gradient_clip_val=self.grad_clip_norm,
                         log_save_interval=100,
                         val_check_interval=0.25,
                         checkpoint_callback=model_checkpoint_callback,
                         early_stop_callback=early_stopping_callback,
                         callbacks=[lr_logger_callback],
                         progress_bar_refresh_rate=1)
    trainer.fit(finetuner)
    trainer.test()
def main():
    """main"""
    parser = get_parser()

    # add model specific args
    parser = BertLabeling.add_model_specific_args(parser)

    # add all the available trainer options to argparse
    # ie: now --gpus --num_nodes ... --fast_dev_run all work in the cli
    parser = Trainer.add_argparse_args(parser)

    args = parser.parse_args()
    model = BertLabeling(args)
    if args.pretrained_checkpoint:
        model.load_state_dict(
            torch.load(args.pretrained_checkpoint,
                       map_location=torch.device('cpu'))["state_dict"])

    checkpoint_callback = ModelCheckpoint(
        filepath=args.default_root_dir,
        save_top_k=10,
        verbose=True,
        monitor="span_f1",
        period=-1,
        mode="max",
    )
    trainer = Trainer.from_argparse_args(
        args,
        checkpoint_callback=checkpoint_callback)

    trainer.fit(model)
def __init__(self, **kwargs):
    # Experiment results of name 'foo' are placed in directory results/foo/version_n/
    kwargs.setdefault('logger', loggers.TensorBoardLogger(
        'results/', name=kwargs['name'], version=kwargs.get('version')))
    # Early stopping is disabled
    kwargs.setdefault('early_stop_callback', False)
    # Create results and/or results/name if they don't exist
    if not os.path.exists('results'):
        os.system('mkdir results')
    if not os.path.exists('results/' + kwargs['name']):
        os.system('mkdir results/' + kwargs['name'])
    # Checkpoints are saved in directory results/foo/version_n/
    kwargs.setdefault('checkpoint_callback', ModelCheckpoint(
        filepath=('results/' + kwargs['name'] + '/version_'
                  + str(kwargs['logger'].version) + '/c'),
        monitor='val_energy',
        prefix='',
        save_top_k=-1
    ))
    kwargs.setdefault('log_save_interval', 100)  # logs are written to disk every 100 episodes
    kwargs.setdefault('row_log_interval', 1)  # logs are created every episode
    kwargs.setdefault('progress_bar_refresh_rate', 1)
    super(Trainer, self).__init__(**kwargs)
def main():
    """main"""
    parser = get_parser()

    # add model specific arguments.
    parser = BertForQA.add_model_specific_args(parser)

    # add all the available trainer options to argparse
    parser = Trainer.add_argparse_args(parser)

    args = parser.parse_args()
    print(args, "init")
    model = BertForQA(args)
    if len(args.pretrained_checkpoint) > 1:
        model.load_state_dict(
            torch.load(args.pretrained_checkpoint,
                       map_location=torch.device('cpu'))["state_dict"])
    if args.load_ner_bert:
        model.model.bert.load_state_dict(
            torch.load("./cached_models/ner_bert"), strict=False)

    checkpoint_callback = ModelCheckpoint(filepath=args.output_dir,
                                          save_top_k=args.max_keep_ckpt,
                                          verbose=True,
                                          period=-1,
                                          mode="auto")

    trainer = Trainer.from_argparse_args(
        args,
        checkpoint_callback=checkpoint_callback,
        deterministic=True)

    trainer.fit(model)
def main():
    from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    model = LNNP(args)
    checkpoint_callback = ModelCheckpoint(
        filepath=args.log_dir,
        monitor="val_loss",
        save_top_k=8,
        period=args.eval_interval,
    )
    lr_monitor = LearningRateMonitor(logging_interval='epoch')
    tb_logger = pl.loggers.TensorBoardLogger(args.log_dir)
    trainer = pl.Trainer(gpus=args.gpus,
                         max_epochs=args.num_epochs,
                         distributed_backend=args.distributed_backend,
                         num_nodes=args.num_nodes,
                         default_root_dir=args.log_dir,
                         auto_lr_find=False,
                         resume_from_checkpoint=args.load_model,
                         checkpoint_callback=checkpoint_callback,
                         callbacks=[lr_monitor],
                         logger=tb_logger,
                         reload_dataloaders_every_epoch=False)

    trainer.fit(model)

    # run test set after completing the fit
    trainer.test()
def main():
    parser = get_parser()
    parser = TNewsClassificationTask.add_model_specific_args(parser)
    parser = Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    task_model = TNewsClassificationTask(args)
    checkpoint_callback = ModelCheckpoint(filepath=args.output_dir,
                                          save_top_k=args.max_keep_ckpt,
                                          save_last=False,
                                          monitor="val_f1",
                                          verbose=True,
                                          mode='max',
                                          period=-1)
    task_trainer = Trainer.from_argparse_args(
        args,
        checkpoint_callback=checkpoint_callback,
        deterministic=True)

    task_trainer.fit(task_model)

    # after training, use the model checkpoint which achieves the best f1 score
    # on the dev set to compute the f1 on the test set.
    best_f1_on_dev, path_to_best_checkpoint = find_best_checkpoint_on_dev(
        args.output_dir,
        only_keep_the_best_ckpt=args.only_keep_the_best_ckpt_after_training)
    task_model.result_logger.info("=&" * 20)
    task_model.result_logger.info(f"Best F1 on DEV is {best_f1_on_dev}")
    task_model.result_logger.info(
        f"Best checkpoint on DEV set is {path_to_best_checkpoint}")
    task_model.result_logger.info("=&" * 20)
def test_load_ema_weights(self, tmp_dir):
    cfg = self._get_cfg(tmp_dir)
    cfg.MODEL_EMA.ENABLED = True
    task = GeneralizedRCNNTask(cfg)
    checkpoint_callback = ModelCheckpoint(dirpath=task.cfg.OUTPUT_DIR, save_last=True)
    trainer = pl.Trainer(
        max_steps=1,
        limit_train_batches=1,
        num_sanity_val_steps=0,
        callbacks=[checkpoint_callback],
    )
    with EventStorage() as storage:
        task.storage = storage
        trainer.fit(task)

    # load EMA weights from checkpoint
    task2 = GeneralizedRCNNTask.load_from_checkpoint(
        os.path.join(tmp_dir, "last.ckpt"))
    self.assertTrue(
        self._compare_state_dict(task.ema_state.state_dict(),
                                 task2.ema_state.state_dict()))

    # apply EMA weights to model
    task2.ema_state.apply_to(task2.model)
    self.assertTrue(
        self._compare_state_dict(task.ema_state.state_dict(),
                                 task2.model.state_dict()))
def test_model_checkpoint_callback(self):
    from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

    with spark_session('test_fit_model') as spark:
        df = create_noisy_xor_data(spark)
        model = create_xor_model()

        with tempdir() as dir:
            checkpoint_callback = ModelCheckpoint(dirpath=dir)
            callbacks = [checkpoint_callback]

            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    callbacks=callbacks)

                torch_model = torch_estimator.fit(df)

                # TODO: Find a way to pass log metrics from remote, and assert based on the logger.
                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
def _get_trainer(self, output_dir: str) -> pl.Trainer:
    checkpoint_callback = ModelCheckpoint(dirpath=output_dir, save_last=True)
    return pl.Trainer(
        max_steps=1,
        limit_train_batches=1,
        num_sanity_val_steps=0,
        callbacks=[checkpoint_callback],
        logger=None,
    )
def test_qat(self, tmp_dir):
    @META_ARCH_REGISTRY.register()
    class QuantizableDetMetaArchForTest(mah.DetMetaArchForTest):
        custom_config_dict = {"preserved_attributes": ["preserved_attr"]}

        def __init__(self, cfg):
            super().__init__(cfg)
            self.avgpool.preserved_attr = "foo"
            self.avgpool.not_preserved_attr = "bar"

        def prepare_for_quant(self, cfg):
            example_inputs = (torch.rand(1, 3, 3, 3),)
            self.avgpool = prepare_qat_fx(
                self.avgpool,
                {"": set_backend_and_create_qconfig(cfg, is_train=self.training)},
                example_inputs,
                self.custom_config_dict,
            )
            return self

        def prepare_for_quant_convert(self, cfg):
            self.avgpool = convert_fx(
                self.avgpool,
                convert_custom_config_dict=self.custom_config_dict)
            return self

    cfg = self._get_cfg(tmp_dir)
    cfg.MODEL.META_ARCHITECTURE = "QuantizableDetMetaArchForTest"
    cfg.QUANTIZATION.QAT.ENABLED = True
    task = GeneralizedRCNNTask(cfg)

    callbacks = [
        QuantizationAwareTraining.from_config(cfg),
        ModelCheckpoint(dirpath=task.cfg.OUTPUT_DIR, save_last=True),
    ]
    trainer = pl.Trainer(
        max_steps=1,
        limit_train_batches=1,
        num_sanity_val_steps=0,
        callbacks=callbacks,
        logger=False,
    )
    with EventStorage() as storage:
        task.storage = storage
        trainer.fit(task)
        prepared_avgpool = task._prepared.model.avgpool
        self.assertEqual(prepared_avgpool.preserved_attr, "foo")
        self.assertFalse(hasattr(prepared_avgpool, "not_preserved_attr"))

    with temp_defrost(cfg):
        cfg.MODEL.WEIGHTS = os.path.join(tmp_dir, "last.ckpt")
        model = GeneralizedRCNNTask.build_model(cfg, eval_only=True)
        self.assertTrue(isinstance(model.avgpool, torch.fx.GraphModule))
def main(args):
    print(args)
    if args.load_from_checkpoint is None:
        raise ValueError('`load-from-checkpoint` should be specified.')

    model = TripletVAE(args.load_from_checkpoint,
                       n_hidden=args.n_hidden,
                       n_layers=args.n_layers,
                       learning_rate=args.learning_rate,
                       vae_learning_rate=args.vae_lr,
                       scheduler=args.scheduler)
    print(model)

    if args.profile:
        profiler = AdvancedProfiler()
    else:
        profiler = None

    dm = TripletDataModule(args.train_biom,
                           args.test_biom,
                           args.val_biom,
                           metadata=args.sample_metadata,
                           batch_category=args.batch_category,
                           class_category=args.class_category,
                           segment_triples=args.segment_triples,
                           batch_size=args.batch_size,
                           num_workers=args.num_workers)

    ckpt_path = os.path.join(args.output_directory, "checkpoints")
    checkpoint_callback = ModelCheckpoint(dirpath=ckpt_path,
                                          period=1,
                                          monitor='val/triplet_loss',
                                          mode='min',
                                          verbose=True)

    os.mkdir(args.output_directory)
    tb_logger = pl_loggers.TensorBoardLogger(f'{args.output_directory}/logs/')

    # save hyper-parameters to yaml file
    with open(f'{args.output_directory}/hparams.yaml', 'w') as outfile:
        yaml.dump(model._hparams, outfile, default_flow_style=False)

    trainer = Trainer(max_epochs=args.epochs,
                      gpus=args.gpus,
                      check_val_every_n_epoch=10,
                      gradient_clip_val=args.grad_clip,
                      profiler=profiler,
                      logger=tb_logger,
                      callbacks=[checkpoint_callback])
    trainer.fit(model, dm)

    ckpt_path = args.output_directory + '/last_ckpt.pt'
    trainer.save_checkpoint(ckpt_path)

    # Perform KNN classification
    batch = next(iter(dm.test_dataloader()))
    res = model.test_step(batch, 0)['test/knn_results']
    open(f'{args.output_directory}/cross_validation.csv', 'w').write(res)
def get_lt_trainer(output_dir: str, cfg):
    checkpoint_callback = ModelCheckpoint(dirpath=output_dir, save_last=True)
    return pl.Trainer(
        max_epochs=10**8,
        max_steps=cfg.SOLVER.MAX_ITER,
        val_check_interval=cfg.TEST.EVAL_PERIOD
        if cfg.TEST.EVAL_PERIOD > 0 else cfg.SOLVER.MAX_ITER,
        callbacks=[checkpoint_callback],
        logger=False,
    )
def main():
    args = get_args()
    pl.seed_everything(args.seed, workers=True)

    # initialize data module
    data = DataModule(args)
    data.prepare_data()
    data.setup("fit")

    prior = None
    if args.prior_model:
        assert hasattr(priors, args.prior_model), (
            f"Unknown prior model {args['prior_model']}. "
            f"Available models are {', '.join(priors.__all__)}"
        )
        # initialize the prior model
        prior = getattr(priors, args.prior_model)(dataset=data.dataset)
        args.prior_args = prior.get_init_args()

    # initialize lightning module
    model = LNNP(args, prior_model=prior, mean=data.mean, std=data.std)

    checkpoint_callback = ModelCheckpoint(
        dirpath=args.log_dir,
        monitor="val_loss",
        save_top_k=10,  # -1 to save all
        every_n_epochs=args.save_interval,
        filename="{epoch}-{val_loss:.4f}-{test_loss:.4f}",
    )
    early_stopping = EarlyStopping("val_loss", patience=args.early_stopping_patience)

    tb_logger = pl.loggers.TensorBoardLogger(
        args.log_dir, name="tensorbord", version="", default_hp_metric=False
    )
    csv_logger = CSVLogger(args.log_dir, name="", version="")

    trainer = pl.Trainer(
        strategy=DDPStrategy(find_unused_parameters=False),
        max_epochs=args.num_epochs,
        gpus=args.ngpus,
        num_nodes=args.num_nodes,
        default_root_dir=args.log_dir,
        auto_lr_find=False,
        resume_from_checkpoint=None if args.reset_trainer else args.load_model,
        callbacks=[early_stopping, checkpoint_callback],
        logger=[tb_logger, csv_logger],
        precision=args.precision,
    )

    trainer.fit(model, data)

    # run test set after completing the fit
    model = LNNP.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
    trainer = pl.Trainer(logger=[tb_logger, csv_logger])
    trainer.test(model, data)
def get_callbacks(config, dm, only_train_and_test=False):
    # callbacks
    early_stopping = EarlyStopping(monitor='_val_loss',
                                   mode="min",
                                   patience=10,
                                   verbose=True,
                                   check_finite=True)

    checkpoint_callback = ModelCheckpoint(
        monitor='_val_loss',
        dirpath=config.PATH_CHECKPOINT,
        filename='-{epoch:02d}-{val_loss:.6f}',
        mode="min",
        save_last=True,
        save_top_k=3,
    )
    learning_rate_monitor = LearningRateMonitor(logging_interval="epoch")

    # save_result = True if config.num_fold == 0 or only_train_and_test else False
    save_result = True
    prediction_plot_test = PredictionPlotsAfterTrain(config.dataset_name,
                                                     config.model_name,
                                                     split="test",
                                                     lr_used=config.lr,
                                                     save_result=save_result)
    prediction_plot_val = PredictionPlotsAfterTrain(config.dataset_name,
                                                    config.model_name,
                                                    split="val",
                                                    lr_used=config.lr,
                                                    save_result=save_result)
    prediction_plot_train = PredictionPlotsAfterTrain(config.dataset_name,
                                                      config.model_name,
                                                      split="train",
                                                      lr_used=config.lr,
                                                      save_result=save_result)
    callbacks = [
        # early_stopping,
        prediction_plot_val,
        prediction_plot_test,
        prediction_plot_train,
        learning_rate_monitor,
    ]

    if config.num_fold >= 1:
        split_dataset = SplitDatasetWithKFoldStrategy(folds=config.num_fold,
                                                      repetitions=config.repetitions,
                                                      dm=dm,
                                                      only_train_and_test=only_train_and_test)
        callbacks.append(split_dataset)

    return callbacks
def setup_callbacks_loggers(args):
    log_path = Path('/home/yyousfi1/LogFiles/comma/')
    name = args.backbone
    version = args.version
    tb_logger = TensorBoardLogger(log_path, name=name, version=version)
    lr_logger = LearningRateLogger(logging_interval='epoch')
    ckpt_callback = ModelCheckpoint(filepath=Path(tb_logger.log_dir) / 'checkpoints/{epoch:02d}_{val_loss:.4f}',
                                    save_top_k=10,
                                    save_last=True)

    return ckpt_callback, tb_logger, lr_logger
def setup_callbacks_loggers(args):
    log_path = Path('/home/jared/Research/comma10k-baseline/LogFiles/')
    name = args.backbone
    version = args.version
    tb_logger = TensorBoardLogger(log_path, name=name, version=version)
    lr_logger = LearningRateMonitor(logging_interval='epoch')
    ckpt_callback = ModelCheckpoint(dirpath=Path(tb_logger.log_dir) / 'checkpoints/',
                                    filename='{epoch:02d}_{val_loss:.4f}',
                                    monitor='val_loss',
                                    save_top_k=10,
                                    mode='min',
                                    save_last=True)

    return ckpt_callback, tb_logger, lr_logger
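# Hedged usage sketch (assumption, not in the original): the three objects returned
# by setup_callbacks_loggers are typically wired into the Trainer roughly like this;
# `args` and `model` are assumed to come from the surrounding training script.
ckpt_callback, tb_logger, lr_logger = setup_callbacks_loggers(args)
trainer = Trainer.from_argparse_args(args,
                                     logger=tb_logger,
                                     callbacks=[ckpt_callback, lr_logger])
trainer.fit(model)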
def train_vposer_once(_config):
    resume_training_if_possible = True

    model = VPoserTrainer(_config)
    model.vp_ps.logging.expr_msg = create_expr_message(model.vp_ps)
    # model.text_logger(model.vp_ps.logging.expr_msg.replace(". ", '.\n'))
    dump_config(model.vp_ps, osp.join(model.work_dir, '{}.yaml'.format(model.expr_id)))

    logger = TensorBoardLogger(model.work_dir, name='tensorboard')
    lr_monitor = LearningRateMonitor()

    snapshots_dir = osp.join(model.work_dir, 'snapshots')
    checkpoint_callback = ModelCheckpoint(
        dirpath=makepath(snapshots_dir, isfile=True),
        filename="%s_{epoch:02d}_{val_loss:.2f}" % model.expr_id,
        save_top_k=1,
        verbose=True,
        monitor='val_loss',
        mode='min',
    )
    early_stop_callback = EarlyStopping(**model.vp_ps.train_parms.early_stopping)

    resume_from_checkpoint = None
    if resume_training_if_possible:
        available_ckpts = sorted(glob.glob(osp.join(snapshots_dir, '*.ckpt')),
                                 key=os.path.getmtime)
        if len(available_ckpts) > 0:
            resume_from_checkpoint = available_ckpts[-1]
            model.text_logger(
                'Resuming the training from {}'.format(resume_from_checkpoint))

    trainer = pl.Trainer(
        gpus=1,
        weights_summary='top',
        distributed_backend='ddp',
        # replace_sampler_ddp=False,
        # accumulate_grad_batches=4,
        # profiler=False,
        # overfit_batches=0.05,
        # fast_dev_run = True,
        # limit_train_batches=0.02,
        # limit_val_batches=0.02,
        # num_sanity_val_steps=2,
        plugins=[DDPPlugin(find_unused_parameters=False)],
        callbacks=[lr_monitor, early_stop_callback, checkpoint_callback],
        max_epochs=model.vp_ps.train_parms.num_epochs,
        logger=logger,
        resume_from_checkpoint=resume_from_checkpoint)

    trainer.fit(model)
def setup_mlflowlogger_and_checkpointer(hparams):
    mlflow_logger = MLFlowLogger(
        experiment_name="exp_name",
        tracking_uri=hparams.save_path
    )
    run_id = mlflow_logger.run_id
    checkpoints_folder = os.path.join(
        hparams.save_path, mlflow_logger._expt_id, run_id, "checkpoints"
    )
    os.makedirs(checkpoints_folder, exist_ok=True)
    checkpoint = ModelCheckpoint(
        filepath=checkpoints_folder, monitor="val_loss", save_top_k=1
    )
    return checkpoint, mlflow_logger, run_id
def setup_callbacks_loggers(args):
    log_path = Path('/home/yyousfi1/LogFiles/OneHotConv/')
    log_path = log_path / args.qf / args.stego_scheme / args.payload
    name = args.backbone
    version = args.version
    tb_logger = TensorBoardLogger(log_path, name=name, version=version)
    lr_logger = LearningRateLogger(logging_interval='epoch')
    ckpt_callback = ModelCheckpoint(filepath=Path(tb_logger.log_dir) / 'checkpoints/{epoch:02d}_{val_FC_acc:.3f}',
                                    save_top_k=5,
                                    save_last=True)

    return ckpt_callback, tb_logger, lr_logger
def main(cli_args):
    seed_everything(24)

    args = DotMap()
    args.bert_model_type = 'bert-base-uncased'
    args.lowercase = 'uncased' in args.bert_model_type
    args.train_file = cli_args.train_file
    args.eval_file = cli_args.eval_file
    # args.test_file = '../data/eval_v2.1_public.json'
    args.learning_rate = cli_args.learning_rate
    args.batch_size = cli_args.batch_size
    args.model_save_path = cli_args.model_save_path
    args.training_epochs = 10
    args.mlm_probability = 0.15
    # ----------------------------------------------------------------------

    mlm_model = MarcoMLM(args)

    checkpoint_callback = ModelCheckpoint(
        filepath=cli_args.model_save_path,  # '/home/wallat/msmarco-models/models/mlm/'
        save_top_k=1,
        verbose=True,
        monitor='val_loss',
        mode='min',
        prefix='')

    if cli_args.use_wandb_logging:
        print('If you are having issues with wandb, make sure to give the correct '
              'python executable to --python_executable')
        sys.executable = cli_args.python_executable
        logger = WandbLogger(project=cli_args.wandb_project_name,
                             name=cli_args.wandb_run_name)
    else:
        logger = TensorBoardLogger("{}/tb_logs".format(args.output_dir))

    trainer = Trainer.from_argparse_args(
        cli_args,
        checkpoint_callback=checkpoint_callback,
        early_stop_callback=True,
        logger=logger)

    trainer.fit(mlm_model)
    trainer.save_checkpoint(
        os.path.join(cli_args.model_save_path, 'trained_checkpoint'))
    mlm_model.model.save_pretrained(cli_args.model_save_path)