def train_center_net(train_df, oof_df):
    # Training data with augmentations; OOF (out-of-fold) data for validation.
    train_dataset = centernet.WheatDataset(train_df, transforms=get_train_transforms())
    train_dataloader = DataLoader(train_dataset, batch_size=Config.Train.batch_size,
                                  shuffle=True, num_workers=4, drop_last=True, pin_memory=True)
    oof_dataset = centernet.WheatDataset(oof_df, test=True, transforms=get_valid_transforms())
    oof_dataloader = DataLoader(oof_dataset, batch_size=Config.Train.batch_size,
                                shuffle=False, num_workers=4, pin_memory=True)

    model = Resnest50CenterNet(conf=Config)

    # Stop when validation mAP stops improving; keep only the best checkpoint.
    early_stop = callbacks.EarlyStopping(monitor='val_map', patience=10, mode='max', verbose=True)
    checkpoint = callbacks.ModelCheckpoint(str(Config.Train.checkpoint_dir), monitor='val_map',
                                           verbose=True, mode='max', save_top_k=1)
    cbs = [callbacks.LearningRateLogger()]

    trainer = Trainer(gpus=1, early_stop_callback=early_stop, checkpoint_callback=checkpoint,
                      callbacks=cbs, benchmark=True, deterministic=True,
                      max_epochs=Config.Train.epochs)
    trainer.fit(model, train_dataloader=train_dataloader, val_dataloaders=oof_dataloader)

    # Final evaluation on the held-out validation split.
    valid_dataset = centernet.WheatDataset(get_data(mode='valid'), test=True,
                                           transforms=get_test_transforms())
    valid_dataloader = DataLoader(valid_dataset, batch_size=Config.Train.batch_size,
                                  shuffle=False, num_workers=4, pin_memory=True)
    trainer.test(model, test_dataloaders=valid_dataloader)
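# The get_*_transforms() helpers used above are not defined in this snippet.
# A minimal sketch, assuming an albumentations pipeline with pascal_voc boxes
# (the library choice and the specific augmentations are assumptions, not the
# author's code):
import albumentations as A
from albumentations.pytorch import ToTensorV2

def get_train_transforms():
    # Light augmentation for training; box coordinates are transformed along with the image.
    return A.Compose(
        [A.HorizontalFlip(p=0.5), A.RandomBrightnessContrast(p=0.2), ToTensorV2()],
        bbox_params=A.BboxParams(format='pascal_voc', label_fields=['labels']))

def get_valid_transforms():
    # Validation/test: tensor conversion only.
    return A.Compose(
        [ToTensorV2()],
        bbox_params=A.BboxParams(format='pascal_voc', label_fields=['labels']))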
def run(config="config/base.yml"): config = util.load_config(config) now = datetime.datetime.now().strftime('%Y%m%d-%H%M%S') run_dir = path.join("wandb", now) run_dir = path.abspath(run_dir) os.environ['WANDB_PROJECT'] = "linear_turing" os.environ['TOKENIZERS_PARALLELISM'] = 'true' checkpoint_callback = callbacks.ModelCheckpoint(monitor='val_loss', mode='min', save_weights_only=True, save_last=True, filename='{epoch}_{val_loss:.2f}') other_callbacks = [ pl.callbacks.LearningRateMonitor(), callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=10) ] experiment = Experiment(config) trainer = pl.Trainer(logger=pl.loggers.WandbLogger(log_model=True), checkpoint_callback=checkpoint_callback, callbacks=other_callbacks, **config['trainer']) trainer.fit(experiment)
def train(args):
    model = RNN()
    data_module = DataModule(args)

    # Early stopping and checkpointing only apply when a validation set is given.
    callbacks_list = None
    if args.val_path:
        callbacks_list = [
            callbacks.EarlyStopping(monitor='val_acc', mode='max', patience=PATIENCE),
            callbacks.ModelCheckpoint(filepath=args.out_path, monitor='val_acc',
                                      mode='max', prefix='rnn'),
        ]

    gpus = N_GPU if torch.cuda.is_available() else None
    trainer = pl.Trainer(gpus=gpus, max_epochs=MAX_EPOCHS, callbacks=callbacks_list)
    trainer.fit(model, datamodule=data_module)
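# PATIENCE, N_GPU, and MAX_EPOCHS are assumed to be module-level constants;
# the values here are illustrative placeholders, not taken from the source:
PATIENCE = 3
N_GPU = 1
MAX_EPOCHS = 20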
def get_callbacks(cfg, output_dir):
    checkpoint_path = os.path.join(output_dir, cfg.CHECKPOINT.NAME)
    checkpoint = pl_callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                              save_last=False,
                                              monitor=cfg.CHECKPOINT.MONITOR,
                                              mode=cfg.CHECKPOINT.MONITOR_MODE)
    other_callbacks = [
        pl_callbacks.EarlyStopping(monitor=cfg.CHECKPOINT.MONITOR,
                                   mode=cfg.CHECKPOINT.MONITOR_MODE,
                                   **cfg.EARLY_STOPPING),
        pl_callbacks.LearningRateLogger(),
        inspector.AnalysisCallback(),
    ]
    return checkpoint, other_callbacks
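# Usage sketch: the checkpoint callback is returned separately because the
# older PyTorch Lightning API used here passes it via its own
# `checkpoint_callback` argument (the cfg/output_dir values are placeholders):
#
#   checkpoint, other_callbacks = get_callbacks(cfg, output_dir="./outputs")
#   trainer = pl.Trainer(checkpoint_callback=checkpoint, callbacks=other_callbacks)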
def load_callbacks():
    callbacks = []
    # Stop once validation accuracy plateaus (no improvement >= 0.001 for 10 epochs).
    callbacks.append(plc.EarlyStopping(monitor='val_acc', mode='max',
                                       patience=10, min_delta=0.001))
    # Keep the single best checkpoint by validation accuracy, plus the latest one.
    callbacks.append(plc.ModelCheckpoint(monitor='val_acc',
                                         filename='best-{epoch:02d}-{val_acc:.3f}',
                                         save_top_k=1, mode='max', save_last=True))
    # `args` is expected to be defined at module scope.
    if args.lr_scheduler:
        callbacks.append(plc.LearningRateMonitor(logging_interval='epoch'))
    return callbacks
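# Usage sketch (the model/datamodule names below are placeholders):
#
#   trainer = pl.Trainer(callbacks=load_callbacks(), max_epochs=100)
#   trainer.fit(model, datamodule=data_module)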
def get_loggers_callbacks(args, model=None):
    try:
        # Setup logger(s) params
        csv_logger_params = dict(
            save_dir="./experiments",
            name=os.path.join(*args.save_dir.split("/")[1:-1]),
            version=args.save_dir.split("/")[-1],
        )
        wandb_logger_params = dict(
            log_model=False,
            name=os.path.join(*args.save_dir.split("/")[1:]),
            offline=args.debug,
            project="utime",
            save_dir=args.save_dir,
        )
        loggers = [
            pl_loggers.CSVLogger(**csv_logger_params),
            pl_loggers.WandbLogger(**wandb_logger_params),
        ]
        if model:
            loggers[-1].watch(model)

        # Setup callback(s) params
        checkpoint_monitor_params = dict(
            filepath=os.path.join(args.save_dir, "{epoch:03d}-{eval_loss:.2f}"),
            monitor=args.checkpoint_monitor,
            save_last=True,
            save_top_k=1,
        )
        earlystopping_parameters = dict(
            monitor=args.earlystopping_monitor,
            patience=args.earlystopping_patience,
        )
        callbacks = [
            pl_callbacks.ModelCheckpoint(**checkpoint_monitor_params),
            pl_callbacks.EarlyStopping(**earlystopping_parameters),
            pl_callbacks.LearningRateMonitor(),
        ]
        return loggers, callbacks
    except AttributeError:
        return None, None
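# Usage sketch: both return values plug straight into the Trainer, but since
# the function can return (None, None) on a partially configured `args`,
# callers may want to guard (names below are placeholders):
#
#   loggers, cbs = get_loggers_callbacks(args, model=model)
#   if loggers is not None:
#       trainer = pl.Trainer(logger=loggers, callbacks=cbs)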
def main():
    # Route loguru output to stdout with a colorized, single-line format.
    logger.remove()
    logger.add(sys.stdout, colorize=True,
               format="<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> "
                      "| <level>{level}</level> "
                      "| <light-black>{file.path}:{line}</light-black> | {message}")
    hparams = parse_args()

    if hparams.restore:
        wandb.init(project=hparams.project, tags=hparams.tags)
        model = LevelClassification.load_from_checkpoint(hparams.restore)
        logger.info("Restored model")
    else:
        # wandb.init is called in LevelClassification
        model = LevelClassification(hparams)

    experiment_logger = loggers.WandbLogger(project=hparams.project, tags=hparams.tags)
    hparams.checkpoint_dir = os.path.join(experiment_logger.experiment.dir, "checkpoints")
    checkpoint_cb = callbacks.ModelCheckpoint(hparams.checkpoint_dir, save_top_k=1)

    trainer = pl.Trainer(logger=experiment_logger,
                         gpus=1 if hparams.device == "cuda" else 0,
                         checkpoint_callback=checkpoint_cb,
                         callbacks=[EmbeddingsCallback()],
                         early_stop_callback=callbacks.EarlyStopping(),
                         fast_dev_run=hparams.debug)
    trainer.fit(model)
    model.freeze()

    # Build one snippet dataset per baseline level directory, then visualize embeddings.
    baseline_datasets = []
    logger.info("Baselines {}", os.listdir(hparams.baseline_level_dir))
    for baseline_level_dir in sorted(os.listdir(hparams.baseline_level_dir)):
        baseline_dataset = LevelSnippetDataset(
            level_dir=os.path.join(os.getcwd(), hparams.baseline_level_dir, baseline_level_dir),
            slice_width=model.dataset.slice_width,
            token_list=model.dataset.token_list)
        baseline_datasets.append(baseline_dataset)
    visualize_embeddings(model.dataset, model, "test", hparams, None, baseline_datasets)
def run(config):
    # Accept either a loaded config dict or a path to a YAML file.
    if isinstance(config, str):
        with open(config) as f:
            config = yaml.safe_load(f)

    # Timestamped run directory under ./wandb.
    now = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    run_dir = path.abspath(path.join("wandb", now))
    os.environ['WANDB_RUN_DIR'] = run_dir

    checkpoint_callback = callbacks.ModelCheckpoint(
        run_dir, monitor=config['early_stopping']['monitor'])
    early_stopping_callback = callbacks.EarlyStopping(**config['early_stopping'])

    experiment = Experiment(config)
    trainer = pl.Trainer(logger=False,
                         checkpoint_callback=checkpoint_callback,
                         early_stop_callback=early_stopping_callback,
                         **config['trainer'])
    trainer.fit(experiment)
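# Usage sketch with an inline dict instead of a YAML path; the keys mirror the
# accesses in run() and the values are placeholders (an assumption, not source):
#
#   run({'early_stopping': {'monitor': 'val_loss', 'mode': 'min', 'patience': 5},
#        'trainer': {'max_epochs': 10}})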
def train_faster_rcnn(train_df, oof_df):
    # Detection batches hold a variable number of boxes per image, hence the
    # custom collate_fn.
    train_dataset = rcnn.WheatDataset(train_df, transforms=get_train_transforms())
    train_dataloader = DataLoader(train_dataset, batch_size=Config.Train.batch_size,
                                  shuffle=True, num_workers=4, drop_last=True,
                                  collate_fn=collate_fn, pin_memory=True)
    oof_dataset = rcnn.WheatDataset(oof_df, test=True, transforms=get_valid_transforms())
    oof_dataloader = DataLoader(oof_dataset, batch_size=Config.Train.batch_size,
                                shuffle=False, num_workers=4,
                                collate_fn=collate_fn, pin_memory=True)

    # model = FasterRCNNResnet50FPN.load_from_checkpoint('checkpoints\\faster_rcnn\\epoch=9.ckpt', **Config)
    model = FasterRCNNResnet50FPN(conf=Config)

    # Stop when validation loss stops improving; keep only the best checkpoint.
    early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=20, mode='min', verbose=True)
    checkpoint = callbacks.ModelCheckpoint(str(Config.Train.checkpoint_dir), monitor='val_loss',
                                           verbose=True, mode='min', save_top_k=1)
    cbs = [callbacks.LearningRateLogger()]

    trainer = Trainer(gpus=1, early_stop_callback=early_stop, checkpoint_callback=checkpoint,
                      callbacks=cbs, benchmark=True, deterministic=True,
                      max_epochs=Config.Train.epochs)
    trainer.fit(model, train_dataloader=train_dataloader, val_dataloaders=oof_dataloader)

    # Final evaluation on the held-out validation split.
    valid_dataset = rcnn.WheatDataset(get_data(mode='valid'), test=True,
                                      transforms=get_test_transforms())
    valid_dataloader = DataLoader(valid_dataset, batch_size=Config.Train.batch_size,
                                  shuffle=False, num_workers=4,
                                  collate_fn=collate_fn, pin_memory=True)
    trainer.test(model, test_dataloaders=valid_dataloader)
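# collate_fn is referenced above but not defined here. For detection models,
# where each image has a different number of boxes, the usual collate
# transposes the batch instead of stacking tensors (a common pattern, assumed
# rather than taken from the source):
def collate_fn(batch):
    # [(img1, tgt1), (img2, tgt2), ...] -> ((img1, img2, ...), (tgt1, tgt2, ...))
    return tuple(zip(*batch))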
valid_data = DataLoader(valid_dataset, num_workers=8, pin_memory=True,
                        batch_sampler=valid_batch_sampler, collate_fn=_collate_fn)

test_dataset = LanguageModelingDataset(datasets['test'])
test_batch_sampler = BPTTBatchSampler(test_dataset, hparams.bptt, hparams.batch_size)
test_data = DataLoader(test_dataset, num_workers=8, pin_memory=True,
                       batch_sampler=test_batch_sampler, collate_fn=_collate_fn)

early_stop_callback = callbacks.EarlyStopping(monitor='val_ppl', mode='min')
model_checkpoint_callback = callbacks.ModelCheckpoint(
    monitor='val_ppl', save_last=True, save_top_k=5,
    save_weights_only=False, mode='min')

trainer = Trainer.from_argparse_args(
    hparams,
    default_root_dir=os.path.abspath(os.path.expanduser("~/data/awd-lstm")),
    callbacks=[early_stop_callback, model_checkpoint_callback, NNICallback()])
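# _collate_fn is not shown. Under a BPTT batch sampler each dataset item is
# typically a (source, target) pair of equal-length token-id tensors, so a
# plausible collate stacks them along the batch dimension (an assumption, not
# the source's definition):
import torch

def _collate_fn(batch):
    sources, targets = zip(*batch)
    return torch.stack(sources), torch.stack(targets)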
def train(config: Config):
    """Entry point for the training procedure."""
    pl.seed_everything(config.random_seed)

    # Settings for resuming training from a previous run, etc.
    cache_dir = pathlib.Path(config.cache_dir)
    cache_dir.mkdir(exist_ok=True)
    trainer_params = dict()
    lastckpt = cache_dir.joinpath("last.ckpt")
    if config.resume:
        trainer_params["resume_from_checkpoint"] = str(lastckpt)
    elif lastckpt.exists():
        # Starting fresh: clear checkpoints left over from a previous run.
        lastckpt.unlink()
        for filepath in cache_dir.glob("epoch*.ckpt"):
            filepath.unlink()

    # Logging setup
    pl_logger = pl_loggers.MLFlowLogger(
        experiment_name=config.experiment_name,
        tracking_uri=os.environ.get("MLFLOW_TRACKING_URI", None),
        tags={
            "mlflow.source.name": pathlib.Path(__file__).name,
            "mlflow.source.git.commit": ut.get_commit_id(),
        },
    )

    # Build the network and dataset, then train.
    network = tv_models.vgg16(pretrained=False)
    params = dc.asdict(config)
    model = Trainer(network, **params)

    callbacks: t.List[t.Any] = list()
    model_checkpoint = pl_callbacks.ModelCheckpoint(
        filepath=str(cache_dir),
        monitor="val_loss",
        save_last=True,
        save_top_k=config.save_top_k,
        save_weights_only=config.save_weights_only,
        mode="min",
        period=1,
    )
    callbacks.append(model_checkpoint)
    if config.early_stop:
        callbacks.append(
            pl_callbacks.EarlyStopping(
                monitor="val_loss",
                min_delta=0.0,
                patience=3,
                verbose=False,
                mode="auto",
            ))

    pl_trainer = pl.Trainer(
        default_root_dir=str(cache_dir),
        fast_dev_run=False,
        min_epochs=config.min_epochs,
        max_epochs=config.max_epochs,
        gpus=[0] if config.use_gpu and cuda.is_available() else None,
        progress_bar_refresh_rate=config.progress_bar_refresh_rate,
        profiler=config.profiler,
        callbacks=callbacks,
        logger=pl_logger,
        log_gpu_memory=True,
        **trainer_params,
    )
    datamodule = dataset_food101.Food101WithLableModule(
        batch_size=config.batch_size,
        num_workers=config.num_workers,
    )
    pl_trainer.fit(model, datamodule)

    # Attach extra information to the logs: upload each saved checkpoint to
    # the MLflow run as a model artifact.
    mlf_client = mlflow.tracking.MlflowClient()
    for ckptfile in cache_dir.glob("epoch*.ckpt"):
        model = model.load_from_checkpoint(str(ckptfile), network, **params)
        with tempfile.TemporaryDirectory() as dname:
            mlf_model_path = pathlib.Path(dname).joinpath(ckptfile.stem)
            mlf_pytorch.save_model(model.network, mlf_model_path)
            mlf_client.log_artifact(pl_logger.run_id, mlf_model_path)
# Map the tracked metric to its optimization direction and sensitivity.
if earlystopping_tracking == 'val_loss':
    earlystopping_mode = 'min'
    earlystopping_min_delta = 0.0001
elif earlystopping_tracking in ['val_epoch_F1', 'val_epoch_auPRC']:
    earlystopping_mode = 'max'
    earlystopping_min_delta = 0.001
else:
    raise ValueError(f"Unsupported early-stopping metric: {earlystopping_tracking}")

checkpoint_callback = pl_callbacks.ModelCheckpoint(
    dirpath=save_model_folder,
    mode=earlystopping_mode,
    monitor=earlystopping_tracking,
    save_top_k=1,
    save_last=True,
)
earlystop_callback = pl_callbacks.EarlyStopping(
    earlystopping_tracking,
    verbose=True,
    mode=earlystopping_mode,
    min_delta=earlystopping_min_delta,
    patience=10,
)

trainer = Trainer(
    gpus=[gpus],
    accelerator=None,
    max_epochs=200,
    min_epochs=5,
    default_root_dir=save_folder,
    fast_dev_run=False,
    check_val_every_n_epoch=1,
    callbacks=[checkpoint_callback, earlystop_callback],
)
trainer.fit(model, datamodule=datamodule)
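# An alternative sketch (not the source's code): the metric -> (mode, min_delta)
# dispatch above can live in one table, making new metrics a one-line change:
EARLYSTOPPING_SETTINGS = {
    'val_loss': ('min', 0.0001),
    'val_epoch_F1': ('max', 0.001),
    'val_epoch_auPRC': ('max', 0.001),
}
earlystopping_mode, earlystopping_min_delta = EARLYSTOPPING_SETTINGS[earlystopping_tracking]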
from model.trainer import Train_GraphDialogRe
from utils.data_reader import Vocab

if __name__ == "__main__":
    seed_everything(config.seed)
    dgl.random.seed(config.seed)

    model = Train_GraphDialogRe(config)
    logger = loggers.TensorBoardLogger(save_dir=config.save_dir)

    # Shared monitor settings for both checkpointing and early stopping.
    checkpoint_args = dict(
        monitor='eval_f1',
        mode='max',
    )
    early_stopping = callbacks.EarlyStopping(patience=5, strict=True, verbose=True,
                                             **checkpoint_args)
    ckpt_callback = callbacks.ModelCheckpoint(
        filepath=os.path.join(
            logger.log_dir,
            '{epoch}-{val_loss:.4f}-{eval_f1:.4f}-{eval_T2:.3f}'
        ),  # same path as the log dir
        save_top_k=1,
        verbose=True,
        prefix='',
        **checkpoint_args,
    )

    trainer_args = dict(
        gpus=config.gpus,
        num_nodes=config.num_nodes,