def main(hparams):
    pl.seed_everything(hparams.seed)
    if hparams.train:
        model = TransformerGenomicModel(WORD_NUM, EMBEDDING_DIM, BATCH_SIZE)
        trainer = pl.Trainer(
            resume_from_checkpoint='checkpoint/epoch=43.ckpt',
            logger=pl_loggers.TensorBoardLogger(save_dir='logs', name='TensorBoard', version=3),
            checkpoint_callback=pl.callbacks.ModelCheckpoint(
                filepath='checkpoint', verbose=True, save_top_k=hparams.save_top_k),
            early_stop_callback=pl.callbacks.EarlyStopping(
                monitor='val_loss', patience=3, verbose=True, mode='min'),
            default_root_dir=os.getcwd(),
            gpus=hparams.gpus,
            accumulate_grad_batches=2,
            distributed_backend='ddp',
            precision=16,
            log_gpu_memory='all')
        trainer.fit(model)
    else:
        model = TransformerGenomicModel(WORD_NUM, EMBEDDING_DIM, BATCH_SIZE)
        model.load_state_dict(
            torch.load('checkpoint/epoch=43.ckpt')['state_dict'])

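# Hedged usage sketch for main() above: the flag names are assumptions inferred
# from the hparams attributes it reads (seed, train, gpus, save_top_k).
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--train', action='store_true')
    parser.add_argument('--gpus', type=int, default=1)
    parser.add_argument('--save_top_k', type=int, default=1)
    main(parser.parse_args())
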
def train_unet(args):
    seed_everything(args.seed)
    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join("runs", args.experiment, 'checkpoints'),
        save_top_k=1,
        monitor='val_loss',
        verbose=False,
        period=args.save_epoch)
    tb_logger = pl_loggers.TensorBoardLogger(
        save_dir=os.path.join("runs", 'logs/'), name="UNet", version=args.version)
    model = DepthRegressorTrainer(args)
    trainer = Trainer(gpus=[args.gpu],
                      num_sanity_val_steps=args.sanity_steps,
                      checkpoint_callback=checkpoint_callback,
                      max_epochs=args.max_epoch,
                      limit_val_batches=args.val_check_percent,
                      val_check_interval=min(args.val_check_interval, 1.0),
                      check_val_every_n_epoch=max(1, args.val_check_interval),
                      resume_from_checkpoint=args.resume,
                      logger=tb_logger,
                      benchmark=True,
                      precision=args.precision)
    trainer.fit(model)

def cnn_main():
    # torch.autograd.set_detect_anomaly(True)
    pl.seed_everything(1234)
    tb_logger = pl_loggers.TensorBoardLogger('cnn_logs/',
                                             default_hp_metric=False,
                                             log_graph=True)
    trainer = pl.Trainer(logger=tb_logger, gpus=1, auto_select_gpus=True, max_epochs=10)
    folder = Path("data/aus_data/3d-expanded")
    # dataset = customdata.AusDataCube(folder, normalise=True)
    # dataset = customdata.AusDataImg(folder, normalise=True)
    batch_size = 512
    lr = 1e-4
    data = customdata.AusDataModule(folder, cube=True, normalise=True, batch_size=batch_size)
    # model = models.SCNN(70, 2)
    # model = models.SCNN3D(1, 2)
    # model = models.SSNet(1, 2)
    model = models.HybridSN(1, 2, 70, 64, lr, batch_size)
    trainer.fit(model, data)

def build_logger(model_type, task=None):
    log_dir = _get_log_dir()
    task = task or 'general'
    experiment_name = f'{model_type}_{task}'
    logger = loggers.TensorBoardLogger(log_dir, experiment_name)
    return logger

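# Minimal usage sketch for build_logger(); 'resnet50' and 'segmentation' are
# placeholder values, not names taken from the surrounding project.
logger = build_logger('resnet50', task='segmentation')
trainer = pl.Trainer(logger=logger)
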
def train(experiment_path):
    assert os.path.exists(
        os.path.join(experiment_path, "experiment.yaml")
    ), "No experiment configuration was found, please create an experiment.yaml"
    with open(os.path.join(experiment_path, "experiment.yaml"), mode="r") as config_file:
        configuration_dict = yaml.load(config_file, Loader=yaml.Loader)
    model = TextClassificationModel(configuration_dict)
    tb_logger = pl_loggers.TensorBoardLogger(configuration_dict['Training']['log_dir'])
    assert not os.path.exists(
        configuration_dict['Training']['snapshot_dir']), "Experiment already exists."
    checkpoint = ModelCheckpoint(
        filepath=os.path.join(configuration_dict['Training']['snapshot_dir'],
                              'best_model_{epoch:02d}-{val_loss:.2f}'),
        verbose=True,
        monitor=configuration_dict['Training']['snapshot_selection_scheme'],
        mode='min')
    trainer = Trainer(logger=tb_logger,
                      checkpoint_callback=checkpoint,
                      max_epochs=configuration_dict['Training']['epochs'],
                      gpus=configuration_dict['Training']['gpus'])
    trainer.fit(model)
    trainer.test(ckpt_path=trainer.checkpoint_callback.best_model_path)

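# Hedged sketch of the experiment.yaml layout implied by the lookups in train()
# above; only the keys actually read there are shown, and the values are examples.
EXAMPLE_EXPERIMENT_YAML = """
Training:
  log_dir: logs/
  snapshot_dir: snapshots/my_experiment/
  snapshot_selection_scheme: val_loss
  epochs: 20
  gpus: 1
"""
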
def main(args):
    hp = OmegaConf.load(args.config)
    model = AEINet(hp)
    save_path = os.path.join(hp.log.chkpt_dir, args.name)
    os.makedirs(save_path, exist_ok=True)
    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(hp.log.chkpt_dir, args.name),
        monitor='val_loss',
        verbose=True,
        save_top_k=args.save_top_k,  # save all
    )
    trainer = Trainer(
        logger=pl_loggers.TensorBoardLogger(hp.log.log_dir),
        early_stop_callback=None,
        checkpoint_callback=checkpoint_callback,
        weights_save_path=save_path,
        gpus=-1 if args.gpus is None else args.gpus,
        distributed_backend='ddp',
        num_sanity_val_steps=1,
        resume_from_checkpoint=args.checkpoint_path,
        gradient_clip_val=hp.model.grad_clip,
        fast_dev_run=args.fast_dev_run,
        val_check_interval=args.val_interval,
        progress_bar_refresh_rate=1,
        max_epochs=10000,
    )
    trainer.fit(model)

def main(exp_root, exp_name, version, _config, load_epoch=None):
    tb_logger = pl_loggers.TensorBoardLogger(exp_root, exp_name, version)
    label_names, train_loader, val_loader, test_loader = load_train_val_test(
        _config['dataset'])
    backbone = load_backbone(**_config.get('backbone', {}))
    model = BasicClassifierModel(backbone, label_names,
                                 _config['optimizer'], _config['scheduler'])
    checkpointer = ExistedModelCheckpoint(
        monitor='val_loss',
        mode='min',
        save_top_k=5,
        dirpath=os.path.join(exp_root, exp_name, version, 'checkpoints'),
        filename=_config['backbone']['name'] + '-{epoch}-{val_loss:.3f}-{train_loss:.3f}')
    callbacks = [checkpointer, EarlyStopping(monitor='val_loss', patience=10)]
    trainer = pl.Trainer(
        logger=tb_logger,
        resume_from_checkpoint=checkpointer.get_checkpoint_path(load_epoch),
        callbacks=callbacks,
        **_config.get('trainer', {}))
    trainer.fit(model, train_loader, val_loader)
    print('load best epoch ', checkpointer.best_model_path)
    # load_from_checkpoint is a classmethod that returns a new model instance,
    # so capture the result and hand it to test() explicitly
    model = model.load_from_checkpoint(checkpointer.best_model_path)
    result = trainer.test(model, test_dataloaders=test_loader)
    print(result)

def train():
    args = parse_args()
    model_module = importlib.import_module('models.' + args.model + '.' + args.model)
    config_module = importlib.import_module('configs.' + args.model + '.' + args.config)
    data_module = importlib.import_module('datasets.' + args.dataset)
    model_cls = getattr(model_module, args.model)
    hparams = config_module.get_hparams(args.option)
    dataset_cls = getattr(data_module, args.dataset + 'DataModule')
    data_hparams = config_module.get_data_hparams(args.option)
    model = model_cls(hparams)
    dataset = dataset_cls(data_hparams)
    tb_logger = loggers.TensorBoardLogger(
        save_dir=os.path.join('logs', args.model + '_logs'),
        name=args.config + '_' + args.option)
    checkpoint_callback = ModelCheckpoint(save_top_k=-1)
    weights_save_path = os.path.join('logs', args.model + '_logs')
    trainer = Trainer(gpus=-1,
                      logger=tb_logger,
                      checkpoint_callback=checkpoint_callback,
                      log_every_n_steps=250,
                      weights_save_path=weights_save_path,
                      distributed_backend='ddp',
                      replace_sampler_ddp=False)
    trainer.fit(model, dataset)

def train(
    dataset_name="cifar10",
    version="efficientnet-b0",
    batch_size=10,
    epochs=100,
    checkpoint=None,
    output_path=None,
    **model_params,
):
    cifar_dm = CifarDataModule(batch_size=batch_size, dataset_name=dataset_name)
    model = EfficientNet(
        model_name=version,
        num_classes=cf.num_classes[dataset_name],
        image_size=32,
        **model_params,
    )
    if output_path is None:
        output_path = f"lightning_logs/{dataset_name}/{version}"
    logger = loggers.TensorBoardLogger(output_path)
    trainer = pl.Trainer(
        progress_bar_refresh_rate=20,
        max_epochs=epochs,
        gpus=1,
        logger=logger,
        resume_from_checkpoint=checkpoint,
    )
    trainer.fit(model, cifar_dm)

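# Example call for train() above; any extra keyword arguments flow into the
# EfficientNet constructor via **model_params.
train(dataset_name="cifar10", version="efficientnet-b3", batch_size=64, epochs=50)
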
def trainer_builder(args):
    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    logging.info("PyTorch Lightning Trainer constructing...")
    tb_logger = pl_loggers.TensorBoardLogger(save_dir=args.exp_name)
    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    check_point_dir = args.exp_name
    checkpoint_callback = ModelCheckpoint(
        monitor='valid_loss',
        mode='min',
        save_top_k=-1,
        dirpath=check_point_dir,
        filename='HGN_hotpotQA-{epoch:02d}-{valid_loss:.4f}')
    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    if args.gpus > 0:
        gpu_list_str = args.gpu_list
        gpu_ids = [int(x) for x in gpu_list_str.split(',')]
        trainer = pl.Trainer(logger=tb_logger,
                             gradient_clip_val=args.max_grad_norm,
                             gpus=gpu_ids,
                             val_check_interval=args.val_check_interval,
                             accumulate_grad_batches=args.gradient_accumulation_steps,
                             callbacks=[checkpoint_callback],
                             accelerator=args.accelerator,
                             precision=args.precision,
                             plugins=args.plugins,
                             log_every_n_steps=args.logging_steps,
                             max_epochs=int(args.num_train_epochs))
    else:
        trainer = pl.Trainer(logger=tb_logger,
                             gradient_clip_val=args.max_grad_norm,
                             val_check_interval=args.val_check_interval,
                             accumulate_grad_batches=args.gradient_accumulation_steps,
                             log_every_n_steps=args.logging_steps,
                             max_epochs=int(args.num_train_epochs))
    return trainer

def train_model(model, model_dir):
    # Setup trainer
    cb1 = callbacks.ModelCheckpoint(filename='best-{epoch}',
                                    monitor='val_loss_mean',
                                    save_top_k=1,
                                    mode='min')
    cb2 = callbacks.ModelCheckpoint(filename='last-{epoch}', save_last=True)
    tb_logger = pl_loggers.TensorBoardLogger('{}/logs/'.format(model_dir))
    if Constants.n_gpus != 0:
        # trainer = Trainer(gpus=Constants.n_gpus, distributed_backend='ddp', logger=tb_logger,
        #                   precision=16, default_root_dir=model_dir, max_epochs=n_epochs)
        trainer = Trainer(gpus=Constants.n_gpus,
                          callbacks=[cb1, cb2],
                          plugins=DDPPlugin(find_unused_parameters=False),
                          accelerator='ddp_spawn',
                          precision=16,
                          logger=tb_logger,
                          default_root_dir=model_dir,
                          max_epochs=n_epochs)
    else:
        trainer = Trainer(gpus=0,
                          default_root_dir=model_dir,
                          callbacks=[cb1, cb2],
                          logger=tb_logger,
                          distributed_backend='ddp_spawn',
                          max_epochs=n_epochs)
    trainer.fit(model)
    trainer.test()

def main():
    args = parse_args()
    seed_everything(args.seed)
    tb_logger = loggers.TensorBoardLogger("logs/")
    wandb_logger = loggers.WandbLogger(save_dir="logs/", project="xldst")
    assert wandb_logger.experiment.id
    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join("ckpts", wandb_logger.experiment.id,
                              "{epoch}-{val_loss:.4f}"),
        verbose=True,
    )
    early_stop_callback = EarlyStopping(patience=2, verbose=True)
    trainer = Trainer.from_argparse_args(
        args,
        logger=[tb_logger, wandb_logger],
        checkpoint_callback=checkpoint_callback,
        early_stop_callback=early_stop_callback,
    )
    dm = CldstMBartDataModule(args)
    dm.prepare_data()
    dm.setup("fit")
    model = MBartDST(args)
    trainer.fit(model, datamodule=dm)
    dm.setup("test")
    trainer.test(datamodule=dm)

def run_trainer():
    early_stop_call_back = EarlyStopping(monitor='val_loss',
                                         min_delta=0.00,
                                         patience=5,
                                         verbose=False,
                                         mode='min')  # val_loss improves downward (was 'max')
    # log learning rate
    lr_callback = pl.callbacks.LearningRateMonitor(logging_interval='epoch')
    model = Colorization_model(loss=opt.loss)  # TODO set loss as RarityWeighted or L2, default: L2
    logger = loggers.TensorBoardLogger(save_dir='logs/')
    print("using GPU", torch.cuda.is_available())
    trainer = Trainer(
        max_epochs=300,
        # gpus=1,
        logger=logger,  # use default tensorboard
        log_every_n_steps=20,  # log every update step for debugging
        limit_train_batches=1.0,
        limit_val_batches=1.0,
        check_val_every_n_epoch=2,
        callbacks=[early_stop_call_back, lr_callback])
    trainer.fit(model)
    # we may not need the below; the lightning model can be loaded from the last checkpoint
    os.makedirs('trained_models', exist_ok=True)
    name = 'ColorizationModelOverfitTest.pth'
    torch.save(model, os.path.join('trained_models', name))

def main():
    model = ResidualNetwork()
    tb_logger = pl_loggers.TensorBoardLogger('logs/')
    trainer = pl.Trainer(max_epochs=5, gpus=get_gpu(), logger=tb_logger)
    trainer.fit(model)
    return 0

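# main() returns 0 as an exit code, so it composes directly with sys.exit:
import sys

if __name__ == '__main__':
    sys.exit(main())
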
def __init__(self, **kwargs):
    # Experiment results of name 'foo' are placed in directory results/foo/version_n/
    kwargs.setdefault('logger', loggers.TensorBoardLogger(
        'results/', name=kwargs['name'], version=kwargs.get('version')))
    # Early stopping is disabled
    kwargs.setdefault('early_stop_callback', False)
    # Create results and/or results/name if they don't exist
    if not os.path.exists('results'):
        os.system('mkdir results')
    if not os.path.exists('results/' + kwargs['name']):
        os.system('mkdir results/' + kwargs['name'])
    # Checkpoints are saved in directory results/foo/version_n/
    kwargs.setdefault('checkpoint_callback', ModelCheckpoint(
        filepath=('results/' + kwargs['name'] + '/version_' +
                  str(kwargs['logger'].version) + '/c'),
        monitor='val_energy',
        prefix='',
        save_top_k=-1
    ))
    kwargs.setdefault('log_save_interval', 100)  # logs are written to disk every 100 episodes
    kwargs.setdefault('row_log_interval', 1)  # logs are created every episode
    kwargs.setdefault('progress_bar_refresh_rate', 1)
    super(Trainer, self).__init__(**kwargs)

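# Hedged usage sketch: the __init__ above reads like the constructor of a
# pl.Trainer subclass (assumed here to be `class Trainer(pl.Trainer):`), which
# presumably consumes 'name'/'version' before forwarding the rest to Lightning.
trainer = Trainer(name='foo', version=0, max_epochs=100)
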
def main(hparams):
    model = Longformer(hparams)
    if hparams.output_dir is not None:
        if "test" in hparams.data_path:
            name = "longformer_test"
        else:
            name = "longformer"
        logger = loggers.TensorBoardLogger(save_dir=hparams.output_dir,
                                           name=name,
                                           version=str(hparams.amount_labels),
                                           log_graph=True)
    else:
        logger = True
    # logger is a bare `True` when no output_dir is given and then has no
    # log_dir attribute; fall back to Lightning's default root dir in that case
    root_dir = logger.log_dir + "/checkpoints/" if hparams.output_dir is not None else None
    trainer = Trainer(default_root_dir=root_dir,
                      logger=logger,
                      log_save_interval=10,
                      gpus=hparams.gpus,
                      tpu_cores=hparams.tpu_cores,
                      fast_dev_run=hparams.fast_dev_run,
                      max_epochs=hparams.max_epochs,
                      auto_lr_find=hparams.auto_lr_find,
                      gradient_clip_val=hparams.gradient_clip_val,
                      check_val_every_n_epoch=hparams.check_val_every_n_epoch,
                      amp_level=hparams.amp_level,
                      accumulate_grad_batches=hparams.accumulate_grad_batches)
    print("Hyperparameters:")
    print("_______________")
    print(json.dumps(vars(hparams), indent=4))
    trainer.fit(model)
    test_result = trainer.test(model)
    trainer.logger.save()

def common_train(args, metric, model_class, build_method, task: str, **model_kwargs):
    pl.seed_everything(args.seed)
    early_stop_callback = EarlyStopping(monitor=metric,
                                        min_delta=1e-5,
                                        patience=3,
                                        verbose=False,
                                        mode='max')
    checkpoint_callback = ModelCheckpoint(monitor=metric,
                                          save_top_k=1,
                                          verbose=True,
                                          mode='max',
                                          save_last=True)
    model = model_class(args, **model_kwargs)
    build_method(model)
    this_time = time.strftime("%m-%d_%H-%M-%S", time.localtime())
    try:
        import wandb
        logger = loggers.WandbLogger(save_dir='lightning_logs',
                                     name=f'{task}_{this_time}',
                                     project='ltp')
    except Exception:
        logger = loggers.TensorBoardLogger(save_dir='lightning_logs',
                                           name=f'{task}_{this_time}')
    trainer: Trainer = Trainer.from_argparse_args(
        args,
        logger=logger,
        callbacks=[early_stop_callback],
        checkpoint_callback=checkpoint_callback)
    # Ready to train with new learning rate
    trainer.fit(model)
    trainer.test()

def main(args):
    tb_logger = pl_loggers.TensorBoardLogger(
        os.path.join(TrainingConfig.EXPERIMENT_NAME, 'logs'))
    qa_data_module = QADataModule(
        tokenizer_name_or_path=TrainingConfig.TOKENIZER_CHECKPOINT,
        train_path=TrainingConfig.TRAIN_DATA_PATH,
        valid_path=TrainingConfig.VALID_DATA_PATH,
        batch_size=TrainingConfig.BATCH_SIZE,
        max_seq_length=PreprocessingConfig.MAX_LENGTH,
        doc_stride=PreprocessingConfig.DOC_STRIDE,
        max_query_length=PreprocessingConfig.MAX_QUERY_LENGTH,
    )
    biobert = BioBERT(model_name_or_path=TrainingConfig.MODEL_CHECKPOINT,
                      n_steps=TrainingConfig.N_EPOCHS *
                      qa_data_module.number_of_steps_per_epoch(),
                      lr=TrainingConfig.LR,
                      weight_decay=TrainingConfig.WEIGHT_DECAY,
                      warm_up_prop=TrainingConfig.WARM_UP_PROP,
                      model_save_dir=TrainingConfig.EXPERIMENT_NAME)
    trainer = pl.Trainer(
        max_epochs=TrainingConfig.N_EPOCHS,
        gpus=args.gpus,
        callbacks=[SaveCallback(TrainingConfig.EXPERIMENT_NAME)],
        logger=tb_logger,
    )
    trainer.fit(biobert, qa_data_module)

def run_fold(fold, train_df, args, size=(224, 224), arch='resnet18',
             pretrained=True, path='MODELS/', data_transforms=None):
    torch.cuda.empty_cache()
    fold_train = train_df[train_df.fold != fold].reset_index(drop=True)
    fold_val = train_df[train_df.fold == fold].reset_index(drop=True)
    train_ds = AudioDataset(images_path=args.specs_images_path,
                            df=fold_train,
                            transforms=data_transforms['train'])
    # note: validation reuses the 'train' transforms here
    val_ds = AudioDataset(images_path=args.specs_images_path,
                          df=fold_val,
                          transforms=data_transforms['train'])
    trainloader = DataLoader(train_ds, batch_size=args.train_batch_size,
                             shuffle=True, num_workers=os.cpu_count())
    validloader = DataLoader(val_ds, batch_size=args.test_batch_size,
                             shuffle=False, num_workers=os.cpu_count())
    del train_ds
    del val_ds
    del fold_train
    del fold_val
    model = AudioClassifier(arch_name=arch, lr=args.lr, pretrained=pretrained)
    tb_logger = loggers.TensorBoardLogger(save_dir='./runs',
                                          name='ZINDI-GIZ-NLP-AGRI-KEYWORDS',
                                          version=fold)
    ckpt_callback = pl.callbacks.ModelCheckpoint(
        filename=f'ZINDI-GIZ-NLP-AGRI-KEYWORDS-{model.hparams.arch_name}-{fold}-based',
        dirpath=path,
        monitor='val_logLoss',
        mode='min')
    trainer = Trainer(max_epochs=args.num_epochs,
                      gpus=args.gpus,
                      logger=tb_logger,
                      callbacks=[ckpt_callback])
    trainer.fit(model, trainloader, validloader)
    gc.collect()  # collect garbage
    return trainer.logged_metrics

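# Hedged driver sketch for run_fold(): loop over the folds encoded in
# train_df.fold and collect per-fold metrics, assuming `args`, `train_df`, and
# `data_transforms` are prepared as the function above expects.
all_metrics = []
for fold in sorted(train_df.fold.unique()):
    all_metrics.append(run_fold(fold, train_df, args, data_transforms=data_transforms))
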
def train(args: Namespace, datamodule: LightningDataModule) -> None:
    """Train Scyclone on PyTorch-Lightning."""
    ckptAndLogging = CheckpointAndLogging(args.dir_root, args.name_exp, args.name_version)
    # setup
    gpus: int = 1 if torch.cuda.is_available() else 0  # single GPU or CPU
    model = Scyclone(args.sampling_rate, args.noiseless_d)
    ckpt_cb = ModelCheckpoint(period=60, save_last=True, save_top_k=1, monitor="val_loss")
    trainer = pl.Trainer(
        gpus=gpus,
        auto_select_gpus=True,
        precision=32 if args.no_amp else 16,
        max_epochs=args.max_epochs,
        check_val_every_n_epoch=args.val_interval_epoch,
        # logging/checkpointing
        resume_from_checkpoint=ckptAndLogging.resume_from_checkpoint,
        default_root_dir=ckptAndLogging.default_root_dir,
        checkpoint_callback=ckpt_cb,
        logger=pl_loggers.TensorBoardLogger(
            ckptAndLogging.save_dir, ckptAndLogging.name, ckptAndLogging.version
        ),
        # reload_dataloaders_every_epoch=True,
        profiler=args.profiler,
        progress_bar_refresh_rate=30
    )
    # training
    trainer.fit(model, datamodule=datamodule)

def main(hparams):
    hparams = vars(hparams)
    hparams, loaderDict, normalizer, collate = get_data(hparams)
    # ------------------------
    # Model
    # ------------------------
    add_device_hparams(hparams)
    # define logger
    Path(hparams['log_path']).mkdir(parents=True, exist_ok=True)
    logger = loggers.TensorBoardLogger(hparams['log_path'], version=hparams['version'])
    logger.log_hyperparams(params=hparams)
    # define model
    model = RegressionModel(hparams, loaderDict['train'], loaderDict['valid'],
                            normalizer, collate)
    chkpt = None if hparams['load'] is None else get_checkpoint_path(hparams['load'])
    trainer = pl.Trainer(gpus=hparams['gpus'],
                         logger=logger,
                         max_epochs=hparams['epochs'],
                         distributed_backend=hparams['distributed_backend'],
                         precision=16 if hparams['use_amp'] else 32,
                         default_root_dir=hparams['log_path'],
                         deterministic=True,
                         resume_from_checkpoint=chkpt,
                         auto_lr_find=hparams['auto_lr'],
                         auto_scale_batch_size=hparams['auto_bsz'])
    trainer.fit(model)

def main(hparams):
    today = datetime.datetime.now().strftime('%d.%m.%Y')
    checkpoint_callback = ModelCheckpoint(
        dirpath=join(hparams.logger_save_dir, hparams.experiment_name, 'ckpts'),
        filename='ckpt-' + today + '-{epoch:02d}-{val_loss:.2f}',
        save_top_k=hparams.save_top_k,
        verbose=True,
        monitor=hparams.monitor_loss,
        prefix='')
    tb_logger = loggers.TensorBoardLogger(save_dir=hparams.logger_save_dir,
                                          name=hparams.experiment_name)
    if hparams.checkpoint_path is None:
        model = hparams.Model(**vars(hparams))
    else:
        # If any arguments were explicitly given, then force those
        seen_params = {
            a: getattr(hparams, a)
            for a in hparams.seen_args_ if a != '==SUPPRESS=='
        }
        model = hparams.Model.load_from_checkpoint(hparams.checkpoint_path, **seen_params)
    trainer = Trainer.from_argparse_args(hparams,
                                         callbacks=[checkpoint_callback],
                                         logger=tb_logger)
    trainer.fit(model)

def main(hparams, fold):
    seed_everything(hparams.seed)
    MAIN_DIR = os.path.join(config.path_to_summaries, "DWSCAllDatasets/")
    model = DWSCClassifier(hparams, fold)
    tb_logger = pl_loggers.TensorBoardLogger(os.path.join(MAIN_DIR, "logs"))
    if hparams.dataset != "SONYCUST":
        early_stopping = EarlyStopping("2_valid/1_accuracy0.5", patience=50, mode="max")
    else:
        early_stopping = EarlyStopping("2_valid_coarse/1_auprc_macro", patience=30, mode="max")
    trainer = Trainer.from_argparse_args(
        hparams,
        default_root_dir=MAIN_DIR,
        logger=tb_logger,
        early_stop_callback=early_stopping,
        # fast_dev_run=True,
        checkpoint_callback=None,
        gpus=1,
    )
    trainer.fit(model)
    with open(os.path.join(MAIN_DIR, "logs/report.txt"), "a") as file:
        if hparams.dataset != "SONYCUST":
            file.write(hparams.dataset + " fold : " + str(fold) + "\n")
        else:
            file.write(hparams.dataset + "\n")
        file.write(str(model.best_scores) + "\n")

def train_cross_val(p):
    data_ = load_data(root_dir='./data/', mode='train')
    data_, target_, features, date = preprocess_data(data_, nn=True)
    gts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=5)
    input_size = data_.shape[-1]
    output_size = 1
    tb_logger = pl_loggers.TensorBoardLogger('logs/')
    models = []
    for i, (train_idx, val_idx) in enumerate(gts.split(data_, groups=date)):
        idx = np.concatenate([train_idx, val_idx])
        data = copy.deepcopy(data_[idx])
        target = copy.deepcopy(target_[idx])
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            os.path.join('models/', "fold_{}".format(i)),
            monitor="val_auc",
            mode='max',
            save_top_k=1,
            period=10)
        model = Classifier(input_size=input_size, output_size=output_size, params=p)
        if p['activation'] == nn.ReLU:
            model.apply(lambda m: init_weights(m, 'relu'))
        elif p['activation'] == nn.LeakyReLU:
            model.apply(lambda m: init_weights(m, 'leaky_relu'))
        # rebuild contiguous positional indices into the concatenated fold data
        train_idx = list(range(0, max(train_idx) + 1))
        val_idx = list(range(len(train_idx), len(idx)))
        data[train_idx] = calc_data_mean(data[train_idx], './cache', train=True, mode='mean')
        data[val_idx] = calc_data_mean(data[val_idx], './cache', train=False, mode='mean')
        dataset = FinData(data=data, target=target, date=date)
        dataloaders = create_dataloaders(dataset,
                                         indexes={'train': train_idx, 'val': val_idx},
                                         batch_size=p['batch_size'])
        es = EarlyStopping(monitor='val_auc', patience=10, min_delta=0.0005, mode='max')
        trainer = pl.Trainer(logger=tb_logger,
                             max_epochs=500,
                             gpus=1,
                             callbacks=[checkpoint_callback, es],
                             precision=16)
        trainer.fit(model,
                    train_dataloader=dataloaders['train'],
                    val_dataloaders=dataloaders['val'])
        torch.save(model.state_dict(), f'models/fold_{i}_state_dict.pth')
        models.append(model)
    return models, features

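# Hedged example parameter dict for train_cross_val(); only the keys read in the
# function body are shown (Classifier may require more), and values are illustrative.
example_params = {
    'activation': nn.ReLU,
    'batch_size': 4096,
}
models, features = train_cross_val(example_params)
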
def run(option=None):
    pl.trainer.seed_everything(40)
    option = option or args.log
    # set path
    saved_dir = f"{args.mind_type}/{args.model_class}"
    ckpt_dir = f"saved/checkpoint/{saved_dir}/{option}"
    if args.resume and os.path.exists(args.resume):
        best_model_path = args.resume
    else:
        best_model_path = os.path.join(ckpt_dir, "best_model.ckpt")
    resume_path = best_model_path if os.path.exists(best_model_path) and args.resume else None
    train_news_file, train_behaviors_file = get_path("train", mind_type=args.mind_type)
    valid_news_file, valid_behaviors_file = get_path("valid", mind_type=args.mind_type)
    converter = Converter(hparams).converter
    train_dataset = TrainingDataset(train_news_file, train_behaviors_file, hparams,
                                    converter, npratio=hparams.npratio)
    train_dataloader = DataLoader(train_dataset, hparams.batch_size,
                                  num_workers=args.num_workers, pin_memory=True)
    hparams.update(**{"user_embedding_size": len(train_dataset.uid2index)})
    # set validation interval and max epochs
    interval, epochs = len(train_dataloader) // 3, hparams.epochs
    accelerator = "ddp" if int(args.gpus) > 1 else None
    valid_callback = ValidationCallback(valid_news_file, valid_behaviors_file, hparams,
                                        converter, ckpt_dir, interval)
    tb_logger = pl_loggers.TensorBoardLogger(f"saved/logs/{saved_dir}")
    # trainer object, uses good defaults (auto-tensorboard, checkpoints, logs, and more)
    trainer = pl.Trainer(gpus=int(args.gpus),
                         accelerator=accelerator,
                         max_epochs=epochs,
                         deterministic=True,
                         logger=tb_logger,
                         callbacks=[valid_callback],
                         resume_from_checkpoint=resume_path,
                         profiler="simple")
    model_class = get_model_class(args.model_class)
    trainer.fit(model_class(hparams), train_dataloader)
    # keep only the best checkpoint; filenames encode the group AUC as "name==<auc>.ckpt"
    group_auc = [
        float(file.split("==")[1].replace(".ckpt", ""))
        for file in os.listdir(ckpt_dir) if "==" in file
    ]
    best_auc = max(group_auc)
    for file in os.scandir(ckpt_dir):
        # skip the best-model file and anything without an encoded AUC
        if "best_model" in file.name or "==" not in file.name:
            continue
        auc = float(file.name.split("==")[1].replace(".ckpt", ""))
        if auc < best_auc:
            if os.path.exists(file.path):
                os.remove(file.path)

def main():
    parser = HfArgumentParser((ModelArguments, ParaphraseDataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=2,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        model_max_length=data_args.model_max_length,
    )
    language_model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )
    if data_args.neptune_logging:
        neptune_logger = NeptuneLogger(
            project_name=os.environ['NEPTUNE_PROJECT'],
            experiment_name=model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        )
    train_dataset = ParaphraseDetectionDataset(
        data_dir=os.path.join(data_args.data_dir, TRAIN_PATH),
        tokenizer=tokenizer,
        task_name="paraphrase_detection")
    val_datasets = [
        ParaphraseDetectionDataset(data_dir=os.path.join(data_args.data_dir, EVAL_PATH),
                                   tokenizer=tokenizer,
                                   name=EVAL_NAME)
        for (EVAL_PATH, EVAL_NAME) in zip(EVAL_PATHS, EVAL_NAMES)
    ]
    model = LMFinetuner(language_model, tokenizer, training_args.learning_rate,
                        model_args.batch_size, train_dataset, val_datasets,
                        data_args, freeze_backend=False)
    tb_logger = pl_loggers.TensorBoardLogger(
        os.path.join(training_args.output_dir, model_args.model_name_or_path))
    trainer = pl.Trainer(
        # auto_lr_find=True,
        # auto_scale_batch_size=True,
        max_epochs=int(training_args.num_train_epochs),
        accumulate_grad_batches=training_args.gradient_accumulation_steps,
        weights_save_path=training_args.output_dir,
        gpus=torch.cuda.device_count(),
        precision=16 if training_args.fp16 and torch.cuda.is_available() else 32,
        distributed_backend='ddp' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
        progress_bar_refresh_rate=training_args.logging_steps,
        logger=[neptune_logger, tb_logger] if data_args.neptune_logging else tb_logger,
    )
    trainer.fit(model)
    model.lm.save_pretrained(os.path.join(training_args.output_dir, model_args.model_name_or_path))

def train(args, custom_callbacks=None):
    data_module = GwtDataModule(
        args.batch_size,
        args.num_dataset_workers,
        f'{args.dataset_base_path}/{args.split}/train.jsonl',
        f'{args.dataset_base_path}/{args.split}/validate.jsonl',
        f'{args.dataset_base_path}/{args.split}/test.jsonl',
        f'{args.dataset_base_path}/bpe_ast_vocab.txt',
    )
    if args.invalidate_line_caches:
        data_module.invalidate_caches()
    model = GwtSectionPredictionTransformer(
        data_module.vocab.get_size(),
        data_module.vocab.get_index(data_module.vocab.PAD_TOKEN),
        args.max_sequence_length,
        args.embedding_size,
        args.learning_rate,
        args.num_attention_heads,
        args.num_encoder_layers,
        args.num_decoder_layers,
        args.feedforward_dimensions,
        args.positional_encoding_dropout,
        args.transformer_dropout,
        args.lr_warmup_steps,
        args.optimize_on_smoothed_loss,
    )
    logger = loggers.TensorBoardLogger(
        args.tensorboard_dir,
        name=args.experiment_name,
        version=args.version,
    )
    logger.log_hyperparams(args)
    checkpoint_dir = os.path.join(logger.log_dir, 'checkpoints')
    loss_key = 'val_loss' if not args.optimize_on_smoothed_loss else 'label_smoothed_val_loss'
    trainer = pl.Trainer.from_argparse_args(
        args,
        resume_from_checkpoint=load_checkpoint_if_available(checkpoint_dir),
        logger=logger,
        checkpoint_callback=callbacks.ModelCheckpoint(
            filepath=f'{checkpoint_dir}/{{epoch}}-{{{loss_key}}}',
            save_top_k=5,
            monitor=loss_key,
            mode='min',
        ),
        **({'callbacks': custom_callbacks} if custom_callbacks else {}),
    )
    trainer.fit(model, data_module)
    return trainer

def train_model(user_image: Image, style_image: Image):
    """Trains a deep-learning model that extracts the styling from `style_image`
    and applies it onto `user_image`, then returns the styled image.

    Args:
        user_image (Image): Image you want to apply styles onto.
        style_image (Image): Image you want to extract styles from.

    Returns:
        user_image (Image): `user_image` with styling applied.
    """
    image_processor = ImageProcessor(maximum_image_size=(512, 512))
    print(f"user_image.size: {user_image.size} | style_image.size: {style_image.size}")
    image_size = image_processor.get_common_image_size(user_image, style_image)
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device = "cpu"
    user_image = image_processor.prepare_images(user_image, image_size)
    style_image = image_processor.prepare_images(style_image, image_size)
    normalization_mean = torch.tensor([0.485, 0.456, 0.406]).to(image_processor.device)
    normalization_std = torch.tensor([0.229, 0.224, 0.225]).to(image_processor.device)
    datamodule = DataModule(user_image, style_image)
    model = StyleTransferModel(
        user_image=user_image,
        style_image=style_image,
        normalization_mean=normalization_mean,
        normalization_std=normalization_std,
    )
    cnn = models.vgg19(pretrained=True).features.to(device).eval()
    model._build_model_and_loss_functions(base_cnn_model=cnn)
    print("Training...")
    tb_logger = pl_loggers.TensorBoardLogger("logs/")
    # pass the logger to the Trainer so TensorBoard actually records this run
    # (it was created but left unused in the original)
    trainer_params = {"max_epochs": 6000, "logger": tb_logger}
    if device != "cpu":
        print("Using a gpu!")
        trainer_params["gpus"] = 1
    trainer = pl.Trainer(**trainer_params)
    trainer.fit(model, datamodule)
    sample = user_image.to(device)
    model.to(device)
    image = model(sample)
    image = image_processor.save_image(
        image, "output.png", "Sample Output", display_image=True
    )
    return image

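# Usage sketch for train_model() with PIL images; the file paths are placeholders,
# and PIL is an assumption based on the Image annotations above.
from PIL import Image

styled = train_model(Image.open('photo.jpg'), Image.open('style.jpg'))
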
def main():
    parser = argparse.ArgumentParser(description="Trains the network.")
    parser.add_argument("train", help="Training data (.bin or .binpack)")
    parser.add_argument("val", help="Validation data (.bin or .binpack)")
    parser.add_argument("--architecture", default='normal', help="architecture of model")
    parser = pl.Trainer.add_argparse_args(parser)
    parser.add_argument("--py-data", action="store_true",
                        help="Use python data loader (default=False)")
    parser.add_argument("--lambda", default=1.0, type=float, dest='lambda_',
                        help="lambda=1.0 = train on evaluations, lambda=0.0 = train on game results, interpolates between (default=1.0).")
    parser.add_argument("--num-workers", default=1, type=int, dest='num_workers',
                        help="Number of worker threads to use for data loading. Currently only works well for binpack.")
    parser.add_argument("--batch-size", default=-1, type=int, dest='batch_size',
                        help="Number of positions per batch / per iteration. Default on GPU = 8192, on CPU = 128.")
    parser.add_argument("--threads", default=-1, type=int, dest='threads',
                        help="Number of torch threads to use. Default automatic (cores).")
    parser.add_argument("--seed", default=42, type=int, dest='seed', help="torch seed to use.")
    parser.add_argument("--smart-fen-skipping", action='store_true', dest='smart_fen_skipping',
                        help="If enabled, positions that are bad training targets will be skipped during loading. Default: False")
    args = parser.parse_args()
    if args.architecture.lower() == "leiser":
        data_name = halfkp.LEISER_NAME
        model_inputs = halfkp.LEISER_INPUTS
    elif args.architecture.lower() == "normal":
        data_name = halfkp.NAME
        model_inputs = halfkp.INPUTS
    else:
        raise Exception("Incorrect architecture name")
    nnue = M.NNUE(num_inputs=model_inputs, lambda_=args.lambda_)
    print("Training with {} validating with {}".format(args.train, args.val))
    pl.seed_everything(args.seed)
    print("Seed {}".format(args.seed))
    batch_size = args.batch_size
    if batch_size <= 0:
        batch_size = 128 if args.gpus == 0 else 8192
    print('Using batch size {}'.format(batch_size))
    print('Smart fen skipping: {}'.format(args.smart_fen_skipping))
    if args.threads > 0:
        print('limiting torch to {} threads.'.format(args.threads))
        t_set_num_threads(args.threads)
    if args.py_data:
        print('Using python data loader')
        train, val = data_loader_py(args.train, args.val, batch_size)
    else:
        print('Using c++ data loader')
        train, val = data_loader_cc(args.train, args.val, data_name,
                                    args.num_workers, batch_size, args.smart_fen_skipping)
    logdir = args.default_root_dir if args.default_root_dir else 'logs/'
    print('Using log dir {}'.format(logdir), flush=True)
    tb_logger = pl_loggers.TensorBoardLogger(logdir)
    checkpoint_callback = pl.callbacks.ModelCheckpoint(save_top_k=1,
                                                       save_last=True,
                                                       monitor='val_loss',
                                                       filename='best_model')
    trainer = pl.Trainer.from_argparse_args(args,
                                            callbacks=[checkpoint_callback],
                                            logger=tb_logger)
    trainer.fit(nnue, train, val)

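# Hedged CLI sketch for main() above (the script name is a placeholder; Trainer
# flags such as --gpus come from pl.Trainer.add_argparse_args):
#   python train.py train.binpack val.binpack --gpus 1 --smart-fen-skipping --threads 4
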
def main(config):
    """
    Main function for training LSTMs. After training, results on the validation
    and test sets are recorded in the specified log_path.
    """
    dataset, train_loader, subgraph_loader = get_data(config)
    # define logger
    Path(config['log_path']).mkdir(parents=True, exist_ok=True)
    logger = loggers.TensorBoardLogger(config['log_path'], version=config['version'])
    logger.log_hyperparams(params=config)
    # define model
    model = Model(config, dataset, train_loader, subgraph_loader)
    chkpt = None if config['load'] is None else get_checkpoint_path(config['load'])
    trainer = pl.Trainer(gpus=config['gpus'],
                         logger=logger,
                         max_epochs=config['epochs'],
                         distributed_backend='dp',
                         precision=16 if config['use_amp'] else 32,
                         default_root_dir=config['log_path'],
                         deterministic=True,
                         resume_from_checkpoint=chkpt,
                         auto_lr_find=config['auto_lr'],
                         auto_scale_batch_size=config['auto_bsz'])
    trainer.fit(model)
    for phase in ['test', 'valid']:
        if phase == 'valid':
            trainer.eval_split = 'val'
            trainer.eval_mask = dataset.data.val_mask
            print(phase, trainer.eval_split)
        ret = trainer.test()
        if isinstance(ret, list):
            ret = ret[0]
        per_node = ret.pop('per_node')
        test_results = ret
        res_dir = Path(config['log_path']) / 'default'
        if config['version'] is not None:
            res_dir = res_dir / config['version']
        else:
            res_dir = res_dir / ('results_' + str(config['seed']))
        print(phase, ':', test_results)
        Path(res_dir).mkdir(parents=True, exist_ok=True)
        write_json(test_results, res_dir / f'{phase}_results.json',
                   sort_keys=True, verbose=True)
        write_pkl(per_node, res_dir / f'{phase}_per_node.pkl')
        path_results = Path(config['log_path']) / f'all_{phase}_results.csv'
        record_results(path_results, config, test_results)

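# Hedged sketch of a config dict accepted by main() above; it lists only the
# keys read in the function body, with illustrative values.
example_config = {
    'log_path': 'logs/',
    'version': None,
    'seed': 42,
    'load': None,
    'gpus': 1,
    'epochs': 100,
    'use_amp': True,
    'auto_lr': False,
    'auto_bsz': False,
}
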