"""
Call all scripts from __init__, define and initialize the model, set up and
activate TensorBoard logging, and run the training, validation and testing loop.
Early stopping is fixed to 10 epochs without an improvement of at least 0.001
in validation accuracy.
"""
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from __init__ import *

parser = get_parser()
hparams = parser.parse_args()

# Seed only after the hyperparameters have been parsed (hparams.seed is undefined before this point)
pl.seed_everything(hparams.seed)

# Define early-stopping condition
early_stop_callback = EarlyStopping(monitor='val_acc',
                                    min_delta=0.001,
                                    patience=10,
                                    verbose=False,
                                    mode='max')

# Define model
if hparams.classifier_type == "autoregressive":
    model = Protein_GRU_Sequencer_Autoregressive()
elif hparams.encoder_type == "gru":
    model = Protein_GRU_Sequencer_CNN()
elif hparams.encoder_type == "lstm":
    model = Protein_LSTM_Sequencer_CNN()
else:
    raise Exception('Unknown encoder type: ' + hparams.encoder_type)

# Set logging
if hparams.logger:
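For reference, a minimal self-contained sketch of the same early-stopping policy is shown below; `ToyClassifier` and the random dataset are hypothetical stand-ins, and the callback arguments mirror the ones above (stop after 10 validation epochs without at least a 0.001 gain in `val_acc`).

```python
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks.early_stopping import EarlyStopping


class ToyClassifier(pl.LightningModule):
    """Hypothetical stand-in model used only to exercise the callback."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(16, 2)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.cross_entropy(self.layer(x), y)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        acc = (self.layer(x).argmax(dim=1) == y).float().mean()
        # EarlyStopping below watches this logged value.
        self.log('val_acc', acc, prog_bar=True)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)


data = TensorDataset(torch.randn(256, 16), torch.randint(0, 2, (256,)))
loader = DataLoader(data, batch_size=32)

# Same condition as above: stop once val_acc has not improved by >= 0.001
# for 10 consecutive validation epochs.
early_stop = EarlyStopping(monitor='val_acc', min_delta=0.001,
                           patience=10, mode='max')
trainer = pl.Trainer(max_epochs=100, callbacks=[early_stop])
trainer.fit(ToyClassifier(), loader, loader)
```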
def cli_main(): parser = ArgumentParser() parser.add_argument("--DATA_PATH", type=str, help="path to folders with images") parser.add_argument("--MODEL_PATH", default=None, type=str, help="path to model checkpoint") parser.add_argument("--batch_size", default=128, type=int, help="batch size for SSL") parser.add_argument("--image_size", default=256, type=int, help="image size for SSL") parser.add_argument( "--image_type", default="tif", type=str, help= "extension of image for PIL to open and parse - i.e. jpeg, gif, tif, etc. Only put the extension name, not the dot (.)" ) parser.add_argument("--num_workers", default=1, type=int, help="number of CPU cores to use for data processing") parser.add_argument("--image_embedding_size", default=128, type=int, help="size of image representation of SIMCLR") parser.add_argument("--epochs", default=200, type=int, help="number of epochs to train model") parser.add_argument("--lr", default=1e-3, type=float, help="learning rate for training model") parser.add_argument( "--patience", default=-1, type=int, help= "automatically cuts off training if validation does not drop for (patience) epochs. Leave blank to have no validation based early stopping." ) parser.add_argument("--val_split", default=0.2, type=float, help="percent in validation data") parser.add_argument( "--pretrain_encoder", default=False, type=bool, help= "initialize resnet encoder with pretrained imagenet weights. Cannot be true if passing previous SSL model checkpoint." ) parser.add_argument( "--withold_train_percent", default=0, type=float, help= "decimal from 0-1 representing how much of the training data to withold during SSL training" ) parser.add_argument("--version", default="0", type=str, help="version to name checkpoint for saving") parser.add_argument("--gpus", default=1, type=int, help="number of gpus to use for training") args = parser.parse_args() URL = args.DATA_PATH batch_size = args.batch_size image_size = args.image_size image_type = args.image_type num_workers = args.num_workers embedding_size = args.image_embedding_size epochs = args.epochs lr = args.lr patience = args.patience val_split = args.val_split pretrain = args.pretrain_encoder withold_train_percent = args.withold_train_percent version = args.version model_checkpoint = args.MODEL_PATH gpus = args.gpus # #testing # batch_size = 128 # image_type = 'tif' # image_size = 256 # num_workers = 4 # URL ='/content/UCMerced_LandUse/Images' # embedding_size = 128 # epochs = 2 # lr = 1e-3 # patience = 1 # val_split = 0.2 # pretrain = False # withold_train_percent = 0.2 # version = "1" # model_checkpoint = '/content/models/SSL/SIMCLR_SSL_0.pt' # gpus = 1 # #gets dataset. 
We can't combine since validation data has different transform needed train_dataset = FolderDataset( URL, validation=False, val_split=val_split, withold_train_percent=withold_train_percent, transform=SimCLRTrainDataTransform(image_size), image_type=image_type) data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers, drop_last=True) print('Training Data Loaded...') val_dataset = FolderDataset(URL, validation=True, val_split=val_split, transform=SimCLREvalDataTransform(image_size), image_type=image_type) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, num_workers=num_workers, drop_last=True) print('Validation Data Loaded...') num_samples = len(train_dataset) #init model with batch size, num_samples (len of data), epochs to train, and autofinds learning rate model = SimCLR(arch='resnet18', batch_size=batch_size, num_samples=num_samples, gpus=gpus, dataset='None', max_epochs=epochs, learning_rate=lr) # model.encoder = resnet18(pretrained=pretrain, first_conv=model.first_conv, maxpool1=model.maxpool1, return_all_feature_maps=False) model.projection = Projection(input_dim=512, hidden_dim=256, output_dim=embedding_size) #overrides if patience > 0: cb = EarlyStopping('val_loss', patience=patience) trainer = Trainer(gpus=gpus, max_epochs=epochs, callbacks=[cb], progress_bar_refresh_rate=5) else: trainer = Trainer(gpus=gpus, max_epochs=epochs, progress_bar_refresh_rate=5) if model_checkpoint is not None: model.load_state_dict(torch.load(model_checkpoint)) print( 'Successfully loaded your checkpoint. Keep in mind that this does not preserve the previous trainer states, only the model weights' ) model.cuda() print('Model Initialized') trainer.fit(model, data_loader, val_loader) Path(f"./models/SSL/SIMCLR_SSL_{version}").mkdir(parents=True, exist_ok=True) torch.save(model.state_dict(), f"./models/SSL/SIMCLR_SSL_{version}/SIMCLR_SSL_{version}.pt")
model = model.apply(init_weights)

tb_save_dir = os.path.join(os.getcwd(), 'runs')
cp_save_dir = os.path.join(os.getcwd(), "CKP", model_file_name)

logger = TensorBoardLogger(save_dir=tb_save_dir, name=model_file_name)
checkpoint_callback = ModelCheckpoint(filepath=cp_save_dir,
                                      save_top_k=1,
                                      verbose=True,
                                      monitor='loss_val',
                                      mode='min')
# mode must be the string 'min', not the builtin min
early_stop_callback = EarlyStopping(monitor='loss_val', verbose=True, mode='min')

trainer = pl.Trainer(gpus=1,
                     max_epochs=hparams["max_epochs"],
                     weights_summary=None,
                     logger=logger,
                     checkpoint_callback=checkpoint_callback,
                     callbacks=[early_stop_callback])
trainer.fit(model, train_dataloader, val_dataloader)

print("Best Model Path", checkpoint_callback.best_model_path)
best_model_path = checkpoint_callback.best_model_path
print(trainer.test(model, test_dataloaders=test_dataloader))
def setup_trainer(args): # init model set_seed(args) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) checkpoint_callback = pl.callbacks.ModelCheckpoint(filepath=os.path.join( args.output_dir, '{epoch}'), monitor="val_loss", mode="min", verbose=True, save_top_k=1) early_stop_callback = EarlyStopping(monitor='val_loss', min_delta=0.00, patience=args.early_stop_patience, verbose=True, mode='min') # wandb logger wandb_logger = WandbLogger(project="bart-qa-to-nli") train_params = dict( accumulate_grad_batches=args.gradient_accumulation_steps, gpus=args.n_gpu, max_epochs=args.num_train_epochs, early_stop_callback=early_stop_callback, gradient_clip_val=args.max_grad_norm, checkpoint_callback=checkpoint_callback, logger=wandb_logger, callbacks=[LoggingCallback()], val_check_interval=0.25, ) if args.fp16: train_params["use_amp"] = args.fp16 train_params["amp_level"] = args.fp16_opt_level if args.n_tpu_cores > 0: global xm import torch_xla.core.xla_model as xm train_params["num_tpu_cores"] = args.n_tpu_cores train_params["gpus"] = 0 if args.n_gpu > 1: train_params["distributed_backend"] = "ddp" trainer = pl.Trainer(**train_params) return trainer
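A hedged usage sketch for `setup_trainer` follows; the `Namespace` fields mirror the attributes the function reads, while the concrete values are illustrative placeholders rather than project defaults.

```python
from argparse import Namespace

# Field names follow the attributes read inside setup_trainer();
# the values here are placeholders that would normally come from argparse.
args = Namespace(
    seed=42,
    server_ip="", server_port="",          # leave empty to skip remote debugging
    output_dir="outputs/bart-qa-to-nli",
    do_train=True,
    early_stop_patience=3,
    gradient_accumulation_steps=1,
    n_gpu=1,
    num_train_epochs=10,
    max_grad_norm=1.0,
    fp16=False, fp16_opt_level="O1",
    n_tpu_cores=0,
)

trainer = setup_trainer(args)
```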
def cli_main(): parser = ArgumentParser() parser.add_argument("--DATA_PATH", type=str, help="path to folders with images") parser.add_argument("--MODEL_PATH", default=None, type=str, help="path to model checkpoint") parser.add_argument("--batch_size", default=128, type=int, help="batch size for SSL") parser.add_argument("--image_size", default=256, type=int, help="image size for SSL") parser.add_argument("--image_embedding_size", default=128, type=int, help="size of image representation of SIMCLR") parser.add_argument("--epochs", default=200, type=int, help="number of epochs to train model") parser.add_argument("--lr", default=1e-3, type=float, help="learning rate for training model") parser.add_argument( "--patience", default=-1, type=int, help= "automatically cuts off training if validation does not drop for (patience) epochs. Leave blank to have no validation based early stopping." ) parser.add_argument("--val_split", default=0.2, type=float, help="percent in validation data") parser.add_argument( "--pretrain_encoder", default=False, type=bool, help= "initialize resnet encoder with pretrained imagenet weights. Cannot be true if passing previous SSL model checkpoint." ) parser.add_argument("--version", default="0", type=str, help="version to name checkpoint for saving") parser.add_argument("--gpus", default=1, type=int, help="number of gpus to use for training") parser.add_argument("--num_workers", default=0, type=int, help="number of workers to use to fetch data") args = parser.parse_args() DATA_PATH = args.DATA_PATH batch_size = args.batch_size image_size = args.image_size num_workers = args.num_workers embedding_size = args.image_embedding_size epochs = args.epochs lr = args.lr patience = args.patience val_split = args.val_split pretrain = args.pretrain_encoder version = args.version model_checkpoint = args.MODEL_PATH gpus = args.gpus num_workers = args.num_workers dm = ImageModule(DATA_PATH, val_split=val_split, train_transform=SimCLRTrainDataTransform(image_size), val_transform=SimCLREvalDataTransform(image_size), num_workers=num_workers) dm.setup() #init model with batch size, num_samples (len of data), epochs to train, and autofinds learning rate model = SimCLR(arch='resnet18', batch_size=batch_size, num_samples=dm.num_samples, gpus=gpus, dataset='None', max_epochs=epochs, learning_rate=lr) # model.encoder = resnet18(pretrained=pretrain, first_conv=model.first_conv, maxpool1=model.maxpool1, return_all_feature_maps=False) model.projection = Projection(input_dim=512, hidden_dim=256, output_dim=embedding_size) #overrides if patience > 0: cb = EarlyStopping('val_loss', patience=patience) trainer = Trainer(gpus=gpus, max_epochs=epochs, callbacks=[cb], progress_bar_refresh_rate=5) else: trainer = Trainer(gpus=gpus, max_epochs=epochs, progress_bar_refresh_rate=5) if model_checkpoint is not None: model.load_state_dict(torch.load(model_checkpoint)) print( 'Successfully loaded your checkpoint. Keep in mind that this does not preserve the previous trainer states, only the model weights' ) print('Model Initialized') trainer.fit(model, dm) Path(f"./models/SSL/SIMCLR_SSL_{version}").mkdir(parents=True, exist_ok=True) torch.save(model.state_dict(), f"./models/SSL/SIMCLR_SSL_{version}/SIMCLR_SSL_{version}.pt")
def _active_body(self, pid=None): # init model self.model = self.init_model(self.config.hparams) # init active learner and set it up for training self.learner = ActiveLearner( config = self.config.exp.active_learning, model = self.model, datamodule = self.dm, ).setup(pid) # init dict to store results results = {'config': config_to_dict(self.config)} # init training with pl.LightningModule models if self.config.trainer is not None: # init logger if self.config.logger is not None: logger = self.init_logger(pid) # init early stopping callbacks = list() if self.config.early_stop is not None: earlystop_callback = EarlyStopping(**vars(self.config.early_stop)) callbacks.append(earlystop_callback) # init checkpoint callback if self.learner.val_ratio > 0: checkpoint_callback = ModelCheckpoint( monitor = self.config.early_stop.monitor, save_last = True, mode = self.config.early_stop.mode, ) callbacks.append(checkpoint_callback) # make trainer trainer_args = vars(self.config.trainer) trainer_args.update({ 'logger': logger, 'callbacks': callbacks }) trainer = pl.Trainer(**trainer_args) # find optimal lr if self.config.exp.tune: trainer.auto_lr_find = True trainer.tune( model = self.learner.model, train_dataloader = self.learner.init_loader, val_dataloaders = self.learner.val_loader ) # fit model to initial batch trainer.fit( model = self.learner.model, train_dataloader = self.learner.init_loader, val_dataloaders = self.learner.val_loader ) # test model and get results # self.learner.model = self.init_model(self.config.hparams).load_from_checkpoint(trainer.checkpoint_callback.best_model_path) [metr] = trainer.test( model = self.learner.model, test_dataloaders = self.dm.test_dataloader() ) counts = dict(self.learner._get_counts()) cm = self.learner.model.cm # print(earlystop_callback.best_score) # print(trainer.checkpoint_callback.best_model_path) print(cm) # reset early stopping trainer.should_stop = False earlystop_callback.wait_count = 0 # earlystop_callback.stopped_epoch = 0 # log test results for the initial batch results.setdefault('metrics', list()).append(metr) results.setdefault('counts', list()).append(counts) results.setdefault('cms', list()).append(cm.tolist()) # now receive samples one by one from a stream for i, (inp, tgt) in enumerate(self.learner.stream_loader): # infer label w = self.learner.infer(inp, use_torch=True).detach().cpu() # logit odds p = torch.sigmoid(w) # estimated probability that the current sample is in class 'high' # query if condition is met if self.learner.query(w, p): self.learner.n_queried += 1 self.learner.queried.append((inp, tgt)) # add current sample to queried set # update the model coverage self.learner.coverage = self.learner.n_queried / (len(self.learner.init_batch) + i + 1) # if the number of queried samples is larger than the update size if len(self.learner.queried) >= self.learner.update_size: self.learner.update() # update active learner # rebuild model from scratch if self.learner.rebuild: self.learner.model = self.init_model(self.config.hparams) # or incrementally update existing model # reload model from the last best checkpoint if there is a validation set elif self.learner.val_loader is not None: self.learner.model = self.learner.model.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) # update model to use a different learning rate for learning from datastream if self.learner.update_lr is not None and not self.config.exp.tune: self.learner.model.hparams.learning_rate = self.learner.update_lr # increment the number of maximum training 
epochs trainer.max_epochs += self.learner.update_epochs # # find optimal lr # if self.config.exp.tune: # trainer.tune( # model = self.learner.model, # train_dataloader = self.learner.train_loader, # val_dataloaders = self.learner.val_loader # ) # re-fit model to the new trainset trainer.fit( model = self.learner.model, train_dataloader = self.learner.train_loader, val_dataloaders = self.learner.val_loader ) # test fitted model again on test set # self.learner.model = self.init_model(self.config.hparams).load_from_checkpoint(trainer.checkpoint_callback.best_model_path) [metr] = trainer.test( model = self.learner.model, test_dataloaders = self.dm.test_dataloader() ) counts = dict(self.learner._get_counts()) cm = self.learner.model.cm # print(earlystop_callback.best_score) # print(trainer.checkpoint_callback.best_model_path) print(cm) # reset early stopping trainer.should_stop = False earlystop_callback.wait_count = 0 # earlystop_callback.stopped_epoch = 0 # earlystop_callback.based_on_eval_results = False # log test results results['metrics'].append(metr) results['counts'].append(counts) results['cms'].append(cm.tolist()) # TODO: now that we have seen all samples from the stream, do we want to do anything else? # active learning with XGBoost else: # get initial batch X_init, y_init = map(lambda x: torch.cat(x, dim=0).numpy(), zip(self.learner.init_batch[:], self.learner.val_batch[:])) # fit model to initial batch self.learner.model.train(X_init, y_init) # self.model.train(X_init, y_init) # test model and get results X_test, y_test = map(lambda x: x.numpy(), self.dm.kemocon_test[:]) metr, cm = self.learner.model.test(X_test, y_test) counts = dict(self.learner._get_counts()) # save test results results.update({ 'metrics': [metr], 'counts': [counts], 'confmats': [cm.tolist()] }) print(metr) print(cm) ## update xgb parameters before learning from the stream # vars(self.learner.model.hparams.bst).update({ # 'process_type': 'update', # 'updater': 'refresh,prune', # 'refresh_leaf': True # }) # get samples from a stream for i, (inp, tgt) in enumerate(self.learner.datastream): # infer label w = self.learner.infer(inp, use_torch=False) p = 1 / (1 + np.exp(-w)) # query if condition is met if self.learner.query(w, p): self.learner.n_queried += 1 self.learner.queried.append((inp.unsqueeze(0), tgt.unsqueeze(0))) # update model covera self.learner.coverage = self.learner.n_queried / (len(self.learner.init_batch) + i + 1) # if queried the update size number of samples if len(self.learner.queried) >= self.learner.update_size: # update train + val & minority label # & reset queried samples buffer self.learner.update() # update model with queried samples X_train, y_train = self.learner.train_inp.numpy(), self.learner.train_tgt.numpy() # rebuild model from scratch if self.learner.rebuild: self.learner.model = self.init_model(self.config.hparams) self.learner.model.train(X_train, y_train) # or incrementally update existing model else: self.learner.model.train(X_train, y_train, model=self.learner.model.bst) # test updated model metr, cm = self.learner.model.test(X_test, y_test) counts = dict(self.learner._get_counts()) # save results results['metrics'].append(metr) results['counts'].append(counts) results['confmats'].append(cm.tolist()) print(metr) print(cm) return results
def train(model): # create a logger logger = DictLogger() # create folder for each run folder = "models/{}".format(datetime.now().strftime("%b-%d-%H-%M-%S")) if not os.path.exists(folder): os.makedirs(folder) # early stoppping early_stopping_callback = EarlyStopping( monitor='val_loss', # monitor validation loss verbose=True, # log early-stop events patience=patience, min_delta=0.00 # minimum change is 0 ) # update checkpoints based on validation loss by using ModelCheckpoint callback monitoring 'val_loss' checkpoint_callback = ModelCheckpoint(monitor='val_loss') # define trainer trainer = pl.Trainer( default_root_dir= folder, # Lightning automates saving and loading checkpoints max_epochs=epochs, gpus=0, logger=logger, progress_bar_refresh_rate=30, callbacks=[early_stopping_callback, checkpoint_callback]) # train trainer.fit(model=model, train_dataloader=train_loader, val_dataloaders=val_loader) # test result = trainer.test(test_dataloaders=test_loader, verbose=True) # save test result PATH = folder + '/result' with open(PATH, "w") as f: f.write(f"Model: {str(model)}\n") f.write(json.dumps(logger.metrics)) f.write("\n") f.write( f"Lowest training loss: {str(min(logger.metrics['train_loss']))}\n" ) f.write( f"Lowest validation loss: {str(min(logger.metrics['val_loss']))}\n" ) f.write(f"Test loss: {result}") # plot training plt.plot(range(len(logger.metrics['train_loss'])), logger.metrics['train_loss'], lw=2, label='Training Loss') plt.plot(range(len(logger.metrics['val_loss'])), logger.metrics['val_loss'], lw=2, label='Validation Loss') plt.legend() plt.xlabel('Epoch') plt.ylabel('RMSE Loss') plt.savefig(folder + f"/{type(model).__name__}_training_validation_test_loss.png") plt.clf() # plot p loss plt.plot(range(len(logger.metrics['train_p_loss'])), logger.metrics['train_p_loss'], lw=2, label='Training Loss') plt.plot(range(len(logger.metrics['val_p_loss'])), logger.metrics['val_p_loss'], lw=2, label='Validation Loss') plt.legend() plt.xlabel('Epoch') plt.ylabel('RMSE Loss') plt.savefig(folder + f"/p_loss.png") plt.clf() # plot T loss plt.plot(range(len(logger.metrics['train_T_loss'])), logger.metrics['train_T_loss'], lw=2, label='Training Loss') plt.plot(range(len(logger.metrics['val_T_loss'])), logger.metrics['val_T_loss'], lw=2, label='Validation Loss') plt.legend() plt.xlabel('Epoch') plt.ylabel('RMSE Loss') plt.savefig(folder + f"/T_loss.png") plt.clf() # plot T loss plt.plot(range(len(logger.metrics['train_rh_loss'])), logger.metrics['train_rh_loss'], lw=2, label='Training Loss') plt.plot(range(len(logger.metrics['val_rh_loss'])), logger.metrics['val_rh_loss'], lw=2, label='Validation Loss') plt.legend() plt.xlabel('Epoch') plt.ylabel('RMSE Loss') plt.savefig(folder + f"/rh_loss.png") plt.clf() # plot wv loss plt.plot(range(len(logger.metrics['train_wv_loss'])), logger.metrics['train_wv_loss'], lw=2, label='Training Loss') plt.plot(range(len(logger.metrics['val_wv_loss'])), logger.metrics['val_wv_loss'], lw=2, label='Validation Loss') plt.legend() plt.xlabel('Epoch') plt.ylabel('RMSE Loss') plt.savefig(folder + f"/wv_loss.png") plt.clf()
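The five plotting blocks above repeat the same train/validation pattern; if refactoring were desired, a small helper along these lines could replace them (the `plot_pair` name and signature are hypothetical):

```python
import matplotlib.pyplot as plt

def plot_pair(metrics, key, out_path, ylabel='RMSE Loss'):
    """Plot the train/validation curves stored under f'train_{key}' / f'val_{key}'."""
    train_vals = metrics[f'train_{key}']
    val_vals = metrics[f'val_{key}']
    plt.plot(range(len(train_vals)), train_vals, lw=2, label='Training Loss')
    plt.plot(range(len(val_vals)), val_vals, lw=2, label='Validation Loss')
    plt.legend()
    plt.xlabel('Epoch')
    plt.ylabel(ylabel)
    plt.savefig(out_path)
    plt.clf()

# e.g. plot_pair(logger.metrics, 'p_loss', folder + '/p_loss.png')
```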
def train_default_zoobot_from_scratch( # absolutely crucial arguments save_dir, # save model here schema, # answer these questions # input data - specify *either* catalog (to be split) or the splits themselves catalog=None, train_catalog=None, val_catalog=None, test_catalog=None, # model training parameters model_architecture='efficientnet', batch_size=256, epochs=1000, patience=8, # data and augmentation parameters # datamodule_class=GalaxyDataModule, # generic catalog of galaxies, will not download itself. Can replace with any datamodules from pytorch_galaxy_datasets color=False, resize_size=224, crop_scale_bounds=(0.7, 0.8), crop_ratio_bounds=(0.9, 1.1), # hardware parameters accelerator='auto', nodes=1, gpus=2, num_workers=4, prefetch_factor=4, mixed_precision=False, # replication parameters random_state=42, wandb_logger=None): slurm_debugging_logs() pl.seed_everything(random_state) assert save_dir is not None if not os.path.isdir(save_dir): os.mkdir(save_dir) if color: logging.warning( 'Training on color images, not converting to greyscale') channels = 3 else: logging.info('Converting images to greyscale before training') channels = 1 strategy = None if (gpus is not None) and (gpus > 1): # only works as plugins, not strategy # strategy = 'ddp' strategy = DDPPlugin(find_unused_parameters=False) logging.info('Using multi-gpu training') if nodes > 1: assert gpus == 2 logging.info('Using multi-node training') # this hangs silently on Manchester's slurm cluster - perhaps you will have more success? precision = 32 if mixed_precision: logging.info( 'Training with automatic mixed precision. Will reduce memory footprint but may cause training instability for e.g. resnet' ) precision = 16 assert num_workers > 0 if (gpus is not None) and (num_workers * gpus > os.cpu_count()): logging.warning("""num_workers * gpu > num cpu. You may be spawning more dataloader workers than you have cpus, causing bottlenecks. Suggest reducing num_workers.""") if num_workers > os.cpu_count(): logging.warning("""num_workers > num cpu. You may be spawning more dataloader workers than you have cpus, causing bottlenecks. Suggest reducing num_workers.""") if catalog is not None: assert train_catalog is None assert val_catalog is None assert test_catalog is None catalogs_to_use = {'catalog': catalog} else: assert catalog is None catalogs_to_use = { 'train_catalog': train_catalog, 'val_catalog': val_catalog, 'test_catalog': test_catalog } datamodule = GalaxyDataModule( label_cols=schema.label_cols, # can take either a catalog (and split it), or a pre-split catalog **catalogs_to_use, # augmentations parameters album=False, greyscale=not color, resize_size=resize_size, crop_scale_bounds=crop_scale_bounds, crop_ratio_bounds=crop_ratio_bounds, # hardware parameters batch_size= batch_size, # on 2xA100s, 256 with DDP, 512 with distributed (i.e. 
split batch) num_workers=num_workers, prefetch_factor=prefetch_factor) datamodule.setup() get_architecture, representation_dim = select_base_architecture_func_from_name( model_architecture) model = define_model.get_plain_pytorch_zoobot_model( output_dim=len(schema.answers), include_top=True, channels=channels, get_architecture=get_architecture, representation_dim=representation_dim) # This just adds schema.question_index_groups as an arg to the usual (labels, preds) loss arg format # Would use lambda but multi-gpu doesn't support as lambda can't be pickled def loss_func(preds, labels): # pytorch convention is preds, labels return losses.calculate_multiquestion_loss( labels, preds, schema.question_index_groups ) # my and sklearn convention is labels, preds lightning_model = define_model.GenericLightningModule(model, loss_func) callbacks = [ ModelCheckpoint(dirpath=os.path.join(save_dir, 'checkpoints'), monitor="val_loss", save_weights_only=True, mode='min', save_top_k=3), EarlyStopping(monitor='val_loss', patience=patience, check_finite=True) ] trainer = pl.Trainer( log_every_n_steps=3, accelerator=accelerator, gpus=gpus, # per node num_nodes=nodes, strategy=strategy, precision=precision, logger=wandb_logger, callbacks=callbacks, max_epochs=epochs, default_root_dir=save_dir) logging.info((trainer.training_type_plugin, trainer.world_size, trainer.local_rank, trainer.global_rank, trainer.node_rank)) trainer.fit(lightning_model, datamodule) trainer.test( model=lightning_model, datamodule=datamodule, ckpt_path= 'best' # can optionally point to a specific checkpoint here e.g. "/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" )
def ml_mlp_mul_ms(station_name="종로구"): print("Start Multivariate MLP Mean Seasonality Decomposition (MSE) Model") targets = ["PM10", "PM25"] # targets = ["SO2", "CO", "O3", "NO2", "PM10", "PM25", # "temp", "u", "v", "pres", "humid", "prep", "snow"] # 24*14 = 336 #sample_size = 336 sample_size = 48 output_size = 24 # If you want to debug, fast_dev_run = True and n_trials should be small number fast_dev_run = False n_trials = 128 # fast_dev_run = True # n_trials = 1 # Hyper parameter epoch_size = 500 batch_size = 64 learning_rate = 1e-3 # Blocked Cross Validation # neglect small overlap between train_dates and valid_dates # 11y = ((2y, 0.5y), (2y, 0.5y), (2y, 0.5y), (2.5y, 1y)) train_dates = [(dt.datetime(2008, 1, 4, 1).astimezone(SEOULTZ), dt.datetime(2009, 12, 31, 23).astimezone(SEOULTZ)), (dt.datetime(2010, 7, 1, 0).astimezone(SEOULTZ), dt.datetime(2012, 6, 30, 23).astimezone(SEOULTZ)), (dt.datetime(2013, 1, 1, 0).astimezone(SEOULTZ), dt.datetime(2014, 12, 31, 23).astimezone(SEOULTZ)), (dt.datetime(2015, 7, 1, 0).astimezone(SEOULTZ), dt.datetime(2017, 12, 31, 23).astimezone(SEOULTZ))] valid_dates = [(dt.datetime(2010, 1, 1, 0).astimezone(SEOULTZ), dt.datetime(2010, 6, 30, 23).astimezone(SEOULTZ)), (dt.datetime(2012, 7, 1, 0).astimezone(SEOULTZ), dt.datetime(2012, 12, 31, 23).astimezone(SEOULTZ)), (dt.datetime(2015, 1, 1, 0).astimezone(SEOULTZ), dt.datetime(2015, 6, 30, 23).astimezone(SEOULTZ)), (dt.datetime(2018, 1, 1, 0).astimezone(SEOULTZ), dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ))] train_valid_fdate = dt.datetime(2008, 1, 3, 1).astimezone(SEOULTZ) train_valid_tdate = dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ) # Debug if fast_dev_run: train_dates = [(dt.datetime(2015, 7, 1, 0).astimezone(SEOULTZ), dt.datetime(2017, 12, 31, 23).astimezone(SEOULTZ))] valid_dates = [(dt.datetime(2018, 1, 1, 0).astimezone(SEOULTZ), dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ))] train_valid_fdate = dt.datetime(2015, 7, 1, 0).astimezone(SEOULTZ) train_valid_tdate = dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ) test_fdate = dt.datetime(2019, 1, 1, 0).astimezone(SEOULTZ) test_tdate = dt.datetime(2020, 10, 31, 23).astimezone(SEOULTZ) # check date range assumption assert len(train_dates) == len(valid_dates) for i, (td, vd) in enumerate(zip(train_dates, valid_dates)): assert vd[0] > td[1] assert test_fdate > train_dates[-1][1] assert test_fdate > valid_dates[-1][1] train_features = [ "SO2", "CO", "NO2", "PM10", "PM25", "temp", "wind_spd", "wind_cdir", "wind_sdir", "pres", "humid", "prep" ] train_features_periodic = [ "SO2", "CO", "NO2", "PM10", "PM25", "temp", "wind_spd", "wind_cdir", "wind_sdir", "pres", "humid" ] train_features_nonperiodic = ["prep"] for target in targets: print("Training " + target + "...") output_dir = Path( f"/mnt/data/MLPMSMultivariate/{station_name}/{target}/") Path.mkdir(output_dir, parents=True, exist_ok=True) model_dir = output_dir / "models" Path.mkdir(model_dir, parents=True, exist_ok=True) log_dir = output_dir / "log" Path.mkdir(log_dir, parents=True, exist_ok=True) _df_h = data.load_imputed(HOURLY_DATA_PATH) df_h = _df_h.query('stationCode == "' + str(SEOUL_STATIONS[station_name]) + '"') if station_name == '종로구' and \ not Path("/input/python/input_jongno_imputed_hourly_pandas.csv").is_file(): # load imputed result df_h.to_csv("/input/python/input_jongno_imputed_hourly_pandas.csv") # construct dataset for seasonality print("Construct Train/Validation Sets...", flush=True) train_valid_dataset = construct_dataset(train_valid_fdate, train_valid_tdate, 
filepath=HOURLY_DATA_PATH, station_name=station_name, target=target, sample_size=sample_size, output_size=output_size, transform=False) # compute seasonality train_valid_dataset.preprocess() # For Block Cross Validation.. # load dataset in given range dates and transform using scaler from train_valid_set # all dataset are saved in tuple print("Construct Training Sets...", flush=True) train_datasets = tuple( construct_dataset(td[0], td[1], scaler_X=train_valid_dataset.scaler_X, scaler_Y=train_valid_dataset.scaler_Y, filepath=HOURLY_DATA_PATH, station_name=station_name, target=target, sample_size=sample_size, output_size=output_size, features=train_features, features_periodic=train_features_periodic, features_nonperiodic=train_features_nonperiodic, transform=True) for td in train_dates) print("Construct Validation Sets...", flush=True) valid_datasets = tuple( construct_dataset(vd[0], vd[1], scaler_X=train_valid_dataset.scaler_X, scaler_Y=train_valid_dataset.scaler_Y, filepath=HOURLY_DATA_PATH, station_name=station_name, target=target, sample_size=sample_size, output_size=output_size, features=train_features, features_periodic=train_features_periodic, features_nonperiodic=train_features_nonperiodic, transform=True) for vd in valid_dates) # just single test set print("Construct Test Sets...", flush=True) test_dataset = construct_dataset( test_fdate, test_tdate, scaler_X=train_valid_dataset.scaler_X, scaler_Y=train_valid_dataset.scaler_Y, filepath=HOURLY_DATA_PATH, station_name=station_name, target=target, sample_size=sample_size, output_size=output_size, features=train_features, features_periodic=train_features_periodic, features_nonperiodic=train_features_nonperiodic, transform=True) # convert tuple of datasets to ConcatDataset train_dataset = ConcatDataset(train_datasets) val_dataset = ConcatDataset(valid_datasets) # num_layer == number of hidden layer hparams = Namespace(num_layers=1, layer_size=128, learning_rate=learning_rate, batch_size=batch_size) def objective(trial): model = BaseMLPModel( trial=trial, hparams=hparams, input_size=sample_size * len(train_features), sample_size=sample_size, output_size=output_size, station_name=station_name, target=target, features=train_features, features_periodic=train_features_periodic, features_nonperiodic=train_features_nonperiodic, train_dataset=train_dataset, val_dataset=val_dataset, test_dataset=test_dataset, scaler_X=train_valid_dataset.scaler_X, scaler_Y=train_valid_dataset.scaler_Y, output_dir=output_dir) # most basic trainer, uses good defaults trainer = Trainer(gpus=1 if torch.cuda.is_available() else None, precision=32, min_epochs=1, max_epochs=20, default_root_dir=output_dir, fast_dev_run=fast_dev_run, logger=True, checkpoint_callback=False, callbacks=[ PyTorchLightningPruningCallback( trial, monitor="valid/MSE") ]) trainer.fit(model) # Don't Log # hyperparameters = model.hparams # trainer.logger.log_hyperparams(hyperparameters) return trainer.callback_metrics.get("valid/MSE") if n_trials > 1: study = optuna.create_study(direction="minimize") study.enqueue_trial({ 'sigma': 1.3, 'num_layers': 4, 'layer_size': 8, 'learning_rate': learning_rate, 'batch_size': batch_size }) study.enqueue_trial({ 'sigma': 1.3, 'num_layers': 4, 'layer_size': 32, 'learning_rate': learning_rate, 'batch_size': batch_size }) study.enqueue_trial({ 'sigma': 1.3, 'num_layers': 4, 'layer_size': 64, 'learning_rate': learning_rate, 'batch_size': batch_size }) study.enqueue_trial({ 'sigma': 1.3, 'num_layers': 4, 'layer_size': 32, 'learning_rate': learning_rate, 
'batch_size': batch_size }) study.enqueue_trial({ 'sigma': 1.3, 'num_layers': 8, 'layer_size': 32, 'learning_rate': learning_rate, 'batch_size': batch_size }) study.enqueue_trial({ 'sigma': 1.3, 'num_layers': 12, 'layer_size': 32, 'learning_rate': learning_rate, 'batch_size': batch_size }) study.enqueue_trial({ 'sigma': 0.7, 'num_layers': 4, 'layer_size': 32, 'learning_rate': learning_rate, 'batch_size': batch_size }) study.enqueue_trial({ 'sigma': 2.0, 'num_layers': 4, 'layer_size': 32, 'learning_rate': learning_rate, 'batch_size': batch_size }) # timeout = 3600*36 = 36h study.optimize(objective, n_trials=n_trials, timeout=3600 * 36) trial = study.best_trial print(" Value: ", trial.value) print(" Params: ") for key, value in trial.params.items(): print(" {}: {}".format(key, value)) print("sample_size : ", sample_size) print("output_size : ", output_size) # plot optmization results fig_cont1 = optv.plot_contour(study, params=['num_layers', 'layer_size']) fig_cont1.write_image( str(output_dir / "contour_num_layers_layer_size.png")) fig_cont1.write_image( str(output_dir / "contour_num_layers_layer_size.svg")) fig_edf = optv.plot_edf(study) fig_edf.write_image(str(output_dir / "edf.png")) fig_edf.write_image(str(output_dir / "edf.svg")) fig_iv = optv.plot_intermediate_values(study) fig_iv.write_image(str(output_dir / "intermediate_values.png")) fig_iv.write_image(str(output_dir / "intermediate_values.svg")) fig_his = optv.plot_optimization_history(study) fig_his.write_image(str(output_dir / "opt_history.png")) fig_his.write_image(str(output_dir / "opt_history.svg")) fig_pcoord = optv.plot_parallel_coordinate( study, params=['num_layers', 'layer_size']) fig_pcoord.write_image(str(output_dir / "parallel_coord.png")) fig_pcoord.write_image(str(output_dir / "parallel_coord.svg")) fig_slice = optv.plot_slice(study, params=['num_layers', 'layer_size']) fig_slice.write_image(str(output_dir / "slice.png")) fig_slice.write_image(str(output_dir / "slice.svg")) # set hparams with optmized value hparams.num_layers = trial.params['num_layers'] hparams.layer_size = trial.params['layer_size'] dict_hparams = copy.copy(vars(hparams)) dict_hparams["sample_size"] = sample_size dict_hparams["output_size"] = output_size with open(output_dir / 'hparams.json', 'w') as f: print(dict_hparams, file=f) with open(output_dir / 'hparams.csv', 'w') as f: print(pd.DataFrame.from_dict(dict_hparams, orient='index'), file=f) model = BaseMLPModel(hparams=hparams, input_size=sample_size * len(train_features), sample_size=sample_size, output_size=output_size, station_name=station_name, target=target, features=train_features, features_periodic=train_features_periodic, features_nonperiodic=train_features_nonperiodic, train_dataset=train_dataset, val_dataset=val_dataset, test_dataset=test_dataset, scaler_X=train_valid_dataset.scaler_X, scaler_Y=train_valid_dataset.scaler_Y, output_dir=output_dir) # record input for i, _train_set in enumerate(train_datasets): _train_set.to_csv( model.data_dir / ("df_trainset_{0}_".format(str(i).zfill(2)) + target + ".csv")) for i, _valid_set in enumerate(valid_datasets): _valid_set.to_csv( model.data_dir / ("df_validset_{0}_".format(str(i).zfill(2)) + target + ".csv")) train_valid_dataset.to_csv(model.data_dir / ("df_trainvalidset_" + target + ".csv")) test_dataset.to_csv(model.data_dir / ("df_testset_" + target + ".csv")) checkpoint_callback = pl.callbacks.ModelCheckpoint(os.path.join( model_dir, "train_{epoch}_{valid/MSE:.2f}"), monitor="valid/MSE", period=10) early_stop_callback = 
EarlyStopping(monitor='valid/MSE', min_delta=0.001, patience=30, verbose=True, mode='min') log_version = dt.date.today().strftime("%y%m%d-%H-%M") loggers = [ \ TensorBoardLogger(log_dir, version=log_version), CSVLogger(log_dir, version=log_version)] # most basic trainer, uses good defaults trainer = Trainer(gpus=1 if torch.cuda.is_available() else None, precision=32, min_epochs=1, max_epochs=epoch_size, default_root_dir=output_dir, fast_dev_run=fast_dev_run, logger=loggers, log_every_n_steps=5, flush_logs_every_n_steps=10, callbacks=[early_stop_callback], checkpoint_callback=checkpoint_callback) trainer.fit(model) # run test set trainer.test(ckpt_path=None) shutil.rmtree(model_dir)
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
from main import DistillBart
from preprocessing import load_multilingual_dataset
import pytorch_lightning as pl
import os

if __name__ == "__main__":
    # trainer = pl.Trainer(gpus=None)
    trainer = pl.Trainer(
        gpus=-1,
        callbacks=[
            EarlyStopping(monitor="val_loss"),
            ModelCheckpoint(
                dirpath="./drive/MyDrive/mlbart_ckpt",
                monitor="val_loss",
                filename="paraphrase_mlbart_{epoch:02d}-{val_loss:.2f}",
                save_top_k=-1,
                mode="min",
            ),
        ],
        progress_bar_refresh_rate=20,
    )
    train_dataloader, validation_dataloader = load_multilingual_dataset(
        dataset_path=f"{os.getcwd()}/drive/MyDrive/dataset", batch_size=4)
    model = DistillBart(9, 3)
    trainer.fit(model,
                train_dataloader=train_dataloader,
                val_dataloaders=validation_dataloader)
hparams['bands'] = len(util.get_wavelengths_for(opt.camera_type))
hparams['augmentation_config'] = AUGMENTATION_CONFIG
hparams['test_augmentation'] = True
print("Hparams: %s" % hparams)

model = DeepHsAblationStudyModule(hparams)
logger = WandbLogger(hparams['git_id'],
                     offline=not opt.online_logging,
                     save_dir=opt.log_path,
                     project='deephs')
early_stop_callback = EarlyStopping(monitor='val_loss',
                                    min_delta=0.00,
                                    verbose=True,
                                    mode='min',
                                    patience=20)
checkpoint_callback = ModelCheckpoint(filepath='best.ckpt',
                                      save_top_k=1,
                                      verbose=True,
                                      monitor='val_loss',
                                      mode='min')
trainer = lightning.Trainer(max_epochs=opt.num_epochs,
                            gpus=-1,
                            logger=logger,
                            early_stop_callback=early_stop_callback,
                            min_epochs=50,
                            checkpoint_callback=checkpoint_callback,
def finetune(self, dataset, validation_split: float = 0.15, epochs: int = 20, batch_size: int = None, optimal_batch_size: int = None, early_stopping: bool = True, trainer=None): self.batch_size = batch_size or 1 if not torch.cuda.is_available(): raise Exception( "You need a cuda capable (Nvidia) GPU for finetuning") len_train = int(len(dataset) * (1 - validation_split)) len_valid = len(dataset) - len_train dataset_train, dataset_valid = torch.utils.data.random_split( dataset, [len_train, len_valid]) self.dataset_train = dataset_train self.dataset_valid = dataset_valid if batch_size == None: # Find batch size temp_trainer = pl.Trainer(auto_scale_batch_size="power", gpus=-1) print("Finding the optimal batch size...") temp_trainer.tune(self) # Ensure that memory gets cleared del self.trainer del temp_trainer garbage_collection_cuda() trainer_kwargs = {} if optimal_batch_size: # Don't go over batch_size = min(self.batch_size, optimal_batch_size) accumulate_grad_batches = max(1, int(optimal_batch_size / batch_size)) trainer_kwargs["accumulate_grad_batches"] = accumulate_grad_batches if early_stopping: # Stop when val loss stops improving early_stopping = EarlyStopping(monitor="val_loss", patience=1) trainer_kwargs["callbacks"] = [early_stopping] if not trainer: trainer = pl.Trainer(gpus=-1, max_epochs=epochs, checkpoint_callback=False, logger=False, **trainer_kwargs) self.model.train() trainer.fit(self) del self.dataset_train del self.dataset_valid del self.trainer # For some reason the model can end up on CPU after training self.to(self._model_device) self.model.eval() print( "Training finished! Save your model for later with backprop.save or upload it with backprop.upload" )
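The gradient-accumulation logic above aims for an effective batch size close to `optimal_batch_size`; a quick check of that arithmetic, assuming the tuner settled on a per-step batch size of 16:

```python
optimal_batch_size = 128   # desired effective batch size
batch_size = 16            # assumed result of the batch-size tuner

accumulate_grad_batches = max(1, int(optimal_batch_size / batch_size))  # -> 8
effective_batch_size = batch_size * accumulate_grad_batches             # -> 128
print(accumulate_grad_batches, effective_batch_size)
```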
f.write(f'{model.test_preds[i]}\n\n') if __name__ == "__main__": # set random seed pl.seed_everything(42) parser = ArgumentParser() parser = pl.Trainer.add_argparse_args(parser) parser.add_argument("--savename", type=str, default='no_name') parser.add_argument('--checkpoint', type=str, default='') parser.add_argument('--dataset', type=str, default='WQ') parser.add_argument('--batch_size', type=int, default=2) parser.add_argument('--pre_trained', type=str, default='t5', help='t5 or bart') args = parser.parse_args() # Define trainer tb_logger = pl_loggers.TensorBoardLogger('logs/') trainer = pl.Trainer.from_argparse_args( args, # max_epochs, gpus logger=tb_logger, callbacks=[EarlyStopping(monitor='val_loss')] ) kgqg = KGQGDataModule('data/' + args.dataset, batch_size=args.batch_size, pre_trained=args.pre_trained) model = KGQGTuner.load_from_checkpoint(args.checkpoint, datamodule=kgqg, pre_trained=args.pre_trained) trainer.test(model=model, datamodule=kgqg) write_test_files(model, kgqg, name=args.savename)
def train( self, train_df: pd.DataFrame, test_df: pd.DataFrame, source_max_token_len: int = 512, target_max_token_len: int = 512, batch_size: int = 8, max_epochs: int = 5, use_gpu: bool = True, outputdir: str = "outputs", early_stopping_patience_epochs: int = 0, # 0 to disable early stopping feature test_split=0.1, tpu_cores=None, ): """ trains T5 model on custom dataset Args: data_df (pd.DataFrame): training datarame. Dataframe must have 2 column --> "keywords" and "text" source_max_token_len (int, optional): max token length of source text. Defaults to 512. target_max_token_len (int, optional): max token length of target text. Defaults to 512. batch_size (int, optional): batch size. Defaults to 8. max_epochs (int, optional): max number of epochs. Defaults to 5. use_gpu (bool, optional): if True, model uses gpu for training. Defaults to True. outputdir (str, optional): output directory to save model checkpoints. Defaults to "outputs". early_stopping_patience_epochs (int, optional): monitors val_loss on epoch end and stops training, if val_loss does not improve after the specied number of epochs. set 0 to disable early stopping. Defaults to 0 (disabled) :param test_df: :param train_df: """ self.target_max_token_len = target_max_token_len self.max_epoch = max_epochs self.train_df = train_df self.test_df = test_df self.data_module = PLDataModule( train_df=train_df, test_df=test_df, tokenizer=self.tokenizer, batch_size=batch_size, source_max_token_len=source_max_token_len, target_max_token_len=target_max_token_len, split=test_split, ) self.T5Model = LightningModel(tokenizer=self.tokenizer, model=self.model, output=outputdir) logger = WandbLogger(project="keytotext") early_stop_callback = ([ EarlyStopping( monitor="val_loss", min_delta=0.00, patience=early_stopping_patience_epochs, verbose=True, mode="min", ) ] if early_stopping_patience_epochs > 0 else None) gpus = -1 if use_gpu else 0 trainer = Trainer(logger=logger, callbacks=early_stop_callback, max_epochs=max_epochs, gpus=gpus, progress_bar_refresh_rate=5, tpu_cores=tpu_cores) trainer.fit(self.T5Model, self.data_module)
    'precision': 16,
    'subset': 0.1,
    'test_size': 0.2,
    'seed': 42,
    'size': 256,
    'backbone': 'resnet18',
    'val_batches': 10
}

dm = DataModule(**config)
model = Resnet(config)

wandb_logger = WandbLogger(project="cassava", config=config)
es = EarlyStopping(monitor='val_acc', mode='max', patience=3)
checkpoint = ModelCheckpoint(
    dirpath='./',
    filename=f'{config["backbone"]}-{config["size"]}-{{val_acc:.5f}}',
    save_top_k=1,
    monitor='val_acc',
    mode='max')

trainer = pl.Trainer(gpus=1,
                     precision=config['precision'],
                     logger=wandb_logger,
                     max_epochs=config['max_epochs'],
                     callbacks=[es, checkpoint],
                     limit_val_batches=config['val_batches'])
trainer.fit(model, dm)
                     checkpoint_callback=False,
                     logger=False)

checkpoint_callback = ModelCheckpoint(
    filepath=os.getcwd(),
    save_top_k=2,
    verbose=True,
    monitor="val/loss",
    mode="min",
)

experiment_name = ...
PROJECT_NAME = ...
logger = WandbLogger(name=experiment_name, project=PROJECT_NAME)

# And then actual training
pl.seed_everything(42)
trainer = Trainer(
    max_epochs=40,
    logger=logger,
    gpus=1,
    # precision=16,
    deterministic=True,
    accumulate_grad_batches=2,
    callbacks=[EarlyStopping(monitor="val/loss")],
    # resume_from_checkpoint = 'my_checkpoint.ckpt'
)
trainer.fit(model, dm)
def _body(self, pid=None): # init model self.model = self.init_model(self.config.hparams) # setup datamodule self.dm.setup(stage=None, test_id=pid) # init training with pl.LightningModule models if self.config.trainer is not None: # init logger if self.config.logger is not None: logger = self.init_logger(pid) # init lr monitor and callbacks callbacks = list() if self.config.hparams.scheduler is not None: callbacks.append(LearningRateMonitor(logging_interval='epoch')) # init early stopping if self.config.early_stop is not None: callbacks.append(EarlyStopping(**vars(self.config.early_stop))) # make trainer trainer_args = vars(self.config.trainer) trainer_args.update({ 'logger': logger, 'callbacks': callbacks, 'auto_lr_find': True if self.config.exp.tune else False }) trainer = pl.Trainer(**trainer_args) # find optimal lr if self.config.exp.tune: trainer.tune(self.model, datamodule=self.dm) # train model trainer.fit(self.model, self.dm) # test model and get results [results] = trainer.test(self.model) # return metrics and confusion matrix metr = { 'pid': pid, 'acc': results['test_acc'], 'ap': results['test_ap'], 'f1': results['test_f1'], 'auroc': results['test_auroc'], 'num_epochs': self.model.current_epoch, } cm = self.model.cm else: # train model: concat train and valid inputs and labels and convert torch tensors to numpy arrays X_train, y_train = map(lambda x: torch.cat(x, dim=0).numpy(), zip(self.dm.kemocon_train[:], self.dm.kemocon_val[:])) self.model.train(X_train, y_train) # test model X_test, y_test = map(lambda x: x.numpy(), self.dm.kemocon_test[:]) metr, cm = self.model.test(X_test, y_test) return metr, cm
def cli_main(): parser = ArgumentParser() parser.add_argument("--DATA_PATH", type=str, help="path to folders with images") parser.add_argument("--MODEL_PATH", default=None, type=str, help="path to model checkpoint.") parser.add_argument("--encoder", default=None, type=str, help="encoder for model found in encoders.py") parser.add_argument("--batch_size", default=128, type=int, help="batch size for SSL") parser.add_argument("--num_workers", default=0, type=int, help="number of workers to use to fetch data") parser.add_argument( "--hidden_dims", default=128, type=int, help= "hidden dimensions in classification layer added onto model for finetuning" ) parser.add_argument("--epochs", default=200, type=int, help="number of epochs to train model") parser.add_argument("--lr", default=1e-3, type=float, help="learning rate for training model") parser.add_argument( "--patience", default=-1, type=int, help= "automatically cuts off training if validation does not drop for (patience) epochs. Leave blank to have no validation based early stopping." ) parser.add_argument("--val_split", default=0.2, type=float, help="percent in validation data") parser.add_argument( "--withhold_split", default=0, type=float, help= "decimal from 0-1 representing how much of the training data to withold from either training or validation" ) parser.add_argument("--gpus", default=1, type=int, help="number of gpus to use for training") parser.add_argument( "--eval", default=True, type=bool, help= "Eval Mode will train and evaluate the finetuned model's performance") parser.add_argument( "--pretrain_encoder", default=False, type=bool, help= "initialize resnet encoder with pretrained imagenet weights. Ignored if MODEL_PATH is specified." ) parser.add_argument("--version", default="0", type=str, help="version to name checkpoint for saving") args = parser.parse_args() DATA_PATH = args.DATA_PATH batch_size = args.batch_size num_workers = args.num_workers hidden_dims = args.hidden_dims epochs = args.epochs lr = args.lr patience = args.patience val_split = args.val_split withhold = args.withhold_split version = args.version MODEL_PATH = args.MODEL_PATH gpus = args.gpus eval_model = args.eval version = args.version pretrain = args.pretrain_encoder encoder = args.encoder model = sslSIMCLR(encoder=encoder, epochs=epochs, pretrained=pretrain, MODEL_PATH=MODEL_PATH, DATA_PATH=DATA_PATH, withhold=withhold, batch_size=batch_size, val_split=val_split, hidden_dims=hidden_dims, train_transform=SimCLRTrainDataTransform, val_transform=SimCLRTrainDataTransform, num_workers=num_workers) online_evaluator = SSLOnlineEvaluator(drop_p=0., hidden_dim=None, z_dim=model.embedding_size, num_classes=26, dataset='None') if patience > 0: cb = EarlyStopping('val_loss', patience=patience) trainer = Trainer(gpus=gpus, max_epochs=epochs, callbacks=[cb, online_evaluator], progress_bar_refresh_rate=5) else: trainer = Trainer(gpus=gpus, max_epochs=epochs, callbacks=[online_evaluator], progress_bar_refresh_rate=5) trainer.fit(model) Path(f"./models/SSL/SIMCLR_SSL_{version}").mkdir(parents=True, exist_ok=True) torch.save(model.encoder.state_dict(), f"./models/SSL/SIMCLR_SSL_{version}/SIMCLR_SSL_{version}.pt")
def train(debug=False, use_hdr=True, normalize=False, n_points=1280, num_workers=16, batch_size=32): """Train PointAR model Parameters ---------- debug : bool Set debugging flag use_hdr : bool Use HDR SH coefficients data for training normalize : bool Normalize SH coefficients n_points : int Number of model input points, default 1280 num_workers : int Number of workers for loading data, default 16 batch_size : int Training batch size """ # Specify dataset TestDataset = PointARTestDataset TrainDataset = TestDataset if debug else PointARTrainDataset # Get loaders ready loader_param = {'use_hdr': use_hdr} loaders, scaler = train_valid_test_split(TrainDataset, loader_param, TestDataset, loader_param, normalize=normalize, num_workers=num_workers, batch_size=batch_size) train_loader, valid_loader, test_loader = loaders # Get model ready model = PointAR( hparams={ 'n_shc': 27, 'n_points': n_points, 'min': torch.from_numpy(scaler.min_) if normalize else torch.zeros((27)), 'scale': torch.from_numpy(scaler.scale_) if normalize else torch.ones((27)) }) # Train sample_input = (torch.zeros( (1, 3, n_points)).float().cuda(), torch.zeros( (1, 3, n_points)).float().cuda()) trainer = pl.Trainer(gpus=1, check_val_every_n_epoch=1, callbacks=[ ModelSavingCallback(sample_input=sample_input), EarlyStopping(monitor='valid_shc_mse') ]) # Start training trainer.fit(model, train_dataloader=train_loader, val_dataloaders=[valid_loader, test_loader])
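Given the documented parameters, a short debugging run might be invoked as below; the argument values are illustrative rather than recommended settings.

```python
# Short debugging run on the test dataset with a smaller batch size;
# the keyword values are illustrative, not recommended settings.
train(debug=True, use_hdr=True, normalize=False,
      n_points=1280, num_workers=4, batch_size=8)
```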
wandb_logger = WandbLogger(project='recommender-xai', tags=['vae', train_tag], name=wandb_name) trainer = pl.Trainer.from_argparse_args( args, # limit_test_batches=0.1, # precision =16, logger=wandb_logger, # False gradient_clip_val=0.5, # accumulate_grad_batches=0, gpus=0, weights_summary='full', checkpoint_callback=False, callbacks=[ ProgressBar(), EarlyStopping(monitor='train_loss') ]) if (train): print( '<---------------------------------- VAE Training ---------------------------------->' ) print( "Running with the following configuration: \n{}" .format(args)) if (synthetic_data): model_params['synthetic_data'], model_params[ 'syn_y'] = data_utils.create_synthetic_data( no_generative_factors, experiment_path, expanded_user_item, continous_data, normalvariate, noise)
def train(args): config = json.loads(_jsonnet.evaluate_file(args["CONFIG_FILE"])) if args["--extra-config"]: extra_config = args["--extra-config"] extra_config = json.loads(extra_config) config = util.update(config, extra_config) # dataloaders batch_size = config["train"]["batch_size"] train_set = Dataset( config["data"]["train_file"], config["data"], percent=float(args["--percent"]), ) dev_set = Dataset(config["data"]["dev_file"], config["data"]) train_loader = DataLoader( train_set, batch_size=batch_size, collate_fn=Dataset.collate_fn, num_workers=16, pin_memory=True, ) val_loader = DataLoader( dev_set, batch_size=batch_size, collate_fn=Dataset.collate_fn, num_workers=8, pin_memory=True, ) # model model = TypeReconstructionModel(config) wandb_logger = WandbLogger(name=args["--expname"], project="dire", log_model=True) wandb_logger.log_hyperparams(config) resume_from_checkpoint = (args["--eval-ckpt"] if args["--eval-ckpt"] else args["--resume"]) if resume_from_checkpoint == "": resume_from_checkpoint = None trainer = pl.Trainer( max_epochs=config["train"]["max_epoch"], logger=wandb_logger, gpus=1 if args["--cuda"] else None, auto_select_gpus=True, gradient_clip_val=1, callbacks=[ EarlyStopping( monitor="val_retype_acc" if config["data"]["retype"] else "val_rename_acc", mode="max", patience=config["train"]["patience"], ) ], check_val_every_n_epoch=config["train"]["check_val_every_n_epoch"], progress_bar_refresh_rate=10, accumulate_grad_batches=config["train"]["grad_accum_step"], resume_from_checkpoint=resume_from_checkpoint, ) if args["--eval-ckpt"]: # HACK: necessary to make pl test work for IterableDataset Dataset.__len__ = lambda self: 1000000 test_set = Dataset(config["data"]["test_file"], config["data"]) test_loader = DataLoader( test_set, batch_size=config["test"]["batch_size"], collate_fn=Dataset.collate_fn, num_workers=8, pin_memory=True, ) trainer.test(model, test_dataloaders=test_loader, ckpt_path=args["--eval-ckpt"]) else: trainer.fit(model, train_loader, val_loader)
def main(model_name, seed, group, save_path, save_images, baseline): """ :return: """ save_path = os.path.join(save_path, "experiments", group) os.makedirs(save_path, exist_ok=True) hparams = get_params(model_name) configuration_dict = get_configuration(model_name, hparams) # setup wandb pipeline wandb_logger = WandbLogger( name="{}-{}-{}".format(group, model_name, seed), save_dir=save_path, project=PROJECT, group=group, tags=group, ) train, valid, test = get_loaders(hparams, configuration_dict) model_module = importlib.import_module( "src.segmentation.models.{}.model".format(model_name)) model = model_module.Model(hparams) model.configuration = configuration_dict early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.00, patience=30, verbose=False, mode="min") # Pytorch lightning trainer trainer = Trainer( gpus=1, weights_summary="top", max_epochs=50, logger=wandb_logger, early_stop_callback=early_stop_callback, num_sanity_val_steps=0, callbacks=[LearningRateLogger()] if hparams["scheduler_type"] != None else None, default_root_dir=save_path, ) trainer.fit(model, train_dataloader=train, val_dataloaders=valid) del model torch.cuda.empty_cache() save_path = os.path.join(save_path, PROJECT, wandb_logger.__getstate__()["_id"]) model = load_best(model_module, configuration_dict, save_path) scores = get_results(model, valid, wandb_logger, save_path, save_images, baseline) save_metrics(scores, save_path) move_best(save_path, group)
def main():
    TRAIN_BATCH_SIZE = 32
    VAL_BATCH_SIZE = 32
    TEST_BATCH_SIZE = 32
    LEARNING_RATE = 1e-5
    MAX_EPOCHS = 20
    WEIGHT_DECAY = 0.0

    #DATA_PATH = "/bigtemp/rm5tx/nlp_project/data_cache/"
    DATA_PATH = os.path.expanduser("~/data_cache/")
    # DATA_NORM_PATH = os.path.expanduser("~/data_cache/")
    # DATA_ADJACENT_PATH = os.path.expanduser("~/data_adjacent_cache/")
    MAX_LEN = 128
    ADJACENT = True
    ADJRAT = 0.23
    ADJTOT = 2
    RATIO = 1

    data = ProjData(max_len=MAX_LEN,
                    ratio=RATIO,
                    adjacent=ADJACENT,
                    adjrat=ADJRAT,
                    adjtot=ADJTOT)

    if ADJACENT:
        # DATA_PATH = DATA_ADJACENT_PATH
        MODEL_NAME = 'nlp_proj_adjacent' + str(MAX_LEN)
    else:
        # DATA_PATH = DATA_NORM_PATH
        MODEL_NAME = 'nlp_proj_norm' + str(MAX_LEN)

    try:
        data.load(DATA_PATH)
        print("Loaded Saved Data")
    except Exception as e:
        print(e)
        data.setup()
        data.save(DATA_PATH)
    ### Comment out the try block and uncomment below while you're working on the data part
    ### or you'll just skip it and use old data.
    # data.setup()
    # data.save(DATA_PATH)

    model = ProjModel(learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    logger = TensorBoardLogger(os.path.expanduser("~/tb_logs"), name=MODEL_NAME)
    checkpoint_callback = ModelCheckpoint(
        monitor='valid_loss',
        dirpath=os.path.expanduser("~/saved_models"),
        save_last=True,
        filename=MODEL_NAME + '-{epoch:02d}-{avg_acc:.2f}',
    )
    # avg_acc is an accuracy, so early stopping must maximise it;
    # the default mode ('min') would stop as soon as accuracy improves.
    earlystopping = EarlyStopping(monitor='avg_acc', verbose=True, patience=0, mode='max')

    trainer = pl.Trainer(
        logger=logger,
        accelerator='ddp',  # jupyter can't use ddp, use dp instead
        # effective batch size is batch_size * num_gpus * num_nodes
        gpus=1,
        gradient_clip_val=1.0,
        max_epochs=MAX_EPOCHS,
        fast_dev_run=False,
        callbacks=[checkpoint_callback, earlystopping])
    trainer.fit(model,
                data.train_dataloader(batch_size=TRAIN_BATCH_SIZE),
                data.val_dataloader(batch_size=VAL_BATCH_SIZE))
    filtered = []
    for token in decoded_sentence_list:
        if token not in special_tokens:
            filtered.append(token)
    return " ".join(filtered)


class MyEarlyStopping(EarlyStopping):
    def on_validation_end(self, trainer, pl_module):
        # only start the early-stopping check once warmup has finished
        if pl_module.iter > pl_module.args.num_warmup_steps:
            self._run_early_stopping_check(trainer, pl_module)


early_stop_callback = EarlyStopping(monitor='val_qa_decode_Bleu_4',
                                    min_delta=0.00,
                                    patience=10,
                                    verbose=True,
                                    mode='max')

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--hidden_dim",
                        type=int,
                        default=768,
                        help="Hidden dimensionality of the model")
    parser.add_argument("--latent_dim",
                        type=int,
                        default=768,
                        help="Dimensionality of the latent space")
    parser.add_argument("--lr",
                        type=float,
def optimize(trial: optuna.Trial, data_dict):
    gts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=10)
    input_size = data_dict['data'].shape[-1]
    output_size = 5
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        os.path.join('models/', "trial_resnet_{}".format(trial.number)),
        monitor="val_auc",
        mode='max')
    logger = MetricsCallback()
    metrics = []
    sizes = []
    # trial_file = 'HPO/nn_hpo_2021-01-05.pkl'
    trial_file = None
    p = create_param_dict(trial, trial_file)
    p['batch_size'] = trial.suggest_int('batch_size', 8000, 15000)
    for i, (train_idx, val_idx) in enumerate(
            gts.split(data_dict['data'], groups=data_dict['date'])):
        idx = np.concatenate([train_idx, val_idx])
        data = copy.deepcopy(data_dict['data'][idx])
        target = copy.deepcopy(data_dict['target'][idx])
        date = copy.deepcopy(data_dict['date'][idx])
        train_idx = [i for i in range(0, max(train_idx) + 1)]
        val_idx = [i for i in range(len(train_idx), len(idx))]
        data[train_idx] = calc_data_mean(data[train_idx],
                                         './cache',
                                         train=True,
                                         mode='mean')
        data[val_idx] = calc_data_mean(data[val_idx],
                                       './cache',
                                       train=False,
                                       mode='mean')
        model = Classifier(input_size, output_size, params=p)
        # model.apply(init_weights)
        dataset = FinData(data=data, target=target, date=date, multi=True)
        dataloaders = create_dataloaders(dataset,
                                         indexes={
                                             'train': train_idx,
                                             'val': val_idx
                                         },
                                         batch_size=p['batch_size'])
        es = EarlyStopping(monitor='val_loss',
                           patience=10,
                           min_delta=0.0005,
                           mode='min')
        trainer = pl.Trainer(logger=False,
                             max_epochs=500,
                             gpus=1,
                             callbacks=[
                                 checkpoint_callback, logger,
                                 PyTorchLightningPruningCallback(
                                     trial, monitor='val_loss'), es
                             ],
                             precision=16)
        trainer.fit(model,
                    train_dataloader=dataloaders['train'],
                    val_dataloaders=dataloaders['val'])
        val_loss = logger.metrics[-1]['val_loss'].item()
        metrics.append(val_loss)
        sizes.append(len(train_idx))
    metrics_mean = weighted_mean(metrics, sizes)
    return metrics_mean
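
# A minimal sketch (not part of the original) of how the objective above might
# be handed to Optuna. The study direction matches the weighted mean validation
# loss that optimize() returns; the pruner choice and trial count are
# illustrative assumptions.
def run_study(data_dict, n_trials=50):
    study = optuna.create_study(direction='minimize',
                                pruner=optuna.pruners.MedianPruner())
    study.optimize(lambda trial: optimize(trial, data_dict), n_trials=n_trials)
    return study.best_params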
                    type=str,
                    default='t5',
                    help='t5 or bart')

# add all the available trainer options to argparse
parser = pl.Trainer.add_argparse_args(parser)
args = parser.parse_args()

# Define trainer
tb_logger = pl_loggers.TensorBoardLogger(args.logdir + '/')
trainer = pl.Trainer.from_argparse_args(
    args,  # max_epochs, gpus
    logger=tb_logger,
    callbacks=[
        EarlyStopping(monitor='bleu_score',
                      verbose=True,
                      mode='max',
                      patience=5)
    ])

# Load data and model
kgqg = KGQGDataModule('data/' + args.dataset,
                      batch_size=args.batch_size,
                      pre_trained=args.pre_trained)
model = KGQGTuner(kgqg,
                  learning_rate=args.learning_rate,
                  batch_size=args.batch_size,
                  optimizer=args.optimizer,
                  dataset=args.dataset,
                  pre_trained=args.pre_trained)

# Fit model
#     'train_batch_size': 64,  # configurable
#     'eval_batch_size': 64,  # configurable
# })
args = argparse.Namespace(**args_dict)

checkpoint_callback = pl.callbacks.ModelCheckpoint(filepath=args.output_dir,
                                                   prefix="checkpoint",
                                                   monitor="val_loss",
                                                   mode="min",
                                                   save_top_k=5)
early_stop_callback = EarlyStopping(monitor='val_loss',
                                    min_delta=0.00,
                                    patience=3,
                                    verbose=False,
                                    mode='min')

# -------------------------- sanity check conll -------------------------- #
tokenizer = T5Tokenizer.from_pretrained(
    args.tokenizer_name_or_path)  # t5-base | t5-small
dataset = MyDataset(tokenizer,
                    args.data_dir,
                    'val',
                    max_len=args.max_seq_length)
print('Length of dataset is {}'.format(len(dataset)))
data = dataset[0]
print(tokenizer.decode(data['source_ids'], skip_special_tokens=True))
def train_model(args):
    # do not run this test for pytorch lightning below the minimum supported version
    import pytorch_lightning as pl
    if LooseVersion(pl.__version__) < LooseVersion(MIN_PL_VERSION):
        print("Skip test for pytorch_lightning=={}, min supported version is {}".
              format(pl.__version__, MIN_PL_VERSION))
        return

    # Initialize SparkSession
    conf = SparkConf().setAppName('pytorch_spark_mnist').set(
        'spark.sql.shuffle.partitions', '16')
    if args.master:
        conf.setMaster(args.master)
    elif args.num_proc:
        conf.setMaster('local[{}]'.format(args.num_proc))
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    # Setup our store for intermediate data
    store = Store.create(args.work_dir)

    # Download MNIST dataset
    data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
    libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
    if not os.path.exists(libsvm_path):
        subprocess.check_output(['wget', data_url, '-O', libsvm_path])

    # Load dataset into a Spark DataFrame
    df = spark.read.format('libsvm') \
        .option('numFeatures', '784') \
        .load(libsvm_path)

    # One-hot encode labels into SparseVectors
    encoder = OneHotEncoder(inputCols=['label'],
                            outputCols=['label_vec'],
                            dropLast=False)
    model = encoder.fit(df)
    train_df = model.transform(df)

    # Train/test split
    train_df, test_df = train_df.randomSplit([0.9, 0.1])

    # Define the PyTorch model without any Horovod-specific parameters
    class Net(LightningModule):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
            self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
            self.conv2_drop = nn.Dropout2d()
            self.fc1 = nn.Linear(320, 50)
            self.fc2 = nn.Linear(50, 10)

        def forward(self, x):
            x = x.float().reshape((-1, 1, 28, 28))
            x = F.relu(F.max_pool2d(self.conv1(x), 2))
            x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
            x = x.view(-1, 320)
            x = F.relu(self.fc1(x))
            x = F.dropout(x, training=self.training)
            x = self.fc2(x)
            return F.log_softmax(x, -1)

        def configure_optimizers(self):
            return optim.SGD(self.parameters(), lr=0.01, momentum=0.5)

        def training_step(self, batch, batch_idx):
            if batch_idx == 0:
                print(f"training data batch size: {batch['label'].shape}")
            x, y = batch['features'], batch['label']
            y_hat = self(x)
            loss = F.nll_loss(y_hat, y.long())
            self.log('train_loss', loss)
            return loss

        def validation_step(self, batch, batch_idx):
            if batch_idx == 0:
                print(f"validation data batch size: {batch['label'].shape}")
            x, y = batch['features'], batch['label']
            y_hat = self(x)
            loss = F.nll_loss(y_hat, y.long())
            self.log('val_loss', loss)
            # validation_epoch_end below indexes the outputs by 'val_loss'
            return {'val_loss': loss}

        def validation_epoch_end(self, outputs):
            avg_loss = torch.stack([
                x['val_loss'] for x in outputs
            ]).mean() if len(outputs) > 0 else float('inf')
            self.log('avg_val_loss', avg_loss)

    model = Net()

    # Train a Horovod Spark Estimator on the DataFrame
    backend = SparkBackend(num_proc=args.num_proc,
                           stdout=sys.stdout,
                           stderr=sys.stderr,
                           prefix_output_with_timestamp=True)

    from pytorch_lightning.callbacks import Callback

    epochs = args.epochs

    class MyDummyCallback(Callback):
        def __init__(self):
            self.epcoh_end_counter = 0
            self.train_epcoh_end_counter = 0
            self.validation_epoch_end_counter = 0

        def on_init_start(self, trainer):
            print('Starting to init trainer!')

        def on_init_end(self, trainer):
            print('Trainer is initialized.')

        def on_epoch_end(self, trainer, model):
            print('A train or eval epoch ended.')
            self.epcoh_end_counter += 1

        def on_train_epoch_end(self, trainer, model, unused=None):
            print('A train epoch ended.')
            self.train_epcoh_end_counter += 1

        def on_validation_epoch_end(self, trainer, model, unused=None):
            print('A val epoch ended.')
            self.validation_epoch_end_counter += 1

        def on_train_end(self, trainer, model):
            print(
                "Training ends: "
                f"epcoh_end_counter={self.epcoh_end_counter}, "
                f"train_epcoh_end_counter={self.train_epcoh_end_counter}, "
                f"validation_epoch_end_counter={self.validation_epoch_end_counter} \n"
            )
            assert self.train_epcoh_end_counter <= epochs
            assert self.epcoh_end_counter == self.train_epcoh_end_counter + self.validation_epoch_end_counter

    callbacks = [MyDummyCallback()]

    # added EarlyStopping and ModelCheckpoint
    from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
    callbacks.append(ModelCheckpoint(dirpath=args.work_dir))

    from pytorch_lightning.callbacks.early_stopping import EarlyStopping
    # val_loss is a loss, so a smaller value is better and mode must be 'min'
    callbacks.append(
        EarlyStopping(monitor='val_loss',
                      min_delta=0.00,
                      patience=3,
                      verbose=True,
                      mode='min'))

    torch_estimator = hvd.TorchEstimator(backend=backend,
                                         store=store,
                                         model=model,
                                         input_shapes=[[-1, 1, 28, 28]],
                                         feature_cols=['features'],
                                         label_cols=['label'],
                                         batch_size=args.batch_size,
                                         epochs=args.epochs,
                                         validation=0.1,
                                         verbose=1,
                                         callbacks=callbacks,
                                         profiler="simple")

    torch_model = torch_estimator.fit(train_df).setOutputCols(['label_prob'])

    # Evaluate the model on the held-out test DataFrame
    pred_df = torch_model.transform(test_df)
    argmax = udf(lambda v: float(np.argmax(v)), returnType=T.DoubleType())
    pred_df = pred_df.withColumn('label_pred', argmax(pred_df.label_prob))
    evaluator = MulticlassClassificationEvaluator(predictionCol='label_pred',
                                                  labelCol='label',
                                                  metricName='accuracy')
    print('Test accuracy:', evaluator.evaluate(pred_df))

    spark.stop()
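
# Hypothetical entrypoint sketch (not part of the original excerpt): it only
# builds the argparse fields that train_model() above actually reads
# (master, num_proc, data_dir, work_dir, batch_size, epochs); the defaults are
# illustrative guesses.
import argparse

if __name__ == '__main__':
    cli = argparse.ArgumentParser(description='PyTorch Lightning Spark MNIST example')
    cli.add_argument('--master', default=None, help='Spark master URL')
    cli.add_argument('--num-proc', type=int, default=2, help='number of worker processes')
    cli.add_argument('--data-dir', default='/tmp', help='directory to cache the MNIST libsvm file')
    cli.add_argument('--work-dir', default='/tmp/work', help='Horovod store for intermediate data')
    cli.add_argument('--batch-size', type=int, default=64)
    cli.add_argument('--epochs', type=int, default=2)
    train_model(cli.parse_args())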
def main(cfg: DictConfig):
    log.info("Arguments:\n %s", OmegaConf.to_yaml(cfg))
    seed_everything(42)

    if not cfg.dataset.fine_grained:
        target_encoding = {"negative": 0, "positive": 1}
    else:
        target_encoding = {
            "very negative": 0,
            "negative": 1,
            "neutral": 2,
            "positive": 3,
            "very positive": 4,
        }

    # hydra generates a new working directory for each run;
    # we want to store the data in the same directory on every run
    root = hydra.utils.to_absolute_path(".data")

    log.info("Downloading data...")
    # 1. Get SST dataset
    train, val, test = SSTDatasetAlt(root=root,
                                     tokenizer=TokenizerSST(),
                                     **cfg.dataset)

    # 2. Setup encoder
    encoder = TransformerEncoder()
    encoder.add_vocab([train, val, test],
                      special_tokens={
                          "cls_token": "<cls>",
                          "sep_token": "<sep>"
                      },
                      **cfg.vocab)
    encoder.add_target_encoding(target_encoding)

    # 3. Setup train, val and test dataloaders
    dm = DataModule(
        train=train,
        val=val,
        test=test,
        collate_fn=encoder.collate_fn,
        batch_size=cfg.datamodule.batch_size,
    )

    # 4. Setup model
    num_class = 5 if cfg.dataset.fine_grained else 2
    model = TransformerWithClassifierHead(input_size=len(encoder.vocab),
                                          num_class=num_class,
                                          **cfg.model)
    optimizer = get_optimizer(model, **OmegaConf.to_container(cfg.optimizer))
    scheduler_args = {
        "lr_lambda":
        linear_schedule_with_warmup(num_warmup_steps=1000,
                                    num_training_steps=cfg.trainer.max_steps)
    }
    scheduler = get_scheduler(optimizer, name="LambdaLR", args=scheduler_args)
    classifier = TextClassifier(model, optimizer=optimizer, scheduler=scheduler)

    # 5. Setup trainer
    early_stop_callback = EarlyStopping(
        monitor="val_epoch_loss",
        min_delta=0.0001,
        patience=3,
        verbose=True,
        mode="min",
    )
    checkpoint_callback = ModelCheckpoint(
        filepath="./checkpoints/" + "{epoch}",
        save_top_k=1,
        verbose=True,
        monitor="val_epoch_loss",
        mode="min",
    )
    trainer = Trainer(checkpoint_callback=checkpoint_callback,
                      callbacks=[LoggingCallback(), early_stop_callback],
                      **cfg.trainer)

    log.info("Training...")
    # 6. Fit model
    trainer.fit(classifier, dm.train_dataloader(), dm.val_dataloader())

    # 7. Test model on the best checkpoint
    results = trainer.test(
        test_dataloaders=dm.test_dataloader(),
        ckpt_path=checkpoint_callback.best_model_path,
    )
    log.info(results)
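
# linear_schedule_with_warmup() is imported elsewhere and not shown in this
# excerpt. As a hedged sketch (not necessarily the author's implementation),
# such a factory typically returns an lr_lambda for LambdaLR that ramps the
# multiplier from 0 to 1 over num_warmup_steps and then decays it linearly to
# 0 at num_training_steps:
def linear_schedule_with_warmup_sketch(num_warmup_steps, num_training_steps):
    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        return max(
            0.0,
            float(num_training_steps - current_step) /
            float(max(1, num_training_steps - num_warmup_steps)),
        )

    return lr_lambda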
def run(cfg):
    pl.seed_everything(cfg.seed)

    output_dir = os.getcwd()
    output_dir = os.path.join(output_dir, "lightning_logs")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    os.chdir(hydra.utils.get_original_cwd())

    # lightningmodule checkpoint
    checkpoint_callback = ModelCheckpoint(
        monitor="val/acc",
        dirpath=output_dir,
        filename="epoch{epoch:02d}-val_acc{val/acc:.2f}",
        auto_insert_metric_name=False,
        save_top_k=3,
        mode="max",
    )
    # early stopping; for the BNN we need more patience because training is
    # sensitive to the parameters
    early_stopping_callback = EarlyStopping(
        monitor="val/acc", min_delta=0.0, patience=10, verbose=False, mode="max"
    )
    # timer
    timer = Timer()
    callbacks = [early_stopping_callback, checkpoint_callback, timer]

    # logger
    wandb_logger = WandbLogger(
        name="conv_cnn_" + datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"),
        save_dir=output_dir,
        project="radial-bnn",
    )

    # initialize trainer
    trainer = pl.Trainer(logger=wandb_logger, callbacks=callbacks, **cfg.trainer)

    # datamodule
    dm = MNISTDataModule(cfg=cfg.datamodule)
    dm.prepare_data()
    dm.setup()

    # model
    model = ConvModule(cfg=cfg.lightningmodule)

    # train model
    trainer.fit(model=model, datamodule=dm)

    # test model with the best checkpoint
    trainer.test(datamodule=dm, ckpt_path="best")

    # training time
    logger.info("{} elapsed in training".format(timer.time_elapsed("train")))
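
# A hedged sketch (not in the original excerpt) of the Hydra entrypoint that
# run() above implies: it calls hydra.utils.get_original_cwd(), so it has to
# execute under @hydra.main. The config_path and config_name are assumptions.
@hydra.main(config_path="conf", config_name="config")
def _main(cfg):
    run(cfg)


if __name__ == "__main__":
    _main()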