def run_experiment(hparams, *_):
    """Run a single SLURM-scheduled training job and log the resulting AUC.

    Reads cluster topology from SLURM environment variables, builds a
    per-job scratch path, trains, and records the final AUC in a
    test-tube Experiment.
    """
    env = os.environ
    print(env)
    n_workers = int(env['SLURM_NNODES'])
    node_rank = int(env['SLURM_NODEID'])
    fold, kfold, debug = 0, 5, True
    # Unique per-job directory: scratch root + fold + SLURM task pid + host.
    job_path = (f"{env['SCRATCH']}/summer_school/hopt{fold}/job"
                f"{env['SLURM_TASK_PID']}{env['HOSTNAME']}")
    print(node_rank, job_path)

    exp = Experiment(save_dir=f'{job_path}/exp')
    exp.argparse(hparams)

    # Optimizer object is attached to hparams after logging the args.
    hparams.optimizer = tfa.optimizers.LAMB(lr=hparams.lr,
                                            weight_decay_rate=hparams.wd)
    print(hparams, flush=True)

    # Start trainer.
    auc = train(vars(hparams), n_workers, node_rank, fold, kfold, debug, job_path)
    print(auc)

    # Save the Experiment with the final metric.
    exp.add_scalar('auc', auc)
    exp.save()
def train(hparams, *args):
    """Train your awesome model.

    :param hparams: The arguments to run the model with.
    """
    # Track every hyperparameter in a test-tube Experiment.  The SLURM
    # experiment version is optional, but using it keeps this run from
    # colliding with others SLURM launches at the same time.
    exp = Experiment(
        name=hparams.test_tube_exp_name,
        save_dir=hparams.log_path,  # Location to save the metrics.
        version=hparams.hpc_exp_number,
        autosave=False,
    )
    exp.argparse(hparams)

    # Pretend to train: log a made-up error metric 100 times.
    x = hparams.x_val
    for _ in range(100):
        out = x * hparams.y_val
        exp.log({'fake_err': out.item()})

    # Persist the experiment once done.
    exp.save()
def train(hparams):
    """Pretend-train and log metrics to a test-tube Experiment."""
    # hpc_exp_number never crashes: it is simply None when SLURM did not
    # supply an experiment number.  Using the SLURM version keeps this
    # experiment from colliding with others SLURM runs at once.
    exp = Experiment(
        name=hparams.test_tube_exp_name,
        save_dir=hparams.log_path,
        version=hparams.hpc_exp_number,
        autosave=False,
    )
    exp.argparse(hparams)

    # Fake training loop: the "error" is just x * y at every step.
    x = hparams.x_val
    for _ in range(100):
        out = x * hparams.y_val
        exp.log({'fake_err': out.item()})

    # Save exp when we're done.
    exp.save()
def train(hparams, *args):
    """Train an ESNN model and log the resulting loss statistic.

    :param hparams: The arguments to run the model with.
    """
    # Seed both torch and numpy for reproducibility.
    torch.manual_seed(hparams.seed)
    np.random.seed(hparams.seed)

    print(hparams)
    print(args)

    exp = Experiment(
        name=hparams.test_tube_exp_name,
        save_dir=hparams.log_path,  # Location to save the metrics.
        autosave=False,
    )
    exp.argparse(hparams)

    # Run training + evaluation; only the loss dataframe feeds the metric.
    dsl, trainedmodels, validatedmodels, losses, lossdf, knnres = runevaler(
        "opsitu", hparams.epochs, [ESNNSystem], [TorchEvaler],
        [eval_dual_ann],
        networklayers=[hparams.c_layers, hparams.g_layers],
        lrs=[hparams.lr],
        dropoutrates=[hparams.dropout],
        validate_on_k=10,
        n=1,
        filenamepostfixes=["esnn"],
    )

    stats = stat(lossdf, hparams.epochs, "esnn")
    print(f"type : {type(stats)}")
    print(f"innertype : {type(stats[0])}")
    print(f"stats : {stats}")
    print(f"stats0 : {stats[0]}")
    exp.log({'loss': stats[0]})

    # Save exp when done.
    exp.save()
def main(hparams):
    """
    Main training routine specific for this project
    :param hparams: parsed hyperparameters for the run
    :return:
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    model = LightningTemplateModel(hparams)

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # BUGFIX: the original read the global `hyperparams`, which only exists
    # when the script is launched from __main__; use the `hparams` argument
    # so the function works for any caller.
    exp = Experiment(name='test_exp',
                     save_dir=hparams.log_dir,
                     autosave=False,
                     description='test demo')

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
    trainer = Trainer(experiment=exp, gpus=8, nb_gpu_nodes=2)

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)
def main():
    """Build the 2D classifier and train it with best-loss checkpointing."""
    config = Config()
    model = Classifier2DC(config)

    exp = Experiment(config.train_out_dir)

    # Dump every non-dunder Config attribute next to the log dir so the
    # run can be reproduced later.
    config_dict = {key: getattr(Config, key)
                   for key in dir(Config) if not key.startswith('__')}
    with open(os.path.join(exp.log_dir, '../config.json'), 'w') as fh:
        json.dump(config_dict, fh, indent=2)

    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(config.train_out_dir, "models"),
        save_best_only=True,
        verbose=True,
        monitor='avg_val_loss',
        mode='min',
        prefix='',
    )

    trainer = Trainer(
        experiment=exp,
        distributed_backend='dp',
        max_nb_epochs=config.max_epoch,
        checkpoint_callback=checkpoint_callback,
        gpus=config.gpus,
        nb_sanity_val_steps=20,
        val_check_interval=0.25,
        row_log_interval=1000,
        accumulate_grad_batches=config.accumulate_grad_batches,
    )
    trainer.fit(model)
def _init_experiment(self):
    """Create the default, non-saving test-tube Experiment for this object."""
    self.experiment = Experiment(
        save_dir=None,
        name='default',
        debug=False,
        version=None,
        autosave=False,
        description=None,
    )
def main():
    """Train an LSTM on sequential MNIST, row by row."""
    # input_scan_dim=28 is row-by-row sequential MNIST;
    # set input_scan_dim=1 to make it pixel-by-pixel.
    input_scan_dim = 28
    hidden_dim = 128
    output_dim = 10
    learning_rate = 0.0005
    batch_size = 32
    gradient_clip = 2.0
    is_permuted = False
    max_epochs = 100
    percent_validation = 0.2

    # Use the first GPU when CUDA is available, otherwise stay on CPU.
    gpus = [0] if torch.cuda.is_available() else None

    model = LstmModel(input_scan_dim, hidden_dim, output_dim)
    lightning_module = SeqMNIST(model, learning_rate, batch_size,
                                is_permuted, percent_validation)

    exp = Experiment(save_dir='experiments')
    trainer = Trainer(experiment=exp,
                      track_grad_norm=-1,
                      print_nan_grads=False,
                      gradient_clip=gradient_clip,
                      gpus=gpus,
                      max_nb_epochs=max_epochs)
    trainer.fit(lightning_module)
def main():
    """Build the segmentation model and train with IoU-based checkpointing."""
    config = Config()
    model = SegmentationModel(config)

    exp = Experiment(config.train_out_dir)

    # Persist every non-dunder Config attribute for reproducibility.
    config_dict = {key: getattr(Config, key)
                   for key in dir(Config) if not key.startswith('__')}
    with open(os.path.join(exp.log_dir, '../config.json'), 'w') as fh:
        json.dump(config_dict, fh, indent=2)

    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(config.train_out_dir, "models"),
        save_best_only=True,
        verbose=True,
        monitor='val_iou_any',
        mode='max',
        prefix='')

    trainer = Trainer(experiment=exp,
                      distributed_backend='dp',
                      max_nb_epochs=config.max_epoch,
                      checkpoint_callback=checkpoint_callback,
                      gpus=config.gpus,
                      nb_sanity_val_steps=10,
                      val_check_interval=1,
                      row_log_interval=10)
    trainer.fit(model)
def get_exp(debug=True, version=None):
    """Return a test Experiment rooted at this file's directory.

    With debug=True the experiment object does not actually save logs.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    return Experiment(debug=debug,
                      save_dir=here,
                      name='tests_tt_dir',
                      version=version)
def main(hparams):
    """Train the GAN on CPU with tensorboard logs saved to the CWD."""
    # Tensorboard logs land in the current working directory.
    exp = Experiment(save_dir=os.getcwd())

    # Init model.
    model = GAN(hparams)

    # Fit trainer on CPU.
    trainer = pl.Trainer(experiment=exp, max_nb_epochs=200)
    trainer.fit(model)
def main():
    """Parse CLI options, build the chosen super-resolution model, train it."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', choices=['srcnn', 'srgan'], required=True)
    parser.add_argument('--scale_factor', type=int, default=4)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--patch_size', type=int, default=96)
    parser.add_argument('--gpus', type=str, default='0')
    opt = parser.parse_args()

    # Resolve the model class; argparse's `choices` guarantees one matches.
    if opt.model == 'srcnn':
        Model = models.SRCNNModel
    elif opt.model == 'srgan':
        Model = models.SRGANModel

    # Re-parse once the model has contributed its own arguments.
    parser = Model.add_model_specific_args(parser)
    opt = parser.parse_args()

    # Instantiate the experiment and record the arguments.
    exp = Experiment(save_dir=f'./logs/{opt.model}')
    exp.argparse(opt)

    model = Model(opt)

    # Checkpoints go into the experiment's media path.
    checkpoint_callback = ModelCheckpoint(
        filepath=exp.get_media_path(exp.name, exp.version),
    )

    trainer = Trainer(
        experiment=exp,
        max_nb_epochs=4000,
        add_log_row_interval=50,
        check_val_every_n_epoch=10,
        checkpoint_callback=checkpoint_callback,
        gpus=[int(g) for g in opt.gpus.split(',')],
    )

    # Start training!
    trainer.fit(model)
def experiment(self):
    """Lazily create and cache the underlying test-tube Experiment."""
    if self._experiment is None:
        self._experiment = Experiment(
            save_dir=self.save_dir,
            name=self.name,
            debug=self.debug,
            version=self.version,
            create_git_tag=self.create_git_tag,
            rank=self.rank,
        )
    return self._experiment
def main_trainer(hparams):
    """Fit Gaussian HMMs over several trials and log validation accuracy."""
    print_params(hparams)
    exp = Experiment(name=hparams.tt_name,
                     debug=hparams.debug,
                     autosave=False,
                     description=hparams.tt_description,
                     save_dir=hparams.tt_save_path)
    exp.add_argparse_meta(hparams)

    val_scores = []
    best_score = 0
    for trial_nb in range(hparams.nb_trials):
        data = dataset_loader.IndividualSequencesData(
            hparams.data_path, y_labels=hparams.y_labels.split(','))
        X, Y, lengths = flatten_data(data.train_x_y)

        # Fit one HMM per trial.
        model = hmm.GaussianHMM(n_components=hparams.nb_components,
                                n_iter=hparams.nb_hmm_iters)
        model.fit(X, lengths)

        # Validation accuracy = fraction of correctly predicted labels.
        val_X, val_Y, lengths = flatten_data(data.val_x_y)
        Y_hat = model.predict(val_X, lengths)
        val_score = np.equal(Y_hat, val_Y).sum() / float(len(Y_hat))

        # Keep only the best-scoring model on disk.
        if val_score > best_score:
            best_score = val_score
            save_model(model, hparams, exp, trial_nb)

        val_scores.append(val_score)
        exp.add_metric_row({'val_acc': val_score, 'trail_nb': trial_nb})

    exp.add_metric_row({'final_val_acc': np.mean(val_scores)})
    exp.save()
def search_train(args, *extra_args):
    """Run one hyper-parameter-search training job and save its metrics."""
    # Metrics are written under the checkpoint directory.
    exp = Experiment(save_dir=args.ckptdir)
    exp.argparse(args)
    train(args, exp)
    exp.save()
def main(hparams):
    """
    Main training routine specific for this project
    :param hparams: parsed hyperparameters for the run
    :return:
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # BUGFIX: the original read the global `hyperparams`, which only exists
    # when launched from __main__; use the `hparams` argument instead.
    exp = Experiment(
        name=hparams.experiment_name,
        save_dir=hparams.test_tube_save_path,
        autosave=False,
        description='test demo'
    )
    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 DEFINE CALLBACKS
    # ------------------------
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    early_stop = EarlyStopping(
        monitor='val_acc',
        patience=3,
        verbose=True,
        mode='max'
    )

    checkpoint = ModelCheckpoint(
        filepath=model_save_path,
        save_best_only=True,
        verbose=True,
        monitor='val_loss',
        mode='min'
    )

    # ------------------------
    # 4 INIT TRAINER
    # ------------------------
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
    )

    # ------------------------
    # 5 START TRAINING
    # ------------------------
    trainer.fit(model)
def main():
    """Build the VAMPIRE model from CLI arguments and train it."""
    parent_parser = HyperOptArgumentParser(strategy="grid_search", add_help=False)
    logdir = "logs"
    parent_parser.add_argument(
        "--test_tube_save_path", default=os.path.join(logdir, "test_tube_data")
    )
    parent_parser.add_argument(
        "--model_save_path", default=os.path.join(logdir, "model_weights")
    )
    parent_parser.add_argument(
        "--experiment_name", default=os.path.join(logdir, "vampire")
    )

    # Let the model contribute its own arguments before parsing.
    parser = VAMPIRE.add_model_specific_args(parent_parser, ".")
    hparams = parser.parse_args()

    model = VAMPIRE(hparams)

    exp = Experiment(
        name=hparams.experiment_name,
        save_dir=hparams.test_tube_save_path,
        autosave=False,
    )
    exp.argparse(hparams)
    exp.save()

    trainer = Trainer(experiment=exp, fast_dev_run=False)
    trainer.fit(model)
def main(hparams, cluster, results_dict):
    """
    Main training routine specific for this project
    :param hparams: parsed hyperparameters for the run
    :param cluster: test-tube SLURM cluster object
    :param results_dict: results container supplied by the optimizer
    :return:
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # When using grid search, it's possible for all models to start at once
    # and claim the same test-tube experiment version — stagger start-up by
    # this node's SLURM rank.
    relative_node_id = int(os.environ['SLURM_NODEID'])
    sleep(relative_node_id + 1)

    # BUGFIX: the original read the global `hyperparams`, which only exists
    # when launched from __main__; use the `hparams` argument instead.
    exp = Experiment(name=hparams.experiment_name,
                     save_dir=hparams.test_tube_save_path,
                     autosave=False,
                     description='test demo')
    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 DEFINE CALLBACKS
    # ------------------------
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    early_stop = EarlyStopping(monitor='val_acc',
                               patience=3,
                               verbose=True,
                               mode='max')

    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 save_best_only=True,
                                 verbose=True,
                                 monitor='val_loss',
                                 mode='min')

    # ------------------------
    # 4 INIT TRAINER
    # ------------------------
    trainer = Trainer(experiment=exp,
                      cluster=cluster,
                      checkpoint_callback=checkpoint,
                      early_stop_callback=early_stop,
                      gpus=hparams.gpus,
                      nb_gpu_nodes=hparams.nb_gpu_nodes)

    # ------------------------
    # 5 START TRAINING
    # ------------------------
    trainer.fit(model)
def main(hparams):
    """
    Main training routine specific for this project
    :param hparams:
    :return:
    """
    # Track everything in a test-tube Experiment; the SLURM-supplied
    # version keeps concurrent jobs from colliding.
    exp = Experiment(
        name=hparams.tt_name,
        debug=hparams.debug,
        save_dir=hparams.tt_save_path,
        version=hparams.hpc_exp_number,
        autosave=False,
        description=hparams.tt_description
    )
    exp.argparse(hparams)
    exp.save()

    # Build the model.
    model = LightningTemplateModel(hparams)

    # Configure the trainer and start training.
    trainer = Trainer(experiment=exp)
    trainer.fit(model)
def main(hparams):
    """
    Main training routine specific for this project
    :param hparams: parsed hyperparameters for the run
    :return:
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    model = LightningTemplateModel(hparams)

    # ------------------------
    # 2 INIT EXP
    # ------------------------
    # BUGFIX: the original read the global `hyperparams`, which only exists
    # when launched from __main__; use the `hparams` argument instead.
    exp = Experiment(name=hparams.experiment_name,
                     save_dir=hparams.test_tube_save_path,
                     autosave=False,
                     description='test demo')
    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
    trainer = Trainer(experiment=exp)

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)
def main(hparams):
    """
    Main training routine specific for this project
    :param hparams: parsed hyperparameters for the run
    :return:
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # BUGFIX: the original read the global `hyperparams`, which only exists
    # when launched from __main__; use the `hparams` argument instead.
    exp = Experiment(name=hparams.experiment_name,
                     save_dir=hparams.test_tube_save_path,
                     autosave=False,
                     description='test demo')
    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
    trainer = Trainer(experiment=exp, gpus=hparams.gpus, use_amp=True)

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)
def create_tt_experiment(hparams):
    """Create test-tube experiment for logging training and storing models.

    Parameters
    ----------
    hparams : :obj:`dict`
        dictionary of hyperparameters defining experiment that will be saved
        as a csv file

    Returns
    -------
    :obj:`tuple`
        - if experiment defined by hparams already exists, returns
          :obj:`(None, None, None)`
        - if experiment does not exist, returns :obj:`(hparams, sess_ids, exp)`

    """
    from test_tube import Experiment

    # Resolve the session directory, creating it (and its info csv) on
    # first use.
    hparams['session_dir'], sess_ids = get_session_dir(
        hparams, session_source=hparams.get('all_source', 'save'))
    if not os.path.isdir(hparams['session_dir']):
        os.makedirs(hparams['session_dir'])
        export_session_info_to_csv(hparams['session_dir'], sess_ids)

    # Resolve (and create if needed) the experiment directory.
    hparams['expt_dir'] = get_expt_dir(hparams)
    if not os.path.isdir(hparams['expt_dir']):
        os.makedirs(hparams['expt_dir'])

    # Bail out early when this experiment has already been run.
    if experiment_exists(hparams):
        return None, None, None

    exp = Experiment(
        name=hparams['experiment_name'],
        debug=False,
        save_dir=os.path.dirname(hparams['expt_dir']))
    exp.save()
    hparams['version'] = exp.version

    return hparams, sess_ids, exp
def main_trainer(hparams):
    """Fit random forests over several trials and log train/val accuracy."""
    print_params(hparams)
    exp = Experiment(name=hparams.tt_name,
                     debug=hparams.debug,
                     autosave=False,
                     description=hparams.tt_description,
                     save_dir=hparams.tt_save_path)
    exp.add_argparse_meta(hparams)

    # Fit one forest per trial, tracking the best validation score.
    val_scores, train_scores = [], []
    best_score = 0
    for trial_nb in range(hparams.nb_trials):
        data = SequentialReadingsData(window_size=hparams.time_steps,
                                      data_path=hparams.data_path,
                                      flatten_x=True)

        clf = RandomForestClassifier(n_estimators=hparams.nb_estimators)
        clf.fit(data.train_x, data.train_y)

        train_score = clf.score(data.train_x, data.train_y)
        val_score = clf.score(data.val_x, data.val_y)

        # Save the model only when it beats the best seen so far.
        if val_score > best_score:
            best_score = val_score
            save_model(clf, hparams, exp, trial_nb)

        train_scores.append(train_score)
        val_scores.append(val_score)
        exp.add_metric_row({'val_acc': val_score,
                            'train_acc': train_score,
                            'trail_nb': trial_nb})

    exp.add_metric_row({'final_val_acc': np.mean(val_scores),
                        'final_train_acc': np.mean(train_scores)})
    exp.save()
def main(hparams, cluster=None, results_dict=None):
    """
    Main training routine specific for this project
    :param hparams:
    :return:
    """
    # Debug experiment pinned to version 0 next to this file.
    log_dir = os.path.dirname(os.path.realpath(__file__))
    exp = Experiment(
        name='test_tube_exp',
        debug=True,
        save_dir=log_dir,
        version=0,
        autosave=False,
        description='test demo'
    )

    # NOTE(review): these hard-coded local paths override whatever the
    # caller passed in — presumably a local-debugging convenience; confirm
    # before running anywhere else.
    hparams.training_set_path = '/Volumes/Elements/Datasets/Immersions/house_data_mp3/training'
    hparams.validation_set_path = '/Volumes/Elements/Datasets/Immersions/house_data_mp3/validation'
    hparams.test_task_set_path = '/Volumes/Elements/Datasets/Immersions/house_data_mp3/test_task'
    hparams.batch_size = 4

    # Record the (possibly overridden) hyperparameters.
    exp.argparse(hparams)
    exp.save()

    # Build model.
    model = ContrastivePredictiveSystem(hparams)

    # Callbacks.
    early_stop = EarlyStopping(
        monitor=hparams.early_stop_metric,
        patience=hparams.early_stop_patience,
        verbose=True,
        mode=hparams.early_stop_mode
    )

    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    checkpoint = ModelCheckpoint(
        filepath=model_save_path,
        save_best_only=True,
        verbose=True,
        monitor=hparams.model_save_monitor_value,
        mode=hparams.model_save_monitor_mode
    )

    # Configure trainer and start training.
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
        nb_sanity_val_steps=2
    )
    trainer.fit(model)
def teacher_forcing_training():
    """Builds everything needed for the training, and launch the training."""
    xp_info, xp_params = load_params()
    xp_name, debug = xp_info
    xp_path = join(os.environ["XP_PATH"], xp_name)
    hparams, optimizer_params, training_params = xp_params
    trainer_params = training_params["trainer"]
    loaders_params = training_params["loaders"]

    # Batches are padded to a common length; pin memory only when on GPU.
    loaders_params["collate_fn"] = collate_by_padding
    loaders_params["pin_memory"] = trainer_params["gpus"] is not None

    special_tokens = {"<pad>": 0, "<unk>": 1, "<start>": 2, "<end>": 3}
    token_indexer, embeddings = get_embedders(
        vocab_size=hparams["vocab_size"],
        embedding_dim=hparams["embedding_dim"],
        special_tokens=special_tokens
    )
    # Special tokens enlarge the effective vocabulary.
    hparams["vocab_size"] += len(special_tokens)

    dca_summarizer = build_multi_agt_summarizer(**hparams, embeddings=embeddings)
    summarizer_module = SummarizerModule(
        summarizer=dca_summarizer,
        token_indexer=token_indexer,
        loaders_params=loaders_params,
        optimizer_params=optimizer_params
    )

    trainer_params["experiment"] = Experiment(save_dir=xp_path,
                                              name=xp_name,
                                              debug=debug)
    trainer_params["checkpoint_callback"] = ModelCheckpoint(
        filepath=join(xp_path, "checkpoint"),
        save_best_only=True,
        verbose=True,
        monitor='avg_val_loss',
        mode='min'
    )

    trainer = Trainer(**trainer_params)
    print(f"tensorboard --logdir {xp_path}", end="\n\n")
    trainer.fit(summarizer_module)
def init_experiments(self):
    """Create the train/valid Experiments unless both already exist."""
    already_made = (self._experiments[Mode.Training] is not None
                    and self._experiments[Mode.Validation] is not None)
    if already_made:
        return

    # Both experiments share every setting except the name.
    common = dict(
        save_dir=self.save_dir,
        debug=self.debug,
        version=self.version,
        description=self.description,
        create_git_tag=self.create_git_tag,
        rank=self.rank,
    )
    self._experiments[Mode.Training] = Experiment(name="train", **common)
    self._experiments[Mode.Validation] = Experiment(name="valid", **common)
def main(hparams):
    """
    Main training routine specific for this project
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = DSANet(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    exp = Experiment(
        name='dsanet_exp_{}_window={}_horizon={}'.format(
            hparams.data_name, hparams.window, hparams.horizon),
        save_dir=hparams.test_tube_save_path,
        autosave=False,
        description='test demo'
    )
    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 DEFINE CALLBACKS
    # ------------------------
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=5,
        verbose=True,
        mode='min'
    )

    # ------------------------
    # 4 INIT TRAINER
    # ------------------------
    # NOTE(review): neither `exp`, `model_save_path` nor `early_stop` is
    # wired into the Trainer below — confirm whether that is intentional.
    trainer = Trainer(
        gpus=[0],
        max_epochs=10,
    )

    # ------------------------
    # 5 START TRAINING
    # ------------------------
    trainer.fit(model)

    print('View tensorboard logs by running\ntensorboard --logdir %s' % os.getcwd())
    print('and going to http://localhost:6006 on your browser')
def main():
    """Train CoolSystem briefly on CPU with tensorboard logging."""
    model = CoolSystem()

    # PyTorch summarywriter with a few bells and whistles.
    exp = Experiment(save_dir='../output/tmp')
    print(f"exp.save_dir: {exp.save_dir}")
    exp.save()
    print(f"saved !!!")

    # Train on CPU using only 10% of the data (for demo purposes); passing
    # the experiment enables automatic tensorboard logging.  For multi-GPU
    # or multi-node runs pass gpus=... / nb_gpu_nodes=... instead.
    trainer = Trainer(experiment=exp, max_nb_epochs=1, train_percent_check=0.1)

    # Train (1 epoch only here for demo).
    trainer.fit(model)
def main_process_entrypoint(gpu_nb):
    """Per-process entry point for 2-GPU distributed training of ConvNet."""
    world_size = 2
    torch.distributed.init_process_group("nccl", rank=gpu_nb, world_size=world_size)
    torch.cuda.set_device(gpu_nb)

    model = ConvNet()
    model.cuda(gpu_nb)
    model = torch.nn.parallel.distributed.DistributedDataParallel(
        model, device_ids=[gpu_nb])

    # One log directory per rank so the processes don't collide.
    exp = Experiment(save_dir='./logs_cifar10_{}'.format(gpu_nb))
    trainer = Trainer(experiment=exp, gpus=[gpu_nb], max_nb_epochs=20)
    trainer.fit(model)
def train(hparams):
    """Pretend-train on random tensors, logging a fake error metric."""
    # Track all the parameters coming from the HyperOptArgumentParser.
    exp = Experiment(
        name=hparams.test_tube_exp_name,
        save_dir=hparams.log_path,
        autosave=False,
    )
    exp.argparse(hparams)

    # Fake training: multiply random matrices and log the scalar result.
    x = torch.rand((1, hparams.x_val))
    for _ in range(100):
        y = torch.rand((hparams.x_val, 1))
        exp.log({'fake_err': x.mm(y).item()})

    # Save exp when we're done.
    exp.save()