def main(hparams): """ Main training routine specific for this project :param hparams: :return: """ # ------------------------ # 1 INIT LIGHTNING MODEL # ------------------------ print('loading model...') model = LightningTemplateModel(hparams) print('model built') # ------------------------ # 2 INIT TEST TUBE EXP # ------------------------ # init experiment exp = Experiment(name=hyperparams.experiment_name, save_dir=hyperparams.test_tube_save_path, autosave=False, description='test demo') exp.argparse(hparams) exp.save() # ------------------------ # 3 DEFINE CALLBACKS # ------------------------ model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) early_stop = EarlyStopping(monitor='val_acc', patience=3, verbose=True, mode='max') checkpoint = ModelCheckpoint(filepath=model_save_path, save_best_only=True, verbose=True, monitor='val_loss', mode='min') # ------------------------ # 4 INIT TRAINER # ------------------------ trainer = Trainer( experiment=exp, checkpoint_callback=checkpoint, early_stop_callback=early_stop, gpus=hparams.gpus, ) # ------------------------ # 5 START TRAINING # ------------------------ trainer.fit(model)
def main(hparams): """ Main training routine specific for this project """ # ------------------------ # 1 INIT LIGHTNING MODEL # ------------------------ print('loading model...') model = DSANet(hparams) print('model built') # ------------------------ # 2 INIT TEST TUBE EXP # ------------------------ # init experiment exp = Experiment(name='dsanet_exp_{}_window={}_horizon={}'.format( hparams.data_name, hparams.window, hparams.horizon), save_dir=hparams.test_tube_save_path, autosave=False, description='test demo') exp.argparse(hparams) exp.save() # ------------------------ # 3 DEFINE CALLBACKS # ------------------------ model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=True, mode='min') # ------------------------ # 4 INIT TRAINER # ------------------------ trainer = Trainer( experiment=exp, early_stop_callback=early_stop, ) # ------------------------ # 5 START TRAINING # ------------------------ trainer.fit(model) print('View tensorboard logs by running\ntensorboard --logdir %s' % os.getcwd()) print('and going to http://localhost:6006 on your browser')
def main(hparams): """ Main training routine specific for this project :param hparams: :return: """ # init experiment exp = Experiment( name=hparams.tt_name, debug=hparams.debug, save_dir=hparams.tt_save_path, version=hparams.hpc_exp_number, autosave=False, description=hparams.tt_description ) exp.argparse(hparams) exp.save() # build model model = ExampleModel(hparams) # callbacks early_stop = EarlyStopping( monitor='val_acc', patience=3, mode='min', verbose=True, ) model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) checkpoint = ModelCheckpoint( filepath=model_save_path, save_function=None, save_best_only=True, verbose=True, monitor='val_acc', mode='min' ) # configure trainer trainer = Trainer( experiment=exp, checkpoint_callback=checkpoint, early_stop_callback=early_stop, ) # train model trainer.fit(model)
def main(hparams):
    # load model
    model = MyModel(hparams)

    # init experiment
    exp = Experiment(
        name=hparams.experiment_name,
        save_dir=hparams.test_tube_save_path,
        autosave=False,
        description='baseline attn interval'
    )
    exp.argparse(hparams)
    exp.save()

    # define callbacks
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=5,
        verbose=True,
        mode='min'
    )
    checkpoint = ModelCheckpoint(
        filepath=model_save_path,
        save_best_only=True,
        verbose=True,
        monitor='pr',
        mode='max'
    )

    # init trainer
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
        gpus=hparams.gpus,
        val_check_interval=1
    )

    # start training
    trainer.fit(model)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', choices=['srcnn', 'srgan'], required=True)
    parser.add_argument('--scale_factor', type=int, default=4)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--patch_size', type=int, default=96)
    parser.add_argument('--gpus', type=str, default='0')
    opt = parser.parse_args()

    # load model class
    if opt.model == 'srcnn':
        Model = models.SRCNNModel
    elif opt.model == 'srgan':
        Model = models.SRGANModel

    # add model specific arguments to original parser
    parser = Model.add_model_specific_args(parser)
    opt = parser.parse_args()

    # instantiate experiment
    exp = Experiment(save_dir=f'./logs/{opt.model}')
    exp.argparse(opt)

    model = Model(opt)

    # define callbacks
    checkpoint_callback = ModelCheckpoint(
        filepath=exp.get_media_path(exp.name, exp.version),
    )

    # instantiate trainer
    trainer = Trainer(
        experiment=exp,
        max_nb_epochs=4000,
        add_log_row_interval=50,
        check_val_every_n_epoch=10,
        checkpoint_callback=checkpoint_callback,
        gpus=[int(i) for i in opt.gpus.split(',')]
    )

    # start training!
    trainer.fit(model)
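# Example invocation of the super-resolution script above. The file name
# 'train.py' is a placeholder and the flag values are illustrative; the flags
# themselves come from the parser defined in main():
#
#   python train.py --model srcnn --scale_factor 4 --batch_size 16 --gpus 0
#
if __name__ == '__main__':
    main()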
def main(hparams, cluster, results_dict): """ Main training routine specific for this project :param hparams: :return: """ on_gpu = torch.cuda.is_available() if hparams.disable_cuda: on_gpu = False device = 'cuda' if on_gpu else 'cpu' hparams.__setattr__('device', device) hparams.__setattr__('on_gpu', on_gpu) hparams.__setattr__('nb_gpus', torch.cuda.device_count()) hparams.__setattr__('inference_mode', hparams.model_load_weights_path is not None) # delay each training start to not overwrite logs process_position, current_gpu = TRAINING_MODEL.get_process_position( hparams.gpus) sleep(process_position + 1) # init experiment exp = Experiment(name=hparams.tt_name, debug=hparams.debug, save_dir=hparams.tt_save_path, version=hparams.hpc_exp_number, autosave=False, description=hparams.tt_description) exp.argparse(hparams) exp.save() # build model print('loading model...') model = TRAINING_MODEL(hparams) print('model built') # callbacks early_stop = EarlyStopping(monitor=hparams.early_stop_metric, patience=hparams.early_stop_patience, verbose=True, mode=hparams.early_stop_mode) model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) checkpoint = ModelCheckpoint(filepath=model_save_path, save_function=None, save_best_only=True, verbose=True, monitor=hparams.model_save_monitor_value, mode=hparams.model_save_monitor_mode) # configure trainer trainer = Trainer( experiment=exp, cluster=cluster, checkpoint_callback=checkpoint, early_stop_callback=early_stop, ) # train model trainer.fit(model)
def main(hparams, cluster, results_dict): """ Main training routine specific for this project :param hparams: :return: """ on_gpu = torch.cuda.is_available() if hparams.disable_cuda: on_gpu = False device = 'cuda' if on_gpu else 'cpu' hparams.__setattr__('device', device) hparams.__setattr__('on_gpu', on_gpu) hparams.__setattr__('nb_gpus', torch.cuda.device_count()) hparams.__setattr__('inference_mode', hparams.model_load_weights_path is not None) # delay each training start to not overwrite logs process_position, current_gpu = TRAINING_MODEL.get_process_position( hparams.gpus) sleep(process_position + 1) # init experiment exp = Experiment(name=hparams.tt_name, debug=hparams.debug, save_dir=hparams.tt_save_path, version=hparams.hpc_exp_number, autosave=False, description=hparams.tt_description) exp.argparse(hparams) exp.save() # build model print('loading model...') model = TRAINING_MODEL(hparams) print('model built') # callbacks early_stop = EarlyStopping(monitor=hparams.early_stop_metric, patience=hparams.early_stop_patience, verbose=True, mode=hparams.early_stop_mode) model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) checkpoint = ModelCheckpoint(filepath=model_save_path, save_function=None, save_best_only=True, verbose=True, monitor=hparams.model_save_monitor_value, mode=hparams.model_save_monitor_mode) # configure trainer trainer = Trainer(experiment=exp, on_gpu=on_gpu, cluster=cluster, enable_tqdm=hparams.enable_tqdm, overfit_pct=hparams.overfit, track_grad_norm=hparams.track_grad_norm, fast_dev_run=hparams.fast_dev_run, check_val_every_n_epoch=hparams.check_val_every_n_epoch, accumulate_grad_batches=hparams.accumulate_grad_batches, process_position=process_position, current_gpu_name=current_gpu, checkpoint_callback=checkpoint, early_stop_callback=early_stop, enable_early_stop=hparams.enable_early_stop, max_nb_epochs=hparams.max_nb_epochs, min_nb_epochs=hparams.min_nb_epochs, train_percent_check=hparams.train_percent_check, val_percent_check=hparams.val_percent_check, test_percent_check=hparams.test_percent_check, val_check_interval=hparams.val_check_interval, log_save_interval=hparams.log_save_interval, add_log_row_interval=hparams.add_log_row_interval, lr_scheduler_milestones=hparams.lr_scheduler_milestones) # train model trainer.fit(model)
def main(hparams, cluster, results_dict): """ Main training routine specific for this project :param hparams: :return: """ on_gpu = hparams.gpus is not None and torch.cuda.is_available() device = 'cuda' if on_gpu else 'cpu' hparams.__setattr__('device', device) hparams.__setattr__('on_gpu', on_gpu) hparams.__setattr__('nb_gpus', torch.cuda.device_count()) hparams.__setattr__('inference_mode', hparams.model_load_weights_path is not None) # delay each training start to not overwrite logs process_position, current_gpu = TRAINING_MODEL.get_process_position( hparams.gpus) sleep(process_position + 1) # init experiment log_dir = os.path.dirname(os.path.realpath(__file__)) exp = Experiment(name='test_tube_exp', debug=True, save_dir=log_dir, version=0, autosave=False, description='test demo') exp.argparse(hparams) exp.save() # build model print('loading model...') model = TRAINING_MODEL(hparams) print('model built') # callbacks early_stop = EarlyStopping(monitor=hparams.early_stop_metric, patience=hparams.early_stop_patience, verbose=True, mode=hparams.early_stop_mode) model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) checkpoint = ModelCheckpoint(filepath=model_save_path, save_function=None, save_best_only=True, verbose=True, monitor=hparams.model_save_monitor_value, mode=hparams.model_save_monitor_mode) # gpus are ; separated for inside a node and , within nodes gpu_list = None if hparams.gpus is not None: gpu_list = [int(x) for x in hparams.gpus.split(';')] # configure trainer trainer = Trainer(experiment=exp, cluster=cluster, checkpoint_callback=checkpoint, early_stop_callback=early_stop, gpus=gpu_list) # train model trainer.fit(model)
    # early-stop callback; the opening of this call is reconstructed here and
    # the monitored metric is left at the library default
    early_stop = EarlyStopping(
        patience=5,
        verbose=True,
        mode='auto')

    checkpoint = ModelCheckpoint(
        filepath=model_save_path,
        # save_best_only=True,
        # save_weights_only=True,
        verbose=True,
        monitor='val_loss',
        mode='auto',
        period=100,
    )

    #-----------------------------------------------------------------------
    # 4 INIT TRAINER
    #-----------------------------------------------------------------------
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        # early_stop_callback=early_stop,
        max_nb_epochs=EPOCH,
        gpus=args.gpu  # map(int, args.gpu.split(',')),  # hparams.gpus,
        # distributed_backend='ddp'
    )

    #-----------------------------------------------------------------------
    # 5 START TRAINING
    #-----------------------------------------------------------------------
    trainer.fit(model)
    sys.exit()