def main():
    parent_parser = HyperOptArgumentParser(strategy="grid_search", add_help=False)
    logdir = "logs"
    parent_parser.add_argument(
        "--test_tube_save_path", default=os.path.join(logdir, "test_tube_data")
    )
    parent_parser.add_argument(
        "--model_save_path", default=os.path.join(logdir, "model_weights")
    )
    parent_parser.add_argument(
        "--experiment_name", default=os.path.join(logdir, "vampire")
    )
    parser = VAMPIRE.add_model_specific_args(parent_parser, ".")
    hparams = parser.parse_args()
    model = VAMPIRE(hparams)
    exp = Experiment(
        name=hparams.experiment_name,
        save_dir=hparams.test_tube_save_path,
        autosave=False,
    )
    exp.argparse(hparams)
    exp.save()
    trainer = Trainer(experiment=exp, fast_dev_run=False)
    trainer.fit(model)
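# The VAMPIRE.add_model_specific_args call above assumes a hook like the one
# sketched here, following the common pytorch-lightning template pattern. The
# hyperparameters registered below are illustrative assumptions, not VAMPIRE's
# real ones; the staticmethod would live on the VAMPIRE class.
@staticmethod
def add_model_specific_args(parent_parser, root_dir):
    parser = HyperOptArgumentParser(strategy="grid_search", parents=[parent_parser])
    # plain (non-tunable) hyperparameter
    parser.add_argument("--learning_rate", default=1e-3, type=float)
    # tunable hyperparameter swept by the grid_search strategy
    parser.opt_list("--hidden_dim", default=64, type=int,
                    options=[32, 64, 128], tunable=True)
    return parser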
def train(hparams, *args):
    """Train your awesome model.

    :param hparams: The arguments to run the model with.
    """
    # Initialize the experiment and track all the hyperparameters.
    exp = Experiment(
        name=hparams.test_tube_exp_name,
        # Location to save the metrics.
        save_dir=hparams.log_path,
        # The experiment version is optional, but using the one
        # from SLURM means the exp will not collide with other
        # versions if SLURM runs multiple at once.
        version=hparams.hpc_exp_number,
        autosave=False,
    )
    exp.argparse(hparams)

    # Pretend to train.
    x = hparams.x_val
    for train_step in range(0, 100):
        y = hparams.y_val
        out = x * y
        exp.log({'fake_err': out})  # x_val and y_val are plain floats, so no .item() here

    # Save exp when done.
    exp.save()
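# A minimal sketch of the SlurmCluster driver that would launch the train()
# above, assuming test_tube's documented SLURM workflow. The argument names
# mirror the hparams fields train() reads; the paths, option values, nb_trials,
# and job_name are illustrative.
from test_tube import HyperOptArgumentParser
from test_tube.hpc import SlurmCluster

parser = HyperOptArgumentParser(strategy='grid_search')
parser.add_argument('--test_tube_exp_name', default='my_test')
parser.add_argument('--log_path', default='/path/to/logs')
parser.opt_list('--x_val', default=12.34, options=[20.0, 12.34], tunable=True)
parser.opt_list('--y_val', default=22.1, options=[0.22, 22.1], tunable=True)
hyperparams = parser.parse_args()

cluster = SlurmCluster(
    hyperparam_optimizer=hyperparams,
    log_path=hyperparams.log_path,
    python_cmd='python3',
)
cluster.per_experiment_nb_gpus = 1
cluster.per_experiment_nb_nodes = 1

# one SLURM job per hyperparameter combination; each job receives
# hpc_exp_number, which train() uses as the Experiment version
cluster.optimize_parallel_cluster_gpu(train, nb_trials=4, job_name='first_tt_batch')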
def main(hparams): """ Main training routine specific for this project :param hparams: :return: """ # ------------------------ # 1 INIT LIGHTNING MODEL # ------------------------ print('loading model...') model = LightningTemplateModel(hparams) print('model built') # ------------------------ # 2 INIT TEST TUBE EXP # ------------------------ # init experiment exp = Experiment( name=hyperparams.experiment_name, save_dir=hyperparams.test_tube_save_path, autosave=False, description='test demo' ) exp.argparse(hparams) exp.save() # ------------------------ # 3 DEFINE CALLBACKS # ------------------------ model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) early_stop = EarlyStopping( monitor='val_acc', patience=3, verbose=True, mode='max' ) checkpoint = ModelCheckpoint( filepath=model_save_path, save_best_only=True, verbose=True, monitor='val_loss', mode='min' ) # ------------------------ # 4 INIT TRAINER # ------------------------ trainer = Trainer( experiment=exp, checkpoint_callback=checkpoint, early_stop_callback=early_stop, ) # ------------------------ # 5 START TRAINING # ------------------------ trainer.fit(model)
def main(hparams): """ Main training routine specific for this project :param hparams: :return: """ # ------------------------ # 1 INIT LIGHTNING MODEL # ------------------------ model = LightningTemplateModel(hparams) # ------------------------ # 2 INIT EXP # ------------------------ # init experiment exp = Experiment(name=hyperparams.experiment_name, save_dir=hyperparams.test_tube_save_path, autosave=False, description='test demo') exp.argparse(hparams) exp.save() # ------------------------ # 3 INIT TRAINER # ------------------------ trainer = Trainer(experiment=exp) # ------------------------ # 4 START TRAINING # ------------------------ trainer.fit(model)
def search_train(args, *extra_args):
    exp = Experiment(
        # Location to save the metrics.
        save_dir=args.ckptdir)
    exp.argparse(args)
    train(args, exp)
    exp.save()
def main(hparams): """ Main training routine specific for this project :param hparams: :return: """ # ------------------------ # 1 INIT LIGHTNING MODEL # ------------------------ print('loading model...') model = LightningTemplateModel(hparams) print('model built') # ------------------------ # 2 INIT TEST TUBE EXP # ------------------------ # init experiment exp = Experiment(name=hyperparams.experiment_name, save_dir=hyperparams.test_tube_save_path, autosave=False, description='test demo') exp.argparse(hparams) exp.save() # ------------------------ # 3 INIT TRAINER # ------------------------ trainer = Trainer(experiment=exp, gpus=hparams.gpus, use_amp=True) # ------------------------ # 4 START TRAINING # ------------------------ trainer.fit(model)
def run_experiment(hparams, *_):
    print(os.environ)
    num_workers = int(os.environ['SLURM_NNODES'])
    node_id = int(os.environ['SLURM_NODEID'])
    fold = 0
    kfold = 5
    debug = True
    path = (os.environ['SCRATCH'] + f"/summer_school/hopt{fold}/job"
            + os.environ['SLURM_TASK_PID'] + os.environ['HOSTNAME'])
    print(node_id, path)

    exp = Experiment(save_dir=f'{path}/exp')
    exp.argparse(hparams)
    hparams.optimizer = tfa.optimizers.LAMB(lr=hparams.lr, weight_decay_rate=hparams.wd)
    print(hparams, flush=True)

    # start trainer
    auc = train(vars(hparams), num_workers, node_id, fold, kfold, debug, path)
    print(auc)

    # save Experiment
    exp.add_scalar('auc', auc)
    exp.save()
def main(hparams, cluster, results_dict):
    """
    Main training routine specific for this project
    :param hparams:
    :return:
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # when using grid search, it's possible for all models to start at once
    # and use the same test tube experiment version
    relative_node_id = int(os.environ['SLURM_NODEID'])
    sleep(relative_node_id + 1)

    # init experiment
    exp = Experiment(name=hparams.experiment_name,
                     save_dir=hparams.test_tube_save_path,
                     autosave=False,
                     description='test demo')
    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 DEFINE CALLBACKS
    # ------------------------
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    early_stop = EarlyStopping(monitor='val_acc',
                               patience=3,
                               verbose=True,
                               mode='max')

    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 save_best_only=True,
                                 verbose=True,
                                 monitor='val_loss',
                                 mode='min')

    # ------------------------
    # 4 INIT TRAINER
    # ------------------------
    trainer = Trainer(experiment=exp,
                      cluster=cluster,
                      checkpoint_callback=checkpoint,
                      early_stop_callback=early_stop,
                      gpus=hparams.gpus,
                      nb_gpu_nodes=hparams.nb_gpu_nodes)

    # ------------------------
    # 5 START TRAINING
    # ------------------------
    trainer.fit(model)
def train(hparams):
    # this won't crash ever. If no exp number is there, it'll be None
    exp_version_from_slurm_script = hparams.hpc_exp_number

    # init exp and track all the parameters from the HyperOptArgumentParser.
    # the experiment version is optional, but using the one from slurm means
    # the exp will not collide with other versions if slurm runs multiple at once.
    exp = Experiment(
        name=hparams.test_tube_exp_name,
        save_dir=hparams.log_path,
        version=exp_version_from_slurm_script,
        autosave=False,
    )
    exp.argparse(hparams)

    # pretend to train
    x = hparams.x_val
    for train_step in range(0, 100):
        y = hparams.y_val
        out = x * y
        exp.log({'fake_err': out})  # out is a plain float, so no .item()

    # save exp when we're done
    exp.save()
def main(hparams): """ Main training routine specific for this project :param hparams: :return: """ # init experiment exp = Experiment( name=hparams.tt_name, debug=hparams.debug, save_dir=hparams.tt_save_path, version=hparams.hpc_exp_number, autosave=False, description=hparams.tt_description ) exp.argparse(hparams) exp.save() # build model model = LightningTemplateModel(hparams) # configure trainer trainer = Trainer(experiment=exp) # train model trainer.fit(model)
def train(hparams, *args):
    """Train your awesome model.

    :param hparams: The arguments to run the model with.
    """
    # Initialize the experiment and track all the hyperparameters.
    # if hparams.disease_model:
    #     save_model_path = hparams.save_model_dir + '/disease'
    # else:
    #     save_model_path = hparams.save_model_dir + '/synthetic'

    # Set seeds
    SEED = hparams.seed
    torch.manual_seed(SEED)
    np.random.seed(SEED)
    print(hparams)
    print(args)
    exp = Experiment(
        name=hparams.test_tube_exp_name,
        # Location to save the metrics.
        save_dir=hparams.log_path,
        autosave=False,
    )
    exp.argparse(hparams)

    # checkpoint_callback = ModelCheckpoint(
    #     filepath=save_model_path + '/' + hparams.cage_nr +
    #     '/version_' + str(cluster.hpc_exp_number) + '/checkpoints',
    #     verbose=True,
    #     monitor='val_loss',
    #     mode='min',
    #     prefix=''
    # )

    # # Pretend to train.
    # x = torch.rand((1, hparams.x_val))
    # for train_step in range(0, 100):
    #     y = torch.rand((hparams.x_val, 1))
    #     out = x.mm(y)
    #     exp.log({'fake_err': out.item()})

    (dsl, trainedmodels, validatedmodels,
     losses, lossdf, knnres) = runevaler(
        "opsitu", hparams.epochs, [ESNNSystem], [TorchEvaler],
        [eval_dual_ann],
        networklayers=[hparams.c_layers, hparams.g_layers],
        lrs=[hparams.lr], dropoutrates=[hparams.dropout],
        validate_on_k=10, n=1, filenamepostfixes=["esnn"])

    stats = stat(lossdf, hparams.epochs, "esnn")
    print(f"type : {type(stats)}")
    print(f"innertype : {type(stats[0])}")
    print(f"stats : {stats}")
    print(f"stats0 : {stats[0]}")
    exp.log({'loss': stats[0]})
    # exp.log({'tng_err': tng_err})

    # Save exp when done.
    exp.save()
def main(hparams, cluster=None, results_dict=None):
    """
    Main training routine specific for this project
    :param hparams:
    :return:
    """
    # init experiment
    log_dir = os.path.dirname(os.path.realpath(__file__))
    exp = Experiment(
        name='test_tube_exp',
        debug=True,
        save_dir=log_dir,
        version=0,
        autosave=False,
        description='test demo'
    )

    hparams.training_set_path = '/Volumes/Elements/Datasets/Immersions/house_data_mp3/training'
    hparams.validation_set_path = '/Volumes/Elements/Datasets/Immersions/house_data_mp3/validation'
    hparams.test_task_set_path = '/Volumes/Elements/Datasets/Immersions/house_data_mp3/test_task'
    hparams.batch_size = 4

    # set the hparams for the experiment
    exp.argparse(hparams)
    exp.save()

    # build model
    model = ContrastivePredictiveSystem(hparams)

    # callbacks
    early_stop = EarlyStopping(
        monitor=hparams.early_stop_metric,
        patience=hparams.early_stop_patience,
        verbose=True,
        mode=hparams.early_stop_mode
    )

    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    checkpoint = ModelCheckpoint(
        filepath=model_save_path,
        save_best_only=True,
        verbose=True,
        monitor=hparams.model_save_monitor_value,
        mode=hparams.model_save_monitor_mode
    )

    # configure trainer
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
        # distributed_backend='dp',
        # gpus=[0],
        nb_sanity_val_steps=2
    )

    # train model
    trainer.fit(model)
def main(hparams): """ Main training routine specific for this project """ # ------------------------ # 1 INIT LIGHTNING MODEL # ------------------------ print('loading model...') model = DSANet(hparams) print('model built') # ------------------------ # 2 INIT TEST TUBE EXP # ------------------------ # init experiment exp = Experiment( name='dsanet_exp_{}_window={}_horizon={}'.format(hparams.data_name, hparams.window, hparams.horizon), save_dir=hparams.test_tube_save_path, autosave=False, description='test demo' ) exp.argparse(hparams) exp.save() # ------------------------ # 3 DEFINE CALLBACKS # ------------------------ model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) early_stop = EarlyStopping( monitor='val_loss', patience=5, verbose=True, mode='min' ) # ------------------------ # 4 INIT TRAINER # ------------------------ trainer = Trainer( gpus=[0], # auto_scale_batch_size=True, max_epochs=10, # num_processes=2, # num_nodes=2 ) # ------------------------ # 5 START TRAINING # ------------------------ trainer.fit(model) print('View tensorboard logs by running\ntensorboard --logdir %s' % os.getcwd()) print('and going to http://localhost:6006 on your browser')
def main(hparams): """ Main training routine specific for this project :param hparams: :return: """ # init experiment exp = Experiment( name=hparams.tt_name, debug=hparams.debug, save_dir=hparams.tt_save_path, version=hparams.hpc_exp_number, autosave=False, description=hparams.tt_description ) exp.argparse(hparams) exp.save() # build model model = ExampleModel(hparams) # callbacks early_stop = EarlyStopping( monitor='val_acc', patience=3, mode='min', verbose=True, ) model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) checkpoint = ModelCheckpoint( filepath=model_save_path, save_function=None, save_best_only=True, verbose=True, monitor='val_acc', mode='min' ) # configure trainer trainer = Trainer( experiment=exp, checkpoint_callback=checkpoint, early_stop_callback=early_stop, ) # train model trainer.fit(model)
def train(hparams):
    # init exp and track all the parameters from the HyperOptArgumentParser
    exp = Experiment(
        name=hparams.test_tube_exp_name,
        save_dir=hparams.log_path,
        autosave=False,
    )
    exp.argparse(hparams)

    # pretend to train
    x = torch.rand((1, hparams.x_val))
    for train_step in range(0, 100):
        y = torch.rand((hparams.x_val, 1))
        out = x.mm(y)
        exp.log({'fake_err': out.item()})

    # save exp when we're done
    exp.save()
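# A minimal sketch of a local driver for the train() demo above, assuming
# test_tube's documented trial generation on the parsed namespace. Here x_val
# doubles as a tensor dimension, so the options are illustrative integers.
from test_tube import HyperOptArgumentParser

parser = HyperOptArgumentParser(strategy='random_search')
parser.add_argument('--test_tube_exp_name', default='demo')
parser.add_argument('--log_path', default='logs')
parser.opt_list('--x_val', default=8, type=int, options=[4, 8, 16, 32], tunable=True)
hyperparams = parser.parse_args()

# run a handful of trials sequentially; each trial is a sampled hparams copy
for trial_hparams in hyperparams.trials(5):
    train(trial_hparams)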
def main(hparams):
    # load model
    model = MyModel(hparams)

    # init experiment
    exp = Experiment(
        name=hparams.experiment_name,
        save_dir=hparams.test_tube_save_path,
        autosave=False,
        description='baseline attn interval'
    )
    exp.argparse(hparams)
    exp.save()

    # define callbacks
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=5,
        verbose=True,
        mode='min'
    )
    checkpoint = ModelCheckpoint(
        filepath=model_save_path,
        save_best_only=True,
        verbose=True,
        monitor='pr',
        mode='max'
    )

    # init trainer
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
        gpus=hparams.gpus,
        val_check_interval=1
    )

    # start training
    trainer.fit(model)
def main(hparams, cluster):
    """
    Main training routine specific for this project
    :param hparams:
    :return:
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # when using grid search, it's possible for all models to start at once
    # and use the same test tube experiment version
    relative_node_id = int(os.environ['SLURM_NODEID'])
    sleep(relative_node_id + 1)

    # init experiment
    exp = Experiment(
        name=hparams.experiment_name,
        save_dir=hparams.test_tube_save_path,
        autosave=False,
        version=hparams.hpc_exp_number,  # match the slurm job version number
        description='test demo')
    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
    trainer = Trainer(experiment=exp,
                      gpus=hparams.per_experiment_nb_gpus,
                      nb_gpu_nodes=hparams.nb_gpu_nodes,
                      distributed_backend=hparams.distributed_backend)

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)
def main(hparams):
    exp = Experiment(
        name=hparams.tt_name,
        debug=hparams.debug,
        save_dir=hparams.tt_save_path,
        version=hparams.hpc_exp_number,
        autosave=False,
        description=hparams.tt_description,
    )
    exp.argparse(hparams)
    exp.save()

    model = AutoregressiveFaceVAE(hparams)

    early_stop = EarlyStopping(monitor="avg_val_loss", patience=3, verbose=True, mode="min")

    model_save_path = "{}/{}/{}".format(hparams.model_save_path, exp.name, exp.version)
    checkpoint = ModelCheckpoint(
        filepath=model_save_path,
        save_best_only=True,
        verbose=True,
        monitor="avg_val_loss",
        mode="min",
    )

    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
        gpus=hparams.gpus,
        distributed_backend=hparams.dist_backend,
        # val_check_interval=0.5,
        # distributed_backend="dp",
        # overfit_pct=0.01
    )
    trainer.fit(model)
def main(hparams, data):
    # init experiment
    log_dir = os.path.dirname(os.path.realpath(__file__))
    exp = Experiment(name=hparams.exp_name,
                     debug=False,
                     save_dir=log_dir,
                     version=0,
                     autosave=True,
                     description='P2R codebase')

    # set the hparams for the experiment
    exp.argparse(hparams)
    exp.save()

    # build model
    model = P2rSystem(hparams, data)

    model_save_path = '{}/{}/version_{}/checkpoints'.format(
        exp.save_dir, exp.name, exp.version)
    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 verbose=True,
                                 monitor='tng_loss',
                                 mode='min',
                                 save_best_only=True)

    # configure trainer
    trainer = Trainer(experiment=exp,
                      checkpoint_callback=checkpoint,
                      min_nb_epochs=1,
                      max_nb_epochs=hparams.max_nb_epochs,
                      track_grad_norm=2,
                      accumulate_grad_batches=1,
                      row_log_interval=1,
                      amp_level='O2',
                      use_amp=True,
                      gpus=1)

    # train model
    trainer.fit(model)
    trainer.test()

    filepath = '{}/_ckpt_epoch_final.ckpt'.format(model_save_path)
    checkpoint.save_model(filepath, False)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', choices=['srcnn', 'srgan'], required=True)
    parser.add_argument('--scale_factor', type=int, default=4)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--patch_size', type=int, default=96)
    parser.add_argument('--gpus', type=str, default='0')
    opt = parser.parse_args()

    # load model class
    if opt.model == 'srcnn':
        Model = models.SRCNNModel
    elif opt.model == 'srgan':
        Model = models.SRGANModel

    # add model specific arguments to original parser
    parser = Model.add_model_specific_args(parser)
    opt = parser.parse_args()

    # instantiate experiment
    exp = Experiment(save_dir=f'./logs/{opt.model}')
    exp.argparse(opt)

    model = Model(opt)

    # define callbacks
    checkpoint_callback = ModelCheckpoint(
        filepath=exp.get_media_path(exp.name, exp.version),
    )

    # instantiate trainer
    trainer = Trainer(
        experiment=exp,
        max_nb_epochs=4000,
        add_log_row_interval=50,
        check_val_every_n_epoch=10,
        checkpoint_callback=checkpoint_callback,
        gpus=[int(i) for i in opt.gpus.split(',')]
    )

    # start training!
    trainer.fit(model)
def main(hparams):
    # init experiment
    experiment_args = parse_argdict_for_method(Experiment.__init__, hparams)
    exp = Experiment(**experiment_args)

    # set the hparams for the experiment
    exp.argparse(hparams)
    exp.save()

    # build model
    model = Network(hparams)

    # callbacks
    if hparams.enable_early_stop:
        early_stop = EarlyStopping(monitor=hparams.monitor_value,
                                   patience=hparams.patience,
                                   verbose=True,
                                   mode=hparams.monitor_mode)
    else:
        early_stop = None

    if hparams.enable_model_checkpoint:
        model_save_path = pathlib.Path(exp.log_dir).parent / 'model_weights'
        checkpoint = ModelCheckpoint(
            filepath=model_save_path,
            save_best_only=hparams.save_best_only,
            save_weights_only=hparams.save_weights_only,
            verbose=True,
            monitor=hparams.monitor_value,
            mode=hparams.monitor_mode)
    else:
        checkpoint = None

    # configure trainer
    trainer_args = parse_argdict_for_method(Trainer.__init__, hparams)
    trainer = Trainer(experiment=exp,
                      early_stop_callback=early_stop,
                      checkpoint_callback=checkpoint,
                      **trainer_args)

    # train model
    trainer.fit(model)
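# parse_argdict_for_method is not defined in this snippet. A minimal sketch of
# what it presumably does, assuming it filters an argparse Namespace down to
# the keyword arguments a given method accepts:
import inspect


def parse_argdict_for_method(method, hparams):
    """Return the subset of vars(hparams) matching `method`'s parameters."""
    params = inspect.signature(method).parameters
    return {name: value for name, value in vars(hparams).items()
            if name in params and name != 'self'}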
def train(hparams, *args):
    """Train your awesome model.

    :param hparams: The arguments to run the model with.
    """
    # Initialize the experiment and track all the hyperparameters.
    exp = Experiment(
        name=hparams.test_tube_exp_name,
        # Location to save the metrics.
        save_dir=hparams.log_path,
        autosave=False,
    )
    exp.argparse(hparams)

    # Pretend to train.
    x = torch.rand((1, hparams.x_val))
    for train_step in range(0, 100):
        y = torch.rand((hparams.x_val, 1))
        out = x.mm(y)
        exp.log({'fake_err': out.item()})

    # Save exp when done.
    exp.save()
def train(hparams):
    # init exp and track all the parameters from the HyperOptArgumentParser
    exp = Experiment(
        name=hparams.test_tube_exp_name,
        save_dir=hparams.log_path,
        autosave=False,
    )
    exp.argparse(hparams)

    # define tensorflow graph
    x = tf.placeholder(dtype=tf.int32, name='x')
    y = tf.placeholder(dtype=tf.int32, name='y')
    out = x * y

    sess = tf.Session()

    # run the tf op
    for train_step in range(0, 100):
        output = sess.run(out, feed_dict={x: hparams.x_val, y: hparams.y_val})
        exp.log({'fake_err': output})

    # save exp when we're done
    exp.save()
"--batch_size", default=128, type=int, tunable=True, options=[2**n for n in range(5, 10)], ) args = parser.parse_args() args.max_steps = 1000 args.subpolicy_duration = 200 args.num_policies = 10 args.max_buffer_size = 1_000_000 args.env_names = ["Ant-v2"] exp.argparse(args) State = Any Action = Any Timestep = int class MasterPolicy(nn.Module): """Returns categorical distribution over subpolicies.""" def __init__(self, state_size, hidden_size, output_size=args.num_policies): super().__init__() S, H, O = state_size, hidden_size, output_size self.fc1 = Linear(S, H) self.fc2 = Linear(H, H) self.out = Linear(H, O)
def main(hparams, cluster, results_dict):
    """
    Main training routine specific for this project
    :param hparams:
    :return:
    """
    on_gpu = torch.cuda.is_available()
    if hparams.disable_cuda:
        on_gpu = False

    device = 'cuda' if on_gpu else 'cpu'
    hparams.__setattr__('device', device)
    hparams.__setattr__('on_gpu', on_gpu)
    hparams.__setattr__('nb_gpus', torch.cuda.device_count())
    hparams.__setattr__('inference_mode', hparams.model_load_weights_path is not None)

    # delay each training start to not overwrite logs
    process_position, current_gpu = TRAINING_MODEL.get_process_position(hparams.gpus)
    sleep(process_position + 1)

    # init experiment
    exp = Experiment(name=hparams.tt_name,
                     debug=hparams.debug,
                     save_dir=hparams.tt_save_path,
                     version=hparams.hpc_exp_number,
                     autosave=False,
                     description=hparams.tt_description)
    exp.argparse(hparams)
    exp.save()

    # build model
    print('loading model...')
    model = TRAINING_MODEL(hparams)
    print('model built')

    # callbacks
    early_stop = EarlyStopping(monitor=hparams.early_stop_metric,
                               patience=hparams.early_stop_patience,
                               verbose=True,
                               mode=hparams.early_stop_mode)

    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 save_function=None,
                                 save_best_only=True,
                                 verbose=True,
                                 monitor=hparams.model_save_monitor_value,
                                 mode=hparams.model_save_monitor_mode)

    # configure trainer
    trainer = Trainer(
        experiment=exp,
        cluster=cluster,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
    )

    # train model
    trainer.fit(model)
        if epoch % 10 == 0 and epoch > 5:
            self.save(self.model_save_dir / 'checkpoints_{}.pth'.format(epoch))

    def save(self, path: Path):
        torch.save(self.net.state_dict(), path)

    def load(self, path: Path):
        self.net.load_state_dict(torch.load(path))


def to_cpu(tensor):
    return tensor.detach().cpu().numpy()


if __name__ == '__main__':
    sequences = ['MOT16-02', 'MOT16-04', 'MOT16-05', 'MOT16-09',
                 'MOT16-10', 'MOT16-11', 'MOT16-13']

    args = get_parser().parse_args()
    args.train_sequences = sequences[:6]
    args.val_sequences = sequences[6:]

    output_dir = Path(args.log_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    logger = Experiment(output_dir, name=args.name, autosave=True, flush_secs=15)
    logger.argparse(args)

    model = GraphNNMOTracker(args, logger)
    model.train()
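# get_parser() is not shown in the snippet above. A hypothetical sketch that
# covers only the two arguments the __main__ block actually reads (log_dir and
# name); the defaults are illustrative, and GraphNNMOTracker surely consumes
# more arguments than these.
import argparse


def get_parser():
    parser = argparse.ArgumentParser(description='GraphNN MOT tracker training')
    parser.add_argument('--log_dir', default='logs', help='where the Experiment writes')
    parser.add_argument('--name', default='gnn_mot', help='experiment name')
    return parser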
def main(hparams, cluster, results_dict):
    """
    Main training routine specific for this project
    :param hparams:
    :return:
    """
    on_gpu = torch.cuda.is_available()
    if hparams.disable_cuda:
        on_gpu = False

    device = 'cuda' if on_gpu else 'cpu'
    hparams.__setattr__('device', device)
    hparams.__setattr__('on_gpu', on_gpu)
    hparams.__setattr__('nb_gpus', torch.cuda.device_count())
    hparams.__setattr__('inference_mode', hparams.model_load_weights_path is not None)

    # delay each training start to not overwrite logs
    process_position, current_gpu = TRAINING_MODEL.get_process_position(hparams.gpus)
    sleep(process_position + 1)

    # init experiment
    exp = Experiment(name=hparams.tt_name,
                     debug=hparams.debug,
                     save_dir=hparams.tt_save_path,
                     version=hparams.hpc_exp_number,
                     autosave=False,
                     description=hparams.tt_description)
    exp.argparse(hparams)
    exp.save()

    # build model
    print('loading model...')
    model = TRAINING_MODEL(hparams)
    print('model built')

    # callbacks
    early_stop = EarlyStopping(monitor=hparams.early_stop_metric,
                               patience=hparams.early_stop_patience,
                               verbose=True,
                               mode=hparams.early_stop_mode)

    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 save_function=None,
                                 save_best_only=True,
                                 verbose=True,
                                 monitor=hparams.model_save_monitor_value,
                                 mode=hparams.model_save_monitor_mode)

    # configure trainer
    trainer = Trainer(experiment=exp,
                      on_gpu=on_gpu,
                      cluster=cluster,
                      enable_tqdm=hparams.enable_tqdm,
                      overfit_pct=hparams.overfit,
                      track_grad_norm=hparams.track_grad_norm,
                      fast_dev_run=hparams.fast_dev_run,
                      check_val_every_n_epoch=hparams.check_val_every_n_epoch,
                      accumulate_grad_batches=hparams.accumulate_grad_batches,
                      process_position=process_position,
                      current_gpu_name=current_gpu,
                      checkpoint_callback=checkpoint,
                      early_stop_callback=early_stop,
                      enable_early_stop=hparams.enable_early_stop,
                      max_nb_epochs=hparams.max_nb_epochs,
                      min_nb_epochs=hparams.min_nb_epochs,
                      train_percent_check=hparams.train_percent_check,
                      val_percent_check=hparams.val_percent_check,
                      test_percent_check=hparams.test_percent_check,
                      val_check_interval=hparams.val_check_interval,
                      log_save_interval=hparams.log_save_interval,
                      add_log_row_interval=hparams.add_log_row_interval,
                      lr_scheduler_milestones=hparams.lr_scheduler_milestones)

    # train model
    trainer.fit(model)
def main(hparams, cluster=None, results_dict=None):
    """
    Main training routine specific for this project
    :param hparams:
    :return:
    """
    # init experiment
    log_dir = os.path.dirname(os.path.realpath(__file__))
    exp = Experiment(name='test_tube_exp',
                     debug=True,
                     save_dir=log_dir,
                     version=0,
                     autosave=False,
                     description='test demo')

    hparams.training_set_path = '/Volumes/Elements/Datasets/Immersions/house_data_mp3/training'
    hparams.validation_set_path = '/Volumes/Elements/Datasets/Immersions/house_data_mp3/validation'
    hparams.test_task_set_path = '/Volumes/Elements/Datasets/Immersions/house_data_mp3/test_task'
    hparams.dummy_datasets = False
    hparams.audio_noise = 3e-3

    hparams.cqt_fmin = 40.
    hparams.cqt_bins_per_octave = 24
    hparams.cqt_n_bins = 216
    hparams.cqt_hop_length = 512
    hparams.cqt_filter_scale = 0.43

    hparams.enc_channels = (1, 8, 16, 32, 64, 128, 256, 512, 512)
    hparams.enc_kernel_1_w = (3, 3, 3, 3, 3, 3, 3, 3)
    hparams.enc_kernel_1_h = (3, 3, 3, 3, 3, 3, 3, 3)
    hparams.enc_kernel_2_w = (1, 3, 1, 3, 1, 3, 1, 3)
    hparams.enc_kernel_2_h = (25, 3, 25, 3, 25, 3, 4, 3)
    hparams.enc_padding_1 = (1, 1, 1, 1, 1, 1, 1, 1)
    hparams.enc_padding_2 = (0, 1, 0, 1, 0, 1, 0, 0)
    hparams.enc_stride_1 = (1, 1, 1, 1, 1, 1, 1, 1)
    hparams.enc_stride_2 = (1, 1, 1, 1, 1, 1, 1, 1)
    hparams.enc_pooling_1 = (2, 1, 1, 1, 2, 1, 1, 1)

    hparams.ar_kernel_sizes = (5, 4, 1, 3, 3, 1, 3, 1, 6)
    hparams.ar_self_attention = (False, False, False, False, False, False, False, False, False)

    hparams.batch_size = 4
    hparams.learning_rate = 3e-4
    hparams.warmup_steps = 1000
    hparams.annealing_steps = 100000
    hparams.score_over_all_timesteps = False
    hparams.visible_steps = 60

    # set the hparams for the experiment
    exp.argparse(hparams)
    exp.save()

    # build model
    model = ContrastivePredictiveSystem(hparams)

    # callbacks
    early_stop = EarlyStopping(monitor=hparams.early_stop_metric,
                               patience=hparams.early_stop_patience,
                               verbose=True,
                               mode=hparams.early_stop_mode)

    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 save_best_only=True,
                                 verbose=True,
                                 monitor=hparams.model_save_monitor_value,
                                 mode=hparams.model_save_monitor_mode)

    # configure trainer
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        # early_stop_callback=early_stop,
        # distributed_backend='dp',
        # gpus=[0],
        nb_sanity_val_steps=2,
        gradient_clip=0.5)

    # train model
    trainer.fit(model)
def main(hparams, cluster=None, results_dict=None):
    """
    Main training routine specific for this project
    :param hparams:
    :return:
    """
    # init experiment
    name = 'immersions_scalogram_resnet_maestro'
    version = 0
    hparams.log_dir = '/home/idivinci3005/experiments/logs'
    hparams.checkpoint_dir = ('/home/idivinci3005/experiments/checkpoints/'
                              + name + '/' + str(version))
    hparams.training_set_path = '/home/idivinci3005/data/maestro-v2.0.0'
    hparams.validation_set_path = '/home/idivinci3005/data/maestro-v2.0.0'
    hparams.test_task_set_path = '/home/idivinci3005/data/maestro-v2.0.0'
    hparams.audio_noise = 3e-3

    hparams.ar_kernel_sizes = (5, 4, 1, 3, 3, 1, 3, 1, 6)
    hparams.ar_self_attention = (False, False, False, False, False, False, False, False, False)

    hparams.batch_size = 32
    hparams.learning_rate = 3e-4
    hparams.warmup_steps = 1000
    hparams.annealing_steps = 100000
    hparams.score_over_all_timesteps = False
    hparams.visible_steps = 62

    if not os.path.exists(hparams.checkpoint_dir):
        os.makedirs(hparams.checkpoint_dir)  # makedirs: the checkpoint path is nested

    exp = Experiment(name=name,
                     debug=False,
                     save_dir=hparams.log_dir,
                     version=version,
                     autosave=False,
                     description='maestro dataset experiment')

    # set the hparams for the experiment
    exp.argparse(hparams)
    exp.save()

    # build model
    model = ContrastivePredictiveSystemMaestro(hparams)
    task_model = MaestroClassificationTaskModel(
        model, task_dataset_path=hparams.test_task_set_path)
    model.test_task_model = task_model

    # callbacks
    early_stop = EarlyStopping(monitor=hparams.early_stop_metric,
                               patience=hparams.early_stop_patience,
                               verbose=True,
                               mode=hparams.early_stop_mode)

    checkpoint = ModelCheckpoint(filepath=hparams.checkpoint_dir,
                                 save_best_only=False,
                                 verbose=True,
                                 monitor=hparams.model_save_monitor_value,
                                 mode=hparams.model_save_monitor_mode)

    # configure trainer
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        # early_stop_callback=early_stop,
        # distributed_backend='dp',
        gpus=[0],
        nb_sanity_val_steps=5,
        val_check_interval=0.1,
        val_percent_check=0.25,
        # train_percent_check=0.01
    )

    # train model
    trainer.fit(model)
def main(hparams, cluster, results_dict):
    """
    Main training routine specific for this project
    :param hparams:
    :return:
    """
    on_gpu = hparams.gpus is not None and torch.cuda.is_available()
    device = 'cuda' if on_gpu else 'cpu'
    hparams.__setattr__('device', device)
    hparams.__setattr__('on_gpu', on_gpu)
    hparams.__setattr__('nb_gpus', torch.cuda.device_count())
    hparams.__setattr__('inference_mode', hparams.model_load_weights_path is not None)

    # delay each training start to not overwrite logs
    process_position, current_gpu = TRAINING_MODEL.get_process_position(hparams.gpus)
    sleep(process_position + 1)

    # init experiment
    log_dir = os.path.dirname(os.path.realpath(__file__))
    exp = Experiment(name='test_tube_exp',
                     debug=True,
                     save_dir=log_dir,
                     version=0,
                     autosave=False,
                     description='test demo')
    exp.argparse(hparams)
    exp.save()

    # build model
    print('loading model...')
    model = TRAINING_MODEL(hparams)
    print('model built')

    # callbacks
    early_stop = EarlyStopping(monitor=hparams.early_stop_metric,
                               patience=hparams.early_stop_patience,
                               verbose=True,
                               mode=hparams.early_stop_mode)

    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 save_function=None,
                                 save_best_only=True,
                                 verbose=True,
                                 monitor=hparams.model_save_monitor_value,
                                 mode=hparams.model_save_monitor_mode)

    # gpus are ';'-separated within a node and ','-separated across nodes
    gpu_list = None
    if hparams.gpus is not None:
        gpu_list = [int(x) for x in hparams.gpus.split(';')]

    # configure trainer
    trainer = Trainer(experiment=exp,
                      cluster=cluster,
                      checkpoint_callback=checkpoint,
                      early_stop_callback=early_stop,
                      gpus=gpu_list)

    # train model
    trainer.fit(model)