Example #1
def run_experiment(hparams, *_):
    print(os.environ)
    num_workers = int(os.environ['SLURM_NNODES'])
    node_id = int(os.environ['SLURM_NODEID'])

    fold  = 0
    kfold = 5
    debug = True
    path = f"{os.environ['SCRATCH']}/summer_school/hopt{fold}/job{os.environ['SLURM_TASK_PID']}{os.environ['HOSTNAME']}"
    print(node_id, path)

    exp = Experiment(save_dir=f'{path}/exp')
    exp.argparse(hparams)

    hparams.optimizer = tfa.optimizers.LAMB(lr=hparams.lr,
                                            weight_decay_rate=hparams.wd)
    print(hparams, flush=True)

    # start trainer
    auc = train(vars(hparams), num_workers, node_id, fold, kfold, debug, path)
    print(auc)

    # save Experiment
    exp.add_scalar('auc', auc)
    exp.save()
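These listings are excerpts, so their imports (os, test_tube's Experiment, tensorflow_addons as tfa, and each project's own helpers such as train) are omitted. For orientation, a minimal self-contained sketch of the Experiment pattern the snippets share, assuming only the test_tube package:

# minimal sketch of the shared pattern: create, log, save
from test_tube import Experiment

exp = Experiment(name='demo', save_dir='./logs', autosave=False)
for step in range(10):
    exp.log({'fake_err': 1.0 / (step + 1)})  # one metrics row per call
exp.save()  # flush metrics and meta tags under save_dir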
Example #2
def train(hparams, *args):
    """Train your awesome model.

    :param hparams: The arguments to run the model with.
    """
    # Initialize experiments and track all the hyperparameters
    exp = Experiment(
        name=hparams.test_tube_exp_name,
        # Location to save the metrics.
        save_dir=hparams.log_path,
        # The experiment version is optional, but using the one
        # from SLURM means the exp will not collide with other
        # versions if SLURM runs multiple at once.
        version=hparams.hpc_exp_number,
        autosave=False,
    )
    exp.argparse(hparams)

    # Pretend to train.
    x = hparams.x_val
    for train_step in range(0, 100):
        y = hparams.y_val
        out = x * y
        exp.log({'fake_err': out})  # Log metrics (out is a plain float here, so no .item()).

    # Save exp when done.
    exp.save()
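A train(hparams, *args) function like the one above is usually dispatched by test_tube's hyperparameter search. The sketch below is a hedged reconstruction of such a driver: opt_list and optimize_parallel_cpu are test_tube APIs, the argument names mirror the snippet, and hpc_exp_number is assumed to be attached by test_tube itself (None when not running under SLURM).

from test_tube import HyperOptArgumentParser

parser = HyperOptArgumentParser(strategy='grid_search')
parser.add_argument('--test_tube_exp_name', default='fake_training')
parser.add_argument('--log_path', default='./logs')
# tunable options define the grid searched over
parser.opt_list('--x_val', default=1.0, type=float, options=[0.5, 1.0, 2.0], tunable=True)
parser.opt_list('--y_val', default=1.0, type=float, options=[0.5, 1.0, 2.0], tunable=True)
hparams = parser.parse_args()

# one trial per sampled combination, two workers at a time
hparams.optimize_parallel_cpu(train, nb_trials=4, nb_workers=2)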
Example #3
def train(hparams):

    # this won't crash ever. If no exp number is there, it'll be None
    exp_version_from_slurm_script = hparams.hpc_exp_number

    # init exp and track all the parameters from the HyperOptArgumentParser
    # the experiment version is optional, but using the one from slurm means the exp will not collide with other
    # versions if slurm runs multiple at once.
    exp = Experiment(
        name=hparams.test_tube_exp_name,
        save_dir=hparams.log_path,
        version=exp_version_from_slurm_script,
        autosave=False,
    )
    exp.argparse(hparams)

    # pretend to train
    x = hparams.x_val
    for train_step in range(0, 100):
        y = hparams.y_val
        out = x * y
        exp.log({'fake_err': out})  # out is a plain float here, so no .item()

    # save exp when we're done
    exp.save()
Example #4
def train(hparams, *args):
    """Train your awesome model.
    :param hparams: The arguments to run the model with.
    """
    # Initialize experiments and track all the hyperparameters
    # if hparams.disease_model:
    #     save_model_path = hparams.save_model_dir+'/disease'
    # else:
    #     save_model_path = hparams.save_model_dir+'/synthetic'
    # Set seeds
    SEED = hparams.seed
    torch.manual_seed(SEED)
    np.random.seed(SEED)
    print(hparams)
    print(args)
    exp = Experiment(
        name=hparams.test_tube_exp_name,
        # Location to save the metrics.
        save_dir=hparams.log_path,
        autosave=False,
    )
    exp.argparse(hparams)
    # checkpoint_callback = ModelCheckpoint(
    #     filepath=save_model_path+'/'+hparams.cage_nr +
    #     '/version_'+str(cluster.hpc_exp_number)+'/checkpoints',
    #     verbose=True,
    #     monitor='val_loss',
    #     mode='min',
    #     prefix=''
    # )
    # # Pretend to train.
    # x = torch.rand((1, hparams.x_val))
    # for train_step in range(0, 100):
    #     y = torch.rand((hparams.x_val, 1))
    #     out = x.mm(y)
    #     exp.log({'fake_err': out.item()})

    (dsl, trainedmodels, validatedmodels, losses, lossdf, knnres) = runevaler(
        "opsitu", hparams.epochs, [ESNNSystem], [TorchEvaler], [eval_dual_ann],
        networklayers=[hparams.c_layers, hparams.g_layers],
        lrs=[hparams.lr],
        dropoutrates=[hparams.dropout],
        validate_on_k=10, n=1,
        filenamepostfixes=["esnn"])
    stats = stat(lossdf, hparams.epochs, "esnn")
    print(f"type : {type(stats)}")
    print(f"innertype : {type(stats[0])}")
    print(f"stats : {stats}")
    print(f"stats0 : {stats[0]}")
    exp.log({'loss': stats[0]})
    # Save exp when done.
    exp.save()
Example #5
def main(hparams):
    """
    Main training routine specific for this project
    :param hparams:
    :return:
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    model = LightningTemplateModel(hparams)

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # init experiment
    exp = Experiment(name='test_exp',
                     save_dir=hparams.log_dir,
                     autosave=False,
                     description='test demo')

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
    trainer = Trainer(experiment=exp, gpus=8, nb_gpu_nodes=2)

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)
Example #6
def main():
    config = Config()
    model = Classifier2DC(config)

    exp = Experiment(config.train_out_dir)

    with open(os.path.join(exp.log_dir, '../config.json'), 'w') as f:
        config_dict = {k: getattr(Config, k) for k in dir(Config) if not k.startswith('__')}
        json.dump(config_dict, f, indent=2)

    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(config.train_out_dir, "models"),
        save_best_only=True,
        verbose=True,
        monitor='avg_val_loss',
        mode='min',
        prefix=''
    )

    trainer = Trainer(experiment=exp,
                      distributed_backend='dp',
                      max_nb_epochs=config.max_epoch,
                      checkpoint_callback=checkpoint_callback,
                      gpus=config.gpus,
                      nb_sanity_val_steps=20,
                      val_check_interval=0.25,
                      row_log_interval=1000,
                      accumulate_grad_batches=config.accumulate_grad_batches
                      )

    trainer.fit(model)
Example #7
def _init_experiment(self):
    self.experiment = Experiment(save_dir=None,
                                 name='default',
                                 debug=False,
                                 version=None,
                                 autosave=False,
                                 description=None)
Example #8
def main():

    # input_scan_dim=28 is row-by-row sequential MNIST
    # input_scan_dim=1 to make it pixel-by-pixel
    input_scan_dim = 28
    hidden_dim = 128
    output_dim = 10
    learning_rate = 0.0005
    batch_size = 32
    gradient_clip = 2.0
    is_permuted = False
    max_epochs = 100
    percent_validation = 0.2

    gpus = None
    if torch.cuda.is_available():
        gpus = [0]

    model = LstmModel(input_scan_dim, hidden_dim, output_dim)
    lightning_module = SeqMNIST(model, learning_rate, batch_size, is_permuted,
                                percent_validation)
    exp = Experiment(save_dir='experiments')
    trainer = Trainer(experiment=exp,
                      track_grad_norm=-1,
                      print_nan_grads=False,
                      gradient_clip=gradient_clip,
                      gpus=gpus,
                      max_nb_epochs=max_epochs)
    trainer.fit(lightning_module)
Example #9
def main():
    config = Config()
    model = SegmentationModel(config)

    exp = Experiment(config.train_out_dir)

    with open(os.path.join(exp.log_dir, '../config.json'), 'w') as f:
        config_dict = {
            k: getattr(Config, k)
            for k in dir(Config) if not k.startswith('__')
        }
        json.dump(config_dict, f, indent=2)

    checkpoint_callback = ModelCheckpoint(filepath=os.path.join(
        config.train_out_dir, "models"),
                                          save_best_only=True,
                                          verbose=True,
                                          monitor='val_iou_any',
                                          mode='max',
                                          prefix='')

    trainer = Trainer(experiment=exp,
                      distributed_backend='dp',
                      max_nb_epochs=config.max_epoch,
                      checkpoint_callback=checkpoint_callback,
                      gpus=config.gpus,
                      nb_sanity_val_steps=10,
                      val_check_interval=1,
                      row_log_interval=10)

    trainer.fit(model)
Example #10
def get_exp(debug=True, version=None):
    # set up exp object without actually saving logs
    root_dir = os.path.dirname(os.path.realpath(__file__))
    exp = Experiment(debug=debug,
                     save_dir=root_dir,
                     name='tests_tt_dir',
                     version=version)
    return exp
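Note on the debug flag used above: a test_tube Experiment created with debug=True skips writing logs to disk (which is what the "without actually saving logs" comment refers to), so the helper can be exercised in tests without side effects:

exp = get_exp(debug=True)
exp.log({'test_metric': 1.0})  # nothing is written in debug mode
exp.save()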
Example #11
def main(hparams):
    # save tensorboard logs
    exp = Experiment(save_dir=os.getcwd())

    # init model
    model = GAN(hparams)

    # fit trainer on CPU
    trainer = pl.Trainer(experiment=exp, max_nb_epochs=200)
    trainer.fit(model)
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', choices=['srcnn', 'srgan'], required=True)
    parser.add_argument('--scale_factor', type=int, default=4)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--patch_size', type=int, default=96)
    parser.add_argument('--gpus', type=str, default='0')
    opt = parser.parse_args()

    # load model class
    if opt.model == 'srcnn':
        Model = models.SRCNNModel
    elif opt.model == 'srgan':
        Model = models.SRGANModel

    # add model specific arguments to original parser
    parser = Model.add_model_specific_args(parser)
    opt = parser.parse_args()

    # instantiate experiment
    exp = Experiment(save_dir=f'./logs/{opt.model}')
    exp.argparse(opt)

    model = Model(opt)

    # define callbacks
    checkpoint_callback = ModelCheckpoint(
        filepath=exp.get_media_path(exp.name, exp.version),
    )

    # instantiate trainer
    trainer = Trainer(
        experiment=exp,
        max_nb_epochs=4000,
        add_log_row_interval=50,
        check_val_every_n_epoch=10,
        checkpoint_callback=checkpoint_callback,
        gpus=[int(i) for i in opt.gpus.split(',')]
    )

    # start training!
    trainer.fit(model)
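The two-stage parsing above assumes each model class exposes an add_model_specific_args hook. A minimal illustrative sketch of such a hook follows; the class body and option names are hypothetical, not taken from the srcnn/srgan project:

import argparse

class SRCNNModel:  # illustrative stub only
    @staticmethod
    def add_model_specific_args(parent_parser):
        # inherit the shared arguments, then add model-specific ones
        parser = argparse.ArgumentParser(parents=[parent_parser], add_help=False)
        parser.add_argument('--lr', type=float, default=1e-4)
        parser.add_argument('--num_blocks', type=int, default=8)
        return parser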
Example #13
def experiment(self):
    if self._experiment is not None:
        return self._experiment
    self._experiment = Experiment(
        save_dir=self.save_dir,
        name=self.name,
        debug=self.debug,
        version=self.version,
        create_git_tag=self.create_git_tag,
        rank=self.rank,
    )
    return self._experiment
Example #14
def main_trainer(hparams):
    print_params(hparams)

    exp = Experiment(name=hparams.tt_name,
                     debug=hparams.debug,
                     autosave=False,
                     description=hparams.tt_description,
                     save_dir=hparams.tt_save_path)

    exp.add_argparse_meta(hparams)

    # fit model
    val_scores = []
    best_score = 0
    for trial_nb in range(hparams.nb_trials):
        data = dataset_loader.IndividualSequencesData(
            hparams.data_path, y_labels=hparams.y_labels.split(','))
        X, Y, lengths = flatten_data(data.train_x_y)

        # fit
        model = hmm.GaussianHMM(n_components=hparams.nb_components,
                                n_iter=hparams.nb_hmm_iters)
        model.fit(X, lengths)

        val_X, val_Y, lengths = flatten_data(data.val_x_y)
        Y_hat = model.predict(val_X, lengths)
        val_score = np.equal(Y_hat, val_Y).sum() / float(len(Y_hat))

        # save model
        if val_score > best_score:
            best_score = val_score
            save_model(model, hparams, exp, trial_nb)

        val_scores.append(val_score)

        exp.add_metric_row({'val_acc': val_score, 'trial_nb': trial_nb})

    mean_val_acc = np.mean(val_scores)
    exp.add_metric_row({'final_val_acc': mean_val_acc})
    exp.save()
Example #15
def search_train(args, *extra_args):
    exp = Experiment(
        # Location to save the metrics.
        save_dir=args.ckptdir)
    exp.argparse(args)
    train(args, exp)
    exp.save()
Example #16
def main(hparams):
    """
    Main training routine specific for this project
    :param hparams:
    :return:
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------

    # init experiment
    exp = Experiment(
        name=hparams.experiment_name,
        save_dir=hparams.test_tube_save_path,
        autosave=False,
        description='test demo'
    )

    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 DEFINE CALLBACKS
    # ------------------------
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    early_stop = EarlyStopping(
        monitor='val_acc',
        patience=3,
        verbose=True,
        mode='max'
    )

    checkpoint = ModelCheckpoint(
        filepath=model_save_path,
        save_best_only=True,
        verbose=True,
        monitor='val_loss',
        mode='min'
    )

    # ------------------------
    # 4 INIT TRAINER
    # ------------------------
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
    )

    # ------------------------
    # 5 START TRAINING
    # ------------------------
    trainer.fit(model)
Example #17
File: train.py  Project: agatan/vampire
def main():
    parent_parser = HyperOptArgumentParser(strategy="grid_search", add_help=False)
    logdir = "logs"
    parent_parser.add_argument(
        "--test_tube_save_path", default=os.path.join(logdir, "test_tube_data")
    )
    parent_parser.add_argument(
        "--model_save_path", default=os.path.join(logdir, "model_weights")
    )
    parent_parser.add_argument(
        "--experiment_name", default=os.path.join(logdir, "vampire")
    )
    parser = VAMPIRE.add_model_specific_args(parent_parser, ".")
    hparams = parser.parse_args()

    model = VAMPIRE(hparams)

    exp = Experiment(
        name=hparams.experiment_name,
        save_dir=hparams.test_tube_save_path,
        autosave=False,
    )
    exp.argparse(hparams)
    exp.save()

    trainer = Trainer(experiment=exp, fast_dev_run=False)
    trainer.fit(model)
Example #18
def main(hparams, cluster, results_dict):
    """
    Main training routine specific for this project
    :param hparams:
    :return:
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # when using grid search, it's possible for all models to start at once
    # and use the same test tube experiment version
    relative_node_id = int(os.environ['SLURM_NODEID'])
    sleep(relative_node_id + 1)

    # init experiment
    exp = Experiment(name=hparams.experiment_name,
                     save_dir=hparams.test_tube_save_path,
                     autosave=False,
                     description='test demo')

    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 DEFINE CALLBACKS
    # ------------------------
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name,
                                        exp.version)
    early_stop = EarlyStopping(monitor='val_acc',
                               patience=3,
                               verbose=True,
                               mode='max')

    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 save_best_only=True,
                                 verbose=True,
                                 monitor='val_loss',
                                 mode='min')

    # ------------------------
    # 4 INIT TRAINER
    # ------------------------
    trainer = Trainer(experiment=exp,
                      cluster=cluster,
                      checkpoint_callback=checkpoint,
                      early_stop_callback=early_stop,
                      gpus=hparams.gpus,
                      nb_gpu_nodes=hparams.nb_gpu_nodes)

    # ------------------------
    # 5 START TRAINING
    # ------------------------
    trainer.fit(model)
Example #19
def main(hparams):
    """
    Main training routine specific for this project
    :param hparams:
    :return:
    """
    # init experiment
    exp = Experiment(
        name=hparams.tt_name,
        debug=hparams.debug,
        save_dir=hparams.tt_save_path,
        version=hparams.hpc_exp_number,
        autosave=False,
        description=hparams.tt_description
    )

    exp.argparse(hparams)
    exp.save()

    # build model
    model = LightningTemplateModel(hparams)

    # configure trainer
    trainer = Trainer(experiment=exp)

    # train model
    trainer.fit(model)
Example #20
def main(hparams):
    """
    Main training routine specific for this project
    :param hparams:
    :return:
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    model = LightningTemplateModel(hparams)

    # ------------------------
    # 2 INIT EXP
    # ------------------------
    # init experiment
    exp = Experiment(name=hparams.experiment_name,
                     save_dir=hparams.test_tube_save_path,
                     autosave=False,
                     description='test demo')

    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
    trainer = Trainer(experiment=exp)

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)
Example #21
def main(hparams):
    """
    Main training routine specific for this project
    :param hparams:
    :return:
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------

    # init experiment
    exp = Experiment(name=hparams.experiment_name,
                     save_dir=hparams.test_tube_save_path,
                     autosave=False,
                     description='test demo')

    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
    trainer = Trainer(experiment=exp, gpus=hparams.gpus, use_amp=True)

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)
Example #22
def create_tt_experiment(hparams):
    """Create test-tube experiment for logging training and storing models.

    Parameters
    ----------
    hparams : :obj:`dict`
        dictionary of hyperparameters defining the experiment; it will be saved as a csv file

    Returns
    -------
    :obj:`tuple`
        - if experiment defined by hparams already exists, returns :obj:`(None, None, None)`
        - if experiment does not exist, returns :obj:`(hparams, sess_ids, exp)`

    """
    from test_tube import Experiment

    # get session_dir
    hparams['session_dir'], sess_ids = get_session_dir(
        hparams, session_source=hparams.get('all_source', 'save'))
    if not os.path.isdir(hparams['session_dir']):
        os.makedirs(hparams['session_dir'])
        export_session_info_to_csv(hparams['session_dir'], sess_ids)
    hparams['expt_dir'] = get_expt_dir(hparams)
    if not os.path.isdir(hparams['expt_dir']):
        os.makedirs(hparams['expt_dir'])

    # check to see if experiment already exists
    if experiment_exists(hparams):
        return None, None, None

    exp = Experiment(
        name=hparams['experiment_name'],
        debug=False,
        save_dir=os.path.dirname(hparams['expt_dir']))
    exp.save()
    hparams['version'] = exp.version

    return hparams, sess_ids, exp
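A hypothetical call site for the helper above; get_session_dir, get_expt_dir and experiment_exists come from the same codebase, and the body only illustrates the (None, None, None) contract documented in the docstring:

def fit(hparams):
    hparams, sess_ids, exp = create_tt_experiment(hparams)
    if hparams is None:
        return  # this experiment version already exists; skip it
    exp.log({'version': hparams['version']})
    exp.save()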
Example #23
def main_trainer(hparams):
    print_params(hparams)

    exp = Experiment(name=hparams.tt_name,
                     debug=hparams.debug,
                     autosave=False,
                     description=hparams.tt_description,
                     save_dir=hparams.tt_save_path)

    exp.add_argparse_meta(hparams)

    # init data loader

    # fit model
    val_scores, train_scores = [], []
    best_score = 0
    for trial_nb in range(hparams.nb_trials):
        data = SequentialReadingsData(window_size=hparams.time_steps, data_path=hparams.data_path, flatten_x=True)

        clf = RandomForestClassifier(n_estimators=hparams.nb_estimators)
        clf.fit(data.train_x, data.train_y)

        train_score = clf.score(data.train_x, data.train_y)
        val_score = clf.score(data.val_x, data.val_y)

        # save model when we have a better one
        if val_score > best_score:
            best_score = val_score
            save_model(clf, hparams, exp, trial_nb)

        train_scores.append(train_score)
        val_scores.append(val_score)

        exp.add_metric_row({'val_acc': val_score, 'train_acc': train_score, 'trial_nb': trial_nb})

    mean_val_acc = np.mean(val_scores)
    mean_train_acc = np.mean(train_scores)
    exp.add_metric_row({'final_val_acc': mean_val_acc, 'final_train_acc': mean_train_acc})
    exp.save()
Example #24
def main(hparams, cluster=None, results_dict=None):
    """
    Main training routine specific for this project
    :param hparams:
    :return:
    """
    # init experiment
    log_dir = os.path.dirname(os.path.realpath(__file__))
    exp = Experiment(
        name='test_tube_exp',
        debug=True,
        save_dir=log_dir,
        version=0,
        autosave=False,
        description='test demo'
    )

    hparams.training_set_path = '/Volumes/Elements/Datasets/Immersions/house_data_mp3/training'
    hparams.validation_set_path = '/Volumes/Elements/Datasets/Immersions/house_data_mp3/validation'
    hparams.test_task_set_path = '/Volumes/Elements/Datasets/Immersions/house_data_mp3/test_task'
    hparams.batch_size = 4

    # set the hparams for the experiment
    exp.argparse(hparams)
    exp.save()

    # build model
    model = ContrastivePredictiveSystem(hparams)

    # callbacks
    early_stop = EarlyStopping(
        monitor=hparams.early_stop_metric,
        patience=hparams.early_stop_patience,
        verbose=True,
        mode=hparams.early_stop_mode
    )

    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    checkpoint = ModelCheckpoint(
        filepath=model_save_path,
        save_best_only=True,
        verbose=True,
        monitor=hparams.model_save_monitor_value,
        mode=hparams.model_save_monitor_mode
    )

    # configure trainer
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
        # distributed_backend='dp',
        #gpus=[0],
        nb_sanity_val_steps=2
    )

    # train model
    trainer.fit(model)
Example #25
def teacher_forcing_training():
    """Builds everything needed for the training, and launch the training."""
    xp_info, xp_params = load_params()
    xp_name, debug = xp_info
    xp_path = join(os.environ["XP_PATH"], xp_name)
    hparams, optimizer_params, training_params = xp_params

    trainer_params = training_params["trainer"]
    # device, pin_memory = set_device["trainer"]

    loaders_params = training_params["loaders"]
    loaders_params["collate_fn"] = collate_by_padding
    loaders_params["pin_memory"] = trainer_params["gpus"] is not None

    special_tokens = {
        "<pad>": 0,
        "<unk>": 1,
        "<start>": 2,
        "<end>": 3
    }
    token_indexer, embeddings = get_embedders(
        vocab_size=hparams["vocab_size"],
        embedding_dim=hparams["embedding_dim"],
        special_tokens=special_tokens
    )
    hparams["vocab_size"] += len(special_tokens)
    dca_summarizer = build_multi_agt_summarizer(**hparams,
                                                embeddings=embeddings)

    summarizer_module = SummarizerModule(
        summarizer=dca_summarizer,
        token_indexer=token_indexer,
        loaders_params=loaders_params,
        optimizer_params=optimizer_params
    )

    trainer_params["experiment"] = Experiment(save_dir=xp_path, name=xp_name,
                                              debug=debug)

    trainer_params["checkpoint_callback"] = ModelCheckpoint(
        filepath=join(xp_path, "checkpoint"),
        save_best_only=True,
        verbose=True,
        monitor='avg_val_loss',
        mode='min'
    )

    trainer = Trainer(**trainer_params)
    print(f"tensorboard --logdir {xp_path}", end="\n\n")
    trainer.fit(summarizer_module)
Example #26
    def init_experiments(self):
        if (self._experiments[Mode.Training] is not None
                and self._experiments[Mode.Validation] is not None):
            return

        self._experiments[Mode.Training] = Experiment(
            save_dir=self.save_dir,
            name="train",
            debug=self.debug,
            version=self.version,
            description=self.description,
            create_git_tag=self.create_git_tag,
            rank=self.rank,
        )
        self._experiments[Mode.Validation] = Experiment(
            save_dir=self.save_dir,
            name="valid",
            debug=self.debug,
            version=self.version,
            description=self.description,
            create_git_tag=self.create_git_tag,
            rank=self.rank,
        )
Example #27
def main(hparams):
    """
    Main training routine specific for this project
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = DSANet(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------

    # init experiment
    exp = Experiment(
        name='dsanet_exp_{}_window={}_horizon={}'.format(hparams.data_name, hparams.window, hparams.horizon),
        save_dir=hparams.test_tube_save_path,
        autosave=False,
        description='test demo'
    )

    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 DEFINE CALLBACKS
    # ------------------------
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=5,
        verbose=True,
        mode='min'
    )

    # ------------------------
    # 4 INIT TRAINER
    # ------------------------
    trainer = Trainer(
        gpus=[0],
        # auto_scale_batch_size=True,
        max_epochs=10,
        # num_processes=2,
        # num_nodes=2
    )

    # ------------------------
    # 5 START TRAINING
    # ------------------------
    trainer.fit(model)

    print('View tensorboard logs by running\ntensorboard --logdir %s' % os.getcwd())
    print('and going to http://localhost:6006 on your browser')
Example #28
def main():
    model = CoolSystem()

    # PyTorch summarywriter with a few bells and whistles
    exp = Experiment(save_dir='../output/tmp')
    print(f"exp.save_dir: {exp.save_dir}")
    exp.save()
    print(f"saved !!!")

    # train on cpu using only 10% of the data (for demo purposes)
    # pass in experiment for automatic tensorboard logging.
    trainer = Trainer(experiment=exp, max_nb_epochs=1, train_percent_check=0.1)

    # train on 4 gpus (lightning chooses GPUs for you)
    # trainer = Trainer(experiment=exp, max_nb_epochs=1, gpus=4)

    # train on 4 gpus (you choose GPUs)
    # trainer = Trainer(experiment=exp, max_nb_epochs=1, gpus=[0, 1, 3, 7])

    # train on 32 gpus across 4 nodes (make sure to submit appropriate SLURM job)
    # trainer = Trainer(experiment=exp, max_nb_epochs=1, gpus=8, nb_gpu_nodes=4)

    # train (1 epoch only here for demo)
    trainer.fit(model)
Example #29
def main_process_entrypoint(gpu_nb):
    world = 2
    torch.distributed.init_process_group("nccl", rank=gpu_nb, world_size=world)

    torch.cuda.set_device(gpu_nb)

    model = ConvNet()
    model.cuda(gpu_nb)
    model = torch.nn.parallel.distributed.DistributedDataParallel(
        model, device_ids=[gpu_nb])

    exp = Experiment(save_dir='./logs_cifar10_{}'.format(gpu_nb))
    trainer = Trainer(experiment=exp, gpus=[gpu_nb], max_nb_epochs=20)
    # trainer = Trainer(gpus=[0, 1], max_nb_epochs=20)
    # trainer = Trainer(experiment=exp, gpus=[0], max_nb_epochs=20)
    trainer.fit(model)
Example #30
def train(hparams):
    # init exp and track all the parameters from the HyperOptArgumentParser
    exp = Experiment(
        name=hparams.test_tube_exp_name,
        save_dir=hparams.log_path,
        autosave=False,
    )
    exp.argparse(hparams)

    # pretend to train
    x = torch.rand((1, hparams.x_val))
    for train_step in range(0, 100):
        y = torch.rand((hparams.x_val, 1))
        out = x.mm(y)
        exp.log({'fake_err': out.item()})

    # save exp when we're done
    exp.save()