Example #1
def test_model_saving_loading():
    """
    Tests use case where trainer saves the model, and user loads it from tags independently
    :return:
    """
    hparams = get_hparams()
    model = LightningTestModel(hparams)

    save_dir = init_save_dir()

    # exp file to get meta
    exp = get_exp(False)
    exp.argparse(hparams)
    exp.save()

    trainer_options = dict(
        max_nb_epochs=1,
        cluster=SlurmCluster(),
        experiment=exp,
        checkpoint_callback=ModelCheckpoint(save_dir)
    )

    # fit model
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)

    # training complete
    assert result == 1, 'training failed to complete'

    # make a prediction
    for batch in model.test_dataloader:
        break

    x, y = batch
    x = x.view(x.size(0), -1)

    # generate preds before saving model
    model.eval()
    pred_before_saving = model(x)

    # save model
    new_weights_path = os.path.join(save_dir, 'save_test.ckpt')
    trainer.save_checkpoint(new_weights_path)

    # load new model
    tags_path = exp.get_data_path(exp.name, exp.version)
    tags_path = os.path.join(tags_path, 'meta_tags.csv')
    model_2 = LightningTestModel.load_from_metrics(weights_path=new_weights_path,
                                                   tags_csv=tags_path, on_gpu=False)
    model_2.eval()

    # make prediction
    # assert that both predictions are the same
    new_pred = model_2(x)
    assert torch.all(torch.eq(pred_before_saving, new_pred)).item() == 1

    clear_save_dir()
Example #2
def test_no_val_end_module():
    """
    Tests use case where trainer saves the model, and user loads it from tags independently
    :return:
    """

    class CurrentTestModel(LightningValidationStepMixin, LightningTestModelBase):
        pass
    hparams = get_hparams()
    model = CurrentTestModel(hparams)

    save_dir = init_save_dir()

    # exp file to get meta
    exp = get_exp(False)
    exp.argparse(hparams)
    exp.save()

    trainer_options = dict(
        max_nb_epochs=1,
        cluster=SlurmCluster(),
        experiment=exp,
        checkpoint_callback=ModelCheckpoint(save_dir)
    )

    # fit model
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)

    # training complete
    assert result == 1, 'training failed to complete'

    # save model
    new_weights_path = os.path.join(save_dir, 'save_test.ckpt')
    trainer.save_checkpoint(new_weights_path)

    # load new model
    tags_path = exp.get_data_path(exp.name, exp.version)
    tags_path = os.path.join(tags_path, 'meta_tags.csv')
    model_2 = LightningTestModel.load_from_metrics(weights_path=new_weights_path,
                                                   tags_csv=tags_path, on_gpu=False)
    model_2.eval()

    clear_save_dir()
Example #3
def optimize_on_cluster(hyperparams):
    # enable cluster training
    # log all scripts to the test tube folder
    cluster = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.slurm_log_path,
    )

    # email for cluster coms
    cluster.notify_job_status(email='add_email_here',
                              on_done=True,
                              on_fail=True)

    # configure cluster
    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    cluster.per_experiment_nb_nodes = hyperparams.nb_gpu_nodes
    cluster.job_time = '2:00:00'
    cluster.gpu_type = 'volta'
    cluster.memory_mb_per_node = 0

    # any modules for code to run in env
    cluster.add_command('source activate lightning')

    # run only on 32GB voltas
    cluster.add_slurm_cmd(cmd='constraint',
                          value='volta32gb',
                          comment='use 32gb gpus')
    cluster.add_slurm_cmd(cmd='partition',
                          value=hyperparams.gpu_partition,
                          comment='cluster partition')

    # run hopt
    # creates and submits jobs to slurm
    cluster.optimize_parallel_cluster_gpu(main,
                                          nb_trials=hyperparams.nb_hopt_trials,
                                          job_name=hyperparams.experiment_name)
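
# For reference, the 'main' passed above is not shown in this example; test_tube
# calls it once per SLURM job with the sampled hyperparameters and the cluster
# object (compare the two-argument lambda in Example #11). A minimal sketch
# (MyModel and the Trainer arguments below are assumptions, not part of the example):
def main(hparams, cluster):
    model = MyModel(hparams)
    trainer = Trainer(gpus=hparams.per_experiment_nb_gpus,
                      nb_gpu_nodes=hparams.nb_gpu_nodes,
                      max_nb_epochs=1)
    trainer.fit(model)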
Example #4
def optimize_on_cluster(hyperparams):
    # enable cluster training
    cluster = SlurmCluster(hyperparam_optimizer=hyperparams,
                           log_path=hyperparams.tt_save_path,
                           test_tube_exp_name=hyperparams.tt_name)

    # email for cluster coms
    cluster.notify_job_status(email='add_email_here',
                              on_done=True,
                              on_fail=True)

    # configure cluster
    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    cluster.job_time = '48:00:00'
    cluster.gpu_type = '1080ti'
    cluster.memory_mb_per_node = 48000

    # any modules for code to run in env
    cluster.add_command('source activate pytorch_lightning')

    # name of exp
    job_display_name = hyperparams.tt_name.split('_')[0]
    job_display_name = job_display_name[0:3]

    # run hopt
    print('submitting jobs...')
    cluster.optimize_parallel_cluster_gpu(main,
                                          nb_trials=hyperparams.nb_hopt_trials,
                                          job_name=job_display_name)
Example #5
def test_cpu_slurm_save_load():
    """
    Verify model save/load/checkpoint on CPU
    :return:
    """
    hparams = get_hparams()
    model = LightningTestModel(hparams)

    save_dir = init_save_dir()

    # exp file to get meta
    exp = get_exp(False)
    exp.argparse(hparams)
    exp.save()

    cluster_a = SlurmCluster()
    trainer_options = dict(
        max_nb_epochs=1,
        cluster=cluster_a,
        experiment=exp,
        checkpoint_callback=ModelCheckpoint(save_dir)
    )

    # fit model
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    real_global_step = trainer.global_step

    # training complete
    assert result == 1, 'training failed to complete'

    # predict with trained model before saving
    # make a prediction
    for batch in model.test_dataloader:
        break

    x, y = batch
    x = x.view(x.size(0), -1)

    model.eval()
    pred_before_saving = model(x)

    # test registering a save function
    trainer.enable_auto_hpc_walltime_manager()

    # test HPC saving
    # simulate snapshot on slurm
    saved_filepath = trainer.hpc_save(save_dir, exp)
    assert os.path.exists(saved_filepath)

    # wipe-out trainer and model
    # retrain with not much data... this simulates picking training back up after slurm
    # we want to see if the weights come back correctly
    continue_tng_hparams = get_hparams(continue_training=True,
                                       hpc_exp_number=cluster_a.hpc_exp_number)
    trainer_options = dict(
        max_nb_epochs=1,
        cluster=SlurmCluster(continue_tng_hparams),
        experiment=exp,
        checkpoint_callback=ModelCheckpoint(save_dir),
    )
    trainer = Trainer(**trainer_options)
    model = LightningTestModel(hparams)

    # set the epoch start hook so we can predict before the model does the full training
    def assert_pred_same():
        assert trainer.global_step == real_global_step and trainer.global_step > 0

        # predict with loaded model to make sure answers are the same
        trainer.model.eval()
        new_pred = trainer.model(x)
        assert torch.all(torch.eq(pred_before_saving, new_pred)).item() == 1

    model.on_epoch_start = assert_pred_same

    # by calling fit again, we trigger training, loading weights from the cluster
    # and our hook to predict using current model before any more weight updates
    trainer.fit(model)

    clear_save_dir()
Example #6
    parser.add_argument('--test_tube_exp_name', default='my_test')
    parser.add_argument('--log_path', default='~/logs')
    parser.opt_list('--y_val',
                    default=12,
                    options=[1, 2, 3, 4, 5, 6],
                    tunable=True)
    parser.opt_list('--x_val',
                    default=12,
                    options=[20, 12, 30, 45],
                    tunable=True)
    hyperparams = parser.parse_args()

    # Enable cluster training.
    cluster = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.log_path,
        python_cmd='python3',
        # test_tube_exp_name=hyperparams.test_tube_exp_name
    )

    # Email results if your hpc supports it.
    cluster.notify_job_status(email='*****@*****.**',
                              on_done=True,
                              on_fail=True)

    # SLURM Module to load.
    cluster.load_modules(['python-3', 'anaconda3'])

    # Add commands to the non-SLURM portion.
    cluster.add_command('source activate transformers')

    # Add custom SLURM commands which show up as:
Example #7
def optimize_on_cluster(hyperparams):
    # enable cluster training
    # log all scripts to the test tube folder
    cluster = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.slurm_log_path,
    )

    # email for cluster coms
    cluster.notify_job_status(email=hyperparams.email, on_done=True, on_fail=True)

    # configure cluster
    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    cluster.per_experiment_nb_nodes = hyperparams.nb_gpu_nodes
    cluster.job_time = '2:00:00'
    cluster.gpu_type = 'volta'
    cluster.memory_mb_per_node = 0

    # any modules for code to run in env
    cluster.add_command(f'source activate {hyperparams.conda_env}')

    # set DDP master port (PORT is assumed to be defined elsewhere in the script)
    cluster.add_command(f'export MASTER_PORT={PORT}')

    # OPTIONAL for debugging
    # without these flags, errors in your code will
    # appear to be NCCL errors
    cluster.add_command('export NCCL_DEBUG=INFO')
    cluster.add_command('export PYTHONFAULTHANDLER=1')

    # depending on your cluster config, you probably want
    # to limit which network interfaces NCCL uses
    # cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')

    # depending on your cluster, you might need to load
    # the latest NCCL version
    # cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0'])

    # run only on 32GB voltas
    cluster.add_slurm_cmd(cmd='constraint', value='volta32gb',
                          comment='use 32gb gpus')
    cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition,
                          comment='cluster partition')

    # run hopt
    # creates and submits jobs to slurm
    cluster.optimize_parallel_cluster_gpu(
        main,
        nb_trials=hyperparams.num_hyperparam_trials,
        job_name=hyperparams.experiment_name
    )
Example #8
def optimize_on_cluster(hyperparams):
    # enable cluster training
    # log all scripts to the test tube folder
    cluster = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.slurm_log_path,
    )

    # email for cluster coms
    # cluster.notify_job_status(email='add_email_here', on_done=True, on_fail=True)

    # configure cluster
    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    cluster.per_experiment_nb_nodes = hyperparams.nb_gpu_nodes
    cluster.per_experiment_nb_cpus = hyperparams.per_experiment_nb_cpus
    cluster.job_time = hyperparams.job_time
    cluster.gpu_type = hyperparams.gpu_type
    cluster.memory_mb_per_node = 0

    # any modules for code to run in env
    cluster.add_command("source activate dialog")
    cluster.add_command(
        "export PYTHONPATH=$PYTHONPATH:/private/home/koustuvs/mlp/latentDialogAnalysis"
    )

    # run only on 32GB voltas
    # cluster.add_slurm_cmd(cmd='constraint', value='volta32gb',
    #                     comment='use 32gb gpus')
    cluster.add_slurm_cmd(cmd="partition",
                          value=hyperparams.gpu_partition,
                          comment="use 32gb gpus")

    # run hopt
    # creates and submits jobs to slurm
    cluster.optimize_parallel_cluster_gpu(
        main,
        nb_trials=hyperparams.nb_hopt_trials,
        job_name=hyperparams.id + "_grid_search",
        job_display_name=hyperparams.id,
    )

Example #9
# set up our argparser and make the y_val tunable
parser = HyperOptArgumentParser(strategy='random_search')
parser.add_argument('--test_tube_exp_name', default='my_test')
parser.add_argument('--log_path', default='/some/path/to/log')
parser.opt_list('--y_val',
                default=12,
                options=[1, 2, 3, 4, 5, 6],
                tunable=True)
parser.opt_list('--x_val', default=12, options=[20, 12, 30, 45], tunable=True)
hyperparams = parser.parse_args()

# enable cluster training
cluster = SlurmCluster(hyperparam_optimizer=hyperparams,
                       log_path=hyperparams.log_path,
                       python_cmd='python3',
                       test_tube_exp_name=hyperparams.test_tube_exp_name)

# email results if your hpc supports it
cluster.notify_job_status(email='*****@*****.**', on_done=True, on_fail=True)

# any modules for code to run in env
cluster.load_modules(['python-3', 'anaconda3'])
cluster.add_command('source activate myCondaEnv')

# set job compute details (this will apply PER set of hyperparameters)
cluster.per_experiment_nb_gpus = 4
cluster.per_experiment_nb_nodes = 2
cluster.gpu_type = '1080ti'

# each job (24 in total here) will use 8 gpus (4 gpus per node x 2 nodes) for its set of hyperparams
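# The listing stops here, before the submission step; a minimal, assumed completion
# ('train' is a placeholder for the user's training function, which test_tube calls
# with the sampled hyperparameters and the cluster):
cluster.optimize_parallel_cluster_gpu(train,
                                      nb_trials=24,
                                      job_name=hyperparams.test_tube_exp_name)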
Example #10
def run_on_cluster(hyperparams):
    # enable cluster training
    cluster = SlurmCluster(hyperparam_optimizer=hyperparams,
                           log_path=hyperparams.logs_save_path)

    # email results if your hpc supports it
    cluster.notify_job_status(email='*****@*****.**',
                              on_done=True,
                              on_fail=True)
    # any modules for code to run in env
    cluster.add_command(f'source activate {hyperparams.conda_env}')
    # pick the gpu resources
    cluster.per_experiment_nb_gpus = hyperparams.gpus
    cluster.per_experiment_nb_cpus = 1
    cluster.per_experiment_nb_nodes = 1
    #cluster.gpu_type = 'k80'
    cluster.job_time = '48:00:00'
    cluster.minutes_to_checkpoint_before_walltime = 5
    cluster.memory_mb_per_node = 250000  #180000
    # come up with a short exp name
    job_display_name = hyperparams.tt_name.split('_')[0]
    job_display_name = job_display_name[0:4]
    # optimize across all gpus
    print('submitting jobs...')
    cluster.optimize_parallel_cluster_gpu(main,
                                          nb_trials=hyperparams.nb_hopt_trials,
                                          job_name=job_display_name)
Example #11
def run_cluster(parser, fn_main, lt_system):
    params = parser.parse_args()

    if params.system_mode == "3d" and "3d" not in params.model_name:
        params.model_name += "_3d"

    if ':' not in params.time:
        params.time = f"{int(params.time):02d}:00:00"

    arch = platform.uname().processor

    loaded_module = ''
    partition = params.partition
    # if partition is None:
    if arch == 'x86_64':
        partition = 'npl'
    elif arch == 'ppc64le':
        partition = 'dcs,rpi'

    if partition == 'npl':
        loaded_module = "module load gcc cuda openmpi"
    else:
        loaded_module = "module load spectrum-mpi"

    log_path = os.path.join(os.environ['HOME'], params.slurm_log_root)

    cluster = SlurmCluster(hyperparam_optimizer=params,
                           log_path=log_path,
                           python_cmd="python")

    # cluster.notify_job_status(email='',
    #                           on_fail=True,
    #                           on_done=False)
    # configure cluster
    cluster.per_experiment_nb_gpus = params.n_gpus
    cluster.per_experiment_nb_nodes = params.num_nodes
    cluster.per_experiment_nb_cpus = 0  # disable this option
    cluster.job_time = params.time
    cluster.minutes_to_checkpoint_before_walltime = 2  # checkpoint 2 min before walltime
    cluster.memory_mb_per_node = int(params.n_gpus) * int(
        params.cpus_per_task) * int(params.mem_per_cpu)

    if params.partition is not None:
        cluster.add_slurm_cmd('partition',
                              value=params.partition,
                              comment='cluster partition name')
    cluster.add_slurm_cmd('ntasks-per-node',
                          value=params.n_gpus,
                          comment='#task per node')
    cluster.add_slurm_cmd('cpus-per-task',
                          value=params.cpus_per_task,
                          comment='#cpu per task/gpu')
    cluster.add_slurm_cmd('mem-per-cpu',
                          value=params.mem_per_cpu,
                          comment="memory per cpu")

    # cluster.memory_mb_per_node = params.memory  # disable this option

    cluster.add_command('export PYTHONFAULTHANDLER=1')
    # cluster.add_command('export NCCL_DEBUG=INFO')
    cluster.add_command(loaded_module)

    # Master address for multi-node training
    cluster.add_command(
        "export SLURM_JOB_NODELIST=$(scontrol show hostnames $SLURM_JOB_NODELIST | tr '\\n' ' ')"
    )
    cluster.add_command("export SLURM_NODELIST=$SLURM_JOB_NODELIST")
    cluster.add_command(
        "slurm_nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST)")
    cluster.add_command(
        "export MASTER_ADDRESS=$(echo $slurm_nodes | cut -d' ' -f1)")

    if params.job_name is None:
        job_name = params.model_name
    else:
        job_name = params.job_name

    # Each hyperparameter combination gets its own SLURM job with the resources configured above.
    cluster.optimize_parallel_cluster_gpu(
        # Function to execute
        lambda par, _optimizer: fn_main(par, lt_system, _optimizer),
        # Number of hyperparameter combinations to search:
        nb_trials=params.nb_trials,
        enable_auto_resubmit=params.auto_resubmit,
        # This is what will display in the slurm queue:
        job_name=job_name)
Example #12
def optimize_on_cluster(hyperparams):
    '''
    This function creates the SLURM bash scripts that submit our task to the cluster.
    For a reference single-job script, see pl_submit.sh in this same folder.
    '''

    # enable cluster training
    # log all scripts to the test tube folder
    cluster = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.slurm_log_path,
    )

    cluster.add_slurm_cmd(cmd='partition', value='gpu2', comment='queue')

    cluster.add_slurm_cmd(cmd='ntasks-per-node', value='2', comment='Tasks per node')
    cluster.job_time = '0-17:00:00'

    # email for cluster coms
    cluster.add_slurm_cmd('mail-type', value='all', comment='Mail type')
    cluster.add_slurm_cmd('mail-user', value='*****@*****.**', comment='Mail account')

    # configure cluster
    cluster.per_experiment_nb_gpus = 2
    cluster.per_experiment_nb_nodes = 13

    cluster.memory_mb_per_node = 0

    # any modules for code to run in env
    cluster.add_command('module purge')
    cluster.add_command('module load python/3.8.2')
    cluster.add_command('module load nvidia/cuda/9.1')

    cluster.add_command('set')


    # run hopt
    # creates and submits jobs to slurm
    cluster.optimize_parallel_cluster_gpu(
        main,
        nb_trials=24,
        job_name='grid_test'
    )
Example #13
if __name__ == '__main__':
    # Set up our argparser and make the y_val tunable.
    parser = HyperOptArgumentParser(strategy='random_search')
    parser.add_argument('--test_tube_exp_name', default='tt_exp')
    parser.add_argument('--batch_size', default=80, type=int)
    parser.add_argument('--epochs', default=3, type=int)
    parser.add_argument('--gpus', default=2, type=int)
    parser.add_argument('--log_path', default=os.environ['SCRATCH'] + '/summer_school/hs_log')
    parser.opt_range('--lr', default=2e-4, type=float, tunable=True, low=1e-4, high=1e-3, nb_samples=100, log_base=10)
    parser.opt_range('--wd', default=1e-5, type=float, tunable=True, low=1e-7, high=1e-4, nb_samples=100, log_base=10)
    hyperparams = parser.parse_args()

    # Enable cluster training.
    cluster = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.log_path,
        python_cmd='python3',
    )

    # Email results if your hpc supports it.
#    cluster.notify_job_status(email='*****@*****.**', on_done=True, on_fail=True)

    # SLURM Module to load.
    cluster.load_modules([
        'daint-gpu',
    ])

    # Add commands to the non-SLURM portion.
    cluster.add_command('. /apps/daint/UES/6.0.UP04/sandboxes/sarafael/miniconda-ss2020/bin/activate')

    cluster.add_command('export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK')
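
    # The listing is cut off here; a possible, assumed ending that requests the GPUs
    # parsed above and submits the random-search trials ('train' is a hypothetical
    # entry point with the (hparams, cluster) signature test_tube expects, and the
    # trial count is arbitrary):
    cluster.per_experiment_nb_gpus = hyperparams.gpus
    cluster.optimize_parallel_cluster_gpu(train,
                                          nb_trials=20,
                                          job_name=hyperparams.test_tube_exp_name)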