# ===== Example #1 (score: 0) =====
def optimize_on_cluster(hyperparams):
    """Configure a SlurmCluster and submit one GPU job per hyperparameter trial."""
    # Enable cluster training; logs go to the test-tube save path.
    cluster = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.tt_save_path,
        test_tube_exp_name=hyperparams.tt_name,
    )

    # Email notifications for job completion / failure.
    cluster.notify_job_status(
        email='add_email_here',
        on_done=True,
        on_fail=True,
    )

    # Per-experiment resource requests.
    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    cluster.job_time = '48:00:00'
    cluster.gpu_type = '1080ti'
    cluster.memory_mb_per_node = 48000

    # Activate the environment inside the generated job script.
    cluster.add_command('source activate pytorch_lightning')

    # Short display name: first 3 chars of the first underscore-separated token.
    job_display_name = hyperparams.tt_name.split('_')[0][:3]

    # Create and submit the jobs.
    print('submitting jobs...')
    cluster.optimize_parallel_cluster_gpu(
        main,
        nb_trials=hyperparams.nb_hopt_trials,
        job_name=job_display_name,
    )
def optimize_on_cluster(hyperparams):
    """Set up a SlurmCluster and launch the hyperparameter search on GPUs."""
    # Enable cluster training; all generated scripts land under the slurm log path.
    cluster = SlurmCluster(hyperparam_optimizer=hyperparams,
                           log_path=hyperparams.slurm_log_path)

    # Email notifications for job completion / failure.
    cluster.notify_job_status(
        email=hyperparams.email,
        on_done=True,
        on_fail=True,
    )

    # Per-experiment resource requests.
    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    cluster.per_experiment_nb_nodes = hyperparams.nb_gpu_nodes
    cluster.job_time = '2:00:00'
    cluster.gpu_type = 'volta'
    cluster.memory_mb_per_node = 0

    # Activate the conda environment inside the generated job script.
    cluster.add_command(f'source activate {hyperparams.conda_env}')

    # Restrict scheduling to 32GB voltas on the requested partition.
    cluster.add_slurm_cmd(cmd='constraint',
                          value='volta32gb',
                          comment='use 32gb gpus')
    cluster.add_slurm_cmd(cmd='partition',
                          value=hyperparams.gpu_partition,
                          comment='use 32gb gpus')

    # Creates and submits one slurm job per trial.
    cluster.optimize_parallel_cluster_gpu(
        main,
        nb_trials=hyperparams.num_hyperparam_trials,
        job_name=hyperparams.experiment_name,
    )
# ===== Example #3 (score: 0) =====
def run_on_cluster(hyperparams):
    """Configure a SlurmCluster and submit GPU jobs for the hyperparameter search."""
    # Enable cluster training.
    cluster = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.logs_save_path,
    )

    # Email results if the HPC supports it (address redacted in the source).
    cluster.notify_job_status(
        email='*****@*****.**',
        on_done=True,
        on_fail=True,
    )

    # Activate the conda environment inside the generated job script.
    cluster.add_command(f'source activate {hyperparams.conda_env}')

    # Per-experiment resource requests.
    cluster.per_experiment_nb_gpus = hyperparams.gpus
    cluster.per_experiment_nb_cpus = 1
    cluster.per_experiment_nb_nodes = 1
    # cluster.gpu_type = 'k80'
    cluster.job_time = '48:00:00'
    cluster.minutes_to_checkpoint_before_walltime = 5
    cluster.memory_mb_per_node = 250000  # 180000

    # Short display name: first 4 chars of the first underscore-separated token.
    job_display_name = hyperparams.tt_name.split('_')[0][:4]

    # Submit one job per trial across all GPUs.
    print('submitting jobs...')
    cluster.optimize_parallel_cluster_gpu(
        main,
        nb_trials=hyperparams.nb_hopt_trials,
        job_name=job_display_name,
    )
def optimize_on_cluster(hyperparams):
    """Set up a SlurmCluster for DDP training and submit the search jobs."""
    # Enable cluster training; all generated scripts land under the slurm log path.
    cluster = SlurmCluster(hyperparam_optimizer=hyperparams,
                           log_path=hyperparams.slurm_log_path)

    # Email notifications for job completion / failure.
    cluster.notify_job_status(email=hyperparams.email,
                              on_done=True,
                              on_fail=True)

    # Per-experiment resource requests.
    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    cluster.per_experiment_nb_nodes = hyperparams.nb_gpu_nodes
    cluster.job_time = '2:00:00'
    cluster.gpu_type = 'volta'
    cluster.memory_mb_per_node = 0

    # Activate the conda environment inside the generated job script.
    cluster.add_command(f'source activate {hyperparams.conda_env}')

    # Export the DDP master port (PORT is defined elsewhere in the module).
    cluster.add_command(f'export MASTER_PORT={PORT}')

    # OPTIONAL debugging flags: without these, errors in user code
    # can surface as opaque NCCL errors.
    cluster.add_command('export NCCL_DEBUG=INFO')
    cluster.add_command('export PYTHONFAULTHANDLER=1')

    # Depending on the cluster config, you may want to limit
    # the wired connection device:
    # cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')

    # Depending on the cluster, you might need to load the latest NCCL:
    # cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0'])

    # Restrict scheduling to 32GB voltas on the requested partition.
    cluster.add_slurm_cmd(cmd='constraint',
                          value='volta32gb',
                          comment='use 32gb gpus')
    cluster.add_slurm_cmd(cmd='partition',
                          value=hyperparams.gpu_partition,
                          comment='use 32gb gpus')

    # Creates and submits one slurm job per trial.
    cluster.optimize_parallel_cluster_gpu(
        main,
        nb_trials=hyperparams.num_hyperparam_trials,
        job_name=hyperparams.experiment_name,
    )
# ===== Example #5 (score: 0) =====
def optimize_on_cluster(hyperparams):
    """Create the slurm bash scripts that send our task to the cluster.

    For a reference single script check pl_submit.sh, located in this same
    folder.

    Args:
        hyperparams: the test-tube hyperparameter optimizer whose trials
            will each be submitted as a separate slurm job.
    """
    # Enable cluster training; all generated scripts are logged under the
    # slurm log path.
    cluster = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.slurm_log_path,
    )

    # Queue / task layout.  (PEP 8: no spaces around '=' in keyword args.)
    cluster.add_slurm_cmd(cmd='partition', value='gpu2', comment='queue')
    cluster.add_slurm_cmd(cmd='ntasks-per-node', value='2',
                          comment='Tasks per node')
    cluster.job_time = '0-17:00:00'

    # Email notifications via native slurm directives (address redacted).
    cluster.add_slurm_cmd(cmd='mail-type', value='all', comment='Mail type')
    cluster.add_slurm_cmd(cmd='mail-user', value='*****@*****.**',
                          comment='Mail account')

    # Per-experiment resource requests.
    cluster.per_experiment_nb_gpus = 2
    cluster.per_experiment_nb_nodes = 13
    cluster.memory_mb_per_node = 0

    # Environment modules loaded inside the generated job script.
    cluster.add_command('module purge')
    cluster.add_command('module load python/3.8.2')
    cluster.add_command('module load nvidia/cuda/9.1')

    # Dump the environment for debugging the job's shell state.
    cluster.add_command('set')

    # Creates and submits one slurm job per trial.
    cluster.optimize_parallel_cluster_gpu(
        main,
        nb_trials=24,
        job_name='grid_test'
    )
# ===== Example #6 (score: 0) =====
def optimize_on_cluster(hyperparams):
    """Configure a SlurmCluster from hyperparams and submit the grid search."""
    # Enable cluster training; all generated scripts land under the slurm log path.
    cluster = SlurmCluster(hyperparam_optimizer=hyperparams,
                           log_path=hyperparams.slurm_log_path)

    # Email notifications (disabled):
    # cluster.notify_job_status(email='add_email_here', on_done=True, on_fail=True)

    # Per-experiment resource requests, all driven by hyperparams.
    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    cluster.per_experiment_nb_nodes = hyperparams.nb_gpu_nodes
    cluster.per_experiment_nb_cpus = hyperparams.per_experiment_nb_cpus
    cluster.job_time = hyperparams.job_time
    cluster.gpu_type = hyperparams.gpu_type
    cluster.memory_mb_per_node = 0

    # Environment setup inside the generated job script.
    cluster.add_command("source activate dialog")
    cluster.add_command(
        "export PYTHONPATH=$PYTHONPATH:/private/home/koustuvs/mlp/latentDialogAnalysis"
    )

    # 32GB-volta constraint is disabled; only the partition is requested.
    # cluster.add_slurm_cmd(cmd='constraint', value='volta32gb',
    #                     comment='use 32gb gpus')
    cluster.add_slurm_cmd(cmd="partition",
                          value=hyperparams.gpu_partition,
                          comment="use 32gb gpus")

    # Creates and submits one slurm job per trial.
    cluster.optimize_parallel_cluster_gpu(
        main,
        nb_trials=hyperparams.nb_hopt_trials,
        job_name=hyperparams.id + "_grid_search",
        job_display_name=hyperparams.id,
    )
# ===== Example #7 (score: 0) =====
    # NOTE(review): this example is a fragment — the enclosing `def` (where
    # `cluster` is created) is not included in this snippet.
    # Email notifications for job completion / failure (address redacted).
    cluster.notify_job_status(email='*****@*****.**',
                              on_done=True,
                              on_fail=True)

    # SLURM Module to load.
    cluster.load_modules(['python-3', 'anaconda3'])

    # Add commands to the non-SLURM portion.
    cluster.add_command('source activate transformers')

    # Add custom SLURM commands which show up as:
    # #comment
    # #SBATCH --cmd=value
    # ############
    # cluster.add_slurm_cmd(
    #    cmd='cpus-per-task', value='1', comment='CPUS per task.')

    # Set job compute details (this will apply PER set of hyperparameters.)
    cluster.per_experiment_nb_gpus = 4
    cluster.per_experiment_nb_nodes = 2
    cluster.gpu_type = '1080ti'

    # Each hyperparameter combination will use 8 gpus (4 gpus x 2 nodes).
    cluster.optimize_parallel_cluster_gpu(
        # Function to execute:
        train,
        # Number of hyperparameter combinations to search:
        nb_trials=24,
        # This is what will display in the slurm queue:
        job_name='first_tt_job')
# ===== Example #8 (score: 0) =====
def run_cluster(parser, fn_main, lt_system):
    """Parse CLI args, configure a SlurmCluster, and submit multi-node jobs.

    Args:
        parser: an argparse-style parser; ``parse_args()`` supplies all
            cluster settings (partition, time, gpus, nodes, memory, ...).
        fn_main: the training entry point; called per trial as
            ``fn_main(params, lt_system, optimizer)``.
        lt_system: the Lightning system object forwarded to ``fn_main``.
    """
    params = parser.parse_args()

    # Suffix "_3d" onto the model name when running in 3d mode.
    if params.system_mode == "3d" and "3d" not in params.model_name:
        params.model_name += "_3d"

    # Allow a bare hour count ("17") and expand it to "17:00:00".
    if ':' not in params.time:
        params.time = f"{int(params.time):02d}:00:00"

    arch = platform.uname().processor

    loaded_module = ''
    partition = params.partition
    # NOTE(review): the original `if partition is None:` guard is commented
    # out, so the architecture check below unconditionally overrides any
    # user-supplied partition for these two architectures — confirm intended.
    # if partition is None:
    if arch == 'x86_64':
        partition = 'npl'
    elif arch == 'ppc64le':
        partition = 'dcs,rpi'

    # Pick the MPI/CUDA module stack matching the partition.
    if partition == 'npl':
        loaded_module = "module load gcc cuda openmpi"
    else:
        loaded_module = "module load spectrum-mpi"

    log_path = os.path.join(os.environ['HOME'], params.slurm_log_root)

    cluster = SlurmCluster(hyperparam_optimizer=params,
                           log_path=log_path,
                           python_cmd="python")

    # Email notifications (disabled):
    # cluster.notify_job_status(email='',
    #                           on_fail=True,
    #                           on_done=False)

    # Per-experiment resource requests.
    cluster.per_experiment_nb_gpus = params.n_gpus
    cluster.per_experiment_nb_nodes = params.num_nodes
    cluster.per_experiment_nb_cpus = 0  # disable this option
    cluster.job_time = params.time
    cluster.minutes_to_checkpoint_before_walltime = 2  # 2 min walltime
    # Node memory derived from gpus * cpus-per-task * mem-per-cpu.
    cluster.memory_mb_per_node = int(params.n_gpus) * int(
        params.cpus_per_task) * int(params.mem_per_cpu)

    # NOTE(review): this uses params.partition, not the locally-derived
    # `partition` above — so the arch-based default is never submitted.
    if params.partition is not None:
        cluster.add_slurm_cmd('partition',
                              value=params.partition,
                              comment='cluster partition name')
    cluster.add_slurm_cmd('ntasks-per-node',
                          value=params.n_gpus,
                          comment='#task per node')
    cluster.add_slurm_cmd('cpus-per-task',
                          value=params.cpus_per_task,
                          comment='#cpu per task/gpu')
    cluster.add_slurm_cmd('mem-per-cpu',
                          value=params.mem_per_cpu,
                          comment="memory per cpu")

    # cluster.memory_mb_per_node = params.memory  # disable this option

    cluster.add_command('export PYTHONFAULTHANDLER=1')
    # cluster.add_command('export NCCL_DEBUG=INFO')
    cluster.add_command(loaded_module)

    # Derive the DDP master address from the first host in the node list.
    cluster.add_command(
        "export SLURM_JOB_NODELIST=$(scontrol show hostnames $SLURM_JOB_NODELIST | tr '\\n' ' ')"
    )
    cluster.add_command("export SLURM_NODELIST=$SLURM_JOB_NODELIST")
    cluster.add_command(
        "slurm_nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST)")
    cluster.add_command(
        "export MASTER_ADDRESS=$(echo $slurm_nodes | cut -d' ' -f1)")

    # Fall back to the model name when no explicit job name is given.
    if params.job_name is None:
        job_name = params.model_name
    else:
        job_name = params.job_name

    # Creates and submits one slurm job per trial.
    cluster.optimize_parallel_cluster_gpu(
        # Function to execute
        lambda par, _optimizer: fn_main(par, lt_system, _optimizer),
        # Number of hyperparameter combinations to search:
        nb_trials=params.nb_trials,
        enable_auto_resubmit=params.auto_resubmit,
        # This is what will display in the slurm queue:
        job_name=job_name)