def optimize_on_cluster(hyperparams):
    """Configure a ``SlurmCluster`` and launch the hyperparameter search.

    The attributes read from ``hyperparams`` are ``slurm_log_path``,
    ``email``, ``per_experiment_nb_gpus``, ``nb_gpu_nodes``, ``conda_env``,
    ``gpu_partition``, ``num_hyperparam_trials`` and ``experiment_name``.
    The module-level ``main`` is the function handed to the cluster.

    Args:
        hyperparams: Hyperparameter object; it is also passed to
            ``SlurmCluster`` as ``hyperparam_optimizer``.
    """
    # Cluster handle; all scripts are logged to the test tube folder.
    slurm = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.slurm_log_path,
    )

    # E-mail notifications for both finished and failed jobs.
    slurm.notify_job_status(
        email=hyperparams.email,
        on_done=True,
        on_fail=True,
    )

    # Resource request applied to each experiment.
    slurm.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    slurm.per_experiment_nb_nodes = hyperparams.nb_gpu_nodes
    slurm.job_time = '2:00:00'
    slurm.gpu_type = 'volta'
    slurm.memory_mb_per_node = 0

    # Activate the conda environment the code should run in.
    activate_env = f'source activate {hyperparams.conda_env}'
    slurm.add_command(activate_env)

    # Extra SLURM directives as (cmd, value, comment): restrict the job to
    # 32GB voltas and select the partition.
    sbatch_directives = (
        ('constraint', 'volta32gb', 'use 32gb gpus'),
        ('partition', hyperparams.gpu_partition, 'use 32gb gpus'),
    )
    for directive, setting, note in sbatch_directives:
        slurm.add_slurm_cmd(cmd=directive, value=setting, comment=note)

    # Run the hyperparameter optimisation: creates and submits jobs to slurm.
    slurm.optimize_parallel_cluster_gpu(
        main,
        nb_trials=hyperparams.num_hyperparam_trials,
        job_name=hyperparams.experiment_name,
    )
def optimize_on_cluster(hyperparams):
    """Configure a ``SlurmCluster`` for DDP training and launch the search.

    The attributes read from ``hyperparams`` are ``slurm_log_path``,
    ``email``, ``per_experiment_nb_gpus``, ``nb_gpu_nodes``, ``conda_env``,
    ``gpu_partition``, ``num_hyperparam_trials`` and ``experiment_name``.
    The module-level names ``PORT`` (exported as ``MASTER_PORT``) and
    ``main`` (the function handed to the cluster) are used as well.

    Args:
        hyperparams: Hyperparameter object; it is also passed to
            ``SlurmCluster`` as ``hyperparam_optimizer``.
    """
    # Cluster handle; all scripts are logged to the test tube folder.
    slurm = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.slurm_log_path,
    )

    # E-mail notifications for both finished and failed jobs.
    slurm.notify_job_status(
        email=hyperparams.email,
        on_done=True,
        on_fail=True,
    )

    # Resource request applied to each experiment.
    slurm.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    slurm.per_experiment_nb_nodes = hyperparams.nb_gpu_nodes
    slurm.job_time = '2:00:00'
    slurm.gpu_type = 'volta'
    slurm.memory_mb_per_node = 0

    # Shell commands, added in this exact order.
    setup_commands = (
        # Activate the conda environment the code should run in.
        f'source activate {hyperparams.conda_env}',
        # Set the DDP master port.
        f'export MASTER_PORT={PORT}',
        # OPTIONAL, for debugging: without these flags errors in your code
        # will appear to be nccl errors.
        'export NCCL_DEBUG=INFO',
        'export PYTHONFAULTHANDLER=1',
    )
    for shell_line in setup_commands:
        slurm.add_command(shell_line)

    # Depending on your cluster config, you probably want to limit the wired
    # connection device:
    # slurm.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')

    # Depending on your cluster, you might need to load the latest NCCL
    # version:
    # slurm.load_modules(['NCCL/2.4.7-1-cuda.10.0'])

    # Extra SLURM directives as (cmd, value, comment): restrict the job to
    # 32GB voltas and select the partition.
    sbatch_directives = (
        ('constraint', 'volta32gb', 'use 32gb gpus'),
        ('partition', hyperparams.gpu_partition, 'use 32gb gpus'),
    )
    for directive, setting, note in sbatch_directives:
        slurm.add_slurm_cmd(cmd=directive, value=setting, comment=note)

    # Run the hyperparameter optimisation: creates and submits jobs to slurm.
    slurm.optimize_parallel_cluster_gpu(
        main,
        nb_trials=hyperparams.num_hyperparam_trials,
        job_name=hyperparams.experiment_name,
    )
def optimize_on_cluster(hyperparams):
    """Configure a ``SlurmCluster`` and launch a grid search over ``main``.

    The attributes read from ``hyperparams`` are ``slurm_log_path``,
    ``per_experiment_nb_gpus``, ``nb_gpu_nodes``, ``per_experiment_nb_cpus``,
    ``job_time``, ``gpu_type``, ``gpu_partition``, ``nb_hopt_trials`` and
    ``id``. The conda environment name and the ``PYTHONPATH`` entry are
    hard-coded in the commands below.

    Args:
        hyperparams: Hyperparameter object; it is also passed to
            ``SlurmCluster`` as ``hyperparam_optimizer``.
    """
    # Cluster handle; all scripts are logged to the test tube folder.
    slurm = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.slurm_log_path,
    )

    # E-mail notifications are currently disabled:
    # slurm.notify_job_status(email='add_email_here', on_done=True, on_fail=True)

    # Resource request applied to each experiment.
    slurm.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    slurm.per_experiment_nb_nodes = hyperparams.nb_gpu_nodes
    slurm.per_experiment_nb_cpus = hyperparams.per_experiment_nb_cpus
    slurm.job_time = hyperparams.job_time
    slurm.gpu_type = hyperparams.gpu_type
    slurm.memory_mb_per_node = 0

    # Shell commands, added in this exact order: activate the environment,
    # then make the project importable.
    setup_commands = (
        "source activate dialog",
        "export PYTHONPATH=$PYTHONPATH:/private/home/koustuvs/mlp/latentDialogAnalysis",
    )
    for shell_line in setup_commands:
        slurm.add_command(shell_line)

    # The 32GB volta constraint is currently disabled:
    # slurm.add_slurm_cmd(cmd='constraint', value='volta32gb',
    #                     comment='use 32gb gpus')
    partition_directive = {
        "cmd": "partition",
        "value": hyperparams.gpu_partition,
        "comment": "use 32gb gpus",
    }
    slurm.add_slurm_cmd(**partition_directive)

    # Run the hyperparameter optimisation: creates and submits jobs to slurm.
    grid_search_name = hyperparams.id + "_grid_search"
    slurm.optimize_parallel_cluster_gpu(
        main,
        nb_trials=hyperparams.nb_hopt_trials,
        job_name=grid_search_name,
        job_display_name=hyperparams.id,
    )
def optimize_on_cluster(hyperparams):
    """Create the slurm bash scripts that send the task to the cluster.

    For a reference single script check pl_submit.sh, located in this same
    folder. Apart from the log path, every setting (partition, walltime,
    mail options, GPU/node counts, modules, trial count and job name) is
    hard-coded below.

    Args:
        hyperparams: Hyperparameter object; ``slurm_log_path`` is read from
            it and the object itself is passed to ``SlurmCluster`` as
            ``hyperparam_optimizer``.
    """
    # Cluster handle; all scripts are logged to the test tube folder.
    slurm = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.slurm_log_path,
    )

    # Queue selection and task layout, as (cmd, value, comment).
    for directive, setting, note in (
        ('partition', 'gpu2', 'queue'),
        ('ntasks-per-node', '2', 'Tasks per node'),
    ):
        slurm.add_slurm_cmd(cmd=directive, value=setting, comment=note)

    slurm.job_time = '0-17:00:00'

    # E-mail settings for cluster communications.
    for directive, setting, note in (
        ('mail-type', 'all', 'Mail type'),
        ('mail-user', '*****@*****.**', 'Mail account'),
    ):
        slurm.add_slurm_cmd(directive, value=setting, comment=note)

    # Resource request applied to each experiment.
    slurm.per_experiment_nb_gpus = 2
    slurm.per_experiment_nb_nodes = 13
    slurm.memory_mb_per_node = 0

    # Modules and shell commands for the code to run, in this exact order.
    setup_commands = (
        'module purge',
        'module load python/3.8.2',
        'module load nvidia/cuda/9.1',
        'set',
    )
    for shell_line in setup_commands:
        slurm.add_command(shell_line)

    # Run the hyperparameter optimisation: creates and submits jobs to slurm.
    slurm.optimize_parallel_cluster_gpu(
        main,
        nb_trials=24,
        job_name='grid_test',
    )
def run_cluster(parser, fn_main, lt_system):
    """Parse CLI arguments, configure a ``SlurmCluster`` and submit trials.

    Args:
        parser: Object whose ``parse_args()`` returns the run parameters. The
            attributes read here are ``system_mode``, ``model_name``,
            ``time``, ``partition``, ``slurm_log_root``, ``n_gpus``,
            ``num_nodes``, ``cpus_per_task``, ``mem_per_cpu``, ``job_name``,
            ``nb_trials`` and ``auto_resubmit``.
        fn_main: Callable invoked for each trial as
            ``fn_main(par, lt_system, _optimizer)``.
        lt_system: Forwarded unchanged as the second argument of ``fn_main``.
    """
    params = parser.parse_args()

    # Tag 3D runs in the model name unless the name already mentions it.
    if params.system_mode == "3d" and "3d" not in params.model_name:
        params.model_name = params.model_name + "_3d"

    # A time without ':' is read as a number of hours and expanded to HH:00:00.
    if ':' not in params.time:
        hours = int(params.time)
        params.time = f"{hours:02d}:00:00"

    # Choose the module line from the processor architecture of this host.
    # NOTE(review): a recognised architecture overrides the user-supplied
    # partition for this choice (the original `if partition is None` guard is
    # commented out) - confirm this is intended.
    arch = platform.uname().processor
    partition = params.partition
    if arch == 'x86_64':
        partition = 'npl'
    elif arch == 'ppc64le':
        partition = 'dcs,rpi'
    loaded_module = ("module load gcc cuda openmpi"
                     if partition == 'npl' else "module load spectrum-mpi")

    home_dir = os.environ['HOME']
    log_path = os.path.join(home_dir, params.slurm_log_root)
    slurm = SlurmCluster(
        hyperparam_optimizer=params,
        log_path=log_path,
        python_cmd="python",
    )
    # E-mail notifications are currently disabled:
    # slurm.notify_job_status(email='', on_fail=True, on_done=False)

    # Resource request applied to each experiment.
    slurm.per_experiment_nb_gpus = params.n_gpus
    slurm.per_experiment_nb_nodes = params.num_nodes
    slurm.per_experiment_nb_cpus = 0  # disable this option
    slurm.job_time = params.time
    slurm.minutes_to_checkpoint_before_walltime = 2  # 2 min walltime
    gpu_count = int(params.n_gpus)
    cpus_each = int(params.cpus_per_task)
    mem_each = int(params.mem_per_cpu)
    slurm.memory_mb_per_node = gpu_count * cpus_each * mem_each

    # The partition directive is only emitted when the user supplied one.
    if params.partition is not None:
        slurm.add_slurm_cmd('partition',
                            value=params.partition,
                            comment='cluster partition name')

    # Remaining SLURM directives as (cmd, value, comment).
    sbatch_directives = (
        ('ntasks-per-node', params.n_gpus, '#task per node'),
        ('cpus-per-task', params.cpus_per_task, '#cpu per task/gpu'),
        ('mem-per-cpu', params.mem_per_cpu, "memory per cpu"),
    )
    for directive, setting, note in sbatch_directives:
        slurm.add_slurm_cmd(directive, value=setting, comment=note)
    # slurm.memory_mb_per_node = params.memory  # disable this option

    # Shell commands, added in this exact order.
    # NOTE(review): PyTorch's env:// rendezvous reads MASTER_ADDR; confirm
    # MASTER_ADDRESS is the variable name the training code expects.
    setup_commands = (
        'export PYTHONFAULTHANDLER=1',
        # 'export NCCL_DEBUG=INFO',
        loaded_module,
        # Master address for multi-node training.
        "export SLURM_JOB_NODELIST=$(scontrol show hostnames $SLURM_JOB_NODELIST | tr '\\n' ' ')",
        "export SLURM_NODELIST=$SLURM_JOB_NODELIST",
        "slurm_nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST)",
        "export MASTER_ADDRESS=$(echo $slurm_nodes | cut -d' ' -f1)",
    )
    for shell_line in setup_commands:
        slurm.add_command(shell_line)

    # The job name falls back to the model name when none was given.
    job_name = params.model_name if params.job_name is None else params.job_name

    slurm.optimize_parallel_cluster_gpu(
        # Function to execute for each trial.
        lambda par, _optimizer: fn_main(par, lt_system, _optimizer),
        # Number of hyperparameter combinations to search.
        nb_trials=params.nb_trials,
        enable_auto_resubmit=params.auto_resubmit,
        # This is what will display in the slurm queue.
        job_name=job_name,
    )
cluster.add_command('. /apps/daint/UES/6.0.UP04/sandboxes/sarafael/miniconda-ss2020/bin/activate') cluster.add_command('export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK') cluster.add_command('export NCCL_DEBUG=INFO') cluster.add_command('export PYTHONFAULTHANDLER=1') cluster.add_command('export NCCL_IB_HCA=ipogif0') cluster.add_command('export NCCL_IB_CUDA_SUPPORT=1') cluster.add_command('srun nproc') cluster.add_command('srun which python') # Add custom SLURM commands which show up as: # #comment # #SBATCH --cmd=value # ############ cluster.add_slurm_cmd(cmd='constraint', value='gpu', comment='GPU nodes') cluster.add_slurm_cmd(cmd='cpus-per-task', value=24, comment='ncpus') cluster.add_slurm_cmd(cmd='ntasks-per-core', value=1, comment='ntasks-per-core') cluster.add_slurm_cmd(cmd='ntasks-per-node', value=1, comment='ntasks-per-node') # Set job compute details (this will apply PER set of hyperparameters.) cluster.per_experiment_nb_gpus = 1 cluster.per_experiment_nb_nodes = hyperparams.gpus cluster.memory_mb_per_node = 32*1024 # Run hyperparameter combinations. cluster.optimize_parallel_cluster_gpu( # Function to execute: run_experiment, # Number of hyperparameter combinations to search: