def optimize_on_cluster(hyperparams):
    # enable cluster training
    cluster = SlurmCluster(hyperparam_optimizer=hyperparams,
                           log_path=hyperparams.tt_save_path,
                           test_tube_exp_name=hyperparams.tt_name)

    # email for cluster coms
    cluster.notify_job_status(email='add_email_here', on_done=True, on_fail=True)

    # configure cluster
    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    cluster.job_time = '48:00:00'
    cluster.gpu_type = '1080ti'
    cluster.memory_mb_per_node = 48000

    # any modules for code to run in env
    cluster.add_command('source activate pytorch_lightning')

    # name of exp
    job_display_name = hyperparams.tt_name.split('_')[0]
    job_display_name = job_display_name[0:3]

    # run hopt
    print('submitting jobs...')
    cluster.optimize_parallel_cluster_gpu(main,
                                          nb_trials=hyperparams.nb_hopt_trials,
                                          job_name=job_display_name)

def optimize_on_cluster(hyperparams):
    # enable cluster training
    # log all scripts to the test tube folder
    cluster = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.slurm_log_path,
    )

    # email for cluster coms
    cluster.notify_job_status(email=hyperparams.email, on_done=True, on_fail=True)

    # configure cluster
    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    cluster.per_experiment_nb_nodes = hyperparams.nb_gpu_nodes
    cluster.job_time = '2:00:00'
    cluster.gpu_type = 'volta'
    cluster.memory_mb_per_node = 0

    # any modules for code to run in env
    cluster.add_command(f'source activate {hyperparams.conda_env}')

    # run only on 32GB voltas
    cluster.add_slurm_cmd(cmd='constraint', value='volta32gb',
                          comment='use 32gb gpus')
    cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition,
                          comment='use 32gb gpus')

    # run hopt
    # creates and submits jobs to slurm
    cluster.optimize_parallel_cluster_gpu(
        main,
        nb_trials=hyperparams.num_hyperparam_trials,
        job_name=hyperparams.experiment_name
    )

def run_on_cluster(hyperparams):
    # enable cluster training
    cluster = SlurmCluster(hyperparam_optimizer=hyperparams,
                           log_path=hyperparams.logs_save_path)

    # email results if your hpc supports it
    cluster.notify_job_status(email='*****@*****.**', on_done=True, on_fail=True)

    # any modules for code to run in env
    cluster.add_command(f'source activate {hyperparams.conda_env}')

    # pick the gpu resources
    cluster.per_experiment_nb_gpus = hyperparams.gpus
    cluster.per_experiment_nb_cpus = 1
    cluster.per_experiment_nb_nodes = 1
    # cluster.gpu_type = 'k80'
    cluster.job_time = '48:00:00'
    cluster.minutes_to_checkpoint_before_walltime = 5
    cluster.memory_mb_per_node = 250000  # 180000

    # come up with a short exp name
    job_display_name = hyperparams.tt_name.split('_')[0]
    job_display_name = job_display_name[0:4]

    # optimize across all gpus
    print('submitting jobs...')
    cluster.optimize_parallel_cluster_gpu(main,
                                          nb_trials=hyperparams.nb_hopt_trials,
                                          job_name=job_display_name)

def optimize_on_cluster(hyperparams):
    # enable cluster training
    # log all scripts to the test tube folder
    cluster = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.slurm_log_path,
    )

    # email for cluster coms
    cluster.notify_job_status(email=hyperparams.email, on_done=True, on_fail=True)

    # configure cluster
    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    cluster.per_experiment_nb_nodes = hyperparams.nb_gpu_nodes
    cluster.job_time = '2:00:00'
    cluster.gpu_type = 'volta'
    cluster.memory_mb_per_node = 0

    # any modules for code to run in env
    cluster.add_command(f'source activate {hyperparams.conda_env}')

    # set DDP master port
    cluster.add_command(f'export MASTER_PORT={PORT}')

    # OPTIONAL for debugging
    # without these flags errors in your code will
    # appear to be nccl errors
    cluster.add_command('export NCCL_DEBUG=INFO')
    cluster.add_command('export PYTHONFAULTHANDLER=1')

    # depending on your cluster config, you probably want
    # to limit the wired connection device
    # cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')

    # depending on your cluster, you might need to load
    # the latest NCCL version
    # cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0'])

    # run only on 32GB voltas
    cluster.add_slurm_cmd(cmd='constraint', value='volta32gb',
                          comment='use 32gb gpus')
    cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition,
                          comment='use 32gb gpus')

    # run hopt
    # creates and submits jobs to slurm
    cluster.optimize_parallel_cluster_gpu(
        main,
        nb_trials=hyperparams.num_hyperparam_trials,
        job_name=hyperparams.experiment_name
    )

def optimize_on_cluster(hyperparams):
    # enable cluster training
    # log all scripts to the test tube folder
    cluster = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.slurm_log_path,
    )

    # email for cluster coms
    # cluster.notify_job_status(email='add_email_here', on_done=True, on_fail=True)

    # configure cluster
    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    cluster.per_experiment_nb_nodes = hyperparams.nb_gpu_nodes
    cluster.per_experiment_nb_cpus = hyperparams.per_experiment_nb_cpus
    cluster.job_time = hyperparams.job_time
    cluster.gpu_type = hyperparams.gpu_type
    cluster.memory_mb_per_node = 0

    # any modules for code to run in env
    cluster.add_command("source activate dialog")
    cluster.add_command(
        "export PYTHONPATH=$PYTHONPATH:/private/home/koustuvs/mlp/latentDialogAnalysis"
    )

    # run only on 32GB voltas
    # cluster.add_slurm_cmd(cmd='constraint', value='volta32gb',
    #                       comment='use 32gb gpus')
    cluster.add_slurm_cmd(cmd="partition", value=hyperparams.gpu_partition,
                          comment="use 32gb gpus")

    # run hopt
    # creates and submits jobs to slurm
    cluster.optimize_parallel_cluster_gpu(
        main,
        nb_trials=hyperparams.nb_hopt_trials,
        job_name=hyperparams.id + "_grid_search",
        job_display_name=hyperparams.id,
    )

def optimize_on_cluster(hyperparams):
    '''
    This function is in charge of creating the slurm bash scripts that will
    send our task to the cluster. For a reference single script check
    pl_submit.sh, located in this same folder.
    '''
    # enable cluster training
    # log all scripts to the test tube folder
    cluster = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.slurm_log_path,
    )

    cluster.add_slurm_cmd(cmd='partition', value='gpu2', comment='queue')
    cluster.add_slurm_cmd(cmd='ntasks-per-node', value='2', comment='Tasks per node')
    cluster.job_time = '0-17:00:00'

    # email for cluster coms
    cluster.add_slurm_cmd('mail-type', value='all', comment='Mail type')
    cluster.add_slurm_cmd('mail-user', value='*****@*****.**', comment='Mail account')

    # configure cluster
    cluster.per_experiment_nb_gpus = 2
    cluster.per_experiment_nb_nodes = 13
    cluster.memory_mb_per_node = 0

    # any modules for code to run in env
    cluster.add_command('module purge')
    cluster.add_command('module load python/3.8.2')
    cluster.add_command('module load nvidia/cuda/9.1')
    cluster.add_command('set')

    # run hopt
    # creates and submits jobs to slurm
    cluster.optimize_parallel_cluster_gpu(
        main,
        nb_trials=24,
        job_name='grid_test'
    )

cluster = SlurmCluster(
    hyperparam_optimizer=hyperparams,
    log_path=hyperparams.log_path,
    python_cmd='python3',
    # test_tube_exp_name=hyperparams.test_tube_exp_name
)

# Email results if your hpc supports it.
cluster.notify_job_status(email='*****@*****.**', on_done=True, on_fail=True)

# SLURM Module to load.
cluster.load_modules(['python-3', 'anaconda3'])

# Add commands to the non-SLURM portion.
cluster.add_command('source activate transformers')

# Add custom SLURM commands which show up as:
# #comment
# #SBATCH --cmd=value
# ############
# cluster.add_slurm_cmd(
#     cmd='cpus-per-task', value='1', comment='CPUS per task.')

# Set job compute details (this will apply PER set of hyperparameters.)
cluster.per_experiment_nb_gpus = 4
cluster.per_experiment_nb_nodes = 2
cluster.gpu_type = '1080ti'

# Each hyperparameter combination will use 8 gpus.
cluster.optimize_parallel_cluster_gpu(

parser.add_argument('--log_path', default='/some/path/to/log')
parser.opt_list('--y_val', default=12, options=[1, 2, 3, 4, 5, 6], tunable=True)
parser.opt_list('--x_val', default=12, options=[20, 12, 30, 45], tunable=True)
hyperparams = parser.parse_args()

# enable cluster training
cluster = SlurmCluster(hyperparam_optimizer=hyperparams,
                       log_path=hyperparams.log_path,
                       python_cmd='python3',
                       test_tube_exp_name=hyperparams.test_tube_exp_name)

# email results if your hpc supports it
cluster.notify_job_status(email='*****@*****.**', on_done=True, on_fail=True)

# any modules for code to run in env
cluster.load_modules(['python-3', 'anaconda3'])
cluster.add_command('source activate myCondaEnv')

# set job compute details (this will apply PER set of hyperparameters)
cluster.per_experiment_nb_gpus = 4
cluster.per_experiment_nb_nodes = 2
cluster.gpu_type = '1080ti'

# each job (24 in total here) will use 8 gpus for each set of hyperparams
cluster.optimize_parallel_cluster_gpu(train, nb_trials=24, job_name='first_tt_job')

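A minimal sketch of the `train` callback submitted above (not part of the original snippet): test_tube invokes it once per trial, passing the sampled hyperparameters and the cluster handle, matching the two-argument lambda used in the next example. The body and names below are illustrative assumptions.

def train(hparams, cluster):
    # each trial receives one concrete combination of the opt_list values,
    # e.g. hparams.x_val and hparams.y_val defined by the parser above
    print(f'running trial with x_val={hparams.x_val}, y_val={hparams.y_val}')
    # ... build the model, fit it, and write results under hparams.log_path ...
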
def run_cluster(parser, fn_main, lt_system):
    params = parser.parse_args()
    if params.system_mode == "3d" and "3d" not in params.model_name:
        params.model_name += "_3d"
    if ':' not in params.time:
        params.time = f"{int(params.time):02d}:00:00"

    arch = platform.uname().processor
    loaded_module = ''
    partition = params.partition
    # if partition is None:
    if arch == 'x86_64':
        partition = 'npl'
    elif arch == 'ppc64le':
        partition = 'dcs,rpi'
    if partition == 'npl':
        loaded_module = "module load gcc cuda openmpi"
    else:
        loaded_module = "module load spectrum-mpi"

    log_path = os.path.join(os.environ['HOME'], params.slurm_log_root)
    cluster = SlurmCluster(hyperparam_optimizer=params,
                           log_path=log_path,
                           python_cmd="python")
    # cluster.notify_job_status(email='',
    #                           on_fail=True,
    #                           on_done=False)

    # configure cluster
    cluster.per_experiment_nb_gpus = params.n_gpus
    cluster.per_experiment_nb_nodes = params.num_nodes
    cluster.per_experiment_nb_cpus = 0  # disable this option
    cluster.job_time = params.time
    cluster.minutes_to_checkpoint_before_walltime = 2  # 2 min walltime
    cluster.memory_mb_per_node = int(params.n_gpus) * int(
        params.cpus_per_task) * int(params.mem_per_cpu)

    if params.partition is not None:
        cluster.add_slurm_cmd('partition',
                              value=params.partition,
                              comment='cluster partition name')
    cluster.add_slurm_cmd('ntasks-per-node',
                          value=params.n_gpus,
                          comment='#task per node')
    cluster.add_slurm_cmd('cpus-per-task',
                          value=params.cpus_per_task,
                          comment='#cpu per task/gpu')
    cluster.add_slurm_cmd('mem-per-cpu',
                          value=params.mem_per_cpu,
                          comment="memory per cpu")
    # cluster.memory_mb_per_node = params.memory  # disable this option

    cluster.add_command('export PYTHONFAULTHANDLER=1')
    # cluster.add_command('export NCCL_DEBUG=INFO')
    cluster.add_command(loaded_module)

    # Master address for multi-node training
    cluster.add_command(
        "export SLURM_JOB_NODELIST=$(scontrol show hostnames $SLURM_JOB_NODELIST | tr '\\n' ' ')"
    )
    cluster.add_command("export SLURM_NODELIST=$SLURM_JOB_NODELIST")
    cluster.add_command(
        "slurm_nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST)")
    cluster.add_command(
        "export MASTER_ADDRESS=$(echo $slurm_nodes | cut -d' ' -f1)")

    if params.job_name is None:
        job_name = params.model_name
    else:
        job_name = params.job_name

    # Each hyperparameter combination will use 8 gpus.
    cluster.optimize_parallel_cluster_gpu(
        # Function to execute
        lambda par, _optimizer: fn_main(par, lt_system, _optimizer),
        # Number of hyperparameter combinations to search:
        nb_trials=params.nb_trials,
        enable_auto_resubmit=params.auto_resubmit,
        # This is what will display in the slurm queue:
        job_name=job_name)

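A hypothetical call site for run_cluster above, assuming a parser that already defines the flags the function reads (--time, --n_gpus, --num_nodes, --partition, and so on) and a Lightning-style system class; all names below are illustrative, not taken from the original snippet.

if __name__ == '__main__':
    # build_parser() is an assumed helper that defines the expected flags
    parser = build_parser()
    run_cluster(parser, fn_main=main, lt_system=MyLightningSystem)
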
cluster = SlurmCluster(
    hyperparam_optimizer=hyperparams,
    log_path=hyperparams.log_path,
    python_cmd='python3',
)

# Email results if your hpc supports it.
# cluster.notify_job_status(email='*****@*****.**', on_done=True, on_fail=True)

# SLURM Module to load.
cluster.load_modules([
    'daint-gpu',
])

# Add commands to the non-SLURM portion.
cluster.add_command('. /apps/daint/UES/6.0.UP04/sandboxes/sarafael/miniconda-ss2020/bin/activate')
cluster.add_command('export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK')
cluster.add_command('export NCCL_DEBUG=INFO')
cluster.add_command('export PYTHONFAULTHANDLER=1')
cluster.add_command('export NCCL_IB_HCA=ipogif0')
cluster.add_command('export NCCL_IB_CUDA_SUPPORT=1')
cluster.add_command('srun nproc')
cluster.add_command('srun which python')

# Add custom SLURM commands which show up as:
# #comment
# #SBATCH --cmd=value
# ############
cluster.add_slurm_cmd(cmd='constraint', value='gpu', comment='GPU nodes')