from test_tube import SlurmCluster


def run_on_cluster(hyperparams):
    # enable cluster training
    cluster = SlurmCluster(hyperparam_optimizer=hyperparams,
                           log_path=hyperparams.logs_save_path)

    # email results if your hpc supports it
    cluster.notify_job_status(email='*****@*****.**', on_done=True, on_fail=True)

    # any modules for code to run in env
    cluster.add_command(f'source activate {hyperparams.conda_env}')

    # pick the gpu resources
    cluster.per_experiment_nb_gpus = hyperparams.gpus
    cluster.per_experiment_nb_cpus = 1
    cluster.per_experiment_nb_nodes = 1
    # cluster.gpu_type = 'k80'
    cluster.job_time = '48:00:00'
    cluster.minutes_to_checkpoint_before_walltime = 5
    cluster.memory_mb_per_node = 250000  # 180000

    # come up with a short exp name
    job_display_name = hyperparams.tt_name.split('_')[0]
    job_display_name = job_display_name[0:4]

    # optimize across all gpus
    print('submitting jobs...')
    cluster.optimize_parallel_cluster_gpu(main,
                                          nb_trials=hyperparams.nb_hopt_trials,
                                          job_name=job_display_name)
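Each of these snippets hands a training function (`main` or `train`) to `optimize_parallel_cluster_gpu`/`_cpu`, which submits one SLURM job per hyperparameter combination and invokes the function inside each job. A minimal sketch of what that callable might look like, assuming the two-argument `(params, optimizer)` signature used by the lambda in the final example below; the `learning_rate` field and the body are placeholders:

def main(hparams, cluster):
    # `hparams` holds one sampled/grid-point set of hyperparameters;
    # `cluster` is the SlurmCluster manager for this job
    # hypothetical field -- substitute whatever your parser defines
    print(f'training with lr={hparams.learning_rate}')
    # ... build the model, train, checkpoint, and log results here ...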
from test_tube import SlurmCluster


def optimize_on_cluster(hyperparams):
    # enable cluster training
    # log all scripts to the test tube folder
    cluster = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.slurm_log_path,
    )

    # email for cluster coms
    # cluster.notify_job_status(email='add_email_here', on_done=True, on_fail=True)

    # configure cluster
    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    cluster.per_experiment_nb_nodes = hyperparams.nb_gpu_nodes
    cluster.per_experiment_nb_cpus = hyperparams.per_experiment_nb_cpus
    cluster.job_time = hyperparams.job_time
    cluster.gpu_type = hyperparams.gpu_type
    cluster.memory_mb_per_node = 0

    # any modules for code to run in env
    cluster.add_command("source activate dialog")
    cluster.add_command(
        "export PYTHONPATH=$PYTHONPATH:/private/home/koustuvs/mlp/latentDialogAnalysis"
    )

    # run only on 32GB voltas
    # cluster.add_slurm_cmd(cmd='constraint', value='volta32gb',
    #                       comment='use 32gb gpus')
    cluster.add_slurm_cmd(cmd="partition", value=hyperparams.gpu_partition,
                          comment="gpu partition to use")

    # run hopt
    # creates and submits jobs to slurm
    cluster.optimize_parallel_cluster_gpu(
        main,
        nb_trials=hyperparams.nb_hopt_trials,
        job_name=hyperparams.id + "_grid_search",
        job_display_name=hyperparams.id,
    )
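This variant reads every resource setting from the parsed hyperparameters rather than hard-coding it, so the companion argument parser has to define those fields. A sketch of that wiring, assuming a test_tube `HyperOptArgumentParser`; the field names come from the snippet above, but every default here is illustrative:

from test_tube import HyperOptArgumentParser

# illustrative parser for the fields optimize_on_cluster() reads;
# defaults are placeholders, not values from the original project
parser = HyperOptArgumentParser(strategy='grid_search')
parser.add_argument('--id', default='dialog_exp')
parser.add_argument('--slurm_log_path', default='slurm_logs')
parser.add_argument('--per_experiment_nb_gpus', default=1, type=int)
parser.add_argument('--nb_gpu_nodes', default=1, type=int)
parser.add_argument('--per_experiment_nb_cpus', default=8, type=int)
parser.add_argument('--job_time', default='24:00:00')
parser.add_argument('--gpu_type', default='volta')
parser.add_argument('--gpu_partition', default='dev')
parser.add_argument('--nb_hopt_trials', default=1, type=int)

hyperparams = parser.parse_args()
optimize_on_cluster(hyperparams)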
from test_tube import SlurmCluster

# enable cluster training
cluster = SlurmCluster(hyperparam_optimizer=hyperparams,
                       log_path=hyperparams.log_path,
                       python_cmd='python3',
                       test_tube_exp_name=hyperparams.test_tube_exp_name)

# email results if your hpc supports it
cluster.notify_job_status(email='*****@*****.**', on_done=True, on_fail=True)

# any modules for code to run in env
cluster.load_modules(['python-3', 'anaconda3'])

# add commands to the non slurm portion
cluster.add_command('source activate myCondaEnv')

# can also add custom slurm commands which show up as:
# #comment
# #SBATCH --cmd=value
# ############
# cluster.add_slurm_cmd(cmd='cpus-per-task', value='1', comment='nb cpus per task')

# set job compute details (this will apply PER set of hyperparameters)
cluster.per_experiment_nb_cpus = 20
cluster.per_experiment_nb_nodes = 10

# each job (24 in total here) will use 200 cpus (20 cpus x 10 nodes) for its set of hyperparams
# if job_display_name is set, it's what will display in the slurm queue
cluster.optimize_parallel_cluster_cpu(train,
                                      nb_trials=24,
                                      job_name='first_tt_job',
                                      job_display_name='short_name')
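The `hyperparams` and `train` used above come from a `HyperOptArgumentParser`. A sketch, under the assumption of a grid search whose combinations total the 24 trials mentioned in the comment (4 x 6 = 24); the option values and the `train` body are placeholders:

from test_tube import HyperOptArgumentParser

# hypothetical 4 x 6 grid -> 24 hyperparameter combinations
parser = HyperOptArgumentParser(strategy='grid_search')
parser.opt_list('--learning_rate', default=1e-3, type=float,
                options=[1e-4, 1e-3, 1e-2, 1e-1], tunable=True)
parser.opt_list('--batch_size', default=32, type=int,
                options=[8, 16, 32, 64, 128, 256], tunable=True)
hyperparams = parser.parse_args()


def train(hparams, *args):
    # one grid point runs per SLURM job; extra args (the cluster
    # manager) are accepted generically
    print(f'lr={hparams.learning_rate}, batch_size={hparams.batch_size}')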
import os
import platform

from test_tube import SlurmCluster


def run_cluster(parser, fn_main, lt_system):
    params = parser.parse_args()
    if params.system_mode == "3d" and "3d" not in params.model_name:
        params.model_name += "_3d"
    if ':' not in params.time:
        params.time = f"{int(params.time):02d}:00:00"

    arch = platform.uname().processor
    loaded_module = ''
    partition = params.partition
    # if partition is None:
    if arch == 'x86_64':
        partition = 'npl'
    elif arch == 'ppc64le':
        partition = 'dcs,rpi'
    if partition == 'npl':
        loaded_module = "module load gcc cuda openmpi"
    else:
        loaded_module = "module load spectrum-mpi"

    log_path = os.path.join(os.environ['HOME'], params.slurm_log_root)
    cluster = SlurmCluster(hyperparam_optimizer=params,
                           log_path=log_path,
                           python_cmd="python")
    # cluster.notify_job_status(email='',
    #                           on_fail=True,
    #                           on_done=False)

    # configure cluster
    cluster.per_experiment_nb_gpus = params.n_gpus
    cluster.per_experiment_nb_nodes = params.num_nodes
    cluster.per_experiment_nb_cpus = 0  # disable this option
    cluster.job_time = params.time
    cluster.minutes_to_checkpoint_before_walltime = 2  # checkpoint 2 min before walltime
    cluster.memory_mb_per_node = (int(params.n_gpus) * int(params.cpus_per_task)
                                  * int(params.mem_per_cpu))

    if params.partition is not None:
        cluster.add_slurm_cmd('partition', value=params.partition,
                              comment='cluster partition name')
    cluster.add_slurm_cmd('ntasks-per-node', value=params.n_gpus,
                          comment='#tasks per node')
    cluster.add_slurm_cmd('cpus-per-task', value=params.cpus_per_task,
                          comment='#cpus per task/gpu')
    cluster.add_slurm_cmd('mem-per-cpu', value=params.mem_per_cpu,
                          comment="memory per cpu")
    # cluster.memory_mb_per_node = params.memory  # disable this option

    cluster.add_command('export PYTHONFAULTHANDLER=1')
    # cluster.add_command('export NCCL_DEBUG=INFO')
    cluster.add_command(loaded_module)

    # Master address for multi-node training
    cluster.add_command(
        "export SLURM_JOB_NODELIST=$(scontrol show hostnames $SLURM_JOB_NODELIST | tr '\\n' ' ')"
    )
    cluster.add_command("export SLURM_NODELIST=$SLURM_JOB_NODELIST")
    cluster.add_command(
        "slurm_nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST)")
    cluster.add_command(
        "export MASTER_ADDRESS=$(echo $slurm_nodes | cut -d' ' -f1)")

    if params.job_name is None:
        job_name = params.model_name
    else:
        job_name = params.job_name

    # Each hyperparameter combination will use n_gpus GPUs on each of num_nodes nodes.
    cluster.optimize_parallel_cluster_gpu(
        # Function to execute:
        lambda par, _optimizer: fn_main(par, lt_system, _optimizer),
        # Number of hyperparameter combinations to search:
        nb_trials=params.nb_trials,
        enable_auto_resubmit=params.auto_resubmit,
        # This is what will display in the slurm queue:
        job_name=job_name)
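`run_cluster` expects the parser to expose every flag it reads. A hypothetical driver, with flag names mirroring the attribute accesses in the function body; the defaults, `my_main`, and the `lt_system=None` stand-in are all illustrative:

from test_tube import HyperOptArgumentParser

# hypothetical invocation of run_cluster(); defaults are placeholders
parser = HyperOptArgumentParser(strategy='random_search')
parser.add_argument('--system_mode', default='2d')
parser.add_argument('--model_name', default='unet')
parser.add_argument('--time', default='6')  # bare hours get padded to HH:00:00
parser.add_argument('--partition', default=None)
parser.add_argument('--slurm_log_root', default='slurm_logs')
parser.add_argument('--n_gpus', default=4, type=int)
parser.add_argument('--num_nodes', default=2, type=int)
parser.add_argument('--cpus_per_task', default=8, type=int)
parser.add_argument('--mem_per_cpu', default=4000, type=int)
parser.add_argument('--job_name', default=None)
parser.add_argument('--nb_trials', default=1, type=int)
parser.add_argument('--auto_resubmit', action='store_true')


def my_main(hparams, lt_system, cluster):
    # placeholder: build the model/system for this trial and train it
    pass


run_cluster(parser, my_main, lt_system=None)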