def queue_experiments(config_file, force_duplicates):
    seml_config, slurm_config, experiment_config = db_utils.read_config(config_file)

    # Set Slurm config with default parameters as fall-back option.
    default_slurm_config = get_default_slurm_config()
    for k, v in default_slurm_config['sbatch_options'].items():
        if k not in slurm_config['sbatch_options'].keys():
            slurm_config['sbatch_options'][k] = v
    del default_slurm_config['sbatch_options']
    for k, v in default_slurm_config.items():
        if k not in slurm_config.keys():
            slurm_config[k] = v

    slurm_config['sbatch_options'] = utils.remove_dashes(slurm_config['sbatch_options'])
    collection = db_utils.get_collection(seml_config['db_collection'])
    configs = generate_configs(experiment_config)

    if not force_duplicates:
        len_before = len(configs)
        configs = filter_experiments(collection, configs)
        len_after = len(configs)
        if len_after != len_before:
            print(f"{len_before - len_after} of {len_before} experiment{s_if(len_before)} were already found "
                  f"in the database. They were not added again.")

    # Add the configurations to the database with QUEUED status.
    if len(configs) > 0:
        queue_configs(collection, seml_config, slurm_config, configs)

def queue_experiments(config_file, force_duplicates):
    tracking_config, _, experiment_config = db_utils.read_config(config_file)
    collection = db_utils.get_collection(tracking_config['db_collection'])
    configs = generate_configs(experiment_config)

    if not force_duplicates:
        len_before = len(configs)
        configs = filter_experiments(collection, configs)
        len_after = len(configs)
        if len_after != len_before:
            print(f"{len_before - len_after} of {len_before} experiment{s_if(len_before)} were already found "
                  f"in the database. They were not added again.")

    # Add the configurations to the database with QUEUED status.
    if len(configs) > 0:
        queue_configs(collection, tracking_config, configs)

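
# Usage sketch (not part of the original code): queue all parameter configurations
# from a YAML file, skipping ones already present in the database collection.
# The file name 'example_config.yaml' is hypothetical; read_config is assumed to
# return the (tracking/seml, slurm, experiment) sections used above.
queue_experiments("example_config.yaml", force_duplicates=False)
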
def start_slurm_job(exps, log_verbose, output_dir=".", sbatch_options=None):
    """Run a list of experiments as a job on the Slurm cluster.

    Parameters
    ----------
    exps: List[dict]
        List of experiments to run.
    log_verbose: bool
        Print all the Python syscalls before running them.
    output_dir: str
        Directory (relative to home directory) where to store the Slurm output files.
    sbatch_options: dict
        A dictionary that contains options for #SBATCH, e.g., {'--mem': 8000} to limit
        the job's memory to 8,000 MB.

    Returns
    -------
    None
    """
    id_strs = [str(exp['_id']) for exp in exps]
    job_name = f"{exps[0]['tracking']['db_collection']}_{','.join(id_strs)}"
    output_dir_path = os.path.abspath(os.path.expanduser(output_dir))
    if not os.path.isdir(output_dir_path):
        raise ValueError(f"Slurm output directory '{output_dir_path}' does not exist.")

    sbatch_dict = get_default_sbatch_dict()
    if sbatch_options is not None:
        sbatch_dict.update(sbatch_options)
    sbatch_dict['--job-name'] = job_name
    sbatch_dict['--output'] = f'{output_dir_path}/slurm-%j.out'

    # Construct the #SBATCH header of the batch script.
    script = "#!/bin/bash\n"
    for key, value in sbatch_dict.items():
        if key in ['--partition', '-p'] and isinstance(value, list):
            script += f"#SBATCH {key}={','.join(value)}\n"
        else:
            script += f"#SBATCH {key}={value}\n"

    script += "\n"
    script += "cd ${SLURM_SUBMIT_DIR} \n"
    script += "echo Starting job ${SLURM_JOBID} \n"
    script += "echo SLURM assigned me these nodes:\n"
    script += "squeue -j ${SLURM_JOBID} -O nodelist | tail -n +2\n"

    collection = db_utils.get_collection(exps[0]['tracking']['db_collection'])

    if "conda_environment" in exps[0]['tracking']:
        script += "CONDA_BASE=$(conda info --base)\n"
        script += "source $CONDA_BASE/etc/profile.d/conda.sh\n"
        script += f"conda activate {exps[0]['tracking']['conda_environment']}\n"

    check_file = check_cancelled.__file__
    script += "process_ids=() \n"
    script += f"exp_ids=({' '.join([str(e['_id']) for e in exps])}) \n"
    for ix, exp in enumerate(exps):
        cmd = get_cmd_from_exp_dict(exp)
        collection_str = exp['tracking']['db_collection']
        # Only start the experiment if it has not been cancelled in the meantime (exit code 0).
        script += f"python {check_file} --experiment_id {exp['_id']} --database_collection {collection_str}\n"
        script += "ret=$?\n"
        script += "if [ $ret -eq 0 ]\n"
        script += "then\n"
        script += f"    {cmd} & \n"
        script += f"    process_ids[{ix}]=$!\n"
        script += "elif [ $ret -eq 1 ]\n"
        script += "then\n"
        script += f"    echo WARNING: Experiment with ID {exp['_id']} has status INTERRUPTED and will not be run.\n"
        script += "elif [ $ret -eq 2 ]\n"
        script += "then\n"
        script += f"    (>&2 echo ERROR: Experiment with id {exp['_id']} not found in the database.)\n"
        script += "fi\n"

        collection.update_one({'_id': exp['_id']},
                              {'$set': {'status': 'PENDING'}})
        collection.update_one({'_id': exp['_id']},
                              {'$set': {'slurm': dict(sbatch_options=sbatch_options, step_id=ix)}})
        if log_verbose:
            print(f'Running the following command:\n {cmd}')

    script += "echo Experiments are running under the following process IDs:\n"
    script += "num_it=${#process_ids[@]}\n"
    script += "for ((i=0; i<$num_it; i++))\n"
    script += "do\n"
    script += "    echo \"Experiment ID: ${exp_ids[$i]}\tProcess ID: ${process_ids[$i]}\"\n"
    script += "done\n"
    script += "wait \n"

    # Write the batch script to a unique temporary file and submit it via sbatch.
    random_int = np.random.randint(0, 999999)
    path = f"/tmp/{random_int}.sh"
    while os.path.exists(path):
        random_int = np.random.randint(0, 999999)
        path = f"/tmp/{random_int}.sh"

    with open(path, "w") as f:
        f.write(script)

    output = subprocess.check_output(f'sbatch {path}', shell=True)
    os.remove(path)
    slurm_job_id = int(output.split(b' ')[-1])
    for exp in exps:
        collection.update_one({'_id': exp['_id']},
                              {'$set': {'slurm.id': slurm_job_id,
                                        'slurm.output_file': f"{output_dir_path}/slurm-{slurm_job_id}.out"}})
    if log_verbose:
        print(f"Started experiment with ID {slurm_job_id}")

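
# Example (sketch, not part of the original code): submit two queued experiments
# as a single Slurm job. The collection name 'my_experiments' and the sbatch
# options are hypothetical; db_utils.get_collection is assumed to reach MongoDB.
example_collection = db_utils.get_collection('my_experiments')
queued_exps = list(example_collection.find({'status': 'QUEUED'}).limit(2))
if queued_exps:
    start_slurm_job(queued_exps, log_verbose=True, output_dir=".",
                    sbatch_options={'--mem': 8000, '--gres': 'gpu:1'})
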
def do_work(collection_name, log_verbose, slurm=True, num_exps=-1, slurm_config=None, filter_dict=None):
    """Pull queued experiments from the database and run them.

    Parameters
    ----------
    collection_name: str
        Name of the collection in the MongoDB.
    log_verbose: bool
        Print all the Python syscalls before running them.
    slurm: bool
        Use the Slurm cluster.
    num_exps: int, default: -1
        If >0, will only submit the specified number of experiments to the cluster.
        This is useful when you only want to test your setup.
    slurm_config: dict
        Settings for the Slurm job. See `start_slurm_job` for details.
    filter_dict: dict
        Dictionary for filtering the entries in the collection.

    Returns
    -------
    None
    """
    if slurm_config is None:
        # Default Slurm config.
        slurm_config = {'output_dir': '.', 'experiments_per_job': 1}
    if filter_dict is None:
        filter_dict = {}

    collection = db_utils.get_collection(collection_name)

    query_dict = {'status': {"$in": ['QUEUED']}}
    query_dict.update(filter_dict)

    if collection.count_documents(query_dict) <= 0:
        print("No queued experiments.")
        return

    exps_list = list(collection.find(query_dict))

    # Divide experiments into chunks of <experiments_per_job> that will be run in parallel on one GPU.
    def chunk_list(seq, size):
        return (seq[pos:pos + size] for pos in range(0, len(seq), size))

    nexps = num_exps if num_exps > 0 else len(exps_list)
    exp_chunks = chunk_list(exps_list[:nexps], size=slurm_config['experiments_per_job'])
    njobs = math.ceil(nexps / slurm_config['experiments_per_job'])
    del slurm_config['experiments_per_job']

    if slurm:
        print(f"Starting {nexps} experiment{s_if(nexps)} in "
              f"{njobs} Slurm job{s_if(njobs)}.")
    else:
        print(f"Starting {nexps} experiment{s_if(nexps)} locally.")
        for exp in exps_list[:nexps]:
            collection.update_one({'_id': exp['_id']}, {'$set': {'status': 'PENDING'}})

    for ix, exps in tqdm(enumerate(exp_chunks), total=njobs):
        if slurm:
            start_slurm_job(exps, log_verbose, **slurm_config)
        else:
            if 'fileserver' in os.uname()[1]:
                raise ValueError("Refusing to run a compute experiment on a file server. "
                                 "Please use a GPU machine or slurm.")
            for exp in exps:
                cmd = get_cmd_from_exp_dict(exp)
                if log_verbose:
                    print(f'Running the following command:\n {cmd}')
                # pdb works with check_call but not with check_output. Maybe because of stdout/stdin.
                subprocess.check_call(cmd, shell=True)

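
# Worked example (sketch, not part of the original code) of the chunking above:
# with 5 queued experiments and experiments_per_job=2, chunk_list yields three
# chunks of sizes 2, 2, and 1, so math.ceil(5 / 2) == 3 Slurm jobs are submitted.
import math

def _chunk_list(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

_example_exps = [{'_id': i} for i in range(5)]
_example_chunks = list(_chunk_list(_example_exps, size=2))
assert [len(c) for c in _example_chunks] == [2, 2, 1]
assert len(_example_chunks) == math.ceil(len(_example_exps) / 2) == 3
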
import argparse

from seml import database_utils as db_utils


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Check whether the experiment with given ID has been cancelled before its start.",
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("--experiment_id", type=int,
                        help="The experiment ID to check in the database.")
    parser.add_argument("--database_collection", type=str,
                        help="The collection in the database to use.")
    args = parser.parse_args()

    exp_id = args.experiment_id
    db_collection = args.database_collection

    mongodb_config = db_utils.get_mongodb_config()
    collection = db_utils.get_collection(db_collection, mongodb_config)
    exp = collection.find_one({'_id': exp_id})

    # Exit codes: 0 = experiment is still QUEUED/PENDING, 1 = it was cancelled or
    # has already started, 2 = it was not found in the database.
    if exp is None:
        exit(2)
    if exp['status'] not in ["QUEUED", "PENDING"]:
        exit(1)
    else:
        exit(0)

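
# Example (sketch, not part of the original code): how a caller can consume the
# exit codes above. The script path and collection name are hypothetical; in
# start_slurm_job this check is embedded in the generated batch script instead.
import subprocess

ret = subprocess.call(["python", "check_cancelled.py",
                       "--experiment_id", "1",
                       "--database_collection", "my_experiments"])
if ret == 0:
    print("Experiment is still QUEUED/PENDING; safe to start.")
elif ret == 1:
    print("Experiment was cancelled or has already started; skipping.")
elif ret == 2:
    print("Experiment was not found in the database.")
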
def do_work(collection_name, log_verbose, slurm=True, unobserved=False, post_mortem=False,
            num_exps=-1, filter_dict=None, dry_run=False):
    """Pull queued experiments from the database and run them.

    Parameters
    ----------
    collection_name: str
        Name of the collection in the MongoDB.
    log_verbose: bool
        Print all the Python syscalls before running them.
    slurm: bool
        Use the Slurm cluster.
    unobserved: bool
        Disable all Sacred observers (nothing written to MongoDB).
    post_mortem: bool
        Activate post-mortem debugging.
    num_exps: int, default: -1
        If >0, will only submit the specified number of experiments to the cluster.
        This is useful when you only want to test your setup.
    filter_dict: dict
        Dictionary for filtering the entries in the collection.
    dry_run: bool
        Just return the executables and configurations instead of running them.

    Returns
    -------
    None
    """
    if filter_dict is None:
        filter_dict = {}

    collection = db_utils.get_collection(collection_name)

    query_dict = {'status': {"$in": ['QUEUED']}}
    query_dict.update(filter_dict)

    if collection.count_documents(query_dict) <= 0:
        print("No queued experiments.")
        return

    exps_list = list(collection.find(query_dict))

    nexps = num_exps if num_exps > 0 else len(exps_list)
    exp_chunks = db_utils.chunk_list(exps_list[:nexps])
    njobs = len(exp_chunks)

    if dry_run:
        configs = []
        for exps in exp_chunks:
            for exp in exps:
                configs.append(get_config_from_exp(exp, log_verbose=log_verbose,
                                                   unobserved=unobserved, post_mortem=post_mortem))
        return configs
    elif slurm:
        print(f"Starting {nexps} experiment{s_if(nexps)} in "
              f"{njobs} Slurm job{s_if(njobs)}.")
        for exps in tqdm(exp_chunks):
            slurm_config = exps[0]['slurm']
            seml_config = exps[0]['seml']
            if 'output_dir' in slurm_config:
                warnings.warn("'output_dir' has moved from 'slurm' to 'seml'. Please adapt your YAML accordingly "
                              "by moving the 'output_dir' parameter from 'slurm' to 'seml'.")
            elif 'output_dir' in seml_config:
                slurm_config['output_dir'] = seml_config['output_dir']
            del slurm_config['experiments_per_job']
            start_slurm_job(collection, exps, log_verbose, unobserved, post_mortem, **slurm_config)
    else:
        login_node_name = 'fs'
        if login_node_name in os.uname()[1]:
            raise ValueError("Refusing to run a compute experiment on a login node. "
                             "Please use Slurm or a compute node.")
        print(f'Starting local worker thread that will run up to {nexps} experiments, '
              f'until no queued experiments remain.')
        collection.update_many(query_dict, {"$set": {"status": "PENDING"}})
        num_exceptions = 0
        i_exp = 0
        tq = tqdm(exp_chunks)
        for exps in tq:
            for exp in exps:
                exe, config = get_config_from_exp(exp, log_verbose=log_verbose,
                                                  unobserved=unobserved, post_mortem=post_mortem)
                cmd = f"python {exe} with {' '.join(config)}"
                if not unobserved:
                    # Also check whether PENDING experiments have their Slurm ID set; in that case they are
                    # waiting for Slurm execution and we don't start them locally.
                    db_entry = collection.find_one_and_update(
                        filter={'_id': exp['_id'], 'status': 'PENDING', 'slurm.id': {'$exists': False}},
                        update={'$set': {'seml.command': cmd, 'status': 'RUNNING'}},
                        upsert=False)
                    if db_entry is None:
                        # Another worker has already claimed this entry (or it is no longer QUEUED), so we skip it.
                        continue
                if log_verbose:
                    print(f'Running the following command:\n {cmd}')
                try:
                    output_dir = "."
                    slurm_config = exps[0]['slurm']
                    seml_config = exps[0]['seml']
                    if 'output_dir' in slurm_config:
                        warnings.warn("'output_dir' has moved from 'slurm' to 'seml'. Please adapt your YAML accordingly "
                                      "by moving the 'output_dir' parameter from 'slurm' to 'seml'.")
                        output_dir = slurm_config['output_dir']
                    if 'output_dir' in seml_config:
                        output_dir = seml_config['output_dir']
                    output_dir_path = os.path.abspath(os.path.expanduser(output_dir))
                    exp_name = slurm_config['name']
                    output_file = f"{output_dir_path}/{exp_name}_{exp['_id']}-out.txt"
                    collection.find_and_modify({'_id': exp['_id']},
                                               {"$set": {"seml.output_file": output_file}})
                    with open(output_file, "w") as log_file:
                        # pdb works with check_call but not with check_output. Maybe because of stdout/stdin.
                        subprocess.check_call(cmd, shell=True, stderr=log_file, stdout=log_file)
                except subprocess.CalledProcessError:
                    num_exceptions += 1
                except IOError:
                    print(f"Log file {output_file} could not be written.")
                    # Since Sacred is never called in case of an I/O error, we need to set the experiment state manually.
                    collection.find_one_and_update(filter={'_id': exp['_id']},
                                                   update={'$set': {'status': 'FAILED'}},
                                                   upsert=False)
                finally:
                    i_exp += 1
                    tq.set_postfix(failed=f"{num_exceptions}/{i_exp} experiments")

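
# Example (sketch, not part of the original code): inspect what the do_work
# variant above would run, without executing anything. The collection name and
# the 'config.batch_size' filter are hypothetical.
pending = do_work('my_experiments', log_verbose=False, num_exps=2,
                  filter_dict={'config.batch_size': 64}, dry_run=True)
for exe, config in pending or []:
    print(f"python {exe} with {' '.join(config)}")
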
def do_work(collection_name, log_verbose, slurm=True, num_exps=-1, filter_dict=None):
    """Pull queued experiments from the database and run them.

    Parameters
    ----------
    collection_name: str
        Name of the collection in the MongoDB.
    log_verbose: bool
        Print all the Python syscalls before running them.
    slurm: bool
        Use the Slurm cluster.
    num_exps: int, default: -1
        If >0, will only submit the specified number of experiments to the cluster.
        This is useful when you only want to test your setup.
    filter_dict: dict
        Dictionary for filtering the entries in the collection.

    Returns
    -------
    None
    """
    if filter_dict is None:
        filter_dict = {}

    collection = db_utils.get_collection(collection_name)

    query_dict = {'status': {"$in": ['QUEUED']}}
    query_dict.update(filter_dict)

    if collection.count_documents(query_dict) <= 0:
        print("No queued experiments.")
        return

    exps_list = list(collection.find(query_dict))

    nexps = num_exps if num_exps > 0 else len(exps_list)
    exp_chunks = db_utils.chunk_list(exps_list[:nexps])
    njobs = len(exp_chunks)

    if slurm:
        print(f"Starting {nexps} experiment{s_if(nexps)} in "
              f"{njobs} Slurm job{s_if(njobs)}.")
        for exps in tqdm(exp_chunks):
            slurm_config = exps[0]['slurm']
            del slurm_config['experiments_per_job']
            start_slurm_job(collection, exps, log_verbose, **slurm_config)
    else:
        login_node_name = 'fs'
        if login_node_name in os.uname()[1]:
            raise ValueError("Refusing to run a compute experiment on a login node. "
                             "Please use Slurm or a compute node.")
        print(f"Starting {nexps} experiment{s_if(nexps)} locally.")
        for exp in exps_list[:nexps]:
            collection.update_one({'_id': exp['_id']}, {'$set': {'status': 'PENDING'}})
        for exps in tqdm(exp_chunks):
            for exp in exps:
                cmd = get_cmd_from_exp_dict(exp)
                if log_verbose:
                    print(f'Running the following command:\n {cmd}')
                # pdb works with check_call but not with check_output. Maybe because of stdout/stdin.
                subprocess.check_call(cmd, shell=True)