def parse_arguments() -> argparse.ArgumentParser:
    """Build the command-line argument parser for the pipeline runner.

    Returns
    -------
    argparse.ArgumentParser
        Parser configured with job-scheduler resources, divvy compute
        selection, dry-run switch, and project input/output options.
    """
    parser = argparse.ArgumentParser()

    # Scheduler resources forwarded verbatim to the submission template.
    msg = "The corresponding attribute to be passed to the job scheduler."
    parser.add_argument("--mem", dest="mem", default="48G", help=msg)
    parser.add_argument("--cores", dest="cores", default=4, help=msg)
    parser.add_argument("--time", dest="time", default="02:00:00", help=msg)
    parser.add_argument("--partition", dest="partition", default="panda", help=msg)

    # Restrict choices to the compute packages divvy knows about.
    choices = divvy.ComputingConfiguration().list_compute_packages()
    msg = "`Divvy` compute configuration to be used when submitting the jobs."
    parser.add_argument(
        "--divvy-configuration", dest="compute", choices=choices, help=msg
    )

    msg = "Whether to do all steps except job submission."
    parser.add_argument(
        "-d",
        "--dry-run",
        dest="dry_run",
        action="store_true",
        default=False,
        help=msg,
    )

    msg = "Attribute in sample annotation containing the path to the input files."
    parser.add_argument(
        "--attribute",
        dest="sample_file_attribute",
        default="sample_name",
        help=msg,
    )

    # BUG FIX: help text said "The parent directory of containting the input
    # data." — corrected the typo.
    msg = "The parent directory containing the input data."
    parser.add_argument("--input-dir", dest="input_dir", default="data", help=msg)

    msg = "Parent directory for output files."
    parser.add_argument("--output-dir", dest="output_dir", default="processed", help=msg)

    msg = "CSV file with metadata for all samples."
    parser.add_argument(dest="metadata", help=msg)

    # BUG FIX: the implicit string concatenation was missing a space between
    # the two literals, rendering "`toggle`column" in --help output.
    msg = (
        "Whether all samples or only samples marked with a positive value in the"
        " `toggle` column should be processed."
    )
    parser.add_argument(
        "--toggle", dest="toggle", action="store_true", default=False, help=msg
    )
    return parser
def set_comp_env():
    """Inspect or switch the active divvy compute package (web handler).

    Uses Flask-style globals (``request``, ``jsonify``, ``render_template``)
    — presumably registered as a route elsewhere; TODO confirm.  When a
    ``compute`` query parameter is present, attempts to activate that package
    and returns a JSON fragment with the result; otherwise renders the full
    preferences page.  The selected package and poll interval are persisted
    via ``write_preferences`` in both paths.
    """
    global active_settings
    # Lazily construct the divvy configuration on first use.
    if globs.compute_config is None:
        globs.compute_config = divvy.ComputingConfiguration()
    selected_package = request.args.get('compute', type=str)
    # Interval precedence: query parameter, then previous value, then the
    # module-level POLL_INTERVAL default.
    globs.status_check_interval = int(request.args.get('interval', type=int) or globs.status_check_interval or POLL_INTERVAL)
    if globs.compute_package is None:
        globs.compute_package = "default"
    if selected_package is not None:
        # Activation can fail (e.g. unknown package name): report via JSON
        # and keep the previously active package untouched.
        success = globs.compute_config.clean_start(selected_package)
        if not success:
            msg = "Compute package '{}' cannot be activated".format(selected_package)
            app.logger.warning(msg)
            return jsonify(active_settings=render_template('compute_info.html', active_settings=None, msg=msg))
        globs.compute_package = selected_package
        active_settings = globs.compute_config.get_active_package()
        # Persist the new selection before returning the JSON fragment.
        write_preferences({"status_check_interval": globs.status_check_interval, "compute_package": globs.compute_package})
        return jsonify(active_settings=render_template('compute_info.html', active_settings=active_settings))
    # No package requested: render the full preferences page with the
    # currently active settings.
    active_settings = globs.compute_config.get_active_package()
    # When only the bundled default config file is in use, surface the name of
    # the environment variable the user could set — presumably
    # COMPUTE_SETTINGS_VARNAME[0] is that variable; TODO confirm.
    notify_not_set = COMPUTE_SETTINGS_VARNAME[0] if \
        globs.compute_config.default_config_file == globs.compute_config.config_file else None
    write_preferences({"status_check_interval": globs.status_check_interval, "compute_package": globs.compute_package})
    return render_template('preferences.html', env_conf_file=globs.compute_config.config_file, compute_packages=globs.compute_config.list_compute_packages(), active_settings=active_settings, compute_package=globs.compute_package, notify_not_set=notify_not_set, default_interval=globs.status_check_interval)
def set_comp_env():
    """Inspect or switch the active divvy compute package (web handler).

    Uses Flask-style globals (``request``, ``jsonify``, ``render_template``)
    — presumably registered as a route elsewhere; TODO confirm.  When a
    ``compute`` query parameter is present, attempts to activate that package
    and returns a JSON fragment with the result; otherwise renders the
    ``set_comp_env.html`` page with the current state.
    """
    global active_settings
    # Lazily construct the divvy configuration on first use.
    if globs.compute_config is None:
        globs.compute_config = divvy.ComputingConfiguration()
    selected_package = request.args.get('compute', type=str)
    if globs.currently_selected_package is None:
        globs.currently_selected_package = "default"
    if selected_package is not None:
        # Activation can fail (e.g. unknown package name): report via JSON
        # and keep the previously selected package untouched.
        success = globs.compute_config.clean_start(selected_package)
        if not success:
            msg = "Compute package '{}' cannot be activated".format(
                selected_package)
            app.logger.warning(msg)
            return jsonify(active_settings=render_template(
                'compute_info.html', active_settings=None, msg=msg))
        globs.currently_selected_package = selected_package
        active_settings = globs.compute_config.get_active_package()
        return jsonify(active_settings=render_template(
            'compute_info.html', active_settings=active_settings))
    # No package requested: render the full page with the active settings.
    active_settings = globs.compute_config.get_active_package()
    # When only the bundled default config file is in use, surface the name of
    # the environment variable the user could set — presumably
    # COMPUTE_SETTINGS_VARNAME[0] is that variable; TODO confirm.
    notify_not_set = COMPUTE_SETTINGS_VARNAME[0] if globs.compute_config.default_config_file == globs.compute_config.config_file\
        else None
    return render_template(
        'set_comp_env.html',
        env_conf_file=globs.compute_config.config_file,
        compute_packages=globs.compute_config.list_compute_packages(),
        active_settings=active_settings,
        currently_selected_package=globs.currently_selected_package,
        notify_not_set=notify_not_set)
def main(cli=None) -> int:
    """Build and submit one IMCpipeline job per sample in a CSV annotation.

    Parameters
    ----------
    cli : list[str], optional
        Command-line tokens to parse instead of ``sys.argv``.

    Returns
    -------
    int
        Exit status; 0 on success.
    """
    log.info("IMCpipeline runner")
    parser = parse_arguments()
    args, unknown = parser.parse_known_args(cli)

    # the extra arguments will be passed to the pipeline and
    # compounded arguments (mostly the --cellprofiler-exec argument)
    # should be quoted again
    args.cli = []
    for token in unknown:
        args.cli.append("'" + token + "'" if " " in token else token)

    log.info("Generating project from given CSV annotation.")
    annot = pd.read_csv(args.metadata).set_index(args.sample_file_attribute)
    if args.toggle:
        log.info("Subsampling samples based on the `toggle` column.")
        keep = annot["toggle"].isin([1, "1", True, "TRUE", "True"])
        annot = annot.loc[keep, :]

    log.info("Setting compute settings using divvy.")
    compute = divvy.ComputingConfiguration()
    compute.activate_package(args.compute)

    # Now prepare job submission
    cli_args = " ".join(args.cli)
    # the '--' is to separate the nargs from the positional in case there aren't more args
    if not cli_args:
        cli_args = "--"

    jobs = []
    for sample in annot.index:
        log.info("Processing sample %s", sample)
        input_dir = pjoin(args.input_dir, sample)
        output_dir = pjoin(args.output_dir, sample)
        job_name = f"imcpipeline_{sample}"
        output_prefix = pjoin("submission", job_name)
        job_file = output_prefix + ".sh"
        job_data = {
            "jobname": job_name,
            "logfile": output_prefix + ".log",
            "mem": args.mem,
            "cores": args.cores,
            "time": args.time,
            "partition": args.partition,
            "code": f"imcpipeline {cli_args} -i {input_dir} -o {output_dir}",
        }
        compute.write_script(job_file, job_data)
        jobs.append(job_file)

    log.info("Submitting jobs.")
    submission_cmd = compute.get_active_package()["submission_command"]
    if not args.dry_run:
        for job in jobs:
            print(submission_cmd, job)
            subprocess.call([submission_cmd, job])
    log.info("Finished with all samples.")
    return 0
def test_activate_package(self):
    """Attribute access and mapping access agree after each activation."""
    dcc = divvy.ComputingConfiguration()
    for package_name in ("default", "slurm"):
        dcc.activate_package(package_name)
        via_attribute = dcc.compute.submission_template
        via_mapping = dcc["compute"]["submission_template"]
        assert via_attribute == via_mapping
def test_write_script_adapters(self, compute, package):
    """Adapter values sourced from various Mapping types reach the script."""
    dcc = divvy.ComputingConfiguration()
    dcc.activate_package(package)
    script_path = "test.sub"
    dcc.write_script(script_path, {"compute": compute})
    with open(script_path, "r") as handle:
        rendered = handle.read()
    assert rendered.find("1000") > 0
    os.remove(script_path)
def test_adapters_overwitten_by_others(self):
    """Later mappings in the extra-vars list override adapter-sourced values."""
    dcc = divvy.ComputingConfiguration()
    dcc.activate_package("singularity_slurm")
    overrides = [{"compute": YacAttMap({"mem": 1000})}, {"MEM": 333}]
    script_path = "test1.sub"
    dcc.write_script(script_path, overrides)
    with open(script_path, "r") as handle:
        rendered = handle.read()
    # The adapter value (1000) must be absent; the override (333) present.
    assert rendered.find("1000") <= 0
    assert rendered.find("333") > 0
    os.remove(script_path)
def test_write_script(self):
    """Rendered script contains the code and no leftover singularity placeholder."""
    dcc = divvy.ComputingConfiguration()
    # BUG FIX: removed a stray no-op bare `dcc` expression statement that was
    # left between construction and activation.
    dcc.activate_package("singularity_slurm")
    extra_vars = {
        "singularity_image": "simg",
        "jobname": "jbname",
        "code": "mycode",
    }
    dcc.write_script("test.sub", extra_vars)
    with open("test.sub", "r") as f:
        contents = f.read()
    assert contents.find("mycode") > 0
    assert contents.find("{SINGULARITY_ARGS}") < 0
    os.remove("test.sub")
return path def _req_input_to_args(req_input): """ Given a list of the required inputs for the build command, create an args string :param list[str] req_input: input names :return str: args string """ return ["--" + x + " <arg_here>" for x in req_input] subdir_path = _make_sub_dir(args.path, args.genome) dcc = divvy.ComputingConfiguration() dcc.activate_package("slurm") cmd_template = "refgenie build -g {g} -a {a} {req_input_str}" genome = args.genome to_remove = ["genome", "path"] data = vars(args) for i in to_remove: data.pop(i) for asset in asset_build_packages: sub_script = os.path.join(subdir_path, asset + ".sub") req_input = asset_build_packages[asset]["required_inputs"] if req_input: print( "{} asset requires additional input in the command ({}), so '{}'"
def dcc(request):
    """Build a ComputingConfiguration for each file in the divcfg repository."""
    configuration = divvy.ComputingConfiguration(filepath=request.param)
    return configuration
def empty_dcc():
    """Build a default (empty) ComputingConfiguration object."""
    configuration = divvy.ComputingConfiguration()
    return configuration
"""Shared pytest fixtures for the divvy test suite."""

import glob
import os

import divvy
import pytest

THIS_DIR = os.path.dirname(os.path.abspath(__file__))
# Divcfg configuration files bundled as test data.
DATA_DIR = os.path.join(THIS_DIR, "data/divcfg-master")
FILES = glob.glob(DATA_DIR + "/*.yaml")
# Keys exposed by a default ComputingConfiguration instance.
DCC_ATTRIBUTES = divvy.ComputingConfiguration().keys()


@pytest.fixture
def empty_dcc():
    """Provide the empty/default ComputingConfiguration object"""
    return divvy.ComputingConfiguration()


@pytest.fixture(params=FILES)
def dcc(request):
    """Provide ComputingConfiguration objects for all files in divcfg repository"""
    return divvy.ComputingConfiguration(filepath=request.param)


@pytest.fixture
def mock_env_missing(monkeypatch):
    """Remove divvy compute-settings variables from the environment."""
    # IDIOM FIX: a plain loop for side effects instead of a throwaway
    # list comprehension.
    for env_var in divvy.const.COMPUTE_SETTINGS_VARNAME:
        monkeypatch.delenv(env_var, raising=False)
def submit_job(
    code,
    job_file,
    log_file=None,
    computing_configuration=None,
    dry_run=False,
    limited_number=False,
    total_job_lim=500,
    refresh_time=10,
    in_between_time=5,
    **kwargs
):
    """
    Submit a job to be run.

    Uses divvy to allow running on a local computer or distributed
    computing resources.

    Parameters
    ----------
    code : :obj:`str`
        String of command(s) to be run.
    job_file : :obj:`str`
        File to write job ``code`` to.
    log_file : :obj:`str`
        Log file to write job output to.
        Defaults to ``job_file`` with ".log" ending.
    computing_configuration : :obj:`str`
        Name of :class:`divvy` computing configuration to use.
        Defaults to 'default' which is to run job in localhost.
    dry_run: :obj:`bool`
        Whether not to actually run job.
        Defaults to :obj:`False`.
    limited_number: :obj:`bool`
        Whether to restrict jobs to a maximum number.
        Currently only possible if using "slurm".
        Defaults to :obj:`False`.
    total_job_lim : :obj:`int`
        Maximum number of jobs to restrict to.
        Defaults to 500.
    refresh_time : :obj:`int`
        Time in between checking number of jobs in seconds.
        Defaults to 10.
    in_between_time : :obj:`int`
        Time in between job submission in seconds.
        Defaults to 5.
    **kwargs : :obj:`dict`
        Additional keyword arguments will be passed to the chosen submission
        template according to `computing_configuration`.
        Pass for example: jobname="job", cores=2, mem=8000, partition="longq".
    """
    import time
    import subprocess

    import divvy

    from ngs_toolkit import _CONFIG, _LOGGER

    # reduce level of logging from divvy
    # only for divvy <=0.
    if "logging" in divvy.__dict__.keys():
        divvy.logging.getLogger("divvy").setLevel("ERROR")

    def count_jobs_running(check_cmd="squeue", sep="\n"):
        """Count running jobs on a cluster by invoking a command that lists the jobs."""
        # BUG FIX: check_output returns bytes; decode before splitting on a
        # str separator (bytes.split(str) raises TypeError).
        return len(subprocess.check_output(check_cmd).decode().split(sep))

    def submit_job_if_possible(
        cmd, check_cmd="squeue", total_job_lim=800, refresh_time=10, in_between_time=5
    ):
        """Block until the queue has room, then submit and pause briefly."""
        while count_jobs_running(check_cmd) >= total_job_lim:
            time.sleep(refresh_time)
        subprocess.call(cmd)
        time.sleep(in_between_time)

    if log_file is None:
        log_file = ".".join(job_file.split(".")[:-1]) + ".log"

    # Get computing configuration from config
    if computing_configuration is None:
        try:
            computing_configuration = _CONFIG["preferences"]["computing_configuration"]
        except KeyError:
            msg = "'computing_configuration' was not given"
            msg += " and default could not be get from config."
            hint = " Pass a value or add one to the section"
            hint += " preferences:computing_configuration'"
            hint += " in the ngs_toolkit config file."
            _LOGGER.error(msg + hint)
            raise

    dcc = divvy.ComputingConfiguration()
    if computing_configuration is not None:
        dcc.activate_package(computing_configuration)

    # Generate job script
    d = {"code": code, "logfile": log_file}
    d.update(kwargs)
    dcc.write_script(job_file, d)

    # Submit job
    if not dry_run:
        scmd = dcc["compute"]["submission_command"]
        cmd = scmd.split(" ") + [job_file]

        # simply submit if not limiting submission to the number of already running jobs
        if not limited_number:
            subprocess.call(cmd)
        else:
            # otherwise, submit only after the number of running jobs drops
            # below `total_job_lim`; this is only possible for slurm now.
            # NOTE(review): comparing the submission command to "slurm" looks
            # suspect (slurm's submitter is usually "sbatch") — confirm
            # against the divcfg packages in use.
            if scmd != "slurm":
                subprocess.call(cmd)
            else:
                # BUG FIX: pass the queue-listing command ("squeue"), not
                # "slurm", which is not an executable; also forward the
                # documented limit/timing parameters, which were previously
                # ignored in favor of the helper's own defaults.
                submit_job_if_possible(
                    cmd,
                    check_cmd="squeue",
                    total_job_lim=total_job_lim,
                    refresh_time=refresh_time,
                    in_between_time=in_between_time,
                )