# Module-level imports assumed by the snippets below (module paths follow the
# bcbio.cwl layout these functions come from; treat them as an assumption).
import copy
import json
import os

import toolz as tz
import yaml

from bcbio.cwl import workflow
from bcbio.distributed import resources


def _write_tool(step_dir, name, inputs, outputs, parallel):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    out = {"class": "CommandLineTool",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "inputs": [],
           "outputs": []}
    if not parallel:
        inputs = [{"id": "#sentinel", "type": {"type": "array", "items": "string"},
                   "default": ["multisample"]}] + inputs
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = "#%s" % base_id
        inp_binding = {"prefix": "%s=" % base_id, "separate": False,
                       "itemSeparator": ";;", "position": i}
        if "secondaryFiles" in inp_tool:
            inp_binding["secondaryFiles"] = inp_tool.pop("secondaryFiles")
        if parallel:
            inp_tool["inputBinding"] = inp_binding
        else:
            inp_tool["type"]["inputBinding"] = inp_binding
        out["inputs"].append(inp_tool)
    # XXX Need to generalize outputs, just a hack for now to test align_prep
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool["id"] = "#%s" % workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:
        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)
        yaml.add_representer(str, str_presenter)
        yaml.dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))

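# --- Illustrative aside (not part of the original module) ---------------------
# The inline str_presenter above controls how PyYAML serializes strings:
# multi-line values come out in literal block style ('|'), single-line values
# stay plain. A minimal standalone sketch of the same pattern:
import yaml

def _demo_str_presenter(dumper, data):
    if len(data.splitlines()) > 1:  # multi-line -> block style
        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
    return dumper.represent_scalar('tag:yaml.org,2002:str', data)

yaml.add_representer(str, _demo_str_presenter)
print(yaml.dump({"name": "demo", "script": "echo one\necho two\n"},
                default_flow_style=False, allow_unicode=False))
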
def _step_template(name, run_file, inputs, outputs, parallel, scatter=None):
    """Templating function for writing a step to avoid repeating namespaces.
    """
    scatter_inputs = []
    sinputs = []
    for inp in inputs:
        step_inp = {"id": workflow.get_base_id(inp["id"]), "source": inp["id"]}
        if inp.get("wf_duplicate"):
            step_inp["id"] += "_toolinput"
        for attr in ["source", "valueFrom"]:
            if attr in inp:
                step_inp[attr] = inp[attr]
        sinputs.append(step_inp)
        # scatter on inputs from previous processes that have been arrayed
        if (_is_scatter_parallel(parallel) and
              (_do_scatter_var(inp, parallel) or (scatter and inp["id"] in scatter))):
            scatter_inputs.append(step_inp["id"])
    out = {"run": run_file, "id": name, "in": sinputs,
           "out": [{"id": workflow.get_base_id(output["id"])} for output in outputs]}
    if _is_scatter_parallel(parallel):
        assert scatter_inputs, "Did not find items to scatter on: %s" % name
        out.update({"scatterMethod": "dotproduct", "scatter": scatter_inputs})
    return out

def _write_tool(step_dir, name, inputs, outputs, parallel):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    out = {"class": "CommandLineTool",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "inputs": [],
           "outputs": []}
    pinputs = [{"id": "#sentinel-parallel", "type": "string",
                "default": parallel}]
    inputs = pinputs + inputs
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = "#%s" % base_id
        inp_binding = {"prefix": "%s=" % base_id, "separate": False,
                       "itemSeparator": ";;", "position": i}
        inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool["id"] = "#%s" % workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:
        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)
        yaml.add_representer(str, str_presenter)
        yaml.dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))

def _write_tool(step_dir, name, inputs, outputs, parallel, programs, file_estimates, disk, samples):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    cores, mem_gb_per_core = resources.cpu_and_memory((programs or []) + ["default"], samples)
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    cwl_res = {"class": "ResourceRequirement", "coresMin": cores, "ramMin": mem_mb_total}
    if file_estimates and disk:
        total_estimate = 0
        for key, multiplier in disk.items():
            if key in file_estimates:
                total_estimate += int(multiplier * file_estimates[key])
        if total_estimate:
            cwl_res["tmpdirMin"] = total_estimate
    out = {"class": "CommandLineTool",
           "cwlVersion": "cwl:draft-3",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "hints": [cwl_res],
           "arguments": [],
           "inputs": [],
           "outputs": []}
    out["arguments"].append({"position": 0, "valueFrom": "sentinel-runtime=$(runtime)"})
    std_inputs = [{"id": "#sentinel-parallel", "type": "string",
                   "default": parallel},
                  {"id": "#sentinel-outputs", "type": "string",
                   "default": json.dumps([workflow.get_base_id(x["id"]) for x in outputs],
                                         sort_keys=True, separators=(',', ':'))}]
    inputs = std_inputs + inputs
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = "#%s" % base_id
        for attr in ["source", "valueFrom"]:
            inp_tool.pop(attr, None)
        inp_binding = {"prefix": "%s=" % base_id, "separate": False,
                       "itemSeparator": ";;", "position": i}
        inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool["id"] = "#%s" % workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:
        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)
        yaml.add_representer(str, str_presenter)
        yaml.dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))

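# --- Illustrative aside (not part of the original module) ---------------------
# The resource math above in one place. The numbers are made-up assumptions
# standing in for resources.cpu_and_memory(...) and the file_estimates/disk
# inputs; they are not bcbio defaults.
cores, mem_gb_per_core = 16, 3.5
mem_mb_total = int(mem_gb_per_core * cores * 1024)    # 57344 -> ramMin in the ResourceRequirement

file_estimates = {"align_bam": 120}                    # assumed size estimate for an input key
disk = {"align_bam": 2}                                # assumed multiplier for that key
total_estimate = sum(int(mult * file_estimates[key])
                     for key, mult in disk.items() if key in file_estimates)
print(mem_mb_total, total_estimate)                    # total_estimate becomes tmpdirMin when non-zero
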
def _add_outputs_to_tool(outputs, tool):
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool = _clean_record(outp_tool)
        outp_tool["id"] = workflow.get_base_id(outp["id"])
        tool["outputs"].append(outp_tool)
    return tool

def _get_sentinel_val(v):
    """Retrieve expected sentinel value for an output, expanding records.
    """
    out = workflow.get_base_id(v["id"])
    if workflow.is_cwl_record(v):
        out += ":%s" % ";".join([x["name"] for x in _get_record_fields(v)])
    return out

def _write_tool(step_dir, name, inputs, outputs, parallel):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    out = {"class": "CommandLineTool",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "inputs": [],
           "outputs": []}
    pinputs = [{"id": "#sentinel-pin", "type": {"type": "array", "items": "string"},
                "default": [parallel.input]},
               {"id": "#sentinel-pout", "type": {"type": "array", "items": "string"},
                "default": [parallel.output]}]
    inputs = pinputs + inputs
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = "#%s" % base_id
        inp_binding = {"prefix": "%s=" % base_id, "separate": False,
                       "itemSeparator": ";;", "position": i}
        if "secondaryFiles" in inp_tool:
            # if we have a nested list of files, ensure we pass the index for each
            # Need a second input binding we ignore to get the secondaryFiles
            # XXX Ideally could use `valueFrom: null` but that doesn't seem to work
            if parallel.baseline in ["single", "merge"] and tz.get_in(["type", "type"], inp_tool) == "array":
                nested_inp_binding = copy.deepcopy(inp_binding)
                nested_inp_binding["prefix"] = "ignore="
                nested_inp_binding["secondaryFiles"] = inp_tool.pop("secondaryFiles")
                inp_tool["type"]["inputBinding"] = nested_inp_binding
            # otherwise, add it at the top level
            else:
                inp_binding["secondaryFiles"] = inp_tool.pop("secondaryFiles")
        if parallel.baseline in ["single", "merge"] or not isinstance(inp_tool["type"], dict):
            inp_tool["inputBinding"] = inp_binding
        else:
            inp_tool["type"]["inputBinding"] = inp_binding
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool["id"] = "#%s" % workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:
        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)
        yaml.add_representer(str, str_presenter)
        yaml.dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))

def _write_tool(step_dir, name, inputs, outputs, parallel, programs, samples):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    cores, mem_gb_per_core = resources.cpu_and_memory(programs if programs else ["default"], samples)
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    out = {"class": "CommandLineTool",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "hints": [{"class": "ResourceRequirement",
                      "coresMin": cores, "ramMin": mem_mb_total}],
           "arguments": [],
           "inputs": [],
           "outputs": []}
    out["arguments"].append({"position": 0, "prefix": "sentinel-runtime=", "separate": False,
                             "valueFrom": "$(JSON.stringify(runtime))"})
    std_inputs = [{"id": "#sentinel-parallel", "type": "string",
                   "default": parallel}]
    inputs = std_inputs + inputs
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = "#%s" % base_id
        for attr in ["source", "valueFrom"]:
            inp_tool.pop(attr, None)
        inp_binding = {"prefix": "%s=" % base_id, "separate": False,
                       "itemSeparator": ";;", "position": i}
        inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool["id"] = "#%s" % workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:
        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)
        yaml.add_representer(str, str_presenter)
        yaml.dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))

def _step_template(name, run_file, inputs, outputs, parallel):
    """Templating function for writing a step to avoid repeating namespaces.
    """
    scatter_inputs = []
    sinputs = []
    for inp in inputs:
        step_inp = {"id": workflow.get_base_id(inp["id"]), "source": inp["id"]}
        for attr in ["source", "valueFrom"]:
            if attr in inp:
                step_inp[attr] = inp[attr]
        sinputs.append(step_inp)
        # scatter on inputs from previous processes that have been arrayed
        if parallel in ["multi-parallel"] or len(inp["id"].split("/")) > 1:
            scatter_inputs.append("%s/%s" % (name, step_inp["id"]))
    out = {"run": run_file, "id": name, "in": sinputs,
           "out": [{"id": workflow.get_base_id(output["id"])} for output in outputs]}
    if parallel in ["single-parallel", "multi-parallel", "batch-parallel"]:
        out.update({"scatterMethod": "dotproduct", "scatter": scatter_inputs})
    return out

def _step_template(name, step_dir, inputs, outputs, parallel):
    """Templating function for writing a step to avoid repeating namespaces.
    """
    step_file = _write_tool(step_dir, name, inputs, outputs, parallel)
    inputs = [{"id": "#%s.%s" % (name, workflow.get_base_id(inp["id"])),
               "source": inp["id"]} for inp in inputs]
    out = {"run": {"import": step_file},
           "id": "#%s" % name,
           "inputs": inputs,
           "outputs": [{"id": output["id"]} for output in outputs]}
    if parallel:
        out.update({"scatterMethod": "dotproduct",
                    "scatter": [x["id"] for x in inputs]})
    return out

def _get_sentinel_val(v):
    """Retrieve expected sentinel value for an output, expanding records.
    """
    out = workflow.get_base_id(v["id"])
    if workflow.is_cwl_record(v):
        def _get_fields(d):
            if isinstance(d, dict):
                if "fields" in d:
                    return d["fields"]
                else:
                    for val in d.values():
                        fields = _get_fields(val)
                        if fields:
                            return fields
        out += ":%s" % ";".join([x["name"] for x in _get_fields(v)])
    return out

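# --- Illustrative aside (not part of the original module) ---------------------
# Quick self-contained check of the recursive "fields" lookup used above; the
# record layout is a made-up example, not an actual bcbio record.
def _demo_get_fields(d):
    # Depth-first search for the first nested "fields" entry
    if isinstance(d, dict):
        if "fields" in d:
            return d["fields"]
        for val in d.values():
            fields = _demo_get_fields(val)
            if fields:
                return fields

record = {"type": {"type": "record",
                   "fields": [{"name": "align_bam"}, {"name": "work_dir"}]}}
print(";".join(x["name"] for x in _demo_get_fields(record)))  # align_bam;work_dir
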
def _step_template(name, run_file, inputs, outputs, parallel):
    """Templating function for writing a step to avoid repeating namespaces.
    """
    scatter_inputs = []
    sinputs = []
    for inp in inputs:
        step_inp = {"id": "#%s.%s" % (name, workflow.get_base_id(inp["id"])),
                    "source": inp["id"]}
        sinputs.append(step_inp)
        # scatter on inputs from previous processes that have been arrayed
        if parallel.baseline == "multi" or len(inp["id"].split(".")) > 1:
            scatter_inputs.append(step_inp["id"])
    out = {"run": {"import": run_file},
           "id": "#%s" % name,
           "inputs": sinputs,
           "outputs": [{"id": output["id"]} for output in outputs]}
    if parallel.input in ["batch"] and parallel.baseline in ["single", "multi"]:
        out.update({"scatterMethod": "dotproduct", "scatter": scatter_inputs})
    return out

def _step_template(name, step_dir, inputs, outputs, source=""): """Templating function for writing a step to avoid repeating namespaces. """ step_file = _write_tool(step_dir, name, inputs, outputs) inputs = [{ "id": "#%s.%s" % (name, workflow.get_base_id(inp["id"])), "source": inp["id"] } for inp in inputs] return { "run": { "import": step_file }, "id": "#%s" % name, "scatterMethod": "dotproduct", "scatter": [x["id"] for x in inputs], "inputs": inputs, "outputs": [{ "id": output["id"] } for output in outputs] }
def _add_inputs_to_tool(inputs, tool, parallel, use_commandline_args=False):
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = base_id
        if inp.get("wf_duplicate"):
            inp_tool["id"] += "_toolinput"
        for attr in ["source", "valueFrom", "wf_duplicate"]:
            inp_tool.pop(attr, None)
        # Ensure records and workflow inputs get scattered
        if (_is_scatter_parallel(parallel) and _do_scatter_var(inp, parallel) and
              (workflow.is_cwl_record(inp) or inp.get("wf_duplicate"))):
            inp_tool = workflow._flatten_nested_input(inp_tool)
        if use_commandline_args:
            inp_binding = {"prefix": "%s=" % base_id, "separate": False,
                           "itemSeparator": ";;", "position": i}
            inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        else:
            inp_binding = None
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        inp_tool = _clean_record(inp_tool)
        tool["inputs"].append(inp_tool)
    return tool

def _write_tool(step_dir, name, inputs, outputs, parallel, image, programs,
                file_estimates, disk, step_cores, samples):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    resource_cores, mem_gb_per_core = resources.cpu_and_memory((programs or []) + ["default"], samples)
    cores = step_cores if step_cores else resource_cores
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    bcbio_docker_disk = 1 * 1024  # Minimum requirements for bcbio Docker image
    cwl_res = {"class": "ResourceRequirement",
               "coresMin": cores, "ramMin": mem_mb_total, "outdirMin": bcbio_docker_disk}
    docker_image = "bcbio/bcbio" if image == "bcbio" else "quay.io/bcbio/%s" % image
    docker = {"class": "DockerRequirement", "dockerPull": docker_image, "dockerImageId": docker_image}
    if file_estimates and disk:
        total_estimate = 0
        for key, multiplier in disk.items():
            if key in file_estimates:
                total_estimate += int(multiplier * file_estimates[key])
        if total_estimate:
            cwl_res["tmpdirMin"] = total_estimate
            cwl_res["outdirMin"] += total_estimate
    out = {"class": "CommandLineTool",
           "cwlVersion": "v1.0",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "requirements": [],
           "hints": [docker, cwl_res],
           "arguments": [],
           "inputs": [],
           "outputs": []}
    if programs:
        def resolve_package(p):
            out = {}
            parts = p.split("=")
            if len(parts) == 2:
                out["package"] = parts[0]
                out["version"] = [parts[1]]
            else:
                out["package"] = p
            out["specs"] = ["https://anaconda.org/bioconda/%s" % out["package"]]
            return out
        out["hints"].append({"class": "SoftwareRequirement",
                             "packages": [resolve_package(p) for p in programs]})
    # Use JSON for inputs, rather than command line arguments
    # Correctly handles multiple values and batching across CWL runners
    use_commandline_args = False
    out["requirements"] += [{"class": "InlineJavascriptRequirement"},
                            {"class": "InitialWorkDirRequirement",
                             "listing": [{"entryname": "cwl.inputs.json",
                                          "entry": "$(JSON.stringify(inputs))"}]}]
    out["arguments"] += [{"position": 0,
                          "valueFrom": "sentinel_runtime=cores,$(runtime['cores']),ram,$(runtime['ram'])"},
                         "sentinel_parallel=%s" % parallel,
                         "sentinel_outputs=%s" % ",".join([_get_sentinel_val(v) for v in outputs]),
                         "sentinel_inputs=%s" % ",".join(["%s:%s" % (workflow.get_base_id(v["id"]),
                                                                     "record" if workflow.is_cwl_record(v) else "var")
                                                          for v in inputs])]
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = base_id
        if inp.get("wf_duplicate"):
            inp_tool["id"] += "_toolinput"
        for attr in ["source", "valueFrom", "wf_duplicate"]:
            inp_tool.pop(attr, None)
        if _is_scatter_parallel(parallel) and _do_scatter_var(inp, parallel):
            inp_tool = workflow._flatten_nested_input(inp_tool)
        if use_commandline_args:
            inp_binding = {"prefix": "%s=" % base_id, "separate": False,
                           "itemSeparator": ";;", "position": i}
            inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        else:
            inp_binding = None
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool["id"] = workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:
        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)
        yaml.add_representer(str, str_presenter)
        yaml.dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))

def _write_tool(step_dir, name, inputs, outputs, parallel, image, programs,
                file_estimates, disk, step_cores, samples, cur_remotes, no_files,
                container_tags=None):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    resource_cores, mem_gb_per_core = resources.cpu_and_memory((programs or []) + ["default"], samples)
    cores = min([step_cores, resource_cores]) if step_cores else resource_cores
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    cwl_res = {"class": "ResourceRequirement", "coresMin": cores, "ramMin": mem_mb_total}
    disk_hint, input_hint = _get_disk_estimates(name, parallel, inputs, file_estimates,
                                                samples, disk, cur_remotes, no_files)
    cwl_res.update(disk_hint)
    docker_image = "bcbio/bcbio" if image == "bcbio" else "quay.io/bcbio/%s" % image
    if container_tags is not None:
        docker_image, container_tags = _add_current_quay_tag(docker_image, container_tags)
    docker = {"class": "DockerRequirement", "dockerPull": docker_image, "dockerImageId": docker_image}
    out = {"class": "CommandLineTool",
           "cwlVersion": "v1.0",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "requirements": [],
           "hints": [docker, cwl_res, input_hint],
           "arguments": [],
           "inputs": [],
           "outputs": []}
    if programs:
        def resolve_package(p):
            out = {}
            parts = p.split("=")
            if len(parts) == 2:
                out["package"] = parts[0]
                out["version"] = [parts[1]]
            else:
                out["package"] = p
            out["specs"] = ["https://anaconda.org/bioconda/%s" % out["package"]]
            return out
        out["hints"].append({"class": "SoftwareRequirement",
                             "packages": [resolve_package(p) for p in programs]})
        # GATK requires networking for setting up log4j logging, use arvados extension
        if any(p.startswith(("gatk", "sentieon")) for p in programs):
            out["hints"] += [{"class": "arv:APIRequirement"}]
    # Multi-process methods that read heavily from BAM files need extra keep cache for Arvados
    if name in ["pipeline_summary", "variantcall_batch_region", "detect_sv"]:
        out["hints"] += [{"class": "arv:RuntimeConstraints", "keep_cache": 4096}]

    def add_to_namespaces(k, v, out):
        if "$namespaces" not in out:
            out["$namespaces"] = {}
        out["$namespaces"][k] = v
        return out
    if any(h.get("class", "").startswith("arv:") for h in out["hints"]):
        out = add_to_namespaces("arv", "http://arvados.org/cwl#", out)
    if any(h.get("class", "").startswith("dx") for h in out["hints"]):
        out = add_to_namespaces("dx", "https://www.dnanexus.com/cwl#", out)
    # Use JSON for inputs, rather than command line arguments
    # Correctly handles multiple values and batching across CWL runners
    use_commandline_args = False
    out["requirements"] += [{"class": "InlineJavascriptRequirement"},
                            {"class": "InitialWorkDirRequirement",
                             "listing": [{"entryname": "cwl.inputs.json",
                                          "entry": "$(JSON.stringify(inputs))"}]}]
    out["arguments"] += [{"position": 0,
                          "valueFrom": "sentinel_runtime=cores,$(runtime['cores']),ram,$(runtime['ram'])"},
                         "sentinel_parallel=%s" % parallel,
                         "sentinel_outputs=%s" % ",".join([_get_sentinel_val(v) for v in outputs]),
                         "sentinel_inputs=%s" % ",".join(["%s:%s" % (workflow.get_base_id(v["id"]),
                                                                     "record" if workflow.is_cwl_record(v) else "var")
                                                          for v in inputs]),
                         "run_number=0"]
    out = _add_inputs_to_tool(inputs, out, parallel, use_commandline_args)
    out = _add_outputs_to_tool(outputs, out)
    _tool_to_file(out, out_file)
    return os.path.join("steps", os.path.basename(out_file))

def _write_tool(step_dir, name, inputs, outputs, parallel, image, programs,
                file_estimates, disk, step_cores, samples):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    resource_cores, mem_gb_per_core = resources.cpu_and_memory((programs or []) + ["default"], samples)
    cores = step_cores if step_cores else resource_cores
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    bcbio_docker_disk = 1 * 1024  # Minimum requirements for bcbio Docker image
    cwl_res = {"class": "ResourceRequirement",
               "coresMin": cores, "ramMin": mem_mb_total, "outdirMin": bcbio_docker_disk}
    docker_image = "bcbio/bcbio" if image == "bcbio" else "quay.io/bcbio/%s" % image
    docker = {"class": "DockerRequirement", "dockerPull": docker_image, "dockerImageId": docker_image}
    if file_estimates and disk:
        total_estimate = 0
        for key, multiplier in disk.items():
            if key in file_estimates:
                total_estimate += int(multiplier * file_estimates[key])
        if total_estimate:
            cwl_res["tmpdirMin"] = total_estimate
            cwl_res["outdirMin"] += total_estimate
    out = {"class": "CommandLineTool",
           "cwlVersion": "v1.0",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "requirements": [],
           "hints": [docker, cwl_res],
           "arguments": [],
           "inputs": [],
           "outputs": []}
    if programs:
        def resolve_package(p):
            out = {}
            parts = p.split("=")
            if len(parts) == 2:
                out["package"] = parts[0]
                out["version"] = [parts[1]]
            else:
                out["package"] = p
            out["specs"] = ["https://anaconda.org/bioconda/%s" % out["package"]]
            return out
        out["hints"].append({"class": "SoftwareRequirement",
                             "packages": [resolve_package(p) for p in programs]})
    # Use JSON for inputs, rather than command line arguments
    # Correctly handles multiple values and batching across CWL runners
    use_commandline_args = False
    out["requirements"] += [{"class": "InlineJavascriptRequirement"},
                            {"class": "InitialWorkDirRequirement",
                             "listing": [{"entryname": "cwl.inputs.json",
                                          "entry": "$(JSON.stringify(inputs))"}]}]
    out["arguments"] += [{"position": 0,
                          "valueFrom": "sentinel_runtime=cores,$(runtime['cores']),ram,$(runtime['ram'])"},
                         "sentinel_parallel=%s" % parallel,
                         "sentinel_outputs=%s" % ",".join([_get_sentinel_val(v) for v in outputs]),
                         "sentinel_inputs=%s" % ",".join(["%s:%s" % (workflow.get_base_id(v["id"]),
                                                                     "record" if workflow.is_cwl_record(v) else "var")
                                                          for v in inputs])]
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = base_id
        if inp.get("wf_duplicate"):
            inp_tool["id"] += "_toolinput"
        for attr in ["source", "valueFrom", "wf_duplicate"]:
            inp_tool.pop(attr, None)
        if _is_scatter_parallel(parallel) and _do_scatter_var(inp, parallel):
            inp_tool = workflow._flatten_nested_input(inp_tool)
        if use_commandline_args:
            inp_binding = {"prefix": "%s=" % base_id, "separate": False,
                           "itemSeparator": ";;", "position": i}
            inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        else:
            inp_binding = None
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        inp_tool = _clean_record(inp_tool)
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool = _clean_record(outp_tool)
        outp_tool["id"] = workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:
        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)
        yaml.add_representer(str, str_presenter)
        yaml.dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))

def _write_tool(step_dir, name, inputs, outputs, parallel, programs, file_estimates, disk, samples):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    cores, mem_gb_per_core = resources.cpu_and_memory((programs or []) + ["default"], samples)
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    cwl_res = {"class": "ResourceRequirement", "coresMin": cores, "ramMin": mem_mb_total}
    if file_estimates and disk:
        total_estimate = 0
        for key, multiplier in disk.items():
            if key in file_estimates:
                total_estimate += int(multiplier * file_estimates[key])
        if total_estimate:
            cwl_res["tmpdirMin"] = total_estimate
    out = {"class": "CommandLineTool",
           "cwlVersion": "cwl:draft-3",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "hints": [cwl_res],
           "arguments": [],
           "inputs": [],
           "outputs": []}
    out["arguments"].append({"position": 0, "valueFrom": "sentinel-runtime=$(runtime)"})
    std_inputs = [{"id": "#sentinel-parallel", "type": "string",
                   "default": parallel},
                  {"id": "#sentinel-outputs", "type": "string",
                   "default": json.dumps([workflow.get_base_id(x["id"]) for x in outputs],
                                         sort_keys=True, separators=(',', ':'))}]
    inputs = std_inputs + inputs
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = "#%s" % base_id
        for attr in ["source", "valueFrom"]:
            inp_tool.pop(attr, None)
        inp_binding = {"prefix": "%s=" % base_id, "separate": False,
                       "itemSeparator": ";;", "position": i}
        inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool["id"] = "#%s" % workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:
        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)
        yaml.add_representer(str, str_presenter)
        yaml.dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))

def _write_tool(step_dir, name, inputs, outputs, parallel, programs, file_estimates, disk, samples):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    cores, mem_gb_per_core = resources.cpu_and_memory((programs or []) + ["default"], samples)
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    bcbio_docker_disk = 1 * 1024  # Minimum requirements for bcbio Docker image
    cwl_res = {"class": "ResourceRequirement",
               "coresMin": cores, "ramMin": mem_mb_total, "outdirMin": bcbio_docker_disk}
    if file_estimates and disk:
        total_estimate = 0
        for key, multiplier in disk.items():
            if key in file_estimates:
                total_estimate += int(multiplier * file_estimates[key])
        if total_estimate:
            cwl_res["tmpdirMin"] = total_estimate
            cwl_res["outdirMin"] += total_estimate
    out = {"class": "CommandLineTool",
           "cwlVersion": "v1.0",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "hints": [cwl_res],
           "arguments": [],
           "inputs": [],
           "outputs": []}
    if programs:
        def resolve_package(p):
            out = {}
            parts = p.split("=")
            if len(parts) == 2:
                out["package"] = parts[0]
                out["version"] = [parts[1]]
            else:
                out["package"] = p
            out["specs"] = ["https://anaconda.org/bioconda/%s" % out["package"]]
            return out
        out["hints"].append({"class": "SoftwareRequirement",
                             "packages": [resolve_package(p) for p in programs]})
    out["arguments"].append({"position": 0,
                             "valueFrom": "sentinel-runtime=cores,$(runtime['cores']),ram,$(runtime['ram'])"})
    std_inputs = [{"id": "sentinel-parallel", "type": "string",
                   "default": parallel},
                  {"id": "sentinel-outputs", "type": "string",
                   "default": ",".join([workflow.get_base_id(x["id"]) for x in outputs])}]
    inputs = std_inputs + inputs
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = base_id
        for attr in ["source", "valueFrom"]:
            inp_tool.pop(attr, None)
        inp_binding = {"prefix": "%s=" % base_id, "separate": False,
                       "itemSeparator": ";;", "position": i}
        inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool["id"] = workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:
        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)
        yaml.add_representer(str, str_presenter)
        yaml.dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))

def _write_tool(step_dir, name, inputs, outputs, parallel, programs, samples):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    cores, mem_gb_per_core = resources.cpu_and_memory(programs if programs else ["default"], samples)
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    out = {"class": "CommandLineTool",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "hints": [{"class": "ResourceRequirement",
                      "coresMin": cores, "ramMin": mem_mb_total}],
           "arguments": [],
           "inputs": [],
           "outputs": []}
    out["arguments"].append({"position": 0, "prefix": "sentinel-runtime=", "separate": False,
                             "valueFrom": "$(JSON.stringify(runtime))"})
    std_inputs = [{"id": "#sentinel-parallel", "type": "string",
                   "default": parallel}]
    inputs = std_inputs + inputs
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = "#%s" % base_id
        for attr in ["source", "valueFrom"]:
            inp_tool.pop(attr, None)
        inp_binding = {"prefix": "%s=" % base_id, "separate": False,
                       "itemSeparator": ";;", "position": i}
        inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool["id"] = "#%s" % workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:
        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)
        yaml.add_representer(str, str_presenter)
        yaml.dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))

def _write_tool(step_dir, name, inputs, outputs, parallel, image, programs,
                file_estimates, disk, step_cores, samples, cur_remotes, no_files,
                container_tags=None):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    resource_cores, mem_gb_per_core = resources.cpu_and_memory((programs or []) + ["default"], samples)
    cores = min([step_cores, resource_cores]) if step_cores else resource_cores
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    cwl_res = {"class": "ResourceRequirement", "coresMin": cores, "ramMin": mem_mb_total}
    disk_hint, input_hint = _get_disk_estimates(name, parallel, inputs, file_estimates,
                                                samples, disk, cur_remotes, no_files)
    cwl_res.update(disk_hint)
    docker_image = "bcbio/bcbio" if image == "bcbio" else "quay.io/bcbio/%s" % image
    if container_tags is not None:
        docker_image, container_tags = _add_current_quay_tag(docker_image, container_tags)
    docker = {"class": "DockerRequirement", "dockerPull": docker_image, "dockerImageId": docker_image}
    out = {"class": "CommandLineTool",
           "cwlVersion": "v1.0",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "requirements": [],
           "hints": [docker, cwl_res, input_hint],
           "arguments": [],
           "inputs": [],
           "outputs": []}
    if programs:
        def resolve_package(p):
            out = {}
            parts = p.split("=")
            if len(parts) == 2:
                out["package"] = parts[0]
                out["version"] = [parts[1]]
            else:
                out["package"] = p
            out["specs"] = ["https://anaconda.org/bioconda/%s" % out["package"]]
            return out
        out["hints"].append({"class": "SoftwareRequirement",
                             "packages": [resolve_package(p) for p in programs]})
        # GATK requires networking for setting up log4j logging, use arvados extension
        if any(p.startswith(("gatk", "sentieon")) for p in programs):
            out["hints"] += [{"class": "arv:APIRequirement"}]
    # Multi-process methods that read heavily from BAM files need extra keep cache for Arvados
    if name in ["pipeline_summary", "variantcall_batch_region", "detect_sv"]:
        out["hints"] += [{"class": "arv:RuntimeConstraints", "keep_cache": 4096}]

    def add_to_namespaces(k, v, out):
        if "$namespaces" not in out:
            out["$namespaces"] = {}
        out["$namespaces"][k] = v
        return out
    if any(h.get("class", "").startswith("arv:") for h in out["hints"]):
        out = add_to_namespaces("arv", "http://arvados.org/cwl#", out)
    if any(h.get("class", "").startswith("dx") for h in out["hints"]):
        out = add_to_namespaces("dx", "https://www.dnanexus.com/cwl#", out)
    # Use JSON for inputs, rather than command line arguments
    # Correctly handles multiple values and batching across CWL runners
    use_commandline_args = False
    out["requirements"] += [{"class": "InlineJavascriptRequirement"},
                            {"class": "InitialWorkDirRequirement",
                             "listing": [{"entryname": "cwl.inputs.json",
                                          "entry": "$(JSON.stringify(inputs))"}]}]
    out["arguments"] += [{"position": 0,
                          "valueFrom": "sentinel_runtime=cores,$(runtime['cores']),ram,$(runtime['ram'])"},
                         "sentinel_parallel=%s" % parallel,
                         "sentinel_outputs=%s" % ",".join([_get_sentinel_val(v) for v in outputs]),
                         "sentinel_inputs=%s" % ",".join(["%s:%s" % (workflow.get_base_id(v["id"]),
                                                                     "record" if workflow.is_cwl_record(v) else "var")
                                                          for v in inputs]),
                         "run_number=0"]
    out = _add_inputs_to_tool(inputs, out, parallel, use_commandline_args)
    out = _add_outputs_to_tool(outputs, out)
    _tool_to_file(out, out_file)
    return os.path.join("steps", os.path.basename(out_file))

def _write_tool(step_dir, name, inputs, outputs, parallel, image, programs,
                file_estimates, disk, step_cores, samples, cur_remotes):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    resource_cores, mem_gb_per_core = resources.cpu_and_memory((programs or []) + ["default"], samples)
    cores = min([step_cores, resource_cores]) if step_cores else resource_cores
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    bcbio_docker_disk = 1 * 1024  # Minimum requirements for bcbio Docker image
    cwl_res = {"class": "ResourceRequirement",
               "coresMin": cores, "ramMin": mem_mb_total, "outdirMin": bcbio_docker_disk}
    cwl_res = _add_disk_estimates(cwl_res, inputs, file_estimates, disk)
    docker_image = "bcbio/bcbio" if image == "bcbio" else "quay.io/bcbio/%s" % image
    docker = {"class": "DockerRequirement", "dockerPull": docker_image, "dockerImageId": docker_image}
    out = {"class": "CommandLineTool",
           "cwlVersion": "v1.0",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "requirements": [],
           "hints": [docker, cwl_res],
           "arguments": [],
           "inputs": [],
           "outputs": []}
    if programs:
        def resolve_package(p):
            out = {}
            parts = p.split("=")
            if len(parts) == 2:
                out["package"] = parts[0]
                out["version"] = [parts[1]]
            else:
                out["package"] = p
            out["specs"] = ["https://anaconda.org/bioconda/%s" % out["package"]]
            return out
        out["hints"].append({"class": "SoftwareRequirement",
                             "packages": [resolve_package(p) for p in programs]})
        # GATK requires networking for setting up log4j logging, use arvados extension
        if any(p.startswith(("gatk", "sentieon")) for p in programs):
            out["hints"] += [{"class": "arv:APIRequirement"}]
    # Multi-process methods that read heavily from BAM files need extra keep cache for Arvados
    if name in ["pipeline_summary", "variantcall_batch_region"]:
        out["hints"] += [{"class": "arv:RuntimeConstraints", "keep_cache": 4096}]
    if any(h.get("class", "").startswith("arv:") for h in out["hints"]):
        out["$namespaces"] = {"arv": "http://arvados.org/cwl#"}
    # Use JSON for inputs, rather than command line arguments
    # Correctly handles multiple values and batching across CWL runners
    use_commandline_args = False
    out["requirements"] += [{"class": "InlineJavascriptRequirement"},
                            {"class": "InitialWorkDirRequirement",
                             "listing": [{"entryname": "cwl.inputs.json",
                                          "entry": "$(JSON.stringify(inputs))"}]}]
    out["arguments"] += [{"position": 0,
                          "valueFrom": "sentinel_runtime=cores,$(runtime['cores']),ram,$(runtime['ram'])"},
                         "sentinel_parallel=%s" % parallel,
                         "sentinel_outputs=%s" % ",".join([_get_sentinel_val(v) for v in outputs]),
                         "sentinel_inputs=%s" % ",".join(["%s:%s" % (workflow.get_base_id(v["id"]),
                                                                     "record" if workflow.is_cwl_record(v) else "var")
                                                          for v in inputs])]
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = base_id
        if inp.get("wf_duplicate"):
            inp_tool["id"] += "_toolinput"
        for attr in ["source", "valueFrom", "wf_duplicate"]:
            inp_tool.pop(attr, None)
        # Ensure records and workflow inputs get scattered
        if (_is_scatter_parallel(parallel) and _do_scatter_var(inp, parallel) and
              (workflow.is_cwl_record(inp) or inp.get("wf_duplicate"))):
            inp_tool = workflow._flatten_nested_input(inp_tool)
        if use_commandline_args:
            inp_binding = {"prefix": "%s=" % base_id, "separate": False,
                           "itemSeparator": ";;", "position": i}
            inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        else:
            inp_binding = None
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        inp_tool = _clean_record(inp_tool)
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool = _clean_record(outp_tool)
        outp_tool["id"] = workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:
        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)
        yaml.add_representer(str, str_presenter)
        yaml.dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))