def _run_break_point_inspector(data, variant_file, paired, work_dir):
    """Run break-point-inspector on `variant_file` and return the output VCF path.

    The output (`<base>-bpi.vcf.gz`) and statistics (`<base>-bpi_stats.txt`)
    names are derived from `variant_file`. The command only runs when the
    output VCF does not exist yet; `paired`, when provided, supplies the
    normal and tumor BAM files.
    """
    base_name = utils.splitext_plus(variant_file)[0]
    output_vcf = "%s-%s.vcf.gz" % (base_name, "bpi")
    stats_file = "%s-%s_stats.txt" % (base_name, "bpi")
    if utils.file_exists(output_vcf):
        return output_vcf
    with file_transaction(data, output_vcf) as tx_output_vcf, \
            file_transaction(data, stats_file) as tx_stats_file:
        cores = dd.get_num_cores(data)
        resources = config_utils.get_resources("break-point-inspector", data["config"])
        # Request a memory increase whose magnitude is the number of cores
        memory_adjust = {"magnitude": cores, "direction": "increase"}
        jvm_mem_opts = config_utils.adjust_opts(
            resources.get("jvm_opts", ["-Xms1000m", "-Xmx2000m"]),
            {"algorithm": {"memory_adjust": memory_adjust}})
        tmp_dir = utils.safe_makedir(os.path.join(work_dir, "bpi_tmp"))
        cmd = ["break-point-inspector"]
        cmd.extend(jvm_mem_opts)
        cmd.extend(["-Djava.io.tmpdir=" + tmp_dir, "-vcf", variant_file])
        if paired:
            cmd.extend(["-ref", paired.normal_bam, "-tumor", paired.tumor_bam])
        cmd.extend(["-output_vcf", tx_output_vcf, ">", tx_stats_file])
        do.run(cmd, "Running Break Point Inspector for Manta SV calls")
    return output_vcf
def cl_gatk(self, params, tmp_dir):
    """Return the java command line, as a list of arguments, to run GATK.

    `params` holds the GATK arguments. When more than one core is configured
    it must name the tool with `-T` or `--analysis_type`, otherwise the index
    lookup raises ValueError. Note that `params` is extended in place with
    threading, unsafe and read filter arguments.
    """
    support_nt = set()
    support_nct = set(["BaseRecalibrator"])
    gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"])
    local_args = []
    cores = self._config["algorithm"].get("num_cores", 1)
    # Work on a copy so the memory adjustment below does not leak into self._config
    config = copy.deepcopy(self._config)
    if cores and int(cores) > 1:
        atype_index = params.index("-T") if params.count("-T") > 0 \
                      else params.index("--analysis_type")
        prog = params[atype_index + 1]
        if prog in support_nt:
            params.extend(["-nt", str(cores)])
        elif prog in support_nct:
            params.extend(["-nct", str(cores)])
            if config["algorithm"].get("memory_adjust") is None:
                config["algorithm"]["memory_adjust"] = {"direction": "increase",
                                                        "magnitude": int(cores) // 2}
    # Compare parsed versions: a plain string comparison sorts "1.10" before "1.9"
    if LooseVersion(self.get_gatk_version()) > LooseVersion("1.9"):
        if len([x for x in params if x.startswith(("-U", "--unsafe"))]) == 0:
            params.extend(["-U", "LENIENT_VCF_PROCESSING"])
        params.extend(["--read_filter", "BadCigar", "--read_filter", "NotPrimaryAlignment"])
    local_args.append("-Djava.io.tmpdir=%s" % tmp_dir)
    return ["java"] + config_utils.adjust_opts(self._jvm_opts, config) + local_args + \
        ["-jar", gatk_jar] + [str(x) for x in params]
def _get_gatk_opts(config, names, tmp_dir=None, memscale=None, include_gatk=True):
    """Retrieve GATK memory specifications, moving down a list of potential specifications.

    The first entry in `names` with configured `jvm_opts` resources wins;
    otherwise a built-in default is used. `memscale`, when given, is applied as
    the `memory_adjust` specification. With `include_gatk` the standard GATK
    unsafe and read filter arguments are appended after the JVM options.
    """
    if include_gatk:
        opts = ["-U", "LENIENT_VCF_PROCESSING",
                "--read_filter", "BadCigar",
                "--read_filter", "NotPrimaryAlignment"]
    else:
        opts = []
    jvm_opts = ["-Xms750m", "-Xmx2g"]
    for n in names:
        resources = config_utils.get_resources(n, config)
        if resources and resources.get("jvm_opts"):
            jvm_opts = resources.get("jvm_opts")
            break
    if memscale:
        jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust": memscale}})
    # Build a new list: `jvm_opts += ...` would extend the configured jvm_opts
    # list in place, accumulating the default options on every call.
    return list(jvm_opts) + get_default_jvm_opts(tmp_dir) + opts
def cl_gatk(self, params, tmp_dir, memscale=None):
    """Return the java command line, as a list of arguments, to run GATK.

    `params` is extended in place with threading, unsafe and read filter
    arguments. `memscale`, when given, is handed to `get_gatk_opts` to build
    the JVM options; otherwise the instance JVM options are used with a
    slight memory decrease.
    """
    nt_tools = set()
    nct_tools = {"BaseRecalibrator"}
    gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"])
    cores = self._config["algorithm"].get("num_cores", 1)
    config = self._config
    if cores and int(cores) > 1:
        type_flag = "-T" if "-T" in params else "--analysis_type"
        prog = params[params.index(type_flag) + 1]
        if prog in nt_tools:
            params.extend(["-nt", str(cores)])
        elif prog in nct_tools:
            params.extend(["-nct", str(cores)])
            if config["algorithm"].get("memory_adjust") is None:
                # Copy before editing so the instance configuration stays untouched
                config = utils.deepish_copy(config)
                config["algorithm"]["memory_adjust"] = {"direction": "increase",
                                                        "magnitude": int(cores) // 2}
    if LooseVersion(self.gatk_major_version()) > LooseVersion("1.9"):
        unsafe_args = [x for x in params if x.startswith(("-U", "--unsafe"))]
        if not unsafe_args:
            params.extend(["-U", "LENIENT_VCF_PROCESSING"])
        params.extend(["--read_filter", "BadCigar", "--read_filter", "NotPrimaryAlignment"])
    if memscale:
        jvm_opts = get_gatk_opts(config, tmp_dir=tmp_dir, memscale=memscale, include_gatk=False)
    else:
        # Decrease memory slightly from configuration to avoid memory allocation errors
        decrease = {"magnitude": 1.1, "direction": "decrease"}
        jvm_opts = config_utils.adjust_opts(self._jvm_opts,
                                            {"algorithm": {"memory_adjust": decrease}})
        jvm_opts.extend(get_default_jvm_opts(tmp_dir))
    if "keyfile" in self._gatk_resources:
        params = ["-et", "NO_ET", "-K", self._gatk_resources["keyfile"]] + params
    cmd = ["java"]
    cmd.extend(jvm_opts)
    cmd.extend(["-jar", gatk_jar])
    cmd.extend(str(x) for x in params)
    return cmd
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.

    Returns a tuple of the tab delimited output file and the simplified,
    sorted and indexed VCF it was produced from.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if gene_list:
            # A standard gene list lets us skip BED based prioritization: link the input
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                assert vcf_file.endswith(".vcf")
                plain_vcf = priority_vcf.replace(".vcf.gz", ".vcf")
                utils.symlink_plus(vcf_file, plain_vcf)
                vcfutils.bgzip_and_index(plain_vcf, data["config"], remove_orig=False)
        elif not utils.file_exists(priority_vcf):
            # Otherwise prioritize based on BED and proceed
            with file_transaction(data, priority_vcf) as tx_out_file:
                resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                memory_adjust = {"direction": "increase",
                                 "maximum": "30000M",
                                 "magnitude": dd.get_cores(data)}
                jvm_opts = " ".join(config_utils.adjust_opts(
                    resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"]),
                    {"algorithm": {"memory_adjust": memory_adjust}}))
                export = utils.local_path_export()
                cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                       " -k {prioritize_by}")
                do.run(cmd.format(export=export, jvm_opts=jvm_opts, vcf_file=vcf_file,
                                  tx_out_file=tx_out_file, prioritize_by=prioritize_by),
                       "Prioritize: select in known regions of interest")

        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            # Fall back to the panel shipped next to simple_sv_annotation.py
            panel = gene_list if gene_list else os.path.join(data_dir, "az-cancer-panel.txt")
            opts += " --gene_list %s" % panel
            cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(opts=opts, priority_vcf=priority_vcf, tx_out_file=tx_out_file),
                   "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            export = utils.local_path_export(env_cmd="vawk")
            cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(cmd.format(export=export, simple_vcf=simple_vcf, sample=sample,
                              caller=caller, tx_out_file=tx_out_file),
                   "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
def _get_snpeff_cmd(cmd_name, datadir, data, out_file):
    """Retrieve snpEff base command line.

    Combines the PATH export, snpEff program, JVM memory options, a temporary
    directory next to `out_file`, the snpEff sub-command and the data directory.
    """
    resources = config_utils.get_resources("snpeff", data["config"])
    base_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx3g"])
    # Scale by cores, defaulting to 2x base usage to ensure we have enough memory
    # for single core runs to use with human genomes.
    # The maximum avoids core dumps exceeding 32Gb; snpEff should not need that much:
    # https://www.elastic.co/guide/en/elasticsearch/guide/current/heap-sizing.html#compressed_oops
    memory_adjust = {"direction": "increase",
                     "maximum": "30000M",
                     "magnitude": max(2, dd.get_cores(data))}
    memory = " ".join(config_utils.adjust_opts(base_opts,
                                               {"algorithm": {"memory_adjust": memory_adjust}}))
    snpeff = config_utils.get_program("snpEff", data["config"])
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    java_args = "-Djava.io.tmpdir=%s" % tmp_dir
    export = "unset JAVA_HOME && export PATH=%s:$PATH && " % (utils.get_java_binpath())
    template = "{export} {snpeff} {memory} {java_args} {cmd_name} -dataDir {datadir}"
    return template.format(export=export, snpeff=snpeff, memory=memory,
                           java_args=java_args, cmd_name=cmd_name, datadir=datadir)
def summary(samples, config):
    """Provide summary information on a single sample across regions of interest.

    Returns each sample wrapped in its own list. When the bcbio.coverage jar is
    found, every sample gets a `coverage` entry pointing at the shared summary
    file; otherwise samples are returned unchanged after logging a warning.
    """
    try:
        jar_dir = config_utils.get_program("bcbio_coverage", config, "dir")
        bc_jar = config_utils.get_jar("bcbio.coverage", jar_dir)
    except ValueError:
        logger.warning("No coverage calculations: Did not find bcbio.coverage jar from system config")
        return [[x] for x in samples]
    config_file, out_file = _prep_coverage_config(samples, config)
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    resources = config_utils.get_resources("bcbio_coverage", config)
    # Adjust memory on a private copy so the caller's configuration is not modified
    config = copy.deepcopy(config)
    algorithm = config["algorithm"]
    algorithm["memory_adjust"] = {"direction": "increase",
                                  "magnitude": algorithm.get("num_cores", 1)}
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]), config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            cmd = ["java"]
            cmd.extend(jvm_opts)
            cmd.extend(["-Djava.io.tmpdir=%s" % tmp_dir, "-Djava.awt.headless=true"])
            cmd.extend(["-jar", bc_jar, "multicompare", config_file, tx_out_file,
                        "-c", str(algorithm.get("num_cores", 1))])
            do.run(cmd, "Summarizing coverage with bcbio.coverage", samples[0])
    wrapped = []
    for sample in samples:
        sample["coverage"] = {"summary": out_file}
        wrapped.append([sample])
    return wrapped
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.

    Returns a tuple of the tab delimited output file and the simplified,
    sorted and indexed VCF it was produced from.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if gene_list:
            # A standard gene list lets us skip BED based prioritization: link the input
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                assert vcf_file.endswith(".vcf")
                plain_vcf = priority_vcf.replace(".vcf.gz", ".vcf")
                utils.symlink_plus(vcf_file, plain_vcf)
                vcfutils.bgzip_and_index(plain_vcf, data["config"], remove_orig=False)
        elif not utils.file_exists(priority_vcf):
            # Otherwise prioritize based on BED and proceed
            with file_transaction(data, priority_vcf) as tx_out_file:
                resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                memory_adjust = {"direction": "increase",
                                 "maximum": "30000M",
                                 "magnitude": dd.get_cores(data)}
                jvm_opts = " ".join(config_utils.adjust_opts(
                    resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"]),
                    {"algorithm": {"memory_adjust": memory_adjust}}))
                export = utils.local_path_export()
                cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                       " -k {prioritize_by}")
                do.run(cmd.format(export=export, jvm_opts=jvm_opts, vcf_file=vcf_file,
                                  tx_out_file=tx_out_file, prioritize_by=prioritize_by),
                       "Prioritize: select in known regions of interest")

        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            # Fall back to the panel shipped next to simple_sv_annotation.py
            panel = gene_list if gene_list else os.path.join(data_dir, "az-cancer-panel.txt")
            opts += " --gene_list %s" % panel
            cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(opts=opts, priority_vcf=priority_vcf, tx_out_file=tx_out_file),
                   "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            export = utils.local_path_export()
            cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(cmd.format(export=export, simple_vcf=simple_vcf, sample=sample,
                              caller=caller, tx_out_file=tx_out_file),
                   "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
def _square_batch_bcbio_variation(data, region, bam_files, vrn_files, out_file,
                                  todo="square"):
    """Run squaring or merging analysis using bcbio.variation.recall.

    Writes the unique variant files (and, for squaring, the unique BAM files)
    to an inputs file next to `out_file`, runs `bcbio-variation-recall` for the
    region with the bcbio environment, and returns `out_file`.
    """
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
    resources = config_utils.get_resources("bcbio-variation-recall", data["config"])
    # adjust memory by cores but leave room for run program memory
    memcores = int(math.ceil(float(cores) / 5.0))
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms250m", "-Xmx2g"]),
                                        {"algorithm": {"memory_adjust": {"direction": "increase",
                                                                         "magnitude": memcores}}})
    # Write unique VCFs and BAMs to input file
    input_file = "%s-inputs.txt" % os.path.splitext(out_file)[0]
    with open(input_file, "w") as out_handle:
        out_handle.write("\n".join(sorted(set(vrn_files))) + "\n")
        if todo == "square":
            out_handle.write("\n".join(sorted(set(bam_files))) + "\n")
    cmd = ["bcbio-variation-recall", todo] + jvm_opts + broad.get_default_jvm_opts() + \
          ["-c", cores, "-r", bamprep.region_to_gatk(region)]
    if todo == "square":
        # Only look up the joint caller when squaring: other modes do not use it
        # and may not have one configured, where `.replace` on None would fail.
        variantcaller = tz.get_in(("config", "algorithm", "jointcaller"), data).replace("-joint", "")
        cmd += ["--caller", variantcaller]
    cmd += [out_file, ref_file, input_file]
    bcbio_env = utils.get_bcbio_env()
    cmd = " ".join(str(x) for x in cmd)
    do.run(cmd, "%s in region: %s" % (cmd, bamprep.region_to_gatk(region)), env=bcbio_env)
    return out_file
def cl_gatk(self, params, tmp_dir):
    """Return the java command line, as a list of arguments, to run GATK.

    `params` is extended in place with threading, unsafe and read filter
    arguments. A configured GATK `keyfile` is passed along with `-et NO_ET`.
    """
    nt_tools = set()
    nct_tools = {"BaseRecalibrator"}
    gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"])
    cores = self._config["algorithm"].get("num_cores", 1)
    # Private copy so the memory adjustment below does not change self._config
    config = copy.deepcopy(self._config)
    if cores and int(cores) > 1:
        type_flag = "-T" if "-T" in params else "--analysis_type"
        prog = params[params.index(type_flag) + 1]
        if prog in nt_tools:
            params.extend(["-nt", str(cores)])
        elif prog in nct_tools:
            params.extend(["-nct", str(cores)])
            if config["algorithm"].get("memory_adjust") is None:
                config["algorithm"]["memory_adjust"] = {"direction": "increase",
                                                        "magnitude": int(cores) // 2}
    if LooseVersion(self.gatk_major_version()) > LooseVersion("1.9"):
        unsafe_args = [x for x in params if x.startswith(("-U", "--unsafe"))]
        if not unsafe_args:
            params.extend(["-U", "LENIENT_VCF_PROCESSING"])
        params.extend(["--read_filter", "BadCigar", "--read_filter", "NotPrimaryAlignment"])
    tmp_arg = "-Djava.io.tmpdir=%s" % tmp_dir
    if "keyfile" in self._gatk_resources:
        params = ["-et", "NO_ET", "-K", self._gatk_resources["keyfile"]] + params
    jvm_opts = config_utils.adjust_opts(self._jvm_opts, config)
    return ["java"] + jvm_opts + [tmp_arg, "-jar", gatk_jar] + [str(x) for x in params]
def _run_gridss(inputs, background, work_dir):
    """Run GRIDSS on the input and background samples, returning the indexed VCF.

    The output is named from the batch (or sample name) of the first input and
    the caller is skipped when that file, or its gzipped form, already exists.
    """
    first = inputs[0]
    out_name = "%s-gridss.sv.vcf" % (dd.get_batch(first) or dd.get_sample_name(first))
    out_file = os.path.join(work_dir, out_name)
    if not (utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")):
        with file_transaction(first, out_file) as tx_out_file:
            htsjdk_opts = ["-Dsamjdk.create_index=true",
                           "-Dsamjdk.use_async_io_read_samtools=true",
                           "-Dsamjdk.use_async_io_write_samtools=true",
                           "-Dsamjdk.use_async_io_write_tribble=true"]
            cores = dd.get_cores(first)
            resources = config_utils.get_resources("gridss", first["config"])
            memory_adjust = {"direction": "increase", "magnitude": cores}
            jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"]),
                                                {"algorithm": {"memory_adjust": memory_adjust}})
            jvm_opts = _finalize_memory(jvm_opts)
            # Temporary and working files all live in the transactional directory
            tx_dir = os.path.dirname(tx_out_file)
            tx_ref_file = _setup_reference_files(first, tx_dir)
            blacklist_bed = sshared.prepare_exclude_file(inputs + background, out_file)
            cmd = ["gridss"] + jvm_opts + htsjdk_opts + ["gridss.CallVariants"]
            cmd += ["THREADS=%s" % cores,
                    "TMP_DIR=%s" % tx_dir,
                    "WORKING_DIR=%s" % tx_dir,
                    "OUTPUT=%s" % tx_out_file,
                    "ASSEMBLY=%s" % tx_out_file.replace(".sv.vcf", ".gridss.assembly.bam"),
                    "REFERENCE_SEQUENCE=%s" % tx_ref_file,
                    "BLACKLIST=%s" % blacklist_bed]
            for data in inputs + background:
                cmd.append("INPUT=%s" % dd.get_align_bam(data))
                cmd.append("INPUT_LABEL=%s" % dd.get_sample_name(data))
            exports = utils.local_path_export()
            do.run(exports + " ".join(cmd), "GRIDSS SV analysis")
    return vcfutils.bgzip_and_index(out_file, first["config"])
def cl_gatk(self, params, tmp_dir, memscale=None):
    """Return the java command line, as a list of arguments, to run GATK.

    `params` is extended in place with threading, unsafe and read filter
    arguments. `memscale`, when given, is handed to `get_gatk_opts` to build
    the JVM options; otherwise the instance JVM options are used with a
    slight memory decrease.
    """
    nt_tools = set()
    nct_tools = {"BaseRecalibrator"}
    gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"])
    cores = self._config["algorithm"].get("num_cores", 1)
    config = self._config
    if cores and int(cores) > 1:
        type_flag = "-T" if "-T" in params else "--analysis_type"
        prog = params[params.index(type_flag) + 1]
        if prog in nt_tools:
            params.extend(["-nt", str(cores)])
        elif prog in nct_tools:
            params.extend(["-nct", str(cores)])
            if config["algorithm"].get("memory_adjust") is None:
                # Copy before editing so the instance configuration stays untouched
                config = utils.deepish_copy(config)
                config["algorithm"]["memory_adjust"] = {"direction": "increase",
                                                        "magnitude": int(cores) // 2}
    if LooseVersion(self.gatk_major_version()) > LooseVersion("1.9"):
        unsafe_args = [x for x in params if x.startswith(("-U", "--unsafe"))]
        if not unsafe_args:
            params.extend(["-U", "LENIENT_VCF_PROCESSING"])
        params.extend(["--read_filter", "BadCigar", "--read_filter", "NotPrimaryAlignment"])
    if memscale:
        jvm_opts = get_gatk_opts(config, tmp_dir=tmp_dir, memscale=memscale, include_gatk=False)
    else:
        # Decrease memory slightly from configuration to avoid memory allocation errors
        decrease = {"magnitude": 1.1, "direction": "decrease"}
        jvm_opts = config_utils.adjust_opts(self._jvm_opts,
                                            {"algorithm": {"memory_adjust": decrease}})
        jvm_opts.extend(get_default_jvm_opts(tmp_dir))
    if "keyfile" in self._gatk_resources:
        params = ["-et", "NO_ET", "-K", self._gatk_resources["keyfile"]] + params
    cmd = ["java"]
    cmd.extend(jvm_opts)
    cmd.extend(["-jar", gatk_jar])
    cmd.extend(str(x) for x in params)
    return cmd
def _square_batch_bcbio_variation(data, region, bam_files, vrn_files, out_file,
                                  todo="square"):
    """Run squaring or merging analysis using bcbio.variation.recall.

    Writes the unique variant files (and, for squaring, the unique BAM files)
    to an inputs file next to `out_file`, runs `bcbio-variation-recall` for the
    region, and returns `out_file`.
    """
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
    resources = config_utils.get_resources("bcbio-variation-recall", data["config"])
    # adjust memory by cores but leave room for run program memory
    memcores = int(math.ceil(float(cores) / 5.0))
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms250m", "-Xmx2g"]),
                                        {"algorithm": {"memory_adjust": {"direction": "increase",
                                                                         "magnitude": memcores}}})
    # Write unique VCFs and BAMs to input file
    input_file = "%s-inputs.txt" % os.path.splitext(out_file)[0]
    with open(input_file, "w") as out_handle:
        out_handle.write("\n".join(sorted(set(vrn_files))) + "\n")
        if todo == "square":
            out_handle.write("\n".join(sorted(set(bam_files))) + "\n")
    cmd = ["bcbio-variation-recall", todo] + jvm_opts + broad.get_default_jvm_opts() + \
          ["-c", cores, "-r", bamprep.region_to_gatk(region)]
    if todo == "square":
        # Only look up the joint caller when squaring: other modes do not use it
        # and may not have one configured, where `.replace` on None would fail.
        variantcaller = tz.get_in(("config", "algorithm", "jointcaller"), data).replace("-joint", "")
        cmd += ["--caller", variantcaller]
    cmd += [out_file, ref_file, input_file]
    do.run(cmd, "%s in region: %s" % (cmd, bamprep.region_to_gatk(region)))
    return out_file
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Returns a dictionary of output VCFs in the `rtg` directory under `base_dir`
    with `fp`, `fn` and `tp` keys, plus `tp-calls` when a separate
    `tp-baseline.vcf.gz` file exists. The evaluation is skipped when the
    output directory already holds a `done` file.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        # Clear a partial earlier run before re-running
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        vrn_file, rm_file, interval_bed = _prepare_inputs(vrn_file, rm_file, rm_interval_file,
                                                          base_dir, data)
        rtg_ref = tz.get_in(["reference", "rtg"], data)
        assert rtg_ref and os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                                     "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)

        # get core and memory usage from standard configuration
        threads = min(dd.get_num_cores(data), 6)
        resources = config_utils.get_resources("rtg", data["config"])
        memory_adjust = {"magnitude": threads, "direction": "increase"}
        memory = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms500m", "-Xmx1500m"]),
                                          {"algorithm": {"memory_adjust": memory_adjust}})
        xms_opts = [x for x in memory if x.startswith("-Xms")]
        xmx_opts = [x for x in memory if x.startswith("-Xmx")]
        jvm_stack = xms_opts[0] if xms_opts else "-Xms500m"
        jvm_mem = xmx_opts[0].replace("-Xmx", "") if xmx_opts else "3g"
        rtg_args = ["rtg", "vcfeval", "--threads", str(threads),
                    "-b", rm_file, "--bed-regions", interval_bed,
                    "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        rtg_args.append("--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file)))
        mem_export = "%s export RTG_JAVA_OPTS='%s' && export RTG_MEM=%s" % (utils.local_path_export(),
                                                                            jvm_stack, jvm_mem)
        cmd = mem_export + " && " + " ".join(rtg_args)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    if not os.path.exists(tp_baseline):
        out["tp"] = tp_calls
        return out
    out["tp"] = tp_baseline
    out["tp-calls"] = tp_calls
    return out
def cl_gatk(self, params, tmp_dir, memscale=None, parallel_gc=False):
    """Return the command line, as a single string, to run GATK with `params`.

    Uses the `gatk` wrapper when the conda wrapper is available, otherwise the
    GenomeAnalysisTK jar through java. `params` must name the tool with `-T`
    or `--analysis_type`. Raises ValueError when neither a jar nor a wrapper
    command can be found.
    """
    missing_msg = ("GATK processing requested but gatk or older jar install not found: "
                   "http://bcbio-nextgen.readthedocs.io/en/latest/contents/"
                   "installation.html#gatk-and-mutect-mutect2")
    nt_tools = set()
    nct_tools = {"BaseRecalibrator"}
    gatk_jar = None
    if not self._has_gatk_conda_wrapper():
        gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"], allow_missing=True)
        if not gatk_jar:
            raise ValueError(missing_msg)
    is_gatk4 = "gatk4" not in dd.get_tools_off({"config": self._config})
    cores = self._config["algorithm"].get("num_cores", 1)
    config = self._config
    type_flag = "-T" if "-T" in params else "--analysis_type"
    atype_index = params.index(type_flag)
    prog = params[atype_index + 1]
    # For GATK4 specify command first: drop the type flag and move the tool to the front
    if is_gatk4:
        params = [prog] + params[:atype_index] + params[atype_index + 2:]
    if cores and int(cores) > 1:
        if prog in nt_tools:
            params.extend(["-nt", str(cores)])
        elif prog in nct_tools:
            params.extend(["-nct", str(cores)])
            memory_adjust = {"direction": "increase",
                             "magnitude": max(1, int(cores) // 2)}
            config["algorithm"]["memory_adjust"] = memory_adjust
            memscale = memory_adjust
    # Filters and unsafe specifications not in GATK4
    if LooseVersion(self.gatk_major_version()) > LooseVersion("1.9") and not is_gatk4:
        unsafe_args = [x for x in params if x.startswith(("-U", "--unsafe"))]
        if not unsafe_args:
            params.extend(["-U", "LENIENT_VCF_PROCESSING"])
        params.extend(["--read_filter", "BadCigar", "--read_filter", "NotPrimaryAlignment"])
    if memscale:
        jvm_opts = get_gatk_opts(config, tmp_dir=tmp_dir, memscale=memscale,
                                 include_gatk=False, parallel_gc=parallel_gc)
    else:
        # Decrease memory slightly from configuration to avoid memory allocation errors
        decrease = {"magnitude": 1.1, "direction": "decrease"}
        jvm_opts = config_utils.adjust_opts(self._jvm_opts,
                                            {"algorithm": {"memory_adjust": decrease}})
        jvm_opts.extend(get_default_jvm_opts(tmp_dir, parallel_gc=parallel_gc))
    if "keyfile" in self._gatk_resources:
        params = ["-et", "NO_ET", "-K", self._gatk_resources["keyfile"]] + params
    if gatk_jar:
        return " ".join(["java"] + jvm_opts + ["-jar", gatk_jar] + [str(x) for x in params])
    cmd = gatk_cmd("gatk", jvm_opts, params, config=self._config)
    if not cmd:
        raise ValueError(missing_msg)
    return cmd
def cl_gatk(self, params, tmp_dir, memscale=None, parallel_gc=False):
    """Return the command line, as a single string, to run GATK with `params`.

    Uses the `gatk` wrapper when the conda wrapper is available, otherwise the
    GenomeAnalysisTK jar through java. `params` must name the tool with `-T`
    or `--analysis_type`. Raises ValueError when neither a jar nor a wrapper
    command can be found.
    """
    missing_msg = ("GATK processing requested but gatk or older jar install not found: "
                   "http://bcbio-nextgen.readthedocs.io/en/latest/contents/"
                   "installation.html#gatk-and-mutect-mutect2")
    nt_tools = set()
    nct_tools = {"BaseRecalibrator"}
    gatk_jar = None
    if not self._has_gatk_conda_wrapper():
        gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"], allow_missing=True)
        if not gatk_jar:
            raise ValueError(missing_msg)
    is_gatk4 = "gatk4" not in dd.get_tools_off({"config": self._config})
    cores = self._config["algorithm"].get("num_cores", 1)
    config = self._config
    type_flag = "-T" if "-T" in params else "--analysis_type"
    atype_index = params.index(type_flag)
    prog = params[atype_index + 1]
    # For GATK4 specify command first: drop the type flag and move the tool to the front
    if is_gatk4:
        params = [prog] + params[:atype_index] + params[atype_index + 2:]
    if cores and int(cores) > 1:
        if prog in nt_tools:
            params.extend(["-nt", str(cores)])
        elif prog in nct_tools:
            params.extend(["-nct", str(cores)])
            memory_adjust = {"direction": "increase",
                             "magnitude": max(1, int(cores) // 2)}
            config["algorithm"]["memory_adjust"] = memory_adjust
            memscale = memory_adjust
    # Filters and unsafe specifications not in GATK4
    if LooseVersion(self.gatk_major_version()) > LooseVersion("1.9") and not is_gatk4:
        unsafe_args = [x for x in params if x.startswith(("-U", "--unsafe"))]
        if not unsafe_args:
            params.extend(["-U", "LENIENT_VCF_PROCESSING"])
        params.extend(["--read_filter", "BadCigar", "--read_filter", "NotPrimaryAlignment"])
    if memscale:
        jvm_opts = get_gatk_opts(config, tmp_dir=tmp_dir, memscale=memscale,
                                 include_gatk=False, parallel_gc=parallel_gc)
    else:
        # Decrease memory slightly from configuration to avoid memory allocation errors
        decrease = {"magnitude": 1.1, "direction": "decrease"}
        jvm_opts = config_utils.adjust_opts(self._jvm_opts,
                                            {"algorithm": {"memory_adjust": decrease}})
        jvm_opts.extend(get_default_jvm_opts(tmp_dir, parallel_gc=parallel_gc))
    if "keyfile" in self._gatk_resources:
        params = ["-et", "NO_ET", "-K", self._gatk_resources["keyfile"]] + params
    if gatk_jar:
        return " ".join(["java"] + jvm_opts + ["-jar", gatk_jar] + [str(x) for x in params])
    cmd = gatk_cmd("gatk", jvm_opts, params, config=self._config)
    if not cmd:
        raise ValueError(missing_msg)
    return cmd
def cl_mutect(self, params, tmp_dir):
    """Define parameters to run the mutect paired algorithm.

    Returns the java command line as a list of arguments, with `params`
    converted to strings.
    """
    gatk_jar = self._get_jar("muTect", ["mutect"])
    # Decrease memory slightly from configuration to avoid memory allocation errors
    decrease = {"magnitude": 1.1, "direction": "decrease"}
    jvm_opts = config_utils.adjust_opts(self._jvm_opts,
                                        {"algorithm": {"memory_adjust": decrease}})
    cmd = ["java"] + jvm_opts + get_default_jvm_opts(tmp_dir)
    cmd += ["-jar", gatk_jar]
    cmd += [str(x) for x in params]
    return cmd
def cl_mutect(self, params, tmp_dir):
    """Define parameters to run the mutect paired algorithm.

    Returns the java command line as a list of arguments, with the JVM
    temporary directory set to `tmp_dir` and `params` converted to strings.
    """
    gatk_jar = self._get_jar("muTect")
    # Hand a private copy of the configuration to the memory adjustment
    config = copy.deepcopy(self._config)
    tmp_arg = "-Djava.io.tmpdir=%s" % tmp_dir
    jvm_opts = config_utils.adjust_opts(self._jvm_opts, config)
    return ["java"] + jvm_opts + [tmp_arg, "-jar", gatk_jar] + [str(x) for x in params]
def cl_mutect(self, params, tmp_dir):
    """Define parameters to run the mutect paired algorithm.

    Returns the java command line as a list of arguments, with `params`
    converted to strings.
    """
    gatk_jar = self._get_jar("muTect", ["mutect"])
    # Decrease memory slightly from configuration to avoid memory allocation errors
    decrease = {"magnitude": 1.1, "direction": "decrease"}
    jvm_opts = config_utils.adjust_opts(self._jvm_opts,
                                        {"algorithm": {"memory_adjust": decrease}})
    cmd = ["java"] + jvm_opts + get_default_jvm_opts(tmp_dir)
    cmd += ["-jar", gatk_jar]
    cmd += [str(x) for x in params]
    return cmd
def cl_mutect(self, params, tmp_dir):
    """Define parameters to run the mutect paired algorithm.

    Returns the java command line as a list of arguments, with the JVM
    temporary directory set to `tmp_dir` and `params` converted to strings.
    """
    gatk_jar = self._get_jar("muTect")
    # Hand a private copy of the configuration to the memory adjustment
    config = copy.deepcopy(self._config)
    tmp_arg = "-Djava.io.tmpdir=%s" % tmp_dir
    jvm_opts = config_utils.adjust_opts(self._jvm_opts, config)
    return ["java"] + jvm_opts + [tmp_arg, "-jar", gatk_jar] + [str(x) for x in params]
def _get_fgbio_jvm_opts(data, tmpdir, scale_factor=None):
    """Return the fgbio JVM options and `--tmp-dir` argument as one string.

    Memory is increased by `cores // scale_factor` only when `scale_factor`
    is given and the available cores exceed it.
    """
    cores, _ = _get_cores_memory(data)
    resources = config_utils.get_resources("fgbio", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"])
    if scale_factor and cores > scale_factor:
        jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust": {"direction": "increase",
                                                                                       "magnitude": cores // scale_factor}}})
    # Build a new list: `jvm_opts += ...` would extend the configured jvm_opts
    # list in place when no scaling happened, growing it on every call.
    jvm_opts = list(jvm_opts) + broad.get_default_jvm_opts()
    return " ".join(jvm_opts) + " --tmp-dir %s" % tmpdir
def _get_fgbio_jvm_opts(data, tmpdir, scale_factor=None):
    """Return the fgbio JVM options and `--tmp-dir` argument as one string.

    Memory is increased by `cores // scale_factor` only when `scale_factor`
    is given and the available cores exceed it.
    """
    cores, _ = _get_cores_memory(data)
    resources = config_utils.get_resources("fgbio", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"])
    if scale_factor and cores > scale_factor:
        jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust": {"direction": "increase",
                                                                                       "magnitude": cores // scale_factor}}})
    # Build a new list: `jvm_opts += ...` would extend the configured jvm_opts
    # list in place when no scaling happened, growing it on every call.
    jvm_opts = list(jvm_opts) + broad.get_default_jvm_opts()
    return " ".join(jvm_opts) + " --tmp-dir %s" % tmpdir
def _get_jvm_opts(out_file, data):
    """Retrieve Java options, adjusting memory for available cores.

    Returns the list of JVM options followed by the default options for the
    directory containing `out_file`.
    """
    resources = config_utils.get_resources("purple", data["config"])
    base_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx3500m"])
    memory_adjust = {"direction": "increase",
                     "maximum": "30000M",
                     "magnitude": dd.get_cores(data)}
    jvm_opts = config_utils.adjust_opts(base_opts, {"algorithm": {"memory_adjust": memory_adjust}})
    jvm_opts.extend(broad.get_default_jvm_opts(os.path.dirname(out_file)))
    return jvm_opts
def _get_varscan_opts(config, tmp_dir):
    """Retrieve common options for running VarScan.

    Handles jvm_opts, setting user and country to English to avoid issues
    with different locales producing non-compliant VCF. Returns the options
    joined into a single space separated string.
    """
    resources = config_utils.get_resources("varscan", config)
    # Default to an initial heap (-Xms) plus a maximum heap (-Xmx); the previous
    # default passed -Xmx twice and never set the initial size.
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    # Decrease memory slightly from the configured values
    jvm_opts = config_utils.adjust_opts(jvm_opts,
                                        {"algorithm": {"memory_adjust":
                                                       {"magnitude": 1.1, "direction": "decrease"}}})
    jvm_opts += ["-Duser.language=en", "-Duser.country=US"]
    jvm_opts += broad.get_default_jvm_opts(tmp_dir)
    return " ".join(jvm_opts)
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data, validate_method):
    """Evaluate calls against a truth set with ``rtg vcfeval``.

    The evaluation is skipped when ``<base_dir>/rtg/done`` already exists.
    Returns a dictionary with paths for "fp", "fn" and "tp" (plus "tp-calls"
    when a ``tp-baseline.vcf.gz`` file is present in the output directory).
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        # Start from a clean output directory.
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        vrn_file, rm_file, interval_bed = _prepare_inputs(vrn_file, rm_file, rm_interval_file,
                                                          base_dir, data)

        rtg_ref = tz.get_in(["reference", "rtg"], data)
        if isinstance(rtg_ref, dict) and "base" in rtg_ref:
            rtg_ref = os.path.dirname(rtg_ref["base"])
        assert rtg_ref and os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                                     "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # CWL runs can hand us a single file inside the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)

        # cores and memory come from the standard configuration
        threads = min(dd.get_num_cores(data), 6)
        resources = config_utils.get_resources("rtg", data["config"])
        memory_adjust = {"magnitude": threads, "direction": "increase"}
        memory = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms500m", "-Xmx1500m"]),
                                          {"algorithm": {"memory_adjust": memory_adjust}})
        xms_opts = [opt for opt in memory if opt.startswith("-Xms")]
        xmx_opts = [opt for opt in memory if opt.startswith("-Xmx")]
        rtg_java_opts = xms_opts[0] if xms_opts else "-Xms500m"
        rtg_mem = xmx_opts[0].replace("-Xmx", "") if xmx_opts else "3g"

        cmd = ["rtg", "vcfeval", "--threads", str(threads),
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        if validate_method == "rtg-squash-ploidy":
            cmd.append("--squash-ploidy")
        rm_samples = vcfutils.get_samples(rm_file)
        if len(rm_samples) > 1 and dd.get_sample_name(data) in rm_samples:
            cmd.append("--sample=%s" % dd.get_sample_name(data))
        cmd.append("--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file)))
        mem_export = "%s export RTG_JAVA_OPTS='%s' && export RTG_MEM=%s" % (
            utils.local_path_export(), rtg_java_opts, rtg_mem)
        do.run(mem_export + " && " + " ".join(cmd), "Validate calls using rtg vcfeval", data)

    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    if not os.path.exists(tp_baseline):
        out["tp"] = tp_calls
    else:
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    return out
def _get_snpeff_cmd(cmd_name, datadir, data, out_file):
    """Assemble the base snpEff command line as a shell string.

    The returned string unsets JAVA_HOME, prepends the Java binary path to
    PATH, and then invokes snpEff with memory options, a temporary directory
    next to ``out_file``, the sub-command name and the data directory.
    """
    configured = config_utils.get_resources("snpeff", data["config"]).get(
        "jvm_opts", ["-Xms750m", "-Xmx3g"])
    # Scale by cores, but use at least 2x the base so single core runs have
    # enough memory for human genomes.
    memory_adjust = {"direction": "increase",
                     "magnitude": max(2, dd.get_cores(data))}
    memory = " ".join(config_utils.adjust_opts(configured,
                                               {"algorithm": {"memory_adjust": memory_adjust}}))
    snpeff = config_utils.get_program("snpEff", data["config"])
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    java_args = "-Djava.io.tmpdir=%s" % tmp_dir
    export = "unset JAVA_HOME && export PATH=%s:$PATH && " % (utils.get_java_binpath())
    template = "{export} {snpeff} {memory} {java_args} {cmd_name} -dataDir {datadir}"
    return template.format(export=export, snpeff=snpeff, memory=memory,
                           java_args=java_args, cmd_name=cmd_name, datadir=datadir)
def _get_snpeff_cmd(cmd_name, datadir, data, out_file):
    """Assemble the base snpEff command line as a shell string.

    The returned string unsets JAVA_HOME, prepends the Java binary path to
    PATH, and then invokes snpEff with memory options, a temporary directory
    next to ``out_file``, the sub-command name and the data directory.
    """
    configured = config_utils.get_resources("snpeff", data["config"]).get(
        "jvm_opts", ["-Xms750m", "-Xmx3g"])
    # Scale by cores, but use at least 2x the base so single core runs have
    # enough memory for human genomes.
    memory_adjust = {"direction": "increase",
                     "magnitude": max(2, dd.get_cores(data))}
    memory = " ".join(config_utils.adjust_opts(configured,
                                               {"algorithm": {"memory_adjust": memory_adjust}}))
    snpeff = config_utils.get_program("snpEff", data["config"])
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    java_args = "-Djava.io.tmpdir=%s" % tmp_dir
    export = "unset JAVA_HOME && export PATH=%s:$PATH && " % (utils.get_java_binpath())
    template = "{export} {snpeff} {memory} {java_args} {cmd_name} -dataDir {datadir}"
    return template.format(export=export, snpeff=snpeff, memory=memory,
                           java_args=java_args, cmd_name=cmd_name, datadir=datadir)
def _get_gatk_opts(config, names, tmp_dir=None, memscale=None):
    """Return JVM memory options followed by the standard GATK arguments.

    ``names`` is searched in order; the first resource entry that defines
    ``jvm_opts`` wins, otherwise ``-Xms750m -Xmx2g`` is used. ``memscale``,
    when given, is passed to ``config_utils.adjust_opts`` as the memory
    adjustment.
    """
    gatk_args = ["-U", "LENIENT_VCF_PROCESSING",
                 "--read_filter", "BadCigar",
                 "--read_filter", "NotPrimaryAlignment"]
    if tmp_dir:
        gatk_args.append("-Djava.io.tmpdir=%s" % tmp_dir)

    jvm_opts = ["-Xms750m", "-Xmx2g"]
    for name in names:
        resources = config_utils.get_resources(name, config)
        if not resources or not resources.get("jvm_opts"):
            continue
        jvm_opts = resources.get("jvm_opts")
        break

    if memscale:
        jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust": memscale}})
    return jvm_opts + gatk_args
def _run_gridss(inputs, background, work_dir):
    """Run GRIDSS ``CallVariants`` over the inputs plus background samples.

    The VCF is named after the batch (or sample name) of the first input and
    written into ``work_dir``. Nothing is run when either the VCF or its
    ``.gz`` counterpart already exists. Returns the result of
    ``vcfutils.bgzip_and_index`` on the output file.
    """
    first = inputs[0]
    out_file = os.path.join(
        work_dir, "%s-gridss.sv.vcf" % (dd.get_batch(inputs[0]) or dd.get_sample_name(inputs[0])))
    if not (utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")):
        with file_transaction(first, out_file) as tx_out_file:
            htsjdk_opts = ["-Dsamjdk.create_index=true",
                           "-Dsamjdk.use_async_io_read_samtools=true",
                           "-Dsamjdk.use_async_io_write_samtools=true",
                           "-Dsamjdk.use_async_io_write_tribble=true"]
            cores = dd.get_cores(first)
            resources = config_utils.get_resources("gridss", first["config"])
            memory_adjust = {"direction": "increase", "magnitude": cores}
            jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"]),
                                                {"algorithm": {"memory_adjust": memory_adjust}})
            jvm_opts = _finalize_memory(jvm_opts)
            tx_ref_file = _setup_reference_files(first, os.path.dirname(tx_out_file))
            blacklist_bed = sshared.prepare_exclude_file(inputs + background, out_file)
            cmd = ["gridss"] + jvm_opts + htsjdk_opts + ["gridss.CallVariants"] + [
                "THREADS=%s" % cores,
                "TMP_DIR=%s" % os.path.dirname(tx_out_file),
                "WORKING_DIR=%s" % os.path.dirname(tx_out_file),
                "OUTPUT=%s" % tx_out_file,
                "ASSEMBLY=%s" % tx_out_file.replace(".sv.vcf", ".gridss.assembly.bam"),
                "REFERENCE_SEQUENCE=%s" % tx_ref_file,
                "BLACKLIST=%s" % blacklist_bed]
            # One INPUT/INPUT_LABEL pair per sample, inputs first.
            for sample in inputs + background:
                cmd.append("INPUT=%s" % dd.get_align_bam(sample))
                cmd.append("INPUT_LABEL=%s" % dd.get_sample_name(sample))
            exports = utils.local_path_export()
            do.run(exports + " ".join(cmd), "GRIDSS SV analysis")
    return vcfutils.bgzip_and_index(out_file, inputs[0]["config"])
def _run_break_point_inspector(data, variant_file, paired):
    """Run break-point-inspector on ``variant_file`` and return the output VCF.

    The output sits next to the input with a ``-bpi.vcf.gz`` suffix; an
    existing output is reused. Normal and tumor BAMs are only passed when
    ``paired`` is provided.
    """
    output_vcf = "%s-%s.vcf.gz" % (utils.splitext_plus(variant_file)[0], "bpi")
    if utils.file_exists(output_vcf):
        return output_vcf
    with file_transaction(data, output_vcf) as tx_output_vcf:
        cores = dd.get_num_cores(data)
        resources = config_utils.get_resources("break-point-inspector", data["config"])
        memory_adjust = {"magnitude": cores, "direction": "increase"}
        memory = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms1000m", "-Xmx2000m"]),
                                          {"algorithm": {"memory_adjust": memory_adjust}})
        cmd = ["break-point-inspector"]
        cmd.extend(memory)
        cmd.extend(["-vcf", variant_file])
        if paired:
            cmd.extend(["-ref", paired.normal_bam, "-tumor", paired.tumor_bam])
        cmd.extend(["-output_vcf", tx_output_vcf])
        do.run(cmd, "Running Break Point Inspector for Manta SV calls")
    return output_vcf
def _get_jvm_opts(out_file, data):
    """Return Java options from the "purple" resources, scaled by core count.

    The memory adjustment increases with ``dd.get_cores(data)`` as magnitude
    and is capped by a "30000M" maximum. Default JVM options for the
    directory containing ``out_file`` are added at the end.
    """
    purple_resources = config_utils.get_resources("purple", data["config"])
    base_opts = purple_resources.get("jvm_opts", ["-Xms750m", "-Xmx3500m"])
    adjustment = {
        "algorithm": {
            "memory_adjust": {
                "direction": "increase",
                "maximum": "30000M",
                "magnitude": dd.get_cores(data),
            }
        }
    }
    jvm_opts = config_utils.adjust_opts(base_opts, adjustment)
    out_dir = os.path.dirname(out_file)
    jvm_opts += broad.get_default_jvm_opts(out_dir)
    return jvm_opts
def _run_break_point_inspector(data, variant_file, paired, work_dir):
    """Run break-point-inspector on ``variant_file``, capturing its statistics.

    The filtered VCF (``<base>-bpi.vcf.gz``) and the tool's standard output
    (``<base>-bpi_stats.txt``) are written next to ``variant_file``; an
    existing VCF is reused. Java temporary files go to ``<work_dir>/bpi_tmp``.
    Normal and tumor BAMs are only passed when ``paired`` is provided.

    Returns the path of the output VCF.
    """
    base = utils.splitext_plus(variant_file)[0]
    output_vcf = "%s-%s.vcf.gz" % (base, "bpi")
    stats_file = "%s-%s_stats.txt" % (base, "bpi")
    if not utils.file_exists(output_vcf):
        with file_transaction(data, output_vcf) as tx_output_vcf, \
                file_transaction(data, stats_file) as tx_stats_file:
            cores = dd.get_num_cores(data)
            resources = config_utils.get_resources("break-point-inspector", data["config"])
            memory_adjust = {"magnitude": cores, "direction": "increase"}
            jvm_mem_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms1000m", "-Xmx2000m"]),
                                                    {"algorithm": {"memory_adjust": memory_adjust}})
            jvm_tmp_arg = "-Djava.io.tmpdir=" + utils.safe_makedir(os.path.join(work_dir, "bpi_tmp"))
            cmd = ["break-point-inspector"] + jvm_mem_opts + [jvm_tmp_arg, "-vcf", variant_file]
            if paired:
                cmd += ["-ref", paired.normal_bam, "-tumor", paired.tumor_bam]
            cmd += ["-output_vcf", tx_output_vcf, ">", tx_stats_file]
            # ">" only redirects output when the command is interpreted by a
            # shell; as an element of an argument list it risks being handed
            # to the tool as a literal argument. Pass one command string.
            do.run(" ".join(str(x) for x in cmd), "Running Break Point Inspector for Manta SV calls")
    return output_vcf
def _get_gatk_opts(config, names, tmp_dir=None, memscale=None, include_gatk=True, parallel_gc=False):
    """Retrieve GATK memory specifications, moving down a list of potential specifications.

    ``names`` is searched in order; the first resource entry defining
    ``jvm_opts`` is used, otherwise ``-Xms750m -Xmx2g``. ``memscale``, when
    given, is passed to ``config_utils.adjust_opts`` as the memory adjustment.
    Default JVM options for ``tmp_dir`` are appended, and the lenient
    VCF/read-filter arguments are added only when ``include_gatk`` is set and
    "gatk4" is listed in tools_off.

    Returns a new list of options; configured lists are left untouched.
    """
    if include_gatk and "gatk4" in dd.get_tools_off({"config": config}):
        opts = ["-U", "LENIENT_VCF_PROCESSING", "--read_filter", "BadCigar",
                "--read_filter", "NotPrimaryAlignment"]
    else:
        opts = []
    jvm_opts = ["-Xms750m", "-Xmx2g"]
    for n in names:
        resources = config_utils.get_resources(n, config)
        if resources and resources.get("jvm_opts"):
            jvm_opts = resources.get("jvm_opts")
            break
    if memscale:
        jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust": memscale}})
    # Concatenate rather than ``+=``: without ``memscale`` ``jvm_opts`` may be
    # the list object held in the resources configuration, and extending it in
    # place would add the default options to that list on every call.
    jvm_opts = list(jvm_opts) + list(get_default_jvm_opts(tmp_dir, parallel_gc=parallel_gc))
    return jvm_opts + opts
def _get_snpeff_cmd(cmd_name, datadir, data, out_file):
    """Assemble the base snpEff command line as a shell string.

    The string unsets JAVA_HOME, prepends the Java binary path to PATH and
    then calls snpEff with memory options, a temporary directory next to
    ``out_file``, the sub-command name and the data directory.
    """
    configured = config_utils.get_resources("snpeff", data["config"]).get(
        "jvm_opts", ["-Xms750m", "-Xmx3g"])
    # Scale by cores, using at least 2x the base so single core runs have
    # enough memory for human genomes.
    # The maximum keeps the heap below 32Gb to avoid core dumps; snpEff
    # should not need that much memory anyway.
    # https://www.elastic.co/guide/en/elasticsearch/guide/current/heap-sizing.html#compressed_oops
    memory_adjust = {"direction": "increase",
                     "maximum": "30000M",
                     "magnitude": max(2, dd.get_cores(data))}
    memory = " ".join(config_utils.adjust_opts(configured,
                                               {"algorithm": {"memory_adjust": memory_adjust}}))
    snpeff = config_utils.get_program("snpEff", data["config"])
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    java_args = "-Djava.io.tmpdir=%s" % tmp_dir
    export = "unset JAVA_HOME && export PATH=%s:\"$PATH\" && " % (utils.get_java_binpath())
    template = "{export} {snpeff} {memory} {java_args} {cmd_name} -dataDir {datadir}"
    return template.format(export=export, snpeff=snpeff, memory=memory,
                           java_args=java_args, cmd_name=cmd_name, datadir=datadir)
def _get_snpeff_cmd(cmd_name, datadir, data, out_file):
    """Assemble the base snpEff command line as a shell string.

    The string starts with the export prefix from
    ``utils.local_path_export`` and then calls snpEff with memory options
    scaled by core count, a temporary directory next to ``out_file``, the
    sub-command name and the data directory.
    """
    configured = config_utils.get_resources("snpeff", data["config"]).get(
        "jvm_opts", ["-Xms750m", "-Xmx3g"])
    memory_adjust = {"direction": "increase", "magnitude": dd.get_cores(data)}
    memory = " ".join(config_utils.adjust_opts(configured,
                                               {"algorithm": {"memory_adjust": memory_adjust}}))
    snpeff = config_utils.get_program("snpEff", data["config"])
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    java_args = "-Djava.io.tmpdir=%s" % tmp_dir
    export = utils.local_path_export()
    template = "{export} {snpeff} {memory} {java_args} {cmd_name} -dataDir {datadir}"
    return template.format(export=export, snpeff=snpeff, memory=memory,
                           java_args=java_args, cmd_name=cmd_name, datadir=datadir)
def _square_batch_bcbio_variation(data, region, bam_files, vrn_files, out_file):
    """Run ``bcbio-variation-recall square`` for one region.

    The command receives the JVM options from the "bcbio-variation-recall"
    resources (memory scaled up by the configured core count), the region in
    the form produced by ``bamprep.region_to_gatk``, the output and reference
    files, and then all variant files followed by all BAM files.

    Returns ``out_file``.
    """
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
    resources = config_utils.get_resources("bcbio-variation-recall", data["config"])
    memory_adjust = {"direction": "increase", "magnitude": cores}
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]),
                                        {"algorithm": {"memory_adjust": memory_adjust}})
    gatk_region = bamprep.region_to_gatk(region)
    # Command-line arguments need to be strings; the core count defaults to
    # the integer 1, so convert it explicitly.
    cmd = ["bcbio-variation-recall", "square"] + jvm_opts + \
          ["-c", str(cores), "-r", gatk_region] + \
          [out_file, ref_file] + vrn_files + bam_files
    do.run(cmd, "Squaring off in region: %s" % gatk_region)
    return out_file
def summary(samples, config):
    """Summarize coverage across regions of interest with bcbio.coverage.

    When the bcbio.coverage jar cannot be located, a warning is logged and
    the samples are returned unchanged, each wrapped in its own list.
    Otherwise ``multicompare`` is run (unless the output already exists) and
    every sample gets a ``coverage`` entry pointing at the summary file.
    """
    try:
        bc_jar = config_utils.get_jar(
            "bcbio.coverage",
            config_utils.get_program("bcbio_coverage", config, "dir"))
    except ValueError:
        logger.warning("No coverage calculations: Did not find bcbio.coverage jar from system config")
        return [[sample] for sample in samples]

    config_file, out_file = _prep_coverage_config(samples, config)
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    resources = config_utils.get_resources("bcbio_coverage", config)
    # Work on a copy so the memory adjustment does not leak into the caller's config.
    config = copy.deepcopy(config)
    algorithm = config["algorithm"]
    algorithm["memory_adjust"] = {"direction": "increase",
                                  "magnitude": algorithm.get("num_cores", 1)}
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]), config)

    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            java_args = ["-Djava.io.tmpdir=%s" % tmp_dir, "-Djava.awt.headless=true"]
            tool_args = ["-jar", bc_jar, "multicompare", config_file, tx_out_file,
                         "-c", str(config["algorithm"].get("num_cores", 1))]
            cmd = ["java"] + jvm_opts + java_args + tool_args
            do.run(cmd, "Summarizing coverage with bcbio.coverage", samples[0])

    out = []
    for sample in samples:
        sample["coverage"] = {"summary": out_file}
        out.append([sample])
    return out