def parallel_calling(data, run_parallel):
    """This is needed only if running methylated versus hydroxy-methylated."""
    out = []
    for sample in data:
        work_bam = dd.get_work_bam(sample[0])
        with closing(pysam.Samfile(work_bam, "rb")) as pysam_work_bam:
            chroms = pysam_work_bam.references
            for chrom in chroms:
                new_sample = copy.deepcopy(sample)
                if chrom.find("_") > -1:
                    continue
                new_sample[0]['chr_to_run'] = chrom
                out.append(new_sample)
    out = run_parallel("cpg_calling", out)
    for sample in out:
        phenotype = dd.get_phenotype(sample[0])
        batch = dd.get_batch(sample[0])
        if phenotype == "mC":
            for sample2 in out:
                if batch in dd.get_batch(sample2[0]) and dd.get_phenotype(sample2[0]) == "hmC":
                    if sample[0]["chr_to_run"] == sample2[0]["chr_to_run"]:
                        sample[0]["control"] = sample2[0]["cpg_file"]
                        break
    out = run_parallel("cpg_processing", out)
    for sample in data:
        sample[0]["cpg_split"] = []
        sample[0]["hmc_split"] = []
        name = dd.get_sample_name(sample[0])
        for chunk in out:
            if name == dd.get_sample_name(chunk[0]):
                sample[0]["cpg_split"].append(chunk[0]["cpg_file"])
                if "hmc_file" in chunk[0]:
                    sample[0]["hmc_split"].append(chunk[0]["hmc_file"])
def _get_paired_samples(sample, data):
    """Get input sample for each chip bam file."""
    dd.get_phenotype(sample)
    for origin in data:
        if dd.get_batch(sample) in dd.get_batch(origin[0]) and dd.get_phenotype(origin[0]) == "input":
            sample["work_bam_input"] = dd.get_work_bam(origin[0])
            return [sample]
def _check(sample, data):
    """Get input sample for each chip bam file."""
    if dd.get_chip_method(sample).lower() == "atac":
        return [sample]
    if dd.get_phenotype(sample) == "input":
        return None
    for origin in data:
        if dd.get_batch(sample) in dd.get_batch(origin[0]) and dd.get_phenotype(origin[0]) == "input":
            sample["work_bam_input"] = dd.get_work_bam(origin[0])
            return [sample]
    return [sample]
def _get_replicate_samples(sample, data):
    """Get replicate samples for each chip bam file."""
    dd.get_phenotype(sample)
    rep_bam = ""
    for origin in data:
        if (dd.get_batch(sample) in dd.get_batch(origin[0])
                and dd.get_phenotype(sample) in dd.get_phenotype(origin[0])
                and dd.get_work_bam(sample) != dd.get_work_bam(origin[0])
                and dd.get_phenotype(origin[0]) != "control"):
            if rep_bam != "":
                rep_bam = rep_bam + "," + dd.get_work_bam(origin[0])
            else:
                rep_bam = dd.get_work_bam(origin[0])
            sample["work_bam_rep"] = dd.get_work_bam(origin[0])
    return [sample]
def finalize_sv(samples, config):
    """Combine results from multiple sv callers into a single ordered 'sv' key.
    """
    by_bam = collections.OrderedDict()
    for x in samples:
        batch = dd.get_batch(x) or [dd.get_sample_name(x)]
        try:
            by_bam[x["align_bam"], tuple(batch)].append(x)
        except KeyError:
            by_bam[x["align_bam"], tuple(batch)] = [x]
    by_batch = collections.OrderedDict()
    lead_batches = {}
    for grouped_calls in by_bam.values():
        def orig_svcaller_order(x):
            orig_callers = tz.get_in(["config", "algorithm", "svcaller_orig"], x)
            cur_caller = tz.get_in(["config", "algorithm", "svcaller"], x)
            return orig_callers.index(cur_caller)
        sorted_svcalls = sorted([x for x in grouped_calls if "sv" in x],
                                key=orig_svcaller_order)
        final = grouped_calls[0]
        if len(sorted_svcalls) > 0:
            final["sv"] = reduce(operator.add, [x["sv"] for x in sorted_svcalls])
            final["config"]["algorithm"]["svcaller"] = final["config"]["algorithm"].pop("svcaller_orig")
        batch = dd.get_batch(final) or dd.get_sample_name(final)
        batches = batch if isinstance(batch, (list, tuple)) else [batch]
        if len(batches) > 1:
            lead_batches[(dd.get_sample_name(final), dd.get_phenotype(final) == "germline")] = batches[0]
        for batch in batches:
            try:
                by_batch[batch].append(final)
            except KeyError:
                by_batch[batch] = [final]
    out = []
    for batch, items in by_batch.items():
        if any("svplots" in dd.get_tools_on(d) for d in items):
            items = plot.by_regions(items)
        for data in items:
            if lead_batches.get((dd.get_sample_name(data),
                                 dd.get_phenotype(data) == "germline")) in [batch, None]:
                out.append([data])
    return out
def _get_original_targets(data):
    """Back compatible: get pre-existing target BEDs.
    """
    work_dir = os.path.join(_sv_workdir(data), "raw")
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    return (glob.glob(os.path.join(work_dir, "*-%s.target.bed" % batch))[0],
            glob.glob(os.path.join(work_dir, "*-%s.antitarget.bed" % batch))[0])
def _batch_split_by_sv(samples, stage):
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (utils.to_single_data(x) for x in samples):
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) > 0:
            background.append(data)
            for x in ready_data:
                svcaller = tz.get_in(["config", "algorithm", "svcaller"], x)
                batch = dd.get_batch(x) or dd.get_sample_name(x)
                if stage in ["ensemble"]:  # no batching for ensemble methods
                    if isinstance(batch, six.string_types) and batch != dd.get_sample_name(x):
                        batch += "_%s" % dd.get_sample_name(x)
                    else:
                        batch = dd.get_sample_name(x)
                    if dd.get_phenotype(x) == "germline":
                        batch += "_germline"
                elif svcaller in _GLOBAL_BATCHING:  # All samples batched together for analyses
                    batch = "all"
                batches = batch if isinstance(batch, (list, tuple)) else [batch]
                for b in batches:
                    try:
                        to_process[(svcaller, b)].append(x)
                    except KeyError:
                        to_process[(svcaller, b)] = [x]
        else:
            extras.append([data])
    return to_process, extras, background
def _cnvkit_targets(raw_target_bed, access_bed, cov_interval, work_dir, data):
    """Create target and antitarget regions from target and access files.
    """
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    basename = os.path.splitext(os.path.basename(raw_target_bed))[0]
    target_bed = os.path.join(work_dir, "%s-%s.target.bed" % (basename, batch))
    # back compatible with previous runs to avoid re-calculating
    target_bed_old = os.path.join(work_dir, "%s.target.bed" % basename)
    if utils.file_exists(target_bed_old):
        target_bed = target_bed_old
    if not utils.file_exists(target_bed):
        with file_transaction(data, target_bed) as tx_out_file:
            cmd = [_get_cmd(), "target", raw_target_bed, "--split", "-o", tx_out_file]
            bin_estimates = _cnvkit_coverage_bin_estimate(raw_target_bed, access_bed,
                                                          cov_interval, work_dir, data)
            if bin_estimates.get("target"):
                cmd += ["--avg-size", str(bin_estimates["target"])]
            do.run(_prep_cmd(cmd, tx_out_file), "CNVkit target")
    antitarget_bed = os.path.join(work_dir, "%s-%s.antitarget.bed" % (basename, batch))
    antitarget_bed_old = os.path.join(work_dir, "%s.antitarget.bed" % basename)
    # back compatible with previous runs to avoid re-calculating
    if os.path.exists(antitarget_bed_old):
        antitarget_bed = antitarget_bed_old
    if not os.path.exists(antitarget_bed):
        with file_transaction(data, antitarget_bed) as tx_out_file:
            cmd = [_get_cmd(), "antitarget", "-g", access_bed, target_bed, "-o", tx_out_file]
            bin_estimates = _cnvkit_coverage_bin_estimate(raw_target_bed, access_bed,
                                                          cov_interval, work_dir, data)
            if bin_estimates.get("antitarget"):
                cmd += ["--avg-size", str(bin_estimates["antitarget"])]
            do.run(_prep_cmd(cmd, tx_out_file), "CNVkit antitarget")
    return target_bed, antitarget_bed
def _combine_qc_samples(samples):
    """Combine split QC analyses into single samples based on BAM files.
    """
    by_bam = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in samples]:
        batch = dd.get_batch(data) or dd.get_sample_name(data)
        if not isinstance(batch, (list, tuple)):
            batch = [batch]
        batch = tuple(batch)
        by_bam[(dd.get_align_bam(data) or dd.get_work_bam(data), batch)].append(data)
    out = []
    for data_group in by_bam.values():
        data = data_group[0]
        alg_qc = []
        qc = {}
        metrics = {}
        for d in data_group:
            qc.update(dd.get_summary_qc(d))
            metrics.update(dd.get_summary_metrics(d))
            alg_qc.extend(dd.get_algorithm_qc(d))
        data["config"]["algorithm"]["qc"] = alg_qc
        data["summary"]["qc"] = qc
        data["summary"]["metrics"] = metrics
        out.append([data])
    return out
def _run_gridss(inputs, background, work_dir):
    out_file = os.path.join(work_dir, "%s-gridss.sv.vcf" % (dd.get_batch(inputs[0]) or
                                                            dd.get_sample_name(inputs[0])))
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            htsjdk_opts = ["-Dsamjdk.create_index=true",
                           "-Dsamjdk.use_async_io_read_samtools=true",
                           "-Dsamjdk.use_async_io_write_samtools=true",
                           "-Dsamjdk.use_async_io_write_tribble=true"]
            cores = dd.get_cores(inputs[0])
            resources = config_utils.get_resources("gridss", inputs[0]["config"])
            jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"])
            jvm_opts = config_utils.adjust_opts(jvm_opts,
                                                {"algorithm": {"memory_adjust":
                                                               {"direction": "increase",
                                                                "magnitude": cores}}})
            jvm_opts = _finalize_memory(jvm_opts)
            tx_ref_file = _setup_reference_files(inputs[0], os.path.dirname(tx_out_file))
            blacklist_bed = sshared.prepare_exclude_file(inputs + background, out_file)
            cmd = ["gridss"] + jvm_opts + htsjdk_opts + ["gridss.CallVariants"] + \
                  ["THREADS=%s" % cores,
                   "TMP_DIR=%s" % os.path.dirname(tx_out_file),
                   "WORKING_DIR=%s" % os.path.dirname(tx_out_file),
                   "OUTPUT=%s" % tx_out_file,
                   "ASSEMBLY=%s" % tx_out_file.replace(".sv.vcf", ".gridss.assembly.bam"),
                   "REFERENCE_SEQUENCE=%s" % tx_ref_file, "BLACKLIST=%s" % blacklist_bed]
            for data in inputs + background:
                cmd += ["INPUT=%s" % dd.get_align_bam(data),
                        "INPUT_LABEL=%s" % dd.get_sample_name(data)]
            exports = utils.local_path_export()
            cmd = exports + " ".join(cmd)
            do.run(cmd, "GRIDSS SV analysis")
    return vcfutils.bgzip_and_index(out_file, inputs[0]["config"])
def _batch_split_by_sv(samples, stage):
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (utils.to_single_data(x) for x in samples):
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) > 0:
            background.append(data)
            for x in ready_data:
                svcaller = tz.get_in(["config", "algorithm", "svcaller"], x)
                batch = dd.get_batch(x) or dd.get_sample_name(x)
                if stage in ["precall", "ensemble"]:  # no batching for precall or ensemble methods
                    if isinstance(batch, basestring) and batch != dd.get_sample_name(x):
                        batch += "_%s" % dd.get_sample_name(x)
                    else:
                        batch = dd.get_sample_name(x)
                    if dd.get_phenotype(x) == "germline":
                        batch += "_germline"
                elif svcaller in _GLOBAL_BATCHING:  # All samples batched together for analyses
                    batch = "all"
                batches = batch if isinstance(batch, (list, tuple)) else [batch]
                for b in batches:
                    try:
                        to_process[(svcaller, b)].append(x)
                    except KeyError:
                        to_process[(svcaller, b)] = [x]
        else:
            extras.append([data])
    return to_process, extras, background
def _cnvkit_coverage(data, bed_file, input_type):
    """Calculate coverage in a BED file for CNVkit.
    """
    bam_file = dd.get_align_bam(data)
    work_dir = utils.safe_makedir(os.path.join(_sv_workdir(data), "raw"))
    exts = {".target.bed": ("target", "targetcoverage.cnn"),
            ".antitarget.bed": ("antitarget", "antitargetcoverage.cnn")}
    cnntype = None
    for orig, (cur_cnntype, ext) in exts.items():
        if bed_file.endswith(orig):
            cnntype = cur_cnntype
            break
    if cnntype is None:
        assert bed_file.endswith(".bed"), "Unexpected BED file extension for coverage %s" % bed_file
        cnntype = ""
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    base = _bam_to_outbase(bam_file, work_dir)
    out_file = "%s-%s.%s" % (base, batch, ext)
    out_file_old = "%s.%s" % (base, ext)
    # back compatible with previous runs to avoid re-calculating
    if utils.file_exists(out_file_old):
        out_file = out_file_old
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = [_get_cmd(), "coverage", "-p", str(dd.get_cores(data)),
                   bam_file, bed_file, "-o", tx_out_file]
            do.run(_prep_cmd(cmd, tx_out_file), "CNVkit coverage")
    return {"itype": input_type, "file": out_file, "bam": bam_file, "cnntype": cnntype,
            "sample": dd.get_sample_name(data)}
def run(samples, run_parallel, initial_only=False):
    """Run structural variation detection.

    initial_only indicates we should run structural variation inputs, like CNV calling,
    that we can use to inform low frequency variant calling.
    """
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (xs[0] for xs in samples):
        ready_data = _handle_multiple_svcallers(data)
        if len(ready_data) > 0:
            background.append(data)
            for x in ready_data:
                svcaller = x["config"]["algorithm"].get("svcaller_active")
                batch = dd.get_batch(x)
                if svcaller in _BATCH_CALLERS and batch:
                    batches = batch if isinstance(batch, (list, tuple)) else [batch]
                    for b in batches:
                        try:
                            to_process[(svcaller, b)].append(x)
                        except KeyError:
                            to_process[(svcaller, b)] = [x]
                else:
                    to_process[(svcaller, dd.get_sample_name(x))] = [x]
        else:
            extras.append([data])
    processed = run_parallel("detect_sv", ([xs, background, xs[0]["config"], initial_only]
                                           for xs in to_process.values()))
    finalized = (run_parallel("finalize_sv", [([xs[0] for xs in processed], processed[0][0]["config"])])
                 if len(processed) > 0 else [])
    return extras + finalized
def _combine_qc_samples(samples):
    """Combine split QC analyses into single samples based on BAM files.
    """
    by_bam = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in samples]:
        batch = dd.get_batch(data) or dd.get_sample_name(data)
        if not isinstance(batch, (list, tuple)):
            batch = [batch]
        batch = tuple(batch)
        by_bam[(dd.get_align_bam(data), batch)].append(data)
    out = []
    for data_group in by_bam.values():
        data = data_group[0]
        alg_qc = []
        qc = {}
        metrics = {}
        for d in data_group:
            qc.update(dd.get_summary_qc(d))
            metrics.update(dd.get_summary_metrics(d))
            alg_qc.extend(dd.get_algorithm_qc(d))
        data["config"]["algorithm"]["qc"] = alg_qc
        data["summary"]["qc"] = qc
        data["summary"]["metrics"] = metrics
        out.append([data])
    return out
def run(samples, run_parallel, stage):
    """Run structural variation detection.

    The stage indicates which level of structural variant calling to run.
      - initial, run prior to other callers and variant calling
      - standard, regular batch calling
      - ensemble, post-calling, combine other callers
    """
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (xs[0] for xs in samples):
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) > 0:
            background.append(data)
            for x in ready_data:
                svcaller = x["config"]["algorithm"].get("svcaller_active")
                if stage == "ensemble":  # no batching for ensemble methods
                    batch = dd.get_sample_name(x)
                else:
                    batch = dd.get_batch(x) or dd.get_sample_name(x)
                batches = batch if isinstance(batch, (list, tuple)) else [batch]
                for b in batches:
                    try:
                        to_process[(svcaller, b)].append(x)
                    except KeyError:
                        to_process[(svcaller, b)] = [x]
        else:
            extras.append([data])
    processed = run_parallel("detect_sv", ([xs, background, xs[0]["config"], stage]
                                           for xs in to_process.values()))
    finalized = (run_parallel("finalize_sv", [([xs[0] for xs in processed], processed[0][0]["config"])])
                 if len(processed) > 0 else [])
    return extras + finalized
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Creates corrected cnr files with log2 ratios and depths.
    data is one sample
    """
    calcfns = {"cnvkit": _calculate_sv_coverage_cnvkit, "gatk-cnv": _calculate_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    data = utils.to_single_data(data)

    from bcbio.structural import get_svcallers
    sv_callers = get_svcallers(data)
    has_cnvkit_or_gatkcnv = bool(set(["cnvkit", "gatk-cnv"]) & set(sv_callers))

    if not cnvkit.use_general_sv_bins(data) or not has_cnvkit_or_gatkcnv:
        out_target_file, out_anti_file = (None, None)
    else:
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                   dd.get_sample_name(data), "bins"))
        out_target_file, out_anti_file = calcfns[cnvkit.bin_approach(data)](data, work_dir)
        if not os.path.exists(out_target_file):
            out_target_file, out_anti_file = (None, None)

    if "seq2c" in dd.get_svcaller(data):
        from bcbio.structural import seq2c
        seq2c_target = seq2c.precall(data)
    else:
        seq2c_target = None

    if "purecn" in dd.get_svcaller(data):
        # set purecn_pon_build flag
        batches = dd.get_batch(data)
        if batches and "pon_build" in dd.get_batch(data):
            data["config"]["algorithm"]["purecn_pon_build"] = True
        from bcbio.structural import purecn
        # still calculate coverage even when not building pon - for t-only analysis
        purecn_target = purecn.get_coverage(data)
    else:
        purecn_target = None

    if not tz.get_in(["depth", "bins"], data):
        data = tz.update_in(data, ["depth", "bins"], lambda x: {})
    data["depth"]["bins"] = {"target": out_target_file, "antitarget": out_anti_file,
                             "seq2c": seq2c_target, "purecn": purecn_target}
    return [[data]]
def _get_batches(data, require_bam=True):
    if bam_needs_processing(data) or not require_bam:
        batches = dd.get_batch(data) or dd.get_sample_name(data)
    else:
        batches = dd.get_sample_name(data)
    if not isinstance(batches, (list, tuple)):
        batches = [batches]
    return batches
def get_samples_by_batch(samples):
    batch_samples = defaultdict(list)
    for data in dd.sample_data_iterator(samples):
        batch = dd.get_batch(data) or dd.get_sample_name(data)
        if isinstance(batch, list):
            batch = tuple(batch)
        batch_samples[batch].append(data)
    return batch_samples
def _get_batch_name(sample):
    """Retrieve batch name for use in SV calling outputs.

    Handles multiple batches split via SV calling.
    """
    batch = dd.get_batch(sample) or dd.get_sample_name(sample)
    if isinstance(batch, (list, tuple)) and len(batch) > 1:
        batch = dd.get_sample_name(sample)
    return batch
def _cnvkit_coverage_bin_estimate(raw_target_bed, access_bed, cov_interval, work_dir, data):
    """Estimate good coverage bin sizes for target regions based on coverage.
    """
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-bin_estimate.txt" % (
        os.path.splitext(os.path.basename(raw_target_bed))[0], batch))
    method_map = {"genome": "wgs", "regional": "hybrid", "amplicon": "amplicon"}
    if not os.path.exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = [_get_cmd("coverage_bin_size.py"), dd.get_align_bam(data),
                   "-m", method_map[cov_interval], "-t", raw_target_bed,
                   "-g", access_bed]
            cmd = " ".join(cmd) + " > " + tx_out_file
            try:
                do.run(_prep_cmd(cmd, tx_out_file), "CNVkit coverage bin estimation", log_error=False)
            except subprocess.CalledProcessError:
                logger.info("Bin size estimate failed, using default values")
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write("Bin size estimate failed, using default values")
    avg_bin_sizes = {}
    estimate_map = {"On-target": "target", "Off-target": "antitarget",
                    "Genome": "target", "Targets (sampling)": "target"}
    range_map = {("genome", "target"): (500, 1000),
                 ("regional", "target"): (50, 267), ("regional", "antitarget"): (20000, 200000),
                 ("amplicon", "target"): (50, 267)}
    with open(out_file) as in_handle:
        for line in in_handle:
            if line.startswith(tuple(estimate_map.keys())):
                name, depth, bin_size = line.strip().split("\t")
                name = estimate_map[name.replace(":", "").strip()]
                try:
                    bin_size = int(bin_size)
                except ValueError:
                    bin_size = None
                if bin_size and bin_size > 0:
                    cur_min, cur_max = range_map[(cov_interval, name)]
                    avg_bin_sizes[name] = max(min(bin_size, cur_max), cur_min)
    return avg_bin_sizes
def _bam_to_outbase(bam_file, work_dir, data):
    """Convert an input BAM file into CNVkit expected output.

    Handles previous non-batch cases to avoid re-calculating, returning
    both new and old values:
    """
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    out_base = os.path.splitext(os.path.basename(bam_file))[0].split(".")[0]
    base = os.path.join(work_dir, out_base)
    return "%s-%s" % (base, batch), base
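# The CNVkit helpers above (_cnvkit_targets, _cnvkit_coverage, _bam_to_outbase) share a
# back-compatibility pattern: new outputs carry a "-<batch>" suffix, but if a file from an
# older, non-batch-aware run already exists it is reused instead of being recomputed.
# Below is a minimal, self-contained sketch of that choice; the helper name is invented
# for illustration and is not part of bcbio itself.
import os

def pick_output_file(base, batch, ext):
    """Prefer a pre-existing, non-batch-suffixed output to avoid re-calculating."""
    new_style = "%s-%s%s" % (base, batch, ext)   # e.g. sample-batch1.target.bed
    old_style = "%s%s" % (base, ext)             # e.g. sample.target.bed from pre-batch runs
    return old_style if os.path.exists(old_style) else new_style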
def run_peddy(samples, out_dir=None):
    vcf_file = None
    for d in samples:
        vcinfo = variant.get_active_vcinfo(d)
        if vcinfo and vcinfo.get("vrn_file") and utils.file_exists(vcinfo["vrn_file"]):
            if vcinfo["vrn_file"] and dd.get_sample_name(d) in vcfutils.get_samples(vcinfo["vrn_file"]):
                vcf_file = vcinfo["vrn_file"]
                break
    data = samples[0]
    peddy = config_utils.get_program("peddy", data) if config_utils.program_installed("peddy", data) else None
    if not peddy or not vcf_file or not is_human(data):
        logger.info("peddy is not installed, not human or sample VCFs don't match, "
                    "skipping correspondence checking for %s." % vcf_file)
        return samples
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    peddyfiles = expected_peddy_files(peddy_report, batch)
    if file_exists(peddy_report):
        return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    num_cores = dd.get_num_cores(data)
    with tx_tmpdir(data) as tx_dir:
        peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
        # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
        stderr_log = os.path.join(tx_dir, "run-stderr.log")
        cmd = "{peddy} -p {num_cores} --plot --prefix {peddy_prefix_tx} {vcf_file} {ped_file} 2> {stderr_log}"
        message = "Running peddy on {vcf_file} against {ped_file}."
        try:
            do.run(cmd.format(**locals()), message.format(**locals()))
        except:
            to_show = collections.deque(maxlen=100)
            with open(stderr_log) as in_handle:
                for line in in_handle:
                    to_show.append(line)
            if any([l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0
                    for l in to_show]):
                logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                with open(peddy_prefix + "-failed.log", "w") as out_handle:
                    out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                return samples
            else:
                logger.warning("".join(to_show))
                raise
        for ext in PEDDY_OUT_EXTENSIONS:
            if os.path.exists(peddy_prefix_tx + ext):
                shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
def _group_by_samplename(samples):
    """Group samples split by QC method back into a single sample.
    """
    out = collections.defaultdict(list)
    for data in samples:
        batch = dd.get_batch(data) or dd.get_sample_name(data)
        if not isinstance(batch, (list, tuple)):
            batch = [batch]
        batch = tuple(batch)
        out[(dd.get_sample_name(data), dd.get_align_bam(data), batch)].append(data)
    return [xs[0] for xs in out.values()]
def finalize_sv(samples, config):
    """Combine results from multiple sv callers into a single ordered 'sv' key.
    """
    by_bam = collections.OrderedDict()
    for x in samples:
        batch = dd.get_batch(x) or [dd.get_sample_name(x)]
        try:
            by_bam[x["align_bam"], tuple(batch)].append(x)
        except KeyError:
            by_bam[x["align_bam"], tuple(batch)] = [x]
    by_batch = collections.OrderedDict()
    lead_batches = {}
    for grouped_calls in by_bam.values():
        def orig_svcaller_order(x):
            orig_callers = tz.get_in(["config", "algorithm", "svcaller_orig"], x)
            cur_caller = tz.get_in(["config", "algorithm", "svcaller"], x)
            return orig_callers.index(cur_caller)
        sorted_svcalls = sorted([x for x in grouped_calls if "sv" in x],
                                key=orig_svcaller_order)
        final = grouped_calls[0]
        if len(sorted_svcalls) > 0:
            final["sv"] = reduce(operator.add, [x["sv"] for x in sorted_svcalls])
            final["config"]["algorithm"]["svcaller"] = final["config"]["algorithm"].pop("svcaller_orig")
        batch = dd.get_batch(final) or dd.get_sample_name(final)
        batches = batch if isinstance(batch, (list, tuple)) else [batch]
        lead_batches[dd.get_sample_name(final)] = batches[0]
        for batch in batches:
            try:
                by_batch[batch].append(final)
            except KeyError:
                by_batch[batch] = [final]
    out = []
    for batch, items in by_batch.items():
        if any("svplots" in dd.get_tools_on(d) for d in items):
            plot_items = plot.by_regions(items)
        else:
            plot_items = items
        for data in plot_items:
            if lead_batches[dd.get_sample_name(data)] == batch:
                out.append([data])
    return out
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or ("gvcf" in dd.get_tools_on(d)))
                                  and dd.get_batch(d) for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    return checkpoints
def detect_sv(items, all_items=None, stage="standard"):
    """Top level parallel target for examining structural variation.

    items = sample-sv_caller list, from one batch
    """
    items = [utils.to_single_data(x) for x in items]
    items = cwlutils.unpack_tarballs(items, items[0])
    svcaller = items[0]["config"]["algorithm"].get("svcaller")
    caller_fn = _get_callers(items, stage, special_cases=True).get(svcaller)
    out = []
    batch = dd.get_batch(items[0])
    # no SV calling when just creating a PON for PureCN
    if batch == "pon_build" and "purecn" in dd.get_svcaller(items[0]):
        return out
    if svcaller and caller_fn:
        if (all_items and svcaller in _NEEDS_BACKGROUND and
                not vcfutils.is_paired_analysis([x.get("align_bam") for x in items], items)):
            names = set([dd.get_sample_name(x) for x in items])
            background = [x for x in all_items if dd.get_sample_name(x) not in names]
            for svdata in caller_fn(items, background):
                out.append([svdata])
        else:
            for svdata in caller_fn(items):
                out.append([svdata])
    else:
        for data in items:
            out.append([data])
    # Avoid nesting of callers for CWL runs for easier extraction
    if cwlutils.is_cwl_run(items[0]):
        out_cwl = []
        for data in [utils.to_single_data(x) for x in out]:
            # Run validation directly from CWL runs since we're single stage
            data = validate.evaluate(data)
            data["svvalidate"] = {"summary": tz.get_in(["sv-validate", "csv"], data)}
            svs = data.get("sv")
            if svs:
                assert len(svs) == 1, svs
                data["sv"] = svs[0]
            else:
                data["sv"] = {}
            data = _add_supplemental(data)
            out_cwl.append([data])
        return out_cwl
    return out
def finalize_sv(samples, config, initial_only=False):
    """Combine results from multiple sv callers into a single ordered 'sv' key.

    Handles ensemble calling and plotting of results.
    """
    by_bam = collections.OrderedDict()
    for x in samples:
        try:
            by_bam[x["align_bam"]].append(x)
        except KeyError:
            by_bam[x["align_bam"]] = [x]
    by_batch = collections.OrderedDict()
    lead_batches = {}
    for grouped_calls in by_bam.values():
        def orig_svcaller_order(x):
            return _get_svcallers(x).index(x["config"]["algorithm"]["svcaller_active"])
        sorted_svcalls = sorted([x for x in grouped_calls if "sv" in x],
                                key=orig_svcaller_order)
        final = grouped_calls[0]
        if len(sorted_svcalls) > 0:
            final_calls = reduce(operator.add, [x["sv"] for x in sorted_svcalls])
            if not initial_only:
                for caller in (c for c in _get_svcallers(final) if c in _ENSEMBLE_CALLERS):
                    final_calls = _ENSEMBLE_CALLERS[caller](final_calls, final)
                final_calls = ensemble.summarize(final_calls, final, grouped_calls)
                final_calls = validate.evaluate(final, final_calls)
            final["sv"] = final_calls
        del final["config"]["algorithm"]["svcaller_active"]
        batch = dd.get_batch(final) or dd.get_sample_name(final)
        batches = batch if isinstance(batch, (list, tuple)) else [batch]
        lead_batches[dd.get_sample_name(final)] = batches[0]
        for batch in batches:
            try:
                by_batch[batch].append(final)
            except KeyError:
                by_batch[batch] = [final]
    out = []
    for batch, items in by_batch.items():
        if any("svplots" in dd.get_tools_on(d) for d in items):
            plot_items = plot.by_regions(items)
        else:
            plot_items = items
        for data in plot_items:
            if lead_batches[dd.get_sample_name(data)] == batch:
                out.append([data])
    return out
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or ("gvcf" in dd.get_tools_on(d)))
                                  and dd.get_batch(d) for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
    checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or
                                           not dd.get_aligner(d)) for d in samples])
    return checkpoints
def finalize_sv(samples, config):
    """Combine results from multiple sv callers into a single ordered 'sv' key.

    Handles ensemble calling and plotting of results.
    """
    by_bam = collections.OrderedDict()
    for x in samples:
        try:
            by_bam[x["align_bam"]].append(x)
        except KeyError:
            by_bam[x["align_bam"]] = [x]
    highdepths = filter(lambda x: x is not None,
                        list(set([tz.get_in(["config", "algorithm", "highdepth_regions"], x)
                                  for x in samples])))
    by_batch = collections.OrderedDict()
    lead_batches = {}
    for grouped_calls in by_bam.values():
        def orig_svcaller_order(x):
            return _get_svcallers(x).index(x["config"]["algorithm"]["svcaller_active"])
        sorted_svcalls = sorted([x for x in grouped_calls if "sv" in x],
                                key=orig_svcaller_order)
        final = grouped_calls[0]
        if len(sorted_svcalls) > 0:
            final_calls = reduce(operator.add, [x["sv"] for x in sorted_svcalls])
            final_calls = ensemble.summarize(final_calls, final, highdepths)
            final_calls = validate.evaluate(final, final_calls)
            final["sv"] = final_calls
        del final["config"]["algorithm"]["svcaller_active"]
        batch = dd.get_batch(final) or dd.get_sample_name(final)
        batches = batch if isinstance(batch, (list, tuple)) else [batch]
        lead_batches[dd.get_sample_name(final)] = batches[0]
        for batch in batches:
            try:
                by_batch[batch].append(final)
            except KeyError:
                by_batch[batch] = [final]
    out = []
    for batch, items in by_batch.items():
        plot_items = plot.by_regions(items)
        for data in plot_items:
            if lead_batches[dd.get_sample_name(data)] == batch:
                out.append([data])
    return out
def _batch_split_by_sv(samples, stage):
    """Return
       - to_process = svcaller-batch => [svcaller-sample1, svcaller-sample2...] odict
       - extras = samples without sv calling (should there be any?)
       - background - all samples
    """
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (utils.to_single_data(x) for x in samples):
        # data = sample
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) > 0:
            # why appending every sample to background?
            background.append(data)
            # x is sample - sv caller pair
            for x in ready_data:
                svcaller = tz.get_in(["config", "algorithm", "svcaller"], x)
                batch = dd.get_batch(x) or dd.get_sample_name(x)
                if stage in ["ensemble"]:  # no batching for ensemble methods
                    if isinstance(batch, six.string_types) and batch != dd.get_sample_name(x):
                        batch += "_%s" % dd.get_sample_name(x)
                    else:
                        batch = dd.get_sample_name(x)
                    if dd.get_phenotype(x) == "germline":
                        batch += "_germline"
                elif svcaller in _GLOBAL_BATCHING:  # All samples batched together for analyses
                    batch = "all"
                # just creating PON - no calling
                if stage in ["standard"] and batch in ["pon_build"]:
                    extras.append(x)
                else:
                    batches = batch if isinstance(batch, (list, tuple)) else [batch]
                    for b in batches:
                        try:
                            to_process[(svcaller, b)].append(x)
                        except KeyError:
                            to_process[(svcaller, b)] = [x]
        else:
            extras.append([data])
    return to_process, extras, background
def finalize_sv(samples, config):
    """Combine results from multiple sv callers into a single ordered 'sv' key.
    """
    by_bam = collections.OrderedDict()
    for x in samples:
        try:
            by_bam[x["align_bam"]].append(x)
        except KeyError:
            by_bam[x["align_bam"]] = [x]
    by_batch = collections.OrderedDict()
    lead_batches = {}
    for grouped_calls in by_bam.values():
        def orig_svcaller_order(x):
            return _get_svcallers(x).index(x["config"]["algorithm"]["svcaller_active"])
        sorted_svcalls = sorted([x for x in grouped_calls if "sv" in x],
                                key=orig_svcaller_order)
        final = grouped_calls[0]
        if len(sorted_svcalls) > 0:
            final["sv"] = reduce(operator.add, [x["sv"] for x in sorted_svcalls])
        del final["config"]["algorithm"]["svcaller_active"]
        batch = dd.get_batch(final) or dd.get_sample_name(final)
        batches = batch if isinstance(batch, (list, tuple)) else [batch]
        lead_batches[dd.get_sample_name(final)] = batches[0]
        for batch in batches:
            try:
                by_batch[batch].append(final)
            except KeyError:
                by_batch[batch] = [final]
    out = []
    for batch, items in by_batch.items():
        if any("svplots" in dd.get_tools_on(d) for d in items):
            plot_items = plot.by_regions(items)
        else:
            plot_items = items
        for data in plot_items:
            if lead_batches[dd.get_sample_name(data)] == batch:
                out.append([data])
    return out
def run(samples, run_parallel, initial_only=False):
    """Run structural variation detection.

    initial_only indicates we should run structural variation inputs, like CNV calling,
    that we can use to inform low frequency variant calling.
    """
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (xs[0] for xs in samples):
        ready_data = _handle_multiple_svcallers(data)
        if len(ready_data) > 0:
            background.append(data)
            for x in ready_data:
                svcaller = x["config"]["algorithm"].get("svcaller_active")
                # reset SV information if we're running a second pass SV call
                if "sv" in x:
                    del x["sv"]
                batch = dd.get_batch(x)
                paired = vcfutils.get_paired_phenotype(x)
                if ((svcaller in _BATCH_CALLERS and batch) or
                        (svcaller in _SOMATIC_CALLERS and paired and batch)):
                    batches = batch if isinstance(batch, (list, tuple)) else [batch]
                    for b in batches:
                        try:
                            to_process[(svcaller, b)].append(x)
                        except KeyError:
                            to_process[(svcaller, b)] = [x]
                else:
                    to_process[(svcaller, dd.get_sample_name(x))] = [x]
        else:
            extras.append([data])
    processed = run_parallel("detect_sv", ([xs, background, xs[0]["config"], initial_only]
                                           for xs in to_process.values()))
    finalized = (run_parallel("finalize_sv", [([xs[0] for xs in processed],
                                               processed[0][0]["config"], initial_only)])
                 if len(processed) > 0 else [])
    return extras + finalized
def _get_batch(x):
    b = dd.get_batch(x)
    return [b] if not isinstance(b, (list, tuple)) else b
def _get_batches(data):
    batches = dd.get_batch(data) or dd.get_sample_name(data)
    if not isinstance(batches, (list, tuple)):
        batches = [batches]
    return batches
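# The small helpers above all follow the same idiom: fall back to the sample name when no
# batch is configured, then coerce the result to a list so callers can treat single- and
# multi-batch samples uniformly. Below is a minimal, self-contained sketch of that idiom;
# `get_batch` and `get_sample_name` here are hypothetical stand-ins for bcbio's
# `dd.get_batch` / `dd.get_sample_name` accessors and simplify how the real sample
# dictionaries are laid out.

def get_batch(data):
    # assumption for the sketch: batch lives under metadata and may be a string, list or missing
    return data.get("metadata", {}).get("batch")

def get_sample_name(data):
    return data["description"]

def get_batches(data):
    """Normalize the batch value to a list, falling back to the sample name."""
    batches = get_batch(data) or get_sample_name(data)
    if not isinstance(batches, (list, tuple)):
        batches = [batches]
    return batches

if __name__ == "__main__":
    print(get_batches({"description": "s1", "metadata": {"batch": "b1"}}))          # ['b1']
    print(get_batches({"description": "s2", "metadata": {"batch": ["b1", "b2"]}}))  # ['b1', 'b2']
    print(get_batches({"description": "s3"}))                                       # ['s3']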
def run_peddy(samples, out_dir=None):
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"

    vcf_file = None
    for d in samples:
        vcinfo = None
        if dd.get_phenotype(d) == "germline" or dd.get_phenotype(d) not in ["tumor"]:
            vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if not vcinfo and dd.get_phenotype(d) in ["tumor"]:
            vcinfo = variant.extract_germline_vcinfo(d, peddy_dir)
        if vcinfo:
            for key in ["germline", "vrn_file"]:
                if vcinfo and vcinfo.get(key) and utils.file_exists(vcinfo[key]):
                    if vcinfo[key] and dd.get_sample_name(d) in vcfutils.get_samples(vcinfo[key]):
                        if vcinfo[key] and vcfutils.vcf_has_nonfiltered_variants(vcinfo[key]):
                            vcf_file = vcinfo[key]
                            break
    peddy = config_utils.get_program("peddy", data) if config_utils.program_installed("peddy", data) else None
    config_skips = any(["peddy" in dd.get_tools_off(d) for d in samples])
    if not peddy or not vcf_file or not vcfanno.is_human(data) or config_skips:
        if not peddy:
            reason = "peddy executable not found"
        elif config_skips:
            reason = "peddy in tools_off configuration"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (reason, [dd.get_sample_name(d) for d in samples])
        with open(peddy_prefix + "-failed.log", "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
            # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(data) == "hg38" else ""
            locale = utils.locale_export()
            cmd = ("{locale} {peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                   "{vcf_file} {ped_file} 2> {stderr_log}")
            message = "Running peddy on {vcf_file} against {ped_file}."
            try:
                do.run(cmd.format(**locals()), message.format(**locals()))
            except:
                to_show = collections.deque(maxlen=100)
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)

                def allowed_errors(l):
                    return ((l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0) or
                            (l.find("n_components=") >= 0 and l.find("must be between 1 and n_features=") >= 0) or
                            (l.find("n_components=") >= 0 and l.find("must be between 1 and min") >= 0) or
                            (l.find("Input contains NaN, infinity or a value too large for dtype") >= 0))

                def all_line_errors(l):
                    return (l.find("no intervals found for") >= 0)

                if any([allowed_errors(l) for l in to_show]) or all([all_line_errors(l) for l in to_show]):
                    logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                    with open(peddy_prefix + "-failed.log", "w") as out_handle:
                        out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                    return samples
                else:
                    logger.warning("".join(to_show))
                    raise
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
def mutect2_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full non open-source version of GATK 3.5+.
    items = 1 sample or T/N pair
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        # call somatic variants keeping germline sites and using germline 1KG resource
        # use --native-pair-hmm-threads?
        broad_runner = broad.runner_from_config(items[0]["config"])
        gatk_type = broad_runner.gatk_type()
        # shared Mutect2 settings for PureCN analysis in the case of:
        # - PON creation
        # - Tumor-only PureCN run
        # - T/N PureCN run
        # PURECN requirement alters Mutect2 variants calling!
        if "purecn" in dd.get_svcaller(items[0]):
            # mutect call for PON creation or purecn T-only analysis
            _prep_inputs(align_bams, ref_file, items)
            with file_transaction(items[0], out_file) as tx_out_file:
                germline_resource = tz.get_in(["genome_resources", "variation", "af_only_gnomad"], items[0])
                germline_path = os.path.normpath(os.path.join(os.path.dirname(ref_file), germline_resource))
                input_bam = dd.get_work_bam(items[0])
                tx_prefilt_vcf = utils.splitext_plus(tx_out_file)[0] + ".prefilt.vcf"
                tx_vcf = os.path.splitext(tx_out_file)[0]
                out_file_ungz = os.path.splitext(out_file)[0]
                params = ["-T", "Mutect2"]
                # T/N pair
                if len(items) == 2:
                    paired = vcfutils.get_paired_bams(align_bams, items)
                    # not really running purecn with mutect1/gatk3
                    params += _add_tumor_params(paired, items, gatk_type)
                    logger.debug("You are running mutect2 in PureCN analysis in T/N mode, "
                                 "T-only + PON is recommended")
                else:  # T only
                    params += ["-I", input_bam]
                    # adding SNV PON from background/variant
                    snv_pon = tz.get_in(["config", "algorithm", "background", "variant"], items[0])
                    if snv_pon and dd.get_batch(items[0]) != "pon_build":
                        params += ["-pon", snv_pon]
                        params += ["--genotype-pon-sites"]
                opt_list = config_utils.get_resources("mutect2", items[0]["config"]).get("options")
                # default is 50, sometimes 100 or 200 is recommended for better sensitivity in detection
                # hom del CNVs (calling more variants helps)
                interval_padding = 50
                if opt_list:
                    opt_dict = dict(zip(opt_list[::2], opt_list[1::2]))
                    if "--interval_padding" in opt_dict:
                        interval_padding = opt_dict["--interval_padding"]
                params += ["--max-mnp-distance", "0",
                           "--interval-padding", interval_padding,
                           "--germline-resource", germline_path,
                           "--genotype-germline-sites",
                           "--reference", ref_file,
                           "-O", tx_prefilt_vcf]
                params += _add_region_params(region, out_file, items, gatk_type)
                broad_runner.new_resources("mutect2")
                gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
                filter_cmd = _mutect2_filter(broad_runner, items, tx_prefilt_vcf, out_file_ungz, ref_file)
                cmd = "{gatk_cmd} && {filter_cmd}"
                do.run(cmd.format(**locals()), "MuTect2")
            # no AF filter for PureCN variants
            out_file = vcfutils.bgzip_and_index(out_file_ungz, items[0]["config"])
        else:
            # a regular mutect call
            paired = vcfutils.get_paired_bams(align_bams, items)
            f1r2_file = None
            _prep_inputs(align_bams, ref_file, items)
            with file_transaction(items[0], out_file) as tx_out_file:
                params = ["-T", "Mutect2" if gatk_type == "gatk4" else "MuTect2",
                          "--annotation", "ClippingRankSumTest",
                          "--annotation", "DepthPerSampleHC"]
                if gatk_type == "gatk4":
                    params += ["--reference", ref_file]
                else:
                    params += ["-R", ref_file]
                for a in annotation.get_gatk_annotations(items[0]["config"], include_baseqranksum=False):
                    params += ["--annotation", a]
                # Avoid issues with BAM CIGAR reads that GATK doesn't like
                if gatk_type == "gatk4":
                    params += ["--read-validation-stringency", "LENIENT"]
                params += _add_tumor_params(paired, items, gatk_type)
                params += _add_region_params(region, out_file, items, gatk_type)
                if all(is_paired(bam) for bam in align_bams) and \
                        ("mutect2_readmodel" in utils.get_in(items[0], "config", "tools_on")):
                    orientation_filter = True
                else:
                    orientation_filter = False
                if gatk_type == "gatk4" and orientation_filter:
                    f1r2_file = "{}-f1r2.tar.gz".format(utils.splitext_plus(out_file)[0])
                    params += ["--f1r2-tar-gz", f1r2_file]
                # Avoid adding dbSNP/Cosmic so they do not get fed to variant filtering algorithm
                # Not yet clear how this helps or hurts in a general case.
                # params += _add_assoc_params(assoc_files)
                resources = config_utils.get_resources("mutect2", items[0]["config"])
                if "options" in resources:
                    params += [str(x) for x in resources.get("options", [])]
                assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                    "Require full version of GATK 3.5+ for mutect2 calling"
                broad_runner.new_resources("mutect2")
                gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
                if gatk_type == "gatk4":
                    tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus(out_file)
                    tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus(tx_out_file)
                    if orientation_filter:
                        tx_f1r2_file = "{}-read-orientation-model.tar.gz"
                        tx_f1r2_file = tx_f1r2_file.format(utils.splitext_plus(f1r2_file)[0])
                        tx_read_orient_cmd = _mutect2_read_filter(broad_runner, f1r2_file, tx_f1r2_file)
                        filter_cmd = _mutect2_filter(broad_runner, items, tx_raw_prefilt_file,
                                                     tx_raw_file, ref_file, tx_f1r2_file)
                    else:
                        filter_cmd = _mutect2_filter(broad_runner, items, tx_raw_prefilt_file,
                                                     tx_raw_file, ref_file)
                    if orientation_filter:
                        cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {tx_read_orient_cmd} && {filter_cmd}"
                    else:
                        cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}"
                else:
                    tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                    cmd = "{gatk_cmd} > {tx_raw_file}"
                do.run(cmd.format(**locals()), "MuTect2")
                out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])