def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplification based regional coverage without off-target reads
    """
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions_merged(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            callable_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            callable_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        genome_cov_pct = callable_size / float(total_size)
        if genome_cov_pct > GENOME_COV_THRESH:
            cov_interval = "genome"
            offtarget_pct = 0.0
        elif not vrs:
            cov_interval = "regional"
            offtarget_pct = 0.0
        else:
            offtarget_pct = _count_offtarget(data, dd.get_align_bam(data) or dd.get_work_bam(data),
                                             vrs or callable_file, "variant_regions")
            if offtarget_pct > OFFTARGET_THRESH:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info("%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
                    % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
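# A minimal standalone sketch of the classification rule above, separated from
# bcbio's data structures for illustration. The default thresholds mirror the
# GENOME_COV_THRESH / OFFTARGET_THRESH constants referenced in assign_interval
# (0.40 and 0.10 in the older versions further below); treat them as assumptions.
def classify_coverage(callable_size, total_size, offtarget_pct, has_variant_regions,
                      genome_cov_thresh=0.40, offtarget_thresh=0.10):
    """Return 'genome', 'regional' or 'amplicon' from simple size/offtarget stats."""
    genome_cov_pct = callable_size / float(total_size)
    if genome_cov_pct > genome_cov_thresh:
        return "genome"
    if not has_variant_regions or offtarget_pct > offtarget_thresh:
        return "regional"
    return "amplicon"

assert classify_coverage(2.8e9, 3.1e9, 0.0, True) == "genome"
assert classify_coverage(6.0e7, 3.1e9, 0.25, True) == "regional"
assert classify_coverage(6.0e7, 3.1e9, 0.02, True) == "amplicon"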
def _prep_config(items, paired, work_dir):
    """Run initial configuration, generating a run directory for Manta.
    """
    assert utils.which("configManta.py"), "Could not find installed configManta.py"
    out_file = os.path.join(work_dir, "runWorkflow.py")
    if not utils.file_exists(out_file) or _out_of_date(out_file):
        config_script = os.path.realpath(utils.which("configManta.py"))
        cmd = [utils.get_program_python("configManta.py"), config_script]
        if paired:
            if paired.normal_bam:
                cmd += ["--normalBam=%s" % paired.normal_bam, "--tumorBam=%s" % paired.tumor_bam]
            else:
                cmd += ["--tumorBam=%s" % paired.tumor_bam]
        else:
            cmd += ["--bam=%s" % dd.get_align_bam(data) for data in items]
        data = paired.tumor_data if paired else items[0]
        cmd += ["--referenceFasta=%s" % dd.get_ref_file(data), "--runDir=%s" % work_dir]
        if dd.get_coverage_interval(data) not in ["genome"]:
            cmd += ["--exome"]
        for region in _maybe_limit_chromosomes(data):
            cmd += ["--region", region]
        resources = config_utils.get_resources("manta", data["config"])
        if resources.get("options"):
            cmd += [str(x) for x in resources["options"]]
        # If we are removing polyX, avoid calling on small indels which require
        # excessively long runtimes on noisy WGS runs
        if "polyx" in dd.get_exclude_regions(data):
            cmd += ["--config", _prep_streamlined_config(config_script, work_dir)]
        do.run(cmd, "Configure manta SV analysis")
    return out_file
def _get_maxcov_downsample(data):
    """Calculate maximum coverage downsampling for whole genome samples.

    Returns None if we're not doing downsampling.
    """
    from bcbio.bam import ref
    from bcbio.ngsalign import alignprep, bwa
    from bcbio.variation import coverage
    params = {"min_coverage_for_downsampling": 10,
              "maxcov_downsample_multiplier": dd.get_maxcov_downsample(data)}
    fastq_file = data["files"][0]
    num_reads = alignprep.total_reads_from_grabix(fastq_file)
    if num_reads and params["maxcov_downsample_multiplier"] and params["maxcov_downsample_multiplier"] > 0:
        vrs = dd.get_variant_regions_merged(data)
        total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        if vrs:
            callable_size = pybedtools.BedTool(vrs).total_coverage()
            genome_cov_pct = callable_size / float(total_size)
        else:
            callable_size = total_size
            genome_cov_pct = 1.0
        if (genome_cov_pct > coverage.GENOME_COV_THRESH
                and dd.get_coverage_interval(data) in ["genome", None, False]):
            total_counts, total_sizes = 0, 0
            for count, size in bwa.fastq_size_output(fastq_file, 5000):
                total_counts += int(count)
                total_sizes += (int(size) * int(count))
            read_size = float(total_sizes) / float(total_counts)
            avg_cov = float(num_reads * read_size) / callable_size
            if avg_cov >= params["min_coverage_for_downsampling"]:
                return int(avg_cov * params["maxcov_downsample_multiplier"])
    return None
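# A small sketch of the downsampling arithmetic used above: estimated average
# coverage is (read count * mean read length) / callable genome size, and the
# cap is that average times a user-configured multiplier. Values are illustrative.
def maxcov_cap(num_reads, mean_read_len, callable_size, multiplier, min_cov=10):
    """Return a max-coverage cap, or None when average coverage is too low."""
    avg_cov = (num_reads * mean_read_len) / float(callable_size)
    return int(avg_cov * multiplier) if avg_cov >= min_cov else None

# ~600M 150bp reads over a 3.1Gb genome ~= 29x; a 10x multiplier caps at ~290x
print(maxcov_cap(600e6, 150, 3.1e9, 10))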
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion.

    Excludes high depth and centromere regions which contribute to long run
    times and false positive structural variant calls.
    """
    out_file = "%s-exclude%s.bed" % (utils.splitext_plus(base_file)[0], "-%s" % chrom if chrom else "")
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with shared.bedtools_tmpdir(items[0]):
            # Get a bedtool for the full region if no variant regions
            want_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                    items[0]["config"], chrom)
            if chrom:
                want_bedtool = pybedtools.BedTool(shared.subset_bed_by_chrom(want_bedtool.saveas().fn,
                                                                             chrom, items[0]))
            sv_exclude_bed = _get_sv_exclude_file(items)
            if sv_exclude_bed and len(want_bedtool) > 0:
                want_bedtool = want_bedtool.subtract(sv_exclude_bed, nonamecheck=True).saveas()
            if any(dd.get_coverage_interval(d) == "genome" for d in items):
                want_bedtool = pybedtools.BedTool(shared.remove_highdepth_regions(want_bedtool.saveas().fn,
                                                                                  items))
            with file_transaction(items[0], out_file) as tx_out_file:
                full_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                        items[0]["config"])
                if len(want_bedtool) > 0:
                    full_bedtool.subtract(want_bedtool, nonamecheck=True).saveas(tx_out_file)
                else:
                    full_bedtool.saveas(tx_out_file)
    return out_file
def _goleft_indexcov(bam_file, data, out_dir):
    """Use goleft indexcov to estimate coverage distributions using BAM index.

    Only used for whole genome runs as captures typically don't have enough
    data to be useful for index-only summaries.
    """
    if not dd.get_coverage_interval(data) == "genome":
        return []
    out_dir = utils.safe_makedir(os.path.join(out_dir, "indexcov"))
    out_files = [os.path.join(out_dir, "%s-indexcov.%s" % (dd.get_sample_name(data), ext))
                 for ext in ["roc", "ped", "bed.gz"]]
    if not utils.file_uptodate(out_files[-1], bam_file):
        with transaction.tx_tmpdir(data) as tmp_dir:
            tmp_dir = utils.safe_makedir(os.path.join(tmp_dir, dd.get_sample_name(data)))
            gender_chroms = [x.name for x in ref.file_contigs(dd.get_ref_file(data)) if chromhacks.is_sex(x.name)]
            gender_args = "--sex %s" % (",".join(gender_chroms)) if gender_chroms else ""
            cmd = "goleft indexcov --directory {tmp_dir} {gender_args} -- {bam_file}"
            try:
                do.run(cmd.format(**locals()), "QC: goleft indexcov")
            except subprocess.CalledProcessError as msg:
                if not ("indexcov: no usable" in str(msg)
                        or ("indexcov: expected" in str(msg) and "sex chromosomes, found:" in str(msg))):
                    raise
            for out_file in out_files:
                orig_file = os.path.join(tmp_dir, os.path.basename(out_file))
                if utils.file_exists(orig_file):
                    utils.copy_plus(orig_file, out_file)
    # MultiQC needs non-gzipped/BED inputs so unpack the file
    out_bed = out_files[-1].replace(".bed.gz", ".tsv")
    if utils.file_exists(out_files[-1]) and not utils.file_exists(out_bed):
        with transaction.file_transaction(data, out_bed) as tx_out_bed:
            cmd = "gunzip -c %s > %s" % (out_files[-1], tx_out_bed)
            do.run(cmd, "Unpack indexcov BED file")
    out_files[-1] = out_bed
    return [x for x in out_files if utils.file_exists(x)]
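# A hedged sketch of invoking goleft indexcov outside bcbio's do.run wrapper,
# using only the flags that appear in the function above (--directory, --sex,
# and the `--` BAM separator). Assumes goleft is on PATH and the BAM is indexed;
# paths and sex chromosome names are placeholders.
import subprocess

def run_indexcov(bam_file, out_dir, sex_chroms=("chrX", "chrY")):
    """Invoke `goleft indexcov` on one indexed BAM, returning the output directory."""
    cmd = ["goleft", "indexcov", "--directory", out_dir]
    if sex_chroms:
        cmd += ["--sex", ",".join(sex_chroms)]
    cmd += ["--", bam_file]
    subprocess.check_call(cmd)
    return out_dir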
def segment_from_cnr(cnr_file, data, out_base):
    """Provide segmentation on a cnr file, used in external PureCN integration.
    """
    cns_file = _cnvkit_segment(cnr_file, dd.get_coverage_interval(data), data, [data],
                               out_file="%s.cns" % out_base, detailed=True)
    out = _add_seg_to_output({"cns": cns_file}, data, enumerate_chroms=False)
    return out["seg"]
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0].split(".")[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    files = {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
             "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
             "back_cnn": os.path.join(raw_work_dir, background_cnn)}
    if not utils.file_exists(files["cnr"]):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            # pick targets, anti-targets and access files based on analysis type
            # http://cnvkit.readthedocs.org/en/latest/nonhybrid.html
            cov_interval = dd.get_coverage_interval(data)
            base_regions = dd.get_variant_regions(data)
            # For genome calls, subset to regions within 10kb of genes
            if cov_interval == "genome":
                base_regions = annotate.subset_by_genes(base_regions, data, work_dir, pad=1e4)
            raw_target_bed = bedutils.merge_overlaps(base_regions, data, out_dir=work_dir)
            target_bed = annotate.add_genes(raw_target_bed, data)
            # bail out if we ended up with no regions
            if not utils.file_exists(target_bed):
                return {}
            if cov_interval == "amplicon":
                target_opts = ["--targets", target_bed, "--access", target_bed]
            elif cov_interval == "genome":
                target_opts = ["--targets", target_bed, "--access", dd.get_variant_regions(data)]
            else:
                target_opts = ["--targets", target_bed, "--access", access_file]
            cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
                        len(test_bams) + len(background_bams))
            cmd = [_get_cmd(), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  target_opts + \
                  ["-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                        "--target-avg-size", str(t_avg)]
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                         "lib", "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ["cnr", "cns"]:
        if not os.path.exists(files[ftype]):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, files[ftype]))
    return files
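# A compact restatement of the target/access selection in the function above:
# amplicon runs reuse the targets as the accessible regions (no off-target
# bins), genome runs restrict access to the variant regions, and hybrid
# capture uses the genome-wide access file. Extracted for illustration only.
def cnvkit_target_opts(cov_interval, target_bed, access_file, variant_regions):
    if cov_interval == "amplicon":
        return ["--targets", target_bed, "--access", target_bed]
    if cov_interval == "genome":
        return ["--targets", target_bed, "--access", variant_regions]
    return ["--targets", target_bed, "--access", access_file]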
def _run_cnvkit_shared(items, test_bams, background_bams, work_dir, background_name=None):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_cnn = os.path.join(raw_work_dir,
                                  "%s_background.cnn" % (background_name if background_name else "flat"))
    ckouts = []
    for test_bam in test_bams:
        out_base = _bam_to_outbase(test_bam, raw_work_dir)
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cnr"]):
        data = items[0]
        cov_interval = dd.get_coverage_interval(data)
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, data)
        parallel = {"type": "local", "cores": dd.get_cores(data), "progs": ["cnvkit"]}
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     raw_work_dir, data)
        def _bam_to_itype(bam):
            return "background" if bam in background_bams else "evaluate"
        split_cnns = run_multicore(_cnvkit_coverage,
                                   [(bam, bed, _bam_to_itype(bam), raw_work_dir, data)
                                    for bam in test_bams + background_bams
                                    for bed in _split_bed(target_bed, data) + _split_bed(antitarget_bed, data)],
                                   data["config"], parallel)
        coverage_cnns = _merge_coverage(split_cnns, data)
        background_cnn = _cnvkit_background([x["file"] for x in coverage_cnns if x["itype"] == "background"],
                                            background_cnn, target_bed, antitarget_bed, data)
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, data)
                                    for cnns in tz.groupby("bam", [x for x in coverage_cnns
                                                                   if x["itype"] == "evaluate"]).values()],
                                   data["config"], parallel)
        called_segs = run_multicore(_cnvkit_segment,
                                    [(cnr, cov_interval, data) for cnr in fixed_cnrs],
                                    data["config"], parallel)
    return ckouts
def _configure_somatic(paired, ref_file, region, out_file, tx_work_dir):
    utils.safe_makedir(tx_work_dir)
    cmd = [sys.executable, os.path.realpath(utils.which("configureStrelkaSomaticWorkflow.py"))]
    cmd += ["--referenceFasta=%s" % ref_file,
            "--callRegions=%s" % _get_region_bed(region, [paired.tumor_data, paired.normal_data], out_file),
            "--runDir=%s" % tx_work_dir,
            "--normalBam=%s" % paired.normal_bam, "--tumorBam=%s" % paired.tumor_bam]
    if dd.get_coverage_interval(paired.tumor_data) not in ["genome"]:
        cmd += ["--targeted"]
    do.run(cmd, "Configure Strelka2 somatic calling: %s" % paired.tumor_name)
    return os.path.join(tx_work_dir, "runWorkflow.py")
def _configure_germline(align_bams, items, ref_file, region, out_file, tx_work_dir):
    utils.safe_makedir(tx_work_dir)
    cmd = [sys.executable, os.path.realpath(utils.which("configureStrelkaGermlineWorkflow.py"))]
    cmd += ["--referenceFasta=%s" % ref_file,
            "--callRegions=%s" % _get_region_bed(region, items, out_file),
            "--ploidy=%s" % _get_ploidy(shared.to_multiregion(region), items, out_file),
            "--runDir=%s" % tx_work_dir]
    cmd += ["--bam=%s" % b for b in align_bams]
    if any(dd.get_coverage_interval(d) not in ["genome"] for d in items):
        cmd += ["--targeted"]
    do.run(cmd, "Configure Strelka2 germline calling: %s" % (", ".join([dd.get_sample_name(d) for d in items])))
    return os.path.join(tx_work_dir, "runWorkflow.py")
def _run_cnvkit_shared_orig(inputs, backgrounds):
    """Original CNVkit implementation with full normalization and segmentation.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base, out_base_old = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir, cur_input)
        if utils.file_exists(out_base_old + ".cns"):
            out_base = out_base_old
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base})
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        samples_to_run = list(zip(["background"] * len(backgrounds), backgrounds)) + \
                         list(zip(["evaluate"] * len(inputs), inputs))
        # New style shared SV bins
        if tz.get_in(["depth", "bins", "target"], inputs[0]):
            target_bed = tz.get_in(["depth", "bins", "target"], inputs[0])
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                       [_get_general_coverage(cdata, itype)
                                        for itype, cdata in samples_to_run])
        # Back compatible with pre-existing runs
        else:
            target_bed, antitarget_bed = _get_original_targets(inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                       [_get_original_coverage(cdata, itype)
                                        for itype, cdata in samples_to_run])
        # Currently metrics not calculated due to speed and needing re-evaluation
        # We could re-enable with larger truth sets to evaluate background noise
        # But want to reimplement in a more general fashion as part of normalization
        if False:
            coverage_cnns = reduce(operator.add,
                                   [_cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                                                    inputs + backgrounds)
                                    for cnns in tz.groupby("bam", raw_coverage_cnns).values()])
            background_cnn = cnvkit_background(_select_background_cnns(coverage_cnns),
                                               background_cnn, inputs, target_bed, antitarget_bed)
        else:
            coverage_cnns = raw_coverage_cnns
            background_cnn = cnvkit_background([x["file"] for x in coverage_cnns
                                                if x["itype"] == "background"],
                                               background_cnn, inputs, target_bed, antitarget_bed)
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs, ckouts)
                                    for cnns in tz.groupby("bam", [x for x in coverage_cnns
                                                                   if x["itype"] == "evaluate"]).values()],
                                   inputs[0]["config"], parallel)
        [_cnvkit_segment(cnr, cov_interval, data, inputs + backgrounds) for cnr, data in fixed_cnrs]
    return ckouts
def _is_targeted_region(cur_bed, data):
    """Determine whether to process a region as targeted or WGS.

    Currently always based on total coverage interval, as that validates best
    and is consistent between CWL (larger blocks) and non-CWL runs (smaller
    blocks). We can check core usage and provide a consistent report when
    moving to CWL exclusively.
    """
    cores = dd.get_num_cores(data)
    if cores > 0:  # Apply to all core setups now for consistency
        return dd.get_coverage_interval(data) not in ["genome"]
    else:
        return coverage_interval_from_bed(cur_bed, per_chrom=False) == "targeted"
def _add_segmetrics_to_output(out, data):
    """Add metrics for measuring reliability of CNV estimates.
    """
    out_file = "%s-segmetrics.txt" % os.path.splitext(out["cns"])[0]
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "segmetrics",
                   "--ci", "--pi",
                   "-s", out["cns"], "-o", tx_out_file, out["cnr"]]
            if dd.get_coverage_interval(data) != "genome":
                cmd += ["--alpha", "0.001", "--bootstrap", "2000"]
            do.run(cmd, "CNVkit segmetrics")
    out["segmetrics"] = out_file
    return out
def prepare_intervals(data, region_file, work_dir):
    """Prepare interval regions for targeted and gene based regions.
    """
    target_file = os.path.join(work_dir, "%s-target.interval_list" % dd.get_sample_name(data))
    if not utils.file_uptodate(target_file, region_file):
        with file_transaction(data, target_file) as tx_out_file:
            params = ["-T", "PreprocessIntervals", "-R", dd.get_ref_file(data),
                      "--interval-merging-rule", "OVERLAPPING_ONLY",
                      "-O", tx_out_file]
            if dd.get_coverage_interval(data) == "genome":
                params += ["--bin-length", "1000", "--padding", "0"]
            else:
                params += ["-L", region_file, "--bin-length", "0", "--padding", "250"]
            _run_with_memory_scaling(params, tx_out_file, data)
    return target_file
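# A sketch of the binning rule encoded above for GATK4 PreprocessIntervals:
# WGS runs tile the genome with fixed-size bins and no padding, while targeted
# runs keep the capture intervals and pad them. Parameter values mirror the
# function above; the helper itself is illustrative, not part of bcbio.
def preprocess_interval_params(is_wgs, region_file=None):
    if is_wgs:
        return ["--bin-length", "1000", "--padding", "0"]
    return ["-L", region_file, "--bin-length", "0", "--padding", "250"]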
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir)
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cnr"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, inputs[0], work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, inputs[0])
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        pct_coverage = (pybedtools.BedTool(raw_target_bed).total_coverage() /
                        float(pybedtools.BedTool(access_bed).total_coverage())) * 100.0
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     pct_coverage, raw_work_dir, inputs[0])
        split_beds = _split_bed(target_bed, inputs[0]) + _split_bed(antitarget_bed, inputs[0])
        samples_to_run = zip(["background"] * len(backgrounds), backgrounds) + \
                         zip(["evaluate"] * len(inputs), inputs)
        split_cnns = run_multicore(_cnvkit_coverage,
                                   [(cdata, bed, itype) for itype, cdata in samples_to_run
                                    for bed in split_beds],
                                   inputs[0]["config"], parallel)
        raw_coverage_cnns = _merge_coverage(split_cnns, inputs[0])
        coverage_cnns = run_multicore(_cnvkit_metrics,
                                      [(cnns, target_bed, antitarget_bed, cov_interval,
                                        inputs + backgrounds)
                                       for cnns in tz.groupby("bam", raw_coverage_cnns).values()],
                                      inputs[0]["config"], parallel)
        background_cnn = _cnvkit_background(_select_background_cnns(coverage_cnns),
                                            background_cnn, target_bed, antitarget_bed, inputs[0])
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs + backgrounds)
                                    for cnns in tz.groupby("bam", [x for x in coverage_cnns
                                                                   if x["itype"] == "evaluate"]).values()],
                                   inputs[0]["config"], parallel)
        run_multicore(_cnvkit_segment,
                      [(cnr, cov_interval, data) for cnr, data in fixed_cnrs],
                      inputs[0]["config"], parallel)
    return ckouts
def identify(data):
    """Identify high depth regions in the alignment file for potential filtering.
    """
    high_multiplier = 20
    sample_size = int(1e6)
    high_percentage = 25.0
    min_coverage = 10
    window_size = 250
    work_bam, out_file, stats_file = _get_files(data)
    if not os.path.exists(out_file) and dd.get_coverage_interval(data) == "genome":
        cores = dd.get_num_cores(data)
        with file_transaction(data, out_file) as tx_out_file:
            tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
            py_cl = os.path.join(os.path.dirname(sys.executable), "py")
            cmd = ("sambamba depth window -t {cores} -c {min_coverage} "
                   "--window-size {window_size} {work_bam} "
                   "| head -n {sample_size} "
                   """| cut -f 5 | {py_cl} -l 'numpy.median([float(x) for x in l if not x.startswith("mean")])'""")
            median_depth_out = subprocess.check_output(cmd.format(**locals()), shell=True)
            try:
                median_cov = float(median_depth_out)
            except ValueError:
                logger.info("Skipping high coverage region detection; problem calculating median depth: %s"
                            % median_depth_out)
                median_cov = None
            if median_cov and not numpy.isnan(median_cov):
                high_thresh = int(high_multiplier * median_cov)
                cmd = ("sambamba depth window -t {cores} -c {median_cov} "
                       "--window-size {window_size} -T {high_thresh} {work_bam} "
                       "| {py_cl} -fx 'float(x.split()[5]) >= {high_percentage} "
                       """if not x.startswith("#") else None' """
                       "| cut -f 1-3,7 > {tx_raw_file} ")
                do.run(cmd.format(**locals()), "Identify high coverage regions")
                with open(stats_file, "w") as out_handle:
                    yaml.safe_dump({"median_cov": median_cov}, out_handle,
                                   allow_unicode=False, default_flow_style=False)
            else:
                with open(tx_raw_file, "w") as out_handle:
                    out_handle.write("")
            if utils.file_exists(tx_raw_file):
                cmd = "bedtools merge -i {tx_raw_file} -c 4 -o distinct > {tx_out_file}"
                do.run(cmd.format(**locals()), "Clean up raw coverage file")
            else:
                shutil.move(tx_raw_file, tx_out_file)
    return out_file if os.path.exists(out_file) else None
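# A pure-Python sketch of the windowed high-depth filter above: windows whose
# percentage of bases over 20x the sample median exceeds a cutoff are flagged.
# Window tuples are (chrom, start, end, pct_above_threshold); values invented.
def flag_high_depth(windows, high_percentage=25.0):
    """Return windows where the fraction above the depth threshold is high."""
    return [w for w in windows if w[3] >= high_percentage]

windows = [("chr1", 0, 250, 3.0), ("chr1", 250, 500, 62.5)]
print(flag_high_depth(windows))  # -> [('chr1', 250, 500, 62.5)]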
def _run_cnvkit_shared(data, test_bams, background_bams, work_dir, background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0].split(".")[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    files = {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
             "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
             "back_cnn": os.path.join(raw_work_dir, background_cnn)}
    if not utils.file_exists(files["cnr"]):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            cov_interval = dd.get_coverage_interval(data)
            raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
            # bail out if we ended up with no regions
            if not utils.file_exists(raw_target_bed):
                return {}
            target_bed = annotate.add_genes(raw_target_bed, data)
            # Do not parallelize cnvkit due to current issues with multi-processing
            cores = 1
            # cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
            #             len(test_bams) + len(background_bams))
            cmd = [_get_cmd(), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  ["--targets", target_bed, "--access", access_bed] + \
                  ["-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            if cov_interval not in ["amplicon", "genome"]:
                at_avg, at_min, t_avg = _get_antitarget_size(access_bed, target_bed)
                if at_avg:
                    cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                            "--target-avg-size", str(t_avg)]
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                         "lib", "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ["cnr", "cns"]:
        if not os.path.exists(files[ftype]):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, files[ftype]))
    return files
def _get_vqsr_annotations(filter_type, data):
    """Retrieve appropriate annotations to use for VQSR based on filter type.

    Issues reported with MQ and the bwa-mem quality distribution result in
    intermittent failures to use VQSR:
    http://gatkforums.broadinstitute.org/discussion/4425/variant-recalibration-failing
    http://gatkforums.broadinstitute.org/discussion/4248/variantrecalibrator-removing-all-snps-from-the-training-set
    """
    if filter_type == "SNP":
        # MQ, MQRankSum
        anns = ["QD", "FS", "ReadPosRankSum", "SOR"]
    else:
        assert filter_type == "INDEL"
        # MQRankSum
        anns = ["QD", "FS", "ReadPosRankSum", "SOR"]
    if dd.get_coverage_interval(data) == "genome":
        anns += ["DP"]
    return anns
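# A hedged sketch of how an annotation list like the one above is typically
# passed to GATK VariantRecalibrator: one repeated -an flag per annotation.
# The -an flag is standard GATK usage; the helper itself is hypothetical.
def vqsr_annotation_args(anns):
    args = []
    for a in anns:
        args += ["-an", a]
    return args

print(vqsr_annotation_args(["QD", "FS", "ReadPosRankSum", "SOR", "DP"]))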
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.

    Handles new style cases where we have pre-normalized inputs and old cases
    where we run CNVkit individually.
    """
    if tz.get_in(["depth", "bins", "normalized"], inputs[0]):
        ckouts = []
        for data in inputs:
            cnr_file = tz.get_in(["depth", "bins", "normalized"], data)
            cns_file = os.path.join(_sv_workdir(data), "%s.cns" % dd.get_sample_name(data))
            cns_file = _cnvkit_segment(cnr_file, dd.get_coverage_interval(data), data,
                                       inputs + backgrounds, cns_file)
            ckouts.append({"cnr": cnr_file, "cns": cns_file,
                           "background": tz.get_in(["depth", "bins", "background"], data)})
        return ckouts
    else:
        return _run_cnvkit_shared_orig(inputs, backgrounds)
def get_base_cnv_regions(data, work_dir, genome_default="transcripts1e4", include_gene_names=True):
    """Retrieve set of target regions for CNV analysis.

    Subsets to extended transcript regions for WGS experiments to avoid
    long runtimes.
    """
    cov_interval = dd.get_coverage_interval(data)
    base_regions = get_sv_bed(data, include_gene_names=include_gene_names)
    # if we don't have a configured BED or regions to use for SV calling
    if not base_regions:
        # For genome calls, subset to regions near genes as targets
        if cov_interval == "genome":
            base_regions = get_sv_bed(data, genome_default, work_dir, include_gene_names=include_gene_names)
            if base_regions:
                base_regions = remove_exclude_regions(base_regions, base_regions, [data])
        # Finally, default to the defined variant regions
        if not base_regions:
            base_regions = dd.get_variant_regions(data)
    return bedutils.clean_file(base_regions, data)
def get_base_cnv_regions(data, work_dir):
    """Retrieve set of target regions for CNV analysis.

    Subsets to extended transcript regions for WGS experiments to avoid
    long runtimes.
    """
    cov_interval = dd.get_coverage_interval(data)
    base_regions = regions.get_sv_bed(data)
    # if we don't have a configured BED or regions to use for SV calling
    if not base_regions:
        # For genome calls, subset to regions within 10kb of genes
        if cov_interval == "genome":
            base_regions = regions.get_sv_bed(data, "transcripts1e4", work_dir)
            if base_regions:
                base_regions = remove_exclude_regions(base_regions, base_regions, [data])
        # Finally, default to the defined variant regions
        if not base_regions:
            base_regions = dd.get_variant_regions(data)
    return base_regions
def _prep_config(items, paired, work_dir):
    """Run initial configuration, generating a run directory for Manta.
    """
    assert utils.which("configManta.py"), "Could not find installed configManta.py"
    out_file = os.path.join(work_dir, "runWorkflow.py")
    if not utils.file_exists(out_file):
        cmd = [sys.executable, utils.which("configManta.py")]
        if paired:
            if paired.normal_bam:
                cmd += ["--normalBam=%s" % paired.normal_bam, "--tumorBam=%s" % paired.tumor_bam]
            else:
                cmd += ["--tumorBam=%s" % paired.tumor_bam]
        else:
            cmd += ["--bam=%s" % dd.get_align_bam(data) for data in items]
        data = paired.tumor_data if paired else items[0]
        cmd += ["--referenceFasta=%s" % dd.get_ref_file(data), "--runDir=%s" % work_dir]
        if dd.get_coverage_interval(data) not in ["genome"]:
            cmd += ["--exome"]
        do.run(cmd, "Configure manta SV analysis")
    return out_file
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplification based regional coverage without off-target reads
    """
    genome_cov_thresh = 0.40  # percent of genome covered for whole genome analysis
    offtarget_thresh = 0.10   # percent of offtarget reads required to be capture (not amplification) based
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            seq_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            seq_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        genome_cov_pct = seq_size / float(total_size)
        if genome_cov_pct > genome_cov_thresh:
            cov_interval = "genome"
            offtarget_pct = 0.0
        else:
            offtarget_stat_file = dd.get_offtarget_stats(data)
            if not offtarget_stat_file:
                offtarget_pct = 0.0
            else:
                with open(offtarget_stat_file) as in_handle:
                    stats = yaml.safe_load(in_handle)
                if float(stats["mapped"]) > 0:
                    offtarget_pct = stats["offtarget"] / float(stats["mapped"])
                else:
                    offtarget_pct = 0.0
            if offtarget_pct > offtarget_thresh:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info("%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
                    % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
def _configure_germline(align_bams, items, ref_file, region, out_file, tx_work_dir): utils.safe_makedir(tx_work_dir) cmd = [ sys.executable, os.path.realpath(utils.which("configureStrelkaGermlineWorkflow.py")) ] cmd += [ "--referenceFasta=%s" % ref_file, "--callRegions=%s" % get_region_bed(region, items, out_file), "--ploidy=%s" % _get_ploidy(shared.to_multiregion(region), items, out_file), "--runDir=%s" % tx_work_dir ] cmd += ["--bam=%s" % b for b in align_bams] if any(dd.get_coverage_interval(d) not in ["genome"] for d in items): cmd += ["--targeted"] do.run( cmd, "Configure Strelka2 germline calling: %s" % (", ".join([dd.get_sample_name(d) for d in items]))) return os.path.join(tx_work_dir, "runWorkflow.py")
def _scalpel_bed_file_opts(items, config, out_file, region, tmp_path):
    variant_regions = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, six.string_types) and os.path.isfile(target):
            target_bed = target
        else:
            target_bed = os.path.join(tmp_path, "tmp.bed")
            if not utils.file_exists(target_bed):
                with file_transaction(config, target_bed) as tx_tmp_bed:
                    if not isinstance(region, (list, tuple)):
                        message = ("Region must be a tuple - something odd just happened")
                        raise ValueError(message)
                    chrom, start, end = region
                    with open(tx_tmp_bed, "w") as out_handle:
                        print("%s\t%s\t%s" % (chrom, start, end), file=out_handle)
        if any(dd.get_coverage_interval(x) == "genome" for x in items):
            target_bed = shared.remove_lcr_regions(target_bed, items)
        return ["--bed", target_bed]
    else:
        return []
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir)
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, inputs[0], work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, inputs[0])
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     raw_work_dir, inputs[0])
        samples_to_run = zip(["background"] * len(backgrounds), backgrounds) + \
                         zip(["evaluate"] * len(inputs), inputs)
        raw_coverage_cnns = [_cnvkit_coverage(cdata, bed, itype) for itype, cdata in samples_to_run
                             for bed in [target_bed, antitarget_bed]]
        coverage_cnns = reduce(operator.add,
                               [_cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                                                inputs + backgrounds)
                                for cnns in tz.groupby("bam", raw_coverage_cnns).values()])
        background_cnn = _cnvkit_background(_select_background_cnns(coverage_cnns),
                                            background_cnn, target_bed, antitarget_bed, inputs[0])
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs + backgrounds)
                                    for cnns in tz.groupby("bam", [x for x in coverage_cnns
                                                                   if x["itype"] == "evaluate"]).values()],
                                   inputs[0]["config"], parallel)
        [_cnvkit_segment(cnr, cov_interval, data) for cnr, data in fixed_cnrs]
    return ckouts
def _prep_config(items, paired, work_dir): """Run initial configuration, generating a run directory for Manta. """ assert utils.which( "configManta.py"), "Could not find installed configManta.py" out_file = os.path.join(work_dir, "runWorkflow.py") if not utils.file_exists(out_file) or _out_of_date(out_file): config_script = os.path.realpath(utils.which("configManta.py")) cmd = [sys.executable, config_script] if paired: if paired.normal_bam: cmd += [ "--normalBam=%s" % paired.normal_bam, "--tumorBam=%s" % paired.tumor_bam ] else: cmd += ["--tumorBam=%s" % paired.tumor_bam] else: cmd += ["--bam=%s" % dd.get_align_bam(data) for data in items] data = paired.tumor_data if paired else items[0] cmd += [ "--referenceFasta=%s" % dd.get_ref_file(data), "--runDir=%s" % work_dir ] if dd.get_coverage_interval(data) not in ["genome"]: cmd += ["--exome"] for region in _maybe_limit_chromosomes(data): cmd += ["--region", region] resources = config_utils.get_resources("manta", data["config"]) if resources.get("options"): cmd += [str(x) for x in resources["options"]] # If we are removing polyX, avoid calling on small indels which require # excessively long runtimes on noisy WGS runs if "polyx" in dd.get_exclude_regions(data): cmd += [ "--config", _prep_streamlined_config(config_script, work_dir) ] do.run(cmd, "Configure manta SV analysis") return out_file
def _prep_config(items, paired, work_dir):
    """Run initial configuration, generating a run directory for Manta.
    """
    assert utils.which("configManta.py"), "Could not find installed configManta.py"
    out_file = os.path.join(work_dir, "runWorkflow.py")
    if not utils.file_exists(out_file) or _out_of_date(out_file):
        cmd = [sys.executable, os.path.realpath(utils.which("configManta.py"))]
        if paired:
            if paired.normal_bam:
                cmd += ["--normalBam=%s" % paired.normal_bam, "--tumorBam=%s" % paired.tumor_bam]
            else:
                cmd += ["--tumorBam=%s" % paired.tumor_bam]
        else:
            cmd += ["--bam=%s" % dd.get_align_bam(data) for data in items]
        data = paired.tumor_data if paired else items[0]
        cmd += ["--referenceFasta=%s" % dd.get_ref_file(data), "--runDir=%s" % work_dir]
        if dd.get_coverage_interval(data) not in ["genome"]:
            cmd += ["--exome"]
        for region in _maybe_limit_chromosomes(data):
            cmd += ["--region", region]
        do.run(cmd, "Configure manta SV analysis")
    return out_file
def precall(items):
    """Perform initial pre-calling steps -- coverage calculation by sample.

    Use sambamba to call average region coverage in regions, and convert
    into a correct format.
    """
    items = [utils.to_single_data(x) for x in items]
    assert len(items) == 1, "Expect one item to Seq2C coverage calculation"
    data = utils.to_single_data(items)
    assert dd.get_coverage_interval(data) != "genome", "Seq2C only for amplicon and exome sequencing"
    work_dir = _sv_workdir(data)
    bed_file = _prep_bed(data, work_dir)
    bam_file = dd.get_align_bam(data)
    sample_name = dd.get_sample_name(data)
    cov_file = _calculate_coverage(data, work_dir, bed_file, bam_file, sample_name)
    if "sv" not in data:
        data["sv"] = []
    data["sv"].append({"variantcaller": "seq2c", "coverage": cov_file})
    return [data]
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplification based regional coverage without off-target reads
    """
    genome_cov_thresh = 0.40  # percent of genome covered for whole genome analysis
    offtarget_thresh = 0.10   # percent of offtarget reads required to be capture (not amplification) based
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            seq_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            seq_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        genome_cov_pct = seq_size / float(total_size)
        if genome_cov_pct > genome_cov_thresh:
            cov_interval = "genome"
            offtarget_pct = 0.0
        else:
            offtarget_stat_file = dd.get_offtarget_stats(data)
            if not offtarget_stat_file:
                offtarget_pct = 0.0
            else:
                with open(offtarget_stat_file) as in_handle:
                    stats = yaml.safe_load(in_handle)
                offtarget_pct = stats["offtarget"] / float(stats["mapped"])
            if offtarget_pct > offtarget_thresh:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info("Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
                    % (cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
def _prep_real_counts(bam_file, data, samtools_stats):
    out = {}
    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        bed = dd.get_coverage_merged(data)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        bed = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        bed = None
        target_name = "genome"
    dedupped = utils.get_in(data, ("config", "algorithm", "mark_duplicates"), True)
    if bed:
        out["Preseq_genome_size"] = pybedtools.BedTool(bed).total_coverage()
        out["Preseq_read_count"] = readstats.number_of_mapped_reads(
            data, bam_file, keep_dups=True, bed_file=bed, target_name=target_name)
        ontrg_unique_depth = cov.get_average_coverage(target_name, bed, data, bam_file)
        if dedupped:
            out["Preseq_unique_count"] = readstats.number_of_mapped_reads(
                data, bam_file, keep_dups=False, bed_file=bed, target_name=target_name)
        # Counting average on-target alignment length, based on the equation:
        #   avg depth ~~ num (unique) on-target alignments * avg on-target aln length / target size
        total_alignments = out.get("Preseq_unique_count") or out["Preseq_read_count"]
        out["Preseq_read_length"] = ontrg_unique_depth * out["Preseq_genome_size"] // total_alignments
    else:  # WGS
        out["Preseq_genome_size"] = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data),
                                                                          data["config"])])
        out["Preseq_read_count"] = int(samtools_stats["Total_reads"])
        out["Preseq_read_length"] = int(samtools_stats["Average_read_length"])
        if dedupped:
            out["Preseq_unique_count"] = out["Preseq_read_count"] - int(samtools_stats["Duplicates"])
    return out
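# A standalone sketch of the read-length back-calculation used above: from
# avg_depth ~= alignments * aln_length / target_size, the average on-target
# alignment length is avg_depth * target_size / alignments. Numbers invented.
def backcalc_read_length(avg_depth, target_size, num_alignments):
    return avg_depth * target_size // num_alignments

# 100x over a 50Mb target from ~33.3M alignments -> ~150bp alignments
print(backcalc_read_length(100, 50000000, 33333333))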
def _run_titancna(cn_file, het_file, ploidy, num_clusters, work_dir, data): """Run titanCNA wrapper script on given ploidy and clusters. """ sample = dd.get_sample_name(data) cores = dd.get_num_cores(data) export_cmd = utils.get_R_exports() ploidy_dir = utils.safe_makedir( os.path.join(work_dir, "run_ploidy%s" % ploidy)) cluster_dir = "%s_cluster%02d" % (sample, num_clusters) out_dir = os.path.join(ploidy_dir, cluster_dir) if not utils.file_uptodate(out_dir + ".titan.txt", cn_file): with tx_tmpdir(data) as tmp_dir: with utils.chdir(tmp_dir): cmd = ( "{export_cmd} && titanCNA.R --id {sample} --hetFile {het_file} --cnFile {cn_file} " "--numClusters {num_clusters} --ploidy {ploidy} --numCores {cores} --outDir {tmp_dir}" ) if data["genome_build"] in ("hg19", "hg38"): cmd += " --genomeStyle UCSC" # TitanCNA's model is influenced by the variance in read coverage data # and data type: set reasonable defaults for non-WGS runs # (see https://github.com/gavinha/TitanCNA/tree/master/scripts/R_scripts) if dd.get_coverage_interval(data) != "genome": cmd += " --alphaK=2500 --alphaKHigh=2500" do.run( cmd.format(**locals()), "TitanCNA CNV detection: ploidy %s, cluster %s" % (ploidy, num_clusters)) for fname in glob.glob(os.path.join(tmp_dir, cluster_dir + "*")): shutil.move(fname, ploidy_dir) if os.path.exists(os.path.join(tmp_dir, "Rplots.pdf")): shutil.move( os.path.join(tmp_dir, "Rplots.pdf"), os.path.join(ploidy_dir, "%s.Rplots.pdf" % cluster_dir)) return ploidy_dir
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis"""
    out = dict()
    total_reads = sambamba.number_of_reads(data, bam_file)
    out['Total_reads'] = total_reads
    mapped = sambamba.number_of_mapped_reads(data, bam_file)
    out['Mapped_reads'] = mapped
    if total_reads:
        out['Mapped_reads_pct'] = 100.0 * mapped / total_reads
    if mapped:
        mapped_unique = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False)
        out['Mapped_unique_reads'] = mapped_unique
        mapped_dups = mapped - mapped_unique
        out['Duplicates'] = mapped_dups
        out['Duplicates_pct'] = 100.0 * mapped_dups / mapped
        if dd.get_coverage(data):
            cov_bed_file = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
            merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
            target_name = "coverage"
        elif dd.get_coverage_interval(data) != "genome":
            merged_bed_file = dd.get_variant_regions_merged(data)
            target_name = "variant_regions"
        else:
            merged_bed_file = None
            target_name = "genome"
        if merged_bed_file:
            ontarget = sambamba.number_mapped_reads_on_target(
                data, merged_bed_file, bam_file, keep_dups=False, target_name=target_name)
            if mapped_unique:
                out["Ontarget_unique_reads"] = ontarget
                out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
                out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
                padded_bed_file = bedutils.get_padded_bed_file(merged_bed_file, 200, data)
                ontarget_padded = sambamba.number_mapped_reads_on_target(
                    data, padded_bed_file, bam_file, keep_dups=False, target_name=target_name + "_padded")
                out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
            if total_reads:
                out['Usable_pct'] = 100.0 * ontarget / total_reads
        avg_coverage = get_average_coverage(data, bam_file, merged_bed_file, target_name)
        out['Avg_coverage'] = avg_coverage
        priority = cov.priority_coverage(data, out_dir)
        cov.priority_total_coverage(data, out_dir)
        region_coverage_file = cov.coverage_region_detailed_stats(data, out_dir)
        # Re-enable with annotations from internally installed problem region directory
        # if priority:
        #     annotated = cov.decorate_problem_regions(priority, problem_regions)
    return out
def run(bam_file, data, out_dir):
    """Run coverage QC analysis
    """
    out = dict()
    out_dir = utils.safe_makedir(out_dir)
    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        merged_bed_file = bedutils.clean_file(dd.get_coverage_merged(data), data,
                                              prefix="cov-", simple=True)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        merged_bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        merged_bed_file = None
        target_name = "genome"
    avg_depth = cov.get_average_coverage(target_name, merged_bed_file, data)
    if target_name == "coverage":
        out_files = cov.coverage_region_detailed_stats(target_name, merged_bed_file, data, out_dir)
    else:
        out_files = []
    out['Avg_coverage'] = avg_depth
    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, 'samtools')
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)["metrics"]
    out["Total_reads"] = total_reads = int(samtools_stats["Total_reads"])
    out["Mapped_reads"] = mapped = int(samtools_stats["Mapped_reads"])
    out["Mapped_paired_reads"] = int(samtools_stats["Mapped_paired_reads"])
    out['Duplicates'] = dups = int(samtools_stats["Duplicates"])
    if total_reads:
        out["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if mapped:
        out['Duplicates_pct'] = 100.0 * dups / mapped
    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        mapped_unique = readstats.number_of_mapped_reads(data, bam_file, keep_dups=False)
    out['Mapped_unique_reads'] = mapped_unique
    if merged_bed_file:
        ontarget = readstats.number_of_mapped_reads(data, bam_file, keep_dups=False,
                                                    bed_file=merged_bed_file, target_name=target_name)
        out["Ontarget_unique_reads"] = ontarget
        if mapped_unique:
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # Skip padded calculation for WGS even if the "coverage" file is specified;
                # the padded statistic only makes sense for exomes and panels
                padded_bed_file = bedutils.get_padded_bed_file(out_dir, merged_bed_file, 200, data)
                ontarget_padded = readstats.number_of_mapped_reads(
                    data, bam_file, keep_dups=False, bed_file=padded_bed_file,
                    target_name=target_name + "_padded")
                out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads
    indexcov_files = _goleft_indexcov(bam_file, data, out_dir)
    out_files += [x for x in indexcov_files if x and utils.file_exists(x)]
    out = {"metrics": out}
    if len(out_files) > 0:
        out["base"] = out_files[0]
        out["secondary"] = out_files[1:]
    return out
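# A small sketch of the on/off-target percentages computed above, decoupled
# from BAM counting. Inputs are plain read counts; the numbers are illustrative.
def target_pcts(mapped_unique, ontarget, total_reads):
    return {"Ontarget_pct": 100.0 * ontarget / mapped_unique,
            "Offtarget_pct": 100.0 * (mapped_unique - ontarget) / mapped_unique,
            "Usable_pct": 100.0 * ontarget / total_reads}

print(target_pcts(mapped_unique=90000000, ontarget=72000000, total_reads=100000000))
# -> 80% on-target, 20% off-target, 72% usable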
def _skip_duplicates(data):
    return (dd.get_coverage_interval(data) == "amplicon"
            or (dd.get_aligner(data) and not dd.get_mark_duplicates(data)))
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis"""
    out = dict()
    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, "samtools")
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)
    if "Total_reads" not in samtools_stats:
        return
    out["Total_reads"] = total_reads = int(samtools_stats["Total_reads"])
    if not total_reads:
        return
    if "Mapped_reads_raw" not in samtools_stats or "Mapped_reads" not in samtools_stats:
        return
    out["Mapped_reads"] = mapped = int(samtools_stats["Mapped_reads"])
    out["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if not mapped:
        return out
    if "Duplicates" in samtools_stats:
        out['Duplicates'] = dups = int(samtools_stats["Duplicates"])
        out['Duplicates_pct'] = 100.0 * dups / int(samtools_stats["Mapped_reads_raw"])
    else:
        dups = 0
    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        cov_bed_file = bedutils.clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        merged_bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        merged_bed_file = None
        target_name = "genome"
    # Whole genome runs do not need detailed on-target calculations, use total unique mapped
    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        out['Mapped_unique_reads'] = mapped_unique = sambamba.number_of_mapped_reads(data, bam_file,
                                                                                     keep_dups=False)
    if merged_bed_file:
        ontarget = sambamba.number_of_mapped_reads(
            data, bam_file, keep_dups=False, bed_file=merged_bed_file, target_name=target_name)
        if mapped_unique:
            out["Ontarget_unique_reads"] = ontarget
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # Skip padded calculation for WGS even if the "coverage" file is specified;
                # the padded statistic only makes sense for exomes and panels
                padded_bed_file = bedutils.get_padded_bed_file(merged_bed_file, 200, data)
                ontarget_padded = sambamba.number_of_mapped_reads(
                    data, bam_file, keep_dups=False, bed_file=padded_bed_file,
                    target_name=target_name + "_padded")
                out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
            if total_reads:
                out['Usable_pct'] = 100.0 * ontarget / total_reads
    avg_depth = cov.get_average_coverage(data, bam_file, merged_bed_file, target_name)
    out['Avg_coverage'] = avg_depth
    region_coverage_file = cov.coverage_region_detailed_stats(data, out_dir,
                                                              extra_cutoffs=set([max(1, int(avg_depth * 0.8))]))
    return out
def _get_max_depth(average_coverage, params, data):
    """Calculate maximum depth based on a rough multiplier of average coverage.
    """
    if dd.get_coverage_interval(data) == "genome":
        avg_cov = max(30.0, average_coverage)
        return avg_cov * params["high_multiplier"]
def _skip_duplicates(data):
    return dd.get_coverage_interval(data) == "amplicon" or not dd.get_mark_duplicates(data)
def _run_cnvkit_shared_orig(inputs, backgrounds): """Original CNVkit implementation with full normalization and segmentation. """ work_dir = _sv_workdir(inputs[0]) raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw")) background_name = dd.get_sample_name( backgrounds[0]) if backgrounds else "flat" background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name)) ckouts = [] for cur_input in inputs: cur_raw_work_dir = utils.safe_makedir( os.path.join(_sv_workdir(cur_input), "raw")) out_base, out_base_old = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir, cur_input) if utils.file_exists(out_base_old + ".cns"): out_base = out_base_old ckouts.append({"cnr": "%s.cnr" % out_base, "cns": "%s.cns" % out_base}) if not utils.file_exists(ckouts[0]["cns"]): cov_interval = dd.get_coverage_interval(inputs[0]) samples_to_run = zip(["background"] * len(backgrounds), backgrounds) + \ zip(["evaluate"] * len(inputs), inputs) # New style shared SV bins if tz.get_in(["depth", "bins", "target"], inputs[0]): target_bed = tz.get_in(["depth", "bins", "target"], inputs[0]) antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], inputs[0]) raw_coverage_cnns = reduce(operator.add, [ _get_general_coverage(cdata, itype) for itype, cdata in samples_to_run ]) # Back compatible with pre-existing runs else: target_bed, antitarget_bed = _get_original_targets(inputs[0]) raw_coverage_cnns = reduce(operator.add, [ _get_original_coverage(cdata, itype) for itype, cdata in samples_to_run ]) # Currently metrics not calculated due to speed and needing re-evaluation # We could re-enable with larger truth sets to evaluate background noise # But want to reimplement in a more general fashion as part of normalization if False: coverage_cnns = reduce(operator.add, [ _cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval, inputs + backgrounds) for cnns in tz.groupby("bam", raw_coverage_cnns).values() ]) background_cnn = cnvkit_background( _select_background_cnns(coverage_cnns), background_cnn, inputs, target_bed, antitarget_bed) else: coverage_cnns = raw_coverage_cnns background_cnn = cnvkit_background([ x["file"] for x in coverage_cnns if x["itype"] == "background" ], background_cnn, inputs, target_bed, antitarget_bed) parallel = { "type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"] } fixed_cnrs = run_multicore( _cnvkit_fix, [(cnns, background_cnn, inputs, ckouts) for cnns in tz.groupby( "bam", [x for x in coverage_cnns if x["itype"] == "evaluate"]).values()], inputs[0]["config"], parallel) [ _cnvkit_segment(cnr, cov_interval, data, inputs + backgrounds) for cnr, data in fixed_cnrs ] return ckouts
def _run_cnvkit_shared(items, test_bams, background_bams, work_dir, background_name=None):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files."""
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_cnn = os.path.join(raw_work_dir,
                                  "%s_background.cnn" % (background_name if background_name else "flat"))
    ckouts = []
    for test_bam in test_bams:
        out_base = os.path.splitext(os.path.basename(test_bam))[0].split(".")[0]
        ckouts.append({"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
                       "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cnr"]):
        data = items[0]
        cov_interval = dd.get_coverage_interval(data)
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, data)
        parallel = {"type": "local", "cores": dd.get_cores(data), "progs": ["cnvkit"]}
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     raw_work_dir, data)

        def _bam_to_itype(bam):
            return "background" if bam in background_bams else "evaluate"

        coverage_cnns = run_multicore(_cnvkit_coverage,
                                      [(bam, bed, _bam_to_itype(bam), raw_work_dir, data)
                                       for bam in test_bams + background_bams
                                       for bed in [target_bed, antitarget_bed]],
                                      data["config"], parallel)
        background_cnn = _cnvkit_background([x["file"] for x in coverage_cnns
                                             if x["itype"] == "background"],
                                            background_cnn, target_bed, antitarget_bed, data)
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, data)
                                    for cnns in tz.groupby(lambda x: x["bam"],
                                                           [x for x in coverage_cnns
                                                            if x["itype"] == "evaluate"]).values()],
                                   data["config"], parallel)
        run_multicore(_cnvkit_segment, [(cnr, cov_interval, data) for cnr in fixed_cnrs],
                      data["config"], parallel)
    return ckouts
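# --- Made-up file name showing the out_base derivation in _run_cnvkit_shared:
# splitext drops the ".bam" extension, and the extra split on "." trims any
# remaining processing suffixes down to the bare sample name.
import os

test_bam = "/path/to/work/sample1.sorted.bam"
out_base = os.path.splitext(os.path.basename(test_bam))[0].split(".")[0]
print(out_base)  # -> sample1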
def _run_titancna(cn_file, het_file, ploidy, num_clusters, work_dir, data):
    """Run the titanCNA wrapper script on a given ploidy and cluster count."""
    sample = dd.get_sample_name(data)
    cores = dd.get_num_cores(data)
    export_cmd = utils.get_R_exports()
    ploidy_dir = utils.safe_makedir(os.path.join(work_dir, "run_ploidy%s" % ploidy))
    cluster_dir = "%s_cluster%02d" % (sample, num_clusters)
    out_dir = os.path.join(ploidy_dir, cluster_dir)
    if not utils.file_uptodate(out_dir + ".titan.txt", cn_file):
        with tx_tmpdir(data) as tmp_dir:
            with utils.chdir(tmp_dir):
                cmd = ("{export_cmd} && titanCNA.R --id {sample} --hetFile {het_file} "
                       "--cnFile {cn_file} --numClusters {num_clusters} --ploidy {ploidy} "
                       "--numCores {cores} --outDir {tmp_dir} --libdir None")
                chroms = ["'%s'" % c.name.replace("chr", "")
                          for c in ref.file_contigs(dd.get_ref_file(data))
                          if chromhacks.is_autosomal_or_x(c.name)]
                if "'X'" not in chroms:
                    chroms += ["'X'"]
                # Use UCSC style naming for human builds to support BSgenome
                genome_build = ("hg19" if dd.get_genome_build(data) in ["GRCh37", "hg19"]
                                else dd.get_genome_build(data))
                cmd += """ --chrs "c(%s)" """ % ",".join(chroms)
                cmd += " --genomeBuild {genome_build}"
                if data["genome_build"] in ("hg19", "hg38"):
                    cmd += " --genomeStyle UCSC"
                if data["genome_build"] in ["hg38"]:
                    data_dir = os.path.normpath(os.path.join(
                        os.path.dirname(os.path.realpath(os.path.join(
                            os.path.dirname(utils.Rscript_cmd()), "titanCNA.R"))),
                        os.pardir, os.pardir, "data"))
                    cytoband_file = os.path.join(data_dir, "cytoBand_hg38.txt")
                    assert os.path.exists(cytoband_file), cytoband_file
                    cmd += " --cytobandFile %s" % cytoband_file
                # TitanCNA's model is influenced by the variance in read coverage data
                # and data type: set reasonable defaults for non-WGS runs
                # (see https://github.com/gavinha/TitanCNA/tree/master/scripts/R_scripts)
                if dd.get_coverage_interval(data) != "genome":
                    cmd += " --alphaK=2500 --alphaKHigh=2500"
                do.run(cmd.format(**locals()),
                       "TitanCNA CNV detection: ploidy %s, cluster %s" % (ploidy, num_clusters))
            for fname in glob.glob(os.path.join(tmp_dir, cluster_dir + "*")):
                shutil.move(fname, ploidy_dir)
            if os.path.exists(os.path.join(tmp_dir, "Rplots.pdf")):
                shutil.move(os.path.join(tmp_dir, "Rplots.pdf"),
                            os.path.join(ploidy_dir, "%s.Rplots.pdf" % cluster_dir))
    return ploidy_dir
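# --- Made-up contig list showing how the --chrs argument in _run_titancna is
# assembled: "chr" prefixes are stripped and each name is single-quoted so the
# string parses as an R character vector.
contigs = ["chr1", "chr2", "chrX"]
chroms = ["'%s'" % c.replace("chr", "") for c in contigs]
print(""" --chrs "c(%s)" """ % ",".join(chroms))
# ->  --chrs "c('1','2','X')"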