def clean_inputs(data):
    """Clean BED input files to avoid overlapping segments that cause downstream issues.

    Pre-merges inputs to avoid needing to call multiple times during later parallel steps.
    """
    if not utils.get_in(data, ("config", "algorithm", "variant_regions_orig")):
        data["config"]["algorithm"]["variant_regions_orig"] = dd.get_variant_regions(data)
    clean_vr = clean_file(dd.get_variant_regions(data), data)
    merged_vr = merge_overlaps(clean_vr, data)
    data["config"]["algorithm"]["variant_regions"] = clean_vr
    data["config"]["algorithm"]["variant_regions_merged"] = merged_vr
    if dd.get_coverage(data):
        if not utils.get_in(data, ("config", "algorithm", "coverage_orig")):
            data["config"]["algorithm"]["coverage_orig"] = dd.get_coverage(data)
        clean_cov_bed = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_cov_bed = merge_overlaps(clean_cov_bed, data)
        data["config"]["algorithm"]["coverage"] = clean_cov_bed
        data["config"]["algorithm"]["coverage_merged"] = merged_cov_bed
    if 'seq2c' in get_svcallers(data):
        seq2c_ready_bed = prep_seq2c_bed(data)
        if not seq2c_ready_bed:
            logger.warning("Can't run Seq2C without a svregions or variant_regions BED file")
        else:
            data["config"]["algorithm"]["seq2c_bed_ready"] = seq2c_ready_bed
    return data

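# A runnable sketch of the preserve-then-overwrite pattern clean_inputs uses:
# the original value is stashed under a "*_orig" key once, so re-runs do not
# clobber the user-supplied input. Names and paths here are illustrative,
# not bcbio's API.
def _stash_original(config, key, cleaned_value):
    """Record config[key] under key + '_orig' once, then replace it."""
    orig_key = key + "_orig"
    if not config.get(orig_key):
        config[orig_key] = config.get(key)
    config[key] = cleaned_value
    return config

if __name__ == "__main__":
    cfg = {"variant_regions": "/path/targets.bed"}
    _stash_original(cfg, "variant_regions", "/path/targets-clean.bed")
    assert cfg["variant_regions_orig"] == "/path/targets.bed"
    assert cfg["variant_regions"] == "/path/targets-clean.bed"
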
def _prep_bed(data, work_dir):
    """Select the BED file, clean it, and annotate it with gene names for Seq2C.
    """
    bed_file = regions.get_sv_bed(data)
    if bed_file:
        bed_file = clean_file(bed_file, data, prefix="svregions-")
    else:
        bed_file = clean_file(dd.get_variant_regions(data), data)
    col_num = bt.BedTool(bed_file).field_count()
    if col_num < 4:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        if annotated_file == bed_file:
            raise ValueError("BED file for Seq2C must be annotated with gene names, "
                             "however the input BED is 3-columns and we have no transcript "
                             "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)
    else:
        annotated_file = bed_file
    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if not utils.file_uptodate(ready_file, annotated_file):
        bed = bt.BedTool(annotated_file)
        if col_num > 4 and col_num != 8:
            bed = bed.cut(range(4))
        bed = bed.filter(lambda x: x.name not in ["", ".", "-"])
        with file_transaction(data, ready_file) as tx_out_file:
            bed.saveas(tx_out_file)
    logger.debug("Saved Seq2C clean annotated ready input BED into " + ready_file)
    return ready_file

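# A runnable pybedtools sketch of the two checks above: field_count() reports
# the number of BED columns, and intervals with empty or placeholder names
# ("", ".", "-") are dropped before handing the file to Seq2C. The intervals
# below are made up for illustration (requires pybedtools on the path).
import pybedtools

_example = pybedtools.BedTool("chr1\t100\t200\tGENE1\nchr1\t300\t400\t.\n",
                              from_string=True)
assert _example.field_count() == 4
_named = _example.filter(lambda x: x.name not in ["", ".", "-"]).saveas()
assert [iv.name for iv in _named] == ["GENE1"]
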
def calculate(bam_file, data):
    """Calculate coverage in parallel using mosdepth.

    Removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    params = {"min": dd.get_coverage_depth_min(data)}
    variant_regions = dd.get_variant_regions_merged(data)
    if not variant_regions:
        variant_regions = _create_genome_regions(data)
    # Back compatible with previous pre-mosdepth callable files
    callable_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                                 dd.get_sample_name(data))),
                                 "%s-coverage.callable.bed" % (dd.get_sample_name(data)))
    if not utils.file_uptodate(callable_file, bam_file):
        vr_quantize = ("0:1:%s:" % (params["min"]), ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"])
        to_calculate = [("variant_regions", variant_regions, vr_quantize, None),
                        ("sv_regions", bedutils.clean_file(regions.get_sv_bed(data), data), None, None),
                        ("coverage", bedutils.clean_file(dd.get_coverage(data), data), None, DEPTH_THRESHOLDS)]
        depth_files = {}
        for target_name, region_bed, quantize, thresholds in to_calculate:
            if region_bed:
                cur_depth = {}
                depth_info = run_mosdepth(data, target_name, region_bed, quantize=quantize,
                                          thresholds=thresholds)
                for attr in ("dist", "regions", "thresholds"):
                    val = getattr(depth_info, attr, None)
                    if val:
                        cur_depth[attr] = val
                depth_files[target_name] = cur_depth
                if target_name == "variant_regions":
                    callable_file = depth_info.quantize
    else:
        depth_files = {}
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return final_callable, depth_files

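# A runnable sketch of how the mosdepth-style quantize spec above maps read
# depth to callability labels: cut points "0:1:<min>:" define half-open bins
# [0, 1), [1, min), [min, inf) named NO_COVERAGE / LOW_COVERAGE / CALLABLE.
# This illustrates the binning only, not mosdepth's implementation; the 4
# stands in for a hypothetical minimum coverage depth.
def depth_label(depth, cutpoints=(0, 1, 4),
                labels=("NO_COVERAGE", "LOW_COVERAGE", "CALLABLE")):
    """Return the label of the quantize bin containing `depth`."""
    for cut, label in reversed(list(zip(cutpoints, labels))):
        if depth >= cut:
            return label

assert depth_label(0) == "NO_COVERAGE"
assert depth_label(2) == "LOW_COVERAGE"
assert depth_label(10) == "CALLABLE"
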
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", []))
    results_file = os.path.join(results_dir, "genome_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(results_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir)
            cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                   "--skip-duplicated --skip-dup-mode 0 "
                   "-nt {num_cores} {options}")
            species = None
            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                    or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                species = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                species = "MOUSE"
            if species in ["HUMAN", "MOUSE"]:
                cmd += " -gd {species}"
            regions = (dd.get_coverage(data) if dd.get_coverage(data) not in [None, False, "None"]
                       else dd.get_variant_regions_merged(data))
            if regions:
                regions = bedutils.merge_overlaps(bedutils.clean_file(regions, data), data)
                bed6_regions = _bed_to_bed6(regions, out_dir)
                cmd += " -gff {bed6_regions}"
            bcbio_env = utils.get_bcbio_env()
            do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data), env=bcbio_env)
            tx_results_file = os.path.join(tx_results_dir, "genome_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), tx_results_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above).
    # However, in order to keep its name after upload, we need to put the base QC file (results_file)
    # into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {"base": base_results_file,
            "secondary": _find_qualimap_secondary_files(results_dir, base_results_file)}

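# Qualimap's -gff option accepts six-column (BED6) regions. A runnable sketch
# of the conversion _bed_to_bed6 presumably performs: pad 3-4 column BED lines
# with name/score/strand placeholders. This helper is hypothetical, not
# bcbio's implementation.
def to_bed6_line(line, idx):
    """Pad a BED line out to chrom/start/end/name/score/strand."""
    fields = line.rstrip("\n").split("\t")
    chrom, start, end = fields[:3]
    name = fields[3] if len(fields) > 3 else "region_%s" % idx
    return "\t".join([chrom, start, end, name, "0", "+"])

assert to_bed6_line("chr1\t10\t100\n", 1) == "chr1\t10\t100\tregion_1\t0\t+"
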
def summarize(calls, data, items):
    """Summarize results from multiple callers into a single flattened BED file.

    Approach:
    - Combine all calls found in all files
    - Filter files retaining those present with multiple levels of support.
    - Remove calls in high depth regions.
    - Remove calls with ends overlapping exclusion regions like low complexity regions.
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    with shared.bedtools_tmpdir(data):
        input_beds = [(c["variantcaller"], _create_bed(c, sample, work_dir, calls, data))
                      for c in calls]
        input_beds = [xs for xs in input_beds if xs[1] is not None and utils.file_exists(xs[1])]
        if len(input_beds) > 0:
            out_file = combine_bed_by_size([xs[1] for xs in input_beds], sample, work_dir, data)
            if utils.file_exists(out_file):
                if len(input_beds) > N_FILTER_CALLERS:
                    filter_file = _filter_ensemble(out_file, data)
                else:
                    filter_file = out_file
                limit_file = shared.remove_highdepth_regions(filter_file, items)
                exclude_files = [f for f in [x.get("exclude_file") for x in calls] if f]
                exclude_file = exclude_files[0] if len(exclude_files) > 0 else None
                if exclude_file:
                    noexclude_file, _ = sshared.exclude_by_ends(limit_file, exclude_file, data)
                else:
                    noexclude_file = limit_file
                bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(noexclude_file), "bedprep"))
                if utils.file_exists(noexclude_file):
                    calls.append({"variantcaller": "sv-ensemble",
                                  "input_beds": input_beds,
                                  "vrn_file": bedutils.clean_file(noexclude_file, data,
                                                                  bedprep_dir=bedprep_dir)})
    return calls

def get_base_cnv_regions(data, work_dir, genome_default="transcripts1e4", include_gene_names=True):
    """Retrieve set of target regions for CNV analysis.

    Subsets to extended transcript regions for WGS experiments to avoid long runtimes.
    """
    cov_interval = dd.get_coverage_interval(data)
    base_regions = get_sv_bed(data, include_gene_names=include_gene_names)
    # if we don't have a configured BED or regions to use for SV calling
    if not base_regions:
        # For genome calls, subset to regions near genes as targets
        if cov_interval == "genome":
            base_regions = get_sv_bed(data, genome_default, work_dir,
                                      include_gene_names=include_gene_names)
            if base_regions:
                base_regions = remove_exclude_regions(base_regions, base_regions, [data])
        # Finally, default to the defined variant regions
        if not base_regions:
            base_regions = dd.get_variant_regions(data)
    return bedutils.clean_file(base_regions, data)

def priority_total_coverage(data, out_dir):
    """Calculate coverage at 10 depth intervals in the priority regions.
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return {}
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    cleaned_bed = clean_file(bed_file, data, prefix="svprioritize-")
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if utils.file_uptodate(out_file, cleaned_bed) and utils.file_uptodate(out_file, in_bam):
        return out_file
    cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                 depth_thresholds=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    with file_transaction(out_file) as tx_out_file:
        message = "Calculating region coverage of {bed_file} in {in_bam}"
        do.run(cmdl + " -o " + tx_out_file, message.format(**locals()))
    logger.debug("Saved svprioritize coverage into " + out_file)
    return out_file

def summary(items):
    cutoff = 4  # coverage for completeness
    out_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "coverage"))
    clean_bed = bedutils.clean_file(tz.get_in(["config", "algorithm", "coverage"], items[0]),
                                    items[0])
    bed_file = _uniquify_bed_names(clean_bed, out_dir, items[0])
    batch = _get_group_batch(items)
    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            chanjo = os.path.join(os.path.dirname(sys.executable), "chanjo")
            cmd = "{chanjo} --db {tx_out_file} build {bed_file}"
            do.run(cmd.format(**locals()), "Prep chanjo database")
            for data in items:
                sample = dd.get_sample_name(data)
                bam_file = data["work_bam"]
                cmd = ("{chanjo} annotate -s {sample} -g {batch} -c {cutoff} {bam_file} {bed_file} | "
                       "{chanjo} --db {tx_out_file} import")
                do.run(cmd.format(**locals()), "Chanjo coverage", data)
    out = []
    for data in items:
        data["coverage"] = {"summary": out_file}
        out.append([data])
    return out

def compare_to_rm(data):
    """Compare final variant calls against reference materials of known calls.
    """
    if isinstance(data, (list, tuple)):
        data = _normalize_cwl_inputs(data)
    toval_data = _get_validate(data)
    if toval_data:
        caller = _get_caller(toval_data)
        sample = dd.get_sample_name(toval_data)
        base_dir = utils.safe_makedir(os.path.join(toval_data["dirs"]["work"], "validate",
                                                   sample, caller))
        if isinstance(toval_data["vrn_file"], (list, tuple)):
            raise NotImplementedError("Multiple input files for validation: %s"
                                      % toval_data["vrn_file"])
        else:
            vrn_file = os.path.abspath(toval_data["vrn_file"])
        rm_file = normalize_input_path(toval_data["config"]["algorithm"]["validate"], toval_data)
        rm_interval_file = _gunzip(normalize_input_path(
            toval_data["config"]["algorithm"].get("validate_regions"), toval_data), toval_data)
        rm_interval_file = bedutils.clean_file(rm_interval_file, toval_data,
                                               bedprep_dir=utils.safe_makedir(os.path.join(base_dir, "bedprep")))
        rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(data),
                                         data.get("genome_build"), base_dir, data)
        rm_interval_file = (naming.handle_synonyms(rm_interval_file, dd.get_ref_file(data),
                                                   data.get("genome_build"), base_dir, data)
                            if rm_interval_file else None)
        vmethod = tz.get_in(["config", "algorithm", "validate_method"], data, "rtg")
        if not vcfutils.vcf_has_variants(vrn_file):
            # RTG can fail on totally empty files. Skip these since we have nothing.
            pass
        # empty validation file, every call is a false positive
        elif not vcfutils.vcf_has_variants(rm_file):
            eval_files = _setup_call_fps(vrn_file, rm_interval_file, base_dir, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod == "rtg":
            eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod == "hap.py":
            data["validate"] = _run_happy_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data)
        elif vmethod == "bcbio.variation":
            data["validate"] = _run_bcbio_variation(vrn_file, rm_file, rm_interval_file,
                                                    base_dir, sample, caller, toval_data)
    return [[data]]

def priority_coverage(data, out_dir):
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return data
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    if file_exists(out_file):
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
        with file_transaction(out_file) as tx_out_file:
            parse_cmd = "awk '{print $1\"\t\"$2\"\t\"$2\"\t\"$3\"\t\"$10}' | sed '1d'"
            cmd = ("{sambamba} depth base -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "{in_bam} | {parse_cmd} > {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    return out_file

def priority_total_coverage(data, out_dir):
    """Calculate coverage at 10 depth intervals in the priority regions.
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return {}
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        # data['priority_total_coverage'] = os.path.abspath(out_file)
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = clean_file(bed_file, data)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    # data['priority_total_coverage'] = os.path.abspath(out_file)
    return out_file

def summary(items):
    cutoff = DEFAULT_COVERAGE_CUTOFF
    data = items[0]
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "coverage"))
    coverage_bed = dd.get_coverage_regions(data)
    priority_bed = dd.get_priority_regions(data)
    combined_bed = bed.concat([coverage_bed, priority_bed])
    clean_bed = bedutils.clean_file(combined_bed.fn, data) if len(combined_bed) > 0 else combined_bed.fn
    bed_file = _uniquify_bed_names(clean_bed, out_dir, data)
    batch = _get_group_batch(items)
    assert batch, ("Did not find batch for samples: %s"
                   % ",".join([dd.get_sample_name(x) for x in items]))
    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file) and utils.file_exists(bed_file):
        with file_transaction(data, out_file) as tx_out_file:
            chanjo = os.path.join(os.path.dirname(sys.executable), "chanjo")
            cmd = "{chanjo} --db {tx_out_file} build {bed_file}"
            do.run(cmd.format(**locals()), "Prep chanjo database")
            for data in items:
                sample = dd.get_sample_name(data)
                bam_file = data["work_bam"]
                cmd = ("{chanjo} annotate -s {sample} -g {batch} -c {cutoff} "
                       "{bam_file} {bed_file} | "
                       "{chanjo} --db {tx_out_file} import")
                do.run(cmd.format(**locals()), "Chanjo coverage", data)
    incomplete = incomplete_regions(out_file, batch, out_dir)
    out = []
    for data in items:
        if utils.file_exists(out_file):
            data["coverage"] = {"summary": out_file, "incomplete": incomplete}
        out.append([data])
    return out

def summarize(calls, data):
    """Summarize results from multiple callers into a single flattened BED file.
    """
    import pybedtools
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    out_file = os.path.join(work_dir, "%s-ensemble.bed" % sample)
    with shared.bedtools_tmpdir(data):
        input_beds = [x for x in [_create_bed(c, out_file, data) for c in calls]
                      if x is not None]
        if len(input_beds) > 0:
            size_beds = []
            for e_start, e_end in validate.EVENT_SIZES:
                base, ext = os.path.splitext(out_file)
                size_out_file = "%s-%s_%s%s" % (base, e_start, e_end, ext)
                if not utils.file_exists(size_out_file):
                    with file_transaction(data, size_out_file) as tx_out_file:
                        with shared.bedtools_tmpdir(data):
                            all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                            with open(all_file, "w") as out_handle:
                                for line in fileinput.input(input_beds):
                                    chrom, start, end = line.split()[:3]
                                    size = int(end) - int(start)
                                    if size >= e_start and size < e_end:
                                        out_handle.write(line)
                            pybedtools.BedTool(all_file).sort(stream=True)\
                                .merge(c=4, o="distinct", delim=",").saveas(tx_out_file)
                size_beds.append(size_out_file)
            out_file = bedutils.combine(size_beds, out_file, data["config"])
    if utils.file_exists(out_file):
        bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "bedprep"))
        calls.append({"variantcaller": "ensemble",
                      "vrn_file": bedutils.clean_file(out_file, data, bedprep_dir=bedprep_dir)})
    return calls

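# A runnable sketch of the size binning above: keep BED lines whose event
# length falls in the half-open window [e_start, e_end). The window values
# here are made-up stand-ins for validate.EVENT_SIZES.
def in_size_bin(bed_line, e_start, e_end):
    """True when end - start of a BED line is within [e_start, e_end)."""
    chrom, start, end = bed_line.split()[:3]
    return e_start <= int(end) - int(start) < e_end

assert in_size_bin("chr1\t100\t350\tdel", 100, 1000)
assert not in_size_bin("chr1\t100\t120\tdel", 100, 1000)
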
def coverage_region_detailed_stats(data, out_dir):
    """Calculate coverage at different completeness cutoffs
    for the regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return None
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam):
            pass
        else:
            with file_transaction(data, parse_file) as out_tx:
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=[1, 5, 10, 20, 40, 50, 60, 70, 80, 100],
                                             max_cov=1000)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample, data=data)
        parse_file = _calculate_percentiles(os.path.abspath(parse_file), sample, data=data)
    return os.path.abspath(parse_file)

def coverage(data, out_dir):
    """Calculate coverage at different completeness cutoffs
    for the regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    sambamba = config_utils.get_program("sambamba", data["config"])
    work_dir = safe_makedir(out_dir)
    if not bed_file:
        return None
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir) as tmp_dir:
                with file_transaction(parse_file) as out_tx:
                    cmd = ("{sambamba} depth region -F \"not unmapped\" -t {cores} "
                           "%s -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                           "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                           "chrom/chrom/' > {out_tx}")
                    do.run(cmd.format(**locals()) % "-C 1000",
                           "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        parse_file = _calculate_percentiles(os.path.abspath(parse_file), sample)
    return os.path.abspath(parse_file)

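# The command above is filled in two passes: str.format first substitutes the
# {named} locals while leaving the bare %s untouched, then the % operator
# drops in the max-coverage flag. A runnable illustration of that ordering:
template = "tool -t {cores} %s -L {bed}"
cmd = template.format(cores=4, bed="regions.bed") % "-C 1000"
assert cmd == "tool -t 4 -C 1000 -L regions.bed"
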
def coverage_region_detailed_stats(data, out_dir, extra_cutoffs=None):
    """Calculate coverage at different completeness cutoffs
    for the regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file or not utils.file_exists(bed_file):
        return []
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    cutoffs = {1, 5, 10, 20, 50, 100, 250, 500, 1000, 5000, 10000, 50000}
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam):
            pass
        else:
            with file_transaction(data, parse_file) as out_tx:
                depth_thresholds = sorted(list(cutoffs | (extra_cutoffs or set())))
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=depth_thresholds)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        out_files = _calculate_percentiles(os.path.abspath(parse_file), sample, data=data,
                                           cutoffs=cutoffs)
    return [os.path.abspath(x) for x in out_files]

def coverage_region_detailed_stats(data, out_dir):
    """Calculate coverage at different completeness cutoffs
    for the regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return None
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        parse_file = os.path.join(sample + "_coverage.bed")
        if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam):
            pass
        else:
            with file_transaction(parse_file) as out_tx:
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=[1, 5, 10, 20, 40, 50, 60, 70, 80, 100],
                                             max_cov=1000)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        parse_file = _calculate_percentiles(os.path.abspath(parse_file), sample)
    return os.path.abspath(parse_file)

def variants(data, out_dir):
    """Variants QC metrics"""
    if "variants" not in data:
        return None
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    bcfstats = _run_bcftools(data, work_dir)
    bed_file = dd.get_coverage(data)
    bcf_out = os.path.join(sample + "_bcbio_variants_stats.txt")
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    with chdir(work_dir):
        if not file_exists(bcf_out):
            with open(bcf_out, "w") as out_handle:
                yaml.safe_dump(bcfstats, out_handle, default_flow_style=False,
                               allow_unicode=False)
        if "vrn_file" not in data or not bed_file:
            return None
        in_vcf = data['vrn_file']
        cleaned_bed = clean_file(bed_file, data)
        if file_exists(qc_file):
            return qc_file
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(parse_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", cleaned_bed,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])
            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                           "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                           "Calculating GC content and depth for %s" % in_vcf)
            logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # These files will be copied to the final output directory
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                remove_plus(cg_file)

def _merge_by_batch(batch, fnames):
    """Merge all calls in a family into a single callset.
    """
    merge_dir = utils.safe_makedir(os.path.join(os.getcwd(), "merged"))
    clean_dir = utils.safe_makedir(os.path.join(merge_dir, "clean"))
    merge_file = os.path.join(merge_dir, "%s-ensemble.bed" % batch)
    if not utils.file_uptodate(merge_file, fnames[0]):
        for fname in glob.glob(os.path.join(merge_dir, "%s-ensemble*" % batch)):
            os.remove(fname)
        ensemble.combine_bed_by_size(fnames, batch, merge_dir, {}, delim="&&")
    return bedutils.clean_file(merge_file, {}, bedprep_dir=clean_dir)

def summary(items):
    data = items[0]
    cutoff = dd.get_coverage_depth_min(data)
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "coverage"))
    coverage_bed = dd.get_coverage_regions(data)
    priority_bed = dd.get_priority_regions(data)
    batch = _get_group_batch(items)
    assert batch, ("Did not find batch for samples: %s"
                   % ",".join([dd.get_sample_name(x) for x in items]))
    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file):
        if coverage_bed:
            mini_coverage = bed.minimize(coverage_bed).fn
        if priority_bed:
            mini_priority = bed.minimize(priority_bed).fn
        if coverage_bed and priority_bed:
            combined_bed = bed.concat([mini_coverage, mini_priority]).fn
        elif coverage_bed:
            combined_bed = mini_coverage
        elif priority_bed:
            combined_bed = mini_priority
        else:
            # no coverage or priority file has been set
            return items
        # combined_bed is always a file path at this point
        clean_bed = bedutils.clean_file(combined_bed, data)
        bed_file = _uniquify_bed_names(clean_bed, out_dir, data)
        if bed_file and utils.file_exists(bed_file):
            with file_transaction(data, out_file) as tx_out_file:
                chanjo = os.path.join(os.path.dirname(sys.executable), "chanjo")
                cmd = "{chanjo} --db {tx_out_file} build {bed_file}"
                do.run(cmd.format(**locals()), "Prep chanjo database")
                for data in items:
                    sample = dd.get_sample_name(data)
                    bam_file = data["work_bam"]
                    cmd = ("{chanjo} annotate -s {sample} -g {batch} -c {cutoff} "
                           "{bam_file} {bed_file} | "
                           "{chanjo} --db {tx_out_file} import")
                    do.run(cmd.format(**locals()), "Chanjo coverage", data)
        if bed_file:
            os.remove(bed_file)
    coverage = regions_coverage(out_file, batch, out_dir)
    problem_regions = dd.get_problem_region_dir(data)
    if problem_regions:
        coverage = decorate_problem_regions(coverage, problem_regions)
    out = []
    for data in items:
        if utils.file_exists(out_file):
            data["coverage"] = {"summary": out_file, "all": coverage}
        out.append([data])
    return out

def calculate(bam_file, data, sv_bed):
    """Calculate coverage in parallel using mosdepth.

    Removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    params = {"min": dd.get_coverage_depth_min(data)}
    variant_regions = dd.get_variant_regions_merged(data)
    if not variant_regions:
        variant_regions = _create_genome_regions(data)
    # Back compatible with previous pre-mosdepth callable files
    callable_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                                 dd.get_sample_name(data))),
                                 "%s-coverage.callable.bed" % (dd.get_sample_name(data)))
    if not utils.file_uptodate(callable_file, bam_file):
        vr_quantize = ("0:1:%s:" % (params["min"]), ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"])
        to_calculate = [("variant_regions", variant_regions, vr_quantize, None,
                         "coverage_perbase" in dd.get_tools_on(data)),
                        ("sv_regions", bedutils.clean_file(sv_bed, data, prefix="svregions-"),
                         None, None, False),
                        ("coverage", bedutils.clean_file(dd.get_coverage(data), data, prefix="cov-"),
                         None, DEPTH_THRESHOLDS, False)]
        depth_files = {}
        for target_name, region_bed, quantize, thresholds, per_base in to_calculate:
            if region_bed:
                cur_depth = {}
                depth_info = run_mosdepth(data, target_name, region_bed, quantize=quantize,
                                          thresholds=thresholds, per_base=per_base)
                for attr in ("dist", "regions", "thresholds", "per_base"):
                    val = getattr(depth_info, attr, None)
                    if val:
                        cur_depth[attr] = val
                depth_files[target_name] = cur_depth
                if target_name == "variant_regions":
                    callable_file = depth_info.quantize
    else:
        depth_files = {}
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return final_callable, depth_files

def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.safe_load(in_handle)
    config["config"] = {}
    config["dirs"] = {"work": os.getcwd()}
    groups = organize_vcf_reps(tz.get_in(["inputs", "vcfs"], config),
                               tz.get_in(["inputs", "namere"], config), config["remap"])
    groups = add_bams(tz.get_in(["inputs", "bams"], config),
                      tz.get_in(["inputs", "namere"], config), groups, config["remap"])
    bed_file = bedutils.clean_file(tz.get_in(["inputs", "regions"], config), config) + ".gz"
    groups = preprocess_vcfs(groups, bed_file, config["resources"], config["annotations"],
                             config.get("filters", []))
    #pprint.pprint(groups)
    incon = {}
    for name, fnames in groups.items():
        incon[name] = find_inconsistent(name, fnames["vcf"], bed_file, config["resources"])
    incon_check, totals, counts = [], [], []
    for name, info in sorted(incon.items(), key=lambda x: np.mean(x[1]["counts"]), reverse=True):
        totals.extend(info["totals"])
        counts.extend(info["counts"])
        print(name, info["counts"])
        if np.mean(info["counts"]) > 100:
            incon_check.extend(investigate_high_counts(info["summary"], info["vcf_files"]))
    totalm = np.median(totals)
    countm = np.median(counts)
    print("Overall discordants: %s-%s; %s-%s; %s / %s => %.1f%%" % (
        min(counts), max(counts), min(totals), max(totals), countm, totalm,
        countm * 100.0 / totalm))
    for to_check in incon_check:
        deconvolute_inconsistent(to_check, groups, bed_file)
    disc_bed, incon = identify_shared_discordants(incon)
    filtered_bed = merge_filtered(incon)
    # only use filtered since annotations supplied upstream now
    #ann_bed = annotate_disc_bed(disc_bed, filtered_bed, config["annotations"])
    #remain_disc = check_annotated_disc(ann_bed, incon, config["annotations"])
    ann_bed = annotate_disc_bed(disc_bed, filtered_bed, {})
    remain_disc = check_annotated_disc(ann_bed, incon, {})
    summarize_remaining_disc(incon)
    if len(remain_disc) < 10:
        identify_discordant_reasons(remain_disc, incon)
    calculate_annotation_overlap(bed_file, filtered_bed, config["annotations"])

def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis"""
    out = dict()
    total_reads = sambamba.number_of_reads(data, bam_file)
    out['Total_reads'] = total_reads
    mapped = sambamba.number_of_mapped_reads(data, bam_file)
    out['Mapped_reads'] = mapped
    if total_reads:
        out['Mapped_reads_pct'] = 100.0 * mapped / total_reads
    if mapped:
        mapped_unique = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False)
        out['Mapped_unique_reads'] = mapped_unique
        mapped_dups = mapped - mapped_unique
        out['Duplicates'] = mapped_dups
        out['Duplicates_pct'] = 100.0 * mapped_dups / mapped

        if dd.get_coverage(data):
            cov_bed_file = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
            merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
            target_name = "coverage"
        else:
            merged_bed_file = dd.get_variant_regions_merged(data)
            target_name = "variant_regions"

        ontarget = sambamba.number_mapped_reads_on_target(
            data, merged_bed_file, bam_file, keep_dups=False, target_name=target_name)
        if mapped_unique:
            out["Ontarget_unique_reads"] = ontarget
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            padded_bed_file = bedutils.get_padded_bed_file(merged_bed_file, 200, data)
            ontarget_padded = sambamba.number_mapped_reads_on_target(
                data, padded_bed_file, bam_file, keep_dups=False,
                target_name=target_name + "_padded")
            out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads

        avg_coverage = get_average_coverage(data, bam_file, merged_bed_file, target_name)
        out['Avg_coverage'] = avg_coverage

    priority = cov.priority_coverage(data, out_dir)
    cov.priority_total_coverage(data, out_dir)
    region_coverage_file = cov.coverage_region_detailed_stats(data, out_dir)
    # Re-enable with annotations from internally installed
    # problem region directory
    # if priority:
    #     annotated = cov.decorate_problem_regions(priority, problem_regions)
    return out

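# A runnable sketch of the on/off-target arithmetic above, with made-up read
# counts. Percentages are relative to uniquely mapped reads, except the
# usable percentage, which is relative to all reads.
def target_metrics(total_reads, mapped_unique, ontarget):
    """Return the Ontarget/Offtarget/Usable percentages as a dict."""
    return {"Ontarget_pct": 100.0 * ontarget / mapped_unique,
            "Offtarget_pct": 100.0 * (mapped_unique - ontarget) / mapped_unique,
            "Usable_pct": 100.0 * ontarget / total_reads}

assert target_metrics(1000, 800, 600) == {
    "Ontarget_pct": 75.0, "Offtarget_pct": 25.0, "Usable_pct": 60.0}
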
def _validate_caller_vcf(call_vcf, truth_vcf, callable_regions, svcaller, detail_dir, data):
    """Validate a caller VCF against truth within callable regions, returning stratified stats
    """
    stats = _calculate_comparison_stats(truth_vcf)
    callable_regions = bedutils.clean_file(callable_regions, data)
    callable_bed = pybedtools.BedTool(callable_regions).merge(d=stats["merge_size"]).saveas().fn

    match_calls = set([])
    truth_stats = {"tp": [], "fn": [], "fp": []}
    detail_handles = {}
    for stat in ["tp", "tp-baseline", "fn", "fp"]:
        detail_handles[stat] = open(os.path.join(detail_dir, "%s.vcf" % stat), "w")
    calls_by_region = {}
    call_vcf = slim_vcf(call_vcf, data)
    for call in _callable_intersect(call_vcf, callable_bed, data):
        calls_by_region[tuple(call[-3:])] = call

    truth = None
    regions = []
    for parts in _callable_intersect(truth_vcf, callable_bed, data):
        cur_region = tuple(parts[-3:])
        cur_truth = parts
        if truth is None:
            truth = cur_truth
        if _get_key(cur_truth) == _get_key(truth):
            regions.append(cur_region)
        else:
            match_calls, truth_stats = _check_call(truth, regions, calls_by_region,
                                                   match_calls, truth_stats, detail_handles)
            truth = cur_truth
            regions = [cur_region]

    with utils.open_gzipsafe(call_vcf) as in_handle:
        for call in (l.split("\t") for l in in_handle if not l.startswith("#")):
            start, end = _get_start_end(call)
            if end:
                key = _get_key(call)
                if key not in match_calls:
                    call_info = _summarize_call(key)
                    if _event_passes(call_info, stats):
                        detail_handles["fp"].write("\t".join(call))
                        truth_stats["fp"].append(call_info)
    return _to_csv(truth_stats, stats, dd.get_sample_name(data), svcaller)

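# A runnable sketch of the identity key used when matching calls to truth:
# columns 0-4 (CHROM, POS, ID, REF, ALT) plus column 7 (INFO), mirroring the
# tuple(call[:5] + call[7:8]) slices used by the non-detail variants below.
# The VCF line is made up for illustration.
vcf_line = "chr1\t10000\t.\tN\t<DEL>\t.\tPASS\tSVTYPE=DEL;END=12000\n"
fields = vcf_line.rstrip("\n").split("\t")
key = tuple(fields[:5] + fields[7:8])
assert key == ("chr1", "10000", ".", "N", "<DEL>", "SVTYPE=DEL;END=12000")
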
def summarize(calls, data):
    """Summarize results from multiple callers into a single flattened BED file.
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    with shared.bedtools_tmpdir(data):
        input_beds = [x for x in [_create_bed(c, sample, work_dir, data) for c in calls]
                      if x is not None]
        if len(input_beds) > 0:
            out_file = _combine_bed_by_size(input_beds, sample, work_dir, data)
            if utils.file_exists(out_file):
                bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "bedprep"))
                calls.append({"variantcaller": "ensemble",
                              "vrn_file": bedutils.clean_file(out_file, data,
                                                              bedprep_dir=bedprep_dir)})
    return calls

def _subset_to_sample(bed_file, vcf_file, data):
    """Convert the global BED file into sample specific calls.
    """
    name = dd.get_sample_name(data)
    base, ext = os.path.splitext(bed_file)
    out_file = "%s-%s%s" % (base, name, ext)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            calls = _get_sample_calls(vcf_file, name)
            with open(bed_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    for line in in_handle:
                        sample_line = _check_bed_call(line, calls)
                        if sample_line:
                            out_handle.write(sample_line)
    bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "bedprep"))
    return bedutils.clean_file(out_file, data, bedprep_dir=bedprep_dir)

def compare_to_rm(data):
    """Compare final variant calls against reference materials of known calls.
    """
    if isinstance(data, (list, tuple)) and cwlutils.is_cwl_run(utils.to_single_data(data[0])):
        data = _normalize_cwl_inputs(data)
    toval_data = _get_validate(data)
    toval_data = cwlutils.unpack_tarballs(toval_data, toval_data)
    if toval_data:
        caller = _get_caller(toval_data)
        sample = dd.get_sample_name(toval_data)
        base_dir = utils.safe_makedir(os.path.join(toval_data["dirs"]["work"], "validate",
                                                   sample, caller))
        if isinstance(toval_data["vrn_file"], (list, tuple)):
            raise NotImplementedError("Multiple input files for validation: %s"
                                      % toval_data["vrn_file"])
        else:
            vrn_file = os.path.abspath(toval_data["vrn_file"])
        rm_file = normalize_input_path(toval_data["config"]["algorithm"]["validate"], toval_data)
        rm_interval_file = _gunzip(normalize_input_path(
            toval_data["config"]["algorithm"].get("validate_regions"), toval_data), toval_data)
        rm_interval_file = bedutils.clean_file(rm_interval_file, toval_data,
                                               prefix="validateregions-",
                                               bedprep_dir=utils.safe_makedir(os.path.join(base_dir, "bedprep")))
        rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(toval_data),
                                         data.get("genome_build"), base_dir, data)
        rm_interval_file = (naming.handle_synonyms(rm_interval_file, dd.get_ref_file(toval_data),
                                                   data.get("genome_build"), base_dir, data)
                            if rm_interval_file else None)
        vmethod = tz.get_in(["config", "algorithm", "validate_method"], data, "rtg")
        # RTG can fail on totally empty files. Call everything in truth set as false negatives
        if not vcfutils.vcf_has_variants(vrn_file):
            eval_files = _setup_call_false(rm_file, rm_interval_file, base_dir, toval_data, "fn")
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        # empty validation file, every call is a false positive
        elif not vcfutils.vcf_has_variants(rm_file):
            eval_files = _setup_call_fps(vrn_file, rm_interval_file, base_dir, toval_data, "fp")
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod in ["rtg", "rtg-squash-ploidy"]:
            eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data, vmethod)
            eval_files = _annotate_validations(eval_files, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod == "hap.py":
            data["validate"] = _run_happy_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data)
        elif vmethod == "bcbio.variation":
            data["validate"] = _run_bcbio_variation(vrn_file, rm_file, rm_interval_file,
                                                    base_dir, sample, caller, toval_data)
    return [[data]]

def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis"""
    out = dict()

    if dd.get_coverage(data):
        bed_file = bedutils.merge_overlaps(dd.get_coverage(data), data)
        target_name = "coverage"
    elif dd.get_variant_regions_merged(data):
        bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        bed_file = None
        target_name = "wgs"
    bed_file = clean_file(bed_file, data, prefix="cov-", simple=True)

    offtarget_stats_file = calculate_offtarget_stats(bam_file, data, bed_file, target_name)
    if offtarget_stats_file and utils.file_exists(offtarget_stats_file):
        with open(offtarget_stats_file) as in_handle:
            stats = yaml.safe_load(in_handle)
        offtarget = stats.get('offtarget') or 0
        mapped_unique = stats['mapped_unique']
        if offtarget and mapped_unique:
            out['offtarget_rate'] = 1.0 * offtarget / mapped_unique
        mapped = stats['mapped']
        if mapped:
            out['Duplicates'] = mapped - mapped_unique
            out['Duplicates_pct'] = 1.0 * (mapped - mapped_unique) / mapped
        total_reads = stats['total_reads']
        if total_reads:
            out['usable_rate'] = 1.0 * (mapped_unique - offtarget) / total_reads

    avg_coverage = get_average_coverage(data, bam_file, bed_file, target_name)
    out['avg_coverage'] = avg_coverage

    priority = cov.priority_coverage(data, out_dir)
    cov.priority_total_coverage(data, out_dir)
    region_coverage_file = cov.coverage_region_detailed_stats(data, out_dir)
    # Re-enable with annotations from internally installed
    # problem region directory
    # if priority:
    #     annotated = cov.decorate_problem_regions(priority, problem_regions)
    return out

def _calculate_sv_coverage_gatk(data, work_dir):
    """Calculate coverage in defined regions using GATK tools

    TODO: This does double calculations to get GATK4 compatible HDF read counts
    and then depth and gene annotations. Both are needed for creating heterogeneity
    inputs. Ideally replace with a single mosdepth coverage calculation, and create
    GATK4 TSV format:

    CONTIG  START  END   COUNT
    chrM    1      1000  13268
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate
    # GATK compatible
    target_file = gatkcnv.collect_read_counts(data, work_dir)
    # heterogeneity compatible
    target_in = bedutils.clean_file(tz.get_in(["regions", "bins", "target"], data), data,
                                    bedprep_dir=work_dir)
    target_cov = coverage.run_mosdepth(data, "target-gatk", target_in)
    target_cov_genes = annotate.add_genes(target_cov.regions, data, max_distance=0)
    return target_file, target_cov_genes

def _validate_caller_vcf(call_vcf, truth_vcf, callable_regions, svcaller, data):
    """Validate a caller VCF against truth within callable regions, returning stratified stats
    """
    stats = _calculate_comparison_stats(truth_vcf)
    callable_regions = bedutils.clean_file(callable_regions, data)
    callable_bed = pybedtools.BedTool(callable_regions).merge(d=stats["merge_size"]).saveas().fn

    match_calls = set([])
    truth_stats = {"tp": [], "fn": [], "fp": []}
    calls_by_region = {}
    call_vcf = slim_vcf(call_vcf, data)
    for call in _callable_intersect(call_vcf, callable_bed, data):
        key = tuple(call[:5] + call[7:8])
        calls_by_region[tuple(call[-3:])] = key

    truth = None
    regions = []
    for parts in _callable_intersect(truth_vcf, callable_bed, data):
        cur_region = tuple(parts[-3:])
        cur_truth = tuple(parts[:5] + parts[7:8])
        if truth is None:
            truth = cur_truth
        if cur_truth == truth:
            regions.append(cur_region)
        else:
            match_calls, truth_stats = _check_call(truth, regions, calls_by_region,
                                                   match_calls, truth_stats)
            truth = cur_truth
            regions = [cur_region]

    with utils.open_gzipsafe(call_vcf) as in_handle:
        for call in (l.split("\t") for l in in_handle if not l.startswith("#")):
            start, end = _get_start_end(call)
            if end:
                key = tuple(call[:5] + call[7:8])
                if key not in match_calls:
                    call_info = _summarize_call(key)
                    if _event_passes(call_info, stats):
                        truth_stats["fp"].append(call_info)
    return _to_csv(truth_stats, stats, dd.get_sample_name(data), svcaller)

def summary(items):
    data = items[0]
    cutoff = dd.get_coverage_depth_min(data)
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "coverage"))
    coverage_bed = dd.get_coverage_regions(data)
    priority_bed = dd.get_priority_regions(data)
    batch = _get_group_batch(items)
    assert batch, ("Did not find batch for samples: %s"
                   % ",".join([dd.get_sample_name(x) for x in items]))
    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file):
        combined_bed = bed.concat([coverage_bed, priority_bed])
        clean_bed = (bedutils.clean_file(combined_bed.fn, data)
                     if len(combined_bed) > 0 else combined_bed.fn)
        bed_file = _uniquify_bed_names(clean_bed, out_dir, data)
        if utils.file_exists(bed_file):
            with file_transaction(data, out_file) as tx_out_file:
                chanjo = os.path.join(os.path.dirname(sys.executable), "chanjo")
                cmd = "{chanjo} --db {tx_out_file} build {bed_file}"
                do.run(cmd.format(**locals()), "Prep chanjo database")
                for data in items:
                    sample = dd.get_sample_name(data)
                    bam_file = data["work_bam"]
                    cmd = ("{chanjo} annotate -s {sample} -g {batch} -c {cutoff} "
                           "{bam_file} {bed_file} | "
                           "{chanjo} --db {tx_out_file} import")
                    do.run(cmd.format(**locals()), "Chanjo coverage", data)
        os.remove(bed_file)
    coverage = regions_coverage(out_file, batch, out_dir)
    problem_regions = dd.get_problem_region_dir(data)
    if problem_regions:
        coverage = decorate_problem_regions(coverage, problem_regions)
    out = []
    for data in items:
        if utils.file_exists(out_file):
            data["coverage"] = {"summary": out_file, "all": coverage}
        out.append([data])
    return out

def _subset_to_sample(bed_file, data):
    """Convert the global BED file into sample specific calls.
    """
    name = dd.get_sample_name(data)
    base, ext = os.path.splitext(bed_file)
    out_file = "%s-%s%s" % (base, name, ext)
    if not utils.file_uptodate(out_file, bed_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(bed_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    for line in in_handle:
                        sample_line = _check_bed_call(line, name)
                        if sample_line:
                            out_handle.write(sample_line)
    if utils.file_exists(out_file):
        bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "bedprep"))
        return bedutils.clean_file(out_file, data, bedprep_dir=bedprep_dir)
    else:
        return out_file

def priority_coverage(data, out_dir):
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return data
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if utils.file_uptodate(out_file, cleaned_bed) and utils.file_uptodate(out_file, in_bam):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        cmdl = sambamba.make_command(data, "depth base", in_bam, cleaned_bed)
        parse_cmd = "awk '{print $1\"\t\"$2\"\t\"$2\"\t\"$3\"\t\"$10}' | sed '1d'"
        cmdl += " | {parse_cmd} > {tx_out_file}"
        message = "Calculating base coverage of {bed_file} in {in_bam}"
        do.run(cmdl.format(**locals()), message.format(**locals()))
    return out_file

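# A runnable Python equivalent of the awk/sed parsing above: sambamba's
# per-base depth output appears to carry chrom/pos in columns 1-2, coverage
# in column 3, and the sample name in column 10. The awk step rewrites that
# as a BED-like chrom/pos/pos/coverage/sample line and sed drops the header
# row. The example line is made up for illustration.
def parse_depth_line(line):
    """Mirror awk '{print $1, $2, $2, $3, $10}' with tab separators."""
    f = line.rstrip("\n").split("\t")
    return "\t".join([f[0], f[1], f[1], f[2], f[9]])

example = "chr1\t999\t42\tA\tC\tG\tT\t0\t0\tNA12878"
assert parse_depth_line(example) == "chr1\t999\t999\t42\tNA12878"
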
def priority_coverage(data, out_dir):
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return data
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if utils.file_uptodate(out_file, cleaned_bed) and utils.file_uptodate(out_file, in_bam):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        cmdl = sambamba.make_command(data, "depth base", in_bam, cleaned_bed)
        parse_cmd = "awk '{print $1\"\t\"$2\"\t\"$2\"\t\"$3\"\t\"$10}' | sed '1d'"
        cmdl += " | {parse_cmd} > {tx_out_file}"
        message = "Calculating base coverage of {bed_file} in {in_bam}"
        do.run(cmdl.format(**locals()), message.format(**locals()))
    return out_file

def _validate_caller_vcf(call_vcf, truth_vcf, callable_regions, svcaller, data):
    """Validate a caller VCF against truth within callable regions, returning stratified stats.
    """
    stats = _calculate_comparison_stats(truth_vcf)
    callable_regions = bedutils.clean_file(callable_regions, data)
    callable_bed = pybedtools.BedTool(callable_regions).merge(d=stats["merge_size"]).saveas().fn
    match_calls = set([])
    truth_stats = {"tp": [], "fn": [], "fp": []}
    calls_by_region = {}
    call_vcf = slim_vcf(call_vcf, data)
    for call in _callable_intersect(call_vcf, callable_bed, data):
        key = tuple(call[:5] + call[7:8])
        calls_by_region[tuple(call[-3:])] = key
    truth = None
    regions = []
    for parts in _callable_intersect(truth_vcf, callable_bed, data):
        cur_region = tuple(parts[-3:])
        cur_truth = tuple(parts[:5] + parts[7:8])
        if truth is None:
            truth = cur_truth
        if cur_truth == truth:
            regions.append(cur_region)
        else:
            match_calls, truth_stats = _check_call(truth, regions, calls_by_region,
                                                   match_calls, truth_stats)
            truth = cur_truth
            regions = [cur_region]
    # Flush the final truth group; without this the last truth variant is never evaluated
    if truth is not None:
        match_calls, truth_stats = _check_call(truth, regions, calls_by_region,
                                               match_calls, truth_stats)
    with utils.open_gzipsafe(call_vcf) as in_handle:
        for call in (l.split("\t") for l in in_handle if not l.startswith("#")):
            start, end = _get_start_end(call)
            if end:
                key = tuple(call[:5] + call[7:8])
                if key not in match_calls:
                    call_info = _summarize_call(key)
                    if _event_passes(call_info, stats):
                        truth_stats["fp"].append(call_info)
    return _to_csv(truth_stats, stats, dd.get_sample_name(data), svcaller)
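# _get_start_end is used above to skip records without a defined endpoint; a
# minimal sketch, assuming SV records carry their end coordinate as an END=
# key in the INFO column (8th field of a VCF line), so breakends without END
# are skipped by the `if end:` check:
def _get_start_end(call):
    """Return (start, end) for a VCF record, or (start, None) if END is absent."""
    start = call[1]
    end = [x.split("=")[-1] for x in call[7].split(";") if x.startswith("END=")]
    return (start, end[0]) if end else (start, None)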
def priority_coverage(data, out_dir):
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    if file_exists(out_file):
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = clean_file(bed_file, data)
        with file_transaction(data, out_file) as tx_out_file:
            parse_cmd = "awk '{print $1\"\t\"$2\"\t\"$2\"\t\"$3\"\t\"$10}' | sed '1d'"
            cmd = ("{sambamba} depth base -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "{in_bam} | {parse_cmd} > {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    return out_file
def compare_to_rm(data):
    """Compare final variant calls against reference materials of known calls.
    """
    if isinstance(data, (list, tuple)):
        data = _normalize_cwl_inputs(data)
    toval_data = _get_validate(data)
    if toval_data:
        caller = _get_caller(toval_data)
        sample = dd.get_sample_name(toval_data)
        base_dir = utils.safe_makedir(os.path.join(toval_data["dirs"]["work"], "validate", sample, caller))
        if isinstance(toval_data["vrn_file"], (list, tuple)):
            raise NotImplementedError("Multiple input files for validation: %s" % toval_data["vrn_file"])
        else:
            vrn_file = os.path.abspath(toval_data["vrn_file"])
        rm_file = normalize_input_path(toval_data["config"]["algorithm"]["validate"], toval_data)
        rm_interval_file = _gunzip(normalize_input_path(toval_data["config"]["algorithm"].get("validate_regions"),
                                                        toval_data),
                                   toval_data)
        rm_interval_file = bedutils.clean_file(rm_interval_file, toval_data,
                                               bedprep_dir=utils.safe_makedir(os.path.join(base_dir, "bedprep")))
        rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(data), data["genome_build"], base_dir, data)
        rm_interval_file = (naming.handle_synonyms(rm_interval_file, dd.get_ref_file(data),
                                                   data["genome_build"], base_dir, data)
                            if rm_interval_file else None)
        vmethod = tz.get_in(["config", "algorithm", "validate_method"], data, "rtg")
        if not vcfutils.vcf_has_variants(vrn_file):
            # RTG can fail on totally empty files. Skip these since we have nothing.
            pass
        elif vmethod == "rtg":
            eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod == "bcbio.variation":
            data["validate"] = _run_bcbio_variation(vrn_file, rm_file, rm_interval_file, base_dir,
                                                    sample, caller, toval_data)
    return [[data]]
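# _gunzip is referenced above but not shown; a minimal sketch, assuming it
# decompresses .gz interval files next to the original and passes other paths
# (including None) through untouched. The exact bcbio helper may differ in
# where it writes the output:
def _gunzip(fname, data):
    if fname is None or not fname.endswith(".gz"):
        return fname
    out_file = fname[:-len(".gz")]
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            do.run("gunzip -c %s > %s" % (fname, tx_out_file), "gunzip input file")
    return out_file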
def summarize(calls, data, highdepth_beds):
    """Summarize results from multiple callers into a single flattened BED file.
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural", sample, "ensemble"))
    with shared.bedtools_tmpdir(data):
        # Materialize as a list so len() works on Python 3, where filter is lazy
        input_beds = [x for x in [_create_bed(c, sample, work_dir, data) for c in calls]
                      if x is not None and utils.file_exists(x)]
        if len(input_beds) > 0:
            out_file = combine_bed_by_size(input_beds, sample, work_dir, data)
            if utils.file_exists(out_file):
                if len(input_beds) > N_FILTER_CALLERS:
                    filter_file = _filter_ensemble(out_file, data)
                else:
                    filter_file = out_file
                if len(highdepth_beds) > 0:
                    limit_file = _limit_calls(filter_file, highdepth_beds, data)
                else:
                    limit_file = filter_file
                bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(limit_file), "bedprep"))
                calls.append({"variantcaller": "sv-ensemble",
                              "vrn_file": bedutils.clean_file(limit_file, data, bedprep_dir=bedprep_dir)})
    return calls
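# _limit_calls is assumed to drop ensemble calls falling mostly inside
# high-depth (likely artifact) regions; a pybedtools sketch of that idea,
# not the exact bcbio logic (the 0.5 overlap fraction is an assumption):
def _limit_calls(in_file, highdepth_beds, data):
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            regions_bt = pybedtools.BedTool(in_file)
            for hd_bed in highdepth_beds:
                # -v/f=0.5: keep calls with less than half their span in high-depth regions
                regions_bt = regions_bt.intersect(pybedtools.BedTool(hd_bed), v=True, f=0.5)
            regions_bt.saveas(tx_out_file)
    return out_file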
def run(bam_file, data, out_dir):
    """Run coverage QC analysis
    """
    out = dict()
    out_dir = utils.safe_makedir(out_dir)
    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        merged_bed_file = bedutils.clean_file(dd.get_coverage_merged(data), data, prefix="cov-", simple=True)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        merged_bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        merged_bed_file = None
        target_name = "genome"
    avg_depth = cov.get_average_coverage(target_name, merged_bed_file, data)
    if target_name == "coverage":
        out_files = cov.coverage_region_detailed_stats(target_name, merged_bed_file, data, out_dir)
    else:
        out_files = []
    out['Avg_coverage'] = avg_depth
    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, 'samtools')
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)["metrics"]
    out["Total_reads"] = total_reads = int(samtools_stats["Total_reads"])
    out["Mapped_reads"] = mapped = int(samtools_stats["Mapped_reads"])
    out["Mapped_paired_reads"] = int(samtools_stats["Mapped_paired_reads"])
    out['Duplicates'] = dups = int(samtools_stats["Duplicates"])
    if total_reads:
        out["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if mapped:
        out['Duplicates_pct'] = 100.0 * dups / mapped
    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        mapped_unique = readstats.number_of_mapped_reads(data, bam_file, keep_dups=False)
    out['Mapped_unique_reads'] = mapped_unique
    if merged_bed_file:
        ontarget = readstats.number_of_mapped_reads(data, bam_file, keep_dups=False,
                                                    bed_file=merged_bed_file, target_name=target_name)
        out["Ontarget_unique_reads"] = ontarget
        if mapped_unique:
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # Skip the padded calculation for WGS even if a "coverage" file is
                # specified; the padded statistic only makes sense for exomes and panels
                padded_bed_file = bedutils.get_padded_bed_file(out_dir, merged_bed_file, 200, data)
                ontarget_padded = readstats.number_of_mapped_reads(
                    data, bam_file, keep_dups=False, bed_file=padded_bed_file,
                    target_name=target_name + "_padded")
                out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads
    indexcov_files = _goleft_indexcov(bam_file, data, out_dir)
    out_files += [x for x in indexcov_files if x and utils.file_exists(x)]
    out = {"metrics": out}
    if len(out_files) > 0:
        out["base"] = out_files[0]
        out["secondary"] = out_files[1:]
    return out
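# get_padded_bed_file is assumed to expand each target interval by `padding`
# bases on both sides before the padded on-target count; a pybedtools sketch
# of that behavior, assuming a samtools faidx index (.fai) sits alongside the
# reference so bedtools can clip to chromosome ends:
def get_padded_bed_file(out_dir, bed_file, padding, data):
    out_file = os.path.join(out_dir, "%s-padded.bed" % utils.splitext_plus(os.path.basename(bed_file))[0])
    if not utils.file_uptodate(out_file, bed_file):
        fai_file = dd.get_ref_file(data) + ".fai"  # assumption: .fai index next to the reference
        with file_transaction(data, out_file) as tx_out_file:
            pybedtools.BedTool(bed_file).slop(b=padding, g=fai_file).sort().merge().saveas(tx_out_file)
    return out_file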
def _merge_target_information(samples, metrics_dir):
    out_file = os.path.abspath(os.path.join(metrics_dir, "target_info.yaml"))
    if utils.file_exists(out_file):
        return samples
    genomes = set(dd.get_genome_build(data) for data in samples)
    coverage_beds = set(dd.get_coverage(data) for data in samples)
    original_variant_regions = set(dd.get_variant_regions_orig(data) for data in samples)
    data = samples[0]
    info = {}
    # Reporting in MultiQC only if the genome is the same across all samples
    if len(genomes) == 1:
        info["genome_info"] = {
            "name": dd.get_genome_build(data),
            "size": sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])]),
        }
    # Reporting in MultiQC only if the target is the same across all samples
    vcr_orig = None
    if len(original_variant_regions) == 1 and list(original_variant_regions)[0] is not None:
        vcr_orig = list(original_variant_regions)[0]
        vcr_clean = bedutils.clean_file(vcr_orig, data)
        info["variants_regions_info"] = {
            "bed": vcr_orig,
            "size": sum(len(x) for x in pybedtools.BedTool(dd.get_variant_regions_merged(data))),
            "regions": pybedtools.BedTool(vcr_clean).count(),
        }
        gene_num = annotate.count_genes(vcr_clean, data)
        if gene_num is not None:
            info["variants_regions_info"]["genes"] = gene_num
    else:
        info["variants_regions_info"] = {
            "bed": "callable regions",
        }
    # Reporting in MultiQC only if the target is the same across samples
    if len(coverage_beds) == 1:
        cov_bed = list(coverage_beds)[0]
        if cov_bed not in [None, "None"]:
            if vcr_orig and vcr_orig == cov_bed:
                info["coverage_bed_info"] = info["variants_regions_info"]
            else:
                clean_bed = bedutils.clean_file(cov_bed, data, prefix="cov-", simple=True)
                info["coverage_bed_info"] = {
                    "bed": cov_bed,
                    "size": pybedtools.BedTool(cov_bed).total_coverage(),
                    "regions": pybedtools.BedTool(clean_bed).count(),
                }
                gene_num = annotate.count_genes(clean_bed, data)
                if gene_num is not None:
                    info["coverage_bed_info"]["genes"] = gene_num
        else:
            info["coverage_bed_info"] = info["variants_regions_info"]
    coverage_intervals = set(data["config"]["algorithm"]["coverage_interval"] for data in samples)
    if len(coverage_intervals) == 1:
        info["coverage_interval"] = list(coverage_intervals)[0]
    if info:
        with open(out_file, "w") as out_handle:
            yaml.safe_dump(info, out_handle)
    return samples
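# For reference, a target_info.yaml written by the function above might look
# like this (names and numbers are purely illustrative):
#
# genome_info:
#   name: hg38
#   size: 3099734149
# variants_regions_info:
#   bed: /path/to/panel-orig.bed
#   size: 1214918
#   regions: 4136
#   genes: 523
# coverage_interval: regional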
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis"""
    out = dict()
    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, 'samtools')
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)
    if "Total_reads" not in samtools_stats:
        return out
    out["Total_reads"] = total_reads = int(samtools_stats["Total_reads"])
    if not total_reads:
        return out
    if "Mapped_reads_raw" not in samtools_stats or "Mapped_reads" not in samtools_stats:
        return out
    out["Mapped_reads"] = mapped = int(samtools_stats["Mapped_reads"])
    out["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if not mapped:
        return out
    if "Duplicates" in samtools_stats:
        out['Duplicates'] = dups = int(samtools_stats["Duplicates"])
        out['Duplicates_pct'] = 100.0 * dups / int(samtools_stats["Mapped_reads_raw"])
    else:
        dups = 0
    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        cov_bed_file = bedutils.clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        merged_bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        merged_bed_file = None
        target_name = "genome"
    # Whole genome runs do not need detailed on-target calculations, use total unique mapped
    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        out['Mapped_unique_reads'] = mapped_unique = sambamba.number_of_mapped_reads(data, bam_file,
                                                                                     keep_dups=False)
    if merged_bed_file:
        ontarget = sambamba.number_of_mapped_reads(
            data, bam_file, keep_dups=False, bed_file=merged_bed_file, target_name=target_name)
        if mapped_unique:
            out["Ontarget_unique_reads"] = ontarget
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # Skip the padded calculation for WGS even if a "coverage" file is
                # specified; the padded statistic only makes sense for exomes and panels
                padded_bed_file = bedutils.get_padded_bed_file(merged_bed_file, 200, data)
                ontarget_padded = sambamba.number_of_mapped_reads(
                    data, bam_file, keep_dups=False, bed_file=padded_bed_file,
                    target_name=target_name + "_padded")
                out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads
    avg_depth = cov.get_average_coverage(data, bam_file, merged_bed_file, target_name)
    out['Avg_coverage'] = avg_depth
    region_coverage_file = cov.coverage_region_detailed_stats(data, out_dir,
                                                              extra_cutoffs=set([max(1, int(avg_depth * 0.8))]))
    return out
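# sambamba.number_of_mapped_reads is assumed to wrap `sambamba view -c` with a
# filter expression; a minimal sketch of that idea using sambamba's documented
# -c (count), -F (filter) and -L (regions) options. The exact bcbio wrapper
# differs; target_name is assumed to be used only for logging/caching there.
import subprocess

def number_of_mapped_reads(data, bam_file, keep_dups=True, bed_file=None, target_name=None):
    cmd = [config_utils.get_program("sambamba", data, default="sambamba"), "view", "-c"]
    filters = ["not unmapped"]
    if not keep_dups:
        filters.append("not duplicate")
    cmd += ["-F", " and ".join(filters)]
    if bed_file:
        cmd += ["-L", bed_file]
    cmd.append(bam_file)
    return int(subprocess.check_output(cmd).strip())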
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis"""
    out = dict()
    total_reads = sambamba.number_of_reads(data, bam_file)
    out['Total_reads'] = total_reads
    mapped = sambamba.number_of_mapped_reads(data, bam_file)
    out['Mapped_reads'] = mapped
    if total_reads:
        out['Mapped_reads_pct'] = 100.0 * mapped / total_reads
    if mapped:
        mapped_unique = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False)
        out['Mapped_unique_reads'] = mapped_unique
        mapped_dups = mapped - mapped_unique
        out['Duplicates'] = mapped_dups
        out['Duplicates_pct'] = 100.0 * mapped_dups / mapped
        if dd.get_coverage(data):
            cov_bed_file = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
            merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
            target_name = "coverage"
        else:
            merged_bed_file = dd.get_variant_regions_merged(data)
            target_name = "variant_regions"
        ontarget = sambamba.number_mapped_reads_on_target(
            data, merged_bed_file, bam_file, keep_dups=False, target_name=target_name)
        if mapped_unique:
            out["Ontarget_unique_reads"] = ontarget
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            padded_bed_file = bedutils.get_padded_bed_file(merged_bed_file, 200, data)
            ontarget_padded = sambamba.number_mapped_reads_on_target(
                data, padded_bed_file, bam_file, keep_dups=False, target_name=target_name + "_padded")
            out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads
        avg_coverage = get_average_coverage(data, bam_file, merged_bed_file, target_name)
        out['Avg_coverage'] = avg_coverage
    priority = cov.priority_coverage(data, out_dir)
    cov.priority_total_coverage(data, out_dir)
    region_coverage_file = cov.coverage_region_detailed_stats(data, out_dir)
    # Re-enable with annotations from internally installed problem region directory
    # if priority:
    #     annotated = cov.decorate_problem_regions(priority, problem_regions)
    return out