def _detect_duplicates(bam_file, out_dir, data):
    """Calculate the fraction of mapped reads that are duplicates.

    The two read counts are cached in ``dup_metrics.txt`` inside ``out_dir``;
    when that file already exists it is reused instead of being recomputed.
    The file holds two lines: the mapped-read count obtained with
    ``keep_dups=False`` followed by the count obtained with ``keep_dups=True``.

    Args:
        bam_file: alignment file handed to ``postalign.dedup_bam``.
        out_dir: directory where ``dup_metrics.txt`` is written and read.
        data: sample dictionary passed through to the helper functions.

    Returns:
        A dictionary with the single key "Duplication Rate of Mapped". The
        value is a float, or the string "NA" when the total count is zero.
    """
    out_file = os.path.join(out_dir, "dup_metrics.txt")
    if not utils.file_exists(out_file):
        dup_align_bam = postalign.dedup_bam(bam_file, data)
        logger.info("Detecting duplicates in %s." % dup_align_bam)
        # keep_dups=False is the count other callers in this module treat as
        # the unique (de-duplicated) mapped reads, so name it accordingly.
        unique_count = readstats.number_of_mapped_reads(data, dup_align_bam, keep_dups=False)
        tot_count = readstats.number_of_mapped_reads(data, dup_align_bam, keep_dups=True)
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                # File layout is unchanged (unique first, total second) so
                # metrics files written by earlier runs remain readable.
                out_handle.write("%s\n%s\n" % (unique_count, tot_count))
    with open(out_file) as in_handle:
        unique = float(next(in_handle).strip())
        total = float(next(in_handle).strip())
    if total == 0:
        rate = "NA"
    else:
        # Duplicates are the reads dropped by keep_dups=False. Dividing the
        # unique count by the total would report the non-duplicate fraction.
        rate = (total - unique) / total
    return {"Duplication Rate of Mapped": rate}
def _count_offtarget(data, bam_file, bed_file, target_name):
    """Return the fraction of unique mapped reads not counted as on-target.

    Both counts come from ``readstats.number_of_mapped_reads`` with
    ``keep_dups=False``; the on-target count is additionally restricted by
    ``bed_file`` and ``target_name``. Returns 0.0 when the unrestricted count
    is zero, so the division is never attempted on an empty denominator.
    """
    total_unique = readstats.number_of_mapped_reads(data, bam_file, keep_dups=False)
    in_target = readstats.number_of_mapped_reads(
        data, bam_file, keep_dups=False, bed_file=bed_file, target_name=target_name)
    if not total_unique:
        return 0.0
    return float(total_unique - in_target) / total_unique
def _prep_real_counts(bam_file, data, samtools_stats):
    """Assemble the ``Preseq_*`` counts for a sample.

    A target BED is chosen from the coverage regions, or, for non-genome
    coverage intervals, from the merged variant regions with the sample
    callable regions as a fallback. With a BED the counts are restricted to
    that target; without one they are taken from ``samtools_stats`` and the
    reference contig sizes.

    Returns a dictionary with ``Preseq_genome_size``, ``Preseq_read_count``
    and ``Preseq_read_length``, plus ``Preseq_unique_count`` when the
    ``mark_duplicates`` algorithm setting is truthy (it defaults to True).
    """
    if dd.get_coverage(data) and dd.get_coverage(data) not in ("None",):
        bed = dd.get_coverage_merged(data)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        bed = dd.get_variant_regions_merged(data) or dd.get_sample_callable(data)
        target_name = "variant_regions"
    else:
        bed = None
        target_name = "genome"
    dedupped = utils.get_in(data, ("config", "algorithm", "mark_duplicates"), True)

    counts = {}
    if not bed:
        # WGS
        contigs = ref.file_contigs(dd.get_ref_file(data), data["config"])
        counts["Preseq_genome_size"] = sum(contig.size for contig in contigs)
        counts["Preseq_read_count"] = int(samtools_stats["Total_reads"])
        counts["Preseq_read_length"] = int(samtools_stats["Average_read_length"])
        if dedupped:
            duplicates = int(samtools_stats["Duplicates"])
            counts["Preseq_unique_count"] = counts["Preseq_read_count"] - duplicates
        return counts

    target_size = pybedtools.BedTool(bed).total_coverage()
    counts["Preseq_genome_size"] = target_size
    counts["Preseq_read_count"] = readstats.number_of_mapped_reads(
        data, bam_file, keep_dups=True, bed_file=bed, target_name=target_name)
    ontrg_unique_depth = cov.get_average_coverage(target_name, bed, data, bam_file)
    if dedupped:
        counts["Preseq_unique_count"] = readstats.number_of_mapped_reads(
            data, bam_file, keep_dups=False, bed_file=bed, target_name=target_name)

    # Average on-target alignment length, derived from the relation:
    #   avg depth ~~ num (unique) on-target alignments * avg aln length / target size
    # The unique count is preferred; the full read count is used when the
    # unique count is absent or zero.
    total_alignments = counts.get("Preseq_unique_count") or counts["Preseq_read_count"]
    counts["Preseq_read_length"] = ontrg_unique_depth * target_size // total_alignments
    return counts
def _reads_in_peaks(bam_file, peaks_file, sample): """Calculate number of reads in peaks""" if not peaks_file: return {} rip = number_of_mapped_reads(sample, bam_file, bed_file=peaks_file) return {"metrics": {"RiP": rip}}
def _prep_real_counts(bam_file, data, samtools_stats):
    """Collect the ``Preseq_*`` read counts, genome size and read length.

    When a target BED is available (coverage regions, or merged variant
    regions for non-genome coverage intervals) the counts are limited to that
    target and the read length is estimated from the average coverage.
    Otherwise the values are read from ``samtools_stats`` and the reference
    contigs. ``Preseq_unique_count`` is only included when the
    ``mark_duplicates`` algorithm setting is truthy (default True).
    """
    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        bed, target_name = dd.get_coverage_merged(data), "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        bed, target_name = dd.get_variant_regions_merged(data), "variant_regions"
    else:
        bed, target_name = None, "genome"
    dedupped = utils.get_in(data, ("config", "algorithm", "mark_duplicates"), True)

    result = {}
    if bed:
        genome_size = pybedtools.BedTool(bed).total_coverage()
        result["Preseq_genome_size"] = genome_size
        read_count = readstats.number_of_mapped_reads(
            data, bam_file, keep_dups=True, bed_file=bed, target_name=target_name)
        result["Preseq_read_count"] = read_count
        ontrg_unique_depth = cov.get_average_coverage(target_name, bed, data, bam_file)
        if dedupped:
            result["Preseq_unique_count"] = readstats.number_of_mapped_reads(
                data, bam_file, keep_dups=False, bed_file=bed, target_name=target_name)
        # Average on-target alignment length follows from:
        #   avg depth ~~ num (unique) on-target alignments * avg aln length / target size
        total_alignments = result.get("Preseq_unique_count") or read_count
        result["Preseq_read_length"] = ontrg_unique_depth * genome_size // total_alignments
    else:
        # WGS
        contig_sizes = [c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])]
        result["Preseq_genome_size"] = sum(contig_sizes)
        read_count = int(samtools_stats["Total_reads"])
        result["Preseq_read_count"] = read_count
        result["Preseq_read_length"] = int(samtools_stats["Average_read_length"])
        if dedupped:
            result["Preseq_unique_count"] = read_count - int(samtools_stats["Duplicates"])
    return result
def run(bam_file, data, out_dir):
    """Run coverage QC analysis.

    Gathers average coverage, read totals from the samtools QC step and
    on-target/off-target percentages, then appends any existing indexcov
    output files.

    Returns a dictionary with a "metrics" entry and, when output files were
    produced, "base" (the first file) and "secondary" (the remaining files).
    Percentage metrics are only added when their denominator is non-zero.
    """
    metrics = dict()
    out_dir = utils.safe_makedir(out_dir)

    # Select the target regions and the label used for the read counts.
    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        merged_bed_file = bedutils.clean_file(
            dd.get_coverage_merged(data), data, prefix="cov-", simple=True)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        merged_bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        merged_bed_file = None
        target_name = "genome"

    avg_depth = cov.get_average_coverage(target_name, merged_bed_file, data)
    out_files = []
    if target_name == "coverage":
        out_files = cov.coverage_region_detailed_stats(
            target_name, merged_bed_file, data, out_dir)
    metrics['Avg_coverage'] = avg_depth

    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, 'samtools')
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)["metrics"]

    total_reads = int(samtools_stats["Total_reads"])
    metrics["Total_reads"] = total_reads
    mapped = int(samtools_stats["Mapped_reads"])
    metrics["Mapped_reads"] = mapped
    metrics["Mapped_paired_reads"] = int(samtools_stats["Mapped_paired_reads"])
    dups = int(samtools_stats["Duplicates"])
    metrics['Duplicates'] = dups

    if total_reads:
        metrics["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if mapped:
        metrics['Duplicates_pct'] = 100.0 * dups / mapped

    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        mapped_unique = readstats.number_of_mapped_reads(data, bam_file, keep_dups=False)
    metrics['Mapped_unique_reads'] = mapped_unique

    if merged_bed_file:
        ontarget = readstats.number_of_mapped_reads(
            data, bam_file, keep_dups=False,
            bed_file=merged_bed_file, target_name=target_name)
        metrics["Ontarget_unique_reads"] = ontarget
        if mapped_unique:
            metrics["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            metrics['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # The padded statistic only makes sense for exomes and panels,
                # so it is skipped for WGS even if a "coverage" file is specified.
                padded_bed_file = bedutils.get_padded_bed_file(
                    out_dir, merged_bed_file, 200, data)
                ontarget_padded = readstats.number_of_mapped_reads(
                    data, bam_file, keep_dups=False,
                    bed_file=padded_bed_file, target_name=target_name + "_padded")
                metrics["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            metrics['Usable_pct'] = 100.0 * ontarget / total_reads

    indexcov_files = _goleft_indexcov(bam_file, data, out_dir)
    out_files += [fname for fname in indexcov_files if fname and utils.file_exists(fname)]

    result = {"metrics": metrics}
    if out_files:
        result["base"] = out_files[0]
        result["secondary"] = out_files[1:]
    return result
def _reads_in_peaks(bam_file, peaks_file, sample): """Calculate number of reads in peaks""" if not peaks_file: return {} rip = number_of_mapped_reads(sample, bam_file, bed_file = peaks_file) return {"metrics": {"RiP": rip}}