def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Returns a tuple of (depth BED, callable BED subset to variant regions,
    high-depth regions extracted from the callable BED, average coverage over
    variant regions).
    """
    # min: minimum depth to consider a position callable; high_multiplier feeds
    # the max-depth cutoff computed in _get_max_depth.
    params = {"window_size": 5000, "parallel_window_size": 1e5, "min": dd.get_coverage_depth_min(data),
              "high_multiplier": 20}
    # Per-sample output prefix under <work>/align/<sample>/<sample>-coverage
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    depth_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions, "variant_regions")
    # Only (re)run goleft when the callable BED is older than the BAM.
    if not utils.file_uptodate(callable_file, bam_file):
        cmd = ["goleft", "depth", "--q", "1", "--mincov", str(params["min"]),
               "--processes", str(dd.get_num_cores(data)), "--ordered"]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, depth_file) as tx_depth_file:
            # chdir into the transaction directory so goleft's outputs,
            # written relative to --prefix, land in the transactional space.
            with utils.chdir(os.path.dirname(tx_depth_file)):
                tx_callable_file = tx_depth_file.replace(".depth.bed", ".callable.bed")
                prefix = tx_depth_file.replace(".depth.bed", "")
                # Build a FASTA index matching the BAM's contig set/order so
                # goleft's --reference agrees with the alignment file.
                bam_ref_file = "%s-bamref.fa" % utils.splitext_plus(bam_file)[0]
                bam.fai_from_bam(dd.get_ref_file(data), bam_file, bam_ref_file + ".fai", data)
                cmd += ["--reference", bam_ref_file]
                cmd += ["--prefix", prefix, bam_file]
                bcbio_env = utils.get_bcbio_env()
                msg = "Calculate coverage: %s" % dd.get_sample_name(data)
                do.run(cmd, msg, env=bcbio_env)
                # The callable BED is moved out directly; the depth BED is
                # finalized by the file_transaction context on exit.
                shutil.move(tx_callable_file, callable_file)
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return depth_file, final_callable, _extract_highdepth(final_callable, data), variant_regions_avg_cov
def calculate(bam_file, data):
    """Calculate coverage in parallel using mosdepth.

    Removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Returns (callable BED subset to variant regions, dict of per-target
    mosdepth output files keyed by target name).
    """
    min_depth = dd.get_coverage_depth_min(data)
    variant_regions = dd.get_variant_regions_merged(data)
    if not variant_regions:
        variant_regions = _create_genome_regions(data)
    sample = dd.get_sample_name(data)
    # Back compatible with previous pre-mosdepth callable files
    callable_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", sample)),
                                 "%s-coverage.callable.bed" % sample)
    depth_files = {}
    # Only recompute when the callable file is older than the input BAM.
    if not utils.file_uptodate(callable_file, bam_file):
        # Quantize depth into no coverage / below-min / callable bins.
        vr_quantize = ("0:1:%s:" % (min_depth), ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"])
        for target, bed_file, quantize, thresholds in [
                ("variant_regions", variant_regions, vr_quantize, None),
                ("sv_regions", regions.get_sv_bed(data), None, None),
                ("coverage", dd.get_coverage(data), None, DEPTH_THRESHOLDS)]:
            if not bed_file:
                continue
            depth_info = run_mosdepth(data, target, bed_file, quantize=quantize, thresholds=thresholds)
            # Keep only the mosdepth outputs that were actually produced.
            collected = {}
            for attr in ("dist", "regions", "thresholds"):
                value = getattr(depth_info, attr, None)
                if value:
                    collected[attr] = value
            depth_files[target] = collected
            # The quantized variant-regions output becomes the callable BED.
            if target == "variant_regions":
                callable_file = depth_info.quantize
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return final_callable, depth_files
def calculate(bam_file, data):
    """Calculate coverage in parallel using mosdepth.

    Removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Returns (callable BED subset to variant regions, dict of per-target
    mosdepth output files keyed by target name).
    """
    min_cov = dd.get_coverage_depth_min(data)
    variant_regions = dd.get_variant_regions_merged(data) or _create_genome_regions(data)
    # Back compatible with previous pre-mosdepth callable files
    align_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                dd.get_sample_name(data)))
    callable_file = os.path.join(align_dir, "%s-coverage.callable.bed" % (dd.get_sample_name(data)))
    depth_files = {}
    # Skip recomputation when the callable file is newer than the BAM.
    if not utils.file_uptodate(callable_file, bam_file):
        # Quantize depth into no coverage / below-min / callable bins.
        vr_quantize = ("0:1:%s:" % (min_cov), ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"])
        targets = [("variant_regions", variant_regions, vr_quantize, None),
                   ("sv_regions", bedutils.clean_file(regions.get_sv_bed(data), data), None, None),
                   ("coverage", bedutils.clean_file(dd.get_coverage(data), data), None, DEPTH_THRESHOLDS)]
        for name, region_file, quantize, thresholds in targets:
            if not region_file:
                continue
            info = run_mosdepth(data, name, region_file, quantize=quantize, thresholds=thresholds)
            # Record only the mosdepth outputs that were actually produced.
            produced = {attr: getattr(info, attr, None) for attr in ("dist", "regions", "thresholds")}
            depth_files[name] = {attr: val for attr, val in produced.items() if val}
            # The quantized variant-regions output becomes the callable BED.
            if name == "variant_regions":
                callable_file = info.quantize
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return final_callable, depth_files
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Returns a tuple of (depth BED, callable BED, high-depth regions from the
    callable BED, average coverage over variant regions).
    """
    # parallel_window_size: chunk size for splitting variant regions so goleft
    # can process windows in parallel; min: minimum callable depth.
    params = {
        "window_size": 5000,
        "parallel_window_size": 1e5,
        "min": dd.get_coverage_depth_min(data),
        "high_multiplier": 20
    }
    # Per-sample output prefix under <work>/align/<sample>/<sample>-coverage
    prefix = os.path.join(
        utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    out_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions,
                                                   "variant_regions", file_prefix=prefix)
    # Only (re)run goleft when the depth BED is older than the BAM.
    if not utils.file_uptodate(out_file, bam_file):
        ref_file = dd.get_ref_file(data)
        cmd = [
            "goleft", "depth", "--windowsize", str(params["window_size"]), "--q", "1",
            "--mincov", str(params["min"]), "--reference", ref_file,
            "--processes", str(dd.get_num_cores(data)), "--stats", "--ordered"
        ]
        if variant_regions:
            # Split variant regions into fixed-size windows for parallel runs.
            window_file = "%s-tocalculate-windows.bed" % utils.splitext_plus(
                out_file)[0]
            if not utils.file_uptodate(window_file, bam_file):
                with file_transaction(data, window_file) as tx_out_file:
                    pybedtools.BedTool().window_maker(
                        w=params["parallel_window_size"],
                        b=pybedtools.BedTool(variant_regions)).saveas(
                            tx_out_file)
            cmd += ["--bed", window_file]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, out_file) as tx_out_file:
            # chdir into the transaction directory so goleft's prefix-relative
            # outputs are written into the transactional space.
            with utils.chdir(os.path.dirname(tx_out_file)):
                tx_callable_file = tx_out_file.replace(".depth.bed",
                                                       ".callable.bed")
                prefix = tx_out_file.replace(".depth.bed", "")
                cmd += ["--prefix", prefix, bam_file]
                do.run(cmd,
                       "Calculate coverage: %s" % dd.get_sample_name(data))
                # Callable BED is moved out directly; the depth BED is
                # finalized by the file_transaction context on exit.
                shutil.move(tx_callable_file, callable_file)
    return out_file, callable_file, _extract_highdepth(
        callable_file, data), variant_regions_avg_cov
def summary(items):
    """Build a chanjo coverage database summarizing a batch of samples.

    Combines the configured coverage and priority BED files, builds a chanjo
    database over them and annotates per-sample coverage from each work BAM.
    Returns the items with a "coverage" entry pointing at the database, or the
    unmodified items when no coverage/priority regions are configured.
    """
    data = items[0]
    cutoff = dd.get_coverage_depth_min(data)
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "coverage"))
    coverage_bed = dd.get_coverage_regions(data)
    priority_bed = dd.get_priority_regions(data)
    batch = _get_group_batch(items)
    assert batch, "Did not find batch for samples: %s" % ",".join([dd.get_sample_name(x) for x in items])
    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file):
        if coverage_bed:
            mini_coverage = bed.minimize(coverage_bed).fn
        if priority_bed:
            mini_priority = bed.minimize(priority_bed).fn
        if coverage_bed and priority_bed:
            combined_bed = bed.concat([mini_coverage, mini_priority]).fn
        elif coverage_bed:
            combined_bed = mini_coverage
        elif priority_bed:
            combined_bed = mini_priority
        else:
            # no coverage or priority file has been set
            return items
        # Fix: combined_bed is always a file path string here, so the previous
        # guard `if len(combined_bed) > 0 else combined_bed.fn` tested the
        # length of the path (always true) and its else-branch would have
        # raised AttributeError on a str. Clean the combined BED directly.
        clean_bed = bedutils.clean_file(combined_bed, data)
        bed_file = _uniquify_bed_names(clean_bed, out_dir, data)
        if bed_file and utils.file_exists(bed_file):
            with file_transaction(data, out_file) as tx_out_file:
                chanjo = os.path.join(os.path.dirname(sys.executable), "chanjo")
                cmd = "{chanjo} --db {tx_out_file} build {bed_file}"
                do.run(cmd.format(**locals()), "Prep chanjo database")
                # Annotate each sample's BAM into the shared batch database.
                for data in items:
                    sample = dd.get_sample_name(data)
                    bam_file = data["work_bam"]
                    cmd = ("{chanjo} annotate -s {sample} -g {batch} -c {cutoff} "
                           "{bam_file} {bed_file} | "
                           "{chanjo} --db {tx_out_file} import")
                    do.run(cmd.format(**locals()), "Chanjo coverage", data)
        if bed_file:
            # The uniquified BED is a temporary working copy.
            os.remove(bed_file)
    # NOTE(review): if the database was never built (bed_file missing),
    # regions_coverage runs against a nonexistent out_file — TODO confirm
    # downstream handling.
    coverage = regions_coverage(out_file, batch, out_dir)
    problem_regions = dd.get_problem_region_dir(data)
    if problem_regions:
        coverage = decorate_problem_regions(coverage, problem_regions)
    out = []
    for data in items:
        if utils.file_exists(out_file):
            data["coverage"] = {"summary": out_file, "all": coverage}
        out.append([data])
    return out
def calc_callable_loci(data, region=None, out_file=None):
    """Determine callable bases for an input BAM in the given region.

    Returns a single-item list with the callable BED, config and work BAM.
    """
    bam_file = data["work_bam"]
    if out_file is None:
        out_file = "%s-callable.bed" % os.path.splitext(bam_file)[0]
    depth_params = {"min": dd.get_coverage_depth_min(data)}
    if not utils.file_exists(out_file):
        ref_file = tz.get_in(["reference", "fasta", "base"], data)
        region_file, needs_calc = _regions_for_coverage(data, region, ref_file, out_file)
        if needs_calc:
            coverage_file = _get_coverage_file(bam_file, ref_file, region, region_file,
                                               depth_params, out_file, data)
            _group_by_ctype(coverage_file, depth_params, region, region_file, out_file, data)
        else:
            # special case, do not calculate if we are in a chromosome not covered by BED file
            with file_transaction(data, out_file) as tx_out_file:
                shutil.move(region_file, tx_out_file)
    return [{"callable_bed": out_file, "config": data["config"], "work_bam": bam_file}]
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Returns a tuple of (depth BED, callable BED, high-depth regions from the
    callable BED, average coverage over variant regions).
    """
    # parallel_window_size: chunk size for splitting regions so goleft can
    # process windows in parallel; min: minimum callable depth.
    params = {"window_size": 5000, "parallel_window_size": 1e5, "min": dd.get_coverage_depth_min(data),
              "high_multiplier": 20}
    # Per-sample output prefix under <work>/align/<sample>/<sample>-coverage
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    out_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions, "variant_regions")
    # Only (re)run goleft when the depth BED is older than the BAM.
    if not utils.file_uptodate(out_file, bam_file):
        ref_file = dd.get_ref_file(data)
        cmd = ["goleft", "depth", "--windowsize", str(params["window_size"]), "--q", "1",
               "--mincov", str(params["min"]), "--reference", ref_file,
               "--processes", str(dd.get_num_cores(data)), "--stats", "--ordered"]
        # Windows BED splitting the target regions for parallel processing.
        window_file = "%s-tocalculate-windows.bed" % utils.splitext_plus(out_file)[0]
        if not utils.file_uptodate(window_file, bam_file):
            with file_transaction(data, window_file) as tx_out_file:
                if not variant_regions:
                    # No configured regions: build a whole-genome BED from the
                    # non-alternate contigs. NOTE(review): this file is named
                    # off tx_out_file inside the transaction directory, so it
                    # is only used while creating the window file.
                    variant_regions = "%s-genome.bed" % utils.splitext_plus(tx_out_file)[0]
                    with open(variant_regions, "w") as out_handle:
                        for c in shared.get_noalt_contigs(data):
                            out_handle.write("%s\t%s\t%s\n" % (c.name, 0, c.size))
                pybedtools.BedTool().window_maker(w=params["parallel_window_size"],
                                                  b=pybedtools.BedTool(variant_regions)).saveas(tx_out_file)
        cmd += ["--bed", window_file]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, out_file) as tx_out_file:
            # chdir into the transaction directory so goleft's prefix-relative
            # outputs are written into the transactional space.
            with utils.chdir(os.path.dirname(tx_out_file)):
                tx_callable_file = tx_out_file.replace(".depth.bed", ".callable.bed")
                prefix = tx_out_file.replace(".depth.bed", "")
                cmd += ["--prefix", prefix, bam_file]
                bcbio_env = utils.get_bcbio_env()
                msg = "Calculate coverage: %s" % dd.get_sample_name(data)
                do.run(cmd, msg, env=bcbio_env)
                # Callable BED is moved out directly; the depth BED is
                # finalized by the file_transaction context on exit.
                shutil.move(tx_callable_file, callable_file)
    return out_file, callable_file, _extract_highdepth(callable_file, data), variant_regions_avg_cov
def summary(items):
    """Build a chanjo coverage database summarizing a batch of samples.

    Concatenates the configured coverage and priority BED files, builds a
    chanjo database over them and annotates per-sample coverage from each
    work BAM. Returns the items with a "coverage" entry pointing at the
    database, or the unmodified items when no regions are configured.
    """
    data = items[0]
    cutoff = dd.get_coverage_depth_min(data)
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "coverage"))
    coverage_bed = dd.get_coverage_regions(data)
    priority_bed = dd.get_priority_regions(data)
    batch = _get_group_batch(items)
    assert batch, ("Did not find batch for samples: %s" %
                   ",".join([dd.get_sample_name(x) for x in items]))
    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file):
        # Fix: previously both BEDs were concatenated unconditionally, which
        # breaks when one or both are not configured (None). Only use the
        # configured inputs and skip entirely when there are none.
        input_beds = [b for b in (coverage_bed, priority_bed) if b]
        if not input_beds:
            # no coverage or priority file has been set
            return items
        combined_bed = bed.concat(input_beds)
        # Skip cleaning when the combined BED has no intervals.
        clean_bed = bedutils.clean_file(
            combined_bed.fn, data) if len(combined_bed) > 0 else combined_bed.fn
        bed_file = _uniquify_bed_names(clean_bed, out_dir, data)
        if utils.file_exists(bed_file):
            with file_transaction(data, out_file) as tx_out_file:
                chanjo = os.path.join(os.path.dirname(sys.executable), "chanjo")
                cmd = ("{chanjo} --db {tx_out_file} build {bed_file}")
                do.run(cmd.format(**locals()), "Prep chanjo database")
                # Annotate each sample's BAM into the shared batch database.
                for data in items:
                    sample = dd.get_sample_name(data)
                    bam_file = data["work_bam"]
                    cmd = ("{chanjo} annotate -s {sample} -g {batch} -c {cutoff} "
                           "{bam_file} {bed_file} | "
                           "{chanjo} --db {tx_out_file} import")
                    do.run(cmd.format(**locals()), "Chanjo coverage", data)
        # The uniquified BED is a temporary working copy.
        os.remove(bed_file)
    coverage = regions_coverage(out_file, batch, out_dir)
    problem_regions = dd.get_problem_region_dir(data)
    if problem_regions:
        coverage = decorate_problem_regions(coverage, problem_regions)
    out = []
    for data in items:
        if utils.file_exists(out_file):
            data["coverage"] = {"summary": out_file, "all": coverage}
        out.append([data])
    return out