def priority_total_coverage(data):
    """Summarize sambamba region coverage over the priority regions.

    Coverage is reported at thresholds of 10x through 100x in 10x steps.
    An existing output file is reused as a cache.  The absolute path of
    the result is recorded under data['priority_total_coverage'].
    """
    bed_file = dd.get_priority_regions(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    out_file = os.path.join(
        work_dir, dd.get_sample_name(data) + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    in_bam = dd.get_work_bam(data)
    nthreads = dd.get_num_cores(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    # Write a comment-free copy of the BED into a transactional temp dir
    # first — presumably sambamba rejects comment lines (TODO confirm).
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = bed.decomment(
            bed_file, os.path.join(tmp_dir, os.path.basename(bed_file)))
        with file_transaction(out_file) as tx_out_file:
            # Thresholds 10,20,...,100; name must stay visible to locals()
            # because the command template is filled from local variables.
            thresholds = " ".join("-T %s" % depth for depth in range(10, 101, 10))
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "{thresholds} "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
def priority_total_coverage(data):
    """Calculate coverage at depth 20 in the priority regions.

    Runs sambamba depth region over the priority BED; an existing
    output file is reused.  The absolute result path is stored under
    data['priority_total_coverage'].
    """
    bed_file = dd.get_priority_regions(data)
    if not bed_file:
        return data
    sample = dd.get_sample_name(data)
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if not file_exists(out_file):
        in_bam = dd.get_work_bam(data)
        nthreads = dd.get_num_cores(data)
        sambamba = config_utils.get_program("sambamba", data,
                                            default="sambamba")
        with file_transaction(out_file) as tx_out_file:
            # The command template is filled from local variable names.
            cmd = ("{sambamba} depth region -t {nthreads} -L {bed_file} "
                   "-F \"not unmapped\" "
                   "-T 20 {in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
def priority_total_coverage(data):
    """Calculate coverage at 10x depth intervals (10x-100x) in the priority regions.

    Reuses an existing output file as a cache and records the absolute
    output path under data['priority_total_coverage'].
    """
    bed_file = dd.get_priority_regions(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    # Decomment the BED into a transactional temp dir before handing it
    # to sambamba — presumably sambamba chokes on comment lines (TODO
    # confirm against sambamba's -L input handling).
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
        cleaned_bed = bed.decomment(bed_file, cleaned_bed)
        with file_transaction(out_file) as tx_out_file:
            # NOTE: cmd and message are filled from locals(), so the
            # variable names above are part of the behavior.
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
def summary(items):
    """Build a chanjo coverage database for a batch and report incomplete regions.

    Combines the coverage and priority BEDs, builds a chanjo database at
    DEFAULT_COVERAGE_CUTOFF, imports per-sample coverage from each BAM,
    and attaches {"summary": db, "incomplete": regions} to samples when
    the database exists.
    """
    cutoff = DEFAULT_COVERAGE_CUTOFF
    data = items[0]
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "coverage"))
    coverage_bed = dd.get_coverage_regions(data)
    priority_bed = dd.get_priority_regions(data)
    # NOTE(review): this prep work runs even when out_file already
    # exists; a sibling summary variant guards it behind the cache check.
    combined_bed = bed.concat([coverage_bed, priority_bed])
    clean_bed = bedutils.clean_file(combined_bed.fn, data) if len(combined_bed) > 0 else combined_bed.fn
    bed_file = _uniquify_bed_names(clean_bed, out_dir, data)
    batch = _get_group_batch(items)
    assert batch, ("Did not find batch for samples: %s" %
                   ",".join([dd.get_sample_name(x) for x in items]))
    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file) and utils.file_exists(bed_file):
        with file_transaction(data, out_file) as tx_out_file:
            # chanjo is expected alongside the running interpreter.
            chanjo = os.path.join(os.path.dirname(sys.executable), "chanjo")
            cmd = ("{chanjo} --db {tx_out_file} build {bed_file}")
            do.run(cmd.format(**locals()), "Prep chanjo database")
            # Commands are filled from locals(); names above matter.
            for data in items:
                sample = dd.get_sample_name(data)
                bam_file = data["work_bam"]
                cmd = ("{chanjo} annotate -s {sample} -g {batch} -c {cutoff} "
                       "{bam_file} {bed_file} | "
                       "{chanjo} --db {tx_out_file} import")
                do.run(cmd.format(**locals()), "Chanjo coverage", data)
    # NOTE(review): runs even when the db was never created — verify
    # incomplete_regions tolerates a missing out_file.
    incomplete = incomplete_regions(out_file, batch, out_dir)
    out = []
    for data in items:
        if utils.file_exists(out_file):
            data["coverage"] = {"summary": out_file, "incomplete": incomplete}
        out.append([data])
    return out
def summary(items):
    """Summarize batch coverage with chanjo over coverage/priority regions.

    Minimizes and combines the configured coverage and priority BEDs,
    builds a chanjo database, imports per-sample coverage at the
    configured minimum depth, and attaches {"summary": db, "all":
    coverage-report} to each sample.  Returns items unchanged when no
    coverage or priority regions are configured.
    """
    data = items[0]
    cutoff = dd.get_coverage_depth_min(data)
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "coverage"))
    coverage_bed = dd.get_coverage_regions(data)
    priority_bed = dd.get_priority_regions(data)
    batch = _get_group_batch(items)
    assert batch, ("Did not find batch for samples: %s" %
                   ",".join([dd.get_sample_name(x) for x in items]))
    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file):
        if coverage_bed:
            mini_coverage = bed.minimize(coverage_bed).fn
        if priority_bed:
            mini_priority = bed.minimize(priority_bed).fn
        if coverage_bed and priority_bed:
            combined_bed = bed.concat([mini_coverage, mini_priority]).fn
        elif coverage_bed:
            combined_bed = mini_coverage
        elif priority_bed:
            combined_bed = mini_priority
        else:  # no coverage or priority file has been set
            return items
        # combined_bed is a file path (string) at this point.  The old
        # code used `... if len(combined_bed) > 0 else combined_bed.fn`,
        # whose fallback would raise AttributeError on a string; since a
        # path's length is always > 0 the fallback was unreachable, so
        # always cleaning preserves behavior while removing the trap.
        clean_bed = bedutils.clean_file(combined_bed, data)
        bed_file = _uniquify_bed_names(clean_bed, out_dir, data)
        if bed_file and utils.file_exists(bed_file):
            with file_transaction(data, out_file) as tx_out_file:
                # chanjo is expected next to the running interpreter.
                chanjo = os.path.join(os.path.dirname(sys.executable), "chanjo")
                cmd = "{chanjo} --db {tx_out_file} build {bed_file}"
                do.run(cmd.format(**locals()), "Prep chanjo database")
                # Command templates are filled from locals(): sample,
                # bam_file, batch, cutoff, bed_file, tx_out_file.
                for data in items:
                    sample = dd.get_sample_name(data)
                    bam_file = data["work_bam"]
                    cmd = ("{chanjo} annotate -s {sample} -g {batch} -c {cutoff} "
                           "{bam_file} {bed_file} | "
                           "{chanjo} --db {tx_out_file} import")
                    do.run(cmd.format(**locals()), "Chanjo coverage", data)
        if bed_file:
            os.remove(bed_file)
    coverage = regions_coverage(out_file, batch, out_dir)
    problem_regions = dd.get_problem_region_dir(data)
    if problem_regions:
        coverage = decorate_problem_regions(coverage, problem_regions)
    out = []
    for data in items:
        if utils.file_exists(out_file):
            data["coverage"] = {"summary": out_file, "all": coverage}
        out.append([data])
    return out
def _add_scatter_plot(out, data):
    """Create a CNVkit scatter plot limited to the priority regions.

    Returns the PDF path, or None when no priority regions are
    configured.  An existing plot is reused.
    """
    priority_regions = dd.get_priority_regions(data)
    if not priority_regions:
        return None
    out_file = "%s-scatter.pdf" % os.path.splitext(out["cnr"])[0]
    # Check the cache before any work: previously the prioritized plot
    # regions were computed even when the plot already existed.
    if utils.file_exists(out_file):
        return out_file
    priority_bed = plot._prioritize_plot_regions(pybedtools.BedTool(priority_regions), data)
    cnr = _remove_haplotype_chroms(out["cnr"], data)
    cns = _remove_haplotype_chroms(out["cns"], data)
    with file_transaction(data, out_file) as tx_out_file:
        cmd = [_get_cmd(), "scatter", "-s", cns, "-o", tx_out_file,
               "-l", priority_bed, cnr]
        do.run(cmd, "CNVkit scatter plot")
    return out_file
def _add_scatter_plot(out, data):
    """Draw a CNVkit scatter plot over the priority regions.

    Returns None when no priority regions are configured; otherwise the
    path to the generated (or previously cached) scatter PDF.
    """
    base, _ = os.path.splitext(out["cnr"])
    out_file = base + "-scatter.pdf"
    priority_regions = dd.get_priority_regions(data)
    if not priority_regions:
        return None
    priority_bed = plot._prioritize_plot_regions(
        pybedtools.BedTool(priority_regions), data)
    if utils.file_exists(out_file):
        return out_file
    cnr_file = _remove_haplotype_chroms(out["cnr"], data)
    cns_file = _remove_haplotype_chroms(out["cns"], data)
    with file_transaction(data, out_file) as tx_out_file:
        do.run([_get_cmd(), "scatter", "-s", cns_file, "-o", tx_out_file,
                "-l", priority_bed, cnr_file],
               "CNVkit scatter plot")
    return out_file
def priority_coverage(data):
    """Calculate per-base depth over the priority regions.

    Streams regions to samtools/bedtools in chunks sized to stay under
    the shell's maximum command length.  Writes a BED-like file of
    per-base depth tagged with the sample name, cached under
    <work>/report/coverage, and records the absolute path in
    data['priority_coverage'].
    """
    AVERAGE_REGION_STRING_LENGTH = 100
    bed_file = dd.get_priority_regions(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    # Floor division: the chunk size feeds a partitioning function and
    # must be an int (true division yields a float on Python 3).
    batch_size = max_command_length() // AVERAGE_REGION_STRING_LENGTH
    sample = dd.get_sample_name(data)
    # Anchor the output in work_dir so the cache check works from any
    # cwd; previously the bare relative name was checked before the
    # chdir and so never matched the file written inside work_dir.
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    if file_exists(out_file):
        data['priority_coverage'] = os.path.abspath(out_file)
        return data
    with chdir(work_dir):
        in_bam = data['work_bam']
        logger.debug("Calculating priority coverage for %s" % sample)
        region_bed = pybedtools.BedTool(bed_file)
        with file_transaction(out_file) as tx_out_file:
            lcount = 0
            for chunk in robust_partition_all(batch_size, region_bed):
                coord_batch = []
                line_batch = ""
                for line in chunk:
                    lcount += 1
                    chrom = line.chrom
                    start = max(line.start, 0)  # clamp negative starts
                    end = line.end
                    coords = "%s:%s-%s" % (chrom, start, end)
                    coord_batch.append(coords)
                    line_batch += str(line)
                if not coord_batch:
                    continue
                # BED lines for this chunk, saved to a temp file for bedtools.
                region_file = pybedtools.BedTool(line_batch,
                                                 from_string=True).saveas().fn
                coord_string = " ".join(coord_batch)
                # The command template below is filled from locals().
                awk_string = r"""'BEGIN {OFS="\t"} {print $1,$2+$5,$2+$5,$4,$6"\t%s"}'""" % sample
                cmd = ("samtools view -b {in_bam} {coord_string} | "
                       "bedtools coverage -d -a {region_file} -b - | "
                       "awk {awk_string} >> {tx_out_file}")
                _silence_run(cmd.format(**locals()))
    data['priority_coverage'] = os.path.abspath(out_file)
    return data
def summary(items):
    """Summarize batch coverage with chanjo over combined coverage/priority BEDs.

    Builds a chanjo database at the configured minimum depth, imports
    per-sample coverage from each BAM, and attaches {"summary": db,
    "all": coverage-report} to samples when the database exists.
    """
    data = items[0]
    cutoff = dd.get_coverage_depth_min(data)
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "coverage"))
    coverage_bed = dd.get_coverage_regions(data)
    priority_bed = dd.get_priority_regions(data)
    batch = _get_group_batch(items)
    assert batch, ("Did not find batch for samples: %s" %
                   ",".join([dd.get_sample_name(x) for x in items]))
    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file):
        combined_bed = bed.concat([coverage_bed, priority_bed])
        # Skip cleaning when the combined BED has no intervals.
        clean_bed = bedutils.clean_file(
            combined_bed.fn, data) if len(combined_bed) > 0 else combined_bed.fn
        bed_file = _uniquify_bed_names(clean_bed, out_dir, data)
        if utils.file_exists(bed_file):
            with file_transaction(data, out_file) as tx_out_file:
                # chanjo is expected alongside the running interpreter.
                chanjo = os.path.join(os.path.dirname(sys.executable), "chanjo")
                cmd = ("{chanjo} --db {tx_out_file} build {bed_file}")
                do.run(cmd.format(**locals()), "Prep chanjo database")
                # Commands are filled from locals(); names above matter.
                for data in items:
                    sample = dd.get_sample_name(data)
                    bam_file = data["work_bam"]
                    cmd = ("{chanjo} annotate -s {sample} -g {batch} -c {cutoff} "
                           "{bam_file} {bed_file} | "
                           "{chanjo} --db {tx_out_file} import")
                    do.run(cmd.format(**locals()), "Chanjo coverage", data)
        # NOTE(review): removes the uniquified temp BED; a sibling
        # variant guards this with `if bed_file:` — confirm bed_file is
        # always a real path here.
        os.remove(bed_file)
    coverage = regions_coverage(out_file, batch, out_dir)
    problem_regions = dd.get_problem_region_dir(data)
    if problem_regions:
        coverage = decorate_problem_regions(coverage, problem_regions)
    out = []
    for data in items:
        if utils.file_exists(out_file):
            data["coverage"] = {"summary": out_file, "all": coverage}
        out.append([data])
    return out
def _needs_coverage(data):
    """Return a truthy value when coverage or priority regions are configured.

    Note: returns the region value itself (short-circuit `or`), not a
    bool, matching how callers treat the result.
    """
    regions = dd.get_coverage_regions(data)
    return regions or dd.get_priority_regions(data)