def clean_inputs(data):
    """Clean BED input files to avoid overlapping segments that cause downstream issues.

    Pre-merges inputs to avoid needing to call multiple times during later parallel steps.
    """
    if not utils.get_in(data, ("config", "algorithm", "variant_regions_orig")):
        data["config"]["algorithm"]["variant_regions_orig"] = dd.get_variant_regions(data)
    clean_vr = clean_file(dd.get_variant_regions(data), data)
    merged_vr = merge_overlaps(clean_vr, data)
    data["config"]["algorithm"]["variant_regions"] = clean_vr
    data["config"]["algorithm"]["variant_regions_merged"] = merged_vr
    if dd.get_coverage(data):
        if not utils.get_in(data, ("config", "algorithm", "coverage_orig")):
            data["config"]["algorithm"]["coverage_orig"] = dd.get_coverage(data)
        clean_cov_bed = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_cov_bed = merge_overlaps(clean_cov_bed, data)
        data["config"]["algorithm"]["coverage"] = clean_cov_bed
        data["config"]["algorithm"]["coverage_merged"] = merged_cov_bed
    if 'seq2c' in get_svcallers(data):
        seq2c_ready_bed = prep_seq2c_bed(data)
        if not seq2c_ready_bed:
            logger.warning("Can't run Seq2C without a svregions or variant_regions BED file")
        else:
            data["config"]["algorithm"]["seq2c_bed_ready"] = seq2c_ready_bed
    return data
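# Illustration (not part of the pipeline code above): a minimal, standalone
# sketch of the interval-merging semantics that merge_overlaps delegates to
# bedtools. The function name and sample coordinates are hypothetical.
def merge_intervals(intervals):
    """Collapse overlapping or touching (chrom, start, end) intervals."""
    merged = []
    for chrom, start, end in sorted(intervals):
        if merged and merged[-1][0] == chrom and start <= merged[-1][2]:
            merged[-1][2] = max(merged[-1][2], end)  # extend the previous interval
        else:
            merged.append([chrom, start, end])
    return [tuple(x) for x in merged]

print(merge_intervals([("chr1", 10, 50), ("chr1", 40, 80), ("chr2", 5, 9)]))
# [('chr1', 10, 80), ('chr2', 5, 9)]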
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    ref_file = dd.get_ref_file(data)
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed,
                           "callable": covinfo.raw_callable,
                           "sample_callable": covinfo.callable,
                           "mapped_stats": readstats.get_cache_file(data)}
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        data = samtools.run_and_save(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    elif dd.get_variant_regions(data):
        callable_region_bed, nblock_bed = \
            callable.block_regions(dd.get_variant_regions(data), bam_file, ref_file, data)
        data["regions"] = {"nblock": nblock_bed,
                           "callable": dd.get_variant_regions(data),
                           "sample_callable": dd.get_variant_regions(data)}
    return [[data]]
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0].split(".")[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    files = {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
             "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
             "back_cnn": os.path.join(raw_work_dir, background_cnn)}
    if not utils.file_exists(files["cnr"]):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            # pick targets, anti-targets and access files based on analysis type
            # http://cnvkit.readthedocs.org/en/latest/nonhybrid.html
            cov_interval = dd.get_coverage_interval(data)
            base_regions = dd.get_variant_regions(data)
            # For genome calls, subset to regions within 10kb of genes
            if cov_interval == "genome":
                base_regions = annotate.subset_by_genes(base_regions, data, work_dir, pad=1e4)
            raw_target_bed = bedutils.merge_overlaps(base_regions, data, out_dir=work_dir)
            target_bed = annotate.add_genes(raw_target_bed, data)
            # bail out if we ended up with no regions
            if not utils.file_exists(target_bed):
                return {}
            if cov_interval == "amplicon":
                target_opts = ["--targets", target_bed, "--access", target_bed]
            elif cov_interval == "genome":
                target_opts = ["--targets", target_bed, "--access", dd.get_variant_regions(data)]
            else:
                target_opts = ["--targets", target_bed, "--access", access_file]
            cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
                        len(test_bams) + len(background_bams))
            cmd = [_get_cmd(), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  target_opts + \
                  ["-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                        "--target-avg-size", str(t_avg)]
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                         "lib", "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ["cnr", "cns"]:
        if not os.path.exists(files[ftype]):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, files[ftype]))
    return files
def _get_variant_regions(data, merged=False):
    out = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    # Only need to merge for variant region inputs, not callable BED regions which don't overlap
    if merged and dd.get_variant_regions(data):
        merged_out = dd.get_variant_regions_merged(data)
        if merged_out:
            out = merged_out
        else:
            out = merge_overlaps(out, data)
    return out
def sample_callable_bed(bam_file, ref_file, data):
    """Retrieve callable regions for a sample subset by defined analysis regions.
    """
    from bcbio.heterogeneity import chromhacks
    CovInfo = collections.namedtuple("CovInfo", "callable, raw_callable, depth_files")
    noalt_calling = "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data)
    def callable_chrom_filter(r):
        """Filter to callable region, potentially limiting by chromosomes.
        """
        return r.name == "CALLABLE" and (not noalt_calling or chromhacks.is_nonalt(r.chrom))
    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir(data):
        callable_bed, depth_files = coverage.calculate(bam_file, data)
        input_regions_bed = dd.get_variant_regions(data)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(data, out_file) as tx_out_file:
                callable_regions = pybedtools.BedTool(callable_bed)
                filter_regions = callable_regions.filter(callable_chrom_filter)
                if input_regions_bed:
                    if not utils.file_uptodate(out_file, input_regions_bed):
                        input_regions = pybedtools.BedTool(input_regions_bed)
                        filter_regions.intersect(input_regions, nonamecheck=True).saveas(tx_out_file)
                else:
                    filter_regions.saveas(tx_out_file)
    return CovInfo(out_file, callable_bed, depth_files)
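# Illustration (not bcbio code): a self-contained sketch of the
# filter-then-intersect pattern used in sample_callable_bed, with in-memory
# BED strings standing in for files. The "_alt" substring test is a
# hypothetical stand-in for chromhacks.is_nonalt.
import pybedtools

callable_bed = pybedtools.BedTool(
    "chr1\t0\t100\tCALLABLE\n"
    "chr1\t100\t150\tNO_COVERAGE\n"
    "chr1_alt\t0\t50\tCALLABLE\n", from_string=True)
variant_regions = pybedtools.BedTool("chr1\t50\t120\n", from_string=True)

# Keep CALLABLE intervals on primary contigs only, then subset to the
# configured analysis regions.
kept = callable_bed.filter(lambda r: r.name == "CALLABLE" and "_alt" not in r.chrom)
print(kept.intersect(variant_regions))  # chr1  50  100  CALLABLE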
def process_intervals(data):
    """Prepare intervals file"""
    bed_file = regions.get_sv_bed(data)
    if not bed_file:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None
    basename = os.path.splitext(bed_file)[0]
    ready_file = basename + ".txt"
    if os.path.exists(ready_file):
        return ready_file
    optimized_bed = basename + ".optimized.bed"
    rscript = utils.Rscript_cmd("r36")
    interval_file_r = utils.R_package_script("r36", "PureCN", "extdata/IntervalFile.R")
    ref_file = dd.get_ref_file(data)
    mappability_resource = dd.get_variation_resources(data)["purecn_mappability"]
    genome = dd.get_genome_build(data)
    cmd = [rscript, interval_file_r,
           "--infile", bed_file,
           "--fasta", ref_file,
           "--outfile", ready_file,
           "--offtarget",
           "--genome", genome,
           "--export", optimized_bed,
           "--mappability", mappability_resource]
    try:
        cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env="r36"),
                                                          utils.get_R_exports(env="r36"),
                                                          " ".join([str(x) for x in cmd]))
        do.run(cmd_line, "PureCN intervals")
    except subprocess.CalledProcessError as msg:
        logger.info("PureCN failed to prepare intervals")
    logger.debug("Saved PureCN interval file into " + ready_file)
    return ready_file
def filter_multimappers(align_file, data):
    """Filtering a BWA alignment file for uniquely mapped reads, from here:
    https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "not unmapped {paired_filter} and not duplicate and [XA] == null and [SA] == null and not supplementary " '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
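# Illustration (not bcbio code): how the sambamba filter expression above
# assembles for a paired-end BAM. File names and the core count are
# placeholders; printing the rendered command makes the template concrete.
base_filter = ('-F "not unmapped {paired_filter} and not duplicate '
               'and [XA] == null and [SA] == null and not supplementary " ')
filter_string = base_filter.format(paired_filter="and paired and proper_pair")
cmd = ("sambamba view -h --nthreads 4 -f bam -L regions.bed "
       + filter_string + "input.bam > output.unique.bam")
print(cmd)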
def prep_seq2c_bed(data):
    """Select the BED file, clean it, and annotate gene names for Seq2C.
    """
    bed_file = regions.get_sv_bed(data)
    if bed_file:
        bed_file = bedutils.clean_file(bed_file, data, prefix="svregions-")
    else:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None
    col_num = bt.BedTool(bed_file).field_count()
    if col_num < 4:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        if annotated_file == bed_file:
            raise ValueError("BED file for Seq2C must be annotated with gene names, "
                             "however the input BED is 3-columns and we have no transcript "
                             "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)
    else:
        annotated_file = bed_file
    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if not utils.file_uptodate(ready_file, annotated_file):
        bed = bt.BedTool(annotated_file)
        if col_num > 4 and col_num != 8:
            bed = bed.cut(range(4))
        bed = bed.filter(lambda x: x.name not in ["", ".", "-"])
        with file_transaction(data, ready_file) as tx_out_file:
            bed.saveas(tx_out_file)
    logger.debug("Saved Seq2C clean annotated ready input BED into " + ready_file)
    return ready_file
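# Illustration (not bcbio code): the column-trim and name-filter steps above,
# run on an in-memory BED. Coordinates and the GENE1 name are made up.
import pybedtools as bt

bed = bt.BedTool(
    "chr1\t10\t20\tGENE1\t0\t+\n"
    "chr1\t30\t40\t.\t0\t+\n", from_string=True)
trimmed = bed.cut(range(4))  # keep chrom/start/end/name columns
named = trimmed.filter(lambda x: x.name not in ["", ".", "-"])  # drop unnamed intervals
print(named)  # chr1  10  20  GENE1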
def calculate_offtarget(bam_file, ref_file, data):
    """Generate file of offtarget read counts for inputs with variant regions.
    """
    vrs_file = dd.get_variant_regions(data)
    if vrs_file:
        out_file = "%s-offtarget-stats.yaml" % os.path.splitext(bam_file)[0]
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                offtarget_regions = "%s-regions.bed" % utils.splitext_plus(out_file)[0]
                ref_bed = get_ref_bedtool(ref_file, data["config"])
                ref_bed.subtract(pybedtools.BedTool(vrs_file), nonamecheck=True).saveas(offtarget_regions)
                cmd = ("samtools view -u {bam_file} -L {offtarget_regions} | "
                       "bedtools intersect -abam - -b {offtarget_regions} -f 1.0 -bed | wc -l")
                offtarget_count = int(subprocess.check_output(cmd.format(**locals()), shell=True))
                cmd = "samtools idxstats {bam_file} | awk '{{s+=$3}} END {{print s}}'"
                mapped_count = int(subprocess.check_output(cmd.format(**locals()), shell=True))
                with open(tx_out_file, "w") as out_handle:
                    yaml.safe_dump({"mapped": mapped_count, "offtarget": offtarget_count},
                                   out_handle, allow_unicode=False, default_flow_style=False)
        return out_file
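# Illustration (not bcbio code): the mapped-read total above sums column 3 of
# `samtools idxstats`. A pysam equivalent, assuming an indexed BAM at a
# placeholder path, might look like this sketch.
import pysam

def total_mapped_reads(bam_path):
    """Sum mapped read counts across contigs from the BAM index."""
    with pysam.AlignmentFile(bam_path, "rb") as bam_in:
        return sum(stat.mapped for stat in bam_in.get_index_statistics())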
def get_base_cnv_regions(data, work_dir, genome_default="transcripts1e4", include_gene_names=True):
    """Retrieve set of target regions for CNV analysis.

    Subsets to extended transcript regions for WGS experiments to avoid long runtimes.
    """
    cov_interval = dd.get_coverage_interval(data)
    base_regions = get_sv_bed(data, include_gene_names=include_gene_names)
    # if we don't have a configured BED or regions to use for SV calling
    if not base_regions:
        # For genome calls, subset to regions near genes as targets
        if cov_interval == "genome":
            base_regions = get_sv_bed(data, genome_default, work_dir,
                                      include_gene_names=include_gene_names)
            if base_regions:
                base_regions = remove_exclude_regions(base_regions, base_regions, [data])
        # Finally, default to the defined variant regions
        if not base_regions:
            base_regions = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    return bedutils.clean_file(base_regions, data)
def _prep_bed(data, work_dir):
    """Select the BED file, clean it, and annotate gene names for Seq2C.
    """
    bed_file = regions.get_sv_bed(data)
    if bed_file:
        bed_file = clean_file(bed_file, data, prefix="svregions-")
    else:
        bed_file = clean_file(dd.get_variant_regions(data), data)
    col_num = bt.BedTool(bed_file).field_count()
    if col_num < 4:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        if annotated_file == bed_file:
            raise ValueError("BED file for Seq2C must be annotated with gene names, "
                             "however the input BED is 3-columns and we have no transcript "
                             "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)
    else:
        annotated_file = bed_file
    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if not utils.file_uptodate(ready_file, annotated_file):
        bed = bt.BedTool(annotated_file)
        if col_num > 4 and col_num != 8:
            bed = bed.cut(range(4))
        bed = bed.filter(lambda x: x.name not in ["", ".", "-"])
        with file_transaction(data, ready_file) as tx_out_file:
            bed.saveas(tx_out_file)
    logger.debug("Saved Seq2C clean annotated ready input BED into " + ready_file)
    return ready_file
def _regions_for_coverage(data, region, ref_file, out_file):
    """Retrieve BED file of regions we need to calculate coverage in.

    Checks for variant region specifications that do not overlap contigs
    (in which case we do not calculate coverage) and regions smaller than
    callable_min_size (in which case we assign everything as callable).
    callable_min_size avoids calculations for small chromosomes we won't
    split on later, saving computation and disk IO.
    """
    variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
    ready_region = shared.subset_variant_regions(variant_regions, region, out_file)
    custom_file = "%s-coverageregions.bed" % utils.splitext_plus(out_file)[0]
    region_size = _get_region_size(ref_file, data, region)
    if variant_regions is None and region_size is not None and region_size < dd.get_callable_min_size(data):
        coverage_str = "CALLABLE" if realign.has_aligned_reads(dd.get_work_bam(data), region) else "NO_COVERAGE"
        custom_file = _write_all_chrom_file(coverage_str, custom_file, ref_file, region, data)
        return custom_file, False
    elif not ready_region:
        get_ref_bedtool(ref_file, data["config"]).saveas(custom_file)
        return custom_file, True
    elif os.path.isfile(ready_region):
        return ready_region, True
    elif isinstance(ready_region, (list, tuple)):
        c, s, e = ready_region
        pybedtools.BedTool("%s\t%s\t%s\n" % (c, s, e), from_string=True).saveas(custom_file)
        return custom_file, True
    else:
        custom_file = _write_all_chrom_file("NO_COVERAGE", custom_file, ref_file, region, data)
        return custom_file, variant_regions is None
def sample_callable_bed(bam_file, ref_file, data):
    """Retrieve callable regions for a sample subset by defined analysis regions.
    """
    from bcbio.heterogeneity import chromhacks
    CovInfo = collections.namedtuple("CovInfo", "callable, raw_callable, depth_files")
    noalt_calling = "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data)
    def callable_chrom_filter(r):
        """Filter to callable region, potentially limiting by chromosomes.
        """
        return r.name == "CALLABLE" and (not noalt_calling or chromhacks.is_nonalt(r.chrom))
    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir(data):
        sv_bed = regions.get_sv_bed(data)
        callable_bed, depth_files = coverage.calculate(bam_file, data, sv_bed)
        input_regions_bed = dd.get_variant_regions(data)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(data, out_file) as tx_out_file:
                callable_regions = pybedtools.BedTool(callable_bed)
                filter_regions = callable_regions.filter(callable_chrom_filter)
                if input_regions_bed:
                    if not utils.file_uptodate(out_file, input_regions_bed):
                        input_regions = pybedtools.BedTool(input_regions_bed)
                        filter_regions.intersect(input_regions, nonamecheck=True).saveas(tx_out_file)
                else:
                    filter_regions.saveas(tx_out_file)
    return CovInfo(out_file, callable_bed, depth_files)
def cutoff_w_expression(vcf_file, expression, data, name="+", filterext="",
                        extra_cmd="", limit_regions="variant_regions"):
    """Perform cutoff-based soft filtering using bcftools expressions like %QUAL < 20 || DP < 4.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(**locals())
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            if vcfutils.vcf_has_variants(vcf_file):
                bcftools = config_utils.get_program("bcftools", data["config"])
                bgzip_cmd = "| bgzip -c" if out_file.endswith(".gz") else ""
                intervals = ""
                if limit_regions == "variant_regions":
                    variant_regions = dd.get_variant_regions(data)
                    if variant_regions:
                        intervals = "-T %s" % vcfutils.bgzip_and_index(variant_regions, data["config"])
                cmd = ("{bcftools} filter -O v {intervals} --soft-filter '{name}' "
                       "-e '{expression}' -m '+' {vcf_file} {extra_cmd} {bgzip_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Cutoff-based soft filtering %s with %s" % (vcf_file, expression), data)
            else:
                shutil.copy(vcf_file, out_file)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
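# Illustration (not bcbio code): the rendered bcftools command for a
# low-depth soft filter, using the template above. File names and the
# 'lowDP' filter name are placeholders.
cmd = ("bcftools filter -O v -T regions.bed.gz --soft-filter 'lowDP' "
       "-e '%QUAL < 20 || DP < 4' -m '+' input.vcf.gz | bgzip -c > output.vcf.gz")
print(cmd)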
def _add_config_regions(nblock_regions, ref_regions, data):
    """Add additional nblock regions based on configured regions to call.

    Identifies user defined regions which we should not be analyzing.
    """
    input_regions_bed = dd.get_variant_regions(data)
    if input_regions_bed:
        input_regions = pybedtools.BedTool(input_regions_bed)
        # work around problem with single region not subtracted correctly.
        if len(input_regions) == 1:
            str_regions = str(input_regions[0]).strip()
            input_regions = pybedtools.BedTool("%s\n%s" % (str_regions, str_regions), from_string=True)
        input_nblock = ref_regions.subtract(input_regions, nonamecheck=True)
        if input_nblock == ref_regions:
            raise ValueError("Input variant_region file (%s) "
                             "excludes all genomic regions. Do the chromosome names "
                             "in the BED file match your genome (chr1 vs 1)?" % input_regions_bed)
        all_intervals = _combine_regions([input_nblock, nblock_regions], ref_regions)
    else:
        all_intervals = nblock_regions
    if "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data):
        from bcbio.heterogeneity import chromhacks
        remove_intervals = ref_regions.filter(lambda r: not chromhacks.is_nonalt(r.chrom))
        all_intervals = _combine_regions([all_intervals, remove_intervals], ref_regions)
    return all_intervals.merge()
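# Illustration (not bcbio code): the subtract step above in isolation.
# Everything in the reference regions not covered by the configured variant
# regions becomes a non-analysis (nblock) interval. Coordinates are made up.
import pybedtools

ref_regions = pybedtools.BedTool("chr1\t0\t1000\n", from_string=True)
input_regions = pybedtools.BedTool("chr1\t200\t400\n", from_string=True)
print(ref_regions.subtract(input_regions))
# chr1  0    200
# chr1  400  1000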
def run_tnscope(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variants with Sentieon's TNscope somatic caller.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(items[0]), items[0])
        interval = _get_interval(variant_regions, region, out_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            assert paired and paired.normal_bam, "Require normal BAM for Sentieon TNscope"
            dbsnp = "--dbsnp %s" % (assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            license = license_export(items[0])
            cores = dd.get_num_cores(items[0])
            cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                   "-i {paired.tumor_bam} -i {paired.normal_bam} {interval} "
                   "--algo TNscope "
                   "--tumor_sample {paired.tumor_name} --normal_sample {paired.normal_name} "
                   "{dbsnp} {tx_out_file}")
            do.run(cmd.format(**locals()), "Sentieon TNscope")
    return out_file
def _regions_for_coverage(data, region, ref_file, out_file):
    """Retrieve BED file of regions we need to calculate coverage in.
    """
    variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
    ready_region = shared.subset_variant_regions(variant_regions, region, out_file)
    custom_file = "%s-coverageregions.bed" % utils.splitext_plus(out_file)[0]
    if not ready_region:
        get_ref_bedtool(ref_file, data["config"]).saveas(custom_file)
        return custom_file, True
    elif os.path.isfile(ready_region):
        return ready_region, True
    elif isinstance(ready_region, (list, tuple)):
        c, s, e = ready_region
        pybedtools.BedTool("%s\t%s\t%s\n" % (c, s, e), from_string=True).saveas(custom_file)
        return custom_file, True
    else:
        with file_transaction(data, custom_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                for feat in get_ref_bedtool(ref_file, data["config"], region):
                    out_handle.write("%s\t%s\t%s\t%s\n" % (feat.chrom, feat.start, feat.end, "NO_COVERAGE"))
        return custom_file, variant_regions is None
def run_tnhaplotyper(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variants with Sentieon's TNhaplotyper (MuTect2 like).
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(items[0]), items[0])
        interval = _get_interval(variant_regions, region, out_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            assert paired.normal_bam, "Require normal BAM for Sentieon TNhaplotyper"
            dbsnp = "--dbsnp %s" % (assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            cosmic = "--cosmic %s" % (assoc_files.get("cosmic")) if "cosmic" in assoc_files else ""
            license = license_export(items[0])
            tx_orig_file = "%s-orig%s" % utils.splitext_plus(tx_out_file)
            cores = dd.get_num_cores(items[0])
            cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                   "-i {paired.tumor_bam} -i {paired.normal_bam} {interval} "
                   "--algo TNhaplotyper "
                   "--tumor_sample {paired.tumor_name} --normal_sample {paired.normal_name} "
                   "{dbsnp} {cosmic} {tx_orig_file}")
            do.run(cmd.format(**locals()), "Sentieon TNhaplotyper")
            cmd = ("gunzip -c {tx_orig_file} | "
                   "sed 's/ID=ECNT,Number=1,Type=Integer/ID=ECNT,Number=1,Type=String/' | "
                   "sed 's/ID=HCNT,Number=1,Type=Integer/ID=HCNT,Number=1,Type=String/' | "
                   "sed 's/ID=NLOD,Number=1,Type=Float/ID=NLOD,Number=1,Type=String/' | "
                   "sed 's/ID=TLOD,Number=1,Type=Float/ID=TLOD,Number=1,Type=String/' | "
                   "sed 's/ID=PON,Number=1,Type=Integer/ID=PON,Number=1,Type=String/' | "
                   "bgzip -c > {tx_out_file}")
            do.run(cmd.format(**locals()), "Sentieon TNhaplotyper: make headers GATK compatible")
            vcfutils.bgzip_and_index(tx_out_file, items[0]["config"])
    return out_file
def umi_consensus(data):
    """Convert UMI grouped reads into fastq pair for re-alignment.
    """
    align_bam = dd.get_work_bam(data)
    umi_method, umi_tag = _check_umi_type(align_bam)
    f1_out = "%s-cumi-1.fq.gz" % utils.splitext_plus(align_bam)[0]
    f2_out = "%s-cumi-2.fq.gz" % utils.splitext_plus(align_bam)[0]
    avg_coverage = coverage.get_average_coverage("rawumi", dd.get_variant_regions(data), data)
    if not utils.file_uptodate(f1_out, align_bam):
        with file_transaction(data, f1_out, f2_out) as (tx_f1_out, tx_f2_out):
            jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tx_f1_out), 2)
            # Improve speeds by avoiding compression read/write bottlenecks
            io_opts = "--async-io=true --compression=0"
            est_options = _estimate_fgbio_defaults(avg_coverage)
            group_opts, cons_opts, filter_opts = _get_fgbio_options(data, est_options, umi_method)
            cons_method = "CallDuplexConsensusReads" if umi_method == "paired" else "CallMolecularConsensusReads"
            tempfile = "%s-bamtofastq-tmp" % utils.splitext_plus(f1_out)[0]
            ref_file = dd.get_ref_file(data)
            cmd = ("unset JAVA_HOME && "
                   "fgbio {jvm_opts} {io_opts} GroupReadsByUmi {group_opts} -t {umi_tag} -s {umi_method} "
                   "-i {align_bam} | "
                   "fgbio {jvm_opts} {io_opts} {cons_method} {cons_opts} --sort-order=:none: "
                   "-i /dev/stdin -o /dev/stdout | "
                   "fgbio {jvm_opts} {io_opts} FilterConsensusReads {filter_opts} -r {ref_file} "
                   "-i /dev/stdin -o /dev/stdout | "
                   "bamtofastq collate=1 T={tempfile} F={tx_f1_out} F2={tx_f2_out} tags=cD,cM,cE gz=1")
            do.run(cmd.format(**locals()), "UMI consensus fastq generation")
    return f1_out, f2_out, avg_coverage
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(out_dir, "qualimapReport.html")
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(out_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        export = utils.local_path_export()
        cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))
    return _parse_qualimap_metrics(report_file)
def _get_target_access_files(cov_interval, data, work_dir):
    """Retrieve target and access files based on the type of data to process.

    pick targets, anti-targets and access files based on analysis type
    http://cnvkit.readthedocs.org/en/latest/nonhybrid.html
    """
    base_regions = regions.get_sv_bed(data)
    # if we don't have a configured BED or regions to use for SV calling
    if not base_regions:
        # For genome calls, subset to regions within 10kb of genes
        if cov_interval == "genome":
            base_regions = regions.get_sv_bed(data, "transcripts1e4", work_dir)
            if base_regions:
                base_regions = shared.remove_exclude_regions(base_regions, base_regions, [data])
        # Finally, default to the defined variant regions
        if not base_regions:
            base_regions = dd.get_variant_regions(data)
    target_bed = bedutils.merge_overlaps(base_regions, data, out_dir=work_dir)
    if cov_interval == "amplicon":
        return target_bed, target_bed
    elif cov_interval == "genome":
        return target_bed, target_bed
    else:
        access_file = _create_access_file(dd.get_ref_file(data), _sv_workdir(data), data)
        return target_bed, access_file
def _run_qualimap(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    report_file = os.path.join(out_dir, "qualimapReport.html")
    if not os.path.exists(report_file):
        ds_bam = bam.downsample(bam_file, data, 1e7)
        bam_file = ds_bam if ds_bam else bam_file
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        resources = config_utils.get_resources("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem}")
        species = data["genome_resources"]["aliases"].get("ensembl", "").upper()
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % data["name"][-1])
    return _parse_qualimap_metrics(report_file)
def filter_multimappers(align_file, data):
    """Filter a bowtie2 alignment file down to uniquely mapped reads.

    Bowtie2 does not appear to have an equivalent of bowtie's -m 1 flag;
    the options that come close do not do the same thing. Bowtie2 does set
    the XS flag for reads mapping in more than one place, so we can filter
    on that instead. This will not work for other aligners.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "[XS] == null and not unmapped {paired_filter} and not duplicate" '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
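# Illustration (not bcbio code): the same XS-tag idea expressed with pysam
# instead of sambamba, assuming placeholder BAM paths. Reads bowtie2 tagged
# with a secondary alignment score (XS) are dropped.
import pysam

with pysam.AlignmentFile("input.bam", "rb") as bam_in, \
        pysam.AlignmentFile("unique.bam", "wb", template=bam_in) as bam_out:
    for read in bam_in:
        if not read.is_unmapped and not read.has_tag("XS"):
            bam_out.write(read)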
def run(bam_file, data, out_dir):
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data)
    broad_runner = broad.PicardCmdRunner("picard", data["config"])
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    out_base = utils.splitext_plus(os.path.basename(bam_fname))[0]
    hsmetric_file = os.path.join(out_dir, "%s.hs_metrics" % out_base)
    hsinsert_file = os.path.join(out_dir, "%s.insert_metrics" % out_base)
    if not utils.file_exists(hsmetric_file) and not utils.file_exists(hsinsert_file):
        with utils.chdir(out_dir):
            with tx_tmpdir() as tmp_dir:
                cur_bam = os.path.basename(bam_fname)
                if not os.path.exists(cur_bam):
                    os.symlink(bam_fname, cur_bam)
                gen_metrics = PicardMetrics(broad_runner, tmp_dir)
                gen_metrics.report(cur_bam, ref_file, bam.is_paired(bam_fname),
                                   target_file, target_file, None, data["config"])
        if utils.file_exists(hsmetric_file):
            do.run("sed -i 's/%s.bam//g' %s" % (out_base.replace(sample, ""), hsmetric_file), "")
        if utils.file_exists(hsinsert_file):
            do.run("sed -i 's/%s.bam//g' %s" % (out_base.replace(sample, ""), hsinsert_file), "")
    return hsmetric_file
def _get_target_access_files(cov_interval, data, work_dir):
    """Retrieve target and access files based on the type of data to process.

    pick targets, anti-targets and access files based on analysis type
    http://cnvkit.readthedocs.org/en/latest/nonhybrid.html
    """
    base_regions = regions.get_sv_bed(data)
    # if we don't have a configured BED or regions to use for SV calling
    if not base_regions:
        # For genome calls, subset to regions within 10kb of genes
        if cov_interval == "genome":
            base_regions = regions.get_sv_bed(data, "transcripts1e4", work_dir)
            base_regions = shared.remove_exclude_regions(base_regions, base_regions, [data])
        # Finally, default to the defined variant regions
        if not base_regions:
            base_regions = dd.get_variant_regions(data)
    target_bed = bedutils.merge_overlaps(base_regions, data, out_dir=work_dir)
    if cov_interval == "amplicon":
        return target_bed, target_bed
    elif cov_interval == "genome":
        return target_bed, target_bed
    else:
        access_file = _create_access_file(dd.get_ref_file(data), _sv_workdir(data), data)
        return target_bed, access_file
def run(bam_file, data, out_dir):
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    broad_runner = broad.PicardCmdRunner("picard", data["config"])
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    out_base = utils.splitext_plus(os.path.basename(bam_fname))[0]
    hsmetric_file = os.path.join(out_dir, "%s.hs_metrics" % out_base)
    hsinsert_file = os.path.join(out_dir, "%s.insert_metrics" % out_base)
    if not utils.file_exists(hsmetric_file) and not utils.file_exists(hsinsert_file):
        with utils.chdir(out_dir):
            with tx_tmpdir() as tmp_dir:
                cur_bam = os.path.basename(bam_fname)
                if not os.path.exists(cur_bam):
                    os.symlink(bam_fname, cur_bam)
                gen_metrics = PicardMetrics(broad_runner, tmp_dir)
                gen_metrics.report(cur_bam, ref_file, bam.is_paired(bam_fname),
                                   target_file, target_file, None, data["config"])
        if utils.file_exists(hsmetric_file):
            do.run("sed -i 's/%s.bam//g' %s" % (out_base.replace(sample, ""), hsmetric_file), "")
        if utils.file_exists(hsinsert_file):
            do.run("sed -i 's/%s.bam//g' %s" % (out_base.replace(sample, ""), hsinsert_file), "")
    return hsmetric_file
def variants(data):
    if "vrn_file" not in data:
        return data
    in_vcf = data['vrn_file']
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        in_bam = data['work_bam']
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        jvm_opts = broad.get_gatk_framework_opts(data['config'])
        gatk_jar = config_utils.get_program("gatk", data['config'], "dir")
        bed_file = dd.get_variant_regions(data)
        sample = splitext_plus(os.path.basename(in_vcf))[0]
        in_bam = data["work_bam"]
        cg_file = os.path.join(sample + "_with-gc.vcf.gz")
        parse_file = os.path.join(sample + "_cg-depth-parse.tsv")
        if not file_exists(cg_file):
            with file_transaction(cg_file) as tx_out:
                cmd = ("java -jar {gatk_jar}/GenomeAnalysisTK.jar -T VariantAnnotator -R {ref_file} "
                       "-L {bed_file} -I {in_bam} "
                       "-A GCContent --variant {in_vcf} --out {tx_out}")
                do.run(cmd.format(**locals()), "GC bias for %s" % in_vcf)
        if not file_exists(parse_file):
            with file_transaction(parse_file) as out_tx:
                with open(out_tx, 'w') as out_handle:
                    out_handle.write("CG\tdepth\tsample\n")
                cmd = ("bcftools query -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R {bed_file} {cg_file} >> {out_tx}")
                do.run(cmd.format(**locals()), "Query for %s" % in_vcf)
        logger.debug('parsing coverage: %s' % sample)
    # return df
    return data
def run_haplotyper(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variants with Sentieon's haplotyper (GATK HaplotypeCaller like).
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(items[0]), items[0])
        interval = _get_interval(variant_regions, region, out_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            dbsnp = "--dbsnp %s" % (assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            bams = " ".join(["-i %s" % x for x in align_bams])
            license = license_export(items[0])
            cores = dd.get_num_cores(items[0])
            out_mode = "--emit_mode gvcf" if joint.want_gvcf(items) else ""
            cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                   "{bams} {interval} --algo Haplotyper {out_mode} {dbsnp} {tx_out_file}")
            do.run(cmd.format(**locals()), "Sentieon Haplotyper")
    return out_file
def run(bam_file, data, out_dir):
    config = data["config"]
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data)
    broad_runner = broad.PicardCmdRunner("picard", config)
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    hsmetric_file = os.path.join(out_dir, "%s-sort.hs_metrics" % sample)
    if utils.file_exists(hsmetric_file):
        return hsmetric_file
    with utils.chdir(out_dir):
        with tx_tmpdir() as tmp_dir:
            cur_bam = os.path.basename(bam_fname)
            if not os.path.exists(cur_bam):
                os.symlink(bam_fname, cur_bam)
            gen_metrics = PicardMetrics(broad_runner, tmp_dir)
            gen_metrics.report(cur_bam, ref_file, bam.is_paired(bam_fname),
                               target_file, target_file, None, config)
    do.run("sed -i 's/-sort.bam//g' %s" % hsmetric_file, "")
    return hsmetric_file
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(out_dir, "qualimapReport.html")
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(out_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))
    return _parse_qualimap_metrics(report_file)
def filter_multimappers(align_file, data):
    """Filter a bowtie2 alignment file down to uniquely mapped reads.

    Bowtie2 does not appear to have an equivalent of bowtie's -m 1 flag;
    the options that come close do not do the same thing. Bowtie2 does set
    the XS flag for reads mapping in more than one place, so we can filter
    on that instead. This will not work for other aligners.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "[XS] == null and not unmapped {paired_filter} and not duplicate" '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data
    """
    from bcbio.bam import callable
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data)
    if not tools_on:
        tools_on = []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    data = dd.set_jointcaller(data, ["%s-joint" % v for v in dd.get_variantcaller(data)])
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variation", "rnaseq",
                                              "gatk-haplotype"))
    data = _setup_variant_regions(data, out_dir)
    out_file = os.path.join(out_dir, "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    if not utils.file_exists(out_file):
        region_files = []
        regions = []
        for cur_region in callable.get_split_regions(dd.get_variant_regions(data), data):
            str_region = "_".join([str(x) for x in cur_region])
            region_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                       "variation", "rnaseq",
                                                                       "gatk-haplotype", "regions")),
                                       "%s-%s-gatk-haplotype.vcf.gz" % (dd.get_sample_name(data),
                                                                        str_region))
            region_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data],
                                                dd.get_ref_file(data), {},
                                                region=cur_region, out_file=region_file)
            region_files.append(region_file)
            regions.append(cur_region)
        out_file = vcfutils.concat_variant_files(region_files, out_file, regions,
                                                 dd.get_ref_file(data), data["config"])
    return dd.set_vrn_file(data, out_file)
def get_sv_bed(data, method=None, out_dir=None, include_gene_names=True):
    """Retrieve a BED file of regions for SV and heterogeneity calling using the provided method.

    method choices:
    - exons: Raw BED file of exon regions
    - transcripts: Full collapsed regions with the min and max of each transcript.
    - transcriptsXXXX: Collapsed regions around transcripts with a window size of XXXX.
    - A custom BED file of regions
    """
    if method is None:
        method = (tz.get_in(["config", "algorithm", "sv_regions"], data)
                  or dd.get_variant_regions(data)
                  or dd.get_sample_callable(data))
    gene_file = dd.get_gene_bed(data)
    if method and os.path.isfile(method):
        return method
    elif not gene_file or not method:
        return None
    elif method == "exons":
        return gene_file
    elif method.startswith("transcripts"):
        window = method.split("transcripts")[-1]
        window = int(float(window)) if window else 0
        return _collapse_transcripts(gene_file, window, data, out_dir,
                                     include_gene_names=include_gene_names)
    else:
        raise ValueError("Unexpected transcript retrieval method: %s" % method)
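# Illustration (not bcbio code): how the transcriptsXXXX method string
# encodes a window size, using the same parsing as get_sv_bed.
for method in ["transcripts", "transcripts1e4", "transcripts100"]:
    window = method.split("transcripts")[-1]
    window = int(float(window)) if window else 0
    print(method, "->", window)
# transcripts -> 0, transcripts1e4 -> 10000, transcripts100 -> 100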
def prep_recal(data):
    """Do pre-BQSR recalibration, calculation of recalibration tables.
    """
    if dd.get_recalibrate(data) in [True, "gatk"]:
        logger.info("Prepare BQSR tables with GATK: %s " % str(dd.get_sample_name(data)))
        dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), data)
        if not dbsnp_file:
            logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.")
            return data
        broad_runner = broad.runner_from_config(data["config"])
        data["prep_recal"] = _gatk_base_recalibrator(broad_runner, dd.get_align_bam(data),
                                                     dd.get_ref_file(data), dd.get_platform(data),
                                                     dbsnp_file, dd.get_variant_regions(data), data)
    elif dd.get_recalibrate(data) == "sentieon":
        logger.info("Prepare BQSR tables with sentieon: %s " % str(dd.get_sample_name(data)))
        data["prep_recal"] = sentieon.bqsr_table(data)
    elif dd.get_recalibrate(data):
        raise NotImplementedError("Unsupported recalibration type: %s" % (dd.get_recalibrate(data)))
    return data
def variants(data, out_dir):
    """Variants QC metrics"""
    if "variants" not in data:
        return None
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    bcfstats = _run_bcftools(data, work_dir)
    bed_file = dd.get_coverage(data)
    bcf_out = os.path.join(sample + "_bcbio_variants_stats.txt")
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    with chdir(work_dir):
        if not file_exists(bcf_out):
            with open(bcf_out, "w") as out_handle:
                yaml.safe_dump(bcfstats, out_handle, default_flow_style=False, allow_unicode=False)
        if "vrn_file" not in data or not bed_file:
            return None
        in_vcf = data['vrn_file']
        cleaned_bed = clean_file(bed_file, data)
        if file_exists(qc_file):
            return qc_file
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(parse_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", cleaned_bed,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])
            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                           "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()), "Calculating GC content and depth for %s" % in_vcf)
            logger.debug('parsing coverage: %s' % sample)
        if not file_exists(qc_file):
            # This file will be copied to the final output directory
            _summary_variants(parse_file, qc_file)
        if file_exists(qc_file) and file_exists(parse_file):
            remove_plus(cg_file)
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions.
    """
    samples = utils.unpack_worlds(samples)
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples):
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples, require_bam=False).items():
            batches.append(items)
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"]["callable_regions"] = analysis_file
                    data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(analysis_file).count()
                elif vr_file:
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(vr_file).count()
                highdepth_bed = tz.get_in(["regions", "highdepth"], data)
                if highdepth_bed:
                    data["config"]["algorithm"]["highdepth_regions"] = highdepth_bed
                # attach a representative sample for calculating callable region
                if not data.get("work_bam"):
                    for x in items:
                        if x.get("work_bam"):
                            data["work_bam_callable"] = x["work_bam"]
                out.append([data])
    assert len(out) == len(samples)
    if len(analysis_files) > 0:
        final_regions = pybedtools.BedTool(analysis_files[0])
        _analysis_block_stats(final_regions, batches[0])
    return out
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    ref_file = dd.get_ref_file(data)
    artifacts = gatk.collect_artifact_metrics(data)
    if artifacts:
        data = dd.update_summary_qc(data, "picard", artifacts.pop(), artifacts)
        oxog = gatk.collect_oxog_metrics(data)
        data = dd.update_summary_qc(data, "picard", oxog.pop(), oxog)
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed,
                           "callable": covinfo.raw_callable,
                           "sample_callable": covinfo.callable,
                           "mapped_stats": readstats.get_cache_file(data)}
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        data = samtools.run_and_save(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    elif dd.get_variant_regions(data):
        callable_region_bed, nblock_bed = \
            callable.block_regions(dd.get_variant_regions(data), bam_file, ref_file, data)
        data["regions"] = {"nblock": nblock_bed,
                           "callable": dd.get_variant_regions(data),
                           "sample_callable": dd.get_variant_regions(data)}
    return [[data]]
def _get_variant_regions(data, merged=False):
    out = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    if merged:
        merged_out = dd.get_variant_regions_merged(data)
        if merged_out:
            out = merged_out
        else:
            out = merge_overlaps(out, data)
    return out
def variants(data):
    if "vrn_file" not in data:
        return data
    if not dd.get_coverage(data):
        return data
    in_vcf = data["vrn_file"]
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        sample = dd.get_sample_name(data)
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        cg_file = os.path.join(sample + "_with-gc.vcf.gz")
        parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(cg_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", bed_file,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
            cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])
            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, "w") as out_handle:
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                           "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()), "Calculating GC content and depth for %s" % in_vcf)
            logger.debug("parsing coverage: %s" % sample)
    return data
def _get_chroms(data):
    """Retrieve chromosomes included in variant_regions for parallelization.
    """
    chroms = set([])
    with shared.bedtools_tmpdir(data):
        for r in pybedtools.BedTool(dd.get_variant_regions(data)):
            chroms.add(r.chrom)
    out = []
    for c in ref.file_contigs(dd.get_ref_file(data)):
        if c.name in chroms:
            out.append((c.name, 0, c.size))
    return out
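# Illustration (not bcbio code): collecting the chromosome set from an
# in-memory BED; the real function reads the configured variant_regions file
# and then orders by the reference contigs.
import pybedtools

bed = pybedtools.BedTool("chr1\t0\t100\nchr1\t200\t300\nchr2\t0\t50\n", from_string=True)
chroms = {r.chrom for r in bed}
print(sorted(chroms))  # ['chr1', 'chr2']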
def clean_inputs(data):
    """Clean BED input files to avoid overlapping segments that cause downstream issues.

    Pre-merges inputs to avoid needing to call multiple times during later parallel steps.
    """
    if not utils.get_in(data, ("config", "algorithm", "variant_regions_orig")):
        data["config"]["algorithm"]["variant_regions_orig"] = dd.get_variant_regions(data)
    clean_vr = clean_file(dd.get_variant_regions(data), data, prefix="cleaned-")
    merged_vr = merge_overlaps(clean_vr, data)
    data["config"]["algorithm"]["variant_regions"] = clean_vr
    data["config"]["algorithm"]["variant_regions_merged"] = merged_vr
    if dd.get_coverage(data):
        if not utils.get_in(data, ("config", "algorithm", "coverage_orig")):
            data["config"]["algorithm"]["coverage_orig"] = dd.get_coverage(data)
        clean_cov_bed = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_cov_bed = merge_overlaps(clean_cov_bed, data)
        data["config"]["algorithm"]["coverage"] = clean_cov_bed
        data["config"]["algorithm"]["coverage_merged"] = merged_cov_bed
    if 'seq2c' in get_svcallers(data):
        seq2c_ready_bed = prep_seq2c_bed(data)
        if not seq2c_ready_bed:
            logger.warning("Can't run Seq2C without a svregions or variant_regions BED file")
        else:
            data["config"]["algorithm"]["seq2c_bed_ready"] = seq2c_ready_bed
    elif regions.get_sv_bed(data):
        dd.set_sv_regions(data, clean_file(regions.get_sv_bed(data), data, prefix="svregions-"))
    return data
def population_variant_regions(items):
    """Retrieve the variant region BED file from a population of items.

    If tumor/normal, return the tumor BED file. If a population, return the
    BED file covering the most bases.
    """
    import pybedtools
    if len(items) == 1:
        return dd.get_variant_regions(items[0])
    else:
        paired = vcfutils.get_paired(items)
        if paired:
            return dd.get_variant_regions(paired.tumor_data)
        else:
            vrs = []
            for data in items:
                vr_bed = dd.get_variant_regions(data)
                if vr_bed:
                    vrs.append((pybedtools.BedTool(vr_bed).total_coverage(), vr_bed))
            vrs.sort(reverse=True)
            if vrs:
                return vrs[0][1]
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", []))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        # Fixing the file name: MultiQC picks sample name from BAM file name.
        fixed_bam_fname = os.path.join(out_dir, dd.get_sample_name(data) + ".bam")
        if not os.path.islink(fixed_bam_fname):
            os.symlink(bam_file, fixed_bam_fname)
        export = utils.local_path_export()
        cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {fixed_bam_fname} -outdir {results_dir} "
               "--skip-duplicated --skip-dup-mode 0 "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = None
        if tz.get_in(("genome_resources", "aliases", "human"), data, ""):
            species = "HUMAN"
        elif any(tz.get_in(["genome_build"], data, "").startswith(k) for k in ["mm", "GRCm"]):
            species = "MOUSE"
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_coverage(data) or dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))
    return _parse_qualimap_metrics(report_file, data)
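
# _bed_to_bed6 is used above but defined elsewhere; a plausible minimal
# stand-in (an assumption about its behavior): pad three-column BED records
# out to the six columns (name, score, strand) Qualimap's -gff option expects.
import os

def _bed_to_bed6_sketch(orig_file, out_dir):
    """Convert a BED file to six-column BED for Qualimap (illustrative only)."""
    base, ext = os.path.splitext(os.path.basename(orig_file))
    bed6_file = os.path.join(out_dir, "%s-bed6%s" % (base, ext))
    with open(orig_file) as in_handle, open(bed6_file, "w") as out_handle:
        for i, line in enumerate(l for l in in_handle if l.strip() and not l.startswith("#")):
            fields = line.rstrip("\n").split("\t")[:3]
            # fill name with a unique index, plus placeholder score and strand
            fields += [str(i), "1.0", "+"]
            out_handle.write("\t".join(fields) + "\n")
    return bed6_file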
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions.
    """
    samples = utils.unpack_worlds(samples)
    samples = [cwlutils.unpack_tarballs(x, x) for x in samples]
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples):
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples, require_bam=False).items():
            batches.append(items)
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"]["callable_regions"] = analysis_file
                    data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(analysis_file).count()
                elif vr_file:
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(vr_file).count()
                # attach a representative sample for calculating callable region
                if not data.get("work_bam"):
                    for x in items:
                        if x.get("work_bam"):
                            data["work_bam_callable"] = x["work_bam"]
                out.append([data])
    # Ensure output order matches input order, consistency for CWL-based runs
    assert len(out) == len(samples)
    sample_indexes = {dd.get_sample_name(d): i for i, d in enumerate(samples)}
    def by_input_index(xs):
        return sample_indexes[dd.get_sample_name(xs[0])]
    out.sort(key=by_input_index)
    if len(analysis_files) > 0:
        final_regions = pybedtools.BedTool(analysis_files[0])
        _analysis_block_stats(final_regions, batches[0])
    return out
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions.
    """
    samples = utils.unpack_worlds(samples)
    samples = cwlutils.unpack_tarballs(samples, samples[0])
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples):
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples, require_bam=False).items():
            batches.append(items)
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"]["callable_regions"] = analysis_file
                    data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(analysis_file).count()
                elif vr_file:
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(vr_file).count()
                # attach a representative sample for calculating callable region
                if not data.get("work_bam"):
                    for x in items:
                        if x.get("work_bam"):
                            data["work_bam_callable"] = x["work_bam"]
                out.append([data])
    # Ensure output order matches input order, consistency for CWL-based runs
    assert len(out) == len(samples)
    sample_indexes = {dd.get_sample_name(d): i for i, d in enumerate(samples)}
    def by_input_index(xs):
        return sample_indexes[dd.get_sample_name(xs[0])]
    out.sort(key=by_input_index)
    if len(analysis_files) > 0:
        final_regions = pybedtools.BedTool(analysis_files[0])
        _analysis_block_stats(final_regions, batches[0])
    return out
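
# The order-restoring idiom used above, in isolation (a generic sketch; the
# helper name is hypothetical): map each input to its original index, then sort
# the outputs by that index so CWL-based runs see a stable ordering.
def _restore_input_order_sketch(inputs, outputs, key):
    """Sort outputs to match the original ordering of inputs."""
    order = {key(x): i for i, x in enumerate(inputs)}
    return sorted(outputs, key=lambda x: order[key(x)])

# e.g. with name_of = lambda x: dd.get_sample_name(x[0] if isinstance(x, list) else x):
#     out = _restore_input_order_sketch(samples, out, key=name_of)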
def variants(data):
    if "vrn_file" not in data:
        return data
    if not dd.get_coverage(data):
        return data
    in_vcf = data['vrn_file']
    sample = dd.get_sample_name(data)
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        if file_exists(qc_file):
            return data
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(cg_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", bed_file,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
            cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])
            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                           "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # These files will be copied to the final directory
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                os.remove(cg_file)
    return data
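
# The bcftools query call above emits one row per variant and sample; with the
# header written first, the parse file looks roughly like this (values
# illustrative):
#
#   CG      depth   sample
#   0.57    42      NA12878
#   0.44    35      NA12878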
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions.
    """
    import pybedtools
    samples = [x[0] for x in samples]
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples):
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples, require_bam=False).items():
            batches.append(items)
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"]["callable_regions"] = analysis_file
                    data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(analysis_file).count()
                elif vr_file:
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(vr_file).count()
                highdepth_bed = tz.get_in(["regions", "highdepth"], data)
                if highdepth_bed:
                    data["config"]["algorithm"]["highdepth_regions"] = highdepth_bed
                # attach a representative sample for calculating callable region
                if not data.get("work_bam"):
                    for x in items:
                        if x.get("work_bam"):
                            data["work_bam_callable"] = x["work_bam"]
                out.append([data])
    assert len(out) == len(samples)
    if len(analysis_files) > 0:
        final_regions = pybedtools.BedTool(analysis_files[0])
        _analysis_block_stats(final_regions, batches[0])
    return out
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: full genome coverage
      - regional: regional coverage, like exome capture, with off-target reads
      - amplicon: amplicon-based regional coverage without off-target reads
    """
    genome_cov_thresh = 0.40  # percent of genome covered for whole genome analysis
    offtarget_thresh = 0.05   # percent of offtarget reads required to be capture (not amplification) based
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            seq_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            seq_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        genome_cov_pct = seq_size / float(total_size)
        if genome_cov_pct > genome_cov_thresh:
            cov_interval = "genome"
            offtarget_pct = 0.0
        else:
            offtarget_stats_file = dd.get_offtarget_stats(data)
            if not offtarget_stats_file:
                offtarget_pct = 0.0
            else:
                with open(offtarget_stats_file) as in_handle:
                    stats = yaml.safe_load(in_handle)
                if stats.get("offtarget") and stats["mapped_unique"]:
                    offtarget_pct = float(stats["offtarget"]) / stats["mapped_unique"]
                else:
                    offtarget_pct = 0.0
            if offtarget_pct > offtarget_thresh:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info("%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
                    % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
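
# The classification rule above, reduced to a dependency-free sketch with the
# same thresholds (inputs are plain numbers; the helper name is hypothetical):
def _classify_coverage_sketch(seq_size, total_size, offtarget_pct,
                              genome_cov_thresh=0.40, offtarget_thresh=0.05):
    """Classify coverage as genome, regional or amplicon from summary numbers."""
    if seq_size / float(total_size) > genome_cov_thresh:
        return "genome"
    return "regional" if offtarget_pct > offtarget_thresh else "amplicon"

# Example: 96Mb of exome targets on a 3.2Gb genome is 3% coverage, so not
# "genome"; with 8% offtarget reads it classifies as "regional", with 1%
# offtarget as "amplicon".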
def run_haplotyper(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variants with Sentieon's haplotyper (GATK HaplotypeCaller like).
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(items[0]), items[0])
        interval = _get_interval(variant_regions, region, out_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            dbsnp = "--dbsnp %s" % (assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            bams = " ".join(["-i %s" % x for x in align_bams])
            license = license_export(items[0])
            cmd = ("{license}sentieon driver -t 1 -r {ref_file} "
                   "{bams} {interval} --algo Haplotyper {dbsnp} {tx_out_file}")
            do.run(cmd.format(**locals()), "Sentieon Haplotyper")
    return out_file
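
# With a single input BAM and dbSNP configured, the rendered command looks
# roughly like the following (paths and the exact interval flag emitted by
# _get_interval, which is defined elsewhere, are illustrative assumptions):
#
#   sentieon driver -t 1 -r genome.fa -i sample.bam --interval regions.bed \
#       --algo Haplotyper --dbsnp dbsnp.vcf.gz sample-variants.vcf.gz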
def prep_recal(data):
    """Do pre-BQSR recalibration, calculation of recalibration tables.
    """
    if dd.get_recalibrate(data) in [True, "gatk"]:
        logger.info("Preparing BQSR tables with GATK: %s" % dd.get_sample_name(data))
        dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), data)
        if not dbsnp_file:
            logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.")
            return data
        broad_runner = broad.runner_from_config(data["config"])
        data["prep_recal"] = _gatk_base_recalibrator(broad_runner, dd.get_align_bam(data),
                                                     dd.get_ref_file(data), dd.get_platform(data),
                                                     dbsnp_file, dd.get_variant_regions(data), data)
    elif dd.get_recalibrate(data) == "sentieon":
        logger.info("Preparing BQSR tables with Sentieon: %s" % dd.get_sample_name(data))
        data["prep_recal"] = sentieon.bqsr_table(data)
    elif dd.get_recalibrate(data):
        raise NotImplementedError("Unsupported recalibration type: %s" % dd.get_recalibrate(data))
    return data
def get_base_cnv_regions(data, work_dir):
    """Retrieve set of target regions for CNV analysis.

    Subsets to extended transcript regions for WGS experiments to avoid long runtimes.
    """
    cov_interval = dd.get_coverage_interval(data)
    base_regions = regions.get_sv_bed(data)
    # if we don't have a configured BED or regions to use for SV calling
    if not base_regions:
        # For genome calls, subset to regions within 10kb of genes
        if cov_interval == "genome":
            base_regions = regions.get_sv_bed(data, "transcripts1e4", work_dir)
            if base_regions:
                base_regions = remove_exclude_regions(base_regions, base_regions, [data])
        # Finally, default to the defined variant regions
        if not base_regions:
            base_regions = dd.get_variant_regions(data)
    return base_regions
def get_base_cnv_regions(data, work_dir, genome_default="transcripts1e4", include_gene_names=True):
    """Retrieve set of target regions for CNV analysis.

    Subsets to extended transcript regions for WGS experiments to avoid long runtimes.
    """
    cov_interval = dd.get_coverage_interval(data)
    base_regions = get_sv_bed(data, include_gene_names=include_gene_names)
    # if we don't have a configured BED or regions to use for SV calling
    if not base_regions:
        # For genome calls, subset to regions near genes as targets
        if cov_interval == "genome":
            base_regions = get_sv_bed(data, genome_default, work_dir,
                                      include_gene_names=include_gene_names)
            if base_regions:
                base_regions = remove_exclude_regions(base_regions, base_regions, [data])
        # Finally, default to the defined variant regions
        if not base_regions:
            base_regions = dd.get_variant_regions(data)
    return bedutils.clean_file(base_regions, data)
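
# The selection order above, isolated as a pure function (a simplified sketch;
# it drops the exclude-region filtering and final file cleaning the real code
# performs, and the name is hypothetical):
def _choose_cnv_regions_sketch(sv_bed, transcript_bed, variant_regions, cov_interval):
    """Prefer configured SV regions, then gene-padded transcripts for WGS,
    then the defined variant regions."""
    if sv_bed:
        return sv_bed
    if cov_interval == "genome" and transcript_bed:
        return transcript_bed
    return variant_regions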