def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source ("restricted") version of GATK.
    An existing output file is reused; when any input BAM lacks aligned reads
    in ``region`` an empty VCF is written instead of running GATK.

    Returns the path to the (bgzipped) VCF output file.
    """
    if out_file is None:
        bam_base = utils.splitext_plus(align_bams[0])[0]
        out_file = "%s-variants.vcf.gz" % bam_base
    if utils.file_exists(out_file):
        return out_file

    config = items[0]["config"]
    broad_runner, params = _shared_gatk_call_prep(
        align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"],
        region, out_file)
    assert broad_runner.gatk_type() == "restricted", \
        "Require full version of GATK 2.4+ for haplotype calling"

    every_bam_has_reads = all(has_aligned_reads(bam_file, region)
                              for bam_file in align_bams)
    if not every_bam_has_reads:
        vcfutils.write_empty_vcf(out_file, config)
        return out_file

    with file_transaction(out_file) as tx_out_file:
        params += ["-T", "HaplotypeCaller",
                   "-o", tx_out_file,
                   "--annotation", "ClippingRankSumTest",
                   "--annotation", "DepthPerSampleHC"]
        # Enable hardware based optimizations in GATK 3.1+
        gatk_version = LooseVersion(broad_runner.gatk_major_version())
        if gatk_version >= LooseVersion("3.1"):
            params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
        broad_runner.new_resources("gatk-haplotype")
        broad_runner.run_gatk(params)
    return out_file
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run the MuTect paired analysis algorithm.

    An existing output file is reused. When ``region`` is not a list/tuple and
    any input BAM has no aligned reads in it, an empty VCF is written instead
    of running MuTect.

    Returns the path to the output VCF in every case, including the
    empty-VCF case.
    """
    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        broad_runner, params = _mutect_call_prep(
            align_bams, items, ref_file, assoc_files, region, out_file)
        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            # Hand back the path (not None) so a first run matches a re-run
            # that finds the empty VCF already on disk.
            return out_file
        with file_transaction(out_file) as tx_out_file:
            # Rationale: MuTect writes another table to stdout,
            # which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
    return out_file
def shared_variantcall(call_fn, name, align_bams, ref_file, config, assoc_files, region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.

    Indexes every input BAM, then either reuses an existing output, writes an
    empty VCF (target regions file missing or a BAM without aligned reads in
    ``region``), or delegates to ``call_fn`` inside a file transaction.

    Returns the path to the output VCF.
    """
    broad_runner = broad.runner_from_config(config)
    for align_bam in align_bams:
        broad_runner.run_fn("picard_index", align_bam)
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    logger.info("Genotyping with {name}: {region} {fname}".format(
        name=name, region=region, fname=os.path.basename(align_bams[0])))
    variant_regions = config["algorithm"].get("variant_regions", None)
    target_regions = subset_variant_regions(variant_regions, region, out_file)
    # A string target that does not exist on disk means there is nothing to call
    targets_missing = (variant_regions is not None and
                       isinstance(target_regions, basestring) and
                       not os.path.isfile(target_regions))
    if targets_missing or not all(realign.has_aligned_reads(align_bam, region)
                                  for align_bam in align_bams):
        vcfutils.write_empty_vcf(out_file)
        return out_file
    with file_transaction(out_file) as tx_out_file:
        call_fn(align_bams, ref_file, config, target_regions, tx_out_file)
    return out_file
def unified_genotyper(align_bams, ref_file, config, dbsnp=None, region=None, out_file=None):
    """Perform SNP genotyping on the given alignment files.

    Reuses an existing output file. Writes an empty VCF when ``region`` is not
    a list/tuple and some BAM has no aligned reads in it; otherwise runs
    GATK's UnifiedGenotyper inside a file transaction.

    Returns the path to the output VCF.
    """
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    broad_runner, params = _shared_gatk_call_prep(
        align_bams, ref_file, config, dbsnp, region, out_file)
    region_is_coords = isinstance(region, (list, tuple))
    if not region_is_coords and not all(has_aligned_reads(bam_file, region)
                                        for bam_file in align_bams):
        write_empty_vcf(out_file)
        return out_file
    with file_transaction(out_file) as tx_out_file:
        params += ["-T", "UnifiedGenotyper",
                   "-o", tx_out_file,
                   "--genotype_likelihoods_model", "BOTH"]
        broad_runner.run_gatk(params)
    return out_file
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform, snp_file, intervals):
    """Step 1 of GATK recalibration process, producing table of covariates.

    Reuses an existing ``.grp`` file. If the BAM has no aligned reads a
    placeholder file is written instead of running GATK.

    Returns the path to the recalibration table.
    """
    out_file = "%s.grp" % os.path.splitext(dup_align_bam)[0]
    if file_exists(out_file):
        return out_file

    if not has_aligned_reads(dup_align_bam):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file

    with curdir_tmpdir() as tmp_dir:
        with file_transaction(out_file) as tx_out_file:
            params = ["-T", "BaseRecalibrator",
                      "-o", tx_out_file,
                      "-I", dup_align_bam,
                      "-R", ref_file]
            # GATK-lite does not have support for
            # insertion/deletion quality modeling
            if not broad_runner.has_gatk_full():
                params += ["--disable_indel_quals"]
            if snp_file:
                params += ["--knownSites", snp_file]
            if intervals:
                params += ["-L", intervals,
                           "--interval_set_rule", "INTERSECTION"]
            broad_runner.run_gatk(params, tmp_dir)
    return out_file
def write_recal_bam(data, region=None, out_file=None):
    """Step 2 of GATK recalibration -- use covariates to re-write output file.

    Writes the recalibrated BAM (or the "nochr" reads when ``region`` is
    ``"nochr"``), optionally replaces it with a quality-binned version when
    the ``quality_bin`` setting requests post-recalibration binning, and
    stores the result in ``data["work_bam"]``.

    Returns a single-item list containing the updated ``data``.
    """
    config = data["config"]
    if out_file is None:
        out_file = "%s-gatkrecal.bam" % os.path.splitext(data["work_bam"])[0]
    logger.info("Writing recalibrated BAM for %s to %s" % (data["name"], out_file))

    if region == "nochr":
        out_bam = write_nochr_reads(data["work_bam"], out_file)
    else:
        out_bam = _run_recal_bam(data["work_bam"], data["prep_recal"], region,
                                 data["sam_ref"], out_file, config)

    qual_bin = config["algorithm"].get("quality_bin", None)
    wants_postrecal_bin = (qual_bin is True or
                           qual_bin == "postrecal" or
                           (isinstance(qual_bin, list) and "postrecal" in qual_bin))
    if wants_postrecal_bin and has_aligned_reads(out_bam):
        binned_bam = cram.illumina_qual_bin(out_bam, data["sam_ref"],
                                            os.path.dirname(out_bam), config)
        # Swap the binned BAM into place, keeping the original under .binned
        unbinned_backup = out_bam + ".binned"
        shutil.move(out_bam, unbinned_backup)
        shutil.move(binned_bam, out_bam)
        utils.save_diskspace(unbinned_backup,
                             "Quality binned to %s" % out_bam, config)
    data["work_bam"] = out_bam
    return [data]
def _call_variants_samtools(align_bams, ref_file, items, target_regions, out_file):
    """Call variants with samtools in target_regions.

    Works around a GATK VCF compatibility issue in samtools 0.20 by removing
    extra Version information from VCF header lines.

    Writes an empty VCF when none of the input BAMs has aligned reads in
    ``target_regions``; otherwise pipes mpileup through bcftools and
    ``vcfutils.pl varFilter`` into ``out_file`` (bgzipped when the name ends
    in ``gz``).

    Raises ValueError for a bcftools newer than 0.1.19 paired with a samtools
    at or below 0.1.19.
    """
    config = items[0]["config"]
    max_read_depth = "1000"
    mpileup = prep_mpileup(align_bams, ref_file, max_read_depth, config,
                           target_regions=target_regions)
    bcftools = config_utils.get_program("bcftools", config)
    bcftools_version = programs.get_version("bcftools", config=config)
    samtools_version = programs.get_version("samtools", config=config)
    if LooseVersion(bcftools_version) > LooseVersion("0.1.19"):
        if LooseVersion(samtools_version) <= LooseVersion("0.1.19"):
            raise ValueError("samtools calling not supported with 0.1.19 samtools and 0.20 bcftools")
        bcftools_opts = "call -v -c"
    else:
        bcftools_opts = "view -v -c -g"
    compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
    # Keep the vcfutils.pl executable under its own name: binding it to
    # `vcfutils` would shadow the vcfutils module used for write_empty_vcf.
    vcfutils_pl = config_utils.get_program("vcfutils.pl", config)
    # XXX Check if we need this when supporting samtools 0.2.0 calling.
    # 0.1.9 fails on regions without reads.
    if not any(realign.has_aligned_reads(x, target_regions) for x in align_bams):
        vcfutils.write_empty_vcf(out_file, config)
    else:
        cmd = ("{mpileup} "
               "| {bcftools} {bcftools_opts} - "
               "| {vcfutils_pl} varFilter -D {max_read_depth} "
               "| sed 's/,Version=3>/>/'"
               "{compress_cmd} > {out_file}").format(
                   mpileup=mpileup, bcftools=bcftools,
                   bcftools_opts=bcftools_opts, vcfutils_pl=vcfutils_pl,
                   max_read_depth=max_read_depth, compress_cmd=compress_cmd,
                   out_file=out_file)
        logger.info(cmd)
        do.run(cmd, "Variant calling with samtools", {})
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run the MuTect paired analysis algorithm.

    MuTect SNV calls are written to a ``-mutect`` intermediate VCF. Indels
    come from Scalpel (when installed) or, for Appistry MuTect builds with SID
    enabled, from SomaticIndelDetector; the call sets are then combined into
    ``out_file``. Without an indel caller, ``out_file`` is a link to the
    MuTect calls.

    An existing ``out_file`` is reused. When ``region`` is not a list/tuple
    and a BAM has no aligned reads in it, an empty VCF is written.

    Returns the path to the final VCF in every case.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        broad_runner = broad.runner_from_config(config, "mutect")
        out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                           if "vcf" in out_file else out_file + "-mutect.vcf")
        broad_runner, params = _mutect_call_prep(
            align_bams, items, ref_file, assoc_files, region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            # Hand back the path (not None) so a first run matches a re-run
            # that finds the empty VCF already on disk.
            return out_file
        with file_transaction(out_file_mutect) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        # Operates on the finalized file, so it runs after the transaction
        _rename_allelic_fraction_field(out_file_mutect, config)
        disable_SID = True  # SID isn't great, so use Scalpel instead
        if "appistry" not in broad_runner.get_mutect_version() or disable_SID:
            # Scalpel InDels
            is_paired = "-I:normal" in params
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            if scalpel.is_installed(config):
                with file_transaction(out_file_indels) as tx_out_file2:
                    if not is_paired:
                        scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                    else:
                        scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                out_file = vcfutils.combine_variant_files(
                    orig_files=[out_file_mutect, out_file_indels],
                    out_file=out_file,
                    ref_file=items[0]["sam_ref"],
                    config=config,
                    region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        else:
            # SomaticIndelDetector modifications
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=items[0]["sam_ref"],
                config=config,
                region=region)
    return out_file
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform, dbsnp_file, intervals):
    """Step 1 of GATK recalibration process, producing table of covariates.

    Reuses an existing ``.grp`` file. If the BAM has no aligned reads in
    ``intervals`` a placeholder file is written instead of running GATK.

    Returns the path to the recalibration table.
    """
    bam_base = os.path.splitext(dup_align_bam)[0]
    out_file = "%s.grp" % bam_base
    plot_file = "%s-plots.pdf" % bam_base
    if file_exists(out_file):
        return out_file

    if not has_aligned_reads(dup_align_bam, intervals):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file

    with curdir_tmpdir() as tmp_dir:
        with file_transaction(out_file) as tx_out_file:
            params = ["-T", "BaseRecalibrator",
                      "-o", tx_out_file,
                      "--plot_pdf_file", plot_file,
                      "-I", dup_align_bam,
                      "-R", ref_file]
            downsample_pct = _get_downsample_pct(broad_runner, dup_align_bam)
            if downsample_pct:
                params += ["--downsample_to_fraction", str(downsample_pct),
                           "--downsampling_type", "ALL_READS"]
            # GATK-lite does not have support for
            # insertion/deletion quality modeling
            if broad_runner.gatk_type() == "lite":
                params += ["--disable_indel_quals"]
            if dbsnp_file:
                params += ["--knownSites", dbsnp_file]
            if intervals:
                params += ["-L", intervals,
                           "--interval_set_rule", "INTERSECTION"]
            broad_runner.run_gatk(params, tmp_dir)
    return out_file
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform, dbsnp_file, intervals, data):
    """Step 1 of GATK recalibration process, producing table of covariates.

    For GATK 4 we use local multicore spark runs:
    https://github.com/broadinstitute/gatk/issues/2345

    For GATK3, Large whole genome BAM files take an excessively long time to
    recalibrate and the extra inputs don't help much beyond a certain point.
    See the 'Downsampling analysis' plots in the GATK documentation:

    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest

    This identifies large files and calculates the fraction to downsample to.

    Returns the path to the recalibration table; a placeholder file is written
    when the BAM has no aligned reads in ``intervals``.
    """
    target_counts = 1e8  # 100 million reads per read group, 20x the plotted max
    out_file = "%s-recal.grp" % os.path.splitext(dup_align_bam)[0]
    if utils.file_exists(out_file):
        return out_file

    if not has_aligned_reads(dup_align_bam, intervals):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file

    with file_transaction(data, out_file) as tx_out_file:
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+ or GATK4 for BQSR"
        params = ["-I", dup_align_bam]
        if gatk_type == "gatk4":
            params += ["-T", "BaseRecalibratorSpark",
                       "--sparkMaster", "local[%s]" % dd.get_num_cores(data),
                       "--output", tx_out_file,
                       "--reference", dd.get_ref_twobit(data)]
        else:
            params += ["-T", "BaseRecalibrator",
                       "-o", tx_out_file,
                       "-R", ref_file]
            downsample_pct = bam.get_downsample_pct(dup_align_bam, target_counts, data)
            if downsample_pct:
                params += ["--downsample_to_fraction", str(downsample_pct),
                           "--downsampling_type", "ALL_READS"]
            if platform.lower() == "solid":
                params += ["--solid_nocall_strategy", "PURGE_READ",
                           "--solid_recal_mode", "SET_Q_ZERO_BASE_N"]
        if dbsnp_file:
            params += ["--knownSites", dbsnp_file]
        if intervals:
            params += ["-L", intervals,
                       "--interval_set_rule", "INTERSECTION"]
        broad_runner.run_gatk(params, os.path.dirname(tx_out_file))
    return out_file
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source ("restricted") version of GATK.
    An existing output file is reused; when any input BAM lacks aligned reads
    in ``region`` an empty VCF is written instead of running GATK.

    Returns the path to the output VCF.
    """
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    broad_runner, params = _shared_gatk_call_prep(
        align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"],
        region, out_file)
    assert broad_runner.gatk_type() == "restricted", \
        "Require full version of GATK 2.4+ for haplotype calling"

    every_bam_has_reads = all(has_aligned_reads(bam_file, region)
                              for bam_file in align_bams)
    if not every_bam_has_reads:
        vcfutils.write_empty_vcf(out_file)
        return out_file

    with file_transaction(out_file) as tx_out_file:
        params += ["-T", "HaplotypeCaller", "-o", tx_out_file]
        broad_runner.new_resources("gatk-haplotype")
        broad_runner.run_gatk(params)
    return out_file
def unified_genotyper(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Perform SNP genotyping on the given alignment files.

    Reuses an existing output file. Writes an empty VCF when ``region`` is not
    a list/tuple and some BAM has no aligned reads in it; otherwise runs
    GATK's UnifiedGenotyper. Ploidy is looked up per region for the restricted
    GATK and fixed at "2" for other GATK types.

    Returns the path to the output VCF.
    """
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    broad_runner, params = _shared_gatk_call_prep(
        align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"],
        region, out_file)
    region_is_coords = isinstance(region, (list, tuple))
    if not region_is_coords and not all(has_aligned_reads(bam_file, region)
                                        for bam_file in align_bams):
        vcfutils.write_empty_vcf(out_file)
        return out_file

    with file_transaction(out_file) as tx_out_file:
        if broad_runner.gatk_type() == "restricted":
            ploidy_arg = str(ploidy.get_ploidy(items, region))
        else:
            ploidy_arg = "2"
        params += ["-T", "UnifiedGenotyper",
                   "-o", tx_out_file,
                   "-ploidy", ploidy_arg,
                   "--genotype_likelihoods_model", "BOTH"]
        broad_runner.run_gatk(params)
    return out_file
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform, snp_file, intervals):
    """Step 1 of GATK recalibration process, producing table of covariates.

    Returns the path to the ``.grp`` table. An existing table is reused, and a
    placeholder is written when the BAM has no aligned reads.
    """
    out_file = "%s.grp" % os.path.splitext(dup_align_bam)[0]
    if file_exists(out_file):
        return out_file

    if has_aligned_reads(dup_align_bam):
        with curdir_tmpdir() as tmp_dir, file_transaction(out_file) as tx_out_file:
            params = ["-T", "BaseRecalibrator"]
            params += ["-o", tx_out_file]
            params += ["-I", dup_align_bam]
            params += ["-R", ref_file]
            # GATK-lite does not have support for
            # insertion/deletion quality modeling
            if not broad_runner.has_gatk_full():
                params += ["--disable_indel_quals"]
            if snp_file:
                params += ["--knownSites", snp_file]
            if intervals:
                params += ["-L", intervals, "--interval_set_rule", "INTERSECTION"]
            broad_runner.run_gatk(params, tmp_dir)
    else:
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
    return out_file
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source ("restricted") version of GATK.
    An existing output file is reused; when any input BAM lacks aligned reads
    in ``region`` an empty VCF is written instead of running GATK.

    Returns the path to the (bgzipped) VCF output file.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    config = items[0]["config"]
    broad_runner, params = _shared_gatk_call_prep(
        align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"],
        region, out_file)
    assert broad_runner.gatk_type() == "restricted", \
        "Require full version of GATK 2.4+ for haplotype calling"

    every_bam_has_reads = all(has_aligned_reads(bam_file, region)
                              for bam_file in align_bams)
    if not every_bam_has_reads:
        vcfutils.write_empty_vcf(out_file, config)
        return out_file

    with file_transaction(out_file) as tx_out_file:
        params += ["-T", "HaplotypeCaller", "-o", tx_out_file]
        broad_runner.new_resources("gatk-haplotype")
        broad_runner.run_gatk(params)
    return out_file
def shared_variantcall(call_fn, name, align_bams, ref_file, config, assoc_files, region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.

    Indexes every input BAM, produces the raw calls (or an empty VCF when the
    target regions file is missing or a BAM has no aligned reads in
    ``region``), then annotates the result.

    Returns the path to the annotated VCF.
    """
    broad_runner = broad.runner_from_config(config)
    for align_bam in align_bams:
        broad_runner.run_fn("picard_index", align_bam)
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]

    if not file_exists(out_file):
        logger.info("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=os.path.basename(align_bams[0])))
        variant_regions = config["algorithm"].get("variant_regions", None)
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        # A string target that does not exist on disk means nothing to call
        targets_missing = (variant_regions is not None and
                           isinstance(target_regions, basestring) and
                           not os.path.isfile(target_regions))
        if targets_missing or not all(realign.has_aligned_reads(align_bam, region)
                                      for align_bam in align_bams):
            vcfutils.write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                call_fn(align_bams, ref_file, config, target_regions, tx_out_file)

    # Annotation runs whether the calls are new or were already on disk
    return annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.dbsnp,
                                           ref_file, config)
def _regions_for_coverage(data, region, ref_file, out_file):
    """Retrieve BED file of regions we need to calculate coverage in.

    Checks for variant region specifications that do not overlap contigs
    (in which case we do not calculate coverage) and regions smaller than
    callable_min_size (in which case we assign everything as callable).
    callable_min_size avoids calculations for small chromosomes we won't
    split on later, saving computation and disk IO.

    Returns a ``(bed_file, flag)`` tuple: the BED file to use plus a boolean.
    The flag is False when the small-region shortcut writes a pre-assigned
    file, and in the final fallback it is True only when no variant regions
    were supplied.
    """
    variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
    ready_region = shared.subset_variant_regions(variant_regions, region, out_file)
    custom_file = "%s-coverageregions.bed" % utils.splitext_plus(out_file)[0]
    region_size = _get_region_size(ref_file, data, region)
    if (variant_regions is None and region_size is not None
            and region_size < dd.get_callable_min_size(data)):
        coverage_str = ("CALLABLE" if realign.has_aligned_reads(dd.get_work_bam(data), region)
                        else "NO_COVERAGE")
        custom_file = _write_all_chrom_file(coverage_str, custom_file, ref_file, region, data)
        return custom_file, False
    if not ready_region:
        get_ref_bedtool(ref_file, data["config"]).saveas(custom_file)
        return custom_file, True
    # Test for coordinate tuples before touching the filesystem:
    # os.path.isfile raises TypeError when handed a list or tuple.
    if isinstance(ready_region, (list, tuple)):
        c, s, e = ready_region
        pybedtools.BedTool("%s\t%s\t%s\n" % (c, s, e), from_string=True).saveas(custom_file)
        return custom_file, True
    if os.path.isfile(ready_region):
        return ready_region, True
    custom_file = _write_all_chrom_file("NO_COVERAGE", custom_file, ref_file, region, data)
    return custom_file, variant_regions is None
def shared_variantcall(call_fn, name, align_bams, ref_file, items, assoc_files, region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.

    Picks a default output name (batch-based for paired analyses), produces
    the raw calls via ``call_fn`` (or an empty VCF when the target regions
    file is missing or a BAM has no aligned reads in ``region``), then
    annotates the result.

    Returns the path to the annotated VCF.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            # NOTE(review): the "metdata" key looks like a typo for
            # "metadata" -- confirm against the config schema before changing.
            out_file = "%s-paired-variants.vcf" % config["metdata"]["batch"]
        else:
            out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]

    if not file_exists(out_file):
        logger.info("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=os.path.basename(align_bams[0])))
        for align_bam in align_bams:
            bam.index(align_bam, config)
        variant_regions = config["algorithm"].get("variant_regions", None)
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        # A string target that does not exist on disk means nothing to call
        targets_missing = (variant_regions is not None and
                           isinstance(target_regions, basestring) and
                           not os.path.isfile(target_regions))
        if targets_missing or not all(realign.has_aligned_reads(align_bam, region)
                                      for align_bam in align_bams):
            vcfutils.write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions, tx_out_file)

    # Annotation runs whether the calls are new or were already on disk
    return annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files["dbsnp"],
                                           ref_file, config)
def _gatk_count_covariates(broad_runner, dup_align_bam, ref_file, platform, snp_file, intervals):
    """Step 1 of GATK recalibration process -- counting covariates.

    Reuses an existing ``.recal`` file. If the BAM has no aligned reads a
    placeholder file is written instead of running GATK.

    Returns the path to the covariates file.
    """
    out_file = "%s.recal" % os.path.splitext(dup_align_bam)[0]
    if file_exists(out_file):
        return out_file

    if not has_aligned_reads(dup_align_bam):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file

    covariates = ["ReadGroupCovariate", "QualityScoreCovariate",
                  "CycleCovariate", "DinucCovariate"]
    with curdir_tmpdir() as tmp_dir:
        with file_transaction(out_file) as tx_out_file:
            params = ["-T", "CountCovariates"]
            for covariate in covariates:
                params += ["-cov", covariate]
            params += ["-recalFile", tx_out_file,
                       "-I", dup_align_bam,
                       "-R", ref_file,
                       "-l", "INFO",
                       "-U",
                       "-OQ",
                       "--default_platform", platform]
            if snp_file:
                params += ["--knownSites", snp_file]
            if intervals:
                params += ["-L", intervals,
                           "--interval_set_rule", "INTERSECTION"]
            broad_runner.run_gatk(params, tmp_dir)
    return out_file
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform, dbsnp_file, intervals, data):
    """Step 1 of GATK recalibration process, producing table of covariates.

    Large whole genome BAM files take an excessively long time to recalibrate
    and the extra inputs don't help much beyond a certain point. See the
    'Downsampling analysis' plots in the GATK documentation:

    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest

    This identifies large files and calculates the fraction to downsample to.

    TODO: Use new GATK 2.6+ AnalyzeCovariates tool to plot recalibration results.

    Returns the path to the ``.grp`` table; a placeholder file is written when
    the BAM has no aligned reads in ``intervals``.
    """
    target_counts = 1e8  # 100 million reads per read group, 20x the plotted max
    out_file = "%s.grp" % os.path.splitext(dup_align_bam)[0]
    if file_exists(out_file):
        return out_file

    if not has_aligned_reads(dup_align_bam, intervals):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file

    with tx_tmpdir(data) as tmp_dir:
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "BaseRecalibrator",
                      "-o", tx_out_file,
                      "-I", dup_align_bam,
                      "-R", ref_file]
            downsample_pct = bam.get_downsample_pct(dup_align_bam, target_counts, data)
            if downsample_pct:
                params += ["--downsample_to_fraction", str(downsample_pct),
                           "--downsampling_type", "ALL_READS"]
            if platform.lower() == "solid":
                params += ["--solid_nocall_strategy", "PURGE_READ",
                           "--solid_recal_mode", "SET_Q_ZERO_BASE_N"]
            # GATK-lite does not have support for
            # insertion/deletion quality modeling
            if broad_runner.gatk_type() == "lite":
                params += ["--disable_indel_quals"]
            if dbsnp_file:
                params += ["--knownSites", dbsnp_file]
            if intervals:
                params += ["-L", intervals,
                           "--interval_set_rule", "INTERSECTION"]
            broad_runner.run_gatk(params, tmp_dir)
    return out_file
def pipeline_summary(data):
    """Provide summary information on processing sample.

    QC summaries are generated only when a reference is configured and the
    sample has a ``.bam`` work file containing aligned reads; the result is
    stored under ``data["summary"]``.

    Returns ``data`` wrapped in a nested single-item list.
    """
    result = [[data]]
    work_bam = data.get("work_bam")
    if data["sam_ref"] is None:
        return result
    if not work_bam or not work_bam.endswith(".bam"):
        return result
    if has_aligned_reads(work_bam):
        logger.info("Generating summary files: %s" % str(data["name"]))
        data["summary"] = _run_qc_tools(work_bam, data)
    return result
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Runs the QC tools and stores their output in ``data["summary"]`` when the
    sample has a reference, a ``.bam`` work file, and aligned reads in it.

    Returns ``[[data]]``.
    """
    work_bam = data.get("work_bam")
    if data["sam_ref"] is not None:
        if work_bam and work_bam.endswith(".bam"):
            if has_aligned_reads(work_bam):
                logger.info("Generating summary files: %s" % str(data["name"]))
                data["summary"] = _run_qc_tools(work_bam, data)
    return [[data]]
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Summaries are computed from ``work_bam`` when the ``merge_bamprep``
    algorithm setting is truthy (the default) and from ``callable_bam``
    otherwise. QC runs only when a reference is configured and the chosen BAM
    has aligned reads; the result is stored under ``data["summary"]``.

    Returns ``data`` wrapped in a nested single-item list.
    """
    if data["config"]["algorithm"].get("merge_bamprep", True):
        work_bam = data.get("work_bam")
    else:
        work_bam = data.get("callable_bam")

    result = [[data]]
    if data["sam_ref"] is None or not work_bam:
        return result
    if has_aligned_reads(work_bam):
        logger.info("Generating summary files: %s" % str(data["name"]))
        data["summary"] = _run_qc_tools(work_bam, data)
    return result
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Uses ``work_bam`` when ``merge_bamprep`` is truthy (the default) and
    ``callable_bam`` otherwise. When a reference is configured and that BAM
    has aligned reads, QC output is stored in ``data["summary"]``.

    Returns ``[[data]]``.
    """
    merged = data["config"]["algorithm"].get("merge_bamprep", True)
    bam_key = "work_bam" if merged else "callable_bam"
    work_bam = data.get(bam_key)
    if data["sam_ref"] is not None:
        if work_bam and has_aligned_reads(work_bam):
            logger.info("Generating summary files: %s" % str(data["name"]))
            data["summary"] = _run_qc_tools(work_bam, data)
    return [[data]]
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform, dbsnp_file, intervals, data):
    """Step 1 of GATK recalibration process, producing table of covariates.

    For GATK 4 we use local multicore spark runs:
    https://github.com/broadinstitute/gatk/issues/2345

    For GATK3, Large whole genome BAM files take an excessively long time to
    recalibrate and the extra inputs don't help much beyond a certain point.
    See the 'Downsampling analysis' plots in the GATK documentation:

    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest

    This identifies large files and calculates the fraction to downsample to.

    Returns the path to the recalibration table under the sample's ``align``
    work directory; a placeholder file is written when the BAM has no aligned
    reads in ``intervals``.
    """
    target_counts = 1e8  # 100 million reads per read group, 20x the plotted max
    out_dir = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))
    bam_base = utils.splitext_plus(os.path.basename(dup_align_bam))[0]
    out_file = os.path.join(out_dir, "%s-recal.grp" % bam_base)
    if utils.file_exists(out_file):
        return out_file

    if not has_aligned_reads(dup_align_bam, intervals):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file

    with file_transaction(data, out_file) as tx_out_file:
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+ or GATK4 for BQSR"
        params = ["-I", dup_align_bam]
        cores = dd.get_num_cores(data)
        if gatk_type == "gatk4":
            params += ["-T", "BaseRecalibratorSpark",
                       "--sparkMaster", "local[%s]" % cores,
                       "--output", tx_out_file,
                       "--reference", dd.get_ref_twobit(data),
                       "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file)]
        else:
            params += ["-T", "BaseRecalibrator",
                       "-o", tx_out_file,
                       "-R", ref_file]
            downsample_pct = bam.get_downsample_pct(dup_align_bam, target_counts, data)
            if downsample_pct:
                params += ["--downsample_to_fraction", str(downsample_pct),
                           "--downsampling_type", "ALL_READS"]
            if platform.lower() == "solid":
                params += ["--solid_nocall_strategy", "PURGE_READ",
                           "--solid_recal_mode", "SET_Q_ZERO_BASE_N"]
        if dbsnp_file:
            params += ["--knownSites", dbsnp_file]
        if intervals:
            params += ["-L", intervals,
                       "--interval_set_rule", "INTERSECTION"]
        # Scale memory with the core count for multicore runs
        if cores > 1:
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
        else:
            memscale = None
        broad_runner.run_gatk(params, os.path.dirname(tx_out_file),
                              memscale=memscale, parallel_gc=True)
    return out_file
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform, dbsnp_file, intervals):
    """Step 1 of GATK recalibration process, producing table of covariates.

    TODO: Use new GATK 2.6+ AnalyzeCovariates tool to plot recalibration results.

    Returns the path to the ``.grp`` table. An existing table is reused, and a
    placeholder is written when the BAM has no aligned reads in ``intervals``.
    """
    out_file = "%s.grp" % os.path.splitext(dup_align_bam)[0]
    if file_exists(out_file):
        return out_file

    if not has_aligned_reads(dup_align_bam, intervals):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file

    with curdir_tmpdir() as tmp_dir:
        with file_transaction(out_file) as tx_out_file:
            params = ["-T", "BaseRecalibrator",
                      "-o", tx_out_file,
                      "-I", dup_align_bam,
                      "-R", ref_file]
            downsample_pct = _get_downsample_pct(broad_runner, dup_align_bam)
            if downsample_pct:
                params += ["--downsample_to_fraction", str(downsample_pct),
                           "--downsampling_type", "ALL_READS"]
            if platform.lower() == "solid":
                params += ["--solid_nocall_strategy", "PURGE_READ",
                           "--solid_recal_mode", "SET_Q_ZERO_BASE_N"]
            # GATK-lite does not have support for
            # insertion/deletion quality modeling
            if broad_runner.gatk_type() == "lite":
                params += ["--disable_indel_quals"]
            if dbsnp_file:
                params += ["--knownSites", dbsnp_file]
            if intervals:
                params += ["-L", intervals,
                           "--interval_set_rule", "INTERSECTION"]
            broad_runner.run_gatk(params, tmp_dir)
    return out_file
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run the MuTect paired analysis algorithm.

    With an Appistry MuTect build, SNV calls go to a ``-mutect`` intermediate
    VCF, SomaticIndelDetector produces indels, and both are combined into
    ``out_file``. Otherwise MuTect writes straight to ``out_file``.

    An existing ``out_file`` is reused. When ``region`` is not a list/tuple
    and a BAM has no aligned reads in it, an empty VCF is written.

    Returns the path to the final VCF in every case.
    """
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        if "appistry" in broad_runner.get_mutect_version():
            out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                               if "vcf" in out_file else out_file + "-mutect.vcf")
        else:
            out_file_mutect = out_file
        broad_runner, params = _mutect_call_prep(
            align_bams, items, ref_file, assoc_files, region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            # Hand back the path (not None) so a first run matches a re-run
            # that finds the empty VCF already on disk.
            return out_file
        with file_transaction(out_file_mutect) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        if "appistry" in broad_runner.get_mutect_version():
            # SomaticIndelDetector modifications
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=items[0]["sam_ref"],
                config=items[0]["config"],
                region=region)
    return out_file
def unified_genotyper(align_bam, ref_file, config, dbsnp=None, region=None, out_file=None):
    """Perform SNP genotyping on the given alignment file.

    Indexes the reference and BAM, then runs GATK's UnifiedGenotyper with a
    confidence threshold of "4.0" for low coverage depth and "30.0" otherwise.
    When the BAM has no aligned reads in ``region`` a header-only VCF is
    written instead. An existing output file is reused.

    Returns the path to the output VCF.
    """
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    broad_runner.run_fn("picard_index", align_bam)
    coverage_depth = config["algorithm"].get("coverage_depth", "high").lower()
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if coverage_depth in ["low"] else "30.0"
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bam)[0]
    if file_exists(out_file):
        return out_file

    if not has_aligned_reads(align_bam, region):
        with open(out_file, "w") as out_handle:
            out_handle.write("##fileformat=VCFv4.1\n"
                             "## No variants; no reads aligned in region\n"
                             "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")
        return out_file

    annotations = ["QualByDepth", "HaplotypeScore", "MappingQualityRankSumTest",
                   "ReadPosRankSumTest", "FisherStrand", "RMSMappingQuality",
                   "DepthOfCoverage"]
    with file_transaction(out_file) as tx_out_file:
        params = ["-T", "UnifiedGenotyper",
                  "-I", align_bam,
                  "-R", ref_file,
                  "-o", tx_out_file]
        for annotation_name in annotations:
            params += ["--annotation", annotation_name]
        params += ["--genotype_likelihoods_model", "BOTH",
                   "--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence,
                   "-l", "INFO"]
        if dbsnp:
            params += ["--dbsnp", dbsnp]
        if region:
            params += ["-L", region]
        if variant_regions:
            params += ["-L", variant_regions,
                       "--interval_set_rule", "INTERSECTION"]
        broad_runner.run_gatk(params)
    return out_file
def unified_genotyper(align_bams, ref_file, config, dbsnp=None, region=None, out_file=None):
    """Perform SNP genotyping on the given alignment files.

    Shared preparation supplies the GATK runner, base parameters and the
    output path. An existing output is reused; an empty VCF is written when
    any BAM has no aligned reads in ``region``.

    Returns the path to the output VCF.
    """
    broad_runner, params, out_file = _shared_gatk_call_prep(
        align_bams, ref_file, config, dbsnp, region, out_file)
    if file_exists(out_file):
        return out_file

    every_bam_has_reads = all(has_aligned_reads(bam_file, region)
                              for bam_file in align_bams)
    if not every_bam_has_reads:
        write_empty_vcf(out_file)
        return out_file

    with file_transaction(out_file) as tx_out_file:
        params += ["-T", "UnifiedGenotyper",
                   "-o", tx_out_file,
                   "--genotype_likelihoods_model", "BOTH"]
        broad_runner.run_gatk(params)
    return out_file
def unified_genotyper(align_bam, ref_file, config, dbsnp=None, region=None, out_file=None):
    """Perform SNP genotyping on the given alignment file.

    Shared preparation supplies the GATK runner, base parameters and the
    output path. An existing output is reused; an empty VCF is written when
    the BAM has no aligned reads in ``region``.

    Returns the path to the output VCF.
    """
    broad_runner, params, out_file = _shared_gatk_call_prep(
        align_bam, ref_file, config, dbsnp, region, out_file)
    if file_exists(out_file):
        return out_file

    if not has_aligned_reads(align_bam, region):
        write_empty_vcf(out_file)
        return out_file

    with file_transaction(out_file) as tx_out_file:
        params += ["-T", "UnifiedGenotyper",
                   "-o", tx_out_file,
                   "--genotype_likelihoods_model", "BOTH"]
        broad_runner.run_gatk(params)
    return out_file
def unified_genotyper(align_bam, ref_file, config, dbsnp=None,
                      region=None, out_file=None):
    """Run GATK's UnifiedGenotyper on one BAM file and return the VCF path.

    The reference and the BAM are indexed through the broad runner first.
    Calling and emitting confidence thresholds are "4.0" when the configured
    ``coverage_depth`` is "low" and "30.0" otherwise.  The interval passed to
    GATK is whatever ``subset_variant_regions`` returns for the configured
    ``variant_regions`` and the requested ``region``.  An existing output is
    reused; when ``has_aligned_reads`` is false a header-only VCF is written.
    """
    runner = broad.runner_from_config(config)
    runner.run_fn("picard_index_ref", ref_file)
    runner.run_fn("picard_index", align_bam)
    depth_setting = config["algorithm"].get("coverage_depth", "high").lower()
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if depth_setting in ["low"] else "30.0"
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bam)[0]
    if file_exists(out_file):
        return out_file

    if not has_aligned_reads(align_bam, region):
        # Nothing to call: emit a minimal VCF so downstream steps find a file.
        empty_vcf = ("##fileformat=VCFv4.1\n"
                     "## No variants; no reads aligned in region\n"
                     "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")
        with open(out_file, "w") as out_handle:
            out_handle.write(empty_vcf)
        return out_file

    annotations = ["QualByDepth", "HaplotypeScore",
                   "MappingQualityRankSumTest", "ReadPosRankSumTest",
                   "FisherStrand", "RMSMappingQuality", "DepthOfCoverage"]
    with file_transaction(out_file) as tx_out_file:
        gatk_args = ["-T", "UnifiedGenotyper",
                     "-I", align_bam,
                     "-R", ref_file,
                     "-o", tx_out_file]
        for annotation in annotations:
            gatk_args.extend(["--annotation", annotation])
        gatk_args.extend([
            "--genotype_likelihoods_model", "BOTH",
            "--standard_min_confidence_threshold_for_calling", confidence,
            "--standard_min_confidence_threshold_for_emitting", confidence,
            "-l", "INFO",
        ])
        if dbsnp:
            gatk_args.extend(["--dbsnp", dbsnp])
        # Combine the configured variant regions with the requested region.
        region = subset_variant_regions(variant_regions, region, tx_out_file)
        if region:
            gatk_args.extend(["-L", region,
                              "--interval_set_rule", "INTERSECTION"])
        runner.run_gatk(gatk_args)
    return out_file
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            dbsnp_file, intervals):
    """Step 1 of GATK recalibration: build the table of covariates.

    Recalibrating large whole genome BAM files is slow and extra reads stop
    helping beyond a certain point; see the 'Downsampling analysis' plots in
    the GATK documentation:

    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest

    A downsampling fraction is therefore requested from
    ``bam.get_downsample_pct`` and passed to GATK when one is returned.

    Returns the path of the ``.grp`` table.  When the BAM has no aligned
    reads in ``intervals`` a placeholder file is written instead.

    TODO: Use new GATK 2.6+ AnalyzeCovariates tool to plot recalibration results.
    """
    # 100 million reads per read group, 20x the plotted max
    target_counts = 1e8
    out_file = "%s.grp" % os.path.splitext(dup_align_bam)[0]
    if file_exists(out_file):
        return out_file

    if not has_aligned_reads(dup_align_bam, intervals):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file

    with curdir_tmpdir() as tmp_dir:
        with file_transaction(out_file) as tx_out_file:
            gatk_args = ["-T", "BaseRecalibrator",
                         "-o", tx_out_file,
                         "-I", dup_align_bam,
                         "-R", ref_file,
                         ]
            fraction = bam.get_downsample_pct(broad_runner, dup_align_bam,
                                              target_counts)
            if fraction:
                gatk_args.extend(["--downsample_to_fraction", str(fraction),
                                  "--downsampling_type", "ALL_READS"])
            if platform.lower() == "solid":
                gatk_args.extend(["--solid_nocall_strategy", "PURGE_READ",
                                  "--solid_recal_mode", "SET_Q_ZERO_BASE_N"])
            # GATK-lite does not have support for
            # insertion/deletion quality modeling
            if broad_runner.gatk_type() == "lite":
                gatk_args.extend(["--disable_indel_quals"])
            if dbsnp_file:
                gatk_args.extend(["--knownSites", dbsnp_file])
            if intervals:
                gatk_args.extend(["-L", intervals,
                                  "--interval_set_rule", "INTERSECTION"])
            broad_runner.run_gatk(gatk_args, tmp_dir)
    return out_file
def _gatk_count_covariates(broad_runner, dup_align_bam, ref_file, platform,
                           snp_file, intervals):
    """Step 1 of GATK recalibration -- counting covariates.

    Runs GATK's CountCovariates walker over ``dup_align_bam`` and returns the
    path of the resulting ``.recal`` file.  An existing file is reused; when
    the BAM has no aligned reads a placeholder file is written instead.
    """
    out_file = "%s.recal" % os.path.splitext(dup_align_bam)[0]
    if file_exists(out_file):
        return out_file

    if not has_aligned_reads(dup_align_bam):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file

    covariates = ["ReadGroupCovariate", "QualityScoreCovariate",
                  "CycleCovariate", "DinucCovariate"]
    with curdir_tmpdir() as tmp_dir:
        with file_transaction(out_file) as tx_out_file:
            gatk_args = ["-T", "CountCovariates"]
            for covariate in covariates:
                gatk_args.extend(["-cov", covariate])
            gatk_args.extend([
                "-recalFile", tx_out_file,
                "-I", dup_align_bam,
                "-R", ref_file,
                "-l", "INFO",
                "-U",
                "-OQ",
                "--default_platform", platform,
            ])
            if snp_file:
                gatk_args.extend(["--knownSites", snp_file])
            if intervals:
                gatk_args.extend(["-L", intervals,
                                  "--interval_set_rule", "INTERSECTION"])
            broad_runner.run_gatk(gatk_args, tmp_dir)
    return out_file
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    Returns the path of the output VCF on every code path: an existing file
    is reused; an empty VCF is written when ``region`` is not a list/tuple
    and some BAM has no aligned reads in it; otherwise MuTect is run.

    A ``CalledProcessError`` whose parsed Java exception is listed in
    ``_PASS_EXCEPTIONS`` is tolerated and yields an empty VCF; any other
    failure is re-raised unchanged.
    """
    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(
            align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    broad_runner, params = \
        _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                          region, out_file)
    if (not isinstance(region, (list, tuple)) and
          not all(has_aligned_reads(x, region) for x in align_bams)):
        vcfutils.write_empty_vcf(out_file)
        # Previously a bare ``return`` (None); callers get the path now.
        return out_file
    with file_transaction(out_file) as tx_out_file:
        # Rationale: MuTect writes another table to stdout,
        # which we don't need
        params += ["--vcf", tx_out_file, "-o", os.devnull]
        try:
            broad_runner.run_mutect(params)
        except CalledProcessError as error:
            java_exception = _parse_gatk_java_error_string(error.cmd)
            # HACK: Currently MuTect bails out on certain small BAM files
            # Until the issue is fixed by Broad, this specific exception
            # will be ignored. All the other exceptions will be raised
            # correctly.
            if java_exception not in _PASS_EXCEPTIONS:
                raise
            # The empty VCF goes to the transactional path so that it is
            # handled like regular MuTect output when the block exits.
            vcfutils.write_empty_vcf(tx_out_file)
    return out_file
def unified_genotyper(align_bams, items, ref_file, assoc_files, region=None,
                      out_file=None):
    """Call variants over the given BAM files with GATK UnifiedGenotyper.

    The output defaults to ``<first BAM without extension>-variants.vcf`` and
    is reused when it already exists.  When ``region`` is not a list/tuple
    and some BAM fails the ``has_aligned_reads`` check, an empty VCF is
    written instead of running GATK.  Returns the output VCF path.
    """
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    runner, gatk_args = _shared_gatk_call_prep(
        align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"],
        region, out_file)
    region_is_sequence = isinstance(region, (list, tuple))
    if not region_is_sequence and any(
            not has_aligned_reads(bam_file, region) for bam_file in align_bams):
        vcfutils.write_empty_vcf(out_file)
        return out_file
    with file_transaction(out_file) as tx_out_file:
        gatk_args.extend(["-T", "UnifiedGenotyper",
                          "-o", tx_out_file,
                          "--genotype_likelihoods_model", "BOTH"])
        runner.run_gatk(gatk_args)
    return out_file
def haplotype_caller(align_bam, ref_file, config, dbsnp=None,
                     region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source version of GATK; the check is an
    assertion on ``broad_runner.has_gatk_full()`` and runs before the output
    file is looked up.  An existing output is reused, and an empty VCF is
    written when the BAM has no aligned reads in ``region``.  Returns the
    output VCF path.
    """
    runner, gatk_args, out_file = _shared_gatk_call_prep(
        align_bam, ref_file, config, dbsnp, region, out_file)
    assert runner.has_gatk_full(), \
        "Require full version of GATK 2.0 for haplotype based calling"
    if file_exists(out_file):
        return out_file
    if has_aligned_reads(align_bam, region):
        with file_transaction(out_file) as tx_out_file:
            gatk_args.extend(["-T", "HaplotypeCaller",
                              "-o", tx_out_file])
            runner.run_gatk(gatk_args)
    else:
        write_empty_vcf(out_file)
    return out_file
def haplotype_caller(align_bam, ref_file, config, dbsnp=None,
                     region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source version of GATK.

    ``align_bam`` may be a single BAM path or a list/tuple of BAM paths.  The
    parameter keeps its historical singular name, but the body works on a
    list; previously it referenced an undefined ``align_bams`` name and
    raised NameError on every call.

    An existing output is reused, and an empty VCF is written when any BAM
    has no aligned reads in ``region``.  Returns the output VCF path.
    """
    # Normalise to a list so single-file and multi-file callers both work.
    if isinstance(align_bam, (list, tuple)):
        align_bams = list(align_bam)
    else:
        align_bams = [align_bam]
    broad_runner, params, out_file = \
        _shared_gatk_call_prep(align_bams, ref_file, config, dbsnp,
                               region, out_file)
    assert broad_runner.has_gatk_full(), \
        "Require full version of GATK 2.0 for haplotype based calling"
    if not file_exists(out_file):
        if not all(has_aligned_reads(x, region) for x in align_bams):
            write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                params += ["-T", "HaplotypeCaller",
                           "-o", tx_out_file]
                broad_runner.run_gatk(params)
    return out_file
def _call_variants_samtools(align_bams, ref_file, items, target_regions, out_file):
    """Call variants with samtools in target_regions.

    Works around a GATK VCF compatibility issue in samtools 0.20 by removing
    extra Version information from VCF header lines.

    The bcftools sub-command depends on the installed version: ``call -v -c``
    for bcftools newer than 0.1.19 and ``view -v -c -g`` otherwise.  Output
    is piped through ``bgzip`` when ``out_file`` ends in "gz".

    Raises:
        ValueError: bcftools is newer than 0.1.19 while samtools is 0.1.19
            or older.
    """
    config = items[0]["config"]

    max_read_depth = "1000"
    mpileup = prep_mpileup(align_bams, ref_file, max_read_depth, config,
                           target_regions=target_regions)
    bcftools = config_utils.get_program("bcftools", config)
    bcftools_version = programs.get_version("bcftools", config=config)
    samtools_version = programs.get_version("samtools", config=config)
    if LooseVersion(bcftools_version) > LooseVersion("0.1.19"):
        if LooseVersion(samtools_version) <= LooseVersion("0.1.19"):
            raise ValueError("samtools calling not supported with 0.1.19 samtools and 0.20 bcftools")
        bcftools_opts = "call -v -c"
    else:
        bcftools_opts = "view -v -c -g"
    compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
    # Keep the script path under its own name: binding it to ``vcfutils``
    # shadowed the vcfutils module, so write_empty_vcf below was looked up on
    # a string and raised AttributeError.
    vcfutils_pl = config_utils.get_program("vcfutils.pl", config)
    # XXX Check if we need this when supporting samtools 0.2.0 calling.
    # 0.1.9 fails on regions without reads.
    if not any(realign.has_aligned_reads(x, target_regions) for x in align_bams):
        vcfutils.write_empty_vcf(out_file, config)
    else:
        cmd = ("{mpileup} "
               "| {bcftools} {bcftools_opts} - "
               "| {vcfutils_pl} varFilter -D {max_read_depth} "
               "| sed 's/,Version=3>/>/'"
               "{compress_cmd} > {out_file}").format(
                   mpileup=mpileup, bcftools=bcftools,
                   bcftools_opts=bcftools_opts, vcfutils_pl=vcfutils_pl,
                   max_read_depth=max_read_depth, compress_cmd=compress_cmd,
                   out_file=out_file)
        logger.info(cmd)
        do.run(cmd, "Variant calling with samtools", {})
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    With an "appistry" MuTect build, SNV calls are written to a
    ``-mutect`` sibling file, SomaticIndelDetector writes a
    ``-somaticIndels`` sibling file, and the two are merged into
    ``out_file``.  With any other build MuTect writes ``out_file`` directly.

    Returns the path of the output VCF on every code path.  An existing file
    is reused; an empty VCF is written when ``region`` is not a list/tuple
    and some BAM has no aligned reads in it.
    """
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    def _tagged(tag, base=out_file):
        # Sibling file name carrying ``tag`` before the ".vcf" extension.
        # Tests for ".vcf" (not just "vcf") so that a path merely containing
        # "vcf" still gets a name distinct from the final output.
        if ".vcf" in base:
            return base.replace(".vcf", "-%s.vcf" % tag)
        return base + "-%s.vcf" % tag

    base_config = items[0]["config"]
    broad_runner = broad.runner_from_config(base_config, "mutect")
    if "appistry" in broad_runner.get_mutect_version():
        out_file_mutect = _tagged("mutect")
    else:
        out_file_mutect = out_file
    broad_runner, params = \
        _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                          region, out_file_mutect)
    if (not isinstance(region, (list, tuple)) and
          not all(has_aligned_reads(x, region) for x in align_bams)):
        vcfutils.write_empty_vcf(out_file)
        # Previously a bare ``return`` (None); callers get the path now.
        return out_file
    with file_transaction(out_file_mutect) as tx_out_file:
        # Rationale: MuTect writes another table to stdout, which we don't need
        params += ["--vcf", tx_out_file, "-o", os.devnull]
        broad_runner.run_mutect(params)
    if "appistry" in broad_runner.get_mutect_version():
        # SomaticIndelDetector modifications
        out_file_indels = _tagged("somaticIndels")
        params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                       region, out_file_indels)
        with file_transaction(out_file_indels) as tx_out_file:
            params_indels += ["-o", tx_out_file]
            broad_runner.run_mutect(params_indels)
        out_file = vcfutils.combine_variant_files(
            orig_files=[out_file_mutect, out_file_indels],
            out_file=out_file,
            ref_file=items[0]["sam_ref"],
            config=items[0]["config"],
            region=region)
    return out_file
def write_recal_bam(data, region=None, out_file=None):
    """Step 2 of GATK recalibration -- use covariates to re-write output file.

    Writes the recalibrated BAM for ``data["work_bam"]`` (or, for the special
    region "nochr", the output of ``write_nochr_reads``).  When the
    ``quality_bin`` setting is ``True``, "postrecal", or a list containing
    "postrecal", and the result has aligned reads, the BAM is replaced by its
    quality-binned version.  ``data["work_bam"]`` is updated in place and
    ``[data]`` is returned.
    """
    config = data["config"]
    work_bam = data["work_bam"]
    if out_file is None:
        out_file = "%s-gatkrecal.bam" % os.path.splitext(work_bam)[0]
    logger.info("Writing recalibrated BAM for %s to %s" % (data["name"], out_file))
    if region == "nochr":
        recal_bam = write_nochr_reads(work_bam, out_file, data["config"])
    else:
        recal_bam = _run_recal_bam(work_bam, data["prep_recal"], region,
                                   data["sam_ref"], out_file, config)

    qual_bin = config["algorithm"].get("quality_bin", None)
    bin_after_recal = (qual_bin is True
                       or qual_bin == "postrecal"
                       or (isinstance(qual_bin, list) and "postrecal" in qual_bin))
    if bin_after_recal and has_aligned_reads(recal_bam):
        binned_bam = cram.illumina_qual_bin(recal_bam, data["sam_ref"],
                                            os.path.dirname(recal_bam), config)
        # Swap the binned BAM into place, keeping the unbinned copy aside.
        unbinned_copy = recal_bam + ".binned"
        shutil.move(recal_bam, unbinned_copy)
        shutil.move(binned_bam, recal_bam)
        utils.save_diskspace(unbinned_copy,
                             "Quality binned to %s" % recal_bam, config)
    data["work_bam"] = recal_bam
    return [data]
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None,
                     out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source version of GATK (asserted via
    ``gatk_type() == "restricted"``).  The output defaults to
    ``<first BAM>-variants.vcf.gz`` and is reused when present.  An empty VCF
    is written when any BAM has no aligned reads in ``region``.  Returns the
    output VCF path.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    config = items[0]["config"]
    runner, gatk_args = _shared_gatk_call_prep(
        align_bams, items, ref_file, assoc_files["dbsnp"], region, out_file)
    assert runner.gatk_type() == "restricted", \
        "Require full version of GATK 2.4+ for haplotype calling"
    if any(not has_aligned_reads(bam_file, region) for bam_file in align_bams):
        vcfutils.write_empty_vcf(out_file, config)
        return out_file

    with file_transaction(out_file) as tx_out_file:
        gatk_args.extend(["-T", "HaplotypeCaller",
                          "-o", tx_out_file,
                          "--annotation", "ClippingRankSumTest",
                          "--annotation", "DepthPerSampleHC"])
        # Enable hardware based optimizations in GATK 3.1+
        gatk_version = LooseVersion(runner.gatk_major_version())
        if gatk_version >= LooseVersion("3.1"):
            gatk_args.extend(["--pair_hmm_implementation",
                              "VECTOR_LOGLESS_CACHING"])
        runner.new_resources("gatk-haplotype")
        runner.run_gatk(gatk_args)
    return out_file
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    MuTect output goes to an ``-orig`` file, which ``_fix_mutect_output``
    turns into the ``-mutect`` file unless that is already up to date.  The
    configured indel caller (scalpel, pindel, or SomaticIndelDetector on
    "appistry" MuTect builds) then writes a ``-somaticIndels`` file that is
    merged with the MuTect calls into ``out_file``.  When no indel caller
    applies or it is not installed, ``out_file`` is a link to the MuTect
    calls.

    Returns the path of the output VCF on every code path.  An existing file
    is reused; an empty VCF is written when ``region`` is not a list/tuple
    and some BAM has no aligned reads in it.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    def _tagged(tag, base=out_file):
        # Sibling file name carrying ``tag`` before the ".vcf" extension.
        # Tests for ".vcf" (not just "vcf") so that a path merely containing
        # "vcf" still gets a name distinct from the final output.
        if ".vcf" in base:
            return base.replace(".vcf", "-%s.vcf" % tag)
        return base + "-%s.vcf" % tag

    # Both names derive from the requested out_file, so compute them once
    # before out_file is rebound to the combined result below.
    out_file_mutect = _tagged("mutect")
    out_file_indels = _tagged("somaticIndels")

    base_config = items[0]["config"]
    # NOTE(review): this runner is replaced by the one _mutect_call_prep
    # returns; kept in case construction has side effects -- confirm.
    broad_runner = broad.runner_from_config(base_config, "mutect")
    broad_runner, params = \
        _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                          region, out_file_mutect)
    if (not isinstance(region, (list, tuple)) and
          not all(has_aligned_reads(x, region) for x in align_bams)):
        vcfutils.write_empty_vcf(out_file)
        # Previously a bare ``return`` (None); callers get the path now.
        return out_file

    out_file_orig = "%s-orig%s" % utils.splitext_plus(out_file_mutect)
    if not file_exists(out_file_orig):
        with file_transaction(config, out_file_orig) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
    is_paired = "-I:normal" in params
    if not utils.file_uptodate(out_file_mutect, out_file_orig):
        out_file_mutect = _fix_mutect_output(out_file_orig, config,
                                             out_file_mutect, is_paired)

    indelcaller = vcfutils.get_indelcaller(base_config)
    if ("scalpel" in indelcaller.lower() and region and isinstance(region, (tuple, list))
          and chromhacks.is_autosomal_or_sex(region[0])):
        # Scalpel InDels
        if scalpel.is_installed(items[0]["config"]):
            if not is_paired:
                vcfutils.check_paired_problems(items)
                scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                            region=region, out_file=out_file_indels)
            else:
                scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                            region=region, out_file=out_file_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=items[0]["sam_ref"],
                config=items[0]["config"],
                region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    elif "pindel" in indelcaller.lower():
        from bcbio.structural import pindel
        if pindel.is_installed(items[0]["config"]):
            pindel._run_tumor_pindel_caller(align_bams, items, ref_file, assoc_files,
                                            region=region, out_file=out_file_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=ref_file,
                config=items[0]["config"],
                region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    elif (("somaticindeldetector" in indelcaller.lower() or "sid" in indelcaller.lower())
          and "appistry" in broad_runner.get_mutect_version()):
        # SomaticIndelDetector InDels
        params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                       region, out_file_indels)
        with file_transaction(config, out_file_indels) as tx_out_file:
            params_indels += ["-o", tx_out_file]
            broad_runner.run_mutect(params_indels)
        out_file = vcfutils.combine_variant_files(
            orig_files=[out_file_mutect, out_file_indels],
            out_file=out_file,
            ref_file=items[0]["sam_ref"],
            config=items[0]["config"],
            region=region)
    else:
        utils.symlink_plus(out_file_mutect, out_file)
    return out_file
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            dbsnp_file, intervals, data):
    """Step 1 of GATK recalibration process, producing table of covariates.

    For GATK 4 we use local multicore spark runs:
    https://github.com/broadinstitute/gatk/issues/2345

    For GATK3, large whole genome BAM files take an excessively long time to
    recalibrate and the extra inputs don't help much beyond a certain point.
    See the 'Downsampling analysis' plots in the GATK documentation:

    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest

    This identifies large files and calculates the fraction to downsample to.

    The spark host and timeout settings help deal with runs on restricted
    systems where we encounter network and timeout errors.

    Returns the path of the ``-recal.grp`` table inside the sample's
    ``align`` work directory; a placeholder file is written when the BAM has
    no aligned reads in ``intervals``.
    """
    # 100 million reads per read group, 20x the plotted max
    target_counts = 1e8
    bam_base = utils.splitext_plus(os.path.basename(dup_align_bam))[0]
    out_file = os.path.join(dd.get_work_dir(data), "align",
                            dd.get_sample_name(data),
                            "%s-recal.grp" % bam_base)
    if utils.file_exists(out_file):
        return out_file

    if not has_aligned_reads(dup_align_bam, intervals):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file

    with file_transaction(data, out_file) as tx_out_file:
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+ or GATK4 for BQSR"
        gatk_args = ["-I", dup_align_bam]
        cores = dd.get_num_cores(data)
        tx_dir = os.path.dirname(tx_out_file)
        if gatk_type == "gatk4":
            gatk_args.extend([
                "-T", "BaseRecalibratorSpark",
                "--spark-master", "local[%s]" % cores,
                "--output", tx_out_file,
                "--reference", dd.get_ref_twobit(data),
                "--conf", "spark.driver.host=localhost",
                "--conf", "spark.network.timeout=800",
                "--conf", "spark.executor.heartbeatInterval=100",
                "--conf", "spark.local.dir=%s" % tx_dir,
            ])
            known_sites_flag = "--known-sites"
            interval_rule_flag = "--interval-set-rule"
        else:
            gatk_args.extend(["-T", "BaseRecalibrator",
                              "-o", tx_out_file,
                              "-R", ref_file])
            fraction = bam.get_downsample_pct(dup_align_bam, target_counts, data)
            if fraction:
                gatk_args.extend(["--downsample_to_fraction", str(fraction),
                                  "--downsampling_type", "ALL_READS"])
            if platform.lower() == "solid":
                gatk_args.extend(["--solid_nocall_strategy", "PURGE_READ",
                                  "--solid_recal_mode", "SET_Q_ZERO_BASE_N"])
            known_sites_flag = "--knownSites"
            interval_rule_flag = "--interval_set_rule"
        # Same options for both GATK generations; only the flag spelling differs.
        if dbsnp_file:
            gatk_args.extend([known_sites_flag, dbsnp_file])
        if intervals:
            gatk_args.extend(["-L", intervals, interval_rule_flag, "INTERSECTION"])
        if cores > 1:
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
        else:
            memscale = None
        broad_runner.run_gatk(gatk_args, tx_dir, memscale=memscale,
                              parallel_gc=True)
    return out_file
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    MuTect output goes to an ``-orig`` file which ``_fix_mutect_output``
    turns into the ``-mutect`` file.  The configured indel caller (scalpel,
    pindel, or SomaticIndelDetector on "appistry" MuTect builds) then writes
    a ``-somaticIndels`` file that is merged with the MuTect calls into
    ``out_file``.  When no indel caller applies or it is not installed,
    ``out_file`` is a link to the MuTect calls.

    Returns the path of the output VCF on every code path.  An existing file
    is reused; an empty VCF is written when ``region`` is not a list/tuple
    and some BAM has no aligned reads in it.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    def _tagged(tag, base=out_file):
        # Sibling file name carrying ``tag`` before the ".vcf" extension.
        # Tests for ".vcf" (not just "vcf") so that a path merely containing
        # "vcf" still gets a name distinct from the final output.
        if ".vcf" in base:
            return base.replace(".vcf", "-%s.vcf" % tag)
        return base + "-%s.vcf" % tag

    # Both names derive from the requested out_file, so compute them once
    # before out_file is rebound to the combined result below.
    out_file_mutect = _tagged("mutect")
    out_file_indels = _tagged("somaticIndels")

    base_config = items[0]["config"]
    # NOTE(review): this runner is replaced by the one _mutect_call_prep
    # returns; kept in case construction has side effects -- confirm.
    broad_runner = broad.runner_from_config(base_config, "mutect")
    broad_runner, params = \
        _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                          region, out_file_mutect)
    if (not isinstance(region, (list, tuple)) and
          not all(has_aligned_reads(x, region) for x in align_bams)):
        vcfutils.write_empty_vcf(out_file)
        # Previously a bare ``return`` (None); callers get the path now.
        return out_file

    out_file_orig = "%s-orig%s" % utils.splitext_plus(out_file_mutect)
    with file_transaction(config, out_file_orig) as tx_out_file:
        # Rationale: MuTect writes another table to stdout, which we don't need
        params += ["--vcf", tx_out_file, "-o", os.devnull]
        broad_runner.run_mutect(params)
    is_paired = "-I:normal" in params
    out_file_mutect = _fix_mutect_output(out_file_orig, config,
                                         out_file_mutect, is_paired)

    indelcaller = vcfutils.get_indelcaller(base_config)
    if "scalpel" in indelcaller.lower():
        # Scalpel InDels
        if scalpel.is_installed(items[0]["config"]):
            with file_transaction(config, out_file_indels) as tx_out_file2:
                if not is_paired:
                    vcfutils.check_paired_problems(items)
                    scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=tx_out_file2)
                else:
                    scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=tx_out_file2)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=items[0]["sam_ref"],
                config=items[0]["config"],
                region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    elif "pindel" in indelcaller.lower():
        if pindel.is_installed(items[0]["config"]):
            pindel._run_tumor_pindel_caller(align_bams, items, ref_file, assoc_files,
                                            region=region, out_file=out_file_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=ref_file,
                config=items[0]["config"],
                region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    elif (("somaticindeldetector" in indelcaller.lower() or "sid" in indelcaller.lower())
          and "appistry" in broad_runner.get_mutect_version()):
        # SomaticIndelDetector InDels
        params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                       region, out_file_indels)
        with file_transaction(config, out_file_indels) as tx_out_file:
            params_indels += ["-o", tx_out_file]
            broad_runner.run_mutect(params_indels)
        out_file = vcfutils.combine_variant_files(
            orig_files=[out_file_mutect, out_file_indels],
            out_file=out_file,
            ref_file=items[0]["sam_ref"],
            config=items[0]["config"],
            region=region)
    else:
        utils.symlink_plus(out_file_mutect, out_file)
    return out_file