def gatk_snp_cutoff(in_file, data):
    """Soft-filter GATK SNP calls with cutoff expressions based on best-practice recommendations.

    The mapping quality (MQ) cutoff is more lenient than the GATK default. The
    recommended filter (MQ < 40) is too stringent, so it is adjusted to 30:
    http://imgur.com/a/oHRVB

    QD and FS are not calculated when generating gVCF output:
    https://github.com/broadgsa/gatk-protected/blob/e91472ddc7d58ace52db0cab4d70a072a918d64c/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java#L300

    The extra command removes escaped quotes in the VCF output which pyVCF
    fails on.

    The GATK best practice recommended SOR filter (SOR > 3.0) is not used, as it
    has a negative impact on sensitivity relative to precision:
    https://github.com/bcbio/bcbio_validations/tree/master/gatk4#na12878-hg38

    Args:
        in_file: VCF file to filter.
        data: Sample dictionary; the variant caller is read from
            ``config -> algorithm -> variantcaller``.

    Returns:
        Whatever ``cutoff_w_expression`` returns for the assembled SNP expression.
    """
    haplotype_callers = ["gatk-haplotype", "haplotyper"]
    sentieon_callers = ["haplotyper"]
    caller = utils.get_in(data, ("config", "algorithm", "variantcaller"))
    expressions = ["MQRankSum < -12.5", "ReadPosRankSum < -8.0"]
    # GATK Haplotype caller (v2.2) appears to have much larger HaplotypeScores
    # resulting in excessive filtering, so this metric is skipped for it.
    if caller not in haplotype_callers:
        expressions.append("HaplotypeScore > 13.0")
    # QD/FS and the general metrics apply unless this is a raw GATK
    # HaplotypeCaller or Sentieon gVCF.
    if not vcfutils.is_gvcf_file(in_file) or caller not in haplotype_callers:
        expressions.extend(["QD < 2.0", "FS > 60.0"])
        expressions.extend(_gatk_general())
    # The MQ cutoff applies unless this is a raw Sentieon gVCF.
    if not vcfutils.is_gvcf_file(in_file) or caller not in sentieon_callers:
        expressions.append("MQ < 30.0")
    snp_expression = 'TYPE="snp" && (%s)' % " || ".join(expressions)
    return cutoff_w_expression(in_file, snp_expression, data, "GATKCutoffSNP", "SNP",
                               extra_cmd=r"""| sed 's/\\"//g'""")
def get_analysis_intervals(data, vrn_file, base_dir):
    """Retrieve analysis regions for the current variant calling pipeline.

    Sources are tried in priority order and the first usable one is returned:
    callable regions derived from a gVCF ``vrn_file``, ``ensemble_bed``, the
    sample callable file, callable regions computed from ``align_bam`` /
    ``work_bam`` / ``work_bam_callable``, and finally the configured
    ``callable_regions`` or ``variant_regions``.

    Args:
        data: Sample dictionary.
        vrn_file: Variant file; only consulted when it is a gVCF.
        base_dir: Passed through to ``_callable_from_gvcf``.

    Returns:
        The selected regions, or None when no source is available.
    """
    from bcbio.bam import callable

    if vrn_file and vcfutils.is_gvcf_file(vrn_file):
        gvcf_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if gvcf_bed:
            return gvcf_bed

    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    if dd.get_sample_callable(data):
        return dd.get_sample_callable(data)

    # Alignment BAMs are preferred over the plain work BAM.
    for bam_key in ("align_bam", "work_bam"):
        if data.get(bam_key):
            return callable.sample_callable_bed(data[bam_key], dd.get_ref_file(data), data)[0]

    if data.get("work_bam_callable"):
        # Work on a copy so the caller's dictionary is not modified when the
        # callable BAM is moved into the "work_bam" slot.
        data = utils.deepish_copy(data)
        data["work_bam"] = data.pop("work_bam_callable")
        return callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)[0]

    # Fall back to regions specified in the algorithm configuration.
    for region_key in ("callable_regions", "variant_regions"):
        if tz.get_in(["config", "algorithm", region_key], data):
            return tz.get_in(["config", "algorithm", region_key], data)
    return None
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.

    When ``config_utils.use_vqsr`` reports that VQSR applies, the calls are
    split into SNP and indel files, each is passed through
    ``_variant_filtration`` with its matching cutoff function, and the two
    results are combined. Otherwise the SNP and indel cutoff filters are
    applied one after the other to the input file.

    Args:
        call_file: VCF file with variant calls to filter.
        ref_file: Reference genome file.
        vrn_files: Passed through to ``_variant_filtration``.
        data: Sample dictionary with a ``config`` entry.

    Returns:
        Path to the filtered variant file.

    Raises:
        ValueError: If VQSR is selected but ``call_file`` is a gVCF.
    """
    # One algorithm entry per input variant file (a single entry when
    # "vrn_files" is absent).
    # NOTE(review): use_vqsr is called without call_file here although the
    # visible use_vqsr accepts one to skip VQSR for gVCF inputs -- confirm
    # whether it should be passed.
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if not config_utils.use_vqsr(algs):
        # Cutoff-based path: the indel filter runs on the SNP-filtered output.
        snp_filtered = vfilter.gatk_snp_cutoff(call_file, data)
        return vfilter.gatk_indel_cutoff(snp_filtered, data)

    if vcfutils.is_gvcf_file(call_file):
        raise ValueError(
            "Cannot force gVCF output with joint calling using tools_on: [gvcf] and use VQSR. "
            "Try using cutoff-based soft filtering with tools_off: [vqsr]")

    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
    split_inputs = ((snp_file, "SNP", vfilter.gatk_snp_cutoff),
                    (indel_file, "INDEL", vfilter.gatk_indel_cutoff))
    orig_files = [_variant_filtration(split_file, ref_file, vrn_files, data, vartype, cutoff_fn)
                  for split_file, vartype, cutoff_fn in split_inputs]
    out_file = "%scombined.vcf.gz" % os.path.commonprefix(orig_files)
    return vcfutils.combine_variant_files(orig_files, out_file, ref_file, data["config"])
def gatk_snp_cutoff(in_file, data):
    """Soft-filter GATK SNP calls with cutoff expressions based on best-practice recommendations.

    The mapping quality (MQ) cutoff is more lenient than the GATK default. The
    recommended filter (MQ < 40) is too stringent, so it is adjusted to 30:
    http://imgur.com/a/oHRVB

    QD and FS are not calculated when generating gVCF output:
    https://github.com/broadgsa/gatk-protected/blob/e91472ddc7d58ace52db0cab4d70a072a918d64c/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java#L300

    The extra command removes escaped quotes in the VCF output which pyVCF
    fails on.

    The GATK best practice recommended SOR filter (SOR > 3.0) is not used, as it
    has a negative impact on sensitivity relative to precision:
    https://github.com/bcbio/bcbio_validations/tree/master/gatk4#na12878-hg38

    Args:
        in_file: VCF file to filter.
        data: Sample dictionary; the variant caller is read from
            ``config -> algorithm -> variantcaller``.

    Returns:
        Whatever ``cutoff_w_expression`` returns for the assembled SNP expression.
    """
    caller = utils.get_in(data, ("config", "algorithm", "variantcaller"))
    from_haplotype_caller = caller in ["gatk-haplotype", "haplotyper"]
    expressions = ["MQRankSum < -12.5", "ReadPosRankSum < -8.0"]
    # GATK Haplotype caller (v2.2) appears to have much larger HaplotypeScores
    # resulting in excessive filtering, so this metric is skipped for it.
    if not from_haplotype_caller:
        expressions.append("HaplotypeScore > 13.0")
    # QD/FS and the general metrics apply unless this is a raw GATK
    # HaplotypeCaller or Sentieon gVCF.
    if not (vcfutils.is_gvcf_file(in_file) and from_haplotype_caller):
        expressions.extend(["QD < 2.0", "FS > 60.0"])
        expressions.extend(_gatk_general())
    # The mapping quality cutoff is always applied.
    expressions.append("MQ < 30.0")
    snp_expression = 'TYPE="snp" && (%s)' % " || ".join(expressions)
    return cutoff_w_expression(in_file, snp_expression, data, "GATKCutoffSNP", "SNP",
                               extra_cmd=r"""| sed 's/\\"//g'""")
def use_vqsr(algs, call_file=None):
    """Decide whether processing uses GATK's Variant Quality Score Recalibration.

    Each algorithm configuration contributes a count per VQSR-capable caller
    (``gatk`` or ``gatk-haplotype``) unless it has no variant caller or lists
    ``vqsr`` in ``tools_off``. A configuration with ``vqsr`` in ``tools_on``
    always counts and is treated as ``genome`` coverage. Otherwise it counts
    with its ``coverage_interval`` (default ``exome``), except when
    ``call_file`` is a gVCF, in which case it is skipped.

    Args:
        algs: Iterable of algorithm configuration dictionaries.
        call_file: Optional variant call file; when given and it is a gVCF,
            VQSR is not attempted unless explicitly turned on.

    Returns:
        True when at least one caller was counted and either a ``genome``
        coverage interval was seen or some caller was counted at least 50
        times; False otherwise.
    """
    # `basestring` exists only on Python 2; fall back to `str` so the string
    # check does not raise NameError on Python 3.
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str
    vqsr_callers = {"gatk", "gatk-haplotype"}
    vqsr_sample_thresh = 50
    vqsr_supported = collections.defaultdict(int)
    coverage_intervals = set()
    for alg in algs:
        callers = alg.get("variantcaller")
        if isinstance(callers, string_types):
            callers = [callers]
        if not callers:  # no variant calling, no VQSR
            continue
        if "vqsr" in (alg.get("tools_off") or []):  # VQSR turned off
            continue
        for c in callers:
            if c not in vqsr_callers:
                continue
            if "vqsr" in (alg.get("tools_on") or []):  # VQSR turned on
                vqsr_supported[c] += 1
                coverage_intervals.add("genome")
                continue
            if call_file:
                # Imported only where needed: callers that pass no call_file
                # never touch vcfutils.
                from bcbio.variation import vcfutils
                if vcfutils.is_gvcf_file(call_file):
                    # Do not try VQSR for gVCF inputs
                    continue
            # A key explicitly set to None falls back to "exome", matching the
            # `or []` handling of tools_on/tools_off above.
            coverage_intervals.add((alg.get("coverage_interval") or "exome").lower())
            vqsr_supported[c] += 1
    if not vqsr_supported:
        return False
    num_samples = max(vqsr_supported.values())
    return "genome" in coverage_intervals or num_samples >= vqsr_sample_thresh
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.

    Variants with missing alts are removed first when ``includes_missingalt``
    reports them. Filtering then takes one of three routes:

    - ``gatkcnn`` in the sample's ``tools_on``: delegate to ``_cnn_filter``.
    - ``config_utils.use_vqsr`` is true: split into SNP and indel files, pass
      each through ``_variant_filtration`` with its matching cutoff function,
      and combine the results.
    - otherwise: apply the SNP and indel cutoff filters one after the other.

    Args:
        call_file: VCF file with variant calls to filter.
        ref_file: Reference genome file.
        vrn_files: Passed through to ``_cnn_filter`` / ``_variant_filtration``.
        data: Sample dictionary with a ``config`` entry.

    Returns:
        Path to the filtered variant file.

    Raises:
        ValueError: If VQSR is selected but ``call_file`` is a gVCF.
    """
    # One algorithm entry per input variant file (a single entry when
    # "vrn_files" is absent).
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if includes_missingalt(data):
        logger.info("Removing variants with missing alts from %s." % call_file)
        call_file = gatk_remove_missingalt(call_file, data)

    if "gatkcnn" in dd.get_tools_on(data):
        return _cnn_filter(call_file, vrn_files, data)

    if not config_utils.use_vqsr(algs, call_file):
        # Cutoff-based path: the indel filter runs on the SNP-filtered output.
        snp_filtered = vfilter.gatk_snp_cutoff(call_file, data)
        return vfilter.gatk_indel_cutoff(snp_filtered, data)

    if vcfutils.is_gvcf_file(call_file):
        raise ValueError("Cannot force gVCF output with joint calling using tools_on: [gvcf] and use VQSR. "
                         "Try using cutoff-based soft filtering with tools_off: [vqsr]")

    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
    split_inputs = ((snp_file, "SNP", vfilter.gatk_snp_cutoff),
                    (indel_file, "INDEL", vfilter.gatk_indel_cutoff))
    orig_files = [_variant_filtration(split_file, ref_file, vrn_files, data, vartype, cutoff_fn)
                  for split_file, vartype, cutoff_fn in split_inputs]
    out_file = "%scombined.vcf.gz" % os.path.commonprefix(orig_files)
    return vcfutils.combine_variant_files(orig_files, out_file, ref_file, data["config"])
def use_vqsr(algs, call_file=None):
    """Decide whether processing uses GATK's Variant Quality Score Recalibration.

    Each algorithm configuration contributes a count per VQSR-capable caller
    (``gatk`` or ``gatk-haplotype``) unless it has no variant caller or lists
    ``vqsr`` in ``tools_off``. A configuration with ``vqsr`` in ``tools_on``
    always counts and is treated as ``genome`` coverage. Otherwise it counts
    with its ``coverage_interval`` (default ``exome``), except when
    ``call_file`` is a gVCF, in which case it is skipped.

    Args:
        algs: Iterable of algorithm configuration dictionaries.
        call_file: Optional variant call file; when given and it is a gVCF,
            VQSR is not attempted unless explicitly turned on.

    Returns:
        True when at least one caller was counted and either a ``genome``
        coverage interval was seen or some caller was counted at least 50
        times; False otherwise.
    """
    vqsr_callers = {"gatk", "gatk-haplotype"}
    vqsr_sample_thresh = 50
    vqsr_supported = collections.defaultdict(int)
    coverage_intervals = set()
    for alg in algs:
        callers = alg.get("variantcaller")
        if isinstance(callers, six.string_types):
            callers = [callers]
        if not callers:  # no variant calling, no VQSR
            continue
        if "vqsr" in (alg.get("tools_off") or []):  # VQSR turned off
            continue
        for c in callers:
            if c not in vqsr_callers:
                continue
            if "vqsr" in (alg.get("tools_on") or []):  # VQSR turned on
                vqsr_supported[c] += 1
                coverage_intervals.add("genome")
                continue
            if call_file:
                # Imported only where needed: callers that pass no call_file
                # never touch vcfutils.
                from bcbio.variation import vcfutils
                if vcfutils.is_gvcf_file(call_file):
                    # Do not try VQSR for gVCF inputs
                    continue
            # A key explicitly set to None falls back to "exome", matching the
            # `or []` handling of tools_on/tools_off above.
            coverage_intervals.add((alg.get("coverage_interval") or "exome").lower())
            vqsr_supported[c] += 1
    if not vqsr_supported:
        return False
    num_samples = max(vqsr_supported.values())
    return "genome" in coverage_intervals or num_samples >= vqsr_sample_thresh
def get_analysis_intervals(data, vrn_file, base_dir):
    """Retrieve analysis regions for the current variant calling pipeline.

    Sources are tried in priority order and the first usable one is returned:
    callable regions derived from a gVCF ``vrn_file``, ``ensemble_bed``, the
    sample callable file, callable regions computed from ``align_bam`` /
    ``work_bam`` / ``work_bam_callable``, and finally the configured
    ``callable_regions`` or ``variant_regions``.

    Args:
        data: Sample dictionary.
        vrn_file: Variant file; only consulted when it is a gVCF.
        base_dir: Passed through to ``_callable_from_gvcf``.

    Returns:
        The selected regions, or None when no source is available.
    """
    from bcbio.bam import callable

    if vrn_file and vcfutils.is_gvcf_file(vrn_file):
        gvcf_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if gvcf_bed:
            return gvcf_bed

    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    if dd.get_sample_callable(data):
        return dd.get_sample_callable(data)

    # Alignment BAMs are preferred over the plain work BAM.
    for bam_key in ("align_bam", "work_bam"):
        if data.get(bam_key):
            return callable.sample_callable_bed(data[bam_key], dd.get_ref_file(data), data)[0]

    if data.get("work_bam_callable"):
        # Work on a copy so the caller's dictionary is not modified when the
        # callable BAM is moved into the "work_bam" slot.
        data = utils.deepish_copy(data)
        data["work_bam"] = data.pop("work_bam_callable")
        return callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)[0]

    # Fall back to regions specified in the algorithm configuration.
    for region_key in ("callable_regions", "variant_regions"):
        if tz.get_in(["config", "algorithm", region_key], data):
            return tz.get_in(["config", "algorithm", region_key], data)
    return None
def gatk_indel_cutoff(in_file, data):
    """Soft-filter GATK indel calls with cutoff expressions based on best-practice recommendations.

    Args:
        in_file: VCF file to filter.
        data: Sample dictionary; the variant caller is read from
            ``config -> algorithm -> variantcaller``.

    Returns:
        Whatever ``cutoff_w_expression`` returns for the assembled indel expression.
    """
    caller = utils.get_in(data, ("config", "algorithm", "variantcaller"))
    expressions = ["ReadPosRankSum < -20.0"]
    # QD/FS/SOR and the general metrics apply unless this is a raw GATK
    # HaplotypeCaller or Sentieon gVCF.
    is_raw_gvcf = vcfutils.is_gvcf_file(in_file) and caller in ["gatk-haplotype", "haplotyper"]
    if not is_raw_gvcf:
        expressions.extend(["QD < 2.0", "FS > 200.0", "SOR > 10.0"])
        expressions.extend(_gatk_general())
    indel_expression = 'TYPE="indel" && (%s)' % " || ".join(expressions)
    return cutoff_w_expression(in_file, indel_expression, data, "GATKCutoffIndel", "INDEL",
                               extra_cmd=r"""| sed 's/\\"//g'""")
def gatk_indel_cutoff(in_file, data):
    """Soft-filter GATK indel calls with cutoff expressions based on best-practice recommendations.

    Args:
        in_file: VCF file to filter.
        data: Sample dictionary; the variant caller is read from
            ``config -> algorithm -> variantcaller``.

    Returns:
        Whatever ``cutoff_w_expression`` returns for the assembled indel expression.
    """
    haplotype_callers = ["gatk-haplotype", "haplotyper"]
    caller = utils.get_in(data, ("config", "algorithm", "variantcaller"))
    expressions = ["ReadPosRankSum < -20.0"]
    # QD/FS/SOR and the general metrics apply unless this is a raw GATK
    # HaplotypeCaller or Sentieon gVCF.
    if not vcfutils.is_gvcf_file(in_file) or caller not in haplotype_callers:
        for metric_cutoff in ("QD < 2.0", "FS > 200.0", "SOR > 10.0"):
            expressions.append(metric_cutoff)
        expressions.extend(_gatk_general())
    joined = " || ".join(expressions)
    return cutoff_w_expression(in_file, 'TYPE="indel" && (%s)' % joined, data,
                               "GATKCutoffIndel", "INDEL",
                               extra_cmd=r"""| sed 's/\\"//g'""")
def platypus(in_file, data):
    """Filter Platypus calls, replacing the Q20 filter with a depth and quality based filter.

    Platypus uses its own VCF nomenclature: TC == DP, FR == AF

    Platypus gVCF output appears to have an 0/1 index problem so the reference
    block regions are 1 base outside regions of interest. Region limiting is
    therefore skipped during filtering for gVCF inputs.

    Args:
        in_file: Platypus VCF file to filter.
        data: Sample dictionary, passed through to ``cutoff_w_expression``.

    Returns:
        Whatever ``cutoff_w_expression`` returns for the filter expression.
    """
    clauses = ["(FR[0] <= 0.5 && TC < 4 && %QUAL < 20)",
               "(TC < 13 && %QUAL < 10)",
               "(FR[0] > 0.5 && TC < 4 && %QUAL < 50)"]
    expression = " || ".join(clauses)
    if vcfutils.is_gvcf_file(in_file):
        limit_regions = None
    else:
        limit_regions = "variant_regions"
    # The sed step rewrites Platypus' own Q20 FILTER value to PASS.
    return cutoff_w_expression(in_file, expression, data, name="PlatQualDepth",
                               extra_cmd="| sed 's/\\tQ20\\t/\\tPASS\\t/'",
                               limit_regions=limit_regions)
def platypus(in_file, data):
    """Filter Platypus calls, replacing the Q20 filter with a depth and quality based filter.

    Platypus uses its own VCF nomenclature: TC == DP, FR == AF

    Platypus gVCF output appears to have an 0/1 index problem so the reference
    block regions are 1 base outside regions of interest. Region limiting is
    therefore skipped during filtering for gVCF inputs.

    Args:
        in_file: Platypus VCF file to filter.
        data: Sample dictionary, passed through to ``cutoff_w_expression``.

    Returns:
        Whatever ``cutoff_w_expression`` returns for the filter expression.
    """
    low_freq_clause = "(FR[0] <= 0.5 && TC < 4 && %QUAL < 20)"
    low_depth_clause = "(TC < 13 && %QUAL < 10)"
    high_freq_clause = "(FR[0] > 0.5 && TC < 4 && %QUAL < 50)"
    expression = " || ".join((low_freq_clause, low_depth_clause, high_freq_clause))
    limit_regions = None if vcfutils.is_gvcf_file(in_file) else "variant_regions"
    # The sed step rewrites Platypus' own Q20 FILTER value to PASS.
    return cutoff_w_expression(in_file, expression, data, name="PlatQualDepth",
                               extra_cmd="| sed 's/\\tQ20\\t/\\tPASS\\t/'",
                               limit_regions=limit_regions)
def _add_dbsnp(orig_file, dbsnp_file, data, out_file=None, post_cl=None):
    """Annotate a VCF file with dbSNP via vcfanno.

    Adds rsIDs to NON_REF positions for gVCF inputs, but requires strict allele
    matching for non-gVCF.

    Args:
        orig_file: VCF file to annotate; bgzipped and indexed before use.
        dbsnp_file: dbSNP file, resolved relative to the sample work directory.
        data: Sample dictionary.
        out_file: Output path; defaults to ``<orig base>-wdbsnp.vcf.gz``.
        post_cl: Optional command text placed in the pipeline between vcfanno
            and bgzip.

    Returns:
        The result of ``vcfutils.bgzip_and_index`` on the annotated output file.
    """
    orig_file = vcfutils.bgzip_and_index(orig_file, data["config"])
    if out_file is None:
        out_file = "%s-wdbsnp.vcf.gz" % utils.splitext_plus(orig_file)[0]
    if utils.file_uptodate(out_file, orig_file):
        return vcfutils.bgzip_and_index(out_file, data["config"])

    with file_transaction(data, out_file) as tx_out_file:
        # NOTE(review): the config is written under a fixed name in the output
        # directory; confirm concurrent annotations never share that directory.
        conf_file = os.path.join(os.path.dirname(out_file), "dbsnp.conf")
        with open(conf_file, "w") as out_handle:
            dbsnp_path = os.path.normpath(os.path.join(dd.get_work_dir(data), dbsnp_file))
            out_handle.write(_DBSNP_TEMPLATE % dbsnp_path)
        cores = dd.get_num_cores(data)
        # gVCF inputs get vcfanno's permissive overlap option.
        overlap_opt = " -permissive-overlap" if vcfutils.is_gvcf_file(orig_file) else ""
        cmd = ("vcfanno -p {cores}{opts} {conf_file} {orig_file} | {post_cl} "
               "bgzip -c > {tx_out_file}")
        do.run(cmd.format(cores=cores, opts=overlap_opt, conf_file=conf_file,
                          orig_file=orig_file, post_cl=post_cl or "",
                          tx_out_file=tx_out_file),
               "Annotate with dbSNP")
    return vcfutils.bgzip_and_index(out_file, data["config"])