def _variant_filtration_indel(snp_file, ref_file, vrn_files, config):
    """Filter indel variant calls using GATK best practice recommendations.

    Uses VQSR (variant quality score recalibration) when configured, falling
    back to JEXL hard filters when VQSR is disabled or fails for lack of
    training data. The fallback mutates ``config`` to a "regional"
    coverage interval and retries, which routes into the hard-filter branch.

    :param snp_file: input VCF of indel calls (name kept for consistency
        with the SNP variant of this function).
    :param ref_file: reference genome FASTA.
    :param vrn_files: dict of training resources; must contain "train_indels"
        for the VQSR path.
    :param config: run configuration; ``config["algorithm"]`` is read and may
        be mutated on VQSR failure.
    :returns: path to the filtered VCF file.
    """
    broad_runner = broad.runner_from_config(config)
    filter_type = "INDEL"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    if not config_utils.use_vqsr([config["algorithm"]]):
        return vfilter.jexl_hard(broad_runner, snp_file, ref_file, filter_type,
                                 ["QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0"])
    else:
        # also check if we've failed recal and needed to do strict filtering
        filter_file = "{base}-filter{ext}.vcf".format(base=os.path.splitext(snp_file)[0],
                                                      ext=filter_type)
        if file_exists(filter_file):
            config["algorithm"]["coverage_interval"] = "regional"
            return _variant_filtration_indel(snp_file, ref_file, vrn_files, config)
        assert "train_indels" in vrn_files, "Need indel training file specified"
        params, recal_file, tranches_file = _shared_variant_filtration(
            filter_type, snp_file, ref_file, vrn_files, variantcaller)
        if not file_exists(recal_file):
            with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
                params.extend(["--recal_file", tx_recal,
                               "--tranches_file", tx_tranches])
                # GATK 2.7+ replaced percentBad-style options with --numBadVariants
                if LooseVersion(broad_runner.get_gatk_version()) >= LooseVersion("2.7"):
                    params.extend(["--numBadVariants", "3000"])
                try:
                    broad_runner.new_resources("gatk-vqsr")
                    broad_runner.run_gatk(params, log_error=False)
                # VQSR fails when too few variants are available to train the
                # model; fall back to regional hard filtering. Narrowed from a
                # bare except so SystemExit/KeyboardInterrupt still propagate.
                except Exception:
                    logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                    config["algorithm"]["coverage_interval"] = "regional"
                    return _variant_filtration_indel(snp_file, ref_file, vrn_files, config)
        return _apply_variant_recal(broad_runner, snp_file, ref_file,
                                    recal_file, tranches_file, filter_type)
def _variant_filtration_snp(snp_file, ref_file, vrn_files, config):
    """Filter SNP variant calls using GATK best practice recommendations.

    Uses VQSR (variant quality score recalibration) when configured, falling
    back to JEXL hard filters when VQSR is disabled or fails for lack of
    training data. The fallback mutates ``config`` to a "regional"
    coverage interval and retries, which routes into the hard-filter branch.

    :param snp_file: input VCF of SNP calls.
    :param ref_file: reference genome FASTA.
    :param vrn_files: dict of training resources; must contain "train_hapmap"
        and "train_1000g_omni" for the VQSR path.
    :param config: run configuration; ``config["algorithm"]`` is read and may
        be mutated on VQSR failure.
    :returns: path to the filtered VCF file.
    """
    broad_runner = broad.runner_from_config(config)
    filter_type = "SNP"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    filters = ["QD < 2.0", "MQ < 40.0", "FS > 60.0",
               "MQRankSum < -12.5", "ReadPosRankSum < -8.0"]
    # GATK Haplotype caller (v2.2) appears to have much larger HaplotypeScores
    # resulting in excessive filtering, so avoid this metric
    if variantcaller not in ["gatk-haplotype"]:
        filters.append("HaplotypeScore > 13.0")
    if not config_utils.use_vqsr([config["algorithm"]]):
        return vfilter.jexl_hard(broad_runner, snp_file, ref_file, filter_type, filters)
    else:
        # also check if we've failed recal and needed to do strict filtering
        filter_file = "{base}-filter{ext}.vcf".format(base=os.path.splitext(snp_file)[0],
                                                      ext=filter_type)
        if file_exists(filter_file):
            config["algorithm"]["coverage_interval"] = "regional"
            return _variant_filtration_snp(snp_file, ref_file, vrn_files, config)
        assert "train_hapmap" in vrn_files and "train_1000g_omni" in vrn_files, \
            "Need HapMap and 1000 genomes training files"
        params, recal_file, tranches_file = _shared_variant_filtration(
            filter_type, snp_file, ref_file, vrn_files, variantcaller)
        if not file_exists(recal_file):
            with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
                params.extend(["--recal_file", tx_recal,
                               "--tranches_file", tx_tranches])
                try:
                    broad_runner.new_resources("gatk-vqsr")
                    broad_runner.run_gatk(params, log_error=False)
                # Can fail to run if not enough values are present to train.
                # Rerun with regional filtration approach instead. Narrowed
                # from a bare except so SystemExit/KeyboardInterrupt propagate.
                except Exception:
                    logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                    config["algorithm"]["coverage_interval"] = "regional"
                    return _variant_filtration_snp(snp_file, ref_file, vrn_files, config)
        return _apply_variant_recal(broad_runner, snp_file, ref_file,
                                    recal_file, tranches_file, filter_type)
def _variant_filtration_indel(snp_file, ref_file, vrn_files, config):
    """Filter indel variant calls using GATK best practice recommendations.

    Uses VQSR (variant quality score recalibration) when configured, falling
    back to JEXL hard filters when VQSR is disabled or fails for lack of
    training data. The fallback mutates ``config`` to a "regional"
    coverage interval and retries, which routes into the hard-filter branch.

    :param snp_file: input VCF of indel calls (name kept for consistency
        with the SNP variant of this function).
    :param ref_file: reference genome FASTA.
    :param vrn_files: dict of training resources; must contain "train_indels"
        for the VQSR path.
    :param config: run configuration; ``config["algorithm"]`` is read and may
        be mutated on VQSR failure.
    :returns: path to the filtered VCF file.
    """
    broad_runner = broad.runner_from_config(config)
    filter_type = "INDEL"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    if not config_utils.use_vqsr([config["algorithm"]]):
        return vfilter.jexl_hard(broad_runner, snp_file, ref_file, filter_type,
                                 ["QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0"])
    else:
        # also check if we've failed recal and needed to do strict filtering
        filter_file = "{base}-filter{ext}.vcf".format(base=os.path.splitext(snp_file)[0],
                                                      ext=filter_type)
        if file_exists(filter_file):
            config["algorithm"]["coverage_interval"] = "regional"
            return _variant_filtration_indel(snp_file, ref_file, vrn_files, config)
        assert "train_indels" in vrn_files, "Need indel training file specified"
        params, recal_file, tranches_file = _shared_variant_filtration(
            filter_type, snp_file, ref_file, vrn_files, variantcaller)
        if not file_exists(recal_file):
            with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
                params.extend(["--recal_file", tx_recal,
                               "--tranches_file", tx_tranches])
                # GATK 2.7+ replaced percentBad-style options with --numBadVariants
                if LooseVersion(broad_runner.get_gatk_version()) >= LooseVersion("2.7"):
                    params.extend(["--numBadVariants", "3000"])
                try:
                    broad_runner.new_resources("gatk-vqsr")
                    broad_runner.run_gatk(params, log_error=False)
                # VQSR fails when too few variants are available to train the
                # model; fall back to regional hard filtering. Narrowed from a
                # bare except so SystemExit/KeyboardInterrupt still propagate.
                except Exception:
                    logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                    config["algorithm"]["coverage_interval"] = "regional"
                    return _variant_filtration_indel(snp_file, ref_file, vrn_files, config)
        return _apply_variant_recal(broad_runner, snp_file, ref_file,
                                    recal_file, tranches_file, filter_type)
def _variant_filtration_snp(snp_file, ref_file, vrn_files, config):
    """Filter SNP variant calls using GATK best practice recommendations.

    Uses VQSR (variant quality score recalibration) when configured, falling
    back to JEXL hard filters when VQSR is disabled or fails for lack of
    training data. The fallback mutates ``config`` to a "regional"
    coverage interval and retries, which routes into the hard-filter branch.

    :param snp_file: input VCF of SNP calls.
    :param ref_file: reference genome FASTA.
    :param vrn_files: dict of training resources; must contain "train_hapmap"
        and "train_1000g_omni" for the VQSR path.
    :param config: run configuration; ``config["algorithm"]`` is read and may
        be mutated on VQSR failure.
    :returns: path to the filtered VCF file.
    """
    broad_runner = broad.runner_from_config(config)
    filter_type = "SNP"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    filters = ["QD < 2.0", "MQ < 40.0", "FS > 60.0",
               "MQRankSum < -12.5", "ReadPosRankSum < -8.0"]
    # GATK Haplotype caller (v2.2) appears to have much larger HaplotypeScores
    # resulting in excessive filtering, so avoid this metric
    if variantcaller not in ["gatk-haplotype"]:
        filters.append("HaplotypeScore > 13.0")
    if not config_utils.use_vqsr([config["algorithm"]]):
        return vfilter.jexl_hard(broad_runner, snp_file, ref_file, filter_type, filters)
    else:
        # also check if we've failed recal and needed to do strict filtering
        filter_file = "{base}-filter{ext}.vcf".format(base=os.path.splitext(snp_file)[0],
                                                      ext=filter_type)
        if file_exists(filter_file):
            config["algorithm"]["coverage_interval"] = "regional"
            return _variant_filtration_snp(snp_file, ref_file, vrn_files, config)
        assert "train_hapmap" in vrn_files and "train_1000g_omni" in vrn_files, \
            "Need HapMap and 1000 genomes training files"
        params, recal_file, tranches_file = _shared_variant_filtration(
            filter_type, snp_file, ref_file, vrn_files, variantcaller)
        if not file_exists(recal_file):
            with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
                params.extend(["--recal_file", tx_recal,
                               "--tranches_file", tx_tranches])
                try:
                    broad_runner.new_resources("gatk-vqsr")
                    broad_runner.run_gatk(params, log_error=False)
                # Can fail to run if not enough values are present to train.
                # Rerun with regional filtration approach instead. Narrowed
                # from a bare except so SystemExit/KeyboardInterrupt propagate.
                except Exception:
                    logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                    config["algorithm"]["coverage_interval"] = "regional"
                    return _variant_filtration_snp(snp_file, ref_file, vrn_files, config)
        return _apply_variant_recal(broad_runner, snp_file, ref_file,
                                    recal_file, tranches_file, filter_type)