示例#1
0
def _variant_filtration_indel(snp_file, ref_file, vrn_files, config):
    """Filter indel variant calls using GATK best practice recommendations.

    Uses hard JEXL filters when VQSR is not enabled for this configuration;
    otherwise runs the GATK recalibration and applies it. When a previous
    hard-filter output already exists, or the recalibration run fails, the
    function sets ``coverage_interval`` to ``"regional"`` and calls itself
    again.

    Args:
        snp_file: Path of the variant file to filter.
        ref_file: Reference file handed through to the filtering helpers.
        vrn_files: Container of variant resources; must contain
            ``"train_indels"`` when VQSR is used.
        config: Configuration dict with an ``"algorithm"`` section. NOTE:
            ``config["algorithm"]["coverage_interval"]`` is overwritten in
            place with ``"regional"`` on the fallback paths.

    Returns:
        The result of ``vfilter.jexl_hard`` (hard filtering) or of
        ``_apply_variant_recal`` (VQSR).

    Raises:
        AssertionError: If VQSR is requested without ``"train_indels"``.
    """
    broad_runner = broad.runner_from_config(config)
    filter_type = "INDEL"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")

    if not config_utils.use_vqsr([config["algorithm"]]):
        return vfilter.jexl_hard(broad_runner, snp_file, ref_file, filter_type,
                                 ["QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0"])

    # also check if we've failed recal and needed to do strict filtering
    filter_file = "{base}-filter{ext}.vcf".format(
        base=os.path.splitext(snp_file)[0], ext=filter_type)
    if file_exists(filter_file):
        # NOTE(review): relies on use_vqsr() returning False once the
        # coverage interval is "regional", otherwise this recursion would
        # not terminate -- confirm against config_utils.
        config["algorithm"]["coverage_interval"] = "regional"
        return _variant_filtration_indel(snp_file, ref_file, vrn_files, config)

    assert "train_indels" in vrn_files, "Need indel training file specified"
    params, recal_file, tranches_file = _shared_variant_filtration(
        filter_type, snp_file, ref_file, vrn_files, variantcaller)
    if not file_exists(recal_file):
        with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
            params.extend(["--recal_file", tx_recal,
                           "--tranches_file", tx_tranches])
            # --numBadVariants is only passed to GATK 2.7 and later.
            if LooseVersion(broad_runner.get_gatk_version()) >= LooseVersion("2.7"):
                params.extend(["--numBadVariants", "3000"])
            try:
                broad_runner.new_resources("gatk-vqsr")
                broad_runner.run_gatk(params, log_error=False)
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit still propagate instead of being
            # silently converted into a hard-filtering fallback.
            except Exception:
                logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                config["algorithm"]["coverage_interval"] = "regional"
                return _variant_filtration_indel(snp_file, ref_file, vrn_files, config)
    return _apply_variant_recal(broad_runner, snp_file, ref_file, recal_file,
                                tranches_file, filter_type)
示例#2
0
def _variant_filtration_snp(snp_file, ref_file, vrn_files, config):
    """Filter SNP variant calls using GATK best practice recommendations.

    Applies hard JEXL filters when VQSR is not enabled for this
    configuration; otherwise runs the GATK recalibration and applies it.
    When a previous hard-filter output already exists, or the recalibration
    run fails, ``coverage_interval`` is set to ``"regional"`` and the
    function calls itself again.

    Args:
        snp_file: Path of the variant file to filter.
        ref_file: Reference file handed through to the filtering helpers.
        vrn_files: Container of variant resources; must contain
            ``"train_hapmap"`` and ``"train_1000g_omni"`` when VQSR is used.
        config: Configuration dict with an ``"algorithm"`` section. NOTE:
            ``config["algorithm"]["coverage_interval"]`` is overwritten in
            place with ``"regional"`` on the fallback paths.

    Returns:
        The result of ``vfilter.jexl_hard`` (hard filtering) or of
        ``_apply_variant_recal`` (VQSR).

    Raises:
        AssertionError: If VQSR is requested without both training files.
    """
    broad_runner = broad.runner_from_config(config)
    filter_type = "SNP"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    filters = [
        "QD < 2.0", "MQ < 40.0", "FS > 60.0", "MQRankSum < -12.5",
        "ReadPosRankSum < -8.0"
    ]
    # GATK Haplotype caller (v2.2) appears to have much larger HaplotypeScores
    # resulting in excessive filtering, so avoid this metric
    if variantcaller != "gatk-haplotype":
        filters.append("HaplotypeScore > 13.0")

    if not config_utils.use_vqsr([config["algorithm"]]):
        return vfilter.jexl_hard(broad_runner, snp_file, ref_file, filter_type,
                                 filters)

    # also check if we've failed recal and needed to do strict filtering
    filter_file = "{base}-filter{ext}.vcf".format(
        base=os.path.splitext(snp_file)[0], ext=filter_type)
    if file_exists(filter_file):
        # NOTE(review): relies on use_vqsr() returning False once the
        # coverage interval is "regional", otherwise this recursion would
        # not terminate -- confirm against config_utils.
        config["algorithm"]["coverage_interval"] = "regional"
        return _variant_filtration_snp(snp_file, ref_file, vrn_files, config)

    assert "train_hapmap" in vrn_files and "train_1000g_omni" in vrn_files, \
        "Need HapMap and 1000 genomes training files"
    params, recal_file, tranches_file = _shared_variant_filtration(
        filter_type, snp_file, ref_file, vrn_files, variantcaller)
    if not file_exists(recal_file):
        with file_transaction(recal_file,
                              tranches_file) as (tx_recal, tx_tranches):
            params.extend(
                ["--recal_file", tx_recal, "--tranches_file", tx_tranches])
            try:
                broad_runner.new_resources("gatk-vqsr")
                broad_runner.run_gatk(params, log_error=False)
            # Can fail to run if not enough values are present to train. Rerun
            # with regional filtration approach instead. Exception (not a bare
            # except) is caught so KeyboardInterrupt / SystemExit propagate.
            except Exception:
                logger.info(
                    "VQSR failed due to lack of training data. Using hard filtering."
                )
                config["algorithm"]["coverage_interval"] = "regional"
                return _variant_filtration_snp(snp_file, ref_file,
                                               vrn_files, config)
    return _apply_variant_recal(broad_runner, snp_file, ref_file,
                                recal_file, tranches_file, filter_type)
示例#3
0
def _variant_filtration_indel(snp_file, ref_file, vrn_files, config):
    """Filter indel variant calls using GATK best practice recommendations.

    Two strategies are used, chosen by ``config_utils.use_vqsr``:

    * hard filtering through ``vfilter.jexl_hard`` with fixed QD,
      ReadPosRankSum and FS cutoffs;
    * VQSR: build recalibration/tranches files with GATK, then apply them
      through ``_apply_variant_recal``.

    On the VQSR path, an existing ``<base>-filterINDEL.vcf`` file or a failed
    GATK run makes the function set ``coverage_interval`` to ``"regional"``
    and call itself again.

    Args:
        snp_file: Path of the variant file to filter.
        ref_file: Reference file handed through to the filtering helpers.
        vrn_files: Container of variant resources; ``"train_indels"`` is
            required for VQSR.
        config: Configuration dict with an ``"algorithm"`` section. It is
            mutated in place (``coverage_interval``) on the fallback paths.

    Returns:
        Whatever ``vfilter.jexl_hard`` or ``_apply_variant_recal`` returns.

    Raises:
        AssertionError: If VQSR is requested without ``"train_indels"``.
    """
    algorithm = config["algorithm"]
    broad_runner = broad.runner_from_config(config)
    filter_type = "INDEL"
    variantcaller = algorithm.get("variantcaller", "gatk")

    if not config_utils.use_vqsr([algorithm]):
        hard_filters = ["QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0"]
        return vfilter.jexl_hard(broad_runner, snp_file, ref_file,
                                 filter_type, hard_filters)

    # also check if we've failed recal and needed to do strict filtering
    base = os.path.splitext(snp_file)[0]
    filter_file = "{base}-filter{ext}.vcf".format(base=base, ext=filter_type)
    if file_exists(filter_file):
        # NOTE(review): termination of this recursion assumes use_vqsr() is
        # False for a "regional" coverage interval -- confirm.
        config["algorithm"]["coverage_interval"] = "regional"
        return _variant_filtration_indel(snp_file, ref_file, vrn_files,
                                         config)

    assert "train_indels" in vrn_files, "Need indel training file specified"
    params, recal_file, tranches_file = _shared_variant_filtration(
        filter_type, snp_file, ref_file, vrn_files, variantcaller)
    if not file_exists(recal_file):
        with file_transaction(recal_file,
                              tranches_file) as (tx_recal, tx_tranches):
            params.extend(
                ["--recal_file", tx_recal, "--tranches_file", tx_tranches])
            # --numBadVariants is only passed to GATK 2.7 and later.
            gatk_version = LooseVersion(broad_runner.get_gatk_version())
            if gatk_version >= LooseVersion("2.7"):
                params.extend(["--numBadVariants", "3000"])
            try:
                broad_runner.new_resources("gatk-vqsr")
                broad_runner.run_gatk(params, log_error=False)
            # Only ordinary errors trigger the fallback; a bare except would
            # also swallow KeyboardInterrupt and SystemExit.
            except Exception:
                logger.info(
                    "VQSR failed due to lack of training data. Using hard filtering."
                )
                config["algorithm"]["coverage_interval"] = "regional"
                return _variant_filtration_indel(snp_file, ref_file,
                                                 vrn_files, config)
    return _apply_variant_recal(broad_runner, snp_file, ref_file,
                                recal_file, tranches_file, filter_type)
示例#4
0
def _variant_filtration_snp(snp_file, ref_file, vrn_files, config):
    """Filter SNP variant calls using GATK best practice recommendations.

    Two strategies are used, chosen by ``config_utils.use_vqsr``:

    * hard filtering through ``vfilter.jexl_hard`` with fixed cutoffs (the
      HaplotypeScore cutoff is skipped for the ``gatk-haplotype`` caller);
    * VQSR: build recalibration/tranches files with GATK, then apply them
      through ``_apply_variant_recal``.

    On the VQSR path, an existing ``<base>-filterSNP.vcf`` file or a failed
    GATK run makes the function set ``coverage_interval`` to ``"regional"``
    and call itself again.

    Args:
        snp_file: Path of the variant file to filter.
        ref_file: Reference file handed through to the filtering helpers.
        vrn_files: Container of variant resources; ``"train_hapmap"`` and
            ``"train_1000g_omni"`` are required for VQSR.
        config: Configuration dict with an ``"algorithm"`` section. It is
            mutated in place (``coverage_interval``) on the fallback paths.

    Returns:
        Whatever ``vfilter.jexl_hard`` or ``_apply_variant_recal`` returns.

    Raises:
        AssertionError: If VQSR is requested without both training files.
    """
    algorithm = config["algorithm"]
    broad_runner = broad.runner_from_config(config)
    filter_type = "SNP"
    variantcaller = algorithm.get("variantcaller", "gatk")
    filters = ["QD < 2.0", "MQ < 40.0", "FS > 60.0",
               "MQRankSum < -12.5", "ReadPosRankSum < -8.0"]
    # GATK Haplotype caller (v2.2) appears to have much larger HaplotypeScores
    # resulting in excessive filtering, so avoid this metric
    if variantcaller != "gatk-haplotype":
        filters.append("HaplotypeScore > 13.0")

    if not config_utils.use_vqsr([algorithm]):
        return vfilter.jexl_hard(broad_runner, snp_file, ref_file, filter_type, filters)

    # also check if we've failed recal and needed to do strict filtering
    base = os.path.splitext(snp_file)[0]
    filter_file = "{base}-filter{ext}.vcf".format(base=base, ext=filter_type)
    if file_exists(filter_file):
        # NOTE(review): termination of this recursion assumes use_vqsr() is
        # False for a "regional" coverage interval -- confirm.
        config["algorithm"]["coverage_interval"] = "regional"
        return _variant_filtration_snp(snp_file, ref_file, vrn_files, config)

    assert "train_hapmap" in vrn_files and "train_1000g_omni" in vrn_files, \
        "Need HapMap and 1000 genomes training files"
    params, recal_file, tranches_file = _shared_variant_filtration(
        filter_type, snp_file, ref_file, vrn_files, variantcaller)
    if not file_exists(recal_file):
        with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
            params.extend(["--recal_file", tx_recal,
                           "--tranches_file", tx_tranches])
            try:
                broad_runner.new_resources("gatk-vqsr")
                broad_runner.run_gatk(params, log_error=False)
            # Can fail to run if not enough values are present to train. Rerun
            # with regional filtration approach instead. Only ordinary errors
            # trigger the fallback; a bare except would also swallow
            # KeyboardInterrupt and SystemExit.
            except Exception:
                logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                config["algorithm"]["coverage_interval"] = "regional"
                return _variant_filtration_snp(snp_file, ref_file, vrn_files, config)
    return _apply_variant_recal(broad_runner, snp_file, ref_file, recal_file,
                                tranches_file, filter_type)