示例#1
0
def gatk_filter_rnaseq(vrn_file, data):
    """Hard-filter RNA-seq variant calls with GATK VariantFiltration.

    Three filters are requested on the command line: clusters of 3 variants
    within a 35 nucleotide window, high Fisher strand values (FS > 30.0)
    and low quality by depth (QD < 2.0).

    Background on these filters:
    https://software.broadinstitute.org/gatk/guide/article?id=3891

    Reference command line using the short option names:
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf

    The output name is built from the two parts returned by
    ``utils.splitext_plus(vrn_file)`` with ``-filter`` placed between them.
    When ``file_exists`` reports that output as present, no command is run.
    The output path is returned in both cases.
    """
    base, ext = utils.splitext_plus(vrn_file)
    out_file = "%s-filter%s" % (base, ext)
    if file_exists(out_file):
        return out_file

    ref_file = dd.get_ref_file(data)
    with file_transaction(data, out_file) as tx_out_file:
        params = ["VariantFiltration", "-R", ref_file, "-V", vrn_file,
                  "--cluster-window-size", "35", "--cluster-size", "3"]
        # Each filter contributes an expression followed by its name.
        for label, expression in (("FS", "'FS > 30.0'"), ("QD", "'QD < 2.0'")):
            params.extend(["--filter-expression", expression,
                           "--filter-name", label])
        params.extend(["--output", tx_out_file])
        # Use GATK4 for filtering, tools_off is for variant calling. The
        # removal is applied to the object returned by deepish_copy.
        config = utils.deepish_copy(dd.get_config(data))
        if "gatk4" in dd.get_tools_off({"config": config}):
            config["algorithm"]["tools_off"].remove("gatk4")
        tmp_dir = os.path.dirname(tx_out_file)
        jvm_opts = broad.get_gatk_opts(config, tmp_dir)
        cmd = broad.gatk_cmd("gatk", jvm_opts, params, config)
        do.run(cmd, "Filter RNA-seq variants.")
    return out_file
示例#2
0
def gatk_filter_rnaseq(vrn_file, data):
    """Hard-filter RNA-seq variant calls with GATK VariantFiltration.

    Three filters are requested on the command line: clusters of 3 variants
    within a 35 nucleotide window, high Fisher strand values (FS > 30.0)
    and low quality by depth (QD < 2.0).

    Background on these filters:
    https://software.broadinstitute.org/gatk/guide/article?id=3891

    Reference command line using the short option names:
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf

    The output name is built from the two parts returned by
    ``utils.splitext_plus(vrn_file)`` with ``-filter`` placed between them.
    When ``file_exists`` reports that output as present, no command is run.
    The output path is returned in both cases.
    """
    base, ext = utils.splitext_plus(vrn_file)
    out_file = "%s-filter%s" % (base, ext)
    if file_exists(out_file):
        return out_file

    ref_file = dd.get_ref_file(data)
    with file_transaction(data, out_file) as tx_out_file:
        inputs = ["VariantFiltration", "-R", ref_file, "-V", vrn_file]
        clustering = ["--cluster-window-size", "35", "--cluster-size", "3"]
        strand_bias = ["--filter-expression", "'FS > 30.0'",
                       "--filter-name", "FS"]
        qual_by_depth = ["--filter-expression", "'QD < 2.0'",
                         "--filter-name", "QD"]
        destination = ["--output", tx_out_file]
        params = inputs + clustering + strand_bias + qual_by_depth + destination

        tmp_dir = os.path.dirname(tx_out_file)
        jvm_opts = broad.get_gatk_opts(dd.get_config(data), tmp_dir)
        cmd = broad.gatk_cmd("gatk", jvm_opts, params)
        do.run(cmd, "Filter RNA-seq variants.")
    return out_file
示例#3
0
def _filter_paired(tumor, normal, out_file, reference, data):
    """Filter a paired tumor/normal VCF with GATK's SomaticPindelFilter.

    The current ``out_file`` is first moved to a ``-tmp.vcf`` sibling, which
    becomes the filter input. The filtered result is then written back to
    ``out_file`` inside a file transaction.

    :param    tumor: (str) sample name for tumor
    :param    normal: (str) sample name for normal
    :param    out_file: (str) final vcf file
    :param    reference: (str) genome in fasta format
    :param    data: (dict) information from yaml file(items[0])
    :returns: (str) name of final vcf file
    """
    unfiltered_vcf = utils.splitext_plus(out_file)[0] + "-tmp.vcf"
    shutil.move(out_file, unfiltered_vcf)
    config = data["config"]
    with file_transaction(data, out_file) as tx_out_file:
        options = (
            ("-V", unfiltered_vcf),
            ("-o", tx_out_file),
            ("-TID", tumor),
            ("-NID", normal),
            ("-R", reference),
        )
        params = ["-T", "SomaticPindelFilter"]
        for flag, value in options:
            params.extend((flag, value))
        jvm_opts = broad.get_gatk_opts(config)
        cmd = broad.gatk_cmd("gatk-framework", jvm_opts, params)
        do.run(cmd, "Filter pindel variants")
    return out_file
示例#4
0
def _filter_bad_reads(in_bam, ref_file, data):
    """Use GATK read filters to drop problem reads which choke GATK and Picard.

    The input BAM is indexed, then, unless ``utils.file_exists`` reports the
    ``-gatkfilter.bam`` output as present, GATK is run with three read
    filters. ``FixMisencodedBaseQualityReads`` is used when the configured
    quality format is "illumina", and ``PrintReads`` otherwise. The output
    BAM is indexed before its path is returned.
    """
    bam.index(in_bam, data["config"])
    stem = os.path.splitext(in_bam)[0]
    out_file = "%s-gatkfilter.bam" % stem
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir, \
                file_transaction(data, out_file) as tx_out_file:
            if dd.get_quality_format(data, "").lower() == "illumina":
                tool = "FixMisencodedBaseQualityReads"
            else:
                tool = "PrintReads"
            params = [tool, "-R", ref_file, "-I", in_bam, "-O", tx_out_file]
            for read_filter in ("MatchingBasesAndQualsReadFilter",
                                "SeqIsStoredReadFilter",
                                "CigarContainsNoNOperator"):
                params.extend(["-RF", read_filter])
            jvm_opts = broad.get_gatk_opts(data["config"], tmp_dir)
            cmd = broad.gatk_cmd("gatk", jvm_opts, params)
            do.run(cmd, "Filter problem reads")
    bam.index(out_file, data["config"])
    return out_file
示例#5
0
def _filter_bad_reads(in_bam, ref_file, data):
    """Use GATK read filters to drop problem reads which choke GATK and Picard.

    Indexes ``in_bam``, writes the filtered reads to a sibling file ending in
    ``-gatkfilter.bam`` (skipped when ``utils.file_exists`` reports it as
    present), indexes that output and returns its path. The GATK tool is
    ``FixMisencodedBaseQualityReads`` when the configured quality format is
    "illumina", and ``PrintReads`` otherwise.
    """
    read_filters = ("MatchingBasesAndQualsReadFilter",
                    "SeqIsStoredReadFilter",
                    "CigarContainsNoNOperator")
    bam.index(in_bam, data["config"])
    bam_stem = os.path.splitext(in_bam)[0]
    out_file = "%s-gatkfilter.bam" % bam_stem
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir, \
                file_transaction(data, out_file) as tx_out_file:
            quality_format = dd.get_quality_format(data, "").lower()
            if quality_format == "illumina":
                tool = "FixMisencodedBaseQualityReads"
            else:
                tool = "PrintReads"
            params = [tool, "-R", ref_file, "-I", in_bam, "-O", tx_out_file]
            # Expand every read filter into its own "-RF <name>" pair.
            params += [arg for name in read_filters for arg in ("-RF", name)]
            jvm_opts = broad.get_gatk_opts(data["config"], tmp_dir)
            cmd = broad.gatk_cmd("gatk", jvm_opts, params)
            do.run(cmd, "Filter problem reads")
    bam.index(out_file, data["config"])
    return out_file