def gatk_filter_rnaseq(vrn_file, data):
    """Hard-filter RNA-seq variant calls with GATK VariantFiltration.

    Applies the filters described in the GATK RNA-seq guide: clusters of
    variants (cluster size 3 within a 35 nucleotide window), high Fisher
    strand values (FS > 30.0) and low quality by depth (QD < 2.0).

    https://software.broadinstitute.org/gatk/guide/article?id=3891

    Equivalent GATK3 style command line:

        java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta
          -V input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
          -filterName QD -filter "QD < 2.0" -o output.vcf

    The command is only run when the output file does not exist yet.
    Returns the output path: the two parts from ``utils.splitext_plus``
    joined with "-filter" between them.
    """
    base, ext = utils.splitext_plus(vrn_file)
    out_file = "%s-filter%s" % (base, ext)
    if file_exists(out_file):
        return out_file

    ref_file = dd.get_ref_file(data)
    with file_transaction(data, out_file) as tx_out_file:
        cluster_args = ["--cluster-window-size", "35", "--cluster-size", "3"]
        hard_filter_args = []
        for filter_name, expression in (("FS", "'FS > 30.0'"), ("QD", "'QD < 2.0'")):
            hard_filter_args += ["--filter-expression", expression,
                                 "--filter-name", filter_name]
        params = (["VariantFiltration", "-R", ref_file, "-V", vrn_file]
                  + cluster_args
                  + hard_filter_args
                  + ["--output", tx_out_file])

        # Use GATK4 for filtering, tools_off is for variant calling.
        # The edit is made on the object returned by utils.deepish_copy
        # (presumably a copy, so the sample's own config is left alone).
        config = utils.deepish_copy(dd.get_config(data))
        if "gatk4" in dd.get_tools_off({"config": config}):
            config["algorithm"]["tools_off"].remove("gatk4")

        tmp_dir = os.path.dirname(tx_out_file)
        jvm_opts = broad.get_gatk_opts(config, tmp_dir)
        cmd = broad.gatk_cmd("gatk", jvm_opts, params, config)
        do.run(cmd, "Filter RNA-seq variants.")
    return out_file
def gatk_filter_rnaseq(vrn_file, data):
    """Run GATK VariantFiltration with the RNA-seq hard filters.

    Incorporates the filters listed in the GATK guide: clusters of variants
    within a 35 nucleotide window, high Fisher strand values and low
    quality by depth.

    https://software.broadinstitute.org/gatk/guide/article?id=3891

    Reference command line from the guide:

        java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta
          -V input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
          -filterName QD -filter "QD < 2.0" -o output.vcf

    Nothing is run when the output file already exists; the output path is
    returned either way.
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if file_exists(out_file):
        return out_file

    ref_file = dd.get_ref_file(data)
    # (filter name, filter expression) pairs handed to VariantFiltration.
    hard_filters = [("FS", "'FS > 30.0'"), ("QD", "'QD < 2.0'")]
    with file_transaction(data, out_file) as tx_out_file:
        params = ["VariantFiltration", "-R", ref_file, "-V", vrn_file]
        params.extend(["--cluster-window-size", "35"])
        params.extend(["--cluster-size", "3"])
        for label, expression in hard_filters:
            params.extend(["--filter-expression", expression, "--filter-name", label])
        params.extend(["--output", tx_out_file])

        config = dd.get_config(data)
        work_dir = os.path.dirname(tx_out_file)
        jvm_opts = broad.get_gatk_opts(config, work_dir)
        gatk_cl = broad.gatk_cmd("gatk", jvm_opts, params)
        do.run(gatk_cl, "Filter RNA-seq variants.")
    return out_file
def _filter_paired(tumor, normal, out_file, reference, data):
    """Filter a paired VCF file with the GATK SomaticPindelFilter walker.

    The existing ``out_file`` is first moved aside to a "-tmp.vcf" file,
    which then serves as input; the filtered result is written back to
    ``out_file``.

    :param tumor: (str) sample name for tumor
    :param normal: (str) sample name for normal
    :param out_file: (str) final vcf file
    :param reference: (str) genome in fasta format
    :param data: (dict) information from yaml file (items[0])
    :returns: (str) name of final vcf file
    """
    base_name = utils.splitext_plus(out_file)[0]
    in_file = base_name + "-tmp.vcf"
    # Move the unfiltered calls aside so the final name can be reused.
    shutil.move(out_file, in_file)
    config = data["config"]
    with file_transaction(data, out_file) as tx_out_file:
        io_args = ["-V", in_file, "-o", tx_out_file]
        sample_args = ["-TID", tumor, "-NID", normal]
        params = ["-T", "SomaticPindelFilter"] + io_args + sample_args + ["-R", reference]
        jvm_opts = broad.get_gatk_opts(config)
        cmd = broad.gatk_cmd("gatk-framework", jvm_opts, params)
        do.run(cmd, "Filter pindel variants")
    return out_file
def _filter_bad_reads(in_bam, ref_file, data):
    """Use GATK read filters to remove problem reads which choke GATK and Picard.

    Indexes the input BAM, writes a "-gatkfilter.bam" file next to it when
    that file does not exist yet, indexes the output and returns its path.
    """
    bam.index(in_bam, data["config"])
    bam_base = os.path.splitext(in_bam)[0]
    out_file = "%s-gatkfilter.bam" % bam_base
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir, file_transaction(data, out_file) as tx_out_file:
            # Inputs flagged with the "illumina" quality format go through
            # FixMisencodedBaseQualityReads; everything else uses PrintReads.
            if dd.get_quality_format(data, "").lower() == "illumina":
                tool = "FixMisencodedBaseQualityReads"
            else:
                tool = "PrintReads"
            params = [tool, "-R", ref_file, "-I", in_bam, "-O", tx_out_file]
            for read_filter in ("MatchingBasesAndQualsReadFilter",
                                "SeqIsStoredReadFilter",
                                "CigarContainsNoNOperator"):
                params += ["-RF", read_filter]
            jvm_opts = broad.get_gatk_opts(data["config"], tmp_dir)
            cmd = broad.gatk_cmd("gatk", jvm_opts, params)
            do.run(cmd, "Filter problem reads")
    bam.index(out_file, data["config"])
    return out_file
def _filter_bad_reads(in_bam, ref_file, data):
    """Drop reads that are known to choke GATK and Picard, using GATK filters.

    The input BAM is indexed first. The filtered BAM is named after the
    input with a "-gatkfilter.bam" suffix and is only generated when it is
    not already present; it is indexed before its path is returned.
    """
    bam.index(in_bam, data["config"])
    out_file = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    already_filtered = utils.file_exists(out_file)
    if not already_filtered:
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_file) as tx_out_file:
                is_illumina = dd.get_quality_format(data, "").lower() == "illumina"
                gatk_tool = "FixMisencodedBaseQualityReads" if is_illumina else "PrintReads"
                read_filters = [
                    "MatchingBasesAndQualsReadFilter",
                    "SeqIsStoredReadFilter",
                    "CigarContainsNoNOperator",
                ]
                # Each read filter is passed as its own "-RF <name>" pair.
                filter_args = [arg for name in read_filters for arg in ("-RF", name)]
                params = [gatk_tool,
                          "-R", ref_file,
                          "-I", in_bam,
                          "-O", tx_out_file] + filter_args
                jvm_opts = broad.get_gatk_opts(data["config"], tmp_dir)
                do.run(broad.gatk_cmd("gatk", jvm_opts, params),
                       "Filter problem reads")
    bam.index(out_file, data["config"])
    return out_file