def _call_variants_samtools(align_bams, ref_file, items, target_regions, tx_out_file):
    """Run the samtools mpileup / bcftools call pipeline over target_regions.

    The VCF header is rewritten with sed so it advertises VCF 4.1 and drops
    4.2-only attributes, working around a GATK compatibility issue with the
    VCF 4.2 output of samtools 1.0.

    Raises ValueError when the detected samtools version is 0.1.19 or older.
    """
    config = items[0]["config"]
    pileup_cl = prep_mpileup(align_bams, ref_file, config,
                             target_regions=target_regions, want_bcf=True)
    bcftools_exe = config_utils.get_program("bcftools", config)
    version = programs.get_version("samtools", config=config)
    if version:
        if LooseVersion(version) <= LooseVersion("0.1.19"):
            raise ValueError("samtools calling not supported with pre-1.0 samtools")
    if tx_out_file.endswith(".gz"):
        compress = "| bgzip -c"
    else:
        compress = ""
    template = ("{mpileup} "
                "| {bcftools} {bcftools_opts} - "
                "| {fix_ambig_ref} | {fix_ambig_alt} "
                "| vt normalize -n -q -r {ref_file} - "
                "| sed 's/VCFv4.2/VCFv4.1/' "
                "| sed 's/,Version=3>/>/' "
                "| sed 's/,Version=\"3\">/>/' "
                "| sed 's/Number=R/Number=./' "
                "{compress_cmd} > {tx_out_file}")
    cl = template.format(mpileup=pileup_cl,
                         bcftools=bcftools_exe,
                         bcftools_opts="call -v -m",
                         fix_ambig_ref=vcfutils.fix_ambiguous_cl(),
                         fix_ambig_alt=vcfutils.fix_ambiguous_cl(5),
                         ref_file=ref_file,
                         compress_cmd=compress,
                         tx_out_file=tx_out_file)
    do.run(cl, "Variant calling with samtools", items[0])
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call SNPs and indels on the input BAMs with FreeBayes.

    Raw calls are streamed through a QUAL > 5 filter (dropping very low
    quality variants that can cause trouble downstream in GATK), the
    ambiguous-base fixer, vcfallelicprimitives and vcfstreamsort. The result
    is passed to annotate_nongatk_vcf and the annotated file is returned.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for cur_bam in align_bams:
                bam.index(cur_bam, config)
            freebayes_exe = config_utils.get_program("freebayes", config)
            vcffilter_exe = config_utils.get_program("vcffilter", config)
            primitives_exe = config_utils.get_program("vcfallelicprimitives", config)
            streamsort_exe = config_utils.get_program("vcfstreamsort", config)
            bam_args = " ".join(["-b %s" % fname for fname in align_bams])
            # Recommended options from 1000 genomes low-complexity evaluation
            # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
            caller_opts = (" ".join(_freebayes_options_from_config(items, config, out_file, region)) +
                           " --min-repeat-entropy 1 --experimental-gls")
            if out_file.endswith("gz"):
                compress = "| bgzip -c"
            else:
                compress = ""
            template = ("{freebayes} -f {ref_file} {input_bams} {opts} | "
                        "{vcffilter} -f 'QUAL > 5' -s | {fix_ambig} | {vcfallelicprimitives} | {vcfstreamsort} "
                        "{compress_cmd} > {tx_out_file}")
            cl = template.format(freebayes=freebayes_exe,
                                 ref_file=ref_file,
                                 input_bams=bam_args,
                                 opts=caller_opts,
                                 vcffilter=vcffilter_exe,
                                 fix_ambig=vcfutils.fix_ambiguous_cl(),
                                 vcfallelicprimitives=primitives_exe,
                                 vcfstreamsort=streamsort_exe,
                                 compress_cmd=compress,
                                 tx_out_file=tx_out_file)
            do.run(cl, "Genotyping with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"),
                                           ref_file, config)
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Call germline variants (whole genome or exome) with platypus.

    Platypus writes an uncompressed VCF named after the requested
    ``.vcf.gz`` target. That file is then handed to
    vcfutils.bgzip_and_index with a preparation command that fixes ambiguous
    bases, decomposes alleles and sorts. Returns out_file.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        p_out_file = out_file.replace(".vcf.gz", ".vcf")
        with file_transaction(items[0], p_out_file) as tx_out_file:
            for cur_bam in align_bams:
                bam.index(cur_bam, items[0]["config"])
            cmd = ["platypus", "callVariants"]
            cmd.append("--regions=%s" % _bed_to_platypusin(region, out_file, items))
            cmd.append("--bamFiles=%s" % ",".join(align_bams))
            cmd.append("--refFile=%s" % dd.get_ref_file(items[0]))
            cmd.append("--output=%s" % tx_out_file)
            cmd.extend(["--logFileName", "/dev/null", "--verbosity=1"])
            cmd.append("--assemble=1")
            cmd.extend(["--hapScoreThreshold", "10", "--scThreshold", "0.99",
                        "--filteredReadsFrac", "0.9"])
            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            every_sample_marks_dups = all(
                tz.get_in(["config", "algorithm", "mark_duplicates"], data, True)
                for data in items)
            if not every_sample_marks_dups:
                cmd.append("--filterDuplicates=0")
            do.run(cmd, "platypus variant calling")
        if p_out_file != out_file:
            prep_cl = "%s | vcfallelicprimitives | vcfstreamsort" % vcfutils.fix_ambiguous_cl()
            compressed = vcfutils.bgzip_and_index(p_out_file, items[0]["config"], prep_cmd=prep_cl)
            assert compressed == out_file
    return out_file
def rnaseq_vardict_variant_calling(data):
    """Call variants on an RNA-seq sample with VarDict and record the VCF on data.

    The output is <work_dir>/variation/<sample>-vardict.vcf.gz. When that
    file already exists the caller is not rerun. In both cases the returned
    data has the file registered through dd.set_vrn_file.
    """
    sample_name = dd.get_sample_name(data)
    variation_dir = os.path.join(dd.get_work_dir(data), "variation")
    safe_makedir(variation_dir)
    out_file = os.path.join(variation_dir, sample_name + "-vardict.vcf.gz")
    if not file_exists(out_file):
        vardict_cl = vardict.get_vardict_command(data)
        streamsort_exe = config_utils.get_program("vcfstreamsort", data)
        min_freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
        fix_ambig_cl = vcfutils.fix_ambiguous_cl()
        remove_dup_cl = vcfutils.remove_dup_cl()
        # Put the directory holding Rscript first on PATH for teststrandbias.R
        r_env = ("unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(Rscript_cmd()))
        reference = dd.get_ref_file(data)
        work_bam = dd.get_work_bam(data)
        regions_bed = gtf.gtf_to_bed(dd.get_gtf_file(data))
        template = ("{r_setup}{jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
                    "-N {sample} -b {bamfile} {opts} {bed_file} "
                    "| {strandbias}"
                    "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                    "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
                    "> {tx_out_file}")
        with file_transaction(out_file) as tx_out_file:
            cl = template.format(r_setup=r_env,
                                 jvm_opts=vardict._get_jvm_opts(data, tx_out_file),
                                 vardict_cmd=vardict_cl,
                                 ref_file=reference,
                                 freq=min_freq,
                                 sample=sample_name,
                                 bamfile=work_bam,
                                 opts=" -c 1 -S 2 -E 3 -g 4 ",
                                 bed_file=regions_bed,
                                 strandbias="teststrandbias.R",
                                 var2vcf="var2vcf_valid.pl",
                                 var2vcf_opts="-v 50",
                                 fix_ambig=fix_ambig_cl,
                                 remove_dup=remove_dup_cl,
                                 vcfstreamsort=streamsort_exe,
                                 compress_cmd="| bgzip -c",
                                 tx_out_file=tx_out_file)
            do.run(cl, "Calling RNA-seq variants with VarDict")
    return dd.set_vrn_file(data, out_file)
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Call germline variants (whole genome or exome) with platypus.

    Platypus streams its VCF to stdout, where it is piped through the
    ambiguous-base fixer, vcfallelicprimitives and vcfstreamsort and then
    bgzipped into the transactional output. The finished file is handed to
    vcfutils.bgzip_and_index, whose return value is returned.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for cur_bam in align_bams:
                bam.index(cur_bam, items[0]["config"])
            cmd = ["platypus", "callVariants"]
            cmd.append("--regions=%s" % _bed_to_platypusin(region, out_file, items))
            cmd.append("--bamFiles=%s" % ",".join(align_bams))
            cmd.append("--refFile=%s" % dd.get_ref_file(items[0]))
            cmd.extend(["--output=-", "--logFileName", "/dev/null", "--verbosity=1"])
            cmd.append("--assemble=1")
            # Adjust default filter thresholds to achieve similar sensitivity/specificity to other callers
            cmd.extend(["--hapScoreThreshold", "10", "--scThreshold", "0.99",
                        "--filteredReadsFrac", "0.9", "--rmsmqThreshold", "20",
                        "--qdThreshold", "0", "--abThreshold", "0.0001",
                        "--minVarFreq", "0.0"])
            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            every_sample_marks_dups = all(
                tz.get_in(["config", "algorithm", "mark_duplicates"], data, True)
                for data in items)
            if not every_sample_marks_dups:
                cmd.append("--filterDuplicates=0")
            post_process_cl = " | %s | vcfallelicprimitives | vcfstreamsort | bgzip -c > %s" % (
                vcfutils.fix_ambiguous_cl(), tx_out_file)
            full_cl = " ".join(cmd) + post_process_cl
            do.run(full_cl, "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples. When no normal BAM is
    found in the pairing, the call is delegated to _run_scalpel_caller and its
    result is returned directly.

    Otherwise scalpel-discovery runs in --somatic mode (skipped when the
    somatic database already exists in the work directory), the somatic and
    common indel VCFs are filtered with bcftools, concatenated, fixed for
    ambiguous bases and sorted into out_file. The file is finally passed to
    annotate_nongatk_vcf and the annotated file name is returned.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                # No normal sample: fall back to single sample calling
                ann_file = _run_scalpel_caller(align_bams, items, ref_file, assoc_files, region, out_file)
                return ann_file
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            # Only rerun discovery when the somatic database is not present;
            # a work directory without the database is removed first
            if not os.path.exists(db_file + ".dir"):
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # calling
                cl = ("{perl_exports} && "
                      "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
                # template fields are filled from the local variable names above
                do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(_scalpel_bed_file_opts(items, config, out_file, region, tmp_path))
            # Hard-coded switch: the else branch below is never taken while this is True
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic.indel.vcf")
                # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher
                # to swap precision for sensitivity
            else:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config, scalpel_tmp_file) as tx_indel_file:
                    cmd = ("{perl_exports} && "
                           "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                           "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                           "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()), "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(os.path.join(tmp_path, "main/common.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            # Concatenate the filtered somatic and common indels via process
            # substitution, then fix ambiguous bases and sort
            cl2 = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                   "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                   " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"), ref_file, config)
    return ann_file
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.

    Platypus output is streamed through the ambiguous-base fixers, a contig
    header fixer, vcfallelicprimitives, vcffixup and vcfstreamsort, then
    bgzipped into out_file (which must end in ``.vcf.gz``). User options from
    the "platypus" resources are appended to the command line; ``key=value``
    options are split into separate ``key`` and ``value`` arguments.

    Returns the value of vcfutils.bgzip_and_index for a freshly created file,
    or out_file unchanged when it already exists.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = ["platypus", "callVariants", "--regions=%s" % _subset_regions(region, out_file, items),
                   "--bamFiles=%s" % ",".join(align_bams),
                   "--refFile=%s" % dd.get_ref_file(items[0]), "--output=-",
                   "--logFileName", "/dev/null", "--verbosity=1"]
            resources = config_utils.get_resources("platypus", items[0]["config"])
            if resources.get("options"):
                # normalize options so we can set defaults without overwriting user specified
                for opt in resources["options"]:
                    # Options may arrive as non-strings (e.g. bare numbers);
                    # the command line is joined as text below
                    opt = str(opt)
                    if "=" in opt:
                        # Split on the first "=" only so values that contain
                        # "=" themselves are kept intact
                        key, val = opt.split("=", 1)
                        cmd.extend([key, val])
                    else:
                        cmd.append(opt)
            if any("gvcf" in dd.get_tools_on(d) for d in items):
                cmd += ["--outputRefCalls", "1", "--refCallBlockSize", "50000"]
            # Adjust default filter thresholds to achieve similar sensitivity/specificity to other callers
            # Currently not used after doing more cross validation as they increase false positives
            # which seems to be a major advantage for Platypus users.
            # tuned_opts = ["--hapScoreThreshold", "10", "--scThreshold", "0.99", "--filteredReadsFrac", "0.9",
            #               "--rmsmqThreshold", "20", "--qdThreshold", "0", "--abThreshold", "0.0001",
            #               "--minVarFreq", "0.0", "--assemble", "1"]
            # for okey, oval in utils.partition_all(2, tuned_opts):
            #     if okey not in cmd:
            #         cmd.extend([okey, oval])

            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            if any(not dd.get_mark_duplicates(data) for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = (" | %s | %s | %s | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | "
                                "vcfstreamsort | bgzip -c > %s" % (vcfutils.fix_ambiguous_cl(),
                                                                   vcfutils.fix_ambiguous_cl(5),
                                                                   vcfutils.add_contig_to_header_cl(items[0]),
                                                                   tx_out_file))
            do.run(" ".join(cmd) + post_process_cmd, "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Genotype SNPs and indels with VarScan from a samtools mpileup stream.

    Calls are written to the uncompressed ``.vcf`` form of out_file. When
    out_file ends in ``.gz`` the result is handed to
    vcfutils.bgzip_and_index afterwards.
    """
    config = items[0]["config"]
    vcf_out = out_file.replace(".vcf.gz", ".vcf")
    sample_list = _create_sample_list(align_bams, vcf_out)
    pileup_cl = samtools.prep_mpileup(align_bams, ref_file, config, "1000",
                                      target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them.
    # we use ifne from moreutils to ensure we process only on files with input, skipping otherwise
    # http://manpages.ubuntu.com/manpages/natty/man1/ifne.1.html
    zerocov_filter = r"{ ifne grep -v -P '\t0\t\t$' || true; }"
    template = ("{export} {mpileup} | {remove_zerocoverage} | "
                "ifne varscan {jvm_opts} mpileup2cns {opts} "
                "--vcf-sample-list {sample_list} --min-var-freq {min_af} --output-vcf --variants | "
                """{py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' | """
                "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x)' | "
                "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles > {out_file}")
    with tx_tmpdir(items[0]) as tmp_dir:
        java_opts = _get_jvm_opts(config, tmp_dir)
        varscan_opts = " ".join(_varscan_options_from_config(config))
        min_fraction = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
        fix_ref_cl = vcfutils.fix_ambiguous_cl()
        fix_alt_cl = vcfutils.fix_ambiguous_cl(5)
        py_exe = os.path.join(os.path.dirname(sys.executable), "py")
        path_export = utils.local_path_export()
        cl = template.format(export=path_export,
                             mpileup=pileup_cl,
                             remove_zerocoverage=zerocov_filter,
                             jvm_opts=java_opts,
                             opts=varscan_opts,
                             sample_list=sample_list,
                             min_af=min_fraction,
                             py_cl=py_exe,
                             ref_file=ref_file,
                             fix_ambig_ref=fix_ref_cl,
                             fix_ambig_alt=fix_alt_cl,
                             out_file=vcf_out)
        do.run(cl, "Varscan", None, [do.file_exists(vcf_out)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(vcf_out) == 0:
        write_empty_vcf(vcf_out)
    if out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(vcf_out, config)
def _run_vardict_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect SNPs and indels with VarDict.

    Each BAM is called separately, paired positionally with its entry in
    items. With a single BAM the calls go straight into out_file; with
    several, each sample is written to its own temporary VCF and the
    per-sample files are merged with vcfutils.merge_variant_files.

    The finished file is passed to annotate_nongatk_vcf and the annotated
    file name is returned.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            # Command pieces that do not depend on the sample are prepared
            # once instead of on every loop iteration
            vardict = config_utils.get_program("vardict", config)
            strandbias = "teststrandbias.R"
            var2vcf = "var2vcf_valid.pl"
            opts = " ".join(_vardict_options_from_config(items, config, out_file, region))
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
            coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
            # for deep targeted panels, require 50 worth of coverage
            var2vcf_opts = " -v 50 " if coverage_interval == "regional" else ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            # builtin zip works on both Python 2 and 3; itertools.izip is Python 2 only
            for bamfile, item in zip(align_bams, items):
                sample = item["name"][1]
                # template fields are filled from the local variable names above
                cmd = ("{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| {strandbias}"
                       "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {fix_ambig} | {vcfallelicprimitives} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        cmd += " > {tx_tmp_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    cmd += " > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call SNPs and indels on a tumor / normal pair with FreeBayes.

    Falls back to _run_freebayes_caller when the pairing has no normal BAM.

    Sources of options for FreeBayes:
    mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
    mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ
    speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916
    sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/sga_generate_varcall_makefile.pl#L299
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                return _run_freebayes_caller(align_bams, items, ref_file,
                                             assoc_files, region, out_file)
            freebayes_exe = config_utils.get_program("freebayes", config)
            caller_opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            user_set_min_af = "--min-alternate-fraction" in caller_opts or "-F" in caller_opts
            if not user_set_min_af:
                # add minimum reportable allele frequency
                # FreeBayes defaults to 20%, but use 10% by default for the
                # tumor case
                tumor_min_af = utils.get_in(paired.tumor_config,
                                            ("algorithm", "min_allele_fraction"), 10)
                min_af = float(tumor_min_af) / 100.0
                caller_opts += " --min-alternate-fraction %s" % min_af
            caller_opts += " --min-repeat-entropy 1 --experimental-gls"
            # Recommended settings for cancer calling
            caller_opts += (" --pooled-discrete --pooled-continuous --genotype-qualities "
                            "--report-genotype-likelihood-max --allele-balance-priors-off")
            if out_file.endswith("gz"):
                compress = "| bgzip -c"
            else:
                compress = ""
            fix_ambig_cl = vcfutils.fix_ambiguous_cl()
            py_exe = os.path.join(os.path.dirname(sys.executable), "py")
            template = ("{freebayes} -f {ref_file} {opts} "
                        "{paired.tumor_bam} {paired.normal_bam} "
                        "| vcffilter -f 'QUAL > 5' -s "
                        "| {py_cl} -x 'bcbio.variation.freebayes.call_somatic(x)' "
                        "| {fix_ambig} | vcfallelicprimitives --keep-info --keep-geno "
                        "| vt normalize -q -r {ref_file} - "
                        "{compress_cmd} > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            cl = template.format(freebayes=freebayes_exe,
                                 ref_file=ref_file,
                                 opts=caller_opts,
                                 paired=paired,
                                 py_cl=py_exe,
                                 fix_ambig=fix_ambig_cl,
                                 compress_cmd=compress,
                                 tx_out_file=tx_out_file)
            do.run(cl, "Genotyping paired variants with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None, somatic=None):
    """Call SNPs and indels on the input BAMs with FreeBayes.

    When the option helper reports no target regions, an empty VCF with the
    sample names is written instead of running the caller. Otherwise raw
    calls are filtered (QUAL > 5, which drops very low quality variants that
    can cause trouble downstream in GATK), cleaned, decomposed into allelic
    primitives, sorted, normalized and de-duplicated. The finished file is
    passed to annotate_nongatk_vcf and the annotated file is returned.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            freebayes_exe = config_utils.get_program("freebayes", config)
            vcffilter_exe = config_utils.get_program("vcffilter", config)
            bam_args = " ".join(["-b %s" % fname for fname in align_bams])
            opt_parts, no_target_regions = _freebayes_options_from_config(
                items, config, out_file, region)
            if no_target_regions:
                sample_names = [dd.get_sample_name(d) for d in items]
                vcfutils.write_empty_vcf(tx_out_file, config, samples=sample_names)
            else:
                # Recommended options from 1000 genomes low-complexity evaluation
                # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
                caller_opts = " ".join(opt_parts) + " --min-repeat-entropy 1"
                if somatic:
                    caller_opts = _add_somatic_opts(caller_opts, somatic)
                if out_file.endswith("gz"):
                    compress = "| bgzip -c"
                else:
                    compress = ""
                fix_ambig_cl = vcfutils.fix_ambiguous_cl()
                py_exe = os.path.join(os.path.dirname(sys.executable), "py")
                template = ("{freebayes} -f {ref_file} {opts} {input_bams} | "
                            "{vcffilter} -f 'QUAL > 5' -s | {fix_ambig} | "
                            "bcftools view -a - 2> /dev/null | "
                            "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                            "vcfallelicprimitives --keep-geno | vcffixup - | vcfstreamsort | "
                            "vt normalize -n -r {ref_file} -q - 2> /dev/null | vcfuniqalleles "
                            "{compress_cmd} > {tx_out_file}")
                cl = template.format(freebayes=freebayes_exe,
                                     ref_file=ref_file,
                                     opts=caller_opts,
                                     input_bams=bam_args,
                                     vcffilter=vcffilter_exe,
                                     fix_ambig=fix_ambig_cl,
                                     py_cl=py_exe,
                                     compress_cmd=compress,
                                     tx_out_file=tx_out_file)
                do.run(cl, "Genotyping with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_scalpel_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call indels on a single sample with Scalpel.

    Raises ValueError when more than one BAM is supplied. The Scalpel indel
    VCF is filtered with bcftools, has its sample name placeholder replaced,
    and is cleaned, decomposed and sorted into out_file. The finished file is
    passed to annotate_nongatk_vcf and the annotated file is returned.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            if len(align_bams) > 1:
                raise ValueError("Scalpel does not currently support batch calling!")
            bam_args = " ".join(["%s" % fname for fname in align_bams])
            work_dir = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            if os.path.exists(work_dir):
                utils.remove_safe(work_dir)
            scalpel_opts = " ".join(
                _scalpel_options_from_config(items, config, out_file, region, work_dir))
            scalpel_opts += " --dir %s" % work_dir
            # minimum coverage
            scalpel_opts += " --mincov %s" % "3"
            discovery_tmpl = ("{perl_exports} && "
                              "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} ")
            discovery_cl = discovery_tmpl.format(perl_exports=utils.get_perl_exports(),
                                                 opts=scalpel_opts,
                                                 ref_file=ref_file,
                                                 input_bams=bam_args)
            do.run(discovery_cl, "Genotyping with Scalpel", {})
            # parse produced variant file further
            raw_indels = bgzip_and_index(os.path.join(work_dir, "variants.indel.vcf"), config)
            if out_file.endswith("gz"):
                compress = "| bgzip -c"
            else:
                compress = ""
            chi2_filter_cl = get_scalpel_bcftools_filter_expression("chi2", config)
            sample_name = items[0]["name"][1]
            fix_ambig_cl = vcfutils.fix_ambiguous_cl()
            finalize_tmpl = (
                "{bcftools_cmd_chi2} {scalpel_tmp_file} | sed 's/sample_name/{sample_name_str}/g' "
                "| {fix_ambig} | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort "
                "{compress_cmd} > {tx_out_file}")
            finalize_cl = finalize_tmpl.format(bcftools_cmd_chi2=chi2_filter_cl,
                                               scalpel_tmp_file=raw_indels,
                                               sample_name_str=sample_name,
                                               fix_ambig=fix_ambig_cl,
                                               compress_cmd=compress,
                                               tx_out_file=tx_out_file)
            do.run(finalize_cl, "Finalising Scalpel variants", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"),
                                           ref_file, config)
def _varscan_version_key(version):
    """Convert a version string such as "v2.3.6" into a tuple of ints.

    Comparing these tuples orders versions numerically, so "v2.3.10" sorts
    after "v2.3.6". A missing or digit-free version yields an empty tuple,
    which compares lower than any real version.
    """
    import re
    return tuple(int(part) for part in re.findall(r"\d+", str(version or "")))


def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    The mpileup is first written to a temporary file so an empty pileup can
    be detected; in that case an empty VCF is written instead of running
    VarScan. Calls go to the uncompressed ``.vcf`` form of out_file, and are
    handed to vcfutils.bgzip_and_index when out_file ends in ``.gz``.

    Raises IOError when the installed VarScan is older than 2.3.6.
    """
    config = items[0]["config"]
    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare numerically: plain string comparison would rank "v2.3.10"
    # below "v2.3.6" and reject a perfectly good installation
    if _varscan_version_key(version) < _varscan_version_key("v2.3.6"):
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    # write a temporary mpileup file so we can check if empty
    mpfile = "%s.mpileup" % os.path.splitext(out_file)[0]
    with file_transaction(config, mpfile) as mpfile_tx:
        # template fields are filled from the local variable names above
        cmd = ("{mpileup} | {remove_zerocoverage} > {mpfile_tx}")
        do.run(cmd.format(**locals()), "mpileup for Varscan")
    if os.path.getsize(mpfile) == 0:
        write_empty_vcf(out_file)
    else:
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = ("cat {mpfile} "
                   "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
                   " --vcf-sample-list {sample_list} --output-vcf --variants "
                   "| {fix_ambig} | vcfuniqalleles > {out_file}")
            do.run(cmd.format(**locals()), "Varscan", None,
                   [do.file_exists(out_file)])
    os.remove(sample_list)
    os.remove(mpfile)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    else:
        freebayes.clean_vcf_output(out_file, _clean_varscan_line, config)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Genotype SNPs and indels with VarScan from a samtools mpileup stream.

    Calls are written to the uncompressed ``.vcf`` form of out_file. When
    out_file ends in ``.gz`` the result is handed to
    vcfutils.bgzip_and_index afterwards.
    """
    config = items[0]["config"]
    vcf_out = out_file.replace(".vcf.gz", ".vcf")
    sample_list = _create_sample_list(align_bams, vcf_out)
    pileup_cl = samtools.prep_mpileup(align_bams, ref_file, config, "1000",
                                      target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them.
    # we use ifne from moreutils to ensure we process only on files with input, skipping otherwise
    # http://manpages.ubuntu.com/manpages/natty/man1/ifne.1.html
    zerocov_filter = r"ifne grep -v -P '\t0\t\t$'"
    template = ("{mpileup} | {remove_zerocoverage} | "
                "ifne varscan {jvm_opts} mpileup2cns --min-coverage 5 --p-value 0.98 "
                " --vcf-sample-list {sample_list} --output-vcf --variants | "
                "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x)' | "
                "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles > {out_file}")
    with tx_tmpdir(items[0]) as tmp_dir:
        java_opts = _get_varscan_opts(config, tmp_dir)
        fix_ref_cl = vcfutils.fix_ambiguous_cl()
        fix_alt_cl = vcfutils.fix_ambiguous_cl(5)
        py_exe = os.path.join(os.path.dirname(sys.executable), "py")
        cl = template.format(mpileup=pileup_cl,
                             remove_zerocoverage=zerocov_filter,
                             jvm_opts=java_opts,
                             sample_list=sample_list,
                             py_cl=py_exe,
                             fix_ambig_ref=fix_ref_cl,
                             fix_ambig_alt=fix_alt_cl,
                             out_file=vcf_out)
        do.run(cl, "Varscan", None, [do.file_exists(vcf_out)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(vcf_out) == 0:
        write_empty_vcf(vcf_out)
    if out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(vcf_out, config)
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call SNPs and indels on a tumor / normal pair with FreeBayes.

    A normal BAM is required (enforced with an assertion). The finished file
    is passed to annotate_nongatk_vcf and the annotated file is returned.

    Sources of options for FreeBayes:
    mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
    mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ
    speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916
    sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/sga_generate_varcall_makefile.pl#L299
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            assert paired.normal_bam, "Require normal BAM for FreeBayes paired calling and filtering"

            freebayes_exe = config_utils.get_program("freebayes", config)
            base_opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            caller_opts = _add_somatic_opts(base_opts + " --min-repeat-entropy 1 --experimental-gls",
                                            paired)
            if out_file.endswith("gz"):
                compress = "| bgzip -c"
            else:
                compress = ""
            fix_ambig_cl = vcfutils.fix_ambiguous_cl()
            py_exe = os.path.join(os.path.dirname(sys.executable), "py")
            template = ("{freebayes} -f {ref_file} {opts} "
                        "{paired.tumor_bam} {paired.normal_bam} "
                        "| vcffilter -f 'QUAL > 5' -s "
                        "| {py_cl} -x 'bcbio.variation.freebayes.call_somatic(x)' "
                        "| {fix_ambig} | "
                        "vcfallelicprimitives --keep-geno | vcffixup - | vcfstreamsort | "
                        "vt normalize -r {ref_file} -q - 2> /dev/null | vcfuniqalleles "
                        "{compress_cmd} > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            cl = template.format(freebayes=freebayes_exe,
                                 ref_file=ref_file,
                                 opts=caller_opts,
                                 paired=paired,
                                 py_cl=py_exe,
                                 fix_ambig=fix_ambig_cl,
                                 compress_cmd=compress,
                                 tx_out_file=tx_out_file)
            do.run(cl, "Genotyping paired variants with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes for paired tumor/normal samples.

    A normal BAM is required (enforced with an assertion). When the option
    helper reports no target regions, an empty VCF naming the tumor and
    normal samples is written instead of running the caller. Returns
    out_file.

    Sources of options for FreeBayes:
    mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
    mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ
    speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916
    sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/sga_generate_varcall_makefile.pl#L299
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    # A single existence check guards the whole calling step
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            assert paired.normal_bam, "Require normal BAM for FreeBayes paired calling and filtering"

            freebayes = config_utils.get_program("freebayes", config)
            opts, no_target_regions = _freebayes_options_from_config(items, config, out_file, region)
            if no_target_regions:
                vcfutils.write_empty_vcf(tx_out_file, config,
                                         samples=[x for x in [paired.tumor_name, paired.normal_name] if x])
            else:
                opts = " ".join(opts)
                opts += " --min-repeat-entropy 1"
                opts += " --no-partial-observations"
                opts = _add_somatic_opts(opts, paired)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                # For multi-sample outputs, ensure consistent order
                samples = ("-s " + ",".join([dd.get_sample_name(d) for d in items])) if len(items) > 1 else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                bcbio_py = sys.executable
                py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                # template fields are filled from the local variable names above
                cl = ("{freebayes} -f {ref_file} {opts} "
                      "{paired.tumor_bam} {paired.normal_bam} "
                      """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """
                      """| {bcbio_py} -c 'from bcbio.variation import freebayes; """
                      """freebayes.call_somatic("{paired.tumor_name}", "{paired.normal_name}")' """
                      "| {fix_ambig} | bcftools view {samples} -a - | "
                      "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                      "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                      "vt normalize -n -r {ref_file} -q - | vcfuniqalleles | vt uniq - 2> /dev/null "
                      "{compress_cmd} > {tx_out_file}")
                do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {})
    return out_file
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Call variants on a tumor/normal pair with VarDict.

    Hands off to `_run_vardict_caller` when the pair has no normal BAM.
    Otherwise indexes both BAMs and runs a single shell pipeline inside a
    file transaction. Returns `out_file`; an existing `out_file` is
    returned without running anything.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = os.path.splitext(align_bams[0])[0] + "-paired-variants.vcf.gz"
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(items[0], out_file) as tx_out_file:
        paired = vcfutils.get_paired_bams(align_bams, items)
        if not paired.normal_bam:
            return _run_vardict_caller(align_bams, items, ref_file,
                                       assoc_files, region, out_file)
        # Looked up but not referenced by the pipeline below.
        vcffilter = config_utils.get_program("vcffilter", config)
        vardict_cmd = dd.get_variantcaller(items[0])
        sort_cmd = config_utils.get_program("vcfstreamsort", config)
        if out_file.endswith("gz"):
            compress_cmd = "| bgzip -c"
        else:
            compress_cmd = ""
        min_af_pct = utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)
        freq = float(min_af_pct) / 100.0
        # merge bed file regions as amplicon VarDict is only supported in single sample mode
        vardict_opts = " ".join(_vardict_options_from_config(items, config, out_file, region,
                                                             do_merge=True))
        # Read but not used further in this function.
        coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
        # for deep targeted panels, require 50 worth of coverage
        if highdepth.get_median_coverage(items[0]) > 5000:
            var2vcf_opts = " -v 50 "
        else:
            var2vcf_opts = ""
        fix_ambig = vcfutils.fix_ambiguous_cl()
        filter_disabled = False
        for data in items:
            if "vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, []):
                filter_disabled = True
                break
        if filter_disabled:
            somatic_filter = ""
        else:
            py_cl = os.path.join(os.path.dirname(sys.executable), "py")
            somatic_filter = "| %s -x 'bcbio.variation.freebayes.call_somatic(x)'" % py_cl
        jvm_opts = _get_jvm_opts(items[0], tx_out_file)
        template = (
            "{jvm_opts}{vardict} -G {ref_file} -f {freq} "
            "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
            "| {strandbias} "
            "| {var2vcf} -N \"{paired.tumor_name}|{paired.normal_name}\" -f {freq} {var2vcf_opts} "
            "| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
            "| sed 's/\\\\.*Somatic\\\\/Somatic/' "
            "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
            "{somatic_filter} | {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
        for bam_file in (paired.tumor_bam, paired.normal_bam):
            bam.index(bam_file, config)
        shell_cmd = template.format(
            jvm_opts=jvm_opts, vardict=vardict_cmd, ref_file=ref_file, freq=freq,
            paired=paired, opts=vardict_opts, strandbias="testsomatic.R",
            var2vcf="var2vcf_paired.pl", var2vcf_opts=var2vcf_opts,
            somatic_filter=somatic_filter, fix_ambig=fix_ambig,
            vcfstreamsort=sort_cmd, compress_cmd=compress_cmd,
            tx_out_file=tx_out_file)
        do.run(shell_cmd, "Genotyping with VarDict: Inference", {})
    return out_file
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.

    align_bams -- BAM paths; each is indexed and passed via --bamFiles.
    items -- sample dictionaries; items[0] supplies config and reference.
    ref_file, assoc_files -- not used by this function (the reference is
        taken from items[0]).
    region -- region passed to `_subset_regions` to build --regions.
    out_file -- output path; must end with ".vcf.gz" (asserted).

    Returns the output path as returned by `vcfutils.bgzip_and_index` when
    calling runs, or `out_file` unchanged when it already exists.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = ["platypus", "callVariants", "--regions=%s" % _subset_regions(region, out_file, items),
                   "--bamFiles=%s" % ",".join(align_bams),
                   "--refFile=%s" % dd.get_ref_file(items[0]), "--output=-",
                   "--logFileName", "/dev/null", "--verbosity=1"]
            resources = config_utils.get_resources("platypus", items[0]["config"])
            if resources.get("options"):
                # normalize options so we can set defaults without overwriting user specified
                for opt in resources["options"]:
                    if "=" in opt:
                        # Split on the first "=" only so values that contain
                        # "=" themselves do not break the two-way unpacking.
                        key, val = opt.split("=", 1)
                        cmd.extend([key, val])
                    else:
                        cmd.append(opt)
            if any("gvcf" in dd.get_tools_on(d) for d in items):
                cmd += ["--outputRefCalls", "1", "--refCallBlockSize", "50000"]
            # Adjust default filter thresholds to achieve similar sensitivity/specificity to other callers
            # Currently not used after doing more cross validation as they increase false positives
            # which seems to be a major advantage for Platypus users.
            # tuned_opts = ["--hapScoreThreshold", "10", "--scThreshold", "0.99", "--filteredReadsFrac", "0.9",
            #               "--rmsmqThreshold", "20", "--qdThreshold", "0", "--abThreshold", "0.0001",
            #               "--minVarFreq", "0.0", "--assemble", "1"]
            # for okey, oval in utils.partition_all(2, tuned_opts):
            #     if okey not in cmd:
            #         cmd.extend([okey, oval])

            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            if any(not dd.get_mark_duplicates(data) for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = (" | %s | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | "
                                "vcfstreamsort | bgzip -c > %s" % (vcfutils.fix_ambiguous_cl(), tx_out_file))
            do.run(" ".join(cmd) + post_process_cmd, "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Run VarDict on a tumor/normal pair and annotate the result.

    Without a normal BAM the work is delegated to `_run_vardict_caller` and
    its result returned directly. Otherwise the pipeline runs inside a file
    transaction (skipped when `out_file` already exists) and the value of
    `annotation.annotate_nongatk_vcf` for `out_file` is returned.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = os.path.splitext(align_bams[0])[0] + "-paired-variants.vcf.gz"
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                return _run_vardict_caller(align_bams, items, ref_file,
                                           assoc_files, region, out_file)
            # Looked up but not referenced by the pipeline below.
            vcffilter = config_utils.get_program("vcffilter", config)
            vardict_cmd = config_utils.get_program("vardict", config)
            sort_cmd = config_utils.get_program("vcfstreamsort", config)
            compress_cmd = ""
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            min_af_pct = utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)
            freq = float(min_af_pct) / 100.0
            # merge bed file regions as amplicon VarDict is only supported in single sample mode
            vardict_opts = " ".join(_vardict_options_from_config(items, config, out_file, region,
                                                                 do_merge=True))
            coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
            # for deep targeted panels, require 50 worth of coverage
            var2vcf_opts = ""
            if coverage_interval == "regional":
                var2vcf_opts = " -v 50 "
            fix_ambig = vcfutils.fix_ambiguous_cl()
            filter_disabled = any(
                "vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                for data in items)
            if filter_disabled:
                somatic_filter = ""
            else:
                py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                somatic_filter = "| %s -x 'bcbio.variation.freebayes.call_somatic(x)'" % py_cl
            template = (
                "{vardict} -G {ref_file} -f {freq} "
                "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                "| {strandbias} "
                "| {var2vcf} -M -N \"{paired.tumor_name}|{paired.normal_name}\" -f {freq} {var2vcf_opts} "
                "{somatic_filter} | {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            for bam_file in (paired.tumor_bam, paired.normal_bam):
                bam.index(bam_file, config)
            shell_cmd = template.format(
                vardict=vardict_cmd, ref_file=ref_file, freq=freq, paired=paired,
                opts=vardict_opts, strandbias="testsomatic.R",
                var2vcf="var2vcf_paired.pl", var2vcf_opts=var2vcf_opts,
                somatic_filter=somatic_filter, fix_ambig=fix_ambig,
                vcfstreamsort=sort_cmd, compress_cmd=compress_cmd,
                tx_out_file=tx_out_file)
            do.run(shell_cmd, "Genotyping with VarDict: Inference", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None, somatic=None):
    """Call SNPs and indels with FreeBayes, then annotate the output.

    Writes an empty VCF when the option builder reports no target regions;
    otherwise runs FreeBayes followed by low-quality filtering (QUAL > 5 or
    the "<*>" ALT), decomposition into allelic primitives and normalization.
    Calling is skipped when `out_file` already exists. Returns the value of
    `annotation.annotate_nongatk_vcf` for `out_file`.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = os.path.splitext(align_bams[0])[0] + "-variants.vcf.gz"
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            freebayes_cmd = config_utils.get_program("freebayes", config)
            bam_args = " ".join(["-b %s" % bam_file for bam_file in align_bams])
            opt_list, no_target_regions = _freebayes_options_from_config(items, config,
                                                                         out_file, region)
            if no_target_regions:
                sample_names = [dd.get_sample_name(d) for d in items]
                vcfutils.write_empty_vcf(tx_out_file, config, samples=sample_names)
            else:
                # Recommended options from 1000 genomes low-complexity evaluation
                # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
                # Remove partial observations, which cause a preference for heterozygote calls
                # https://github.com/ekg/freebayes/issues/234#issuecomment-205331765
                fb_opts = (" ".join(opt_list) + " --min-repeat-entropy 1"
                           + " --no-partial-observations")
                if somatic:
                    fb_opts = _add_somatic_opts(fb_opts, somatic)
                if out_file.endswith("gz"):
                    compress_cmd = "| bgzip -c"
                else:
                    compress_cmd = ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                clean_fmt_cmd = _clean_freebayes_fmt_cl()
                py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                template = (
                    "{freebayes} -f {ref_file} {opts} {input_bams} "
                    """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """
                    "| {fix_ambig} | {clean_fmt_cmd} bcftools view -a - | "
                    "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                    "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                    "vt normalize -n -r {ref_file} -q - | vcfuniqalleles "
                    "{compress_cmd} > {tx_out_file}")
                shell_cmd = template.format(
                    freebayes=freebayes_cmd, ref_file=ref_file, opts=fb_opts,
                    input_bams=bam_args, fix_ambig=fix_ambig,
                    clean_fmt_cmd=clean_fmt_cmd, py_cl=py_cl,
                    compress_cmd=compress_cmd, tx_out_file=tx_out_file)
                do.run(shell_cmd, "Genotyping with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect indels with Scalpel in single sample mode.

    Runs scalpel-discovery into a transactional work directory, moves that
    directory next to `out_file`, then filters and cleans the produced
    "variants.indel.vcf" into the transactional output. Raises ValueError
    when more than one BAM is supplied. Returns the value of
    `annotation.annotate_nongatk_vcf` for `out_file`.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = os.path.splitext(align_bams[0])[0] + "-variants.vcf.gz"
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            if len(align_bams) > 1:
                raise ValueError("Scalpel does not currently support batch calling!")
            bam_args = " ".join(["%s" % bam_file for bam_file in align_bams])
            work_dir = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            tx_work_dir = "%s-scalpel-work" % utils.splitext_plus(tx_out_file)[0]
            if os.path.exists(work_dir):
                utils.remove_safe(work_dir)
            scalpel_opts = " ".join(_scalpel_options_from_config(items, config, out_file,
                                                                 region, work_dir))
            scalpel_opts += " --dir %s" % tx_work_dir
            min_cov = "3"  # minimum coverage
            scalpel_opts += " --mincov %s" % min_cov
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            discovery = ("{perl_exports} && "
                         "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} ")
            do.run(discovery.format(perl_exports=perl_exports, opts=scalpel_opts,
                                    ref_file=ref_file, input_bams=bam_args),
                   "Genotyping with Scalpel", {})
            shutil.move(tx_work_dir, work_dir)
            # parse produced variant file further
            indel_vcf = bgzip_and_index(os.path.join(work_dir, "variants.indel.vcf"), config)
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            chi2_filter = get_scalpel_bcftools_filter_expression("chi2", config)
            sample_name = items[0]["name"][1]
            fix_ambig = vcfutils.fix_ambiguous_cl()
            finalize = (
                "{bcftools_cmd_chi2} {scalpel_tmp_file} | "
                r"sed 's/FORMAT\tsample\(_name\)\{{0,1\}}/FORMAT\t{sample_name_str}/g' "
                "| {fix_ambig} | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort "
                "{compress_cmd} > {tx_out_file}")
            do.run(finalize.format(bcftools_cmd_chi2=chi2_filter,
                                   scalpel_tmp_file=indel_vcf,
                                   sample_name_str=sample_name,
                                   fix_ambig=fix_ambig,
                                   compress_cmd=compress_cmd,
                                   tx_out_file=tx_out_file),
                   "Finalising Scalpel variants", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def rnaseq_vardict_variant_calling(data):
    """Call RNA-seq variants for one sample with VarDict.

    Output goes to <work_dir>/variation/rnaseq/vardict/<sample>-vardict.vcf.gz.
    When that file already exists it is only registered on `data`. Otherwise
    the VarDict pipeline runs in a file transaction and the result is passed
    through `vcfutils.bgzip_and_index` before being registered. Returns the
    updated `data`.
    """
    sample = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "variation", "rnaseq", "vardict"))
    out_file = os.path.join(out_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        return dd.set_vrn_file(data, out_file)

    vardict_cmd = vardict.get_vardict_command(data)
    sort_cmd = config_utils.get_program("vcfstreamsort", data)
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = get_R_exports()
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    data = _setup_variant_regions(data, out_dir)
    vardict_opts, _ = vardict._vardict_options_from_config(
        [data], data["config"], out_file, dd.get_variant_regions(data),
        is_rnaseq=True)
    cores = dd.get_num_cores(data)
    if cores and cores > 1:
        vardict_opts += " -th %s" % str(cores)

    template = ("{r_setup} && {jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
                "-N {sample} -b {bamfile} {opts} "
                "| {strandbias}"
                "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
                "> {tx_out_file}")
    with file_transaction(data, out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        shell_cmd = template.format(
            r_setup=r_setup, jvm_opts=jvm_opts, vardict_cmd=vardict_cmd,
            ref_file=ref_file, freq=freq, sample=sample, bamfile=bamfile,
            opts=vardict_opts, strandbias="teststrandbias.R",
            var2vcf="var2vcf_valid.pl", var2vcf_opts="-v 50",
            fix_ambig=fix_ambig, remove_dup=remove_dup,
            vcfstreamsort=sort_cmd, compress_cmd="| bgzip -c",
            tx_out_file=tx_out_file)
        do.run(shell_cmd, "Calling RNA-seq variants with VarDict")
    out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return dd.set_vrn_file(data, out_file)
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None, somatic=None):
    """Detect SNPs and indels with FreeBayes.

    Performs post-filtering to remove very low quality variants which
    can cause issues feeding into GATK. Breaks variants into individual
    allelic primitives for analysis and evaluation.

    align_bams -- BAM paths, each passed to FreeBayes with -b; the first
        one names the default output file.
    items -- sample dictionaries; items[0] supplies the configuration.
    ref_file -- reference passed to FreeBayes and vt normalize.
    assoc_files -- not used by this function.
    region -- region handed to the FreeBayes option builder.
    out_file -- output VCF path; derived from the first BAM when None.
    somatic -- when truthy, passed to `_add_somatic_opts` to extend the
        FreeBayes options.

    Returns the output VCF path. An existing output file is reused as is.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            freebayes = config_utils.get_program("freebayes", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts, no_target_regions = _freebayes_options_from_config(items, config, out_file, region)
            if no_target_regions:
                # Nothing to call in this region: emit a header-only VCF.
                vcfutils.write_empty_vcf(tx_out_file, config, samples=[dd.get_sample_name(d) for d in items])
            else:
                opts = " ".join(opts)
                # Recommended options from 1000 genomes low-complexity evaluation
                # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
                opts += " --min-repeat-entropy 1"
                # Remove partial observations, which cause a preference for heterozygote calls
                # https://github.com/ekg/freebayes/issues/234#issuecomment-205331765
                opts += " --no-partial-observations"
                if somatic:
                    opts = _add_somatic_opts(opts, somatic)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                # For multi-sample outputs, ensure consistent order
                samples = ("-s" + ",".join([dd.get_sample_name(d) for d in items])) if len(items) > 1 else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                py_cl = config_utils.get_program("py", config)
                cmd = ("{freebayes} -f {ref_file} {opts} {input_bams} "
                       """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """
                       "| {fix_ambig} | bcftools view {samples} -a - | "
                       "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                       "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                       "vt normalize -n -r {ref_file} -q - | vcfuniqalleles | vt uniq - 2> /dev/null "
                       "{compress_cmd} > {tx_out_file}")
                # The template is filled from local names, so the variable
                # names above must match the placeholders in `cmd`.
                do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
    return out_file
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Call tumor/normal variants with FreeBayes and annotate the result.

    Sources of options for FreeBayes:
    mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
    mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ
    speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916
    sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/
                   sga_generate_varcall_makefile.pl#L299

    Requires a normal BAM (asserted). Calling is skipped when `out_file`
    already exists. Returns the value of `annotation.annotate_nongatk_vcf`
    for `out_file`.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = os.path.splitext(align_bams[0])[0] + "-paired-variants.vcf.gz"
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            assert paired.normal_bam, "Require normal BAM for FreeBayes paired calling and filtering"

            freebayes_cmd = config_utils.get_program("freebayes", config)
            fb_opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            fb_opts = _add_somatic_opts(fb_opts + " --min-repeat-entropy 1", paired)
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            py_cl = os.path.join(os.path.dirname(sys.executable), "py")
            template = (
                "{freebayes} -f {ref_file} {opts} "
                "{paired.tumor_bam} {paired.normal_bam} "
                "| vcffilter -f 'QUAL > 5' -s "
                "| {py_cl} -x 'bcbio.variation.freebayes.call_somatic(x)' "
                "| {fix_ambig} | bcftools view -a - 2> /dev/null | "
                "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                "vcfallelicprimitives --keep-geno | vcffixup - | vcfstreamsort | "
                "vt normalize -r {ref_file} -q - 2> /dev/null | vcfuniqalleles "
                "{compress_cmd} > {tx_out_file}")
            for bam_file in (paired.tumor_bam, paired.normal_bam):
                bam.index(bam_file, config)
            shell_cmd = template.format(
                freebayes=freebayes_cmd, ref_file=ref_file, opts=fb_opts,
                paired=paired, py_cl=py_cl, fix_ambig=fix_ambig,
                compress_cmd=compress_cmd, tx_out_file=tx_out_file)
            do.run(shell_cmd, "Genotyping paired variants with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Run VarDict somatic calling on a tumor/normal pair, then annotate.

    Without a normal BAM the work is delegated to `_run_vardict_caller` and
    its result returned directly. Calling is skipped when `out_file` already
    exists. Returns the value of `annotation.annotate_nongatk_vcf` for
    `out_file`.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = os.path.splitext(align_bams[0])[0] + "-paired-variants.vcf.gz"
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                return _run_vardict_caller(align_bams, items, ref_file,
                                           assoc_files, region, out_file)
            # vcffilter and vcfallelicprimitives are looked up but not
            # referenced by the pipeline below.
            vcffilter = config_utils.get_program("vcffilter", config)
            vardict_cmd = config_utils.get_program("vardict", config)
            sort_cmd = config_utils.get_program("vcfstreamsort", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            min_af_pct = utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)
            freq = float(min_af_pct) / 100.0
            # merge bed file regions as amplicon VarDict is only supported in single sample mode
            vardict_opts = " ".join(_vardict_options_from_config(items, config, out_file, region,
                                                                 do_merge=True))
            coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
            # for deep targeted panels, require 50 worth of coverage
            if coverage_interval == "regional":
                var2vcf_opts = " -v 50 "
            else:
                var2vcf_opts = ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            template = (
                "{vardict} -G {ref_file} -f {freq} "
                "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                "| {strandbias} "
                "| {var2vcf} -N \"{paired.tumor_name}|{paired.normal_name}\" -f {freq} {var2vcf_opts} "
                "| {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            for bam_file in (paired.tumor_bam, paired.normal_bam):
                bam.index(bam_file, config)
            shell_cmd = template.format(
                vardict=vardict_cmd, ref_file=ref_file, freq=freq, paired=paired,
                opts=vardict_opts, strandbias="testsomatic.R",
                var2vcf="var2vcf_somatic.pl", var2vcf_opts=var2vcf_opts,
                fix_ambig=fix_ambig, vcfstreamsort=sort_cmd,
                compress_cmd=compress_cmd, tx_out_file=tx_out_file)
            do.run(shell_cmd, "Genotyping with VarDict: Inference", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def rnaseq_vardict_variant_calling(data):
    """Run VarDict on the RNA-seq work BAM of a single sample.

    The VCF is written under <work_dir>/variation/rnaseq/vardict. An
    existing VCF is registered on `data` and returned immediately; a fresh
    one is passed through `vcfutils.bgzip_and_index` first. Returns the
    updated `data`.
    """
    sample = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "variation", "rnaseq", "vardict"))
    out_file = os.path.join(out_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        return dd.set_vrn_file(data, out_file)

    # Collect every value substituted into the command template below.
    fields = {"sample": sample,
              "strandbias": "teststrandbias.R",
              "var2vcf": "var2vcf_valid.pl",
              "compress_cmd": "| bgzip -c",
              "var2vcf_opts": "-v 50"}
    fields["vardict_cmd"] = vardict.get_vardict_command(data)
    fields["vcfstreamsort"] = config_utils.get_program("vcfstreamsort", data)
    fields["freq"] = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    fields["fix_ambig"] = vcfutils.fix_ambiguous_cl()
    fields["remove_dup"] = vcfutils.remove_dup_cl()
    fields["r_setup"] = get_R_exports()
    fields["ref_file"] = dd.get_ref_file(data)
    fields["bamfile"] = dd.get_work_bam(data)
    data = _setup_variant_regions(data, out_dir)
    regions = dd.get_variant_regions(data)
    opts, _ = vardict._vardict_options_from_config([data], data["config"], out_file,
                                                   regions, is_rnaseq=True)
    cores = dd.get_num_cores(data)
    if cores and cores > 1:
        opts += " -th %s" % str(cores)
    fields["opts"] = opts

    with file_transaction(data, out_file) as tx_out_file:
        fields["jvm_opts"] = vardict._get_jvm_opts(data, tx_out_file)
        fields["tx_out_file"] = tx_out_file
        pipeline = ("{r_setup} && {jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
                    "-N {sample} -b {bamfile} {opts} "
                    "| {strandbias}"
                    "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                    "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
                    "> {tx_out_file}")
        do.run(pipeline.format(**fields), "Calling RNA-seq variants with VarDict")
    indexed_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return dd.set_vrn_file(data, indexed_file)
def rnaseq_vardict_variant_calling(data):
    """Call RNA-seq variants with VarDict over regions from the sample GTF.

    Output is <work_dir>/variation/<sample>-vardict.vcf.gz. An existing
    output is registered on `data` without re-running. The GTF is converted
    to BED with `gtf.gtf_to_bed` and passed to VarDict with fixed column
    options, followed by any configured "vardict" resource options.
    Returns the updated `data`.
    """
    sample = dd.get_sample_name(data)
    variation_dir = os.path.join(dd.get_work_dir(data), "variation")
    safe_makedir(variation_dir)
    out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        return dd.set_vrn_file(data, out_file)

    vardict_cmd = vardict.get_vardict_command(data)
    sort_cmd = config_utils.get_program("vcfstreamsort", data)
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = get_R_exports()
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    bed_file = gtf.gtf_to_bed(dd.get_gtf_file(data))
    vardict_opts = " -c 1 -S 2 -E 3 -g 4 "
    resources = config_utils.get_resources("vardict", data)
    if resources.get("options"):
        vardict_opts = vardict_opts + " ".join(str(x) for x in resources["options"])

    template = ("{r_setup} && {jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
                "-N {sample} -b {bamfile} {opts} {bed_file} "
                "| {strandbias}"
                "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
                "> {tx_out_file}")
    with file_transaction(data, out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        shell_cmd = template.format(
            r_setup=r_setup, jvm_opts=jvm_opts, vardict_cmd=vardict_cmd,
            ref_file=ref_file, freq=freq, sample=sample, bamfile=bamfile,
            opts=vardict_opts, bed_file=bed_file,
            strandbias="teststrandbias.R", var2vcf="var2vcf_valid.pl",
            var2vcf_opts="-v 50", fix_ambig=fix_ambig, remove_dup=remove_dup,
            vcfstreamsort=sort_cmd, compress_cmd="| bgzip -c",
            tx_out_file=tx_out_file)
        do.run(shell_cmd, "Calling RNA-seq variants with VarDict")
    return dd.set_vrn_file(data, out_file)
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None, somatic=None):
    """Call SNPs and indels with FreeBayes and annotate the result.

    Variants with QUAL of 5 or below are dropped via vcffilter and the rest
    are decomposed into allelic primitives and normalized. Calling is
    skipped when `out_file` already exists. Returns the value of
    `annotation.annotate_nongatk_vcf` for `out_file`.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = os.path.splitext(align_bams[0])[0] + "-variants.vcf.gz"
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            freebayes_cmd = config_utils.get_program("freebayes", config)
            vcffilter_cmd = config_utils.get_program("vcffilter", config)
            bam_args = " ".join(["-b %s" % bam_file for bam_file in align_bams])
            # Recommended options from 1000 genomes low-complexity evaluation
            # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
            fb_opts = (" ".join(_freebayes_options_from_config(items, config, out_file, region))
                       + " --min-repeat-entropy 1")
            if somatic:
                fb_opts = _add_somatic_opts(fb_opts, somatic)
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            py_cl = os.path.join(os.path.dirname(sys.executable), "py")
            template = (
                "{freebayes} -f {ref_file} {opts} {input_bams} | "
                "{vcffilter} -f 'QUAL > 5' -s | {fix_ambig} | "
                "bcftools view -a - 2> /dev/null | "
                "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                "vcfallelicprimitives --keep-geno | vcffixup - | vcfstreamsort | "
                "vt normalize -n -r {ref_file} -q - 2> /dev/null | vcfuniqalleles "
                "{compress_cmd} > {tx_out_file}")
            shell_cmd = template.format(
                freebayes=freebayes_cmd, ref_file=ref_file, opts=fb_opts,
                input_bams=bam_args, vcffilter=vcffilter_cmd,
                fix_ambig=fix_ambig, py_cl=py_cl, compress_cmd=compress_cmd,
                tx_out_file=tx_out_file)
            do.run(shell_cmd, "Genotyping with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Call somatic variants on a tumor/normal pair with VarDict.

    Inside a file transaction (skipped when `out_file` exists):
    - writes an empty VCF when the subset target is not a BED file;
    - delegates to `_run_vardict_caller`, returning its result directly,
      when the pair has no normal BAM;
    - otherwise runs the paired VarDict pipeline, with REJECT soft-filtering
      unless "vardict_somatic_filter" is listed in any sample's tools_off.

    Returns `out_file`, passed through `annotation.add_dbsnp` when
    `assoc_files` provides a "dbsnp" entry.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = os.path.splitext(align_bams[0])[0] + "-paired-variants.vcf.gz"
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            target = shared.subset_variant_regions(dd.get_variant_regions(items[0]), region,
                                                   out_file, do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                pair_names = [x for x in [paired.tumor_name, paired.normal_name] if x]
                vcfutils.write_empty_vcf(tx_out_file, config, samples=pair_names)
            elif not paired.normal_bam:
                return _run_vardict_caller(align_bams, items, ref_file,
                                           assoc_files, region, out_file)
            else:
                # Looked up but not referenced by the pipeline below.
                vcffilter = config_utils.get_program("vcffilter", config)
                vardict_cmd = get_vardict_command(items[0])
                sort_cmd = config_utils.get_program("vcfstreamsort", config)
                if out_file.endswith("gz"):
                    compress_cmd = "| bgzip -c"
                else:
                    compress_cmd = ""
                min_af_pct = utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)
                freq = float(min_af_pct) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                vardict_opts = " ".join(_vardict_options_from_config(items, config, out_file, target))
                # Read but not used further in this function.
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                if highdepth.get_median_coverage(items[0]) > 5000:
                    var2vcf_opts = " -v 50 "
                else:
                    var2vcf_opts = ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                remove_dup = vcfutils.remove_dup_cl()
                filter_disabled = any(
                    "vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                    for data in items)
                if filter_disabled:
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    # this makes VarDict soft filter non-differential variants
                    var2vcf_opts += " -M "
                    py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                    somatic_filter = (
                        "| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                        "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                        "| %s -x 'bcbio.variation.freebayes.call_somatic(x)'" % py_cl)
                    freq_filter = (
                        "| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                        "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'" %
                        (py_cl, 0, dd.get_aligner(paired.tumor_data)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
                template = (
                    "{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                    "| {strandbias} "
                    "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                    "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                    "{freq_filter} "
                    "{somatic_filter} | {fix_ambig} | {remove_dup} | {vcfstreamsort} "
                    "{compress_cmd} > {tx_out_file}")
                shell_cmd = template.format(
                    r_setup=r_setup, jvm_opts=jvm_opts, vardict=vardict_cmd,
                    ref_file=ref_file, freq=freq, paired=paired,
                    opts=vardict_opts, strandbias="testsomatic.R",
                    var2vcf="var2vcf_paired.pl", var2vcf_opts=var2vcf_opts,
                    freq_filter=freq_filter, somatic_filter=somatic_filter,
                    fix_ambig=fix_ambig, remove_dup=remove_dup,
                    vcfstreamsort=sort_cmd, compress_cmd=compress_cmd,
                    tx_out_file=tx_out_file)
                do.run(shell_cmd, "Genotyping with VarDict: Inference", {})
    if assoc_files.get("dbsnp"):
        out_file = annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
    return out_file
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples. Without a normal BAM the
    call is delegated to `_run_scalpel_caller` and its result is returned
    directly. Calling is skipped when `out_file` already exists. Returns the
    value of `annotation.annotate_nongatk_vcf` for `out_file`.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            # The Scalpel work directory is derived from out_file (not the
            # transactional path), so it persists across runs.
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            # Discovery is skipped when "<db_file>.dir" already exists;
            # otherwise any existing work directory is removed first.
            if not os.path.exists(db_file + ".dir"):
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(
                    _scalpel_options_from_config(items, config, out_file,
                                                 region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # calling
                cl = (
                    "{perl_exports} && "
                    "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}"
                )
                # Templates in this function are filled from local names via
                # locals(); placeholders must match the variable names.
                do.run(cl.format(**locals()),
                       "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(
                _scalpel_bed_file_opts(items, config, out_file, region,
                                       tmp_path))
            # Hard-coded switch: as written, the scalpel-export branch below
            # never runs.
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path,
                                                "main/somatic.indel.vcf")
            # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher
            # to swap precision for sensitivity
            else:
                scalpel_tmp_file = os.path.join(
                    tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config,
                                      scalpel_tmp_file) as tx_indel_file:
                    cmd = (
                        "{perl_exports} && "
                        "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                        "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                        "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()),
                           "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(
                os.path.join(tmp_path, "main/common.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression(
                "chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression(
                "reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            # Concatenate the filtered somatic and common indel files, then
            # fix ambiguous bases, sort and optionally compress the result.
            cl2 = (
                "vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}"
            )
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect SNPs and indels with VarDict.

    var2vcf_valid uses -A flag which reports all alleles and improves sensitivity:
    https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191

    Each BAM is called separately. With a single BAM the result goes straight
    to the transactional output; with several, each sample gets its own
    temporary VCF and those are merged into the output afterwards. A sample
    whose target is not a BED file gets an empty VCF instead of a VarDict
    run. Calling is skipped when `out_file` already exists. Returns
    `out_file`.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs,
                                                   region,
                                                   out_file,
                                                   items=items,
                                                   do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = [
            ]  # for individual sample names, given batch calling may be required
            for bamfile, item in zip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts, var2vcf_opts = _vardict_options_from_config(
                    items, config, out_file, target)
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if tx_out_file.endswith(
                    "gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                # py_cl is not referenced by the command template below.
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                contig_cl = vcfutils.add_contig_to_header_cl(
                    ref_file, tx_out_file)
                # The command has no output redirect yet; the branch below
                # appends one for the per-sample or the final file. It is
                # filled from local names via locals(), so placeholders must
                # match the variable names above.
                cmd = (
                    "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {sample} -b {bamfile} {opts} "
                    "| {strandbias}"
                    "| {var2vcf} -A -N {sample} -E -f {freq} {var2vcf_opts} "
                    "| {contig_cl} | bcftools filter -i 'QUAL >= 0' "
                    "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}"
                )
                if num_bams > 1:
                    # Per-sample temporary VCF named from out_file (minus
                    # its .vcf/.gz parts) plus item["name"][1].
                    temp_file_prefix = out_file.replace(".gz", "").replace(
                        ".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file,
                                                     config,
                                                     samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()),
                                   "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file,
                                                 config,
                                                 samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()),
                               "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(
                    orig_files=sample_vcf_names,
                    out_file=tx_out_file,
                    ref_file=ref_file,
                    config=config,
                    region=bamprep.region_to_gatk(region))
    return out_file
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes a samtools mpileup for the normal and the tumor BAM, runs
    ``VarScan somatic`` on the pair, cleans up the SNP and indel VCFs and
    combines them into ``out_file``. Nothing is done when ``out_file``
    already exists.

    Args:
        align_bams: BAM files of the batch, passed to ``get_paired_bams``.
        ref_file: reference genome passed to mpileup and the combine step.
        items: sample dictionaries; ``items[0]["config"]`` supplies the config.
        target_regions: regions passed to mpileup and the combine step.
        out_file: final VCF path; a ``.vcf.gz`` name is produced by bgzipping
            the intermediate ``.vcf``.

    Returns:
        None. Results are communicated through the files written.

    Raises:
        IOError: the installed VarScan is older than 2.3.6.
        ValueError: the batch has no normal BAM.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        affected_batch = items[0]["metadata"]["batch"]
        message = ("Batch {} requires both tumor and normal BAM files for"
                   " VarScan cancer calling").format(affected_batch)
        raise ValueError(message)

    if file_exists(out_file):
        return

    orig_out_file = out_file
    # Work on an uncompressed VCF; compression happens as the last step.
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")
    base = utils.splitext_plus(out_file)[0]
    cleanup_files = []
    for bam_file, mpext in [(paired.normal_bam, "normal"),
                            (paired.tumor_bam, "tumor")]:
        mpfile = "%s-%s.mpileup" % (base, mpext)
        cleanup_files.append(mpfile)
        with file_transaction(config, mpfile) as mpfile_tx:
            mpileup = samtools.prep_mpileup([bam_file], ref_file, config,
                                            max_read_depth,
                                            target_regions=target_regions,
                                            want_bcf=False)
            cmd = "{mpileup} > {mpfile_tx}".format(mpileup=mpileup,
                                                   mpfile_tx=mpfile_tx)
            do.run(cmd, "samtools mpileup", None,
                   [do.file_exists(mpfile_tx)])

    # Sometimes mpileup writes an empty file: in this case we
    # just skip the rest of the analysis (VarScan will hang otherwise)
    if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
        write_empty_vcf(orig_out_file, config)
        return

    # First index is normal, second is tumor
    normal_tmp_mpileup, tumor_tmp_mpileup = cleanup_files[0], cleanup_files[1]

    indel_file = base + ".indel.vcf"
    snp_file = base + ".snp.vcf"
    cleanup_files.append(indel_file)
    cleanup_files.append(snp_file)
    with file_transaction(config, indel_file, snp_file) as (tx_indel, tx_snp):
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            tx_snp_in = "%s-orig" % os.path.splitext(tx_snp)[0]
            tx_indel_in = "%s-orig" % os.path.splitext(tx_indel)[0]
            # minimum AF comes from the tumor sample configuration (percent)
            min_af = float(utils.get_in(paired.tumor_config,
                                        ("algorithm", "min_allele_fraction"),
                                        10)) / 100.0
            varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                           " {normal_tmp_mpileup} {tumor_tmp_mpileup} "
                           "--output-snp {tx_snp_in} --output-indel {tx_indel_in} "
                           " --output-vcf --min-coverage 5 --p-value 0.98 "
                           "--strand-filter 1 "
                           "--min-var-freq {min_af} ")
            do.run(varscan_cmd.format(jvm_opts=jvm_opts,
                                      varscan_jar=varscan_jar,
                                      normal_tmp_mpileup=normal_tmp_mpileup,
                                      tumor_tmp_mpileup=tumor_tmp_mpileup,
                                      tx_snp_in=tx_snp_in,
                                      tx_indel_in=tx_indel_in,
                                      min_af=min_af),
                   "Varscan", None, None)
            for orig_fname, fixed_fname in [(tx_snp_in, tx_snp),
                                            (tx_indel_in, tx_indel)]:
                cmd = "vcfuniqalleles {orig_fname}.vcf | {fix_ambig} > {fname}"
                do.run(cmd.format(orig_fname=orig_fname, fix_ambig=fix_ambig,
                                  fname=fixed_fname),
                       "Varscan paired fix")

    # VarScan files need to be corrected to match the VCF specification
    # We do this before combining them otherwise merging may fail
    # if there are invalid records
    to_combine = []
    for vcf_file in (snp_file, indel_file):
        if do.file_exists(vcf_file):
            to_combine.append(vcf_file)
            _fix_varscan_vcf(vcf_file, paired.normal_name,
                             paired.tumor_name, config)

    if not to_combine:
        write_empty_vcf(orig_out_file, config)
        return

    # Combine only the files collected above: previously both the SNP and the
    # indel file were passed unconditionally, even if one was not produced.
    out_file = combine_variant_files(to_combine, out_file, ref_file, config,
                                     region=target_regions)

    # Remove intermediate mpileup / VCF files and their compressed siblings
    for extra_file in cleanup_files:
        for suffix in ["", ".gz", ".gz.tbi"]:
            if os.path.exists(extra_file + suffix):
                os.remove(extra_file + suffix)

    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)

    if orig_out_file.endswith(".gz"):
        out_file = bgzip_and_index(out_file, config)

    _add_reject_flag(out_file, config)
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples. When the batch has no
    normal BAM the call is delegated to ``_run_vardict_caller``. When the
    subset target is not a BED file an empty VCF is written instead.

    Returns the path of the output VCF.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    with file_transaction(items[0], out_file) as tx_out_file:
        vrs = bedutils.population_variant_regions(items)
        target = shared.subset_variant_regions(vrs, region, out_file,
                                               items=items, do_merge=True)
        paired = vcfutils.get_paired_bams(align_bams, items)
        if not _is_bed_file(target):
            sample_names = [x for x in [paired.tumor_name, paired.normal_name] if x]
            vcfutils.write_empty_vcf(tx_out_file, config, samples=sample_names)
        elif not paired.normal_bam:
            # Tumor-only batch: fall back to the single sample caller.
            return _run_vardict_caller(align_bams, items, ref_file,
                                       assoc_files, region, out_file)
        else:
            # NOTE: names used in the template below are resolved through
            # ``cmd.format(**locals())``; do not rename them.
            vardict = get_vardict_command(items[0])
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            compress_cmd = ""
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            # NOTE(review): freq is computed but this version of the command
            # template contains no {freq} placeholder -- confirm the allele
            # fraction is supplied through opts / var2vcf_opts.
            freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
            # merge bed file regions as amplicon VarDict is only supported in single sample mode
            opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
            fix_ambig_ref = vcfutils.fix_ambiguous_cl()
            fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
            remove_dup = vcfutils.remove_dup_cl()
            filter_disabled = any(
                "vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                for data in items)
            if filter_disabled:
                somatic_filter = ""
                freq_filter = ""
            else:
                var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                somatic_filter = (
                    "| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                    "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                    """| %s -c 'from bcbio.variation import freebayes; """
                    """freebayes.call_somatic("%s", "%s")' """
                    % (sys.executable, paired.tumor_name, paired.normal_name))
                freq_filter = (
                    "| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                    "| %s -x 'bcbio.variation.vardict.add_db_germline_flag(x)' "
                    "| %s "
                    "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                    % (os.path.join(os.path.dirname(sys.executable), "py"),
                       _lowfreq_linear_filter(0, True),
                       os.path.join(os.path.dirname(sys.executable), "py"),
                       0, bam.aligner_from_header(paired.tumor_bam)))
            jvm_opts = _get_jvm_opts(items[0], tx_out_file)
            # NOTE(review): py_cl is not referenced by the template below.
            py_cl = os.path.join(utils.get_bcbio_bin(), "py")
            setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
            contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
            cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} "
                   "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                   "| awk 'NF>=48' | testsomatic.R "
                   "| var2vcf_paired.pl -P 0.9 -m 4.25 {var2vcf_opts} "
                   "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                   "| {contig_cl} {freq_filter} "
                   "| bcftools filter -i 'QUAL >= 0' "
                   "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    return out_file
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect SNPs and indels with VarDict.

    ``align_bams`` and ``items`` are processed pairwise. A single BAM is
    written directly to the transactional output; several BAMs are written to
    per-sample temporary VCFs that are merged afterwards. When the subset
    target is not a BED file an empty VCF is written instead of running
    VarDict.

    Args:
        align_bams: BAM files to call, parallel to ``items``.
        items: sample dictionaries; ``items[0]["config"]`` supplies the config.
        ref_file: reference genome passed to VarDict (``-G``).
        assoc_files: mapping of associated resources; an optional ``"dbsnp"``
            entry triggers dbSNP annotation of the result.
        region: region used to subset the variant regions and for merging.
        out_file: output VCF; derived from the first BAM name when ``None``.

    Returns:
        Path of the output VCF, or of the dbSNP-annotated VCF when a
        ``"dbsnp"`` resource is available.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            target = shared.subset_variant_regions(dd.get_variant_regions(items[0]), region,
                                                   out_file, do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            # Builtin zip instead of itertools.izip, which only exists on Python 2.
            for bamfile, item in zip(align_bams, items):
                # prepare commands; these names feed cmd.format(**locals())
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = (" ".join(_vardict_options_from_config(items, config, out_file, target))
                        if _is_bed_file(target) else "")
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                # min_allele_fraction is configured as a percentage
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(items[0]) > 5000 else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                remove_dup = vcfutils.remove_dup_cl()
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
                cmd = ("{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| {strandbias}"
                       "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
    out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
                if assoc_files.get("dbsnp") else out_file)
    return out_file
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Call SNPs and indels with VarDict, one BAM file at a time.

    The VarDict output is passed through strand bias testing, VCF conversion,
    contig header insertion, a low frequency soft filter and several clean-up
    steps. With several BAMs, per-sample VCFs are produced and then merged.
    When the subset target is not a BED file an empty VCF is written instead.

    var2vcf_valid uses -A flag which reports all alleles and improves sensitivity:
    https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191

    Returns the path of the output VCF (``out_file``).
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    # A previous run already produced the output.
    if utils.file_exists(out_file):
        return out_file

    with file_transaction(items[0], out_file) as tx_out_file:
        vrs = bedutils.population_variant_regions(items)
        target = shared.subset_variant_regions(vrs, region, out_file,
                                               items=items, do_merge=False)
        needs_merge = len(align_bams) > 1
        # per-sample VCFs, needed when batch calling requires a merge step
        sample_vcf_names = []
        for bamfile, item in zip(align_bams, items):
            # NOTE: the names assigned below are substituted into the command
            # template through ``cmd.format(**locals())``; do not rename them.
            sample = dd.get_sample_name(item)
            vardict = get_vardict_command(items[0])
            opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            compress_cmd = ""
            if tx_out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            min_fraction = utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)
            freq = float(min_fraction) / 100.0
            fix_ambig_ref = vcfutils.fix_ambiguous_cl()
            fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
            remove_dup = vcfutils.remove_dup_cl()
            # NOTE(review): py_cl is not referenced by the template below.
            py_cl = os.path.join(utils.get_bcbio_bin(), "py")
            jvm_opts = _get_jvm_opts(items[0], tx_out_file)
            setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
            contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
            lowfreq_filter = _lowfreq_linear_filter(0, False)
            cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                   "-N {sample} -b {bamfile} {opts} "
                   "| teststrandbias.R "
                   "| var2vcf_valid.pl -A -N {sample} -E -f {freq} {var2vcf_opts} "
                   "| {contig_cl} | bcftools filter -i 'QUAL >= 0' | {lowfreq_filter} "
                   "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
            if needs_merge:
                temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                gz_suffix = ".gz" if out_file.endswith("gz") else ""
                tmp_out = temp_file_prefix + ".temp.vcf" + gz_suffix
                sample_vcf_names.append(tmp_out)
                with file_transaction(item, tmp_out) as tx_tmp_file:
                    if _is_bed_file(target):
                        cmd += " > {tx_tmp_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                    else:
                        vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample])
            elif _is_bed_file(target):
                cmd += " > {tx_out_file}"
                do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            else:
                vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample])
        if needs_merge:
            # N.B. merge_variant_files wants region in 1-based end-inclusive
            # coordinates. Thus use bamprep.region_to_gatk
            vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                         out_file=tx_out_file, ref_file=ref_file,
                                         config=config,
                                         region=bamprep.region_to_gatk(region))
    return out_file
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect SNPs and indels with VarDict.

    ``align_bams`` and ``items`` are processed pairwise. A single BAM is
    written directly to the transactional output; several BAMs are written to
    per-sample temporary VCFs that are merged afterwards. When the subset
    target is not a BED file an empty VCF is written instead of running
    VarDict.

    Args:
        align_bams: BAM files to call, parallel to ``items``.
        items: sample dictionaries; ``items[0]["config"]`` supplies the config.
        ref_file: reference genome passed to VarDict (``-G``).
        assoc_files: mapping of associated resources; an optional ``"dbsnp"``
            entry triggers dbSNP annotation of the result.
        region: region used to subset the variant regions and for merging.
        out_file: output VCF; derived from the first BAM name when ``None``.

    Returns:
        Path of the output VCF, or of the dbSNP-annotated VCF when a
        ``"dbsnp"`` resource is available.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs, region, out_file, do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            # Builtin zip instead of itertools.izip, which only exists on Python 2.
            for bamfile, item in zip(align_bams, items):
                # prepare commands; these names feed cmd.format(**locals())
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = (" ".join(_vardict_options_from_config(items, config, out_file, target))
                        if _is_bed_file(target) else "")
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                # min_allele_fraction is configured as a percentage
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
                cmd = ("{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| {strandbias}"
                       "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
    out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
                if assoc_files.get("dbsnp") else out_file)
    return out_file
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect SNPs and indels with VarDict.

    Indexes every BAM, then runs VarDict per BAM / item pair. A single BAM is
    written directly to the transactional output; several BAMs are written to
    per-sample temporary VCFs that are merged afterwards. The result is
    always passed to ``annotation.annotate_nongatk_vcf``.

    Args:
        align_bams: BAM files to call, parallel to ``items``.
        items: sample dictionaries; ``items[0]["config"]`` supplies the config
            and ``item["name"][1]`` the sample name.
        ref_file: reference genome passed to VarDict (``-G``).
        assoc_files: mapping of associated resources; ``"dbsnp"`` is optional.
        region: region passed to the option builder and used for merging.
        out_file: output VCF; derived from the first BAM name when ``None``.

    Returns:
        The value returned by ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            # Builtin zip instead of itertools.izip, which only exists on Python 2.
            for bamfile, item in zip(align_bams, items):
                # prepare commands; these names feed cmd.format(**locals())
                vardict = config_utils.get_program("vardict", config)
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = " ".join(_vardict_options_from_config(items, config, out_file, region))
                vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                # min_allele_fraction is configured as a percentage
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if coverage_interval == "regional" else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                sample = item["name"][1]
                cmd = ("{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| {strandbias}"
                       "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {fix_ambig} | {vcfallelicprimitives} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        cmd += " > {tx_tmp_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    cmd += " > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples. When the batch has no
    normal BAM the call is delegated to ``_run_vardict_caller``. When the
    subset target is not a BED file an empty VCF is written instead.

    Returns the path of the output VCF, annotated with dbSNP when
    ``assoc_files`` provides a ``"dbsnp"`` entry.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            variant_regions = dd.get_variant_regions(items[0])
            target = shared.subset_variant_regions(variant_regions, region,
                                                   out_file, do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                sample_names = [x for x in [paired.tumor_name, paired.normal_name] if x]
                vcfutils.write_empty_vcf(tx_out_file, config, samples=sample_names)
            elif not paired.normal_bam:
                # Tumor-only batch: fall back to the single sample caller.
                return _run_vardict_caller(align_bams, items, ref_file,
                                           assoc_files, region, out_file)
            else:
                # NOTE: names used in the template below are resolved through
                # ``cmd.format(**locals())``; do not rename them.
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                strandbias = "testsomatic.R"
                var2vcf = "var2vcf_paired.pl"
                compress_cmd = ""
                if out_file.endswith("gz"):
                    compress_cmd = "| bgzip -c"
                min_fraction = utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)
                freq = float(min_fraction) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts = " ".join(_vardict_options_from_config(items, config, out_file, target))
                # NOTE(review): coverage_interval is read but not used below.
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                if highdepth.get_median_coverage(items[0]) > 5000:
                    var2vcf_opts = " -v 50 "
                else:
                    var2vcf_opts = ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                filter_disabled = any(
                    "vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                    for data in items)
                if filter_disabled:
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = (
                        "| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                        "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                        "| %s -x 'bcbio.variation.freebayes.call_somatic(x)'"
                        % os.path.join(os.path.dirname(sys.executable), "py"))
                    freq_filter = (
                        "| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                        "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                        % (os.path.join(os.path.dirname(sys.executable), "py"),
                           0, dd.get_aligner(paired.tumor_data)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
                cmd = ("{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                       "| {strandbias} "
                       "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                       "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                       "{freq_filter} "
                       "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                       "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    # Annotate with dbSNP only when the resource is configured.
    if assoc_files.get("dbsnp"):
        out_file = annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
    return out_file
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Streams normal and tumor mpileups into ``varscan somatic`` through shell
    process substitution, post-processes the SNP and indel VCFs into bgzipped
    "-fix" files and combines those into ``out_file``. Nothing is done when
    ``out_file`` already exists. An empty VCF is written when VarScan
    produced no usable output.

    Raises ValueError when the batch has no normal BAM. Returns None.
    """
    max_read_depth = "1000"
    config = items[0]["config"]
    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        affected_batch = items[0]["metadata"]["batch"]
        message = ("Batch {} requires both tumor and normal BAM files for"
                   " VarScan cancer calling").format(affected_batch)
        raise ValueError(message)
    if utils.file_exists(out_file):
        return

    assert out_file.endswith(".vcf.gz"), "Expect bgzipped output to VarScan"
    # NOTE: the names below are substituted into the command templates through
    # ``format(**locals())``; do not rename them.
    normal_mpileup_cl = samtools.prep_mpileup([paired.normal_bam], ref_file,
                                              config, max_read_depth,
                                              target_regions=target_regions,
                                              want_bcf=False)
    tumor_mpileup_cl = samtools.prep_mpileup([paired.tumor_bam], ref_file,
                                             config, max_read_depth,
                                             target_regions=target_regions,
                                             want_bcf=False)
    base, ext = utils.splitext_plus(out_file)
    indel_file = base + "-indel.vcf"
    snp_file = base + "-snp.vcf"
    with file_transaction(config, indel_file, snp_file) as (tx_indel, tx_snp):
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            remove_zerocoverage = r"ifne grep -v -P '\t0\t\t$'"
            varscan_cmd = ("varscan {jvm_opts} somatic "
                           " <({normal_mpileup_cl} | {remove_zerocoverage}) "
                           "<({tumor_mpileup_cl} | {remove_zerocoverage}) "
                           "--output-snp {tx_snp} --output-indel {tx_indel} "
                           " --output-vcf --min-coverage 5 --p-value 0.98 "
                           "--strand-filter 1 ")
            # add minimum AF
            if "--min-var-freq" not in varscan_cmd:
                tumor_fraction = utils.get_in(paired.tumor_config,
                                              ("algorithm", "min_allele_fraction"), 10)
                min_af = float(tumor_fraction) / 100.0
                varscan_cmd += "--min-var-freq {min_af} "
            do.run(varscan_cmd.format(**locals()), "Varscan", None, None)

    # Clean up each VarScan VCF that was produced before combining them.
    to_combine = []
    for fname in [snp_file, indel_file]:
        if not utils.file_exists(fname):
            continue
        fix_file = "%s-fix.vcf.gz" % (utils.splitext_plus(fname)[0])
        with file_transaction(config, fix_file) as tx_fix_file:
            fix_ambig_ref = vcfutils.fix_ambiguous_cl()
            fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
            py_cl = os.path.join(os.path.dirname(sys.executable), "py")
            normal_name = paired.normal_name
            tumor_name = paired.tumor_name
            cmd = ("cat {fname} | "
                   "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x,"
                   """ "{normal_name}", "{tumor_name}")' | """
                   "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles | "
                   """bcftools filter -m + -s REJECT -e "SS != '.' && SS != '2'" 2> /dev/null | """
                   "{py_cl} -x 'bcbio.variation.varscan.spv_freq_filter(x, 1)' | "
                   "bgzip -c > {tx_fix_file}")
            do.run(cmd.format(**locals()), "Varscan paired fix")
        to_combine.append(fix_file)

    if to_combine:
        out_file = combine_variant_files(to_combine, out_file, ref_file,
                                         config, region=target_regions)
    else:
        out_file = write_empty_vcf(out_file, config)
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    if out_file.endswith(".gz"):
        out_file = bgzip_and_index(out_file, config)
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes a samtools mpileup for the normal and the tumor BAM, runs
    ``VarScan somatic`` on the pair, cleans up the SNP and indel VCFs and
    combines them into ``out_file``. Nothing is done when ``out_file``
    already exists.

    Args:
        align_bams: BAM files of the batch, passed to ``get_paired_bams``.
        ref_file: reference genome passed to mpileup and the combine step.
        items: sample dictionaries; ``items[0]["config"]`` supplies the config.
        target_regions: regions passed to mpileup and the combine step.
        out_file: final VCF path; a ``.vcf.gz`` name is produced by bgzipping
            the intermediate ``.vcf``.

    Returns:
        None. Results are communicated through the files written.

    Raises:
        IOError: the installed VarScan is older than 2.3.6.
        ValueError: the batch has no normal BAM.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        affected_batch = items[0]["metadata"]["batch"]
        message = ("Batch {} requires both tumor and normal BAM files for"
                   " VarScan cancer calling").format(affected_batch)
        raise ValueError(message)

    if file_exists(out_file):
        return

    orig_out_file = out_file
    # Work on an uncompressed VCF; compression happens as the last step.
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")
    base = utils.splitext_plus(out_file)[0]
    cleanup_files = []
    for bam_file, mpext in [(paired.normal_bam, "normal"),
                            (paired.tumor_bam, "tumor")]:
        mpfile = "%s-%s.mpileup" % (base, mpext)
        cleanup_files.append(mpfile)
        with file_transaction(config, mpfile) as mpfile_tx:
            mpileup = samtools.prep_mpileup([bam_file], ref_file, config,
                                            max_read_depth,
                                            target_regions=target_regions,
                                            want_bcf=False)
            cmd = "{mpileup} > {mpfile_tx}".format(mpileup=mpileup,
                                                   mpfile_tx=mpfile_tx)
            do.run(cmd, "samtools mpileup", None,
                   [do.file_exists(mpfile_tx)])

    # Sometimes mpileup writes an empty file: in this case we
    # just skip the rest of the analysis (VarScan will hang otherwise)
    if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
        write_empty_vcf(orig_out_file, config)
        return

    # First index is normal, second is tumor
    normal_tmp_mpileup, tumor_tmp_mpileup = cleanup_files[0], cleanup_files[1]

    indel_file = base + ".indel.vcf"
    snp_file = base + ".snp.vcf"
    cleanup_files.append(indel_file)
    cleanup_files.append(snp_file)
    with file_transaction(config, indel_file, snp_file) as (tx_indel, tx_snp):
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            tx_snp_in = "%s-orig" % os.path.splitext(tx_snp)[0]
            tx_indel_in = "%s-orig" % os.path.splitext(tx_indel)[0]
            # minimum AF comes from the tumor sample configuration (percent)
            min_af = float(utils.get_in(paired.tumor_config,
                                        ("algorithm", "min_allele_fraction"),
                                        10)) / 100.0
            varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                           " {normal_tmp_mpileup} {tumor_tmp_mpileup} "
                           "--output-snp {tx_snp_in} --output-indel {tx_indel_in} "
                           " --output-vcf --min-coverage 5 --p-value 0.98 "
                           "--strand-filter 1 "
                           "--min-var-freq {min_af} ")
            do.run(varscan_cmd.format(jvm_opts=jvm_opts,
                                      varscan_jar=varscan_jar,
                                      normal_tmp_mpileup=normal_tmp_mpileup,
                                      tumor_tmp_mpileup=tumor_tmp_mpileup,
                                      tx_snp_in=tx_snp_in,
                                      tx_indel_in=tx_indel_in,
                                      min_af=min_af),
                   "Varscan", None, None)
            for orig_fname, fixed_fname in [(tx_snp_in, tx_snp),
                                            (tx_indel_in, tx_indel)]:
                cmd = "vcfuniqalleles {orig_fname}.vcf | {fix_ambig} > {fname}"
                do.run(cmd.format(orig_fname=orig_fname, fix_ambig=fix_ambig,
                                  fname=fixed_fname),
                       "Varscan paired fix")

    # VarScan files need to be corrected to match the VCF specification
    # We do this before combining them otherwise merging may fail
    # if there are invalid records
    to_combine = []
    for vcf_file in (snp_file, indel_file):
        if do.file_exists(vcf_file):
            to_combine.append(vcf_file)
            _fix_varscan_vcf(vcf_file, paired.normal_name,
                             paired.tumor_name, config)

    if not to_combine:
        write_empty_vcf(orig_out_file, config)
        return

    # Combine only the files collected above: previously both the SNP and the
    # indel file were passed unconditionally, even if one was not produced.
    out_file = combine_variant_files(to_combine, out_file, ref_file, config,
                                     region=target_regions)

    # Remove intermediate mpileup / VCF files and their compressed siblings
    for extra_file in cleanup_files:
        for suffix in ["", ".gz", ".gz.tbi"]:
            if os.path.exists(extra_file + suffix):
                os.remove(extra_file + suffix)

    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)

    if orig_out_file.endswith(".gz"):
        out_file = bgzip_and_index(out_file, config)

    _add_reject_flag(out_file, config)
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples. When the batch has no
    normal BAM the call is delegated to ``_run_vardict_caller``. When the
    subset target is not a BED file an empty VCF is written instead.

    Returns the path of the output VCF.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    with file_transaction(items[0], out_file) as tx_out_file:
        vrs = bedutils.population_variant_regions(items)
        target = shared.subset_variant_regions(vrs, region, out_file,
                                               items=items, do_merge=True)
        paired = vcfutils.get_paired_bams(align_bams, items)
        if not _is_bed_file(target):
            sample_names = [x for x in [paired.tumor_name, paired.normal_name] if x]
            vcfutils.write_empty_vcf(tx_out_file, config, samples=sample_names)
        elif not paired.normal_bam:
            # Tumor-only batch: fall back to the single sample caller.
            return _run_vardict_caller(align_bams, items, ref_file,
                                       assoc_files, region, out_file)
        else:
            # NOTE: names used in the template below are resolved through
            # ``cmd.format(**locals())``; do not rename them.
            vardict = get_vardict_command(items[0])
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            compress_cmd = ""
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            min_fraction = utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)
            freq = float(min_fraction) / 100.0
            # merge bed file regions as amplicon VarDict is only supported in single sample mode
            opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
            fix_ambig_ref = vcfutils.fix_ambiguous_cl()
            fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
            remove_dup = vcfutils.remove_dup_cl()
            filter_disabled = any(
                "vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                for data in items)
            if filter_disabled:
                somatic_filter = ""
                freq_filter = ""
            else:
                var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                somatic_filter = (
                    "| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                    "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                    """| %s -c 'from bcbio.variation import freebayes; """
                    """freebayes.call_somatic("%s", "%s")' """
                    % (sys.executable, paired.tumor_name, paired.normal_name))
                freq_filter = (
                    "| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                    "| %s -x 'bcbio.variation.vardict.add_db_germline_flag(x)' "
                    "| %s "
                    "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                    % (os.path.join(os.path.dirname(sys.executable), "py"),
                       _lowfreq_linear_filter(0, True),
                       os.path.join(os.path.dirname(sys.executable), "py"),
                       0, bam.aligner_from_header(paired.tumor_bam)))
            jvm_opts = _get_jvm_opts(items[0], tx_out_file)
            # NOTE(review): py_cl is not referenced by the template below.
            py_cl = os.path.join(utils.get_bcbio_bin(), "py")
            setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
            contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
            cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                   "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                   "| awk 'NF>=48' | testsomatic.R "
                   "| var2vcf_paired.pl -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                   "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                   "| {contig_cl} {freq_filter} "
                   "| bcftools filter -i 'QUAL >= 0' "
                   "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    return out_file