def _produce_compatible_vcf(out_file, data, is_somatic):
    """Convert octopus legacy output into a VCF downstream tools can read.

    The ``.legacy`` sibling of ``out_file`` is streamed through a shell
    pipeline and written to ``<base>.vcf.gz``:

    - htsjdk and thus GATK and Picard do not support VCF4.3, so the
      ``fileformat`` header is rewritten to VCFv4.2:
      https://github.com/broadinstitute/gatk/issues/2092
    - The octopus legacy format is used to avoid incompatibilities:
      https://github.com/luntergroup/octopus#output-format
    - ``##contig`` lines are fixed, since octopus only writes contigs used in
      the BED file region, causing incompatibilities with GatherVcfs when merging.
    - Alleles prefixed with '*' like 'C,*T' are fixed, since they cause
      downstream failures when reading with GATK.

    For somatic calls the legacy file is first passed through
    ``_covert_to_diploid``. Returns the result of bgzipping and indexing
    ``out_file``.
    """
    prefix, suffix = utils.splitext_plus(out_file)
    legacy_file = "%s.legacy%s" % (prefix, suffix)
    if is_somatic:
        legacy_file = _covert_to_diploid(legacy_file, data)
    final_file = "%s.vcf.gz" % prefix
    # Pick the reader matching the compression of the input.
    if legacy_file.endswith(".gz"):
        cat_cmd = "zcat"
    else:
        cat_cmd = "cat"
    contig_cl = vcfutils.add_contig_to_header_cl(dd.get_ref_file(data), out_file)
    remove_problem_alleles = r"sed 's/,\*\([A-Z]\)/,\1/'"
    template = ("{cat_cmd} {legacy_file} | sed 's/fileformat=VCFv4.3/fileformat=VCFv4.2/' | "
                "{remove_problem_alleles} | {contig_cl} | bgzip -c > {final_file}")
    pipeline = template.format(cat_cmd=cat_cmd,
                               legacy_file=legacy_file,
                               remove_problem_alleles=remove_problem_alleles,
                               contig_cl=contig_cl,
                               final_file=final_file)
    do.run(pipeline, "Produce compatible VCF output file from octopus")
    return vcfutils.bgzip_and_index(out_file, data["config"])
def run(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run tumor-only smCounter2 variant calling.

    smCounter2 is run inside a transaction directory; every ``*.smCounter*``
    file it produces is moved next to ``out_file`` and the
    ``<prefix>.smCounter.cut.vcf`` result is linked to ``out_file``.

    Returns the result of ``vcfutils.bgzip_and_index`` on the uncompressed
    ``out_file`` name, using a prep command that replaces the output prefix
    with the tumor sample name in the ``FORMAT`` header column and adds
    contig header lines.

    Raises AssertionError when the inputs are not a tumor-only sample.
    """
    paired = vcfutils.get_paired_bams(align_bams, items)
    # The message previously named Pisces (copy/paste); this caller is smCounter2.
    assert paired and not paired.normal_bam, (
        "smCounter2 supports tumor-only variant calling: %s" %
        (",".join([dd.get_sample_name(d) for d in items])))
    vrs = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(vrs, region, out_file, items=items, do_merge=True)
    # Work with the uncompressed name here; compression happens in bgzip_and_index.
    out_file = out_file.replace(".vcf.gz", ".vcf")
    out_prefix = utils.splitext_plus(os.path.basename(out_file))[0]
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            tx_dir = os.path.dirname(tx_out_file)
            final_dir = os.path.dirname(out_file)
            cmd = ["smCounter2", "--runPath", tx_dir,
                   "--outPrefix", out_prefix,
                   "--bedTarget", target, "--refGenome", ref_file,
                   "--bamFile", paired.tumor_bam, "--bamType", "consensus",
                   "--nCPU", dd.get_num_cores(paired.tumor_data)]
            do.run(cmd, "smcounter2 variant calling")
            # smCounter2 names its outputs from the prefix; relocate all of them.
            for fname in glob.glob(os.path.join(tx_dir, "*.smCounter*")):
                shutil.move(fname, os.path.join(final_dir, os.path.basename(fname)))
            utils.symlink_plus(os.path.join(final_dir, "%s.smCounter.cut.vcf" % out_prefix),
                               out_file)
    contig_cl = vcfutils.add_contig_to_header_cl(dd.get_ref_file(paired.tumor_data), out_file)
    prep_cmd = ("sed 's#FORMAT\t%s#FORMAT\t%s#' | %s" %
                (out_prefix, dd.get_sample_name(paired.tumor_data), contig_cl))
    return vcfutils.bgzip_and_index(out_file, paired.tumor_data["config"], remove_orig=False,
                                    prep_cmd=prep_cmd)
def _produce_compatible_vcf(out_file, data, is_somatic=False):
    """Create a compatible VCF that downstream tools can deal with.

    - htsjdk and thus GATK and Picard do not support VCF4.3:
      https://github.com/broadinstitute/gatk/issues/2092
    - Use octopus legacy format to avoid incompatibilities.
      https://github.com/luntergroup/octopus#output-format
    - Fixes `##contig` lines since octopus only writes contigs used in the BED file
      region, causing incompatibilities with GatherVcfs when merging
    - Fixes alleles prefixed with '*' like 'C,*T' which cause downstream failures
      when reading with GATK.
    - Changes phase set (PS) header to be type Integer.

    The input is ``out_file`` itself (or the result of ``_covert_to_diploid``
    for somatic calls) and the pipeline output is ``<base>.vcf.gz``. When both
    resolve to the same path the pipeline output is staged in a temporary
    sibling file and renamed afterwards, because a shell redirection onto the
    file being read truncates it before it can be consumed.

    Returns the result of bgzipping and indexing ``out_file``.
    """
    import os

    base = utils.splitext_plus(out_file)[0]
    # The octopus output is read directly from out_file (no separate ".legacy" file).
    legacy_file = out_file
    if is_somatic:
        legacy_file = _covert_to_diploid(legacy_file, data)
    final_file = "%s.vcf.gz" % base
    # Never let the shell open the input file for writing while it is being read.
    in_place = os.path.abspath(legacy_file) == os.path.abspath(final_file)
    pipe_out_file = "%s.compat-tmp" % final_file if in_place else final_file
    cat_cmd = "zcat" if legacy_file.endswith(".gz") else "cat"
    contig_cl = vcfutils.add_contig_to_header_cl(dd.get_ref_file(data), out_file)
    remove_problem_alleles = r"sed 's/,\*\([A-Z]\)/,\1/'"
    fix_phasing_header = r"sed 's/ID=PS,Number=1,Type=String/ID=PS,Number=1,Type=Integer/'"
    cmd = (
        "{cat_cmd} {legacy_file} | sed 's/fileformat=VCFv4.3/fileformat=VCFv4.2/' | "
        "{remove_problem_alleles} | {fix_phasing_header} | {contig_cl} | bgzip -c > {pipe_out_file}"
    )
    do.run(cmd.format(cat_cmd=cat_cmd, legacy_file=legacy_file,
                      remove_problem_alleles=remove_problem_alleles,
                      fix_phasing_header=fix_phasing_header,
                      contig_cl=contig_cl, pipe_out_file=pipe_out_file),
           "Produce compatible VCF output file from octopus")
    if in_place:
        # Only replace the input once the full pipeline has finished successfully.
        os.rename(pipe_out_file, final_file)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect indels with Scalpel for paired tumor / normal samples.

    When no normal BAM is available the work is delegated to
    ``_run_scalpel_caller`` (single sample mode). Otherwise Scalpel somatic
    discovery is run into a ``-scalpel-work`` directory (skipped when its
    ``somatic.db.dir`` already exists), and the somatic and common indel VCFs
    are filtered, concatenated, sorted and given contig headers.

    Returns the path of the output VCF.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(config, out_file) as tx_out_file:
        paired = get_paired_bams(align_bams, items)
        if not paired.normal_bam:
            # Tumor-only input: fall back to the single sample caller.
            return _run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                       region, out_file)
        vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
        perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
        tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
        db_file = os.path.join(tmp_path, "main", "somatic.db")
        if not os.path.exists(db_file + ".dir"):
            # No usable database from a previous run: start from a clean work directory.
            if os.path.exists(tmp_path):
                utils.remove_safe(tmp_path)
            opts = (" ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
                    + " --ref {}".format(ref_file)
                    + " --dir %s" % tmp_path)
            # calling
            discovery_cl = ("{perl_exports} && "
                            "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
            do.run(discovery_cl.format(perl_exports=perl_exports, opts=opts, paired=paired),
                   "Genotyping paired variants with Scalpel", {})
        # filtering to adjust input parameters
        bed_opts = " ".join(_scalpel_bed_file_opts(items, config, out_file, region, tmp_path))
        use_defaults = True
        if use_defaults:
            scalpel_tmp_file = os.path.join(tmp_path, "main/somatic.indel.vcf")
        else:
            # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher
            # to swap precision for sensitivity
            scalpel_tmp_file = os.path.join(tmp_path, "main/somatic-indel-filter.vcf.gz")
            with file_transaction(config, scalpel_tmp_file) as tx_indel_file:
                export_cl = ("{perl_exports} && "
                             "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                             "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                             "| bgzip -c > {tx_indel_file}")
                do.run(export_cl.format(perl_exports=perl_exports, bed_opts=bed_opts,
                                        ref_file=ref_file, db_file=db_file,
                                        tx_indel_file=tx_indel_file),
                       "Scalpel somatic indel filter", {})
        scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
        scalpel_tmp_file_common = bgzip_and_index(os.path.join(tmp_path, "main/common.indel.vcf"), config)
        compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
        bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
        bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
        fix_ambig = vcfutils.fix_ambiguous_cl()
        add_contig = vcfutils.add_contig_to_header_cl(dd.get_ref_file(items[0]), tx_out_file)
        finalize_cl = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                       "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                       " {fix_ambig} | {vcfstreamsort} | {add_contig} {compress_cmd} > {tx_out_file}")
        do.run(finalize_cl.format(bcftools_cmd_chi2=bcftools_cmd_chi2,
                                  scalpel_tmp_file=scalpel_tmp_file,
                                  bcftools_cmd_common=bcftools_cmd_common,
                                  scalpel_tmp_file_common=scalpel_tmp_file_common,
                                  fix_ambig=fix_ambig, vcfstreamsort=vcfstreamsort,
                                  add_contig=add_contig, compress_cmd=compress_cmd,
                                  tx_out_file=tx_out_file),
               "Finalising Scalpel variants", {})
    return out_file
def _add_contig_cl(in_file, items):
    """Return a command to add contig header lines when ``in_file`` lacks them.

    Only the leading ``##`` header lines of ``in_file`` are scanned. Returns
    None when a ``##contig`` line is present, otherwise the command produced by
    ``vcfutils.add_contig_to_header_cl``.
    """
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if line.startswith("##contig"):
                # Contigs already declared: nothing to add.
                return None
            if not line.startswith("##"):
                # Reached the end of the meta-information header.
                break
    # NOTE(review): the helper is called with a single item here; confirm this
    # matches the current signature of vcfutils.add_contig_to_header_cl.
    return vcfutils.add_contig_to_header_cl(items[0])
def _run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect indels with Scalpel in single sample mode.

    Scalpel discovery runs in a transactional ``-scalpel-work`` directory which
    is then moved next to ``out_file``. The resulting ``variants.indel.vcf`` is
    filtered, renamed to the sample name, decomposed, sorted and given contig
    headers.

    Returns the path of the output VCF.
    Raises ValueError when more than one BAM file is supplied.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(config, out_file) as tx_out_file:
        if len(align_bams) > 1:
            raise ValueError("Scalpel does not currently support batch calling!")
        input_bams = " ".join(["%s" % bam_path for bam_path in align_bams])
        tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
        tx_tmp_path = "%s-scalpel-work" % utils.splitext_plus(tx_out_file)[0]
        # Clear leftovers from a previous run before moving new results in place.
        if os.path.exists(tmp_path):
            utils.remove_safe(tmp_path)
        min_cov = "3"  # minimum coverage
        opts = (" ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
                + " --dir %s" % tx_tmp_path
                + " --mincov %s" % min_cov)
        perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
        discovery_cl = ("{perl_exports} && "
                        "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} ")
        do.run(discovery_cl.format(perl_exports=perl_exports, opts=opts,
                                   ref_file=ref_file, input_bams=input_bams),
               "Genotyping with Scalpel", {})
        shutil.move(tx_tmp_path, tmp_path)
        # parse produced variant file further
        scalpel_tmp_file = bgzip_and_index(os.path.join(tmp_path, "variants.indel.vcf"), config)
        compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
        bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
        sample_name_str = items[0]["name"][1]
        fix_ambig = vcfutils.fix_ambiguous_cl()
        add_contig = vcfutils.add_contig_to_header_cl(dd.get_ref_file(items[0]), tx_out_file)
        finalize_cl = ("{bcftools_cmd_chi2} {scalpel_tmp_file} | "
                       r"sed 's/FORMAT\tsample\(_name\)\{{0,1\}}/FORMAT\t{sample_name_str}/g' "
                       "| {fix_ambig} | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort "
                       "| {add_contig} {compress_cmd} > {tx_out_file}")
        do.run(finalize_cl.format(bcftools_cmd_chi2=bcftools_cmd_chi2,
                                  scalpel_tmp_file=scalpel_tmp_file,
                                  sample_name_str=sample_name_str,
                                  fix_ambig=fix_ambig, add_contig=add_contig,
                                  compress_cmd=compress_cmd, tx_out_file=tx_out_file),
               "Finalising Scalpel variants", {})
    return out_file
def _add_contig_cl(in_file, items):
    """Build the contig header command for ``in_file`` if it has no contigs.

    Reads ``in_file`` until the first line that is not a ``##`` header line.
    Returns None if a ``##contig`` line was seen, otherwise the command
    returned by ``vcfutils.add_contig_to_header_cl``.
    """
    with utils.open_gzipsafe(in_file) as in_handle:
        for header_line in in_handle:
            if header_line.startswith("##contig"):
                # Header already declares contigs.
                return None
            if not header_line.startswith("##"):
                # Past the meta-information lines; stop scanning.
                break
    # NOTE(review): single-argument call; verify against the current signature
    # of vcfutils.add_contig_to_header_cl.
    return vcfutils.add_contig_to_header_cl(items[0])
def map_coords_to_ucsc(grc_cosmic, ref_file, out_file):
    """Write a copy of ``grc_cosmic`` with UCSC-style (chr-prefixed) contig names.

    The hg19 reference path is derived from ``ref_file`` by replacing "GRCh37"
    with "hg19" and is used to build the contig header. Nothing is regenerated
    when ``out_file`` already exists. Returns the bgzipped and indexed
    ``out_file``.
    """
    hg19_ref_file = ref_file.replace("GRCh37", "hg19")
    if not os.path.exists(out_file):
        contig_cl = vcfutils.add_contig_to_header_cl(hg19_ref_file, out_file)
        template = ("zcat {grc_cosmic} | "
                    r'sed "s/^\([0-9]\+\)\t/chr\1\t/g" | sed "s/^MT/chrM/g" | sed "s/^X/chrX/g" | sed "s/^Y/chrY/g" '
                    "| {contig_cl} "
                    "| bgzip -c > {out_file}")
        subprocess.check_call(
            template.format(grc_cosmic=grc_cosmic, contig_cl=contig_cl, out_file=out_file),
            shell=True)
        # Drop the intermediate header file if one was left behind.
        if os.path.exists("%s-header.txt" % utils.splitext_plus(out_file)[0]):
            os.remove("%s-header.txt" % utils.splitext_plus(out_file)[0])
    return vcfutils.bgzip_and_index(out_file, {})
def map_coords_to_ucsc(grc_cosmic, ref_file, out_file):
    """Convert a GRCh37 COSMIC VCF to hg19 (chr-prefixed) contig naming.

    Numeric contigs, MT, X and Y at the start of each line are renamed with sed
    and contig header lines are added from the hg19 reference, whose path is
    ``ref_file`` with "GRCh37" replaced by "hg19". Existing output is reused.
    Returns the bgzipped and indexed ``out_file``.
    """
    hg19_ref_file = ref_file.replace("GRCh37", "hg19")
    needs_build = not os.path.exists(out_file)
    if needs_build:
        contig_cl = vcfutils.add_contig_to_header_cl(hg19_ref_file, out_file)
        rename_contigs = (
            r'sed "s/^\([0-9]\+\)\t/chr\1\t/g" | sed "s/^MT/chrM/g" | sed "s/^X/chrX/g" | sed "s/^Y/chrY/g" ')
        pipeline = ("zcat {grc_cosmic} | " + rename_contigs +
                    "| {contig_cl} "
                    "| bgzip -c > {out_file}")
        subprocess.check_call(
            pipeline.format(grc_cosmic=grc_cosmic, contig_cl=contig_cl, out_file=out_file),
            shell=True)
        # Remove the temporary header file when present.
        if os.path.exists("%s-header.txt" % utils.splitext_plus(out_file)[0]):
            os.remove("%s-header.txt" % utils.splitext_plus(out_file)[0])
    return vcfutils.bgzip_and_index(out_file, {})
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.

    BAM inputs are indexed, platypus writes calls to stdout and the stream is
    post-processed (ambiguous base fixes, contig headers, allelic primitive
    decomposition, sorting) into a bgzipped ``out_file``, which is then
    indexed.

    User supplied ``options`` from the platypus resources are appended to the
    command line; ``key=value`` options are split into separate arguments.

    Returns the path of the output VCF. ``out_file`` must end in ``.vcf.gz``.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = ["platypus", "callVariants", "--regions=%s" % _subset_regions(region, out_file, items),
                   "--bamFiles=%s" % ",".join(align_bams),
                   "--refFile=%s" % dd.get_ref_file(items[0]), "--output=-",
                   "--logFileName", "/dev/null", "--verbosity=1"]
            resources = config_utils.get_resources("platypus", items[0]["config"])
            if resources.get("options"):
                # normalize options so we can set defaults without overwriting user specified
                for opt in resources["options"]:
                    if "=" in opt:
                        # Split on the first "=" only, so values that themselves
                        # contain "=" do not raise on unpacking.
                        key, val = opt.split("=", 1)
                        cmd.extend([key, val])
                    else:
                        cmd.append(opt)
            if any("gvcf" in dd.get_tools_on(d) for d in items):
                cmd += ["--outputRefCalls", "1", "--refCallBlockSize", "50000"]
            # Adjust default filter thresholds to achieve similar sensitivity/specificity to other callers
            # Currently not used after doing more cross validation as they increase false positives
            # which seems to be a major advantage for Platypus users.
            # tuned_opts = ["--hapScoreThreshold", "10", "--scThreshold", "0.99", "--filteredReadsFrac", "0.9",
            #               "--rmsmqThreshold", "20", "--qdThreshold", "0", "--abThreshold", "0.0001",
            #               "--minVarFreq", "0.0", "--assemble", "1"]
            # for okey, oval in utils.partition_all(2, tuned_opts):
            #     if okey not in cmd:
            #         cmd.extend([okey, oval])

            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            if any(not dd.get_mark_duplicates(data) for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = (
                " | %s | %s | %s | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | "
                "vcfstreamsort | bgzip -c > %s" %
                (vcfutils.fix_ambiguous_cl(), vcfutils.fix_ambiguous_cl(5),
                 vcfutils.add_contig_to_header_cl(dd.get_ref_file(items[0]), tx_out_file),
                 tx_out_file))
            do.run(" ".join(cmd) + post_process_cmd, "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
def sort_to_ref(fname, ref_file, add_chr):
    """Sort the variants in ``fname`` to match the reference genome ordering.

    Output goes to ``<fname without .vcf.gz>-prep.vcf.gz`` and is only
    generated when missing. With ``add_chr`` set, numeric, MT, X and Y contig
    names are given a "chr" prefix before sorting against ``<ref_file>.fai``.
    Returns the bgzipped and indexed output file.
    """
    out_file = "%s-prep.vcf.gz" % (fname.replace(".vcf.gz", ""))
    if not os.path.exists(out_file):
        # Optional extra pipeline stage renaming contigs to UCSC style.
        fix_chrom = (
            r'| sed "s/^\([0-9]\+\)\t/chr\1\t/g" | sed "s/^MT/chrM/g" | sed "s/^X/chrX/g" | sed "s/^Y/chrY/g" '
            if add_chr else '')
        contig_cl = vcfutils.add_contig_to_header_cl(ref_file, out_file)
        template = ("gunzip -c {fname} {fix_chrom} | "
                    "gsort /dev/stdin {ref_file}.fai | {contig_cl} | "
                    "bgzip -c > {out_file}")
        subprocess.check_call(
            template.format(fname=fname, fix_chrom=fix_chrom, ref_file=ref_file,
                            contig_cl=contig_cl, out_file=out_file),
            shell=True)
    return vcfutils.bgzip_and_index(out_file, {})
def sort_to_ref(fname, ref_file, add_chr):
    """Sort the variants in ``fname`` to match the reference genome ordering.

    Output goes to ``<fname without .vcf.gz>-prep.vcf.gz`` and is only
    generated when missing. With ``add_chr`` set, numeric, MT, X and Y contig
    names are given a "chr" prefix. Records are passed through
    ``bcftools norm --check-ref s --do-not-normalize`` against ``ref_file``
    before being sorted against ``<ref_file>.fai``.
    Returns the bgzipped and indexed output file.
    """
    out_file = "%s-prep.vcf.gz" % (fname.replace(".vcf.gz", ""))
    if not os.path.exists(out_file):
        # Optional extra pipeline stage renaming contigs to UCSC style.
        fix_chrom = (
            r'| sed "s/^\([0-9]\+\)\t/chr\1\t/g" | sed "s/^MT/chrM/g" | sed "s/^X/chrX/g" | sed "s/^Y/chrY/g" '
            if add_chr else '')
        contig_cl = vcfutils.add_contig_to_header_cl(ref_file, out_file)
        template = ("gunzip -c {fname} {fix_chrom} | "
                    "bcftools norm --check-ref s --do-not-normalize -f {ref_file} |"
                    "gsort /dev/stdin {ref_file}.fai | {contig_cl} | "
                    "bgzip -c > {out_file}")
        subprocess.check_call(
            template.format(fname=fname, fix_chrom=fix_chrom, ref_file=ref_file,
                            contig_cl=contig_cl, out_file=out_file),
            shell=True)
    return vcfutils.bgzip_and_index(out_file, {})
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.

    BAM inputs are indexed, platypus writes calls to stdout and the stream is
    post-processed (ambiguous base fixes, contig headers, allelic primitive
    decomposition, sorting) into a bgzipped ``out_file``, which is then
    indexed.

    User supplied ``options`` from the platypus resources are appended to the
    command line; ``key=value`` options are split into separate arguments.

    Returns the path of the output VCF. ``out_file`` must end in ``.vcf.gz``.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = ["platypus", "callVariants", "--regions=%s" % _subset_regions(region, out_file, items),
                   "--bamFiles=%s" % ",".join(align_bams),
                   "--refFile=%s" % dd.get_ref_file(items[0]), "--output=-",
                   "--logFileName", "/dev/null", "--verbosity=1"]
            resources = config_utils.get_resources("platypus", items[0]["config"])
            if resources.get("options"):
                # normalize options so we can set defaults without overwriting user specified
                for opt in resources["options"]:
                    if "=" in opt:
                        # Split on the first "=" only, so values that themselves
                        # contain "=" do not raise on unpacking.
                        key, val = opt.split("=", 1)
                        cmd.extend([key, val])
                    else:
                        cmd.append(opt)
            if any("gvcf" in dd.get_tools_on(d) for d in items):
                cmd += ["--outputRefCalls", "1", "--refCallBlockSize", "50000"]
            # Adjust default filter thresholds to achieve similar sensitivity/specificity to other callers
            # Currently not used after doing more cross validation as they increase false positives
            # which seems to be a major advantage for Platypus users.
            # tuned_opts = ["--hapScoreThreshold", "10", "--scThreshold", "0.99", "--filteredReadsFrac", "0.9",
            #               "--rmsmqThreshold", "20", "--qdThreshold", "0", "--abThreshold", "0.0001",
            #               "--minVarFreq", "0.0", "--assemble", "1"]
            # for okey, oval in utils.partition_all(2, tuned_opts):
            #     if okey not in cmd:
            #         cmd.extend([okey, oval])

            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            if any(not dd.get_mark_duplicates(data) for data in items):
                cmd += ["--filterDuplicates=0"]
            # The contig header helper is called with (reference file, output
            # file), the form used by the other callers of this helper.
            contig_cl = vcfutils.add_contig_to_header_cl(dd.get_ref_file(items[0]), tx_out_file)
            post_process_cmd = (
                " | %s | %s | %s | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | "
                "vcfstreamsort | bgzip -c > %s" %
                (vcfutils.fix_ambiguous_cl(), vcfutils.fix_ambiguous_cl(5), contig_cl, tx_out_file))
            do.run(" ".join(cmd) + post_process_cmd, "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
def sort_to_ref(fname, ref_file, add_chr):
    """Sort the variants in ``fname`` to match the reference genome ordering.

    Output goes to ``<fname without .vcf.gz>-prep.vcf.gz`` and is only
    generated when missing. With ``add_chr`` set, numeric, MT, X and Y contig
    names are given a "chr" prefix. Records are passed through
    ``bcftools norm --check-ref s --do-not-normalize`` and
    ``bcftools view -e 'SNP=1'`` before being sorted against
    ``<ref_file>.fai``. Progress is reported through ``logging.info``.
    Returns the bgzipped and indexed output file.
    """
    logging.info(f"Sorting {fname} to match the order of {ref_file}.")
    out_file = "%s-prep.vcf.gz" % (fname.replace(".vcf.gz", ""))
    if not os.path.exists(out_file):
        # Optional extra pipeline stage renaming contigs to UCSC style.
        fix_chrom = (
            r'| sed "s/^\([0-9]\+\)\t/chr\1\t/g" | sed "s/^MT/chrM/g" | sed "s/^X/chrX/g" | sed "s/^Y/chrY/g" '
            if add_chr else '')
        contig_cl = vcfutils.add_contig_to_header_cl(ref_file, out_file)
        template = ("gunzip -c {fname} {fix_chrom} | "
                    "bcftools norm --check-ref s --do-not-normalize -f {ref_file} |"
                    "bcftools view -e 'SNP=1' |"
                    "gsort /dev/stdin {ref_file}.fai | {contig_cl} | "
                    "bgzip -c > {out_file}")
        subprocess.check_call(
            template.format(fname=fname, fix_chrom=fix_chrom, ref_file=ref_file,
                            contig_cl=contig_cl, out_file=out_file),
            shell=True)
    logging.info(f"bgzipping and indexing {out_file}.")
    return vcfutils.bgzip_and_index(out_file, {})
def _run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect indels with Scalpel in single sample mode.

    Scalpel discovery runs in a transactional ``-scalpel-work`` directory which
    is then moved next to ``out_file``. The resulting ``variants.indel.vcf`` is
    filtered, renamed to the sample name, decomposed, sorted and given contig
    headers.

    Returns the path of the output VCF.
    Raises ValueError when more than one BAM file is supplied.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(config, out_file) as tx_out_file:
        if len(align_bams) > 1:
            raise ValueError("Scalpel does not currently support batch calling!")
        input_bams = " ".join(["%s" % bam_path for bam_path in align_bams])
        tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
        tx_tmp_path = "%s-scalpel-work" % utils.splitext_plus(tx_out_file)[0]
        # Clear leftovers from a previous run before moving new results in place.
        if os.path.exists(tmp_path):
            utils.remove_safe(tmp_path)
        min_cov = "3"  # minimum coverage
        opts = (" ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
                + " --dir %s" % tx_tmp_path
                + " --mincov %s" % min_cov)
        perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
        discovery_cl = ("{perl_exports} && "
                        "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} ")
        do.run(discovery_cl.format(perl_exports=perl_exports, opts=opts,
                                   ref_file=ref_file, input_bams=input_bams),
               "Genotyping with Scalpel", {})
        shutil.move(tx_tmp_path, tmp_path)
        # parse produced variant file further
        scalpel_tmp_file = bgzip_and_index(os.path.join(tmp_path, "variants.indel.vcf"), config)
        compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
        bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
        sample_name_str = items[0]["name"][1]
        fix_ambig = vcfutils.fix_ambiguous_cl()
        add_contig = vcfutils.add_contig_to_header_cl(dd.get_ref_file(items[0]), tx_out_file)
        finalize_cl = ("{bcftools_cmd_chi2} {scalpel_tmp_file} | "
                       r"sed 's/FORMAT\tsample\(_name\)\{{0,1\}}/FORMAT\t{sample_name_str}/g' "
                       "| {fix_ambig} | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort "
                       "| {add_contig} {compress_cmd} > {tx_out_file}")
        do.run(finalize_cl.format(bcftools_cmd_chi2=bcftools_cmd_chi2,
                                  scalpel_tmp_file=scalpel_tmp_file,
                                  sample_name_str=sample_name_str,
                                  fix_ambig=fix_ambig, add_contig=add_contig,
                                  compress_cmd=compress_cmd, tx_out_file=tx_out_file),
               "Finalising Scalpel variants", {})
    return out_file
def run(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run tumor only pisces calling

    Handles bgzipping output file and fixing VCF sample naming to match BAM sample.

    Pisces writes ``<tumor BAM basename>.vcf`` into the transaction directory;
    that file becomes the raw ``.vcf`` next to ``out_file`` and is compressed
    with a prep command that replaces the BAM name with the sample name and
    adds contig header lines.

    Raises AssertionError when the inputs are not a tumor-only sample.
    """
    paired = vcfutils.get_paired_bams(align_bams, items)
    assert paired and not paired.normal_bam, (
        "Pisces supports tumor-only variant calling: %s" %
        (",".join([dd.get_sample_name(d) for d in items])))
    vrs = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(vrs, region, out_file, items=items, do_merge=True)
    # Configured minimum allele fraction is a percentage; pisces takes a fraction.
    min_af = float(dd.get_min_allele_fraction(paired.tumor_data)) / 100.0
    if not utils.file_exists(out_file):
        base_out_name = utils.splitext_plus(os.path.basename(paired.tumor_bam))[0]
        raw_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
        with file_transaction(paired.tumor_data, raw_file) as tx_out_file:
            ref_dir = _prep_genome(os.path.dirname(tx_out_file), paired.tumor_data)
            out_dir = os.path.dirname(tx_out_file)
            cores = dd.get_num_cores(paired.tumor_data)
            template = (
                "pisces --bampaths {paired.tumor_bam} --genomepaths {ref_dir} --intervalpaths {target} "
                "--maxthreads {cores} --minvf {min_af} --ploidy somatic --gvcf false -o {out_dir}")
            do.run(template.format(paired=paired, ref_dir=ref_dir, target=target,
                                   cores=cores, min_af=min_af, out_dir=out_dir),
                   "Pisces tumor-only somatic calling")
            # Pisces names its output after the input BAM; move it to the expected name.
            shutil.move(os.path.join(out_dir, "%s.vcf" % base_out_name), tx_out_file)
        contig_cl = vcfutils.add_contig_to_header_cl(dd.get_ref_file(paired.tumor_data), out_file)
        rename_and_fix_header = ("sed 's#%s.bam#%s#' | %s" %
                                 (base_out_name, dd.get_sample_name(paired.tumor_data), contig_cl))
        vcfutils.bgzip_and_index(raw_file, paired.tumor_data["config"],
                                 prep_cmd=rename_and_fix_header)
    return vcfutils.bgzip_and_index(out_file, paired.tumor_data["config"])
def run(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run tumor only pisces calling

    Handles bgzipping output file and fixing VCF sample naming to match BAM sample.

    Variants are emitted down to a tenth of the configured minimum allele
    fraction (``--minvf``) while the configured fraction itself is passed as
    ``--vffilter``. Pisces writes ``<tumor BAM basename>.vcf`` into the
    transaction directory; that file becomes the raw ``.vcf`` next to
    ``out_file`` and is compressed with a prep command that replaces the BAM
    name with the sample name and adds contig header lines.

    Raises AssertionError when the inputs are not a tumor-only sample.
    """
    paired = vcfutils.get_paired_bams(align_bams, items)
    assert paired and not paired.normal_bam, (
        "Pisces supports tumor-only variant calling: %s" %
        (",".join([dd.get_sample_name(d) for d in items])))
    vrs = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(vrs, region, out_file, items=items, do_merge=True)
    # Configured minimum allele fraction is a percentage; pisces takes a fraction.
    min_af = float(dd.get_min_allele_fraction(paired.tumor_data)) / 100.0
    if not utils.file_exists(out_file):
        base_out_name = utils.splitext_plus(os.path.basename(paired.tumor_bam))[0]
        raw_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
        with file_transaction(paired.tumor_data, raw_file) as tx_out_file:
            ref_dir = _prep_genome(os.path.dirname(tx_out_file), paired.tumor_data)
            out_dir = os.path.dirname(tx_out_file)
            cores = dd.get_num_cores(paired.tumor_data)
            emit_min_af = min_af / 10.0
            cmd_parts = [
                "pisces --bampaths {paired.tumor_bam} --genomepaths {ref_dir} --intervalpaths {target} "
                "--maxthreads {cores} --minvf {emit_min_af} --vffilter {min_af} "
                "--ploidy somatic --gvcf false -o {out_dir}",
                # Recommended filtering for low frequency indels
                # https://github.com/bcbio/bcbio-nextgen/commit/49d0cbb1f6dcbea629c63749e2f9813bd06dcee3#commitcomment-29765373
                " -RMxNFilter 5,9,0.35",
            ]
            # For low frequency UMI tagged variants, set higher variant thresholds
            # https://github.com/Illumina/Pisces/issues/14#issuecomment-399756862
            if min_af < (1.0 / 100.0):
                cmd_parts.append(" --minbasecallquality 30")
            template = "".join(cmd_parts)
            do.run(template.format(paired=paired, ref_dir=ref_dir, target=target,
                                   cores=cores, emit_min_af=emit_min_af,
                                   min_af=min_af, out_dir=out_dir),
                   "Pisces tumor-only somatic calling")
            # Pisces names its output after the input BAM; move it to the expected name.
            shutil.move(os.path.join(out_dir, "%s.vcf" % base_out_name), tx_out_file)
        contig_cl = vcfutils.add_contig_to_header_cl(dd.get_ref_file(paired.tumor_data), out_file)
        rename_and_fix_header = ("sed 's#%s.bam#%s#' | %s" %
                                 (base_out_name, dd.get_sample_name(paired.tumor_data), contig_cl))
        vcfutils.bgzip_and_index(raw_file, paired.tumor_data["config"],
                                 prep_cmd=rename_and_fix_header)
    return vcfutils.bgzip_and_index(out_file, paired.tumor_data["config"])
def _produce_compatible_vcf(out_file, data):
    """Convert octopus legacy output into a VCF downstream tools can read.

    The ``.legacy`` sibling of ``out_file`` is streamed through a shell
    pipeline and written to ``<base>.vcf.gz``:

    - htsjdk and thus GATK and Picard do not support VCF4.3, so the
      ``fileformat`` header is rewritten to VCFv4.2:
      https://github.com/broadinstitute/gatk/issues/2092
    - The octopus legacy format is used to avoid incompatibilities:
      https://github.com/luntergroup/octopus#output-format
    - ``##contig`` lines are fixed, since octopus only writes contigs used in
      the BED file region, causing incompatibilities with GatherVcfs when merging.

    Returns the result of bgzipping and indexing ``out_file``.
    """
    prefix, suffix = utils.splitext_plus(out_file)
    legacy_file = "%s.legacy%s" % (prefix, suffix)
    final_file = "%s.vcf.gz" % prefix
    # Pick the reader matching the compression of the input.
    if legacy_file.endswith(".gz"):
        cat_cmd = "zcat"
    else:
        cat_cmd = "cat"
    contig_cl = vcfutils.add_contig_to_header_cl(dd.get_ref_file(data), out_file)
    template = (
        "{cat_cmd} {legacy_file} | sed 's/fileformat=VCFv4.3/fileformat=VCFv4.2/' | "
        "{contig_cl} | bgzip -c > {final_file}")
    pipeline = template.format(cat_cmd=cat_cmd, legacy_file=legacy_file,
                               contig_cl=contig_cl, final_file=final_file)
    do.run(pipeline, "Produce compatible VCF output file from octopus")
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect variants with VarDict for paired tumor / normal samples.

    Writes an empty VCF when the subset target is not a BED file, and delegates
    to ``_run_vardict_caller`` when there is no normal BAM. Otherwise runs the
    VarDict paired pipeline (testsomatic.R, var2vcf_paired.pl) followed by
    contig header, frequency, quality and somatic filtering steps.

    The somatic and frequency filters are skipped when any sample lists
    ``vardict_somatic_filter`` in ``tools_off``.

    Returns the path of the output VCF.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(items[0], out_file) as tx_out_file:
        target = shared.subset_variant_regions(dd.get_variant_regions(items[0]), region,
                                               out_file, do_merge=True)
        paired = vcfutils.get_paired_bams(align_bams, items)
        if not _is_bed_file(target):
            # Nothing to call in this region: emit a header-only VCF.
            present_names = [x for x in [paired.tumor_name, paired.normal_name] if x]
            vcfutils.write_empty_vcf(tx_out_file, config, samples=present_names)
            return out_file
        if not paired.normal_bam:
            # Tumor-only input: use the single sample caller instead.
            return _run_vardict_caller(align_bams, items, ref_file,
                                       assoc_files, region, out_file)
        # NOTE: the command template below is filled from locals(), so the
        # variable names in this function are part of the command definition.
        vardict = get_vardict_command(items[0])
        vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
        strandbias = "testsomatic.R"
        var2vcf = "var2vcf_paired.pl"
        compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
        freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
        # merge bed file regions as amplicon VarDict is only supported in single sample mode
        opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
        fix_ambig_ref = vcfutils.fix_ambiguous_cl()
        fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
        remove_dup = vcfutils.remove_dup_cl()
        skip_somatic_filter = any(
            "vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
            for data in items)
        if skip_somatic_filter:
            somatic_filter = ""
            freq_filter = ""
        else:
            var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
            somatic_filter = (
                "| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                """| %s -c 'from bcbio.variation import freebayes; """
                """freebayes.call_somatic("%s", "%s")' """
                % (sys.executable, paired.tumor_name, paired.normal_name))
            freq_filter = (
                "| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                % (os.path.join(os.path.dirname(sys.executable), "py"),
                   0, dd.get_aligner(paired.tumor_data)))
        jvm_opts = _get_jvm_opts(items[0], tx_out_file)
        py_cl = os.path.join(utils.get_bcbio_bin(), "py")
        setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
        contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
        cmd = (
            "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
            "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
            "| {strandbias} "
            "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
            "-N \"{paired.tumor_name}|{paired.normal_name}\" "
            "| {contig_cl} {freq_filter} "
            "| bcftools filter -i 'QUAL >= 0' "
            "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
            "{compress_cmd} > {tx_out_file}")
        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    return out_file
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect SNPs and indels with VarDict.

    var2vcf_valid uses -A flag which reports all alleles and improves sensitivity:
    https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191

    Each BAM is called separately. With several BAMs the per-sample VCFs are
    written to temporary files and merged into the output; with a single BAM
    the pipeline writes the output directly. An empty VCF is written for a
    sample when the subset target is not a BED file.

    Returns the path of the output VCF.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(items[0], out_file) as tx_out_file:
        vrs = bedutils.population_variant_regions(items)
        target = shared.subset_variant_regions(vrs, region, out_file, items=items, do_merge=False)
        num_bams = len(align_bams)
        batch_mode = num_bams > 1
        # for individual sample names, given batch calling may be required
        sample_vcf_names = []
        for bamfile, item in zip(align_bams, items):
            # prepare commands
            # NOTE: the command template below is filled from locals(), so the
            # variable names in this loop are part of the command definition.
            sample = dd.get_sample_name(item)
            vardict = get_vardict_command(items[0])
            strandbias = "teststrandbias.R"
            var2vcf = "var2vcf_valid.pl"
            opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            compress_cmd = "| bgzip -c" if tx_out_file.endswith("gz") else ""
            freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
            fix_ambig_ref = vcfutils.fix_ambiguous_cl()
            fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
            remove_dup = vcfutils.remove_dup_cl()
            py_cl = os.path.join(utils.get_bcbio_bin(), "py")
            jvm_opts = _get_jvm_opts(items[0], tx_out_file)
            setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
            contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
            cmd = (
                "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                "-N {sample} -b {bamfile} {opts} "
                "| {strandbias}"
                "| {var2vcf} -A -N {sample} -E -f {freq} {var2vcf_opts} "
                "| {contig_cl} | bcftools filter -i 'QUAL >= 0' "
                "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
            if batch_mode:
                # Per-sample temporary output, merged after the loop.
                temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                tmp_out = temp_file_prefix + ".temp.vcf"
                if out_file.endswith("gz"):
                    tmp_out += ".gz"
                sample_vcf_names.append(tmp_out)
                with file_transaction(item, tmp_out) as tx_tmp_file:
                    if _is_bed_file(target):
                        cmd += " > {tx_tmp_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                    else:
                        vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample])
            elif _is_bed_file(target):
                cmd += " > {tx_out_file}"
                do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            else:
                vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample])
        if batch_mode:
            # N.B. merge_variant_files wants region in 1-based end-inclusive
            # coordinates. Thus use bamprep.region_to_gatk
            vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                         out_file=tx_out_file, ref_file=ref_file,
                                         config=config, region=bamprep.region_to_gatk(region))
    return out_file
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.

    When no normal BAM is available the work is delegated to
    ``_run_scalpel_caller`` (single sample mode). Otherwise Scalpel somatic
    discovery is run into a ``-scalpel-work`` directory (skipped when its
    ``somatic.db.dir`` already exists), and the somatic and common indel VCFs
    are filtered, concatenated, sorted and given contig headers.

    Returns the path of the output VCF.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                # Tumor-only input: fall back to the single sample caller.
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            if not os.path.exists(db_file + ".dir"):
                # No usable database from a previous run: start from a clean work directory.
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # calling
                cl = ("{perl_exports} && "
                      "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
                do.run(cl.format(perl_exports=perl_exports, opts=opts, paired=paired),
                       "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(_scalpel_bed_file_opts(items, config, out_file, region, tmp_path))
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic.indel.vcf")
            else:
                # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher
                # to swap precision for sensitivity
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config, scalpel_tmp_file) as tx_indel_file:
                    cmd = ("{perl_exports} && "
                           "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                           "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                           "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(perl_exports=perl_exports, bed_opts=bed_opts,
                                      ref_file=ref_file, db_file=db_file,
                                      tx_indel_file=tx_indel_file),
                           "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(os.path.join(tmp_path, "main/common.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            # The contig header helper is called with (reference file, output
            # file), the form used by the other callers of this helper.
            add_contig = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
            cl2 = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                   "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                   " {fix_ambig} | {vcfstreamsort} | {add_contig} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(bcftools_cmd_chi2=bcftools_cmd_chi2,
                              scalpel_tmp_file=scalpel_tmp_file,
                              bcftools_cmd_common=bcftools_cmd_common,
                              scalpel_tmp_file_common=scalpel_tmp_file_common,
                              fix_ambig=fix_ambig, vcfstreamsort=vcfstreamsort,
                              add_contig=add_contig, compress_cmd=compress_cmd,
                              tx_out_file=tx_out_file),
                   "Finalising Scalpel variants", {})
    return out_file
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect SNPs and indels with VarDict.

    var2vcf_valid uses -A flag which reports all alleles and improves sensitivity:
    https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191

    Each BAM in ``align_bams`` is called separately. With a single BAM the
    result goes straight to ``out_file``; with several, every sample gets its
    own temporary VCF and these are merged into ``out_file``. When the subset
    target is not a BED file, an empty VCF is written instead of running
    VarDict.

    If ``out_file`` is not given it is derived from the first entry of
    ``align_bams``. Nothing is re-run when ``out_file`` already exists.

    Returns ``out_file``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(
                vrs, region, out_file, items=items, do_merge=False)
            num_bams = len(align_bams)
            # for individual sample names, given batch calling may be required
            sample_vcf_names = []
            # prepare commands: only {sample} and {bamfile} differ between
            # BAMs, so the pipeline template is built once before the loop.
            # NOTE: the template is expanded with ``.format(**locals())``;
            # local names are part of the template.
            vardict = get_vardict_command(items[0])
            opts, var2vcf_opts = _vardict_options_from_config(items, config,
                                                              out_file, target)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            compress_cmd = "| bgzip -c" if tx_out_file.endswith("gz") else ""
            freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
            fix_ambig_ref = vcfutils.fix_ambiguous_cl()
            fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
            remove_dup = vcfutils.remove_dup_cl()
            jvm_opts = _get_jvm_opts(items[0], tx_out_file)
            setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
            contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
            lowfreq_filter = _lowfreq_linear_filter(0, False)
            cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                   "-N {sample} -b {bamfile} {opts} "
                   "| teststrandbias.R "
                   "| var2vcf_valid.pl -A -N {sample} -E -f {freq} {var2vcf_opts} "
                   "| {contig_cl} | bcftools filter -i 'QUAL >= 0' | {lowfreq_filter} "
                   "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
            for bamfile, item in zip(align_bams, items):
                sample = dd.get_sample_name(item)
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample])
                        else:
                            # Build a per-sample command; the shared template
                            # must not be modified between iterations.
                            sample_cmd = cmd + " > {tx_tmp_file}"
                            do.run(sample_cmd.format(**locals()),
                                   "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample])
                    else:
                        sample_cmd = cmd + " > {tx_out_file}"
                        do.run(sample_cmd.format(**locals()),
                               "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config,
                                             region=bamprep.region_to_gatk(region))
    return out_file
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Call variants on a tumor / normal pair with VarDict.

    Falls back to ``_run_vardict_caller`` when the pairing carries no normal
    BAM, and writes an empty VCF when the subset target is not a BED file.
    An already existing ``out_file`` is left untouched.

    Returns the path of the output VCF.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    with file_transaction(items[0], out_file) as tx_out_file:
        vrs = bedutils.population_variant_regions(items)
        target = shared.subset_variant_regions(vrs, region, out_file,
                                               items=items, do_merge=True)
        paired = vcfutils.get_paired_bams(align_bams, items)

        if not _is_bed_file(target):
            present_names = [name for name in [paired.tumor_name, paired.normal_name] if name]
            vcfutils.write_empty_vcf(tx_out_file, config, samples=present_names)
            return out_file

        if not paired.normal_bam:
            return _run_vardict_caller(align_bams, items, ref_file,
                                       assoc_files, region, out_file)

        # NOTE: the pipeline template at the bottom is expanded with
        # ``.format(**locals())``; local names are part of the template.
        vardict = get_vardict_command(items[0])
        vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
        if out_file.endswith("gz"):
            compress_cmd = "| bgzip -c"
        else:
            compress_cmd = ""
        min_af_percent = utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)
        freq = float(min_af_percent) / 100.0
        # merge bed file regions as amplicon VarDict is only supported in single sample mode
        opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
        fix_ambig_ref = vcfutils.fix_ambiguous_cl()
        fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
        remove_dup = vcfutils.remove_dup_cl()

        somatic_filter_off = any(
            "vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
            for data in items)
        if somatic_filter_off:
            somatic_filter = ""
            freq_filter = ""
        else:
            # this makes VarDict soft filter non-differential variants
            var2vcf_opts += " -M "
            somatic_filter = ("| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                              "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                              """| %s -c 'from bcbio.variation import freebayes; """
                              """freebayes.call_somatic("%s", "%s")' """
                              % (sys.executable, paired.tumor_name, paired.normal_name))
            # "py" executable located next to the running interpreter
            py_exe = os.path.join(os.path.dirname(sys.executable), "py")
            freq_filter = ("| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                           "| %s -x 'bcbio.variation.vardict.add_db_germline_flag(x)' "
                           "| %s "
                           "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                           % (py_exe,
                              _lowfreq_linear_filter(0, True),
                              py_exe,
                              0,
                              bam.aligner_from_header(paired.tumor_bam)))

        jvm_opts = _get_jvm_opts(items[0], tx_out_file)
        # py_cl is not referenced by the template below.
        py_cl = os.path.join(utils.get_bcbio_bin(), "py")
        setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
        contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
        cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
               "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
               "| awk 'NF>=48' | testsomatic.R "
               "| var2vcf_paired.pl -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
               "-N \"{paired.tumor_name}|{paired.normal_name}\" "
               "| {contig_cl} {freq_filter} "
               "| bcftools filter -i 'QUAL >= 0' "
               "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
               "{compress_cmd} > {tx_out_file}")
        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    return out_file