def mutect2_caller(align_bams, items, ref_file, assoc_files,
                   region=None, out_file=None):
    """Run somatic variant calling with GATK's MuTect2.

    Needs the full (non open-source) GATK distribution, version 3.5 or newer.
    The output name defaults to one derived from the first input BAM, and the
    caller is skipped when that output already exists. Returns the result of
    bgzipping and indexing the output file.
    """
    if out_file is None:
        base = utils.splitext_plus(align_bams[0])[0]
        out_file = "%s-variants.vcf.gz" % base
    if not utils.file_exists(out_file):
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            params = ["-T", "MuTect2",
                      "-R", ref_file,
                      "--annotation", "ClippingRankSumTest",
                      "--annotation", "DepthPerSampleHC"]
            for extra in annotation.get_gatk_annotations(items[0]["config"]):
                params.extend(["--annotation", extra])
            paired = vcfutils.get_paired_bams(align_bams, items)
            params.extend(_add_tumor_params(paired))
            params.extend(_add_region_params(region, out_file, items))
            params.extend(_add_assoc_params(assoc_files))
            params.extend(["-ploidy", str(ploidy.get_ploidy(items, region))])
            # User supplied command line options from the resources section
            resources = config_utils.get_resources("mutect2", items[0]["config"])
            params.extend(str(opt) for opt in resources.get("options", []))
            broad_runner = broad.runner_from_config(items[0]["config"])
            found_version = LooseVersion(broad_runner.gatk_major_version())
            assert found_version >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cmd = " ".join(broad_runner.cl_gatk(params, os.path.dirname(tx_out_file)))
            pp_cmd = _post_process_cl(paired)
            # Stream GATK output through post-processing into a bgzipped file
            pipeline = "{gatk_cmd} | {pp_cmd} | bgzip -c > {tx_out_file}"
            do.run(pipeline.format(gatk_cmd=gatk_cmd, pp_cmd=pp_cmd,
                                   tx_out_file=tx_out_file),
                   "MuTect2")
    out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
def _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                      region=None, out_file=None):
    """Index inputs and assemble the command line parameters for MuTect.

    Returns a ``(broad_runner, params)`` tuple. Raises ValueError when no
    tumor/normal pairing can be derived from the batch.
    """
    base_config = items[0]["config"]
    broad_runner = broad.runner_from_config(base_config, "mutect")
    _check_mutect_version(broad_runner)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for bam_file in align_bams:
        bam.index(bam_file, base_config)

    paired = vcfutils.get_paired_bams(align_bams, items)
    if not paired:
        sample_names = ", " .join([dd.get_sample_name(x) for x in items])
        raise ValueError("Specified MuTect calling but 'tumor' phenotype not present in batch\n"
                         "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                         "pipelines.html#cancer-variant-calling\n"
                         "for samples: %s" % sample_names)

    params = ["-R", ref_file, "-T", "MuTect", "-U", "ALLOW_N_CIGAR_READS",
              "--read_filter", "NotPrimaryAlignment",
              "-I:tumor", paired.tumor_bam,
              "--tumor_sample_name", paired.tumor_name]
    # Matched normal and panel of normals are both optional
    if paired.normal_bam is not None:
        params.extend(["-I:normal", paired.normal_bam,
                       "--normal_sample_name", paired.normal_name])
    if paired.normal_panel is not None:
        params.extend(["--normal_panel", paired.normal_panel])
    params.extend(_config_params(base_config, assoc_files, region, out_file))
    return broad_runner, params
def run(items, background=None):
    """Run WHAM on a batch of samples and attach the calls to each sample.

    For a tumor/normal batch the tumor is the input and the normal, when
    present, is the background. Otherwise all items are inputs and the
    supplied ``background`` samples are used.

    Returns the input items, each with a ``wham`` entry (VCF and BED file)
    appended to its ``"sv"`` list.
    """
    if not background:
        background = []
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if paired:
        inputs = [paired.tumor_data]
        # A tumor-only batch has no normal: use empty backgrounds rather
        # than lists holding None.
        if paired.normal_bam:
            background_bams = [paired.normal_bam]
            background_names = [paired.normal_name]
        else:
            background_bams = []
            background_names = []
    else:
        # NOTE(review): this wraps the whole item list in another list, unlike
        # the paired branch which passes a flat list of samples -- confirm
        # against what _run_wham expects.
        inputs = [items]
        background_bams = [x["align_bam"] for x in background]
        background_names = [dd.get_sample_name(x) for x in background]
    orig_vcf_file = _run_wham(inputs, background_bams)
    wclass_vcf_file = _add_wham_classification(orig_vcf_file, inputs)
    vcf_file = _fix_vcf(wclass_vcf_file, inputs, background_names)
    bed_file = _convert_to_bed(vcf_file, inputs)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        data["sv"].append({"variantcaller": "wham",
                           "vrn_file": vcf_file,
                           "bed_file": bed_file})
        out.append(data)
    return out
def run(items, background=None):
    """Call variants with WHAM and record a per-sample VCF for each input.

    A tumor/normal batch uses the tumor as input and the normal (if any) as
    background; otherwise cases and controls come from
    ``shared.find_case_control``. When background samples exist, each sample
    VCF is additionally passed through ``filter_by_background``.
    """
    background = background or []
    background_bams = []
    align_bams = [x["align_bam"] for x in items]
    paired = vcfutils.get_paired_bams(align_bams, items)
    if not paired:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [x["align_bam"] for x in background]
    else:
        inputs = [paired.tumor_data]
        if paired.normal_bam:
            background = [paired.normal_data]
            background_bams = [paired.normal_bam]
    orig_vcf = _run_wham(inputs, background_bams)

    out = []
    for data in inputs:
        data.setdefault("sv", [])
        name = dd.get_sample_name(data)
        target_vcf = "%s-%s.vcf.gz" % (utils.splitext_plus(orig_vcf)[0], name)
        sample_vcf = vcfutils.select_sample(orig_vcf, name, target_vcf, data["config"])
        if background:
            sample_vcf = filter_by_background(sample_vcf, orig_vcf, background, data)
        data["sv"].append({"variantcaller": "wham", "vrn_file": sample_vcf})
        out.append(data)
    return out
def _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                      region=None, out_file=None):
    """Preparation work for MuTect.

    Indexes the reference and input BAMs, then assembles the MuTect command
    line parameters.

    Returns a ``(broad_runner, params)`` tuple.

    Raises ValueError when no tumor/normal pairing can be derived from the
    batch, instead of failing later with an attribute error on ``None``.
    """
    base_config = items[0]["config"]
    broad_runner = broad.runner_from_config(base_config, "mutect")
    _check_mutect_version(broad_runner)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, base_config)
    paired = vcfutils.get_paired_bams(align_bams, items)
    if not paired:
        raise ValueError("Specified MuTect calling but 'tumor' phenotype not present in batch\n"
                         "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                         "pipelines.html#cancer-variant-calling\n"
                         "for samples: %s" % ", " .join([dd.get_sample_name(x) for x in items]))
    params = ["-R", ref_file, "-T", "MuTect", "-U", "ALLOW_N_CIGAR_READS"]
    # if coverage_depth_max is not given, default to 10000
    downsample_cov = get_in(paired.tumor_config, ("algorithm", "coverage_depth_max"), 10000)
    # A zero (or negative) coverage_depth_max omits the flag entirely, leaving
    # the Broad default value (currently 1500). Positive values are raised to
    # at least 1500.
    if downsample_cov > 0:
        params += ["--downsample_to_coverage", max(1500, downsample_cov)]
    params += ["--read_filter", "NotPrimaryAlignment"]
    params += ["-I:tumor", paired.tumor_bam]
    params += ["--tumor_sample_name", paired.tumor_name]
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
        params += ["--normal_sample_name", paired.normal_name]
    if paired.normal_panel is not None:
        params += ["--normal_panel", paired.normal_panel]
    params += _config_params(base_config, assoc_files, region, out_file)
    return broad_runner, params
def _SID_call_prep(align_bams, items, ref_file, assoc_files,
                   region=None, out_file=None):
    """Index inputs and build the parameter list for SomaticIndelDetector.

    Returns the list of command line parameters.
    """
    base_config = items[0]["config"]
    for bam_file in align_bams:
        bam.index(bam_file, base_config)
    params = ["-R", ref_file, "-T", "SomaticIndelDetector", "-U", "ALLOW_N_CIGAR_READS"]

    # Keep the per base read start count within 200-10000, so no more than
    # 10000 new reads can begin at any base. maxNumberOfReads is scaled to
    # match, otherwise SID discards windows for high coverage panels.
    window_size = 200  # default SID value
    paired = vcfutils.get_paired_bams(align_bams, items)
    configured_depth = get_in(paired.tumor_config, ("algorithm", "coverage_depth_max"), 10000)
    max_depth = min(max(200, configured_depth), 10000)
    params.extend(["--downsample_to_coverage", max_depth])
    params.extend(["--maxNumberOfReads", str(int(max_depth) * window_size)])
    params.extend(["--read_filter", "NotPrimaryAlignment"])
    params.extend(["-I:tumor", paired.tumor_bam])

    # min_allele_fraction is read as a percentage, defaulting to 10
    min_af = float(get_in(paired.tumor_config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
    if paired.normal_bam is None:
        params.append("--unpaired")
        filter_expr = "COV<6||INDEL_F<%s||INDEL_CF<0.7" % min_af
    else:
        params.extend(["-I:normal", paired.normal_bam])
        # notice there must be at least 4 reads of coverage in normal
        filter_expr = "T_COV<6||N_COV<4||T_INDEL_F<%s||T_INDEL_CF<0.7" % min_af
    params.extend(["--filter_expressions", filter_expr])

    if region:
        params.extend(["-L", bamprep.region_to_gatk(region),
                       "--interval_set_rule", "INTERSECTION"])
    return params
def estimate(items, batch, config):
    """Run the configured heterogeneity callers on a tumor/normal pair.

    Run in parallel. Returns the items belonging to ``batch``, each wrapped in
    a single-element list; the tumor sample gets any caller output under
    ``"heterogeneity"``.
    """
    hetcallers = {"theta": theta.run,
                  "phylowgs": phylowgs.run,
                  "bubbletree": bubbletree.run}
    align_bams = [dd.get_align_bam(d) for d in items]
    paired = vcfutils.get_paired_bams(align_bams, items)
    calls = _get_calls(paired.tumor_data)
    variants = get_variants(paired.tumor_data)

    het_info = []
    for hetcaller in _get_hetcallers(items):
        hetfn = hetcallers.get(hetcaller)
        if hetfn is None:
            print("%s not yet implemented" % hetcaller)
            continue
        hetout = hetfn(variants[0], calls, paired)
        if hetout:
            het_info.append(hetout)

    out = []
    for data in items:
        # Only report samples whose first batch is the one being processed
        if batch != _get_batches(data)[0]:
            continue
        if dd.get_sample_name(data) == paired.tumor_name and het_info:
            data["heterogeneity"] = het_info
        out.append([data])
    return out
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples. When the pairing has no
    normal BAM, the work is handed to ``_run_scalpel_caller`` instead.

    The output name defaults to one derived from the first input BAM, and
    calling is skipped when that output already exists. Returns the result of
    ``annotation.annotate_nongatk_vcf`` on the output file.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            # Discovery only runs when its database is missing; any leftover
            # work directory is removed before starting again.
            if not os.path.exists(db_file + ".dir"):
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # calling
                # The command templates below are filled in from local variable names
                cl = ("{perl_exports} && "
                      "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
                do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(_scalpel_bed_file_opts(items, config, out_file, region, tmp_path))
            # Hard-coded switch: the export branch below is currently never taken
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic.indel.vcf")
                # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher
                # to swap precision for sensitivity
            else:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config, scalpel_tmp_file) as tx_indel_file:
                    cmd = ("{perl_exports} && "
                           "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                           "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                           "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()), "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(os.path.join(tmp_path, "main/common.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            # Concatenate the filtered somatic and common indel files, then
            # sort into the transactional output
            cl2 = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                   "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                   " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file,
                                               config)
    return ann_file
def _paired_load_script(work_bams, names, chrom, pairmode, items):
    """Fill in the paired tumor/normal CNV preparation template.

    The tumor is the case and the normal the control; ``num_cores`` is
    always passed as 0.
    """
    paired = vcfutils.get_paired_bams(work_bams, items)
    template_args = {
        "case_file": paired.tumor_bam,
        "case_name": paired.tumor_name,
        "ctrl_file": paired.normal_bam,
        "ctrl_name": paired.normal_name,
        "num_cores": 0,
        "chrom": chrom,
        "pairmode": pairmode,
    }
    return _paired_prep.format(**template_args)
def run_tnhaplotyper(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Somatic calling with Sentieon's TNhaplotyper (MuTect2 like).

    Needs a tumor/normal pair with a normal BAM. After calling, several
    header type declarations are rewritten to String so the headers are GATK
    compatible. Returns the output file name.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(items[0]), items[0])
    interval = _get_interval(variant_regions, region, out_file, items)
    with file_transaction(items[0], out_file) as tx_out_file:
        paired = vcfutils.get_paired_bams(align_bams, items)
        assert paired.normal_bam, "Require normal BAM for Sentieon TNhaplotyper"
        if "dbsnp" in assoc_files:
            dbsnp = "--dbsnp %s" % (assoc_files.get("dbsnp"))
        else:
            dbsnp = ""
        if "cosmic" in assoc_files:
            cosmic = "--cosmic %s" % (assoc_files.get("cosmic"))
        else:
            cosmic = ""
        license_prefix = license_export(items[0])
        tx_orig_file = "%s-orig%s" % utils.splitext_plus(tx_out_file)
        cores = dd.get_num_cores(items[0])
        call_cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                    "-i {paired.tumor_bam} -i {paired.normal_bam} {interval} "
                    "--algo TNhaplotyper "
                    "--tumor_sample {paired.tumor_name} --normal_sample {paired.normal_name} "
                    "{dbsnp} {cosmic} {tx_orig_file}")
        do.run(call_cmd.format(license=license_prefix, cores=cores, ref_file=ref_file,
                               paired=paired, interval=interval, dbsnp=dbsnp,
                               cosmic=cosmic, tx_orig_file=tx_orig_file),
               "Sentieon TNhaplotyper")
        # Rewrite numeric header types to String before the final compression
        header_cmd = ("gunzip -c {tx_orig_file} | "
                      "sed 's/ID=ECNT,Number=1,Type=Integer/ID=ECNT,Number=1,Type=String/' | "
                      "sed 's/ID=HCNT,Number=1,Type=Integer/ID=HCNT,Number=1,Type=String/' | "
                      "sed 's/ID=NLOD,Number=1,Type=Float/ID=NLOD,Number=1,Type=String/' | "
                      "sed 's/ID=TLOD,Number=1,Type=Float/ID=TLOD,Number=1,Type=String/' | "
                      "sed 's/ID=PON,Number=1,Type=Integer/ID=PON,Number=1,Type=String/' | "
                      "bgzip -c > {tx_out_file}")
        do.run(header_cmd.format(tx_orig_file=tx_orig_file, tx_out_file=tx_out_file),
               "Sentieon TNhaplotyper: make headers GATK compatible")
        vcfutils.bgzip_and_index(tx_out_file, items[0]["config"])
    return out_file
def estimate(items, batch, config):
    """Estimate heterogeneity for a pair of tumor/normal samples.

    Run in parallel. Each configured heterogeneity caller is looked up in a
    fixed table; unknown callers are reported and skipped.

    Returns the items belonging to ``batch``, each wrapped in a
    single-element list. The tumor sample carries the collected caller
    outputs under ``"heterogeneity"`` when any were produced.
    """
    hetcallers = {"theta": theta.run,
                  "phylowgs": phylowgs.run,
                  "bubbletree": bubbletree.run}
    paired = vcfutils.get_paired_bams([dd.get_align_bam(d) for d in items], items)
    calls = _get_calls(paired.tumor_data)
    variants = _get_variants(paired.tumor_data)
    het_info = []
    for hetcaller in _get_hetcallers(items):
        try:
            hetfn = hetcallers[hetcaller]
        except KeyError:
            hetfn = None
            # Call form of print: valid on Python 3 and prints the same
            # single string on Python 2.
            print("%s not yet implemented" % hetcaller)
        if hetfn:
            hetout = hetfn(variants[0], calls, paired)
            if hetout:
                het_info.append(hetout)
    out = []
    for data in items:
        # Only samples whose first batch matches are returned
        if batch == _get_batches(data)[0]:
            if dd.get_sample_name(data) == paired.tumor_name:
                if het_info:
                    data["heterogeneity"] = het_info
            out.append([data])
    return out
def run(items):
    """Detect structural variations with lumpy from bwa-mem alignments.

    Raises ValueError when a sample is configured with an aligner other than
    bwa. Returns the items, each with a ``lumpy`` entry appended to its
    ``"sv"`` list.
    """
    allowed_aligners = ["bwa", False, None]
    if any(utils.get_in(data, ("config", "algorithm", "aligner")) not in allowed_aligners
           for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")

    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if paired and paired.tumor_data:
        work_dir = _sv_workdir(paired.tumor_data)
    else:
        work_dir = _sv_workdir(items[0])

    # Collect the three BAM types for every sample before the joint run
    full_bams = []
    sr_bams = []
    disc_bams = []
    for data in items:
        full_bam, split_bam, discordant_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(full_bam)
        sr_bams.append(split_bam)
        disc_bams.append(discordant_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)

    out = []
    for data in items:
        data.setdefault("sv", [])
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_target = utils.append_stem(lumpy_vcf, "-%s" % sample)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample, sample_target, data["config"])
        gt_vcf = _run_svtyper(sample_vcf, dedup_bam, sr_bam, data)
        filter_vcf = _filter_by_support(gt_vcf, data)
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": filter_vcf,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Call SNPs and indels on a tumor/normal pair with FreeBayes.

    Raises ValueError when the pairing has no normal BAM. Returns the result
    of ``annotation.annotate_nongatk_vcf`` on the output file.
    """
    config = items[0]["config"]
    if out_file is None:
        bam_base = os.path.splitext(align_bams[0])[0]
        out_file = "%s-paired-variants.vcf.gz" % bam_base
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                raise ValueError("Require both tumor and normal BAM files for FreeBayes cancer calling")
            vcfsamplediff = config_utils.get_program("vcfsamplediff", config)
            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            opts += " -f {}".format(ref_file)
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one
            cl = ("{freebayes} --pooled-discrete --pvar 0.7"
                  " --genotype-qualities {opts} {paired.tumor_bam}"
                  " {paired.normal_bam} | {vcfsamplediff} -s VT"
                  " {paired.normal_name} {paired.tumor_name}"
                  " - {compress_cmd} > {tx_out_file}")
            for pair_bam in (paired.tumor_bam, paired.normal_bam):
                bam.index(pair_bam, config)
            do.run(cl.format(freebayes=freebayes, opts=opts, paired=paired,
                             vcfsamplediff=vcfsamplediff, compress_cmd=compress_cmd,
                             tx_out_file=tx_out_file),
                   "Genotyping paired variants with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"], ref_file,
                                               config)
    return ann_file
def run(items, background=None):
    """Run WHAM on a batch of samples and attach the calls to each sample.

    For a tumor/normal batch the tumor is the input and the normal, when
    present, is the background. Otherwise cases and controls come from
    ``shared.find_case_control``; an explicit ``background`` is not allowed
    in that case.

    Returns the input items, each with a ``wham`` entry appended to its
    ``"sv"`` list.
    """
    if not background:
        background = []
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if paired:
        inputs = [paired.tumor_data]
        # A tumor-only batch has no normal: keep the backgrounds empty so
        # use_lrt below is not switched on by a list holding None.
        if paired.normal_bam:
            background_bams = [paired.normal_bam]
            background_names = [paired.normal_name]
        else:
            background_bams = []
            background_names = []
    else:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [x["align_bam"] for x in background]
        background_names = [dd.get_sample_name(x) for x in background]
    orig_vcf_file = _run_wham(inputs, background_bams)
    wclass_vcf_file = _add_wham_classification(orig_vcf_file, inputs)
    vcf_file = _fix_vcf(wclass_vcf_file, inputs, background_names)
    bed_file = _convert_to_bed(vcf_file, inputs, use_lrt=len(background_bams) > 0)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        data["sv"].append({"variantcaller": "wham",
                           "vrn_file": _subset_to_sample(bed_file, data),
                           "vcf_file": vcf_file})
        out.append(data)
    return out
def _run_qsnp_paired(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Call somatic mutations on a tumor/normal pair with qSNP.

    qSNP runs from inside a temporary directory and writes an uncompressed
    VCF, which is then filtered, bgzipped and indexed. Returns the final
    output file name.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    out_file = out_file.replace(".gz", "")
    with file_transaction(config, out_file) as tx_out_file:
        with tx_tmpdir(config) as tmpdir, utils.chdir(tmpdir):
            paired = get_paired_bams(align_bams, items)
            qsnp = config_utils.get_program("qsnp", config)
            resources = config_utils.get_resources("qsnp", config)
            mem = " ".join(resources.get("jvm_opts", ["-Xms750m -Xmx4g"]))
            qsnp_log = os.path.join(tmpdir, "qsnp.log")
            qsnp_init = os.path.join(tmpdir, "qsnp.ini")
            if region:
                paired = _create_bam_region(paired, region, tmpdir)
            _create_input(paired, tx_out_file, ref_file, assoc_files['dbsnp'], qsnp_init)
            cl = ("{qsnp} {mem} -i {qsnp_init} -log {qsnp_log}")
            do.run(cl.format(qsnp=qsnp, mem=mem, qsnp_init=qsnp_init, qsnp_log=qsnp_log),
                   "Genotyping paired variants with Qsnp", {})
    filtered_file = _filter_vcf(out_file)
    return bgzip_and_index(filtered_file, config)
def run_qsnp(align_bams, items, ref_file, assoc_files, region=None,
             out_file=None):
    """Run qSNP calling on paired tumor/normal.

    When ``_clean_regions`` yields regions, each is called separately and the
    per-region files are combined into ``out_file``. With no regions and no
    ``region`` argument, a single whole-input call is made.

    Returns the output file name. Raises ValueError when the pairing has no
    normal BAM.
    """
    if utils.file_exists(out_file):
        return out_file
    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError("qSNP only works on paired samples")

    regions = _clean_regions(items, region)
    if regions:
        region_files = []
        # A distinct loop name keeps the ``region`` argument intact, so the
        # branch below never depends on the last item iterated.
        for cur_region in regions:
            out_region_file = out_file.replace(".vcf.gz", _to_str(cur_region) + ".vcf.gz")
            region_file = _run_qsnp_paired(align_bams, items, ref_file,
                                           assoc_files, cur_region, out_region_file)
            region_files.append(region_file)
        out_file = combine_variant_files(region_files, out_file, ref_file, items[0]["config"])
    elif not region:
        out_file = _run_qsnp_paired(align_bams, items, ref_file,
                                    assoc_files, region, out_file)
    # NOTE(review): when a region is requested but no cleaned regions come
    # back, nothing is called and out_file is returned as given -- confirm
    # this is the intended handling.
    return out_file
def run_freebayes(align_bams, items, ref_file, assoc_files,
                  region=None, out_file=None):
    """Dispatch FreeBayes calling to the germline, tumor-only or paired caller.

    Non-paired analyses are checked for pairing problems and then use the
    standard caller. Paired analyses without a normal BAM use the standard
    caller in somatic mode; those with a normal use the paired caller.
    """
    if not is_paired_analysis(align_bams, items):
        vcfutils.check_paired_problems(items)
        return _run_freebayes_caller(align_bams, items, ref_file,
                                     assoc_files, region, out_file)
    paired = get_paired_bams(align_bams, items)
    if paired.normal_bam:
        return _run_freebayes_paired(align_bams, items, ref_file,
                                     assoc_files, region, out_file)
    return _run_freebayes_caller(align_bams, items, ref_file,
                                 assoc_files, region, out_file, somatic=paired)
def run(items, background=None):
    """Call variants with WHAM and record an annotated per-sample VCF.

    A tumor/normal batch uses the tumor as input and the normal (if any) as
    background; otherwise cases and controls come from
    ``shared.find_case_control``. Each sample VCF is passed to
    ``effects.add_to_vcf``; the plain sample VCF is reported when that
    returns nothing.
    """
    background = background or []
    background_bams = []
    align_bams = [x["align_bam"] for x in items]
    paired = vcfutils.get_paired_bams(align_bams, items)
    if not paired:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [x["align_bam"] for x in background]
    else:
        inputs = [paired.tumor_data]
        if paired.normal_bam:
            background = [paired.normal_data]
            background_bams = [paired.normal_bam]
    orig_vcf = _run_wham(inputs, background_bams)

    out = []
    for data in inputs:
        data.setdefault("sv", [])
        name = dd.get_sample_name(data)
        target_vcf = "%s-%s.vcf.gz" % (utils.splitext_plus(orig_vcf)[0], name)
        sample_vcf = vcfutils.select_sample(orig_vcf, name, target_vcf, data["config"])
        effects_vcf, _ = effects.add_to_vcf(sample_vcf, data, "snpeff")
        final_vcf = effects_vcf or sample_vcf
        data["sv"].append({"variantcaller": "wham", "vrn_file": final_vcf})
        out.append(data)
    return out
def run(items, background=None):
    """Detect copy number variations from tumor/normal samples using Battenberg.

    Battenberg is skipped, with a logged warning, when the batch is not a
    tumor/normal pair with a normal BAM or when the tumor sample has no
    ``genome_resources -> aliases -> human`` entry.

    Returns the input items; when Battenberg ran, its output is appended to
    each item's ``"sv"`` list.
    """
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    # logger.warning replaces the deprecated ``warn`` alias
    if not paired or not paired.normal_bam:
        logger.warning("Battenberg only works on paired tumor/normal inputs, skipping %s"
                       % dd.get_sample_name(items[0]))
        batout = None
    elif not tz.get_in(["genome_resources", "aliases", "human"], paired.tumor_data):
        logger.warning("Battenberg only works on human data, skipping %s" %
                       dd.get_sample_name(items[0]))
        batout = None
    else:
        batout = _do_run(paired)
        batout["variantcaller"] = "battenberg"
    out = []
    for data in items:
        if batout:
            if "sv" not in data:
                data["sv"] = []
            data["sv"].append(batout)
        out.append(data)
    return out
def _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                      region=None, out_file=None):
    """Preparation work for MuTect.

    Indexes the reference and input BAMs, then assembles the MuTect command
    line parameters.

    Returns a ``(broad_runner, params)`` tuple.

    Raises ValueError when no tumor/normal pairing can be derived from the
    batch, instead of failing later with an attribute error on ``None``.
    """
    base_config = items[0]["config"]
    broad_runner = broad.runner_from_config(base_config, "mutect")
    _check_mutect_version(broad_runner)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, base_config)
    paired = vcfutils.get_paired_bams(align_bams, items)
    if not paired:
        raise ValueError("Specified MuTect calling but 'tumor' phenotype not present in batch\n"
                         "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                         "pipelines.html#cancer-variant-calling\n"
                         "for samples: %s" % ", " .join([dd.get_sample_name(x) for x in items]))
    params = ["-R", ref_file, "-T", "MuTect", "-U", "ALLOW_N_CIGAR_READS"]
    # Downsample to the configured coverage_depth_max (default 10000), never
    # going below 200.
    params += ["--downsample_to_coverage", max(200, get_in(paired.tumor_config,
                                                           ("algorithm", "coverage_depth_max"), 10000))]
    params += ["--read_filter", "NotPrimaryAlignment"]
    params += ["-I:tumor", paired.tumor_bam]
    params += ["--tumor_sample_name", paired.tumor_name]
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
        params += ["--normal_sample_name", paired.normal_name]
    if paired.normal_panel is not None:
        params += ["--normal_panel", paired.normal_panel]
    params += _config_params(base_config, assoc_files, region, out_file)
    return broad_runner, params
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    This is used for paired tumor / normal samples.

    The output name defaults to one derived from the first input BAM, and
    calling is skipped when that output already exists. Raises ValueError
    when the pairing has no normal BAM. Returns the result of
    ``annotation.annotate_nongatk_vcf`` on the output file.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                raise ValueError(
                    "Require both tumor and normal BAM files for FreeBayes cancer calling"
                )
            vcfsamplediff = config_utils.get_program("vcfsamplediff", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(
                _freebayes_options_from_config(items, config, out_file, region))
            opts += " -f {}".format(ref_file)
            # Substring checks on the joined option string: only add the
            # default when the user has not set the fraction themselves
            if "--min-alternate-fraction" not in opts and "-F" not in opts:
                # add minimum reportable allele frequency
                # FreeBayes defaults to 20%, but use 10% by default for the
                # tumor case
                min_af = float(
                    utils.get_in(paired.tumor_config,
                                 ("algorithm", "min_allele_fraction"), 10)) / 100.0
                opts += " --min-alternate-fraction %s" % min_af
            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one
            # NOTE: -s in vcfsamplediff (strict checking: i.e., require no
            # reads in the germline to call somatic) is not used as it is
            # too stringent
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            # The command template is filled in from local variable names
            cl = (
                "{freebayes} --pooled-discrete --genotype-qualities "
                "{opts} {paired.tumor_bam} {paired.normal_bam} "
                "| {vcffilter} -f 'QUAL > 1' -s "
                "| {vcfsamplediff} VT {paired.normal_name} {paired.tumor_name} - "
                "{compress_cmd} > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()),
                   "Genotyping paired variants with FreeBayes", {})
        # NOTE(review): placed so it runs only when calling was just done;
        # the original nesting level could not be recovered -- confirm.
        fix_somatic_calls(out_file, config)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"], ref_file,
                                               config)
    return ann_file
def run_tnscope(align_bams, items, ref_file, assoc_files,
                region=None, out_file=None):
    """Somatic calling with Sentieon's TNscope.

    Needs a tumor/normal pair with a normal BAM. The output name defaults to
    one derived from the first input BAM, and calling is skipped when that
    output already exists. Returns the output file name.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(items[0]), items[0])
    interval = _get_interval(variant_regions, region, out_file, items)
    with file_transaction(items[0], out_file) as tx_out_file:
        paired = vcfutils.get_paired_bams(align_bams, items)
        assert paired and paired.normal_bam, "Require normal BAM for Sentieon TNscope"
        if "dbsnp" in assoc_files:
            dbsnp = "--dbsnp %s" % (assoc_files.get("dbsnp"))
        else:
            dbsnp = ""
        license_prefix = license_export(items[0])
        cores = dd.get_num_cores(items[0])
        call_cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                    "-i {paired.tumor_bam} -i {paired.normal_bam} {interval} "
                    "--algo TNscope "
                    "--tumor_sample {paired.tumor_name} --normal_sample {paired.normal_name} "
                    "{dbsnp} {tx_out_file}")
        do.run(call_cmd.format(license=license_prefix, cores=cores, ref_file=ref_file,
                               paired=paired, interval=interval, dbsnp=dbsnp,
                               tx_out_file=tx_out_file),
               "Sentieon TNscope")
    return out_file
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes for paired tumor/normal samples.

    Sources of options for FreeBayes:
    mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
    mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ
    speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916
    sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/
                   sga_generate_varcall_makefile.pl#L299

    The output name defaults to one derived from the first input BAM, and
    calling is skipped when that output already exists. An empty VCF is
    written when the option helper reports no target regions. Returns the
    result of ``annotation.annotate_nongatk_vcf`` on the output file.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            assert paired.normal_bam, "Require normal BAM for FreeBayes paired calling and filtering"
            freebayes = config_utils.get_program("freebayes", config)
            opts, no_target_regions = _freebayes_options_from_config(
                items, config, out_file, region)
            if no_target_regions:
                # Nothing to call: write a header-only result listing the
                # sample names that are present
                vcfutils.write_empty_vcf(
                    tx_out_file,
                    config,
                    samples=[
                        x for x in [paired.tumor_name, paired.normal_name] if x
                    ])
            else:
                opts = " ".join(opts)
                opts += " --min-repeat-entropy 1"
                opts += " --no-partial-observations"
                opts = _add_somatic_opts(opts, paired)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                clean_fmt_cmd = _clean_freebayes_fmt_cl()
                # "py" executable located next to the running Python interpreter
                py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                # The command template is filled in from local variable names
                cl = (
                    "{freebayes} -f {ref_file} {opts} "
                    "{paired.tumor_bam} {paired.normal_bam} "
                    """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """
                    "| {py_cl} -x 'bcbio.variation.freebayes.call_somatic(x)' "
                    "| {fix_ambig} | {clean_fmt_cmd} bcftools view -a - | "
                    "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                    "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                    "vt normalize -n -r {ref_file} -q - | vcfuniqalleles "
                    "{compress_cmd} > {tx_out_file}")
                do.run(cl.format(**locals()),
                       "Genotyping paired variants with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
def _ready_for_het_analysis(items):
    """Report whether a batch can be used for heterogeneity analysis.

    Requires a tumor/normal pair with a normal BAM, where the tumor sample
    has both variant calls and CNV calls. Returns None when the batch is not
    such a pair.
    """
    align_bams = [dd.get_align_bam(d) for d in items]
    paired = vcfutils.get_paired_bams(align_bams, items)
    if not (paired and paired.normal_bam):
        return None
    tumor = paired.tumor_data
    return _get_variants(tumor) and _get_cnvs(tumor)
def mutect2_caller(align_bams, items, ref_file, assoc_files,
                   region=None, out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full non open-source version of GATK 3.5+.

    The output name defaults to one derived from the first input BAM, and
    calling is skipped when that output already exists. With a "gatk4"
    runner the raw calls go through an additional filter command; in both
    cases the raw file is then passed to ``_af_filter``. Returns the result
    of bgzipping and indexing the output file.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        paired = vcfutils.get_paired_bams(align_bams, items)
        broad_runner = broad.runner_from_config(items[0]["config"])
        gatk_type = broad_runner.gatk_type()
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            # The tool name is spelled differently for gatk4
            params = [
                "-T", "Mutect2" if gatk_type == "gatk4" else "MuTect2", "-R",
                ref_file, "--annotation", "ClippingRankSumTest",
                "--annotation", "DepthPerSampleHC"
            ]
            for a in annotation.get_gatk_annotations(
                    items[0]["config"], include_baseqranksum=False):
                params += ["--annotation", a]
            # Avoid issues with BAM CIGAR reads that GATK doesn't like
            if gatk_type == "gatk4":
                params += ["--read-validation-stringency", "LENIENT"]
            params += _add_tumor_params(paired, items, gatk_type)
            params += _add_region_params(region, out_file, items, gatk_type)
            # Avoid adding dbSNP/Cosmic so they do not get fed to variant filtering algorithm
            # Not yet clear how this helps or hurts in a general case.
            #params += _add_assoc_params(assoc_files)
            params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            # User supplied command line options from the resources section
            resources = config_utils.get_resources("mutect2", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
            # The command templates below are filled in from local variable names
            if gatk_type == "gatk4":
                tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus(
                    tx_out_file)
                tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus(
                    tx_out_file)
                filter_cmd = _mutect2_filter(broad_runner, tx_raw_prefilt_file,
                                             tx_raw_file)
                cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}"
            else:
                tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                cmd = "{gatk_cmd} > {tx_raw_file}"
            do.run(cmd.format(**locals()), "MuTect2")
            out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def _run_cnvkit_cancer(items, background, access_file, work_dir):
    """Run the shared CNVkit workflow with the tumor as input and the normal as background.
    """
    align_bams = [x["align_bam"] for x in items]
    paired = vcfutils.get_paired_bams(align_bams, items)
    first_sample = items[0]
    tumor_bams = [paired.tumor_bam]
    normal_bams = [paired.normal_bam]
    return _run_cnvkit_shared(first_sample, tumor_bams, normal_bams,
                              access_file, work_dir,
                              background_name=paired.normal_name)
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    Calls lumpy jointly on all samples, splits the result per sample,
    genotypes according to the sample's ``tools_on``/``tools_off`` settings,
    filters by support (and against the normal for tumor/normal pairs), and
    appends a ``lumpy`` entry to each sample's ``sv`` list.

    Raises ValueError when any sample was aligned with an unsupported aligner.
    """
    allowed_aligners = ["bwa", "sentieon-bwa", False, None]
    if not all(utils.get_in(data, ("config", "algorithm", "aligner")) in allowed_aligners
               for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])

    # Collect per-sample inputs plus any deletions/duplications from CNV callers
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        evidence = {}
        if cur_dels and utils.file_exists(cur_dels):
            evidence["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            evidence["dups"] = cur_dups
        previous_evidence[dd.get_sample_name(data)] = evidence
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)

    # Split the joint calls per sample and genotype
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        if "bnd-genotype" in dd.get_tools_on(data):
            # Genotype everything, breakends included
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data)
        elif "lumpy-genotype" in dd.get_tools_off(data):
            # Genotyping disabled: use the per-sample calls as-is
            gt_vcf = sample_vcf
        else:
            # Genotype only non-breakend calls, then add the breakends back
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[sample] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name],
                                        gt_vcfs, paired.tumor_data)

    out = []
    for data in items:
        data.setdefault("sv", [])
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        effects_vcf = None
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
def _run_cnvkit_cancer(items, background, access_file):
    """Run CNVkit on a tumor/normal pair.

    The CNVkit output is passed through THetA before being associated with
    the input items. ``background`` is not used here.
    """
    align_bams = [x["align_bam"] for x in items]
    paired = vcfutils.get_paired_bams(align_bams, items)
    work_dir = _sv_workdir(items[0])
    cnvkit_out = _run_cnvkit_shared(items[0], [paired.tumor_bam], [paired.normal_bam],
                                    access_file, work_dir,
                                    background_name=paired.normal_name)
    theta_out = theta.run(cnvkit_out, paired)
    return _associate_cnvkit_out(theta_out, items)
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    Calls lumpy jointly on all samples, genotypes each sample with svtyper
    (breakends are genotyped only when ``bnd-genotype`` is in ``tools_on``),
    filters by support and against the normal, and appends a ``lumpy`` entry
    to each sample's ``sv`` list.

    Raises ValueError when any sample was aligned with an unsupported aligner.
    """
    allowed_aligners = ["bwa", "sentieon-bwa", False, None]
    if not all(utils.get_in(data, ("config", "algorithm", "aligner")) in allowed_aligners
               for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])

    # Collect per-sample inputs plus any deletions/duplications from CNV callers
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        evidence = {}
        if cur_dels and utils.file_exists(cur_dels):
            evidence["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            evidence["dups"] = cur_dups
        previous_evidence[dd.get_sample_name(data)] = evidence
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)

    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        if "bnd-genotype" in dd.get_tools_on(data):
            # Genotype everything, breakends included
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data)
        else:
            # Genotype only non-breakend calls, then add the breakends back
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[sample] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name],
                                        gt_vcfs, paired.tumor_data)

    out = []
    for data in items:
        data.setdefault("sv", [])
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        effects_vcf = None
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    Calls lumpy jointly on all samples, genotypes the non-breakend calls of
    each sample with svtyper, recombines them with the breakends, filters by
    support and against the normal, annotates with snpEff, and appends a
    ``lumpy`` entry to each sample's ``sv`` list.

    Raises ValueError when any sample was aligned with an unsupported aligner.
    """
    allowed_aligners = ["bwa", False, None]
    if not all(utils.get_in(data, ("config", "algorithm", "aligner")) in allowed_aligners
               for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])

    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)

    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        # Fetch this sample's dedup and split-read BAMs again for genotyping
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
        std_gt_vcf = _run_svtyper(std_vcf, dedup_bam, sr_bam, exclude_file, data)
        combined_name = "%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0]
        gt_vcf = vcfutils.combine_variant_files(orig_files=[std_gt_vcf, bnd_vcf],
                                                out_file=combined_name,
                                                ref_file=dd.get_ref_file(data),
                                                config=data["config"])
        gt_vcfs[sample] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name],
                                        gt_vcfs, paired.tumor_data)

    out = []
    for data in items:
        data.setdefault("sv", [])
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
def _run_vardict_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect variants with VarDict for paired tumor / normal samples.

    Falls back to the single-sample VarDict caller when no normal BAM is
    present. Otherwise pipes VarDict through ``testsomatic.R`` and
    ``var2vcf_somatic.pl``, then annotates the result.

    Returns the path of the annotated VCF.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                # Tumor-only input: hand off to the unpaired caller
                return _run_vardict_caller(align_bams, items, ref_file,
                                           assoc_files, region, out_file)
            # These lookups are kept although the pipeline below does not
            # reference their results.
            vcffilter = config_utils.get_program("vcffilter", config)
            vardict = config_utils.get_program("vardict", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            min_af_pct = utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)
            opts = " ".join(_vardict_options_from_config(items, config, out_file, region))
            coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
            if coverage_interval == "regional":
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 "
            else:
                var2vcf_opts = ""
            cmd = ("{vardict} -G {ref_file} -f {freq} "
                   "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                   "| {strandbias} "
                   "| {var2vcf} -N \"{paired.tumor_name}|{paired.normal_name}\" -f {freq} {var2vcf_opts} "
                   "| {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}").format(
                       vardict=vardict,
                       ref_file=ref_file,
                       freq=float(min_af_pct) / 100.0,
                       paired=paired,
                       opts=opts,
                       strandbias="testsomatic.R",
                       var2vcf="var2vcf_somatic.pl",
                       var2vcf_opts=var2vcf_opts,
                       fix_ambig=vcfutils.fix_ambiguous_cl(),
                       vcfstreamsort=vcfstreamsort,
                       compress_cmd="| bgzip -c" if out_file.endswith("gz") else "",
                       tx_out_file=tx_out_file)
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cmd, "Genotyping with VarDict: Inference", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"), ref_file, config)
def _ready_for_het_analysis(items):
    """Check if a sample has input information for heterogeneity analysis.

    Requires a heterogeneity caller on at least one item and a tumor/normal
    pairing. In that case the result is truthy only when the tumor sample has
    both variant calls and CNV calls. Otherwise returns None.
    """
    paired = vcfutils.get_paired_bams([dd.get_align_bam(d) for d in items], items)
    has_het = any(dd.get_hetcaller(d) for d in items)
    if not has_het or not paired:
        return None
    tumor = paired.tumor_data
    return get_variants(tumor) and _get_calls(tumor, cnv_only=True)
def _run_cnvkit_cancer(items, background):
    """Run CNVkit on a tumor/normal pair.

    Every item whose sample name differs from the tumor's is treated as a
    normal. Returns the input items unchanged when CNVkit produces no output,
    otherwise the updated tumor data followed by the normals.
    ``background`` is not used here.
    """
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    tumor_name = paired.tumor_name
    normal_data = [x for x in items if dd.get_sample_name(x) != tumor_name]
    ckouts = _run_cnvkit_shared([paired.tumor_data], normal_data)
    if not ckouts:
        return items
    assert len(ckouts) == 1
    return _associate_cnvkit_out(ckouts, [paired.tumor_data], is_somatic=True) + normal_data
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes for paired tumor / normal samples.

    Falls back to the standard FreeBayes caller when no normal BAM is
    available. Otherwise calls both BAMs together and uses ``vcfsamplediff``
    to tag somatic differences, then fixes the somatic calls and annotates.

    Returns the path of the annotated VCF.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                # Tumor-only input is passed to the unpaired caller, not rejected
                return _run_freebayes_caller(align_bams, items, ref_file,
                                             assoc_files, region, out_file)
            vcfsamplediff = config_utils.get_program("vcfsamplediff", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            if "--min-alternate-fraction" not in opts and "-F" not in opts:
                # add minimum reportable allele frequency
                # FreeBayes defaults to 20%, but use 10% by default for the
                # tumor case
                min_af_pct = utils.get_in(paired.tumor_config,
                                          ("algorithm", "min_allele_fraction"), 10)
                min_af = float(min_af_pct) / 100.0
                opts += " --min-alternate-fraction %s" % min_af
            opts += " --min-repeat-entropy 1 --experimental-gls"
            # Recommended settings for cancer calling
            # https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
            opts += " --pooled-discrete --genotype-qualities --report-genotype-likelihood-max"
            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one
            # NOTE: -s in vcfsamplediff (strict checking: i.e., require no
            # reads in the germline to call somatic) is not used as it is
            # too stringent
            cl = ("{freebayes} -f {ref_file} {opts} "
                  "{paired.tumor_bam} {paired.normal_bam} "
                  "| {vcffilter} -f 'QUAL > 5' -s "
                  "| {vcfallelicprimitives} | {vcfstreamsort} "
                  "| {vcfsamplediff} VT {paired.normal_name} {paired.tumor_name} - "
                  "{compress_cmd} > {tx_out_file}").format(
                      freebayes=freebayes,
                      ref_file=ref_file,
                      opts=opts,
                      paired=paired,
                      vcffilter=vcffilter,
                      vcfallelicprimitives=vcfallelicprimitives,
                      vcfstreamsort=vcfstreamsort,
                      vcfsamplediff=vcfsamplediff,
                      compress_cmd="| bgzip -c" if out_file.endswith("gz") else "",
                      tx_out_file=tx_out_file)
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl, "Genotyping paired variants with FreeBayes", {})
    fix_somatic_calls(out_file, config)
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"), ref_file, config)
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run octopus variant calling, handling both somatic and germline calling.

    An existing ``out_file`` is returned without re-running the caller.
    ``assoc_files`` is accepted for interface compatibility but not used.
    """
    if utils.file_exists(out_file):
        return out_file
    paired = vcfutils.get_paired_bams(align_bams, items)
    regions = _get_regions(region, out_file, items)
    if paired:
        return _run_somatic(paired, ref_file, regions, out_file)
    return _run_germline(align_bams, items, ref_file, regions, out_file)
def run(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run strelka2 variant calling, either paired tumor/normal or germline calling.

    A paired analysis must include a normal sample; tumor-only input fails
    the assertion below.
    """
    if not vcfutils.is_paired_analysis(align_bams, items):
        return _run_germline(align_bams, items, ref_file, assoc_files, region, out_file)
    paired = vcfutils.get_paired_bams(align_bams, items)
    assert paired.normal_bam, "Strelka2 requires a normal sample"
    return _run_somatic(paired, ref_file, assoc_files, region, out_file)
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes for paired tumor / normal samples.

    Falls back to the standard FreeBayes caller when no normal BAM is
    available. Otherwise calls both BAMs together, tags somatic calls with
    ``bcbio.variation.freebayes.call_somatic``, normalizes and annotates.

    Sources of options for FreeBayes:
    mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
    mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ
    speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916
    sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/
                   sga_generate_varcall_makefile.pl#L299
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                # Tumor-only input is passed to the unpaired caller, not rejected
                return _run_freebayes_caller(align_bams, items, ref_file,
                                             assoc_files, region, out_file)
            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            if "--min-alternate-fraction" not in opts and "-F" not in opts:
                # add minimum reportable allele frequency
                # FreeBayes defaults to 20%, but use 10% by default for the
                # tumor case
                min_af_pct = utils.get_in(paired.tumor_config,
                                          ("algorithm", "min_allele_fraction"), 10)
                min_af = float(min_af_pct) / 100.0
                opts += " --min-alternate-fraction %s" % min_af
            opts += " --min-repeat-entropy 1 --experimental-gls"
            # Recommended settings for cancer calling
            opts += (" --pooled-discrete --pooled-continuous --genotype-qualities "
                     "--report-genotype-likelihood-max --allele-balance-priors-off")
            cl = ("{freebayes} -f {ref_file} {opts} "
                  "{paired.tumor_bam} {paired.normal_bam} "
                  "| vcffilter -f 'QUAL > 5' -s "
                  "| {py_cl} -x 'bcbio.variation.freebayes.call_somatic(x)' "
                  "| {fix_ambig} | vcfallelicprimitives --keep-info --keep-geno "
                  "| vt normalize -q -r {ref_file} - "
                  "{compress_cmd} > {tx_out_file}").format(
                      freebayes=freebayes,
                      ref_file=ref_file,
                      opts=opts,
                      paired=paired,
                      # "py" executable next to the running Python interpreter
                      py_cl=os.path.join(os.path.dirname(sys.executable), "py"),
                      fix_ambig=vcfutils.fix_ambiguous_cl(),
                      compress_cmd="| bgzip -c" if out_file.endswith("gz") else "",
                      tx_out_file=tx_out_file)
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl, "Genotyping paired variants with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"), ref_file, config)
def _paired_load_script(work_bams, names, chrom, pairmode, items):
    """Prepare BAMs for assessing CNVs in a paired tumor/normal setup.

    Fills in the targeted template when a regional BED file is available for
    the first item, otherwise the general template. ``names`` is not used.
    """
    paired = vcfutils.get_paired_bams(work_bams, items)
    bed_file = _get_regional_bed_file(items[0])
    # Values shared by both templates
    fields = dict(case_file=paired.tumor_bam, case_name=paired.tumor_name,
                  ctrl_file=paired.normal_bam, ctrl_name=paired.normal_name,
                  num_cores=0, chrom=chrom, pairmode=pairmode)
    if bed_file:
        return _paired_prep_targeted.format(bed_file=bed_file, **fields)
    return _paired_prep.format(**fields)
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run strelka2 variant calling, either paired tumor/normal or germline calling.

    region can be a single region or list of multiple regions for multicore calling.
    A paired analysis must include a normal sample.
    """
    if not vcfutils.is_paired_analysis(align_bams, items):
        return _run_germline(align_bams, items, ref_file, assoc_files, region, out_file)
    paired = vcfutils.get_paired_bams(align_bams, items)
    assert paired.normal_bam, "Strelka2 requires a normal sample"
    return _run_somatic(paired, ref_file, assoc_files, region, out_file)
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run octopus variant calling, handling both somatic and germline calling.

    An existing ``out_file`` is returned without re-running the caller.
    Target regions come from the population variant regions of ``items``,
    subset to ``region``. ``assoc_files`` is not used.
    """
    if utils.file_exists(out_file):
        return out_file
    paired = vcfutils.get_paired_bams(align_bams, items)
    variant_regions = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(variant_regions, region, out_file,
                                           items=items, do_merge=True)
    if paired:
        return _run_somatic(paired, ref_file, target, out_file)
    return _run_germline(align_bams, items, ref_file, target, out_file)
def _run_cnvkit_cancer(items, background):
    """Run CNVkit on a tumor/normal pair.

    Returns the input items unchanged when CNVkit produces no output,
    otherwise the updated tumor data followed by every item whose sample
    name differs from the tumor's. ``background`` is not used here.
    """
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    tumor = paired.tumor_data
    work_dir = _sv_workdir(tumor)
    ckouts = _run_cnvkit_shared([tumor], [paired.tumor_bam], [paired.normal_bam],
                                work_dir, background_name=paired.normal_name)
    if not ckouts:
        return items
    assert len(ckouts) == 1
    tumor_data = _associate_cnvkit_out(ckouts, [tumor])
    normal_data = [x for x in items if dd.get_sample_name(x) != paired.tumor_name]
    return tumor_data + normal_data
def _run_cnvkit_cancer(items, background):
    """Run CNVkit on a tumor/normal pair.

    Creates the access file from the tumor sample's reference, runs CNVkit
    with the normal as background, and returns the updated tumor data
    followed by every item whose sample name differs from the tumor's.
    ``background`` is not used here.
    """
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    tumor = paired.tumor_data
    work_dir = _sv_workdir(tumor)
    access_file = _create_access_file(dd.get_ref_file(tumor), work_dir, tumor)
    ckout = _run_cnvkit_shared(tumor, [paired.tumor_bam], [paired.normal_bam],
                               access_file, work_dir,
                               background_name=paired.normal_name)
    # THetA runs are skipped until the data preparation steps can be sped up
    tumor_data = _associate_cnvkit_out(ckout, [tumor])
    normal_data = [x for x in items if dd.get_sample_name(x) != paired.tumor_name]
    return tumor_data + normal_data
def _run_cnvkit_cancer(items, background):
    """Run CNVkit on a tumor/normal pair.

    Creates the access file from the tumor sample's reference and runs CNVkit
    with the normal as background. Returns the input items unchanged when
    CNVkit produces no output, otherwise the updated tumor data followed by
    every item whose sample name differs from the tumor's.
    ``background`` is not used here.
    """
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    tumor = paired.tumor_data
    work_dir = _sv_workdir(tumor)
    access_file = _create_access_file(dd.get_ref_file(tumor), work_dir, tumor)
    ckout = _run_cnvkit_shared(tumor, [paired.tumor_bam], [paired.normal_bam],
                               access_file, work_dir,
                               background_name=paired.normal_name)
    if not ckout:
        return items
    tumor_data = _associate_cnvkit_out(ckout, [tumor])
    normal_data = [x for x in items if dd.get_sample_name(x) != paired.tumor_name]
    return tumor_data + normal_data
def _paired_load_script(work_bams, names, chrom, pairmode, items):
    """Prepare BAMs for assessing CNVs in a paired tumor/normal setup.

    Fills in the targeted template when the first item configures an existing
    ``variant_regions`` file and its ``coverage_interval`` is not "genome".
    Otherwise fills in the general template. ``names`` is not used.
    """
    paired = vcfutils.get_paired_bams(work_bams, items)
    algorithm = items[0]["config"]["algorithm"]
    bed_file = algorithm.get("variant_regions", None)
    is_genome = algorithm.get("coverage_interval", "exome").lower() == "genome"
    # Values shared by both templates
    fields = dict(case_file=paired.tumor_bam, case_name=paired.tumor_name,
                  ctrl_file=paired.normal_bam, ctrl_name=paired.normal_name,
                  num_cores=0, chrom=chrom, pairmode=pairmode)
    if utils.file_exists(bed_file) and not is_genome:
        return _paired_prep_targeted.format(bed_file=bed_file, **fields)
    return _paired_prep.format(**fields)
def run_varscan(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variants with VarScan, choosing paired or single-sample mode.

    Paired calling is used only when both a tumor and a normal BAM are
    present. Otherwise the items are checked for pairing problems and the
    standard caller is used.
    """
    paired = get_paired_bams(align_bams, items)
    if paired and paired.normal_bam and paired.tumor_bam:
        call_fn = _varscan_paired
    else:
        vcfutils.check_paired_problems(items)
        call_fn = _varscan_work
    return samtools.shared_variantcall(call_fn, "varscan", align_bams, ref_file,
                                       items, assoc_files, region, out_file)
def mutect2_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variants with GATK's MuTect2 and return a bgzipped, indexed VCF.

    Supports both GATK4 (``Mutect2`` with ``--reference``, followed by a
    filtering command) and earlier releases (``MuTect2`` with ``-R``, writing
    to stdout). Requires the full GATK distribution, version 3.5 or later.

    When ``out_file`` is None it is derived from the first input BAM. An
    existing ``out_file`` is reused without re-running the caller.
    ``assoc_files`` is accepted for interface compatibility but not used.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        paired = vcfutils.get_paired_bams(align_bams, items)
        broad_runner = broad.runner_from_config(items[0]["config"])
        gatk_type = broad_runner.gatk_type()
        is_gatk4 = gatk_type == "gatk4"
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            params = ["-T", "Mutect2" if is_gatk4 else "MuTect2",
                      "--annotation", "ClippingRankSumTest",
                      "--annotation", "DepthPerSampleHC"]
            # The reference flag differs between GATK4 and earlier releases
            params.extend(["--reference" if is_gatk4 else "-R", ref_file])
            for extra in annotation.get_gatk_annotations(items[0]["config"],
                                                         include_baseqranksum=False):
                params.extend(["--annotation", extra])
            if is_gatk4:
                # Avoid issues with BAM CIGAR reads that GATK doesn't like
                params.extend(["--read-validation-stringency", "LENIENT"])
            params.extend(_add_tumor_params(paired, items, gatk_type))
            params.extend(_add_region_params(region, out_file, items, gatk_type))
            # dbSNP/Cosmic resources are deliberately not added so they do not get
            # fed to the variant filtering algorithm. It is not yet clear how this
            # helps or hurts in the general case.
            resources = config_utils.get_resources("mutect2", items[0]["config"])
            if "options" in resources:
                params.extend([str(opt) for opt in resources.get("options", [])])
            assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
            tx_base, tx_ext = utils.splitext_plus(tx_out_file)
            if is_gatk4:
                # GATK4 writes raw calls to a file, then filters them in a second step
                tx_raw_prefilt_file = "%s-raw%s" % (tx_base, tx_ext)
                tx_raw_file = "%s-raw-filt%s" % (tx_base, tx_ext)
                filter_cmd = _mutect2_filter(broad_runner, tx_raw_prefilt_file,
                                             tx_raw_file, ref_file)
                cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}".format(
                    gatk_cmd=gatk_cmd, tx_raw_prefilt_file=tx_raw_prefilt_file,
                    filter_cmd=filter_cmd)
            else:
                tx_raw_file = "%s-raw%s" % (tx_base, tx_ext)
                cmd = "{gatk_cmd} > {tx_raw_file}".format(gatk_cmd=gatk_cmd,
                                                          tx_raw_file=tx_raw_file)
            do.run(cmd, "MuTect2")
            out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def estimate(items, batch, config):
    """Estimate heterogeneity for a pair of tumor/normal samples.

    Run in parallel. Returns each item whose first batch equals ``batch``,
    wrapped in its own single-element list. ``config`` is not used.

    XXX In progress, currently uses THetA but not yet turned on
    """
    paired = vcfutils.get_paired_bams([dd.get_align_bam(d) for d in items], items)
    tumor_cnvs = _get_cnvs(paired.tumor_data)
    # The THetA result is only printed for now; it is not attached to the output
    new_cnvs = theta.run(tumor_cnvs[0], paired)
    print(new_cnvs)
    return [[data] for data in items if batch == _get_batches(data)[0]]
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run strelka2 variant calling, either paired tumor/normal or germline calling.

    region can be a single region or list of multiple regions for multicore calling.
    Raw calls and the strelka work directory are named after ``out_file``.
    The raw calls are then annotated and filtered into ``out_file``.
    """
    out_base = utils.splitext_plus(out_file)[0]
    raw_file = "%s-raw.vcf.gz" % out_base
    strelka_work_dir = "%s-work" % out_base
    paired = vcfutils.get_paired_bams(align_bams, items)
    if paired:
        assert paired.normal_bam, "Strelka2 requires a normal sample"
        call_file = _run_somatic(paired, ref_file, assoc_files, region,
                                 raw_file, strelka_work_dir)
    else:
        call_file = _run_germline(align_bams, items, ref_file, assoc_files, region,
                                  raw_file, strelka_work_dir)
    return _af_annotate_and_filter(paired, items, call_file, out_file)
def _run_tumor_pindel_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect indels with pindel in tumor/[normal] analysis.

    Only attempts to detect small insertion/deletions and not larger
    structural events.

    :param align_bams: (list) bam files
    :param items: (list) sample information from yaml
    :param ref_file: (str) genome in fasta format
    :param assoc_files: (dict) files for annotation
    :param region: (str or tuple) region to analyze
    :param out_file: (str) final vcf file
    :returns: (str) final annotated vcf file
    """
    config = items[0]["config"]
    paired = get_paired_bams(align_bams, items)
    if out_file is None:
        out_file = "%s-indels.vcf" % os.path.splitext(align_bams[0])[0]
    # Tumor first, followed by the normal when one is present
    paired_bam = [paired.tumor_bam]
    paired_name = [paired.tumor_name]
    if paired.normal_bam:
        paired_bam.append(paired.normal_bam)
        paired_name.append(paired.normal_name)
    if not utils.file_exists(out_file):
        with tx_tmpdir(config) as tmp_path:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            root_pindel = os.path.join(tmp_path, "pindelroot")
            pindel = config_utils.get_program("pindel", config)
            opts = _pindel_options(items, config, out_file, region, tmp_path)
            tmp_input = _create_tmp_input(paired_bam, paired_name, tmp_path, config)
            cmd = ("{pindel} -f {ref_file} -i {tmp_input} -o {root_pindel} "
                   "{opts} --report_inversions false --report_duplications false "
                   "--report_long_insertions false --report_breakpoints false "
                   "--report_interchromosomal_events false "
                   "--max_range_index 2").format(pindel=pindel, ref_file=ref_file,
                                                 tmp_input=tmp_input,
                                                 root_pindel=root_pindel, opts=opts)
            do.run(cmd, "Genotyping with pindel", {})
            out_file = _create_vcf(root_pindel, out_file, ref_file, items, paired)
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"), ref_file, config)
def _run_vardict_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect variants with VarDict for paired tumor / normal samples.

    Falls back to the single-sample VarDict caller when no normal BAM is
    present. Otherwise pipes VarDict through ``testsomatic.R`` and
    ``var2vcf_paired.pl``. A somatic filter is applied unless any item lists
    ``vardict_somatic_filter`` in ``tools_off``.

    Returns the path of the annotated VCF.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                # Tumor-only input: hand off to the unpaired caller
                return _run_vardict_caller(align_bams, items, ref_file,
                                           assoc_files, region, out_file)
            # The vcffilter lookup is kept although the pipeline below does not
            # reference its result.
            vcffilter = config_utils.get_program("vcffilter", config)
            vardict = config_utils.get_program("vardict", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            min_af_pct = utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)
            # merge bed file regions as amplicon VarDict is only supported in single sample mode
            opts = " ".join(_vardict_options_from_config(items, config, out_file, region,
                                                         do_merge=True))
            coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
            if coverage_interval == "regional":
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 "
            else:
                var2vcf_opts = ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            filter_disabled = any(
                "vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                for data in items)
            if filter_disabled:
                somatic_filter = ""
            else:
                # "py" executable next to the running Python interpreter
                py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                somatic_filter = "| %s -x 'bcbio.variation.freebayes.call_somatic(x)'" % py_cl
            cmd = ("{vardict} -G {ref_file} -f {freq} "
                   "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                   "| {strandbias} "
                   "| {var2vcf} -M -N \"{paired.tumor_name}|{paired.normal_name}\" -f {freq} {var2vcf_opts} "
                   "{somatic_filter} | {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}").format(
                       vardict=vardict,
                       ref_file=ref_file,
                       freq=float(min_af_pct) / 100.0,
                       paired=paired,
                       opts=opts,
                       strandbias="testsomatic.R",
                       var2vcf="var2vcf_paired.pl",
                       var2vcf_opts=var2vcf_opts,
                       somatic_filter=somatic_filter,
                       fix_ambig=fix_ambig,
                       vcfstreamsort=vcfstreamsort,
                       compress_cmd="| bgzip -c" if out_file.endswith("gz") else "",
                       tx_out_file=tx_out_file)
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cmd, "Genotyping with VarDict: Inference", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"), ref_file, config)
def _run_cnvkit_cancer(items, background):
    """Run CNVkit on a tumor/normal pair.

    Returns the input items unchanged when CNVkit produces no output,
    otherwise the updated tumor data followed by every item whose sample
    name differs from the tumor's. ``background`` is not used here.
    """
    align_bams = [x["align_bam"] for x in items]
    paired = vcfutils.get_paired_bams(align_bams, items)
    work_dir = _sv_workdir(paired.tumor_data)
    ckouts = _run_cnvkit_shared([paired.tumor_data],
                                [paired.tumor_bam], [paired.normal_bam],
                                work_dir, background_name=paired.normal_name)
    if not ckouts:
        return items
    assert len(ckouts) == 1
    tumor_data = _associate_cnvkit_out(ckouts, [paired.tumor_data])
    normal_data = [x for x in items if dd.get_sample_name(x) != paired.tumor_name]
    return tumor_data + normal_data
def run(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run tumor only smCounter2 calling.

    Asserts that the input is a tumor sample without a normal. smCounter2
    runs inside the transaction directory; its ``*.smCounter*`` outputs are
    then moved next to ``out_file`` and the ``.smCounter.cut.vcf`` result is
    linked to ``out_file``. The call is skipped when ``out_file`` or its
    ``.gz`` form already exists.

    Returns the bgzipped and indexed VCF. The ``sed`` step rewrites the sample
    column header from the output prefix to the tumor sample name.
    """
    paired = vcfutils.get_paired_bams(align_bams, items)
    assert paired and not paired.normal_bam, ("smCounter2 supports tumor-only variant calling: %s" %
                                              (",".join([dd.get_sample_name(d) for d in items])))
    tumor = paired.tumor_data
    variant_regions = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(variant_regions, region, out_file,
                                           items=items, do_merge=True)
    # Work with the uncompressed name; compression happens at the end
    out_file = out_file.replace(".vcf.gz", ".vcf")
    out_prefix = utils.splitext_plus(os.path.basename(out_file))[0]
    already_done = utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")
    if not already_done:
        with file_transaction(tumor, out_file) as tx_out_file:
            tx_dir = os.path.dirname(tx_out_file)
            cmd = ["smCounter2", "--runPath", tx_dir,
                   "--outPrefix", out_prefix,
                   "--bedTarget", target, "--refGenome", ref_file,
                   "--bamFile", paired.tumor_bam, "--bamType", "consensus",
                   "--nCPU", dd.get_num_cores(tumor)]
            do.run(cmd, "smcounter2 variant calling")
            final_dir = os.path.dirname(out_file)
            for fname in glob.glob(os.path.join(tx_dir, "*.smCounter*")):
                shutil.move(fname, os.path.join(final_dir, os.path.basename(fname)))
            utils.symlink_plus(os.path.join(final_dir, "%s.smCounter.cut.vcf" % out_prefix),
                               out_file)
    prep_cmd = "sed 's#FORMAT\t%s#FORMAT\t%s#' | %s" % (
        out_prefix, dd.get_sample_name(tumor),
        vcfutils.add_contig_to_header_cl(dd.get_ref_file(tumor), out_file))
    return vcfutils.bgzip_and_index(out_file, tumor["config"], remove_orig=False,
                                    prep_cmd=prep_cmd)