def _cnvkit_segment(cnr_file, cov_interval, data):
    """Segment normalized CNVkit coverage into a ``.cns`` file.

    The result is written next to ``cnr_file`` (same base name, ``.cns``
    extension) and is only rebuilt when it is not up to date with the input.
    When the input has no usable values, a header-only file is produced
    instead of invoking CNVkit. Returns the path to the ``.cns`` file.
    """
    out_file = os.path.splitext(cnr_file)[0] + ".cns"
    if utils.file_uptodate(out_file, cnr_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        if _cna_has_values(cnr_file):
            segment_cmd = [_get_cmd(), "segment", "-p", str(dd.get_cores(data)),
                           "-o", tx_out_file, cnr_file]
            vrn_candidates = _compatible_small_variants(data)
            # Small variants are only passed along for non-genome inputs.
            use_variants = (len(vrn_candidates) > 0
                            and _cna_has_values(cnr_file)
                            and cov_interval != "genome")
            if use_variants:
                segment_cmd.extend(["-v", vrn_candidates[0]])
            if cov_interval == "genome":
                segment_cmd.extend(["--threshold", "0.00001"])
            # Preferentially use the conda installed Rscript and keep
            # temporary files inside the transactional directory.
            r_env = "%s && export TMPDIR=%s && " % (utils.get_R_exports(),
                                                    os.path.dirname(tx_out_file))
            do.run(r_env + " ".join(segment_cmd), "CNVkit segment")
        else:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write(
                    "chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n")
    return out_file
def _run_bubbletree(vcf_csv, cnv_csv, data, wide_lrr=False, do_plots=True, handle_failures=True):
    """Create the BubbleTree R script and run it on the prepared inputs.

    BubbleTree has some internal hardcoded parameters that assume a smaller
    distribution of log2 scores. This is not true for tumor-only calls, so
    when ``wide_lrr`` is set the calculations are scaled to actually get
    calls. Needs a better long term solution with flexible parameters.

    Output file names are derived from ``vcf_csv``. The R script is always
    (re)written, but only executed when the prevalence report does not exist
    yet. With ``handle_failures`` set, failures recognised by
    ``_allowed_bubbletree_errorstates`` are recorded in the report file
    instead of being raised; any other failure is logged and re-raised.

    Returns a dictionary with the caller name, the report path and the
    bubble/track plot paths.
    """
    # NOTE: the locals below are handed to the `_script` template through
    # `format(**locals())`, so their names are part of the template contract
    # and must not be renamed or removed.
    lrr_scale = 10.0 if wide_lrr else 1.0
    local_sitelib = utils.R_sitelib()
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    do_plots = "yes" if do_plots else "no"
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        cmd = "%s && %s --no-environ %s" % (utils.get_R_exports(), utils.Rscript_cmd(), r_file)
        try:
            do.run(cmd, "Assess heterogeneity with BubbleTree")
        except subprocess.CalledProcessError as msg:
            if handle_failures and _allowed_bubbletree_errorstates(str(msg)):
                # Known failure mode: leave a report so the run is not retried.
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
            else:
                # Logger.exception requires a message; calling it without one
                # raises TypeError and hides the original failure.
                logger.exception("BubbleTree heterogeneity assessment failed")
                raise
    return {"caller": "bubbletree",
            "report": freqs_out,
            "plot": {"bubble": bubbleplot_out, "track": trackplot_out}}
def process_intervals(data):
    """Build the PureCN interval file for a sample's target regions.

    Uses the SV BED file when one is available and otherwise falls back to
    the cleaned variant regions. Returns ``None`` when no BED file can be
    found, otherwise the path of the ``.txt`` interval file sitting next to
    the BED file (an existing interval file is reused as is). A failing
    PureCN run is logged and does not raise.
    """
    target_bed = regions.get_sv_bed(data) or bedutils.clean_file(dd.get_variant_regions(data), data)
    if not target_bed:
        return None
    prefix = os.path.splitext(target_bed)[0]
    ready_file = prefix + ".txt"
    if os.path.exists(ready_file):
        return ready_file
    export_bed = prefix + ".optimized.bed"
    rscript_exe = utils.Rscript_cmd("r36")
    interval_script = utils.R_package_script("r36", "PureCN", "extdata/IntervalFile.R")
    fasta = dd.get_ref_file(data)
    mappability = dd.get_variation_resources(data)["purecn_mappability"]
    build = dd.get_genome_build(data)
    interval_cmd = [rscript_exe, interval_script,
                    "--infile", target_bed,
                    "--fasta", fasta,
                    "--outfile", ready_file,
                    "--offtarget",
                    "--genome", build,
                    "--export", export_bed,
                    "--mappability", mappability]
    try:
        shell_line = "export R_LIBS_USER=%s && %s && %s" % (
            utils.R_sitelib(env="r36"),
            utils.get_R_exports(env="r36"),
            " ".join(str(arg) for arg in interval_cmd))
        do.run(shell_line, "PureCN intervals")
    except subprocess.CalledProcessError:
        # Best effort: report the failure and still hand back the expected path.
        logger.info("PureCN failed to prepare intervals")
    logger.debug("Saved PureCN interval file into " + ready_file)
    return ready_file
def create_normal_db(coverage_files_txt, snv_pon, out_dir, genome_build):
    """Create a PureCN normal database.

    input:
        coverage_files_txt - coverage files calculated by PureCN for each sample
        snv_pon - mutect2 SNV panel of normals
    output (as named in the original notes):
        mapping_bias_hg38.rds
        normalDB_hg38.rds

    A failing PureCN run is logged and does not raise. Returns ``out_dir``.
    """
    normaldb_cmd = [
        utils.Rscript_cmd("r36"),
        utils.R_package_script("r36", "PureCN", "extdata/NormalDB.R"),
        "--outdir", out_dir,
        "--coveragefiles", coverage_files_txt,
        "--normal_panel", snv_pon,
        "--genome", genome_build,
        "--force",
    ]
    try:
        shell_line = "export R_LIBS_USER=%s && %s && %s" % (
            utils.R_sitelib(env="r36"),
            utils.get_R_exports(env="r36"),
            " ".join(str(arg) for arg in normaldb_cmd))
        do.run(shell_line, "PureCN normalDB")
    except subprocess.CalledProcessError:
        logger.info("PureCN failed to create a normal db")
    return out_dir
def get_coverage(data):
    """Calculate PureCN coverage for a sample BAM, accounting for GC content.

    ``data`` is a single sample. Returns the path of the expected
    ``<bam base name>_coverage_loess.txt.gz`` file inside the sample's SV work
    directory; PureCN is only run when that file does not exist yet. A
    failing PureCN run is logged and does not raise.
    """
    data = utils.to_single_data(data)
    work_dir = _sv_workdir(data)
    rscript = utils.Rscript_cmd("r36")
    coverage_r = utils.R_package_script("r36", "PureCN", "extdata/Coverage.R")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], data)
    # PureCN resolves symlinks and the actual output PureCN coverage file name
    # is derived from the end bam not from bam_file
    bam_file = os.path.realpath(dd.get_align_bam(data))
    bname = os.path.splitext(os.path.basename(bam_file))[0]
    result_file = os.path.join(work_dir, bname + "_coverage_loess.txt.gz")
    if not os.path.exists(result_file):
        cmd = [rscript, coverage_r,
               "--outdir", work_dir,
               "--bam", bam_file,
               "--intervals", intervals]
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (
                utils.R_sitelib(env="r36"),
                utils.get_R_exports(env="r36"),
                " ".join([str(x) for x in cmd]))
            do.run(cmd_line, "PureCN coverage")
        except subprocess.CalledProcessError:
            # Best effort: report the failure and still return the expected path.
            logger.info("PureCN failed to calculate coverage")
    logger.debug("Saved PureCN coverage files to " + result_file)
    return result_file
def _amber_het_file(vrn_files, work_dir, paired):
    """Create file of BAFs in normal heterozygous positions compatible with AMBER.

    The first entry of ``vrn_files`` is prepared with the BubbleTree helper
    and linked into ``<work_dir>/amber`` as ``<tumor sample>.amber.baf``. A
    BAF segmentation R script is then run unless the ``.pcf`` file already
    exists. Returns the path of the ``.amber.baf`` file.

    https://github.com/hartwigmedical/hmftools/tree/master/amber
    https://github.com/hartwigmedical/hmftools/blob/637e3db1a1a995f4daefe2d0a1511a5bdadbeb05/hmf-common/src/test/resources/amber/new.amber.baf
    """
    # This prepares PURPLE (AMBER) inputs; the message previously named TitanCNA.
    assert vrn_files, "Did not find compatible variant calling files for PURPLE inputs"
    from bcbio.heterogeneity import bubbletree
    prep_file = bubbletree.prep_vrn_file(vrn_files[0]["vrn_file"], vrn_files[0]["variantcaller"],
                                         work_dir, paired, AmberWriter)
    amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
    out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
    utils.symlink_plus(prep_file, out_file)
    pcf_file = out_file + ".pcf"
    if not utils.file_exists(pcf_file):
        with file_transaction(paired.tumor_data, pcf_file) as tx_out_file:
            # The R script is written into the transactional directory.
            r_file = os.path.join(os.path.dirname(tx_out_file), "bafSegmentation.R")
            with open(r_file, "w") as out_handle:
                out_handle.write(_amber_seg_script)
            # NOTE(review): the script is handed the final pcf_file path rather
            # than tx_out_file -- confirm this is intended.
            cmd = "%s && %s --no-environ %s %s %s" % (utils.get_R_exports(), utils.Rscript_cmd(),
                                                      r_file, out_file, pcf_file)
            do.run(cmd, "PURPLE: AMBER baf segmentation")
    return out_file
def _run_cobalt(paired, work_dir):
    """Count read depth across genomic windows with COBALT.

    PURPLE requires even 1000bp windows, so the integrated counting solution
    is used directly rather than converting from CNVkit calculations. If this
    approach is useful it should be moved upstream to be available to other
    tools as an input comparison.

    Results land in ``<work_dir>/cobalt``; the run is skipped when the
    ``<tumor sample>.cobalt`` file already exists. Returns that file path.

    https://github.com/hartwigmedical/hmftools/tree/master/count-bam-lines
    """
    cobalt_dir = utils.safe_makedir(os.path.join(work_dir, "cobalt"))
    out_file = os.path.join(cobalt_dir, "%s.cobalt" % dd.get_sample_name(paired.tumor_data))
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(paired.tumor_data, out_file) as tx_out_file:
        tx_dir = os.path.dirname(tx_out_file)
        cobalt_args = ["COBALT",
                       "-reference", paired.normal_name,
                       "-reference_bam", paired.normal_bam,
                       "-tumor", paired.tumor_name,
                       "-tumor_bam", paired.tumor_bam,
                       "-threads", dd.get_num_cores(paired.tumor_data),
                       "-output_dir", tx_dir,
                       "-gc_profile", dd.get_variation_resources(paired.tumor_data)["gc_profile"]]
        shell_line = "%s && %s" % (utils.get_R_exports(),
                                   " ".join(str(arg) for arg in cobalt_args))
        do.run(shell_line, "PURPLE: COBALT read depth normalization")
        # Move every additional output out of the transactional directory;
        # the primary output is left for the transaction to handle.
        tx_base = os.path.basename(tx_out_file)
        for produced in os.listdir(tx_dir):
            if produced == tx_base:
                continue
            shutil.move(os.path.join(tx_dir, produced), os.path.join(cobalt_dir, produced))
    return out_file
def _cnvkit_segment(cnr_file, cov_interval, data, items, out_file=None):
    """Segment normalized CNVkit coverage into a ``.cns`` file.

    ``out_file`` defaults to ``cnr_file`` with a ``.cns`` extension and is
    only rebuilt when it is not up to date with the input. When the input has
    no usable values a header-only file is written instead of invoking
    CNVkit. Returns the path to the ``.cns`` file.
    """
    out_file = out_file or os.path.splitext(cnr_file)[0] + ".cns"
    if utils.file_uptodate(out_file, cnr_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        if _cna_has_values(cnr_file):
            segment_cmd = [_get_cmd(), "segment", "-p", str(dd.get_cores(data)),
                           "-o", tx_out_file, cnr_file]
            vrn_candidates = _compatible_small_variants(data, items)
            use_variants = (len(vrn_candidates) > 0
                            and _cna_has_values(cnr_file)
                            and cov_interval != "genome")
            if use_variants:
                first_vrn = vrn_candidates[0]
                segment_cmd.extend(["--vcf", first_vrn.name, "--sample-id", first_vrn.sample])
                if first_vrn.normal:
                    segment_cmd.extend(["--normal-id", first_vrn.normal])
            if cov_interval == "genome":
                segment_cmd.extend(["--threshold", "0.00001"])
            # For tumors, remove very low normalized regions, avoiding upcaptured noise
            # https://github.com/chapmanb/bcbio-nextgen/issues/2171#issuecomment-348333650
            if vcfutils.get_paired(items):
                segment_cmd.append("--drop-low-coverage")
            # Preferentially use the conda installed Rscript.
            r_env = "%s && export TMPDIR=%s && " % (utils.get_R_exports(),
                                                    os.path.dirname(tx_out_file))
            do.run(r_env + " ".join(segment_cmd), "CNVkit segment")
        else:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write(
                    "chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n")
    return out_file
def _amber_het_file(method, vrn_files, work_dir, paired):
    """Create file of BAFs in normal heterozygous positions compatible with AMBER.

    Two available methods:
      - pon -- Use panel of normals with likely heterozygous sites.
      - variants -- Use pre-existing variant calls, filtered to likely heterozygotes.

    Returns the path of the produced BAF file.

    https://github.com/hartwigmedical/hmftools/tree/master/amber
    https://github.com/hartwigmedical/hmftools/blob/637e3db1a1a995f4daefe2d0a1511a5bdadbeb05/hmf-common/src/test/resources/amber/new.amber.baf
    """
    assert vrn_files, "Did not find compatible variant calling files for PURPLE inputs"
    from bcbio.heterogeneity import bubbletree
    if method != "variants":
        assert method == "pon"
        return _run_amber(paired, work_dir)

    amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
    baf_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
    first_vrn = vrn_files[0]
    prepped = bubbletree.prep_vrn_file(first_vrn["vrn_file"], first_vrn["variantcaller"],
                                       work_dir, paired, AmberWriter)
    utils.symlink_plus(prepped, baf_file)
    pcf_file = baf_file + ".pcf"
    if not utils.file_exists(pcf_file):
        with file_transaction(paired.tumor_data, pcf_file) as tx_out_file:
            # The segmentation script is written into the transactional directory.
            script_path = os.path.join(os.path.dirname(tx_out_file), "bafSegmentation.R")
            with open(script_path, "w") as script_handle:
                script_handle.write(_amber_seg_script)
            shell_line = "%s && %s --no-environ %s %s %s" % (
                utils.get_R_exports(), utils.Rscript_cmd(), script_path, baf_file, pcf_file)
            do.run(shell_line, "PURPLE: AMBER baf segmentation")
    return baf_file
def _run_cobalt(paired, work_dir):
    """Count read depth across genomic windows with COBALT.

    PURPLE requires even 1000bp windows, so the integrated counting solution
    is used directly rather than converting from CNVkit calculations. If this
    approach is useful it should be moved upstream to be available to other
    tools as an input comparison.

    Results land in ``<work_dir>/cobalt``; the run is skipped when the
    ``<tumor sample>.cobalt`` file already exists. Returns that file path.

    https://github.com/hartwigmedical/hmftools/tree/master/count-bam-lines
    """
    cobalt_dir = utils.safe_makedir(os.path.join(work_dir, "cobalt"))
    out_file = os.path.join(cobalt_dir, "%s.cobalt" % dd.get_sample_name(paired.tumor_data))
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(paired.tumor_data, out_file) as tx_out_file:
        cobalt_args = ["COBALT"]
        cobalt_args += _get_jvm_opts(tx_out_file, paired.tumor_data)
        cobalt_args += ["-reference", paired.normal_name,
                        "-reference_bam", paired.normal_bam,
                        "-tumor", paired.tumor_name,
                        "-tumor_bam", paired.tumor_bam,
                        "-threads", dd.get_num_cores(paired.tumor_data),
                        "-output_dir", os.path.dirname(tx_out_file),
                        "-gc_profile", dd.get_variation_resources(paired.tumor_data)["gc_profile"]]
        shell_line = "%s && %s" % (utils.get_R_exports(),
                                   " ".join(str(arg) for arg in cobalt_args))
        do.run(shell_line, "PURPLE: COBALT read depth normalization")
        # Move every additional output out of the transactional directory;
        # the primary output is left for the transaction to handle.
        tx_dir = os.path.dirname(tx_out_file)
        tx_base = os.path.basename(tx_out_file)
        for produced in os.listdir(tx_dir):
            if produced == tx_base:
                continue
            shutil.move(os.path.join(tx_dir, produced), os.path.join(cobalt_dir, produced))
    return out_file
def _run_titancna(cn_file, het_file, ploidy, num_clusters, work_dir, data):
    """Run the titanCNA wrapper script for one ploidy / cluster-count setting.

    The tool runs inside a temporary directory and its outputs are moved into
    ``<work_dir>/run_ploidy<ploidy>`` afterwards. Nothing is run when the
    ``.titan.txt`` output is up to date with ``cn_file``. Returns the ploidy
    directory.
    """
    # NOTE: sample, cores, export_cmd, tmp_dir and the function arguments are
    # substituted into the command template by name through `locals()`.
    sample = dd.get_sample_name(data)
    cores = dd.get_num_cores(data)
    export_cmd = utils.get_R_exports()
    ploidy_dir = utils.safe_makedir(os.path.join(work_dir, "run_ploidy%s" % ploidy))
    cluster_prefix = "%s_cluster%02d" % (sample, num_clusters)
    titan_out = os.path.join(ploidy_dir, cluster_prefix) + ".titan.txt"
    if utils.file_uptodate(titan_out, cn_file):
        return ploidy_dir
    with tx_tmpdir(data) as tmp_dir:
        with utils.chdir(tmp_dir):
            cmd_parts = [
                "{export_cmd} && titanCNA.R --id {sample} --hetFile {het_file} --cnFile {cn_file} ",
                "--numClusters {num_clusters} --ploidy {ploidy} --numCores {cores} --outDir {tmp_dir} ",
                "--libdir None",
            ]
            if data["genome_build"] in ("hg19", "hg38"):
                cmd_parts.append(" --genomeStyle UCSC")
            # TitanCNA's model is influenced by the variance in read coverage data
            # and data type: set reasonable defaults for non-WGS runs
            # (see https://github.com/gavinha/TitanCNA/tree/master/scripts/R_scripts)
            if dd.get_coverage_interval(data) != "genome":
                cmd_parts.append(" --alphaK=2500 --alphaKHigh=2500")
            do.run("".join(cmd_parts).format(**locals()),
                   "TitanCNA CNV detection: ploidy %s, cluster %s" % (ploidy, num_clusters))
        for produced in glob.glob(os.path.join(tmp_dir, cluster_prefix + "*")):
            shutil.move(produced, ploidy_dir)
        rplots = os.path.join(tmp_dir, "Rplots.pdf")
        if os.path.exists(rplots):
            shutil.move(rplots, os.path.join(ploidy_dir, "%s.Rplots.pdf" % cluster_prefix))
    return ploidy_dir
def _run_titancna(cn_file, het_file, ploidy, num_clusters, work_dir, data):
    """Run the titanCNA wrapper script for one ploidy / cluster-count setting.

    The tool runs inside a temporary directory and its outputs are moved into
    ``<work_dir>/run_ploidy<ploidy>`` afterwards. Nothing is run when the
    ``.titan.txt`` output is up to date with ``cn_file``. Returns the ploidy
    directory.
    """
    # NOTE: sample, cores, export_cmd, tmp_dir and the function arguments are
    # substituted into the command template by name through `locals()`.
    sample = dd.get_sample_name(data)
    cores = dd.get_num_cores(data)
    export_cmd = utils.get_R_exports()
    ploidy_dir = utils.safe_makedir(os.path.join(work_dir, "run_ploidy%s" % ploidy))
    cluster_prefix = "%s_cluster%02d" % (sample, num_clusters)
    titan_out = os.path.join(ploidy_dir, cluster_prefix) + ".titan.txt"
    if utils.file_uptodate(titan_out, cn_file):
        return ploidy_dir
    with tx_tmpdir(data) as tmp_dir:
        with utils.chdir(tmp_dir):
            template = (
                "{export_cmd} && titanCNA.R --id {sample} --hetFile {het_file} --cnFile {cn_file} "
                "--numClusters {num_clusters} --ploidy {ploidy} --numCores {cores} --outDir {tmp_dir}")
            do.run(template.format(**locals()),
                   "TitanCNA CNV detection: ploidy %s, cluster %s" % (ploidy, num_clusters))
        for produced in glob.glob(os.path.join(tmp_dir, cluster_prefix + "*")):
            shutil.move(produced, ploidy_dir)
        rplots = os.path.join(tmp_dir, "Rplots.pdf")
        if os.path.exists(rplots):
            shutil.move(rplots, os.path.join(ploidy_dir, "%s.Rplots.pdf" % cluster_prefix))
    return ploidy_dir
def _run_purecn_normaldb(paired, out):
    """Run PureCN with a normal database and its native segmentation.

    ``paired`` is one tumor/normal pair or a tumor-only sample. PureCN is
    only run when ``<sample name>.rds`` is missing from the sample's SV work
    directory, and a failing run is logged rather than raised.

    The incoming ``out`` value is not read; the function returns the output
    mapping reported by ``_get_purecn_files(..., require_exist=True)``.
    """
    sample = utils.to_single_data(paired.tumor_data)
    sample_name = dd.get_sample_name(sample)
    work_dir = _sv_workdir(sample)
    rscript = utils.Rscript_cmd("r36")
    purecn_r = utils.R_package_script("r36", "PureCN", "extdata/PureCN.R")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], sample)
    # germline and somatic - just annotated and filters assigned
    variant_info = tz.get_in(["variants"], sample)[0]
    variants_vcf = variant_info.get("germline")
    # in a T/N case, there is no germline file - vrn file with all variants
    if not variants_vcf:
        variants_vcf = variant_info.get("vrn_file")
    normaldb = tz.get_in(["config", "algorithm", "background", "cnv_reference", "purecn_normaldb"], sample)
    mappingbiasfile = tz.get_in(["config", "algorithm", "background", "cnv_reference", "purecn_mapping_bias"],
                                sample)
    sample_coverage = tz.get_in(["depth", "bins", "purecn"], sample)
    simple_repeat_bed = dd.get_variation_resources(sample)["simple_repeat"]
    result_file = os.path.join(work_dir, sample_name + ".rds")
    genome = dd.get_genome_build(sample)
    cmd = [rscript, purecn_r,
           "--out", work_dir,
           "--tumor", sample_coverage,
           "--sampleid", sample_name,
           "--vcf", variants_vcf,
           "--normaldb", normaldb,
           "--mappingbiasfile", mappingbiasfile,
           "--intervals", intervals,
           "--snpblacklist", simple_repeat_bed,
           "--genome", genome,
           "--force",
           "--postoptimize",
           "--seed", "123",
           "--bootstrapn", "500",
           "--cores", dd.get_num_cores(sample)]
    resources = config_utils.get_resources("purecn", sample)
    if "options" in resources:
        cmd += [str(x) for x in resources.get("options", [])]
    # it is not recommended to use matched normal sample in PureCN analysis,
    # because then it skips PON coverage normalization and denoising steps!
    # but still, if it is supplied, we use it
    if paired.normal_data:
        normal_sample = utils.to_single_data(paired.normal_data)
        if normal_sample:
            normal_coverage = tz.get_in(["depth", "bins", "purecn"], normal_sample)
            cmd.extend(["--normal", normal_coverage])
    if not os.path.exists(result_file):
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (
                utils.R_sitelib(env="r36"),
                utils.get_R_exports(env="r36"),
                " ".join([str(x) for x in cmd]))
            do.run(cmd_line, "PureCN copy number calling")
            logger.debug("Saved PureCN output to " + work_dir)
        except subprocess.CalledProcessError:
            # Best effort: report the failure and return whatever outputs exist.
            logger.info("PureCN failed")
    out = _get_purecn_files(paired, work_dir, require_exist=True)[1]
    return out
def _cnvkit_segment(cnr_file, cov_interval, data, items, out_file=None, detailed=False):
    """Segment normalized CNVkit coverage into a ``.cns`` file.

    ``out_file`` defaults to ``cnr_file`` with a ``.cns`` extension and is
    only rebuilt when it is not up to date with the input. When the input has
    no usable values a header-only file is written instead of invoking
    CNVkit. Options from the ``cnvkit_segment`` resources are appended to the
    command and take precedence over the built-in ``--threshold`` and
    ``--drop-low-coverage`` defaults. ``detailed`` is currently not acted on.
    Returns the path to the ``.cns`` file.
    """
    out_file = out_file or os.path.splitext(cnr_file)[0] + ".cns"
    if utils.file_uptodate(out_file, cnr_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        if _cna_has_values(cnr_file):
            # Scale cores to avoid memory issues with segmentation
            # https://github.com/etal/cnvkit/issues/346
            cores = max(1, dd.get_cores(data) // 2) if cov_interval == "genome" else dd.get_cores(data)
            segment_cmd = [_get_cmd(), "segment", "-p", str(cores), "-o", tx_out_file, cnr_file]
            vrn_candidates = _compatible_small_variants(data, items)
            use_variants = (len(vrn_candidates) > 0
                            and _cna_has_values(cnr_file)
                            and cov_interval != "genome")
            if use_variants:
                first_vrn = vrn_candidates[0]
                segment_cmd.extend(["--vcf", first_vrn.name, "--sample-id", first_vrn.sample])
                if first_vrn.normal:
                    segment_cmd.extend(["--normal-id", first_vrn.normal])
            user_options = config_utils.get_resources("cnvkit_segment", data["config"]).get("options", [])
            segment_cmd.extend(str(opt) for opt in user_options)
            if cov_interval == "genome" and "--threshold" not in user_options:
                segment_cmd.extend(["--threshold", "0.00001"])
            # For tumors, remove very low normalized regions, avoiding upcaptured noise
            # https://github.com/bcbio/bcbio-nextgen/issues/2171#issuecomment-348333650
            # unless we want detailed segmentation for downstream tools
            # (a `-m hmm-tumor` switch for `detailed` is currently disabled).
            if vcfutils.get_paired(items) and "--drop-low-coverage" not in user_options:
                segment_cmd.append("--drop-low-coverage")
            # Preferentially use the conda installed Rscript.
            r_env = "%s && export TMPDIR=%s && " % (utils.get_R_exports(),
                                                    os.path.dirname(tx_out_file))
            do.run(r_env + " ".join(segment_cmd), "CNVkit segment")
        else:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write(
                    "chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n")
    return out_file
def _run_purecn(paired, work_dir):
    """Run PureCN.R wrapper with pre-segmented CNVkit or GATK4 inputs.

    The run is skipped when the ``rds`` output is up to date with the
    normalized coverage file or when a ``-failed.log`` marker from an earlier
    attempt exists. Failures accepted by ``_allowed_errors`` are written to
    that marker file and skipped; any other failure is logged and re-raised.

    Returns the mapping of PureCN outputs when the ``rds`` file exists,
    otherwise ``None``.
    """
    segfns = {"cnvkit": _segment_normalized_cnvkit, "gatk-cnv": _segment_normalized_gatk}
    out_base, out, all_files = _get_purecn_files(paired, work_dir)
    failed_file = out_base + "-failed.log"
    cnr_file = tz.get_in(["depth", "bins", "normalized"], paired.tumor_data)
    if not utils.file_uptodate(out["rds"], cnr_file) and not utils.file_exists(failed_file):
        cnr_file, seg_file = segfns[cnvkit.bin_approach(paired.tumor_data)](cnr_file, work_dir, paired)
        from bcbio import heterogeneity
        vcf_file = heterogeneity.get_variants(paired.tumor_data, include_germline=False)[0]["vrn_file"]
        vcf_file = germline.filter_to_pass_and_reject(vcf_file, paired, out_dir=work_dir)
        with file_transaction(paired.tumor_data, out_base) as tx_out_base:
            # Use UCSC style naming for human builds to support BSgenome
            genome = ("hg19" if dd.get_genome_build(paired.tumor_data) in ["GRCh37", "hg19"]
                      else dd.get_genome_build(paired.tumor_data))
            cmd = ["PureCN.R", "--seed", "42", "--out", tx_out_base, "--rds", "%s.rds" % tx_out_base,
                   "--sampleid", dd.get_sample_name(paired.tumor_data),
                   "--genome", genome,
                   "--vcf", vcf_file, "--tumor", cnr_file,
                   "--segfile", seg_file, "--funsegmentation", "Hclust", "--maxnonclonal", "0.3"]
            if dd.get_num_cores(paired.tumor_data) > 1:
                cmd += ["--cores", str(dd.get_num_cores(paired.tumor_data))]
            try:
                cmd = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(), utils.get_R_exports(),
                                                             " ".join([str(x) for x in cmd]))
                do.run(cmd, "PureCN copy number calling")
            except subprocess.CalledProcessError as msg:
                if _allowed_errors(str(msg)):
                    logger.info("PureCN failed to find solution for %s: skipping" %
                                dd.get_sample_name(paired.tumor_data))
                    # Leave a marker so later runs skip this sample.
                    with open(failed_file, "w") as out_handle:
                        out_handle.write(str(msg))
                else:
                    # Logger.exception requires a message; calling it without
                    # one raises TypeError and hides the original failure.
                    logger.exception("PureCN copy number calling failed")
                    raise
            # Move whatever PureCN produced from the transactional directory
            # to the final output directory.
            tx_dir = os.path.dirname(tx_out_base)
            final_dir = os.path.dirname(out_base)
            for f in all_files:
                if os.path.exists(os.path.join(tx_dir, f)):
                    shutil.move(os.path.join(tx_dir, f), os.path.join(final_dir, f))
    out = _get_purecn_files(paired, work_dir, require_exist=True)[1]
    return out if (out.get("rds") and os.path.exists(out["rds"])) else None
def cpat(assembled_fasta, hexamer, logit, data, out_file=None):
    """Run cpat.py on ``assembled_fasta`` and return the output path.

    An existing ``out_file`` is returned untouched. When no ``out_file`` is
    given, a persistent temporary file with a ``.cpat`` suffix is used.
    """
    if out_file:
        if file_exists(out_file):
            return out_file
    else:
        out_file = tempfile.NamedTemporaryFile(delete=False, suffix=".cpat").name
    # NOTE: r_setup, cpat_cmd, tx_out_file and the function arguments are
    # substituted into the template by name through `locals()`.
    cpat_cmd = config_utils.get_program("cpat.py", data)
    r_setup = utils.get_R_exports()
    template = ("{r_setup} && {cpat_cmd} --gene={assembled_fasta} --hex={hexamer} "
                "--logitModel={logit} --outfile={tx_out_file}")
    description = "Predicing coding potential of %s." % (assembled_fasta)
    with file_transaction(out_file) as tx_out_file:
        do.run(template.format(**locals()), description)
    return out_file
def _do_run(paired):
    """Perform Battenberg calling on the paired tumor/normal dataset.

    This purposely does not use a temporary directory for the output since
    Battenberg does smart restarts. The tool is only invoked when expected
    outputs are missing. Returns the output mapping extended with the plot
    files and the ignore-chromosomes file.
    """
    work_dir = _sv_workdir(paired.tumor_data)
    out = _get_battenberg_out(paired, work_dir)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")
    if _missing_files(out):
        # NOTE: the locals defined in this branch are substituted into the
        # command template by name through `locals()`.
        ref_file = dd.get_ref_file(paired.tumor_data)
        bat_datadir = os.path.normpath(os.path.join(os.path.dirname(ref_file), os.pardir, "battenberg"))
        impute_info = os.path.join(bat_datadir, "impute", "impute_info.txt")
        ignore_file, gl_file = _make_ignore_file(work_dir, ref_file, ignore_file, impute_info)
        tooldir = install.get_defaults().get("tooldir", "/usr/local")
        local_sitelib = os.path.join(tooldir, "lib", "R", "site-library")
        tumor_bam = paired.tumor_bam
        normal_bam = paired.normal_bam
        platform = dd.get_platform(paired.tumor_data)
        genome_build = paired.tumor_data["genome_build"]
        # scale cores to avoid over-using memory during imputation
        cores = max(1, int(dd.get_num_cores(paired.tumor_data) * 0.5))
        gender_codes = {"male": "XY", "female": "XX", "unknown": "L"}
        gender = gender_codes.get(population.get_gender(paired.tumor_data))
        # The "L" code additionally gets the file produced alongside the ignore list.
        gender_str = ("-ge %s -gl %s" % (gender, gl_file)) if gender == "L" else ("-ge %s" % (gender))
        r_export_cmd = utils.get_R_exports()
        template = ("export R_LIBS_USER={local_sitelib} && {r_export_cmd} && "
                    "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai "
                    "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt "
                    "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt "
                    "-ig {ignore_file} {gender_str} "
                    "-assembly {genome_build} -species Human -platform {platform}")
        do.run(template.format(**locals()), "Battenberg CNV calling")
        assert len(_missing_files(out)) == 0, "Missing Battenberg output: %s" % _missing_files(out)
    out["plot"] = _get_battenberg_out_plots(paired, work_dir)
    out["ignore"] = ignore_file
    return out
def make_logit_model(coding_fasta, noncoding_fasta, hexamers, data, out_dir=None):
    """Build the coding/noncoding logit model with make_logitModel.py.

    The model is stored as ``logit.logit.RData`` inside ``out_dir``; an
    existing model file is reused. The tool is given a temporary output
    prefix and the resulting ``.logit.RData`` file is moved into place
    afterwards. Returns the model path.
    """
    safe_makedir(out_dir)
    out_file = os.path.join(out_dir, "logit") + ".logit.RData"
    if file_exists(out_file):
        return out_file
    # NOTE: r_setup, logit_cmd, tx_prefix and the function arguments are
    # substituted into the template by name through `locals()`.
    tx_prefix = tempfile.NamedTemporaryFile(delete=False).name
    logit_cmd = config_utils.get_program("make_logitModel.py", data)
    r_setup = utils.get_R_exports()
    template = ("{r_setup} && {logit_cmd} --cgene={coding_fasta} --ngene={noncoding_fasta} "
                "--hex={hexamers} --outfile={tx_prefix}")
    do.run(template.format(**locals()), "Building coding/noncoding logistical model.")
    shutil.move(tx_prefix + ".logit.RData", out_file)
    return out_file
def _run_titancna(cn_file, het_file, ploidy, num_clusters, work_dir, data):
    """Run the titanCNA wrapper script for one ploidy / cluster-count setting.

    The tool runs inside a temporary directory and its outputs are moved into
    ``<work_dir>/run_ploidy<ploidy>`` afterwards. Nothing is run when the
    ``.titan.txt`` output is up to date with ``cn_file``. Returns the ploidy
    directory.
    """
    # NOTE: sample, cores, export_cmd, tmp_dir, genome_build and the function
    # arguments are substituted into the command template by name through
    # `locals()`.
    sample = dd.get_sample_name(data)
    cores = dd.get_num_cores(data)
    export_cmd = utils.get_R_exports()
    ploidy_dir = utils.safe_makedir(os.path.join(work_dir, "run_ploidy%s" % ploidy))
    cluster_prefix = "%s_cluster%02d" % (sample, num_clusters)
    titan_out = os.path.join(ploidy_dir, cluster_prefix) + ".titan.txt"
    if utils.file_uptodate(titan_out, cn_file):
        return ploidy_dir
    with tx_tmpdir(data) as tmp_dir:
        with utils.chdir(tmp_dir):
            cmd_parts = [
                "{export_cmd} && titanCNA.R --id {sample} --hetFile {het_file} --cnFile {cn_file} ",
                "--numClusters {num_clusters} --ploidy {ploidy} --numCores {cores} --outDir {tmp_dir} ",
                "--libdir None",
            ]
            # Autosomes and X, passed without a "chr" prefix; X is always included.
            chroms = ["'%s'" % contig.name.replace("chr", "")
                      for contig in ref.file_contigs(dd.get_ref_file(data))
                      if chromhacks.is_autosomal_or_x(contig.name)]
            if "'X'" not in chroms:
                chroms.append("'X'")
            # Use UCSC style naming for human builds to support BSgenome
            genome_build = dd.get_genome_build(data)
            if genome_build in ["GRCh37", "hg19"]:
                genome_build = "hg19"
            cmd_parts.append(""" --chrs "c(%s)" """ % ",".join(chroms))
            cmd_parts.append(" --genomeBuild {genome_build}")
            if data["genome_build"] in ("hg19", "hg38"):
                cmd_parts.append(" --genomeStyle UCSC")
            if data["genome_build"] in ["hg38"]:
                # The cytoband file is looked up relative to the resolved
                # location of titanCNA.R next to Rscript.
                titan_script = os.path.realpath(
                    os.path.join(os.path.dirname(utils.Rscript_cmd()), "titanCNA.R"))
                data_dir = os.path.normpath(
                    os.path.join(os.path.dirname(titan_script), os.pardir, os.pardir, "data"))
                cytoband_file = os.path.join(data_dir, "cytoBand_hg38.txt")
                assert os.path.exists(cytoband_file), cytoband_file
                cmd_parts.append(" --cytobandFile %s" % cytoband_file)
            # TitanCNA's model is influenced by the variance in read coverage data
            # and data type: set reasonable defaults for non-WGS runs
            # (see https://github.com/gavinha/TitanCNA/tree/master/scripts/R_scripts)
            if dd.get_coverage_interval(data) != "genome":
                cmd_parts.append(" --alphaK=2500 --alphaKHigh=2500")
            do.run("".join(cmd_parts).format(**locals()),
                   "TitanCNA CNV detection: ploidy %s, cluster %s" % (ploidy, num_clusters))
        for produced in glob.glob(os.path.join(tmp_dir, cluster_prefix + "*")):
            shutil.move(produced, ploidy_dir)
        rplots = os.path.join(tmp_dir, "Rplots.pdf")
        if os.path.exists(rplots):
            shutil.move(rplots, os.path.join(ploidy_dir, "%s.Rplots.pdf" % cluster_prefix))
    return ploidy_dir
def rnaseq_vardict_variant_calling(data):
    """Call RNA-seq variants with VarDict and attach the VCF to ``data``.

    The bgzipped VCF is written to ``variation/rnaseq/vardict`` inside the
    work directory; when it already exists it is attached without re-running
    the caller. Returns the updated ``data``.
    """
    sample = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variation", "rnaseq", "vardict"))
    out_file = os.path.join(out_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        return dd.set_vrn_file(data, out_file)

    # NOTE: the locals below are substituted into the pipeline template by
    # name through `locals()`.
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = get_R_exports()
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    data = _setup_variant_regions(data, out_dir)
    opts = vardict._vardict_options_from_config(
        [data], data["config"], out_file, dd.get_variant_regions(data), is_rnaseq=True)[0]
    cores = dd.get_num_cores(data)
    if cores and cores > 1:
        opts += " -th %s" % str(cores)
    template = ("{r_setup} && {jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
                "-N {sample} -b {bamfile} {opts} "
                "| {strandbias}"
                "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
                "> {tx_out_file}")
    with file_transaction(data, out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        do.run(template.format(**locals()), "Calling RNA-seq variants with VarDict")
    out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return dd.set_vrn_file(data, out_file)
def _cnvkit_segment(cnr_file, cov_interval, data, items, out_file=None, detailed=False):
    """Segment normalized CNVkit coverage into a ``.cns`` file.

    ``out_file`` defaults to ``cnr_file`` with a ``.cns`` extension and is
    only rebuilt when it is not up to date with the input. When the input has
    no usable values a header-only file is written instead of invoking
    CNVkit. Options from the ``cnvkit_segment`` resources are appended to the
    command and take precedence over the built-in ``--threshold`` and
    ``--drop-low-coverage`` defaults. ``detailed`` is currently not acted on.
    Returns the path to the ``.cns`` file.
    """
    if not out_file:
        out_file = os.path.splitext(cnr_file)[0] + ".cns"
    if utils.file_uptodate(out_file, cnr_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        if _cna_has_values(cnr_file):
            # Scale cores to avoid memory issues with segmentation
            # https://github.com/etal/cnvkit/issues/346
            is_genome = cov_interval == "genome"
            cores = max(1, dd.get_cores(data) // 2) if is_genome else dd.get_cores(data)
            segment_cmd = [_get_cmd(), "segment", "-p", str(cores), "-o", tx_out_file, cnr_file]
            vrn_candidates = _compatible_small_variants(data, items)
            if len(vrn_candidates) > 0 and _cna_has_values(cnr_file) and not is_genome:
                first_vrn = vrn_candidates[0]
                segment_cmd += ["--vcf", first_vrn.name, "--sample-id", first_vrn.sample]
                if first_vrn.normal:
                    segment_cmd += ["--normal-id", first_vrn.normal]
            resources = config_utils.get_resources("cnvkit_segment", data["config"])
            user_options = resources.get("options", [])
            segment_cmd += [str(opt) for opt in user_options]
            if is_genome and "--threshold" not in user_options:
                segment_cmd += ["--threshold", "0.00001"]
            # For tumors, remove very low normalized regions, avoiding upcaptured noise
            # https://github.com/bcbio/bcbio-nextgen/issues/2171#issuecomment-348333650
            # unless we want detailed segmentation for downstream tools
            # (a `-m hmm-tumor` switch for `detailed` is currently disabled).
            if vcfutils.get_paired(items) and "--drop-low-coverage" not in user_options:
                segment_cmd += ["--drop-low-coverage"]
            # Preferentially use the conda installed Rscript.
            r_env = "%s && export TMPDIR=%s && " % (utils.get_R_exports(),
                                                    os.path.dirname(tx_out_file))
            do.run(r_env + " ".join(segment_cmd), "CNVkit segment")
        else:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write(
                    "chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n")
    return out_file
def _run_purecn(paired, work_dir):
    """Run PureCN.R wrapper with pre-segmented CNVkit or GATK4 inputs.

    The run is skipped when the ``rds`` output is already up to date relative
    to the normalized bin file, or when a ``-failed.log`` marker from an
    earlier attempt exists. Failures accepted by ``_allowed_errors`` are
    recorded in that marker file; any other failure is logged and re-raised.

    Returns the output file dictionary from ``_get_purecn_files`` when the
    ``rds`` result exists on disk, otherwise None.
    """
    segfns = {"cnvkit": _segment_normalized_cnvkit, "gatk-cnv": _segment_normalized_gatk}
    out_base, out, all_files = _get_purecn_files(paired, work_dir)
    failed_file = out_base + "-failed.log"
    cnr_file = tz.get_in(["depth", "bins", "normalized"], paired.tumor_data)
    if not utils.file_uptodate(out["rds"], cnr_file) and not utils.file_exists(failed_file):
        sample_name = dd.get_sample_name(paired.tumor_data)
        cnr_file, seg_file = segfns[cnvkit.bin_approach(paired.tumor_data)](cnr_file, work_dir, paired)
        # NOTE(review): imported locally, presumably to avoid a circular import -- confirm
        from bcbio import heterogeneity
        vcf_file = heterogeneity.get_variants(paired.tumor_data, include_germline=False)[0]["vrn_file"]
        vcf_file = germline.filter_to_pass_and_reject(vcf_file, paired, out_dir=work_dir)
        with file_transaction(paired.tumor_data, out_base) as tx_out_base:
            # Use UCSC style naming for human builds to support BSgenome
            build = dd.get_genome_build(paired.tumor_data)
            genome = "hg19" if build in ["GRCh37", "hg19"] else build
            cmd = ["PureCN.R", "--seed", "42", "--out", tx_out_base, "--rds", "%s.rds" % tx_out_base,
                   "--sampleid", sample_name,
                   "--genome", genome,
                   "--vcf", vcf_file, "--tumor", cnr_file,
                   "--segfile", seg_file, "--funsegmentation", "Hclust", "--maxnonclonal", "0.3"]
            # Guard against a missing core count before comparing
            cores = dd.get_num_cores(paired.tumor_data)
            if cores and cores > 1:
                cmd += ["--cores", str(cores)]
            try:
                cmd_str = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(), utils.get_R_exports(),
                                                                  " ".join([str(x) for x in cmd]))
                do.run(cmd_str, "PureCN copy number calling")
            except subprocess.CalledProcessError as msg:
                if _allowed_errors(str(msg)):
                    logger.info("PureCN failed to find solution for %s: skipping" % sample_name)
                    with open(failed_file, "w") as out_handle:
                        out_handle.write(str(msg))
                else:
                    # Pass an explicit, pre-formatted message: calling
                    # exception() without one is a TypeError with the standard
                    # library logger and would mask the real failure.
                    logger.exception("PureCN copy number calling failed for %s" % sample_name)
                    raise
            # Move every produced file out of the transaction directory
            # while that directory is still available.
            tx_dir = os.path.dirname(tx_out_base)
            final_dir = os.path.dirname(out_base)
            for f in all_files:
                tx_f = os.path.join(tx_dir, f)
                if os.path.exists(tx_f):
                    shutil.move(tx_f, os.path.join(final_dir, f))
    out = _get_purecn_files(paired, work_dir, require_exist=True)[1]
    return out if (out.get("rds") and os.path.exists(out["rds"])) else None
def _cnvkit_segment(cnr_file, cov_interval, data):
    """Segment a normalized CNVkit ``.cnr`` file into a ``.cns`` file.

    The output path is ``cnr_file`` with its extension replaced by ``.cns``.
    Nothing is done when that file is already up to date relative to
    ``cnr_file``. When ``_cna_has_values`` reports no values for the input,
    a header-only output is written instead of running CNVkit.

    Returns the path of the segmentation file.
    """
    out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if utils.file_uptodate(out_file, cnr_file):
        return out_file

    is_genome = cov_interval == "genome"
    with file_transaction(data, out_file) as tx_out_file:
        if _cna_has_values(cnr_file):
            cmd = [_get_cmd(), "segment", "-p", str(dd.get_cores(data)),
                   "-o", tx_out_file, cnr_file]
            small_vrn_files = _compatible_small_variants(data)
            if len(small_vrn_files) > 0 and _cna_has_values(cnr_file) and not is_genome:
                cmd.extend(["-v", small_vrn_files[0]])
            if is_genome:
                cmd.extend(["--threshold", "0.00001"])
            # Preferentially use the conda installed Rscript and keep R
            # temporary files inside the transaction directory.
            export_cmd = ("%s && export TMPDIR=%s && "
                          % (utils.get_R_exports(), os.path.dirname(tx_out_file)))
            do.run(export_cmd + " ".join(cmd), "CNVkit segment")
        else:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write("chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n")
    return out_file
def rnaseq_vardict_variant_calling(data):
    """Run VarDict variant calling for one RNA-seq sample.

    Produces ``<sample>-vardict.vcf.gz`` in the ``variation/rnaseq/vardict``
    directory of the work directory. A freshly called file is passed through
    ``vcfutils.bgzip_and_index``; a pre-existing file is used unchanged.
    Either way the file is recorded on ``data`` via ``dd.set_vrn_file``.

    Returns the updated ``data``.
    """
    sample = dd.get_sample_name(data)
    target_dir = os.path.join(dd.get_work_dir(data), "variation", "rnaseq", "vardict")
    out_dir = utils.safe_makedir(target_dir)
    out_file = os.path.join(out_dir, sample + "-vardict.vcf.gz")

    if not file_exists(out_file):
        # The command template is filled from locals(), so the names of the
        # variables below must match its placeholders.
        vardict_cmd = vardict.get_vardict_command(data)
        strandbias = "teststrandbias.R"
        var2vcf = "var2vcf_valid.pl"
        vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
        compress_cmd = "| bgzip -c"
        # the configured minimum allele fraction is a percentage
        freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
        var2vcf_opts = "-v 50"
        fix_ambig = vcfutils.fix_ambiguous_cl()
        remove_dup = vcfutils.remove_dup_cl()
        r_setup = get_R_exports()
        ref_file = dd.get_ref_file(data)
        bamfile = dd.get_work_bam(data)

        data = _setup_variant_regions(data, out_dir)
        opts, _ = vardict._vardict_options_from_config(
            [data], data["config"], out_file, dd.get_variant_regions(data),
            is_rnaseq=True)
        cores = dd.get_num_cores(data)
        if cores and cores > 1:
            opts += " -th %s" % str(cores)

        with file_transaction(data, out_file) as tx_out_file:
            jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
            cmd = ("{r_setup} && {jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
                   "-N {sample} -b {bamfile} {opts} "
                   "| {strandbias}"
                   "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                   "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
                   "> {tx_out_file}")
            do.run(cmd.format(**locals()), "Calling RNA-seq variants with VarDict")
        # Index only after the transaction has put the final file in place.
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])

    return dd.set_vrn_file(data, out_file)
def rnaseq_vardict_variant_calling(data):
    """Call RNA-seq variants with VarDict over regions taken from the GTF.

    The annotation GTF is converted to BED with ``gtf.gtf_to_bed`` and used
    as the VarDict target. Output is written to
    ``<work_dir>/variation/<sample>-vardict.vcf.gz`` and recorded on ``data``
    with ``dd.set_vrn_file``. An existing output file is reused as-is.

    Extra command line options can be supplied through the ``options`` list
    of the ``vardict`` resources in the configuration.

    Returns the updated ``data``.
    """
    sample = dd.get_sample_name(data)
    variation_dir = os.path.join(dd.get_work_dir(data), "variation")
    safe_makedir(variation_dir)
    out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data

    # The command template is filled from locals(), so the names of the
    # variables below must match its placeholders.
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    # the configured minimum allele fraction is a percentage
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = get_R_exports()
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    bed_file = gtf.gtf_to_bed(dd.get_gtf_file(data))
    # presumably the BED column indices VarDict should read -- TODO confirm
    opts = " -c 1 -S 2 -E 3 -g 4 "
    # Resources live in the configuration, not on the sample dictionary;
    # this matches how resources are looked up for cnvkit_segment in this file.
    resources = config_utils.get_resources("vardict", data["config"])
    if resources.get("options"):
        opts += " ".join([str(x) for x in resources["options"]])
    with file_transaction(data, out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup} && {jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
               "-N {sample} -b {bamfile} {opts} {bed_file} "
               "| {strandbias}"
               "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
               "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
               "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        do.run(cmd.format(**locals()), message)
    data = dd.set_vrn_file(data, out_file)
    return data
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect SNPs and indels with VarDict, one BAM at a time.

    Each BAM/sample pair is called separately; with more than one BAM the
    per-sample VCFs are written to temporary files and merged into the final
    output. When the target is not a BED file an empty VCF is written for
    the sample instead of running VarDict.

    var2vcf_valid is run with the -A flag, which reports all alleles and
    improves sensitivity:
    https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191

    Returns the path of the output VCF; an existing file is reused.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    run_msg = "Genotyping with VarDict: Inference"
    with file_transaction(items[0], out_file) as tx_out_file:
        regions = bedutils.population_variant_regions(items)
        target = shared.subset_variant_regions(
            regions, region, out_file, items=items, do_merge=False)
        multi_sample = len(align_bams) > 1
        # per-sample outputs, needed because batch calling may be required
        per_sample_vcfs = []
        for bamfile, item in zip(align_bams, items):
            # The command template is filled from locals(), so the names of
            # the variables below must match its placeholders.
            sample = dd.get_sample_name(item)
            vardict = get_vardict_command(items[0])
            opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            compress_cmd = "| bgzip -c" if tx_out_file.endswith("gz") else ""
            # the configured minimum allele fraction is a percentage
            freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
            fix_ambig_ref = vcfutils.fix_ambiguous_cl()
            fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
            remove_dup = vcfutils.remove_dup_cl()
            py_cl = os.path.join(utils.get_bcbio_bin(), "py")
            jvm_opts = _get_jvm_opts(items[0], tx_out_file)
            setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
            contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
            lowfreq_filter = _lowfreq_linear_filter(0, False)
            cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                   "-N {sample} -b {bamfile} {opts} "
                   "| teststrandbias.R "
                   "| var2vcf_valid.pl -A -N {sample} -E -f {freq} {var2vcf_opts} "
                   "| {contig_cl} | bcftools filter -i 'QUAL >= 0' | {lowfreq_filter} "
                   "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
            if multi_sample:
                prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                tmp_out = prefix + ".temp.vcf"
                if out_file.endswith("gz"):
                    tmp_out += ".gz"
                per_sample_vcfs.append(tmp_out)
                with file_transaction(item, tmp_out) as tx_tmp_file:
                    if _is_bed_file(target):
                        cmd += " > {tx_tmp_file}"
                        do.run(cmd.format(**locals()), run_msg, {})
                    else:
                        vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample])
            elif _is_bed_file(target):
                cmd += " > {tx_out_file}"
                do.run(cmd.format(**locals()), run_msg, {})
            else:
                vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample])
        if multi_sample:
            # N.B. merge_variant_files wants region in 1-based end-inclusive
            # coordinates. Thus use bamprep.region_to_gatk
            vcfutils.merge_variant_files(orig_files=per_sample_vcfs,
                                         out_file=tx_out_file, ref_file=ref_file,
                                         config=config,
                                         region=bamprep.region_to_gatk(region))
    return out_file
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect variants with VarDict for paired tumor / normal samples.

    Without a normal BAM the work is delegated to ``_run_vardict_caller``.
    When the target is not a BED file an empty VCF is written for the
    available sample names. The somatic and frequency filters are skipped
    when ``vardict_somatic_filter`` is listed in ``tools_off`` for any item.

    Returns the path of the output VCF; an existing file is reused.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    with file_transaction(items[0], out_file) as tx_out_file:
        target = shared.subset_variant_regions(
            dd.get_variant_regions(items[0]), region, out_file, do_merge=True)
        paired = vcfutils.get_paired_bams(align_bams, items)
        if not _is_bed_file(target):
            sample_names = [x for x in [paired.tumor_name, paired.normal_name] if x]
            vcfutils.write_empty_vcf(tx_out_file, config, samples=sample_names)
        else:
            if not paired.normal_bam:
                return _run_vardict_caller(align_bams, items, ref_file,
                                           assoc_files, region, out_file)
            # The command template is filled from locals(), so the names of
            # the variables below must match its placeholders.
            vardict = get_vardict_command(items[0])
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            strandbias = "testsomatic.R"
            var2vcf = "var2vcf_paired.pl"
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            # the configured minimum allele fraction is a percentage
            freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
            # merge bed file regions as amplicon VarDict is only supported in single sample mode
            opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
            fix_ambig_ref = vcfutils.fix_ambiguous_cl()
            fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
            remove_dup = vcfutils.remove_dup_cl()
            filters_off = any(
                "vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                for data in items)
            if filters_off:
                somatic_filter = ""
                freq_filter = ""
            else:
                var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                somatic_filter = ("| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                                  "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                                  """| %s -c 'from bcbio.variation import freebayes; """
                                  """freebayes.call_somatic("%s", "%s")' """
                                  % (sys.executable, paired.tumor_name, paired.normal_name))
                filter_py = os.path.join(os.path.dirname(sys.executable), "py")
                freq_filter = ("| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                               "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                               % (filter_py, 0, dd.get_aligner(paired.tumor_data)))
            jvm_opts = _get_jvm_opts(items[0], tx_out_file)
            py_cl = os.path.join(utils.get_bcbio_bin(), "py")
            setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
            contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
            cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                   "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                   "| {strandbias} "
                   "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                   "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                   "| {contig_cl} {freq_filter} "
                   "| bcftools filter -i 'QUAL >= 0' "
                   "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    return out_file
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect SNPs and indels with VarDict, one BAM at a time.

    Each BAM/sample pair is called separately; with more than one BAM the
    per-sample VCFs are written to temporary files and merged into the final
    output. When the target is not a BED file an empty VCF is written for
    the sample instead of running VarDict.

    var2vcf_valid is run with the -A flag, which reports all alleles and
    improves sensitivity:
    https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191

    Returns the path of the output VCF; an existing file is reused.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    run_msg = "Genotyping with VarDict: Inference"
    with file_transaction(items[0], out_file) as tx_out_file:
        regions = bedutils.population_variant_regions(items)
        target = shared.subset_variant_regions(
            regions, region, out_file, items=items, do_merge=False)
        multi_sample = len(align_bams) > 1
        # per-sample outputs, needed because batch calling may be required
        per_sample_vcfs = []
        for bamfile, item in zip(align_bams, items):
            # The command template is filled from locals(), so the names of
            # the variables below must match its placeholders.
            sample = dd.get_sample_name(item)
            vardict = get_vardict_command(items[0])
            strandbias = "teststrandbias.R"
            var2vcf = "var2vcf_valid.pl"
            opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            compress_cmd = "| bgzip -c" if tx_out_file.endswith("gz") else ""
            # the configured minimum allele fraction is a percentage
            freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
            fix_ambig_ref = vcfutils.fix_ambiguous_cl()
            fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
            remove_dup = vcfutils.remove_dup_cl()
            py_cl = os.path.join(utils.get_bcbio_bin(), "py")
            jvm_opts = _get_jvm_opts(items[0], tx_out_file)
            setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
            contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
            cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                   "-N {sample} -b {bamfile} {opts} "
                   "| {strandbias}"
                   "| {var2vcf} -A -N {sample} -E -f {freq} {var2vcf_opts} "
                   "| {contig_cl} | bcftools filter -i 'QUAL >= 0' "
                   "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
            if multi_sample:
                prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                tmp_out = prefix + ".temp.vcf"
                if out_file.endswith("gz"):
                    tmp_out += ".gz"
                per_sample_vcfs.append(tmp_out)
                with file_transaction(item, tmp_out) as tx_tmp_file:
                    if _is_bed_file(target):
                        cmd += " > {tx_tmp_file}"
                        do.run(cmd.format(**locals()), run_msg, {})
                    else:
                        vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample])
            elif _is_bed_file(target):
                cmd += " > {tx_out_file}"
                do.run(cmd.format(**locals()), run_msg, {})
            else:
                vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample])
        if multi_sample:
            # N.B. merge_variant_files wants region in 1-based end-inclusive
            # coordinates. Thus use bamprep.region_to_gatk
            vcfutils.merge_variant_files(orig_files=per_sample_vcfs,
                                         out_file=tx_out_file, ref_file=ref_file,
                                         config=config,
                                         region=bamprep.region_to_gatk(region))
    return out_file
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect variants with VarDict for paired tumor / normal samples.

    Without a normal BAM the work is delegated to ``_run_vardict_caller``.
    When the target is not a BED file an empty VCF is written for the
    available sample names. The somatic and frequency filters are skipped
    when ``vardict_somatic_filter`` is listed in ``tools_off`` for any item.

    Returns the path of the output VCF; an existing file is reused.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    with file_transaction(items[0], out_file) as tx_out_file:
        regions = bedutils.population_variant_regions(items)
        target = shared.subset_variant_regions(
            regions, region, out_file, items=items, do_merge=True)
        paired = vcfutils.get_paired_bams(align_bams, items)
        if not _is_bed_file(target):
            sample_names = [x for x in [paired.tumor_name, paired.normal_name] if x]
            vcfutils.write_empty_vcf(tx_out_file, config, samples=sample_names)
        else:
            if not paired.normal_bam:
                return _run_vardict_caller(align_bams, items, ref_file,
                                           assoc_files, region, out_file)
            # The command template is filled from locals(), so the names of
            # the variables below must match its placeholders.
            vardict = get_vardict_command(items[0])
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            # the configured minimum allele fraction is a percentage
            freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
            # merge bed file regions as amplicon VarDict is only supported in single sample mode
            opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
            fix_ambig_ref = vcfutils.fix_ambiguous_cl()
            fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
            remove_dup = vcfutils.remove_dup_cl()
            filters_off = any(
                "vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                for data in items)
            if filters_off:
                somatic_filter = ""
                freq_filter = ""
            else:
                var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                somatic_filter = ("| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                                  "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                                  """| %s -c 'from bcbio.variation import freebayes; """
                                  """freebayes.call_somatic("%s", "%s")' """
                                  % (sys.executable, paired.tumor_name, paired.normal_name))
                # the same "py" helper runs both Python filter stages
                filter_py = os.path.join(os.path.dirname(sys.executable), "py")
                freq_filter = ("| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                               "| %s -x 'bcbio.variation.vardict.add_db_germline_flag(x)' "
                               "| %s "
                               "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                               % (filter_py,
                                  _lowfreq_linear_filter(0, True),
                                  filter_py,
                                  0, bam.aligner_from_header(paired.tumor_bam)))
            jvm_opts = _get_jvm_opts(items[0], tx_out_file)
            py_cl = os.path.join(utils.get_bcbio_bin(), "py")
            setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
            contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
            cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                   "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                   "| awk 'NF>=48' | testsomatic.R "
                   "| var2vcf_paired.pl -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                   "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                   "| {contig_cl} {freq_filter} "
                   "| bcftools filter -i 'QUAL >= 0' "
                   "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    return out_file