def _cnvkit_segment(cnr_file, cov_interval, data, items, out_file=None, detailed=False):
    """Segment a normalized CNVkit copy ratio file into a ``.cns`` output.

    cnr_file -- normalized input to segment
    cov_interval -- coverage interval type; "genome" halves the cores used,
      skips small variant inputs and adds a default ``--threshold``
    data -- sample dictionary supplying configuration and core counts
    items -- batch samples, used to find small variants and tumor/normal pairing
    out_file -- optional output path; defaults to ``cnr_file`` with a ``.cns`` extension
    detailed -- not used by the current implementation

    Returns the output path. Nothing is re-run when the output is already
    up to date relative to ``cnr_file``.
    """
    if not out_file:
        out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if utils.file_uptodate(out_file, cnr_file):
        return out_file
    genome_wide = cov_interval == "genome"
    with file_transaction(data, out_file) as tx_out_file:
        if not _cna_has_values(cnr_file):
            # No values to segment: emit a header-only file
            with open(tx_out_file, "w") as header_handle:
                header_handle.write("chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n")
        else:
            # Scale cores to avoid memory issues with segmentation
            # https://github.com/etal/cnvkit/issues/346
            cores = dd.get_cores(data)
            if genome_wide:
                cores = max(1, cores // 2)
            args = [_get_cmd(), "segment", "-p", str(cores), "-o", tx_out_file, cnr_file]
            small_vrn_files = _compatible_small_variants(data, items)
            if len(small_vrn_files) > 0 and _cna_has_values(cnr_file) and not genome_wide:
                first_vrn = small_vrn_files[0]
                args.extend(["--vcf", first_vrn.name, "--sample-id", first_vrn.sample])
                if first_vrn.normal:
                    args.extend(["--normal-id", first_vrn.normal])
            segment_resources = config_utils.get_resources("cnvkit_segment", data["config"])
            user_options = segment_resources.get("options", [])
            args.extend([str(x) for x in user_options])
            # User supplied options take priority over our defaults
            if genome_wide and "--threshold" not in user_options:
                args.extend(["--threshold", "0.00001"])
            # For tumors, remove very low normalized regions, avoiding upcaptured noise
            # https://github.com/bcbio/bcbio-nextgen/issues/2171#issuecomment-348333650
            if vcfutils.get_paired(items) and "--drop-low-coverage" not in user_options:
                args.append("--drop-low-coverage")
            # preferentially use conda installed Rscript
            env_prefix = "%s && export TMPDIR=%s && " % (utils.get_R_exports(),
                                                         os.path.dirname(tx_out_file))
            do.run(env_prefix + " ".join(args), "CNVkit segment")
    return out_file
def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    Extra contigs can be arranged in different ways, causing incompatibility
    issues with GATK and other tools. The read group is also replaced, as in
    fixrg.

    Mapping of 1 -> chr1 style names is not handled, since the required
    search/replace slows down conversion.

    Returns the path to the output BAM, written under the sample's
    ``bamclean`` work directory.
    """
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "bamclean", dd.get_sample_name(data)))
    input_base = utils.splitext_plus(os.path.basename(in_bam))[0]
    out_file = os.path.join(work_dir, "%s-noextras.bam" % input_base)
    # Fall back to a sample-named output unless the input-named file already exists
    if not utils.file_exists(out_file):
        out_file = os.path.join(work_dir, "%s-noextras.bam" % dd.get_sample_name(data))
    if utils.file_uptodate(out_file, in_bam):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        chroms = " ".join(_target_chroms_and_header(in_bam, data))
        read_group = novoalign.get_rg_info(data["rgnames"])
        reference = dd.get_ref_file(data)
        # Work on a symlinked, indexed copy inside the transaction directory
        linked_bam = os.path.join(os.path.dirname(tx_out_file), os.path.basename(in_bam))
        threads = dd.get_cores(data)
        utils.symlink_plus(in_bam, linked_bam)
        bam.index(linked_bam, data["config"])
        template = ("samtools view -@ {cores} -h {local_bam} {str_chroms} | "
                    """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
                    """cleanbam.fix_header("{ref_file}")' | """
                    "samtools view -@ {cores} -u - | "
                    "samtools addreplacerg -@ {cores} -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - ")
        cmd = template.format(cores=threads, local_bam=linked_bam, str_chroms=chroms,
                              bcbio_py=sys.executable, ref_file=reference,
                              rg_info=read_group, tx_out_file=tx_out_file)
        do.run(cmd, "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file
def _run_genotype_gvcfs(data, region, vrn_files, ref_file, out_file):
    """Genotype a set of gVCFs into a final VCF with GATK GenotypeGVCFs.

    data -- sample dictionary supplying configuration, resources and cores
    region -- region to genotype, converted with ``bamprep.region_to_gatk``
    vrn_files -- input gVCF files, each passed as ``--variant``
    ref_file -- reference genome passed as ``-R``
    out_file -- output VCF; the run is skipped if it already exists

    Returns the result of bgzipping and indexing ``out_file``.
    """
    if not utils.file_exists(out_file):
        runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            variation = tz.get_in(("genome_resources", "variation"), data, {}) or {}
            gatk_args = ["-T", "GenotypeGVCFs",
                         "-R", ref_file,
                         "-o", tx_out_file,
                         "-L", bamprep.region_to_gatk(region),
                         "--max_alternate_alleles", "4"]
            for gvcf in vrn_files:
                gatk_args.extend(["--variant", gvcf])
            if variation.get("dbsnp"):
                gatk_args.extend(["--dbsnp", variation["dbsnp"]])
            runner.new_resources("gatk-haplotype")
            cores = dd.get_cores(data)
            memscale = None
            if cores > 1:
                # GATK performs poorly with memory usage when parallelizing
                # with a large number of cores but makes use of extra memory,
                # so we cap at 6 cores.
                # See issue #1565 for discussion
                gatk_args.extend(["-nt", str(min(6, cores))])
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            runner.run_gatk(gatk_args, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _call_hla(hla_fq, out_dir, data):
    """Run OptiType HLA calling for a specific fastq input.

    hla_fq -- fastq file passed to OptiType with ``-i``
    out_dir -- final output directory; results are produced in a temporary
      directory and then moved here
    data -- sample dictionary supplying configuration and core counts

    Returns the single ``*_result.tsv`` found one level below ``out_dir``.
    Raises ValueError if razers3 is not next to the running Python executable.
    """
    python_bin_dir = os.path.dirname(os.path.realpath(sys.executable))
    out_dir = utils.safe_makedir(out_dir)
    with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
        config_file = os.path.join(tx_out_dir, "config.ini")
        with open(config_file, "w") as config_handle:
            razers3 = os.path.join(python_bin_dir, "razers3")
            if not os.path.exists(razers3):
                raise ValueError("Could not find razers3 executable at %s" % (razers3))
            config_handle.write(CONFIG_TMPL.format(razers3=razers3, cores=dd.get_cores(data)))
        resources = config_utils.get_resources("optitype", data["config"])
        # Optional user supplied command line options
        user_opts = resources.get("options")
        opts = " ".join([str(x) for x in user_opts]) if user_opts else ""
        template = ("OptiTypePipeline.py -v --dna {opts} -o {tx_out_dir} "
                    "-i {hla_fq} -c {config_file}")
        do.run(template.format(opts=opts, tx_out_dir=tx_out_dir, hla_fq=hla_fq,
                               config_file=config_file),
               "HLA typing with OptiType")
        # Move everything produced in the temporary directory to the final location
        for produced in os.listdir(tx_out_dir):
            shutil.move(os.path.join(tx_out_dir, produced), os.path.join(out_dir, produced))
    out_file = glob.glob(os.path.join(out_dir, "*", "*_result.tsv"))
    assert len(out_file) == 1, "Expected one result file for OptiType, found %s" % out_file
    return out_file[0]
def _cnvkit_segment(cnr_file, cov_interval, data):
    """Segment a normalized CNVkit copy ratio file into a ``.cns`` output.

    cnr_file -- normalized input; the output sits next to it with a ``.cns`` extension
    cov_interval -- coverage interval type; "genome" skips small variant
      inputs and adds ``--threshold``
    data -- sample dictionary supplying configuration and core counts

    Returns the output path, re-using it when already up to date.
    """
    out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if utils.file_uptodate(out_file, cnr_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        if not _cna_has_values(cnr_file):
            # No values to segment: emit a header-only file
            with open(tx_out_file, "w") as header_handle:
                header_handle.write("chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n")
        else:
            args = [_get_cmd(), "segment", "-p", str(dd.get_cores(data)),
                    "-o", tx_out_file, cnr_file]
            small_vrn_files = _compatible_small_variants(data)
            genome_wide = cov_interval == "genome"
            if len(small_vrn_files) > 0 and _cna_has_values(cnr_file) and not genome_wide:
                args.extend(["-v", small_vrn_files[0]])
            if genome_wide:
                args.extend(["--threshold", "0.00001"])
            # preferentially use conda installed Rscript
            env_prefix = "%s && export TMPDIR=%s && " % (utils.get_R_exports(),
                                                         os.path.dirname(tx_out_file))
            do.run(env_prefix + " ".join(args), "CNVkit segment")
    return out_file
def _call_hla(hla_fq, out_dir, data):
    """Run OptiType HLA calling for a specific fastq input.

    hla_fq -- fastq file passed to OptiType with ``-i``
    out_dir -- final output directory; results are produced in a temporary
      directory and then moved here
    data -- sample dictionary supplying configuration and core counts

    Returns the single ``*_result.tsv`` found one level below ``out_dir``.
    Raises ValueError if razers3 is not next to the running Python executable.
    """
    python_bin_dir = os.path.dirname(os.path.realpath(sys.executable))
    out_dir = utils.safe_makedir(out_dir)
    with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
        config_file = os.path.join(tx_out_dir, "config.ini")
        with open(config_file, "w") as config_handle:
            razers3 = os.path.join(python_bin_dir, "razers3")
            if not os.path.exists(razers3):
                raise ValueError("Could not find razers3 executable at %s" % (razers3))
            config_handle.write(CONFIG_TMPL.format(razers3=razers3, cores=dd.get_cores(data)))
        resources = config_utils.get_resources("optitype", data["config"])
        # Optional user supplied command line options
        user_opts = resources.get("options")
        opts = " ".join([str(x) for x in user_opts]) if user_opts else ""
        template = ("OptiTypePipeline.py -v --dna {opts} -o {tx_out_dir} "
                    "-i {hla_fq} -c {config_file}")
        do.run(template.format(opts=opts, tx_out_dir=tx_out_dir, hla_fq=hla_fq,
                               config_file=config_file),
               "HLA typing with OptiType")
        # Move everything produced in the temporary directory to the final location
        for produced in os.listdir(tx_out_dir):
            shutil.move(os.path.join(tx_out_dir, produced), os.path.join(out_dir, produced))
    out_file = glob.glob(os.path.join(out_dir, "*", "*_result.tsv"))
    assert len(out_file) == 1, "Expected one result file for OptiType, found %s" % out_file
    return out_file[0]
def _run_genomicsdb_import(vrn_files, region, out_file, data):
    """Create a GenomicsDB reference for all the variation files: GATK4.

    Not yet tested at scale, need to explore --batchSize to reduce memory
    usage if needed.

    Does not support transactional directories yet, since
    GenomicsDB databases cannot be moved to new locations. We try to
    identify half-finished databases and restart:
    https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4

    Known issue -- Genomics DB workspace path core dumps on longer paths:
    (std::string::compare(char const*))

    Returns the database directory, named from ``out_file`` with a
    ``_genomicsdb`` suffix.
    """
    out_dir = "%s_genomicsdb" % utils.splitext_plus(out_file)[0]
    if os.path.exists(out_dir) and not _incomplete_genomicsdb(out_dir):
        return out_dir
    # Clear out any half-finished database before re-importing
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    # Run from the output's directory so the workspace path can be passed relative
    with utils.chdir(os.path.dirname(out_file)), file_transaction(data, out_dir) as tx_out_dir:
        runner = broad.runner_from_config(data["config"])
        cores = dd.get_cores(data)
        gatk_args = ["-T", "GenomicsDBImport",
                     "--reader-threads", str(cores),
                     "--genomicsdb-workspace-path", os.path.relpath(out_dir, os.getcwd()),
                     "-L", bamprep.region_to_gatk(region)]
        for gvcf in vrn_files:
            vcfutils.bgzip_and_index(gvcf, data["config"])
            gatk_args.extend(["--variant", gvcf])
        if cores > 1:
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
        else:
            memscale = None
        runner.run_gatk(gatk_args, memscale=memscale)
    return out_dir
def _get_snpeff_cmd(cmd_name, datadir, data, out_file):
    """Build the base snpEff command line as a shell string.

    cmd_name -- snpEff sub-command to run
    datadir -- snpEff data directory passed as ``-dataDir``
    data -- sample dictionary supplying configuration and core counts
    out_file -- output file; a ``tmp`` directory next to it is used for Java temp files
    """
    snpeff_resources = config_utils.get_resources("snpeff", data["config"])
    base_jvm_opts = snpeff_resources.get("jvm_opts", ["-Xms750m", "-Xmx3g"])
    # scale by cores, defaulting to 2x base usage to ensure we have enough memory
    # for single core runs to use with human genomes.
    # Sets a maximum amount of memory to avoid core dumps exceeding 32Gb
    # We shouldn't need that much memory for snpEff, so avoid issues
    # https://www.elastic.co/guide/en/elasticsearch/guide/current/heap-sizing.html#compressed_oops
    memory_adjust = {"direction": "increase",
                     "maximum": "30000M",
                     "magnitude": max(2, dd.get_cores(data))}
    scaled_jvm_opts = config_utils.adjust_opts(base_jvm_opts,
                                               {"algorithm": {"memory_adjust": memory_adjust}})
    memory = " ".join(scaled_jvm_opts)
    snpeff = config_utils.get_program("snpEff", data["config"])
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    java_args = "-Djava.io.tmpdir=%s" % tmp_dir
    export = "unset JAVA_HOME && export PATH=%s:$PATH && " % (utils.get_java_binpath())
    template = "{export} {snpeff} {memory} {java_args} {cmd_name} -dataDir {datadir}"
    return template.format(export=export, snpeff=snpeff, memory=memory,
                           java_args=java_args, cmd_name=cmd_name, datadir=datadir)
def _run_genotype_gvcfs(data, region, vrn_files, ref_file, out_file):
    """Genotype a set of gVCFs into a final VCF with GATK GenotypeGVCFs.

    data -- sample dictionary supplying configuration, resources and cores
    region -- region to genotype, converted with ``bamprep.region_to_gatk``
    vrn_files -- input gVCF files, each passed as ``--variant``
    ref_file -- reference genome passed as ``-R``
    out_file -- output VCF; the run is skipped if it already exists

    Returns the result of bgzipping and indexing ``out_file``.
    """
    if not utils.file_exists(out_file):
        runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            variation = tz.get_in(("genome_resources", "variation"), data, {}) or {}
            gatk_args = ["-T", "GenotypeGVCFs",
                         "-R", ref_file,
                         "-o", tx_out_file,
                         "-L", bamprep.region_to_gatk(region),
                         "--max_alternate_alleles", "4"]
            for gvcf in vrn_files:
                gatk_args.extend(["--variant", gvcf])
            if variation.get("dbsnp"):
                gatk_args.extend(["--dbsnp", variation["dbsnp"]])
            runner.new_resources("gatk-haplotype")
            cores = dd.get_cores(data)
            memscale = None
            if cores > 1:
                # GATK performs poorly with memory usage when parallelizing
                # with a large number of cores but makes use of extra memory,
                # so we cap at 6 cores.
                # See issue #1565 for discussion
                gatk_args.extend(["-nt", str(min(6, cores))])
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            runner.run_gatk(gatk_args, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _cnvkit_coverage(data, bed_file, input_type):
    """Calculate coverage in a BED file for CNVkit.

    data -- sample dictionary supplying the aligned BAM, work directory and cores
    bed_file -- regions to calculate coverage over; ``.target.bed`` and
      ``.antitarget.bed`` suffixes select the coverage type
    input_type -- passed through unchanged as ``itype`` in the result

    Returns a dictionary with ``itype``, ``file`` (coverage output),
    ``bam``, ``cnntype`` and ``sample`` keys.
    """
    bam_file = dd.get_align_bam(data)
    work_dir = utils.safe_makedir(os.path.join(_sv_workdir(data), "raw"))
    # BED suffix -> (coverage type, output extension)
    exts = {".target.bed": ("target", "targetcoverage.cnn"),
            ".antitarget.bed": ("antitarget", "antitargetcoverage.cnn")}
    cnntype = None
    for orig, (cur_cnntype, ext) in exts.items():
        if bed_file.endswith(orig):
            cnntype = cur_cnntype
            break
    if cnntype is None:
        assert bed_file.endswith(".bed"), "Unexpected BED file extension for coverage %s" % bed_file
        cnntype = ""
    # NOTE(review): when no known suffix matched above, `ext` still holds the
    # value from the final loop iteration, so the output extension depends on
    # the dictionary iteration order -- confirm which extension is intended
    # for plain .bed inputs.
    base, base_old = _bam_to_outbase(bam_file, work_dir, data)
    out_file = "%s.%s" % (base, ext)
    out_file_old = "%s.%s" % (base_old, ext)
    # back compatible with previous runs to avoid re-calculating
    if utils.file_exists(out_file_old):
        out_file = out_file_old
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = [_get_cmd(), "coverage", "-p", str(dd.get_cores(data)), bam_file, bed_file, "-o", tx_out_file]
            do.run(_prep_cmd(cmd, tx_out_file), "CNVkit coverage")
    return {"itype": input_type, "file": out_file, "bam": bam_file, "cnntype": cnntype,
            "sample": dd.get_sample_name(data)}
def _run_genotype_gvcfs_gatk3(data, region, vrn_files, ref_file, out_file):
    """Genotype a set of gVCFs into a final VCF with GATK GenotypeGVCFs.

    data -- sample dictionary supplying configuration, resources and cores
    region -- region to genotype, converted with ``bamprep.region_to_gatk``
    vrn_files -- input gVCF files, each passed as ``--variant``
    ref_file -- reference genome passed as ``-R``
    out_file -- output VCF; the run is skipped if it already exists

    Returns the result of bgzipping and indexing ``out_file``.
    """
    if not utils.file_exists(out_file):
        runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            variation = tz.get_in(("genome_resources", "variation"), data, {}) or {}
            gatk_args = ["-T", "GenotypeGVCFs",
                         "-R", ref_file,
                         "-o", tx_out_file,
                         "-L", bamprep.region_to_gatk(region),
                         "--max_alternate_alleles", "4"]
            for gvcf in vrn_files:
                gatk_args.extend(["--variant", gvcf])
            if variation.get("dbsnp"):
                gatk_args.extend(["--dbsnp", variation["dbsnp"]])
            runner.new_resources("gatk-haplotype")
            cores = dd.get_cores(data)
            # GATK performs poorly with memory usage when parallelizing
            # with a large number of cores but makes use of extra memory.
            # See issue #1565 for discussion
            # Recent GATK 3.x versions also have race conditions with multiple
            # threads, so run single threaded (no -nt) and keep memory available
            # https://gatkforums.broadinstitute.org/wdl/discussion/8718/concurrentmodificationexception-in-gatk-3-7-genotypegvcfs
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            runner.run_gatk(gatk_args, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _run_genomicsdb_import(vrn_files, region, out_file, data):
    """Create a GenomicsDB reference for all the variation files: GATK4.

    Not yet tested at scale, need to explore --batchSize to reduce memory
    usage if needed.

    XXX Does not support transactional directories yet, since
    GenomicsDB databases cannot be moved to new locations. We need to
    explore options to identify half-finished databases and restart:
    https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4

    Returns the database directory, named from ``out_file`` with a
    ``_genomicsdb`` suffix.
    """
    out_dir = "%s_genomicsdb" % utils.splitext_plus(out_file)[0]
    if os.path.exists(out_dir):
        return out_dir
    with file_transaction(data, out_dir) as tx_out_dir:
        runner = broad.runner_from_config(data["config"])
        cores = dd.get_cores(data)
        # The workspace is written directly to out_dir, not the transaction directory
        gatk_args = ["-T", "GenomicsDBImport",
                     "--readerThreads", str(cores),
                     "--genomicsDBWorkspace", out_dir,
                     "-L", bamprep.region_to_gatk(region)]
        for gvcf in vrn_files:
            gatk_args.extend(["--variant", gvcf])
        if cores > 1:
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
        else:
            memscale = None
        runner.run_gatk(gatk_args, memscale=memscale)
    return out_dir
def _run_cnvkit_shared(items, test_bams, background_bams, work_dir, background_name=None):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.

    items -- sample dictionaries; the first supplies configuration and cores
    test_bams -- BAM files to evaluate
    background_bams -- BAM files used to build the background reference
    work_dir -- base work directory; outputs go into its ``raw`` subdirectory
    background_name -- name for the background reference, defaulting to "flat"

    Returns one dictionary per test BAM with ``cnr``, ``cns`` and ``back_cnn``
    paths, or an empty dictionary when there are no target regions.
    """
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_cnn = os.path.join(raw_work_dir,
                                  "%s_background.cnn" % (background_name if background_name else "flat"))
    ckouts = []
    for test_bam in test_bams:
        out_base = _bam_to_outbase(test_bam, raw_work_dir)
        # The "cnr" entry previously used the .cns extension, so it pointed at
        # the segmentation file instead of the copy ratio file.
        # NOTE(review): assumes _cnvkit_fix writes to <out_base>.cnr -- confirm.
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    # Segmentation (.cns) is the final output, so use it to decide on re-running
    if not utils.file_exists(ckouts[0]["cns"]):
        data = items[0]
        cov_interval = dd.get_coverage_interval(data)
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, data)
        parallel = {"type": "local", "cores": dd.get_cores(data), "progs": ["cnvkit"]}
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     raw_work_dir, data)

        def _bam_to_itype(bam):
            return "background" if bam in background_bams else "evaluate"

        # Coverage for every BAM over every split target/antitarget region
        split_cnns = run_multicore(_cnvkit_coverage,
                                   [(bam, bed, _bam_to_itype(bam), raw_work_dir, data)
                                    for bam in test_bams + background_bams
                                    for bed in _split_bed(target_bed, data) + _split_bed(antitarget_bed, data)],
                                   data["config"], parallel)
        coverage_cnns = _merge_coverage(split_cnns, data)
        background_cnn = _cnvkit_background([x["file"] for x in coverage_cnns if x["itype"] == "background"],
                                            background_cnn, target_bed, antitarget_bed, data)
        evaluate_cnns = [x for x in coverage_cnns if x["itype"] == "evaluate"]
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, data)
                                    for cnns in tz.groupby("bam", evaluate_cnns).values()],
                                   data["config"], parallel)
        # Run for the side effect of writing segmentation files
        run_multicore(_cnvkit_segment,
                      [(cnr, cov_interval, data) for cnr in fixed_cnrs],
                      data["config"], parallel)
    return ckouts
def _safe_run_theta(input_file, out_dir, output_ext, args, data):
    """Run THetA, catching and continuing on expected errors.

    input_file -- THetA input file, also used to name the output
    out_dir -- directory the output is written to
    output_ext -- extension appended to the input base name for the output
    args -- extra command line arguments, as a list
    data -- sample dictionary supplying configuration and core counts

    Returns the output file path, or None when THetA was skipped (now or on a
    previous run) because of a known, expected failure. Any other
    ``subprocess.CalledProcessError`` is re-raised.
    """
    out_file = os.path.join(out_dir, _split_theta_ext(input_file) + output_ext)
    skip_file = out_file + ".skipped"
    # A marker file from an earlier run records an expected failure
    if utils.file_exists(skip_file):
        return None
    if not utils.file_exists(out_file):
        with file_transaction(data, out_dir) as tx_out_dir:
            utils.safe_makedir(tx_out_dir)
            # Command line arguments need to be strings, so convert the core count
            cmd = _get_cmd("RunTHetA.py") + args + \
                  [input_file, "--NUM_PROCESSES", str(dd.get_cores(data)),
                   "--FORCE", "-d", tx_out_dir]
            try:
                do.run(cmd, "Run THetA to calculate purity", log_error=False)
            # "as" form is valid on both Python 2.6+ and Python 3
            except subprocess.CalledProcessError as msg:
                error_text = str(msg)
                if ("Number of intervals must be greater than 1" in error_text or
                        "This sample isn't a good candidate for THetA analysis" in error_text):
                    # Record the expected failure so later runs skip immediately
                    with open(os.path.join(tx_out_dir, os.path.basename(skip_file)), "w") as out_handle:
                        out_handle.write("Expected TheTA failure, skipping")
                    return None
                else:
                    raise
    # Previously fell off the end, so success returned None just like a skip
    return out_file
def _run_vqsr(in_file, ref_file, vrn_files, sensitivity_cutoff, filter_type, data):
    """Run variant quality score recalibration.

    in_file -- input VCF to recalibrate
    ref_file -- reference genome passed as ``-R``
    vrn_files -- training resources handed to ``_get_vqsr_training``
    sensitivity_cutoff -- tranche cutoff to include alongside the defaults
    filter_type -- recalibration mode, also selects training and annotations
    data -- sample dictionary supplying configuration and core counts

    Returns ``(recal_file, tranches_file)``, or ``(None, None)`` if GATK
    fails, which can happen when not enough values are present to train.
    """
    cutoffs = ["100.0", "99.99", "99.98", "99.97", "99.96", "99.95", "99.94", "99.93", "99.92", "99.91",
               "99.9", "99.8", "99.7", "99.6", "99.5", "99.0", "98.0", "90.0"]
    # Compare and sort as strings: mixing a numeric cutoff into the string
    # list makes sort() raise TypeError on Python 3.
    requested_cutoff = str(sensitivity_cutoff)
    if requested_cutoff not in cutoffs:
        cutoffs.append(requested_cutoff)
        cutoffs.sort()
    broad_runner = broad.runner_from_config(data["config"])
    base = utils.splitext_plus(in_file)[0]
    recal_file = "%s.recal" % base
    tranches_file = "%s.tranches" % base
    if not utils.file_exists(recal_file):
        with file_transaction(data, recal_file, tranches_file) as (tx_recal, tx_tranches):
            params = ["-T", "VariantRecalibrator",
                      "-R", ref_file,
                      "--input", in_file,
                      "--mode", filter_type,
                      "--recal_file", tx_recal,
                      "--tranches_file", tx_tranches]
            for cutoff in cutoffs:
                params += ["-tranche", str(cutoff)]
            params += _get_vqsr_training(filter_type, vrn_files)
            for a in _get_vqsr_annotations(filter_type):
                params += ["-an", a]
            cores = dd.get_cores(data)
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            try:
                broad_runner.new_resources("gatk-vqsr")
                broad_runner.run_gatk(params, log_error=False, memscale=memscale)
            # Can fail to run if not enough values are present to train.
            # Catch Exception rather than everything so KeyboardInterrupt and
            # SystemExit still stop the pipeline.
            except Exception:
                return None, None
    return recal_file, tranches_file
def _cnvkit_segment(cnr_file, cov_interval, data, items, out_file=None):
    """Segment a normalized CNVkit copy ratio file into a ``.cns`` output.

    cnr_file -- normalized input to segment
    cov_interval -- coverage interval type; "genome" skips small variant
      inputs and adds ``--threshold``
    data -- sample dictionary supplying configuration and core counts
    items -- batch samples, used to find small variants and tumor/normal pairing
    out_file -- optional output path; defaults to ``cnr_file`` with a ``.cns`` extension

    Returns the output path, re-using it when already up to date.
    """
    if not out_file:
        out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if utils.file_uptodate(out_file, cnr_file):
        return out_file
    genome_wide = cov_interval == "genome"
    with file_transaction(data, out_file) as tx_out_file:
        if not _cna_has_values(cnr_file):
            # No values to segment: emit a header-only file
            with open(tx_out_file, "w") as header_handle:
                header_handle.write("chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n")
        else:
            args = [_get_cmd(), "segment", "-p", str(dd.get_cores(data)),
                    "-o", tx_out_file, cnr_file]
            small_vrn_files = _compatible_small_variants(data, items)
            if len(small_vrn_files) > 0 and _cna_has_values(cnr_file) and not genome_wide:
                first_vrn = small_vrn_files[0]
                args.extend(["--vcf", first_vrn.name, "--sample-id", first_vrn.sample])
                if first_vrn.normal:
                    args.extend(["--normal-id", first_vrn.normal])
            if genome_wide:
                args.extend(["--threshold", "0.00001"])
            # For tumors, remove very low normalized regions, avoiding upcaptured noise
            # https://github.com/chapmanb/bcbio-nextgen/issues/2171#issuecomment-348333650
            if vcfutils.get_paired(items):
                args.append("--drop-low-coverage")
            # preferentially use conda installed Rscript
            env_prefix = "%s && export TMPDIR=%s && " % (utils.get_R_exports(),
                                                         os.path.dirname(tx_out_file))
            do.run(env_prefix + " ".join(args), "CNVkit segment")
    return out_file
def _call_hla(hla_fq, out_dir, data):
    """Run OptiType HLA calling for a specific fastq input.

    hla_fq -- fastq file passed to OptiType with ``-i``
    out_dir -- final output directory; results are produced in a temporary
      directory and then moved here
    data -- sample dictionary supplying configuration and core counts

    Returns the single ``*_result.tsv`` found one level below ``out_dir``.
    Raises ValueError if razers3 or OptiTypePipeline.py cannot be located.
    """
    out_dir = utils.safe_makedir(out_dir)
    with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
        # Resolve executables up front. os.path.realpath fails with an opaque
        # TypeError on None, so check the lookup result before using it.
        # NOTE(review): assumes utils.which returns a falsy value when the
        # program is not found -- confirm.
        razers3_found = utils.which("razers3")
        if not razers3_found:
            raise ValueError("Could not find razers3 executable")
        razers3 = os.path.realpath(razers3_found)
        if not os.path.exists(razers3):
            raise ValueError(
                f"Could not find razers3 executable at {razers3}")
        optitype_found = utils.which("OptiTypePipeline.py")
        if not optitype_found:
            raise ValueError("Could not find OptiTypePipeline.py executable")
        # optitype is looking for the reference in ./data which is in env/python3.6 not in tools/bin
        optitype = os.path.realpath(optitype_found)
        # technically, optitype is not a python package, conda is not able to set up its shebang properly
        python_bin = os.path.join(os.path.dirname(optitype), "python")

        config_file = os.path.join(tx_out_dir, "config.ini")
        with open(config_file, "w") as out_handle:
            out_handle.write(
                CONFIG_TMPL.format(razers3=razers3, cores=dd.get_cores(data)))
        resources = config_utils.get_resources("optitype", data["config"])
        # Optional user supplied command line options
        if resources.get("options"):
            opts = " ".join([str(x) for x in resources["options"]])
        else:
            opts = ""
        cmd = f"{python_bin} {optitype} -v --dna {opts} -o {tx_out_dir} --enumerate 10 "\
              f" -i {hla_fq} -c {config_file}"
        do.run(cmd, "HLA typing with OptiType")
        # Move everything produced in the temporary directory to the final location
        for outf in os.listdir(tx_out_dir):
            shutil.move(os.path.join(tx_out_dir, outf),
                        os.path.join(out_dir, outf))
    out_file = glob.glob(os.path.join(out_dir, "*", "*_result.tsv"))
    assert len(
        out_file
    ) == 1, "Expected one result file for OptiType, found %s" % out_file
    return out_file[0]
def _run_genotype_gvcfs_genomicsdb(genomics_db, region, out_file, data):
    """GenotypeGVCFs from a merged GenomicsDB input: GATK4.

    No core scaling -- not yet supported in GATK4.

    genomics_db -- GenomicsDB workspace, passed as a ``gendb://`` variant input
    region -- region to genotype, also used to determine ploidy
    out_file -- output VCF; the run is skipped if it already exists
    data -- sample dictionary supplying configuration, reference and cores

    User supplied options from the "gatk" resources are appended to the
    command line. Returns the result of bgzipping and indexing ``out_file``.
    """
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            runner = broad.runner_from_config(data["config"])
            # see issue https://github.com/bcbio/bcbio-nextgen/issues/3263
            # for why --genomicsdb-use-vcf-codec is necessary
            gatk_args = ["-T", "GenotypeGVCFs",
                         "--variant", "gendb://%s" % genomics_db,
                         "-R", dd.get_ref_file(data),
                         "--genomicsdb-use-vcf-codec",
                         "--output", tx_out_file,
                         "-L", bamprep.region_to_gatk(region)]
            gatk_args.extend(["-ploidy", str(ploidy.get_ploidy([data], region))])
            gatk_resources = config_utils.get_resources("gatk", data["config"])
            gatk_args.extend([str(x) for x in gatk_resources.get("options", [])])
            cores = dd.get_cores(data)
            if cores > 1:
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            runner.run_gatk(gatk_args, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _run_gridss(inputs, background, work_dir):
    """Run GRIDSS structural variant calling on a set of samples.

    inputs -- sample dictionaries to call; the first supplies configuration,
      cores and output naming (batch name, falling back to sample name)
    background -- additional sample dictionaries included as inputs
    work_dir -- directory for the output VCF

    The run is skipped when the VCF or its ``.gz`` version already exists.
    Returns the result of bgzipping and indexing the output VCF.
    """
    lead = inputs[0]
    out_name = "%s-gridss.sv.vcf" % (dd.get_batch(lead) or dd.get_sample_name(lead))
    out_file = os.path.join(work_dir, out_name)
    already_done = utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")
    if not already_done:
        with file_transaction(lead, out_file) as tx_out_file:
            htsjdk_opts = ["-Dsamjdk.create_index=true",
                           "-Dsamjdk.use_async_io_read_samtools=true",
                           "-Dsamjdk.use_async_io_write_samtools=true",
                           "-Dsamjdk.use_async_io_write_tribble=true"]
            cores = dd.get_cores(lead)
            gridss_resources = config_utils.get_resources("gridss", lead["config"])
            jvm_opts = gridss_resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"])
            # Increase memory in line with the number of cores
            memory_adjust = {"direction": "increase", "magnitude": cores}
            jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust": memory_adjust}})
            jvm_opts = _finalize_memory(jvm_opts)
            tx_dir = os.path.dirname(tx_out_file)
            tx_ref_file = _setup_reference_files(lead, tx_dir)
            all_samples = inputs + background
            blacklist_bed = sshared.prepare_exclude_file(all_samples, out_file)
            gridss_args = ["THREADS=%s" % cores,
                           "TMP_DIR=%s" % tx_dir,
                           "WORKING_DIR=%s" % tx_dir,
                           "OUTPUT=%s" % tx_out_file,
                           "ASSEMBLY=%s" % tx_out_file.replace(".sv.vcf", ".gridss.assembly.bam"),
                           "REFERENCE_SEQUENCE=%s" % tx_ref_file,
                           "BLACKLIST=%s" % blacklist_bed]
            parts = ["gridss"] + jvm_opts + htsjdk_opts + ["gridss.CallVariants"] + gridss_args
            for sample in all_samples:
                parts.append("INPUT=%s" % dd.get_align_bam(sample))
                parts.append("INPUT_LABEL=%s" % dd.get_sample_name(sample))
            exports = utils.local_path_export()
            do.run(exports + " ".join(parts), "GRIDSS SV analysis")
    return vcfutils.bgzip_and_index(out_file, lead["config"])
def _run_genotype_gvcfs_gatk3(data, region, vrn_files, ref_file, out_file):
    """Genotype a set of gVCFs into a final VCF with GATK GenotypeGVCFs.

    data -- sample dictionary supplying configuration, resources and cores
    region -- region to genotype, converted with ``bamprep.region_to_gatk``
    vrn_files -- input gVCF files, each passed as ``--variant``
    ref_file -- reference genome passed as ``-R``
    out_file -- output VCF; the run is skipped if it already exists

    Returns the result of bgzipping and indexing ``out_file``.
    """
    if not utils.file_exists(out_file):
        runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            variation = tz.get_in(("genome_resources", "variation"), data, {}) or {}
            gatk_args = ["-T", "GenotypeGVCFs",
                         "-R", ref_file,
                         "-o", tx_out_file,
                         "-L", bamprep.region_to_gatk(region),
                         "--max_alternate_alleles", "4"]
            for gvcf in vrn_files:
                gatk_args.extend(["--variant", gvcf])
            if variation.get("dbsnp"):
                gatk_args.extend(["--dbsnp", variation["dbsnp"]])
            runner.new_resources("gatk-haplotype")
            cores = dd.get_cores(data)
            # GATK performs poorly with memory usage when parallelizing
            # with a large number of cores but makes use of extra memory.
            # See issue #1565 for discussion
            # Recent GATK 3.x versions also have race conditions with multiple
            # threads, so run single threaded (no -nt) and keep memory available
            # https://gatkforums.broadinstitute.org/wdl/discussion/8718/concurrentmodificationexception-in-gatk-3-7-genotypegvcfs
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            runner.run_gatk(gatk_args, memscale=memscale, parallel_gc=True)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def fixrg(in_bam, names, ref_file, dirs, data):
    """Fix read group in a file, using samtools addreplacerg.

    addreplacerg does not remove the old read group, causing confusion when
    checking. We use reheader to work around this.

    in_bam -- input BAM file
    names -- read group names handed to ``novoalign.get_rg_info``
    ref_file, dirs -- not used by this function
    data -- sample dictionary supplying work directory, sample name and cores

    Returns the path to the output BAM under the sample's ``bamclean``
    work directory.
    """
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "bamclean", dd.get_sample_name(data)))
    input_base = utils.splitext_plus(os.path.basename(in_bam))[0]
    out_file = os.path.join(work_dir, "%s-fixrg.bam" % input_base)
    # Fall back to a sample-named output unless the input-named file already exists
    if not utils.file_exists(out_file):
        out_file = os.path.join(work_dir, "%s-fixrg.bam" % dd.get_sample_name(data))
    if utils.file_uptodate(out_file, in_bam):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        read_group = novoalign.get_rg_info(names)
        header_file = "%s-header.txt" % os.path.splitext(out_file)[0]
        threads = dd.get_cores(data)
        # First write the existing header with all read group lines removed
        header_cmd = "samtools view -H {in_bam} | grep -v ^@RG > {new_header}".format(
            in_bam=in_bam, new_header=header_file)
        do.run(header_cmd, "Create empty RG header: %s" % dd.get_sample_name(data))
        # Then swap in that header and add the new read group
        template = (
            "samtools reheader {new_header} {in_bam} | "
            "samtools addreplacerg -@ {cores} -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} -"
        )
        fix_cmd = template.format(new_header=header_file, in_bam=in_bam, cores=threads,
                                  rg_info=read_group, tx_out_file=tx_out_file)
        do.run(fix_cmd, "Fix read groups: %s" % dd.get_sample_name(data))
    return out_file
def _run_vqsr(in_file, ref_file, vrn_files, sensitivity_cutoff, filter_type, data):
    """Run variant quality score recalibration.

    in_file -- input VCF to recalibrate
    ref_file -- reference genome passed as ``-R``
    vrn_files -- training resources handed to ``_get_vqsr_training``
    sensitivity_cutoff -- tranche cutoff to include alongside the defaults
    filter_type -- recalibration mode, also selects training and annotations
    data -- sample dictionary supplying configuration and core counts

    User supplied options from the "gatk_variant_recalibrator" resources
    replace the default tranche and annotation arguments.

    Returns ``(recal_file, tranches_file)``, or ``(None, None)`` if GATK
    fails, which can happen when not enough values are present to train.
    """
    cutoffs = [
        "100.0", "99.99", "99.98", "99.97", "99.96", "99.95", "99.94", "99.93",
        "99.92", "99.91", "99.9", "99.8", "99.7", "99.6", "99.5", "99.0",
        "98.0", "90.0"
    ]
    # Compare and sort as strings: mixing a numeric cutoff into the string
    # list makes sort() raise TypeError on Python 3.
    requested_cutoff = str(sensitivity_cutoff)
    if requested_cutoff not in cutoffs:
        cutoffs.append(requested_cutoff)
        cutoffs.sort()
    broad_runner = broad.runner_from_config(data["config"])
    gatk_type = broad_runner.gatk_type()
    base = utils.splitext_plus(in_file)[0]
    recal_file = ("%s-vqsrrecal.vcf.gz" %
                  base) if gatk_type == "gatk4" else ("%s.recal" % base)
    tranches_file = "%s.tranches" % base
    plot_file = "%s-plots.R" % base
    if not utils.file_exists(recal_file):
        with file_transaction(data, recal_file, tranches_file,
                              plot_file) as (tx_recal, tx_tranches,
                                             tx_plot_file):
            params = [
                "-T", "VariantRecalibrator", "-R", ref_file, "--mode",
                filter_type, "--tranches_file", tx_tranches, "--rscript_file",
                tx_plot_file
            ]
            if gatk_type == "gatk4":
                params += ["--variant", in_file, "--output", tx_recal]
            else:
                params += ["--input", in_file, "--recal_file", tx_recal]
            params += _get_vqsr_training(filter_type, vrn_files, gatk_type)
            resources = config_utils.get_resources("gatk_variant_recalibrator",
                                                   data["config"])
            # Work on a copy: extending the configured list in place would leak
            # the defaults built here into the shared configuration.
            opts = list(resources.get("options") or [])
            if not opts:
                for cutoff in cutoffs:
                    opts += ["-tranche", str(cutoff)]
                for a in _get_vqsr_annotations(filter_type):
                    opts += ["-an", a]
            params += opts
            cores = dd.get_cores(data)
            memscale = {
                "magnitude": 0.9 * cores,
                "direction": "increase"
            } if cores > 1 else None
            try:
                broad_runner.new_resources("gatk-vqsr")
                broad_runner.run_gatk(params,
                                      log_error=False,
                                      memscale=memscale,
                                      parallel_gc=True)
            # Can fail to run if not enough values are present to train.
            # Catch Exception rather than everything so KeyboardInterrupt and
            # SystemExit still stop the pipeline.
            except Exception:
                return None, None
    if gatk_type == "gatk4":
        vcfutils.bgzip_and_index(recal_file, data["config"])
    return recal_file, tranches_file
def _run_cnvkit_shared(items, test_bams, background_bams, work_dir, background_name=None):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.

    items -- sample dictionaries; the first supplies configuration and cores
    test_bams -- BAM files to evaluate
    background_bams -- BAM files used to build the background reference
    work_dir -- base work directory; outputs go into its ``raw`` subdirectory
    background_name -- name for the background reference, defaulting to "flat"

    Returns one dictionary per test BAM with ``cnr``, ``cns`` and ``back_cnn``
    paths, or an empty dictionary when there are no target regions.
    """
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_cnn = os.path.join(raw_work_dir,
                                  "%s_background.cnn" % (background_name if background_name else "flat"))
    ckouts = []
    for test_bam in test_bams:
        out_base = _bam_to_outbase(test_bam, raw_work_dir)
        # The "cnr" entry previously used the .cns extension, so it pointed at
        # the segmentation file instead of the copy ratio file.
        # NOTE(review): assumes _cnvkit_fix writes to <out_base>.cnr -- confirm.
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    # Segmentation (.cns) is the final output, so use it to decide on re-running
    if not utils.file_exists(ckouts[0]["cns"]):
        data = items[0]
        cov_interval = dd.get_coverage_interval(data)
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, data)
        parallel = {"type": "local", "cores": dd.get_cores(data), "progs": ["cnvkit"]}
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     raw_work_dir, data)

        def _bam_to_itype(bam):
            return "background" if bam in background_bams else "evaluate"

        # Coverage for every BAM over every split target/antitarget region
        split_cnns = run_multicore(
            _cnvkit_coverage,
            [
                (bam, bed, _bam_to_itype(bam), raw_work_dir, data)
                for bam in test_bams + background_bams
                for bed in _split_bed(target_bed, data) + _split_bed(antitarget_bed, data)
            ],
            data["config"],
            parallel,
        )
        coverage_cnns = _merge_coverage(split_cnns, data)
        background_cnn = _cnvkit_background(
            [x["file"] for x in coverage_cnns if x["itype"] == "background"],
            background_cnn,
            target_bed,
            antitarget_bed,
            data,
        )
        evaluate_cnns = [x for x in coverage_cnns if x["itype"] == "evaluate"]
        fixed_cnrs = run_multicore(
            _cnvkit_fix,
            [(cnns, background_cnn, data) for cnns in tz.groupby("bam", evaluate_cnns).values()],
            data["config"],
            parallel,
        )
        # Run for the side effect of writing segmentation files
        run_multicore(
            _cnvkit_segment, [(cnr, cov_interval, data) for cnr in fixed_cnrs], data["config"], parallel
        )
    return ckouts
def _cnvkit_segment(cnr_file, cov_interval, data, items, out_file=None, detailed=False):
    """Segment a normalized CNVkit copy ratio file into a ``.cns`` output.

    cnr_file -- normalized input to segment
    cov_interval -- coverage interval type; "genome" halves the cores used,
      skips small variant inputs and adds a default ``--threshold``
    data -- sample dictionary supplying configuration and core counts
    items -- batch samples, used to find small variants and tumor/normal pairing
    out_file -- optional output path; defaults to ``cnr_file`` with a ``.cns`` extension
    detailed -- not used by the current implementation

    Returns the output path. Nothing is re-run when the output is already
    up to date relative to ``cnr_file``.
    """
    if not out_file:
        out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if utils.file_uptodate(out_file, cnr_file):
        return out_file
    genome_wide = cov_interval == "genome"
    with file_transaction(data, out_file) as tx_out_file:
        if not _cna_has_values(cnr_file):
            # No values to segment: emit a header-only file
            with open(tx_out_file, "w") as header_handle:
                header_handle.write("chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n")
        else:
            # Scale cores to avoid memory issues with segmentation
            # https://github.com/etal/cnvkit/issues/346
            cores = dd.get_cores(data)
            if genome_wide:
                cores = max(1, cores // 2)
            args = [_get_cmd(), "segment", "-p", str(cores), "-o", tx_out_file, cnr_file]
            small_vrn_files = _compatible_small_variants(data, items)
            if len(small_vrn_files) > 0 and _cna_has_values(cnr_file) and not genome_wide:
                first_vrn = small_vrn_files[0]
                args.extend(["--vcf", first_vrn.name, "--sample-id", first_vrn.sample])
                if first_vrn.normal:
                    args.extend(["--normal-id", first_vrn.normal])
            segment_resources = config_utils.get_resources("cnvkit_segment", data["config"])
            user_options = segment_resources.get("options", [])
            args.extend([str(x) for x in user_options])
            # User supplied options take priority over our defaults
            if genome_wide and "--threshold" not in user_options:
                args.extend(["--threshold", "0.00001"])
            # For tumors, remove very low normalized regions, avoiding upcaptured noise
            # https://github.com/bcbio/bcbio-nextgen/issues/2171#issuecomment-348333650
            if vcfutils.get_paired(items) and "--drop-low-coverage" not in user_options:
                args.append("--drop-low-coverage")
            # preferentially use conda installed Rscript
            env_prefix = "%s && export TMPDIR=%s && " % (utils.get_R_exports(),
                                                         os.path.dirname(tx_out_file))
            do.run(env_prefix + " ".join(args), "CNVkit segment")
    return out_file
def _get_jvm_opts(out_file, data):
    """Build the Java options for the "purple" resource entry.

    Starts from the configured (or default) JVM options, requests a memory
    increase scaled by the number of cores (with a "30000M" maximum), then adds
    the default JVM options derived from the output directory.
    """
    purple_resources = config_utils.get_resources("purple", data["config"])
    base_opts = purple_resources.get("jvm_opts", ["-Xms750m", "-Xmx3500m"])
    memory_adjust = {"direction": "increase",
                     "maximum": "30000M",
                     "magnitude": dd.get_cores(data)}
    scaled_opts = config_utils.adjust_opts(base_opts, {"algorithm": {"memory_adjust": memory_adjust}})
    scaled_opts += broad.get_default_jvm_opts(os.path.dirname(out_file))
    return scaled_opts
def make_command(data, cmd, bam_file, bed_file=None, depth_thresholds=None, max_cov=None, query=None):
    """Assemble a sambamba command line as a single string.

    cmd -- sambamba sub-command text placed right after the executable
    bed_file -- optional regions, passed with -L
    depth_thresholds -- optional iterable of values, each passed with -T
    max_cov -- optional value passed with -C
    query -- read filter passed with -F; defaults to excluding failed QC,
             duplicate and unmapped reads
    """
    if query is None:
        query = "not failed_quality_control and not duplicate and not unmapped"
    template = ("{sambamba} {cmd} -t {num_cores} {bam_file} "
                "{target} {thresholds} {maxcov} -F \"{query}\"")
    return template.format(
        sambamba=config_utils.get_program("sambamba", data["config"], default="sambamba"),
        cmd=cmd,
        num_cores=dd.get_cores(data),
        bam_file=bam_file,
        target=(" -L " + bed_file) if bed_file else "",
        thresholds="".join(" -T" + str(depth) for depth in (depth_thresholds or [])),
        maxcov=(" -C " + str(max_cov)) if max_cov else "",
        query=query)
def make_command(data, cmd, bam_file, bed_file=None, depth_thresholds=None, max_cov=None, query=None,
                 multicore=True):
    """Assemble a sambamba command line as a single string.

    cmd -- sambamba sub-command text placed right after the executable
    bed_file -- optional regions, passed with -L
    depth_thresholds -- optional iterable of values, each passed with -T
    max_cov -- optional value passed with -C
    query -- read filter passed with -F; defaults to the module level
             mapped read filter with duplicates also excluded
    multicore -- use the configured number of cores when true, otherwise one
    """
    if query is None:
        query = mapped_filter_query + " and not duplicate"
    template = ("{sambamba} {cmd} -t {num_cores} {bam_file} "
                "{target} {thresholds} {maxcov} -F \"{query}\"")
    return template.format(
        sambamba=config_utils.get_program("sambamba", data["config"], default="sambamba"),
        cmd=cmd,
        num_cores=dd.get_cores(data) if multicore else 1,
        bam_file=bam_file,
        target=(" -L " + bed_file) if bed_file else "",
        thresholds="".join(" -T" + str(depth) for depth in (depth_thresholds or [])),
        maxcov=(" -C " + str(max_cov)) if max_cov else "",
        query=query)
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.

    inputs -- sample dictionaries to evaluate
    backgrounds -- sample dictionaries used to build the background reference;
                   a "flat" background name is used when empty

    Returns one dictionary per input with "cnr", "cns" and "back_cnn" paths.
    Processing is skipped when the segment file of the first input exists.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base, out_base_old = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir, cur_input)
        # Keep using the old style naming when outputs from it already exist
        if utils.file_exists(out_base_old + ".cns"):
            out_base = out_base_old
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        # Built as plain lists: concatenating zip objects fails on Python 3
        samples_to_run = ([("background", cdata) for cdata in backgrounds] +
                          [("evaluate", cdata) for cdata in inputs])
        # New style shared SV bins
        if tz.get_in(["depth", "bins", "target"], inputs[0]):
            target_bed = tz.get_in(["depth", "bins", "target"], inputs[0])
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                       [_get_general_coverage(cdata, itype)
                                        for itype, cdata in samples_to_run])
        # Back compatible with pre-existing runs
        else:
            target_bed, antitarget_bed = _get_original_targets(inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                       [_get_original_coverage(cdata, itype)
                                        for itype, cdata in samples_to_run])
        # Currently metrics not calculated due to speed and needing re-evaluation
        # We could re-enable with larger truth sets to evaluate background noise
        # But want to reimplement in a more general fashion as part of normalization
        if False:
            coverage_cnns = reduce(operator.add,
                                   [_cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                                                    inputs + backgrounds)
                                    for cnns in tz.groupby("bam", raw_coverage_cnns).values()])
            background_cnn = _cnvkit_background(_select_background_cnns(coverage_cnns),
                                                background_cnn, target_bed, antitarget_bed, inputs[0])
        else:
            coverage_cnns = raw_coverage_cnns
            background_cnn = _cnvkit_background([x["file"] for x in coverage_cnns
                                                 if x["itype"] == "background"],
                                                background_cnn, target_bed, antitarget_bed, inputs[0])
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        evaluate_cnns = [x for x in coverage_cnns if x["itype"] == "evaluate"]
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs, ckouts)
                                    for cnns in tz.groupby("bam", evaluate_cnns).values()],
                                   inputs[0]["config"], parallel)
        # _cnvkit_segment requires the full set of samples as `items`
        for cnr, data in fixed_cnrs:
            _cnvkit_segment(cnr, cov_interval, data, inputs + backgrounds)
    return ckouts
def _normalize_sv_coverage_cnvkit(group_id, inputs, backgrounds, work_dir, back_files, out_files):
    """Normalize CNV coverage depths by GC, repeats and background using CNVkit

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix

    Fills `out_files` and `back_files` (keyed by sample name) for every input
    with target bins and returns both dictionaries.
    """
    from bcbio.structural import cnvkit
    target_key = ["depth", "bins", "target"]
    antitarget_key = ["depth", "bins", "antitarget"]
    background_cnns = reduce(operator.add,
                             [[tz.get_in(target_key, x), tz.get_in(antitarget_key, x)]
                              for x in backgrounds],
                             [])
    # The last input with target bins supplies the BED files for the background
    for sample in inputs:
        if tz.get_in(target_key, sample):
            target_bed = tz.get_in(target_key, sample)
            antitarget_bed = tz.get_in(antitarget_key, sample)
    configured_backs = set()
    for sample in inputs:
        reference = dd.get_background_cnv_reference(sample, "cnvkit")
        if reference is not None:
            configured_backs.add(reference)
    if configured_backs:
        assert len(configured_backs) == 1, "Multiple backgrounds in group: %s" % list(configured_backs)
        back_file = list(configured_backs)[0]
    else:
        back_file = cnvkit.cnvkit_background(
            background_cnns,
            os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
            backgrounds or inputs, target_bed, antitarget_bed)
    fix_cmd_inputs = []
    for data in inputs:
        sample_name = dd.get_sample_name(data)
        bins_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                   sample_name, "bins"))
        if tz.get_in(target_key, data):
            fix_file = os.path.join(bins_dir, "%s-normalized.cnr" % (dd.get_sample_name(data)))
            fix_cmd_inputs.append((tz.get_in(target_key, data),
                                   tz.get_in(antitarget_key, data),
                                   back_file, fix_file, data))
            out_files[dd.get_sample_name(data)] = fix_file
            back_files[dd.get_sample_name(data)] = back_file
    parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
    run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"], parallel)
    return back_files, out_files
def _run_qsignature_generator(bam_file, data, out_dir):
    """Run SignatureGenerator to create a normalized VCF, later used as input
    for the qsignature summary.

    :param bam_file: (str) path of the bam_file
    :param data: (list) list containing the all the dictionary
                     for this sample
    :param out_dir: (str) path of the output

    :returns: (dict) dict with the normalize vcf file under 'qsig_vcf', or an
              empty dict when qsignature is not requested or not available
    """
    position = dd.get_qsig_file(data)
    mixup_check = dd.get_mixup_check(data)
    if not (mixup_check and mixup_check.startswith("qsignature")):
        return {}
    if not position:
        logger.info("There is no qsignature for this species: %s"
                    % tz.get_in(['genome_build'], data))
        return {}
    if mixup_check == "qsignature_full":
        # Full mode uses the whole BAM with more memory and a higher read limit
        slice_bam = bam_file
        jvm_opts = "-Xms750m -Xmx8g"
        limit_reads = 100000000
    else:
        jvm_opts = "-Xms750m -Xmx2g"
        limit_reads = 20000000
        slice_bam = _slice_chr22(bam_file, data)
    qsig = config_utils.get_program("qsignature", data["config"])
    if not qsig:
        return {}
    utils.safe_makedir(out_dir)
    out_file = os.path.join(out_dir, os.path.basename(slice_bam).replace("bam", "qsig.vcf"))
    log_file = os.path.join(out_dir, "qsig.log")
    cores = dd.get_cores(data)
    if os.path.exists(out_file):
        return {'qsig_vcf': out_file}
    # Fall back to the sliced BAM when downsampling returns nothing
    down_file = bam.downsample(slice_bam, data, limit_reads) or slice_bam
    file_qsign_out = "{0}.qsig.vcf".format(down_file)
    generator_cmd = ("{qsig} {jvm_opts} "
                     "org.qcmg.sig.SignatureGenerator "
                     "--noOfThreads {cores} "
                     "-log {log_file} -i {position} "
                     "-i {down_file} ").format(qsig=qsig, jvm_opts=jvm_opts, cores=cores,
                                               log_file=log_file, position=position,
                                               down_file=down_file)
    do.run(generator_cmd, "qsignature vcf generation: %s" % data["name"][-1])
    if not os.path.exists(file_qsign_out):
        raise IOError("File doesn't exist %s" % file_qsign_out)
    with file_transaction(data, out_file) as file_txt_out:
        shutil.move(file_qsign_out, file_txt_out)
    return {'qsig_vcf': out_file}
def _run_cnvkit_shared_orig(inputs, backgrounds):
    """Original CNVkit implementation with full normalization and segmentation.

    Returns one dictionary per input with "cnr" and "cns" output paths; work is
    skipped when the segment file of the first input is already present.
    """
    first = inputs[0]
    raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(first), "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base, out_base_old = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir, cur_input)
        # Prefer the old style naming when a previous run already produced it
        chosen_base = out_base_old if utils.file_exists(out_base_old + ".cns") else out_base
        ckouts.append({"cnr": "%s.cnr" % chosen_base, "cns": "%s.cns" % chosen_base})
    if utils.file_exists(ckouts[0]["cns"]):
        return ckouts
    cov_interval = dd.get_coverage_interval(inputs[0])
    samples_to_run = ([("background", cdata) for cdata in backgrounds] +
                      [("evaluate", cdata) for cdata in inputs])
    # New style shared SV bins
    if tz.get_in(["depth", "bins", "target"], inputs[0]):
        target_bed = tz.get_in(["depth", "bins", "target"], inputs[0])
        antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], inputs[0])
        per_sample_cnns = [_get_general_coverage(cdata, itype) for itype, cdata in samples_to_run]
    # Back compatible with pre-existing runs
    else:
        target_bed, antitarget_bed = _get_original_targets(inputs[0])
        per_sample_cnns = [_get_original_coverage(cdata, itype) for itype, cdata in samples_to_run]
    raw_coverage_cnns = reduce(operator.add, per_sample_cnns)
    # Currently metrics not calculated due to speed and needing re-evaluation
    # We could re-enable with larger truth sets to evaluate background noise
    # But want to reimplement in a more general fashion as part of normalization
    if False:
        coverage_cnns = reduce(operator.add,
                               [_cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                                                inputs + backgrounds)
                                for cnns in tz.groupby("bam", raw_coverage_cnns).values()])
        background_cnn = cnvkit_background(_select_background_cnns(coverage_cnns),
                                           background_cnn, inputs, target_bed, antitarget_bed)
    else:
        coverage_cnns = raw_coverage_cnns
        background_files = [x["file"] for x in coverage_cnns if x["itype"] == "background"]
        background_cnn = cnvkit_background(background_files, background_cnn, inputs,
                                           target_bed, antitarget_bed)
    parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
    evaluate_cnns = [x for x in coverage_cnns if x["itype"] == "evaluate"]
    fix_args = [(cnns, background_cnn, inputs, ckouts)
                for cnns in tz.groupby("bam", evaluate_cnns).values()]
    fixed_cnrs = run_multicore(_cnvkit_fix, fix_args, inputs[0]["config"], parallel)
    for cnr, data in fixed_cnrs:
        _cnvkit_segment(cnr, cov_interval, data, inputs + backgrounds)
    return ckouts
def _run_wham_genotype(in_file, all_bams, coords, data):
    """Genotype a prepared, merged VCF with WHAM-GRAPHENING over one region.

    The output is named after the input with "-wgts" inserted before the
    extension; an existing output is reused. Returns the output path.
    """
    base, ext = utils.splitext_plus(in_file)
    out_file = "%s-wgts%s" % (base, ext)
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        genotype_cmd = ("WHAM-GRAPHENING -b {in_file} -x {cores} -a {ref_file} -f {all_bams} -r {coord_str} "
                        "> {tx_out_file}").format(in_file=in_file,
                                                  cores=dd.get_cores(data),
                                                  ref_file=dd.get_ref_file(data),
                                                  all_bams=all_bams,
                                                  coord_str=bamprep.region_to_gatk(coords),
                                                  tx_out_file=tx_out_file)
        do.run(genotype_cmd, "Genotype WHAM: %s" % region.to_safestr(coords))
    return out_file
def _run_combine_gvcfs(vrn_files, region, ref_file, out_file, data):
    """Merge gVCF inputs over a region with GATK CombineGVCFs.

    Skips the run when `out_file` already exists and returns `out_file`.
    """
    if utils.file_exists(out_file):
        return out_file
    broad_runner = broad.runner_from_config(data["config"])
    with file_transaction(data, out_file) as tx_out_file:
        params = ["-T", "CombineGVCFs", "-R", ref_file, "-o", tx_out_file,
                  "-L", bamprep.region_to_gatk(region)]
        for gvcf in vrn_files:
            params.extend(["--variant", gvcf])
        cores = dd.get_cores(data)
        # Only request scaled up memory when running with multiple cores
        if cores > 1:
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
        else:
            memscale = None
        broad_runner.new_resources("gatk-haplotype")
        broad_runner.run_gatk(params, memscale=memscale)
    return out_file
def run(bam_file, data, out_dir):
    """Run SignatureGenerator to create a normalized VCF, later used as input
    for the qsignature summary.

    :param bam_file: (str) path of the bam_file
    :param data: (list) list containing the all the dictionary
                     for this sample
    :param out_dir: (str) path of the output

    :returns: (string) output normalized vcf file, or None when qsignature is
              unavailable or not requested
    """
    qsig = config_utils.get_program("qsignature", data["config"])
    qsig_resources = config_utils.get_resources("qsignature", data["config"])
    jvm_opts = " ".join(qsig_resources.get("jvm_opts", ["-Xms750m", "-Xmx8g"]))
    if not qsig:
        logger.info("There is no qsignature tool. Skipping...")
        return None
    position = dd.get_qsig_file(data)
    mixup_check = dd.get_mixup_check(data)
    if not (mixup_check and mixup_check.startswith("qsignature")):
        return None
    utils.safe_makedir(out_dir)
    if not position:
        logger.info("There is no qsignature for this species: %s"
                    % tz.get_in(['genome_build'], data))
        return None
    if mixup_check == "qsignature_full":
        down_bam = bam_file
    else:
        # Restrict both the reads and the positions to the sliced subset
        down_bam = _slice_bam_chr21(bam_file, data)
        position = _slice_vcf_chr21(position, out_dir)
    out_file = os.path.join(out_dir, os.path.basename(down_bam).replace("bam", "qsig.vcf"))
    log_file = os.path.join(out_dir, "qsig.log")
    cores = dd.get_cores(data)
    if os.path.exists(out_file):
        return out_file
    file_qsign_out = "{0}.qsig.vcf".format(down_bam)
    generator_cmd = ("{qsig} {jvm_opts} "
                     "org.qcmg.sig.SignatureGenerator "
                     "--noOfThreads {cores} "
                     "-log {log_file} -i {position} "
                     "-i {down_bam} ").format(qsig=qsig, jvm_opts=jvm_opts, cores=cores,
                                              log_file=log_file, position=position,
                                              down_bam=down_bam)
    do.run(generator_cmd, "qsignature vcf generation: %s" % dd.get_sample_name(data))
    if not os.path.exists(file_qsign_out):
        raise IOError("File doesn't exist %s" % file_qsign_out)
    with file_transaction(data, out_file) as file_txt_out:
        shutil.move(file_qsign_out, file_txt_out)
    return out_file
def run_mosdepth(data, target_name, bed_file, per_base=False, quantize=None):
    """Run mosdepth generating distribution, region depth and per-base depth.

    data -- sample dictionary
    target_name -- label used in output file names and in the log message
    bed_file -- optional regions passed with --by; enables the regions output
    per_base -- when true, keep the per-base depth output
    quantize -- optional pair: item 0 is passed to --quantize, the values in
                item 1 are exported as MOSDEPTH_Q<index> environment variables

    Returns a MosdepthCov namedtuple (dist, per_base, regions, quantize) of
    output paths, with None for outputs that were not requested.
    """
    MosdepthCov = collections.namedtuple("MosdepthCov", ("dist", "per_base", "regions", "quantize"))
    bam_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "coverage",
                                               dd.get_sample_name(data)))
    prefix = os.path.join(work_dir, "%s-%s" % (dd.get_sample_name(data), target_name))
    out = MosdepthCov("%s.mosdepth.dist.txt" % prefix,
                      ("%s.per-base.bed.gz" % prefix) if per_base else None,
                      ("%s.regions.bed.gz" % prefix) if bed_file else None,
                      ("%s.quantized.bed.gz" % prefix) if quantize else None)
    if not utils.file_uptodate(out.dist, bam_file):
        with file_transaction(data, out.dist) as tx_out_file:
            tx_dir = os.path.dirname(tx_out_file)
            tx_prefix = os.path.join(tx_dir, os.path.basename(prefix))
            num_cores = dd.get_cores(data)
            bed_arg = ("--by %s" % bed_file) if bed_file else ""
            perbase_arg = "" if per_base else "--no-per-base"
            mapq_arg = "-Q 1" if (per_base or quantize) else ""
            if quantize:
                quant_arg = "--quantize %s" % quantize[0]
                quant_export = " && ".join(["export MOSDEPTH_Q%s=%s" % (i, x)
                                            for (i, x) in enumerate(quantize[1])])
                quant_export += " && "
            else:
                quant_arg, quant_export = "", ""
            cmd = ("{quant_export}mosdepth -t {num_cores} -F 1804 {mapq_arg} {perbase_arg} {bed_arg} {quant_arg} "
                   "{tx_prefix} {bam_file}").format(quant_export=quant_export, num_cores=num_cores,
                                                    mapq_arg=mapq_arg, perbase_arg=perbase_arg,
                                                    bed_arg=bed_arg, quant_arg=quant_arg,
                                                    tx_prefix=tx_prefix, bam_file=bam_file)
            # The message is already fully interpolated; passing it through
            # str.format as well would fail on names containing braces.
            message = "Calculating coverage: %s %s" % (dd.get_sample_name(data), target_name)
            do.run(cmd, message)
            # Secondary outputs are written next to the transactional file and
            # need to be moved to their final locations by hand.
            for final_file in (out.per_base, out.regions, out.quantize):
                if final_file:
                    shutil.move(os.path.join(tx_dir, os.path.basename(final_file)), final_file)
    return out
def run_combine_gvcfs(vrn_files, region, ref_file, out_file, data):
    """Merge gVCF inputs with GATK CombineGVCFs, optionally limited to a region.

    Skips the GATK run when `out_file` already exists. Returns the result of
    bgzipping and indexing `out_file`.
    """
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "CombineGVCFs", "-R", ref_file, "-o", tx_out_file]
            if region:
                params.extend(["-L", bamprep.region_to_gatk(region)])
            for gvcf in vrn_files:
                params.extend(["--variant", gvcf])
            cores = dd.get_cores(data)
            # Only request scaled up memory when running with multiple cores
            if cores > 1:
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.new_resources("gatk-haplotype")
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _run_wham(inputs, background_bams):
    """Call structural variants with whamg on the inputs plus background BAMs.

    Restricts calling to contigs accepted by `chromhacks.is_autosomal_or_x` and
    writes bgzipped output named after the first input sample. Returns the
    bgzipped and indexed output file.
    """
    first = inputs[0]
    out_file = os.path.join(_sv_workdir(first), "%s-wham.vcf.gz" % dd.get_sample_name(first))
    if not utils.file_exists(out_file):
        with file_transaction(first, out_file) as tx_out_file:
            cores = dd.get_cores(first)
            ref_file = dd.get_ref_file(first)
            keep_contigs = [contig.name for contig in ref.file_contigs(ref_file)
                            if chromhacks.is_autosomal_or_x(contig.name)]
            bam_files = [x["align_bam"] for x in inputs] + background_bams
            wham_cmd = ("whamg -x {cores} -a {ref_file} -f {all_bams} -c {include_chroms} "
                        "| bgzip -c > {tx_out_file}").format(cores=cores, ref_file=ref_file,
                                                             all_bams=",".join(bam_files),
                                                             include_chroms=",".join(keep_contigs),
                                                             tx_out_file=tx_out_file)
            sample_names = ", ".join(dd.get_sample_name(d) for d in inputs)
            do.run(wham_cmd, "WHAM SV caller: %s" % sample_names)
    return vcfutils.bgzip_and_index(out_file, first["config"])
def _run_wham(inputs, background_bams):
    """Run WHAM-BAM on the input BAMs, with optional background BAMs and
    optional restriction to the configured variant regions.

    Output is a VCF named after the first input sample; an existing output is
    reused. Returns the output path.
    """
    first = inputs[0]
    out_file = os.path.join(_sv_workdir(first), "%s-wham.vcf" % dd.get_sample_name(first))
    input_bams = [x["align_bam"] for x in inputs]
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(first, out_file) as tx_out_file:
        cores = dd.get_cores(first)
        if background_bams:
            background = "-b %s" % ",".join(background_bams)
        else:
            background = ""
        target_bed = tz.get_in(["config", "algorithm", "variant_regions"], first)
        if target_bed:
            target_str = "-e %s" % target_bed
        else:
            target_str = ""
        wham_cmd = ("WHAM-BAM -x {cores} -t {target_bams} {background} {target_str} > {tx_out_file}").format(
            cores=cores, target_bams=",".join(input_bams), background=background,
            target_str=target_str, tx_out_file=tx_out_file)
        do.run(wham_cmd, "Run WHAM")
    return out_file
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.

    inputs -- sample dictionaries to evaluate
    backgrounds -- sample dictionaries used to build the background reference;
                   a "flat" background name is used when empty

    Returns one dictionary per input with "cnr", "cns" and "back_cnn" paths, or
    an empty dictionary when no target regions are available. Processing is
    skipped when the normalized file of the first input exists.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir)
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cnr"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, inputs[0], work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, inputs[0])
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        # Percentage of the accessible regions covered by the raw targets
        pct_coverage = (pybedtools.BedTool(raw_target_bed).total_coverage() /
                        float(pybedtools.BedTool(access_bed).total_coverage())) * 100.0
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     pct_coverage, raw_work_dir, inputs[0])
        split_beds = _split_bed(target_bed, inputs[0]) + _split_bed(antitarget_bed, inputs[0])
        # Built as plain lists: concatenating zip objects fails on Python 3
        samples_to_run = ([("background", cdata) for cdata in backgrounds] +
                          [("evaluate", cdata) for cdata in inputs])
        split_cnns = run_multicore(_cnvkit_coverage,
                                   [(cdata, bed, itype)
                                    for itype, cdata in samples_to_run
                                    for bed in split_beds],
                                   inputs[0]["config"], parallel)
        raw_coverage_cnns = _merge_coverage(split_cnns, inputs[0])
        coverage_cnns = run_multicore(_cnvkit_metrics,
                                      [(cnns, target_bed, antitarget_bed, cov_interval,
                                        inputs + backgrounds)
                                       for cnns in tz.groupby("bam", raw_coverage_cnns).values()],
                                      inputs[0]["config"], parallel)
        background_cnn = _cnvkit_background(_select_background_cnns(coverage_cnns),
                                            background_cnn, target_bed, antitarget_bed, inputs[0])
        evaluate_cnns = [x for x in coverage_cnns if x["itype"] == "evaluate"]
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs + backgrounds)
                                    for cnns in tz.groupby("bam", evaluate_cnns).values()],
                                   inputs[0]["config"], parallel)
        # NOTE(review): _cnvkit_segment as defined in this file also takes an
        # `items` argument -- confirm how run_multicore unpacks these tuples.
        run_multicore(_cnvkit_segment,
                      [(cnr, cov_interval, data) for cnr, data in fixed_cnrs],
                      inputs[0]["config"], parallel)
    return ckouts
def _run_genotype_gvcfs(data, vrn_files, ref_file, out_file):
    """Jointly genotype gVCF inputs with GATK GenotypeGVCFs.

    Uses multiple threads (-nt) and scaled memory when more than one core is
    configured. Skips the run when `out_file` exists and returns `out_file`.
    """
    if file_exists(out_file):
        return out_file
    broad_runner = broad.runner_from_config(data["config"])
    with file_transaction(data, out_file) as tx_out_file:
        params = ["-T", "GenotypeGVCFs", "-R", ref_file, "-o", tx_out_file]
        for gvcf in vrn_files:
            params.extend(["--variant", gvcf])
        broad_runner.new_resources("gatk-haplotype")
        cores = dd.get_cores(data)
        memscale = None
        if cores > 1:
            params.extend(["-nt", str(cores)])
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
        broad_runner.run_gatk(params, memscale=memscale)
    return out_file
def _run_wham(inputs, background_bams):
    """Run WHAM per coordinate region in parallel and concatenate the results.

    Regions come from `chromhacks.autosomal_or_x_coords` on the reference of
    the first input. Returns the combined, bgzipped VCF path.
    """
    first = inputs[0]
    out_file = os.path.join(_sv_workdir(first), "%s-wham.vcf.gz" % dd.get_sample_name(first))
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(first, out_file) as tx_out_file:
        coords = chromhacks.autosomal_or_x_coords(dd.get_ref_file(first))
        parallel = {"type": "local", "cores": dd.get_cores(first), "progs": []}
        region_args = [(inputs, background_bams, coord, out_file) for coord in coords]
        region_results = run_multicore(_run_wham_coords, region_args, first["config"], parallel)
        # Index results by region so they can be concatenated in `coords` order
        file_by_coord = {coord: fname for (coord, fname) in region_results}
        ordered_files = [file_by_coord[c] for c in coords]
        vcfutils.concat_variant_files(ordered_files, tx_out_file, coords,
                                      dd.get_ref_file(first), first["config"])
    return out_file
def _run_wham_coords(inputs, background_bams, coords, final_file):
    """Run WHAM-GRAPHENING on one (chromosome, start, end) region.

    The per-region output name is derived from `final_file` by inserting a safe
    string for the region before the extension. Returns a single-item list
    holding the [coords, out_file] pair.
    """
    base, ext = os.path.splitext(final_file)
    out_file = "%s-%s%s" % (base, region.to_safestr(coords), ext)
    if not utils.file_exists(out_file):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            cores = dd.get_cores(inputs[0])
            ref_file = dd.get_ref_file(inputs[0])
            bam_files = [x["align_bam"] for x in inputs] + background_bams
            wham_cmd = ("WHAM-GRAPHENING {opts} -x {cores} -a {ref_file} -f {all_bams} -r {coord_str} "
                        "> {tx_out_file}").format(opts="-k -m 30", cores=cores, ref_file=ref_file,
                                                  all_bams=",".join(bam_files),
                                                  coord_str=bamprep.region_to_gatk(coords),
                                                  tx_out_file=tx_out_file)
            do.run(wham_cmd, "Run WHAM: %s" % region.to_safestr(coords))
    return [[coords, out_file]]
def _run_genotype_gvcfs_genomicsdb(genomics_db, region, out_file, data):
    """GenotypeGVCFs from a merged GenomicsDB input: GATK4.

    No core scaling -- not yet supported in GATK4. Memory is still scaled when
    more than one core is configured. Returns the bgzipped and indexed output.
    """
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            params = ["-T", "GenotypeGVCFs"]
            params.extend(["--variant", "gendb://%s" % genomics_db])
            params.extend(["-R", dd.get_ref_file(data)])
            params.extend(["--output", tx_out_file])
            params.extend(["-L", bamprep.region_to_gatk(region)])
            cores = dd.get_cores(data)
            if cores > 1:
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _get_jvm_opts(out_file, data):
    """Retrieve Java options for "purple", adjusting memory for available cores.

    Configured (or default) options are scaled upwards by core count with a
    "30000M" ceiling and then extended with the default JVM options for the
    output directory.
    """
    configured = config_utils.get_resources("purple", data["config"]).get(
        "jvm_opts", ["-Xms750m", "-Xmx3500m"])
    adjustment = {"algorithm": {"memory_adjust": {"direction": "increase",
                                                  "maximum": "30000M",
                                                  "magnitude": dd.get_cores(data)}}}
    jvm_opts = config_utils.adjust_opts(configured, adjustment)
    out_dir = os.path.dirname(out_file)
    jvm_opts += broad.get_default_jvm_opts(out_dir)
    return jvm_opts
def _run_gridss(inputs, background, work_dir):
    """Run GRIDSS structural variant calling on inputs plus background samples.

    The output is named after the batch (or sample name) of the first input.
    The run is skipped when either the plain or the ".gz" output exists.
    Returns the bgzipped and indexed VCF.
    """
    first = inputs[0]
    out_name = "%s-gridss.sv.vcf" % (dd.get_batch(first) or dd.get_sample_name(first))
    out_file = os.path.join(work_dir, out_name)
    if not (utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")):
        with file_transaction(first, out_file) as tx_out_file:
            tx_dir = os.path.dirname(tx_out_file)
            htsjdk_opts = ["-Dsamjdk.create_index=true",
                           "-Dsamjdk.use_async_io_read_samtools=true",
                           "-Dsamjdk.use_async_io_write_samtools=true",
                           "-Dsamjdk.use_async_io_write_tribble=true"]
            cores = dd.get_cores(first)
            resources = config_utils.get_resources("gridss", first["config"])
            memory_adjust = {"direction": "increase", "magnitude": cores}
            jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"]),
                                                {"algorithm": {"memory_adjust": memory_adjust}})
            jvm_opts = _finalize_memory(jvm_opts)
            tx_ref_file = _setup_reference_files(first, tx_dir)
            blacklist_bed = sshared.prepare_exclude_file(inputs + background, out_file)
            gridss_args = ["THREADS=%s" % cores,
                           "TMP_DIR=%s" % tx_dir,
                           "WORKING_DIR=%s" % tx_dir,
                           "OUTPUT=%s" % tx_out_file,
                           "ASSEMBLY=%s" % tx_out_file.replace(".sv.vcf", ".gridss.assembly.bam"),
                           "REFERENCE_SEQUENCE=%s" % tx_ref_file,
                           "BLACKLIST=%s" % blacklist_bed]
            cmd = ["gridss"] + jvm_opts + htsjdk_opts + ["gridss.CallVariants"] + gridss_args
            for sample in inputs + background:
                cmd.append("INPUT=%s" % dd.get_align_bam(sample))
                cmd.append("INPUT_LABEL=%s" % dd.get_sample_name(sample))
            exports = utils.local_path_export()
            do.run(exports + " ".join(cmd), "GRIDSS SV analysis")
    return vcfutils.bgzip_and_index(out_file, first["config"])
def _call_hla(hla_fq, out_dir, data):
    """Run OptiType HLA calling for a specific input file.

    Writes an OptiType configuration pointing at the razers3 executable found
    next to the running Python interpreter, runs the pipeline in a temporary
    directory and moves that directory to `out_dir`.

    Returns the single "*_result.tsv" file found one level below `out_dir`.
    Raises ValueError when razers3 is not present.
    """
    python_bin_dir = os.path.dirname(os.path.realpath(sys.executable))
    with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
        config_file = os.path.join(tx_out_dir, "config.ini")
        with open(config_file, "w") as out_handle:
            razers3 = os.path.join(python_bin_dir, "razers3")
            if not os.path.exists(razers3):
                raise ValueError("Could not find razers3 executable at %s" % (razers3))
            out_handle.write(CONFIG_TMPL.format(razers3=razers3, cores=dd.get_cores(data)))
        optitype_cmd = ("OptiTypePipeline.py -v --dna -o {tx_out_dir} "
                        "-i {hla_fq} -c {config_file}").format(tx_out_dir=tx_out_dir,
                                                               hla_fq=hla_fq,
                                                               config_file=config_file)
        do.run(optitype_cmd, "HLA typing with OptiType")
        shutil.move(tx_out_dir, out_dir)
    result_files = glob.glob(os.path.join(out_dir, "*", "*_result.tsv"))
    assert len(result_files) == 1, "Expected one result file for OptiType, found %s" % result_files
    return result_files[0]
def index(ref_file, out_dir, data):
    """Create a STAR index in the defined reference directory.

    ref_file -- genome FASTA passed to --genomeFastaFiles
    out_dir -- final index directory; nothing is run when it already exists
    data -- sample dictionary supplying the GTF file and core count

    Returns `out_dir`. Raises ValueError when the GTF file is missing.
    """
    gtf_file = dd.get_gtf_file(data)
    if not utils.file_exists(gtf_file):
        raise ValueError("%s not found, could not create a star index." % (gtf_file))
    if not utils.file_exists(out_dir):
        with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
            num_cores = dd.get_cores(data)
            cmd = ("STAR --genomeDir {tx_out_dir} --genomeFastaFiles {ref_file} "
                   "--runThreadN {num_cores} "
                   "--runMode genomeGenerate --sjdbOverhang 99 --sjdbGTFfile {gtf_file}")
            do.run(cmd.format(tx_out_dir=tx_out_dir, ref_file=ref_file,
                              num_cores=num_cores, gtf_file=gtf_file),
                   "Index STAR")
            # Replace whatever is left at the destination with the new index
            if os.path.exists(out_dir):
                shutil.rmtree(out_dir)
            shutil.move(tx_out_dir, out_dir)
    return out_dir
def _calculate_coverage(data, work_dir, bed_file, bam_file, sample_name):
    """Compute per-region depth with sambamba and convert it for downstream use.

    Produces "<sample_name>-sambamba_depth.tsv" and then
    "<sample_name>-coverage.tsv" in `work_dir`, reusing either file when it
    already exists. Returns the path of the converted coverage file.
    """
    depth_file = os.path.join(work_dir, sample_name + '-sambamba_depth.tsv')
    sambamba = config_utils.get_program("sambamba", data["config"])
    num_cores = dd.get_cores(data)
    if not utils.file_exists(depth_file):
        with file_transaction(data, depth_file) as tx_out_file:
            depth_cmd = ("{sambamba} depth region -t {num_cores} "
                         "-F \"\" -L {bed_file} {bam_file} -o {tx_out_file}").format(
                             sambamba=sambamba, num_cores=num_cores, bed_file=bed_file,
                             bam_file=bam_file, tx_out_file=tx_out_file)
            do.run(depth_cmd, "Calling sambamba region depth")
    logger.debug("Saved to " + depth_file)
    out_file = os.path.join(work_dir, sample_name + '-coverage.tsv')
    if not utils.file_exists(out_file):
        logger.debug('Converting sambamba depth output to cov2lr.pl input in ' + dd.get_sample_name(data))
        with file_transaction(data, out_file) as tx_out_file:
            _sambabma_depth_to_seq2cov(depth_file, tx_out_file, sample_name)
    logger.debug("Saved to " + out_file)
    return out_file
def _run_wham(inputs, background_bams):
    """Run WHAM per coordinate region in parallel and join the results.

    Per-region outputs are appended, in region order, into a single bedpe file
    named after the first input sample. Returns that file's path.
    """
    first = inputs[0]
    out_file = os.path.join(_sv_workdir(first), "%s-wham.bedpe" % dd.get_sample_name(first))
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(first, out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            coords = chromhacks.autosomal_or_x_coords(dd.get_ref_file(first))
            parallel = {"type": "local", "cores": dd.get_cores(first), "progs": ["wham"]}
            region_args = [(inputs, background_bams, coord, out_file) for coord in coords]
            region_results = run_multicore(_run_wham_coords, region_args, first["config"], parallel)
            # Index results by region so they are written in `coords` order
            file_by_coord = {coord: fname for (coord, fname) in region_results}
            for coord in coords:
                with open(file_by_coord[coord]) as in_handle:
                    shutil.copyfileobj(in_handle, out_handle)
    return out_file
def _run_vqsr(in_file, ref_file, vrn_files, sensitivity_cutoff, filter_type, data):
    """Run variant quality score recalibration.

    in_file -- input variant file
    ref_file -- reference genome passed with -R
    vrn_files -- training resources handed to _get_vqsr_training
    sensitivity_cutoff -- tranche added to the default list when not present
    filter_type -- recalibration mode passed with --mode
    data -- sample dictionary

    Returns (recal_file, tranches_file), or (None, None) when the GATK run
    fails, which can happen when too few values are present to train.
    """
    cutoffs = ["100.0", "99.99", "99.98", "99.97", "99.96", "99.95", "99.94", "99.93", "99.92", "99.91",
               "99.9", "99.8", "99.7", "99.6", "99.5", "99.0", "98.0", "90.0"]
    if sensitivity_cutoff not in cutoffs:
        cutoffs.append(sensitivity_cutoff)
        cutoffs.sort()
    broad_runner = broad.runner_from_config(data["config"])
    gatk_type = broad_runner.gatk_type()
    base = utils.splitext_plus(in_file)[0]
    recal_file = ("%s-vqsrrecal.vcf.gz" % base) if gatk_type == "gatk4" else ("%s.recal" % base)
    tranches_file = "%s.tranches" % base
    plot_file = "%s-plots.R" % base
    if not utils.file_exists(recal_file):
        with file_transaction(data, recal_file, tranches_file, plot_file) as (tx_recal, tx_tranches,
                                                                              tx_plot_file):
            params = ["-T", "VariantRecalibrator",
                      "-R", ref_file,
                      "--mode", filter_type]
            if gatk_type == "gatk4":
                params += ["--variant", in_file, "--output", tx_recal,
                           "--tranches-file", tx_tranches, "--rscript-file", tx_plot_file]
            else:
                params += ["--input", in_file, "--recal_file", tx_recal,
                           "--tranches_file", tx_tranches, "--rscript_file", tx_plot_file]
            params += _get_vqsr_training(filter_type, vrn_files, gatk_type)
            resources = config_utils.get_resources("gatk_variant_recalibrator", data["config"])
            # Copy so the defaults added below never leak into the shared
            # configuration dictionary.
            opts = list(resources.get("options", []))
            if not opts:
                for cutoff in cutoffs:
                    opts += ["-tranche", str(cutoff)]
                for a in _get_vqsr_annotations(filter_type, data):
                    opts += ["-an", a]
            params += opts
            cores = dd.get_cores(data)
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            try:
                broad_runner.new_resources("gatk-vqsr")
                broad_runner.run_gatk(params, log_error=False, memscale=memscale, parallel_gc=True)
            except Exception:
                # Can fail to run if not enough values are present to train.
                # Only regular errors are treated this way so that interrupts
                # and interpreter exits still propagate.
                return None, None
    if gatk_type == "gatk4":
        vcfutils.bgzip_and_index(recal_file, data["config"])
    return recal_file, tranches_file
def _run_wham_coords(inputs, background_bams, coords, final_file):
    """Run WHAM on one (chromosome, start, end) region, then merge, genotype
    and sort the calls.

    Returns a single-item list holding the [coords, prepared_vcf] pair.
    """
    first = inputs[0]
    base = utils.splitext_plus(final_file)[0]
    raw_file = "%s-%s.vcf" % (base, region.to_safestr(coords))
    all_bams = ",".join([x["align_bam"] for x in inputs] + background_bams)
    if not utils.file_exists(raw_file):
        with file_transaction(first, raw_file) as tx_raw_file:
            wham_cmd = ("WHAM-GRAPHENING {opts} -x {cores} -a {ref_file} -f {all_bams} -r {coord_str} "
                        "> {tx_raw_file}").format(opts="-k -m 30",
                                                  cores=dd.get_cores(first),
                                                  ref_file=dd.get_ref_file(first),
                                                  all_bams=all_bams,
                                                  coord_str=bamprep.region_to_gatk(coords),
                                                  tx_raw_file=tx_raw_file)
            do.run(wham_cmd, "Run WHAM: %s" % region.to_safestr(coords))
    # Post-process the raw calls: merge, genotype, then sort by reference
    merged_vcf = _run_wham_merge(raw_file, first)
    genotyped_vcf = _run_wham_genotype(merged_vcf, all_bams, coords, first)
    sorted_vcf = vcfutils.sort_by_ref(genotyped_vcf, first)
    return [[coords, sorted_vcf]]