def _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence, work_dir, items):
    """Call structural variants with lumpyexpress, passing along prior evidence.

    The command is only run when the output VCF is not already present; it is
    executed inside a file transaction with a transactional temporary directory.

    Args:
        full_bams: iterable of paths, comma-joined for ``-B``.
        sr_bams: iterable of paths, comma-joined for ``-S``.
        disc_bams: iterable of paths, comma-joined for ``-D``.
        previous_evidence: mapping of sample -> {evidence type -> file}; each
            file accepted by ``utils.file_exists`` is passed to ``-d`` as
            ``sample:file``.
        work_dir: directory that receives the output VCF.
        items: list of sample data dictionaries; the first one supplies the
            ``align_bam`` used for naming and is handed to the run helpers.

    Returns:
        Tuple of (result of ``vcfutils.sort_by_ref`` for the output VCF,
        exclusion BED produced by ``sshared.prepare_exclude_file``).
    """
    first = items[0]
    batch = sshared.get_cur_batch(items)
    if batch:
        ext = "-%s-svs" % batch
    else:
        ext = "-svs"
    bam_stem = os.path.splitext(os.path.basename(first["align_bam"]))[0]
    out_file = os.path.join(work_dir, "%s%s.vcf" % (bam_stem, ext))
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    if not utils.file_exists(out_file):
        with file_transaction(first, out_file) as tx_out_file, tx_tmpdir(first) as tmpdir:
            full_arg = ",".join(full_bams)
            sr_arg = ",".join(sr_bams)
            disc_arg = ",".join(disc_bams)
            exclude = ""
            if sv_exclude_bed and utils.file_exists(sv_exclude_bed):
                exclude = "-x %s" % sv_exclude_bed
            # Reference lookup kept from the original flow; the value is not
            # interpolated into the lumpyexpress command.
            dd.get_ref_file(first)
            depths = []
            for sample, ev_files in previous_evidence.items():
                depths.extend("%s:%s" % (sample, ev_file)
                              for ev_file in ev_files.values()
                              if utils.file_exists(ev_file))
            depth_arg = "-d %s" % ",".join(depths) if depths else ""
            # use our bcbio python for runs within lumpyexpress
            exports = utils.local_path_export()
            cmd = ("{exports}lumpyexpress -v -B {full_bams} -S {sr_bams} -D {disc_bams} "
                   "{exclude} {depth_arg} -T {tmpdir} -o {tx_out_file}")
            do.run(cmd.format(exports=exports, full_bams=full_arg, sr_bams=sr_arg,
                              disc_bams=disc_arg, exclude=exclude, depth_arg=depth_arg,
                              tmpdir=tmpdir, tx_out_file=tx_out_file),
                   "lumpyexpress", first)
    return vcfutils.sort_by_ref(out_file, first), sv_exclude_bed
def _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv through lumpyexpress and return the sorted VCF.

    The command is only run when the output VCF is not already present; it is
    executed inside a file transaction with a transactional temporary directory.

    Args:
        full_bams: iterable of paths, comma-joined for ``-B``.
        sr_bams: iterable of paths, comma-joined for ``-S``.
        disc_bams: iterable of paths, comma-joined for ``-D``.
        work_dir: directory that receives the output VCF.
        items: list of sample data dictionaries; the first one supplies the
            ``align_bam`` used for naming and is handed to the run helpers.

    Returns:
        Tuple of (result of ``vcfutils.sort_by_ref`` for the output VCF,
        exclusion BED produced by ``sshared.prepare_exclude_file``).
    """
    data = items[0]
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    bam_base = os.path.splitext(os.path.basename(data["align_bam"]))[0]
    out_file = os.path.join(work_dir, "%s%s.vcf" % (bam_base, ext))
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with tx_tmpdir(data) as tmpdir:
                full_arg = ",".join(full_bams)
                sr_arg = ",".join(sr_bams)
                disc_arg = ",".join(disc_bams)
                # Only probe the file system when an exclusion file was
                # actually prepared, so a missing (falsy) value simply means
                # "no -x option" instead of being handed to the file check.
                exclude = ""
                if sv_exclude_bed and utils.file_exists(sv_exclude_bed):
                    exclude = "-x %s" % sv_exclude_bed
                # Reference lookup kept from the original flow; the value is
                # not interpolated into the lumpyexpress command.
                dd.get_ref_file(data)
                # use our bcbio python for runs within lumpyexpress
                curpython_dir = os.path.dirname(sys.executable)
                cmd = ("export PATH={curpython_dir}:$PATH && "
                       "lumpyexpress -v -B {full_bams} -S {sr_bams} -D {disc_bams} "
                       "{exclude} -T {tmpdir} -o {tx_out_file}")
                do.run(cmd.format(curpython_dir=curpython_dir, full_bams=full_arg,
                                  sr_bams=sr_arg, disc_bams=disc_arg, exclude=exclude,
                                  tmpdir=tmpdir, tx_out_file=tx_out_file),
                       "lumpyexpress", data)
    return vcfutils.sort_by_ref(out_file, data), sv_exclude_bed
def _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence, work_dir, items):
    """Run lumpyexpress over the supplied BAMs, adding existing evidence files.

    Args:
        full_bams: iterable of paths, comma-joined for ``-B``.
        sr_bams: iterable of paths, comma-joined for ``-S``.
        disc_bams: iterable of paths, comma-joined for ``-D``.
        previous_evidence: mapping of sample -> {evidence type -> file}; files
            accepted by ``utils.file_exists`` go to ``-d`` as ``sample:file``.
        work_dir: directory that receives the output VCF.
        items: list of sample data dictionaries; the first one supplies the
            ``align_bam`` used for naming and is handed to the run helpers.

    Returns:
        Tuple of (result of ``vcfutils.sort_by_ref`` for the output VCF,
        exclusion BED produced by ``sshared.prepare_exclude_file``).
    """
    batch = sshared.get_cur_batch(items)
    ext = "-svs"
    if batch:
        ext = "-%s-svs" % batch
    align_name = os.path.basename(items[0]["align_bam"])
    vcf_name = "%s%s.vcf" % (os.path.splitext(align_name)[0], ext)
    out_file = os.path.join(work_dir, vcf_name)
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    # Nothing to compute when the VCF is already in place.
    if utils.file_exists(out_file):
        return vcfutils.sort_by_ref(out_file, items[0]), sv_exclude_bed
    with file_transaction(items[0], out_file) as tx_out_file:
        with tx_tmpdir(items[0]) as tmpdir:
            params = {"tmpdir": tmpdir, "tx_out_file": tx_out_file}
            params["full_bams"] = ",".join(full_bams)
            params["sr_bams"] = ",".join(sr_bams)
            params["disc_bams"] = ",".join(disc_bams)
            use_exclude = sv_exclude_bed and utils.file_exists(sv_exclude_bed)
            params["exclude"] = "-x %s" % sv_exclude_bed if use_exclude else ""
            # Reference lookup kept from the original flow; the value is not
            # interpolated into the lumpyexpress command.
            dd.get_ref_file(items[0])
            depths = []
            for sample, ev_files in previous_evidence.items():
                for ev_file in ev_files.values():
                    if utils.file_exists(ev_file):
                        depths.append("%s:%s" % (sample, ev_file))
            params["depth_arg"] = "-d %s" % ",".join(depths) if depths else ""
            # use our bcbio python for runs within lumpyexpress
            params["exports"] = utils.local_path_export()
            cmd = ("{exports}lumpyexpress -v -B {full_bams} -S {sr_bams} -D {disc_bams} "
                   "{exclude} {depth_arg} -T {tmpdir} -o {tx_out_file}")
            do.run(cmd.format(**params), "lumpyexpress", items[0])
    return vcfutils.sort_by_ref(out_file, items[0]), sv_exclude_bed
def _run_gridss(inputs, background, work_dir):
    """Build and run a ``gridss.CallVariants`` command over all samples.

    The call is skipped when either the plain or the ``.gz`` output exists.

    Args:
        inputs: list of sample data dictionaries; the first one supplies the
            batch/sample name, cores and configuration.
        background: list of additional sample data dictionaries, added as
            inputs after ``inputs``.
        work_dir: directory that receives the output VCF.

    Returns:
        Result of ``vcfutils.bgzip_and_index`` for the output VCF.
    """
    lead = inputs[0]
    label = dd.get_batch(lead) or dd.get_sample_name(lead)
    out_file = os.path.join(work_dir, "%s-gridss.sv.vcf" % label)
    if not (utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")):
        with file_transaction(lead, out_file) as tx_out_file:
            tx_dir = os.path.dirname(tx_out_file)
            htsjdk_opts = ["-Dsamjdk.create_index=true",
                           "-Dsamjdk.use_async_io_read_samtools=true",
                           "-Dsamjdk.use_async_io_write_samtools=true",
                           "-Dsamjdk.use_async_io_write_tribble=true"]
            cores = dd.get_cores(lead)
            resources = config_utils.get_resources("gridss", lead["config"])
            # Default JVM memory, adjusted using the core count as magnitude.
            memory_adjust = {"direction": "increase", "magnitude": cores}
            jvm_opts = config_utils.adjust_opts(
                resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"]),
                {"algorithm": {"memory_adjust": memory_adjust}})
            jvm_opts = _finalize_memory(jvm_opts)
            tx_ref_file = _setup_reference_files(lead, tx_dir)
            samples = inputs + background
            blacklist_bed = sshared.prepare_exclude_file(samples, out_file)
            gridss_args = ["THREADS=%s" % cores,
                           "TMP_DIR=%s" % tx_dir,
                           "WORKING_DIR=%s" % tx_dir,
                           "OUTPUT=%s" % tx_out_file,
                           "ASSEMBLY=%s" % tx_out_file.replace(".sv.vcf", ".gridss.assembly.bam"),
                           "REFERENCE_SEQUENCE=%s" % tx_ref_file,
                           "BLACKLIST=%s" % blacklist_bed]
            cmd = ["gridss"] + jvm_opts + htsjdk_opts + ["gridss.CallVariants"] + gridss_args
            for data in samples:
                cmd.append("INPUT=%s" % dd.get_align_bam(data))
                cmd.append("INPUT_LABEL=%s" % dd.get_sample_name(data))
            exports = utils.local_path_export()
            do.run(exports + " ".join(cmd), "GRIDSS SV analysis")
    return vcfutils.bgzip_and_index(out_file, lead["config"])
def remove_exclude_regions(orig_bed, base_file, items, remove_entire_feature=False):
    """Subtract the prepared exclusion regions from a BED file of targets.

    The result is written next to ``base_file`` with a ``-noexclude.bed``
    suffix and is only regenerated when ``utils.file_uptodate`` reports it as
    stale relative to ``orig_bed``.

    Args:
        orig_bed: BED file of regions to filter.
        base_file: path used to name both the output and the exclusion file.
        items: list of sample data dictionaries.
        remove_entire_feature: forwarded as the ``A`` option of the pybedtools
            ``subtract`` call.

    Returns:
        The filtered BED file when it exists, otherwise ``orig_bed``.
    """
    from bcbio.structural import shared as sshared
    base_stem = utils.splitext_plus(base_file)[0]
    out_bed = "%s-noexclude.bed" % base_stem
    if not utils.file_uptodate(out_bed, orig_bed):
        exclude_bed = sshared.prepare_exclude_file(items, base_file)
        with file_transaction(items[0], out_bed) as tx_out_bed:
            targets = pybedtools.BedTool(orig_bed)
            excluded = pybedtools.BedTool(exclude_bed)
            kept = targets.subtract(excluded, A=remove_entire_feature, nonamecheck=True)
            kept.saveas(tx_out_bed)
    return out_bed if utils.file_exists(out_bed) else orig_bed
def _delly_exclude_file(items, base_file, chrom):
    """Write a delly-flavoured copy of the shared exclusion file.

    Lines whose first tab-separated field equals ``chrom`` are copied
    unchanged; for every other line only that first field is written, giving
    a bare chromosome name without coordinates.

    Args:
        items: list of sample data dictionaries.
        base_file: path used to name the shared exclusion file.
        chrom: chromosome whose lines keep their coordinates.

    Returns:
        Path of the delly-specific exclusion file.
    """
    base_exclude = sshared.prepare_exclude_file(items, base_file, chrom)
    stem_and_ext = utils.splitext_plus(base_exclude)
    out_file = "%s-delly%s" % stem_and_ext
    with file_transaction(items[0], out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle, open(base_exclude) as in_handle:
            for line in in_handle:
                contig = line.split("\t")[0]
                # Other chromosomes are reduced to their name only.
                out_handle.write(line if contig == chrom else "%s\n" % contig)
    return out_file
def _run_gridss(inputs, background, work_dir):
    """Run GRIDSS variant calling for a set of samples plus background.

    Nothing is run when the output VCF, or its ``.gz`` form, already exists.

    Args:
        inputs: list of sample data dictionaries; the first one supplies the
            batch/sample name, cores and configuration.
        background: list of additional sample data dictionaries, added as
            inputs after ``inputs``.
        work_dir: directory that receives the output VCF.

    Returns:
        Result of ``vcfutils.bgzip_and_index`` for the output VCF.
    """
    base_name = dd.get_batch(inputs[0]) or dd.get_sample_name(inputs[0])
    out_file = os.path.join(work_dir, "%s-gridss.sv.vcf" % base_name)
    have_output = utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")
    if not have_output:
        with file_transaction(inputs[0], out_file) as tx_out_file:
            htsjdk_opts = ["-Dsamjdk.%s=true" % flag for flag in
                           ("create_index", "use_async_io_read_samtools",
                            "use_async_io_write_samtools", "use_async_io_write_tribble")]
            cores = dd.get_cores(inputs[0])
            resources = config_utils.get_resources("gridss", inputs[0]["config"])
            jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"])
            # Adjust JVM memory using the core count as magnitude.
            adjust = {"algorithm": {"memory_adjust": {"direction": "increase",
                                                       "magnitude": cores}}}
            jvm_opts = _finalize_memory(config_utils.adjust_opts(jvm_opts, adjust))
            work_tx_dir = os.path.dirname(tx_out_file)
            tx_ref_file = _setup_reference_files(inputs[0], work_tx_dir)
            blacklist_bed = sshared.prepare_exclude_file(inputs + background, out_file)
            assembly_bam = tx_out_file.replace(".sv.vcf", ".gridss.assembly.bam")
            cmd = ["gridss"] + jvm_opts + htsjdk_opts + ["gridss.CallVariants"]
            cmd += ["THREADS=%s" % cores,
                    "TMP_DIR=%s" % work_tx_dir,
                    "WORKING_DIR=%s" % work_tx_dir,
                    "OUTPUT=%s" % tx_out_file,
                    "ASSEMBLY=%s" % assembly_bam,
                    "REFERENCE_SEQUENCE=%s" % tx_ref_file,
                    "BLACKLIST=%s" % blacklist_bed]
            for data in inputs + background:
                cmd.extend(["INPUT=%s" % dd.get_align_bam(data),
                            "INPUT_LABEL=%s" % dd.get_sample_name(data)])
            exports = utils.local_path_export()
            cmd = exports + " ".join(cmd)
            do.run(cmd, "GRIDSS SV analysis")
    return vcfutils.bgzip_and_index(out_file, inputs[0]["config"])
def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv using smoove.

    A pre-existing ``-prep.vcf.gz`` output named after the first sample's
    ``align_bam`` is returned directly when present. Otherwise ``smoove call``
    is run inside a file transaction unless the genotyped VCF already exists.
    When the command fails with an error accepted by ``_allowed_errors`` an
    empty VCF is written for all samples instead; any other failure is logged
    and re-raised.

    Args:
        full_bams: passed to ``_prepare_smoove_bams``.
        sr_bams: passed to ``_prepare_smoove_bams``.
        disc_bams: passed to ``_prepare_smoove_bams``.
        work_dir: directory that receives the output VCF.
        items: list of sample data dictionaries; the first one supplies the
            sample name, cores, reference and configuration.

    Returns:
        Tuple of (output VCF path, exclusion BED produced by
        ``sshared.prepare_exclude_file``).

    Raises:
        subprocess.CalledProcessError: when smoove fails with an error that
            ``_allowed_errors`` does not accept.
    """
    data = items[0]
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    name = "%s%s" % (dd.get_sample_name(data), ext)
    out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name)
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    bam_base = os.path.splitext(os.path.basename(data["align_bam"]))[0]
    old_out_file = os.path.join(work_dir, "%s%s-prep.vcf.gz" % (bam_base, ext))
    if utils.file_exists(old_out_file):
        return old_out_file, sv_exclude_bed
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cores = dd.get_num_cores(data)
            # smoove output, TMPDIR and the working directory all live in the
            # transactional directory.
            tx_dir = os.path.dirname(tx_out_file)
            ref_file = dd.get_ref_file(data)
            bam_args = " ".join(_prepare_smoove_bams(full_bams, sr_bams, disc_bams, items, tx_dir))
            std_excludes = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"]
            # Plain prefixes/suffixes of the standard patterns, used to avoid
            # listing contigs that those patterns already cover.
            clean_excludes = tuple(x.replace("~", "").replace("^", "") for x in std_excludes)

            def _is_std_exclude(n):
                return n.startswith(clean_excludes) or n.endswith(clean_excludes)

            extra_chrs = [c.name for c in ref.file_contigs(ref_file)
                          if not chromhacks.is_nonalt(c.name) and not _is_std_exclude(c.name)]
            exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes + extra_chrs)
            # Only probe the file system when an exclusion file was actually
            # prepared; a falsy value simply means "no --exclude option".
            exclude_bed = ""
            if sv_exclude_bed and utils.file_exists(sv_exclude_bed):
                exclude_bed = "--exclude %s" % sv_exclude_bed
            cmd = ("export TMPDIR={tempdir} && "
                   "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} "
                   "--name {name} --outdir {out_dir} "
                   "{exclude_bed} {exclude_chrs} {full_bams}")
            cmd = cmd.format(tempdir=tx_dir, cores=cores, ref_file=ref_file, name=name,
                             out_dir=tx_dir, exclude_bed=exclude_bed,
                             exclude_chrs=exclude_chrs, full_bams=bam_args)
            with utils.chdir(tx_dir):
                try:
                    do.run(cmd, "smoove lumpy calling", data)
                except subprocess.CalledProcessError as msg:
                    if _allowed_errors(str(msg)):
                        vcfutils.write_empty_vcf(tx_out_file, config=data["config"],
                                                 samples=[dd.get_sample_name(d) for d in items])
                    else:
                        # NOTE(review): called without a message, as in the
                        # original; confirm the logger in use accepts that.
                        logger.exception()
                        raise
    vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file, sv_exclude_bed
def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv using smoove.

    A pre-existing ``-prep.vcf.gz`` output named after the first sample's
    ``align_bam`` is returned directly when present. Otherwise ``smoove call``
    is run inside a file transaction unless the genotyped VCF already exists.
    When the command fails with an error accepted by ``_allowed_errors`` an
    empty VCF is written for all samples instead; any other failure is logged
    and re-raised.

    Args:
        full_bams: passed to ``_prepare_smoove_bams``.
        sr_bams: passed to ``_prepare_smoove_bams``.
        disc_bams: passed to ``_prepare_smoove_bams``.
        work_dir: directory that receives the output VCF.
        items: list of sample data dictionaries; the first one supplies the
            sample name, cores, reference and configuration.

    Returns:
        Tuple of (output VCF path, exclusion BED produced by
        ``sshared.prepare_exclude_file``).

    Raises:
        subprocess.CalledProcessError: when smoove fails with an error that
            ``_allowed_errors`` does not accept.
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    name = "%s%s" % (dd.get_sample_name(items[0]), ext)
    out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name)
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    align_stem = os.path.splitext(os.path.basename(items[0]["align_bam"]))[0]
    old_out_file = os.path.join(work_dir, "%s%s-prep.vcf.gz" % (align_stem, ext))
    if utils.file_exists(old_out_file):
        return old_out_file, sv_exclude_bed
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            cores = dd.get_num_cores(items[0])
            # smoove output, TMPDIR and the working directory all live in the
            # transactional directory.
            out_dir = os.path.dirname(tx_out_file)
            ref_file = dd.get_ref_file(items[0])
            smoove_bams = " ".join(_prepare_smoove_bams(full_bams, sr_bams, disc_bams,
                                                        items, out_dir))
            std_excludes = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"]
            # Plain prefixes/suffixes of the standard patterns, used to avoid
            # listing contigs that those patterns already cover.
            clean_excludes = [x.replace("~", "").replace("^", "") for x in std_excludes]

            def _is_std_exclude(n):
                return any(n.startswith(x) or n.endswith(x) for x in clean_excludes)

            extra_chrs = [c.name for c in ref.file_contigs(ref_file)
                          if not chromhacks.is_nonalt(c.name) and not _is_std_exclude(c.name)]
            exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes + extra_chrs)
            # Only probe the file system when an exclusion file was actually
            # prepared; a falsy value simply means "no --exclude option".
            if sv_exclude_bed and utils.file_exists(sv_exclude_bed):
                exclude_bed = "--exclude %s" % sv_exclude_bed
            else:
                exclude_bed = ""
            cmd = ("export TMPDIR={tempdir} && "
                   "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} "
                   "--name {name} --outdir {out_dir} "
                   "{exclude_bed} {exclude_chrs} {full_bams}")
            cmd = cmd.format(tempdir=out_dir, cores=cores, ref_file=ref_file, name=name,
                             out_dir=out_dir, exclude_bed=exclude_bed,
                             exclude_chrs=exclude_chrs, full_bams=smoove_bams)
            with utils.chdir(out_dir):
                try:
                    do.run(cmd, "smoove lumpy calling", items[0])
                except subprocess.CalledProcessError as msg:
                    # Hand the failure text, not the exception object, to the
                    # message check.
                    if _allowed_errors(str(msg)):
                        vcfutils.write_empty_vcf(tx_out_file, config=items[0]["config"],
                                                 samples=[dd.get_sample_name(d) for d in items])
                    else:
                        # NOTE(review): called without a message, as in the
                        # original; confirm the logger in use accepts that.
                        logger.exception()
                        raise
    vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file, sv_exclude_bed
def _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv through the ``speedseq sv`` command.

    The command is only run when the ``.sv.bedpe`` output is not already
    present. speedseq receives the transactional output path with the
    ``.sv.bedpe`` suffix removed as its ``-o`` prefix.

    Args:
        full_bams: iterable of paths, comma-joined for ``-B``.
        sr_bams: iterable of paths, comma-joined for ``-S``.
        disc_bams: iterable of paths, comma-joined for ``-D``.
        work_dir: directory that receives the output file.
        items: list of sample data dictionaries; the first one supplies the
            ``align_bam`` used for naming and the reference file.

    Returns:
        Tuple of (output BEDPE path, exclusion BED produced by
        ``sshared.prepare_exclude_file``).
    """
    batch = sshared.get_cur_batch(items)
    if batch:
        ext = "-%s-svs" % batch
    else:
        ext = "-svs"
    bam_stem = os.path.splitext(os.path.basename(items[0]["align_bam"]))[0]
    out_file = os.path.join(work_dir, "%s%s.sv.bedpe" % (bam_stem, ext))
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file, utils.curdir_tmpdir(items[0]) as tmpdir:
            opts = {"tmpdir": tmpdir,
                    "out_base": tx_out_file.replace(".sv.bedpe", ""),
                    "full_bams": ",".join(full_bams),
                    "sr_bams": ",".join(sr_bams),
                    "disc_bams": ",".join(disc_bams),
                    "exclude": "-x %s" % sv_exclude_bed if sv_exclude_bed else ""}
            opts["ref_file"] = dd.get_ref_file(items[0])
            cmd = ("speedseq sv -v -B {full_bams} -S {sr_bams} -D {disc_bams} -R {ref_file} "
                   "{exclude} -A false -T {tmpdir} -o {out_base}")
            do.run(cmd.format(**opts), "speedseq lumpy", items[0])
    return out_file, sv_exclude_bed
def _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items):
    """Call structural variants with lumpy via ``speedseq sv``.

    Runs inside a file transaction with a transactional temporary directory,
    and only when the ``.sv.bedpe`` output does not exist yet. The ``-o``
    prefix given to speedseq is the transactional output path without its
    ``.sv.bedpe`` suffix.

    Args:
        full_bams: iterable of paths, comma-joined for ``-B``.
        sr_bams: iterable of paths, comma-joined for ``-S``.
        disc_bams: iterable of paths, comma-joined for ``-D``.
        work_dir: directory that receives the output file.
        items: list of sample data dictionaries; the first one supplies the
            ``align_bam`` used for naming and the reference file.

    Returns:
        Tuple of (output BEDPE path, exclusion BED produced by
        ``sshared.prepare_exclude_file``).
    """
    data = items[0]
    batch = sshared.get_cur_batch(items)
    ext = "-svs"
    if batch:
        ext = "-%s-svs" % batch
    align_base = os.path.splitext(os.path.basename(data["align_bam"]))[0]
    out_file = os.path.join(work_dir, "%s%s.sv.bedpe" % (align_base, ext))
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    if utils.file_exists(out_file):
        return out_file, sv_exclude_bed
    with file_transaction(data, out_file) as tx_out_file:
        with tx_tmpdir(data) as tmpdir:
            out_base = tx_out_file.replace(".sv.bedpe", "")
            full_arg = ",".join(full_bams)
            sr_arg = ",".join(sr_bams)
            disc_arg = ",".join(disc_bams)
            exclude = ""
            if sv_exclude_bed:
                exclude = "-x %s" % sv_exclude_bed
            ref_file = dd.get_ref_file(data)
            cmd = ("speedseq sv -v -B {full_bams} -S {sr_bams} -D {disc_bams} -R {ref_file} "
                   "{exclude} -A false -T {tmpdir} -o {out_base}")
            do.run(cmd.format(full_bams=full_arg, sr_bams=sr_arg, disc_bams=disc_arg,
                              ref_file=ref_file, exclude=exclude, tmpdir=tmpdir,
                              out_base=out_base),
                   "speedseq lumpy", data)
    return out_file, sv_exclude_bed
def _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv through lumpyexpress and return the sorted VCF.

    The command is only run when the output VCF is not already present; it is
    executed inside a file transaction with a transactional temporary directory.

    Args:
        full_bams: iterable of paths, comma-joined for ``-B``.
        sr_bams: iterable of paths, comma-joined for ``-S``.
        disc_bams: iterable of paths, comma-joined for ``-D``.
        work_dir: directory that receives the output VCF.
        items: list of sample data dictionaries; the first one supplies the
            ``align_bam`` used for naming and is handed to the run helpers.

    Returns:
        Tuple of (result of ``vcfutils.sort_by_ref`` for the output VCF,
        exclusion BED produced by ``sshared.prepare_exclude_file``).
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    align_stem = os.path.splitext(os.path.basename(items[0]["align_bam"]))[0]
    out_file = os.path.join(work_dir, "%s%s.vcf" % (align_stem, ext))
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with tx_tmpdir(items[0]) as tmpdir:
                opts = {"tmpdir": tmpdir, "tx_out_file": tx_out_file,
                        "full_bams": ",".join(full_bams),
                        "sr_bams": ",".join(sr_bams),
                        "disc_bams": ",".join(disc_bams)}
                # Skip the file check entirely when no exclusion file was
                # prepared, so a falsy value just drops the -x option.
                has_exclude = bool(sv_exclude_bed) and utils.file_exists(sv_exclude_bed)
                opts["exclude"] = "-x %s" % sv_exclude_bed if has_exclude else ""
                # Reference lookup kept from the original flow; the value is
                # not interpolated into the lumpyexpress command.
                dd.get_ref_file(items[0])
                # use our bcbio python for runs within lumpyexpress
                opts["curpython_dir"] = os.path.dirname(sys.executable)
                cmd = ("export PATH={curpython_dir}:$PATH && "
                       "lumpyexpress -v -B {full_bams} -S {sr_bams} -D {disc_bams} "
                       "{exclude} -T {tmpdir} -o {tx_out_file}")
                do.run(cmd.format(**opts), "lumpyexpress", items[0])
    return vcfutils.sort_by_ref(out_file, items[0]), sv_exclude_bed
def _get_full_exclude_file(items, work_bams, work_dir):
    """Prepare the exclusion file named after the first working BAM.

    Args:
        items: list of sample data dictionaries.
        work_bams: sequence of BAM paths; the first one provides the base name.
        work_dir: directory used for the base file path.

    Returns:
        Result of ``sshared.prepare_exclude_file`` for the ``-svs`` base path.
    """
    bam_name = os.path.basename(work_bams[0])
    bam_stem = os.path.splitext(bam_name)[0]
    base_file = os.path.join(work_dir, "%s-svs" % bam_stem)
    return sshared.prepare_exclude_file(items, base_file)
def _get_full_exclude_file(items, work_dir):
    """Prepare the exclusion file named after the first sample's ``work_bam``.

    Args:
        items: list of sample data dictionaries; the first one must provide
            a ``work_bam`` entry.
        work_dir: directory used for the base file path.

    Returns:
        Result of ``sshared.prepare_exclude_file`` for the ``-svs`` base path.
    """
    bam_name = os.path.basename(items[0]["work_bam"])
    bam_stem, _ = os.path.splitext(bam_name)
    base_file = os.path.join(work_dir, "%s-svs" % bam_stem)
    return sshared.prepare_exclude_file(items, base_file)