def _group_by_ctype(bed_file, depth, region_file, out_file):
    """Collapse adjacent callable/uncallable regions into labelled intervals.

    Uses tips from bedtools discussion:
    https://groups.google.com/d/msg/bedtools-discuss/qYDE6XF-GRA/2icQtUeOX_UJ
    https://gist.github.com/arq5x/b67196a46db5b63bee06
    """
    import pybedtools

    def label_coverage(feat):
        # replace the numeric depth in the name column with its callable class
        feat.name = _get_ctype(float(feat.name), depth)
        return feat

    full_out_file = "%s-full%s" % utils.splitext_plus(out_file)
    group_args = {"g": [1, 4], "c": [1, 2, 3, 4],
                  "ops": ["first", "first", "max", "first"]}
    # back compatible precision https://github.com/chapmanb/bcbio-nextgen/issues/664
    if LooseVersion(programs.get_version_manifest("bedtools")) >= LooseVersion("2.22.0"):
        group_args["prec"] = 21
    grouped = pybedtools.BedTool(bed_file).each(label_coverage).saveas().groupby(**group_args)
    with open(full_out_file, "w") as full_handle:
        for grouped_line in open(grouped.fn):
            # drop the two grouping key columns, keeping the merged interval
            full_handle.write("\t".join(grouped_line.split("\t")[2:]))
    pybedtools.BedTool(full_out_file).intersect(region_file).saveas(out_file)
def snpeff_version(args=None, data=None):
    """Return the snpEff version from the manifest, keeping only digits and dots."""
    manifest_version = programs.get_version_manifest("snpeff", data=data) or ""
    allowed = set(string.digits + ".")
    return "".join(ch for ch in str(manifest_version) if ch in allowed)
def samblaster_dedup_sort(data, tx_out_file, tx_sr_file, tx_disc_file):
    """Build a samblaster pipeline producing sorted full, split-read and discordant BAMs.

    Returns the shell command string; local variable names feed the command
    templates via ``**locals()`` so they must match the ``{placeholders}``.
    """
    config = data["config"]
    samblaster = config_utils.get_program("samblaster", config)
    samtools = config_utils.get_program("samtools", config)
    sambamba = config_utils.get_program("sambamba", config)
    cores, mem = _get_cores_memory(data, downscale=3)
    tmp_prefix = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    for suffix in ("spl", "disc", "full"):
        utils.safe_makedir("%s-%s" % (tmp_prefix, suffix))
    # name sort (-N) when alignment happens on splits that get merged later
    sort_opt = "-N" if data.get("align_split") else ""
    full_tobam_cmd = ("{samtools} view -b -S -u - | "
                      "{sambamba} sort {sort_opt} -t {cores} -m {mem} "
                      "--tmpdir {tmp_prefix}-{dext} -o {out_file} /dev/stdin")
    tobam_cmd = ("{samtools} sort -@ {cores} -m {mem} "
                 "-T {tmp_prefix}-{dext} -o {out_file} /dev/stdin")
    # samblaster 0.1.22 and better require the -M flag for compatibility with bwa-mem
    # https://github.com/GregoryFaust/samblaster/releases/tag/v.0.1.22
    sb_version = programs.get_version_manifest("samblaster", data=data, required=True)
    opts = "-M" if LooseVersion(sb_version) >= LooseVersion("0.1.22") else ""
    splitter_cmd = tobam_cmd.format(out_file=tx_sr_file, dext="spl", **locals())
    discordant_cmd = tobam_cmd.format(out_file=tx_disc_file, dext="disc", **locals())
    dedup_cmd = full_tobam_cmd.format(out_file=tx_out_file, dext="full", **locals())
    cmd = ("{samblaster} --addMateTags {opts} --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) "
           "| {dedup_cmd}")
    return cmd.format(**locals())
def _get_stats_from_miraligner(fn, out_file, name):
    """Summarize miraligner isomiR output into a small tab-separated stats file.

    fn: miraligner output with mism/add/t5/t3 isomiR description columns.
    out_file: destination stats file (only written if it does not exist).
    name: sample name recorded in the header line.
    Returns the path to ``out_file``.
    """
    df = pd.read_csv(fn, sep="\t",
                     dtype={"mism": "string", "add": "string",
                            "t5": "string", "t3": "string"},
                     na_values=["."])
    # unique miRNAs, and per-category counts of modified isomiRs ("0" == unmodified)
    dfmirs = df[['mir', 'freq']].groupby(['mir']).count()
    df5 = df.loc[df.t5 != "0", ['mir', 't5']].groupby(['mir']).count()
    df3 = df.loc[df.t3 != "0", ['mir', 't3']].groupby(['mir']).count()
    dfadd = df.loc[df["add"] != "0", ['mir', 'add']].groupby(['mir']).count()
    dfmut = df.loc[df.mism != "0", ['mir', 'mism']].groupby(['mir']).count()
    if not utils.file_exists(out_file):
        version = get_version_manifest("seqbuster")
        with file_transaction(out_file) as tx_out:
            with open(tx_out, "w") as out_handle:
                # Python 3 print(..., file=...); the legacy ``print >>handle``
                # statement is a SyntaxError under Python 3.
                print("# stats {name}, version: {version}".format(**locals()),
                      file=out_handle)
                print("mirs\t{mirs}\nisomirs\t{isomirs}".format(
                    mirs=len(dfmirs.index), isomirs=len(df.index)), file=out_handle)
                print("mirs_mutations\t{muts}\nmirs_additions\t{add}".format(
                    muts=len(dfmut.index), add=len(dfadd.index)), file=out_handle)
                print("mirs_5-trimming\t{t5}\nmirs_3-trimming\t{t3}".format(
                    t5=len(df5.index), t3=len(df3.index)), file=out_handle)
                print("iso_mutations\t{muts}\niso_additions\t{add}".format(
                    muts=sum(dfmut.mism), add=sum(dfadd["add"])), file=out_handle)
                print("iso_5-trimming\t{t5}\niso_3-trimming\t{t3}".format(
                    t5=sum(df5.t5), t3=sum(df3.t3)), file=out_handle)
    return out_file
def samblaster_dedup_sort(data, tx_out_file, tx_sr_file, tx_disc_file):
    """Deduplicate and sort with samblaster, produces split read and discordant pair files.

    Returns the shell command string; local names feed the command templates
    via ``**locals()``.
    """
    samblaster = config_utils.get_program("samblaster", data["config"])
    samtools = config_utils.get_program("samtools", data["config"])
    # use configured program paths instead of assuming samtools/sambamba are on
    # PATH, matching tobam_cmd below
    sambamba = config_utils.get_program("sambamba", data["config"])
    cores, mem = _get_cores_memory(data, downscale=3)
    tmp_prefix = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    for ext in ["spl", "disc", "full"]:
        utils.safe_makedir("%s-%s" % (tmp_prefix, ext))
    if data.get("align_split"):
        # split alignments get merged later; skip the coordinate sort here
        full_tobam_cmd = _nosort_tobam_cmd()
    else:
        full_tobam_cmd = ("{samtools} view -b -u - | "
                          "{sambamba} sort -t {cores} -m {mem} "
                          "--tmpdir {tmp_prefix}-{dext} -o {out_file} /dev/stdin")
    tobam_cmd = ("{samtools} sort -@ {cores} -m {mem} "
                 "-T {tmp_prefix}-{dext} -o {out_file} /dev/stdin")
    # samblaster 0.1.22 and better require the -M flag for compatibility with bwa-mem
    # https://github.com/GregoryFaust/samblaster/releases/tag/v.0.1.22
    if LooseVersion(programs.get_version_manifest("samblaster", data=data,
                                                  required=True)) >= LooseVersion("0.1.22"):
        opts = "-M"
    else:
        opts = ""
    splitter_cmd = tobam_cmd.format(out_file=tx_sr_file, dext="spl", **locals())
    discordant_cmd = tobam_cmd.format(out_file=tx_disc_file, dext="disc", **locals())
    dedup_cmd = full_tobam_cmd.format(out_file=tx_out_file, dext="full", **locals())
    cmd = ("{samblaster} {opts} --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) "
           "| {dedup_cmd}")
    return cmd.format(**locals())
def _vardict_options_from_config(items, config, out_file, target=None, is_rnaseq=False):
    """Assemble command line options for VarDict and var2vcf from configuration.

    Returns a pair of space-joined option strings: (vardict_opts, var2vcf_opts).
    """
    opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    var2vcf_opts = []
    cores = dd.get_num_cores(items[0])
    if cores and cores > 1:
        opts.extend(["-th", str(cores)])
    vardict_cl = get_vardict_command(items[0])
    version = programs.get_version_manifest(vardict_cl)
    if vardict_cl and version:
        parsed = LooseVersion(version)
        # Disable SV calling for vardict, causes issues with regional analysis
        # by detecting SVs outside of target regions, which messes up merging
        # SV calling will be worked on as a separate step
        if ((vardict_cl == "vardict-java" and parsed >= LooseVersion("1.5.5"))
                or (vardict_cl == "vardict" and parsed >= LooseVersion("2018.07.25"))):
            opts.append("--nosv")
        if vardict_cl == "vardict-java" and parsed >= LooseVersion("1.5.6"):
            opts.append("--deldupvar")
    if not is_rnaseq:
        # remove low mapping quality reads
        opts.extend(["-Q", "10"])
    # Remove QCfail reads, avoiding high depth repetitive regions
    opts.extend(["-F", "0x700"])
    # user-configured extra options for each tool
    for program, accumulator in (("vardict", opts), ("var2vcf", var2vcf_opts)):
        extra = config_utils.get_resources(program, config).get("options")
        if extra:
            accumulator.extend(str(x) for x in extra)
    if target and _is_bed_file(target):
        target = _enforce_max_region_size(target, items[0])
        opts.append(target)  # this must be the last option
    return " ".join(opts), " ".join(var2vcf_opts)
def snpeff_version(args=None, data=None):
    """Return the manifest snpEff version, filtered to digits and dots."""
    version = programs.get_version_manifest("snpeff", data=data)
    if not version:
        version = ""
    keep = set(string.digits + ".")
    filtered = [c for c in str(version) if c in keep]
    return "".join(filtered)
def snpeff_version(args=None, data=None):
    """Return the manifest snpEff version, filtered to digits and dots.

    Raises ValueError when no version information is available. Uses an
    explicit raise rather than ``assert``, which is stripped under ``python -O``.
    """
    raw_version = programs.get_version_manifest("snpeff", data=data)
    if not raw_version:
        raw_version = ""
    snpeff_version = "".join([x for x in str(raw_version)
                              if x in set(string.digits + ".")])
    if not snpeff_version:
        raise ValueError("Did not find snpEff version information")
    return snpeff_version
def snpeff_version(args=None, data=None):
    """Return the snpEff major.minor version used to select databases."""
    raw = str(programs.get_version_manifest("snpeff", data=data) or "")
    digits_and_dots = "".join(ch for ch in raw if ch in string.digits + ".")
    # Only return major version (4.3 not 4.3.1) which maps to databases
    return ".".join(digits_and_dots.split(".")[:2])
def snpeff_version(args=None, data=None):
    """Look up the snpEff version, truncated to the database-mapping major version."""
    manifest = programs.get_version_manifest("snpeff", data=data)
    if not manifest:
        manifest = ""
    allowed = set(string.digits + ".")
    cleaned = "".join(c for c in str(manifest) if c in allowed)
    # databases map to major versions (4.3, not 4.3.1)
    parts = cleaned.split(".")
    return ".".join(parts[:2])
def _out_of_date(rw_file):
    """Check if a run workflow file points to an older version of manta and needs a refresh.

    Scans sys.path.append lines in the workflow script for the embedded
    Cellar/manta install version and compares it with the manifest version.
    """
    current = programs.get_version_manifest("manta")
    with open(rw_file) as in_handle:
        for line in in_handle:
            if not line.startswith("sys.path.append"):
                continue
            # path looks like .../Cellar/manta/<version>/lib/python...
            embedded = line.split("/lib/python")[0].split("Cellar/manta/")[-1]
            if embedded != current:
                return True
    return False
def _get_snpeff_version(args):
    """Return the installed snpEff version with dots replaced by underscores.

    Falls back to querying the snpEff jar directly when the manifest has no
    version. Raises ValueError when no version can be determined — an explicit
    raise instead of ``assert``, which is stripped under ``python -O``.
    """
    tooldir = args.tooldir or get_defaults()["tooldir"]
    raw_version = programs.get_version_manifest("snpeff")
    if not raw_version:
        config = {"resources": {"snpeff": {"jvm_opts": ["-Xms500m", "-Xmx1g"],
                                           "dir": os.path.join(tooldir, "share", "java", "snpeff")}}}
        raw_version = programs.java_versioner("snpeff", "snpEff",
                                              stdout_flag="snpEff version SnpEff")(config)
    # str() guards against non-string versioner output, consistent with siblings
    snpeff_version = "".join([x for x in str(raw_version)
                              if x in set(string.digits + ".")]).replace(".", "_")
    if not snpeff_version:
        raise ValueError("Did not find snpEff version information")
    return snpeff_version
def snpeff_version(args=None):
    """Return the installed snpEff version, filtered to digits and dots.

    Falls back to querying the snpEff jar directly when the manifest has no
    version. Raises ValueError when no version can be determined — an explicit
    raise instead of ``assert``, which is stripped under ``python -O``.
    """
    from bcbio.install import get_defaults
    tooldir = (args and args.tooldir) or get_defaults()["tooldir"]
    raw_version = programs.get_version_manifest("snpeff")
    if not raw_version:
        config = {"resources": {"snpeff": {"jvm_opts": ["-Xms500m", "-Xmx1g"],
                                           "dir": os.path.join(tooldir, "share", "java", "snpeff")}}}
        raw_version = programs.java_versioner("snpeff", "snpEff",
                                              stdout_flag="snpEff version SnpEff")(config)
    snpeff_version = "".join([x for x in str(raw_version)
                              if x in set(string.digits + ".")])
    if not snpeff_version:
        raise ValueError("Did not find snpEff version information")
    return snpeff_version
def _group_by_ctype(bed_file, depth, region_file, out_file):
    """Group adjacent callable/uncallable regions into defined intervals.

    Uses tips from bedtools discussion:
    https://groups.google.com/d/msg/bedtools-discuss/qYDE6XF-GRA/2icQtUeOX_UJ
    https://gist.github.com/arq5x/b67196a46db5b63bee06
    """
    def assign_coverage(feat):
        feat.name = _get_ctype(float(feat.name), depth)
        return feat
    full_out_file = "%s-full%s" % utils.splitext_plus(out_file)
    with open(full_out_file, "w") as out_handle:
        kwargs = {"g": [1, 4], "c": [1, 2, 3, 4],
                  "ops": ["first", "first", "max", "first"]}
        # back compatible precision https://github.com/chapmanb/bcbio-nextgen/issues/664
        # FIX: the second positional argument of get_version_manifest is `data`
        # (see the samblaster call which uses data=/required= keywords), so a
        # bare positional True was being passed as data; use required=True.
        if LooseVersion(programs.get_version_manifest("bedtools",
                                                      required=True)) >= LooseVersion("2.22.0"):
            kwargs["prec"] = 21
        for line in open(pybedtools.BedTool(bed_file).each(assign_coverage).saveas()
                         .groupby(**kwargs).fn):
            out_handle.write("\t".join(line.split("\t")[2:]))
    pybedtools.BedTool(full_out_file).intersect(region_file, nonamecheck=True).saveas(out_file)
def _get_stats_from_miraligner(fn, out_file, name):
    """Summarize miraligner isomiR output into a small text stats file.

    Only writes when ``out_file`` does not already exist; returns its path.
    """
    df = pd.read_csv(fn, sep="\t",
                     dtype={"mism": "str", "add": "str", "t5": "str", "t3": "str"},
                     na_values=["."])
    # unique miRNAs, plus per-category counts of modified isomiRs ("0" == unmodified)
    dfmirs = df[['mir', 'freq']].groupby(['mir']).count()
    df5 = df.loc[df.t5 != "0", ['mir', 't5']].groupby(['mir']).count()
    df3 = df.loc[df.t3 != "0", ['mir', 't3']].groupby(['mir']).count()
    dfadd = df.loc[df["add"] != "0", ['mir', 'add']].groupby(['mir']).count()
    dfmut = df.loc[df.mism != "0", ['mir', 'mism']].groupby(['mir']).count()
    if not utils.file_exists(out_file):
        version = get_version_manifest("seqbuster")
        stats_lines = [
            "# stats {name}, version: {version}".format(name=name, version=version),
            "mirs\t{mirs}\nisomirs\t{isomirs}".format(
                mirs=len(dfmirs.index), isomirs=len(df.index)),
            "mirs_mutations\t{muts}\nmirs_additions\t{add}".format(
                muts=len(dfmut.index), add=len(dfadd.index)),
            "mirs_5-trimming\t{t5}\nmirs_3-trimming\t{t3}".format(
                t5=len(df5.index), t3=len(df3.index)),
            "iso_mutations\t{muts}\niso_additions\t{add}".format(
                muts=sum(dfmut.mism), add=sum(dfadd["add"])),
            "iso_5-trimming\t{t5}\niso_3-trimming\t{t3}".format(
                t5=sum(df5.t5), t3=sum(df3.t3)),
        ]
        with file_transaction(out_file) as tx_out:
            with open(tx_out, "w") as out_handle:
                out_handle.write("\n".join(stats_lines) + "\n")
    return out_file