def main():
    """Command-line entry point: annotate the `-a` file with the `-b` file.

    Options are parsed with optparse (the module docstring is the usage
    text).  If either `-a` or `-b` is missing the help is printed and the
    process exits with a non-zero status.

    Unless `--transcripts` is given, `b` is first passed through
    remove_transcripts().  When neither `--upstream` nor `--downstream` is
    given (or both are 0) superanno() writes straight to stdout.  Otherwise
    superanno() writes to a temporary file which is then fed through
    xstream() once per requested direction, each pass producing a new
    temporary file; the last one is copied to stdout, preceded by the
    extended header line when `--header` was given.
    """
    p = optparse.OptionParser(__doc__)
    p.add_option("-a", dest="a", help="file to annotate. first 3 columns are "
                 "chrom start stop")
    p.add_option("-b", dest="b", help="superbed to annotate with")
    p.add_option("--header", dest="header", help="a file has a header",
                 action="store_true", default=False)
    p.add_option("-N", "--no-near", dest="no_near",
                 help="dont find the nearest gene, just the up/downstream",
                 action="store_true", default=False)
    p.add_option("--upstream", dest="upstream", type=int, default=None,
                 help="distance upstream of [a] to look for [b]")
    p.add_option("--downstream", dest="downstream", type=int, default=None,
                 help="distance downstream of [a] to look for [b]")
    p.add_option("--transcripts", dest="transcripts", action="store_true",
                 default=False, help="use transcript names in output as well as"
                 " gene name. default is just gene name")

    opts, args = p.parse_args()
    if opts.a is None or opts.b is None:
        # print_help() returns None, so this exits with a true (non-zero)
        # status after showing the usage.
        sys.exit(not p.print_help())

    b = opts.b
    if not opts.transcripts:
        b = remove_transcripts(b)

    if not (opts.upstream or opts.downstream):
        superanno(opts.a, b, opts.header, opts.no_near, sys.stdout)
        return

    with open(BedTool._tmp(), "w") as out:
        superanno(opts.a, b, opts.header, opts.no_near, out)

    out_fh = open(out.name)
    try:
        # Reading the header here also leaves out_fh positioned on the first
        # data line for the first xstream() pass.
        new_header = [out_fh.readline().rstrip("\r\n")] if opts.header else []
        for xdir in ("upstream", "downstream"):
            dist = getattr(opts, xdir)
            if dist is None:
                continue
            with open(BedTool._tmp(), "w") as new_out:
                xstream(out_fh, b, dist, xdir, new_out)
            new_header.append("%s_%i" % (xdir, dist))
            # The output of this pass is the input of the next one.
            out_fh.close()
            out_fh = open(new_out.name)
    finally:
        out_fh.close()

    if opts.header:
        sys.stdout.write("\t".join(new_header) + "\n")
    with open(out_fh.name) as final:
        for line in final:
            sys.stdout.write(line)
def nearest(a, b):
    """Write the closest `b` feature(s) for every `a` interval that does not
    overlap `b`, and return the name of the temporary file written.

    Column 3 of each row is expected to hold the original `a` line joined
    with "Z_Z"; it is split again to rebuild the output line, to which the
    ";"-joined values of column 9 and the ";"-joined distances (from
    get_dist) are appended.
    """
    # only intervals of `a` without any overlap in `b` are looked up.
    non_overlapping = a.intersect(b, v=True)
    if len(non_overlapping) == 0:
        closest_rows = []
    else:
        closest_rows = non_overlapping.closest(b, t="all", stream=True)

    grouped = collections.defaultdict(list)
    for feature in closest_rows:
        joined = feature[3]
        feature[3] = "."
        grouped[joined].append(feature)

    tmp = open(BedTool._tmp(), "w")
    written = set()
    for joined, features in grouped.iteritems():
        gene_names = [f[9] for f in features]
        distances = [get_dist(f) for f in features]
        unique_distances = set(distances)
        if len(unique_distances) == 1:
            # every hit is equally far away: report the distance once and
            # drop duplicated names.
            distances = unique_distances
            gene_names = set(gene_names)
        fields = joined.split("Z_Z")
        fields.append(";".join(gene_names))
        fields.append(";".join(map(str, distances)))
        out_line = "\t".join(fields)
        if out_line not in written:
            written.add(out_line)
            tmp.write(out_line + "\n")
    tmp.close()
    return tmp.name
def nearest(a, b, bstrand=None):
    """Write the closest `b` feature(s) for every `a` interval that does not
    overlap `b`, and return the name of the temporary file written.

    `bstrand` is handed unchanged to get_dist() for every row.  Column 3 of
    each row is expected to hold the original `a` line joined with "Z_Z"; it
    is split again to rebuild the output line, to which the ";"-joined values
    of column 9 and the ";"-joined distances are appended.
    """
    # only intervals of `a` without any overlap in `b` are looked up.
    non_overlapping = a.intersect(b, v=True)
    if len(non_overlapping) == 0:
        closest_rows = []
    else:
        closest_rows = non_overlapping.closest(b, t="all", stream=True)

    grouped = collections.defaultdict(list)
    for feature in closest_rows:
        joined = feature[3]
        feature[3] = "."
        grouped[joined].append(feature)

    tmp = open(BedTool._tmp(), "w")
    written = set()
    for joined, features in grouped.iteritems():
        gene_names = [f[9] for f in features]
        distances = [get_dist(f, bstrand) for f in features]
        unique_distances = set(distances)
        if len(unique_distances) == 1:
            # every hit is equally far away: report the distance once and
            # drop duplicated names.
            distances = unique_distances
            gene_names = set(gene_names)
        fields = joined.split("Z_Z")
        fields.append(";".join(gene_names))
        fields.append(";".join(map(str, distances)))
        out_line = "\t".join(fields)
        if out_line not in written:
            written.add(out_line)
            tmp.write(out_line + "\n")
    tmp.close()
    return tmp.name
def add_xstream(a, b, dist, updown, report_distance=False):
    """Append to every line of `a` the names of `b` features found within
    `dist` on one side, and return the result as a new BedTool.

    `updown` must be "up" or "down" (anything else raises KeyError).  Lines
    of `a` with no hit get "." in the extra column.  `report_distance` is
    accepted but not used here.
    """
    side = {"up": "l", "down": "r"}[updown]
    # window() is given both flanks; the one not requested stays at 0.
    window_kwargs = {"sw": True, "l": 0, "r": 0}
    window_kwargs[side] = dist
    windowed = a.window(b, **window_kwargs)

    n_afields = a.field_count()
    get_name = gen_get_name(b, n_afields)

    # original `a` line -> set of names seen for it.
    names_by_line = collections.defaultdict(set)
    for hit in windowed:
        names_by_line["\t".join(hit[:n_afields])].add(get_name(hit))

    tmp = open(BedTool._tmp(), "w")
    for line in names_by_line:
        joined_names = ",".join(sorted(names_by_line[line]))
        tmp.write("\t".join([line, joined_names]) + "\n")
    # lines of `a` that never showed up in the window output.
    for feature in a:
        if "\t".join(feature[:n_afields]) not in names_by_line:
            tmp.write(str(feature) + "\t.\n")
    tmp.close()

    result = BedTool(tmp.name)
    assert len(result) == len(a)
    return result
def simplify_bed(fbed, has_header):
    """
    Rewrite `fbed` as a header-less, 6-column temporary bed file.

    Each output line is: the first 3 columns, the whole original line joined
    with "Z_Z", ".", and the 6th column (strand) when present, otherwise ".".
    Returns a (BedTool, header) tuple; header is the first row when
    `has_header` is true, else None.
    """
    rows = reader(fbed, header=False)
    header = None
    if has_header:
        header = rows.next()

    tmp = open(BedTool._tmp(), "w")
    for toks in rows:
        strand = toks[5] if len(toks) > 5 else "."
        simplified = toks[:3] + ["Z_Z".join(toks), ".", strand]
        tmp.write("\t".join(simplified) + "\n")
    tmp.close()
    return BedTool(tmp.name), header
def simplify_bed(fbed, has_header):
    """
    Produce a 6-column bed without a header from `fbed`, keeping the strand.

    Column 4 of the output carries the complete original line joined with
    "Z_Z" so that it can be recovered later; column 5 is "." and column 6 is
    the original 6th column, or "." when the line has fewer than 6 columns.
    Returns (BedTool of the temporary file, header row or None).
    """
    line_iter = reader(fbed, header=False)
    if has_header:
        header = line_iter.next()
    else:
        header = None

    out = open(BedTool._tmp(), "w")
    for toks in line_iter:
        if len(toks) > 5:
            strand = toks[5]
        else:
            strand = "."
        packed = "Z_Z".join(toks)
        out.write("\t".join(toks[:3] + [packed, ".", strand]) + "\n")
    out.close()
    return BedTool(out.name), header
def overlapping(a, b):
    """Write, for every `a` line overlapping `b`, the original line plus the
    ";"-joined sorted unique names and ";"-joined sorted unique types of the
    overlapped features.  Returns the name of the temporary file.

    Column 3 of `a` is expected to be the "Z_Z"-joined original line.
    """
    hits = a.intersect(b, wo=True, stream=True)
    # keep the first 6 columns plus columns 9 and 10 of the intersection;
    # after the cut those two sit at indexes 6 (name) and 7 (type).
    wanted = range(6) + [9, 10]

    pairs_by_line = collections.defaultdict(list)
    for hit in hits.cut(wanted, stream=True):
        pairs_by_line[hit[3]].append((hit[6], hit[7]))

    tmp = open(BedTool._tmp(), "w")
    for joined, pairs in pairs_by_line.iteritems():
        names = sorted(set(name for name, _ in pairs))
        kinds = sorted(set(kind for _, kind in pairs))
        # splitting on the separator gives back the original columns.
        fields = joined.split("Z_Z")
        fields.extend([";".join(names), ";".join(kinds)])
        tmp.write("\t".join(fields) + "\n")
    tmp.close()
    return tmp.name
def remove_transcripts(b):
    """Return the path of a copy of `b` whose 4th column has been reduced to
    the second comma-separated item.

    The copy is cached next to `b` as ``b + ".notranscripts"`` and reused as
    long as it is newer than `b`.  If that path cannot be opened for writing
    a temporary file is used instead.  A partially written output file is
    removed before any error is re-raised, so a truncated cache can never be
    picked up by a later run.
    """
    cache = b + ".notranscripts"
    if op.exists(cache) and os.stat(cache).st_mtime > os.stat(b).st_mtime:
        return cache

    try:
        bnew = open(cache, "w")
    except (IOError, OSError):
        # e.g. the directory holding `b` is not writable.
        bnew = open(BedTool._tmp(), "w")

    try:
        with bnew:
            for row in reader(b, header=False):
                if "," in row[3]:
                    row[3] = row[3].split(",")[1]
                bnew.write("\t".join(row) + "\n")
    except BaseException:
        # Deliberately broad (covers Ctrl-C too): the handle is already
        # closed by the `with`, drop the incomplete file and re-raise.
        os.unlink(bnew.name)
        raise
    return bnew.name
def add_xstream(a, b, dist, updown, report_distance=False):
    """Return a BedTool holding every line of `a` with one extra column:
    the sorted, comma-joined names of `b` features inside a one-sided window
    of size `dist`, or "." when there are none.

    `updown` selects the side ("up" or "down"; other values raise KeyError).
    `report_distance` is accepted but not used here.
    """
    flank = {"up": "l", "down": "r"}[updown]
    # the flank that was not asked for has to be set to 0 explicitly.
    other = "r" if flank == "l" else "l"
    window_opts = {"sw": True, flank: dist, other: 0}
    in_window = a.window(b, **window_opts)

    ncols = a.field_count()
    get_name = gen_get_name(b, ncols)

    # condense to unique names per original line of `a`.
    found = collections.defaultdict(set)
    for hit in in_window:
        a_line = "\t".join(hit[:ncols])
        found[a_line].add(get_name(hit))

    out = open(BedTool._tmp(), "w")
    for a_line in found:
        out.write(a_line + "\t" + ",".join(sorted(found[a_line])) + "\n")
    # then the entries of `a` that had nothing in the window.
    for feature in a:
        a_line = "\t".join(feature[:ncols])
        if a_line not in found:
            out.write(str(feature) + "\t.\n")
    out.close()

    annotated = BedTool(out.name)
    assert len(annotated) == len(a)
    return annotated
def add_closest(aname, bname):
    """Return a BedTool with every line of `aname` followed by the sorted,
    comma-joined names of its closest `bname` feature(s) and the distance.

    The distance is the last column of the `closest(d=True)` output; all hits
    of one line are asserted to share the same distance.
    """
    a, b = BedTool(aname), BedTool(bname)
    n_afields = a.field_count()
    closest = a.closest(b, d=True)
    get_name = gen_get_name(b, n_afields)

    tmp = open(BedTool._tmp(), "w")
    # original `a` line -> list of [distance, name] for each closest hit.
    hits_by_line = collections.defaultdict(list)
    for feat in closest:
        line = "\t".join(feat[:n_afields])
        hits_by_line[line].append([feat[-1], get_name(feat)])

    for line, hits in hits_by_line.items():
        if hits:
            assert len(set([d for d, _ in hits])) == 1
        names = ",".join(sorted(set(name for _, name in hits)))
        tmp.write("\t".join([line, names, hits[0][0]]) + "\n")
    tmp.close()

    result = BedTool(tmp.name)
    assert len(result) == len(a)
    return result
def add_closest(aname, bname):
    """Annotate each line of `aname` with its closest feature(s) in `bname`.

    Two columns are appended to every line: the sorted, comma-joined unique
    names of the closest features and their distance (last column of the
    `closest(d=True)` output, asserted to be identical for all hits of a
    line).  The result is returned as a BedTool over a temporary file.
    """
    a, b = BedTool(aname), BedTool(bname)
    ncols = a.field_count()
    nearest_hits = a.closest(b, d=True)
    get_name = gen_get_name(b, ncols)

    out = open(BedTool._tmp(), "w")
    # keep the distance and name of every hit, grouped by original line.
    grouped = collections.defaultdict(list)
    for feat in nearest_hits:
        a_line = "\t".join(feat[:ncols])
        grouped[a_line].append([feat[-1], get_name(feat)])

    for a_line, dist_names in grouped.iteritems():
        if dist_names:
            distances = set(pair[0] for pair in dist_names)
            assert len(distances) == 1
        unique_names = sorted(set(pair[1] for pair in dist_names))
        columns = [a_line, ",".join(unique_names), dist_names[0][0]]
        out.write("\t".join(columns) + "\n")
    out.close()

    annotated = BedTool(out.name)
    assert len(annotated) == len(a)
    return annotated
def main():
    """Command-line entry point: annotate the `-a` file with the `-b` file.

    Options are parsed with optparse (the module docstring is the usage
    text).  If either `-a` or `-b` is missing the help is printed and the
    process exits with a non-zero status.  `--bstrand`, when given, is
    decremented by one before being handed to superanno().

    Unless `--transcripts` is given, `b` is first passed through
    remove_transcripts().  When neither `--upstream` nor `--downstream` is
    given (or both are 0) superanno() writes straight to stdout.  Otherwise
    superanno() writes to a temporary file which is then fed through
    xstream() once per requested direction, each pass producing a new
    temporary file; the last one is copied to stdout, preceded by the
    extended header line when `--header` was given.
    """
    p = optparse.OptionParser(__doc__)
    p.add_option("-a", dest="a", help="file to annotate. first 3 columns are "
                 "chrom start stop")
    p.add_option("-b", dest="b", help="superbed to annotate with")
    p.add_option("--header", dest="header", help="a file has a header",
                 action="store_true", default=False)
    p.add_option("-N", "--no-near", dest="no_near",
                 help="dont find the nearest gene, just the up/downstream",
                 action="store_true", default=False)
    p.add_option("--upstream", dest="upstream", type=int, default=None,
                 help="distance upstream of [a] to look for [b]")
    p.add_option("--downstream", dest="downstream", type=int, default=None,
                 help="distance downstream of [a] to look for [b]")
    p.add_option("--transcripts", dest="transcripts", action="store_true",
                 default=False, help="use transcript names in output as well as"
                 " gene name. default is just gene name")
    p.add_option("--bstrand", dest="bstrand", default=None, type=int,
                 help="if this is specified, it's the column number of the"
                 " strand info from the b file and nearest are reported with"
                 " upstream as negative relative to this column. \n"
                 "either '+' or '-'")

    opts, args = p.parse_args()
    if opts.a is None or opts.b is None:
        # print_help() returns None, so this exits with a true (non-zero)
        # status after showing the usage.
        sys.exit(not p.print_help())

    if opts.bstrand is not None:
        # shift the user-supplied column number down by one (presumably
        # 1-based on the command line, 0-based index internally).
        opts.bstrand -= 1

    b = opts.b
    if not opts.transcripts:
        b = remove_transcripts(b)

    if not (opts.upstream or opts.downstream):
        superanno(opts.a, b, opts.header, opts.no_near, sys.stdout,
                  opts.bstrand)
        return

    with open(BedTool._tmp(), "w") as out:
        superanno(opts.a, b, opts.header, opts.no_near, out, opts.bstrand)

    out_fh = open(out.name)
    try:
        # Reading the header here also leaves out_fh positioned on the first
        # data line for the first xstream() pass.
        new_header = [out_fh.readline().rstrip("\r\n")] if opts.header else []
        for xdir in ("upstream", "downstream"):
            dist = getattr(opts, xdir)
            if dist is None:
                continue
            with open(BedTool._tmp(), "w") as new_out:
                xstream(out_fh, b, dist, xdir, new_out)
            new_header.append("%s_%i" % (xdir, dist))
            # The output of this pass is the input of the next one.
            out_fh.close()
            out_fh = open(new_out.name)
    finally:
        out_fh.close()

    if opts.header:
        sys.stdout.write("\t".join(new_header) + "\n")
    with open(out_fh.name) as final:
        for line in final:
            sys.stdout.write(line)
def handle_coreutils_sort_kwargs(self, prog='sort', instream=None, **kwargs):
    """
    Build the command line for a coreutils `sort` call.

    *kwargs* are passed directly from the calling method
    (self.coreutils_sort) and are turned into command-line options, in
    reverse-sorted key order:

    * ``True`` adds the bare flag, ``False`` is skipped;
    * a list or tuple repeats the flag once per item
      (``key=["1,1", "2,2r"]`` -> ``--key 1,1 --key 2,2r``);
    * anything else becomes ``flag str(value)``.

    One-character names are emitted as short options (``-k``), longer names
    as long options (``--key``).

    The special kwargs `stream`, `output` and `additional_args` are consumed
    here and not turned into options (`output` only when `stream` is falsy).

    *instream* can be a BedTool (its ``fn`` is used), a filename, or an
    iterable.  A filename is appended to the command as-is; an iterable is
    converted to a generator of ``str(item)`` to be piped to stdin and ``-``
    is appended instead.

    Returns ``(cmds, tmp, stdin)``: the argument list, the output filename
    (None when `stream` was requested, `output` if given, otherwise a new
    temporary name) and the stdin generator (None for filename input).
    """
    pybedtools.logger.debug(
        'BedTool.handle_coreutils_sort_kwargs() got these kwargs:\n%s',
        pprint.pformat(kwargs))

    # If it's a BedTool, then get the underlying filename or stream.
    if isinstance(instream, BedTool):
        instream = instream.fn

    if isinstance(instream, six.string_types):
        # Filename: no pipe, just provide the file.
        stdin = None
        input_fn = instream
    else:
        # A generator or iterator: pipe it as a generator of lines.
        stdin = (str(i) for i in instream)
        input_fn = '-'

    # If stream is not requested, the output goes to `output` or a tempfile.
    if kwargs.pop('stream', None):
        tmp = None
    else:
        tmp = kwargs.pop('output', None) or BedTool._tmp()

    additional_args = kwargs.pop('additional_args', None)

    def flag(name):
        # `--r` or `--t` would be parsed as abbreviated long options
        # (ambiguous, or a different option than -t), so single letters
        # must use the short form in every branch, not only for lists.
        return ('-' if len(name) == 1 else '--') + name

    cmds = [prog]
    for key, value in sorted(kwargs.items(), reverse=True):
        if isinstance(value, bool):
            if value:
                cmds.append(flag(key))
        elif isinstance(value, (list, tuple)):
            # sort --key 1,1 --key 2,2r -k 5,5
            for val in value:
                cmds.append(flag(key))
                cmds.append(str(val))
        else:
            cmds.append(flag(key))
            cmds.append(str(value))

    if additional_args:
        cmds.append(additional_args)

    cmds.append(input_fn)
    return cmds, tmp, stdin