Exemplo n.º 1
0
def main():
    """Command-line entry point: annotate the ``-a`` file using the ``-b`` file.

    Both ``-a`` and ``-b`` are required; when either is missing the help text
    is printed and the process exits with a non-zero status.

    Without ``--upstream``/``--downstream`` the result of ``superanno`` is
    written straight to stdout.  Otherwise ``superanno`` writes to a temporary
    file which is then passed through ``xstream`` once per requested
    direction, each pass adding a ``<direction>_<distance>`` header column,
    and the final temporary file is copied to stdout.
    """
    p = optparse.OptionParser(__doc__)
    p.add_option("-a", dest="a", help="file to annotate. first 3 columns are "
                                      "chrom start stop")
    p.add_option("-b", dest="b", help="superbed to annotate with")

    p.add_option("--header", dest="header", help="a file has a header",
                 action="store_true", default=False)
    p.add_option("-N", "--no-near", dest="no_near",
                 help="dont find the nearest gene, just the up/downstream",
                 action="store_true", default=False)

    p.add_option("--upstream", dest="upstream", type=int, default=None,
                 help="distance upstream of [a] to look for [b]")
    p.add_option("--downstream", dest="downstream", type=int, default=None,
                 help="distance downstream of [a] to look for [b]")
    p.add_option("--transcripts", dest="transcripts", action="store_true",
                 default=False, help="use transcript names in output as well as"
                 " gene name. default is just gene name")

    opts, args = p.parse_args()
    if opts.a is None or opts.b is None:
        # print_help() returns None, so this exits with a truthy (error) code.
        sys.exit(not p.print_help())

    b = opts.b
    if not opts.transcripts:
        b = remove_transcripts(b)

    # NOTE: a distance of 0 is falsy, so "--upstream 0" on its own takes the
    # plain (no up/downstream) path here.
    if not (opts.upstream or opts.downstream):
        superanno(opts.a, b, opts.header, opts.no_near, sys.stdout)

    else:
        with open(BedTool._tmp(), "w") as out:
            superanno(opts.a, b, opts.header, opts.no_near, out)

        result_path = out.name
        in_fh = open(result_path)
        try:
            # Consume the header line so xstream only sees data rows.
            new_header = ([in_fh.readline().rstrip("\r\n")]
                          if opts.header else [])
            for xdir in ("upstream", "downstream"):
                dist = getattr(opts, xdir)
                if dist is None:
                    continue
                with open(BedTool._tmp(), "w") as new_out:
                    xstream(in_fh, b, dist, xdir, new_out)
                new_header.append("%s_%i" % (xdir, dist))
                # The output of this pass becomes the input of the next one.
                in_fh.close()
                result_path = new_out.name
                in_fh = open(result_path)
        finally:
            in_fh.close()

        if opts.header:
            sys.stdout.write("\t".join(new_header) + "\n")
        with open(result_path) as final:
            for line in final:
                sys.stdout.write(line)
Exemplo n.º 2
0
def nearest(a, b):
    """Write the closest ``b`` features for the entries of ``a`` that overlap nothing.

    Entries of ``a`` with no overlap in ``b`` (``intersect(..., v=True)``) are
    matched to their closest ``b`` features, keeping all ties.  Rows are
    grouped on column 3 of ``a``, which is expected to hold the original line
    joined with ``"Z_Z"`` (it is split on that token to rebuild the line).

    Returns the path of a temporary file with one line per group: the original
    columns, the ``;``-joined names (column 9 of the ``closest`` output --
    presumably the ``b`` name column) and the ``;``-joined distances.
    """
    a_not_overlapping = a.intersect(b, v=True)
    if len(a_not_overlapping) != 0:
        ab = a_not_overlapping.closest(b, t="all", stream=True)
    else:
        ab = []

    by_name = collections.defaultdict(list)
    for row in ab:
        key = row[3]
        row[3] = "."
        by_name[key].append(row)

    seen = set()
    with open(BedTool._tmp(), "w") as fh:
        for name, rows in by_name.items():
            # TODO: just like above.
            full_names = [r[9] for r in rows]

            dists = [get_dist(r) for r in rows]
            if len(set(dists)) == 1:
                # Every hit is equally far away: report the distance once and
                # the distinct names in a stable (sorted) order.
                dists = set(dists)
                full_names = sorted(set(full_names))
            dists = ";".join(map(str, dists))
            names = ";".join(full_names)

            line = "\t".join(name.split("Z_Z") + [names, dists])
            if line in seen:
                continue
            seen.add(line)
            fh.write(line + "\n")
    return fh.name
Exemplo n.º 3
0
def nearest(a, b, bstrand=None):
    """Write the closest ``b`` features for the entries of ``a`` that overlap nothing.

    Entries of ``a`` with no overlap in ``b`` (``intersect(..., v=True)``) are
    matched to their closest ``b`` features, keeping all ties.  Rows are
    grouped on column 3 of ``a``, which is expected to hold the original line
    joined with ``"Z_Z"`` (it is split on that token to rebuild the line).

    ``bstrand`` is passed unchanged to ``get_dist`` for every row.

    Returns the path of a temporary file with one line per group: the original
    columns, the ``;``-joined names (column 9 of the ``closest`` output --
    presumably the ``b`` name column) and the ``;``-joined distances.
    """
    a_not_overlapping = a.intersect(b, v=True)
    if len(a_not_overlapping) != 0:
        ab = a_not_overlapping.closest(b, t="all", stream=True)
    else:
        ab = []

    by_name = collections.defaultdict(list)
    for row in ab:
        key = row[3]
        row[3] = "."
        by_name[key].append(row)

    seen = set()
    with open(BedTool._tmp(), "w") as fh:
        for name, rows in by_name.items():
            # TODO: just like above.
            full_names = [r[9] for r in rows]

            dists = [get_dist(r, bstrand) for r in rows]
            if len(set(dists)) == 1:
                # Every hit is equally far away: report the distance once and
                # the distinct names in a stable (sorted) order.
                dists = set(dists)
                full_names = sorted(set(full_names))
            dists = ";".join(map(str, dists))
            names = ";".join(full_names)

            line = "\t".join(name.split("Z_Z") + [names, dists])
            if line in seen:
                continue
            seen.add(line)
            fh.write(line + "\n")
    return fh.name
Exemplo n.º 4
0
def add_xstream(a, b, dist, updown, report_distance=False):
    """Append to each entry of ``a`` the names of ``b`` features in a window.

    ``updown`` must be ``"up"`` or ``"down"`` (anything else raises
    ``KeyError``); it selects which side of ``a`` is extended by ``dist`` in
    the strand-aware (``sw=True``) ``window`` call, the other side being 0.
    ``report_distance`` is accepted but not used by this function.

    Returns a ``BedTool`` over a temporary file holding every entry of ``a``
    plus one extra column: the sorted, comma-joined unique names found in the
    window, or ``"."`` when there were none.
    """
    side = {"up": "l", "down": "r"}[updown]
    # The opposite side has to be given explicitly as 0.
    other = "r" if side == "l" else "l"
    kwargs = {"sw": True, side: dist, other: 0}

    c = a.window(b, **kwargs)
    afields = a.field_count()

    get_name = gen_get_name(b, afields)

    # Condense the window hits to the set of unique names per `a` entry.
    seen = collections.defaultdict(set)
    for row in c:
        seen["\t".join(row[:afields])].add(get_name(row))

    with open(BedTool._tmp(), "w") as d:
        for key in seen:
            d.write(key + "\t" + ",".join(sorted(seen[key])) + "\n")

        # Write the entries that did not appear in the window'ed Bed.
        for row in a:
            if "\t".join(row[:afields]) in seen:
                continue
            d.write(str(row) + "\t.\n")

    dbed = BedTool(d.name)
    # Sanity check: exactly one output line per input entry.
    assert len(dbed) == len(a)
    return dbed
Exemplo n.º 5
0
def simplify_bed(fbed, has_header):
    """
    create a bed with no header and 6 columns.
    retain strand info.

    Each output row is: the first three input columns, the whole input row
    joined with "Z_Z" (so the original line can be recovered later), a "."
    placeholder, and the sixth input column if present (else ".").

    Returns ``(BedTool, header)`` where ``header`` is the first row yielded by
    ``reader`` when ``has_header`` is true, otherwise ``None``.
    """
    line_gen = reader(fbed, header=False)
    # next() works on both Python 2 and 3 iterators, unlike .next().
    header = next(line_gen) if has_header else None
    with open(BedTool._tmp(), "w") as fh:
        for toks in line_gen:
            strand = toks[5] if len(toks) > 5 else "."
            new_toks = toks[:3] + ["Z_Z".join(toks), ".", strand]
            fh.write("\t".join(new_toks) + "\n")
    return BedTool(fh.name), header
Exemplo n.º 6
0
def simplify_bed(fbed, has_header):
    """
    create a bed with no header and 6 columns.
    retain strand info.

    Output columns are: input columns 1-3, the full input row joined with
    "Z_Z" (kept so the original line can be rebuilt by splitting on it), a
    literal ".", and input column 6 when the row has one ("." otherwise).

    Returns a 2-tuple of the new ``BedTool`` and the header row (the first row
    from ``reader`` if ``has_header`` is true, else ``None``).
    """
    line_gen = reader(fbed, header=False)
    # Builtin next() is portable across Python 2 and 3; .next() is not.
    header = next(line_gen) if has_header else None
    with open(BedTool._tmp(), "w") as fh:
        for toks in line_gen:
            if len(toks) > 5:
                strand = toks[5]
            else:
                strand = "."
            fh.write("\t".join(toks[:3] + ["Z_Z".join(toks), ".", strand]))
            fh.write("\n")
    return BedTool(fh.name), header
Exemplo n.º 7
0
def overlapping(a, b):
    """Summarise the ``b`` features overlapping each entry of ``a``.

    Intersects ``a`` with ``b`` (``wo=True``), keeps columns 0-5, 9 and 10 of
    the result and groups the rows on column 3 of ``a`` (the "Z_Z"-joined
    original line).

    Returns the path of a temporary file with one line per group: the original
    columns followed by the sorted unique names and the sorted unique types,
    each joined with ";".
    """
    # list(range(...)) so the concatenation also works on Python 3.
    columns = list(range(6)) + [9, 10]
    hits = a.intersect(b, wo=True, stream=True).cut(columns, stream=True)

    by_name = collections.defaultdict(list)
    for row in hits:
        key = row[3]  # the ZZ joined string.
        # after the cut, 6 and 7 are name, type.
        by_name[key].append((row[6], row[7]))

    with open(BedTool._tmp(), "w") as fh:
        for name, rows in by_name.items():
            types = sorted(set(r[1] for r in rows))
            full_names = sorted(set(r[0] for r in rows))
            # regain the original line.
            line = name.split("Z_Z") + [";".join(full_names), ";".join(types)]
            fh.write("\t".join(line) + "\n")
    return fh.name
Exemplo n.º 8
0
def overlapping(a, b):
    """Collect, per entry of ``a``, the names and types of overlapping ``b`` rows.

    ``a`` is intersected with ``b`` using ``wo=True``; the result is cut down
    to columns 0-5, 9 and 10 and grouped by column 3, which holds the original
    ``a`` line joined with "Z_Z".

    Returns the name of a temporary file.  Each line is the original ``a``
    columns plus two extra columns: the unique names and the unique types,
    each sorted and joined with ";".
    """
    # range() is not a list on Python 3, so materialise it before adding.
    keep = list(range(6)) + [9, 10]

    by_name = collections.defaultdict(list)
    for row in a.intersect(b, wo=True, stream=True).cut(keep, stream=True):
        # row[3] is the ZZ joined string; after the cut 6, 7 are name, type.
        by_name[row[3]].append((row[6], row[7]))

    with open(BedTool._tmp(), "w") as fh:
        for name, rows in by_name.items():
            full_names = sorted(set(r[0] for r in rows))
            types = sorted(set(r[1] for r in rows))
            # regain the original line.
            fields = name.split("Z_Z")
            fields.append(";".join(full_names))
            fields.append(";".join(types))
            fh.write("\t".join(fields) + "\n")
    return fh.name
Exemplo n.º 9
0
def remove_transcripts(b):
    """Return the path of a copy of ``b`` whose name column has one name only.

    For every row whose column 3 contains a comma, the column is replaced by
    its second comma-separated item (presumably the gene name in a
    "transcript,gene" pair -- confirm against the input format).

    The copy is cached as ``b + ".notranscripts"`` and reused while it is
    newer than ``b``.  If that path cannot be opened for writing, a temporary
    file is used instead.  A partially written file is removed on failure and
    the original exception is re-raised.
    """
    cached = b + ".notranscripts"
    if op.exists(cached):
        if os.stat(cached).st_mtime > os.stat(b).st_mtime:
            return cached
    try:
        bnew = open(cached, 'w')
    except (IOError, OSError):
        # Cannot write next to `b`; fall back to a temporary file.
        bnew = open(BedTool._tmp(), "w")
    try:
        try:
            for row in reader(b, header=False):
                if "," in row[3]:
                    row[3] = row[3].split(",")[1]
                bnew.write("\t".join(row) + "\n")
        finally:
            # Close before any unlink so the removal also works on platforms
            # that refuse to delete open files.
            bnew.close()
    except BaseException:
        # Never leave a truncated cache file behind; always re-raised.
        os.unlink(bnew.name)
        raise
    return bnew.name
Exemplo n.º 10
0
def remove_transcripts(b):
    """Build (or reuse) a version of ``b`` without the first name in column 3.

    Rows whose column 3 contains "," get that column replaced by its second
    comma-separated item (presumably "transcript,gene" -> "gene"; verify
    against the input format).  Other rows are copied unchanged.

    The result is written to ``b + ".notranscripts"``; an existing file at
    that path is returned directly when it is newer than ``b``.  If the path
    is not writable a temporary file is used.  On any failure while writing,
    the incomplete file is deleted and the exception propagates.

    Returns the path of the file to use.
    """
    target = b + ".notranscripts"
    if op.exists(target) and os.stat(target).st_mtime > os.stat(b).st_mtime:
        return target

    try:
        bnew = open(target, 'w')
    except (IOError, OSError):
        # e.g. read-only directory: write to a temporary file instead.
        bnew = open(BedTool._tmp(), "w")

    try:
        try:
            for row in reader(b, header=False):
                if "," in row[3]:
                    row[3] = row[3].split(",")[1]
                bnew.write("\t".join(row) + "\n")
        finally:
            # Release the handle first so the cleanup below can delete it.
            bnew.close()
    except BaseException:
        # Remove the partial output, then let the error propagate.
        os.unlink(bnew.name)
        raise
    return bnew.name
Exemplo n.º 11
0
def add_xstream(a, b, dist, updown, report_distance=False):
    """Annotate every entry of ``a`` with the ``b`` names found up- or downstream.

    ``updown`` is ``"up"`` or ``"down"`` (other values raise ``KeyError``).
    A strand-aware ``window`` (``sw=True``) of ``dist`` is run on that side of
    ``a`` only; the opposite side is fixed at 0.  ``report_distance`` is
    accepted but has no effect in this function.

    Returns a ``BedTool`` backed by a temporary file: each entry of ``a`` with
    one added column holding the sorted unique names joined by ",", or "."
    for entries without any hit.
    """
    window_side = {"up": "l", "down": "r"}[updown]
    # have to set the other side to 0
    opposite_side = "r" if window_side == "l" else "l"
    kwargs = {"sw": True, window_side: dist, opposite_side: 0}

    c = a.window(b, **kwargs)
    afields = a.field_count()

    get_name = gen_get_name(b, afields)

    # condense to unique names, keyed on the tab-joined `a` columns.
    seen = collections.defaultdict(set)
    for row in c:
        key = "\t".join(row[:afields])
        seen[key].add(get_name(row))

    with open(BedTool._tmp(), "w") as d:
        for key, names in seen.items():
            d.write(key + "\t" + ",".join(sorted(names)) + "\n")

        # write the entries that did not appear in the window'ed Bed
        for row in a:
            key = "\t".join(row[:afields])
            if key not in seen:
                d.write(str(row) + "\t.\n")

    dbed = BedTool(d.name)
    # Sanity check: the output must have one line per entry of `a`.
    assert len(dbed) == len(a)
    return dbed
Exemplo n.º 12
0
def add_closest(aname, bname):
    """Append the closest ``b`` names and their distance to each entry of ``a``.

    ``aname`` and ``bname`` are each wrapped in a ``BedTool``.  The
    ``closest`` result (run with ``d=True``) is grouped on the tab-joined
    ``a`` columns; every group must share a single distance value (the last
    column), which is asserted.

    Returns a ``BedTool`` over a temporary file whose lines are the ``a``
    columns, the sorted unique names joined with ",", and the distance.
    """
    a = BedTool(aname)
    b = BedTool(bname)

    afields = a.field_count()
    closest = a.closest(b, d=True)
    get_name = gen_get_name(b, afields)

    out = open(BedTool._tmp(), "w")
    # For each `a` line, remember every [distance, name] pair reported.
    hits_by_line = collections.defaultdict(list)
    for feat in closest:
        a_line = "\t".join(feat[:afields])
        hits_by_line[a_line].append([feat[-1], get_name(feat)])

    for a_line, hits in hits_by_line.items():
        if hits:
            assert len(set([hit[0] for hit in hits])) == 1
        unique_names = set(hit[1] for hit in hits)
        fields = [a_line, ",".join(sorted(unique_names)), hits[0][0]]
        out.write("\t".join(fields) + "\n")
    out.close()

    result = BedTool(out.name)
    assert len(result) == len(a)
    return result
Exemplo n.º 13
0
def add_closest(aname, bname):
    """Add the nearest ``b`` feature names and the distance to each ``a`` entry.

    Both arguments are passed to ``BedTool``.  ``a.closest(b, d=True)`` is
    grouped by the tab-joined columns of ``a``; the last column of each result
    row is taken as the distance and must be identical within a group
    (asserted).

    Returns a ``BedTool`` for a temporary file containing, per entry of ``a``:
    the original columns, the sorted unique names joined by ",", and the
    distance.
    """
    a, b = BedTool(aname), BedTool(bname)

    afields = a.field_count()
    c = a.closest(b, d=True)
    get_name = gen_get_name(b, afields)

    # keep the name and distance
    seen_by_line = collections.defaultdict(list)
    for feat in c:
        key = "\t".join(feat[:afields])
        seen_by_line[key].append([feat[-1], get_name(feat)])

    with open(BedTool._tmp(), "w") as dbed:
        # .items() works on both Python 2 and 3 (iteritems() does not).
        for key, dist_names in seen_by_line.items():
            # Ties reported by closest must all be at the same distance.
            assert len(set(d[0] for d in dist_names)) == 1
            names = ",".join(sorted(set(d[1] for d in dist_names)))
            new_line = "\t".join([key, names, dist_names[0][0]])
            dbed.write(new_line + "\n")

    d = BedTool(dbed.name)
    # Sanity check: one output line per entry of `a`.
    assert len(d) == len(a)
    return d
Exemplo n.º 14
0
def main():
    """Command-line entry point: annotate the ``-a`` file using the ``-b`` file.

    ``-a`` and ``-b`` are mandatory; if either is missing the help text is
    printed and the process exits with a non-zero status.

    With neither ``--upstream`` nor ``--downstream`` the ``superanno`` output
    goes directly to stdout.  Otherwise it is written to a temporary file that
    is run through ``xstream`` once per requested direction (each pass adds a
    ``<direction>_<distance>`` header column) and the last temporary file is
    copied to stdout.
    """
    p = optparse.OptionParser(__doc__)
    p.add_option("-a",
                 dest="a",
                 help="file to annotate. first 3 columns are "
                 "chrom start stop")
    p.add_option("-b", dest="b", help="superbed to annotate with")

    p.add_option("--header",
                 dest="header",
                 help="a file has a header",
                 action="store_true",
                 default=False)
    p.add_option("-N",
                 "--no-near",
                 dest="no_near",
                 help="dont find the nearest gene, just the up/downstream",
                 action="store_true",
                 default=False)

    p.add_option("--upstream",
                 dest="upstream",
                 type=int,
                 default=None,
                 help="distance upstream of [a] to look for [b]")
    p.add_option("--downstream",
                 dest="downstream",
                 type=int,
                 default=None,
                 help="distance downstream of [a] to look for [b]")
    p.add_option("--transcripts",
                 dest="transcripts",
                 action="store_true",
                 default=False,
                 help="use transcript names in output as well as"
                 " gene name. default is just gene name")
    p.add_option(
        "--bstrand",
        dest="bstrand",
        default=None,
        type=int,
        help="if this is specified, it's the column number of the"
        " strand info from the b file and nearest are reported with"
        " upstream as negative relative to this column. either '+' or '-'")

    opts, args = p.parse_args()
    if opts.a is None or opts.b is None:
        # print_help() returns None, so this exits with a truthy (error) code.
        sys.exit(not p.print_help())

    if opts.bstrand is not None:
        # Presumably turns the user's 1-based column number into a 0-based
        # index for the downstream code -- confirm against get_dist.
        opts.bstrand -= 1

    b = opts.b
    if not opts.transcripts:
        b = remove_transcripts(b)

    # NOTE: a distance of 0 is falsy, so "--upstream 0" on its own takes the
    # plain (no up/downstream) path here.
    if not (opts.upstream or opts.downstream):
        superanno(opts.a, b, opts.header, opts.no_near, sys.stdout,
                  opts.bstrand)

    else:
        with open(BedTool._tmp(), "w") as out:
            superanno(opts.a, b, opts.header, opts.no_near, out, opts.bstrand)

        result_path = out.name
        in_fh = open(result_path)
        try:
            # Consume the header line so xstream only sees data rows.
            new_header = ([in_fh.readline().rstrip("\r\n")]
                          if opts.header else [])
            for xdir in ("upstream", "downstream"):
                dist = getattr(opts, xdir)
                if dist is None:
                    continue
                with open(BedTool._tmp(), "w") as new_out:
                    xstream(in_fh, b, dist, xdir, new_out)
                new_header.append("%s_%i" % (xdir, dist))
                # The output of this pass becomes the input of the next one.
                in_fh.close()
                result_path = new_out.name
                in_fh = open(result_path)
        finally:
            in_fh.close()

        if opts.header:
            sys.stdout.write("\t".join(new_header) + "\n")
        with open(result_path) as final:
            for line in final:
                sys.stdout.write(line)
0
def handle_coreutils_sort_kwargs(self, prog='sort', instream=None, **kwargs):
    """
    Build the command line for a coreutils ``sort`` call.

    *kwargs* are passed directly from the calling method (self.coreutils_sort).

    *instream* can be e.g., self.fn or 'a.bed' or an iterator.  A BedTool is
    replaced by its ``fn``; a string is used as the input filename; anything
    else is treated as an iterable whose items are stringified and meant to be
    piped to the program, with ``-`` given as the input filename.

    Special keys removed from *kwargs* before options are generated:

    * ``stream`` -- if truthy, no output file is chosen (``tmp`` is None).
    * ``output`` -- output filename; only consulted when ``stream`` is falsy.
      If missing or empty, a new temporary filename is used.
    * ``additional_args`` -- appended as-is after all generated options.

    Every remaining key becomes an option, processed in reverse-sorted key
    order.  One-letter keys are emitted as ``-k``, longer keys as ``--key``:

    * ``True`` -> the bare flag; ``False`` -> nothing.
    * list/tuple -> the flag repeated once per item, each followed by the
      item (e.g. ``--key 1,1 --key 2,2r``).
    * anything else -> the flag followed by ``str(value)``.

    Returns ``(cmds, tmp, stdin)``: the argument list (ending with the input
    filename), the output filename or None, and the line generator or None.
    """
    pybedtools.logger.debug(
        'BedTool.handle_coreutils_sort_kwargs() got these kwargs:\n%s',
        pprint.pformat(kwargs))

    # Decide how to send instream to sort.
    # If it's a BedTool, then get underlying stream
    if isinstance(instream, BedTool):
        instream = instream.fn

    if isinstance(instream, six.string_types):
        # Filename? No pipe, just provide the file
        stdin = None
        input_fn = instream
    else:
        # A generator or iterator: pipe it as a generator of lines
        stdin = (str(i) for i in instream)
        input_fn = '-'

    # If stream not specified, then a tempfile will be created
    if kwargs.pop('stream', None):
        tmp = None
    else:
        tmp = kwargs.pop('output', None) or BedTool._tmp()

    additional_args = kwargs.pop('additional_args', None)

    def flag(key):
        # Short options take one dash, long options two.  Applied to every
        # value type so that e.g. r=True yields "-r" rather than "--r".
        return ('-' if len(key) == 1 else '--') + key

    cmds = [prog]

    for key, value in sorted(kwargs.items(), reverse=True):
        if isinstance(value, bool):
            if value:
                cmds.append(flag(key))
        elif isinstance(value, (list, tuple)):
            # sort --key 1,1 --key 2,2r -k 5,5
            for val in value:
                cmds.append(flag(key))
                cmds.append(str(val))
        else:
            cmds.append(flag(key))
            cmds.append(str(value))

    if additional_args:
        cmds.append(additional_args)

    cmds.append(input_fn)
    return cmds, tmp, stdin