示例#1
0
文件: graph.py 项目: linlifeng/jcvi
    def path(self, path):
        """
        Describe a path of nodes as edge strings plus node orientations.

        Returns a tuple `(label, oo)`. `oo` lists `(node id, flag)` pairs in
        path order, the flag being True when the edge reports ">" for that
        node. `label` is the "|"-joined `str()` of every edge on the path, or
        a "Singleton ..." message when the path holds exactly one node.
        """
        from jcvi.utils.iter import pairwise

        if len(path) == 1:
            only = path[0]
            return "Singleton {0}".format(only), [(only.v, True)]

        oo = []
        labels = []
        for src, dst in pairwise(path):
            # Edges are stored under the (smaller id, larger id) key; an edge
            # walked the other way round is flipped for the duration of this
            # iteration and flipped back afterwards.
            key = (src.v, dst.v)
            reverse = key[0] > key[1]
            if reverse:
                key = (key[1], key[0])
            edge = self.edges[key]
            if reverse:
                edge.flip()

            head = (edge.v1.v, edge.o1 == ">")
            if not oo:  # the first edge contributes both of its nodes
                oo.append(head)
            # every edge must start where the previous one ended
            assert oo[-1] == head
            oo.append((edge.v2.v, edge.o2 == ">"))

            labels.append(str(edge))
            if reverse:
                edge.flip()  # restore the stored orientation

        return "|".join(labels), oo
示例#2
0
文件: range.py 项目: bennyyu/jcvi
def range_interleave(ranges):
    """
    Report the gaps lying between consecutive ranges.

    The input is first passed through range_merge(); gaps are then looked up
    between neighbors within each group of ranges sharing the same first field.

    >>> ranges = [("1", 30, 40), ("1", 45, 50), ("1", 10, 30)]
    >>> range_interleave(ranges)
    [('1', 41, 44)]
    >>> ranges = [("1", 30, 40), ("1", 42, 50)]
    >>> range_interleave(ranges)
    [('1', 41, 41)]
    """
    from jcvi.utils.iter import pairwise

    gaps = []
    for _, group in groupby(range_merge(ranges), key=lambda r: r[0]):
        for left, right in pairwise(group):
            _, _, left_end = left
            seqid, right_start, _ = right
            gap_start, gap_end = left_end + 1, right_start - 1
            if gap_start <= gap_end:  # neighbors that abut leave no gap
                gaps.append((seqid, gap_start, gap_end))

    return gaps
示例#3
0
文件: bed.py 项目: yangjl/jcvi
def distance(args):
    """
    %prog distance bedfile

    Calculate distance between bed features. The output file is a list of
    distances, which can be used to plot histogram, etc.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.utils.iter import pairwise

    p = OptionParser(distance.__doc__)
    p.add_option("--distmode", default="ss", choices=("ss", "ee"),
            help="Distance mode between paired reads. ss is outer distance, " \
                 "ee is inner distance [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    sortedbedfile = sort([bedfile])
    valid = total = 0
    # The context manager closes the sorted file even when a line fails to
    # parse; the handle used to be left open.
    with open(sortedbedfile) as fp:
        # Compare every line with the line that follows it
        for a, b in pairwise(fp):
            a = BedLine(a)
            b = BedLine(b)
            ar = (a.seqid, a.start, a.end, "+")
            br = (b.seqid, b.start, b.end, "+")
            dist, _ = range_distance(ar, br, distmode=opts.distmode)
            total += 1
            if dist > 0:  # only positive distances are reported
                print(dist)
                valid += 1

    logging.debug("Total valid (> 0) distances: {0}.".\
                  format(percentage(valid, total)))
示例#4
0
文件: graph.py 项目: xuanblo/jcvi
    def path(self, path, flip=False):
        oo = []
        if len(path) == 1:
            m = "Singleton {0}".format(path[0])
            oo.append((path[0].v, True))
            return m, oo

        edges = []
        for a, b in pairwise(path):
            av, bv = a.v, b.v
            e = self.get_edge(av, bv)

            if not oo:  # First edge imports two nodes
                oo.append((e.v1.v, e.o1 == ">"))
            last = oo[-1]
            assert last == (e.v1.v, e.o1 == ">")
            oo.append((e.v2.v, e.o2 == ">"))

            if flip:
                se = str(e)
                e.flip()
            else:
                se = str(e)
            edges.append(se)

        return "|".join(edges), oo
示例#5
0
文件: bed.py 项目: radaniba/jcvi
def distance(args):
    """
    %prog distance bedfile

    Calculate distance between bed features. The output file is a list of
    distances, which can be used to plot histogram, etc.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.utils.iter import pairwise

    p = OptionParser(distance.__doc__)
    p.add_option("--distmode", default="ss", choices=("ss", "ee"),
            help="Distance mode between paired reads. ss is outer distance, " \
                 "ee is inner distance [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    sortedbedfile = sort([bedfile])
    valid = total = 0
    # The context manager closes the sorted file even when a line fails to
    # parse; the handle used to be left open.
    with open(sortedbedfile) as fp:
        # Compare every line with the line that follows it
        for a, b in pairwise(fp):
            a = BedLine(a)
            b = BedLine(b)
            ar = (a.seqid, a.start, a.end, "+")
            br = (b.seqid, b.start, b.end, "+")
            dist, _ = range_distance(ar, br, distmode=opts.distmode)
            total += 1
            if dist > 0:  # only positive distances are reported
                print(dist)
                valid += 1

    logging.debug("Total valid (> 0) distances: {0}.".\
                  format(percentage(valid, total)))
示例#6
0
文件: graph.py 项目: bennyyu/jcvi
    def path(self, path):
        """
        Describe a path of nodes as edge strings plus node orientations.

        Returns a tuple `(label, oo)`. `oo` lists `(node id, flag)` pairs in
        path order, the flag being True when the edge reports ">" for that
        node. `label` is the "|"-joined `str()` of every edge on the path, or
        a "Singleton ..." message when the path holds exactly one node.
        """
        from jcvi.utils.iter import pairwise

        if len(path) == 1:
            only = path[0]
            return "Singleton {0}".format(only), [(only.v, True)]

        oo = []
        labels = []
        for src, dst in pairwise(path):
            # Edges are stored under the (smaller id, larger id) key; an edge
            # walked the other way round is flipped for the duration of this
            # iteration and flipped back afterwards.
            key = (src.v, dst.v)
            reverse = key[0] > key[1]
            if reverse:
                key = (key[1], key[0])
            edge = self.edges[key]
            if reverse:
                edge.flip()

            head = (edge.v1.v, edge.o1 == ">")
            if not oo:  # the first edge contributes both of its nodes
                oo.append(head)
            # every edge must start where the previous one ended
            assert oo[-1] == head
            oo.append((edge.v2.v, edge.o2 == ">"))

            labels.append(str(edge))
            if reverse:
                edge.flip()  # restore the stored orientation

        return "|".join(labels), oo
示例#7
0
def silicosoma(args):
    """
    %prog silicosoma in.silico > out.soma

    Convert .silico to .soma file.

    Format of .silico
        A text file containing in-silico digested contigs. This file contains pairs
    of lines. The first line in each pair constains an identifier, this contig
    length in bp, and the number of restriction sites, separated by white space.
    The second line contains a white space delimited list of the restriction
    site positions.

    Format of .soma
        Each line of the text file contains two decimal numbers: The size of the
    fragment and the standard deviation (both in kb), separated by white space.
    The standard deviation is ignored.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    parser = OptionParser(silicosoma.__doc__)
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    silicofile, = args
    fp = must_open(silicofile)
    fw = must_open(opts.outfile, "w")
    next(fp)  # the first line is skipped; only the positions line is used
    cuts = [int(token) for token in next(fp).split()]
    for left, right in pairwise(cuts):
        assert left <= right
        kb = int(round((right - left) / 1000.))  # kb
        if not kb:
            continue  # fragments rounding to 0 kb are not written
        # second column is a constant 0
        fw.write("{0} {1}\n".format(kb, 0))
示例#8
0
    def path(self, path, flip=False):
        oo = []
        if len(path) == 1:
            m = "Singleton {0}".format(path[0])
            oo.append((path[0].v, True))
            return m, oo

        edges = []
        for a, b in pairwise(path):
            av, bv = a.v, b.v
            e = self.get_edge(av, bv)

            if not oo:  # First edge imports two nodes
                oo.append((e.v1.v, e.o1 == ">"))
            last = oo[-1]
            assert last == (e.v1.v, e.o1 == ">")
            oo.append((e.v2.v, e.o2 == ">"))

            if flip:
                se = str(e)
                e.flip()
            else:
                se = str(e)
            edges.append(se)

        return "|".join(edges), oo
示例#9
0
def chimera(args):
    """
    %prog chimera bedfile

    Scan the bed file to break scaffolds that multi-maps.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(chimera.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    bed = Bed(bedfile)
    selected = select_bed(bed)
    mapped = defaultdict(set)  # scaffold => chromosomes it was mapped to
    hits = defaultdict(list)  # scaffold => [(bed line, parsed accn range)]
    # One pass groups the lines per scaffold, so the per-scaffold report below
    # no longer rescans (and re-parses) the whole selection every time.
    for b in selected:
        rr = range_parse(b.accn)
        mapped[rr.seqid].add(b.seqid)
        hits[rr.seqid].append((b, rr))

    chimerabed = "chimera.bed"
    nchimera = 0
    # The context manager closes the output even if a record fails midway
    with open(chimerabed, "w") as fw:
        for s, chrs in sorted(mapped.items()):
            if len(chrs) == 1:
                continue

            sys.stderr.write("=" * 80 + "\n")
            sys.stderr.write("{0} mapped to multiple locations: {1}".\
                    format(s, ",".join(sorted(chrs))) + "\n")
            ranges = []
            for b, rr in hits[s]:
                sys.stderr.write(str(b) + "\n")
                ranges.append(rr)

            # Identify breakpoints
            ranges.sort(key=lambda x: (x.seqid, x.start, x.end))
            for left, right in pairwise(ranges):
                seqid = left.seqid
                if seqid != right.seqid:
                    continue

                # The junction spans from the end of one range to the start
                # of the next; swap so that start <= end.
                start, end = left.end, right.start
                if start > end:
                    start, end = end, start

                chimeraline = "\t".join(str(x) for x in (seqid, start, end))
                fw.write(chimeraline + "\n")
                sys.stderr.write(chimeraline + "\n")
                nchimera += 1

    logging.debug("A total of {0} junctions written to `{1}`.".\
                  format(nchimera, chimerabed))
示例#10
0
def min_feedback_arc_set(edges, remove=False, maxcycles=20000):
    """
    Pick a minimum-weight set of edges that hits every directed cycle.

    Removing such a feedback arc set from a directed graph leaves a directed
    acyclic graph (DAG). See: <http://en.wikipedia.org/wiki/Feedback_arc_set>.

    Formulation: one 0/1 variable per edge, and one constraint per enumerated
    cycle requiring that at least one of its edges is selected. Cycle
    enumeration stops once `maxcycles` constraints were collected.

    Returns `(results, obj_val)`. `results` holds the selected edges, or the
    remaining edges when `remove` is set; it is None when the solver selected
    nothing.

    >>> g = [(1, 2, 2), (2, 3, 2), (3, 4, 2)] + [(1, 3, 1), (3, 2, 1), (2, 4, 1)]
    >>> min_feedback_arc_set(g)
    ([(3, 2, 1)], 1)
    >>> min_feedback_arc_set(g, remove=True)  # Return DAG
    ([(1, 2, 2), (2, 3, 2), (3, 4, 2), (1, 3, 1), (2, 4, 1)], 1)
    """
    digraph = nx.DiGraph()
    index_of = {}  # (source, target) => position of the edge in `edges`
    for pos, (src, dst, _) in enumerate(edges):
        digraph.add_edge(src, dst)
        index_of[src, dst] = pos

    lp = LPInstance()
    lp.add_objective(edges, objective=MINIMIZE)

    constraints = []
    for cycle in nx.simple_cycles(digraph):
        closed = cycle + [cycle[0]]  # append the first node to close the loop
        members = [index_of[u, v] for u, v in pairwise(closed)]
        constraints.append("{0} >= 1".format(summation(members)))
        if len(constraints) == maxcycles:
            break
    logging.debug("A total of {0} cycles found.".format(len(constraints)))

    lp.constraints = constraints
    lp.add_vars(len(edges))

    selected, obj_val = lp.lpsolve(clean=False)
    if not selected:
        results = None
    elif remove:
        results = [e for i, e in enumerate(edges) if i not in selected]
    else:
        results = [e for i, e in enumerate(edges) if i in selected]

    return results, obj_val
示例#11
0
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    parser = OptionParser(fromblast.__doc__)
    parser.add_option(
        "--clique",
        default=False,
        action="store_true",
        help="Populate clique instead of linear path [default: %default]",
    )
    parser.add_option(
        "--maxdist",
        default=100000,
        type="int",
        help="Create edge within certain distance [default: %default]",
    )
    parser.set_verbose(help="Print verbose reports to stdout")
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    blastfile, subjectfasta = args
    sort([blastfile, "--query"])

    def tag(hit):
        # "+" hits are tagged ">", everything else "<"
        return ">" if hit.orientation == "+" else "<"

    hits = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for _, group in groupby(hits, key=lambda h: h.query):
        group = list(group)
        # --clique links every pair of hits of a query, otherwise only neighbors
        candidates = combinations(group, 2) if opts.clique else pairwise(group)
        for a, b in candidates:
            if a.subject == b.subject:
                continue

            span_a = (a.query, a.qstart, a.qstop, "+")
            span_b = (b.query, b.qstart, b.qstop, "+")
            dist, _ = range_distance(span_a, span_b, distmode="ee")
            if dist > opts.maxdist:
                continue

            g.add_edge(a.subject, b.subject, tag(a), tag(b))

    graph_to_agp(g, blastfile, subjectfasta, verbose=opts.verbose)
示例#12
0
文件: tsp.py 项目: zhaotao1987/jcvi
def plot_data(x, y, tour, M):
    """
    Plot the points and the tour, then save the figure as `demo.pdf`.

    All points are drawn with the "ro" format, consecutive stops of `tour`
    (indices into `x` and `y`) are joined with "r-" segments, and the title
    shows the value of evaluate(tour, M).
    """
    from jcvi.graphics.base import plt, savefig

    plt.plot(x, y, "ro")
    for src, dst in pairwise(tour):
        xs = (x[src], x[dst])
        ys = (y[src], y[dst])
        plt.plot(xs, ys, "r-")

    plt.title("Score={0:.2f}".format(evaluate(tour, M)))

    savefig("demo.pdf")
示例#13
0
文件: agp.py 项目: bennyyu/jcvi
    def validate_one(self, object, lines):
        object_beg = lines[0].object_beg
        assert object_beg == 1, \
                "object %s must start at 1 (instead of %d)" % \
                (object, object_beg)

        for a, b in pairwise(lines):
            assert b.object_beg - a.object_end == 1, \
                    "lines not continuous coords between:\n%s\n%s" % \
                    (a, b)
示例#14
0
 def compute_all_gaps(self, minsize=100, maxsize=500000, verbose=False):
     self.gapsizes = []
     for (a, b), gappos in zip(pairwise(self.scaffolds), self.pp):
         gapsize = self.compute_one_gap(a,
                                        b,
                                        gappos,
                                        minsize,
                                        maxsize,
                                        verbose=verbose)
         self.gapsizes.append(gapsize)
示例#15
0
文件: tsp.py 项目: Hensonmw/jcvi
def plot_data(x, y, tour, M):
    """
    Plot the points and the tour, then save the figure as `demo.pdf`.

    All points are drawn with the "ro" format, consecutive stops of `tour`
    (indices into `x` and `y`) are joined with "r-" segments, and the title
    shows the value of evaluate(tour, M).
    """
    from jcvi.graphics.base import plt, savefig

    plt.plot(x, y, "ro")
    for src, dst in pairwise(tour):
        xs = (x[src], x[dst])
        ys = (y[src], y[dst])
        plt.plot(xs, ys, "r-")

    plt.title("Score={0:.2f}".format(evaluate(tour, M)))

    savefig("demo.pdf")
示例#16
0
    def validate_one(self, object, lines):
        object_beg = lines[0].object_beg
        assert object_beg == 1, \
                "object %s must start at 1 (instead of %d)" % \
                (object, object_beg)

        for a, b in pairwise(lines):
            assert b.object_beg - a.object_end == 1, \
                    "lines not continuous coords between:\n%s\n%s" % \
                    (a, b)
示例#17
0
def pairinplace(args):
    """
    %prog pairinplace bulk.fastq

    Pair up the records in bulk.fastq by comparing the names for adjancent
    records. If they match, print to bulk.pairs.fastq, else print to
    bulk.frags.fastq.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.utils.iter import pairwise

    p = OptionParser(pairinplace.__doc__)
    p.add_option("-r", dest="rclip", default=1, type="int",
            help="pair ID is derived from rstrip N chars [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    base = op.basename(fastqfile).split(".")[0]

    frags = base + ".frags.fastq"
    pairs = base + ".pairs.fastq"
    if fastqfile.endswith(".gz"):
        frags += ".gz"
        pairs += ".gz"

    fragsfw = must_open(frags, "w")
    pairsfw = must_open(pairs, "w")

    N = opts.rclip

    def strip_name(x):
        # With N == 0 the name is used unchanged. The former lambda parsed as
        # `x[:-N] if N else str` and handed back the `str` type in that case.
        return x[:-N] if N else x

    try:
        fh_iter = iter_fastq(fastqfile, key=strip_name)
        skipflag = False  # controls the iterator skip
        a = None  # stays None when the input yields no pair at all
        for a, b in pairwise(fh_iter):
            if b is None:  # hit the eof
                break

            if skipflag:
                # `a` was already written as the second read of a pair
                skipflag = False
                continue

            if a.id == b.id:
                pairsfw.write(str(a) + "\n")
                pairsfw.write(str(b) + "\n")
                skipflag = True
            else:
                fragsfw.write(str(a) + "\n")

        # don't forget the last one, when b is None
        if not skipflag and a is not None:
            fragsfw.write(str(a) + "\n")
    finally:
        # Both outputs are named files opened above; close them so that all
        # buffered records reach the disk.
        fragsfw.close()
        pairsfw.close()

    logging.debug("Reads paired into `%s` and `%s`" % (pairs, frags))
示例#18
0
def sequence_to_graph(G, seq, color='black'):
    """
    Automatically construct graph given a sequence of characters.
    """
    for x in seq:
        if x.endswith("_1"):  # Mutation
            G.node(x, color=color, width="0.1", shape="circle", label="")
        else:
            G.node(x, color=color)
    for a, b in pairwise(seq):
        G.edge(a, b, color=color)
示例#19
0
文件: align.py 项目: zhimenggan/jcvi
 def draw(self):
     ar = self.ar
     pad = self.pad
     pads = 0
     for (a, b), w, color in zip(pairwise(ar), self.wiggles, self.colors):
         yf = self.ystart + w * 1. / self.wiggle
         if color:
             p = Rectangle((a + pads, yf), b - a, self.height, color=color)
             self.append(p)
         pads += pad
     self.add_patches()
示例#20
0
文件: graph.py 项目: tanghaibao/jcvi
def sequence_to_graph(G, seq, color='black'):
    """
    Automatically construct graph given a sequence of characters.
    """
    for x in seq:
        if x.endswith("_1"):  # Mutation
            G.node(x, color=color, width="0.1", shape="circle", label="")
        else:
            G.node(x, color=color)
    for a, b in pairwise(seq):
        G.edge(a, b, color=color)
示例#21
0
文件: align.py 项目: tanghaibao/jcvi
 def draw(self):
     ar = self.ar
     pad = self.pad
     pads = 0
     for (a, b), w, color in zip(pairwise(ar), self.wiggles, self.colors):
         yf = self.ystart + w * 1. / self.wiggle
         if color:
             p = Rectangle((a + pads, yf), b - a, self.height, color=color)
             self.append(p)
         pads += pad
     self.add_patches()
示例#22
0
文件: align.py 项目: zhimenggan/jcvi
    def from_silico(self, filename="Ecoli.silico", nfrags=25):
        fp = open(filename)
        next(fp)
        ar = [0] + [int(x) for x in fp.next().split()]
        sizes = []  # Only retain frags beyond certain size
        for a, b in pairwise(ar):
            size = b - a
            if size < max(ar[:nfrags]) / 100:
                continue
            sizes.append(size)

        sizes = [choice(sizes) for x in xrange(nfrags)]
        return sizes
示例#23
0
文件: sspace.py 项目: tanghaibao/jcvi
    def graph(self):
        """
        Build a BiGraph linking the consecutive lines of every scaffold.

        Along the way the contig names of each scaffold are stored in
        `self.scf`. A scaffold made of a single line contributes a bare node.
        """
        bigraph = BiGraph()
        for name, members in self.iter_scaffold():
            tigs = [member.tig for member in members]
            self.scf[name] = tigs

            for cur, nxt in pairwise(members):
                # the edge length is taken from the `gaps` of the first line
                bigraph.add_edge(cur.tig, nxt.tig, cur.o, nxt.o,
                                 length=cur.gaps)

            if len(members) == 1:  # Singleton scaffold
                bigraph.add_node(members[0].tig)

        return bigraph
示例#24
0
文件: align.py 项目: tanghaibao/jcvi
    def from_silico(self, filename="Ecoli.silico", nfrags=25):
        fp = open(filename)
        next(fp)
        ar = [0] + [int(x) for x in fp.next().split()]
        sizes = []  # Only retain frags beyond certain size
        for a, b in pairwise(ar):
            size = b - a
            if size < max(ar[:nfrags]) / 100:
                continue
            sizes.append(size)

        sizes = [choice(sizes) for x in xrange(nfrags)]
        return sizes
示例#25
0
    def graph(self):
        """
        Build a BiGraph linking the consecutive lines of every scaffold.

        Along the way the contig names of each scaffold are stored in
        `self.scf`. A scaffold made of a single line contributes a bare node.
        """
        bigraph = BiGraph()
        for name, members in self.iter_scaffold():
            tigs = [member.tig for member in members]
            self.scf[name] = tigs

            for cur, nxt in pairwise(members):
                # the edge length is taken from the `gaps` of the first line
                bigraph.add_edge(cur.tig, nxt.tig, cur.o, nxt.o,
                                 length=cur.gaps)

            if len(members) == 1:  # Singleton scaffold
                bigraph.add_node(members[0].tig)

        return bigraph
示例#26
0
文件: graph.py 项目: xuanblo/jcvi
def make_paths(paths, weights=None):
    """
    Combine several paths into one directed graph. Called by merge_paths().

    Every pair of consecutive nodes of a path is passed to update_weight()
    together with the weight of that path. Missing (or empty) `weights`
    default to 1 for each path.
    """
    if not weights:
        weights = [1] * len(paths)
    assert len(paths) == len(weights)

    digraph = nx.DiGraph()
    for nodes, weight in zip(paths, weights):
        for src, dst in pairwise(nodes):
            update_weight(digraph, src, dst, weight)
    return digraph
示例#27
0
def make_paths(paths, weights=None):
    """
    Combine several paths into one directed graph. Called by merge_paths().

    Every pair of consecutive nodes of a path is passed to update_weight()
    together with the weight of that path. Missing (or empty) `weights`
    default to 1 for each path.
    """
    if not weights:
        weights = [1] * len(paths)
    assert len(paths) == len(weights)

    digraph = nx.DiGraph()
    for nodes, weight in zip(paths, weights):
        for src, dst in pairwise(nodes):
            update_weight(digraph, src, dst, weight)
    return digraph
示例#28
0
文件: syntenypath.py 项目: rrane/jcvi
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    parser = OptionParser(fromblast.__doc__)
    parser.add_option("--clique", default=False, action="store_true",
                      help="Populate clique instead of linear path [default: %default]")
    parser.add_option("--maxdist", default=100000, type="int",
                      help="Create edge within certain distance [default: %default]")
    parser.add_option("--verbose", default=False, action="store_true",
                      help="Print verbose reports to stdout [default: %default]")
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    blastfile, subjectfasta = args
    sort([blastfile, "--query"])

    def tag(hit):
        # "+" hits are tagged ">", everything else "<"
        return ">" if hit.orientation == "+" else "<"

    hits = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for _, group in groupby(hits, key=lambda h: h.query):
        group = list(group)
        # --clique links every pair of hits of a query, otherwise only neighbors
        candidates = combinations(group, 2) if opts.clique else pairwise(group)
        for a, b in candidates:
            if a.subject == b.subject:
                continue

            span_a = (a.query, a.qstart, a.qstop, "+")
            span_b = (b.query, b.qstart, b.qstop, "+")
            dist, _ = range_distance(span_a, span_b, distmode="ee")
            if dist > opts.maxdist:
                continue

            g.add_edge(BiEdge(a.subject, b.subject, tag(a), tag(b)))

    graph_to_agp(g, blastfile, subjectfasta, verbose=opts.verbose)
示例#29
0
文件: graph.py 项目: rrane/jcvi
def merge_paths(paths):
    """
    Merge sorted lists into one ordering.

    Consecutive elements of every path become the edges of a directed graph,
    whose topological sort is returned.

    >>> paths = [[1, 2, 3], [1, 3, 4], [2, 4, 5]]
    >>> merge_paths(paths)
    [1, 2, 3, 4, 5]
    """
    from jcvi.utils.iter import pairwise

    links = [link for path in paths for link in pairwise(path)]
    return topological_sort(nx.DiGraph(links))
示例#30
0
 def subtourelim(model, where):
     """
     Solver callback that adds a lazy subtour elimination constraint.

     Only acts when `where` equals GRB.callback.MIPSOL. If the tour found
     among the edges selected by the current solution does not cover all `n`
     nodes, the edges of that tour are limited to len(tour) - 1 selections.
     """
     if where != GRB.callback.MIPSOL:
         return

     # collect the edges selected in the solution, as index pairs
     values = model.cbGetSolution([model._vars[i] for i in range(nedges)])
     chosen = []
     for pos, value in enumerate(values):
         if value > .5:
             a, b, w = edges[pos]
             chosen.append((idx[a], idx[b]))

     # find the shortest cycle in the selected edge list
     tour = subtour(chosen)
     if len(tour) == n:
         return

     # add a subtour elimination constraint
     closed = tour + [tour[0]]  # append the first stop to close the cycle
     incident = [edge_store[a, b] for a, b in pairwise(closed)]
     model.cbLazy(quicksum(model._vars[x] for x in incident) <= len(tour) - 1)
示例#31
0
文件: sspace.py 项目: biologyguy/jcvi
def path_to_agp(g, path, object, sizes, status):
    """
    Turn a path of (node, orientation) pairs into a list of AGP lines.

    Every node but the last yields a component line followed by a gap line
    whose length comes from the edge to the next node; the last node yields
    a final component line.
    """
    lines = []
    for (node, orient), (nxt, _) in pairwise(path):
        orient = get_orientation(orient, status)
        edge = g.get_edge(node.v, nxt.v)
        lines += [
            AGPLine.cline(object, node.v, sizes, orient),
            AGPLine.gline(object, edge.length),
        ]

    # Do not forget the last one
    last, last_orient = path[-1]
    last_orient = get_orientation(last_orient, status)
    lines.append(AGPLine.cline(object, last.v, sizes, last_orient))

    return lines
示例#32
0
文件: sspace.py 项目: arvin580/jcvi
def path_to_agp(g, path, object, sizes, status):
    """
    Turn a path of (node, orientation) pairs into a list of AGP lines.

    Every node but the last yields a component line followed by a gap line
    whose length comes from the edge to the next node; the last node yields
    a final component line.
    """
    lines = []
    for (node, orient), (nxt, _) in pairwise(path):
        orient = get_orientation(orient, status)
        edge = g.get_edge(node.v, nxt.v)
        lines += [
            AGPLine.cline(object, node.v, sizes, orient),
            AGPLine.gline(object, edge.length),
        ]

    # Do not forget the last one
    last, last_orient = path[-1]
    last_orient = get_orientation(last_orient, status)
    lines.append(AGPLine.cline(object, last.v, sizes, last_orient))

    return lines
示例#33
0
def breakpoint(args):
    """
    %prog breakpoint mstmap.input > breakpoints.bed

    Find scaffold breakpoints using genetic map. Use variation.vcf.mstmap() to
    generate the input for this routine.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.utils.iter import pairwise

    parser = OptionParser(breakpoint.__doc__)
    parser.add_option(
        "--diff",
        default=0.1,
        type="float",
        help="Maximum ratio of differences allowed",
    )
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    (mstmap, ) = args
    diff = opts.diff
    data = MSTMap(mstmap)

    # Remove singleton markers (avoid double cross-over). Only interior
    # markers are examined, so the first and last marker never enter `good`.
    good = []
    nsingletons = 0
    for i in range(1, len(data) - 1):
        marker = data[i]
        left_label, _ = check_markers(data[i - 1], marker, diff)
        right_label, _ = check_markers(marker, data[i + 1], diff)

        if left_label == BREAK and right_label == BREAK:
            # breaks on both sides: the marker is treated as a singleton
            nsingletons += 1
        else:
            good.append(marker)

    logging.debug(
        "A total of {0} singleton markers removed.".format(nsingletons))

    for left, right in pairwise(good):
        label, rr = check_markers(left, right, diff)
        if label == BREAK:
            print("\t".join(str(x) for x in rr))
示例#34
0
文件: range.py 项目: xuanblo/jcvi
def range_depth(ranges, size, verbose=True):
    """
    Overlay (start, end) ranges on [0, size) and tally the covered depth.

    Returns `(depthstore, depthdetails)`: `depthstore` maps a depth to the
    total length found at that depth, `depthdetails` lists the individual
    `(start, end, depth)` stretches from 0 up to `size`. With `verbose`, a
    per-depth summary is written to stderr.
    """
    from jcvi.utils.iter import pairwise
    from jcvi.utils.cbook import percentage

    # Make endpoints
    endpoints = sorted(
        point
        for lo, hi in ranges
        for point in ((lo, LEFT), (hi, RIGHT))
    )
    vstart, vend = min(endpoints)[0], max(endpoints)[0]

    assert 0 <= vstart < size
    assert 0 <= vend < size

    # Nothing covers the stretch before the first endpoint
    depth = 0
    depthstore = defaultdict(int)
    depthstore[depth] += vstart
    depthdetails = [(0, vstart, depth)]

    for (pos, tag), (nxt, nxt_tag) in pairwise(endpoints):
        # entering a range raises the depth, leaving one lowers it
        if tag == LEFT:
            depth += 1
        elif tag == RIGHT:
            depth -= 1
        depthstore[depth] += nxt - pos
        depthdetails.append((pos, nxt, depth))

    # The very last endpoint must close a range and bring the depth back to 0
    assert nxt_tag == RIGHT
    depth -= 1

    assert depth == 0
    depthstore[depth] += size - vend
    depthdetails.append((vend, size, depth))

    assert sum(depthstore.values()) == size
    if verbose:
        for depth, count in sorted(depthstore.items()):
            sys.stderr.write("Depth {0}: {1}".\
                    format(depth, percentage(count, size)) + "\n")

    return depthstore, depthdetails
示例#35
0
文件: range.py 项目: shunte88/jcvi
def range_depth(ranges, size, verbose=True):
    """
    Overlay (start, end) ranges on [0, size) and tally the covered depth.

    Returns `(depthstore, depthdetails)`: `depthstore` maps a depth to the
    total length found at that depth, `depthdetails` lists the individual
    `(start, end, depth)` stretches from 0 up to `size`. With `verbose`, a
    per-depth summary is written to stderr.
    """
    from jcvi.utils.iter import pairwise
    from jcvi.utils.cbook import percentage

    # Make endpoints
    endpoints = sorted(
        point
        for lo, hi in ranges
        for point in ((lo, LEFT), (hi, RIGHT))
    )
    vstart, vend = min(endpoints)[0], max(endpoints)[0]

    assert 0 <= vstart < size
    assert 0 <= vend < size

    # Nothing covers the stretch before the first endpoint
    depth = 0
    depthstore = defaultdict(int)
    depthstore[depth] += vstart
    depthdetails = [(0, vstart, depth)]

    for (pos, tag), (nxt, nxt_tag) in pairwise(endpoints):
        # entering a range raises the depth, leaving one lowers it
        if tag == LEFT:
            depth += 1
        elif tag == RIGHT:
            depth -= 1
        depthstore[depth] += nxt - pos
        depthdetails.append((pos, nxt, depth))

    # The very last endpoint must close a range and bring the depth back to 0
    assert nxt_tag == RIGHT
    depth -= 1

    assert depth == 0
    depthstore[depth] += size - vend
    depthdetails.append((vend, size, depth))

    assert sum(depthstore.values()) == size
    if verbose:
        for depth, count in sorted(depthstore.items()):
            sys.stderr.write("Depth {0}: {1}".\
                    format(depth, percentage(count, size)) + "\n")

    return depthstore, depthdetails
示例#36
0
文件: range.py 项目: xuanblo/jcvi
def range_interleave(ranges, sizes={}, empty=False):
    """
    Return the gaps left between the given ranges after merging them.

    When `sizes` holds a (truthy) length for a sequence, the stretch before
    its first range (starting at 1) and the stretch after its last range (up
    to that length) are considered as well. With `empty` set, a None
    placeholder is appended wherever a candidate gap turns out to be empty.

    >>> ranges = [("1", 30, 40), ("1", 45, 50), ("1", 10, 30)]
    >>> range_interleave(ranges)
    [('1', 41, 44)]
    >>> ranges = [("1", 30, 40), ("1", 42, 50)]
    >>> range_interleave(ranges)
    [('1', 41, 41)]
    >>> range_interleave(ranges, sizes={"1": 70})
    [('1', 1, 29), ('1', 41, 41), ('1', 51, 70)]
    """
    from jcvi.utils.iter import pairwise

    merged = range_merge(ranges)
    gaps = []

    def record(has_gap, seqid, start, end):
        # Keep a real gap; otherwise leave a placeholder only when asked to
        if has_gap:
            gaps.append((seqid, start, end))
        elif empty:
            gaps.append(None)

    # `sizes` is only read below, so the shared default dict is never mutated
    for key, group in groupby(merged, key=lambda r: r[0]):
        group = list(group)
        size = sizes.get(key, None)

        if size:  # stretch in front of the first range
            seqid, first_start, _ = group[0]
            record(first_start > 1, seqid, 1, first_start - 1)

        for left, right in pairwise(group):
            _, _, left_end = left
            seqid, right_start, _ = right
            gap_start, gap_end = left_end + 1, right_start - 1
            record(gap_start <= gap_end, seqid, gap_start, gap_end)

        if size:  # stretch behind the last range
            seqid, _, last_end = group[-1]
            record(last_end < size, seqid, last_end + 1, size)

    return gaps
示例#37
0
文件: range.py 项目: shunte88/jcvi
def range_interleave(ranges, sizes=None, empty=False):
    """
    Returns the ranges in between the given ranges.

    `ranges` is a list of (seqid, start, end) tuples; they are merged with
    range_merge() before the gaps are computed. `sizes` optionally maps a
    seqid to its length, in which case the stretches before the first range
    (starting at 1) and after the last range (up to the length) are included.
    When `empty` is set, None is appended in place of every gap that has no
    width, so the output keeps one slot per candidate gap.

    >>> ranges = [("1", 30, 40), ("1", 45, 50), ("1", 10, 30)]
    >>> range_interleave(ranges)
    [('1', 41, 44)]
    >>> ranges = [("1", 30, 40), ("1", 42, 50)]
    >>> range_interleave(ranges)
    [('1', 41, 41)]
    >>> range_interleave(ranges, sizes={"1": 70})
    [('1', 1, 29), ('1', 41, 41), ('1', 51, 70)]
    """
    from jcvi.utils.iter import pairwise

    # A None sentinel replaces the former `sizes={}` default so that no dict
    # object is shared between calls
    if sizes is None:
        sizes = {}

    ranges = range_merge(ranges)
    interleaved_ranges = []

    for ch, cranges in groupby(ranges, key=lambda x: x[0]):
        cranges = list(cranges)
        size = sizes.get(ch, None)
        if size:
            # Gap between the start of the sequence and the first range
            ch, astart, aend = cranges[0]
            if astart > 1:
                interleaved_ranges.append((ch, 1, astart - 1))
            elif empty:
                interleaved_ranges.append(None)

        for a, b in pairwise(cranges):
            ch, astart, aend = a
            ch, bstart, bend = b
            istart, iend = aend + 1, bstart - 1
            if istart <= iend:
                interleaved_ranges.append((ch, istart, iend))
            elif empty:
                interleaved_ranges.append(None)

        if size:
            # Gap between the last range and the end of the sequence
            ch, astart, aend = cranges[-1]
            if aend < size:
                interleaved_ranges.append((ch, aend + 1, size))
            elif empty:
                interleaved_ranges.append(None)

    return interleaved_ranges
示例#38
0
文件: sspace.py 项目: biologyguy/jcvi
    def write_agp(self, filename):
        """
        Write every scaffold as AGP lines to `filename` and return `filename`.

        For each scaffold from self.iter_scaffold(), every member but the last
        contributes a component line followed by a gap line; the last member
        contributes only a component line. After writing, reindex() is called
        on the file with the --inplace flag.
        """
        sizes = self.sz
        agp = []
        for scaffold, lines in self.iter_scaffold():
            for a, _ in pairwise(lines):
                agp.append(AGPLine.cline(scaffold, a.tig, sizes, a.oo))
                agp.append(AGPLine.gline(scaffold, a.gaps))
            a = lines[-1]
            agp.append(AGPLine.cline(scaffold, a.tig, sizes, a.oo))

        # `with` closes the handle even if formatting a line raises, and an
        # explicit write() behaves the same on Python 2 and Python 3
        with open(filename, "w") as fw:
            for line in agp:
                fw.write(str(line) + "\n")

        reindex([filename, "--inplace"])
        return filename
示例#39
0
文件: sspace.py 项目: arvin580/jcvi
    def write_agp(self, filename):
        """
        Dump the scaffolds to `filename` in AGP form, then reindex that file
        (reindex is invoked with --inplace). Returns `filename`.
        """
        contig_sizes = self.sz
        records = []
        for scaffold, members in self.iter_scaffold():
            # Component + gap for every member that has a successor
            for member, _ in pairwise(members):
                component = AGPLine.cline(scaffold, member.tig,
                                          contig_sizes, member.oo)
                gap = AGPLine.gline(scaffold, member.gaps)
                records.extend((component, gap))
            # The final member closes the scaffold without a trailing gap
            tail = members[-1]
            records.append(
                AGPLine.cline(scaffold, tail.tig, contig_sizes, tail.oo))

        fw = open(filename, "w")
        for record in records:
            print >> fw, record
        fw.close()

        reindex([filename, "--inplace"])
        return filename
示例#40
0
def breakpoint(args):
    """
    %prog breakpoint mstmap.input > breakpoints.bed

    Find scaffold breakpoints using genetic map. Use variation.vcf.mstmap() to
    generate the input for this routine.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is kept verbatim.
    from jcvi.utils.iter import pairwise

    p = OptionParser(breakpoint.__doc__)
    p.add_option("--diff", default=.1, type="float",
                 help="Maximum ratio of differences allowed [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    mstmap, = args
    diff = opts.diff
    data = MSTMap(mstmap)

    # Remove singleton markers (avoid double cross-over): a marker flagged as
    # BREAK against both of its neighbours is dropped. The first and last
    # markers have only one neighbour and are never added to `good`.
    good = []
    nsingletons = 0
    # range() rather than xrange() so the loop runs on Python 2 and 3 alike
    for i in range(1, len(data) - 1):
        a = data[i]
        left_label, _ = check_markers(data[i - 1], a, diff)
        right_label, _ = check_markers(a, data[i + 1], diff)

        if left_label == BREAK and right_label == BREAK:
            nsingletons += 1
            continue

        good.append(a)

    logging.debug("A total of {0} singleton markers removed.".format(nsingletons))

    # Report every remaining adjacent pair that still looks like a break.
    # A parenthesised single-argument print is valid on Python 2 and 3.
    for a, b in pairwise(good):
        label, rr = check_markers(a, b, diff)
        if label == BREAK:
            print("\t".join(str(x) for x in rr))
示例#41
0
文件: pad.py 项目: tanghaibao/jcvi
def write_PAD_bed(bedfile, prefix, pads, bed):

    fw = open(bedfile, "w")
    padnames = ["{0}:{1:05d}-{2:05d}".format(prefix, a, b) for a, b in pads]
    for a, b in pairwise(padnames):
        assert a != b, a

    j = 0
    # Assign all genes to new partitions
    for i, x in enumerate(bed):
        a, b = pads[j]
        if i > b:
            j += 1
            a, b = pads[j]
        print("\t".join((padnames[j], str(i), str(i + 1), x.accn)), file=fw)

    fw.close()

    npads = len(pads)
    logging.debug("{0} partition written in `{1}`.".format(npads, bedfile))
    return npads, padnames
示例#42
0
文件: pad.py 项目: wroldwiedbwe/jcvi
def write_PAD_bed(bedfile, prefix, pads, bed):

    fw = open(bedfile, "w")
    padnames = ["{0}:{1:05d}-{2:05d}".format(prefix, a, b) for a, b in pads]
    for a, b in pairwise(padnames):
        assert a != b, a

    j = 0
    # Assign all genes to new partitions
    for i, x in enumerate(bed):
        a, b = pads[j]
        if i > b:
            j += 1
            a, b = pads[j]
        print("\t".join((padnames[j], str(i), str(i + 1), x.accn)), file=fw)

    fw.close()

    npads = len(pads)
    logging.debug("{0} partition written in `{1}`.".format(npads, bedfile))
    return npads, padnames
示例#43
0
def happy_edges(row, prefix=None):
    """
    Convert a row in HAPPY file and yield edges.
    """
    trans = maketrans("[](){}", "      ")
    row = row.strip().strip("+")
    row = row.translate(trans)
    scfs = [x.strip("+") for x in row.split(":")]
    for a, b in pairwise(scfs):
        oa = '<' if a.strip()[0] == '-' else '>'
        ob = '<' if b.strip()[0] == '-' else '>'

        is_uncertain = a[-1] == ' ' or b[0] == ' '

        a = a.strip().strip('-')
        b = b.strip().strip('-')

        if prefix:
            a = prefix + a
            b = prefix + b

        yield (a, b, oa, ob), is_uncertain
示例#44
0
def happy_edges(row, prefix=None):
    """
    Convert a row in HAPPY file and yield edges.
    """
    trans = str.maketrans("[](){}", "      ")
    row = row.strip().strip("+")
    row = row.translate(trans)
    scfs = [x.strip("+") for x in row.split(":")]
    for a, b in pairwise(scfs):
        oa = "<" if a.strip()[0] == "-" else ">"
        ob = "<" if b.strip()[0] == "-" else ">"

        is_uncertain = a[-1] == " " or b[0] == " "

        a = a.strip().strip("-")
        b = b.strip().strip("-")

        if prefix:
            a = prefix + a
            b = prefix + b

        yield (a, b, oa, ob), is_uncertain
示例#45
0
def happy_edges(row, prefix=None):
    """
    Turn one row of a HAPPY file into a stream of edges.

    After brackets are replaced by blanks, the row is split on ":" and every
    pair of adjacent tokens yields ((a, b, oa, ob), is_uncertain). A token
    beginning with "-" gets orientation '<', any other '>'; the edge is
    uncertain when the first token ends with a blank or the second one
    starts with a blank. Names are prefixed with `prefix` if one is given.
    """
    brackets_to_blanks = maketrans("[](){}", "      ")
    stripped = row.strip().strip("+")
    pieces = stripped.translate(brackets_to_blanks).split(":")
    scaffolds = [piece.strip("+") for piece in pieces]

    def orientation(token):
        # Leading minus sign means the scaffold is reversed
        return '<' if token.strip()[0] == '-' else '>'

    def bare_name(token):
        # Drop padding blanks and the orientation sign, then add the prefix
        name = token.strip().strip('-')
        return prefix + name if prefix else name

    for first, second in pairwise(scaffolds):
        o_first = orientation(first)
        o_second = orientation(second)
        is_uncertain = first[-1] == ' ' or second[0] == ' '
        yield (bare_name(first), bare_name(second),
               o_first, o_second), is_uncertain
示例#46
0
文件: patch.py 项目: JinfengChen/jcvi
def gaps(args):
    """
    %prog gaps OM.bed fastafile

    Create patches around OM gaps.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is kept verbatim.
    from jcvi.formats.bed import uniq
    from jcvi.utils.iter import pairwise

    p = OptionParser(gaps.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    # fastafile is required on the command line but not read in this routine
    ombed, fastafile = args
    ombed = uniq([ombed])
    bed = Bed(ombed)

    for a, b in pairwise(bed):
        # Coordinates of the two features themselves ...
        om_a = (a.seqid, a.start, a.end, "+")
        om_b = (b.seqid, b.start, b.end, "+")
        # ... and the ranges encoded in their accn fields
        ch_a = range_parse(a.accn)
        ch_b = range_parse(b.accn)
        ch_a = (ch_a.seqid, ch_a.start, ch_a.end, "+")
        ch_b = (ch_b.seqid, ch_b.start, ch_b.end, "+")

        om_dist, _ = range_distance(om_a, om_b, distmode="ee")
        ch_dist, _ = range_distance(ch_a, ch_b, distmode="ee")

        # Nothing to report when neither coordinate system shows a gap
        if om_dist <= 0 and ch_dist <= 0:
            continue

        # Single-argument print calls produce the same output on Python 2
        # and Python 3
        print(a)
        print(b)
        print("{0} {1}".format(om_dist, ch_dist))
示例#47
0
def gaps(args):
    """
    %prog gaps OM.bed fastafile

    Create patches around OM gaps.
    """
    from jcvi.formats.bed import uniq
    from jcvi.utils.iter import pairwise

    p = OptionParser(gaps.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ombed, fastafile = args
    ombed = uniq([ombed])
    bed = Bed(ombed)

    for a, b in pairwise(bed):
        om_a = (a.seqid, a.start, a.end, "+")
        om_b = (b.seqid, b.start, b.end, "+")
        ch_a = range_parse(a.accn)
        ch_b = range_parse(b.accn)
        ch_a = (ch_a.seqid, ch_a.start, ch_a.end, "+")
        ch_b = (ch_b.seqid, ch_b.start, ch_b.end, "+")

        om_dist, x = range_distance(om_a, om_b, distmode="ee")
        ch_dist, x = range_distance(ch_a, ch_b, distmode="ee")

        if om_dist <= 0 and ch_dist <= 0:
            continue

        print a
        print b
        print om_dist, ch_dist
示例#48
0
def adjgraph(args):
    """
    %prog adjgraph adjacency.txt subgraph.txt

    Construct adjacency graph for graphviz. The file may look like sample below.
    The lines with numbers are chromosomes with gene order information.

    genome 0
    chr 0
    -1 -13 -16 3 4 -6126 -5 17 -6 7 18 5357 8 -5358 5359 -9 -10 -11 5362 5360
    chr 1
    138 6133 -5387 144 -6132 -139 140 141 146 -147 6134 145 -170 -142 -143
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is kept verbatim.
    import pygraphviz as pgv
    from jcvi.utils.iter import pairwise
    from jcvi.formats.base import SetFile

    p = OptionParser(adjgraph.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    infile, subgraph = args
    subgraph = SetFile(subgraph)
    subgraph = set(x.strip("-") for x in subgraph)

    G = pgv.AGraph(strict=False)  # allow multi-edge
    SG = pgv.AGraph(strict=False)

    palette = ("green", "magenta", "tomato", "peachpuff")
    genome_id = -1
    key = 0
    # `with` closes the input handle, which was previously left open
    with open(infile) as fp:
        for row in fp:
            if row.strip() == "":
                continue

            atoms = row.split()
            tag = atoms[0]
            if tag in ("ChrNumber", "chr"):
                continue

            if tag == "genome":
                # Each "genome" line switches to the next palette colour
                genome_id += 1
                gcolor = palette[genome_id]
                continue

            # Every gene becomes two nodes (its L and R ends); a leading "-"
            # reverses the order in which the two ends are visited
            nodeseq = []
            for atom in atoms:
                gene = atom.strip("-")
                nodeL, nodeR = gene + "L", gene + "R"
                if atom[0] == "-":  # negative strand
                    nodeseq += [nodeR, nodeL]
                else:
                    nodeseq += [nodeL, nodeR]

            for a, b in pairwise(nodeseq):
                G.add_edge(a, b, key, color=gcolor)
                key += 1

                # Mirror the edge into SG when either gene is in the subgraph
                na, nb = a[:-1], b[:-1]
                if na not in subgraph and nb not in subgraph:
                    continue

                # NOTE(review): `key` has already been incremented here, so
                # the SG edge key is one larger than the matching G edge key
                SG.add_edge(a, b, key, color=gcolor)

    G.graph_attr.update(dpi="300")

    with open("graph.dot", "w") as fw:
        G.write(fw)

    with open("subgraph.dot", "w") as fw:
        SG.write(fw)
示例#49
0
文件: agp.py 项目: bennyyu/jcvi
def cut(args):
    """
    %prog cut agpfile bedfile

    Cut at the boundaries of the ranges in the bedfile. Use --shrink to control
    the exact boundaries where you cut.
    """
    # The docstring above is the usage text given to OptionParser.
    p = OptionParser(cut.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    order = agp.order
    newagpfile = agpfile.replace(".agp", ".cut.agp")
    fw = open(newagpfile, "w")

    # Replacement lines for every component named in the bed file
    agp_fixes = defaultdict(list)
    for component, intervals in bed.sub_beds():
        _, agpline = order[component]
        prefix = agpline.object
        component_span = agpline.component_span
        orientation = agpline.orientation

        assert agpline.component_beg, agpline.component_end

        # Cut points: both ends of the component plus each interval's start
        # and (end - 1)
        cuts = set([0, component_span])
        for iv in intervals:
            start, end = iv.start, iv.end - 1
            assert start <= end
            cuts.update((start, end))
        breakpoints = sorted(cuts)

        # One new object per stretch between consecutive cut points
        sum_of_spans = 0
        for n, (left, right) in enumerate(pairwise(breakpoints)):
            fields = [prefix + "_{0}".format(n), 0, 0, 0,
                      'D', component, left + 1, right, orientation]
            sum_of_spans += right - left
            agp_fixes[component].append("\t".join(str(f) for f in fields))

        # The pieces must tile the component exactly
        assert component_span == sum_of_spans

    # Finally write the masked agp
    for agpline in agp:
        if agpline.is_gap or agpline.component_id not in agp_fixes:
            print >> fw, agpline
        else:
            print >> fw, "\n".join(agp_fixes[agpline.component_id])

    fw.close()
    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
示例#50
0
def tsp(edges, constraint_generation=False):
    """
    Calculates shortest cycle that traverses each node exactly once. Also known
    as the Traveling Salesman Problem (TSP).

    The problem is expressed as constraint strings on an LPInstance. With
    `constraint_generation` unset, the balance constraints are combined with
    the MTZ subtour-elimination constraints and solved once. When it is set,
    only the balance constraints are used at first, and one extra constraint
    is added for every cycle found in the solution until a single cycle
    remains (or nothing is selected).

    Returns the sorted list of selected edges, or None when the solver
    selected nothing.
    """
    edges = populate_edge_weights(edges)
    incoming, outgoing, nodes = node_to_edge(edges)

    nedges, nnodes = len(edges), len(nodes)
    L = LPInstance()

    L.add_objective(edges, objective=MINIMIZE)
    balance = []
    # For each node, select exactly 1 incoming and 1 outgoing edge
    for v in nodes:
        incoming_edges = incoming[v]
        outgoing_edges = outgoing[v]
        icc = summation(incoming_edges)
        occ = summation(outgoing_edges)
        balance.append("{0} = 1".format(icc))
        balance.append("{0} = 1".format(occ))

    # Subtour elimination - Miller-Tucker-Zemlin (MTZ) formulation
    # <http://en.wikipedia.org/wiki/Travelling_salesman_problem>
    # Desrochers and laporte, 1991 (DFJ) has a stronger constraint
    # See also:
    # G. Laporte / The traveling salesman problem: Overview of algorithms
    # Edge i is named x{i + 1} in the constraint text below; step variables
    # are numbered right after the edges, starting at nedges + 1.
    start_step = nedges + 1
    # The first node is the reference: it gets no step variable, and edges
    # touching it get no MTZ constraint.
    u0 = nodes[0]
    nodes_to_steps = dict((n, start_step + i) for i, n in enumerate(nodes[1:]))
    # Maps (a, b) to the 0-based position of that edge in `edges`
    edge_store = dict((e[:2], i) for i, e in enumerate(edges))
    mtz = []
    for i, e in enumerate(edges):
        a, b = e[:2]
        if u0 in (a, b):
            continue
        na, nb = nodes_to_steps[a], nodes_to_steps[b]
        con_ab = " x{0} - x{1} + {2}x{3}".format(na, nb, nnodes - 1, i + 1)
        if (b, a) in edge_store:  # This extra term is the stronger DFJ formulation
            j = edge_store[(b, a)]
            con_ab += " + {0}x{1}".format(nnodes - 3, j + 1)
        con_ab += " <= {0}".format(nnodes - 2)
        mtz.append(con_ab)

    # Step variables u_i bound between 1 and n, as additional variables
    # (one bound per non-reference node, i.e. nnodes - 1 of them)
    bounds = []
    for i in xrange(start_step, nedges + nnodes):
        bounds.append(" 1 <= x{0} <= {1}".format(i, nnodes - 1))

    # presumably declares the nedges edge variables -- confirm in LPInstance
    L.add_vars(nedges)

    """
    Constraint generation seek to find 'cuts' in the LP problem, by solving the
    relaxed form. The subtours were then incrementally added to the constraints.
    """
    if constraint_generation:
        L.constraints = balance
        subtours = []
        while True:
            selected, obj_val = L.lpsolve()
            results = sorted(x for i, x in enumerate(edges) if i in selected) \
                            if selected else None
            # Nothing selected: give up and return the empty/None result
            if not results:
                break
            G = edges_to_graph(results)
            cycles = list(nx.simple_cycles(G))
            # A single cycle means the selection is one tour
            if len(cycles) == 1:
                break
            for c in cycles:
                # c + [c[0]] closes the cycle so the last -> first edge is
                # included. NOTE(review): edge_store holds 0-based positions
                # while the MTZ terms above name variables i + 1; what
                # summation() expects is not visible here.
                incident = [edge_store[a, b] for a, b in pairwise(c + [c[0]])]
                icc = summation(incident)
                # At most len - 1 edges of this cycle may be selected together
                subtours.append("{0} <= {1}".format(icc, len(incident) - 1))
            L.constraints = balance + subtours
    else:
        L.constraints = balance + mtz
        # The step variables are added as non-binary, numbered from start_step
        L.add_vars(nnodes - 1, offset=start_step, binary=False)
        L.bounds = bounds
        selected, obj_val = L.lpsolve()
        results = sorted(x for i, x in enumerate(edges) if i in selected) \
                        if selected else None

    return results
示例#51
0
文件: agp.py 项目: bennyyu/jcvi
def mask(args):
    """
    %prog mask agpfile bedfile

    Mask given ranges in componets to gaps.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is left untouched.
    #
    # For every component named in the bed file, the intervals (clipped to the
    # component range) become "N" gap lines and the stretches between them
    # become "D" component lines. The result is written to
    # <agpfile with .agp replaced by .masked.agp>, reindexed, and that file
    # name is returned.
    p = OptionParser(mask.__doc__)
    p.add_option("--split", default=False, action="store_true",
                 help="Split object and create new names [default: %default]")
    p.add_option("--log", default=False, action="store_true",
                 help="Write verbose logs to .masklog file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    # agp lines to replace original ones, keyed by the component
    agp_fixes = defaultdict(list)

    newagpfile = agpfile.replace(".agp", ".masked.agp")
    logfile = bedfile.replace(".bed", ".masklog")
    fw = open(newagpfile, "w")
    if opts.log:
        fwlog = open(logfile, "w")

    for component, intervals in bed.sub_beds():
        if opts.log:
            print >> fwlog, "\n".join(str(x) for x in intervals)
        i, a = simple_agp[component]
        object = a.object
        component_span = a.component_span
        orientation = a.orientation
        if opts.log:
            print >> fwlog, a

        # NOTE: as written this only checks that component_beg is truthy;
        # component_end is the assertion message
        assert a.component_beg, a.component_end
        arange = a.component_beg, a.component_end

        # Make sure `ivs` contain DISJOINT ranges, and located within `arange`
        ivs = []
        for i in intervals:
            iv = range_intersect(arange, (i.start, i.end))
            if iv is not None:
                ivs.append(iv)

        # Sort the ends of `ivs` as well as the arange
        # The component range is widened by one on each side so that the
        # (a + 1, b - 1) arithmetic below lands on the component's own ends
        arange = a.component_beg - 1, a.component_end + 1
        endpoints = sorted(flatten(ivs + [arange]))
        # reverse if component on negative strand
        if orientation == '-':
            endpoints.reverse()

        sum_of_spans = 0
        # assign complements as sequence components
        # Consecutive endpoint pairs alternate: even positions are the kept
        # sequence between masked intervals, odd positions are the intervals
        for i, (a, b) in enumerate(pairwise(endpoints)):
            if orientation == '-':
                a, b = b, a
            # Any orientation other than '+' or '-' is written out as '+'
            if orientation not in ('+', '-'):
                orientation = '+'

            # With --split every kept piece gets its own object name.
            # NOTE: `i / 2` relies on Python 2 integer division
            oid = object + "_{0}".format(i / 2) if opts.split else object
            aline = [oid, 0, 0, 0]
            if i % 2 == 0:
                cspan = b - a - 1
                aline += ['D', component, a + 1, b - 1, orientation]
                is_gap = False
            else:
                cspan = b - a + 1
                aline += ["N", cspan, "fragment", "yes"]
                is_gap = True
            # Empty pieces (e.g. an interval touching the component end) are
            # dropped
            if cspan <= 0:
                continue

            sum_of_spans += cspan
            aline = "\t".join(str(x) for x in aline)
            # In --split mode the gap lines are left out of the replacement
            if not (opts.split and is_gap):
                agp_fixes[component].append(aline)

            if opts.log:
                print >> fwlog, aline

        # Kept pieces plus gaps must add up to the original component span
        assert component_span == sum_of_spans
        if opts.log:
            print >> fwlog

    # Finally write the masked agp
    for a in agp:
        if not a.is_gap and a.component_id in agp_fixes:
            print >> fw, "\n".join(agp_fixes[a.component_id])
        else:
            print >> fw, a

    fw.close()
    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
示例#52
0
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is kept verbatim.
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option(
        "--clique",
        default=False,
        action="store_true",
        help="Populate clique instead of linear path [default: %default]")
    p.add_option(
        "--maxdist",
        default=100000,
        type="int",
        help="Create edge within certain distance [default: %default]")
    p.add_option("--verbose",
                 default=False,
                 action="store_true",
                 help="Print verbose reports to stdout [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        # Either link every pair of hits on this query, or only neighbours
        iterator = combinations(blines, 2) if clique else pairwise(blines)
        for a, b in iterator:
            asub, bsub = a.subject, b.subject
            if asub == bsub:
                continue

            # Distance is measured between the two hits on the query
            arange = (a.query, a.qstart, a.qstop, "+")
            brange = (b.query, b.qstart, b.qstop, "+")
            dist, oo = range_distance(arange, brange, distmode="ee")
            if dist > maxdist:
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(BiEdge(asub, bsub, atag, btag))

    g.write("graph.txt")
    #g.draw("graph.pdf")

    logging.debug(str(g))
    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if opts.verbose:
            # Single-argument print calls give the same output on Python 2
            # and Python 3
            print(m)
            print(oo)

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".\
                  format(npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    scaffolded = set()
    nsingletons = 0
    # `with` closes the AGP handle even if order_to_agp() raises
    with open(agpfile, "w") as fwagp:
        for i, oo in enumerate(paths):
            ctgorder = [(str(ctg), ("+" if strand else "-"))
                        for ctg, strand in oo]
            scaffolded |= set(ctg for ctg, strand in ctgorder)
            # Named `name` rather than `object` to avoid shadowing the builtin
            name = "pmol_{0:04d}".format(i)
            order_to_agp(name, ctgorder, sizes.mapping, fwagp)

        # Get the singletons as well
        for ctg, size in sizes.iter_sizes():
            if ctg in scaffolded:
                continue

            order_to_agp(ctg, [(ctg, "+")], sizes.mapping, fwagp)
            nsingletons += 1

    logging.debug("Written {0} unscaffolded singletons.".format(nsingletons))
    logging.debug("AGP file written to `{0}`.".format(agpfile))
示例#53
0
文件: bed.py 项目: radaniba/jcvi
 def links(self):
     r = []
     for s, sb in self.sub_beds():
         for a, b in pairwise(sb):
             r.append(((a.accn, a.strand), (b.accn, b.strand)))
     return r
示例#54
0
文件: patch.py 项目: JinfengChen/jcvi
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is left untouched.
    #
    # Outline of what follows: adjacent bed lines are scanned for mate pairs
    # in which exactly one mate lies on a sequence whose id starts with
    # --prefix; for each such sequence a candidate interval on the other
    # sequence is derived from the library's min/max, and the interval with
    # the most support is kept when it has at least --minlinks links.
    # Decisions are written to <matesfile stem>.log.
    from jcvi.utils.iter import pairwise
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option("--prefix", default="scaffold",
                 help="Prefix of the unplaced scaffolds [default: %default]")
    p.add_option("--minlinks", default=3, type="int",
                 help="Minimum number of links to place [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"
    log = open(logfile, "w")

    mf = MatesFile(matesfile)
    # Largest `max` over all libraries; used below to reject candidate
    # intervals that are spread too widely
    maxdist = max(x.max for x in mf.libraries.values())
    logging.debug("Max separation: {0}".format(maxdist))

    prefix = opts.prefix
    minlinks = opts.minlinks

    is_unplaced = lambda x: x.startswith(prefix)
    bed = Bed(bedfile, sorted=False)
    beds = []
    unplaced = defaultdict(list)

    # Only adjacent bed lines are compared: a pair is used when line b is the
    # mate recorded for line a in the mates file
    for a, b in pairwise(bed):
        aname, bname = a.accn, b.accn
        aseqid, bseqid = a.seqid, b.seqid

        if aname not in mf:
            continue

        pa, la = mf[aname]
        if pa != bname:
            continue

        # Keep only pairs with exactly one mate on an unplaced sequence
        ia = is_unplaced(aseqid)
        ib = is_unplaced(bseqid)
        if ia == ib:
            continue

        # Normalise so that `b` is always the mate on the unplaced sequence
        if ia:
            a, b = b, a

        unplaced[b.seqid].append((a, b))
        # NOTE(review): `beds` is rebound by the loop below, so the list
        # built here is not read again in the code visible here
        beds.extend([a, b])

    sizes = Sizes(fastafile)
    candidatebed = Bed()
    cbeds = []
    # For each unplaced scaffold, find most likely placement and orientation
    for scf, beds in sorted(unplaced.items()):
        print >> log
        ranges = []
        for a, b in beds:
            aname, astrand = a.accn, a.strand
            bname, bstrand = b.accn, b.strand
            aseqid, bseqid = a.seqid, b.seqid
            pa, lib = mf[aname]

            print >> log, a
            print >> log, b

            # Mates on the same strand imply the unplaced scaffold has to be
            # flipped; its mate is reverse-complemented to match
            flip_b = (astrand == bstrand)
            fbstrand = '-' if flip_b else '+'
            if flip_b:
                b.reverse_complement(sizes)

            lmin, lmax = lib.min, lib.max

            # Candidate start of the scaffold on the placed sequence, from
            # the mate positions and the library's min/max
            L = sizes.get_size(scf)
            assert astrand in ('+', '-')
            if astrand == '+':
                offset = a.start - b.end
                sstart, sstop = offset + lmin, offset + lmax
            else:
                offset = a.end - b.start + L
                sstart, sstop = offset - lmax, offset - lmin

            # Prevent out of range error
            size = sizes.get_size(aseqid)
            sstart = max(0, sstart)
            sstop = max(0, sstop)
            sstart = min(size - 1, sstart)
            sstop = min(size - 1, sstop)

            start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
            print >> log, "*" + "\t".join(str(x) for x in start_range)
            ranges.append(start_range)

        mranges = [x[:3] for x in ranges]
        # Determine placement by finding the interval with the most support
        # presumably ranges_depth() returns an iterable of per-sequence lists
        # of (seqid, start, end, depth) -- confirm against its definition
        rd = ranges_depth(mranges, sizes.mapping, verbose=False)
        alldepths = []
        for depth in rd:
            alldepths.extend(depth)
        print >> log, alldepths

        # The last field of each entry is compared as the depth
        maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
        if maxdepth < minlinks:
            print >> log, "Insufficient links ({0} < {1})".format(maxdepth, minlinks)
            continue

        # All intervals at the maximum depth must sit on a single sequence
        candidates = [x for x in alldepths if x[-1] == maxdepth]
        nseqids = len(set(x[0] for x in candidates))
        msg = "Multiple conflicting candidates found"
        if nseqids != 1:
            print >> log, msg
            continue

        # ... and must not be spread over more than the largest library size
        # (the same message is logged for this case)
        seqid, mmin, mmax, depth = candidates[0]
        mmin, mmax = range_minmax([x[1:3] for x in candidates])
        if (mmax - mmin) > maxdist:
            print >> log, msg
            continue

        # Determine orientation by voting
        nplus, nminus = 0, 0
        arange = (seqid, mmin, mmax)
        for sid, start, end, sf, sc, fbstrand in ranges:
            brange = (sid, start, end)
            if range_overlap(arange, brange):
                if fbstrand == '+':
                    nplus += 1
                else:
                    nminus += 1

        # Ties go to '+'
        fbstrand = '+' if nplus >= nminus else '-'

        candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
        bedline = BedLine("\t".join((str(x) for x in candidate)))
        cbeds.append(bedline)
        print >> log, "Plus: {0}, Minus: {1}".format(nplus, nminus)
        print >> log, candidate

    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".\
                    format(len(candidatebed)))
    log.close()

    # Accepted placements go to <matesfile stem>.candidate.bed
    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)