Пример #1
0
def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Can input multiple anchor files.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is kept in `%prog` form.
    p = OptionParser(group.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    # Feed every anchor pair from every input file into one Grouper.
    anchorfiles = args
    groups = Grouper()
    for anchorfile in anchorfiles:
        ac = AnchorFile(anchorfile)
        for a, b, idx in ac.iter_pairs():
            groups.join(a, b)

    logging.debug("Created {0} groups with {1} members.".\
                  format(len(groups), groups.num_members))

    outfile = opts.outfile
    fw = must_open(outfile, "w")
    try:
        # One group per line, members sorted and comma-separated.
        # fw.write() behaves the same on Python 2 and 3, unlike the
        # Python-2-only `print >> fw` statement.
        for g in groups:
            fw.write(",".join(sorted(g)) + "\n")
    finally:
        # Release the handle even if writing fails part-way through.
        fw.close()

    return outfile
Пример #2
0
def fuse(args):
    """
    %prog fuse *.bed *.anchors

    Fuse gene orders based on anchors file.
    """
    from jcvi.algorithms.graph import BiGraph

    # The docstring above is the usage text shown by the option parser.
    parser = OptionParser(fuse.__doc__)
    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    # Sort positional arguments by extension; anything else is ignored.
    bedfiles, anchorfiles = [], []
    for name in args:
        if name.endswith(".bed"):
            bedfiles.append(name)
        if name.endswith(".anchors"):
            anchorfiles.append(name)

    # TODO: Use Markov clustering to sparsify the edges
    families = Grouper()
    for anchorfile in anchorfiles:
        for gene_a, gene_b, _ in AnchorFile(anchorfile).iter_pairs():
            families.join(gene_a, gene_b)

    # Only genes that occur in some anchor pair are loaded from the BED files.
    allowed = set(families.keys())
    nfamilies, nmembers = len(families), len(allowed)
    logging.debug("Total families: {}, Gene members: {}"
                  .format(nfamilies, nmembers))

    # TODO: Use C++ implementation of BiGraph() when available
    # For now just serialize this to the disk
    graph = BiGraph()
    for bedfile in bedfiles:
        restricted = Bed(bedfile, include=allowed)
        print_edges(graph, restricted, families)
Пример #3
0
def fuse(args):
    """
    %prog fuse *.bed *.anchors

    Fuse gene orders based on anchors file.
    """
    from jcvi.algorithms.graph import BiGraph

    # The docstring above is reused as the command's usage text.
    optparser = OptionParser(fuse.__doc__)
    opts, args = optparser.parse_args(args)

    if not args:
        sys.exit(not optparser.print_help())

    # Positional arguments are told apart purely by their extension.
    anchorfiles = [name for name in args if name.endswith(".anchors")]
    bedfiles = [name for name in args if name.endswith(".bed")]

    # TODO: Use Markov clustering to sparsify the edges
    families = Grouper()
    for path in anchorfiles:
        anchors = AnchorFile(path)
        for left, right, _ in anchors.iter_pairs():
            families.join(left, right)

    # Genes seen in at least one anchor pair; used to restrict the BED files.
    allowed = set(families.keys())
    summary = "Total families: {}, Gene members: {}".format(
        len(families), len(allowed))
    logging.debug(summary)

    # TODO: Use C++ implementation of BiGraph() when available
    # For now just serialize this to the disk
    for path in bedfiles:
        print_edges(Bed(path, include=allowed), families)
Пример #4
0
def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Can input multiple anchor files.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    p = OptionParser(group.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    # Join the two genes of every anchor pair, across all input files.
    anchorfiles = args
    groups = Grouper()
    for anchorfile in anchorfiles:
        ac = AnchorFile(anchorfile)
        for a, b, idx in ac.iter_pairs():
            groups.join(a, b)

    logging.debug("Created {0} groups with {1} members.".\
                  format(len(groups), groups.num_members))

    outfile = opts.outfile
    fw = must_open(outfile, "w")
    try:
        # Write one sorted, comma-separated group per line. fw.write() is
        # used because `print >> fw` only works as intended on Python 2.
        for g in groups:
            fw.write(",".join(sorted(g)) + "\n")
    finally:
        # Always close the output, including when a write raises.
        fw.close()

    return outfile
Пример #5
0
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    # The docstring above is the usage text passed to the option parser.
    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale
    ac = AnchorFile(anchorsfile)
    # Query gene -> list of subject genes it is paired with.
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn
        if qaccn not in pairs:
            continue
        for s in pairs[qaccn]:
            si, s = sorder[s]
            sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn
        # NOTE(review): everything below runs once per query gene, after the
        # loop above, so only the LAST paired subject gene is emitted.
        # Confirm whether one line per pair was intended.
        if switch:
            qseqid, sseqid = sseqid, qseqid
            qstart, sstart = sstart, qstart
            qend, send = send, qend
            qaccn, saccn = saccn, qaccn
        if scale:
            # The aligned position is divided by the factor.
            sstart /= scale
        try:
            newsseqid = get_number(sseqid)
        except ValueError:
            # Call-form raise is valid on both Python 2 and Python 3; the
            # old `raise ValueError, "..."` form is a SyntaxError on Python 3.
            raise ValueError("`{0}` is on `{1}` with no number to extract".
                             format(saccn, sseqid))
        bedline = "\t".join(str(x) for x in (qseqid, qstart - 1, qend,
                            "{0}:{1}".format(newsseqid, sstart)))
        bd.add(bedline)

    bd.print_to_file(filename=opts.outfile, sorted=True)
Пример #6
0
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed, BedLine
    from jcvi.formats.base import get_number

    # The docstring above is the usage text passed to the option parser.
    parser = OptionParser(bed.__doc__)
    parser.add_option("--switch",
                      default=False,
                      action="store_true",
                      help="Switch reference and aligned map elements")
    parser.add_option("--scale",
                      type="float",
                      help="Scale the aligned map distance by factor")
    parser.set_beds()
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    anchorsfile = args[0]

    # Query gene -> every subject gene it is anchored to.
    partners = defaultdict(list)
    for query, subject, _ in AnchorFile(anchorsfile).iter_pairs():
        partners[query].append(subject)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, parser, opts)
    converted = Bed()
    for qline in qbed:
        if qline.accn not in partners:
            continue

        # Every partner is looked up, but only the last one survives the loop
        # and is written out for this query gene.
        for name in partners[qline.accn]:
            _, sline = sorder[name]

        # --switch swaps which side supplies the interval and which side
        # supplies the "<number>:<position>" label.
        if opts.switch:
            reference, aligned = sline, qline
        else:
            reference, aligned = qline, sline

        position = aligned.start
        if opts.scale:
            position /= opts.scale

        label = "{0}:{1}".format(get_number(aligned.seqid), position)
        fields = (reference.seqid, reference.start - 1, reference.end, label)
        converted.append(BedLine("\t".join(str(x) for x in fields)))

    converted.print_to_file(filename=opts.outfile, sorted=True)
Пример #7
0
    def from_block_orientation(cls,
                               anchorfile,
                               qbed,
                               sbed,
                               forward_color="#e7298a",
                               reverse_color="#3690c0"):
        """Build a palette that colors each anchor block by its orientation.

        Blocks are numbered from 1 in the order they appear in the anchors
        file; a block whose orientation is "-" gets `reverse_color`, any other
        block gets `forward_color`.

        Args:
            anchorfile (str): Path to the .anchors file
            qbed (BedFile): Query BED
            sbed (BedFile): Subject BED
            forward_color (str, optional): Color of forward block. Defaults to "#e7298a".
            reverse_color (str, optional): Color of reverse block. Defaults to "#3690c0".

        Returns:
            An instance of `cls` constructed with `palettedict` set to the
            block_id -> color mapping.
        """
        query_order = qbed.order
        subject_order = sbed.order
        colors = {}

        for block_id, block in enumerate(AnchorFile(anchorfile).blocks, 1):
            # Split the (query, subject, score) triples into columns.
            query_genes, subject_genes, _ = zip(*block)
            query_hits = [query_order[gene] for gene in query_genes]
            subject_hits = [subject_order[gene] for gene in subject_genes]
            # Each hit is a 2-tuple; only its first element is needed here.
            query_ranks, _ = zip(*query_hits)
            subject_ranks, _ = zip(*subject_hits)

            if get_orientation(query_ranks, subject_ranks) == "-":
                colors[block_id] = reverse_color
            else:
                colors[block_id] = forward_color

        return cls(palettedict=colors)
Пример #8
0
def pairs(args):
    """
    %prog pairs anchorsfile prefix

    Convert anchorsfile to pairsfile.
    """
    # The docstring above is the usage text passed to the option parser.
    p = OptionParser(pairs.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    anchorfile, prefix = args
    outfile = prefix + ".pairs"

    # Parse the input first so an unreadable anchors file does not leave a
    # truncated output file behind.
    af = AnchorFile(anchorfile)
    blocks = af.blocks
    # Zero-pad block numbers to the width of the largest one.
    pad = len(str(len(blocks)))
    npairs = 0
    # `with` closes the output even if a block fails to convert.
    with open(outfile, "w") as fw:
        for i, block in enumerate(blocks):
            block_id = "{0}{1:0{2}d}".format(prefix, i + 1, pad)
            lines = []
            for q, s, score in block:
                npairs += 1
                # Drop any "L" characters from the score column.
                score = score.replace("L", "")
                lines.append("\t".join((q, s, score, block_id)))
            print("\n".join(sorted(lines)), file=fw)

    logging.debug("A total of {0} pairs written to `{1}`.".format(
        npairs, outfile))
Пример #9
0
def filter_exclude(blast_list, exclude=None):
    """ Filter gene pairs from an excluded list

    Yields every item of `blast_list` whose (query, subject) pair is not
    listed, in either orientation, in the excluded anchors file.

    Args:
        blast_list (List[BlastLine]): List of BlastLines
        exclude (str, optional): Path to the excluded anchors file. Defaults to
            None, in which case nothing is filtered out.

    Yields:
        The items of `blast_list` that survive the filter, in input order.
    """
    # With no exclusion file there is nothing to filter; previously the
    # documented default (None) was handed straight to AnchorFile.
    if not exclude:
        for line in blast_list:
            yield line
        return

    from jcvi.compara.synteny import AnchorFile

    # Store both orientations so the lookup is direction-independent.
    excluded_pairs = set()
    for a, b, _ in AnchorFile(exclude).iter_pairs():
        excluded_pairs.add((a, b))
        excluded_pairs.add((b, a))

    for line in blast_list:
        if (line.query, line.subject) in excluded_pairs:
            continue
        yield line
Пример #10
0
def anchor2tsv(args):
    """Print the blocks of an anchors file as a tab-separated table.

    A header row (bid, gid1, gid2, score) is printed first, followed by one
    row per anchor. Block ids are "b" plus the 1-based block number,
    zero-padded to the width returned by `ndigit` for the block count.

    Args:
        args: Object whose `fi` attribute is the path to the anchors file.
    """
    blocks = AnchorFile(args.fi).blocks
    bid_template = "b%%0%dd" % ndigit(len(blocks))
    print("\t".join('bid gid1 gid2 score'.split()))
    for number, block in enumerate(blocks, 1):
        bid = bid_template % number
        for a, b, score in block:
            print("\t".join((bid, a, b, score)))
Пример #11
0
def fuse(args):
    """
    %prog fuse *.bed *.anchors

    Fuse gene orders based on anchors file.
    """
    # The docstring above is the usage text passed to the option parser.
    p = OptionParser(fuse.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    # Positional arguments are split by extension. The BED files are
    # collected but not consumed anywhere in this version of the command.
    bedfiles = [x for x in args if x.endswith(".bed")]
    anchorfiles = [x for x in args if x.endswith(".anchors")]
    aligned_genes = Grouper()
    for anchorfile in anchorfiles:
        af = AnchorFile(anchorfile)
        for a, b, block_id in af.iter_pairs():
            aligned_genes.join(a, b)

    # Parenthesised single-argument print gives identical output on Python 2
    # and is valid syntax on Python 3 (the bare statement form is not).
    print(list(aligned_genes))
    logging.debug("Total aligned genes: {}".format(len(aligned_genes)))
Пример #12
0
def read_clusters(qa_file, qorder, sorder):
    """Load the blocks of an anchors file as lists of positioned matches.

    Args:
        qa_file (str): Path to the anchors file.
        qorder (dict): Maps a query gene name to an (index, record) pair.
        sorder (dict): Maps a subject gene name to an (index, record) pair.

    Returns:
        list: One list per block; each entry is
        ((query_seqid, query_index), (subject_seqid, subject_index), score).
    """

    def locate(qgene, sgene, score):
        # Look up both genes first, then read the seqid of each record.
        qindex, qrecord = qorder[qgene]
        sindex, srecord = sorder[sgene]
        qseqid, sseqid = qrecord.seqid, srecord.seqid
        return (qseqid, qindex), (sseqid, sindex), score

    return [
        [locate(qgene, sgene, score) for qgene, sgene, score in block]
        for block in AnchorFile(qa_file).blocks
    ]
Пример #13
0
def zipbed(args):
    """
    %prog zipbed species.bed collinear.anchors

    Build ancestral contig from collinear blocks. For example, to build pre-rho
    order, use `zipbed rice.bed rice.rice.1x1.collinear.anchors`. The algorithms
    proceeds by interleaving the genes together.
    """
    # The docstring above is the usage text passed to the option parser.
    p = OptionParser(zipbed.__doc__)
    p.add_option("--prefix",
                 default="b",
                 help="Prefix for the new seqid [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, anchorfile = args
    prefix = opts.prefix
    bed = Bed(bedfile)
    order = bed.order
    newbedfile = prefix + ".bed"

    af = AnchorFile(anchorfile)
    blocks = af.blocks
    # Zero-pad block numbers to the width of the largest one.
    pad = len(str(len(blocks)))
    # The output used to be opened and never closed; `with` guarantees it is
    # flushed and closed before the final log message.
    with open(newbedfile, "w") as fw:
        for i, block in enumerate(blocks):
            block_id = "{0}{1:0{2}d}".format(prefix, i + 1, pad)
            pairs = []
            for q, s, score in block:
                # Both genes are looked up in the same (single) BED order.
                qi, q = order[q]
                si, s = order[s]
                pairs.append((qi, si))
            newbed = list(interleave_pairs(pairs))
            # `pos` is the gene's slot within the rebuilt block; it has its
            # own name so it no longer shadows the block index `i`.
            for pos, b in enumerate(newbed):
                accn = bed[b].accn
                print("\t".join(str(x) for x in (block_id, pos, pos + 1, accn)),
                      file=fw)

    logging.debug("Reconstructed bedfile written to `{0}`.".format(newbedfile))
Пример #14
0
def collinear(args):
    """
    %prog collinear a.b.anchors

    Reduce synteny blocks to strictly collinear, use dynamic programming in a
    procedure similar to DAGchainer.
    """
    # The docstring above is the usage text passed to the option parser.
    p = OptionParser(collinear.__doc__)
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (anchorfile, ) = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    af = AnchorFile(anchorfile)
    # Output sits next to the input: <name>.collinear.anchors
    newanchorfile = anchorfile.rsplit(".", 1)[0] + ".collinear.anchors"

    blocks = af.blocks
    # `with` closes the output even if a lookup or get_collinear() raises.
    with open(newanchorfile, "w") as fw:
        for block in blocks:
            # Block separator line
            print("#" * 3, file=fw)

            # Translate gene names to positions and scores to numbers
            # before handing the block to get_collinear().
            iblock = []
            for q, s, score in block:
                qi, q = qorder[q]
                si, s = sorder[s]
                score = get_number(score)
                iblock.append([qi, si, score])

            block = get_collinear(iblock)

            # Translate the surviving positions back to gene names.
            for q, s, score in block:
                q = qbed[q].accn
                s = sbed[s].accn
                print("\t".join((q, s, str(score))), file=fw)
Пример #15
0
def mergechrom(args):
    """
    %prog mergechrom a.b.anchors

    merge synteny blocks on the same chromosome
    """
    # The docstring above is the usage text passed to the option parser.
    p = OptionParser(mergechrom.__doc__)
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    af = AnchorFile(anchorfile)
    # Output sits next to the input: <name>.mergechrom.anchors
    newanchorfile = anchorfile.rsplit(".", 1)[0] + ".mergechrom.anchors"

    # Gene name -> seqid, for each side.
    qchrom_dic = {b.accn: b.seqid for b in qbed}
    schrom_dic = {b.accn: b.seqid for b in sbed}

    # "<qchrom>_<schrom>" -> indices of the blocks on that chromosome pair.
    # A block is classified by its first anchor only.
    block_dic = dict()
    blocks = af.blocks
    for i, block in enumerate(blocks):
        q, s, score = block[0]
        qchrom, schrom = qchrom_dic[q], schrom_dic[s]
        k = "%s_%s" % (qchrom, schrom)
        block_dic.setdefault(k, []).append(i)

    # `with` closes the output even if writing fails part-way through.
    with open(newanchorfile, "w") as fw:
        for idxs in block_dic.values():
            # One merged block per chromosome pair
            print("#" * 3, file=fw)
            for i in idxs:
                for q, s, score in blocks[i]:
                    print("\t".join((q, s, str(score))), file=fw)

    print("%d blocks merged to %d" % (len(blocks), len(block_dic)))
Пример #16
0
def read_clusters(qa_file, qorder, sorder):
    """Parse an anchors file into clusters of positioned gene matches.

    Args:
        qa_file (str): Path to input file
        qorder (dict): Maps a query feature name to an (index, record) pair
        sorder (dict): Maps a subject feature name to an (index, record) pair

    Returns:
        List: One list per block in the file. Each element is a triple
        ((query_seqid, query_index), (subject_seqid, subject_index), score).
    """
    clusters = []
    for block in AnchorFile(qa_file).blocks:
        members = []
        for query_name, subject_name, score in block:
            query_index, query_record = qorder[query_name]
            subject_index, subject_record = sorder[subject_name]
            query_pos = (query_record.seqid, query_index)
            subject_pos = (subject_record.seqid, subject_index)
            members.append((query_pos, subject_pos, score))
        clusters.append(members)
    return clusters
Пример #17
0
def dotplot_main(args):
    """Command-line entry point: parse options, draw a dot plot and save it.

    Args:
        args (list): Command-line arguments. Exactly one positional argument
            is expected: an anchors file, or a `.ks` file which is first
            converted to `<file>.anchors`.
    """
    p = OptionParser(__doc__)
    p.set_beds()
    p.add_option("--synteny", default=False, action="store_true",
            help="Run a fast synteny scan and display blocks [default: %default]")
    p.add_option("--cmaptext", help="Draw colormap box on the bottom-left corner")
    p.add_option("--vmin", dest="vmin", type="float", default=0,
            help="Minimum value in the colormap [default: %default]")
    p.add_option("--vmax", dest="vmax", type="float", default=2,
            help="Maximum value in the colormap [default: %default]")
    p.add_option("--genomenames", type="string", default=None,
            help="genome names for labeling axes in the form of qname_sname, " \
            "eg. \"Vitis vinifera_Oryza sativa\"")
    p.add_option("--nmax", dest="sample_number", type="int", default=10000,
            help="Maximum number of data points to plot [default: %default]")
    p.add_option("--minfont", type="int", default=4,
            help="Do not render labels with size smaller than")
    p.add_option("--colormap",
            help="Two column file, block id to color mapping [default: %default]")
    p.add_option("--nosort", default=False, action="store_true",
            help="Do not sort the seqids along the axes")
    p.add_option("--nosep", default=False, action="store_true",
            help="Do not add contig lines")
    p.add_option("--nostdpf", default=False, action="store_true",
            help="Do not standardize contig names")
    p.add_option("--skipempty", default=False, action="store_true",
            help="Skip seqids that do not have matches")
    p.add_option("--title", help="Title of the dot plot")
    p.set_outfile(outfile=None)
    opts, args, iopts = p.set_image_options(args, figsize="8x8",
                                            style="dark", dpi=90, cmap="copper")

    if len(args) != 1:
        sys.exit(not p.print_help())

    # --colormap is a path; turn it into a Palette only when given.
    palette = opts.colormap
    if palette:
        palette = Palette(palette)

    anchorfile, = args
    cmaptext = opts.cmaptext
    if anchorfile.endswith(".ks"):
        from jcvi.apps.ks import KsFile

        logging.debug("Anchors contain Ks values")
        # Default colormap caption for Ks input when --cmaptext is not set.
        cmaptext = cmaptext or "*Ks* values"
        anchorksfile = anchorfile + ".anchors"
        if need_update(anchorfile, anchorksfile):
            ksfile = KsFile(anchorfile)
            ksfile.print_to_anchors(anchorksfile)
        anchorfile = anchorksfile

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts,
                sorted=(not opts.nosort))

    if opts.skipempty:
        ac = AnchorFile(anchorfile)
        # For a self comparison both axes deliberately share ONE set, so a
        # seqid hit on either axis is kept on both.
        if is_self:
            qseqids = sseqids = set()
        else:
            qseqids, sseqids = set(), set()

        for pair in ac.iter_pairs():
            q, s = pair[:2]
            qi, q = qorder[q]
            si, s = sorder[s]
            qseqids.add(q.seqid)
            sseqids.add(s.seqid)

        if is_self:
            qbed = sbed = subset_bed(qbed, qseqids)
        else:
            qbed = subset_bed(qbed, qseqids)
            sbed = subset_bed(sbed, sseqids)

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([.1, .1, .8, .8])  # the dot plot

    # Pass the local `cmaptext`, not `opts.cmaptext`: the latter dropped the
    # "*Ks* values" default computed above for .ks input.
    dotplot(anchorfile, qbed, sbed, fig, root, ax,
            vmin=opts.vmin, vmax=opts.vmax, is_self=is_self,
            synteny=opts.synteny, cmap_text=cmaptext, cmap=iopts.cmap,
            genomenames=opts.genomenames, sample_number=opts.sample_number,
            minfont=opts.minfont, palette=palette, sep=(not opts.nosep),
            title=opts.title, stdpf=(not opts.nostdpf))

    # Default image name: anchors file with its extension replaced by the
    # chosen image format.
    image_name = opts.outfile or \
            (op.splitext(anchorfile)[0] + "." + opts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    fig.clear()
Пример #18
0
def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    # The docstring above is the usage text passed to the option parser.
    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh",
                 action="store_true",
                 help="Disable RBH hits [default: %default]")
    p.add_option("--pctid",
                 default=0,
                 type="int",
                 help="Percent id cutoff for RBH hits [default: %default]")
    p.add_option("--cscore",
                 default=90,
                 type="int",
                 help="C-score cutoff for RBH hits [default: %default]")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    # First whitespace-delimited token of each ploidy line -> its line index.
    with open(ploidy) as fp:
        genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))

    # From here on `ploidy` is the parsed DictFile, not the path.
    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(
        len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    npairs = 0
    # Both handles are closed on exit; the cscore handle used to be leaked.
    with open(cscorefile) as fp, open(weightsfile, "w") as fw:
        for row in fp:
            a, b, c, pct = row.split()
            c, pct = float(c), float(pct)
            c = int(c * 100)
            if (a, b) not in pairs:
                # Pairs absent from the anchors file must pass the RBH
                # filters, and are then down-weighted.
                if norbh:
                    continue
                if c < cs:
                    continue
                if pct < pctid:
                    continue
                # Floor division keeps `c` an integer on Python 3 as well,
                # matching what `c /= 10` produced on Python 2.
                c //= 10  # This severely penalizes RBH against synteny

            # fw.write() replaces the Python-2-only `print >> fw` statement.
            fw.write("\t".join((a, b, str(c))) + "\n")
            npairs += 1

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
Пример #19
0
def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    # The docstring above is the usage text passed to the option parser.
    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh", action="store_true",
                 help="Disable RBH hits [default: %default]")
    p.add_option("--pctid", default=0, type="int",
                 help="Percent id cutoff for RBH hits [default: %default]")
    p.add_option("--cscore", default=90, type="int",
                 help="C-score cutoff for RBH hits [default: %default]")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    # Map the first token of each line in the ploidy file to its line index.
    with open(ploidy) as fp:
        genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))

    # `ploidy` is rebound from the file path to the parsed DictFile.
    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    npairs = 0
    # Context managers close both files; the input handle was never closed.
    with open(cscorefile) as fp, open(weightsfile, "w") as fw:
        for row in fp:
            a, b, c, pct = row.split()
            c, pct = float(c), float(pct)
            c = int(c * 100)
            if (a, b) not in pairs:
                # Only pairs missing from the anchors file are subject to
                # the RBH cutoffs and the penalty below.
                if norbh:
                    continue
                if c < cs:
                    continue
                if pct < pctid:
                    continue
                # `//=` yields the same integer on Python 2 and 3; plain
                # `/=` would write floats under Python 3.
                c //= 10  # This severely penalizes RBH against synteny

            # Portable replacement for the Python-2-only `print >> fw`.
            fw.write("\t".join((a, b, str(c))) + "\n")
            npairs += 1

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
Пример #20
0
def cluster(args):
    """
    %prog cluster blastfile anchorfile --qbed qbedfile --sbed sbedfile

    Cluster the segments and form PAD. This is the method described in Tang et
    al. (2010) PNAS paper. The anchorfile defines a list of synteny blocks,
    based on which the genome on one or both axis can be chopped up into pieces
    and clustered.
    """
    from jcvi.utils.range import Range

    # The docstring above is the usage text passed to the option parser.
    p = OptionParser(cluster.__doc__)
    p.set_beds()
    p.add_option("--minsize", default=10, type="int",
                 help="Only segment using blocks >= size [default: %default]")
    p.add_option("--path", default="~/scratch/bin",
                 help="Path to the CLUSTER 3.0 binary [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, anchorfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    minsize = opts.minsize
    ac = AnchorFile(anchorfile)
    qranges, sranges = [], []
    qextra = [x[1:] for x in qbed.get_breaks()]
    sextra = [x[1:] for x in sbed.get_breaks()]

    # Running block number; named `block_no` to avoid shadowing builtin id().
    block_no = 0
    for block in ac.iter_blocks(minsize=minsize):
        # list(...) is required on Python 3, where zip() is not subscriptable.
        q, s = list(zip(*block))[:2]
        q = [qorder[x][0] for x in q]
        s = [sorder[x][0] for x in s]
        minq, maxq = min(q), max(q)
        mins, maxs = min(s), max(s)
        block_no += 1

        # Span of the block on each axis, tagged with the block number.
        qr = Range("0", minq, maxq, maxq - minq, block_no)
        sr = Range("0", mins, maxs, maxs - mins, block_no)
        qranges.append(qr)
        sranges.append(sr)

    qpads = list(get_segments(qranges, qextra))
    spads = list(get_segments(sranges, sextra))

    suffix = ".pad.bed"
    qpf = opts.qbed.split(".")[0]
    spf = opts.sbed.split(".")[0]
    qpadfile = qpf + suffix
    spadfile = spf + suffix
    qnpads, qpadnames = write_PAD_bed(qpadfile, qpf, qpads, qbed)
    snpads, spadnames = write_PAD_bed(spadfile, spf, spads, sbed)

    qpadbed, spadbed = Bed(qpadfile), Bed(spadfile)

    logmp = make_arrays(blastfile, qpadbed, spadbed, qpadnames, spadnames)
    m, n = logmp.shape

    matrixfile = ".".join((qpf, spf, "logmp.txt"))
    # `with` closes the matrix file even if formatting a row fails.
    with open(matrixfile, "w") as fw:
        header = ["o"] + spadnames
        print("\t".join(header), file=fw)
        # range() exists on both Python 2 and 3; xrange() is Python 2 only.
        for i in range(m):
            row = [qpadnames[i]] + ["{0:.1f}".format(x) for x in logmp[i, :]]
            print("\t".join(row), file=fw)

    # Run CLUSTER 3.0 (Pearson correlation, average linkage)
    cmd = op.join(opts.path, "cluster")
    cmd += " -g 2 -e 2 -m a -f {0}".format(matrixfile)
    pf = matrixfile.rsplit(".", 1)[0]
    cdtfile = pf + ".cdt"
    if need_update(matrixfile, cdtfile):
        sh(cmd)
Пример #21
0
            help="Skip seqids that do not have matches")
    opts, args, iopts = p.set_image_options(sys.argv[1:], figsize="8x8",
                                            style="dark", dpi=90, cmap="copper")

    if len(args) != 1:
        sys.exit(not p.print_help())

    palette = opts.colormap
    if palette:
        palette = Palette(palette)

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    if opts.skipempty:
        ac = AnchorFile(anchorfile)
        if is_self:
            qseqids = sseqids = set()
        else:
            qseqids, sseqids = set(), set()

        for pair in ac.iter_pairs():
            q, s = pair[:2]
            qi, q = qorder[q]
            si, s = sorder[s]
            qseqids.add(q.seqid)
            sseqids.add(s.seqid)

        if is_self:
            qbed = sbed = subset_bed(qbed, qseqids)
        else:
Пример #22
0
def ancestral(args):
    """
    %prog ancestral vplanifoliaA.vplanifoliaA.anchors > vplanifoliaA_blocks.bed

    Paint 14 chromosomes following alpha WGD.
    """
    # The docstring above is the usage text passed to the option parser.
    parser = OptionParser(ancestral.__doc__)
    parser.set_beds()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    anchorsfile = args[0]
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, parser, opts)

    # We focus on the following chromosome pairs
    target_pairs = {
        (1, 1), (1, 6), (1, 8), (1, 13),
        (2, 4),
        (3, 12), (3, 14),
        (5, 6), (5, 8),
        (7, 9), (7, 11),
        (9, 10),
        (10, 11),
    }

    def get_target(achr, bchr):
        # Return the (smaller, larger) chromosome-number pair if it is one of
        # the target pairs, otherwise None.
        if "chr" not in achr and "chr" not in bchr:
            return None
        first, second = get_number(achr), get_number(bchr)
        if first > second:
            first, second = second, first
        candidate = (first, second)
        return candidate if candidate in target_pairs else None

    def build_bedline(astart, aend, target_pair):
        # The region is named after the first member of the pair that is one
        # of the listed chromosome numbers.
        names = [str(x) for x in target_pair if x in (1, 2, 3, 5, 7, 10)]
        fields = (astart.seqid, astart.start, aend.end, names[0])
        return "\t".join(str(x) for x in fields)

    # Iterate through the blocks, store any regions that has hits to one of the
    # target_pairs
    blocks = AnchorFile(anchorsfile).blocks
    outbed = Bed()
    for block in blocks:
        qgenes, sgenes, _ = zip(*block)
        qhits = [qorder[x] for x in qgenes]
        shits = [sorder[x] for x in sgenes]
        # Hits are compared as a whole; the record is element [1] of each.
        astart, aend = min(qhits)[1], max(qhits)[1]
        bstart, bend = min(shits)[1], max(shits)[1]
        # Now convert to BED lines with new accn
        target = get_target(astart.seqid, bstart.seqid)
        if target is not None:
            outbed.add(build_bedline(astart, aend, target))
            outbed.add(build_bedline(bstart, bend, target))
    outbed.print_to_file(sorted=True)
Пример #23
0
def cluster(args):
    """
    %prog cluster blastfile anchorfile --qbed qbedfile --sbed sbedfile

    Cluster the segments and form PAD. This is the method described in Tang et
    al. (2010) PNAS paper. The anchorfile defines a list of synteny blocks,
    based on which the genome on one or both axis can be chopped up into pieces
    and clustered.
    """
    from jcvi.utils.range import Range

    # The docstring above is the usage text passed to the option parser.
    parser = OptionParser(cluster.__doc__)
    parser.set_beds()
    parser.add_option(
        "--minsize", default=10, type="int", help="Only segment using blocks >= size"
    )
    parser.add_option(
        "--path", default="~/scratch/bin", help="Path to the CLUSTER 3.0 binary"
    )

    opts, args = parser.parse_args(args)
    if len(args) != 2:
        sys.exit(not parser.print_help())

    blastfile, anchorfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, parser, opts)

    anchors = AnchorFile(anchorfile)
    qextra = [brk[1:] for brk in qbed.get_breaks()]
    sextra = [brk[1:] for brk in sbed.get_breaks()]

    # One Range per block and axis, covering the block's extent in gene
    # indices and tagged with the 1-based block number.
    qranges, sranges = [], []
    selected = anchors.iter_blocks(minsize=opts.minsize)
    for block_no, block in enumerate(selected, 1):
        qgenes, sgenes = list(zip(*block))[:2]
        qidx = [qorder[x][0] for x in qgenes]
        sidx = [sorder[x][0] for x in sgenes]
        qlo, qhi = min(qidx), max(qidx)
        slo, shi = min(sidx), max(sidx)
        qranges.append(Range("0", qlo, qhi, qhi - qlo, block_no))
        sranges.append(Range("0", slo, shi, shi - slo, block_no))

    qpads = list(get_segments(qranges, qextra))
    spads = list(get_segments(sranges, sextra))

    # PAD BED files are named after the part of each BED name before the
    # first dot.
    qpf = opts.qbed.split(".")[0]
    spf = opts.sbed.split(".")[0]
    suffix = ".pad.bed"
    qpadfile, spadfile = qpf + suffix, spf + suffix
    qnpads, qpadnames = write_PAD_bed(qpadfile, qpf, qpads, qbed)
    snpads, spadnames = write_PAD_bed(spadfile, spf, spads, sbed)

    logmp = make_arrays(blastfile, Bed(qpadfile), Bed(spadfile),
                        qpadnames, spadnames)
    nrows, _ = logmp.shape

    # Tab-separated matrix: a header of subject PAD names, then one row per
    # query PAD with values formatted to one decimal place.
    matrixfile = ".".join((qpf, spf, "logmp.txt"))
    fw = open(matrixfile, "w")
    print("\t".join(["o"] + spadnames), file=fw)
    for i in range(nrows):
        cells = [qpadnames[i]]
        cells.extend("{0:.1f}".format(x) for x in logmp[i, :])
        print("\t".join(cells), file=fw)
    fw.close()

    # Run CLUSTER 3.0 (Pearson correlation, average linkage)
    cmd = op.join(opts.path, "cluster") + \
        " -g 2 -e 2 -m a -f {0}".format(matrixfile)
    cdtfile = matrixfile.rsplit(".", 1)[0] + ".cdt"
    if need_update(matrixfile, cdtfile):
        sh(cmd)
Пример #24
0
                                            figsize="8x8",
                                            style="dark",
                                            dpi=90)

    if len(args) != 1:
        sys.exit(not p.print_help())

    palette = opts.colormap
    if palette:
        palette = Palette(palette)

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    if opts.skipempty:
        ac = AnchorFile(anchorfile)
        if is_self:
            qseqids = sseqids = set()
        else:
            qseqids, sseqids = set(), set()

        for pair in ac.iter_pairs():
            q, s = pair[:2]
            qi, q = qorder[q]
            si, s = sorder[s]
            qseqids.add(q.seqid)
            sseqids.add(s.seqid)

        if is_self:
            qbed = sbed = subset_bed(qbed, qseqids)
        else: