Example #1
File: pad.py  Project: bennyyu/jcvi
def pad(args):
    """
    %prog pad blastfile cdtfile --qbed q.pad.bed --sbed s.pad.bed

    Test and reconstruct candidate PADs.
    """
    from jcvi.formats.cdt import CDT

    p = OptionParser(pad.__doc__)
    add_beds(p)
    p.add_option("--cutoff", default=.3, type="float",
                 help="The clustering cutoff to call similar [default: %default]")

    opts, args = p.parse_args(args)
    qbed, sbed, qorder, sorder, is_self = check_beds(p, opts)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cutoff = opts.cutoff
    blastfile, cdtfile = args
    cdt = CDT(cdtfile)
    qparts = list(cdt.iter_partitions(cutoff=cutoff))
    sparts = list(cdt.iter_partitions(cutoff=cutoff, gtr=False))

    qid, sid = {}, {}
    for i, part in enumerate(qparts):
        qid.update(dict((x, i) for x in part))
    for i, part in enumerate(sparts):
        sid.update(dict((x, i) for x in part))

    # Without writing files, conversion from PAD to merged PAD is done in memory
    for q in qbed:
        q.seqid = qid[q.seqid]
    for s in sbed:
        s.seqid = sid[s.seqid]

    qnames = range(len(qparts))
    snames = range(len(sparts))

    logmp = make_arrays(blastfile, qbed, sbed, qnames, snames)
    m, n = logmp.shape
    pvalue_cutoff = 1e-30
    cutoff = -log(pvalue_cutoff)

    significant = []
    for i in xrange(m):
        for j in xrange(n):
            score = logmp[i, j]
            if score < cutoff:
                continue
            significant.append((qparts[i], sparts[j], score))

    for a, b, score in significant:
        print "|".join(a), "|".join(b), score

    logging.debug("Collected {0} PAR comparisons significant at (P < {1}).".\
                    format(len(significant), pvalue_cutoff))
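
The heart of `pad` is the significance scan over the matrix returned by `make_arrays`: a query/subject partition pair is reported only when its score clears `-log(pvalue_cutoff)`. Below is a minimal sketch of that scan in isolation; `logmp`, `qparts`, and `sparts` here are invented stand-ins, not the real jcvi objects.

from math import log

import numpy as np

# Invented stand-in for the make_arrays() output: rows are query
# partitions, columns are subject partitions, entries are -log(p) scores.
logmp = np.array([[75.2, 10.1],
                  [5.0, 120.3]])
qparts = [["q1", "q2"], ["q3"]]  # made-up partition contents
sparts = [["s1"], ["s2", "s3"]]

pvalue_cutoff = 1e-30
cutoff = -log(pvalue_cutoff)  # about 69.08; lower scores are discarded

significant = [(qparts[i], sparts[j], logmp[i, j])
               for i in range(logmp.shape[0])
               for j in range(logmp.shape[1])
               if logmp[i, j] >= cutoff]

for a, b, score in significant:
    print("{0} {1} {2}".format("|".join(a), "|".join(b), score))
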
Example #2
File: dotplot.py  Project: linlifeng/jcvi
if __name__ == "__main__":

    p = OptionParser(__doc__)
    add_beds(p)
    p.add_option("--synteny", default=False, action="store_true",
            help="Run a fast synteny scan and display blocks [default: %default]")
    p.add_option("--cmap", default="Synonymous substitutions (Ks)",
            help="Draw colormap box on the bottom-left corner "
                 "[default: `%default`]")
    p.add_option("--vmin", dest="vmin", type="float", default=0,
            help="Minimum value in the colormap [default: %default]")
    p.add_option("--vmax", dest="vmax", type="float", default=1,
            help="Maximum value in the colormap [default: %default]")
    opts, args, iopts = set_image_options(p, sys.argv[1:], figsize="8x8", dpi=90)

    if len(args) != 1:
        sys.exit(not p.print_help())

    qbed, sbed, qorder, sorder, is_self = check_beds(p, opts)

    synteny = opts.synteny
    vmin, vmax = opts.vmin, opts.vmax
    cmap_text = opts.cmap

    anchorfile = args[0]

    image_name = op.splitext(anchorfile)[0] + "." + opts.format
    dotplot(anchorfile, qbed, sbed, image_name, vmin, vmax, iopts,
            is_self=is_self, synteny=synteny, cmap_text=cmap_text)
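
This entry point follows the jcvi convention of passing the module docstring to `OptionParser`, so the `%prog` line doubles as the usage message, and of deriving the output image name by swapping the anchor file's extension for the requested format. A self-contained sketch of the same pattern, with an invented docstring and option (no jcvi dependencies):

#!/usr/bin/env python
"""
%prog anchorfile

Toy entry point illustrating the docstring-as-usage pattern.
"""
import os.path as op
import sys
from optparse import OptionParser

if __name__ == "__main__":
    p = OptionParser(__doc__)
    p.add_option("--format", default="pdf",
                 help="Output image format [default: %default]")
    opts, args = p.parse_args()

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile = args[0]
    # e.g. blocks.anchors -> blocks.pdf
    image_name = op.splitext(anchorfile)[0] + "." + opts.format
    print(image_name)
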
Example #3
File: pad.py  Project: bennyyu/jcvi
def cluster(args):
    """
    %prog cluster blastfile anchorfile --qbed qbedfile --sbed sbedfile

    Cluster the segments to form PADs. This is the method described in the
    Tang et al. (2010) PNAS paper. The anchorfile defines a list of synteny
    blocks, based on which the genome on one or both axes can be chopped up
    into pieces and clustered.
    """
    from jcvi.utils.range import Range

    p = OptionParser(cluster.__doc__)
    add_beds(p)
    p.add_option("--minsize", default=10, type="int",
                 help="Only segment using blocks >= size [default: %default]")
    p.add_option("--path", default="~/scratch/bin",
                 help="Path to the CLUSTER 3.0 binary [default: %default]")

    opts, args = p.parse_args(args)
    qbed, sbed, qorder, sorder, is_self = check_beds(p, opts)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, anchorfile = args
    minsize = opts.minsize
    ac = AnchorFile(anchorfile)
    qranges, sranges = [], []
    qextra = [x[1:] for x in qbed.get_breaks()]
    sextra = [x[1:] for x in sbed.get_breaks()]

    id = 0
    for q, s in ac.iter_blocks(minsize=minsize):
        q = [qorder[x][0] for x in q]
        s = [sorder[x][0] for x in s]
        minq, maxq = min(q), max(q)
        mins, maxs = min(s), max(s)
        id += 1

        qr = Range("0", minq, maxq, maxq - minq, id)
        sr = Range("0", mins, maxs, maxs - mins, id)
        qranges.append(qr)
        sranges.append(sr)

    qpads = list(get_segments(qranges, qextra))
    spads = list(get_segments(sranges, sextra))

    suffix = ".pad.bed"
    qpf = opts.qbed.split(".")[0]
    spf = opts.sbed.split(".")[0]
    qpadfile = qpf + suffix
    spadfile = spf + suffix
    qnpads, qpadnames = write_PAD_bed(qpadfile, qpf, qpads, qbed)
    snpads, spadnames = write_PAD_bed(spadfile, spf, spads, sbed)

    qpadbed, spadbed = Bed(qpadfile), Bed(spadfile)

    logmp = make_arrays(blastfile, qpadbed, spadbed, qpadnames, spadnames)
    m, n = logmp.shape

    matrixfile = ".".join((qpf, spf, "logmp.txt"))
    fw = open(matrixfile, "w")
    header = ["o"] + spadnames
    print >> fw, "\t".join(header)
    for i in xrange(m):
        row = [qpadnames[i]] + ["{0:.1f}".format(x) for x in logmp[i, :]]
        print >> fw, "\t".join(row)

    fw.close()

    # Run CLUSTER 3.0 (Pearson correlation, average linkage)
    cmd = op.join(opts.path, "cluster")
    cmd += " -g 2 -e 2 -m a -f {0}".format(matrixfile)
    pf = matrixfile.rsplit(".", 1)[0]
    cdtfile = pf + ".cdt"
    if need_update(matrixfile, cdtfile):
        sh(cmd)
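
The matrix file written above is plain tab-delimited text in the layout CLUSTER 3.0 reads: a header row whose first cell is a placeholder (`o`) followed by the subject PAD names, then one labeled row per query PAD. A minimal writer for that layout, with invented names and scores standing in for the real `qpadnames`/`spadnames`/`logmp`:

# Invented PAD labels and -log(p) scores for illustration only.
qpadnames = ["q0", "q1"]
spadnames = ["s0", "s1", "s2"]
logmp = [[1.0, 2.5, 0.0],
         [3.3, 0.1, 7.2]]

with open("demo.logmp.txt", "w") as fw:
    # Header: placeholder cell, then one column per subject PAD.
    fw.write("\t".join(["o"] + spadnames) + "\n")
    # One row per query PAD: label, then scores to one decimal place.
    for name, row in zip(qpadnames, logmp):
        fw.write("\t".join([name] + ["{0:.1f}".format(x) for x in row]) + "\n")
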
Example #4
def main(blast_file, opts):

    qbed, sbed, qorder, sorder, is_self = check_beds(p, opts)

    tandem_Nmax = opts.tandem_Nmax
    filter_repeats = opts.filter_repeats
    cscore = opts.cscore

    fp = open(blast_file)
    total_lines = sum(1 for line in fp)
    logging.debug("Load BLAST file `%s` (total %d lines)" % \
            (blast_file, total_lines))
    fp.seek(0)
    blasts = sorted([BlastLine(line) for line in fp], \
            key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    ostrip = opts.strip_names
    nwarnings = 0
    for b in blasts:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)
        if query not in qorder:
            if nwarnings < 100:
                logging.warning("{0} not in {1}".format(query,
                    qbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue
        if subject not in sorder:
            if nwarnings < 100:
                logging.warning("{0} not in {1}".format(subject,
                    sbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid

        filtered_blasts.append(b)

    if tandem_Nmax is not None:
        logging.debug("running the local dups filter (tandem_Nmax=%d) .." % \
                tandem_Nmax)

        qtandems = tandem_grouper(qbed, filtered_blasts,
                flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts,
                flip=False, tandem_Nmax=tandem_Nmax)

        qdups_fh = open(op.splitext(qbed_file)[0] + ".localdups", "w") \
                if opts.tandems_only else None

        if is_self:
            for s in standems:
                qtandems.join(*s)
            qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
            sdups_to_mother = qdups_to_mother
        else:
            qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
            sdups_fh = open(op.splitext(sbed_file)[0] + ".localdups", "w") \
                    if opts.tandems_only else None
            sdups_to_mother = write_localdups(standems, sbed, sdups_fh)

        if opts.tandems_only:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)

            # just want to use this script as a tandem finder.
            sys.exit()

        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts, \
                qdups_to_mother, sdups_to_mother))
        logging.debug("after filter (%d->%d) .." % \
                (before_filter, len(filtered_blasts)))

    if filter_repeats:
        before_filter = len(filtered_blasts)
        logging.debug("running the repeat filter")
        filtered_blasts = list(filter_repeat(filtered_blasts))
        logging.debug("after filter (%d->%d) .." % (before_filter,
            len(filtered_blasts)))

    if cscore is not None:
        before_filter = len(filtered_blasts)
        logging.debug("running the cscore filter (cscore>=%.2f) .." % cscore)
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        logging.debug("after filter (%d->%d) .." % (before_filter,
            len(filtered_blasts)))

    blastfilteredfile = blast_file + ".filtered"
    fw = open(blastfilteredfile, "w")
    write_new_blast(filtered_blasts, fh=fw)
    fw.close()
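
The dedup step in `main` depends on ordering: because the hits are sorted by score in descending order first, the first occurrence of each (query, subject) pair is its best-scoring hit, and the `seen` set silently drops the rest. The same idea in isolation, with a lightweight stand-in for `BlastLine` and invented hits:

from collections import namedtuple

Hit = namedtuple("Hit", ["query", "subject", "score"])  # stand-in for BlastLine
hits = [Hit("g1", "g9", 50.0), Hit("g1", "g9", 80.0), Hit("g2", "g7", 30.0)]

best, seen = [], set()
# Descending sort guarantees the first (query, subject) pair seen is the best.
for h in sorted(hits, key=lambda x: x.score, reverse=True):
    key = (h.query, h.subject)
    if key in seen:
        continue
    seen.add(key)
    best.append(h)

print(best)  # keeps the 80.0 hit for (g1, g9), plus the single (g2, g7) hit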