示例#1
0
文件: blast.py 项目: linlifeng/jcvi
def completeness(args):
    """
    %prog completeness blastfile query.fasta > outfile

    Print statistics for each gene, the coverage of the alignment onto the best hit
    in AllGroup.niaa, as an indicator for completeness of the gene model.
    """
    # NOTE: the docstring above doubles as the command-line usage text handed
    # to OptionParser, so it is kept verbatim.
    #
    # Output (stdout): one tab-separated line per query with the columns
    #   query, subject, N-terminal distance, C-terminal distance
    # where the distances are the unaligned stretches of the subject before
    # the left-most and after the right-most aligned subject coordinate.
    from jcvi.utils.range import range_minmax

    p = OptionParser(completeness.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    # Sequence lengths; they are looked up by *subject* id below.
    sizes = Sizes(fastafile).mapping

    # groupby only merges adjacent records, so all hits of one query are
    # expected to be contiguous in the BLAST file.
    for _, hits in groupby(BlastSlow(blastfile), key=lambda x: x.query):
        hits = list(hits)
        # The first record of each group is taken as the best hit.
        best = hits[0]
        query, subject = best.query, best.subject

        rmin, rmax = range_minmax([(x.sstart, x.sstop) for x in hits])
        subject_len = sizes[subject]

        # Assuming 1-based inclusive coordinates (as `rmin - 1` implies), an
        # alignment that reaches the last residue must report a distance of
        # 0, hence no "+ 1" on the C-terminal side.
        nterminal_dist = rmin - 1
        cterminal_dist = subject_len - rmax
        print("\t".join(
            str(x)
            for x in (query, subject, nterminal_dist, cterminal_dist)))
示例#2
0
文件: blast.py 项目: bennyyu/jcvi
def completeness(args):
    """
    %prog completeness blastfile query.fasta > outfile

    Print statistics for each gene, the coverage of the alignment onto the best hit
    in AllGroup.niaa, as an indicator for completeness of the gene model.
    """
    # The docstring is reused as the usage message (see OptionParser call),
    # so its text is left untouched.
    #
    # For every query this prints, tab-separated:
    #   query id, subject id, N-terminal distance, C-terminal distance
    from jcvi.utils.range import range_minmax

    p = OptionParser(completeness.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    # Lengths keyed by sequence id; queried with the subject id of each hit.
    lengths = Sizes(fastafile).mapping

    records = BlastSlow(blastfile)
    # Consecutive records sharing a query form one group (groupby does not
    # sort, so the input is expected to be grouped by query already).
    for _, group in groupby(records, key=lambda x: x.query):
        group = list(group)
        top = group[0]  # first record of the group stands in as the best hit

        rmin, rmax = range_minmax([(x.sstart, x.sstop) for x in group])
        subject_len = lengths[top.subject]

        # Unaligned residues on each side of the covered subject span.  With
        # 1-based inclusive coordinates (implied by `rmin - 1`) the right
        # side is `subject_len - rmax`; the former "+ 1" over-counted by one.
        nterminal_dist = rmin - 1
        cterminal_dist = subject_len - rmax
        fields = (top.query, top.subject, nterminal_dist, cterminal_dist)
        print("\t".join(str(x) for x in fields))
示例#3
0
文件: blast.py 项目: pombredanne/jcvi
def completeness(args):
    """
    %prog completeness blastfile ref.fasta > outfile

    Print statistics for each gene, the coverage of the alignment onto the best hit,
    as an indicator for completeness of the gene model. For example, one might
    BLAST sugarcane ESTs against sorghum annotations as reference, to find
    full-length transcripts.
    """
    # The docstring above is also the usage text given to OptionParser and is
    # therefore kept verbatim.
    #
    # Outputs:
    #   stdout - one tab-separated line per query:
    #            query, subject, N-terminal dist, C-terminal dist, coverage (%)
    #   stderr - totals plus SummaryStats of the three numeric columns
    #   --ids  - optional file listing the queries whose coverage > cutoff
    from jcvi.utils.range import range_minmax
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(completeness.__doc__)
    p.add_option("--ids", help="Save ids that are over 50% complete [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    idsfile = opts.ids
    # Sequence lengths, looked up by subject id.
    sizes = Sizes(fastafile).mapping

    valid = []   # queries covering more than `cutoff` percent of the subject
    data = []    # (nterminal_dist, cterminal_dist, covered) per query
    cutoff = 50
    # groupby merges adjacent records only; hits of one query are expected to
    # be contiguous in the BLAST file.
    for _, hits in groupby(BlastSlow(blastfile), key=lambda x: x.query):
        hits = list(hits)
        best = hits[0]  # first record of the group is treated as the best hit
        query, subject = best.query, best.subject

        rmin, rmax = range_minmax([(x.sstart, x.sstop) for x in hits])
        subject_len = sizes[subject]

        nterminal_dist = rmin - 1
        cterminal_dist = subject_len - rmax
        covered = (rmax - rmin + 1) * 100 / subject_len
        if covered > cutoff:
            valid.append(query)

        data.append((nterminal_dist, cterminal_dist, covered))
        print("\t".join(str(x) for x in (query, subject, nterminal_dist, cterminal_dist, covered)))

    summary_lines = [
        "Total: {0}, Coverage > {1}%: {2}".format(len(data), cutoff, len(valid))
    ]
    # An empty BLAST file leaves `data` empty; unpacking zip(*data) would then
    # raise ValueError, so the per-column statistics are skipped in that case.
    if data:
        nd, cd, cv = zip(*data)
        summary_lines.append("N-terminal: {0}".format(SummaryStats(nd)))
        summary_lines.append("C-terminal: {0}".format(SummaryStats(cd)))
        summary_lines.append("Coverage: {0}".format(SummaryStats(cv)))
    sys.stderr.write("\n".join(summary_lines) + "\n")

    if idsfile:
        # `with` guarantees the file is closed even if the write fails.
        with open(idsfile, "w") as fw:
            fw.write("\n".join(valid) + "\n")
        logging.debug("A total of {0} ids (cov > {1} %) written to `{2}`.".format(len(valid), cutoff, idsfile))
示例#4
0
文件: btab.py 项目: rrane/jcvi
def gff(args):
    """
    %prog gff btabfile

    Convert btab file generated by AAT to gff3 format.
    """
    # The docstring is passed to OptionParser as the usage text; keep as is.
    #
    # Lines sharing the same `key` are collected into one group.  Each line
    # becomes a child feature; one parent feature spanning the min/max query
    # coordinates of the group is printed before its children.
    from jcvi.utils.range import range_minmax
    from jcvi.formats.gff import valid_gff_parent_child, valid_gff_type

    p = OptionParser(gff.__doc__)
    p.add_option(
        "--source",
        default=None,
        help="Specify GFF source."
             " By default, it picks algorithm used to generate btab file."
             " [default: %default]",
    )
    p.add_option(
        "--type",
        default="protein_match",
        choices=valid_gff_type,
        help="GFF feature type [default: %default]",
    )

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    btabfile, = args
    chains = {}
    btab = Btab(btabfile, aat_dialect=True)
    osource = opts.source or "aat"
    otype = opts.type
    # Feature type used for the child lines of the chosen parent type.
    octype = valid_gff_parent_child[otype]

    for line in btab:
        nargs = line.nargs  # remembered to build blank parent lines below
        feature_id = line.query + "-" + otype + "{0:05d}".format(line.chainNum)
        key = line.key
        if key in chains:
            chain = chains[key]
        else:
            chain = chains[key] = {
                'id': feature_id,
                'method': line.method,
                'query': line.query,
                'subject': line.subject,
                'strand': line.qStrand,
                'sDesc': line.sDesc,
                'coords': [],
                'children': [],
            }

        chain['coords'].append((line.qStart, line.qStop))
        chain['children'].append(
            line.gffline(source=osource, type=octype, id=feature_id))

    for chain in chains.itervalues():
        # Start from an all-zero line and fill in the fields of the group.
        parent = BtabLine("\t".join(["0"] * nargs), aat_dialect=True)
        feature_id = chain['id']
        parent.query = chain['query']
        parent.method = chain['method']
        parent.subject = chain['subject']
        parent.qStrand = chain['strand']
        parent.sDesc = chain['sDesc']
        parent.qStart, parent.qStop = range_minmax(chain['coords'])
        print(parent.gffline(source=osource, type=otype, primary_tag="ID",
                             id=feature_id))
        print("\n".join(chain['children']))
示例#5
0
def get_boundary_bases(start, end, order):
    """Return ``(seqid, startbase, endbase)`` spanning two ordered features.

    ``order[start]`` and ``order[end]`` must each unpack into an
    ``(index, feature)`` pair whose feature has ``seqid``, ``start`` and
    ``end`` attributes.  Both features must share the same ``seqid``
    (checked with an assertion); the returned bases are the minimum and
    maximum over both features' coordinates.
    """
    from jcvi.utils.range import range_minmax

    first_entry, last_entry = order[start], order[end]
    _, first = first_entry
    _, last = last_entry

    seqid = first.seqid
    assert seqid == last.seqid

    spans = [(first.start, first.end), (last.start, last.end)]
    startbase, endbase = range_minmax(spans)
    return seqid, startbase, endbase
示例#6
0
def get_boundary_bases(start, end, order):
    """Compute the base-pair extent covered by the features at two keys.

    Looks up ``start`` and ``end`` in ``order`` (each value unpacks into an
    ``(index, feature)`` pair), asserts that both features lie on the same
    ``seqid`` and returns ``(seqid, lowest base, highest base)`` over the
    two features' ``start``/``end`` coordinates.
    """
    from jcvi.utils.range import range_minmax

    (_, left), (_, right) = order[start], order[end]
    seqid = left.seqid
    assert seqid == right.seqid

    lo, hi = range_minmax([
        (left.start, left.end),
        (right.start, right.end),
    ])
    return seqid, lo, hi
示例#7
0
文件: ca.py 项目: rrane/jcvi
def clr(args):
    """
    %prog blastfile fastafiles

    Calculate the vector clear range file based BLAST to the vectors.
    """
    # The docstring is the usage text handed to OptionParser; keep verbatim.
    #
    # Output (stdout): "<query>\t<clr_start>\t<clr_end>" per sequence.  For a
    # sequence with hits, the larger of the two stretches flanking the
    # combined hit span is reported; sequences without hits are reported in
    # full, sorted by name, after all sequences with hits.
    p = OptionParser(clr.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    blastfile, fastafiles = args[0], args[1:]

    # Sequence sizes pooled from every FASTA file.
    sizes = {}
    for fa in fastafiles:
        sizes.update(Fasta(fa).itersizes())

    for query, hits in Blast(blastfile).iter_hits():
        qsize = sizes[query]
        vmin, vmax = range_minmax([(h.qstart, h.qstop) for h in hits])

        # Keep whichever side of the hit span is longer.
        if vmin - 1 > qsize - vmax:
            clr_start, clr_end = 0, vmin
        else:
            clr_start, clr_end = vmax, qsize

        print("\t".join(str(x) for x in (query, clr_start, clr_end)))
        # Whatever remains in `sizes` afterwards had no hit at all.
        del sizes[query]

    for q, size in sorted(sizes.items()):
        print("\t".join(str(x) for x in (q, 0, size)))
示例#8
0
def clr(args):
    """
    %prog blastfile fastafiles

    Calculate the vector clear range file based BLAST to the vectors.
    """
    # Usage docstring (consumed by OptionParser) is left untouched.
    #
    # Prints one tab-separated "<name> <start> <end>" record per sequence:
    # first the sequences that have hits (clear range = longer flank around
    # the min/max of all hit coordinates), then the hit-free ones as
    # (name, 0, size) in sorted order.
    p = OptionParser(clr.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    blastfile = args[0]
    fastafiles = args[1:]

    sizes = {}
    for fastafile in fastafiles:
        fasta = Fasta(fastafile)
        sizes.update(fasta.itersizes())

    def emit(*fields):
        # One tab-separated output record.
        print("\t".join(str(x) for x in fields))

    blast = Blast(blastfile)
    for query, hits in blast.iter_hits():
        qsize = sizes[query]
        vmin, vmax = range_minmax([(x.qstart, x.qstop) for x in hits])

        left_size = vmin - 1
        right_size = qsize - vmax

        if left_size > right_size:
            emit(query, 0, vmin)
        else:
            emit(query, vmax, qsize)
        # Drop handled queries so only hit-free sequences are left over.
        del sizes[query]

    for name, size in sorted(sizes.items()):
        emit(name, 0, size)
示例#9
0
def get_cds_minmax(g, cid, level=2):
    """Return ``range_minmax`` over the CDS children of feature ``cid``.

    Children are obtained from ``g.children(cid, level)``; only those whose
    ``featuretype`` equals ``"CDS"`` contribute their ``(start, end)`` pair.
    """
    cdsranges = []
    for child in g.children(cid, level):
        if child.featuretype != "CDS":
            continue
        cdsranges.append((child.start, child.end))
    return range_minmax(cdsranges)
示例#10
0
def pastegenes(args):
    """
    %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly

    Paste in zero or low coverage genes.  For a set of neighboring genes
    missing, add the whole cassette as unplaced scaffolds. For singletons the
    program will try to make a patch.
    """
    # The docstring above is also the usage text passed to OptionParser, so
    # it is left verbatim.
    #
    # Files written in the current directory:
    #   extra.bed      - flank-padded span of every multi-gene low-coverage block
    #   extend.bed     - for each singleton: left neighbor, gene, right neighbor
    #                    and a distance summary line
    #   extend.bed.ids - accession of every singleton
    #   paste.bed      - singleton patches whose neighbor-to-neighbor span is
    #                    within --maxsize
    # extra.bed is afterwards merged, extracted as FASTA and summarized.
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(pastegenes.__doc__)
    p.add_option(
        "--cutoff",
        default=90,
        type="int",
        help="Coverage cutoff to call gene missing",
    )
    p.add_option(
        "--flank",
        default=2000,
        type="int",
        help="Get the seq of size on two ends",
    )
    p.add_option(
        "--maxsize",
        default=50000,
        type="int",
        help="Maximum size of patchers to be replaced",
    )
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    # `newbed` is accepted for the command-line contract but not read here.
    coveragefile, oldbed, newbed, oldassembly = args
    cutoff = opts.cutoff
    flank = opts.flank
    maxsize = opts.maxsize

    coverage = DictFile(coveragefile, valuepos=2, cast=float)

    obed = Bed(oldbed)
    order = obed.order
    bed = [x for x in obed if x.accn in coverage]
    key = lambda x: coverage[x.accn] >= cutoff

    extrabed = "extra.bed"
    extendbed = "extend.bed"
    pastebed = "paste.bed"

    singletons, large, large_genes = 0, 0, 0
    # All four outputs are managed by `with` so that every handle (including
    # paste.bed and the ids file, previously never closed) is flushed and
    # closed before extra.bed is read back by mergeBed below.
    with open(extrabed, "w") as fw, \
            open(extendbed, "w") as fwe, \
            open(pastebed, "w") as fwp, \
            open(extendbed + ".ids", "w") as fw_ids:
        for chr, chrbed in groupby(bed, key=lambda x: x.seqid):
            chrbed = list(chrbed)
            # Runs of consecutive genes that are all below the cutoff.
            for good, beds in groupby(chrbed, key=key):
                if good:
                    continue

                beds = list(beds)
                blocksize = len(set([gene_name(x.accn) for x in beds]))
                if blocksize == 1:
                    singletons += 1
                    accn = beds[0].accn
                    gi, gb = order[accn]
                    # NOTE(review): no boundary handling here - if obed
                    # indexes like a list, gi == 0 wraps to the last feature
                    # and the last gene raises IndexError; confirm intent.
                    leftb = obed[gi - 1]
                    rightb = obed[gi + 1]
                    leftr = leftb.range
                    rightr = rightb.range
                    cur = gb.range
                    distance_to_left, _ = range_distance(leftr, cur)
                    distance_to_right, _ = range_distance(cur, rightr)
                    span, _ = range_distance(leftr, rightr)

                    if distance_to_left <= distance_to_right and distance_to_left > 0:
                        label = "LEFT"
                    else:
                        label = "RIGHT"

                    if 0 < span <= maxsize:
                        print(
                            "\t".join(
                                str(x) for x in (chr, leftb.start, rightb.end, gb.accn)
                            ),
                            file=fwp,
                        )

                    print(leftb, file=fwe)
                    print(gb, file=fwe)
                    print(rightb, file=fwe)
                    print(
                        "L:{0} R:{1} [{2}]".format(
                            distance_to_left, distance_to_right, label
                        ),
                        file=fwe,
                    )
                    print(gb.accn, file=fw_ids)
                    continue

                large += 1
                large_genes += blocksize

                # Whole cassette, padded by `flank` on both sides.
                ranges = [(x.start, x.end) for x in beds]
                rmin, rmax = range_minmax(ranges)
                rmin -= flank
                rmax += flank

                name = "-".join((beds[0].accn, beds[-1].accn))
                print("\t".join(str(x) for x in (chr, rmin - 1, rmax, name)), file=fw)

    extrabed = mergeBed(extrabed, d=flank, nms=True)
    fastaFromBed(extrabed, oldassembly, name=True)
    summary([extrabed])

    logging.debug("Singleton blocks : {0}".format(singletons))
    logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
示例#11
0
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    # The docstring above is also used as the command-line usage text.
    #
    # Overview of what the code below does:
    #   1. Walk consecutive BED records and keep mate pairs in which exactly
    #      one read sits on a sequence whose id starts with --prefix.
    #   2. For each such "unplaced" sequence, turn every pair into a candidate
    #      interval on the other sequence, derived from the library min/max.
    #   3. Pick the interval(s) with the highest depth, reject weak or
    #      ambiguous ones, and vote on the orientation.
    #   4. Write accepted placements to "<prefix of mates file>.candidate.bed";
    #      a detailed trace goes to "<prefix of mates file>.log".
    from jcvi.formats.bed import BedLine
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option(
        "--prefix",
        default="scaffold",
        help="Prefix of the unplaced scaffolds",
    )
    p.add_option(
        "--minlinks",
        default=3,
        type="int",
        help="Minimum number of links to place",
    )
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    # Output names are derived from the mates file name minus its extension.
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"
    log = open(logfile, "w")

    mf = MatesFile(matesfile)
    # Largest `max` over all libraries; used below as an upper bound on the
    # width of an accepted placement interval.
    maxdist = max(x.max for x in mf.libraries.values())
    logging.debug("Max separation: {0}".format(maxdist))

    prefix = opts.prefix
    minlinks = opts.minlinks

    is_unplaced = lambda x: x.startswith(prefix)
    bed = Bed(bedfile, sorted=False)
    beds = []
    # unplaced sequence id -> list of (read on other sequence, read on it)
    unplaced = defaultdict(list)

    for a, b in pairwise(bed):
        aname, bname = a.accn, b.accn
        aseqid, bseqid = a.seqid, b.seqid

        if aname not in mf:
            continue

        # Only consecutive records that are each other's mate are used.
        pa, la = mf[aname]
        if pa != bname:
            continue

        # Keep pairs where exactly one side is on an unplaced sequence.
        ia = is_unplaced(aseqid)
        ib = is_unplaced(bseqid)
        if ia == ib:
            continue

        # Normalize so that `b` is always the read on the unplaced sequence.
        if ia:
            a, b = b, a

        unplaced[b.seqid].append((a, b))
        beds.extend([a, b])

    sizes = Sizes(fastafile)
    candidatebed = Bed()
    cbeds = []
    # For each unplaced scaffold, find most likely placement and orientation
    for scf, beds in sorted(unplaced.items()):
        print(file=log)
        ranges = []
        for a, b in beds:
            aname, astrand = a.accn, a.strand
            bname, bstrand = b.accn, b.strand
            aseqid, bseqid = a.seqid, b.seqid
            pa, lib = mf[aname]

            print(a, file=log)
            print(b, file=log)

            # Same strand on both reads means `b` is reverse-complemented
            # (in place) before the offset is computed.
            flip_b = astrand == bstrand
            fbstrand = "-" if flip_b else "+"
            if flip_b:
                b.reverse_complement(sizes)

            lmin, lmax = lib.min, lib.max

            L = sizes.get_size(scf)
            assert astrand in ("+", "-")
            # Candidate start interval on aseqid, shifted by the library
            # min/max; the direction depends on the strand of `a`.
            if astrand == "+":
                offset = a.start - b.end
                sstart, sstop = offset + lmin, offset + lmax
            else:
                offset = a.end - b.start + L
                sstart, sstop = offset - lmax, offset - lmin

            # Prevent out of range error
            # (both ends are clamped into [0, size - 1] of aseqid)
            size = sizes.get_size(aseqid)
            sstart = max(0, sstart)
            sstop = max(0, sstop)
            sstart = min(size - 1, sstart)
            sstop = min(size - 1, sstop)

            start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
            print("*" + "\t".join(str(x) for x in start_range), file=log)
            ranges.append(start_range)

        mranges = [x[:3] for x in ranges]
        # Determine placement by finding the interval with the most support
        # NOTE(review): ranges_depth is defined elsewhere; from its use here
        # it presumably yields lists of (seqid, start, end, depth) - confirm.
        rd = ranges_depth(mranges, sizes.mapping, verbose=False)
        alldepths = []
        for depth in rd:
            alldepths.extend(depth)
        print(alldepths, file=log)

        # The last element of each entry is compared as the depth.
        maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
        if maxdepth < minlinks:
            print("Insufficient links ({0} < {1})".format(maxdepth, minlinks), file=log)
            continue

        # All intervals reaching the maximum depth must be on one sequence.
        candidates = [x for x in alldepths if x[-1] == maxdepth]
        nseqids = len(set(x[0] for x in candidates))
        if nseqids != 1:
            msg = "Multiple conflicting candidates found"
            print(msg, file=log)
            continue

        seqid, mmin, mmax, depth = candidates[0]
        # Widen to the min/max over every max-depth interval.
        mmin, mmax = range_minmax([x[1:3] for x in candidates])

        if mmin >= mmax:
            msg = "Invalid (min, max) range"
            print("Invalid (min, max) range", file=log)
            continue

        if (mmax - mmin) > maxdist:
            msg = "(min, max) distance greater than library maxdist"
            print(msg, file=log)
            continue

        # Determine orientation by voting
        # (only ranges overlapping the chosen interval get a vote; ties
        # resolve to "+")
        nplus, nminus = 0, 0
        arange = (seqid, mmin, mmax)
        for sid, start, end, sf, sc, fbstrand in ranges:
            brange = (sid, start, end)
            if range_overlap(arange, brange):
                if fbstrand == "+":
                    nplus += 1
                else:
                    nminus += 1

        fbstrand = "+" if nplus >= nminus else "-"

        candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
        bedline = BedLine("\t".join((str(x) for x in candidate)))
        cbeds.append(bedline)
        print("Plus: {0}, Minus: {1}".format(nplus, nminus), file=log)
        print(candidate, file=log)

    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".format(len(candidatebed)))
    log.close()

    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)
示例#12
0
def loss(args):
    """
    %prog loss a.b.i1.blocks [a.b-genomic.blast]

    Extract likely gene loss candidates between genome a and b.
    """
    # The docstring above is also the usage text passed to OptionParser, so
    # it is kept verbatim.
    #
    # Steps performed below:
    #   1. Read the two-column blocks file (gene in a, match in b or '.').
    #   2. For every run of '.' rows flanked by matched rows on the same
    #      sequence and within --gdist genes, derive a "proxy" region in b
    #      spanning both flanking matches, padded by --bdist.
    #   3. Optionally refine the proxy with genomic hits (BLAST or BED):
    #      tag "S" if a hit overlaps the proxy, "NS" otherwise; genes with a
    #      proxy but no hit at all are tagged "NF" on output.
    #   4. Print "<seqid>\t<gene>\t<target>" for every gene in the query bed.
    p = OptionParser(loss.__doc__)
    p.add_option("--bed", default=False, action="store_true",
                 help="Genomic BLAST is in bed format [default: %default]")
    p.add_option("--gdist", default=20, type="int",
                 help="Gene distance [default: %default]")
    p.add_option("--bdist", default=20000, type="int",
                 help="Base pair distance [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    blocksfile = args[0]
    emptyblast = (len(args) == 1)
    if emptyblast:
        # No genomic BLAST given: work on an empty placeholder file that is
        # removed again at the end.
        genomicblast = "empty.blast"
        sh("touch {0}".format(genomicblast))
    else:
        genomicblast = args[1]

    gdist, bdist = opts.gdist, opts.bdist
    qbed, sbed, qorder, sorder, is_self = check_beds(blocksfile, p, opts)
    blocks = []
    genetrack = {}    # gene in a -> second column of the blocks file
    proxytrack = {}   # gene in a -> (seqid, start, end) region in b
    # `with` closes the blocks file once it is read (it used to stay open).
    with open(blocksfile) as fp:
        for row in fp:
            a, b = row.split()
            genetrack[a] = b
            blocks.append((a, b))

    # Consecutive rows sharing the same second column form one group.
    data = []
    for key, rows in groupby(blocks, key=lambda x: x[-1]):
        rows = list(rows)
        data.append((key, rows))

    imax = len(data) - 1
    for i, (key, rows) in enumerate(data):
        # First and last group lack a neighbor on one side.
        if i == 0 or i == imax:
            continue
        if key != '.':
            continue

        before, br = data[i - 1]
        after, ar = data[i + 1]
        bi, bx = sorder[before]
        ai, ax = sorder[after]
        dist = abs(bi - ai)
        if bx.seqid != ax.seqid or dist > gdist:
            continue

        start, end = range_minmax(((bx.start, bx.end), (ax.start, ax.end)))
        start, end = max(start - bdist, 1), end + bdist
        proxy = (bx.seqid, start, end)
        for a, b in rows:
            proxytrack[a] = proxy

    tags = {}
    if opts.bed:
        bed = Bed(genomicblast, sorted=False)
        key = lambda x: gene_name(x.accn.rsplit(".", 1)[0])
        for query, bb in groupby(bed, key=key):
            bb = list(bb)
            if query not in proxytrack:
                continue

            proxy = proxytrack[query]
            tag = "NS"
            best_b = bb[0]
            # First hit overlapping the proxy wins; otherwise fall back to
            # the first hit of the group.
            for b in bb:
                hsp = (b.seqid, b.start, b.end)
                if range_overlap(proxy, hsp):
                    tag = "S"
                    best_b = b
                    break

            hsp = (best_b.seqid, best_b.start, best_b.end)
            proxytrack[query] = hsp
            tags[query] = tag

    else:
        blast = Blast(genomicblast)
        for query, bb in blast.iter_hits():
            bb = list(bb)
            query = gene_name(query)
            if query not in proxytrack:
                continue

            proxy = proxytrack[query]
            tag = "NS"
            best_b = bb[0]
            # Same selection rule as the BED branch, on subject coordinates.
            for b in bb:
                hsp = (b.subject, b.sstart, b.sstop)
                if range_overlap(proxy, hsp):
                    tag = "S"
                    best_b = b
                    break

            hsp = (best_b.subject, best_b.sstart, best_b.sstop)
            proxytrack[query] = hsp
            tags[query] = tag

    for b in qbed:
        accn = b.accn
        target_region = genetrack[accn]
        if accn in proxytrack:
            target_region = region_str(proxytrack[accn])
            if accn in tags:
                ptag = "[{0}]".format(tags[accn])
            else:
                ptag = "[NF]"
            target_region = ptag + target_region

        print("\t".join((b.seqid, accn, target_region)))

    if emptyblast:
        sh("rm -f {0}".format(genomicblast))
示例#13
0
文件: patch.py 项目: JinfengChen/jcvi
def pastegenes(args):
    """
    %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly

    Paste in zero or low coverage genes.  For a set of neighboring genes
    missing, add the whole cassette as unplaced scaffolds. For singletons the
    program will try to make a patch.
    """
    # The docstring above is reused as the usage text, so it is left as is.
    #
    # Files written in the current directory:
    #   extra.bed      - flank-padded span of every multi-gene low-coverage block
    #   extend.bed     - for each singleton: left neighbor, gene, right neighbor
    #                    and a distance summary line
    #   extend.bed.ids - accession of every singleton
    #   paste.bed      - singleton patches whose neighbor-to-neighbor span is
    #                    within --maxsize
    # extra.bed is afterwards merged, extracted as FASTA and summarized.
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(pastegenes.__doc__)
    p.add_option("--cutoff", default=90, type="int",
                 help="Coverage cutoff to call gene missing [default: %default]")
    p.add_option("--flank", default=2000, type="int",
                 help="Get the seq of size on two ends [default: %default]")
    p.add_option("--maxsize", default=50000, type="int",
            help="Maximum size of patchers to be replaced [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    # `newbed` is part of the command line but is not read in this function.
    coveragefile, oldbed, newbed, oldassembly = args
    cutoff = opts.cutoff
    flank = opts.flank
    maxsize = opts.maxsize

    coverage = DictFile(coveragefile, valuepos=2, cast=float)

    obed = Bed(oldbed)
    order = obed.order
    bed = [x for x in obed if x.accn in coverage]
    key = lambda x: coverage[x.accn] >= cutoff

    extrabed = "extra.bed"
    extendbed = "extend.bed"
    pastebed = "paste.bed"

    fw = open(extrabed, "w")
    fwe = open(extendbed, "w")
    fwp = open(pastebed, "w")
    fw_ids = open(extendbed + ".ids", "w")

    singletons, large, large_genes = 0, 0, 0
    try:
        for chr, chrbed in groupby(bed, key=lambda x: x.seqid):
            chrbed = list(chrbed)
            # Runs of consecutive genes that are all below the cutoff.
            for good, beds in groupby(chrbed, key=key):
                if good:
                    continue

                beds = list(beds)
                blocksize = len(set([gene_name(x.accn) for x in beds]))
                if blocksize == 1:
                    singletons += 1
                    accn = beds[0].accn
                    gi, gb = order[accn]
                    # NOTE(review): no boundary handling here - if obed
                    # indexes like a list, gi == 0 wraps to the last feature
                    # and the last gene raises IndexError; confirm intent.
                    leftb = obed[gi - 1]
                    rightb = obed[gi + 1]
                    leftr = leftb.range
                    rightr = rightb.range
                    cur = gb.range
                    distance_to_left, oo = range_distance(leftr, cur)
                    distance_to_right, oo = range_distance(cur, rightr)
                    span, oo = range_distance(leftr, rightr)

                    if distance_to_left <= distance_to_right and \
                       distance_to_left > 0:
                        label = "LEFT"
                    else:
                        label = "RIGHT"

                    if 0 < span <= maxsize:
                        print >> fwp, "\t".join(str(x) for x in \
                                        (chr, leftb.start, rightb.end, gb.accn))

                    print >> fwe, leftb
                    print >> fwe, gb
                    print >> fwe, rightb
                    print >> fwe, "L:{0} R:{1} [{2}]".format(distance_to_left, \
                                distance_to_right, label)
                    print >> fw_ids, gb.accn
                    continue

                large += 1
                large_genes += blocksize

                # Whole cassette, padded by `flank` on both sides.
                ranges = [(x.start, x.end) for x in beds]
                rmin, rmax = range_minmax(ranges)
                rmin -= flank
                rmax += flank

                name = "-".join((beds[0].accn, beds[-1].accn))
                print >> fw, "\t".join(str(x) for x in (chr, rmin - 1, rmax, name))
    finally:
        # Close every output, including paste.bed and the ids file which
        # used to be left open, before extra.bed is read back below.
        for handle in (fw, fwe, fwp, fw_ids):
            handle.close()

    extrabed = mergeBed(extrabed, d=flank, nms=True)
    fastaFromBed(extrabed, oldassembly, name=True)
    summary([extrabed])

    logging.debug("Singleton blocks : {0}".format(singletons))
    logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
示例#14
0
文件: patch.py 项目: JinfengChen/jcvi
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    # The docstring above is also used as the command-line usage text.
    #
    # Overview:
    #   1. Walk consecutive BED records and keep mate pairs in which exactly
    #      one read sits on a sequence whose id starts with --prefix.
    #   2. For each such "unplaced" sequence, turn every pair into a candidate
    #      interval on the other sequence, derived from the library min/max.
    #   3. Pick the interval(s) with the highest depth, reject weak, ambiguous
    #      or overly wide ones, and vote on the orientation.
    #   4. Write accepted placements to "<prefix of mates file>.candidate.bed";
    #      a detailed trace goes to "<prefix of mates file>.log".
    from jcvi.utils.iter import pairwise
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option("--prefix", default="scaffold",
                 help="Prefix of the unplaced scaffolds [default: %default]")
    p.add_option("--minlinks", default=3, type="int",
                 help="Minimum number of links to place [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    # Output names are derived from the mates file name minus its extension.
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"
    log = open(logfile, "w")

    mf = MatesFile(matesfile)
    # Largest `max` over all libraries; upper bound on the width of an
    # accepted placement interval.
    maxdist = max(x.max for x in mf.libraries.values())
    logging.debug("Max separation: {0}".format(maxdist))

    prefix = opts.prefix
    minlinks = opts.minlinks

    is_unplaced = lambda x: x.startswith(prefix)
    bed = Bed(bedfile, sorted=False)
    # unplaced sequence id -> list of (read on other sequence, read on it)
    unplaced = defaultdict(list)

    for a, b in pairwise(bed):
        aname, bname = a.accn, b.accn
        aseqid, bseqid = a.seqid, b.seqid

        if aname not in mf:
            continue

        # Only consecutive records that are each other's mate are used.
        pa, la = mf[aname]
        if pa != bname:
            continue

        # Keep pairs where exactly one side is on an unplaced sequence.
        ia = is_unplaced(aseqid)
        ib = is_unplaced(bseqid)
        if ia == ib:
            continue

        # Normalize so that `b` is always the read on the unplaced sequence.
        if ia:
            a, b = b, a

        unplaced[b.seqid].append((a, b))

    sizes = Sizes(fastafile)
    candidatebed = Bed()
    cbeds = []
    # For each unplaced scaffold, find most likely placement and orientation
    for scf, beds in sorted(unplaced.items()):
        print >> log
        ranges = []
        for a, b in beds:
            aname, astrand = a.accn, a.strand
            bname, bstrand = b.accn, b.strand
            aseqid, bseqid = a.seqid, b.seqid
            pa, lib = mf[aname]

            print >> log, a
            print >> log, b

            # Same strand on both reads means `b` is reverse-complemented
            # (in place) before the offset is computed.
            flip_b = (astrand == bstrand)
            fbstrand = '-' if flip_b else '+'
            if flip_b:
                b.reverse_complement(sizes)

            lmin, lmax = lib.min, lib.max

            L = sizes.get_size(scf)
            assert astrand in ('+', '-')
            # Candidate start interval on aseqid, shifted by the library
            # min/max; the direction depends on the strand of `a`.
            if astrand == '+':
                offset = a.start - b.end
                sstart, sstop = offset + lmin, offset + lmax
            else:
                offset = a.end - b.start + L
                sstart, sstop = offset - lmax, offset - lmin

            # Prevent out of range error
            # (both ends are clamped into [0, size - 1] of aseqid)
            size = sizes.get_size(aseqid)
            sstart = max(0, sstart)
            sstop = max(0, sstop)
            sstart = min(size - 1, sstart)
            sstop = min(size - 1, sstop)

            start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
            print >> log, "*" + "\t".join(str(x) for x in start_range)
            ranges.append(start_range)

        mranges = [x[:3] for x in ranges]
        # Determine placement by finding the interval with the most support
        # NOTE(review): ranges_depth is defined elsewhere; from its use here
        # it presumably yields lists of (seqid, start, end, depth) - confirm.
        rd = ranges_depth(mranges, sizes.mapping, verbose=False)
        alldepths = []
        for depth in rd:
            alldepths.extend(depth)
        print >> log, alldepths

        # The last element of each entry is compared as the depth.
        maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
        if maxdepth < minlinks:
            print >> log, "Insufficient links ({0} < {1})".format(maxdepth, minlinks)
            continue

        # All intervals reaching the maximum depth must be on one sequence.
        candidates = [x for x in alldepths if x[-1] == maxdepth]
        nseqids = len(set(x[0] for x in candidates))
        if nseqids != 1:
            print >> log, "Multiple conflicting candidates found"
            continue

        seqid, mmin, mmax, depth = candidates[0]
        # Widen to the min/max over every max-depth interval.
        mmin, mmax = range_minmax([x[1:3] for x in candidates])
        if (mmax - mmin) > maxdist:
            # This branch used to log the "Multiple conflicting candidates"
            # text, which misreported why the scaffold was skipped.
            print >> log, "(min, max) distance greater than library maxdist"
            continue

        # Determine orientation by voting
        # (only ranges overlapping the chosen interval get a vote; ties
        # resolve to '+')
        nplus, nminus = 0, 0
        arange = (seqid, mmin, mmax)
        for sid, start, end, sf, sc, fbstrand in ranges:
            brange = (sid, start, end)
            if range_overlap(arange, brange):
                if fbstrand == '+':
                    nplus += 1
                else:
                    nminus += 1

        fbstrand = '+' if nplus >= nminus else '-'

        candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
        bedline = BedLine("\t".join((str(x) for x in candidate)))
        cbeds.append(bedline)
        print >> log, "Plus: {0}, Minus: {1}".format(nplus, nminus)
        print >> log, candidate

    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".format(
        len(candidatebed)))
    log.close()

    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)
示例#15
0
文件: btab.py 项目: zhaotao1987/jcvi
def gff(args):
    """
    %prog gff btabfile

    Convert btab file generated by AAT to gff3 format.
    """
    # Usage docstring (consumed by OptionParser) is left untouched.
    #
    # Every btab line is emitted as a child feature; lines sharing a `key`
    # are grouped under one synthesized parent feature whose query span is
    # the min/max of the children's (qStart, qStop) coordinates.
    from jcvi.utils.range import range_minmax
    from jcvi.formats.gff import valid_gff_parent_child, valid_gff_type

    source_help = ("Specify GFF source."
                   " By default, it picks algorithm used to generate btab file."
                   " [default: %default]")
    type_help = "GFF feature type [default: %default]"

    p = OptionParser(gff.__doc__)
    p.add_option("--source", default=None, help=source_help)
    p.add_option("--type", default="protein_match", choices=valid_gff_type,
                 help=type_help)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    btabfile, = args
    groups = {}
    btab = Btab(btabfile, aat_dialect=True)
    osource = opts.source or "aat"
    otype = opts.type
    # Child feature type that belongs to the selected parent type.
    octype = valid_gff_parent_child[otype]

    for record in btab:
        nargs = record.nargs  # column count, reused for the parent lines
        gff_id = record.query + "-" + otype + "{0:05d}".format(record.chainNum)
        key = record.key
        if key not in groups:
            groups[key] = dict([
                ('id', gff_id),
                ('method', record.method),
                ('query', record.query),
                ('subject', record.subject),
                ('strand', record.qStrand),
                ('sDesc', record.sDesc),
                ('coords', []),
                ('children', []),
            ])

        group = groups[key]
        group['coords'].append((record.qStart, record.qStop))
        child = record.gffline(source=osource, type=octype, id=gff_id)
        group['children'].append(child)

    for group in groups.itervalues():
        # Blank line of zeros, then overwritten with the group's fields.
        blank = "\t".join("0" for _ in range(nargs))
        parent = BtabLine(blank, aat_dialect=True)
        gff_id = group['id']
        parent.query = group['query']
        parent.method = group['method']
        parent.subject = group['subject']
        parent.qStrand = group['strand']
        parent.sDesc = group['sDesc']
        parent.qStart, parent.qStop = range_minmax(group['coords'])
        parent_line = parent.gffline(source=osource, type=otype,
                                     primary_tag="ID", id=gff_id)
        print(parent_line)
        print("\n".join(group['children']))
示例#16
0
def completeness(args):
    """
    %prog completeness blastfile ref.fasta > outfile

    Print statistics for each gene, the coverage of the alignment onto the best hit,
    as an indicator for completeness of the gene model. For example, one might
    BLAST sugarcane ESTs against sorghum annotations as reference, to find
    full-length transcripts.
    """
    from jcvi.utils.range import range_minmax
    from jcvi.utils.cbook import SummaryStats

    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output rather than plain documentation.
    p = OptionParser(completeness.__doc__)
    p.add_option(
        "--ids",
        help="Save ids that are over 50% complete [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    idsfile = opts.ids
    # Indexed by subject name below and used as that subject's length.
    sizes = Sizes(fastafile).mapping

    valid = []   # queries whose coverage exceeds `cutoff`
    data = []    # (nterminal_dist, cterminal_dist, covered) per query
    cutoff = 50
    for query, blines in groupby(BlastSlow(blastfile), key=lambda x: x.query):
        hits = list(blines)
        # The first line of each query group supplies the subject whose
        # length is used for the statistics.
        subject = hits[0].subject

        # NOTE(review): the ranges are collected from every line of the
        # group, not only from lines that hit `subject` -- confirm that the
        # input is expected to hold a single subject per query.
        rmin, rmax = range_minmax([(x.sstart, x.sstop) for x in hits])
        subject_len = sizes[subject]

        nterminal_dist = rmin - 1
        cterminal_dist = subject_len - rmax
        # NOTE(review): under Python 2 this is integer division when both
        # operands are ints; the expression is deliberately left unchanged.
        covered = (rmax - rmin + 1) * 100 / subject_len
        if covered > cutoff:
            valid.append(query)

        data.append((nterminal_dist, cterminal_dist, covered))
        print("\t".join(
            str(x)
            for x in (query, subject, nterminal_dist, cterminal_dist, covered)))

    summary = ["Total: {0}, Coverage > {1}%: {2}".format(
        len(data), cutoff, len(valid))]
    if data:
        # Guarded: with no hits at all, zip(*data) yields nothing and the
        # three-way unpack would raise ValueError.
        nd, cd, cv = zip(*data)
        summary.append("N-terminal: {0}".format(SummaryStats(nd)))
        summary.append("C-terminal: {0}".format(SummaryStats(cd)))
        summary.append("Coverage: {0}".format(SummaryStats(cv)))
    sys.stderr.write("\n".join(summary) + "\n")

    if idsfile:
        # `with` guarantees the handle is closed even if the write fails.
        with open(idsfile, "w") as fw:
            fw.write("\n".join(valid) + "\n")
        logging.debug("A total of {0} ids (cov > {1} %) written to `{2}`.".\
                      format(len(valid), cutoff, idsfile))
示例#17
0
文件: qc.py 项目: arvin580/jcvi
def get_cds_minmax(g, cid, level=2):
    """
    Return `range_minmax` over the (start, end) pairs of the CDS features
    found among `g.children(cid, level)`.
    """
    cdsranges = []
    for child in g.children(cid, level):
        if child.featuretype != "CDS":
            continue
        cdsranges.append((child.start, child.end))
    return range_minmax(cdsranges)
示例#18
0
文件: qc.py 项目: arvin580/jcvi
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.

    If reference GFF3 is provided, reinstate UTRs from reference
    transcripts after trimming.

    Note: After running trimUTR, it is advised to also run
    `python -m jcvi.formats.gff fixboundaries` on the resultant GFF3
    to adjust the boundaries of all parent 'gene' features
    """
    import gffutils
    from jcvi.formats.base import SetFile

    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output rather than plain documentation.
    p = OptionParser(trimUTR.__doc__)
    p.add_option("--trim5", default=None, type="str",
        help="File containing gene list for 5' UTR trimming")
    p.add_option("--trim3", default=None, type="str",
        help="File containing gene list for 3' UTR trimming")
    # The two help fragments are concatenated; the trailing space keeps the
    # message from reading "trim backbased".
    p.add_option("--trimrange", default=None, type="str",
        help="File containing gene list for UTR trim back " +
             "based on suggested (start, stop) coordinate range")
    p.add_option("--refgff", default=None, type="str",
        help="Reference GFF3 used as fallback to replace UTRs")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    gff = make_index(gffile)

    # When neither --trim5 nor --trim3 is given, both=True is passed to every
    # trim()/reinstate() call below.
    trim_both = not (opts.trim5 or opts.trim3)
    trim5 = SetFile(opts.trim5) if opts.trim5 else set()
    trim3 = SetFile(opts.trim3) if opts.trim3 else set()

    # feature id -> (start, stop), read from a 3-column tab-separated file
    trimrange = dict()
    if opts.trimrange:
        trf = must_open(opts.trimrange)
        try:
            for tr in trf:
                fields = tr.split("\t")
                assert len(fields) == 3, \
                    "Must specify (start, stop) coordinate range"
                feat_id, start, stop = fields
                trimrange[feat_id] = (int(start), int(stop))
        finally:
            # Close the handle even when a malformed line aborts parsing.
            trf.close()

    refgff = make_index(opts.refgff) if opts.refgff else None

    fw = must_open(opts.outfile, "w")
    try:
        for feat in gff.iter_by_parent_childs(featuretype="gene", order_by=("seqid", "start"), level=1):
            for c in feat:
                cid, ctype = c.id, c.featuretype
                cparent = c.attributes.get('Parent', [None])[0]

                if ctype == "gene":
                    t5 = cid in trim5
                    t3 = cid in trim3
                    start, end = get_cds_minmax(gff, cid)
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                    fprint(c, fw)
                    continue

                if ctype != "mRNA":
                    # Only gene and mRNA features are handled at this level.
                    continue

                t5, t3 = False, False
                utr_types, extras = [], set()
                # An mRNA is selected when it or its parent is listed; its id
                # is then recorded in the corresponding set.
                if cid in trim5 or cparent in trim5:
                    t5 = True
                    trim5.add(cid)
                if cid in trim3 or cparent in trim3:
                    t3 = True
                    trim3.add(cid)

                refc = None
                if refgff:
                    try:
                        refc = refgff[cid]
                        refctype = refc.featuretype
                        refptype = refgff[refc.attributes['Parent'][0]].featuretype
                        if refctype == "mRNA" and refptype == "gene":
                            if cmp_children(cid, gff, refgff, cftype="CDS"):
                                reinstate(c, refc, trim5=t5, trim3=t3, both=trim_both)
                                if t5:
                                    utr_types.append('five_prime_UTR')
                                if t3:
                                    utr_types.append('three_prime_UTR')
                                # Collect the reference UTRs, plus the exons in
                                # each UTR region that belong to this mRNA;
                                # they are written out after the children.
                                for utr_type in utr_types:
                                    for utr in refgff.children(refc, featuretype=utr_type):
                                        extras.add(utr)
                                        for exon in refgff.region(region=utr, featuretype="exon"):
                                            if exon.attributes['Parent'][0] == cid:
                                                extras.add(exon)
                        else:
                            refc = None
                    except gffutils.exceptions.FeatureNotFoundError:
                        # Lookup failed in the reference; carry on with
                        # whatever refc currently holds.
                        pass

                start, end = get_cds_minmax(gff, cid, level=1)
                if cid in trimrange:
                    start, end = range_minmax([trimrange[cid], (start, end)])
                # The mRNA itself goes through trim() only when no reference
                # transcript was kept in refc.
                if not refc:
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)

                # extras is not modified below, so filter its exons once.
                extra_exons = [x for x in extras if x.featuretype == 'exon']
                for cc in gff.children(cid, order_by=("start")):
                    _ctype = cc.featuretype
                    if _ctype in utr_types:
                        # Superseded by the features collected in extras.
                        continue
                    if _ctype == "CDS":
                        fprint(cc, fw)
                        continue
                    if _ctype == "exon":
                        eskip = [range_overlap(to_range(cc), to_range(x))
                                 for x in extra_exons]
                        if any(eskip):
                            continue
                    trim(cc, start, end, trim5=t5, trim3=t3, both=trim_both)
                    fprint(cc, fw)

                for x in extras:
                    fprint(x, fw)
    finally:
        # Release the output handle even if processing fails part-way.
        fw.close()
示例#19
0
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.

    If reference GFF3 is provided, reinstate UTRs from reference
    transcripts after trimming.

    Note: After running trimUTR, it is advised to also run
    `python -m jcvi.formats.gff fixboundaries` on the resultant GFF3
    to adjust the boundaries of all parent 'gene' features
    """
    import gffutils
    from jcvi.formats.base import SetFile

    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output rather than plain documentation.
    p = OptionParser(trimUTR.__doc__)
    p.add_option(
        "--trim5",
        default=None,
        type="str",
        help="File containing gene list for 5' UTR trimming",
    )
    p.add_option(
        "--trim3",
        default=None,
        type="str",
        help="File containing gene list for 3' UTR trimming",
    )
    # The two help fragments are concatenated; the trailing space keeps the
    # message from reading "trim backbased".
    p.add_option(
        "--trimrange",
        default=None,
        type="str",
        help="File containing gene list for UTR trim back " +
        "based on suggested (start, stop) coordinate range",
    )
    p.add_option(
        "--refgff",
        default=None,
        type="str",
        help="Reference GFF3 used as fallback to replace UTRs",
    )
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (gffile, ) = args
    gff = make_index(gffile)

    # When neither --trim5 nor --trim3 is given, both=True is passed to every
    # trim()/reinstate() call below.
    trim_both = not (opts.trim5 or opts.trim3)
    trim5 = SetFile(opts.trim5) if opts.trim5 else set()
    trim3 = SetFile(opts.trim3) if opts.trim3 else set()

    # feature id -> (start, stop), read from a 3-column tab-separated file
    trimrange = dict()
    if opts.trimrange:
        trf = must_open(opts.trimrange)
        try:
            for tr in trf:
                columns = tr.split("\t")
                assert (len(columns) == 3
                        ), "Must specify (start, stop) coordinate range"
                feature_id, start, stop = columns
                trimrange[feature_id] = (int(start), int(stop))
        finally:
            # Close the handle even when a malformed line aborts parsing.
            trf.close()

    refgff = make_index(opts.refgff) if opts.refgff else None

    fw = must_open(opts.outfile, "w")
    try:
        for feat in gff.iter_by_parent_childs(featuretype="gene",
                                              order_by=("seqid", "start"),
                                              level=1):
            for c in feat:
                cid, ctype, cparent = (
                    c.id,
                    c.featuretype,
                    c.attributes.get("Parent", [None])[0],
                )

                if ctype == "gene":
                    t5 = cid in trim5
                    t3 = cid in trim3
                    start, end = get_cds_minmax(gff, cid)
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                    fprint(c, fw)
                    continue

                if ctype != "mRNA":
                    # Only gene and mRNA features are handled at this level.
                    continue

                t5, t3 = False, False
                utr_types, extras = [], set()
                # An mRNA is selected when it or its parent is listed; its id
                # is then recorded in the corresponding set.
                if cid in trim5 or cparent in trim5:
                    t5 = True
                    trim5.add(cid)
                if cid in trim3 or cparent in trim3:
                    t3 = True
                    trim3.add(cid)

                refc = None
                if refgff:
                    try:
                        refc = refgff[cid]
                        refctype = refc.featuretype
                        refptype = refgff[refc.attributes["Parent"]
                                          [0]].featuretype
                        if refctype == "mRNA" and refptype == "gene":
                            if cmp_children(cid, gff, refgff, cftype="CDS"):
                                reinstate(c,
                                          refc,
                                          trim5=t5,
                                          trim3=t3,
                                          both=trim_both)
                                if t5:
                                    utr_types.append("five_prime_UTR")
                                if t3:
                                    utr_types.append("three_prime_UTR")
                                # Collect the reference UTRs, plus the exons in
                                # each UTR region that belong to this mRNA;
                                # they are written out after the children.
                                for utr_type in utr_types:
                                    for utr in refgff.children(
                                            refc, featuretype=utr_type):
                                        extras.add(utr)
                                        for exon in refgff.region(
                                                region=utr,
                                                featuretype="exon"):
                                            if exon.attributes["Parent"][
                                                    0] == cid:
                                                extras.add(exon)
                        else:
                            refc = None
                    except gffutils.exceptions.FeatureNotFoundError:
                        # Lookup failed in the reference; carry on with
                        # whatever refc currently holds.
                        pass

                start, end = get_cds_minmax(gff, cid, level=1)
                if cid in trimrange:
                    start, end = range_minmax([trimrange[cid], (start, end)])
                # The mRNA itself goes through trim() only when no reference
                # transcript was kept in refc.
                if not refc:
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)

                # extras is not modified below, so filter its exons once.
                extra_exons = [x for x in extras if x.featuretype == "exon"]
                for cc in gff.children(cid, order_by="start"):
                    _ctype = cc.featuretype
                    if _ctype in utr_types:
                        # Superseded by the features collected in extras.
                        continue
                    if _ctype == "CDS":
                        fprint(cc, fw)
                        continue
                    if _ctype == "exon":
                        eskip = [
                            range_overlap(to_range(cc), to_range(x))
                            for x in extra_exons
                        ]
                        if any(eskip):
                            continue
                    trim(cc,
                         start,
                         end,
                         trim5=t5,
                         trim3=t3,
                         both=trim_both)
                    fprint(cc, fw)

                for x in extras:
                    fprint(x, fw)
    finally:
        # Release the output handle even if processing fails part-way.
        fw.close()
示例#20
0
def loss(args):
    """
    %prog loss a.b.i1.blocks [a.b-genomic.blast]

    Extract likely gene loss candidates between genome a and b.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output rather than plain documentation.
    p = OptionParser(loss.__doc__)
    p.add_option("--bed",
                 default=False,
                 action="store_true",
                 help="Genomic BLAST is in bed format [default: %default]")
    p.add_option("--gdist",
                 default=20,
                 type="int",
                 help="Gene distance [default: %default]")
    p.add_option("--bdist",
                 default=20000,
                 type="int",
                 help="Base pair distance [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    def pick_hit(proxy, hits, to_hsp):
        # Return ("S", hsp) for the first hit whose hsp overlaps `proxy`,
        # otherwise ("NS", hsp of the first hit).
        for hit in hits:
            hsp = to_hsp(hit)
            if range_overlap(proxy, hsp):
                return "S", hsp
        return "NS", to_hsp(hits[0])

    blocksfile = args[0]
    emptyblast = (len(args) == 1)
    if emptyblast:
        # No genomic BLAST supplied: create an empty placeholder file so the
        # readers below still have something to open.
        genomicblast = "empty.blast"
        sh("touch {0}".format(genomicblast))
    else:
        genomicblast = args[1]

    try:
        gdist, bdist = opts.gdist, opts.bdist
        qbed, sbed, qorder, sorder, is_self = check_beds(blocksfile, p, opts)

        blocks = []
        genetrack = {}   # first column -> second column of the blocks file
        proxytrack = {}  # first column -> (seqid, start, end) target region
        with open(blocksfile) as fp:
            for row in fp:
                a, b = row.split()
                genetrack[a] = b
                blocks.append((a, b))

        # Consecutive rows sharing the same second column form one group.
        data = [(key, list(rows))
                for key, rows in groupby(blocks, key=lambda x: x[-1])]

        imax = len(data) - 1
        for i, (key, rows) in enumerate(data):
            # Only groups whose second column is '.' and that have a
            # neighbouring group on both sides are considered.
            if i == 0 or i == imax or key != '.':
                continue

            before = data[i - 1][0]
            after = data[i + 1][0]
            bi, bx = sorder[before]
            ai, ax = sorder[after]
            dist = abs(bi - ai)
            if bx.seqid != ax.seqid or dist > gdist:
                continue

            # Span of the two flanking entries, padded by bdist on each side
            # and clamped so the start never drops below 1.
            start, end = range_minmax(((bx.start, bx.end), (ax.start, ax.end)))
            start, end = max(start - bdist, 1), end + bdist
            proxy = (bx.seqid, start, end)
            for a, _ in rows:
                proxytrack[a] = proxy

        # "S": a hit overlaps the proxy region; "NS": hits exist but none
        # overlaps. Entries with no hits at all get "NF" at output time.
        tags = {}
        if opts.bed:
            bed = Bed(genomicblast, sorted=False)
            hits_by_query = groupby(
                bed, key=lambda x: gene_name(x.accn.rsplit(".", 1)[0]))
            for query, bb in hits_by_query:
                bb = list(bb)
                if query not in proxytrack:
                    continue

                tag, hsp = pick_hit(proxytrack[query], bb,
                                    lambda h: (h.seqid, h.start, h.end))
                # Replace the proxy region with the selected hit's range.
                proxytrack[query] = hsp
                tags[query] = tag

        else:
            blast = Blast(genomicblast)
            for query, bb in blast.iter_hits():
                bb = list(bb)
                query = gene_name(query)
                if query not in proxytrack:
                    continue

                tag, hsp = pick_hit(proxytrack[query], bb,
                                    lambda h: (h.subject, h.sstart, h.sstop))
                # Replace the proxy region with the selected hit's range.
                proxytrack[query] = hsp
                tags[query] = tag

        for b in qbed:
            accn = b.accn
            target_region = genetrack[accn]
            if accn in proxytrack:
                ptag = "[{0}]".format(tags.get(accn, "NF"))
                target_region = ptag + region_str(proxytrack[accn])

            print("\t".join((b.seqid, accn, target_region)))
    finally:
        # Remove the placeholder even when processing fails part-way.
        if emptyblast:
            sh("rm -f {0}".format(genomicblast))
示例#21
0
def test_range_minmax(ranges, expected):
    """Check that range_minmax(ranges) equals the expected value."""
    from jcvi.utils.range import range_minmax

    observed = range_minmax(ranges)
    assert observed == expected