Пример #1
0
def make_cns_to_at_map(blast_file, tair_desc, gff, query, subject, outdir):
    """
    Tag shelved cns records with the at hit found in the cns vs at blast.

    For the first blast row seen for each distinct query id, the matching
    shelve record gets an ``at_rna`` entry of the form::

        <subject>;;eval(<eval>)<tair_desc[subject]>

    e.g. for subject 'AT2G38030' described as 'pre-tRNA'::

        'AT2G38030;;eval(1e-20)pre-tRNA'

    The shelve key is the query id with its first three characters removed.

    Parameters:
        blast_file: passed to ``blast_misc.blast_array`` with
            ``best_hit=True``.
        tair_desc: mapping indexed by subject name; its value is appended
            to the ``at_rna`` string, so it must hold every subject that
            appears in the blast rows.
        gff: not used by this function; kept so callers need not change.
        query, subject, outdir: forwarded to ``get_shelve``.

    Side effects: rewrites each touched shelve record, writes the number of
    tagged cnss to stderr and closes the shelve (also when a lookup fails).
    A key missing from the shelve or from ``tair_desc`` is not handled here
    and propagates to the caller.
    """
    from blast_misc import blast_array

    db = get_shelve(outdir, query, subject)
    try:
        hits = blast_array(blast_file, best_hit=True)
        seen = set()
        updated = 0
        for row in hits:
            cns_id = str(row['query'])
            # only the first row of each query is used
            if cns_id in seen:
                continue
            seen.add(cns_id)

            # get rid of the starting SB or OS (presumably plus one
            # separator character, hence 3 -- TODO confirm)
            key = cns_id[3:]

            current = db[key]
            at_name = str(row['subject'])
            current["at_rna"] = (at_name + ';;eval(' + str(row['eval']) + ')'
                                 + tair_desc[at_name])
            updated += 1
            # store the record back: the in-place change above is not
            # assumed to be persisted by the shelve on its own
            db[key] = current
        sys.stderr.write("%i cns's tagged as hitting at_rna\n" % updated)
    finally:
        db.close()
Пример #2
0
def make_cns_to_at_map(blast_file, tair_desc, gff, query, subject, outdir):
    """
    Tag shelved cns records with the at hit found in the cns vs at blast.

    For the first blast row seen for each distinct query id, the matching
    shelve record gets an ``at_rna`` entry of the form::

        <subject>;;eval(<eval>)<tair_desc[subject]>

    e.g. for subject "AT2G38030" described as "pre-tRNA"::

        "AT2G38030;;eval(1e-20)pre-tRNA"

    The shelve key is the query id with its first three characters removed.

    Parameters:
        blast_file: passed to ``blast_misc.blast_array`` with
            ``best_hit=True``.
        tair_desc: mapping indexed by subject name; its value is appended
            to the ``at_rna`` string, so it must hold every subject that
            appears in the blast rows.
        gff: not used by this function; kept so callers need not change.
        query, subject, outdir: forwarded to ``get_shelve``.

    Side effects: rewrites each touched shelve record, writes the number of
    tagged cnss to stderr and closes the shelve (also when a lookup fails).
    A key missing from the shelve or from ``tair_desc`` is not handled here
    and propagates to the caller.
    """
    from blast_misc import blast_array

    db = get_shelve(outdir, query, subject)
    try:
        hits = blast_array(blast_file, best_hit=True)
        seen = set()
        updated = 0
        for row in hits:
            cns_id = str(row["query"])
            # only the first row of each query is used
            if cns_id in seen:
                continue
            seen.add(cns_id)

            # get rid of the starting SB or OS (presumably plus one
            # separator character, hence 3 -- TODO confirm)
            key = cns_id[3:]

            current = db[key]
            at_name = str(row["subject"])
            current["at_rna"] = at_name + ";;eval(" + str(row["eval"]) + ")" + tair_desc[at_name]
            updated += 1
            # store the record back: the in-place change above is not
            # assumed to be persisted by the shelve on its own
            db[key] = current
        sys.stderr.write("%i cns's tagged as hitting at_rna\n" % updated)
    finally:
        db.close()
Пример #3
0
def find_colinear_hits(blastfile, qeval, seval, mask='query', as_str=False):
    """
    Find blast hits that lie roughly on one line and map them to image rows.

    Steps:
      1. open the sqlite file named like `blastfile` with everything from
         its last "." replaced by ".sqlite";
      2. load the blast with `blast_array` and keep rows whose 'eval' is
         below both `qeval` and `seval`;
      3. drop every hit that is not entirely before or entirely after each
         non-HSP feature stored in `image_data`;
      4. run up to 4 rounds of linear regression of sstart on qstart,
         dropping hits too far above the fitted line, until r > 0.8 or a
         round removes nothing;
      5. look the surviving hits up in `image_data` by their absolute query
         coordinates.

    Parameters:
        blastfile: path of the blast output.
        qeval, seval: e-value cutoffs (both applied to the 'eval' column).
        mask: when 'query', only features of image 1 are used for masking.
            Other values also select features of other images, which the
            assert below currently rejects.
        as_str: not used by this function.

    Returns:
        None when no hit survives masking; otherwise a list with one
        `[query_row, pair_row]` entry per hit found in `image_data`, where
        query_row is (xmin, ymin, xmax, ymax, id) and pair_row is the same
        columns of the row whose id equals the query row's pair_id (None if
        that row does not exist). Hits without a matching query row are
        skipped.

    The name `plot` is not defined here; it is read from the enclosing
    scope and, when true, the intermediate hit sets are drawn with pylab
    and saved to /var/www/ms_tmp/d.png.
    """
    sqlite_file = blastfile[:blastfile.rfind(".")] + ".sqlite"
    db = sqlite3.connect(sqlite_file)
    try:
        cur = db.cursor()

        # need these to convert the absolute coords in sqlite to match
        # the local positions in the fresh blast
        qmin, smin = [x[0] for x in cur.execute('SELECT bpmin FROM image_info ORDER BY id')]

        # so we mask the new blast with anything that's NOT an HSP
        sql = "SELECT image_id, bpmin, bpmax FROM image_data WHERE type != 'HSP'"
        if mask == 'query':
            sql += ' AND image_id = 1'

        b = blast_array(blastfile, dopickle=False, best_hit=0, maxkeep=99999)
        if plot:
            pylab.plot(b['qstart'], b['sstart'], "kx")
        # NOTE(review): both cutoffs test the same 'eval' column, so this is
        # eval < min(qeval, seval) -- confirm that is intended.
        b = b[(b['eval'] < qeval) & (b['eval'] < seval)]

        if plot:
            pylab.plot(b['qstart'], b['sstart'], "ro")

        # TODO: remove stuff that's way off the diagonal?

        for row in cur.execute(sql).fetchall():
            if row[0] == 1:
                start, stop, lmin = 'qstart', 'qstop', qmin
            else:
                start, stop, lmin = 'sstart', 'sstop', smin
            assert (start, stop) == ('qstart', 'qstop')  # for now, always only using query
            cds_start, cds_stop = row[1] - lmin + 1, row[2] - lmin + 1
            # numexpr resolves the names in the expression from this
            # function's local variables, so they must keep these names.
            bstart, bstop = b[start], b[stop]
            b = b[numexpr.evaluate("(((bstart < cds_start) & (bstop < cds_start)) | ((bstop  > cds_stop ) & (bstart > cds_stop)))")]

        if not b.shape[0]:
            return None
        delta = 0.2 * b['sstart'].max()

        # here, try to find a sort of line, and keep removing outliers to only get linear cnss
        for _ in range(4):
            slope, intercept, r = linregress(b['qstart'], b['sstart'])[:3]
            if r > 0.8:
                break
            bqstart = b['qstart']
            expected = numexpr.evaluate('intercept + slope * bqstart')
            bsstart = b['sstart']
            s = b.shape[0]
            # NOTE(review): the difference is signed, so only hits at least
            # delta ABOVE the line are dropped -- confirm abs() is not wanted.
            b = b[numexpr.evaluate('bsstart - expected < delta')]
            if s == b.shape[0]:
                break  # not removing anything.

        if plot:
            pylab.plot(b['qstart'], b['sstart'], "bo")
            pylab.savefig('/var/www/ms_tmp/d.png')

        cnss = []
        for local_start, local_stop in zip(b['qstart'], b['qstop']):
            # back from blast-local positions to the absolute coords in sqlite
            qstart = int(local_start) + qmin - 1
            qstop = int(local_stop) + qmin - 1
            qres = cur.execute('SELECT xmin, ymin, xmax, ymax, id, pair_id FROM image_data WHERE image_id = 1 AND bpmin = ? AND bpmax = ?', (qstart, qstop)).fetchone()
            # fetchone() gives None when nothing matches: check before slicing
            if not qres:
                continue
            sres = cur.execute('SELECT xmin, ymin, xmax, ymax, id FROM image_data WHERE id = ?', (qres[-1],)).fetchone()
            cnss.append([qres[:-1], sres])
        return cnss
    finally:
        db.close()
Пример #4
0
def find_colinear_hits(blastfile, qeval, seval, mask='query', as_str=False):
    """
    Find blast hits that lie roughly on one line and map them to image rows.

    Steps:
      1. open the sqlite file named like `blastfile` with everything from
         its last "." replaced by ".sqlite";
      2. load the blast with `blast_array` and keep rows whose 'eval' is
         below both `qeval` and `seval`;
      3. drop every hit that is not entirely before or entirely after each
         non-HSP feature stored in `image_data`;
      4. run up to 4 rounds of linear regression of sstart on qstart,
         dropping hits too far above the fitted line, until r > 0.8 or a
         round removes nothing;
      5. look the surviving hits up in `image_data` by their absolute query
         coordinates.

    Parameters:
        blastfile: path of the blast output.
        qeval, seval: e-value cutoffs (both applied to the 'eval' column).
        mask: when 'query', only features of image 1 are used for masking.
            Other values also select features of other images, which the
            assert below currently rejects.
        as_str: not used by this function.

    Returns:
        None when no hit survives masking; otherwise a list with one
        `[query_row, pair_row]` entry per hit found in `image_data`, where
        query_row is (xmin, ymin, xmax, ymax, id) and pair_row is the same
        columns of the row whose id equals the query row's pair_id (None if
        that row does not exist). Hits without a matching query row are
        skipped.

    The name `plot` is not defined here; it is read from the enclosing
    scope and, when true, the intermediate hit sets are drawn with pylab
    and saved to /var/www/ms_tmp/d.png.
    """
    sqlite_file = blastfile[:blastfile.rfind(".")] + ".sqlite"
    db = sqlite3.connect(sqlite_file)
    try:
        cur = db.cursor()

        # need these to convert the absolute coords in sqlite to match
        # the local positions in the fresh blast
        qmin, smin = [
            x[0]
            for x in cur.execute('SELECT bpmin FROM image_info ORDER BY id')
        ]

        # so we mask the new blast with anything that's NOT an HSP
        sql = "SELECT image_id, bpmin, bpmax FROM image_data WHERE type != 'HSP'"
        if mask == 'query':
            sql += ' AND image_id = 1'

        b = blast_array(blastfile, dopickle=False, best_hit=0, maxkeep=99999)
        if plot:
            pylab.plot(b['qstart'], b['sstart'], "kx")
        # NOTE(review): both cutoffs test the same 'eval' column, so this is
        # eval < min(qeval, seval) -- confirm that is intended.
        b = b[(b['eval'] < qeval) & (b['eval'] < seval)]

        if plot:
            pylab.plot(b['qstart'], b['sstart'], "ro")

        # TODO: remove stuff that's way off the diagonal?

        for row in cur.execute(sql).fetchall():
            if row[0] == 1:
                start, stop, lmin = 'qstart', 'qstop', qmin
            else:
                start, stop, lmin = 'sstart', 'sstop', smin
            # for now, always only using query
            assert (start, stop) == ('qstart', 'qstop')
            cds_start, cds_stop = row[1] - lmin + 1, row[2] - lmin + 1
            # numexpr resolves the names in the expression from this
            # function's local variables, so they must keep these names.
            bstart, bstop = b[start], b[stop]
            b = b[numexpr.evaluate(
                "(((bstart < cds_start) & (bstop < cds_start)) | ((bstop  > cds_stop ) & (bstart > cds_stop)))"
            )]

        if not b.shape[0]:
            return None
        delta = 0.2 * b['sstart'].max()

        # here, try to find a sort of line, and keep removing outliers to only get linear cnss
        for _ in range(4):
            slope, intercept, r = linregress(b['qstart'], b['sstart'])[:3]
            if r > 0.8:
                break
            bqstart = b['qstart']
            expected = numexpr.evaluate('intercept + slope * bqstart')
            bsstart = b['sstart']
            s = b.shape[0]
            # NOTE(review): the difference is signed, so only hits at least
            # delta ABOVE the line are dropped -- confirm abs() is not wanted.
            b = b[numexpr.evaluate('bsstart - expected < delta')]
            if s == b.shape[0]:
                break  # not removing anything.

        if plot:
            pylab.plot(b['qstart'], b['sstart'], "bo")
            pylab.savefig('/var/www/ms_tmp/d.png')

        cnss = []
        for local_start, local_stop in zip(b['qstart'], b['qstop']):
            # back from blast-local positions to the absolute coords in sqlite
            qstart = int(local_start) + qmin - 1
            qstop = int(local_stop) + qmin - 1
            qres = cur.execute(
                'SELECT xmin, ymin, xmax, ymax, id, pair_id FROM image_data WHERE image_id = 1 AND bpmin = ? AND bpmax = ?',
                (qstart, qstop)).fetchone()
            # fetchone() gives None when nothing matches: check before slicing
            if not qres:
                continue
            sres = cur.execute(
                'SELECT xmin, ymin, xmax, ymax, id FROM image_data WHERE id = ?',
                (qres[-1], )).fetchone()
            cnss.append([qres[:-1], sres])
        return cnss
    finally:
        db.close()
Пример #5
0
import sys
sys.path.insert(0,".")
import blast_misc
import time
import operator

# Quick manual check: load the sample blast file through
# blast_misc.blast_array and dump the resulting array and its shape.
# (best_hit=0 / dopickle=0 presumably turn off best-hit filtering and the
# pickle cache -- confirm against blast_misc.)
blast_file = 'data/t.blast'
b = blast_misc.blast_array(blast_file, best_hit=0, maxkeep=999999, dopickle=0)
# Single-argument print(...) gives the same output under Python 2 and is
# also valid Python 3, unlike the bare `print x` statement.
print(b)
print(b.shape)