def make_cns_to_at_map(blast_file, tair_desc, gff, query, subject, outdir):
    """
    Tag CNS entries in the shelve with their Arabidopsis (at) RNA blast hit.

    The cns-vs-at blast file is loaded with ``best_hit=True`` and, for the
    first row seen for each distinct query, the matching shelve entry gets
    an ``"at_rna"`` field of the form::

        '<subject>;;eval(<eval>)<tair description>'

    e.g. a hit on 'AT2G38030' whose description is 'pre-tRNA'.  The updated
    shelve can then be used in make_better_datasheet.

    blast_file -- path handed to blast_misc.blast_array.
    tair_desc  -- mapping from subject name to its TAIR description string.
    gff        -- not used by this function; kept for interface
                  compatibility.
    query, subject, outdir -- forwarded to get_shelve to locate the shelve.

    Nothing is returned; the number of tagged entries is written to stderr.
    A query whose key is missing from the shelve, or a subject missing from
    tair_desc, raises from the corresponding lookup (presumably KeyError);
    the shelve is closed in either case.
    """
    from blast_misc import blast_array

    store = get_shelve(outdir, query, subject)
    try:
        hits = blast_array(blast_file, best_hit=True)
        seen = set()
        updated = 0
        for row in hits:
            qname = str(row['query'])
            # only the first row per query is used.  The original also
            # re-sorted all rows of the query into an unused `besthit`
            # (quadratic work, and sorted worst-eval first); that dead
            # computation is removed.
            if qname in seen:
                continue
            seen.add(qname)

            # drop the three-character prefix of the query name to obtain
            # the shelve key (original note: "get rid of starting SB or OS")
            key = qname[3:]
            sname = str(row['subject'])

            current = store[key]
            current["at_rna"] = (sname + ';;eval(' + str(row['eval']) + ')'
                                 + tair_desc[sname])
            # re-assign so the modified entry is written back to the shelve
            store[key] = current
            updated += 1

        sys.stderr.write("%i cns's tagged as hitting at_rna\n" % updated)
    finally:
        # release the shelve even when a lookup above fails
        store.close()
def make_cns_to_at_map(blast_file, tair_desc, gff, query, subject, outdir):
    """
    Tag CNS entries in the shelve with their Arabidopsis (at) RNA blast hit.

    The cns-vs-at blast file is loaded with ``best_hit=True`` and, for the
    first row seen for each distinct query, the matching shelve entry gets
    an ``"at_rna"`` field of the form::

        '<subject>;;eval(<eval>)<tair description>'

    e.g. a hit on 'AT2G38030' whose description is 'pre-tRNA'.  The updated
    shelve can then be used in make_better_datasheet.

    blast_file -- path handed to blast_misc.blast_array.
    tair_desc  -- mapping from subject name to its TAIR description string.
    gff        -- not used by this function; kept for interface
                  compatibility.
    query, subject, outdir -- forwarded to get_shelve to locate the shelve.

    Nothing is returned; the number of tagged entries is written to stderr.
    A query whose key is missing from the shelve, or a subject missing from
    tair_desc, raises from the corresponding lookup (presumably KeyError);
    the shelve is closed in either case.
    """
    from blast_misc import blast_array

    store = get_shelve(outdir, query, subject)
    try:
        hits = blast_array(blast_file, best_hit=True)
        seen = set()
        updated = 0
        for row in hits:
            qname = str(row["query"])
            # only the first row per query is used.  The original also
            # re-sorted all rows of the query into an unused `besthit`
            # (quadratic work, and sorted worst-eval first); that dead
            # computation is removed.
            if qname in seen:
                continue
            seen.add(qname)

            # drop the three-character prefix of the query name to obtain
            # the shelve key (original note: "get rid of starting SB or OS")
            key = qname[3:]
            sname = str(row["subject"])

            current = store[key]
            current["at_rna"] = (sname + ";;eval(" + str(row["eval"]) + ")"
                                 + tair_desc[sname])
            # re-assign so the modified entry is written back to the shelve
            store[key] = current
            updated += 1

        sys.stderr.write("%i cns's tagged as hitting at_rna\n" % updated)
    finally:
        # release the shelve even when a lookup above fails
        store.close()
def find_colinear_hits(blastfile, qeval, seval, mask='query', as_str=False):
    """
    Find blast hits that lie roughly on one line and map them to image rows.

    The sqlite database next to `blastfile` (same name, extension replaced
    by ``.sqlite``) supplies the absolute offsets and the non-HSP features
    used for masking.  Steps:

      1. load the blast and keep rows whose 'eval' is below both `qeval`
         and `seval`;
      2. drop hits that are not entirely before or entirely after each
         non-HSP feature read from image_data;
      3. up to four times, fit a line through (qstart, sstart) and drop
         points lying more than 20% of the max sstart above it, stopping
         once r > 0.8 or nothing more is removed;
      4. look the surviving hits up in image_data by absolute query
         coordinates.

    blastfile -- path to the blast output.
    qeval, seval -- e-value cutoffs; both are applied to the same 'eval'
                    column.
    mask   -- when 'query', only features with image_id = 1 are used for
              masking.  Other values select every non-HSP feature, but the
              assert below then rejects subject features.
    as_str -- not used by this function; kept for interface compatibility.

    Returns None when no hit survives masking.  Otherwise returns a list of
    two-element lists ``[query_row, subject_row]``: query_row is
    (xmin, ymin, xmax, ymax, id) for the query hit and subject_row is the
    same columns for the row whose id equals the query row's pair_id (None
    if that row does not exist).

    Relies on module-level names not visible in this block: `plot`,
    `pylab`, `numexpr`, `linregress`, `blast_array` and `sqlite3`.
    """
    sqlite_file = blastfile[:blastfile.rfind(".")] + ".sqlite"
    db = sqlite3.connect(sqlite_file)
    try:
        cur = db.cursor()

        # need these to convert the absolute coords in sqlite to match
        # the local positions in the fresh blast
        qmin, smin = [x[0] for x in
                      cur.execute('SELECT bpmin FROM image_info ORDER BY id')]

        # so we mask the new blast with anything that's NOT an HSP
        sql = "SELECT image_id, bpmin, bpmax FROM image_data WHERE type != 'HSP'"
        if mask == 'query':
            sql += ' AND image_id = 1'

        b = blast_array(blastfile, dopickle=False, best_hit=0, maxkeep=99999)
        if plot:
            pylab.plot(b['qstart'], b['sstart'], "kx")

        b = b[(b['eval'] < qeval) & (b['eval'] < seval)]
        if plot:
            pylab.plot(b['qstart'], b['sstart'], "ro")

        # TODO: remove stuff that's way off the diagonal?
        # NOTE: numexpr.evaluate reads the local names used inside its
        # expression strings, so those locals must keep these exact names.
        for row in cur.execute(sql).fetchall():
            if row[0] == 1:
                start, stop, lmin = 'qstart', 'qstop', qmin
            else:
                start, stop, lmin = 'sstart', 'sstop', smin
            # for now, always only using query
            assert (start, stop) == ('qstart', 'qstop')

            cds_start, cds_stop = row[1] - lmin + 1, row[2] - lmin + 1
            bstart, bstop = b[start], b[stop]
            # keep only hits entirely before or entirely after the feature
            b = b[numexpr.evaluate("(((bstart < cds_start) & (bstop < cds_start)) | ((bstop > cds_stop ) & (bstart > cds_stop)))")]

        if not b.shape[0]:
            return None

        delta = 0.2 * b['sstart'].max()
        # here, try to find a sort of line, and keep removing outliers to
        # only get linear cnss
        for _ in range(4):
            slope, intercept, r, _zy, _zz = linregress(b['qstart'], b['sstart'])
            if r > 0.8:
                break
            bqstart = b['qstart']
            expected = numexpr.evaluate('intercept + slope * bqstart')
            bsstart = b['sstart']
            s = b.shape[0]
            # NOTE(review): one-sided test -- only points more than `delta`
            # above the fitted line are dropped; confirm that is intended.
            b = b[numexpr.evaluate('bsstart - expected < delta')]
            if s == b.shape[0]:
                break  # not removing anything.

        if plot:
            pylab.plot(b['qstart'], b['sstart'], "bo")
            pylab.savefig('/var/www/ms_tmp/d.png')

        cnss = []
        for local_start, local_stop in zip(b['qstart'], b['qstop']):
            # back to the absolute coordinates stored in sqlite
            qstart = int(local_start) + qmin - 1
            qstop = int(local_stop) + qmin - 1
            qres = cur.execute('SELECT xmin, ymin, xmax, ymax, id, pair_id FROM image_data WHERE image_id = 1 AND bpmin = ? AND bpmax = ?', (qstart, qstop)).fetchone()
            # fetchone() gives None when no row matches; check this before
            # slicing (the original sliced first and raised TypeError).
            if not qres:
                continue
            # the last column (pair_id) points at the subject-side row
            sres = cur.execute('SELECT xmin, ymin, xmax, ymax, id FROM image_data WHERE id = ?', (qres[-1],)).fetchone()
            cnss.append([qres[:-1], sres])
        return cnss
    finally:
        # all rows are already fetched, so the connection can be released
        db.close()
def find_colinear_hits(blastfile, qeval, seval, mask='query', as_str=False):
    """
    Find blast hits that lie roughly on one line and map them to image rows.

    The sqlite database next to `blastfile` (same name, extension replaced
    by ``.sqlite``) supplies the absolute offsets and the non-HSP features
    used for masking.  Steps:

      1. load the blast and keep rows whose 'eval' is below both `qeval`
         and `seval`;
      2. drop hits that are not entirely before or entirely after each
         non-HSP feature read from image_data;
      3. up to four times, fit a line through (qstart, sstart) and drop
         points lying more than 20% of the max sstart above it, stopping
         once r > 0.8 or nothing more is removed;
      4. look the surviving hits up in image_data by absolute query
         coordinates.

    blastfile -- path to the blast output.
    qeval, seval -- e-value cutoffs; both are applied to the same 'eval'
                    column.
    mask   -- when 'query', only features with image_id = 1 are used for
              masking.  Other values select every non-HSP feature, but the
              assert below then rejects subject features.
    as_str -- not used by this function; kept for interface compatibility.

    Returns None when no hit survives masking.  Otherwise returns a list of
    two-element lists ``[query_row, subject_row]``: query_row is
    (xmin, ymin, xmax, ymax, id) for the query hit and subject_row is the
    same columns for the row whose id equals the query row's pair_id (None
    if that row does not exist).

    Relies on module-level names not visible in this block: `plot`,
    `pylab`, `numexpr`, `linregress`, `blast_array` and `sqlite3`.
    """
    sqlite_file = blastfile[:blastfile.rfind(".")] + ".sqlite"
    db = sqlite3.connect(sqlite_file)
    try:
        cur = db.cursor()

        # need these to convert the absolute coords in sqlite to match
        # the local positions in the fresh blast
        qmin, smin = [
            x[0]
            for x in cur.execute('SELECT bpmin FROM image_info ORDER BY id')
        ]

        # so we mask the new blast with anything that's NOT an HSP
        sql = "SELECT image_id, bpmin, bpmax FROM image_data WHERE type != 'HSP'"
        if mask == 'query':
            sql += ' AND image_id = 1'

        b = blast_array(blastfile, dopickle=False, best_hit=0, maxkeep=99999)
        if plot:
            pylab.plot(b['qstart'], b['sstart'], "kx")

        b = b[(b['eval'] < qeval) & (b['eval'] < seval)]
        if plot:
            pylab.plot(b['qstart'], b['sstart'], "ro")

        # TODO: remove stuff that's way off the diagonal?
        # NOTE: numexpr.evaluate reads the local names used inside its
        # expression strings, so those locals must keep these exact names.
        for row in cur.execute(sql).fetchall():
            if row[0] == 1:
                start, stop, lmin = 'qstart', 'qstop', qmin
            else:
                start, stop, lmin = 'sstart', 'sstop', smin
            # for now, always only using query
            assert (start, stop) == ('qstart', 'qstop')

            cds_start, cds_stop = row[1] - lmin + 1, row[2] - lmin + 1
            bstart, bstop = b[start], b[stop]
            # keep only hits entirely before or entirely after the feature
            b = b[numexpr.evaluate(
                "(((bstart < cds_start) & (bstop < cds_start)) | ((bstop > cds_stop ) & (bstart > cds_stop)))"
            )]

        if not b.shape[0]:
            return None

        delta = 0.2 * b['sstart'].max()
        # here, try to find a sort of line, and keep removing outliers to
        # only get linear cnss
        for _ in range(4):
            slope, intercept, r, _zy, _zz = linregress(b['qstart'], b['sstart'])
            if r > 0.8:
                break
            bqstart = b['qstart']
            expected = numexpr.evaluate('intercept + slope * bqstart')
            bsstart = b['sstart']
            s = b.shape[0]
            # NOTE(review): one-sided test -- only points more than `delta`
            # above the fitted line are dropped; confirm that is intended.
            b = b[numexpr.evaluate('bsstart - expected < delta')]
            if s == b.shape[0]:
                break  # not removing anything.

        if plot:
            pylab.plot(b['qstart'], b['sstart'], "bo")
            pylab.savefig('/var/www/ms_tmp/d.png')

        cnss = []
        for local_start, local_stop in zip(b['qstart'], b['qstop']):
            # back to the absolute coordinates stored in sqlite
            qstart = int(local_start) + qmin - 1
            qstop = int(local_stop) + qmin - 1
            qres = cur.execute(
                'SELECT xmin, ymin, xmax, ymax, id, pair_id FROM image_data WHERE image_id = 1 AND bpmin = ? AND bpmax = ?',
                (qstart, qstop)).fetchone()
            # fetchone() gives None when no row matches; check this before
            # slicing (the original sliced first and raised TypeError).
            if not qres:
                continue
            # the last column (pair_id) points at the subject-side row
            sres = cur.execute(
                'SELECT xmin, ymin, xmax, ymax, id FROM image_data WHERE id = ?',
                (qres[-1], )).fetchone()
            cnss.append([qres[:-1], sres])
        return cnss
    finally:
        # all rows are already fetched, so the connection can be released
        db.close()
import sys

# make modules in the current directory importable; this has to run before
# blast_misc is imported
sys.path.insert(0, ".")

import blast_misc
import time
import operator

# Smoke-test script: load a blast file into an array and show it.
blast_file = 'data/t.blast'
b = blast_misc.blast_array(
    blast_file,
    best_hit=0,
    maxkeep=999999,
    dopickle=0,
)

# single-argument parenthesised form prints the same under the Python 2
# print statement as the bare `print b`
print(b)
print(b.shape)