def eval_original_fna(fasta_filename):
    """
    Evaluate the sequences of a fasta file against the known ncRNA families.

    Every header line (a line starting with '>') is parsed with parsed_accID
    and looked up with get_ribo1; the hits are tallied per family and the
    plurality family is reported.

    Returns a 4-tuple
        (ncRNA family, # of seqs belonging to the family, clique size,
         set of ncRNA ids hit for that family)
    where clique size is the number of header lines in the file.
    The family is determined by plurality and can be None.
    A file without any header line yields (None, 0, 0, set()).

    HACK: when the plurality family is None but some other family was hit
    too, the runner-up family is reported instead, suffixed with '-' if it
    covers at least half of the clique, or with '--' if it has at least
    3 members.
    """
    tally_by_family = defaultdict(int)
    ids_hit_by_family = defaultdict(set)
    clique_size = 0

    # Read the headers directly instead of piping through a shell `grep`:
    # no quoting problems with odd filenames and the handle is always closed.
    with open(fasta_filename) as handle:
        for line in handle:
            if not line.startswith('>'):
                continue
            seq_id = line.strip()[1:]
            clique_size += 1
            (acc, junk), strand, start, end = parsed_accID(seq_id, True)
            ncRNA_id, ncRNA_family = get_ribo1(acc, start, end)
            tally_by_family[ncRNA_family] += 1
            ids_hit_by_family[ncRNA_family].add(ncRNA_id)

    if not tally_by_family:
        # nothing to rank; avoid indexing into an empty list below
        return None, 0, clique_size, set()

    # ascending by count, so the plurality family is the last entry
    ranked = sorted(tally_by_family.items(), key=itemgetter(1))
    fam, count = ranked[-1]

    # HACK HERE!!! prefer a sufficiently large runner-up over "no family"
    if fam is None and len(ranked) > 1:
        lesser_fam, lesser_count = ranked[-2]
        if lesser_count >= 0.5 * clique_size:
            return (lesser_fam + '-', lesser_count, clique_size,
                    ids_hit_by_family[lesser_fam])
        if lesser_count >= 3:
            return (lesser_fam + '--', lesser_count, clique_size,
                    ids_hit_by_family[lesser_fam])

    return fam, count, clique_size, ids_hit_by_family[fam]
def eval_clique(Q, cursor):
    """
    Given Q which is a clique containing node indices,
    look each node up on the db using cursor.

    For every index one row (id, start, end) is fetched, parsed with
    parsed_accID and looked up with get_ribo1; hits are tallied per family.

    Like eval_original_fna, returns:
        (fam, # of seqs belonging to fam, clique_size, fam ids hit)
    where clique_size is len(Q) and fam can be None.
    An empty clique yields (None, 0, 0, set()).

    HACK: when the plurality family is None but some other family was hit
    too, the runner-up family is reported instead, suffixed with '-' if it
    covers at least half of the clique, or with '--' if it has at least
    3 members.
    """
    # NOTE(review): the node index is interpolated straight into the SQL
    # text; switch to a bound parameter once the driver's paramstyle is
    # confirmed.
    FETCH_SQL = ("SELECT n.id,s.start,s.end "
                 "FROM sets_for_nodes s "
                 "LEFT JOIN nodes_to_index AS n "
                 "ON (s.nodes_ind=n.ind) WHERE i={i}")

    hit_count = defaultdict(int)  # fam ---> hit count
    hit_ids = defaultdict(set)    # fam ---> set of ids hit
    for i in Q:
        cursor.execute(FETCH_SQL.format(i=i))
        _id, _loc_start, _loc_end = cursor.fetchone()
        (acc, junk), strand, start, end = parsed_accID(
            _id, True, _loc_start, _loc_end)
        ribo_id, fam = get_ribo1(acc, start, end)
        hit_count[fam] += 1
        hit_ids[fam].add(ribo_id)

    clique_size = len(Q)
    if not hit_count:
        # nothing to rank; avoid indexing into an empty list below
        return None, 0, clique_size, set()

    # descending by count, so the plurality family comes first
    ranked = sorted(hit_count.items(), key=itemgetter(1), reverse=True)
    fam, count = ranked[0]

    # HACK: prefer a sufficiently large runner-up over "no family"
    if fam is None and len(ranked) > 1:
        lesser_fam, lesser_count = ranked[1]
        if lesser_count >= 0.5 * clique_size:
            return lesser_fam + '-', lesser_count, clique_size, hit_ids[lesser_fam]
        if lesser_count >= 3:
            return lesser_fam + '--', lesser_count, clique_size, hit_ids[lesser_fam]

    return fam, count, clique_size, hit_ids[fam]
def read_cmfinder_motif(motif_filename):
    """
    Reads a motif file (outputted by CMfinder, stockholm format)

    The first two lines are skipped. Each following '#=GS <id> WT ...'
    line counts as one motif member, whose id is parsed with parsed_accID
    and looked up with get_ribo1. Reading stops at the first
    '#=GS <id> DE ...' line. Blank lines and lines with fewer than three
    whitespace-separated fields are ignored.

    Returns (ncRNA family,
             # of motif members belonging to the family,
             # of motif members)
    ncRNA family is determined by plurality.
    family is None if plurality are not ncRNAs; a motif without any member
    yields (None, 0, 0).

    Progress/debug messages are written to stderr.
    """
    tally_by_family = defaultdict(int)
    motif_size = 0

    with open(motif_filename) as f:
        # skip the two leading lines (presumably the stockholm header)
        f.readline()
        f.readline()
        for line in f:
            fields = line.strip().split(None, 2)
            if len(fields) < 3:
                # blank or short line (e.g. '//'): nothing to look at
                continue
            feature, text, annot = fields
            if feature != '#=GS':
                continue
            if annot.startswith('DE'):
                break
            if annot.startswith('WT'):
                motif_size += 1
                sys.stderr.write("looking up id {0}\n".format(text))
                (acc, junk), strand, start, end = parsed_accID(text, True)
                ncRNA_id, ncRNA_family = get_ribo1(acc, start, end)
                tally_by_family[ncRNA_family] += 1

    sys.stderr.write("motif filename is {0}\n".format(motif_filename))
    sys.stderr.write("tally by family is {0}\n".format(tally_by_family))

    if not tally_by_family:
        # no member was found; avoid indexing into an empty list below
        return None, 0, motif_size

    # ascending by count, so the plurality family is the last entry
    ranked = sorted(tally_by_family.items(), key=itemgetter(1))
    fam, count = ranked[-1]
    return fam, count, motif_size
def check_hit(i, cursor=None):
    """
    Given <i>, look up its row on the db, piece together its accession #,
    start and end, and call get_ribo1, which will return
    (<ncRNA_id>,<ncRNA_family>) if it is a hit or
    (None,None) if not a hit

    If <cursor> is None, a connection is opened with CONN_FUNC just for
    this call and closed again before returning (also when the lookup
    fails). A cursor passed in by the caller is used as-is and its
    connection is left open; rows are read by column name.
    """
    from miscRibo import get_ribo1

    own_conn = None  # set only when this call opened the connection itself
    try:
        if cursor is None:
            own_conn = CONN_FUNC()
            cursor = get_dict_cursor(own_conn)

        # NOTE(review): <i> is interpolated straight into the SQL text;
        # switch to a bound parameter once the driver's paramstyle is
        # confirmed.
        cursor.execute("SELECT n.id,s.start,s.end "
                       "FROM sets_for_nodes AS s "
                       "LEFT JOIN nodes_to_index AS n "
                       "ON (s.nodes_ind=n.ind) WHERE i={0}".format(i))
        r = cursor.fetchone()
        (acc, junk), strand, start, end = parsed_accID(
            r['id'], True, r['start'], r['end'])
    finally:
        # release the connection even if the query or the parsing raised
        if own_conn is not None:
            own_conn.close()

    return get_ribo1(acc, start, end)