Exemplo n.º 1
0
def eval_original_fna(fasta_filename):
	"""
	Tally the ncRNA families hit by the sequences of a fasta file.

	Every header line (a line starting with '>') is taken as a sequence ID,
	parsed with parsed_accID and looked up with get_ribo1.

	Returns (ncRNA family, # of seqs belonging to the family, clique size,
	set of ncRNA ids hit for that family), where clique size is the number
	of header lines in the file.

	ncRNA family is determined by plurality, which can be None. When the
	plurality family is None but another family was hit as well, the
	runner-up is reported instead if it is large enough:
	  - it covers at least half of the clique: its name suffixed with '-'
	  - otherwise, it has at least 3 seqs:     its name suffixed with '--'

	A file without any header line yields (None, 0, 0, set()).
	"""
	tally_by_family = defaultdict(int)
	ids_hit_by_family = defaultdict(set)
	clique_size = 0
	# Read the headers directly instead of piping the file through a shell
	# `grep`: the filename never reaches a shell and the handle gets closed.
	with open(fasta_filename) as handle:
		for line in handle:
			if not line.startswith('>'):
				continue
			seq_id = line.strip()[1:]
			clique_size += 1
			(acc,junk),strand,start,end = parsed_accID(seq_id,True)
			ncRNA_id, ncRNA_family = get_ribo1(acc,start,end)
			tally_by_family[ncRNA_family] += 1
			ids_hit_by_family[ncRNA_family].add( ncRNA_id )

	# sorted() works on both a Python 2 list and a Python 3 dict view;
	# ascending by count, so the plurality family comes last.
	ranked = sorted(tally_by_family.items(), key=itemgetter(1))
	if not ranked:
		return None,0,clique_size,set()

	fam,count = ranked[-1]
	# HACK: prefer a sizeable runner-up family over a None plurality
	if fam is None and len(ranked) > 1:
		runner_up,runner_up_count = ranked[-2]
		if runner_up_count >= 0.5*clique_size:
			return runner_up + '-',runner_up_count,clique_size,ids_hit_by_family[runner_up]
		elif runner_up_count >= 3:
			return runner_up + '--',runner_up_count,clique_size,ids_hit_by_family[runner_up]
	return fam,count,clique_size,ids_hit_by_family[fam]
Exemplo n.º 2
0
def eval_clique(Q, cursor):
	"""
	Given Q which is a clique containing node indices, look up each
	node on the db using cursor and tally the ncRNA families hit.

	Like eval_original_fna, returns:
	(fam, # of seqs belonging to fam, clique_size, fam ids hit)

	fam is determined by plurality and can be None. When the plurality
	is None but another family was hit as well, the runner-up is
	reported instead if it is large enough:
	  - it covers at least half of the clique: its name suffixed with '-'
	  - otherwise, it has at least 3 seqs:     its name suffixed with '--'

	An empty Q yields (None, 0, 0, set()).
	"""
	FETCH_SQL = "SELECT n.id,s.start,s.end \
				FROM sets_for_nodes s \
				LEFT JOIN nodes_to_index AS n \
				ON (s.nodes_ind=n.ind) WHERE i={i}"

	clique_size = len(Q)
	hits_by_fam = defaultdict(int)   # fam ---> hit count
	ids_by_fam = defaultdict(set)    # fam ---> set of ids hit

	for i in Q:
		cursor.execute( FETCH_SQL.format(i=i) )
		# NOTE(review): the row is unpacked positionally, so a dict-style
		# cursor would not work here -- confirm what callers pass in.
		_id,_loc_start,_loc_end = cursor.fetchone()
		(acc,junk),strand,start,end = parsed_accID(_id, True, _loc_start, _loc_end)
		ncRNA_id, fam = get_ribo1( acc, start, end )
		hits_by_fam[fam] += 1
		ids_by_fam[fam].add( ncRNA_id )

	# sorted() works on both a Python 2 list and a Python 3 dict view;
	# descending by count, so the plurality family comes first.
	ranked = sorted(hits_by_fam.items(), key=itemgetter(1), reverse=True)
	if not ranked:
		return None, 0, clique_size, set()

	fam,count = ranked[0]
	# HACK: prefer a sizeable runner-up family over a None plurality
	if fam is None and len(ranked)>1:
		runner_up,runner_up_count = ranked[1]
		if runner_up_count >= 0.5*clique_size:
			return runner_up+'-',runner_up_count,clique_size,ids_by_fam[runner_up]
		elif runner_up_count >= 3:
			return runner_up+'--',runner_up_count,clique_size,ids_by_fam[runner_up]
	return fam, count, clique_size, ids_by_fam[fam]
Exemplo n.º 3
0
def read_cmfinder_motif(motif_filename):
	"""
	Reads a motif file (outputted by CMfinder, stockholm format)
	Returns (ncRNA family, # of motif members belonging to the family, # of motif members)
	ncRNA family is determined by plurality. family is None if plurality are not ncRNAs.

	Only '#=GS <id> WT...' lines count as motif members; each <id> is
	parsed with parsed_accID and looked up with get_ribo1. Reading stops
	at the first '#=GS <id> DE...' line. Lines with fewer than three
	whitespace-separated fields are ignored.

	A file without any WT line yields (None, 0, 0).
	Progress/debug messages are written to stderr.
	"""
	tally_by_family = defaultdict(int)
	motif_size = 0
	with open(motif_filename) as f:
		# the first two lines of the file are skipped unconditionally
		f.readline()
		f.readline()
		for line in f:
			fields = line.strip().split(None,2)
			if len(fields) < 3:
				# blank line, or too short to be "<feature> <text> <annot>"
				continue
			feature, text, annot = fields
			if feature != '#=GS':
				continue
			if annot.startswith('DE'):
				break
			if annot.startswith('WT'):
				motif_size += 1
				# sys.stderr.write emits the same text as the old Python 2
				# print statement and also runs on Python 3
				sys.stderr.write("looking up id {0}\n".format(text))
				(acc,junk),strand,start,end = parsed_accID(text,True)
				ncRNA_id, ncRNA_family = get_ribo1(acc,start,end)
				tally_by_family[ncRNA_family] += 1
	sys.stderr.write("motif filename is {0}\n".format(motif_filename))
	sys.stderr.write("tally by family is {0}\n".format(tally_by_family))

	# sorted() works on both a Python 2 list and a Python 3 dict view;
	# ascending by count, so the plurality family comes last.
	ranked = sorted(tally_by_family.items(), key=itemgetter(1))
	if not ranked:
		return None,0,motif_size
	fam,count = ranked[-1]
	return fam,count,motif_size
Exemplo n.º 4
0
def check_hit(i, cursor=None):
	"""
		Given <i>, fetch its id, start and end from the db, piece up its
		accession #, start, end with parsed_accID, and call get_ribo1
		which will return (<ncRNA_id>,<ncRNA_family>)
		if it is a hit or (None,None) if not a hit

		If <cursor> is None, a connection is opened with CONN_FUNC for the
		duration of the lookup and closed again before get_ribo1 is called,
		also when the lookup fails. A cursor passed in by the caller is
		used as is and left open; its rows must be addressable by column
		name ('id', 'start', 'end').
	"""
	from miscRibo import get_ribo1
	query = "SELECT n.id,s.start,s.end \
					FROM sets_for_nodes AS s \
					LEFT JOIN nodes_to_index AS n \
					ON (s.nodes_ind=n.ind) WHERE i={0}".format(i)
	conn = None  # only set when the connection is opened (and owned) here
	try:
		if cursor is None:
			conn = CONN_FUNC()
			cursor = get_dict_cursor(conn)
		cursor.execute(query)
		# fetchone() gives None when no row matches; the subscripts below
		# then raise TypeError
		r = cursor.fetchone()
		(acc,junk),strand,start,end = parsed_accID(r['id'],True,r['start'],r['end'])
	finally:
		# release our own connection even if the lookup above raised
		if conn is not None:
			conn.close()
	return get_ribo1(acc,start,end)