Exemplo n.º 1
0
def make_eval_func_by_mapping_pickle(mapping_pickle_filename):
	"""
	Build an evaluator that maps a scan hit back to the family it overlaps.

	If the scanning fasta file was just one large concatenated chromosome,
	then we need this mapping pickle, which is a list of
	(0-based start, 0-based end, fam) tuples. The list is searched with
	bisect, so it has to be sorted.

	NOTE: the file is unpickled as-is. Unpickling can execute arbitrary code,
	so only pass mapping files that come from a trusted source.

	Returns a function eval_func(id, (best, loc_start, loc_end)) which returns
	the family with the largest overlap with the hit when that overlap covers
	at least half of the hit, and 'random___000' otherwise (i.e. a scan hit
	to a non-ncRNA region).
	"""
	# binary mode so that every pickle protocol can be read
	with open(mapping_pickle_filename, 'rb') as handle:
		mapping = load(handle)

	def eval_func(id, hit):
		# hit is the (best, loc_start, loc_end) tuple; only the coordinates are used
		_best, loc_start, loc_end = hit
		if loc_start > loc_end:
			# swap coords if hit is on minus strand
			loc_start, loc_end = loc_end, loc_start
		hit_len = loc_end - loc_start + 1

		ii = bisect(mapping, (loc_start, loc_end))
		# family -> largest overlap seen between the hit and one of its entries
		overlap_by_fam = defaultdict(int)

		def _tally(entry):
			fam = entry[2]
			overlap_by_fam[fam] = max(overlap_by_fam[fam], calc_overlap(loc_start, loc_end, entry[0], entry[1]))

		# go <--- way, down to and INCLUDING index 0
		j = min(ii, len(mapping) - 1)
		while j >= 0 and mapping[j][1] >= loc_start:
			_tally(mapping[j])
			j -= 1
		# go ---> way
		j = ii + 1
		while j < len(mapping) and mapping[j][0] <= loc_end:
			_tally(mapping[j])
			j += 1

		if not overlap_by_fam:
			# nothing in the mapping is anywhere near this hit
			return 'random___000'

		# take the majority family and return it (first one wins on ties)
		top = max(overlap_by_fam.items(), key=itemgetter(1))
		sys.stderr.write("best is %s  out of whole length %s\n" % (top, hit_len))
		if top[1] >= .5 * hit_len:
			return top[0]
		return 'random___000'

	return eval_func
def evaluate_blast_graph_count_nucleotides(graph_or_filename,hit_ratio=None,ignore_prefix=('shuffled','random')):
	"""
	Compute per-family sensitivity/specificity lists from a BLAST hit graph.

	Similar to evaluate_blast_graph, except that the real ncRNAs (in the DB,
	not query) are embedded with flanking regions, and the IDs should be in
	format <family>_<DB id>_<acc>/<embedded_start>-<embedded_end>

	If <hit_ratio> is None, then for each node N,
	  sensitivity = (# of real ncRNA-neighbor nts) / (# of real ncRNA nts)
	  specificity = (# of real ncRNA-neighbor nts) / (# of neighbor nts)

	If <hit_ratio> is defined, ex: 0.8, then for each node N,
	  a neighbor node M is a hit if the # of hit on M is >= <hit_ratio>*<M's ncRNA len>

	NOTE: for this kind of blast output, the INPUT should be seq IDs like
	      <family>_<db_id>, which means they are real ncRNAs with NO padding,
	      and the DB can either be random/shuffled seqIDs
	      or <family>_<db_id>_<acc>/<embedded_start>-<embedded_end>

	graph_or_filename -- an XGraph, or anything else, which is handed to
	                     read_gpickle to load the graph
	hit_ratio         -- None, or a value convertible to float
	ignore_prefix     -- node-ID prefixes that mark nodes as not real ncRNAs

	Returns (None, sens_by_family, spec_by_family); both dicts map a family
	to a list with one entry per query node of that family.
	"""
	rex = re.compile(r'(\S+)_(\d+)_(\S+)')
	rex_real = re.compile(r'(\S+)_(\d+)')
	from miscncRNA import get_ncRNA_info

	# str.startswith accepts a tuple of prefixes, so test them all in one call
	prefixes = tuple(ignore_prefix)

	if hit_ratio is not None:
		hit_ratio = float(hit_ratio)

	if isinstance(graph_or_filename, XGraph):
		X = graph_or_filename
	else:
		X = read_gpickle(graph_or_filename)

	total_nt_by_family = defaultdict(int)
	# number of query nodes evaluated per family (denominator in hit_ratio mode)
	total_queries_by_family = defaultdict(int)
	spec_by_family = defaultdict(list)
	sens_by_family = defaultdict(list)
	for n in X.nodes_iter():
		if n.startswith(prefixes): continue
		if n.count('_') > 2: continue
		m = rex_real.match(n)
		if m is None: continue
		family,query_db_id = m.group(1),m.group(2)
		# db_id -> hit regions on real ncRNA neighbors
		tmp_true = defaultdict(IntervalSet)
		# neighbor id -> hit regions on ignored (not real ncRNA) neighbors
		tmp_false = defaultdict(IntervalSet)

		# the query nodes must be <family>_<db_id> (i.e. no padding)
		info = get_ncRNA_info(query_db_id)
		sys.stderr.write("%s\n" % (n,))
		total_nt_by_family[family] += info['end']-info['start']+1
		total_queries_by_family[family] += 1
		if X.degree(n) == 0: # has 0 neighbors
			sens_by_family[family].append(0)
			spec_by_family[family].append(0)
			continue
		# e is in format (local_start,local_end,score)
		for (myself,neighbor,e) in X.edges_iter(n):
			if neighbor.startswith(prefixes):
				# not a real ncRNA
				tmp_false[neighbor].add(Interval(e[0],e[1]))
			else:
				hit = rex.match(neighbor)
				duncare,db_id,blob = hit.group(1),hit.group(2),hit.group(3)
				if db_id == query_db_id: continue # it's a self vs self-embedded hit, ignore
				# presumably converts the local hit coords using the <acc>/<start>-<end> blob -- see parsed_accID
				(acc,duncare),hit_start,hit_end,hit_strand = parsed_accID(blob,True,e[0],e[1])
				tmp_true[db_id].add(Interval(hit_start,hit_end))

		tp,fp = (0,0)
		for db_id,regions in tmp_true.items():
			info = get_ncRNA_info(db_id)
			ncRNA_len = info['end']-info['start']+1
			for x in regions:
				c = calc_overlap(info['start'],info['end'],x.lower_bound,x.upper_bound)
				if hit_ratio is None:
					# nucleotide mode: overlap is true, the rest of the region is false
					tp += c
					fp += (x.upper_bound-x.lower_bound+1) - c
				elif c >= hit_ratio*ncRNA_len:
					tp += 1
				else:
					fp += 1
		# NOTE(review): regions on ignored neighbors are counted in nucleotides
		# even in hit_ratio mode, where tp/fp above are hit counts -- confirm
		for regions in tmp_false.values():
			for x in regions: fp += x.upper_bound-x.lower_bound+1

		sys.stderr.write("%s %s\n" % (tp,fp))
		if tp+fp == 0:
			sens_by_family[family].append(0)
			spec_by_family[family].append(0)
		else:
			sens_by_family[family].append(tp) # NOTE: it's raw count!!!
			spec_by_family[family].append(tp*1./(tp+fp))

	# turn the raw sensitivity counts into ratios
	for k in sens_by_family:
		if hit_ratio is None:
			denom = total_nt_by_family[k]
		else:
			# NOTE(review): the original took len() of an int here and always
			# raised TypeError; dividing by the number of query ncRNAs of the
			# family is the count-mode analog of the nt total -- confirm
			denom = total_queries_by_family[k]
		sens_by_family[k] = [x*1./denom for x in sens_by_family[k]]
	return (None,sens_by_family,spec_by_family)