def make_eval_func_by_mapping_pickle(mapping_pickle_filename):
    """
    Build an evaluation function from a pickled coordinate-to-family mapping.

    If the scanning fasta file was just one large concatenated chromosome,
    then we need this mapping pickle, which is a list of
        (0-based start, 0-based end, fam)
    The list is searched with bisect and then walked outwards from the
    insertion point, so it is presumably sorted by start coordinate with
    non-overlapping regions -- TODO confirm against whatever writes the pickle.

    Returns a function that takes input arguments
        id, (best, loc_start, loc_end)
    and returns the family whose region overlaps at least half of the hit,
    or 'random___000' when no family reaches that threshold (including the
    case where the hit overlaps no mapped region at all).
    """
    # NOTE: unpickling can execute arbitrary code; only pass mapping files
    # that come from a trusted source.
    # Pickles are byte streams, so open in binary mode.
    with open(mapping_pickle_filename, 'rb') as handle:
        mapping = load(handle)

    no_family = 'random___000'

    def eval_func(id, hit):
        # <hit> is the (best, loc_start, loc_end) tuple; it is unpacked here
        # instead of in the signature so positional callers are unaffected.
        _best, loc_start, loc_end = hit
        if loc_start > loc_end:
            # swap coords if hit is on minus strand
            loc_start, loc_end = loc_end, loc_start
        hit_len = loc_end - loc_start + 1

        ii = bisect(mapping, (loc_start, loc_end))
        overlap_by_fam = defaultdict(int)

        def tally(region):
            # keep the largest single-region overlap seen for each family
            fam = region[2]
            overlap_by_fam[fam] = max(
                overlap_by_fam[fam],
                calc_overlap(loc_start, loc_end, region[0], region[1]))

        # go <--- way, down to and INCLUDING index 0 (the first region used
        # to be skipped because the loop stopped at index 1)
        j = min(ii, len(mapping) - 1)
        while j >= 0:
            if mapping[j][1] < loc_start:
                break
            tally(mapping[j])
            j -= 1

        # go ---> way
        j = ii + 1
        while j < len(mapping):
            if mapping[j][0] > loc_end:
                break
            tally(mapping[j])
            j += 1

        if not overlap_by_fam:
            # nothing in the mapping touches this hit
            return no_family

        # take the majority family and return it
        best_fam, best_overlap = max(overlap_by_fam.items(), key=itemgetter(1))
        sys.stderr.write("best is %s  out of whole length %s\n"
                         % ((best_fam, best_overlap), hit_len))
        if best_overlap >= .5 * hit_len:
            return best_fam
        return no_family

    return eval_func
def evaluate_blast_graph_count_nucleotides(graph_or_filename, hit_ratio=None, ignore_prefix=('shuffled', 'random')):
    """
    Similar to evaluate_blast_graph, except that the real ncRNAs
    (in the DB, not query) are embedded with flanking regions,
    and the IDs should be in format
        <family>_<DB id>_<acc>/<embedded_start>-<embedded_end>

    If <hit_ratio> is None, then for each node N,
        sensitivity = (# of real ncRNA-neighbor nts) / (# of real ncRNA nts)
        specificity = (# of real ncRNA-neighbor nts) / (# of neighbor nts)
    where the sensitivity denominator is the summed length of all query
    ncRNAs of N's family seen in the graph.

    If <hit_ratio> is defined, ex: 0.8, then for each node N,
    a neighbor node M is a hit if the # of hit on M is
    >= <hit_ratio>*<M's ncRNA len>.  Hits are then counted instead of
    nucleotides, and sensitivity is normalised by the number of query
    ncRNAs of N's family seen in the graph.

    NOTE: for this kind of blast output, the INPUT should be seq IDs
    like <family>_<db_id>, which means they are real ncRNAs with NO padding,
    and the DB can either be random/shuffled seqIDs or
        <family>_<db_id>_<acc>/<embedded_start>-<embedded_end>

    graph_or_filename -- an XGraph, or a filename handed to read_gpickle
    hit_ratio         -- None, or a value convertible to float
    ignore_prefix     -- node-ID prefixes that mark non-ncRNA sequences

    Returns (None, sens_by_family, spec_by_family), where both dicts map
    family --> list with one value per query node of that family.
    """
    from miscncRNA import get_ncRNA_info

    rex = re.compile(r'(\S+)_(\d+)_(\S+)')
    rex_real = re.compile(r'(\S+)_(\d+)')
    # str.startswith accepts a tuple of prefixes, so build it once
    prefixes = tuple(ignore_prefix)

    if hit_ratio is not None:
        hit_ratio = float(hit_ratio)

    if isinstance(graph_or_filename, XGraph):
        X = graph_or_filename
    else:
        X = read_gpickle(graph_or_filename)

    total_nt_by_family = defaultdict(int)
    # number of query ncRNAs per family; denominator in <hit_ratio> mode
    total_queries_by_family = defaultdict(int)
    spec_by_family = defaultdict(list)
    sens_by_family = defaultdict(list)

    for n in X.nodes_iter():
        if n.startswith(prefixes):
            continue
        if n.count('_') > 2:
            continue
        m = rex_real.match(n)
        if m is None:
            continue
        family, query_db_id = m.group(1), m.group(2)

        tmp_true = defaultdict(IntervalSet)
        tmp_false = defaultdict(IntervalSet)

        # the query nodes must be <family>_<db_id> (i.e. no padding)
        info = get_ncRNA_info(query_db_id)
        sys.stderr.write("%s\n" % (n,))
        total_nt_by_family[family] += info['end'] - info['start'] + 1
        total_queries_by_family[family] += 1

        if X.degree(n) == 0:
            # has 0 neighbors
            sens_by_family[family].append(0)
            spec_by_family[family].append(0)
            continue

        # e is in format (local_start,local_end,score)
        for (myself, neighbor, e) in X.edges_iter(n):
            if neighbor.startswith(prefixes):
                # not a real ncRNA
                tmp_false[neighbor].add(Interval(e[0], e[1]))
                continue
            m = rex.match(neighbor)
            duncare, db_id, blob = m.group(1), m.group(2), m.group(3)
            if db_id == query_db_id:
                # it's a self vs self-embedded hit, ignore
                continue
            (acc, duncare), hit_start, hit_end, hit_strand = parsed_accID(blob, True, e[0], e[1])
            tmp_true[db_id].add(Interval(hit_start, hit_end))

        tp, fp = 0, 0
        for db_id, regions in tmp_true.items():
            info = get_ncRNA_info(db_id)
            ncRNA_len = info['end'] - info['start'] + 1
            for x in regions:
                region_len = x.upper_bound - x.lower_bound + 1
                c = calc_overlap(info['start'], info['end'], x.lower_bound, x.upper_bound)
                if hit_ratio is None:
                    # nucleotide mode: split the region into on/off-target nts
                    tp += c
                    fp += region_len - c
                elif c >= hit_ratio * ncRNA_len:
                    tp += 1
                else:
                    fp += 1
        for some_id, regions in tmp_false.items():
            for x in regions:
                fp += x.upper_bound - x.lower_bound + 1

        sys.stderr.write("%s %s\n" % (tp, fp))
        if tp + fp == 0:
            sens_by_family[family].append(0)
            spec_by_family[family].append(0)
        else:
            sens_by_family[family].append(tp)  # NOTE: it's raw count!!!
            spec_by_family[family].append(tp * 1. / (tp + fp))

    # turn the raw per-query counts into sensitivities
    for k in sens_by_family:
        if hit_ratio is None:
            denom = total_nt_by_family[k]
        else:
            # NOTE(review): this used to be len(total_nt_by_family[k]), which
            # raises TypeError because that value is an int.  The number of
            # query ncRNAs in the family is used as the count-mode analogue
            # of the nucleotide total -- confirm this is the intended
            # denominator.
            denom = total_queries_by_family[k]
        sens_by_family[k] = [x * 1. / denom for x in sens_by_family[k]]

    return (None, sens_by_family, spec_by_family)