def eval_clique(Q, cursor):
    """
    Given Q, a clique containing node indices, look every node up on the db
    using cursor and report the plurality ncRNA family of the clique.

    Like eval_original_fna, returns:
       (fam, # of seqs belonging to fam, clique_size, fam ids hit)

    fam is the family (as reported by get_ribo1) with the highest hit count
    and can be None.

    HACK: when the top family is None and at least one other family was hit,
    the runner-up family is reported instead if it
       - covers at least half of the clique (family name suffixed with '-'), or
       - has at least 3 members            (family name suffixed with '--').

    An empty clique returns (None, 0, 0, set()).
    Raises ValueError if the query returns no row for one of the node indices.
    """
    FETCH_SQL = (
        "SELECT n.id,s.start,s.end "
        "FROM sets_for_nodes s "
        "LEFT JOIN nodes_to_index AS n "
        "ON (s.nodes_ind=n.ind) WHERE i={i}"
    )

    clique_size = len(Q)
    if clique_size == 0:
        # nothing to tally; indexing the ranked list below would raise IndexError
        return None, 0, 0, set()

    tally_by_family = defaultdict(int)    # fam ---> hit count
    ids_hit_by_family = defaultdict(set)  # fam ---> set of ids hit
    for i in Q:
        cursor.execute(FETCH_SQL.format(i=i))
        row = cursor.fetchone()
        if row is None:
            # fail with a readable message rather than "NoneType is not iterable"
            raise ValueError("no sets_for_nodes row found for node index %r" % (i,))
        _id, _loc_start, _loc_end = row
        (acc, junk), strand, start, end = parsed_accID(_id, True, _loc_start, _loc_end)
        ncRNA_id, fam = get_ribo1(acc, start, end)
        tally_by_family[fam] += 1
        ids_hit_by_family[fam].add(ncRNA_id)

    # most frequent family first
    ranked = sorted(tally_by_family.items(), key=itemgetter(1), reverse=True)
    fam, count = ranked[0]
    # HACK
    if fam is None and len(ranked) > 1:
        runner_up, runner_up_count = ranked[1]
        if runner_up_count >= 0.5 * clique_size:
            return runner_up + '-', runner_up_count, clique_size, ids_hit_by_family[runner_up]
        elif runner_up_count >= 3:
            return runner_up + '--', runner_up_count, clique_size, ids_hit_by_family[runner_up]
    return fam, count, clique_size, ids_hit_by_family[fam]
def eval_original_fna(fasta_filename):
    """
    Reads a fasta file and returns
       (ncRNA family, # of seqs belonging to the family, clique size, ids hit for the family)

    Every header line (a line starting with '>') is one clique member; its id
    is parsed with parsed_accID and looked up with get_ribo1.

    ncRNA family is determined by plurality, which can be None.

    HACK: when the plurality family is None and at least one other family was
    hit, the runner-up family is reported instead if it
       - covers at least half of the clique (family name suffixed with '-'), or
       - has at least 3 members            (family name suffixed with '--').

    A file without any header line returns (None, 0, 0, set()).
    Raises IOError if the file cannot be opened.
    """
    tally_by_family = defaultdict(int)
    ids_hit_by_family = defaultdict(set)
    clique_size = 0
    # Scan the file directly instead of shelling out to grep: the filename is
    # never interpreted by a shell and the handle is closed deterministically.
    with open(fasta_filename) as handle:
        for line in handle:
            if not line.startswith('>'):
                continue
            seq_id = line.strip()[1:]
            clique_size += 1
            (acc, junk), strand, start, end = parsed_accID(seq_id, True)
            ncRNA_id, ncRNA_family = get_ribo1(acc, start, end)
            tally_by_family[ncRNA_family] += 1
            ids_hit_by_family[ncRNA_family].add(ncRNA_id)

    if clique_size == 0:
        # no headers: nothing to rank
        return None, 0, 0, set()

    # ascending by count, so the plurality family is last
    ranked = sorted(tally_by_family.items(), key=itemgetter(1))
    fam, count = ranked[-1]
    # HACK HERE!!!
    if fam is None and len(ranked) > 1:
        runner_up, runner_up_count = ranked[-2]
        if runner_up_count >= 0.5 * clique_size:
            return runner_up + '-', runner_up_count, clique_size, ids_hit_by_family[runner_up]
        elif runner_up_count >= 3:
            return runner_up + '--', runner_up_count, clique_size, ids_hit_by_family[runner_up]
    return fam, count, clique_size, ids_hit_by_family[fam]
def read_cmfinder_motif(motif_filename):
    """
    Reads a motif file (outputted by CMfinder, stockholm format)

    Returns (ncRNA family, # of motif members belonging to the family, # of motif members)

    Motif members are the '#=GS <id> WT ...' lines; each <id> is parsed with
    parsed_accID and looked up with get_ribo1. Reading stops at the first
    '#=GS <id> DE ...' line.

    ncRNA family is determined by plurality.
    family is None if plurality are not ncRNAs.
    A motif without any member returns (None, 0, 0).
    """
    tally_by_family = defaultdict(int)
    motif_size = 0
    with open(motif_filename) as f:
        # the first two lines are discarded unconditionally
        # (presumably the stockholm header and a blank line -- TODO confirm)
        f.readline()
        f.readline()
        for line in f:
            fields = line.strip().split(None, 2)
            if len(fields) < 3:
                # blank lines and lines with fewer than three fields carry no
                # '#=GS' annotation; skip them instead of failing on the unpack
                continue
            feature, text, annot = fields
            if feature != '#=GS':
                continue
            if annot.startswith('DE'):
                break
            if annot.startswith('WT'):
                motif_size += 1
                print >> sys.stderr, "looking up id", text
                (acc, junk), strand, start, end = parsed_accID(text, True)
                ncRNA_id, ncRNA_family = get_ribo1(acc, start, end)
                tally_by_family[ncRNA_family] += 1

    print >> sys.stderr, "motif filename is", motif_filename
    print >> sys.stderr, "tally by family is", tally_by_family
    if not tally_by_family:
        # no members were found; motif_size is 0 here
        return None, 0, motif_size
    # ascending by count, so the plurality family is last
    ranked = sorted(tally_by_family.items(), key=itemgetter(1))
    fam, count = ranked[-1]
    return fam, count, motif_size
def get_tree_info(id_map):
    """
    Annotate every (label, seqid) pair of id_map with its species name and the
    closest downstream gene.

    For each seqid the record is fetched with NCBI.get_acc_gb and the gene with
    NCBI.get_closest_downstream_gene, searching from the end coordinate on
    strand +1 and from the start coordinate otherwise.

    Returns (taxon_map, gene_seq_map, species_map) where
       taxon_map    : label --> '<label>-<species>::<description>', or 'Unknown'
                      when no downstream gene was found. <description> is the
                      first 'product', else 'note', else 'gene' qualifier
                      value (empty if the gene has none of them)
       gene_seq_map : seqid --> the gene object itself, or None
       species_map  : accession --> species name
    """
    taxon_map = {}
    gene_seq_map = {}
    species_map = {}
    for label, seqid in id_map.iteritems():
        print >> sys.stderr, "processing ", label, seqid
        idp = miscParses.parsed_accID(seqid)
        # now we annotate it with (a) species name; (b) closest downstream gene description
        r = NCBI.get_acc_gb(idp.acc)
        species_name = r.annotations['organism']
        species_map[idp.acc] = species_name
        anchor = idp.end if idp.strand == +1 else idp.start
        gene = NCBI.get_closest_downstream_gene(r, anchor, idp.strand)
        if gene is None:
            taxon_map[label] = 'Unknown'
            gene_seq_map[seqid] = None
            continue
        # TODO: put this into a phyloXML format!!
        # for now....let's just do normal newick
        gene_seq_map[seqid] = gene
        newname = label + '-' + species_name + '::'
        # Qualifier values are indexed with [0] in every branch; the 'gene'
        # branch used to append the whole value, which fails for a list.
        for key in ('product', 'note', 'gene'):
            if key in gene.qualifiers:
                newname += gene.qualifiers[key][0]
                break
        taxon_map[label] = newname
    return taxon_map, gene_seq_map, species_map
def get_fam_count(fasta_filename):
    """
    Count how many records of a fasta file belong to each ncRNA family.

    Every record id is parsed with parsed_accID and looked up with get_ribo1.

    Returns a defaultdict mapping family --> number of records (missing
    families read as 0).
    """
    from Bio import SeqIO
    from miscParses import parsed_accID

    total_fam_counts = defaultdict(int)
    # read the scanned fasta so we know how many ribos per family there are
    # doing this everytime may be a waste, but it ensures we're having the right counts...
    # The handle is opened in a with-block so it is closed once parsing is
    # done (or fails) instead of being left to the garbage collector.
    with open(fasta_filename) as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            print >> sys.stderr, "fasta reading....", r.id
            (acc, junk_version), strand, start, end = parsed_accID(r.id, True)
            rb_id, rb_fam = get_ribo1(acc, start, end)
            total_fam_counts[rb_fam] += 1
    return total_fam_counts
def map_tree_taxon(tree_filename, id_map):
    """
    Load a newick tree and relabel every leaf with its species name and the
    product of the closest downstream gene.

    Leaf labels are like 'T1'; id_map maps them back to <acc>/<start>-<end>.

    Returns (tree, taxon_map, gene_seq_map) where
       tree         : the loaded tree, with leaf labels rewritten in place
       taxon_map    : original label --> '<label>-<species>::<product>'
       gene_seq_map : original label --> the gene's 'translation' qualifier
    """
    relabeled = {}
    translations = {}
    tree = dendropy.Tree.get_from_path(tree_filename, 'newick')
    for leaf in tree.leaf_nodes():
        old_label = leaf.taxon.label
        parsed = miscParses.parsed_accID(id_map[old_label])
        # annotate with (a) species name; (b) closest downstream gene description
        record = NCBI.get_acc_gb(parsed.acc)
        organism = record.annotations['organism']
        # search from the end coordinate on strand +1, from the start otherwise
        anchor = parsed.end if parsed.strand == +1 else parsed.start
        downstream = NCBI.get_closest_downstream_gene(record, anchor, parsed.strand)
        # TODO: put this into a phyloXML format!!
        # for now....let's just do normal newick
        translations[old_label] = downstream.qualifiers['translation']
        new_label = old_label + '-' + organism + '::' + downstream.qualifiers['product'][0]
        relabeled[old_label] = new_label
        leaf.taxon.label = new_label
    return tree, relabeled, translations
def evaluate_blast_graph_count_nucleotides(graph_or_filename, hit_ratio=None, ignore_prefix=('shuffled', 'random')):
    """
    Similar to evaluate_blast_graph, except that the real ncRNAs (in the DB, not query)
    are embedded with flanking regions, and the IDs should be in format
       <family>_<DB id>_<acc>/<embedded_start>-<embedded_end>

    If <hit_ratio> is None, then for each node N,
       sensitivity = (# of real ncRNA-neighbor nts) / (# of real ncRNA nts)
       specificity = (# of real ncRNA-neighbor nts) / (# of neighbor nts)
    where the sensitivity denominator is the summed ncRNA length of all query
    nodes of N's family.

    If <hit_ratio> is defined, ex: 0.8, then for each node N,
       a neighbor node M is a hit if the # of hit on M is >= <hit_ratio>*<M's ncRNA len>
    Every hit region counts as 1 (true or false positive) and sensitivity is
    divided by the number of query nodes of N's family.

    NOTE: for this kind of blast output, the INPUT should be seq IDs like
       <family>_<db_id>
    which means they are real ncRNAs with NO padding and the DB can either be
    random/shuffled seqIDs or
       <family>_<db_id>_<acc>/<embedded_start>-<embedded_end>

    graph_or_filename -- an XGraph, or the filename of a pickled graph
    hit_ratio         -- None, or anything float() accepts
    ignore_prefix     -- node-name prefixes marking non-real (random/shuffled) sequences

    Returns (None, sens_by_family, spec_by_family); both map
    family --> list with one value per query node of that family.
    """
    from miscncRNA import get_ncRNA_info

    rex = re.compile(r'(\S+)_(\d+)_(\S+)')
    rex_real = re.compile(r'(\S+)_(\d+)')
    ignored = tuple(ignore_prefix)  # str.startswith accepts a tuple of prefixes
    if hit_ratio is not None:
        hit_ratio = float(hit_ratio)

    if isinstance(graph_or_filename, XGraph):
        X = graph_or_filename
    else:
        X = read_gpickle(graph_or_filename)

    total_nt_by_family = defaultdict(int)       # family ---> summed query ncRNA length
    total_queries_by_family = defaultdict(int)  # family ---> number of query nodes
    spec_by_family = defaultdict(list)
    sens_by_family = defaultdict(list)
    for n in X.nodes_iter():
        if n.startswith(ignored):
            continue
        if n.count('_') > 2:
            continue
        m = rex_real.match(n)
        if m is None:
            continue
        family, query_db_id = m.group(1), m.group(2)
        tmp_true = defaultdict(IntervalSet)   # db_id ---> hit regions on real ncRNAs
        tmp_false = defaultdict(IntervalSet)  # neighbor ---> hit regions on ignored nodes
        # the query nodes must be <family>_<db_id> (i.e. no padding)
        info = get_ncRNA_info(query_db_id)
        print >> sys.stderr, n
        total_nt_by_family[family] += info['end'] - info['start'] + 1
        total_queries_by_family[family] += 1
        if X.degree(n) == 0:  # has 0 neighbors
            sens_by_family[family].append(0)
            spec_by_family[family].append(0)
            continue
        # e is in format (local_start,local_end,score)
        for (myself, neighbor, e) in X.edges_iter(n):
            if neighbor.startswith(ignored):  # not a real ncRNA
                tmp_false[neighbor].add(Interval(e[0], e[1]))
            else:
                hit_match = rex.match(neighbor)
                duncare, db_id, blob = hit_match.group(1), hit_match.group(2), hit_match.group(3)
                if db_id == query_db_id:
                    continue  # it's a self vs self-embedded hit, ignore
                # NOTE(review): the other callers in this file unpack
                # parsed_accID(<id>, True, ...) as ((acc, version), strand, start, end),
                # while positions 1-3 are bound here as start, end, strand.
                # Left unchanged -- confirm against parsed_accID's return order.
                (acc, duncare), hit_start, hit_end, hit_strand = parsed_accID(blob, True, e[0], e[1])
                tmp_true[db_id].add(Interval(hit_start, hit_end))

        tp, fp = 0, 0
        for db_id, regions in tmp_true.iteritems():
            info = get_ncRNA_info(db_id)
            ncRNA_len = info['end'] - info['start'] + 1
            for x in regions:
                c = calc_overlap(info['start'], info['end'], x.lower_bound, x.upper_bound)
                if hit_ratio is None:
                    tp += c
                    fp += (x.upper_bound - x.lower_bound + 1) - c
                elif c >= hit_ratio * ncRNA_len:
                    tp += 1
                else:
                    fp += 1
        for some_id, regions in tmp_false.iteritems():
            for x in regions:
                if hit_ratio is None:
                    fp += x.upper_bound - x.lower_bound + 1
                else:
                    # count regions, not nucleotides, so that tp and fp share
                    # one unit when hits are being counted
                    fp += 1
        print >> sys.stderr, tp, fp
        if tp + fp == 0:
            sens_by_family[family].append(0)
            spec_by_family[family].append(0)
        else:
            sens_by_family[family].append(tp)  # NOTE: it's raw count!!! normalized below
            spec_by_family[family].append(tp * 1. / (tp + fp))

    # Turn the raw per-node counts into sensitivities.
    # The hit_ratio branch used to call len() on the integer nucleotide total,
    # which raised TypeError on every run. The query-node count is used as the
    # denominator instead.
    # NOTE(review): confirm that "number of query nodes in the family" is the
    # intended denominator for hit-counting mode.
    for k in sens_by_family:
        if hit_ratio is None:
            denominator = total_nt_by_family[k]
        else:
            denominator = total_queries_by_family[k]
        sens_by_family[k] = [x * 1. / denominator for x in sens_by_family[k]]
    return (None, sens_by_family, spec_by_family)