def dbGCperc(inFileName):
    """
    Print the G+C fraction of every sequence in a file, then the overall one.

    Marked as deprecated in the original source.

    For each record read through ``Bioseq.read`` the single-letter counts
    are obtained with ``occ_word(1)``, and the ratio
    (count of "G" + count of "C") / total is printed. After the last record
    a ``Total:`` line gives the pooled ratio and the pooled G+C count.

    @param inFileName: path of the sequence file to scan
        (presumably FASTA -- whatever ``Bioseq.read`` accepts)
    @return: None; results are written to standard output
    """
    seq = Bioseq()
    seq_index = 0
    gc_total = 0
    letters_total = 0

    # The context manager closes the file even if a Bioseq call raises.
    with open(inFileName) as file_db:
        while True:
            seq.read(file_db)
            # Bioseq.read leaves `sequence` at None once the file is exhausted.
            if seq.sequence is None:
                break
            seq_index += 1

            # occ_word(1) is unpacked as (per-letter counts, number of letters).
            occ, nb = seq.occ_word(1)
            # Only the upper-case keys "G" and "C" are counted.
            gc_count = occ.get("G", 0) + occ.get("C", 0)

            # A record with no counted letter has no defined ratio: skip its
            # line but still count the record.
            if nb != 0:
                # Single-argument print: same output whether `print` is the
                # Python 2 statement or the print function.
                print("sequence # %s = [ %s ...] %s"
                      % (seq_index, seq.header[0:20], float(gc_count) / nb))

            gc_total += gc_count
            letters_total += nb

    # An empty file (or only empty records) used to raise ZeroDivisionError
    # here; report a ratio of 0.0 instead.
    if letters_total != 0:
        overall = float(gc_total) / letters_total
    else:
        overall = 0.0
    print("Total: %s %s" % (overall, gc_total))
def dbWord(inFileName, wordsize):
    """
    Print the relative frequency of every word of a given size in a file.

    Marked as deprecated in the original source.

    Each record read through ``Bioseq.read`` is announced on standard
    output (index, length, first 40 header characters). Its word counts,
    obtained with ``occ_word(wordsize)``, are pooled over the whole file.
    The pooled counts are then divided by the pooled number of words and
    printed as ``WORD = frequency``, most frequent first, with ties broken
    by ascending word.

    @param inFileName: path of the sequence file to scan
        (presumably FASTA -- whatever ``Bioseq.read`` accepts)
    @param wordsize: word length passed to ``Bioseq.occ_word``
    @return: None; results are written to standard output
    """
    seq = Bioseq()
    seq_index = 0
    nb_word = 0
    pooled = {}

    # The context manager closes the file even if a Bioseq call raises.
    with open(inFileName) as file_db:
        while True:
            seq.read(file_db)
            # Bioseq.read leaves `sequence` at None once the file is exhausted.
            if seq.sequence is None:
                break
            seq_index += 1
            # Single-argument print: same output whether `print` is the
            # Python 2 statement or the print function.
            print("sequence # %s = %s [ %s ...]"
                  % (seq_index, seq.getLength(), seq.header[0:40]))

            # occ_word is unpacked as (per-word counts, number of words).
            occ, nb = seq.occ_word(wordsize)
            nb_word += nb
            for word in occ.keys():
                # dict.get is one hash lookup; the former membership test on
                # .keys() scanned a list for every word under Python 2.
                pooled[word] = pooled.get(word, 0) + occ[word]

    frequencies = [(float(pooled[word]) / nb_word, word) for word in pooled]
    # Descending frequency, then ascending word: the order the original got
    # by sorting (-frequency, word) tuples.
    frequencies.sort(key=lambda item: (-item[0], item[1]))
    for freq, word in frequencies:
        print("%s = %s" % (word, freq))
def dbRelEntropy(inFileName, wordsize):
    """
    Print and return the relative entropy of each sequence in a file.

    Marked as deprecated in the original source.

    The file is read twice:

    1. Word counts from ``occ_word(wordsize)`` are pooled over all records
       and turned into reference frequencies, (count + 1) / sumlen, where
       sumlen is the sum of (sequence length - wordsize).
    2. Every record is scored with ``Bioseq.rel_entropy`` against those
       frequencies, fed to a ``Stat`` accumulator, and printed.

    The scores are then printed again in ascending order, followed by
    ``Stat.string()``.

    @param inFileName: path of the sequence file to scan
        (presumably FASTA -- whatever ``Bioseq.read`` accepts)
    @param wordsize: word length passed to ``Bioseq.occ_word``
    @return: list of (entropy, sequence index, header) tuples, sorted
        ascending
    """
    seq = Bioseq()

    # Pass 1: pool the word counts of the whole file.
    refocc = {}
    sumlen = 0
    # The context manager closes the file even if a Bioseq call raises.
    with open(inFileName) as file_db:
        while True:
            seq.read(file_db)
            # Bioseq.read leaves `sequence` at None once the file is exhausted.
            if seq.sequence is None:
                break
            sumlen += seq.getLength() - wordsize
            # occ_word yields a (counts, total) pair, as the sibling
            # functions unpack it; the whole pair used to be kept here, so
            # the later .keys() calls ran on the pair, not the counts.
            occ, _ = seq.occ_word(wordsize)
            # Always merge into our own dict rather than adopting the first
            # sequence's mapping and mutating it afterwards.
            for word in occ.keys():
                refocc[word] = refocc.get(word, 0) + occ[word]

    # NOTE(review): sumlen adds (length - wordsize) per sequence; if that
    # total is 0 while words were counted, the division below raises
    # ZeroDivisionError -- confirm whether (length - wordsize + 1) was meant.
    reffreq = {}
    for word in refocc:
        reffreq[word] = float(refocc[word] + 1) / sumlen

    # Pass 2: score every sequence against the pooled frequencies.
    vec_len = []
    stat = Stat()
    numseq = 0
    with open(inFileName) as file_db:
        while True:
            seq.read(file_db)
            if seq.sequence is None:
                break
            entropy = seq.rel_entropy(reffreq)
            stat.add(entropy)
            numseq += 1
            # Single-argument print: same output whether `print` is the
            # Python 2 statement or the print function.
            print("sequence # %s = %s [ %s ...] entropy %s"
                  % (numseq, seq.getLength(), seq.header[0:40], entropy))
            vec_len.append((entropy, numseq, seq.header))

    # Tuples sort by entropy first, then by sequence index.
    vec_len.sort()
    for entropy, index, header in vec_len:
        print("H= %s => # %s %s" % (entropy, index, header))

    print(stat.string())

    return vec_len