def __init__(self,seed_seqs, all_seqs, width = 6, verbose = ''):
    """Initialise search state from the seed sequences and the full set.

    seed_seqs -- sequences to be scanned for seeds
    all_seqs  -- stored unchanged as self.seqs
    width     -- n-mer width passed to MotifTools.top_nmers; a false value
                 (e.g. 0) skips that scan and pairs every seed sequence
                 with its position instead
    verbose   -- stored unchanged as self.verbose
    """
    global theMarkovBackground

    # Inputs, kept exactly as given.
    self.seed_seqs = seed_seqs      # Sequences to be scanned for seeds
    self.seqs = all_seqs
    self.candidates = []
    self.models = []                # Set directly or computed from seed_seqs
    self.width = width
    self.verbose = verbose

    if width:
        self.goodwmersT = MotifTools.top_nmers(self.width, self.seed_seqs, 1, "")
    else:
        # No width requested: use (sequence, index) pairs in input order.
        self.goodwmersT = [(seq, idx) for idx, seq in enumerate(self.seed_seqs)]

    # Default background and tuning constants.
    self.bgprob = {'A': 0.31, 'C': .19, 'G': .19, 'T': .31}
    self.beta = 0.001
    self.deltamin = 1e-3
    self.probes = []
    self.method = "ZOOPS"           # OOPS or ZOOPS
    self.param = {}
    self.gapflank = 0
    self.gapweight = 0.2
    self.seedbeta = 0.02
    self.joint = 1

    # A module-level Markov background, when set, overrides the defaults.
    if theMarkovBackground:
        self.bgprob = theMarkovBackground.zeroth()
    # NOTE(review): a dangling triple-quote "DELETE" marker followed this
    # method with no visible terminator; it was dropped here -- confirm
    # nothing after it relied on being commented out.
def main():
    """Print pseudocounted n-mer frequencies for the FASTA file in sys.argv[1].

    For every width from 1 to 6, each n-mer returned by permute() starts
    with a count of 1.  Every (nmer, count) pair reported by
    MotifTools.top_nmers then adds `count` to both the n-mer and its
    reverse complement.  One header line and one "nmer frequency" line per
    n-mer (sorted by n-mer) are written to stdout.
    """
    fasta_path = sys.argv[1]
    seqs = Fasta.load(fasta_path).values()

    for w in range(1, 7):
        allnmers = permute(w)
        observed = MotifTools.top_nmers(w, seqs, 'with counts', 'purge Ns')

        # Pseudocount: every enumerated n-mer starts at 1.
        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1
            total += 1

        for nmer, count in observed:
            try:
                rc = MotifTools.revcomplement(nmer)
                nmersD[nmer] += count
                nmersD[rc] += count
                total += 2 * count
            except KeyError:
                # n-mer (or its reverse complement) is not in the table.
                pass

        print("# freq in %s (total %d with pseudocounts)" % (fasta_path, total))
        for nmer in sorted(nmersD):
            print("%-7s %20.17f" % (nmer, float(nmersD[nmer]) / total))
        sys.stdout.flush()
def main():
    """Report n-mer frequencies (widths 1-6) for the FASTA file in sys.argv[1].

    Counts come from MotifTools.top_nmers and are credited to each n-mer
    and to its reverse complement, on top of a pseudocount of 1 for every
    n-mer produced by permute().  Output goes to stdout: a header line per
    width followed by the n-mers in sorted order with their frequency.
    """
    source = sys.argv[1]
    seqsD = Fasta.load(source)
    seqs = seqsD.values()

    for w in range(1, 7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w, seqs, 'with counts', 'purge Ns')

        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1            # Pseudo count
            total += 1

        for nmer, count in nmersT:
            try:
                rc = MotifTools.revcomplement(nmer)
                nmersD[nmer] = count + nmersD[nmer]
                nmersD[rc] = count + nmersD[rc]
                total += 2 * count
            except KeyError:
                # Not one of the enumerated n-mers: ignore it.
                pass

        print("# freq in %s (total %d with pseudocounts)" % (source, total))
        # Keys are unique, so sorting the items orders the lines by n-mer.
        for nmer, tally in sorted(nmersD.items()):
            print("%-7s %20.17f" % (nmer, float(tally) / total))
        sys.stdout.flush()
def info2seeds(N,infofile,probefile,species='YEAST'):
    """Build seed motifs from the best-scoring n-mers of a sequence file.

    N         -- n-mer width for MotifTools.top_nmers; a false value uses
                 the sequences from `infofile` themselves as candidates
    infofile  -- file read with Fasta.seqs()
    probefile -- file of ids read with ProbeSet.ids_from_file()
    species   -- passed to ProbeSet()

    Each purely alphabetic candidate is scored with ProbeSet.p_value
    against the ids; candidates are ordered by ascending score.  The first
    40 (candidate, score) tuples are printed, and motifs built from the
    first 20 candidates are returned as a list.
    """
    G = ProbeSet(species)
    IDs = G.ids_from_file(probefile)
    Q = EM.theMarkovBackground.zeroth()
    seqs = Fasta.seqs(infofile)

    if not N:
        nmers = seqs
    else:
        nmers = MotifTools.top_nmers(N, seqs)
        # NOTE(review): indentation was lost in the reviewed source; the
        # 1000-entry cap is assumed to apply only to computed n-mers.
        if len(nmers) > 1000:
            nmers = nmers[0:1000]

    # Fixed: this used to call len(nmers, infofile), which raises TypeError.
    print("Scoring enrichment of %d nmers from %s" % (len(nmers), infofile))
    sys.stdout.flush()

    nmers_scoresT = [(nmer, G.p_value(nmer, IDs, ''))   # '' -> not verbose
                     for nmer in nmers if nmer.isalpha()]
    # Stable ascending sort on the score only (same order as the old
    # cmp-based sort on x[1]).
    nmers_scoresT.sort(key=lambda pair: pair[1])

    models = []
    for seq, _score in nmers_scoresT[:20]:
        m = MotifTools.Motif('', Q)
        m.compute_from_text(seq, 0.1)
        models.append(m)

    for tup in nmers_scoresT[0:40]:
        print(tup)
    return models
def main(fastafile, outDirectory): # !! 1/2/09 AD added 'fastafile' var and changed 'if __name__' as way to call this from script.
    """Write pseudocounted n-mer frequencies of `fastafile` to a .freq file.

    fastafile    -- path of the FASTA file, loaded with Fasta.load()
    outDirectory -- directory receiving '<name>.freq', where <name> is the
                    part of `fastafile` after its last '/'

    For every width from 1 to 6, each n-mer from permute() starts with a
    count of 1; every (nmer, count) pair from MotifTools.top_nmers adds
    `count` to the n-mer and to its reverse complement.  The file gets one
    header line per width followed by the n-mers in sorted order.
    """
    seqsD = Fasta.load(fastafile)
    seqs = seqsD.values()
    fastaName = fastafile.split('/')[-1]
    output = []

    for w in range(1, 7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w, seqs, 'with counts', 'purge Ns')

        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1            # Pseudo count
            total += 1

        for nmer, count in nmersT:
            try:
                rc = MotifTools.revcomplement(nmer)
                nmersD[nmer] += count
                nmersD[rc] += count
                total += 2 * count
            except KeyError:
                # n-mer (or its reverse complement) is not in the table.
                pass

        # AD 02-27-09 added a '\n' to make file look right
        output.append("# freq in %s (total %d with pseudocounts)\n" % (fastaName, total))
        for nmer in sorted(nmersD):
            output.append("%-7s %20.17f\n" % (nmer, float(nmersD[nmer]) / total))

    # Write everything at once; the `with` block closes (and flushes) the
    # file, which the previous version never did.
    outPath = '%s/%s.freq' % (outDirectory, fastaName)
    with open(outPath, 'w') as outFile:
        outFile.writelines(output)
def Reduce_Nmers(Info): print 'COMPUTING Nmers ....' mseqs = ReduceInfo2seqs(Info,70, lambda L: MotifTools.top_nmers(6,L)[0:3]) print "Combining representative sequences...: " for i in range(len(mseqs)): i = i + 1 print '\t%s'%mseqs[i-1], if (i%5 == 0): print print top_seq_pairs = MotifTools.top_nmers(5,mseqs,1) total_nmers = 0 for (mner,count) in top_seq_pairs: total_nmers = total_nmers + count for (nmer,count) in top_seq_pairs[0:8]: print "RESULT: %s\t%2d (%5.2f%%) occurences: "%(nmer,count, 100*float(count)/total_nmers), for bsite in Info.query['bsites']: seq = bsite.cleantxt() (max,s1,s2) = MotifTools.compare_seqs(nmer,seq) print ' %s vs %s %4.2f correct'%(s1,s2,max)
def freq_from_seqs_old(self,seqs):
    """Fill self.F with n-mer frequencies (widths 1-5) taken from `seqs`.

    For each width, MotifTools.top_nmers supplies (nmer, count) pairs.
    The n-mers are recorded in self.nmers_by_size[width].  An n-mer equal
    to its own reverse complement gets count/total; any other n-mer gets
    half of that, stored under both the n-mer and its reverse complement.

    NOTE(review): self.highestorder is set to 4 although widths up to 5
    are tabulated -- confirm this is intended.
    """
    self.highestorder = 4
    for depth in range(1, 6):
        nmersT = MotifTools.top_nmers(depth, seqs, "TUPLES")
        self.nmers_by_size[depth] = [pair[0] for pair in nmersT]
        total = sum(count for _nmer, count in nmersT)

        for nmer, count in nmersT:
            rc = MotifTools.revcomplement(nmer)
            f = float(count) / total        # palindrome: full share
            if nmer != rc:
                f = f / 2                   # split between both strands
            self.F[nmer] = f
            self.F[rc] = f

    for depth in range(0):      # For debugging: range(0) never iterates
        total = 0
        for k in self.F.keys():
            if len(k) == depth:
                total = total + self.F[k]
                print("%s %s" % (k, self.F[k]))
        print("%s %s" % (depth, total))
def study_seqs(self,seqs):
    """Fill self.D with log2 n-mer frequencies (widths 1-5) from `seqs`.

    For each width, MotifTools.top_nmers supplies (nmer, count) pairs.
    An n-mer equal to its own reverse complement stores
    log2(count/total); any other n-mer stores log2(0.5*count/total) under
    both the n-mer and its reverse complement.  self.highestorder is set
    to 5 once all widths are processed.

    Compared with the previous version, the reverse complement is no
    longer computed (and thrown away) while summing the counts, and only
    the logarithm that is actually stored is evaluated.
    """
    log_of_2 = math.log(2)      # loop-invariant divisor for base-2 logs
    for depth in range(1, 6):
        nmersT = MotifTools.top_nmers(depth, seqs, "TUPLES")
        total = 0
        for _nmer, count in nmersT:
            total = total + count

        for nmer, count in nmersT:
            rc = MotifTools.revcomplement(nmer)
            if rc != nmer:
                # Share the count between the two strands.
                half = math.log(0.5 * float(count)/total) / log_of_2
                self.D[nmer] = half
                self.D[rc] = half
            else:
                self.D[nmer] = math.log(float(count)/total) / log_of_2

    for depth in range(0):      # Debugging aid: range(0) never iterates
        total = 0
        for k in self.D.keys():
            if len(k) == depth:
                total = total + pow(2, self.D[k])
                print("%s %s" % (k, pow(2, self.D[k])))
        print("%s %s" % (depth, total))

    self.highestorder = 5
def freq_from_seqs(self,seqs):
    """Fill self.F with pseudocounted n-mer frequencies (widths 1-6).

    For each width, every n-mer from permute() starts with a count of 1
    and a copy of that list is kept in self.nmers_by_size[width].  Each
    (nmer, count) pair from MotifTools.top_nmers adds `count` to the n-mer
    and to its reverse complement.  The resulting count divided by the
    grand total is stored in self.F under the n-mer and under its reverse
    complement.
    """
    self.highestorder = 6
    for w in range(1, 7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w, seqs, 'with counts', 'purge Ns')
        self.nmers_by_size[w] = allnmers[:]

        # Pseudo count: one per enumerated n-mer; float total keeps the
        # division below a true division.
        nmersD = dict.fromkeys(allnmers, 1)
        total = float(len(allnmers))

        for nmer, count in nmersT:
            try:
                rc = MotifTools.revcomplement(nmer)
                nmersD[nmer] += count
                nmersD[rc] += count
                total += 2 * count
            except KeyError:
                # n-mer (or its reverse complement) is not in the table.
                pass

        for nmer, tally in nmersD.items():
            rc = MotifTools.revcomplement(nmer)
            self.F[nmer] = self.F[rc] = tally / total