def go(self):
    """Execution function: pick candidate kmers, score them, keep the good ones.

    Coordinates the options in use, then uses TAMO.MotifMetrics to find kmers
    with good enrichment in the linked sequences.  The results are stored on
    the instance so that MDAP can access them.

    Reads:
        self.kmerRange: when truthy, items 0 and 1 are used as
            ``range(kmerRange[0], kmerRange[1])``, i.e. the upper bound
            itself is NOT scanned.
        self.kmerSize: the single kmer size used when ``self.kmerRange`` is
            falsy.
        self.linkedSeqs_seqs: handed to ``MotifMetrics.top_nmers_seqs``.
        self.linkedSeqs_ids: handed to ``self.allSeqs.p_value``.
        self.allSeqs: object providing ``p_value(kmer, ids, factor=...)``.

    Sets:
        self.output: list of ``[kmer, p_value, church, binomial]`` lists, one
            per retained kmer.
        self.toFile: list of tab-separated text lines (header line first),
            to be printed to a file in the MDAP class.
    """
    # Set metric thresholds here.
    pVal_thresh = 0.01
    church_thresh = 0.01
    binomial_thresh = 0.01

    # Placeholder recorded for metrics that are currently switched off.
    not_computed = 'NA'

    def meets_cutoff(score, threshold):
        # A switched-off metric must never qualify a kmer.  Comparing the
        # 'NA' string with a float only "worked" through CPython 2's
        # arbitrary mixed-type ordering and raises TypeError on Python 3,
        # so the placeholder is excluded explicitly.
        return score != not_computed and score <= threshold

    # # # # # # # # # # # # # #
    # ::THIN THE HERD PHASE::
    # Are we using a range or a single size?  Then make a list of all kmers
    # in range that are present in at least 10% of linkedSeqs
    # (top_nmers_seqs()) to reduce needless kmer testing in the metrics phase.
    if self.kmerRange:
        theShortList = []
        for k in range(self.kmerRange[0], self.kmerRange[1]):
            kmers = MotifMetrics.top_nmers_seqs(k, self.linkedSeqs_seqs)
            print('%s %smers found.' % (len(kmers), k))
            theShortList.extend(kmers)
    else:
        theShortList = MotifMetrics.top_nmers_seqs(self.kmerSize,
                                                   self.linkedSeqs_seqs)
        print('%s %smers found.' % (len(theShortList), self.kmerSize))

    # Convert theShortList into a list of motif objs, not just strings.
    # REASON: church routine asks the motif for its width.
    theShortList = [MotifTools.Motif_from_text(text) for text in theShortList]

    # # # # # # # # # # # # #
    # ::METRICS PHASE::
    # Using theShortList, calculate the:
    #   - HyperGeometric Enrichment (p_value)
    #   - Group Specificity Score   (church)    -- currently disabled
    #   - Over-representation       (binomial)  -- currently disabled
    #
    # Retain those kmers that receive the cut-off score or better in at
    # least one of the above metrics.
    # Each keeper is a list with indexes [kmer, p_value, church, binomial].
    keepers = []
    shortList_Len = len(theShortList)
    t1 = time()
    for index, kmer in enumerate(theShortList):
        p_value = self.allSeqs.p_value(kmer, self.linkedSeqs_ids, factor=0.75)
        church = not_computed    # self.allSeqs.church(kmer, self.linkedSeqs_ids)
        binomial = not_computed  # self.allSeqs.binomial(kmer, self.linkedSeqs_ids)

        if (meets_cutoff(p_value, pVal_thresh)
                or meets_cutoff(church, church_thresh)
                or meets_cutoff(binomial, binomial_thresh)):
            keepers.append([kmer, p_value, church, binomial])

        # NOTE(review): the original indentation was lost; this progress line
        # is assumed to run for every kmer rather than only for keepers --
        # confirm against the upstream file.
        print('%s\t%s\t--\t%s of %s' % (kmer, p_value, index + 1, shortList_Len))
    t2 = time()

    self.output = keepers
    print('Calculating the metrics took %.3f min.' % ((t2-t1)/60))

    # Create a formatted string to be printed to a file in MDAP class.
    toFile = ['#kmer\tp_value\tchurch\tbinomial\n']
    for motif, p_val, church_score, binomial_score in keepers:
        # AD added ".oneletter" to the motif to remove the " (1)" from output.
        toFile.append('%s\t%s\t%s\t%s\n' % (motif.oneletter, p_val,
                                            church_score, binomial_score))
    self.toFile = toFile

# Change log since last commit:
# 02-26-09 -- added MemeWrap._getMaxSize()
# 02-26-09 -- added MemeWrap._getWidthOption()
# 02-26-09 -- added MemeWrap._get_bFile()
# 02-27-09 -- added MemeWrap._getExtraArgs()
from TAMO.MD.Meme import Meme
from TAMO import Clustering
#from TAMO.DataSources import GO
from time import time

# Script: collect candidate kmers from the TC-8 cluster sequences, score each
# one against the full probe set, and write the scores to a tab-separated
# text file.
# NOTE(review): Fasta and MotifMetrics are used below but are not imported in
# the part of the file visible here -- presumably imported elsewhere; confirm.

TC8_path = '/Users/biggus/Documents/James/Data/ClusterDefs/TC-Fastas/TC-8.fas'
TC8_ids = Fasta.ids(TC8_path)
TC8_seqs = Fasta.seqs(TC8_path)

allSeqs = MotifMetrics.ProbeSet('/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.fas')

outFile = '/Users/biggus/Documents/James/Data/ClusterDefs/TC-8_MotifMetrics.5-12.txt'

# Gather the candidate kmers for every size k = 6..9 (range(6,10) excludes 10).
# NOTE(review): the output file name says "5-12" while only sizes 6-9 are
# scanned -- confirm which one is intended.
roughBestKmers = []
for i in range(6,10):
    imers = MotifMetrics.top_nmers_seqs(i,TC8_seqs)
    roughBestKmers.extend(imers)
    print('%s %smers found.' % (len(imers), i))

# One header line, then one line per kmer with the two scores.
kmerMetrics = ['Kmer\thGeoPval\tBinomOverRep\n']
for kmer in roughBestKmers:
    hGeoPval = allSeqs.Enrichment(kmer, TC8_ids)
    binom = allSeqs.overrep(kmer,TC8_ids)
    kmerMetrics.append('%s\t%s\t%s\n' % (kmer,hGeoPval,binom))

# Write through a dedicated handle and always close it, so the data is
# flushed to disk even if the write fails part-way; outFile stays the path.
outHandle = open(outFile,'w')
try:
    outHandle.writelines(kmerMetrics)
finally:
    outHandle.close()

print("Done.")