def main(): seqsD = Fasta.load(sys.argv[1]) seqs = seqsD.values() for w in range(1,7): allnmers = permute(w) nmersT = MotifTools.top_nmers(w,seqs,'with counts','purge Ns') nmersD = {} total = 0 for nmer in allnmers: nmersD[nmer] = 1 #Pseudo count total = total + 1 for nmer,count in nmersT[:]: try: rc = MotifTools.revcomplement(nmer) nmersD[nmer] = nmersD[nmer] + count nmersD[rc] = nmersD[rc] + count total = total + 2*count except KeyError: pass _t = nmersD.keys() _t.sort() print "# freq in %s (total %d with pseudocounts)"%(sys.argv[1],total) for nmer in _t: print "%-7s %20.17f"%(nmer,float(nmersD[nmer]) / total) sys.stdout.flush()
def info2seeds(N,infofile,probefile,species='YEAST'): G = ProbeSet(species) IDs = G.ids_from_file(probefile) Q = EM.theMarkovBackground.zeroth() seqs = Fasta.seqs(infofile) if not N: nmers = seqs else: nmers= MotifTools.top_nmers(N,seqs) if len(nmers) > 1000: nmers = nmers[0:1000] print "Scoring enrichment of %d nmers from %s"%len(nmers,infofile) sys.stdout.flush() nmers_scoresT = [] for nmer in nmers: if nmer.isalpha(): p = G.p_value(nmer,IDs,'') #'verbose' nmers_scoresT.append((nmer,p)) nmers_scoresT.sort(lambda x,y: cmp(x[1],y[1])) last = min(20,len(nmers_scoresT)) models = [] for i in range(last): seq = nmers_scoresT[i][0] m = MotifTools.Motif('',Q) m.compute_from_text(seq,0.1) models.append(m) for tup in nmers_scoresT[0:40]: print tup return(models)
def main(): seqsD = Fasta.load(sys.argv[1]) seqs = seqsD.values() for w in range(1, 7): allnmers = permute(w) nmersT = MotifTools.top_nmers(w, seqs, 'with counts', 'purge Ns') nmersD = {} total = 0 for nmer in allnmers: nmersD[nmer] = 1 #Pseudo count total = total + 1 for nmer, count in nmersT[:]: try: rc = MotifTools.revcomplement(nmer) nmersD[nmer] = nmersD[nmer] + count nmersD[rc] = nmersD[rc] + count total = total + 2 * count except KeyError: pass _t = nmersD.keys() _t.sort() print "# freq in %s (total %d with pseudocounts)" % (sys.argv[1], total) for nmer in _t: print "%-7s %20.17f" % (nmer, float(nmersD[nmer]) / total) sys.stdout.flush()
def main(fastafile, outDirectory):
    '''Write pseudocounted n-mer frequencies (widths 1-6) to a .freq file.

    fastafile    -- path of the FASTA file loaded with Fasta.load.
    outDirectory -- directory in which "<basename of fastafile>.freq"
                    is created.

    For each width a header line and one "word frequency" line per word
    are written.  Counts from MotifTools.top_nmers are credited to both
    the word and its reverse complement, on top of one pseudocount per
    word returned by permute().
    '''
    # !! 1/2/09 AD added 'fastafile' var and changed 'if __name__' as way to call this from script.
    seqsD = Fasta.load(fastafile)
    seqs = seqsD.values()
    basename = fastafile.split('/')[-1]
    output = []
    for w in range(1, 7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w, seqs, 'with counts', 'purge Ns')
        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1  # Pseudo count
            total = total + 1
        for nmer, count in nmersT:
            try:
                rc = MotifTools.revcomplement(nmer)
                nmersD[nmer] = nmersD[nmer] + count
                nmersD[rc] = nmersD[rc] + count
                total = total + 2 * count
            except KeyError:
                pass
        # AD 02-27-09 added a '\n' to make file look right
        output.append("# freq in %s (total %d with pseudocounts)\n" % (basename, total))
        for nmer in sorted(nmersD.keys()):
            # AD 02-27-09 added a '\n' to make file look right
            output.append("%-7s %20.17f\n" % (nmer, float(nmersD[nmer]) / total))

    # Open the output file and write out the results; the handle is
    # always closed, even if a write fails.
    outPath = '%s/%s.freq' % (outDirectory, basename)
    outFile = open(outPath, 'w')
    try:
        outFile.writelines(output)
    finally:
        outFile.close()
def loadmotif(infile, trimstart=0, trimend=0):
    '''Load a motif from a text file in one of three layouts.

    The layout is chosen from the first line returned by loadlist(infile):

    * exactly "A\\tC\\tG\\tT": each following line is one position with
      tab-separated A, C, G, T values (read as floats);
    * first character in "ACGT": the lines are passed to
      MotifTools.Motif directly;
    * anything else: the first four lines are the A, C, G and T rows of
      whitespace-separated integer counts, one column per position.

    trimstart / trimend drop that many positions (or lines, in the second
    layout) from the start / end.  The trim is applied to all three
    layouts; previously it was silently ignored for the four-row layout.
    '''
    from TAMO import MotifTools

    def trim(rows):
        # A zero trimend must not be used as the slice bound "-0",
        # which would yield an empty list.
        if trimend == 0:
            return rows[trimstart:]
        return rows[trimstart:-trimend]

    lines = loadlist(infile)
    if lines[0] == "A\tC\tG\tT":
        ma = []
        for l in lines[1:]:
            p = l.split("\t")
            ma.append({
                'A': float(p[0]),
                'C': float(p[1]),
                'G': float(p[2]),
                'T': float(p[3]),
            })
        return MotifTools.Motif_from_counts(trim(ma))
    elif lines[0][0] in 'ACGT':
        return MotifTools.Motif(trim(lines))
    else:
        na = [list(map(int, line.split())) for line in lines]
        ma = []
        for i in range(len(na[0])):
            ma.append({
                'A': na[0][i],
                'C': na[1][i],
                'G': na[2][i],
                'T': na[3][i],
            })
        return MotifTools.Motif_from_counts(trim(ma))
def tamo2tamo(file, outname): global probefile, PROBESET, fsafile motifs = MotifTools.load(file) if fsafile: fsaname = fsafile else: fsaname = find_fsa(file) print '# FSA ', fsaname fsaD = MotifMetrics.fasta2seqs(fsaname, 'want_dict') probes = fsaD.keys() if not probefile: PROBESET = MotifMetrics.ProbeSet('YEAST') #PROBESET= pick_genome(fsaname) #for key,seq in fsaD.items(): # PROBESET.probes[key] = seq print "# %d motifs" % len(motifs) for motif in motifs: #motif.pvalue, motif.church = 1,1 #Comment this! if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif, probes, 'v') if motif.church == 1: motif.church = PROBESET.church(motif, probes, 'v') #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v') #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v') #if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v') if motif.ROC_auc == None: motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v') #if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif,probes,'v') if motif.frac == None: motif.frac = PROBESET.frac(motif, probes, 'v', 0.7) if motif.numbound == 0: matching = PROBESET.matching_ids(motif, [], factor=0.7) matchbound = [x for x in matching if x in probes] motif.numbound = len(probes) motif.nummotif = len(matching) motif.numboundmotif = len(matchbound) if 0 and motif.CRA == None: try: pass CRA, Cfrac = PROBESET.cons_ROC_AUC(motif, probes, 'v', tuple='YES') motif.CRA = CRA motif.Cfrac = Cfrac except: pass MotifTools.save_motifs(motifs, outname)
def memefiles2tamo(files, tamoname): global probefile, PROBESET, fsafile motifs = [] for filename in files: print ">>>SDFSD>F ",filename if re.search('\.ace$',filename): mdobject = AlignAce.AlignAce(filename) if not mdobject.fastafile: mdobject.fastafile=filename.replace('.ace','.fsa') elif re.search('\.meme.*$',filename): mdobject = Meme.Meme(filename) if not mdobject.fastafile: mdobject.fastafile=re.sub('\..\.meme','.meme',filename).replace('.meme','.fsa') motifs.extend(mdobject.motifs) #fsaname = find_fsa(mdobject.fastafile) print mdobject.fastafile if fsafile: fsaname = fsafile else: fsaname = Fasta.find(mdobject.fastafile) fsaD = Fasta.load(fsaname) probes = fsaD.keys() if not probefile: PROBESET = MotifMetrics.ProbeSet('YEAST') #PROBESET= pick_genome(fsaname) for key,seq in fsaD.items(): PROBESET.probes[key] = seq for motif in motifs: if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v') if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v') #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v') #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v') #if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v') if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v') #if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif,probes,'v') if motif.frac == None: motif.frac = PROBESET.frac(motif,probes,'v',0.7) if re.search('\.meme$',filename): motif.MAP = -math.log(motif.evalue)/math.log(10) if 0 and (motif.CRA == None): try: pass CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,probes,'v',tuple='YES') motif.CRA = CRA motif.Cfrac = Cfrac except: pass if re.search('\.meme$',filename): mdobject.motifs.sort(lambda x,y: cmp(x.pvalue, y.pvalue)) else: mdobject.motifs.sort(lambda x,y: cmp(x.church, y.church)) MotifTools.save_motifs(motifs,tamoname)
def tamo2tf(TAMO_file):
    '''Converts TAMO files to the TRANSFAC format

    The motifs in TAMO_file are read with MotifTools.txt2motifs and
    written to "<TAMO_file>.tf".  Each motif gets a "DE" line (its
    `source`, or "<file name>_<n>" when `source` is empty), one line per
    position holding the position index followed by int(2**logP * 100)
    for A, C, G and T, and a closing "XX" line.
    '''
    ml = MotifTools.txt2motifs(TAMO_file)
    TAMO_file_name = TAMO_file.split("/")[-1]
    ACGT = ["A", "C", "G", "T"]
    oup = open("%s.tf" % (TAMO_file), "w")
    try:
        n = 1  # 1-based motif number, used only for unnamed motifs
        for m in ml:
            if m.source == "":
                oup.write("DE\t%s_%s\t%s_%s\n" %
                          (TAMO_file_name, n, TAMO_file_name, n))
            else:
                oup.write("DE\t%s\t%s\n" % (m.source, m.source))
            for i in range(m.width):
                oup.write("%s\t" % i)
                for letter in ACGT:
                    # Motifs without a logP matrix get only the index column.
                    if m.logP:
                        Pij = pow(2.0, m.logP[i][letter])
                        oup.write("%s\t" % int(Pij * 100))
                oup.write("\n")
            oup.write("XX\n")
            n += 1
    finally:
        # Close the handle even if a motif fails to serialise.
        oup.close()
def averagemotifs(motifs,ovlp=2,template=None,DFUNC=negcommonbitsrange,VERBOSE=1,prop=''): if not template: Dmat = computeDmat(motifs) idx = centroididx(Dmat) template = motifs[idx] for m in motifs: off, rc = minshortestoverhangdiff(template,m,OVLP(template,m),'want_offset',DFUNC=DFUNC) m.offset = off m.rc = rc #Find most negative offset offsets = [m.offset for m in motifs] ; offsets.sort() maxposs = [(m.offset + m.width) for m in motifs] ; maxposs.sort() minpos = -offsets[0] maxpos = maxposs[-1] + minpos pmotifs = [] for m in motifs: if m.rc: _m = m.revcomp() else : _m = m leftpad = minpos + m.offset rightpad = maxpos - (leftpad + m.width) padded = _m[-leftpad,_m.width+rightpad] #print '%s%s%s\t%s'%('*'*leftpad,_m.oneletter,'*'*rightpad,padded) pmotifs.append(padded) AVE = MotifTools.sum(pmotifs,[]) if VERBOSE: for m in pmotifs: d = minshortestoverhangdiff(AVE,m,OVLP(AVE,m),DFUNC=DFUNC) print '%s %5.3f'%(m.oneletter,d), if m.__dict__.has_key('key'): print m.key, if prop and m.__dict__.has_key(prop): print m.__dict__[prop], print print '-'*m.width return AVE
def __init__(self,seed_seqs, all_seqs, width = 6, verbose = ''): self.seed_seqs = seed_seqs #Sequences to be scanned for seeds self.seqs = all_seqs self.candidates = [] self.models = [] #Set directly or computed from seed_seqs self.width = width self.verbose = verbose if width: self.goodwmersT = MotifTools.top_nmers(self.width,self.seed_seqs,1,"") else: self.goodwmersT = zip(self.seed_seqs,range(len(self.seed_seqs))) self.bgprob = {'A': 0.31, 'C': .19, 'G': .19, 'T': .31} self.beta = 0.001 self.deltamin = 1e-3 self.probes = [] self.method = "ZOOPS" # OOPS or ZOOPS ) self.param = {} self.gapflank = 0 self.gapweight = 0.2 self.seedbeta = 0.02 self.joint = 1 global theMarkovBackground if theMarkovBackground: self.bgprob = theMarkovBackground.zeroth() '''DELETE
def parse_opts(): global GLOBALS global DFUNC, DMAX short_opts = 'm:' long_opts = ['dfunc:'] try: opts, args = getopt.getopt(sys.argv[1:], short_opts, long_opts) except getopt.GetoptError: print getopt.GetoptError.__dict__ usage() if not opts: usage() GLOBALS['args'] = args GLOBALS['motifs'] = [] DFUNCtxt = None for opt,value in opts: if opt == '-m': GLOBALS['motifs'] = MotifTools.txt2motifs(value) if opt == '--dfunc': DFUNCtxt = value if opt == '-d': DMAX = float(value) # Deal with DFUNC and DMAX if DFUNCtxt == 'NCB': _DFUNC = MotifCompare.negcommonbits elif DFUNCtxt: try: exec ("_DFUNC = MotifCompare.%s"%DFUNCtxt) except: usage("No such distance metric: %s"%DFUNCtxt) if _DFUNC: set_dfunc(_DFUNC,DMAX)
def tamo2tamo(file, outname): global probefile, PROBESET, fsafile motifs = MotifTools.load(file) if fsafile: fsaname = fsafile else: fsaname = find_fsa(file) print '# FSA ',fsaname fsaD = MotifMetrics.fasta2seqs(fsaname,'want_dict') probes = fsaD.keys() if not probefile: PROBESET = MotifMetrics.ProbeSet('YEAST') #PROBESET= pick_genome(fsaname) #for key,seq in fsaD.items(): # PROBESET.probes[key] = seq print "# %d motifs"%len(motifs) for motif in motifs: #motif.pvalue, motif.church = 1,1 #Comment this! if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v') if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v') #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v') #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v') #if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v') if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v') #if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif,probes,'v') if motif.frac == None: motif.frac = PROBESET.frac(motif,probes,'v',0.7) if motif.numbound == 0: matching = PROBESET.matching_ids(motif,[],factor=0.7) matchbound = [x for x in matching if x in probes] motif.numbound = len(probes) motif.nummotif = len(matching) motif.numboundmotif = len(matchbound) if 0 and motif.CRA == None: try: pass CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,probes,'v',tuple='YES') motif.CRA = CRA motif.Cfrac = Cfrac except: pass MotifTools.save_motifs(motifs,outname)
def pick_chunk_score(wdir, TAMO_file, target, genome): '''Trims and returns the top motif in a cluster. This script takes in the TAMO file from the motifs in a single cluster. It trims the low-information ends from each motifs. It then indentifies the motif that is most significantly represented in the target genes in your genome. If no motif is significantly represented, then a blank top motif file is created. ''' os.system("cd %s" % wdir) os.chdir(wdir) script_dir = '/'.join(os.path.abspath(__file__).split('/') [:-1]) # path to pcc_merge_CC.py script ## # step 1 trim tamo to eliminate low information flanking sequence trim_motif(TAMO_file, 0.1) ## # step 2 Group Specificity Score" from the Church lab # python MotifMetrics.py [Genes of interest] -genome [FASTA of promoter sequence] -t [Trimmed TAMO of cluster motifs] # MotifMetrics.py checks if the motifs appear disproportionatly to the # targets compared to the rest of the genes. os.system( "python %s/MotifMetrics.py %s -genome %s -t %s_0.1.trim -spec > %s_0.1.trim_Cout" % (script_dir, target, genome, TAMO_file, TAMO_file)) ## # Gets the motif that is most significantly represented in your target genes # Returns "None" if none of the motifs has a p-value above 0.001. topm = parse_out_pcs("%s_0.1.trim_Cout" % TAMO_file) print "topm", topm ## # Writes the top motif to its own directory. if topm != "None": newdic = {} ml = MotifTools.txt2motifs("%s_0.1.trim" % TAMO_file) for m in ml: if m.oneletter == topm: newdic[m.oneletter] = m save_motifs(newdic.values(), "%s.TOP" % TAMO_file) os.system("rm %s_0.1.trim" % TAMO_file) os.system("rm %s_0.1.trim_Cout" % TAMO_file) ## # Writes a blank document if there was no top motif. else: oup = open("%s.TOP" % TAMO_file, "w") oup.close()
def ace2tamo(filename, tamoname): global probefile, PROBESET if re.search('\.ace$',filename): mdobject = AlignAce.AlignAce(filename) elif re.search('\.meme$',filename): mdobject = Meme.Meme(filename) fsaname = find_fsa(mdobject.fastafile) fsaD = MotifMetrics.fasta2seqs(fsaname,'want_dict') probes = fsaD.keys() if not probefile: PROBESET = MotifMetrics.ProbeSet('HUMAN_250') #PROBESET= pick_genome(fsaname) for key,seq in fsaD.items(): PROBESET.probes[key] = seq for motif in mdobject.motifs: if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v') if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v') if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v') #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v') if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v') if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v') if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif,probes,'v') if re.search('\.meme$',filename): motif.MAP = -math.log(motif.evalue)/math.log(10) sys.stdout.flush() i = 0 for motif in mdobject.motifs: motif.seednum = i ; i=i+1 kmers = motif.bogus_kmers(100) motif.maxscore = -100 scores = [motif.scan(kmer)[2][0] for kmer in kmers] print Arith.avestd(scores) if re.search('\.meme$',filename): mdobject.motifs.sort(lambda x,y: cmp(x.pvalue, y.pvalue)) else: mdobject.motifs.sort(lambda x,y: cmp(x.church, y.church)) MotifTools.save_motifs(mdobject.motifs,tamoname)
def trim_motif(TAMO_file, cut=0.4):
    '''Trims the motifs in TAMO_file, eliminating low-information flanks.

    Every motif loaded from TAMO_file is replaced by motif.trimmed(cut)
    and the results are saved to "<TAMO_file>_<cut>.trim".
    '''
    loaded = MotifTools.load(TAMO_file)
    outname = "".join([TAMO_file, "_", str(cut), ".trim"])
    trimmed = [motif.trimmed(cut) for motif in loaded]
    save_motifs(trimmed, outname)
def TAMO_split(TAMO_file, motifs_per_file=190): '''This function splits a TAMO into smaller files for create_cc''' ml = MotifTools.txt2motifs(TAMO_file) total = len(ml) / int(motifs_per_file) # Total number of TAMOs to generate by = motifs_per_file for i in range(total): print i print i * by + by, TAMO_file + '_n%s' % i save_motifs(ml[i * by:i * by + by], TAMO_file + '_n%s' % i) print total * by, len(ml), TAMO_file + '_n%s' % (total) save_motifs(ml[total * by:len(ml)], TAMO_file + '_n%s' % (total)) return (total)
def opentamo(fileloc):
    '''
    Load the motifs of a tamo file via MotifTools.load.

    Has 1 argument:
    - fileloc: a string with the location of the file

    Returns the list of motifs, or an empty list when loading raises
    IOError (for example because the file does not exist).
    '''
    try:
        motifs = MotifTools.load(fileloc)
    except IOError:
        motifs = []
    return motifs
def Reduce_Nmers(Info): print 'COMPUTING Nmers ....' mseqs = ReduceInfo2seqs(Info,70, lambda L: MotifTools.top_nmers(6,L)[0:3]) print "Combining representative sequences...: " for i in range(len(mseqs)): i = i + 1 print '\t%s'%mseqs[i-1], if (i%5 == 0): print print top_seq_pairs = MotifTools.top_nmers(5,mseqs,1) total_nmers = 0 for (mner,count) in top_seq_pairs: total_nmers = total_nmers + count for (nmer,count) in top_seq_pairs[0:8]: print "RESULT: %s\t%2d (%5.2f%%) occurences: "%(nmer,count, 100*float(count)/total_nmers), for bsite in Info.query['bsites']: seq = bsite.cleantxt() (max,s1,s2) = MotifTools.compare_seqs(nmer,seq) print ' %s vs %s %4.2f correct'%(s1,s2,max)
def tamofile2motifs(filename):
    '''Parse a TAMO text file into a list of motifs.

    A line starting with "Log-odds m" opens a new motif: the next line
    gives the width (its token count minus one) and the four lines after
    that carry the per-letter log-odds rows.  Later lines starting with
    "Motif ", "Threshold:", "Source:", "Gamma:" or "Evalue" set
    attributes on the most recent motif; "Seed " lines record seed texts
    by number.  Afterwards each motif receives `seedfile` (when one was
    named in a "Using ... as seeds" line) and `seedtxt` (when its
    `seednum` matches a recorded seed).
    '''
    handle = open(filename,'r')
    lines = handle.readlines()
    handle.close()

    motifs = []
    seedD = {}
    seedfile = ''
    for i, line in enumerate(lines):
        if line.startswith('Log-odds m'):
            w = len(lines[i+1].split())-1
            ll = [{} for _pos in range(w)]
            for j in range(4):
                toks = lines[i+j+2].split()
                letter = toks[0][1]
                for pos in range(w):
                    ll[pos][letter] = float(toks[pos+1])
            motifs.append(MotifTools.Motif_from_ll(ll))
        if line.startswith('Motif '):
            toks = line.split()
            motifs[-1].nseqs = float(re.sub('[\(\)]','',toks[3]))
            motifs[-1].totalbits = float(toks[5])
            motifs[-1].MAP = float(toks[7])
            motifs[-1].seeddist = float(toks[9])
            motifs[-1].seednum = int(toks[10][0:-1])
            motifs[-1].pvalue = math.pow(10,-float(toks[12]))
            if 'ch:' in toks:
                motifs[-1].church = math.pow(10,-float(toks[14]))
        if line.startswith('Threshold:'):
            toks = line.split()
            motifs[-1].threshold = float(toks[1])
        if line.startswith('Seed '):
            toks = line.split()
            seed_id = int(toks[1][0:-1])  # '10:' -> '10'
            seedD[seed_id] = toks[2]
        if line.startswith('Source:'):
            motifs[-1].source = line[7:].strip()
        if line.startswith('Gamma:'):
            motifs[-1].gamma = float(line[6:])
        if line.startswith('Evalue'):
            motifs[-1].evalue = float(line[7:].strip())
        if 'Using' in line and 'as seeds' in line:
            # e.g. "#Using all (132) motifs in SLT_081503.seeds as seeds:"
            seedfile = line.split()[-3]

    for motif in motifs:
        if seedfile:
            motif.seedfile = seedfile
        seednum = motif.seednum
        if seednum in seedD:
            motif.seedtxt = seedD[seednum]
    return motifs
def motifs2tamo(motifs, outname): global probefile, PROBESET fsaname = find_fsa(outname) fsaD = MotifMetrics.fasta2seqs(fsaname,'want_dict') probes = fsaD.keys() if not probefile: PROBESET = MotifMetrics.ProbeSet('YEAST') #PROBESET= pick_genome(fsaname) #for key,seq in fsaD.items(): # PROBESET.probes[key] = seq print "# %d motifs"%len(motifs) for motif in motifs: if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v') if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v') if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v') #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v') if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v') if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v') if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif,probes,'v') MotifTools.save_motifs(motifs,outname)
def parse_block(name, block):
    '''Build a motif named `name` from the text rows in `block`.

    Each row is split on whitespace; fields 1 to 4 are read as the A, C,
    G and T values (field 0 is skipped).  The rows are handed to
    MotifTools.Motif_from_counts and `name` is stored as the motif's
    `source`.
    '''
    columns = (("A", 1), ("C", 2), ("G", 3), ("T", 4))
    rows = []
    for raw in block:
        fields = raw.strip().split()
        counts = dict.fromkeys('ACTG', 0)
        for base, col in columns:
            counts[base] = float(fields[col])
        rows.append(counts)
    motif = MotifTools.Motif_from_counts(rows)
    motif.source = name
    return motif
def test(): motifs = [] betalist = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 4.0] for beta in [1.0]: m = MotifTools.Motif() m.compute_from_text('GGTTTCAT', beta) #STE12 binding site print m m._print_ll() print "Against Ste12:" match = validate(m, "STE12", 'V', 'T') print "Against Fkh2:" fmatch = validate(m, "FKH2", 'V', 'T') print beta, match, fmatch
def freq_from_seqs_old(self,seqs):
    '''Fill self.F with n-mer frequencies (sizes 1-5) taken from seqs.

    For each size the words reported by MotifTools.top_nmers are recorded
    in self.nmers_by_size.  A word equal to its own reverse complement
    gets count/total; any other word shares count/total equally with its
    reverse complement.  self.highestorder is set to 4.
    '''
    self.highestorder = 4
    for depth in range(1,6):
        nmersT = MotifTools.top_nmers(depth, seqs, "TUPLES")
        self.nmers_by_size[depth] = [pair[0] for pair in nmersT]
        total = sum([count for (_word, count) in nmersT])
        for nmer, count in nmersT:
            rc = MotifTools.revcomplement(nmer)
            f = float(count)/total  # palindrome: the count is already complete
            if nmer != rc:
                f = f/2  # split the count between the two strands
            self.F[nmer] = f
            self.F[rc] = f
    # A debugging loop over range(0) (it never ran) used to follow here;
    # it printed each frequency and the per-size totals.
def alignAndCombineMotifs(motifs, weights):
    '''Align motifs in order of decreasing |weight| and sum them.

    The motifs are ordered by absolute weight (largest first), aligned
    with alignSimilarMotifs(minoverlap=4) and combined with
    MotifTools.sum using the negated weights.
    '''
    # Ascending sort followed by reverse (rather than reverse=True)
    # keeps the original ordering of equal-weight entries.
    ranked = sorted(zip(motifs, weights), key=lambda pair: abs(pair[1]))
    ranked.reverse()
    ordered = [motif for (motif, _weight) in ranked]
    aligned = alignSimilarMotifs(ordered, minoverlap=4)
    negated = [-weight for (_motif, weight) in ranked]
    return MotifTools.sum(aligned, negated)
def combine_distance_matrix_for_2(wdir, TAMO_file_1, TAMO_file_2): '''Combines matricies made from two TAMO files. This script is used to create the final matrix after all jobs from create_cc_for_2 are complete. ''' ml_1 = MotifTools.txt2motifs(TAMO_file_1) ml_2 = MotifTools.txt2motifs(TAMO_file_2) n_split_1 = len(ml_1) / 100 n_split_2 = len(ml_2) / 100 print n_split_1, len(ml_1) print n_split_2 # Change to the working directory. os.system("cd %s" % wdir) os.chdir(wdir) # This loop will paste together matricies for i in range(n_split_1 + 1): com = "paste " for j in range(n_split_2 + 1): com += "%s_n%s-%s_n%s.dm " % (TAMO_file_1, i, TAMO_file_2, j) com += "> distance_%s" % i print com os.system(com) # com = "cat " for i in range(n_split_1 + 1): com += "distance_%s " % i com += "> %s-%s.dm" % (TAMO_file_1, TAMO_file_2) print com os.system(com)
def parse_opts(): global GLOBALS short_opts = 'm:g:' long_opts = ['genome=','top='] try: opts, args = getopt.getopt(sys.argv[1:], short_opts, long_opts) except getopt.GetoptError: print getopt.GetoptError.__dict__ usage() if not opts: usage() GLOBALS['args'] = args for opt,value in opts: if opt == '-m': GLOBALS['motifs'] = MotifTools.txt2motifs(value) if opt in ['-g', '--genome']: GLOBALS['genomefile'] = value if opt == '--top': GLOBALS['top'] = int(value)
def Read_Dreme_PSSM(lines):
    '''Build a motif from a newline-separated block of numeric rows.

    Every line of `lines` is split on whitespace and converted to floats
    (an empty line yields an empty row).  The rows are converted with
    MotifTools.toDict and passed to MotifTools.Motif_from_counts.
    '''
    rows = [[float(item) for item in line.split()]
            for line in lines.split('\n')]
    return MotifTools.Motif_from_counts(MotifTools.toDict(rows))
def study_seqs(self,seqs):
    '''Fill self.D with log2 n-mer frequencies (sizes 1-5) from seqs.

    For each size, counts come from MotifTools.top_nmers.  A word equal
    to its own reverse complement gets log2(count/total); any other word
    and its reverse complement each get log2(0.5 * count/total).
    self.highestorder is set to 5 afterwards.
    '''
    log2 = math.log(2)
    for depth in range(1,6):
        nmersT = MotifTools.top_nmers(depth, seqs, "TUPLES")
        total = 0
        for nmer, count in nmersT:
            total = total + count
        for nmer, count in nmersT:
            rc = MotifTools.revcomplement(nmer)
            if rc != nmer:
                # Two distinct strands share the count equally.
                f_2 = math.log(0.5 * float(count)/total)/log2
                self.D[nmer] = f_2
                self.D[rc] = f_2
            else:
                self.D[nmer] = math.log(float(count)/total)/log2
    self.highestorder = 5
def freq_from_seqs(self,seqs):
    '''Fill self.F with pseudocounted n-mer frequencies (sizes 1-6).

    For each size, every word from permute() starts with one pseudocount
    and is recorded in self.nmers_by_size.  Counts from
    MotifTools.top_nmers are credited to the word and to its reverse
    complement.  Each word's share of the total is stored in self.F under
    both the word and its reverse complement.  self.highestorder is set
    to 6.
    '''
    self.highestorder = 6
    for w in range(1,7):
        words = permute(w)
        observed = MotifTools.top_nmers(w,seqs,'with counts','purge Ns')
        self.nmers_by_size[w] = words[:]

        tally = {}
        total = 0.0
        for word in words:
            tally[word] = 1  # Pseudo count
            total += 1

        for word, count in observed:
            try:
                partner = MotifTools.revcomplement(word)
                tally[word] += count
                tally[partner] += count
                total += 2*count
            except KeyError:
                # Word is not among those produced by permute().
                pass

        for word, n in tally.items():
            freq = n/total
            self.F[word] = freq
            self.F[MotifTools.revcomplement(word)] = freq
def Make_PWM_Motif(filename, motifBackGround=""):
    '''Read a PWM file and return it as a motif.

    Read_PWM(filename) supplies a name and a matrix; the matrix is
    converted with MotifTools.toDict and MotifTools.Motif_from_ll, and
    the name becomes the motif's `source`.  The motifBackGround argument
    is not used by this function.
    '''
    name, pwm = Read_PWM(filename)
    motif = MotifTools.Motif_from_ll(MotifTools.toDict(pwm))
    motif.source = name
    return motif
def averagemotifs(motifs, ovlp=2, template=None, DFUNC=negcommonbitsrange, VERBOSE=1, prop=''): if not template: Dmat = computeDmat(motifs) idx = centroididx(Dmat) template = motifs[idx] for m in motifs: off, rc = minshortestoverhangdiff(template, m, OVLP(template, m), 'want_offset', DFUNC=DFUNC) m.offset = off m.rc = rc #Find most negative offset offsets = [m.offset for m in motifs] offsets.sort() maxposs = [(m.offset + m.width) for m in motifs] maxposs.sort() minpos = -offsets[0] maxpos = maxposs[-1] + minpos pmotifs = [] for m in motifs: if m.rc: _m = m.revcomp() else: _m = m leftpad = minpos + m.offset rightpad = maxpos - (leftpad + m.width) padded = _m[-leftpad, _m.width + rightpad] #print '%s%s%s\t%s'%('*'*leftpad,_m.oneletter,'*'*rightpad,padded) pmotifs.append(padded) AVE = MotifTools.sum(pmotifs, []) if VERBOSE: for m in pmotifs: d = minshortestoverhangdiff(AVE, m, OVLP(AVE, m), DFUNC=DFUNC) print '%s %5.3f' % (m.oneletter, d), if m.__dict__.has_key('key'): print m.key, if prop and m.__dict__.has_key(prop): print m.__dict__[prop], print print '-' * m.width return AVE
def showdiffXvert(motif, seq, OVLP_FCN=None, DIFF_FCN=None): ''' The funtion converts the sequence to a Motif, computes the D of the best alignment, and prints the alignment that generated that D. ''' MSOdiff = minshortestoverhangdiff if not OVLP_FCN: OVLP_FCN = lambda A, B: min(min(A.width, B.width) - 1, 7) bg = motif.background other = MotifTools.Motif_from_text(seq, bg=bg) ovlp = OVLP_FCN(motif, other) diff = MSOdiff(motif, other, ovlp, DFUNC=DIFF_FCN) offset, rcflag = MSOdiff(motif, other, ovlp, 'want_offset', DFUNC=DIFF_FCN) if rcflag: m = other.revcomp() else: m = other print 'MSOdiff: %8.4f %s%s%s' % (diff, ' ' * 15, motif.oneletter, ' ' * (30 - motif.width)) print ' %8s %s%s%s' % (' ', ' ' * (15 + offset), m.oneletter, ' ' * (30 - offset - other.width)) return diff
def main(): fsa_fcn = up_and_no_N parse() FID = open(sys.argv[1]) tokss = [x.strip().split(',') for x in FID.readlines()] FID.close() D = {} for expt,motif,score,source in tokss: print expt,motif if expt == 'Category': continue if motif == 'x': continue motif = MotifTools.Motif_from_text(motif) motif.kellis = float(score) motif.source = source try: D[expt].append(motif) except: D[expt] = [motif] for expt,motifs in D.items(): root = expt ext = 'cons' if root[0:3] == 'Rnd': num = re.sub('.*_','',root) if len(num) == 1: root = re.sub('_','_00',root) else: root = re.sub('_','_0',root) root = re.sub('Rnd','random_',root) outname = '%s.t%s'%(root,ext) print '%-18s --> %s'%(root,outname) sys.stdout.flush() motifs2tamo(motifs,outname) try: pass #tamo2tamo(filename,outname) except: print "Error: Could not convert %s [[ %s ]]"%( filename, outname)
def memefiles2tamo(files, tamoname): global probefile, PROBESET motifs = [] for filename in files: print ">>>SDFSD>F ", filename if re.search('\.ace$', filename): mdobject = AlignAce.AlignAce(filename) if not mdobject.fastafile: mdobject.fastafile = filename.replace('.ace', '.fsa') elif re.search('\.meme.*$', filename): mdobject = Meme.Meme(filename) if not mdobject.fastafile: mdobject.fastafile = re.sub('\..\.meme', '.meme', filename).replace('.meme', '.fsa') motifs.extend(mdobject.motifs) #fsaname = find_fsa(mdobject.fastafile) print mdobject.fastafile fsaname = Fasta.find(mdobject.fastafile) fsaD = Fasta.load(fsaname) probes = fsaD.keys() if not probefile: PROBESET = MotifMetrics.ProbeSet('YEAST') #PROBESET= pick_genome(fsaname) for key, seq in fsaD.items(): PROBESET.probes[key] = seq for motif in motifs: if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif, probes, 'v') if motif.church == 1: motif.church = PROBESET.church(motif, probes, 'v') if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif, probes, 3, 'v') #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v') #if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v') if motif.ROC_auc == None: motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v') if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif, probes, 'v') if motif.frac == None: motif.frac = PROBESET.frac(motif, probes, 'v', 0.7) if re.search('\.meme$', filename): motif.MAP = -math.log(motif.evalue) / math.log(10) if 1 and (motif.CRA == None): try: pass CRA, Cfrac = PROBESET.cons_ROC_AUC(motif, probes, 'v', tuple='YES') motif.CRA = CRA motif.Cfrac = Cfrac except: pass if re.search('\.meme$', filename): mdobject.motifs.sort(lambda x, y: cmp(x.pvalue, y.pvalue)) else: mdobject.motifs.sort(lambda x, y: cmp(x.church, y.church)) MotifTools.save_motifs(motifs, tamoname)
def all_Wmers(self,N,seq):
    '''Return background log-probability terms for every width-N window.

    For each window start i in seq, the term is the background score of
    the whole sequence (theMarkovBackground.logbackground) minus the
    contribution of the window given up to `highestorder` flanking
    letters on each side, plus theMarkovBackground.CP of each window
    letter weighted by (1 - self.mask[p]).

    The returned list holds the per-window values twice: once for the
    forward strand and an identical copy for the reverse strand.
    '''
    seqrc = MotifTools.revcomplement(seq)
    Mlh = theMarkovBackground.highestorder
    Mlb = theMarkovBackground.logbackground
    MCP = theMarkovBackground.CP
    Fbg = Mlb(seq)
    Rbg = Mlb(seqrc)  # computed as before; the value is not used below
    nmask = [1 - bit for bit in self.mask]

    # ?? QUESTION: Is it sensible to compute the background
    # probabilities this way?
    #  1) BG of complementary strand is taken as equal to primary strand.
    #  2) Letters inside the motif window are not used for conditional
    #     probabilities.  As a result, the calculation essentially breaks
    #     down to the log probability the background emits the sequence
    #     to the left of the window plus the log probability the
    #     background emits the sequence to the right.
    #  3) An efficient way to compute this:
    #       a) Compute the background probability for the entire
    #          probe/sequence
    #       b) (Quick) Compute logQdiff below
    #       c) Subtract
    per_window = []
    for start in range(len(seq)-N+1):
        stop = start + N
        window = seq[start:stop]
        lflank = seq[0:start][-Mlh:]
        rflank = seq[stop:][0:Mlh]

        # This is the fast way
        logQdiff = Mlb(lflank + window + rflank) - Mlb(lflank) - Mlb(rflank)
        logQtot = Fbg - logQdiff

        # Add a bit back for intervening bases in the "gap"
        gapbg = 0
        for p in range(N):
            gapbg = gapbg + MCP[window[p]] * nmask[p]
        logQtot = logQtot + gapbg

        per_window.append(logQtot)

    # Forward values followed by the (identical) reverse-strand values.
    return per_window + per_window
def merge_runs(TAMO_file, wdir, height, distance, ancestor, target, genome):
    '''Merge motifs that fall in the same UPGMA cluster of the PCC matrix.

    The script was originally written by Cheng Zou, and then converted to a
    function by Alex Seddon.

    Parameters, as used below:
      TAMO_file -- TAMO motif file; also the prefix of every intermediate file.
      wdir      -- working directory; the process changes into it.
      height    -- tree-cut height, used in the cluster file name
                   "<TAMO_file>.dm_UPGMA_Cl_<height>".
      distance  -- only echoed; not used otherwise in this function.
      ancestor  -- 0: merge cluster members directly; 1: additionally run
                   STAMP on each cluster before picking the top motif.
      target, genome -- passed through to pick_chunk_score.

    Writes "<TAMO_file>_sub_new.tm", "<TAMO_file>_sub_old.tm" and the
    combined "<TAMO_file>_P1.tm".

    NOTE(review): line structure was reconstructed from a flattened copy of
    this function -- confirm the nesting against the original file.
    '''
    print "Here are the parameters you specified in this run "
    print "-tamo %s" % TAMO_file
    print "-wdir %s" % wdir
    print "-h height to cut the tree, %s" % height
    print "-distance %s" % distance
    print "-ancestor %s" % ancestor
    print "-target %s" % target
    print "-genome %s" % genome
    # NOTE(review): execution continues after help(); nothing here exits.
    if TAMO_file == '' or wdir == '':
        help()
    # The shell "cd" runs in a child process; os.chdir is what moves this process.
    os.system("cd %s" % wdir)
    os.chdir(wdir)
    # This code was in the original clustering script. It has been taken out
    # because the processes involved take too long and have been replaced by
    # the matrix creation scripts and the run_UPGMA script.
    #if distance==0:
    #    os.system("python /mnt/home/seddonal/scripts/5_motif_merging/3.calculate_distance_matrix.py -i %s --dfunc pccrange" % TAMO_file)
    #os.system("R --vanilla --slave --args %s.dm %s< /mnt/home/seddonal/scripts/5_motif_merging/UPGMA_final.R> %s.Rout" % (TAMO_file,height,TAMO_file))
    cl_dic = {}
    n = 0
    # The file, TAMO_file.dm_UPGMA_Cl_<height>, is in the order of the motifs
    # that appear in the TAMO_file. If two motifs have the same number, they
    # are considered a part of the same cluster.
    # This loop pulls the clustering information out of this file and creates
    # the dictionary cl_dic = {cluster_index:{motif_index:'1'}}
    for line in open("%s.dm_UPGMA_Cl_%s" % (TAMO_file, height), "r"):
        # Gets the cluster index of this motif
        cl = line.strip()
        # Adds the cluster index if it has not been seen yet
        if not cl_dic.has_key(cl):
            cl_dic[cl] = {}
        cl_dic[cl][n] = "1"  # Adds the motif to that cluster
        n += 1  # Increases the motif index for the next motif
    #print cl_dic
    ml = MotifTools.txt2motifs(TAMO_file)
    old = []  # List of motifs that are the sole members of a cluster.
    # The two branches below differ only in the STAMP step and could be
    # factored into a shared helper.
    print ancestor, ancestor == 0
    if ancestor == 0:
        # This loop looks at each cluster and attempts to merge the motifs
        # in the cluster if there are multiple motifs.
        for i in cl_dic.keys():
            print i, cl_dic[i]
            # If there are multiple motifs in the cluster, it merges the motifs
            if len(cl_dic[i]) > 1:
                # Adds all of the motifs in the cluster to a list called
                # mlist.
                mlist = []
                for j in cl_dic[i]:
                    mlist.append(ml[j])
                # Saves these motifs to their own TAMO file.
                save_motifs(mlist, "%s_sub_%s.tm" % (TAMO_file, i))
                # NOTE(review): the original author suspected this round trip
                # through TF format exists only to keep the names consistent;
                # that has not been verified.
                tamo2tf("%s_sub_%s.tm" % (TAMO_file, i))
                os.system("cat %s_sub_%s.tm.tf > %s_sub_%s_sum.tm.tf" % (TAMO_file, i, TAMO_file, i))
                tf2tamo("%s_sub_%s_sum.tm.tf" % (TAMO_file, i))
                # Gets the top motif in the cluster.
                pick_chunk_score(wdir, '%s_sub_%s_sum.tm.tf.tm' % (TAMO_file, i), target, genome)
                # Removes the files that were created.
                os.system("rm %s_sub_%s_sum.tm.tf.tm" % (TAMO_file, i))
                os.system("rm %s_sub_%s_sum.tm.tf" % (TAMO_file, i))
                os.system("rm -R %s_sub_%s.tm.tf_ST*" % (TAMO_file, i))
            # If there is only one motif in the cluster, it leaves it alone,
            # and adds it to old
            else:
                key = cl_dic[i].keys()[0]
                old.append(ml[key])
    if ancestor == 1:
        # This loop looks at each cluster and attempts to merge the motifs
        # in the cluster if there are multiple motifs.
        for i in cl_dic.keys():
            print i, cl_dic[i]
            # If there are multiple motifs in the cluster, it merges the motifs
            if len(cl_dic[i]) > 1:
                # Adds all of the motifs in the cluster to a list called
                # mlist.
                mlist = []
                for j in cl_dic[i]:
                    mlist.append(ml[j])
                # Saves these motifs to their own TAMO file.
                save_motifs(mlist, "%s_sub_%s.tm" % (TAMO_file, i))
                # Merges the motifs in the same cluster using STAMP
                tamo2tf("%s_sub_%s.tm" % (TAMO_file, i))
                # Gets the JASPAR motifs that best match the motifs from within
                # the cluster. The score-distribution path is hard-coded.
                os.system(
                    "STAMP -tf %s_sub_%s.tm.tf -sd /home/chengzou/bin/STAMP/ScoreDists/JaspRand_PCC_SWU.scores \
                    -go 1000 -ge 1000 -cc PCC -align SWU -out %s_sub_%s.tm.tf_STout -chp > %s_sub_%s.tm.tf_STout.log"
                    % (TAMO_file, i, TAMO_file, i, TAMO_file, i))
                parse_out_STAMP(TAMO_file, i)
                # Combines the STAMP outputs with the cluster motifs and then
                # converts them all to one TAMO file
                os.system(
                    "cat %s_sub_%s.tm.tf %s_sub_%s.tm.tf_SToutFBP.txt.mod %s_sub_%s.tm.tf_STout_tree_clusters.txt > %s_sub_%s_sum.tm.tf"
                    % (TAMO_file, i, TAMO_file, i, TAMO_file, i, TAMO_file, i))
                tf2tamo("%s_sub_%s_sum.tm.tf" % (TAMO_file, i))
                # Gets the top motif within the TAMO file.
                pick_chunk_score(wdir, '%s_sub_%s_sum.tm.tf.tm' % (TAMO_file, i), target, genome)
                # Removes any files created in the processing.
                os.system("rm %s_sub_%s_sum.tm.tf.tm" % (TAMO_file, i))
                os.system("rm %s_sub_%s_sum.tm.tf" % (TAMO_file, i))
                os.system("rm -R %s_sub_%s.tm.tf_ST*" % (TAMO_file, i))
            else:
                key = cl_dic[i].keys()[0]
                old.append(ml[key])
    # Combine the top motifs from every multi-member cluster, save the
    # singletons, and concatenate both into the final P1 file.
    os.system("cat %s_sub_*_sum.tm.tf.tm.TOP > %s_sub_new.tm" % (TAMO_file, TAMO_file))
    save_motifs(old, "%s_sub_old.tm" % (TAMO_file))
    os.system("cat %s_sub_old.tm %s_sub_new.tm > %s_P1.tm" % (TAMO_file, TAMO_file, TAMO_file))
genelist = argv[1].split('/')[-1] allclusters = argv[1] + '/' + genelist + '_allclusters.tamo' #print genelist oneletters = argv[1] + '/other/' + genelist + '_oneletter.tmp' symbols = argv[1] + '/other/' + genelist + '_symbols.tmp' # Open output files for writing oneletters = open(oneletters, 'w') symbols = open(symbols, 'w') # Define output variables oneletterlist = [] symbolstring = '1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ*+.,:;!' # Open list motiflist = MotifTools.load(allclusters) # Try to verify the initial list is not too long if len(motiflist) > len(symbolstring): # If the list is too long, raise an exception so that the program quits raise ValueError("The cluster list is too long for sitemap.py") # If the list is not too long, adjust the symbols string to the appropriate length else: symbolstring = symbolstring[:len(motiflist)] # Save symbol string in the symbols file and close that file symbols.write(symbolstring) symbols.close() # Add oneletter summaries to the list for num in range(len(motiflist)):
def EM_Cstart(self):
    '''Seed the models, run EM_C on each one, and collect MotifCandidates.

    Builds a Probe for every entry of self.seqs, computes per-probe
    background score arrays once per distinct model width (cached in
    c_logZ_sets), runs self.EM_C on every model, and appends one
    MotifCandidate per successful run to self.candidates.  Timing totals for
    the four phases are printed at the end when self.verbose is set.

    NOTE(review): line structure was reconstructed from a flattened copy of
    this method -- confirm the nesting against the original file.
    '''
    verbose = self.verbose
    if verbose:
        print "Seeding models..."
        sys.stdout.flush()
    self.seed_models()
    #Initialize parameters
    if not self.param.has_key('gamma'):
        self.param['gamma'] = 0.2
    # Accumulated wall-clock seconds per phase.
    timings = {'Probes':0, 'Background':0, 'C EM':0, 'Post':0}
    _time = time.time()
    for seq in self.seqs:
        P = Probe(seq)
        self.probes.append(P)
    _time2 = time.time(); timings['Probes'] = _time2-_time; _time = _time2
    if verbose: print "Optimizing candidates by EM."
    if verbose: sys.stdout.flush()
    c_logZ_sets = {}  # width -> list of per-probe background arrays
    for Model,i in zip(self.models,range(len(self.models))):
        width = Model.width
        self.calcmask(width)
        if not c_logZ_sets.has_key(width):
            # First model of this width: compute the backgrounds for all probes.
            c_logZs_set = []
            if verbose: print "#%s |%s|"%(' '*28,'-'*len(self.seqs))
            if verbose: sys.stdout.flush()
            if verbose: print "Computing background (width %2d) "%width,
            for P in self.probes:
                # One progress dot per probe.
                if verbose: sys.stdout.write('.')
                if verbose: sys.stdout.flush()
                logZs = self.all_Wmers(width,P)
                c_logZs = MDsupport.list2double(logZs)
                c_logZs_set.append(c_logZs)
                #P.c_wmerbgs = MDsupport.list2double(map(lambda x: x.logQtot, Wlist))
            c_logZ_sets[width] = c_logZs_set
            if verbose: print
        # Attach the arrays for this width to the probes before running EM.
        c_logZ_set = c_logZ_sets[width]
        for P,c_logZs in zip(self.probes,c_logZ_set):
            P.c_wmerbgs = c_logZs
        _time2 = time.time()
        timings['Background'] = timings['Background'] +_time2-_time
        _time = _time2
        '''Perform EM'''
        _time = time.time()
        newModel = self.EM_C(Model, self.probes)
        _time2 = time.time(); timings['C EM'] = timings['C EM'] + _time2-_time; _time = _time2
        #print "cLL: ",newModel.joint
        #print "pLL: ",self.compute_joint(newModel,Wmers_by_seq)
        '''Was there a problem?'''
        # EM_C signals failure with None; such a seed yields no candidate.
        if newModel == None: continue
        '''Set various things in PSSM'''
        #Distance(s)
        seeddist = MotifTools.infomaskdiff(newModel,Model)
        print '%s ----> %s'%(Model,newModel)
        print "Seed %2d: %s --> %s mask:%9.5f infoMask:%9.5f d:%9.5f"%(
            i, Model, newModel,
            MotifTools.maskdiff(newModel,Model),
            MotifTools.infomaskdiff(newModel,Model), #order is important
            Model-newModel)
        #Seed
        # Carry the seed's provenance over to the optimized model.
        if Model.seedtxt: newModel.seedtxt = Model.seedtxt
        if Model.source: newModel.source = Model.source
        #newModel.denoise()
        newModel.seeddist = seeddist
        newModel.seednum = i
        print newModel
        newModel._print_p()
        newModel._print_ll()
        '''Set various things in Candidate (like a wrapper for PSSM)'''
        C = MotifCandidate()
        C.pssm = newModel.copy()
        #C.wmers = self.best_by_Z(Wmers_by_seq)
        # 20 sequences emitted by the model stand in for its w-mers.
        C.wmers = [newModel.emit() for junk in range(20)]
        #C._update() #MAJOR REMOVAL????????? DBG 10-14-03
        #C.MAPpurge()
        # Second copy replaces the one assigned above.
        C.pssm = newModel.copy()
        self.candidates.append(C)
        _time2 = time.time(); timings['Post'] = timings['Post']+_time2-_time;_time = _time2
    '''Print Timing Information'''
    if verbose:
        print "# Timing Information"
        _t = 0
        for timing in timings.keys(): _t = _t + timings[timing]
        # NOTE(review): raises ZeroDivisionError if every phase measured 0.
        for timing in timings.keys():
            print "# %12s %f %f%%"%(timing,timings[timing],timings[timing]*100/_t)
# TAMOify kmers and logify pVals: each (kmer, pValue) entry becomes
# (Motif, log10(pValue)).
for i in range(len(testMotifs)):
    testMotifs[i] = (Motif(testMotifs[i][0]), numpy.log10(float(testMotifs[i][1])))

# Sort on log'd pVals (most significant first).
testMotifs.sort(key=lambda x: x[1])

# Build one combined motif for each of the top 20% of kmers.
comboMotifs = []
for i in range(0, int(len(testMotifs) * 0.2)):
    simMotifs = getKmersWithOneMisMtch(testMotifs[i][0], testMotifs)
    alndMotifs = alignSimilarMotifs([x[0] for x in simMotifs])
    #for m in simMotifs:
    #    print m[0].oneletter
    # -x[1] converts the negative logs to positive weights.
    comboMotifs.append(MotifTools.sum(alndMotifs, [-x[1] for x in simMotifs]))
# NOTE(review): placed after the loop; the flattened source does not show
# whether this count was printed once or on every iteration.
print(len(comboMotifs))
t2 = time.time()

oFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.gte2x.5-16mers.shfSeq.3.gGEMS.tmo'
pFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.gte2x.5-16mers.shfSeq.3.gGEMS.pkl'
MotifTools.save_motifs(comboMotifs, oFile, kmer_count=60)

# Close the pickle explicitly so the data is flushed to disk before the
# timing is reported.  The name pFile is rebound to the handle, as before.
pFile = open(pFile, 'w')
try:
    cPickle.dump(comboMotifs, pFile)
finally:
    pFile.close()
t3 = time.time()
print('Calculations took %.3f min.\nWriting/Pickling took %.3f min.' % ((float(t2) - t1) / 60, (float(t3) - t2) / 60))
def has_wmer(self, wmer):
    """Return 1 if wmer or its reverse complement is in self.wmers, else 0."""
    rc = MotifTools.revcomplement(wmer)
    present = (wmer in self.wmers) or (rc in self.wmers)
    return 1 if present else 0
def main(): try: opts, args = getopt.getopt(sys.argv[1:], "f:m:n:L:t:a:S:i:", ["help", "output="]) # AD added 'i' except getopt.GetoptError: usage() sys.exit(1) if not opts: usage() sys.exit(1) print "#" + ' '.join(sys.argv) fastafile, motiffile, motifnums, labels, thresh = (None, None, [], None, 0.75) # AD changed thresh val to 0.75 from 0.7 ambigs = [] scale = 50.0 / 1000.0 motifs = [] for opt, value in opts: #print opt, value if opt == '-f': fastafile = value elif opt == '-m': motifs.extend(MotifTools.txt2motifs(value)) elif opt == '-n': motifnums = [int(x) for x in value.split(',')] elif opt == '-L': labels = list(value) elif opt == '-t': thresh = float(value) elif opt == '-a': ambigs.extend(value.split(',')) elif opt == '-S': scale = float(value) elif opt == '-i': motiffile = value # AD added this option to ACTUALLY supply the tamo motif file at the command-line. The code to deal with motiffiles already existed. There was just no code for User to supply one. probes = Fasta.load(fastafile) if motiffile: for f in motiffile.split(','): # AD added this to allow supplying multiple tamo files at the prompt like you can supply multiple motifs motifs.extend(MotifTools.load(f)) if ambigs: for ambig in ambigs: motifs.append( MotifTools.Motif_from_text(ambig,0.1) ) if not motifnums: motifnums = range(len(motifs)) print '# %d: %s'%(len(motifs),motifnums) for i in range(len(motifnums)): motif = motifs[motifnums[i]] if labels and i < len(labels): txt = labels[i] else: txt = '%d'%i print '%-3s : %s %5.2f (%4.2f)'%(txt,motif,thresh*motif.maxscore,thresh) probehits = {} for key in probes.keys(): hits_by_motif = [] save_flag = 0 if re.search('[BDHU]',probes[key]): continue for num in motifnums: result = motifs[num].scan(probes[key],thresh*motif.maxscore) if result[0]: hits_by_motif.append(result) save_flag = 1 else: hits_by_motif.append(None) if save_flag: probehits[key]=hits_by_motif #scale = .1 maxw = 40 for key in probehits.keys(): l = len(probes[key]) a = list('-'* 
int(scale*l) ) a.extend( list(' '*10 ) ) desc = [] matches = probehits[key] for i in range(len(matches)): if matches[i]: subseqs,endpoints,scores = matches[i] for idx in range(len(subseqs)): start,stop = endpoints[idx] subseq = subseqs[idx] score = scores[idx] if labels and (i<len(labels)): ID = labels[i] else : ID = '%d'%i desc.append('%s %s %d-%d %4.2f '%(ID,subseq,start,stop,score)) start = int(start*scale) for offset in range(10): if a[start+offset] == '-': if labels and (i < len(labels)): a[start+offset] = labels[i] else: a[start+offset] = '%d'%i break print '%-14s %s'%(key,''.join(a)), print ' '*max(0,maxw-len(a)), '| '.join(['%-27s'%x for x in desc]) print print "Found matches in %d of %d input probes"%(len(probehits),len(probes))
def main(): if len(sys.argv) < 2: print "Usage: %s <fasta_file> [width = None ] [options]"%(re.sub('^.*/','',sys.argv[0])) print "Options include:" print "" print " EM Parameters:" print " -beta [0.01] Beta for pseudocounts" print " -seedbeta[0.02] Beta for pseudocounts for seeds from text" print " -gamma [0.2] Gamma (fraction of sequences)" print " -delta [0.001] Convergence criteria" print " " print " Seeds (not actually proper priors)" print " -prior Seqences or motifs for seeds (may be repeated)" print " -top N [0] Include w-mers in top N probes" print " -gap string sample gapped motifs" # print " -TF Seed with (all) TRANSFAC PSSMs (buggy)" print " -kmerseeds Use kmers with best enrichment score as seeds for EM" print " -pad add NN..NN to seed" print " " print " Genome / Background model " print " -human (250,1000) Use Human Background model" print " -g genome.fsa Use specicied Fasta file as background (searches first for matching frequency file)" # print " -Y2K, -Y5C Use Yeast Upstream Intergenic regions (2000, 500)" # print " -B Use Bacterial Orfs" print " " print "Examples:" print " %s t.fsa 5 -prior GGGTA -prior AAAAAC "%(sys.argv[0].split('/')[-1]) print " will start an EM with 3 seeds: GGGTA, AAAAA, and AAAAC" print print " %s t.fsa 5 -info CUP9.info -gamma 0.5 "%(sys.argv[0].split('/')[-1]) print " will start an EM with Enriched seeds in CUP9.info, with" print " Gamma expectation of 50% of all probes" print print " %s t.fsa -prior MCM1_5.tamo:0 "%(sys.argv[0].split('/')[-1]) print " will start an EM with 0th motif of the file MCM1_5.tamo" print " as a seed" print sys.exit(1) fastafile = sys.argv[1] #Echo the command line print "#" + ' '.join(map(lambda x: re.sub(' ','\ ',x), sys.argv)) if sys.argv[2].isdigit(): width = sys.argv[2] else: width = None algorithm = '' beta = '' seedbeta = '' deltamin = '' gamma = 0.2 infofile = '' seedmodels= [] species = 'YEAST' valid_tfs = [] #NOT USED gapped_syl= None gapflank = 0 gapweight = 0.2 enrichfact= 0.7 pmax = 0 
#False TFSEEDS = 0 TFMids = [] pad = None bgfile = None seed_count = 0 #Default: Take the top 0 seed_s = [] #Initialize seq array '''Parse command-line arguments''' for tok,i in zip(sys.argv,xrange(len(sys.argv))): if tok == '-top' : seed_count = int(sys.argv[i+1]) elif tok == '-greedy': algorithm = "GREEDY" elif tok == '-prior' : seed_s.append(sys.argv[i+1]) elif tok == '-beta' : beta = float(sys.argv[i+1]) elif tok == '-seedbeta': seedbeta = float(sys.argv[i+1]) elif tok == '-gamma' : gamma = float(sys.argv[i+1]) elif tok == '-delta' : deltamin = float(sys.argv[i+1]) elif tok == '-kmerseeds' : infofile = 1 elif tok == '-valid' : valid_tfs.append(sys.argv[i+1]) #NOT USED elif tok == '-w' : width = sys.argv[i+1] elif tok == '-width' : width = sys.argv[i+1] elif tok == '-gap' : gapped_syl = sys.argv[i+1] elif tok == '-gapflank' :gapflank = int(sys.argv[i+1]) elif tok == '-gapweight':gapweight = float(sys.argv[i+1]) elif tok == '-enrichfact':enrichfact= float(sys.argv[i+1]) elif tok == '-pmax' : pmax = 1 elif tok == '-Y2K' : species = "YEAST_2000_UP" elif tok == '-Y5C' : species = "YEAST_500_UP" elif tok == '-B' : species = "BAC_ORF" elif tok == '-Ch22' : species = "Ch22" elif tok == '-genome': species = sys.argv[i+1] elif tok == '-pad' : pad = "TRUE" elif tok == '-bgfile': bgfile = sys.argv[i+1] elif tok == '-TF' : #NOT USED (TRANSFAC NOT SUPPLIED WITH DISTRIBUTION) TFSEEDS = 1 for j in range(i+1,len(sys.argv)): if re.match('M0',sys.argv[j]): TFMids.append(sys.argv[j]) else: break elif tok == '-human' : _s = '' if sys.argv[i+1].isdigit(): _s = '_'+sys.argv[i+1] else: _s = '' species = 'HUMAN'+_s if infofile: infofile = fastafile if bgfile: EM.loadMarkovBackground(bgfile) elif not ('-random_background' in sys.argv or '-nomarkov' in sys.argv): EM.loadMarkovBackground(species) else: EM.theMarkovBackground = EM.Zeroth() fsaD = Fasta.load(fastafile) Fasta.delN(fsaD) seqs = fsaD.values() probes = fsaD.keys() all_seqs = seqs seed_s.extend(seqs[0:min(seed_count,len(seqs))]) 
if infofile and width=='info': width = info2width(infofile) elif width != None: width = int(width) #Alternate source of seeds if infofile: if 1 or width: seedmodels.extend(info2seeds(width,infofile,fastafile,species)) else: print 'Error: need to specify motif width w/ .info file' #Any -prior pointers to motifs in other files? (seed_s, motifs) = parse_priors(seed_s) seedmodels.extend(motifs) #Should we get seeds from TRANSFAC? if TFSEEDS: #NOT USED tf = [] D = tfmats() if not TFMids: keys = D.keys() else: keys = [] for TFMid in TFMids: for key in D.keys(): if key[0:6] == TFMid: keys.append(key) break for key in keys: m = D[key] m.seednum = int(re.sub('M0*','',key.split()[0])) m.seedtxt = '%-24s %s'%(m,key) tf.append(m) tf.sort(lambda x,y: cmp(x.seednum,y.seednum)) seedmodels.extend(tf) #seedmodels.append(tf[33]) if gapped_syl: gapped_priors = gapped_motifs(gapped_syl) gapped_priors = map(lambda x:'N'+x+'N', gapped_priors) seed_s.extend(gapped_priors) if pad: print '# Padding models with NN-m-NN' newmodels = [] left = MotifTools.Motif_from_text('@') right = MotifTools.Motif_from_text('N') for m in seedmodels: newmodels.append(left + m + right) print left + m + right seedmodels = newmodels ''' Set everything up and GO!! ''' global theEM theEM = EM.EM(seed_s,[],width,"VERBOSE") if beta: theEM.beta = beta if deltamin: theEM.deltamin = deltamin if seedbeta: theEM.seedbeta = seedbeta theEM.param['gamma'] = gamma theEM.seqs.extend(all_seqs) theEM.models = seedmodels theEM.gapflank = gapflank theEM.gapweight = gapweight theEM.report() theEM.EM_Cstart() #GO!! 
#print "#Sorting candidates" #sys.stdout.flush() #EM.candidates.sort(lambda x,y: cmp(y.MAP,x.MAP)) ''' Compute some metrics ''' print "#Loading Genome %s"%species ; sys.stdout.flush() Genome = ProbeSet(species,enrichfact) ids = Genome.ids_from_file(fastafile) for C in theEM.candidates: if not pmax: C.pssm.pvalue = Genome.p_value(C.pssm,ids,'verbose') C.pssm.church = Genome.church(C.pssm,ids) C.pssm.frac = Genome.frac(C.pssm,probes,None,0.7) else: (p,frac) = Genome.best_p_value(C.pssm,ids) C.pssm.pvalue = p C.pssm.threshold = frac * C.pssm.maxscore print "Bests:",p,frac matching = Genome.matching_ids(C.pssm,[],factor=0.7) matchbound = [x for x in matching if x in probes] C.pssm.numbound = len(probes) C.pssm.nummotif = len(matching) C.pssm.numboundmotif = len(matchbound) sys.stdout.flush() ''' Print out all motifs (sorted by Enrichment) in an AlignACE-like form ''' theEM.candidates.sort(lambda x,y: cmp(x.pssm.pvalue,y.pssm.pvalue)) for C,i in zip(theEM.candidates,range(len(theEM.candidates))): C.pssm.maxscore = -100 #May have side effects. Recompute when done if C.pssm.valid: #NOT USED _t = C.pssm.valid if not _t[0]: vstring = "(--- %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3]) else: vstring = "(HIT %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3]) else: vstring = '' C.pssm._maxscore() #Recomputed MotifTools.print_motif(C.pssm,20,i) sys.stdout.flush() continue #Antiquated stuff -- Remove !! 
print "Log-odds matrix for Motif %3d %s"%(i,C) C.pssm._print_ll() print "Sequence Logo" C.pssm._print_bits() flush() #print '# %3d matching sequences at 90%%'%len(C.pssm.bestseqs(C.pssm.maxscore * 0.9)) flush() m = C.pssm if not m.__dict__.has_key('gamma'): m.gamma = None #Kludge to deal w/ old shelves if m.seedtxt: print "Seed: %3d %s"%(i,m.seedtxt) if m.source: print "Source: ",m.source if m.gamma: print "Gamma: %7.5f"%m.gamma if m.threshold: print "Threshold: %5.2f"%m.threshold #if C.pssm.seedtxt: # print 'Seed %3d %-25s'%(i,C.pssm.seedtxt) if C.pssm.church != None: vstring = 'ch: %5.2f %s'%( math.fabs(math.log(C.pssm.church)/math.log(10)), vstring) print "Motif %3d %-25s nlog(p): %6.3f %s"%(i,C,-math.log(C.pssm.pvalue)/math.log(10),vstring) if C.pssm.threshold: print "Threshold: %6.3f %4.1f%%"%( C.pssm.threshold, 100.0*C.pssm.threshold/C.pssm.maxscore) C.pssm.maxscore = -1e100 #May have side effects. Recompute when done for seq in C.wmers: print seq,i,C.pssm.scan(seq)[2][0] C.pssm._maxscore() #Recomputed print '*'*len(seq) print "MAP Score: %f"%C.MAP sys.stdout.flush() sys.stdout.flush() sys.exit(0) #Avoid ridiculous python cleanup times
def merge_runs_cc(TAMO_file, wdir, height, distance, ancestor, target, genome):
    '''Write one merge command per multi-motif cluster to the file "merge_runs_cc".

    This is used to merge motifs with the PCC matrix of all motifs.  The
    script was originally written by Cheng Zou, and then converted to a
    function by Alex Seddon.

    Parameters, as used below:
      TAMO_file -- TAMO motif file; prefix of the cluster and output files.
      wdir      -- working directory; the process changes into it.
      height    -- tree-cut height, used in the cluster file name
                   "<TAMO_file>.dm_UPGMA_Cl_<height>".
      distance  -- accepted for signature compatibility; not used here.
      ancestor  -- 0: emit "merge_runs_no_ancestor" commands;
                   1: emit "merge_runs_ancestor" commands (loads STAMPmotif);
                   any other value: no cluster is processed.
      target, genome -- copied into each emitted command line.

    Side effects: writes "<TAMO_file>_sub_<cluster>.tm" for every cluster
    with more than one motif, the command file "merge_runs_cc", and
    "<TAMO_file>_sub_old.tm" holding the single-member clusters.

    Changes relative to the previous version: the command file and the
    cluster file are closed explicitly, so "merge_runs_cc" is complete on
    disk when this function returns.  The script directory is resolved
    before changing directory, because a relative __file__ would otherwise
    be resolved against wdir.
    '''
    print("Here are the parameters you specified in this run ")
    print("-tamo %s" % TAMO_file)
    print("-wdir %s" % wdir)
    print("-h height to cut the tree, %s" % height)
    print("-ancestor %s" % ancestor)
    print("-target %s" % target)
    print("-genome %s" % genome)
    # NOTE(review): execution continues after help(); nothing here exits.
    if TAMO_file == '' or wdir == '':
        help()

    # Get the directory where the script is located.  This must happen before
    # os.chdir: os.path.abspath resolves relative paths against the current
    # working directory.
    script_dir = '/'.join(os.path.abspath(__file__).split('/')[:-1])

    # (The former os.system("cd ...") only affected a child shell and was
    # dropped; os.chdir is what moves this process.)
    os.chdir(wdir)

    # The file TAMO_file.dm_UPGMA_Cl_<height> lists one cluster index per
    # motif, in the order the motifs appear in TAMO_file.  Motifs sharing an
    # index belong to the same cluster:
    #     cl_dic = {cluster_index: {motif_index: '1'}}
    cl_dic = {}
    cl_file = open("%s.dm_UPGMA_Cl_%s" % (TAMO_file, height), "r")
    try:
        for n, line in enumerate(cl_file):
            cl_dic.setdefault(line.strip(), {})[n] = "1"
    finally:
        cl_file.close()

    ml = MotifTools.txt2motifs(TAMO_file)
    old = []  # Motifs that are the sole members of a cluster.

    print("%s %s" % (ancestor, ancestor == 0))

    # The two modes differ only in the command written for each cluster.
    if ancestor == 0:
        command = 'module load TAMO; python %s/pcc_merge_CC.py merge_runs_no_ancestor -t %s/%s -i %s -target %s -genome %s\n'
    elif ancestor == 1:
        command = 'module load TAMO; module load STAMPmotif; python %s/pcc_merge_CC.py merge_runs_ancestor -t %s/%s -i %s -target %s -genome %s\n'
    else:
        command = None

    cc_output = open('merge_runs_cc', 'w')
    try:
        if command is not None:
            for i in cl_dic.keys():
                print("%s %s" % (i, cl_dic[i]))
                if len(cl_dic[i]) > 1:
                    # Several motifs: save them to their own TAMO file and
                    # queue a merge command for that file.
                    mlist = [ml[j] for j in cl_dic[i]]
                    save_motifs(mlist, "%s_sub_%s.tm" % (TAMO_file, i))
                    cc_output.write(command % (script_dir, wdir, TAMO_file, i, target, genome))
                else:
                    # A single motif is left alone and collected in old.
                    key = cl_dic[i].keys()[0]
                    old.append(ml[key])
    finally:
        cc_output.close()

    # Save the motifs that are alone in their cluster.
    #os.system("cat %s_sub_*_sum.tm.tf.tm.TOP > %s_sub_new.tm" % (TAMO_file,TAMO_file))
    save_motifs(old, "%s_sub_old.tm" % (TAMO_file))
def combine_distance_matrix(wdir, TAMO_file):
    '''Assemble the per-chunk PCC score matrices into "<TAMO_file>.dm".

    Originally written by Cheng Zou, and converted to a function by Alex
    Seddon.

    The motifs are assumed to have been split into chunks of 100.  Chunk i
    has a self-comparison file "<TAMO_file>_n<i>.dm"; pairwise blocks are
    named "<TAMO_file>_n<i>-<TAMO_file>_n<j>.dm".  Blocks with j < i are
    (re)written here as "-" placeholders so the pasted matrix is square.
    '''
    ml = MotifTools.txt2motifs(TAMO_file)
    n_split = len(ml) / 100
    chunks = range(n_split + 1)

    # Change to the working directory (the shell "cd" runs in a child
    # process; os.chdir is what moves this process).
    os.system("cd %s" % wdir)
    os.chdir(wdir)

    # Number of lines in each self-comparison matrix, keyed by chunk index.
    lendic = {}
    for idx in chunks:
        lendic[idx] = line_count("%s_n%s.dm" % (TAMO_file, idx))
    print(lendic)

    # Placeholder blocks below the diagonal: lendic[i] rows, each holding
    # lendic[j] tab-separated dashes.
    for i in chunks:
        for j in range(i):
            oup = open("%s_n%s-%s_n%s.dm" % (TAMO_file, i, TAMO_file, j), "w")
            print("%s %s" % (lendic[j], lendic[i]))
            blank_row = "%s\n" % "\t".join(["-"] * lendic[j])
            oup.write(blank_row * lendic[i])
            oup.close()

    # Copy each self-comparison file to the diagonal block name so it is
    # picked up by the paste step.
    for i in chunks:
        os.system("cp %s_n%s.dm %s_n%s-%s_n%s.dm" % (TAMO_file, i, TAMO_file, i, TAMO_file, i))

    # Paste the blocks of each row side by side into "distance_<i>".
    for i in chunks:
        blocks = ["%s_n%s-%s_n%s.dm " % (TAMO_file, i, TAMO_file, j) for j in chunks]
        com = "paste " + "".join(blocks) + "> distance_%s" % i
        print(com)
        os.system(com)

    # Concatenate all the row files into the final matrix.
    com = "cat " + "".join(["distance_%s " % i for i in chunks]) + "> %s.dm" % TAMO_file
    print(com)
    os.system(com)

    # Ad hoc clean-up of double tabs in the result.
    remove_double_tabs("%s.dm" % TAMO_file)
#!/usr/bin/python
'''
Print the probability matrix of every motif in a TAMO cluster list.

Output goes **TO STANDARD OUT**; each matrix is preceded by a line naming
the cluster (">Cluster_<n>", numbered from 1).

*Intended for use in a bash pipeline where standard out is redirected into a
file.*

Has 1 argument:
- motiflist: the TAMO motif list to print

Returns:
- A series of strings that represent the probability matrices of all motifs
  in the input list

Author: Hector Galvez
'''
from sys import argv
from TAMO import MotifTools

# Open list
motiflist = MotifTools.load(argv[1])

# Print a header line and then the matrix for each motif
for num, cluster_motif in enumerate(motiflist):
    print('>Cluster_' + str(num + 1))
    cluster_motif._print_p()
# Create a general list with all the motifs from all algorithms genlist = [] genlist.extend(seederlist) genlist.extend(memelist) genlist.extend(weederlist) # Perform clustering on the general list of motifs clusterinf = clusterinfo(genlist) averages = clusteravg(genlist,clusterinf) # Trim the final average list # averages = trim(averages,0.5) # print clusterinf # Save new list of cluster averages MotifTools.save_motifs(averages,tamooutput) # WEBLOGO IMAGE GENERATION # Generate giflogos of all average motifs for index in range(len(averages)): cluster = 'Cluster ' + str(index + 1) clustergif = argv[1] + '/other/cluster' + str(index + 1) averages[index].giflogo(clustergif,title=cluster,scale=2) # SUMMARY REPORT # Determine location of the markdown file for the summary report reportout = open(str(argv[1] + '/final/' + listname + '_cluster_report.md'), 'w') # Write the header of the report rundate = date.today() header = "# Summary report for `" + listname + "`\nThis analysis was run on: " + str(rundate) + \
def GetKmerFromMotifFromPWM(pwm, seq):
    """Build a Motif from the counts in pwm and return motif.bestscanseq(seq)."""
    counts = MotifTools.toDict(pwm)
    return MotifTools.Motif_from_counts(counts).bestscanseq(seq)
#
# Compare motifs in tamo format
#
# Usage: <script> <unknown_motifs.tamo> <tf_database.tamo>
# Prints every (unknown motif, TFBS motif) pair.
#
from TAMO import MotifTools
from TAMO.MotifMetrics import ProbeSet
from TAMO.Clustering import MotifCompare
from TAMO.Clustering import Kmedoids
import sys
import pickle
import pprint

file_unknown = sys.argv[1]  # Unknown
file_tfbs = sys.argv[2]     # TF db

motifs_unknown = MotifTools.load(file_unknown)
motifs_tfbs = MotifTools.load(file_tfbs)

match_dict = {}
for unknown in motifs_unknown:
    tf_list = []
    for tfbs in motifs_tfbs:
        #print
        #print "Comparing motifs:"
        #print " %s vs %s" % (unknown.source, tfbs.source)
        #print " Unknown motif ( %s ) vs TFBS ( %s ) " % (unknown, tfbs)
        #print
        joined_motifs = [unknown, tfbs]
        print(joined_motifs)
def probOvlp(A,B,thresh=0.7,verbose=None):
    """Return a hypergeometric p-value for the overlap of two motifs' best k-mers.

    The wider of A and B is "Wide", the other "Narrow".  For each motif the
    set of sequences returned by bestseqs() at thresh*maxscore, plus their
    reverse complements, is collected.  The number of Wide sequences that
    contain some Narrow sequence (on either strand) is then passed to
    Arith.hypgeomsummore together with the set sizes.

    Side effect: the computed sequence lists are cached on the input motifs
    as attributes ``bestWide`` / ``bestNarrow`` and reused on later calls.
    NOTE(review): the cache is not keyed on ``thresh``, so a later call with
    a different threshold reuses the lists from the first call.

    NOTE(review): line structure was reconstructed from a flattened copy of
    this function -- in particular confirm that ``Wide = newWide`` runs on
    both branches of the cache check.
    """
    if A.width >= B.width:
        Wide, Narrow = A, B
    else:
        Wide, Narrow = B, A
    RC = MotifTools.revcomplement
    if 1:
        # Slice from -1 to width+1; presumably pads the wide motif by one
        # position on each side -- TODO confirm against Motif.__getitem__.
        newWide = Wide[-1,Wide.width+1]
        if Wide.__dict__.has_key('bestWide'):
            bestWide = Wide.bestWide
        else:
            # Dict used as a set: best sequences plus their reverse complements.
            bestWideD = {}
            for x in newWide.bestseqs(thresh*newWide.maxscore):
                bestWideD[x] = 1
            for x in bestWideD.keys():
                bestWideD[RC(x)] = 1
            Wide.bestWide = bestWideD.keys()
            bestWide = Wide.bestWide
        # From here on Wide.width is the width of the sliced motif.
        Wide = newWide
    if Narrow.__dict__.has_key('bestNarrow'):
        bestNarrow = Narrow.bestNarrow
    else:
        bestNarrowD = {}
        for x in Narrow.bestseqs(thresh*Narrow.maxscore):
            bestNarrowD[x] = 1
        for x in bestNarrowD.keys():
            bestNarrowD[RC(x)] = 1
        bestNarrow = bestNarrowD.keys()
        Narrow.bestNarrow = bestNarrow
    #bestWide = [x[1] for x in Wide.bestseqs (thresh*Wide.maxscore) ]
    #bestNarrow = [x[1] for x in Narrow.bestseqs(thresh*Narrow.maxscore)]
    countNarrow = len(bestNarrow)
    countWide = len(bestWide)
    numtotal = math.pow(4,Wide.width)                     # 4**width sequences of the wide width
    fudgefactor = math.pow(4,Wide.width - Narrow.width)   # scales the narrow count by the width difference
    bestWideTups = [(x,MotifTools.revcomplement(x)) for x in bestWide]
    countBoth = 0
    for i in range(len(bestNarrow)):
        m_narrow = bestNarrow[i]
        delj = []
        for j in range(len(bestWideTups)):
            if (bestWideTups[j][0].find(m_narrow) >= 0) or (bestWideTups[j][1].find(m_narrow) >= 0):
                countBoth += 1
                delj.append(j)
        # Matched wide sequences are removed so each is counted at most once;
        # deleting from the back keeps the remaining indices valid.
        delj.reverse() #Chew in from the back
        for j in delj:
            del(bestWideTups[j])
    if verbose:
        print '%10d %10d %10d %10d | %10d %5d '%(
            countWide, numtotal, countNarrow *fudgefactor , countBoth ,
            countNarrow, Wide.width - Narrow.width),
    p = Arith.hypgeomsummore(countWide, #Num Interesting
                             numtotal, #All k-mers
                             countNarrow * fudgefactor, #Number picked
                             countBoth ) #Number found
    return p
# Report the motifs that are enriched in a gene set.
#
# Usage: <script> <probeset> <gene_id_file> <motifs.tamo>
#   probeset     -- argument passed to ProbeSet()
#   gene_id_file -- one gene id per line
#   motifs.tamo  -- TAMO motif file to test
#
# Prints a tab-separated table of the motifs whose Church score and p-value
# both pass the cut-offs below.
import os
import sys
import string
from TAMO import MotifTools
from TAMO.seq import Fasta
from TAMO.MotifMetrics import ProbeSet

promoters = ProbeSet(sys.argv[1])

# Read the gene ids, skipping blank lines.  splitlines() keeps the last id
# even when the file has no trailing newline and copes with CRLF endings.
geneset_file = open(sys.argv[2])
try:
    geneset_ids = [line for line in geneset_file.read().splitlines() if line]
finally:
    geneset_file.close()

# Keep only the ids that have a promoter; a set gives constant-time lookups.
prom_ids = set(promoters.probes.keys())
match_ids = [gene_id for gene_id in geneset_ids if gene_id in prom_ids]

motifs = MotifTools.load(sys.argv[3])

# Significance cut-offs.
church = 0.05
rocauc = 0.1    # unused while the ROC-AUC metric below is disabled
pvalue = 0.05

# The header lists exactly the four columns printed per row (the ROC-AUC
# column is not computed, so it is not advertised).
print("Name\tMotif\tChurch\tP-value")
for m in motifs:
    m.church = promoters.church(m, match_ids)
    # m.ROC_auc = promoters.ROC_AUC (m, match_ids)
    m.pvalue = promoters.p_value(m, match_ids)
    if m.church <= church and m.pvalue <= pvalue:
        print("%s\t%s\t%s\t%s" % (m.source, m, m.church, m.pvalue))
from gusPyCode.MDAP_proj.MDAP_defs import alignAndCombineMotifs
from TAMO import MotifTools

Motif = MotifTools.Motif

# Output file: the loaded motifs followed by the pairwise combinations.
outFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.6-8mers.gGEMS.top6PlusCombos.motifs.stdThresh.tmo'

# Input motifs and one weight per motif (same order).
m = MotifTools.load('/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.6-8mers.gGEMS.top6.motifs.stdThresh.tmo')
w = [5.8952, 5.6523, 5.0585, 4.9788, 4.9678, 4.7688]

# Index pairs of the motifs to align and merge, in output order.
_pairings = [(0, 1), (0, 4), (1, 4), (2, 3), (2, 5)]

toTmo = []
for _first, _second in _pairings:
    combined = alignAndCombineMotifs([m[_first], m[_second]],
                                     [w[_first], w[_second]])
    toTmo.append(combined)

# Show the one-letter summary of every combined motif.
for e in toTmo:
    print(e.oneletter)

MotifTools.save_motifs(m + toTmo, outFile)
# Convert the motifs of a MEME text output file (path in sys.argv[1]) into a
# TAMO motif file named "MEME_motifs_<prefix>.tamo", where <prefix> is the
# input file name up to its first '.'.
filename = sys.argv[1]

# Everything before the first "\nMOTIF" marker is preamble and is discarded.
with open(filename) as meme_file:
    motif_list = meme_file.read().split('\nMOTIF')[1:]

tamo_list = []
motif_counter = 1

# The matrix header looks like "alength= 4 w= 8 nsites= 20 E= ...".  Counts
# must be scaled by the number of sites; the previous pattern captured the
# motif width ("w= N") instead.
nsites_pat = re.compile(r"nsites= ([0-9]+)")

for motif in motif_list:
    m_info1, m_info2 = motif.split('letter-probability matrix: ')
    # The matrix ends at the next dashed separator line.
    m_mat = m_info2.split('--------------------------------------------------------------------------------', 1)[0]
    m_mat_header, m_prob_mat = m_mat.split('\n', 1)

    nsites_match = nsites_pat.search(m_mat_header)
    if nsites_match is None:
        raise ValueError("no 'nsites=' field in matrix header: %r" % m_mat_header)
    nsites = int(nsites_match.group(1))

    count_mat = []
    for count in m_prob_mat.split('\n'):
        if not count.strip():
            continue  # skip blank lines around the matrix
        sites = [float(i) for i in count.split()]
        # Columns are read in A, C, G, T order.  Round to the nearest count:
        # plain int() truncates products such as 0.29 * 100 down to 28.
        count_dict = {'A': int(round(sites[0] * nsites)),
                      'C': int(round(sites[1] * nsites)),
                      'G': int(round(sites[2] * nsites)),
                      'T': int(round(sites[3] * nsites))}
        count_mat.append(count_dict)

    m = MotifTools.Motif_from_counts(count_mat)
    m.source = "Motif%s | %s" % (motif_counter, m_mat_header)
    tamo_list.append(m)
    motif_counter += 1

MotifTools.save_motifs(tamo_list, "MEME_motifs_%s.tamo" % filename.split('.')[0])
def motif_matrix(fsa, motif, outfile, genome='mm9'):
    """Score every sequence of a FASTA file against every motif and save the matrix.

    For each motif, the log-probability matrix is adjusted by the zeroth-order
    background (``logP - log2(bg)``), a new motif is built from it, and the
    best score on each upper-cased sequence is rescaled to
    ``(best - minscore) / (maxscore - minscore)``.

    Parameters:
        fsa     -- FASTA file path, read with ``Fasta.load``.
        motif   -- motif file path, read with ``MotifTools.load``.
        outfile -- destination for ``np.savetxt``; one row per motif, one
                   column per sequence (in ``Fasta.load(...).values()`` order),
                   formatted as '%1.3f'.
        genome  -- 'hg18' selects the human background file; any other value
                   selects the mouse background file.

    A motif whose ``maxscore`` equals its ``minscore`` cannot be rescaled; its
    row is left at 0.0 rather than raising ZeroDivisionError.
    """
    if genome == 'hg18':
        markov = "/nfs/genomes/human_gp_mar_06/hg18_promoters_3000_1000.markov"
    else:
        markov = "/nfs/data/cwng/chipseq/hypotheses/Mouse.markov"

    # Load the motifs and the background once; the background does not depend
    # on the motif, so there is no need to reload it inside the loop.
    m = MotifTools.load(motif)
    EM.loadMarkovBackground(markov)
    bg = EM.theMarkovBackground.zeroth()
    log_of_2 = math.log(2.0)

    F = Fasta.load(fsa, key_func=lambda x: x)
    seqs = [seq.upper() for seq in F.values()]

    SCORES = np.zeros((len(m), len(seqs)), dtype='float')

    for i, M in enumerate(m):
        # Build the background-adjusted matrix as a new list so the loaded
        # motif's own logP is left untouched.
        ll = [
            dict((letter, value - math.log(bg[letter]) / log_of_2)
                 for letter, value in pos.items())
            for pos in M.logP
        ]
        AM = MotifTools.Motif_from_ll(ll)

        mi, ma = AM.minscore, AM.maxscore
        score_range = ma - mi
        if not score_range:
            continue  # degenerate motif: leave this row at 0.0

        for j, seq_fwd in enumerate(seqs):
            SCORES[i, j] = (AM.bestscore(seq_fwd) - mi) / score_range

    np.savetxt(outfile, SCORES, fmt='%1.3f')