def mpat(tr, seqs):
    """Align sequences progressively along a tree and return the root profile.

    Nodes are visited in the order produced by ``getPostOrder(tr)``. Every
    visited node gets ``data.seq`` set to a 2-tuple:

    * leaf (no successors): ``(None, sequence)``, the sequence being looked
      up in ``seqs`` by the node's ``data.taxon``;
    * internal node: ``(profile, None)``, built from its two children.

    Two raw sequences are combined with ``calign.globalAlign`` followed by
    ``calign.createProfile``; any combination involving a profile goes
    through ``calign.prof2profAlign`` and the result is passed to ``trimp``.

    tr   -- tree object providing ``node(id)`` and ``root``.
    seqs -- anything ``dict()`` accepts, mapping taxon -> sequence.

    Returns the first element of the last visited node's ``seq`` (asserted
    to be the root).  Internal nodes must have exactly two successors.
    """
    taxonToSeq = dict(seqs)

    def toProfile(entry):
        # entry is (profile, None) or (None, sequence); a raw sequence is
        # wrapped as a 1-tuple (entry[1:]) before being turned into a profile.
        return calign.createProfile(entry[1:]) if entry[1] else entry[0]

    for node in getPostOrder(tr):
        payload = node.data

        if not node.succ:
            payload.seq = (None, taxonToSeq[node.data.taxon])
            continue

        left, right = [tr.node(child).data.seq for child in node.succ]

        if left[1] and right[1]:
            # Both children are still raw sequences: plain pairwise alignment.
            pairwise = calign.globalAlign(left[1], right[1])
            payload.seq = (calign.createProfile(pairwise), None)
            continue

        # At least one child is already a profile.
        leftProfile = toProfile(left)
        rightProfile = toProfile(right)
        merged = calign.prof2profAlign(leftProfile, rightProfile)
        payload.seq = (trimp(merged), None)

    # The last node of a post-order traversal is expected to be the root.
    assert node.id == tr.root
    return node.data.seq[0]
def guesstimateTH(seqs, nMax, thc, matches=None, scores=None, ns=100):
    """Estimate a distance threshold from randomly sampled pairwise distances.

    seqs    -- sequence collection; pairs are drawn with ``random.sample``.
    nMax    -- size used both for the ``len(seqs)/nMax`` ratio and as the
               first term of the geometric size series below.
    scores  -- forwarded as ``scores`` to ``calign.globalAlign``.
    thc, matches, ns -- accepted for interface compatibility; not used by
               this implementation.

    Returns ``None`` when the data set is too small relative to ``nMax``
    (ratio ``len(seqs)/nMax`` of at most 1.5), otherwise one third of a high
    order statistic (index 36 of 40 sorted estimates) of the sampled
    quantile.

    NOTE(review): the arithmetic presumably relies on true division
    (``from __future__ import division``) -- confirm at the top of the file.
    """
    nDistances = 4000     # random pairs aligned up front
    nRounds = 40          # number of sub-sampled quantile estimates
    roundSize = 2000      # distances drawn per estimate
    nTerms = 50           # terms of the geometric size series

    ratio = len(seqs) / nMax
    # A ratio of exactly 0.5 used to divide by zero, and anything smaller
    # produced q > 1, which blows up the series below and pushes the quantile
    # index past the end of the sample.  Both mean "too few sequences", which
    # is what the q <= 0 check already reports as None.
    if ratio <= .5:
        return None

    q = 1 - 1. / (ratio - .5)
    if q <= 0:
        return None

    distances = []
    for _ in range(nDistances):
        x1, x2 = random.sample(seqs, 2)
        distances.append(calign.globalAlign(x1, x2, report=calign.JCcorrection,
                                            scores=scores))

    # Geometrically shrinking sizes nMax, nMax*q, nMax*q^2, ...; th is the
    # summed nPairs() of those sizes relative to nPairs() of the whole set.
    sizes = [int(nMax * q**n) for n in range(nTerms)]
    th = sum([nPairs(x) for x in sizes]) / nPairs(len(seqs))

    estimates = []
    for _ in range(nRounds):
        subset = random.sample(distances, roundSize)
        estimates.append(sorted(subset)[int(th * len(subset))])

    sol = sorted(estimates)[(nRounds * 9) // 10]
    return sol / 3
def mpa(tr, seqs, scores = defaultMatchScores, trimEnd = None):
    """Align sequences progressively along a tree and return the root profile.

    Nodes are visited in the order produced by ``getPostOrder(tr)``. Every
    visited node gets ``data.seq`` set to a 2-tuple:

    * leaf (no successors): ``(None, sequence)``; the sequence is looked up
      in ``seqs`` under the node's taxon name with surrounding single quotes
      stripped;
    * internal node: ``(profile, None)``, built from its two children.

    tr      -- tree object providing ``node(id)`` and ``root``.
    seqs    -- anything ``dict()`` accepts, mapping taxon -> sequence.
    scores  -- forwarded as ``scores`` to ``calign.globalAlign`` and
               ``calign.prof2profAlign``.
    trimEnd -- when not None, every profile-to-profile result is passed
               through ``trimendsp(profile, trimEnd)``.

    Returns the first element of the last visited node's ``seq`` (asserted
    to be the root).  Internal nodes must have exactly two successors.
    """
    taxonToSeq = dict(seqs)

    def toProfile(entry):
        # entry is (profile, None) or (None, sequence); a raw sequence is
        # wrapped as a 1-tuple (entry[1:]) before being turned into a profile.
        return calign.createProfile(entry[1:]) if entry[1] else entry[0]

    for node in getPostOrder(tr):
        payload = node.data

        if not node.succ:
            payload.seq = (None, taxonToSeq[node.data.taxon.strip("'")])
            continue

        left, right = [tr.node(child).data.seq for child in node.succ]

        if left[1] and right[1]:
            # Both children are still raw sequences: plain pairwise alignment
            # (never trimmed, exactly as for any other sequence pair).
            pairwise = calign.globalAlign(left[1], right[1], scores = scores)
            payload.seq = (calign.createProfile(pairwise), None)
            continue

        # At least one child is already a profile.
        leftProfile = toProfile(left)
        rightProfile = toProfile(right)
        merged = calign.prof2profAlign(leftProfile, rightProfile, scores = scores)
        if trimEnd is not None:
            merged = trimendsp(merged, trimEnd)
        payload.seq = (merged, None)

    # The last node of a post-order traversal is expected to be the root.
    assert node.id == tr.root
    return node.data.seq[0]
def seqMultiAlign(seqs, scores = defaultMatchScores, report=False) : if len(seqs) < 2: return seqs a = calign.globalAlign(seqs[0], seqs[1], scores=scores) ns = 2 p = calign.createProfile(a) a = tuple(iton(x) for x in a) for kk,s2 in enumerate(seqs[2:]) : #print ns # assert p == calign.createProfile(a) assert len(a[0]) == len(p) and \ p[0][calign.GAP] != ns and p[-1][calign.GAP] != ns pad = 20 if len(p)+2*pad < len(s2) : # enough for sequences start to align pad = (len(s2) - len(p)) ; assert len(p)+2*pad >= len(s2) pa,extendLeft,extendRight = calign.profileAlign(s2, p, pad=pad, chop=True, gapPenalty=defaultMatchScores.gap) if extendLeft > 0 or extendRight > 0 : gapProfile = [0,0,0,0,0,ns] p1 = tuple((list(gapProfile) for k in range(extendLeft))) + p \ + tuple((list(gapProfile) for k in range(extendRight))) else : p1 = p for k,n in enumerate(pa) : p1[k][n] += 1 p = p1 if extendLeft > 0 or extendRight > 0 : fr = '-'*extendLeft bk = '-'*extendRight a = tuple((fr + x + bk for x in a)) a = a + (iton(pa),) ns += 1 if report and (kk+1) % 1000 == 0 : import sys print kk+1, len(a[0]), sys.stdout.flush() if report: print return a
def getDist(i,j):
    """Return the distance between groups ``i`` and ``j``, floored at ``2*max(mhs[i], mhs[j])``.

    A group whose ``mhs`` value is below ``lowDiversity`` is represented by
    its single ``getCons`` result, otherwise by ``getReps``.  The distance is
    the mean JC-corrected alignment distance over all representative pairs.
    When a consensus was used and that first estimate is small (below the
    floor, or below both ``refineUpperLimit`` and ``floor*refineFactor``),
    single-element sides are replaced by ``getReps`` and the new pairs are
    folded into the mean.

    Side effect: the module-level counter ``acnt`` is increased by the number
    of sequence pairs compared.
    """
    global acnt

    mi, mj = mhs[i], mhs[j]

    iLow = mi < lowDiversity
    ri = [getCons(i)] if iLow else getReps(i)
    jLow = mj < lowDiversity
    rj = [getCons(j)] if jLow else getReps(j)
    anyCons = iLow or jLow

    nhs = len(ri) * len(rj)
    if nhs == 1:
        h = calign.globalAlign(ri[0], rj[0], scores = defaultMatchScores,
                               report = calign.JCcorrection)
    else:
        allDists = calign.allpairs(ri, rj, align=True, scores = defaultMatchScores,
                                   report = calign.JCcorrection)
        h = sum(sum(row) for row in allDists) / nhs
    acnt += nhs

    lowLim = 2 * max(mi, mj)

    closeEnough = h < lowLim or (h < refineUpperLimit and h < lowLim*refineFactor)
    if anyCons and closeEnough:
        # Refine: swap every single-element side for the full representatives.
        xri = ri if len(ri) != 1 else getReps(i)
        xrj = rj if len(rj) != 1 else getReps(j)

        if ri != xri or rj != xrj:
            refined = calign.allpairs(xri, xrj, align=True, scores = defaultMatchScores,
                                      report = calign.JCcorrection)
            h1 = sum(sum(row) for row in refined)
            xnhs = len(xri) * len(xrj)
            acnt += xnhs
            # Pool the first estimate (a mean over nhs pairs) with the new sum.
            h = (h * nhs + h1) / (nhs + xnhs)

    return max(h, lowLim)
def guesstimateTH(seqs, nMax, thc, matches=None, scores=None, ns=100):
    """Estimate a distance threshold from randomly sampled pairwise distances.

    2000 random sequence pairs are aligned (JC-corrected distance).  A target
    fraction ``th`` is derived from a geometric series of sizes starting at
    ``nMax``; the returned threshold is the distance ``t`` in (0, 1) at which
    the fraction of sampled distances below ``t`` equals ``th``, located with
    ``brentq``.  When that function does not change sign over [0, 1] the
    fallback ``thc/2`` is returned.

    seqs   -- sequence collection; pairs are drawn with ``random.sample``.
    nMax   -- size used for the ``len(seqs)/nMax`` ratio and the series.
    thc    -- fallback threshold; half of it is returned when no root is
              bracketed.
    scores -- forwarded as ``scores`` to ``calign.globalAlign``.
    matches, ns -- accepted for interface compatibility; not used.  They were
              only read by code that followed the first ``return`` and could
              never run; that unreachable code has been removed.

    NOTE(review): the arithmetic presumably relies on true division
    (``from __future__ import division``) -- confirm at the top of the file.
    """
    nDistances = 2000
    nTerms = 100

    distances = []
    for _ in range(nDistances):
        x1, x2 = random.sample(seqs, 2)
        distances.append(calign.globalAlign(x1, x2, report = calign.JCcorrection,
                                            scores = scores))

    q = 1 - 1. / (len(seqs) / nMax)
    sizes = [int(nMax * q**n) for n in range(nTerms)]
    th = sum([nPairs(x) for x in sizes]) / nPairs(sum(sizes)) / 2

    def excessBelow(t):
        # Fraction of sampled distances strictly below t, minus the target.
        return (sum([x < t for x in distances]) / len(distances)) - th

    if excessBelow(0) * excessBelow(1) < 0:
        # Opposite signs at the ends: brentq can bracket the crossing.
        return brentq(excessBelow, 0, 1)

    return thc / 2
def deClutter(seqs, th, correction, failsTH = 20, matches = None, matchScores = defaultMatchScores, verbose = None) :
    """Partition sequence indices into singles, mutual pairs and merged groups.

    For every index not yet assigned, ``getPotentials`` supplies candidate
    partners.  No candidates -> the index is a single.  Exactly one candidate
    whose own only candidate is this index -> the two form a pair.  Anything
    else -> a group, which absorbs former singles, former pairs and any
    previously recorded group of its members.  Groups are finally combined
    with ``mergeGroupings``.

    seqs        -- indexable collection of sequences.
    th          -- threshold handed to ``getPotentials``.
    correction  -- truthy: distances reported with ``calign.JCcorrection``;
                   falsy: ``calign.DIVERGENCE``.
    failsTH     -- only copied to an unused local here.
    matches     -- lookup passed to ``getPotentials``; built with
                   ``_buildLookupC(seqs)`` when None.
    matchScores -- forwarded as ``scores`` to ``calign.globalAlign``.
    verbose     -- writable file-like object for progress output, or a falsy
                   value for silence.

    Returns ``(singles, pairs, groups)``: a list of single indices, a list of
    ``[index, partner, distance]`` entries, and the merged groups sorted by
    increasing size.

    Python 2 only (print statements, ``file``).
    NOTE(review): this block was re-indented from whitespace-mangled text;
    verify the nesting against the original file.
    """
    if verbose:
        print >> verbose, "declutter",len(seqs),"at", "%g" % th
        # tmain is never read afterwards.
        tmain = time.clock()
        print >> verbose, "building lookup table ..., ",
    # Unused in this function.
    _failsHardTH = failsTH
    if matches is None :
        matches = _buildLookupC(seqs)
    N = len(seqs)
    if verbose:
        print >> verbose, "done."
        print >> verbose, "n n-matched #singles #pairs #groups #matched-now #new-matched"
    # Duplicate code for speed
    if correction :
        fdis = lambda i,j : calign.globalAlign(seqs[i], seqs[j], scores = matchScores, report = calign.JCcorrection)
    else :
        fdis = lambda i,j : calign.globalAlign(seqs[i], seqs[j], scores = matchScores, report = calign.DIVERGENCE)
    # Statistics are always on; with doStats set, getp() results are unpacked
    # as 3-tuples (candidates, distances, stat).
    doStats = True
    if doStats :
        totTries = totFails = 0
    # emptys is never used below.
    emptys = set()
    # Maps an element to the (shared) group list it was last placed in.
    elem2grps = dict()
    lseqs = [len(s) for s in seqs]
    # rs is bound just below; the lambda only looks it up when called.
    getp = lambda j : getPotentials(j, seqs, lseqs, th, fdis, matches, elem2grps, rs = rs)
    # 0,1 : min( seq-ident producing a match below th ), count
    # 2,3 - sum of p(first fail before terminating), count
    rs = [2,0,1,0]
    grps = []
    singles = set()
    pairs = list()
    # Members of 'pairs', so a later group can find and dissolve the pair.
    pairsassingles = set()
    # Every index already placed in a pair or a group.
    paired = set()
    for jSeq in range(N) :
        if jSeq in paired :
            continue
        pp = getp(jSeq)
        if doStats :
            iCans,sd,stat = pp
            totTries += stat[1]
            totFails += stat[0]
        else:
            iCans,sd = pp
        done = False
        if len(iCans) == 0 :
            singles.add(jSeq)
            done = True
        # Matching a previous single happens
        elif len(iCans) == 1 and iCans[0] not in paired \
             and iCans[0] not in singles :
            # One fresh candidate: check whether the match is mutual.
            partner = iCans[0]
            pp = getp(partner)
            if doStats :
                jCans,sd,stat1 = pp
                totTries += stat1[1]
                totFails += stat1[0]
                stat = (stat[0],stat[1], stat[2] + stat1[2])
            else :
                jCans,sd = pp
            if len(jCans) == 1 and jCans[0] == jSeq :
                pairs.append([jSeq,partner,sd[0]])
                # probably not safe: avoid duplicates but those will not be examined
                # again for possible removal
                paired.add(jSeq)
                paired.add(partner)
                assert all([x not in singles and x not in pairsassingles for x in [jSeq,partner]])
                pairsassingles.add(jSeq)
                pairsassingles.add(partner)
                done = True
            else :
                # Not mutual: pool both candidate lists (minus jSeq itself).
                iCans = set(jCans + iCans)
                iCans.discard(jSeq)
                iCans = list(iCans)
        assert iCans is not None
        if verbose and not done:
            print >> verbose, jSeq, len(paired), "%.4g%%" % (100*(len(paired)/len(seqs))),\
                  len(singles), len(pairs), len(grps), len(iCans)+1, \
                  sum([x not in paired for x in iCans])+1,\
                  ("(%d %d %d)" % (totTries,totFails,stat[2])) if doStats else ""
        if not done:
            g = [jSeq] + iCans
            paired.add(jSeq)
            sRemoved = []
            for x in iCans:
                if x in singles:
                    sRemoved.append(x)
                    singles.discard(x)
                paired.add(x)
            if verbose and len(sRemoved) > 0:
                print >> verbose, len(sRemoved),"removed from singles",\
                      '('+",".join([str(x) for x in sRemoved])+')'
            # g may grow while being iterated: members appended here are
            # visited too.
            for x in g :
                if x in pairsassingles :
                    for k,(a,b,d) in enumerate(pairs):
                        if a == x or b == x :
                            # Dissolve the pair into the group; deleting during
                            # enumerate is safe only because of the break below.
                            del pairs[k]
                            if a not in g :
                                g.append(a)
                            if b not in g :
                                g.append(b)
                            break
            if True:
                # Absorb groups previously recorded for any member.  The loop
                # walks the original list object; g itself is rebound to a set.
                for x in g :
                    if x in elem2grps:
                        if not isinstance(g, set):
                            g = set(g)
                        g.update(elem2grps[x])
                if isinstance(g, set) :
                    g = list(g)
            for x in g :
                elem2grps[x] = g
            grps.append(g)
    if verbose:
        if doStats:
            print >> verbose, totTries, "matching alignments,", totFails,"fails."
        print >> verbose, "merging", len(grps), "groups",
    # Disabled debug dump of the raw groups.
    if 0 :
        ww =file("/tmp/gg",'w')
        for g in grps:
            for x in g:
                print >> ww, x,
            print >> ww
        ww.close()
    mgrps = sorted(mergeGroupings(grps), key = lambda x : len(x), reverse=0)
    if verbose:
        print >> verbose,"done, ", len(mgrps), "groups."
    return list(singles),pairs,mgrps
k = len(seqs)//nMax if max(k,ns) > len(seqs)//50 : return thc/2 sq = random.sample(seqs, max(k,ns)) t,ds = treeFromSeqs(sq, matchScores = scores) del ds th = sorted(nodeHeights(t).values())[-(ns//k)] #import pdb; pdb.set_trace() return th if matches is None : #print "matches" matches = _buildLookupC(seqs) #print "done matches" # lseqs = [len(x) for x in seqs] fdis = lambda s,j : calign.globalAlign(s, seqs[j], scores = scores, report = calign.JCcorrection) nInClade = max((len(seqs)//nMax) + 1, nMax) lim = 2*nInClade v = [] li = random.sample(range(len(seqs)), min(ns,len(seqs))) for i in li: p = getMates(seqs[i], seqs, .3, fdis, matches, lim = lim) v.append((i,sorted(p[1]))) f3 = lambda x : mean(x) if len(x) else 1 f1 = lambda th : f3([1./x for x in [sum([x < th for x in u[1]]) for u in v] if x > 0]) #import pdb; pdb.set_trace() #f1 = lambda th : mean([1/max(sum([x < th for x in u[1]]),0.0001) for u in v]) if f1(1) >= 1./nInClade :