def _taxOrder(trees):
    """Return the taxa of *trees* as a list, ordered by greedy agglomeration.

    Pairwise taxon distances (as filled in by ``taxaDistance``) are averaged
    over the distinct topologies in *trees*, each topology weighted by the
    number of trees showing it. Every taxon starts as a one-element group;
    the two groups with the smallest mean pairwise distance are repeatedly
    joined until one group remains. When two groups are joined, four
    arrangements are compared (second group as-is or reversed, placed after
    or before the first) and the one with the lexicographically smallest
    distance profile is kept.

    The taxon names are taken from ``trees[0]``, so *trees* must be a
    non-empty sequence.
    """
    topologyCounts = Counter(toNewick(tree, topologyOnly=1) for tree in trees)
    nTrees = len(trees)

    # Topology-weighted average of the pairwise distances.
    dm = defaultdict(int)
    for topology, count in topologyCounts.items():
        tree = parseNewick(topology)
        distances = dict()
        taxaDistance(tree, tree.root, distances)
        for pair, dist in distances.items():
            dm[pair] += dist * count / nTrees

    taxa = sorted(trees[0].get_taxa())

    # Make the table symmetric: mirror every (j,k) entry into (k,j).
    for j, k in allPairs(taxa):
        dm[k, j] = dm[j, k]

    def groupDistance(first, second):
        # Mean distance over all cross pairs of the two groups.
        return mean([dm[pair] for pair in itertools.product(first, second)])

    def profile(first, second):
        # Distances between members counted from the end of `first` and
        # members counted from the start of `second`. Entry 0 is the distance
        # across the junction: (last of first, first of second).
        return [dm[first[i], second[n - (i + 1)]]
                for n in range(len(first) + len(second) - 1)
                for i in range(-1,
                               max(-len(first) - 1, -(len(second) - n + 1)),
                               -1)]

    groups = [[name] for name in taxa]
    while len(groups) > 1:
        # Find the two closest groups.
        candidates = [(groupDistance(groups[a], groups[b]), (a, b))
                      for a, b in allPairs(range(len(groups)))]
        candidates.sort()
        ij, ik = candidates[0][1]

        g1, g2 = groups[ij], groups[ik]
        g2Reversed = list(reversed(g2))

        # The four arrangements considered, in tie-breaking order.
        arrangements = [(g1, g2), (g1, g2Reversed), (g2, g1), (g2Reversed, g1)]
        ranked = sorted(zip([profile(a, b) for a, b in arrangements], range(4)))
        head, tail = arrangements[ranked[0][1]]

        groups[ij] = head + tail
        del groups[ik]

    return groups[0]
def treeFromDists(ds, tax = None, weights = None, asString = False):
    """Build a tree from the distances *ds* using ``calign.upgma``.

    ds       -- distances, passed unchanged to ``calign.upgma``.
    tax      -- taxon labels, passed unchanged to ``upgma2tree``.
    weights  -- optional per-item weights, passed to ``calign.upgma``.
    asString -- when true return what ``upgma2tree`` produced; otherwise
                return the result of ``parseNewick`` on it.

    Every merge row ``(i, j, d, w)`` coming back from ``calign.upgma`` has its
    third field halved. If any weight is greater than 1, the fourth field is
    also recomputed so that each original item counts as 1, whatever weight
    was supplied for it.
    """
    merges = calign.upgma(ds, weights = weights)

    if weights and any([w > 1 for w in weights]):
        nItems = len(weights)
        rebuilt = []

        def clusterSize(index):
            # Indices below nItems are treated as single items; larger ones
            # refer to a row already rebuilt (row index - nItems).
            if index < nItems:
                return 1
            return rebuilt[index - nItems][3]

        for left, right, dist, _ in merges:
            size = clusterSize(left) + clusterSize(right)
            rebuilt.append([left, right, dist/2, size])
        merges = rebuilt
    else:
        merges = tuple((a, b, c/2, d) for a, b, c, d in merges)

    newick = upgma2tree(merges, tax)
    if asString:
        return newick
    return parseNewick(newick)
def checkStr(s):
    """Score how well two parsers agree on the Newick string *s*.

    The string is parsed with ``parseNewick_sol.parseNewick`` and with
    ``parseNewick.parseNewick`` and the two trees are handed to
    ``compareTree``. Flag 0 in its result gives a score of 0. Otherwise the
    score starts at 1 and loses 0.15, 0.1, 0.25 and 0.25 for flags 1, 2, 3
    and 4 respectively. Any exception raised along the way also gives 0.
    """
    # Ordered (flag, penalty) pairs; order is kept so that the floating
    # point result matches a straight chain of subtractions.
    penalties = ((1, 0.15), (2, 0.1), (3, 0.25), (4, 0.25))
    try:
        solutionTree = parseNewick_sol.parseNewick(s)
        candidateTree = parseNewick.parseNewick(s)
        flags = compareTree(solutionTree, candidateTree)
        if 0 in flags:
            return 0
        score = 1
        for flag, penalty in penalties:
            if flag in flags:
                score -= penalty
        return score
    except Exception:
        return 0
def checkStr(s):
    """Return a score in [0, 1] comparing two parses of the Newick string *s*.

    ``parseNewick_sol.parseNewick(s)`` and ``parseNewick.parseNewick(s)`` are
    compared with ``compareTree``, whose result is treated as a collection of
    integer flags:

      0 -> score is 0
      1 -> minus 0.15
      2 -> minus 0.1
      3 -> minus 0.25
      4 -> minus 0.25

    A failure (any ``Exception``) in parsing or comparing scores 0.
    """
    try:
        first = parseNewick_sol.parseNewick(s)
        second = parseNewick.parseNewick(s)
        found = compareTree(first, second)
        if 0 in found:
            return 0
        result = 1
        # Deductions are applied in flag order 1, 2, 3, 4.
        for flag, deduction in zip((1, 2, 3, 4), (0.15, 0.1, 0.25, 0.25)):
            if flag in found:
                result -= deduction
        return result
    except Exception:
        return 0
def next(self):
    """Return the tree described by the current 'tree' token and advance.

    Raises StopIteration when there is no current token or when the current
    token is not a 'tree' token.

    The token text has the form ``name = [options] newick``. When the text
    after '=' starts with '[', it is handed to ``_parseOptions``; in its
    result 'R' marks the tree as rooted, 'U' as unrooted (and wins over 'R'),
    and 'W' supplies the weight. If ``self.taxatable`` is non-empty, every
    terminal's taxon is replaced by its table entry; a missing entry raises
    RuntimeError("translation failed").
    """
    token = self.treetok
    if not token or token[0] != 'tree':
        raise StopIteration

    # Fetch the following token before parsing the current one.
    self.treetok = next(self.itokens)

    # Split "name = definition" at the first '='.
    tname, equals, definition = token[1].strip().partition('=')
    assert equals
    tname = tname.strip()
    definition = definition.strip()

    rooted = False
    weight = 1.0
    if definition[0] == '[':
        options, definition = _parseOptions(definition)
        rooted = ('R' in options) and not ('U' in options)
        if 'W' in options:
            weight = float(options['W'])

    tree = parseNewick(definition.strip(), weight=weight, rooted=rooted,
                       name=tname.split()[0],
                       loadAttributes = self.withAttributes)

    table = self.taxatable
    if table:
        for nodeId in tree.get_terminals():
            data = tree.node(nodeId).data
            try:
                data.taxon = table[data.taxon]
            except (ValueError, KeyError):
                raise RuntimeError("translation failed")
    return tree
def assembleTree(trees, thFrom, thTo, getSeqForTaxon,
                 nMaxReps = 20, maxPerCons = 100,
                 lowDiversity = 0.02, refineFactor = 1.1, refineUpperLimit = .15,
                 verbose = None) :
    """Join the sub-trees obtained by cutting *trees* at *thFrom* into one tree.

    The forest is cut with ``cutForestAt``, giving a list of (tree, node)
    "pseudo taxa". A distance is computed for every pair of pseudo taxa from
    sequence alignments, ``treeFromDists`` builds a tree over them, and each
    leaf of that tree is replaced by the Newick text of the sub-tree it
    stands for. The combined Newick string is parsed and returned.

    trees            -- the trees to cut and reassemble.
    thFrom           -- height passed to ``cutForestAt``.
    thTo             -- not referenced anywhere in this function body.
    getSeqForTaxon   -- callable mapping a taxon to its sequence.
    nMaxReps         -- upper bound on the number of representative
                        sequences sampled from one sub-tree.
    maxPerCons       -- upper bound on the number of sequences used to
                        build one consensus.
    lowDiversity     -- a sub-tree whose ``rh`` is below this is represented
                        by a single consensus sequence instead of sampled
                        representatives.
    refineFactor,
    refineUpperLimit -- decide when a distance that involved a consensus is
                        refined with representative sequences (see getDist).
    verbose          -- file-like object receiving progress lines, or None.

    Side effects: resets and then updates the module-level counter ``acnt``;
    draws from ``random.sample``; two messages are printed to stdout rather
    than to *verbose* (flagged below).
    """
    # Per-tree CAhelper objects, created on first use and cached by tree name.
    cahelpers = dict()
    cahelper = lambda t : cahelpers.get(t.name) or \
               (cahelpers.update([(t.name,CAhelper(t))]) or cahelpers.get(t.name))

    if verbose:
        print >> verbose, "cutting",len(trees),"trees at %g" % thFrom
    # cut trees at thFrom
    pseudoTaxa = cutForestAt(trees, thFrom, cahelper)

    nReps = len(pseudoTaxa)

    # Lazily filled cache: representative sequences per pseudo taxon.
    reps = [None]*nReps
    def getReps(k) :
        if not reps[k] :
            t,n = pseudoTaxa[k]
            nc = len(n.data.terms)
            if nc > 2:
                # Sample about log3(nc) terms: at least 2, at most nMaxReps.
                nc = min(max(int(math.log(nc,3)), 2), nMaxReps)
                r = random.sample(n.data.terms, nc)
            else :
                r = n.data.terms
            reps[k] = [getSeqForTaxon(x.data.taxon) for x in r]
        return reps[k]

    # Lazily filled cache: consensus sequence per pseudo taxon.
    cons = [None]*nReps
    def getCons(k) :
        if not cons[k] :
            t,n = pseudoTaxa[k]
            nc = len(n.data.terms)
            # Use at most maxPerCons (randomly chosen) sequences.
            if nc > maxPerCons :
                i = random.sample(n.data.terms, maxPerCons)
            else :
                i = n.data.terms
            sq = [getSeqForTaxon(x.data.taxon) for x in i]
            # s, r = align.mpc(sq, nRefines=0)
            # del r
            s = doTheCons(sq, n.data.rh)
            #al = align.seqMultiAlign(sorted(sqs, reverse=1))
            #s = align.stripseq(align.cons(calign.createProfile(al)))
            cons[k] = s
        return cons[k]

    # rh of every pseudo taxon (presumably the height of the sub-tree root --
    # TODO confirm against CAhelper).
    mhs = []
    for t,n in pseudoTaxa:
        cahelper(t) # populate rh
        mhs.append(n.data.rh)

    # If both are low diversity - use consensus. If not valid or close to
    # cluster height, do the means thing. If not low diversity, use log
    # representatives.
    # low less than 4%??
    ## lowDiversity = 0.02
    ## refineFactor = 1.1
    ## refineUpperLimit = .15

    # counts how many alignments done (for display)
    global acnt
    acnt = 0

    def getDist(i,j) :
        # Distance between pseudo taxa i and j; never less than twice the
        # larger of their two rh values.
        mi,mj = mhs[i],mhs[j]
        anyCons = False
        if mi < lowDiversity :
            ri = [getCons(i)]
            anyCons = True
        else :
            ri = getReps(i)

        if mj < lowDiversity :
            rj = [getCons(j)]
            anyCons = True
        else :
            rj = getReps(j)

        # Number of pairwise alignments needed for this pair.
        nhs = len(ri)*len(rj)
        if nhs == 1 :
            h = calign.globalAlign(ri[0], rj[0], scores = defaultMatchScores,
                                   report = calign.JCcorrection)
        else :
            ap = calign.allpairs(ri, rj, align=True, scores = defaultMatchScores,
                                 report = calign.JCcorrection)
            # Mean over all pairs.
            h = sum([sum(x) for x in ap])/nhs

        global acnt
        acnt += nhs

        lowLim = 2*max(mi,mj)
        # A consensus-based distance that falls below (or, when small, only
        # slightly above) the lower limit is refined using representatives.
        if anyCons and (h < lowLim or (h < refineUpperLimit and h < lowLim*refineFactor)) :
            xri = getReps(i) if len(ri) == 1 else ri
            xrj = getReps(j) if len(rj) == 1 else rj
            if ri != xri or rj != xrj :
                ap1 = calign.allpairs(xri, xrj, align=True, scores = defaultMatchScores,
                                      report = calign.JCcorrection)
                # h1 is a sum (not a mean) over the extra alignments.
                h1 = sum([sum(x) for x in ap1])
                xnhs = (len(xri)*len(xrj))
                acnt += xnhs
                # Mean over the first-round and the extra alignments together.
                h = (h * nhs + h1)/(nhs + xnhs)
        return max(h, lowLim)

    if verbose :
        print >> verbose, "assembling",nReps,"sub-trees into one tree",time.strftime("%T")
        # NOTE(review): this header goes to stdout, not to `verbose`.
        print "n-sub-tree #pair-only-alignments #alignments time"
        verbose.flush()
        tnow = time.clock()

    # Use array. those can get big
    # One entry per unordered pair (i < j), filled row by row.
    ds = array.array('f',repeat(0.0,nPairs(nReps)))
    pos = 0
    for i in range(nReps-1) :
        for j in range(i+1, nReps) :
            ds[pos] = getDist(i,j)
            pos += 1
        if verbose :
            # Pairs finished so far: (nReps-1) + (nReps-2) + ... + (nReps-i-1).
            dn = sum(range(nReps-1, nReps-i-2,-1))
            print >> verbose, i, dn, "%4.3g%%" % ((100.*dn)/len(ds)), acnt, time.strftime("%T")

    if verbose :
        print >> verbose, tohms(time.clock() - tnow), time.strftime("%T")

    # Using correct weights can throw off the height guarantee, or not?
    wt = [len(n.data.terms) for t,n in pseudoTaxa]
    # Leaves of the new tree are labelled "0".."nReps-1", i.e. by their index
    # into pseudoTaxa.
    tnew = treeFromDists(ds, tax = [str(x) for x in range(nReps)], weights = wt)
    del ds

    # Build the Newick text bottom-up, storing it in data.rtree of each node.
    for n in getPostOrder(tnew) :
        if not n.succ :
            # Leaf: map the label back to its pseudo taxon.
            t,nd = pseudoTaxa[int(n.data.taxon)]
            if len(nd.data.terms) == 1 :
                # A single original taxon: use its name directly.
                n.data.taxon = nd.data.taxon
                n.data.rtree = "%s:%f" % (n.data.taxon, n.data.branchlength)
            else :
                # Ensure heights are there
                cahelper(t)
                s = t.toNewick(nd.id)
                # Stem above the grafted sub-tree: the leaf branch minus the
                # sub-tree's rh, clamped at zero.
                d = n.data.branchlength - nd.data.rh
                if (d < -1e-10) :
                    # NOTE(review): reported on stdout only; execution continues.
                    print "***** ERROR", d
                n.data.rtree = "%s:%f" % (s, max(d,0.0))
        else :
            # Internal node: join the text of the first two children.
            ch = [tnew.node(x).data.rtree for x in n.succ]
            n.data.rtree = "(%s,%s)" % (ch[0],ch[1])
            if n.id != tnew.root :
                n.data.rtree = n.data.rtree + (":%f" % n.data.branchlength)

    trec = tnew.node(tnew.root).data.rtree
    trec = parseNewick(trec)
    return trec