def unifrac_tasks_from_matrix(u, env_names, modes=UNIFRAC_DEFAULT_MODES):
    """Returns the UniFrac matrix, PCoA, and/or cluster from the matrix.

    Arguments:
        - u: square distance matrix; it must support ``u.copy()`` and
          two-dimensional ``[i, i]`` assignment on the copy.
        - env_names: environment names, one per row/column of ``u``.
        - modes: collection of the UNIFRAC_* mode constants to compute.

    Returns a dict keyed by the requested mode constants:
        - UNIFRAC_DIST_MATRIX: the tuple ``(u, env_names)``
        - UNIFRAC_PCOA: result of ``output_pca`` on the principal coordinates
        - UNIFRAC_CLUST_ENVS: result of ``UPGMA_cluster``
        - UNIFRAC_NJ_ENVS: result of ``nj``
    """
    result = {}
    if UNIFRAC_DIST_MATRIX in modes:
        result[UNIFRAC_DIST_MATRIX] = (u, env_names)
    if UNIFRAC_PCOA in modes:
        point_matrix, eigvals = principal_coordinates_analysis(u)
        result[UNIFRAC_PCOA] = output_pca(point_matrix, eigvals, env_names)
    if UNIFRAC_CLUST_ENVS in modes:
        # Build a real list: under Python 3 ``map`` returns a one-shot
        # iterator, which cannot be indexed or traversed twice.
        nodes = [PhyloNode(name) for name in env_names]
        BIG = 1e305
        # Work on a copy so the caller's matrix keeps its original diagonal.
        U = u.copy()
        # NOTE(review): the huge diagonal presumably stops UPGMA from picking
        # a self-pair as the closest pair -- confirm against UPGMA_cluster.
        for i in range(len(U)):
            U[i, i] = BIG
        result[UNIFRAC_CLUST_ENVS] = UPGMA_cluster(U, nodes, BIG)
    if UNIFRAC_NJ_ENVS in modes:
        result[UNIFRAC_NJ_ENVS] = nj(dists_to_nj(u, env_names))
    return result
def unifrac_tasks_from_matrix(u, env_names, modes=UNIFRAC_DEFAULT_MODES):
    """Returns the UniFrac matrix, PCoA, and/or cluster from the matrix.

    Arguments:
        - u: square distance matrix; it must support ``u.copy()`` and
          two-dimensional ``[i, i]`` assignment on the copy.
        - env_names: environment names, one per row/column of ``u``.
        - modes: collection of the UNIFRAC_* mode constants to compute.

    Returns a dict keyed by the requested mode constants:
        - UNIFRAC_DIST_MATRIX: the tuple ``(u, env_names)``
        - UNIFRAC_PCOA: result of ``output_pca`` on the principal coordinates
        - UNIFRAC_CLUST_ENVS: result of ``UPGMA_cluster``
        - UNIFRAC_NJ_ENVS: result of ``nj``
    """
    result = {}
    if UNIFRAC_DIST_MATRIX in modes:
        result[UNIFRAC_DIST_MATRIX] = (u, env_names)
    if UNIFRAC_PCOA in modes:
        point_matrix, eigvals = principal_coordinates_analysis(u)
        result[UNIFRAC_PCOA] = output_pca(point_matrix, eigvals, env_names)
    if UNIFRAC_CLUST_ENVS in modes:
        # Build a real list: under Python 3 ``map`` returns a one-shot
        # iterator, which cannot be indexed or traversed twice.
        nodes = [PhyloNode(name) for name in env_names]
        BIG = 1e305
        # Work on a copy so the caller's matrix keeps its original diagonal.
        U = u.copy()
        # NOTE(review): the huge diagonal presumably stops UPGMA from picking
        # a self-pair as the closest pair -- confirm against UPGMA_cluster.
        for i in range(len(U)):
            U[i, i] = BIG
        result[UNIFRAC_CLUST_ENVS] = UPGMA_cluster(U, nodes, BIG)
    if UNIFRAC_NJ_ENVS in modes:
        result[UNIFRAC_NJ_ENVS] = nj(dists_to_nj(u, env_names))
    return result
def construct_cluster(args, dm):
    """Build a tree from the distance data ``dm`` by neighbour joining.

    ``args`` is accepted but not used. The UPGMA import is kept so the
    clustering method can be switched by hand (``upgma(dm)`` instead of
    ``nj.nj(dm)``).
    """
    from cogent.phylo import nj
    from cogent.cluster.UPGMA import upgma
    # Alternative method: return upgma(dm)
    return nj.nj(dm)
def evaluate_tree(aln):
    """Report whether the NJ tree of ``aln`` puts three edges between A and B.

    Distances are estimated under JC69, a neighbour-joining tree is built
    from them, and the function returns True when
    ``getConnectingEdges('A', 'B')`` yields exactly three items.
    When the module-level ``debug`` flag is set, the distances, the tree, its
    topology comparison with the global ``tr`` and the A-to-B/C/D edge lists
    are printed.
    """
    estimator = distance.EstimateDistances(aln, submodel=JC69())
    estimator.run(show_progress=False)
    tree = nj.nj(estimator.getPairwiseDistances())
    if debug:
        print(estimator)
        print(tree.asciiArt())
        print(tree.sameTopology(tr))
        for other in ('B', 'C', 'D'):
            print(tree.getConnectingEdges('A', other))
    return len(tree.getConnectingEdges('A', 'B')) == 3
def distmat_to_tree(distmat):
    """Build a neighbour-joining tree from a ``(headers, matrix)`` pair.

    Every off-diagonal cell ``matrix[i][j]`` is stored under the key
    ``(headers[i], headers[j])`` and the resulting dictionary is handed to
    ``nj.nj``.
    """
    labels, rows = distmat
    # Diagonal cells (i == j) are left out of the pairwise dictionary.
    pairwise = dict(
        ((labels[i], labels[j]), rows[i][j])
        for i in range(len(rows))
        for j in range(len(rows[i]))
        if i != j
    )
    return nj.nj(pairwise)
def get_alignment_tree(fname):
    """Build a neighbour joining tree

    Loads the FASTA alignment ``fname``, estimates pairwise distances under
    F81, builds and balances an NJ tree, prints its ASCII art and then prints
    a snippet showing how to draw the tree as a PDF. Returns None.
    """
    from cogent.phylo import distance, nj
    from cogent.evolve.models import HKY85, F81
    alignment = cogent.LoadSeqs(fname, format='fasta')
    estimator = distance.EstimateDistances(alignment, submodel=F81())
    estimator.run(show_progress=False)
    balanced_tree = nj.nj(estimator.getPairwiseDistances()).balanced()
    print(balanced_tree.asciiArt())
    # The drawing recipe is only printed as text; it is not executed here.
    print('''from cogent.draw import dendrogram p = dendrogram.SquareDendrogram(mytree) p.drawToPDF('tree-scaled.pdf', 500, 400, stroke_width=2.0, shade_param = 'r', max_value = 1.0,)''')
def single_file_nj(input_file, output_file):
    """Read a distance matrix file, build an NJ tree, write it as Newick.

    Arguments:
        - input_file: path of the distance matrix, parsed by ``parse_distmat``
          into ``(headers, data)``; ``data`` is indexed as ``data[i, j]``.
        - output_file: path that receives the Newick string (with distances).
    """
    # Context managers close the handles even if parsing or writing raises.
    with open(input_file, 'U') as f:
        headers, data = parse_distmat(f)

    # Both loops run over every header, so (a, b) and (b, a) are both stored;
    # no separate mirrored assignment is needed.
    distdict = {}
    for i, row_name in enumerate(headers):
        for j, col_name in enumerate(headers):
            distdict[(row_name, col_name)] = data[i, j]

    tree = nj(distdict)

    with open(output_file, 'w') as f:
        f.write(tree.getNewick(with_distances=True))
def single_file_nj(input_file, output_file):
    """Read a distance matrix file, build an NJ tree, write it as Newick.

    Arguments:
        - input_file: path of the distance matrix, parsed by ``parse_distmat``
          into ``(headers, data)``; ``data`` is indexed as ``data[i, j]``.
        - output_file: path that receives the Newick string (with distances).
    """
    # Context managers close the handles even if parsing or writing raises.
    with open(input_file, 'U') as f:
        headers, data = parse_distmat(f)

    # Both loops run over every header, so (a, b) and (b, a) are both stored;
    # no separate mirrored assignment is needed.
    distdict = {}
    for i, row_name in enumerate(headers):
        for j, col_name in enumerate(headers):
            distdict[(row_name, col_name)] = data[i, j]

    tree = nj(distdict)

    with open(output_file, 'w') as f:
        f.write(tree.getNewick(with_distances=True))
def plot_tree(sourceArray, args):
    """Cluster ``sourceArray`` into a tree and save its ASCII art.

    Arguments:
        - sourceArray: square matrix indexed as ``sourceArray[i, j]``, with
          rows/columns in the order of the module-level ``idList``.
        - args: object providing ``method`` ('upgma', 'nj' or None, where
          None means UPGMA) and ``destPath`` (output file path).

    Exits with status 1 (SystemExit) when ``args.method`` is not supported.
    """
    # Build the input dictionary for the tree functions.
    distanceDict = {}
    for i, rowId in enumerate(idList):
        for j, colId in enumerate(idList):
            distanceDict[(rowId, colId)] = sourceArray[i, j]

    # Generate the tree using the specified method.
    if args.method == 'upgma' or args.method is None:
        tree = upgma(distanceDict)
    elif args.method == 'nj':
        tree = nj(distanceDict)
    else:
        print("Method '%s' is not supported." % (args.method,))
        # Same exception exit(1) raises, without relying on the site module.
        raise SystemExit(1)

    # Convert the tree to text and save to the specified file; the context
    # manager closes the file even when the write fails.
    with open(args.destPath, 'w') as destFile:
        destFile.write(tree.asciiArt() + '\n')
import sys from cogent.draw import dendrogram from cogent.phylo import nj import utils fn = 'data/nj_data.txt' data = utils.load_data(fn,split_lines=True) otus = utils.letters[:len(data)] dists = dict() for t1,line in zip(otus,data): L = line.strip().split() L = [float(n) for n in L] for t2,e in zip(otus,L): dists[(t1,t2)] = e tr = nj.nj(dists) print tr.asciiArt() print for n in tr.iterTips(): print n.ancestors()[0].Name, n def show_edge_children(node): children = node.iterNontips() for child in children: print node.Name, print child.Name, print node.distance(child), edges = list(tr.iterNontips()) for e in edges:
def test_nj(self):
    """nj on self.dists should match self.tree per assertTreeDistancesEqual"""
    rebuilt_tree = nj(self.dists)
    expected_tree = self.tree
    self.assertTreeDistancesEqual(expected_tree, rebuilt_tree)
def phyl_tree():
    """Print the NJ tree built from the pairwise distances of global ``d``.

    Reads the module-level ``d`` at call time, so it must be called while
    ``d`` still refers to an object providing ``getPairwiseDistances()``.
    Output goes to whatever ``sys.stdout`` is currently bound to.
    """
    mytree = nj.nj(d.getPairwiseDistances())
    print ("\n\n")
    print mytree.asciiArt()

# --- cytc: protein alignment, JTT92 distances ---
al = LoadSeqs("cytc.fasta", moltype=PROTEIN, interleaved=False)
d = distance.EstimateDistances(al, submodel = JTT92())
d.run()
# From here on every print goes into this file; the handle is never closed
# explicitly and the previous sys.stdout is not restored.
sys.stdout = open("cytc distances.txt", "w")
print d
phyl_tree()

# --- mtdna: DNA sequences, JC69 distances ---
al = LoadSeqs("mtdna.fasta", moltype=DNA, interleaved=True, aligned=False)
d = distance.EstimateDistances(al, submodel = JC69())
d.run()
sys.stdout = open("mtdna distances.txt", "w")
print d
phyl_tree()

# --- cytb: unaligned protein sequences are aligned first, then JTT92 ---
seqs = LoadSeqs("cytb.fasta", moltype=PROTEIN, aligned=False)
al = align_unaligned_seqs(seqs,PROTEIN)
dcalc = distance.EstimateDistances(al, submodel = JTT92())
dcalc.run(show_progress = True)
# Here ``d`` is rebound to the pairwise-distance result itself rather than to
# the estimator, so phyl_tree() (which calls d.getPairwiseDistances()) is not
# used for this data set; the tree is built inline instead.
d = dcalc.getPairwiseDistances()
tree=nj.nj(d)
sys.stdout = open("cytb distances.txt", "w")
print dcalc
print '\n\n'
print tree.asciiArt()
#phyl_tree()
def TreeAlign(model, seqs, tree=None, indel_rate=0.01, indel_length=0.01,
              ui=None, ests_from_pairwise=True, param_vals=None):
    """Returns a multiple alignment and tree.

    Uses the provided substitution model and a tree for determining the
    progressive order. If a tree is not provided a Neighbour Joining tree is
    constructed from pairwise distances estimated from pairwise aligning the
    sequences. If running in parallel, only the distance estimation is
    parallelised and only the master CPU returns the alignment and tree, other
    CPU's return None, None.

    Arguments:
        - model: a substitution model
        - seqs: a sequence collection (or a dict keyed by sequence name)
        - tree: optional guide tree; its tip names must equal the sequence
          names (checked with an assertion)
        - indel_rate, indel_length: parameters for the progressive pair-HMM
        - ui: optional progress reporter; its ``display`` method is called
          once when provided
        - ests_from_pairwise: if no tree provided and True, the median value
          of the substitution model parameters are used
        - param_vals: named key, value pairs for model parameters. These
          override ests_from_pairwise.
    """
    _exclude_params = ['mprobs', 'rate', 'bin_switch']
    # Copy so the caller's mapping is never mutated by the update further down.
    param_vals = dict(param_vals) if param_vals else {}

    if isinstance(seqs, dict):
        seq_names = list(seqs.keys())
    else:
        seq_names = seqs.getSeqNames()

    two_seqs = len(seq_names) == 2

    if tree:
        tip_names = tree.getTipNames()
        tip_names.sort()
        seq_names.sort()
        assert tip_names == seq_names, \
            "names don't match between seqs and tree: tree=%s; seqs=%s" % \
            (tip_names, seq_names)
        ests_from_pairwise = False
    elif two_seqs:
        # Use the names gathered above: a plain dict has no getSeqNames(),
        # so calling it on ``seqs`` here broke the dict input accepted earlier.
        tree = LoadTree(tip_names=seq_names)
        ests_from_pairwise = False
    else:
        if ests_from_pairwise:
            est_params = [param for param in model.getParamList()
                          if param not in _exclude_params]
        else:
            est_params = None

        dcalc = EstimateDistances(seqs, model, do_pair_align=True,
                                  est_params=est_params)
        dcalc.run()
        dists = dcalc.getPairwiseDistances()
        tree = NJ.nj(dists)

    LF = model.makeLikelihoodFunction(tree.bifurcating(name_unnamed=True),
                                      aligned=False)
    # ests_from_pairwise can only still be True on the NJ path above, so
    # dcalc and est_params are guaranteed to exist here.
    if ests_from_pairwise and not param_vals:
        # we use the Median to avoid the influence of outlier pairs
        param_vals = {}
        for param in est_params:
            numbers = dcalc.getParamValues(param)
            print("Param Estimate Summary Stats: %s" % param)
            print(numbers.summarize())
            param_vals[param] = numbers.Median

    # ``ui`` defaults to None; only report progress when a reporter was given.
    if ui is not None:
        ui.display("Doing %s alignment" % ["progressive", "pairwise"][two_seqs])

    with LF.updatesPostponed():
        for param, val in list(param_vals.items()):
            LF.setParamRule(param, value=val, is_constant=True)
        LF.setParamRule('indel_rate', value=indel_rate, is_constant=True)
        LF.setParamRule('indel_length', value=indel_length, is_constant=True)
        LF.setSequences(seqs)
    edge = LF.getLogLikelihood().edge
    align = edge.getViterbiPath().getAlignment()
    info = Info()
    info["AlignParams"] = param_vals
    info["AlignParams"].update(dict(indel_length=indel_length,
                                    indel_rate=indel_rate))
    align.Info = info
    return align, tree
def phyl_tree():
    """Print a blank gap followed by the ASCII art of the NJ tree.

    The tree is built from ``getPairwiseDistances()`` of the module-level
    object ``d``, looked up at call time.
    """
    pairwise = d.getPairwiseDistances()
    nj_tree = nj.nj(pairwise)
    print("\n\n")
    print(nj_tree.asciiArt())
def main():
    """Read named distance matrices from a file and export UPGMA/NJ trees.

    Usage: exactly one command-line argument, the input file path.

    Input layout, as parsed below: a section starts with a name line (longer
    than one character, no tab), followed by a header row whose first column
    is empty, then one tab-separated row per name, and ends at an empty line.
    A section whose header does not start with an empty column is skipped.

    Only sections whose name ends with '_prop' or '_jaccard_' are exported.
    For each of them the files <input>.<name>.upgma, .upgma.tree,
    .upgma.tree.pdf, .nj, .nj.tree and .nj.tree.pdf are written and the
    weighted least-squares score of each tree is printed.

    Exits with status 1 on a wrong argument count, a missing input file, a
    row label that is not in the header, or a row of the wrong length.
    """
    if len(sys.argv) != 2:
        print("not enought arguments")
        sys.exit(1)

    inputfile = sys.argv[1]

    if not os.path.exists(inputfile):
        print("input file %s does not exists" % inputfile)
        sys.exit(1)

    trees = {}
    with open(inputfile, 'r') as fhd:
        name = None
        for line in fhd:
            line = line.strip()
            cols = line.split("\t")
            if not (len(line) > 1 and len(cols) == 1):
                continue

            name = cols[0]
            print("name '%s'" % name)

            # Consume this section from the same file iterator until the
            # terminating empty line.
            for line in fhd:
                line = line.rstrip()
                if len(line) == 0:
                    name = None
                    break

                if name is None:
                    # Section was rejected; skip its remaining lines.
                    continue

                if name not in trees:
                    headerNames = line.split("\t")
                    if headerNames[0] != "":
                        name = None
                        continue
                    trees[name] = {
                        'dimentions': len(headerNames) - 1,
                        'names': headerNames[1:],
                        'matrix': [],
                    }
                    continue

                section = trees[name]
                colss = line.split("\t")
                if colss[0] not in section['names']:
                    # Report the offending row label and this section's own
                    # header names; the original referenced an undefined
                    # variable here and printed the section name twice.
                    print("name '%s' :: getting matrix :: vertical name %s not in names %s" % (
                        str(name), str(colss[0]), str(section['names'])))
                    sys.exit(1)

                values = []
                for x in colss[1:]:
                    x = float(x)
                    # NOTE(review): exact zeros are replaced by a tiny
                    # positive value, presumably because the tree builders
                    # dislike zero distances -- confirm.
                    if x == 0:
                        x = 0.00000001
                    values.append(x)
                values = array.array('f', values)

                # Compare against this section's own dimension rather than a
                # local left over from the most recently parsed header.
                if len(values) != section['dimentions']:
                    print("name '%s' :: dimentions dont match %d vs %d" % (
                        str(name), len(values), section['dimentions']))
                    sys.exit(1)

                section['matrix'].append(values)

    for treename in sorted(trees):
        if not treename.endswith(('_prop', '_jaccard_')):
            continue

        print("exporting tree %s" % treename)
        treedata = trees[treename]
        names = treedata['names']
        matrix = treedata['matrix']
        outbase = "%s.%s" % (inputfile, treename)

        # Store each unordered pair once per direction; the diagonal is left
        # out.
        dissi = {}
        for pos1, name1 in enumerate(names):
            for pos2 in range(pos1):
                name2 = names[pos2]
                dissi[(name1, name2)] = matrix[pos1][pos2]
                dissi[(name2, name1)] = matrix[pos2][pos1]

        mycluster = upgma(dissi).balanced()
        mycluster.writeToFile(outbase + ".upgma", with_distances=True)
        with open(outbase + ".upgma.tree", "w") as upgmat:
            upgmat.write(mycluster.asciiArt())
        myclusterden = dendrogram.ContemporaneousDendrogram(mycluster)
        myclusterden.drawToPDF(outbase + '.upgma.tree.pdf', 1024, 2048)
        myclusterls = least_squares.WLS(dissi).evaluateTree(mycluster)
        print("UPGMA Least Square %s" % (myclusterls,))

        mytree = nj.nj(dissi).balanced()
        mytree.writeToFile(outbase + ".nj", with_distances=True)
        with open(outbase + ".nj.tree", "w") as njt:
            njt.write(mytree.asciiArt())
        mytreeden = dendrogram.ContemporaneousDendrogram(mytree)
        mytreeden.drawToPDF(outbase + '.nj.tree.pdf', 1024, 2048)
        mytreels = least_squares.WLS(dissi).evaluateTree(mytree)
        print("NJ Least Square %s" % (mytreels,))
model = JC() #for seq in bububu: dists = dict() for i in range(len(bububu)-1): for j in range(i+1,len(bububu)): dists[(bububu[i].name,bububu[j].name)] = model.count(bububu[i],bububu[j]) #model.count() #for i in range(len(bububu)): # model.count() #from cogent.phylo import nj from cogent.phylo import nj #dists = {('a', 'b'): 2.7, ('c', 'b'): 2.33, ('c', 'a'): 0.73} njtree2 = nj.nj(dists) print njtree2.writeToFile('/home/puko/Desktop/science2.newick') #njtree2.writeToFile('/home/puko/Desktop/tree.newick') #print njtree2.asciiArt() #bootstrap import random file = open('/home/puko/Desktop/science_bootstrap2.newick','w') trees = str() trees_list = [] for x in range(0,499):
def main():
    """Read named distance matrices from a file and export UPGMA/NJ trees.

    Usage: exactly one command-line argument, the input file path.

    Input layout, as parsed below: a section starts with a name line (longer
    than one character, no tab), followed by a header row whose first column
    is empty, then one tab-separated row per name, and ends at an empty line.
    A section whose header does not start with an empty column is skipped.

    Only sections whose name ends with '_prop' or '_jaccard_' are exported.
    For each of them the files <input>.<name>.upgma, .upgma.tree,
    .upgma.tree.pdf, .nj, .nj.tree and .nj.tree.pdf are written and the
    weighted least-squares score of each tree is printed.

    Exits with status 1 on a wrong argument count, a missing input file, a
    row label that is not in the header, or a row of the wrong length.
    """
    if len(sys.argv) != 2:
        print("not enought arguments")
        sys.exit(1)

    inputfile = sys.argv[1]

    if not os.path.exists(inputfile):
        print("input file %s does not exists" % inputfile)
        sys.exit(1)

    trees = {}
    with open(inputfile, 'r') as fhd:
        name = None
        for line in fhd:
            line = line.strip()
            cols = line.split("\t")
            if not (len(line) > 1 and len(cols) == 1):
                continue

            name = cols[0]
            print("name '%s'" % name)

            # Consume this section from the same file iterator until the
            # terminating empty line.
            for line in fhd:
                line = line.rstrip()
                if len(line) == 0:
                    name = None
                    break

                if name is None:
                    # Section was rejected; skip its remaining lines.
                    continue

                if name not in trees:
                    headerNames = line.split("\t")
                    if headerNames[0] != "":
                        name = None
                        continue
                    trees[name] = {
                        'dimentions': len(headerNames) - 1,
                        'names': headerNames[1:],
                        'matrix': [],
                    }
                    continue

                section = trees[name]
                colss = line.split("\t")
                if colss[0] not in section['names']:
                    # Report the offending row label and this section's own
                    # header names; the original referenced an undefined
                    # variable here and printed the section name twice.
                    print("name '%s' :: getting matrix :: vertical name %s not in names %s" % (
                        str(name), str(colss[0]), str(section['names'])))
                    sys.exit(1)

                values = []
                for x in colss[1:]:
                    x = float(x)
                    # NOTE(review): exact zeros are replaced by a tiny
                    # positive value, presumably because the tree builders
                    # dislike zero distances -- confirm.
                    if x == 0:
                        x = 0.00000001
                    values.append(x)
                values = array.array('f', values)

                # Compare against this section's own dimension rather than a
                # local left over from the most recently parsed header.
                if len(values) != section['dimentions']:
                    print("name '%s' :: dimentions dont match %d vs %d" % (
                        str(name), len(values), section['dimentions']))
                    sys.exit(1)

                section['matrix'].append(values)

    for treename in sorted(trees):
        if not treename.endswith(('_prop', '_jaccard_')):
            continue

        print("exporting tree %s" % treename)
        treedata = trees[treename]
        names = treedata['names']
        matrix = treedata['matrix']
        outbase = "%s.%s" % (inputfile, treename)

        # Store each unordered pair once per direction; the diagonal is left
        # out.
        dissi = {}
        for pos1, name1 in enumerate(names):
            for pos2 in range(pos1):
                name2 = names[pos2]
                dissi[(name1, name2)] = matrix[pos1][pos2]
                dissi[(name2, name1)] = matrix[pos2][pos1]

        mycluster = upgma(dissi).balanced()
        mycluster.writeToFile(outbase + ".upgma", with_distances=True)
        with open(outbase + ".upgma.tree", "w") as upgmat:
            upgmat.write(mycluster.asciiArt())
        myclusterden = dendrogram.ContemporaneousDendrogram(mycluster)
        myclusterden.drawToPDF(outbase + '.upgma.tree.pdf', 1024, 2048)
        myclusterls = least_squares.WLS(dissi).evaluateTree(mycluster)
        print("UPGMA Least Square %s" % (myclusterls,))

        mytree = nj.nj(dissi).balanced()
        mytree.writeToFile(outbase + ".nj", with_distances=True)
        with open(outbase + ".nj.tree", "w") as njt:
            njt.write(mytree.asciiArt())
        mytreeden = dendrogram.ContemporaneousDendrogram(mytree)
        mytreeden.drawToPDF(outbase + '.nj.tree.pdf', 1024, 2048)
        mytreels = least_squares.WLS(dissi).evaluateTree(mytree)
        print("NJ Least Square %s" % (mytreels,))
import sys
from cogent.draw import dendrogram
from cogent.phylo import nj
import utils

# Load the distance rows; one OTU label (taken from utils.letters) is
# assigned per data line.
fn = 'data/nj_data.txt'
data = utils.load_data(fn, split_lines=True)
otus = utils.letters[:len(data)]

# Build the {(otu1, otu2): distance} dictionary expected by nj.nj; every
# whitespace-separated value on a line is paired with the OTU of its column.
dists = dict()
for t1, line in zip(otus, data):
    L = line.strip().split()
    L = [float(n) for n in L]
    for t2, e in zip(otus, L):
        dists[(t1, t2)] = e

tr = nj.nj(dists)
print tr.asciiArt()
print
# For every tip, show the name of its first listed ancestor next to the tip.
for n in tr.iterTips():
    print n.ancestors()[0].Name, n

def show_edge_children(node):
    """Print, for each non-tip yielded by ``node.iterNontips()``, the name of
    ``node``, the name of that child and the distance between them.

    The trailing commas keep all output on one line (Python 2 print
    statement), so no newline is emitted by this function.
    """
    children = node.iterNontips()
    for child in children:
        print node.Name,
        print child.Name,
        print node.distance(child),