def test_rnni(self):
    """Check that rNNI and rSPR moves on the shared tree ``x`` can be reverted.

    A random tree is built over the twelve taxa ``a`` .. ``l`` and its Newick
    string, tree length and per-node incident lengths are recorded.  For each
    eligible origin node and each child index an rNNI move is applied; from
    that state every admissible target is tried with an rSPR move, the move is
    reverted, and the result is compared with the pre-rSPR tree.  Finally the
    rNNI itself is reverted and the tree is compared with the initial
    snapshot (Robinson-Foulds distance 0, equal tree length, equal incident
    lengths).

    NOTE(review): the block nesting below was reconstructed from a flattened
    source line -- confirm against the original file.
    """
    # Reference state of the freshly generated tree.
    x.random_tree([a, b, c, d, e, f, g, h, i, j, k, l])
    s = x.newick
    etetree1 = ete2.Tree(s, format=1)
    tl = x.tree_length
    pren = {no.name: no.incident_length for no in x.all_nodes}
    for origin in x.all_nodes:
        # Only non-terminal nodes whose binary code is not "1" are used as origins.
        if origin.binary != "1" and origin.terminal is False:
            for child in [0,1]:
                # Snapshot of the topology (children names per node) and root before the rNNI.
                pre_top2 = {no.name: (no.left.name, no.right.name) if no.terminal is False else (None,None) for no in x.all_nodes}
                old_root = x.root.name
                # NOTE(review): inside a class body Python mangles `x.__rNNI` to
                # `x._<ClassName>__rNNI` -- confirm that is the attribute intended.
                x.__rNNI(origin, child)
                x.set_binary()
                for target in x.all_nodes:
                    # Targets skipped: binary code prefixed by the origin's code,
                    # origin coded "1", the origin's mother, and nodes sharing the
                    # origin's mother.
                    if target.binary.startswith(origin.binary):
                        pass
                    elif origin.binary == "1":
                        pass
                    elif origin.mother == target:
                        pass
                    elif target.mother is not None and target.mother == origin.mother:
                        pass
                    else:
                        # State right before the rSPR move.
                        s3 = x.newick
                        pren2 = {no.name: no.incident_length for no in x.all_nodes}
                        pre_top = {no.name: (no.left.name, no.right.name) if no.terminal is False else (None,None) for no in x.all_nodes}
                        old_root_2 = x.root.name
                        # `pre` is not read again in this function.
                        pre = x.__rSPR(origin, target)
                        etetree3 = ete2.Tree(s3, format=1)
                        x.set_binary()
                        # NOTE(review): this passes `pren` (lengths recorded before the
                        # rNNI), not `pren2`, although the assertion below compares
                        # against `pren2` -- confirm this is intended.
                        x.revert_topology_move(pre_top, pren, old_root_2)
                        aftn2 = {no.name: no.incident_length for no in x.all_nodes}
                        s2 = x.newick
                        etetree2 = ete2.Tree(s2, format=1)
                        # Reverted tree must match the pre-rSPR tree.
                        rf = etetree2.robinson_foulds(etetree3)[0]
                        self.assertEqual(rf, 0, "failed for " + origin.name + " and " + target.name)
                        self.assertEqual(pren2, aftn2)
                # Undo the rNNI and compare with the initial snapshot.
                x.revert_topology_move(pre_top2, pren, old_root)
                tl2 = x.tree_length
                aftn = {no.name: no.incident_length for no in x.all_nodes}
                s2 = x.newick
                etetree2 = ete2.Tree(s2, format=1)
                rf = etetree1.robinson_foulds(etetree2)[0]
                # self.assertEqual(s, s2)
                self.assertEqual(rf, 0)
                self.assertEqual(tl, tl2)
                self.assertEqual(pren, aftn)
def get_tree(tree_string):
    """Parse a Newick string into an ``ete2.Tree``; return ``None`` on failure.

    Strings that contain a ``[&`` annotation block but no ``&&NHX`` marker are
    first rewritten with the module-level ``regex1`` (and, if ``NHX`` is still
    absent, ``regex2``) using ``repl``.  Parsing is then attempted with the
    default ete2 format and, failing that, with ``format=1`` (internal node
    labels).
    """
    # FIXME: make this more elegant; once a parse succeeds, remember the
    # strategy and avoid brute force on subsequent trees.

    # Do we need regex magic?
    has_annotation_block = "[&" in tree_string
    if has_annotation_block and "&&NHX" not in tree_string:
        tree_string = regex1.sub(repl, tree_string)
        if "NHX" not in tree_string:
            tree_string = regex2.sub(repl, tree_string)

    # First attempt: parse as is.  Second attempt: internal node labels.
    for parser_kwargs in ({}, {"format": 1}):
        try:
            return ete2.Tree(tree_string, **parser_kwargs)
        except (ValueError, ete2.parser.newick.NewickError):
            continue

    # Neither strategy worked.  Give up.
    return None
def test_CallTreeMethod(self):
    """Exercise ``ProgramCaller.CallTreeMethod`` with two tree methods.

    A four-sequence FASTA file is written to ``output_dir``.  The "fasttree"
    method is expected to write to the proposed output filename and produce
    an exact Newick string; the "iqtree" method is expected to derive its own
    output filename from the unique ID, and its leaf branch lengths are
    compared with a reference tree within a 30% relative tolerance.
    """
    c = pc.ProgramCaller(config_alt)

    # Write an input file
    infn = output_dir + "Test.fa"
    with open(infn, 'wb') as outfile:
        outfile.write(">a1\nST\n>b2\nKL\n>c3\nSL\n>d4\nKT")

    # test a method that uses the proposed output filename
    exp_outfn = output_dir + "OG234.tre"
    if os.path.exists(exp_outfn):
        os.remove(exp_outfn)
    outfn = c.CallTreeMethod("fasttree", infn, exp_outfn, "OGabc_id")
    # Identity check instead of `!= None`, which would go through __ne__.
    self.assertIsNotNone(outfn)
    self.assertEqual(outfn, exp_outfn)
    expectedTree = "(a1:1.10312,c3:0.00055,(b2:0.00055,d4:1.46340)0.761:1.41871);"
    with open(outfn, 'rb') as infile:
        tree = infile.read().rstrip()
    self.assertEqual(expectedTree, tree)

    # test a method that generates its own output filename based on unique ID
    exp_outfn = output_dir + "OG234_id.treefile"
    if os.path.exists(exp_outfn):
        os.remove(exp_outfn)
    outfn = c.CallTreeMethod("iqtree", infn, output_dir + "OG234.tre",
                             "OG234_id")
    self.assertEqual(outfn, exp_outfn)
    expectedTree = ete2.Tree(
        "(a1:0.7475350209,(b2:0.0000026604,c3:1.2318876921):1.279964,d4:0.0000022111);"
    )
    actualTree = ete2.Tree(outfn)
    # Iterating the expected tree visits its leaves; look each one up by name
    # in the actual tree and compare branch lengths relatively.
    for n in expectedTree:
        x = (actualTree & n.name).dist
        self.assertLess(abs(x - n.dist) / n.dist, 0.3, (n.dist, x))
def run():
    """Print one ``TREE tree_<n> = <newick>`` line per input tree.

    Trees are read one per line through ``fileinput.input(files)``.  The
    sorted leaf names of the first tree are handed to ``print_middle``, and
    in every tree each leaf whose name is in that list is renamed to its
    1-based position in it.  ``print_footer`` is called after the last tree.

    Returns:
        0, always.
    """
    # Parse options
    parser = optparse.OptionParser(__doc__)
    options, files = parser.parse_args()

    # Read trees
    number_for = None
    for count, line in enumerate(fileinput.input(files), 1):
        t = ete2.Tree(line)
        leaves = t.get_leaves()
        # If first tree, get names
        if number_for is None:
            names = sorted(l.name for l in leaves)
            print_middle(names)
            # Build the name -> number table once instead of scanning the
            # list for every leaf.  setdefault keeps the first position of a
            # repeated name, exactly as list.index() did.
            number_for = {}
            for position, name in enumerate(names, 1):
                number_for.setdefault(name, str(position))
        for l in leaves:
            number = number_for.get(l.name)
            if number is not None:
                l.name = number
        # Print tree line
        print("TREE tree_%d = %s" % (count, t.write()))
    print_footer()

    # Done
    return 0
def drawtrees(error, label):
    """Render ml.tree, mp.tree, nj.tree and ftree.tree to PNG images.

    Each file is read, parsed with ete2 and rendered to ``<basename>.png``.
    Progress text is reported through ``addtext(label, ...)``.

    Args:
        error: object with a ``write`` method that receives error messages.
        label: passed through to ``addtext``.

    Returns:
        ``error`` if a tree file cannot be read or parsed (remaining files
        are not processed), otherwise ``None``.
    """
    # Generate graphical representation of all trees.
    addtext(label, "Drawing trees")
    # Import tree drawing modules
    import os.path
    import ete2
    start = time.time()
    # Trees to draw
    files = ["ml.tree", "mp.tree", "nj.tree", "ftree.tree"]
    # Loop over filenames and render image
    for tfile in files:
        # Only reading the file is guarded by the "not found" handler; the
        # handle is closed as soon as the text has been read.
        try:
            with open(tfile, 'r') as handl:
                newick = handl.read()
        except IOError:
            error.write("File: " + tfile + " not found")
            return error
        try:
            tree = ete2.Tree(newick)
        except Exception:
            # Stop here: there is no valid tree to render for this file.
            error.write(tfile + " corrupted")
            return error
        # splitext removes only the extension.  The previous regex '.tree'
        # also matched the leading "ftree" of "ftree.tree", producing an
        # empty name and an output file called ".png".
        name = os.path.splitext(tfile)[0]
        tree.render(name + '.png')
    end = time.time()
    total = end - start
    text = "All trees drawn in " + str(total) + " seconds"
    addtext(label, text)
    return None
def random_topology(
    self,
    nspecies,
    names=None,
    rooted=False,
):
    """Use ete2 to make a random topology and return it as a ``Tree``.

    An empty ``ete2.Tree`` is populated with ``nspecies`` leaves, then either
    rooted on its first child (``rooted=True``) or unrooted.  The Newick
    output has every ``)1`` replaced by ``)`` before it is wrapped in a
    ``Tree`` named 'random tree'; the value returned is that tree's
    ``pam2sps('strip')`` result.

    Args:
        nspecies: number of leaves passed to ``populate``.
        names: optional leaf names passed as ``names_library``.  They are
            shuffled on a private copy, so the caller's sequence is left
            untouched.
        rooted: root on ``t.children[0]`` if true, otherwise unroot.
    """
    if names:
        # Shuffle a copy: the caller's list must not be reordered as a side
        # effect of drawing a random tree.
        names = list(names)
        random.shuffle(names)
    t = ete2.Tree()
    t.populate(nspecies, names_library=names)
    if rooted:
        t.set_outgroup(t.children[0])
    else:
        t.unroot()
    t_as_newick = t.write()
    # Drop the '1' written directly after closing parentheses.
    t_as_newick = t_as_newick.replace(')1', ')')
    return Tree(t_as_newick, name='random tree').pam2sps('strip')
def check_supersets(tree):
    """Rearrange the children of ``tree`` according to their mutation sets.

    For every ordered pair of distinct children: a child whose ``mutations``
    are a subset of the other's is detached and re-attached beneath that
    other child; children with a non-empty intersection are both moved under
    a new 'NoName' node carrying the shared mutations.  The function then
    recurses into children.

    Returns:
        ``False`` for a leaf; otherwise ``True`` if a subset move was made
        here or a recursive call reported a move.

    NOTE(review): the nesting of the ``overlap`` block and of the recursive
    call was reconstructed from a flattened source line -- confirm against
    the original file.
    """
    if tree.is_leaf():
        return False
    moved = False
    # NOTE(review): detach()/add_child() below change `tree.children` while
    # both loops are iterating over it -- confirm that ete2 tolerates this
    # the way the author expects.
    for c1 in tree.children:
        for c2 in tree.children:
            if c1 == c2:
                continue
            if c1.mutations.issubset(c2.mutations):
                # c1 carries nothing beyond c2: hang c1 beneath c2.
                c1.detach()
                c2.add_child(c1)
                moved = True
            elif c2.mutations.issubset(c1.mutations):
                c2.detach()
                c1.add_child(c2)
                moved = True
            overlap = c1.mutations.intersection(c2.mutations)
            if len(overlap) > 0:
                # Shared mutations: group both children under a new
                # intermediate node holding the intersection.  Note that
                # `moved` is not set on this path.
                c1.detach()
                c2.detach()
                intermediate = instantiate_node(ete2.Tree(name='NoName'))
                intermediate.mutations = overlap
                intermediate.add_child(c1)
                intermediate.add_child(c2)
                tree.add_child(intermediate)
        # Short-circuits: once `moved` is True the recursion is skipped.
        moved = moved or check_supersets(c1)
    return moved
def populate_tree(session, newick, germline_seq, removed_muts):
    """Parse ``newick`` and annotate every named node from the database.

    Nodes named 'NoName', 'germline' or '' are passed to
    ``instantiate_node``.  Every other node name is looked up as
    ``Sequence.ai``; the node is renamed to the sequence's ``seq_id`` and
    receives three features: ``seq_ids`` (details of each sequence returned
    by ``get_seqs_collapsed_to``), ``copy_number`` (their summed copy
    numbers) and ``mutations`` (from ``get_mutations`` on the sequence with
    ``removed_muts`` removed).

    Returns:
        The annotated ``ete2.Tree``.
    """
    placeholder_names = ('NoName', 'germline', '')
    tree = ete2.Tree(newick)
    for node in tree.traverse():
        if node.name in placeholder_names:
            node = instantiate_node(node)
            continue

        seq = session.query(Sequence).filter(
            Sequence.ai == node.name).first()

        seq_ids = {}
        for member in get_seqs_collapsed_to(session, seq):
            sample = member.sample
            seq_ids[member.seq_id] = {
                'ai': member.ai,
                'tissue': sample.tissue,
                'subset': sample.subset,
                'ig_class': sample.ig_class,
                'copy_number': member.copy_number,
                'sample_name': sample.name,
                'sample_id': sample.id
            }

        node.name = seq.seq_id
        node.add_feature('seq_ids', seq_ids)
        total_copies = sum(info['copy_number'] for info in seq_ids.values())
        node.add_feature('copy_number', total_copies)

        modified_seq = remove_muts(seq.sequence, removed_muts, germline_seq)
        # Keys of the stored JSON object, converted to int.
        mutation_keys = map(int, json.loads(seq.mutations_from_clone))
        node.add_feature(
            'mutations',
            get_mutations(germline_seq, modified_seq, mutation_keys))
    return tree
def import_phylo(phylo_list, biodb):
    """Store phylogenies in ``biosqldb_phylogenies.BBH_<biodb>``.

    The table is created if it does not exist.  Each entry of ``phylo_list``
    is parsed with ete2; the orthogroup is taken from the first leaf whose
    name is found in the locus-tag -> orthogroup mapping, and the pair
    (orthogroup, Newick string) is inserted.

    A phylogeny with no mapped leaf is reported and skipped.  Previously the
    orthogroup of the preceding phylogeny was silently reused (or a
    NameError was raised for the first one).

    Args:
        phylo_list: sequence of inputs accepted by ``ete2.Tree``.
        biodb: database name, used for the lookup and in the table name.
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    import biosql_own_sql_tables
    import ete2

    server, db = manipulate_biosqldb.load_db(biodb)
    sql = 'create table IF NOT EXISTS biosqldb_phylogenies.BBH_%s (orthogroup varchar(100), phylogeny text, INDEX orthogroup (orthogroup));' % biodb
    server.adaptor.execute(sql, )
    locuslag2orthogroup = biosql_own_sql_tables.locus_tag2orthogroup(biodb)
    total = len(phylo_list)
    for n, phylo in enumerate(phylo_list):
        print("%s/%s" % (n, total))
        t = ete2.Tree(phylo, format=0)
        # The first leaf known to the mapping decides the orthogroup.
        for leaf in t.iter_leaves():
            try:
                orthogroup = locuslag2orthogroup[leaf.name]
            except LookupError:
                continue
            break
        else:
            print("no orthogroup found, skipping phylogeny: %s" % phylo)
            continue
        # NOTE(review): the statement is assembled by string formatting; a
        # double quote in the Newick text or orthogroup name would break it.
        # Consider the driver's parameter binding for the two values.
        sql = 'insert into biosqldb_phylogenies.BBH_%s values ("%s", "%s");' % (
            biodb, orthogroup, t.write())
        try:
            server.adaptor.execute(sql, )
        except Exception:
            # Best effort: report the offending phylogeny and carry on.
            print(phylo)
        # NOTE(review): commit placement (per phylogeny vs. once after the
        # loop) could not be recovered from the flattened source -- confirm.
        server.commit()
def build_tree_from_dict(dict_tree, tree=None):
    """Recursively turn a nested dict into an ete2 tree.

    Each key becomes a child node named after the key; a truthy value is
    treated as a nested dict describing that child's own children.

    Args:
        dict_tree: mapping of node name -> nested mapping (or a falsy value).
        tree: node to attach to; a new node named "root" is created if None.

    Returns:
        The node the children were attached to.
    """
    parent = ete2.Tree(name="root") if tree is None else tree
    for label, nested in dict_tree.iteritems():
        child = parent.add_child(name=label)
        if nested:
            build_tree_from_dict(nested, child)
    return parent
def visualize(self,savepath='tree.txt',write_perm='False'):
    """Print the tree and optionally write the same text to ``savepath``.

    The Newick string from ``make_newick(self)`` is parsed with ete2
    (``format=1``), stored on ``self.newick`` and printed.

    Args:
        savepath: file that receives ``str(self.newick)`` when writing.
        write_perm: whether to write the file.  The string 'False' (any
            case, surrounding blanks ignored) and the empty string count as
            false.  The default 'False' used to be treated as true because
            it is a non-empty string, so the file was always written.
    """
    newick = make_newick(self) + ';'
    self.newick = ete2.Tree(newick, format=1)
    print(self.newick)

    should_write = write_perm
    if isinstance(write_perm, str):
        should_write = write_perm.strip().lower() not in ('', 'false')
    if should_write:
        # `with` closes the file even if the write fails.
        with open(savepath, 'w') as f:
            f.write(str(self.newick))
def consume(self, stream):
    """Yield an ``ete2.Tree`` for every parseable string in ``stream``.

    Each string is tried with the default ete2 format first and with
    ``format=1`` (internal node labels) second; strings that fail both
    attempts are skipped silently.
    """
    parse_attempts = ({}, {"format": 1})
    for tree_string in stream:
        for extra_kwargs in parse_attempts:
            try:
                yield ete2.Tree(tree_string, **extra_kwargs)
            except (ValueError, ete2.parser.newick.NewickError):
                # This strategy failed; try the next one, or give up.
                continue
            break
def run():
    """Render or display trees read one per line via ``fileinput``.

    Options control an optional leaf attribute used for coloured circle
    faces (-a), the attribute used as text label (-l), the output file (-o)
    and its size/units/dpi (-H, -w, -u, -d), and whether more than one tree
    is processed (-m).  Without -m the function returns after the first
    tree.

    Returns:
        0, always.

    NOTE(review): block nesting was reconstructed from a flattened source
    line -- confirm against the original file.
    """
    # Parse options
    parser = optparse.OptionParser(__doc__)
    parser.add_option('-a', '--attribute', dest="attribute", default=None)
    parser.add_option('-d', '--dpi', type="int", default=None)
    parser.add_option('-H', '--height', type="int", dest="h", default=None)
    parser.add_option('-l', '--label', default="name")
    parser.add_option('-m', '--multiple', default=False, action="store_true")
    parser.add_option('-o', '--output', default=None)
    parser.add_option('-u', '--units', default="px")
    parser.add_option('-w', '--width', type="int", dest="w", default=None)
    options, files = parser.parse_args()

    # Setup TreeStyle
    ts = ete2.TreeStyle()
    ts.show_scale = False
    ts.show_branch_support = True

    # Read trees
    for n, line in enumerate(fileinput.input(files)):
        t = ete2.Tree(line)

        # Add faces
        if options.attribute:
            # One colour per distinct attribute value found on the leaves.
            values = set(
                [getattr(l, options.attribute) for l in t.get_leaves()])
            colours = get_colour_set(len(values))
            colour_map = dict(zip(values, colours))
            for l in t.iter_leaves():
                mycolour = colour_map[getattr(l, options.attribute)]
                l.add_face(
                    ete2.CircleFace(radius=10, color=mycolour, style="sphere"), 0)
        # Text label in column 1 for every leaf.
        for l in t.iter_leaves():
            l.add_face(ete2.TextFace(getattr(l, options.label)), 1)

        # Plot or save
        if options.output:
            kw = {}
            # Size-related keywords are forwarded only if a height or width
            # was given, and only those with a truthy value.
            if options.h or options.w:
                for o in ("h", "w", "units", "dpi"):
                    if getattr(options, o):
                        kw[o] = getattr(options, o)
            if options.multiple:
                # Number the output files: <base>_000001<ext>, ...
                base, ext = os.path.splitext(options.output)
                filename = base + ("_%06d" % (n + 1)) + ext
            else:
                filename = options.output
            # NOTE(review): `ultrametric` is not defined in this function;
            # presumably a module-level layout function -- confirm.
            t.render(filename, ultrametric, tree_style=ts, **kw)
        else:
            t.show(ultrametric, tree_style=ts)

        if not options.multiple:
            return 0

    return 0
def SupportedHierachies_wrapper(treeName, GeneToSpecies, species, dict_clades, clade_names):
    """Load a tree file and run ``SupportedHierachies`` on it.

    Returns an empty list when the file does not exist or when the leaves
    map to fewer than four distinct species; otherwise returns whatever
    ``SupportedHierachies`` returns.
    """
    if not os.path.exists(treeName):
        return []

    tree = ete2.Tree(treeName, format=1)
    genes = set(tree.get_leaf_names())
    species_present = list({GeneToSpecies(gene) for gene in genes})
    if len(species_present) < 4:
        return []

    return SupportedHierachies(tree, genes, species_present, GeneToSpecies,
                               species, dict_clades, clade_names, treeName)
def ConvertTree(treeString):
    """Renumber sequences per species in a tree with iSp_jSeq leaf names.

    Each leaf name is split on "_" into exactly two parts; the second part
    is replaced by a counter that runs 0, 1, ... separately for every first
    part, in the order leaves are visited.

    Returns:
        The rewritten tree as Newick text followed by a newline.
    """
    tree = ete2.Tree(treeString)
    next_index = defaultdict(int)
    for leaf in tree:
        species_id, _old_seq = leaf.name.split("_")
        leaf.name = "%s_%d" % (species_id, next_index[species_id])
        next_index[species_id] += 1
    return tree.write() + "\n"
def RootGeneTreesArbitrarily(treesPat, nOGs, outputDir):
    """Root each gene tree, resolve polytomies and write it to ``outputDir``.

    Input files are ``treesPat % i`` for ``i`` in ``range(nOGs)``; only names
    ending in ".txt" are processed.  A tree that does not already have two
    children at the root is rooted on its midpoint outgroup, or on its first
    leaf when the total length is zero or all branch lengths are equal.  If
    anything in that procedure fails, the tree is re-read and rooted on its
    first leaf instead.  Trees for which both attempts fail are listed in
    ``outputDir + 'root_errors.txt'`` with the first error, and a warning is
    printed.
    """
    # Pair each input with its output before filtering, so the two can never
    # get out of step if some names are filtered out.
    jobs = []
    for i in range(nOGs):
        treeFN = treesPat % i
        if treeFN.endswith(".txt"):
            jobs.append((treeFN, outputDir + os.path.split(treeFN)[1]))

    nErrors = 0
    with open(outputDir + 'root_errors.txt', 'wb') as errorfile:
        for treeFN, outFN in jobs:
            try:
                t = ete2.Tree(treeFN)
                if len(t.get_children()) != 2:
                    R = t.get_midpoint_outgroup()
                    # if it's a tree with 3 genes all with zero length
                    # branches then root arbitrarily (this could happen with
                    # more than 3 nodes); more generally, all branches could
                    # share any one length
                    if GetTotalLength(t) == 0.0 or AllEqualBranchLengths(t):
                        R = next(iter(t))
                    t.set_outgroup(R)
                t.resolve_polytomy()
                t.write(outfile=outFN)
            except Exception as err:
                # Fallback: start from a fresh parse and root on the first leaf.
                try:
                    t = ete2.Tree(treeFN)
                    t.set_outgroup(next(iter(t)))
                    t.resolve_polytomy()
                    t.write(outfile=outFN)
                except Exception:
                    # `Exception`, not a bare except, so Ctrl-C still works.
                    errorfile.write(treeFN + ": " + str(err) + '\n')
                    nErrors += 1
    if nErrors != 0:
        print("WARNING: Some trees could not be rooted")
        print(
            "Usually this is because the tree contains genes from a single species."
        )
def reroot(tree_filepath, output_prefix=None, outgroup="IMG_2540341180"):
    """Re-root a tree on a single outgroup and write it in Newick format 0.

    Args:
        tree_filepath: input accepted by ``ete2.Tree``.
        output_prefix: file to write to; if None the Newick text is printed.
        outgroup: node (or node name) passed to ``set_outgroup``.  Defaults
            to the taxon that used to be hard-coded.
    """
    t = ete2.Tree(tree_filepath)
    t.set_outgroup(outgroup)
    if output_prefix is not None:
        t.write(format=0, outfile=output_prefix)
    else:
        print(t.write(format=0))
    sys.stderr.write(
        "Reroot completed successfully. You can ignore previous errors.\n")
def build_consensus_tree(self):
    """Build an annotated consensus tree from the collected clade statistics.

    Clades with probability >= ``self.frequency`` are greedily grouped by
    ``recursive_builder`` starting from a star tree of all leaves.  Every
    internal node then receives mean/median/interval features for its age
    and for each attribute in ``self.cp.clade_attributes``, and the branch
    lengths of its children are set from the mean age.

    Returns:
        The annotated ``ete2.Tree``.

    NOTE(review): block nesting was reconstructed from a flattened source
    line -- confirm against the original file.
    """
    # Build a list of all clades in the treestream with frequency above the
    # requested threshold, sorted first by size and then by frequency. Do not
    # include the trivial clade of all leaves.
    clades = []
    for clade, p in self.cp.clade_probs.items():
        if p >= self.frequency:
            clade = clade.split(",")
            clades.append((len(clade), p, set(clade)))
    clades.sort()
    # The last entry after sorting is taken to be the clade of all leaves;
    # this raises IndexError if no clade reached the threshold.
    junk, trash, all_leaves = clades.pop()
    clades.reverse()

    # Start out with a tree in which all leaves are joined in one big polytomy
    t = ete2.Tree()
    for l in all_leaves:
        t.add_child(name=l)

    # Now recursively resolve the polytomy by greedily grouping clades
    t = recursive_builder(t, clades)
    cache = t.get_cached_content()

    # Add age annotations
    for clade in t.traverse("postorder"):
        if clade.is_leaf():
            continue
        # Key format: sorted leaf names joined by commas.
        clade_key = ",".join(sorted([l.name for l in cache[clade]]))
        ages = self.cp.clade_ages[clade_key]
        mean = sum(ages) / len(ages)
        # Child branch length = mean clade age minus the distance from the
        # child to its farthest leaf.
        for c in clade.get_children():
            leaf, age = c.get_farthest_leaf()
            c.dist = mean - age
        # Sorts the list stored in self.cp.clade_ages in place.
        ages.sort()
        lower, median, upper = [
            ages[int(x * len(ages))] for x in 0.05, 0.5, 0.95
        ]
        clade.add_feature("age_mean", mean)
        clade.add_feature("age_median", median)
        clade.add_feature("age_HPD", "{%f-%f}" % (lower, upper))

        # Same summary for every other recorded attribute; note the interval
        # here uses the 0.025/0.975 positions, unlike 0.05/0.95 for ages.
        for f in self.cp.clade_attributes:
            values = self.cp.clade_attributes[f][clade_key]
            mean = sum(values) / len(values)
            values.sort()
            lower, median, upper = [
                values[int(x * len(values))] for x in 0.025, 0.5, 0.975
            ]
            clade.add_feature("%s_mean" % f, mean)
            clade.add_feature("%s_median" % f, median)
            clade.add_feature("%s_HPD" % f, "{%f-%f}" % (lower, upper))
    return t
def reference_ml_tree(self, fasta):
    """Run FastTree on an alignment and also save a polytomy-free copy.

    The alignment ``<msa_folder>/<fasta>`` is passed to the ``self.fasttree``
    executable with ``-wag -gamma``; the result is written to
    ``<tree_folder>/<fasta>.fastTree``.  The tree is then loaded, its
    polytomies are resolved and it is written next to the original with the
    suffix ``-no_polytomies``.  The process exits if the command returns a
    non-zero status.

    Returns:
        Tuple of the two output file names (without folder).
    """
    alignment_path = '%s/%s' % (self.msa_folder, fasta)
    raw_tree_path = '%s/%s.fastTree' % (self.tree_folder, fasta)

    command = '%s -wag -gamma -out %s %s' % (
        self.fasttree, raw_tree_path, alignment_path)
    exit_status = system(command)
    if exit_status:
        exit('**Error while running:\n\tfastTree')

    resolved = ete2.Tree(raw_tree_path)
    resolved.resolve_polytomy()
    resolved.write(outfile='%s-no_polytomies' % raw_tree_path)

    return ('%s.fastTree' % fasta, '%s.fastTree-no_polytomies' % fasta)
def reroot(tree_filepath, output_prefix=None,
           outgroup_taxa=("IMG_2264867067", "IMG_638154511")):
    """Re-root a tree on the common ancestor of the given taxa.

    Args:
        tree_filepath: input accepted by ``ete2.Tree``.
        output_prefix: file to write to (Newick format 0); if None the
            Newick text is printed.
        outgroup_taxa: nodes (or node names) whose common ancestor becomes
            the outgroup.  Defaults to the two taxa that used to be
            hard-coded.
    """
    t = ete2.Tree(tree_filepath)
    ancestor = t.get_common_ancestor(*outgroup_taxa)
    t.set_outgroup(ancestor)
    if output_prefix is not None:
        t.write(format=0, outfile=output_prefix)
    else:
        print(t.write(format=0))
    sys.stderr.write(
        "Reroot completed successfully. You can ignore previous errors.\n")
def as_ete_object(o):
    """Convert ``o`` to its ete2 representation.

    ``ete2.Tree`` objects are returned unchanged; dendropy trees and nodes
    are converted through their Newick string; lists and dendropy tree lists
    are converted element by element.

    Raises:
        ValueError: for any other type.
    """
    if isinstance(o, ete2.Tree):
        return o
    if isinstance(o, (dendropy.Tree, dendropy.Node)):
        newick_text = o.as_newick_string() + ";"
        return ete2.Tree(newick_text)
    if isinstance(o, (list, dendropy.TreeList)):
        return [as_ete_object(member) for member in o]
    raise ValueError(
        "Object of type '%s' does not have a native ete2 representation" %
        type(o))
def derive_tree_from_splits(current_node, parent_hash, taxon_order, splits):
    """Grow a bifurcating subtree below ``current_node`` from split data.

    The split recorded for ``parent_hash`` is resolved into two child clade
    hashes and a new node is attached for each.  A child clade of size one
    becomes a leaf named after its single taxon; a larger child is expanded
    recursively.
    """
    first_hash, second_hash = elucidate_cc_split(parent_hash,
                                                 splits[parent_hash])

    # Attach both empty children before descending into either of them.
    first_node = ete2.Tree()
    second_node = ete2.Tree()
    current_node.add_child(first_node)
    current_node.add_child(second_node)

    branches = (
        (first_node, first_hash, clade_size(first_hash)),
        (second_node, second_hash, clade_size(second_hash)),
    )
    for child_node, child_hash, n_taxa in branches:
        if n_taxa == 1:
            child_node.name = clade_taxon_names(child_hash, taxon_order)[0]
        else:
            derive_tree_from_splits(child_node, child_hash, taxon_order,
                                    splits)
def draw_tree(ptree, labels=None):
    """Display ``ptree`` in the ete2 viewer, optionally coloured by label.

    One ete2 node named ``<node>[<index>]`` is created per entry of
    ``ptree.nodes`` and linked according to ``ptree.parents``.  When
    ``labels`` is given, ``{<label>}`` is appended to each name and each node
    gets a background colour from ``color_map``.  Internal nodes also show
    their name as a text face above the branch.

    Args:
        ptree: object with parallel ``nodes`` and ``parents`` sequences.
        labels: optional per-node labels used as indices into the colour
            map.  With the default ``None`` the tree is drawn uncoloured;
            previously this raised a TypeError.
    """
    root = ete2.Tree(name='root')
    T = [
        ete2.Tree(name=(str(node) + '[' + str(i) + ']'))
        for i, node in enumerate(ptree.nodes)
    ]
    has_labels = labels is not None
    if has_labels:
        for t, lab in zip(T, labels):
            t.name += '{' + str(lab) + '}'

    for i, p in enumerate(ptree.parents):
        # NOTE(review): a parent index of 0 attaches the node to the
        # artificial root rather than to T[0] -- confirm this matches the
        # convention used by ptree.parents.
        if p > 0:
            T[p].add_child(T[i])
        else:
            root.add_child(T[i])

    if has_labels:
        cmap = color_map(max(labels) + 2)
        labelled = zip(T, labels)
    else:
        cmap = None
        labelled = [(t, None) for t in T]

    for t, l in labelled:
        if cmap is not None:
            ns = ete2.NodeStyle()
            ns['bgcolor'] = cmap[l]
            t.set_style(ns)
        if not t.is_leaf():
            t.add_face(ete2.TextFace(t.name), column=0, position='branch-top')
    root.show()
def tree_ete(self):
    """Return ``self.tree`` loaded as an ETE2 tree, rooted and ladderized.

    The tree is rooted on the node named 'V', of which exactly one must
    exist.  Attributes can later be added to the leaves of the returned
    object for use in comparisons.
    """
    # Load it #
    loaded = ete2.Tree(self.tree)
    # Root it #
    outgroup_matches = loaded.search_nodes(name='V')
    assert len(outgroup_matches) == 1
    outgroup = outgroup_matches[0]
    loaded.set_outgroup(outgroup)
    loaded.ladderize()
    # Return results #
    return loaded
def draw_ete2_tree(organism, snplist, tree_file_name, config, c):
    '''Draws a phylogenetic tree using ETE2

    Keyword arguments:
    organism -- the organism of which to make a tree
    snplist -- a list of SNP entries; entry[0] is matched against node
               names, entry[1] and entry[3] decide the node appearance
    tree_file_name -- the file the tree is rendered to
    config -- configuration mapping; config["dev"] enables a debug print
    c -- passed through to tree_to_newick

    '''
    black = "#000000"
    grey = "#DDDDDD"
    # Default look: red node, solid black lines of width 2.
    default_look = (
        ("fgcolor", "#BE0508"),
        ("size", 10),
        ("vt_line_color", black),
        ("hz_line_color", black),
        ("vt_line_type", 0),
        ("hz_line_type", 0),
        ("vt_line_width", 2),
        ("hz_line_width", 2),
    )
    # SNP is Derived in snplist: larger green node.
    derived_look = (
        ("fgcolor", "#99FF66"),
        ("size", 15),
        ("vt_line_color", black),
        ("hz_line_color", black),
        ("vt_line_type", 0),
        ("hz_line_type", 0),
    )
    # SNP is missing due to a gap: grey node and grey lines of type 1.
    gap_look = (
        ("fgcolor", grey),
        ("size", 10),
        ("vt_line_color", grey),
        ("hz_line_color", grey),
        ("vt_line_type", 1),
        ("hz_line_type", 1),
    )

    newick = tree_to_newick(organism, config, c)
    tree = ete2.Tree(newick, format=1)
    farthest_leaf = tree.get_farthest_leaf()[0]
    tree_depth = int(tree.get_distance(farthest_leaf))

    for node in tree.traverse():
        # Collect the settings in application order; later matches in
        # snplist override earlier ones.
        settings = list(default_look)
        for snp in snplist:
            if node.name != snp[0]:
                continue
            if snp[1] == snp[3]:
                settings.extend(derived_look)
            elif snp[3] == "-":
                settings.extend(gap_look)
        nstyle = ete2.NodeStyle()
        for key, value in settings:
            nstyle[key] = value
        node.set_style(nstyle)

    ts = ete2.TreeStyle()
    ts.show_leaf_name = False  # leaf names are added in the layout instead
    ts.show_scale = False  # Do not show the scale
    ts.layout_fn = CanSNPer_tree_layout  # Use the custom layout
    ts.optimal_scale_level = 'full'  # Fully expand the branches of the tree
    if config["dev"]:
        print("#[DEV] Tree file: %s" % tree_file_name)
    tree.render(tree_file_name, tree_style=ts, w=tree_depth * 500)
def save_tree_to_file(self,filepath):
    """Render this tree to ``filepath``, rotated by 90 degrees, 500 wide.

    The Newick text from ``make_newick(self)`` (plus a closing ';') is parsed
    with ete2 ``format=1`` and kept on ``self.newick`` before rendering.
    """
    newick_text = make_newick(self) + ';'
    self.newick = ete2.Tree(newick_text, format=1)

    rotated_style = ete2.TreeStyle()
    rotated_style.rotation = 90
    self.newick.render(filepath, w=500, tree_style=rotated_style)
def __init__(self, newick_strings):
    """Store the Newick strings and build a topology array for each.

    Every string is parsed with ete2.  The sorted leaf names of the first
    tree become ``self.taxon_order``.  ``self.generate_topology_array`` is
    called once per string, in order.
    """
    self.taxon_order = []
    self.topology_arrays = []
    self.newick_strings = newick_strings
    self.n_topologies = len(self.newick_strings)

    for position, ns in enumerate(self.newick_strings):
        # Every string is parsed, although only the first tree's leaves are used.
        parsed = ete2.Tree(ns)
        if position == 0:
            self.taxon_order = sorted(parsed.get_leaf_names())
        self.generate_topology_array(ns)
def render_tree(self):
    """Open this tree in the ete2 viewer, rotated by 90 degrees.

    The Newick text from ``make_newick(self)`` (plus a closing ';') is parsed
    with ete2 ``format=8`` and kept on ``self.newick`` before it is shown.
    """
    newick_text = make_newick(self) + ';'
    self.newick = ete2.Tree(newick_text, format=8)

    rotated_style = ete2.TreeStyle()
    rotated_style.rotation = 90
    self.newick.show(tree_style=rotated_style)
def RenameTreeTaxa(self, treeFN, newTreeFilename, idsMap, qFixNegatives=False):
    """Rename the leaves of a tree file and write the result (format 4).

    Args:
        treeFN: tree input accepted by ``ete2.Tree``.
        newTreeFilename: file the renamed tree is written to.
        idsMap: mapping from current leaf name to new leaf name.
        qFixNegatives: if true, negative branch lengths are set to 0.0.

    The operation is best-effort: if the tree cannot be read, a leaf is
    missing from ``idsMap`` or writing fails, nothing is raised and the
    output file may be absent.
    """
    try:
        tree = ete2.Tree(treeFN)
        for node in tree.get_leaves():
            node.name = idsMap[node.name]
        if qFixNegatives:
            for n in tree.traverse():
                if n.dist < 0.0:
                    n.dist = 0.0
        tree.write(outfile=newTreeFilename, format=4)
    except Exception:
        # Still deliberately silent, but no longer a bare `except`, so
        # KeyboardInterrupt and SystemExit are not swallowed.
        pass
def calculate_topology_probabilities(ts):
    """Tally topologies, clades and conditional-clade splits over ``ts``.

    For every tree the "f0" column of its array is hashed to identify the
    topology; the first time a topology is seen its Newick string is stored
    without branch lengths.  For every (clade, split) row the clade size is
    recorded, and for clades of three or more taxa the split is counted per
    parent clade, keeping the first row seen for each split.

    Returns:
        Tuple ``(topology_set, topology_counts, cc_sets, cc_counts,
        clades_set)``.
    """
    topology_counts = {}
    topology_data = {}
    cc_counts = {}
    cc_data = {}
    clade_sizes = {}

    for tree_index in range(ts.n_trees):
        tree_array = ts.tree_arrays[tree_index]
        # topology hash is concatenated, sorted clade hashes
        topology_hash = tree_array["f0"].tostring()
        if topology_hash in topology_counts:
            topology_counts[topology_hash] += 1
        else:
            # record topology, written with format 9 to strip branch lengths
            tree_root = ete2.Tree(ts.newick_strings[tree_index])
            topology_data[topology_hash] = tree_root.write(format = 9)
            topology_counts[topology_hash] = 1

        # we are only interested in clade & split hashes, not node heights
        for node in tree_array[["f0", "f1"]]:
            parent_hash = node[0].tostring()  # the hash for the clade
            split_hash = node[1].tostring()  # the hash for the bifurcation
            n_node_taxa = clade_size(parent_hash)
            clade_sizes[parent_hash] = n_node_taxa
            if n_node_taxa < 3:
                continue

            # record conditional clade
            counts_for_parent = cc_counts.setdefault(parent_hash, {})
            if split_hash in counts_for_parent:
                counts_for_parent[split_hash] += 1
            else:
                cc_data.setdefault(parent_hash, {})[split_hash] = node
                counts_for_parent[split_hash] = 1

    clades_set = CladeProbabilities(clade_sizes)
    topology_set = TopologyProbabilities(topology_data)
    cc_sets = {}
    for parent_hash, splits_data in cc_data.items():
        cc_sets[parent_hash] = DiscreteProbabilities(splits_data)

    return topology_set, topology_counts, cc_sets, cc_counts, clades_set