def c_toArbre(self):
    """Convert this command node into a two-child ``TreeNode`` subtree.

    The returned node is named after ``self.value``.  Its children depend
    on the command:

    * ``"="``: an ``"Id : <name>"`` leaf built from ``self.sons[0]`` and
      the expression tree of ``self.sons[1]``.
    * ``";"``: the command trees of both sons.
    * anything else: the expression tree of ``self.sons[0]`` followed by
      the command tree of ``self.sons[1]``.
    """
    node = TreeNode()
    node.name = self.value

    if self.value == "=":
        left = TreeNode()
        left.name = "Id : " + self.sons[0]
        right = self.sons[1].e_toArbre()
    elif self.value == ';':
        left = self.sons[0].c_toArbre()
        right = self.sons[1].c_toArbre()
    else:
        left = self.sons[0].e_toArbre()
        right = self.sons[1].c_toArbre()

    node.add_child(left)
    node.add_child(right)
    return node
def recreate_tree(tree, num_layers=None, color=True):
    """Build a tree with the same topology as ``tree`` but only names and styling.

    Coordinate and metadata features of the source nodes are not copied.
    Each new node gets size 10 and a sphere shape; when ``color`` is true
    its foreground color is taken from the source node's ``color``
    attribute.

    Args:
        tree: root of the tree to copy.
        num_layers: if not None, stop after this many layers of children
            have been copied.
        color: whether to copy each node's ``color`` into its style.

    Returns:
        The root of the new tree.
    """

    def _styled_twin(source):
        # Fresh node carrying only the name and the display style.
        twin = TreeNode(name=source.name)
        twin.img_style['size'] = 10
        if color:
            twin.img_style['fgcolor'] = source.color
        twin.img_style['shape'] = 'sphere'
        return twin

    new_tree = _styled_twin(tree)

    # Breadth-first walk over (source node, copied node) pairs.
    frontier = [(tree, new_tree)]
    depth = 0
    while frontier:
        upcoming = []
        for source_node, copied_node in frontier:
            for source_child in source_node.children:
                copied_child = _styled_twin(source_child)
                copied_node.add_child(copied_child)
                upcoming.append((source_child, copied_child))
        frontier = upcoming
        depth += 1
        if num_layers is not None and depth == num_layers:
            break
    return new_tree
def plot_marker_tree(tree, marker, resize_nodes=False, save=True):
    """Render a copy of ``tree`` whose nodes are colored by average marker level.

    The expression table and the cluster annotations are read from fixed
    relative paths, rows belonging to clusters annotated as 'dirt' are
    dropped, and ``get_node_markers`` is evaluated for every descendant of
    ``tree``.  The per-node averages are min-max normalized over all
    descendants and mapped to a color (and, optionally, a node size).

    Args:
        tree: root of the tree to plot.  ``name`` and ``children`` are read
            here; every descendant is also passed to ``get_node_markers``.
        marker: name of the marker column in the expression table.
        resize_nodes: if true, node size grows with the normalized average
            and the image goes to 'Marker_Trees'; otherwise every
            descendant gets size 10 and the image goes to
            'Marker_Trees_Same_Size'.
        save: if true, write a PNG into the output directory; otherwise
            return the result of an inline render.

    Returns:
        None when ``save`` is true, otherwise whatever
        ``render('%%inline', ...)`` returns.
    """
    import os

    supplementary_data = pd.read_csv('../Suppl.Table2.CODEX_paper_MRLdatasetexpression.csv')
    supplementary_data.rename(columns={'X.X': 'X', 'Y.Y': 'Y', 'Z.Z': 'Z'}, inplace=True)
    supplementary_data['CD45_int'] = supplementary_data['CD45'].astype(int)

    # remove dirt from supplementary data
    supplementary_annotations = pd.read_excel('../Suppl.Table2.cluster annotations and cell counts.xlsx')
    dirt = supplementary_annotations.loc[
        supplementary_annotations['Imaging phenotype (cell type)'] == 'dirt',
        'X-shift cluster ID']
    # .copy() so the column added below lands on an independent frame
    # rather than on a filtered view of the original table.
    supplementary_data = supplementary_data[
        ~supplementary_data['Imaging phenotype cluster ID'].isin(dirt)].copy()
    supplementary_data['sample'] = supplementary_data['sample_Xtile_Ytile'].apply(
        lambda x: x.split('_')[0])
    suppl_converted = convert_coordinates(supplementary_data)[['X', 'Y', 'Z', 'sample', marker]]

    new_tree = TreeNode(name=tree.name)
    new_tree.img_style['size'] = 1 if resize_nodes else 10
    new_tree.img_style['fgcolor'] = hls2hex(0, 0, 0)
    new_tree.img_style['shape'] = 'sphere'

    # Copy the topology layer by layer, recording the raw marker average
    # on every copied node.
    marker_avgs = []
    old_layer = [tree]
    new_layer = [new_tree]
    while old_layer:
        next_old_layer, next_new_layer = [], []
        for ind, node in enumerate(old_layer):
            for child in node.children:
                next_old_layer.append(child)
                new_child = TreeNode(name=child.name)
                marker_avg = get_node_markers(child, marker, suppl_converted)
                new_child.add_features(marker_avg=marker_avg)
                marker_avgs.append(marker_avg)
                new_layer[ind].add_child(new_child)
                next_new_layer.append(new_child)
        old_layer = next_old_layer
        new_layer = next_new_layer

    # A tree without descendants has nothing to normalize; np.min/np.max
    # would raise on the empty list.
    if marker_avgs:
        marker_min, marker_max = np.min(marker_avgs), np.max(marker_avgs)
        marker_range = marker_max - marker_min
        for node in new_tree.iter_descendants():
            if marker_range:
                norm_marker = (node.marker_avg - marker_min) / marker_range
            else:
                # All averages are identical: avoid dividing by zero and
                # place every node at the bottom of the scale.
                norm_marker = 0.0
            node.add_features(marker_avg=norm_marker)
            node.add_features(color=hls2hex(0, norm_marker, norm_marker * 0.5))
            node.img_style['size'] = 1 + 10 * norm_marker if resize_nodes else 10
            node.img_style['fgcolor'] = node.color
            node.img_style['shape'] = 'sphere'

    ts = TreeStyle()
    ts.show_leaf_name = False
    ts.rotation = 90
    ts.title.add_face(TextFace(marker, fsize=20), column=0)

    save_dir = 'Marker_Trees' if resize_nodes else 'Marker_Trees_Same_Size'
    if save:
        # Make sure the output directory exists before rendering into it.
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
        new_tree.render(save_dir + '/marker_tree_{}.png'.format(marker), tree_style=ts)
    else:
        return new_tree.render('%%inline', tree_style=ts)
def addDeadLineage(spTree):
    """
    Takes:
        - spTree (ete3.Tree) : species tree

    Returns:
        (ete3.Tree) : a deep copy of the tree placed under a new root,
                      with a dead lineage (name "-1") as its sister.
                      Every node carries a boolean "dead" feature that is
                      True only for the dead lineage and the new root.
    """
    livingTree = deepcopy(spTree)
    livingTree.dist = 0.1
    for livingNode in livingTree.traverse():
        livingNode.add_feature("dead", False)

    root = TreeNode()
    root.add_feature("dead", True)
    root.dist = 0.0
    root.add_child(livingTree)

    deadLineage = TreeNode()
    deadLineage.add_feature("dead", True)
    deadLineage.name = "-1"
    # The dead branch is as long as the path from the new root to the
    # first leaf of the copied species tree (measured before the dead
    # lineage itself is attached).
    deadLineage.dist = root.get_distance(root.get_leaves()[0])
    root.add_child(deadLineage)

    return root
def read_tree(infile, format, quiet=False):
    """Load a tree from a Newick source.

    Args:
        infile: value handed to ``TreeNode`` as ``newick``, or ``'-'`` to
            read the first line of standard input instead.
        format: value handed to ``TreeNode`` as ``format``.
        quiet: if false, the number of leaves is reported on stderr.

    Returns:
        The parsed ``TreeNode``.

    Raises:
        ValueError: if ``infile`` is ``'-'`` and standard input is empty.
    """
    if infile == '-':
        # Only the first line is used, so there is no need to read the rest.
        newick = sys.stdin.readline()
        if not newick:
            raise ValueError('no Newick tree found on standard input')
    else:
        newick = infile
    tree = TreeNode(newick=newick, format=format, quoted_node_names=True)

    if not quiet:
        num_leaves = sum(1 for n in tree.traverse() if n.is_leaf())
        sys.stderr.write('number of leaves in input tree: {:,}\n'.format(num_leaves))
    return tree
def DFS_get_tree(root, par_node):
    """Fill ``par_node`` (and new descendants) from ``root``, depth first.

    ``get_info(root)`` is expected to yield either a one-item result (a
    leaf: only the name) or a three-item result (name, left, right).  The
    name is stored on ``par_node``; for the three-item case two fresh
    child nodes are attached and populated recursively.

    Returns:
        None for a leaf (or for any other result length), otherwise the
        pair of values returned by the two recursive calls.
    """
    info = get_info(root)
    par_node.name = info[0]

    if len(info) == 1:
        # par node is a leaf, end
        return
    if len(info) != 3:
        return None

    _, left_source, right_source = info
    left_node, right_node = TreeNode(), TreeNode()
    par_node.add_child(left_node)
    par_node.add_child(right_node)
    left_result = DFS_get_tree(left_source, left_node)
    right_result = DFS_get_tree(right_source, right_node)
    return left_result, right_result
def compare(self, tree2, method='identity'):
    '''Compare this tree to another tree.

    Args:
        tree2: object exposing a ``tree`` attribute like ``self.tree``.
        method: one of

            * ``'identity'``: True if both trees hold the same multiset of
              (sequence, frequency, parent sequence) triples.
            * ``'MRCA'``: summed hamming distance between the sequences of
              the most recent common ancestors of every pair of observed
              taxa (nodes of ``self.tree`` with non-zero frequency),
              divided by the summed sequence lengths.
            * ``'RF'``: Robinson-Foulds distance computed on the
              ``sequence`` attribute, after hanging an extra leaf under
              every node with positive frequency.

    Returns:
        A bool for ``'identity'``, a number for ``'MRCA'`` and ``'RF'``.

    Raises:
        ValueError: if ``method`` is not one of the values above.
    '''
    if method == 'identity':
        # we compare lists of seq, parent, abundance
        # return true if these lists are identical, else false
        def node_records(root):
            return sorted(
                (node.sequence,
                 node.frequency,
                 node.up.sequence if node.up is not None else None)
                for node in root.traverse())

        return node_records(self.tree) == node_records(tree2.tree)

    elif method == 'MRCA':
        import numpy as np

        # matrix of hamming distance of common ancestors of taxa
        # takes a true and inferred tree as CollapsedTree objects
        taxa = [node.sequence for node in self.tree.traverse() if node.frequency]
        n_taxa = len(taxa)
        d = np.zeros(shape=(n_taxa, n_taxa))
        sum_sites = np.zeros(shape=(n_taxa, n_taxa))
        for i in range(n_taxa):
            # next(...) works on both Python 2 and 3 iterators.
            nodei_true = next(self.tree.iter_search_nodes(sequence=taxa[i]))
            nodei = next(tree2.tree.iter_search_nodes(sequence=taxa[i]))
            for j in range(i + 1, n_taxa):
                nodej_true = next(self.tree.iter_search_nodes(sequence=taxa[j]))
                nodej = next(tree2.tree.iter_search_nodes(sequence=taxa[j]))
                MRCA_true = self.tree.get_common_ancestor(
                    (nodei_true, nodej_true)).sequence
                MRCA = tree2.tree.get_common_ancestor((nodei, nodej)).sequence
                d[i, j] = hamming_distance(MRCA_true, MRCA)
                sum_sites[i, j] = len(MRCA_true)
        return d.sum() / sum_sites.sum()

    elif method == 'RF':
        tree1_copy = self.tree.copy(method='deepcopy')
        tree2_copy = tree2.tree.copy(method='deepcopy')
        for treex in (tree1_copy, tree2_copy):
            # Snapshot the nodes first: children are added while looping.
            for node in list(treex.traverse()):
                if node.frequency > 0:
                    child = TreeNode()
                    child.add_feature('sequence', node.sequence)
                    node.add_child(child)
        try:
            return tree1_copy.robinson_foulds(tree2_copy,
                                              attr_t1='sequence',
                                              attr_t2='sequence',
                                              unrooted_trees=True)[0]
        except Exception:
            # Retry allowing duplicated attribute values; unlike a bare
            # except this does not swallow KeyboardInterrupt/SystemExit.
            return tree1_copy.robinson_foulds(tree2_copy,
                                              attr_t1='sequence',
                                              attr_t2='sequence',
                                              unrooted_trees=True,
                                              allow_dup=True)[0]
    else:
        raise ValueError('invalid distance method: ' + method)
def _build(parse_node, tree_node=None):
    """Recursively convert a parse node into a ``TreeNode``.

    A ``LeafNode`` becomes a node named after its literal with a one-item
    ``tokens`` feature.  An ``InternalNode`` becomes a node named after its
    symbol, with a ``rule`` feature, one child per parse child, and a
    ``tokens`` feature concatenating the children's tokens in order.  Any
    other input leaves the node untouched (a list is printed first).

    Args:
        parse_node: node of the parse structure to convert.
        tree_node: node to fill in; a new ``TreeNode`` is created when None.

    Returns:
        The filled-in tree node.
    """
    result = TreeNode() if tree_node is None else tree_node

    if isinstance(parse_node, list):
        print(parse_node)

    if isinstance(parse_node, LeafNode):
        literal, leaf_token = parse_node.literal, parse_node.token
        result.name = literal
        result.add_feature("tokens", [leaf_token])
        return result

    if isinstance(parse_node, InternalNode):
        symbol, rule, parse_children = (
            parse_node.symbol, parse_node.rule, parse_node.children)
        collected = []
        for parse_child in parse_children:
            subtree = _build(parse_child)
            result.add_child(subtree)
            collected.extend(subtree.tokens)
        result.name = symbol
        result.add_feature("rule", rule)
        result.add_feature("tokens", collected)

    return result
def _convert_biotree_to_etetree(bio_tree):
    """Convert ``bio_tree`` to a ``TreeNode`` by round-tripping through Newick.

    The tree is serialized with ``write_newick`` into an in-memory buffer,
    labels of the form ``Inner<digits>`` that precede a colon are stripped,
    and the resulting string is parsed by ``TreeNode``.
    """
    buffer = io.StringIO()
    write_newick([bio_tree], buffer)
    cleaned = re.sub("Inner[0-9]+:", ":", buffer.getvalue())
    return TreeNode(cleaned)
def add_tree_layer(tree, leaves, clusters, proportions, child_coords, prop_filter):
    '''
    Attach one more layer of cluster nodes below the current leaves.

    tree: tree that we want to add an additional layer to (returned as is)
    leaves: mapping from parent cluster id to the node of that cluster
    clusters: ids of the clusters in the child layer
    proportions: nested dictionary, child id -> parent id -> proportion of
        cells shared between that parent and that child
    child_coords: mapping from child cluster id to its coordinate data
    prop_filter: a child is attached only if its best proportion is
        strictly greater than this value

    Returns (tree, child_nodes) where child_nodes maps every id in
    ``clusters`` to its new node, attached or not.
    '''
    child_nodes = {}
    for cluster_id in clusters:
        layer_node = TreeNode(name=cluster_id)
        # add coordinate data to node
        layer_node.add_features(coords=child_coords[cluster_id])
        layer_node.add_features(cluster_id=cluster_id)
        child_nodes[cluster_id] = layer_node

    for child_id, parent_shares in proportions.items():
        # Each child goes under at most one parent: the one with the
        # largest proportion.
        best_parent = max(parent_shares, key=parent_shares.get)
        if parent_shares[best_parent] > prop_filter:
            leaves[best_parent].add_child(child_nodes[child_id])

    return tree, child_nodes
def p_toArbre(self):
    """Convert this program node into a ``TreeNode`` rooted at ``"main()"``.

    The root gets three children, in order: a leaf named after
    ``str(self.sons[0])``, the command tree of ``self.sons[1]`` and the
    expression tree of ``self.sons[2]``.
    """
    root = TreeNode()
    root.name = "main()"

    header = TreeNode()
    header.name = str(self.sons[0])

    parts = (header, self.sons[1].c_toArbre(), self.sons[2].e_toArbre())
    for part in parts:
        root.add_child(part)
    return root
def e_toArbre(self):
    """Convert this expression node into a ``TreeNode`` subtree.

    * ``"NUMBER"``: leaf named ``"Number : <value>"``.
    * ``"ID"``: leaf named ``"Id : <value>"``.
    * ``"OPBIN"``: node named after the operator with the expression trees
      of both sons as children.

    Any other ``self.type`` yields None.
    """
    kind = self.type
    if kind not in ("NUMBER", "ID", "OPBIN"):
        return None

    node = TreeNode()
    if kind == "NUMBER":
        node.name = "Number : " + str(self.value)
    elif kind == "ID":
        node.name = "Id : " + self.value
    else:
        node.name = self.value
        left = self.sons[0].e_toArbre()
        right = self.sons[1].e_toArbre()
        node.add_child(left)
        node.add_child(right)
    return node
def simplify_tree(self, tree):
    """Simplify ``tree`` in place.

    ``self._simplify_tree(tree)`` is invoked first (its result is not
    used).  If the tree's label is one of 'Arg1', 'Arg2', 'Conn' or 'none'
    its children are replaced by ``self.get_leave_node(tree)``.  Otherwise
    each child is either simplified recursively (when
    ``self.deeperthan1(child)`` holds) or wrapped in a new ``TreeNode``
    that carries the child's label and has the child as its only child.
    """
    self._simplify_tree(tree)

    if tree.label in ('Arg1', 'Arg2', 'Conn', 'none'):
        tree.children = self.get_leave_node(tree)
        return

    for position, child in enumerate(tree.children):
        if self.deeperthan1(child):
            self.simplify_tree(child)
            continue
        wrapper = TreeNode()
        wrapper.children = [child]
        wrapper.label = child.label
        tree.children[position] = wrapper
def parameterised_test(mutDict, insertionDict, mutations, expected_output):
    """Check ``genome_tree.writeGenomeShortIndels`` for one set of inputs.

    A node carrying ``mutations`` is written to a ``MockFile``.  The call
    must leave ``mutDict`` and ``insertionDict`` equal to shallow copies
    taken beforehand, and the data written must equal ``expected_output``.
    """
    sink = MockFile()
    node = TreeNode(name="test_node")
    node.mutations = mutations

    mut_before, ins_before = mutDict.copy(), insertionDict.copy()

    genome_tree.writeGenomeShortIndels(
        node=node, file=sink, mutDict=mutDict, insertionDict=insertionDict)

    # the whole point of this function is that the genome tree updates and
    # then de-updates any mutations, so both dictionaries must be the same
    # before and after printing.
    assert mutDict == mut_before
    assert insertionDict == ins_before
    assert sink.written_data == expected_output
def initialize_pathogen_tree(self):
    """
    Start one pathogen lineage per host tip.

    Each lineage is named ``<host tip name>_P``, has ``dist`` 0 and records
    the host tip and its height.  Lineages whose host tip has height 0 go
    to ``self.extant_p``; all others go to ``self.not_yet_sampled_p``.
    Both containers are reset first.

    TODO: relax this assumption - needs some way to input
    """
    # reset containers
    self.extant_p = []  # pathogen lineages that have not coalesced
    self.not_yet_sampled_p = []  # pathogen lineages higher in the tree

    for host_tip in self.hosttree.get_leaves():
        lineage = TreeNode(name=host_tip.name + '_P', dist=0)
        lineage.add_features(height=host_tip.height, host=host_tip)
        bucket = self.extant_p if host_tip.height == 0 else self.not_yet_sampled_p
        bucket.append(lineage)
def copy_forest(forest, features=None):
    """Copy every tree of ``forest`` node by node.

    For each node the ``dist``, ``support`` and ``name`` attributes are
    copied, plus every feature in ``features`` that the node actually has.
    Feature values are copied by reference.

    Args:
        forest: sequence of tree roots.
        features: names of the features to copy; when empty or None, the
            ``features`` of the first tree's root are used.

    Returns:
        A list with one copied root per input tree (empty for an empty
        forest).
    """
    # Nothing to copy, and no first tree to take the feature names from.
    if len(forest) == 0:
        return []

    features = set(features if features else forest[0].features)
    copied_forest = []
    for tree in forest:
        copied_tree = TreeNode()
        copied_forest.append(copied_tree)
        # Explicit stack of (source node, copied node) pairs.
        todo = [(tree, copied_tree)]
        while todo:
            n, copied_n = todo.pop()
            copied_n.dist = n.dist
            copied_n.support = n.support
            copied_n.name = n.name
            for f in features:
                if hasattr(n, f):
                    copied_n.add_feature(f, getattr(n, f))
            # add_child() with no argument creates the empty child that is
            # filled in when the pair is popped.
            for c in n.children:
                todo.append((c, copied_n.add_child()))
    return copied_forest
def creation_by_words(self, words):
    """
    Creation of a tree based on separate words in the word list

    Every entry is split into tokens on whitespace/hyphen runs and on
    ``://`` style separators (reversed when ``self.is_reversed`` is set).
    The longest token prefix already registered in ``self.name2node`` is
    reused as the attachment point and one node per remaining token is
    added below it; each new node is registered in ``self.name2node``
    under its space-joined full name.

    :param words: entries to insert (duplicates are ignored)
    :type words: list
    :return: the root of the created tree
    """
    # Creates an empty tree
    tree = Tree()
    tree.name = ""

    # Make sure there are no duplicates
    for word in set(words):
        # If no similar words exist, add it to the base of tree
        target = tree

        tokens = split(r'[\s-]+|:[\\/]{2}', word)
        if self.is_reversed:
            tokens = list(reversed(tokens))

        # Find relatives in the tree: try the longest prefix first.
        root = ''
        pos = 0
        for pos in range(len(tokens), -1, -1):
            root = ' '.join(tokens[:pos])
            if root in self.name2node:
                target = self.name2node[root]
                break

        # Add new nodes as necessary
        fullname = root
        for wd in tokens[pos:]:
            fullname = (fullname + ' ' + wd).strip()
            new_node = TreeNode(name=wd.strip(), dist=target.dist + 1)
            target.add_child(new_node)
            self.name2node[fullname] = new_node
            target = new_node

    return tree
def add_tree_to_distribution(self, tree):
    """
    Add the bipartitions of a tree to the CCP distribution

    A root with exactly three children is first made bifurcating by
    grouping its second and third child under a new node (the tree is
    modified in place).  Any remaining node with more than two children
    aborts the program.

    Takes:
        - tree (ete3.Tree): phylogenetic tree

    Raises:
        SystemExit: (exit status 1) if a multifurcation is detected.
    """
    if len(tree.children) == 3:
        ## special unrooted case where the tree begins with a trifurcation
        ## we artificially remove the trifurcation to avoid future problems
        a = TreeNode()
        b = tree.children[1]
        c = tree.children[2]
        b.detach()
        c.detach()
        tree.add_child(a)
        a.add_child(b)
        a.add_child(c)

    for i in tree.traverse():
        if len(i.children) > 2:
            # Single-argument print calls behave the same on Python 2 and 3.
            print("multifurcation detected! Please provide bifurcating trees.")
            print("exiting now")
            raise SystemExit(1)

    if self.nb_observation == 0:
        ## no tree has been observed yet: add all the leaves
        for l in tree.get_leaf_names():
            self.get_leaf_id(l)  ## adds the leaves to the CCP

    for node in tree.traverse("postorder"):  ## for each branch of the tree
        self.add_tree_branch_to_distribution(node)

    self.nb_observation += 1
    return
def simulate(self):
    '''
    Simulate a collapsed tree from ``self.params``.

    The root is produced by a ``LeavesAndClades`` simulation, which
    provides the clone count ``self.c`` (stored as the root's
    ``frequency``) and the mutant count ``self.m``.  One recursively
    simulated subtree with ``dist`` 1 is attached per mutant.  The
    existing ``tree`` data member is replaced, and self is returned.

    Raises ValueError if ``self.params`` is None.
    '''
    if self.params is None:
        raise ValueError('params must be defined for simulation')

    # Root node: number of clones and mutants come from LeavesAndClades.
    LeavesAndClades.simulate(self)
    self.tree = TreeNode()
    self.tree.add_feature('frequency', self.c)

    # With no mutants the loop body never runs and the bare root is kept.
    for _ in range(self.m):
        subtree = CollapsedTree(params=self.params, frame=self.frame).simulate().tree
        subtree.dist = 1
        self.tree.add_child(subtree)

    return self
def coalesce_paths(self, child_paths, t0):
    """
    Merge two extant pathogen lineages of the same host into a new one.

    The new lineage is named after both children joined by '_', lives in
    the same host and has height ``t0``.  It is appended to
    ``self.extant_p``; the two children get their branch length set to
    ``t0 - height`` and are moved from ``self.extant_p`` to
    ``self.not_extant_p``.

    :param child_paths:  The two TreeNodes in the pathogen tree to merge.
    :param t0:  Time of pathogen coalescence as height
    :return:  TreeNode object for the new pathogen lineage.
    """
    assert len(child_paths) == 2, 'Can only coalesce 2 pathogen lineages at a time'
    first, second = child_paths

    assert first in self.extant_p and second in self.extant_p, 'Both pathogen lineages must be extant'
    assert first.host == second.host, 'Can only coalesce pathogen lineages in the same host'
    shared_host = first.host

    assert first.height < t0 and second.height < t0, \
        'Pathogen lineage heights %f %f cannot exceed coalescent event %f' % (first.height, second.height, t0)

    # create new pathogen lineage
    merged = TreeNode(name='_'.join(lineage.name for lineage in child_paths), dist=0)
    merged.add_features(host=shared_host, height=t0)

    # cast child_paths as a List because ete3.Tree.children requires it
    merged.children = list(child_paths)
    self.extant_p.append(merged)

    # coalesced pathogen lineages are no longer extant
    for lineage in child_paths:
        lineage.up = merged
        lineage.dist = t0 - lineage.height  # height was stored at creation
        self.extant_p.remove(lineage)
        self.not_extant_p.append(lineage)

    return merged
def make_ete_trees(agent_ids: Iterable[str]) -> List[TreeNode]:
    '''Construct an ETE Toolkit Tree from a sequence of agent IDs

    Agent IDs must be constructed such that for any agent with ID
    :math:`c` with a parent with ID :math:`p`, :math:`p == c[:-1]`.

    This function should be able to handle multiple phylogenies among
    the agents, but this behavior is not guaranteed, tested, nor
    supported.

    Args:
        agent_ids: Agent IDs to build a tree from. Any iterable is
            accepted, including one-shot iterators.

    Returns:
        A list of the roots of the created trees.

    Raises:
        ValueError: If the part of an ID after the common prefix of all
            IDs is not numeric.
    '''
    # Materialize once: the IDs are needed both for the common prefix and
    # for the sorted walk, and an iterator can only be consumed one time.
    all_ids = list(agent_ids)
    stem = os.path.commonprefix(all_ids)
    id_node_map: Dict[str, TreeNode] = dict()
    roots: List[TreeNode] = []

    # Sorted order guarantees a parent ID (a strict prefix) is seen before
    # its children.
    for agent_id in sorted(all_ids):
        phylogeny_id = agent_id[len(stem):]
        try:
            if phylogeny_id:
                int(phylogeny_id)
        except ValueError as e:
            raise ValueError(
                'String in ID {} after stem {} is non-numeric'.format(
                    agent_id, stem)) from e
        parent_phylo_id = phylogeny_id[:-1]
        if parent_phylo_id in id_node_map:
            parent = id_node_map[parent_phylo_id]
            child = parent.add_child(name=agent_id)
        else:
            child = TreeNode(name=agent_id)
            roots.append(child)
        id_node_map[phylogeny_id] = child
    return roots
def get_tree_from_CCP(self, method, bip=None, node=None):
    """
    RECURSIVE
    build a tree from the CCP distribution

    Called without ``node`` it creates the root: it looks for a
    bipartition holding all leaves but one, pairs it with the
    bipartition of the remaining leaf, and splits that leaf's average
    branch length evenly between the two root children.  Called on a
    one-leaf bipartition it names ``node`` after that leaf.  Otherwise
    ``method`` chooses how the clade is split and each part gets its
    average observed branch length.

    Takes:
        - method (function) : function that takes a bipartition id and
                              returns a tuple of children bipartition ids
        - bip (int): bip id
        - node (ete3.TreeNode): current tree

    Returns:
        (ete3.TreeNode): phylogenetic tree drawn from the CCP distribution
    """
    DIP = []
    BLEN = []

    if node is None:  ##nothing is created yet
        root_bip = None  ##bip that contains all leaves but one
        leaf_bip = None  ##bip that contains only 1 leaf

        all_but_one = self.number_of_leaves() - 1
        for candidate in self.dbip_set:
            if len(self.dbip_set[candidate]) == all_but_one:
                ##all leaves but one type of clade
                root_bip = candidate
                ##complementary leaf set. Only one leaf
                one_leaf_set = set(self.dleaf_id.keys()) - self.dbip_set[candidate]
                leaf_bip = self.get_bip_from_leafset(one_leaf_set)
                break

        leaf_bip_count = self.dbip_count[leaf_bip]
        leaf_bip_blen = self.dbip_bls[leaf_bip] / leaf_bip_count
        ##as this is the root, this length will be divided between both of the root children.
        DIP = [leaf_bip, root_bip]
        BLEN = [leaf_bip_blen / 2., leaf_bip_blen / 2.]

        ##... creating the root node
        node = Tree()

    elif len(self.dbip_set[bip]) == 1:  ##the current bip is a leaf
        leaf_id = next(iter(self.dbip_set[bip]))
        node.name = self.dleaf_id[leaf_id]
        return node

    else:
        ##bipartition node that is not the root: draw bipartition using the method function
        DIP = method(bip)  ##choosing a split of the clade
        for d in DIP:
            BLEN.append(self.dbip_bls[d] * 1. / self.dbip_count[d])

    ##for each new clade in dip, we create a child node
    for d, blen in zip(DIP, BLEN):
        new = TreeNode(dist=blen)
        node.add_child(new)
        self.get_tree_from_CCP(method, d, new)  #RECURSION

    return node
def bub_tree(tree, fasta, outfile1, root, types, c_dict, show, size, colours, field1, field2, scale, multiplier, dna):
    """
    Style and render a tree whose leaves are drawn as colored bubbles.

    Leaf names are split on "_"; the field at index ``field2`` selects the
    leaf color through ``c_dict`` and, when ``size`` is True, the field at
    index ``field1`` scales the bubble.  Leaves whose name cannot be parsed
    are looked up in ``c_dict`` under the key 'zero'.

    :param tree: tree object from ete
    :param fasta: mapping from leaf name to sequence (the fasta used to make the tree), or None
    :param outfile1: path of the image to render
    :param root: sequence name to use as root, or None to keep the current root
    :param types: tree type: circular (c) or rectangle (r)
    :param c_dict: dictionary mapping time point to colour (from col_map)
    :param show: show the tree in a gui (only when exactly True)
    :param size: scale the terminal nodes by frequency information (only when exactly True)
    :param colours: if 3, sequences are drawn without custom fore/background colours
    :param field1: index of the name field that contains the size/frequency value
    :param field2: index of the name field that contains the time point
    :param scale: how much to scale the x axis
    :param multiplier: factor applied to the frequency value (500 when None)
    :param dna: true/false, is sequence a DNA sequence?
    :return: None, outputs an image of the tree
    """
    mult = 500 if multiplier is None else multiplier

    if dna:
        dna_prot = 'dna'
        bg_c = {
            'A': 'green', 'C': 'blue', 'G': 'black', 'T': 'red',
            '-': 'grey', 'X': 'white'
        }
        fg_c = {
            'A': 'black', 'C': 'black', 'G': 'black', 'T': 'black',
            '-': 'black', 'X': 'white'
        }
    else:
        dna_prot = 'aa'
        bg_c = {
            'K': '#145AFF', 'R': '#145AFF', 'H': '#8282D2', 'E': '#E60A0A',
            'D': '#E60A0A', 'N': '#00DCDC', 'Q': '#00DCDC', 'S': '#FA9600',
            'T': '#FA9600', 'L': '#0F820F', 'I': '#0F820F', 'V': '#0F820F',
            'Y': '#3232AA', 'F': '#3232AA', 'W': '#B45AB4', 'C': '#E6E600',
            'M': '#E6E600', 'A': '#C8C8C8', 'G': '#EBEBEB', 'P': '#DC9682',
            '-': 'grey', 'X': 'white'
        }
        fg_c = {
            'K': 'black', 'R': 'black', 'H': 'black', 'E': 'black',
            'D': 'black', 'N': 'black', 'Q': 'black', 'S': 'black',
            'T': 'black', 'L': 'black', 'I': 'black', 'V': 'black',
            'Y': 'black', 'F': 'black', 'W': 'black', 'C': 'black',
            'M': 'black', 'A': 'black', 'G': 'black', 'P': 'black',
            '-': 'grey', 'X': 'white'
        }

    if colours == 3:
        bg_c = None
        fg_c = None

    tstyle = TreeStyle()
    tstyle.force_topology = False
    tstyle.mode = types
    tstyle.scale = scale
    tstyle.min_leaf_separation = 0
    tstyle.optimal_scale_level = 'full'  # 'mid'
    if types == 'c':
        tstyle.root_opening_factor = 0.25
    tstyle.draw_guiding_lines = False
    tstyle.guiding_lines_color = 'slateblue'
    tstyle.show_leaf_name = False
    tstyle.allow_face_overlap = True
    tstyle.show_branch_length = False
    tstyle.show_branch_support = False

    if root is not None:
        tree.set_outgroup(root)

    time_col = []
    for node in tree.traverse():
        if node.is_leaf() is not True:
            # Internal nodes: near-invisible marker, thick branch lines.
            nstyle = NodeStyle()
            nstyle["size"] = 0.1
            nstyle["hz_line_width"] = 10
            nstyle["vt_line_width"] = 10
            node.set_style(nstyle)
            continue

        try:
            name = node.name.split("_")
            time_point = name[field2]
            # Index 3 is read only so that names with fewer than four
            # fields take the "incorrect format" path below.
            kind = name[3]
        except Exception:
            time_point = 'zero'
            name = node.name
            print("Incorrect name format for ", node.name)

        if size is True:
            try:
                s = 20 + float(name[field1]) * mult
            except Exception:
                s = 20
                print("No frequency information for ", node.name)
        else:
            s = 20

        colour = c_dict[time_point]
        time_col.append((time_point, colour))

        nstyle = NodeStyle()
        nstyle["fgcolor"] = colour
        nstyle["size"] = s
        nstyle["hz_line_width"] = 10
        nstyle["vt_line_width"] = 10
        nstyle["hz_line_color"] = colour
        nstyle["vt_line_color"] = 'black'
        nstyle["hz_line_type"] = 0
        nstyle["vt_line_type"] = 0

        if root is not None and node.name == root:
            # place holder in case you want to do something with the root leaf
            print('root is ', node.name)
        else:
            nstyle["shape"] = "circle"
        node.set_style(nstyle)

        if fasta is not None:
            seq = fasta[str(node.name)]
            seqFace = SequenceFace(seq, seqtype=dna_prot, fsize=10,
                                   fg_colors=fg_c, bg_colors=bg_c,
                                   codon=None, col_w=40, alt_col_w=3,
                                   special_col=None, interactive=True)
            # The face is attached to the node found by name lookup.
            (tree & node.name).add_face(seqFace, 0, "aligned")

    tree.ladderize()

    # One legend row per distinct (time point, colour) pair, plus a blank
    # spacer row at the end.
    legendkey = sorted(set(time_col))
    legendkey.append(('', 'white'))
    for tm, clr in legendkey:
        tstyle.legend.add_face(faces.CircleFace(30, clr), column=0)
        tstyle.legend.add_face(
            faces.TextFace('\t' + tm, ftype='Arial', fsize=60,
                           fgcolor='black', tight_text=True),
            column=1)

    if show is True:
        tree.show(tree_style=tstyle)
    tree.render(outfile1, dpi=600, tree_style=tstyle)
def simulate(self, sequence, pair_bounds=None, lambda_=0.9, lambda0=[1], N=None, T=None, n=None, verbose=False, selection_params=None):
    '''
    Simulate a poisson branching process with mutation introduced
    by the chosen mutation model e.g. motif or uniform.
    Can either simulate under a neutral model without selection,
    or using an affinity maturation inspired model for selection.

    Args:
        sequence: sequence of the root (naive) cell.
        pair_bounds: if given, two (start, end) pairs; the two slices of
            each sequence are mutated separately with ``lambda0[0]`` and
            ``lambda0[1]`` and then concatenated.
        lambda_: poisson parameter of the default progeny distribution.
        lambda0: mutation rate(s) passed to ``self.mutate``.  The default
            list is only read, never modified.
        N: stop when this many unterminated leaves are reached
            (exclusive with ``T``).
        T: list of sampling times; the simulation runs until ``max(T)``.
        n: list of sample sizes: one value, or one per entry of ``T``.
        verbose: print progress information.
        selection_params: None for a neutral simulation, otherwise an
            11-item sequence unpacked as (stop_dist, mature_affy,
            naive_affy, target_dist, target_count, skip_update, A_total,
            B_total, Lp, k, outbase).  Requires ``T``.

    Returns:
        The uncollapsed tree, pruned to observed lineages, with nodes
        named ``simcell_<i>``.

    Raises:
        ValueError: on inconsistent N / T / n / selection_params.
        RuntimeError: if the tree ends with fewer leaves than required.
    '''
    import numpy as np

    progeny = poisson(lambda_)  # Default progeny distribution
    stop_dist = None  # Default stopping criterium for affinity simulation

    # Checking the validity of the input parameters:
    if N is not None and T is not None:
        raise ValueError(
            'Only one of N and T can be used. One must be None.')
    if selection_params is not None and T is None:
        raise ValueError(
            'Simulation with selection was chosen. A time, T, must be specified.'
        )
    elif N is None and T is None:
        raise ValueError('Either N or T must be specified.')
    if N is not None and n is not None and n[-1] > N:
        raise ValueError('n ({}) must not larger than N ({})'.format(
            n[-1], N))
    elif N is not None and n is not None and len(n) != 1:
        raise ValueError(
            'n ({}) must a single value when specifying N'.format(n))
    if T is not None and len(T) > 1 and (n is None or
                                         (len(n) != 1 and len(n) != len(T))):
        raise ValueError(
            'n must be specified when using intermediate sampling:', n)
    elif T is not None and len(T) > 1 and len(n) == 1:
        # A single sample size applies to every sampling time.
        n = [n[-1]] * len(T)

    # Planting the tree:
    tree = TreeNode()
    tree.dist = 0
    tree.add_feature('sequence', sequence)
    tree.add_feature('terminated', False)
    tree.add_feature('sampled', False)
    tree.add_feature('frequency', 0)
    tree.add_feature('time', 0)

    if selection_params is not None:
        # Collect an array of the counts of each hamming distance at each time step
        hd_generation = list()
        stop_dist, mature_affy, naive_affy, target_dist, target_count, skip_update, A_total, B_total, Lp, k, outbase = selection_params
        # Make a list of target sequences:
        targetAAseqs = [
            self.one_mutant(sequence, target_dist)
            for i in range(target_count)
        ]
        # Assert that the target sequences are comparable to the naive sequence:
        aa = translate(tree.sequence)
        assert (sum([1 for t in targetAAseqs if len(t) != len(aa)]) == 0
                )  # All targets are same length
        # NOTE(review): this passes as soon as ONE target is exactly
        # target_dist away from the naive sequence; confirm whether all
        # targets were meant to be checked.
        assert (sum([
            1 for t in targetAAseqs
            if hamming_distance(aa, t) == target_dist
        ]))
        # Affinity is an exponential function of hamming distance:
        assert (target_dist > 0)

        def hd2affy(hd):
            return (mature_affy + hd**k *
                    (naive_affy - mature_affy) / target_dist**k)

        # We store both the amino acid sequence and the affinity as tree features:
        tree.add_feature('AAseq', str(aa))
        tree.add_feature(
            'Kd',
            selection_utils.calc_Kd(tree.AAseq, targetAAseqs, hd2affy))
        tree.add_feature(
            'target_dist',
            min([
                hamming_distance(tree.AAseq, taa) for taa in targetAAseqs
            ]))

    t = 0  # <-- Time at start
    leaves_unterminated = 1
    # Small lambdas are causing problems so make a minimum:
    lambda_min = 10e-10
    hd_distrib = []
    while leaves_unterminated > 0 and (
            leaves_unterminated < N if N is not None else True) and (
                t < max(T) if T is not None else True) and (
                    stop_dist >= min(hd_distrib)
                    if stop_dist is not None and t > 0 else True):
        if verbose:
            print('At time:', t)
        t += 1
        # Sample intermediate time point:
        if T is not None and len(T) > 1 and (t - 1) in T:
            si = T.index(t - 1)
            live_nostop_leaves = [
                l for l in tree.iter_leaves()
                if not l.terminated and not has_stop(l.sequence)
            ]
            random.shuffle(live_nostop_leaves)
            if len(live_nostop_leaves) < n[si]:
                raise RuntimeError(
                    'tree with {} leaves, less than what desired for intermediate sampling {}. Try later generation or increasing the carrying capacity.'
                    .format(leaves_unterminated, n))
            # Make the sample and kill the cells sampled:
            for leaf in live_nostop_leaves[:n[si]]:
                leaves_unterminated -= 1
                leaf.sampled = True
                leaf.terminated = True
            if verbose:
                print('Made an intermediate sample at time:', t - 1)

        live_leaves = [l for l in tree.iter_leaves() if not l.terminated]
        random.shuffle(live_leaves)
        skip_lambda_n = 0  # At every new round reset the all the lambdas
        # Draw progeny for each leaf:
        for leaf in live_leaves:
            if selection_params is not None:
                if skip_lambda_n == 0:
                    skip_lambda_n = skip_update + 1  # Add one so skip_update=0 is no skip
                    tree = selection_utils.lambda_selection(
                        tree, targetAAseqs, hd2affy, A_total, B_total, Lp)
                if leaf.lambda_ > lambda_min:
                    progeny = poisson(leaf.lambda_)
                else:
                    progeny = poisson(lambda_min)
                skip_lambda_n -= 1
            n_children = progeny.rvs()
            leaves_unterminated += n_children - 1  # <-- Getting 1, is equal to staying alive
            if not n_children:
                leaf.terminated = True
            for child_count in range(n_children):
                # If sequence pair mutate them separately with their own mutation rate:
                if pair_bounds is not None:
                    mutated_sequence1 = self.mutate(
                        leaf.sequence[pair_bounds[0][0]:pair_bounds[0][1]],
                        lambda0=lambda0[0])
                    mutated_sequence2 = self.mutate(
                        leaf.sequence[pair_bounds[1][0]:pair_bounds[1][1]],
                        lambda0=lambda0[1])
                    mutated_sequence = mutated_sequence1 + mutated_sequence2
                else:
                    mutated_sequence = self.mutate(leaf.sequence,
                                                   lambda0=lambda0[0])
                child = TreeNode()
                # Branch length = number of positions that differ from the parent.
                child.dist = sum(
                    x != y for x, y in zip(mutated_sequence, leaf.sequence))
                child.add_feature('sequence', mutated_sequence)
                if selection_params is not None:
                    aa = translate(child.sequence)
                    child.add_feature('AAseq', str(aa))
                    child.add_feature(
                        'Kd',
                        selection_utils.calc_Kd(child.AAseq, targetAAseqs,
                                                hd2affy))
                    child.add_feature(
                        'target_dist',
                        min([
                            hamming_distance(child.AAseq, taa)
                            for taa in targetAAseqs
                        ]))
                child.add_feature('frequency', 0)
                child.add_feature('terminated', False)
                child.add_feature('sampled', False)
                child.add_feature('time', t)
                leaf.add_child(child)

        if selection_params is not None:
            hd_distrib = [
                min([
                    hamming_distance(tn.AAseq, ta) for ta in targetAAseqs
                ]) for tn in tree.iter_leaves() if not tn.terminated
            ]
            if target_dist > 0:
                hist = np.histogram(hd_distrib,
                                    bins=list(range(target_dist * 10)))
            else:  # Just make a minimum of 10 bins
                hist = np.histogram(hd_distrib, bins=list(range(10)))
            hd_generation.append(hist)
            if verbose and hd_distrib:
                print('Total cell population:', sum(hist[0]))
                print('Majority hamming distance:', np.argmax(hist[0]))
                print('Affinity of latest sampled leaf:', leaf.Kd)
                print(
                    'Progeny distribution lambda for the latest sampled leaf:',
                    leaf.lambda_)

    # Only meaningful for size-limited runs; comparing an int with None
    # raises TypeError on Python 3.
    if N is not None and leaves_unterminated < N:
        raise RuntimeError(
            'Tree terminated with {} leaves, {} desired'.format(
                leaves_unterminated, N))

    # Keep a histogram of the hamming distances at each generation:
    if selection_params is not None:
        with open(outbase + '_selection_runstats.p', 'wb') as f:
            pickle.dump(hd_generation, f)

    # Each leaf in final generation gets an observation frequency of 1, unless downsampled:
    if T is not None and len(T) > 1:
        # Iterate the intermediate time steps (excluding the last time):
        for Ti in sorted(T)[:-1]:
            si = T.index(Ti)
            # Only sample those that have been 'sampled' at intermediate sampling times:
            final_leaves = [
                leaf for leaf in tree.iter_descendants()
                if leaf.time == Ti and leaf.sampled
            ]
            if len(final_leaves) < n[si]:
                raise RuntimeError(
                    'tree terminated with {} leaves, less than what desired after downsampling {}'
                    .format(leaves_unterminated, n[si]))
            for leaf in final_leaves:  # No need to down-sample, this was already done in the simulation loop
                leaf.frequency = 1
    if selection_params and max(T) != t:
        raise RuntimeError(
            'tree terminated with before the requested sample time.')

    # Do the normal sampling of the last time step:
    final_leaves = [
        leaf for leaf in tree.iter_leaves()
        if leaf.time == t and not has_stop(leaf.sequence)
    ]
    # Report stop codon sequences:
    stop_leaves = [
        leaf for leaf in tree.iter_leaves()
        if leaf.time == t and has_stop(leaf.sequence)
    ]
    if stop_leaves:
        print(
            'Tree contains {} leaves with stop codons, out of {} total at last time point.'
            .format(len(stop_leaves), len(final_leaves)))

    if T is not None:
        si = T.index(sorted(T)[-1])
    else:
        si = 0
    # By default, downsample to the target simulation size:
    if n is not None and len(final_leaves) >= n[si]:
        for leaf in random.sample(final_leaves, n[si]):
            leaf.frequency = 1
    elif n is None and N is not None:
        if len(
                final_leaves
        ) < N:  # Removed nonsense sequences might decrease the number of final leaves to less than N
            N = len(final_leaves)
        for leaf in random.sample(final_leaves, N):
            leaf.frequency = 1
    elif N is None and T is not None:
        for leaf in final_leaves:
            leaf.frequency = 1
    elif n is not None and len(final_leaves) < n[si]:
        raise RuntimeError(
            'tree terminated with {} leaves, less than what desired after downsampling {}'
            .format(leaves_unterminated, n[si]))
    else:
        raise RuntimeError('Unknown option.')

    # Prune away lineages that are unobserved:
    for node in tree.iter_descendants():
        if sum(node2.frequency for node2 in node.traverse()) == 0:
            node.detach()

    # Remove unobserved unifurcations:
    for node in tree.iter_descendants():
        parent = node.up
        if node.frequency == 0 and len(node.children) == 1:
            node.delete(prevent_nondicotomic=False)
            node.children[0].dist = hamming_distance(
                node.children[0].sequence, parent.sequence)

    # Assign unique names to each node:
    for i, node in enumerate(tree.traverse(), 1):
        node.name = 'simcell_{}'.format(i)

    # Return the uncollapsed tree:
    return tree
def _group_coords_by_cluster(cluster_data):
    """Map each cluster id to the X/Y/Z/sample rows of the cells assigned to it."""
    coords = cluster_data[['cluster_id', 'sample', 'X', 'Y', 'Z']]
    groups = coords.groupby('cluster_id').groups
    return {
        group: coords.loc[inds, ['X', 'Y', 'Z', 'sample']]
        for group, inds in groups.items()
    }


def build_tree(fcs_paths, num_neighbors, prop_filter=0.1):
    '''
    Build a layered tree of clusters, one layer per clustering in fcs_paths.

    fcs_paths: dictionary of (cluster numbers, path); it is indexed here with
        0, 1, ..., len(num_neighbors) - 1
    num_neighbors: number of neighbors used in X-shift; only its length is
        used, to decide how many parent/child layer pairs are linked
    prop_filter: proportion of cells for edge between clusters to be created
        (forwarded to add_tree_layer for every layer except the first)

    Returns the root TreeNode (name 0, cluster_id 0).
    '''
    # first initialize tree with 1 node at top and its children
    tree = TreeNode(name=0)
    leaves = {0: tree}

    _, cluster_data_child = fcsparser.parse(fcs_paths[0])
    cluster_data_child = process_fcs(cluster_data_child)
    tree.add_features(coords=cluster_data_child[['X', 'Y', 'Z']])
    tree.add_features(cluster_id=0)

    child_cluster_counts = cluster_data_child['cluster_id'].value_counts()
    child_coords = _group_coords_by_cluster(cluster_data_child)
    clusters = list(child_cluster_counts.keys())

    # Every first-layer cluster hangs off the root, weighted by its share of cells.
    child_cluster_fractions = child_cluster_counts / child_cluster_counts.sum()
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    proportions = {
        child_node_id: {0: val}
        for child_node_id, val in child_cluster_fractions.items()
    }

    # set proportion filter to 0 for first layer, as everything is a child of the vertex
    tree, leaves = add_tree_layer(tree, leaves, clusters, proportions,
                                  child_coords, prop_filter=0)

    # build the rest of the tree
    for ind, _ in enumerate(num_neighbors[:-1]):
        _, cluster_data_parent = fcsparser.parse(fcs_paths[ind])
        _, cluster_data_child = fcsparser.parse(fcs_paths[ind + 1])
        cluster_data_parent = process_fcs(cluster_data_parent)
        cluster_data_child = process_fcs(cluster_data_child)

        child_cluster_counts = cluster_data_child['cluster_id'].value_counts()
        clusters = list(child_cluster_counts.keys())

        # Cells are matched between the two clusterings by their integer
        # (X, Y, Z) coordinates; after the merge, cluster_id_x is the parent
        # cluster and cluster_id_y is the child cluster of the same cell.
        match_data_parent = cluster_data_parent[['X', 'Y', 'Z', 'cluster_id']].astype(int)
        match_data_child = cluster_data_child[['X', 'Y', 'Z', 'cluster_id']].astype(int)
        merged = pd.merge(match_data_parent, match_data_child, on=['X', 'Y', 'Z'])
        parent_clusters = merged['cluster_id_x'].tolist()
        child_clusters = merged['cluster_id_y'].tolist()

        child_coords = _group_coords_by_cluster(cluster_data_child)

        # proportions[child][parent] accumulates the fraction of the child
        # cluster's cells that were found in that parent cluster.
        proportions = defaultdict(Counter)
        for parent_cluster, child_cluster in zip(parent_clusters, child_clusters):
            proportions[child_cluster][parent_cluster] += \
                1 / child_cluster_counts[child_cluster]

        tree, leaves = add_tree_layer(tree, leaves, clusters, proportions,
                                      child_coords, prop_filter)

    return tree
def create_tree_node(seq, frequency=0):
    """Return a fresh TreeNode carrying `sequence` and `frequency` features.

    seq: value stored as the node's 'sequence' feature.
    frequency: value stored as the node's 'frequency' feature (default 0).
    """
    node = TreeNode()
    for feature_name, feature_value in (('sequence', seq),
                                        ('frequency', frequency)):
        node.add_feature(feature_name, feature_value)
    return node
# def name(self): # return self._name # def parent(self): # return self._parent # def assignParent(self, parent): # if( self._parent is None): # self._parent = parent # elif self._parent == parent: # return # else: # raise Exception("mismatched parents") G = TreeNode(name=u'cellular organisms') nodes = {u'cellular organisms': G} nstyle = NodeStyle() nstyle['shape'] = 'circle' nstyle['size'] = 3 def layout(node): #print(node) if (len(node.get_ancestors()) < 4): print(node.name) n = AttrFace("name", fsize=9) n.margin_top = 10 n.margin_bottom = 0 n.margin_left = 10 faces.add_face_to_node(n, node, 0, position="float")
if (args['--format']): ShowFormat() sys.exit(-1) basehtml = args['--html'] if args['--html'] else 'base.html' from ete3 import Tree, TreeNode #read ped file from stdin. ped_data = {} #map for name -> raw data. node_data = {} #map for name -> TreeNode for line in sys.stdin: line = line.strip() if line and line[0] != '#': #skip comment line. ss = line.split() ped_data[ss[1]] = ss n = TreeNode(name=ss[1]) n.add_feature('raw', ss) node_data[ss[1]] = n # for k,v in node_data.items(): # print(v.write(format=2,features=['raw'])) #find the root node, and convert results to josn. #Check data integrity. m_error = False for _, data in ped_data.items(): if data[2] != '0' and data[2] not in ped_data.keys(): m_error = True sys.stderr.write('ERROR: missing declearation for father: %s\n' % (data[2])) if data[3] != '0' and data[3] not in ped_data.keys():
def subdivideSpTree(spTree):
    """
    Takes:
        - spTree (ete3.Tree) : an ULTRAMETRIC species tree

    Returns:
        (ete3.Tree) : subdivided species tree where all nodes have a timeSlice feature
        or
        None if the species tree is not ultrametric

    The input tree is not modified: the work is done on a deep copy.
    """
    newSpTree = deepcopy(spTree)
    featureName = "timeSlice"

    ##1/ getting distance from root.
    Dheight = getDistFromRootDic(newSpTree, checkUltrametric=True)
    if Dheight is None:
        # Parenthesised single-argument print behaves the same on Python 2 and 3.
        print("!!ERROR!! : the species tree is not ultrametric")
        return None

    # we know that there is n-1 internal nodes (where n is the number of leaves)
    # hence the maximal timeSlice is n-1 (all leaves have timeSlice 0)

    ##2/assign timeSlice to nodes
    currentTS = len(newSpTree.get_leaves()) - 1
    # Nodes are visited by increasing distance from the root. The key uses the
    # height only: tree nodes have no defined ordering under Python 3, so they
    # cannot serve as a tie-breaker; ties keep the dictionary's own order.
    for n, _ in sorted(Dheight.items(), key=lambda item: item[1]):
        n.add_feature(featureName, currentTS)
        if currentTS != 0:
            currentTS -= 1

    #print(newSpTree.get_ascii(attributes=[featureName,"name"]))

    ##3/subdivide according to timeSlice
    # Snapshot the nodes first: the loop below inserts new nodes into the tree.
    RealNodes = list(newSpTree.traverse())
    for n in RealNodes:
        if n.is_root():
            continue

        # Number of time slices skipped on the branch leading to n.
        nodeToAdd = n.up.timeSlice - n.timeSlice - 1
        while nodeToAdd > 0:
            # Insert one intermediate node between n and its current parent,
            # one time slice below that parent.
            # NOTE(review): branch lengths (dist) are not adjusted here.
            parentNode = n.up
            n.detach()

            NullNode = TreeNode()
            NullNode.add_feature(featureName, parentNode.timeSlice - 1)
            if "dead" in n.features:
                NullNode.add_feature("dead", n.dead)

            parentNode.add_child(NullNode)
            NullNode.add_child(n)
            nodeToAdd -= 1

    #print(newSpTree.get_ascii(attributes=[featureName,"name"]))
    return newSpTree
def _find_ci(text):
    """Return the first CI match in text, trying the LSD pattern before the PastML one."""
    ci = next(iter(re.findall(CI_DATE_REGEX_LSD, text)), None)
    if ci is None:
        ci = next(iter(re.findall(CI_DATE_REGEX_PASTML, text)), None)
    return ci


def parse_nexus(tree_path, columns=None):
    """Read the trees yielded by read_nexus(tree_path) and rebuild them as TreeNode trees.

    While converting, date / date-CI annotations (LSD2 or PastML style) found
    in the clade comment, branch length or confidence strings are attached to
    the nodes as the DATE and DATE_CI features; for every name in `columns`,
    the set of '|'-separated values found in the comment is attached as a
    feature of that name.

    :param tree_path: passed as-is to read_nexus.
    :param columns: optional iterable of column names to extract from comments.
    :return: list with the root TreeNode of each parsed tree.
    """
    trees = []
    for nex_tree in read_nexus(tree_path):
        # Depth-first copy of the clade structure, as a stack of (clade, new parent).
        todo = [(nex_tree.root, None)]
        tree = None
        while todo:
            clade, parent = todo.pop()

            # Missing or non-numeric branch lengths fall back to 0.
            dist = 0
            try:
                dist = float(clade.branch_length)
            except (AttributeError, TypeError, ValueError):
                pass

            # A textual confidence value serves as the name when no name is set.
            name = getattr(clade, 'name', None)
            if not name:
                name = getattr(clade, 'confidence', None)
                if not isinstance(name, str):
                    name = None

            node = TreeNode(dist=dist, name=name)
            if parent is None:
                tree = node
            else:
                parent.add_child(node)

            # Parse LSD2 dates and CIs, and PastML columns
            date, ci = None, None
            columns2values = defaultdict(set)
            comment = getattr(clade, 'comment', None)
            if isinstance(comment, str):
                date = next(iter(re.findall(DATE_COMMENT_REGEX, comment)), None)
                ci = _find_ci(comment)
                if columns:
                    for column in columns:
                        matches = re.findall(
                            COLUMN_REGEX_PASTML.format(column=column), comment)
                        values = set().union(*(m.split('|') for m in matches))
                        if values:
                            columns2values[column] |= values

            # For the root, a CI may also be found in the branch length string.
            comment = getattr(clade, 'branch_length', None)
            if not ci and not parent and isinstance(comment, str):
                ci = _find_ci(comment)

            # Last resort: a CI stored in the confidence string.
            comment = getattr(clade, 'confidence', None)
            if ci is None and comment is not None and isinstance(comment, str):
                ci = _find_ci(comment)

            # Annotations that do not convert to floats are ignored.
            if date is not None:
                try:
                    date = float(date)
                except (TypeError, ValueError):
                    pass
                else:
                    node.add_feature(DATE, date)
            if ci is not None:
                try:
                    ci = [float(_) for _ in ci]
                except (TypeError, ValueError):
                    pass
                else:
                    node.add_feature(DATE_CI, ci)

            if columns2values:
                for c, vs in columns2values.items():
                    node.add_feature(c, vs)

            todo.extend((c, node) for c in clade.clades)

        # Zero-length branches join nodes of the same age: copy a parent's
        # date/CI down to undated children at distance 0 ...
        for n in tree.traverse('preorder'):
            date, ci = getattr(n, DATE, None), getattr(n, DATE_CI, None)
            if date is not None or ci is not None:
                for c in n.children:
                    if c.dist == 0:
                        if getattr(c, DATE, None) is None:
                            c.add_feature(DATE, date)
                        if getattr(c, DATE_CI, None) is None:
                            c.add_feature(DATE_CI, ci)
        # ... and a child's date/CI up to an undated parent at distance 0.
        for n in tree.traverse('postorder'):
            date, ci = getattr(n, DATE, None), getattr(n, DATE_CI, None)
            if not n.is_root() and n.dist == 0 and (date is not None or ci is not None):
                if getattr(n.up, DATE, None) is None:
                    n.up.add_feature(DATE, date)
                if getattr(n.up, DATE_CI, None) is None:
                    n.up.add_feature(DATE_CI, ci)

        # propagate dates up to the root if needed
        if getattr(tree, DATE, None) is None:
            dated_node = next(
                (n for n in tree.traverse() if getattr(n, DATE, None) is not None),
                None)
            if dated_node:
                # Walk up from the first dated node; each undated ancestor
                # gets its child's date minus the child's branch length.
                while dated_node != tree:
                    if getattr(dated_node.up, DATE, None) is None:
                        dated_node.up.add_feature(
                            DATE, getattr(dated_node, DATE) - dated_node.dist)
                    dated_node = dated_node.up

        trees.append(tree)
    return trees