def annotate_tree_depths(tree: CassiopeiaTree) -> defaultdict:
    """Annotate every node with its depth and its number of rooted triplets.

    Two attributes are set on each node of ``tree`` (the tree is modified in
    place):

    * ``"depth"``: 0 for the root, otherwise the parent's depth plus one.
    * ``"number_of_triplets"``: ``nCr(total, 3)`` minus the sum of
      ``nCr(child_leaves, 3)`` over the node's children, where ``total`` is
      the number of leaves summed over all children. Assuming ``nCr(n, r)``
      is the binomial coefficient, this counts the leaf triplets below the
      node that do not fall entirely under a single child, i.e. the triplets
      rooted at the node.

    Args:
        tree: A CassiopeiaTree.

    Returns:
        A dictionary mapping each depth to the list of nodes at that depth,
        in the order the pre-order traversal visits them.
    """
    depth_to_nodes = defaultdict(list)

    # Pre-order traversal: a parent's "depth" attribute is always set before
    # any of its children read it.
    for node in tree.depth_first_traverse_nodes(
        source=tree.root, postorder=False
    ):
        if tree.is_root(node):
            depth = 0
        else:
            depth = tree.get_attribute(tree.parent(node), "depth") + 1
        tree.set_attribute(node, "depth", depth)
        depth_to_nodes[depth].append(node)

        number_of_leaves = 0
        correction = 0
        for child in tree.children(node):
            # Count the child's leaves once and reuse the value for both the
            # running total and the per-child correction term.
            child_leaves = len(tree.leaves_in_subtree(child))
            number_of_leaves += child_leaves
            correction += nCr(child_leaves, 3)

        tree.set_attribute(
            node, "number_of_triplets", nCr(number_of_leaves, 3) - correction
        )

    return depth_to_nodes
def fitch_hartigan_top_down(
    cassiopeia_tree: CassiopeiaTree,
    root: Optional[str] = None,
    state_key: str = "S1",
    label_key: str = "label",
    copy: bool = False,
) -> Optional[CassiopeiaTree]:
    """Run the Fitch-Hartigan top-down refinement.

    Walks the subtree below ``root`` in pre-order and writes one label per
    node under ``label_key``. The starting node receives a random pick from
    its ``state_key`` states. Every other node inherits its parent's label
    when that label is among its own ``state_key`` states, and otherwise
    receives a random pick from those states.

    Args:
        cassiopeia_tree: CassiopeiaTree that has already been processed with
            the Fitch-Hartigan bottom-up algorithm.
        root: Node from which to begin the refinement; only the subtree
            below it is visited. Defaults to the root of the tree.
        state_key: Attribute key holding the Fitch-Hartigan ancestral states.
        label_key: Attribute key under which the chosen maximum-parsimony
            assignment is stored.
        copy: If True, work on (and return) a copy of the tree instead of
            modifying the input in place.

    Returns:
        The labelled copy of the tree if ``copy`` is True, else None.

    Raises:
        A CassiopeiaTreeError if Fitch-Hartigan bottom-up has not been called
        or if ``state_key`` does not exist for a node. (Not raised explicitly
        in this function; presumably propagated from ``get_attribute``.)
    """
    if root is None:
        root = cassiopeia_tree.root
    if copy:
        cassiopeia_tree = cassiopeia_tree.copy()

    traversal = cassiopeia_tree.depth_first_traverse_nodes(
        source=root, postorder=False
    )
    for node in traversal:
        if node != root:
            # The parent was labelled earlier in the pre-order walk.
            inherited = cassiopeia_tree.get_attribute(
                cassiopeia_tree.parent(node), label_key
            )
            candidates = cassiopeia_tree.get_attribute(node, state_key)
            if inherited in candidates:
                chosen = inherited
            else:
                chosen = np.random.choice(candidates)
        else:
            # Starting node: nothing to inherit, so pick among its states.
            chosen = np.random.choice(
                cassiopeia_tree.get_attribute(root, state_key)
            )
        cassiopeia_tree.set_attribute(node, label_key, chosen)

    if copy:
        return cassiopeia_tree
    return None
def compute_expansion_pvalues(
    tree: CassiopeiaTree,
    min_clade_size: int = 10,
    min_depth: int = 1,
    copy: bool = False,
) -> Union[CassiopeiaTree, None]:
    """Call expansion pvalues on a tree.

    Uses the methodology described in Yang, Jones et al, BioRxiv (2021) to
    assess the expansion probability of a given subclade of a phylogeny.
    Mathematical treatment of the coalescent probability is described in
    Griffiths and Tavare, Stochastic Models (1998).

    The probability computed corresponds to the probability that, under a
    simple neutral coalescent model, a given subclade contains the observed
    number of cells; in other words, a one-sided p-value. Often, if the
    probability is less than some threshold (e.g., 0.05), this might indicate
    that there exists some subclade under this node to which this expansion
    probability can be attributed (i.e. the null hypothesis that the subclade
    is undergoing neutral drift can be rejected).

    This function will add an attribute "expansion_pvalue" to every node of
    the tree (1.0 for nodes that are not evaluated), and return None unless
    :param:`copy` is set to True.

    On a typical balanced tree, this function will perform in O(n log n)
    time, but can be up to O(n^3) on highly unbalanced trees. A future
    endeavor may be to implement the function in O(n) time.

    Args:
        tree: CassiopeiaTree
        min_clade_size: Minimum number of leaves in a subtree to be
            considered.
        min_depth: Minimum depth of clade to be considered. Depth is measured
            in number of nodes from the root, not branch lengths.
        copy: Return copy.

    Returns:
        If copy is set to True, returns a new CassiopeiaTree with the
        attribute added; the input tree is left untouched. Otherwise the
        input tree is annotated in place and None is returned.
    """
    tree = tree.copy() if copy else tree

    # Default every node to a p-value of 1.0 and record its depth. Pre-order
    # traversal guarantees the parent's depth is known before the child's.
    depths = {}
    for node in tree.depth_first_traverse_nodes(postorder=False):
        tree.set_attribute(node, "expansion_pvalue", 1.0)
        if tree.is_root(node):
            depths[node] = 0
        else:
            depths[node] = depths[tree.parent(node)] + 1

    for node in tree.depth_first_traverse_nodes(postorder=False):
        n = len(tree.leaves_in_subtree(node))
        children = tree.children(node)
        k = len(children)

        for child in children:
            # Count the child's leaves once; the value serves both as the
            # size filter and as `b` in the probability below.
            b = len(tree.leaves_in_subtree(child))
            if b < min_clade_size or depths[child] < min_depth:
                continue

            # this value below is a simplification of the quantity:
            # sum[simple_coalescent_probability(n, b2, k) for \
            #   b2 in range(b, n - k + 2)]
            p = nCk(n - b, k - 1) / nCk(n - 1, k - 1)

            tree.set_attribute(child, "expansion_pvalue", p)

    return tree if copy else None
def overlay_data(self, tree: CassiopeiaTree):
    """Simulate Cas9-based lineage tracing data and store it on the tree.

    If ``self.random_seed`` is set, NumPy's global RNG is seeded with it.
    If ``self.mutation_priors`` is None it is created here, from
    ``self.number_of_states`` draws of ``self.state_generating_distribution``
    normalized to sum to one and keyed ``1..number_of_states``; it is kept on
    the instance afterwards.

    Character arrays are then built top-down in pre-order. The root gets all
    zeros. Every other node starts from its parent's array; each site whose
    state is 0 is cut with probability ``1 - exp(-t * rate)``, where ``t`` is
    the time difference between the node and its parent and ``rate`` is that
    site's entry in ``self.mutation_rate_per_character``. The cuts are
    optionally collapsed per cassette (``self.collapse_sites``), states are
    introduced at the remaining cuts (``self.introduce_states``), and
    heritable silencing is applied (``self.silence_cassettes``). Finally,
    stochastic silencing is applied to every leaf and the full mapping is
    handed to ``tree.set_all_character_states``.

    Args:
        tree: Input CassiopeiaTree
    """
    if self.random_seed is not None:
        np.random.seed(self.random_seed)

    # Build the state priors once; later simulations on this instance reuse
    # them.
    if self.mutation_priors is None:
        self.mutation_priors = {}
        weights = [
            self.state_generating_distribution()
            for _ in range(self.number_of_states)
        ]
        total = np.sum(weights)
        for state, weight in enumerate(weights, start=1):
            self.mutation_priors[state] = weight / total

    n_characters = self.number_of_cassettes * self.size_of_cassette

    # Placeholder states for every node; each is overwritten during the
    # traversal below.
    character_matrix = {node: [-1] * n_characters for node in tree.nodes}

    for node in tree.depth_first_traverse_nodes(tree.root, postorder=False):
        if tree.is_root(node):
            character_matrix[node] = [0] * n_characters
        else:
            parent = tree.parent(node)
            elapsed = tree.get_time(node) - tree.get_time(parent)
            states = character_matrix[parent]

            # Only sites still in state 0 are candidates for a new cut.
            uncut = [idx for idx, value in enumerate(states) if value == 0]

            cut_sites = []
            for site in uncut:
                rate = self.mutation_rate_per_character[site]
                cut_probability = 1 - np.exp(-elapsed * rate)
                if np.random.uniform() < cut_probability:
                    cut_sites.append(site)

            # Collapse cuts that land on the same cassette, when enabled.
            pending_cuts = cut_sites
            if self.collapse_sites_on_cassette and self.size_of_cassette > 1:
                states, pending_cuts = self.collapse_sites(states, cut_sites)

            # Introduce new states at the cut sites that remain.
            states = self.introduce_states(states, pending_cuts)

            # Heritable silencing over the same elapsed time.
            heritable_probability = 1 - np.exp(
                -elapsed * self.heritable_silencing_rate
            )
            states = self.silence_cassettes(
                states,
                heritable_probability,
                self.heritable_missing_data_state,
            )

            character_matrix[node] = states

    # Stochastic silencing is applied to the leaves only.
    for leaf in tree.leaves:
        character_matrix[leaf] = self.silence_cassettes(
            character_matrix[leaf],
            self.stochastic_silencing_rate,
            self.stochastic_missing_data_state,
        )

    tree.set_all_character_states(character_matrix)