def estimate_branch_lengths(self, tree: CassiopeiaTree) -> None:
        """
        Estimate the branch lengths of `tree` under this model.

        The input tree is validated first. All intermediate computations are
        then run on a deep copy of the tree whose deducible missing states
        have been imputed, and `_populate_branch_lengths` is finally invoked
        on the tree that was originally passed in.

        The tree must be binary except for the root, which should have degree
        1.

        The computational complexity of this method is:
        O(discretization_level * tree.n_cell * tree.n_character)

        Args:
            tree: The CassiopeiaTree whose branch lengths are estimated.

        Raises:
            ValueError if the discretization level is too small or the tree
            topology is not valid.
        """
        self._validate_input_tree(tree)

        # Imputing the unambiguous missing states makes the number of mutated
        # states at each vertex increase monotonically from parent to child,
        # which keeps the dynamic programming state and code much clearer.
        # The imputation is done on a copy so the caller's character states
        # are not altered by it.
        imputed_tree = deepcopy(tree)
        imputed_tree.impute_deducible_missing_states()

        self._precompute_K_non_missing(imputed_tree)

        # Reset the per-node results of any previous run.
        self._log_joints = {}  # type: Dict[str, np.array]
        self._posterior_means = {}  # type: Dict[str, float]
        self._posteriors = {}  # type: Dict[str, np.array]

        self._populate_attributes_with_cpp_implementation(imputed_tree)
        self._populate_branch_lengths(tree)
def get_outgroup(tree: CassiopeiaTree, triplet: Tuple[str, str, str]) -> str:
    """Infers the outgroup of a triplet from a CassiopeiaTree.

    The depth of the latest-common-ancestor (LCA) of a pair of leaves is
    measured by the number of ancestors the two leaves share. The pair that
    shares strictly more ancestors than both other pairs is the ingroup, and
    the leaf left over is the outgroup.

    Args:
        tree: CassiopeiaTree
        triplet: A tuple of three leaves constituting a triplet.

    Returns:
        The outgroup (i.e. the most distal leaf in the triplet), or the
        string "None" when no pair is strictly deeper than the other two.
    """
    first, second, third = triplet[0], triplet[1], triplet[2]

    # Build each ancestor set exactly once.
    first_anc, second_anc, third_anc = (
        set(tree.get_all_ancestors(leaf)) for leaf in (first, second, third)
    )

    shared_12 = len(first_anc & second_anc)
    shared_13 = len(first_anc & third_anc)
    shared_23 = len(second_anc & third_anc)

    if shared_12 > max(shared_13, shared_23):
        return third
    if shared_13 > max(shared_12, shared_23):
        return second
    if shared_23 > max(shared_12, shared_13):
        return first
    # Unresolved triplet: callers receive the literal string "None".
    return "None"
示例#3
0
    def test_minimum_branch_length(self, name, solver):
        """
        Check that the `minimum_branch_length` constraint is honoured.

        Same setup as test_small_tree_with_one_mutation (perfect binary tree
        with a single mutation at leaf 6), but with the minimum branch
        length constrained to 0.01. Should give very short edges 1->3, 1->4,
        0->2 and edges 0->1, 2->5, 2->6 close to 1.
        """
        topology = nx.DiGraph()
        topology.add_nodes_from([str(node) for node in range(7)])
        topology.add_edges_from(
            [
                ("0", "1"),
                ("0", "2"),
                ("1", "3"),
                ("1", "4"),
                ("2", "5"),
                ("2", "6"),
            ]
        )
        tree = CassiopeiaTree(tree=topology)

        # Every node is unmutated except leaf 6.
        character_states = {str(node): [0] for node in range(6)}
        character_states["6"] = [1]
        tree.set_all_character_states(character_states)

        model = IIDExponentialMLE(minimum_branch_length=0.01, solver=solver)
        model.estimate_branch_lengths(tree)

        expected_lengths = {
            ("0", "1"): 0.990,
            ("0", "2"): 0.010,
            ("1", "3"): 0.010,
            ("1", "4"): 0.010,
            ("2", "5"): 0.990,
            ("2", "6"): 0.990,
        }
        for (parent, child), expected in expected_lengths.items():
            self.assertAlmostEqual(
                tree.get_branch_length(parent, child), expected, places=3
            )
        self.assertAlmostEqual(model.log_likelihood, -1.922, places=3)
        self.assertAlmostEqual(model.mutation_rate, 0.405, places=3)
示例#4
0
    def overlay_data(
        self,
        tree: CassiopeiaTree,
        attribute_key: str = "spatial",
    ):
        """Overlays spatial data onto the CassiopeiaTree via Brownian motion.

        The root is placed at the origin. Each child is placed at its
        parent's location plus a normal draw whose scale is
        `sqrt(2 * diffusion_coefficient * branch_length)`. When
        `self.scale_unit_area` is set, all coordinates are shifted so each
        dimension starts at 0 and then divided by one common factor so that
        every value lies in [0, 1].

        Args:
            tree: The CassiopeiaTree to overlay spatial data on to.
            attribute_key: The name of the attribute to save the coordinates as.
                This also serves as the prefix of the coordinates saved into
                the `cell_meta` attribute as `{attribute_key}_i` where i is
                an integer from 0...`dim-1`.
        """
        # Using numpy arrays instead of tuples for easy vector operations
        locations = {tree.root: np.zeros(self.dim)}
        for parent, child in tree.depth_first_traverse_edges(source=tree.root):
            branch_length = tree.get_branch_length(parent, child)
            locations[child] = locations[parent] + np.random.normal(
                scale=np.sqrt(2 * self.diffusion_coefficient * branch_length),
                size=self.dim,
            )

        # Scale if desired
        # Note that Python dictionaries preserve order since 3.6, so the rows
        # of `all_coordinates` line up with the keys of `locations`.
        if self.scale_unit_area:
            all_coordinates = np.array(list(locations.values()))

            # Shift each dimension so that the smallest value is at 0.
            all_coordinates -= all_coordinates.min(axis=0)

            # Scale all dimensions (by the same value) so that all values are
            # between [0, 1]. We don't scale each dimension separately because
            # we want to retain the shape of the distribution.
            largest = all_coordinates.max()
            # When every node sits at the same point (e.g. a single-node tree
            # or all-zero branch lengths) the largest value is 0; dividing
            # would turn every coordinate into NaN, so leave them at 0.
            if largest > 0:
                all_coordinates /= largest
            locations = dict(zip(locations, all_coordinates))

        # Set node attributes
        for node, loc in locations.items():
            tree.set_attribute(node, attribute_key, tuple(loc))

        # Set cell meta: start from a copy of the existing table (or an empty
        # one indexed by the leaves) and fill one column per dimension.
        cell_meta = (tree.cell_meta.copy() if tree.cell_meta is not None else
                     pd.DataFrame(index=tree.leaves))
        columns = [f"{attribute_key}_{i}" for i in range(self.dim)]
        cell_meta[columns] = np.nan
        for leaf in tree.leaves:
            cell_meta.loc[leaf, columns] = locations[leaf]
        tree.cell_meta = cell_meta
示例#5
0
def _N_fitch_count(
    cassiopeia_tree: CassiopeiaTree,
    unique_states: List[str],
    node_to_i: Dict[str, int],
    label_to_j: Dict[str, int],
    state_key: str = "S1",
) -> np.array(int):
    """Fill in the dynamic programming table N for FitchCount.
    
    Computes N[v, s], corresponding to the number of solutions below
    a node v in the tree given v takes on the state s.

    Args:
        cassiopeia_tree: CassiopeiaTree object
        unique_states: The state space that a node can take on
        node_to_i: Helper array storing a mapping of each node to a unique
            integer
        label_to_j: Helper array storing a mapping of each unique state in the
            state space to a unique integer
        state_key: Attribute name in the CassiopeiaTree storing the possible
            states for each node, as inferred with the Fitch-Hartigan algorithm

    Returns:
        A 2-dimensional array storing N[v, s] - the number of
            equally-parsimonious solutions below node v, given v takes on
            state s
    """
    def _fill(v: str, s: str):
        """Helper function to fill in a single entry in N."""

        if cassiopeia_tree.is_leaf(v):
            return 1

        children = cassiopeia_tree.children(v)
        A = np.zeros((len(children)))

        legal_states = []
        for i, u in zip(range(len(children)), children):

            if s not in cassiopeia_tree.get_attribute(u, state_key):
                legal_states = cassiopeia_tree.get_attribute(u, state_key)
            else:
                legal_states = [s]

            A[i] = np.sum(
                [N[node_to_i[u], label_to_j[sp]] for sp in legal_states])
        return np.prod([A[u] for u in range(len(A))])

    N = np.full((len(cassiopeia_tree.nodes), len(unique_states)), 0.0)
    for n in cassiopeia_tree.depth_first_traverse_nodes():
        for s in cassiopeia_tree.get_attribute(n, state_key):
            N[node_to_i[n], label_to_j[s]] = _fill(n, s)

    return N
示例#6
0
    def test_small_tree_regression(self, name, solver):
        """
        Regression test on a perfect binary tree with a "normal" amount of
        mutations on each edge.

        The expected values cannot be derived by hand; the test only pins
        the current solution so that it never changes silently.
        """
        topology = nx.DiGraph()
        topology.add_nodes_from([str(node) for node in range(7)])
        topology.add_edges_from(
            [
                ("0", "1"),
                ("0", "2"),
                ("1", "3"),
                ("1", "4"),
                ("2", "5"),
                ("2", "6"),
            ]
        )
        tree = CassiopeiaTree(tree=topology)

        character_states = {
            "0": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            "1": [1, 0, 0, 0, 0, 0, 0, 0, 0, -1],
            "2": [0, 0, 0, 0, 0, 6, 0, 0, 0, -1],
            "3": [1, 2, 0, 0, 0, 0, 0, 0, 0, -1],
            "4": [1, 0, 3, 0, 0, 0, 0, 0, 0, -1],
            "5": [0, 0, 0, 0, 5, 6, 7, 0, 0, -1],
            "6": [0, 0, 0, 4, 0, 6, 0, 8, 9, -1],
        }
        tree.set_all_character_states(character_states)

        model = IIDExponentialMLE(minimum_branch_length=1e-4, solver=solver)
        model.estimate_branch_lengths(tree)

        self.assertAlmostEqual(model.mutation_rate, 0.378, places=3)
        expected_lengths = {
            ("0", "1"): 0.537,
            ("0", "2"): 0.219,
            ("1", "3"): 0.463,
            ("1", "4"): 0.463,
            ("2", "5"): 0.781,
            ("2", "6"): 0.781,
        }
        for (parent, child), expected in expected_lengths.items():
            self.assertAlmostEqual(
                tree.get_branch_length(parent, child), expected, places=3
            )
        self.assertAlmostEqual(model.log_likelihood, -22.689, places=3)
示例#7
0
def score_small_parsimony(
    cassiopeia_tree: CassiopeiaTree,
    meta_item: str,
    root: Optional[str] = None,
    infer_ancestral_states: bool = True,
    label_key: Optional[str] = "label",
) -> int:
    """Computes the small-parsimony of the tree.

    Works on a copy of the tree. The score is the number of edges, in the
    traversal starting at `root`, whose two endpoints carry different values
    under `label_key`. When `infer_ancestral_states` is set, `fitch_hartigan`
    is run on the copy first to populate those labels.

    Args:
        cassiopeia_tree: CassiopeiaTree object with cell meta data.
        meta_item: A column in the CassiopeiaTree cell meta corresponding to a
            categorical variable.
        root: Node to treat as the root. Only the subtree below
            this node will be considered.
        infer_ancestral_states: Whether or not ancestral states must be inferred
            (this will be False if `fitch_hartigan` has already been called on
            the tree.)
        label_key: If ancestral states have already been inferred, this key
            indicates the name of the attribute they're stored in.

    Returns:
        The parsimony score.

    Raises:
        CassiopeiaError if label_key has not been populated.
    """
    working_tree = cassiopeia_tree.copy()

    if infer_ancestral_states:
        fitch_hartigan(working_tree, meta_item, root, label_key=label_key)

    n_changes = 0
    for parent, child in working_tree.depth_first_traverse_edges(source=root):
        try:
            parent_label = working_tree.get_attribute(parent, label_key)
            child_label = working_tree.get_attribute(child, label_key)
        except CassiopeiaTreeError:
            raise CassiopeiaError(f"{label_key} does not exist for a node, "
                                  "try running Fitch-Hartigan or passing "
                                  "infer_ancestral_states=True.")
        # Each edge whose endpoints disagree costs one state change.
        if parent_label != child_label:
            n_changes += 1
    return n_changes
示例#8
0
    def get_dissimilarity_map(self,
                              cassiopeia_tree: CassiopeiaTree,
                              layer: Optional[str] = None) -> pd.DataFrame:
        """Obtains or generates a matrix that is updated throughout the solver.

        This is the highest-level entry point for getting the sample-by-sample
        comparison matrix that the solve method works on: the matrix is used
        to pick sample pairs to merge and is updated at every iteration.
        The method runs `setup_dissimilarity_map` on the tree and then
        returns whatever dissimilarity map the tree holds. It is deliberately
        designed to be overwritten so that derived classes can use similarity
        maps or other algorithm-specific comparison maps instead.

        Args:
            cassiopeia_tree: Tree object from which the
                dissimilarity map is generated from
            layer: Layer storing the character matrix
                for solving. If None, the default character matrix is used in
                the CassiopeiaTree.

        Returns:
            pd.DataFrame: The matrix that will be used throughout the solve
                method.
        """
        # Make sure a root sample and a dissimilarity map exist on the tree
        # before reading the map back.
        self.setup_dissimilarity_map(cassiopeia_tree, layer)
        return cassiopeia_tree.get_dissimilarity_map()
示例#9
0
    def setUp(self) -> None:
        """Build the shared fixture: a small tree with per-leaf meta data.

        The topology is rooted at A; node B has three children (D, E, F).
        Each leaf gets a `CellType` label and an `nUMI` count.
        """
        edges = [
            ("A", "B"),
            ("A", "C"),
            ("B", "D"),
            ("B", "E"),
            ("B", "F"),
            ("E", "G"),
            ("E", "H"),
            ("C", "I"),
            ("C", "J"),
        ]
        topology = nx.DiGraph()
        topology.add_edges_from(edges)

        leaf_annotations = {
            "D": ["TypeB", 10],
            "F": ["TypeA", 5],
            "G": ["TypeA", 3],
            "H": ["TypeB", 22],
            "I": ["TypeC", 2],
            "J": ["TypeC", 11],
        }
        cell_meta = pd.DataFrame.from_dict(
            leaf_annotations,
            orient="index",
            columns=["CellType", "nUMI"],
        )

        self.tree = CassiopeiaTree(tree=topology, cell_meta=cell_meta)
示例#10
0
    def test_phylogenetic_weights_matrix_inverse_fn(self):
        """Inverse weight matrix computed with a custom `inverse_fn`.

        With `inverse_fn = -log(x)`, leaf pairs whose path length is exactly
        1.0 (A-D and B-C) are expected to map to 0.
        """
        weighted_edges = [
            ("F", "A", 0.1),
            ("F", "B", 0.2),
            ("F", "E", 0.5),
            ("E", "C", 0.3),
            ("E", "D", 0.4),
        ]
        topology = nx.DiGraph()
        topology.add_nodes_from(["A", "B", "C", "D", "E", "F"])
        for parent, child, length in weighted_edges:
            topology.add_edge(parent, child, length=length)

        tree = CassiopeiaTree(tree=topology)

        def negative_log(x):
            return -np.log(x)

        weight_matrix = data_utilities.compute_phylogenetic_weight_matrix(
            tree, inverse=True, inverse_fn=negative_log)

        leaves = ["A", "B", "C", "D"]
        expected_rows = {
            "A": [0.0, -np.log(0.3), -np.log(0.9), 0],
            "B": [-np.log(0.3), 0.0, 0, -np.log(1.1)],
            "C": [-np.log(0.9), 0, 0.0, -np.log(0.7)],
            "D": [0.0, -np.log(1.1), -np.log(0.7), 0.0],
        }
        expected_weight_matrix = pd.DataFrame.from_dict(
            expected_rows,
            orient="index",
            columns=leaves,
        )

        pd.testing.assert_frame_equal(weight_matrix, expected_weight_matrix)
示例#11
0
    def test_simple_phylogenetic_weights_matrix_inverse(self):
        """Inverse weight matrix computed with the default inverse function.

        Off-diagonal entries are expected to be the reciprocal of the path
        length between the two leaves; the diagonal is expected to be 0.
        """
        weighted_edges = [
            ("F", "A", 0.1),
            ("F", "B", 0.2),
            ("F", "E", 0.5),
            ("E", "C", 0.3),
            ("E", "D", 0.4),
        ]
        topology = nx.DiGraph()
        topology.add_nodes_from(["A", "B", "C", "D", "E", "F"])
        for parent, child, length in weighted_edges:
            topology.add_edge(parent, child, length=length)

        tree = CassiopeiaTree(tree=topology)

        weight_matrix = data_utilities.compute_phylogenetic_weight_matrix(
            tree, inverse=True)

        leaves = ["A", "B", "C", "D"]
        expected_rows = {
            "A": [0.0, 1.0 / 0.3, 1.0 / 0.9, 1.0],
            "B": [1.0 / 0.3, 0.0, 1.0, 1.0 / 1.1],
            "C": [1.0 / 0.9, 1.0, 0.0, 1.0 / 0.7],
            "D": [1.0, 1.0 / 1.1, 1.0 / 0.7, 0.0],
        }
        expected_weight_matrix = pd.DataFrame.from_dict(
            expected_rows,
            orient="index",
            columns=leaves,
        )

        pd.testing.assert_frame_equal(weight_matrix, expected_weight_matrix)
示例#12
0
 def test_saturation(self, name, solver):
     """
     Degenerate input: the tree is the single branch 0->1 and its only
     character is mutated, i.e.:
         root [state = '0']
         |
         v
         child [state = '1']
     The character matrix is therefore saturated, and estimating branch
     lengths should raise an error.
     """
     topology = nx.DiGraph()
     topology.add_nodes_from(["0", "1"])
     topology.add_edge("0", "1")
     tree = CassiopeiaTree(tree=topology)

     character_states = {"0": [0], "1": [1]}
     tree.set_all_character_states(character_states)

     model = IIDExponentialMLE(minimum_branch_length=1e-4, solver=solver)
     with self.assertRaises(ValueError):
         model.estimate_branch_lengths(tree)
示例#13
0
    def test_inter_cluster_distance_custom_input(self):
        """Inter-cluster distances from caller-supplied inputs.

        Neither the meta data nor the pairwise distances are stored on the
        tree here: both are handed to `compute_inter_cluster_distances`
        explicitly through `meta_data` and `dissimilarity_map`.
        """
        weighted_edges = [
            ("F", "A", 0.1),
            ("F", "B", 0.2),
            ("F", "E", 0.5),
            ("E", "C", 0.3),
            ("E", "D", 0.4),
        ]
        topology = nx.DiGraph()
        topology.add_nodes_from(["A", "B", "C", "D", "E", "F"])
        for parent, child, length in weighted_edges:
            topology.add_edge(parent, child, length=length)

        leaf_annotations = {
            "A": ["TypeA", 10],
            "B": ["TypeA", 5],
            "C": ["TypeB", 3],
            "D": ["TypeB", 22],
        }
        meta_data = pd.DataFrame.from_dict(
            leaf_annotations,
            orient="index",
            columns=["CellType", "nUMI"],
        )

        leaves = ["A", "B", "C", "D"]
        pairwise = {
            "A": [0.0, 0.5, 1.2, 0.4],
            "B": [0.5, 0.0, 3.0, 1.1],
            "C": [1.2, 3.0, 0.0, 0.8],
            "D": [0.4, 1.1, 0.8, 0.0],
        }
        weight_matrix = pd.DataFrame.from_dict(
            pairwise,
            orient="index",
            columns=leaves,
        )

        tree = CassiopeiaTree(tree=topology)

        inter_cluster_distances = data_utilities.compute_inter_cluster_distances(
            tree,
            meta_data=meta_data["CellType"],
            dissimilarity_map=weight_matrix,
        )

        clusters = ["TypeA", "TypeB"]
        expected_distances = pd.DataFrame.from_dict(
            {
                "TypeA": [0.25, 1.425],
                "TypeB": [1.425, 0.4]
            },
            orient="index",
            columns=clusters,
        )

        pd.testing.assert_frame_equal(
            expected_distances,
            inter_cluster_distances,
            check_exact=False,
            atol=0.001,
        )
示例#14
0
    def setup_dissimilarity_map(self,
                                cassiopeia_tree: CassiopeiaTree,
                                layer: Optional[str] = None) -> None:
        """Sets up the solver.

        Prepares the input CassiopeiaTree for solving, operating directly on
        it. If the tree has no root sample, the implicit root is set up via
        `setup_root_finder` (when the solver was asked to add one). If the
        tree has no dissimilarity map, one is computed with the solver's
        dissimilarity function.

        Args:
            cassiopeia_tree: Input CassiopeiaTree to `solve`.
            layer: Layer storing the character matrix for solving. If None, the
                default character matrix is used in the CassiopeiaTree.

        Raises:
            A `DistanceSolverError` if rooting parameters are not passed in
                correctly (i.e. no root is specified and the user has not
                asked to find a root) or when a dissimilarity map cannot
                be found or computed.
        """
        # Step 1: make sure the tree has a root sample, adding the implicit
        # root when the solver is configured to do so.
        if cassiopeia_tree.root_sample_name is None:
            if not self.add_root:
                raise DistanceSolverError(
                    "Please specify an explicit root sample in the Cassiopeia Tree"
                    " or specify the solver to add an implicit root")
            self.setup_root_finder(cassiopeia_tree)

        # Step 2: make sure the tree has a dissimilarity map.
        if cassiopeia_tree.get_dissimilarity_map() is not None:
            return

        if self.dissimilarity_function is None:
            raise DistanceSolverError(
                "Please specify a dissimilarity function or populate the "
                "CassiopeiaTree object with a dissimilarity map")

        cassiopeia_tree.compute_dissimilarity_map(
            self.dissimilarity_function, self.prior_transformation, layer)
    def test_invalid_sampling_probability_raises_error(self):
        """A sampling probability of -1.0 or 2.0 must be rejected.

        The error is expected from the model constructor itself.
        """
        # The tree fixture is built as in the sibling tests, although the
        # model is never run on it here.
        topology = nx.DiGraph()
        topology.add_nodes_from(["0", "1", "2", "3"])
        topology.add_edges_from([("0", "1"), ("1", "2"), ("1", "3")])
        tree = CassiopeiaTree(tree=topology)
        character_states = {
            "0": [0],
            "1": [1],
            "2": [-1],
            "3": [1]
        }
        tree.set_all_character_states(character_states)

        invalid_probabilities = [-1.0, 2.0]
        for sampling_probability in invalid_probabilities:
            with self.assertRaises(ValueError):
                IIDExponentialBayesian(
                    mutation_rate=1.0,
                    birth_rate=1.0,
                    sampling_probability=sampling_probability,
                    discretization_level=500,
                )
    def test_small_discretization_level_raises_error(self):
        """A discretization level of 2 must be rejected.

        The model is constructed successfully; the error is expected only
        when branch lengths are estimated on the tree.
        """
        topology = nx.DiGraph()
        topology.add_nodes_from(["0", "1", "2", "3"])
        topology.add_edges_from([("0", "1"), ("1", "2"), ("1", "3")])
        tree = CassiopeiaTree(tree=topology)
        character_states = {
            "0": [0],
            "1": [1],
            "2": [-1],
            "3": [1]
        }
        tree.set_all_character_states(character_states)

        model = IIDExponentialBayesian(
            mutation_rate=1.0,
            birth_rate=1.0,
            sampling_probability=1.0,
            discretization_level=2,
        )
        with self.assertRaises(ValueError):
            model.estimate_branch_lengths(tree)
示例#17
0
    def setup_root_finder(self, cassiopeia_tree: CassiopeiaTree) -> None:
        """Defines the implicit rooting strategy for the UPGMASolver.

        By default, the UPGMA algorithm returns a rooted tree. Therefore,
        the implicit root will be placed and specified at the end of the
        solving procedure as the parent of the last two unjoined nodes.
        This method itself only records the name of that implicit root on
        the tree.

        Args:
            cassiopeia_tree: Input CassiopeiaTree to `solve`; its
                `root_sample_name` is set to "root".
        """
        cassiopeia_tree.root_sample_name = "root"
示例#18
0
 def test_hand_solvable_problem_2(self, name, solver):
     """
     Tree topology is just a branch 0->1.
     There are two mutated characters and one unmutated character, i.e.:
         root [state = '000']
         |
         v
         child [state = '011']
     The solution can be verified by hand. The optimization problem is:
         min_{r * t0} log(exp(-r * t0)) + 2 * log(1 - exp(-r * t0))
     The solution is r * t0 = ln(3) ~ 1.098
     (Note that because the depth of the tree is fixed to 1, r * t0 = r * 1
     is the mutation rate.)
     """
     topology = nx.DiGraph()
     topology.add_nodes_from(["0", "1"])
     topology.add_edge("0", "1")
     tree = CassiopeiaTree(tree=topology)

     character_states = {"0": [0, 0, 0], "1": [0, 1, 1]}
     tree.set_all_character_states(character_states)

     model = IIDExponentialMLE(minimum_branch_length=1e-4, solver=solver)
     model.estimate_branch_lengths(tree)

     self.assertAlmostEqual(tree.get_branch_length("0", "1"), 1.0, places=3)
     # The child sits at time 1 and the root at time 0.
     expected_times = {"1": 1.0, "0": 0.0}
     for node, expected in expected_times.items():
         self.assertAlmostEqual(tree.get_time(node), expected, places=3)
     self.assertAlmostEqual(model.mutation_rate, np.log(3), places=3)
     self.assertAlmostEqual(model.log_likelihood, -1.910, places=3)
示例#19
0
    def setup_root_finder(self, cassiopeia_tree: CassiopeiaTree) -> None:
        """Gives the implicit rooting strategy for the SNJ Solver.

        By default, the SpectralNeighborJoining algorithm returns an
        unrooted tree.  To root this tree, an implicit root of all zeros is
        added to the character matrix. Then, the dissimilarity map is
        recalculated using the updated character matrix. If the tree already
        has a computed dissimilarity map, only the new similarities are
        calculated. See 'setup_root_finder' in NeighborJoiningSolver.

        The original character matrix (without the "root" row) is put back
        on the tree before returning.

        Args:
            cassiopeia_tree: Input CassiopeiaTree to `solve`

        Raises:
            DistanceSolverError if the solver has no dissimilarity function.
                This is checked before the tree is modified.
        """
        # Validate first, so that a misconfigured solver does not leave the
        # tree with a "root" sample name and a rooted character matrix.
        if self.dissimilarity_function is None:
            raise DistanceSolver.DistanceSolverError(
                "Please specify a dissimilarity function to add an implicit "
                "root, or specify an explicit root"
            )

        character_matrix = cassiopeia_tree.character_matrix.copy()
        rooted_character_matrix = character_matrix.copy()

        # The implicit root is an all-zero row appended to the matrix.
        root = [0] * rooted_character_matrix.shape[1]
        rooted_character_matrix.loc["root"] = root
        cassiopeia_tree.root_sample_name = "root"
        cassiopeia_tree.character_matrix = rooted_character_matrix

        dissimilarity_map = cassiopeia_tree.get_dissimilarity_map()
        if dissimilarity_map is None:
            cassiopeia_tree.compute_dissimilarity_map(
                self.dissimilarity_function, self.prior_transformation
            )
        else:
            # The weights do not depend on the leaf, so compute them once.
            weights = None
            if cassiopeia_tree.priors:
                weights = solver_utilities.transform_priors(
                    cassiopeia_tree.priors, self.prior_transformation
                )
            root_states = rooted_character_matrix.loc["root"].values

            # Only the root-to-leaf dissimilarities are new.
            dissimilarity = {"root": 0}
            for leaf in character_matrix.index:
                dissimilarity[leaf] = self.dissimilarity_function(
                    root_states,
                    rooted_character_matrix.loc[leaf].values,
                    cassiopeia_tree.missing_state_indicator,
                    weights,
                )
            cassiopeia_tree.set_dissimilarity("root", dissimilarity)

        cassiopeia_tree.character_matrix = character_matrix
示例#20
0
def create_clade_colors(
    tree: CassiopeiaTree, clade_colors: Dict[str, Tuple[float, float, float]]
) -> Tuple[Dict[str, Tuple[float, float, float]], Dict[Tuple[str, str], Tuple[
        float, float, float]], ]:
    """Assign colors to nodes and branches by clade.

    Every edge in the traversal below a clade root, and both endpoints of
    each such edge, receive that clade's color. Clades are processed from
    largest to smallest, so a nested (smaller) clade overrides the color of
    the clade that contains it. An empty `clade_colors` yields two empty
    dictionaries.

    Args:
        tree: The CassiopeiaTree.
        clade_colors: Dictionary containing internal node-color mappings. These
            colors will be used to color all the paths from this node to the
            leaves the provided color.

    Returns:
        Two dictionaries. The first contains the node colors, and the second
            contains the branch colors.
    """
    # Deal with clade colors.
    descendants = {
        node: set(tree.depth_first_traverse_nodes(node))
        for node in clade_colors
    }

    # If the clades are disjoint, the size of their union equals the sum of
    # their sizes. `set().union(...)` also works with zero clades, whereas
    # the unbound `set.union(*[])` raises a TypeError.
    n_distinct = len(set().union(*descendants.values()))
    if n_distinct != sum(len(d) for d in descendants.values()):
        warnings.warn(
            "Some clades specified with `clade_colors` are overlapping. "
            "Colors may be overridden.",
            PlottingWarning,
        )

    # Color by largest clade first
    node_colors = {}
    branch_colors = {}
    for node in sorted(descendants,
                       key=lambda x: len(descendants[x]),
                       reverse=True):
        color = clade_colors[node]
        for n1, n2 in tree.depth_first_traverse_edges(node):
            node_colors[n1] = node_colors[n2] = color
            branch_colors[(n1, n2)] = color
    return node_colors, branch_colors
    def _precompute_K_non_missing(self, tree: CassiopeiaTree):
        """
        Count, for each vertex in the tree, how many states are not missing,
        and store the counts in `self._K_non_missing`.

        Raises:
            ValueError if some deducible missing state has not been imputed,
            or if a child has more non-missing states than its parent.
        """
        missing = tree.missing_state_indicator

        # Check precondition: all deducible missing states must have been
        # imputed, i.e. a child may not be missing at a position where its
        # parent is mutated. (This should ALWAYS pass)
        for parent, child in tree.edges:
            state_pairs = zip(
                tree.get_character_states(parent),
                tree.get_character_states(child),
            )
            for parent_state, child_state in state_pairs:
                parent_is_mutated = not (
                    parent_state == 0 or parent_state == missing
                )
                if parent_is_mutated and child_state == missing:
                    raise ValueError(
                        "Some deducible missing states have not "
                        "been imputed."
                    )

        # Compute _K_non_missing: total characters minus missing ones.
        self._K_non_missing = {tree.root: tree.n_character}
        for node in tree.nodes:
            n_missing = tree.get_character_states(node).count(missing)
            self._K_non_missing[node] = tree.n_character - n_missing

        # Validate monotonicity of K_non_missing: a child never has more
        # non-missing states than its parent.
        for parent, child in tree.edges:
            if self._K_non_missing[child] > self._K_non_missing[parent]:
                raise ValueError(
                    "The number of missing states is not " "monotone."
                )
示例#22
0
 def test_on_simulated_data(self, name, solver):
     """
     Run the estimator on data simulated under the correct model and check
     that the estimates land close to the ground truth.
     """
     topology = nx.DiGraph()
     topology.add_nodes_from([str(node) for node in range(7)])
     topology.add_edges_from(
         [
             ("0", "1"),
             ("0", "2"),
             ("1", "3"),
             ("1", "4"),
             ("2", "5"),
             ("2", "6"),
         ]
     )
     tree = CassiopeiaTree(tree=topology)

     # Ground-truth node times used for the simulation.
     true_times = {
         "0": 0, "1": 0.1, "2": 0.9, "3": 1.0, "4": 1.0, "5": 1.0, "6": 1.0
     }
     tree.set_times(true_times)

     # Fix the seed before simulating so the run is reproducible.
     np.random.seed(1)
     simulator = Cas9LineageTracingDataSimulator(
         number_of_cassettes=200,
         size_of_cassette=1,
         mutation_rate=1.5,
     )
     simulator.overlay_data(tree)

     model = IIDExponentialMLE(minimum_branch_length=1e-4, solver=solver)
     model.estimate_branch_lengths(tree)

     # Open intervals each estimated node time has to fall into.
     time_bounds = {
         "1": (0.05, 0.15),
         "2": (0.8, 1.0),
         "3": (0.9, 1.1),
         "4": (0.9, 1.1),
         "5": (0.9, 1.1),
         "6": (0.9, 1.1),
     }
     for node, (low, high) in time_bounds.items():
         self.assertTrue(low < tree.get_time(node) < high)
     self.assertTrue(1.4 < model.mutation_rate < 1.6)
     self.assertAlmostEqual(tree.get_time("0"), 0.0, places=3)
示例#23
0
    def test_inter_cluster_distance_basic(self):
        """Inter-cluster distances from the tree's own meta data.

        The clusters come from the `CellType` column of the tree's cell meta.
        Asking for the `nUMI` column instead is expected to raise a
        CassiopeiaError.
        """
        weighted_edges = [
            ("F", "A", 0.1),
            ("F", "B", 0.2),
            ("F", "E", 0.5),
            ("E", "C", 0.3),
            ("E", "D", 0.4),
        ]
        topology = nx.DiGraph()
        topology.add_nodes_from(["A", "B", "C", "D", "E", "F"])
        for parent, child, length in weighted_edges:
            topology.add_edge(parent, child, length=length)

        leaf_annotations = {
            "A": ["TypeA", 10],
            "B": ["TypeA", 5],
            "C": ["TypeB", 3],
            "D": ["TypeB", 22],
        }
        meta_data = pd.DataFrame.from_dict(
            leaf_annotations,
            orient="index",
            columns=["CellType", "nUMI"],
        )

        tree = CassiopeiaTree(tree=topology, cell_meta=meta_data)

        inter_cluster_distances = data_utilities.compute_inter_cluster_distances(
            tree, meta_item="CellType")

        clusters = ["TypeA", "TypeB"]
        expected_distances = pd.DataFrame.from_dict(
            {
                "TypeA": [0.15, 1.0],
                "TypeB": [1.0, 0.35]
            },
            orient="index",
            columns=clusters,
        )

        pd.testing.assert_frame_equal(expected_distances,
                                      inter_cluster_distances)

        with self.assertRaises(CassiopeiaError):
            data_utilities.compute_inter_cluster_distances(tree, "nUMI")
示例#24
0
 def test_small_tree_with_one_mutation(self, name, solver):
     """
     Perfect binary tree with one mutation at a node 6: Should give very
     short edges 1->3,1->4,0->2.
     The problem can be solved by hand: it trivially reduces to a
     1-dimensional problem:
         min_{r * t0} 2 * log(exp(-r * t0)) + log(1 - exp(-r * t0))
     The solution is r * t0 = ln(1.5) ~ 0.405
     (Note that because the depth of the tree is fixed to 1, r * t0 = r * 1
     is the mutation rate.)
     """
     topology = nx.DiGraph()
     topology.add_nodes_from([str(node) for node in range(7)])
     topology.add_edges_from(
         [
             ("0", "1"),
             ("0", "2"),
             ("1", "3"),
             ("1", "4"),
             ("2", "5"),
             ("2", "6"),
         ]
     )
     tree = CassiopeiaTree(tree=topology)

     # Every node is unmutated except leaf 6.
     character_states = {str(node): [0] for node in range(6)}
     character_states["6"] = [1]
     tree.set_all_character_states(character_states)

     # Need to make minimum_branch_length be epsilon or else SCS fails...
     model = IIDExponentialMLE(minimum_branch_length=1e-4, solver=solver)
     model.estimate_branch_lengths(tree)

     expected_lengths = {
         ("0", "1"): 1.0,
         ("0", "2"): 0.0,
         ("1", "3"): 0.0,
         ("1", "4"): 0.0,
         ("2", "5"): 1.0,
         ("2", "6"): 1.0,
     }
     for (parent, child), expected in expected_lengths.items():
         self.assertAlmostEqual(
             tree.get_branch_length(parent, child), expected, places=3
         )
     self.assertAlmostEqual(model.log_likelihood, -1.910, places=3)
     self.assertAlmostEqual(model.mutation_rate, np.log(1.5), places=3)
示例#25
0
def fitch_hartigan(
    cassiopeia_tree: CassiopeiaTree,
    meta_item: str,
    root: Optional[str] = None,
    state_key: str = "S1",
    label_key: str = "label",
    copy: bool = False,
) -> Optional[CassiopeiaTree]:
    """Run the full Fitch-Hartigan small-parsimony procedure.

    Chains the two phases of the algorithm: the bottom-up pass
    (`fitch_hartigan_bottom_up`), which stores candidate ancestral state
    sets under `state_key`, followed by the top-down refinement
    (`fitch_hartigan_top_down`), which stores the chosen assignment under
    `label_key`. According to the original documentation, the returned
    assignment is a random solution among those satisfying the
    maximum-parsimony criterion.

    The tree is modified in place unless `copy=True`.

    Args:
        cassiopeia_tree: CassiopeiaTree whose cell meta contains `meta_item`.
            The bottom-up pass is run by this function, so the tree does not
            need to be pre-processed.
        meta_item: A column in the CassiopeiaTree cell meta corresponding to a
            categorical variable.
        root: Root from which to begin the top-down refinement; it is
            forwarded unchanged to `fitch_hartigan_top_down`.
        state_key: Attribute key under which the bottom-up ancestral state
            sets are stored and from which the top-down pass reads them.
        label_key: Attribute key that will hold the maximum-parsimony
            assignment produced by the top-down refinement.
        copy: If True, operate on (and return) a copy of the tree instead of
            modifying the input.

    Returns:
        The annotated copy of the tree if `copy` is True, else None.
    """
    if copy:
        cassiopeia_tree = cassiopeia_tree.copy()

    # Phase 1: candidate state sets; phase 2: pick one concrete labelling.
    fitch_hartigan_bottom_up(cassiopeia_tree, meta_item, state_key)
    fitch_hartigan_top_down(cassiopeia_tree, root, state_key, label_key)

    if copy:
        return cassiopeia_tree
    return None
示例#26
0
def compute_phylogenetic_weight_matrix(
    tree: CassiopeiaTree,
    inverse: bool = False,
    inverse_fn: Callable[[Union[int, float]], float] = lambda x: 1 / x,
) -> pd.DataFrame:
    """Computes the phylogenetic weight matrix.

    Computes the distances between all pairs of leaves in a tree, using one
    call to `tree.get_distances` per leaf. The user has the option to return
    the inverse matrix (i.e., transform distances to proximities) and to
    specify an appropriate inverse function.

    The matrix is assembled in a plain NumPy array and wrapped in a DataFrame
    once at the end. This avoids writing through `DataFrame.values`, which is
    not guaranteed to be a writable view (it is read-only under pandas
    Copy-on-Write), and avoids one label-based `.loc` write per pair.

    Args:
        tree: CassiopeiaTree
        inverse: Convert distances to proximities
        inverse_fn: Inverse function (default = 1 / x). It is only applied to
            strictly positive distances; any other distance becomes `np.inf`.

    Returns:
        An NxN phylogenetic weight matrix, indexed by leaf name on both axes,
        with zeros on the diagonal.
    """
    leaves = list(tree.leaves)
    position = {leaf: i for i, leaf in enumerate(leaves)}
    weights = np.zeros((len(leaves), len(leaves)))

    for leaf1 in leaves:
        row = position[leaf1]
        distances = tree.get_distances(leaf1, leaves_only=True)
        for leaf2, distance in distances.items():
            if inverse:
                # Guard against division by zero for coincident leaves.
                distance = inverse_fn(distance) if distance > 0 else np.inf

            col = position[leaf2]
            weights[row, col] = weights[col, row] = distance

    # A leaf's weight to itself is always 0, even when `inverse` mapped the
    # zero self-distance to infinity above.
    np.fill_diagonal(weights, 0)

    return pd.DataFrame(weights, index=leaves, columns=leaves)
示例#27
0
 def test_subtree_collapses_when_no_mutations(self, name, solver):
     """
     A subtree without any new mutations should collapse to length 0.

     Node "1" and both of its children "2" and "3" carry the same mutated
     state, so the edges 1->2 and 1->3 are expected to have (almost) zero
     length. Per the original note, this reduces the problem to the one in
     'test_hand_solvable_problem_1'. The asserted values are consistent
     with maximizing log(exp(-x)) + log(1 - exp(-x)) over x, i.e.
     x = ln(2) and a log-likelihood of 2 * log(1/2) ~ -1.386.
     """
     node_names = ["0", "1", "2", "3", "4"]
     mutated_nodes = {"1", "2", "3"}

     topology = nx.DiGraph()
     topology.add_nodes_from(node_names)
     topology.add_edges_from(
         [("0", "1"), ("1", "2"), ("1", "3"), ("0", "4")]
     )
     tree = CassiopeiaTree(tree=topology)
     tree.set_all_character_states(
         {node: [int(node in mutated_nodes)] for node in node_names}
     )

     model = IIDExponentialMLE(minimum_branch_length=1e-4, solver=solver)
     model.estimate_branch_lengths(tree)

     self.assertAlmostEqual(model.log_likelihood, -1.386, places=3)
     expected_lengths = {
         ("0", "1"): 1.0,
         ("1", "2"): 0.0,
         ("1", "3"): 0.0,
         ("0", "4"): 1.0,
     }
     for (parent, child), expected in expected_lengths.items():
         self.assertAlmostEqual(
             tree.get_branch_length(parent, child), expected, places=3
         )
     self.assertAlmostEqual(model.mutation_rate, np.log(2), places=3)
示例#28
0
def fitch_hartigan_bottom_up(
    cassiopeia_tree: CassiopeiaTree,
    meta_item: str,
    add_key: str = "S1",
    copy: bool = False,
) -> Optional[CassiopeiaTree]:
    """Performs Fitch-Hartigan bottom-up ancestral reconstruction.

    Performs the bottom-up phase of the Fitch-Hartigan small parsimony
    algorithm. An attribute named `add_key` (by default "S1") is set on every
    node: for a leaf it is a one-element list holding the leaf's value of
    `meta_item`; for an internal node it is the array of states that occur
    most frequently across the state sets of its children. If copy is False,
    the tree will be modified in place.

    Each node's attribute is written exactly once. A node with a single child
    needs no special handling: every state of the child occurs once, so the
    most-frequent rule simply passes the child's state set up.

    Args:
        cassiopeia_tree: CassiopeiaTree object with cell meta data.
        meta_item: A column in the CassiopeiaTree cell meta corresponding to a
            categorical variable.
        add_key: Key to add for bottom-up reconstruction
        copy: Modify the tree in place or not.

    Returns:
        A new CassiopeiaTree if the copy is set to True, else None.

    Raises:
        CassiopeiaError if the tree does not have the specified meta data
            or the meta data is not categorical.
    """

    if meta_item not in cassiopeia_tree.cell_meta.columns:
        raise CassiopeiaError(
            "Meta item does not exist in the cassiopeia tree")

    meta = cassiopeia_tree.cell_meta[meta_item]

    if is_numeric_dtype(meta):
        raise CassiopeiaError("Meta item is not a categorical variable.")

    if not is_categorical_dtype(meta):
        meta = meta.astype("category")

    if copy:
        cassiopeia_tree = cassiopeia_tree.copy()

    # Relies on the traversal yielding children before their parents, so that
    # every child's state set exists by the time the parent is processed.
    for node in cassiopeia_tree.depth_first_traverse_nodes():

        if cassiopeia_tree.is_leaf(node):
            cassiopeia_tree.set_attribute(node, add_key, [meta.loc[node]])
            continue

        # Pool the candidate states of all children and keep the ones that
        # are supported by the largest number of children.
        child_labels = np.concatenate([
            cassiopeia_tree.get_attribute(child, add_key)
            for child in cassiopeia_tree.children(node)
        ])
        states, frequencies = np.unique(child_labels, return_counts=True)
        most_frequent = states[frequencies == frequencies.max()]
        cassiopeia_tree.set_attribute(node, add_key, most_frequent)

    return cassiopeia_tree if copy else None
示例#29
0
def _C_fitch_count(
    cassiopeia_tree: CassiopeiaTree,
    N: np.array,
    unique_states: List[str],
    node_to_i: Dict[str, int],
    label_to_j: Dict[str, int],
    state_key: str = "S1",
) -> np.array(int):
    """Fill in the dynamic programming table C for FitchCount.
    
    Computes C[v, s, s1, s2], the number of transitions from state s1 to
    state s2 in the subtree rooted at v, given that state v takes on the
    state s. 

    Args:
        cassiopeia_tree: CassiopeiaTree object
        N: N array computed during FitchCount storing the number of solutions
            below a node v given v takes on state s
        unique_states: The state space that a node can take on
        node_to_i: Helper array storing a mapping of each node to a unique
            integer
        label_to_j: Helper array storing a mapping of each unique state in the
            state space to a unique integer
        state_key: Attribute name in the CassiopeiaTree storing the possible
            states for each node, as inferred with the Fitch-Hartigan algorithm

    Returns:
        A 4-dimensional array storing C[v, s, s1, s2] - the number of
            transitions from state s1 to s2 below a node v given v takes on
            the state s.
    """
    def _fill(v: str, s: str, s1: str, s2: str) -> int:
        """Helper function to fill in a single entry in C."""

        if cassiopeia_tree.is_leaf(v):
            return 0

        children = cassiopeia_tree.children(v)
        A = np.zeros((len(children)))
        LS = [[]] * len(children)

        for i, u in zip(range(len(children)), children):
            if s in cassiopeia_tree.get_attribute(u, state_key):
                LS[i] = [s]
            else:
                LS[i] = cassiopeia_tree.get_attribute(u, state_key)

            A[i] = np.sum([
                C[node_to_i[u], label_to_j[sp], label_to_j[s1],
                  label_to_j[s2], ] for sp in LS[i]
            ])

            if s1 == s and s2 in LS[i]:
                A[i] += N[node_to_i[u], label_to_j[s2]]

        parts = []
        for i, u in zip(range(len(children)), children):
            prod = 1

            for k, up in zip(range(len(children)), children):
                fact = 0
                if up == u:
                    continue
                for sp in LS[k]:
                    fact += N[node_to_i[up], label_to_j[sp]]
                prod *= fact

            part = A[i] * prod
            parts.append(part)

        return np.sum(parts)

    C = np.zeros(
        (len(cassiopeia_tree.nodes), N.shape[1], N.shape[1], N.shape[1]))

    for n in cassiopeia_tree.depth_first_traverse_nodes():
        for s in cassiopeia_tree.get_attribute(n, state_key):
            for (s1, s2) in itertools.product(unique_states, repeat=2):
                C[node_to_i[n], label_to_j[s], label_to_j[s1],
                  label_to_j[s2]] = _fill(n, s, s1, s2)

    return C
示例#30
0
def fitch_count(
    cassiopeia_tree: CassiopeiaTree,
    meta_item: str,
    root: Optional[str] = None,
    infer_ancestral_states: bool = True,
    state_key: str = "S1",
    unique_states: Optional[List[str]] = None,
) -> pd.DataFrame:
    """Runs the FitchCount algorithm.

    Performs the FitchCount algorithm for inferring the number of times that
    two states transition to one another across all equally-parsimonious
    solutions returned by the Fitch-Hartigan algorithm. The original algorithm
    was described in Quinn, Jones, et al, Science (2021). The output is an
    MxM count matrix, where the values indicate the number of times that
    m1 transitioned to m2 along an edge in a Fitch-Hartigan solution.
    To obtain probabilities P(m1 -> m2), divide each row by its row-sum.

    This procedure will only work on categorical data and will otherwise raise
    an error.

    The input tree is never modified: all work is done on a copy.

    Args:
        cassiopeia_tree: CassiopeiaTree object with a tree and cell meta data.
        meta_item: A column in the CassiopeiaTree cell meta corresponding to a
            categorical variable.
        root: Node to treat as the root. Only the subtree below this node will
            be considered for the procedure. If None (the default) or equal
            to the tree's root, the whole tree is used.
        infer_ancestral_states: Whether or not to initialize the ancestral state
            sets with Fitch-Hartigan.
        state_key: If ancestral state sets have already been created, then this
            argument specifies what the attribute name is in the CassiopeiaTree
        unique_states: State space that can be optionally provided by the user.
            If this is not provided, we take the unique values in
            `cell_meta[meta_item]` to be the state space.

    Returns:
        An MxM count matrix indicating the number of edges that contained a
            transition between two states across all equally parsimonious
            solutions returned by Fitch-Hartigan.

    Raises:
        FitchCountError if `unique_states` is given but does not contain every
            state that appears in `cell_meta[meta_item]`.
    """
    cassiopeia_tree = cassiopeia_tree.copy()

    observed_states = cassiopeia_tree.cell_meta[meta_item].unique()
    if unique_states is None:
        unique_states = observed_states
    elif len(np.setdiff1d(observed_states, unique_states)) > 0:
        raise FitchCountError("Specified state space does not span the set"
                              " of states that appear in the meta data.")

    # Only restrict to a clade when a root was actually requested; the default
    # `None` means "use the whole tree" and must not be passed on as a node.
    if root is not None and root != cassiopeia_tree.root:
        cassiopeia_tree.subset_clade(root)

    if infer_ancestral_states:
        fitch_hartigan_bottom_up(cassiopeia_tree, meta_item, add_key=state_key)

    # create mapping from nodes to integers (root first, then the child end
    # of every edge in breadth-first order)
    bfs_order = [cassiopeia_tree.root]
    bfs_order.extend(
        child for _, child in cassiopeia_tree.breadth_first_traverse_edges())

    node_to_i = {node: i for i, node in enumerate(bfs_order)}
    label_to_j = {state: j for j, state in enumerate(unique_states)}

    N = _N_fitch_count(cassiopeia_tree, unique_states, node_to_i, label_to_j,
                       state_key)

    C = _C_fitch_count(cassiopeia_tree, N, unique_states, node_to_i,
                       label_to_j, state_key)

    # create count matrix: for every (s1, s2), sum the root's entries over all
    # states the root may take. `label_to_j` numbers the states in the order
    # of `unique_states`, so the summed array is already aligned with the
    # row/column labels.
    root_counts = C[node_to_i[cassiopeia_tree.root]].sum(axis=0)

    return pd.DataFrame(
        root_counts, index=unique_states, columns=unique_states)