Example #1
    def test_lca_characters_ambiguous(self):
        vecs = [
            [(1, 1), (0, 2), (3,), (4,), (5,)],
            [1, -1, -1, 3, -1],
            [1, 2, 3, 2, -1],
        ]
        ret_vec = data_utilities.get_lca_characters(
            vecs, missing_state_indicator=-1)
        self.assertEqual(ret_vec, [1, 0, 3, 0, 5])
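The expected vector follows from a simple union rule. Below is a minimal, self-contained sketch of that rule, a simplified stand-in for data_utilities.get_lca_characters rather than the library's implementation: per character, take the union of observed states (expanding ambiguous tuples), drop the missing indicator, and keep a state only when exactly one remains.

# A simplified stand-in for data_utilities.get_lca_characters (an
# assumption-level sketch, not the library code), reproducing the rule
# this test exercises.
from typing import List, Tuple, Union

State = Union[int, Tuple[int, ...]]


def lca_characters_sketch(
    vecs: List[List[State]], missing_state_indicator: int = -1
) -> List[int]:
    """Resolve each character to the one state all vectors agree on, else 0."""
    lca = []
    for states_at_k in zip(*vecs):
        observed = set()
        for state in states_at_k:
            # ambiguous entries are tuples and contribute all their candidates
            observed.update(state if isinstance(state, tuple) else (state,))
        observed.discard(missing_state_indicator)
        if len(observed) == 1:
            lca.append(observed.pop())
        elif not observed:
            lca.append(missing_state_indicator)  # every vector was missing
        else:
            lca.append(0)  # disagreement resolves to the unmutated state
    return lca


vecs = [
    [(1, 1), (0, 2), (3,), (4,), (5,)],
    [1, -1, -1, 3, -1],
    [1, 2, 3, 2, -1],
]
print(lca_characters_sketch(vecs))  # [1, 0, 3, 0, 5]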
Example #2
    def assess_cutoff(
        self,
        samples: List[str],
        character_matrix: pd.DataFrame,
        missing_state_indicator: int = -1,
    ) -> bool:
        """Assesses samples with respect to hybrid cutoff.

        Args:
            samples: A list of samples in a clade.
            character_matrix: Character matrix.
            missing_state_indicator: Indicator for missing data.

        Returns:
            True if the cutoff is reached, False if not.
        """

        if self.cell_cutoff is None:
            root_states = data_utilities.get_lca_characters(
                character_matrix.loc[samples].values.tolist(),
                missing_state_indicator,
            )

            lca_distances = [
                dissimilarity_functions.hamming_distance(
                    np.array(root_states), character_matrix.loc[u].values)
                for u in samples
            ]

            if np.max(lca_distances) <= self.lca_cutoff:
                return True

        else:
            if len(samples) <= self.cell_cutoff:
                return True

        return False
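When cell_cutoff is unset, the method reduces the clade to its LCA character vector and requires every sample to sit within lca_cutoff Hamming distance of it. A self-contained illustration of that criterion follows; the toy matrix, the agreement-only LCA shortcut, and the hamming_distance helper are assumptions for this sketch, not the library's functions.

# A hedged, self-contained illustration of the LCA-distance criterion; the
# data and helpers below are assumptions, not cassiopeia internals.
import numpy as np
import pandas as pd


def hamming_distance(a: np.ndarray, b: np.ndarray) -> int:
    # number of characters at which the two vectors disagree
    return int(np.sum(a != b))


character_matrix = pd.DataFrame(
    [[1, 0, 3], [1, 0, 2], [1, 1, 3]], index=["c1", "c2", "c3"]
)
samples = ["c1", "c2", "c3"]

# agreement-only LCA: a column keeps its state only if all samples share it
root_states = np.array([
    col[0] if len(set(col)) == 1 else 0
    for col in character_matrix.loc[samples].values.T
])  # -> [1, 0, 0]

lca_distances = [
    hamming_distance(root_states, character_matrix.loc[u].values)
    for u in samples
]  # -> [1, 1, 2]

lca_cutoff = 2
print(max(lca_distances) <= lca_cutoff)  # True: every sample is within 2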
Example #3
    def apply_bottom_solver(
        self,
        cassiopeia_tree: CassiopeiaTree,
        root: int,
        samples: List[str],
        logfile: str = "stdout.log",
        layer: Optional[str] = None,
    ) -> Tuple[nx.DiGraph, int]:
        """Apply the bottom solver to subproblems.

        A private method for solving subproblems identified by the top-down
        solver with the more precise bottom solver for this instantiation of
        the HybridSolver. This function will create a unique log file, based
        on the root, set up a new instance of the bottom solver and solve the
        subproblem.

        The function will return a tree for the subproblem and the identifier
        of the root of the tree.

        Args:
            cassiopeia_tree: CassiopeiaTree for the entire dataset. This
                will be subsetted with respect to the samples specified.
            root: Identifier of the root in the master tree.
            samples: A list of samples for which to infer a tree.
            logfile: Base location for logging output. A specific logfile
                will be created from this base logfile name.
            layer: Layer storing the character matrix for solving. If None, the
                default character matrix is used in the CassiopeiaTree.

        Returns:
            A tree in the form of a NetworkX graph and the original root
                identifier.
        """
        if len(samples) == 1:
            subproblem_tree = nx.DiGraph()
            subproblem_tree.add_edge(root, samples[0])
            return subproblem_tree, root

        if layer:
            character_matrix = cassiopeia_tree.layers[layer].copy()
        else:
            character_matrix = cassiopeia_tree.character_matrix.copy()

        subproblem_character_matrix = character_matrix.loc[samples]

        subtree_root = data_utilities.get_lca_characters(
            subproblem_character_matrix.values.tolist(),
            cassiopeia_tree.missing_state_indicator,
        )

        subtree = CassiopeiaTree(
            subproblem_character_matrix,
            missing_state_indicator=cassiopeia_tree.missing_state_indicator,
            priors=cassiopeia_tree.priors,
        )

        self.bottom_solver.solve(subtree, logfile=logfile)

        subproblem_tree = subtree.get_tree_topology()
        subproblem_root = [
            n for n in subproblem_tree if subproblem_tree.in_degree(n) == 0
        ][0]
        subproblem_tree.add_edge(root, subproblem_root)

        return subproblem_tree, root
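The final stitching step is worth isolating: the bottom solver returns a subtree whose root is the unique node with in-degree zero, and that node is hung under the subproblem's root identifier from the master tree. A minimal sketch with a toy topology (the node names below are assumptions):

# A minimal sketch of the stitching step; the toy subtree is an assumption.
import networkx as nx

subproblem_tree = nx.DiGraph()
subproblem_tree.add_edges_from([("lca", "cellA"), ("lca", "cellB")])

master_root = 7  # identifier of this subproblem's root in the master tree

# the solved subtree's root is the unique node with no incoming edges
subproblem_root = [
    n for n in subproblem_tree if subproblem_tree.in_degree(n) == 0
][0]
subproblem_tree.add_edge(master_root, subproblem_root)

print(list(subproblem_tree.edges()))
# [('lca', 'cellA'), ('lca', 'cellB'), (7, 'lca')]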
Example #4
    def update_similarity_map_and_character_matrix(
        self,
        character_matrix: pd.DataFrame,
        similarity_function: Callable[
            [np.array, np.array, int, Dict[int, Dict[int, float]]], float
        ],
        similarity_map: pd.DataFrame,
        cherry: Tuple[str, str],
        new_node: str,
        missing_state_indicator: int = -1,
        weights: Optional[Dict[int, Dict[int, float]]] = None,
    ) -> pd.DataFrame:
        """Update similarity map after finding a cherry.

        Adds the new LCA node into the character matrix with the mutations
        shared by the joined nodes as its character vector. Then, updates the
        similarity matrix by calculating the pairwise similarity between the
        new LCA node and all existing nodes.

        Args:
            character_matrix: Contains the character information for all nodes,
                updated as nodes are joined and new internal LCA nodes are
                added.
            similarity_function: A similarity function.
            similarity_map: A similarity map to update.
            cherry: A tuple of indices in the similarity map that are joining.
            new_node: New node name, to be added to the updated similarity map.
            missing_state_indicator: Character representing missing data.
            weights: Weighting of each (character, state) pair. Typically a
                transformation of the priors.

        Returns:
            A new similarity map, updated with the new node
        """

        character_i, character_j = (
            np.where(character_matrix.index == cherry[0])[0][0],
            np.where(character_matrix.index == cherry[1])[0][0],
        )

        character_array = character_matrix.to_numpy(copy=True)
        similarity_array = similarity_map.to_numpy()
        i_characters = character_array[character_i, :]
        j_characters = character_array[character_j, :]
        lca = data_utilities.get_lca_characters(
            [i_characters, j_characters], missing_state_indicator
        )
        character_matrix.loc[new_node] = lca

        similarity_array_updated = self.__update_similarity_map(
            character_array,
            similarity_array,
            np.array(lca),
            similarity_function,
            missing_state_indicator,
            weights,
        )

        sample_names = list(similarity_map.index) + [new_node]

        similarity_map = pd.DataFrame(
            similarity_array_updated, index=sample_names, columns=sample_names
        )

        # drop out cherry from similarity map and character matrix
        similarity_map.drop(
            columns=[cherry[0], cherry[1]],
            index=[cherry[0], cherry[1]],
            inplace=True,
        )

        character_matrix.drop(index=[cherry[0], cherry[1]], inplace=True)

        return similarity_map
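The frame bookkeeping here reduces to three moves: append the cherry's LCA as a new character-matrix row, grow the similarity map by one row and column, and drop the joined pair from both. A hedged sketch with toy values; the matrices, similarity values, and hard-coded LCA below are assumptions for illustration.

# A hedged sketch of the add-then-drop bookkeeping; all values are toy
# assumptions, and the LCA row is hard-coded rather than computed.
import pandas as pd

character_matrix = pd.DataFrame(
    [[1, 0], [1, 2], [3, 2]], index=["a", "b", "c"]
)
similarity_map = pd.DataFrame(
    [[0, 1, 0], [1, 0, 1], [0, 1, 0]],
    index=["a", "b", "c"], columns=["a", "b", "c"],
)
cherry, new_node = ("a", "b"), "node0"

# the joined pair's LCA becomes the new node's character vector:
# [1, 0] and [1, 2] agree only at the first character, so the LCA is [1, 0]
character_matrix.loc[new_node] = [1, 0]

# grow the similarity map symmetrically with the new node's similarities
similarity_map.loc[new_node] = [0, 0, 1]
similarity_map[new_node] = [0, 0, 1, 0]

# drop the cherry from both frames, as in the method above
similarity_map = similarity_map.drop(index=list(cherry), columns=list(cherry))
character_matrix = character_matrix.drop(index=list(cherry))

print(similarity_map)    # only "c" and "node0" remain
print(character_matrix)  # rows: "c" and "node0"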
Example #5
    def solve(
        self,
        cassiopeia_tree: CassiopeiaTree,
        layer: Optional[str] = None,
        collapse_mutationless_edges: bool = False,
        logfile: str = "stdout.log",
    ):
        """Infers a tree with Cassiopeia-ILP.

        Solves a tree using the Cassiopeia-ILP algorithm and populates a tree
        in the provided CassiopeiaTree.

        Args:
            cassiopeia_tree: Input CassiopeiaTree
            layer: Layer storing the character matrix for solving. If None, the
                default character matrix is used in the CassiopeiaTree.
            collapse_mutationless_edges: Indicates if the final reconstructed
                tree should collapse mutationless edges based on internal states
                inferred by Camin-Sokal parsimony. In scoring accuracy, this
                removes artifacts caused by arbitrarily resolving polytomies.
            logfile: Location to log progress.
        """

        if self.weighted and not cassiopeia_tree.priors:
            raise ILPSolverError(
                "Specify prior probabilities in the CassiopeiaTree for weighted"
                " analysis.")

        # set up logfile config
        handler = logging.FileHandler(logfile)
        handler.setLevel(logging.INFO)
        logger.addHandler(handler)
        logger.info("Solving tree with the following parameters.")
        logger.info(f"Convergence time limit: {self.convergence_time_limit}")
        logger.info(
            f"Convergence iteration limit: {self.convergence_iteration_limit}")
        logger.info(
            f"Max potential graph layer size: {self.maximum_potential_graph_layer_size}"
        )
        logger.info(
            f"Max potential graph lca distance: {self.maximum_potential_graph_lca_distance}"
        )
        logger.info(f"MIP gap: {self.mip_gap}")

        if layer:
            character_matrix = cassiopeia_tree.layers[layer].copy()
        else:
            character_matrix = cassiopeia_tree.character_matrix.copy()
        if any(
                is_ambiguous_state(state)
                for state in character_matrix.values.flatten()):
            raise ILPSolverError("Solver does not support ambiguous states.")

        unique_character_matrix = character_matrix.drop_duplicates()

        weights = None
        if cassiopeia_tree.priors:
            weights = solver_utilities.transform_priors(
                cassiopeia_tree.priors, self.prior_transformation)

        # find the root of the tree & generate process ID
        root = tuple(
            data_utilities.get_lca_characters(
                unique_character_matrix.values.tolist(),
                cassiopeia_tree.missing_state_indicator,
            ))

        logger.info(f"Phylogenetic root: {root}")

        pid = hashlib.md5(
            "|".join(str(r) for r in root).encode("utf-8")
        ).hexdigest()

        targets = [tuple(t) for t in unique_character_matrix.values.tolist()]

        if unique_character_matrix.shape[0] == 1:
            optimal_solution = nx.DiGraph()
            optimal_solution.add_node(root)
            optimal_solution = (
                self.__append_sample_names_and_remove_spurious_leaves(
                    optimal_solution, character_matrix))
            cassiopeia_tree.populate_tree(optimal_solution, layer=layer)
            return

        # determine diameter of the dataset by evaluating maximum distance to
        # the root from each sample
        if (self.maximum_potential_graph_lca_distance is not None) and (
                self.maximum_potential_graph_lca_distance > 0):
            max_lca_distance = self.maximum_potential_graph_lca_distance

        else:
            max_lca_distance = 0
            lca_distances = [
                dissimilarity_functions.hamming_distance(
                    root,
                    np.array(u),
                    ignore_missing_state=True,
                    missing_state_indicator=cassiopeia_tree.missing_state_indicator,
                ) for u in targets
            ]

            for (i, j) in itertools.combinations(range(len(lca_distances)), 2):
                max_lca_distance = max(max_lca_distance,
                                       lca_distances[i] + lca_distances[j] + 1)

        # infer the potential graph
        potential_graph = self.infer_potential_graph(
            unique_character_matrix,
            pid,
            max_lca_distance,
            weights,
            cassiopeia_tree.missing_state_indicator,
        )

        # generate Steiner Tree ILP model
        nodes = list(potential_graph.nodes())
        encoder = dict(zip(nodes, list(range(len(nodes)))))
        decoder = dict((v, k) for k, v in encoder.items())

        _potential_graph = nx.relabel_nodes(potential_graph, encoder)
        _targets = list(map(lambda x: encoder[x], targets))
        _root = encoder[root]

        model, edge_variables = self.generate_steiner_model(
            _potential_graph, _root, _targets)

        # solve the ILP problem and return a set of proposed solutions
        proposed_solutions = self.solve_steiner_instance(
            model, edge_variables, _potential_graph, pid, logfile)

        # select the best model and post-process the solution
        optimal_solution = proposed_solutions[0]
        optimal_solution = nx.relabel_nodes(optimal_solution, decoder)

        optimal_solution = self.post_process_steiner_solution(
            optimal_solution, root)

        # append sample names to the solution and populate the tree
        optimal_solution = (
            self.__append_sample_names_and_remove_spurious_leaves(
                optimal_solution, character_matrix))

        cassiopeia_tree.populate_tree(optimal_solution, layer=layer)

        # rename internal nodes such that they are not tuples
        node_name_generator = solver_utilities.node_name_generator()
        internal_node_rename = {}
        for i in cassiopeia_tree.internal_nodes:
            internal_node_rename[i] = next(node_name_generator)
        cassiopeia_tree.relabel_nodes(internal_node_rename)

        cassiopeia_tree.collapse_unifurcations()

        # collapse mutationless edges
        if collapse_mutationless_edges:
            cassiopeia_tree.collapse_mutationless_edges(
                infer_ancestral_characters=True)
        logger.removeHandler(handler)
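When no explicit maximum_potential_graph_lca_distance is given, the diameter bound is derived from per-sample root distances: for every pair of targets the candidate bound is d(u, root) + d(v, root) + 1, and the maximum over all pairs is kept. A self-contained sketch of that computation; the root and targets are assumed toy values, and missing-state handling is omitted for brevity.

# A hedged sketch of the diameter-bound computation; root and targets are
# assumptions, not data from the library.
import itertools

import numpy as np

root = (1, 0, 0, 0)
targets = [(1, 0, 3, 4), (1, 2, 0, 4), (1, 0, 3, 0)]


def hamming(u, v):
    return int(np.sum(np.array(u) != np.array(v)))


lca_distances = [hamming(root, u) for u in targets]  # [2, 2, 1]

max_lca_distance = 0
for i, j in itertools.combinations(range(len(lca_distances)), 2):
    max_lca_distance = max(
        max_lca_distance, lca_distances[i] + lca_distances[j] + 1
    )

print(max_lca_distance)  # 5: the worst pair contributes 2 + 2 + 1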
Example #6
    def test_lca_characters(self):
        vecs = [[1, 0, 3, 4, 5], [1, -1, -1, 3, -1], [1, 2, 3, 2, -1]]
        ret_vec = data_utilities.get_lca_characters(
            vecs, missing_state_indicator=-1)
        self.assertEqual(ret_vec, [1, 0, 3, 0, 5])
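The same union rule explains each entry of the expected vector; below is a short hedged trace in plain Python, not the library call itself.

# Per-character trace: union the observed states, drop the missing indicator,
# and keep the state only when exactly one remains (otherwise 0).
vecs = [[1, 0, 3, 4, 5], [1, -1, -1, 3, -1], [1, 2, 3, 2, -1]]
for k, states in enumerate(zip(*vecs)):
    observed = {s for s in states if s != -1}
    print(k, observed, observed.pop() if len(observed) == 1 else 0)
# the last column prints 1, 0, 3, 0, 5, matching the assertion above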
Example #7
    def percolate(
        self,
        character_matrix: pd.DataFrame,
        samples: List[str],
        priors: Optional[Dict[int, Dict[int, float]]] = None,
        weights: Optional[Dict[int, Dict[int, float]]] = None,
        missing_state_indicator: int = -1,
    ) -> Tuple[List[str], List[str]]:
        """The function used by the percolation algorithm to partition the
        set of samples in two.
        First, a pairwise similarity graph is generated with samples as nodes
        such that edges between a pair of nodes is some provided function on
        the number of character/state mutations shared. Then, the algorithm
        removes the minimum edge (in the case of ties all are removed) until
        the graph is split into multiple connected components. If there are more
        than two connected components, the procedure joins them until two remain.
        This is done by inferring the mutations of the LCA of each sample set
        obeying Camin-Sokal Parsimony, and then clustering the groups of samples
        based on their LCAs. The provided solver is used to cluster the groups
        into two clusters.
        Args:
            character_matrix: Character matrix
            samples: A list of samples to partition
            priors: A dictionary storing the probability of each character
                mutating to a particular state.
            weights: Weighting of each (character, state) pair. Typically a
                transformation of the priors.
            missing_state_indicator: Character representing missing data.
        Returns:
            A tuple of lists, representing the left and right partition groups
        """
        sample_indices = solver_utilities.convert_sample_names_to_indices(
            character_matrix.index, samples)
        unique_character_array = character_matrix.to_numpy()

        G = nx.Graph()
        G.add_nodes_from(sample_indices)

        # Add edge weights into the similarity graph
        edge_weight_buckets = defaultdict(list)
        for i, j in itertools.combinations(sample_indices, 2):
            similarity = self.similarity_function(
                unique_character_array[i, :],
                unique_character_array[j, :],
                missing_state_indicator,
                weights,
            )
            if similarity > self.threshold:
                edge_weight_buckets[similarity].append((i, j))
                G.add_edge(i, j)

        if len(G.edges) == 0:
            return samples, []

        connected_components = list(nx.connected_components(G))
        sorted_edge_weights = sorted(edge_weight_buckets, reverse=True)

        # Percolate the similarity graph by continuously removing the minimum
        # edge until at least two components exist
        while len(connected_components) <= 1:
            min_weight = sorted_edge_weights.pop()
            for edge in edge_weight_buckets[min_weight]:
                G.remove_edge(edge[0], edge[1])
            connected_components = list(nx.connected_components(G))

        # If the number of connected components > 2, merge components by
        # joining the most similar LCAs of each component until
        # only 2 remain
        partition_sides = []

        if len(connected_components) > 2:
            for c in range(len(connected_components)):
                connected_components[c] = list(connected_components[c])
            lcas = {}
            component_to_nodes = {}
            # Find the LCA of the nodes in each connected component
            for ind in range(len(connected_components)):
                component_identifier = "component" + str(ind)
                component_to_nodes[
                    component_identifier] = connected_components[ind]
                character_vectors = [
                    list(i) for i in list(unique_character_array[
                        connected_components[ind], :])
                ]
                lcas[component_identifier] = data_utilities.get_lca_characters(
                    character_vectors, missing_state_indicator)
            # Build a tree on the LCA characters to cluster the components
            lca_tree = CassiopeiaTree(
                pd.DataFrame.from_dict(lcas, orient="index"),
                missing_state_indicator=missing_state_indicator,
                priors=priors,
            )

            self.joining_solver.solve(lca_tree,
                                      collapse_mutationless_edges=False)
            grouped_components = []

            # Take the split at the root as the clusters of components
            # in the split, ignoring unifurcations
            current_node = lca_tree.root
            while len(grouped_components) == 0:
                successors = lca_tree.children(current_node)
                if len(successors) == 1:
                    current_node = successors[0]
                else:
                    for i in successors:
                        grouped_components.append(
                            lca_tree.leaves_in_subtree(i))

            # For each component in each cluster, take the nodes in that
            # component to form the final split
            for cluster in grouped_components:
                sample_index_group = []
                for component in cluster:
                    sample_index_group.extend(component_to_nodes[component])
                partition_sides.append(sample_index_group)
        else:
            for c in range(len(connected_components)):
                partition_sides.append(list(connected_components[c]))

        # Convert from component indices back to the sample names in the
        # original character matrix
        sample_names = list(character_matrix.index)
        partition_named = []
        for sample_index_group in partition_sides:
            sample_name_group = []
            for sample_index in sample_index_group:
                sample_name_group.append(sample_names[sample_index])
            partition_named.append(sample_name_group)

        return partition_named[0], partition_named[1]
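The core of the method is the percolation loop: edges are bucketed by weight, and whole buckets are removed from lightest to heaviest until the similarity graph disconnects. A self-contained sketch of that loop on an assumed toy graph:

# A minimal sketch of the percolation loop above; the toy chain graph and
# its edge weights are assumptions for illustration.
from collections import defaultdict

import networkx as nx

G = nx.Graph()
edge_weight_buckets = defaultdict(list)
for u, v, w in [(0, 1, 3), (1, 2, 3), (2, 3, 1), (3, 4, 2), (4, 5, 2)]:
    G.add_edge(u, v)
    edge_weight_buckets[w].append((u, v))

sorted_edge_weights = sorted(edge_weight_buckets, reverse=True)  # [3, 2, 1]

connected_components = list(nx.connected_components(G))
while len(connected_components) <= 1:
    min_weight = sorted_edge_weights.pop()  # lightest remaining bucket
    for edge in edge_weight_buckets[min_weight]:
        G.remove_edge(*edge)
    connected_components = list(nx.connected_components(G))

print(connected_components)  # [{0, 1, 2}, {3, 4, 5}]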