def solve(
    self,
    cassiopeia_tree: CassiopeiaTree,
    layer: Optional[str] = None,
    collapse_mutationless_edges: bool = False,
    logfile: str = "stdout.log",
) -> None:
    """The general hybrid solver routine.

    The hybrid solver proceeds by clustering together cells using the
    algorithm stored in the top_solver until a criteria is reached. Once
    this criteria is reached, the bottom_solver is applied to each
    subproblem left over from the "greedy" clustering. The resulting
    subtrees are stitched onto the top-level tree, and the result is
    populated into the provided CassiopeiaTree in place.

    Args:
        cassiopeia_tree: CassiopeiaTree that stores the character matrix
            and priors for reconstruction.
        layer: Layer storing the character matrix for solving. If None,
            the default character matrix is used in the CassiopeiaTree.
        collapse_mutationless_edges: Indicates if the final reconstructed
            tree should collapse mutationless edges based on internal
            states inferred by Camin-Sokal parsimony. In scoring accuracy,
            this removes artifacts caused by arbitrarily resolving
            polytomies.
        logfile: Location to log progress. Each subproblem logs to its own
            file, named by taking the text before the first ".log" in this
            path and appending "-<k>.log", where k is the 1-based index of
            the subproblem.
    """
    node_name_generator = solver_utilities.node_name_generator()

    if layer:
        character_matrix = cassiopeia_tree.layers[layer].copy()
    else:
        character_matrix = cassiopeia_tree.character_matrix.copy()
    unique_character_matrix = character_matrix.drop_duplicates()

    weights = None
    if cassiopeia_tree.priors:
        weights = solver_utilities.transform_priors(
            cassiopeia_tree.priors, self.prior_transformation)

    tree = nx.DiGraph()
    # call top-down solver until a desired cutoff is reached.
    _, subproblems, tree = self.apply_top_solver(
        unique_character_matrix,
        list(unique_character_matrix.index),
        tree,
        node_name_generator,
        weights=weights,
        missing_state_indicator=cassiopeia_tree.missing_state_indicator,
    )

    # the logfile prefix is the same for every subproblem, so compute it
    # once rather than once per subproblem.
    logfile_stem = logfile.split('.log')[0]
    bottom_solver_arguments = [
        (
            cassiopeia_tree,
            subproblem[0],
            subproblem[1],
            f"{logfile_stem}-{index}.log",
            layer,
        )
        for index, subproblem in enumerate(subproblems, start=1)
    ]

    # multi-threaded bottom solver approach
    with multiprocessing.Pool(processes=self.threads) as pool:
        results = list(
            tqdm(
                pool.starmap(
                    self.apply_bottom_solver,
                    bottom_solver_arguments,
                ),
                total=len(subproblems),
            ))

    for result in results:
        subproblem_tree, subproblem_root = result[0], result[1]

        # check that the only overlapping name is the root, else
        # add a new name so that we don't get edges across the tree.
        # Membership is tested against the graph itself (a hash lookup)
        # instead of a freshly built list of all of its nodes.
        mapping = {
            node: next(node_name_generator)
            for node in subproblem_tree
            if node in tree and node != subproblem_root
        }

        subproblem_tree = nx.relabel_nodes(subproblem_tree, mapping)
        tree = nx.compose(tree, subproblem_tree)

    # append sample names to the solution and populate the tree
    samples_tree = self.__add_duplicates_to_tree_and_remove_spurious_leaves(
        tree, character_matrix, node_name_generator)

    cassiopeia_tree.populate_tree(samples_tree, layer=layer)
    cassiopeia_tree.collapse_unifurcations()

    # collapse mutationless edges
    if collapse_mutationless_edges:
        cassiopeia_tree.collapse_mutationless_edges(
            infer_ancestral_characters=True)
def solve(
    self,
    cassiopeia_tree: CassiopeiaTree,
    layer: Optional[str] = None,
    collapse_mutationless_edges: bool = False,
    logfile: str = "stdout.log",
) -> None:
    """Builds a tree bottom-up from pairwise dissimilarities.

    Starting from a graph in which every sample is an isolated node, the
    routine repeatedly asks `find_cherry` for a pair of entries to join,
    connects both of them to a newly named internal node, and asks
    `update_dissimilarity_map` for the map that results from the join.
    This continues until the map has at most two rows, after which
    `root_tree` is applied and the result is stored on the input
    CassiopeiaTree. How cherries are chosen and how the map is updated is
    left to the subclass.

    Args:
        cassiopeia_tree: CassiopeiaTree object to be populated.
        layer: Layer storing the character matrix for solving. If None,
            the default character matrix is used in the CassiopeiaTree.
        collapse_mutationless_edges: Indicates if the final reconstructed
            tree should collapse mutationless edges based on internal
            states inferred by Camin-Sokal parsimony. In scoring accuracy,
            this removes artifacts caused by arbitrarily resolving
            polytomies.
        logfile: File location to log output. Not currently used.
    """
    internal_names = solver_utilities.node_name_generator()

    # work on a copy so that the map handed back by
    # get_dissimilarity_map is never modified by the joins below.
    working_map = self.get_dissimilarity_map(cassiopeia_tree, layer).copy()

    # every sample begins as an unconnected leaf.
    tree = nx.Graph()
    tree.add_nodes_from(working_map.index)

    while working_map.shape[0] > 2:
        # find_cherry reports positions in the matrix; translate them to
        # the corresponding index labels.
        row, col = self.find_cherry(working_map.to_numpy())
        left = working_map.index[row]
        right = working_map.index[col]

        parent = next(internal_names)
        tree.add_node(parent)
        tree.add_edge(parent, left)
        tree.add_edge(parent, right)

        working_map = self.update_dissimilarity_map(
            working_map, (left, right), parent)

    tree = self.root_tree(
        tree,
        cassiopeia_tree.root_sample_name,
        working_map.index.values,
    )

    # remove root from character matrix before populating tree
    root_name = cassiopeia_tree.root_sample_name
    if root_name in cassiopeia_tree.character_matrix.index:
        trimmed_matrix = cassiopeia_tree.character_matrix.drop(
            index=root_name)
        cassiopeia_tree.character_matrix = trimmed_matrix

    cassiopeia_tree.populate_tree(tree, layer=layer)
    cassiopeia_tree.collapse_unifurcations()

    # optionally collapse edges along which no mutation occurs
    if collapse_mutationless_edges:
        cassiopeia_tree.collapse_mutationless_edges(
            infer_ancestral_characters=True)
def solve(
    self,
    cassiopeia_tree: CassiopeiaTree,
    layer: Optional[str] = None,
    collapse_mutationless_edges: bool = False,
    logfile: str = "stdout.log",
) -> None:
    """Infers a tree with Cassiopeia-ILP.

    Solves a tree using the Cassiopeia-ILP algorithm and populates a tree
    in the provided CassiopeiaTree.

    A file handler writing to `logfile` is attached to the module logger
    for the duration of the call. It is detached and closed on every exit
    path (normal completion, the single-sample early return, or an
    exception), so repeated calls neither duplicate log lines nor leak
    open file handles.

    Args:
        cassiopeia_tree: Input CassiopeiaTree
        layer: Layer storing the character matrix for solving. If None,
            the default character matrix is used in the CassiopeiaTree.
        collapse_mutationless_edges: Indicates if the final reconstructed
            tree should collapse mutationless edges based on internal
            states inferred by Camin-Sokal parsimony. In scoring accuracy,
            this removes artifacts caused by arbitrarily resolving
            polytomies.
        logfile: Location to log progress.

    Raises:
        ILPSolverError: If the solver is weighted but the CassiopeiaTree
            has no priors, or if the character matrix contains ambiguous
            states.
    """
    if self.weighted and not cassiopeia_tree.priors:
        raise ILPSolverError(
            "Specify prior probabilities in the CassiopeiaTree for weighted"
            " analysis.")

    # setup logfile config
    handler = logging.FileHandler(logfile)
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    try:
        logger.info("Solving tree with the following parameters.")
        logger.info(
            f"Convergence time limit: {self.convergence_time_limit}")
        logger.info(
            f"Convergence iteration limit: {self.convergence_iteration_limit}")
        logger.info(
            f"Max potential graph layer size: {self.maximum_potential_graph_layer_size}"
        )
        logger.info(
            f"Max potential graph lca distance: {self.maximum_potential_graph_lca_distance}"
        )
        logger.info(f"MIP gap: {self.mip_gap}")

        if layer:
            character_matrix = cassiopeia_tree.layers[layer].copy()
        else:
            character_matrix = cassiopeia_tree.character_matrix.copy()

        if any(
                is_ambiguous_state(state)
                for state in character_matrix.values.flatten()):
            raise ILPSolverError("Solver does not support ambiguous states.")

        unique_character_matrix = character_matrix.drop_duplicates()

        weights = None
        if cassiopeia_tree.priors:
            weights = solver_utilities.transform_priors(
                cassiopeia_tree.priors, self.prior_transformation)

        # find the root of the tree & generate process ID
        root = tuple(
            data_utilities.get_lca_characters(
                unique_character_matrix.values.tolist(),
                cassiopeia_tree.missing_state_indicator,
            ))

        logger.info(f"Phylogenetic root: {root}")

        # the process ID is an md5 digest of the root's states joined
        # by "|".
        pid = hashlib.md5(
            "|".join(str(r) for r in root).encode("utf-8")).hexdigest()

        targets = [
            tuple(t) for t in unique_character_matrix.values.tolist()
        ]

        # with a single unique sample the solution is just the root node;
        # populate the tree and stop before any ILP machinery is set up.
        if unique_character_matrix.shape[0] == 1:
            optimal_solution = nx.DiGraph()
            optimal_solution.add_node(root)
            optimal_solution = (
                self.__append_sample_names_and_remove_spurious_leaves(
                    optimal_solution, character_matrix))
            cassiopeia_tree.populate_tree(optimal_solution, layer=layer)
            return

        # determine diameter of the dataset by evaluating maximum distance
        # to the root from each sample
        if (self.maximum_potential_graph_lca_distance is not None) and (
                self.maximum_potential_graph_lca_distance > 0):
            max_lca_distance = self.maximum_potential_graph_lca_distance
        else:
            max_lca_distance = 0
            lca_distances = [
                dissimilarity_functions.hamming_distance(
                    root,
                    np.array(u),
                    ignore_missing_state=True,
                    missing_state_indicator=cassiopeia_tree.
                    missing_state_indicator,
                ) for u in targets
            ]

            # largest value of (distance_a + distance_b + 1) over all
            # pairs of distinct targets.
            for distance_a, distance_b in itertools.combinations(
                    lca_distances, 2):
                max_lca_distance = max(max_lca_distance,
                                       distance_a + distance_b + 1)

        # infer the potential graph
        potential_graph = self.infer_potential_graph(
            unique_character_matrix,
            pid,
            max_lca_distance,
            weights,
            cassiopeia_tree.missing_state_indicator,
        )

        # generate Steiner Tree ILP model; nodes are relabeled to
        # consecutive integers and mapped back after solving.
        nodes = list(potential_graph.nodes())
        encoder = {node: index for index, node in enumerate(nodes)}
        decoder = {index: node for node, index in encoder.items()}

        _potential_graph = nx.relabel_nodes(potential_graph, encoder)
        _targets = [encoder[target] for target in targets]
        _root = encoder[root]

        model, edge_variables = self.generate_steiner_model(
            _potential_graph, _root, _targets)

        # solve the ILP problem and return a set of proposed solutions
        proposed_solutions = self.solve_steiner_instance(
            model, edge_variables, _potential_graph, pid, logfile)

        # select best model and post process the solution
        optimal_solution = proposed_solutions[0]
        optimal_solution = nx.relabel_nodes(optimal_solution, decoder)

        optimal_solution = self.post_process_steiner_solution(
            optimal_solution, root)

        # append sample names to the solution and populate the tree
        optimal_solution = (
            self.__append_sample_names_and_remove_spurious_leaves(
                optimal_solution, character_matrix))

        cassiopeia_tree.populate_tree(optimal_solution, layer=layer)

        # rename internal nodes such that they are not tuples
        node_name_generator = solver_utilities.node_name_generator()
        internal_node_rename = {}
        for i in cassiopeia_tree.internal_nodes:
            internal_node_rename[i] = next(node_name_generator)
        cassiopeia_tree.relabel_nodes(internal_node_rename)

        cassiopeia_tree.collapse_unifurcations()

        # collapse mutationless edges
        if collapse_mutationless_edges:
            cassiopeia_tree.collapse_mutationless_edges(
                infer_ancestral_characters=True)
    finally:
        # always detach and close the per-call handler, including on the
        # early return and when an exception propagates.
        logger.removeHandler(handler)
        handler.close()