def get_dissimilarity_map(
    self, cassiopeia_tree: CassiopeiaTree, layer: Optional[str] = None
) -> pd.DataFrame:
    """Obtains or generates the matrix that the solver updates as it runs.

    This is the top-level entry point for getting the pairwise
    sample-to-sample matrix used by the solve method to pick which pair of
    samples to merge; that matrix is updated at every iteration of the
    solve method.

    The method is intentionally not restricted to dissimilarity maps:
    derived classes are expected to override it when they need a
    similarity map or some other algorithm-specific sample-to-sample
    comparison map.

    Args:
        cassiopeia_tree: Tree object from which the dissimilarity map is
            generated.
        layer: Layer storing the character matrix for solving. If None,
            the default character matrix is used in the CassiopeiaTree.

    Returns:
        pd.DataFrame: The matrix that will be used throughout the solve
            method.
    """
    # Let the setup step create the map (and root) on the tree if needed,
    # then hand back whatever the tree now holds.
    self.setup_dissimilarity_map(cassiopeia_tree, layer)
    return cassiopeia_tree.get_dissimilarity_map()
def setup_root_finder(self, cassiopeia_tree: CassiopeiaTree) -> None:
    """Gives the implicit rooting strategy for the SNJ Solver.

    By default, the SpectralNeighborJoining algorithm returns an unrooted
    tree. To root this tree, an implicit root of all zeros is added to the
    character matrix. Then, the dissimilarity map is recalculated using the
    updated character matrix. If the tree already has a computed
    dissimilarity map, only the new dissimilarities (root versus every
    existing sample) are calculated. See 'setup_root_finder' in
    NeighborJoiningSolver.

    The tree's `root_sample_name` is set to "root". The tree's character
    matrix is swapped for the rooted copy only while the dissimilarities
    are computed and is set back to the un-rooted copy before returning.

    Args:
        cassiopeia_tree: Input CassiopeiaTree to `solve`

    Raises:
        DistanceSolverError: If the solver has no dissimilarity function.
            This is checked before the tree is modified, so the tree is
            left untouched in that case.
    """
    # Validate first: raising after the root has been installed would
    # leave the caller's tree half-rooted.
    if self.dissimilarity_function is None:
        raise DistanceSolver.DistanceSolverError(
            "Please specify a dissimilarity function to add an implicit "
            "root, or specify an explicit root"
        )

    character_matrix = cassiopeia_tree.character_matrix.copy()
    rooted_character_matrix = character_matrix.copy()

    # The implicit root is a sample whose every character is state 0.
    root = [0] * rooted_character_matrix.shape[1]
    rooted_character_matrix.loc["root"] = root
    cassiopeia_tree.root_sample_name = "root"
    cassiopeia_tree.character_matrix = rooted_character_matrix

    dissimilarity_map = cassiopeia_tree.get_dissimilarity_map()
    if dissimilarity_map is None:
        # No map yet: compute the full map over the rooted matrix.
        cassiopeia_tree.compute_dissimilarity_map(
            self.dissimilarity_function, self.prior_transformation
        )
    else:
        # A map already exists: only the root-to-sample entries are new.
        # The weights and the root row do not depend on the leaf, so they
        # are computed once rather than once per leaf.
        weights = None
        if cassiopeia_tree.priors:
            weights = solver_utilities.transform_priors(
                cassiopeia_tree.priors, self.prior_transformation
            )
        root_states = rooted_character_matrix.loc["root"].values

        dissimilarity = {"root": 0}
        for leaf in character_matrix.index:
            dissimilarity[leaf] = self.dissimilarity_function(
                root_states,
                rooted_character_matrix.loc[leaf].values,
                cassiopeia_tree.missing_state_indicator,
                weights,
            )
        cassiopeia_tree.set_dissimilarity("root", dissimilarity)

    # Put the original (un-rooted) character matrix back on the tree.
    cassiopeia_tree.character_matrix = character_matrix
def setup_dissimilarity_map(
    self, cassiopeia_tree: CassiopeiaTree, layer: Optional[str] = None
) -> None:
    """Prepares the input tree for solving.

    Ensures the CassiopeiaTree has a root sample (adding an implicit one
    when the solver is configured to do so) and a dissimilarity map
    (computing one when the tree does not already carry it). Operates
    directly on the CassiopeiaTree.

    Args:
        cassiopeia_tree: Input CassiopeiaTree to `solve`.
        layer: Layer storing the character matrix for solving. If None,
            the default character matrix is used in the CassiopeiaTree.

    Raises:
        A `DistanceSolverError` if rooting parameters are not passed in
            correctly (i.e. no root is specified and the user has not
            asked to find a root) or when a dissimilarity map cannot be
            found or computed.
    """
    # Step 1: rooting. With no explicit root sample, the solver either adds
    # an implicit root (which also takes care of the root's
    # dissimilarities) or refuses to continue.
    if cassiopeia_tree.root_sample_name is None:
        if not self.add_root:
            raise DistanceSolverError(
                "Please specify an explicit root sample in the Cassiopeia Tree"
                " or specify the solver to add an implicit root"
            )
        self.setup_root_finder(cassiopeia_tree)

    # Step 2: dissimilarity map. Nothing more to do if the tree has one.
    if cassiopeia_tree.get_dissimilarity_map() is not None:
        return

    if self.dissimilarity_function is None:
        raise DistanceSolverError(
            "Please specify a dissimilarity function or populate the "
            "CassiopeiaTree object with a dissimilarity map"
        )

    cassiopeia_tree.compute_dissimilarity_map(
        self.dissimilarity_function, self.prior_transformation, layer
    )
def compute_cophenetic_correlation(
    tree: CassiopeiaTree,
    weights: Optional[pd.DataFrame] = None,
    dissimilarity_map: Optional[pd.DataFrame] = None,
    dissimilarity_function: Optional[
        Callable[[np.array, np.array, int, Dict[int, Dict[int, float]]], float]
    ] = dissimilarity_functions.weighted_hamming_distance,
) -> Tuple[float, float]:
    """Computes the cophenetic correlation of a lineage.

    The cophenetic correlation is the Pearson correlation between the
    phylogenetic distance and the character dissimilarity, taken over
    pairs of leaves.

    When neither the weight matrix nor the dissimilarity map is
    precomputed, the running time is O(mn^2 + n^2logn + n^2): O(mn^2) for
    the dissimilarity map, O(n^2 logn) for the phylogenetic distance and
    O(n^2) for the Pearson correlation over n^2 entries (n = number of
    leaves; m = number of characters).

    Args:
        tree: CassiopeiaTree
        weights: Phylogenetic weights matrix. If this is not specified,
            invokes `cas.data.compute_phylogenetic_weight_matrix`
        dissimilarity_map: Dissimilarity matrix between samples. If this is
            not specified, the map already on the tree is used; if the
            tree has none, `tree.compute_dissimilarity_map` is called and
            the result is read back from the tree.
        dissimilarity_function: Dissimilarity function to use. Only
            consulted when no dissimilarity map is passed in and none
            exists in the CassiopeiaTree.

    Returns:
        The cophenetic correlation value and significance for the tree.
    """
    # Phylogenetic weights: the caller's matrix wins, otherwise derive it.
    if weights is None:
        phylo_weights = compute_phylogenetic_weight_matrix(tree)
    else:
        phylo_weights = weights

    # Dissimilarities: caller's matrix, then the tree's, then compute.
    if dissimilarity_map is None:
        dissimilarities = tree.get_dissimilarity_map()
    else:
        dissimilarities = dissimilarity_map

    if dissimilarities is None:
        tree.compute_dissimilarity_map(
            dissimilarity_function=dissimilarity_function
        )
        dissimilarities = tree.get_dissimilarity_map()

    # Put both matrices in the same leaf order so entries line up pairwise.
    leaves = tree.leaves
    phylo_weights = phylo_weights.loc[leaves, leaves]
    dissimilarities = dissimilarities.loc[leaves, leaves]

    # Collapse each square matrix to its condensed (vector) form.
    condensed_weights = spatial.distance.squareform(phylo_weights)
    condensed_dissimilarities = spatial.distance.squareform(dissimilarities)

    return stats.pearsonr(condensed_weights, condensed_dissimilarities)