def test_hellinger(spatial_data):
    """Compare ``dist.pairwise_special_metric`` with a direct NumPy Hellinger.

    The last two rows of ``spatial_data`` are left out and absolute values are
    taken before either computation.  The ``ll_dirichlet`` metric is then
    called once on the same input purely as a smoke test.
    """
    samples = np.abs(spatial_data[:-2].copy())

    # Reference value: sqrt(1 - <sqrt(p), sqrt(q)>) on row-normalised data.
    row_totals = samples.sum(axis=1)[:, np.newaxis]
    root_probs = np.sqrt(samples / row_totals)
    expected = np.sqrt(1.0 - root_probs @ root_probs.T)
    # Correct for nan handling
    expected[np.isnan(expected)] = 0.0

    computed = dist.pairwise_special_metric(np.abs(spatial_data[:-2]))
    assert_array_almost_equal(
        computed,
        expected,
        err_msg="Distances don't match for metric hellinger",
    )

    # Ensure ll_dirichlet runs
    computed = dist.pairwise_special_metric(
        np.abs(spatial_data[:-2]), metric="ll_dirichlet"
    )
    assert (
        computed is not None
    ), "Pairwise Special Metric with LL Dirichlet metric failed"
def test_sparse_hellinger(sparse_spatial_data):
    """Check ``spdist.sparse_hellinger`` against the dense pairwise metric.

    The reference matrix comes from ``dist.pairwise_special_metric`` on the
    densified absolute values of all rows except the last two.  Each sparse
    row is passed to the sparse metric as its ``indices`` / ``data`` arrays.
    ``spdist.sparse_ll_dirichlet`` is then evaluated on every pair of raw rows
    as a smoke test only.
    """
    n_rows = sparse_spatial_data.shape[0]

    dist_matrix = dist.pairwise_special_metric(
        np.abs(sparse_spatial_data[:-2].toarray())
    )

    # Slice and take absolute values once per row rather than once per
    # (i, j) pair; the values fed to the metric are unchanged.
    abs_rows = [np.abs(sparse_spatial_data[i]) for i in range(n_rows - 2)]
    test_matrix = np.array(
        [
            [
                spdist.sparse_hellinger(
                    row_i.indices, row_i.data, row_j.indices, row_j.data
                )
                for row_j in abs_rows
            ]
            for row_i in abs_rows
        ]
    )

    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Sparse distances don't match for metric hellinger",
        decimal=4,
    )

    # Ensure ll_dirichlet runs
    raw_rows = [sparse_spatial_data[i] for i in range(n_rows)]
    test_matrix = np.array(
        [
            [
                spdist.sparse_ll_dirichlet(
                    row_i.indices, row_i.data, row_j.indices, row_j.data
                )
                for row_j in raw_rows
            ]
            for row_i in raw_rows
        ]
    )
    assert (
        test_matrix is not None
    ), "Pairwise Special Metric with LL Dirichlet metric failed"
def test_disconnected_data_precomputed(num_isolates, sparse):
    """Fit UMAP on a precomputed distance matrix that contains isolated rows.

    Ten random boolean rows are extended with ``num_isolates`` extra rows,
    each of whose only ``True`` entry sits in a column used by no other row.
    The pairwise distances are handed to ``UMAP`` as ``metric="precomputed"``
    (wrapped in ``csr_matrix`` when ``sparse`` is truthy).  The test then
    checks that vertex 10 is reported as disconnected and that the embedding
    rows of the disconnected vertices hold at least
    ``num_isolates * n_components`` NaN values.
    """
    n_connected, n_features = 10, 20
    random_rows = np.random.choice(
        a=[False, True], size=(n_connected, n_features), p=[0.66, 1 - 0.66]
    )

    # Add some disconnected data for the corner case test
    empty_rows = np.zeros((num_isolates, n_features), dtype="bool")
    # One private column per isolate: zeros for the random rows, identity
    # for the isolate rows.
    private_columns = np.vstack(
        [
            np.zeros((n_connected, num_isolates), dtype="bool"),
            np.eye(num_isolates, dtype="bool"),
        ]
    )
    disconnected_data = np.hstack(
        [np.vstack([random_rows, empty_rows]), private_columns]
    )

    dmat = pairwise_special_metric(disconnected_data)
    if sparse:
        dmat = csr_matrix(dmat)

    model = UMAP(
        n_neighbors=3, metric="precomputed", disconnection_distance=1
    ).fit(dmat)

    # Check that the first isolate has no edges in our umap.graph_
    isolated_vertices = disconnected_vertices(model)
    assert isolated_vertices[10] == True  # noqa: E712 - explicit comparison kept
    nan_count = np.isnan(model.embedding_[isolated_vertices]).sum()
    assert nan_count >= num_isolates * model.n_components
def component_layout(
    data,
    n_components,
    component_labels,
    dim,
    random_state,
    metric="euclidean",
    metric_kwds=None,
):
    """Provide a layout relating the separate connected components.

    This is done by taking the centroid of each component and then
    performing a spectral embedding of the centroids.

    Parameters
    ----------
    data: array of shape (n_samples, n_features)
        The source data -- required so we can generate centroids for each
        connected component of the graph.

    n_components: int
        The number of distinct components to be laid out.

    component_labels: array of shape (n_samples)
        For each vertex in the graph the label of the component to
        which the vertex belongs.

    dim: int
        The chosen embedding dimension.

    random_state:
        Passed through unchanged as ``random_state`` to
        ``SpectralEmbedding``.

    metric: string or callable (optional, default 'euclidean')
        The metric used to measure distances among the source data points.

    metric_kwds: dict (optional, default None)
        Keyword arguments to be passed to the metric function; ``None`` is
        treated as an empty dict.
        If metric is 'precomputed', 'linkage' keyword can be used to specify
        'average', 'complete', or 'single' linkage. Default is 'average'

    Returns
    -------
    component_embedding: array of shape (n_components, dim)
        The ``dim``-dimensional embedding of the ``n_components``-many
        connected components, divided by its largest entry.

    Raises
    ------
    ValueError
        If metric is 'precomputed' and the 'linkage' keyword is not one of
        'average', 'complete', or 'single'.
    NotImplementedError
        If ``metric`` is a callable that is not one of the values of
        ``sparse_named_distances`` and ``data`` is a scipy sparse matrix.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if metric_kwds is None:
        metric_kwds = {}

    if metric == "precomputed":
        # cannot compute centroids from precomputed distances
        # instead, compute centroid distances using linkage
        linkage_name = metric_kwds.get("linkage", "average")
        if linkage_name == "average":
            linkage = np.mean
        elif linkage_name == "complete":
            linkage = np.max
        elif linkage_name == "single":
            linkage = np.min
        else:
            # Wrapped in a 1-tuple so a tuple-valued keyword still formats.
            raise ValueError(
                "Unrecognized linkage '%s'. Please choose from "
                "'average', 'complete', or 'single'" % (linkage_name,)
            )

        distance_matrix = np.zeros((n_components, n_components), dtype=np.float64)
        for c_i in range(n_components):
            dm_i = data[component_labels == c_i]
            # Only the upper triangle is computed; the result is mirrored.
            for c_j in range(c_i + 1, n_components):
                dist = linkage(dm_i[:, component_labels == c_j])
                distance_matrix[c_i, c_j] = dist
                distance_matrix[c_j, c_i] = dist
    else:
        # The centroid buffer is only needed on this path, so it is no
        # longer allocated for precomputed input.
        component_centroids = np.empty(
            (n_components, data.shape[1]), dtype=np.float64
        )
        for label in range(n_components):
            component_centroids[label] = data[component_labels == label].mean(
                axis=0
            )

        # NOTE(review): component_centroids is created by np.empty above, so
        # this sparse check looks unreachable; kept to preserve behaviour.
        if scipy.sparse.isspmatrix(component_centroids):
            warn(
                "Forcing component centroids to dense; if you are running out of "
                "memory then consider increasing n_neighbors."
            )
            component_centroids = component_centroids.toarray()

        if metric in SPECIAL_METRICS:
            distance_matrix = pairwise_special_metric(
                component_centroids, metric=metric
            )
        elif metric in SPARSE_SPECIAL_METRICS:
            distance_matrix = pairwise_special_metric(
                component_centroids, metric=SPARSE_SPECIAL_METRICS[metric]
            )
        elif callable(metric) and scipy.sparse.isspmatrix(data):
            # Translate a known sparse metric function back to its name so
            # that pairwise_distances can be used on the dense centroids.
            function_to_name_mapping = {
                v: k for k, v in sparse_named_distances.items()
            }
            try:
                metric_name = function_to_name_mapping[metric]
            except KeyError:
                raise NotImplementedError(
                    "Multicomponent layout for custom "
                    "sparse metrics is not implemented at "
                    "this time."
                ) from None
            distance_matrix = pairwise_distances(
                component_centroids, metric=metric_name, **metric_kwds
            )
        else:
            distance_matrix = pairwise_distances(
                component_centroids, metric=metric, **metric_kwds
            )

    # Gaussian-style affinity from the component distances.
    affinity_matrix = np.exp(-(distance_matrix**2))

    component_embedding = SpectralEmbedding(
        n_components=dim, affinity="precomputed", random_state=random_state
    ).fit_transform(affinity_matrix)
    component_embedding /= component_embedding.max()

    return component_embedding
def test_grad_metrics_match_metrics():
    """Check gradient-returning metrics against reference distance matrices.

    Uses the module-level ``spatial_data`` and ``spatial_distances``.  Every
    metric in ``dist.named_distances_with_gradients`` that also appears in
    ``spatial_distances`` is compared with ``pairwise_distances``.  The
    seuclidean, weighted minkowski, mahalanobis and hellinger gradients take
    extra arguments or a different reference and are checked individually.
    """
    n_rows = spatial_data.shape[0]
    n_cols = spatial_data.shape[1]

    def first_outputs(points, grad_fn, size):
        # Element 0 of each gradient metric's return value is what gets
        # compared with the reference distance.
        return np.array(
            [
                [grad_fn(points[r], points[c])[0] for c in range(size)]
                for r in range(size)
            ]
        )

    for metric in dist.named_distances_with_gradients:
        if metric not in spatial_distances:
            continue

        expected = pairwise_distances(spatial_data, metric=metric)
        # scipy is bad sometimes
        if metric == "braycurtis":
            expected[~np.isfinite(expected)] = 0.0
        elif metric in ("cosine", "correlation"):
            expected[~np.isfinite(expected)] = 1.0
            # And because distance between all zero vectors should be zero
            expected[10, 11] = 0.0
            expected[11, 10] = 0.0

        computed = first_outputs(
            spatial_data, dist.named_distances_with_gradients[metric], n_rows
        )
        assert_array_almost_equal(
            computed,
            expected,
            err_msg=f"Distances with grad don't match for metric {metric}",
        )

    # Handle the few special distances separately
    # SEuclidean
    v = np.abs(np.random.randn(n_cols))
    expected = pairwise_distances(spatial_data, metric="seuclidean", V=v)
    computed = first_outputs(
        spatial_data,
        lambda p, q: dist.standardised_euclidean_grad(p, q, v),
        n_rows,
    )
    assert_array_almost_equal(
        computed,
        expected,
        err_msg="Distances don't match for metric seuclidean",
    )

    # Weighted minkowski
    expected = pairwise_distances(spatial_data, metric="wminkowski", w=v, p=3)
    computed = first_outputs(
        spatial_data,
        lambda p, q: dist.weighted_minkowski_grad(p, q, v, p=3),
        n_rows,
    )
    assert_array_almost_equal(
        computed,
        expected,
        err_msg="Distances don't match for metric weighted_minkowski",
    )

    # Mahalanobis
    v = np.abs(np.random.randn(n_cols, n_cols))
    expected = pairwise_distances(spatial_data, metric="mahalanobis", VI=v)
    computed = first_outputs(
        spatial_data,
        lambda p, q: dist.mahalanobis_grad(p, q, v),
        n_rows,
    )
    assert_array_almost_equal(
        computed,
        expected,
        err_msg="Distances don't match for metric mahalanobis",
    )

    # Hellinger
    trimmed = np.abs(spatial_data[:-2])
    expected = dist.pairwise_special_metric(trimmed, np.abs(spatial_data[:-2]))
    computed = first_outputs(np.abs(spatial_data), dist.hellinger_grad, n_rows - 2)
    assert_array_almost_equal(
        computed,
        expected,
        err_msg="Distances don't match for metric hellinger",
    )
def component_layout(
    data,
    n_components,
    component_labels,
    dim,
    random_state,
    metric="euclidean",
    metric_kwds=None,
):
    """Provide a layout relating the separate connected components.

    This is done by taking the centroid of each component and then
    performing a spectral embedding of the centroids.

    Parameters
    ----------
    data: array of shape (n_samples, n_features)
        The source data -- required so we can generate centroids for each
        connected component of the graph.

    n_components: int
        The number of distinct components to be laid out.

    component_labels: array of shape (n_samples)
        For each vertex in the graph the label of the component to
        which the vertex belongs.

    dim: int
        The chosen embedding dimension.

    random_state:
        Passed through unchanged as ``random_state`` to
        ``SpectralEmbedding``.

    metric: string or callable (optional, default 'euclidean')
        The metric used to measure distances among the source data points.

    metric_kwds: dict (optional, default None)
        Keyword arguments to be passed to the metric function; ``None`` is
        treated as an empty dict.

    Returns
    -------
    component_embedding: array of shape (n_components, dim)
        The ``dim``-dimensional embedding of the ``n_components``-many
        connected components, divided by its largest entry.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if metric_kwds is None:
        metric_kwds = {}

    # One centroid (feature-wise mean) per connected component.
    component_centroids = np.empty((n_components, data.shape[1]), dtype=np.float64)
    for label in range(n_components):
        component_centroids[label] = data[component_labels == label].mean(axis=0)

    if metric in ("hellinger", "ll_dirichlet"):
        # These two metrics go through pairwise_special_metric rather than
        # pairwise_distances; metric_kwds is not forwarded on this path.
        distance_matrix = pairwise_special_metric(component_centroids, metric=metric)
    else:
        distance_matrix = pairwise_distances(
            component_centroids, metric=metric, **metric_kwds
        )

    # Gaussian-style affinity from the centroid distances.
    affinity_matrix = np.exp(-(distance_matrix**2))

    component_embedding = SpectralEmbedding(
        n_components=dim, affinity="precomputed", random_state=random_state
    ).fit_transform(affinity_matrix)
    component_embedding /= component_embedding.max()

    return component_embedding
def test_grad_metrics_match_metrics(spatial_data, spatial_distances):
    """Check gradient-returning metrics against reference distance matrices.

    Every metric in ``dist.named_distances_with_gradients`` that also appears
    in ``spatial_distances`` is delegated to ``spatial_check`` with
    ``with_grad=True``.  The seuclidean, weighted minkowski, mahalanobis and
    hellinger gradients take extra arguments or a different reference and
    are checked individually here.
    """
    n_rows = spatial_data.shape[0]
    n_cols = spatial_data.shape[1]

    def first_outputs(points, grad_fn, size):
        # Element 0 of each gradient metric's return value is what gets
        # compared with the reference distance.
        return np.array(
            [
                [grad_fn(points[r], points[c])[0] for c in range(size)]
                for r in range(size)
            ]
        )

    shared_metrics = (
        name
        for name in dist.named_distances_with_gradients
        if name in spatial_distances
    )
    for metric in shared_metrics:
        spatial_check(metric, spatial_data, spatial_distances, with_grad=True)

    # Handle the few special distances separately
    # SEuclidean
    v = np.abs(np.random.randn(n_cols))
    expected = pairwise_distances(spatial_data, metric="seuclidean", V=v)
    computed = first_outputs(
        spatial_data,
        lambda p, q: dist.standardised_euclidean_grad(p, q, v),
        n_rows,
    )
    assert_array_almost_equal(
        computed,
        expected,
        err_msg="Distances don't match for metric seuclidean",
    )

    # Weighted minkowski
    expected = pairwise_distances(spatial_data, metric="wminkowski", w=v, p=3)
    computed = first_outputs(
        spatial_data,
        lambda p, q: dist.weighted_minkowski_grad(p, q, v, p=3),
        n_rows,
    )
    assert_array_almost_equal(
        computed,
        expected,
        err_msg="Distances don't match for metric weighted_minkowski",
    )

    # Mahalanobis
    v = np.abs(np.random.randn(n_cols, n_cols))
    expected = pairwise_distances(spatial_data, metric="mahalanobis", VI=v)
    computed = first_outputs(
        spatial_data,
        lambda p, q: dist.mahalanobis_grad(p, q, v),
        n_rows,
    )
    assert_array_almost_equal(
        computed,
        expected,
        err_msg="Distances don't match for metric mahalanobis",
    )

    # Hellinger
    trimmed = np.abs(spatial_data[:-2])
    expected = dist.pairwise_special_metric(trimmed, np.abs(spatial_data[:-2]))
    computed = first_outputs(np.abs(spatial_data), dist.hellinger_grad, n_rows - 2)
    assert_array_almost_equal(
        computed,
        expected,
        err_msg="Distances don't match for metric hellinger",
    )
def fit(self, X, y=None):
    """Generate graph to fit X into an embedded space.

    Optionally use y for supervised dimension reduction.

    Parameters
    ----------
    X : array, shape (n_samples, n_features) or (n_samples, n_samples)
        If the metric is 'precomputed' X must be a square distance
        matrix. Otherwise it contains a sample per row. If the method
        is 'exact', X may be a sparse matrix of type 'csr', 'csc'
        or 'coo'.

    y : array, shape (n_samples)
        A target array for supervised dimension reduction. How this is
        handled is determined by parameters UMAP was instantiated with.
        The relevant attributes are ``target_metric`` and
        ``target_metric_kwds``.

    Returns
    -------
    self
        The fitted estimator.  When only a single (unique) sample is
        present, ``embedding_`` is set to zeros and the method returns
        before any graph is built.

    Raises
    ------
    ValueError
        If a sparse precomputed matrix is not symmetrical, has non-zero
        diagonal entries or has rows with fewer than ``n_neighbors``
        stored distances; or if ``X`` and ``y`` differ in length.
    NotImplementedError
        If ``y`` is given while ``densmap`` is enabled.
    """
    X = check_array(X, dtype=np.float32, accept_sparse="csr", order="C")
    self._raw_data = X

    # Handle all the optional arguments, setting default
    if self.a is None or self.b is None:
        self._a, self._b = find_ab_params(self.spread, self.min_dist)
    else:
        self._a = self.a
        self._b = self.b

    if isinstance(self.init, np.ndarray):
        # Called for validation only; the checked copy is not used here.
        check_array(self.init, dtype=np.float32, accept_sparse=False)

    self._initial_alpha = self.learning_rate

    self._validate_parameters()

    if self.verbose:
        print(str(self))

    self._original_n_threads = numba.get_num_threads()
    # Test for None first so the comparison never sees None.
    if self.n_jobs is not None and self.n_jobs > 0:
        numba.set_num_threads(self.n_jobs)

    # Check if we should unique the data
    # We've already ensured that we aren't in the precomputed case
    if self.unique:
        # check if the matrix is dense
        if self._sparse_data:
            # Call a sparse unique function
            index, inverse, counts = csr_unique(X)
        else:
            index, inverse, counts = np.unique(
                X,
                return_index=True,
                return_inverse=True,
                return_counts=True,
                axis=0,
            )[1:4]
        if self.verbose:
            print(
                "Unique=True -> Number of data points reduced from ",
                X.shape[0],
                " to ",
                X[index].shape[0],
            )
            most_common = np.argmax(counts)
            print(
                "Most common duplicate is",
                index[most_common],
                " with a count of ",
                counts[most_common],
            )
    # If we aren't asking for unique use the full index.
    # This will save special cases later.
    else:
        index = list(range(X.shape[0]))
        inverse = list(range(X.shape[0]))

    # Number of rows actually embedded; computed once instead of
    # re-indexing X for every size check below.
    n_indexed = X[index].shape[0]

    # Error check n_neighbors based on data size
    if n_indexed <= self.n_neighbors:
        if n_indexed == 1:
            self.embedding_ = np.zeros(
                (1, self.n_components)
            )  # needed to sklearn comparability
            return self

        warn(
            "n_neighbors is larger than the dataset size; truncating to "
            "X.shape[0] - 1"
        )
        self._n_neighbors = n_indexed - 1
        if self.densmap:
            self._densmap_kwds["n_neighbors"] = self._n_neighbors
    else:
        self._n_neighbors = self.n_neighbors

    # Note: unless it causes issues for setting 'index', could move this to
    # initial sparsity check above
    if self._sparse_data and not X.has_sorted_indices:
        X.sort_indices()

    random_state = check_random_state(self.random_state)

    if self.verbose:
        print("Construct fuzzy simplicial set")

    if self.metric == "precomputed" and self._sparse_data:
        # For sparse precomputed distance matrices, we just argsort the rows to find
        # nearest neighbors. To make this easier, we expect matrices that are
        # symmetrical (so we can find neighbors by looking at rows in isolation,
        # rather than also having to consider that sample's column too).
        print("Computing KNNs for sparse precomputed distances...")
        if sparse_tril(X).getnnz() != sparse_triu(X).getnnz():
            raise ValueError(
                "Sparse precomputed distance matrices should be symmetrical!"
            )
        if not np.all(X.diagonal() == 0):
            raise ValueError("Non-zero distances from samples to themselves!")

        # Sized with the (possibly truncated) _n_neighbors so each row
        # matches the number of neighbours written into it below.  The
        # builtin int/float replace the removed np.int/np.float aliases.
        self._knn_indices = np.zeros((X.shape[0], self._n_neighbors), dtype=int)
        self._knn_dists = np.zeros(self._knn_indices.shape, dtype=float)
        for row_id in range(X.shape[0]):
            # Find KNNs row-by-row
            row = X[row_id]
            row_data = row.data
            row_indices = row.indices
            if len(row_data) < self._n_neighbors:
                raise ValueError(
                    "Some rows contain fewer than n_neighbors distances!"
                )
            row_nn_data_indices = np.argsort(row_data)[: self._n_neighbors]
            self._knn_indices[row_id] = row_indices[row_nn_data_indices]
            self._knn_dists[row_id] = row_data[row_nn_data_indices]

        (
            self.graph_,
            self._sigmas,
            self._rhos,
            self.graph_dists_,
        ) = fuzzy_simplicial_set(
            X[index],
            self.n_neighbors,
            random_state,
            "precomputed",
            self._metric_kwds,
            self._knn_indices,
            self._knn_dists,
            self.angular_rp_forest,
            self.set_op_mix_ratio,
            self.local_connectivity,
            True,
            self.verbose,
            self.densmap or self.output_dens,
        )
    # Handle small cases efficiently by computing all distances
    elif n_indexed < 4096 and not self.force_approximation_algorithm:
        self._small_data = True
        try:
            # sklearn pairwise_distances fails for callable metric on sparse data
            _m = self.metric if self._sparse_data else self._input_distance_func
            dmat = pairwise_distances(X[index], metric=_m, **self._metric_kwds)
        except (ValueError, TypeError):
            # metric is numba.jit'd or not supported by sklearn,
            # fallback to pairwise special
            if self._sparse_data and not callable(self.metric):
                # Get a fresh metric since we are casting to dense
                _m = dist.named_distances[self.metric]
                dmat = dist.pairwise_special_metric(
                    X[index].toarray(),
                    metric=_m,
                    kwds=self._metric_kwds,
                )
            else:
                # Dense data, or sparse data with a callable metric.
                dmat = dist.pairwise_special_metric(
                    X[index],
                    metric=self._input_distance_func,
                    kwds=self._metric_kwds,
                )

        (
            self.graph_,
            self._sigmas,
            self._rhos,
            self.graph_dists_,
        ) = fuzzy_simplicial_set(
            dmat,
            self._n_neighbors,
            random_state,
            "precomputed",
            self._metric_kwds,
            None,
            None,
            self.angular_rp_forest,
            self.set_op_mix_ratio,
            self.local_connectivity,
            True,
            self.verbose,
            self.densmap or self.output_dens,
        )
    else:
        # Standard case
        self._small_data = False
        if self._sparse_data and self.metric in pynn_sparse_named_distances:
            nn_metric = self.metric
        elif not self._sparse_data and self.metric in pynn_named_distances:
            nn_metric = self.metric
        else:
            nn_metric = self._input_distance_func

        (
            self._knn_indices,
            self._knn_dists,
            self._knn_search_index,
        ) = nearest_neighbors(
            X[index],
            self._n_neighbors,
            nn_metric,
            self._metric_kwds,
            self.angular_rp_forest,
            random_state,
            self.low_memory,
            use_pynndescent=True,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
        )

        (
            self.graph_,
            self._sigmas,
            self._rhos,
            self.graph_dists_,
        ) = fuzzy_simplicial_set(
            X[index],
            self.n_neighbors,
            random_state,
            nn_metric,
            self._metric_kwds,
            self._knn_indices,
            self._knn_dists,
            self.angular_rp_forest,
            self.set_op_mix_ratio,
            self.local_connectivity,
            True,
            self.verbose,
            self.densmap or self.output_dens,
        )

    # Currently not checking if any duplicate points have differing labels
    # Might be worth throwing a warning...
    if y is not None:
        if self.densmap:
            raise NotImplementedError(
                "Supervised embedding is not supported with densMAP."
            )

        len_X = len(X) if not self._sparse_data else X.shape[0]
        if len_X != len(y):
            raise ValueError(
                "Length of x = {len_x}, length of y = {len_y}, while it must be equal.".format(
                    len_x=len_X, len_y=len(y)
                )
            )
        y_ = check_array(y, ensure_2d=False)[index]
        if self.target_metric == "categorical":
            if self.target_weight < 1.0:
                far_dist = 2.5 * (1.0 / (1.0 - self.target_weight))
            else:
                far_dist = 1.0e12
            self.graph_ = discrete_metric_simplicial_set_intersection(
                self.graph_, y_, far_dist=far_dist
            )
        elif self.target_metric in dist.DISCRETE_METRICS:
            if self.target_weight < 1.0:
                scale = 2.5 * (1.0 / (1.0 - self.target_weight))
            else:
                scale = 1.0e12

            metric_kws = dist.get_discrete_params(y_, self.target_metric)

            self.graph_ = discrete_metric_simplicial_set_intersection(
                self.graph_,
                y_,
                metric=self.target_metric,
                metric_kws=metric_kws,
                metric_scale=scale,
            )
        else:
            if len(y_.shape) == 1:
                y_ = y_.reshape(-1, 1)
            if self.target_n_neighbors == -1:
                target_n_neighbors = self._n_neighbors
            else:
                target_n_neighbors = self.target_n_neighbors

            # Handle the small case as precomputed as before.
            # len(y) rather than y.shape[0] so list-like targets also work.
            if len(y) < 4096:
                try:
                    ydmat = pairwise_distances(
                        y_, metric=self.target_metric, **self._target_metric_kwds
                    )
                except (TypeError, ValueError):
                    ydmat = dist.pairwise_special_metric(
                        y_,
                        metric=self.target_metric,
                        kwds=self._target_metric_kwds,
                    )

                target_graph, target_sigmas, target_rhos = fuzzy_simplicial_set(
                    ydmat,
                    target_n_neighbors,
                    random_state,
                    "precomputed",
                    self._target_metric_kwds,
                    None,
                    None,
                    False,
                    1.0,
                    1.0,
                    False,
                )
            else:
                # Standard case
                target_graph, target_sigmas, target_rhos = fuzzy_simplicial_set(
                    y_,
                    target_n_neighbors,
                    random_state,
                    self.target_metric,
                    self._target_metric_kwds,
                    None,
                    None,
                    False,
                    1.0,
                    1.0,
                    False,
                )

            self.graph_ = general_simplicial_set_intersection(
                self.graph_, target_graph, self.target_weight
            )
            self.graph_ = reset_local_connectivity(self.graph_)
        self._supervised = True
    else:
        self._supervised = False

    # embed graph
    self.fit_embed_data(X, y, index, inverse)

    return self
def component_layout(
    data,
    n_components,
    component_labels,
    dim,
    random_state,
    metric="euclidean",
    metric_kwds=None,
):
    """Provide a layout relating the separate connected components.

    This is done by taking the centroid of each component and then
    performing a spectral embedding of the centroids.

    Parameters
    ----------
    data: array of shape (n_samples, n_features)
        The source data -- required so we can generate centroids for each
        connected component of the graph.

    n_components: int
        The number of distinct components to be laid out.

    component_labels: array of shape (n_samples)
        For each vertex in the graph the label of the component to
        which the vertex belongs.

    dim: int
        The chosen embedding dimension.

    random_state:
        Passed through unchanged as ``random_state`` to
        ``SpectralEmbedding``.

    metric: string or callable (optional, default 'euclidean')
        The metric used to measure distances among the source data points.

    metric_kwds: dict (optional, default None)
        Keyword arguments to be passed to the metric function; ``None`` is
        treated as an empty dict.
        If metric is 'precomputed', 'linkage' keyword can be used to specify
        'average', 'complete', or 'single' linkage. Default is 'average'

    Returns
    -------
    component_embedding: array of shape (n_components, dim)
        The ``dim``-dimensional embedding of the ``n_components``-many
        connected components, divided by its largest entry.

    Raises
    ------
    ValueError
        If metric is 'precomputed' and the 'linkage' keyword is not one of
        'average', 'complete', or 'single'.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if metric_kwds is None:
        metric_kwds = {}

    if metric == "precomputed":
        # cannot compute centroids from precomputed distances
        # instead, compute centroid distances using linkage
        linkage_name = metric_kwds.get("linkage", "average")
        if linkage_name == "average":
            linkage = np.mean
        elif linkage_name == "complete":
            linkage = np.max
        elif linkage_name == "single":
            linkage = np.min
        else:
            # Wrapped in a 1-tuple so a tuple-valued keyword still formats.
            raise ValueError(
                "Unrecognized linkage '%s'. Please choose from "
                "'average', 'complete', or 'single'" % (linkage_name,)
            )

        distance_matrix = np.zeros((n_components, n_components), dtype=np.float64)
        for c_i in range(n_components):
            dm_i = data[component_labels == c_i]
            # Only the upper triangle is computed; the result is mirrored.
            for c_j in range(c_i + 1, n_components):
                dist = linkage(dm_i[:, component_labels == c_j])
                distance_matrix[c_i, c_j] = dist
                distance_matrix[c_j, c_i] = dist
    else:
        # The centroid buffer is only needed on this path, so it is no
        # longer allocated for precomputed input.
        component_centroids = np.empty(
            (n_components, data.shape[1]), dtype=np.float64
        )
        for label in range(n_components):
            component_centroids[label] = data[component_labels == label].mean(
                axis=0
            )

        if metric in ("hellinger", "ll_dirichlet"):
            # These two metrics go through pairwise_special_metric rather
            # than pairwise_distances; metric_kwds is not forwarded here.
            distance_matrix = pairwise_special_metric(
                component_centroids, metric=metric
            )
        else:
            distance_matrix = pairwise_distances(
                component_centroids, metric=metric, **metric_kwds
            )

    # Gaussian-style affinity from the component distances.
    affinity_matrix = np.exp(-(distance_matrix**2))

    component_embedding = SpectralEmbedding(
        n_components=dim, affinity="precomputed", random_state=random_state
    ).fit_transform(affinity_matrix)
    component_embedding /= component_embedding.max()

    return component_embedding