def neighbors(self, k=10, queue_size=5, random_state=0):
    """Search the reference representation for neighbors of the query rows.

    The rows of ``self._obsm['rep']`` are used as query points and
    ``self._rep`` as the reference points. The first ``k`` columns of the
    sorted search result are stored on the instance as ``self._indices`` and
    ``self._distances``; the method itself returns nothing.

    Parameters
    ----------
    k
        Number of neighbors kept for every query row.
    queue_size
        Multiplier for the candidate list: the search is initialised with
        ``int(k * queue_size)`` candidates per query row and only the first
        ``k`` are kept afterwards.
    random_state
        Passed to ``check_random_state``; the resulting generator draws the
        three integers used as the state for the search initialisation.
    """
    from umap.nndescent import initialise_search
    from umap.utils import deheap_sort
    from umap.umap_ import INT32_MAX, INT32_MIN

    generator = check_random_state(random_state)
    rng_state = generator.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)

    reference = self._rep
    queries = self._obsm['rep']
    n_candidates = int(k * queue_size)

    # Seed the candidate heap from the stored forest and initialisers.
    heap = initialise_search(
        self._rp_forest,
        reference,
        queries,
        n_candidates,
        self._random_init,
        self._tree_init,
        rng_state,
    )

    # Refine the candidates by walking the stored search graph.
    search_graph = self._search_graph
    heap = self._search(
        reference,
        search_graph.indptr,
        search_graph.indices,
        heap,
        queries,
    )

    sorted_indices, sorted_dists = deheap_sort(heap)
    self._indices = sorted_indices[:, :k]
    self._distances = sorted_dists[:, :k]
def test_nn_search(nn_data):
    """Compare the graph-based neighbor search against an exact KDTree query.

    The first 100 rows of ``nn_data`` serve as query points and the remaining
    rows as the indexed training set. The test passes when at least 99% of
    the exact 10 nearest neighbors are present in the search result.
    """
    n_neighbors = 10
    queries = nn_data[:100]
    train = nn_data[100:]

    knn_indices, knn_dists, rp_forest = nearest_neighbors(
        train,
        n_neighbors,
        "euclidean",
        {},
        False,
        np.random,
        use_pynndescent=False,
    )

    # Only the search graph is needed here; no fuzzy simplicial set is built.
    search_graph = setup_search_graph(knn_dists, knn_indices, train)

    rng_state = np.random.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)
    heap = initialise_search(
        rp_forest, train, queries, int(n_neighbors * 3), rng_state, dist.euclidean
    )
    heap = initialized_nnd_search(
        train,
        search_graph.indptr,
        search_graph.indices,
        heap,
        queries,
        dist.euclidean,
    )

    found, dists = deheap_sort(heap)
    found = found[:, :n_neighbors]

    # Exact reference answer for the same queries.
    expected = KDTree(train).query(queries, n_neighbors, return_distance=False)

    # Count, row by row, how many exact neighbors the search recovered.
    num_correct = sum(
        (
            np.sum(np.in1d(expected_row, found_row))
            for expected_row, found_row in zip(expected, found)
        ),
        0.0,
    )
    percent_correct = num_correct / (queries.shape[0] * n_neighbors)

    assert_greater_equal(
        percent_correct,
        0.99,
        "Sparse NN-descent did not get 99% accuracy on nearest neighbors",
    )
def test_nn_search():
    """Compare the graph-based neighbor search against an exact KDTree query.

    The first 100 rows of the module-level ``nn_data`` are the query points
    and the remaining rows form the indexed training set. The test passes
    when at least 99% of the exact 10 nearest neighbors are present in the
    search result.
    """
    train = nn_data[100:]
    test = nn_data[:100]

    (knn_indices, knn_dists, rp_forest) = nearest_neighbors(
        train, 10, "euclidean", {}, False, np.random
    )

    # The fuzzy simplicial set that used to be built here was never read by
    # the rest of the test (and was given all of ``nn_data`` together with
    # kNN arrays computed on ``train`` only), so it is no longer computed.

    # Build a symmetric 0/1 adjacency matrix from the kNN result: one row per
    # training point, one entry per neighbor with a non-zero distance.
    search_graph = sparse.lil_matrix((train.shape[0], train.shape[0]), dtype=np.int8)
    search_graph.rows = knn_indices
    search_graph.data = (knn_dists != 0).astype(np.int8)
    search_graph = search_graph.maximum(search_graph.transpose()).tocsr()

    random_init, tree_init = make_initialisations(dist.euclidean, ())
    search = make_initialized_nnd_search(dist.euclidean, ())

    rng_state = np.random.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)
    # Start from 3x as many candidates as neighbors that are finally kept.
    init = initialise_search(
        rp_forest, train, test, int(10 * 3), random_init, tree_init, rng_state
    )
    result = search(train, search_graph.indptr, search_graph.indices, init, test)

    indices, _ = deheap_sort(result)
    indices = indices[:, :10]

    # Exact reference answer for the same queries.
    tree = KDTree(train)
    true_indices = tree.query(test, 10, return_distance=False)

    num_correct = 0.0
    for i in range(test.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], indices[i]))

    percent_correct = num_correct / (test.shape[0] * 10)
    assert_greater_equal(
        percent_correct,
        0.99,
        "Sparse NN-descent did not get " "99% accuracy on nearest " "neighbors",
    )
def _nhood_search(umap_object, nhood_size):
    """Find ``nhood_size`` neighbors of every row of ``umap_object._raw_data``.

    Two strategies are used depending on ``umap_object._small_data``:

    * truthy: a full pairwise distance matrix is computed and the
      ``nhood_size`` smallest entries of each row are selected and sorted;
    * falsy: the stored forest and search graph of ``umap_object`` are
      queried with the raw data as both reference and query points.

    Parameters
    ----------
    umap_object
        Object exposing ``_small_data``, ``_raw_data`` and, for the search
        branch, ``_metric_kwds``, ``_input_distance_func``, ``_rp_forest``,
        ``_search_graph`` and ``transform_queue_size``.
    nhood_size
        Number of neighbors to return per row.

    Returns
    -------
    (indices, dists)
        Neighbor indices and the matching distances, ``nhood_size`` columns
        each.
    """
    if umap_object._small_data:
        # pairwise_distances is called without a metric argument, so
        # scikit-learn's default metric applies here.
        dmat = sklearn.metrics.pairwise_distances(umap_object._raw_data)
        # argpartition puts the nhood_size smallest entries of each row in
        # the leading columns, in no particular order.
        indices = np.argpartition(dmat, nhood_size)[:, :nhood_size]
        # submatrix presumably gathers, per row, the entries at the given
        # column indices -- TODO confirm against its definition.
        dmat_shortened = submatrix(dmat, indices, nhood_size)
        # Order the selected candidates by distance within each row.
        indices_sorted = np.argsort(dmat_shortened)
        indices = submatrix(indices, indices_sorted, nhood_size)
        dists = submatrix(dmat_shortened, indices_sorted, nhood_size)
    else:
        # NOTE(review): np.empty leaves the three state words uninitialised,
        # so the search initialisation is not reproducible between calls --
        # confirm whether a seeded state should be used instead.
        rng_state = np.empty(3, dtype=np.int64)

        if len(umap_object._metric_kwds) >= 1:
            _dist = umap_object._input_distance_func
            _args = tuple(umap_object._metric_kwds.values())

            @numba.njit()
            def _metric(x, y):
                # The distance has to be returned; without the ``return``
                # this wrapper would yield None for every pair of points.
                return _dist(x, y, *_args)

        else:
            _metric = umap_object._input_distance_func

        init = initialise_search(
            umap_object._rp_forest,
            umap_object._raw_data,
            umap_object._raw_data,
            int(nhood_size * umap_object.transform_queue_size),
            rng_state,
            _metric,
        )

        result = initialized_nnd_search(
            umap_object._raw_data,
            umap_object._search_graph.indptr,
            umap_object._search_graph.indices,
            init,
            umap_object._raw_data,
            _metric,
        )

        indices, dists = deheap_sort(result)
        # The search ran with an enlarged candidate list; keep only the
        # requested number of columns.
        indices = indices[:, :nhood_size]
        dists = dists[:, :nhood_size]

    return indices, dists
def neighbors_update(adata, adata_new, k=10, queue_size=5, random_state=0):
    """Search ``adata.X`` for the ``k`` neighbors of every row of ``adata_new.X``.

    The metric, its optional keyword arguments, the neighbor distance matrix
    and the optional ``'rp_forest'`` entry are all read from
    ``adata.uns['neighbors']``.

    Parameters
    ----------
    adata
        Reference object; ``adata.X`` holds the reference points.
    adata_new
        Object whose ``X`` holds the query points.
    k
        Number of neighbors returned per query row.
    queue_size
        Multiplier for the candidate list: the search is initialised with
        ``int(k * queue_size)`` candidates per query row.
    random_state
        Passed to ``check_random_state``; the resulting generator draws the
        three integers used as the state for the search initialisation.

    Returns
    -------
    (indices, distances)
        The first ``k`` columns of the sorted search result.
    """
    # only with use_rep='X' for now
    from umap.nndescent import (
        make_initialisations,
        make_initialized_nnd_search,
        initialise_search,
    )
    from umap.umap_ import INT32_MAX, INT32_MIN
    from umap.utils import deheap_sort
    import umap.distances as dist

    neighbors_info = adata.uns['neighbors']
    params = neighbors_info['params']

    # Extra metric arguments are passed positionally, in stored order.
    metric_args = (
        tuple(params['metric_kwds'].values()) if 'metric_kwds' in params else ()
    )
    metric_func = dist.named_distances[params['metric']]

    random_init, tree_init = make_initialisations(metric_func, metric_args)
    search = make_initialized_nnd_search(metric_func, metric_args)

    # Turn the stored distances into a symmetric 0/1 adjacency structure.
    adjacency = neighbors_info['distances'].copy()
    adjacency.data = (adjacency.data > 0).astype(np.int8)
    adjacency = adjacency.maximum(adjacency.transpose())
    # prune it?

    generator = check_random_state(random_state)
    rng_state = generator.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)

    # A stored forest is optional; None is handed on when it is absent.
    forest = (
        _rp_forest_generate(neighbors_info['rp_forest'])
        if 'rp_forest' in neighbors_info
        else None
    )

    reference = adata.X
    queries = adata_new.X

    heap = initialise_search(
        forest,
        reference,
        queries,
        int(k * queue_size),
        random_init,
        tree_init,
        rng_state,
    )
    heap = search(reference, adjacency.indptr, adjacency.indices, heap, queries)

    sorted_indices, sorted_dists = deheap_sort(heap)
    return sorted_indices[:, :k], sorted_dists[:, :k]
def _nhood_search(umap_object, nhood_size):
    """Find ``nhood_size`` neighbors of every row of ``umap_object._raw_data``.

    When ``umap_object._small_data`` is truthy, a full pairwise distance
    matrix is computed and the ``nhood_size`` smallest entries of each row
    are selected and sorted. Otherwise the stored forest and search graph of
    ``umap_object`` are queried, with the raw data used as both reference
    and query points.

    Parameters
    ----------
    umap_object
        Object exposing ``_small_data``, ``_raw_data`` and, for the search
        branch, ``_rp_forest``, ``_search_graph``, ``transform_queue_size``,
        ``_distance_func`` and ``_dist_args``.
    nhood_size
        Number of neighbors to return per row.

    Returns
    -------
    (indices, dists)
        Neighbor indices and the matching distances, ``nhood_size`` columns
        each.
    """
    if umap_object._small_data:
        # pairwise_distances is called without a metric argument, so
        # scikit-learn's default metric applies here.
        full_dists = sklearn.metrics.pairwise_distances(umap_object._raw_data)
        # Leading columns hold the nhood_size smallest entries per row,
        # still unordered.
        candidates = np.argpartition(full_dists, nhood_size)[:, :nhood_size]
        # submatrix presumably gathers, per row, the entries at the given
        # column indices -- TODO confirm against its definition.
        candidate_dists = submatrix(full_dists, candidates, nhood_size)
        order = np.argsort(candidate_dists)
        return (
            submatrix(candidates, order, nhood_size),
            submatrix(candidate_dists, order, nhood_size),
        )

    # NOTE(review): the state array is allocated but never filled in --
    # confirm that an uninitialised state is intended here.
    rng_state = np.empty(3, dtype=np.int64)

    heap = initialise_search(
        umap_object._rp_forest,
        umap_object._raw_data,
        umap_object._raw_data,
        int(nhood_size * umap_object.transform_queue_size),
        rng_state,
        umap_object._distance_func,
        umap_object._dist_args,
    )
    heap = initialized_nnd_search(
        umap_object._raw_data,
        umap_object._search_graph.indptr,
        umap_object._search_graph.indices,
        heap,
        umap_object._raw_data,
        umap_object._distance_func,
        umap_object._dist_args,
    )

    sorted_indices, sorted_dists = deheap_sort(heap)
    # The search ran with an enlarged candidate list; keep only the
    # requested number of columns.
    return sorted_indices[:, :nhood_size], sorted_dists[:, :nhood_size]