def neighbors(self, k=10, queue_size=5, random_state=0):
    """\
    Calculate neighbors of `adata_new` observations in `adata`.

    This function calculates `k` neighbors in `adata` for each
    observation of `adata_new`.
    """
    from umap.utils import deheap_sort
    from umap.umap_ import INT32_MAX, INT32_MIN

    random_state = check_random_state(random_state)
    rng_state = random_state.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)

    train = self._rep
    test = self._obsm['rep']
    init = self._initialise_search(
        self._rp_forest, train, test, int(k * queue_size), rng_state=rng_state
    )
    result = self._search(
        train, self._search_graph.indptr, self._search_graph.indices, init, test
    )

    indices, dists = deheap_sort(result)
    self._indices, self._distances = indices[:, :k], dists[:, :k]
def neighbors(self, k=None, queue_size=5, epsilon=0.1, random_state=0):
    """\
    Calculate neighbors of `adata_new` observations in `adata`.

    This function calculates `k` neighbors in `adata` for each
    observation of `adata_new`.
    """
    from umap.umap_ import INT32_MAX, INT32_MIN

    random_state = check_random_state(random_state)
    rng_state = random_state.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)

    train = self._rep
    test = self._obsm['rep']

    if k is None:
        k = self._n_neighbors

    if self._use_pynndescent:
        self._nnd_idx.search_rng_state = rng_state
        self._indices, self._distances = self._nnd_idx.query(test, k, epsilon)
    else:
        from umap.utils import deheap_sort

        init = self._initialise_search(
            self._rp_forest, train, test, int(k * queue_size), rng_state=rng_state
        )
        result = self._search(
            train, self._search_graph.indptr, self._search_graph.indices, init, test
        )

        indices, dists = deheap_sort(result)
        self._indices, self._distances = indices[:, :k], dists[:, :k]
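For context, a minimal usage sketch of how this method is reached through scanpy's public `Ingest` API. This is a sketch under assumptions: the dataset choice and `k` value are illustrative, and the reference `AnnData` must already carry a neighbors graph (e.g. from `sc.pp.neighbors`).

import scanpy as sc

# Illustrative reference data; pbmc68k_reduced ships with a precomputed
# neighbors graph, which Ingest requires.
adata_ref = sc.datasets.pbmc68k_reduced()
adata_new = adata_ref[:100].copy()  # stand-in for a new batch of observations

ing = sc.tl.Ingest(adata_ref)
ing.fit(adata_new)
ing.neighbors(k=10)  # populates ing._indices and ing._distances as above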
def nn_descent(
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=dist.euclidean,
    n_iters=10,
    delta=0.001,
    rho=0.5,
    rp_tree_init=True,
    leaf_array=None,
    low_memory=False,
    verbose=False,
):
    tried = set([(-1, -1)])

    # Initialise each vertex's heap with a random sample of neighbors.
    current_graph = make_heap(data.shape[0], n_neighbors)
    for i in range(data.shape[0]):
        indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
        for j in range(indices.shape[0]):
            d = dist(data[i], data[indices[j]])
            heap_push(current_graph, i, d, indices[j], 1)
            heap_push(current_graph, indices[j], d, i, 1)
            tried.add((i, indices[j]))
            tried.add((indices[j], i))

    # Optionally seed the graph with candidate pairs from the RP-tree leaves.
    if rp_tree_init:
        init_rp_tree(data, dist, current_graph, leaf_array, tried=tried)

    # The low-memory path does not carry the `tried` set through the descent,
    # trading some redundant distance computations for a smaller footprint.
    if low_memory:
        nn_descent_internal_low_memory(
            current_graph,
            data,
            n_neighbors,
            rng_state,
            max_candidates=max_candidates,
            dist=dist,
            n_iters=n_iters,
            delta=delta,
            rho=rho,
            verbose=verbose,
        )
    else:
        nn_descent_internal_high_memory(
            current_graph,
            data,
            n_neighbors,
            rng_state,
            tried,
            max_candidates=max_candidates,
            dist=dist,
            n_iters=n_iters,
            delta=delta,
            rho=rho,
            verbose=verbose,
        )

    return deheap_sort(current_graph)
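A usage sketch for the function above, assuming it and umap's jitted euclidean distance are importable under these names; passing rp_tree_init=False sidesteps building a random projection forest and its leaf_array first.

import numpy as np
import umap.distances as dist

rng = np.random.RandomState(42)
data = rng.normal(size=(500, 16)).astype(np.float32)
# NN-descent draws its randomness from a 3-element int64 state vector.
rng_state = rng.randint(
    np.iinfo(np.int32).min + 1, np.iinfo(np.int32).max - 1, 3
).astype(np.int64)

indices, dists = nn_descent(
    data, n_neighbors=15, rng_state=rng_state,
    dist=dist.euclidean, rp_tree_init=False,
)
assert indices.shape == (500, 15)  # k approximate neighbors per point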
def test_nn_search(nn_data):
    train = nn_data[100:]
    test = nn_data[:100]

    (knn_indices, knn_dists, rp_forest) = nearest_neighbors(
        train, 10, "euclidean", {}, False, np.random, use_pynndescent=False
    )

    # Commented out: not actually used by this test.
    # graph = fuzzy_simplicial_set(
    #     nn_data,
    #     10,
    #     np.random,
    #     "euclidean",
    #     {},
    #     knn_indices,
    #     knn_dists,
    #     False,
    #     1.0,
    #     1.0,
    #     False,
    # )

    search_graph = setup_search_graph(knn_dists, knn_indices, train)

    rng_state = np.random.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)
    init = initialise_search(
        rp_forest, train, test, int(10 * 3), rng_state, dist.euclidean
    )
    result = initialized_nnd_search(
        train, search_graph.indptr, search_graph.indices, init, test, dist.euclidean
    )

    indices, dists = deheap_sort(result)
    indices = indices[:, :10]

    tree = KDTree(train)
    true_indices = tree.query(test, 10, return_distance=False)

    num_correct = 0.0
    for i in range(test.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], indices[i]))

    percent_correct = num_correct / (test.shape[0] * 10)
    assert_greater_equal(
        percent_correct,
        0.99,
        "NN-descent did not get 99% accuracy on nearest neighbors",
    )
def test_nn_search():
    train = nn_data[100:]
    test = nn_data[:100]

    (knn_indices, knn_dists, rp_forest) = nearest_neighbors(
        train, 10, "euclidean", {}, False, np.random
    )

    graph = fuzzy_simplicial_set(
        nn_data,
        10,
        np.random,
        "euclidean",
        {},
        knn_indices,
        knn_dists,
        False,
        1.0,
        1.0,
        False,
    )

    # Build a symmetric binary search graph over the training data.
    search_graph = sparse.lil_matrix((train.shape[0], train.shape[0]), dtype=np.int8)
    search_graph.rows = knn_indices
    search_graph.data = (knn_dists != 0).astype(np.int8)
    search_graph = search_graph.maximum(search_graph.transpose()).tocsr()

    random_init, tree_init = make_initialisations(dist.euclidean, ())
    search = make_initialized_nnd_search(dist.euclidean, ())

    rng_state = np.random.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)
    init = initialise_search(
        rp_forest, train, test, int(10 * 3), random_init, tree_init, rng_state
    )
    result = search(train, search_graph.indptr, search_graph.indices, init, test)

    indices, dists = deheap_sort(result)
    indices = indices[:, :10]

    tree = KDTree(train)
    true_indices = tree.query(test, 10, return_distance=False)

    num_correct = 0.0
    for i in range(test.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], indices[i]))

    percent_correct = num_correct / (test.shape[0] * 10)
    assert_greater_equal(
        percent_correct,
        0.99,
        "NN-descent did not get 99% accuracy on nearest neighbors",
    )
def _nhood_search(umap_object, nhood_size):
    if umap_object._small_data:
        dmat = sklearn.metrics.pairwise_distances(umap_object._raw_data)
        indices = np.argpartition(dmat, nhood_size)[:, :nhood_size]
        dmat_shortened = submatrix(dmat, indices, nhood_size)
        indices_sorted = np.argsort(dmat_shortened)
        indices = submatrix(indices, indices_sorted, nhood_size)
        dists = submatrix(dmat_shortened, indices_sorted, nhood_size)
    else:
        rng_state = np.empty(3, dtype=np.int64)

        if len(umap_object._metric_kwds) >= 1:
            _dist = umap_object._input_distance_func
            _args = tuple(umap_object._metric_kwds.values())

            # Jitted wrapper binding the metric kwds; it must return
            # the computed distance.
            @numba.njit()
            def _metric(x, y):
                return _dist(x, y, *_args)

        else:
            _metric = umap_object._input_distance_func

        init = initialise_search(
            umap_object._rp_forest,
            umap_object._raw_data,
            umap_object._raw_data,
            int(nhood_size * umap_object.transform_queue_size),
            rng_state,
            _metric,
        )

        result = initialized_nnd_search(
            umap_object._raw_data,
            umap_object._search_graph.indptr,
            umap_object._search_graph.indices,
            init,
            umap_object._raw_data,
            _metric,
        )

        indices, dists = deheap_sort(result)
        indices = indices[:, :nhood_size]
        dists = dists[:, :nhood_size]

    return indices, dists
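A sketch of exercising this helper against a fitted model, assuming a umap-learn version in which _nhood_search has the shape above; these are internal names and may change between releases.

import numpy as np
import umap

data = np.random.RandomState(0).normal(size=(300, 4)).astype(np.float32)
mapper = umap.UMAP(n_neighbors=10).fit(data)

# 300 points counts as "small data", so this takes the exact
# pairwise-distance branch rather than the NN-descent search.
indices, dists = _nhood_search(mapper, nhood_size=10)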
def neighbors_update(adata, adata_new, k=10, queue_size=5, random_state=0):
    # only with use_rep='X' for now
    from umap.nndescent import (
        make_initialisations,
        make_initialized_nnd_search,
        initialise_search,
    )
    from umap.umap_ import INT32_MAX, INT32_MIN
    from umap.utils import deheap_sort
    import umap.distances as dist

    if 'metric_kwds' in adata.uns['neighbors']['params']:
        dist_args = tuple(adata.uns['neighbors']['params']['metric_kwds'].values())
    else:
        dist_args = ()
    dist_func = dist.named_distances[adata.uns['neighbors']['params']['metric']]

    random_init, tree_init = make_initialisations(dist_func, dist_args)
    search = make_initialized_nnd_search(dist_func, dist_args)

    search_graph = adata.uns['neighbors']['distances'].copy()
    search_graph.data = (search_graph.data > 0).astype(np.int8)
    search_graph = search_graph.maximum(search_graph.transpose())  # prune it?

    random_state = check_random_state(random_state)
    rng_state = random_state.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)

    if 'rp_forest' in adata.uns['neighbors']:
        rp_forest = _rp_forest_generate(adata.uns['neighbors']['rp_forest'])
    else:
        rp_forest = None

    train = adata.X
    test = adata_new.X

    init = initialise_search(
        rp_forest, train, test, int(k * queue_size), random_init, tree_init, rng_state
    )
    result = search(train, search_graph.indptr, search_graph.indices, init, test)

    indices, dists = deheap_sort(result)
    return indices[:, :k], dists[:, :k]
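A sketch of calling neighbors_update, assuming an older scanpy layout in which the distances matrix lives in adata.uns['neighbors'] (newer releases moved it to adata.obsp) and a graph built directly on .X, as the comment above requires; the dataset choice is illustrative.

import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()   # illustrative reference
# The function assumes the graph was built directly on .X:
sc.pp.neighbors(adata, n_neighbors=15, use_rep='X')
adata_new = adata[:50].copy()           # stand-in for new observations

indices, dists = neighbors_update(adata, adata_new, k=10)
print(indices.shape)  # (50, 10): ten reference neighbors per new observation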
def _nhood_search(umap_object, nhood_size):
    if umap_object._small_data:
        dmat = sklearn.metrics.pairwise_distances(umap_object._raw_data)
        indices = np.argpartition(dmat, nhood_size)[:, :nhood_size]
        dmat_shortened = submatrix(dmat, indices, nhood_size)
        indices_sorted = np.argsort(dmat_shortened)
        indices = submatrix(indices, indices_sorted, nhood_size)
        dists = submatrix(dmat_shortened, indices_sorted, nhood_size)
    else:
        rng_state = np.empty(3, dtype=np.int64)

        init = initialise_search(
            umap_object._rp_forest,
            umap_object._raw_data,
            umap_object._raw_data,
            int(nhood_size * umap_object.transform_queue_size),
            rng_state,
            umap_object._distance_func,
            umap_object._dist_args,
        )

        result = initialized_nnd_search(
            umap_object._raw_data,
            umap_object._search_graph.indptr,
            umap_object._search_graph.indices,
            init,
            umap_object._raw_data,
            umap_object._distance_func,
            umap_object._dist_args,
        )

        indices, dists = deheap_sort(result)
        indices = indices[:, :nhood_size]
        dists = dists[:, :nhood_size]

    return indices, dists
def test_sparse_nn_search(sparse_nn_data):
    train = sparse_nn_data[100:]
    test = sparse_nn_data[:100]

    (knn_indices, knn_dists, rp_forest) = nearest_neighbors(
        train, 15, "euclidean", {}, False, np.random, use_pynndescent=False
    )

    # Commented out: it does not influence the test. It also used `nn_data`
    # rather than `sparse_nn_data`, which looks like an unintentional
    # copy-and-paste error.
    # graph = fuzzy_simplicial_set(
    #     nn_data,
    #     15,
    #     np.random,
    #     "euclidean",
    #     {},
    #     knn_indices,
    #     knn_dists,
    #     False,
    #     1.0,
    #     1.0,
    #     False,
    # )

    search_graph = setup_search_graph(knn_dists, knn_indices, train)

    rng_state = np.random.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)
    init = sparse_initialise_search(
        rp_forest,
        train.indices,
        train.indptr,
        train.data,
        test.indices,
        test.indptr,
        test.data,
        int(10 * 6),
        rng_state,
        spdist.sparse_euclidean,
    )
    result = sparse_initialized_nnd_search(
        train.indices,
        train.indptr,
        train.data,
        search_graph.indptr,
        search_graph.indices,
        init,
        test.indices,
        test.indptr,
        test.data,
        spdist.sparse_euclidean,
    )

    indices, dists = deheap_sort(result)
    indices = indices[:, :10]

    tree = KDTree(train.toarray())
    true_indices = tree.query(test.toarray(), 10, return_distance=False)

    num_correct = 0.0
    for i in range(test.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], indices[i]))

    percent_correct = num_correct / (test.shape[0] * 10)
    assert_greater_equal(
        percent_correct,
        0.85,
        "Sparse NN-descent did not get 85% accuracy on nearest neighbors",
    )
def sparse_nn_descent(
    inds,
    indptr,
    data,
    n_vertices,
    n_neighbors,
    rng_state,
    max_candidates=50,
    sparse_dist=umap.sparse.sparse_euclidean,
    n_iters=10,
    delta=0.001,
    rho=0.5,
    low_memory=False,
    rp_tree_init=True,
    leaf_array=None,
    verbose=False,
):
    tried = set([(-1, -1)])

    # Initialise each vertex's heap with a random sample of neighbors,
    # computing distances on the CSR slices of the two rows.
    current_graph = make_heap(n_vertices, n_neighbors)
    for i in range(n_vertices):
        indices = rejection_sample(n_neighbors, n_vertices, rng_state)
        for j in range(indices.shape[0]):
            from_inds = inds[indptr[i]:indptr[i + 1]]
            from_data = data[indptr[i]:indptr[i + 1]]
            to_inds = inds[indptr[indices[j]]:indptr[indices[j] + 1]]
            to_data = data[indptr[indices[j]]:indptr[indices[j] + 1]]
            d = sparse_dist(from_inds, from_data, to_inds, to_data)
            heap_push(current_graph, i, d, indices[j], 1)
            heap_push(current_graph, indices[j], d, i, 1)
            tried.add((i, indices[j]))
            tried.add((indices[j], i))

    # Optionally seed the graph with candidate pairs from the RP-tree leaves.
    if rp_tree_init:
        sparse_init_rp_tree(
            inds,
            indptr,
            data,
            sparse_dist,
            current_graph,
            leaf_array,
            tried=tried,
        )

    # The low-memory path does not carry the `tried` set through the descent.
    if low_memory:
        sparse_nn_descent_internal_low_memory(
            current_graph,
            inds,
            indptr,
            data,
            n_vertices,
            n_neighbors,
            rng_state,
            max_candidates=max_candidates,
            sparse_dist=sparse_dist,
            n_iters=n_iters,
            delta=delta,
            rho=rho,
            verbose=verbose,
        )
    else:
        sparse_nn_descent_internal_high_memory(
            current_graph,
            inds,
            indptr,
            data,
            n_vertices,
            n_neighbors,
            rng_state,
            tried,
            max_candidates=max_candidates,
            sparse_dist=sparse_dist,
            n_iters=n_iters,
            delta=delta,
            rho=rho,
            verbose=verbose,
        )

    return deheap_sort(current_graph)
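A sketch of calling the sparse variant on a CSR matrix, assuming sparse_nn_descent and umap.sparse.sparse_euclidean are importable as below; the shapes, density, and dtypes are illustrative.

import numpy as np
import scipy.sparse as sp
import umap.sparse

rng = np.random.RandomState(1)
X = sp.random(400, 50, density=0.2, format='csr', random_state=rng)
rng_state = rng.randint(
    np.iinfo(np.int32).min + 1, np.iinfo(np.int32).max - 1, 3
).astype(np.int64)

# rp_tree_init=False avoids needing a leaf_array from a sparse RP forest.
indices, dists = sparse_nn_descent(
    X.indices, X.indptr, X.data, X.shape[0], n_neighbors=10,
    rng_state=rng_state, sparse_dist=umap.sparse.sparse_euclidean,
    rp_tree_init=False,
)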
# `dist` and `dist_args` are taken as explicit parameters here; in umap's
# original code they were closure variables bound by the
# `make_nn_descent(dist, dist_args)` factory.
def nn_descent(
    data,
    n_neighbors,
    rng_state,
    dist,
    dist_args,
    max_candidates=50,
    n_iters=10,
    delta=0.001,
    rho=0.5,
    rp_tree_init=True,
    leaf_array=None,
    verbose=False,
):
    n_vertices = data.shape[0]

    # Initialise each vertex's heap with a random sample of neighbors.
    current_graph = make_heap(data.shape[0], n_neighbors)
    for i in range(data.shape[0]):
        indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
        for j in range(indices.shape[0]):
            d = dist(data[i], data[indices[j]], *dist_args)
            heap_push(current_graph, i, d, indices[j], 1)
            heap_push(current_graph, indices[j], d, i, 1)

    if rp_tree_init:
        # Seed the graph with all pairs within each RP-tree leaf.
        for n in range(leaf_array.shape[0]):
            for i in range(leaf_array.shape[1]):
                if leaf_array[n, i] < 0:
                    break
                for j in range(i + 1, leaf_array.shape[1]):
                    if leaf_array[n, j] < 0:
                        break
                    d = dist(
                        data[leaf_array[n, i]], data[leaf_array[n, j]], *dist_args
                    )
                    heap_push(current_graph, leaf_array[n, i], d, leaf_array[n, j], 1)
                    heap_push(current_graph, leaf_array[n, j], d, leaf_array[n, i], 1)

    for n in range(n_iters):
        if verbose:
            print("\t", n, " / ", n_iters)

        candidate_neighbors = build_candidates(
            current_graph, n_vertices, n_neighbors, max_candidates, rng_state
        )

        c = 0
        for i in range(n_vertices):
            for j in range(max_candidates):
                p = int(candidate_neighbors[0, i, j])
                if p < 0 or tau_rand(rng_state) < rho:
                    continue
                for k in range(max_candidates):
                    q = int(candidate_neighbors[0, i, k])
                    # Skip pairs where neither candidate is flagged as new
                    # (`and` binds tighter than `or` here).
                    if (
                        q < 0
                        or not candidate_neighbors[2, i, j]
                        and not candidate_neighbors[2, i, k]
                    ):
                        continue

                    d = dist(data[p], data[q], *dist_args)
                    c += heap_push(current_graph, p, d, q, 1)
                    c += heap_push(current_graph, q, d, p, 1)

        # Converged once the update count drops below delta * k * n.
        if c <= delta * n_neighbors * data.shape[0]:
            break

    return deheap_sort(current_graph)
# `sparse_dist` and `dist_args` are taken as explicit parameters here; in
# umap's original code they were closure variables bound by the
# `make_sparse_nn_descent(sparse_dist, dist_args)` factory.
def nn_descent(
    inds,
    indptr,
    data,
    n_vertices,
    n_neighbors,
    rng_state,
    sparse_dist,
    dist_args,
    max_candidates=50,
    n_iters=10,
    delta=0.001,
    rho=0.5,
    rp_tree_init=True,
    leaf_array=None,
    verbose=False,
):
    # Initialise each vertex's heap with a random sample of neighbors,
    # computing distances on the CSR slices of the two rows.
    current_graph = make_heap(n_vertices, n_neighbors)
    for i in range(n_vertices):
        indices = rejection_sample(n_neighbors, n_vertices, rng_state)
        for j in range(indices.shape[0]):
            from_inds = inds[indptr[i]:indptr[i + 1]]
            from_data = data[indptr[i]:indptr[i + 1]]
            to_inds = inds[indptr[indices[j]]:indptr[indices[j] + 1]]
            to_data = data[indptr[indices[j]]:indptr[indices[j] + 1]]
            d = sparse_dist(from_inds, from_data, to_inds, to_data, *dist_args)
            heap_push(current_graph, i, d, indices[j], 1)
            heap_push(current_graph, indices[j], d, i, 1)

    if rp_tree_init:
        # Seed the graph with all pairs within each RP-tree leaf.
        for n in range(leaf_array.shape[0]):
            for i in range(leaf_array.shape[1]):
                if leaf_array[n, i] < 0:
                    break
                for j in range(i + 1, leaf_array.shape[1]):
                    if leaf_array[n, j] < 0:
                        break
                    from_inds = inds[indptr[leaf_array[n, i]]:indptr[leaf_array[n, i] + 1]]
                    from_data = data[indptr[leaf_array[n, i]]:indptr[leaf_array[n, i] + 1]]
                    to_inds = inds[indptr[leaf_array[n, j]]:indptr[leaf_array[n, j] + 1]]
                    to_data = data[indptr[leaf_array[n, j]]:indptr[leaf_array[n, j] + 1]]
                    d = sparse_dist(from_inds, from_data, to_inds, to_data, *dist_args)
                    heap_push(current_graph, leaf_array[n, i], d, leaf_array[n, j], 1)
                    heap_push(current_graph, leaf_array[n, j], d, leaf_array[n, i], 1)

    for n in range(n_iters):
        if verbose:
            print("\t", n, " / ", n_iters)

        candidate_neighbors = build_candidates(
            current_graph, n_vertices, n_neighbors, max_candidates, rng_state
        )

        c = 0
        for i in range(n_vertices):
            for j in range(max_candidates):
                p = int(candidate_neighbors[0, i, j])
                if p < 0 or tau_rand(rng_state) < rho:
                    continue
                for k in range(max_candidates):
                    q = int(candidate_neighbors[0, i, k])
                    # Skip pairs where neither candidate is flagged as new
                    # (`and` binds tighter than `or` here).
                    if (
                        q < 0
                        or not candidate_neighbors[2, i, j]
                        and not candidate_neighbors[2, i, k]
                    ):
                        continue

                    from_inds = inds[indptr[p]:indptr[p + 1]]
                    from_data = data[indptr[p]:indptr[p + 1]]
                    to_inds = inds[indptr[q]:indptr[q + 1]]
                    to_data = data[indptr[q]:indptr[q + 1]]
                    d = sparse_dist(from_inds, from_data, to_inds, to_data, *dist_args)
                    c += heap_push(current_graph, p, d, q, 1)
                    c += heap_push(current_graph, q, d, p, 1)

        # Converged once the update count drops below delta * k * n.
        if c <= delta * n_neighbors * n_vertices:
            break

    return deheap_sort(current_graph)