def get_Xss_confidence(self):
    X = self.X_data
    X = X.A if sp.issparse(X) else X
    Xss = self.Xss.get_X()

    if X.shape[0] > 200000 and X.shape[1] > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(
            X,
            metric='euclidean',
            n_neighbors=min(self.k, X.shape[0] - 1),
            n_jobs=-1,
            random_state=19491001,
        )
        _, dist = nbrs.query(Xss, k=min(self.k, X.shape[0] - 1))
    else:
        alg = 'ball_tree' if X.shape[1] > 10 else 'kd_tree'
        nbrs = NearestNeighbors(
            n_neighbors=min(self.k, X.shape[0] - 1), algorithm=alg, n_jobs=-1
        ).fit(X)
        dist, _ = nbrs.kneighbors(Xss)

    dist_m = dist.mean(1)
    confidence = 1 - dist_m / dist_m.max()
    return confidence
def fit(self, data: np.ndarray, sight_k: int = None) -> Any:
    """Fits the model

    data: np.ndarray (samples, features)
    sight_k: int
        the farthest point that a node is allowed to connect to
        when its closest neighbours are not allowed
    """
    self.data = data
    if sight_k is not None:
        self.sight_k = sight_k
    logging.debug(
        f"First search the {self.sight_k} nearest neighbours for {self.n_samples}"
    )
    np.random.seed(13)
    if self.metric == "correlation":
        self._nn = _NearestNeighbors(
            n_neighbors=self.sight_k + 1,
            metric=self.metric,
            p=self.minkowski_p,
            n_jobs=self.n_jobs,
            algorithm="brute",
        )
        self._nn.fit(self.data)
    elif self.metric == "js":
        self._nn = NNDescent(data=self.data, metric=jensen_shannon_distance)
    else:
        self._nn = _NearestNeighbors(
            n_neighbors=self.sight_k + 1,
            metric=self.metric,
            p=self.minkowski_p,
            n_jobs=self.n_jobs,
            leaf_size=30,
        )
        self._nn.fit(self.data)
    # call this to calculate bknn
    self.kneighbors_graph(mode='distance')
    return self
def test_transformer_equivalence():
    N_NEIGHBORS = 15
    EPSILON = 0.15
    train = nn_data[:400]
    test = nn_data[:200]

    # Note we shift N_NEIGHBORS to conform to sklearn's KNeighborTransformer defn
    nnd = NNDescent(
        data=train, n_neighbors=N_NEIGHBORS + 1, random_state=42, compressed=False
    )
    indices, dists = nnd.query(test, k=N_NEIGHBORS, epsilon=EPSILON)
    sort_idx = np.argsort(indices, axis=1)
    indices_sorted = np.vstack(
        [indices[i, sort_idx[i]] for i in range(sort_idx.shape[0])]
    )
    dists_sorted = np.vstack(
        [dists[i, sort_idx[i]] for i in range(sort_idx.shape[0])]
    )

    # Note we shift N_NEIGHBORS to conform to sklearn's KNeighborTransformer defn
    transformer = PyNNDescentTransformer(
        n_neighbors=N_NEIGHBORS, search_epsilon=EPSILON, random_state=42
    ).fit(train, compress_index=False)
    Xt = transformer.transform(test).sorted_indices()

    assert np.all(Xt.indices == indices_sorted.flatten())
    assert np.allclose(Xt.data, dists_sorted.flat)
def fit(self, X, V, k, s=None, tol=1e-4):
    self.__reset__()

    # knn clustering
    if self.nbrs_idx is None:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric='euclidean',
                n_neighbors=k + 1,
                n_jobs=-1,
                random_state=19491001,
            )
            Idx, _ = nbrs.query(X, k=k + 1)
        else:
            alg = 'ball_tree' if X.shape[1] > 10 else 'kd_tree'
            nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=alg, n_jobs=-1).fit(X)
            _, Idx = nbrs.kneighbors(X)
        self.nbrs_idx = Idx[:, 1:]
    else:
        Idx = self.nbrs_idx

    # compute transition prob.
    n = X.shape[0]
    self.P = np.zeros((n, n))
    for i in range(n):
        y = X[i]
        v = V[i]
        Y = X[Idx[i, 1:]]  # drop the point itself from its neighbourhood
        p = compute_markov_trans_prob(y, v, Y, s, cont_time=True)
        p[p <= tol] = 0  # tolerance check
        self.P[Idx[i, 1:], i] = p
        self.P[i, i] = -np.sum(p)  # columns of a rate matrix sum to zero
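# A small property check for the fit above (illustrative helper, not part of the
# original class): with cont_time=True the diagonal is set to -sum(p), so the
# fitted P should be a continuous-time transition-rate (generator) matrix in the
# column convention.
import numpy as np

def is_generator_matrix(P: np.ndarray, atol: float = 1e-8) -> bool:
    """True iff off-diagonal entries are non-negative and each column sums to zero."""
    off_diag = P - np.diag(np.diag(P))
    return bool(np.all(off_diag >= -atol) and np.allclose(P.sum(axis=0), 0.0, atol=atol))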
def get_Xss_confidence(self, k=50):
    X = self.X_data
    X = X.A if sp.issparse(X) else X
    Xss = self.Xss.get_X()
    Xref = np.median(X, 0)
    Xss = np.vstack((Xss, Xref))

    if X.shape[0] > 200000 and X.shape[1] > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(
            X,
            metric="euclidean",
            n_neighbors=min(k, X.shape[0] - 1),
            n_jobs=-1,
            random_state=19491001,
        )
        _, dist = nbrs.query(Xss, k=min(k, X.shape[0] - 1))
    else:
        alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
        nbrs = NearestNeighbors(
            n_neighbors=min(k, X.shape[0] - 1), algorithm=alg, n_jobs=-1
        ).fit(X)
        dist, _ = nbrs.kneighbors(Xss)

    dist_m = dist.mean(1)
    # confidence = 1 - dist_m / dist_m.max()
    sigma = 0.1 * 0.5 * (
        np.max(X[:, 0]) - np.min(X[:, 0]) + np.max(X[:, 1]) - np.min(X[:, 1])
    )
    confidence = gaussian_1d(dist_m, sigma=sigma)
    confidence /= np.max(confidence)
    return confidence[:-1]  # drop the reference point appended above
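# gaussian_1d is referenced above but not shown. A minimal sketch consistent with
# its use here (an assumption, not the original implementation); any normalization
# constant would cancel anyway, since confidence is rescaled by its maximum.
import numpy as np

def gaussian_1d(t, mean=0.0, sigma=1.0):
    return np.exp(-0.5 * ((t - mean) / sigma) ** 2)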
def graphize_vecfld(func, X, nbrs_idx=None, dist=None, k=30, distance_free=True,
                    n_int_steps=20, cores=1):
    n, d = X.shape

    nbrs = None
    if nbrs_idx is None:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric='euclidean',
                n_neighbors=k + 1,
                n_jobs=-1,
                random_state=19491001,
            )
            nbrs_idx, dist = nbrs.query(X, k=k + 1)
        else:
            alg = 'ball_tree' if X.shape[1] > 10 else 'kd_tree'
            nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=alg, n_jobs=-1).fit(X)
            dist, nbrs_idx = nbrs.kneighbors(X)

    if dist is None and not distance_free:
        D = pdist(X)
    else:
        D = None

    V = sp.csr_matrix((n, n))
    if cores == 1:
        for i, idx in tqdm(
            enumerate(nbrs_idx),
            desc='Constructing diffusion graph from reconstructed vector field',
        ):
            V += construct_v(X, i, idx, n_int_steps, func, distance_free, dist, D, n)
    else:
        pool = ThreadPool(cores)
        res = pool.starmap(
            construct_v,
            zip(
                itertools.repeat(X),
                np.arange(len(nbrs_idx)),
                nbrs_idx,
                itertools.repeat(n_int_steps),
                itertools.repeat(func),
                itertools.repeat(distance_free),
                itertools.repeat(dist),
                itertools.repeat(D),
                itertools.repeat(n),
            ),
        )
        pool.close()
        pool.join()
        V = functools.reduce((lambda a, b: a + b), res)
    return V, nbrs
def bandwidth_selector(X):
    """
    This function computes an empirical bandwidth for a Gaussian kernel.
    """
    n, m = X.shape
    if n > 200000 and m > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(
            X,
            metric="euclidean",
            n_neighbors=max(2, int(0.2 * n)),
            n_jobs=-1,
            random_state=19491001,
        )
        _, distances = nbrs.query(X, k=max(2, int(0.2 * n)))
    else:
        alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
        nbrs = NearestNeighbors(
            n_neighbors=max(2, int(0.2 * n)), algorithm=alg, n_jobs=-1
        ).fit(X)
        distances, _ = nbrs.kneighbors(X)

    d = np.mean(distances[:, 1:]) / 1.5
    return np.sqrt(2) * d
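# A minimal usage sketch for bandwidth_selector (illustrative data; assumes the
# same numpy / sklearn imports as the function above): pick a bandwidth from the
# data, then evaluate a Gaussian kernel weight between two points with it.
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 2))
h = bandwidth_selector(X)
r2 = np.sum((X[0] - X[1]) ** 2)
w = np.exp(-r2 / (2 * h ** 2))  # Gaussian kernel weight under bandwidth h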
def test_nn_decent_with_parallel_backend():
    np.random.seed(42)
    N = 100
    D = 128
    chunk_size = N // 8
    n_neighbors = 25
    data = np.random.rand(N, D).astype(np.float32)

    nn_indices, nn_distances = NNDescent(
        data,
        n_neighbors=n_neighbors,
        max_candidates=max_candidates,
        n_iters=2,
        tree_init=False,
        seed_per_row=True,
    )._neighbor_graph
    with joblib.parallel_backend("threading"):
        nn_indices_threaded, nn_distances_threaded = NNDescent(
            data,
            n_neighbors=n_neighbors,
            max_candidates=max_candidates,
            n_iters=2,
            tree_init=False,
            seed_per_row=True,
        )._neighbor_graph

    assert_allclose(nn_indices_threaded, nn_indices)
    assert_allclose(nn_distances_threaded, nn_distances)
def test_tree_no_split(small_data, sparse_small_data, metric):
    k = 10
    for data, data_type in zip([small_data, sparse_small_data], ["dense", "sparse"]):
        n_instances = data.shape[0]
        leaf_size = n_instances + 1  # just to be safe
        data_train = data[n_instances // 2:]
        data_test = data[:n_instances // 2]

        nnd = NNDescent(
            data_train,
            metric=metric,
            n_neighbors=data_train.shape[0] - 1,
            random_state=None,
            tree_init=True,
            leaf_size=leaf_size,
        )
        nnd.prepare()
        knn_indices, _ = nnd.query(data_test, k=k, epsilon=0.2)

        true_nnd = NearestNeighbors(metric=metric).fit(data_train)
        true_indices = true_nnd.kneighbors(data_test, k, return_distance=False)

        num_correct = 0.0
        for i in range(true_indices.shape[0]):
            num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

        percent_correct = num_correct / (true_indices.shape[0] * k)
        assert percent_correct >= 0.95, (
            "NN-descent query did not get 95% accuracy "
            "on nearest neighbors on {} data".format(data_type)
        )
def _init_pynndescent(self, distances):
    from pynndescent import NNDescent

    self._use_pynndescent = True
    first_col = np.arange(distances.shape[0])[:, None]
    init_indices = np.hstack((first_col, np.stack(distances.tolil().rows)))
    self._nnd_idx = NNDescent(
        data=self._rep,
        metric=self._metric,
        metric_kwds=self._metric_kwds,
        n_neighbors=self._n_neighbors,
        init_graph=init_indices,
        random_state=self._neigh_random_state,
    )

    # temporary hack for the broken forest storage
    from pynndescent.rp_trees import make_forest

    current_random_state = check_random_state(self._nnd_idx.random_state)
    self._nnd_idx._rp_forest = make_forest(
        self._nnd_idx._raw_data,
        self._nnd_idx.n_neighbors,
        self._nnd_idx.n_search_trees,
        self._nnd_idx.leaf_size,
        self._nnd_idx.rng_state,
        current_random_state,
        self._nnd_idx.n_jobs,
        self._nnd_idx._angular_trees,
    )
def compute_tau(X, V, k=100, nbr_idx=None):
    if nbr_idx is None:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric="euclidean",
                n_neighbors=k,
                n_jobs=-1,
                random_state=19491001,
            )
            _, dists = nbrs.query(X, k=k)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=k, algorithm=alg, n_jobs=-1).fit(X)
            dists, _ = nbrs.kneighbors(X)
    else:
        dists = np.zeros(nbr_idx.shape)
        for i in range(nbr_idx.shape[0]):
            for j in range(nbr_idx.shape[1]):
                x = X[i]
                y = X[nbr_idx[i, j]]
                dists[i, j] = np.sqrt((x - y).dot(x - y))
    d = np.mean(dists[:, 1:], 1)
    v = np.linalg.norm(V, axis=1)
    tau = d / v
    return tau, v
def get_knn_graph(self, data):
    nn = NNDescent(
        data,
        metric="euclidean",
        n_jobs=self.n_jobs,
        random_state=self.random_state,
    )
    indices, distances = nn.query(data, k=self.n_neighbors + 1)
    knn = indices[:, 1:]  # column 0 is (usually) the point itself; drop it
    return knn
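# A standalone sketch of the same pattern outside the class (parameter values are
# illustrative; assumes pynndescent is installed):
import numpy as np
from pynndescent import NNDescent

rng = np.random.default_rng(42)
data = rng.random((1000, 16))
nn = NNDescent(data, metric="euclidean", n_jobs=-1, random_state=42)
indices, distances = nn.query(data, k=16)  # n_neighbors + 1, leaving room to drop self
knn = indices[:, 1:]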
def fit(self, X, W, y, verbose=0):
    """
    Fit a counterfactual estimation model given explanatory variables X,
    treatment variable W and target y.

    This method fits a forest-based model, extracts a supervised embedding
    from its leaves, and builds a nearest neighbor index on the embedding.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        Data with explanatory variables, with possible confounders of
        treatment assignment and effect.

    W : array-like, shape = [n_samples]
        Treatment variable. The model will try to estimate a counterfactual
        outcome for each unique value in this variable.
        Should not exceed 10 values.

    y : array-like, shape = [n_samples]
        Target variable.

    verbose : int, optional (default=0)
        Verbosity level.

    Returns
    -------
    self : object
    """
    # checking if W has too many unique values
    if len(np.unique(W)) > 10:
        raise ValueError('More than 10 unique values for W. '
                         'Too many unique values will make the process very expensive.')

    # fitting the model
    self.model.fit(X, y)

    # getting forest embedding from model
    self.train_embed_ = self._get_forest_embed(X)

    # create neighbor index
    self.nn_index = NNDescent(self.train_embed_, metric='hamming')

    # creating a df with treatment assignments and outcomes
    self.train_outcome_df = pd.DataFrame({'neighbor': range(X.shape[0]), 'y': y, 'W': W})

    # saving explanatory variables
    if self.save_explanatory:
        self.X_train = X.assign(W=W, y=y)

    return self
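# _get_forest_embed is not shown here. For an sklearn forest it is typically the
# matrix of leaf indices per tree, which is why 'hamming' is a natural metric:
# the distance between two samples is the fraction of trees that route them to
# different leaves. A sketch under that assumption (illustrative names and data):
import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(0)
X_toy, y_toy = rng.normal(size=(200, 5)), rng.normal(size=200)
forest = RandomForestRegressor(n_estimators=50, random_state=0).fit(X_toy, y_toy)
leaf_embed = forest.apply(X_toy)  # shape (n_samples, n_estimators): one leaf id per tree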
def fit(self, X, V, k, s=None, method="qp", eps=None, tol=1e-4):
    # pass index
    # the parameter k will be replaced by a connectivity matrix in the future.
    self.__reset__()

    # knn clustering
    if X.shape[0] > 200000 and X.shape[1] > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(
            X,
            metric="euclidean",
            n_neighbors=k,
            n_jobs=-1,
            random_state=19491001,
        )
        Idx, _ = nbrs.query(X, k=k)
    else:
        alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
        nbrs = NearestNeighbors(n_neighbors=k, algorithm=alg, n_jobs=-1).fit(X)
        _, Idx = nbrs.kneighbors(X)

    # compute transition prob.
    n = X.shape[0]
    self.P = np.zeros((n, n))
    if method == "kernel":
        inv_s = np.linalg.inv(s)
        # compute density kernel
        if eps is not None:
            self.Kd = np.zeros((n, n))
            inv_eps = 1 / eps
            for i in range(n):
                self.Kd[i, Idx[i]] = compute_density_kernel(X[i], X[Idx[i]], inv_eps)
            D = np.sum(self.Kd, 0)
    for i in range(n):
        y = X[i]
        v = V[i]
        if method == "qp":
            Y = X[Idx[i, 1:]]
            p = compute_markov_trans_prob(y, v, Y, s)
            p[p <= tol] = 0  # tolerance check
            self.P[Idx[i, 1:], i] = p
            self.P[i, i] = 1 - np.sum(p)
        else:
            Y = X[Idx[i]]
            # p = compute_kernel_trans_prob(y, v, Y, inv_s)
            kern = compute_drift_kernel(y, v, Y, inv_s)
            if eps is not None:
                kern /= D[Idx[i]]
            p = kern / np.sum(kern)
            p[p <= tol] = 0  # tolerance check
            p = p / np.sum(p)
            self.P[Idx[i], i] = p
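# Counterpart check to the continuous-time fit above (illustrative helper, not part
# of the class): here the diagonal is set to 1 - sum(p), so each column of the
# fitted P should be a probability distribution over states.
import numpy as np

def is_column_stochastic(P: np.ndarray, atol: float = 1e-8) -> bool:
    """True iff entries are non-negative and each column sums to one."""
    return bool(np.all(P >= -atol) and np.allclose(P.sum(axis=0), 1.0, atol=atol))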
def test_update_w_prepare_query_accuracy(nn_data, metric):
    nnd = NNDescent(
        nn_data[200:800],
        metric=metric,
        n_neighbors=10,
        random_state=None,
        compressed=False,
    )
    nnd.prepare()

    nnd.update(xs_fresh=nn_data[800:])
    nnd.prepare()

    knn_indices, _ = nnd.query(nn_data[:200], k=10, epsilon=0.2)

    true_nnd = NearestNeighbors(metric=metric).fit(nn_data[200:])
    true_indices = true_nnd.kneighbors(nn_data[:200], 10, return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert percent_correct >= 0.95, (
        "NN-descent query did not get 95% accuracy on nearest neighbors"
    )
def calculate_neighbours(genes, n_neighbours: int, inverse: bool, scale: str, log: bool,
                         description: str = '', return_neigh_sim: bool = False,
                         genes_query_data: pd.DataFrame = None, remove_self: bool = False):
    """
    Calculate neighbours of genes based on cosine distance.
    :param genes: Data frame as in class init; gene names (rows) should match the ones in init.
    :param n_neighbours: Number of neighbours to obtain for each gene. This will include self
        for non-inverse.
    :param inverse: Calculate most similar neighbours (False) or neighbours with inverse
        profile (True).
    :param scale: Scale expression by gene with 'minmax' (min=0, max=1) or 'mean0std1'
        (mean=0, std=1) or 'none'.
    :param log: Should expression data be log2(data+pseudocount) transformed before scaling.
    :param description: If an error occurs while making the KNN index, report this description
        with the error.
    :param return_neigh_sim: Return a tuple with the nearest neighbour matrix and similarity
        matrix data frames, as returned by pynndescent, but with the distance matrix converted
        to similarities and with gene names added for the index.
    :param genes_query_data: Use this as the query. If None, use genes.
    :param remove_self: Used only if return_neigh_sim is True. Whether to remove a sample from
        its closest neighbours or not. If return_neigh_sim is False this is done automatically.
        This also removes the last column of neighbours if self is not present - thus it should
        not be used with inverse, as self will not be present.
    :return: Dict with keys being gene pair name tuples (the alphabetically smaller name is the
        first tuple value) and values representing cosine similarity. Or see return_neigh_sim.
    """
    genes_index, genes_query = NeighbourCalculator.get_index_query(
        genes=genes, inverse=inverse, scale=scale, log=log,
        genes_query_data=genes_query_data)
    # Random state was not set during the analysis in the paper,
    # so the obtained results might differ slightly.
    try:
        index = NNDescent(genes_index, n_jobs=THREADS, metric='cosine', random_state=0)
    except ValueError:
        try:
            index = NNDescent(genes_index, tree_init=False, n_jobs=THREADS, random_state=0)
            warnings.warn(
                'Dataset ' + description + ' index computed without tree initialisation',
                Warning)
        except ValueError:
            raise ValueError('Dataset ' + description + ' can not be processed by pynndescent')
    neighbours, distances = index.query(genes_query.tolist(), k=n_neighbours)

    if genes_query_data is None:
        genes_query_data = genes
    if return_neigh_sim:
        neighbours = NeighbourCalculator.parse_neighbours_matrix(
            neighbours=neighbours, genes_query=genes_query_data, genes_idx=genes)
        similarities = pd.DataFrame(NeighbourCalculator.parse_distances_matrix(distances),
                                    index=genes_query_data.index)
        if remove_self:
            neighbours, similarities = NeighbourCalculator.remove_self_pynn_matrix(
                neighbours=neighbours, similarities=similarities)
        return neighbours, similarities
    else:
        return NeighbourCalculator.parse_neighbours(neighbours=neighbours,
                                                    distances=distances,
                                                    genes_query=genes_query_data,
                                                    genes_idx=genes)
def test_nn_descent():
    np.random.seed(42)
    N = 100
    # D = 128
    D = 4
    chunk_size = N // 8
    n_neighbors = 25
    data = np.random.rand(N, D).astype(np.float32)

    nn_indices, nn_distances = NNDescent(
        data,
        n_neighbors=n_neighbors,
        max_candidates=max_candidates,
        n_iters=1,
        random_state=42,
        delta=0,
        tree_init=False,
        seed_per_row=True,
    )._neighbor_graph
    for i in range(data.shape[0]):
        assert_equal(
            len(nn_indices[i]),
            len(np.unique(nn_indices[i])),
            "Duplicate graph_indices in unthreaded knn graph",
        )

    nn_indices_threaded, nn_distances_threaded = NNDescent(
        data,
        n_neighbors=n_neighbors,
        max_candidates=max_candidates,
        n_iters=1,
        random_state=42,
        delta=0,
        tree_init=False,
        seed_per_row=True,
        n_jobs=2,
    )._neighbor_graph
    for i in range(data.shape[0]):
        assert_equal(
            len(nn_indices_threaded[i]),
            len(np.unique(nn_indices_threaded[i])),
            "Duplicate graph_indices in threaded knn graph",
        )

    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute").fit(data)
    _, nn_gold_indices = nbrs.kneighbors(data)

    assert_allclose(nn_indices_threaded, nn_indices)
    assert_allclose(nn_distances_threaded, nn_distances)
def test_nn_descent_query_accuracy(nn_data):
    nnd = NNDescent(nn_data[200:], "euclidean", n_neighbors=10, random_state=None)
    knn_indices, _ = nnd.query(nn_data[:200], k=10, epsilon=0.2)

    tree = KDTree(nn_data[200:])
    true_indices = tree.query(nn_data[:200], 10, return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert percent_correct >= 0.95, (
        "NN-descent query did not get 95% accuracy on nearest neighbors"
    )
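# The accuracy tests here and below all compute the same recall statistic by hand;
# a small helper (hypothetical, not part of the test suite) makes the pattern explicit:
import numpy as np

def knn_recall(true_indices: np.ndarray, approx_indices: np.ndarray) -> float:
    """Fraction of true k-NN that also appear in the approximate neighbour lists."""
    hits = sum(
        np.sum(np.in1d(true_indices[i], approx_indices[i]))
        for i in range(true_indices.shape[0])
    )
    return hits / true_indices.size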
class NNDescent(KNNIndex):
    # TODO: Make mapping from sklearn metrics to lib metrics

    def build(self, data):
        self.index = LibNNDescent(data, metric=self.metric, n_neighbors=5)

    def query_train(self, data, k):
        search_neighbors = min(data.shape[0] - 1, k + 1)
        neighbors, distances = self.index.query(data, k=search_neighbors, queue_size=1)
        return neighbors[:, 1:], distances[:, 1:]

    def query(self, query, k):
        return self.index.query(query, k=k, queue_size=1)
def test_nn_descent_query_accuracy_angular(nn_data):
    nnd = NNDescent(nn_data[200:], "cosine", n_neighbors=30, random_state=None)
    knn_indices, _ = nnd.query(nn_data[:200], k=10, epsilon=0.32)

    nn = NearestNeighbors(metric="cosine").fit(nn_data[200:])
    true_indices = nn.kneighbors(nn_data[:200], n_neighbors=10, return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert percent_correct >= 0.95, (
        "NN-descent query did not get 95% accuracy on nearest neighbors"
    )
def py_nearest_neighbors(dataset, K, metric, repetition):
    # (try to) enforce pynndescent using only a single thread;
    # a point is its own NN, which is why we query K+1 and remove it afterwards
    runtime = np.zeros(repetition + 1)
    nn_list = []
    for i in range(repetition + 1):
        start = time.perf_counter()
        index = NNDescent(dataset.X,
                          n_neighbors=K + 1,
                          # verbose=True,
                          tree_init=False,  # skip the fancy tree initialisation
                          n_jobs=1)
        elapsed = time.perf_counter() - start
        runtime[i] = elapsed
        nn_arr = index._neighbor_graph[0]
        assert (nn_arr[:, 0] == np.array(range(dataset.N))).all()
        nn_list.append(NearestNeighbors(nn_arr[:, 1:], metric))
    # skip the first repetition, since the numba JIT does a lot of work then
    return nn_list[1:], Timingdata(None, runtime[1:], "pynndescent")
def test_deduplicated_data_behaves_normally(seed, cosine_hang_data):
    data = np.unique(cosine_hang_data, axis=0)
    data = data[~np.all(data == 0, axis=1)]
    data = data[:1000]

    n_neighbors = 10
    knn_indices, _ = NNDescent(
        data,
        "cosine",
        {},
        n_neighbors,
        random_state=np.random.RandomState(seed),
        n_trees=20,
    )._neighbor_graph

    for i in range(data.shape[0]):
        assert len(knn_indices[i]) == len(
            np.unique(knn_indices[i])
        ), "Duplicate graph_indices in knn graph"

    angular_data = normalize(data, norm="l2")
    tree = KDTree(angular_data)
    true_indices = tree.query(angular_data, n_neighbors, return_distance=False)

    num_correct = 0
    for i in range(data.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

    proportion_correct = num_correct / (data.shape[0] * n_neighbors)
    assert (
        proportion_correct >= 0.95
    ), "NN-descent did not get 95% accuracy on nearest neighbors"
def test_deduplicated_data_behaves_normally():
    this_dir = os.path.dirname(os.path.abspath(__file__))
    data_path = os.path.join(this_dir, "test_data/cosine_hang.npy")
    data = np.unique(np.load(data_path), axis=0)
    data = data[~np.all(data == 0, axis=1)]
    data = data[:1000]

    n_neighbors = 10
    knn_indices, _ = NNDescent(
        data, "cosine", {}, n_neighbors, random_state=np.random, n_trees=20
    )._neighbor_graph

    for i in range(data.shape[0]):
        assert_equal(
            len(knn_indices[i]),
            len(np.unique(knn_indices[i])),
            "Duplicate graph_indices in knn graph",
        )

    angular_data = normalize(data, norm="l2")
    tree = KDTree(angular_data)
    true_indices = tree.query(angular_data, n_neighbors, return_distance=False)

    num_correct = 0
    for i in range(data.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

    proportion_correct = num_correct / (data.shape[0] * n_neighbors)
    assert_greater_equal(
        proportion_correct,
        0.95,
        "NN-descent did not get 95% accuracy on nearest neighbors",
    )
def trn(X, n, return_index=True, seed=19491001, **kwargs):
    trnet = TRNET(n, X, seed)
    trnet.run(**kwargs)
    if not return_index:
        return trnet.W
    else:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X, metric="euclidean", n_neighbors=1, n_jobs=-1, random_state=seed
            )
            idx, _ = nbrs.query(trnet.W, k=1)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=1, algorithm=alg, n_jobs=-1).fit(X)
            _, idx = nbrs.kneighbors(trnet.W)
        return idx[:, 0]
def test_tree_numbers_after_multiple_updates(n_trees):
    trees_after_update = max(1, int(np.round(n_trees / 3)))

    nnd = NNDescent(np.array([[1.0]]), n_neighbors=1, n_trees=n_trees)
    assert nnd.n_trees == n_trees, "NN-descent update changed the number of trees"
    assert (
        nnd.n_trees_after_update == trees_after_update
    ), "The value of the n_trees_after_update in NN-descent after update(s) is wrong"

    for i in range(5):
        nnd.update(xs_fresh=np.array([[i]], dtype=np.float64))
        assert (
            nnd.n_trees == trees_after_update
        ), "The value of the n_trees in NN-descent after update(s) is wrong"
        assert (
            nnd.n_trees_after_update == trees_after_update
        ), "The value of the n_trees_after_update in NN-descent after update(s) is wrong"
def test_joblib_dump():
    seed = np.random.RandomState(42)

    x1 = seed.normal(0, 100, (1000, 50))
    x2 = seed.normal(0, 100, (1000, 50))

    index1 = NNDescent(x1, "euclidean", {}, 10, random_state=None)
    neighbors1, distances1 = index1.query(x2)

    mem_temp = io.BytesIO()
    joblib.dump(index1, mem_temp)
    mem_temp.seek(0)
    index2 = joblib.load(mem_temp)

    neighbors2, distances2 = index2.query(x2)

    np.testing.assert_equal(neighbors1, neighbors2)
    np.testing.assert_equal(distances1, distances2)
def __init__(self, data, n_components=30, normalize=False):
    self.data = data
    if self.data.shape[1] > n_components:
        from sklearn.decomposition import PCA
        from sklearn.preprocessing import StandardScaler

        data_std = StandardScaler().fit_transform(self.data)
        self.pca = PCA(n_components).fit_transform(data_std)
    else:
        self.pca = np.array(data)
    if normalize:
        # from sklearn.preprocessing import MaxAbsScaler
        # self.pca = MaxAbsScaler().fit_transform(self.pca)
        raise NotImplementedError

    from pynndescent import NNDescent

    self.ann_index = NNDescent(self.pca)
def test_sparse_nn_descent_query_accuracy():
    nnd = NNDescent(
        sparse_nn_data[200:], "euclidean", n_neighbors=10, random_state=None
    )
    knn_indices, _ = nnd.query(sparse_nn_data[:200], k=10)

    tree = KDTree(sparse_nn_data[200:].toarray())
    true_indices = tree.query(sparse_nn_data[:200].toarray(), 10, return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert_greater_equal(
        percent_correct,
        0.95,
        "Sparse NN-descent query did not get 95% accuracy on nearest neighbors",
    )
def p_ij_sym(x, perp, verbose=False):
    num_pts = x.shape[0]
    k = min(num_pts - 1, int(3 * perp))
    if verbose:
        print('Indexing')
    index = NNDescent(x)
    neighbors = np.empty((num_pts, k - 1), dtype=np.int64)
    p_ij = np.empty((num_pts, k - 1))
    for i, xi in enumerate(x):
        if verbose:
            print('Calculating probabilities: {cur}/{tot}'.format(
                cur=i + 1, tot=num_pts), end='\r')
        nn, dists = index.query([xi], k)
        beta = find_beta(dists[0, 1:], perp)
        neighbors[i] = nn[0, 1:]
        p_ij[i] = p_i(dists[0, 1:], beta)
    row_indices = np.repeat(np.arange(num_pts), k - 1)
    p = csr_matrix((p_ij.ravel(), (row_indices, neighbors.ravel())))
    return 0.5 * (p + p.transpose())
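# find_beta and p_i are referenced above but not defined here. A minimal sketch of
# the standard t-SNE-style perplexity calibration they appear to implement (an
# assumption, not the author's code): binary-search the precision beta so the
# conditional distribution over a point's neighbours has perplexity `perp`.
import numpy as np

def p_i(dists, beta):
    """Conditional probabilities from neighbour distances with precision beta."""
    p = np.exp(-beta * dists ** 2)
    return p / np.sum(p)

def find_beta(dists, perp, tol=1e-5, max_iter=50):
    """Binary search for beta such that p_i(dists, beta) has perplexity perp."""
    lo, hi, beta = 0.0, np.inf, 1.0
    target = np.log(perp)  # target Shannon entropy
    for _ in range(max_iter):
        p = p_i(dists, beta)
        entropy = -np.sum(p * np.log(np.maximum(p, 1e-12)))
        if abs(entropy - target) < tol:
            break
        if entropy > target:  # too flat: raise precision to sharpen
            lo = beta
            beta = beta * 2 if hi == np.inf else (beta + hi) / 2
        else:                 # too peaked: lower precision to flatten
            hi = beta
            beta = (lo + beta) / 2
    return beta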
def test_nn_decent_with_n_jobs_minus_one():
    nn_indices, nn_distances = NNDescent(
        data,
        n_neighbors=n_neighbors,
        max_candidates=max_candidates,
        n_iters=2,
        delta=0,
        tree_init=False,
        seed_per_row=True,
    )._neighbor_graph
    for i in range(data.shape[0]):
        assert_equal(
            len(nn_indices[i]),
            len(np.unique(nn_indices[i])),
            "Duplicate indices in unthreaded knn graph",
        )

    nn_indices_threaded, nn_distances_threaded = NNDescent(
        data,
        n_neighbors=n_neighbors,
        max_candidates=max_candidates,
        n_iters=2,
        delta=0,
        tree_init=False,
        seed_per_row=True,
        n_jobs=-1,
    )._neighbor_graph
    for i in range(data.shape[0]):
        assert_equal(
            len(nn_indices_threaded[i]),
            len(np.unique(nn_indices_threaded[i])),
            "Duplicate indices in threaded knn graph",
        )

    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute").fit(data)
    _, nn_gold_indices = nbrs.kneighbors(data)

    assert_allclose(nn_indices_threaded, nn_indices)
    assert_allclose(nn_distances_threaded, nn_distances)