def test_update_w_prepare_query_accuracy(nn_data, metric):
    """Check query recall after growing an already-prepared index.

    The index is built on rows 200-799 of ``nn_data`` and prepared, then
    extended with rows 800 onward through ``update(xs_fresh=...)`` and
    prepared again. Rows 0-199 serve as queries; the approximate neighbours
    are compared against the exact ones from ``NearestNeighbors`` fitted on
    rows 200 onward, and at least 95% of them must be recovered.
    """
    k = 10
    queries = nn_data[:200]

    nnd = NNDescent(
        nn_data[200:800],
        metric=metric,
        n_neighbors=10,
        random_state=None,
        compressed=False,
    )
    nnd.prepare()

    # Add the remaining points after the first prepare(), then prepare again
    # so that the query below runs against the updated index.
    nnd.update(xs_fresh=nn_data[800:])
    nnd.prepare()

    knn_indices, _ = nnd.query(queries, k=k, epsilon=0.2)

    true_nnd = NearestNeighbors(metric=metric).fit(nn_data[200:])
    true_indices = true_nnd.kneighbors(queries, k, return_distance=False)

    # np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
    num_correct = sum(
        np.sum(np.isin(expected, found))
        for expected, found in zip(true_indices, knn_indices)
    )

    percent_correct = num_correct / (true_indices.shape[0] * k)
    assert percent_correct >= 0.95, (
        "NN-descent query did not get 95% accuracy on nearest neighbors"
    )
def test_tree_no_split(small_data, sparse_small_data, metric):
    """Check query recall when ``leaf_size`` exceeds the number of points.

    Runs once on the dense fixture and once on the sparse fixture. For each,
    the second half of the rows is indexed with ``tree_init=True`` and a
    ``leaf_size`` larger than the whole data set, the first half is used as
    queries, and the approximate neighbours must recover at least 95% of the
    exact neighbours found by ``NearestNeighbors``.
    """
    k = 10
    cases = (("dense", small_data), ("sparse", sparse_small_data))
    for data_type, data in cases:
        n_instances = data.shape[0]
        leaf_size = n_instances + 1  # just to be safe
        half = n_instances // 2
        data_train = data[half:]
        data_test = data[:half]

        nnd = NNDescent(
            data_train,
            metric=metric,
            n_neighbors=data_train.shape[0] - 1,
            random_state=None,
            tree_init=True,
            leaf_size=leaf_size,
        )
        nnd.prepare()
        knn_indices, _ = nnd.query(data_test, k=k, epsilon=0.2)

        true_nnd = NearestNeighbors(metric=metric).fit(data_train)
        true_indices = true_nnd.kneighbors(data_test, k, return_distance=False)

        # np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
        num_correct = sum(
            np.sum(np.isin(expected, found))
            for expected, found in zip(true_indices, knn_indices)
        )

        percent_correct = num_correct / (true_indices.shape[0] * k)
        assert (
            percent_correct >= 0.95
        ), "NN-descent query did not get 95% for accuracy on nearest neighbors on {} data".format(
            data_type
        )
def create(
    cls,
    index_vectors: np.ndarray,
    metric: Metric = DEFAULT_METRIC,
    epsilon: float = DEFAULT_EPSILON,
    neighbors: int = DEFAULT_NEIGHBORS,
    diversify_probability: float = DEFAULT_DIVERSIFY_PROBABILITY,
    pruning_degree_multiplier: float = DEFAULT_PRUNING_DEGREE_MULTIPLIER,
) -> "Descent":
    """Build and prepare an ``NNDescent`` index and wrap it in a ``Descent``.

    Args:
        index_vectors: Data handed to ``NNDescent`` as ``data``.
        metric: Key into ``_METRIC_NAMES``; the looked-up value is passed to
            ``NNDescent`` as ``metric``.
        epsilon: Stored on the returned ``Descent``; it is not passed to
            ``NNDescent``.
        neighbors: Passed to ``NNDescent`` as ``n_neighbors``.
        diversify_probability: Passed to ``NNDescent`` as ``diversify_prob``.
        pruning_degree_multiplier: Passed to ``NNDescent`` under the same
            name.

    Returns:
        A ``Descent`` holding the prepared index and ``epsilon``.
    """
    build_options = {
        "data": index_vectors,
        "metric": _METRIC_NAMES[metric],
        "n_neighbors": neighbors,
        "diversify_prob": diversify_probability,
        "pruning_degree_multiplier": pruning_degree_multiplier,
    }
    prepared = NNDescent(**build_options)
    # prepare() is called before the index is handed to Descent.
    prepared.prepare()
    return Descent(index=prepared, epsilon=epsilon)
def test_generate_triplets(self):
    """Check the triplets produced by ``trimap.generate_triplets``.

    For random normal inputs, every generated triplet must have a scaled
    squared distance between its first two points that is smaller than the
    one between its first and third points, and the number of triplets must
    equal ``n_points * n_inliers * n_outliers + n_points * n_random``.
    """
    rng_key = random.PRNGKey(42)
    n_points = 1000
    n_inliers = 10
    n_outliers = 5
    n_random = 3
    n_extra = min(n_inliers + 50, n_points)

    # Currently testing it only for 'euclidean' distance. The test for other
    # cases breaks due to issues with the knn search NNDescent package, but
    # it works fine when tested in a colab.
    for distance in ['euclidean']:
        points = np.random.normal(size=(n_points, 100))

        knn_index = NNDescent(points, metric=distance)
        knn_index.prepare()
        found = knn_index.query(points, n_extra)[0]
        # Put each point's own index in front of its queried neighbours.
        own_ids = np.arange(n_points).reshape([-1, 1])
        neighbors = np.concatenate((own_ids, found), axis=1)

        distance_fn = trimap.get_distance_fn(distance)
        _, _, sig = trimap.find_scaled_neighbors(points, neighbors, distance_fn)
        triplets, _ = trimap.generate_triplets(
            rng_key,
            points,
            n_inliers=n_inliers,
            n_outliers=n_outliers,
            n_random=n_random,
            distance=distance,
        )

        anchors = triplets[:, 0]

        def scaled_sq_distance(others):
            """Squared distance anchor->other, divided by both sig values."""
            sq_dist = distance_fn(points[anchors], points[others]) ** 2
            sq_dist /= (sig[anchors] * sig[others])
            return sq_dist

        similar_pairs_distances = scaled_sq_distance(triplets[:, 1])
        outlier_pairs_distances = scaled_sq_distance(triplets[:, 2])
        npt.assert_array_less(similar_pairs_distances, outlier_pairs_distances)

        n_knn_triplets = points.shape[0] * n_inliers * n_outliers
        n_random_triplets = points.shape[0] * n_random
        expected_shape = [n_knn_triplets + n_random_triplets, 3]
        npt.assert_equal(triplets.shape, expected_shape)
def test_one_dimensional_data(nn_data, metric):
    """Check query recall when the data has a single feature column.

    Only the first column of ``nn_data`` is used. Rows 200 onward are indexed
    with ``tree_init=False``, rows 0-199 are the queries, and the approximate
    neighbours must recover at least 95% of the exact neighbours found by
    ``NearestNeighbors`` on the same one-column data.
    """
    k = 10
    train = nn_data[200:, :1]
    queries = nn_data[:200, :1]

    nnd = NNDescent(
        train,
        metric=metric,
        n_neighbors=20,
        random_state=None,
        tree_init=False,
    )
    nnd.prepare()
    knn_indices, _ = nnd.query(queries, k=k, epsilon=0.2)

    true_nnd = NearestNeighbors(metric=metric).fit(train)
    true_indices = true_nnd.kneighbors(queries, k, return_distance=False)

    # np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
    num_correct = sum(
        np.sum(np.isin(expected, found))
        for expected, found in zip(true_indices, knn_indices)
    )

    percent_correct = num_correct / (true_indices.shape[0] * k)
    assert percent_correct >= 0.95, (
        "NN-descent query did not get 95% accuracy on nearest neighbors"
    )
k, return_distance=False) p_correct = evaluate_predictions(neighbors_expected, neighbors, k) assert p_correct >= 0.95, ("NN-descent query did not get 95% " "accuracy on nearest neighbors") k = 10 xs_orig, xs_fresh, xs_updated, indices_updated = update_data[case] queries1 = xs_orig # original index = NNDescent(xs_orig, metric=metric, n_neighbors=40, random_state=1234) index.prepare() evaluate(index, xs_orig, queries1) # updated index.update(xs_fresh=xs_fresh, xs_updated=xs_updated, updated_indices=indices_updated) if xs_fresh is not None: xs = np.vstack((xs_orig, xs_fresh)) queries2 = np.vstack((queries1, xs_fresh)) else: xs = xs_orig queries2 = queries1 if indices_updated is not None: xs[indices_updated] = xs_updated evaluate(index, xs, queries2) if indices_updated is not None:
def __call__(self, fwd_vecs, rev_vecs, initclusters, fwd_vecs2=None):
    """Find approximate nearest neighbours by cosine similarity.

    An ``NNDescent`` index with ``metric="cosine"`` is built over
    ``fwd_vecs2`` (or over ``fwd_vecs`` when ``fwd_vecs2`` is None) and
    queried with ``fwd_vecs`` and, if given, ``rev_vecs``.

    Args:
        fwd_vecs: Sparse matrix of query vectors; normalised with
            ``magnitude_norm_sparsemat`` before use.
        rev_vecs: Optional second sparse matrix of query vectors (one row
            per row of ``fwd_vecs``), or None. When given, each row's result
            is the best ``n_neighbors`` distinct neighbours over both the
            fwd and the rev query.
        initclusters: Must be None; not supported by this implementation.
        fwd_vecs2: Optional sparse matrix to index instead of ``fwd_vecs``
            (for when something other than self-similarity is wanted).

    Returns:
        ``(sims, neighbors)``. Without ``rev_vecs`` these are
        ``1.0 - distances`` and the index array from the fwd query. With
        ``rev_vecs`` they are lists with one array per query row: the
        similarities in descending order and the matching integer
        neighbour indices.

    Raises:
        AssertionError: If ``initclusters`` is not None.
    """
    from pynndescent import NNDescent

    assert initclusters is None, (
        "Currently I haven't built support"
        + " for initclusters; use SparseNumpyCosineSimFromFwdAndRevOneDVecs"
        + " instead")

    def log(message):
        # Timestamped progress output, flushed immediately.
        if (self.verbose):
            print(datetime.now(), message)
            sys.stdout.flush()

    # fwd_vecs2 is used when you don't just want to compute
    # self-similarities. Normalize all the vectors.
    fwd_vecs = magnitude_norm_sparsemat(sparse_mat=fwd_vecs)
    if (rev_vecs is not None):
        rev_vecs = magnitude_norm_sparsemat(sparse_mat=rev_vecs)
    if (fwd_vecs2 is None):
        fwd_vecs2 = fwd_vecs
    else:
        fwd_vecs2 = magnitude_norm_sparsemat(sparse_mat=fwd_vecs2)

    # Build the index.
    log("Building the index")
    index = NNDescent(fwd_vecs2, metric="cosine")
    log("Preparing the index")
    index.prepare()
    log("Index ready")

    log("Querying neighbors for fwd")
    fwd_neighbs, fwd_dists = index.query(fwd_vecs, k=self.n_neighbors)

    if (rev_vecs is None):
        # Need to subtract from 1 because pynndescent returns 1 - cosinesim.
        return 1.0 - fwd_dists, fwd_neighbs

    log("Querying neighbors for rev")
    # Query with rev_vecs: the original queried fwd_vecs a second time, so
    # the rev vectors never contributed to the result.
    rev_neighbs, rev_dists = index.query(rev_vecs, k=self.n_neighbors)

    log("Unifying fwd and rev")
    fwdrev_neighbs = np.concatenate([fwd_neighbs, rev_neighbs], axis=1)
    fwdrev_dists = np.concatenate([fwd_dists, rev_dists], axis=1)
    fwdrev_dists_argsort = np.argsort(fwdrev_dists, axis=1)

    # Need to remove redundancy: a point can be a neighbour according to
    # both the fwd and the rev search.
    sims = []
    neighbors = []
    for i, order in enumerate(fwdrev_dists_argsort):
        sims_this_ex = []
        neighbors_this_ex = []
        neighbors_seen = set()
        # Iterate in order of ascending distance, so the first
        # n_neighbors distinct hits are the nearest neighbours.
        for j in order:
            neighbor = fwdrev_neighbs[i][j]
            if neighbor in neighbors_seen:
                continue
            neighbors_seen.add(neighbor)
            neighbors_this_ex.append(neighbor)
            # Need to subtract from 1 because pynndescent returns
            # 1 - cosinesim.
            sims_this_ex.append(1 - fwdrev_dists[i][j])
            if (len(sims_this_ex) == self.n_neighbors):
                break
        assert len(neighbors_seen) == self.n_neighbors
        sims.append(np.array(sims_this_ex))
        # Neighbors need to be converted to integers as they'll be used
        # later for indexing.
        neighbors.append(np.array(neighbors_this_ex).astype("int"))
    return sims, neighbors
def compute_similarity_graph(self, X, knn=15, sigma=3., zp_k=None,
                             metric='euclidean', maxN=5000):
    """Build a symmetric sparse k-nearest-neighbour similarity graph.

    Args:
        X: Data matrix with one row per point (``N = X.shape[0]``).
        knn: Number of neighbours per point. If None, ``N`` is used when
            ``N < maxN`` and 15 otherwise.
        sigma: Divisor in the kernel ``exp(-d**2 / sigma)`` used for every
            metric except ``'cosine'``.
        zp_k: If not None and ``metric == 'euclidean'``, distances are
            rescaled Zelnik-Perona style by ``sqrt(d_i * d_j)``, where
            ``d_i`` is column ``zp_k`` of point i's neighbour distances
            (clamped below at 1e-4).
        metric: Distance metric name passed to the neighbour search. For
            ``'cosine'`` the edge weight is ``1 - distance``.
        maxN: Data sets with fewer than ``maxN`` rows use sklearn
            ``NearestNeighbors``; larger ones use ``pynndescent``.

    Returns:
        An ``(N, N)`` scipy sparse matrix ``(W + W.T) / 2`` with its
        diagonal set to zero and explicit zeros removed.
    """
    N = X.shape[0]
    if knn is None:
        if N < maxN:
            knn = N
        else:
            print(
                "Parameter knn was given None and N > maxN, so setting knn=15"
            )
            knn = 15

    if N < maxN:
        print("Calculating NN graph with SKLEARN NearestNeighbors...")
        if knn > N / 2:
            algorithm = 'brute'
        elif metric == 'euclidean':
            algorithm = 'ball_tree'
        else:
            # Let sklearn choose an algorithm that supports the metric.
            algorithm = 'auto'
        # Pass the metric through: the original always searched with
        # sklearn's default metric, so e.g. 'cosine' weights were computed
        # from Euclidean distances.
        nn = NearestNeighbors(n_neighbors=knn, algorithm=algorithm,
                              metric=metric).fit(X)
        # sklearn returns (distances, indices).
        A_data, A_ind = nn.kneighbors(X, knn, return_distance=True)
    else:
        print(
            "Calculating NN graph with NNDescent package since N = {} > {}"
            .format(N, maxN))
        from pynndescent import NNDescent
        index = NNDescent(X, metric=metric)
        index.prepare()
        # NNDescent returns (indices, distances).
        A_ind, A_data = index.query(X, k=knn)

    # Modified from the kneighbors_graph function from sklearn to
    # accommodate Zelnik-Perona scaling. Every row holds exactly knn
    # entries, so the CSR row pointer is a simple arithmetic progression.
    n_nonzero = N * knn
    A_indptr = np.arange(0, n_nonzero + 1, knn)

    if zp_k is not None and metric == 'euclidean':
        # Clamp on a copy: the original clamped through a view and thereby
        # also overwrote column zp_k of A_data.
        k_dist = np.maximum(A_data[:, zp_k], 1e-4)[:, np.newaxis]
        A_data /= np.sqrt(k_dist * k_dist[A_ind, 0])

    A_data = np.ravel(A_data)
    if metric == 'cosine':
        print(np.max(A_data))
        # The neighbour search returns cosine DISTANCE
        # (1. - cosine_similarity), so convert back to similarity.
        weights = 1. - A_data
    else:
        weights = np.exp(-(A_data**2) / sigma)

    W = sps.csr_matrix((weights, A_ind.ravel(), A_indptr), shape=(N, N))
    # Symmetrise, then drop self-loops.
    W = (W + W.T) / 2
    W.setdiag(0)
    W.eliminate_zeros()
    return W