def test_run2():
    """Grow an index batch by batch and print search results along the way.

    After every batch the same query points are searched twice, once with
    ``checks=1`` and once with ``checks=100``, and the ids and distances of
    both searches are printed, followed by the current index size.

    Notes
    -----
    checks: number of nodes to check (?) -- carried over from the original
    note; not confirmed by this code.
    """
    total, neighbors, batch, n_queries = 100, 3, 10, 1

    data = random_vectors(total)
    queries = random_vectors(n_queries)
    index = Index(data, w=(0.5, 0.5))

    for _ in range(total // batch):
        print(index.run(batch))

        shallow_ids, shallow_dists = index.knn_search_points(
            queries, neighbors, checks=1)
        deep_ids, deep_dists = index.knn_search_points(
            queries, neighbors, checks=100)

        reports = (
            ("1: ", shallow_ids, shallow_dists),
            ("3: ", deep_ids, deep_dists),
        )
        for label, ids, dists in reports:
            print(label, ids)
            print(label, dists)

        print(index.size())
def test_openmp_obj(self):
    """Time a batch query with 1 and 4 cores on a ``PseudoArray``-backed index.

    The index is built from a ``PseudoArray`` wrapper, so it is expected
    not to take the pyarray path. Both timings are printed in milliseconds;
    nothing is asserted about their relative speed.
    """
    n_points = 10000  # must be large enough
    raw = random_vectors(n_points, dtype=np.float64)
    wrapped = PseudoArray(raw)

    index = Index(wrapped)
    self.assertFalse(index.is_using_pyarray)

    # Points must be added before the index can be queried.
    index.add_points(wrapped.shape[0])

    queries = np.asarray(raw, dtype=np.float32)

    # Warm-up queries so caches are ready before timing.
    for _ in range(5):
        _ids, _dists = index.knn_search_points(queries, 10)

    elapsed = {}
    for cores in (1, 4):
        began = time.time()
        _ids, _dists = index.knn_search_points(queries, 10, cores=cores)
        elapsed[cores] = time.time() - began

    print("single thread: {:.2f} ms".format(elapsed[1] * 1000))
    print("4 threads: {:.2f} ms".format(elapsed[4] * 1000))
def test_large_k(self):
    """Asking for more neighbours than indexed points must raise ValueError."""
    data = random_vectors()
    query = random_vectors(1)
    too_many = data.shape[0] + 1  # one more than the number of vectors

    index = Index(data)
    self.assertTrue(index.is_using_pyarray)
    index.add_points(data.shape[0])

    # Both the by-id search and the by-point search must reject the request.
    self.assertRaises(ValueError, index.knn_search, 0, too_many)
    self.assertRaises(ValueError, index.knn_search_points, query, too_many)
class KNNKernelDensity:
    """Kernel density scores computed from k-nearest-neighbour distances.

    Wraps an ``Index`` built over ``X``. Unless ``online`` is truthy, every
    row of ``X`` is added to the index at construction time.
    """

    # Normalisation constant of the Gaussian kernel, sqrt(2 * pi).
    SQRT2PI = np.sqrt(2 * np.pi)

    def __init__(self, X: np.ndarray[Any, Any], online: Optional[bool] = False):
        """Build the index over ``X``; add all points unless ``online``."""
        self.X = X
        self.index = Index(X)
        if online:
            # Online mode: points are fed later through run()/run_ids().
            return
        self.index.add_points(len(X))

    def run(self, ops: Any) -> Any:
        """Forward ``ops`` to ``Index.run`` and return its result."""
        return self.index.run(ops)

    def run_ids(self, ids: Iterable[int]) -> Any:
        """Forward ``ids`` to ``Index.run_ids`` and return its result."""
        return self.index.run_ids(ids)

    def score_samples(
        self, X: np.ndarray[Any, Any], k: int = 10, bandwidth: float = 0.2
    ) -> float:
        """Return, per query row, the summed Gaussian kernel value divided by ``k``.

        The ``k`` nearest-neighbour distances of every row of ``X`` are
        fetched from the index; the returned ids are ignored.
        """
        _ids, neighbor_dists = self.index.knn_search_points(X, k=k)
        return self._gaussian_score(neighbor_dists, bandwidth) / k

    def _gaussian_score(self, dists: float, bandwidth: float) -> float:
        """Sum Gaussian kernel values of ``dists`` along axis 1."""
        exponent = -0.5 * (dists / bandwidth) ** 2
        kernel = np.exp(exponent) / bandwidth / self.SQRT2PI
        return kernel.sum(axis=1)  # type: ignore
class KNNKernelDensity():
    """Gaussian kernel density scores over k-nearest-neighbour distances.

    An ``Index`` is built over ``X``; in offline mode (``online`` falsy) all
    rows of ``X`` are added to it right away.
    """

    # sqrt(2 * pi), the Gaussian kernel's normalisation constant.
    SQRT2PI = np.sqrt(2 * np.pi)

    def __init__(self, X, online=False):
        """Create the index and, when offline, add every point to it."""
        self.X = X
        self.index = Index(X)
        offline = not online
        if offline:
            self.index.add_points(len(X))

    def run(self, ops):
        """Delegate to ``Index.run`` and return whatever it returns."""
        return self.index.run(ops)

    def run_ids(self, ids):
        """Delegate to ``Index.run_ids`` and return whatever it returns."""
        return self.index.run_ids(ids)

    def score_samples(self, X, k=10, bandwidth=0.2):
        """Score each row of ``X`` by its mean Gaussian kernel value.

        The distances to the ``k`` nearest neighbours come from the index
        (ids are discarded); the summed kernel value is divided by ``k``.
        """
        _ids, knn_dists = self.index.knn_search_points(X, k=k)
        summed = self._gaussian_score(knn_dists, bandwidth)
        return summed / k

    def _gaussian_score(self, dists, bandwidth):
        """Return the Gaussian kernel of ``dists`` summed along axis 1."""
        z = dists / bandwidth
        density = np.exp(-0.5 * z ** 2) / bandwidth / self.SQRT2PI
        return density.sum(axis=1)
def test_updates_after_all_points_added(self):
    """Once every point is indexed, ``run`` must only report index updates.

    Each ``run(ops)`` call is expected to report 0 added points and
    exactly ``ops`` index updates.
    """
    np.random.seed(10)

    n_points = 10000
    weights = (0.5, 0.5)
    data = random_vectors(n_points)
    batch = 1000

    index = Index(data, w=weights)
    self.assertTrue(index.is_using_pyarray)
    index.add_points(n_points)  # add all points

    # Issue many queries first ("accumulate losses", per the original note).
    for _ in range(1000):
        index.knn_search_points(random_vectors(100), 10)

    for _ in range(10):
        outcome = index.run(batch)
        self.assertEqual(outcome['addPointResult'], 0)
        self.assertEqual(outcome['updateIndexResult'], batch)
def test_incremental_run2(self):
    """Compare searches with 100 and 1000 checks after each incremental run."""
    total, neighbors, batch, n_queries = 1000, 20, 100, 30

    data = random_vectors(total)
    queries = random_vectors(n_queries)

    index = Index(data)
    self.assertTrue(index.is_using_pyarray)

    for _ in range(total // batch):
        index.run(batch)

        _ids_few, dists_few = index.knn_search_points(
            queries, neighbors, checks=100)
        _ids_many, dists_many = index.knn_search_points(
            queries, neighbors, checks=1000)

        # Original rationale: this always holds because the later search
        # checks a larger number of nodes and the search is deterministic,
        # so every distance from the smaller budget is >= its counterpart.
        not_better = np.sum(dists_few >= dists_many)
        self.assertEqual(not_better, n_queries * neighbors)
def test_openmp(self):
    """Time the same batch query with 1 and 4 cores and print both timings.

    Nothing is asserted about the timings themselves; they are printed in
    milliseconds for manual inspection.
    """
    n_points = 10000  # must be large enough
    data = random_vectors(n_points)

    index = Index(data)
    self.assertTrue(index.is_using_pyarray)

    # Points must be added before the index can be queried.
    index.add_points(data.shape[0])

    # Warm-up queries so caches are ready before timing.
    for _ in range(5):
        _ids, _dists = index.knn_search_points(data, 10)

    elapsed = {}
    for cores in (1, 4):
        began = time.time()
        _ids, _dists = index.knn_search_points(data, 10, cores=cores)
        elapsed[cores] = time.time() - began

    print("single thread: {:.2f} ms".format(elapsed[1] * 1000))
    print("4 threads: {:.2f} ms".format(elapsed[4] * 1000))
def test_knn_search_points():
    """Print ``Index.knn_search_points`` results for one point and for all points.

    Notes on ``knn_search_points`` (carried over from the original author;
    not verified by this function):

    Given data (a 2-D array), it returns indexes and distances in ascending
    order, including the query point itself.

    Parameters
    ----------
    points: 2-D array of target points (any 2-D array is possible),
            e.g. [[0.33, 0.61, ...]]
    k:      number of points to find (k must be less than or equal to the
            number of points)
    cores:  number of cores to use
    checks, eps, sorted: left undocumented in the original notes

    Returns
    -------
    ids:   ids of the points found (numpy 2-D array)
    dists: distances from the target point (numpy 2-D array)
    """
    data = random_vectors(n=10, d=3)
    index = Index(data)
    index.add_points(data.shape[0])

    # Query with a single, randomly chosen row of the data set.
    chosen = np.random.randint(data.shape[0])
    single_query = np.asarray(data[[chosen]], dtype=np.float32)
    single_ids, single_dists = index.knn_search_points(single_query, 3, cores=1)
    print(single_ids)
    print(single_dists)

    # Query with the whole data set at once.
    all_ids, all_dists = index.knn_search_points(data, 5, cores=1)
    print(all_ids)
    print(all_dists)
def test_random_64(self):
    """A float64-backed index must return a stored point as its own neighbour."""
    data = random_vectors(dtype=np.float64)

    index = Index(data)
    self.assertTrue(index.is_using_pyarray)

    # Points must be added before the index can be queried.
    index.add_points(data.shape[0])

    # Query with one randomly chosen stored row, converted to float32.
    target = np.random.randint(data.shape[0])
    query = np.asarray(data[[target]], dtype=np.float32)

    found_ids, _found_dists = index.knn_search_points(query, 1, cores=1)

    self.assertEqual(len(found_ids), 1)
    self.assertEqual(found_ids[0], target)
class KNNRegressor():
    """k-nearest-neighbour regressor backed by an ``Index``.

    Predictions are averages of the targets of the ``n_neighbors`` nearest
    indexed points, either unweighted (``weights='uniform'``) or weighted
    by inverse distance (any other ``weights`` value).

    Parameters
    ----------
    X : data the index is built over.
    y : numpy array of targets, 1-D or 2-D (``ndim`` and fancy indexing are
        used on it); row ``i`` is assumed to belong to row ``i`` of ``X``.
    n_neighbors : number of neighbours requested from the index.
    weights : ``'uniform'`` for a plain mean, anything else for
        inverse-distance weighting.
    online : when falsy, all rows of ``X`` are added to the index at
        construction time; otherwise points are fed later through ``run``.
    """

    def __init__(self, X, y, n_neighbors=5, weights='uniform', online=False):
        self.X = X
        self.y = y
        self.index = Index(X)
        self.n_neighbors = n_neighbors
        self.weights = weights

        if not online:  # offline: index every point up front
            self.index.add_points(len(X))

    def run(self, ops):
        """Forward ``ops`` to ``Index.run`` and return its result."""
        return self.index.run(ops)

    def predict(self, X):
        """Predict targets for the query rows ``X``.

        Returns an array with one row per query; the result is flattened to
        1-D when ``y`` is 1-D.
        """
        indices, dists = self.index.knn_search_points(X, k=self.n_neighbors)
        weights = self._get_weights(dists)

        if self.weights == 'uniform':
            y_pred = np.mean(self.y[indices], axis=1)
        else:
            # View 1-D targets as a single column so the per-column loop
            # below works; previously ``self.y.shape[1]`` raised IndexError
            # for 1-D targets in this branch.
            targets = self.y.reshape(-1, 1) if self.y.ndim == 1 else self.y

            y_pred = np.empty((X.shape[0], targets.shape[1]))
            denom = np.sum(weights, axis=1)

            for j in range(targets.shape[1]):
                num = np.sum(targets[indices, j] * weights, axis=1)
                y_pred[:, j] = num / denom

        if self.y.ndim == 1:
            y_pred = y_pred.ravel()

        return y_pred

    def _get_weights(self, dists):
        """Turn neighbour distances into weights, in place.

        Returns ``None`` for uniform weighting. Otherwise each row of
        ``dists`` is overwritten with ``1 / distance``; a row containing an
        exact zero distance is overwritten with an indicator of its zero
        entries instead, so exact matches take all the weight and no
        division by zero occurs. The (mutated) ``dists`` array is returned.
        """
        if self.weights == 'uniform':
            return None

        for i, dist in enumerate(dists):
            if 0. in dist:
                dists[i] = dist == 0.
            else:
                dists[i] = 1. / dist

        return dists
def test_check_x_type(self):
    """Check which input arrays the index accepts and which raise ValueError.

    The default ``random_vectors()`` output must work end to end, while an
    int32 array and the output of ``np.random.rand`` are each expected to
    raise ``ValueError`` somewhere between construction and querying.
    """

    def build_and_query(data):
        # Full life cycle: construct, add every point, run one query.
        idx = Index(data)
        idx.add_points(len(data))
        idx.knn_search_points(data, 10)

    # Accepted input.
    good = random_vectors()
    index = Index(good)
    self.assertTrue(index.is_using_pyarray)
    index.add_points(len(good))
    index.knn_search_points(good, 10)

    # Rejected input: integer dtype.
    with self.assertRaises(ValueError):
        build_and_query(random_vectors(dtype=np.int32))

    # Rejected input: array produced by np.random.rand.
    with self.assertRaises(ValueError):
        build_and_query(np.random.rand(100, 10))