def test_openmp_obj(self):
    """Time batched k-NN queries with 1 and 4 cores on a PseudoArray-backed index.

    The index is built from a ``PseudoArray`` wrapper around float64 data and
    is asserted not to be using the pyarray path. Nothing about the timings
    is asserted; both are only printed in milliseconds.
    """
    n_points = 10000  # must be large enough
    raw = random_vectors(n_points, dtype=np.float64)
    wrapped = PseudoArray(raw)

    index = Index(wrapped)
    self.assertFalse(index.is_using_pyarray)
    # Points must be added before the index can be queried.
    index.add_points(wrapped.shape[0])

    queries = np.asarray(raw, dtype=np.float32)

    def timed_search(cores):
        # Wall-clock seconds spent in one batched query with `cores` cores.
        began = time.time()
        _found = index.knn_search_points(queries, 10, cores=cores)
        return time.time() - began

    # Warm-up rounds so the caches are ready before measuring.
    for _ in range(5):
        index.knn_search_points(queries, 10)

    single_elapsed = timed_search(1)
    multi_elapsed = timed_search(4)

    print("single thread: {:.2f} ms".format(single_elapsed * 1000))
    print("4 threads: {:.2f} ms".format(multi_elapsed * 1000))
def test_knn_search():
    """Demonstrate ``Index.knn_search``: look up neighbours by point id.

    knn_search
    ----------
    Given the id of a point, returns ids and distances in ascending order
    of distance. The queried point itself is included in the result.

    Parameters
    ----------
    pid : id of the target point
    k : number of points to find (must be less than or equal to the
        number of points)
    cores : number of cores to use
    checks, eps, sorted : not described here

    Returns
    -------
    ids : ids of the points found (numpy 2D array)
    dists : distances from the target point (numpy 2D array)
    """
    data = random_vectors()
    index = Index(data)
    index.add_points(data.shape[0])

    # Choose a random id among the points, e.g. 94.
    target = np.random.randint(data.shape[0])
    print(data[[target]])  # the point's data, e.g. [[0.64, ...]]

    ids, dists = index.knn_search(target, 5, cores=1)
    print(ids)  # e.g. for target 10: array([[10, 80, 87, 5, 95]])
    print(dists)  # e.g. array([[0, 0.76741797, 0.86952025, 0.90387696, 0.9157505]])
class KNNKernelDensity:
    """Gaussian kernel density score computed from k-nearest-neighbour distances.

    Wraps an ``Index`` built over ``X``. Unless ``online`` is truthy, all
    ``len(X)`` points are added to the index at construction time.
    """

    # Normalising constant sqrt(2 * pi) of the Gaussian kernel.
    SQRT2PI = np.sqrt(2 * np.pi)

    def __init__(self, X: np.ndarray[Any, Any], online: Optional[bool] = False):
        self.X = X
        self.index = Index(X)
        if online:
            # Online mode: points are left to be added later.
            return
        self.index.add_points(len(X))

    def run(self, ops: Any) -> Any:
        """Forward ``ops`` to ``Index.run`` and return its result."""
        return self.index.run(ops)

    def run_ids(self, ids: Iterable[int]) -> Any:
        """Forward ``ids`` to ``Index.run_ids`` and return its result."""
        return self.index.run_ids(ids)

    def score_samples(
        self, X: np.ndarray[Any, Any], k: int = 10, bandwidth: float = 0.2
    ) -> float:
        """Score each query in ``X`` from the distances to its ``k`` neighbours.

        The per-query sum of Gaussian kernel values is divided by ``k``.

        NOTE(review): the result comes from ``.sum(axis=1)`` on the distance
        array, so the ``float`` annotation looks too narrow - confirm.
        """
        _, neighbour_dists = self.index.knn_search_points(X, k=k)
        return self._gaussian_score(neighbour_dists, bandwidth) / k

    def _gaussian_score(self, dists: float, bandwidth: float) -> float:
        """Sum Gaussian kernel values of ``dists`` along axis 1.

        NOTE(review): ``dists`` must support ``.sum(axis=1)`` after the
        arithmetic below, so it is presumably a 2D array - confirm.
        """
        exponent = -0.5 * (dists / bandwidth) ** 2
        kernel = np.exp(exponent) / bandwidth / self.SQRT2PI
        return kernel.sum(axis=1)  # type: ignore
class KNNKernelDensity:
    """Gaussian kernel density score computed from k-nearest-neighbour distances.

    Wraps an ``Index`` built over ``X``. Unless ``online`` is truthy, all
    ``len(X)`` points are added to the index at construction time.
    """

    # Normalising constant sqrt(2 * pi) of the Gaussian kernel.
    SQRT2PI = np.sqrt(2 * np.pi)

    def __init__(self, X, online=False):
        self.X = X
        self.index = Index(X)
        if online:
            # Online mode: points are left to be added later.
            return
        # Offline mode: index every point up front.
        self.index.add_points(len(X))

    def run(self, ops):
        """Forward ``ops`` to ``Index.run`` and return its result."""
        return self.index.run(ops)

    def run_ids(self, ids):
        """Forward ``ids`` to ``Index.run_ids`` and return its result."""
        return self.index.run_ids(ids)

    def score_samples(self, X, k=10, bandwidth=0.2):
        """Score each query in ``X`` from the distances to its ``k`` neighbours.

        The per-query sum of Gaussian kernel values is divided by ``k``.
        """
        _, neighbour_dists = self.index.knn_search_points(X, k=k)
        return self._gaussian_score(neighbour_dists, bandwidth) / k

    def _gaussian_score(self, dists, bandwidth):
        """Sum Gaussian kernel values of ``dists`` along axis 1."""
        exponent = -0.5 * (dists / bandwidth) ** 2
        kernel = np.exp(exponent) / bandwidth / self.SQRT2PI
        return kernel.sum(axis=1)
def test_return_shape_64(self):
    """Check ``knn_search`` returns (1, k)-shaped ids and distances for float64 data.

    Also asserts that the index holds the very same array object it was
    given and that it is using the pyarray path.
    """
    data = random_vectors(dtype=np.float64)
    index = Index(data)
    self.assertIs(data, index.array)
    self.assertTrue(index.is_using_pyarray)

    n_points = data.shape[0]
    index.add_points(n_points)

    # Query every point by id and check both result shapes.
    for point_id in range(n_points):
        ids, dists = index.knn_search(point_id, 5)
        self.assertEqual(ids.shape, (1, 5))
        self.assertEqual(dists.shape, (1, 5))
def test_large_k(self):
    """Check that asking for more neighbours than there are vectors raises ValueError.

    Both query entry points are exercised: ``knn_search`` (by id) and
    ``knn_search_points`` (by data).
    """
    data = random_vectors()
    query = random_vectors(1)
    # One more than the number of vectors in the data set.
    too_many = data.shape[0] + 1

    index = Index(data)
    self.assertTrue(index.is_using_pyarray)
    index.add_points(data.shape[0])

    with self.assertRaises(ValueError):
        index.knn_search(0, too_many)

    with self.assertRaises(ValueError):
        index.knn_search_points(query, too_many)
def test_random_64(self):
    """Check that a point from float64 data is its own nearest neighbour.

    A randomly chosen row is converted to float32 and used as the query;
    with k=1 the returned id is expected to be the row's own id.
    """
    data = random_vectors(dtype=np.float64)
    index = Index(data)
    self.assertTrue(index.is_using_pyarray)
    # Points must be added before the index can be queried.
    index.add_points(data.shape[0])

    chosen = np.random.randint(data.shape[0])
    query = np.asarray(data[[chosen]], dtype=np.float32)

    ids, dists = index.knn_search_points(query, 1, cores=1)
    self.assertEqual(len(ids), 1)
    self.assertEqual(ids[0], chosen)
def test_add_points():
    """Demonstrate ``Index.add_points`` by printing the index size as points are added.

    add_points
    ----------
    Points must be added before the index is queried.

    Parameters
    ----------
    ops : number of points to add
    """
    data = random_vectors(n=30)
    index = Index(data)

    # Nothing has been added yet, so the size is 0.
    print(index.size())

    # Add a single point: size becomes 1.
    index.add_points(1)
    print(index.size())

    # Ask for far more than exist: size is 30, since we cannot add more
    # points than we have.
    index.add_points(100000)
    print(index.size())
class KNNRegressor:
    """k-nearest-neighbour regressor backed by an ``Index``.

    Parameters
    ----------
    X : training points handed to ``Index``
    y : training targets, either 1-D ``(n_samples,)`` or 2-D
        ``(n_samples, n_outputs)``
    n_neighbors : number of neighbours used for each prediction
    weights : ``'uniform'`` averages the neighbours' targets; any other
        value weights each neighbour by the inverse of its distance
    online : when falsy, all ``len(X)`` points are added to the index at
        construction time; when truthy, none are
    """

    def __init__(self, X, y, n_neighbors=5, weights='uniform', online=False):
        self.X = X
        self.y = y
        self.index = Index(X)
        self.n_neighbors = n_neighbors
        self.weights = weights
        if not online:
            # Offline mode: index every point up front.
            self.index.add_points(len(X))

    def run(self, ops):
        """Forward ``ops`` to ``Index.run`` and return its result."""
        return self.index.run(ops)

    def predict(self, X):
        """Predict targets for the query points ``X``.

        Returns an array of shape ``(n_queries,)`` when ``y`` is 1-D and
        ``(n_queries, n_outputs)`` otherwise.
        """
        indices, dists = self.index.knn_search_points(X, k=self.n_neighbors)

        if self.weights == 'uniform':
            y_pred = np.mean(self.y[indices], axis=1)
        else:
            weights = self._get_weights(dists)
            # View 1-D targets as a single output column so the weighted
            # average below works for both target layouts; previously
            # ``self.y.shape[1]`` raised IndexError for 1-D targets.
            targets = self.y if self.y.ndim > 1 else self.y.reshape(-1, 1)
            denom = np.sum(weights, axis=1)
            y_pred = np.empty((X.shape[0], targets.shape[1]))
            for j in range(targets.shape[1]):
                num = np.sum(targets[indices, j] * weights, axis=1)
                y_pred[:, j] = num / denom

        if self.y.ndim == 1:
            y_pred = y_pred.ravel()
        return y_pred

    def _get_weights(self, dists):
        """Turn neighbour distances into weights without modifying ``dists``.

        Returns ``None`` for uniform weighting. Otherwise each row becomes
        ``1 / distance``, except rows that contain a zero distance: there the
        zero-distance neighbours get weight 1 and all others weight 0.
        """
        if self.weights == 'uniform':
            return None

        dists = np.asarray(dists)
        is_zero = dists == 0.
        # Division by zero is expected here; those rows are overwritten below.
        with np.errstate(divide='ignore'):
            weights = 1. / dists
        rows_with_zero = is_zero.any(axis=1)
        weights[rows_with_zero] = is_zero[rows_with_zero]
        return weights
def test_updates_after_all_points_added(self):
    """Check what ``run`` reports once every point is already in the index.

    After all points are added and many queries have been issued, each
    ``run(ops)`` call is expected to report 0 under 'addPointResult' and
    ``ops`` under 'updateIndexResult'.
    """
    np.random.seed(10)
    total = 10000
    ops = 1000

    data = random_vectors(total)
    index = Index(data, w=(0.5, 0.5))
    self.assertTrue(index.is_using_pyarray)
    index.add_points(total)  # add all points

    # Issue many batched queries to accumulate losses.
    for _ in range(1000):
        index.knn_search_points(random_vectors(100), 10)

    for _ in range(10):
        outcome = index.run(ops)
        self.assertEqual(outcome['addPointResult'], 0)
        self.assertEqual(outcome['updateIndexResult'], ops)
def test_openmp(self):
    """Time batched k-NN queries over the whole data set with 1 and 4 cores.

    Nothing about the timings is asserted; both are only printed in
    milliseconds.
    """
    n_points = 10000  # must be large enough
    data = random_vectors(n_points)

    index = Index(data)
    self.assertTrue(index.is_using_pyarray)
    # Points must be added before the index can be queried.
    index.add_points(data.shape[0])

    def timed_search(cores):
        # Wall-clock seconds spent in one batched query with `cores` cores.
        began = time.time()
        _found = index.knn_search_points(data, 10, cores=cores)
        return time.time() - began

    # Warm-up rounds so the caches are ready before measuring.
    for _ in range(5):
        index.knn_search_points(data, 10)

    single_elapsed = timed_search(1)
    multi_elapsed = timed_search(4)

    print("single thread: {:.2f} ms".format(single_elapsed * 1000))
    print("4 threads: {:.2f} ms".format(multi_elapsed * 1000))
def test_knn_search_points():
    """Demonstrate ``Index.knn_search_points``: look up neighbours by point data.

    knn_search_points
    -----------------
    Given data as an array, returns ids and distances in ascending order
    of distance. A queried point that is in the index is itself included
    in the result.

    Parameters
    ----------
    points : 2D array with the data of the target point(s); any 2D array
        can be used, e.g. [[0.33, 0.61, ...]]
    k : number of points to find (must be less than or equal to the
        number of points)
    cores : number of cores to use
    checks, eps, sorted : not described here

    Returns
    -------
    ids : ids of the points found (numpy 2D array)
    dists : distances from the target point (numpy 2D array)
    """
    data = random_vectors(n=10, d=3)
    index = Index(data)
    index.add_points(data.shape[0])

    # Choose a random id among the points.
    chosen = np.random.randint(data.shape[0])

    # Query with a single randomly chosen data point.
    single_query = np.asarray(data[[chosen]], dtype=np.float32)
    single_ids, single_dists = index.knn_search_points(single_query, 3, cores=1)
    print(single_ids)
    print(single_dists)

    # Query with the whole data set at once.
    all_ids, all_dists = index.knn_search_points(data, 5, cores=1)
    print(all_ids)
    print(all_dists)
def test_check_x_type(self):
    """Check which input arrays are accepted and which raise ValueError.

    Default ``random_vectors()`` output must work end to end. Building,
    filling and querying an index with int32 vectors, or with the output of
    ``np.random.rand``, is expected to raise ValueError at some step.
    """
    good = random_vectors()
    good_index = Index(good)
    self.assertTrue(good_index.is_using_pyarray)
    good_index.add_points(len(good))
    good_index.knn_search_points(good, 10)

    # Every step stays inside the context manager: any of them may be the
    # one that raises.
    with self.assertRaises(ValueError):
        ints = random_vectors(dtype=np.int32)
        int_index = Index(ints)
        int_index.add_points(len(ints))
        int_index.knn_search_points(ints, 10)

    with self.assertRaises(ValueError):
        rand = np.random.rand(100, 10)
        rand_index = Index(rand)
        rand_index.add_points(len(rand))
        rand_index.knn_search_points(rand, 10)