def test_search_by_vector(self):
    """Search a three-point, 2-D 'L2' index and check the returned id order.

    The expected id lists below assume ids follow insertion order
    (0 -> [2, 2], 1 -> [3, 2], 2 -> [3, 3]).
    """
    dim = 2
    index = HnswIndex(dim, 'L2')
    for point in ([2, 2], [3, 2], [3, 3]):
        index.add_data(point)
    index.build()

    # (query vector, expected ids for the 3 nearest neighbours)
    cases = (
        ([4, 4], [2, 1, 0]),
        ([1, 1], [0, 1, 2]),
        ([4, 2], [1, 2, 0]),
    )
    for query, expected_ids in cases:
        self.assertEqual(index.search_by_vector(query, 3), expected_ids)
def test_search_by_vector(self):
    """Search a three-point, 3-D index built with the default metric.

    The index holds the three unit axis vectors. The expected id lists
    assume ids follow insertion order and that the default metric ranks
    by direction (presumably angular -- confirm against the library).
    """
    dim = 3
    index = HnswIndex(dim)
    for axis_vector in ([0, 0, 1], [0, 1, 0], [1, 0, 0]):
        index.add_data(axis_vector)
    index.build(max_m0=10, m=5)

    # (query vector, expected ids for the 3 nearest neighbours)
    cases = (
        ([3, 2, 1], [2, 1, 0]),
        ([1, 2, 3], [0, 1, 2]),
        ([2, 0, 1], [2, 0, 1]),
    )
    for query, expected_ids in cases:
        self.assertEqual(index.search_by_vector(query, 3), expected_ids)
class N2(BaseANN):
    """Benchmark adapter that wraps an n2 ``HnswIndex`` using the "L2" metric.

    Progress messages are printed to stdout after construction and fitting.
    """

    def __init__(self, m):
        """Store the build parameter ``m``; the thread count is fixed at 8."""
        n_threads = 8
        self._index = None
        self._m = m
        self._threads = n_threads
        self.name = 'N2(m={}, threads={})'.format(m, n_threads)
        print("Init done")

    def fit(self, X):
        """Build the index from the rows of ``X``.

        ``X`` is converted to a float32 numpy array; its second dimension
        is used as the index dimensionality.
        """
        data = numpy.array(X).astype(numpy.float32)
        self._index = HnswIndex(data.shape[1], "L2")
        print("Shape", data.shape[1])
        for row in data:
            self._index.add_data(row)
        self._index.build(m=self._m, n_threads=self._threads)
        print("Fit done")

    def query(self, v, n):
        """Return the result of searching the index for ``n`` neighbours of ``v``."""
        query_vector = v.astype(numpy.float32)
        return self._index.search_by_vector(query_vector, n)

    def use_threads(self):
        """Always report False."""
        return False
def test04_batch_search_by_vectors(self):
    """Check that batch search returns the same results as one-by-one search.

    Loads the model stored at ``self.model_fname``, generates 100 random
    Gaussian query vectors of dimension ``self.dim``, and compares
    ``batch_search_by_vectors`` against a list of individual
    ``search_by_vector`` calls (10 neighbours each, distances included).
    """
    index = HnswIndex(self.dim)
    index.load(self.model_fname)

    # ``range`` instead of the Python-2-only ``xrange`` so the test also
    # runs on Python 3; the sequences are tiny, so nothing is lost on 2.
    queries = [
        [random.gauss(0, 1) for _component in range(self.dim)]
        for _query in range(100)
    ]

    batch_res = index.batch_search_by_vectors(
        queries, 10, num_threads=12, include_distances=True)
    normal_res = [
        index.search_by_vector(query, 10, include_distances=True)
        for query in queries
    ]
    self.assertEqual(batch_res, normal_res)
class N2(BaseANN):
    """Benchmark adapter around an n2 ``HnswIndex`` with an on-disk index cache.

    The built index is saved under ``INDEX_DIR`` and reloaded on later
    ``fit`` calls when the file already exists.

    NOTE(review): the cache file name encodes m, ef_construction and
    n_threads but not the metric, so an index saved under one metric is
    loaded for any other metric too -- confirm this is intended.
    """

    def __init__(self, m, ef_construction, n_threads, ef_search, metric):
        """Record build/search parameters and ensure the index directory exists."""
        build_params = (m, ef_construction, n_threads)
        self._m = m
        self._m0 = m * 2
        self._ef_construction = ef_construction
        self._n_threads = n_threads
        self._ef_search = ef_search
        self._metric = metric
        self._index_name = os.path.join(
            INDEX_DIR, "youtube_n2_M%d_efCon%d_n_thread%s" % build_params)
        self.name = "N2_M%d_efCon%d_n_thread%s_efSearch%d" % (
            build_params + (ef_search,))

        index_dir = os.path.dirname(self._index_name)
        if not os.path.exists(index_dir):
            os.makedirs(index_dir)

    def fit(self, X):
        """Load the cached index if present, otherwise build it from ``X`` and save it."""
        from n2 import HnswIndex

        # 'euclidean' maps to the library's 'L2'; anything else uses the default.
        metric_args = ('L2',) if self._metric == 'euclidean' else ()
        self._n2 = HnswIndex(X.shape[1], *metric_args)

        if os.path.exists(self._index_name):
            logging.debug("Loading index from file")
            self._n2.load(self._index_name)
            return

        logging.debug("Index file is not exist: {0}".format(
            self._index_name))
        logging.debug("Start fitting")
        for row in X:
            self._n2.add_data(row.tolist())
        self._n2.build(m=self._m,
                       max_m0=self._m0,
                       ef_construction=self._ef_construction,
                       n_threads=self._n_threads)
        self._n2.save(self._index_name)

    def query(self, v, n):
        """Search for ``n`` neighbours of ``v`` using the stored ``ef_search``."""
        as_list = v.tolist()
        return self._n2.search_by_vector(as_list, n, self._ef_search)

    def __str__(self):
        """Return the descriptive name built in ``__init__``."""
        return self.name
class N2(BaseANN):
    """Benchmark adapter around an n2 ``HnswIndex`` with caching and batch search.

    The built index is saved under ``CACHE_DIR`` in a file named after
    ``args.dataset`` and the build parameters, and is reloaded on later
    ``fit`` calls when that file exists.

    NOTE(review): the cache file name does not include the metric, so an
    index saved under one metric is loaded for any other metric on the
    same dataset -- confirm this is intended.
    """

    def __init__(self, m, ef_construction, n_threads, ef_search, metric, batch):
        """Record build/search parameters; ``batch`` only affects ``name``."""
        batch_suffix = '_batch' if batch else ''
        self.name = "N2_M%d_efCon%d_n_thread%s_efSearch%d%s" % (
            m, ef_construction, n_threads, ef_search, batch_suffix)
        self._m = m
        self._m0 = m * 2
        self._ef_construction = ef_construction
        self._n_threads = n_threads
        self._ef_search = ef_search
        index_file = "index_n2_%s_M%d_efCon%d_n_thread%s" % (
            args.dataset, m, ef_construction, n_threads)
        self._index_name = os.path.join(CACHE_DIR, index_file)
        self._metric = metric

    def fit(self, X):
        """Load the cached index if present, otherwise build it from ``X`` and save it."""
        # 'euclidean' -> 'L2', 'dot' -> 'dot', anything else -> library default.
        if self._metric == 'euclidean':
            metric_args = ('L2',)
        elif self._metric == 'dot':
            metric_args = ('dot',)
        else:
            metric_args = ()
        self._n2 = HnswIndex(X.shape[1], *metric_args)

        if os.path.exists(self._index_name):
            n2_logger.info("Loading index from file")
            self._n2.load(self._index_name, use_mmap=False)
            return

        n2_logger.info("Create Index")
        for row in X:
            self._n2.add_data(row)
        self._n2.build(m=self._m,
                       max_m0=self._m0,
                       ef_construction=self._ef_construction,
                       n_threads=self._n_threads)
        self._n2.save(self._index_name)

    def query(self, v, n):
        """Search for ``n`` neighbours of ``v`` using the stored ``ef_search``."""
        return self._n2.search_by_vector(v, n, self._ef_search)

    def batch_query(self, X, n):
        """Run a batch search over ``X`` and keep the result in ``self.b_res``."""
        results = self._n2.batch_search_by_vectors(
            X, n, self._ef_search, self._n_threads)
        self.b_res = results

    def get_batch_results(self):
        """Return the result stored by the last ``batch_query`` call."""
        return self.b_res

    def __str__(self):
        """Return the descriptive name built in ``__init__``."""
        return self.name
def precision(self, n, n_trees=10, n_points=10000, n_rounds=10):
    """Estimate how often the true ``n`` nearest points are returned.

    Each round builds a fresh 10-dimensional 'L2' index in which point
    ``j`` lies at distance ``j`` from (1000, 0, 0, ...), then searches
    from that centre. Ids ``0 .. n-1`` are therefore the true ``n``
    nearest neighbours.

    Args:
        n: number of neighbours requested per search.
        n_trees: not used by this method.
        n_points: number of points inserted per round.
        n_rounds: number of independent rounds to average over.

    Returns:
        Fraction (float) of returned ids, over all rounds, that are
        smaller than ``n``.
    """
    f = 10
    found = 0
    # ``range`` instead of the Python-2-only ``xrange`` so this also runs
    # on Python 3.
    for _round in range(n_rounds):
        index = HnswIndex(f, 'L2')
        for j in range(n_points):
            # Random direction in the remaining f - 1 dimensions, scaled
            # so the point sits at distance j from (1000, 0, 0, ...).
            p = [random.gauss(0, 1) for _axis in range(f - 1)]
            norm = sum(pi ** 2 for pi in p) ** 0.5
            index.add_data([1000] + [pi / norm * j for pi in p])
        index.build()

        nns = index.search_by_vector([1000] + [0] * (f - 1), n)
        # Ids equal distances here, so the result should already be sorted.
        self.assertEqual(nns, sorted(nns))
        # Count the returned ids that are among the true n nearest.
        found += sum(1 for nn in nns if nn < n)
    return 1.0 * found / (n * n_rounds)
class N2(BaseANN):
    """Benchmark adapter that wraps an n2 ``HnswIndex`` using the "L2" metric."""

    def __init__(self, m):
        """Store the build parameter ``m``; the thread count is fixed at 8."""
        n_threads = 8
        self._index = None
        self._m = m
        self._threads = n_threads
        self.name = 'N2(m={}, threads={})'.format(m, n_threads)

    def fit(self, X):
        """Build the index from the rows of ``X``.

        ``X`` is converted to a float32 numpy array; its second dimension
        is used as the index dimensionality.
        """
        data = numpy.array(X).astype(numpy.float32)
        self._index = HnswIndex(data.shape[1], "L2")
        for row in data:
            self._index.add_data(row)
        self._index.build(m=self._m, n_threads=self._threads)

    def query(self, v, n):
        """Return the result of searching the index for ``n`` neighbours of ``v``."""
        query_vector = v.astype(numpy.float32)
        return self._index.search_by_vector(query_vector, n)
import random

from n2 import HnswIndex

# Example: build a small index of random vectors, save it, reload it, and
# run both an id-based and a vector-based nearest-neighbour search.

f = 3  # vector dimensionality
t = HnswIndex(f)  # a metric may be given as second argument: "L2" or "angular"
# ``range`` instead of the Python-2-only ``xrange``: the script already uses
# print() as a function, and ``range`` works on both Python 2 and 3.
for i in range(1000):
    v = [random.gauss(0, 1) for z in range(f)]
    t.add_data(v)
t.build(m=5, max_m0=10, n_threads=4)
t.save('test.n2')

# Reload the saved index into a fresh object.
u = HnswIndex(f, "angular")
u.load('test.n2')

search_id = 1
k = 3
neighbor_ids = u.search_by_id(search_id, k)
print(
    "[search_by_id]: Nearest neighborhoods of id {}: {}".format(
        search_id, neighbor_ids))

example_vector_query = [random.gauss(0, 1) for z in range(f)]
nns = u.search_by_vector(example_vector_query, k, include_distances=True)
print(
    "[search_by_vector]: Nearest neighborhoods of vector {}: {}".format(
        example_vector_query, nns))