def t1est_large_index_batch(self):
    """Bulk-insert 100,000 vectors in one batch and verify every near-duplicate pair.

    Items ``j`` and ``j + 1`` (``j`` even) are noisy positive multiples of the
    same random Gaussian vector, so each one is expected to come back as the
    other's nearest neighbour.

    NOTE: the ``t1est`` prefix means unittest discovery (which collects
    ``test*`` methods) will not pick this method up; presumably that is a
    deliberate way of disabling a slow test -- confirm before renaming.
    """
    import shutil  # local import: only needed for the directory reset below

    print("test_large_index_batch")
    start_time = int(round(time.time() * 1000))  # wall clock in milliseconds

    # Start from an empty database directory without shelling out, so the
    # reset is portable and a failure to recreate the directory raises
    # instead of being silently ignored.
    shutil.rmtree("test_db", ignore_errors=True)
    os.mkdir("test_db")

    # Generate pairs of random points where the pair is super close
    dim = 100
    # NOTE(review): the last constructor argument is 0 here and 1 when the
    # index is reopened for querying below; presumably a read-only flag --
    # confirm against the AnnoyIndex binding.
    index = AnnoyIndex(dim, 12, "test_db", 10, 1000, 3048576000, 0)
    item_ids = []
    vectors = []
    for j in range(0, 100000, 2):
        base = [random.gauss(0, 1) for _ in range(dim)]
        scale_a = random.random() + 1
        scale_b = random.random() + 1
        item_ids.extend((j, j + 1))
        vectors.append([scale_a * c + random.gauss(0, 1e-2) for c in base])
        vectors.append([scale_b * c + random.gauss(0, 1e-2) for c in base])
    index.add_item_batch(item_ids, vectors)

    index = AnnoyIndex(dim, 12, "test_db", 10, 1000, 3048576000, 1)
    for j in range(0, 100000, 2):
        self.assertEqual(index.get_nns_by_item(j, 2, 50), [j, j + 1])
        self.assertEqual(index.get_nns_by_item(j + 1, 2, 50), [j + 1, j])

    # Floor division keeps the whole-second report identical on Python 2 and 3.
    elapsed_ms = int(round(time.time() * 1000)) - start_time
    print("Total time =  %d" % (elapsed_ms // 1000))
def _get_index(self, f, distance):
    """Build an AnnoyIndex from the GloVe twitter vectors of dimension ``f``.

    The gzipped vector file is downloaded to ``test/`` when it is not already
    present, then its rows are added to the index in batches of 10,000.

    Args:
        f: Vector dimensionality; selects the GloVe file and is passed as the
            first argument to ``AnnoyIndex``.
        distance: Only used in the ``output`` file name and the progress
            message in this block; it is not passed to ``AnnoyIndex``.

    Returns:
        The populated ``AnnoyIndex`` instance.

    NOTE(review): nothing in this block ever writes ``output``. If that file
    happens to exist, the build is skipped and ``annoy`` is unbound at the
    ``return`` (UnboundLocalError). Left as-is because the persistence API of
    this AnnoyIndex is not visible here -- confirm the intended behaviour.
    """
    vectors_path = 'test/glove.twitter.27B.%dd.txt.gz' % f
    output = 'test/glove.%d.%s.annoy' % (f, distance)

    if not os.path.exists(output):
        if not os.path.exists(vectors_path):
            # Download GloVe pretrained vectors: http://nlp.stanford.edu/projects/glove/
            # Hosting them on my own S3 bucket since the original files changed format
            url = 'https://s3-us-west-1.amazonaws.com/annoy-vectors/glove.twitter.27B.%dd.txt.gz' % f
            print('downloading', url, '->', vectors_path)
            urlretrieve(url, vectors_path)

        print('building index', distance, f)
        annoy = AnnoyIndex(f, 12, "test_db", 10, 1000, 3048576000, 0)
        v_v = []
        items = []
        # Context manager so the gzip handle is closed even if parsing fails.
        with gzip.open(vectors_path, 'rb') as rows:
            for i, line in enumerate(rows):
                # First whitespace-separated field is skipped; the rest are
                # the vector components.
                v = [float(x) for x in line.strip().split()[1:]]
                v_v.append(v)
                items.append(i)
                if (i + 1) % 10000 == 0:
                    print(i + 1)
                    # Flush a full batch and start collecting the next one.
                    annoy.add_item_batch(items, v_v)
                    v_v = []
                    items = []
        # Flush the final partial batch, if any.
        if v_v:
            annoy.add_item_batch(items, v_v)

    return annoy
def _get_index(self, f, distance):
    """Build an AnnoyIndex from the GloVe twitter vectors of dimension ``f``.

    The gzipped vector file is downloaded to ``test/`` when it is not already
    present, then its rows are added to the index in batches of 10,000.

    Args:
        f: Vector dimensionality; selects the GloVe file and is passed as the
            first argument to ``AnnoyIndex``.
        distance: Only used in the ``output`` file name and the progress
            message in this block; it is not passed to ``AnnoyIndex``.

    Returns:
        The populated ``AnnoyIndex`` instance.

    NOTE(review): nothing in this block ever writes ``output``. If that file
    happens to exist, the build is skipped and ``annoy`` is unbound at the
    ``return`` (UnboundLocalError). Left as-is because the persistence API of
    this AnnoyIndex is not visible here -- confirm the intended behaviour.
    """
    vectors_path = 'test/glove.twitter.27B.%dd.txt.gz' % f
    output = 'test/glove.%d.%s.annoy' % (f, distance)

    if not os.path.exists(output):
        if not os.path.exists(vectors_path):
            # Download GloVe pretrained vectors: http://nlp.stanford.edu/projects/glove/
            # Hosting them on my own S3 bucket since the original files changed format
            url = 'https://s3-us-west-1.amazonaws.com/annoy-vectors/glove.twitter.27B.%dd.txt.gz' % f
            print('downloading', url, '->', vectors_path)
            urlretrieve(url, vectors_path)

        print('building index', distance, f)
        annoy = AnnoyIndex(f, 12, "test_db", 10, 1000, 3048576000, 0)
        batch_vectors = []
        batch_ids = []
        # Context manager so the gzip handle is closed even if parsing fails.
        with gzip.open(vectors_path, 'rb') as rows:
            for i, line in enumerate(rows):
                # First whitespace-separated field is skipped; the rest are
                # the vector components.
                batch_vectors.append([float(x) for x in line.strip().split()[1:]])
                batch_ids.append(i)
                if (i + 1) % 10000 == 0:
                    print(i + 1)
                    # Flush a full batch and start collecting the next one.
                    annoy.add_item_batch(batch_ids, batch_vectors)
                    batch_vectors = []
                    batch_ids = []
        # Flush the final partial batch, if any.
        if batch_vectors:
            annoy.add_item_batch(batch_ids, batch_vectors)

    return annoy
def test_get_nns_by_item_batch(self):
    """Batch-add three 3-dimensional vectors and check each item's neighbour order."""
    import shutil  # local import: only needed for the directory reset below

    print("test_get_nns_by_item_batch ")

    # Start from an empty database directory without shelling out, so the
    # reset is portable and a failure to recreate the directory raises
    # instead of being silently ignored.
    shutil.rmtree("test_db", ignore_errors=True)
    os.mkdir("test_db")

    dim = 3
    index = AnnoyIndex(dim, 3, "test_db", 10, 1000, 3048576000, 0)
    index.add_item_batch([0, 1, 2], [[2, 1, 0], [1, 2, 0], [0, 0, 1]])

    self.assertEqual(index.get_nns_by_item(0, 3), [0, 1, 2])
    self.assertEqual(index.get_nns_by_item(1, 3), [1, 0, 2])
    # Items 0 and 1 are tied in distance from item 2, so either order is
    # acceptable; assertIn reports the actual list on failure.
    self.assertIn(index.get_nns_by_item(2, 3), [[2, 0, 1], [2, 1, 0]])