Пример #1
0
def precision(f=40, n=1000000):
    """Benchmark Annoy recall: index n random Gaussian vectors of dimension
    f, then report running precision@10 and average query time for several
    search limits.

    Fix: Python-2-only `print` statements and `xrange` (errors on Python 3).
    """
    t = AnnoyIndex(f)
    for i in range(n):
        v = [random.gauss(0, 1) for _ in range(f)]
        t.add_item(i, v)

    t.build(2 * f)
    t.save('test.tree')

    limits = [10, 100, 1000, 10000]
    k = 10
    prec_sum = {}
    prec_n = 1000
    time_sum = {}

    for i in range(prec_n):
        j = random.randrange(0, n)
        print('finding nbs for', j)

        # Ground truth: the k true nearest neighbours, taken from an
        # exhaustive search (limit == n).
        closest = set(t.get_nns_by_item(j, n)[:k])
        for limit in limits:
            t0 = time.time()
            toplist = t.get_nns_by_item(j, limit)
            T = time.time() - t0

            found = len(closest.intersection(toplist))
            hitrate = 1.0 * found / k
            prec_sum[limit] = prec_sum.get(limit, 0.0) + hitrate
            time_sum[limit] = time_sum.get(limit, 0.0) + T

        # Running averages after each query.
        for limit in limits:
            print('limit: %-9d precision: %6.2f%% avg time: %.6fs'
                  % (limit, 100.0 * prec_sum[limit] / (i + 1), time_sum[limit] / (i + 1)))
Пример #2
0
    def t1est_large_index_batch(self):
        """Batch-insert pairs of nearly collinear points and verify each
        item's nearest neighbour is its pair partner. The 't1est' prefix
        keeps the test runner from collecting it.

        Fix: Python-2 print statements and xrange; `//` preserves the
        original integer division of the elapsed milliseconds.
        """
        print("test_large_index_batch")
        start_time = int(round(time.time() * 1000))
        os.system("rm -rf test_db")
        os.system("mkdir test_db")
        # Generate pairs of random points where the pair is super close
        f = 100
        i = AnnoyIndex(f, 12, "test_db", 10, 1000, 3048576000, 0)
        i_v = []
        v_v = []
        for j in range(0, 100000, 2):
            p = [random.gauss(0, 1) for z in range(f)]
            f1 = random.random() + 1
            f2 = random.random() + 1
            x = [f1 * pi + random.gauss(0, 1e-2) for pi in p]
            y = [f2 * pi + random.gauss(0, 1e-2) for pi in p]
            i_v.append(j)
            i_v.append(j + 1)
            v_v.append(x)
            v_v.append(y)

        i.add_item_batch(i_v, v_v)

        # NOTE(review): the index is re-opened with the last ctor arg flipped
        # 0 -> 1 — presumably a read/query mode flag in this fork; confirm.
        i = AnnoyIndex(f, 12, "test_db", 10, 1000, 3048576000, 1)
        for j in range(0, 100000, 2):
            self.assertEqual(i.get_nns_by_item(j, 2, 50), [j, j + 1])
            self.assertEqual(i.get_nns_by_item(j + 1, 2, 50), [j + 1, j])
        print("Total time = ", (int(round(time.time() * 1000)) - start_time) // 1000)
Пример #3
0
 def test_get_nns_by_item_batch(self):
     """add_item_batch must yield the same neighbour ordering as individual
     add_item calls. Fix: Python-2 print statement; normalized indentation."""
     print("test_get_nns_by_item_batch ")
     os.system("rm -rf test_db")
     os.system("mkdir test_db")
     f = 3
     i = AnnoyIndex(f, 3, "test_db", 10, 1000, 3048576000, 0)
     i.add_item_batch([0, 1, 2], [[2, 1, 0], [1, 2, 0], [0, 0, 1]])

     self.assertEqual(i.get_nns_by_item(0, 3), [0, 1, 2])
     self.assertEqual(i.get_nns_by_item(1, 3), [1, 0, 2])
     self.assertTrue(i.get_nns_by_item(2, 3) in [[2, 0, 1], [2, 1, 0]]) # could be either
Пример #4
0
    def test_get_nns_by_item(self):
        """Three fixed 3-d points: neighbour order is deterministic except
        for the tie on item 2, where either order is acceptable."""
        dims = 3
        index = AnnoyIndex(dims)
        for item_id, vec in enumerate([[2, 1, 0], [1, 2, 0], [0, 0, 1]]):
            index.add_item(item_id, vec)
        index.build(10)

        self.assertEqual(index.get_nns_by_item(0, 3), [0, 1, 2])
        self.assertEqual(index.get_nns_by_item(1, 3), [1, 0, 2])
        # Items 0 and 1 are equidistant from item 2.
        self.assertTrue(index.get_nns_by_item(2, 3) in [[2, 0, 1], [2, 1, 0]]) # could be either
Пример #5
0
 def test_basic_nns(self):
     """Two random binary vectors under hamming: each item is its own
     nearest neighbour and the reported distance equals the bit-difference
     count. Fix: deprecated assertEquals alias replaced by assertEqual."""
     f = 100
     i = AnnoyIndex(f, 'hamming')
     u = numpy.random.binomial(1, 0.5, f)
     v = numpy.random.binomial(1, 0.5, f)
     i.add_item(0, u)
     i.add_item(1, v)
     i.build(10)
     self.assertEqual(i.get_nns_by_item(0, 99), [0, 1])
     self.assertEqual(i.get_nns_by_item(1, 99), [1, 0])
     rs, ds = i.get_nns_by_item(0, 99, include_distances=True)
     self.assertEqual(rs, [0, 1])
     self.assertAlmostEqual(ds[0], 0)
     # For 0/1 vectors the squared euclidean norm equals the hamming distance.
     self.assertAlmostEqual(ds[1], numpy.dot(u-v, u-v))
Пример #6
0
 def test_item_vector_after_save(self):
     """Regression test for issue #279: item count, stored vectors and
     neighbour queries must answer identically before and after save()."""
     index = AnnoyIndex(3)
     index.verbose(True)
     for item, vec in ((1, [1, 0, 0]), (2, [0, 1, 0]), (3, [0, 0, 1])):
         index.add_item(item, vec)
     index.build(-1)

     def check(a):
         self.assertEqual(a.get_n_items(), 4)
         self.assertEqual(a.get_item_vector(3), [0, 0, 1])
         self.assertEqual(set(a.get_nns_by_item(1, 999)), set([1, 2, 3]))

     check(index)
     index.save('something.annoy')
     check(index)
Пример #7
0
    def test_large_index(self):
        """Pairs of nearly identical points must be each other's nearest
        neighbour under the manhattan metric."""
        dims = 10
        index = AnnoyIndex(dims, 'manhattan')
        for j in range(0, 10000, 2):
            base = [random.gauss(0, 1) for _ in range(dims)]
            index.add_item(j, [1 + c + random.gauss(0, 1e-2) for c in base])
            index.add_item(j + 1, [1 + c + random.gauss(0, 1e-2) for c in base])

        index.build(10)
        for j in range(0, 10000, 2):
            self.assertEqual(index.get_nns_by_item(j, 2), [j, j + 1])
            self.assertEqual(index.get_nns_by_item(j + 1, 2), [j + 1, j])
Пример #8
0
 def _test_holes_base(self, n, f=100, base_i=100000):
     """Shared helper: items occupy ids base_i..base_i+n-1; a full-size
     query must return exactly that id set.

     Fix: deprecated assertEquals alias replaced by assertEqual.
     """
     annoy = AnnoyIndex(f)
     for i in range(n):
         annoy.add_item(base_i + i, numpy.random.normal(size=(f,)))
     annoy.build(100)
     res = annoy.get_nns_by_item(base_i, n)
     self.assertEqual(set(res), set([base_i + i for i in range(n)]))
def make_text_graph(user_lemma_matrix, dimensionality, metric, number_of_estimators, number_of_neighbors):
    """Build a sparse k-NN text graph over users.

    Pipeline: tf-idf weight the user-lemma matrix, reduce it with truncated
    SVD (skipped when the matrix is already small), index rows in Annoy, then
    assemble the neighbour lists and distances into a CSR adjacency matrix.
    """
    user_lemma_matrix_tfidf = augmented_tf_idf(user_lemma_matrix)
    n_rows, n_cols = user_lemma_matrix_tfidf.shape
    # SVD is only meaningful when both dimensions exceed the target rank.
    if n_rows <= dimensionality or n_cols <= dimensionality:
        X_svd = user_lemma_matrix_tfidf.toarray()
    else:
        X_svd = TruncatedSVD(n_components=dimensionality).fit_transform(user_lemma_matrix_tfidf)

    annoy_index = AnnoyIndex(X_svd.shape[1], metric=metric)
    for user in range(X_svd.shape[0]):
        annoy_index.add_item(user, X_svd[user, :])
    annoy_index.build(number_of_estimators)

    row, col, data = [], [], []
    for user in range(X_svd.shape[0]):
        neighbors, distances = annoy_index.get_nns_by_item(user, number_of_neighbors, include_distances=True)
        row.extend([user] * number_of_neighbors)
        col.extend(neighbors)
        data.extend(distances)

    shape = (X_svd.shape[0], X_svd.shape[0])
    coo = spsp.coo_matrix(
        (np.array(data, dtype=np.float64),
         (np.array(row, dtype=np.int64), np.array(col, dtype=np.int64))),
        shape=shape)
    return spsp.csr_matrix(coo)
Пример #10
0
    def test_zero_vectors(self):
        """Hamming index over fixed 64-bit strings survives a save/load
        round-trip; the first neighbour of item 0 is itself at distance 0.
        (Mentioned on the annoy-user list.)

        Fix: deprecated assertEquals alias replaced by assertEqual.
        """
        bitstrings = [
            '0000000000011000001110000011111000101110111110000100000100000000',
            '0000000000011000001110000011111000101110111110000100000100000001',
            '0000000000011000001110000011111000101110111110000100000100000010',
            '0010010100011001001000010001100101011110000000110000011110001100',
            '1001011010000110100101101001111010001110100001101000111000001110',
            '0111100101111001011110010010001100010111000111100001101100011111',
            '0011000010011101000011010010111000101110100101111000011101001011',
            '0011000010011100000011010010111000101110100101111000011101001011',
            '1001100000111010001010000010110000111100100101001001010000000111',
            '0000000000111101010100010001000101101001000000011000001101000000',
            '1000101001010001011100010111001100110011001100110011001111001100',
            '1110011001001111100110010001100100001011000011010010111100100111',
        ]
        vectors = [[int(bit) for bit in bitstring] for bitstring in bitstrings]

        f = 64
        idx = AnnoyIndex(f, 'hamming')
        for i, v in enumerate(vectors):
            idx.add_item(i, v)

        idx.build(10)
        idx.save('idx.ann')
        idx = AnnoyIndex(f, 'hamming')
        idx.load('idx.ann')
        js, ds = idx.get_nns_by_item(0, 5, include_distances=True)
        self.assertEqual(js[0], 0)
        self.assertEqual(ds[:4], [0, 1, 1, 22])
Пример #11
0
def create_walks(df, index_file, patient_dict_file, index_dict_file, n_neighbors=25, walks_per_patient=10, walk_size=50, out_dir="./"):
    """Generate random-walk 'sentences' over a patient k-NN graph and write
    them to <out_dir>patient_walks.txt.

    Fixes vs. the original: Python-2 print statements; each patient's walks
    were overwritten instead of accumulated (only the last walk was written);
    the output file was opened in binary mode while receiving str data; the
    output file was never closed.
    """
    index = AnnoyIndex(df.shape[1])
    index.load(index_file)
    patient_dict = {}
    for key, val in csv.reader(open(patient_dict_file)):
        patient_dict[key] = int(val)
    index_dict = {}
    for key, val in csv.reader(open(index_dict_file)):
        index_dict[int(key)] = val
    print("Computing nearest-neighbors...")
    neighbor_dict = {}
    for i in range(index.get_n_items()):
        if i % 1000 == 0:
            print(str(i))  # progress
        patient_id = index_dict[i]
        neighbors = index.get_nns_by_item(i=i, n=n_neighbors, search_k=-1, include_distances=False)
        neighbor_dict[patient_id] = [index_dict[x] for x in neighbors]
    with open(out_dir + "patient_walks.txt", 'w') as f:
        for i in range(index.get_n_items()):
            if i % 1000 == 0:
                print(str(i))  # progress
            patient_id = index_dict[i]
            patient_sentences = ""
            for j in range(walks_per_patient):
                sentence = generate_sentence(start=patient_id, neighbor_dict=neighbor_dict,
                                             n_neighbors=n_neighbors, walk_size=walk_size)
                # Accumulate: one line per walk for this patient.
                patient_sentences += sentence + "\n"
            ## Write it ##
            f.write(patient_sentences)
Пример #12
0
 def test_get_lots_of_nns(self):
     """Requesting vastly more neighbours than stored items must return just
     the single stored item. Fix: Python-2-only xrange replaced by range."""
     f = 10
     i = AnnoyIndex(f, 'euclidean')
     i.add_item(0, [random.gauss(0, 1) for x in range(f)])
     i.build(10)
     for j in range(100):
         self.assertEqual(i.get_nns_by_item(0, 999999999), [0])
Пример #13
0
def do(indextype):
    """Query 10 fixed items from a prebuilt index and write each item's
    neighbour list as a tab-separated line.

    Fix: Python-2 `print >> out` redirection replaced by print(..., file=out).
    """
    a = AnnoyIndex(8, indextype[0])
    a.load('points.%s.annoy' % indextype)
    with open('points.%s.ann.txt' % indextype, 'w') as out:
        for q_index in [1443, 1240, 818, 1725, 1290, 2031, 1117, 1211, 1902, 603]:
            nns = a.get_nns_by_item(q_index, 10)
            print('%s\t%s' % (q_index, ','.join([str(n) for n in nns])), file=out)
Пример #14
0
 def test_large_index(self):
     """Pairs of scaled-and-jittered points must be each other's nearest
     neighbour. Fixes: Python-2 xrange; deprecated assertEquals alias."""
     # Generate pairs of random points where the pair is super close
     f = 10
     i = AnnoyIndex(f)
     for j in range(0, 10000, 2):
         p = [random.gauss(0, 1) for z in range(f)]
         f1 = random.random() + 1
         f2 = random.random() + 1
         x = [f1 * pi + random.gauss(0, 1e-2) for pi in p]
         y = [f2 * pi + random.gauss(0, 1e-2) for pi in p]
         i.add_item(j, x)
         i.add_item(j+1, y)

     i.build(10)
     for j in range(0, 10000, 2):
         self.assertEqual(i.get_nns_by_item(j, 2), [j, j+1])
         self.assertEqual(i.get_nns_by_item(j+1, 2), [j+1, j])
Пример #15
0
    def t1est_large_index(self):
        """Disabled ('t1est' prefix) pair-neighbour check against the
        on-disk index variant. Fix: Python-2-only xrange replaced by range.

        NOTE(review): there is no explicit build step before querying —
        presumably this fork builds incrementally on add_item; confirm.
        """
        os.system("rm -rf test_db")
        os.system("mkdir test_db")
        # Generate pairs of random points where the pair is super close
        f = 10
        i = AnnoyIndex(f, 10, "test_db", 10, 1000, 3048576000)
        for j in range(0, 10000, 2):
            p = [random.gauss(0, 1) for z in range(f)]
            f1 = random.random() + 1
            f2 = random.random() + 1
            x = [f1 * pi + random.gauss(0, 1e-2) for pi in p]
            y = [f2 * pi + random.gauss(0, 1e-2) for pi in p]
            i.add_item(j, x)
            i.add_item(j+1, y)

        for j in range(0, 10000, 2):
            self.assertEqual(i.get_nns_by_item(j, 2), [j, j+1])
            self.assertEqual(i.get_nns_by_item(j+1, 2), [j+1, j])
Пример #16
0
 def test_include_dists_check_ranges(self):
     """Returned distances must stay within the metric's valid range, with
     self-distance ~0. Fix: Python-2-only xrange replaced by range."""
     f = 3
     i = AnnoyIndex(f)
     for j in range(100000):
         i.add_item(j, numpy.random.normal(size=f))
     i.build(10)
     indices, dists = i.get_nns_by_item(0, 100000, include_distances=True)
     self.assertTrue(max(dists) < 2.0)
     self.assertAlmostEqual(min(dists), 0.0)
Пример #17
0
    def test_get_nns_by_item(self):
        """Euclidean neighbour ordering for three fixed 2-d points."""
        index = AnnoyIndex(2, 'euclidean')
        for item, vec in enumerate([[2, 2], [3, 2], [3, 3]]):
            index.add_item(item, vec)
        index.build(10)

        self.assertEqual(index.get_nns_by_item(0, 3), [0, 1, 2])
        self.assertEqual(index.get_nns_by_item(2, 3), [2, 1, 0])
Пример #18
0
    def test_get_nns_by_item(self):
        """Angular neighbour ordering for three fixed 3-d points.
        Fix: deprecated assertEquals alias replaced by assertEqual."""
        f = 3
        i = AnnoyIndex(f)
        i.add_item(0, [2,1,0])
        i.add_item(1, [1,2,0])
        i.add_item(2, [0,0,1])
        i.build(10)

        self.assertEqual(i.get_nns_by_item(0, 3), [0,1,2])
        self.assertEqual(i.get_nns_by_item(1, 3), [1,0,2])
Пример #19
0
    def test_get_nns_search_k(self):
        """Passing an explicit search_k must not change the obvious
        orderings for axis-aligned unit vectors."""
        index = AnnoyIndex(3)
        for item, vec in enumerate(([0, 0, 1], [0, 1, 0], [1, 0, 0])):
            index.add_item(item, vec)
        index.build(10)

        self.assertEqual(index.get_nns_by_item(0, 3, 10), [0, 1, 2])
        self.assertEqual(index.get_nns_by_vector([3, 2, 1], 3, 10), [2, 1, 0])
Пример #20
0
    def test_include_dists(self):
        """A vector and its negation under euclidean: distance to self
        is zero and the ordering is [self, other]."""
        dims = 40
        index = AnnoyIndex(dims, 'euclidean')
        vec = numpy.random.normal(size=dims)
        index.add_item(0, vec)
        index.add_item(1, -vec)
        index.build(10)

        indices, dists = index.get_nns_by_item(0, 2, 10, True)
        self.assertEqual(indices, [0, 1])
        self.assertAlmostEqual(dists[0], 0.0)
Пример #21
0
    def test_include_dists(self):
        """Double checking issue 112: a vector and its negation under the
        default (angular) metric are at distance 2; self-distance is 0."""
        dims = 40
        index = AnnoyIndex(dims)
        vec = numpy.random.normal(size=dims)
        index.add_item(0, vec)
        index.add_item(1, -vec)
        index.build(10)

        indices, dists = index.get_nns_by_item(0, 2, 10, True)
        self.assertEqual(indices, [0, 1])
        self.assertAlmostEqual(dists[0], 0.0)
        self.assertAlmostEqual(dists[1], 2.0)
Пример #22
0
 def test_distance_consistency(self):
     """Manhattan distances reported by queries must agree with
     get_distance and with two direct L1 computations."""
     n, f = 1000, 3
     index = AnnoyIndex(f, 'manhattan')
     for item in range(n):
         index.add_item(item, numpy.random.normal(size=f))
     index.build(10)
     for a in random.sample(range(n), 100):
         neighbours, dists = index.get_nns_by_item(a, 100, include_distances=True)
         for b, dist in zip(neighbours, dists):
             self.assertAlmostEqual(dist, index.get_distance(a, b))
             u = numpy.array(index.get_item_vector(a))
             v = numpy.array(index.get_item_vector(b))
             self.assertAlmostEqual(dist, numpy.sum(numpy.fabs(u - v)))
             self.assertAlmostEqual(dist, sum([abs(float(x) - float(y)) for x, y in zip(u, v)]))
Пример #23
0
 def test_distance_consistency(self):
     """Dot-product 'distances' must match numpy.dot of the stored vectors
     and get_distance exactly."""
     n, f = 1000, 3
     index = AnnoyIndex(f, 'dot')
     for item in range(n):
         index.add_item(item, numpy.random.normal(size=f))
     index.build(10)
     for a in random.sample(range(n), 100):
         neighbours, dists = index.get_nns_by_item(a, 100, include_distances=True)
         for b, dist in zip(neighbours, dists):
             expected = numpy.dot(index.get_item_vector(a), index.get_item_vector(b))
             self.assertAlmostEqual(dist, expected)
             self.assertEqual(dist, index.get_distance(a, b))
Пример #24
0
 def test_distance_consistency(self):
     """Euclidean distances must agree with get_distance and with two direct
     computations. Fix: Python-2-only xrange replaced by range."""
     n, f = 1000, 3
     i = AnnoyIndex(f, 'euclidean')
     for j in range(n):
         i.add_item(j, numpy.random.normal(size=f))
     i.build(10)
     for a in random.sample(range(n), 100):
         indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
         for b, dist in zip(indices, dists):
             self.assertAlmostEqual(dist, i.get_distance(a, b))
             u = numpy.array(i.get_item_vector(a))
             v = numpy.array(i.get_item_vector(b))
             self.assertAlmostEqual(dist, numpy.dot(u - v, u - v) ** 0.5)
             self.assertAlmostEqual(dist, sum([(x-y)**2 for x, y in zip(u, v)])**0.5)
Пример #25
0
 def test_save_load(self):
     """A hamming index must round-trip through save()/load(), and the
     reloaded index's distances must equal the bit-difference count.
     Fix: deprecated assertEquals alias replaced by assertEqual."""
     f = 100
     i = AnnoyIndex(f, 'hamming')
     u = numpy.random.binomial(1, 0.5, f)
     v = numpy.random.binomial(1, 0.5, f)
     i.add_item(0, u)
     i.add_item(1, v)
     i.build(10)
     i.save('blah.ann')
     j = AnnoyIndex(f, 'hamming')
     j.load('blah.ann')
     rs, ds = j.get_nns_by_item(0, 99, include_distances=True)
     self.assertEqual(rs, [0, 1])
     self.assertAlmostEqual(ds[0], 0)
     # For 0/1 vectors the squared euclidean norm equals the hamming distance.
     self.assertAlmostEqual(ds[1], numpy.dot(u-v, u-v))
Пример #26
0
 def run(self):
     """Worker loop: for each row index in this worker's half-open slice
     self.data_indices, query self.k approximate neighbours from the shared
     angular index and push an IndexNeighbours record onto the results queue.

     Any exception is stashed on self.exception for the parent to inspect;
     the results queue is always closed on exit.
     """
     try:
         index = AnnoyIndex(self.n_dims, metric='angular')
         index.load(self.index_filepath)
         for i in range(self.data_indices[0], self.data_indices[1]):
             neighbour_indexes = index.get_nns_by_item(
                 i, self.k, search_k=self.search_k, include_distances=False)
             # uint32 keeps the per-row payload compact for the queue.
             neighbour_indexes = np.array(neighbour_indexes,
                                          dtype=np.uint32)
             self.results_queue.put(
                 IndexNeighbours(row_index=i,
                                 neighbour_list=neighbour_indexes))
     except Exception as e:
         # Swallow here, surface later: the parent checks self.exception.
         self.exception = e
     finally:
         self.results_queue.close()
Пример #27
0
 def test_distance_consistency(self):
     """Manhattan distances must agree with get_distance and with direct L1
     computations. Fix: Python-2-only xrange replaced by range."""
     n, f = 1000, 3
     i = AnnoyIndex(f, 'manhattan')
     for j in range(n):
         i.add_item(j, numpy.random.normal(size=f))
     i.build(10)
     for a in random.sample(range(n), 100):
         indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
         for b, dist in zip(indices, dists):
             self.assertAlmostEqual(dist, i.get_distance(a, b))
             u = numpy.array(i.get_item_vector(a))
             v = numpy.array(i.get_item_vector(b))
             self.assertAlmostEqual(dist, numpy.sum(numpy.fabs(u - v)))
             self.assertAlmostEqual(
                 dist,
                 sum([abs(float(x) - float(y)) for x, y in zip(u, v)]))
Пример #28
0
    def debug():
        """Smoke test: index 1000 random 40-d Gaussian vectors, save the
        index, reload it and print item 0's 1000 nearest neighbours.
        Fix: Python-2-only xrange replaced by range."""
        f = 40
        t = AnnoyIndex(f)  # Length of item vector that will be indexed
        for i in range(1000):
            v = [random.gauss(0, 1) for z in range(f)]
            t.add_item(i, v)

        t.build(10)  # 10 trees
        t.save('test.ann')

        # ...
        u = AnnoyIndex(f)
        u.load('test.ann')  # super fast, will just mmap the file
        print(u.get_nns_by_item(0,
                                1000))  # will find the 1000 nearest neighbors
Пример #29
0
def test_build_sparse_annoy_index(annoy_index_file):
    """An index built from sparse 0/1 input must exist on disk and agree
    with a freshly loaded copy on size and neighbour queries."""
    dense = np.random.choice([0, 1], size=(10, 5))
    sparse_data = csr_matrix(dense)

    index = build_annoy_index(sparse_data, annoy_index_file)
    assert os.path.exists(annoy_index_file)

    loaded_index = AnnoyIndex(5, metric='angular')
    loaded_index.load(annoy_index_file)

    assert index.f == loaded_index.f == 5
    assert index.get_n_items() == loaded_index.get_n_items() == 10
    assert index.get_nns_by_item(0, 5) == loaded_index.get_nns_by_item(0, 5)

    index.unload()
    loaded_index.unload()
Пример #30
0
def get_nearest_neighbor_and_similarity(predictions, image_count, master_image, save_path, model):
    """Index all prediction vectors in Annoy and return, for ONE item, its
    nearest neighbours and their cosine similarities to master_image.

    NOTE(review): the second loop reassigns `nearest_neighbors` on every
    iteration, so only the LAST item's neighbour list survives it — the
    similarity loop below then runs for that single item only. Looks like an
    unfinished refactor (`named_nearest_neighbors` is also never used, and
    `j` is reused as a different loop variable); confirm intent with callers.
    """
    # Embedding width depends on which CNN produced `predictions`.
    dims = 1000
    if model == Models.VGG19_MODEL:
        dims = 4096
    elif model == Models.INCEPTION_3:
        dims = 1000
    elif model == Models.INCEPTION_RESNET_V2:
        dims = 25088

    # +1 because the query item itself is returned as its own first neighbour.
    n_nearest_neighbors = image_count + 1
    trees = 10000
    file_index_to_file_vector = {}

    # Build an index (Approximate Nearest Neighbours)
    t = AnnoyIndex(dims)
    # for i in range(preds.shape[0]):
    i = 0
    j = 0
    for l in predictions:
        file_vector = predictions[i]
        file_index_to_file_vector[i] = file_vector
        t.add_item(i, file_vector)
        i += 1
    t.build(trees)
#    t.save(save_path)

    # for i in range(preds.shape[0]):
    for o in predictions:
        master_vector = file_index_to_file_vector[j]
        # Here we assign master vector, SHOULD be one K

        named_nearest_neighbors = []
        similarities = []
        nearest_neighbors = t.get_nns_by_item(j, n_nearest_neighbors)
        j += 1

    # Next we print all the neighbours on one axis, should redo new master and nearest for the second axis to plot
    for j in nearest_neighbors:
        #         print (j)
        neighbor_vector = predictions[j]
        # The distance between objects,/ similarity, cosine for vinkel
        # similarity = 1 - spatial.distance.cosine(master_vector, neighbor_vector)
        similarity = 1 - spatial.distance.cosine(master_image, neighbor_vector)
        # Truncate (not round) to 4 decimal places.
        rounded_similarity = int((similarity * 10000)) / 10000.0
        similarities.append(rounded_similarity)
    return similarities, nearest_neighbors
Пример #31
0
    def get_similar_items(self, product_id: int,
                          rec_type: int) -> pd.DataFrame:
        '''
        Function that creates recommendation lists.

        The intuition behind using less components is reducing the number of latent factors
        that can be inferred. And, by excluding item features for the CAB model, recommendations
        will be less based off explicit features such as `aisle` and `department`.
        -------------------
        type:
        1 - Similar Items [DEFAULT_PARAMS]
        2 - Complement Items [CAB_PARAMS]

        Returns a DataFrame of item rows (positional lookup into self.item_df)
        for the recommended neighbours of `product_id`.
        '''
        logging.info(
            f'Logging recommendations for {self.model.config.ANNOY_PARAMS[rec_type]}'
        )
        # Pick the Annoy index matching the embedding that produced it; the
        # dimensionality must equal the LightFM model's no_components.
        if rec_type == 1:
            annoy_model = AnnoyIndex(
                self.model.config.LIGHTFM_PARAMS['no_components'])
            annoy_model.load(self.config.PATHS.models + '/item.ann')
        elif rec_type == 2:
            annoy_model = AnnoyIndex(
                self.model.config.LIGHTFM_CAB_PARAMS['no_components'])
            annoy_model.load(self.config.PATHS.models + '/item_cab.ann')
        # NOTE(review): rec_type outside {1, 2} leaves annoy_model unbound and
        # raises NameError below — confirm callers validate rec_type.
        similar_variants = annoy_model.get_nns_by_item(
            product_id,
            self.model.config.ANNOY_PARAMS['nn_count'],
            search_k=-1,
            include_distances=False)

        logging.info(type(similar_variants))
        logging.info(similar_variants)
        # Positional indexing: Annoy ids are assumed to match item_df row
        # order — TODO confirm against the index-building code.
        similar_variants_df = self.item_df.iloc[similar_variants, :]

        similarVariantsTable = PrettyTable(
            ['product_id', 'product_name', 'aisle', 'department', 'num'])
        similarVariantsTable.add_row([
            similar_variants_df['product_id'],
            similar_variants_df['product_name'], similar_variants_df['aisle'],
            similar_variants_df['department'], similar_variants_df['num']
        ])
        logging.info(
            f'{self.model.config.ANNOY_PARAMS[rec_type]} Data: \n{similarVariantsTable}'
        )

        return similar_variants_df
Пример #32
0
 def test_holes_more(self):
     """With item ids sparsely scattered over a larger range, every
     neighbour returned — by item or by vector — must be an inserted id."""
     f = 10
     index = AnnoyIndex(f)
     valid_indices = random.sample(range(2000), 1000) # leave holes
     for item in valid_indices:
         index.add_item(item, numpy.random.normal(size=(f,)))
     index.build(10)
     for item in valid_indices:
         for neighbour in index.get_nns_by_item(item, 10000):
             self.assertTrue(neighbour in valid_indices)
     for _ in range(1000):
         probe = numpy.random.normal(size=(f,))
         for neighbour in index.get_nns_by_vector(probe, 10000):
             self.assertTrue(neighbour in valid_indices)
Пример #33
0
 def test_random_holes(self):
     """Item ids with random gaps: queries by item and by vector must only
     ever return ids that were actually added."""
     dims = 10
     idx = AnnoyIndex(dims)
     valid_indices = random.sample(range(2000), 1000) # leave holes
     for item_id in valid_indices:
         idx.add_item(item_id, numpy.random.normal(size=(dims,)))
     idx.build(10)
     for item_id in valid_indices:
         returned = idx.get_nns_by_item(item_id, 10000)
         for r in returned:
             self.assertTrue(r in valid_indices)
     for _ in range(1000):
         query = numpy.random.normal(size=(dims,))
         returned = idx.get_nns_by_vector(query, 10000)
         for r in returned:
             self.assertTrue(r in valid_indices)
Пример #34
0
def main():
    """Build a 10-NN cosine graph over DATA with Annoy, lay it out with
    tmap, and render an interactive Faerun scatter + tree plot."""

    # Building a k-nearest neighbor graph using annoy and cosine distance
    annoy = AnnoyIndex(len(DATA.columns), metric="angular")
    annoy_graph = []

    for i, v in enumerate(DATA.values):
        annoy.add_item(i, v)
    annoy.build(10)

    # Edge list (src, dst, weight); weight recomputed as exact cosine
    # distance rather than Annoy's angular approximation.
    for i in range(len(DATA)):
        for j in annoy.get_nns_by_item(i, 10):
            annoy_graph.append(
                (i, j, cosine_distance(DATA.values[i], DATA.values[j])))

    # Creating the tmap layout
    x, y, s, t, _ = tm.layout_from_edge_list(len(DATA), annoy_graph)

    faerun = Faerun(view="front", coords=False)
    faerun.add_scatter(
        "MINIBOONE",
        {
            "x": x,
            "y": y,
            "c": LABELS,
            "labels": LABELS
        },
        shader="smoothCircle",
        colormap="Set1",
        point_scale=2.0,
        max_point_size=20,
        has_legend=True,
        categorical=True,
        # NOTE(review): this is a set literal of tuples; ordering is
        # undefined — confirm Faerun accepts a set here (a list may be safer).
        legend_labels={(0, "Noise"), (1, "Signal")},
    )
    # Tree edges from the tmap layout connect the scatter points.
    faerun.add_tree(
        "MINIBOONE_tree",
        {
            "from": s,
            "to": t
        },
        point_helper="MINIBOONE",
        color="#666666",
    )
    faerun.plot("miniboone", template="default")
Пример #35
0
    def test1(self):
        """Load a prebuilt sentence-embedding index ('articles') and print
        the rows nearest to row 10.

        NOTE(review): the names are misleading — get_nns_by_item with
        include_distances=True returns (ids, distances), so `index` here
        holds distances, not indices. np.cos(index) presumably converts
        angular distances back toward a similarity — confirm the intent.
        """
        rows = self.query_country_name('%')
        annoyIndex = AnnoyIndex(768)  # 768 matches the BERT embedding width used when the index was built
        # for i,row in enumerate(rows):
        #     encode=self.bc.encode([row[1]])
        #     annoyIndex.add_item(i,encode[0])
        # annoyIndex.build(10)
        # annoyIndex.save('articles')
        annoyIndex.load('articles')
        result, index = annoyIndex.get_nns_by_item(10,
                                                   5,
                                                   include_distances=True)
        print(rows[10])
        print(np.cos(index))
        for i in result:

            print(rows[i])
Пример #36
0
 def test_distance_consistency(self):
     """Euclidean distances must agree with get_distance and with two
     direct computations. Fix: Python-2-only xrange replaced by range."""
     n, f = 1000, 3
     i = AnnoyIndex(f, 'euclidean')
     for j in range(n):
         i.add_item(j, numpy.random.normal(size=f))
     i.build(10)
     for a in random.sample(range(n), 100):
         indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
         for b, dist in zip(indices, dists):
             self.assertAlmostEqual(dist, i.get_distance(a, b))
             u = numpy.array(i.get_item_vector(a))
             v = numpy.array(i.get_item_vector(b))
             self.assertAlmostEqual(dist, numpy.dot(u - v, u - v)**0.5)
             self.assertAlmostEqual(
                 dist,
                 sum([(x - y)**2 for x, y in zip(u, v)])**0.5)
Пример #37
0
class Index:
    """Image-similarity index backed by Annoy over precomputed vectors.

    Expects a directory containing vectors.npy, names.txt and optionally
    descriptions.txt plus an images/ subdirectory; builds and caches the
    Annoy index file (vectors.npy.ann) on first use.
    """

    def __init__(self, directory):
        """Load vectors, build the Annoy index if missing, and read the
        image-name and description sidecar files."""
        print('loading index from "%s"' % directory)
        self.vector_filename = directory + '/vectors.npy'
        self.index_filename = self.vector_filename + '.ann'
        self.images_filename = directory + '/names.txt'
        self.description_filename = directory + '/descriptions.txt'
        self.image_directory = directory + '/images'

        # mmap keeps large vector files out of resident memory.
        self.vectors = np.load(self.vector_filename, mmap_mode='r')
        dim = self.vectors.shape[1]
        if not os.path.isfile(self.index_filename):
            print('building index')
            index = AnnoyIndex(dim, metric='dot')
            for i in range(self.vectors.shape[0]):
                index.add_item(i, self.vectors[i])
                if i % 1000 == 0:
                    print(i)  # progress
            index.build(20)
            index.save(self.index_filename)
        self.index = AnnoyIndex(dim, metric='dot')
        self.index.load(self.index_filename)

        # names.txt: one tab-separated line per image; first field is the name.
        with open(self.images_filename) as fp:
            self.images = [x.strip().split('\t')[0] for x in fp.readlines()]

        # defaultdict so missing descriptions read as ''.
        self.descriptions = collections.defaultdict(str)
        if os.path.exists(self.description_filename):
            with open(self.description_filename) as fp:
                for i, line in enumerate(fp.readlines()):
                    self.descriptions[i] = line.strip()

    def closest(self, i, n=10):
        """Return [(score, index), ...] for item i's n nearest neighbours,
        prefixed with (1, i) for the query item itself."""
        return [(1, i)] + list(
            map(lambda x: (x[1], x[0]),
                zip(*self.index.get_nns_by_item(i, n,
                                                include_distances=True))))

    def image(self, i):
        """Return the path of image i, or None when i is out of range."""
        if i >= 0 and i < len(self.images):
            return self.image_directory + '/' + self.images[i]
        return None

    def description(self, i):
        """Return image i's description ('' when absent)."""
        return self.descriptions[i]
Пример #38
0
def nearest_neighbors(collection, num_neighbors=10, n_trees=100):
    """
    Finds the num_neighbors nearest neighbors to each cell in the sparse matrix

    Return result is a dictionary of lists, where the key is an index into the cells,
    and the value is the neighbors of that cell
    """
    nn_idx = AnnoyIndex(collection.num_genes())
    # Add the elements in reverse order because Annoy allocates the memory based on
    # the value of the element added - so adding in increasing order will trigger
    # lots of allocations
    for cell in reversed(range(collection.num_cells())):
        nn_idx.add_item(cell, collection.get_cell_expression_vector(cell))
    nn_idx.build(n_trees)
    neighbours = {}
    for cell in range(collection.num_cells()):
        neighbours[cell] = nn_idx.get_nns_by_item(cell, num_neighbors)
    return neighbours
Пример #39
0
 def test_distance_consistency(self):
     """Angular distances must agree with get_distance and with the
     euclidean distance between the normalized vectors.
     Fix: Python-2-only xrange replaced by range."""
     n, f = 1000, 3
     i = AnnoyIndex(f)
     for j in range(n):
         i.add_item(j, numpy.random.normal(size=f))
     i.build(10)
     for a in random.sample(range(n), 100):
         indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
         for b, dist in zip(indices, dists):
             self.assertAlmostEqual(dist, i.get_distance(a, b))
             u = i.get_item_vector(a)
             v = i.get_item_vector(b)
             u_norm = numpy.array(u) * numpy.dot(u, u)**-0.5
             v_norm = numpy.array(v) * numpy.dot(v, v)**-0.5
             # cos = numpy.clip(1 - cosine(u, v), -1, 1) # scipy returns 1 - cos
             self.assertAlmostEqual(dist, numpy.dot(u_norm - v_norm, u_norm - v_norm) ** 0.5)
             # self.assertAlmostEqual(dist, (2*(1 - cos))**0.5)
             self.assertAlmostEqual(dist, sum([(x-y)**2 for x, y in zip(u_norm, v_norm)])**0.5)
Пример #40
0
 def nearest_neighbor_search(self, GE_csc):
     """Angular k-NN for every row of GE_csc, with K = 2 * num_of_neighbor.
     The index round-trips through a temporary file ('test.ann') that is
     removed after loading. Returns (indices, distances) of shape (n, K)."""
     K = self.num_of_neighbor * 2
     n, d = GE_csc.shape
     builder = AnnoyIndex(d, "angular")
     for row in range(n):
         builder.add_item(row, GE_csc[row, :])
     builder.build(100)
     builder.save('test.ann')
     searcher = AnnoyIndex(d, "angular")
     searcher.load('test.ann')
     os.remove('test.ann')
     val = np.zeros((n, K))
     ind = np.zeros((n, K))
     for row in range(n):
         neighbours, dists = searcher.get_nns_by_item(row, K, include_distances=True)
         ind[row, :] = neighbours
         val[row, :] = dists
     return ind.astype('int'), val
Пример #41
0
 def test_distance_consistency(self):
     """Angular distances must agree with get_distance and with the
     euclidean distance between normalized vectors.
     Fix: Python-2-only xrange replaced by range."""
     n, f = 1000, 3
     i = AnnoyIndex(f)
     for j in range(n):
         i.add_item(j, numpy.random.normal(size=f))
     i.build(10)
     for a in random.sample(range(n), 100):
         indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
         for b, dist in zip(indices, dists):
             self.assertAlmostEqual(dist, i.get_distance(a, b))
             u = i.get_item_vector(a)
             v = i.get_item_vector(b)
             u_norm = numpy.array(u) * numpy.dot(u, u)**-0.5
             v_norm = numpy.array(v) * numpy.dot(v, v)**-0.5
             # cos = numpy.clip(1 - cosine(u, v), -1, 1) # scipy returns 1 - cos
             self.assertAlmostEqual(dist, numpy.dot(u_norm - v_norm, u_norm - v_norm) ** 0.5)
             # self.assertAlmostEqual(dist, (2*(1 - cos))**0.5)
             self.assertAlmostEqual(dist, sum([(x-y)**2 for x, y in zip(u_norm, v_norm)])**0.5)
Пример #42
0
def cross_community_search(save_name, dimension=8):
    """Predict user ratings via Annoy nearest neighbors in embedding space,
    evaluate the predictions, and append the metrics to a CSV file.

    Args:
        save_name: filename of a saved Annoy index under saved_objs_path.
        dimension: embedding dimensionality the index was built with.

    Side effects: appends one metrics row to 'eval_movielens.csv'.
    """
    predictions = []
    t = AnnoyIndex(dimension, metric='euclidean')
    t.load(saved_objs_path + '/' + save_name)

    # Node ids <= user_threshold are user nodes -- TODO confirm against the
    # graph construction; ids 1682..1682+942 are iterated as source nodes.
    user_threshold = 1682
    close_nb_amount = 30  # candidate neighbors fetched per source node
    ds = get_dataset()
    g = ds.original_graph
    for src_node in range(1682, 1682 + 942):
        nn_nodes, distances = t.get_nns_by_item(src_node,
                                                close_nb_amount,
                                                include_distances=True)
        for idx, dst_node in enumerate(nn_nodes):
            # remove non-user node
            if dst_node - user_threshold > 0:
                continue

            # cal weigh of each path
            paths = nx.shortest_path(g,
                                     source=src_node,
                                     target=dst_node,
                                     weight='score')
            path_length = nx.shortest_path_length(g,
                                                  source=src_node,
                                                  target=dst_node,
                                                  weight='score')
            if path_length == 0:
                continue
            print(idx, paths, path_length)
            # The first hop out of src_node carries the observed rating.
            real_rating = g[src_node][paths[1]]['score']
            pred_rating = path_length / len(paths)
            pred = Prediction(src_node, paths[1], real_rating, pred_rating, {})
            predictions.append(pred)

    rmse, prec, rec, ils_sim = evaluate_pred(g, predictions)
    ds_name = 'movielens'
    algo_name = 'GraphRec'
    with open(f'eval_{ds_name}.csv', 'a') as f:
        f.write(
            f'{ds_name}_{algo_name},rmse,{rmse},precision,{prec},recall,{rec},ils,{ils_sim}\n'
        )
def annoy_search(ref_index, features, tree_size, recommended_item_size,
                 _metric):
    """Build an Annoy index over *features* and return the items most
    similar to the one at *ref_index*, plus the elapsed wall time.

    Returns:
        (similar_img_ids, distances, annoy_runtime)
    """
    t0 = time.time()
    # Dimensionality is taken from the reference row of the global df.
    ref_vector = df.loc[ref_index]['features']
    t = AnnoyIndex(len(ref_vector), metric=_metric)
    print("Metric use: ", _metric)

    for item_id, vector in enumerate(features):
        t.add_item(item_id, vector)

    t.build(tree_size, n_jobs=-1)
    similar_img_ids, distances = t.get_nns_by_item(
        ref_index, recommended_item_size, include_distances=True)
    annoy_runtime = time.time() - t0
    print("ANNOY Runtime: ", annoy_runtime)
    return similar_img_ids, distances, annoy_runtime
Пример #44
0
    def search(self, seed, k=5):
        '''Return the k labels nearest to *seed* in word2vec space.

        seed: seed item to find nearest neighbors for
        k: number of closest neighbors
        '''

        a = AnnoyIndex(self.dimension, 'angular')

        words = self.data['user'].unique().tolist() + self.data['item'].unique(
        ).tolist()
        le = preprocessing.LabelEncoder()
        le.fit(words)
        # Encode all words in a single transform call instead of one call per
        # word -- same ids, avoids repeated O(log n) lookups per item.
        for word, word_id in zip(words, le.transform(words)):
            a.add_item(word_id, self.w2v_model.wv[word])

        # NOTE(review): build(-1) looks suspicious as a tree count -- confirm
        # it is intended rather than a positive number of trees.
        a.build(-1)

        a_return = a.get_nns_by_item(le.transform([seed])[0], k)
        return le.inverse_transform(a_return)
Пример #45
0
    def generate_neighbors(self, neighbors, **kwargs):
        """Build self.neighbor_map: for each sample, the ids of its
        *neighbors* approximate nearest neighbors (scalar samples indexed
        as 1-D vectors).

        Keyword args: metric (default 'euclidean'), num_trees (default 10).
        """
        dimension = 1  # each item is indexed as a length-1 vector
        metric = kwargs.get('metric', 'euclidean')
        num_trees = kwargs.get('num_trees', 10)

        # Index every sample's scalar value.
        index = AnnoyIndex(dimension, metric)
        for sample_id in range(self.num_samples):
            index.add_item(sample_id, [self.samples[sample_id].item()])
        index.build(num_trees)

        # One row of neighbor ids per sample.
        neighbor_rows = np.zeros((self.num_samples, neighbors))
        for sample_id in range(self.num_samples):
            neighbor_rows[sample_id, :] = index.get_nns_by_item(sample_id, neighbors)

        self.neighbor_map = neighbor_rows.astype(int)
Пример #46
0
def representative_sample(X, num_samples, save=False):
    """Sample vectors in X, preferring edge cases and vectors farthest from
    other vectors already in the sample set.

    Parameters
    ----------
    X : array-like of shape (N, M)
        Row vectors to sample from.
    num_samples : int
        Number of row indices to return.
    save : bool | str | bytes
        If truthy, persist the Annoy index (to the given path when a
        string/bytes, otherwise to a temp file) and reload it.

    Returns
    -------
    ndarray of int
        Selected row indices; unfilled slots remain -1.
    """
    X = X.values if hasattr(X, 'values') else np.array(X)
    N, M = X.shape
    rownums = np.arange(N)
    np.random.shuffle(rownums)

    idx = AnnoyIndex(M)
    for i, row in enumerate(X):
        idx.add_item(i, row)
    idx.build(int(np.log2(N)) + 1)

    if save:
        if isinstance(save, (bytes, str)):
            idxfilename = save
        else:
            idxfile = tempfile.NamedTemporaryFile(delete=False)
            idxfile.close()
            idxfilename = idxfile.name
        idx.save(idxfilename)
        idx = AnnoyIndex(M)
        # BUG FIX: previously loaded from idxfile.name, which is undefined
        # when `save` is a string path.
        idx.load(idxfilename)

    samples = -1 * np.ones(shape=(num_samples,), dtype=int)
    samples[0] = rownums[0]
    # FIXME: some integer determined by N and num_samples and distribution
    j, num_nns = 0, min(1000, int(num_samples / 2. + 1))
    for i in rownums:
        # BUG FIX: stop once the output array is full instead of writing
        # past the end.
        if j + 1 >= num_samples:
            break
        if i in samples:
            continue
        nns = idx.get_nns_by_item(i, num_nns)
        # FIXME: pick vector furthest from past K (K > 1) points or outside of a hypercube (sized to uniformly fill the space) around the last sample
        candidates = np.setdiff1d(nns, samples)
        # BUG FIX: the original's bare `except:` executed the no-op
        # `samples[j + 1]` and still advanced j, leaving -1 holes; only
        # advance when a new candidate was actually found.
        if len(candidates):
            samples[j + 1] = candidates[-1]
            j += 1
        # BUG FIX: the original compared len(num_nns) -- a TypeError since
        # num_nns is an int -- and produced a float count via 1.3 * num_nns.
        if num_nns < num_samples / 3.:
            num_nns = min(N, int(1.3 * num_nns))
    return samples
Пример #47
0
 def test_holes_more(self):
     """Index with gaps in the item ids; every query must return only ids
     that were actually added.
     """
     f = 10
     index = AnnoyIndex(f)
     valid_indices = set()
     for n in range(1000):
         item_id = int(n*2**-0.5) # leave holes every few items
         valid_indices.add(item_id)
         index.add_item(item_id, numpy.random.normal(size=(f,)))
     index.build(10)
     # Item-based queries never surface an id from a hole.
     for item_id in valid_indices:
         for neighbor in index.get_nns_by_item(item_id, 10000):
             self.assertTrue(neighbor in valid_indices)
     # Vector-based queries with random probes behave the same way.
     for _ in range(1000):
         probe = numpy.random.normal(size=(f,))
         for neighbor in index.get_nns_by_vector(probe, 10000):
             self.assertTrue(neighbor in valid_indices)
Пример #48
0
def get_hardest_negatives(samples_data, train_index, dim):
    """For every sample, pick the nearest question that belongs to a
    different table and emit it as a label-0 (negative) example.

    Args:
        samples_data: sequence of dicts with 'table_id' and 'question_tokens'.
        train_index: path of a saved angular Annoy index aligned with
            samples_data by position.
        dim: dimensionality of the stored vectors.
    """
    annoy_index = AnnoyIndex(dim, 'angular')
    annoy_index.load(train_index)
    hardest_negatives = []
    for position, sample in enumerate(samples_data):
        table_id = sample['table_id']
        # Walk the 1000 nearest questions; the first one from a different
        # table is this sample's hardest negative.
        for neighbor_pos in annoy_index.get_nns_by_item(position, 1000):
            neighbor = samples_data[neighbor_pos]
            if neighbor['table_id'] != table_id:
                hardest_negatives.append({
                    'table_id': table_id,
                    'question_tokens': neighbor['question_tokens'],
                    'label': 0.0
                })
                break
    return hardest_negatives
Пример #49
0
    def test_get_nns_with_distances(self):
        """Manhattan metric: queries return (ids, L1 distances) in order."""
        dims = 3
        index = AnnoyIndex(dims, 'manhattan')
        for item_id, vector in enumerate([[0, 0, 2], [0, 1, 1], [1, 0, 0]]):
            index.add_item(item_id, vector)
        index.build(10)

        ids, dists = index.get_nns_by_item(0, 3, -1, True)
        self.assertEqual(ids, [0, 1, 2])
        for got, want in zip(dists, [0.0, 2.0, 3.0]):
            self.assertAlmostEqual(got, want)

        ids, dists = index.get_nns_by_vector([2, 2, 1], 3, -1, True)
        self.assertEqual(ids, [1, 2, 0])
        for got, want in zip(dists, [3.0, 4.0, 5.0]):
            self.assertAlmostEqual(got, want)
Пример #50
0
    def test_get_nns_with_distances(self):
        """Euclidean metric: check neighbor order and (squared) distances.

        Distances are compared squared to sidestep sqrt rounding.
        """
        dims = 3
        index = AnnoyIndex(dims, 'euclidean')
        for item_id, vector in enumerate([[0, 0, 2], [0, 1, 1], [1, 0, 0]]):
            index.add_item(item_id, vector)
        index.build(10)

        ids, dists = index.get_nns_by_item(0, 3, -1, True)
        self.assertEqual(ids, [0, 1, 2])
        for got, want in zip(dists, [0.0, 2.0, 5.0]):
            self.assertAlmostEqual(got**2, want)

        ids, dists = index.get_nns_by_vector([2, 2, 2], 3, -1, True)
        self.assertEqual(ids, [1, 0, 2])
        for got, want in zip(dists, [6.0, 8.0, 9.0]):
            self.assertAlmostEqual(got**2, want)
Пример #51
0
    def test_get_nns_with_distances(self):
        """Euclidean metric: queries return neighbor ids plus distances.

        Distances are compared squared to avoid sqrt rounding noise.
        """
        f = 3
        i = AnnoyIndex(f, 'euclidean')
        i.add_item(0, [0, 0, 2])
        i.add_item(1, [0, 1, 1])
        i.add_item(2, [1, 0, 0])
        i.build(10)

        # BUG FIX: assertEquals / assertAlmostEquals are deprecated aliases
        # that were removed in Python 3.12 -- use the canonical names.
        l, d = i.get_nns_by_item(0, 3, -1, True)
        self.assertEqual(l, [0, 1, 2])
        self.assertAlmostEqual(d[0]**2, 0.0)
        self.assertAlmostEqual(d[1]**2, 2.0)
        self.assertAlmostEqual(d[2]**2, 5.0)

        l, d = i.get_nns_by_vector([2, 2, 2], 3, -1, True)
        self.assertEqual(l, [1, 0, 2])
        self.assertAlmostEqual(d[0]**2, 6.0)
        self.assertAlmostEqual(d[1]**2, 8.0)
        self.assertAlmostEqual(d[2]**2, 9.0)
Пример #52
0
def generate_pair(X,
                  n_neighbors,
                  n_MN,
                  n_FP,
                  distance='euclidean',
                  verbose=True):
    '''Generate the three pair sets used for training: scaled nearest-neighbor
    pairs, mid-near (MN) pairs, and further (FP) pairs.

    X: (n, dim) data matrix. n_neighbors / n_MN / n_FP: counts per point.
    distance: Annoy metric name. Returns (pair_neighbors, pair_MN, pair_FP,
    tree) where tree is the built Annoy index for later reuse.
    '''
    n, dim = X.shape
    # sample more neighbors than needed
    n_neighbors_extra = min(n_neighbors + 50, n - 1)
    tree = AnnoyIndex(dim, metric=distance)
    if _RANDOM_STATE is not None:
        tree.set_seed(_RANDOM_STATE)
    for i in range(n):
        tree.add_item(i, X[i, :])
    tree.build(20)

    option = distance_to_option(distance=distance)

    nbrs = np.zeros((n, n_neighbors_extra), dtype=np.int32)
    knn_distances = np.empty((n, n_neighbors_extra), dtype=np.float32)

    for i in range(n):
        # Request one extra neighbor and drop the first hit, which is the
        # query point itself.
        nbrs_ = tree.get_nns_by_item(i, n_neighbors_extra + 1)
        nbrs[i, :] = nbrs_[1:]
        for j in range(n_neighbors_extra):
            knn_distances[i, j] = tree.get_distance(i, nbrs[i, j])
    print_verbose("Found nearest neighbor", verbose)
    # Per-point bandwidth: mean distance to the 4th-6th neighbors, floored
    # to avoid division by zero in the scaling step.
    sig = np.maximum(np.mean(knn_distances[:, 3:6], axis=1), 1e-10)
    print_verbose("Calculated sigma", verbose)
    scaled_dist = scale_dist(knn_distances, sig, nbrs)
    print_verbose("Found scaled dist", verbose)
    pair_neighbors = sample_neighbors_pair(X, scaled_dist, nbrs, n_neighbors)
    # Deterministic samplers are used when a global random state is set.
    if _RANDOM_STATE is None:
        pair_MN = sample_MN_pair(X, n_MN, option)
        pair_FP = sample_FP_pair(X, pair_neighbors, n_neighbors, n_FP)
    else:
        pair_MN = sample_MN_pair_deterministic(X, n_MN, _RANDOM_STATE, option)
        pair_FP = sample_FP_pair_deterministic(X, pair_neighbors, n_neighbors,
                                               n_FP, _RANDOM_STATE)
    return pair_neighbors, pair_MN, pair_FP, tree
Пример #53
0
def KNN_Annoy(X, KK):
    """Approximate K-nearest-neighbor search over the rows of X via Annoy.

    Returns (ind, val): (NN, KK) arrays of neighbor row indices and the
    corresponding Euclidean distances.
    """
    NK = KK
    NN, NF = X.shape
    # NOTE(review): this guards KK against the feature count, not the row
    # count -- looks suspicious; confirm the intended bound.
    if KK > NF:
        raise ValueError("KK should be less than 2th-dim of X")

    index = AnnoyIndex(NF, metric='euclidean')
    for row_id, row in enumerate(X):
        index.add_item(row_id, row)
    index.build(100)

    ind, val = [], []
    for row_id in range(NN):
        closest = index.get_nns_by_item(row_id, NK)
        ind.append(closest)
        val.append([index.get_distance(row_id, other) for other in closest])

    return np.array(ind), np.array(val)
Пример #54
0
async def query_index(request, index_name):
    """HTTP handler: return the *count* nearest neighbors of item *id*
    from the named Annoy index, lazily loading and caching the index.

    Query args: id (required, int item id), count (optional, default 10).
    """
    index = loaded.get(index_name)

    if index is None:
        dimmensions = 10  # fixed dimensionality of all served indexes
        index = AnnoyIndex(dimmensions)
        file = (DATA_PATH / index_name).with_suffix('.ann')
        index.load(str(file.absolute()))
        loaded[index_name] = index

    id = int(request.args['id'][0])
    count = request.args.get('count')
    if count is None:
        count = 10
    else:
        # BUG FIX: query-string values arrive as strings; Annoy's
        # get_nns_by_item requires an int count.
        count = int(count)

    items = index.get_nns_by_item(id, count)

    result = {'items': items}
    return json(result)
Пример #55
0
	def find_closest_songs(self,song_id,id_songname_dict):
		"""Print the names of the 10 songs most similar to *song_id*,
		using precomputed latent factors indexed with Annoy.

		song_id: key into the saved prediction map and id_songname_dict.
		id_songname_dict: maps song id -> human-readable song name.
		"""
		layer_outputs = []
		# NOTE(review): get_song_embeddings is built but never used below --
		# presumably left from an earlier embedding-extraction path; confirm.
		get_song_embeddings = Model(inputs=self.model.input,outputs=self.model.get_layer(index=13).output)
		with open('Metadata\\song_id_to_prediction.txt') as f:
			song_id_to_prediction = json.loads(f.read())
			#print(song_id_to_prediction)
		song_ids,song_predictions = zip(*(song_id_to_prediction.items())) #prediction for the given song is the average of all spectrogram latent factors
		#song_predictions = np.array(song_predictions)

		t = AnnoyIndex(self.num_factors,'angular')
		for i in range(len(song_predictions)):
			t.add_item(i,song_predictions[i]) #assign each predicted latent factor and index
		t.build(10)
		closest_songs_indexes = t.get_nns_by_item(song_ids.index(song_id),10)
		print("The most similar songs to {} are:".format(id_songname_dict[song_id]))
		for index in closest_songs_indexes:
			try:
				print(id_songname_dict[song_ids[index]])
			except KeyError:
				# Skip songs that have no name entry in the dictionary.
				pass
Пример #56
0
        def _random_nn(X):
            """Assign each of self.n_clusters labels to a random neighborhood.

            Repeatedly: pick a seed point, label it and its approximate
            nearest neighbors, then choose the next seed with probability
            proportional to softmax of its average distance to all previous
            seeds (favoring distant, unlabeled points). Points never visited
            keep label 0. Returns an (n_samples,) label array.
            """
            idx = AnnoyIndex(X.shape[1], 'euclidean')
            for i in range(X.shape[0]):
                idx.add_item(i, X[i])

            logging.info("building an index with %d items" % X.shape[0])
            idx.build(50)

            logging.info("finding %d neighbor groups" % self.n_clusters)
            seen = {}    # point index -> assigned cluster label
            label = 0

            guess = np.random.randint(X.shape[0])
            centers = {guess: 0}

            while label < self.n_clusters:
                # Label the seed and its neighborhood with the current label.
                neighbors = idx.get_nns_by_item(guess, _get_num_neighbors())
                for point in neighbors:
                    seen[point] = label
                seen[guess] = label

                # find a distant point
                # NOTE(review): recomputes all pairwise distances to every
                # center each iteration -- O(n * n_clusters) Annoy calls.
                dists = np.array([[idx.get_distance(i, j) for i in centers]
                                  for j in range(X.shape[0])])

                avg_dists = np.average(dists, axis=1)
                dist_prob = softmax(avg_dists)

                guess = np.random.choice(X.shape[0], p=dist_prob)

                # Resample until we draw a point that is not yet labeled.
                while guess in seen:
                    guess = np.random.choice(X.shape[0], p=dist_prob)
                centers[guess] = label

                label = label + 1

            y = np.zeros(X.shape[0])

            for k, v in seen.items():
                y[k] = v
            return y
Пример #57
0
 def nearest_neighbor_search(self, GE_csc):
     """Approximate nearest-neighbor search over the rows of GE_csc.

     Builds an Annoy index, round-trips it through a PID-unique file so
     concurrent workers don't collide, and returns each row's
     K = 2 * num_of_neighbor nearest rows with distances.
     """
     K = self.num_of_neighbor * 2
     n, d = GE_csc.shape
     builder = AnnoyIndex(d)
     for row in range(n):
         builder.add_item(row, GE_csc[row, :])
     builder.build(100)
     print('#######OS PROCESS ID#####')
     print(str(os.getpid()))
     # PID-unique filename keeps parallel processes from clobbering each other.
     ann_file = str(os.getpid()) + 'test.ann'
     builder.save(ann_file)
     searcher = AnnoyIndex(d)
     searcher.load(ann_file)
     os.remove(ann_file)
     val = np.zeros((n, K))
     ind = np.zeros((n, K))
     for row in range(n):
         neighbor_ids, neighbor_dists = searcher.get_nns_by_item(
             row, K, include_distances=True)
         ind[row, :] = neighbor_ids
         val[row, :] = neighbor_dists
     return ind.astype('int'), val
Пример #58
0
	def find_nearest(self):
		"""Populate self.nearest: for every customer, a list of
		(neighbor_id, distance) pairs for its approximate nearest neighbors.

		Relies on module-level globals: matrix (customer feature rows),
		num_merchants (vector dimensionality), max_trees, num_neighbors.
		Python 2 code (print statements).
		"""
		ann = AnnoyIndex(num_merchants)
		for customer in self.customers:
			customer_vector = list(matrix.loc[[customer]])
			ann.add_item(customer, customer_vector)
			if customer%200 == 0:
				print 'Adding '+ str(customer)
		print "Building"
		# NOTE(review): tree count is capped by the number of merchants, not
		# customers -- looks intentional but confirm.
		if len(self.merchantIDs) > max_trees:
			ann.build(max_trees)
		else:
			ann.build(len(self.merchantIDs))
		print "...done"
		for customer in self.customers:
			neighbors = ann.get_nns_by_item(customer, num_neighbors)
			if customer%200 == 0:
				print "Found neighbors for " + str(customer)
			self.nearest[customer] = []
			for neighbor in neighbors:
				# Skip the query point itself, which Annoy returns first.
				if neighbor != customer:
					self.nearest[customer].append((neighbor, ann.get_distance(neighbor, customer)))
Пример #59
0
def ann_annoy(data, metric='euclidean',
              n_neighbors=10,
              trees=10):
    """My Approximate Nearest Neighbors function (ANN)
    using the annoy package.

    Parameters
    ----------
    data : ndarray of shape (n_samples, n_features)
        Points to index and query.
    metric : str
        Annoy distance metric name (default 'euclidean').
    n_neighbors : int
        Number of neighbors returned per point.
    trees : int
        Number of Annoy trees to build (more trees = better recall).

    Returns
    -------
    distVals : ndarray of shape (n_samples, n_neighbors)
        Distances to each neighbor.
    idx : ndarray of shape (n_samples, n_neighbors)
        Integer indices of each neighbor.
    """
    datapoints = data.shape[0]
    dimension = data.shape[1]

    # initialize the annoy database
    ann = AnnoyIndex(dimension)

    # store the datapoints
    for (i, row) in enumerate(data):
        ann.add_item(i, row.tolist())

    # build the index
    ann.build(trees)

    # find the k-nearest neighbors for all points
    idx = np.zeros((datapoints, n_neighbors), dtype='int')
    # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin float is the documented replacement.
    distVals = idx.copy().astype(float)

    # extract the distance values
    for i in range(0, datapoints):
        idx[i, :] = ann.get_nns_by_item(i, n_neighbors)

        for j in range(0, n_neighbors):
            distVals[i, j] = ann.get_distance(i, idx[i, j])

    return distVals, idx