def test_celeba_embedding(self):
        """Print the top-10 recall of Annoy and NSW on the CelebA embeddings.

        Input files are taken from the environment variables ``PATHS_JSON``,
        ``EMBEDDING_JSON``, ``INDEX_FILENAME`` and ``TEST_CASES_FILENAME``;
        each one falls back to a file in ``<directory of this file>/../data``.

        The Annoy index is loaded from ``INDEX_FILENAME``; the NSW index is
        built in memory from the loaded embeddings.  For every test case the
        ten paths returned by each index are intersected with the reference
        ``closest_paths_real`` list and the overall hit percentage is printed.
        The only hard checks are that every result list has exactly ten items.
        """
        def default_data_path(filename):
            # Same default as before, written once instead of per variable.
            return os.path.abspath(
                os.path.join(__file__, '..', '..', 'data', filename))

        PATHS_JSON = os.getenv(
            'PATHS_JSON', default_data_path('paths_celeba.json'))
        EMBEDDING_JSON = os.getenv(
            'EMBEDDING_JSON', default_data_path('embeddings_celeba.json'))
        INDEX_FILENAME = os.getenv(
            'INDEX_FILENAME', default_data_path('index_celeba.ann'))
        TEST_CASES_FILENAME = os.getenv(
            'TEST_CASES_FILENAME',
            default_data_path('index_celeba_test_cases.json'))

        with open(PATHS_JSON, 'r') as fp:
            print('Loading paths')
            # numpy array so that a list of Annoy ids can index it directly.
            paths = np.array(json.load(fp))
        with open(EMBEDDING_JSON, 'r') as fp:
            print('Loading embeddings')
            embeddings = json.load(fp)

        with open(TEST_CASES_FILENAME, 'r') as fp:
            print('Loading test_cases')
            test_cases = json.load(fp)

        annoy = AnnoyIndex(len(embeddings[0]))
        # load() fills the index in place; its return value is not needed.
        annoy.load(INDEX_FILENAME)

        print('building nsw index')
        nsw_index = PyNSW('l2')
        print('Creating nodes')
        nodes = [create_node(path, vector)
                 for path, vector in zip(paths, embeddings)]
        print('Inserting nodes')
        for node in tqdm(nodes):
            nsw_index.nn_insert(node, 5, 1000)

        # total: number of reference neighbours seen so far;
        # hits_*: how many of them each index recovered.
        total, hits_annoy, hits_nsw = 0, 0, 0

        print('Calculating accuracy on CelebA')

        for tk in test_cases:
            vector = embeddings[int(tk['embedding_index'])]

            closest_paths_real = tk['closest_paths_real']

            closest_paths_annoy = paths[
                annoy.get_nns_by_vector(vector, 10, 1000)]

            # The loop variable must not reuse the name of the running
            # counter: on Python 2 a list-comprehension variable leaks into
            # the enclosing scope and would overwrite it.
            closest_paths_nsw = [
                hit[1]
                for hit in nsw_index.nn_search(
                    create_node('kek', vector), 5, 10)
            ]

            assert len(closest_paths_real) == 10
            assert len(closest_paths_annoy) == 10
            assert len(closest_paths_nsw) == 10

            total += 10
            hits_annoy += len(
                set(closest_paths_annoy).intersection(closest_paths_real))
            hits_nsw += len(
                set(closest_paths_nsw).intersection(closest_paths_real))

        print('Annoy accuracy on CelebA embeddings: {:.3f}%'.format(
            100.0 * hits_annoy / total))
        print('NSW accuracy on CelebA embeddings: {:.3f}%'.format(
            100.0 * hits_nsw / total))
    def _get_index(self, dataset):
        """Return ``(annoy_index, hdf5_file)`` for the named benchmark dataset.

        The vectors are read from ``<data dir>/<dataset>.hdf5`` where the data
        directory is ``$ACCURACY_TEST_DATA_PATH`` or, by default,
        ``<parent of this file's directory>/data/test``.  A missing file is
        downloaded from vectors.erikbern.com first.

        The Annoy index is cached in ``<dataset>.annoy`` relative to the
        current working directory: it is built and saved when that file does
        not exist and loaded from it otherwise.

        The returned HDF5 file is left open (read-only) for the caller.
        """
        data_dir = os.getenv(
            'ACCURACY_TEST_DATA_PATH',
            join(dirname(dirname(abspath(__file__))), 'data', 'test'))
        vectors_fn = join(data_dir, dataset + '.hdf5')

        if not exists(vectors_fn):
            url = 'http://vectors.erikbern.com/%s.hdf5' % dataset
            print('downloading', url, '->', vectors_fn)
            urlretrieve(url, vectors_fn)

        index_fn = dataset + '.annoy'
        # The dataset is only read, so open it read-only explicitly instead
        # of relying on the h5py version's default mode.
        dataset_f = h5py.File(vectors_fn, 'r')
        distance = dataset_f.attrs['distance']
        train = dataset_f['train']
        f = train.shape[1]  # vector dimensionality
        annoy = AnnoyIndex(f, distance)

        # NOTE(review): this NSW index is built but neither returned nor
        # stored, so only the insertions themselves are exercised here.
        # Kept as-is to preserve behavior; confirm whether it is still wanted.
        print('building nsw index')
        nsw = PyNSW('l2')
        for i in trange(train.shape[0]):
            v = train[i]
            nsw.nn_insert(PyNode(str(i), v), 1, 100)

        if not os.path.exists(index_fn):
            print('adding items', distance, f)
            for i, v in enumerate(train):
                annoy.add_item(i, v)

            print('building annoy index')
            annoy.build(10)
            annoy.save(index_fn)
        else:
            annoy.load(index_fn)
        return annoy, dataset_f
예제 #3
0
    def test_save_load(self):
        """An index saved to disk and loaded back answers searches identically.

        The index from ``self._create_index()`` is saved to a temporary file,
        loaded into a fresh ``PyNSW('l2')`` and both are queried with every
        node in ``self.nodes`` using the same ``random_seed``.  The temporary
        file is removed afterwards, whether or not the assertions pass.
        """
        import os  # local import: only needed for the temp-file cleanup

        print('test_save_load')
        nsw = self._create_index()

        # Only the path is needed; close the handle straight away so the
        # index can write to that path itself.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            index_path = tmp.name

        try:
            nsw.save(index_path)

            loaded_nsw = PyNSW('l2')
            loaded_nsw.load(index_path)

            # compare original and loaded index on different number of
            # iterations
            NUM_ITERS = 1

            for num_iter in range(1, NUM_ITERS + 1):
                for node in self.nodes:
                    self.assertEqual(
                        nsw.nn_search(node, num_iter, 3, random_seed=1334),
                        loaded_nsw.nn_search(
                            node, num_iter, 3, random_seed=1334))
        finally:
            # delete=False means nobody else will clean this file up.
            if os.path.exists(index_path):
                os.remove(index_path)
def create_index(index_path):
    """Load the NSW index stored at ``index_path``, building it if missing.

    When ``index_path`` exists the index is loaded from it.  Otherwise the
    paths and embeddings are read from the module-level ``PATHS_JSON`` and
    ``EMBEDDING_JSON`` files, one node per (path, embedding) pair is inserted
    and the result is saved to ``index_path``.

    Returns:
        The ready-to-use ``PyNSW('l2')`` index.
    """
    index = PyNSW('l2')
    if os.path.exists(index_path):
        index.load(index_path)
    else:
        with open(PATHS_JSON, 'r') as fp:
            print('Loading paths')
            paths = json.load(fp)
        with open(EMBEDDING_JSON, 'r') as fp:
            print('Loading embeddings')
            embeddings = json.load(fp)
        print('Creating nodes')
        nodes = [create_node(path, vector)
                 for path, vector in zip(paths, embeddings)]
        print('Inserting nodes')
        for idx, node in enumerate(nodes):
            # Progress report every 500 insertions.
            if idx % 500 == 0:
                print('{} nodes inserted'.format(idx))
            index.nn_insert(node, 3, 1000)
        index.save(index_path)
    # Previously the index was built and then dropped, so callers always
    # received None; hand it back so the function is actually usable.
    return index
예제 #5
0
 def test_fail(self):
     """Constructing ``PyNSW`` with an unknown distance raises ``TypeError``.

     The error text must name the rejected distance type.
     """
     fake_dist_type = 'l3'
     with self.assertRaises(TypeError) as context:
         PyNSW(fake_dist_type)
     # Exceptions have no ``.message`` attribute on Python 3; ``str()`` gives
     # the message text on both Python 2 and 3.
     self.assertEqual('Unknown distance type: {}'.format(fake_dist_type),
                      str(context.exception))
예제 #6
0
 def test_l2(self):
     """An index created with 'l2' reports 'l2' as its ``dist_type``."""
     reported = PyNSW('l2').dist_type
     self.assertEqual('l2', reported)
예제 #7
0
 def test_l1(self):
     """An index created with 'l1' reports 'l1' as its ``dist_type``."""
     reported = PyNSW('l1').dist_type
     self.assertEqual('l1', reported)
예제 #8
0
from tqdm import tqdm

from python.index import create_node, PyNSW

def _data_file(filename):
    """Return the default absolute location of *filename* in ``../data``."""
    return abspath(join(__file__, '..', '..', 'data', filename))


# Both locations can be overridden through same-named environment variables.
PATHS_JSON = getenv('PATHS_JSON', _data_file('paths.json'))
EMBEDDING_JSON = getenv('EMBEDDING_JSON', _data_file('embeddings.json'))

if __name__ == '__main__':
    # Demo: index every (path, embedding) pair, then look up the neighbours
    # of one of the stored embeddings.
    with open(PATHS_JSON) as paths_file:
        paths = json.load(paths_file)
    with open(EMBEDDING_JSON) as embeddings_file:
        embeddings = json.load(embeddings_file)

    nodes = [create_node(*pair) for pair in zip(paths, embeddings)]

    nsw = PyNSW('l2')
    for item in tqdm(nodes):
        nsw.nn_insert(item, 1, 100)

    query_position = 100
    random_vector = embeddings[query_position]
    print(paths[query_position])
    print(random_vector)

    query_node = create_node('kek', random_vector)
    neighbors = nsw.nn_search(query_node, 5, 3)
    print(neighbors)
예제 #9
0
from python.index import create_node, PyNSW
from tqdm import tqdm

def _default_data_path(filename):
    """Return the default absolute location of *filename* in ``../data``."""
    return os.path.abspath(
        os.path.join(__file__, '..', '..', 'data', filename))


def _read_json(filename, message):
    """Open *filename*, print *message*, and return the parsed JSON."""
    with open(filename, 'r') as handle:
        print(message)
        return json.load(handle)


# Each location can be overridden through the same-named environment variable.
NSW_INDEX_FILENAME = os.getenv(
    'NSW_INDEX_FILENAME', _default_data_path('index_celeba_nsw'))
PATHS_JSON = os.getenv(
    'PATHS_JSON', _default_data_path('paths_celeba.json'))
EMBEDDING_JSON = os.getenv(
    'EMBEDDING_JSON', _default_data_path('embeddings_celeba.json'))

# Build the CelebA NSW index from scratch and write it to NSW_INDEX_FILENAME.
index = PyNSW('l2')

paths = _read_json(PATHS_JSON, 'Loading paths')
embeddings = _read_json(EMBEDDING_JSON, 'Loading embeddings')

print('Creating nodes')
nodes = [create_node(*pair) for pair in zip(paths, embeddings)]

print('Inserting nodes')
for item in tqdm(nodes):
    index.nn_insert(item, 3, 10)

index.save(NSW_INDEX_FILENAME)
예제 #10
0
 def _create_index(self, num_neighbors=(NUM_NODES - 1), num_iters=1):
     """Return an 'l2' index holding every node of ``self.nodes``.

     Nodes are inserted in order; the i-th insertion (counting from 0) uses
     ``random_seed=1334 + i``.
     """
     index = PyNSW('l2')
     # enumerate() starting at 1334 yields the per-node seed directly.
     for seed, item in enumerate(self.nodes, 1334):
         index.nn_insert(item, num_iters, num_neighbors, random_seed=seed)
     return index
예제 #11
0
 def _create_index(self, num_neighbors=(NUM_NODES - 1), num_iters=1):
     """Return an 'l2' index holding every node of ``self.nodes``.

     Each node is inserted with the given ``num_iters`` and
     ``num_neighbors``, in the order the nodes are stored.
     """
     index = PyNSW('l2')
     for item in self.nodes:
         index.nn_insert(item, num_iters, num_neighbors)
     return index
예제 #12
0
import numpy as np
from python.index import PyNode, PyDistance_l1, PyDistance_l2, PyNSW
from os.path import dirname, join, abspath
from os import getenv
import json
import jsonpickle


def _data_path(filename):
    """Return the default absolute path of *filename* under ``../data``."""
    parent_dir = abspath(dirname(abspath(dirname(__file__))))
    return abspath(join(parent_dir, 'data', filename))


def _read_json(filename):
    """Return the parsed contents of the JSON file *filename*."""
    with open(filename, 'r') as handle:
        return json.load(handle)


nsw = PyNSW('l2')

# Both locations can be overridden through same-named environment variables.
PATHS_JSON = getenv('PATHS_JSON', _data_path('paths.json'))
EMBEDDING_JSON = getenv('EMBEDDING_JSON', _data_path('embeddings.json'))

PATHS = _read_json(PATHS_JSON)
EMBEDDINGS = _read_json(EMBEDDING_JSON)

# One node per (path, embedding) pair, inserted in file order.
nodes = [PyNode(*pair) for pair in zip(PATHS, EMBEDDINGS)]
for item in nodes:
    nsw.nn_insert(item)

# Query the index with one of the vectors that was just inserted.
random_vector = EMBEDDINGS[100]
print(random_vector)

query_node = PyNode('1', random_vector)
neighbors = nsw.nn_search(query_node, 5, 3)

print(neighbors)