Пример #1
0
    def test_retrieval(self):
        # We want 12 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 12, 20)

        # Create engine for 100 dimensional feature space, do not forget to set
        # nearest filter to 20, because default is 10
        self.engine = Engine(100,
                             lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 200000 random vectors
        #print 'Indexing...'
        for k in range(200000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        # Now do random queries and check result set size
        #print 'Querying...'
        for k in range(10):
            x = numpy.random.randn(100)
            n = self.engine.neighbours(x)
            #print "Candidate count = %d" % self.engine.candidate_count(x)
            #print "Result size = %d" % len(n)
            self.assertEqual(len(n), 20)
Пример #2
0
    def test_storage_memory(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100 dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        self.memory.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.memory.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])
Пример #3
0
    def build(self, train, batch_size=64, converter=convert_seq, device=0):
        train_iter = chainer.iterators.SerialIterator(train,
                                                      batch_size,
                                                      repeat=False)
        train_iter.reset()

        act_list = [[] for _ in range(self.n_dknn_layers)]
        label_list = []
        print('caching hiddens')
        n_batches = len(train) // batch_size
        for i, train_batch in enumerate(tqdm(train_iter, total=n_batches)):
            data = converter(train_batch, device=device, with_label=True)
            text = data['xs']
            labels = data['ys']

            with chainer.using_config('train', False):
                _, dknn_layers = self.model.predict(text, dknn=True)
                assert len(dknn_layers) == self.model.n_dknn_layers
            for i in range(self.n_dknn_layers):
                layer = dknn_layers[i]
                layer.to_cpu()
                act_list[i] += [x for x in layer.data]
            label_list.extend([int(x) for x in labels])
        self.act_list = act_list
        self.label_list = label_list

        if self.lsh:
            print('using Locally Sensitive Hashing for NN Search')
        else:
            print('using KDTree for NN Search')
        self.tree_list = []  # one lookup tree for each dknn layer
        for i in range(self.n_dknn_layers):
            print('building tree for layer {}'.format(i))
            if self.lsh:  # if lsh
                n_hidden = act_list[i][0].shape[0]
                rbpt = RandomBinaryProjectionTree('rbpt', 75, 75)
                tree = Engine(n_hidden, lshashes=[rbpt])

                for j, example in enumerate(tqdm(act_list[i])):
                    assert example.ndim == 1
                    assert example.shape[0] == n_hidden

                    tree.store_vector(example, j)
            else:  # if kdtree
                tree = KDTree(act_list[i])

            self.tree_list.append(tree)
Пример #4
0
 def __init__(self, level, dim, proj, bico):
     self.level = level
     self.dim = dim
     self.proj = proj
     self.point_to_biconode = {}
     self.rbpt = RandomBinaryProjectionTree('rbpt', proj, 1)
     self.rbp = RandomBinaryProjections('rbp', proj)
     self.sqdist = SquaredEuclideanDistance()
     self.ann_engine = Engine(
         dim,
         lshashes=[self.rbp],
         distance=self.sqdist,
         vector_filters=[DistanceThresholdFilter(bico._getR(self.level))])
     self.num_cfs = 0
     self.bico = bico
     self.cf = ClusteringFeature(Point(np.zeros(dim)), Point(np.zeros(dim)),
                                 0, 0)
Пример #5
0
    def test_storage_redis(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100 dimensional feature space
        self.engine = Engine(100,
                             lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        self.redis_storage.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(
            self.redis_storage.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])
Пример #6
0
def example1():

    # Dimension of feature space
    DIM = 100

    # Number of data points (dont do too much because of exact search)
    POINTS = 10000

    print('Creating engines')

    # We want 12 projections, 20 results at least
    rbpt = RandomBinaryProjectionTree('rbpt', 20, 20)

    # Create engine 1
    engine_rbpt = Engine(DIM, lshashes=[rbpt], distance=CosineDistance())

    # Create binary hash as child hash
    rbp = RandomBinaryProjections('rbp1', 20)

    # Create engine 2
    engine = Engine(DIM, lshashes=[rbp], distance=CosineDistance())

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp_perm = RandomBinaryProjections('rbp_perm', 20)
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine 3
    engine_perm = Engine(DIM,
                         lshashes=[permutations],
                         distance=CosineDistance())

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    # Create binary hash as child hash
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 12)

    # Add rbp as child hash of permutations hash
    permutations2.add_child_hash(rbp_perm2)

    # Create engine 3
    engine_perm2 = Engine(DIM,
                          lshashes=[permutations2],
                          distance=CosineDistance())

    print('Indexing %d random vectors of dimension %d' % (POINTS, DIM))

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i, :] = nearpy.utils.utils.unitvec(v)
        engine.store_vector(v, i)
        engine_rbpt.store_vector(v, i)
        engine_perm.store_vector(v, i)
        engine_perm2.store_vector(v, i)

    print('Buckets 1 = %d' % len(engine.storage.buckets['rbp1'].keys()))
    print('Buckets 2 = %d' % len(engine_rbpt.storage.buckets['rbpt'].keys()))

    print('Building permuted index for HashPermutations')

    # Then update permuted index
    permutations.build_permuted_index()

    print('Generate random data')

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 1
    print('\nNeighbour distances with RandomBinaryProjectionTree:')
    print('  -> Candidate count is %d' % engine_rbpt.candidate_count(query))
    results = engine_rbpt.neighbours(query)
    print_results(results)

    # Do random query on engine 2
    print('\nNeighbour distances with RandomBinaryProjections:')
    print('  -> Candidate count is %d' % engine.candidate_count(query))
    results = engine.neighbours(query)
    print_results(results)

    # Do random query on engine 3
    print('\nNeighbour distances with HashPermutations:')
    print('  -> Candidate count is %d' % engine_perm.candidate_count(query))
    results = engine_perm.neighbours(query)
    print_results(results)

    # Do random query on engine 4
    print('\nNeighbour distances with HashPermutations2:')
    print('  -> Candidate count is %d' % engine_perm2.candidate_count(query))
    results = engine_perm2.neighbours(query)
    print_results(results)

    # Real neighbours
    print('\nReal neighbour distances:')
    query = nearpy.utils.utils.unitvec(query)
    query = query.reshape((DIM, 1))
    dists = CosineDistance().distance(matrix, query)
    dists = dists.reshape((-1, ))
    # dists = sorted(dists)

    dists_argsort = numpy.argsort(dists)

    results = [(None, d, dists[d]) for d in dists_argsort[:10]]
    print_results(results)
Пример #7
0
def print_results(results):
    print('  Data \t| Distance')
    for r in results:
        data = r[1]
        dist = r[2]
        print('  {} \t| {:.4f}'.format(data, dist))


DIM = int(sys.argv[2])

POINTS = int(sys.argv[3])

rand = random.randint(1, POINTS)

rbpt = RandomBinaryProjectionTree('rbpt', 64, 3)

engine = Engine(DIM, lshashes=[rbpt], distance=CosineDistance())

with open(str(sys.argv[1])) as f:
    content = f.readlines()
with open(str("test_nn.txt")) as f:
    content2 = f.readlines()
rand_frame = None
rand_v1 = None
rand_v2 = None
rand_v3 = None

count = 0
for i in content:
    fv = i.split()