Exemplo n.º 1
0
class TestRandomBinaryProjections(unittest.TestCase):

    def setUp(self):
        self.rbp = RandomBinaryProjections('testHash', 10)
        self.rbp.reset(100)

    def test_hash_format(self):
        h = self.rbp.hash_vector(numpy.random.randn(100))
        self.assertEqual(len(h), 1)
        self.assertEqual(type(h[0]), type(''))
        self.assertEqual(len(h[0]), 10)
        for c in h[0]:
            self.assertTrue(c == '1' or c == '0')

    def test_hash_deterministic(self):
        x = numpy.random.randn(100)
        first_hash = self.rbp.hash_vector(x)[0]
        for k in range(100):
            self.assertEqual(first_hash, self.rbp.hash_vector(x)[0])

    def test_hash_format_sparse(self):
        h = self.rbp.hash_vector(scipy.sparse.rand(100, 1, density=0.1))
        self.assertEqual(len(h), 1)
        self.assertEqual(type(h[0]), type(''))
        self.assertEqual(len(h[0]), 10)
        for c in h[0]:
            self.assertTrue(c == '1' or c == '0')

    def test_hash_deterministic_sparse(self):
        x = scipy.sparse.rand(100, 1, density=0.1)
        first_hash = self.rbp.hash_vector(x)[0]
        for k in range(100):
            self.assertEqual(first_hash, self.rbp.hash_vector(x)[0])
    def setUp(self):
        logging.basicConfig(level=logging.WARNING)
        numpy.random.seed(11)

        # Create permutations meta-hash
        self.permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp = RandomBinaryProjections('rbp1', 4, rand_seed=19)
        rbp_conf = {
            'num_permutation': 50,
            'beam_size': 10,
            'num_neighbour': 100
        }

        # Add rbp as child hash of permutations hash
        self.permutations.add_child_hash(rbp, rbp_conf)

        # Create engine with meta hash and cosine distance
        self.engine_perm = Engine(200,
                                  lshashes=[self.permutations],
                                  distance=CosineDistance())

        # Create engine without permutation meta-hash
        self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance())
Exemplo n.º 3
0
    def test_hash_memory_storage_rbp(self):
        hash1 = RandomBinaryProjections('testRBPHash', 10)
        hash1.reset(100)

        self.memory.store_hash_configuration(hash1)

        hash2 = RandomBinaryProjections(None, None)
        hash2.apply_config(self.memory.load_hash_configuration('testRBPHash'))

        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.projection_count, hash2.projection_count)

        for i in range(hash1.normals.shape[0]):
            for j in range(hash1.normals.shape[1]):
                self.assertEqual(hash1.normals[i, j], hash2.normals[i, j])
Exemplo n.º 4
0
    def test_random_binary_projections(self):
        dim = 4
        vector_count = 5000
        vectors = numpy.random.randn(dim, vector_count)

        # First get recall and precision for one 1-dim random hash
        rbp = RandomBinaryProjections('rbp', 32)
        nearest = NearestFilter(10 + 1)
        engine = Engine(dim, lshashes=[rbp], vector_filters=[nearest])
        exp = RecallPrecisionExperiment(10, vectors)
        result = exp.perform_experiment([engine])

        recall1 = result[0][0]
        precision1 = result[0][1]
        searchtime1 = result[0][2]

        print('\nRecall RBP: %f, Precision RBP: %f, SearchTime RBP: %f\n' % \
            (recall1, precision1, searchtime1))
Exemplo n.º 5
0
def createLSHFile(inputFilename, L, K):
    if os.path.isfile(inputFilename):
        siftVectorFile = open(inputFilename, 'r')
        # Dimension of our vector space
        dimension = 2
        noOfLayers = L
        noOfBits = K
        layers = []
        for i in xrange(noOfLayers):
            # Create a random binary hash with 10 bits
            layers.append(RandomBinaryProjections('layer' + str(i), noOfBits))

        # Create engine with pipeline configuration
        engine = Engine(dimension, lshashes=layers)
        outFile = open('filename_d.lsh', 'w')
        storeVectors(siftVectorFile, engine, outFile)
        outFile.close()
        siftVectorFile.close()

    else:
        print 'Filename incorrect!'
Exemplo n.º 6
0
    def __init__(self,
                 dim,
                 lshashes=None,
                 distance=None,
                 fetch_vector_filters=None,
                 vector_filters=None,
                 storage=None):
        """ Keeps the configuration. """
        if lshashes is None:
            lshashes = [RandomBinaryProjections('default', 10)]
        self.lshashes = lshashes
        if distance is None: distance = CosineDistance()
        self.distance = distance
        if vector_filters is None: vector_filters = [NearestFilter(10)]
        self.vector_filters = vector_filters
        if fetch_vector_filters is None:
            fetch_vector_filters = [UniqueFilter()]
        self.fetch_vector_filters = fetch_vector_filters
        if storage is None: storage = MemoryStorage()
        self.storage = storage

        # Initialize all hashes for the data space dimension.
        for lshash in self.lshashes:
            lshash.reset(dim)
Exemplo n.º 7
0
def example1():

    # Dimension of feature space
    DIM = 100

    # Number of data points (dont do too much because of exact search)
    POINTS = 10000

    print('Creating engines')

    # We want 12 projections, 20 results at least
    rbpt = RandomBinaryProjectionTree('rbpt', 20, 20)

    # Create engine 1
    engine_rbpt = Engine(DIM, lshashes=[rbpt], distance=CosineDistance())

    # Create binary hash as child hash
    rbp = RandomBinaryProjections('rbp1', 20)

    # Create engine 2
    engine = Engine(DIM, lshashes=[rbp], distance=CosineDistance())

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp_perm = RandomBinaryProjections('rbp_perm', 20)
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine 3
    engine_perm = Engine(DIM,
                         lshashes=[permutations],
                         distance=CosineDistance())

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    # Create binary hash as child hash
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 12)

    # Add rbp as child hash of permutations hash
    permutations2.add_child_hash(rbp_perm2)

    # Create engine 3
    engine_perm2 = Engine(DIM,
                          lshashes=[permutations2],
                          distance=CosineDistance())

    print('Indexing %d random vectors of dimension %d' % (POINTS, DIM))

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine.store_vector(v)
        engine_rbpt.store_vector(v)
        engine_perm.store_vector(v)
        engine_perm2.store_vector(v)

    print('Buckets 1 = %d' % len(engine.storage.buckets['rbp1'].keys()))
    print('Buckets 2 = %d' % len(engine_rbpt.storage.buckets['rbpt'].keys()))

    print('Building permuted index for HashPermutations')

    # Then update permuted index
    permutations.build_permuted_index()

    print('Generate random data')

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 1
    print('\nNeighbour distances with RandomBinaryProjectionTree:')
    print('  -> Candidate count is %d' % engine_rbpt.candidate_count(query))
    results = engine_rbpt.neighbours(query)
    dists = [x[2] for x in results]
    print(dists)

    # Do random query on engine 2
    print('\nNeighbour distances with RandomBinaryProjections:')
    print('  -> Candidate count is %d' % engine.candidate_count(query))
    results = engine.neighbours(query)
    dists = [x[2] for x in results]
    print(dists)

    # Do random query on engine 3
    print('\nNeighbour distances with HashPermutations:')
    print('  -> Candidate count is %d' % engine_perm.candidate_count(query))
    results = engine_perm.neighbours(query)
    dists = [x[2] for x in results]
    print(dists)

    # Do random query on engine 4
    print('\nNeighbour distances with HashPermutations2:')
    print('  -> Candidate count is %d' % engine_perm2.candidate_count(query))
    results = engine_perm2.neighbours(query)
    dists = [x[2] for x in results]
    print(dists)

    # Real neighbours
    print('\nReal neighbour distances:')
    query = query.reshape((1, DIM))
    dists = CosineDistance().distance(matrix, query)
    dists = dists.reshape((-1, ))
    dists = sorted(dists)
    print(dists[:10])
Exemplo n.º 8
0
 def setUp(self):
     self.rbp = RandomBinaryProjections('testHash', 10)
     self.rbp.reset(100)
Exemplo n.º 9
0
def example2():

    # Dimension of feature space
    DIM = 100

    # Number of data points (dont do too much because of exact search)
    POINTS = 20000

    ##########################################################

    print('Performing indexing with HashPermutations...')
    t0 = time.time()

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp_perm = RandomBinaryProjections('rbp_perm', 14)
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(DIM,
                         lshashes=[permutations],
                         distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in range(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_perm.store_vector(v)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()
    print('Indexing took %f seconds' % (t1 - t0))

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 3
    print('\nNeighbour distances with HashPermutations:')
    print('  -> Candidate count is %d' % engine_perm.candidate_count(query))
    results = engine_perm.neighbours(query)
    dists = [x[2] for x in results]
    print(dists)

    # Real neighbours
    print('\nReal neighbour distances:')
    query = query.reshape((DIM))
    dists = CosineDistance().distance(matrix, query)
    dists = dists.reshape((-1, ))
    dists = sorted(dists)
    print(dists[:10])

    ##########################################################

    print('\nPerforming indexing with HashPermutationMapper...')
    t0 = time.time()

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    # Create binary hash as child hash
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 14)

    # Add rbp as child hash of permutations hash
    permutations2.add_child_hash(rbp_perm2)

    # Create engine
    engine_perm2 = Engine(DIM,
                          lshashes=[permutations2],
                          distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in range(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_perm2.store_vector(v)

    t1 = time.time()
    print('Indexing took %f seconds' % (t1 - t0))

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 4
    print('\nNeighbour distances with HashPermutationMapper:')
    print('  -> Candidate count is %d' % engine_perm2.candidate_count(query))
    results = engine_perm2.neighbours(query)
    dists = [x[2] for x in results]
    print(dists)

    # Real neighbours
    print('\nReal neighbour distances:')
    query = query.reshape((DIM))
    dists = CosineDistance().distance(matrix, query)
    dists = dists.reshape((-1, ))
    dists = sorted(dists)
    print(dists[:10])

    ##########################################################

    print('\nPerforming indexing with multiple binary hashes...')
    t0 = time.time()

    hashes = []
    for k in range(20):
        hashes.append(RandomBinaryProjections('rbp_%d' % k, 10))

    # Create engine
    engine_rbps = Engine(DIM, lshashes=hashes, distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in range(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_rbps.store_vector(v)

    t1 = time.time()
    print('Indexing took %f seconds' % (t1 - t0))

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 4
    print('\nNeighbour distances with multiple binary hashes:')
    print('  -> Candidate count is %d' % engine_rbps.candidate_count(query))
    results = engine_rbps.neighbours(query)
    dists = [x[2] for x in results]
    print(dists)

    # Real neighbours
    print('\nReal neighbour distances:')
    query = query.reshape((DIM))
    dists = CosineDistance().distance(matrix, query)
    dists = dists.reshape((-1, ))
    dists = sorted(dists)
    print(dists[:10])