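# NOTE: The snippets below are shown without their import headers. A minimal
# header sketch that should make them runnable, assuming NearPy's usual
# module layout (exact paths may differ between versions):

import logging
import os
import time
import unittest

import numpy
import scipy.sparse
from redis import Redis

from nearpy import Engine
from nearpy.distances import CosineDistance, EuclideanDistance
from nearpy.experiments import (DistanceRatioExperiment,
                                RecallPrecisionExperiment)
from nearpy.filters import NearestFilter
from nearpy.hashes import (HashPermutationMapper, HashPermutations,
                           RandomBinaryProjections, RandomBinaryProjectionTree,
                           RandomDiscretizedProjections, UniBucket)
from nearpy.storage import MemoryStorage, RedisStorage
from nearpy.utils.utils import unitvec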
def test_random_discretized_projections(self):
    dim = 4
    vector_count = 5000
    vectors = numpy.random.randn(dim, vector_count)

    # First get recall and precision for a single-projection random hash
    rdp = RandomDiscretizedProjections('rdp', 1, 0.01)
    nearest = NearestFilter(10 + 1)
    engine = Engine(dim, lshashes=[rdp], vector_filters=[nearest])
    exp = RecallPrecisionExperiment(10, vectors)
    result = exp.perform_experiment([engine])

    recall1 = result[0][0]
    precision1 = result[0][1]
    searchtime1 = result[0][2]

    print('\nRecall RDP: %f, Precision RDP: %f, SearchTime RDP: %f\n' %
          (recall1, precision1, searchtime1))

    # Then get recall and precision for a 2-projection random hash
    rdp = RandomDiscretizedProjections('rdp', 2, 0.2)
    engine = Engine(dim, lshashes=[rdp], vector_filters=[nearest])
    result = exp.perform_experiment([engine])

    recall2 = result[0][0]
    precision2 = result[0][1]
    searchtime2 = result[0][2]

    print('\nRecall RDP: %f, Precision RDP: %f, SearchTime RDP: %f\n' %
          (recall2, precision2, searchtime2))

    # Many things are random here, but the precision should increase
    # with the number of projections
    self.assertTrue(precision2 > precision1)
class TestEngine(unittest.TestCase):

    def setUp(self):
        self.engine = Engine(1000)

    def test_storage_issue(self):
        engine1 = Engine(100)
        engine2 = Engine(100)

        for k in range(1000):
            x = numpy.random.randn(100)
            x_data = 'data'
            engine1.store_vector(x, x_data)

        # Each engine should have its own default storage
        self.assertEqual(len(engine2.storage.buckets), 0)

    def test_retrieval(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = numpy.random.randn(1000)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y, y_data, y_distance = n[0]
            normalized_x = unitvec(x)
            delta = 0.000000001
            self.assertAlmostEqual(numpy.abs((normalized_x - y)).max(), 0,
                                   delta=delta)
            self.assertEqual(y_data, x_data)
            self.assertAlmostEqual(y_distance, 0.0, delta=delta)

    def test_retrieval_sparse(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = scipy.sparse.rand(1000, 1, density=0.05)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y, y_data, y_distance = n[0]
            normalized_x = unitvec(x)
            delta = 0.000000001
            self.assertAlmostEqual(numpy.abs((normalized_x - y)).max(), 0,
                                   delta=delta)
            self.assertEqual(y_data, x_data)
            self.assertAlmostEqual(y_distance, 0.0, delta=delta)
def test_experiment_with_list_1(self):
    dim = 50
    vector_count = 100
    vectors = []
    for index in range(vector_count):
        vectors.append(numpy.random.randn(dim))
    unibucket = UniBucket('testHash')
    nearest = NearestFilter(10 + 1)
    engine = Engine(dim, lshashes=[unibucket],
                    vector_filters=[nearest])
    exp = RecallPrecisionExperiment(10, vectors)
    result = exp.perform_experiment([engine])

    # Both recall and precision must be one in this case
    self.assertEqual(result[0][0], 1.0)
    self.assertEqual(result[0][1], 1.0)
def test_experiment_with_unibucket_3(self):
    dim = 50
    vector_count = 100
    vectors = numpy.random.randn(dim, vector_count)
    unibucket = UniBucket('testHash')
    nearest = NearestFilter(5 + 1)
    engine = Engine(dim, lshashes=[unibucket],
                    vector_filters=[nearest])
    exp = RecallPrecisionExperiment(10, vectors)
    result = exp.perform_experiment([engine])

    # In this case recall is only 0.5
    # because the engine returns 5 nearest, but
    # the experiment looks for 10 nearest.
    self.assertEqual(result[0][0], 0.5)
    self.assertEqual(result[0][1], 1.0)
def test_experiment_with_unibucket_1(self):
    dim = 50
    vector_count = 100
    vectors = numpy.random.randn(dim, vector_count)
    unibucket = UniBucket('testHash')
    nearest = NearestFilter(10 + 1)
    engine = Engine(dim, lshashes=[unibucket],
                    vector_filters=[nearest],
                    distance=EuclideanDistance())
    exp = RecallPrecisionExperiment(10, vectors)
    result = exp.perform_experiment([engine])

    # Both recall and precision must be one in this case
    self.assertEqual(result[0][0], 1.0)
    self.assertEqual(result[0][1], 1.0)
def test_experiment_with_list_2(self):
    dim = 50
    vector_count = 100
    vectors = []
    for index in range(vector_count):
        vectors.append(numpy.random.randn(dim))
    unibucket = UniBucket('testHash')
    nearest = NearestFilter(10 + 1)
    engine = Engine(dim, lshashes=[unibucket],
                    vector_filters=[nearest])
    exp = RecallPrecisionExperiment(5, vectors)
    result = exp.perform_experiment([engine])

    # In this case precision is only 0.5
    # because the engine returns 10 nearest, but
    # the experiment only looks for 5 nearest.
    self.assertEqual(result[0][0], 1.0)
    self.assertEqual(result[0][1], 0.5)
def test_random_binary_projections(self):
    dim = 4
    vector_count = 5000
    vectors = numpy.random.randn(dim, vector_count)

    # Get recall and precision for a 32-bit random binary hash
    rbp = RandomBinaryProjections('rbp', 32)
    nearest = NearestFilter(10 + 1)
    engine = Engine(dim, lshashes=[rbp], vector_filters=[nearest])
    exp = RecallPrecisionExperiment(10, vectors)
    result = exp.perform_experiment([engine])

    recall1 = result[0][0]
    precision1 = result[0][1]
    searchtime1 = result[0][2]

    print('\nRecall RBP: %f, Precision RBP: %f, SearchTime RBP: %f\n' %
          (recall1, precision1, searchtime1))
class TestPermutation(unittest.TestCase):

    def setUp(self):
        logging.basicConfig(level=logging.WARNING)
        numpy.random.seed(11)

        # Create permutations meta-hash
        self.permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp = RandomBinaryProjections('rbp1', 4, rand_seed=19)
        rbp_conf = {
            'num_permutation': 50,
            'beam_size': 10,
            'num_neighbour': 100
        }

        # Add rbp as child hash of permutations hash
        self.permutations.add_child_hash(rbp, rbp_conf)

        # Create engine with meta hash and cosine distance
        self.engine_perm = Engine(200, lshashes=[self.permutations],
                                  distance=CosineDistance())

        # Create engine without permutation meta-hash
        self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance())

    def test_runnable(self):
        # First index some random vectors
        matrix = numpy.zeros((1000, 200))
        for i in range(1000):
            v = numpy.random.randn(200)
            matrix[i] = v
            self.engine.store_vector(v)
            self.engine_perm.store_vector(v)

        # Then update permuted index
        self.permutations.build_permuted_index()

        # Do random query on engine with permutations meta-hash
        query = numpy.random.randn(200)
        results = self.engine_perm.neighbours(query)
        permuted_dists = [x[2] for x in results]

        # Do random query on engine without permutations meta-hash
        # (distances should be larger)
        results = self.engine.neighbours(query)
        dists = [x[2] for x in results]

        self.assertLess(permuted_dists[0], dists[0])
def createLSHFile(inputFilename, L, K):
    if os.path.isfile(inputFilename):
        siftVectorFile = open(inputFilename, 'r')

        # Dimension of our vector space
        dimension = 2
        noOfLayers = L
        noOfBits = K

        layers = []
        for i in range(noOfLayers):
            # Create a random binary hash with noOfBits bits
            layers.append(RandomBinaryProjections('layer' + str(i), noOfBits))

        # Create engine with pipeline configuration
        engine = Engine(dimension, lshashes=layers)

        outFile = open('filename_d.lsh', 'w')
        # storeVectors is defined elsewhere in this script
        storeVectors(siftVectorFile, engine, outFile)
        outFile.close()
        siftVectorFile.close()
    else:
        print('Filename incorrect!')
def example1():
    # Dimension of feature space
    DIM = 100

    # Number of data points (don't do too many because of exact search)
    POINTS = 10000

    print('Creating engines')

    # We want 20 projections, 20 results at least
    rbpt = RandomBinaryProjectionTree('rbpt', 20, 20)

    # Create engine 1
    engine_rbpt = Engine(DIM, lshashes=[rbpt], distance=CosineDistance())

    # Create binary hash as child hash
    rbp = RandomBinaryProjections('rbp1', 20)

    # Create engine 2
    engine = Engine(DIM, lshashes=[rbp], distance=CosineDistance())

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp_perm = RandomBinaryProjections('rbp_perm', 20)
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine 3
    engine_perm = Engine(DIM, lshashes=[permutations],
                         distance=CosineDistance())

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    # Create binary hash as child hash
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 12)

    # Add rbp as child hash of permutations hash
    permutations2.add_child_hash(rbp_perm2)

    # Create engine 4
    engine_perm2 = Engine(DIM, lshashes=[permutations2],
                          distance=CosineDistance())

    print('Indexing %d random vectors of dimension %d' % (POINTS, DIM))

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in range(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine.store_vector(v)
        engine_rbpt.store_vector(v)
        engine_perm.store_vector(v)
        engine_perm2.store_vector(v)

    print('Buckets 1 = %d' % len(engine.storage.buckets['rbp1'].keys()))
    print('Buckets 2 = %d' % len(engine_rbpt.storage.buckets['rbpt'].keys()))

    print('Building permuted index for HashPermutations')

    # Then update permuted index
    permutations.build_permuted_index()

    print('Generate random data')

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 1
    print('\nNeighbour distances with RandomBinaryProjectionTree:')
    print(' -> Candidate count is %d' % engine_rbpt.candidate_count(query))
    results = engine_rbpt.neighbours(query)
    dists = [x[2] for x in results]
    print(dists)

    # Do random query on engine 2
    print('\nNeighbour distances with RandomBinaryProjections:')
    print(' -> Candidate count is %d' % engine.candidate_count(query))
    results = engine.neighbours(query)
    dists = [x[2] for x in results]
    print(dists)

    # Do random query on engine 3
    print('\nNeighbour distances with HashPermutations:')
    print(' -> Candidate count is %d' % engine_perm.candidate_count(query))
    results = engine_perm.neighbours(query)
    dists = [x[2] for x in results]
    print(dists)

    # Do random query on engine 4
    print('\nNeighbour distances with HashPermutationMapper:')
    print(' -> Candidate count is %d' % engine_perm2.candidate_count(query))
    results = engine_perm2.neighbours(query)
    dists = [x[2] for x in results]
    print(dists)

    # Real neighbours
    print('\nReal neighbour distances:')
    query = query.reshape((1, DIM))
    dists = CosineDistance().distance(matrix, query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print(dists[:10])
def example2():
    # Dimension of feature space
    DIM = 100

    # Number of data points (don't do too many because of exact search)
    POINTS = 20000

    ##########################################################

    print('Performing indexing with HashPermutations...')
    t0 = time.time()

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp_perm = RandomBinaryProjections('rbp_perm', 14)
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(DIM, lshashes=[permutations],
                         distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in range(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_perm.store_vector(v)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()
    print('Indexing took %f seconds' % (t1 - t0))

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on the engine
    print('\nNeighbour distances with HashPermutations:')
    print(' -> Candidate count is %d' % engine_perm.candidate_count(query))
    results = engine_perm.neighbours(query)
    dists = [x[2] for x in results]
    print(dists)

    # Real neighbours
    print('\nReal neighbour distances:')
    query = query.reshape((DIM))
    dists = CosineDistance().distance(matrix, query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print(dists[:10])

    ##########################################################

    print('\nPerforming indexing with HashPermutationMapper...')
    t0 = time.time()

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    # Create binary hash as child hash
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 14)

    # Add rbp as child hash of permutations hash
    permutations2.add_child_hash(rbp_perm2)

    # Create engine
    engine_perm2 = Engine(DIM, lshashes=[permutations2],
                          distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in range(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_perm2.store_vector(v)

    t1 = time.time()
    print('Indexing took %f seconds' % (t1 - t0))

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on the engine
    print('\nNeighbour distances with HashPermutationMapper:')
    print(' -> Candidate count is %d' % engine_perm2.candidate_count(query))
    results = engine_perm2.neighbours(query)
    dists = [x[2] for x in results]
    print(dists)

    # Real neighbours
    print('\nReal neighbour distances:')
    query = query.reshape((DIM))
    dists = CosineDistance().distance(matrix, query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print(dists[:10])

    ##########################################################

    print('\nPerforming indexing with multiple binary hashes...')
    t0 = time.time()

    hashes = []
    for k in range(20):
        hashes.append(RandomBinaryProjections('rbp_%d' % k, 10))

    # Create engine
    engine_rbps = Engine(DIM, lshashes=hashes, distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in range(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_rbps.store_vector(v)

    t1 = time.time()
    print('Indexing took %f seconds' % (t1 - t0))

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on the engine
    print('\nNeighbour distances with multiple binary hashes:')
    print(' -> Candidate count is %d' % engine_rbps.candidate_count(query))
    results = engine_rbps.neighbours(query)
    dists = [x[2] for x in results]
    print(dists)

    # Real neighbours
    print('\nReal neighbour distances:')
    query = query.reshape((DIM))
    dists = CosineDistance().distance(matrix, query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print(dists[:10])
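# A minimal driver sketch for the example functions above, assuming
# example1() and example2() live in the same script:

if __name__ == '__main__':
    example1()
    example2()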
# We are going to test these bin widths
bin_widths = [0.01 * x for x in range(1, 5)]

# Create engines for all configurations
for bin_width in bin_widths:
    # Use four random discretized projection hashes (4 projections each)
    rdp1 = RandomDiscretizedProjections('rdp1', 4, bin_width)
    rdp2 = RandomDiscretizedProjections('rdp2', 4, bin_width)
    rdp3 = RandomDiscretizedProjections('rdp3', 4, bin_width)
    rdp4 = RandomDiscretizedProjections('rdp4', 4, bin_width)
    #ub1 = UniBucket('uni')

    # Create engine with this configuration
    #engine = Engine(dimension, lshashes=[rdp1, rdp2, rdp3, rdp4],
    #                vector_filters=[unique, nearest])
    engine = Engine(dimension, lshashes=[rdp1, rdp2, rdp3, rdp4],
                    vector_filters=[nearest])

    # Add engine to list of engines to evaluate
    engines.append(engine)

print('Creating experiment and performing exact search...')

# Create experiment (looking for ten closest neighbours).
# The constructor performs exact search for evaluation,
# so the data set should not be too large for experiments.
exp = DistanceRatioExperiment(N, vectors, coverage_ratio=0.01)

print('Performing experiment for all engines...')

# Perform experiment for all engines
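# The excerpt above references names defined earlier in its script
# (dimension, vectors, N, nearest, engines). A plausible setup sketch; the
# names match the excerpt, but the values are illustrative assumptions, not
# the original script's:

dimension = 100                                # dimension of feature space
vectors = numpy.random.randn(dimension, 1000)  # data set for exact search
N = 10                                         # number of closest neighbours
nearest = NearestFilter(N + 1)                 # +1 because queries are drawn
                                               # from the indexed vectors
engines = []                                   # filled by the loop above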
def test_delete_vector_with_provided_value(self):
    engine = Engine(self.dim, lshashes=[UniBucket('testHash')])
    self.fill_engine(engine)
    engine.delete_vector(self.removed_value, self.removed_vector)
    self.check_delete(engine)
def test_delete_vector_multiple_hash(self):
    hashes = [UniBucket('name_hash_%d' % k) for k in range(10)]
    engine = Engine(self.dim, lshashes=hashes)
    self.fill_engine(engine)
    engine.delete_vector(self.removed_value)
    self.check_delete(engine)
def test_delete_vector_single_hash(self):
    engine = Engine(self.dim, lshashes=[UniBucket('testHash')])
    self.fill_engine(engine)
    engine.delete_vector(self.removed_value)
    self.check_delete(engine)
class TestRandomBinaryProjectionTree(unittest.TestCase):

    def setUp(self):
        self.memory = MemoryStorage()
        self.redis_object = Redis()
        self.redis_storage = RedisStorage(self.redis_object)
        numpy.random.seed(16)

    def test_retrieval(self):
        # We want 12 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 12, 20)

        # Create engine for 100 dimensional feature space, do not forget to
        # set nearest filter to 20, because default is 10
        self.engine = Engine(100, lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 200000 random vectors
        for k in range(200000):
            x = numpy.random.randn(100)
            x_data = 'data {}'.format(k)
            self.engine.store_vector(x, x_data)

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            n = self.engine.neighbours(x)
            self.assertEqual(len(n), 20)

    def test_storage_memory(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100 dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        self.memory.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.memory.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])

    def test_storage_redis(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100 dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        self.redis_storage.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(
            self.redis_storage.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])
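# Standard unittest entry point, assuming the test classes above live in a
# single module:

if __name__ == '__main__':
    unittest.main()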