def test_random_discretized_projections(self):
    dim = 4
    vector_count = 5000
    vectors = numpy.random.randn(dim, vector_count)

    # First get recall and precision for one 1-dim random hash
    rdp = RandomDiscretizedProjections('rdp', 1, 0.01)
    nearest = NearestFilter(10 + 1)
    engine = Engine(dim, lshashes=[rdp], vector_filters=[nearest])
    exp = RecallPrecisionExperiment(10, vectors)
    result = exp.perform_experiment([engine])

    recall1 = result[0][0]
    precision1 = result[0][1]
    searchtime1 = result[0][2]

    print('\nRecall RDP: %f, Precision RDP: %f, SearchTime RDP: %f\n' %
          (recall1, precision1, searchtime1))

    # Then get recall and precision for one 2-dim random hash
    rdp = RandomDiscretizedProjections('rdp', 2, 0.2)
    engine = Engine(dim, lshashes=[rdp], vector_filters=[nearest])
    result = exp.perform_experiment([engine])

    recall2 = result[0][0]
    precision2 = result[0][1]
    searchtime2 = result[0][2]

    print('\nRecall RDP: %f, Precision RDP: %f, SearchTime RDP: %f\n' %
          (recall2, precision2, searchtime2))

    # Many things are random here, but the precision should increase
    # with the projection dimension.
    self.assertTrue(precision2 > precision1)
def test_experiment_with_list_1(self):
    dim = 50
    vector_count = 100
    vectors = []
    for index in range(vector_count):
        vectors.append(numpy.random.randn(dim))
    unibucket = UniBucket('testHash')
    nearest = NearestFilter(10 + 1)
    engine = Engine(dim, lshashes=[unibucket], vector_filters=[nearest])
    exp = RecallPrecisionExperiment(10, vectors)
    result = exp.perform_experiment([engine])

    # Both recall and precision must be one in this case
    self.assertEqual(result[0][0], 1.0)
    self.assertEqual(result[0][1], 1.0)
def test_experiment_with_unibucket_3(self):
    dim = 50
    vector_count = 100
    vectors = numpy.random.randn(dim, vector_count)
    unibucket = UniBucket('testHash')
    nearest = NearestFilter(5 + 1)
    engine = Engine(dim, lshashes=[unibucket], vector_filters=[nearest])
    exp = RecallPrecisionExperiment(10, vectors)
    result = exp.perform_experiment([engine])

    # In this case recall is only 0.5,
    # because the engine returns 5 nearest, but
    # the experiment looks for 10 nearest.
    self.assertEqual(result[0][0], 0.5)
    self.assertEqual(result[0][1], 1.0)
def test_experiment_with_unibucket_1(self):
    dim = 50
    vector_count = 100
    vectors = numpy.random.randn(dim, vector_count)
    unibucket = UniBucket('testHash')
    nearest = NearestFilter(10 + 1)
    engine = Engine(dim, lshashes=[unibucket], vector_filters=[nearest],
                    distance=EuclideanDistance())
    exp = RecallPrecisionExperiment(10, vectors)
    result = exp.perform_experiment([engine])

    # Both recall and precision must be one in this case
    self.assertEqual(result[0][0], 1.0)
    self.assertEqual(result[0][1], 1.0)
def test_experiment_with_list_2(self):
    dim = 50
    vector_count = 100
    vectors = []
    for index in range(vector_count):
        vectors.append(numpy.random.randn(dim))
    unibucket = UniBucket('testHash')
    nearest = NearestFilter(10 + 1)
    engine = Engine(dim, lshashes=[unibucket], vector_filters=[nearest])
    exp = RecallPrecisionExperiment(5, vectors)
    result = exp.perform_experiment([engine])

    # In this case precision is only 0.5,
    # because the engine returns 10 nearest, but
    # the experiment only looks for 5 nearest.
    self.assertEqual(result[0][0], 1.0)
    self.assertEqual(result[0][1], 0.5)
def test_random_binary_projections(self):
    dim = 4
    vector_count = 5000
    vectors = numpy.random.randn(dim, vector_count)

    # Get recall and precision for one 32-bit random binary projection hash
    rbp = RandomBinaryProjections('rbp', 32)
    nearest = NearestFilter(10 + 1)
    engine = Engine(dim, lshashes=[rbp], vector_filters=[nearest])
    exp = RecallPrecisionExperiment(10, vectors)
    result = exp.perform_experiment([engine])

    recall1 = result[0][0]
    precision1 = result[0][1]
    searchtime1 = result[0][2]

    print('\nRecall RBP: %f, Precision RBP: %f, SearchTime RBP: %f\n' %
          (recall1, precision1, searchtime1))
class TestVectorFilters(unittest.TestCase):

    def setUp(self):
        self.V = []
        self.V.append((numpy.array([0]), 'data1', 0.4))
        self.V.append((numpy.array([1]), 'data2', 0.9))
        self.V.append((numpy.array([2]), 'data3', 1.4))
        self.V.append((numpy.array([3]), 'data4', 2.1))
        self.V.append((numpy.array([4]), 'data5', 0.1))
        self.V.append((numpy.array([5]), 'data6', 8.7))
        self.V.append((numpy.array([6]), 'data7', 3.4))
        self.V.append((numpy.array([7]), 'data8', 2.8))
        self.threshold_filter = DistanceThresholdFilter(1.0)
        self.nearest_filter = NearestFilter(5)
        self.unique = UniqueFilter()

    def test_thresholding(self):
        result = self.threshold_filter.filter_vectors(self.V)
        self.assertEqual(len(result), 3)
        self.assertIn(self.V[0], result)
        self.assertIn(self.V[1], result)
        self.assertIn(self.V[4], result)

    def test_nearest(self):
        result = self.nearest_filter.filter_vectors(self.V)
        self.assertEqual(len(result), 5)
        self.assertIn(self.V[0], result)
        self.assertIn(self.V[1], result)
        self.assertIn(self.V[4], result)
        self.assertIn(self.V[2], result)
        self.assertIn(self.V[3], result)

    def test_unique(self):
        W = self.V
        W.append((numpy.array([7]), 'data8', 2.8))
        W.append((numpy.array([0]), 'data1', 2.8))
        W.append((numpy.array([1]), 'data2', 2.8))
        W.append((numpy.array([6]), 'data7', 2.8))
        result = self.unique.filter_vectors(W)
        self.assertEqual(len(result), 8)
def test_retrieval(self):
    # We want 12 projections, 20 results at least
    rbpt = RandomBinaryProjectionTree('testHash', 12, 20)

    # Create engine for 100 dimensional feature space, do not forget to set
    # the nearest filter to 20, because the default is 10
    self.engine = Engine(100, lshashes=[rbpt],
                         vector_filters=[NearestFilter(20)])

    # First insert 200000 random vectors
    for k in range(200000):
        x = numpy.random.randn(100)
        x_data = 'data {}'.format(k)
        self.engine.store_vector(x, x_data)

    # Now do random queries and check result set size
    for k in range(10):
        x = numpy.random.randn(100)
        n = self.engine.neighbours(x)
        self.assertEqual(len(n), 20)
def test_storage_redis(self):
    # We want 10 projections, 20 results at least
    rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

    # Create engine for 100 dimensional feature space
    self.engine = Engine(100, lshashes=[rbpt],
                         vector_filters=[NearestFilter(20)])

    # First insert 2000 random vectors
    for k in range(2000):
        x = numpy.random.randn(100)
        x_data = 'data'
        self.engine.store_vector(x, x_data)

    # Persist the hash configuration and restore it into a fresh hash
    self.redis_storage.store_hash_configuration(rbpt)
    rbpt2 = RandomBinaryProjectionTree(None, None, None)
    rbpt2.apply_config(
        self.redis_storage.load_hash_configuration('testHash'))

    self.assertEqual(rbpt.dim, rbpt2.dim)
    self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
    self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

    for i in range(rbpt.normals.shape[0]):
        for j in range(rbpt.normals.shape[1]):
            self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

    # Now do random queries and check result set size
    for k in range(10):
        x = numpy.random.randn(100)
        keys1 = rbpt.hash_vector(x, querying=True)
        keys2 = rbpt2.hash_vector(x, querying=True)
        self.assertEqual(len(keys1), len(keys2))
        for k in range(len(keys1)):
            self.assertEqual(keys1[k], keys2[k])
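# A minimal sketch of the fixture test_storage_redis relies on; it is not
# shown in the excerpt above. It assumes the redis-py client, nearpy's
# RedisStorage wrapper, and a Redis server listening on localhost:6379.
def setUp(self):
    from redis import Redis
    from nearpy.storage import RedisStorage
    self.redis_storage = RedisStorage(Redis(host='localhost', port=6379, db=0))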
def __init__(self, dim, lshashes=None, distance=None,
             fetch_vector_filters=None, vector_filters=None, storage=None):
    """ Keeps the configuration. """
    if lshashes is None:
        lshashes = [RandomBinaryProjections('default', 10)]
    self.lshashes = lshashes
    if distance is None:
        distance = CosineDistance()
    self.distance = distance
    if vector_filters is None:
        vector_filters = [NearestFilter(10)]
    self.vector_filters = vector_filters
    if fetch_vector_filters is None:
        fetch_vector_filters = [UniqueFilter()]
    self.fetch_vector_filters = fetch_vector_filters
    if storage is None:
        storage = MemoryStorage()
    self.storage = storage

    # Initialize all hashes for the data space dimension.
    for lshash in self.lshashes:
        lshash.reset(dim)
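# A minimal usage sketch (not part of the library source) illustrating the
# defaults wired up in __init__ above: RandomBinaryProjections, CosineDistance,
# NearestFilter(10), UniqueFilter and MemoryStorage. The import paths below
# follow the usual nearpy package layout and are assumptions of this sketch.
import numpy
from nearpy import Engine

engine = Engine(100)  # all defaults, 100-dimensional data space
engine.store_vector(numpy.random.randn(100), 'some data')
# neighbours() yields (vector, data, distance) tuples, at most 10 by default
results = engine.neighbours(numpy.random.randn(100))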
# Create data set from two clusters
vectors = []
center = numpy.random.randn(dimension)
for index in range(vector_count // 2):
    vector = center + 0.01 * numpy.random.randn(dimension)
    vectors.append(vector)
center = numpy.random.randn(dimension)
for index in range(vector_count // 2):
    vector = center + 0.01 * numpy.random.randn(dimension)
    vectors.append(vector)

# We are looking for the N closest neighbours
N = 20
nearest = NearestFilter(N)

# We will fill this list with all the engines we want to test
engines = []

print('Creating engines...')

# We are going to test these bin widths
bin_widths = [0.01 * x for x in range(1, 5)]

# Create engines for all configurations
for bin_width in bin_widths:
    # Use four random discretized projection hashes, four projections each
    rdp1 = RandomDiscretizedProjections('rdp1', 4, bin_width)
    rdp2 = RandomDiscretizedProjections('rdp2', 4, bin_width)
    rdp3 = RandomDiscretizedProjections('rdp3', 4, bin_width)
    rdp4 = RandomDiscretizedProjections('rdp4', 4, bin_width)
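    # The listing ends here. The lines below are a hypothetical continuation
    # (not part of the original script) showing how the four hashes would
    # typically be combined into one Engine per bin width and collected for
    # the experiment; the name `engine` is introduced only for this sketch.
    engine = Engine(dimension, lshashes=[rdp1, rdp2, rdp3, rdp4],
                    vector_filters=[nearest])
    engines.append(engine)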