def test_remove_from_index(self):
    """Removing hashes that are present drops them from the index set."""
    hash_index = LinearHashIndex()
    hash_index.index = {0, 1, 2}
    to_remove = [[0, 0],
                 [1, 0]]
    # noinspection PyTypeChecker
    hash_index.remove_from_index(to_remove)
    # Only the hash that was not requested for removal should remain.
    self.assertSetEqual(hash_index.index, {1})
def test_build_index_with_cache(self):
    """Building with a cache element fills both the index and the cache."""
    cache = DataMemoryElement()
    hash_index = LinearHashIndex(cache)
    hash_vectors = [[0, 1, 0],
                    [1, 0, 0],
                    [0, 1, 1],
                    [0, 0, 1]]
    # noinspection PyTypeChecker
    hash_index.build_index(hash_vectors)
    nose.tools.assert_equal(hash_index.index, {1, 2, 3, 4})
    nose.tools.assert_false(cache.is_empty())
def test_save_cache_remove_from_index(self):
    """The cache content must follow the index across a removal."""
    cache = DataMemoryElement()
    self.assertTrue(cache.is_empty())

    def cached_hashes():
        # Decode whatever is currently stored in the cache element.
        return set(numpy.load(BytesIO(cache.get_bytes())))

    hash_index = LinearHashIndex(cache)
    initial_vectors = [
        [0, 1, 0],  # 2
        [0, 1, 1],  # 3
        [1, 0, 0],  # 4
        [1, 1, 0],  # 6
    ]
    # noinspection PyTypeChecker
    hash_index.build_index(initial_vectors)
    self.assertFalse(cache.is_empty())
    self.assertSetEqual(cached_hashes(), {2, 3, 4, 6})

    removed_vectors = [
        [0, 1, 1],  # 3
        [1, 0, 0],  # 4
    ]
    # noinspection PyTypeChecker
    hash_index.remove_from_index(removed_vectors)
    self.assertFalse(cache.is_empty())
    self.assertSetEqual(cached_hashes(), {2, 6})
def test_from_config_with_cache(self):
    """A config naming a memory cache element yields an index using it."""
    config = LinearHashIndex.get_default_config()
    cache_type = \
        'smqtk.representation.data_element.memory_element.DataMemoryElement'
    config['cache_element']['type'] = cache_type
    hash_index = LinearHashIndex.from_config(config)
    self.assertIsInstance(hash_index.cache_element, DataMemoryElement)
    self.assertEqual(hash_index.index, set())
def nn(self, d, n=1):
    """
    Return the nearest `N` neighbors to the given descriptor element.

    The query descriptor's vector is hashed with ``self.lsh_functor``, the
    nearest hash codes are requested from a hash index, the descriptor UUIDs
    mapped to those codes are collected from ``self._hash2uuid``, and the
    fetched descriptors are ranked by ``self._distance_function`` against
    the query vector.

    :param d: Descriptor element to compute the neighbors of.
    :type d: smqtk.representation.DescriptorElement
    :param n: Number of nearest neighbors to find.
    :type n: int

    :return: Tuple of nearest N DescriptorElement instances, and a tuple of
        the distance values to those neighbors, in ascending distance
        order. The pair is returned as a concrete list so that it can be
        unpacked, indexed, or iterated more than once.
    :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

    """
    # Parent implementation is invoked for its side effects only; its return
    # value is not used here (presumably input validation -- confirm).
    super(LSHNearestNeighborIndex, self).nn(d, n)

    self._log.debug("generating hash for descriptor")
    d_v = d.vector()
    d_h = self.lsh_functor.get_hash(d_v)

    def comp_descr_dist(d2_v):
        return self._distance_function(d_v, d2_v)

    self._log.debug("getting near hashes")
    hi = self.hash_index
    # Make on-the-fly linear index if we weren't originally set with one
    if hi is None:
        hi = LinearHashIndex()
        # not calling ``build_index`` because we already have the int
        # hashes.
        with self._hash2uuid_lock:
            # ``list(...)`` is required: on Python 3 ``dict.keys()`` is a
            # view, and ``numpy.array(view)`` would produce a 0-d object
            # array instead of a 1-d array of hash integers.
            hi.index = numpy.array(list(self._hash2uuid.keys()))
    # Distances between hash codes are not needed past this point.
    hashes, _ = hi.nn(d_h, n)

    self._log.debug("getting UUIDs of descriptors for nearby hashes")
    neighbor_uuids = []
    with self._hash2uuid_lock:
        for h_int in map(bit_vector_to_int_large, hashes):
            # If descriptor hash not in our map, we effectively skip it
            neighbor_uuids.extend(self._hash2uuid.get(h_int, ()))
    self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

    self._log.debug("getting descriptors for neighbor_uuids")
    neighbors = \
        list(self.descriptor_index.get_many_descriptors(neighbor_uuids))

    self._log.debug("ordering descriptors via distance method '%s'",
                    self.distance_method)
    self._log.debug('-- getting element vectors')
    neighbor_vectors = elements_to_matrix(neighbors,
                                          report_interval=1.0)
    self._log.debug('-- calculating distances')
    distances = list(map(comp_descr_dist, neighbor_vectors))
    self._log.debug('-- ordering')
    ordered = sorted(zip(neighbors, distances),
                     key=lambda p: p[1])
    self._log.debug('-- slicing top n=%d', n)
    # Materialize the result: a bare ``zip`` is a one-shot iterator on
    # Python 3, while on Python 2 this is the same list as before.
    return list(zip(*(ordered[:n])))
def test_update_index_no_index(self):
    """Updating an index with no prior content acts like a fresh build."""
    hash_index = LinearHashIndex()
    new_vectors = [[0, 1, 0],
                   [1, 0, 0],
                   [0, 1, 1],
                   [0, 0, 1]]
    # noinspection PyTypeChecker
    hash_index.update_index(new_vectors)
    self.assertEqual(hash_index.index, {1, 2, 3, 4})
    self.assertIsNone(hash_index.cache_element)
def test_build_index_no_cache(self):
    """Building without a cache element leaves ``cache_element`` as None."""
    hash_index = LinearHashIndex()
    hash_vectors = [[0, 1, 0],
                    [1, 0, 0],
                    [0, 1, 1],
                    [0, 0, 1]]
    # noinspection PyTypeChecker
    hash_index.build_index(hash_vectors)
    self.assertEqual(hash_index.index, {1, 2, 3, 4})
    self.assertIsNone(hash_index.cache_element)
def test_save_cache(self):
    """Building the index should write bytes into an initially empty cache."""
    cache = DataMemoryElement()
    nose.tools.assert_true(cache.is_empty())

    hash_index = LinearHashIndex(cache)
    hash_vectors = [[0, 1, 0],
                    [1, 0, 0],
                    [0, 1, 1],
                    [0, 0, 1]]
    # noinspection PyTypeChecker
    hash_index.build_index(hash_vectors)

    nose.tools.assert_false(cache.is_empty())
    byte_count = len(cache.get_bytes())
    nose.tools.assert_true(byte_count > 0)
def nn(self, d, n=1):
    """
    Return the nearest `N` neighbors to the given descriptor element.

    The query descriptor's vector is hashed with ``self.lsh_functor``, the
    nearest hash codes are requested from a hash index, the descriptor UUIDs
    stored under those codes in ``self.hash2uuids_kvstore`` are collected,
    and the fetched descriptors are ranked by ``self._distance_function``
    against the query vector.

    :param d: Descriptor element to compute the neighbors of.
    :type d: smqtk.representation.DescriptorElement
    :param n: Number of nearest neighbors to find.
    :type n: int

    :return: Tuple of nearest N DescriptorElement instances, and a tuple of
        the distance values to those neighbors, in ascending distance
        order. The pair is returned as a concrete list so that it can be
        unpacked, indexed, or iterated more than once.
    :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

    """
    # Parent implementation is invoked for its side effects only; its return
    # value is not used here (presumably input validation -- confirm).
    super(LSHNearestNeighborIndex, self).nn(d, n)

    self._log.debug("generating hash for descriptor")
    d_v = d.vector()
    d_h = self.lsh_functor.get_hash(d_v)

    def comp_descr_dist(d2_v):
        return self._distance_function(d_v, d2_v)

    self._log.debug("getting near hashes")
    hi = self.hash_index
    if hi is None:
        # Make on-the-fly linear index
        hi = LinearHashIndex()
        # not calling ``build_index`` because we already have the int
        # hashes.
        hi.index = numpy.array(list(self.hash2uuids_kvstore.keys()))
    near_hashes, _ = hi.nn(d_h, n)

    self._log.debug("getting UUIDs of descriptors for nearby hashes")
    neighbor_uuids = []
    for h_int in map(bit_vector_to_int_large, near_hashes):
        # If descriptor hash not in our map, we effectively skip it
        #: :type: collections.Iterable
        near_uuids = self.hash2uuids_kvstore.get(h_int, ())
        neighbor_uuids.extend(near_uuids)
    self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

    self._log.debug("getting descriptors for neighbor_uuids")
    neighbors = \
        list(self.descriptor_index.get_many_descriptors(neighbor_uuids))

    self._log.debug("ordering descriptors via distance method '%s'",
                    self.distance_method)
    self._log.debug('-- getting element vectors')
    neighbor_vectors = elements_to_matrix(neighbors,
                                          report_interval=1.0)
    self._log.debug('-- calculating distances')
    distances = list(map(comp_descr_dist, neighbor_vectors))
    self._log.debug('-- ordering')
    ordered = sorted(zip(neighbors, distances),
                     key=lambda p: p[1])
    self._log.debug('-- slicing top n=%d', n)
    # Materialize the result: a bare ``zip`` is a one-shot iterator on
    # Python 3, which cannot be indexed or consumed twice by the caller.
    return list(zip(*(ordered[:n])))
def test_build_index_with_cache(self):
    """Building with a cache element fills both the index and the cache."""
    cache = DataMemoryElement()
    hash_index = LinearHashIndex(cache)
    hash_vectors = [[0, 1, 0],
                    [1, 0, 0],
                    [0, 1, 1],
                    [0, 0, 1]]
    # noinspection PyTypeChecker
    hash_index.build_index(hash_vectors)
    self.assertEqual(hash_index.index, {1, 2, 3, 4})
    self.assertFalse(cache.is_empty())
def test_remove_from_index_single_not_in_index(self):
    """Removing a single hash that is absent raises and changes nothing."""
    hash_index = LinearHashIndex()
    hash_index.index = {0, 1, 2}
    missing = [[1, 0, 0]]  # 4
    with self.assertRaises(KeyError):
        hash_index.remove_from_index(missing)
    # The failed removal must leave the index untouched.
    self.assertSetEqual(hash_index.index, {0, 1, 2})
def test_get_config(self):
    """``get_config`` mirrors the defaults, then reflects a set cache."""
    hash_index = LinearHashIndex()
    expected = LinearHashIndex.get_default_config()

    # No cache element set: configuration equals the defaults.
    self.assertEqual(hash_index.get_config(), expected)

    # Cache element set: its type name appears in the configuration.
    hash_index.cache_element = DataMemoryElement()
    expected['cache_element']['type'] = 'DataMemoryElement'
    self.assertEqual(hash_index.get_config(), expected)
def test_load_cache(self):
    """A second index built on the same cache should match the first."""
    shared_cache = DataMemoryElement()
    first = LinearHashIndex(shared_cache)
    hash_vectors = [[0, 1, 0],
                    [1, 0, 0],
                    [0, 1, 1],
                    [0, 0, 1]]
    # noinspection PyTypeChecker
    first.build_index(hash_vectors)

    # Constructing on the populated cache is expected to load its content.
    second = LinearHashIndex(shared_cache)
    nose.tools.assert_equal(first.cache_element, second.cache_element)
    nose.tools.assert_equal(first.index, second.index)
def test_nn(self):
    """Neighbors of the zero code come back in two distance tiers."""
    hash_index = LinearHashIndex()
    hash_vectors = [[0, 1, 0],
                    [1, 1, 0],
                    [0, 1, 1],
                    [0, 0, 1]]
    # noinspection PyTypeChecker
    hash_index.build_index(hash_vectors)

    near_codes, near_dists = hash_index.nn([0, 0, 0], 4)

    # Order within a tier is not asserted, so compare each tier as a set.
    closest = {tuple(code) for code in near_codes[:2]}
    nose.tools.assert_equal(closest, {(0, 1, 0), (0, 0, 1)})
    farthest = {tuple(code) for code in near_codes[2:]}
    nose.tools.assert_equal(farthest, {(1, 1, 0), (0, 1, 1)})

    expected_dists = (1 / 3., 1 / 3., 2 / 3., 2 / 3.)
    numpy.testing.assert_array_almost_equal(near_dists, expected_dists)
def test_update_index_add_hashes(self):
    """Updating an existing index adds the new hashes to the old ones."""
    hash_index = LinearHashIndex()

    # Seed the index with an initial pair of hashes.
    first_batch = [[0, 0], [0, 1]]
    # noinspection PyTypeChecker
    hash_index.build_index(first_batch)
    self.assertSetEqual(hash_index.index, {0, 1})

    # Add a second pair through the update path.
    second_batch = [[1, 0], [1, 1]]
    # noinspection PyTypeChecker
    hash_index.update_index(second_batch)
    self.assertSetEqual(hash_index.index, {0, 1, 2, 3})
def test_save_cache_build_index(self):
    """After a build, the cache bytes decode to the indexed hash integers."""
    cache = DataMemoryElement()
    self.assertTrue(cache.is_empty())

    hash_index = LinearHashIndex(cache)
    hash_vectors = [[0, 1, 0],
                    [1, 0, 0],
                    [0, 1, 1],
                    [0, 0, 1]]
    # noinspection PyTypeChecker
    hash_index.build_index(hash_vectors)
    self.assertFalse(cache.is_empty())

    # Decode the stored bytes and compare against the expected hash set.
    stored = set(numpy.load(BytesIO(cache.get_bytes())))
    self.assertSetEqual({1, 2, 3, 4}, stored)
def test_remove_from_index_one_of_many_not_in_index(self):
    """A removal batch with one absent hash raises and changes nothing."""
    hash_index = LinearHashIndex()
    hash_index.index = {0, 1, 2}
    to_remove = [
        [0, 0],  # 0
        [0, 1],  # 1
        [1, 1],  # 3
    ]
    with self.assertRaises(KeyError):
        hash_index.remove_from_index(to_remove)
    # Check that the index has not been modified.
    self.assertSetEqual(hash_index.index, {0, 1, 2})
def test_load_cache(self):
    """A second index built on the same cache should match the first."""
    shared_cache = DataMemoryElement()
    first = LinearHashIndex(shared_cache)
    hash_vectors = [[0, 1, 0],
                    [1, 0, 0],
                    [0, 1, 1],
                    [0, 0, 1]]
    # noinspection PyTypeChecker
    first.build_index(hash_vectors)

    # Constructing on the populated cache is expected to load its content.
    second = LinearHashIndex(shared_cache)
    self.assertEqual(first.cache_element, second.cache_element)
    self.assertEqual(first.index, second.index)
def test_nn(self):
    """Neighbors of the zero code come back in two distance tiers."""
    hash_index = LinearHashIndex()
    hash_vectors = [[0, 1, 0],
                    [1, 1, 0],
                    [0, 1, 1],
                    [0, 0, 1]]
    # noinspection PyTypeChecker
    hash_index.build_index(hash_vectors)

    # noinspection PyTypeChecker
    near_codes, near_dists = hash_index.nn([0, 0, 0], 4)

    # Order within a tier is not asserted, so compare each tier as a set.
    closest = {tuple(code) for code in near_codes[:2]}
    self.assertEqual(closest, {(0, 1, 0), (0, 0, 1)})
    farthest = {tuple(code) for code in near_codes[2:]}
    self.assertEqual(farthest, {(1, 1, 0), (0, 1, 1)})

    expected_dists = (1/3., 1/3., 2/3., 2/3.)
    numpy.testing.assert_array_almost_equal(near_dists, expected_dists)
def test_remove_from_index_one_of_many_not_in_index(self):
    """A removal batch with one absent hash raises and changes nothing."""
    original_hashes = {0, 1, 2}
    hash_index = LinearHashIndex()
    hash_index.index = set(original_hashes)

    batch = [
        [0, 0],  # 0
        [0, 1],  # 1
        [1, 1],  # 3
    ]
    with self.assertRaises(KeyError):
        hash_index.remove_from_index(batch)

    # Check that the index has not been modified.
    self.assertSetEqual(hash_index.index, {0, 1, 2})
def test_update_index_with_hash_index(self):
    """Updating the LSH index should also extend its linear hash index."""
    # Similar test to `test_update_index_add_new_descriptors` but with a
    # linear hash index.
    descr_set = MemoryDescriptorSet()
    hash_kvs = MemoryKeyValueStore()
    linear_hi = LinearHashIndex()  # simplest hash index, heap-sorts.
    lsh_index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs, linear_hi)

    first_batch = [DescriptorMemoryElement('t', uid) for uid in range(5)]
    second_batch = [DescriptorMemoryElement('t', uid) for uid in (5, 6)]

    # Vectors of length 1 for easy dummy hashing prediction.
    for descr in first_batch + second_batch:
        descr.set_vector(np.ones(1, float) * descr.uuid())

    # Build initial index.
    lsh_index.build_index(first_batch)
    # Initial hash index should only encode hashes for first batch of
    # descriptors.
    self.assertSetEqual(linear_hi.index, {0, 1, 2, 3, 4})

    # Update index and check that components have new data.
    lsh_index.update_index(second_batch)
    # Now the hash index should include all descriptor hashes.
    self.assertSetEqual(linear_hi.index, {0, 1, 2, 3, 4, 5, 6})
def test_save_cache_remove_from_index(self):
    """The cache content must follow the index across a removal."""
    cache = DataMemoryElement()
    self.assertTrue(cache.is_empty())
    hash_index = LinearHashIndex(cache)

    def read_cache():
        # Decode whatever is currently stored in the cache element.
        return set(numpy.load(BytesIO(cache.get_bytes())))

    to_build = [
        [0, 1, 0],  # 2
        [0, 1, 1],  # 3
        [1, 0, 0],  # 4
        [1, 1, 0],  # 6
    ]
    # noinspection PyTypeChecker
    hash_index.build_index(to_build)
    self.assertFalse(cache.is_empty())
    self.assertSetEqual(read_cache(), {2, 3, 4, 6})

    to_drop = [
        [0, 1, 1],  # 3
        [1, 0, 0],  # 4
    ]
    # noinspection PyTypeChecker
    hash_index.remove_from_index(to_drop)
    self.assertFalse(cache.is_empty())
    self.assertSetEqual(read_cache(), {2, 6})
def test_configuration(self):
    """Instances yielded by the config helper keep the configured parts."""
    index = LSHNearestNeighborIndex(
        lsh_functor=ItqFunctor(),
        descriptor_set=MemoryDescriptorSet(),
        hash2uuids_kvstore=MemoryKeyValueStore(),
        hash_index=LinearHashIndex(),
        distance_method='euclidean',
        read_only=True,
    )
    for clone in configuration_test_helper(index):  # type: LSHNearestNeighborIndex
        # Component types must match what was configured.
        assert isinstance(clone.lsh_functor, LshFunctor)
        assert isinstance(clone.descriptor_set, MemoryDescriptorSet)
        assert isinstance(clone.hash_index, LinearHashIndex)
        assert isinstance(clone.hash2uuids_kvstore, MemoryKeyValueStore)
        # Scalar options must keep their values.
        assert clone.distance_method == 'euclidean'
        assert clone.read_only is True
def test_get_config(self):
    """``get_config`` mirrors the defaults, then reflects a set cache."""
    hash_index = LinearHashIndex()
    expected = LinearHashIndex.get_default_config()

    # No cache element set: configuration equals the defaults.
    nose.tools.assert_equal(hash_index.get_config(), expected)

    # Cache element set: its type name appears in the configuration.
    hash_index.cache_element = DataMemoryElement()
    expected['cache_element']['type'] = 'DataMemoryElement'
    nose.tools.assert_equal(hash_index.get_config(), expected)
def test_get_config(self):
    """``get_config`` mirrors the defaults, then reflects a set cache."""
    hash_index = LinearHashIndex()
    expected = LinearHashIndex.get_default_config()

    # No cache element set: configuration equals the defaults.
    self.assertEqual(hash_index.get_config(), expected)

    # Cache element set: its fully qualified type appears in the config.
    hash_index.cache_element = DataMemoryElement()
    cache_type = \
        'smqtk.representation.data_element.memory_element.DataMemoryElement'
    expected['cache_element']['type'] = cache_type
    self.assertEqual(hash_index.get_config(), expected)
def test_build_index_fresh_build_with_hash_index(self):
    """A fresh LSH build should populate the attached linear hash index."""
    descr_set = MemoryDescriptorSet()
    hash_kvs = MemoryKeyValueStore()
    linear_hi = LinearHashIndex()  # simplest hash index, heap-sorts.
    lsh_index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs, linear_hi)

    descriptors = [DescriptorMemoryElement('t', uid) for uid in range(5)]
    # Vectors of length 1 for easy dummy hashing prediction.
    for position, descr in enumerate(descriptors):
        descr.set_vector(np.ones(1, float) * position)

    lsh_index.build_index(descriptors)

    # Hash index should have been built with hash vectors, and linearHI
    # converts those to integers for storage.
    self.assertEqual(linear_hi.index, {0, 1, 2, 3, 4})
def test_from_config_with_cache(self):
    """A config naming a memory cache element yields an index using it."""
    config = LinearHashIndex.get_default_config()
    config['cache_element']['type'] = "DataMemoryElement"
    hash_index = LinearHashIndex.from_config(config)
    self.assertIsInstance(hash_index.cache_element, DataMemoryElement)
    self.assertEqual(hash_index.index, set())
def test_build_index_no_input(self):
    """Building from an empty iterable is rejected with a ValueError."""
    hash_index = LinearHashIndex()
    no_hashes = []
    nose.tools.assert_raises_regexp(
        ValueError, "No hashes given to index",
        hash_index.build_index, no_hashes
    )
def test_from_config_no_cache(self):
    """The default config builds an empty index with no cache element."""
    # Default config is valid and specifies no cache.
    default_config = LinearHashIndex.get_default_config()
    hash_index = LinearHashIndex.from_config(default_config)
    self.assertIsNone(hash_index.cache_element)
    self.assertEqual(hash_index.index, set())
def test_build_index_no_cache(self):
    """Building without a cache element leaves ``cache_element`` as None."""
    hash_index = LinearHashIndex()
    hash_vectors = [[0, 1, 0],
                    [1, 0, 0],
                    [0, 1, 1],
                    [0, 0, 1]]
    # noinspection PyTypeChecker
    hash_index.build_index(hash_vectors)
    nose.tools.assert_equal(hash_index.index, {1, 2, 3, 4})
    nose.tools.assert_is_none(hash_index.cache_element)
def test_from_config_no_cache(self):
    """The default config builds an empty index with no cache element."""
    # Default config is valid and specifies no cache.
    default_config = LinearHashIndex.get_default_config()
    hash_index = LinearHashIndex.from_config(default_config)
    nose.tools.assert_is_none(hash_index.cache_element)
    nose.tools.assert_equal(hash_index.index, set())
def test_from_config_with_cache(self):
    """A config naming a memory cache element yields an index using it."""
    config = LinearHashIndex.get_default_config()
    config['cache_element']['type'] = "DataMemoryElement"
    hash_index = LinearHashIndex.from_config(config)
    nose.tools.assert_is_instance(hash_index.cache_element,
                                  DataMemoryElement)
    nose.tools.assert_equal(hash_index.index, set())
def test_is_usable(self):
    """``LinearHashIndex`` is expected to always report itself usable."""
    # Should always be true since this impl does not have special deps.
    usable = LinearHashIndex.is_usable()
    nose.tools.assert_true(usable)
def test_default_config(self):
    """The default config has one entry: an untyped ``cache_element``."""
    default_config = LinearHashIndex.get_default_config()
    nose.tools.assert_equal(len(default_config), 1)
    cache_type = default_config['cache_element']['type']
    nose.tools.assert_is_none(cache_type)
def _nn(self, d, n=1):
    """
    Internal method to be implemented by sub-classes to return the nearest
    `N` neighbors to the given descriptor element.

    When this internal method is called, we have already checked that there
    is a vector in ``d`` and our index is not empty.

    The query vector is hashed, nearby hash codes are looked up, the
    descriptors stored under those codes are fetched while holding
    ``self._model_lock``, and the candidates are then ranked by
    ``self._distance_function`` outside of the lock.

    :param d: Descriptor element to compute the neighbors of.
    :type d: smqtk.representation.DescriptorElement
    :param n: Number of nearest neighbors to find.
    :type n: int

    :return: Tuple of nearest N DescriptorElement instances, and a tuple of
        the distance values to those neighbors.
    :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

    """
    self._log.debug("generating hash for descriptor")
    query_vec = d.vector()
    query_hash = self.lsh_functor.get_hash(query_vec)

    with self._model_lock:
        self._log.debug("getting near hashes")
        hash_index = self.hash_index
        if hash_index is None:
            # No hash index configured: use a throw-away linear one. The
            # integer hashes are already known, so the index content is
            # assigned directly instead of going through ``build_index``.
            hash_index = LinearHashIndex()
            known_hashes = list(self.hash2uuids_kvstore.keys())
            hash_index.index = numpy.array(known_hashes)
        near_hashes, _ = hash_index.nn(query_hash, n)

        self._log.debug("getting UUIDs of descriptors for nearby hashes")
        neighbor_uuids = []
        for near_hash in near_hashes:
            hash_key = bit_vector_to_int_large(near_hash)
            # A hash code missing from the store contributes no UUIDs.
            neighbor_uuids.extend(
                self.hash2uuids_kvstore.get(hash_key, set())
            )
        self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

        self._log.debug("getting descriptors for neighbor_uuids")
        candidates = self.descriptor_index.get_many_descriptors(
            neighbor_uuids
        )
        neighbors = list(candidates)
        # Model parts are no longer needed, so the lock is released here.

    self._log.debug("ordering descriptors via distance method '%s'",
                    self.distance_method)
    self._log.debug('-- getting element vectors')
    neighbor_vectors = elements_to_matrix(neighbors,
                                          report_interval=1.0)
    self._log.debug('-- calculating distances')
    distances = [self._distance_function(query_vec, vec)
                 for vec in neighbor_vectors]
    self._log.debug('-- ordering')
    ranked = sorted(zip(neighbors, distances),
                    key=lambda pair: pair[1])
    self._log.debug('-- slicing top n=%d', n)
    top_n = ranked[:n]
    return list(zip(*top_n))
def test_default_config(self):
    """The default config has one entry: an untyped ``cache_element``."""
    default_config = LinearHashIndex.get_default_config()
    self.assertEqual(len(default_config), 1)
    cache_type = default_config['cache_element']['type']
    self.assertIsNone(cache_type)
def test_build_index_no_input(self):
    """Building from an empty iterable is rejected with a ValueError."""
    hash_index = LinearHashIndex()
    with self.assertRaises(ValueError):
        hash_index.build_index([])
def test_update_index_no_input(self):
    """Updating from an empty iterable is rejected with a ValueError."""
    hash_index = LinearHashIndex()
    with self.assertRaises(ValueError):
        hash_index.update_index([])
def test_is_usable(self):
    """``LinearHashIndex`` is expected to always report itself usable."""
    # Should always be true since this impl does not have special deps.
    usable = LinearHashIndex.is_usable()
    self.assertTrue(usable)
def test_save_cache_readonly(self):
    """Building against a read-only cache element raises a ValueError."""
    readonly_cache = DataMemoryElement(readonly=True)
    hash_index = LinearHashIndex(readonly_cache)
    hash_vectors = [[0, 1, 0],
                    [1, 0, 0],
                    [0, 1, 1],
                    [0, 0, 1]]
    nose.tools.assert_raises_regexp(
        ValueError, "is read-only",
        hash_index.build_index, hash_vectors
    )
def _make_hi_linear(self):
    """Create a new ``LinearHashIndex`` constructed with no arguments."""
    linear_index = LinearHashIndex()
    return linear_index
def _nn(self, d, n=1):
    """
    Internal method to be implemented by sub-classes to return the nearest
    `N` neighbors to the given descriptor element.

    When this internal method is called, we have already checked that there
    is a vector in ``d`` and our index is not empty.

    The query vector is hashed, nearby hash codes are looked up, the
    descriptors stored under those codes are fetched while holding
    ``self._model_lock``, and the candidates are then ranked by
    ``self._distance_function`` outside of the lock.

    :param d: Descriptor element to compute the neighbors of.
    :type d: smqtk.representation.DescriptorElement
    :param n: Number of nearest neighbors to find.
    :type n: int

    :return: Tuple of nearest N DescriptorElement instances, and a tuple of
        the distance values to those neighbors.
    :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

    """
    self._log.debug("generating hash for descriptor")
    query_vec = d.vector()
    query_hash = self.lsh_functor.get_hash(query_vec)

    with self._model_lock:
        self._log.debug("getting near hashes")
        hash_index = self.hash_index
        if hash_index is None:
            # No hash index configured: use a throw-away linear one. The
            # integer hashes are already known, so the index content is
            # assigned directly instead of going through ``build_index``.
            hash_index = LinearHashIndex()
            hash_index.index = set(self.hash2uuids_kvstore.keys())
        near_hashes, _ = hash_index.nn(query_hash, n)

        self._log.debug("getting UUIDs of descriptors for nearby hashes")
        neighbor_uuids = []
        for near_hash in near_hashes:
            hash_key = bit_vector_to_int_large(near_hash)
            # A hash code missing from the store contributes no UUIDs.
            neighbor_uuids.extend(
                self.hash2uuids_kvstore.get(hash_key, set())
            )
        self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

        self._log.debug("getting descriptors for neighbor_uuids")
        candidates = self.descriptor_index.get_many_descriptors(
            neighbor_uuids
        )
        neighbors = list(candidates)
        # Model parts are no longer needed, so the lock is released here.

    self._log.debug("ordering descriptors via distance method '%s'",
                    self.distance_method)
    self._log.debug('-- getting element vectors')
    neighbor_vectors = elements_to_matrix(neighbors,
                                          report_interval=1.0)
    self._log.debug('-- calculating distances')
    distances = [self._distance_function(query_vec, vec)
                 for vec in neighbor_vectors]
    self._log.debug('-- ordering')
    ranked = sorted(zip(neighbors, distances),
                    key=lambda pair: pair[1])
    self._log.debug('-- slicing top n=%d', n)
    top_n = ranked[:n]
    return list(zip(*top_n))