def test_remove_from_index(self):
    """Removing hashes that are in the index actually drops them."""
    lhi = LinearHashIndex()
    lhi.index = {0, 1, 2}
    # Two bit-vector hashes to remove; only integer code 1 is expected to
    # remain in the index afterwards.
    to_remove = [[0, 0], [1, 0]]
    # noinspection PyTypeChecker
    lhi.remove_from_index(to_remove)
    self.assertSetEqual(lhi.index, {1})
def nn(self, d, n=1):
    """
    Return the nearest `N` neighbors to the given descriptor element.

    The descriptor's vector is hashed with ``self.lsh_functor``, nearby hash
    codes are requested from ``self.hash_index`` (or from a temporary
    ``LinearHashIndex`` populated with the keys of ``self._hash2uuid`` when
    no hash index is set), and the descriptors mapped to those codes are
    ranked by ``self._distance_function``.

    :param d: Descriptor element to compute the neighbors of.
    :type d: smqtk.representation.DescriptorElement
    :param n: Number of nearest neighbors to find.
    :type n: int

    :return: Two-item list: a tuple of the nearest N DescriptorElement
        instances, and a tuple of the distance values to those neighbors.
    :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

    """
    # Base-class hook; its return value is not used here.
    super(LSHNearestNeighborIndex, self).nn(d, n)

    self._log.debug("generating hash for descriptor")
    d_v = d.vector()
    d_h = self.lsh_functor.get_hash(d_v)

    self._log.debug("getting near hashes")
    hi = self.hash_index
    # Make on-the-fly linear index if we weren't originally set with one
    if hi is None:
        hi = LinearHashIndex()
        # Not calling ``build_index`` because we already have the int
        # hashes.
        with self._hash2uuid_lock:
            # ``list(...)`` is required: on Python 3 ``dict.keys()`` is a
            # view, which numpy would wrap as a single 0-d object array
            # instead of an array of hash codes.
            hi.index = numpy.array(list(self._hash2uuid.keys()))
    # Only the hashes are needed; the hash-space distances are discarded.
    hashes, _ = hi.nn(d_h, n)

    self._log.debug("getting UUIDs of descriptors for nearby hashes")
    neighbor_uuids = []
    with self._hash2uuid_lock:
        for h_int in map(bit_vector_to_int_large, hashes):
            # If descriptor hash not in our map, we effectively skip it
            neighbor_uuids.extend(self._hash2uuid.get(h_int, ()))
    self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

    self._log.debug("getting descriptors for neighbor_uuids")
    neighbors = \
        list(self.descriptor_index.get_many_descriptors(neighbor_uuids))

    self._log.debug("ordering descriptors via distance method '%s'",
                    self.distance_method)
    self._log.debug('-- getting element vectors')
    neighbor_vectors = elements_to_matrix(neighbors, report_interval=1.0)
    self._log.debug('-- calculating distances')
    # Concrete list rather than a lazy ``map`` object.
    distances = [self._distance_function(d_v, d2_v)
                 for d2_v in neighbor_vectors]
    self._log.debug('-- ordering')
    ordered = sorted(zip(neighbors, distances), key=lambda p: p[1])
    self._log.debug('-- slicing top n=%d', n)
    # Materialize so callers get an indexable result on Python 3 too.
    return list(zip(*(ordered[:n])))
def nn(self, d, n=1):
    """
    Return the nearest `N` neighbors to the given descriptor element.

    The descriptor's vector is hashed with ``self.lsh_functor``, nearby hash
    codes are requested from ``self.hash_index`` (or from a temporary
    ``LinearHashIndex`` populated with the keys of
    ``self.hash2uuids_kvstore`` when no hash index is set), and the
    descriptors mapped to those codes are ranked by
    ``self._distance_function``.

    :param d: Descriptor element to compute the neighbors of.
    :type d: smqtk.representation.DescriptorElement
    :param n: Number of nearest neighbors to find.
    :type n: int

    :return: Two-item list: a tuple of the nearest N DescriptorElement
        instances, and a tuple of the distance values to those neighbors.
    :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

    """
    # Base-class hook; its return value is not used here.
    super(LSHNearestNeighborIndex, self).nn(d, n)

    self._log.debug("generating hash for descriptor")
    d_v = d.vector()
    d_h = self.lsh_functor.get_hash(d_v)

    self._log.debug("getting near hashes")
    hi = self.hash_index
    if hi is None:
        # Make on-the-fly linear index
        hi = LinearHashIndex()
        # Not calling ``build_index`` because we already have the int
        # hashes.
        hi.index = numpy.array(list(self.hash2uuids_kvstore.keys()))
    # Only the hashes are needed; the hash-space distances are discarded.
    near_hashes, _ = hi.nn(d_h, n)

    self._log.debug("getting UUIDs of descriptors for nearby hashes")
    neighbor_uuids = []
    for h_int in map(bit_vector_to_int_large, near_hashes):
        # If descriptor hash not in our map, we effectively skip it
        #: :type: collections.Iterable
        near_uuids = self.hash2uuids_kvstore.get(h_int, ())
        neighbor_uuids.extend(near_uuids)
    self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

    self._log.debug("getting descriptors for neighbor_uuids")
    neighbors = \
        list(self.descriptor_index.get_many_descriptors(neighbor_uuids))

    self._log.debug("ordering descriptors via distance method '%s'",
                    self.distance_method)
    self._log.debug('-- getting element vectors')
    neighbor_vectors = elements_to_matrix(neighbors, report_interval=1.0)
    self._log.debug('-- calculating distances')
    # Concrete list rather than a lazy ``map`` object.
    distances = [self._distance_function(d_v, d2_v)
                 for d2_v in neighbor_vectors]
    self._log.debug('-- ordering')
    ordered = sorted(zip(neighbors, distances), key=lambda p: p[1])
    self._log.debug('-- slicing top n=%d', n)
    # Materialize so callers get an indexable result on Python 3 too.
    return list(zip(*(ordered[:n])))
def test_remove_from_index_single_not_in_index(self):
    """Removing one hash absent from the index raises ``KeyError``."""
    lhi = LinearHashIndex()
    lhi.index = {0, 1, 2}
    absent_hashes = [[1, 0, 0]]  # 4
    with self.assertRaises(KeyError):
        lhi.remove_from_index(absent_hashes)
    # The failed removal must leave the index untouched.
    self.assertSetEqual(lhi.index, {0, 1, 2})
def test_remove_from_index_one_of_many_not_in_index(self):
    """A batch removal containing one absent hash raises ``KeyError``."""
    lhi = LinearHashIndex()
    lhi.index = {0, 1, 2}
    batch = [
        [0, 0],  # 0
        [0, 1],  # 1
        [1, 1],  # 3
    ]
    with self.assertRaises(KeyError):
        lhi.remove_from_index(batch)
    # Check that the index has not been modified.
    self.assertSetEqual(lhi.index, {0, 1, 2})
def test_remove_from_index_one_of_many_not_in_index(self):
    """Removal fails as a whole when any requested hash is missing."""
    expected_index = {0, 1, 2}
    hash_index = LinearHashIndex()
    hash_index.index = {0, 1, 2}
    # Codes 0 and 1 are present in the index; code 3 is not.
    requested = [[0, 0], [0, 1], [1, 1]]
    with self.assertRaises(KeyError):
        hash_index.remove_from_index(requested)
    # The index content must be unchanged after the failed call.
    self.assertSetEqual(hash_index.index, expected_index)
def _nn(self, d, n=1):
    """
    Sub-class implementation hook returning the nearest `N` neighbors to
    the given descriptor element.

    By the time this method is invoked it has already been verified that
    ``d`` holds a vector and that our index is not empty.

    :param d: Descriptor element to compute the neighbors of.
    :type d: smqtk.representation.DescriptorElement
    :param n: Number of nearest neighbors to find.
    :type n: int

    :return: Tuple of nearest N DescriptorElement instances, and a tuple of
        the distance values to those neighbors.
    :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

    """
    self._log.debug("generating hash for descriptor")
    query_vec = d.vector()
    query_hash = self.lsh_functor.get_hash(query_vec)

    with self._model_lock:
        self._log.debug("getting near hashes")
        hash_index = self.hash_index
        if hash_index is None:
            # No hash index configured: build a throw-away linear one.
            hash_index = LinearHashIndex()
            # The integer hash codes are already known, so the index
            # content is assigned directly instead of using
            # ``build_index``.
            known_codes = list(self.hash2uuids_kvstore.keys())
            hash_index.index = numpy.array(known_codes)
        near_hashes, _hash_dists = hash_index.nn(query_hash, n)

        self._log.debug("getting UUIDs of descriptors for nearby hashes")
        neighbor_uuids = []
        for hash_bits in near_hashes:
            hash_code = bit_vector_to_int_large(hash_bits)
            # Hash codes missing from the key-value store contribute
            # nothing (empty set default).
            #: :type: set[collections.Hashable]
            uuids_for_code = self.hash2uuids_kvstore.get(hash_code, set())
            neighbor_uuids.extend(uuids_for_code)
        self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

        self._log.debug("getting descriptors for neighbor_uuids")
        neighbors = list(
            self.descriptor_index.get_many_descriptors(neighbor_uuids)
        )

    # Model structures are no longer needed past this point, hence the
    # lock has been released above.

    self._log.debug("ordering descriptors via distance method '%s'",
                    self.distance_method)
    self._log.debug('-- getting element vectors')
    neighbor_vectors = elements_to_matrix(neighbors, report_interval=1.0)
    self._log.debug('-- calculating distances')
    distances = [self._distance_function(query_vec, other_vec)
                 for other_vec in neighbor_vectors]
    self._log.debug('-- ordering')
    ranked = sorted(zip(neighbors, distances), key=lambda pair: pair[1])
    self._log.debug('-- slicing top n=%d', n)
    top_ranked = ranked[:n]
    return list(zip(*top_ranked))
def _nn(self, d, n=1):
    """
    Sub-class implementation hook returning the nearest `N` neighbors to
    the given descriptor element.

    By the time this method is invoked it has already been verified that
    ``d`` holds a vector and that our index is not empty.

    :param d: Descriptor element to compute the neighbors of.
    :type d: smqtk.representation.DescriptorElement
    :param n: Number of nearest neighbors to find.
    :type n: int

    :return: Tuple of nearest N DescriptorElement instances, and a tuple of
        the distance values to those neighbors.
    :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

    """
    self._log.debug("generating hash for descriptor")
    query_vec = d.vector()
    query_hash = self.lsh_functor.get_hash(query_vec)

    with self._model_lock:
        self._log.debug("getting near hashes")
        hash_index = self.hash_index
        if hash_index is None:
            # No hash index configured: build a throw-away linear one.
            hash_index = LinearHashIndex()
            # The integer hash codes are already known, so the index
            # content is assigned directly instead of using
            # ``build_index``.
            hash_index.index = set(self.hash2uuids_kvstore.keys())
        near_hashes, _hash_dists = hash_index.nn(query_hash, n)

        self._log.debug("getting UUIDs of descriptors for nearby hashes")
        neighbor_uuids = []
        for hash_bits in near_hashes:
            hash_code = bit_vector_to_int_large(hash_bits)
            # Hash codes missing from the key-value store contribute
            # nothing (empty set default).
            #: :type: set[collections.Hashable]
            uuids_for_code = self.hash2uuids_kvstore.get(hash_code, set())
            neighbor_uuids.extend(uuids_for_code)
        self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

        self._log.debug("getting descriptors for neighbor_uuids")
        neighbors = list(
            self.descriptor_index.get_many_descriptors(neighbor_uuids)
        )

    # Model structures are no longer needed past this point, hence the
    # lock has been released above.

    self._log.debug("ordering descriptors via distance method '%s'",
                    self.distance_method)
    self._log.debug('-- getting element vectors')
    neighbor_vectors = elements_to_matrix(neighbors, report_interval=1.0)
    self._log.debug('-- calculating distances')
    distances = [self._distance_function(query_vec, other_vec)
                 for other_vec in neighbor_vectors]
    self._log.debug('-- ordering')
    ranked = sorted(zip(neighbors, distances), key=lambda pair: pair[1])
    self._log.debug('-- slicing top n=%d', n)
    top_ranked = ranked[:n]
    return list(zip(*top_ranked))