def test_int_to_bit_vector_large_1(self):
    """ The integer 1 becomes a single True bit, zero-padded on the left
    when a wider vector is requested. """
    # Minimal-width representation: exactly one bit.
    numpy.testing.assert_array_equal(
        bit_utils.int_to_bit_vector_large(1),
        [True]
    )
    # Request 7 bits total: six leading False bits then the one.
    numpy.testing.assert_array_equal(
        bit_utils.int_to_bit_vector_large(1, 7),
        ([False] * 6) + [True]
    )
def test_int_to_bit_vector_large_0(self):
    """ Zero converts to a single False bit, or pads to a requested
    width. """
    # At least one bit is required to represent 0.
    numpy.testing.assert_array_equal(
        bit_utils.int_to_bit_vector_large(0),
        [False]
    )
    # Explicitly request a 5-bit-wide representation.
    numpy.testing.assert_array_equal(
        bit_utils.int_to_bit_vector_large(0, 5),
        [False] * 5
    )
def test_remove_from_index_invalid_key_single(self):
    """ Removing a single hash not present in the index should raise
    KeyError and leave the ball-tree data unmodified. """
    bt = SkLearnBallTreeHashIndex(random_seed=0)
    index = np.ndarray((1000, 256), bool)
    for i in range(1000):
        index[i] = int_to_bit_vector_large(i, 256)
    bt.build_index(index)
    # Snapshot the built data block so we can confirm nothing was removed.
    expected_data = np.copy(bt.bt.data)
    self.assertRaises(
        KeyError,
        bt.remove_from_index,
        [int_to_bit_vector_large(1001, 256)]
    )
    np.testing.assert_array_equal(expected_data, np.asarray(bt.bt.data))
def test_int_to_bit_vector_large_large(self):
    """ Conversion handles integers far beyond native 64-bit width. """
    # (2**256 - 1): every one of the 256 bits is set.
    numpy.testing.assert_array_equal(
        bit_utils.int_to_bit_vector_large((2 ** 256) - 1),
        [True] * 256
    )
    # 2**512: a single leading one followed by 512 zeros.
    numpy.testing.assert_array_equal(
        bit_utils.int_to_bit_vector_large(2 ** 512),
        [True] + ([False] * 512)
    )
def nn(self, h, n=1):
    """
    Return the nearest `N` neighbors to the given hash code.

    Distances are in the range [0,1] and are the percent different each
    neighbor hash is from the query, based on the number of bits contained
    in the query.

    :param h: Hash code to compute the neighbors of. Should be the same
        bit length as indexed hash codes.
    :type h: numpy.ndarray[bool] | list[bool]

    :param n: Number of nearest neighbors to find.
    :type n: int

    :raises ValueError: No index to query from.

    :return: Tuple of nearest N hash codes and a tuple of the distance
        values to those neighbors.
    :rtype: (tuple[numpy.ndarray[bool], tuple[float])
    """
    # Parent call performs the has-an-index check (raises ValueError).
    super(LinearHashIndex, self).nn(h, n)
    query_int = bit_vector_to_int_large(h)
    bit_width = len(h)
    #: :type: list[int|long]
    near_codes = heapq.nsmallest(
        n, self.index, key=lambda e: hamming_distance(query_int, e)
    )
    # Raw hamming distances, later normalized by the query's bit width.
    raw_dists = [hamming_distance(c, query_int) for c in near_codes]
    neighbor_vectors = [int_to_bit_vector_large(c, bit_width)
                        for c in near_codes]
    return neighbor_vectors, [d / float(bit_width) for d in raw_dists]
def nn(self, h, n=1):
    """
    Return the nearest `N` neighbors to the given hash code.

    Distances are in the range [0,1] and are the percent different each
    neighbor hash is from the query, based on the number of bits contained
    in the query.

    :param h: Hash code to compute the neighbors of. Should be the same
        bit length as indexed hash codes.
    :type h: numpy.ndarray[bool]

    :param n: Number of nearest neighbors to find.
    :type n: int

    :raises ValueError: No index to query from.

    :return: Tuple of nearest N hash codes and a tuple of the distance
        values to those neighbors.
    :rtype: (tuple[numpy.ndarray[bool]], tuple[float])
    """
    # Parent call performs the has-an-index check (raises ValueError).
    super(LinearHashIndex, self).nn(h, n)
    h_int = bit_vector_to_int_large(h)
    bits = len(h)
    #: :type: list[int|long]
    near_codes = \
        heapq.nsmallest(n, self.index,
                        key=lambda e: hamming_distance(h_int, e))
    # Materialize the map() iterator: under Python 3 it is one-shot, and
    # the sibling implementation of this method already uses list(map(...)).
    distances = list(map(hamming_distance, near_codes,
                         [h_int] * len(near_codes)))
    return [int_to_bit_vector_large(c, bits) for c in near_codes], \
           [d / float(bits) for d in distances]
def test_remove_from_index_invalid_key_multiple(self):
    """ A removal batch mixing valid and invalid keys should raise
    KeyError and must not modify the index at all. """
    bt = SkLearnBallTreeHashIndex(random_seed=0)
    index = np.ndarray((1000, 256), bool)
    for i in range(1000):
        index[i] = int_to_bit_vector_large(i, 256)
    bt.build_index(index)
    # Snapshot the built data block so we can confirm nothing was removed.
    expected_data = np.copy(bt.bt.data)
    removal_batch = [
        int_to_bit_vector_large(42, 256),    # present
        int_to_bit_vector_large(1008, 256),  # absent
    ]
    self.assertRaises(KeyError, bt.remove_from_index, removal_batch)
    np.testing.assert_array_equal(expected_data, np.asarray(bt.bt.data))
def test_remove_from_index(self):
    """ Removing valid hashes should actually drop them from the index
    data block. """
    bt = SkLearnBallTreeHashIndex(random_seed=0)
    index = np.ndarray((1000, 256), bool)
    for i in range(1000):
        index[i] = int_to_bit_vector_large(i, 256)
    bt.build_index(index)
    bt.remove_from_index([
        int_to_bit_vector_large(42, 256),
        int_to_bit_vector_large(998, 256),
    ])
    # Two rows should be gone from the underlying data block.
    remaining = np.asarray(bt.bt.data)
    self.assertEqual(remaining.shape, (998, 256))
    remaining_rows = set(tuple(r) for r in remaining.tolist())
    for removed_int in (42, 998):
        self.assertNotIn(
            tuple(int_to_bit_vector_large(removed_int, 256)),
            remaining_rows
        )
def test_remove_from_index_last_element(self):
    """
    Test removing the final the only element / final elements from the
    index.
    """
    # Single-entry index: removing it empties the index and drops the tree.
    bt = SkLearnBallTreeHashIndex(random_seed=0)
    single = np.ndarray((1, 256), bool)
    single[0] = int_to_bit_vector_large(1, 256)
    bt.build_index(single)
    self.assertEqual(bt.count(), 1)
    bt.remove_from_index(single)
    self.assertEqual(bt.count(), 0)
    self.assertIsNone(bt.bt)

    # Many-entry index: remove in batches of 250 until nothing remains.
    bt = SkLearnBallTreeHashIndex(random_seed=0)
    many = np.ndarray((1000, 256), bool)
    for i in range(1000):
        many[i] = int_to_bit_vector_large(i, 256)
    bt.build_index(many)
    for start in (0, 250, 500, 750):
        bt.remove_from_index(many[start:start + 250])
        remaining = 1000 - (start + 250)
        self.assertEqual(bt.count(), remaining)
        if remaining:
            # Tree still exists while entries remain.
            self.assertIsNotNone(bt.bt)
        else:
            # Removing the final batch clears the tree entirely.
            self.assertIsNone(bt.bt)
def test_remove_from_index_last_element_with_cache(self):
    """
    Test removing final element also clears the cache element.
    """
    cache = DataMemoryElement()
    bt = SkLearnBallTreeHashIndex(cache_element=cache, random_seed=0)
    index = np.ndarray((1, 256), bool)
    index[0] = int_to_bit_vector_large(1, 256)

    bt.build_index(index)
    self.assertEqual(bt.count(), 1)
    # Building should have populated the cache element.
    self.assertFalse(cache.is_empty())

    bt.remove_from_index(index)
    self.assertEqual(bt.count(), 0)
    # Emptying the index should also empty the cache.
    self.assertTrue(cache.is_empty())
def main():
    """
    Build and save a ball-tree hash index from a pickled hash-to-UUIDs
    mapping file, per command-line arguments.
    """
    args = cli_parser().parse_args()

    initialize_logging(logging.getLogger('smqtk'), logging.DEBUG)
    initialize_logging(logging.getLogger('__main__'), logging.DEBUG)
    log = logging.getLogger(__name__)

    hash2uuids_fp = os.path.abspath(args.hash2uuids_fp)
    bit_len = args.bit_len
    leaf_size = args.leaf_size
    rand_seed = args.rand_seed
    balltree_model_fp = os.path.abspath(args.balltree_model_fp)

    assert os.path.isfile(hash2uuids_fp), "Bad path: '%s'" % hash2uuids_fp
    assert os.path.isdir(os.path.dirname(balltree_model_fp)), \
        "Bad path: %s" % balltree_model_fp

    log.debug("hash2uuids_fp : %s", hash2uuids_fp)
    log.debug("bit_len : %d", bit_len)
    log.debug("leaf_size : %d", leaf_size)
    log.debug("rand_seed : %d", rand_seed)
    log.debug("balltree_model_fp: %s", balltree_model_fp)

    log.info("Loading hash2uuids table")
    # BUG FIX: pickle data must be read in binary mode -- text mode fails
    # under Python 3 and corrupts on platforms with newline translation.
    with open(hash2uuids_fp, 'rb') as f:
        hash2uuids = cPickle.load(f)

    log.info("Computing hash-code vectors")
    hash_vectors = []
    rs = [0] * 7  # progress-reporting state consumed by report_progress
    for h in hash2uuids:
        hash_vectors.append(int_to_bit_vector_large(h, bit_len))
        report_progress(log.debug, rs, 1.)

    log.info("Initializing ball tree")
    btree = SkLearnBallTreeHashIndex(balltree_model_fp, leaf_size, rand_seed)

    log.info("Building ball tree")
    btree.build_index(hash_vectors)
def _nn(self, h, n=1):
    """
    Internal method to be implemented by sub-classes to return the nearest
    `N` neighbor hash codes as bit-vectors to the given hash code
    bit-vector.

    Distances are in the range [0,1] and are the percent different each
    neighbor hash is from the query, based on the number of bits contained
    in the query (normalized hamming distance).

    When this internal method is called, we have already checked that our
    index is not empty.

    :param h: Hash code to compute the neighbors of. Should be the same
        bit length as indexed hash codes.
    :type h: numpy.ndarray[bool]

    :param n: Number of nearest neighbors to find.
    :type n: int

    :return: Tuple of nearest N hash codes and a tuple of the distance
        values to those neighbors.
    :rtype: (tuple[numpy.ndarray[bool]], tuple[float])
    """
    with self._model_lock:
        h_int = bit_vector_to_int_large(h)
        bits = len(h)
        #: :type: list[int|long]
        near_codes = \
            heapq.nsmallest(n, self.index,
                            key=lambda e: hamming_distance(h_int, e))
        # Materialize the map() iterator: under Python 3 it is one-shot
        # and would otherwise be exhausted on first traversal.
        distances = list(map(hamming_distance, near_codes,
                             [h_int] * len(near_codes)))
        return [int_to_bit_vector_large(c, bits) for c in near_codes], \
               [d / float(bits) for d in distances]
def _nn(self, h, n=1):
    """
    Internal method to be implemented by sub-classes to return the nearest
    `N` neighbor hash codes as bit-vectors to the given hash code
    bit-vector.

    Distances are in the range [0,1] and are the percent different each
    neighbor hash is from the query, based on the number of bits contained
    in the query (normalized hamming distance).

    When this internal method is called, we have already checked that our
    index is not empty.

    :param h: Hash code to compute the neighbors of. Should be the same
        bit length as indexed hash codes.
    :type h: numpy.ndarray[bool]

    :param n: Number of nearest neighbors to find.
    :type n: int

    :return: Tuple of nearest N hash codes and a tuple of the distance
        values to those neighbors.
    :rtype: (tuple[numpy.ndarray[bool]], tuple[float])
    """
    with self._model_lock:
        h_int = bit_vector_to_int_large(h)
        bits = len(h)
        #: :type: list[int|long]
        near_codes = \
            heapq.nsmallest(n, self.index,
                            key=lambda e: hamming_distance(h_int, e))
        # Materialize the map() iterator: under Python 3 it is one-shot
        # and would otherwise be exhausted on first traversal.
        distances = list(map(hamming_distance, near_codes,
                             [h_int] * len(near_codes)))
        return [int_to_bit_vector_large(c, bits) for c in near_codes], \
               [d / float(bits) for d in distances]