def test_int_to_bit_vector_large_0(self):
    """Zero encodes to one False bit by default, or pads to a given width."""
    # Even zero must occupy at least one bit.
    numpy.testing.assert_array_equal(
        bits.int_to_bit_vector_large(0),
        [False]
    )
    # Forcing a 5-bit width yields five False entries.
    numpy.testing.assert_array_equal(
        bits.int_to_bit_vector_large(0, 5),
        [False] * 5
    )
def test_int_to_bit_vector_large_large(self):
    """Conversion handles integers far beyond native word size."""
    # 2**256 - 1 is 256 consecutive one-bits.
    all_ones = (1 << 256) - 1
    numpy.testing.assert_array_equal(
        bits.int_to_bit_vector_large(all_ones),
        [True] * 256
    )
    # 2**512 is a single leading one-bit followed by 512 zero-bits.
    single_high_bit = 1 << 512
    numpy.testing.assert_array_equal(
        bits.int_to_bit_vector_large(single_high_bit),
        [True] + [False] * 512
    )
def test_remove_from_index_invalid_key_single(self):
    """A single unknown key raises KeyError and leaves the index untouched."""
    bt = SkLearnBallTreeHashIndex(random_seed=0)
    index = np.ndarray((1000, 256), bool)
    for i in range(1000):
        index[i] = int_to_bit_vector_large(i, 256)
    bt.build_index(index)
    # Snapshot the built data block so we can verify nothing was removed.
    original_data = np.copy(bt.bt.data)
    unknown_key = int_to_bit_vector_large(1001, 256)
    self.assertRaises(KeyError, bt.remove_from_index, [unknown_key])
    np.testing.assert_array_equal(original_data, np.asarray(bt.bt.data))
def test_remove_from_index_invalid_key_multiple(self):
    """A mix of valid and invalid keys raises KeyError without modifying
    the index."""
    bt = SkLearnBallTreeHashIndex(random_seed=0)
    index = np.ndarray((1000, 256), bool)
    for i in range(1000):
        index[i] = int_to_bit_vector_large(i, 256)
    bt.build_index(index)
    # Snapshot the built data block so we can verify nothing was removed.
    original_data = np.copy(bt.bt.data)
    mixed_keys = [
        int_to_bit_vector_large(42, 256),    # present in the index
        int_to_bit_vector_large(1008, 256),  # not present
    ]
    self.assertRaises(KeyError, bt.remove_from_index, mixed_keys)
    np.testing.assert_array_equal(original_data, np.asarray(bt.bt.data))
def test_remove_from_index(self):
    """Removing valid keys actually shrinks the indexed data block."""
    bt = SkLearnBallTreeHashIndex(random_seed=0)
    index = np.ndarray((1000, 256), bool)
    for i in range(1000):
        index[i] = int_to_bit_vector_large(i, 256)
    bt.build_index(index)
    # All 1000 vectors should be present after building.
    self.assertEqual(bt.bt.data.shape, (1000, 256))
    victims = [
        int_to_bit_vector_large(42, 256),
        int_to_bit_vector_large(998, 256),
    ]
    bt.remove_from_index(victims)
    # Data block should now be exactly two rows shorter.
    remaining = np.asarray(bt.bt.data)
    self.assertEqual(remaining.shape, (998, 256))
    # Neither removed vector should remain in the data block.
    remaining_rows = set(tuple(row) for row in remaining.tolist())
    for victim in victims:
        self.assertNotIn(tuple(victim), remaining_rows)
def test_remove_from_index_last_element(self):
    """
    Removing every element, singly or in batches, empties the index and
    nulls out the underlying ball tree.
    """
    # Single-element index: removing that one hash should empty it.
    bt = SkLearnBallTreeHashIndex(random_seed=0)
    index = np.ndarray((1, 256), bool)
    index[0] = int_to_bit_vector_large(1, 256)
    bt.build_index(index)
    self.assertEqual(bt.count(), 1)
    bt.remove_from_index(index)
    self.assertEqual(bt.count(), 0)
    self.assertIsNone(bt.bt)

    # Many-element index: drain in batches of 250 until empty. The ball
    # tree should persist until the final batch, then become None.
    bt = SkLearnBallTreeHashIndex(random_seed=0)
    index = np.ndarray((1000, 256), bool)
    for i in range(1000):
        index[i] = int_to_bit_vector_large(i, 256)
    bt.build_index(index)
    for start in range(0, 1000, 250):
        bt.remove_from_index(index[start:start + 250])
        expected_count = 1000 - (start + 250)
        self.assertEqual(bt.count(), expected_count)
        if expected_count > 0:
            self.assertIsNotNone(bt.bt)
        else:
            self.assertIsNone(bt.bt)
def test_remove_from_index_last_element_with_cache(self):
    """
    Removing the final indexed element should also clear the cache
    element.
    """
    cache = DataMemoryElement()
    bt = SkLearnBallTreeHashIndex(cache_element=cache, random_seed=0)
    index = np.ndarray((1, 256), bool)
    index[0] = int_to_bit_vector_large(1, 256)
    bt.build_index(index)
    # Building should populate both the index and its cache.
    self.assertEqual(bt.count(), 1)
    self.assertFalse(cache.is_empty())
    bt.remove_from_index(index)
    # Draining the index should empty the cache as well.
    self.assertEqual(bt.count(), 0)
    self.assertTrue(cache.is_empty())
def _nn(self, h, n=1):
    """
    Internal method to be implemented by sub-classes to return the nearest
    `N` neighbor hash codes as bit-vectors to the given hash code
    bit-vector.

    Distances are in the range [0,1] and are the percent different each
    neighbor hash is from the query, based on the number of bits contained
    in the query (normalized hamming distance).

    When this internal method is called, we have already checked that
    our index is not empty.

    :param h: Hash code to compute the neighbors of. Should be the same
        bit length as indexed hash codes.
    :type h: numpy.ndarray[bool]

    :param n: Number of nearest neighbors to find.
    :type n: int

    :return: Tuple of nearest N hash codes and a tuple of the distance
        values to those neighbors.
    :rtype: (list[numpy.ndarray[bool]], list[float])

    """
    with self._model_lock:
        h_int = bit_vector_to_int_large(h)
        bits = len(h)
        # Select the n indexed codes closest to the query by hamming
        # distance. Pass the key function by keyword for clarity.
        #: :type: list[int|long]
        near_codes = heapq.nsmallest(
            n, self.index,
            key=lambda e: hamming_distance(h_int, e)
        )
        # Recompute distances only for the selected winners (cheaper than
        # caching a distance for every candidate scanned above).
        distances = [hamming_distance(c, h_int) for c in near_codes]
        return ([int_to_bit_vector_large(c, bits) for c in near_codes],
                [d / float(bits) for d in distances])
def test_int_to_bit_vector_large_1(self):
    """One encodes as a single True bit, left-padded when a width is given."""
    numpy.testing.assert_array_equal(
        bits.int_to_bit_vector_large(1),
        [True]
    )
    # Padding to 7 bits prepends six False entries before the one-bit.
    numpy.testing.assert_array_equal(
        bits.int_to_bit_vector_large(1, 7),
        [False] * 6 + [True]
    )