def test_estimator_cardinality_sparse_mode(self):
    """Checks that HllCardinality reproduces small counts exactly.

    For each count in (0, 1, 1024), that many distinct integers are added
    to a fresh 1024-length sketch and the estimator's first output must
    equal the count. The test name refers to this regime as sparse mode.
    """
    estimate_fn = HllCardinality()
    for expected_count in (0, 1, 1024):
        sketch = HyperLogLogPlusPlus(random_seed=89, length=1024)
        for item in range(expected_count):
            sketch.add(item)
        # The estimator is called with a list of sketches; only the first
        # entry of what it returns is checked here.
        result = estimate_fn([sketch])
        self.assertEqual(result[0], expected_count)
def test_estimator_cardinality_dense_mode(self):
    """Checks that HllCardinality is within 5% for larger counts.

    For each count in (1025, 2048), that many distinct integers are added
    to a fresh 1024-length sketch and the estimator's first output must
    lie within 5% of the count. The test name refers to this regime as
    dense mode.
    """
    estimate_fn = HllCardinality()
    for expected_count in (1025, 2048):
        sketch = HyperLogLogPlusPlus(random_seed=89, length=1024)
        for item in range(expected_count):
            sketch.add(item)
        result = estimate_fn([sketch])
        tolerance = expected_count * 0.05
        self.assertAlmostEqual(result[0], expected_count, delta=tolerance)
def insertion_test_helper(self, number_to_insert, acceptable_error=.05):
    """Inserts distinct integers into a sketch and checks the estimate.

    Args:
      number_to_insert: how many distinct integers (0 .. number_to_insert-1)
        to add to a fresh sketch built with random_seed=137.
      acceptable_error: maximum allowed absolute deviation of the ratio
        estimate / number_to_insert from 1.0.
    """
    hll = HyperLogLogPlusPlus(random_seed=137)
    for i in range(number_to_insert):
        hll.add(i)
    estimate = hll.estimate_cardinality()

    if number_to_insert == 0:
        # The ratio below is undefined for zero insertions and would raise
        # ZeroDivisionError. Compare directly instead.
        # NOTE(review): assumes an empty sketch estimates exactly 0 - confirm.
        self.assertEqual(estimate, 0)
        return

    error_ratio = estimate / number_to_insert
    self.assertAlmostEqual(error_ratio, 1.0, delta=acceptable_error)
def test_insert_same(self):
    """Checks that re-adding the same item leaves the estimate unchanged."""
    sketch = HyperLogLogPlusPlus(random_seed=42)

    sketch.add(1)
    estimate_after_first_add = sketch.estimate_cardinality()

    # Adding the identical value a second time must not move the estimate.
    sketch.add(1)
    estimate_after_second_add = sketch.estimate_cardinality()

    self.assertEqual(estimate_after_first_add, estimate_after_second_add)
def estimator_tester_helper(self, number_of_hlls, acceptable_error=.05):
    """Checks HllCardinality over many single-item sketches.

    Builds number_of_hlls sketches (all with random_seed=42), the i-th one
    holding only the integer i, passes the whole list to the estimator and
    asserts that its first output divided by number_of_hlls is within
    acceptable_error of 1.0.

    Args:
      number_of_hlls: number of single-item sketches to build.
      acceptable_error: maximum allowed absolute deviation of the ratio
        from 1.0.
    """
    estimate_fn = HllCardinality()

    def single_item_sketch(item):
        # One fresh sketch containing exactly one value.
        sketch = HyperLogLogPlusPlus(random_seed=42)
        sketch.add(item)
        return sketch

    sketches = [single_item_sketch(item) for item in range(number_of_hlls)]
    combined_estimate = estimate_fn(sketches)[0]
    self.assertAlmostEqual(
        combined_estimate / number_of_hlls, 1.0, delta=acceptable_error)
def test_merge_sparse_with_sparse_to_sparse(self):
    """Merges two one-item sketches holding the same value.

    The merged sketch must report sparse_mode, have buckets equal to the
    first input's buckets, keep a temp_set containing just the value 1,
    and estimate a cardinality of exactly 1.
    """
    left = HyperLogLogPlusPlus(length=16, random_seed=234)
    left.add(1)
    right = HyperLogLogPlusPlus(length=16, random_seed=234)
    right.add(1)

    merged = left.merge(right)

    self.assertTrue(merged.sparse_mode,
                    'Merged sketch is not in sparse mode.')
    buckets_match = all(left.buckets == merged.buckets)
    self.assertTrue(buckets_match, 'Merged sketch is not correct.')
    self.assertSameElements(merged.temp_set, {1},
                            'Temp set is not correct.')
    merged_estimate = merged.estimate_cardinality()
    self.assertEqual(merged_estimate, 1,
                     'Estimated cardinality is not correct.')
def test_merge_dense_with_dense(self):
    """Merges two sketches that each hold 97 distinct, disjoint values.

    The first sketch receives 0..96 and the second 100..196, so the union
    has 194 distinct values. The merged sketch must not report
    sparse_mode, must share at least one bucket value with the second
    input, must have an empty temp_set, and must estimate within 10% of
    194.
    """
    first = HyperLogLogPlusPlus(length=16, random_seed=234)
    second = HyperLogLogPlusPlus(length=16, random_seed=234)
    items_per_sketch = 16 * 6 + 1
    for value in range(items_per_sketch):
        first.add(value)
        second.add(value + 100)

    merged = first.merge(second)

    self.assertFalse(merged.sparse_mode,
                     'Merged sketch should not be in sparse mode.')
    matching_bucket_count = sum(second.buckets == merged.buckets)
    self.assertGreater(matching_bucket_count, 0,
                       'Merged sketch is not correct.')
    self.assertSameElements(merged.temp_set, set(),
                            'Temp set is not correct.')
    self.assertAlmostEqual(
        merged.estimate_cardinality(), 194, delta=194 * 0.1)
def test_merge_sparse_with_dense(self):
    """Merges a one-item sketch into a sketch holding 97 items.

    The merged sketch must not report sparse_mode, must differ from the
    97-item input in exactly one of its 16 buckets, must have an empty
    temp_set, and must estimate strictly more than the 97-item input.
    """
    small = HyperLogLogPlusPlus(length=16, random_seed=234)
    small.add(100)
    large = HyperLogLogPlusPlus(length=16, random_seed=234)
    for value in range(16 * 6 + 1):
        large.add(value)

    merged = small.merge(large)

    self.assertFalse(merged.sparse_mode,
                     'Merged sketch should not be in sparse mode.')
    # Should change one bucket value given this random seed.
    unchanged_bucket_count = sum(large.buckets == merged.buckets)
    self.assertEqual(unchanged_bucket_count, 16 - 1,
                     'Merged sketch is not correct.')
    self.assertSameElements(merged.temp_set, set(),
                            'Temp set is not correct.')
    merged_estimate = merged.estimate_cardinality()
    self.assertGreater(merged_estimate, large.estimate_cardinality())
def test_single_correct_bucket_placement(self):
    """Checks that a single inserted value updates only the expected bucket.

    For every (bucket index, bucket bit string) pair in
    self.bucket_idx_to_bin_str and every (suffix bit string, leading-zero
    count) pair in self.bin_str_to_leading_zeros, a fresh sketch using
    NoOpHasher receives the integer whose binary form is the bucket bits
    followed by the suffix bits. The bucket array must then be all zeros
    except for `leading zeros + 1` at the expected index.
    """
    for bucket_idx, bucket_bin_str in self.bucket_idx_to_bin_str.items():
        for leading_0_bin_str, num_leading_0s in (
                self.bin_str_to_leading_zeros.items()):
            hll = HyperLogLogPlusPlus(
                length=self.vector_length,
                random_seed=42,
                hash_class=NoOpHasher,
                num_integer_bits=self.num_integer_bits)
            total_bin_str = bucket_bin_str + leading_0_bin_str
            hll.add(int(total_bin_str, 2))

            # Size the expectation from the same length the sketch was
            # built with rather than a hard-coded 16.
            expected_buckets = np.zeros(self.vector_length, dtype=np.int32)
            expected_buckets[bucket_idx] = num_leading_0s + 1

            # Compare position by position. assertSameElements ignores
            # order and duplicates, so it would also accept the value
            # landing in the wrong bucket.
            np.testing.assert_array_equal(
                hll.buckets,
                expected_buckets,
                err_msg='Wrong bucket contents for input bits {}.'.format(
                    total_bin_str))
def test_merge_sparse_with_sparse_to_dense(self):
    """Merges two sketches around the point where sparse_mode turns off.

    Each sketch first receives 48 distinct values (0..47 and 100..147).
    Their merge must still report sparse_mode and estimate exactly 96.
    After one more value is added to the first sketch, a new merge must
    no longer report sparse_mode and must estimate within 5% of 97.
    """
    first = HyperLogLogPlusPlus(length=16, random_seed=234)
    second = HyperLogLogPlusPlus(length=16, random_seed=234)
    items_per_sketch = int(16 * 6 / 2)
    for value in range(items_per_sketch):
        first.add(value)
        second.add(value + 100)

    merged_at_limit = first.merge(second)
    self.assertTrue(merged_at_limit.sparse_mode,
                    'Merged sketch should be in sparse mode.')
    self.assertEqual(merged_at_limit.estimate_cardinality(), 96,
                     'Estimated cardinality not correct under sparse mode.')

    # One extra distinct value brings the union to 97 items.
    first.add(1000)
    merged_over_limit = first.merge(second)
    self.assertFalse(merged_over_limit.sparse_mode,
                     'Merged sketch should not be in sparse mode.')
    self.assertAlmostEqual(
        merged_over_limit.estimate_cardinality(),
        97,
        delta=97 * 0.05,
        msg='Estimated cardinality not correct under dense mode.')