def test_estimator_cardinality_dense_mode(self):
    """HllCardinality is within 5% for 1025 and 2048 inserted integers.

    Each sketch has length 1024, so both counts exceed the number of
    buckets; per the test name this is expected to exercise dense mode.
    """
    cardinality_of = HllCardinality()
    for expected_count in (1025, 2048):
        sketch = HyperLogLogPlusPlus(random_seed=89, length=1024)
        for item in range(expected_count):
            sketch.add(item)
        # The estimator takes a list of sketches; the first entry of its
        # result is the value under test.
        estimates = cardinality_of([sketch])
        self.assertAlmostEqual(
            estimates[0], expected_count, delta=expected_count * 0.05)
def insertion_test_helper(self, number_to_insert, acceptable_error=.05):
    """Inserts 0..number_to_insert-1 and checks the relative estimate.

    Args:
      number_to_insert: how many consecutive integers to add to the sketch.
      acceptable_error: allowed absolute deviation of estimate / truth
        from 1.0.
    """
    sketch = HyperLogLogPlusPlus(random_seed=137)
    for value in range(number_to_insert):
        sketch.add(value)
    estimate = sketch.estimate_cardinality()
    self.assertAlmostEqual(
        estimate / number_to_insert, 1.0, delta=acceptable_error)
def test_estimator_cardinality_sparse_mode(self):
    """HllCardinality is exact for 0, 1 and 1024 inserted integers.

    Each sketch has length 1024, so none of the counts exceeds the number
    of buckets; per the test name this is expected to stay in sparse mode.
    """
    cardinality_of = HllCardinality()
    for expected_count in (0, 1, 1024):
        sketch = HyperLogLogPlusPlus(random_seed=89, length=1024)
        for item in range(expected_count):
            sketch.add(item)
        estimates = cardinality_of([sketch])
        # Exact equality, not a tolerance: no estimation error is accepted.
        self.assertEqual(estimates[0], expected_count)
def estimator_tester_helper(self, number_of_hlls, acceptable_error=.05):
    """Estimates the union of single-item sketches and checks the ratio.

    Args:
      number_of_hlls: number of sketches to build; sketch i holds only the
        integer i, so the true union cardinality equals this number.
      acceptable_error: allowed absolute deviation of estimate / truth
        from 1.0.
    """

    def single_item_sketch(item):
        # All sketches share the same seed so they can be combined.
        sketch = HyperLogLogPlusPlus(random_seed=42)
        sketch.add(item)
        return sketch

    union_estimator = HllCardinality()
    sketches = [single_item_sketch(item) for item in range(number_of_hlls)]
    union_estimate = union_estimator(sketches)[0]
    self.assertAlmostEqual(
        union_estimate / number_of_hlls, 1.0, delta=acceptable_error)
def test_simple_estimate_smaller(self):
    """Checks alpha and the estimate when every bucket is forced to 1."""
    sketch = HyperLogLogPlusPlus(
        length=self.vector_length,
        random_seed=42,
        num_integer_bits=self.num_integer_bits)
    # Overwrite the buckets directly instead of inserting items.
    sketch.buckets = np.ones(self.vector_length)

    alpha_16 = 0.673
    # Kept in this exact operation order: the comparison below is an exact
    # float equality, so the arithmetic must not be rearranged.
    expected_estimate = (
        alpha_16 * self.vector_length**2 * 2 / self.vector_length)

    self.assertEqual(alpha_16, sketch.alpha)
    self.assertEqual(sketch.estimate_cardinality(), expected_estimate)
def test_simple_estimate_larger(self):
    """Checks alpha and the estimate for 2**14 buckets all forced to 30."""
    m = 2**14
    sketch = HyperLogLogPlusPlus(
        length=m, random_seed=42, num_integer_bits=self.num_integer_bits)
    # Overwrite the buckets directly instead of inserting items.
    sketch.buckets = 30 * np.ones(m)

    # Both expressions are compared with exact float equality below, so
    # their operation order is deliberately left untouched.
    alpha_m = 0.7213 / (1 + 1.079 / m)
    expected_estimate = alpha_m * m**2 * 2**30 / m

    self.assertEqual(alpha_m, sketch.alpha)
    self.assertEqual(sketch.estimate_cardinality(), expected_estimate)
def test_insert_same(self):
    """Adding an already-present item must not change the estimate."""
    sketch = HyperLogLogPlusPlus(random_seed=42)
    sketch.add(1)
    estimate_after_first_add = sketch.estimate_cardinality()

    sketch.add(1)  # Same value again.
    estimate_after_second_add = sketch.estimate_cardinality()

    self.assertEqual(estimate_after_first_add, estimate_after_second_add)
def test_merge_sparse_with_dense(self):
    """Merges a one-item sketch into a sketch holding 16 * 6 + 1 items."""
    length = 16

    small = HyperLogLogPlusPlus(length=length, random_seed=234)
    small.add(100)

    large = HyperLogLogPlusPlus(length=length, random_seed=234)
    for item in range(length * 6 + 1):
        large.add(item)

    merged = small.merge(large)

    self.assertFalse(
        merged.sparse_mode, 'Merged sketch should not be in sparse mode.')
    # Should change one bucket value given this random seed.
    unchanged_buckets = sum(large.buckets == merged.buckets)
    self.assertEqual(
        unchanged_buckets, length - 1, 'Merged sketch is not correct.')
    self.assertSameElements(
        merged.temp_set, set(), 'Temp set is not correct.')
    self.assertGreater(
        merged.estimate_cardinality(), large.estimate_cardinality())
def test_merge_sparse_with_sparse_to_sparse(self):
    """Merging two sketches that both hold only `1` stays sparse."""
    left = HyperLogLogPlusPlus(length=16, random_seed=234)
    left.add(1)
    right = HyperLogLogPlusPlus(length=16, random_seed=234)
    right.add(1)

    merged = left.merge(right)

    self.assertTrue(
        merged.sparse_mode, 'Merged sketch is not in sparse mode.')
    buckets_match = all(left.buckets == merged.buckets)
    self.assertTrue(buckets_match, 'Merged sketch is not correct.')
    self.assertSameElements(
        merged.temp_set, {1}, 'Temp set is not correct.')
    self.assertEqual(
        merged.estimate_cardinality(), 1,
        'Estimated cardinality is not correct.')
def test_single_correct_bucket_placement(self):
    """Checks that a single added value lands in the expected bucket.

    For every (bucket index, bucket bit string) pair and every
    (suffix bit string, leading-zero count) pair, a fresh sketch using
    NoOpHasher receives the integer whose binary form is the two strings
    concatenated. Exactly one bucket, at `bucket_idx`, is expected to hold
    `num_leading_0s + 1`; all others must remain zero.
    """
    for bucket_idx, bucket_bin_str in self.bucket_idx_to_bin_str.items():
        for leading_0_bin_str, num_leading_0s in (
                self.bin_str_to_leading_zeros.items()):
            hll = HyperLogLogPlusPlus(
                length=self.vector_length,
                random_seed=42,
                hash_class=NoOpHasher,
                num_integer_bits=self.num_integer_bits)
            total_bin_str = bucket_bin_str + leading_0_bin_str
            hll.add(int(total_bin_str, 2))

            # Size the expectation from the sketch length rather than a
            # hard-coded 16 so the two cannot drift apart.
            expected_buckets = np.zeros(self.vector_length, dtype=np.int32)
            expected_buckets[bucket_idx] = num_leading_0s + 1

            # Compare position by position. assertSameElements ignores both
            # order and duplicates, so it would accept the value being
            # written to the wrong bucket.
            np.testing.assert_array_equal(
                hll.buckets,
                expected_buckets,
                err_msg=(f'Wrong buckets for bucket index {bucket_idx} '
                         f'with suffix {leading_0_bin_str!r}.'))
def test_merge_dense_with_dense(self):
    """Merges two sketches that each hold 16 * 6 + 1 distinct items."""
    length = 16
    first = HyperLogLogPlusPlus(length=length, random_seed=234)
    second = HyperLogLogPlusPlus(length=length, random_seed=234)
    # The ranges 0..96 and 100..196 do not overlap, so the union holds
    # 194 distinct values.
    for item in range(length * 6 + 1):
        first.add(item)
        second.add(item + 100)

    merged = first.merge(second)

    self.assertFalse(
        merged.sparse_mode, 'Merged sketch should not be in sparse mode.')
    buckets_shared_with_second = sum(second.buckets == merged.buckets)
    self.assertGreater(
        buckets_shared_with_second, 0, 'Merged sketch is not correct.')
    self.assertSameElements(
        merged.temp_set, set(), 'Temp set is not correct.')
    self.assertAlmostEqual(
        merged.estimate_cardinality(), 194, delta=194 * 0.1)
def test_merge_sparse_with_sparse_to_dense(self):
    """Merged sketch stays sparse at 96 items and turns dense at 97."""
    first = HyperLogLogPlusPlus(length=16, random_seed=234)
    second = HyperLogLogPlusPlus(length=16, random_seed=234)
    items_per_sketch = 16 * 6 // 2  # 48 each, 96 distinct in the union.
    for item in range(items_per_sketch):
        first.add(item)
        second.add(item + 100)

    # 96 distinct items: the merge is expected to remain sparse and exact.
    merged = first.merge(second)
    self.assertTrue(
        merged.sparse_mode, 'Merged sketch should be in sparse mode.')
    self.assertEqual(
        merged.estimate_cardinality(), 96,
        'Estimated cardinality not correct under sparse mode.')

    # One more distinct item brings the union to 97; the merge is now
    # expected to be dense and only approximately correct.
    first.add(1000)
    merged = first.merge(second)
    self.assertFalse(
        merged.sparse_mode, 'Merged sketch should not be in sparse mode.')
    self.assertAlmostEqual(
        merged.estimate_cardinality(),
        97,
        delta=97 * 0.05,
        msg='Estimated cardinality not correct under dense mode.')
def setUp(self):
    """Sets simulation parameters and builds the estimator config maps.

    Populates `self.name_to_non_noised_estimator_config` and
    `self.name_to_noised_estimator_config`, both keyed by each config's
    own `name` attribute.
    """
    super(InteroperabilityTest, self).setUp()

    # Simulation and set-generation parameters.
    self.number_of_trials = 2
    self.universe_size = 2000
    self.set_size_list = [5, 7, 9]
    self.large_set_size = 6
    self.small_set_size = 3
    self.sketch_size = 128
    self.number_of_sets = 3
    self.set_size = 50
    self.num_large_sets = 1
    self.num_small_sets = 3
    self.order = set_generator.ORDER_RANDOM
    self.user_activity_association = (
        set_generator.USER_ACTIVITY_ASSOCIATION_INDEPENDENT)
    self.shared_prop = 0.2

    # Sketch and noise parameters.
    self.num_bloom_filter_hashes = 2
    self.exponential_bloom_filter_decay_rate = 10
    self.noiser_epsilon = np.log(3)
    self.noiser_flip_probability = .25

    # Separately seeded random states for sets, sketches and noise.
    self.set_random_state = np.random.RandomState(42)
    self.sketch_random_state = np.random.RandomState(137)
    self.noise_random_state = np.random.RandomState(3)

    # Short aliases for values reused by many configs below.
    size = self.sketch_size
    num_hashes = self.num_bloom_filter_hashes
    decay_rate = self.exponential_bloom_filter_decay_rate
    epsilon = self.noiser_epsilon
    flip_probability = self.noiser_flip_probability
    noise_rs = self.noise_random_state

    # non-noised estimators
    plain_legions = SketchEstimatorConfig(
        name='cascading_legions',
        sketch_factory=CascadingLegions.get_sketch_factory(size, size),
        estimator=Estimator())
    plain_bloom = SketchEstimatorConfig(
        name='bloom_filter-union_estimator',
        sketch_factory=BloomFilter.get_sketch_factory(size, num_hashes),
        estimator=UnionEstimator())
    plain_log_bloom = SketchEstimatorConfig(
        name='log_bloom_filter-first_moment_log',
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(size),
        estimator=FirstMomentEstimator(method='log'))
    plain_exp_bloom = SketchEstimatorConfig(
        name='exp_bloom_filter-first_moment_exp',
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            size, decay_rate),
        estimator=FirstMomentEstimator(method='exp'))
    plain_voc = SketchEstimatorConfig(
        name='vector_of_counts-sequential',
        sketch_factory=VectorOfCounts.get_sketch_factory(size),
        estimator=SequentialEstimator())
    plain_exact = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet.get_sketch_factory(),
        estimator=LosslessEstimator())
    plain_hll = SketchEstimatorConfig(
        name='hyper_log_log',
        sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(size),
        estimator=HllCardinality())

    # Insertion order here fixes the iteration order of the mapping.
    non_noised_by_name = {}
    for config in (
            plain_exact,
            plain_legions,
            plain_bloom,
            plain_log_bloom,
            plain_exp_bloom,
            plain_voc,
            plain_hll,
    ):
        non_noised_by_name[config.name] = config
    self.name_to_non_noised_estimator_config = non_noised_by_name

    # noised estimators
    noised_legions = SketchEstimatorConfig(
        name='cascading_legions',
        sketch_factory=CascadingLegions.get_sketch_factory(size, size),
        estimator=Estimator(),
        sketch_noiser=Noiser(flip_probability))
    noised_bloom = SketchEstimatorConfig(
        name='bloom_filter-union_estimator',
        sketch_factory=BloomFilter.get_sketch_factory(size, num_hashes),
        estimator=UnionEstimator(),
        sketch_noiser=BlipNoiser(epsilon, noise_rs))
    noised_log_bloom = SketchEstimatorConfig(
        name='log_bloom_filter-first_moment_log',
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(size),
        estimator=FirstMomentEstimator(
            method='log',
            denoiser=SurrealDenoiser(probability=flip_probability)),
        sketch_noiser=BlipNoiser(epsilon, noise_rs))
    noised_exp_bloom = SketchEstimatorConfig(
        name='exp_bloom_filter-first_moment_exp',
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            size, decay_rate),
        estimator=FirstMomentEstimator(
            method='exp',
            denoiser=SurrealDenoiser(probability=flip_probability)),
        sketch_noiser=BlipNoiser(epsilon, noise_rs))
    noised_voc = SketchEstimatorConfig(
        name='vector_of_counts-sequential',
        sketch_factory=VectorOfCounts.get_sketch_factory(size),
        estimator=SequentialEstimator(),
        sketch_noiser=LaplaceNoiser())
    noised_exact = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet.get_sketch_factory(),
        estimator=LosslessEstimator(),
        sketch_noiser=AddRandomElementsNoiser(1, noise_rs))

    # There is no noised HyperLogLog config in this mapping.
    noised_by_name = {}
    for config in (
            noised_exact,
            noised_legions,
            noised_bloom,
            noised_log_bloom,
            noised_exp_bloom,
            noised_voc,
    ):
        noised_by_name[config.name] = config
    self.name_to_noised_estimator_config = noised_by_name
def main(argv):
    """Runs the simulator once per sketch/estimator pair and prints results.

    Every estimator is evaluated on sets produced by the same
    IndependentSetGenerator factory, with sizes taken from FLAGS. Fresh
    RandomState(1) objects are created for each estimator, so every
    estimator starts from identically seeded random states.

    Args:
      argv: command-line arguments; only the program name is accepted.

    Raises:
      app.UsageError: if extra positional arguments are supplied.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    estimator_config_cascading_legions = SketchEstimatorConfig(
        name='cascading-legions',
        sketch_factory=CascadingLegions.get_sketch_factory(
            FLAGS.sketch_size, FLAGS.sketch_size),
        estimator=Estimator())

    estimator_config_bloom_filter = SketchEstimatorConfig(
        name='bloom_filter-union_estimator',
        sketch_factory=BloomFilter.get_sketch_factory(
            FLAGS.sketch_size, FLAGS.num_bloom_filter_hashes),
        estimator=UnionEstimator())

    estimator_config_logarithmic_bloom_filter = SketchEstimatorConfig(
        name='log_bloom_filter-first_moment_log',
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
            FLAGS.sketch_size),
        estimator=FirstMomentEstimator(method='log'))

    estimator_config_exponential_bloom_filter = SketchEstimatorConfig(
        name='exp_bloom_filter-first_moment_exp',
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            FLAGS.sketch_size, FLAGS.exponential_bloom_filter_decay_rate),
        estimator=FirstMomentEstimator(method='exp'))

    estimator_config_voc = SketchEstimatorConfig(
        name='vector_of_counts-sequential',
        sketch_factory=VectorOfCounts.get_sketch_factory(FLAGS.sketch_size),
        estimator=SequentialEstimator())

    estimator_config_hll = SketchEstimatorConfig(
        name='hll++',
        sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(
            FLAGS.sketch_size),
        estimator=HllCardinality())

    estimator_config_exact = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet.get_sketch_factory(),
        estimator=LosslessEstimator())

    # Evaluation order. The former name -> config dict was never read, so
    # it has been dropped; each config already carries its own name.
    estimator_config_list = [
        estimator_config_bloom_filter,
        estimator_config_logarithmic_bloom_filter,
        estimator_config_exponential_bloom_filter,
        estimator_config_cascading_legions,
        estimator_config_exact,
        estimator_config_hll,
        estimator_config_voc,
    ]

    set_generator_factory = (
        set_generator.IndependentSetGenerator.
        get_generator_factory_with_num_and_size(
            universe_size=FLAGS.universe_size,
            num_sets=FLAGS.number_of_sets,
            set_size=FLAGS.set_size))

    for estimator_method_config in estimator_config_list:
        print(f'Calculations for {estimator_method_config.name}')
        # Re-seed per estimator so that all of them start from the same
        # random state.
        set_rs = np.random.RandomState(1)
        sketch_rs = np.random.RandomState(1)

        simulator = Simulator(
            num_runs=FLAGS.number_of_trials,
            set_generator_factory=set_generator_factory,
            sketch_estimator_config=estimator_method_config,
            set_random_state=set_rs,
            sketch_random_state=sketch_rs)

        # Only the aggregated statistics are reported.
        _, agg_data = simulator.run_all_and_aggregate()
        print(f'Aggregate Statistics for {estimator_method_config.name}')
        print(agg_data)
def setUp(self):
    """Sets simulation parameters and builds the estimator config maps.

    Populates `self.name_to_non_noised_estimator_config` and
    `self.name_to_noised_estimator_config`, both keyed by short
    sketch-family names.
    """
    super(InteroperabilityTest, self).setUp()

    # Simulation and set-generation parameters.
    self.number_of_trials = 2
    self.universe_size = 2000
    self.set_size = 5
    self.large_set_size = 6
    self.small_set_size = 3
    self.sketch_size = 64
    self.number_of_sets = 2
    self.num_large_sets = 1
    self.num_small_sets = 3
    self.order = set_generator.ORDER_RANDOM
    self.user_activity_association = (
        set_generator.USER_ACTIVITY_ASSOCIATION_INDEPENDENT)
    self.shared_prop = 0.2

    # Sketch and noise parameters.
    self.num_bloom_filter_hashes = 2
    self.exponential_bloom_filter_decay_rate = 10
    self.noiser_epsilon = np.log(3)
    self.noiser_flip_probability = .25

    # Separately seeded random states for sets, sketches and noise.
    self.set_random_state = np.random.RandomState(42)
    self.sketch_random_state = np.random.RandomState(137)
    self.noise_random_state = np.random.RandomState(3)

    # Short aliases for values reused by many configs below.
    size = self.sketch_size
    num_hashes = self.num_bloom_filter_hashes
    decay_rate = self.exponential_bloom_filter_decay_rate
    epsilon = self.noiser_epsilon
    flip_probability = self.noiser_flip_probability
    noise_rs = self.noise_random_state

    # non-noised estimators
    # Every non-noised config passes both noisers explicitly as None.
    no_noise = dict(sketch_noiser=None, estimate_noiser=None)

    plain_legions = EstimatorConfig(
        sketch_factory=CascadingLegions.get_sketch_factory(size, size),
        estimator=Estimator(),
        **no_noise)
    plain_bloom = EstimatorConfig(
        sketch_factory=BloomFilter.get_sketch_factory(size, num_hashes),
        estimator=UnionEstimator(),
        **no_noise)
    plain_log_bloom = EstimatorConfig(
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(size),
        estimator=FirstMomentEstimator(method='log'),
        **no_noise)
    plain_exp_bloom = EstimatorConfig(
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            size, decay_rate),
        estimator=FirstMomentEstimator(method='exp'),
        **no_noise)
    plain_geo_bloom = EstimatorConfig(
        sketch_factory=GeometricBloomFilter.get_sketch_factory(size),
        estimator=GeometricUnionEstimator(),
        **no_noise)
    plain_voc = EstimatorConfig(
        sketch_factory=VectorOfCounts.get_sketch_factory(size),
        estimator=SequentialEstimator(),
        **no_noise)
    plain_exact = EstimatorConfig(
        sketch_factory=ExactSet.get_sketch_factory(),
        estimator=LosslessEstimator(),
        **no_noise)
    plain_hll = EstimatorConfig(
        sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(size),
        estimator=HllCardinality(),
        **no_noise)

    self.name_to_non_noised_estimator_config = {
        'exact_set': plain_exact,
        'cascading_legions': plain_legions,
        'bloom_filter': plain_bloom,
        'logarithmic_bloom_filter': plain_log_bloom,
        'exponential_bloom_filter': plain_exp_bloom,
        'geometric_bloom_filter': plain_geo_bloom,
        'vector_of_counts': plain_voc,
        'hll': plain_hll,
    }

    # noised estimators
    noised_legions = EstimatorConfig(
        sketch_factory=CascadingLegions.get_sketch_factory(size, size),
        estimator=Estimator(),
        sketch_noiser=Noiser(flip_probability),
        estimate_noiser=None)
    noised_bloom = EstimatorConfig(
        sketch_factory=BloomFilter.get_sketch_factory(size, num_hashes),
        estimator=UnionEstimator(),
        sketch_noiser=BlipNoiser(epsilon, noise_rs),
        estimate_noiser=None)
    # NOTE(review): the next two configs install a SurrealDenoiser but pass
    # sketch_noiser=None, so no sketch noise is added here. Confirm that
    # this is intended.
    noised_log_bloom = EstimatorConfig(
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(size),
        estimator=FirstMomentEstimator(
            method='log',
            denoiser=SurrealDenoiser(probability=flip_probability)),
        sketch_noiser=None,
        estimate_noiser=None)
    noised_exp_bloom = EstimatorConfig(
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            size, decay_rate),
        estimator=FirstMomentEstimator(
            method='exp',
            denoiser=SurrealDenoiser(probability=flip_probability)),
        sketch_noiser=None,
        estimate_noiser=None)
    noised_geo_bloom = EstimatorConfig(
        sketch_factory=GeometricBloomFilter.get_sketch_factory(size),
        estimator=GeometricUnionEstimator(),
        sketch_noiser=BlipNoiser(epsilon, noise_rs),
        estimate_noiser=None)
    noised_voc = EstimatorConfig(
        sketch_factory=VectorOfCounts.get_sketch_factory(size),
        estimator=SequentialEstimator(),
        sketch_noiser=LaplaceNoiser(),
        estimate_noiser=None)
    noised_exact = EstimatorConfig(
        sketch_factory=ExactSet.get_sketch_factory(),
        estimator=LosslessEstimator(),
        sketch_noiser=AddRandomElementsNoiser(1, noise_rs),
        estimate_noiser=None)

    # There is no noised 'hll' entry in this mapping.
    self.name_to_noised_estimator_config = {
        'exact_set': noised_exact,
        'cascading_legions': noised_legions,
        'bloom_filter': noised_bloom,
        'logarithmic_bloom_filter': noised_log_bloom,
        'exponential_bloom_filter': noised_exp_bloom,
        'geometric_bloom_filter': noised_geo_bloom,
        'vector_of_counts': noised_voc,
    }