def test_end_to_end_noise_without_oneplus_budget(self):
    """Pairwise estimate over two PlusNoiser-noised stratified sketches.

    Both sketches are built with epsilon=0.8 and epsilon_split=0; the
    estimator is configured with MinusDenoiser. The expected result is
    [6, 4, 3].
    """
    max_freq = 3

    def build_noised_sketch(multi_set):
        # Same settings (including the seed) for both inputs; a fresh
        # sketch factory is requested on every call.
        return stratified_sketch.StratifiedSketch.init_from_exact_multi_set(
            max_freq,
            multi_set,
            epsilon=0.8,
            epsilon_split=0,
            noiser_class=PlusNoiser,
            union=stratified_sketch.ExactSetOperator.union,
            cardinality_sketch_factory=ExactMultiSet.get_sketch_factory(),
            random_seed=1)

    this_multi_set = generate_multi_set([(1, 2), (2, 3), (3, 1), (10, 1)])
    that_multi_set = generate_multi_set([(1, 1), (3, 1), (4, 5), (5, 1)])
    this_noised_sketch = build_noised_sketch(this_multi_set)
    that_noised_sketch = build_noised_sketch(that_multi_set)

    pairwise_estimator = stratified_sketch.PairwiseEstimator(
        denoiser_class=MinusDenoiser,
        sketch_operator=stratified_sketch.ExactSetOperator,
        cardinality_estimator=LosslessEstimator())

    self.assertEqual(
        pairwise_estimator(this_noised_sketch, that_noised_sketch),
        [6, 4, 3])
def test_noiser_for_exact_multi_set(self):
    """Noising a 2-element set with 3 random elements gives length 5.

    The input set is also checked to still have length 2 afterwards.
    """
    original = ExactMultiSet()
    original.add_ids([1, 2])
    noiser = AddRandomElementsNoiser(
        num_random_elements=3,
        random_state=np.random.RandomState(1))

    noised = noiser(original)

    self.assertLen(original, 2)
    self.assertLen(noised, 5)
def run_one(self):
    """Run one iteration.

    Builds one sketch per generated set, optionally noises the sketches,
    then for each prefix of 1, 2, ..., n sets compares the estimated
    frequency histogram with the exact one.

    Returns:
      A pd.DataFrame with 2f+2 columns, where f is the maximum frequency
      (self.sketch_estimator_config.max_frequency). The columns are, in
      order: NUM_SETS, ESTIMATED_CARDINALITY_BASENAME + i for
      i = 1, ..., f, TRUE_CARDINALITY_BASENAME + i for i = 1, ..., f, and
      SHUFFLE_DISTANCE. There is one row per prefix of the generated sets.
    """
    config = self.sketch_estimator_config
    set_generator = self.set_generator_factory(self.set_random_state)
    # One seed is drawn per run and handed to every sketch factory call.
    sketch_random_seed = self.sketch_random_state.randint(2**32 - 1)

    # Build the sketches and keep track of actual ids for
    # later comparison.
    sketches = []
    actual_ids = []
    for campaign_ids in set_generator:
        actual_ids.append(campaign_ids)
        sketch = config.sketch_factory(sketch_random_seed)
        sketch.add_ids(campaign_ids)
        sketches.append(sketch)

    # Optionally noise the sketches. A missing or falsy attribute means
    # "no noising".
    sketch_noiser = getattr(config, 'sketch_noiser', None)
    if sketch_noiser:
        sketches = [sketch_noiser(s) for s in sketches]

    # Estimate cardinality for 1, 2, ..., n pubs.
    estimator = config.estimator
    # Optional per-bucket noiser applied to each estimated histogram entry.
    estimate_noiser = getattr(config, 'estimate_noiser', None)
    # A set that keeps the running union.
    true_union = ExactMultiSet()
    metrics = []
    max_freq = config.max_frequency
    for num_sets, ids in enumerate(actual_ids, start=1):
        estimated_cardinality = self._extend_histogram(
            estimator(sketches[:num_sets]), max_freq)
        if estimate_noiser:
            estimated_cardinality = [
                estimate_noiser(e) for e in estimated_cardinality
            ]
        true_union.add_ids(ids)
        true_cardinality = self._extend_histogram(
            LosslessEstimator()([true_union]), max_freq)
        shuffle_distance = self._shuffle_distance(
            estimated_cardinality, true_cardinality)
        metrics.append([num_sets] + estimated_cardinality +
                       true_cardinality + [shuffle_distance])

    frequencies = range(1, max_freq + 1)
    df_columns = (
        [NUM_SETS] +
        [ESTIMATED_CARDINALITY_BASENAME + str(f) for f in frequencies] +
        [TRUE_CARDINALITY_BASENAME + str(f) for f in frequencies] +
        [SHUFFLE_DISTANCE])
    return pd.DataFrame(metrics, columns=df_columns)
def test_heterogeneous_multi_set_generator_test_impression_count(self):
    """Histogram of the generated ids starts at 10 and has more than one bucket."""
    generator = HeterogeneousMultiSetGenerator(
        1000, [10], [(1, 1)], np.random.RandomState(1))
    merged = ExactMultiSet()
    for generated_ids in generator:
        merged.add_ids(generated_ids)

    histogram = LosslessEstimator()([merged])

    self.assertEqual(histogram[0], 10)
    self.assertGreater(len(histogram), 1)
def test_heterogeneous_multi_set_generator_with_frequency_cap(self):
    """With freq_cap=1 the exact histogram is the single bucket [100]."""
    generator = HeterogeneousMultiSetGenerator(
        1000, [100], [(1, 1)], np.random.RandomState(1), freq_cap=1)
    merged = ExactMultiSet()
    for generated_ids in generator:
        merged.add_ids(generated_ids)

    histogram = LosslessEstimator()([merged])

    self.assertEqual(histogram, [100])
def test_homogeneous_pmf_multiset_generator_single_set(self):
    """A single generated set yields exactly one histogram, equal to [2]."""

    def exact_histogram(generated_ids):
        # Exact frequency histogram of one generated set.
        multi_set = ExactMultiSet()
        multi_set.add_ids(generated_ids)
        return LosslessEstimator()([multi_set])

    generator = HomogeneousPmfMultiSetGenerator(
        100, [2], [[1]], np.random.RandomState(1))
    histograms = [exact_histogram(ids) for ids in generator]

    self.assertLen(histograms, 1)
    self.assertEqual(histograms[0], [2])
def test_publisher_constant_frequency_set_generator(self):
    """Three generated sets give histograms [1]*3, [2]*3 and [3]*3."""

    def exact_histogram(generated_ids):
        # Exact frequency histogram of one generated set.
        multi_set = ExactMultiSet()
        multi_set.add_ids(generated_ids)
        return LosslessEstimator()([multi_set])

    generator = PublisherConstantFrequencySetGenerator(
        100, [1, 2, 3], 3, np.random.RandomState(1))
    histograms = [exact_histogram(ids) for ids in generator]

    self.assertLen(histograms, 3)
    wanted_histograms = [[1, 1, 1], [2, 2, 2], [3, 3, 3]]
    for actual, wanted in zip(histograms, wanted_histograms):
        self.assertEqual(actual, wanted)
def test_less_one_estimator_multiple_for_exact_multi_set(self):
    """LessOneEstimator over the sets {1, 2} and {1, 3, 4} returns [3, 0]."""
    first = ExactMultiSet()
    first.add_ids([1, 2])
    second = ExactMultiSet()
    second.add_ids([1, 3, 4])

    estimate = LessOneEstimator()([first, second])

    self.assertEqual(estimate, [3, 0])
def test_independent_set_estimator_two_sketches_single_frequency(self):
    """Two sketches each holding ids 0..49, universe size 100, give [75, 25]."""
    sketches = []
    for _ in range(2):
        sketch = ExactMultiSet()
        sketch.add_ids(range(50))
        sketches.append(sketch)

    independent_estimator = IndependentSetEstimator(LosslessEstimator(), 100)

    self.assertEqual(independent_estimator(sketches), [75, 25])
def test_independent_set_estimator_two_sketches_multiple_frequencies(self):
    """Two sketches with repeated ids, universe size 100, give [65, 34, 9, 2]."""
    first = ExactMultiSet()
    first.add_ids([*range(50), *range(20)])
    second = ExactMultiSet()
    second.add_ids([*range(30), *range(10)])

    independent_estimator = IndependentSetEstimator(LosslessEstimator(), 100)
    histogram = independent_estimator([first, second])

    self.assertEqual(histogram, [65, 34, 9, 2])
def test_multi_frequency_sketch_for_exact_multi_set(self):
    """Per-id frequencies and the lossless histogram match the added ids."""
    multi_set = ExactMultiSet()
    multi_set.add_ids([1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 1, 2, 3, 1, 2, 1, 1])

    # id -> number of times it was added above.
    wanted_frequency = {1: 6, 2: 4, 3: 3, 4: 2, 5: 1, 6: 1}
    for vid, count in wanted_frequency.items():
        self.assertEqual(multi_set.frequency(vid), count)

    self.assertEqual(LosslessEstimator()([multi_set]), [6, 4, 3, 2, 1, 1])
def setUp(self):
    """Create two exact stratified sketches, an estimator and the expected merge.

    Sets self.this_sketch, self.that_sketch, self.estimator and
    self.merge_expected.
    """
    super(PairwiseEstimatorTest, self).setUp()
    max_freq = 3

    def build_exact_sketch(multi_set):
        # No noise arguments here; a fresh sketch factory is requested on
        # every call.
        return stratified_sketch.StratifiedSketch.init_from_exact_multi_set(
            max_freq,
            multi_set,
            cardinality_sketch_factory=ExactMultiSet.get_sketch_factory(),
            random_seed=1)

    this_multi_set = generate_multi_set([(1, 2), (2, 3), (3, 1), (10, 1)])
    that_multi_set = generate_multi_set([(1, 1), (3, 1), (4, 5), (5, 1)])
    self.this_sketch = build_exact_sketch(this_multi_set)
    self.that_sketch = build_exact_sketch(that_multi_set)

    self.estimator = stratified_sketch.PairwiseEstimator(
        sketch_operator=stratified_sketch.ExactSetOperator,
        cardinality_estimator=LosslessEstimator())

    # Expected ids per stratum key; every id maps to the count 1.
    self.merge_expected = {
        ONE_PLUS: dict.fromkeys((1, 2, 3, 4, 5, 10), 1),
        1: dict.fromkeys((5, 10), 1),
        2: dict.fromkeys((3,), 1),
        '3+': dict.fromkeys((1, 2, 4), 1),
    }
def test_sketch_building_from_exact_multi_set(self):
    """init_from_exact_multi_set places each id in the expected strata."""
    max_freq = 3
    input_set = ExactMultiSet()
    for vid in [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]:
        input_set.add(vid)

    # Expected ids per stratum key.
    expected = {
        '1+': dict.fromkeys((1, 2, 3, 4), 1),
        1: {1: 1},
        2: {2: 1},
        '3+': dict.fromkeys((3, 4), 1),
    }

    stratified = stratified_sketch.StratifiedSketch.init_from_exact_multi_set(
        max_freq,
        input_set,
        cardinality_sketch_factory=ExactMultiSet.get_sketch_factory(),
        union=stratified_sketch.ExactSetOperator.union,
        random_seed=1)

    self.assertLen(stratified.sketches.keys(), len(expected))
    for freq, stratum in stratified.sketches.items():
        self.assertEqual(stratum.ids(), expected[freq])
def generate_sketches_from_sets(self, multi_sets, max_freq, epsilon=0,
                                epsilon_split=0):
    """Build one PlusNoiser-configured StratifiedSketch per input multi set.

    Args:
      multi_sets: iterable of multi sets, each passed to
        StratifiedSketch.init_from_exact_multi_set.
      max_freq: maximum frequency forwarded to every sketch.
      epsilon: forwarded as the `epsilon` argument of every sketch.
      epsilon_split: forwarded as the `epsilon_split` argument of every
        sketch.

    Returns:
      A list of sketches, in the same order as `multi_sets`.
    """
    build = stratified_sketch.StratifiedSketch.init_from_exact_multi_set
    return [
        build(
            max_freq,
            multi_set,
            union=stratified_sketch.ExactSetOperator.union,
            epsilon=epsilon,
            epsilon_split=epsilon_split,
            noiser_class=PlusNoiser,
            cardinality_sketch_factory=ExactMultiSet.get_sketch_factory(),
            random_seed=1)
        for multi_set in multi_sets
    ]
def test_noise_without_oneplus_budget(self):
    """Sketches built with PlusNoiser and epsilon_split=0 hold the expected ids.

    Also checks that _destroy_sketches() leaves `sketches` empty.
    """
    max_freq = 3
    stratified = stratified_sketch.StratifiedSketch(
        max_freq=max_freq,
        epsilon=0.8,
        epsilon_split=0,
        noiser_class=PlusNoiser,
        union=stratified_sketch.ExactSetOperator.union,
        cardinality_sketch_factory=ExactMultiSet.get_sketch_factory(),
        random_seed=1)
    for vid in [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]:
        stratified.add(vid)
    stratified.create_sketches()

    # Expected (noised) ids per stratum key.
    expected = {
        '1+': {9.0: 1, 10.0: 1, 11.0: 1, 12.0: 1},
        1: {9.0: 1},
        2: {10.0: 1},
        '3+': {11.0: 1, 12.0: 1},
    }
    self.assertLen(stratified.sketches.keys(), len(expected))
    for freq, stratum in stratified.sketches.items():
        self.assertEqual(stratum.ids(), expected[freq])

    stratified._destroy_sketches()
    self.assertEqual(stratified.sketches, {})
def test_sketch_create_and_destroy(self):
    """create_sketches() builds the expected strata; _destroy_sketches() clears them."""
    max_freq = 3
    stratified = stratified_sketch.StratifiedSketch(
        max_freq=max_freq,
        union=stratified_sketch.ExactSetOperator.union,
        cardinality_sketch_factory=ExactMultiSet.get_sketch_factory(),
        random_seed=1)
    # Id `vid` is added exactly `vid` times, so id 0 is never added.
    for vid in range(max_freq + 2):
        for _ in range(vid):
            stratified.add(vid)
    stratified.create_sketches()

    # Expected ids per stratum key.
    expected = {
        '1+': dict.fromkeys((1, 2, 3, 4), 1),
        1: {1: 1},
        2: {2: 1},
        '3+': dict.fromkeys((3, 4), 1),
    }
    self.assertLen(stratified.sketches.keys(), len(expected))
    for freq, stratum in stratified.sketches.items():
        self.assertEqual(stratum.ids(), expected[freq])

    stratified._destroy_sketches()
    self.assertEqual(stratified.sketches, {})
def test_sketch_building_from_set_generator(self):
    """init_from_set_generator stratifies the ids of all generated sets.

    The fake generator yields two sets; the expected strata correspond to
    the combined frequencies of ids across both sets.
    """
    max_freq = 3
    expected_sets = [[1, 1, 1, 2, 2, 3], [1, 1, 1, 3, 3, 4]]
    set_gen = FakeSetGenerator(expected_sets)

    s = stratified_sketch.StratifiedSketch.init_from_set_generator(
        max_freq,
        set_generator=set_gen,
        union=stratified_sketch.ExactSetOperator.union,
        cardinality_sketch_factory=ExactMultiSet.get_sketch_factory(),
        random_seed=1)

    # Expected ids per stratum key.
    expected = {
        '1+': {4: 1, 1: 1, 2: 1, 3: 1},
        1: {4: 1},
        2: {2: 1},
        '3+': {1: 1, 3: 1},
    }
    self.assertLen(s.sketches.keys(), len(expected.keys()))
    for freq, sketch in s.sketches.items():
        self.assertEqual(sketch.ids(), expected[freq])
def __call__(self, sketch):
    """Return a new ExactMultiSet with self.constant subtracted from every id.

    Args:
      sketch: object exposing ids() (iterated for its ids) and
        frequency(id).

    Returns:
      A new ExactMultiSet in which each shifted id is added as many times
      as sketch.frequency reports for the original id.
    """
    shifted_set = ExactMultiSet()
    for vid in sketch.ids():
        shifted_id = vid - self.constant
        copies = sketch.frequency(vid)
        shifted_set.add_ids([shifted_id] * copies)
    return shifted_set
def setUp(self):
    """Prepare shared fixture values and estimator configurations.

    Sets scalar parameters (universe/set sizes, sketch size, bloom filter
    settings, noiser settings), three seeded np.random.RandomState
    objects, and two dictionaries keyed by config name:
    self.name_to_non_noised_estimator_config and
    self.name_to_noised_estimator_config.
    """
    super(InteroperabilityTest, self).setUp()
    self.number_of_trials = 2
    self.universe_size = 2000
    self.set_size_list = [5, 7, 9]
    self.large_set_size = 6
    self.small_set_size = 3
    self.sketch_size = 128
    self.number_of_sets = 3
    self.set_size = 50
    self.num_large_sets = 1
    self.num_small_sets = 3
    self.order = set_generator.ORDER_RANDOM
    self.user_activity_association = (
        set_generator.USER_ACTIVITY_ASSOCIATION_INDEPENDENT)
    self.shared_prop = 0.2
    self.num_bloom_filter_hashes = 2
    self.exponential_bloom_filter_decay_rate = 10
    self.geometic_bloom_filter_probability = 0.08
    self.noiser_epsilon = np.log(3)
    self.noiser_flip_probability = .25

    # Separate, fixed-seed random states for set generation, sketch
    # seeding and noise.
    self.set_random_state = np.random.RandomState(42)
    self.sketch_random_state = np.random.RandomState(137)
    self.noise_random_state = np.random.RandomState(3)

    # Non-noised estimators: one config per sketch type, no sketch_noiser.
    estimator_config_cascading_legions = SketchEstimatorConfig(
        name='cascading_legions',
        sketch_factory=CascadingLegions.get_sketch_factory(
            self.sketch_size, self.sketch_size),
        estimator=Estimator())

    estimator_config_bloom_filter = SketchEstimatorConfig(
        name='bloom_filter-union_estimator',
        sketch_factory=BloomFilter.get_sketch_factory(
            self.sketch_size, self.num_bloom_filter_hashes),
        estimator=UnionEstimator())

    estimator_config_geometric_bloom_filter = SketchEstimatorConfig(
        name='geo_bloom_filter-first_moment_geo',
        sketch_factory=GeometricBloomFilter.get_sketch_factory(
            self.sketch_size, self.geometic_bloom_filter_probability),
        estimator=FirstMomentEstimator(method='geo'))

    estimator_config_logarithmic_bloom_filter = SketchEstimatorConfig(
        name='log_bloom_filter-first_moment_log',
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
            self.sketch_size),
        estimator=FirstMomentEstimator(method='log'))

    estimator_config_exponential_bloom_filter = SketchEstimatorConfig(
        name='exp_bloom_filter-first_moment_exp',
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            self.sketch_size, self.exponential_bloom_filter_decay_rate),
        estimator=FirstMomentEstimator(method='exp'))

    estimator_config_voc = SketchEstimatorConfig(
        name='vector_of_counts-sequential',
        sketch_factory=VectorOfCounts.get_sketch_factory(self.sketch_size),
        estimator=SequentialEstimator())

    estimator_config_exact = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactMultiSet.get_sketch_factory(),
        estimator=LosslessEstimator())

    estimator_config_hll = SketchEstimatorConfig(
        name='hyper_log_log',
        sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(
            self.sketch_size),
        estimator=HllCardinality())

    # This config uses its own length (10**5) and decay rate rather than
    # the self.* fixture values, and noises the estimate instead of the
    # sketch.
    # NOTE(review): the name string ends in "_d" while the variable ends in
    # "_dp" -- possibly a truncated name; confirm before relying on it.
    estimator_config_expadbf_first_moment_global_dp = SketchEstimatorConfig(
        name='estimator_config_expadbf_first_moment_global_d',
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            length=10**5, decay_rate=10),
        estimator=FirstMomentEstimator(
            method=FirstMomentEstimator.METHOD_EXP,
            noiser=GeometricEstimateNoiser(epsilon=math.log(3))))

    config_list = [
        estimator_config_exact,
        estimator_config_cascading_legions,
        estimator_config_bloom_filter,
        estimator_config_logarithmic_bloom_filter,
        estimator_config_exponential_bloom_filter,
        estimator_config_geometric_bloom_filter,
        estimator_config_voc,
        estimator_config_hll,
        estimator_config_expadbf_first_moment_global_dp,
    ]

    self.name_to_non_noised_estimator_config = {
        config.name: config for config in config_list
    }

    # Noised estimators: same names as above, each with a sketch_noiser.
    # The BlipNoiser and AddRandomElementsNoiser instances all receive the
    # same self.noise_random_state object.
    noised_estimator_config_cascading_legions = SketchEstimatorConfig(
        name='cascading_legions',
        sketch_factory=CascadingLegions.get_sketch_factory(
            self.sketch_size, self.sketch_size),
        estimator=Estimator(),
        sketch_noiser=Noiser(self.noiser_flip_probability))

    noised_estimator_config_bloom_filter = SketchEstimatorConfig(
        name='bloom_filter-union_estimator',
        sketch_factory=BloomFilter.get_sketch_factory(
            self.sketch_size, self.num_bloom_filter_hashes),
        estimator=UnionEstimator(),
        sketch_noiser=BlipNoiser(self.noiser_epsilon, self.noise_random_state))

    noised_estimator_config_geometric_bloom_filter = SketchEstimatorConfig(
        name='geo_bloom_filter-first_moment_geo',
        sketch_factory=GeometricBloomFilter.get_sketch_factory(
            self.sketch_size, self.geometic_bloom_filter_probability),
        estimator=FirstMomentEstimator(
            method='geo',
            denoiser=SurrealDenoiser(epsilon=math.log(3))),
        sketch_noiser=BlipNoiser(self.noiser_epsilon, self.noise_random_state))

    noised_estimator_config_logarithmic_bloom_filter = SketchEstimatorConfig(
        name='log_bloom_filter-first_moment_log',
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
            self.sketch_size),
        estimator=FirstMomentEstimator(
            method='log',
            denoiser=SurrealDenoiser(epsilon=math.log(3))),
        sketch_noiser=BlipNoiser(self.noiser_epsilon, self.noise_random_state))

    noised_estimator_config_exponential_bloom_filter = SketchEstimatorConfig(
        name='exp_bloom_filter-first_moment_exp',
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            self.sketch_size, self.exponential_bloom_filter_decay_rate),
        estimator=FirstMomentEstimator(
            method='exp',
            denoiser=SurrealDenoiser(epsilon=math.log(3))),
        sketch_noiser=BlipNoiser(self.noiser_epsilon, self.noise_random_state))

    noised_estimator_config_voc = SketchEstimatorConfig(
        name='vector_of_counts-sequential',
        sketch_factory=VectorOfCounts.get_sketch_factory(self.sketch_size),
        estimator=SequentialEstimator(),
        sketch_noiser=LaplaceNoiser())

    noised_estimator_config_exact = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactMultiSet.get_sketch_factory(),
        estimator=LosslessEstimator(),
        sketch_noiser=AddRandomElementsNoiser(1, self.noise_random_state))

    # The hyper_log_log and global-DP configs have no entry in this list.
    noised_config_list = [
        noised_estimator_config_exact,
        noised_estimator_config_cascading_legions,
        noised_estimator_config_bloom_filter,
        noised_estimator_config_logarithmic_bloom_filter,
        noised_estimator_config_exponential_bloom_filter,
        noised_estimator_config_geometric_bloom_filter,
        noised_estimator_config_voc,
    ]

    self.name_to_noised_estimator_config = {
        config.name: config for config in noised_config_list
    }
def test_less_one_estimator_for_empty_set(self):
    """LessOneEstimator raises ValueError when given a single empty set."""
    empty_set = ExactMultiSet()
    estimator = LessOneEstimator()
    with self.assertRaises(ValueError):
        estimator([empty_set])
def test_less_one_estimator_no_freq1or2(self):
    """Three ids, each added three times, give [2, 2, 2] from LessOneEstimator."""
    multi_set = ExactMultiSet()
    multi_set.add_ids([1, 1, 1, 2, 2, 2, 3, 3, 3])

    estimate = LessOneEstimator()([multi_set])

    self.assertEqual(estimate, [2, 2, 2])
def test_lossless_estimator_for_empty_set(self):
    """LosslessEstimator returns [0] for a single empty set."""
    empty_set = ExactMultiSet()

    histogram = LosslessEstimator()([empty_set])

    self.assertEqual(histogram, [0])
def test_lossless_estimator_for_exact_multi_set(self):
    """LosslessEstimator returns [3, 2, 1] for ids added 3, 2 and 1 times."""
    multi_set = ExactMultiSet()
    multi_set.add_ids([1, 2, 3, 1, 2, 1])

    histogram = LosslessEstimator()([multi_set])

    self.assertEqual(histogram, [3, 2, 1])
def test_sketch_for_exact_multi_set(self):
    """An ExactMultiSet supports length and membership checks on added ids."""
    id_set = ExactMultiSet()
    id_set.add_ids([1, 2])

    self.assertLen(id_set, 2, "ID set has wrong length.")
    self.assertIn(2, id_set, "ID set is missing an ID.")
    self.assertNotIn(3, id_set, "ID set contains an unexpected ID.")
def test_independent_set_estimator_universe_size_exceeded(self):
    """Estimating a sketch of 11 ids with universe size 10 raises AssertionError."""
    sketch = ExactMultiSet()
    sketch.add_ids(range(11))
    estimator = IndependentSetEstimator(LosslessEstimator(), 10)
    with self.assertRaises(AssertionError):
        # Only the raised exception matters; the return value is unused.
        estimator([sketch])
def test_independent_set_estimator_single_sketch(self):
    """A single sketch passed through IndependentSetEstimator gives [5, 2, 1]."""
    only_sketch = ExactMultiSet()
    only_sketch.add_ids([1, 2, 2, 3, 3, 3, 4, 5])

    independent_estimator = IndependentSetEstimator(LosslessEstimator(), 100)
    histogram = independent_estimator([only_sketch])

    self.assertEqual(histogram, [5, 2, 1])
def generate_multi_set(tuple_list):
    """Build an ExactMultiSet from (id, count) entries.

    Args:
      tuple_list: iterable of indexable entries; element 0 is the id and
        element 1 is the number of times that id is added.

    Returns:
      A new ExactMultiSet containing every id repeated `count` times.
    """
    multi_set = ExactMultiSet()
    # `entry` rather than `tuple`, so the builtin is not shadowed.
    for entry in tuple_list:
        multi_set.add_ids([entry[0]] * entry[1])
    return multi_set