def test_end_to_end_noise_without_oneplus_budget(self):
        """Pairwise-estimate two noised sketches and expect [6, 4, 3].

        Both sketches are built with epsilon=0.8, epsilon_split=0 and
        PlusNoiser; the estimator is configured with MinusDenoiser.
        """
        max_freq = 3

        def make_noised_sketch(multi_set):
            # Identical settings and seed for both inputs; the sketch
            # factory is requested anew for every sketch.
            return stratified_sketch.StratifiedSketch.init_from_exact_multi_set(
                max_freq,
                multi_set,
                epsilon=0.8,
                epsilon_split=0,
                noiser_class=PlusNoiser,
                union=stratified_sketch.ExactSetOperator.union,
                cardinality_sketch_factory=ExactMultiSet.get_sketch_factory(),
                random_seed=1)

        first_set = generate_multi_set([(1, 2), (2, 3), (3, 1), (10, 1)])
        second_set = generate_multi_set([(1, 1), (3, 1), (4, 5), (5, 1)])
        first_sketch = make_noised_sketch(first_set)
        second_sketch = make_noised_sketch(second_set)

        pairwise = stratified_sketch.PairwiseEstimator(
            denoiser_class=MinusDenoiser,
            sketch_operator=stratified_sketch.ExactSetOperator,
            cardinality_estimator=LosslessEstimator())

        self.assertEqual(pairwise(first_sketch, second_sketch), [6, 4, 3])
 def test_noiser_for_exact_multi_set(self):
     """Noising a 2-id set with 3 random elements gives a 5-element result.

     The input set must still have length 2 afterwards.
     """
     original = ExactMultiSet()
     original.add_ids([1, 2])
     noiser = AddRandomElementsNoiser(
         num_random_elements=3, random_state=np.random.RandomState(1))
     noised = noiser(original)
     self.assertLen(original, 2)
     self.assertLen(noised, 5)
Пример #3
0
    def run_one(self):
        """Run one iteration.

        One sketch is built per generated set (all with the same random
        seed), the sketches are optionally noised, and then for every
        prefix of 1, 2, ..., n sketches the estimated and true histograms
        are recorded together with their shuffle distance.

        Returns:
          A pd.DataFrame that has 2f+2 columns, where f is the maximum
          frequency.  The columns are NUM_SETS, then
          ESTIMATED_CARDINALITY_BASENAME + i and
          TRUE_CARDINALITY_BASENAME + i for i = 1, ..., f, and finally
          SHUFFLE_DISTANCE.
        """
        config = self.sketch_estimator_config
        set_generator = self.set_generator_factory(self.set_random_state)
        sketch_random_seed = self.sketch_random_state.randint(2**32 - 1)

        # Build the sketches and keep track of actual ids for
        # later comparison.
        sketches = []
        actual_ids = []
        for campaign_ids in set_generator:
            actual_ids.append(campaign_ids)
            sketch = config.sketch_factory(sketch_random_seed)
            sketch.add_ids(campaign_ids)
            sketches.append(sketch)

        # Optionally noise the sketches (attribute may be absent or falsy).
        sketch_noiser = getattr(config, 'sketch_noiser', None)
        if sketch_noiser:
            sketches = [sketch_noiser(s) for s in sketches]

        # Estimate cardinality for 1, 2, ..., n pubs.
        estimator = config.estimator
        # Looked up once: the optional estimate noiser is loop-invariant.
        estimate_noiser = getattr(config, 'estimate_noiser', None)
        # A set that keeps the running union.
        true_union = ExactMultiSet()
        metrics = []
        max_freq = config.max_frequency
        for num_sets, ids in enumerate(actual_ids, start=1):
            estimated_cardinality = self._extend_histogram(
                estimator(sketches[:num_sets]), max_freq)
            if estimate_noiser:
                estimated_cardinality = [
                    estimate_noiser(e) for e in estimated_cardinality
                ]
            true_union.add_ids(ids)
            true_cardinality = self._extend_histogram(
                LosslessEstimator()([true_union]), max_freq)
            shuffle_distance = self._shuffle_distance(estimated_cardinality,
                                                      true_cardinality)
            metrics.append([num_sets] + estimated_cardinality +
                           true_cardinality + [shuffle_distance])

        frequencies = range(1, max_freq + 1)
        df_columns = (
            [NUM_SETS] +
            [ESTIMATED_CARDINALITY_BASENAME + str(f) for f in frequencies] +
            [TRUE_CARDINALITY_BASENAME + str(f) for f in frequencies] +
            [SHUFFLE_DISTANCE])
        return pd.DataFrame(metrics, columns=df_columns)
 def test_heterogeneous_multi_set_generator_test_impression_count(self):
     """The histogram of all generated ids starts at 10 and has >1 entries."""
     generator = HeterogeneousMultiSetGenerator(
         1000, [10], [(1, 1)], np.random.RandomState(1))
     combined = ExactMultiSet()
     for id_batch in generator:
         combined.add_ids(id_batch)
     histogram = LosslessEstimator()([combined])
     self.assertEqual(histogram[0], 10)
     self.assertGreater(len(histogram), 1)
 def test_heterogeneous_multi_set_generator_with_frequency_cap(self):
     """With freq_cap=1 the histogram of all generated ids is exactly [100]."""
     generator = HeterogeneousMultiSetGenerator(
         1000, [100], [(1, 1)], np.random.RandomState(1), freq_cap=1)
     combined = ExactMultiSet()
     for id_batch in generator:
         combined.add_ids(id_batch)
     histogram = LosslessEstimator()([combined])
     self.assertEqual(histogram, [100])
 def test_homogeneous_pmf_multiset_generator_single_set(self):
     """The generator yields one set whose histogram is [2]."""
     def histogram_of(ids):
         # Fresh multiset and estimator for every generated set.
         multi_set = ExactMultiSet()
         multi_set.add_ids(ids)
         return LosslessEstimator()([multi_set])

     generator = HomogeneousPmfMultiSetGenerator(
         100, [2], [[1]], np.random.RandomState(1))
     histograms = [histogram_of(ids) for ids in generator]
     self.assertLen(histograms, 1)
     self.assertEqual(histograms[0], [2])
 def test_publisher_constant_frequency_set_generator(self):
     """Three sets are generated, with histograms [1]*3, [2]*3 and [3]*3."""
     def histogram_of(ids):
         # Fresh multiset and estimator for every generated set.
         multi_set = ExactMultiSet()
         multi_set.add_ids(ids)
         return LosslessEstimator()([multi_set])

     generator = PublisherConstantFrequencySetGenerator(
         100, [1, 2, 3], 3, np.random.RandomState(1))
     histograms = [histogram_of(ids) for ids in generator]
     self.assertLen(histograms, 3)
     self.assertEqual(histograms[0], [1, 1, 1])
     self.assertEqual(histograms[1], [2, 2, 2])
     self.assertEqual(histograms[2], [3, 3, 3])
 def test_less_one_estimator_multiple_for_exact_multi_set(self):
     """LessOneEstimator over sets {1, 2} and {1, 3, 4} returns [3, 0]."""
     first = ExactMultiSet()
     first.add_ids([1, 2])
     second = ExactMultiSet()
     second.add_ids([1, 3, 4])
     estimator = LessOneEstimator()
     estimate = estimator([first, second])
     self.assertEqual(estimate, [3, 0])
 def test_independent_set_estimator_two_sketches_single_frequency(self):
   """Two sketches holding range(50) each are estimated as [75, 25]."""
   sketches = []
   for _ in range(2):
     sketch = ExactMultiSet()
     sketch.add_ids(range(50))
     sketches.append(sketch)
   estimator = IndependentSetEstimator(LosslessEstimator(), 100)
   self.assertEqual(estimator(sketches), [75, 25])
 def test_independent_set_estimator_two_sketches_multiple_frequencies(self):
   """Two sketches with repeated ids are estimated as [65, 34, 9, 2]."""
   first = ExactMultiSet()
   first_ids = list(range(50)) + list(range(20))
   first.add_ids(first_ids)
   second = ExactMultiSet()
   second_ids = list(range(30)) + list(range(10))
   second.add_ids(second_ids)
   estimator = IndependentSetEstimator(LosslessEstimator(), 100)
   self.assertEqual(estimator([first, second]), [65, 34, 9, 2])
 def test_multi_frequency_sketch_for_exact_multi_set(self):
     """Per-id frequencies and the lossless histogram match the added ids."""
     multi_set = ExactMultiSet()
     multi_set.add_ids([1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 1, 2, 3, 1, 2, 1, 1])
     # (id, expected frequency) pairs, checked in ascending id order.
     expected_frequencies = ((1, 6), (2, 4), (3, 3), (4, 2), (5, 1), (6, 1))
     for element, count in expected_frequencies:
         self.assertEqual(multi_set.frequency(element), count)
     histogram = LosslessEstimator()([multi_set])
     self.assertEqual(histogram, [6, 4, 3, 2, 1, 1])
 def setUp(self):
     """Create two stratified sketches, an estimator and the expected merge.

     Sets self.this_sketch, self.that_sketch, self.estimator and
     self.merge_expected for use by the test methods.
     """
     super(PairwiseEstimatorTest, self).setUp()
     max_freq = 3

     def build_sketch(multi_set):
         # Same seed for both sketches; the factory is requested per sketch.
         return stratified_sketch.StratifiedSketch.init_from_exact_multi_set(
             max_freq,
             multi_set,
             cardinality_sketch_factory=ExactMultiSet.get_sketch_factory(),
             random_seed=1)

     this_multi_set = generate_multi_set([(1, 2), (2, 3), (3, 1), (10, 1)])
     that_multi_set = generate_multi_set([(1, 1), (3, 1), (4, 5), (5, 1)])
     self.this_sketch = build_sketch(this_multi_set)
     self.that_sketch = build_sketch(that_multi_set)
     self.estimator = stratified_sketch.PairwiseEstimator(
         sketch_operator=stratified_sketch.ExactSetOperator,
         cardinality_estimator=LosslessEstimator())
     # Stratum key -> {id: 1} for the ids expected in that stratum.
     self.merge_expected = {
         ONE_PLUS: dict.fromkeys([1, 2, 3, 4, 5, 10], 1),
         1: dict.fromkeys([5, 10], 1),
         2: {3: 1},
         '3+': dict.fromkeys([1, 2, 4], 1),
     }
    def test_sketch_building_from_exact_multi_set(self):
        """A sketch built from an exact multiset has the expected strata."""
        max_freq = 3
        input_set = ExactMultiSet()
        for vid in [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]:
            input_set.add(vid)

        # Stratum key -> {id: 1} for the ids expected in that stratum.
        expected = {
            '1+': dict.fromkeys([1, 2, 3, 4], 1),
            1: {1: 1},
            2: {2: 1},
            '3+': dict.fromkeys([3, 4], 1),
        }

        stratified = stratified_sketch.StratifiedSketch.init_from_exact_multi_set(
            max_freq,
            input_set,
            cardinality_sketch_factory=ExactMultiSet.get_sketch_factory(),
            union=stratified_sketch.ExactSetOperator.union,
            random_seed=1)

        self.assertLen(stratified.sketches.keys(), len(expected))

        for stratum, stratum_sketch in stratified.sketches.items():
            self.assertEqual(stratum_sketch.ids(), expected[stratum])
 def generate_sketches_from_sets(self,
                                 multi_sets,
                                 max_freq,
                                 epsilon=0,
                                 epsilon_split=0):
     """Build one StratifiedSketch per multiset, in input order.

     Args:
       multi_sets: iterable of multisets, each passed to
         StratifiedSketch.init_from_exact_multi_set.
       max_freq: forwarded as the sketch's maximum frequency.
       epsilon: forwarded unchanged to the sketch constructor.
       epsilon_split: forwarded unchanged to the sketch constructor.

     Returns:
       A list with one sketch per element of multi_sets.
     """
     build = stratified_sketch.StratifiedSketch.init_from_exact_multi_set
     return [
         build(max_freq,
               one_set,
               union=stratified_sketch.ExactSetOperator.union,
               epsilon=epsilon,
               epsilon_split=epsilon_split,
               noiser_class=PlusNoiser,
               cardinality_sketch_factory=ExactMultiSet.get_sketch_factory(),
               random_seed=1)
         for one_set in multi_sets
     ]
    def test_noise_without_oneplus_budget(self):
        """Noised strata hold the expected ids, then can be destroyed.

        The sketch uses epsilon=0.8, epsilon_split=0 and PlusNoiser.
        """
        stratified = stratified_sketch.StratifiedSketch(
            max_freq=3,
            epsilon=0.8,
            epsilon_split=0,
            noiser_class=PlusNoiser,
            union=stratified_sketch.ExactSetOperator.union,
            cardinality_sketch_factory=ExactMultiSet.get_sketch_factory(),
            random_seed=1)

        for vid in [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]:
            stratified.add(vid)

        stratified.create_sketches()

        # Stratum key -> {id: 1} for the ids expected after noising.
        expected = {
            '1+': dict.fromkeys([9.0, 10.0, 11.0, 12.0], 1),
            1: {9.0: 1},
            2: {10.0: 1},
            '3+': dict.fromkeys([11.0, 12.0], 1),
        }

        self.assertLen(stratified.sketches.keys(), len(expected))

        for stratum, stratum_sketch in stratified.sketches.items():
            self.assertEqual(stratum_sketch.ids(), expected[stratum])

        stratified._destroy_sketches()
        self.assertEqual(stratified.sketches, {})
    def test_sketch_create_and_destroy(self):
        """create_sketches builds the expected strata; destroy empties them."""
        max_freq = 3
        stratified = stratified_sketch.StratifiedSketch(
            max_freq=max_freq,
            union=stratified_sketch.ExactSetOperator.union,
            cardinality_sketch_factory=ExactMultiSet.get_sketch_factory(),
            random_seed=1)

        # Add id k exactly k times, for k = 0 .. max_freq + 1.
        for vid in range(max_freq + 2):
            for _ in range(vid):
                stratified.add(vid)

        stratified.create_sketches()

        # Stratum key -> {id: 1} for the ids expected in that stratum.
        expected = {
            '1+': dict.fromkeys([1, 2, 3, 4], 1),
            1: {1: 1},
            2: {2: 1},
            '3+': dict.fromkeys([3, 4], 1),
        }
        self.assertLen(stratified.sketches.keys(), len(expected))

        for stratum, stratum_sketch in stratified.sketches.items():
            self.assertEqual(stratum_sketch.ids(), expected[stratum])

        stratified._destroy_sketches()
        self.assertEqual(stratified.sketches, {})
    def test_sketch_building_from_set_generator(self):
        """A sketch built from a set generator has the expected strata.

        The fake generator yields two id lists; across both, id 1 appears
        six times, id 2 twice, id 3 three times and id 4 once.
        """
        max_freq = 3

        # Input fed to the sketch through the fake generator.
        generated_sets = [[1, 1, 1, 2, 2, 3], [1, 1, 1, 3, 3, 4]]
        set_gen = FakeSetGenerator(generated_sets)

        s = stratified_sketch.StratifiedSketch.init_from_set_generator(
            max_freq,
            set_generator=set_gen,
            union=stratified_sketch.ExactSetOperator.union,
            cardinality_sketch_factory=ExactMultiSet.get_sketch_factory(),
            random_seed=1)

        # Stratum key -> {id: 1} for the ids expected in that stratum.
        expected = {
            '1+': {
                4: 1,
                1: 1,
                2: 1,
                3: 1
            },
            1: {
                4: 1
            },
            2: {
                2: 1
            },
            '3+': {
                1: 1,
                3: 1
            }
        }
        self.assertLen(s.sketches.keys(), len(expected.keys()))

        for freq, sketch in s.sketches.items():
            self.assertEqual(sketch.ids(), expected[freq])
 def __call__(self, sketch):
     """Return a new ExactMultiSet with self.constant subtracted from each id.

     Every shifted id is added as many times as sketch.frequency reports
     for the original id; the input sketch is only read.
     """
     shifted = ExactMultiSet()
     for noised_id in sketch.ids():
         restored_id = noised_id - self.constant
         occurrences = sketch.frequency(noised_id)
         shifted.add_ids([restored_id] * occurrences)
     return shifted
    def setUp(self):
        """Set test parameters, random states and two name -> config maps.

        self.name_to_non_noised_estimator_config and
        self.name_to_noised_estimator_config map each SketchEstimatorConfig
        name to the config built here.
        """
        super(InteroperabilityTest, self).setUp()

        # Scalar parameters.
        self.number_of_trials = 2
        self.universe_size = 2000
        self.set_size_list = [5, 7, 9]
        self.large_set_size = 6
        self.small_set_size = 3
        self.sketch_size = 128
        self.number_of_sets = 3
        self.set_size = 50
        self.num_large_sets = 1
        self.num_small_sets = 3
        self.order = set_generator.ORDER_RANDOM
        self.user_activity_association = (
            set_generator.USER_ACTIVITY_ASSOCIATION_INDEPENDENT)
        self.shared_prop = 0.2
        self.num_bloom_filter_hashes = 2
        self.exponential_bloom_filter_decay_rate = 10
        self.geometic_bloom_filter_probability = 0.08
        self.noiser_epsilon = np.log(3)
        self.noiser_flip_probability = 0.25

        # Independently seeded random states.
        self.set_random_state = np.random.RandomState(42)
        self.sketch_random_state = np.random.RandomState(137)
        self.noise_random_state = np.random.RandomState(3)

        def blip_noiser():
            # A new noiser per config; all share self.noise_random_state.
            return BlipNoiser(self.noiser_epsilon, self.noise_random_state)

        def surreal_denoiser():
            return SurrealDenoiser(epsilon=math.log(3))

        # ---- Configs without a sketch noiser ----
        plain_cascading_legions = SketchEstimatorConfig(
            name='cascading_legions',
            sketch_factory=CascadingLegions.get_sketch_factory(
                self.sketch_size, self.sketch_size),
            estimator=Estimator())

        plain_bloom_filter = SketchEstimatorConfig(
            name='bloom_filter-union_estimator',
            sketch_factory=BloomFilter.get_sketch_factory(
                self.sketch_size, self.num_bloom_filter_hashes),
            estimator=UnionEstimator())

        plain_geometric = SketchEstimatorConfig(
            name='geo_bloom_filter-first_moment_geo',
            sketch_factory=GeometricBloomFilter.get_sketch_factory(
                self.sketch_size, self.geometic_bloom_filter_probability),
            estimator=FirstMomentEstimator(method='geo'))

        plain_logarithmic = SketchEstimatorConfig(
            name='log_bloom_filter-first_moment_log',
            sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
                self.sketch_size),
            estimator=FirstMomentEstimator(method='log'))

        plain_exponential = SketchEstimatorConfig(
            name='exp_bloom_filter-first_moment_exp',
            sketch_factory=ExponentialBloomFilter.get_sketch_factory(
                self.sketch_size, self.exponential_bloom_filter_decay_rate),
            estimator=FirstMomentEstimator(method='exp'))

        plain_voc = SketchEstimatorConfig(
            name='vector_of_counts-sequential',
            sketch_factory=VectorOfCounts.get_sketch_factory(self.sketch_size),
            estimator=SequentialEstimator())

        plain_exact = SketchEstimatorConfig(
            name='exact_set-lossless',
            sketch_factory=ExactMultiSet.get_sketch_factory(),
            estimator=LosslessEstimator())

        plain_hll = SketchEstimatorConfig(
            name='hyper_log_log',
            sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(
                self.sketch_size),
            estimator=HllCardinality())

        # No sketch noiser here, but the estimator itself is given a
        # GeometricEstimateNoiser.
        plain_expadbf_global_dp = SketchEstimatorConfig(
            name='estimator_config_expadbf_first_moment_global_d',
            sketch_factory=ExponentialBloomFilter.get_sketch_factory(
                length=10**5, decay_rate=10),
            estimator=FirstMomentEstimator(
                method=FirstMomentEstimator.METHOD_EXP,
                noiser=GeometricEstimateNoiser(epsilon=math.log(3))))

        non_noised_by_name = {}
        for plain_config in (
                plain_exact,
                plain_cascading_legions,
                plain_bloom_filter,
                plain_logarithmic,
                plain_exponential,
                plain_geometric,
                plain_voc,
                plain_hll,
                plain_expadbf_global_dp,
        ):
            non_noised_by_name[plain_config.name] = plain_config
        self.name_to_non_noised_estimator_config = non_noised_by_name

        # ---- Configs with a sketch noiser ----
        noised_cascading_legions = SketchEstimatorConfig(
            name='cascading_legions',
            sketch_factory=CascadingLegions.get_sketch_factory(
                self.sketch_size, self.sketch_size),
            estimator=Estimator(),
            sketch_noiser=Noiser(self.noiser_flip_probability))

        noised_bloom_filter = SketchEstimatorConfig(
            name='bloom_filter-union_estimator',
            sketch_factory=BloomFilter.get_sketch_factory(
                self.sketch_size, self.num_bloom_filter_hashes),
            estimator=UnionEstimator(),
            sketch_noiser=blip_noiser())

        noised_geometric = SketchEstimatorConfig(
            name='geo_bloom_filter-first_moment_geo',
            sketch_factory=GeometricBloomFilter.get_sketch_factory(
                self.sketch_size, self.geometic_bloom_filter_probability),
            estimator=FirstMomentEstimator(
                method='geo', denoiser=surreal_denoiser()),
            sketch_noiser=blip_noiser())

        noised_logarithmic = SketchEstimatorConfig(
            name='log_bloom_filter-first_moment_log',
            sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
                self.sketch_size),
            estimator=FirstMomentEstimator(
                method='log', denoiser=surreal_denoiser()),
            sketch_noiser=blip_noiser())

        noised_exponential = SketchEstimatorConfig(
            name='exp_bloom_filter-first_moment_exp',
            sketch_factory=ExponentialBloomFilter.get_sketch_factory(
                self.sketch_size, self.exponential_bloom_filter_decay_rate),
            estimator=FirstMomentEstimator(
                method='exp', denoiser=surreal_denoiser()),
            sketch_noiser=blip_noiser())

        noised_voc = SketchEstimatorConfig(
            name='vector_of_counts-sequential',
            sketch_factory=VectorOfCounts.get_sketch_factory(self.sketch_size),
            estimator=SequentialEstimator(),
            sketch_noiser=LaplaceNoiser())

        noised_exact = SketchEstimatorConfig(
            name='exact_set-lossless',
            sketch_factory=ExactMultiSet.get_sketch_factory(),
            estimator=LosslessEstimator(),
            sketch_noiser=AddRandomElementsNoiser(1, self.noise_random_state))

        noised_by_name = {}
        for noised_config in (
                noised_exact,
                noised_cascading_legions,
                noised_bloom_filter,
                noised_logarithmic,
                noised_exponential,
                noised_geometric,
                noised_voc,
        ):
            noised_by_name[noised_config.name] = noised_config
        self.name_to_noised_estimator_config = noised_by_name
 def test_less_one_estimator_for_empty_set(self):
     """LessOneEstimator raises ValueError for a single empty multiset."""
     empty_set = ExactMultiSet()
     estimator = LessOneEstimator()
     with self.assertRaises(ValueError):
         estimator([empty_set])
 def test_less_one_estimator_no_freq1or2(self):
     """Three ids, each added three times, are estimated as [2, 2, 2]."""
     multi_set = ExactMultiSet()
     multi_set.add_ids([1, 1, 1, 2, 2, 2, 3, 3, 3])
     estimator = LessOneEstimator()
     estimate = estimator([multi_set])
     self.assertEqual(estimate, [2, 2, 2])
 def test_lossless_estimator_for_empty_set(self):
     """LosslessEstimator returns [0] for a single empty multiset."""
     empty_set = ExactMultiSet()
     estimator = LosslessEstimator()
     estimate = estimator([empty_set])
     self.assertEqual(estimate, [0])
 def test_lossless_estimator_for_exact_multi_set(self):
     """Ids added 3, 2 and 1 times give the histogram [3, 2, 1]."""
     multi_set = ExactMultiSet()
     multi_set.add_ids([1, 2, 3, 1, 2, 1])
     estimator = LosslessEstimator()
     estimate = estimator([multi_set])
     self.assertEqual(estimate, [3, 2, 1])
 def test_sketch_for_exact_multi_set(self):
     """An ExactMultiSet supports len() and membership for the added ids."""
     id_set = ExactMultiSet()
     id_set.add_ids([1, 2])
     self.assertLen(id_set, 2, "ID set has wrong length.")
     self.assertIn(2, id_set, "ID set is missing an ID.")
     self.assertNotIn(3, id_set, "ID set contains an unexpected ID.")
 def test_independent_set_estimator_universe_size_exceeded(self):
   """Estimating an 11-id sketch must raise AssertionError.

   The estimator is constructed with 10 as its second argument
   (presumably the universe size, going by the test name).
   """
   sketch = ExactMultiSet()
   sketch.add_ids(range(11))
   estimator = IndependentSetEstimator(LosslessEstimator(), 10)
   with self.assertRaises(AssertionError):
       # Only the raised exception matters; the return value is unused.
       estimator([sketch])
 def test_independent_set_estimator_single_sketch(self):
   """A single sketch with repeated ids is estimated as [5, 2, 1]."""
   only_sketch = ExactMultiSet()
   only_sketch.add_ids([1, 2, 2, 3, 3, 3, 4, 5])
   estimator = IndependentSetEstimator(LosslessEstimator(), 100)
   self.assertEqual(estimator([only_sketch]), [5, 2, 1])
def generate_multi_set(tuple_list):
    """Build an ExactMultiSet from (id, count) entries.

    Args:
      tuple_list: iterable of indexable entries; entry[0] is the id and
        entry[1] is how many times that id is added.

    Returns:
      A new ExactMultiSet containing every id repeated count times.
    """
    multi_set = ExactMultiSet()
    # Named `entry` so the builtin `tuple` is not shadowed.
    for entry in tuple_list:
        multi_set.add_ids([entry[0]] * entry[1])
    return multi_set