def test_merge_sketches(self):
        """Check that merging the fixture sketches gives the expected ids per stratum."""
        input_sketches = self.generate_sketches_from_sets(
            self.init_set_list, self.max_freq)
        sequential_estimator = stratified_sketch.SequentialEstimator(
            sketch_operator=stratified_sketch.ExactSetOperator,
            cardinality_estimator=LosslessEstimator())
        merged = sequential_estimator.merge(input_sketches)

        # Expected ids() value of every stratum of the merged sketch.
        expected_ids_by_stratum = {
            ONE_PLUS: dict.fromkeys((1, 2, 3, 4, 5), 1),
            1: {},
            2: dict.fromkeys((3, 5), 1),
            '3+': dict.fromkeys((1, 2, 4), 1),
        }

        self.assertLen(merged.sketches, len(expected_ids_by_stratum))
        for stratum, expected_ids in expected_ids_by_stratum.items():
            self.assertEqual(merged.sketches[stratum].ids(), expected_ids)
    def test_simulator_run_all_and_aggregate_write_file(self):
        """Check that the frames written to the file handles match the returned ones."""
        config = SketchEstimatorConfig(
            name='exact_set-lossless',
            sketch_factory=ExactMultiSet,
            estimator=LosslessEstimator())
        generator_factory = (set_generator.IndependentSetGenerator.
                             get_generator_factory_with_num_and_size(
                                 universe_size=1, num_sets=1, set_size=1))

        raw_buffer = io.StringIO()
        agg_buffer = io.StringIO()
        run_simulation = simulator.Simulator(
            num_runs=5,
            set_generator_factory=generator_factory,
            sketch_estimator_config=config,
            file_handle_raw=raw_buffer,
            file_handle_agg=agg_buffer)
        raw_frame, agg_frame = run_simulation()

        # Rewind each buffer before parsing what the simulator wrote into it,
        # then compare the parsed frame with the one returned by the call.
        raw_buffer.seek(0)
        pd.testing.assert_frame_equal(raw_frame, pd.read_csv(raw_buffer))

        agg_buffer.seek(0)
        pd.testing.assert_frame_equal(
            agg_frame, pd.read_csv(agg_buffer, header=[0, 1], index_col=0))
    def test_multiple_frequencies(self):
        """Check the simulator output frame when max_frequency is 3."""
        config = SketchEstimatorConfig(
            name='exact-set-multiple-frequencies',
            sketch_factory=ExactMultiSet,
            estimator=LosslessEstimator(),
            max_frequency=3)
        generator_factory = FakeSetGenerator.get_generator_factory(
            [[1, 1, 1, 2, 2, 3], [1, 1, 1, 3, 3, 4]])
        run_simulation = simulator.Simulator(
            num_runs=1,
            set_generator_factory=generator_factory,
            sketch_estimator_config=config)
        actual_df, _ = run_simulation()

        # Column names are a basename followed by the frequency level.
        levels = ('1', '2', '3')
        expected_columns = (
            ['num_sets'] +
            [simulator.ESTIMATED_CARDINALITY_BASENAME + lv for lv in levels] +
            [simulator.TRUE_CARDINALITY_BASENAME + lv for lv in levels] +
            [simulator.SHUFFLE_DISTANCE, 'run_index'] +
            [simulator.RELATIVE_ERROR_BASENAME + lv for lv in levels])
        expected_rows = [
            [1, 3, 2, 1, 3, 2, 1, 0., 0, 0., 0., 0.],
            [2, 4, 3, 2, 4, 3, 2, 0., 0, 0., 0., 0.],
        ]

        pd.testing.assert_frame_equal(
            actual_df, pd.DataFrame(expected_rows, columns=expected_columns))
    def test_end_to_end_noise_without_oneplus_budget(self):
        """Check the pairwise estimate of two noised sketches with epsilon_split=0."""
        max_freq = 3

        def build_noised_sketch(multi_set):
            # Both inputs use identical settings; only the multi set differs.
            return stratified_sketch.StratifiedSketch.init_from_exact_multi_set(
                max_freq,
                multi_set,
                epsilon=0.8,
                epsilon_split=0,
                noiser_class=PlusNoiser,
                union=stratified_sketch.ExactSetOperator.union,
                cardinality_sketch_factory=ExactMultiSet.get_sketch_factory(),
                random_seed=1)

        first_multi_set = generate_multi_set([(1, 2), (2, 3), (3, 1), (10, 1)])
        second_multi_set = generate_multi_set([(1, 1), (3, 1), (4, 5), (5, 1)])
        first_sketch = build_noised_sketch(first_multi_set)
        second_sketch = build_noised_sketch(second_multi_set)

        pairwise_estimator = stratified_sketch.PairwiseEstimator(
            denoiser_class=MinusDenoiser,
            sketch_operator=stratified_sketch.ExactSetOperator,
            cardinality_estimator=LosslessEstimator())

        self.assertEqual(pairwise_estimator(first_sketch, second_sketch),
                         [6, 4, 3])
 def test_independent_set_estimator_two_sketches_single_frequency(self):
   """Check the estimate for two sketches that each hold ids 0..49 once."""
   sketches = []
   for _ in range(2):
     sketch = ExactMultiSet()
     sketch.add_ids(range(50))
     sketches.append(sketch)
   independent_estimator = IndependentSetEstimator(LosslessEstimator(), 100)
   self.assertEqual(independent_estimator(sketches), [75, 25])
 def test_independent_set_estimator_two_sketches_multiple_frequencies(self):
   """Check the estimate for two sketches in which some ids are added twice."""
   sketches = []
   # Each sketch receives range(first) followed by range(second).
   for first, second in ((50, 20), (30, 10)):
     sketch = ExactMultiSet()
     sketch.add_ids(list(range(first)) + list(range(second)))
     sketches.append(sketch)
   independent_estimator = IndependentSetEstimator(LosslessEstimator(), 100)
   self.assertEqual(independent_estimator(sketches), [65, 34, 9, 2])
# Example #7
0
    def run_one(self):
        """Run one iteration.

        Builds one sketch per generated set, optionally noises the sketches,
        and then, for every prefix of the sketch list, compares the estimated
        histogram with the exact histogram of the running union.

        Returns:
          A pd.DataFrame with one row per prefix and 2f+2 columns, where f is
          the configured maximum frequency.  The columns are NUM_SETS,
          ESTIMATED_CARDINALITY_BASENAME + i and TRUE_CARDINALITY_BASENAME + i
          for i = 1, ..., f, and SHUFFLE_DISTANCE.
        """
        config = self.sketch_estimator_config
        # Named so that it does not shadow the set_generator module.
        generated_sets = self.set_generator_factory(self.set_random_state)
        # One seed is drawn per run and shared by every sketch of the run.
        sketch_random_seed = self.sketch_random_state.randint(2**32 - 1)

        # Build the sketches and keep track of actual ids for
        # later comparison.
        sketches = []
        actual_ids = []
        for campaign_ids in generated_sets:
            actual_ids.append(campaign_ids)
            sketch = config.sketch_factory(sketch_random_seed)
            sketch.add_ids(campaign_ids)
            sketches.append(sketch)

        # Optionally noise the sketches.  The attribute may be absent or
        # falsy, and both cases mean "no noising".
        sketch_noiser = getattr(config, 'sketch_noiser', None)
        if sketch_noiser:
            sketches = [sketch_noiser(s) for s in sketches]

        # Estimate cardinality for 1, 2, ..., n pubs.
        estimator = config.estimator
        estimate_noiser = getattr(config, 'estimate_noiser', None)
        max_freq = config.max_frequency
        # A set that keeps the running union.
        true_union = ExactMultiSet()
        metrics = []
        for num_sets, ids in enumerate(actual_ids, start=1):
            estimated_cardinality = self._extend_histogram(
                estimator(sketches[:num_sets]), max_freq)
            if estimate_noiser:
                estimated_cardinality = [
                    estimate_noiser(e) for e in estimated_cardinality
                ]
            true_union.add_ids(ids)
            true_cardinality = self._extend_histogram(
                LosslessEstimator()([true_union]), max_freq)
            shuffle_distance = self._shuffle_distance(estimated_cardinality,
                                                      true_cardinality)
            metrics.append([num_sets] + estimated_cardinality +
                           true_cardinality + [shuffle_distance])

        frequencies = range(1, max_freq + 1)
        df_columns = (
            [NUM_SETS] +
            [ESTIMATED_CARDINALITY_BASENAME + str(f) for f in frequencies] +
            [TRUE_CARDINALITY_BASENAME + str(f) for f in frequencies] +
            [SHUFFLE_DISTANCE])
        return pd.DataFrame(metrics, columns=df_columns)
 def test_heterogeneous_multi_set_generator_test_impression_count(self):
     """Check the histogram of the ids produced by the heterogeneous generator."""
     generator = HeterogeneousMultiSetGenerator(1000, [10], [(1, 1)],
                                                np.random.RandomState(1))
     multi_set = ExactMultiSet()
     for generated_ids in generator:
         multi_set.add_ids(generated_ids)
     histogram = LosslessEstimator()([multi_set])
     self.assertEqual(histogram[0], 10)
     # More than one histogram entry is expected for this configuration.
     self.assertGreater(len(histogram), 1)
 def test_heterogeneous_multi_set_generator_with_frequency_cap(self):
     """Check that freq_cap=1 gives a single-entry histogram."""
     generator = HeterogeneousMultiSetGenerator(
         1000, [100], [(1, 1)], np.random.RandomState(1), freq_cap=1)
     multi_set = ExactMultiSet()
     for generated_ids in generator:
         multi_set.add_ids(generated_ids)
     histogram = LosslessEstimator()([multi_set])
     self.assertEqual(histogram, [100])
 def test_homogeneous_pmf_multiset_generator_single_set(self):
     """Check the single histogram produced for a one-set PMF generator."""

     def histogram_of(ids):
         # Fresh multi set and estimator for every generated set.
         multi_set = ExactMultiSet()
         multi_set.add_ids(ids)
         return LosslessEstimator()([multi_set])

     generator = HomogeneousPmfMultiSetGenerator(100, [2], [[1]],
                                                 np.random.RandomState(1))
     histograms = [histogram_of(ids) for ids in generator]
     self.assertLen(histograms, 1)
     self.assertEqual(histograms[0], [2])
 def test_multi_frequency_sketch_for_exact_multi_set(self):
     """Check per-id frequencies and the lossless histogram of a multi set."""
     multi_set = ExactMultiSet()
     multi_set.add_ids([1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 1, 2, 3, 1, 2, 1, 1])

     # (id, expected frequency) pairs, checked in id order.
     expected_frequencies = [(1, 6), (2, 4), (3, 3), (4, 2), (5, 1), (6, 1)]
     for element, expected_count in expected_frequencies:
         self.assertEqual(multi_set.frequency(element), expected_count)

     self.assertEqual(LosslessEstimator()([multi_set]), [6, 4, 3, 2, 1, 1])
    def test_end_to_end(self):
        """Check the sequential estimate computed from the fixture sketches."""
        input_sketches = self.generate_sketches_from_sets(
            self.init_set_list, self.max_freq)
        sequential_estimator = stratified_sketch.SequentialEstimator(
            sketch_operator=stratified_sketch.ExactSetOperator,
            cardinality_estimator=LosslessEstimator())

        self.assertEqual(sequential_estimator(input_sketches), [5, 5, 3])
 def test_simulator_run_one_with_estimate_noiser(self):
     """Check that run_one applies the configured estimate noiser once."""
     noiser = FakeEstimateNoiser()
     config = SketchEstimatorConfig(
         name='exact_set-lossless',
         sketch_factory=ExactSet,
         estimator=LosslessEstimator(),
         estimate_noiser=noiser)
     result_frame = get_simple_simulator(config).run_one()

     self.assertLen(result_frame, 1)
     first_estimate = result_frame['estimated_cardinality'].iloc[0]
     self.assertEqual(first_estimate, 10)
     self.assertEqual(noiser._calls, 1)
 def test_publisher_constant_frequency_set_generator(self):
     """Check the histogram of each set from the constant-frequency generator."""

     def histogram_of(ids):
         # Fresh multi set and estimator for every generated set.
         multi_set = ExactMultiSet()
         multi_set.add_ids(ids)
         return LosslessEstimator()([multi_set])

     generator = PublisherConstantFrequencySetGenerator(
         100, [1, 2, 3], 3, np.random.RandomState(1))
     histograms = [histogram_of(ids) for ids in generator]

     self.assertLen(histograms, 3)
     expected_histograms = ([1, 1, 1], [2, 2, 2], [3, 3, 3])
     for index, expected in enumerate(expected_histograms):
         self.assertEqual(histograms[index], expected)
    def test_end_to_end_noise_without_oneplus_budget(self):
        """Check the sequential estimate of noised sketches with epsilon_split=0."""
        noised_sketches = self.generate_sketches_from_sets(
            self.init_set_list, self.max_freq, epsilon=0.8, epsilon_split=0)
        sequential_estimator = stratified_sketch.SequentialEstimator(
            denoiser_class=MinusDenoiser,
            sketch_operator=stratified_sketch.ExactSetOperator,
            cardinality_estimator=LosslessEstimator())

        self.assertEqual(sequential_estimator(noised_sketches), [5, 5, 3])
 def test_simulator_run_one_with_estimate_noiser(self):
     """Check that run_one applies the configured estimate noiser once."""
     noiser = FakeEstimateNoiser()
     config = SketchEstimatorConfig(
         name='exact_set-lossless',
         sketch_factory=ExactMultiSet,
         estimator=LosslessEstimator(),
         estimate_noiser=noiser)
     result_frame = get_simple_simulator(config).run_one()

     self.assertLen(result_frame, 1)
     # Column for frequency level 1 of the estimated cardinality.
     estimate_column = simulator.ESTIMATED_CARDINALITY_BASENAME + '1'
     self.assertEqual(result_frame[estimate_column].iloc[0], 10)
     self.assertEqual(noiser._calls, 1)
def get_simple_simulator(sketch_estimator_config=None):
    """Return a one-run Simulator over a single one-element set.

    Args:
      sketch_estimator_config: Configuration handed to the simulator.  When
        it is falsy, an 'exact_set-lossless' configuration built from
        ExactMultiSet and LosslessEstimator is used instead.

    Returns:
      A simulator.Simulator with num_runs=1 and fixed random states.
    """
    config = sketch_estimator_config or SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactMultiSet,
        estimator=LosslessEstimator())
    generator_factory = (
        set_generator.IndependentSetGenerator.
        get_generator_factory_with_num_and_size(
            universe_size=1, num_sets=1, set_size=1))

    return simulator.Simulator(
        num_runs=1,
        set_generator_factory=generator_factory,
        sketch_estimator_config=config,
        sketch_random_state=np.random.RandomState(1),
        set_random_state=np.random.RandomState(2))
    def test_simulator_run_all_and_aggregate_with_noise(self):
        """Check the raw frame when three random elements are added as noise."""
        noise_random_state = np.random.RandomState(3)
        noiser = AddRandomElementsNoiser(num_random_elements=3,
                                         random_state=noise_random_state)
        config = SketchEstimatorConfig(
            name='exact_set-lossless',
            sketch_factory=ExactSet,
            estimator=LosslessEstimator(),
            sketch_noiser=noiser)
        sim = get_simple_simulator(config)

        frames = sim.run_all_and_aggregate()
        self.assertLen(frames, 2)
        raw_frame = frames[0]
        for num_sets in raw_frame['num_sets']:
            self.assertEqual(num_sets, 1)

        # (column, expected value in the first row) pairs.
        expected_first_row = (
            ('estimated_cardinality', 4),
            ('true_cardinality', 1),
            ('relative_error', 3),
        )
        for column, expected_value in expected_first_row:
            self.assertEqual(raw_frame[column][0], expected_value)
    def test_simulator_run_all_and_aggregate_multiple_runs(self):
        """Check that five runs give five raw rows, each with num_sets == 1."""
        config = SketchEstimatorConfig(
            name='exact_set-lossless',
            sketch_factory=ExactSet,
            estimator=LosslessEstimator())
        generator_factory = (
            set_generator.IndependentSetGenerator.
            get_generator_factory_with_num_and_size(
                universe_size=1, num_sets=1, set_size=1))
        sim = Simulator(
            num_runs=5,
            set_generator_factory=generator_factory,
            sketch_estimator_config=config)

        frames = sim.run_all_and_aggregate()
        self.assertLen(frames, 2)
        raw_frame = frames[0]
        self.assertLen(raw_frame, 5)
        for num_sets in raw_frame['num_sets']:
            self.assertEqual(num_sets, 1)
    def test_simulator_run_all_and_aggregate_with_noise(self):
        """Check the raw frame when three random elements are added as noise."""
        noise_random_state = np.random.RandomState(3)
        noiser = AddRandomElementsNoiser(num_random_elements=3,
                                         random_state=noise_random_state)
        config = SketchEstimatorConfig(
            name='exact_set-lossless',
            sketch_factory=ExactMultiSet,
            estimator=LosslessEstimator(),
            sketch_noiser=noiser)
        sim = get_simple_simulator(config)

        frames = sim.run_all_and_aggregate()
        self.assertLen(frames, 2)
        raw_frame = frames[0]
        for num_sets in raw_frame['num_sets']:
            self.assertEqual(num_sets, 1)

        # (column basename, expected first-row value) for frequency level 1.
        expected_first_row = (
            (simulator.ESTIMATED_CARDINALITY_BASENAME, 4),
            (simulator.TRUE_CARDINALITY_BASENAME, 1),
            (simulator.RELATIVE_ERROR_BASENAME, 3),
        )
        for basename, expected_value in expected_first_row:
            self.assertEqual(raw_frame[basename + '1'][0], expected_value)
 def setUp(self):
     """Create two stratified sketches, an estimator and the expected merge."""
     super(PairwiseEstimatorTest, self).setUp()
     max_freq = 3

     def build_sketch(multi_set):
         # Both sketches use the same settings; only the multi set differs.
         return stratified_sketch.StratifiedSketch.init_from_exact_multi_set(
             max_freq,
             multi_set,
             cardinality_sketch_factory=ExactMultiSet.get_sketch_factory(),
             random_seed=1)

     first_multi_set = generate_multi_set([(1, 2), (2, 3), (3, 1), (10, 1)])
     second_multi_set = generate_multi_set([(1, 1), (3, 1), (4, 5), (5, 1)])
     self.this_sketch = build_sketch(first_multi_set)
     self.that_sketch = build_sketch(second_multi_set)

     self.estimator = stratified_sketch.PairwiseEstimator(
         sketch_operator=stratified_sketch.ExactSetOperator,
         cardinality_estimator=LosslessEstimator())

     # Expected content per stratum key; every listed id maps to 1.
     self.merge_expected = {
         ONE_PLUS: dict.fromkeys((1, 2, 3, 4, 5, 10), 1),
         1: dict.fromkeys((5, 10), 1),
         2: dict.fromkeys((3,), 1),
         '3+': dict.fromkeys((1, 2, 4), 1),
     }
 def test_lossless_estimator_for_exact_multi_set(self):
     """Check the lossless histogram of a multi set with repeated ids."""
     multi_set = ExactMultiSet()
     multi_set.add_ids([1, 2, 3, 1, 2, 1])
     histogram = LosslessEstimator()([multi_set])
     self.assertEqual(histogram, [3, 2, 1])
    def setUp(self):
        """Prepare shared parameters, random states and estimator configs.

        Stores the scalar parameters as attributes and builds two dicts that
        map a sketch name to an EstimatorConfig:
        name_to_non_noised_estimator_config and
        name_to_noised_estimator_config.
        """
        super(InteroperabilityTest, self).setUp()
        # Scalar parameters stored on the test instance.
        self.number_of_trials = 2
        self.universe_size = 2000
        self.set_size = 5
        self.large_set_size = 6
        self.small_set_size = 3
        self.sketch_size = 64
        self.number_of_sets = 2
        self.num_large_sets = 1
        self.num_small_sets = 3
        self.order = set_generator.ORDER_RANDOM
        self.user_activity_association = (
            set_generator.USER_ACTIVITY_ASSOCIATION_INDEPENDENT)
        self.shared_prop = 0.2
        self.num_bloom_filter_hashes = 2
        self.exponential_bloom_filter_decay_rate = 10
        self.noiser_epsilon = np.log(3)
        self.noiser_flip_probability = .25

        # Separately seeded random states for sets, sketches and noise.
        self.set_random_state = np.random.RandomState(42)
        self.sketch_random_state = np.random.RandomState(137)
        self.noise_random_state = np.random.RandomState(3)

        # non-noised estimators
        estimator_config_cascading_legions = EstimatorConfig(
            sketch_factory=CascadingLegions.get_sketch_factory(
                self.sketch_size, self.sketch_size),
            estimator=Estimator(),
            sketch_noiser=None,
            estimate_noiser=None)

        estimator_config_bloom_filter = EstimatorConfig(
            sketch_factory=BloomFilter.get_sketch_factory(
                self.sketch_size, self.num_bloom_filter_hashes),
            estimator=UnionEstimator(),
            sketch_noiser=None,
            estimate_noiser=None)

        estimator_config_logarithmic_bloom_filter = EstimatorConfig(
            sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
                self.sketch_size),
            estimator=FirstMomentEstimator(method='log'),
            sketch_noiser=None,
            estimate_noiser=None)

        estimator_config_exponential_bloom_filter = EstimatorConfig(
            sketch_factory=ExponentialBloomFilter.get_sketch_factory(
                self.sketch_size, self.exponential_bloom_filter_decay_rate),
            estimator=FirstMomentEstimator(method='exp'),
            sketch_noiser=None,
            estimate_noiser=None)

        estimator_config_geometric_bloom_filter = EstimatorConfig(
            sketch_factory=GeometricBloomFilter.get_sketch_factory(
                self.sketch_size),
            estimator=GeometricUnionEstimator(),
            sketch_noiser=None,
            estimate_noiser=None)

        estimator_config_voc = EstimatorConfig(
            sketch_factory=VectorOfCounts.get_sketch_factory(self.sketch_size),
            estimator=SequentialEstimator(),
            sketch_noiser=None,
            estimate_noiser=None)

        estimator_config_exact = EstimatorConfig(
            sketch_factory=ExactSet.get_sketch_factory(),
            estimator=LosslessEstimator(),
            sketch_noiser=None,
            estimate_noiser=None)

        estimator_config_hll = EstimatorConfig(
            sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(
                self.sketch_size),
            estimator=HllCardinality(),
            sketch_noiser=None,
            estimate_noiser=None)

        # Lookup of the non-noised configs by sketch name.
        self.name_to_non_noised_estimator_config = {
            'exact_set': estimator_config_exact,
            'cascading_legions': estimator_config_cascading_legions,
            'bloom_filter': estimator_config_bloom_filter,
            'logarithmic_bloom_filter':
            estimator_config_logarithmic_bloom_filter,
            'exponential_bloom_filter':
            estimator_config_exponential_bloom_filter,
            'geometric_bloom_filter': estimator_config_geometric_bloom_filter,
            'vector_of_counts': estimator_config_voc,
            'hll': estimator_config_hll,
        }

        # noised estimators

        noised_estimator_config_cascading_legions = EstimatorConfig(
            sketch_factory=CascadingLegions.get_sketch_factory(
                self.sketch_size, self.sketch_size),
            estimator=Estimator(),
            sketch_noiser=Noiser(self.noiser_flip_probability),
            estimate_noiser=None)

        # The BlipNoiser instances below all receive the same
        # self.noise_random_state object.
        noised_estimator_config_bloom_filter = EstimatorConfig(
            sketch_factory=BloomFilter.get_sketch_factory(
                self.sketch_size, self.num_bloom_filter_hashes),
            estimator=UnionEstimator(),
            sketch_noiser=BlipNoiser(self.noiser_epsilon,
                                     self.noise_random_state),
            estimate_noiser=None)

        # NOTE(review): this config and the exponential one below pass a
        # SurrealDenoiser to the estimator but set sketch_noiser=None --
        # confirm whether a sketch noiser was intended here.
        noised_estimator_config_logarithmic_bloom_filter = EstimatorConfig(
            sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
                self.sketch_size),
            estimator=FirstMomentEstimator(
                method='log',
                denoiser=SurrealDenoiser(
                    probability=self.noiser_flip_probability)),
            sketch_noiser=None,
            estimate_noiser=None)

        noised_estimator_config_exponential_bloom_filter = EstimatorConfig(
            sketch_factory=ExponentialBloomFilter.get_sketch_factory(
                self.sketch_size, self.exponential_bloom_filter_decay_rate),
            estimator=FirstMomentEstimator(
                method='exp',
                denoiser=SurrealDenoiser(
                    probability=self.noiser_flip_probability)),
            sketch_noiser=None,
            estimate_noiser=None)

        noised_estimator_config_geometric_bloom_filter = EstimatorConfig(
            sketch_factory=GeometricBloomFilter.get_sketch_factory(
                self.sketch_size),
            estimator=GeometricUnionEstimator(),
            sketch_noiser=BlipNoiser(self.noiser_epsilon,
                                     self.noise_random_state),
            estimate_noiser=None)

        noised_estimator_config_voc = EstimatorConfig(
            sketch_factory=VectorOfCounts.get_sketch_factory(self.sketch_size),
            estimator=SequentialEstimator(),
            sketch_noiser=LaplaceNoiser(),
            estimate_noiser=None)

        noised_estimator_config_exact = EstimatorConfig(
            sketch_factory=ExactSet.get_sketch_factory(),
            estimator=LosslessEstimator(),
            sketch_noiser=AddRandomElementsNoiser(1, self.noise_random_state),
            estimate_noiser=None)

        # Lookup of the noised configs by sketch name.  Unlike the non-noised
        # dict, it has no 'hll' entry.
        self.name_to_noised_estimator_config = {
            'exact_set': noised_estimator_config_exact,
            'cascading_legions': noised_estimator_config_cascading_legions,
            'bloom_filter': noised_estimator_config_bloom_filter,
            'logarithmic_bloom_filter':
            noised_estimator_config_logarithmic_bloom_filter,
            'exponential_bloom_filter':
            noised_estimator_config_exponential_bloom_filter,
            'geometric_bloom_filter':
            noised_estimator_config_geometric_bloom_filter,
            'vector_of_counts': noised_estimator_config_voc,
        }
# Example #24
0
def main(argv):
  """Run the simulator once per estimator configuration and print the results.

  Every configuration is evaluated on sets produced by the same independent
  set generator factory, with freshly seeded random states per configuration.

  Args:
    argv: Command-line argument list; it must contain at most one element.

  Raises:
    app.UsageError: If argv contains more than one element.
  """
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  estimator_config_cascading_legions = SketchEstimatorConfig(
      name='cascading-legions',
      sketch_factory=CascadingLegions.get_sketch_factory(
          FLAGS.sketch_size, FLAGS.sketch_size),
      estimator=Estimator())

  estimator_config_bloom_filter = SketchEstimatorConfig(
      name='bloom_filter-union_estimator',
      sketch_factory=BloomFilter.get_sketch_factory(
          FLAGS.sketch_size, FLAGS.num_bloom_filter_hashes),
      estimator=UnionEstimator())

  estimator_config_logarithmic_bloom_filter = SketchEstimatorConfig(
      name='log_bloom_filter-first_moment_log',
      sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
          FLAGS.sketch_size),
      estimator=FirstMomentEstimator(method='log'))

  estimator_config_exponential_bloom_filter = SketchEstimatorConfig(
      name='exp_bloom_filter-first_moment_exp',
      sketch_factory=ExponentialBloomFilter.get_sketch_factory(
          FLAGS.sketch_size, FLAGS.exponential_bloom_filter_decay_rate),
      estimator=FirstMomentEstimator(method='exp'))

  estimator_config_voc = SketchEstimatorConfig(
      name='vector_of_counts-sequential',
      sketch_factory=VectorOfCounts.get_sketch_factory(FLAGS.sketch_size),
      estimator=SequentialEstimator())

  estimator_config_hll = SketchEstimatorConfig(
      name='hll++',
      sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(FLAGS.sketch_size),
      estimator=HllCardinality())

  estimator_config_exact = SketchEstimatorConfig(
      name='exact_set-lossless',
      sketch_factory=ExactSet.get_sketch_factory(),
      estimator=LosslessEstimator())

  # Configurations are evaluated in this order.
  estimator_config_list = [
      estimator_config_bloom_filter,
      estimator_config_logarithmic_bloom_filter,
      estimator_config_exponential_bloom_filter,
      estimator_config_cascading_legions,
      estimator_config_exact,
      estimator_config_hll,
      estimator_config_voc,
  ]

  set_generator_factory = (
      set_generator.IndependentSetGenerator.
      get_generator_factory_with_num_and_size(
          universe_size=FLAGS.universe_size,
          num_sets=FLAGS.number_of_sets,
          set_size=FLAGS.set_size))

  for estimator_method_config in estimator_config_list:
    print(f'Calculations for {estimator_method_config.name}')
    # Re-seed for every configuration so each one starts from the same
    # random states.
    set_rs = np.random.RandomState(1)
    sketch_rs = np.random.RandomState(1)
    simulator = Simulator(
        num_runs=FLAGS.number_of_trials,
        set_generator_factory=set_generator_factory,
        sketch_estimator_config=estimator_method_config,
        set_random_state=set_rs,
        sketch_random_state=sketch_rs)

    _, agg_data = simulator.run_all_and_aggregate()
    print(f'Aggregate Statistics for {estimator_method_config.name}')
    print(agg_data)
 def test_independent_set_estimator_single_sketch(self):
   """Check the estimate produced for a single multi set sketch."""
   only_sketch = ExactMultiSet()
   only_sketch.add_ids([1, 2, 2, 3, 3, 3, 4, 5])
   independent_estimator = IndependentSetEstimator(LosslessEstimator(), 100)
   self.assertEqual(independent_estimator([only_sketch]), [5, 2, 1])
 def test_independent_set_estimator_empty_list(self):
   """Check that an empty sketch list gives the estimate [0]."""
   independent_estimator = IndependentSetEstimator(LosslessEstimator(), 100)
   self.assertEqual(independent_estimator([]), [0])
# Example #27
0
  def setUp(self):
    """Prepare shared parameters, random states and estimator configs.

    Stores the scalar parameters as attributes and builds two dicts keyed by
    config name: name_to_non_noised_estimator_config and
    name_to_noised_estimator_config.
    """
    super(InteroperabilityTest, self).setUp()
    # Scalar parameters stored on the test instance.
    self.number_of_trials = 2
    self.universe_size = 2000
    self.set_size_list = [5, 7, 9]
    self.large_set_size = 6
    self.small_set_size = 3
    self.sketch_size = 128
    self.number_of_sets = 3
    self.set_size = 50
    self.num_large_sets = 1
    self.num_small_sets = 3
    self.order = set_generator.ORDER_RANDOM
    self.user_activity_association = (
        set_generator.USER_ACTIVITY_ASSOCIATION_INDEPENDENT)
    self.shared_prop = 0.2
    self.num_bloom_filter_hashes = 2
    self.exponential_bloom_filter_decay_rate = 10
    self.noiser_epsilon = np.log(3)
    self.noiser_flip_probability = .25

    # Separately seeded random states for sets, sketches and noise.
    self.set_random_state = np.random.RandomState(42)
    self.sketch_random_state = np.random.RandomState(137)
    self.noise_random_state = np.random.RandomState(3)

    # non-noised estimators
    estimator_config_cascading_legions = SketchEstimatorConfig(
        name='cascading_legions',
        sketch_factory=CascadingLegions.get_sketch_factory(
            self.sketch_size, self.sketch_size),
        estimator=Estimator())

    estimator_config_bloom_filter = SketchEstimatorConfig(
        name='bloom_filter-union_estimator',
        sketch_factory=BloomFilter.get_sketch_factory(
            self.sketch_size, self.num_bloom_filter_hashes),
        estimator=UnionEstimator())

    estimator_config_logarithmic_bloom_filter = SketchEstimatorConfig(
        name='log_bloom_filter-first_moment_log',
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
            self.sketch_size),
        estimator=FirstMomentEstimator(method='log'))

    estimator_config_exponential_bloom_filter = SketchEstimatorConfig(
        name='exp_bloom_filter-first_moment_exp',
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            self.sketch_size, self.exponential_bloom_filter_decay_rate),
        estimator=FirstMomentEstimator(method='exp'))

    estimator_config_voc = SketchEstimatorConfig(
        name='vector_of_counts-sequential',
        sketch_factory=VectorOfCounts.get_sketch_factory(self.sketch_size),
        estimator=SequentialEstimator())

    estimator_config_exact = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet.get_sketch_factory(),
        estimator=LosslessEstimator())

    estimator_config_hll = SketchEstimatorConfig(
        name='hyper_log_log',
        sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(self.sketch_size),
        estimator=HllCardinality())

    config_list = [
        estimator_config_exact,
        estimator_config_cascading_legions,
        estimator_config_bloom_filter,
        estimator_config_logarithmic_bloom_filter,
        estimator_config_exponential_bloom_filter,
        estimator_config_voc,
        estimator_config_hll,
    ]

    # Index the non-noised configs by their name attribute.
    self.name_to_non_noised_estimator_config = {
        config.name: config for config in config_list
    }

    # noised estimators
    # Each noised config reuses the name of its non-noised counterpart; the
    # two sets are kept in separate dicts.
    noised_estimator_config_cascading_legions = SketchEstimatorConfig(
        name='cascading_legions',
        sketch_factory=CascadingLegions.get_sketch_factory(
            self.sketch_size, self.sketch_size),
        estimator=Estimator(),
        sketch_noiser=Noiser(self.noiser_flip_probability))

    # The BlipNoiser instances below all receive the same
    # self.noise_random_state object.
    noised_estimator_config_bloom_filter = SketchEstimatorConfig(
        name='bloom_filter-union_estimator',
        sketch_factory=BloomFilter.get_sketch_factory(
            self.sketch_size, self.num_bloom_filter_hashes),
        estimator=UnionEstimator(),
        sketch_noiser=BlipNoiser(self.noiser_epsilon, self.noise_random_state))

    noised_estimator_config_logarithmic_bloom_filter = SketchEstimatorConfig(
        name='log_bloom_filter-first_moment_log',
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
            self.sketch_size),
        estimator=FirstMomentEstimator(
            method='log',
            denoiser=SurrealDenoiser(
                probability=self.noiser_flip_probability)),
        sketch_noiser=BlipNoiser(self.noiser_epsilon, self.noise_random_state))

    noised_estimator_config_exponential_bloom_filter = SketchEstimatorConfig(
        name='exp_bloom_filter-first_moment_exp',
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            self.sketch_size, self.exponential_bloom_filter_decay_rate),
        estimator=FirstMomentEstimator(
            method='exp',
            denoiser=SurrealDenoiser(
                probability=self.noiser_flip_probability)),
        sketch_noiser=BlipNoiser(self.noiser_epsilon, self.noise_random_state))

    noised_estimator_config_voc = SketchEstimatorConfig(
        name='vector_of_counts-sequential',
        sketch_factory=VectorOfCounts.get_sketch_factory(self.sketch_size),
        estimator=SequentialEstimator(),
        sketch_noiser=LaplaceNoiser())

    noised_estimator_config_exact = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet.get_sketch_factory(),
        estimator=LosslessEstimator(),
        sketch_noiser=AddRandomElementsNoiser(1, self.noise_random_state))

    # No noised HyperLogLog config is defined, so this list has one entry
    # fewer than config_list.
    noised_config_list = [
        noised_estimator_config_exact,
        noised_estimator_config_cascading_legions,
        noised_estimator_config_bloom_filter,
        noised_estimator_config_logarithmic_bloom_filter,
        noised_estimator_config_exponential_bloom_filter,
        noised_estimator_config_voc,
    ]

    # Index the noised configs by their name attribute.
    self.name_to_noised_estimator_config = {
        config.name: config for config in noised_config_list
    }
 def test_lossless_estimator_for_empty_set(self):
     """Check that an empty multi set gives the estimate [0]."""
     empty_multi_set = ExactMultiSet()
     histogram = LosslessEstimator()([empty_multi_set])
     self.assertEqual(histogram, [0])
# Example #29
0
 def test_lossless_estimator(self):
   """Check that a two-element exact set gives the estimate 2."""
   exact_set = ExactSet()
   exact_set.add_ids([1, 2])
   estimate = LosslessEstimator()([exact_set])
   self.assertEqual(estimate, 2)
 def test_independent_set_estimator_universe_size_exceeded(self):
   """Check that 11 ids with a universe size of 10 raise AssertionError."""
   oversized_sketch = ExactMultiSet()
   oversized_sketch.add_ids(range(11))
   independent_estimator = IndependentSetEstimator(LosslessEstimator(), 10)
   with self.assertRaises(AssertionError):
       # Only the raised exception matters; the return value is discarded.
       independent_estimator([oversized_sketch])