Exemplo n.º 1
0
 def test_clip_empty_vector_of_count(self, epsilon, clip_threshold,
                                     expected):
     """Check `clip_empty_vector_of_count` on a sketch with fixed stats.

     Args:
       epsilon: Forwarded to the estimator as `epsilon`.
       clip_threshold: Forwarded to the estimator as `clip_threshold`.
       expected: Stats expected on the sketch returned by the estimator.
     """
     voc = VectorOfCounts(num_buckets=2, random_seed=0)
     # Overwrite the stats directly so the input is fully controlled.
     voc.stats = np.array([2, 2])
     estimator = PairwiseEstimator(
         clip=True, epsilon=epsilon, clip_threshold=clip_threshold)
     clipped = estimator.clip_empty_vector_of_count(voc)
     np.testing.assert_array_equal(clipped.stats, expected)
Exemplo n.º 2
0
 def _create_sketches(this_stats, that_stats):
   """Build a two-element list of sketches from raw stats.

   Each entry is a 2-bucket VectorOfCounts (random_seed=1) whose `stats` is set
   to the given values, or None when the corresponding argument is None.

   Args:
     this_stats: Stats for the first sketch, or None.
     that_stats: Stats for the second sketch, or None.

   Returns:
     A list `[this_sketch, that_sketch]`.
   """
   def _build(stats):
     if stats is None:
       return None
     voc = VectorOfCounts(num_buckets=2, random_seed=1)
     voc.stats = np.array(stats)
     return voc

   return [_build(this_stats), _build(that_stats)]
Exemplo n.º 3
0
 def test_merge_no_clip(self):
     """Merge two identically built single-id sketches without clipping."""
     def _single_id_sketch():
         voc = VectorOfCounts(num_buckets=2, random_seed=2)
         voc.add_ids([1])
         return voc

     first = _single_id_sketch()
     second = _single_id_sketch()
     estimator = PairwiseEstimator()
     merged = estimator.merge(first, second)
     # Compare sorted stats against the sorted expected values.
     expected_sorted_stats = np.array([0, 1.5])
     np.testing.assert_array_equal(np.sort(merged.stats),
                                   expected_sorted_stats)
Exemplo n.º 4
0
 def test_estimate_cardinality_no_clip(self):
     """Sequential estimate over three identically built sketches, no clip."""
     sketches = []
     while len(sketches) < 3:
         voc = VectorOfCounts(num_buckets=2, random_seed=3)
         voc.add_ids([1])
         sketches.append(voc)
     sequential_estimator = SequentialEstimator()
     estimates = sequential_estimator(sketches)
     # Only the first entry of the estimator output is checked.
     expected_cardinality = 1.75
     self.assertEqual(estimates[0], expected_cardinality)
Exemplo n.º 5
0
 def test_has_full_intersection(self):
     """Sketches filled with the same ids must report a full intersection."""
     estimator = PairwiseEstimator()
     first = VectorOfCounts(num_buckets=64, random_seed=2)
     first.add_ids(range(100))
     second = VectorOfCounts(num_buckets=64, random_seed=2)
     second.add_ids(range(100))
     overlap = estimator._intersection(first, second)
     is_full = estimator.has_full_intersection(overlap, first, second)
     self.assertTrue(is_full)
Exemplo n.º 6
0
 def test_has_zero_intersection(self):
     """Sketches filled with disjoint id ranges must report no intersection."""
     estimator = PairwiseEstimator()
     first = VectorOfCounts(num_buckets=64, random_seed=2)
     first.add_ids(range(100))
     # Clip relies on hypothesis testing and hence requires a minimum size
     second = VectorOfCounts(num_buckets=64, random_seed=2)
     second.add_ids(range(100, 200))
     overlap = estimator._intersection(first, second)
     is_zero = estimator.has_zero_intersection(overlap, first, second)
     self.assertTrue(is_zero)
Exemplo n.º 7
0
 def test_assert_compatible_not_vector_of_count(self):
     """A plain list in either argument position must be rejected."""
     voc = VectorOfCounts(num_buckets=4, random_seed=2)
     pairwise_estimator = PairwiseEstimator()
     # Try the non-sketch value as the second, then as the first argument.
     for this, that in ((voc, []), ([], voc)):
         with self.assertRaises(AssertionError):
             pairwise_estimator.assert_compatible(this, that)
Exemplo n.º 8
0
 def test_add_ids_random_state(self):
     """Sketches sharing a random seed get equal stats for the same ids."""
     def _seeded_sketch():
         voc = VectorOfCounts(num_buckets=8, random_seed=0)
         voc.add_ids(range(4))
         return voc

     first = _seeded_sketch()
     second = _seeded_sketch()
     failure_message = 'Two VoC are not the same with the same random seed.'
     np.testing.assert_array_equal(first.stats, second.stats,
                                   err_msg=failure_message)
Exemplo n.º 9
0
 def test_identity(self):
     """IdentityNoiser must return a sketch whose stats are unchanged."""
     identity_noiser = IdentityNoiser()
     original = VectorOfCounts(num_buckets=8, random_seed=1)
     noised = identity_noiser(original)
     failure_message = 'IdentityNoiser should not change the sketch.'
     np.testing.assert_array_equal(original.stats, noised.stats,
                                   err_msg=failure_message)
Exemplo n.º 10
0
 def test_get_std_of_intersection(self, epsilon, intersection_cardinality,
                                  expected):
     """Check `_get_std_of_intersection` on two sketches with fixed stats.

     Args:
       epsilon: Forwarded to the estimator as `epsilon`.
       intersection_cardinality: Intersection size passed to the estimator.
       expected: Expected result, compared to 2 decimal places.
     """
     first = VectorOfCounts(num_buckets=4, random_seed=0)
     first.stats = np.array([2, 2, 0, 0])
     second = VectorOfCounts(num_buckets=4, random_seed=0)
     second.stats = np.array([2, 0, 2, 0])
     estimator = PairwiseEstimator(clip=True, epsilon=epsilon)
     std = estimator._get_std_of_intersection(intersection_cardinality,
                                              first, second)
     self.assertAlmostEqual(std, expected, places=2)
Exemplo n.º 11
0
 def test_evaluate_closeness_to_a_value(self, epsilon,
                                        intersection_cardinality,
                                        value_to_compare_with, expected):
     """Check `evaluate_closeness_to_a_value` on sketches with fixed stats.

     Args:
       epsilon: Forwarded to the estimator as `epsilon`.
       intersection_cardinality: Intersection size passed to the estimator.
       value_to_compare_with: Reference value passed to the estimator.
       expected: Expected result, compared to 2 decimal places.
     """
     first = VectorOfCounts(num_buckets=4, random_seed=0)
     first.stats = np.array([2, 2, 0, 0])
     second = VectorOfCounts(num_buckets=4, random_seed=0)
     second.stats = np.array([2, 0, 2, 0])
     estimator = PairwiseEstimator(clip=True, epsilon=epsilon)
     closeness = estimator.evaluate_closeness_to_a_value(
         intersection_cardinality, value_to_compare_with, first, second)
     self.assertAlmostEqual(closeness, expected, places=2)
 def test_estimate_cardinality_with_clip(self):
   """Clipped sequential estimate for the empty and the identical cases.

   List A holds the base sketch followed by three sketches with no ids added;
   list B holds the base sketch four times. Both estimates are compared to
   the base sketch's own cardinality.
   """
   base = VectorOfCounts(num_buckets=64, random_seed=3)
   base.add_ids(range(100))
   # Base sketch plus three sketches that never had ids added.
   with_empty_sketches = [base] + [
       VectorOfCounts(num_buckets=64, random_seed=3) for _ in range(3)
   ]
   # The very same sketch object repeated four times.
   with_same_sketches = [base] * 4
   clipping_estimator = SequentialEstimator(clip=True)
   # NOTE(review): the estimator output is compared as-is to a cardinality
   # value (no indexing) -- confirm this matches the estimator's return type.
   estimate_empty = clipping_estimator(with_empty_sketches)
   estimate_same = clipping_estimator(with_same_sketches)
   self.assertEqual(estimate_empty, base.cardinality(),
                    msg='Fail to detect the no-intersection case.')
   self.assertEqual(estimate_same, base.cardinality(),
                    msg='Fail to detect the full-intersection case.')
Exemplo n.º 13
0
 def test_assert_compatible_not_same_hash_function(self):
     """Sketches built with different random seeds must be rejected."""
     seeded_one = VectorOfCounts(num_buckets=4, random_seed=1)
     seeded_two = VectorOfCounts(num_buckets=4, random_seed=2)
     pairwise_estimator = PairwiseEstimator()
     self.assertRaises(AssertionError, pairwise_estimator.assert_compatible,
                       seeded_one, seeded_two)
Exemplo n.º 14
0
 def test_assert_compatible_not_equal_length(self):
     """Sketches with different bucket counts must be rejected."""
     four_buckets = VectorOfCounts(num_buckets=4, random_seed=2)
     eight_buckets = VectorOfCounts(num_buckets=8, random_seed=2)
     pairwise_estimator = PairwiseEstimator()
     self.assertRaises(AssertionError, pairwise_estimator.assert_compatible,
                       four_buckets, eight_buckets)
Exemplo n.º 15
0
 def test_laplace(self):
     """Every stat of a Laplace-noised sketch must differ from the original."""
     laplace_noiser = LaplaceNoiser()
     original = VectorOfCounts(num_buckets=8, random_seed=1)
     noised = laplace_noiser(original)
     # Compare the stats position by position.
     paired_stats = zip(original.stats, noised.stats)
     for original_stat, noised_stat in paired_stats:
         self.assertNotEqual(original_stat, noised_stat)
Exemplo n.º 16
0
 def test_add_ids_multiple_times(self):
     """A second `add_ids` call on the same sketch must raise."""
     voc = VectorOfCounts(num_buckets=8, random_seed=0)
     voc.add_ids([1])
     self.assertRaises(AssertionError, voc.add_ids, [1])
Exemplo n.º 17
0
 def test_add_ids(self):
     """A sketch holding a single id must report a cardinality of 1."""
     voc = VectorOfCounts(num_buckets=8, random_seed=0)
     voc.add_ids([1])
     reported_cardinality = voc.cardinality()
     self.assertEqual(reported_cardinality, 1)
    def setUp(self):
        """Set simulation parameters and build the estimator configs.

        Stores two name-keyed dictionaries on the test case:
        `name_to_non_noised_estimator_config` (no sketch noiser) and
        `name_to_noised_estimator_config` (sketch noiser attached).
        """
        super(InteroperabilityTest, self).setUp()
        self.number_of_trials = 2
        self.universe_size = 2000
        self.set_size = 5
        self.large_set_size = 6
        self.small_set_size = 3
        self.sketch_size = 64
        self.number_of_sets = 2
        self.num_large_sets = 1
        self.num_small_sets = 3
        self.order = set_generator.ORDER_RANDOM
        self.user_activity_association = (
            set_generator.USER_ACTIVITY_ASSOCIATION_INDEPENDENT)
        self.shared_prop = 0.2
        self.num_bloom_filter_hashes = 2
        self.exponential_bloom_filter_decay_rate = 10
        self.noiser_epsilon = np.log(3)
        self.noiser_flip_probability = .25

        # Separate, fixed-seed random states for set generation, sketch
        # construction and noise.
        self.set_random_state = np.random.RandomState(42)
        self.sketch_random_state = np.random.RandomState(137)
        self.noise_random_state = np.random.RandomState(3)

        # non-noised estimators
        estimator_config_cascading_legions = EstimatorConfig(
            sketch_factory=CascadingLegions.get_sketch_factory(
                self.sketch_size, self.sketch_size),
            estimator=Estimator(),
            sketch_noiser=None,
            estimate_noiser=None)

        estimator_config_bloom_filter = EstimatorConfig(
            sketch_factory=BloomFilter.get_sketch_factory(
                self.sketch_size, self.num_bloom_filter_hashes),
            estimator=UnionEstimator(),
            sketch_noiser=None,
            estimate_noiser=None)

        estimator_config_logarithmic_bloom_filter = EstimatorConfig(
            sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
                self.sketch_size),
            estimator=FirstMomentEstimator(method='log'),
            sketch_noiser=None,
            estimate_noiser=None)

        estimator_config_exponential_bloom_filter = EstimatorConfig(
            sketch_factory=ExponentialBloomFilter.get_sketch_factory(
                self.sketch_size, self.exponential_bloom_filter_decay_rate),
            estimator=FirstMomentEstimator(method='exp'),
            sketch_noiser=None,
            estimate_noiser=None)

        estimator_config_geometric_bloom_filter = EstimatorConfig(
            sketch_factory=GeometricBloomFilter.get_sketch_factory(
                self.sketch_size),
            estimator=GeometricUnionEstimator(),
            sketch_noiser=None,
            estimate_noiser=None)

        estimator_config_voc = EstimatorConfig(
            sketch_factory=VectorOfCounts.get_sketch_factory(self.sketch_size),
            estimator=SequentialEstimator(),
            sketch_noiser=None,
            estimate_noiser=None)

        estimator_config_exact = EstimatorConfig(
            sketch_factory=ExactSet.get_sketch_factory(),
            estimator=LosslessEstimator(),
            sketch_noiser=None,
            estimate_noiser=None)

        estimator_config_hll = EstimatorConfig(
            sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(
                self.sketch_size),
            estimator=HllCardinality(),
            sketch_noiser=None,
            estimate_noiser=None)

        self.name_to_non_noised_estimator_config = {
            'exact_set': estimator_config_exact,
            'cascading_legions': estimator_config_cascading_legions,
            'bloom_filter': estimator_config_bloom_filter,
            'logarithmic_bloom_filter':
            estimator_config_logarithmic_bloom_filter,
            'exponential_bloom_filter':
            estimator_config_exponential_bloom_filter,
            'geometric_bloom_filter': estimator_config_geometric_bloom_filter,
            'vector_of_counts': estimator_config_voc,
            'hll': estimator_config_hll,
        }

        # noised estimators

        noised_estimator_config_cascading_legions = EstimatorConfig(
            sketch_factory=CascadingLegions.get_sketch_factory(
                self.sketch_size, self.sketch_size),
            estimator=Estimator(),
            sketch_noiser=Noiser(self.noiser_flip_probability),
            estimate_noiser=None)

        noised_estimator_config_bloom_filter = EstimatorConfig(
            sketch_factory=BloomFilter.get_sketch_factory(
                self.sketch_size, self.num_bloom_filter_hashes),
            estimator=UnionEstimator(),
            sketch_noiser=BlipNoiser(self.noiser_epsilon,
                                     self.noise_random_state),
            estimate_noiser=None)

        # The denoiser attached to the estimator only makes sense when the
        # sketch has actually been noised, so a BlipNoiser is supplied here
        # (previously sketch_noiser was None and no noise was ever applied).
        noised_estimator_config_logarithmic_bloom_filter = EstimatorConfig(
            sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
                self.sketch_size),
            estimator=FirstMomentEstimator(
                method='log',
                denoiser=SurrealDenoiser(
                    probability=self.noiser_flip_probability)),
            sketch_noiser=BlipNoiser(self.noiser_epsilon,
                                     self.noise_random_state),
            estimate_noiser=None)

        # Same reasoning as for the logarithmic bloom filter above.
        noised_estimator_config_exponential_bloom_filter = EstimatorConfig(
            sketch_factory=ExponentialBloomFilter.get_sketch_factory(
                self.sketch_size, self.exponential_bloom_filter_decay_rate),
            estimator=FirstMomentEstimator(
                method='exp',
                denoiser=SurrealDenoiser(
                    probability=self.noiser_flip_probability)),
            sketch_noiser=BlipNoiser(self.noiser_epsilon,
                                     self.noise_random_state),
            estimate_noiser=None)

        noised_estimator_config_geometric_bloom_filter = EstimatorConfig(
            sketch_factory=GeometricBloomFilter.get_sketch_factory(
                self.sketch_size),
            estimator=GeometricUnionEstimator(),
            sketch_noiser=BlipNoiser(self.noiser_epsilon,
                                     self.noise_random_state),
            estimate_noiser=None)

        noised_estimator_config_voc = EstimatorConfig(
            sketch_factory=VectorOfCounts.get_sketch_factory(self.sketch_size),
            estimator=SequentialEstimator(),
            sketch_noiser=LaplaceNoiser(),
            estimate_noiser=None)

        noised_estimator_config_exact = EstimatorConfig(
            sketch_factory=ExactSet.get_sketch_factory(),
            estimator=LosslessEstimator(),
            sketch_noiser=AddRandomElementsNoiser(1, self.noise_random_state),
            estimate_noiser=None)

        self.name_to_noised_estimator_config = {
            'exact_set': noised_estimator_config_exact,
            'cascading_legions': noised_estimator_config_cascading_legions,
            'bloom_filter': noised_estimator_config_bloom_filter,
            'logarithmic_bloom_filter':
            noised_estimator_config_logarithmic_bloom_filter,
            'exponential_bloom_filter':
            noised_estimator_config_exponential_bloom_filter,
            'geometric_bloom_filter':
            noised_estimator_config_geometric_bloom_filter,
            'vector_of_counts': noised_estimator_config_voc,
        }
Exemplo n.º 19
0
def main(argv):
  """Simulate every sketch/estimator configuration and print the results.

  For each configuration a Simulator is built with freshly seeded random
  states, run, and its aggregated statistics are printed.

  Args:
    argv: Command-line arguments; more than one entry is rejected.

  Raises:
    app.UsageError: If more than one command-line argument is given.
  """
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  estimator_config_cascading_legions = SketchEstimatorConfig(
      name='cascading-legions',
      sketch_factory=CascadingLegions.get_sketch_factory(
          FLAGS.sketch_size, FLAGS.sketch_size),
      estimator=Estimator())

  estimator_config_bloom_filter = SketchEstimatorConfig(
      name='bloom_filter-union_estimator',
      sketch_factory=BloomFilter.get_sketch_factory(
          FLAGS.sketch_size, FLAGS.num_bloom_filter_hashes),
      estimator=UnionEstimator())

  estimator_config_logarithmic_bloom_filter = SketchEstimatorConfig(
      name='log_bloom_filter-first_moment_log',
      sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
          FLAGS.sketch_size),
      estimator=FirstMomentEstimator(method='log'))

  estimator_config_exponential_bloom_filter = SketchEstimatorConfig(
      name='exp_bloom_filter-first_moment_exp',
      sketch_factory=ExponentialBloomFilter.get_sketch_factory(
          FLAGS.sketch_size, FLAGS.exponential_bloom_filter_decay_rate),
      estimator=FirstMomentEstimator(method='exp'))

  estimator_config_voc = SketchEstimatorConfig(
      name='vector_of_counts-sequential',
      sketch_factory=VectorOfCounts.get_sketch_factory(FLAGS.sketch_size),
      estimator=SequentialEstimator())

  estimator_config_hll = SketchEstimatorConfig(
      name='hll++',
      sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(FLAGS.sketch_size),
      estimator=HllCardinality())

  estimator_config_exact = SketchEstimatorConfig(
      name='exact_set-lossless',
      sketch_factory=ExactSet.get_sketch_factory(),
      estimator=LosslessEstimator())

  # Configurations are simulated in this order.
  estimator_config_list = [
      estimator_config_bloom_filter,
      estimator_config_logarithmic_bloom_filter,
      estimator_config_exponential_bloom_filter,
      estimator_config_cascading_legions,
      estimator_config_exact,
      estimator_config_hll,
      estimator_config_voc,
  ]

  set_generator_factory = (
      set_generator.IndependentSetGenerator.
      get_generator_factory_with_num_and_size(
          universe_size=FLAGS.universe_size,
          num_sets=FLAGS.number_of_sets,
          set_size=FLAGS.set_size))

  for estimator_method_config in estimator_config_list:
    print(f'Calculations for {estimator_method_config.name}')
    # Re-seed for every configuration so each one starts from the same
    # random states.
    set_rs = np.random.RandomState(1)
    sketch_rs = np.random.RandomState(1)
    simulator = Simulator(
        num_runs=FLAGS.number_of_trials,
        set_generator_factory=set_generator_factory,
        sketch_estimator_config=estimator_method_config,
        set_random_state=set_rs,
        sketch_random_state=sketch_rs)

    _, agg_data = simulator.run_all_and_aggregate()
    print(f'Aggregate Statistics for {estimator_method_config.name}')
    print(agg_data)
Exemplo n.º 20
0
 def test_get_std_of_sketch_sum(self, epsilon, expected):
     """Check `_get_std_of_sketch_sum` on a sketch with fixed stats.

     Args:
       epsilon: Forwarded to the estimator as `epsilon`.
       expected: Value the estimator is expected to return exactly.
     """
     voc = VectorOfCounts(num_buckets=2, random_seed=0)
     # Overwrite the stats directly so the input is fully controlled.
     voc.stats = np.array([2, 2])
     estimator = PairwiseEstimator(clip=True, epsilon=epsilon)
     std_of_sum = estimator._get_std_of_sketch_sum(voc)
     self.assertEqual(std_of_sum, expected)
Exemplo n.º 21
0
 def test_merge_with_clip(self):
     """Clipped merge for the disjoint and the identical cases.

     With disjoint id ranges the merged stats must equal the sum of both
     sketches' stats; with identical ids they must equal the first sketch's
     stats.
     """
     this_sketch = VectorOfCounts(num_buckets=64, random_seed=2)
     this_sketch.add_ids(range(100))
     # First test no intersection
     that_sketch = VectorOfCounts(num_buckets=64, random_seed=2)
     that_sketch.add_ids(range(100, 200))
     pairwise_estimator = PairwiseEstimator(clip=True)
     merged = pairwise_estimator.merge(this_sketch, that_sketch)
     # The arrays are passed positionally: the keyword names of
     # assert_array_equal changed across NumPy versions (x/y were renamed to
     # actual/desired), while the positional form works on all of them.
     np.testing.assert_array_equal(
         merged.stats,
         this_sketch.stats + that_sketch.stats,
         err_msg='Fail to detect the no-intersection case.')
     # Then test full intersection
     that_sketch = VectorOfCounts(num_buckets=64, random_seed=2)
     that_sketch.add_ids(range(100))
     merged = pairwise_estimator.merge(this_sketch, that_sketch)
     np.testing.assert_array_equal(
         merged.stats,
         this_sketch.stats,
         err_msg='Fail to detect the full-intersection case.')
Exemplo n.º 22
0
  def setUp(self):
    """Set simulation parameters and build the estimator configs under test.

    Two name-keyed lookups are stored on the test case: one for configs built
    without a sketch noiser and one for configs built with a sketch noiser.
    Both are keyed by each config's `name`.
    """
    super(InteroperabilityTest, self).setUp()

    # Simulation and set-generation parameters.
    self.number_of_trials = 2
    self.universe_size = 2000
    self.set_size_list = [5, 7, 9]
    self.large_set_size = 6
    self.small_set_size = 3
    self.sketch_size = 128
    self.number_of_sets = 3
    self.set_size = 50
    self.num_large_sets = 1
    self.num_small_sets = 3
    self.order = set_generator.ORDER_RANDOM
    self.user_activity_association = (
        set_generator.USER_ACTIVITY_ASSOCIATION_INDEPENDENT)
    self.shared_prop = 0.2

    # Sketch and noise parameters.
    self.num_bloom_filter_hashes = 2
    self.exponential_bloom_filter_decay_rate = 10
    self.noiser_epsilon = np.log(3)
    self.noiser_flip_probability = .25

    # Fixed-seed random states.
    self.set_random_state = np.random.RandomState(42)
    self.sketch_random_state = np.random.RandomState(137)
    self.noise_random_state = np.random.RandomState(3)

    # ---- Configs without a sketch noiser ----
    legions = SketchEstimatorConfig(
        name='cascading_legions',
        sketch_factory=CascadingLegions.get_sketch_factory(
            self.sketch_size, self.sketch_size),
        estimator=Estimator())

    bloom = SketchEstimatorConfig(
        name='bloom_filter-union_estimator',
        sketch_factory=BloomFilter.get_sketch_factory(
            self.sketch_size, self.num_bloom_filter_hashes),
        estimator=UnionEstimator())

    log_bloom = SketchEstimatorConfig(
        name='log_bloom_filter-first_moment_log',
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
            self.sketch_size),
        estimator=FirstMomentEstimator(method='log'))

    exp_bloom = SketchEstimatorConfig(
        name='exp_bloom_filter-first_moment_exp',
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            self.sketch_size, self.exponential_bloom_filter_decay_rate),
        estimator=FirstMomentEstimator(method='exp'))

    voc = SketchEstimatorConfig(
        name='vector_of_counts-sequential',
        sketch_factory=VectorOfCounts.get_sketch_factory(self.sketch_size),
        estimator=SequentialEstimator())

    exact = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet.get_sketch_factory(),
        estimator=LosslessEstimator())

    hll = SketchEstimatorConfig(
        name='hyper_log_log',
        sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(self.sketch_size),
        estimator=HllCardinality())

    self.name_to_non_noised_estimator_config = {}
    for plain_config in (exact, legions, bloom, log_bloom, exp_bloom, voc,
                         hll):
      self.name_to_non_noised_estimator_config[plain_config.name] = (
          plain_config)

    # ---- Configs with a sketch noiser ----
    noised_legions = SketchEstimatorConfig(
        name='cascading_legions',
        sketch_factory=CascadingLegions.get_sketch_factory(
            self.sketch_size, self.sketch_size),
        estimator=Estimator(),
        sketch_noiser=Noiser(self.noiser_flip_probability))

    noised_bloom = SketchEstimatorConfig(
        name='bloom_filter-union_estimator',
        sketch_factory=BloomFilter.get_sketch_factory(
            self.sketch_size, self.num_bloom_filter_hashes),
        estimator=UnionEstimator(),
        sketch_noiser=BlipNoiser(self.noiser_epsilon, self.noise_random_state))

    noised_log_bloom = SketchEstimatorConfig(
        name='log_bloom_filter-first_moment_log',
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
            self.sketch_size),
        estimator=FirstMomentEstimator(
            method='log',
            denoiser=SurrealDenoiser(
                probability=self.noiser_flip_probability)),
        sketch_noiser=BlipNoiser(self.noiser_epsilon, self.noise_random_state))

    noised_exp_bloom = SketchEstimatorConfig(
        name='exp_bloom_filter-first_moment_exp',
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            self.sketch_size, self.exponential_bloom_filter_decay_rate),
        estimator=FirstMomentEstimator(
            method='exp',
            denoiser=SurrealDenoiser(
                probability=self.noiser_flip_probability)),
        sketch_noiser=BlipNoiser(self.noiser_epsilon, self.noise_random_state))

    noised_voc = SketchEstimatorConfig(
        name='vector_of_counts-sequential',
        sketch_factory=VectorOfCounts.get_sketch_factory(self.sketch_size),
        estimator=SequentialEstimator(),
        sketch_noiser=LaplaceNoiser())

    noised_exact = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet.get_sketch_factory(),
        estimator=LosslessEstimator(),
        sketch_noiser=AddRandomElementsNoiser(1, self.noise_random_state))

    self.name_to_noised_estimator_config = {}
    for noised_config in (noised_exact, noised_legions, noised_bloom,
                          noised_log_bloom, noised_exp_bloom, noised_voc):
      self.name_to_noised_estimator_config[noised_config.name] = noised_config