Пример #1
0
    def test_sample_matrix_noisy(self):
        """Recover both cardinalities from a sampled matrix of noised sketches.

        Sketches of range(30000) and range(20000, 40000) are both passed
        through one Noiser(0.25). The column sums of Sampler.sample_matrix()
        are then turned back into cardinalities and compared with 30000 and
        20000, each within a 5% delta.
        """
        first = liquid_legions.LiquidLegions(2.0, 100000, random_seed=42)
        second = first.get_compatible_sketch()

        first.add_ids(list(range(30000)))
        second.add_ids(list(range(20000, 40000)))

        # The same noiser instance is applied to the first sketch, then to
        # the second one.
        noise_parameter = 0.25
        noiser = liquid_legions.Noiser(noise_parameter)
        noised_first = noiser(first)
        noised_second = noiser(second)

        venn_estimator = liquid_legions.VennEstimator(
            [noised_first, noised_second])
        logging.info('Venn: %s', venn_estimator.estimate_from_all())
        sampler = liquid_legions.Sampler([noised_first, noised_second])
        all_posteriors = sampler.get_all_posteriors()
        logging.info('Row sums: %s', all_posteriors.sum(axis=1))

        sampled_matrix = sampler.sample_matrix()
        logging.info('Sample shape: %s', sampled_matrix.shape)
        # Column totals of the sampled matrix: index 0 is used for the first
        # sketch and index 1 for the second.
        column_totals = sampled_matrix.sum(axis=0)
        logging.info('Bit count expectations: %s', column_totals)
        first_cardinality = (
            sampler.sketch.get_cardinality_for_legionaries_count(
                column_totals[0]))
        second_cardinality = (
            sampler.sketch.get_cardinality_for_legionaries_count(
                column_totals[1]))
        logging.info('s1 cardinality: %s', first_cardinality)
        logging.info('s2 cardinality: %s', second_cardinality)
        self.assertAlmostEqual(first_cardinality, 30000, delta=1500)
        self.assertAlmostEqual(second_cardinality, 20000, delta=1000)
Пример #2
0
    def test_sample_noisy_large(self):
        """Sample sketches from two differently noised large sketches.

        Sketches of range(300000) and range(200000, 400000) are noised with
        Noiser(0.25) and Noiser(0.1) respectively. The sketches returned by
        Sampler.sample() must report cardinalities within 5% of 300000 and
        200000, and the merged union must be within 5% of 400000.
        """
        pure_first = liquid_legions.LiquidLegions(2.0, 100000, random_seed=42)
        pure_second = pure_first.get_compatible_sketch()

        pure_first.add_ids(list(range(300000)))
        pure_second.add_ids(list(range(200000, 400000)))

        # Each sketch gets its own noiser with a different parameter.
        first_noiser = liquid_legions.Noiser(0.25)
        noised_first = first_noiser(pure_first)
        second_noiser = liquid_legions.Noiser(0.1)
        noised_second = second_noiser(pure_second)

        sampler = liquid_legions.Sampler([noised_first, noised_second])

        sampled_matrix = sampler.sample_matrix()
        logging.info('Sampled matrix: %s', sampled_matrix.sum(axis=0))

        sampled_first, sampled_second = sampler.sample()

        logging.info('s1 legionaries: %d', sampled_first.legionaries_count())
        logging.info('s1 cardinality: %.3f', sampled_first.get_cardinality())
        logging.info('s2 cardinality: %.3f', sampled_second.get_cardinality())
        # NOTE(review): the union is merged from the original, un-noised
        # sketches rather than from the sampled ones -- confirm intended.
        merged = liquid_legions.LiquidLegions.merge_of(
            [pure_first, pure_second])
        logging.info('s1|s2 cardinality: %.3f', merged.get_cardinality())
        self.assertAlmostEqual(
            sampled_first.get_cardinality(), 300000, delta=15000)
        self.assertAlmostEqual(
            sampled_second.get_cardinality(), 200000, delta=10000)
        self.assertAlmostEqual(merged.get_cardinality(), 400000, delta=20000)
Пример #3
0
    def test_posteriors_noisy(self):
        """Estimate cardinalities from posterior column sums of noised sketches.

        Sketches of range(30000) and range(20000, 40000) are noised with
        Noiser(0.3) and Noiser(0.2). Column sums of the posterior matrix are
        combined into legionary counts for the first sketch, the second
        sketch and their union, and the resulting cardinalities are checked
        against 30000, 20000 and 40000 within a 5% delta.
        """
        first = liquid_legions.LiquidLegions(2.0, 100000, random_seed=42)
        second = first.get_compatible_sketch()

        first.add_ids(list(range(30000)))
        second.add_ids(list(range(20000, 40000)))

        first_noiser = liquid_legions.Noiser(0.3)
        first = first_noiser(first)
        second_noiser = liquid_legions.Noiser(0.2)
        second = second_noiser(second)

        venn_estimator = liquid_legions.VennEstimator([first, second])
        logging.info('Venn: %s', venn_estimator.estimate_from_all())
        sampler = liquid_legions.Sampler([first, second])
        all_posteriors = sampler.get_all_posteriors()
        logging.info('Row sums: %s', all_posteriors.sum(axis=1))

        # Columns 1 and 3 are summed for the first sketch, columns 2 and 3
        # for the second; presumably the column index is a bitmask of
        # sketch membership -- verify against Sampler.
        column_totals = all_posteriors.sum(axis=0)
        logging.info('Bit count expectations: %s', column_totals)
        first_bits = column_totals[1] + column_totals[3]
        first_cardinality = (
            sampler.sketch.get_cardinality_for_legionaries_count(first_bits))
        logging.info('s1 cardinality: %.3f', first_cardinality)
        second_bits = column_totals[2] + column_totals[3]
        second_cardinality = (
            sampler.sketch.get_cardinality_for_legionaries_count(second_bits))
        logging.info('s2 cardinality: %.3f', second_cardinality)
        union_bits = column_totals[1] + column_totals[2] + column_totals[3]
        union_cardinality = (
            sampler.sketch.get_cardinality_for_legionaries_count(union_bits))
        logging.info('s1 | s2 cardinality: %.3f', union_cardinality)
        self.assertAlmostEqual(first_cardinality, 30000, delta=1500)
        self.assertAlmostEqual(second_cardinality, 20000, delta=1000)
        self.assertAlmostEqual(union_cardinality, 40000, delta=2000)
Пример #4
0
    def test_pure_sketch_building(self):
        """Check the cardinality estimate while a sketch grows to 1M ids.

        Ids 0..999999 are added one at a time. Before every 10000th insertion
        (including the very first) the estimate must match the number of ids
        added so far, with an allowed delta of 5% of (count + 1).
        """
        sketch = liquid_legions.LiquidLegions(10, 100000, random_seed=42)

        for added_so_far in range(1000000):
            at_checkpoint = added_so_far % 10000 == 0
            if at_checkpoint:
                # The check runs before the current id is inserted, so the
                # sketch holds exactly `added_so_far` ids at this point.
                expected = float(added_so_far)
                estimated = float(sketch.get_cardinality())
                tolerance = (added_so_far + 1) * 0.05
                self.assertAlmostEqual(expected, estimated, delta=tolerance)
            sketch.add_id(added_so_far)
Пример #5
0
 def make_chain(self, num_sketches, a, m, set_halfsize):
     """Build a list of sketches in which neighbours share half of their ids.

     Sketch number i receives the seeded hashes of 'i{i}_{j}' and
     'i{i + 1}_{j}' for every j in range(set_halfsize), so sketch i and
     sketch i + 1 both contain the ids derived from index i + 1.

     Args:
       num_sketches: how many sketches to create.
       a: first positional argument passed to LiquidLegions.
       m: second positional argument passed to LiquidLegions.
       set_halfsize: number of ids contributed by each of the two indices.

     Returns:
       The list of created sketches, in creation order.
     """
     chain = []
     for i in range(num_sketches):
         link = liquid_legions.LiquidLegions(a, m, random_seed=42)
         for j in range(set_halfsize):
             # Own id first, then the id shared with the next sketch.
             link.add_id(farmhash.hash32withseed(f'i{i}_{j}', 567))
             link.add_id(farmhash.hash32withseed(f'i{i + 1}_{j}', 567))
         chain.append(link)
     return chain
Пример #6
0
 def test_set_difference_pure(self):
     """Check Sampler.sample_diff() on two un-noised overlapping sketches.

     The first sketch holds range(0, 60000) and the second holds
     range(40000, 150000), so the expected difference is range(0, 40000).
     The sampled difference sketch must report a cardinality within 2000
     of 40000. A per-position comparison against a sketch built directly
     from the true difference is logged but not asserted.
     """
     minuend = liquid_legions.LiquidLegions(3.0, 100000, random_seed=42)
     subtrahend = liquid_legions.LiquidLegions(3.0, 100000, random_seed=42)
     minuend.add_ids(list(range(0, 60000)))
     subtrahend.add_ids(list(range(40000, 150000)))
     # Expected diff is range(0, 40000), i.e. size 40k.
     sampler = liquid_legions.Sampler([minuend, subtrahend])
     sampled_diff = sampler.sample_diff()
     true_diff = minuend.get_compatible_sketch()
     true_diff.add_ids(range(0, 40000))
     sampled_cardinality = sampled_diff.get_cardinality()
     logging.info('Diff cardinality: %.3f', sampled_cardinality)
     self.assertAlmostEqual(sampled_diff.get_cardinality(),
                            40000,
                            delta=2000)
     # Tally (true bit, estimated bit) pairs over positions 0..99999, the
     # same number that was passed as the second constructor argument.
     tally = {}
     for position in range(100000):
         estimated_bit = sampled_diff.sketch.get(position, 0) > 0
         true_bit = true_diff.sketch.get(position, 0) > 0
         pair = (true_bit, estimated_bit)
         tally[pair] = tally.get(pair, 0) + 1
     logging.info('Match / mismatch counts: %s', sorted(tally.items()))
Пример #7
0
    def test_venn_priors_two_tiny(self):
        """Check selected Venn prior entries for two tiny un-noised sketches.

        Both sketches are created with second constructor argument 10; the
        first holds range(10) and the second holds range(10000). Three
        entries of the matrix returned by Sampler.get_all_venn_priors() are
        asserted: [0][3] equals 1.0, while [4][2] and [9][0] are within 0.1
        of 0.95.
        """
        small = liquid_legions.LiquidLegions(20.0, 10, random_seed=42)
        large = small.get_compatible_sketch()

        small.add_ids(list(range(10)))
        large.add_ids(list(range(10000)))

        venn_estimator = liquid_legions.VennEstimator([small, large])
        logging.info('Venn: %s', venn_estimator.estimate_from_all())
        sampler = liquid_legions.Sampler([small, large])
        venn_priors = sampler.get_all_venn_priors()
        logging.info('Priors: %s', venn_priors)
        logging.info('Row sums: %s', venn_priors.sum(axis=1))
        first_row = venn_priors[0]
        self.assertAlmostEqual(first_row[3], 1.0)
        middle_row = venn_priors[4]
        self.assertAlmostEqual(middle_row[2], 0.95, delta=0.1)
        last_row = venn_priors[9]
        self.assertAlmostEqual(last_row[0], 0.95, delta=0.1)
Пример #8
0
    def test_posteriors_tiny_pure(self):
        """Check posterior entries and row sums for two tiny pure sketches.

        The first sketch holds range(10) and the second holds range(10000);
        no noise is applied. Entries [0, 3] and [3, 2] of the posterior
        matrix must equal 1.0, and every row must sum to 1.0.
        """
        small = liquid_legions.LiquidLegions(20.0, 10, random_seed=42)
        large = small.get_compatible_sketch()

        small.add_ids(list(range(10)))
        large.add_ids(list(range(10000)))

        venn_estimator = liquid_legions.VennEstimator([small, large])
        logging.info('Venn: %s', venn_estimator.estimate_from_all())
        sampler = liquid_legions.Sampler([small, large])
        all_posteriors = sampler.get_all_posteriors()
        logging.info('Posteriors: %s', all_posteriors)
        logging.info('Row sums: %s', all_posteriors.sum(axis=1))
        self.assertAlmostEqual(all_posteriors[0, 3], 1.0)
        self.assertAlmostEqual(all_posteriors[3, 2], 1.0)
        # Each row must add up to one.
        row_totals = all_posteriors.sum(axis=1)
        for row_total in row_totals:
            self.assertAlmostEqual(row_total, 1.0)
Пример #9
0
 def test_sequential_estimator(self):
     """Check SequentialEstimator on ten noised sketches of random sets.

     Ten sets of 10000 ids each are drawn (with numpy's default
     replacement) from a universe of 1000000 ids, sketched, noised with
     Noiser(0.25) and appended to a growing list. After every step the
     estimate over all sketches so far is logged; the final estimate must
     be within 25% of the true union size.
     """
     noiser = liquid_legions.Noiser(0.25)
     estimator = liquid_legions.SequentialEstimator()
     true_set = set()
     # Passing the population size as an int makes numpy draw from
     # arange(universe_size) directly, instead of converting a
     # million-element Python list into an array on every iteration.
     universe_size = 1000000
     sketches = []
     for i in range(10):
         new_set = numpy.random.choice(universe_size, size=10000)
         sketch = liquid_legions.LiquidLegions(10, 10000, random_seed=42)
         sketch.add_ids(new_set)
         # Grow the ground-truth union in place rather than rebuilding it.
         true_set.update(new_set)
         noised_sketch = noiser(sketch)
         sketches.append(noised_sketch)
         logging.info('Step %d, cardinality %.3f, true_cardinality %d.', i,
                      estimator(sketches), len(true_set))
     self.assertAlmostEqual(len(true_set),
                            estimator(sketches),
                            delta=len(true_set) * 0.25)
Пример #10
0
    def test_distribution_given_observation(self):
        """Check two transition matrix entries for sketches noised at 0.01.

        Two tiny sketches are passed through one Noiser(0.01). The sampler's
        transition matrix and distribution_given_observation(1) are logged,
        and entries [0, 0] and [0, 1] of the transition matrix are compared
        with 0.98 (delta 0.001) and 0.0099 (delta 0.0001).
        """
        small = liquid_legions.LiquidLegions(20.0, 10, random_seed=42)
        large = small.get_compatible_sketch()

        small.add_ids(list(range(10)))
        large.add_ids(list(range(10000)))

        # One noiser instance handles the first sketch, then the second.
        noiser = liquid_legions.Noiser(0.01)
        small = noiser(small)
        large = noiser(large)
        sampler = liquid_legions.Sampler([small, large])
        logging.info('Transition matrix: %s', sampler.transition_matrix)
        observed_distribution = sampler.distribution_given_observation(1)
        logging.info('Distribution: %s', observed_distribution)
        entry_00 = sampler.transition_matrix[0, 0]
        self.assertAlmostEqual(entry_00, 0.98, delta=0.001)
        entry_01 = sampler.transition_matrix[0, 1]
        self.assertAlmostEqual(entry_01, 0.0099, delta=0.0001)
Пример #11
0
    def test_venn_priors_two(self):
        """Estimate cardinalities from Venn prior column sums of pure sketches.

        The first sketch holds range(1000) and the second holds range(20000).
        Column sums of the matrix from Sampler.get_all_venn_priors() are
        combined into legionary counts for each sketch, and the resulting
        cardinalities must be within 10% of 1000 and 20000.
        """
        small = liquid_legions.LiquidLegions(20.0, 10000, random_seed=42)
        large = small.get_compatible_sketch()

        small.add_ids(list(range(1000)))
        large.add_ids(list(range(20000)))

        venn_estimator = liquid_legions.VennEstimator([small, large])
        logging.info('Venn: %s', venn_estimator.estimate_from_all())
        sampler = liquid_legions.Sampler([small, large])
        venn_priors = sampler.get_all_venn_priors()
        logging.info('Priors: %s', venn_priors)
        column_totals = venn_priors.sum(axis=0)
        logging.info('Counts: %s', column_totals)
        # Columns 1 and 3 are summed for the first sketch, columns 2 and 3
        # for the second; presumably the column index is a bitmask of
        # sketch membership -- verify against Sampler.
        small_bits = column_totals[1] + column_totals[3]
        small_cardinality = (
            sampler.sketch.get_cardinality_for_legionaries_count(small_bits))
        large_bits = column_totals[2] + column_totals[3]
        large_cardinality = (
            sampler.sketch.get_cardinality_for_legionaries_count(large_bits))
        logging.info('Cardinalities: %.3f, %.3f', small_cardinality,
                     large_cardinality)
        self.assertAlmostEqual(small_cardinality, 1000, delta=100)
        self.assertAlmostEqual(large_cardinality, 20000, delta=2000)
Пример #12
0
    def test_manual_sequential_merge_large_overlap_pure(self):
        """Merge ten heavily overlapping sketches one at a time via sampling.

        Step i sketches range(i * 1000, i * 1000 + 10000), so consecutive
        sets share 9000 ids. Each new sketch goes through Noiser(0.0), is
        sampled together with the accumulated sketch, and the sampled copy is
        merged into the accumulator. The final cardinality must be within
        10% of the true union size.
        """
        noiser = liquid_legions.Noiser(0.0)  # No noise.

        accumulated = liquid_legions.LiquidLegions(
            10.0, 50000, random_seed=42)

        true_ids = set()
        for step in range(10):
            incoming = accumulated.get_compatible_sketch()
            start = step * 1000
            step_ids = range(start, start + 10000)
            incoming.add_ids(step_ids)
            true_ids |= set(step_ids)
            noised_incoming = noiser(incoming)
            sampler = liquid_legions.Sampler([accumulated, noised_incoming])
            # Only the sampled version of the incoming sketch is kept.
            _, sampled_incoming = sampler.sample()
            venn_estimator = liquid_legions.VennEstimator(
                [accumulated, noised_incoming])
            logging.info('Venn: %s', venn_estimator())

            accumulated.merge_in(sampled_incoming)
            logging.info('Step %d, cardinality %.3f, true cardinality: %d',
                         step, accumulated.get_cardinality(), len(true_ids))
        true_size = len(true_ids)
        self.assertAlmostEqual(true_size,
                               accumulated.get_cardinality(),
                               delta=len(true_ids) * 0.1)
Пример #13
0
 def test_venn_estimator_pure_single(self):
     """Check VennEstimator on a single un-noised sketch of 2000 ids.

     Element 1 of estimate_from_all() must be within 50 of 2000; the full
     estimate is logged afterwards.
     """
     sketch = liquid_legions.LiquidLegions(5, 100000, random_seed=42)
     sketch.add_ids(range(2000))
     venn_estimator = liquid_legions.VennEstimator([sketch])
     estimate = venn_estimator.estimate_from_all()
     self.assertAlmostEqual(estimate[1], 2000, delta=50)
     logging.info('Venn: %s', venn_estimator.estimate_from_all())