示例#1
0
    def setUp(self):
        'Build disjoint seen/unseen link sets and a Bloom filter of the seen ones.'
        super().setUp()
        # Remove the key that the Bloom filter below is built on.
        self.redis.delete('recently-consumed')

        # 100 distinct links that the user has seen.  Random fullnames can
        # repeat, so keep drawing until the set is full.
        self.seen_links = seen = set()
        while len(seen) < 100:
            seen.add(self.random_fullname())

        # 100 distinct links that the user hasn't seen.  Any draw that is
        # already in the seen set is discarded, so the two sets never
        # intersect.
        self.unseen_links = unseen = set()
        while len(unseen) < 100:
            candidate = self.random_fullname()
            if candidate in seen:
                continue
            unseen.add(candidate)

        # The recently consumed Bloom filter, seeded with the seen set.
        filter_options = {
            'num_values': 1000,
            'false_positives': 0.001,
            'key': 'recently-consumed',
        }
        self.recently_consumed = BloomFilter(seen, **filter_options)
示例#2
0
 def test_init_without_iterable(self):
     'A BloomFilter constructed without an iterable starts out empty'
     dilberts = BloomFilter(num_elements=100, false_positives=0.01)

     # The sizing arguments are readable back as attributes.
     assert dilberts.num_elements == 100
     assert dilberts.false_positives == 0.01

     # Nothing was inserted, so no name is reported as a member...
     for name in ('rajiv', 'raj', 'dan', 'eric'):
         assert name not in dilberts

     # ...no bits are set, and the length is zero.
     assert dilberts._num_bits_set() == 0
     assert len(dilberts) == 0
示例#3
0
 def test_init_without_iterable(self):
     'A BloomFilter constructed without an iterable starts out empty'
     dilberts = BloomFilter(num_values=100, false_positives=0.01)

     # The sizing arguments are readable back as attributes.
     assert dilberts.num_values == 100
     assert dilberts.false_positives == 0.01

     # Nothing was inserted, so no name is reported as a member...
     for name in ('rajiv', 'raj', 'dan', 'eric'):
         assert name not in dilberts

     # ...no bits are set, and the length is zero.
     assert dilberts._num_bits_set() == 0
     assert len(dilberts) == 0
示例#4
0
    def setUp(self):
        'Build disjoint seen/unseen link sets and a Bloom filter of the seen ones.'
        super().setUp()
        # Remove the key that the Bloom filter below is built on.
        self.redis.delete('recently-consumed')

        # 100 distinct links that the user has seen.  Random fullnames can
        # repeat, so keep drawing until the set is full.
        self.seen_links = seen = set()
        while len(seen) < 100:
            seen.add(self.random_fullname())

        # 100 distinct links that the user hasn't seen.  Any draw that is
        # already in the seen set is discarded, so the two sets never
        # intersect.
        self.unseen_links = unseen = set()
        while len(unseen) < 100:
            candidate = self.random_fullname()
            if candidate in seen:
                continue
            unseen.add(candidate)

        # The recently consumed Bloom filter, seeded with the seen set.
        filter_options = {
            'num_values': 1000,
            'false_positives': 0.001,
            'key': 'recently-consumed',
        }
        self.recently_consumed = BloomFilter(seen, **filter_options)
示例#5
0
 def test_repr(self):
     'repr() of a BloomFilter names the class and its key'
     options = {
         'num_elements': 100,
         'false_positives': 0.01,
         'key': self._KEY,
     }
     dilberts = BloomFilter(**options)
     actual = repr(dilberts)
     assert actual == f'<BloomFilter key={self._KEY}>'
示例#6
0
 def test_repr(self):
     'repr() of a BloomFilter names the class and its key'
     options = {
         'num_values': 100,
         'false_positives': 0.01,
         'key': 'dilberts',
     }
     dilberts = BloomFilter(**options)
     actual = repr(dilberts)
     assert actual == '<BloomFilter key=dilberts>'
示例#7
0
 def test_repr(self):
     'repr() of a BloomFilter names the class and its key'
     options = {
         'num_values': 100,
         'false_positives': 0.01,
         'key': self._KEY,
     }
     dilberts = BloomFilter(**options)
     actual = repr(dilberts)
     assert actual == '<BloomFilter key={}>'.format(self._KEY)
示例#8
0
 def test_init_with_iterable(self):
     'Test BloomFilter.__init__() with an iterable for initialization'
     dilberts = BloomFilter(
         {'rajiv', 'raj'},
         num_elements=100,
         false_positives=0.01,
     )
     # The sizing arguments are readable back as attributes.
     assert dilberts.num_elements == 100
     assert dilberts.false_positives == 0.01
     # Both initial values are members; names never inserted are not.
     assert 'rajiv' in dilberts
     assert 'raj' in dilberts
     assert 'dan' not in dilberts
     assert 'eric' not in dilberts
     # We've inserted two elements into dilberts: 'rajiv' and 'raj'.  So
     # unless dilberts._bit_offsets('rajiv') and
     # dilberts._bit_offsets('raj') perfectly collide/overlap, they differ
     # by at least 1 bit.  The number of bits set is therefore bounded
     # below by dilberts.num_hashes() + 1, and that bound is inclusive
     # (hence >=, not >):
     assert dilberts._num_bits_set() >= dilberts.num_hashes() + 1
     assert len(dilberts) == 2
示例#9
0
 def test_init_with_iterable(self):
     'Test BloomFilter.__init__() with an iterable for initialization'
     dilberts = BloomFilter(
         {'rajiv', 'raj'},
         num_values=100,
         false_positives=0.01,
     )
     # The sizing arguments are readable back as attributes.
     assert dilberts.num_values == 100
     assert dilberts.false_positives == 0.01
     # Both initial values are members; names never inserted are not.
     assert 'rajiv' in dilberts
     assert 'raj' in dilberts
     assert 'dan' not in dilberts
     assert 'eric' not in dilberts
     # We've inserted two elements into dilberts: 'rajiv' and 'raj'.  So
     # unless dilberts._bit_offsets('rajiv') and
     # dilberts._bit_offsets('raj') perfectly collide/overlap, they differ
     # by at least 1 bit.  The number of bits set is therefore bounded
     # below by dilberts.num_hashes() + 1, and that bound is inclusive
     # (hence >=, not >):
     assert dilberts._num_bits_set() >= dilberts.num_hashes() + 1
     assert len(dilberts) == 2
示例#10
0
    def test_update(self):
        'Exercise BloomFilter update(), __contains__(), and __len__() together'
        dilberts = BloomFilter(
            redis=self.redis,
            num_elements=100,
            false_positives=0.01,
        )
        # Every name probed after each step; 'rhodes' is never inserted.
        everyone = ('rajiv', 'raj', 'dan', 'eric', 'jenny', 'will', 'rhodes')

        def check(members):
            # Membership must match `members` exactly for every probed name,
            # and the reported length must equal the number of members.
            for name in everyone:
                if name in members:
                    assert name in dilberts
                else:
                    assert name not in dilberts
            assert len(dilberts) == len(members)

        # A freshly constructed filter contains nothing.
        check(())

        # update() accepts several iterables at once.
        dilberts.update({'rajiv', 'raj'}, {'dan', 'eric'})
        check(('rajiv', 'raj', 'dan', 'eric'))

        # A second update adds to what is already there.
        dilberts.update({'jenny', 'will'})
        check(('rajiv', 'raj', 'dan', 'eric', 'jenny', 'will'))

        # Updating with an empty set changes nothing.
        dilberts.update(set())
        check(('rajiv', 'raj', 'dan', 'eric', 'jenny', 'will'))
示例#11
0
    def test_update(self):
        'Exercise BloomFilter update(), __contains__(), and __len__() together'
        dilberts = BloomFilter(num_values=100, false_positives=0.01)
        # Every name probed after each step; 'rhodes' is never inserted.
        everyone = ('rajiv', 'raj', 'dan', 'eric', 'jenny', 'will', 'rhodes')

        def check(members):
            # Membership must match `members` exactly for every probed name,
            # and the reported length must equal the number of members.
            for name in everyone:
                if name in members:
                    assert name in dilberts
                else:
                    assert name not in dilberts
            assert len(dilberts) == len(members)

        # A freshly constructed filter contains nothing.
        check(())

        # update() accepts several iterables at once.
        dilberts.update({'rajiv', 'raj'}, {'dan', 'eric'})
        check(('rajiv', 'raj', 'dan', 'eric'))

        # A second update adds to what is already there.
        dilberts.update({'jenny', 'will'})
        check(('rajiv', 'raj', 'dan', 'eric', 'jenny', 'will'))

        # Updating with an empty set changes nothing.
        dilberts.update(set())
        check(('rajiv', 'raj', 'dan', 'eric', 'jenny', 'will'))
示例#12
0
class RecentlyConsumedTests(TestCase):
    "Test our Bloom filter on a simulation of reddit's recently consumed problem."

    def setUp(self):
        'Build disjoint seen/unseen link sets and a Bloom filter of the seen ones.'
        super().setUp()
        # Remove the key that the Bloom filter below is built on.
        self.redis.delete('recently-consumed')

        # 100 distinct links that the user has seen.  Random fullnames can
        # repeat, so keep drawing until the set is full.
        self.seen_links = seen = set()
        while len(seen) < 100:
            seen.add(self.random_fullname())

        # 100 distinct links that the user hasn't seen.  Any draw that is
        # already in the seen set is discarded, so the two sets never
        # intersect.
        self.unseen_links = unseen = set()
        while len(unseen) < 100:
            candidate = self.random_fullname()
            if candidate in seen:
                continue
            unseen.add(candidate)

        # The recently consumed Bloom filter, seeded with the seen set.
        filter_options = {
            'num_values': 1000,
            'false_positives': 0.001,
            'key': 'recently-consumed',
        }
        self.recently_consumed = BloomFilter(seen, **filter_options)

    def tearDown(self):
        'Clear the Bloom filter, then run the parent tearDown.'
        self.recently_consumed.clear()
        super().tearDown()

    @staticmethod
    def random_fullname(*, prefix='t3_', size=6):
        'Return prefix followed by size random characters drawn from 0-9a-z.'
        alphabet = string.digits + string.ascii_lowercase
        # One random.choice() call per character, in order.
        suffix = ''.join(random.choice(alphabet) for _ in range(size))
        return prefix + suffix

    @staticmethod
    def round(number, *, sig_digits=1):
        '''Round a float to the specified number of significant digits.

        Reference implementation:
            https://github.com/ActiveState/code/blob/3b27230f418b714bc9a0f897cb8ea189c3515e99/recipes/Python/578114_Round_number_specified_number_significant/recipe-578114.py
        '''
        try:
            # Position of the leading significant digit relative to the
            # decimal point.
            magnitude = math.floor(math.log10(abs(number)))
        except ValueError:
            # math.log10() rejects 0.0, which needs no rounding anyway; hand
            # the number back untouched.
            return number
        return round(number, sig_digits - 1 - magnitude)

    def test_zero_false_negatives(self):
        'Every link that was seen must be reported as seen'
        assert all(link in self.recently_consumed for link in self.seen_links)

    def test_acceptable_false_positives(self):
        'The observed false positive rate must not exceed the configured rate'
        acceptable = self.recently_consumed.false_positives

        # Count the unseen links that the filter wrongly reports as seen,
        # then express that as a rate rounded to one significant digit.
        hits = sum(link in self.recently_consumed for link in self.unseen_links)
        actual = self.round(hits / len(self.unseen_links), sig_digits=1)

        message = 'acceptable: {}; actual: {}'.format(acceptable, actual)
        assert actual <= acceptable, message
示例#13
0
    def test_size_and_num_hashes(self):
        'Test BloomFilter.size() and BloomFilter.num_hashes()'
        # (num_values, false_positives, expected size, expected num_hashes)
        cases = (
            (100, 0.1, 480, 4),
            (1000, 0.1, 4793, 4),
            (100, 0.01, 959, 7),
            (1000, 0.01, 9586, 7),
        )
        for num_values, false_positives, size, num_hashes in cases:
            dilberts = BloomFilter(
                num_values=num_values,
                false_positives=false_positives,
            )
            assert dilberts.size() == size
            assert dilberts.num_hashes() == num_hashes
示例#14
0
class RecentlyConsumedTests(TestCase):
    "Test our Bloom filter on a simulation of reddit's recently consumed problem."

    def setUp(self):
        'Build disjoint seen/unseen link sets and a Bloom filter of the seen ones.'
        super().setUp()
        # Remove the key that the Bloom filter below is built on.
        self.redis.delete('recently-consumed')

        # 100 distinct links that the user has seen.  Random fullnames can
        # repeat, so keep drawing until the set is full.
        self.seen_links = seen = set()
        while len(seen) < 100:
            seen.add(self.random_fullname())

        # 100 distinct links that the user hasn't seen.  Any draw that is
        # already in the seen set is discarded, so the two sets never
        # intersect.
        self.unseen_links = unseen = set()
        while len(unseen) < 100:
            candidate = self.random_fullname()
            if candidate in seen:
                continue
            unseen.add(candidate)

        # The recently consumed Bloom filter, seeded with the seen set.
        filter_options = {
            'num_values': 1000,
            'false_positives': 0.001,
            'key': 'recently-consumed',
        }
        self.recently_consumed = BloomFilter(seen, **filter_options)

    def tearDown(self):
        'Clear the Bloom filter, then run the parent tearDown.'
        self.recently_consumed.clear()
        super().tearDown()

    @staticmethod
    def random_fullname(*, prefix='t3_', size=6):
        'Return prefix followed by size random characters drawn from 0-9a-z.'
        alphabet = string.digits + string.ascii_lowercase
        # One random.choice() call per character, in order.
        suffix = ''.join(random.choice(alphabet) for _ in range(size))
        return prefix + suffix

    @staticmethod
    def round(number, *, sig_digits=1):
        '''Round a float to the specified number of significant digits.

        Reference implementation:
            https://github.com/ActiveState/code/blob/3b27230f418b714bc9a0f897cb8ea189c3515e99/recipes/Python/578114_Round_number_specified_number_significant/recipe-578114.py
        '''
        try:
            # Position of the leading significant digit relative to the
            # decimal point.
            magnitude = math.floor(math.log10(abs(number)))
        except ValueError:
            # math.log10() rejects 0.0, which needs no rounding anyway; hand
            # the number back untouched.
            return number
        return round(number, sig_digits - 1 - magnitude)

    def test_zero_false_negatives(self):
        'Every link that was seen must be reported as seen'
        assert all(link in self.recently_consumed for link in self.seen_links)

    def test_acceptable_false_positives(self):
        'The observed false positive rate must not exceed the configured rate'
        acceptable = self.recently_consumed.false_positives

        # Count the unseen links that the filter wrongly reports as seen,
        # then express that as a rate rounded to one significant digit.
        hits = sum(link in self.recently_consumed for link in self.unseen_links)
        actual = self.round(hits / len(self.unseen_links), sig_digits=1)

        message = 'acceptable: {}; actual: {}'.format(acceptable, actual)
        assert actual <= acceptable, message
示例#15
0
    def test_size_and_num_hashes(self):
        'Test BloomFilter.size() and BloomFilter.num_hashes()'
        # (num_elements, false_positives, expected size, expected num_hashes)
        cases = (
            (100, 0.1, 480, 4),
            (1000, 0.1, 4793, 4),
            (100, 0.01, 959, 7),
            (1000, 0.01, 9586, 7),
        )
        for num_elements, false_positives, size, num_hashes in cases:
            dilberts = BloomFilter(
                num_elements=num_elements,
                false_positives=false_positives,
            )
            assert dilberts.size() == size
            assert dilberts.num_hashes() == num_hashes
示例#16
0
    def test_add(self):
        'Exercise BloomFilter add(), __contains__(), and __len__() together'
        dilberts = BloomFilter(num_elements=100, false_positives=0.01)
        # Every name probed after each step.
        roster = ('rajiv', 'raj', 'dan', 'eric')
        present = set()

        def check():
            # Membership must match `present` exactly for every probed name,
            # and the reported length must equal the number added so far.
            for name in roster:
                if name in present:
                    assert name in dilberts
                else:
                    assert name not in dilberts
            assert len(dilberts) == len(present)

        # A freshly constructed filter contains nothing.
        check()

        # 'rajiv' and 'raj' are each added twice; the repeat additions must
        # leave both membership and length unchanged.
        for name in ('rajiv', 'raj', 'rajiv', 'raj', 'dan', 'eric'):
            dilberts.add(name)
            present.add(name)
            check()
示例#17
0
    def test_add(self):
        'Exercise BloomFilter add(), __contains__(), and __len__() together'
        dilberts = BloomFilter(num_values=100, false_positives=0.01)
        # Every name probed after each step.
        roster = ('rajiv', 'raj', 'dan', 'eric')
        present = set()

        def check():
            # Membership must match `present` exactly for every probed name,
            # and the reported length must equal the number added so far.
            for name in roster:
                if name in present:
                    assert name in dilberts
                else:
                    assert name not in dilberts
            assert len(dilberts) == len(present)

        # A freshly constructed filter contains nothing.
        check()

        # 'rajiv' and 'raj' are each added twice; the repeat additions must
        # leave both membership and length unchanged.
        for name in ('rajiv', 'raj', 'rajiv', 'raj', 'dan', 'eric'):
            dilberts.add(name)
            present.add(name)
            check()