示例#1
0
    def test_union_k_fail(self):
        """Check that union() raises ValueError for mismatched error rates.

        Both filters are built with capacity 100 but with error rates of
        0.01 and 0.001 respectively.
        """
        low_precision = BloomFilter(100, 0.01)
        high_precision = BloomFilter(100, 0.001)

        # Only the union() call itself is expected to raise.
        with self.assertRaises(ValueError):
            low_precision.union(high_precision)
示例#2
0
    def test_intersection_capacity_fail(self):
        """Check that intersection() raises ValueError for mismatched capacities.

        Both filters share an error rate of 0.001 but are built with
        capacities of 1000 and 100 respectively.
        """
        large = BloomFilter(1000, 0.001)
        small = BloomFilter(100, 0.001)

        # Only the intersection() call itself is expected to raise.
        with self.assertRaises(ValueError):
            large.intersection(small)
示例#3
0
 def test_union(self):
     """Check that the union of two filters contains the items of both.

     The letters 'a'..'z' are split in half; each half is added to its
     own filter, and every letter must then be found in the union.
     """
     upper_half_filter = BloomFilter(100, 0.001)
     lower_half_filter = BloomFilter(100, 0.001)

     letters = list(map(chr, range_fn(97, 123)))  # code points for 'a'..'z'
     midpoint = len(letters) // 2

     # Second half of the alphabet goes into the first filter ...
     for letter in letters[midpoint:]:
         upper_half_filter.add(letter)
     # ... and the first half into the second filter.
     for letter in letters[:midpoint]:
         lower_half_filter.add(letter)

     merged = upper_half_filter.union(lower_half_filter)
     for letter in letters:
         self.assertTrue(letter in merged)
示例#4
0
 def bloomf(self):
     """Return a new BloomFilter with capacity 1000 and error rate 0.001."""
     return BloomFilter(capacity=1000, error_rate=0.001)
示例#5
0
from __future__ import print_function

import time
from pybloom_live.pybloom import BloomFilter

try:
    range = xrange
except NameError:
    pass

# Benchmark: for each (item count, error rate) pair, print one row with
#   <items> <p> <ns per add> <ns per lookup of added key> <ns per lookup of
#   key that was never added>

# Nanoseconds per second; converts the per-operation seconds to ns.
NS = 10**9
for _p in range(1, 3):
    p = 10 ** _p  # the filter's error rate is 1/p: 0.1, then 0.01
    for e in range(9):
        # Item count grows by a factor of sqrt(10) per step: 1000 .. 10**7.
        X = int(1000 * 10 ** (e / 2.0))
        # end=' ' keeps the fields of a row separated; with end='' the
        # numbers were printed back to back and the row was unreadable.
        print(X, p, end=' ')
        bloomfilter = BloomFilter(X + 1, 1.0/p)
        t = time.time()

        # Phase 1: insert X distinct integers.
        for x in range(X):
            bloomfilter.add(x)
        print((time.time() - t) / X * NS, end=' ')
        t = time.time()
        # Phase 2: look up the X integers that were just added.
        for x in range(X):
            x in bloomfilter
        print((time.time() - t) / X * NS, end=' ')
        t = time.time()
        # Phase 3: look up X integers that were never added.
        for x in range(X, 2*X):
            x in bloomfilter
        # Last field of the row: default end='\n' terminates the line.
        print((time.time() - t) / X * NS)
示例#6
0
import re

# HTTP header fields with browser-like values; Host targets tieba.baidu.com.
headers = {
    'User-Agent': (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/73.0.3683.86 Safari/537.36"
    ),
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Host': 'tieba.baidu.com',
}

# Site root in its https and plain-http forms.
baidu_base_url = 'https://tieba.baidu.com'
baidu_base_url_no_https = 'http://tieba.baidu.com'

from pybloom_live.pybloom import BloomFilter

# Module-level Bloom filter: capacity 2 << 15 (= 65536), error rate 0.01.
# NOTE(review): presumably holds title URLs already seen - confirm in callers.
title_url_bloom = BloomFilter(
    capacity=2 << 15,
    error_rate=0.01,
)


class BaiduTiebaSpider(scrapy.Spider):
    name = "tieba"
    allowed_domains = ['tieba.baidu.com']
    root_url = 'https://tieba.baidu.com/f?kw=%E7%9B%B8%E4%BA%B2&ie=utf-8&pn='

    custom_settings = {
        'ITEM_PIPELINES': {
            'spider.pipelines.TiebaPipeline': 300
        },
    }

    MAX_DEEP_INDEX = 1000