def __init__(self, bloom_name):
     """Load the Bloom filter saved as ``<bloom_name>.blm`` or start a new one.

     :param bloom_name: base name of the filter file; ``.blm`` is appended.
     """
     self.bloom_path = '%s.blm' % bloom_name
     if os.path.exists(self.bloom_path):
         # Read through a context manager so the file handle is closed as
         # soon as the filter is loaded instead of leaking until GC.
         with open(self.bloom_path, 'rb') as bloom_file:
             self.bf = BloomFilter.fromfile(bloom_file)
     else:
         # No saved filter yet: create an empty one in memory.
         self.bf = BloomFilter(20000, 0.001)
Пример #2
0
    def __init__(self,
                 endpoint=config.config['general']['dbpedia']['endpoint'],
                 one_hop_bloom_file=config.config['general']['dbpedia']
                 ['one_hop_bloom_file'],
                 two_hop_bloom_file=config.config['general']['dbpedia']
                 ['two_hop_bloom_file']):
        """Set up the endpoint and load the one-hop / two-hop Bloom filters.

        :param endpoint: value forwarded to the parent class initialiser.
        :param one_hop_bloom_file: path of the serialised one-hop filter;
            ``self.one_hop_bloom`` is ``None`` when the file is missing.
        :param two_hop_bloom_file: path template for the two-hop filters;
            ``'spo2'`` in it is replaced by ``'spo2True'`` / ``'spo2False'``.
        """
        super(DBpedia, self).__init__(endpoint)
        self.type_uri = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"

        if not os.path.exists(one_hop_bloom_file):
            self.one_hop_bloom = None
        else:
            with open(one_hop_bloom_file, 'rb') as one_hop_handle:
                self.one_hop_bloom = BloomFilter.fromfile(one_hop_handle)
        self.two_hop_bloom_file = two_hop_bloom_file

        # One scalable filter per boolean key, each backed by its own file.
        self.two_hop_bloom = {}
        for flag in (True, False):
            flag_path = two_hop_bloom_file.replace('spo2', 'spo2' + str(flag))
            if not os.path.exists(flag_path):
                # Nothing on disk for this key: start with an empty filter.
                self.two_hop_bloom[flag] = ScalableBloomFilter(
                    mode=ScalableBloomFilter.LARGE_SET_GROWTH)
                continue
            with open(flag_path, 'rb') as two_hop_handle:
                self.two_hop_bloom[flag] = ScalableBloomFilter.fromfile(
                    two_hop_handle)

        self.two_hop_bloom_counter = 0
Пример #3
0
 def __init__(self):
     """Load the Bloom filter from ``bloomFilter.blm`` or create a new one."""
     self.filename = 'bloomFilter.blm'
     if os.path.exists(self.filename):
         # Context manager closes the handle once the filter is read,
         # instead of leaving it open until garbage collection.
         with open(self.filename, 'rb') as bloom_file:
             self.bf = BloomFilter.fromfile(bloom_file)
     else:
         # No saved filter on disk: start with an empty one.
         self.bf = BloomFilter(100000000, 0.001)
Пример #4
0
 def open_spider(self, spider):
     """Load the saved Bloom filter when it exists, otherwise create one.

     :param spider: the spider being opened; not used by this method.
     """
     if os.path.exists(self.file_name):
         # Close the file handle right after loading rather than leaking it.
         with open(self.file_name, 'rb') as bloom_file:
             self.bf = BloomFilter.fromfile(bloom_file)
         # Remember how many entries the filter held when the file was read.
         self.cap_begin = len(self.bf)
         print('open blm file success')
         print('初始容量:%d' % self.cap_begin)
     else:
         # NOTE(review): cap_begin is not assigned on this path; confirm it
         # is initialised elsewhere before anything reads it.
         self.bf = BloomFilter(100000000, 0.001)
         print('Not find the blm file, creat one')
Пример #5
0
    def test_train_bloom_filter__from_file(self):
        """Check membership answers of the filter loaded from the resource file.

        Ten names drawn from ``self.names`` must be reported as present, and
        each of them prefixed with ten random letters must be reported absent.
        """
        with open('./resources/hot_names_bloom_filter', 'rb') as bloom_file:
            bloom = BloomFilter.fromfile(bloom_file)

        for known_name in sample(self.names, 10):
            assert known_name in bloom

            # Build a name that should not have been added to the filter.
            random_prefix = ''.join(sample(ascii_letters, 10))
            assert f'{random_prefix}{known_name}' not in bloom
Пример #6
0
 def __init__(self, bloom_name):
     """Load ``./bf/<bloom_name>.blm`` or start a new Bloom filter.

     The ``./bf`` directory is created when it does not exist yet.

     :param bloom_name: base name of the filter file inside ``./bf``.
     """
     bloom_dir = './bf'
     # exist_ok avoids the check-then-create race of exists() + makedirs().
     os.makedirs(bloom_dir, exist_ok=True)
     self.bloom_path = '%s/%s.blm' % (bloom_dir, bloom_name)
     if os.path.exists(self.bloom_path):
         # Close the handle as soon as the filter has been read.
         with open(self.bloom_path, 'rb') as bloom_file:
             self.bf = BloomFilter.fromfile(bloom_file)
     else:
         # No saved filter yet: create an empty one in memory.
         self.bf = BloomFilter(20000, 0.001)
Пример #7
0
    def bloom_readfrom_db(self):
        """Restore ``self.bloom`` from MongoDB, or create an empty filter.

        Looks up the document whose ``_id`` equals ``self.taskCode`` in the
        ``bloom`` collection. When found, its ``bloom_data`` bytes are
        written to the scratch file ``tempFile`` and read back with
        ``BloomFilter.fromfile``. The scratch file is only touched when
        there is data to load, and both handles are closed deterministically.
        """
        bloom_dict = self.myMongo["bloom"].find_one({"_id": self.taskCode})

        if bloom_dict:  # a stored Bloom filter exists: load it
            # Write the serialised filter out, closing (and flushing) the
            # handle before the file is reopened for reading.
            with open("tempFile", "wb") as temp_file:
                temp_file.write(bloom_dict["bloom_data"])
            with open("tempFile", "rb") as bloom_file:
                self.bloom = BloomFilter.fromfile(bloom_file)
        else:
            self.bloom = BloomFilter(capacity=1000000, error_rate=0.00001)
Пример #8
0
def bloom_url(url):
    """Check *url* against the on-disk Bloom filter, recording it when new.

    :param url: the URL to test.
    :return: ``0`` when the filter already reports the URL, ``1`` when the
        URL was added and the filter written back to disk.
    """
    bloom_path = r'C:\spiders\zhilian_celery\bloom.blm'
    if os.path.exists(bloom_path):
        # Context manager releases the read handle once the filter is loaded.
        with open(bloom_path, 'rb', buffering=40) as bloom_file:
            bf = BloomFilter.fromfile(bloom_file)
    else:
        bf = BloomFilter(10000000, 0.001)

    if url in bf:
        print(1)
        return 0

    bf.add(url)
    # Closing the write handle guarantees the data is flushed to disk
    # before the function returns.
    with open(bloom_path, 'wb') as bloom_file:
        bf.tofile(bloom_file)
    return 1
Пример #9
0
 def filter_url(self, url):
     """De-duplicate URLs with a Bloom filter persisted as ``<name>.blm``.

     :param url: the URL to check for duplication.
     :return: ``False`` when the filter already reports the URL, ``True``
         when the URL was added and the filter saved.
     """
     bloom_path = '{}.blm'.format(self.name)
     if os.path.exists(bloom_path):
         # Load the saved filter, closing the handle right afterwards.
         with open(bloom_path, 'rb') as bloom_file:
             bf = BloomFilter.fromfile(bloom_file)
     else:
         # No file yet: create a new filter held in memory.
         bf = BloomFilter(1000000, 0.01)
     if url in bf:
         return False
     # Not seen before: record the URL and persist the updated filter.
     bf.add(url)
     with open(bloom_path, 'wb') as bloom_file:
         bf.tofile(bloom_file)
     return True
Пример #10
0
def bloom_file_init():
    """Add every site URL stored in MongoDB to the ``sites.blm`` Bloom filter.

    Loads ``../spiders/sites.blm`` when it exists (otherwise starts a new
    filter), adds the ``url`` field of every document in the ``sites`` and
    ``sites_unverified`` collections of the ``site`` database, prints each
    URL with the value returned by ``add``, and writes the filter back.
    """
    path = '../spiders/sites.blm'
    if os.path.exists(path):
        # Read the existing filter; the handle is closed on block exit.
        with open(path, 'rb') as bloom_file:
            bf = BloomFilter.fromfile(bloom_file)
    else:
        # No file yet: create the filter now, it is saved at the end.
        bf = BloomFilter(10000000, 0.01)

    with MongoClient(get_project_settings()['MONGODB_URL']) as client:
        collections = (client.site.sites, client.site.sites_unverified)
        for collection in collections:
            for x in collection.find():
                result = bf.add(x['url'])
                print(x['url'], ' ', result)

    # Close the write handle explicitly so the data is flushed to disk.
    with open(path, 'wb') as bloom_file:
        bf.tofile(bloom_file)
Пример #11
0
def Bulon():
    """Return the Bloom filter for ``DATABASE``, loading it from disk if saved.

    :return: the filter read from ``布隆文件/<DATABASE>.blm`` when that file
        exists, otherwise a new empty ``BloomFilter``.
    """
    bloom_path = '布隆文件/{}.blm'.format(DATABASE)
    if os.path.exists(bloom_path):
        # Context manager closes the handle instead of leaking it.
        with open(bloom_path, 'rb') as bloom_file:
            bf = BloomFilter.fromfile(bloom_file)
    else:
        bf = BloomFilter(1000000, 0.001)
    return bf
Пример #12
0
 def mapper_init(self):
     """Read ``resources/hot_user_ids.bf`` under ``basedir`` into ``self.filter``."""
     bloom_path = os.path.join(basedir, 'resources/hot_user_ids.bf')
     with open(bloom_path, 'rb') as bloom_file:
         self.filter = BloomFilter.fromfile(bloom_file)
Пример #13
0
# 导入库
import os
from pybloom_live import BloomFilter

# Sample data
animals = [
    'dog', 'cat', 'giraffe', 'fly', 'mosquito', 'horse', 'eagle', 'bird',
    'bison', 'boar', 'butterfly', 'ant', 'anaconda', 'bear', 'chicken',
    'dolphin', 'donkey', 'crow', 'crocodile', 'testadd'
]

# Load the filter from test.blm when the file exists, otherwise create it.
is_exist = os.path.exists('test.blm')
if is_exist:
    # Context manager closes the read handle once the filter is loaded.
    with open('test.blm', 'rb') as bloom_file:
        bf = BloomFilter.fromfile(bloom_file)
else:
    # No file yet: create a new filter object.
    bf = BloomFilter(20000, 0.001)

# Skip values the filter already reports; add and persist the others.
for i in range(10):
    if i in bf:
        print('pass')
    else:
        print('add %s' % i)
        bf.add(i)
        # Close the write handle each time so nothing is left unflushed.
        with open('test.blm', 'wb') as bloom_file:
            bf.tofile(bloom_file)

#判断是否存在
Пример #14
0
                    items = items[:-1]

                    for i in range(2):
                        if not all([
                                item.startswith('<') and item.endswith('>')
                                for item in items[i:i + 2]
                        ]):
                            continue
                        key = ':'.join([items[i][1:-1], items[i + 1][1:-1]])
                        bloom.add(key)

        with open(os.path.join(blooms_path, 'spo1.bloom'), 'wb') as f:
            bloom.tofile(f)

    with open(os.path.join(blooms_path, 'spo1.bloom'), 'rb') as f:
        one_hop_bloom = BloomFilter.fromfile(f)

        ds = LC_Qaud_Linked(
            path=os.path.join(args.base_path, args.dataset_path))
        ds.load()
        ds.parse()
        for row in ds.qapairs:
            for item in row.sparql.where_clause:
                if item[0].startswith('<'):
                    key = ':'.join([item[0][1:-1], item[1][1:-1]])
                elif item[2].startswith('<'):
                    key = ':'.join([item[1][1:-1], item[2][1:-1]])
                else:
                    key = ''
                if '#type' not in key and key != '' and key not in one_hop_bloom:
                    print(key)
Пример #15
0
def _open_bloom(infile):
    """Load and return the Bloom filter serialised in the file *infile*.

    :param infile: path of the file to read.
    :return: the ``BloomFilter`` built by ``BloomFilter.fromfile``.
    """
    # Close the handle once the filter is loaded instead of leaking it.
    with open(infile, "rb") as nb:
        return BloomFilter.fromfile(nb)
Пример #16
0
import os
from pybloom_live import BloomFilter

# coon = pymysql.connect(host='127.0.0.1', user='******', passwd='qwer', db='haining')
# cur = coon.cursor()
# cur.execute("SELECT room_id from haining_room")
# room_urls = cur.fetchall()

ls = ["1049be49dc584707"]
os.chdir(r'E:\Myproject\Scan\chizhou\chizhou\spiders')

# Load the Bloom filter from chizhou.blm when it exists, otherwise create it.
is_exist = os.path.exists('chizhou.blm')
if is_exist:
    # Context manager closes the read handle once the filter is loaded.
    with open('chizhou.blm', 'rb') as bloom_file:
        bf = BloomFilter.fromfile(bloom_file)
else:
    # No file yet: create the filter object.
    bf = BloomFilter(1000000, 0.0000001)

# i is the 1-based count printed for each newly added entry.
i = 1
for room_url in ls:
    if room_url in bf:
        print('pass')
    else:
        # Not seen before: add it to the Bloom filter.
        bf.add(room_url)
        print('添加了 %s 个' % i)
        i += 1
# 创建,写入布隆文件(单次写入)