def __init__(self, endpoint=None, one_hop_bloom_file=None, two_hop_bloom_file=None):
    """Initialize the DBpedia endpoint wrapper and load its bloom filters.

    Args:
        endpoint: SPARQL endpoint URL. Defaults to the configured
            ``general.dbpedia.endpoint`` value.
        one_hop_bloom_file: Path to a serialized one-hop ``BloomFilter``.
            Defaults to ``general.dbpedia.one_hop_bloom_file``.
        two_hop_bloom_file: Path template for the two-hop
            ``ScalableBloomFilter`` files; the substring ``'spo2'`` is
            expanded per direction flag (``'spo2True'`` / ``'spo2False'``).
            Defaults to ``general.dbpedia.two_hop_bloom_file``.
    """
    # Resolve config-based defaults lazily: the original evaluated
    # config.config[...] in the signature, i.e. once at import time,
    # which crashes at import if the config is not loaded yet.
    dbpedia_cfg = config.config['general']['dbpedia']
    if endpoint is None:
        endpoint = dbpedia_cfg['endpoint']
    if one_hop_bloom_file is None:
        one_hop_bloom_file = dbpedia_cfg['one_hop_bloom_file']
    if two_hop_bloom_file is None:
        two_hop_bloom_file = dbpedia_cfg['two_hop_bloom_file']

    super(DBpedia, self).__init__(endpoint)
    self.type_uri = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"

    # The one-hop filter is optional: absent file -> None.
    if os.path.exists(one_hop_bloom_file):
        with open(one_hop_bloom_file, 'rb') as bloom_file:
            self.one_hop_bloom = BloomFilter.fromfile(bloom_file)
    else:
        self.one_hop_bloom = None

    self.two_hop_bloom_file = two_hop_bloom_file
    # Two scalable filters keyed by a boolean direction flag; a missing
    # file gets a fresh, growable filter instead.
    self.two_hop_bloom = dict()
    for item in [True, False]:
        file_path = two_hop_bloom_file.replace('spo2', 'spo2' + str(item))
        if os.path.exists(file_path):
            with open(file_path, 'rb') as bloom_file:
                self.two_hop_bloom[item] = ScalableBloomFilter.fromfile(
                    bloom_file)
        else:
            self.two_hop_bloom[item] = ScalableBloomFilter(
                mode=ScalableBloomFilter.LARGE_SET_GROWTH)
    self.two_hop_bloom_counter = 0
async def run(self) -> None:
    """Start the crawler: restore/create the bloom filter, connect to the
    database, then run the log producer and 8 worker consumers until one
    of them raises.
    """
    try:
        # Reuse a previously persisted filter so hashes seen before a
        # restart are still recognized as duplicates.
        with open('/data/bloom-filter', 'rb') as f:
            log('debug', 'Using saved bloom-filter')
            self.filter = ScalableBloomFilter.fromfile(f)
    except FileNotFoundError:
        log('debug', 'Creating new bloom-filter')
        self.filter = ScalableBloomFilter(initial_capacity=100000)
    # Keep retrying the pool creation until the 'db' host accepts
    # connections (DNS failure, refused connection, or Postgres still
    # starting up are all treated as transient).
    self.conn_pool = await retry(
        partial(asyncpg.create_pool, host='db', user='******',
                database='ipfs_crawler'),
        'database', gaierror, ConnectionRefusedError,
        asyncpg.CannotConnectNowError)
    # start consumers
    for _ in range(8):
        self.workers.append(asyncio.ensure_future(self.worker()))
    # start producer
    self.producer: Future = asyncio.ensure_future(self.read_logs())
    log('info', 'Started crawling')
    # If an exception is thrown in the background task,
    # our crawler should not ignore it and continue to run, but throws it.
    await asyncio.gather(self.producer, *self.workers)
logger.setLevel(logging.INFO)

# File handler: persist all log records to log.txt.
handler = logging.FileHandler("log.txt")
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)

# Console handler: mirror the same format to the terminal.
chhandler = logging.StreamHandler()
chhandler.setFormatter(formatter)
logger.addHandler(chhandler)

from pybloom_live import ScalableBloomFilter

# Persistence locations for the crawler's de-duplication state.
blommfilter_file = "bloomfilter.suffix"   # URL de-dup bloom filter
titlefilter_file = "titlefilter.suffix"   # title de-dup bloom filter
deque_file = "sites.pkl"                  # pending sites queue (pickled)

try:
    # Use context managers so the file handles are closed (the original
    # leaked the two bloom-filter handles), and catch only the failures
    # expected when the state files are missing or corrupt — the
    # original bare `except:` also swallowed KeyboardInterrupt/SystemExit.
    with open(blommfilter_file, "rb") as f:
        sbf = ScalableBloomFilter.fromfile(f)
    with open(titlefilter_file, "rb") as f:
        sbf_title = ScalableBloomFilter.fromfile(f)
    with open(deque_file, 'rb') as f:
        sites_deque = pickle.load(f)
except (OSError, EOFError, pickle.UnpicklingError):
    # De-dup state missing or unreadable: start with fresh filters.
    # NOTE(review): the original leaves `sites_deque` undefined on this
    # path, so downstream code raises NameError if sites.pkl is absent —
    # TODO confirm intended fallback (e.g. an empty deque) with the author.
    sbf = ScalableBloomFilter(
        initial_capacity=5000,
        error_rate=0.001,
        mode=ScalableBloomFilter.LARGE_SET_GROWTH,
    )
    sbf_title = ScalableBloomFilter(
        initial_capacity=5000,
        error_rate=0.001,
        mode=ScalableBloomFilter.LARGE_SET_GROWTH,
    )