class Collector(threading.Thread):
    """Background thread that pulls pending requests from redis into a
    local deque, from which the spider workers take tasks via
    ``get_requests``.
    """

    def __init__(self, table_folder, process_num=None):
        """
        @param table_folder: name interpolated into the redis table keys
        @param process_num: process number; used to build a node mark that is
                            unique per host + process in the spider-status table
        """
        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._todo_requests = collections.deque()

        self._tab_requests = setting.TAB_REQUSETS.format(
            table_folder=table_folder)
        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(
            table_folder=table_folder)

        # Unique mark identifying this node/process in the status table.
        self._spider_mark = LOCAL_HOST_IP + (
            "_%s" % process_num if process_num else "_0")

        self._interval = setting.COLLECTOR_SLEEP_TIME
        self._request_count = setting.COLLECTOR_TASK_COUNT
        self._is_collector_task = False

        # Start from a clean status table for this run.
        self._db.clear(self._tab_spider_status)

    def run(self):
        """Poll redis for tasks until ``stop()`` is called."""
        while not self._thread_stop:
            try:
                self.__input_data()
            except Exception as e:
                log.exception(e)

            self._is_collector_task = False
            time.sleep(self._interval)

    def stop(self):
        self._thread_stop = True

    def __input_data(self):
        """Claim a batch of tasks from redis and buffer them locally."""
        # Local buffer already holds enough work; skip this round.
        if len(self._todo_requests) >= self._request_count:
            return

        # Report this node as idle (score 0) in the status table.
        self._db.zadd(self._tab_spider_status, self._spider_mark, 0)

        request_count = self._request_count  # default allocation

        # Share the backlog evenly among idle nodes:
        # allocation = total tasks / idle nodes + 1, capped by the configured count.
        spider_wait_count = self._db.zget_count(
            self._tab_spider_status, priority_min=0, priority_max=0)
        if spider_wait_count:
            task_count = self._db.zget_count(self._tab_requests)
            request_count = min(
                task_count // spider_wait_count + 1, self._request_count)

        if not request_count:
            return

        # Claim tasks whose score is at or below the timeout threshold: both
        # never-taken tasks and tasks whose previous claim has timed out.
        # The claimed tasks are re-scored with the current timestamp, marking
        # them "in progress"; they are removed by request_buffer when done, or
        # become claimable again after REQUEST_TIME_OUT if not finished.
        current_timestamp = tools.get_current_timestamp()
        priority_max = current_timestamp - setting.REQUEST_TIME_OUT
        requests_list = self._db.zrangebyscore_set_score(
            self._tab_requests,
            priority_min="-inf",
            priority_max=priority_max,
            score=current_timestamp,
            count=request_count,
        )

        if requests_list:
            self._is_collector_task = True
            # Report this node as busy (score 1).
            self._db.zadd(self._tab_spider_status, self._spider_mark, 1)
            self.__put_requests(requests_list)

    def __put_requests(self, requests_list):
        """Deserialize raw request strings and append them to the local buffer."""
        for request in requests_list:
            try:
                # SECURITY NOTE(review): eval() on data read from redis will
                # execute arbitrary code if the store is ever written by an
                # untrusted party; consider ast.literal_eval or json instead.
                request_dict = {
                    "request_obj": Request.from_dict(eval(request)),
                    "request_redis": request,
                }
            except Exception as e:
                log.exception("""
                error %s
                request %s
                """ % (e, request))

                request_dict = None

            if request_dict:
                self._todo_requests.append(request_dict)

    def get_requests(self, count):
        """Pop up to *count* buffered requests (FIFO) and return them."""
        count = min(count, len(self._todo_requests))
        return [self._todo_requests.popleft() for _ in range(count)]

    def get_requests_count(self):
        # Prefer the local buffer size; fall back to the redis backlog size.
        return len(self._todo_requests) or self._db.zget_count(
            self._tab_requests)

    def is_collector_task(self):
        return self._is_collector_task
class RequestBuffer(threading.Thread, Singleton):
    """Singleton background thread that batches new requests into redis,
    runs deferred callbacks, and removes finished requests.
    """

    # Class-level dedup filter, shared by all instances (Singleton anyway).
    dedup = None

    def __init__(self, table_folder):
        # Singleton: __init__ may be called repeatedly; only initialize once.
        if not hasattr(self, "_requests_deque"):
            super(RequestBuffer, self).__init__()

            self._thread_stop = False
            self._is_adding_to_db = False

            self._requests_deque = collections.deque()
            self._del_requests_deque = collections.deque()
            self._db = RedisDB()

            self._table_request = setting.TAB_REQUSETS.format(
                table_folder=table_folder)
            self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
                table_folder=table_folder)

            if not self.__class__.dedup and setting.REQUEST_FILTER_ENABLE:
                self.__class__.dedup = Dedup(
                    filter_type=Dedup.ExpireFilter,
                    name=table_folder,
                    expire_time=2592000,  # entries expire after one month
                    to_md5=False,
                )

    def run(self):
        """Flush the buffer to redis roughly once per second until stopped."""
        while not self._thread_stop:
            try:
                self.__add_request_to_db()
            except Exception as e:
                log.exception(e)

            tools.delay_time(1)

    def stop(self):
        self._thread_stop = True

    def put_request(self, request):
        """Queue a request (or a callable to run after the next flush)."""
        self._requests_deque.append(request)

        if self.get_requests_count() > MAX_URL_COUNT:
            # Buffer exceeded the cap; flush proactively instead of waiting
            # for the next run() tick.
            self.flush()

    def put_del_request(self, request):
        """Queue a finished request for removal from the redis task table."""
        self._del_requests_deque.append(request)

    def put_failed_request(self, request, table=None):
        """Store a failed request in the failed-requests table (best effort)."""
        try:
            request_dict = request.to_dict
            self._db.zadd(
                table or self._table_failed_request, request_dict,
                request.priority)
        except Exception as e:
            log.exception(e)

    def flush(self):
        try:
            self.__add_request_to_db()
        except Exception as e:
            log.exception(e)

    def get_requests_count(self):
        return len(self._requests_deque)

    def is_adding_to_db(self):
        return self._is_adding_to_db

    def __add_request_to_db(self):
        """Drain the buffer: dedup, batch-insert into redis, run callbacks,
        then delete finished requests (never ones just re-added)."""
        request_list = []
        prioritys = []
        callbacks = []
        # Every request string written to redis during THIS drain, across all
        # intermediate batches — needed below to protect them from deletion.
        added_requests = set()

        while self._requests_deque:
            request = self._requests_deque.popleft()
            self._is_adding_to_db = True

            if callable(request):
                # A deferred function: run it only after all requests are
                # stored. NOTE: beware of closures over loop variables; bind
                # values via default args (def cb(x=x): ...) so the callback
                # does not see the loop's final value.
                callbacks.append(request)
                continue

            priority = request.priority

            # Skip the request if dedup is enabled and its fingerprint is
            # already in the filter.
            if (request.filter_repeat
                    and setting.REQUEST_FILTER_ENABLE
                    and not self.__class__.dedup.add(request.fingerprint)):
                log.debug("request已存在 url = %s" % request.url)
                continue
            else:
                request_list.append(str(request.to_dict))
                prioritys.append(priority)

            if len(request_list) > MAX_URL_COUNT:
                # Intermediate flush of an oversized batch.
                added_requests.update(request_list)
                self._db.zadd(self._table_request, request_list, prioritys)
                request_list = []
                prioritys = []

        # Final (possibly partial) batch.
        if request_list:
            added_requests.update(request_list)
            self._db.zadd(self._table_request, request_list, prioritys)

        # Run deferred callbacks now that all requests are stored.
        for callback in callbacks:
            try:
                callback()
            except Exception as e:
                log.exception(e)

        # Delete finished requests.
        if self._del_requests_deque:
            request_done_list = []
            while self._del_requests_deque:
                request_done_list.append(self._del_requests_deque.popleft())

            # BUGFIX: exclude everything added in this drain. Previously only
            # the final unflushed batch was subtracted, so requests written in
            # earlier intermediate batches (request_list was reset to [] after
            # each flush) could be deleted immediately after insertion.
            request_done_list = list(set(request_done_list) - added_requests)

            if request_done_list:
                self._db.zrem(self._table_request, request_done_list)

        self._is_adding_to_db = False