async def set_result(self, request: Request, response: Response, task_request: Request):
    """
    Record the outcome of a processed request.

    The request is first removed from the in-flight ``pending`` map, then
    handled according to ``response.ok``:

    * ``1``  - success, nothing else is recorded.
    * ``-1`` - ``response.status`` is stored in ``failure`` and the request
      is not re-queued.
    * anything else - the per-request counter in ``failure`` is incremented
      (or started at 1) and the request is added to the queue again.

    @param request: request whose result is being stored
    @param response: response produced for the request
    @param task_request: not used by this implementation
    @return: True on success, False when ``response.ok == -1``, otherwise None
    """
    key = request.serialize(self.module)

    # Whatever the outcome, the request is no longer in flight.
    self.pending.pop(key, None)

    outcome = response.ok
    if outcome == 1:
        return True
    if outcome == -1:
        self.failure[key] = response.status
        return False

    # Any other outcome: count the attempt and schedule the request again.
    # NOTE(review): no upper bound on the number of retries is visible here.
    self.failure[key] = self.failure[key] + 1 if key in self.failure else 1
    await self.add(request)
async def check_pending_task(self):
    """
    Move pending requests that have timed out back to the waiting queue.

    The check runs at most once every 10 seconds; calls made earlier than
    that return immediately. Every entry of the pending hash whose stored
    timestamp is older than ``PENDING_THRESHOLD`` seconds is re-added to the
    waiting sorted set (scored by the request priority) and removed from the
    pending hash, both in a single pipeline.
    """
    started = time.time()
    if not (started - self._last_check_pending_task_time > 10):
        return
    self._last_check_pending_task_time = started

    now_time = time.time()
    with await self.pool as conn:
        pending_list = await conn.hgetall(self._pending_key)

        # zadd takes a flat score, member, score, member, ... sequence.
        score_member_pairs = []
        expired_members = []
        for member, picked_at in pending_list.items():
            if not (now_time - int(picked_at) > self._setting["PENDING_THRESHOLD"]):
                continue
            request = Request.unserialize(member, self.module)
            score_member_pairs += [request.priority, member]
            expired_members.append(member)

        if not score_member_pairs:
            return

        pipe = conn.pipeline()
        pipe.zadd(self._waiting_key, *score_member_pairs)
        pipe.hdel(self._pending_key, *expired_members)
        result = await pipe.execute()
        logger.info(f"pendings: {len(pending_list)}, del_pending: {result[1]}, add_waitings: {result[0]}")
async def get(self, priority):
    """
    Take one request from the in-memory waiting queue.

    The serialized request is recorded in ``pending`` together with the
    current timestamp before it is handed out.

    @param priority: not used by this implementation
    @return: the deserialized request, or None when the queue is empty
    """
    if self.waiting.empty():
        return None

    entry = await self.waiting.get()
    # The serialized request is the second element of the queue entry.
    serialized = entry[1]
    self.pending[serialized] = get_timestamp()
    return Request.unserialize(serialized, self.module)
async def get(self, priority):
    """
    Take one request from the message pool.

    The message body is decoded and deserialized into a request, and the
    original message is attached to it as ``request.message``.

    @param priority: not used by this implementation
    @return: the request, or None when no message was received
    """
    # NOTE(review): the meaning of the argument 2 is defined by the pool's
    # subscribe(); confirm against its implementation.
    message = await self.pool.subscribe(2)
    if not message:
        return None

    body = message.body.decode()
    request = Request.unserialize(body, self.module)
    request.message = message
    return request
async def _process_task(self, request: Request, task_id):
    """
    Run one request through the full pipeline.

    Any exception raised along the way is logged with a traceback and
    swallowed, so a failing task does not propagate to the caller.

    @param request: request object taken from the scheduler
    @param task_id: not used by this implementation
    """
    try:
        # Work on the object returned by request.replace(), keeping the
        # scheduled request itself for the result bookkeeping below.
        task_request = request.replace()

        # Perform the request (and its callback).
        response = await self.handle_request(task_request)

        # Post-process the response.
        await self.process_response(request, response)

        # Report the outcome back to the scheduler.
        await self.scheduler.set_result(request, response, task_request)
    except Exception:
        tb_limit = self.logging.get_tb_limit()
        debug_msg = traceback.format_exc(tb_limit)
        logger.error(f"{request} callback error \n{debug_msg}")
async def _process_start_urls(self):
    """
    Seed the scheduler with one request per entry in ``start_urls``.

    Each request uses ``self.parse`` as its callback. Failures are logged
    with a traceback and swallowed.
    """
    try:
        request_list = []
        for url in self.start_urls:
            request_list.append(Request(url=url, callback=self.parse))

        # Skip the scheduler call entirely when there is nothing to add.
        counts = await self.scheduler.add(request_list) if request_list else 0
        logger.info(f"init start urls end, set {counts}")
    except Exception:
        # Seeding the start URLs failed.
        tb_limit = self.logging.get_tb_limit()
        debug_msg = traceback.format_exc(tb_limit)
        logger.error(f"init start urls error \n{debug_msg}")
async def get(self, priority: typing.Optional[typing.Union[int, list]]):
    """
    Fetch one request from Redis.

    The highest-scored member within the requested priority range is
    removed from the waiting sorted set and recorded in the pending hash
    with the Redis server time, all inside one Lua script so the move is
    atomic.

    @param priority: None fetches across all priorities; an int fetches only
        that priority; otherwise the value is expanded by
        ``get_priority_list`` into (min, max) ranges that are tried in order
    @return: the request, or None when nothing matched or an error occurred
        (errors are logged, not raised)
    """
    priority_list = []
    if priority is None:
        priority_list.append(("-inf", "+inf"))
    elif isinstance(priority, int):
        priority_list.append((priority, priority))
    else:
        priority_list = get_priority_list(priority)

    # Only real key names go in KEYS; the score bounds travel in ARGV.
    # Redis Cluster hashes every KEYS entry to a slot, so passing "-inf" or a
    # number as a key can make the script fail with a CROSSSLOT error.
    lua = """
    redis.replicate_commands()
    local waiting_key = KEYS[1]
    local pending_key = KEYS[2]
    local min = ARGV[1]
    local max = ARGV[2]
    -- pick the highest-scored member inside [min, max]
    local result = redis.call('zrevrangebyscore', waiting_key, max, min, 'LIMIT', 0, 1)
    if result and table.getn(result) > 0 then
        redis.call('zrem', waiting_key, result[1])
        redis.call('hset', pending_key, result[1], redis.call('TIME')[1])
        return result[1]
    end
    return nil
    """

    try:
        with await self.pool as conn:
            for _min, _max in priority_list:
                eval_result = await conn.eval(
                    lua,
                    keys=[self._waiting_key, self._pending_key],
                    args=[_min, _max],
                )
                if eval_result:
                    self.task_count += 1
                    return Request.unserialize(eval_result, self.module)
    except Exception:
        # Best effort: a Redis or deserialization error yields "no request".
        logger.error(f"get request error \n{traceback.format_exc()}")
    return None
async def set_result(self, request: Request, response: Response, task_request: Request):
    """
    Store the outcome of a request in Redis and update the counters.

    On success (``response.ok == 1``) the request is removed from the
    pending hash. Otherwise it is removed from the pending hash and written
    to the failure hash, keyed by the serialized request, with the
    serialized (task_request, response) pair as the value.

    @param request: request whose result is being stored
    @param response: response produced for the request
    @param task_request: the request object that was actually executed
    @return: None
    """
    request_ser = request.serialize(self.module)
    with await self.pool as conn:
        succeeded = response.ok == 1
        if not succeeded:
            failure_response = serialize_request_and_response(task_request, response)
            # Failure: drop from the pending hash and record in the failure
            # hash, both in one pipeline round trip.
            pipe = conn.pipeline()
            pipe.hdel(self._pending_key, request_ser)
            pipe.hset(self._failure_key, request_ser, failure_response)
            await pipe.execute()
            self.task_failure += 1
            return

        # Success: the request just leaves the pending hash.
        await conn.hdel(self._pending_key, request_ser)
        self.task_success += 1
async def start_requests(self):
    """
    Yield the initial requests of the spider.

    By default one request is produced per entry in ``start_urls``, with
    ``self.parse`` as the callback. Subclasses may override this method.
    """
    for start_url in self.start_urls:
        seed_request = Request(url=start_url, callback=self.parse)
        yield seed_request