def __export_to_db(self, tab_item, datas, is_update=False, update_keys=()):
    to_table = tools.get_info(tab_item, ":s_(.*?)_item$", fetch_one=True)

    # Metrics / validation
    self.check_datas(table=to_table, datas=datas)

    for pipeline in self._pipelines:
        if is_update:
            if to_table == self._task_table and not isinstance(
                pipeline, MysqlPipeline
            ):
                continue

            if not pipeline.update_items(to_table, datas, update_keys=update_keys):
                log.error(
                    f"{pipeline.__class__.__name__} failed to update data. table: {to_table} items: {datas}"
                )
                return False

        else:
            if not pipeline.save_items(to_table, datas):
                log.error(
                    f"{pipeline.__class__.__name__} failed to save data. table: {to_table} items: {datas}"
                )
                return False

    # If this is the task table and none of the pipelines above is a MySQL
    # pipeline, the task still has to be updated via MySQL.
    if not self._have_mysql_pipeline and is_update and to_table == self._task_table:
        self.mysql_pipeline.update_items(to_table, datas, update_keys=update_keys)
def add(self, sql, exception_callfunc=None):
    """
    Args:
        sql:
        exception_callfunc: exception callback

    Returns:
        number of affected rows
    """
    affect_count = None

    try:
        conn, cursor = self.get_connection()
        affect_count = cursor.execute(sql)
        conn.commit()
    except Exception as e:
        log.error(
            """
            error:%s
            sql:  %s
            """
            % (e, sql)
        )
        if exception_callfunc:
            exception_callfunc(e)
    finally:
        self.close_connection(conn, cursor)

    return affect_count
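# Usage sketch for add(): a single INSERT with an exception callback. The
# import path, table and column names below are assumptions for illustration.
from feapder.db.mysqldb import MysqlDB  # assumed module path

db = MysqlDB()  # omitted arguments fall back to the MYSQL_* values in setting

def on_insert_error(e):
    print("insert failed:", e)

affected = db.add(
    "insert ignore into news (title, url) values ('hello', 'https://example.com/1')",
    exception_callfunc=on_insert_error,
)
# `affected` is the number of inserted rows, or None if the statement raised.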
def _make_absolute(self, link):
    """Makes a given link absolute."""
    try:
        link = link.strip()

        # Parse the link with stdlib.
        parsed = urlparse(link)._asdict()

        # If link is relative, then join it with base_url.
        if not parsed["netloc"]:
            return urljoin(self.url, link)

        # Link is absolute; if it lacks a scheme, add one from base_url.
        if not parsed["scheme"]:
            parsed["scheme"] = urlparse(self.url).scheme

            # Reconstruct the URL to incorporate the new scheme.
            parsed = (v for v in parsed.values())
            return urlunparse(parsed)

    except Exception as e:
        log.error(
            "Invalid URL <{}> can't make absolute_link. exception: {}".format(link, e)
        )

    # Link is absolute and complete with scheme; nothing to be done here.
    return link
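# A hedged illustration of _make_absolute() behaviour; self.url and the links
# below are hypothetical values:
#
#   self.url = "https://example.com/list/page1"
#   self._make_absolute("/detail?id=1")           # -> "https://example.com/detail?id=1" (joined with base)
#   self._make_absolute("//cdn.example.com/a.js") # -> "https://cdn.example.com/a.js"    (scheme filled in)
#   self._make_absolute("https://other.com/b")    # -> returned unchanged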
def update_items(self, table, items: List[Dict], update_keys=()) -> bool:
    """
    Update data
    Args:
        table: table name
        items: data, [{}, {}, ...]
        update_keys: columns to update, e.g. ("title", "publish_time")

    Returns:
        whether the update succeeded: True / False
        If False, this batch is not written to the dedup DB, so it can be
        stored again later.
    """
    sql, datas = tools.make_batch_sql(
        table, items, update_columns=update_keys or list(items[0].keys())
    )
    update_count = self.to_db.add_batch(sql, datas)
    if update_count is None:
        log.error("Failed to update data in table %s" % table)
    else:
        # executemany reports 2 affected rows per row updated via
        # ON DUPLICATE KEY UPDATE, hence the division by 2
        msg = "Updated %s rows in %s" % (update_count // 2, table)
        if update_keys:
            msg += ", updated columns: {}".format(update_keys)
        log.info(msg)

    return update_count is not None
def delete(self, sql):
    """
    Delete
    Args:
        sql:

    Returns:
        True / False
    """
    try:
        conn, cursor = self.get_connection()
        cursor.execute(sql)
        conn.commit()
    except Exception as e:
        log.error(
            """
            error:%s
            sql:  %s
            """
            % (e, sql)
        )
        return False
    else:
        return True
    finally:
        self.close_connection(conn, cursor)
def update(self, coll_name, data: Dict, condition: Dict, upsert: bool = False):
    """
    Update
    Args:
        coll_name: collection name
        data: a single document, e.g. {"xxx": "xxx"}
        condition: update condition, e.g. {"_id": "xxxx"}
        upsert: insert the document if it does not exist, defaults to False

    Returns:
        True / False
    """
    try:
        collection = self.get_collection(coll_name)
        collection.update_one(condition, {"$set": data}, upsert=upsert)
    except Exception as e:
        log.error(
            """
            error:{}
            condition: {}
            """.format(e, condition)
        )
        return False
    else:
        return True
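# Usage sketch for update(): upsert a document by _id. The class name, import
# path, default settings fallback and field values are assumptions.
from feapder.db.mongodb import MongoDB  # assumed module path

mongo = MongoDB()  # assumed to fall back to the MONGO_* values in setting

ok = mongo.update(
    "news",                        # coll_name
    {"status": 1},                 # fields applied via $set
    {"_id": "5f2c0000deadbeef"},   # match condition
    upsert=True,                   # insert the document if it does not exist
)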
def update_items(self, tab_item, items_data, update_keys=()):
    """
    @summary:
    ---------
    @param tab_item: name of the items table in redis
    @param items_data: [item.to_dict] data
    @param update_keys: columns to update
    ---------
    @result:
    """
    to_table = tools.get_info(tab_item, ":s_(.*?)_item", fetch_one=True)
    sql, datas = tools.make_batch_sql(
        to_table,
        items_data,
        update_columns=update_keys or list(items_data[0].keys()),
    )
    update_count = self.to_db.add_batch(sql, datas)
    if update_count is None:
        log.error("Failed to update data in table %s" % to_table)
    else:
        # executemany reports 2 affected rows per row updated via
        # ON DUPLICATE KEY UPDATE, hence the division by 2
        msg = "Updated %s rows in %s" % (update_count // 2, to_table)
        if update_keys:
            msg += ", updated columns: {}".format(update_keys)
        log.info(msg)

    return update_count is not None
def add_batch(self, sql, datas: List[Dict]):
    """
    @summary: batch insert
    ---------
    @param sql: insert ignore into (xxx,xxx) values (%s, %s, %s)
    @param datas: list of rows, [[..], [..], ...]
    ---------
    @result: number of affected rows
    """
    affect_count = None

    try:
        conn, cursor = self.get_connection()
        affect_count = cursor.executemany(sql, datas)
        conn.commit()
    except Exception as e:
        log.error(
            """
            error:%s
            sql:  %s
            """
            % (e, sql)
        )
    finally:
        self.close_connection(conn, cursor)

    return affect_count
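# Usage sketch for add_batch(): pymysql-style %s placeholders with one row per
# entry in `datas`. Table and columns are hypothetical:
#
#   sql = "insert ignore into news (title, url) values (%s, %s)"
#   datas = [
#       ("hello", "https://example.com/1"),
#       ("world", "https://example.com/2"),
#   ]
#   affected = db.add_batch(sql, datas)  # affected row count, or None on error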
def wapper(*args, **kwargs):
    for i in range(3):
        try:
            return func(*args, **kwargs)
        except (err.InterfaceError, err.OperationalError) as e:
            log.error(
                """
                error:%s
                sql:  %s
                """
                % (e, kwargs.get("sql") or args[1])
            )
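# The snippet above is only the inner wrapper function; a plausible enclosing
# decorator (a sketch of the retry pattern, not necessarily the framework's
# exact code) would look like:
#
#   def auto_retry(func):
#       def wapper(*args, **kwargs):
#           ...  # body shown above: retry up to 3 times on connection errors
#       return wapper
#
# and it would be applied to the query methods, e.g.:
#
#   @auto_retry
#   def find(self, sql):
#       ...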
def run(self):
    """
    @summary: Overridden run method. Checks whether the tasks in MySQL are
              finished and stops once they are.
    ---------
    ---------
    @result:
    """
    try:
        self.create_batch_record_table()

        if not self._parsers:  # not in add_parser mode
            self._parsers.append(self)

        self._start()

        while True:
            try:
                self.heartbeat()
                if (
                    self.task_is_done() and self.all_thread_is_done()
                ):  # all redis tasks and all MySQL tasks are done
                    # (all_thread_is_done guards against updating the task state
                    # and ending the program while work is still in flight)
                    if not self._is_notify_end:
                        self.spider_end()
                        self.record_spider_state(
                            spider_type=2,
                            state=1,
                            batch_date=self._batch_date_cache,
                            spider_end_time=tools.get_current_date(),
                            batch_interval=self._batch_interval,
                        )
                        self._is_notify_end = True

                    if not self._keep_alive:
                        self._stop_all_thread()
                        break
                else:
                    self._is_notify_end = False

                self.check_task_status()

            except Exception as e:
                log.exception(e)

            tools.delay_time(10)  # check the spider state every 10 seconds

    except Exception as e:
        msg = "《%s》main thread exception, spider stopped. exception: %s" % (
            self._batch_name,
            e,
        )
        log.error(msg)
        self.send_msg(
            msg,
            level="error",
            message_prefix="《%s》spider ended abnormally" % self._batch_name,
        )

        os._exit(137)  # exit code 137 (reported as 35072 by wait()) so the spider manager can restart the crawler
def record_batch(self):
    """
    @summary: record batch info (initialization)
    ---------
    ---------
    @result:
    """
    # Query the total number of tasks
    sql = "select count(1) from %s%s" % (
        self._task_table,
        self._task_condition_prefix_where,
    )
    total_task_count = self._mysqldb.find(sql)[0][0]

    batch_date = tools.get_current_date(self._date_format)

    sql = (
        "insert into %s (batch_date, done_count, total_count, `interval`, interval_unit, create_time) "
        "values ('%s', %s, %s, %s, '%s', CURRENT_TIME)"
        % (
            self._batch_record_table,
            batch_date,
            0,
            total_task_count,
            self._batch_interval
            if self._batch_interval >= 1
            else self._batch_interval * 24,
            "day" if self._batch_interval >= 1 else "hour",
        )
    )

    affect_count = self._mysqldb.add(sql)  # None / 0 / 1 (1 means success)
    if affect_count:
        # Reset the batch date
        self._batch_date_cache = batch_date
        # Refresh os.environ["batch_date"] used by self.batch_date, otherwise
        # the date would still point to the previous batch
        os.environ["batch_date"] = self._batch_date_cache

        # Spider starts
        self.spider_begin()
        self.record_spider_state(
            spider_type=2,
            state=0,
            batch_date=batch_date,
            spider_start_time=tools.get_current_date(),
            batch_interval=self._batch_interval,
        )
    else:
        log.error("Failed to insert the new batch record")

    return affect_count
def _reconnect(self):
    # Check the connection state; reconnect automatically when the database
    # restarts or a configured timeout drops the connection
    retry_count = 0
    while True:
        try:
            retry_count += 1
            log.error(f"redis connection lost, reconnecting ({retry_count})")
            if self.get_connect():
                log.info("redis reconnected")
                return True
        except (ConnectionError, TimeoutError) as e:
            log.error(f"reconnect failed e: {e}")

        time.sleep(2)
def __init__(
    self, ip=None, port=None, db=None, user_name=None, user_pass=None, **kwargs
):
    # Values in setting may change at runtime, so the defaults are loaded here
    # instead of being used as parameter defaults
    if not ip:
        ip = setting.MYSQL_IP
    if not port:
        port = setting.MYSQL_PORT
    if not db:
        db = setting.MYSQL_DB
    if not user_name:
        user_name = setting.MYSQL_USER_NAME
    if not user_pass:
        user_pass = setting.MYSQL_USER_PASS

    try:
        self.connect_pool = PooledDB(
            creator=pymysql,
            mincached=1,
            maxcached=100,
            maxconnections=100,
            blocking=True,
            ping=7,
            host=ip,
            port=port,
            user=user_name,
            passwd=user_pass,
            db=db,
            charset="utf8mb4",
            cursorclass=cursors.SSCursor,
        )  # SSCursor is a server-side cursor; the default cursor keeps growing
        #    memory when inserting large batches from multiple threads

    except Exception as e:
        log.error(
            """
            Failed to connect to the database:
            ip: {}
            port: {}
            db: {}
            user_name: {}
            user_pass: {}
            exception: {}
            """.format(ip, port, db, user_name, user_pass, e)
        )
    else:
        log.debug("Connected to mysql database %s : %s" % (ip, db))
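# Instantiation sketch: omitted arguments fall back to setting.MYSQL_IP,
# MYSQL_PORT, MYSQL_DB, MYSQL_USER_NAME and MYSQL_USER_PASS. The connection
# values and table below are hypothetical:
#
#   db = MysqlDB(ip="127.0.0.1", port=3306, db="spider", user_name="root", user_pass="root")
#   db.add("insert ignore into news (title) values ('hello')")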
def run(self):
    while True:
        try:
            try:
                with RedisLock(
                    key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0
                ) as _lock:
                    if _lock.locked:
                        for user in self._load_user():
                            retry_times = 0
                            while retry_times <= self._login_retry_times:
                                try:
                                    user = self.login(user)
                                    if user:
                                        self.add_user(user)
                                    else:
                                        self.handle_login_failed_user(user)
                                    break
                                except NotImplementedError:
                                    log.error(
                                        f"{self.__class__.__name__} must implement the login method!"
                                    )
                                    os._exit(0)
                                except Exception as e:
                                    self.handel_exception(e)

                                log.debug(
                                    f"login failed, user: {user} retry_times: {retry_times}"
                                )
                                retry_times += 1
                            else:
                                self.handle_login_failed_user(user)

                        now_user_count = self._redisdb.hget_count(
                            self._tab_user_pool
                        )
                        log.info("current online user count: {}".format(now_user_count))

            except Exception as e:
                log.exception(e)

            if self._keep_alive:
                tools.delay_time(10)
            else:
                break

        except Exception as e:
            log.exception(e)
            tools.delay_time(1)
def update(self, sql):
    try:
        conn, cursor = self.get_connection()
        cursor.execute(sql)
        conn.commit()
    except Exception as e:
        log.error(
            """
            error:%s
            sql:  %s
            """
            % (e, sql)
        )
        return False
    else:
        return True
    finally:
        self.close_connection(conn, cursor)
def set_unique_key(self, table, key):
    try:
        sql = "alter table %s add unique (%s)" % (table, key)
        conn, cursor = self.get_connection()
        cursor.execute(sql)
        conn.commit()
    except Exception as e:
        log.error(table + " " + str(e) + " key = " + key)
        return False
    else:
        log.debug("Created unique index on table %s, index column: %s" % (table, key))
        return True
    finally:
        self.close_connection(conn, cursor)
def login(self) -> Optional[GuestUser]:
    """
    By default logs in with webdriver to generate cookies; may be overridden
    """
    with WebDriver(**self._kwargs) as driver:
        driver.get(self._page_url)

        cookies = driver.cookies

        for key in self._must_contained_keys:
            if key not in cookies:
                break
        else:
            user = GuestUser(user_agent=driver.user_agent, cookies=cookies)
            return user

        log.error("Failed to fetch cookies. cookies = {}".format(cookies))
        return None
def delete(self, table, condition: Dict):
    """
    Delete
    Args:
        table:
        condition: match condition

    Returns:
        True / False
    """
    try:
        collection = self.get_collection(table)
        collection.delete_one(condition)
    except Exception as e:
        log.error(
            """
            error:{}
            condition: {}
            """.format(e, condition)
        )
        return False
    else:
        return True
def export_items(self, tab_item, items_data):
    """
    @summary:
    ---------
    @param tab_item: name of the items table in redis
    @param items_data: [item.to_dict] data
    ---------
    @result:
    """
    to_table = tools.get_info(tab_item, ":s_(.*?)_item", fetch_one=True)
    sql, datas = tools.make_batch_sql(to_table, items_data)
    add_count = self.to_db.add_batch(sql, datas)
    datas_size = len(datas)
    if add_count is None:
        log.error("Failed to export data to table %s" % to_table)
    else:
        log.info(
            "Exported %s rows to %s, %s duplicates"
            % (datas_size, to_table, datas_size - add_count)
        )

    return add_count is not None
def update(self, table, data: Dict, condition: Dict):
    """
    Update
    Args:
        table: table name
        data: data, e.g. {"xxx": "xxx"}
        condition: update condition, e.g. {"_id": "xxxx"}

    Returns:
        True / False
    """
    try:
        collection = self.get_collection(table)
        collection.update_one(condition, {"$set": data})
    except Exception as e:
        log.error(
            """
            error:{}
            condition: {}
            """.format(e, condition)
        )
        return False
    else:
        return True
def create_cookie(self):
    """
    May be overridden
    @return:
    """
    with WebDriver(**self._kwargs) as driver:
        driver.get(self._page_url)

        cookies = driver.get_cookies()
        cookies_json = {}
        for cookie in cookies:
            cookies_json[cookie["name"]] = cookie["value"]

        for key in self._must_contained_keys:
            if key not in cookies_json:
                break
        else:
            return cookies_json

        log.error("Failed to fetch cookies. cookies = {}".format(cookies_json))
        return None
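# The method returns a flat {name: value} cookie mapping harvested from the
# browser, e.g. (hypothetical values):
#
#   {"sessionid": "abc123", "csrftoken": "xyz789"}
#
# and None when any key listed in self._must_contained_keys is missing.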
def save_items(self, table, items: List[Dict]) -> bool:
    """
    Save data
    Args:
        table: table name
        items: data, [{}, {}, ...]

    Returns:
        whether the save succeeded: True / False
        If False, this batch is not written to the dedup DB, so it can be
        stored again later.
    """
    sql, datas = tools.make_batch_sql(table, items)
    add_count = self.to_db.add_batch(sql, datas)
    datas_size = len(datas)
    if add_count is None:
        log.error("Failed to export data to table %s" % table)
    else:
        log.info(
            "Exported %s rows to %s, %s duplicates"
            % (datas_size, table, datas_size - add_count)
        )

    return add_count is not None
def batch_date(self):
    """
    @summary: get the batch date
    ---------
    ---------
    @result:
    """
    batch_date = os.environ.get("batch_date")
    if not batch_date:
        sql = 'select date_format(batch_date, "{date_format}") from {batch_record_table} order by id desc limit 1'.format(
            date_format=self._date_format.replace(":%M", ":%i"),
            batch_record_table=self._batch_record_table,
        )
        batch_info = MysqlDB().find(sql)  # (('2018-08-19'),)
        if batch_info:
            os.environ["batch_date"] = batch_date = batch_info[0][0]
        else:
            log.error("start_monitor_task() must be run first")
            os._exit(137)  # exit code 137 (reported as 35072 by wait()) so the spider manager can restart the crawler

    return batch_date
def update_task_state(self, task_id, state=1, **kwargs):
    """
    @summary: Updates the task state in the task table. Must be called
              explicitly from the business logic once each task is finished.
              May be overridden.
              Call it as: yield lambda: self.update_task_state(task_id, state)
    ---------
    @param task_id:
    @param state:
    ---------
    @result:
    """
    kwargs["id"] = task_id
    kwargs[self._task_state] = state

    sql = tools.make_update_sql(
        self._task_table, kwargs, condition="id = {task_id}".format(task_id=task_id)
    )

    if self._mysqldb.update(sql):
        log.debug("Task %s state updated" % task_id)
    else:
        log.error("Failed to update state of task %s, sql=%s" % (task_id, sql))
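# Usage sketch inside a parser, matching the docstring above. The task_id
# attribute and the parse signature are illustrative:
#
#   def parse(self, request, response):
#       ...
#       yield item
#       # mark the task done only after the buffered items have been stored
#       yield lambda: self.update_task_state(request.task_id, state=1)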
def export(self, from_table, to_table, auto_update=False, batch_count=100):
    """
    @summary: Exports items from redis to a relational database such as
              mysql/oracle. from_table and to_table must share the same schema.
    ---------
    @param from_table:
    @param to_table:
    @param auto_update: whether to update existing rows automatically, defaults to False
    ---------
    @result:
    """
    total_count = 0
    while True:
        datas = []
        try:
            datas = self.redisdb.sget(from_table, count=batch_count, is_pop=False)
            if not datas:
                log.info(
                    """
                    %s -> %s exported %s rows in total"""
                    % (from_table, to_table, total_count)
                )
                break

            json_datas = [eval(data) for data in datas]
            sql, json_datas = tools.make_batch_sql(to_table, json_datas, auto_update)
            if self.to_db.add_batch(sql, json_datas):
                total_count += len(json_datas)
                self.redisdb.srem(from_table, datas)

        except Exception as e:
            log.exception(e)
            log.error(datas)
def check_task_status(self):
    """
    Check task status and send warnings
    """
    # Check once per minute
    now_time = time.time()
    if now_time - self._last_check_task_status_time > 60:
        self._last_check_task_status_time = now_time
    else:
        return

    # Check the task state in redis; if the task count has not changed for
    # 20 consecutive minutes (the parser may be stuck), send an alert
    task_count = self._redisdb.zget_count(self._tab_requests)

    if task_count:
        if task_count != self._last_task_count:
            self._last_task_count = task_count
            self._redisdb.hset(
                self._tab_spider_time,
                SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
                tools.get_current_timestamp(),
            )  # multiple processes would send duplicate alerts, so the last
            #    record time is kept in redis
        else:
            # Check whether the interval exceeds 20 minutes
            lua = """
                -- local key = KEYS[1]
                local field = ARGV[1]
                local current_timestamp = ARGV[2]

                -- read the stored value
                local last_timestamp = redis.call('hget', KEYS[1], field)
                if last_timestamp and current_timestamp - last_timestamp >= 1200 then
                    return current_timestamp - last_timestamp -- stall time in seconds
                end

                if not last_timestamp then
                    redis.call('hset', KEYS[1], field, current_timestamp)
                end

                return 0
            """
            redis_obj = self._redisdb.get_redis_obj()
            cmd = redis_obj.register_script(lua)
            overtime = cmd(
                keys=[self._tab_spider_time],
                args=[
                    SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
                    tools.get_current_timestamp(),
                ],
            )

            if overtime:
                # Send an alert
                msg = "《{}》spider tasks have stalled for {}, please check whether the spider is healthy".format(
                    self._spider_name, tools.format_seconds(overtime)
                )
                log.error(msg)
                self.send_msg(
                    msg,
                    level="error",
                    message_prefix="《{}》spider tasks stalled".format(self._spider_name),
                )

    else:
        self._last_task_count = 0

    # Check the failed task count; alert when it exceeds the threshold
    failed_count = self._redisdb.zget_count(self._tab_failed_requests)
    if failed_count > setting.WARNING_FAILED_COUNT:
        # Send an alert
        msg = "《%s》spider currently has %s failed tasks, please check whether the spider is healthy" % (
            self._spider_name,
            failed_count,
        )
        log.error(msg)
        self.send_msg(
            msg,
            level="error",
            message_prefix="《%s》spider failed task count warning" % (self._spider_name),
        )

    # parser_control keeps live counts of finished and failed tasks; alert when
    # the success rate falls below 0.5
    failed_task_count, success_task_count = PaserControl.get_task_status_count()
    total_count = success_task_count + failed_task_count
    if total_count > 0:
        task_success_rate = success_task_count / total_count
        if task_success_rate < 0.5:
            # Send an alert
            msg = "《%s》spider task success count %s, failure count %s, success rate %.2f, please check whether the spider is healthy" % (
                self._spider_name,
                success_task_count,
                failed_task_count,
                task_success_rate,
            )
            log.error(msg)
            # Record the time of the last alert and alert again only after more
            # than an hour (multiple processes; avoid duplicate alerts)
            self.send_msg(
                msg,
                level="error",
                message_prefix="《%s》spider task success rate" % (self._spider_name),
            )
def deal_requests(self, requests):
    for request in requests:
        response = None

        request_redis = request["request_redis"]
        request = request["request_obj"]

        del_request_redis_after_item_to_db = False
        del_request_redis_after_request_to_db = False

        for parser in self._parsers:
            if parser.name == request.parser_name:
                used_download_midware_enable = False
                try:
                    # Count documents that need downloading
                    self.record_download_status(
                        PaserControl.DOWNLOAD_TOTAL, parser.name
                    )

                    # Handle the request
                    if request.auto_request:
                        request_temp = None
                        response = None

                        # Download middleware
                        if request.download_midware:
                            download_midware = (
                                request.download_midware
                                if callable(request.download_midware)
                                else tools.get_method(parser, request.download_midware)
                            )
                            request_temp = download_midware(request)
                        elif request.download_midware != False:
                            request_temp = parser.download_midware(request)

                        # Issue the request
                        if request_temp:
                            if (
                                isinstance(request_temp, (tuple, list))
                                and len(request_temp) == 2
                            ):
                                request_temp, response = request_temp

                            if not isinstance(request_temp, Request):
                                raise Exception(
                                    "download_midware must return a request, but received type: {}".format(
                                        type(request_temp)
                                    )
                                )
                            used_download_midware_enable = True
                            if not response:
                                response = (
                                    request_temp.get_response()
                                    if not setting.RESPONSE_CACHED_USED
                                    else request_temp.get_response_from_cached(
                                        save_cached=False
                                    )
                                )
                        else:
                            response = (
                                request.get_response()
                                if not setting.RESPONSE_CACHED_USED
                                else request.get_response_from_cached(
                                    save_cached=False
                                )
                            )

                        if response == None:
                            raise Exception(
                                "Connection timed out url: %s"
                                % (request.url or request_temp.url)
                            )

                    else:
                        response = None

                    # Validate
                    if parser.validate(request, response) == False:
                        continue

                    if request.callback:  # if the request carries a callback, use it
                        callback_parser = (
                            request.callback
                            if callable(request.callback)
                            else tools.get_method(parser, request.callback)
                        )
                        results = callback_parser(request, response)
                    else:  # otherwise fall back to the parser's parse method
                        results = parser.parse(request, response)

                    if results and not isinstance(results, Iterable):
                        raise Exception(
                            "The return value of %s.%s must be iterable"
                            % (parser.name, request.callback or "parse")
                        )

                    # Records what the previous result was
                    result_type = 0  # 0 / 1 / 2 (initial / request / item)

                    # Decide whether each result is a request or an item
                    for result in results or []:
                        if isinstance(result, Request):
                            result_type = 1
                            # Fill in the request's parser_name
                            result.parser_name = result.parser_name or parser.name

                            # Synchronous or asynchronous callback
                            if result.request_sync:  # synchronous
                                request_dict = {
                                    "request_obj": result,
                                    "request_redis": None,
                                }
                                requests.append(request_dict)
                            else:  # asynchronous
                                # Store the next_request
                                self._request_buffer.put_request(result)
                                del_request_redis_after_request_to_db = True

                        elif isinstance(result, Item):
                            result_type = 2
                            # Store the item
                            self._item_buffer.put_item(result)
                            # The in-progress request must be deleted afterwards
                            del_request_redis_after_item_to_db = True

                        elif callable(result):  # a no-argument callable
                            if result_type == 2:
                                # an item callback; executed after the buffered
                                # items have been stored
                                self._item_buffer.put_item(result)
                                del_request_redis_after_item_to_db = True
                            else:
                                # result_type == 1: a request callback; executed
                                # after the buffered requests have been stored.
                                # Some parsers may yield a callback directly.
                                self._request_buffer.put_request(result)
                                del_request_redis_after_request_to_db = True

                        # else:
                        #     raise TypeError(
                        #         "Expect Request, Item or callable, but got type: {}".format(type(result))
                        #     )

                except Exception as e:
                    exception_type = (
                        str(type(e)).replace("<class '", "").replace("'>", "")
                    )
                    if exception_type.startswith("requests"):
                        # Count download failures
                        self.record_download_status(
                            PaserControl.DOWNLOAD_EXCEPTION, parser.name
                        )
                    else:
                        # Count parser exceptions
                        self.record_download_status(
                            PaserControl.PAESERS_EXCEPTION, parser.name
                        )

                    if setting.LOG_LEVEL == "DEBUG":  # only log the traceback in debug mode; timeouts are too verbose
                        log.exception(e)

                    log.error(
                        """
                        -------------- %s.%s error -------------
                        error          %s
                        response       %s
                        deal request   %s
                        """
                        % (
                            parser.name,
                            (
                                request.callback
                                and callable(request.callback)
                                and getattr(request.callback, "__name__")
                                or request.callback
                            )
                            or "parse",
                            str(e),
                            response,
                            tools.dumps_json(request.to_dict, indent=28)
                            if setting.LOG_LEVEL == "DEBUG"
                            else request,
                        )
                    )

                    request.error_msg = "%s: %s" % (exception_type, e)
                    request.response = str(response)

                    if "Invalid URL" in str(e):
                        request.is_abandoned = True

                    requests = parser.exception_request(request, response) or [request]
                    if not isinstance(requests, Iterable):
                        raise Exception(
                            "The return value of %s.%s must be iterable"
                            % (parser.name, "exception_request")
                        )
                    for request in requests:
                        if callable(request):
                            self._request_buffer.put_request(request)
                            continue

                        if not isinstance(request, Request):
                            raise Exception("exception_request must yield a request")

                        if (
                            request.retry_times + 1 > setting.SPIDER_MAX_RETRY_TIMES
                            or request.is_abandoned
                        ):
                            self.__class__._failed_task_count += 1  # count failed tasks

                            # failed_request may return requests or callables
                            results = parser.failed_request(request, response) or [request]
                            if not isinstance(results, Iterable):
                                raise Exception(
                                    "The return value of %s.%s must be iterable"
                                    % (parser.name, "failed_request")
                                )

                            for result in results:
                                if isinstance(result, Request):
                                    if setting.SAVE_FAILED_REQUEST:
                                        if used_download_midware_enable:
                                            # Strip attributes added by download_midware
                                            original_request = (
                                                Request.from_dict(eval(request_redis))
                                                if request_redis
                                                else result
                                            )
                                            original_request.error_msg = request.error_msg
                                            original_request.response = request.response

                                            self._request_buffer.put_failed_request(
                                                original_request
                                            )
                                        else:
                                            self._request_buffer.put_failed_request(result)

                                elif callable(result):
                                    self._request_buffer.put_request(result)

                                elif isinstance(result, Item):
                                    self._item_buffer.put_item(result)

                            del_request_redis_after_request_to_db = True

                        else:
                            # Put the request back for another attempt
                            request.retry_times += 1
                            request.filter_repeat = False
                            log.info(
                                """
                                Stored for retry
                                url                 %s
                                retries             %s
                                max allowed retries %s
                                """
                                % (
                                    request.url,
                                    request.retry_times,
                                    setting.SPIDER_MAX_RETRY_TIMES,
                                )
                            )

                            if used_download_midware_enable:
                                # Strip attributes added by download_midware and
                                # fall back to the original request
                                original_request = (
                                    Request.from_dict(eval(request_redis))
                                    if request_redis
                                    else request
                                )
                                if hasattr(request, "error_msg"):
                                    original_request.error_msg = request.error_msg
                                if hasattr(request, "response"):
                                    original_request.response = request.response

                                original_request.retry_times = request.retry_times
                                original_request.filter_repeat = request.filter_repeat

                                self._request_buffer.put_request(original_request)
                            else:
                                self._request_buffer.put_request(request)

                            del_request_redis_after_request_to_db = True

                else:
                    # Count successful downloads
                    self.record_download_status(
                        PaserControl.DOWNLOAD_SUCCESS, parser.name
                    )

                    # Count successful tasks
                    self.__class__._success_task_count += 1

                    # Cache the successfully downloaded document
                    if setting.RESPONSE_CACHED_ENABLE:
                        request.save_cached(
                            response=response,
                            expire_time=setting.RESPONSE_CACHED_EXPIRE_TIME,
                        )

                finally:
                    # Release the browser
                    if response and hasattr(response, "browser"):
                        request._webdriver_pool.put(response.browser)

                break

        # Delete the in-progress request; deletion follows the item when one was produced
        if request_redis:
            if del_request_redis_after_item_to_db:
                self._item_buffer.put_item(request_redis)

            elif del_request_redis_after_request_to_db:
                self._request_buffer.put_del_request(request_redis)

            else:
                self._request_buffer.put_del_request(request_redis)

        if setting.SPIDER_SLEEP_TIME:
            time.sleep(setting.SPIDER_SLEEP_TIME)
def deal_requests(self, requests):
    for request in requests:
        response = None

        for parser in self._parsers:
            if parser.name == request.parser_name:
                try:
                    # Count documents that need downloading
                    self.record_download_status(
                        PaserControl.DOWNLOAD_TOTAL, parser.name
                    )

                    # Handle the request
                    if request.auto_request:
                        request_temp = None
                        response = None

                        # Download middleware
                        if request.download_midware:
                            download_midware = (
                                request.download_midware
                                if callable(request.download_midware)
                                else tools.get_method(parser, request.download_midware)
                            )
                            request_temp = download_midware(request)
                        elif request.download_midware != False:
                            request_temp = parser.download_midware(request)

                        # Issue the request
                        if request_temp:
                            if (
                                isinstance(request_temp, (tuple, list))
                                and len(request_temp) == 2
                            ):
                                request_temp, response = request_temp

                            if not isinstance(request_temp, Request):
                                raise Exception(
                                    "download_midware must return a request, but received type: {}".format(
                                        type(request_temp)
                                    )
                                )
                            request = request_temp

                        if not response:
                            response = (
                                request.get_response()
                                if not setting.RESPONSE_CACHED_USED
                                else request.get_response_from_cached(save_cached=False)
                            )

                    else:
                        response = None

                    # Validate
                    if parser.validate(request, response) == False:
                        continue

                    if request.callback:  # if the request carries a callback, use it
                        callback_parser = (
                            request.callback
                            if callable(request.callback)
                            else tools.get_method(parser, request.callback)
                        )
                        results = callback_parser(request, response)
                    else:  # otherwise fall back to the parser's parse method
                        results = parser.parse(request, response)

                    if results and not isinstance(results, Iterable):
                        raise Exception(
                            "The return value of %s.%s must be iterable"
                            % (parser.name, request.callback or "parse")
                        )

                    # Decide whether each result is a request or an item
                    for result in results or []:
                        if isinstance(result, Request):
                            # Fill in the request's parser_name
                            result.parser_name = result.parser_name or parser.name

                            # Synchronous or asynchronous callback
                            if result.request_sync:  # synchronous
                                requests.append(result)
                            else:  # asynchronous
                                # Store the next_request
                                self._memory_db.add(result)

                        elif isinstance(result, Item):
                            self._item_buffer.put_item(result)

                except Exception as e:
                    exception_type = (
                        str(type(e)).replace("<class '", "").replace("'>", "")
                    )
                    if exception_type.startswith("requests"):
                        # Count download failures
                        self.record_download_status(
                            PaserControl.DOWNLOAD_EXCEPTION, parser.name
                        )
                    else:
                        # Count parser exceptions
                        self.record_download_status(
                            PaserControl.PAESERS_EXCEPTION, parser.name
                        )

                    if setting.LOG_LEVEL == "DEBUG":  # only log the traceback in debug mode; timeouts are too verbose
                        log.exception(e)

                    log.error(
                        """
                        -------------- %s.%s error -------------
                        error          %s
                        response       %s
                        deal request   %s
                        """
                        % (
                            parser.name,
                            (
                                request.callback
                                and callable(request.callback)
                                and getattr(request.callback, "__name__")
                                or request.callback
                            )
                            or "parse",
                            str(e),
                            response,
                            tools.dumps_json(request.to_dict, indent=28)
                            if setting.LOG_LEVEL == "DEBUG"
                            else request,
                        )
                    )

                    request.error_msg = "%s: %s" % (exception_type, e)
                    request.response = str(response)

                    if "Invalid URL" in str(e):
                        request.is_abandoned = True

                    requests = parser.exception_request(request, response) or [request]
                    if not isinstance(requests, Iterable):
                        raise Exception(
                            "The return value of %s.%s must be iterable"
                            % (parser.name, "exception_request")
                        )
                    for request in requests:
                        if not isinstance(request, Request):
                            raise Exception("exception_request must yield a request")

                        if (
                            request.retry_times + 1 > setting.SPIDER_MAX_RETRY_TIMES
                            or request.is_abandoned
                        ):
                            self.__class__._failed_task_count += 1  # count failed tasks

                            # failed_request may return requests or callables
                            results = parser.failed_request(request, response) or [request]
                            if not isinstance(results, Iterable):
                                raise Exception(
                                    "The return value of %s.%s must be iterable"
                                    % (parser.name, "failed_request")
                                )

                            log.info(
                                """
                                Task exceeded the maximum retry count, discarded
                                url                 %s
                                retries             %s
                                max allowed retries %s
                                """
                                % (
                                    request.url,
                                    request.retry_times,
                                    setting.SPIDER_MAX_RETRY_TIMES,
                                )
                            )

                        else:
                            # Put the request back for another attempt
                            request.retry_times += 1
                            request.filter_repeat = False
                            log.info(
                                """
                                Stored for retry
                                url                 %s
                                retries             %s
                                max allowed retries %s
                                """
                                % (
                                    request.url,
                                    request.retry_times,
                                    setting.SPIDER_MAX_RETRY_TIMES,
                                )
                            )

                            self._memory_db.add(request)

                else:
                    # Count successful downloads
                    self.record_download_status(
                        PaserControl.DOWNLOAD_SUCCESS, parser.name
                    )

                    # Count successful tasks
                    self.__class__._success_task_count += 1

                    # Cache the successfully downloaded document
                    if setting.RESPONSE_CACHED_ENABLE:
                        request.save_cached(
                            response=response,
                            expire_time=setting.RESPONSE_CACHED_EXPIRE_TIME,
                        )

                finally:
                    # Release the browser
                    if response and hasattr(response, "browser"):
                        request._webdriver_pool.put(response.browser)

                break

        if setting.SPIDER_SLEEP_TIME:
            time.sleep(setting.SPIDER_SLEEP_TIME)
def clear(self, table):
    try:
        self._redis.delete(table)
    except Exception as e:
        log.error(e)
def __exit__(self, exc_type, exc_val, exc_tb):
    if exc_val:
        log.error(exc_val)

    self.quit()
    return True