class Scheduler(threading.Thread):
    __custom_setting__ = {}

    def __init__(
        self,
        redis_key=None,
        thread_count=None,
        begin_callback=None,
        end_callback=None,
        delete_keys=(),
        auto_stop_when_spider_done=None,
        auto_start_requests=None,
        send_run_time=True,
        batch_interval=0,
        wait_lock=True,
        task_table=None,
    ):
        """
        @summary: scheduler
        ---------
        @param redis_key: redis folder that holds the spider's requests and items
        @param thread_count: thread count; defaults to the value in the settings file
        @param begin_callback: callback invoked when the spider starts
        @param end_callback: callback invoked when the spider ends
        @param delete_keys: keys to delete on spider start. Type: tuple/bool/str; regex supported
        @param auto_stop_when_spider_done: whether to stop automatically once crawling finishes, or keep waiting for tasks; defaults to stopping automatically
        @param auto_start_requests: whether the spider adds tasks automatically
        @param send_run_time: whether to send the total run time
        @param batch_interval: crawl interval in days, default 0. On repeated launches the spider only starts when the time since the first crawl finished exceeds this interval
        @param wait_lock: whether to wait for the lock when dispatching tasks. Without the lock, multiple processes may dispatch identical tasks, so set this to True in distributed environments
        @param task_table: task table, passed by batch spiders
        ---------
        @result:
        """

        super(Scheduler, self).__init__()

        for key, value in self.__class__.__custom_setting__.items():
            setattr(setting, key, value)
        log.reload()

        self._redis_key = redis_key or setting.REDIS_KEY
        if not self._redis_key:
            raise Exception(
                """
                redis_key is the redis folder that stores requests and items; it must not be empty.
                Configure it in setting, e.g. REDIS_KEY = 'test',
                or pass it when creating the spider, e.g. TestSpider(redis_key='test')
                """
            )

        self._request_buffer = RequestBuffer(redis_key)
        self._item_buffer = ItemBuffer(redis_key, task_table)

        self._collector = Collector(redis_key)
        self._parsers = []
        self._parser_controls = []
        self._parser_control_obj = PaserControl  # sic: class name is spelled "PaserControl" in this version

        self._auto_stop_when_spider_done = (
            auto_stop_when_spider_done
            if auto_stop_when_spider_done is not None
            else setting.AUTO_STOP_WHEN_SPIDER_DONE
        )
        self._auto_start_requests = (
            auto_start_requests
            if auto_start_requests is not None
            else setting.SPIDER_AUTO_START_REQUESTS
        )
        self._send_run_time = send_run_time
        self._batch_interval = batch_interval

        self._begin_callback = (
            begin_callback
            if begin_callback
            else lambda: log.info("\n********** feapder begin **********")
        )
        self._end_callback = (
            end_callback
            if end_callback
            else lambda: log.info("\n********** feapder end **********")
        )

        self._thread_count = (
            setting.SPIDER_THREAD_COUNT if not thread_count else thread_count
        )

        self._spider_name = redis_key
        self._project_name = redis_key.split(":")[0]

        self._tab_spider_time = setting.TAB_SPIDER_TIME.format(redis_key=redis_key)
        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(redis_key=redis_key)
        self._tab_requests = setting.TAB_REQUSETS.format(redis_key=redis_key)  # sic: setting name as defined in this version
        self._tab_failed_requests = setting.TAB_FAILED_REQUSETS.format(
            redis_key=redis_key
        )

        self._is_notify_end = False  # whether the end notification has been sent
        self._last_task_count = 0  # task count from the most recent check

        self._redisdb = RedisDB()

        self._project_total_state_table = "{}_total_state".format(self._project_name)
        self._is_exist_project_total_state_table = False

        # Request cache settings
        Request.cached_redis_key = redis_key
        Request.cached_expire_time = setting.RESPONSE_CACHED_EXPIRE_TIME

        delete_keys = delete_keys or setting.DELETE_KEYS
        if delete_keys:
            self.delete_tables(delete_keys)

        self._last_check_task_status_time = 0
        self.wait_lock = wait_lock

    def add_parser(self, parser):
        parser = parser()  # instantiate the parser
        if isinstance(parser, BaseParser):
            self._parsers.append(parser)
        else:
            raise ValueError(
                "Type error: spiders must inherit feapder.BaseParser or feapder.BatchParser"
            )

    def run(self):
        if not self.is_reach_next_spider_time():
            return

        self._start()

        while True:
            if self.all_thread_is_done():
                if not self._is_notify_end:
                    self.spider_end()  # one full round finished
                    self.record_spider_state(
                        spider_type=1,
                        state=1,
                        spider_end_time=tools.get_current_date(),
                        batch_interval=self._batch_interval,
                    )

                    self._is_notify_end = True

                if self._auto_stop_when_spider_done:
                    self._stop_all_thread()
                    break

            else:
                self._is_notify_end = False

            self.check_task_status()

            tools.delay_time(1)  # check spider status once per second

    def __add_task(self):
        # run the parsers' start_requests
        self.spider_begin()  # for spiders that never stop, this runs only once
        self.record_spider_state(
            spider_type=1,
            state=0,
            batch_date=tools.get_current_date(),
            spider_start_time=tools.get_current_date(),
            batch_interval=self._batch_interval,
        )

        # if the task pool still holds tasks, resume them instead of dispatching new ones
        todo_task_count = self._collector.get_requests_count()
        if todo_task_count:
            log.info(
                "Found %s pending tasks; not dispatching new ones. Resuming from where the last run was interrupted"
                % todo_task_count
            )
        else:
            for parser in self._parsers:
                results = parser.start_requests()
                # push requests to the request queue, which persists them in batches
                if results and not isinstance(results, Iterable):
                    raise Exception(
                        "The return value of %s.%s must be iterable"
                        % (parser.name, "start_requests")
                    )

                result_type = 1
                for result in results or []:
                    if isinstance(result, Request):
                        result.parser_name = result.parser_name or parser.name
                        self._request_buffer.put_request(result)
                        result_type = 1

                    elif isinstance(result, Item):
                        self._item_buffer.put_item(result)
                        result_type = 2

                    elif callable(result):  # a callable may be a function that updates the database
                        if result_type == 1:
                            self._request_buffer.put_request(result)
                        else:
                            self._item_buffer.put_item(result)
                    else:
                        raise TypeError(
                            "start_requests yield result type error, expect Request, Item or callback func, but got type: {}".format(
                                type(result)
                            )
                        )

            self._request_buffer.flush()
            self._item_buffer.flush()

    def _start(self):
        # start the request_buffer
        self._request_buffer.start()
        # start the item_buffer
        self._item_buffer.start()
        # start the collector
        self._collector.start()

        # start the parser controls
        for i in range(self._thread_count):
            parser_control = self._parser_control_obj(
                self._collector,
                self._redis_key,
                self._request_buffer,
                self._item_buffer,
            )

            for parser in self._parsers:
                parser_control.add_parser(parser)

            parser_control.start()
            self._parser_controls.append(parser_control)

        # dispatch tasks last, since it may take a long time
        if setting.RETRY_FAILED_REQUESTS:
            # requeue failed tasks; atomic operation, so no lock needed
            handle_failed_requests = HandleFailedRequests(self._redis_key)
            handle_failed_requests.reput_failed_requests_to_requests()

        # dispatch new tasks
        if self._auto_start_requests:  # auto dispatch
            if self.wait_lock:
                # lock the dispatching step to stop multiple processes from adding duplicate tasks
                with RedisLock(
                    key=self._spider_name,
                    timeout=3600,
                    wait_timeout=60,
                    redis_cli=RedisDB().get_redis_obj(),
                ) as lock:
                    if lock.locked:
                        self.__add_task()
            else:
                self.__add_task()

    def all_thread_is_done(self):
        for i in range(3):
            # reduce flakiness: the stages are not concurrent, so a state that reads
            # busy now may read idle on the next probe; a single check can easily hit
            # that window

            # check collector state
            if (
                self._collector.is_collector_task()
                or self._collector.get_requests_count() > 0
            ):
                return False

            # check parser_control state
            for parser_control in self._parser_controls:
                if not parser_control.is_not_task():
                    return False

            # check item_buffer state
            if (
                self._item_buffer.get_items_count() > 0
                or self._item_buffer.is_adding_to_db()
            ):
                return False

            # check request_buffer state
            if (
                self._request_buffer.get_requests_count() > 0
                or self._request_buffer.is_adding_to_db()
            ):
                return False

            tools.delay_time(1)

        return True

    @tools.run_safe_model("check_task_status")
    def check_task_status(self):
        """
        Check task status and raise alerts
        """
        # check once per minute
        now_time = time.time()
        if now_time - self._last_check_task_status_time > 60:
            self._last_check_task_status_time = now_time
        else:
            return

        # check the task state in redis; if the task count has not changed for 20
        # minutes (the parser may be stuck), send an alert
        task_count = self._redisdb.zget_count(self._tab_requests)

        if task_count:
            if task_count != self._last_task_count:
                self._last_task_count = task_count
                self._redisdb.hset(
                    self._tab_spider_time,
                    SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
                    tools.get_current_timestamp(),
                )  # multiple processes would alert repeatedly, so record the last check time in redis
            else:
                # check whether more than 20 minutes have passed
                lua = """
                    -- local key = KEYS[1]
                    local field = ARGV[1]
                    local current_timestamp = ARGV[2]

                    -- fetch the last recorded timestamp
                    local last_timestamp = redis.call('hget', KEYS[1], field)
                    if last_timestamp and current_timestamp - last_timestamp >= 1200 then
                        return current_timestamp - last_timestamp -- stalled time in seconds
                    end

                    if not last_timestamp then
                        redis.call('hset', KEYS[1], field, current_timestamp)
                    end

                    return 0
                """
                redis_obj = self._redisdb.get_redis_obj()
                cmd = redis_obj.register_script(lua)
                overtime = cmd(
                    keys=[self._tab_spider_time],
                    args=[
                        SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
                        tools.get_current_timestamp(),
                    ],
                )

                if overtime:
                    # send alert
                    msg = "Spider <{}> tasks have stalled for {}; please check whether the spider is healthy".format(
                        self._spider_name, tools.format_seconds(overtime)
                    )
                    log.error(msg)
                    self.send_msg(
                        msg,
                        level="error",
                        message_prefix="Spider <{}> tasks stalled".format(
                            self._spider_name
                        ),
                    )

        else:
            self._last_task_count = 0

        # alert when the failed-task count exceeds the threshold (e.g. 1000)
        failed_count = self._redisdb.zget_count(self._tab_failed_requests)
        if failed_count > setting.WARNING_FAILED_COUNT:
            # send alert
            msg = "Spider <%s> currently has %s failed tasks; please check whether the spider is healthy" % (
                self._spider_name,
                failed_count,
            )
            log.error(msg)
            self.send_msg(
                msg,
                level="error",
                message_prefix="Spider <%s> failed task count warning"
                % (self._spider_name),
            )

        # parser_control tracks finished and failed task counts in real time;
        # alert when failures exceed 10 and failed / finished >= 0.5
        failed_task_count, success_task_count = PaserControl.get_task_status_count()
        total_count = success_task_count + failed_task_count
        if total_count > 0:
            task_success_rate = success_task_count / total_count
            if task_success_rate < 0.5:
                # send alert
                msg = "Spider <%s>: %s tasks succeeded, %s failed, success rate %.2f; please check whether the spider is healthy" % (
                    self._spider_name,
                    success_task_count,
                    failed_task_count,
                    task_success_rate,
                )
                log.error(msg)
                # record when the last alert was sent and only alert again after an hour
                # (this runs in multiple processes, so avoid duplicate alerts)
                self.send_msg(
                    msg,
                    level="error",
                    message_prefix="Spider <%s> task success rate" % (self._spider_name),
                )

    def delete_tables(self, delete_tables_list):
        if isinstance(delete_tables_list, bool):
            delete_tables_list = [self._redis_key + "*"]
        elif not isinstance(delete_tables_list, (list, tuple)):
            delete_tables_list = [delete_tables_list]

        redis = RedisDB()
        for delete_tab in delete_tables_list:
            if not delete_tab.startswith(self._redis_key):
                delete_tab = self._redis_key + delete_tab
            tables = redis.getkeys(delete_tab)
            for table in tables:
                if table != self._tab_spider_time:
                    log.info("Deleting key %s" % table)
                    redis.clear(table)

    def _stop_all_thread(self):
        self._request_buffer.stop()
        self._item_buffer.stop()
        # stop the collector
        self._collector.stop()
        # stop the parser_controls
        for parser_control in self._parser_controls:
            parser_control.stop()

    def send_msg(self, msg, level="debug", message_prefix=""):
        if setting.WARNING_LEVEL == "ERROR":
            if level != "error":
                return

        if setting.DINGDING_WARNING_PHONE:
            keyword = "feapder alert system\n"
            tools.dingding_warning(keyword + msg, message_prefix=message_prefix)

        if setting.EMAIL_RECEIVER:
            tools.email_warning(
                msg, message_prefix=message_prefix, title=self._spider_name
            )

    def spider_begin(self):
        """
        @summary: when started via start_monitor_task, this function and spider_end
        run in different processes, so variables cannot be shared
        ---------
        ---------
        @result:
        """

        if self._begin_callback:
            self._begin_callback()

        for parser in self._parsers:
            parser.start_callback()

        # record the start time
        if not self._redisdb.hexists(self._tab_spider_time, SPIDER_START_TIME_KEY):
            current_timestamp = tools.get_current_timestamp()
            self._redisdb.hset(
                self._tab_spider_time, SPIDER_START_TIME_KEY, current_timestamp
            )

            # send notification
            self.send_msg("Spider <%s> started" % self._spider_name)

    def spider_end(self):
        self.record_end_time()

        if self._end_callback:
            self._end_callback()

        for parser in self._parsers:
            parser.close()
            parser.end_callback()

        # close the webdriver
        if Request.webdriver_pool:
            Request.webdriver_pool.close()

        # compute the total crawl time
        data = self._redisdb.hget(
            self._tab_spider_time, SPIDER_START_TIME_KEY, is_pop=True
        )
        if data:
            begin_timestamp = int(data)

            spend_time = tools.get_current_timestamp() - begin_timestamp

            msg = "Spider <%s> finished, took %s" % (
                self._spider_name,
                tools.format_seconds(spend_time),
            )
            log.info(msg)

            if self._send_run_time:
                self.send_msg(msg)

        if not self._auto_stop_when_spider_done:
            log.info("Spider does not stop automatically; waiting for the next round of tasks...")
        else:
            self.delete_tables(self._tab_spider_status)

    def record_end_time(self):
        # record the end time
        if self._batch_interval:
            current_timestamp = tools.get_current_timestamp()
            self._redisdb.hset(
                self._tab_spider_time, SPIDER_END_TIME_KEY, current_timestamp
            )

    def is_reach_next_spider_time(self):
        if not self._batch_interval:
            return True

        last_spider_end_time = self._redisdb.hget(
            self._tab_spider_time, SPIDER_END_TIME_KEY
        )
        if last_spider_end_time:
            last_spider_end_time = int(last_spider_end_time)
            current_timestamp = tools.get_current_timestamp()
            time_interval = current_timestamp - last_spider_end_time

            if time_interval < self._batch_interval * 86400:
                log.info(
                    "Last run finished at {}; only {} has passed, less than the configured crawl interval {}. Spider will not run; exiting~".format(
                        tools.timestamp_to_date(last_spider_end_time),
                        tools.format_seconds(time_interval),
                        tools.format_seconds(self._batch_interval * 86400),
                    )
                )
                return False

        return True

    def record_spider_state(
        self,
        spider_type,
        state,
        batch_date=None,
        spider_start_time=None,
        spider_end_time=None,
        batch_interval=None,
    ):
        pass
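
# Usage sketch (illustrative, not part of the source above): how a spider is
# typically wired into this Scheduler. `MyParser` and the redis_key "test:spider"
# are hypothetical, and a reachable redis configured in setting is assumed. Note
# that add_parser() takes the parser class, not an instance -- it instantiates
# the class itself.
if __name__ == "__main__":

    class MyParser(BaseParser):
        def start_requests(self):
            yield Request("https://example.com")  # hypothetical seed url

    scheduler = Scheduler(redis_key="test:spider", thread_count=4)
    scheduler.add_parser(MyParser)
    scheduler.start()  # Scheduler is a threading.Thread; start() runs the run() loop in its own thread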
class NormalUserPool(UserPoolInterface):
    """
    Normal user pool, suited to scenarios with many low-cost accounts
    """

    def __init__(
        self,
        redis_key,
        *,
        table_userbase,
        login_state_key="login_state",
        lock_state_key="lock_state",
        username_key="username",
        password_key="password",
        login_retry_times=1,
        keep_alive=False,
    ):
        """
        @param redis_key: project name
        @param table_userbase: user table name
        @param login_state_key: login-state column name
        @param lock_state_key: lock-state column name
        @param username_key: username column name
        @param password_key: password column name
        @param login_retry_times: retry count on login failure
        @param keep_alive: whether to stay resident so users can be replenished as soon as they run short
        """

        self._tab_user_pool = setting.TAB_USER_POOL.format(
            redis_key=redis_key, user_type="normal"
        )

        self._login_retry_times = login_retry_times
        self._table_userbase = table_userbase
        self._login_state_key = login_state_key
        self._lock_state_key = lock_state_key
        self._username_key = username_key
        self._password_key = password_key
        self._keep_alive = keep_alive

        self._users_id = []

        self._redisdb = RedisDB()
        self._mysqldb = MysqlDB()

        self._create_userbase()

    def _load_users_id(self):
        self._users_id = self._redisdb.hkeys(self._tab_user_pool)
        if self._users_id:
            random.shuffle(self._users_id)

    def _get_user_id(self):
        if not self._users_id:
            self._load_users_id()

        if self._users_id:
            return self._users_id.pop()

    def _create_userbase(self):
        sql = f"""
            CREATE TABLE IF NOT EXISTS `{self._table_userbase}` (
              `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
              `{self._username_key}` varchar(50) DEFAULT NULL COMMENT 'username',
              `{self._password_key}` varchar(255) DEFAULT NULL COMMENT 'password',
              `{self._login_state_key}` int(11) DEFAULT '0' COMMENT 'login state (0 not logged in, 1 logged in)',
              `{self._lock_state_key}` int(11) DEFAULT '0' COMMENT 'whether the account is locked (0 not locked, 1 locked)',
              PRIMARY KEY (`id`),
              UNIQUE KEY `{self._username_key}` (`{self._username_key}`) USING BTREE
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """
        self._mysqldb.execute(sql)

    def _load_user(self) -> Iterable[NormalUser]:
        """
        Load user credentials from the user table
        @return: yields NormalUser objects
        """
        sql = "select id, {username_key}, {password_key} from {table_userbase} where {lock_state_key} != 1 and {login_state_key} != 1".format(
            username_key=self._username_key,
            password_key=self._password_key,
            table_userbase=self._table_userbase,
            lock_state_key=self._lock_state_key,
            login_state_key=self._login_state_key,
        )

        for id, username, password in self._mysqldb.find(sql):
            yield NormalUser(user_id=id, username=username, password=password)

    def handle_login_failed_user(self, user: NormalUser):
        """
        Handle a user whose login failed
        @return:
        """
        pass

    def handel_exception(self, e: Exception):
        """
        Handle an exception (sic: method name is spelled this way in this version)
        @param e:
        @return:
        """
        log.exception(e)

    def login(self, user: NormalUser) -> NormalUser:
        """
        Log in and produce cookies
        """
        raise NotImplementedError

    def add_user(self, user: NormalUser):
        log.debug("add {}".format(user))
        self._redisdb.hset(self._tab_user_pool, user.user_id, user.to_dict())

        sql = "update {table_userbase} set {login_state_key} = 1 where id = {user_id}".format(
            table_userbase=self._table_userbase,
            login_state_key=self._login_state_key,
            user_id=user.user_id,
        )
        self._mysqldb.update(sql)

    def get_user(self, block=True) -> Optional[NormalUser]:
        while True:
            try:
                user_id = self._get_user_id()
                user_str = None
                if user_id:
                    user_str = self._redisdb.hget(self._tab_user_pool, user_id)
                    # if no user came back, another spider may have deleted it;
                    # refresh the locally cached user ids
                    if not user_str:
                        self._load_users_id()
                        continue

                if not user_id and block:
                    self._keep_alive = False
                    self.run()
                    continue

                return user_str and NormalUser(**eval(user_str))
            except Exception as e:
                log.exception(e)
                tools.delay_time(1)

    def del_user(self, user_id: int):
        """
        Delete an expired user
        @return:
        """
        self._redisdb.hdel(self._tab_user_pool, user_id)
        self._load_users_id()

        sql = "update {table_userbase} set {login_state_key} = 0 where id = {user_id}".format(
            table_userbase=self._table_userbase,
            login_state_key=self._login_state_key,
            user_id=user_id,
        )
        self._mysqldb.update(sql)

    def tag_user_locked(self, user_id: int):
        """
        Mark a user as locked
        """
        sql = "update {table_userbase} set {lock_state_key} = 1 where id = {user_id}".format(
            table_userbase=self._table_userbase,
            lock_state_key=self._lock_state_key,
            user_id=user_id,
        )
        self._mysqldb.update(sql)

    def run(self):
        while True:
            try:
                try:
                    with RedisLock(
                        key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0
                    ) as _lock:
                        if _lock.locked:
                            for user in self._load_user():
                                retry_times = 0
                                while retry_times <= self._login_retry_times:
                                    try:
                                        # keep the original user so failure handling
                                        # still sees it if login() returns nothing
                                        login_user = self.login(user)
                                        if login_user:
                                            self.add_user(login_user)
                                        else:
                                            self.handle_login_failed_user(user)
                                        break
                                    except NotImplementedError:
                                        log.error(
                                            f"{self.__class__.__name__} must implement the login method!"
                                        )
                                        os._exit(0)
                                    except Exception as e:
                                        self.handel_exception(e)
                                    log.debug(
                                        f"login failed, user: {user} retry_times: {retry_times}"
                                    )
                                    retry_times += 1
                                else:
                                    # all retries exhausted
                                    self.handle_login_failed_user(user)

                            now_user_count = self._redisdb.hget_count(
                                self._tab_user_pool
                            )
                            log.info("Current online user count: {}".format(now_user_count))
                except Exception as e:
                    log.exception(e)

                if self._keep_alive:
                    tools.delay_time(10)
                else:
                    break

            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
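
# Example sketch (not part of the library): the minimal subclass a user of
# NormalUserPool has to write, implementing the login() hook that produces
# cookies. The login endpoint, form fields, and the `requests` dependency below
# are assumptions for illustration; real logic depends entirely on the target site.
class ExampleNormalUserPool(NormalUserPool):
    def login(self, user: NormalUser) -> NormalUser:
        import requests  # hypothetical choice of http client

        session = requests.Session()
        resp = session.post(
            "https://example.com/login",  # hypothetical login endpoint
            data={"username": user.username, "password": user.password},
            timeout=10,
        )
        if resp.status_code != 200:
            raise Exception("login failed, status code: %s" % resp.status_code)

        # attach the freshly produced cookies to the user before it is pooled
        user.cookies = session.cookies.get_dict()
        return user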
class GuestUserPool(UserPoolInterface):
    """
    Guest user pool; no login required
    """

    def __init__(
        self,
        redis_key,
        page_url=None,
        min_users=1,
        must_contained_keys=(),
        keep_alive=False,
        **kwargs,
    ):
        """
        @param redis_key: prefix of the redis key under which users are stored
        @param page_url: url used to produce users
        @param min_users: minimum user count
        @param must_contained_keys: keys the cookie must contain, used to verify the cookie is valid
        @param keep_alive: whether to stay resident so users can be replenished as soon as they run short
        ---
        @param kwargs: WebDriver parameters
            load_images: whether to load images
            user_agent: a string, or a zero-argument function returning the user agent
            proxy: xxx.xxx.xxx.xxx:xxxx, or a zero-argument function returning the proxy address
            headless: whether to run headless
            driver_type: CHROME, PHANTOMJS or FIREFOX
            timeout: request timeout
            window_size: window size
            executable_path: browser path; defaults to the standard location
        """
        self._redisdb = RedisDB()

        self._tab_user_pool = setting.TAB_USER_POOL.format(
            redis_key=redis_key, user_type="guest"
        )
        self._page_url = page_url
        self._min_users = min_users
        self._must_contained_keys = must_contained_keys
        self._keep_alive = keep_alive

        self._kwargs = kwargs
        self._kwargs.setdefault("load_images", False)
        self._kwargs.setdefault("headless", True)

        self._users_id = []

    def _load_users_id(self):
        self._users_id = self._redisdb.hkeys(self._tab_user_pool)
        if self._users_id:
            random.shuffle(self._users_id)

    def _get_user_id(self):
        if not self._users_id:
            self._load_users_id()

        if self._users_id:
            return self._users_id.pop()

    def login(self) -> Optional[GuestUser]:
        """
        Produce cookies with a webdriver by default; may be overridden
        """
        with WebDriver(**self._kwargs) as driver:
            driver.get(self._page_url)

            cookies = driver.cookies
            # verify the cookie contains every required key
            for key in self._must_contained_keys:
                if key not in cookies:
                    break
            else:
                user = GuestUser(user_agent=driver.user_agent, cookies=cookies)
                return user

            log.error("Failed to fetch cookies, cookies = {}".format(cookies))
            return None

    def add_user(self, user: GuestUser):
        log.debug("add {}".format(user))
        self._redisdb.hset(self._tab_user_pool, user.user_id, user.to_dict())

    def get_user(self, block=True) -> Optional[GuestUser]:
        """
        Args:
            block: whether to wait when no user is available

        Returns:

        """
        while True:
            try:
                user_id = self._get_user_id()
                user_str = None
                if user_id:
                    user_str = self._redisdb.hget(self._tab_user_pool, user_id)
                    # if no user came back, another spider may have deleted it;
                    # refresh the locally cached user ids
                    if not user_str:
                        self._load_users_id()
                        continue

                if not user_id and block:
                    self._keep_alive = False
                    with RedisLock(
                        key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0
                    ) as _lock:
                        if _lock.locked:
                            self.run()
                    continue

                return user_str and GuestUser(**eval(user_str))
            except Exception as e:
                log.exception(e)
                tools.delay_time(1)

    def del_user(self, user_id: str):
        self._redisdb.hdel(self._tab_user_pool, user_id)
        self._load_users_id()

    def run(self):
        while True:
            try:
                now_user_count = self._redisdb.hget_count(self._tab_user_pool)
                need_user_count = self._min_users - now_user_count

                if need_user_count > 0:
                    log.info(
                        "Current online user count {} is below {}; producing users".format(
                            now_user_count, self._min_users
                        )
                    )

                    try:
                        user = self.login()
                        if user:
                            self.add_user(user)
                    except Exception as e:
                        log.exception(e)
                else:
                    log.debug(
                        "Current user count {} is sufficient; not producing for now".format(
                            now_user_count
                        )
                    )

                if self._keep_alive:
                    tools.delay_time(10)
                else:
                    break

            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
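
# Usage sketch (illustrative): producing guest users from a page that sets an
# anti-bot cookie. The page url and required cookie key are hypothetical; a
# reachable redis and a usable webdriver are assumed.
if __name__ == "__main__":
    user_pool = GuestUserPool(
        "test:spider",
        page_url="https://example.com",
        must_contained_keys=("session_id",),  # hypothetical cookie key to verify
    )
    user = user_pool.get_user(block=True)  # triggers run() under a redis lock if the pool is empty
    print(user.user_agent, user.cookies)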
class Scheduler(threading.Thread):
    __custom_setting__ = {}

    def __init__(
        self,
        redis_key=None,
        thread_count=None,
        begin_callback=None,
        end_callback=None,
        delete_keys=(),
        keep_alive=None,
        auto_start_requests=None,
        batch_interval=0,
        wait_lock=True,
        task_table=None,
        **kwargs,
    ):
        """
        @summary: scheduler
        ---------
        @param redis_key: redis folder that holds the spider's requests and items
        @param thread_count: thread count; defaults to the value in the settings file
        @param begin_callback: callback invoked when the spider starts
        @param end_callback: callback invoked when the spider ends
        @param delete_keys: keys to delete on spider start. Type: tuple/bool/str; regex supported
        @param keep_alive: whether the spider stays resident; defaults to no
        @param auto_start_requests: whether the spider adds tasks automatically
        @param batch_interval: crawl interval in days, default 0. On repeated launches the spider only starts when the time since the first crawl finished exceeds this interval
        @param wait_lock: whether to wait for the lock when dispatching tasks. Without the lock, multiple processes may dispatch identical tasks, so set this to True in distributed environments
        @param task_table: task table, passed by batch spiders
        ---------
        @result:
        """

        super(Scheduler, self).__init__()

        for key, value in self.__class__.__custom_setting__.items():
            if key == "AUTO_STOP_WHEN_SPIDER_DONE":  # compatibility with the old setting
                setattr(setting, "KEEP_ALIVE", not value)
            else:
                setattr(setting, key, value)

        self._redis_key = redis_key or setting.REDIS_KEY
        if not self._redis_key:
            raise Exception(
                """
                redis_key is the redis folder that stores requests and items; it must not be empty.
                Configure it in setting, e.g. REDIS_KEY = 'test',
                or pass it when creating the spider, e.g. TestSpider(redis_key='test')
                """
            )

        self._request_buffer = RequestBuffer(redis_key)
        self._item_buffer = ItemBuffer(redis_key, task_table)

        self._collector = Collector(redis_key)
        self._parsers = []
        self._parser_controls = []
        self._parser_control_obj = ParserControl

        # compatibility with the old parameter
        if "auto_stop_when_spider_done" in kwargs:
            self._keep_alive = not kwargs.get("auto_stop_when_spider_done")
        else:
            self._keep_alive = (
                keep_alive if keep_alive is not None else setting.KEEP_ALIVE
            )
        self._auto_start_requests = (
            auto_start_requests
            if auto_start_requests is not None
            else setting.SPIDER_AUTO_START_REQUESTS
        )
        self._batch_interval = batch_interval

        self._begin_callback = (
            begin_callback
            if begin_callback
            else lambda: log.info("\n********** feapder begin **********")
        )
        self._end_callback = (
            end_callback
            if end_callback
            else lambda: log.info("\n********** feapder end **********")
        )

        if thread_count:
            setattr(setting, "SPIDER_THREAD_COUNT", thread_count)
        self._thread_count = setting.SPIDER_THREAD_COUNT

        self._spider_name = redis_key
        self._project_name = redis_key.split(":")[0]

        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(redis_key=redis_key)
        self._tab_requests = setting.TAB_REQUESTS.format(redis_key=redis_key)
        self._tab_failed_requests = setting.TAB_FAILED_REQUESTS.format(
            redis_key=redis_key
        )

        self._is_notify_end = False  # whether the end notification has been sent
        self._last_task_count = 0  # task count from the most recent check
        self._last_check_task_count_time = 0

        self._redisdb = RedisDB()

        self._project_total_state_table = "{}_total_state".format(self._project_name)
        self._is_exist_project_total_state_table = False

        # Request cache settings
        Request.cached_redis_key = redis_key
        Request.cached_expire_time = setting.RESPONSE_CACHED_EXPIRE_TIME

        delete_keys = delete_keys or setting.DELETE_KEYS
        if delete_keys:
            self.delete_tables(delete_keys)

        self._last_check_task_status_time = 0
        self.wait_lock = wait_lock

        self.init_metrics()

        # reset lost tasks
        self.reset_task()

    def init_metrics(self):
        """
        Initialize the metrics system
        """
        metrics.init(**setting.METRICS_OTHER_ARGS)

    def add_parser(self, parser, **kwargs):
        parser = parser(**kwargs)  # instantiate the parser
        if isinstance(parser, BaseParser):
            self._parsers.append(parser)
        else:
            raise ValueError(
                "Type error: spiders must inherit feapder.BaseParser or feapder.BatchParser"
            )

    def run(self):
        if not self.is_reach_next_spider_time():
            return

        self._start()

        while True:
            try:
                self.heartbeat()

                if self.all_thread_is_done():
                    if not self._is_notify_end:
                        self.spider_end()  # one full round finished
                        self.record_spider_state(
                            spider_type=1,
                            state=1,
                            spider_end_time=tools.get_current_date(),
                            batch_interval=self._batch_interval,
                        )

                        self._is_notify_end = True

                    if not self._keep_alive:
                        self._stop_all_thread()
                        break

                else:
                    self._is_notify_end = False

                self.check_task_status()
            except Exception as e:
                log.exception(e)

            tools.delay_time(1)  # check spider status once per second

    def __add_task(self):
        # run the parsers' start_requests
        self.spider_begin()  # for spiders that never stop, this runs only once
        self.record_spider_state(
            spider_type=1,
            state=0,
            batch_date=tools.get_current_date(),
            spider_start_time=tools.get_current_date(),
            batch_interval=self._batch_interval,
        )

        # if the task pool still holds tasks, resume them instead of dispatching new ones
        todo_task_count = self._collector.get_requests_count()
        if todo_task_count:
            log.info(
                "Found %s pending tasks; not dispatching new ones. Resuming from where the last run was interrupted"
                % todo_task_count
            )
        else:
            for parser in self._parsers:
                results = parser.start_requests()
                # push requests to the request queue, which persists them in batches
                if results and not isinstance(results, Iterable):
                    raise Exception(
                        "The return value of %s.%s must be iterable"
                        % (parser.name, "start_requests")
                    )

                result_type = 1
                for result in results or []:
                    if isinstance(result, Request):
                        result.parser_name = result.parser_name or parser.name
                        self._request_buffer.put_request(result)
                        result_type = 1

                    elif isinstance(result, Item):
                        self._item_buffer.put_item(result)
                        result_type = 2

                    elif callable(result):  # a callable may be a function that updates the database
                        if result_type == 1:
                            self._request_buffer.put_request(result)
                        else:
                            self._item_buffer.put_item(result)
                    else:
                        raise TypeError(
                            "start_requests yield result type error, expect Request, Item or callback func, but got type: {}".format(
                                type(result)
                            )
                        )

            self._request_buffer.flush()
            self._item_buffer.flush()

    def _start(self):
        # start the request_buffer
        self._request_buffer.start()
        # start the item_buffer
        self._item_buffer.start()
        # start the collector
        self._collector.start()

        # start the parser controls
        for i in range(self._thread_count):
            parser_control = self._parser_control_obj(
                self._collector,
                self._redis_key,
                self._request_buffer,
                self._item_buffer,
            )

            for parser in self._parsers:
                parser_control.add_parser(parser)

            parser_control.start()
            self._parser_controls.append(parser_control)

        # dispatch tasks last, since it may take a long time
        if setting.RETRY_FAILED_REQUESTS:
            # requeue failed tasks; atomic operation, so no lock needed
            handle_failed_requests = HandleFailedRequests(self._redis_key)
            handle_failed_requests.reput_failed_requests_to_requests()

        # dispatch new tasks
        if self._auto_start_requests:  # auto dispatch
            if self.wait_lock:
                # lock the dispatching step to stop multiple processes from adding duplicate tasks
                with RedisLock(key=self._spider_name) as lock:
                    if lock.locked:
                        self.__add_task()
            else:
                self.__add_task()

    def all_thread_is_done(self):
        # reduce flakiness: the stages are not concurrent, so a state that reads busy
        # now may read idle on the next probe; a single check can easily hit that window
        for i in range(3):
            # check collector state
            if (
                self._collector.is_collector_task()
                or self._collector.get_requests_count() > 0
            ):
                return False

            # check parser_control state
            for parser_control in self._parser_controls:
                if not parser_control.is_not_task():
                    return False

            # check item_buffer state
            if (
                self._item_buffer.get_items_count() > 0
                or self._item_buffer.is_adding_to_db()
            ):
                return False

            # check request_buffer state
            if (
                self._request_buffer.get_requests_count() > 0
                or self._request_buffer.is_adding_to_db()
            ):
                return False

            tools.delay_time(1)

        return True

    @tools.run_safe_model("check_task_status")
    def check_task_status(self):
        """
        Check task status and raise alerts
        """
        # check once per minute
        now_time = time.time()
        if now_time - self._last_check_task_status_time > 60:
            self._last_check_task_status_time = now_time
        else:
            return

        # alert when the failed-task count exceeds the threshold (e.g. 1000)
        failed_count = self._redisdb.zget_count(self._tab_failed_requests)
        if failed_count > setting.WARNING_FAILED_COUNT:
            # send alert
            msg = "Spider <%s> currently has %s failed tasks; please check whether the spider is healthy" % (
                self._spider_name,
                failed_count,
            )
            log.error(msg)
            self.send_msg(
                msg,
                level="error",
                message_prefix="Spider <%s> failed task count alert"
                % (self._spider_name),
            )

        # parser_control tracks finished and failed task counts in real time;
        # alert when the success rate drops below 0.5
        (
            failed_task_count,
            success_task_count,
            total_task_count,
        ) = ParserControl.get_task_status_count()
        total_count = success_task_count + failed_task_count
        if total_count > 0:
            task_success_rate = success_task_count / total_count
            if task_success_rate < 0.5:
                # send alert
                msg = "Spider <%s>: %s tasks succeeded, %s failed, success rate %.2f; please check whether the spider is healthy" % (
                    self._spider_name,
                    success_task_count,
                    failed_task_count,
                    task_success_rate,
                )
                log.error(msg)
                self.send_msg(
                    msg,
                    level="error",
                    message_prefix="Spider <%s> task success rate alert"
                    % (self._spider_name),
                )

        # check whether the task count has changed
        current_time = tools.get_current_timestamp()
        if (
            current_time - self._last_check_task_count_time
            > setting.WARNING_CHECK_TASK_COUNT_INTERVAL
        ):
            if self._last_task_count and self._last_task_count == total_task_count:
                # send alert
                msg = "Spider <{}> tasks have stalled for {}; please check whether the spider is healthy".format(
                    self._spider_name,
                    tools.format_seconds(
                        current_time - self._last_check_task_count_time
                    ),
                )
                log.error(msg)
                self.send_msg(
                    msg,
                    level="error",
                    message_prefix="Spider <{}> tasks stalled".format(
                        self._spider_name
                    ),
                )
            else:
                self._last_task_count = total_task_count
                self._last_check_task_count_time = current_time

        # check the data-export failure count
        # note: the attribute is spelled "export_falied_times" in ItemBuffer
        if self._item_buffer.export_falied_times > setting.EXPORT_DATA_MAX_FAILED_TIMES:
            msg = "Spider <{}> failed to export data {} times; please check whether the spider is healthy".format(
                self._spider_name, self._item_buffer.export_falied_times
            )
            log.error(msg)
            self.send_msg(
                msg,
                level="error",
                message_prefix="Spider <%s> data export failures" % (self._spider_name),
            )

    def delete_tables(self, delete_tables_list):
        if isinstance(delete_tables_list, bool):
            delete_tables_list = [self._redis_key + "*"]
        elif not isinstance(delete_tables_list, (list, tuple)):
            delete_tables_list = [delete_tables_list]

        for delete_tab in delete_tables_list:
            if not delete_tab.startswith(self._redis_key):
                delete_tab = self._redis_key + delete_tab
            tables = self._redisdb.getkeys(delete_tab)
            for table in tables:
                log.debug("Deleting key %s" % table)
                self._redisdb.clear(table)

    def _stop_all_thread(self):
        self._request_buffer.stop()
        self._item_buffer.stop()
        # stop the collector
        self._collector.stop()
        # stop the parser_controls
        for parser_control in self._parser_controls:
            parser_control.stop()

        self._started.clear()

    def send_msg(self, msg, level="debug", message_prefix=""):
        # log.debug("sending alert, level: {}, msg: {}".format(level, msg))
        tools.send_msg(msg=msg, level=level, message_prefix=message_prefix)

    def spider_begin(self):
        """
        @summary: when started via start_monitor_task, this function and spider_end
        run in different processes, so variables cannot be shared
        ---------
        ---------
        @result:
        """

        if self._begin_callback:
            self._begin_callback()

        for parser in self._parsers:
            parser.start_callback()

        # record the start time
        if not self._redisdb.hexists(self._tab_spider_status, SPIDER_START_TIME_KEY):
            current_timestamp = tools.get_current_timestamp()
            self._redisdb.hset(
                self._tab_spider_status, SPIDER_START_TIME_KEY, current_timestamp
            )

            # send notification
            self.send_msg("Spider <%s> started" % self._spider_name)

    def spider_end(self):
        self.record_end_time()

        if self._end_callback:
            self._end_callback()

        for parser in self._parsers:
            if not self._keep_alive:
                parser.close()
            parser.end_callback()

        if not self._keep_alive:
            # close the webdriver
            if Request.webdriver_pool:
                Request.webdriver_pool.close()

            # close the metrics system
            metrics.close()
        else:
            metrics.flush()

        # compute the total crawl time
        data = self._redisdb.hget(
            self._tab_spider_status, SPIDER_START_TIME_KEY, is_pop=True
        )
        if data:
            begin_timestamp = int(data)
            spend_time = tools.get_current_timestamp() - begin_timestamp

            msg = "Spider <%s> finished, took %s" % (
                self._spider_name,
                tools.format_seconds(spend_time),
            )
            log.info(msg)

            self.send_msg(msg)

        if self._keep_alive:
            log.info("Spider does not stop automatically; waiting for the next round of tasks...")
        else:
            self.delete_tables(self._tab_spider_status)

    def record_end_time(self):
        # record the end time
        if self._batch_interval:
            current_timestamp = tools.get_current_timestamp()
            self._redisdb.hset(
                self._tab_spider_status, SPIDER_END_TIME_KEY, current_timestamp
            )

    def is_reach_next_spider_time(self):
        if not self._batch_interval:
            return True

        last_spider_end_time = self._redisdb.hget(
            self._tab_spider_status, SPIDER_END_TIME_KEY
        )
        if last_spider_end_time:
            last_spider_end_time = int(last_spider_end_time)
            current_timestamp = tools.get_current_timestamp()
            time_interval = current_timestamp - last_spider_end_time

            if time_interval < self._batch_interval * 86400:
                log.info(
                    "Last run finished at {}; only {} has passed, less than the configured crawl interval {}. Spider will not run; exiting~".format(
                        tools.timestamp_to_date(last_spider_end_time),
                        tools.format_seconds(time_interval),
                        tools.format_seconds(self._batch_interval * 86400),
                    )
                )
                return False

        return True

    def record_spider_state(
        self,
        spider_type,
        state,
        batch_date=None,
        spider_start_time=None,
        spider_end_time=None,
        batch_interval=None,
    ):
        pass

    def join(self, timeout=None):
        """
        Override Thread.join
        """
        if not self._started.is_set():
            return

        super().join()

    def heartbeat(self):
        self._redisdb.hset(
            self._tab_spider_status, HEARTBEAT_TIME_KEY, tools.get_current_timestamp()
        )

    def have_alive_spider(self, heartbeat_interval=10):
        # note: despite its name, this returns True when the recorded heartbeat is
        # older than heartbeat_interval seconds, i.e. the last spider stopped reporting
        heartbeat_time = self._redisdb.hget(self._tab_spider_status, HEARTBEAT_TIME_KEY)
        if heartbeat_time:
            heartbeat_time = int(heartbeat_time)
            current_timestamp = tools.get_current_timestamp()
            if current_timestamp > heartbeat_time + heartbeat_interval:
                return True
        return False

    def reset_task(self, heartbeat_interval=10):
        """
        Reset lost tasks
        Returns:

        """
        if self.have_alive_spider(heartbeat_interval=heartbeat_interval):
            current_timestamp = tools.get_current_timestamp()
            datas = self._redisdb.zrangebyscore_set_score(
                self._tab_requests,
                priority_min=current_timestamp,
                priority_max=current_timestamp + setting.REQUEST_LOST_TIMEOUT,
                score=300,
                count=None,
            )
            lose_count = len(datas)
            if lose_count:
                log.info("Finished resetting lost tasks; {} in total".format(lose_count))
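
# Compatibility sketch (illustrative): the newer Scheduler accepts the legacy
# auto_stop_when_spider_done flag via **kwargs and maps it onto keep_alive, so
# the two constructions below configure the same behavior. "test:spider" is a
# hypothetical redis_key, and a reachable redis configured in setting is assumed.
if __name__ == "__main__":
    legacy = Scheduler(redis_key="test:spider", auto_stop_when_spider_done=False)
    current = Scheduler(redis_key="test:spider", keep_alive=True)
    assert legacy._keep_alive == current._keep_alive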