def get_user(self, block=True) -> Optional[GuestUser]:
    """
    Fetch one guest user from the pool.

    Args:
        block: whether to wait (and trigger user production via run())
            when the pool is currently empty.

    Returns:
        GuestUser or None: None when no user is available and block is
        False (or the fetched entry was empty).
    """
    while True:
        try:
            user_id = self._get_user_id()
            user_str = None
            if user_id:
                user_str = self._redisdb.hget(self._tab_user_pool, user_id)
                # Got an id but no user data: another spider probably
                # deleted this user -- refresh the locally cached user ids
                # and try again.
                if not user_str:
                    self._load_users_id()
                    continue
            if not user_id and block:
                self._keep_alive = False
                # wait_timeout=0: only one process produces users; the
                # others fail to lock immediately and just retry the pool.
                with RedisLock(key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0) as _lock:
                    if _lock.locked:
                        self.run()
                continue
            # NOTE(review): eval() on data read back from redis. The pool is
            # self-produced, but a safer serializer (e.g. json) would be
            # preferable -- confirm nothing external can write this hash.
            return user_str and GuestUser(**eval(user_str))
        except Exception as e:
            log.exception(e)
            tools.delay_time(1)
def check_filter_capacity(self):
    """
    Check whether the newest filter is at capacity and, if so, append
    fresh filters. The check runs at most once every 30 minutes.
    @return:
    """
    need_check = (
        not self._check_capacity_time
        or time.time() - self._check_capacity_time > 1800
    )
    if not need_check:
        return

    if self.bitarray_type == ScalableBloomFilter.BASE_MEMORY:
        # In-memory bitarray: a thread lock is sufficient.
        with self._thread_lock:
            while self.filters[-1].is_at_capacity:
                self.filters.append(self.create_filter())
            self._check_capacity_time = time.time()
    else:
        # Global lock: only one process actually creates the new filter;
        # once it is done, the other processes merely append the filter
        # that was just created.
        with RedisLock(
            key="ScalableBloomFilter",
            timeout=300,
            wait_timeout=300,
            redis_cli=RedisDB(url=self.redis_url).get_redis_obj(),
        ) as lock:
            if lock.locked:
                while self.filters[-1].is_at_capacity:
                    self.filters.append(self.create_filter())
                self._check_capacity_time = time.time()
def check_filter_capacity(self):
    """
    Check whether the newest filter is at capacity and, if so, append
    fresh filters. The check runs at most once every 30 minutes.
    @return:
    """
    # Checked recently -- nothing to do.
    if self._check_capacity_time and time.time() - self._check_capacity_time <= 1800:
        return

    if self.bitarray_type == ScalableBloomFilter.BASE_MEMORY:
        # In-memory bitarray: a thread lock is sufficient.
        with self._thread_lock:
            while self.filters[-1].is_at_capacity:
                self.filters.append(self.create_filter())
            self._check_capacity_time = time.time()
    else:
        # Global lock: only one process actually creates the new filter;
        # once it is done, the other processes merely append the filter
        # that was just created.
        key = f"ScalableBloomFilter:{self.name}" if self.name else "ScalableBloomFilter"
        with RedisLock(key=key, redis_cli=self._redis_cli) as lock:
            if lock.locked:
                while self.filters[-1].is_at_capacity:
                    self.filters.append(self.create_filter())
                self._check_capacity_time = time.time()
def run(self):
    """
    Produce cookies: under the pool lock, log in every user returned by
    get_user_info() and save the resulting cookies.

    Each user is retried up to self._login_retry_times times; a user whose
    login returns no cookie or keeps raising is handed to
    handle_login_failed_user().

    Raises:
        ValueError: when get_user_info() does not return an iterable.
    """
    with RedisLock(
        key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=100
    ) as _lock:
        if not _lock.locked:
            return

        user_infos = self.get_user_info()
        if not isinstance(user_infos, Iterable):
            raise ValueError("get_user_info 返回值必须可迭代")

        # Materialize first: generators are always truthy, so without this
        # the empty check below never fires for lazy get_user_info()
        # implementations.
        user_infos = list(user_infos)
        if not user_infos:
            log.info("无可用用户")

        for username, password in user_infos:
            for _ in range(self._login_retry_times):
                try:
                    cookie = self.create_cookie(username, password)
                    if cookie:
                        self.save_cookie(username, cookie)
                    else:
                        self.handle_login_failed_user(username, password)
                    break
                except Exception as e:
                    self.handel_exception(e)
            else:
                # Every retry raised -- give up on this user.
                self.handle_login_failed_user(username, password)
def run(self):
    """
    Produce users for the pool: log every loaded user in and store it.

    Loops while self._keep_alive is truthy (sleeping 10 s between rounds);
    otherwise performs a single round and returns. The redis lock with
    wait_timeout=0 ensures only one producer works at a time -- other
    processes simply skip the round.
    """
    while True:
        try:
            try:
                with RedisLock(
                    key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0
                ) as _lock:
                    if _lock.locked:
                        for user in self._load_user():
                            retry_times = 0
                            # Retry each user's login up to
                            # self._login_retry_times times.
                            while retry_times <= self._login_retry_times:
                                try:
                                    user = self.login(user)
                                    if user:
                                        self.add_user(user)
                                    else:
                                        self.handle_login_failed_user(user)
                                    break
                                except NotImplementedError:
                                    log.error(
                                        f"{self.__class__.__name__} must be implementation login method!"
                                    )
                                    os._exit(0)
                                except Exception as e:
                                    self.handel_exception(e)
                                log.debug(
                                    f"login failed, user: {user} retry_times: {retry_times}"
                                )
                                retry_times += 1
                            else:
                                # while exhausted without a break:
                                # all retries failed.
                                self.handle_login_failed_user(user)

                        now_user_count = self._redisdb.hget_count(
                            self._tab_user_pool
                        )
                        log.info("当前在线user数为 {}".format(now_user_count))
            except Exception as e:
                log.exception(e)

            if self._keep_alive:
                tools.delay_time(10)
            else:
                break
        except Exception as e:
            log.exception(e)
            tools.delay_time(1)
def get_cookie(self, wait_when_null=True):
    """
    Take a cookie from the pool, rotating it back to the head of the list.

    Args:
        wait_when_null: when the pool is empty, trigger cookie production
            and keep retrying instead of returning immediately.

    Returns:
        dict: the cookie, or {} when the pool is empty and
        wait_when_null is False.
    """
    while True:
        try:
            cookie_info = self._redisdb.rpoplpush(self._tab_cookie_pool)
            if cookie_info:
                # NOTE(review): eval() on data read back from redis; the
                # pool is self-produced, but a safer serializer would be
                # preferable.
                return eval(cookie_info)

            if wait_when_null:
                log.info("暂无cookie 生产中...")
                self._keep_alive = False
                self._min_cookies = 1
                # Only one process produces cookies at a time.
                with RedisLock(
                    key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=5
                ) as _lock:
                    if _lock.locked:
                        self.run()
                continue

            return {}
        except Exception as e:
            log.exception(e)
            tools.delay_time(1)
def _start(self):
    """Start every worker component, then dispatch tasks."""
    # Buffers first, so parser output has somewhere to go.
    self._request_buffer.start()
    self._item_buffer.start()
    # Collector feeds the parser controls.
    self._collector.start()

    # One parser control per worker thread; each knows every parser.
    for _ in range(self._thread_count):
        parser_control = self._parser_control_obj(
            self._collector,
            self._redis_key,
            self._request_buffer,
            self._item_buffer,
        )
        for parser in self._parsers:
            parser_control.add_parser(parser)
        parser_control.start()
        self._parser_controls.append(parser_control)

    # Dispatch tasks last -- it may take a long time.
    if setting.RETRY_FAILED_REQUESTS:
        # Re-queue failed requests; the operation is atomic, no lock needed.
        handle_failed_requests = HandleFailedRequests(self._redis_key)
        handle_failed_requests.reput_failed_requests_to_requests()

    # Dispatch fresh tasks.
    if self._auto_start_requests:
        if not self.wait_lock:
            self.__add_task()
        else:
            # Lock around task insertion so multiple processes starting at
            # once do not add duplicate tasks.
            with RedisLock(
                key=self._spider_name,
                timeout=3600,
                wait_timeout=60,
                redis_cli=RedisDB().get_redis_obj(),
            ) as lock:
                if lock.locked:
                    self.__add_task()
def task_is_done(self):
    """
    Check whether the current batch is done, refreshing the cached batch
    date at the same time. (Must not crash -- if it did, the batch date
    would stop being refreshed.)

    Returns:
        bool: True when the batch is done, False otherwise.
    """
    is_done = False

    # Read the latest batch record's state.
    sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count, is_done from {batch_record_table} order by id desc limit 1'.format(
        date_format=self._date_format.replace(":%M", ":%i"),
        batch_record_table=self._batch_record_table,
    )
    batch_info = self._mysqldb.find(sql)
    if batch_info is None:
        raise Exception("查询批次信息失败")

    if batch_info:
        # Refresh self._batch_date_cache: a new batch may already have
        # started while we still held the old batch date.
        self._batch_date_cache, total_count, done_count, is_done = batch_info[
            0]

        log.info("《%s》 批次时间%s 批次进度 %s/%s 完成状态 %d" % (
            self._batch_name,
            self._batch_date_cache,
            done_count,
            total_count,
            is_done,
        ))
        # Propagate the batch date to BatchParser via the environment.
        os.environ[
            "batch_date"] = self._batch_date_cache

    if is_done:
        # The batch record says done, but the task table may still hold
        # unfinished tasks. That check is expensive, so take a lock to keep
        # multiple processes from running it simultaneously.
        with RedisLock(
            key=self._spider_name,
            timeout=3600,
            wait_timeout=0,
            redis_cli=RedisDB().get_redis_obj(),
        ) as lock:
            if lock.locked:
                log.info("批次表标记已完成,正在检查任务表是否有未完成的任务")

                sql = "select 1 from %s where (%s = 0 or %s=2)%s limit 1" % (
                    self._task_table,
                    self._task_state,
                    self._task_state,
                    self._task_condition_prefix_and,
                )
                tasks = self._mysqldb.find(sql)  # [(1,)] / []
                if tasks:
                    log.info("检测到任务表中有未完成任务,等待任务下发")
                    is_done = False

                    # Flip is_done back in the batch record so future checks
                    # can skip the task-table query.
                    sql = 'update {batch_record_table} set is_done = 0 where batch_date = "{batch_date}"'.format(
                        batch_record_table=self._batch_record_table,
                        batch_date=self._batch_date_cache,
                    )
                    self._mysqldb.update(sql)
                else:
                    log.info("任务表中任务均已完成,爬虫结束")
            else:
                # Another process holds the lock and is doing the check;
                # report not-done and keep waiting.
                log.info("批次表标记已完成,其他爬虫进程正在检查任务表是否有未完成的任务,本进程跳过检查,继续等待")
                is_done = False

    return is_done
def test_lock():
    """Manually exercise RedisLock: acquire it, then hold it for 100 s."""
    with RedisLock(
        key="test", redis_cli=RedisDB().get_redis_obj(), wait_timeout=10
    ) as _lock:
        if _lock.locked:
            print(1)
            time.sleep(100)
def run(self, username=None):
    """
    Log gold users in and keep their cookies in the pool.

    Args:
        username: when given, only the user with this username is
            processed; all others are skipped.

    Loops while self._keep_alive is truthy (sleeping 10 s between rounds);
    otherwise performs a single round and returns. The redis lock with
    wait_timeout=0 ensures a single producer -- other processes skip the
    round.
    """
    while True:
        try:
            with RedisLock(key=self._tab_user_pool, lock_timeout=3600,
                           wait_timeout=0) as _lock:
                if _lock.locked:
                    self.__sycn_users_info()
                    online_user = 0
                    for user in self.users:
                        if username and username != user.username:
                            continue
                        try:
                            # Already holds cookies -> counted as online,
                            # no login needed.
                            if user.cookies:
                                online_user += 1
                                continue

                            # Pre-check: respect the minimum interval
                            # between two logins of the same account.
                            if not user.is_time_to_login():
                                log.info("账号{}与上次登录时间间隔过短,暂不登录: 将在{}登录使用".
                                         format(user.username,
                                                user.next_login_time()))
                                continue

                            user = self.login(user)
                            if user.cookies:
                                # Save the cookie and record success.
                                user.set_login_time()
                                self.add_user(user)
                                self.record_user_status(
                                    user.user_id,
                                    GoldUserStatus.LOGIN_SUCCESS)
                                log.debug("登录成功 {}".format(user.username))
                                online_user += 1
                            else:
                                log.info("登录失败 {}".format(user.username))
                                self.record_user_status(
                                    user.user_id,
                                    GoldUserStatus.LOGIN_FALIED)
                        except NotImplementedError:
                            log.error(
                                f"{self.__class__.__name__} must be implementation login method!"
                            )
                            os._exit(0)
                        except Exception as e:
                            # Record the failure and alert.
                            log.exception(e)
                            msg = f"{user.username} 账号登陆失败 exception: {str(e)}"
                            log.info(msg)
                            self.record_user_status(
                                user.user_id, GoldUserStatus.LOGIN_FALIED)
                            send_msg(
                                msg=msg,
                                level="error",
                                message_prefix=f"{user.username} 账号登陆失败",
                            )

                    log.info("当前在线user数为 {}".format(online_user))

            if self._keep_alive:
                time.sleep(10)
            else:
                break
        except Exception as e:
            log.exception(e)
            time.sleep(1)