Пример #1
0
class NormalUserPool(UserPoolInterface):
    """
    Pool of ordinary users, suited to cases where accounts are cheap and plentiful.

    Account credentials are read from a MySQL table (created on construction
    if missing). Users that logged in successfully are stored in a Redis hash
    keyed by user id, and their login state is mirrored back to the MySQL table.
    Subclasses must implement `login`.
    """

    def __init__(
        self,
        redis_key,
        *,
        table_userbase,
        login_state_key="login_state",
        lock_state_key="lock_state",
        username_key="username",
        password_key="password",
        login_retry_times=1,
        keep_alive=False,
    ):
        """
        @param redis_key: project name, used to build the Redis key of the pool
        @param table_userbase: name of the MySQL user table
        @param login_state_key: column name of the login state
        @param lock_state_key: column name of the locked (banned) state
        @param username_key: column name of the login name
        @param password_key: column name of the password
        @param login_retry_times: number of retries after a failed login
        @param keep_alive: whether `run` keeps looping so users are topped up as soon as they run short
        """

        self._tab_user_pool = setting.TAB_USER_POOL.format(
            redis_key=redis_key, user_type="normal"
        )

        self._login_retry_times = login_retry_times
        self._table_userbase = table_userbase
        self._login_state_key = login_state_key
        self._lock_state_key = lock_state_key
        self._username_key = username_key
        self._password_key = password_key
        self._keep_alive = keep_alive

        # Local cache of the user ids currently present in the Redis pool
        self._users_id = []

        self._redisdb = RedisDB()
        self._mysqldb = MysqlDB()

        self._create_userbase()

    def _load_users_id(self):
        """Refresh the local id cache from the Redis pool, in random order."""
        self._users_id = self._redisdb.hkeys(self._tab_user_pool)
        if self._users_id:
            random.shuffle(self._users_id)

    def _get_user_id(self):
        """Pop one cached user id, reloading the cache first if it is empty.

        @return: a user id, or None when the pool holds no users
        """
        if not self._users_id:
            self._load_users_id()

        if self._users_id:
            return self._users_id.pop()

    def _create_userbase(self):
        """Create the MySQL user table if it does not exist yet."""
        # The unique index must reference the configured username column;
        # a hard-coded `username` column breaks the statement whenever a
        # custom username_key is supplied.
        sql = f"""
            CREATE TABLE IF NOT EXISTS `{self._table_userbase}` (
              `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
              `{self._username_key}` varchar(50) DEFAULT NULL COMMENT '用户名',
              `{self._password_key}` varchar(255) DEFAULT NULL COMMENT '密码',
              `{self._login_state_key}` int(11) DEFAULT '0' COMMENT '登录状态(0未登录 1已登录)',
              `{self._lock_state_key}` int(11) DEFAULT '0' COMMENT '账号是否被封(0 未封 1 被封)',
              PRIMARY KEY (`id`),
              UNIQUE KEY `username` (`{self._username_key}`) USING BTREE
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """
        self._mysqldb.execute(sql)

    def _load_user(self) -> Iterable[NormalUser]:
        """
        Yield every account that is neither locked nor already logged in.
        @return: yields NormalUser objects built from (id, username, password) rows
        """

        sql = "select id, {username_key}, {password_key} from {table_userbase} where {lock_state_key} != 1 and {login_state_key} != 1".format(
            username_key=self._username_key,
            password_key=self._password_key,
            table_userbase=self._table_userbase,
            lock_state_key=self._lock_state_key,
            login_state_key=self._login_state_key,
        )

        for user_id, username, password in self._mysqldb.find(sql):
            yield NormalUser(user_id=user_id, username=username, password=password)

    def handle_login_failed_user(self, user: NormalUser):
        """
        Hook invoked with the account whose login failed. Does nothing by default.
        @return:
        """

        pass

    def handel_exception(self, e: Exception):
        """
        Hook invoked when a login attempt raises. Logs the exception by default.
        @param e: the exception raised during login
        @return:
        """
        log.exception(e)

    def login(self, user: NormalUser) -> NormalUser:
        """
        Log the user in and produce its cookies. Must be implemented by subclasses.
        A falsy return value is treated as a failed login.
        """
        raise NotImplementedError

    def add_user(self, user: NormalUser):
        """Store a logged-in user in the Redis pool and mark it as logged in in MySQL."""
        log.debug("add {}".format(user))
        self._redisdb.hset(self._tab_user_pool, user.user_id, user.to_dict())

        sql = "update {table_userbase} set {login_state_key} = 1 where id = {user_id}".format(
            table_userbase=self._table_userbase,
            login_state_key=self._login_state_key,
            user_id=user.user_id,
        )
        self._mysqldb.update(sql)

    def get_user(self, block=True) -> Optional[NormalUser]:
        """
        Fetch a user from the pool.
        @param block: when no user is available, run the login routine and retry
            instead of returning None. Note this switches keep_alive off so that
            `run` returns after a single pass.
        @return: a NormalUser, or None when the pool is empty and block is False
        """
        while True:
            try:
                user_id = self._get_user_id()
                user_str = None
                if user_id:
                    user_str = self._redisdb.hget(self._tab_user_pool, user_id)
                    # Nothing stored under this id: another crawler probably
                    # deleted the user, so refresh the locally cached ids.
                    if not user_str:
                        self._load_users_id()
                        continue

                if not user_id and block:
                    self._keep_alive = False
                    self.run()
                    continue

                # NOTE(review): eval() executes whatever is stored in Redis;
                # only safe while the Redis instance is fully trusted.
                return NormalUser(**eval(user_str)) if user_str else None
            except Exception as e:
                log.exception(e)
                tools.delay_time(1)

    def del_user(self, user_id: int):
        """
        Remove an invalid user from the pool and reset its login state in MySQL.
        @return:
        """
        self._redisdb.hdel(self._tab_user_pool, user_id)
        self._load_users_id()

        sql = "update {table_userbase} set {login_state_key} = 0 where id = {user_id}".format(
            table_userbase=self._table_userbase,
            login_state_key=self._login_state_key,
            user_id=user_id,
        )

        self._mysqldb.update(sql)

    def tag_user_locked(self, user_id: int):
        """
        Mark the user as locked (banned) in MySQL.
        """
        sql = "update {table_userbase} set {lock_state_key} = 1 where id = {user_id}".format(
            table_userbase=self._table_userbase,
            lock_state_key=self._lock_state_key,
            user_id=user_id,
        )

        self._mysqldb.update(sql)

    def _login_with_retry(self, user: NormalUser):
        """Try to log one account in, retrying up to login_retry_times after exceptions."""
        retry_times = 0
        while retry_times <= self._login_retry_times:
            try:
                # Keep the login result in its own variable so that the failure
                # hook still receives the original account when login() returns
                # a falsy value.
                logged_in_user = self.login(user)
                if logged_in_user:
                    self.add_user(logged_in_user)
                else:
                    self.handle_login_failed_user(user)
                return
            except NotImplementedError:
                log.error(
                    f"{self.__class__.__name__} must be implementation login method!"
                )
                # A missing login implementation is fatal: terminate the process.
                os._exit(0)
            except Exception as e:
                self.handel_exception(e)
            log.debug(f"login failed, user: {user} retry_times: {retry_times}")
            retry_times += 1

        # All attempts raised
        self.handle_login_failed_user(user)

    def run(self):
        """
        Log in every available account and add it to the pool.

        The work is only done while holding the Redis lock keyed by the pool
        name. Loops every 10 seconds when keep_alive is set, otherwise performs
        a single pass.
        """
        while True:
            try:
                try:
                    with RedisLock(
                        key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0
                    ) as _lock:
                        if _lock.locked:
                            for user in self._load_user():
                                self._login_with_retry(user)

                            now_user_count = self._redisdb.hget_count(
                                self._tab_user_pool
                            )
                            log.info("当前在线user数为 {}".format(now_user_count))

                except Exception as e:
                    log.exception(e)

                if self._keep_alive:
                    tools.delay_time(10)
                else:
                    break

            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
Пример #2
0
class GuestUserPool(UserPoolInterface):
    """
    Pool of guest users, for sites that do not require a login.

    Users are produced by opening a page with a WebDriver and capturing its
    cookies; they are stored in a Redis hash keyed by user id.
    """

    def __init__(
        self,
        redis_key,
        page_url=None,
        min_users=1,
        must_contained_keys=(),
        keep_alive=False,
        **kwargs,
    ):
        """
        @param redis_key: prefix of the Redis key under which users are stored
        @param page_url: url visited to produce a user
        @param min_users: minimum number of users to keep in the pool
        @param must_contained_keys: keys that must be present in the cookies; used to validate them
        @param keep_alive: whether `run` keeps looping so users are topped up as soon as they run short
        ---
        @param kwargs: options forwarded to WebDriver
            load_images: whether to load images
            user_agent: a string, or a zero-argument function returning the user agent
            proxy: xxx.xxx.xxx.xxx:xxxx, or a zero-argument function returning the proxy address
            headless: whether to run in headless mode
            driver_type: CHROME, PHANTOMJS or FIREFOX
            timeout: request timeout
            window_size: browser window size
            executable_path: path to the browser driver; the default path is used when omitted
        """

        self._redisdb = RedisDB()

        self._tab_user_pool = setting.TAB_USER_POOL.format(
            redis_key=redis_key, user_type="guest"
        )
        self._page_url = page_url
        self._min_users = min_users
        self._must_contained_keys = must_contained_keys
        self._keep_alive = keep_alive

        self._kwargs = kwargs
        self._kwargs.setdefault("load_images", False)
        self._kwargs.setdefault("headless", True)

        # Local cache of the user ids currently present in the Redis pool
        self._users_id = []

    def _load_users_id(self):
        """Refresh the local id cache from the Redis pool, in random order."""
        self._users_id = self._redisdb.hkeys(self._tab_user_pool)
        if self._users_id:
            random.shuffle(self._users_id)

    def _get_user_id(self):
        """Pop one cached user id, reloading the cache first if it is empty.

        @return: a user id, or None when the pool holds no users
        """
        if not self._users_id:
            self._load_users_id()

        if self._users_id:
            return self._users_id.pop()

    def login(self) -> Optional[GuestUser]:
        """
        Produce a user by visiting page_url with a WebDriver and capturing the
        cookies. May be overridden.
        @return: a GuestUser, or None when a required cookie key is missing
        """
        with WebDriver(**self._kwargs) as driver:
            driver.get(self._page_url)

            cookies = driver.cookies

            if all(key in cookies for key in self._must_contained_keys):
                return GuestUser(user_agent=driver.user_agent, cookies=cookies)

            log.error("获取cookie失败 cookies = {}".format(cookies))
            return None

    def add_user(self, user: GuestUser):
        """Store a user in the Redis pool."""
        log.debug("add {}".format(user))
        self._redisdb.hset(self._tab_user_pool, user.user_id, user.to_dict())

    def get_user(self, block=True) -> Optional[GuestUser]:
        """
        Fetch a user from the pool.

        Args:
            block: when no user is available, produce users and retry instead
                of returning None. Note this switches keep_alive off so that
                `run` returns once enough users exist.

        Returns:
            A GuestUser, or None when the pool is empty and block is False.
        """
        while True:
            try:
                user_id = self._get_user_id()
                user_str = None
                if user_id:
                    user_str = self._redisdb.hget(self._tab_user_pool, user_id)
                    # Nothing stored under this id: another crawler probably
                    # deleted the user, so refresh the locally cached ids.
                    if not user_str:
                        self._load_users_id()
                        continue

                if not user_id and block:
                    self._keep_alive = False
                    with RedisLock(
                        key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0
                    ) as _lock:
                        got_lock = _lock.locked
                        if got_lock:
                            self.run()
                    if not got_lock:
                        # Presumably another process holds the lock and is
                        # producing users; back off instead of spinning on Redis.
                        tools.delay_time(1)
                    continue

                # NOTE(review): eval() executes whatever is stored in Redis;
                # only safe while the Redis instance is fully trusted.
                return GuestUser(**eval(user_str)) if user_str else None
            except Exception as e:
                log.exception(e)
                tools.delay_time(1)

    def del_user(self, user_id: str):
        """Remove a user from the Redis pool and refresh the local id cache."""
        self._redisdb.hdel(self._tab_user_pool, user_id)
        self._load_users_id()

    def run(self):
        """
        Produce users until the pool holds at least min_users.

        Once enough users exist, sleeps 10 seconds and checks again when
        keep_alive is set, otherwise returns.
        """
        while True:
            try:
                now_user_count = self._redisdb.hget_count(self._tab_user_pool)
                need_user_count = self._min_users - now_user_count

                if need_user_count > 0:
                    log.info(
                        "当前在线user数为 {} 小于 {}, 生产user".format(
                            now_user_count, self._min_users
                        )
                    )
                    produced = False
                    try:
                        user = self.login()
                        if user:
                            self.add_user(user)
                            produced = True
                    except Exception as e:
                        log.exception(e)

                    if not produced:
                        # Back off after a failed attempt so a persistently
                        # failing login does not become a tight retry loop.
                        tools.delay_time(1)
                else:
                    log.debug("当前user数为 {} 数量足够 暂不生产".format(now_user_count))

                    if self._keep_alive:
                        tools.delay_time(10)
                    else:
                        break

            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
Пример #3
0
class GoldUserPool(UserPoolInterface):
    """
    UserPool for expensive accounts whose number of queries is limited.

    The accounts are supplied by the caller; their login information is kept
    in a Redis hash keyed by user id. Subclasses must implement `login`.
    """

    def __init__(
        self,
        redis_key,
        *,
        users: List[GoldUser],
        keep_alive=False,
    ):
        """
        @param redis_key: prefix of the Redis key under which users are stored
        @param users: account information
        @param keep_alive: whether `run` keeps looping so users are topped up as soon as they run short
        @raise ValueError: when users is empty
        """
        self._tab_user_pool = setting.TAB_USER_POOL.format(
            redis_key=redis_key, user_type="gold"
        )

        self.users = users
        self._keep_alive = keep_alive

        self._redisdb = RedisDB()
        # Local cache of the user ids currently present in the Redis pool
        self._users_id = []

        if not users:
            raise ValueError("not users")

        # Assign class-level attributes on the user class so that user objects
        # can reach Redis themselves.
        self.users[0].__class__.redisdb = self._redisdb
        self.users[0].__class__.redis_key = self._tab_user_pool

        self.__init_metrics()
        self.__sync_users_base_info()
        self.__sycn_users_info()

    def __init_metrics(self):
        """Initialise the metrics client with the project settings."""
        metrics.init(**setting.METRICS_OTHER_ARGS)

    def __sync_users_base_info(self):
        """Push the locally supplied base info of each user to Redis.

        Only meant to be called from the constructor.
        """
        for user in self.users:
            cache_user = self.get_user_by_id(user.user_id)
            if cache_user:
                for key, value in user.to_dict().items():
                    # Keys starting with "_" are left as cached in Redis
                    if not key.startswith("_"):
                        setattr(cache_user, key, value)
                cache_user.sycn_to_redis()

    def __sycn_users_info(self):
        """Replace local users with their cached counterpart from Redis, when one exists."""
        for index, user in enumerate(self.users):
            cache_user = self.get_user_by_id(user.user_id)
            if cache_user:
                self.users[index] = cache_user

    def _load_users_id(self):
        """Refresh the local id cache from the Redis pool, in random order."""
        self._users_id = self._redisdb.hkeys(self._tab_user_pool)
        if self._users_id:
            random.shuffle(self._users_id)

    def _get_user_id(self):
        """Pop one cached user id, reloading the cache first if it is empty.

        @return: a user id, or None when the pool holds no users
        """
        if not self._users_id:
            self._load_users_id()

        if self._users_id:
            return self._users_id.pop()

    def login(self, user: GoldUser) -> GoldUser:
        """
        Log the user in and produce its cookies. Must be implemented by subclasses.
        """
        raise NotImplementedError

    def get_user_by_id(self, user_id: str) -> Optional[GoldUser]:
        """
        Load a user from Redis by id.
        @return: the GoldUser, or None when nothing is stored under that id
        """
        user_str = self._redisdb.hget(self._tab_user_pool, user_id)
        if user_str:
            # NOTE(review): eval() executes whatever is stored in Redis;
            # only safe while the Redis instance is fully trusted.
            return GoldUser(**eval(user_str))
        return None

    def get_user(
        self,
        block=True,
        username=None,
        used_for_spider_name=None,
        not_limit_use_interval=False,
    ) -> Optional[GoldUser]:
        """
        Fetch a usable user from the pool.
        @params username: fetch this specific user
        @params used_for_spider_name: name of the spider using the user exclusively; other spiders cannot take it over
        @params block: whether to wait when no user is available
        @params not_limit_use_interval: do not enforce the minimum interval between uses
        @return: GoldUser, or None when block is False and no user is available
        """
        while True:
            try:
                user_id = username or self._get_user_id()
                user_str = None
                if user_id:
                    user_str = self._redisdb.hget(self._tab_user_pool, user_id)

                if not user_id or not user_str:
                    if not block:
                        # Nothing cached and the caller does not want to wait.
                        # Falling through would eval(None), raise, and retry forever.
                        return None
                    # Switch keep_alive off so run() returns after one pass
                    self._keep_alive = False
                    self.run(username)
                    continue

                # Got a user
                # NOTE(review): eval() executes whatever is stored in Redis;
                # only safe while the Redis instance is fully trusted.
                user = GoldUser(**eval(user_str))

                # Exclusive use: when the user is held by another spider, it
                # may only be taken once the exclusive time has elapsed since
                # its last use.
                if (
                    user.get_used_for_spider_name()
                    and user.get_used_for_spider_name() != used_for_spider_name
                ):
                    wait_time = time.time() - user.get_last_use_time()
                    if wait_time < user.exclusive_time:
                        log.info(
                            "用户{} 被 {} 爬虫独占,需等待 {} 秒后才可使用".format(
                                user.username,
                                user.get_used_for_spider_name(),
                                user.exclusive_time - wait_time,
                            )
                        )
                        time.sleep(1)
                        continue

                if not user.is_overwork() and user.is_at_work_time():
                    if not user.cookies:
                        log.debug(f"用户 {user.username} 未登录,尝试登录")
                        self._keep_alive = False
                        self.run(username)
                        continue

                    if not_limit_use_interval or user.is_time_to_use():
                        user.set_used_for_spider_name(used_for_spider_name)
                        log.debug("使用用户 {}".format(user.username))
                        self.record_user_status(user.user_id, GoldUserStatus.USED)
                        return user
                    else:
                        log.debug("{} 用户使用间隔过短 查看下一个用户".format(user.username))
                        time.sleep(1)
                        continue
                else:
                    if not user.is_at_work_time():
                        log.info("用户 {} 不在工作时间 sleep 60s".format(user.username))
                        if block:
                            time.sleep(60)
                            continue
                        else:
                            return None
                    # NOTE(review): an overworked user inside its work time
                    # falls through here and the loop immediately tries the
                    # next user without any delay.

            except Exception as e:
                log.exception(e)
                time.sleep(1)

    def del_user(self, user_id: str):
        """Invalidate a user by clearing its cookies and record it as overdue."""
        user = self.get_user_by_id(user_id)
        if user:
            user.set_cookies(None)
            self.record_user_status(user.user_id, GoldUserStatus.OVERDUE)

    def add_user(self, user: GoldUser):
        """Persist the user to Redis."""
        user.sycn_to_redis()

    def delay_use(self, user_id: str, delay_seconds: int):
        """Postpone the next use of a user by delay_seconds and record it as sleeping."""
        user = self.get_user_by_id(user_id)
        if user:
            user.set_delay_use(delay_seconds)

        self.record_user_status(user_id, GoldUserStatus.SLEEP)

    def record_success_user(self, user_id: str):
        """Record a successful use of the user."""
        self.record_user_status(user_id, GoldUserStatus.SUCCESS)

    def record_exception_user(self, user_id: str):
        """Record a use of the user that ended with an exception."""
        self.record_user_status(user_id, GoldUserStatus.EXCEPTION)

    def run(self, username=None):
        """
        Log in every user that has no cookies yet.

        The work is only done while holding the Redis lock keyed by the pool
        name. Loops every 10 seconds when keep_alive is set, otherwise performs
        a single pass.
        @param username: when given, only this user is processed
        """
        while True:
            try:
                with RedisLock(
                    key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0
                ) as _lock:
                    if _lock.locked:
                        self.__sycn_users_info()
                        online_user = 0
                        for user in self.users:
                            if username and username != user.username:
                                continue

                            try:
                                if user.cookies:
                                    online_user += 1
                                    continue

                                # Pre-check: respect the minimum interval between logins
                                if not user.is_time_to_login():
                                    log.info(
                                        "账号{}与上次登录时间间隔过短,暂不登录: 将在{}登录使用".format(
                                            user.username, user.next_login_time()
                                        )
                                    )
                                    continue

                                # Keep the login result separate from the loop
                                # variable: if login() returns None the failure
                                # paths below still need the original account.
                                logged_in_user = self.login(user)
                                if logged_in_user and logged_in_user.cookies:
                                    # Save the cookies
                                    logged_in_user.set_login_time()
                                    self.add_user(logged_in_user)
                                    self.record_user_status(
                                        logged_in_user.user_id,
                                        GoldUserStatus.LOGIN_SUCCESS,
                                    )
                                    log.debug(
                                        "登录成功 {}".format(logged_in_user.username)
                                    )
                                    online_user += 1
                                else:
                                    log.info("登录失败 {}".format(user.username))
                                    self.record_user_status(
                                        user.user_id, GoldUserStatus.LOGIN_FALIED
                                    )
                            except NotImplementedError:
                                log.error(
                                    f"{self.__class__.__name__} must be implementation login method!"
                                )
                                # A missing login implementation is fatal: terminate the process.
                                os._exit(0)
                            except Exception as e:
                                log.exception(e)
                                msg = f"{user.username} 账号登陆失败 exception: {str(e)}"
                                log.info(msg)
                                self.record_user_status(
                                    user.user_id, GoldUserStatus.LOGIN_FALIED
                                )

                                send_msg(
                                    msg=msg,
                                    level="error",
                                    message_prefix=f"{user.username} 账号登陆失败",
                                )

                        log.info("当前在线user数为 {}".format(online_user))

                if self._keep_alive:
                    time.sleep(10)
                else:
                    break

            except Exception as e:
                log.exception(e)
                time.sleep(1)

    def record_user_status(self, user_id: str, status: GoldUserStatus):
        """Emit a counter metric for the user, classified by the status value."""
        metrics.emit_counter(user_id, 1, classify=f"users_{status.value}")