示例#1
0
    def acquire(self):
        """
        Try to take the Redis lock, optionally waiting for it.

        On success ``self.locked`` is set to True. The method always returns
        None; callers inspect ``self.locked`` for the outcome.

        Waiting policy:
            - ``self.wait_timeout`` not greater than 0: one attempt, no waiting.
            - otherwise: retry until more than ``self.wait_timeout`` seconds
              have passed since the first attempt, or until
              ``self.break_wait()`` returns a truthy value. The pause between
              attempts is 5s when ``wait_timeout > 10``, else 1s.
        """
        start = time.time()
        while True:
            # SET with NX + EX creates the key and its expiry in one command,
            # so a crash right after acquiring can no longer leave a lock
            # behind without a TTL (which SETNX followed by EXPIRE allowed).
            if self.redis_conn.set(self.lock_key,
                                   time.time(),
                                   nx=True,
                                   ex=self.timeout):
                self.locked = True
                break

            # Only remove a lock that exists WITHOUT an expiry (TTL == -1),
            # e.g. one left by an older non-atomic writer. TTL == -2 means the
            # key is already gone; deleting in that case could remove a lock
            # that another client acquired in the meantime.
            if self.redis_conn.ttl(self.lock_key) == -1:
                self.redis_conn.delete(self.lock_key)

            if self.wait_timeout > 0:
                if time.time() - start > self.wait_timeout:
                    log.info("加锁失败")
                    break
            else:
                # Not waiting: give up after the first failed attempt
                break

            if self.break_wait():
                log.info("break_wait 生效 不再等待加锁")
                break

            log.debug("等待加锁: {} wait:{}".format(self, time.time() - start))
            time.sleep(5 if self.wait_timeout > 10 else 1)
        return
示例#2
0
    def run(self):
        """
        Keep the user pool topped up to ``self._min_users``.

        Each pass counts the users stored under ``self._tab_user_pool``. While
        the pool is short, one ``self.login()`` is attempted per pass and a
        truthy result is handed to ``self.add_user``. Once the pool is large
        enough the loop calls ``tools.delay_time(10)`` and checks again when
        ``self._keep_alive`` is truthy, otherwise it exits.

        Any exception is logged; an exception outside the login attempt is
        followed by ``tools.delay_time(1)`` before the next pass.
        """
        while True:
            try:
                online_count = self._redisdb.hget_count(self._tab_user_pool)
                shortage = self._min_users - online_count

                if shortage > 0:
                    log.info("当前在线user数为 {} 小于 {}, 生产user".format(
                        online_count, self._min_users))
                    try:
                        new_user = self.login()
                        if new_user:
                            self.add_user(new_user)
                    except Exception as e:
                        log.exception(e)
                    # Re-count right away; no delay while the pool is short
                    continue

                log.debug("当前user数为 {} 数量足够 暂不生产".format(online_count))
                if not self._keep_alive:
                    break
                tools.delay_time(10)

            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
示例#3
0
    def __add_item_to_db(self, items, update_items, requests, callbacks,
                         items_fingerprints):
        """
        Export buffered items, run callbacks and remove finished requests.

        Steps, in order:
            1. De-duplicate ``items`` when ``setting.ITEM_FILTER_ENABLE`` is on.
            2. Group ``items`` and ``update_items`` per table and export each
               group (insert batches first, then update batches).
            3. Run every callable in ``callbacks`` (consumed from the front;
               a failing callback is logged and does not stop the others).
            4. Remove ``requests`` from the request table.
            5. Record ``items_fingerprints`` in the de-dup store, but only when
               at least one batch was exported and EVERY batch succeeded, so
               that items of a failed batch are not filtered out later.

        ``self._is_adding_to_db`` is True while this runs and is always reset
        to False afterwards, even when an exception propagates.

        Args:
            items: items to insert
            update_items: items to export with ``is_update=True``
            requests: finished requests to delete from ``self._table_request``
            callbacks: list of zero-argument callables; emptied by this method
            items_fingerprints: fingerprints matching ``items``
        """
        self._is_adding_to_db = True
        try:
            export_results = []

            # De-duplicate
            if setting.ITEM_FILTER_ENABLE:
                items, items_fingerprints = self.__dedup_items(
                    items, items_fingerprints)

            # Group by table
            items_dict = self.__pick_items(items)
            update_items_dict = self.__pick_items(update_items,
                                                  is_update_item=True)

            # Batch insert
            while items_dict:
                tab_item, datas = items_dict.popitem()

                log.debug("""
                -------------- item 批量入库 --------------
                表名: %s
                datas: %s
                    """ % (tab_item, tools.dumps_json(datas, indent=16)))

                export_results.append(self.__export_to_db(tab_item, datas))

            # Batch update
            while update_items_dict:
                tab_item, datas = update_items_dict.popitem()
                log.debug("""
                -------------- item 批量更新 --------------
                表名: %s
                datas: %s
                    """ % (tab_item, tools.dumps_json(datas, indent=16)))

                update_keys = self._item_update_keys.get(tab_item)
                export_results.append(
                    self.__export_to_db(tab_item,
                                        datas,
                                        is_update=True,
                                        update_keys=update_keys))

            # Run callbacks
            while callbacks:
                try:
                    callback = callbacks.pop(0)
                    callback()
                except Exception as e:
                    log.exception(e)

            # Delete the requests that have been processed
            if requests:
                self._db.zrem(self._table_request, requests)

            # Persist fingerprints only when every export succeeded. Using
            # just the last batch's result would mark items of an earlier,
            # failed batch as already seen.
            export_success = bool(export_results) and all(export_results)
            if (export_success and setting.ITEM_FILTER_ENABLE
                    and items_fingerprints):
                self.__class__.dedup.add(items_fingerprints, skip_check=True)
        finally:
            # Never leave the flag stuck on True if something above raised
            self._is_adding_to_db = False
示例#4
0
    def start_monitor_task(self):
        """
        @summary: Distribute the task(s) to run and set the default batch date
        ---------
        When no parser is registered, this object registers itself as the only
        parser. With at most one parser ``self._is_more_parsers`` is cleared.
        The tasks are ``[self._task]`` when ``self._task`` is truthy, otherwise
        whatever ``self.get_todo_task_from_mysql()`` returns.
        ---------
        @result: None
        @raise Exception: when no task could be obtained
        """
        # Not in multi-template mode: inject ourselves as the single template
        if not self._parsers:
            self._parsers.append(self)
        if len(self._parsers) <= 1:
            self._is_more_parsers = False

        tasks = [self._task] if self._task else self.get_todo_task_from_mysql()
        if not tasks:
            raise Exception("未获取到任务 请检查 task_id: {} 是否存在".format(
                self._task_id))
        self.distribute_task(tasks)

        # Only sets the value when "batch_date" is not already in the environment
        os.environ.setdefault("batch_date", "1970-00-00")
        log.debug("下发任务完毕")
示例#5
0
    def __add_request_to_db(self):
        """
        Drain the pending-request deque into the request table, run queued
        callbacks, then delete the requests queued for removal.

        ``self._is_adding_to_db`` is set to True as soon as one entry has been
        taken from ``self._requests_deque`` and is reset to False at the end.
        """
        request_list = []
        prioritys = []
        callbacks = []

        while self._requests_deque:
            request = self._requests_deque.popleft()
            self._is_adding_to_db = True

            if callable(request):
                # A callable was queued: it is executed after the requests
                # have been stored.
                # Note: mind closures. A closure can be written as
                # def test(xxx = xxx):
                #     # TODO business logic using xxx
                # so that xxx is not the last value of the loop.
                callbacks.append(request)
                continue

            priority = request.priority

            # Skip the request when de-duplication applies and dedup.add()
            # returns a falsy value (presumably: fingerprint already stored)
            if (request.filter_repeat and setting.REQUEST_FILTER_ENABLE
                    and not self.__class__.dedup.add(request.fingerprint)):
                log.debug("request已存在  url = %s" % request.url)
                continue
            else:
                request_list.append(str(request.to_dict))
                prioritys.append(priority)

            # Flush a full batch and start a new one
            if len(request_list) > MAX_URL_COUNT:
                self._db.zadd(self._table_request, request_list, prioritys)
                request_list = []
                prioritys = []

        # Store the remaining (partial) batch
        if request_list:
            self._db.zadd(self._table_request, request_list, prioritys)

        # Run callbacks; a failing callback is logged and the rest still run
        for callback in callbacks:
            try:
                callback()
            except Exception as e:
                log.exception(e)

        # Delete finished requests
        if self._del_requests_deque:
            request_done_list = []
            while self._del_requests_deque:
                request_done_list.append(self._del_requests_deque.popleft())

            # Exclude entries of request_list, otherwise a request that was
            # just added could be deleted. At this point request_list only
            # holds the last batch, since it is reset after each full flush.
            request_done_list = list(
                set(request_done_list) - set(request_list))

            if request_done_list:
                self._db.zrem(self._table_request, request_done_list)

        self._is_adding_to_db = False
示例#6
0
    def add_user(self, user: NormalUser):
        """
        Put a user into the Redis pool and flag it as logged in in MySQL.

        The user is stored in the ``self._tab_user_pool`` hash under its
        ``user_id``; afterwards the ``self._login_state_key`` column of
        ``self._table_userbase`` is set to 1 for the row with that id.

        Args:
            user: the user to store
        """
        log.debug("add {}".format(user))
        self._redisdb.hset(self._tab_user_pool, user.user_id, user.to_dict())

        sql_template = "update {table_userbase} set {login_state_key} = 1 where id = {user_id}"
        sql_params = {
            "table_userbase": self._table_userbase,
            "login_state_key": self._login_state_key,
            # Not referenced by the template; passed to format() nonetheless
            "username_key": self._username_key,
            "user_id": user.user_id,
        }
        self._mysqldb.update(sql_template.format(**sql_params))
示例#7
0
    def delete_tables(self, delete_tables_list):
        """
        Clear the Redis keys matching the given pattern(s).

        Args:
            delete_tables_list: one of
                - a bool (either value): every key under ``self._redis_key``
                - a list/tuple of key patterns
                - a single key pattern
              Patterns that do not start with ``self._redis_key`` get it
              prepended.
        """
        if isinstance(delete_tables_list, bool):
            patterns = [self._redis_key + "*"]
        elif isinstance(delete_tables_list, (list, tuple)):
            patterns = delete_tables_list
        else:
            patterns = [delete_tables_list]

        for pattern in patterns:
            if pattern.startswith(self._redis_key):
                full_pattern = pattern
            else:
                full_pattern = self._redis_key + pattern

            for key in self._redisdb.getkeys(full_pattern):
                log.debug("正在删除key %s" % key)
                self._redisdb.clear(key)
示例#8
0
    def send(
        self,
        receivers: list,
        title: str,
        content: str,
        content_type: str = "plain",
        filepath: str = None,
    ):
        """
        Send an e-mail, optionally with one attachment, to every receiver.

        One message is built and then sent separately to each address in
        ``receivers``; the "To" header always shows the first receiver.

        Args:
            receivers: recipient addresses; must contain at least one entry
            title: subject line
            content: message body
            content_type: html / plain
            filepath: path of a file to attach; no attachment when falsy

        Returns:
            True once all messages have been handed to the SMTP client
        """
        # Multipart container so that an attachment can be added
        message = MIMEMultipart()
        message["From"] = formataddr(
            (self.sender, self.username))  # (display name, sender address)
        message["To"] = formataddr(
            (receivers[0], receivers[0]))  # ",".join(receivers)

        message["Subject"] = Header(title, "utf-8")

        content = MIMEText(content, content_type, "utf-8")
        message.attach(content)

        # Build the attachment
        if filepath:
            # Read through a context manager so the file handle is closed
            # instead of being left to the garbage collector
            with open(filepath, "rb") as attachment_file:
                attachment_bytes = attachment_file.read()

            attach = MIMEText(attachment_bytes, "base64", "utf-8")
            attach.add_header(
                "content-disposition",
                "attachment",
                filename=("utf-8", "", os.path.basename(filepath)),
            )
            message.attach(attach)

        msg = message.as_string()
        # Sending to several addresses in one call was problematic, so the
        # message is sent to the receivers one by one
        for receiver in receivers:
            log.debug("发送邮件到 {}".format(receiver))
            self.smtp_client.sendmail(self.username, receiver, msg)
        log.debug("邮件发送成功!!!")
        return True
示例#9
0
    def __init__(
        self, ip=None, port=None, db=None, user_name=None, user_pass=None, **kwargs
    ):
        """
        Create a pooled MySQL connection.

        Every falsy argument is replaced by the matching ``setting.MYSQL_*``
        value. A failure to build the pool is logged, not raised; in that case
        ``self.connect_pool`` is not assigned.

        Args:
            ip: MySQL host
            port: MySQL port
            db: database name
            user_name: login user
            user_pass: login password
            **kwargs: accepted but not used by this constructor
        """
        # setting values may be changed at runtime, so they cannot be used as
        # parameter defaults; they are looked up here instead
        if not ip:
            ip = setting.MYSQL_IP
        if not port:
            port = setting.MYSQL_PORT
        if not db:
            db = setting.MYSQL_DB
        if not user_name:
            user_name = setting.MYSQL_USER_NAME
        if not user_pass:
            user_pass = setting.MYSQL_USER_PASS

        try:

            self.connect_pool = PooledDB(
                creator=pymysql,
                mincached=1,
                maxcached=100,
                maxconnections=100,
                blocking=True,
                ping=7,
                host=ip,
                port=port,
                user=user_name,
                passwd=user_pass,
                db=db,
                charset="utf8mb4",
                cursorclass=cursors.SSCursor,
            )  # server-side cursor: per the original note, the default cursor lets memory grow on large multi-threaded inserts

        except Exception as e:
            # Never write the real password to the log; only show whether one
            # was supplied
            masked_pass = "******" if user_pass else user_pass
            log.error(
                """
            连接数据失败:
            ip: {}
            port: {}
            db: {}
            user_name: {}
            user_pass: {}
            exception: {}
            """.format(
                    ip, port, db, user_name, masked_pass, e
                )
            )
        else:
            log.debug("连接到mysql数据库 %s : %s" % (ip, db))
示例#10
0
    def delete_tables(self, delete_tables_list):
        """
        Clear the Redis keys matching the given pattern(s).

        Args:
            delete_tables_list: one of
                - a bool (either value): every key under ``self._redis_key``
                - a list/tuple of key patterns
                - a single key pattern
              The pattern ``"*"`` is expanded to ``self._redis_key + "*"``;
              any other pattern is used as given.
        """
        if isinstance(delete_tables_list, bool):
            patterns = [self._redis_key + "*"]
        elif isinstance(delete_tables_list, (list, tuple)):
            patterns = delete_tables_list
        else:
            patterns = [delete_tables_list]

        redisdb = RedisDB()
        for pattern in patterns:
            # A bare wildcard is limited to this spider's own keys
            key_pattern = self._redis_key + "*" if pattern == "*" else pattern

            for key in redisdb.getkeys(key_pattern):
                log.debug("正在清理表 %s" % key)
                redisdb.clear(key)
示例#11
0
    def run(self):
        """
        Log in the users returned by ``self._load_user()`` and add them to the pool.

        Each pass tries to take a RedisLock on the pool key without waiting;
        the work is only done when the lock was obtained. For every user,
        ``self.login`` is attempted up to ``self._login_retry_times + 1``
        times:
            - a truthy result is stored with ``self.add_user``;
            - a falsy result passes the ORIGINAL user to
              ``self.handle_login_failed_user``;
            - an exception is passed to ``self.handel_exception`` and the
              login is retried; when all attempts raised, the user is passed
              to ``self.handle_login_failed_user``;
            - ``NotImplementedError`` terminates the process via ``os._exit(0)``.

        After a pass the loop calls ``tools.delay_time(10)`` and repeats when
        ``self._keep_alive`` is truthy, otherwise it exits.
        """
        while True:
            try:
                try:
                    with RedisLock(
                        key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0
                    ) as _lock:
                        if _lock.locked:
                            for user in self._load_user():
                                retry_times = 0
                                while retry_times <= self._login_retry_times:
                                    try:
                                        # Keep the login result in its own
                                        # name: on a falsy result the failure
                                        # handler must still receive the
                                        # original user, not None
                                        login_user = self.login(user)
                                        if login_user:
                                            self.add_user(login_user)
                                        else:
                                            self.handle_login_failed_user(user)
                                        break
                                    except NotImplementedError:
                                        log.error(
                                            f"{self.__class__.__name__} must be implementation login method!"
                                        )
                                        os._exit(0)
                                    except Exception as e:
                                        self.handel_exception(e)
                                    log.debug(
                                        f"login failed, user: {user} retry_times: {retry_times}"
                                    )
                                    retry_times += 1
                                else:
                                    # Retries exhausted without leaving the loop
                                    self.handle_login_failed_user(user)

                            now_user_count = self._redisdb.hget_count(
                                self._tab_user_pool
                            )
                            log.info("当前在线user数为 {}".format(now_user_count))

                except Exception as e:
                    log.exception(e)

                if self._keep_alive:
                    tools.delay_time(10)
                else:
                    break

            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
示例#12
0
    def run(self):
        """
        Fetch requests from the collector and process them until stopped.

        ``self._thread_stop`` is reset to False on entry and checked before
        every pass. When no request is available, a "waiting" message is
        logged once (tracked by ``self.is_show_tip``) and the collector is
        asked again. Exceptions are logged and the loop continues.
        """
        self._thread_stop = False
        while not self._thread_stop:
            try:
                request = self._collector.get_request()
                if request:
                    self.is_show_tip = False
                    self.deal_request(request)
                elif not self.is_show_tip:
                    # Log the idle message only once per idle period
                    log.debug("等待任务...")
                    self.is_show_tip = True

            except Exception as e:
                log.exception(e)
示例#13
0
    def set_unique_key(self, table, key):
        """
        Add a unique index on ``key`` to ``table``.

        ``table`` and ``key`` are interpolated into the statement as-is.

        Args:
            table: table name
            key: column expression placed inside ``unique (...)``

        Returns:
            True when the statement was executed and committed, False when
            anything failed (the error is logged).
        """
        # Pre-bind so the finally block is safe when get_connection() itself
        # fails; otherwise an UnboundLocalError would replace the False result
        conn = cursor = None
        try:
            sql = "alter table %s add unique (%s)" % (table, key)

            conn, cursor = self.get_connection()
            cursor.execute(sql)
            conn.commit()

        except Exception as e:
            log.error(table + " " + str(e) + " key = " + key)
            return False
        else:
            log.debug("%s表创建唯一索引成功 索引为 %s" % (table, key))
            return True
        finally:
            # Only release a connection that was actually obtained
            if conn is not None:
                self.close_connection(conn, cursor)
示例#14
0
    def run(self):
        """
        Distribute the tasks, start the parser controls and wait for completion.

        ``self._thread_count`` AirSpiderParserControl instances are created on
        ``self._memory_db``, given this object as parser and started. The
        method then polls ``self.all_thread_is_done()`` (no delay is added
        between polls here) and stops every control once it returns truthy.
        """
        self.distribute_task()

        for _ in range(self._thread_count):
            control = AirSpiderParserControl(self._memory_db)
            control.add_parser(self)
            control.start()
            self._parser_controls.append(control)

        # Block until all work is reported as finished
        while not self.all_thread_is_done():
            pass

        # Stop the parser controls
        for control in self._parser_controls:
            control.stop()

        log.debug("无任务,爬虫结束")
    def parse_play_time(self, request, response):
        """
        Extract the purchase links and yield one follow-up request per link.

        The links are the ``href`` values of ``<a>`` elements whose
        ``data-val`` attribute matches the movie id and cinema id carried by
        ``request``. Each yielded request is a copy of ``request`` pointing
        to the link, with ``self.parse_seats`` as callback and priority 1.
        """
        xpath_query = (
            '//a[@data-val="{{movie_id: {movie_id}, cinema_id:{cinema_id}}}"]/@href'
        ).format(movie_id=request.movie_id, cinema_id=request.cinema_id)

        for pay_url in response.xpath(xpath_query).extract():
            log.debug("解析到购买地址 {}".format(pay_url))
            follow_up = request.copy()
            follow_up.url = pay_url
            follow_up.callback = self.parse_seats
            follow_up.priority = 1
            yield follow_up
示例#16
0
    def run(self):
        """
        Take requests from the in-memory queue and process them until stopped.

        When the queue yields nothing, a "waiting" message is logged once
        (tracked by ``self.is_show_tip``) and the loop sleeps 1s. After an
        exception, which is logged, the loop sleeps 3s.
        """
        while not self._thread_stop:
            try:
                request = self._memory_db.get()
                if request:
                    self.is_show_tip = False
                    self.deal_request(request)
                    continue

                # Nothing to do: log once, then back off
                if not self.is_show_tip:
                    log.debug("等待任务...")
                    self.is_show_tip = True
                time.sleep(1)

            except Exception as e:
                log.exception(e)
                time.sleep(3)
示例#17
0
    def reput_failed_requests_to_requests(self):
        """
        Move all failed requests back into the request buffer.

        Batches are read with ``self.get_failed_requests()`` until an empty
        batch is returned. Every request gets ``retry_times`` reset to 0, is
        rebuilt with ``Request.from_dict`` and put into
        ``self._request_buffer``, which is flushed at the end.
        """
        log.debug("正在重置失败的requests...")
        total_count = 0

        batch = self.get_failed_requests()
        while batch:
            for failed in batch:
                failed["retry_times"] = 0
                self._request_buffer.put_request(Request.from_dict(failed))
                total_count += 1
            batch = self.get_failed_requests()

        self._request_buffer.flush()

        log.debug("重置%s条失败requests为待抓取requests" % total_count)
示例#18
0
    def run(self):
        """
        Take requests from the in-memory queue and process them until stopped.

        Each entry obtained is passed to ``self.deal_requests`` wrapped in a
        one-element list. When the queue yields nothing, a "waiting" message
        is logged once (tracked by ``self.is_show_tip``), the loop sleeps 1s
        and ``self._wait_task_time`` is increased by 1. Exceptions are logged
        and the loop continues.
        """
        while not self._thread_stop:
            try:
                entry = self._memory_db.get()
                if entry:
                    self.is_show_tip = False
                    self.deal_requests([entry])
                    continue

                # Nothing to do: log once, then back off and count the wait
                if not self.is_show_tip:
                    log.debug("parser 等待任务 ...")
                    self.is_show_tip = True
                time.sleep(1)
                self._wait_task_time += 1

            except Exception as e:
                log.exception(e)
示例#19
0
    def acquire(self):
        """
        Try to take the Redis lock, optionally waiting for it.

        The lock key is written with ``SET ... NX EX 5``. On success
        ``self.locked`` is set to True. The method always returns None.

        When ``self.wait_timeout`` is not greater than 0 only one attempt is
        made. Otherwise attempts repeat until more than ``wait_timeout``
        seconds have passed since the first one, pausing 5s between attempts
        when ``wait_timeout > 10`` and 1s otherwise.
        """
        began_at = time.time()
        while True:
            # Attempt to take the lock
            acquired = self.redis_conn.set(self.lock_key,
                                           time.time(),
                                           nx=True,
                                           ex=5)
            if acquired:
                self.locked = True
                return

            if not self.wait_timeout > 0:
                # Not waiting
                return

            if time.time() - began_at > self.wait_timeout:
                log.info("加锁失败")
                return

            log.debug("等待加锁: {} wait:{}".format(self,
                                                time.time() - began_at))
            pause = 5 if self.wait_timeout > 10 else 1
            time.sleep(pause)
示例#20
0
    def run(self):
        """
        Fetch batches of requests from the collector and process them until stopped.

        Up to ``setting.SPIDER_TASK_COUNT`` requests are asked for per pass and
        handed to ``self.deal_requests``. When nothing is returned, a
        "waiting" message is logged once (tracked by ``self.is_show_tip``),
        the loop sleeps 1s and ``self._wait_task_time`` is increased by 1.
        Exceptions are logged and the loop continues.
        """
        while not self._thread_stop:
            try:
                batch = self._collector.get_requests(setting.SPIDER_TASK_COUNT)
                if batch:
                    self.is_show_tip = False
                    self.deal_requests(batch)
                    continue

                # Nothing to do: log once, then back off and count the wait
                if not self.is_show_tip:
                    log.debug("parser 等待任务 ...")
                    self.is_show_tip = True
                time.sleep(1)
                self._wait_task_time += 1

            except Exception as e:
                log.exception(e)
示例#21
0
    def update_task_state(self, task_id, state=1, **kwargs):
        """
        @summary: Update the state of a task in the task table. Must be called
        explicitly by the business code when a task is finished. May be overridden.
        Usage: yield lambda : self.update_task_state(task_id, state)
        ---------
        @param task_id: id of the row to update
        @param state: value written to the ``self._task_state`` column
        @param kwargs: further column/value pairs written by the same statement
        ---------
        @result: None; the outcome is only logged
        """
        # Same insertion order as assigning "id" first and the state second
        kwargs.update({"id": task_id, self._task_state: state})

        sql = tools.make_update_sql(self._task_table,
                                    kwargs,
                                    condition="id = {}".format(task_id))

        if not self._mysqldb.update(sql):
            log.error("置任务%s状态失败  sql=%s" % (task_id, sql))
            return

        log.debug("置任务%s状态成功" % task_id)
示例#22
0
    def __init__(self,
                 ip_ports=None,
                 db=None,
                 user_pass=None,
                 url=None,
                 decode_responses=True,
                 service_name=None,
                 max_connections=32,
                 **kwargs):
        """
        Wrapper around a redis client.

        The client is stored in ``self._redis``. Which client is built
        depends on the arguments:
            - ``url`` truthy: ``redis.StrictRedis.from_url``
            - several nodes and a service name: sentinel, master connection
            - several nodes, no service name: ``StrictRedisCluster``
            - one node: ``redis.StrictRedis``

        Args:
            ip_ports: ip:port; several nodes may be given as a list or as a
                comma separated string, e.g. ip1:port1,ip2:port2 or
                ["ip1:port1", "ip2:port2"]. Defaults to
                setting.REDISDB_IP_PORTS when None.
            db: passed as ``db=`` to the client (not passed in cluster mode).
                Defaults to setting.REDISDB_DB when None.
            user_pass: password. Defaults to setting.REDISDB_USER_PASS when None.
            url: connection URL; when truthy ``ip_ports`` is not parsed.
            decode_responses: forwarded to the client.
            service_name: for redis sentinel mode. Defaults to
                setting.REDISDB_SERVICE_NAME when None.
            max_connections: forwarded to the client (not used with ``url``).
            **kwargs: forwarded to the client, and also to ``Sentinel`` in
                sentinel mode (not used with ``url``).
        """

        # setting values may be changed at runtime, so they cannot be used as
        # parameter defaults; they are looked up here instead
        if ip_ports is None:
            ip_ports = setting.REDISDB_IP_PORTS
        if db is None:
            db = setting.REDISDB_DB
        if user_pass is None:
            user_pass = setting.REDISDB_USER_PASS
        if service_name is None:
            service_name = setting.REDISDB_SERVICE_NAME

        self._is_redis_cluster = False

        try:
            if not url:
                # Normalise to a list of "ip:port" strings
                ip_ports = (ip_ports if isinstance(ip_ports, list) else
                            ip_ports.split(","))
                if len(ip_ports) > 1:
                    # Note: the port stays a string after the split
                    startup_nodes = []
                    for ip_port in ip_ports:
                        ip, port = ip_port.split(":")
                        startup_nodes.append({"host": ip, "port": port})

                    if service_name:
                        log.debug("使用redis哨兵模式")
                        hosts = [(node["host"], node["port"])
                                 for node in startup_nodes]
                        sentinel = Sentinel(hosts, socket_timeout=3, **kwargs)
                        self._redis = sentinel.master_for(
                            service_name,
                            password=user_pass,
                            db=db,
                            redis_class=redis.StrictRedis,
                            decode_responses=decode_responses,
                            max_connections=max_connections,
                            **kwargs)

                    else:
                        log.debug("使用redis集群模式")
                        self._redis = StrictRedisCluster(
                            startup_nodes=startup_nodes,
                            decode_responses=decode_responses,
                            password=user_pass,
                            max_connections=max_connections,
                            **kwargs)

                    # Set for both multi-node branches (sentinel and cluster)
                    self._is_redis_cluster = True
                else:
                    ip, port = ip_ports[0].split(":")
                    self._redis = redis.StrictRedis(
                        host=ip,
                        port=port,
                        db=db,
                        password=user_pass,
                        decode_responses=decode_responses,
                        max_connections=max_connections,
                        **kwargs)
            else:
                self._redis = redis.StrictRedis.from_url(
                    url, decode_responses=decode_responses)

        except Exception as e:
            # Re-raised unchanged; the handler adds no processing of its own
            raise
        else:
            if not url:
                log.debug("连接到redis数据库 %s db%s" % (ip_ports, db))
            else:
                log.debug("连接到redis数据库 %s" % (url))

        # Keep the effective connection parameters on the instance
        self._ip_ports = ip_ports
        self._db = db
        self._user_pass = user_pass
        self._url = url
示例#23
0
    def get_response(self, save_cached=False):
        """
        Send the request and return a response with selector features.

        Defaults are filled into ``self.requests_kwargs`` (timeout, stream,
        verify, optionally a User-Agent and proxies) before the request is
        sent, so this method modifies ``self.requests_kwargs``.

        @param save_cached: store the response in the cache, so that debugging
            does not need to download it again each time
        @return: the downloaded response wrapped in ``Response``
        """
        # Default timeout
        self.requests_kwargs.setdefault("timeout", 22)  # connect=22 read=22

        # Default stream=True.
        # By default the response body is downloaded as soon as the request is
        # made. The stream parameter defers the download until
        # Response.content is accessed; until then only the headers have been
        # downloaded. Drawback: with stream=True, Requests cannot release the
        # connection back to the pool until all data has been consumed or
        # Response.close is called, which can make connection use inefficient.
        self.requests_kwargs.setdefault(
            "stream", True
        )  # see the note above for the trade-off

        # Disable certificate verification by default
        self.requests_kwargs.setdefault("verify", False)

        # Request method: explicit "method" attribute, else POST when a
        # "data" argument is present, else GET
        method = self.__dict__.get("method")
        if not method:
            if "data" in self.requests_kwargs:
                method = "POST"
            else:
                method = "GET"

        # Random user-agent, only when the caller supplied none
        headers = self.requests_kwargs.get("headers", {})
        if "user-agent" not in headers and "User-Agent" not in headers:
            if self.random_user_agent and setting.RANDOM_HEADERS:
                headers.update(
                    {"User-Agent": self.__class__.user_agent_pool.get()})
                self.requests_kwargs.update(headers=headers)
        else:
            # NOTE(review): this branch is reached only when a user-agent key
            # was found in headers, in which case "headers" is normally
            # already present in requests_kwargs and setdefault has no effect
            # - confirm whether the default was meant for the case above
            self.requests_kwargs.setdefault(
                "headers",
                {
                    "User-Agent":
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
                },
            )

        # Proxies: -1 marks "no proxies argument given by the caller"
        proxies = self.requests_kwargs.get("proxies", -1)
        if proxies == -1 and setting.PROXY_ENABLE and self.__class__.proxies_pool:
            # Asks the pool again, without a delay, until it returns a proxy
            while True:
                proxies = self.__class__.proxies_pool.get()
                if proxies:
                    self.requests_kwargs.update(proxies=proxies)
                    break
                else:
                    log.debug("暂无可用代理 ...")

        log.debug("""
                -------------- %srequest for ----------------
                url  = %s
                method = %s
                body = %s
                """ % (
            "" if not self.parser_name else "%s.%s " % (
                self.parser_name,
                (self.callback and callable(self.callback) and getattr(
                    self.callback, "__name__") or self.callback) or "parser",
            ),
            self.url,
            method,
            self.requests_kwargs,
        ))

        # def hooks(response, *args, **kwargs):
        #     print(response.url)
        #
        # self.requests_kwargs.update(hooks={'response': hooks})

        use_session = (setting.USE_SESSION if self.use_session is None else
                       self.use_session)  # self.use_session takes precedence

        if use_session:
            response = self._session.request(method, self.url,
                                             **self.requests_kwargs)
        else:
            response = requests.request(method, self.url,
                                        **self.requests_kwargs)

        response = Response(response)
        if save_cached:
            self.save_cached(response,
                             expire_time=self.__class__.cached_expire_time)

        return response
示例#24
0
 def add_user(self, user: GuestUser):
     """Store the user in the Redis pool hash under its ``user_id``."""
     log.debug("add {}".format(user))
     pool_key = self._tab_user_pool
     self._redisdb.hset(pool_key, user.user_id, user.to_dict())
示例#25
0
    def get_user(
        self,
        block=True,
        username=None,
        used_for_spider_name=None,
        not_limit_use_interval=False,
    ) -> Optional[GoldUser]:
        """
        Pick a usable user from the pool.

        @params block: whether to wait when no user is available. When False
            and no user data can be found, None is returned.
        @params username: fetch this specific user
        @params used_for_spider_name: exclusive use; name of the spider that
            owns the user. Other spiders cannot take it over while the
            exclusive time has not elapsed.
        @params not_limit_use_interval: do not enforce the minimum interval
            between two uses of the same user
        @return: GoldUser, or None in non-blocking mode when no user is found
            or the user is outside its working time
        """
        while True:
            try:
                user_id = username or self._get_user_id()
                user_str = None
                if user_id:
                    user_str = self._redisdb.hget(self._tab_user_pool, user_id)

                if (not user_id or not user_str) and block:
                    self._keep_alive = False
                    self.run(username)
                    continue

                # Non-blocking and nothing found: report "no user" instead of
                # failing on the missing data and retrying forever
                if not user_str:
                    return None

                # Got a user.
                # NOTE(review): eval() executes whatever is stored in Redis;
                # confirm the pool is only written by trusted code, or switch
                # writer and reader to a safe serialisation format together.
                user = GoldUser(**eval(user_str))

                # Exclusive use: for another spider, check whether the time
                # since the last use exceeds the exclusive time; if it does
                # the user may be used
                if (user.get_used_for_spider_name()
                        and user.get_used_for_spider_name() !=
                        used_for_spider_name):
                    wait_time = time.time() - user.get_last_use_time()
                    if wait_time < user.exclusive_time:
                        log.info("用户{} 被 {} 爬虫独占,需等待 {} 秒后才可使用".format(
                            user.username,
                            user.get_used_for_spider_name(),
                            user.exclusive_time - wait_time,
                        ))
                        time.sleep(1)
                        continue

                if not user.is_overwork() and user.is_at_work_time():
                    if not user.cookies:
                        log.debug(f"用户 {user.username} 未登录,尝试登录")
                        self._keep_alive = False
                        self.run(username)
                        continue

                    if not_limit_use_interval or user.is_time_to_use():
                        user.set_used_for_spider_name(used_for_spider_name)
                        log.debug("使用用户 {}".format(user.username))
                        self.record_user_status(user.user_id,
                                                GoldUserStatus.USED)
                        return user
                    else:
                        log.debug("{} 用户使用间隔过短 查看下一个用户".format(user.username))
                        time.sleep(1)
                        continue
                else:
                    if not user.is_at_work_time():
                        log.info("用户 {} 不在工作时间 sleep 60s".format(
                            user.username))
                        if block:
                            time.sleep(60)
                            continue
                        else:
                            return None

            except Exception as e:
                log.exception(e)
                time.sleep(1)
示例#26
0
# -*- coding: utf-8 -*-
"""
Created on 2021/6/18 10:36 AM
---------
@summary: Minimal script that emits one debug record through the feapder logger
---------
@author: Boris
@email: [email protected]
"""

from feapder.utils.log import log

# Log the integer 1 at debug level
log.debug(1)
示例#27
0
    def get_response(self, save_cached=False):
        """
        Send this request and return a selector-capable response.

        Defaults are filled into ``self.requests_kwargs`` (timeout, stream,
        verify, User-Agent, proxies) and the request is then downloaded through
        one of three paths: a pooled webdriver when ``self.render`` is truthy,
        ``self._session`` when sessions are enabled, or ``requests.request``.

        @param save_cached: when truthy, pass the response to ``self.save_cached`` so debugging runs need not download again
        @return: the wrapped response; on the render path the browser is attached as ``response.browser``
        @raise: any exception raised while driving the browser is re-raised after the browser has been removed from the webdriver pool
        """
        # Default timeout comes from the project settings
        self.requests_kwargs.setdefault("timeout", setting.REQUEST_TIMEOUT)

        # stream: by default the response body is downloaded as soon as the
        # request is made. stream=True defers the body download until
        # Response.content is accessed, so only the headers are fetched up
        # front. Drawback: Requests cannot release the connection back to the
        # pool until all data is consumed or Response.close is called, which
        # can make connection reuse inefficient.
        self.requests_kwargs.setdefault("stream", True)

        # Certificate verification is off unless the caller enabled it
        self.requests_kwargs.setdefault("verify", False)

        # Request method: an explicit attribute wins, otherwise a body implies POST
        method = self.__dict__.get("method")
        if not method:
            if "data" in self.requests_kwargs or "json" in self.requests_kwargs:
                method = "POST"
            else:
                method = "GET"

        # Random User-Agent
        headers = self.requests_kwargs.get("headers", {})
        if "user-agent" not in headers and "User-Agent" not in headers:
            if self.render:  # when rendering, prefer the UA configured in WEBDRIVER
                ua = setting.WEBDRIVER.get(
                    "user_agent") or self.__class__.user_agent_pool.get(
                        setting.USER_AGENT_TYPE)
            else:
                ua = self.__class__.user_agent_pool.get(
                    setting.USER_AGENT_TYPE)

            if self.random_user_agent and setting.RANDOM_HEADERS:
                headers.update({"User-Agent": ua})
                self.requests_kwargs.update(headers=headers)
        else:
            self.requests_kwargs.setdefault(
                "headers", {"User-Agent": setting.DEFAULT_USERAGENT})

        # Proxy: -1 is the sentinel for "caller did not pass proxies"
        proxies = self.requests_kwargs.get("proxies", -1)
        if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
            # Keeps asking the pool until it hands out a proxy
            while True:
                proxies = self._proxies_pool.get()
                if proxies:
                    self.requests_kwargs.update(proxies=proxies)
                    break
                else:
                    log.debug("暂无可用代理 ...")

        log.debug("""
                -------------- %srequest for ----------------
                url  = %s
                method = %s
                body = %s
                """ % (
            "" if not self.parser_name else "%s.%s " % (
                self.parser_name,
                (self.callback and callable(self.callback) and getattr(
                    self.callback, "__name__") or self.callback) or "parse",
            ),
            self.url,
            method,
            self.requests_kwargs,
        ))

        # self.use_session takes priority over the global setting
        use_session = (setting.USE_SESSION
                       if self.use_session is None else self.use_session)

        if self.render:
            # Reuse the request's user agent, cookies and proxy in the browser
            user_agent = headers.get("User-Agent") or headers.get("user-agent")
            cookies = self.requests_kwargs.get("cookies")
            if cookies and isinstance(cookies, RequestsCookieJar):
                cookies = cookies.get_dict()

            if not cookies:
                cookie_str = headers.get("Cookie") or headers.get("cookie")
                if cookie_str:
                    cookies = tools.get_cookies_from_str(cookie_str)

            proxy = None
            if proxies and proxies != -1:

                def without_scheme(address):
                    # str.strip("http://") treats its argument as a SET of
                    # characters and trims them from both ends, which mangles
                    # hosts that start or end with h, t, p, ':' or '/'
                    # (e.g. "proxy.corp"). Remove only a leading scheme, and
                    # keep trimming a trailing "/" as the old code did.
                    for scheme in ("http://", "https://"):
                        if address.startswith(scheme):
                            address = address[len(scheme):]
                            break
                    return address.rstrip("/")

                proxy = without_scheme(proxies.get(
                    "http", "")) or without_scheme(proxies.get("https", ""))

            browser = self._webdriver_pool.get(user_agent=user_agent,
                                               proxy=proxy)

            url = self.url
            if self.requests_kwargs.get("params"):
                url = tools.joint_url(self.url,
                                      self.requests_kwargs.get("params"))

            try:
                browser.get(url)
                # Cookies are assigned after the page has been opened
                if cookies:
                    browser.cookies = cookies
                if self.render_time:
                    tools.delay_time(self.render_time)

                html = browser.page_source
                response = Response.from_dict({
                    "url": browser.current_url,
                    "cookies": browser.cookies,
                    "_content": html.encode(),
                    # status_code and elapsed are fixed placeholder values
                    "status_code": 200,
                    "elapsed": 666,
                    "headers": {
                        "User-Agent":
                        browser.execute_script("return navigator.userAgent"),
                        "Cookie":
                        tools.cookies2str(browser.cookies),
                    },
                })

                response.browser = browser
            except Exception:
                # Drop the failed browser from the pool, then propagate with
                # the original traceback intact
                self._webdriver_pool.remove(browser)
                raise

        elif use_session:
            response = self._session.request(method, self.url,
                                             **self.requests_kwargs)
            response = Response(response)
        else:
            response = requests.request(method, self.url,
                                        **self.requests_kwargs)
            response = Response(response)

        if save_cached:
            self.save_cached(response,
                             expire_time=self.__class__.cached_expire_time)

        return response
示例#28
0
def run():
    """
    Loop forever, unblocking proxy IPs by solving their slider captcha.

    Each round reads the blocked IPs from redis, pops the task stored for each
    IP, opens the task url in a WebDriver that uses the IP as proxy and the
    task's user agent, and makes up to 20 attempts at dragging the slider to
    the detected gap. A round ends with a 3 second delay; any exception is
    logged and the next round starts.

    @return: None. The function only returns when the slider images fail to
             load (after logging an error); otherwise it never returns.
    """
    while True:
        redisdb = RedisDB()
        try:
            block_ip = redisdb.sget(setting.CAPTCHA_BLOCK_IP_REDIS_KEY)
            if not block_ip:
                log.debug("暂无被封ip")
            # `or ()` keeps a None result from raising and skipping the delay
            for ip in block_ip or ():
                task = redisdb.hget(setting.CAPTCHA_REDIS_KEY, ip, is_pop=True)
                if not task:
                    # Nothing stored for this ip; skip it so that the
                    # remaining ips are still processed this round.
                    continue
                # SECURITY NOTE(review): eval() executes whatever text is
                # stored in redis. Left unchanged because the stored format is
                # not visible here; consider ast.literal_eval or json.loads if
                # the payload is a plain literal.
                task = eval(task)
                ua = task.get("ua")
                url = task.get("url")

                with WebDriver(proxy=ip, user_agent=ua) as browser:
                    log.info("解封ip {}, url {}".format(ip, url))
                    browser.get(url)
                    browser.implicitly_wait(5)
                    frame = browser.find_element_by_id("tcaptcha_iframe")
                    browser.switch_to.frame(frame)
                    for _attempt in range(20):
                        # Poll until both image urls are populated
                        for _poll in range(1000):
                            bg_url = browser.find_element_by_id(
                                "slideBg").get_attribute("src")
                            slide_url = browser.find_element_by_id(
                                "slideBlock").get_attribute("src")
                            if bg_url and slide_url:
                                break
                        else:
                            # Polling exhausted without a break
                            log.error("滑块加载失败")
                            return

                        bg_image = os.path.join(
                            CAPTCHA_PATH,
                            "bg_" + tools.get_md5(bg_url) + ".png")
                        slide_image = os.path.join(
                            CAPTCHA_PATH,
                            "slider_" + tools.get_md5(slide_url) + ".png")
                        if tools.download_file(
                                bg_url, bg_image) and tools.download_file(
                                    slide_url, slide_image):
                            # Locate the gap
                            x, _ = get_gap_center_point(bg_image,
                                                        slide_image,
                                                        show=False)
                            # Scale to the displayed size, then apply offsets
                            x = x * 340 / 680
                            x = x - 27.5 - 30
                            # Drag the slider
                            slide_btn = browser.find_element_by_id(
                                "tcaptcha_drag_thumb")
                            tracks = track.get_tracks(x)
                            drag_and_drop(browser, slide_btn, tracks)
                            # Delete the downloaded images
                            os.remove(bg_image)
                            os.remove(slide_image)

                            tools.delay_time(2)
                            if "verify.maoyan.com" not in browser.current_url:
                                log.info("解封成功")
                                break
                            else:
                                try:
                                    browser.find_element_by_css_selector(
                                        ".tc-action-icon").click()
                                except Exception as e:
                                    # Best effort click: a failure here must
                                    # not end the attempt loop, but
                                    # KeyboardInterrupt/SystemExit should
                                    # still propagate.
                                    log.debug(e)
            tools.delay_time(3)
        except Exception as e:
            log.error(e)
示例#29
0
    def run(self, username=None):
        """
        Log in the pool's users that currently have no cookies.

        Every pass tries to take a non-waiting RedisLock keyed on the user
        pool table. Only when the lock is held are the users synced and each
        one without cookies logged in, with the outcome recorded through
        ``record_user_status``. The pass repeats every 10 seconds while
        ``self._keep_alive`` is truthy and stops after one pass otherwise.

        @param username: when given, only the user with this username is handled
        @return: None
        """
        while True:
            try:
                with RedisLock(key=self._tab_user_pool,
                               lock_timeout=3600,
                               wait_timeout=0) as pool_lock:
                    if pool_lock.locked:
                        self.__sycn_users_info()
                        online_count = 0
                        for user in self.users:
                            if username and username != user.username:
                                continue

                            try:
                                # Already has cookies: counts as online
                                if user.cookies:
                                    online_count += 1
                                    continue

                                # Pre-check the login interval
                                if not user.is_time_to_login():
                                    log.info("账号{}与上次登录时间间隔过短,暂不登录: 将在{}登录使用".
                                             format(user.username,
                                                    user.next_login_time()))
                                    continue

                                user = self.login(user)
                                if not user.cookies:
                                    log.info("登录失败 {}".format(user.username))
                                    self.record_user_status(
                                        user.user_id,
                                        GoldUserStatus.LOGIN_FALIED)
                                    continue

                                # Login worked: persist the cookies
                                user.set_login_time()
                                self.add_user(user)
                                self.record_user_status(
                                    user.user_id,
                                    GoldUserStatus.LOGIN_SUCCESS)
                                log.debug("登录成功 {}".format(user.username))
                                online_count += 1
                            except NotImplementedError:
                                # login() is not implemented: terminate the process
                                log.error(
                                    f"{self.__class__.__name__} must be implementation login method!"
                                )
                                os._exit(0)
                            except Exception as e:
                                log.exception(e)
                                msg = f"{user.username} 账号登陆失败 exception: {str(e)}"
                                log.info(msg)
                                self.record_user_status(
                                    user.user_id, GoldUserStatus.LOGIN_FALIED)

                                send_msg(
                                    msg=msg,
                                    level="error",
                                    message_prefix=f"{user.username} 账号登陆失败",
                                )

                        log.info("当前在线user数为 {}".format(online_count))

                if not self._keep_alive:
                    break
                time.sleep(10)

            except Exception as e:
                log.exception(e)
                time.sleep(1)
示例#30
0
    def get_response(self, save_cached=False):
        """
        Send this request and return a selector-capable response.

        Defaults are filled into ``self.requests_kwargs`` (timeout, stream,
        verify, User-Agent, proxies) and the request is then downloaded through
        one of three paths: a pooled webdriver when ``self.render`` is truthy,
        ``self._session`` when sessions are enabled, or ``requests.request``.

        @param save_cached: when truthy, pass the response to ``self.save_cached`` so debugging runs need not download again
        @return: the wrapped response
        @raise: any exception raised while driving the browser is re-raised after the browser has been removed from the webdriver pool
        """
        # Default timeout
        self.requests_kwargs.setdefault("timeout", 22)

        # stream: by default the response body is downloaded as soon as the
        # request is made. stream=True defers the body download until
        # Response.content is accessed, so only the headers are fetched up
        # front. Drawback: Requests cannot release the connection back to the
        # pool until all data is consumed or Response.close is called, which
        # can make connection reuse inefficient.
        self.requests_kwargs.setdefault("stream", True)

        # Certificate verification is off unless the caller enabled it
        self.requests_kwargs.setdefault("verify", False)

        # Request method: an explicit attribute wins, otherwise a request body
        # implies POST. A `json` body counts the same as `data`; without this
        # a JSON payload would be sent with GET.
        method = self.__dict__.get("method")
        if not method:
            if "data" in self.requests_kwargs or "json" in self.requests_kwargs:
                method = "POST"
            else:
                method = "GET"

        # Random User-Agent
        headers = self.requests_kwargs.get("headers", {})
        if "user-agent" not in headers and "User-Agent" not in headers:
            if self.random_user_agent and setting.RANDOM_HEADERS:
                headers.update(
                    {"User-Agent": self.__class__.user_agent_pool.get()})
                self.requests_kwargs.update(headers=headers)
        else:
            self.requests_kwargs.setdefault(
                "headers", {"User-Agent": setting.DEFAULT_USERAGENT})

        # Proxy: -1 is the sentinel for "caller did not pass proxies"
        proxies = self.requests_kwargs.get("proxies", -1)
        if proxies == -1 and setting.PROXY_ENABLE and self.__class__.proxies_pool:
            # Keeps asking the pool until it hands out a proxy
            while True:
                proxies = self.__class__.proxies_pool.get()
                if proxies:
                    self.requests_kwargs.update(proxies=proxies)
                    break
                else:
                    log.debug("暂无可用代理 ...")

        log.debug("""
                -------------- %srequest for ----------------
                url  = %s
                method = %s
                body = %s
                """ % (
            "" if not self.parser_name else "%s.%s " % (
                self.parser_name,
                (self.callback and callable(self.callback) and getattr(
                    self.callback, "__name__") or self.callback) or "parse",
            ),
            self.url,
            method,
            self.requests_kwargs,
        ))

        # self.use_session takes priority over the global setting
        use_session = (setting.USE_SESSION
                       if self.use_session is None else self.use_session)

        if self.render:
            browser = self._webdriver_pool.get()

            try:
                browser.get(self.url)
                html = browser.page_source
                response = Response.from_dict({
                    "url": browser.current_url,
                    "cookies": browser.cookies,
                    "text": html,
                    "_content": html.encode(),
                    # status_code and elapsed are fixed placeholder values
                    "status_code": 200,
                    "elapsed": 666,
                    "headers": {
                        "User-Agent":
                        browser.execute_script("return navigator.userAgent")
                    },
                })

                response._cached_text = html
                # The browser goes back to the pool right after rendering, so
                # it is not attached to the response.
                self._webdriver_pool.put(browser)
            except Exception:
                # Drop the failed browser from the pool, then propagate with
                # the original traceback intact
                self._webdriver_pool.remove(browser)
                raise

        elif use_session:
            response = self._session.request(method, self.url,
                                             **self.requests_kwargs)
            response = Response(response)
        else:
            response = requests.request(method, self.url,
                                        **self.requests_kwargs)
            response = Response(response)

        if save_cached:
            self.save_cached(response,
                             expire_time=self.__class__.cached_expire_time)

        return response