Example #1
    def __export_to_db(self, tab_item, datas, is_update=False, update_keys=()):
        to_table = tools.get_info(tab_item, ":s_(.*?)_item$", fetch_one=True)

        # metrics / validation check
        self.check_datas(table=to_table, datas=datas)

        for pipeline in self._pipelines:
            if is_update:
                if to_table == self._task_table and not isinstance(
                    pipeline, MysqlPipeline
                ):
                    continue

                if not pipeline.update_items(to_table, datas, update_keys=update_keys):
                    log.error(
                        f"{pipeline.__class__.__name__} failed to update data. table: {to_table}  items: {datas}"
                    )
                    return False

            else:
                if not pipeline.save_items(to_table, datas):
                    log.error(
                        f"{pipeline.__class__.__name__} failed to save data. table: {to_table}  items: {datas}"
                    )
                    return False

        # If this is the task table and none of the pipelines above is a mysql pipeline, the task still has to be updated via mysql
        if not self._have_mysql_pipeline and is_update and to_table == self._task_table:
            self.mysql_pipeline.update_items(to_table, datas, update_keys=update_keys)
Example #2
    def add(self, sql, exception_callfunc=None):
        """

        Args:
            sql:
            exception_callfunc: exception callback

        Returns: number of affected rows

        """
        affect_count = None

        try:
            conn, cursor = self.get_connection()
            affect_count = cursor.execute(sql)
            conn.commit()

        except Exception as e:
            log.error(
                """
                error:%s
                sql:  %s
            """
                % (e, sql)
            )
            if exception_callfunc:
                exception_callfunc(e)
        finally:
            self.close_connection(conn, cursor)

        return affect_count
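
Note that in add() above, conn and cursor are unbound in the finally block if get_connection() itself raises. A self-contained sketch of the same pattern with plain pymysql, with guarded cleanup (connection parameters and the table are placeholders, not from the snippet above):

import pymysql

def add(sql, exception_callfunc=None):
    """Execute a single statement and return the affected row count, or None on error."""
    conn = cursor = None
    affect_count = None
    try:
        conn = pymysql.connect(host="127.0.0.1", user="root", password="root", database="test")
        cursor = conn.cursor()
        affect_count = cursor.execute(sql)
        conn.commit()
    except Exception as e:
        print("error: %s  sql: %s" % (e, sql))
        if exception_callfunc:
            exception_callfunc(e)
    finally:
        # conn / cursor may still be None if the connect itself failed
        if cursor:
            cursor.close()
        if conn:
            conn.close()
    return affect_count
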
Example #3
    def _make_absolute(self, link):
        """Makes a given link absolute."""
        try:

            link = link.strip()

            # Parse the link with stdlib.
            parsed = urlparse(link)._asdict()

            # If link is relative, then join it with base_url.
            if not parsed["netloc"]:
                return urljoin(self.url, link)

            # Link is absolute; if it lacks a scheme, add one from base_url.
            if not parsed["scheme"]:
                parsed["scheme"] = urlparse(self.url).scheme

                # Reconstruct the URL to incorporate the new scheme.
                parsed = (v for v in parsed.values())
                return urlunparse(parsed)

        except Exception as e:
            log.error(
                "Invalid URL <{}> can't make absolute_link. exception: {}".
                format(link, e))

        # Link is absolute and complete with scheme; nothing to be done here.
        return link
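
The same normalization can be reproduced with the stdlib alone; a minimal sketch (base_url and the sample links are placeholders):

from urllib.parse import urljoin, urlparse, urlunparse

def make_absolute(base_url, link):
    link = link.strip()
    parsed = urlparse(link)
    if not parsed.netloc:                  # relative link -> join with the base
        return urljoin(base_url, link)
    if not parsed.scheme:                  # protocol-relative, e.g. //cdn.example.com/x
        parsed = parsed._replace(scheme=urlparse(base_url).scheme)
        return urlunparse(parsed)
    return link                            # already absolute and has a scheme

print(make_absolute("https://example.com/news/", "a.html"))          # https://example.com/news/a.html
print(make_absolute("https://example.com/", "//cdn.example.com/x"))  # https://cdn.example.com/x
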
Example #4
    def update_items(self,
                     table,
                     items: List[Dict],
                     update_keys: Tuple = ()) -> bool:
        """
        Update data
        Args:
            table: table name
            items: data, [{},{},...]
            update_keys: columns to update, e.g. ("title", "publish_time")

        Returns: whether the update succeeded, True / False
                 If False, this batch will not be written to the dedup store, so it can be imported again later

        """

        sql, datas = tools.make_batch_sql(table,
                                          items,
                                          update_columns=update_keys
                                          or list(items[0].keys()))
        update_count = self.to_db.add_batch(sql, datas)
        if update_count is None:
            log.error("Failed to update data in table %s" % table)
        else:
            msg = "Updated %s rows into table %s" % (update_count // 2, table)
            if update_keys:
                msg += ", updated columns: {}".format(update_keys)
            log.info(msg)

        return update_count is not None
Example #5
    def delete(self, sql):
        """
        Delete
        Args:
            sql:

        Returns: True / False

        """
        try:
            conn, cursor = self.get_connection()
            cursor.execute(sql)
            conn.commit()

        except Exception as e:
            log.error(
                """
                error:%s
                sql:  %s
            """
                % (e, sql)
            )
            return False
        else:
            return True
        finally:
            self.close_connection(conn, cursor)
Example #6
    def update(self, coll_name, data: Dict, condition: Dict, upsert: bool = False):
        """
        Update
        Args:
            coll_name: collection name
            data: a single document, {"xxx": "xxx"}
            condition: update filter, {"_id": "xxxx"}
            upsert: insert when the document does not exist, defaults to False

        Returns: True / False
        """
        try:
            collection = self.get_collection(coll_name)
            collection.update_one(condition, {"$set": data}, upsert=upsert)
        except Exception as e:
            log.error(
                """
                error:{}
                condition: {}
            """.format(
                    e, condition
                )
            )
            return False
        else:
            return True
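
A minimal pymongo sketch of the same upsert call (the connection URI, database, collection and filter are placeholders):

from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
coll = client["test_db"]["news"]

# update_one with upsert=True inserts the document when the filter matches nothing
result = coll.update_one({"_id": "xxxx"}, {"$set": {"title": "new title"}}, upsert=True)
print(result.matched_count, result.modified_count, result.upserted_id)
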
Example #7
    def update_items(self, tab_item, items_data, update_keys=()):
        """
        @summary:
        ---------
        @param tab_item: name of the items table in redis
        @param items_data: [item.to_dict] data
        @param update_keys: columns to update
        ---------
        @result:
        """
        to_table = tools.get_info(tab_item, ":s_(.*?)_item", fetch_one=True)
        sql, datas = tools.make_batch_sql(
            to_table,
            items_data,
            update_columns=update_keys or list(items_data[0].keys()),
        )
        update_count = self.to_db.add_batch(sql, datas)
        if update_count is None:
            log.error("Failed to update data in table %s" % to_table)
        else:
            msg = "Updated %s rows into table %s" % (update_count // 2, to_table)
            if update_keys:
                msg += ", updated columns: {}".format(update_keys)
            log.info(msg)

        return update_count is not None
Example #8
    def add_batch(self, sql, datas: List[Dict]):
        """
        @summary: insert data in batch
        ---------
        @param sql: insert ignore into (xxx, xxx) values (%s, %s, %s)
        @param datas: list, [[..], [...]]
        ---------
        @result: number of affected rows
        """
        affect_count = None

        try:
            conn, cursor = self.get_connection()
            affect_count = cursor.executemany(sql, datas)
            conn.commit()

        except Exception as e:
            log.error(
                """
                error:%s
                sql:  %s
                """
                % (e, sql)
            )
        finally:
            self.close_connection(conn, cursor)

        return affect_count
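
add_batch() expects a parameterized SQL template plus a list of rows for executemany; a self-contained sketch with plain pymysql (table and connection parameters are placeholders):

import pymysql

conn = pymysql.connect(host="127.0.0.1", user="root", password="root", database="test")
cursor = conn.cursor()

sql = "insert ignore into news (title, url) values (%s, %s)"
datas = [("t1", "https://example.com/1"), ("t2", "https://example.com/2")]

affect_count = cursor.executemany(sql, datas)  # number of affected rows
conn.commit()
cursor.close()
conn.close()
print(affect_count)
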
Example #9
 def wapper(*args, **kwargs):
     for i in range(3):
         try:
             return func(*args, **kwargs)
         except (err.InterfaceError, err.OperationalError) as e:
             log.error("""
                 error:%s
                 sql:  %s
                 """ % (e, kwargs.get("sql") or args[1]))
Example #10
    def run(self):
        """
        @summary: overridden run method; checks whether the tasks in mysql are done and stops when they are
        ---------
        ---------
        @result:
        """
        try:
            self.create_batch_record_table()

            if not self._parsers:  # not in add_parser mode
                self._parsers.append(self)

            self._start()

            while True:
                try:
                    self.heartbeat()
                    if (
                            self.task_is_done() and self.all_thread_is_done()
                    ):  # all tasks in redis are done and all tasks in mysql are done (all_thread_is_done checks every thread, to avoid updating the task state and ending the program before the tasks really finish)
                        if not self._is_notify_end:
                            self.spider_end()
                            self.record_spider_state(
                                spider_type=2,
                                state=1,
                                batch_date=self._batch_date_cache,
                                spider_end_time=tools.get_current_date(),
                                batch_interval=self._batch_interval,
                            )

                            self._is_notify_end = True

                        if not self._keep_alive:
                            self._stop_all_thread()
                            break
                    else:
                        self._is_notify_end = False

                    self.check_task_status()

                except Exception as e:
                    log.exception(e)

                tools.delay_time(10)  # check the spider status every 10 seconds

        except Exception as e:
            msg = "《%s》main thread exception, spider exits. exception: %s" % (self._batch_name, e)
            log.error(msg)
            self.send_msg(msg,
                          level="error",
                          message_prefix="《%s》spider ended abnormally" % self._batch_name)

            os._exit(137)  # exit so the wait status is 35072 (137 << 8), making it easy for the spider manager to restart the crawler
Example #11
    def record_batch(self):
        """
        @summary: record batch info (initialization)
        ---------
        ---------
        @result:
        """

        # query the total number of tasks
        sql = "select count(1) from %s%s" % (
            self._task_table,
            self._task_condition_prefix_where,
        )
        total_task_count = self._mysqldb.find(sql)[0][0]

        batch_date = tools.get_current_date(self._date_format)

        sql = (
            "insert into %s (batch_date, done_count, total_count, `interval`, interval_unit, create_time) values ('%s', %s, %s, %s, '%s', CURRENT_TIME)"
            % (
                self._batch_record_table,
                batch_date,
                0,
                total_task_count,
                self._batch_interval
                if self._batch_interval >= 1
                else self._batch_interval * 24,
                "day" if self._batch_interval >= 1 else "hour",
            )
        )

        affect_count = self._mysqldb.add(sql)  # None / 0 / 1 (1 means success)
        if affect_count:
            # reset the batch date
            self._batch_date_cache = batch_date
            # refresh os.environ.get('batch_date') used by self.batch_date, otherwise the date stays at the previous batch
            os.environ["batch_date"] = self._batch_date_cache

            # spider starts
            self.spider_begin()
            self.record_spider_state(
                spider_type=2,
                state=0,
                batch_date=batch_date,
                spider_start_time=tools.get_current_date(),
                batch_interval=self._batch_interval,
            )
        else:
            log.error("Failed to insert the new batch")

        return affect_count
Example #12
    def _reconnect(self):
        # Check the connection state; reconnect automatically when the database restarts or a timeout drops the connection
        retry_count = 0
        while True:
            try:
                retry_count += 1
                log.error(f"redis connection lost, reconnecting, attempt {retry_count}")
                if self.get_connect():
                    log.info("redis reconnected successfully")
                    return True
            except (ConnectionError, TimeoutError) as e:
                log.error(f"reconnect failed, e: {e}")

            time.sleep(2)
Example #13
    def __init__(
        self, ip=None, port=None, db=None, user_name=None, user_pass=None, **kwargs
    ):
        # The values in setting may be modified, so defaults cannot be bound in the signature; load them here instead
        if not ip:
            ip = setting.MYSQL_IP
        if not port:
            port = setting.MYSQL_PORT
        if not db:
            db = setting.MYSQL_DB
        if not user_name:
            user_name = setting.MYSQL_USER_NAME
        if not user_pass:
            user_pass = setting.MYSQL_USER_PASS

        try:

            self.connect_pool = PooledDB(
                creator=pymysql,
                mincached=1,
                maxcached=100,
                maxconnections=100,
                blocking=True,
                ping=7,
                host=ip,
                port=port,
                user=user_name,
                passwd=user_pass,
                db=db,
                charset="utf8mb4",
                cursorclass=cursors.SSCursor,
            )  # cursorclass: use a server-side cursor; with the default cursor, large multi-threaded batch inserts keep increasing memory

        except Exception as e:
            log.error(
                """
            Failed to connect to the database:
            ip: {}
            port: {}
            db: {}
            user_name: {}
            user_pass: {}
            exception: {}
            """.format(
                    ip, port, db, user_name, user_pass, e
                )
            )
        else:
            log.debug("Connected to mysql database %s : %s" % (ip, db))
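
A sketch of taking a connection from a DBUtils pool configured like the one above (the import path is for DBUtils 2.x, older releases expose it as DBUtils.PooledDB; credentials are placeholders):

import pymysql
from pymysql import cursors
from dbutils.pooled_db import PooledDB  # DBUtils >= 2.0

pool = PooledDB(creator=pymysql, mincached=1, maxcached=10, host="127.0.0.1", port=3306,
                user="root", passwd="root", db="test", charset="utf8mb4",
                cursorclass=cursors.SSCursor)

conn = pool.connection()      # borrow a pooled pymysql connection
cursor = conn.cursor()        # an SSCursor, because of cursorclass above
cursor.execute("select 1")
print(cursor.fetchone())
cursor.close()
conn.close()                  # returns the connection to the pool rather than really closing it
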
Example #14
    def run(self):
        while True:
            try:
                try:
                    with RedisLock(
                        key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0
                    ) as _lock:
                        if _lock.locked:
                            for user in self._load_user():
                                retry_times = 0
                                while retry_times <= self._login_retry_times:
                                    try:
                                        user = self.login(user)
                                        if user:
                                            self.add_user(user)
                                        else:
                                            self.handle_login_failed_user(user)
                                        break
                                    except NotImplementedError:
                                        log.error(
                                            f"{self.__class__.__name__} must implement the login method!"
                                        )
                                        os._exit(0)
                                    except Exception as e:
                                        self.handel_exception(e)
                                    log.debug(
                                        f"login failed, user: {user} retry_times: {retry_times}"
                                    )
                                    retry_times += 1
                                else:
                                    self.handle_login_failed_user(user)

                            now_user_count = self._redisdb.hget_count(
                                self._tab_user_pool
                            )
                            log.info("Current number of online users: {}".format(now_user_count))

                except Exception as e:
                    log.exception(e)

                if self._keep_alive:
                    tools.delay_time(10)
                else:
                    break

            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
Example #15
    def update(self, sql):
        try:
            conn, cursor = self.get_connection()
            cursor.execute(sql)
            conn.commit()

        except Exception as e:
            log.error("""
                error:%s
                sql:  %s
            """ % (e, sql))
            return False
        else:
            return True
        finally:
            self.close_connection(conn, cursor)
Example #16
    def set_unique_key(self, table, key):
        try:
            sql = "alter table %s add unique (%s)" % (table, key)

            conn, cursor = self.get_connection()
            cursor.execute(sql)
            conn.commit()

        except Exception as e:
            log.error(table + " " + str(e) + " key = " + key)
            return False
        else:
            log.debug("Created a unique index on table %s, index: %s" % (table, key))
            return True
        finally:
            self.close_connection(conn, cursor)
Example #17
    def login(self) -> Optional[GuestUser]:
        """
        By default uses webdriver to log in and generate cookies; may be overridden
        """
        with WebDriver(**self._kwargs) as driver:
            driver.get(self._page_url)

            cookies = driver.cookies

            for key in self._must_contained_keys:
                if key not in cookies:
                    break
            else:
                user = GuestUser(user_agent=driver.user_agent, cookies=cookies)
                return user

            log.error("Failed to get cookies, cookies = {}".format(cookies))
            return None
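
The for/else above is plain Python: the else branch runs only when the loop finishes without break, i.e. only when every required key is present. A tiny standalone illustration (the key names are placeholders):

must_contained_keys = ("token", "session_id")
cookies = {"token": "abc", "session_id": "xyz"}

for key in must_contained_keys:
    if key not in cookies:
        print("missing cookie:", key)
        break
else:
    print("all required cookies present:", cookies)
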
Example #18
    def delete(self, table, condition: Dict):
        """
        Delete
        Args:
            table:
            condition: query filter
        Returns: True / False

        """
        try:
            collection = self.get_collection(table)
            collection.delete_one(condition)
        except Exception as e:
            log.error("""
                error:{}
                condition: {}
            """.format(e, condition))
            return False
        else:
            return True
Example #19
    def export_items(self, tab_item, items_data):
        """
        @summary:
        ---------
        @param tab_item: name of the items table in redis
        @param items_data: [item.to_dict] data
        ---------
        @result:
        """

        to_table = tools.get_info(tab_item, ":s_(.*?)_item", fetch_one=True)
        sql, datas = tools.make_batch_sql(to_table, items_data)
        add_count = self.to_db.add_batch(sql, datas)
        datas_size = len(datas)
        if add_count is None:
            log.error("Failed to export data to table %s" % to_table)
        else:
            log.info("Exported %s rows to %s, %s duplicates" %
                     (datas_size, to_table, datas_size - add_count))

        return add_count is not None
Example #20
    def update(self, table, data: Dict, condition: Dict):
        """
        Update
        Args:
            table: table name
            data: data, {"xxx": "xxx"}
            condition: update filter, {"_id": "xxxx"}

        Returns: True / False
        """
        try:
            collection = self.get_collection(table)
            collection.update_one(condition, {"$set": data})
        except Exception as e:
            log.error("""
                error:{}
                condition: {}
            """.format(e, condition))
            return False
        else:
            return True
Example #21
    def create_cookie(self):
        """
        May be overridden
        @return:
        """
        with WebDriver(**self._kwargs) as driver:
            driver.get(self._page_url)

            cookies = driver.get_cookies()

            cookies_json = {}
            for cookie in cookies:
                cookies_json[cookie["name"]] = cookie["value"]

            for key in self._must_contained_keys:
                if key not in cookies_json:
                    break
            else:
                return cookies_json

            log.error("Failed to get cookies, cookies = {}".format(cookies_json))
            return None
Example #22
    def save_items(self, table, items: List[Dict]) -> bool:
        """
        Save data
        Args:
            table: table name
            items: data, [{},{},...]

        Returns: whether the save succeeded, True / False
                 If False, this batch will not be written to the dedup store, so it can be imported again later

        """

        sql, datas = tools.make_batch_sql(table, items)
        add_count = self.to_db.add_batch(sql, datas)
        datas_size = len(datas)
        if add_count is None:
            log.error("Failed to export data to table %s" % table)
        else:
            log.info("Exported %s rows to %s, %s duplicates" %
                     (datas_size, table, datas_size - add_count))

        return add_count is not None
Example #23
    def batch_date(self):
        """
        @summary: get the batch date
        ---------
        ---------
        @result:
        """

        batch_date = os.environ.get("batch_date")
        if not batch_date:
            sql = 'select date_format(batch_date, "{date_format}") from {batch_record_table} order by id desc limit 1'.format(
                date_format=self._date_format.replace(":%M", ":%i"),
                batch_record_table=self._batch_record_table,
            )
            batch_info = MysqlDB().find(sql)  # (('2018-08-19'),)
            if batch_info:
                os.environ["batch_date"] = batch_date = batch_info[0][0]
            else:
                log.error("start_monitor_task() must be run first")
                os._exit(137)  # exit so the wait status is 35072 (137 << 8), making it easy for the spider manager to restart the crawler

        return batch_date
Example #24
    def update_task_state(self, task_id, state=1, **kwargs):
        """
        @summary: update the task state in the task table; the code must call this explicitly after finishing each task. May be overridden.
        Call it as: yield lambda: self.update_task_state(task_id, state)
        ---------
        @param task_id:
        @param state:
        ---------
        @result:
        """

        kwargs["id"] = task_id
        kwargs[self._task_state] = state

        sql = tools.make_update_sql(
            self._task_table, kwargs, condition="id = {task_id}".format(task_id=task_id)
        )

        if self._mysqldb.update(sql):
            log.debug("Set the state of task %s successfully" % task_id)
        else:
            log.error("Failed to set the state of task %s  sql=%s" % (task_id, sql))
Example #25
    def export(self, from_table, to_table, auto_update=False, batch_count=100):
        """
        @summary:
        Export data from redis items into a relational database such as mysql/oracle.
        from_table and to_table must have identical table structures.
        ---------
        @param from_table:
        @param to_table:
        @param auto_update: whether to update automatically when the row already exists, defaults to no
        ---------
        @result:
        """
        total_count = 0

        while True:
            datas = []
            try:
                datas = self.redisdb.sget(from_table,
                                          count=batch_count,
                                          is_pop=False)
                if not datas:
                    log.info("""
                        \r%s -> %s exported %s rows in total""" %
                             (from_table, to_table, total_count))
                    break

                json_datas = [eval(data) for data in datas]
                sql, json_datas = tools.make_batch_sql(to_table, json_datas,
                                                       auto_update)
                if self.to_db.add_batch(sql, json_datas):
                    total_count += len(json_datas)
                    self.redisdb.srem(from_table, datas)

            except Exception as e:
                log.exception(e)
                log.error(datas)
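
The loop above rebuilds each cached dict with eval(); if the cached strings are plain Python dict literals, ast.literal_eval parses them without executing arbitrary code. A small sketch of that safer variant (the sample string is a placeholder, and it assumes the cached values really are literals):

import ast

datas = ["{'title': 't1', 'url': 'https://example.com/1'}"]
json_datas = [ast.literal_eval(data) for data in datas]
print(json_datas)  # [{'title': 't1', 'url': 'https://example.com/1'}]
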
Example #26
    def check_task_status(self):
        """
        Check the task status and send alerts
        """
        # check once per minute
        now_time = time.time()
        if now_time - self._last_check_task_status_time > 60:
            self._last_check_task_status_time = now_time
        else:
            return

        # Check the task status in redis; if the task count has not changed for 20 consecutive minutes (the parser may be stuck), send an alert
        task_count = self._redisdb.zget_count(self._tab_requests)

        if task_count:
            if task_count != self._last_task_count:
                self._last_task_count = task_count
                self._redisdb.hset(
                    self._tab_spider_time,
                    SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
                    tools.get_current_timestamp(),
                )  # multiple processes would send duplicate messages; record the last count time in redis
            else:
                # check whether the interval exceeds 20 minutes
                lua = """
                    -- local key = KEYS[1]
                    local field = ARGV[1]
                    local current_timestamp = ARGV[2]

                    -- read the stored value
                    local last_timestamp = redis.call('hget', KEYS[1], field)
                    if last_timestamp and current_timestamp - last_timestamp >= 1200 then
                        return current_timestamp - last_timestamp -- return how long the tasks have stalled, in seconds
                    end

                    if not last_timestamp then
                        redis.call('hset', KEYS[1], field, current_timestamp)
                    end

                    return 0

                """
                redis_obj = self._redisdb.get_redis_obj()
                cmd = redis_obj.register_script(lua)
                overtime = cmd(
                    keys=[self._tab_spider_time],
                    args=[
                        SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
                        tools.get_current_timestamp(),
                    ],
                )

                if overtime:
                    # send an alert
                    msg = "《{}》spider tasks have stalled for {}, please check whether the spider is working".format(
                        self._spider_name, tools.format_seconds(overtime)
                    )
                    log.error(msg)
                    self.send_msg(
                        msg,
                        level="error",
                        message_prefix="《{}》spider tasks stalled".format(self._spider_name),
                    )

        else:
            self._last_task_count = 0

        # check the number of failed tasks, alert if over 1000
        failed_count = self._redisdb.zget_count(self._tab_failed_requests)
        if failed_count > setting.WARNING_FAILED_COUNT:
            # send an alert
            msg = "《%s》spider currently has %s failed tasks, please check whether the spider is working" % (self._spider_name, failed_count)
            log.error(msg)
            self.send_msg(
                msg,
                level="error",
                message_prefix="《%s》spider failed-task-count warning" % (self._spider_name),
            )

        # parser_control keeps real-time counts of finished and failed tasks; alert when failures > 10 and failed / finished >= 0.5
        failed_task_count, success_task_count = PaserControl.get_task_status_count()
        total_count = success_task_count + failed_task_count
        if total_count > 0:
            task_success_rate = success_task_count / total_count
            if task_success_rate < 0.5:
                # send an alert
                msg = "《%s》spider successful tasks %s, failed tasks %s, success rate %.2f, please check whether the spider is working" % (
                    self._spider_name,
                    success_task_count,
                    failed_task_count,
                    task_success_rate,
                )
                log.error(msg)
                # record the time of the last alert; alert again only if more than an hour has passed (multi-process, avoid duplicate alerts)
                self.send_msg(
                    msg,
                    level="error",
                    message_prefix="《%s》spider task success rate" % (self._spider_name),
                )
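
A minimal redis-py sketch of the register_script() mechanism used above; the key, field and script body are trimmed-down placeholders rather than the original spider keys:

import time
import redis

r = redis.Redis(host="127.0.0.1", port=6379)

lua = """
local last = redis.call('hget', KEYS[1], ARGV[1])
if not last then
    redis.call('hset', KEYS[1], ARGV[1], ARGV[2])
    return 0
end
return ARGV[2] - last
"""
stalled_seconds = r.register_script(lua)  # returns a callable Script object
print(stalled_seconds(keys=["spider:time"], args=["last_task_count_time", int(time.time())]))
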
Example #27
    def deal_requests(self, requests):
        for request in requests:

            response = None
            request_redis = request["request_redis"]
            request = request["request_obj"]

            del_request_redis_after_item_to_db = False
            del_request_redis_after_request_to_db = False

            for parser in self._parsers:
                if parser.name == request.parser_name:
                    used_download_midware_enable = False
                    try:
                        # record a document that needs downloading
                        self.record_download_status(
                            PaserControl.DOWNLOAD_TOTAL, parser.name)

                        # handle the request
                        if request.auto_request:
                            request_temp = None
                            response = None
                            # download middleware
                            if request.download_midware:
                                download_midware = (
                                    request.download_midware
                                    if callable(request.download_midware)
                                    else tools.get_method(
                                        parser, request.download_midware))
                                request_temp = download_midware(request)
                            elif request.download_midware is not False:
                                request_temp = parser.download_midware(request)

                            # make the request
                            if request_temp:
                                if (isinstance(request_temp, (tuple, list))
                                        and len(request_temp) == 2):
                                    request_temp, response = request_temp

                                if not isinstance(request_temp, Request):
                                    raise Exception(
                                        "download_midware must return a request, but received type: {}"
                                        .format(type(request_temp)))
                                used_download_midware_enable = True
                                if not response:
                                    response = (
                                        request_temp.get_response() if
                                        not setting.RESPONSE_CACHED_USED else
                                        request_temp.get_response_from_cached(
                                            save_cached=False))
                            else:
                                response = (request.get_response()
                                            if not setting.RESPONSE_CACHED_USED
                                            else
                                            request.get_response_from_cached(
                                                save_cached=False))

                            if response is None:
                                raise Exception(
                                    "Connection timed out, url: %s" %
                                    (request.url or request_temp.url))

                        else:
                            response = None

                        # validate the response
                        if parser.validate(request, response) is False:
                            continue

                        if request.callback:  # if the request has a parser callback, handle it with the callback
                            callback_parser = (request.callback if callable(
                                request.callback) else tools.get_method(
                                    parser, request.callback))
                            results = callback_parser(request, response)
                        else:  # otherwise handle it with the parser's default parse
                            results = parser.parse(request, response)

                        if results and not isinstance(results, Iterable):
                            raise Exception(
                                "%s.%s must return an iterable" %
                                (parser.name, request.callback or "parse"))

                        # track what the previous result was
                        result_type = 0  # 0/1/2 (initial / request / item)
                        # decide whether each result is a request or an item
                        for result in results or []:
                            if isinstance(result, Request):
                                result_type = 1
                                # assign parser_name to the request
                                result.parser_name = result.parser_name or parser.name

                                # decide whether the callback is synchronous or asynchronous
                                if result.request_sync:  # synchronous
                                    request_dict = {
                                        "request_obj": result,
                                        "request_redis": None,
                                    }
                                    requests.append(request_dict)
                                else:  # asynchronous
                                    # queue the next_request for storage
                                    self._request_buffer.put_request(result)
                                    del_request_redis_after_request_to_db = True

                            elif isinstance(result, Item):
                                result_type = 2
                                # queue the item for storage
                                self._item_buffer.put_item(result)
                                # the in-progress request must be deleted afterwards
                                del_request_redis_after_item_to_db = True

                            elif callable(result):  # result is a zero-argument callable
                                if (result_type == 2
                                    ):  # callback of an item: run it only after every item in the buffer has been stored
                                    self._item_buffer.put_item(result)
                                    del_request_redis_after_item_to_db = True

                                else:  # result_type == 1: callback of a request, run after every request in the buffer has been stored. Some parsers may return a callback directly
                                    self._request_buffer.put_request(result)
                                    del_request_redis_after_request_to_db = True

                            # else:
                            #     raise TypeError('Expect Request, Item or a callback func, but got type: {}'.format(type(result)))

                    except Exception as e:
                        exception_type = (str(type(e)).replace("<class '",
                                                               "").replace(
                                                                   "'>", ""))
                        if exception_type.startswith("requests"):
                            # record a failed download
                            self.record_download_status(
                                PaserControl.DOWNLOAD_EXCEPTION, parser.name)

                        else:
                            # record a parser exception
                            self.record_download_status(
                                PaserControl.PAESERS_EXCEPTION, parser.name)

                        if setting.LOG_LEVEL == "DEBUG":  # only print in debug mode, timeout exceptions are too verbose
                            log.exception(e)

                        log.error("""
                            -------------- %s.%s error -------------
                            error          %s
                            response       %s
                            deal request   %s
                            """ % (
                            parser.name,
                            (request.callback and callable(request.callback)
                             and getattr(request.callback, "__name__")
                             or request.callback) or "parse",
                            str(e),
                            response,
                            tools.dumps_json(request.to_dict, indent=28)
                            if setting.LOG_LEVEL == "DEBUG" else request,
                        ))

                        request.error_msg = "%s: %s" % (exception_type, e)
                        request.response = str(response)

                        if "Invalid URL" in str(e):
                            request.is_abandoned = True

                        requests = parser.exception_request(
                            request, response) or [request]
                        if not isinstance(requests, Iterable):
                            raise Exception("%s.%s must return an iterable" %
                                            (parser.name, "exception_request"))
                        for request in requests:
                            if callable(request):
                                self._request_buffer.put_request(request)
                                continue

                            if not isinstance(request, Request):
                                raise Exception(
                                    "exception_request must yield a request")

                            if (request.retry_times + 1 >
                                    setting.SPIDER_MAX_RETRY_TIMES
                                    or request.is_abandoned):
                                self.__class__._failed_task_count += 1  # count failed tasks

                                # handle the return value of failed_request: a request or a func
                                results = parser.failed_request(
                                    request, response) or [request]
                                if not isinstance(results, Iterable):
                                    raise Exception(
                                        "%s.%s must return an iterable" %
                                        (parser.name, "failed_request"))

                                for result in results:
                                    if isinstance(result, Request):
                                        if setting.SAVE_FAILED_REQUEST:
                                            if used_download_midware_enable:
                                                # strip the attributes added by download_midware
                                                original_request = (
                                                    Request.from_dict(
                                                        eval(request_redis)) if
                                                    request_redis else result)
                                                original_request.error_msg = (
                                                    request.error_msg)
                                                original_request.response = (
                                                    request.response)

                                                self._request_buffer.put_failed_request(
                                                    original_request)
                                            else:
                                                self._request_buffer.put_failed_request(
                                                    result)

                                    elif callable(result):
                                        self._request_buffer.put_request(
                                            result)

                                    elif isinstance(result, Item):
                                        self._item_buffer.put_item(result)

                                del_request_redis_after_request_to_db = True

                            else:
                                # re-queue the request for crawling
                                request.retry_times += 1
                                request.filter_repeat = False
                                log.info("""
                                    re-queued, waiting for retry
                                    url     %s
                                    retry count %s
                                    max allowed retries %s""" % (
                                    request.url,
                                    request.retry_times,
                                    setting.SPIDER_MAX_RETRY_TIMES,
                                ))
                                if used_download_midware_enable:
                                    # strip the attributes added by download_midware and use the original request
                                    original_request = (Request.from_dict(
                                        eval(request_redis)) if request_redis
                                                        else request)
                                    if hasattr(request, "error_msg"):
                                        original_request.error_msg = request.error_msg
                                    if hasattr(request, "response"):
                                        original_request.response = request.response
                                    original_request.retry_times = request.retry_times
                                    original_request.filter_repeat = (
                                        request.filter_repeat)

                                    self._request_buffer.put_request(
                                        original_request)
                                else:
                                    self._request_buffer.put_request(request)
                                del_request_redis_after_request_to_db = True

                    else:
                        # record a successful download
                        self.record_download_status(
                            PaserControl.DOWNLOAD_SUCCESS, parser.name)
                        # count successful tasks
                        self.__class__._success_task_count += 1

                        # cache the successfully downloaded document
                        if setting.RESPONSE_CACHED_ENABLE:
                            request.save_cached(
                                response=response,
                                expire_time=setting.
                                RESPONSE_CACHED_EXPIRE_TIME,
                            )

                    finally:
                        # release the browser
                        if response and hasattr(response, "browser"):
                            request._webdriver_pool.put(response.browser)

                    break

            # delete the in-progress request; deletion together with the item takes priority
            if request_redis:
                if del_request_redis_after_item_to_db:
                    self._item_buffer.put_item(request_redis)

                elif del_request_redis_after_request_to_db:
                    self._request_buffer.put_del_request(request_redis)

                else:
                    self._request_buffer.put_del_request(request_redis)

        if setting.SPIDER_SLEEP_TIME:
            time.sleep(setting.SPIDER_SLEEP_TIME)
Example #28
    def deal_requests(self, requests):
        for request in requests:

            response = None

            for parser in self._parsers:
                if parser.name == request.parser_name:
                    try:
                        # record a document that needs downloading
                        self.record_download_status(
                            PaserControl.DOWNLOAD_TOTAL, parser.name)

                        # handle the request
                        if request.auto_request:
                            request_temp = None
                            response = None

                            # download middleware
                            if request.download_midware:
                                download_midware = (
                                    request.download_midware
                                    if callable(request.download_midware)
                                    else tools.get_method(
                                        parser, request.download_midware))
                                request_temp = download_midware(request)
                            elif request.download_midware is not False:
                                request_temp = parser.download_midware(request)

                            # make the request
                            if request_temp:
                                if (isinstance(request_temp, (tuple, list))
                                        and len(request_temp) == 2):
                                    request_temp, response = request_temp

                                if not isinstance(request_temp, Request):
                                    raise Exception(
                                        "download_midware must return a request, but received type: {}"
                                        .format(type(request_temp)))
                                request = request_temp

                            if not response:
                                response = (request.get_response()
                                            if not setting.RESPONSE_CACHED_USED
                                            else
                                            request.get_response_from_cached(
                                                save_cached=False))

                        else:
                            response = None

                        # validate the response
                        if parser.validate(request, response) is False:
                            continue

                        if request.callback:  # if the request has a parser callback, handle it with the callback
                            callback_parser = (request.callback if callable(
                                request.callback) else tools.get_method(
                                    parser, request.callback))
                            results = callback_parser(request, response)
                        else:  # otherwise handle it with the parser's default parse
                            results = parser.parse(request, response)

                        if results and not isinstance(results, Iterable):
                            raise Exception(
                                "%s.%s must return an iterable" %
                                (parser.name, request.callback or "parse"))

                        # decide whether each result is a request or an item
                        for result in results or []:
                            if isinstance(result, Request):
                                # assign parser_name to the request
                                result.parser_name = result.parser_name or parser.name

                                # decide whether the callback is synchronous or asynchronous
                                if result.request_sync:  # synchronous
                                    requests.append(result)
                                else:  # asynchronous
                                    # queue the next_request for storage
                                    self._memory_db.add(result)

                            elif isinstance(result, Item):
                                self._item_buffer.put_item(result)

                    except Exception as e:
                        exception_type = (str(type(e)).replace("<class '",
                                                               "").replace(
                                                                   "'>", ""))
                        if exception_type.startswith("requests"):
                            # record a failed download
                            self.record_download_status(
                                PaserControl.DOWNLOAD_EXCEPTION, parser.name)

                        else:
                            # record a parser exception
                            self.record_download_status(
                                PaserControl.PAESERS_EXCEPTION, parser.name)

                        if setting.LOG_LEVEL == "DEBUG":  # only print in debug mode, timeout exceptions are too verbose
                            log.exception(e)

                        log.error("""
                                -------------- %s.%s error -------------
                                error          %s
                                response       %s
                                deal request   %s
                                """ % (
                            parser.name,
                            (request.callback and callable(request.callback)
                             and getattr(request.callback, "__name__")
                             or request.callback) or "parse",
                            str(e),
                            response,
                            tools.dumps_json(request.to_dict, indent=28)
                            if setting.LOG_LEVEL == "DEBUG" else request,
                        ))

                        request.error_msg = "%s: %s" % (exception_type, e)
                        request.response = str(response)

                        if "Invalid URL" in str(e):
                            request.is_abandoned = True

                        requests = parser.exception_request(
                            request, response) or [request]
                        if not isinstance(requests, Iterable):
                            raise Exception("%s.%s must return an iterable" %
                                            (parser.name, "exception_request"))
                        for request in requests:
                            if not isinstance(request, Request):
                                raise Exception(
                                    "exception_request must yield a request")

                            if (request.retry_times + 1 >
                                    setting.SPIDER_MAX_RETRY_TIMES
                                    or request.is_abandoned):
                                self.__class__._failed_task_count += 1  # count failed tasks

                                # handle the return value of failed_request: a request or a func
                                results = parser.failed_request(
                                    request, response) or [request]
                                if not isinstance(results, Iterable):
                                    raise Exception(
                                        "%s.%s must return an iterable" %
                                        (parser.name, "failed_request"))

                                log.info("""
                                    task exceeded the maximum number of retries, discarded
                                    url     %s
                                    retry count %s
                                    max allowed retries %s""" % (
                                    request.url,
                                    request.retry_times,
                                    setting.SPIDER_MAX_RETRY_TIMES,
                                ))

                            else:
                                # re-queue the request for crawling
                                request.retry_times += 1
                                request.filter_repeat = False
                                log.info("""
                                        re-queued, waiting for retry
                                        url     %s
                                        retry count %s
                                        max allowed retries %s""" % (
                                    request.url,
                                    request.retry_times,
                                    setting.SPIDER_MAX_RETRY_TIMES,
                                ))
                                self._memory_db.add(request)

                    else:
                        # record a successful download
                        self.record_download_status(
                            PaserControl.DOWNLOAD_SUCCESS, parser.name)
                        # count successful tasks
                        self.__class__._success_task_count += 1

                        # cache the successfully downloaded document
                        if setting.RESPONSE_CACHED_ENABLE:
                            request.save_cached(
                                response=response,
                                expire_time=setting.
                                RESPONSE_CACHED_EXPIRE_TIME,
                            )

                    finally:
                        # release the browser
                        if response and hasattr(response, "browser"):
                            request._webdriver_pool.put(response.browser)

                    break

        if setting.SPIDER_SLEEP_TIME:
            time.sleep(setting.SPIDER_SLEEP_TIME)
Example #29
 def clear(self, table):
     try:
         self._redis.delete(table)
     except Exception as e:
         log.error(e)
Example #30
    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_val:
            log.error(exc_val)

        self.quit()
        return True
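
Returning True from __exit__, as above, tells Python the exception has been handled, so errors raised inside the with block are logged and then swallowed. A tiny standalone demonstration of that behavior:

class Quiet:
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_val:
            print("swallowed:", exc_val)
        return True  # suppress the exception

with Quiet():
    raise ValueError("boom")
print("execution continues")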