Пример #1
0
 def put_failed_request(self, request, table=None):
     """Hand a failed request to ``self._db.zadd``.

     Args:
         request: Object exposing ``to_dict`` and ``priority`` attributes.
         table: Target table name; ``self._table_failed_request`` is used
             when this is falsy.

     Any exception raised along the way is logged with ``log.exception``
     and not propagated.
     """
     try:
         payload = request.to_dict
         target_table = table or self._table_failed_request
         self._db.zadd(target_table, payload, request.priority)
     except Exception as err:
         log.exception(err)
Пример #2
0
    def start_monitor_task(self, *args, **kws):
        """Dispatch new tasks whenever the pending-request count runs low.

        Returns at once when ``is_reach_next_spider_time()`` is falsy.
        Otherwise it loops: the pending count of the request table is read
        through a fresh ``RedisDB`` and, when it is below
        ``self._min_task_count``, ``distribute_task(*args, **kws)`` is called.
        Exceptions inside one round are logged and the loop goes on. The loop
        ends after the first round when ``self._auto_stop_when_spider_done``
        is truthy; otherwise it sleeps ``self._check_task_interval`` seconds
        between rounds.
        """
        if not self.is_reach_next_spider_time():
            return

        self._auto_start_requests = False
        redisdb = RedisDB()

        if not self._parsers:  # not running in add_parser mode
            self._parsers.append(self)

        while True:
            try:
                # look up how many requests are still waiting in redis
                pending = redisdb.zget_count(
                    setting.TAB_REQUSETS.format(
                        table_folder=self._table_folder))

                if pending >= self._min_task_count:
                    log.info("redis 中尚有%s条积压任务,暂时不派发新任务" % pending)
                else:
                    # backlog is low: make start requests
                    self.distribute_task(*args, **kws)

            except Exception as err:
                log.exception(err)

            if self._auto_stop_when_spider_done:
                break

            time.sleep(self._check_task_interval)
Пример #3
0
    def __add_item_to_db(self, items, update_items, requests, callbacks,
                         items_fingerprints):
        """Export one batch of buffered data and run its follow-up actions.

        Steps, in order: optional de-duplication of ``items``, grouping by
        table, bulk insert, bulk update, callbacks, removal of finished
        requests from ``self._table_request`` and, when every export
        succeeded, registration of the fingerprints in the dedup filter.

        ``self._is_adding_to_db`` is True for the duration of the call and is
        always reset afterwards, even when a step raises.

        Args:
            items: Items to insert.
            update_items: Items to write in update mode.
            requests: Finished request entries to remove; skipped when empty.
            callbacks: Zero-argument callables. The list is drained; an
                exception raised by one callback is logged and the rest still run.
            items_fingerprints: Fingerprints to register in the dedup filter
                (presumably one per entry of ``items`` -- confirm with caller).
        """
        self._is_adding_to_db = True
        try:
            # de-duplicate
            if setting.ITEM_FILTER_ENABLE:
                items, items_fingerprints = self.__dedup_items(
                    items, items_fingerprints)

            # group by table
            items_dict = self.__pick_items(items)
            update_items_dict = self.__pick_items(update_items,
                                                  is_update_item=True)

            # The batch counts as exported only when at least one table was
            # written and none of the writes failed. Keeping just the last
            # result would let a late success hide an earlier failure and
            # mark never-stored items as already seen.
            exported_any = False
            all_exports_ok = True

            # bulk insert
            while items_dict:
                tab_item, datas = items_dict.popitem()

                log.debug("""
                -------------- item 批量入库 --------------
                表名: %s
                datas: %s
                    """ % (tab_item, tools.dumps_json(datas, indent=16)))

                exported_any = True
                if not self.__export_to_db(tab_item, datas):
                    all_exports_ok = False

            # bulk update
            while update_items_dict:
                tab_item, datas = update_items_dict.popitem()
                log.debug("""
                -------------- item 批量更新 --------------
                表名: %s
                datas: %s
                    """ % (tab_item, tools.dumps_json(datas, indent=16)))

                update_keys = self._item_update_keys.get(tab_item)
                exported_any = True
                if not self.__export_to_db(tab_item,
                                           datas,
                                           is_update=True,
                                           update_keys=update_keys):
                    all_exports_ok = False

            export_success = exported_any and all_exports_ok

            # run callbacks
            while callbacks:
                try:
                    callback = callbacks.pop(0)
                    callback()
                except Exception as e:
                    log.exception(e)

            # remove the requests that have been processed
            if requests:
                self._db.zrem(self._table_request, requests)

            # register fingerprints in the dedup filter
            if export_success and setting.ITEM_FILTER_ENABLE:
                if items_fingerprints:
                    self.__class__.dedup.add(items_fingerprints,
                                             skip_check=True)
        finally:
            # always release the flag, otherwise observers of
            # _is_adding_to_db would see a write in progress forever
            self._is_adding_to_db = False
Пример #4
0
    def __add_request_to_db(self):
        """Flush buffered requests to the request table and purge finished ones.

        Drains ``self._requests_deque``: callables are collected as callbacks,
        duplicate requests are dropped (when the request asks for filtering
        and ``setting.REQUEST_FILTER_ENABLE`` is set), and the rest are
        written with ``self._db.zadd`` in batches of more than
        ``MAX_URL_COUNT`` entries. Afterwards the callbacks run and the
        entries of ``self._del_requests_deque`` are removed with
        ``self._db.zrem``, except those that were just inserted by this call.

        ``self._is_adding_to_db`` is set while requests are being drained and
        is always reset on exit, even when a step raises.
        """
        request_list = []
        prioritys = []
        callbacks = []
        # Every request string queued for insertion during this call. Kept
        # separately because request_list is reset after each bulk zadd.
        added_requests = set()

        try:
            while self._requests_deque:
                request = self._requests_deque.popleft()
                self._is_adding_to_db = True

                if callable(request):
                    # A function.
                    # Note: mind closures. A closure can be written as
                    # def test(xxx = xxx):
                    #     # TODO business logic using xxx
                    # so that xxx is not the last value of the loop.
                    callbacks.append(request)
                    continue

                priority = request.priority

                # skip when filtering is requested and the request is already known
                if (request.filter_repeat and setting.REQUEST_FILTER_ENABLE
                        and not self.__class__.dedup.add(request.fingerprint)):
                    log.debug("request已存在  url = %s" % request.url)
                    continue

                request_str = str(request.to_dict)
                request_list.append(request_str)
                added_requests.add(request_str)
                prioritys.append(priority)

                if len(request_list) > MAX_URL_COUNT:
                    self._db.zadd(self._table_request, request_list, prioritys)
                    request_list = []
                    prioritys = []

            # store the remainder
            if request_list:
                self._db.zadd(self._table_request, request_list, prioritys)

            # run callbacks
            for callback in callbacks:
                try:
                    callback()
                except Exception as e:
                    log.exception(e)

            # delete finished tasks
            if self._del_requests_deque:
                request_done_list = []
                while self._del_requests_deque:
                    request_done_list.append(
                        self._del_requests_deque.popleft())

                # Exclude everything inserted above, otherwise a request that
                # was just added could be deleted again. Subtracting only the
                # last partial batch would miss earlier batches.
                request_done_list = list(
                    set(request_done_list) - added_requests)

                if request_done_list:
                    self._db.zrem(self._table_request, request_done_list)
        finally:
            # always release the flag so it cannot stick at True after an error
            self._is_adding_to_db = False
Пример #5
0
    def run(self):
        """Repeatedly flush buffered requests until ``self._thread_stop`` is set.

        Each round calls ``__add_request_to_db``; exceptions are logged and
        do not end the loop. ``tools.delay_time(1)`` is called between rounds.
        """
        while True:
            if self._thread_stop:
                break

            try:
                self.__add_request_to_db()
            except Exception as err:
                log.exception(err)

            tools.delay_time(1)
Пример #6
0
    def run(self):
        """Repeatedly pull input data until ``self._thread_stop`` is set.

        Each round calls ``__input_data`` (exceptions are logged, not
        raised), clears ``self._is_collector_task`` and then sleeps for
        ``self._interval`` seconds.
        """
        while True:
            if self._thread_stop:
                break

            try:
                self.__input_data()
            except Exception as err:
                log.exception(err)

            self._is_collector_task = False
            time.sleep(self._interval)
Пример #7
0
    def flush(self):
        """Drain the item queue and pass its contents on in batches.

        Every queued entry is sorted into one of four groups: callables,
        ``UpdateItem`` instances, ``Item`` instances (their fingerprint is
        collected too when ``setting.ITEM_FILTER_ENABLE`` is set) and
        everything else, which is treated as a finished request. A batch is
        handed to ``__add_item_to_db`` once it holds ``UPLOAD_BATCH_MAX_SIZE``
        entries; the remainder is handed over when the queue is empty.
        Exceptions are logged and swallowed.
        """
        try:
            items, update_items, requests = [], [], []
            callbacks, items_fingerprints = [], []
            pending = 0

            while not self._items_queue.empty():
                entry = self._items_queue.get_nowait()
                pending += 1

                # classify the entry
                if callable(entry):
                    callbacks.append(entry)
                elif isinstance(entry, UpdateItem):
                    update_items.append(entry)
                elif isinstance(entry, Item):
                    items.append(entry)
                    if setting.ITEM_FILTER_ENABLE:
                        items_fingerprints.append(entry.fingerprint)
                else:  # request-redis
                    requests.append(entry)

                if pending < UPLOAD_BATCH_MAX_SIZE:
                    continue

                # batch is full: write it out and start a fresh one
                self.__add_item_to_db(
                    items, update_items, requests, callbacks, items_fingerprints
                )
                items, update_items, requests = [], [], []
                callbacks, items_fingerprints = [], []
                pending = 0

            if pending:
                self.__add_item_to_db(
                    items, update_items, requests, callbacks, items_fingerprints
                )

        except Exception as err:
            log.exception(err)
Пример #8
0
    def __put_requests(self, requests_list):
        """Rebuild request objects from their stored form and queue them.

        For every entry of ``requests_list`` a dict is appended to
        ``self._todo_requests`` holding the rebuilt object
        (``"request_obj"``) and the untouched stored value
        (``"request_redis"``). Entries that cannot be rebuilt are logged and
        skipped.
        """
        for raw_request in requests_list:
            try:
                # NOTE(review): eval() runs whatever expression the stored
                # value contains; only safe while the store is trusted.
                entry = {
                    "request_obj": Request.from_dict(eval(raw_request)),
                    "request_redis": raw_request,
                }
            except Exception as err:
                log.exception("""
                error %s
                request %s
                """ % (err, raw_request))
                continue

            self._todo_requests.append(entry)
Пример #9
0
    def run(self):
        """Process requests from ``self._memory_db`` until stopped.

        When ``self._memory_db.get()`` yields something truthy it is passed,
        wrapped in a one-element list, to ``deal_requests``. Otherwise a
        waiting hint is logged once, the loop sleeps one second and
        ``self._wait_task_time`` is incremented. Exceptions are logged and
        the loop continues.
        """
        while not self._thread_stop:
            try:
                task = self._memory_db.get()
                if task:
                    self.is_show_tip = False
                    self.deal_requests([task])
                else:
                    # nothing to do: log the hint only once per idle period
                    if not self.is_show_tip:
                        log.info("parser 等待任务 ...")
                        self.is_show_tip = True

                    time.sleep(1)
                    self._wait_task_time += 1

            except Exception as err:
                log.exception(err)
Пример #10
0
    def run(self):
        """Process request batches from the collector until stopped.

        Each round asks ``self._collector.get_requests`` for up to
        ``setting.PARSER_TASK_COUNT`` requests and passes a truthy result to
        ``deal_requests``. When nothing is returned a waiting hint is logged
        once, the loop sleeps one second and ``self._wait_task_time`` is
        incremented. Exceptions are logged and the loop continues.
        """
        while not self._thread_stop:
            try:
                batch = self._collector.get_requests(
                    setting.PARSER_TASK_COUNT)
                if batch:
                    self.is_show_tip = False
                    self.deal_requests(batch)
                else:
                    # nothing to do: log the hint only once per idle period
                    if not self.is_show_tip:
                        log.info("parser 等待任务 ...")
                        self.is_show_tip = True

                    time.sleep(1)
                    self._wait_task_time += 1

            except Exception as err:
                log.exception(err)
Пример #11
0
    def export(self, from_table, to_table, auto_update=False, batch_count=100):
        """
        @summary:
        Export data from redis items into a relational database such as mysql/oracle.
        from_table and to_table must share the same structure.
        ---------
        @param from_table: source table, read with self._redisdb.sget
        @param to_table: destination table name handed to tools.make_batch_sql
        @param auto_update: whether existing rows are updated automatically; off by default
        @param batch_count: number of records requested per round
        ---------
        @result: None; the total is only written to the log
        """
        exported = 0

        # NOTE(review): a batch that keeps failing is fetched again right
        # away (is_pop=False, no pause), so the loop only ends once the
        # source is empty -- confirm this retry-forever behaviour is intended.
        while True:
            batch = []
            try:
                batch = self._redisdb.sget(from_table,
                                           count=batch_count,
                                           is_pop=False)
                if not batch:
                    log.info("""
                        \r%s -> %s 共导出 %s 条数据""" %
                             (from_table, to_table, exported))
                    break

                # NOTE(review): eval() executes the stored text; only safe
                # while the redis content is trusted.
                rows = [eval(raw) for raw in batch]
                sql, rows = tools.make_batch_sql(to_table, rows, auto_update)
                if self._to_db.add_batch(sql, rows):
                    exported += len(rows)
                    # records leave redis only after a successful write
                    self._redisdb.srem(from_table, batch)

            except Exception as err:
                log.exception(err)
                log.error(batch)
Пример #12
0
    def start_monitor_task(self):
        """
        @summary: Monitor task status and keep redis supplied with tasks.
        Registers self as the only parser when none was added, creates the
        batch record table, calls add_task() on every parser and then loops:
        when check_batch() reports the batch as done the loop either ends or
        waits for new tasks, depending on _auto_stop_when_spider_done;
        otherwise, if redis holds fewer than _min_task_count pending
        requests, pending tasks (or, failing that, lost tasks) are fetched
        from mysql and handed to distribute_task().
        Exceptions inside one round are logged and the loop continues.
        ---------
        ---------
        @result: None
        """
        if not self._parsers:  # not multi-template mode: inject self into parsers so that self is the template
            self._is_more_parsers = False
            self._parsers.append(self)

        elif len(self._parsers) <= 1:
            self._is_more_parsers = False

        self.create_batch_record_table()

        # add tasks
        for parser in self._parsers:
            parser.add_task()

        is_first_check = True
        while True:
            try:
                if self.check_batch(is_first_check):  # this batch is finished
                    if not self._auto_stop_when_spider_done:
                        is_first_check = True
                        log.info("爬虫所有任务已做完,不自动结束,等待新任务...")
                        time.sleep(self._check_task_interval)
                        continue
                    else:
                        break

                is_first_check = False

                # check whether redis still has tasks; when fewer than _min_task_count, fetch from mysql
                tab_requests = setting.TAB_REQUSETS.format(
                    table_folder=self._table_folder)
                todo_task_count = self._redisdb.zget_count(tab_requests)

                tasks = []
                if todo_task_count < self._min_task_count:  # fetch tasks from mysql
                    # update the task status counts in the batch table
                    self.update_task_done_count()

                    log.info("redis 中剩余任务%s 数量过小 从mysql中取任务追加" %
                             todo_task_count)
                    tasks = self.get_todo_task_from_mysql()
                    if not tasks:  # tasks in state 0 are all done; check whether tasks in state 2 were lost

                        if (todo_task_count == 0
                            ):  # redis has no pending tasks, so state-2 tasks in mysql are lost tasks and must be redone
                            lose_task_count = self.get_lose_task_count()

                            if not lose_task_count:
                                time.sleep(self._check_task_interval)
                                continue

                            elif (
                                    lose_task_count > self._task_limit * 5
                            ):  # too many lost tasks: reset them directly, otherwise each next batch of lost tasks is only fetched after redis drains, which is too slow
                                log.info("正在重置丢失任务为待做 共 {} 条".format(
                                    lose_task_count))
                                # reset in-progress tasks to pending
                                if self.reset_lose_task_from_mysql():
                                    log.info("重置丢失任务成功")
                                else:
                                    log.info("重置丢失任务失败")

                                # this continue skips the sleep at the bottom of the loop
                                continue

                            else:  # few lost tasks: fetch them directly
                                log.info("正在取丢失任务 共 {} 条, 取 {} 条".format(
                                    lose_task_count,
                                    self._task_limit
                                    if self._task_limit <= lose_task_count else
                                    lose_task_count,
                                ))
                                tasks = self.get_doing_task_from_mysql()

                    else:
                        log.info("mysql 中取到待做任务 %s 条" % len(tasks))

                else:
                    log.info("redis 中尚有%s条积压任务,暂时不派发新任务" % todo_task_count)

                if not tasks:
                    if todo_task_count >= self._min_task_count:
                        # tasks are still in progress; nothing is logged here
                        pass
                    else:
                        log.info("mysql 中无待做任务 redis中剩余任务 %s" %
                                 todo_task_count)
                else:
                    # make start requests
                    self.distribute_task(tasks)
                    log.info("添加任务到redis成功")

            except Exception as e:
                log.exception(e)

            time.sleep(self._check_task_interval)
Пример #13
0
    def deal_requests(self, requests):
        """Download and parse each request, then route the produced results.

        ``requests`` is a list of dicts with the keys ``"request_obj"`` (the
        request to process) and ``"request_redis"`` (its stored form, or None
        for synchronous follow-up requests created here). For every entry the
        parser whose ``name`` equals ``request.parser_name`` is used:

        * unless ``request.auto_request`` is falsy, the response is fetched,
          optionally after a download middleware returned a modified request;
        * ``request.callback`` (or ``parser.parser``) is called with the
          request and response, and its results are routed: requests go to
          ``self._request_buffer`` (or are appended to ``requests`` when
          synchronous), items go to ``self._item_buffer``, callables go to
          whichever buffer matches the previous result type;
        * on an exception the error is recorded and logged,
          ``parser.exception_request`` is consulted, and each returned
          request is either re-queued for retry or, past
          ``setting.PARSER_MAX_RETRY_TIMES`` / when abandoned, passed through
          ``parser.failed_request``.

        Finally the stored form of the request is queued for deletion, and
        the method sleeps ``setting.PARSER_SLEEP_TIME`` seconds when set.
        """
        for request in requests:

            response = None
            request_redis = request["request_redis"]
            request = request["request_obj"]

            del_request_redis_after_item_to_db = False
            del_request_redis_after_request_to_db = False

            for parser in self._parsers:
                if parser.name == request.parser_name:
                    used_download_midware_enable = False
                    try:
                        # record that a document needs to be downloaded
                        self.record_download_status(
                            PaserControl.DOWNLOAD_TOTAL, parser.name)

                        # resolve the request
                        if request.auto_request:
                            request_temp = None
                            if request.download_midware:
                                download_midware = (
                                    request.download_midware
                                    if callable(request.download_midware)
                                    else tools.get_method(
                                        parser, request.download_midware))
                                request_temp = download_midware(request)
                            elif request.download_midware != False:
                                request_temp = parser.download_midware(request)

                            if request_temp:
                                if not isinstance(request_temp, Request):
                                    raise Exception(
                                        "download_midware need return a request, but received type: {}"
                                        .format(type(request_temp)))
                                used_download_midware_enable = True
                                response = (
                                    request_temp.get_response()
                                    if not setting.RESPONSE_CACHED_USED else
                                    request_temp.get_response_from_cached(
                                        save_cached=False))
                            else:
                                response = (request.get_response()
                                            if not setting.RESPONSE_CACHED_USED
                                            else
                                            request.get_response_from_cached(
                                                save_cached=False))

                            # NOTE(review): `== None` instead of `is None`;
                            # also, when request.url is falsy and request_temp
                            # is None the message below raises AttributeError.
                            if response == None:
                                raise Exception(
                                    "连接超时 url: %s" %
                                    (request.url or request_temp.url))

                        else:
                            response = None

                        if request.callback:  # use the request's own callback when it has one
                            callback_parser = (request.callback if callable(
                                request.callback) else tools.get_method(
                                    parser, request.callback))
                            results = callback_parser(request, response)
                        else:  # otherwise fall back to parser.parser
                            results = parser.parser(request, response)

                        if results and not isinstance(results, Iterable):
                            raise Exception(
                                "%s.%s返回值必须可迭代" %
                                (parser.name, request.callback or "parser"))

                        # remembers what the previous result was
                        result_type = 0  # 0/1/2 (initial value / request / item)
                        # decide here whether each result is a request or an item
                        for result in results or []:
                            if isinstance(result, Request):
                                result_type = 1
                                # fill in the request's parser_name
                                result.parser_name = result.parser_name or parser.name

                                # synchronous or asynchronous callback?
                                if result.request_sync:  # synchronous
                                    request_dict = {
                                        "request_obj": result,
                                        "request_redis": None,
                                    }
                                    # appended to the list being iterated, so it is handled later in this same call
                                    requests.append(request_dict)
                                else:  # asynchronous
                                    # queue next_request for storage
                                    self._request_buffer.put_request(result)
                                    del_request_redis_after_request_to_db = True

                            elif isinstance(result, Item):
                                result_type = 2
                                # queue the item for storage
                                self._item_buffer.put_item(result)
                                # the request in progress has to be deleted
                                del_request_redis_after_item_to_db = True

                            elif callable(result):  # result is a callable taking no arguments
                                if (result_type == 2
                                    ):  # callback of an item: run once the buffered items are stored
                                    self._item_buffer.put_item(result)
                                    del_request_redis_after_item_to_db = True

                                else:  # result_type == 1: callback of a request, run once the buffered requests are stored. Some parsers may return a callback directly
                                    self._request_buffer.put_request(result)
                                    del_request_redis_after_request_to_db = True

                            # NOTE: results of any other type are silently
                            # ignored; a TypeError raise was disabled here.

                    except Exception as e:
                        exception_type = (str(type(e)).replace("<class '",
                                                               "").replace(
                                                                   "'>", ""))
                        if exception_type.startswith("requests"):
                            # record a download failure
                            self.record_download_status(
                                PaserControl.DOWNLOAD_EXCEPTION, parser.name)

                        else:
                            # record a parsing exception
                            self.record_download_status(
                                PaserControl.PAESERS_EXCEPTION, parser.name)

                        if setting.LOG_LEVEL == "DEBUG":  # printed only in debug mode; timeout tracebacks are too long
                            log.exception(e)

                        log.error("""
                            -------------- %s.%s error -------------
                            error          %s
                            response       %s
                            deal request   %s
                            """ % (
                            parser.name,
                            (request.callback and callable(request.callback)
                             and getattr(request.callback, "__name__")
                             or request.callback) or "parser",
                            str(e),
                            response,
                            tools.dumps_json(request.to_dict, indent=28)
                            if setting.LOG_LEVEL == "DEBUG" else request,
                        ))

                        request.error_msg = "%s: %s" % (exception_type, e)
                        request.response = str(response)

                        if "Invalid URL" in str(e):
                            request.is_abandoned = True

                        # NOTE(review): this rebinds the `requests` parameter
                        # (and, below, the outer loop variable `request`).
                        # The outer `for` keeps iterating over the original
                        # list, so synchronous requests appended to
                        # `requests` by later iterations end up in this new
                        # object and are never processed -- confirm intended.
                        requests = parser.exception_request(
                            request, response) or [request]
                        if not isinstance(requests, Iterable):
                            raise Exception("%s.%s返回值必须可迭代" %
                                            (parser.name, "exception_request"))
                        for request in requests:
                            if callable(request):
                                self._request_buffer.put_request(request)
                                continue

                            if not isinstance(request, Request):
                                raise Exception(
                                    "exception_request 需return request")

                            if (request.retry_times + 1 >
                                    setting.PARSER_MAX_RETRY_TIMES
                                    or request.is_abandoned):
                                self.__class__._failed_task_count += 1  # count failed tasks

                                # handle what failed_request returned: request or func
                                results = parser.failed_request(
                                    request, response) or [request]
                                if not isinstance(results, Iterable):
                                    raise Exception(
                                        "%s.%s返回值必须可迭代" %
                                        (parser.name, "failed_request"))

                                for result in results:
                                    if isinstance(result, Request):
                                        if setting.SAVE_FAILED_REQUEST:
                                            if used_download_midware_enable:
                                                # drop the attributes added by download_midware
                                                # NOTE(review): eval() executes the stored text; only safe while the store is trusted
                                                original_request = (
                                                    Request.from_dict(
                                                        eval(request_redis)) if
                                                    request_redis else result)
                                                original_request.error_msg = (
                                                    request.error_msg)
                                                original_request.response = (
                                                    request.response)

                                                self._request_buffer.put_failed_request(
                                                    original_request)
                                            else:
                                                self._request_buffer.put_failed_request(
                                                    result)

                                    elif callable(result):
                                        self._request_buffer.put_request(
                                            result)

                                    elif isinstance(result, Item):
                                        self._item_buffer.put_item(result)

                                del_request_redis_after_request_to_db = True

                            else:
                                # put the request back into storage to be crawled again
                                request.retry_times += 1
                                request.filter_repeat = False
                                log.info("""
                                    入库 等待重试
                                    url     %s
                                    重试次数 %s
                                    最大允许重试次数 %s""" % (
                                    request.url,
                                    request.retry_times,
                                    setting.PARSER_MAX_RETRY_TIMES,
                                ))
                                if used_download_midware_enable:
                                    # drop the attributes added by download_midware and use the original request
                                    original_request = (Request.from_dict(
                                        eval(request_redis)) if request_redis
                                                        else request)
                                    if hasattr(request, "error_msg"):
                                        original_request.error_msg = request.error_msg
                                    if hasattr(request, "response"):
                                        original_request.response = request.response
                                    original_request.retry_times = request.retry_times
                                    original_request.filter_repeat = (
                                        request.filter_repeat)

                                    self._request_buffer.put_request(
                                        original_request)
                                else:
                                    self._request_buffer.put_request(request)
                                del_request_redis_after_request_to_db = True

                    else:
                        # record a successful download
                        self.record_download_status(
                            PaserControl.DOWNLOAD_SUCCESS, parser.name)
                        # count successful tasks
                        self.__class__._success_task_count += 1

                        # cache the successfully downloaded document
                        if setting.RESPONSE_CACHED_ENABLE:
                            request.save_cached(
                                response=response,
                                expire_time=setting.
                                RESPONSE_CACHED_EXPIRE_TIME,
                            )

                    # only the first parser with a matching name is used
                    break

            # delete the request in progress; following the item buffer takes priority
            if request_redis:
                if del_request_redis_after_item_to_db:
                    self._item_buffer.put_item(request_redis)

                # the two branches below perform the same call
                elif del_request_redis_after_request_to_db:
                    self._request_buffer.put_del_request(request_redis)

                else:
                    self._request_buffer.put_del_request(request_redis)

        if setting.PARSER_SLEEP_TIME:
            time.sleep(setting.PARSER_SLEEP_TIME)
Пример #14
0
    def deal_requests(self, requests):
        for request in requests:

            response = None

            for parser in self._parsers:
                if parser.name == request.parser_name:
                    try:
                        # 记录需下载的文档
                        self.record_download_status(
                            PaserControl.DOWNLOAD_TOTAL, parser.name)

                        # 解析request
                        if request.auto_request:
                            request_temp = None
                            if request.download_midware:
                                download_midware = (
                                    request.download_midware
                                    if callable(request.download_midware)
                                    else tools.get_method(
                                        parser, request.download_midware))
                                request_temp = download_midware(request)
                            elif request.download_midware != False:
                                request_temp = parser.download_midware(request)

                            if request_temp:
                                if not isinstance(request_temp, Request):
                                    raise Exception(
                                        "download_midware need return a request, but received type: {}"
                                        .format(type(request_temp)))
                                request = request_temp

                            response = (request.get_response()
                                        if not setting.RESPONSE_CACHED_USED
                                        else request.get_response_from_cached(
                                            save_cached=False))

                        else:
                            response = None

                        if request.callback:  # 如果有parser的回调函数,则用回调处理
                            callback_parser = (request.callback if callable(
                                request.callback) else tools.get_method(
                                    parser, request.callback))
                            results = callback_parser(request, response)
                        else:  # 否则默认用parser处理
                            results = parser.parser(request, response)

                        if results and not isinstance(results, Iterable):
                            raise Exception(
                                "%s.%s返回值必须可迭代" %
                                (parser.name, request.callback or "parser"))

                        # 此处判断是request 还是 item
                        for result in results or []:
                            if isinstance(result, Request):
                                # 给request的 parser_name 赋值
                                result.parser_name = result.parser_name or parser.name

                                # 判断是同步的callback还是异步的
                                if result.request_sync:  # 同步
                                    requests.append(result)
                                else:  # 异步
                                    # 将next_request 入库
                                    self._memory_db.add(result)

                    except Exception as e:
                        exception_type = (str(type(e)).replace("<class '",
                                                               "").replace(
                                                                   "'>", ""))
                        if exception_type.startswith("requests"):
                            # 记录下载失败的文档
                            self.record_download_status(
                                PaserControl.DOWNLOAD_EXCEPTION, parser.name)

                        else:
                            # 记录解析程序异常
                            self.record_download_status(
                                PaserControl.PAESERS_EXCEPTION, parser.name)

                        if setting.LOG_LEVEL == "DEBUG":  # 只有debug模式下打印, 超时的异常篇幅太多
                            log.exception(e)

                        log.error("""
                                -------------- %s.%s error -------------
                                error          %s
                                response       %s
                                deal request   %s
                                """ % (
                            parser.name,
                            (request.callback and callable(request.callback)
                             and getattr(request.callback, "__name__")
                             or request.callback) or "parser",
                            str(e),
                            response,
                            tools.dumps_json(request.to_dict, indent=28)
                            if setting.LOG_LEVEL == "DEBUG" else request,
                        ))

                        request.error_msg = "%s: %s" % (exception_type, e)
                        request.response = str(response)

                        if "Invalid URL" in str(e):
                            request.is_abandoned = True

                        requests = parser.exception_request(
                            request, response) or [request]
                        if not isinstance(requests, Iterable):
                            raise Exception("%s.%s返回值必须可迭代" %
                                            (parser.name, "exception_request"))
                        for request in requests:
                            if not isinstance(request, Request):
                                raise Exception(
                                    "exception_request 需return request")

                            if (request.retry_times + 1 >
                                    setting.PARSER_MAX_RETRY_TIMES
                                    or request.is_abandoned):
                                self.__class__._failed_task_count += 1  # 记录失败任务数

                                # 处理failed_request的返回值 request 或 func
                                results = parser.failed_request(
                                    request, response) or [request]
                                if not isinstance(results, Iterable):
                                    raise Exception(
                                        "%s.%s返回值必须可迭代" %
                                        (parser.name, "failed_request"))

                                log.info("""
                                    任务超过最大重试次数,丢弃
                                    url     %s
                                    重试次数 %s
                                    最大允许重试次数 %s""" % (
                                    request.url,
                                    request.retry_times,
                                    setting.PARSER_MAX_RETRY_TIMES,
                                ))

                            else:
                                # 将 requests 重新入库 爬取
                                request.retry_times += 1
                                request.filter_repeat = False
                                log.info("""
                                        入库 等待重试
                                        url     %s
                                        重试次数 %s
                                        最大允许重试次数 %s""" % (
                                    request.url,
                                    request.retry_times,
                                    setting.PARSER_MAX_RETRY_TIMES,
                                ))
                                self._memory_db.add(request)

                    else:
                        # 记录下载成功的文档
                        self.record_download_status(
                            PaserControl.DOWNLOAD_SUCCESS, parser.name)
                        # 记录成功任务数
                        self.__class__._success_task_count += 1

                        # 缓存下载成功的文档
                        if setting.RESPONSE_CACHED_ENABLE:
                            request.save_cached(
                                response=response,
                                expire_time=setting.
                                RESPONSE_CACHED_EXPIRE_TIME,
                            )

                    break

        if setting.PARSER_SLEEP_TIME:
            time.sleep(setting.PARSER_SLEEP_TIME)
Пример #15
0
 def flush(self):
     """Run the private ``__add_request_to_db`` step, logging any failure.

     The helper's body is not visible here; judging by its name it presumably
     writes pending requests to the database (confirm against its definition).

     Any ``Exception`` raised while looking up or calling the helper is
     reported through ``log.exception`` and then suppressed, so ordinary
     errors are not propagated to the caller.
     """
     try:
         # Lookup and call both stay inside the try so either can be caught.
         push_pending = self.__add_request_to_db
         push_pending()
     except Exception as error:
         # Best-effort: record the traceback and carry on.
         log.exception(error)