예제 #1
0
    def __add_item_to_db(self, items, update_items, requests, callbacks,
                         items_fingerprints):
        export_success = False
        self._is_adding_to_db = True

        # 去重
        if setting.ITEM_FILTER_ENABLE:
            items, items_fingerprints = self.__dedup_items(
                items, items_fingerprints)

        # 分捡
        items_dict = self.__pick_items(items)
        update_items_dict = self.__pick_items(update_items,
                                              is_update_item=True)

        # item批量入库
        while items_dict:
            tab_item, datas = items_dict.popitem()

            log.debug("""
                -------------- item 批量入库 --------------
                表名: %s
                datas: %s
                    """ % (tab_item, tools.dumps_json(datas, indent=16)))

            export_success = self.__export_to_db(tab_item, datas)

        # 执行批量update
        while update_items_dict:
            tab_item, datas = update_items_dict.popitem()
            log.debug("""
                -------------- item 批量更新 --------------
                表名: %s
                datas: %s
                    """ % (tab_item, tools.dumps_json(datas, indent=16)))

            update_keys = self._item_update_keys.get(tab_item)
            export_success = self.__export_to_db(tab_item,
                                                 datas,
                                                 is_update=True,
                                                 update_keys=update_keys)

        # 执行回调
        while callbacks:
            try:
                callback = callbacks.pop(0)
                callback()
            except Exception as e:
                log.exception(e)

        # 删除做过的request
        if requests:
            self._db.zrem(self._table_request, requests)

        # 去重入库
        if export_success and setting.ITEM_FILTER_ENABLE:
            if items_fingerprints:
                self.__class__.dedup.add(items_fingerprints, skip_check=True)

        self._is_adding_to_db = False
예제 #2
0
    def create(self):
        __all__ = []

        import os

        path = os.getcwd()
        for file in os.listdir(path):
            if file.endswith(".py") and not file.startswith("__init__"):
                model = file.split(".")[0]
                __all__.append(model)

        del os

        with open("__init__.py", "w", encoding="utf-8") as file:
            text = "__all__ = %s" % dumps_json(__all__)
            file.write(text)
예제 #3
0
    def create(self, sort_keys=False):
        contents = self.get_data()

        json = {}
        for content in contents:
            content = content.strip()
            if not content or content.startswith(":"):
                continue

            regex = "([^:\s]*)[:|\s]*(.*)"

            result = tools.get_info(content, regex, fetch_one=True)
            if result[0] in json:
                json[result[0]] = json[result[0]] + "&" + result[1]
            else:
                json[result[0]] = result[1].strip()

        print(tools.dumps_json(json, sort_keys=sort_keys))
예제 #4
0
    def deal_requests(self, requests):
        for request in requests:

            response = None
            request_redis = request["request_redis"]
            request = request["request_obj"]

            del_request_redis_after_item_to_db = False
            del_request_redis_after_request_to_db = False

            for parser in self._parsers:
                if parser.name == request.parser_name:
                    used_download_midware_enable = False
                    try:
                        # 记录需下载的文档
                        self.record_download_status(
                            PaserControl.DOWNLOAD_TOTAL, parser.name)

                        # 解析request
                        if request.auto_request:
                            request_temp = None
                            response = None
                            # 下载中间件
                            if request.download_midware:
                                download_midware = (
                                    request.download_midware
                                    if callable(request.download_midware)
                                    else tools.get_method(
                                        parser, request.download_midware))
                                request_temp = download_midware(request)
                            elif request.download_midware != False:
                                request_temp = parser.download_midware(request)

                            # 请求
                            if request_temp:
                                if (isinstance(request_temp, (tuple, list))
                                        and len(request_temp) == 2):
                                    request_temp, response = request_temp

                                if not isinstance(request_temp, Request):
                                    raise Exception(
                                        "download_midware need return a request, but received type: {}"
                                        .format(type(request_temp)))
                                used_download_midware_enable = True
                                if not response:
                                    response = (
                                        request_temp.get_response() if
                                        not setting.RESPONSE_CACHED_USED else
                                        request_temp.get_response_from_cached(
                                            save_cached=False))
                            else:
                                response = (request.get_response()
                                            if not setting.RESPONSE_CACHED_USED
                                            else
                                            request.get_response_from_cached(
                                                save_cached=False))

                            if response == None:
                                raise Exception(
                                    "连接超时 url: %s" %
                                    (request.url or request_temp.url))

                        else:
                            response = None

                        # 校验
                        if parser.validate(request, response) == False:
                            continue

                        if request.callback:  # 如果有parser的回调函数,则用回调处理
                            callback_parser = (request.callback if callable(
                                request.callback) else tools.get_method(
                                    parser, request.callback))
                            results = callback_parser(request, response)
                        else:  # 否则默认用parser处理
                            results = parser.parse(request, response)

                        if results and not isinstance(results, Iterable):
                            raise Exception(
                                "%s.%s返回值必须可迭代" %
                                (parser.name, request.callback or "parse"))

                        # 标识上一个result是什么
                        result_type = 0  # 0\1\2 (初始值\request\item)
                        # 此处判断是request 还是 item
                        for result in results or []:
                            if isinstance(result, Request):
                                result_type = 1
                                # 给request的 parser_name 赋值
                                result.parser_name = result.parser_name or parser.name

                                # 判断是同步的callback还是异步的
                                if result.request_sync:  # 同步
                                    request_dict = {
                                        "request_obj": result,
                                        "request_redis": None,
                                    }
                                    requests.append(request_dict)
                                else:  # 异步
                                    # 将next_request 入库
                                    self._request_buffer.put_request(result)
                                    del_request_redis_after_request_to_db = True

                            elif isinstance(result, Item):
                                result_type = 2
                                # 将item入库
                                self._item_buffer.put_item(result)
                                # 需删除正在做的request
                                del_request_redis_after_item_to_db = True

                            elif callable(result):  # result为可执行的无参函数
                                if (result_type == 2
                                    ):  # item 的 callback,buffer里的item均入库后再执行
                                    self._item_buffer.put_item(result)
                                    del_request_redis_after_item_to_db = True

                                else:  # result_type == 1: # request 的 callback,buffer里的request均入库后再执行。可能有的parser直接返回callback
                                    self._request_buffer.put_request(result)
                                    del_request_redis_after_request_to_db = True

                            # else:
                            #     raise TypeError('Expect Request、Item、callback func, bug get type: {}'.format(type(result)))

                    except Exception as e:
                        exception_type = (str(type(e)).replace("<class '",
                                                               "").replace(
                                                                   "'>", ""))
                        if exception_type.startswith("requests"):
                            # 记录下载失败的文档
                            self.record_download_status(
                                PaserControl.DOWNLOAD_EXCEPTION, parser.name)

                        else:
                            # 记录解析程序异常
                            self.record_download_status(
                                PaserControl.PAESERS_EXCEPTION, parser.name)

                        if setting.LOG_LEVEL == "DEBUG":  # 只有debug模式下打印, 超时的异常篇幅太多
                            log.exception(e)

                        log.error("""
                            -------------- %s.%s error -------------
                            error          %s
                            response       %s
                            deal request   %s
                            """ % (
                            parser.name,
                            (request.callback and callable(request.callback)
                             and getattr(request.callback, "__name__")
                             or request.callback) or "parse",
                            str(e),
                            response,
                            tools.dumps_json(request.to_dict, indent=28)
                            if setting.LOG_LEVEL == "DEBUG" else request,
                        ))

                        request.error_msg = "%s: %s" % (exception_type, e)
                        request.response = str(response)

                        if "Invalid URL" in str(e):
                            request.is_abandoned = True

                        requests = parser.exception_request(
                            request, response) or [request]
                        if not isinstance(requests, Iterable):
                            raise Exception("%s.%s返回值必须可迭代" %
                                            (parser.name, "exception_request"))
                        for request in requests:
                            if callable(request):
                                self._request_buffer.put_request(request)
                                continue

                            if not isinstance(request, Request):
                                raise Exception(
                                    "exception_request 需 yield request")

                            if (request.retry_times + 1 >
                                    setting.SPIDER_MAX_RETRY_TIMES
                                    or request.is_abandoned):
                                self.__class__._failed_task_count += 1  # 记录失败任务数

                                # 处理failed_request的返回值 request 或 func
                                results = parser.failed_request(
                                    request, response) or [request]
                                if not isinstance(results, Iterable):
                                    raise Exception(
                                        "%s.%s返回值必须可迭代" %
                                        (parser.name, "failed_request"))

                                for result in results:
                                    if isinstance(result, Request):
                                        if setting.SAVE_FAILED_REQUEST:
                                            if used_download_midware_enable:
                                                # 去掉download_midware 添加的属性
                                                original_request = (
                                                    Request.from_dict(
                                                        eval(request_redis)) if
                                                    request_redis else result)
                                                original_request.error_msg = (
                                                    request.error_msg)
                                                original_request.response = (
                                                    request.response)

                                                self._request_buffer.put_failed_request(
                                                    original_request)
                                            else:
                                                self._request_buffer.put_failed_request(
                                                    result)

                                    elif callable(result):
                                        self._request_buffer.put_request(
                                            result)

                                    elif isinstance(result, Item):
                                        self._item_buffer.put_item(result)

                                del_request_redis_after_request_to_db = True

                            else:
                                # 将 requests 重新入库 爬取
                                request.retry_times += 1
                                request.filter_repeat = False
                                log.info("""
                                    入库 等待重试
                                    url     %s
                                    重试次数 %s
                                    最大允许重试次数 %s""" % (
                                    request.url,
                                    request.retry_times,
                                    setting.SPIDER_MAX_RETRY_TIMES,
                                ))
                                if used_download_midware_enable:
                                    # 去掉download_midware 添加的属性 使用原来的requests
                                    original_request = (Request.from_dict(
                                        eval(request_redis)) if request_redis
                                                        else request)
                                    if hasattr(request, "error_msg"):
                                        original_request.error_msg = request.error_msg
                                    if hasattr(request, "response"):
                                        original_request.response = request.response
                                    original_request.retry_times = request.retry_times
                                    original_request.filter_repeat = (
                                        request.filter_repeat)

                                    self._request_buffer.put_request(
                                        original_request)
                                else:
                                    self._request_buffer.put_request(request)
                                del_request_redis_after_request_to_db = True

                    else:
                        # 记录下载成功的文档
                        self.record_download_status(
                            PaserControl.DOWNLOAD_SUCCESS, parser.name)
                        # 记录成功任务数
                        self.__class__._success_task_count += 1

                        # 缓存下载成功的文档
                        if setting.RESPONSE_CACHED_ENABLE:
                            request.save_cached(
                                response=response,
                                expire_time=setting.
                                RESPONSE_CACHED_EXPIRE_TIME,
                            )

                    finally:
                        # 释放浏览器
                        if response and hasattr(response, "browser"):
                            request._webdriver_pool.put(response.browser)

                    break

            # 删除正在做的request 跟随item优先
            if request_redis:
                if del_request_redis_after_item_to_db:
                    self._item_buffer.put_item(request_redis)

                elif del_request_redis_after_request_to_db:
                    self._request_buffer.put_del_request(request_redis)

                else:
                    self._request_buffer.put_del_request(request_redis)

        if setting.SPIDER_SLEEP_TIME:
            time.sleep(setting.SPIDER_SLEEP_TIME)
예제 #5
0
    def deal_requests(self, requests):
        for request in requests:

            response = None

            for parser in self._parsers:
                if parser.name == request.parser_name:
                    try:
                        # 记录需下载的文档
                        self.record_download_status(
                            PaserControl.DOWNLOAD_TOTAL, parser.name)

                        # 解析request
                        if request.auto_request:
                            request_temp = None
                            response = None

                            # 下载中间件
                            if request.download_midware:
                                download_midware = (
                                    request.download_midware
                                    if callable(request.download_midware)
                                    else tools.get_method(
                                        parser, request.download_midware))
                                request_temp = download_midware(request)
                            elif request.download_midware != False:
                                request_temp = parser.download_midware(request)

                            # 请求
                            if request_temp:
                                if (isinstance(request_temp, (tuple, list))
                                        and len(request_temp) == 2):
                                    request_temp, response = request_temp

                                if not isinstance(request_temp, Request):
                                    raise Exception(
                                        "download_midware need return a request, but received type: {}"
                                        .format(type(request_temp)))
                                request = request_temp

                            if not response:
                                response = (request.get_response()
                                            if not setting.RESPONSE_CACHED_USED
                                            else
                                            request.get_response_from_cached(
                                                save_cached=False))

                        else:
                            response = None

                        # 校验
                        if parser.validate(request, response) == False:
                            continue

                        if request.callback:  # 如果有parser的回调函数,则用回调处理
                            callback_parser = (request.callback if callable(
                                request.callback) else tools.get_method(
                                    parser, request.callback))
                            results = callback_parser(request, response)
                        else:  # 否则默认用parser处理
                            results = parser.parse(request, response)

                        if results and not isinstance(results, Iterable):
                            raise Exception(
                                "%s.%s返回值必须可迭代" %
                                (parser.name, request.callback or "parse"))

                        # 此处判断是request 还是 item
                        for result in results or []:
                            if isinstance(result, Request):
                                # 给request的 parser_name 赋值
                                result.parser_name = result.parser_name or parser.name

                                # 判断是同步的callback还是异步的
                                if result.request_sync:  # 同步
                                    requests.append(result)
                                else:  # 异步
                                    # 将next_request 入库
                                    self._memory_db.add(result)

                            elif isinstance(result, Item):
                                self._item_buffer.put_item(result)

                    except Exception as e:
                        exception_type = (str(type(e)).replace("<class '",
                                                               "").replace(
                                                                   "'>", ""))
                        if exception_type.startswith("requests"):
                            # 记录下载失败的文档
                            self.record_download_status(
                                PaserControl.DOWNLOAD_EXCEPTION, parser.name)

                        else:
                            # 记录解析程序异常
                            self.record_download_status(
                                PaserControl.PAESERS_EXCEPTION, parser.name)

                        if setting.LOG_LEVEL == "DEBUG":  # 只有debug模式下打印, 超时的异常篇幅太多
                            log.exception(e)

                        log.error("""
                                -------------- %s.%s error -------------
                                error          %s
                                response       %s
                                deal request   %s
                                """ % (
                            parser.name,
                            (request.callback and callable(request.callback)
                             and getattr(request.callback, "__name__")
                             or request.callback) or "parse",
                            str(e),
                            response,
                            tools.dumps_json(request.to_dict, indent=28)
                            if setting.LOG_LEVEL == "DEBUG" else request,
                        ))

                        request.error_msg = "%s: %s" % (exception_type, e)
                        request.response = str(response)

                        if "Invalid URL" in str(e):
                            request.is_abandoned = True

                        requests = parser.exception_request(
                            request, response) or [request]
                        if not isinstance(requests, Iterable):
                            raise Exception("%s.%s返回值必须可迭代" %
                                            (parser.name, "exception_request"))
                        for request in requests:
                            if not isinstance(request, Request):
                                raise Exception(
                                    "exception_request 需 yield request")

                            if (request.retry_times + 1 >
                                    setting.SPIDER_MAX_RETRY_TIMES
                                    or request.is_abandoned):
                                self.__class__._failed_task_count += 1  # 记录失败任务数

                                # 处理failed_request的返回值 request 或 func
                                results = parser.failed_request(
                                    request, response) or [request]
                                if not isinstance(results, Iterable):
                                    raise Exception(
                                        "%s.%s返回值必须可迭代" %
                                        (parser.name, "failed_request"))

                                log.info("""
                                    任务超过最大重试次数,丢弃
                                    url     %s
                                    重试次数 %s
                                    最大允许重试次数 %s""" % (
                                    request.url,
                                    request.retry_times,
                                    setting.SPIDER_MAX_RETRY_TIMES,
                                ))

                            else:
                                # 将 requests 重新入库 爬取
                                request.retry_times += 1
                                request.filter_repeat = False
                                log.info("""
                                        入库 等待重试
                                        url     %s
                                        重试次数 %s
                                        最大允许重试次数 %s""" % (
                                    request.url,
                                    request.retry_times,
                                    setting.SPIDER_MAX_RETRY_TIMES,
                                ))
                                self._memory_db.add(request)

                    else:
                        # 记录下载成功的文档
                        self.record_download_status(
                            PaserControl.DOWNLOAD_SUCCESS, parser.name)
                        # 记录成功任务数
                        self.__class__._success_task_count += 1

                        # 缓存下载成功的文档
                        if setting.RESPONSE_CACHED_ENABLE:
                            request.save_cached(
                                response=response,
                                expire_time=setting.
                                RESPONSE_CACHED_EXPIRE_TIME,
                            )

                    finally:
                        # 释放浏览器
                        if response and hasattr(response, "browser"):
                            request._webdriver_pool.put(response.browser)

                    break

        if setting.SPIDER_SLEEP_TIME:
            time.sleep(setting.SPIDER_SLEEP_TIME)
예제 #6
0
파일: item.py 프로젝트: tikazyq/feapder
 def __repr__(self):
     return "<{}: {}>".format(self.item_name,
                              tools.dumps_json(self.to_dict))
예제 #7
0
from feapder.utils import tools
from datetime import datetime

date = tools.format_time("昨天3:10")
print(date)

print(tools.format_date("2017年4月17日 3时27分12秒"))

date = tools.format_time("昨天")
print(date)

date = tools.format_time("2021-11-05 14:18:10")
print(date)

date = tools.format_time("1 年前")
print(date)


class C:
    pass


data = {"date": datetime.now(), "c": C()}
print(tools.dumps_json(data))
예제 #8
0
    def __add_item_to_db(self, items, update_items, requests, callbacks,
                         items_fingerprints):
        export_success = True
        self._is_adding_to_db = True

        # 去重
        if setting.ITEM_FILTER_ENABLE:
            items, items_fingerprints = self.__dedup_items(
                items, items_fingerprints)

        # 分捡
        items_dict = self.__pick_items(items)
        update_items_dict = self.__pick_items(update_items,
                                              is_update_item=True)

        # item批量入库
        failed_items = {"add": [], "update": [], "requests": []}
        while items_dict:
            table, datas = items_dict.popitem()

            log.debug("""
                -------------- item 批量入库 --------------
                表名: %s
                datas: %s
                    """ % (table, tools.dumps_json(datas, indent=16)))

            if not self.__export_to_db(table, datas):
                export_success = False
                failed_items["add"].append({"table": table, "datas": datas})

        # 执行批量update
        while update_items_dict:
            table, datas = update_items_dict.popitem()

            log.debug("""
                -------------- item 批量更新 --------------
                表名: %s
                datas: %s
                    """ % (table, tools.dumps_json(datas, indent=16)))

            update_keys = self._item_update_keys.get(table)
            if not self.__export_to_db(
                    table, datas, is_update=True, update_keys=update_keys):
                export_success = False
                failed_items["update"].append({"table": table, "datas": datas})

        if export_success:
            # 执行回调
            while callbacks:
                try:
                    callback = callbacks.pop(0)
                    callback()
                except Exception as e:
                    log.exception(e)

            # 删除做过的request
            if requests:
                self.redis_db.zrem(self._table_request, requests)

            # 去重入库
            if setting.ITEM_FILTER_ENABLE:
                if items_fingerprints:
                    self.__class__.dedup.add(items_fingerprints,
                                             skip_check=True)
        else:
            failed_items["requests"] = requests

            if self.export_retry_times > setting.EXPORT_DATA_MAX_RETRY_TIMES:
                if self._redis_key != "air_spider":
                    # 失败的item记录到redis
                    self.redis_db.sadd(self._table_failed_items, failed_items)

                    # 删除做过的request
                    if requests:
                        self.redis_db.zrem(self._table_request, requests)

                    log.error("入库超过最大重试次数,不再重试,数据记录到redis,items:\n {}".format(
                        tools.dumps_json(failed_items)))
                self.export_retry_times = 0

            else:
                tip = ["入库不成功"]
                if callbacks:
                    tip.append("不执行回调")
                if requests:
                    tip.append("不删除任务")
                    exists = self.redis_db.zexists(self._table_request,
                                                   requests)
                    for exist, request in zip(exists, requests):
                        if exist:
                            self.redis_db.zadd(self._table_request, requests,
                                               300)

                if setting.ITEM_FILTER_ENABLE:
                    tip.append("数据不入去重库")

                if self._redis_key != "air_spider":
                    tip.append("将自动重试")

                tip.append("失败items:\n {}".format(
                    tools.dumps_json(failed_items)))
                log.error(",".join(tip))

                self.export_falied_times += 1

                if self._redis_key != "air_spider":
                    self.export_retry_times += 1

            if self.export_falied_times > setting.EXPORT_DATA_MAX_FAILED_TIMES:
                # 报警
                msg = "《{}》爬虫导出数据失败,失败次数:{},请检查爬虫是否正常".format(
                    self._redis_key, self.export_falied_times)
                log.error(msg)
                tools.send_msg(
                    msg=msg,
                    level="error",
                    message_prefix="《%s》爬虫导出数据失败" % (self._redis_key),
                )

        self._is_adding_to_db = False