Exemplo n.º 1
0
class ItemBuffer(threading.Thread, Singleton):
    """Thread that buffers queued data and exports it to storage in batches.

    Producers hand data over with :meth:`put_item`. :meth:`flush` sorts what
    is queued into four kinds: callables (run after the batch is exported),
    ``UpdateItem`` objects, ``Item`` objects, and anything else, which is
    treated as a finished request and removed from the request table.
    :meth:`run` calls :meth:`flush` in a loop until :meth:`stop` is called.

    NOTE(review): ``self.__class__.dedup`` is used whenever
    ``setting.ITEM_FILTER_ENABLE`` is truthy, but this class body does not
    define a ``dedup`` attribute; presumably it is assigned elsewhere --
    confirm.
    """

    def __init__(self, table_folder):
        """Create the queue, the storage handles and the table names.

        @param table_folder: value substituted for ``table_folder`` in the
            table-name templates read from ``setting``
        """
        # Do nothing when `_table_item` already exists, i.e. an earlier call
        # initialised this object (presumably Singleton hands back the same
        # instance on every construction -- confirm).
        if not hasattr(self, "_table_item"):
            super(ItemBuffer, self).__init__()

            self._thread_stop = False
            self._is_adding_to_db = False
            self._table_folder = table_folder

            self._items_queue = Queue(maxsize=MAX_ITEM_COUNT)
            self._db = RedisDB()

            self._table_item = setting.TAB_ITEM
            self._table_request = setting.TAB_REQUSETS.format(
                table_folder=table_folder)

            # Maps an item name to its redis item name,
            # e.g. 'xxx_item': {'tab_item': 'xxx:xxx_item'}
            self._item_tables = {}

            # Maps a redis item name to the keys that must be updated,
            # e.g. 'xxx:xxx_item': ['id', 'name', ...]
            self._item_update_keys = {}

            self._export_data = (ExportData()
                                 if setting.ADD_ITEM_TO_MYSQL else None)

            self.db_tip()

    def db_tip(self):
        """Log which export targets are enabled, or warn when none is."""
        msg = "\n"
        if setting.ADD_ITEM_TO_MYSQL:
            msg += "item 自动入mysql\n"
        if setting.ADD_ITEM_TO_REDIS:
            msg += "item 自动入redis\n"

        if msg == "\n":
            # Neither target is enabled.
            log.warning("*** 请注意检查item是否入库 !!!")
        else:
            log.info(msg)

    def run(self):
        """Flush repeatedly until :meth:`stop` is called, then close."""
        while not self._thread_stop:
            self.flush()
            tools.delay_time(0.5)

        self.close()

    def stop(self):
        """Ask the :meth:`run` loop to end after its current pass."""
        self._thread_stop = True

    def put_item(self, item):
        """Queue ``item`` (an item, update item, callable or request)."""
        self._items_queue.put(item)

    def flush(self):
        """Drain the queue and export its content in batches.

        Each batch is handed to the export step once it holds
        ``UPLOAD_BATCH_MAX_SIZE`` entries; the remainder is exported at the
        end. Any exception is logged and swallowed.
        """
        try:
            items = []
            update_items = []
            requests = []
            callbacks = []
            items_fingerprints = []
            data_count = 0

            while not self._items_queue.empty():
                data = self._items_queue.get_nowait()
                data_count += 1

                # Classify the queued data.
                if callable(data):
                    callbacks.append(data)

                elif isinstance(data, UpdateItem):
                    update_items.append(data)

                elif isinstance(data, Item):
                    items.append(data)
                    if setting.ITEM_FILTER_ENABLE:
                        items_fingerprints.append(data.fingerprint)

                else:  # a request stored in redis
                    requests.append(data)

                if data_count >= UPLOAD_BATCH_MAX_SIZE:
                    self.__add_item_to_db(items, update_items, requests,
                                          callbacks, items_fingerprints)

                    # Start a fresh batch.
                    items = []
                    update_items = []
                    requests = []
                    callbacks = []
                    items_fingerprints = []
                    data_count = 0

            if data_count:
                self.__add_item_to_db(items, update_items, requests, callbacks,
                                      items_fingerprints)

        except Exception as e:
            log.exception(e)

    def get_items_count(self):
        """Return the queue's current ``qsize()``."""
        return self._items_queue.qsize()

    def is_adding_to_db(self):
        """Return True while a batch is being exported."""
        return self._is_adding_to_db

    def __dedup_items(self, items, items_fingerprints):
        """
        Drop the items whose fingerprint the dedup filter reports as existing.

        The input lists are left untouched.

        @param items: items of the current batch
        @param items_fingerprints: fingerprints, parallel to ``items``
        @return: (items, items_fingerprints) with the duplicates removed
        """
        if not items:
            return items, items_fingerprints

        is_exists = self.__class__.dedup.get(items_fingerprints)
        is_exists = is_exists if isinstance(is_exists, list) else [is_exists]

        dedup_items = []
        dedup_items_fingerprints = []
        items_count = 0

        # Walk the three parallel sequences together instead of popping from
        # the front of each list, which costs O(n) per element.
        for item, fingerprint, is_exist in zip(items, items_fingerprints,
                                               is_exists):
            items_count += 1
            if not is_exist:
                dedup_items.append(item)
                dedup_items_fingerprints.append(fingerprint)

        dedup_items_count = len(dedup_items)
        dup_items_count = items_count - dedup_items_count

        log.info("待入库数据 {} 条, 重复 {} 条,实际待入库数据 {} 条".format(
            items_count, dup_items_count, dedup_items_count))

        return dedup_items, dedup_items_fingerprints

    def __pick_items(self, items, is_update_item=False):
        """
        Group the item data by table. ``items`` is empty afterwards.

        @param items: items to group; emptied once grouping is done
        @param is_update_item: when True, also remember each table's
            ``update_key`` in ``self._item_update_keys``
        @return: dict mapping the redis item name to a list of item data,
            e.g. {'xxx:xxx_item': [{}, {}]}
        """
        datas_dict = {}

        for item in items:
            # Look the table name up in the cache first; build and cache it
            # on a miss so later lookups are faster.
            item_name = item.item_name
            item_table = self._item_tables.get(item_name)
            if not item_table:
                tab_item = self._table_item.format(
                    table_folder=self._table_folder,
                    item_name=item.name_underline)
                self._item_tables[item_name] = {"tab_item": tab_item}
            else:
                tab_item = item_table.get("tab_item")

            # Callback invoked before the item is stored.
            item.per_to_db()

            datas_dict.setdefault(tab_item, []).append(item.to_dict)

            if is_update_item and tab_item not in self._item_update_keys:
                self._item_update_keys[tab_item] = item.update_key

        # Keep the documented contract: the caller's list ends up empty.
        del items[:]

        return datas_dict

    def __export_to_mysql(self, tab_item, datas, is_update, update_keys):
        """Insert or update ``datas`` via ``self._export_data``; return its result."""
        if is_update:
            return self._export_data.update_items(
                tab_item, datas, update_keys=update_keys)
        return self._export_data.export_items(tab_item, datas)

    def __export_to_db(self, tab_item, datas, is_update=False, update_keys=()):
        """
        Export one table's data to every enabled target.

        ``setting.ADD_ITEM_TO_MYSQL`` / ``setting.ADD_ITEM_TO_REDIS`` may be
        a plain truthy value (export every table) or a list/tuple of names;
        in the latter case a table is exported when any name is contained in
        the table name extracted from ``tab_item``. A table is exported at
        most once per target even when several names match.

        @param tab_item: redis item name of the table
        @param datas: list of item data
        @param is_update: update existing rows instead of inserting (MySQL)
        @param update_keys: keys to update when ``is_update`` is True
        @return: result of the last export performed; False when none ran
        """
        export_success = False
        # Statistics / validation hook.
        to_table = tools.get_info(tab_item, ":s_(.*?)_item", fetch_one=True)
        self.check_datas(table=to_table, datas=datas)

        mysql_targets = setting.ADD_ITEM_TO_MYSQL
        if mysql_targets:
            if (not isinstance(mysql_targets, (list, tuple))
                    or any(name in to_table for name in mysql_targets)):
                export_success = self.__export_to_mysql(
                    tab_item, datas, is_update, update_keys)

        redis_targets = setting.ADD_ITEM_TO_REDIS
        if redis_targets:
            if (not isinstance(redis_targets, (list, tuple))
                    or any(name in to_table for name in redis_targets)):
                self._db.sadd(tab_item, datas)
                export_success = True
                log.info("共导出 %s 条数据 到redis %s" % (len(datas), tab_item))

        return export_success

    def __add_item_to_db(self, items, update_items, requests, callbacks,
                         items_fingerprints):
        """
        Export one batch: dedup, group by table, export, run callbacks,
        remove finished requests, then record the fingerprints.

        ``_is_adding_to_db`` is True for the duration of the call and is
        reset even when a step raises.

        @param items: ``Item`` objects to insert
        @param update_items: ``UpdateItem`` objects to update
        @param requests: finished requests to remove from the request table
        @param callbacks: callables to run after the export
        @param items_fingerprints: fingerprints, parallel to ``items``
        """
        export_success = False
        self._is_adding_to_db = True

        try:
            # Deduplicate.
            if setting.ITEM_FILTER_ENABLE:
                items, items_fingerprints = self.__dedup_items(
                    items, items_fingerprints)

            # Group by table.
            items_dict = self.__pick_items(items)
            update_items_dict = self.__pick_items(update_items,
                                                  is_update_item=True)

            # Batch insert.
            while items_dict:
                tab_item, datas = items_dict.popitem()

                log.debug("""
                -------------- item 批量入库 --------------
                表名: %s
                datas: %s
                    """ % (tab_item, tools.dumps_json(datas, indent=16)))

                export_success = self.__export_to_db(tab_item, datas)

            # Batch update.
            while update_items_dict:
                tab_item, datas = update_items_dict.popitem()
                log.debug("""
                -------------- item 批量更新 --------------
                表名: %s
                datas: %s
                    """ % (tab_item, tools.dumps_json(datas, indent=16)))

                update_keys = self._item_update_keys.get(tab_item)
                export_success = self.__export_to_db(tab_item,
                                                     datas,
                                                     is_update=True,
                                                     update_keys=update_keys)

            # Run the callbacks; a failing callback is logged and does not
            # stop the remaining ones.
            for callback in callbacks:
                try:
                    callback()
                except Exception as e:
                    log.exception(e)

            # Remove the requests that have been processed.
            if requests:
                self._db.zrem(self._table_request, requests)

            # Record the fingerprints. `export_success` reflects the last
            # export performed in this batch.
            if export_success and setting.ITEM_FILTER_ENABLE:
                if items_fingerprints:
                    self.__class__.dedup.add(items_fingerprints,
                                             skip_check=True)
        finally:
            self._is_adding_to_db = False

    def check_datas(self, table, datas):
        """
        Hook for recording statistics about the exported data (total count
        and per-key details). Does nothing here.

        @param table: table name
        @param datas: list of item data
        @return:
        """
        pass

    def close(self):
        """Hook called when :meth:`run` ends. Does nothing here."""
        pass
Exemplo n.º 2
0
class RequestBuffer(threading.Thread, Singleton):
    """Thread that buffers requests and writes them to the request table.

    Requests (or callables) are queued with :meth:`put_request`, finished
    requests with :meth:`put_del_request`. :meth:`run` flushes both queues
    in a loop until :meth:`stop` is called; :meth:`put_request` also flushes
    directly once more than ``MAX_URL_COUNT`` entries are waiting.
    """

    # Request dedup filter shared by the class; created in __init__ when
    # setting.REQUEST_FILTER_ENABLE is truthy.
    dedup = None

    def __init__(self, table_folder):
        """Create the queues, the storage handle and the table names.

        @param table_folder: value substituted for ``table_folder`` in the
            table-name templates read from ``setting``; also used as the
            dedup filter name
        """
        # Do nothing when `_requests_deque` already exists, i.e. an earlier
        # call initialised this object (presumably Singleton hands back the
        # same instance on every construction -- confirm).
        if not hasattr(self, "_requests_deque"):
            super(RequestBuffer, self).__init__()

            self._thread_stop = False
            self._is_adding_to_db = False

            self._requests_deque = collections.deque()
            self._del_requests_deque = collections.deque()
            self._db = RedisDB()

            self._table_request = setting.TAB_REQUSETS.format(
                table_folder=table_folder)
            self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
                table_folder=table_folder)

            if not self.__class__.dedup and setting.REQUEST_FILTER_ENABLE:
                self.__class__.dedup = Dedup(
                    filter_type=Dedup.ExpireFilter,
                    name=table_folder,
                    expire_time=2592000,
                    to_md5=False,
                )  # expires after one month

    def run(self):
        """Flush repeatedly until :meth:`stop` is called."""
        while not self._thread_stop:
            # flush() logs and swallows any exception.
            self.flush()
            tools.delay_time(1)

    def stop(self):
        """Ask the :meth:`run` loop to end after its current pass."""
        self._thread_stop = True

    def put_request(self, request):
        """Queue a request (or a callable) for the next flush."""
        self._requests_deque.append(request)

        # Flush right away once the buffer exceeds its maximum size.
        if self.get_requests_count() > MAX_URL_COUNT:
            self.flush()

    def put_del_request(self, request):
        """Queue a finished request for removal from the request table."""
        self._del_requests_deque.append(request)

    def put_failed_request(self, request, table=None):
        """Store a failed request immediately; errors are logged.

        @param request: request exposing ``to_dict`` and ``priority``
        @param table: target table; defaults to the failed-request table
        """
        try:
            request_dict = request.to_dict
            self._db.zadd(table or self._table_failed_request, request_dict,
                          request.priority)
        except Exception as e:
            log.exception(e)

    def flush(self):
        """Write the buffered requests out now; errors are logged."""
        try:
            self.__add_request_to_db()
        except Exception as e:
            log.exception(e)

    def get_requests_count(self):
        """Return the number of requests waiting to be written."""
        return len(self._requests_deque)

    def is_adding_to_db(self):
        """Return True while requests are being written."""
        return self._is_adding_to_db

    def __add_request_to_db(self):
        """Drain both queues: store new requests, run callbacks, delete done ones.

        ``_is_adding_to_db`` becomes True once the first entry is taken from
        the queue and is reset when the call ends, even if a step raises.
        """
        request_list = []
        prioritys = []
        callbacks = []
        # Every request written during this call, across all batches, so
        # none of them is removed again by the deletion step below.
        added_requests = set()

        try:
            while True:
                # put_request() may flush from another thread while run() is
                # draining too; popleft() + IndexError copes with the queue
                # being emptied between a check and the pop.
                try:
                    request = self._requests_deque.popleft()
                except IndexError:
                    break
                self._is_adding_to_db = True

                if callable(request):
                    # A function. Callers should mind closures: bind loop
                    # variables as default arguments, e.g.
                    #     def test(xxx=xxx): ...
                    # so the callback does not see the value from the last
                    # loop iteration.
                    callbacks.append(request)
                    continue

                priority = request.priority

                # Skip the request when dedup is wanted and it already exists.
                if (request.filter_repeat and setting.REQUEST_FILTER_ENABLE
                        and not self.__class__.dedup.add(request.fingerprint)):
                    log.debug("request已存在  url = %s" % request.url)
                    continue

                request_str = str(request.to_dict)
                request_list.append(request_str)
                prioritys.append(priority)
                added_requests.add(request_str)

                if len(request_list) > MAX_URL_COUNT:
                    self._db.zadd(self._table_request, request_list, prioritys)
                    request_list = []
                    prioritys = []

            # Store the remaining requests.
            if request_list:
                self._db.zadd(self._table_request, request_list, prioritys)

            # Run the callbacks; a failing callback is logged and does not
            # stop the remaining ones.
            for callback in callbacks:
                try:
                    callback()
                except Exception as e:
                    log.exception(e)

            # Delete the finished requests.
            request_done_list = []
            while True:
                try:
                    request_done_list.append(
                        self._del_requests_deque.popleft())
                except IndexError:
                    break

            if request_done_list:
                # Leave out the requests just written, otherwise a freshly
                # added request could be deleted.
                request_done_list = list(
                    set(request_done_list) - added_requests)

                if request_done_list:
                    self._db.zrem(self._table_request, request_done_list)
        finally:
            self._is_adding_to_db = False