Exemplo n.º 1
0
class Collector(threading.Thread):
    """Background thread that pulls pending requests from Redis into a local queue.

    Each pass reports this node's status, works out how many requests to
    fetch, fetches them and appends them to an in-memory deque that
    consumers drain through ``get_requests``.
    """

    def __init__(self, table_folder, process_num=None):
        """
        @summary: Set up the Redis handle, table names and local queue.
        ---------
        @param table_folder: value substituted into the request / spider-status
            table name templates taken from ``setting``
        @param process_num: process number; used as the suffix of this node's
            mark ("_0" when falsy)
        ---------
        @result:
        """

        super(Collector, self).__init__()
        self._db = RedisDB()

        self._thread_stop = False

        # Requests fetched from Redis and not yet handed to a consumer.
        self._todo_requests = collections.deque()

        self._tab_requests = setting.TAB_REQUSETS.format(
            table_folder=table_folder)
        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(
            table_folder=table_folder)

        # Identifier under which this node reports its status.
        self._spider_mark = LOCAL_HOST_IP + ("_%s" % process_num
                                             if process_num else "_0")

        self._interval = setting.COLLECTOR_SLEEP_TIME
        self._request_count = setting.COLLECTOR_TASK_COUNT
        self._is_collector_task = False

        # Start from a clean status table.
        self._db.clear(self._tab_spider_status)

    def run(self):
        """Collect requests every ``self._interval`` seconds until ``stop()`` is called."""
        while not self._thread_stop:

            try:
                self.__input_data()
            except Exception as e:
                # Keep the thread alive; a failed pass is retried next round.
                log.exception(e)

            # The flag is only meant to be True while a pass is fetching tasks.
            self._is_collector_task = False

            time.sleep(self._interval)

    def stop(self):
        """Ask the ``run`` loop to exit after its current pass."""
        self._thread_stop = True

    def __input_data(self):
        """Fetch one batch of requests from Redis into the local queue.

        Does nothing when the local queue already holds at least
        ``self._request_count`` requests.
        """
        if len(self._todo_requests) >= self._request_count:
            return

        # Report node status: 0 = not working.
        self._db.zadd(self._tab_spider_status, self._spider_mark, 0)

        request_count = self._request_count
        # Allocate requests dynamically based on the number of waiting nodes.
        spider_wait_count = self._db.zget_count(self._tab_spider_status,
                                                priority_min=0,
                                                priority_max=0)
        if spider_wait_count:
            task_count = self._db.zget_count(self._tab_requests)
            # allocated amount = number of tasks / number of idle nodes + 1
            request_count = task_count // spider_wait_count + 1

        # Never take more than the configured batch size.
        request_count = min(request_count, self._request_count)

        if not request_count:
            return

        # Fetch tasks: ordinary tasks plus tasks that have already timed out.
        current_timestamp = tools.get_current_timestamp()
        priority_max = current_timestamp - setting.REQUEST_TIME_OUT
        # Per the original notes: fetched tasks are put back into Redis scored
        # with the current timestamp to mark them as in progress. Finished
        # tasks are deleted by the request buffer; unfinished ones are redone
        # once the timeout has elapsed.
        requests_list = self._db.zrangebyscore_set_score(
            self._tab_requests,
            priority_min="-inf",
            priority_max=priority_max,
            score=current_timestamp,
            count=request_count,
        )

        if not requests_list:
            return

        self._is_collector_task = True

        # Report node status: 1 = working.
        self._db.zadd(self._tab_spider_status, self._spider_mark, 1)

        # Store the requests locally.
        self.__put_requests(requests_list)

    def __put_requests(self, requests_list):
        """Deserialize each raw request and append it to the local queue.

        Entries that fail to deserialize are logged and skipped.

        @param requests_list: iterable of serialized requests as read from Redis
        """
        for request in requests_list:
            try:
                # NOTE(security): eval() executes whatever expression is stored
                # in the request table. This is only safe while every writer
                # to that table is trusted; consider ast.literal_eval if the
                # payloads are guaranteed to be plain literals.
                request_dict = {
                    "request_obj": Request.from_dict(eval(request)),
                    "request_redis": request,
                }
            except Exception as e:
                log.exception("""
                error %s
                request %s
                """ % (e, request))
                continue

            self._todo_requests.append(request_dict)

    def get_requests(self, count):
        """Pop up to ``count`` requests from the front of the local queue.

        @param count: maximum number of requests wanted; zero or a negative
            value yields an empty list
        @result: list of the popped entries, possibly shorter than ``count``
        """
        requests = []
        remaining = count
        while remaining > 0:
            try:
                requests.append(self._todo_requests.popleft())
            except IndexError:
                # Queue is empty (possibly drained by another consumer).
                break
            remaining -= 1

        return requests

    def get_requests_count(self):
        """Return the local queue length, or the Redis request-table count when the queue is empty."""
        return len(self._todo_requests) or self._db.zget_count(
            self._tab_requests)

    def is_collector_task(self):
        """Return True while the current pass has fetched tasks and is storing them."""
        return self._is_collector_task
Exemplo n.º 2
0
class RequestBuffer(threading.Thread, Singleton):
    """Background thread that buffers requests and writes them to Redis in batches.

    Requests to add and requests to delete are queued in separate deques and
    processed roughly once per second by ``run``, or immediately by ``flush``.
    """

    # Shared de-duplication filter; created on first construction when
    # setting.REQUEST_FILTER_ENABLE is truthy.
    dedup = None

    def __init__(self, table_folder):
        """Initialise the buffers and table names.

        The ``hasattr`` guard skips initialisation when the instance already
        has its buffers, so repeated construction does not reset them.

        @param table_folder: value substituted into the request / failed-request
            table name templates taken from ``setting``; also used as the name
            of the dedup filter
        """
        if not hasattr(self, "_requests_deque"):
            super(RequestBuffer, self).__init__()

            self._thread_stop = False
            self._is_adding_to_db = False

            self._requests_deque = collections.deque()
            self._del_requests_deque = collections.deque()
            self._db = RedisDB()

            self._table_request = setting.TAB_REQUSETS.format(
                table_folder=table_folder)
            self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
                table_folder=table_folder)

            if not self.__class__.dedup and setting.REQUEST_FILTER_ENABLE:
                self.__class__.dedup = Dedup(
                    filter_type=Dedup.ExpireFilter,
                    name=table_folder,
                    expire_time=2592000,
                    to_md5=False,
                )  # expires after one month

    def run(self):
        """Flush the buffers to the database once per delay cycle until stopped."""
        while not self._thread_stop:
            try:
                self.__add_request_to_db()
            except Exception as e:
                log.exception(e)

            tools.delay_time(1)

    def stop(self):
        """Ask the ``run`` loop to exit after its current pass."""
        self._thread_stop = True

    def put_request(self, request):
        """Queue a request (or a callable) for the next flush.

        @param request: request object, or a callable that is invoked after
            the queued requests have been written
        """
        self._requests_deque.append(request)

        # Buffer exceeded its maximum size: flush right away.
        if self.get_requests_count() > MAX_URL_COUNT:
            self.flush()

    def put_del_request(self, request):
        """Queue a finished request for removal from the request table."""
        self._del_requests_deque.append(request)

    def put_failed_request(self, request, table=None):
        """Write a failed request straight to the failed-request table.

        Errors are logged, not raised.

        @param request: request object providing ``to_dict`` and ``priority``
        @param table: optional table name overriding the default failed table
        """
        try:
            request_dict = request.to_dict
            self._db.zadd(table or self._table_failed_request, request_dict,
                          request.priority)
        except Exception as e:
            log.exception(e)

    def flush(self):
        """Write everything currently buffered; errors are logged, not raised."""
        try:
            self.__add_request_to_db()
        except Exception as e:
            log.exception(e)

    def get_requests_count(self):
        """Return the number of entries waiting in the add buffer."""
        return len(self._requests_deque)

    def is_adding_to_db(self):
        """Return True while a flush pass is writing queued requests."""
        return self._is_adding_to_db

    def __add_request_to_db(self):
        """Drain both buffers: insert new requests, run callbacks, delete done ones.

        New requests are de-duplicated (when enabled) and written in batches.
        Requests written during this pass are excluded from the deletion step
        so a freshly added request is not removed again. The
        ``_is_adding_to_db`` flag is always cleared on exit, even when the
        database layer raises.
        """
        request_list = []
        prioritys = []
        callbacks = []
        # Every serialized request written during this pass, across all
        # batches. ``request_list`` is reset after each intermediate batch,
        # so it cannot be used for the exclusion below.
        added_requests = set()

        try:
            while self._requests_deque:
                request = self._requests_deque.popleft()
                self._is_adding_to_db = True

                if callable(request):
                    # A callback function.
                    # Note: mind closures. Bind loop values as defaults, e.g.
                    #     def test(xxx=xxx):
                    #         # business logic using xxx
                    # so xxx is not the last value once the loop has finished.
                    callbacks.append(request)
                    continue

                priority = request.priority

                # Skip when de-duplication applies and the request is already
                # known to the filter.
                if (request.filter_repeat and setting.REQUEST_FILTER_ENABLE
                        and not self.__class__.dedup.add(request.fingerprint)):
                    log.debug("request已存在  url = %s" % request.url)
                    continue

                request_list.append(str(request.to_dict))
                prioritys.append(priority)

                if len(request_list) > MAX_URL_COUNT:
                    self._db.zadd(self._table_request, request_list, prioritys)
                    added_requests.update(request_list)
                    request_list = []
                    prioritys = []

            # Write the remaining batch.
            if request_list:
                self._db.zadd(self._table_request, request_list, prioritys)
                added_requests.update(request_list)

            # Run callbacks; one failing callback must not block the others.
            for callback in callbacks:
                try:
                    callback()
                except Exception as e:
                    log.exception(e)

            # Delete finished tasks.
            if self._del_requests_deque:
                request_done_list = []
                while self._del_requests_deque:
                    request_done_list.append(
                        self._del_requests_deque.popleft())

                # Drop anything just added in this pass, otherwise a freshly
                # added request could be deleted.
                request_done_list = list(
                    set(request_done_list) - added_requests)

                if request_done_list:
                    self._db.zrem(self._table_request, request_done_list)
        finally:
            self._is_adding_to_db = False