示例#1
0
class Collector(threading.Thread):
    """Background thread that pulls due requests out of Redis into a local buffer.

    Worker liveness is tracked with 0/1 scores in the spider-status zset:
    0 means idle (waiting for tasks), 1 means busy.
    """

    def __init__(self, redis_key, process_num=None):
        """
        @summary: set up the Redis-backed task collector
        ---------
        @param redis_key: key prefix used to build the Redis table names
        @param process_num: process number, used to build a unique node mark
        ---------
        @result:
        """

        super(Collector, self).__init__()
        self._db = RedisDB()

        self._thread_stop = False

        # Local buffer of deserialized requests waiting to be consumed.
        self._todo_requests = collections.deque()

        self._tab_requests = setting.TAB_REQUSETS.format(redis_key=redis_key)
        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(
            redis_key=redis_key)

        # Unique mark for this worker node: ip + "_<process_num>".
        suffix = "_%s" % process_num if process_num else "_0"
        self._spider_mark = LOCAL_HOST_IP + suffix

        self._interval = setting.COLLECTOR_SLEEP_TIME
        self._request_count = setting.COLLECTOR_TASK_COUNT
        self._is_collector_task = False

        # Start from a clean status table (drops every node's state entry).
        self._db.clear(self._tab_spider_status)

    def run(self):
        """Poll Redis for due tasks until stop() is called."""
        while not self._thread_stop:

            try:
                self.__input_data()
            except Exception as e:
                log.exception(e)

            # Reset the flag each cycle; __input_data sets it when tasks arrive.
            self._is_collector_task = False

            time.sleep(self._interval)

    def stop(self):
        """Ask the polling loop to exit after the current iteration."""
        self._thread_stop = True

    def __input_data(self):
        """Fetch a batch of due requests from Redis into the local buffer."""
        # Buffer already holds a full batch — nothing to do.
        if len(self._todo_requests) >= self._request_count:
            return

        # Report this node as idle (score 0) in the status table.
        self._db.zadd(self._tab_spider_status, self._spider_mark, 0)

        request_count = self._request_count  # default batch size
        # Share pending tasks evenly among the idle nodes (score 0..0).
        spider_wait_count = self._db.zget_count(self._tab_spider_status,
                                                priority_min=0,
                                                priority_max=0)
        if spider_wait_count:
            task_count = self._db.zget_count(self._tab_requests)
            # per-node share = total tasks / idle nodes + 1
            request_count = task_count // spider_wait_count + 1

        # Never exceed the configured batch size.
        request_count = min(request_count, self._request_count)

        if not request_count:
            return

        # Only take tasks that are already due, and bump their score so other
        # nodes skip them until the timeout passes.
        current_timestamp = tools.get_current_timestamp()
        requests_list = self._db.zrangebyscore_set_score(
            self._tab_requests,
            priority_min="-inf",
            priority_max=current_timestamp,
            score=current_timestamp + setting.REQUEST_TIME_OUT,
            count=request_count,
        )

        if not requests_list:
            return

        self._is_collector_task = True

        # Report this node as busy (score 1).
        self._db.zadd(self._tab_spider_status, self._spider_mark, 1)

        # Buffer the fetched requests locally.
        self.__put_requests(requests_list)

    def __put_requests(self, requests_list):
        """Deserialize raw request strings and append the valid ones to the buffer."""
        for request in requests_list:
            try:
                # NOTE(review): eval() on data read from Redis — assumes only
                # trusted producers write to this queue; verify upstream.
                request_dict = {
                    "request_obj": Request.from_dict(eval(request)),
                    "request_redis": request,
                }
            except Exception as e:
                log.exception("""
                error %s
                request %s
                """ % (e, request))
                continue

            self._todo_requests.append(request_dict)

    def get_requests(self, count):
        """Pop up to ``count`` buffered requests (may return fewer)."""
        n = min(count, len(self._todo_requests))
        return [self._todo_requests.popleft() for _ in range(n)]

    def get_requests_count(self):
        """Local buffer size; falls back to the Redis queue size when empty."""
        local_count = len(self._todo_requests)
        if local_count:
            return local_count
        return self._db.zget_count(self._tab_requests)

    def is_collector_task(self):
        """Whether the last poll actually pulled tasks from Redis."""
        return self._is_collector_task
示例#2
0
class Collector(threading.Thread):
    """Background thread that pulls due requests out of Redis into a local buffer.

    Node liveness is tracked through timestamped heartbeats in the
    spider-status zset, so pending tasks can be split fairly across the
    nodes seen alive recently.
    """

    def __init__(self, redis_key):
        """
        @summary: set up the Redis-backed task collector
        ---------
        @param redis_key: key prefix used to build the Redis table names
        ---------
        @result:
        """

        super(Collector, self).__init__()
        self._db = RedisDB()

        self._thread_stop = False

        # Local buffer of deserialized requests waiting to be consumed.
        self._todo_requests = collections.deque()

        self._tab_requests = setting.TAB_REQUSETS.format(redis_key=redis_key)
        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(
            redis_key=redis_key)

        # Unique node mark: ip + "-" + startup timestamp.
        self._spider_mark = tools.get_localhost_ip() + f"-{time.time()}"

        self._interval = setting.COLLECTOR_SLEEP_TIME
        self._request_count = setting.COLLECTOR_TASK_COUNT
        self._is_collector_task = False

        # Drop stale node entries left behind by dead workers.
        self.__delete_dead_node()

    def run(self):
        """Heartbeat + poll loop; exits once stop() is called."""
        while not self._thread_stop:
            try:
                self.__report_node_heartbeat()
                self.__input_data()
            except Exception as e:
                log.exception(e)

            # Reset the flag each cycle; __input_data sets it when tasks arrive.
            self._is_collector_task = False

            time.sleep(self._interval)

    def stop(self):
        """Ask the polling loop to exit after the current iteration."""
        self._thread_stop = True

    def __input_data(self):
        """Fetch a batch of due requests from Redis into the local buffer."""
        now = tools.get_current_timestamp()
        # Buffer already holds a full batch — nothing to do.
        if len(self._todo_requests) >= self._request_count:
            return

        request_count = self._request_count  # default batch size
        # Count the nodes whose heartbeat is recent enough to be alive.
        spider_wait_count = self._db.zget_count(
            self._tab_spider_status,
            priority_min=now - (self._interval + 10),
            priority_max=now,
        )
        # Share pending tasks evenly among the live nodes.
        if spider_wait_count:
            task_count = self._db.zget_count(self._tab_requests)
            # per-node share = total tasks / live nodes + 1
            request_count = task_count // spider_wait_count + 1

        # Never exceed the configured batch size.
        request_count = min(request_count, self._request_count)

        if not request_count:
            return

        # Only take tasks that are already due, and bump their score so other
        # nodes skip them until the timeout passes.
        requests_list = self._db.zrangebyscore_set_score(
            self._tab_requests,
            priority_min="-inf",
            priority_max=now,
            score=now + setting.REQUEST_TIME_OUT,
            count=request_count,
        )

        if not requests_list:
            return

        self._is_collector_task = True
        # Buffer the fetched requests locally.
        self.__put_requests(requests_list)

    def __report_node_heartbeat(self):
        """Record this node's heartbeat so tasks can be distributed fairly."""
        self._db.zadd(self._tab_spider_status, self._spider_mark,
                      tools.get_current_timestamp())

    def __delete_dead_node(self):
        """Remove node entries whose heartbeat is too old (node presumed dead)."""
        cutoff = tools.get_current_timestamp() - (self._interval + 10)
        self._db.zremrangebyscore(
            self._tab_spider_status,
            "-inf",
            cutoff,
        )

    def __put_requests(self, requests_list):
        """Deserialize raw request strings and append the valid ones to the buffer."""
        for request in requests_list:
            try:
                # NOTE(review): eval() on data read from Redis — assumes only
                # trusted producers write to this queue; verify upstream.
                request_dict = {
                    "request_obj": Request.from_dict(eval(request)),
                    "request_redis": request,
                }
            except Exception as e:
                log.exception("""
                error %s
                request %s
                """ % (e, request))
                continue

            self._todo_requests.append(request_dict)

    def get_requests(self, count):
        """Pop up to ``count`` buffered requests (may return fewer)."""
        n = min(count, len(self._todo_requests))
        return [self._todo_requests.popleft() for _ in range(n)]

    def get_requests_count(self):
        """Local buffer size; falls back to the Redis queue size when empty."""
        local_count = len(self._todo_requests)
        if local_count:
            return local_count
        return self._db.zget_count(self._tab_requests)

    def is_collector_task(self):
        """Whether the last poll actually pulled tasks from Redis."""
        return self._is_collector_task
示例#3
0
class RequestBuffer(threading.Thread):
    """Buffers outgoing requests in memory and periodically flushes them to Redis.

    Also collects finished requests to delete from the Redis queue, and
    archives failed requests in a separate table.
    """

    # Shared dedup filter, created once per process (class-level singleton).
    dedup = None

    def __init__(self, redis_key):
        """
        @summary: set up the buffer; re-initialization of the same instance
                  is skipped (guarded by the _requests_deque attribute).
        ---------
        @param redis_key: key prefix used to build the Redis table names
        ---------
        @result:
        """
        if not hasattr(self, "_requests_deque"):
            super(RequestBuffer, self).__init__()

            self._thread_stop = False
            self._is_adding_to_db = False

            # Pending requests to push, and finished requests to delete.
            self._requests_deque = collections.deque()
            self._del_requests_deque = collections.deque()
            self._db = RedisDB()

            self._table_request = setting.TAB_REQUSETS.format(
                redis_key=redis_key)
            self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
                redis_key=redis_key)

            if not self.__class__.dedup and setting.REQUEST_FILTER_ENABLE:
                # Default expire time is one month.
                self.__class__.dedup = Dedup(
                    name=redis_key,
                    to_md5=False,
                    **setting.REQUEST_FILTER_SETTING)

    def run(self):
        """Flush the buffer to Redis roughly once per second until stopped."""
        self._thread_stop = False
        while not self._thread_stop:
            try:
                self.__add_request_to_db()
            except Exception as e:
                log.exception(e)

            tools.delay_time(1)

    def stop(self):
        """Stop the flush loop and clear the thread's started state."""
        self._thread_stop = True
        self._started.clear()

    def put_request(self, request):
        """Queue a request (or a callable) for the next flush.

        Flushes eagerly once the buffer exceeds MAX_URL_COUNT.
        """
        self._requests_deque.append(request)

        if self.get_requests_count() > MAX_URL_COUNT:  # over the cache limit
            self.flush()

    def put_del_request(self, request):
        """Queue a finished request for deletion from the Redis queue."""
        self._del_requests_deque.append(request)

    def put_failed_request(self, request, table=None):
        """Archive a failed request in ``table`` (default: the failed table)."""
        try:
            request_dict = request.to_dict
            self._db.zadd(table or self._table_failed_request, request_dict,
                          request.priority)
        except Exception as e:
            log.exception(e)

    def flush(self):
        """Flush buffered requests to Redis immediately, logging any error."""
        try:
            self.__add_request_to_db()
        except Exception as e:
            log.exception(e)

    def get_requests_count(self):
        """Number of requests currently buffered in memory."""
        return len(self._requests_deque)

    def is_adding_to_db(self):
        """Whether a flush to Redis is currently in progress."""
        return self._is_adding_to_db

    def __add_request_to_db(self):
        """Drain the buffer: push requests to Redis in batches, run queued
        callbacks, then delete finished requests from the Redis queue."""
        request_list = []
        prioritys = []
        callbacks = []
        # Every serialized request pushed to Redis during THIS pass; used below
        # to protect freshly added requests from the deletion step.
        flushed_requests = []

        while self._requests_deque:
            request = self._requests_deque.popleft()
            self._is_adding_to_db = True

            if callable(request):
                # A callback function, executed after the requests are stored.
                # NOTE: mind closures — write ``def test(xxx=xxx):`` to bind the
                # loop variable at definition time, otherwise every callback
                # sees the final value.
                callbacks.append(request)
                continue

            priority = request.priority

            # Skip requests already seen when dedup filtering is enabled.
            if (request.filter_repeat and setting.REQUEST_FILTER_ENABLE
                    and not self.__class__.dedup.add(request.fingerprint)):
                log.debug("request已存在  url = %s" % request.url)
                continue
            else:
                request_list.append(str(request.to_dict))
                prioritys.append(priority)

            if len(request_list) > MAX_URL_COUNT:
                self._db.zadd(self._table_request, request_list, prioritys)
                flushed_requests.extend(request_list)
                request_list = []
                prioritys = []

        # Push the final partial batch.
        if request_list:
            self._db.zadd(self._table_request, request_list, prioritys)
            flushed_requests.extend(request_list)

        # Run the queued callbacks.
        for callback in callbacks:
            try:
                callback()
            except Exception as e:
                log.exception(e)

        # Delete finished requests from the Redis queue.
        if self._del_requests_deque:
            request_done_list = []
            while self._del_requests_deque:
                request_done_list.append(self._del_requests_deque.popleft())

            # BUG FIX: subtract everything flushed during this pass, not just
            # the final partial batch — previously requests pushed in earlier
            # full batches could be deleted right after being added.
            request_done_list = list(
                set(request_done_list) - set(flushed_requests))

            if request_done_list:
                self._db.zrem(self._table_request, request_done_list)

        self._is_adding_to_db = False