Example #1
    def run(self):
        if not self.is_reach_next_spider_time():
            return

        self._start()

        while True:
            try:
                if self.all_thread_is_done():
                    if not self._is_notify_end:
                        self.spider_end()  # finished one round
                        self.record_spider_state(
                            spider_type=1,
                            state=1,
                            spider_end_time=tools.get_current_date(),
                            batch_interval=self._batch_interval,
                        )

                        self._is_notify_end = True

                    if not self._keep_alive:
                        self._stop_all_thread()
                        break

                else:
                    self._is_notify_end = False

                self.check_task_status()

            except Exception as e:
                log.exception(e)

            tools.delay_time(1)  # check spider state once per second
Example #2
    def __add_item_to_db(self, items, update_items, requests, callbacks,
                         items_fingerprints):
        export_success = False
        self._is_adding_to_db = True

        # deduplicate
        if setting.ITEM_FILTER_ENABLE:
            items, items_fingerprints = self.__dedup_items(
                items, items_fingerprints)

        # sort items by table
        items_dict = self.__pick_items(items)
        update_items_dict = self.__pick_items(update_items,
                                              is_update_item=True)

        # batch-insert items into the database
        while items_dict:
            tab_item, datas = items_dict.popitem()

            log.debug("""
                -------------- item 批量入库 --------------
                表名: %s
                datas: %s
                    """ % (tab_item, tools.dumps_json(datas, indent=16)))

            export_success = self.__export_to_db(tab_item, datas)

        # run batch updates
        while update_items_dict:
            tab_item, datas = update_items_dict.popitem()
            log.debug("""
                -------------- item 批量更新 --------------
                表名: %s
                datas: %s
                    """ % (tab_item, tools.dumps_json(datas, indent=16)))

            update_keys = self._item_update_keys.get(tab_item)
            export_success = self.__export_to_db(tab_item,
                                                 datas,
                                                 is_update=True,
                                                 update_keys=update_keys)

        # run callbacks
        while callbacks:
            try:
                callback = callbacks.pop(0)
                callback()
            except Exception as e:
                log.exception(e)

        # remove completed requests
        if requests:
            self._db.zrem(self._table_request, requests)

        # record fingerprints in the dedup store
        if export_success and setting.ITEM_FILTER_ENABLE:
            if items_fingerprints:
                self.__class__.dedup.add(items_fingerprints, skip_check=True)

        self._is_adding_to_db = False
Example #3
    def get_user(self, block=True) -> Optional[GuestUser]:
        """

        Args:
            block: whether to wait when no user is available

        Returns: a GuestUser, or None if none is available

        """
        while True:
            try:
                user_id = self._get_user_id()
                user_str = None
                if user_id:
                    user_str = self._redisdb.hget(self._tab_user_pool, user_id)
                    # if no user was fetched, another spider may have deleted it; refresh the locally cached user ids
                    if not user_str:
                        self._load_users_id()
                        continue

                if not user_id and block:
                    self._keep_alive = False
                    with RedisLock(key=self._tab_user_pool,
                                   lock_timeout=3600,
                                   wait_timeout=0) as _lock:
                        if _lock.locked:
                            self.run()
                    continue

                return user_str and GuestUser(**eval(user_str))
            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
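
A minimal usage sketch for the get_user method above. The GuestUserPool class name and its constructor argument are assumptions for illustration, not taken from the example itself:

    # Hypothetical pool instance; the class name and redis_key argument are assumed.
    pool = GuestUserPool(redis_key="demo_spider:user_pool")

    # block=True keeps retrying (and triggers run() under a redis lock) until a user exists.
    user = pool.get_user(block=True)
    if user:
        print(user)         # a GuestUser rebuilt from the cached redis hash
    else:
        print("no user")    # only reachable with block=False
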
Example #4
 def put_failed_request(self, request, table=None):
     try:
         request_dict = request.to_dict
         self._db.zadd(table or self._table_failed_request, request_dict,
                       request.priority)
     except Exception as e:
         log.exception(e)
Example #5
    def run(self):
        while True:
            try:
                now_user_count = self._redisdb.hget_count(self._tab_user_pool)
                need_user_count = self._min_users - now_user_count

                if need_user_count > 0:
                    log.info("当前在线user数为 {} 小于 {}, 生产user".format(
                        now_user_count, self._min_users))
                    try:
                        user = self.login()
                        if user:
                            self.add_user(user)
                    except Exception as e:
                        log.exception(e)
                else:
                    log.debug("当前user数为 {} 数量足够 暂不生产".format(now_user_count))

                    if self._keep_alive:
                        tools.delay_time(10)
                    else:
                        break

            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
Example #6
 def handel_exception(self, e):
     """
     Handle an exception
     @param e:
     @return:
     """
     log.exception(e)
Example #7
    def start_monitor_task(self, *args, **kws):
        if not self.is_reach_next_spider_time():
            return

        self._auto_start_requests = False
        redisdb = RedisDB()

        if not self._parsers:  # not in add_parser mode
            self._parsers.append(self)

        while True:
            try:
                # check whether there are pending tasks in redis
                tab_requests = setting.TAB_REQUSETS.format(
                    redis_key=self._redis_key
                )
                todo_task_count = redisdb.zget_count(tab_requests)

                if todo_task_count < self._min_task_count:  # add tasks
                    # make start requests
                    self.distribute_task(*args, **kws)

                else:
                    log.info("redis 中尚有%s条积压任务,暂时不派发新任务" % todo_task_count)

            except Exception as e:
                log.exception(e)

            if self._auto_stop_when_spider_done:
                break

            time.sleep(self._check_task_interval)
Example #8
    def __add_request_to_db(self):
        request_list = []
        prioritys = []
        callbacks = []

        while self._requests_deque:
            request = self._requests_deque.popleft()
            self._is_adding_to_db = True

            if callable(request):
                # a callable
                # note: beware of late-binding closures; a closure should be written as
                # def test(xxx=xxx):
                #     # TODO business logic that uses xxx
                # written this way, xxx will not end up bound to the loop's final value
                callbacks.append(request)
                continue

            priority = request.priority

            # if dedup is enabled and the request already exists in the store, skip it
            if (request.filter_repeat and setting.REQUEST_FILTER_ENABLE
                    and not self.__class__.dedup.add(request.fingerprint)):
                log.debug("request已存在  url = %s" % request.url)
                continue
            else:
                request_list.append(str(request.to_dict))
                prioritys.append(priority)

            if len(request_list) > MAX_URL_COUNT:
                self._db.zadd(self._table_request, request_list, prioritys)
                request_list = []
                prioritys = []

        # persist to the database
        if request_list:
            self._db.zadd(self._table_request, request_list, prioritys)

        # run callbacks
        for callback in callbacks:
            try:
                callback()
            except Exception as e:
                log.exception(e)

        # remove completed tasks
        if self._del_requests_deque:
            request_done_list = []
            while self._del_requests_deque:
                request_done_list.append(self._del_requests_deque.popleft())

            # exclude requests present in request_list, otherwise freshly added requests could be deleted
            request_done_list = list(
                set(request_done_list) - set(request_list))

            if request_done_list:
                self._db.zrem(self._table_request, request_done_list)

        self._is_adding_to_db = False
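
The comment about closures in the example above refers to Python's late-binding rule: a callback created inside a loop sees the loop variable's final value unless that value is captured, for instance as a default argument. A standalone sketch (names are illustrative, unrelated to the surrounding class):

    # Late binding: every callback sees the last value of url.
    callbacks = [lambda: print(url) for url in ("a", "b", "c")]
    for cb in callbacks:
        cb()                     # prints "c" three times

    # Capturing via a default argument freezes the current value instead.
    callbacks = [lambda url=url: print(url) for url in ("a", "b", "c")]
    for cb in callbacks:
        cb()                     # prints "a", "b", "c"
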
Example #9
    def run(self):
        while not self._thread_stop:
            try:
                self.__add_request_to_db()
            except Exception as e:
                log.exception(e)

            tools.delay_time(1)
Example #10
    def run(self):
        self._thread_stop = False
        while not self._thread_stop:
            try:
                self.__input_data()
            except Exception as e:
                log.exception(e)
                time.sleep(0.1)

            self._is_collector_task = False
Example #11
    def run(self):
        while not self._thread_stop:
            try:
                self.__report_node_heartbeat()
                self.__input_data()
            except Exception as e:
                log.exception(e)

            self._is_collector_task = False

            time.sleep(self._interval)
Example #12
    def run(self):
        while True:
            try:
                now_cookie_count = self._redisdb.lget_count(self._tab_cookie_pool)
                need_cookie_count = self._min_cookies - now_cookie_count

                if need_cookie_count > 0:
                    log.info(
                        "当前cookie数为 {} 小于 {}, 生产cookie".format(
                            now_cookie_count, self._min_cookies
                        )
                    )
                    try:
                        cookies = self.create_cookie()
                        if cookies:
                            self.add_cookies(cookies)
                    except Exception as e:
                        log.exception(e)
                else:
                    log.info("当前cookie数为 {} 数量足够 暂不生产".format(now_cookie_count))

                    # check whether the cookie pool size changed in the last minute; if not, assume the spider is no longer using it and exit
                    last_count_info = self._redisdb.strget(
                        self._tab_cookie_pool_last_count
                    )
                    if not last_count_info:
                        self._redisdb.strset(
                            self._tab_cookie_pool_last_count,
                            "{}:{}".format(time.time(), now_cookie_count),
                        )
                    else:
                        last_time, last_count = last_count_info.split(":")
                        last_time = float(last_time)
                        last_count = int(last_count)

                        if time.time() - last_time > 60:
                            if now_cookie_count == last_count:
                                log.info("近一分钟,cookie池数量无变化,判定爬虫未使用,退出生产")
                                break
                            else:
                                self._redisdb.strset(
                                    self._tab_cookie_pool_last_count,
                                    "{}:{}".format(time.time(), now_cookie_count),
                                )

                    if self._keep_alive:
                        log.info("sleep 10")
                        tools.delay_time(10)
                    else:
                        break

            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
Example #13
    def run(self):
        """
        @summary: override run; check whether the tasks in mysql are finished, and stop once they are
        ---------
        ---------
        @result:
        """
        try:
            self.create_batch_record_table()

            if not self._parsers:  # 不是add_parser 模式
                self._parsers.append(self)

            self._start()

            while True:
                try:
                    self.heartbeat()
                    if (
                            self.task_is_done() and self.all_thread_is_done()
                    ):  # all redis tasks are done and all mysql tasks are done (all_thread_is_done checks every thread, so the task state is not updated before the work is finished, which would end the program early)
                        if not self._is_notify_end:
                            self.spider_end()
                            self.record_spider_state(
                                spider_type=2,
                                state=1,
                                batch_date=self._batch_date_cache,
                                spider_end_time=tools.get_current_date(),
                                batch_interval=self._batch_interval,
                            )

                            self._is_notify_end = True

                        if not self._keep_alive:
                            self._stop_all_thread()
                            break
                    else:
                        self._is_notify_end = False

                    self.check_task_status()

                except Exception as e:
                    log.exception(e)

                tools.delay_time(10)  # check spider state every 10 seconds

        except Exception as e:
            msg = "《%s》主线程异常 爬虫结束 exception: %s" % (self._batch_name, e)
            log.error(msg)
            self.send_msg(msg,
                          level="error",
                          message_prefix="《%s》爬虫异常结束" % self._batch_name)

            os._exit(137)  # reported as wait status 35072 (137 << 8), so the spider manager can restart the crawler
Example #14
    def run(self):
        while True:
            try:
                try:
                    with RedisLock(
                        key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0
                    ) as _lock:
                        if _lock.locked:
                            for user in self._load_user():
                                retry_times = 0
                                while retry_times <= self._login_retry_times:
                                    try:
                                        user = self.login(user)
                                        if user:
                                            self.add_user(user)
                                        else:
                                            self.handle_login_failed_user(user)
                                        break
                                    except NotImplementedError:
                                        log.error(
                                            f"{self.__class__.__name__} must be implementation login method!"
                                        )
                                        os._exit(0)
                                    except Exception as e:
                                        self.handel_exception(e)
                                    log.debug(
                                        f"login failed, user: {user} retry_times: {retry_times}"
                                    )
                                    retry_times += 1
                                else:
                                    self.handle_login_failed_user(user)

                            now_user_count = self._redisdb.hget_count(
                                self._tab_user_pool
                            )
                            log.info("当前在线user数为 {}".format(now_user_count))

                except Exception as e:
                    log.exception(e)

                if self._keep_alive:
                    tools.delay_time(10)
                else:
                    break

            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
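
The retry loop above relies on Python's while/else clause: the else branch runs only when the loop exits without break, i.e. when every login attempt failed. A minimal sketch of that control flow (the attempt itself is a stand-in):

    # while/else in miniature: else runs only if the loop never breaks.
    retry_times, max_retries = 0, 2
    while retry_times <= max_retries:
        login_ok = False                  # stand-in for a login attempt
        if login_ok:
            break                         # success: skip the else branch
        retry_times += 1
    else:
        print("all retries failed")       # mirrors handle_login_failed_user(user)
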
Example #15
    def run(self):
        self._thread_stop = False
        while not self._thread_stop:
            try:
                request = self._collector.get_request()
                if not request:
                    if not self.is_show_tip:
                        log.debug("等待任务...")
                        self.is_show_tip = True
                    continue

                self.is_show_tip = False
                self.deal_request(request)

            except Exception as e:
                log.exception(e)
Example #16
    def flush(self):
        try:
            items = []
            update_items = []
            requests = []
            callbacks = []
            items_fingerprints = []
            data_count = 0

            while not self._items_queue.empty():
                data = self._items_queue.get_nowait()
                data_count += 1

                # classify data
                if callable(data):
                    callbacks.append(data)

                elif isinstance(data, UpdateItem):
                    update_items.append(data)

                elif isinstance(data, Item):
                    items.append(data)
                    if setting.ITEM_FILTER_ENABLE:
                        items_fingerprints.append(data.fingerprint)

                else:  # request-redis
                    requests.append(data)

                if data_count >= UPLOAD_BATCH_MAX_SIZE:
                    self.__add_item_to_db(
                        items, update_items, requests, callbacks, items_fingerprints
                    )

                    items = []
                    update_items = []
                    requests = []
                    callbacks = []
                    items_fingerprints = []
                    data_count = 0

            if data_count:
                self.__add_item_to_db(
                    items, update_items, requests, callbacks, items_fingerprints
                )

        except Exception as e:
            log.exception(e)
Example #17
    def get_cookie(self, wait_when_null=True) -> User:
        while True:
            try:
                user_cookie = self._redisdb.rpoplpush(self._tab_cookie_pool)
                if not user_cookie and wait_when_null:
                    log.info("暂无cookie 生产中...")
                    self.login()
                    continue

                if user_cookie:
                    user_cookie = eval(user_cookie)
                    return User(**user_cookie)

                return None
            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
Example #18
    def run(self):
        if not self._parsers:  # not in add_parser mode
            self._parsers.append(self)

        self._start()

        while True:
            try:
                if self.all_thread_is_done():
                    self._stop_all_thread()
                    break
            except Exception as e:
                log.exception(e)

            tools.delay_time(1)  # check spider state once per second

        self.delete_tables([self._redis_key + "*"])
Example #19
    def __put_requests(self, requests_list):
        for request in requests_list:
            try:
                request_dict = {
                    "request_obj": Request.from_dict(eval(request)),
                    "request_redis": request,
                }
            except Exception as e:
                log.exception("""
                error %s
                request %s
                """ % (e, request))

                request_dict = None

            if request_dict:
                self._todo_requests.put(request_dict)
Example #20
    def run(self):
        while not self._thread_stop:
            try:
                request = self._memory_db.get()
                if not request:
                    if not self.is_show_tip:
                        log.debug("等待任务...")
                        self.is_show_tip = True

                    time.sleep(1)
                    continue

                self.is_show_tip = False
                self.deal_request(request)

            except Exception as e:
                log.exception(e)
                time.sleep(3)
Example #21
    def run(self):
        while not self._thread_stop:
            try:
                requests = self._memory_db.get()
                if not requests:
                    if not self.is_show_tip:
                        log.debug("parser 等待任务 ...")
                        self.is_show_tip = True

                    time.sleep(1)
                    self._wait_task_time += 1
                    continue

                self.is_show_tip = False
                self.deal_requests([requests])

            except Exception as e:
                log.exception(e)
Example #22
 def get_cookie(self, wait_when_null=True):
     while True:
         try:
             cookie_info = self._redisdb.rpoplpush(self._tab_cookie_pool)
             if not cookie_info and wait_when_null:
                 log.info("暂无cookie 生产中...")
                 self._keep_alive = False
                 self._min_cookies = 1
                 with RedisLock(
                     key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=5
                 ) as _lock:
                     if _lock.locked:
                         self.run()
                 continue
             return eval(cookie_info) if cookie_info else {}
         except Exception as e:
             log.exception(e)
             tools.delay_time(1)
Example #23
    def run(self):
        while not self._thread_stop:
            try:
                requests = self._collector.get_requests(setting.SPIDER_TASK_COUNT)
                if not requests:
                    if not self.is_show_tip:
                        log.info("parser 等待任务 ...")
                        self.is_show_tip = True

                    # log.info('parser 等待任务 {}...'.format(tools.format_seconds(self._wait_task_time)))

                    time.sleep(1)
                    self._wait_task_time += 1
                    continue

                self.is_show_tip = False
                self.deal_requests(requests)

            except Exception as e:
                log.exception(e)
Example #24
    def save_items(self, table, items: List[Dict]) -> bool:
        """
        Save data
        Args:
            table: table name
            items: data, [{}, {}, ...]

        Returns: whether the save succeeded, True / False
                 if False, this batch is not added to the dedup store, so it can be saved again later

        """
        try:
            add_count = self.to_db.add_batch(coll_name=table, datas=items)
            datas_size = len(items)
            log.info("共导出 %s 条数据到 %s,  新增 %s条, 重复 %s 条" %
                     (datas_size, table, add_count, datas_size - add_count))
            return True
        except Exception as e:
            log.exception(e)
            return False
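
A hedged sketch of the contract documented above: a pipeline exposing the same save_items(table, items) signature returns True on success, and returning False keeps the batch out of the dedup store so it can be exported again. The class below is hypothetical and does not touch a real database:

    from typing import Dict, List

    class ConsolePipeline:
        # A stand-in pipeline used for illustration only.
        def save_items(self, table: str, items: List[Dict]) -> bool:
            print("would export %s rows to %s" % (len(items), table))
            return True  # False would leave the batch out of the dedup store for a retry
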
Example #25
    def run(self):
        self.start_callback()

        for i in range(self._thread_count):
            parser_control = AirSpiderParserControl(self._memory_db,
                                                    self._item_buffer)
            parser_control.add_parser(self)
            parser_control.start()
            self._parser_controls.append(parser_control)

        self._item_buffer.start()

        self.distribute_task()

        while True:
            try:
                if self.all_thread_is_done():
                    # stop the parser_controls
                    for parser_control in self._parser_controls:
                        parser_control.stop()

                    # close the item_buffer
                    self._item_buffer.stop()

                    # close the webdriver
                    if Request.webdriver_pool:
                        Request.webdriver_pool.close()

                    log.info("无任务,爬虫结束")
                    break

            except Exception as e:
                log.exception(e)

            tools.delay_time(1)  # check spider state once per second

        self.end_callback()
        # so the thread can be started again
        self._started.clear()
        # stop metrics reporting
        metrics.close()
Example #26
    def get_user(self, block=True) -> Optional[NormalUser]:
        while True:
            try:
                user_id = self._get_user_id()
                user_str = None
                if user_id:
                    user_str = self._redisdb.hget(self._tab_user_pool, user_id)
                    # if no user was fetched, another spider may have deleted it; refresh the locally cached user ids
                    if not user_str:
                        self._load_users_id()
                        continue

                if not user_id and block:
                    self._keep_alive = False
                    self.run()
                    continue

                return user_str and NormalUser(**eval(user_str))
            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
Example #27
    def reput_failed_requests_to_requests(self):
        log.debug("正在重置失败的requests...")
        total_count = 0
        while True:
            try:
                failed_requests = self.get_failed_requests()
                if not failed_requests:
                    break

                for request in failed_requests:
                    request["retry_times"] = 0
                    request_obj = Request.from_dict(request)
                    self._request_buffer.put_request(request_obj)

                    total_count += 1
            except Exception as e:
                log.exception(e)

        self._request_buffer.flush()

        log.debug("重置%s条失败requests为待抓取requests" % total_count)
Example #28
    def emit(self, point=None, force=False):
        """
        1. add the new point to pending
        2. if the conditions are met, try to aggregate and emit the points
        3. update the last emit time

        :param point:
        :param force: force-submit all pending points, default False
        :return:
        """
        if point:
            self.pending_points.put(point)

        # decide whether to submit points: 1. count threshold 2. time interval 3. forced emit
        if not (force
                or self.pending_points.qsize() >= self.max_points  # noqa: W503
                or time.time() - self.last_emit_ts >
                self.emit_interval  # noqa: W503
                ):
            return

        # time to emit: read the points that are ready; make sure only one thread compresses points
        with self.lock:
            points = self._get_ready_emit(force=force)

            if not points:
                return
            try:
                self.influxdb.write_points(
                    points,
                    batch_size=self.batch_size,
                    time_precision=self.time_precision,
                    retention_policy=self.retention_policy,
                )
            except Exception:
                log.exception("error writing points")

            self.last_emit_ts = time.time()
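
A sketch of how a caller might feed points to emit(). The point layout follows the influxdb client's write_points format used above (measurement/tags/fields/time); the emitter class name and constructor arguments are assumptions for illustration:

    import time

    # Hypothetical emitter; the class name and constructor arguments are assumed.
    emitter = MetricsEmitter(influxdb, batch_size=100, max_points=1000, emit_interval=10)

    point = {
        "measurement": "spider_requests",     # series name
        "tags": {"spider": "demo_spider"},    # indexed labels
        "fields": {"count": 1},               # measured values
        "time": int(time.time() * 1e9),       # nanosecond timestamp
    }
    emitter.emit(point)          # buffered; flushed once max_points or emit_interval is hit
    emitter.emit(force=True)     # flush whatever is pending, e.g. at shutdown
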
Example #29
    def login(self):
        """
        @return: 1 on success, 0 on failure
        """

        try:
            # pre-check
            if not self.is_time_to_login():
                log.info("此账号尚未到登陆时间: {}".format(self.username))
                time.sleep(5)
                return 0

            cookies = self.create_cookie()
            if not cookies:
                raise Exception("登陆失败 未获取到合法cookie")

            if not isinstance(cookies, dict):
                raise Exception("cookie 必须为字典格式")

            # save the cookie
            self.set_login_time()
            self.set_cookies(cookies)
            log.info("登录成功 {}".format(self.username))
            self.record_user_status(LimitTimesUserStatus.LOGIN_SUCCESS)
            return 1

        except Exception as e:
            log.exception(e)
            send_msg(
                msg=f"{self.SITE_NAME} {self.username} 账号登陆异常 exception: {str(e)}",
                level="error",
                message_prefix=f"{self.SITE_NAME} {self.username} 账号登陆异常",
            )

        log.info("登录失败 {}".format(self.username))
        self.record_user_status(LimitTimesUserStatus.LOGIN_FALIED)
        return 0
Example #30
    def update_items(self,
                     table,
                     items: List[Dict],
                     update_keys: Tuple = None) -> bool:
        """
        Update data
        Args:
            table: table name
            items: data, [{}, {}, ...]
            update_keys: fields to update, e.g. ("title", "publish_time")

        Returns: whether the update succeeded, True / False
                 if False, this batch is not added to the dedup store, so it can be saved again later

        """
        try:
            add_count = self.to_db.add_batch(
                coll_name=table,
                datas=items,
                update_columns=update_keys or list(items[0].keys()),
            )
            datas_size = len(items)
            update_count = datas_size - add_count
            msg = "共导出 %s 条数据到 %s,  新增 %s 条, 更新 %s 条" % (
                datas_size,
                table,
                add_count,
                update_count,
            )
            if update_keys:
                msg += " 更新字段为 {}".format(update_keys)
            log.info(msg)

            return True
        except Exception as e:
            log.exception(e)
            return False
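
A usage sketch for the method above. The pipeline instance is hypothetical; note that when update_keys is omitted, the code falls back to every key of the first item (update_keys or list(items[0].keys())):

    # Hypothetical pipeline instance; the class name is assumed.
    pipeline = MysqlPipeline()

    rows = [
        {"id": 1, "title": "new title", "publish_time": "2024-01-01"},
        {"id": 2, "title": "other",     "publish_time": "2024-01-02"},
    ]

    # Only the listed columns are rewritten for existing rows.
    pipeline.update_items("news", rows, update_keys=("title", "publish_time"))

    # Without update_keys, all keys of the first row are used as update columns.
    pipeline.update_items("news", rows)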