Пример #1
0
    def __connect_remote_queue(self):
        """
        Connect to the remote ActiveMQ result queue.

        :return: True on success, False when any connection field is missing.
        :rtype: bool
        """
        info = self.queue_info

        # Refuse to connect if any required connection field is empty.
        for field in ("host", "port", "username", "password", "queue_name"):
            if not info[field]:
                logger.error("Connect information error! {} can't be empty!".format(field))
                return False

        destination = "/queue/{}".format(info["queue_name"])
        address = (info["host"], int(info["port"]))

        # Consumer id identifying this machine.
        cid = make_consumer_id()

        self.remote_result_queue = ActiveMQQueue(
            address, info["username"], info["password"], destination, cid)

        return True
Пример #2
0
    def __connect_remote_queue(self):
        """
        Connect to the remote ActiveMQ task queue.

        :return: True on success, False when any connection field is missing.
        :rtype: bool
        """
        info = self.queue_info

        # Every connection parameter must be present and non-empty;
        # report the first missing one and bail out.
        required = ("host", "port", "username", "password", "queue_name")
        missing = next((key for key in required if not info[key]), None)
        if missing is not None:
            logger.error(
                "Connect information error! {} can't be empty!".format(missing))
            return False

        destination = "/queue/{}".format(info["queue_name"])
        address = (info["host"], int(info["port"]))

        # Consumer id for this machine:
        # current IP + "|" + current timestamp (second precision)
        cid = "{ip}|{timestamp}".format(ip=IPAddress.current_ip(),
                                        timestamp=int(time.time()))

        self.remote_task_queue = ActiveMQQueue(address,
                                               info["username"],
                                               info["password"],
                                               destination, cid)

        return True
Пример #3
0
    def _worker(self):
        """
        Watch the spiders directory; on each pass, reload every eligible
        spider file (md5 is recomputed inside the cache loader).
        """
        logger.info("{} start!".format(self.name))

        while self.status == self.EngineStatus.STATUS_RUNNING:
            # Interruptible sleep between scan passes.
            self.ev.wait(settings.SPIDER_HOT_LOAD_IDLE)
            logger.debug("self._cache: {}".format(self._cache))

            for entry in os.listdir(self.spider_dir):
                # Skip dunder-prefixed files and anything that is not a .py file.
                if entry.startswith("__"):
                    continue
                if not entry.endswith(".py"):
                    continue
                # Load the file and store it into the cache.
                outcome = self._load_and_update_cache(entry)
                if not outcome["success"]:
                    logger.error(outcome["message"])

            # For testing: to debug the reload module, uncomment the lines below.
            # inst = self._cache["rss"][1].get_class()
            # inst = inst()
            # inst.run()

        logger.info("{} end!".format(self.name))
Пример #4
0
 def start(self):
     """
     Connect to the remote task queue first, then delegate to the parent
     engine's start().

     :return: True when the queue connection succeeded, False otherwise.
     """
     if self.__connect_remote_queue():
         super(SenderEngine, self).start()
         return True
     logger.error("Can't connect to remote task queue!")
     return False
Пример #5
0
    def _worker(self):
        """
        Poll the feeds table and push a refresh task for every feed whose
        refresh interval has elapsed.

        Runs until ``self._status`` leaves ``EngineStatus.RUNNING``.
        """
        logger.info("RefreshEngine start!")

        while self._status == EngineStatus.RUNNING:

            # Wait a little before each database scan.
            self._ev.wait(5)

            current_time = datetime.datetime.now()
            rows = RedstoneFeedsModel.objects.filter(is_deleted=0).all()
            for _feed in rows:
                # A feed is due when its last fetch time plus its interval
                # (in minutes) has passed.
                if _feed.fetch_time + datetime.timedelta(minutes=_feed.interval) <= current_time:

                    logger.debug("Detected out-date rss. (ID:%s, Name:%s)", _feed.id, _feed.name)

                    try:
                        # Look up the spider this feed is configured to use.
                        sp = RedstoneSpiderModel.objects.filter(is_deleted=0, pk=_feed.spider_type).first()
                        if not sp:
                            logger.error("Can't get (name: {}, id: {}) spider info!".format(_feed.name, _feed.id))
                            # TODO: consider marking the feed as disabled.
                            continue

                        # Package the refresh job in the agreed task format.
                        task = {
                            "feed_url": _feed.url,
                            "feed_id": _feed.id,
                            "feed_name": _feed.name,
                            "feed_config": {
                                "use_proxy": _feed.use_proxy
                            },
                            "spider_name": sp.name
                        }

                        task = json.dumps(task)
                        self.push_task(task)
                    finally:
                        # Always update fetch_time, even when pushing failed,
                        # so the same feed is not retried on every pass.
                        # NOTE(review): there is no except clause, so an
                        # exception from json.dumps()/push_task() still
                        # propagates (after the save) and ends the loop --
                        # confirm that is intended.
                        _feed.fetch_time = current_time
                        _feed.save()

        logger.info("RefreshEngine end!")
Пример #6
0
    def load_class_by_name(self, spider_name) -> Optional[SpiderBase]:
        """
        Resolve a spider class from its CamelCase name.

        SpiderName: ExampleSpider => pkg name: example_spider
        => filename: example_spider.py

        :return: the spider class, or None when it cannot be loaded.
        """
        logger.debug("Try to load spider: {}".format(spider_name))

        # Build the package name by prefixing every non-lowercase character
        # with a separator, then turning separators into underscores.
        pieces = []
        for ch in spider_name:
            if not ch.islower():
                pieces.append(" ")
            pieces.append(ch)
        pkg_name = "".join(pieces).strip().replace(" ", "_")
        filename = pkg_name + ".py"

        # Not cached yet -- try to load the module file first.
        if pkg_name not in self._cache:
            outcome = self._load_and_update_cache(filename)
            if not outcome["success"]:
                logger.error(outcome["message"])
                return None

        # Ask the cached spider module for its class via get_class().
        try:
            return self._cache[pkg_name][1].get_class()
        except AttributeError:
            logger.error("Spider doesn't have 'get_class()' method!")
            return None
Пример #7
0
    def run(self):
        """
        Fetch the RSS feed at ``self._url``, parse it, and push each entry
        as a result dict via ``self._push_result``.

        :return: True when the page was fetched and processed, False when
                 the page content could not be fetched.
        """
        logger.info("RSS Spider running, target: {}".format(self._url))

        # Fetch the raw content of the RSS page.
        resp_result = self._get_page_content()
        if not resp_result["success"]:
            error_message = "Can't fetch target URL's content, error msg: {}".format(
                resp_result["message"])
            logger.error(error_message)
            self.spider_result.success = False
            self.spider_result.error_message = error_message
            return False

        # Parse the RSS document.
        raw_rss_content = resp_result["page_content"]
        parsed_rss = feedparser.parse(raw_rss_content)

        # Extract per-item fields; each entry is packaged as a dict:
        """
        item = {
            "title": "",
            "link": "",
            "summary": ""
            "content": "" if not empty else title+url,
            "publish_time": "",
        }
        """
        items = parsed_rss["entries"]
        # rss_info = parsed_rss["feed"]

        for item in items:
            title = item["title"]
            link = item["link"]
            # The fallback string below is user-facing ("no summary yet").
            summary = item["summary"] if item["summary"] else "该文章暂无简介"
            content = item["content"]
            if not content:
                # Fall back to a minimal HTML body: title plus a link.
                content = "{title}<br><a href=\"{link}\">{title}</a>".format(
                    title=title, link=link)

            # Parse the published-time string: try the numeric-offset form
            # (%z) first, then the timezone-name form (%Z).
            raw_published_time = item["published"]
            fmt1 = "%a, %d %b %Y %H:%M:%S %z"
            fmt2 = "%a, %d %b %Y %H:%M:%S %Z"
            try:
                st = time.strptime(raw_published_time, fmt1)
            except ValueError:
                try:
                    st = time.strptime(raw_published_time, fmt2)
                except ValueError:
                    # Unparseable -- fall back to the current local time.
                    logger.warning(
                        "Can't convert rss time to struct_time: '{}', use current time instead."
                        .format(raw_published_time))
                    st = time.localtime()

            # Convert struct_time to a timestamp, shifting toward UTC+8.
            # NOTE(review): both branches add 8 * 3600; the tm_gmtoff branch
            # additionally adds abs(tm_gmtoff). That looks suspicious for
            # feeds that already carry a UTC offset -- confirm intended.
            published_time = time.mktime(st)
            published_time = \
                published_time + 8 * 3600 if not st.tm_gmtoff else published_time + 8 * 3600 + abs(st.tm_gmtoff)

            # Assemble the result payload and hand it off.
            result = {
                "title": title,
                "link": link,
                "summary": summary,
                "content": content,
                "publish_time": published_time
            }
            self._push_result(result)

        # NOTE(review): the failure path above sets spider_result.success,
        # but here .results is set to True instead -- possibly a typo for
        # .success; verify against the SpiderResult definition.
        self.spider_result.results = True
        logger.info("Rss spider done.")
        return True