Exemplo n.º 1
0
class SpiderWorker(Worker):
    def __init__(self, master='127.0.0.1:2181', type='spider'):
        """
        Spider-type worker.

        :param master: address of the master (coordination) node
        :param type: worker type label registered with the master
        :return:
        """
        Worker.__init__(self, master, type)
        cfg = self.config
        # Distributed task queues: one feeding this spider, one feeding
        # the downstream processors.
        self.spider_queue = FIFOQueue(self.redis, cfg.get("spider_queue"))
        self.processer_queue = FIFOQueue(self.redis, cfg.get("processor_queue"))
        # URL de-duplication filter backed by a Redis set.
        self.duplicate_filter = DuplicateFilter(self.redis, cfg.get("duplicate_set"))
        # Persistent storage backend (MongoDB).
        self.storage_pipline = MongodbStorage(self.mongodb, cfg.get("storage_db"))

    def run(self, job):
        """
        Execute one crawl step for the given job.

        Pops a single task from the distributed spider queue, fetches the
        page, parses it, enqueues newly discovered links, and persists the
        parsed result. A failed task is re-queued with its ``life`` counter
        decremented; tasks that reach life 0 are dropped.

        :param job: job description handed to the parser
        :return: None
        """
        # Set up the crawler and the parser for this job.
        crawler = PhantomCrawler()
        parser = NormalParser(job)

        if len(self.spider_queue) > 0:
            # NOTE(review): eval() on queue payloads executes arbitrary code
            # if the Redis queue is writable by untrusted parties -- consider
            # ast.literal_eval or JSON serialization instead.
            task = eval(self.spider_queue.pop())
            self.task = task

            # The task has failed too many times; drop it.
            if task['life'] == 0:
                return

            response = crawler.fetch(task['url'])
            # success, result = crawler.fetch(task['url'])

            # The crawler fetched the page successfully.
            if response['status_code'] == 200:
                try:
                    item = parser.parse(task['url'], response['content'])
                    # Random value attached for sharded writes.
                    item['ram'] = random.random()
                    new_urls = item['links']

                    # Deduplicate newly discovered links before queueing them.
                    for new_url in new_urls:
                        if not self.duplicate_filter.exists(new_url):
                            self.spider_queue.push({
                                "url": new_url,
                                "life": 5
                            })

                    # Persist the raw parse result, then hand its id to the
                    # processor queue for downstream processing.
                    item = self.storage_pipline.insert(self.config.get("page_table"), item)
                    self.processer_queue.push(item.get('_id'))

                    # Update task status.
                    self._update_status(True)
                    log("[SUCCESS] %s." % task['url'])
                except Exception, e:
                    # Re-queue the failed URL with one less life.
                    self.spider_queue.push({
                        "url": task['url'],
                        "life": task['life'] - 1
                    })
                    log("[FAILED] %s %s" % (task['url'], e))
            else:
                # Update task status.
                self._update_status(False)

                # Re-queue the failed URL with one less life.
                self.spider_queue.push({
                    "url": task['url'],
                    "life": task['life'] - 1
                })
                log("[FAILED] %s %s" % (task['url'], response['status_code']))

        else:
Exemplo n.º 2
0
class AsyncSpiderWorker(Worker):
    def __init__(self, master='127.0.0.1:2181', type='spider', concurrency=5, **kwargs):
        """
        Asynchronous (tornado-based) spider worker.

        :param master: address of the master (coordination) node
        :param type: worker type label registered with the master
        :param concurrency: number of concurrent fetch workers
        :param kwargs: extra options (currently unused)
        :return:
        """
        Worker.__init__(self, master, type)
        # Distributed task queues: one feeding this spider, one feeding
        # the downstream processors.
        self.spider_queue = FIFOQueue(self.redis, self.config.get("spider_queue"))
        self.processer_queue = FIFOQueue(self.redis, self.config.get("processor_queue"))
        # URL de-duplication filter backed by a Redis set.
        self.duplicate_filter = DuplicateFilter(self.redis, self.config.get("duplicate_set"))
        # Persistent storage backend (MongoDB).
        self.storage_pipline = MongodbStorage(self.mongodb, self.config.get("storage_db"))
        # Number of concurrent fetch workers.
        self.concurrency = concurrency
        # In-process task queue feeding the fetch workers.
        self._queue = queues.Queue()

    def fetch(self, url, **kwargs):
        """Start an async HTTP fetch; with raise_error=False an HTTPError
        response is returned rather than raised."""
        # Direct call; the original getattr(client, 'fetch') indirection
        # added nothing.
        return httpclient.AsyncHTTPClient().fetch(url, raise_error=False, **kwargs)

    def parse(self, url, html):
        """
        Parse an HTML page, enqueue newly discovered links and persist
        the parsed result.

        NOTE(review): reads ``self.job``, which is not set in __init__ --
        presumably assigned by the base Worker before run(); confirm.
        """
        parser = NormalParser(url, html, self.job)
        item = parser.parse()
        # Random value attached for sharded writes.
        item['ram'] = random.random()
        new_urls = item.get('links')

        # Deduplicate newly discovered links before queueing them.
        for new_url in new_urls:
            if not self.duplicate_filter.exists(new_url):
                self.spider_queue.push({
                    "url": new_url,
                    "life": 5
                })

        # Persist the raw parse result, then hand its id to the processor
        # queue for downstream processing.
        item = self.storage_pipline.insert(self.config.get("page_table"), item)
        self.processer_queue.push(item.get('_id'))
        self._update_status(True)
        log("[SUCCESS] %s." % url)

    def handle_response(self, task, response):
        """
        Handle the outcome of an async fetch.

        ``response`` is either a tornado HTTPResponse or, when get_page()
        failed, the raised Exception instance. Treat the latter as a failed
        fetch instead of crashing on ``response.code``.
        """
        if isinstance(response, Exception):
            # BUGFIX: get_page() returns the exception object on failure;
            # previously this crashed with AttributeError on response.code,
            # killing the worker coroutine. Re-queue with one less life.
            self._update_status(False)
            self.spider_queue.push({
                "url": task['url'],
                "life": task['life'] - 1
            })
            log("[FAILED] %s %s" % (task['url'], response))
            return

        if response.code == 200:
            self.parse(task['url'], response.body)
        else:
            self._update_status(False)
            # Re-queue the failed URL with one less life.
            self.spider_queue.push({
                "url": task['url'],
                "life": task['life'] - 1
            })
            log("[FAILED] %s %s" % (task['url'], response.code))

    @gen.coroutine
    def get_page(self, task):
        """
        Fetch the page for one task.

        :param task: task dict with at least a 'url' key
        :return: the HTTPResponse, or the Exception the fetch raised
        """
        try:
            response = yield self.fetch(task['url'])
        except Exception as e:
            print('Exception: %s %s' % (e, task['url']))
            raise gen.Return(e)
        raise gen.Return(response)

    @gen.coroutine
    def _run(self):
        """One scheduling round: drain the distributed queue concurrently."""
        @gen.coroutine
        def fetch_url():
            current_task = yield self._queue.get()
            try:
                response = yield self.get_page(current_task)
                self.handle_response(current_task, response)

                # Refill the local queue with up to $(concurrency) tasks
                # taken from the distributed queue.
                for i in range(self.concurrency):
                    if len(self.spider_queue) > 0:
                        # NOTE(review): eval() on queue payloads executes
                        # arbitrary code if the queue is writable by
                        # untrusted parties -- consider ast.literal_eval
                        # or JSON serialization instead.
                        task = eval(self.spider_queue.pop())
                        if task['life'] > 0:
                            yield self._queue.put(task)

            finally:
                self._queue.task_done()

        @gen.coroutine
        def worker():
            while True:
                yield fetch_url()

        if len(self.spider_queue) > 0:
            self._update_on_job(True)
            # Seed the local queue with the first task.
            self._queue.put(eval(self.spider_queue.pop()))

            # Start the workers; they run until the local queue is drained
            # (or the join times out).
            for _ in range(self.concurrency):
                worker()

            yield self._queue.join(timeout=timedelta(seconds=300000))
        else:
            # No work available: back off, and mark the job idle after
            # several empty rounds.
            self.wait_task_time += 1
            if self.wait_task_time > 5:
                self._update_on_job(False)
            log("[SPIDER] Wait for some jobs...")
            time.sleep(3)

    def run(self, job):
        """Entry point: drive one _run() round on the tornado IO loop.

        NOTE(review): ``job`` is accepted but unused here; parse() reads
        ``self.job`` instead -- confirm the base Worker assigns it.
        """
        io_loop = ioloop.IOLoop.current()
        io_loop.run_sync(self._run)