Example #1
from datetime import datetime
import logging

# The framework's own classes (Scheduler, Downloader, Pipeline, Request,
# SpiderMiddleware, DownloaderMiddleware) are assumed to be importable from
# its sibling modules; the exact paths depend on the project layout.
logger = logging.getLogger(__name__)


class Engine(object):
    def __init__(self, spider):
        self.spider_middleware = SpiderMiddleware()
        self.downloader_middleware = DownloaderMiddleware()
        self.spider = spider
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()
        self.total_request_num = 0
        self.total_response_num = 0

    def start(self):
        start_time = datetime.now()
        self._start_engine()
        end_time = datetime.now()
        logger.info('Total time: {}s'.format(
            (end_time - start_time).total_seconds()))

    def _request(self):
        # Pull the next request off the scheduler's queue.
        request = self.scheduler.get_request()
        if request is None:
            return
        request = self.downloader_middleware.process_request(request)
        response = self.downloader.get_response(request)
        response = self.downloader_middleware.process_response(response)
        response = self.spider_middleware.process_response(response)
        results = self.spider.parse(response)
        for result in results:
            if isinstance(result, Request):
                # New requests go through the spider middleware and back to
                # the scheduler; count them so the engine knows when every
                # enqueued request has been answered.
                result = self.spider_middleware.process_request(result)
                self.scheduler.add_request(result)
                self.total_request_num += 1
            else:
                self.pipeline.process_item(result)
        self.total_response_num += 1

    def _start_engine(self):
        requests = self.spider.start_requests()
        for request in requests:
            request = self.spider_middleware.process_request(request)
            self.scheduler.add_request(request)
            self.total_request_num += 1
        while True:
            self._request()
            # Every enqueued request has produced a response, so the
            # crawl is complete.
            if self.total_response_num >= self.total_request_num:
                return
Example #2
class Engine():
    def __init__(self):
        self.spider = Spider()
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()

        self.spider_mid = SpiderMiddleware()
        self.downloader_mid = DownloaderMiddleware()

    def start(self):
        # Entry point that is called to run the engine logic
        self._start_engine()

    def _start_engine(self):
        # The framework's run logic

        # 1. The spider issues the initial request
        start_request = self.spider.start_request()

        # Pre-process the request with the spider middleware
        start_request = self.spider_mid.process_request(start_request)

        # 2. Hand the initial request to the scheduler
        self.scheduler.add_request(start_request)

        # 3. Fetch a request from the scheduler
        request = self.scheduler.get_request()

        # Pre-process the request with the downloader middleware
        request = self.downloader_mid.process_request(request)

        # 4. Have the downloader fetch the response
        response = self.downloader.get_response(request)

        # Post-process the response with the downloader middleware
        response = self.downloader_mid.process_response(response)

        # Post-process the response with the spider middleware
        response = self.spider_mid.process_response(response)

        # 5. Parse the response with the spider's parse method to get a result
        result = self.spider.parse(response)

        # 6. Dispatch on the type of the result
        if isinstance(result, Request):

            # Pre-process the new request with the spider middleware
            result = self.spider_mid.process_request(result)

            # 6.1 If it is a Request, hand it back to the scheduler
            self.scheduler.add_request(result)

        else:

            # 6.2 Otherwise, hand it to the pipeline
            self.pipeline.process_item(result)
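Example #2 runs the flow for exactly one request, end to end, with no loop. All four examples lean on the same collaborators. A minimal sketch of two of them, under the assumption that the scheduler is backed by a plain FIFO queue (its q attribute is read directly in Example #3) and that Request only needs the attributes the engines actually touch:

from queue import Queue, Empty


class Request(object):
    """Bare request object; only the attributes the engines use are sketched."""
    def __init__(self, url, parse='parse', meta=None):
        self.url = url
        self.parse = parse  # name of the spider callback, used in Example #3
        self.meta = meta if meta is not None else {}


class Scheduler(object):
    """FIFO scheduler backed by a plain queue."""
    def __init__(self):
        self.q = Queue()

    def add_request(self, request):
        self.q.put(request)

    def get_request(self):
        # Return None instead of blocking on an empty queue, which is the
        # behaviour the engines' `if request is None` checks rely on.
        try:
            return self.q.get(block=False)
        except Empty:
            return None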
Example #3
import time
from datetime import datetime
import logging

logger = logging.getLogger(__name__)


class Engine():
    def __init__(self, spiders):
        self.spiders = spiders
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()

        self.spider_mid = SpiderMiddleware()
        self.downloader_mid = DownloaderMiddleware()

        self.total_response_nums = 0
        self.total_request_nums = 0

    def start(self):
        # Entry point that is called to run the engine logic

        start = datetime.now()  # start time
        logger.info("Start time: %s" % start)
        self._start_engine()
        stop = datetime.now()  # stop time
        logger.info("Stop time: %s" % stop)
        logger.info("Elapsed: %.2fs" % (stop - start).total_seconds())
        logger.info("Total requests: {}".format(self.total_request_nums))
        logger.info("Total responses: {}".format(self.total_response_nums))

    def _start_requests(self):
        """把爬虫中所有起始的url构造request并添加到请求队列中"""
        for spider_name, spider in self.spiders.items():
            for start_request in spider.start_requests():

                # 1. 爬虫模块发出初始请求
                # 利用爬虫中间件预处理请求对象
                start_request = self.spider_mid.process_request(start_request)
                # 给request对象增加spider_name的属性
                start_request.spider_name = spider_name
                # 2. 把初始请求添加给调度器
                self.scheduler.add_request(start_request)
                # 总请求数 + 1
                self.total_request_nums += 1

    def _execute_request_response_item(self):
        """队列中取出一个request,直到处理完毕"""
        # 3. 从调度器获取请求对象
        request = self.scheduler.get_request()
        # 判断队列是否取空
        if request is None:
            return  # 提前终止
        # 利用下载器中间件预处理请求对象
        request = self.downloader_mid.process_request(request)
        # 4. 利用下载器发起请求
        response = self.downloader.get_response(request)
        # 传递meta
        response.meta = request.meta
        # 利用下载器中间件预处理响应对象
        response = self.downloader_mid.process_response(response)
        # 利用爬虫中间件预处理响应对象
        response = self.spider_mid.process_response(response)
        # 根据爬虫名获取爬虫类对象
        spider = self.spiders[request.spider_name]
        # 5. 利用爬虫的解析响应的方法,处理响应,得到结果
        # request.parse指定的解析函数 = getattr(爬虫类对象, 指定的解析函数的字符串)
        parse_func = getattr(spider, request.parse)
        results = parse_func(response)
        for result in results:
            # 6. 判断结果对象
            if isinstance(result, Request):
                # 利用爬虫中间件预处理请求对象
                result = self.spider_mid.process_request(result)
                # 给request对象增加spider_name的属性
                result.spider_name = request.spider_name
                # 6.1 如果是请求对象,那么就再交给调度器
                self.scheduler.add_request(result)
                # 总请求数 + 1
                self.total_request_nums += 1
            else:
                # 6.2 否则,就交给管道处理
                self.pipeline.process_item(result)
        # 总响应数 +1
        self.total_response_nums += 1

    def _start_engine(self):
        # The framework's run logic

        self._start_requests()  # enqueue every initial request

        while True:
            time.sleep(0.1)
            self._execute_request_response_item()  # process one queued request

            # Exit condition: the scheduler's queue has been drained
            if self.scheduler.q.empty():
                break
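A usage sketch for Example #3's multi-spider engine: spiders are registered under a name, each request is tagged with its owner's name, and the callback is resolved with getattr(spider, request.parse). The spider classes below are hypothetical:

if __name__ == '__main__':
    spiders = {
        'baidu': BaiduSpider(),    # hypothetical spiders,
        'douban': DoubanSpider(),  # for illustration only
    }
    engine = Engine(spiders)
    engine.start()  # logs elapsed time plus total request/response counts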
Example #4
from datetime import datetime
import logging

logger = logging.getLogger(__name__)


class Engine():
    def __init__(self):
        self.spider = Spider()
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()

        self.spider_mid = SpiderMiddleware()
        self.downloader_mid = DownloaderMiddleware()

    def start(self):
        # Entry point that is called to run the engine logic

        start = datetime.now()  # start time
        logger.info("Start time: %s" % start)
        self._start_engine()
        stop = datetime.now()  # stop time
        logger.info("Stop time: %s" % stop)
        logger.info("Elapsed: %.2fs" % (stop - start).total_seconds())

    def _start_engine(self):
        # The framework's run logic

        # 1. The spider issues the initial request
        start_request = self.spider.start_request()

        # Pre-process the request with the spider middleware
        start_request = self.spider_mid.process_request(start_request)

        # 2. Hand the initial request to the scheduler
        self.scheduler.add_request(start_request)

        # 3. Fetch a request from the scheduler
        request = self.scheduler.get_request()

        # Pre-process the request with the downloader middleware
        request = self.downloader_mid.process_request(request)

        # 4. Have the downloader fetch the response
        response = self.downloader.get_response(request)

        # Post-process the response with the downloader middleware
        response = self.downloader_mid.process_response(response)

        # Post-process the response with the spider middleware
        response = self.spider_mid.process_response(response)

        # 5. Parse the response with the spider's parse method to get a result
        result = self.spider.parse(response)

        # 6. Dispatch on the type of the result
        if isinstance(result, Request):

            # Pre-process the new request with the spider middleware
            result = self.spider_mid.process_request(result)

            # 6.1 If it is a Request, hand it back to the scheduler
            self.scheduler.add_request(result)

        else:

            # 6.2 Otherwise, hand it to the pipeline
            self.pipeline.process_item(result)
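Example #4 is Example #2's single-pass flow with the timing instrumentation from Example #3's start() bolted on; the crawl logic itself is unchanged.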