Example #1
    def __init__(self):
        self.spider = Spider()
        self.scheduler = Scheduler()
        self.pipeline = Pipeline()
        self.downloader = Downloader()
        self.spider_mid = SpiderMiddleware()  # initialize the spider middleware object
        self.downloader_mid = DownloaderMiddleware()  # initialize the downloader middleware object
Example #2
class Engine(object):
    '''
    a. Provides the single entry point for the whole program
    b. Calls the interfaces exposed by the other components in turn, driving the whole framework
    '''
    def __init__(self):
        """Instantiate all of the other components"""
        self.spider = Spider()  # receive the spider object
        self.scheduler = Scheduler()  # initialize the scheduler object
        self.downloader = Downloader()  # initialize the downloader object
        self.pipeline = Pipeline()  # initialize the pipeline object

        self.spider_mid = SpiderMiddleware()  # initialize the spider middleware object
        self.downloader_mid = DownloaderMiddleware()  # initialize the downloader middleware object

    def start(self):
        '''Entry point of the program; starts the whole engine'''

        # exercise the logging setup
        start_time = datetime.now()
        logger.info('Spider started: {}'.format(start_time))

        self._start_engine()
        end_time = datetime.now()
        logger.info('Spider finished: {}'.format(end_time))
        logger.info('Spider ran for {} seconds'.format(
            (end_time - start_time).total_seconds()))

    def _start_engine(self):
        '''Call the interfaces exposed by the other components in turn, driving the whole framework'''
        # 1. The spider module issues the initial request: call the spider's method
        start_request = self.spider.start_requests()

        # 2. Hand the initial request to the scheduler (queue)

        # pre-process the request object with the spider middleware
        start_request = self.spider_mid.process_request(start_request)

        self.scheduler.add_request(start_request)
        # 3. Fetch a request object from the scheduler, to be handed to the downloader for a response
        request = self.scheduler.get_request()

        # pre-process the request object with the downloader middleware
        request = self.downloader_mid.process_request(request)

        # 4. Send the request with the downloader
        response = self.downloader.get_response(request)

        # 5. Process the response with the spider's parse method to get a result
        result = self.spider.parse(response)
        # 6. Inspect the result object
        # 6.1 If it is a request object, hand it back to the scheduler
        if isinstance(result, Request):
            # pre-process the request object with the spider middleware
            result = self.spider_mid.process_request(result)
            self.scheduler.add_request(result)
        # 6.2 Otherwise, hand it to the pipeline
        else:
            self.pipeline.process_item(result)
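
All of these engines lean on components the snippets never show. As a minimal, self-contained sketch of the contracts Example #2 relies on (the class and method names follow the snippets, but the bodies are illustrative stubs, not the original framework code), the following is enough to make the engine run end to end:

import logging
from datetime import datetime  # Engine.start() uses this
from queue import Queue

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # the module-level logger the engine expects

class Request(object):
    def __init__(self, url):
        self.url = url

class Response(object):
    def __init__(self, url, body=b''):
        self.url = url
        self.body = body

class Spider(object):
    start_url = 'http://example.com'  # placeholder URL

    def start_requests(self):
        return Request(self.start_url)

    def parse(self, response):
        return {'url': response.url}  # any non-Request object counts as an item

class Scheduler(object):
    def __init__(self):
        self.queue = Queue()

    def add_request(self, request):
        self.queue.put(request)

    def get_request(self):
        return self.queue.get()  # blocks if the queue is empty

class Downloader(object):
    def get_response(self, request):
        # a real downloader would perform an HTTP GET here
        return Response(request.url)

class Pipeline(object):
    def process_item(self, item):
        print('item:', item)

class SpiderMiddleware(object):
    def process_request(self, request):
        return request  # pass-through

    def process_response(self, response):
        return response  # pass-through

class DownloaderMiddleware(object):
    def process_request(self, request):
        return request  # pass-through

    def process_response(self, response):
        return response  # pass-through

With these stubs in place, Engine().start() runs the single request/response pass above end to end.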
Example #3
    def __init__(self):
        """Instantiate all of the other components"""
        self.spider = Spider()  # receive the spider object
        self.scheduler = Scheduler()  # initialize the scheduler object
        self.downloader = Downloader()  # initialize the downloader object
        self.pipeline = Pipeline()  # initialize the pipeline object

        self.spider_mid = SpiderMiddleware()  # initialize the spider middleware object
        self.downloader_mid = DownloaderMiddleware()  # initialize the downloader middleware object
Example #4
class Engine():
    def __init__(self):
        self.spider = Spider()
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()

        self.spider_mid = SpiderMiddleware()
        self.downloader_mid = DownloaderMiddleware()

    def start(self):
        # entry point called to run the engine logic
        self._start_engine()

    def _start_engine(self):
        # the framework's driving logic

        # 1. The spider module issues the initial request
        start_request = self.spider.start_request()

        # pre-process the request object with the spider middleware
        start_request = self.spider_mid.process_request(start_request)

        # 2. Hand the initial request to the scheduler
        self.scheduler.add_request(start_request)

        # 3. Fetch a request object from the scheduler
        request = self.scheduler.get_request()

        # pre-process the request object with the downloader middleware
        request = self.downloader_mid.process_request(request)

        # 4. Send the request with the downloader
        response = self.downloader.get_response(request)

        # pre-process the response object with the downloader middleware
        response = self.downloader_mid.process_response(response)

        # pre-process the response object with the spider middleware
        response = self.spider_mid.process_response(response)

        # 5. Process the response with the spider's parse method to get a result
        result = self.spider.parse(response)

        # 6. Inspect the result object
        if isinstance(result, Request):

            # pre-process the request object with the spider middleware
            result = self.spider_mid.process_request(result)

            # 6.1 If it is a request object, hand it back to the scheduler
            self.scheduler.add_request(result)

        else:

            # 6.2 Otherwise, hand it to the pipeline
            self.pipeline.process_item(result)
Example #5
    def __init__(self, spider):
        self.spider = spider
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()

        # instantiate the middlewares
        self.spider_mid = SpiderMiddleware()
        self.downloader_mid = DownloaderMiddleware()
Example #6
    def __init__(self, spider):
        # self.url = url
        self.spider_middleware = SpiderMiddleware()
        self.downloader_middleware = DownloaderMiddleware()
        self.spider = spider
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()
        self.total_request_num = 0
        self.total_response_num = 0
Example #7
    def __init__(self, spiders):
        self.spiders = spiders
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()

        self.spider_mid = SpiderMiddleware()
        self.downloader_mid = DownloaderMiddleware()

        self.total_response_nums = 0
        self.total_request_nums = 0
Example #8
class Engine(object):
    def __init__(self, spider):
        # self.url = url
        self.spider_middleware = SpiderMiddleware()
        self.downloader_middleware = DownloaderMiddleware()
        self.spider = spider
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()
        self.total_request_num = 0
        self.total_response_num = 0

    def start(self):
        start_time = datetime.now()
        self._start_engine()
        end_time = datetime.now()
        logger.info('Total time: {}'.format((end_time - start_time).total_seconds()))

    def _request(self):
        request = self.scheduler.get_request()
        if request is None:
            return
        request = self.downloader_middleware.process_request(request)
        response = self.downloader.get_response(request)
        response = self.downloader_middleware.process_response(response)
        response = self.spider_middleware.process_response(response)
        results = self.spider.parse(response)
        for result in results:

            if isinstance(result, Request):
                # pre-process the new request with the spider middleware before scheduling it
                result = self.spider_middleware.process_request(result)
                self.scheduler.add_request(result)
                self.total_request_num += 1

            else:
                self.pipeline.process_item(result)
        self.total_response_num += 1

    def _start_engine(self):
        requests = self.spider.start_requests()
        for request in requests:
            request = self.spider_middleware.process_request(request)
            self.scheduler.add_request(request)
            self.total_request_num += 1
        while True:
            self._request()
            # stop once every scheduled request has produced a response
            if self.total_response_num >= self.total_request_num:
                return
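
Unlike the single-result engines above, this variant iterates over both start_requests() and parse(), so its spider is expected to return iterables, most naturally generators. A minimal sketch of a compatible spider (the class name and URLs are placeholders, not from the original project):

class ExampleSpider(object):  # hypothetical spider
    start_urls = ['http://example.com/1', 'http://example.com/2']

    def start_requests(self):
        # yield one Request per start URL; the engine schedules each of them
        for url in self.start_urls:
            yield Request(url)

    def parse(self, response):
        # yield any mix of items and follow-up Requests; the engine
        # dispatches on isinstance(result, Request)
        yield {'url': response.url}  # item: goes to the pipeline
        # yield Request(next_url)    # request: would go back to the scheduler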
Example #9
class Engine(object):
    def __init__(self):
        self.spider = Spider()
        self.scheduler = Scheduler()
        self.pipeline = Pipeline()
        self.downloader = Downloader()
        self.spider_mid = SpiderMiddleware()  # initialize the spider middleware object
        self.downloader_mid = DownloaderMiddleware()  # initialize the downloader middleware object

    def start(self):
        """Start the whole engine; the main driving logic lives in _start_engine"""
        start = datetime.now()  # current time
        logger.info('Run started at: %s' % start)
        self._start_engine()
        stop = datetime.now()
        logger.info('Run finished at: %s' % stop)
        # total running time
        logger.info('Elapsed: %.2f' % (stop - start).total_seconds())

    def _start_engine(self):
        '''Wire the individual components together'''
        # the spider module issues the initial request
        start_request = self.spider.start_request()

        # spider middleware
        start_request = self.spider_mid.process_request(start_request)
        # hand the initial request to the scheduler
        self.scheduler.add_request(start_request)
        # fetch a request object from the scheduler
        request = self.scheduler.get_request()

        # pre-process the request object with the downloader middleware
        request = self.downloader_mid.process_request(request)
        # send the request with the downloader
        response = self.downloader.get_response(request)

        # pre-process the response object with the downloader middleware
        response = self.downloader_mid.process_response(response)
        # process the response with the spider's parse method to get a result
        result = self.spider.parse(response)
        if isinstance(result, Request):
            result = self.spider_mid.process_request(result)
            self.scheduler.add_request(result)
        else:
            self.pipeline.process_item(result)
Example #10
    def __init__(self, pipelines={}, middlewares={}):
        # self.spider = Spider()
        # self.spider = spider
        self.spider = self._auto_import_instances(SPIDERS)
        self.scheduler = Scheduler()
        self.pipelines = pipelines
        # self.pipelines = self._auto_import_instances(PIPELINES)
        self.downloader = Downloader()
        self.spider_mid = SpiderMiddleware()  # initialize the spider middleware object
        # self.downloader_mid = DownloaderMiddleware()  # initialize the downloader middleware object
        self.downloader_mid = middlewares  # initialize the downloader middleware objects

        # total number of responses
        self.total_response_nums = 0
        # total number of requests
        self.total_request_nums = 0

        # thread pool
        self.pool = Pool()
        self.is_running = False
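
SPIDERS and _auto_import_instances are only referenced in this snippet. A plausible implementation, assumed here rather than taken from the source, resolves each dotted class path with importlib and keys the instances by the spider's name attribute:

import importlib

def _auto_import_instances(self, paths=()):
    """Assumed helper: turn dotted class paths into instantiated objects."""
    instances = {}
    for path in paths:  # e.g. 'myproject.spiders.ExampleSpider' (hypothetical path)
        module_name, class_name = path.rsplit('.', 1)
        module = importlib.import_module(module_name)  # import the module
        cls = getattr(module, class_name)              # look up the class
        instance = cls()
        instances[instance.name] = instance            # assumes spiders expose a name attribute
    return instances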
Example #11
class Engine():
    def __init__(self, spiders):
        self.spiders = spiders
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()

        self.spider_mid = SpiderMiddleware()
        self.downloader_mid = DownloaderMiddleware()

        self.total_response_nums = 0
        self.total_request_nums = 0

    def start(self):
        # entry point called to run the engine logic

        start = datetime.now()  # start time
        logger.info("Run started at: %s" % start)  # log the start time
        self._start_engine()
        stop = datetime.now()  # end time
        logger.info("Run finished at: %s" % stop)  # log the end time
        logger.info("Elapsed: %.2f" % (stop - start).total_seconds())  # log the elapsed time
        logger.info("Total requests: {}".format(self.total_request_nums))
        logger.info("Total responses: {}".format(self.total_response_nums))

    def _start_requests(self):
        """Build a request from every start URL of every spider and add it to the queue"""
        for spider_name, spider in self.spiders.items():
            for start_request in spider.start_requests():

                # 1. The spider module issues the initial request
                # pre-process the request object with the spider middleware
                start_request = self.spider_mid.process_request(start_request)
                # attach the spider_name attribute to the request object
                start_request.spider_name = spider_name
                # 2. Hand the initial request to the scheduler
                self.scheduler.add_request(start_request)
                # total requests + 1
                self.total_request_nums += 1

    def _execute_request_response_item(self):
        """Take one request from the queue and process it to completion"""
        # 3. Fetch a request object from the scheduler
        request = self.scheduler.get_request()
        # check whether the queue has been drained
        if request is None:
            return  # bail out early
        # pre-process the request object with the downloader middleware
        request = self.downloader_mid.process_request(request)
        # 4. Send the request with the downloader
        response = self.downloader.get_response(request)
        # carry the meta over
        response.meta = request.meta
        # pre-process the response object with the downloader middleware
        response = self.downloader_mid.process_response(response)
        # pre-process the response object with the spider middleware
        response = self.spider_mid.process_response(response)
        # look up the spider object by its name
        spider = self.spiders[request.spider_name]
        # 5. Process the response with the spider's parse method to get results
        # the parse function named by request.parse = getattr(spider object, parse function name string)
        parse_func = getattr(spider, request.parse)
        results = parse_func(response)
        for result in results:
            # 6. Inspect the result object
            if isinstance(result, Request):
                # pre-process the request object with the spider middleware
                result = self.spider_mid.process_request(result)
                # attach the spider_name attribute to the request object
                result.spider_name = request.spider_name
                # 6.1 If it is a request object, hand it back to the scheduler
                self.scheduler.add_request(result)
                # total requests + 1
                self.total_request_nums += 1
            else:
                # 6.2 Otherwise, hand it to the pipeline
                self.pipeline.process_item(result)
        # total responses + 1
        self.total_response_nums += 1

    def _start_engine(self):
        # the framework's driving logic

        self._start_requests()  # put every initial request into the queue

        while True:
            time.sleep(0.1)
            self._execute_request_response_item()  # process one request taken from the queue

            # exit condition
            if self.scheduler.q.empty():
                break  # the queue is empty
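
Because this variant takes a dict of spiders keyed by name, driving it looks roughly like this (the spider classes and names are illustrative, not from the original source):

if __name__ == '__main__':
    spiders = {
        'one': ExampleSpiderOne(),  # hypothetical spider classes
        'two': ExampleSpiderTwo(),
    }
    engine = Engine(spiders)
    engine.start()  # runs until the scheduler's queue is drained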
Example #12
class Engine():
    def __init__(self):
        self.spider = Spider()
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()

        self.spider_mid = SpiderMiddleware()
        self.downloader_mid = DownloaderMiddleware()

    def start(self):
        # entry point called to run the engine logic

        start = datetime.now()  # start time
        logger.info("Run started at: %s" % start)  # log the start time
        self._start_engine()
        stop = datetime.now()  # end time
        logger.info("Run finished at: %s" % stop)  # log the end time
        logger.info("Elapsed: %.2f" % (stop - start).total_seconds())  # log the elapsed time

    def _start_engine(self):
        # the framework's driving logic

        # 1. The spider module issues the initial request
        start_request = self.spider.start_request()

        # pre-process the request object with the spider middleware
        start_request = self.spider_mid.process_request(start_request)

        # 2. Hand the initial request to the scheduler
        self.scheduler.add_request(start_request)

        # 3. Fetch a request object from the scheduler
        request = self.scheduler.get_request()

        # pre-process the request object with the downloader middleware
        request = self.downloader_mid.process_request(request)

        # 4. Send the request with the downloader
        response = self.downloader.get_response(request)

        # pre-process the response object with the downloader middleware
        response = self.downloader_mid.process_response(response)

        # pre-process the response object with the spider middleware
        response = self.spider_mid.process_response(response)

        # 5. Process the response with the spider's parse method to get a result
        result = self.spider.parse(response)

        # 6. Inspect the result object
        if isinstance(result, Request):

            # pre-process the request object with the spider middleware
            result = self.spider_mid.process_request(result)

            # 6.1 If it is a request object, hand it back to the scheduler
            self.scheduler.add_request(result)

        else:

            # 6.2 Otherwise, hand it to the pipeline
            self.pipeline.process_item(result)
Example #13
class Engine(object):

    def __init__(self, spider):
        self.spider = spider
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()

        # instantiate the middlewares
        self.spider_mid = SpiderMiddleware()
        self.downloader_mid = DownloaderMiddleware()

    def _start_engine(self):
        # the engine's scheduling flow

        # 1. Call the spider module's start_requests method to get the initial request
        start_request = self.spider.start_requests()

        # ----1 Pre-process the request with the spider middleware's process_request method
        start_request = self.spider_mid.process_request(start_request)

        # 2. Pass the initial request to the scheduler module's add_request method
        self.scheduler.add_request(start_request)

        # 3. Call the scheduler module's get_request method to take one request off the pending queue
        request = self.scheduler.get_request()

        # ----2 Call the downloader middleware's process_request method
        request = self.downloader_mid.process_request(request)

        # 4. Hand the request to the downloader module's get_response method to obtain the matching response
        response = self.downloader.get_response(request)

        # ----3 Call the downloader middleware's process_response method
        response = self.downloader_mid.process_response(response)

        # 5. Hand the response to the spider module's parse method and collect the parse result
        result = self.spider.parse(response)

        # 6. Check the type of the parse result
        if isinstance(result, Request):
            # ----1 Pre-process the request with the spider middleware's process_request method
            result = self.spider_mid.process_request(result)

            # if it is a Request object, call the scheduler module's add_request method to queue it
            self.scheduler.add_request(result)

        elif isinstance(result, Item):
            result = self.spider_mid.process_item(result)

            # if it is an Item object, call the pipeline module's process_item method
            self.pipeline.process_item(result)
        else:
            raise Exception('Data type not supported by the framework')

    def start(self):
        start = datetime.now()
        logger.info("Engine started at {}".format(start))
        self._start_engine()
        stop = datetime.now()
        logger.info("Engine stopped at {}".format(stop))
        logger.info("Spider ran for {} seconds".format((stop - start).total_seconds()))