def __init__(self, crawler, spider_closed_callback):
    ## Store the crawler instance on the execution engine
    self.crawler = crawler
    ## Store the crawler's settings as well
    self.settings = crawler.settings
    ## Signal manager
    self.signals = crawler.signals
    ## Log formatter
    self.logformatter = crawler.logformatter
    self.slot = None
    self.spider = None
    ## Whether the engine is running
    self.running = False
    ## Whether execution has been paused
    self.paused = False
    ## Load the scheduler class from the settings
    self.scheduler_cls = load_object(self.settings['SCHEDULER'])
    ## Load the downloader class from the settings
    downloader_cls = load_object(self.settings['DOWNLOADER'])
    ## Instantiate the downloader
    self.downloader = downloader_cls(crawler)
    ## Instantiate the scraper, the engine's bridge between spider classes (Spider) and pipeline classes (Pipeline)
    self.scraper = Scraper(crawler)
    ## Register the callback to invoke when the spider closes
    self._spider_closed_callback = spider_closed_callback
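Every variant above and below relies on load_object to turn a dotted path from the settings into a class object. A minimal sketch of what scrapy.utils.misc.load_object does (the real helper also validates the path and raises clearer errors; load_object_sketch is an illustrative name, not Scrapy's):

from importlib import import_module

def load_object_sketch(path):
    # split 'scrapy.core.scheduler.Scheduler' into module path and attribute name
    module_path, _, name = path.rpartition('.')
    module = import_module(module_path)
    return getattr(module, name)

# usage: resolves the dotted path to the class itself, without instantiating it
Scheduler = load_object_sketch('scrapy.core.scheduler.Scheduler')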
def test_spider_output_handling(self):
    spider = self.MySpider()
    scraper = Scraper(Crawler(spider))
    scraper.open_spider(spider)
    scraper._process_spidermw_output(RssItem(), None, None, None)
    scraper._process_spidermw_output(ExtendableItem(), None, None, None)
    scraper._process_spidermw_output(RssedItem(), None, None, None)
    scraper.close_spider(spider)
def __init__(self, settings, spider_closed_callback):
    self.settings = settings
    self.closing = {}  # dict (spider -> reason) of spiders being closed
    self.closing_dfds = {}  # dict (spider -> deferred) of spiders being closed
    self.running = False
    self.paused = False
    self._next_request_calls = {}
    self.scheduler = load_object(settings['SCHEDULER'])()
    self.downloader = Downloader()
    self.scraper = Scraper(self, self.settings)
    self._spider_closed_callback = spider_closed_callback
def __init__(self, crawler, spider_closed_callback):
    self.settings = crawler.settings
    self.slots = {}
    self.running = False
    self.paused = False
    self.scheduler_cls = load_object(self.settings['SCHEDULER'])
    self.downloader = Downloader(crawler)
    self.scraper = Scraper(crawler)
    self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS', 1)
    if self._concurrent_spiders != 1:
        warnings.warn("CONCURRENT_SPIDERS settings is deprecated, use "
                      "Scrapyd max_proc config instead",
                      ScrapyDeprecationWarning)
    self._spider_closed_callback = spider_closed_callback
def __init__(self, crawler, spider_closed_callback):
    self.crawler = crawler
    self.settings = crawler.settings
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
    self.slot = None
    self.spider = None
    self.running = False
    self.paused = False
    self.scheduler_cls = load_object(self.settings['SCHEDULER'])
    downloader_cls = load_object(self.settings['DOWNLOADER'])
    self.downloader = downloader_cls(crawler)
    self.scraper = Scraper(crawler)
    self._spider_closed_callback = spider_closed_callback
def __init__(self, crawler, spider_closed_callback: Callable) -> None:
    self.crawler = crawler
    self.settings = crawler.settings
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
    self.slot: Optional[Slot] = None
    self.spider: Optional[Spider] = None
    self.running = False
    self.paused = False
    self.scheduler_cls = load_object(crawler.settings["SCHEDULER"])
    downloader_cls = load_object(self.settings['DOWNLOADER'])
    self.downloader = downloader_cls(crawler)
    self.scraper = Scraper(crawler)
    self._spider_closed_callback = spider_closed_callback
def __init__(self, crawler, spider_closed_callback):
    self.crawler = crawler
    self.settings = crawler.settings
    self.signals = crawler.signals  # use the crawler's signal manager
    self.logformatter = crawler.logformatter
    self.slot = None
    self.spider = None
    self.running = False
    self.paused = False
    self.scheduler_cls = load_object(
        self.settings['SCHEDULER'])  # load the scheduler class named in the settings
    downloader_cls = load_object(
        self.settings['DOWNLOADER'])  # load the downloader class named in the settings
    self.downloader = downloader_cls(crawler)
    self.scraper = Scraper(crawler)  # create a scraper
    self._spider_closed_callback = spider_closed_callback
def __init__(self, crawler, spider_closed_callback):
    self.crawler = crawler
    self.settings = crawler.settings
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
    self.slot = None
    self.spider = None
    self.running = False
    self.paused = False
    self.scheduler_cls = load_object(
        self.settings['SCHEDULER']
    )  # look up the Scheduler class from the settings
    downloader_cls = load_object(
        self.settings['DOWNLOADER'])  # likewise, look up the Downloader class
    self.downloader = downloader_cls(crawler)  # instantiate the Downloader
    self.scraper = Scraper(crawler)  # instantiate the Scraper, the engine's bridge to the spider classes
    self._spider_closed_callback = spider_closed_callback
def __init__(self, crawler, spider_closed_callback):
    self.crawler = crawler
    self.settings = crawler.settings
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
    self.slot = None
    self.spider = None
    self.running = False
    self.paused = False
    self.scheduler_cls = load_object(self.settings['SCHEDULER'])
    self.scraper = Scraper(crawler)
    self._spider_closed_callback = spider_closed_callback
    if self.downloader is None:
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        MyExecutionEngine.downloader = downloader_cls(crawler)
        self.downloader = MyExecutionEngine.downloader
        self.downloader.close = CloseOnlyLastTime(self.downloader.close)
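This variant shares one downloader across engine instances by storing it as a class attribute on MyExecutionEngine, and wraps its close method so the first crawl to finish does not tear the shared downloader down. CloseOnlyLastTime is not defined above; the sketch below is a hypothetical reconstruction of what such a wrapper could look like, under the assumption that each crawl registers itself and only the last close() call is forwarded:

class CloseOnlyLastTime:
    """Hypothetical wrapper: forwards close() only once the last user is done."""

    def __init__(self, close_func):
        self.close_func = close_func
        self.users = 0  # assumed: acquire() is called once per crawl sharing the downloader

    def acquire(self):
        self.users += 1

    def __call__(self, *args, **kwargs):
        self.users -= 1
        if self.users <= 0:
            # last crawl finished: actually close the shared downloader
            return self.close_func(*args, **kwargs)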
def __init__(self, crawler, spider_closed_callback):
    self.crawler = crawler
    self.settings = crawler.settings  # settings
    self.signals = crawler.signals  # signals
    self.logformatter = crawler.logformatter  # log formatter
    self.slot = None
    self.spider = None
    self.running = False
    self.paused = False
    # Look up the scheduler class (not instantiated here; it is instantiated in open_spider)
    self.scheduler_cls = load_object(self.settings['SCHEDULER'])
    # Look up the downloader class and instantiate it; see scrapy/core/downloader/__init__.py
    downloader_cls = load_object(self.settings['DOWNLOADER'])
    self.downloader = downloader_cls(crawler)
    # Instantiate the scraper, the bridge between the engine and the spider; see scrapy/core/scraper.py
    self.scraper = Scraper(crawler)
    self._spider_closed_callback = spider_closed_callback
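As the comment notes, __init__ only keeps the scheduler class; the instance is created later. A simplified sketch of that step inside open_spider, based on scrapy/core/engine.py (signatures and details vary across Scrapy versions):

def open_spider(self, spider, start_requests=(), close_if_idle=True):
    # the class loaded in __init__ is only instantiated once a spider opens
    scheduler = self.scheduler_cls.from_crawler(self.crawler)
    ...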
def __init__(self, crawler, spider_closed_callback):
    self.locker = threading.Condition()
    self.crawler = crawler
    self.settings = crawler.settings
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
    self.slot = None
    self.spider = None
    self.running = False
    self.paused = False
    self.scheduler_cls = load_object(self.settings['SCHEDULER'])
    downloader_cls = load_object(self.settings['DOWNLOADER'])
    self.downloader = downloader_cls(crawler)
    self.scraper = Scraper(crawler)
    self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS', 1)
    if self._concurrent_spiders != 1:
        warnings.warn("CONCURRENT_SPIDERS settings is deprecated, use "
                      "Scrapyd max_proc config instead",
                      ScrapyDeprecationWarning)
    self._spider_closed_callback = spider_closed_callback
def __init__(self, crawler, spider_closed_callback):
    self.crawler = crawler
    self.settings = crawler.settings
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
    self.slot = None
    self.spider = None
    self.running = False
    self.paused = False
    # The scheduler class is obtained here;
    # the default is 'scrapy.core.scheduler.Scheduler'
    self.scheduler_cls = load_object(self.settings['SCHEDULER'])
    # Likewise the downloader class;
    # the default is 'scrapy.core.downloader.Downloader'
    downloader_cls = load_object(self.settings['DOWNLOADER'])
    self.downloader = downloader_cls(crawler)  # instantiate the downloader
    self.scraper = Scraper(crawler)
    self._spider_closed_callback = spider_closed_callback
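Those defaults come from Scrapy's bundled settings; the relevant entries in scrapy/settings/default_settings.py are:

DOWNLOADER = 'scrapy.core.downloader.Downloader'
SCHEDULER = 'scrapy.core.scheduler.Scheduler'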
def __init__(self, crawler, spider_closed_callback):
    self.crawler = crawler
    self.settings = crawler.settings
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
    self.slot = None
    self.spider = None
    self.running = False
    self.paused = False
    self.scheduler_cls = load_object(
        self.settings['SCHEDULER']
    )  # SCHEDULER = 'scrapy.core.scheduler.Scheduler'; this only fetches the class, nothing more
    downloader_cls = load_object(
        self.settings['DOWNLOADER']
    )  # DOWNLOADER = 'scrapy.core.downloader.Downloader'
    self.downloader = downloader_cls(
        crawler
    )  # The downloader instantiates the download handlers and the process_* middleware chain, so both the actual downloading and the middleware hooks are already in place
    self.scraper = Scraper(
        crawler)  # The scraper holds the spider middleware and the item pipeline objects, so item processing and storage are set up here
    self._spider_closed_callback = spider_closed_callback  # This callback matters for whether the crawl can stop: it is the lambda `lambda _: self.stop()`, which ultimately runs the engine's stop via self.engine.stop
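For context on that last comment: in Scrapy's crawler.py the engine is created with exactly such a lambda, along the lines of the following simplified sketch (exact code varies by Scrapy version):

class Crawler:
    def _create_engine(self):
        # the spider_closed_callback the engine receives is this lambda,
        # so closing the spider ends up stopping the crawler itself
        return ExecutionEngine(self, lambda _: self.stop())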
def __init__(self, crawler, spider_closed_callback: Callable) -> None:
    self.crawler = crawler
    self.settings = crawler.settings
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
    self.slot: Optional[Slot] = None
    self.spider: Optional[Spider] = None
    self.running = False
    self.paused = False  # whether the engine is paused
    self.scheduler_cls = self._get_scheduler_class(crawler.settings)
    # load the downloader class and instantiate it
    downloader_cls = load_object(self.settings['DOWNLOADER'])
    self.downloader = downloader_cls(crawler)
    # instantiate the scraper
    self.scraper = Scraper(crawler)
    # the externally supplied close callback
    self._spider_closed_callback = spider_closed_callback
"""