def test_start_already_running_exception(self):
    e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
    yield e.open_spider(TestSpider(), [])
    e.start()
    yield self.assertFailure(e.start(), RuntimeError).addBoth(
        lambda exc: self.assertEqual(str(exc), "Engine already running"))
    yield e.stop()
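# --- Hedged sketch (added for context, not from the original source) ---
# The yield-based test above (and the similar tests below) assumes a Twisted
# trial TestCase whose test methods are decorated with @defer.inlineCallbacks,
# plus helpers such as get_crawler (from scrapy.utils.test) and a TestSpider
# class defined in the test module. A minimal version of that scaffolding
# might look like this; the test-class name is an assumption:
from twisted.internet import defer
from twisted.trial import unittest

from scrapy.core.engine import ExecutionEngine
from scrapy.spiders import Spider
from scrapy.utils.test import get_crawler


class TestSpider(Spider):
    name = "scrapytest.org"


class EngineStartStopTest(unittest.TestCase):

    @defer.inlineCallbacks
    def test_start_stop(self):
        # start() returns a Deferred that only fires once the engine stops,
        # so it is intentionally not yielded here
        e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
        yield e.open_spider(TestSpider(), [])
        e.start()
        self.assertTrue(e.running)
        yield e.stop()
        self.assertFalse(e.running)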
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)
        self._start_requests = lambda: ()
        self._spider = None
        # TODO: move SpiderManager to CrawlerProcess
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)

    def install(self):
        # TODO: remove together with scrapy.project.crawler usage
        import scrapy.project
        assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        # TODO: remove together with scrapy.project.crawler usage
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return
        self.configured = True
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        assert self._spider is None, 'Spider already attached'
        self._spider = spider
        spider.set_crawler(self)
        if requests is None:
            self._start_requests = spider.start_requests
        else:
            self._start_requests = lambda: requests

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        if self._spider:
            yield self.engine.open_spider(self._spider, self._start_requests())
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.configured and self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
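# --- Hedged sketch (added for context, not from the original source) ---
# In this Crawler variant, crawl() only attaches the spider and records where
# its start requests will come from; start() then configures the crawler,
# opens the spider, and runs the engine. A minimal sketch of how it would be
# driven under Twisted; the run_spider helper and the reactor wiring are
# assumptions:
from twisted.internet import reactor

from scrapy.settings import Settings


def run_spider(spider):
    crawler = Crawler(Settings())
    crawler.crawl(spider)                  # attach the spider; nothing runs yet
    d = crawler.start()                    # configure, open the spider, start the engine
    d.addBoth(lambda _: reactor.stop())    # stop the reactor once the crawl finishes
    reactor.run()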
def test_close_engine_spiders_downloader(self):
    e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
    yield e.open_spider(TestSpider(), [])
    e.start()
    self.assertTrue(e.running)
    yield e.close()
    self.assertFalse(e.running)
    self.assertEqual(len(e.open_spiders), 0)
def test_close_spiders_downloader(self):
    with pytest.warns(ScrapyDeprecationWarning,
                      match="ExecutionEngine.open_spiders is deprecated, "
                            "please use ExecutionEngine.spider instead"):
        e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
        yield e.open_spider(TestSpider(), [])
        self.assertEqual(len(e.open_spiders), 1)
        yield e.close()
        self.assertEqual(len(e.open_spiders), 0)
def test_deprecated_schedule(self):
    with pytest.warns(ScrapyDeprecationWarning,
                      match="ExecutionEngine.schedule is deprecated, please use "
                            "ExecutionEngine.crawl or ExecutionEngine.download instead"):
        e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
        spider = TestSpider()
        yield e.open_spider(spider, [])
        e.start()
        e.schedule(Request("data:,"), spider)
        yield e.close()
def test_download_deprecated_spider_arg(self):
    with pytest.warns(ScrapyDeprecationWarning,
                      match="Passing a 'spider' argument to "
                            "ExecutionEngine.download is deprecated"):
        e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
        spider = TestSpider()
        yield e.open_spider(spider, [])
        e.start()
        e.download(Request("data:,"), spider)
        yield e.close()
def test_deprecated_has_capacity(self):
    with pytest.warns(ScrapyDeprecationWarning,
                      match="ExecutionEngine.has_capacity is deprecated"):
        e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
        self.assertTrue(e.has_capacity())
        spider = TestSpider()
        yield e.open_spider(spider, [])
        self.assertFalse(e.has_capacity())
        e.start()
        yield e.close()
        self.assertTrue(e.has_capacity())
def test_deprecated_has_capacity(self):
    with warnings.catch_warnings(record=True) as warning_list:
        e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
        self.assertTrue(e.has_capacity())
        spider = TestSpider()
        yield e.open_spider(spider, [])
        self.assertFalse(e.has_capacity())
        e.start()
        yield e.close()
        self.assertTrue(e.has_capacity())
    self.assertEqual(warning_list[0].category, ScrapyDeprecationWarning)
    self.assertEqual(str(warning_list[0].message),
                     "ExecutionEngine.has_capacity is deprecated")
def test_close_spiders_downloader(self):
    with warnings.catch_warnings(record=True) as warning_list:
        e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
        yield e.open_spider(TestSpider(), [])
        self.assertEqual(len(e.open_spiders), 1)
        yield e.close()
        self.assertEqual(len(e.open_spiders), 0)
    self.assertEqual(warning_list[0].category, ScrapyDeprecationWarning)
    self.assertEqual(
        str(warning_list[0].message),
        "ExecutionEngine.open_spiders is deprecated, please use ExecutionEngine.spider instead",
    )
def test_download_deprecated_spider_arg(self):
    with warnings.catch_warnings(record=True) as warning_list:
        e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
        spider = TestSpider()
        yield e.open_spider(spider, [])
        e.start()
        e.download(Request("data:,"), spider)
        yield e.close()
    self.assertEqual(warning_list[0].category, ScrapyDeprecationWarning)
    self.assertEqual(
        str(warning_list[0].message),
        "Passing a 'spider' argument to ExecutionEngine.download is deprecated",
    )
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)

    def install(self):
        import scrapy.project
        assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return
        self.configured = True
        d = dict(overridden_settings(self.settings))
        log.msg(format="Overridden settings: %(settings)r",
                settings=d, level=log.DEBUG)
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)
        if requests is None:
            requests = spider.start_requests()
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
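# --- Hedged sketch (added for context, not from the original source) ---
# Unlike the first variant, crawl() here opens the spider on the engine right
# away and returns a Deferred, so configure() must have run beforehand to
# create the engine; start() then just kicks off the engine loop. A sketch of
# that calling order; the DemoSpider class and Settings usage are assumptions
# (this is Python 2-era code, hence the scrapy.spider.BaseSpider import):
from scrapy.settings import Settings
from scrapy.spider import BaseSpider


class DemoSpider(BaseSpider):
    name = "demo"


crawler = Crawler(Settings())
crawler.configure()            # creates the engine; crawl() needs it
crawler.crawl(DemoSpider())    # opens the spider immediately, returns a Deferred
crawler.start()                # configure() is a no-op now; starts the engine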
def test_deprecated_schedule(self):
    with warnings.catch_warnings(record=True) as warning_list:
        e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
        spider = TestSpider()
        yield e.open_spider(spider, [])
        e.start()
        e.schedule(Request("data:,"), spider)
        yield e.close()
    self.assertEqual(warning_list[0].category, ScrapyDeprecationWarning)
    self.assertEqual(
        str(warning_list[0].message),
        "ExecutionEngine.schedule is deprecated, please use "
        "ExecutionEngine.crawl or ExecutionEngine.download instead",
    )
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings["STATS_CLASS"])(self)

    def install(self):
        import scrapy.project
        assert not hasattr(scrapy.project, "crawler"), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project
        assert hasattr(scrapy.project, "crawler"), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return
        self.configured = True
        self.extensions = ExtensionManager.from_crawler(self)
        spman_cls = load_object(self.settings["SPIDER_MANAGER_CLASS"])
        self.spiders = spman_cls.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)
        if requests is None:
            requests = spider.start_requests()
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
class Crawler(SettingObject):

    spider_manager_class = StringField(default="scrapy.spidermanager.SpiderManager")

    def __init__(self, settings):
        super(Crawler, self).__init__(settings)
        self.configured = False

    def configure(self):
        if self.configured:
            return
        self.configured = True
        self.extensions = ExtensionManager(self.metas, self)
        spman_cls = load_object(self.spider_manager_class.to_value())
        self.spiders = spman_cls(self.metas)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)
        if requests is None:
            requests = spider.start_requests()
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)
        self.scheduled = {}

    def install(self):
        import scrapy.project
        assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return
        self.configured = True
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)
        if self.configured and self.engine.running:
            assert not self.scheduled
            # wrap the single batch: schedule() expects an iterable of batches
            return self.schedule(spider, [requests])
        else:
            self.scheduled.setdefault(spider, []).append(requests)

    def schedule(self, spider, batches=[]):
        requests = []
        for batch in batches:
            if batch is None:
                batch = spider.start_requests()
            requests.extend(batch)
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        for spider, batches in self.scheduled.iteritems():
            yield self.schedule(spider, batches)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
def test_close_spiders_downloader(self):
    e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
    yield e.open_spider(TestSpider(), [])
    self.assertEqual(len(e.open_spiders), 1)
    yield e.close()
    self.assertEqual(len(e.open_spiders), 0)
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)
        self._scheduled = {}

    def install(self):
        import scrapy.project
        assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return
        self.configured = True
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)
        if self.configured and self.engine.running:
            assert not self._scheduled
            # wrap the single batch: _schedule() expects an iterable of batches
            return self._schedule(spider, [requests] if requests is not None else ())
        elif requests is None:
            self._scheduled[spider] = None
        else:
            self._scheduled.setdefault(spider, []).append(requests)

    def _schedule(self, spider, batches=()):
        requests = chain.from_iterable(batches) \
            if batches else spider.start_requests()
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        for spider, batches in self._scheduled.iteritems():
            yield self._schedule(spider, batches)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
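# --- Hedged sketch (added for context, not from the original source) ---
# The last variant above queues request batches per spider: each crawl() call
# made before start() appends one batch to _scheduled, and start() flattens
# the batches with chain.from_iterable before opening the spider. A sketch of
# that queueing behaviour; the DemoSpider class, Settings usage, and URLs are
# illustrative (Python 2-era imports, matching this code):
from scrapy.http import Request
from scrapy.settings import Settings
from scrapy.spider import BaseSpider


class DemoSpider(BaseSpider):
    name = "demo"


spider = DemoSpider()
crawler = Crawler(Settings())
crawler.crawl(spider, [Request("http://example.com/1")])  # queue a first batch
crawler.crawl(spider, [Request("http://example.com/2")])  # queue a second batch
crawler.start()  # Deferred; opens the spider with both batches chained together

# Note: for a given spider, crawl(spider) with no requests and crawl(spider,
# batch) should not be mixed before start(), since the stored None cannot
# have further batches appended to it.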