def test_build_component_list(self):
    base = {'one': 1, 'two': 2, 'three': 3, 'five': 5, 'six': None}
    custom = {'two': None, 'three': 8, 'four': 4}
    self.assertEqual(build_component_list(base, custom),
                     ['one', 'four', 'five', 'three'])
    custom = ['a', 'b', 'c']
    self.assertEqual(build_component_list(base, custom), custom)
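The test above pins down the old two-argument contract: entries in custom override base, a value of None disables a component, the survivors are ordered by ascending value, and a list-valued custom is returned as-is. A minimal sketch of that behavior (an illustration consistent with the test, not Scrapy's actual implementation) might look like:

def build_component_list_sketch(base, custom):
    # A list-valued custom setting is taken as the complete,
    # already-ordered component list.
    if isinstance(custom, (list, tuple)):
        return list(custom)
    compdict = dict(base or {})
    compdict.update(custom or {})
    # None disables a component; the rest are ordered by ascending value.
    enabled = {k: v for k, v in compdict.items() if v is not None}
    return [k for k, v in sorted(enabled.items(), key=lambda kv: kv[1])]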
def test_duplicate_components_in_basesettings(self):
    # Higher priority takes precedence
    duplicate_bs = BaseSettings({'one': 1, 'two': 2}, priority=0)
    duplicate_bs.set('ONE', 4, priority=10)
    self.assertEqual(
        build_component_list(duplicate_bs, convert=lambda x: x.lower()),
        ['two', 'one'])
    duplicate_bs.set('one', duplicate_bs['one'], priority=20)
    self.assertEqual(
        build_component_list(duplicate_bs, convert=lambda x: x.lower()),
        ['one', 'two'])
    # Same priority raises ValueError
    duplicate_bs.set('ONE', duplicate_bs['ONE'], priority=20)
    self.assertRaises(ValueError, build_component_list, duplicate_bs,
                      convert=lambda x: x.lower())
def run(self, args, opts):
    # load contracts
    contracts = build_component_list(
        self.settings['SPIDER_CONTRACTS_BASE'],
        self.settings['SPIDER_CONTRACTS'],
    )
    self.conman = ContractsManager([load_object(c) for c in contracts])

    # contract requests
    contract_reqs = defaultdict(list)
    self.crawler.engine.has_capacity = lambda: True

    for spider in args or self.crawler.spiders.list():
        spider = self.crawler.spiders.create(spider)
        requests = self.get_requests(spider)

        if opts.list:
            for req in requests:
                contract_reqs[spider.name].append(req.callback.__name__)
        else:
            self.crawler.crawl(spider, requests)

    # start checks
    if opts.list:
        for spider, methods in sorted(contract_reqs.iteritems()):
            print spider
            for method in sorted(methods):
                print ' * %s' % method
    else:
        self.crawler.start()
def run(self, args, opts):
    # load contracts
    contracts = build_component_list(self.settings["SPIDER_CONTRACTS_BASE"],
                                     self.settings["SPIDER_CONTRACTS"])
    self.conman = ContractsManager([load_object(c) for c in contracts])
    self.results = TextTestRunner(verbosity=opts.verbose)._makeResult()

    # contract requests
    contract_reqs = defaultdict(list)
    spman_cls = load_object(self.settings["SPIDER_MANAGER_CLASS"])
    spiders = spman_cls.from_settings(self.settings)

    for spider in args or spiders.list():
        spider = spiders.create(spider)
        requests = self.get_requests(spider)

        if opts.list:
            for req in requests:
                contract_reqs[spider.name].append(req.callback.__name__)
        elif requests:
            crawler = self.crawler_process.create_crawler(spider.name)
            crawler.crawl(spider, requests)

    # start checks
    if opts.list:
        for spider, methods in sorted(contract_reqs.iteritems()):
            print spider
            for method in sorted(methods):
                print " * %s" % method
    else:
        self.crawler_process.start()
        self.results.printErrors()
def test_valid_numbers(self):
    # work well with None and numeric values
    d = {'a': 10, 'b': None, 'c': 15, 'd': 5.0}
    self.assertEqual(build_component_list(d, convert=lambda x: x),
                     ['d', 'a', 'c'])
    d = {'a': 33333333333333333333, 'b': 11111111111111111111,
         'c': 22222222222222222222}
    self.assertEqual(build_component_list(d, convert=lambda x: x),
                     ['b', 'c', 'a'])
    # raise exception for invalid values
    d = {'one': '5'}
    self.assertRaises(ValueError, build_component_list, {}, d,
                      convert=lambda x: x)
    d = {'one': '1.0'}
    self.assertRaises(ValueError, build_component_list, {}, d,
                      convert=lambda x: x)
    d = {'one': [1, 2, 3]}
    self.assertRaises(ValueError, build_component_list, {}, d,
                      convert=lambda x: x)
    d = {'one': {'a': 'a', 'b': 2}}
    self.assertRaises(ValueError, build_component_list, {}, d,
                      convert=lambda x: x)
    d = {'one': 'lorem ipsum'}
    self.assertRaises(ValueError, build_component_list, {}, d,
                      convert=lambda x: x)
def _get_mwlist_from_settings(cls, settings):
    item_pipelines = settings['ITEM_PIPELINES']
    if isinstance(item_pipelines, (tuple, list, set, frozenset)):
        from scrapy.exceptions import ScrapyDeprecationWarning
        import warnings
        warnings.warn('ITEM_PIPELINES defined as a list or a set is '
                      'deprecated, switch to a dict',
                      category=ScrapyDeprecationWarning, stacklevel=1)
        # convert old ITEM_PIPELINES list to a dict with orders starting at 500
        item_pipelines = dict(zip(item_pipelines,
                                  range(500, 500 + len(item_pipelines))))
    return build_component_list(settings['ITEM_PIPELINES_BASE'], item_pipelines)
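For illustration, the dict(zip(...)) conversion above assigns consecutive orders 500, 501, ... to a legacy list-style setting; the pipeline paths below are hypothetical:

# Hypothetical pipeline paths, purely for illustration.
legacy = ['myproject.pipelines.CleanPipeline',
          'myproject.pipelines.StorePipeline']
converted = dict(zip(legacy, range(500, 500 + len(legacy))))
assert converted == {'myproject.pipelines.CleanPipeline': 500,
                     'myproject.pipelines.StorePipeline': 501}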
def __init__(self):
    if not settings.getbool('WEBSERVICE_ENABLED'):
        raise NotConfigured
    logfile = settings['WEBSERVICE_LOGFILE']
    port = settings.getint('WEBSERVICE_PORT')
    root = RootResource()
    reslist = build_component_list(settings['WEBSERVICE_RESOURCES_BASE'],
                                   settings['WEBSERVICE_RESOURCES'])
    for res_cls in map(load_object, reslist):
        res = res_cls()
        root.putChild(res.ws_name, res)
    server.Site.__init__(self, root, logPath=logfile)
    self.noisy = False
    reactor.callWhenRunning(reactor.listenTCP, port, self)
def __init__(self):
    if not settings.getbool('WEBSERVICE_ENABLED'):
        raise NotConfigured
    logfile = settings['WEBSERVICE_LOGFILE']
    self.portrange = map(int, settings.getlist('WEBSERVICE_PORT'))
    self.host = settings['WEBSERVICE_HOST']
    root = RootResource()
    reslist = build_component_list(settings['WEBSERVICE_RESOURCES_BASE'],
                                   settings['WEBSERVICE_RESOURCES'])
    for res_cls in map(load_object, reslist):
        res = res_cls()
        root.putChild(res.ws_name, res)
    server.Site.__init__(self, root, logPath=logfile)
    self.noisy = False
    dispatcher.connect(self.start_listening, signals.engine_started)
    dispatcher.connect(self.stop_listening, signals.engine_stopped)
def load(self):
    """Load middleware defined in settings module"""
    mwlist = build_component_list(settings['SPIDER_MIDDLEWARES_BASE'],
                                  settings['SPIDER_MIDDLEWARES'])
    self.enabled.clear()
    self.disabled.clear()
    for mwpath in mwlist:
        try:
            cls = load_object(mwpath)
            mw = cls()
            self.enabled[cls.__name__] = mw
            self._add_middleware(mw)
        except NotConfigured, e:
            self.disabled[cls.__name__] = mwpath
            if e.args:
                log.msg(e)
def _get_mwlist_from_settings(cls, settings):
    """
    SPIDER_MIDDLEWARES = {}
    SPIDER_MIDDLEWARES_BASE = {
        # Engine side
        'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50,
        'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500,
        'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700,
        'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800,
        'scrapy.contrib.spidermiddleware.depth.DepthMiddleware': 900,
        # Spider side
    }
    """
    return build_component_list(settings['SPIDER_MIDDLEWARES_BASE'],
                                settings['SPIDER_MIDDLEWARES'])
def __init__(self, crawler):
    if not crawler.settings.getbool('WEBSERVICE_ENABLED'):
        raise NotConfigured
    self.crawler = crawler
    logfile = crawler.settings['WEBSERVICE_LOGFILE']
    self.portrange = [int(x) for x in
                      crawler.settings.getlist('WEBSERVICE_PORT')]
    self.host = crawler.settings['WEBSERVICE_HOST']
    root = RootResource(crawler)
    reslist = build_component_list(
        crawler.settings['WEBSERVICE_RESOURCES_BASE'],
        crawler.settings['WEBSERVICE_RESOURCES'])
    for res_cls in map(load_object, reslist):
        res = res_cls(crawler)
        root.putChild(res.ws_name, res)
    server.Site.__init__(self, root, logPath=logfile)
    self.noisy = False
    crawler.signals.connect(self.start_listening, signals.engine_started)
    crawler.signals.connect(self.stop_listening, signals.engine_stopped)
def load(self):
    """Load enabled extensions in settings module"""
    self.loaded = False
    self.enabled.clear()
    self.disabled.clear()
    extlist = build_component_list(settings['EXTENSIONS_BASE'],
                                   settings['EXTENSIONS'])
    for extension_path in extlist:
        try:
            cls = load_object(extension_path)
            self.enabled[cls.__name__] = cls()
        except NotConfigured, e:
            self.disabled[cls.__name__] = extension_path
            if e.args:
                log.msg(e)
def run(self, args, opts):
    # load contracts
    contracts = build_component_list(
        self.settings['SPIDER_CONTRACTS_BASE'],
        self.settings['SPIDER_CONTRACTS'],
    )
    conman = ContractsManager([load_object(c) for c in contracts])
    runner = TextTestRunner(verbosity=2 if opts.verbose else 1)
    result = TextTestResult(runner.stream, runner.descriptions, runner.verbosity)

    # contract requests
    contract_reqs = defaultdict(list)
    spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
    spiders = spman_cls.from_settings(self.settings)

    for spider in args or spiders.list():
        spider = spiders.create(spider)
        requests = self.get_requests(spider, conman, result)
        contract_reqs[spider.name] = []

        if opts.list:
            for req in requests:
                contract_reqs[spider.name].append(req.callback.__name__)
        elif requests:
            crawler = self.crawler_process.create_crawler(spider.name)
            crawler.crawl(spider, requests)

    # start checks
    if opts.list:
        for spider, methods in sorted(contract_reqs.iteritems()):
            if not methods and not opts.verbose:
                continue
            print(spider)
            for method in sorted(methods):
                print(' * %s' % method)
    else:
        start = time.time()
        self.crawler_process.start()
        stop = time.time()

        result.printErrors()
        result.printSummary(start, stop)
        self.exitcode = int(not result.wasSuccessful())
def run_tests(spider, output_file, settings):
    """
    Helper for running contract tests for a spider and writing an XUnit
    file (for CI).

    The HTTP cache is enabled so the tests can run against offline input.
    """
    settings.overrides.update({
        "HTTPCACHE_ENABLED": True,
        "HTTPCACHE_EXPIRATION_SECS": 0,
    })
    crawler = CrawlerProcess(settings)
    contracts = build_component_list(
        crawler.settings['SPIDER_CONTRACTS_BASE'],
        crawler.settings['SPIDER_CONTRACTS'],
    )
    xunit = Xunit()
    xunit.enabled = True
    xunit.configure(AttributeDict(xunit_file=output_file), Config())
    xunit.stopTest = lambda *x: None

    check = CheckCommand()
    check.set_crawler(crawler)
    check.settings = settings
    check.conman = ContractsManager([load_object(c) for c in contracts])
    check.results = xunit

    # these are specially crafted requests that run tests as callbacks
    requests = check.get_requests(spider)

    crawler.install()
    crawler.configure()
    crawler.crawl(spider, requests)
    log.start(loglevel='DEBUG')

    # report is called when the crawler finishes; it creates the XUnit file
    report = lambda: check.results.report(check.results.error_report_file)
    dispatcher.connect(report, signals.engine_stopped)

    crawler.start()
def run(self, args, opts):
    # load contracts
    contracts = build_component_list(
        self.settings['SPIDER_CONTRACTS_BASE'],
        self.settings['SPIDER_CONTRACTS'],
    )
    conman = ContractsManager([load_object(c) for c in contracts])
    runner = TextTestRunner(verbosity=2 if opts.verbose else 1)
    result = TextTestResult(runner.stream, runner.descriptions, runner.verbosity)

    # contract requests
    contract_reqs = defaultdict(list)
    spiders = self.crawler_process.spiders

    for spidername in args or spiders.list():
        spidercls = spiders.load(spidername)
        spidercls.start_requests = lambda s: conman.from_spider(s, result)

        tested_methods = conman.tested_methods_from_spidercls(spidercls)
        if opts.list:
            for method in tested_methods:
                contract_reqs[spidercls.name].append(method)
        elif tested_methods:
            self.crawler_process.crawl(spidercls)

    # start checks
    if opts.list:
        for spider, methods in sorted(contract_reqs.items()):
            if not methods and not opts.verbose:
                continue
            print(spider)
            for method in sorted(methods):
                print(' * %s' % method)
    else:
        start = time.time()
        self.crawler_process.start()
        stop = time.time()

        result.printErrors()
        result.printSummary(start, stop)
        self.exitcode = int(not result.wasSuccessful())
def _get_mwlist_from_settings(cls, settings):
    return build_component_list(settings._getcomposite("ITEM_PIPELINES"))
def _get_mwlist_from_settings(cls, settings):
    return build_component_list(settings.getwithbase('EXTENSIONS'))
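Several of the later call sites use this one-argument form: in Scrapy's Settings API, getwithbase('EXTENSIONS') returns the composition of EXTENSIONS_BASE and EXTENSIONS, so callers no longer pass the two dicts separately. A rough sketch of that merge, using plain dicts rather than Scrapy's priority-aware BaseSettings, and a hypothetical user extension path:

# Rough stand-in for Settings.getwithbase() semantics; assumes plain dicts
# rather than Scrapy's priority-aware BaseSettings.
def getwithbase_sketch(settings, name):
    composed = {}
    composed.update(settings.get(name + '_BASE', {}))
    composed.update(settings.get(name, {}))
    return composed

settings = {
    'EXTENSIONS_BASE': {'scrapy.extensions.corestats.CoreStats': 0},
    'EXTENSIONS': {'myproject.extensions.MyExtension': 100},  # hypothetical
}
# User-level EXTENSIONS entries are layered over the *_BASE defaults.
print(getwithbase_sketch(settings, 'EXTENSIONS'))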
def _get_mwlist_from_settings(cls, settings):
    return build_component_list(settings.getwithbase('ITEM_PIPELINES'))
def test_backwards_compatible_build_dict(self):
    base = {'one': 1, 'two': 2, 'three': 3, 'five': 5, 'six': None}
    custom = {'two': None, 'three': 8, 'four': 4}
    self.assertEqual(
        build_component_list(base, custom, convert=lambda x: x),
        ['one', 'four', 'five', 'three'])
def _get_mwlist_from_settings(cls, settings):
    return build_component_list(settings['SCHEDULER_MIDDLEWARES_BASE'],
                                settings['SCHEDULER_MIDDLEWARES'])
def _get_mwlist_from_settings(cls, settings):
    return build_component_list(settings.getwithbase("EXTENSIONS"))
def _get_mwlist_from_settings(cls, settings):
    return build_component_list(settings['EXTENSIONS_BASE'],
                                settings['EXTENSIONS'])
def _get_mwlist_from_settings(cls, settings):
    return build_component_list(settings.getwithbase('SPIDER_MIDDLEWARES'))
def _get_mwlist_from_settings(cls, settings):
    return build_component_list(settings._getcomposite('EXTENSIONS'))
def test_map_dict(self):
    custom = {'one': 1, 'two': 2, 'three': 3}
    self.assertEqual(
        build_component_list({}, custom, convert=lambda x: x.upper()),
        ['ONE', 'TWO', 'THREE'])
def _get_mwlist_from_settings(cls, settings):
    # Build the spider middleware class list from the SPIDER_MIDDLEWARES_BASE
    # and SPIDER_MIDDLEWARES settings
    return build_component_list(settings.getwithbase('SPIDER_MIDDLEWARES'))
def _get_mwlist_from_settings(cls, settings):
    return build_component_list(settings['SPIDER_MIDDLEWARES_BASE'],
                                settings['SPIDER_MIDDLEWARES'])
def _get_mwlist_from_settings(cls, settings):
    return build_component_list(settings.getwithbase('SPIDER_MIDDLEWARES'))
def test_build_dict(self):
    d = {'one': 1, 'two': None, 'three': 8, 'four': 4}
    self.assertEqual(build_component_list(d, convert=lambda x: x),
                     ['one', 'four', 'three'])
def test_backward_compatible_build_dict(self):
    base = {'one': 1, 'two': 2, 'three': 3, 'five': 5, 'six': None}
    custom = {'two': None, 'three': 8, 'four': 4}
    self.assertEqual(
        build_component_list(base, custom, convert=lambda x: x),
        ['one', 'four', 'five', 'three'])
def test_return_list(self):
    custom = ['a', 'b', 'c']
    self.assertEqual(
        build_component_list(None, custom, convert=lambda x: x),
        custom)
def test_map_list(self):
    custom = ['a', 'b', 'c']
    self.assertEqual(
        build_component_list(None, custom, lambda x: x.upper()),
        ['A', 'B', 'C'])
def _get_mwlist_from_settings(cls, settings):
    return build_component_list(settings._getcomposite("DOWNLOADER_MIDDLEWARES"))
def test_duplicate_components_in_list(self):
    duplicate_list = ['a', 'b', 'a']
    with self.assertRaises(ValueError) as cm:
        build_component_list(None, duplicate_list, convert=lambda x: x)
    self.assertIn(str(duplicate_list), str(cm.exception))
def _get_mwlist_from_settings(cls, settings):
    # Load the ITEM_PIPELINES_BASE and ITEM_PIPELINES classes from the
    # settings; both default to empty
    return build_component_list(settings.getwithbase('ITEM_PIPELINES'))