class CrawlerRunnerTest(unittest.TestCase):

    def setUp(self):
        self.crawler_runner = CrawlerRunner(Settings())

    def tearDown(self):
        return self.crawler_runner.stop()

    @defer.inlineCallbacks
    def test_populate_spidercls_settings(self):
        spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
        project_settings = {'TEST1': 'project', 'TEST3': 'project'}

        class CustomSettingsSpider(DefaultSpider):
            custom_settings = spider_settings

        self.crawler_runner.settings.setdict(project_settings, priority='project')

        d = self.crawler_runner.crawl(CustomSettingsSpider)
        crawler = list(self.crawler_runner.crawlers)[0]
        yield d
        self.assertEqual(crawler.settings.get('TEST1'), 'spider')
        self.assertEqual(crawler.settings.get('TEST2'), 'spider')
        self.assertEqual(crawler.settings.get('TEST3'), 'project')
def handle_lj(self):
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner(crawler_setting)
    # d = runner.crawl(HouseSpider)
    d = runner.crawl(LianjiaHouseSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.

    There are four kinds of spiders: common, ajax, gfw, ajax_gfw.
    If you don't assign any tasks, all these spiders will run.
    """
    maps = CRAWLER_TASK_MAPS if usage == 'crawler' else TEMP_TASK_MAPS
    origin_spiders = DEFAULT_CRAWLERS if usage == 'crawler' else DEFAULT_VALIDATORS

    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass

    if not spiders:
        # crawler_logger.warning('no spider starts up, please check your task input')
        return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
@defer.inlineCallbacks
def test_same_url(self):

    class TestSameUrlSpider(Spider):
        name = 'test_same_url'

        def __init__(self, *args, **kwargs):
            super(TestSameUrlSpider, self).__init__(*args, **kwargs)
            self.visited = 0

        def start_requests(s):
            return self.conman.from_spider(s, self.results)

        def parse_first(self, response):
            self.visited += 1
            return TestItem()

        def parse_second(self, response):
            self.visited += 1
            return TestItem()

    with MockServer() as mockserver:
        contract_doc = '@url {}'.format(mockserver.url('/status?n=200'))
        get_unbound_function(TestSameUrlSpider.parse_first).__doc__ = contract_doc
        get_unbound_function(TestSameUrlSpider.parse_second).__doc__ = contract_doc

        crawler = CrawlerRunner().create_crawler(TestSameUrlSpider)
        yield crawler.crawl()

    self.assertEqual(crawler.spider.visited, 2)
def test_crawler_process(self):
    runner = CrawlerRunner(self.settings)
    d = runner.crawl(CustomSpider)
    d.addBoth(lambda _: reactor.stop())

    # add crawl to redis
    key = "test-spider:istresearch.com:queue"
    self.redis_conn.zadd(key, self.example_feed, -99)

    # run the spider, give 20 seconds to see the url, crawl it,
    # and send to kafka. Then we kill the reactor
    def thread_func():
        time.sleep(20)
        reactor.stop()

    thread = threading.Thread(target=thread_func)
    thread.start()

    reactor.run()

    message_count = 0
    m = next(self.consumer)

    if m is None:
        pass
    else:
        the_dict = json.loads(m.value)
        if the_dict is not None and the_dict['appid'] == 'test' \
                and the_dict['crawlid'] == 'abc12345':
            message_count += 1

    self.assertEqual(message_count, 1)
def run():
    options = {
        'CONCURRENT_ITEMS': 250,
        # 'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }

    spider = EntertainmentcareersSpider()
    settings = get_project_settings()
    settings.update(options)

    runner = CrawlerRunner(settings)
    runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

    # crawler = Crawler(settings)
    # crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    # crawler.install()
    # crawler.configure()
    # crawler.crawl(spider)
    # crawler.start()
    # log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    reactor.run()
def run_spider():
    options = {
        'CONCURRENT_ITEMS': 250,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }

    settings = get_project_settings()
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings.update(options)

    # BookToscrapeSpider basic version
    from tutorial.spiders.booktoscrape_basic import BookToscrapeSpider
    # runner = CrawlerRunner(settings)
    # runner.crawl(BookToscrapeSpider())

    # BookToscrapeSpider crawl version
    from tutorial.spiders.booktoscrape_crawl import BookToscrapeSpider as BookToscrapeSpider_crawl
    runner = CrawlerRunner(settings)
    runner.crawl(BookToscrapeSpider_crawl())

    # crawler = Crawler(settings)
    # crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    # crawler.install()
    # crawler.configure()
    # crawler.crawl(spider)
    # crawler.start()
    # log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def crawl_articles(spids):
    settings = get_project_settings()
    configure_logging(settings, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)

    runner = CrawlerRunner(settings)
    loader = runner.spider_loader
    if 'all' in spids:
        spids = loader.list()
    spiders = [
        loader.load(spid)
        for spid in spids
        if spid in loader.list()
    ]
    if not spiders:
        return

    random.shuffle(spiders)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

    logger.info('crawl job starting...')
    try:
        reactor.run()
    except Exception:
        logger.exception('crawl job got exception:')
    logger.info('crawl job finished')
def run_crawler_by_runner():
    runner = CrawlerRunner(get_project_settings())
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
@defer.inlineCallbacks
def test_start_requests_dupes(self):
    settings = {"CONCURRENT_REQUESTS": 1}
    crawler = CrawlerRunner(settings).create_crawler(DuplicateStartRequestsSpider)
    yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3,
                        mockserver=self.mockserver)
    self.assertEqual(crawler.spider.visited, 6)

    yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4,
                        mockserver=self.mockserver)
    self.assertEqual(crawler.spider.visited, 3)
def crawl(self):
    spider = Scrapy_ModuleSpider()
    Runner = CrawlerRunner(self.Scrapy_Module_setting)
    cra = Runner.crawl(spider)
    # stop reactor when spider closes
    cra.addBoth(lambda _: self.spider_closing(cra))
    self.logger.info("Run reactor")
    reactor.run()
def runSpider(self, spider):
    configure_logging({'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s'})
    settings = Settings()
    settings.set('FEED_URI', 'output.json')
    settings.set('FEED_FORMAT', 'json')

    runner = CrawlerRunner(settings)
    dfd = runner.crawl(spider)
    dfd.addBoth(lambda _: reactor.stop())
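# Hedged usage sketch (not from the original source): runSpider() above only
# schedules the crawl and wires the reactor stop, so a caller still has to
# start the reactor. `job` and MySpider below are illustrative assumptions.
from twisted.internet import reactor

job.runSpider(MySpider)
reactor.run()  # blocks until the scheduled crawl finishes and stops the reactor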
def get_crawler(spidercls=None, settings_dict=None):
    """Return an unconfigured Crawler object. If settings_dict is given, it
    will be used to populate the crawler settings with a project level
    priority.
    """
    from scrapy.crawler import CrawlerRunner
    from scrapy.spiders import Spider

    runner = CrawlerRunner(settings_dict)
    return runner.create_crawler(spidercls or Spider)
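# Hedged usage sketch for the get_crawler() helper above; DemoSpider is a
# throwaway spider defined here for illustration, not part of the original code.
import scrapy

class DemoSpider(scrapy.Spider):
    name = "demo"

crawler = get_crawler(DemoSpider, settings_dict={"DOWNLOAD_DELAY": 1.5})
assert crawler.spidercls is DemoSpider
assert crawler.settings.getfloat("DOWNLOAD_DELAY") == 1.5  # applied at project priority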
@defer.inlineCallbacks
def _test_delay(self, delay, randomize):
    settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
    crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
    yield crawler.crawl(maxlatency=delay * 2, mockserver=self.mockserver)
    t = crawler.spider.times
    totaltime = t[-1] - t[0]
    avgd = totaltime / (len(t) - 1)
    tolerance = 0.6 if randomize else 0.2
    self.assertTrue(avgd > delay * (1 - tolerance),
                    "download delay too small: %s" % avgd)
def test_crawler_runner_loading(self):
    module = 'tests.test_spiderloader.test_spiders.spider1'
    runner = CrawlerRunner({'SPIDER_MODULES': [module]})

    self.assertRaisesRegexp(KeyError, 'Spider not found',
                            runner.create_crawler, 'spider2')

    crawler = runner.create_crawler('spider1')
    self.assertTrue(issubclass(crawler.spidercls, scrapy.Spider))
    self.assertEqual(crawler.spidercls.name, 'spider1')
def runProcess(self):
    configure_logging()
    dbHandler.check_watches()
    runner = CrawlerRunner()
    runner.crawl(spider.available_courses_spider)
    dbHandler.check_watches()
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def run_login_spider(seed_url, username, password, db_name, logfile="results.log"):
    init_db(db_name)
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    d = runner.crawl(LoginFinderSpider, seed_url=seed_url,
                     username=username, password=password)
    d.addBoth(lambda _: reactor.stop())
    log.start(loglevel=log.DEBUG, logfile=logfile)
    log.msg("Item pipelines enabled: %s" % str(settings.get("ITEM_PIPELINES")),
            level=log.INFO)
    reactor.run()
def main():
    locale.setlocale(locale.LC_TIME, 'es_ES')
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()
    d = runner.crawl(LotoSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
    return None
def scrapy_embedding(spidercls):
    settings = get_scrapy_settings()
    # we could create the Crawler manually, but CrawlerRunner does it in a more
    # sophisticated way and adds support for passing spider names as strings
    runner = CrawlerRunner(settings)
    crawler = runner.create_crawler(spidercls)
    crawler.engine = crawler._create_engine()
    crawler.engine.start()
    # log.start(logstdout=False)
    return crawler
def runSpider(host, spider):
    spiders = spider.split(',')
    changeSettings(host)
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    for i in spiders:
        runner.crawl(SPIDER_MATCHER[i.lower()])
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def setUp(self):
    settings = Settings()
    settings.setmodule(undercrawler.settings)
    settings['DOWNLOAD_DELAY'] = 0.1
    settings['ITEM_PIPELINES']['tests.utils.CollectorPipeline'] = 100
    splash_url = os.environ.get('SPLASH_URL')
    if splash_url:
        settings['SPLASH_URL'] = splash_url
    settings.update(self.settings)
    runner = CrawlerRunner(settings)
    self.crawler = runner.create_crawler(BaseSpider)
@defer.inlineCallbacks
def test_crawler_runner_bootstrap_failed(self):
    runner = CrawlerRunner()

    try:
        yield runner.crawl(ExceptionSpider)
    except ValueError:
        pass
    else:
        self.fail('Exception should be raised from spider')

    self.assertEqual(runner.bootstrap_failed, True)
def crawl(self):
    os.environ['SCRAPY_PROJECT'] = '{0}/{1}'.format(BASE_DIR, 'collector')
    runner = CrawlerRunner({
        'LOG_LEVEL': 'WARNING',
        'LOG_FORMATTER': 'collector.collector.utils.PoliteLogFormatter',
        'ITEM_PIPELINES': {
            'collector.collector.pipelines.CodingDuplicatesPipeline': 1,
            'collector.collector.pipelines.CodingPriorityPipeline': 2,
        },
    })
    # runner = CrawlerRunner()
    d = runner.crawl(CodingProjectSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
@defer.inlineCallbacks
def test_timeout_failure(self):
    crawler = CrawlerRunner({"DOWNLOAD_TIMEOUT": 0.35}).create_crawler(DelaySpider)
    yield crawler.crawl(n=0.5, mockserver=self.mockserver)
    self.assertTrue(crawler.spider.t1 > 0)
    self.assertTrue(crawler.spider.t2 == 0)
    self.assertTrue(crawler.spider.t2_err > 0)
    self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)

    # server hangs after receiving response headers
    yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver)
    self.assertTrue(crawler.spider.t1 > 0)
    self.assertTrue(crawler.spider.t2 == 0)
    self.assertTrue(crawler.spider.t2_err > 0)
    self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
def startprocess(queue):
    runner = CrawlerRunner(get_project_settings())
    dfs = set()

    l = runner.crawl(
        'linkspider', website='http://caffe.berkeleyvision.org/',
        domain='berkeleyvision.org'
    ).addCallback(test, queue)  # parameter 1 passed to the callback marks linkspider
    dfs.add(l)
    s = runner.crawl(
        'srcspider', website='http://caffe.berkeleyvision.org/',
        domain='berkeleyvision.org'
    ).addCallback(test, queue)  # parameter 2 passed to the callback marks srcspider
    dfs.add(s)
    c = runner.crawl(
        'codespider', website='http://caffe.berkeleyvision.org/',
        domain='berkeleyvision.org'
    ).addCallback(test, queue)  # parameter 3 passed to the callback marks codespider
    dfs.add(c)

    defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
    # the script will block here until all crawling jobs are finished
    reactor.run()
def __init__(self, settings, spec_manager):
    # twisted base class is old-style so we cannot use super()
    Resource.__init__(self)
    self.spec_manager = spec_manager
    settings.set('PLUGINS', [p['bot'] for p in settings.get('PLUGINS')])
    self.runner = CrawlerRunner(settings)
    log.msg("bot initialized", level=log.DEBUG)
@defer.inlineCallbacks
def run_and_export(self, spider_cls, settings=None):
    """ Run spider with specified settings; return exported data. """
    tmpdir = tempfile.mkdtemp()
    res_name = tmpdir + "/res"
    defaults = {"FEED_URI": "file://" + res_name, "FEED_FORMAT": "csv"}
    defaults.update(settings or {})
    try:
        with MockServer() as s:
            runner = CrawlerRunner(Settings(defaults))
            yield runner.crawl(spider_cls)

        with open(res_name, "rb") as f:
            defer.returnValue(f.read())
    finally:
        shutil.rmtree(tmpdir)
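# Hedged usage sketch for the run_and_export() helper above; ItemSpider and the
# enclosing inlineCallbacks test method are illustrative assumptions, and the
# assertions only check that something parseable was exported.
@defer.inlineCallbacks
def test_export_json(self):
    import json
    data = yield self.run_and_export(ItemSpider, settings={"FEED_FORMAT": "json"})
    self.assertTrue(data)       # something was written to the feed file
    json.loads(data.decode())   # and it is valid JSON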
def _run_feed_spider(url, feed):
    spid = str(uuid.uuid4())
    feed['_id'] = spid

    configure_logging(TEST_SETTINGS, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)

    save_feed(url)
    cls = SpiderFactory.create_spider(feed)
    runner = CrawlerRunner(TEST_SETTINGS)
    d = runner.crawl(cls)
    d.addBoth(lambda _: reactor.stop())
    reactor.run(installSignalHandlers=False)

    n = get_stats([spid])[spid]
    if n == 0:
        raise Exception('feed spider crawled 0 articles')
    if is_exists_spider(url):
        raise Exception(f'feed[{url}] existed')

    del feed['_id']
    save_spider_settings(feed)
class Bot(Resource):
    spider = SlydSpider()

    def __init__(self, settings, spec_manager):
        # twisted base class is old-style so we cannot use super()
        Resource.__init__(self)
        self.spec_manager = spec_manager
        settings.set('PLUGINS', [p['bot'] for p in settings.get('PLUGINS')])
        self.runner = CrawlerRunner(settings)
        log.msg("bot initialized", level=log.DEBUG)

    def keep_spider_alive(self, spider):
        raise DontCloseSpider("keeping it open")

    def stop(self):
        """Stop the crawler"""
        self.runner.stop()
        log.msg("bot stopped", level=log.DEBUG)
def webcrawl(queue, webs, dom):
    website = ''
    domain = ''
    try:
        runner = CrawlerRunner(get_project_settings())
        dfs = set()

        l = runner.crawl(
            'linkspider', website=webs, domain=dom
        ).addCallback(setflag, queue).addErrback(err, queue)  # parameter 1 in the callback marks linkspider
        dfs.add(l)
        s = runner.crawl(
            'srcspider', website=webs, domain=dom
        ).addCallback(setflag, queue).addErrback(err, queue)  # parameter 2 in the callback marks srcspider
        dfs.add(s)
        c = runner.crawl(
            'codespider', website=webs, domain=dom
        ).addCallback(setflag, queue).addErrback(err, queue)  # parameter 3 in the callback marks codespider
        dfs.add(c)

        defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
        # the script will block here until all crawling jobs are finished
        reactor.run()
    except Exception as e:
        print(e)
        while c <= 340:
            keys = response.css("div:nth-child(3)>div:nth-child(" + str(c) + ")>table td>strong::text").extract()
            values = response.css("div:nth-child(3)>div:nth-child(" + str(c) + ")>table td::text").extract()
            c += 2
            s = " "
            s = s.join(values)
            k = "NA"
            k = k.join(keys)
            req_values[k] = s
            # print(keys[0])
            # print(s)
        print(req_values)
        # req_values['Timestamp'] = dateTimeObj.strftime("%d-%b-%Y (%H:%M:%S.%f)")
        # print(req_values)


req_values = dict()

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner()
d = runner.crawl(MySpider)
d.addBoth(lambda _: reactor.stop())
reactor.run()  # the script will block here until the crawling is finished
    start_urls = ['https://basement.redbull.com/university-api/entrants']

    def parse(self, response):
        result = json.loads(response.text)
        List = []
        registrations = result["registrations"]
        for i in range(len(registrations)):
            if registrations[i]["country"] == "Bosnia and Herzegovina":
                List.append(registrations[i])
        sortedList = sorted(List, key=lambda k: k['voteCount'], reverse=True)
        # newList = eval(json.dumps(sortedList))
        print(json.dumps(sortedList))
        sys.stdout.flush()
        # with open('data.json', 'w') as outfile:
        #     json.dump(sortedList, outfile)
        # return sortedList


configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner()
d = runner.crawl(RedbullSpiderSpider)
d.addBoth(lambda _: reactor.stop())
reactor.run()  # the script will block here until the crawling is finished
sleep(1000)
def make_crawler(settings, spider_cls=None, **extra_settings):
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(spider_cls or TestSpider)
def make_crawler(settings, **extra_settings):
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(BaseSpider)
class SpiderParser:
    items = []
    requests = []
    response = None

    def __init__(self, settings, spider, args):
        """
        init parser
        :param settings:
        :param spider:
        :param args:
        """
        self.args = args
        self.spider = spider
        self.crawler_process = CrawlerRunner(settings)
        self.spider_loader = self.crawler_process.spider_loader
        self.spidercls = self.spider_loader.load(self.spider)

    def get_callback(self, request):
        """
        get callback from obj or rules
        :param request:
        :return:
        """
        if getattr(self.spidercls, 'rules', None):
            rules = self.spidercls.rules
            rule_index = request.meta.get('rule', -1)
            if rule_index >= 0 and rule_index < len(rules):
                rule = rules[rule_index]
                return rule.callback
            for rule in rules:
                if rule.link_extractor.matches(request.url):
                    return rule.callback
        return 'parse'

    def run_callback(self, response, cb):
        """
        run callback and get items and requests
        :param response:
        :param cb:
        :return:
        """
        items, requests = [], []
        for x in iterate_spider_output(cb(response)):
            if isinstance(x, (BaseItem, dict)):
                items.append(x)
            elif isinstance(x, Request):
                requests.append(x)
        return items, requests

    def prepare_request(self, spider, request, args):
        """
        get request
        :param spider:
        :param request:
        :param args:
        :return:
        """
        def callback(response):
            """
            callback
            :param response:
            :return:
            """
            request = response.request
            cb = self.args.callback or 'parse'
            if not callable(cb):
                cb_method = getattr(spider, cb, None)
                if callable(cb_method):
                    cb = cb_method
            items, requests = self.run_callback(response, cb)
            # process request callback
            for request in requests:
                request.callback = self.get_callback(request)
                request.meta['callback'] = request.callback
            # process items and requests and response
            self.items += list(map(lambda item: process_item(item), items))
            self.requests += list(
                map(lambda request: process_request(request), requests))
            self.response = process_response(response)

        if args.meta:
            request.meta.update(args.meta)
        # update callback
        request.meta['callback'] = request.callback
        request.callback = callback
        return request

    def run(self):
        """
        run main
        :return:
        """
        request = Request(self.args.url, None)
        start_requests = lambda spider: [
            self.prepare_request(spider, request, self.args)
        ]
        self.spidercls.start_requests = start_requests
        self.crawler_process.crawl(self.spidercls)
        if not len(self.crawler_process.crawlers) > 0:
            return {'ok': False}
        # init pcrawler
        self.pcrawler = list(self.crawler_process.crawlers)[0]
        d = self.crawler_process.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()
        return {
            'items': self.items,
            'requests': self.requests,
            'response': self.response,
            'ok': True,
        }
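# Hedged usage sketch for SpiderParser above; the Namespace fields mirror the
# attributes the class reads (args.url, args.callback, args.meta), and the
# spider name 'example' and URL are assumptions, not from the original project.
from argparse import Namespace
from scrapy.utils.project import get_project_settings

args = Namespace(url='http://quotes.toscrape.com/', callback='parse', meta=None)
parser = SpiderParser(get_project_settings(), 'example', args)
result = parser.run()  # blocks on the reactor, then returns items/requests/response
if result['ok']:
    print(len(result['items']), len(result['requests']))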
from jk_en.sendEmail import *
import os
import sys
# sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# print("***********************", os.path.dirname(os.path.abspath(__file__)))
# os.system('scrapy crawl school_jk_spider')

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

from jk_en.spiders import school_jk_apider

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner()
d = runner.crawl(school_jk_apider.SchoolJkSpiderSpider)
d.addBoth(lambda _: reactor.stop())
reactor.run()

SendEmailMain()
from scrapy.crawler import CrawlerProcess
from spiders.FlatLinkRemodelSpider import FlatLinkRemodelSpider
from spiders.HouseLinkRemodelSpider import HouseLinkRemodelSpider
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()
runner.crawl(FlatLinkRemodelSpider)
runner.crawl(HouseLinkRemodelSpider)
d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()
import sys
import time
import datetime

from flask import Flask, render_template, jsonify, request, redirect, url_for, session, escape
import flask_excel as excel
from scrapy.crawler import CrawlerRunner

sys.path.insert(0, './hotels/travelData/spiders')
from booking import BookingSpider

app = Flask(__name__)
app.secret_key = "Super_secret_key"
excel.init_excel(app)

crawl_runner = CrawlerRunner()
output_data = []
desCity = ''
checkinDate = ''
checkoutDate = ''
room = 1
traveler = 1


@app.route('/')
def index():
    return render_template("index.html")


# After clicking the Submit Button FLASK will come into this
@app.route('/', methods=['POST'])
def __init__(self):
    self.output = {}
    # self.runner = CrawlerProcess(settings={'LOG_ENABLED': False})
    self.runner = CrawlerRunner(settings={'LOG_ENABLED': False})
class CrawlTestCase(TestCase): def setUp(self): self.mockserver = MockServer() self.mockserver.__enter__() self.runner = CrawlerRunner() def tearDown(self): self.mockserver.__exit__(None, None, None) @defer.inlineCallbacks def test_follow_all(self): crawler = self.runner.create_crawler(FollowAllSpider) yield crawler.crawl(mockserver=self.mockserver) self.assertEqual(len(crawler.spider.urls_visited), 11) # 10 + start_url @defer.inlineCallbacks def test_fixed_delay(self): yield self._test_delay(total=3, delay=0.2) @defer.inlineCallbacks def test_randomized_delay(self): yield self._test_delay(total=3, delay=0.1, randomize=True) @defer.inlineCallbacks def _test_delay(self, total, delay, randomize=False): crawl_kwargs = dict( maxlatency=delay * 2, mockserver=self.mockserver, total=total, ) tolerance = (1 - (0.6 if randomize else 0.2)) settings = { "DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize } crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider) yield crawler.crawl(**crawl_kwargs) times = crawler.spider.times total_time = times[-1] - times[0] average = total_time / (len(times) - 1) self.assertTrue(average > delay * tolerance, f"download delay too small: {average}") # Ensure that the same test parameters would cause a failure if no # download delay is set. Otherwise, it means we are using a combination # of ``total`` and ``delay`` values that are too small for the test # code above to have any meaning. settings["DOWNLOAD_DELAY"] = 0 crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider) yield crawler.crawl(**crawl_kwargs) times = crawler.spider.times total_time = times[-1] - times[0] average = total_time / (len(times) - 1) self.assertFalse(average > delay / tolerance, "test total or delay values are too small") @defer.inlineCallbacks def test_timeout_success(self): crawler = self.runner.create_crawler(DelaySpider) yield crawler.crawl(n=0.5, mockserver=self.mockserver) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 > 0) self.assertTrue(crawler.spider.t2 > crawler.spider.t1) @defer.inlineCallbacks def test_timeout_failure(self): crawler = CrawlerRunner({ "DOWNLOAD_TIMEOUT": 0.35 }).create_crawler(DelaySpider) yield crawler.crawl(n=0.5, mockserver=self.mockserver) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 == 0) self.assertTrue(crawler.spider.t2_err > 0) self.assertTrue(crawler.spider.t2_err > crawler.spider.t1) # server hangs after receiving response headers yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 == 0) self.assertTrue(crawler.spider.t2_err > 0) self.assertTrue(crawler.spider.t2_err > crawler.spider.t1) @defer.inlineCallbacks def test_retry_503(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/status?n=503"), mockserver=self.mockserver) self._assert_retried(log) @defer.inlineCallbacks def test_retry_conn_failed(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as log: yield crawler.crawl("http://localhost:65432/status?n=503", mockserver=self.mockserver) self._assert_retried(log) @defer.inlineCallbacks def test_retry_dns_error(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as log: # try to fetch the homepage of a non-existent domain yield crawler.crawl("http://dns.resolution.invalid./", mockserver=self.mockserver) self._assert_retried(log) @defer.inlineCallbacks def 
test_start_requests_bug_before_yield(self): with LogCapture('scrapy', level=logging.ERROR) as log: crawler = self.runner.create_crawler(BrokenStartRequestsSpider) yield crawler.crawl(fail_before_yield=1, mockserver=self.mockserver) self.assertEqual(len(log.records), 1) record = log.records[0] self.assertIsNotNone(record.exc_info) self.assertIs(record.exc_info[0], ZeroDivisionError) @defer.inlineCallbacks def test_start_requests_bug_yielding(self): with LogCapture('scrapy', level=logging.ERROR) as log: crawler = self.runner.create_crawler(BrokenStartRequestsSpider) yield crawler.crawl(fail_yielding=1, mockserver=self.mockserver) self.assertEqual(len(log.records), 1) record = log.records[0] self.assertIsNotNone(record.exc_info) self.assertIs(record.exc_info[0], ZeroDivisionError) @defer.inlineCallbacks def test_start_requests_lazyness(self): settings = {"CONCURRENT_REQUESTS": 1} crawler = CrawlerRunner(settings).create_crawler( BrokenStartRequestsSpider) yield crawler.crawl(mockserver=self.mockserver) self.assertTrue( crawler.spider.seedsseen.index(None) < crawler.spider.seedsseen.index(99), crawler.spider.seedsseen) @defer.inlineCallbacks def test_start_requests_dupes(self): settings = {"CONCURRENT_REQUESTS": 1} crawler = CrawlerRunner(settings).create_crawler( DuplicateStartRequestsSpider) yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3, mockserver=self.mockserver) self.assertEqual(crawler.spider.visited, 6) yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4, mockserver=self.mockserver) self.assertEqual(crawler.spider.visited, 3) @defer.inlineCallbacks def test_unbounded_response(self): # Completeness of responses without Content-Length or Transfer-Encoding # can not be determined, we treat them as valid but flagged as "partial" from urllib.parse import urlencode query = urlencode({ 'raw': '''\ HTTP/1.1 200 OK Server: Apache-Coyote/1.1 X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0 Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/ Pragma: no-cache Expires: Thu, 01 Jan 1970 00:00:00 GMT Cache-Control: no-cache Cache-Control: no-store Content-Type: text/html;charset=UTF-8 Content-Language: en Date: Tue, 27 Aug 2013 13:05:05 GMT Connection: close foo body with multiples lines ''' }) crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as log: yield crawler.crawl(self.mockserver.url(f"/raw?{query}"), mockserver=self.mockserver) self.assertEqual(str(log).count("Got response 200"), 1) @defer.inlineCallbacks def test_retry_conn_lost(self): # connection lost after receiving data crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/drop?abort=0"), mockserver=self.mockserver) self._assert_retried(log) @defer.inlineCallbacks def test_retry_conn_aborted(self): # connection lost before receiving data crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/drop?abort=1"), mockserver=self.mockserver) self._assert_retried(log) def _assert_retried(self, log): self.assertEqual(str(log).count("Retrying"), 2) self.assertEqual(str(log).count("Gave up retrying"), 1) @defer.inlineCallbacks def test_referer_header(self): """Referer header is set by RefererMiddleware unless it is already set""" req0 = Request(self.mockserver.url('/echo?headers=1&body=0'), dont_filter=1) req1 = req0.replace() req2 = req0.replace(headers={'Referer': None}) req3 = 
req0.replace(headers={'Referer': 'http://example.com'}) req0.meta['next'] = req1 req1.meta['next'] = req2 req2.meta['next'] = req3 crawler = self.runner.create_crawler(SingleRequestSpider) yield crawler.crawl(seed=req0, mockserver=self.mockserver) # basic asserts in case of weird communication errors self.assertIn('responses', crawler.spider.meta) self.assertNotIn('failures', crawler.spider.meta) # start requests doesn't set Referer header echo0 = json.loads(to_unicode( crawler.spider.meta['responses'][2].body)) self.assertNotIn('Referer', echo0['headers']) # following request sets Referer to start request url echo1 = json.loads(to_unicode( crawler.spider.meta['responses'][1].body)) self.assertEqual(echo1['headers'].get('Referer'), [req0.url]) # next request avoids Referer header echo2 = json.loads(to_unicode( crawler.spider.meta['responses'][2].body)) self.assertNotIn('Referer', echo2['headers']) # last request explicitly sets a Referer header echo3 = json.loads(to_unicode( crawler.spider.meta['responses'][3].body)) self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com']) @defer.inlineCallbacks def test_engine_status(self): from scrapy.utils.engine import get_engine_status est = [] def cb(response): est.append(get_engine_status(crawler.engine)) crawler = self.runner.create_crawler(SingleRequestSpider) yield crawler.crawl(seed=self.mockserver.url('/'), callback_func=cb, mockserver=self.mockserver) self.assertEqual(len(est), 1, est) s = dict(est[0]) self.assertEqual(s['engine.spider.name'], crawler.spider.name) self.assertEqual(s['len(engine.scraper.slot.active)'], 1) @defer.inlineCallbacks def test_format_engine_status(self): from scrapy.utils.engine import format_engine_status est = [] def cb(response): est.append(format_engine_status(crawler.engine)) crawler = self.runner.create_crawler(SingleRequestSpider) yield crawler.crawl(seed=self.mockserver.url('/'), callback_func=cb, mockserver=self.mockserver) self.assertEqual(len(est), 1, est) est = est[0].split("\n")[2:-2] # remove header & footer # convert to dict est = [x.split(":") for x in est] est = [x for sublist in est for x in sublist] # flatten est = [x.lstrip().rstrip() for x in est] it = iter(est) s = dict(zip(it, it)) self.assertEqual(s['engine.spider.name'], crawler.spider.name) self.assertEqual(s['len(engine.scraper.slot.active)'], '1') @defer.inlineCallbacks def test_graceful_crawl_error_handling(self): """ Test whether errors happening anywhere in Crawler.crawl() are properly reported (and not somehow swallowed) after a graceful engine shutdown. The errors should not come from within Scrapy's core but from within spiders/middlewares/etc., e.g. raised in Spider.start_requests(), SpiderMiddleware.process_start_requests(), etc. 
""" class TestError(Exception): pass class FaultySpider(SimpleSpider): def start_requests(self): raise TestError crawler = self.runner.create_crawler(FaultySpider) yield self.assertFailure(crawler.crawl(mockserver=self.mockserver), TestError) self.assertFalse(crawler.crawling) @defer.inlineCallbacks def test_open_spider_error_on_faulty_pipeline(self): settings = { "ITEM_PIPELINES": { "tests.pipelines.ZeroDivisionErrorPipeline": 300, } } crawler = CrawlerRunner(settings).create_crawler(SimpleSpider) yield self.assertFailure( self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver), ZeroDivisionError) self.assertFalse(crawler.crawling) @defer.inlineCallbacks def test_crawlerrunner_accepts_crawler(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as log: yield self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.assertIn("Got response 200", str(log)) @defer.inlineCallbacks def test_crawl_multiple(self): self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=503"), mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self._assert_retried(log) self.assertIn("Got response 200", str(log))
class CrawlSpiderTestCase(TestCase): def setUp(self): self.mockserver = MockServer() self.mockserver.__enter__() self.runner = CrawlerRunner() def tearDown(self): self.mockserver.__exit__(None, None, None) @defer.inlineCallbacks def _run_spider(self, spider_cls): items = [] def _on_item_scraped(item): items.append(item) crawler = self.runner.create_crawler(spider_cls) crawler.signals.connect(_on_item_scraped, signals.item_scraped) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/status?n=200"), mockserver=self.mockserver) return log, items, crawler.stats @defer.inlineCallbacks def test_crawlspider_with_parse(self): self.runner.crawl(CrawlSpiderWithParseMethod, mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self.assertIn("[parse] status 200 (foo: None)", str(log)) self.assertIn("[parse] status 201 (foo: None)", str(log)) self.assertIn("[parse] status 202 (foo: bar)", str(log)) @defer.inlineCallbacks def test_crawlspider_with_errback(self): self.runner.crawl(CrawlSpiderWithErrback, mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self.assertIn("[parse] status 200 (foo: None)", str(log)) self.assertIn("[parse] status 201 (foo: None)", str(log)) self.assertIn("[parse] status 202 (foo: bar)", str(log)) self.assertIn("[errback] status 404", str(log)) self.assertIn("[errback] status 500", str(log)) self.assertIn("[errback] status 501", str(log)) @defer.inlineCallbacks def test_async_def_parse(self): self.runner.crawl(AsyncDefSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self.assertIn("Got response 200", str(log)) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncio_parse(self): runner = CrawlerRunner({ "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor" }) runner.crawl(AsyncDefAsyncioSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) with LogCapture() as log: yield runner.join() self.assertIn("Got response 200", str(log)) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncio_parse_items_list(self): log, items, _ = yield self._run_spider(AsyncDefAsyncioReturnSpider) self.assertIn("Got response 200", str(log)) self.assertIn({'id': 1}, items) self.assertIn({'id': 2}, items) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncio_parse_items_single_element(self): items = [] def _on_item_scraped(item): items.append(item) crawler = self.runner.create_crawler( AsyncDefAsyncioReturnSingleElementSpider) crawler.signals.connect(_on_item_scraped, signals.item_scraped) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.assertIn("Got response 200", str(log)) self.assertIn({"foo": 42}, items) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncgen_parse(self): log, _, stats = yield self._run_spider(AsyncDefAsyncioGenSpider) self.assertIn("Got response 200", str(log)) itemcount = stats.get_value('item_scraped_count') self.assertEqual(itemcount, 1) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncgen_parse_loop(self): log, items, stats = yield self._run_spider( AsyncDefAsyncioGenLoopSpider) self.assertIn("Got response 200", str(log)) itemcount = stats.get_value('item_scraped_count') self.assertEqual(itemcount, 10) for i in range(10): self.assertIn({'foo': i}, items) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncgen_parse_complex(self): _, 
items, stats = yield self._run_spider( AsyncDefAsyncioGenComplexSpider) itemcount = stats.get_value('item_scraped_count') self.assertEqual(itemcount, 156) # some random items for i in [1, 4, 21, 22, 207, 311]: self.assertIn({'index': i}, items) for i in [10, 30, 122]: self.assertIn({'index2': i}, items) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncio_parse_reqs_list(self): log, *_ = yield self._run_spider(AsyncDefAsyncioReqsReturnSpider) for req_id in range(3): self.assertIn(f"Got response 200, req_id {req_id}", str(log)) @defer.inlineCallbacks def test_response_ssl_certificate_none(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url("/echo?body=test", is_secure=False) yield crawler.crawl(seed=url, mockserver=self.mockserver) self.assertIsNone(crawler.spider.meta['responses'][0].certificate) @defer.inlineCallbacks def test_response_ssl_certificate(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url("/echo?body=test", is_secure=True) yield crawler.crawl(seed=url, mockserver=self.mockserver) cert = crawler.spider.meta['responses'][0].certificate self.assertIsInstance(cert, Certificate) self.assertEqual(cert.getSubject().commonName, b"localhost") self.assertEqual(cert.getIssuer().commonName, b"localhost") @mark.xfail( reason="Responses with no body return early and contain no certificate" ) @defer.inlineCallbacks def test_response_ssl_certificate_empty_response(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url("/status?n=200", is_secure=True) yield crawler.crawl(seed=url, mockserver=self.mockserver) cert = crawler.spider.meta['responses'][0].certificate self.assertIsInstance(cert, Certificate) self.assertEqual(cert.getSubject().commonName, b"localhost") self.assertEqual(cert.getIssuer().commonName, b"localhost") @defer.inlineCallbacks def test_dns_server_ip_address_none(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url('/status?n=200') yield crawler.crawl(seed=url, mockserver=self.mockserver) ip_address = crawler.spider.meta['responses'][0].ip_address self.assertIsNone(ip_address) @defer.inlineCallbacks def test_dns_server_ip_address(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url('/echo?body=test') expected_netloc, _ = urlparse(url).netloc.split(':') yield crawler.crawl(seed=url, mockserver=self.mockserver) ip_address = crawler.spider.meta['responses'][0].ip_address self.assertIsInstance(ip_address, IPv4Address) self.assertEqual(str(ip_address), gethostbyname(expected_netloc)) @defer.inlineCallbacks def test_bytes_received_stop_download_callback(self): crawler = self.runner.create_crawler(BytesReceivedCallbackSpider) yield crawler.crawl(mockserver=self.mockserver) self.assertIsNone(crawler.spider.meta.get("failure")) self.assertIsInstance(crawler.spider.meta["response"], Response) self.assertEqual(crawler.spider.meta["response"].body, crawler.spider.meta.get("bytes_received")) self.assertLess(len(crawler.spider.meta["response"].body), crawler.spider.full_response_length) @defer.inlineCallbacks def test_bytes_received_stop_download_errback(self): crawler = self.runner.create_crawler(BytesReceivedErrbackSpider) yield crawler.crawl(mockserver=self.mockserver) self.assertIsNone(crawler.spider.meta.get("response")) self.assertIsInstance(crawler.spider.meta["failure"], Failure) self.assertIsInstance(crawler.spider.meta["failure"].value, StopDownload) 
self.assertIsInstance(crawler.spider.meta["failure"].value.response, Response) self.assertEqual(crawler.spider.meta["failure"].value.response.body, crawler.spider.meta.get("bytes_received")) self.assertLess( len(crawler.spider.meta["failure"].value.response.body), crawler.spider.full_response_length) @defer.inlineCallbacks def test_headers_received_stop_download_callback(self): crawler = self.runner.create_crawler(HeadersReceivedCallbackSpider) yield crawler.crawl(mockserver=self.mockserver) self.assertIsNone(crawler.spider.meta.get("failure")) self.assertIsInstance(crawler.spider.meta["response"], Response) self.assertEqual(crawler.spider.meta["response"].headers, crawler.spider.meta.get("headers_received")) @defer.inlineCallbacks def test_headers_received_stop_download_errback(self): crawler = self.runner.create_crawler(HeadersReceivedErrbackSpider) yield crawler.crawl(mockserver=self.mockserver) self.assertIsNone(crawler.spider.meta.get("response")) self.assertIsInstance(crawler.spider.meta["failure"], Failure) self.assertIsInstance(crawler.spider.meta["failure"].value, StopDownload) self.assertIsInstance(crawler.spider.meta["failure"].value.response, Response) self.assertEqual(crawler.spider.meta["failure"].value.response.headers, crawler.spider.meta.get("headers_received"))
class BookCrawler:
    """Crawl the provided book, exporting the crawled images"""
    exporter = None

    def __init__(self, provider: CrawlSpider, slug: str, output_dir: str, verbose=False):
        self.provider = provider
        self.slug = slug
        self.output_dir = output_dir
        self.verbose = verbose
        if verbose:
            configure_logging()
        self.runner = CrawlerRunner()

    def get_volumes_links(self) -> Iterable[str]:
        """Get the available list of volumes links for the wanted book slug"""
        scraper = cfscrape.create_scraper()
        book_url = (f'http://{self.provider.allowed_domains[0]}/'
                    f'{self.provider.url_key}/{self.slug}')
        response = scraper.get(book_url)
        document = document_fromstring(response.text)
        volume_elements = document.xpath(
            '//table[@class="listing"]//tr[position()>2]')
        if not volume_elements:
            raise BookScrapeException('No volumes found for the "%s" slug'
                                      % self.slug)

        volumes = []
        for index, volume_element in enumerate(volume_elements):
            volume_link = volume_element.xpath('./td[1]/a/@href')[0]
            volumes.append(volume_link)
        volumes.reverse()
        return volumes

    def run(self, volume_start: int, volume_end: int):
        self.exporter = PdfExporter(
            self.output_dir,
            os.path.join(self.output_dir, 'images'),
            file_name='%s_%s.pdf' % (self.slug, '-'.join(
                [str(volume_start), str(volume_end)])))
        logger.info(
            'Crawling started for the book slug "%s" on the "%s" provider.',
            self.slug, self.provider.name)
        volumes_list = list(range(volume_start, volume_end + 1))
        self.crawl(volumes_list)
        reactor.run()

    @staticmethod
    def _on_error(failure):
        if isinstance(failure.value, BookScrapeException):
            logger.error(str(failure.value))
        else:
            logger.error(failure)

    def _get_crawler(self) -> Crawler:
        crawler = Crawler(self.provider, settings={
            'IMAGES_STORE': os.path.join(self.output_dir, 'images'),
            **SETTINGS
        })
        crawler.signals.connect(self._on_error, signals.spider_error)
        crawler.signals.connect(self.exporter.export_item, signals.item_scraped)
        return crawler

    @defer.inlineCallbacks
    def crawl(self, volumes: Iterable[int]):
        yield self.runner.crawl(self._get_crawler(),
                                book_slug=self.slug, volumes=volumes)
        reactor.stop()
        self.exporter.finish_exporting()
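# Hedged usage sketch for BookCrawler above; MangaProvider is an assumed
# CrawlSpider subclass exposing allowed_domains, url_key and name, which the
# class relies on but which are not shown in the original snippet.
crawler = BookCrawler(MangaProvider, slug='some-book', output_dir='./out', verbose=True)
print(crawler.get_volumes_links())         # volume links discovered for the slug
crawler.run(volume_start=1, volume_end=3)  # crawl volumes 1-3 and export the PDF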
from scrapy.crawler import CrawlerProcess
from spiders.FlatSpider import FlatSpider
from spiders.HouseSpider import HouseSpider
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()
runner.crawl(FlatSpider)
runner.crawl(HouseSpider)
d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()
class CrawlTestCase(TestCase): def setUp(self): self.mockserver = MockServer() self.mockserver.__enter__() self.runner = CrawlerRunner() def tearDown(self): self.mockserver.__exit__(None, None, None) @defer.inlineCallbacks def test_follow_all(self): crawler = self.runner.create_crawler(FollowAllSpider) yield crawler.crawl(mockserver=self.mockserver) self.assertEqual(len(crawler.spider.urls_visited), 11) # 10 + start_url @defer.inlineCallbacks def test_fixed_delay(self): yield self._test_delay(total=3, delay=0.2) @defer.inlineCallbacks def test_randomized_delay(self): yield self._test_delay(total=3, delay=0.1, randomize=True) @defer.inlineCallbacks def _test_delay(self, total, delay, randomize=False): crawl_kwargs = dict( maxlatency=delay * 2, mockserver=self.mockserver, total=total, ) tolerance = (1 - (0.6 if randomize else 0.2)) settings = { "DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize } crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider) yield crawler.crawl(**crawl_kwargs) times = crawler.spider.times total_time = times[-1] - times[0] average = total_time / (len(times) - 1) self.assertTrue(average > delay * tolerance, "download delay too small: %s" % average) # Ensure that the same test parameters would cause a failure if no # download delay is set. Otherwise, it means we are using a combination # of ``total`` and ``delay`` values that are too small for the test # code above to have any meaning. settings["DOWNLOAD_DELAY"] = 0 crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider) yield crawler.crawl(**crawl_kwargs) times = crawler.spider.times total_time = times[-1] - times[0] average = total_time / (len(times) - 1) self.assertFalse(average > delay / tolerance, "test total or delay values are too small") @defer.inlineCallbacks def test_timeout_success(self): crawler = self.runner.create_crawler(DelaySpider) yield crawler.crawl(n=0.5, mockserver=self.mockserver) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 > 0) self.assertTrue(crawler.spider.t2 > crawler.spider.t1) @defer.inlineCallbacks def test_timeout_failure(self): crawler = CrawlerRunner({ "DOWNLOAD_TIMEOUT": 0.35 }).create_crawler(DelaySpider) yield crawler.crawl(n=0.5, mockserver=self.mockserver) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 == 0) self.assertTrue(crawler.spider.t2_err > 0) self.assertTrue(crawler.spider.t2_err > crawler.spider.t1) # server hangs after receiving response headers yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 == 0) self.assertTrue(crawler.spider.t2_err > 0) self.assertTrue(crawler.spider.t2_err > crawler.spider.t1) @defer.inlineCallbacks def test_retry_503(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/status?n=503"), mockserver=self.mockserver) self._assert_retried(log) @defer.inlineCallbacks def test_retry_conn_failed(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as log: yield crawler.crawl("http://localhost:65432/status?n=503", mockserver=self.mockserver) self._assert_retried(log) @defer.inlineCallbacks def test_retry_dns_error(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as log: # try to fetch the homepage of a non-existent domain yield crawler.crawl("http://dns.resolution.invalid./", mockserver=self.mockserver) self._assert_retried(log) @defer.inlineCallbacks def 
test_start_requests_bug_before_yield(self): with LogCapture('scrapy', level=logging.ERROR) as log: crawler = self.runner.create_crawler(BrokenStartRequestsSpider) yield crawler.crawl(fail_before_yield=1, mockserver=self.mockserver) self.assertEqual(len(log.records), 1) record = log.records[0] self.assertIsNotNone(record.exc_info) self.assertIs(record.exc_info[0], ZeroDivisionError) @defer.inlineCallbacks def test_start_requests_bug_yielding(self): with LogCapture('scrapy', level=logging.ERROR) as log: crawler = self.runner.create_crawler(BrokenStartRequestsSpider) yield crawler.crawl(fail_yielding=1, mockserver=self.mockserver) self.assertEqual(len(log.records), 1) record = log.records[0] self.assertIsNotNone(record.exc_info) self.assertIs(record.exc_info[0], ZeroDivisionError) @defer.inlineCallbacks def test_start_requests_lazyness(self): settings = {"CONCURRENT_REQUESTS": 1} crawler = CrawlerRunner(settings).create_crawler( BrokenStartRequestsSpider) yield crawler.crawl(mockserver=self.mockserver) self.assertTrue( crawler.spider.seedsseen.index(None) < crawler.spider.seedsseen.index(99), crawler.spider.seedsseen) @defer.inlineCallbacks def test_start_requests_dupes(self): settings = {"CONCURRENT_REQUESTS": 1} crawler = CrawlerRunner(settings).create_crawler( DuplicateStartRequestsSpider) yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3, mockserver=self.mockserver) self.assertEqual(crawler.spider.visited, 6) yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4, mockserver=self.mockserver) self.assertEqual(crawler.spider.visited, 3) @defer.inlineCallbacks def test_unbounded_response(self): # Completeness of responses without Content-Length or Transfer-Encoding # can not be determined, we treat them as valid but flagged as "partial" from urllib.parse import urlencode query = urlencode({ 'raw': '''\ HTTP/1.1 200 OK Server: Apache-Coyote/1.1 X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0 Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/ Pragma: no-cache Expires: Thu, 01 Jan 1970 00:00:00 GMT Cache-Control: no-cache Cache-Control: no-store Content-Type: text/html;charset=UTF-8 Content-Language: en Date: Tue, 27 Aug 2013 13:05:05 GMT Connection: close foo body with multiples lines ''' }) crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/raw?{0}".format(query)), mockserver=self.mockserver) self.assertEqual(str(log).count("Got response 200"), 1) @defer.inlineCallbacks def test_retry_conn_lost(self): # connection lost after receiving data crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/drop?abort=0"), mockserver=self.mockserver) self._assert_retried(log) @defer.inlineCallbacks def test_retry_conn_aborted(self): # connection lost before receiving data crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/drop?abort=1"), mockserver=self.mockserver) self._assert_retried(log) def _assert_retried(self, log): self.assertEqual(str(log).count("Retrying"), 2) self.assertEqual(str(log).count("Gave up retrying"), 1) @defer.inlineCallbacks def test_referer_header(self): """Referer header is set by RefererMiddleware unless it is already set""" req0 = Request(self.mockserver.url('/echo?headers=1&body=0'), dont_filter=1) req1 = req0.replace() req2 = req0.replace(headers={'Referer': None}) req3 = 
req0.replace(headers={'Referer': 'http://example.com'}) req0.meta['next'] = req1 req1.meta['next'] = req2 req2.meta['next'] = req3 crawler = self.runner.create_crawler(SingleRequestSpider) yield crawler.crawl(seed=req0, mockserver=self.mockserver) # basic asserts in case of weird communication errors self.assertIn('responses', crawler.spider.meta) self.assertNotIn('failures', crawler.spider.meta) # start requests doesn't set Referer header echo0 = json.loads(to_unicode( crawler.spider.meta['responses'][2].body)) self.assertNotIn('Referer', echo0['headers']) # following request sets Referer to start request url echo1 = json.loads(to_unicode( crawler.spider.meta['responses'][1].body)) self.assertEqual(echo1['headers'].get('Referer'), [req0.url]) # next request avoids Referer header echo2 = json.loads(to_unicode( crawler.spider.meta['responses'][2].body)) self.assertNotIn('Referer', echo2['headers']) # last request explicitly sets a Referer header echo3 = json.loads(to_unicode( crawler.spider.meta['responses'][3].body)) self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com']) @defer.inlineCallbacks def test_engine_status(self): from scrapy.utils.engine import get_engine_status est = [] def cb(response): est.append(get_engine_status(crawler.engine)) crawler = self.runner.create_crawler(SingleRequestSpider) yield crawler.crawl(seed=self.mockserver.url('/'), callback_func=cb, mockserver=self.mockserver) self.assertEqual(len(est), 1, est) s = dict(est[0]) self.assertEqual(s['engine.spider.name'], crawler.spider.name) self.assertEqual(s['len(engine.scraper.slot.active)'], 1) @defer.inlineCallbacks def test_graceful_crawl_error_handling(self): """ Test whether errors happening anywhere in Crawler.crawl() are properly reported (and not somehow swallowed) after a graceful engine shutdown. The errors should not come from within Scrapy's core but from within spiders/middlewares/etc., e.g. raised in Spider.start_requests(), SpiderMiddleware.process_start_requests(), etc. 
""" class TestError(Exception): pass class FaultySpider(SimpleSpider): def start_requests(self): raise TestError crawler = self.runner.create_crawler(FaultySpider) yield self.assertFailure(crawler.crawl(mockserver=self.mockserver), TestError) self.assertFalse(crawler.crawling) @defer.inlineCallbacks def test_open_spider_error_on_faulty_pipeline(self): settings = { "ITEM_PIPELINES": { "tests.pipelines.ZeroDivisionErrorPipeline": 300, } } crawler = CrawlerRunner(settings).create_crawler(SimpleSpider) yield self.assertFailure( self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver), ZeroDivisionError) self.assertFalse(crawler.crawling) @defer.inlineCallbacks def test_crawlerrunner_accepts_crawler(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as log: yield self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.assertIn("Got response 200", str(log)) @defer.inlineCallbacks def test_crawl_multiple(self): self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=503"), mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self._assert_retried(log) self.assertIn("Got response 200", str(log)) @defer.inlineCallbacks def test_crawlspider_with_errback(self): self.runner.crawl(CrawlSpiderWithErrback, mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self.assertIn("[callback] status 200", str(log)) self.assertIn("[callback] status 201", str(log)) self.assertIn("[errback] status 404", str(log)) self.assertIn("[errback] status 500", str(log)) @defer.inlineCallbacks def test_async_def_parse(self): self.runner.crawl(AsyncDefSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self.assertIn("Got response 200", str(log)) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncio_parse(self): runner = CrawlerRunner({ "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor" }) runner.crawl(AsyncDefAsyncioSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) with LogCapture() as log: yield runner.join() self.assertIn("Got response 200", str(log)) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncio_parse_items_list(self): items = [] def _on_item_scraped(item): items.append(item) crawler = self.runner.create_crawler(AsyncDefAsyncioReturnSpider) crawler.signals.connect(_on_item_scraped, signals.item_scraped) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.assertIn("Got response 200", str(log)) self.assertIn({'id': 1}, items) self.assertIn({'id': 2}, items) @mark.skipif(sys.version_info < (3, 6), reason="Async generators require Python 3.6 or higher") @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncgen_parse(self): from tests.py36._test_crawl import AsyncDefAsyncioGenSpider crawler = self.runner.create_crawler(AsyncDefAsyncioGenSpider) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.assertIn("Got response 200", str(log)) itemcount = crawler.stats.get_value('item_scraped_count') self.assertEqual(itemcount, 1) @mark.skipif(sys.version_info < (3, 6), reason="Async generators require Python 3.6 or higher") @mark.only_asyncio() @defer.inlineCallbacks def 
test_async_def_asyncgen_parse_loop(self): items = [] def _on_item_scraped(item): items.append(item) from tests.py36._test_crawl import AsyncDefAsyncioGenLoopSpider crawler = self.runner.create_crawler(AsyncDefAsyncioGenLoopSpider) crawler.signals.connect(_on_item_scraped, signals.item_scraped) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.assertIn("Got response 200", str(log)) itemcount = crawler.stats.get_value('item_scraped_count') self.assertEqual(itemcount, 10) for i in range(10): self.assertIn({'foo': i}, items) @mark.skipif(sys.version_info < (3, 6), reason="Async generators require Python 3.6 or higher") @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncgen_parse_complex(self): items = [] def _on_item_scraped(item): items.append(item) from tests.py36._test_crawl import AsyncDefAsyncioGenComplexSpider crawler = self.runner.create_crawler(AsyncDefAsyncioGenComplexSpider) crawler.signals.connect(_on_item_scraped, signals.item_scraped) yield crawler.crawl(mockserver=self.mockserver) itemcount = crawler.stats.get_value('item_scraped_count') self.assertEqual(itemcount, 156) # some random items for i in [1, 4, 21, 22, 207, 311]: self.assertIn({'index': i}, items) for i in [10, 30, 122]: self.assertIn({'index2': i}, items) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncio_parse_reqs_list(self): crawler = self.runner.create_crawler(AsyncDefAsyncioReqsReturnSpider) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/status?n=200"), mockserver=self.mockserver) for req_id in range(3): self.assertIn("Got response 200, req_id %d" % req_id, str(log)) @defer.inlineCallbacks def test_response_ssl_certificate_none(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url("/echo?body=test", is_secure=False) yield crawler.crawl(seed=url, mockserver=self.mockserver) self.assertIsNone(crawler.spider.meta['responses'][0].certificate) @defer.inlineCallbacks def test_response_ssl_certificate(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url("/echo?body=test", is_secure=True) yield crawler.crawl(seed=url, mockserver=self.mockserver) cert = crawler.spider.meta['responses'][0].certificate self.assertIsInstance(cert, Certificate) self.assertEqual(cert.getSubject().commonName, b"localhost") self.assertEqual(cert.getIssuer().commonName, b"localhost") @mark.xfail( reason="Responses with no body return early and contain no certificate" ) @defer.inlineCallbacks def test_response_ssl_certificate_empty_response(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url("/status?n=200", is_secure=True) yield crawler.crawl(seed=url, mockserver=self.mockserver) cert = crawler.spider.meta['responses'][0].certificate self.assertIsInstance(cert, Certificate) self.assertEqual(cert.getSubject().commonName, b"localhost") self.assertEqual(cert.getIssuer().commonName, b"localhost") @defer.inlineCallbacks def test_dns_server_ip_address_none(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url('/status?n=200') yield crawler.crawl(seed=url, mockserver=self.mockserver) ip_address = crawler.spider.meta['responses'][0].ip_address self.assertIsNone(ip_address) @defer.inlineCallbacks def test_dns_server_ip_address(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url('/echo?body=test') expected_netloc, _ = 
urlparse(url).netloc.split(':') yield crawler.crawl(seed=url, mockserver=self.mockserver) ip_address = crawler.spider.meta['responses'][0].ip_address self.assertIsInstance(ip_address, IPv4Address) self.assertEqual(str(ip_address), gethostbyname(expected_netloc))
from orangespider.models import ArticleRule, BookRule
from orangespider.spiders.article_spider import ArticleSpider
from orangespider.spiders.book_spider import BookSpider

if __name__ == '__main__':
    settings = get_project_settings()
    configure_logging(settings)
    db = db_connect()
    Session = sessionmaker(bind=db)
    session = Session()
    # Load ArticleRule
    article_rules = session.query(ArticleRule).filter(ArticleRule.enable == 1).all()
    # Load BookRule
    book_rules = session.query(BookRule).filter(BookRule.enable == 1).all()
    session.close()

    runner = CrawlerRunner(settings)
    # init ArticleSpider
    # for article_rule in article_rules:
    #     runner.crawl(ArticleSpider, rule=article_rule)
    # init BookSpider
    for book_rule in book_rules:
        runner.crawl(BookSpider, rule=book_rule)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
    logging.info('Spider Finished!')
def setUp(self):
    self.mockserver = MockServer()
    self.mockserver.__enter__()
    self.runner = CrawlerRunner()
    allowed_domains = ['amazon.sa']
    start_urls = start_urls
    user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'

    def parse(self, response):
        data = response.css('#cm_cr-review_list')
        star_rating = data.css('.review-rating')
        comments = data.css('.review-text')
        count = 0
        for review in star_rating:
            yield {
                'stars': ''.join(review.xpath('.//text()').extract()),
                'comment': ''.join(comments[count].xpath(".//text()").extract()),
            }
            count += 1
        next_page = response.css('.a-last a ::attr(href)').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)


configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
logger = logging.getLogger('scrapy.core.scraper')
logger.setLevel('INFO')

runner = CrawlerRunner(settings={
    "FEEDS": {
        "reviews2.csv": {"format": "csv"},
    },
})
d = runner.crawl(AmazonReviewsSpider)
d.addBoth(lambda _: reactor.stop())
reactor.run()  # the script will block here until the crawling is finished
def main(argv):
    inputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hi:", ["ifile="])
    except getopt.GetoptError:
        print('crawlers.py -i <inputfile>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('crawlers.py -i <inputfile>')
            sys.exit()
        elif opt in ("-i", "--ifile"):  # getopt reports the long option without the trailing '='
            inputfile = arg
    ticks = list(read_file(inputfile))

    # Create and run spiders
    configure_logging()
    crawler_settings = Settings()
    crawler_settings.setmodule(my_settings)
    runner = CrawlerRunner(settings=crawler_settings)
    for tick in ticks:
        kwargs = {'tick': tick}
        runner.crawl(MWSpider, **kwargs)
        runner.crawl(ReutersSpider, **kwargs)
        runner.crawl(BloSpider, **kwargs)
        runner.crawl(MSNBCSpider, **kwargs)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def __init__(self, id_list, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.id_list = id_list
    configure_logging()
    self.runner = CrawlerRunner(get_project_settings())
def test_crawler_runner_accepts_None(self):
    runner = CrawlerRunner()
    self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from weibo.spiders.topspider import TopspiderSpider
from weibo.spiders.weibospider import WeibospiderSpider
import logging
from scrapy.utils.project import get_project_settings

configure_logging()
runner = CrawlerRunner(get_project_settings())


@defer.inlineCallbacks
def crawl():
    logging.info("new cycle is starting")
    yield runner.crawl(TopspiderSpider)
    logging.info("TopspiderSpider is stopped")
    yield runner.crawl(WeibospiderSpider)
    reactor.stop()


crawl()
reactor.run()  # the script will block here until the last crawl call is finished
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
import json

# load the project configuration
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
reactor.suggestThreadPoolSize(30)
runner = CrawlerRunner(get_project_settings())

# import the spider
from bugs.spiders.jd_spider import JDSpoder

# urls = []
# for line in open('../data/jd-goods-id.json', 'r', encoding="utf8"):
#     data = json.loads(line)
#     url = 'https://item.jd.com/{id}.html'.format(id=data['goods_id'])
#     urls.append(url)

# 'https://item.jd.com/14786160283.html',
urls = [
    'https://item.yiyaojd.com/13674344768.html',
    'https://item.yiyaojd.com/3154005.html',
    'https://item.yiyaojd.com/4410004.html',
    'https://item.yiyaojd.com/13214528412.html',
    'https://item.yiyaojd.com/4808847.html',
    'https://item.yiyaojd.com/3091800.html',
    'https://item.yiyaojd.com/3108320.html',
    'https://item.yiyaojd.com/12801590412.html',
    'https://item.yiyaojd.com/3172871.html',
    'https://item.yiyaojd.com/17990313602.html',
]

for i in range(len(urls)):
    kwargs = {'url': '{}'.format(urls[i])}
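The fragment above stops after building kwargs for each URL. A minimal sketch, following the runner.crawl/join pattern used by the other snippets in this document, of how such a loop is typically completed; the runner.crawl(JDSpoder, **kwargs) call and the assumption that the spider accepts a url keyword argument are illustrative, not taken from the original:

for i in range(len(urls)):
    kwargs = {'url': '{}'.format(urls[i])}
    runner.crawl(JDSpoder, **kwargs)  # assumed: schedule one crawl per URL

d = runner.join()                     # Deferred that fires when all scheduled crawls finish
d.addBoth(lambda _: reactor.stop())
reactor.run()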
# # process.crawl("lawlib_xinshi_spider", domain={"law-lib.com"}) # # process.crawl("lawlib_minshi_spider", domain={"law-lib.com"}) # # process.crawl("lawlib_xinzhen_spider", domain={"law-lib.com"}) # # process.crawl("qq_news_spider", domain={"qq.com"}) # # process.crawl("sina_news_spider", domain={"sina.com.cn"}) # process.crawl("sina_sifa_news_spider", domain={"sina.com.cn"}) # process.crawl("sina_sifa_publicity_spider", domain={"sina.com.cn"}) # # process.start() # running the spiders sequentially by chaining the deferreds: configure_logging() settings = get_project_settings() runner = CrawlerRunner(settings=settings) @defer.inlineCallbacks def crawl(): # china yield runner.crawl(china.ChinaNewsSpider) yield runner.crawl(china.ChinaTheorySpider) yield runner.crawl(china.ChinaAffairSpider) # cctv yield runner.crawl(cctv.CCTVShipingSpider) yield runner.crawl(cctv.CCTVNewsSpider) yield runner.crawl(cctv.CCTVCaijingSpider) # chinadaily yield runner.crawl(chinadaily.ChinadailyChinaSpider)
from scrapy.utils.log import configure_logging


# crawl the shopUrl for the specified keyword
class Img(scrapy.Spider):
    name = "img"

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)

    def start_requests(self):
        url = "http://www.meituan.com/"
        yield scrapy.Request(url=url, callback=self.parse, meta={"cookiejar": 1})

    def parse(self, response):
        print(response.text)
        yield scrapy.Request(url="http://www.meituan.com/deal/47840801.html",
                             callback=self.parse,
                             dont_filter=True,
                             meta={"cookiejar": response.meta["cookiejar"]})


if __name__ == "__main__":
    configure_logging({"LOG_FORMAT": "%(levelname)s: %(message)s"})
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl(Img)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
from coolscrapy.models import db_connect, create_news_table
from coolscrapy.models import ArticleRule
from sqlalchemy.orm import sessionmaker
from coolscrapy.spiders.article_spider import ArticleSpider

if __name__ == '__main__':
    settings = get_project_settings()
    configure_logging(settings)
    db = db_connect()
    Session = sessionmaker(bind=db)
    session = Session()
    rules = session.query(ArticleRule).filter(ArticleRule.enable == 1).all()
    session.close()

    runner = CrawlerRunner(settings)
    for rule in rules:
        # spider = ArticleSpider(rule)  # instantiate every spider using rule
        # stop reactor when spider closes
        # runner.signals.connect(spider_closing, signal=signals.spider_closed)
        runner.crawl(ArticleSpider, rule=rule)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

    # blocks process so always keep as the last statement
    reactor.run()
    logging.info('all finished.')
def handler(event, context):
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl('github_trend_crawler', timescale='daily')
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
@defer.inlineCallbacks
def test_crawler_runner_bootstrap_successful_for_several(self):
    runner = CrawlerRunner()
    yield runner.crawl(NoRequestsSpider)
    yield runner.crawl(NoRequestsSpider)
    self.assertEqual(runner.bootstrap_failed, False)
def runSpider():
    # initialize crawler with current project settings
    crawler = CrawlerRunner(get_project_settings())
    # crawl the mimgspider and pass in the filename below for start url config
    crawler.crawl('mimgspider', 'C:/Users/kimbe/Documents/15112/TermProject/urls.txt')
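As written, runSpider() only schedules the crawl: CrawlerRunner.crawl() returns a Deferred, and nothing runs until the Twisted reactor is started. A minimal sketch of how this fragment could be driven to completion; the reactor import and the stop callback are additions for illustration, not part of the original snippet:

from twisted.internet import reactor


def runSpider():
    crawler = CrawlerRunner(get_project_settings())
    d = crawler.crawl('mimgspider', 'C:/Users/kimbe/Documents/15112/TermProject/urls.txt')
    d.addBoth(lambda _: reactor.stop())  # stop the reactor once the crawl finishes


runSpider()
reactor.run()  # blocks until reactor.stop() is called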
def test_crawler_runner_accepts_dict(self):
    runner = CrawlerRunner({'foo': 'bar'})
    self.assertEqual(runner.settings['foo'], 'bar')
    self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')
class Scrape:
    """
    Class that represents the connection between Flask and Scrapy.

    * runs the crawler in the Twisted reactor synchronously
    * initializes a shared CrawlerRunner()
    """

    crawl_runner = CrawlerRunner()
    dict_of_spiders = {}

    def scrape(self, domain, dict_of_spiders):
        """
        Run the crawler in the Twisted reactor synchronously.

        :param domain: the list of domains
        :param dict_of_spiders: {'springer': SpringerDoi, 'wiley': WileyDoi, 'ieee': IeeeDoi}
        """
        domains = domain
        self.dict_of_spiders = dict_of_spiders
        for domain in domains:
            try:
                self.scrape_with_crochet(domain).wait(timeout=5)
            except crochet.TimeoutError:
                self.scrape_with_crochet(domain).cancel()
                raise

    @crochet.run_in_reactor
    def scrape_with_crochet(self, domain):
        """
        The item_scraped signal fires each time a single item is processed and
        calls _crawler_result to save that item.

        Consider some synchronous, do-one-thing-after-the-other application code
        that wants to use event-driven, Twisted-based code. There are at least two
        threads involved, the application thread(s) and the reactor thread, and
        several layers of code in the interaction:

        * Twisted code: should only be called in the reactor thread. This may be
          code from the Twisted package itself, or more likely code you have
          written on top of Twisted.
        * @wait_for/@run_in_reactor wrappers: the body of the wrapped function runs
          in the reactor thread, but the caller should be in the application thread.
        * Application code: runs in the application thread(s) and expects
          synchronous/blocking calls.

        dispatcher.connect hooks _crawler_result up to the item_scraped signal, so
        control passes between these two methods for every scraped item.
        crawl_runner.crawl selects the spider that matches the domain name, and
        each item the spider yields is handed to _crawler_result. The settings
        module is applied to the crawl runner.

        :param domain: the domain to crawl
        :return: a twisted.internet.defer.Deferred
        """
        configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
        crawler_settings = Settings()
        crawler_settings.setmodule(sets)
        self.crawl_runner.settings = crawler_settings
        dispatcher.connect(self._crawler_result, signal=signals.item_scraped)
        for i in self.dict_of_spiders:
            if i in domain:
                eventual = self.crawl_runner.crawl(self.dict_of_spiders[i], category=domain)
                return eventual

    def _crawler_result(self, item, response, spider):
        """
        A callback fired each time an item is scraped; the result is saved
        to the database.

        :param item: the item scraped from the website
        """
        database.save(dict(item))
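The docstrings above describe the crochet threading model in prose. Below is a minimal, self-contained sketch of the same @run_in_reactor pattern, reduced to its moving parts; the settings dict, the item-collecting list, the helper names, and the 30-second timeout are illustrative assumptions, not part of the original class:

import crochet
crochet.setup()  # start the Twisted reactor in a background thread

from pydispatch import dispatcher
from scrapy import signals
from scrapy.crawler import CrawlerRunner

runner = CrawlerRunner({'LOG_ENABLED': False})  # assumed settings, for illustration
items = []


def _collect(item, response, spider):
    # runs in the reactor thread for every scraped item
    items.append(dict(item))


@crochet.run_in_reactor
def _schedule(spider_cls, **kwargs):
    # body executes in the reactor thread; the caller gets an EventualResult
    dispatcher.connect(_collect, signal=signals.item_scraped)
    return runner.crawl(spider_cls, **kwargs)


def scrape_blocking(spider_cls, timeout=30, **kwargs):
    # called from the application (e.g. Flask) thread; blocks until the crawl finishes
    _schedule(spider_cls, **kwargs).wait(timeout=timeout)
    return items

The key design choice is that only _schedule touches Scrapy/Twisted objects, so all reactor work stays in the reactor thread, while the application code sees an ordinary blocking call via EventualResult.wait().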