class ScrapyPuppeteerTestCase(TestCase):
    """Test case for the ``scrapy-puppeteer`` package"""

    class PuppeteerSpider(scrapy.Spider):
        name = 'puppeteer_crawl_spider'
        allowed_domains = ['codesandbox.io']
        custom_settings = {
            'DOWNLOADER_MIDDLEWARES': {
                'scrapy_puppeteer.PuppeteerMiddleware': 800
            }
        }
        items = []

        def start_requests(self):
            yield scrapy_puppeteer.PuppeteerRequest(
                'https://codesandbox.io/search?page=1',
                wait_until='networkidle0')

        def parse(self, response):
            for selector_item in response.selector.xpath(
                    '//li[@class="ais-Hits-item"]'):
                self.items.append(selector_item.xpath('.//h2').extract_first())

    def setUp(self):
        """Store the Scrapy runner to use in the tests"""
        self.runner = CrawlerRunner()

    @defer.inlineCallbacks
    def test_items_number(self):
        crawler = self.runner.create_crawler(self.PuppeteerSpider)
        yield crawler.crawl()
        self.assertEqual(len(crawler.spider.items), 10)
class CallbackKeywordArgumentsTestCase(TestCase):

    maxDiff = None

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_callback_kwargs(self):
        crawler = self.runner.create_crawler(KeywordArgumentsSpider)
        with LogCapture() as log:
            yield crawler.crawl(mockserver=self.mockserver)
        self.assertTrue(all(crawler.spider.checks))
        self.assertEqual(len(crawler.spider.checks), crawler.stats.get_value('boolean_checks'))
        # check exceptions for argument mismatch
        exceptions = {}
        for line in log.records:
            for key in ('takes_less', 'takes_more'):
                if key in line.getMessage():
                    exceptions[key] = line
        self.assertEqual(exceptions['takes_less'].exc_info[0], TypeError)
        self.assertEqual(
            str(exceptions['takes_less'].exc_info[1]),
            "parse_takes_less() got an unexpected keyword argument 'number'")
        self.assertEqual(exceptions['takes_more'].exc_info[0], TypeError)
        self.assertEqual(
            str(exceptions['takes_more'].exc_info[1]),
            "parse_takes_more() missing 1 required positional argument: 'other'")
def test_downloader_middleware_override_request_in_process_response(self):
    """
    Downloader middleware which returns a response with a specific 'request' attribute.

    * The spider callback should receive the overridden response.request
    * Handlers listening to the response_received signal should receive the overridden response.request
    * The "crawled" log message should show the overridden response.request
    """
    signal_params = {}

    def signal_handler(response, request, spider):
        signal_params["response"] = response
        signal_params["request"] = request

    url = self.mockserver.url("/status?n=200")
    runner = CrawlerRunner(settings={
        "DOWNLOADER_MIDDLEWARES": {
            __name__ + ".ProcessResponseMiddleware": 595,
        }
    })
    crawler = runner.create_crawler(SingleRequestSpider)
    crawler.signals.connect(signal_handler, signal=signals.response_received)

    with LogCapture() as log:
        yield crawler.crawl(seed=url, mockserver=self.mockserver)

    response = crawler.spider.meta["responses"][0]
    self.assertEqual(response.request.url, OVERRIDEN_URL)

    self.assertEqual(signal_params["response"].url, url)
    self.assertEqual(signal_params["request"].url, OVERRIDEN_URL)

    log.check_present(
        ("scrapy.core.engine", "DEBUG", f"Crawled (200) <GET {OVERRIDEN_URL}> (referer: None)"),
    )
def scrape_sites(urls):
    # If a previous run failed before deleting the output file, we would
    # accidentally end up appending to existing content, unless we delete
    # any existing file.
    if os.path.exists(LOCAL_OUTPUT_PATH):
        os.remove(LOCAL_OUTPUT_PATH)

    print(f"Crawling {len(urls)} urls")
    start = time.time()

    runner = CrawlerRunner()
    crawler = runner.create_crawler(PageSpider)
    # Runner.crawl immediately returns a Deferred object before scraping has taken place.
    # Runner.crawl simply schedules some crawling to run as soon as possible in the reactor's
    # event loop. In general with Deferred objects, you can attach a callback to them which will
    # fire when the event loop has finished processing the corresponding work. However, using the
    # @inlineCallbacks decorator we can block the program here using yield until the scraping has
    # been completed, to make the code execution more intuitive.
    yield runner.crawl(crawler, urls=urls)

    print("Done processing {} urls in {:.2f} seconds".format(
        len(urls), time.time() - start))

    # Load scrapy's data and remove the local file
    result_df = pd.read_csv(LOCAL_OUTPUT_PATH, names=["url", "description", "body"])
    os.remove(LOCAL_OUTPUT_PATH)
    return result_df
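A minimal driver sketch (an assumption, not part of the original snippet): since scrape_sites() uses yield runner.crawl(...), it is presumably decorated with @defer.inlineCallbacks, as its own comment suggests, and therefore returns a Deferred immediately. A plain script then has to hand control to the Twisted reactor so the crawl can actually run; the callback names below are illustrative only.

from twisted.internet import reactor


def run_scrape(urls):
    d = scrape_sites(urls)                                   # returns a Deferred immediately
    d.addCallback(lambda df: print(df.head()))               # fires once crawling has finished
    d.addErrback(lambda failure: failure.printTraceback())   # report any error from the crawl
    d.addBoth(lambda _: reactor.stop())                      # stop the event loop either way
    reactor.run()                                            # blocks until reactor.stop()


run_scrape(["https://example.com"])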
def make_crawler(**extra_settings):
    settings = Settings()
    settings['ITEM_PIPELINES'] = {
        'scrapy_cdr.media_pipeline.CDRMediaPipeline': 1,
        'tests.utils.CollectorPipeline': 100,
    }
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(Spider)
def get_crawler(spidercls=None, settings_dict=None):
    """Return an unconfigured Crawler object. If settings_dict is given, it
    will be used to populate the crawler settings with a project level
    priority.
    """
    from scrapy.crawler import CrawlerRunner
    from scrapy.spiders import Spider

    runner = CrawlerRunner(settings_dict)
    return runner.create_crawler(spidercls or Spider)
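A short usage sketch (not from the source): a test can use the get_crawler() helper above to build a throwaway crawler whose settings carry the values it needs; the USER_AGENT value and the test name are only examples.

from scrapy.spiders import Spider


def test_get_crawler_helper():
    # The returned Crawler is unconfigured but already carries the merged
    # settings (project-level priority) and the resolved spider class.
    crawler = get_crawler(settings_dict={'USER_AGENT': 'my-bot/1.0'})
    assert crawler.settings.get('USER_AGENT') == 'my-bot/1.0'
    assert issubclass(crawler.spidercls, Spider)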
def test_crawler_runner_loading(self):
    module = 'tests.test_spiderloader.test_spiders.spider1'
    runner = CrawlerRunner({'SPIDER_MODULES': [module]})

    self.assertRaisesRegex(KeyError, 'Spider not found',
                           runner.create_crawler, 'spider2')

    crawler = runner.create_crawler('spider1')
    self.assertTrue(issubclass(crawler.spidercls, scrapy.Spider))
    self.assertEqual(crawler.spidercls.name, 'spider1')
def setUp(self):
    settings = Settings()
    settings.setmodule(undercrawler.settings)
    settings['DOWNLOAD_DELAY'] = 0.1
    settings['ITEM_PIPELINES']['tests.utils.CollectorPipeline'] = 100
    splash_url = os.environ.get('SPLASH_URL')
    if splash_url:
        settings['SPLASH_URL'] = splash_url
    settings.update(self.settings)
    runner = CrawlerRunner(settings)
    self.crawler = runner.create_crawler(BaseSpider)
def scrapy_embedding(spidercls):
    settings = get_scrapy_settings()
    # We could create the crawler manually, but CrawlerRunner does it in a more
    # sophisticated way and adds support for passing the spider by name (as a string).
    runner = CrawlerRunner(settings)
    crawler = runner.create_crawler(spidercls)
    crawler.engine = crawler._create_engine()
    crawler.engine.start()
    # log.start(logstdout=False)
    return crawler
def test_downloader_middleware_raise_exception(self):
    url = self.mockserver.url("/status?n=200")
    runner = CrawlerRunner(settings={
        "DOWNLOADER_MIDDLEWARES": {
            RaiseExceptionRequestMiddleware: 590,
        },
    })
    crawler = runner.create_crawler(SingleRequestSpider)
    yield crawler.crawl(seed=url, mockserver=self.mockserver)
    failure = crawler.spider.meta["failure"]
    self.assertEqual(failure.request.url, url)
    self.assertIsInstance(failure.value, ZeroDivisionError)
def get_crawler(spidercls=None, settings_dict=None, prevent_warnings=True):
    """Return an unconfigured Crawler object. If settings_dict is given, it
    will be used to populate the crawler settings with a project level
    priority.
    """
    from scrapy.crawler import CrawlerRunner
    from scrapy.spiders import Spider

    # Set by default settings that prevent deprecation warnings.
    settings = {}
    if prevent_warnings:
        settings['REQUEST_FINGERPRINTER_IMPLEMENTATION'] = 'VERSION'
    settings.update(settings_dict or {})
    runner = CrawlerRunner(settings)
    return runner.create_crawler(spidercls or Spider)
class CrawlTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        crawler = self.runner.create_crawler(FollowAllSpider)
        yield crawler.crawl()
        assert crawler.stats.get_value('item_scraped_count') == 3
def make_crawler(spider_cls=ATestBaseSpider, **extra_settings):
    # clean up queue before starting spider
    assert spider_cls.name.startswith('test_'), 'pass a special test spider'
    redis_server = redis.from_url('redis://localhost')
    name = spider_cls.name
    redis_server.delete(
        SCHEDULER_DUPEFILTER_KEY % {'spider': name},
        *redis_server.keys(
            SCHEDULER_QUEUE_KEY % {'spider': name} + '*'))
    settings = Settings()
    settings.setmodule(dd_crawler.settings)
    settings['ITEM_PIPELINES']['tests.utils.CollectorPipeline'] = 100
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(spider_cls)
def run_spider(settings, itemcount, keyheader='', conid='', spider_id=0):
    s = Settings()
    s.setmodule(settings)
    sl = SpiderLoader(settings=s)
    print('spider list=', sl.list())
    spider = sl.load(sl.list()[spider_id])
    spider.itemcount = itemcount

    configure_logging({'LOG_LEVEL': 'DEBUG'})  # set the Scrapy log level

    runner = CrawlerRunner(settings=s)
    crawler = runner.create_crawler(spider)
    # if sighandler != None:
    #     sighandler.connect(crawler)
    d = runner.crawl(crawler, keyheader=keyheader, conid=conid)
    # d = runner.crawl(spider, keyheader=keyheader, itemcount=itemcount)
    return d
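For completeness, a hedged sketch (not from the source) of how a caller might consume the Deferred returned by run_spider() above; 'myproject.settings' is a hypothetical settings module path.

from twisted.internet import reactor

d = run_spider('myproject.settings', itemcount=100)
d.addBoth(lambda _: reactor.stop())  # stop the reactor when the crawl ends (or fails)
reactor.run()                        # block until the crawl has finished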
def conf(self):
    runner = CrawlerRunner(self.settings)
    list_urls = self.get_list_urls(self.area, self.illness)
    doctor_words = self.get_doctor_regex_words(self.doctor)
    print(list_urls)
    for i in range(0, len(list_urls)):
        domain = list_urls[i]['domain']
        urls = list_urls[i]['urls']
        # es_spider.ElAtletaComSpider
        spider = eval(domain.replace('.', '').title() + 'Spider')
        zoidgber_crawler = runner.create_crawler(spider)
        yield runner.crawl(zoidgber_crawler,
                           doctor_regex=doctor_words,
                           urls=urls,
                           path=self.path)
    reactor.stop()  # the script will block here until the crawling is finished
def test_downloader_middleware_alternative_callback(self):
    """
    Downloader middleware which returns a response with a specific 'request' attribute,
    with an alternative callback
    """
    runner = CrawlerRunner(settings={
        "DOWNLOADER_MIDDLEWARES": {
            AlternativeCallbacksMiddleware: 595,
        }
    })
    crawler = runner.create_crawler(AlternativeCallbacksSpider)

    with LogCapture() as log:
        url = self.mockserver.url("/status?n=200")
        yield crawler.crawl(seed=url, mockserver=self.mockserver)

    log.check_present(
        ("alternative_callbacks_spider", "INFO", "alt_callback was invoked with foo=bar"),
    )
def test_downloader_middleware_do_not_override_in_process_exception(self):
    """
    An exception is raised but caught by the next middleware, which
    returns a Response without a specific 'request' attribute.
    The spider callback should receive the original response.request
    """
    url = self.mockserver.url("/status?n=200")
    runner = CrawlerRunner(settings={
        "DOWNLOADER_MIDDLEWARES": {
            __name__ + ".RaiseExceptionRequestMiddleware": 590,
            __name__ + ".CatchExceptionDoNotOverrideRequestMiddleware": 595,
        },
    })
    crawler = runner.create_crawler(SingleRequestSpider)
    yield crawler.crawl(seed=url, mockserver=self.mockserver)
    response = crawler.spider.meta["responses"][0]
    self.assertEqual(response.body, b"Caught ZeroDivisionError")
    self.assertEqual(response.request.url, url)
def conf(self):
    runner = CrawlerRunner(self.settings)
    for province in self.provinces:
        property_crawler = runner.create_crawler('property')
        yield runner.crawl(property_crawler,
                           transaction=self.transaction,
                           property_type=self.property_type,
                           province=province)
        province_dic_stats = {}
        province_dic_stats['transaction'] = self.transaction
        province_dic_stats['property_type'] = self.property_type
        province_dic_stats['province'] = province
        province_dic_stats['finish_reason'] = property_crawler.stats.get_value('finish_reason')
        province_dic_stats['start_time'] = property_crawler.stats.get_value('start_time')
        province_dic_stats['finish_time'] = property_crawler.stats.get_value('finish_time')
        province_dic_stats['item_scraped_count'] = property_crawler.stats.get_value('item_scraped_count')
        province_dic_stats['log_count/ERROR'] = property_crawler.stats.get_value('log_count/ERROR', default=0)
        # province_dic_stats.update(property_crawler.stats.get_stats())
        self.stats_dic_list.append(province_dic_stats)

    reactor.stop()  # the script will block here until the crawling is finished
    # pp = pprint.PrettyPrinter(indent=4)
    # pp.pprint(self.stats_dic_list)
    return self.stats_dic_list
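The conf() methods in this example and the earlier one yield the Deferred from runner.crawl() for each crawl and call reactor.stop() afterwards, so they are evidently meant to be decorated with @defer.inlineCallbacks and started before reactor.run(). A minimal driver sketch under that assumption, with my_scraper as a hypothetical instance of the surrounding class:

from twisted.internet import reactor

# conf() is assumed to return a Deferred (via @defer.inlineCallbacks); it runs the
# crawls sequentially and stops the reactor itself when the last one finishes.
my_scraper.conf()
reactor.run()  # the script blocks here until conf() calls reactor.stop()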
class CrawlTestCase(TestCase): def setUp(self): self.mockserver = MockServer() self.mockserver.__enter__() self.runner = CrawlerRunner() def tearDown(self): self.mockserver.__exit__(None, None, None) @defer.inlineCallbacks def test_follow_all(self): crawler = self.runner.create_crawler(FollowAllSpider) yield crawler.crawl(mockserver=self.mockserver) self.assertEqual(len(crawler.spider.urls_visited), 11) # 10 + start_url @defer.inlineCallbacks def test_fixed_delay(self): yield self._test_delay(total=3, delay=0.1) @defer.inlineCallbacks def test_randomized_delay(self): yield self._test_delay(total=3, delay=0.1, randomize=True) @defer.inlineCallbacks def _test_delay(self, total, delay, randomize=False): crawl_kwargs = dict( maxlatency=delay * 2, mockserver=self.mockserver, total=total, ) tolerance = (1 - (0.6 if randomize else 0.2)) settings = { "DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize } crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider) yield crawler.crawl(**crawl_kwargs) times = crawler.spider.times total_time = times[-1] - times[0] average = total_time / (len(times) - 1) self.assertTrue(average > delay * tolerance, "download delay too small: %s" % average) # Ensure that the same test parameters would cause a failure if no # download delay is set. Otherwise, it means we are using a combination # of ``total`` and ``delay`` values that are too small for the test # code above to have any meaning. settings["DOWNLOAD_DELAY"] = 0 crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider) yield crawler.crawl(**crawl_kwargs) times = crawler.spider.times total_time = times[-1] - times[0] average = total_time / (len(times) - 1) self.assertFalse(average > delay / tolerance, "test total or delay values are too small") @defer.inlineCallbacks def test_timeout_success(self): crawler = self.runner.create_crawler(DelaySpider) yield crawler.crawl(n=0.5, mockserver=self.mockserver) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 > 0) self.assertTrue(crawler.spider.t2 > crawler.spider.t1) @defer.inlineCallbacks def test_timeout_failure(self): crawler = CrawlerRunner({ "DOWNLOAD_TIMEOUT": 0.35 }).create_crawler(DelaySpider) yield crawler.crawl(n=0.5, mockserver=self.mockserver) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 == 0) self.assertTrue(crawler.spider.t2_err > 0) self.assertTrue(crawler.spider.t2_err > crawler.spider.t1) # server hangs after receiving response headers yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 == 0) self.assertTrue(crawler.spider.t2_err > 0) self.assertTrue(crawler.spider.t2_err > crawler.spider.t1) @defer.inlineCallbacks def test_retry_503(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl(self.mockserver.url("/status?n=503"), mockserver=self.mockserver) self._assert_retried(l) @defer.inlineCallbacks def test_retry_conn_failed(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("http://localhost:65432/status?n=503", mockserver=self.mockserver) self._assert_retried(l) @defer.inlineCallbacks def test_retry_dns_error(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: # try to fetch the homepage of a non-existent domain yield crawler.crawl("http://dns.resolution.invalid./", mockserver=self.mockserver) self._assert_retried(l) @defer.inlineCallbacks def 
test_start_requests_bug_before_yield(self): with LogCapture('scrapy', level=logging.ERROR) as l: crawler = self.runner.create_crawler(BrokenStartRequestsSpider) yield crawler.crawl(fail_before_yield=1, mockserver=self.mockserver) self.assertEqual(len(l.records), 1) record = l.records[0] self.assertIsNotNone(record.exc_info) self.assertIs(record.exc_info[0], ZeroDivisionError) @defer.inlineCallbacks def test_start_requests_bug_yielding(self): with LogCapture('scrapy', level=logging.ERROR) as l: crawler = self.runner.create_crawler(BrokenStartRequestsSpider) yield crawler.crawl(fail_yielding=1, mockserver=self.mockserver) self.assertEqual(len(l.records), 1) record = l.records[0] self.assertIsNotNone(record.exc_info) self.assertIs(record.exc_info[0], ZeroDivisionError) @defer.inlineCallbacks def test_start_requests_lazyness(self): settings = {"CONCURRENT_REQUESTS": 1} crawler = CrawlerRunner(settings).create_crawler( BrokenStartRequestsSpider) yield crawler.crawl(mockserver=self.mockserver) #self.assertTrue(False, crawler.spider.seedsseen) #self.assertTrue(crawler.spider.seedsseen.index(None) < crawler.spider.seedsseen.index(99), # crawler.spider.seedsseen) @defer.inlineCallbacks def test_start_requests_dupes(self): settings = {"CONCURRENT_REQUESTS": 1} crawler = CrawlerRunner(settings).create_crawler( DuplicateStartRequestsSpider) yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3, mockserver=self.mockserver) self.assertEqual(crawler.spider.visited, 6) yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4, mockserver=self.mockserver) self.assertEqual(crawler.spider.visited, 3) @defer.inlineCallbacks def test_unbounded_response(self): # Completeness of responses without Content-Length or Transfer-Encoding # can not be determined, we treat them as valid but flagged as "partial" from urllib.parse import urlencode query = urlencode({ 'raw': '''\ HTTP/1.1 200 OK Server: Apache-Coyote/1.1 X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0 Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/ Pragma: no-cache Expires: Thu, 01 Jan 1970 00:00:00 GMT Cache-Control: no-cache Cache-Control: no-store Content-Type: text/html;charset=UTF-8 Content-Language: en Date: Tue, 27 Aug 2013 13:05:05 GMT Connection: close foo body with multiples lines ''' }) crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl(self.mockserver.url("/raw?{0}".format(query)), mockserver=self.mockserver) self.assertEqual(str(l).count("Got response 200"), 1) @defer.inlineCallbacks def test_retry_conn_lost(self): # connection lost after receiving data crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl(self.mockserver.url("/drop?abort=0"), mockserver=self.mockserver) self._assert_retried(l) @defer.inlineCallbacks def test_retry_conn_aborted(self): # connection lost before receiving data crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl(self.mockserver.url("/drop?abort=1"), mockserver=self.mockserver) self._assert_retried(l) def _assert_retried(self, log): self.assertEqual(str(log).count("Retrying"), 2) self.assertEqual(str(log).count("Gave up retrying"), 1) @defer.inlineCallbacks def test_referer_header(self): """Referer header is set by RefererMiddleware unless it is already set""" req0 = Request(self.mockserver.url('/echo?headers=1&body=0'), dont_filter=1) req1 = req0.replace() req2 = 
req0.replace(headers={'Referer': None}) req3 = req0.replace(headers={'Referer': 'http://example.com'}) req0.meta['next'] = req1 req1.meta['next'] = req2 req2.meta['next'] = req3 crawler = self.runner.create_crawler(SingleRequestSpider) yield crawler.crawl(seed=req0, mockserver=self.mockserver) # basic asserts in case of weird communication errors self.assertIn('responses', crawler.spider.meta) self.assertNotIn('failures', crawler.spider.meta) # start requests doesn't set Referer header echo0 = json.loads(to_unicode( crawler.spider.meta['responses'][2].body)) self.assertNotIn('Referer', echo0['headers']) # following request sets Referer to start request url echo1 = json.loads(to_unicode( crawler.spider.meta['responses'][1].body)) self.assertEqual(echo1['headers'].get('Referer'), [req0.url]) # next request avoids Referer header echo2 = json.loads(to_unicode( crawler.spider.meta['responses'][2].body)) self.assertNotIn('Referer', echo2['headers']) # last request explicitly sets a Referer header echo3 = json.loads(to_unicode( crawler.spider.meta['responses'][3].body)) self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com']) @defer.inlineCallbacks def test_engine_status(self): from scrapy.utils.engine import get_engine_status est = [] def cb(response): est.append(get_engine_status(crawler.engine)) crawler = self.runner.create_crawler(SingleRequestSpider) yield crawler.crawl(seed=self.mockserver.url('/'), callback_func=cb, mockserver=self.mockserver) self.assertEqual(len(est), 1, est) s = dict(est[0]) self.assertEqual(s['engine.spider.name'], crawler.spider.name) self.assertEqual(s['len(engine.scraper.slot.active)'], 1) @defer.inlineCallbacks def test_graceful_crawl_error_handling(self): """ Test whether errors happening anywhere in Crawler.crawl() are properly reported (and not somehow swallowed) after a graceful engine shutdown. The errors should not come from within Scrapy's core but from within spiders/middlewares/etc., e.g. raised in Spider.start_requests(), SpiderMiddleware.process_start_requests(), etc. 
""" class TestError(Exception): pass class FaultySpider(SimpleSpider): def start_requests(self): raise TestError crawler = self.runner.create_crawler(FaultySpider) yield self.assertFailure(crawler.crawl(mockserver=self.mockserver), TestError) self.assertFalse(crawler.crawling) @defer.inlineCallbacks def test_open_spider_error_on_faulty_pipeline(self): settings = { "ITEM_PIPELINES": { "tests.pipelines.ZeroDivisionErrorPipeline": 300, } } crawler = CrawlerRunner(settings).create_crawler(SimpleSpider) yield self.assertFailure( self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver), ZeroDivisionError) self.assertFalse(crawler.crawling) @defer.inlineCallbacks def test_crawlerrunner_accepts_crawler(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as log: yield self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.assertIn("Got response 200", str(log)) @defer.inlineCallbacks def test_crawl_multiple(self): self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=503"), mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self._assert_retried(log) self.assertIn("Got response 200", str(log)) @defer.inlineCallbacks def test_crawlspider_with_errback(self): self.runner.crawl(CrawlSpiderWithErrback, mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self.assertIn("[callback] status 200", str(log)) self.assertIn("[callback] status 201", str(log)) self.assertIn("[errback] status 404", str(log)) self.assertIn("[errback] status 500", str(log)) @defer.inlineCallbacks def test_async_def_parse(self): self.runner.crawl(AsyncDefSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self.assertIn("Got response 200", str(log)) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncio_parse(self): runner = CrawlerRunner({"ASYNCIO_REACTOR": True}) runner.crawl(AsyncDefAsyncioSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) with LogCapture() as log: yield runner.join() self.assertIn("Got response 200", str(log)) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncio_parse_list(self): items = [] def _on_item_scraped(item): items.append(item) crawler = self.runner.create_crawler(AsyncDefAsyncioReturnSpider) crawler.signals.connect(_on_item_scraped, signals.item_scraped) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.assertIn("Got response 200", str(log)) self.assertIn({'id': 1}, items) self.assertIn({'id': 2}, items)
class CrawlTestCase(TestCase): def setUp(self): self.mockserver = MockServer() self.mockserver.__enter__() self.runner = CrawlerRunner() def tearDown(self): self.mockserver.__exit__(None, None, None) @defer.inlineCallbacks def test_follow_all(self): crawler = self.runner.create_crawler(FollowAllSpider) yield crawler.crawl(mockserver=self.mockserver) self.assertEqual(len(crawler.spider.urls_visited), 11) # 10 + start_url @defer.inlineCallbacks def test_delay(self): # short to long delays yield self._test_delay(0.2, False) yield self._test_delay(1, False) # randoms yield self._test_delay(0.2, True) yield self._test_delay(1, True) @defer.inlineCallbacks def _test_delay(self, delay, randomize): settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize} crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider) yield crawler.crawl(maxlatency=delay * 2, mockserver=self.mockserver) t = crawler.spider.times totaltime = t[-1] - t[0] avgd = totaltime / (len(t) - 1) tolerance = 0.6 if randomize else 0.2 self.assertTrue(avgd > delay * (1 - tolerance), "download delay too small: %s" % avgd) @defer.inlineCallbacks def test_timeout_success(self): crawler = self.runner.create_crawler(DelaySpider) yield crawler.crawl(n=0.5, mockserver=self.mockserver) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 > 0) self.assertTrue(crawler.spider.t2 > crawler.spider.t1) @defer.inlineCallbacks def test_timeout_failure(self): crawler = CrawlerRunner({"DOWNLOAD_TIMEOUT": 0.35}).create_crawler(DelaySpider) yield crawler.crawl(n=0.5, mockserver=self.mockserver) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 == 0) self.assertTrue(crawler.spider.t2_err > 0) self.assertTrue(crawler.spider.t2_err > crawler.spider.t1) # server hangs after receiving response headers yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 == 0) self.assertTrue(crawler.spider.t2_err > 0) self.assertTrue(crawler.spider.t2_err > crawler.spider.t1) @defer.inlineCallbacks def test_retry_503(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl(self.mockserver.url("/status?n=503"), mockserver=self.mockserver) self._assert_retried(l) @defer.inlineCallbacks def test_retry_conn_failed(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("http://localhost:65432/status?n=503", mockserver=self.mockserver) self._assert_retried(l) @defer.inlineCallbacks def test_retry_dns_error(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: # try to fetch the homepage of a non-existent domain yield crawler.crawl("http://dns.resolution.invalid./", mockserver=self.mockserver) self._assert_retried(l) @defer.inlineCallbacks def test_start_requests_bug_before_yield(self): with LogCapture('scrapy', level=logging.ERROR) as l: crawler = self.runner.create_crawler(BrokenStartRequestsSpider) yield crawler.crawl(fail_before_yield=1, mockserver=self.mockserver) self.assertEqual(len(l.records), 1) record = l.records[0] self.assertIsNotNone(record.exc_info) self.assertIs(record.exc_info[0], ZeroDivisionError) @defer.inlineCallbacks def test_start_requests_bug_yielding(self): with LogCapture('scrapy', level=logging.ERROR) as l: crawler = self.runner.create_crawler(BrokenStartRequestsSpider) yield crawler.crawl(fail_yielding=1, mockserver=self.mockserver) self.assertEqual(len(l.records), 1) record = l.records[0] 
self.assertIsNotNone(record.exc_info) self.assertIs(record.exc_info[0], ZeroDivisionError) @defer.inlineCallbacks def test_start_requests_lazyness(self): settings = {"CONCURRENT_REQUESTS": 1} crawler = CrawlerRunner(settings).create_crawler(BrokenStartRequestsSpider) yield crawler.crawl(mockserver=self.mockserver) #self.assertTrue(False, crawler.spider.seedsseen) #self.assertTrue(crawler.spider.seedsseen.index(None) < crawler.spider.seedsseen.index(99), # crawler.spider.seedsseen) @defer.inlineCallbacks def test_start_requests_dupes(self): settings = {"CONCURRENT_REQUESTS": 1} crawler = CrawlerRunner(settings).create_crawler(DuplicateStartRequestsSpider) yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3, mockserver=self.mockserver) self.assertEqual(crawler.spider.visited, 6) yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4, mockserver=self.mockserver) self.assertEqual(crawler.spider.visited, 3) @defer.inlineCallbacks def test_unbounded_response(self): # Completeness of responses without Content-Length or Transfer-Encoding # can not be determined, we treat them as valid but flagged as "partial" from six.moves.urllib.parse import urlencode query = urlencode({'raw': '''\ HTTP/1.1 200 OK Server: Apache-Coyote/1.1 X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0 Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/ Pragma: no-cache Expires: Thu, 01 Jan 1970 00:00:00 GMT Cache-Control: no-cache Cache-Control: no-store Content-Type: text/html;charset=UTF-8 Content-Language: en Date: Tue, 27 Aug 2013 13:05:05 GMT Connection: close foo body with multiples lines '''}) crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl(self.mockserver.url("/raw?{0}".format(query)), mockserver=self.mockserver) self.assertEqual(str(l).count("Got response 200"), 1) @defer.inlineCallbacks def test_retry_conn_lost(self): # connection lost after receiving data crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl(self.mockserver.url("/drop?abort=0"), mockserver=self.mockserver) self._assert_retried(l) @defer.inlineCallbacks def test_retry_conn_aborted(self): # connection lost before receiving data crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl(self.mockserver.url("/drop?abort=1"), mockserver=self.mockserver) self._assert_retried(l) def _assert_retried(self, log): self.assertEqual(str(log).count("Retrying"), 2) self.assertEqual(str(log).count("Gave up retrying"), 1) @defer.inlineCallbacks def test_referer_header(self): """Referer header is set by RefererMiddleware unless it is already set""" req0 = Request(self.mockserver.url('/echo?headers=1&body=0'), dont_filter=1) req1 = req0.replace() req2 = req0.replace(headers={'Referer': None}) req3 = req0.replace(headers={'Referer': 'http://example.com'}) req0.meta['next'] = req1 req1.meta['next'] = req2 req2.meta['next'] = req3 crawler = self.runner.create_crawler(SingleRequestSpider) yield crawler.crawl(seed=req0, mockserver=self.mockserver) # basic asserts in case of weird communication errors self.assertIn('responses', crawler.spider.meta) self.assertNotIn('failures', crawler.spider.meta) # start requests doesn't set Referer header echo0 = json.loads(to_unicode(crawler.spider.meta['responses'][2].body)) self.assertNotIn('Referer', echo0['headers']) # following request sets Referer to start request url echo1 = 
json.loads(to_unicode(crawler.spider.meta['responses'][1].body)) self.assertEqual(echo1['headers'].get('Referer'), [req0.url]) # next request avoids Referer header echo2 = json.loads(to_unicode(crawler.spider.meta['responses'][2].body)) self.assertNotIn('Referer', echo2['headers']) # last request explicitly sets a Referer header echo3 = json.loads(to_unicode(crawler.spider.meta['responses'][3].body)) self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com']) @defer.inlineCallbacks def test_engine_status(self): from scrapy.utils.engine import get_engine_status est = [] def cb(response): est.append(get_engine_status(crawler.engine)) crawler = self.runner.create_crawler(SingleRequestSpider) yield crawler.crawl(seed=self.mockserver.url('/'), callback_func=cb, mockserver=self.mockserver) self.assertEqual(len(est), 1, est) s = dict(est[0]) self.assertEqual(s['engine.spider.name'], crawler.spider.name) self.assertEqual(s['len(engine.scraper.slot.active)'], 1) @defer.inlineCallbacks def test_graceful_crawl_error_handling(self): """ Test whether errors happening anywhere in Crawler.crawl() are properly reported (and not somehow swallowed) after a graceful engine shutdown. The errors should not come from within Scrapy's core but from within spiders/middlewares/etc., e.g. raised in Spider.start_requests(), SpiderMiddleware.process_start_requests(), etc. """ class TestError(Exception): pass class FaultySpider(SimpleSpider): def start_requests(self): raise TestError crawler = self.runner.create_crawler(FaultySpider) yield self.assertFailure(crawler.crawl(mockserver=self.mockserver), TestError) self.assertFalse(crawler.crawling) @defer.inlineCallbacks def test_open_spider_error_on_faulty_pipeline(self): settings = { "ITEM_PIPELINES": { "tests.pipelines.ZeroDivisionErrorPipeline": 300, } } crawler = CrawlerRunner(settings).create_crawler(SimpleSpider) yield self.assertFailure( self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver), ZeroDivisionError) self.assertFalse(crawler.crawling) @defer.inlineCallbacks def test_crawlerrunner_accepts_crawler(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as log: yield self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.assertIn("Got response 200", str(log)) @defer.inlineCallbacks def test_crawl_multiple(self): self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=503"), mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self._assert_retried(log) self.assertIn("Got response 200", str(log))
def make_crawler(settings, **extra_settings):
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(BaseSpider)
class CrawlSpiderTestCase(TestCase): def setUp(self): self.mockserver = MockServer() self.mockserver.__enter__() self.runner = CrawlerRunner() def tearDown(self): self.mockserver.__exit__(None, None, None) @defer.inlineCallbacks def _run_spider(self, spider_cls): items = [] def _on_item_scraped(item): items.append(item) crawler = self.runner.create_crawler(spider_cls) crawler.signals.connect(_on_item_scraped, signals.item_scraped) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/status?n=200"), mockserver=self.mockserver) return log, items, crawler.stats @defer.inlineCallbacks def test_crawlspider_with_parse(self): self.runner.crawl(CrawlSpiderWithParseMethod, mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self.assertIn("[parse] status 200 (foo: None)", str(log)) self.assertIn("[parse] status 201 (foo: None)", str(log)) self.assertIn("[parse] status 202 (foo: bar)", str(log)) @defer.inlineCallbacks def test_crawlspider_with_errback(self): self.runner.crawl(CrawlSpiderWithErrback, mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self.assertIn("[parse] status 200 (foo: None)", str(log)) self.assertIn("[parse] status 201 (foo: None)", str(log)) self.assertIn("[parse] status 202 (foo: bar)", str(log)) self.assertIn("[errback] status 404", str(log)) self.assertIn("[errback] status 500", str(log)) self.assertIn("[errback] status 501", str(log)) @defer.inlineCallbacks def test_async_def_parse(self): self.runner.crawl(AsyncDefSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self.assertIn("Got response 200", str(log)) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncio_parse(self): runner = CrawlerRunner({ "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor" }) runner.crawl(AsyncDefAsyncioSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) with LogCapture() as log: yield runner.join() self.assertIn("Got response 200", str(log)) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncio_parse_items_list(self): log, items, _ = yield self._run_spider(AsyncDefAsyncioReturnSpider) self.assertIn("Got response 200", str(log)) self.assertIn({'id': 1}, items) self.assertIn({'id': 2}, items) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncio_parse_items_single_element(self): items = [] def _on_item_scraped(item): items.append(item) crawler = self.runner.create_crawler( AsyncDefAsyncioReturnSingleElementSpider) crawler.signals.connect(_on_item_scraped, signals.item_scraped) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.assertIn("Got response 200", str(log)) self.assertIn({"foo": 42}, items) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncgen_parse(self): log, _, stats = yield self._run_spider(AsyncDefAsyncioGenSpider) self.assertIn("Got response 200", str(log)) itemcount = stats.get_value('item_scraped_count') self.assertEqual(itemcount, 1) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncgen_parse_loop(self): log, items, stats = yield self._run_spider( AsyncDefAsyncioGenLoopSpider) self.assertIn("Got response 200", str(log)) itemcount = stats.get_value('item_scraped_count') self.assertEqual(itemcount, 10) for i in range(10): self.assertIn({'foo': i}, items) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncgen_parse_complex(self): _, 
items, stats = yield self._run_spider( AsyncDefAsyncioGenComplexSpider) itemcount = stats.get_value('item_scraped_count') self.assertEqual(itemcount, 156) # some random items for i in [1, 4, 21, 22, 207, 311]: self.assertIn({'index': i}, items) for i in [10, 30, 122]: self.assertIn({'index2': i}, items) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncio_parse_reqs_list(self): log, *_ = yield self._run_spider(AsyncDefAsyncioReqsReturnSpider) for req_id in range(3): self.assertIn(f"Got response 200, req_id {req_id}", str(log)) @defer.inlineCallbacks def test_response_ssl_certificate_none(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url("/echo?body=test", is_secure=False) yield crawler.crawl(seed=url, mockserver=self.mockserver) self.assertIsNone(crawler.spider.meta['responses'][0].certificate) @defer.inlineCallbacks def test_response_ssl_certificate(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url("/echo?body=test", is_secure=True) yield crawler.crawl(seed=url, mockserver=self.mockserver) cert = crawler.spider.meta['responses'][0].certificate self.assertIsInstance(cert, Certificate) self.assertEqual(cert.getSubject().commonName, b"localhost") self.assertEqual(cert.getIssuer().commonName, b"localhost") @mark.xfail( reason="Responses with no body return early and contain no certificate" ) @defer.inlineCallbacks def test_response_ssl_certificate_empty_response(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url("/status?n=200", is_secure=True) yield crawler.crawl(seed=url, mockserver=self.mockserver) cert = crawler.spider.meta['responses'][0].certificate self.assertIsInstance(cert, Certificate) self.assertEqual(cert.getSubject().commonName, b"localhost") self.assertEqual(cert.getIssuer().commonName, b"localhost") @defer.inlineCallbacks def test_dns_server_ip_address_none(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url('/status?n=200') yield crawler.crawl(seed=url, mockserver=self.mockserver) ip_address = crawler.spider.meta['responses'][0].ip_address self.assertIsNone(ip_address) @defer.inlineCallbacks def test_dns_server_ip_address(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url('/echo?body=test') expected_netloc, _ = urlparse(url).netloc.split(':') yield crawler.crawl(seed=url, mockserver=self.mockserver) ip_address = crawler.spider.meta['responses'][0].ip_address self.assertIsInstance(ip_address, IPv4Address) self.assertEqual(str(ip_address), gethostbyname(expected_netloc)) @defer.inlineCallbacks def test_bytes_received_stop_download_callback(self): crawler = self.runner.create_crawler(BytesReceivedCallbackSpider) yield crawler.crawl(mockserver=self.mockserver) self.assertIsNone(crawler.spider.meta.get("failure")) self.assertIsInstance(crawler.spider.meta["response"], Response) self.assertEqual(crawler.spider.meta["response"].body, crawler.spider.meta.get("bytes_received")) self.assertLess(len(crawler.spider.meta["response"].body), crawler.spider.full_response_length) @defer.inlineCallbacks def test_bytes_received_stop_download_errback(self): crawler = self.runner.create_crawler(BytesReceivedErrbackSpider) yield crawler.crawl(mockserver=self.mockserver) self.assertIsNone(crawler.spider.meta.get("response")) self.assertIsInstance(crawler.spider.meta["failure"], Failure) self.assertIsInstance(crawler.spider.meta["failure"].value, StopDownload) 
self.assertIsInstance(crawler.spider.meta["failure"].value.response, Response) self.assertEqual(crawler.spider.meta["failure"].value.response.body, crawler.spider.meta.get("bytes_received")) self.assertLess( len(crawler.spider.meta["failure"].value.response.body), crawler.spider.full_response_length) @defer.inlineCallbacks def test_headers_received_stop_download_callback(self): crawler = self.runner.create_crawler(HeadersReceivedCallbackSpider) yield crawler.crawl(mockserver=self.mockserver) self.assertIsNone(crawler.spider.meta.get("failure")) self.assertIsInstance(crawler.spider.meta["response"], Response) self.assertEqual(crawler.spider.meta["response"].headers, crawler.spider.meta.get("headers_received")) @defer.inlineCallbacks def test_headers_received_stop_download_errback(self): crawler = self.runner.create_crawler(HeadersReceivedErrbackSpider) yield crawler.crawl(mockserver=self.mockserver) self.assertIsNone(crawler.spider.meta.get("response")) self.assertIsInstance(crawler.spider.meta["failure"], Failure) self.assertIsInstance(crawler.spider.meta["failure"].value, StopDownload) self.assertIsInstance(crawler.spider.meta["failure"].value.response, Response) self.assertEqual(crawler.spider.meta["failure"].value.response.headers, crawler.spider.meta.get("headers_received"))
class CentralWidget(QWidget): crawledMovieInfo = None def __init__(self, config): super().__init__() self.globalConfig = config if not 'FolderView' in config.sections(): config.add_section('y') self.scrapperConfig = self.globalConfig['Scrapper'] mainLayout = QHBoxLayout() mainLayout.setContentsMargins(3, 3, 3, 3) self.fileView = FolderView(config) self.fileView.movieFound.connect(self.onFoundMovie) mainLayout.addWidget(self.fileView) self.movieTab = QTabWidget() self.movieTab.setContentsMargins(0, 0, 0, 0) self.listView = MovieListView(config, self.fileView.model, parent=self) self.listView.movieDoubleClicked.connect(self.onMovieClicked) self.movieTab.addTab(self.listView, 'List') self.infoView = MovieInfoView() self.movieTab.addTab(self.infoView, 'Detail') mainLayout.addWidget(self.movieTab) self.setLayout(mainLayout) self.initCrawlRunner() def initCrawlRunner(self): from scrapy.crawler import CrawlerRunner from scrapy.utils.log import configure_logging from scrapy.utils.project import get_project_settings configure_logging() self.runner = CrawlerRunner(get_project_settings()) self.crawler = None self.site = None self.numCids = 0 ''' def onMovieDoubleClicked(self, index): self.movieTab.setCurrentWidget(self.infoView) self.fileView.setCurrentIndex(index) ''' def onMovieClicked(self, index, doubleClicked): if doubleClicked: self.movieTab.setCurrentWidget(self.infoView) self.fileView.setCurrentIndex(index) def updateFromFile(self, files, index = None): self.infoView.clearMovieInfo(True, self.numCids > 1) if not files: return #print(files) nfo = next(filter(lambda x:x.endswith('.nfo'), files), False) if not nfo: return info = utils.nfo2dict(nfo) if index: info['path'] = self.fileView.getPath(index) else: info['path'] = self.fileView.getPath() for file in files: if file.endswith('.nfo'): continue if '-poster' in file: info['poster'] = utils.AvImage(file) elif '-fanart' in file: info['fanart'] = file #print('file set movie info') #print(info) self.infoView.setMovieInfo(info, self.numCids > 1) def checkCrop(self, info): donotcrop = [] if 'VR' in info['id'] : return False if self.scrapperConfig.get('studios_skipping_crop'): donotcrop = self.scrapperConfig['studios_skipping_crop'].split(',') #print(donotcrop) if not info.get('studio'): return False if info['studio'] in donotcrop: return False if self.scrapperConfig.get('labels_skipping_crop'): donotcrop = self.scrapperConfig['labels_skipping_crop'].split(',') if not info.get('label'): return False if info['label'] in donotcrop: return False return True def updateFromScrapy(self, info): #self.infoView.clearMovieInfo(False, True) try : info['fanart'] = info['thumb'] if self.checkCrop(info): info['poster'] = info['thumb'].cropLeft() else: info['poster'] = info['thumb'] #print('scrapy set movie info') self.infoView.setMovieInfo(info, False) except: traceback.print_exc() def onFoundMovie(self, movieinfo): if isinstance(movieinfo, list): self.updateFromFile(movieinfo) elif isinstance(movieinfo, dict): self.updateFromScrapy(movieinfo) ''' def onScrapDone(self, _): self.numCids = 0 # don't do anything related to Qt UI in scrapy signal. it doesn't work. 
def onSpiderClosed(self, spider): #import pprint as pp #pp.pprint(spider.movieinfo) try: self.crawledMovieInfo = spider.movieinfo except: self.crawledMovieInfo = None ''' def onSpiderClosed(self, spider): self.numCids = 0 self.listView.refresh() def onItemScraped(self, item, response, spider): if not response.meta.get('id'): return cid = response.meta['id'] #print(cid) minfo = dict(item) minfo['path'] = self.fileView.getPath(cid['idx']) if 'releasedate' in minfo: minfo['year'] = minfo['releasedate'].split('-')[0] if 'actor_thumb' in minfo: del minfo['actor_thumb'] #print(response.request.headers) if self.numCids > 1: mfiles = self.fileView.getFiles(cid['idx']) self.updateFromFile(mfiles, cid['idx']) self.updateFromScrapy(minfo) self.infoView.saveMovieInfo() else: self.updateFromScrapy(minfo) #self.onFoundMovie(minfo) def runCrawler(self, **kw): from scrapy import signals site = self.scrapperConfig.get('site', 'r18') if not self.crawler or self.site != site: self.site = site self.crawler = self.runner.create_crawler(site) self.crawler.signals.connect(self.onItemScraped, signals.item_scraped) self.crawler.signals.connect(self.onSpiderClosed, signals.spider_closed) self.runner.crawl(self.crawler, **kw) #deferd = self.runner.crawl(self.crawler, **kw) #deferd.addBoth(self.onScrapDone) def scrap(self, **kw): if kw: kw['outdir'] = self.fileView.rootPath() self.runCrawler(**kw) return selected = self.fileView.getSelectedIndexes() if len(selected) < 1: print('select movie!') return cids = [{'cid':os.path.basename(self.fileView.getPath(idx)), 'idx':idx} for idx in selected] self.numCids = len(cids) #print('num cids:', self.numCids) #print(cids, self.fileView.rootPath()) self.runCrawler(**{'cids':cids}) ''' for index in selected: path = self.fileView.getPath(index) cid = os.path.basename(path) self.runCrawler(**{'keyword':cid, 'outdir':path}) ''' def saveAll(self): self.infoView.saveMovieInfo() self.listView.refresh() def changeDir(self, path): self.fileView.folderList.changeDir(path) def upDir(self): self.fileView.folderList.upDir() def fileRenameTool(self): config = self.globalConfig['FolderView'] from rename_tool import FileRenameDialog dlg = FileRenameDialog(config.get('currdir', ''), self) dlg.exec_()
class CrawlTestCase(TestCase): def setUp(self): self.mockserver = MockServer() self.mockserver.__enter__() self.runner = CrawlerRunner() def tearDown(self): self.mockserver.__exit__(None, None, None) @defer.inlineCallbacks def test_follow_all(self): crawler = self.runner.create_crawler(FollowAllSpider) yield crawler.crawl() self.assertEqual(len(crawler.spider.urls_visited), 11) # 10 + start_url @defer.inlineCallbacks def test_delay(self): # short to long delays yield self._test_delay(0.2, False) yield self._test_delay(1, False) # randoms yield self._test_delay(0.2, True) yield self._test_delay(1, True) @defer.inlineCallbacks def _test_delay(self, delay, randomize): settings = { "DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize } crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider) yield crawler.crawl(maxlatency=delay * 2) t = crawler.spider.times totaltime = t[-1] - t[0] avgd = totaltime / (len(t) - 1) tolerance = 0.6 if randomize else 0.2 self.assertTrue(avgd > delay * (1 - tolerance), "download delay too small: %s" % avgd) @defer.inlineCallbacks def test_timeout_success(self): crawler = self.runner.create_crawler(DelaySpider) yield crawler.crawl(n=0.5) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 > 0) self.assertTrue(crawler.spider.t2 > crawler.spider.t1) @defer.inlineCallbacks def test_timeout_failure(self): crawler = CrawlerRunner({ "DOWNLOAD_TIMEOUT": 0.35 }).create_crawler(DelaySpider) yield crawler.crawl(n=0.5) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 == 0) self.assertTrue(crawler.spider.t2_err > 0) self.assertTrue(crawler.spider.t2_err > crawler.spider.t1) # server hangs after receiving response headers yield crawler.crawl(n=0.5, b=1) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 == 0) self.assertTrue(crawler.spider.t2_err > 0) self.assertTrue(crawler.spider.t2_err > crawler.spider.t1) @defer.inlineCallbacks def test_retry_503(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("http://localhost:8998/status?n=503") self._assert_retried(l) @defer.inlineCallbacks def test_retry_conn_failed(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("http://localhost:65432/status?n=503") self._assert_retried(l) @defer.inlineCallbacks def test_retry_dns_error(self): with mock.patch('socket.gethostbyname', side_effect=socket.gaierror( -5, 'No address associated with hostname')): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("http://example.com/") self._assert_retried(l) @defer.inlineCallbacks def test_start_requests_bug_before_yield(self): with LogCapture('scrapy', level=logging.ERROR) as l: crawler = self.runner.create_crawler(BrokenStartRequestsSpider) yield crawler.crawl(fail_before_yield=1) self.assertEqual(len(l.records), 1) record = l.records[0] self.assertIsNotNone(record.exc_info) self.assertIs(record.exc_info[0], ZeroDivisionError) @defer.inlineCallbacks def test_start_requests_bug_yielding(self): with LogCapture('scrapy', level=logging.ERROR) as l: crawler = self.runner.create_crawler(BrokenStartRequestsSpider) yield crawler.crawl(fail_yielding=1) self.assertEqual(len(l.records), 1) record = l.records[0] self.assertIsNotNone(record.exc_info) self.assertIs(record.exc_info[0], ZeroDivisionError) @defer.inlineCallbacks def test_start_requests_lazyness(self): settings = {"CONCURRENT_REQUESTS": 1} crawler = 
CrawlerRunner(settings).create_crawler( BrokenStartRequestsSpider) yield crawler.crawl() #self.assertTrue(False, crawler.spider.seedsseen) #self.assertTrue(crawler.spider.seedsseen.index(None) < crawler.spider.seedsseen.index(99), # crawler.spider.seedsseen) @defer.inlineCallbacks def test_start_requests_dupes(self): settings = {"CONCURRENT_REQUESTS": 1} crawler = CrawlerRunner(settings).create_crawler( DuplicateStartRequestsSpider) yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3) self.assertEqual(crawler.spider.visited, 6) yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4) self.assertEqual(crawler.spider.visited, 3) @defer.inlineCallbacks def test_unbounded_response(self): # Completeness of responses without Content-Length or Transfer-Encoding # can not be determined, we treat them as valid but flagged as "partial" from six.moves.urllib.parse import urlencode query = urlencode({ 'raw': '''\ HTTP/1.1 200 OK Server: Apache-Coyote/1.1 X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0 Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/ Pragma: no-cache Expires: Thu, 01 Jan 1970 00:00:00 GMT Cache-Control: no-cache Cache-Control: no-store Content-Type: text/html;charset=UTF-8 Content-Language: en Date: Tue, 27 Aug 2013 13:05:05 GMT Connection: close foo body with multiples lines ''' }) crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("http://localhost:8998/raw?{0}".format(query)) self.assertEqual(str(l).count("Got response 200"), 1) @defer.inlineCallbacks def test_retry_conn_lost(self): # connection lost after receiving data crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("http://localhost:8998/drop?abort=0") self._assert_retried(l) @defer.inlineCallbacks def test_retry_conn_aborted(self): # connection lost before receiving data crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("http://localhost:8998/drop?abort=1") self._assert_retried(l) def _assert_retried(self, log): self.assertEqual(str(log).count("Retrying"), 2) self.assertEqual(str(log).count("Gave up retrying"), 1) @defer.inlineCallbacks def test_referer_header(self): """Referer header is set by RefererMiddleware unless it is already set""" req0 = Request('http://localhost:8998/echo?headers=1&body=0', dont_filter=1) req1 = req0.replace() req2 = req0.replace(headers={'Referer': None}) req3 = req0.replace(headers={'Referer': 'http://example.com'}) req0.meta['next'] = req1 req1.meta['next'] = req2 req2.meta['next'] = req3 crawler = self.runner.create_crawler(SingleRequestSpider) yield crawler.crawl(seed=req0) # basic asserts in case of weird communication errors self.assertIn('responses', crawler.spider.meta) self.assertNotIn('failures', crawler.spider.meta) # start requests doesn't set Referer header echo0 = json.loads(to_unicode( crawler.spider.meta['responses'][2].body)) self.assertNotIn('Referer', echo0['headers']) # following request sets Referer to start request url echo1 = json.loads(to_unicode( crawler.spider.meta['responses'][1].body)) self.assertEqual(echo1['headers'].get('Referer'), [req0.url]) # next request avoids Referer header echo2 = json.loads(to_unicode( crawler.spider.meta['responses'][2].body)) self.assertNotIn('Referer', echo2['headers']) # last request explicitly sets a Referer header echo3 = json.loads(to_unicode( crawler.spider.meta['responses'][3].body)) 
self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com']) @defer.inlineCallbacks def test_engine_status(self): from scrapy.utils.engine import get_engine_status est = [] def cb(response): est.append(get_engine_status(crawler.engine)) crawler = self.runner.create_crawler(SingleRequestSpider) yield crawler.crawl(seed='http://localhost:8998/', callback_func=cb) self.assertEqual(len(est), 1, est) s = dict(est[0]) self.assertEqual(s['engine.spider.name'], crawler.spider.name) self.assertEqual(s['len(engine.scraper.slot.active)'], 1) @defer.inlineCallbacks def test_graceful_crawl_error_handling(self): """ Test whether errors happening anywhere in Crawler.crawl() are properly reported (and not somehow swallowed) after a graceful engine shutdown. The errors should not come from within Scrapy's core but from within spiders/middlewares/etc., e.g. raised in Spider.start_requests(), SpiderMiddleware.process_start_requests(), etc. """ class TestError(Exception): pass class FaultySpider(SimpleSpider): def start_requests(self): raise TestError crawler = self.runner.create_crawler(FaultySpider) yield self.assertFailure(crawler.crawl(), TestError) self.assertFalse(crawler.crawling) @defer.inlineCallbacks def test_open_spider_error_on_faulty_pipeline(self): settings = { "ITEM_PIPELINES": { "tests.pipelines.ZeroDivisionErrorPipeline": 300, } } crawler = CrawlerRunner(settings).create_crawler(SimpleSpider) yield self.assertFailure( self.runner.crawl(crawler, "http://localhost:8998/status?n=200"), ZeroDivisionError) self.assertFalse(crawler.crawling) @defer.inlineCallbacks def test_crawlerrunner_accepts_crawler(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as log: yield self.runner.crawl(crawler, "http://localhost:8998/status?n=200") self.assertIn("Got response 200", str(log)) @defer.inlineCallbacks def test_crawl_multiple(self): self.runner.crawl(SimpleSpider, "http://localhost:8998/status?n=200") self.runner.crawl(SimpleSpider, "http://localhost:8998/status?n=503") with LogCapture() as log: yield self.runner.join() self._assert_retried(log) self.assertIn("Got response 200", str(log))
def make_crawler(settings, spider_cls=None, **extra_settings):
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(spider_cls or TestSpider)
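# One way the make_crawler() helper above might be used from a Twisted trial test
# (the settings values, and the expectation that the default TestSpider can be
# crawled without arguments and finishes cleanly, are illustrative assumptions):
# extra keyword arguments become per-test setting overrides on top of the shared
# base settings dict.
from twisted.internet import defer
from twisted.trial.unittest import TestCase


class MakeCrawlerUsageExample(TestCase):

    @defer.inlineCallbacks
    def test_make_crawler_overrides(self):
        crawler = make_crawler({'CONCURRENT_REQUESTS': 1}, RETRY_ENABLED=False)
        yield crawler.crawl()
        self.assertEqual(crawler.stats.get_value('finish_reason'), 'finished')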
class FileDownloadCrawlTestCase(TestCase):
    pipeline_class = 'scrapy.pipelines.files.FilesPipeline'
    store_setting_key = 'FILES_STORE'
    media_key = 'files'
    media_urls_key = 'file_urls'
    expected_checksums = {
        '5547178b89448faf0015a13f904c936e',
        'c2281c83670e31d8aaab7cb642b824db',
        'ed3f6538dc15d4d9179dae57319edc5f',
    }

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

        # prepare a directory for storing files
        self.tmpmediastore = self.mktemp()
        os.mkdir(self.tmpmediastore)
        self.settings = {
            'ITEM_PIPELINES': {self.pipeline_class: 1},
            self.store_setting_key: self.tmpmediastore,
        }
        self.runner = CrawlerRunner(self.settings)
        self.items = []

    def tearDown(self):
        shutil.rmtree(self.tmpmediastore)
        self.items = []
        self.mockserver.__exit__(None, None, None)

    def _on_item_scraped(self, item):
        self.items.append(item)

    def _create_crawler(self, spider_class, **kwargs):
        crawler = self.runner.create_crawler(spider_class, **kwargs)
        crawler.signals.connect(self._on_item_scraped, signals.item_scraped)
        return crawler

    def _assert_files_downloaded(self, items, logs):
        self.assertEqual(len(items), 1)
        self.assertIn(self.media_key, items[0])

        # check that logs show the expected number of successful file downloads
        file_dl_success = 'File (downloaded): Downloaded file from'
        self.assertEqual(logs.count(file_dl_success), 3)

        # check that the images/files status is `downloaded`
        for item in items:
            for i in item[self.media_key]:
                self.assertEqual(i['status'], 'downloaded')

        # check that the images/files checksums are what we know they should be
        if self.expected_checksums is not None:
            checksums = set(i['checksum']
                            for item in items
                            for i in item[self.media_key])
            self.assertEqual(checksums, self.expected_checksums)

        # check that the image files were actually written to the media store
        for item in items:
            for i in item[self.media_key]:
                self.assertTrue(
                    os.path.exists(os.path.join(self.tmpmediastore, i['path'])))

    def _assert_files_download_failure(self, crawler, items, code, logs):
        # check that the item does NOT have the "images/files" field populated
        self.assertEqual(len(items), 1)
        self.assertIn(self.media_key, items[0])
        self.assertFalse(items[0][self.media_key])

        # check that there was 1 successful fetch and 3 other responses with non-200 code
        self.assertEqual(
            crawler.stats.get_value('downloader/request_method_count/GET'), 4)
        self.assertEqual(crawler.stats.get_value('downloader/response_count'), 4)
        self.assertEqual(
            crawler.stats.get_value('downloader/response_status_count/200'), 1)
        self.assertEqual(
            crawler.stats.get_value('downloader/response_status_count/%d' % code), 3)

        # check that logs do show the failure on the file downloads
        file_dl_failure = 'File (code: %d): Error downloading file from' % code
        self.assertEqual(logs.count(file_dl_failure), 3)

        # check that no files were written to the media store
        self.assertEqual(os.listdir(self.tmpmediastore), [])

    @defer.inlineCallbacks
    def test_download_media(self):
        crawler = self._create_crawler(MediaDownloadSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/files/images/"),
                                media_key=self.media_key,
                                media_urls_key=self.media_urls_key)
        self._assert_files_downloaded(self.items, str(log))

    @defer.inlineCallbacks
    def test_download_media_wrong_urls(self):
        crawler = self._create_crawler(BrokenLinksMediaDownloadSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/files/images/"),
                                media_key=self.media_key,
                                media_urls_key=self.media_urls_key)
        self._assert_files_download_failure(crawler, self.items, 404, str(log))

    @defer.inlineCallbacks
    def test_download_media_redirected_default_failure(self):
        crawler = self._create_crawler(RedirectedMediaDownloadSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/files/images/"),
                                media_key=self.media_key,
                                media_urls_key=self.media_urls_key,
                                mockserver=self.mockserver)
        self._assert_files_download_failure(crawler, self.items, 302, str(log))

    @defer.inlineCallbacks
    def test_download_media_redirected_allowed(self):
        settings = dict(self.settings)
        settings.update({'MEDIA_ALLOW_REDIRECTS': True})
        self.runner = CrawlerRunner(settings)

        crawler = self._create_crawler(RedirectedMediaDownloadSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/files/images/"),
                                media_key=self.media_key,
                                media_urls_key=self.media_urls_key,
                                mockserver=self.mockserver)
        self._assert_files_downloaded(self.items, str(log))
        self.assertEqual(
            crawler.stats.get_value('downloader/response_status_count/302'), 3)
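# The class attributes above (pipeline_class, store_setting_key, media_key,
# media_urls_key, expected_checksums) make the whole suite reusable for other
# media pipelines via subclassing. A sketch of such a subclass for the images
# pipeline is shown below; expected_checksums is left as None on the assumption
# that image re-encoding makes the checksums environment-dependent.
class ImageDownloadCrawlTestCase(FileDownloadCrawlTestCase):
    pipeline_class = 'scrapy.pipelines.images.ImagesPipeline'
    store_setting_key = 'IMAGES_STORE'
    media_key = 'images'
    media_urls_key = 'image_urls'
    # image checksums vary with the installed Pillow version, so skip that assertion
    expected_checksums = None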
def make_crawler(spider_cls, **extra_settings):
    settings = Settings()
    settings.setmodule(deepdeep.settings)
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(spider_cls)
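# A note on the helper above: Settings.setmodule() loads the deepdeep project
# defaults and Settings.update() then applies the per-test overrides at the same
# ('project') priority, so the keyword arguments passed to make_crawler() win.
# A small sketch of that layering; the setting name and values are illustrative
# stand-ins, not taken from deepdeep.settings.
from scrapy.settings import Settings

s = Settings()
s.set('CONCURRENT_REQUESTS', 16)        # stands in for a project-wide default
s.update({'CONCURRENT_REQUESTS': 1})    # later update at equal priority replaces it
assert s.getint('CONCURRENT_REQUESTS') == 1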
class FileDownloadCrawlTestCase(TestCase):
    pipeline_class = 'scrapy.pipelines.files.FilesPipeline'
    store_setting_key = 'FILES_STORE'
    media_key = 'files'
    media_urls_key = 'file_urls'
    expected_checksums = set([
        '5547178b89448faf0015a13f904c936e',
        'c2281c83670e31d8aaab7cb642b824db',
        'ed3f6538dc15d4d9179dae57319edc5f'])

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

        # prepare a directory for storing files
        self.tmpmediastore = self.mktemp()
        os.mkdir(self.tmpmediastore)
        self.settings = {
            'ITEM_PIPELINES': {self.pipeline_class: 1},
            self.store_setting_key: self.tmpmediastore,
        }
        self.runner = CrawlerRunner(self.settings)
        self.items = []

    def tearDown(self):
        shutil.rmtree(self.tmpmediastore)
        self.items = []
        self.mockserver.__exit__(None, None, None)

    def _on_item_scraped(self, item):
        self.items.append(item)

    def _create_crawler(self, spider_class, **kwargs):
        crawler = self.runner.create_crawler(spider_class, **kwargs)
        crawler.signals.connect(self._on_item_scraped, signals.item_scraped)
        return crawler

    def _assert_files_downloaded(self, items, logs):
        self.assertEqual(len(items), 1)
        self.assertIn(self.media_key, items[0])

        # check that logs show the expected number of successful file downloads
        file_dl_success = 'File (downloaded): Downloaded file from'
        self.assertEqual(logs.count(file_dl_success), 3)

        # check that the images/files checksums are what we know they should be
        if self.expected_checksums is not None:
            checksums = set(i['checksum']
                            for item in items
                            for i in item[self.media_key])
            self.assertEqual(checksums, self.expected_checksums)

        # check that the image files were actually written to the media store
        for item in items:
            for i in item[self.media_key]:
                self.assertTrue(
                    os.path.exists(os.path.join(self.tmpmediastore, i['path'])))

    def _assert_files_download_failure(self, crawler, items, code, logs):
        # check that the item does NOT have the "images/files" field populated
        self.assertEqual(len(items), 1)
        self.assertIn(self.media_key, items[0])
        self.assertFalse(items[0][self.media_key])

        # check that there was 1 successful fetch and 3 other responses with non-200 code
        self.assertEqual(crawler.stats.get_value('downloader/request_method_count/GET'), 4)
        self.assertEqual(crawler.stats.get_value('downloader/response_count'), 4)
        self.assertEqual(crawler.stats.get_value('downloader/response_status_count/200'), 1)
        self.assertEqual(crawler.stats.get_value('downloader/response_status_count/%d' % code), 3)

        # check that logs do show the failure on the file downloads
        file_dl_failure = 'File (code: %d): Error downloading file from' % code
        self.assertEqual(logs.count(file_dl_failure), 3)

        # check that no files were written to the media store
        self.assertEqual(os.listdir(self.tmpmediastore), [])

    @defer.inlineCallbacks
    def test_download_media(self):
        crawler = self._create_crawler(MediaDownloadSpider)
        with LogCapture() as log:
            yield crawler.crawl("http://localhost:8998/files/images/",
                                media_key=self.media_key,
                                media_urls_key=self.media_urls_key)
        self._assert_files_downloaded(self.items, str(log))

    @defer.inlineCallbacks
    def test_download_media_wrong_urls(self):
        crawler = self._create_crawler(BrokenLinksMediaDownloadSpider)
        with LogCapture() as log:
            yield crawler.crawl("http://localhost:8998/files/images/",
                                media_key=self.media_key,
                                media_urls_key=self.media_urls_key)
        self._assert_files_download_failure(crawler, self.items, 404, str(log))

    @defer.inlineCallbacks
    def test_download_media_redirected_default_failure(self):
        crawler = self._create_crawler(RedirectedMediaDownloadSpider)
        with LogCapture() as log:
            yield crawler.crawl("http://localhost:8998/files/images/",
                                media_key=self.media_key,
                                media_urls_key=self.media_urls_key)
        self._assert_files_download_failure(crawler, self.items, 302, str(log))

    @defer.inlineCallbacks
    def test_download_media_redirected_allowed(self):
        settings = dict(self.settings)
        settings.update({'MEDIA_ALLOW_REDIRECTS': True})
        self.runner = CrawlerRunner(settings)

        crawler = self._create_crawler(RedirectedMediaDownloadSpider)
        with LogCapture() as log:
            yield crawler.crawl("http://localhost:8998/files/images/",
                                media_key=self.media_key,
                                media_urls_key=self.media_urls_key)
        self._assert_files_downloaded(self.items, str(log))
        self.assertEqual(crawler.stats.get_value('downloader/response_status_count/302'), 3)
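# All of the snippets above drive CrawlerRunner from Twisted's trial framework.
# Outside of tests, the same API is typically driven from a plain script by
# starting and stopping the reactor manually. A minimal sketch of that pattern;
# the spider and its target site are placeholder assumptions for illustration.
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from twisted.internet import reactor


class QuotesSpider(scrapy.Spider):
    # hypothetical spider used only for this sketch
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for quote in response.css('div.quote span.text::text').getall():
            yield {'text': quote}


configure_logging()
runner = CrawlerRunner()
d = runner.crawl(QuotesSpider)          # returns a Deferred
d.addBoth(lambda _: reactor.stop())     # stop the reactor when the crawl finishes
reactor.run()                           # blocks until the crawl is done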