class TestCatching(TestCase):
    """Verify that SignalCatcherSpider observes its signal exactly once per
    crawl, across success, timeout, disconnect and DNS-failure scenarios."""

    def setUp(self):
        # Bring up the local mock HTTP server for the duration of each test.
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    def _assert_caught_once(self, crawler):
        # Every scenario is expected to fire the watched signal exactly once.
        self.assertEqual(crawler.spider.caught_times, 1)

    @defer.inlineCallbacks
    def test_success(self):
        # Plain 200 response from the mock server.
        crawler = get_crawler(SignalCatcherSpider)
        yield crawler.crawl(self.mockserver.url("/status?n=200"))
        self._assert_caught_once(crawler)

    @defer.inlineCallbacks
    def test_timeout(self):
        # Server delay (0.2s) exceeds the configured download timeout (0.1s).
        crawler = get_crawler(SignalCatcherSpider, {"DOWNLOAD_TIMEOUT": 0.1})
        yield crawler.crawl(self.mockserver.url("/delay?n=0.2"))
        self._assert_caught_once(crawler)

    @defer.inlineCallbacks
    def test_disconnect(self):
        # Server drops the connection mid-response.
        crawler = get_crawler(SignalCatcherSpider)
        yield crawler.crawl(self.mockserver.url("/drop"))
        self._assert_caught_once(crawler)

    @defer.inlineCallbacks
    def test_noconnect(self):
        # DNS resolution failure: the domain intentionally does not exist.
        crawler = get_crawler(SignalCatcherSpider)
        yield crawler.crawl("http://thereisdefinetelynosuchdomain.com")
        self._assert_caught_once(crawler)
class ProxyConnectTestCase(TestCase):
    """End-to-end HTTPS CONNECT tunneling tests routed through a local
    mitmproxy instance (configured via the http(s)_proxy env variables)."""

    def setUp(self):
        # Skip instead of erroring when the optional mitmproxy dependency is
        # absent — consistent with the other ProxyConnectTestCase in this file.
        try:
            import mitmproxy  # noqa: F401
        except ImportError:
            self.skipTest('mitmproxy is not installed')
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        # Snapshot the environment so tearDown can restore the proxy settings.
        self._oldenv = os.environ.copy()
        self._proxy = MitmProxy()
        proxy_url = self._proxy.start()
        os.environ['https_proxy'] = proxy_url
        os.environ['http_proxy'] = proxy_url

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)
        self._proxy.stop()
        os.environ = self._oldenv

    @defer.inlineCallbacks
    def test_https_connect_tunnel(self):
        # A tunneled HTTPS request should succeed end to end.
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200", is_secure=True))
        self._assert_got_response_code(200, log)

    # Fixed: the original condition checked only ``sys.version_info.minor >= 6``,
    # which mis-evaluates for any non-3.x interpreter; compare the full tuple.
    @pytest.mark.xfail(reason='Python 3.6+ fails this earlier',
                       condition=sys.version_info >= (3, 6))
    @defer.inlineCallbacks
    def test_https_connect_tunnel_error(self):
        # Port 99999 is invalid, so tunnel establishment must fail.
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl("https://localhost:99999/status?n=200")
        self._assert_got_tunnel_error(log)

    @defer.inlineCallbacks
    def test_https_tunnel_auth_error(self):
        os.environ['https_proxy'] = _wrong_credentials(os.environ['https_proxy'])
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200", is_secure=True))
        # The proxy returns a 407 error code but it does not reach the client;
        # he just sees a TunnelError.
        self._assert_got_tunnel_error(log)

    @defer.inlineCallbacks
    def test_https_tunnel_without_leak_proxy_authorization_header(self):
        request = Request(self.mockserver.url("/echo", is_secure=True))
        crawler = get_crawler(SingleRequestSpider)
        with LogCapture() as log:
            yield crawler.crawl(seed=request)
        self._assert_got_response_code(200, log)
        echo = json.loads(crawler.spider.meta['responses'][0].text)
        # Proxy credentials must not be forwarded to the remote host.
        self.assertNotIn('Proxy-Authorization', echo['headers'])

    def _assert_got_response_code(self, code, log):
        print(log)
        self.assertEqual(str(log).count('Crawled (%d)' % code), 1)

    def _assert_got_tunnel_error(self, log):
        print(log)
        self.assertIn('TunnelError', str(log))
class ProxyConnectTestCase(TestCase):
    """Exercise HTTPS requests tunneled through a local mitmproxy via the
    CONNECT method, driven by the http(s)_proxy environment variables."""

    def setUp(self):
        try:
            import mitmproxy  # noqa: F401
        except ImportError:
            self.skipTest('mitmproxy is not installed')
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        # Keep a copy of the environment so it can be restored afterwards.
        self._oldenv = os.environ.copy()
        self._proxy = MitmProxy()
        url = self._proxy.start()
        # Route both plain and secure traffic through the proxy.
        for variable in ('https_proxy', 'http_proxy'):
            os.environ[variable] = url

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)
        self._proxy.stop()
        os.environ = self._oldenv

    @defer.inlineCallbacks
    def test_https_connect_tunnel(self):
        # Happy path: a tunneled HTTPS fetch yields a single 200 crawl.
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(
                self.mockserver.url("/status?n=200", is_secure=True))
        self._assert_got_response_code(200, log)

    @defer.inlineCallbacks
    def test_https_tunnel_auth_error(self):
        os.environ['https_proxy'] = _wrong_credentials(
            os.environ['https_proxy'])
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(
                self.mockserver.url("/status?n=200", is_secure=True))
        # The proxy returns a 407 error code but it does not reach the client;
        # he just sees a TunnelError.
        self._assert_got_tunnel_error(log)

    @defer.inlineCallbacks
    def test_https_tunnel_without_leak_proxy_authorization_header(self):
        seed = Request(self.mockserver.url("/echo", is_secure=True))
        crawler = get_crawler(SingleRequestSpider)
        with LogCapture() as log:
            yield crawler.crawl(seed=seed)
        self._assert_got_response_code(200, log)
        # The echoed headers must not contain the proxy credentials.
        echo = json.loads(crawler.spider.meta['responses'][0].text)
        self.assertNotIn('Proxy-Authorization', echo['headers'])

    def _assert_got_response_code(self, code, log):
        print(log)
        self.assertEqual(str(log).count(f'Crawled ({code})'), 1)

    def _assert_got_tunnel_error(self, log):
        print(log)
        self.assertIn('TunnelError', str(log))
class Http11MockServerTestCase(unittest.TestCase):
    """HTTP 1.1 test case with MockServer"""

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_download_with_content_length(self):
        crawler = get_crawler(SingleRequestSpider)
        # http://localhost:8998/partial set Content-Length to 1024, use
        # download_maxsize=1000 to avoid download it
        yield crawler.crawl(seed=Request(url=self.mockserver.url('/partial'),
                                         meta={'download_maxsize': 1000}))
        failure = crawler.spider.meta['failure']
        self.assertIsInstance(failure.value, defer.CancelledError)

    @defer.inlineCallbacks
    def test_download(self):
        crawler = get_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=Request(url=self.mockserver.url('')))
        failure = crawler.spider.meta.get('failure')
        # Fixed: was assertTrue(failure == None); assertIsNone is the proper
        # assertion and reports a clearer message on failure.
        self.assertIsNone(failure)
        reason = crawler.spider.meta['close_reason']
        # Fixed: assertTrue(reason, 'finished') treated 'finished' as the
        # assertion *message*, so the value was never actually compared.
        self.assertEqual(reason, 'finished')

    @defer.inlineCallbacks
    def test_download_gzip_response(self):
        crawler = get_crawler(SingleRequestSpider)
        body = b'1' * 100  # PayloadResource requires body length to be 100
        request = Request(self.mockserver.url('/payload'), method='POST',
                          body=body, meta={'download_maxsize': 50})
        yield crawler.crawl(seed=request)
        failure = crawler.spider.meta['failure']
        # download_maxsize < 100, hence the CancelledError
        self.assertIsInstance(failure.value, defer.CancelledError)

        if six.PY2:
            request.headers.setdefault(b'Accept-Encoding', b'gzip,deflate')
            request = request.replace(url=self.mockserver.url('/xpayload'))
            yield crawler.crawl(seed=request)
            # download_maxsize = 50 is enough for the gzipped response
            failure = crawler.spider.meta.get('failure')
            self.assertIsNone(failure)
            reason = crawler.spider.meta['close_reason']
            # Same assertTrue-message fix as in test_download above.
            self.assertEqual(reason, 'finished')
        else:
            # See issue https://twistedmatrix.com/trac/ticket/8175
            raise unittest.SkipTest("xpayload only enabled for PY2")
class FileDownloadCrawlTestCase(TestCase):
    """Crawl a page of file links through the FilesPipeline and verify both
    the scraped items and the files written to a temporary media store.

    Subclasses can re-target the test at another media pipeline (e.g. images)
    by overriding the class attributes below.
    """

    # Pipeline under test and the settings/item keys it uses.
    pipeline_class = 'scrapy.pipelines.files.FilesPipeline'
    store_setting_key = 'FILES_STORE'
    media_key = 'files'
    media_urls_key = 'file_urls'
    # MD5 checksums of the three fixture files served by the mock server;
    # set to None in a subclass to skip checksum verification.
    expected_checksums = {
        '5547178b89448faf0015a13f904c936e',
        'c2281c83670e31d8aaab7cb642b824db',
        'ed3f6538dc15d4d9179dae57319edc5f'}

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        # prepare a directory for storing files
        self.tmpmediastore = self.mktemp()
        os.mkdir(self.tmpmediastore)
        self.settings = {
            'ITEM_PIPELINES': {self.pipeline_class: 1},
            self.store_setting_key: self.tmpmediastore,
        }
        self.runner = CrawlerRunner(self.settings)
        self.items = []

    def tearDown(self):
        shutil.rmtree(self.tmpmediastore)
        self.items = []
        self.mockserver.__exit__(None, None, None)

    def _on_item_scraped(self, item):
        # Signal handler: collect every scraped item for later assertions.
        self.items.append(item)

    def _create_crawler(self, spider_class, **kwargs):
        """Create a crawler wired to record scraped items in self.items."""
        crawler = self.runner.create_crawler(spider_class, **kwargs)
        crawler.signals.connect(self._on_item_scraped, signals.item_scraped)
        return crawler

    def _assert_files_downloaded(self, items, logs):
        """Assert one item with three successfully stored media files."""
        self.assertEqual(len(items), 1)
        self.assertIn(self.media_key, items[0])

        # check that logs show the expected number of successful file downloads
        file_dl_success = 'File (downloaded): Downloaded file from'
        self.assertEqual(logs.count(file_dl_success), 3)

        # check that the images/files status is `downloaded`
        for item in items:
            for i in item[self.media_key]:
                self.assertEqual(i['status'], 'downloaded')

        # check that the images/files checksums are what we know they should be
        if self.expected_checksums is not None:
            checksums = set(i['checksum']
                            for item in items
                            for i in item[self.media_key])
            self.assertEqual(checksums, self.expected_checksums)

        # check that the image files where actually written to the media store
        for item in items:
            for i in item[self.media_key]:
                self.assertTrue(
                    os.path.exists(os.path.join(self.tmpmediastore, i['path'])))

    def _assert_files_download_failure(self, crawler, items, code, logs):
        """Assert that all three media downloads failed with `code` and that
        nothing was stored."""

        # check that the item does NOT have the "images/files" field populated
        self.assertEqual(len(items), 1)
        self.assertIn(self.media_key, items[0])
        self.assertFalse(items[0][self.media_key])

        # check that there was 1 successful fetch and 3 other responses with non-200 code
        self.assertEqual(
            crawler.stats.get_value('downloader/request_method_count/GET'), 4)
        self.assertEqual(crawler.stats.get_value('downloader/response_count'), 4)
        self.assertEqual(
            crawler.stats.get_value('downloader/response_status_count/200'), 1)
        self.assertEqual(
            crawler.stats.get_value('downloader/response_status_count/%d' % code), 3)

        # check that logs do show the failure on the file downloads
        file_dl_failure = 'File (code: %d): Error downloading file from' % code
        self.assertEqual(logs.count(file_dl_failure), 3)

        # check that no files were written to the media store
        self.assertEqual(os.listdir(self.tmpmediastore), [])

    @defer.inlineCallbacks
    def test_download_media(self):
        crawler = self._create_crawler(MediaDownloadSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/files/images/"),
                                media_key=self.media_key,
                                media_urls_key=self.media_urls_key)
        self._assert_files_downloaded(self.items, str(log))

    @defer.inlineCallbacks
    def test_download_media_wrong_urls(self):
        # The spider links to URLs that 404.
        crawler = self._create_crawler(BrokenLinksMediaDownloadSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/files/images/"),
                                media_key=self.media_key,
                                media_urls_key=self.media_urls_key)
        self._assert_files_download_failure(crawler, self.items, 404, str(log))

    @defer.inlineCallbacks
    def test_download_media_redirected_default_failure(self):
        # By default media redirects (302) are treated as download failures.
        crawler = self._create_crawler(RedirectedMediaDownloadSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/files/images/"),
                                media_key=self.media_key,
                                media_urls_key=self.media_urls_key,
                                mockserver=self.mockserver)
        self._assert_files_download_failure(crawler, self.items, 302, str(log))

    @defer.inlineCallbacks
    def test_download_media_redirected_allowed(self):
        # With MEDIA_ALLOW_REDIRECTS the same crawl succeeds; a fresh runner
        # is needed because the setting must be present at crawler creation.
        settings = dict(self.settings)
        settings.update({'MEDIA_ALLOW_REDIRECTS': True})
        self.runner = CrawlerRunner(settings)
        crawler = self._create_crawler(RedirectedMediaDownloadSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/files/images/"),
                                media_key=self.media_key,
                                media_urls_key=self.media_urls_key,
                                mockserver=self.mockserver)
        self._assert_files_downloaded(self.items, str(log))
        self.assertEqual(
            crawler.stats.get_value('downloader/response_status_count/302'), 3)
class CrawlTestCase(TestCase):
    """End-to-end crawling behaviour tests (delays, timeouts, retries,
    referer handling, engine status, error propagation) against MockServer."""

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        crawler = self.runner.create_crawler(FollowAllSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertEqual(len(crawler.spider.urls_visited), 11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_fixed_delay(self):
        yield self._test_delay(total=3, delay=0.1)

    @defer.inlineCallbacks
    def test_randomized_delay(self):
        yield self._test_delay(total=3, delay=0.1, randomize=True)

    @defer.inlineCallbacks
    def _test_delay(self, total, delay, randomize=False):
        """Crawl `total` pages and check the average inter-request time
        respects DOWNLOAD_DELAY (with a wider tolerance when randomized)."""
        crawl_kwargs = dict(
            maxlatency=delay * 2,
            mockserver=self.mockserver,
            total=total,
        )
        tolerance = (1 - (0.6 if randomize else 0.2))
        settings = {"DOWNLOAD_DELAY": delay,
                    'RANDOMIZE_DOWNLOAD_DELAY': randomize}
        crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
        yield crawler.crawl(**crawl_kwargs)
        times = crawler.spider.times
        total_time = times[-1] - times[0]
        average = total_time / (len(times) - 1)
        self.assertTrue(average > delay * tolerance,
                        "download delay too small: %s" % average)

        # Ensure that the same test parameters would cause a failure if no
        # download delay is set. Otherwise, it means we are using a combination
        # of ``total`` and ``delay`` values that are too small for the test
        # code above to have any meaning.
        settings["DOWNLOAD_DELAY"] = 0
        crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
        yield crawler.crawl(**crawl_kwargs)
        times = crawler.spider.times
        total_time = times[-1] - times[0]
        average = total_time / (len(times) - 1)
        self.assertFalse(average > delay / tolerance,
                         "test total or delay values are too small")

    @defer.inlineCallbacks
    def test_timeout_success(self):
        crawler = self.runner.create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 > 0)
        self.assertTrue(crawler.spider.t2 > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
        # Timeout (0.35s) is shorter than the server delay (0.5s).
        crawler = CrawlerRunner({"DOWNLOAD_TIMEOUT": 0.35}).create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
        # server hangs after receiving response headers
        yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/status?n=503"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl("http://localhost:65432/status?n=503",
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            # try to fetch the homepage of a non-existent domain
            yield crawler.crawl("http://dns.resolution.invalid./",
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_start_requests_bug_before_yield(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_before_yield=1, mockserver=self.mockserver)
        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_bug_yielding(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_yielding=1, mockserver=self.mockserver)
        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_lazyness(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(
            BrokenStartRequestsSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        #self.assertTrue(False, crawler.spider.seedsseen)
        #self.assertTrue(crawler.spider.seedsseen.index(None) < crawler.spider.seedsseen.index(99),
        #                crawler.spider.seedsseen)

    @defer.inlineCallbacks
    def test_start_requests_dupes(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(
            DuplicateStartRequestsSpider)
        yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3,
                            mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 6)

        yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4,
                            mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 3)

    @defer.inlineCallbacks
    def test_unbounded_response(self):
        # Completeness of responses without Content-Length or Transfer-Encoding
        # can not be determined, we treat them as valid but flagged as "partial"
        from urllib.parse import urlencode
        query = urlencode({
            'raw': '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''
        })
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/raw?{0}".format(query)),
                                mockserver=self.mockserver)
        self.assertEqual(str(l).count("Got response 200"), 1)

    @defer.inlineCallbacks
    def test_retry_conn_lost(self):
        # connection lost after receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/drop?abort=0"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_aborted(self):
        # connection lost before receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/drop?abort=1"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    def _assert_retried(self, log):
        # Default retry policy: two retries, then give up.
        self.assertEqual(str(log).count("Retrying"), 2)
        self.assertEqual(str(log).count("Gave up retrying"), 1)

    @defer.inlineCallbacks
    def test_referer_header(self):
        """Referer header is set by RefererMiddleware unless it is already set"""
        req0 = Request(self.mockserver.url('/echo?headers=1&body=0'), dont_filter=1)
        req1 = req0.replace()
        req2 = req0.replace(headers={'Referer': None})
        req3 = req0.replace(headers={'Referer': 'http://example.com'})
        req0.meta['next'] = req1
        req1.meta['next'] = req2
        req2.meta['next'] = req3
        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=req0, mockserver=self.mockserver)
        # basic asserts in case of weird communication errors
        self.assertIn('responses', crawler.spider.meta)
        self.assertNotIn('failures', crawler.spider.meta)
        # start requests doesn't set Referer header
        # Fixed: was responses[2], which re-checked the third response instead
        # of the seed request's echo.
        echo0 = json.loads(to_unicode(
            crawler.spider.meta['responses'][0].body))
        self.assertNotIn('Referer', echo0['headers'])
        # following request sets Referer to start request url
        echo1 = json.loads(to_unicode(
            crawler.spider.meta['responses'][1].body))
        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
        # next request avoids Referer header
        echo2 = json.loads(to_unicode(
            crawler.spider.meta['responses'][2].body))
        self.assertNotIn('Referer', echo2['headers'])
        # last request explicitly sets a Referer header
        echo3 = json.loads(to_unicode(
            crawler.spider.meta['responses'][3].body))
        self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])

    @defer.inlineCallbacks
    def test_engine_status(self):
        from scrapy.utils.engine import get_engine_status
        est = []

        def cb(response):
            # Capture engine status while a response is being processed.
            est.append(get_engine_status(crawler.engine))

        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=self.mockserver.url('/'), callback_func=cb,
                            mockserver=self.mockserver)
        self.assertEqual(len(est), 1, est)
        s = dict(est[0])
        self.assertEqual(s['engine.spider.name'], crawler.spider.name)
        self.assertEqual(s['len(engine.scraper.slot.active)'], 1)

    @defer.inlineCallbacks
    def test_graceful_crawl_error_handling(self):
        """
        Test whether errors happening anywhere in Crawler.crawl() are properly
        reported (and not somehow swallowed) after a graceful engine shutdown.
        The errors should not come from within Scrapy's core but from within
        spiders/middlewares/etc., e.g. raised in Spider.start_requests(),
        SpiderMiddleware.process_start_requests(), etc.
        """
        class TestError(Exception):
            pass

        class FaultySpider(SimpleSpider):
            def start_requests(self):
                raise TestError

        crawler = self.runner.create_crawler(FaultySpider)
        yield self.assertFailure(crawler.crawl(mockserver=self.mockserver), TestError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_open_spider_error_on_faulty_pipeline(self):
        settings = {
            "ITEM_PIPELINES": {
                "tests.pipelines.ZeroDivisionErrorPipeline": 300,
            }
        }
        crawler = CrawlerRunner(settings).create_crawler(SimpleSpider)
        yield self.assertFailure(
            self.runner.crawl(crawler, self.mockserver.url("/status?n=200"),
                              mockserver=self.mockserver),
            ZeroDivisionError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_crawlerrunner_accepts_crawler(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield self.runner.crawl(crawler, self.mockserver.url("/status?n=200"),
                                    mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))

    @defer.inlineCallbacks
    def test_crawl_multiple(self):
        self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=200"),
                          mockserver=self.mockserver)
        self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=503"),
                          mockserver=self.mockserver)

        with LogCapture() as log:
            yield self.runner.join()

        self._assert_retried(log)
        self.assertIn("Got response 200", str(log))

    @defer.inlineCallbacks
    def test_crawlspider_with_errback(self):
        self.runner.crawl(CrawlSpiderWithErrback, mockserver=self.mockserver)
        with LogCapture() as log:
            yield self.runner.join()
        self.assertIn("[callback] status 200", str(log))
        self.assertIn("[callback] status 201", str(log))
        self.assertIn("[errback] status 404", str(log))
        self.assertIn("[errback] status 500", str(log))

    @defer.inlineCallbacks
    def test_async_def_parse(self):
        self.runner.crawl(AsyncDefSpider,
                          self.mockserver.url("/status?n=200"),
                          mockserver=self.mockserver)
        with LogCapture() as log:
            yield self.runner.join()
        self.assertIn("Got response 200", str(log))

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse(self):
        runner = CrawlerRunner({"ASYNCIO_REACTOR": True})
        runner.crawl(AsyncDefAsyncioSpider,
                     self.mockserver.url("/status?n=200"),
                     mockserver=self.mockserver)
        with LogCapture() as log:
            yield runner.join()
        self.assertIn("Got response 200", str(log))

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse_list(self):
        items = []

        def _on_item_scraped(item):
            items.append(item)

        crawler = self.runner.create_crawler(AsyncDefAsyncioReturnSpider)
        crawler.signals.connect(_on_item_scraped, signals.item_scraped)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200"),
                                mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))
        self.assertIn({'id': 1}, items)
        self.assertIn({'id': 2}, items)
class CrawlTestCase(TestCase):
    """Older (six-era) copy of the end-to-end crawl behaviour tests:
    delays, timeouts, retries, referer handling and error propagation."""

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        crawler = self.runner.create_crawler(FollowAllSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertEqual(len(crawler.spider.urls_visited), 11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_delay(self):
        # short to long delays
        yield self._test_delay(0.2, False)
        yield self._test_delay(1, False)
        # randoms
        yield self._test_delay(0.2, True)
        yield self._test_delay(1, True)

    @defer.inlineCallbacks
    def _test_delay(self, delay, randomize):
        """Check that the average inter-request gap respects DOWNLOAD_DELAY."""
        settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
        crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
        yield crawler.crawl(maxlatency=delay * 2, mockserver=self.mockserver)
        t = crawler.spider.times
        totaltime = t[-1] - t[0]
        avgd = totaltime / (len(t) - 1)
        # Randomized delays vary between 0.5x and 1.5x, so allow more slack.
        tolerance = 0.6 if randomize else 0.2
        self.assertTrue(avgd > delay * (1 - tolerance),
                        "download delay too small: %s" % avgd)

    @defer.inlineCallbacks
    def test_timeout_success(self):
        crawler = self.runner.create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 > 0)
        self.assertTrue(crawler.spider.t2 > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
        # Timeout (0.35s) is shorter than the server delay (0.5s).
        crawler = CrawlerRunner({"DOWNLOAD_TIMEOUT": 0.35}).create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
        # server hangs after receiving response headers
        yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/status?n=503"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl("http://localhost:65432/status?n=503",
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            # try to fetch the homepage of a non-existent domain
            yield crawler.crawl("http://dns.resolution.invalid./",
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_start_requests_bug_before_yield(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_before_yield=1, mockserver=self.mockserver)
        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_bug_yielding(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_yielding=1, mockserver=self.mockserver)
        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_lazyness(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(BrokenStartRequestsSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        #self.assertTrue(False, crawler.spider.seedsseen)
        #self.assertTrue(crawler.spider.seedsseen.index(None) < crawler.spider.seedsseen.index(99),
        #                crawler.spider.seedsseen)

    @defer.inlineCallbacks
    def test_start_requests_dupes(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(DuplicateStartRequestsSpider)
        yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3,
                            mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 6)

        yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4,
                            mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 3)

    @defer.inlineCallbacks
    def test_unbounded_response(self):
        # Completeness of responses without Content-Length or Transfer-Encoding
        # can not be determined, we treat them as valid but flagged as "partial"
        from six.moves.urllib.parse import urlencode
        query = urlencode({'raw': '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''})
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/raw?{0}".format(query)),
                                mockserver=self.mockserver)
        self.assertEqual(str(l).count("Got response 200"), 1)

    @defer.inlineCallbacks
    def test_retry_conn_lost(self):
        # connection lost after receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/drop?abort=0"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_aborted(self):
        # connection lost before receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/drop?abort=1"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    def _assert_retried(self, log):
        # Default retry policy: two retries, then give up.
        self.assertEqual(str(log).count("Retrying"), 2)
        self.assertEqual(str(log).count("Gave up retrying"), 1)

    @defer.inlineCallbacks
    def test_referer_header(self):
        """Referer header is set by RefererMiddleware unless it is already set"""
        req0 = Request(self.mockserver.url('/echo?headers=1&body=0'), dont_filter=1)
        req1 = req0.replace()
        req2 = req0.replace(headers={'Referer': None})
        req3 = req0.replace(headers={'Referer': 'http://example.com'})
        req0.meta['next'] = req1
        req1.meta['next'] = req2
        req2.meta['next'] = req3
        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=req0, mockserver=self.mockserver)
        # basic asserts in case of weird communication errors
        self.assertIn('responses', crawler.spider.meta)
        self.assertNotIn('failures', crawler.spider.meta)
        # start requests doesn't set Referer header
        # Fixed: was responses[2], which re-checked the third response instead
        # of the seed request's echo.
        echo0 = json.loads(to_unicode(crawler.spider.meta['responses'][0].body))
        self.assertNotIn('Referer', echo0['headers'])
        # following request sets Referer to start request url
        echo1 = json.loads(to_unicode(crawler.spider.meta['responses'][1].body))
        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
        # next request avoids Referer header
        echo2 = json.loads(to_unicode(crawler.spider.meta['responses'][2].body))
        self.assertNotIn('Referer', echo2['headers'])
        # last request explicitly sets a Referer header
        echo3 = json.loads(to_unicode(crawler.spider.meta['responses'][3].body))
        self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])

    @defer.inlineCallbacks
    def test_engine_status(self):
        from scrapy.utils.engine import get_engine_status
        est = []

        def cb(response):
            # Capture engine status while a response is being processed.
            est.append(get_engine_status(crawler.engine))

        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=self.mockserver.url('/'), callback_func=cb,
                            mockserver=self.mockserver)
        self.assertEqual(len(est), 1, est)
        s = dict(est[0])
        self.assertEqual(s['engine.spider.name'], crawler.spider.name)
        self.assertEqual(s['len(engine.scraper.slot.active)'], 1)

    @defer.inlineCallbacks
    def test_graceful_crawl_error_handling(self):
        """
        Test whether errors happening anywhere in Crawler.crawl() are properly
        reported (and not somehow swallowed) after a graceful engine shutdown.
        The errors should not come from within Scrapy's core but from within
        spiders/middlewares/etc., e.g. raised in Spider.start_requests(),
        SpiderMiddleware.process_start_requests(), etc.
        """
        class TestError(Exception):
            pass

        class FaultySpider(SimpleSpider):
            def start_requests(self):
                raise TestError

        crawler = self.runner.create_crawler(FaultySpider)
        yield self.assertFailure(crawler.crawl(mockserver=self.mockserver), TestError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_open_spider_error_on_faulty_pipeline(self):
        settings = {
            "ITEM_PIPELINES": {
                "tests.pipelines.ZeroDivisionErrorPipeline": 300,
            }
        }
        crawler = CrawlerRunner(settings).create_crawler(SimpleSpider)
        yield self.assertFailure(
            self.runner.crawl(crawler, self.mockserver.url("/status?n=200"),
                              mockserver=self.mockserver),
            ZeroDivisionError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_crawlerrunner_accepts_crawler(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield self.runner.crawl(crawler, self.mockserver.url("/status?n=200"),
                                    mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))

    @defer.inlineCallbacks
    def test_crawl_multiple(self):
        self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=200"),
                          mockserver=self.mockserver)
        self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=503"),
                          mockserver=self.mockserver)

        with LogCapture() as log:
            yield self.runner.join()

        self._assert_retried(log)
        self.assertIn("Got response 200", str(log))
class CrawlSpiderTestCase(TestCase):
    """Integration tests run against a local MockServer: CrawlSpider rules/errbacks,
    async-def and async-generator callbacks, TLS certificates, resolved IP addresses,
    and the bytes_received / headers_received StopDownload signals.
    """

    def setUp(self):
        # MockServer is used as a context manager; tearDown performs the __exit__.
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def _run_spider(self, spider_cls):
        """Crawl /status?n=200 with spider_cls; return (captured log, scraped items, stats)."""
        items = []

        def _on_item_scraped(item):
            items.append(item)

        crawler = self.runner.create_crawler(spider_cls)
        crawler.signals.connect(_on_item_scraped, signals.item_scraped)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200"), mockserver=self.mockserver)
        return log, items, crawler.stats

    @defer.inlineCallbacks
    def test_crawlspider_with_parse(self):
        # CrawlSpider subclass that defines its own parse() method.
        self.runner.crawl(CrawlSpiderWithParseMethod, mockserver=self.mockserver)
        with LogCapture() as log:
            yield self.runner.join()
        self.assertIn("[parse] status 200 (foo: None)", str(log))
        self.assertIn("[parse] status 201 (foo: None)", str(log))
        self.assertIn("[parse] status 202 (foo: bar)", str(log))

    @defer.inlineCallbacks
    def test_crawlspider_with_errback(self):
        # Successful statuses go through parse, error statuses through the errback.
        self.runner.crawl(CrawlSpiderWithErrback, mockserver=self.mockserver)
        with LogCapture() as log:
            yield self.runner.join()
        self.assertIn("[parse] status 200 (foo: None)", str(log))
        self.assertIn("[parse] status 201 (foo: None)", str(log))
        self.assertIn("[parse] status 202 (foo: bar)", str(log))
        self.assertIn("[errback] status 404", str(log))
        self.assertIn("[errback] status 500", str(log))
        self.assertIn("[errback] status 501", str(log))

    @defer.inlineCallbacks
    def test_async_def_parse(self):
        # Plain "async def parse" works on the default reactor.
        self.runner.crawl(AsyncDefSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver)
        with LogCapture() as log:
            yield self.runner.join()
        self.assertIn("Got response 200", str(log))

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse(self):
        # Callback awaiting asyncio code requires the asyncio reactor.
        runner = CrawlerRunner({
            "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
        })
        runner.crawl(AsyncDefAsyncioSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver)
        with LogCapture() as log:
            yield runner.join()
        self.assertIn("Got response 200", str(log))

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse_items_list(self):
        # async def parse returning a list of items.
        log, items, _ = yield self._run_spider(AsyncDefAsyncioReturnSpider)
        self.assertIn("Got response 200", str(log))
        self.assertIn({'id': 1}, items)
        self.assertIn({'id': 2}, items)

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse_items_single_element(self):
        # async def parse returning a single item (not wrapped in an iterable).
        items = []

        def _on_item_scraped(item):
            items.append(item)

        crawler = self.runner.create_crawler(AsyncDefAsyncioReturnSingleElementSpider)
        crawler.signals.connect(_on_item_scraped, signals.item_scraped)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200"), mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))
        self.assertIn({"foo": 42}, items)

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncgen_parse(self):
        # async generator callback yielding a single item.
        log, _, stats = yield self._run_spider(AsyncDefAsyncioGenSpider)
        self.assertIn("Got response 200", str(log))
        itemcount = stats.get_value('item_scraped_count')
        self.assertEqual(itemcount, 1)

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncgen_parse_loop(self):
        # async generator callback yielding items from a loop.
        log, items, stats = yield self._run_spider(AsyncDefAsyncioGenLoopSpider)
        self.assertIn("Got response 200", str(log))
        itemcount = stats.get_value('item_scraped_count')
        self.assertEqual(itemcount, 10)
        for i in range(10):
            self.assertIn({'foo': i}, items)

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncgen_parse_complex(self):
        # Mixed yields/awaits; the spider is expected to produce 156 items total.
        _, items, stats = yield self._run_spider(AsyncDefAsyncioGenComplexSpider)
        itemcount = stats.get_value('item_scraped_count')
        self.assertEqual(itemcount, 156)
        # some random items
        for i in [1, 4, 21, 22, 207, 311]:
            self.assertIn({'index': i}, items)
        for i in [10, 30, 122]:
            self.assertIn({'index2': i}, items)

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse_reqs_list(self):
        # async def parse returning a list of follow-up requests.
        log, *_ = yield self._run_spider(AsyncDefAsyncioReqsReturnSpider)
        for req_id in range(3):
            self.assertIn(f"Got response 200, req_id {req_id}", str(log))

    @defer.inlineCallbacks
    def test_response_ssl_certificate_none(self):
        # Plain HTTP responses expose no TLS certificate.
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url("/echo?body=test", is_secure=False)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        self.assertIsNone(crawler.spider.meta['responses'][0].certificate)

    @defer.inlineCallbacks
    def test_response_ssl_certificate(self):
        # HTTPS responses carry the server certificate (MockServer's self-signed cert).
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url("/echo?body=test", is_secure=True)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        cert = crawler.spider.meta['responses'][0].certificate
        self.assertIsInstance(cert, Certificate)
        self.assertEqual(cert.getSubject().commonName, b"localhost")
        self.assertEqual(cert.getIssuer().commonName, b"localhost")

    @mark.xfail(
        reason="Responses with no body return early and contain no certificate"
    )
    @defer.inlineCallbacks
    def test_response_ssl_certificate_empty_response(self):
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url("/status?n=200", is_secure=True)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        cert = crawler.spider.meta['responses'][0].certificate
        self.assertIsInstance(cert, Certificate)
        self.assertEqual(cert.getSubject().commonName, b"localhost")
        self.assertEqual(cert.getIssuer().commonName, b"localhost")

    @defer.inlineCallbacks
    def test_dns_server_ip_address_none(self):
        # /status responses have no body, so no IP address is recorded.
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url('/status?n=200')
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        ip_address = crawler.spider.meta['responses'][0].ip_address
        self.assertIsNone(ip_address)

    @defer.inlineCallbacks
    def test_dns_server_ip_address(self):
        # The recorded ip_address must match what the resolver returns for the host.
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url('/echo?body=test')
        expected_netloc, _ = urlparse(url).netloc.split(':')
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        ip_address = crawler.spider.meta['responses'][0].ip_address
        self.assertIsInstance(ip_address, IPv4Address)
        self.assertEqual(str(ip_address), gethostbyname(expected_netloc))

    @defer.inlineCallbacks
    def test_bytes_received_stop_download_callback(self):
        # Raising StopDownload from a bytes_received handler yields a truncated
        # response to the callback (no failure).
        crawler = self.runner.create_crawler(BytesReceivedCallbackSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertIsNone(crawler.spider.meta.get("failure"))
        self.assertIsInstance(crawler.spider.meta["response"], Response)
        self.assertEqual(crawler.spider.meta["response"].body, crawler.spider.meta.get("bytes_received"))
        self.assertLess(len(crawler.spider.meta["response"].body), crawler.spider.full_response_length)

    @defer.inlineCallbacks
    def test_bytes_received_stop_download_errback(self):
        # StopDownload(fail=True) routes the truncated response to the errback
        # wrapped in a Failure.
        crawler = self.runner.create_crawler(BytesReceivedErrbackSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertIsNone(crawler.spider.meta.get("response"))
        self.assertIsInstance(crawler.spider.meta["failure"], Failure)
        self.assertIsInstance(crawler.spider.meta["failure"].value, StopDownload)
        self.assertIsInstance(crawler.spider.meta["failure"].value.response, Response)
        self.assertEqual(crawler.spider.meta["failure"].value.response.body, crawler.spider.meta.get("bytes_received"))
        self.assertLess(len(crawler.spider.meta["failure"].value.response.body), crawler.spider.full_response_length)

    @defer.inlineCallbacks
    def test_headers_received_stop_download_callback(self):
        # StopDownload from a headers_received handler: callback gets a response
        # whose headers match those from the signal.
        crawler = self.runner.create_crawler(HeadersReceivedCallbackSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertIsNone(crawler.spider.meta.get("failure"))
        self.assertIsInstance(crawler.spider.meta["response"], Response)
        self.assertEqual(crawler.spider.meta["response"].headers, crawler.spider.meta.get("headers_received"))

    @defer.inlineCallbacks
    def test_headers_received_stop_download_errback(self):
        # Same as above but the spider requests failure routing to the errback.
        crawler = self.runner.create_crawler(HeadersReceivedErrbackSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertIsNone(crawler.spider.meta.get("response"))
        self.assertIsInstance(crawler.spider.meta["failure"], Failure)
        self.assertIsInstance(crawler.spider.meta["failure"].value, StopDownload)
        self.assertIsInstance(crawler.spider.meta["failure"].value.response, Response)
        self.assertEqual(crawler.spider.meta["failure"].value.response.headers, crawler.spider.meta.get("headers_received"))
class FileDownloadCrawlTestCase(TestCase):
    """End-to-end tests for FilesPipeline: crawl a MockServer page, download the
    linked media into a temporary store, and verify items, stats, and checksums.

    Subclasses can override the class attributes below to reuse the same tests
    for ImagesPipeline (different pipeline class, settings key, and item fields).
    """

    pipeline_class = 'scrapy.pipelines.files.FilesPipeline'
    store_setting_key = 'FILES_STORE'
    media_key = 'files'
    media_urls_key = 'file_urls'
    # MD5 checksums of the three fixture files served by MockServer;
    # None disables the checksum assertion in _assert_files_downloaded().
    expected_checksums = {
        '5547178b89448faf0015a13f904c936e',
        'c2281c83670e31d8aaab7cb642b824db',
        'ed3f6538dc15d4d9179dae57319edc5f',
    }

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        # prepare a directory for storing files
        self.tmpmediastore = self.mktemp()
        os.mkdir(self.tmpmediastore)
        self.settings = {
            'ITEM_PIPELINES': {self.pipeline_class: 1},
            self.store_setting_key: self.tmpmediastore,
        }
        self.runner = CrawlerRunner(self.settings)
        self.items = []

    def tearDown(self):
        shutil.rmtree(self.tmpmediastore)
        self.items = []
        self.mockserver.__exit__(None, None, None)

    def _on_item_scraped(self, item):
        self.items.append(item)

    def _create_crawler(self, spider_class, **kwargs):
        """Build a crawler for spider_class and collect its scraped items in self.items."""
        crawler = self.runner.create_crawler(spider_class, **kwargs)
        crawler.signals.connect(self._on_item_scraped, signals.item_scraped)
        return crawler

    def _assert_files_downloaded(self, items, logs):
        """Assert one item with 3 successfully downloaded files, correct checksums,
        and the files actually present in the media store."""
        self.assertEqual(len(items), 1)
        self.assertIn(self.media_key, items[0])
        # check that logs show the expected number of successful file downloads
        file_dl_success = 'File (downloaded): Downloaded file from'
        self.assertEqual(logs.count(file_dl_success), 3)
        # check that the images/files checksums are what we know they should be
        if self.expected_checksums is not None:
            checksums = {
                i['checksum']
                for item in items
                for i in item[self.media_key]
            }
            self.assertEqual(checksums, self.expected_checksums)
        # check that the image files where actually written to the media store
        for item in items:
            for i in item[self.media_key]:
                self.assertTrue(
                    os.path.exists(
                        os.path.join(self.tmpmediastore, i['path'])))

    def _assert_files_download_failure(self, crawler, items, code, logs):
        """Assert all 3 file downloads failed with the given HTTP code and nothing
        was written to the media store."""
        # check that the item does NOT have the "images/files" field populated
        self.assertEqual(len(items), 1)
        self.assertIn(self.media_key, items[0])
        self.assertFalse(items[0][self.media_key])
        # check that there was 1 successful fetch and 3 other responses with non-200 code
        self.assertEqual(crawler.stats.get_value('downloader/request_method_count/GET'), 4)
        self.assertEqual(crawler.stats.get_value('downloader/response_count'), 4)
        self.assertEqual(crawler.stats.get_value('downloader/response_status_count/200'), 1)
        self.assertEqual(crawler.stats.get_value('downloader/response_status_count/%d' % code), 3)
        # check that logs do show the failure on the file downloads
        file_dl_failure = 'File (code: %d): Error downloading file from' % code
        self.assertEqual(logs.count(file_dl_failure), 3)
        # check that no files were written to the media store
        self.assertEqual(os.listdir(self.tmpmediastore), [])

    @defer.inlineCallbacks
    def test_download_media(self):
        crawler = self._create_crawler(MediaDownloadSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/files/images/"),
                                media_key=self.media_key,
                                media_urls_key=self.media_urls_key)
        self._assert_files_downloaded(self.items, str(log))

    @defer.inlineCallbacks
    def test_download_media_wrong_urls(self):
        crawler = self._create_crawler(BrokenLinksMediaDownloadSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/files/images/"),
                                media_key=self.media_key,
                                media_urls_key=self.media_urls_key)
        self._assert_files_download_failure(crawler, self.items, 404, str(log))

    @defer.inlineCallbacks
    def test_download_media_redirected_default_failure(self):
        # By default the media pipeline refuses redirected downloads.
        crawler = self._create_crawler(RedirectedMediaDownloadSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/files/images/"),
                                media_key=self.media_key,
                                media_urls_key=self.media_urls_key,
                                mockserver=self.mockserver)
        self._assert_files_download_failure(crawler, self.items, 302, str(log))

    @defer.inlineCallbacks
    def test_download_media_redirected_allowed(self):
        # MEDIA_ALLOW_REDIRECTS=True makes the same redirected downloads succeed.
        settings = dict(self.settings)
        settings.update({'MEDIA_ALLOW_REDIRECTS': True})
        self.runner = CrawlerRunner(settings)
        crawler = self._create_crawler(RedirectedMediaDownloadSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/files/images/"),
                                media_key=self.media_key,
                                media_urls_key=self.media_urls_key,
                                mockserver=self.mockserver)
        self._assert_files_downloaded(self.items, str(log))
        self.assertEqual(crawler.stats.get_value('downloader/response_status_count/302'), 3)
class ProxyConnectTestCase(TestCase):
    """HTTPS-over-proxy tests using MitmProxy: CONNECT tunnelling, tunnel errors,
    proxy auth failures, and Proxy-Authorization header hygiene.

    setUp points both https_proxy and http_proxy env vars at a local mitmproxy;
    tearDown restores the original environment.
    """

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self._oldenv = os.environ.copy()
        self._proxy = MitmProxy()
        proxy_url = self._proxy.start()
        os.environ['https_proxy'] = proxy_url
        os.environ['http_proxy'] = proxy_url

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)
        self._proxy.stop()
        os.environ = self._oldenv

    @defer.inlineCallbacks
    def test_https_connect_tunnel(self):
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200", is_secure=True))
        self._assert_got_response_code(200, log)

    # FIX: the condition compared only sys.version_info.minor to 6, which is wrong
    # for any non-3.x interpreter (e.g. 2.7 or 4.0 would match/mismatch incorrectly);
    # compare the full version tuple instead.
    @pytest.mark.xfail(reason='Python 3.6+ fails this earlier',
                       condition=sys.version_info >= (3, 6))
    @defer.inlineCallbacks
    def test_https_connect_tunnel_error(self):
        # Port 99999 is invalid, so establishing the tunnel must fail.
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl("https://localhost:99999/status?n=200")
        self._assert_got_tunnel_error(log)

    @defer.inlineCallbacks
    def test_https_tunnel_auth_error(self):
        os.environ['https_proxy'] = _wrong_credentials(os.environ['https_proxy'])
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200", is_secure=True))
        # The proxy returns a 407 error code but it does not reach the client;
        # he just sees a TunnelError.
        self._assert_got_tunnel_error(log)

    @defer.inlineCallbacks
    def test_https_tunnel_without_leak_proxy_authorization_header(self):
        request = Request(self.mockserver.url("/echo", is_secure=True))
        crawler = get_crawler(SingleRequestSpider)
        with LogCapture() as log:
            yield crawler.crawl(seed=request)
        self._assert_got_response_code(200, log)
        echo = json.loads(crawler.spider.meta['responses'][0].text)
        # The credentials must be consumed by the CONNECT handshake, never
        # forwarded to the origin server.
        self.assertNotIn('Proxy-Authorization', echo['headers'])

    # The noconnect mode isn't supported by the current mitmproxy, it returns
    # "Invalid request scheme: https" as it doesn't seem to support full URLs in GET at all,
    # and it's not clear what behavior is intended by Scrapy and by mitmproxy here.
    # https://github.com/mitmproxy/mitmproxy/issues/848 may be related.
    # The Scrapy noconnect mode was required, at least in the past, to work with Crawlera,
    # and https://github.com/scrapy-plugins/scrapy-crawlera/pull/44 seems to be related.
    @pytest.mark.xfail(reason='mitmproxy gives an error for noconnect requests')
    @defer.inlineCallbacks
    def test_https_noconnect(self):
        proxy = os.environ['https_proxy']
        os.environ['https_proxy'] = proxy + '?noconnect'
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200", is_secure=True))
        self._assert_got_response_code(200, log)

    @pytest.mark.xfail(reason='mitmproxy gives an error for noconnect requests')
    @defer.inlineCallbacks
    def test_https_noconnect_auth_error(self):
        # Without a tunnel the proxy's 407 reaches the client directly.
        os.environ['https_proxy'] = _wrong_credentials(os.environ['https_proxy']) + '?noconnect'
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200", is_secure=True))
        self._assert_got_response_code(407, log)

    def _assert_got_response_code(self, code, log):
        """Assert exactly one page was crawled with the given HTTP status."""
        print(log)
        self.assertEqual(str(log).count('Crawled (%d)' % code), 1)

    def _assert_got_tunnel_error(self, log):
        """Assert the captured log contains a TunnelError."""
        print(log)
        self.assertIn('TunnelError', str(log))
class CrawlTestCase(TestCase):
    """Tests for how responses/failures reach the spider, and how downloader
    middlewares can override the ``request`` attribute of a response."""

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_response_200(self):
        url = self.mockserver.url("/status?n=200")
        crawler = CrawlerRunner().create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        response = crawler.spider.meta["responses"][0]
        self.assertEqual(response.request.url, url)

    @defer.inlineCallbacks
    def test_response_error(self):
        # HttpErrorMiddleware turns 404/500 into a failure whose value carries
        # the response; both must point back at the originating request.
        for status in ("404", "500"):
            url = self.mockserver.url(f"/status?n={status}")
            crawler = CrawlerRunner().create_crawler(SingleRequestSpider)
            yield crawler.crawl(seed=url, mockserver=self.mockserver)
            failure = crawler.spider.meta["failure"]
            response = failure.value.response
            self.assertEqual(failure.request.url, url)
            self.assertEqual(response.request.url, url)

    @defer.inlineCallbacks
    def test_downloader_middleware_raise_exception(self):
        # An exception raised in a downloader middleware reaches the spider's
        # errback as the failure value.
        url = self.mockserver.url("/status?n=200")
        runner = CrawlerRunner(settings={
            "DOWNLOADER_MIDDLEWARES": {
                RaiseExceptionRequestMiddleware: 590,
            },
        })
        crawler = runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        failure = crawler.spider.meta["failure"]
        self.assertEqual(failure.request.url, url)
        self.assertIsInstance(failure.value, ZeroDivisionError)

    @defer.inlineCallbacks
    def test_downloader_middleware_override_request_in_process_response(self):
        """
        Downloader middleware which returns a response with a specific 'request' attribute.

        * The spider callback should receive the overridden response.request
        * Handlers listening to the response_received signal should receive the overridden response.request
        * The "crawled" log message should show the overridden response.request
        """
        signal_params = {}

        def signal_handler(response, request, spider):
            signal_params["response"] = response
            signal_params["request"] = request

        url = self.mockserver.url("/status?n=200")
        runner = CrawlerRunner(settings={
            "DOWNLOADER_MIDDLEWARES": {
                ProcessResponseMiddleware: 595,
            }
        })
        crawler = runner.create_crawler(SingleRequestSpider)
        crawler.signals.connect(signal_handler, signal=signals.response_received)
        with LogCapture() as log:
            yield crawler.crawl(seed=url, mockserver=self.mockserver)
        response = crawler.spider.meta["responses"][0]
        self.assertEqual(response.request.url, OVERRIDEN_URL)
        # The signal's `response` still has the original url; only its request
        # was replaced by the middleware.
        self.assertEqual(signal_params["response"].url, url)
        self.assertEqual(signal_params["request"].url, OVERRIDEN_URL)
        log.check_present(
            ("scrapy.core.engine", "DEBUG", f"Crawled (200) <GET {OVERRIDEN_URL}> (referer: None)"),
        )

    @defer.inlineCallbacks
    def test_downloader_middleware_override_in_process_exception(self):
        """
        An exception is raised but caught by the next middleware, which
        returns a Response with a specific 'request' attribute.

        The spider callback should receive the overridden response.request
        """
        url = self.mockserver.url("/status?n=200")
        runner = CrawlerRunner(
            settings={
                "DOWNLOADER_MIDDLEWARES": {
                    RaiseExceptionRequestMiddleware: 590,
                    CatchExceptionOverrideRequestMiddleware: 595,
                },
            })
        crawler = runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        response = crawler.spider.meta["responses"][0]
        self.assertEqual(response.body, b"Caught ZeroDivisionError")
        self.assertEqual(response.request.url, OVERRIDEN_URL)

    @defer.inlineCallbacks
    def test_downloader_middleware_do_not_override_in_process_exception(self):
        """
        An exception is raised but caught by the next middleware, which
        returns a Response without a specific 'request' attribute.

        The spider callback should receive the original response.request
        """
        url = self.mockserver.url("/status?n=200")
        runner = CrawlerRunner(
            settings={
                "DOWNLOADER_MIDDLEWARES": {
                    RaiseExceptionRequestMiddleware: 590,
                    CatchExceptionDoNotOverrideRequestMiddleware: 595,
                },
            })
        crawler = runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        response = crawler.spider.meta["responses"][0]
        self.assertEqual(response.body, b"Caught ZeroDivisionError")
        self.assertEqual(response.request.url, url)

    @defer.inlineCallbacks
    def test_downloader_middleware_alternative_callback(self):
        """
        Downloader middleware which returns a response with a
        specific 'request' attribute, with an alternative callback
        """
        runner = CrawlerRunner(settings={
            "DOWNLOADER_MIDDLEWARES": {
                AlternativeCallbacksMiddleware: 595,
            }
        })
        crawler = runner.create_crawler(AlternativeCallbacksSpider)
        with LogCapture() as log:
            url = self.mockserver.url("/status?n=200")
            yield crawler.crawl(seed=url, mockserver=self.mockserver)
        log.check_present(("alternative_callbacks_spider", "INFO", "alt_callback was invoked with foo=bar"), )
class ProxyConnectTestCase(TestCase):
    """HTTPS-over-proxy tests using a local HTTPSProxy process: CONNECT tunnel,
    noconnect mode, tunnel errors, auth failures, and header leakage.

    NOTE(review): this class shadows an earlier ``ProxyConnectTestCase``
    definition in the same module; only the last definition is collected.
    """

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        # Saved so tearDown can fully restore the proxy-related env vars.
        self._oldenv = os.environ.copy()
        self._proxy = HTTPSProxy()
        self._proxy.start()
        # Wait for the proxy to start.
        time.sleep(1.0)
        os.environ['https_proxy'] = self._proxy.http_address()
        os.environ['http_proxy'] = self._proxy.http_address()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)
        self._proxy.shutdown()
        os.environ = self._oldenv

    @defer.inlineCallbacks
    def test_https_connect_tunnel(self):
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/status?n=200", is_secure=True))
        self._assert_got_response_code(200, l)

    @defer.inlineCallbacks
    def test_https_noconnect(self):
        # '?noconnect' asks Scrapy to skip the CONNECT tunnel and proxy HTTPS
        # requests directly.
        proxy = os.environ['https_proxy']
        os.environ['https_proxy'] = proxy + '?noconnect'
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/status?n=200", is_secure=True))
        self._assert_got_response_code(200, l)

    @defer.inlineCallbacks
    def test_https_connect_tunnel_error(self):
        # Invalid port: the tunnel cannot be established.
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl("https://localhost:99999/status?n=200")
        self._assert_got_tunnel_error(l)

    @defer.inlineCallbacks
    def test_https_tunnel_auth_error(self):
        os.environ['https_proxy'] = _wrong_credentials(
            os.environ['https_proxy'])
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(
                self.mockserver.url("/status?n=200", is_secure=True))
        # The proxy returns a 407 error code but it does not reach the client;
        # he just sees a TunnelError.
        self._assert_got_tunnel_error(l)

    @defer.inlineCallbacks
    def test_https_tunnel_without_leak_proxy_authorization_header(self):
        # Proxy credentials must not be forwarded to the origin server.
        request = Request(self.mockserver.url("/echo", is_secure=True))
        crawler = get_crawler(SingleRequestSpider)
        with LogCapture() as l:
            yield crawler.crawl(seed=request)
        self._assert_got_response_code(200, l)
        echo = json.loads(crawler.spider.meta['responses'][0].body)
        self.assertTrue('Proxy-Authorization' not in echo['headers'])

    @defer.inlineCallbacks
    def test_https_noconnect_auth_error(self):
        # Without a tunnel, the proxy's 407 response reaches the client directly.
        os.environ['https_proxy'] = _wrong_credentials(
            os.environ['https_proxy']) + '?noconnect'
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(
                self.mockserver.url("/status?n=200", is_secure=True))
        self._assert_got_response_code(407, l)

    def _assert_got_response_code(self, code, log):
        # Exactly one page crawled with the given HTTP status.
        print(log)
        self.assertEqual(str(log).count('Crawled (%d)' % code), 1)

    def _assert_got_tunnel_error(self, log):
        # The failure surfaces to the client as a TunnelError in the log.
        print(log)
        self.assertIn('TunnelError', str(log))
class ProxyConnectTestCase(TestCase):
    """Exercise HTTPS crawling through a local HTTPSProxy: CONNECT tunnelling,
    the '?noconnect' mode, tunnel failures, bad proxy credentials, and making
    sure proxy credentials never leak to the origin server.
    """

    def setUp(self):
        """Start the mock origin server and the proxy; route traffic via env vars."""
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self._oldenv = os.environ.copy()
        self._proxy = HTTPSProxy()
        self._proxy.start()
        time.sleep(1.0)  # give the proxy process a moment to come up
        for env_var in ('https_proxy', 'http_proxy'):
            os.environ[env_var] = self._proxy.http_address()

    def tearDown(self):
        """Shut everything down and restore the original environment."""
        self.mockserver.__exit__(None, None, None)
        self._proxy.shutdown()
        os.environ = self._oldenv

    @defer.inlineCallbacks
    def test_https_connect_tunnel(self):
        """A plain HTTPS request goes through the CONNECT tunnel and succeeds."""
        crawler = get_crawler(SimpleSpider)
        target = self.mockserver.url("/status?n=200", is_secure=True)
        with LogCapture() as captured:
            yield crawler.crawl(target)
        self._assert_got_response_code(200, captured)

    @defer.inlineCallbacks
    def test_https_noconnect(self):
        """The '?noconnect' proxy mode also succeeds for HTTPS."""
        os.environ['https_proxy'] = os.environ['https_proxy'] + '?noconnect'
        crawler = get_crawler(SimpleSpider)
        target = self.mockserver.url("/status?n=200", is_secure=True)
        with LogCapture() as captured:
            yield crawler.crawl(target)
        self._assert_got_response_code(200, captured)

    @defer.inlineCallbacks
    def test_https_connect_tunnel_error(self):
        """An unreachable port means the tunnel cannot be established."""
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as captured:
            yield crawler.crawl("https://localhost:99999/status?n=200")
        self._assert_got_tunnel_error(captured)

    @defer.inlineCallbacks
    def test_https_tunnel_auth_error(self):
        """Bad proxy credentials: the proxy's 407 never reaches the client, which
        only observes a TunnelError."""
        os.environ['https_proxy'] = _wrong_credentials(os.environ['https_proxy'])
        crawler = get_crawler(SimpleSpider)
        target = self.mockserver.url("/status?n=200", is_secure=True)
        with LogCapture() as captured:
            yield crawler.crawl(target)
        self._assert_got_tunnel_error(captured)

    @defer.inlineCallbacks
    def test_https_tunnel_without_leak_proxy_authorization_header(self):
        """Proxy credentials are consumed by the tunnel handshake and must not be
        forwarded to the origin server."""
        seed = Request(self.mockserver.url("/echo", is_secure=True))
        crawler = get_crawler(SingleRequestSpider)
        with LogCapture() as captured:
            yield crawler.crawl(seed=seed)
        self._assert_got_response_code(200, captured)
        echoed = json.loads(crawler.spider.meta['responses'][0].body)
        self.assertTrue('Proxy-Authorization' not in echoed['headers'])

    @defer.inlineCallbacks
    def test_https_noconnect_auth_error(self):
        """Without a tunnel, bad credentials produce a visible 407 response."""
        bad_proxy = _wrong_credentials(os.environ['https_proxy']) + '?noconnect'
        os.environ['https_proxy'] = bad_proxy
        crawler = get_crawler(SimpleSpider)
        target = self.mockserver.url("/status?n=200", is_secure=True)
        with LogCapture() as captured:
            yield crawler.crawl(target)
        self._assert_got_response_code(407, captured)

    def _assert_got_response_code(self, code, log):
        """Exactly one page must have been crawled with the given status code."""
        print(log)
        self.assertEqual(str(log).count('Crawled (%d)' % code), 1)

    def _assert_got_tunnel_error(self, log):
        """The captured log must mention a TunnelError."""
        print(log)
        self.assertIn('TunnelError', str(log))