def test_start_requests_dupes(self):
    settings = {"CONCURRENT_REQUESTS": 1}
    spider = DuplicateStartRequestsSpider(dont_filter=True,
                                          distinct_urls=2,
                                          dupe_factor=3)
    yield docrawl(spider, settings)
    self.assertEqual(spider.visited, 6)

    spider = DuplicateStartRequestsSpider(dont_filter=False,
                                          distinct_urls=3,
                                          dupe_factor=4)
    yield docrawl(spider, settings)
    self.assertEqual(spider.visited, 3)

def test_timeout_failure(self):
    spider = DelaySpider(n=0.5)
    yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
    self.assertTrue(spider.t1 > 0)
    self.assertTrue(spider.t2 == 0)
    self.assertTrue(spider.t2_err > 0)
    self.assertTrue(spider.t2_err > spider.t1)
    # server hangs after receiving response headers
    spider = DelaySpider(n=0.5, b=1)
    yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
    self.assertTrue(spider.t1 > 0)
    self.assertTrue(spider.t2 == 0)
    self.assertTrue(spider.t2_err > 0)
    self.assertTrue(spider.t2_err > spider.t1)

def test_https_noconnect(self):
    os.environ['https_proxy'] = 'http://*****:*****@localhost:8888?noconnect'
    spider = SimpleSpider("https://localhost:8999/status?n=200")
    yield docrawl(spider)
    self._assert_got_response_code(200)

def test_https_tunnel_without_leak_proxy_authorization_header(self):
    request = Request("https://localhost:8999/echo")
    spider = SingleRequestSpider(seed=request)
    yield docrawl(spider)
    self._assert_got_response_code(200)
    echo = json.loads(spider.meta['responses'][0].body)
    self.assertTrue('Proxy-Authorization' not in echo['headers'])

def test_referer_header(self):
    """Referer header is set by RefererMiddleware unless it is already set"""
    req0 = Request('http://localhost:8998/echo?headers=1&body=0', dont_filter=1)
    req1 = req0.replace()
    req2 = req0.replace(headers={'Referer': None})
    req3 = req0.replace(headers={'Referer': 'http://example.com'})
    req0.meta['next'] = req1
    req1.meta['next'] = req2
    req2.meta['next'] = req3
    spider = SingleRequestSpider(seed=req0)
    yield docrawl(spider)
    # basic asserts in case of weird communication errors
    self.assertIn('responses', spider.meta)
    self.assertNotIn('failures', spider.meta)
    # start requests doesn't set Referer header
    echo0 = json.loads(spider.meta['responses'][0].body)
    self.assertNotIn('Referer', echo0['headers'])
    # following request sets Referer to start request url
    echo1 = json.loads(spider.meta['responses'][1].body)
    self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
    # next request avoids Referer header
    echo2 = json.loads(spider.meta['responses'][2].body)
    self.assertNotIn('Referer', echo2['headers'])
    # last request explicitly sets a Referer header
    echo3 = json.loads(spider.meta['responses'][3].body)
    self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])

def test_unbounded_response(self):
    # Completeness of responses without Content-Length or Transfer-Encoding
    # can not be determined, we treat them as valid but flagged as "partial"
    from urllib import urlencode
    query = urlencode({'raw': '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''})
    spider = SimpleSpider("http://localhost:8998/raw?{0}".format(query))
    yield docrawl(spider)
    log = get_testlog()
    self.assertEqual(log.count("Got response 200"), 1)

def test_retry_dns_error(self):
    with mock.patch('socket.gethostbyname',
                    side_effect=socket.gaierror(-5, 'No address associated with hostname')):
        spider = SimpleSpider("http://example.com/")
        yield docrawl(spider)
        self._assert_retried()

def test_https_tunnel_auth_error(self):
    os.environ['https_proxy'] = 'http://*****:*****@localhost:8888'
    spider = SimpleSpider("https://localhost:8999/status?n=200")
    yield docrawl(spider)
    self._assert_got_tunnel_error()

def test_closespider_pagecount(self):
    spider = FollowAllSpider()
    close_on = 5
    yield docrawl(spider, {'CLOSESPIDER_PAGECOUNT': close_on})
    reason = spider.meta['close_reason']
    self.assertEqual(reason, 'closespider_pagecount')
    pagecount = spider.crawler.stats.get_value('response_received_count')
    self.assertTrue(pagecount >= close_on)

def test_closespider_itemcount(self):
    spider = ItemSpider()
    close_on = 5
    yield docrawl(spider, {'CLOSESPIDER_ITEMCOUNT': close_on})
    reason = spider.meta['close_reason']
    self.assertEqual(reason, 'closespider_itemcount')
    itemcount = spider.crawler.stats.get_value('item_scraped_count')
    self.assertTrue(itemcount >= close_on)

def _test_delay(self, delay, randomize):
    settings = {'DOWNLOAD_DELAY': delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
    spider = FollowAllSpider(maxlatency=delay * 2)
    yield docrawl(spider, settings)
    t = spider.times
    totaltime = t[-1] - t[0]
    avgd = totaltime / (len(t) - 1)
    tolerance = 0.6 if randomize else 0.2
    self.assertTrue(avgd > delay * (1 - tolerance),
                    "download delay too small: %s" % avgd)

def test_closespider_errorcount(self):
    spider = ErrorSpider(total=1000000)
    close_on = 5
    yield docrawl(spider, {'CLOSESPIDER_ERRORCOUNT': close_on})
    self.flushLoggedErrors(spider.exception_cls)
    reason = spider.meta['close_reason']
    self.assertEqual(reason, 'closespider_errorcount')
    key = 'spider_exceptions/{name}'.format(name=spider.exception_cls.__name__)
    errorcount = spider.crawler.stats.get_value(key)
    self.assertTrue(errorcount >= close_on)

def test_closespider_timeout(self):
    spider = FollowAllSpider(total=1000000)
    close_on = 0.1
    yield docrawl(spider, {'CLOSESPIDER_TIMEOUT': close_on})
    reason = spider.meta['close_reason']
    self.assertEqual(reason, 'closespider_timeout')
    stats = spider.crawler.stats
    start = stats.get_value('start_time')
    stop = stats.get_value('finish_time')
    diff = stop - start
    total_seconds = diff.total_seconds()
    self.assertTrue(total_seconds >= close_on)

def test_logging(self):
    spider = _HttpErrorSpider(bypass_status_codes={402})
    yield docrawl(spider)
    # print(get_testlog())
    self.assertEqual(spider.parsed, {'200', '402'})
    self.assertEqual(spider.skipped, {'402'})
    self.assertEqual(spider.failed, {'404', '500'})

    log = get_testlog()
    self.assertIn('Ignoring response <404', log)
    self.assertIn('Ignoring response <500', log)
    self.assertNotIn('Ignoring response <200', log)
    self.assertNotIn('Ignoring response <402', log)

def test_engine_status(self):
    from scrapy.utils.engine import get_engine_status
    est = []

    def cb(response):
        est.append(get_engine_status(spider.crawler.engine))

    spider = SingleRequestSpider(seed='http://localhost:8998/', callback_func=cb)
    yield docrawl(spider)
    self.assertEqual(len(est), 1, est)
    s = dict(est[0])
    self.assertEqual(s['engine.spider.name'], spider.name)
    self.assertEqual(s['len(engine.scraper.slot.active)'], 1)

def test_start_requests_lazyness(self):
    settings = {"CONCURRENT_REQUESTS": 1}
    spider = BrokenStartRequestsSpider()
    yield docrawl(spider, settings)

def test_follow_all(self):
    spider = FollowAllSpider()
    yield docrawl(spider)
    self.assertEqual(len(spider.urls_visited), 11)  # 10 + start_url

def test_https_connect_tunnel(self):
    spider = SimpleSpider("https://localhost:8999/status?n=200")
    yield docrawl(spider)
    self._assert_got_response_code(200)

def test_https_noconnect_auth_error(self):
    os.environ['https_proxy'] = 'http://*****:*****@localhost:8888?noconnect'
    spider = SimpleSpider("https://localhost:8999/status?n=200")
    yield docrawl(spider)
    self._assert_got_response_code(407)

def test_retry_503(self):
    spider = SimpleSpider("http://localhost:8998/status?n=503")
    yield docrawl(spider)
    self._assert_retried()

def test_timeout_success(self):
    spider = DelaySpider(n=0.5)
    yield docrawl(spider)
    self.assertTrue(spider.t1 > 0)
    self.assertTrue(spider.t2 > 0)
    self.assertTrue(spider.t2 > spider.t1)

def test_middleware_works(self):
    spider = _HttpErrorSpider()
    yield docrawl(spider)
    assert not spider.skipped, spider.skipped
    self.assertEqual(spider.parsed, {'200'})
    self.assertEqual(spider.failed, {'404', '402', '500'})

def test_retry_conn_failed(self):
    spider = SimpleSpider("http://localhost:65432/status?n=503")
    yield docrawl(spider)
    self._assert_retried()

def test_https_connect_tunnel_error(self):
    spider = SimpleSpider("https://localhost:99999/status?n=200")
    yield docrawl(spider)
    self._assert_got_tunnel_error()

def test_retry_conn_aborted(self):
    # connection lost before receiving data
    spider = SimpleSpider("http://localhost:8998/drop?abort=1")
    yield docrawl(spider)
    self._assert_retried()

def test_start_requests_bug_before_yield(self):
    spider = BrokenStartRequestsSpider(fail_before_yield=1)
    yield docrawl(spider)
    errors = self.flushLoggedErrors(ZeroDivisionError)
    self.assertEqual(len(errors), 1)