def _retry(self, request, reason, spider): retries = request.meta.get('retry_times', 0) + 1 stats = spider.crawler.stats if retries <= self.max_retry_times: spider.logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s", {'request': request, 'retries': retries, 'reason': reason}, extra={'spider': spider}) retryreq = request.copy() retryreq.meta['retry_times'] = retries retryreq.dont_filter = True retryreq.priority = request.priority + self.priority_adjust if isinstance(reason, Exception): reason = global_object_name(reason.__class__) stats.inc_value('retry/count') stats.inc_value('retry/reason_count/%s' % reason) return retryreq else: stats.inc_value('retry/max_reached') spider.logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s", {'request': request, 'retries': retries, 'reason': reason}, extra={'spider': spider}) response = Response('') response.replace(body="") response.status = 12138 return response
def test_request_cacheability(self):
    """Check how request ``Cache-Control`` directives affect the cache.

    A request sent with ``no-store`` must leave nothing in the storage; a
    request sent with ``no-cache`` must skip the cached copy, yet the new
    response it receives is stored and served to later plain requests.
    """
    cacheable = Response(
        self.request.url, status=200, headers={'Expires': self.tomorrow})
    plain_request = Request('http://example.com')
    no_store_request = plain_request.replace(
        headers={'Cache-Control': 'no-store'})
    no_cache_request = plain_request.replace(
        headers={'Cache-Control': 'no-cache'})

    with self._middleware() as mw:
        # no-store on the request: the response passes through unstored.
        passthrough = self._process_requestresponse(
            mw, no_store_request, cacheable)
        self.assertEqualResponse(passthrough, cacheable)
        stored = mw.storage.retrieve_response(self.spider, no_store_request)
        assert stored is None

        # The same exchange without no-store is a first-hand response ...
        first_hand = self._process_requestresponse(
            mw, plain_request, cacheable)
        assert 'cached' not in first_hand.flags
        # ... and the following lookup is answered from the cache.
        from_cache = mw.process_request(plain_request, self.spider)
        assert 'cached' in from_cache.flags
        self.assertEqualResponse(first_hand, from_cache)

        # no-cache on the request: the cached copy is not returned, but the
        # new response may replace it in the storage.
        updated = cacheable.replace(body=b'foo')
        refreshed = self._process_requestresponse(
            mw, no_cache_request, updated)
        self.assertEqualResponse(refreshed, updated)
        assert 'cached' not in refreshed.flags

        replayed = self._process_requestresponse(mw, plain_request, None)
        self.assertEqualResponse(replayed, updated)
        assert 'cached' in replayed.flags
def test_request_cacheability(self):
    """Verify that request ``Cache-Control`` directives are honoured.

    ``no-store`` on the request prevents the response from being stored.
    ``no-cache`` on the request prevents a cached response from being
    returned, while still allowing the new response to be stored.
    """
    res0 = Response(self.request.url, status=200,
                    headers={'Expires': self.tomorrow})
    req0 = Request('http://example.com')
    req1 = req0.replace(headers={'Cache-Control': 'no-store'})
    req2 = req0.replace(headers={'Cache-Control': 'no-cache'})
    with self._middleware() as mw:
        # response for a request with no-store must not be cached
        res1 = self._process_requestresponse(mw, req1, res0)
        self.assertEqualResponse(res1, res0)
        assert mw.storage.retrieve_response(self.spider, req1) is None
        # Re-do request without no-store and expect it to be cached
        res2 = self._process_requestresponse(mw, req0, res0)
        assert 'cached' not in res2.flags
        res3 = mw.process_request(req0, self.spider)
        assert 'cached' in res3.flags
        self.assertEqualResponse(res2, res3)
        # request with no-cache directive must not return cached response
        # but it allows new response to be stored
        # The body is a bytes literal: Response bodies must be bytes, and a
        # str body raises TypeError on Python 3.
        res0b = res0.replace(body=b'foo')
        res4 = self._process_requestresponse(mw, req2, res0b)
        self.assertEqualResponse(res4, res0b)
        assert 'cached' not in res4.flags
        res5 = self._process_requestresponse(mw, req0, None)
        self.assertEqualResponse(res5, res0b)
        assert 'cached' in res5.flags
def parse_videos(self, response: Response):
    """Yield one ``self.lomLoader.parse`` result per item of a video list.

    The JSON body of ``response`` must have ``kind`` equal to
    ``"youtube#videoListResponse"``. For each entry of its ``"items"`` list
    the response is copied with the URL returned by ``self.get_video_url``,
    the entry is stored under ``meta["item"]`` of the copy, and the copy is
    passed to the loader.
    """
    payload = json.loads(response.body)
    assert payload["kind"] == "youtube#videoListResponse"
    for video in payload["items"]:
        video_url = self.get_video_url(video)
        video_response = response.replace(url=video_url)
        video_response.meta["item"] = video
        yield self.lomLoader.parse(video_response)
def test_cached_and_stale(self):
    """Exercise the middleware with responses that are stale once cached.

    For each sample the test checks that a stale entry is not served, that
    a 304 revalidation returns the stored response when a validator
    (``ETag`` / ``Last-Modified``) exists, that a 500 falls back to the
    stored response unless ``must-revalidate`` is set, and that a
    ``max-stale`` request may receive the stale entry.
    """
    stale_samples = [
        (200, {'Date': self.today, 'Expires': self.yesterday}),
        (200, {'Date': self.today,
               'Expires': self.yesterday,
               'Last-Modified': self.yesterday}),
        (200, {'Expires': self.yesterday}),
        (200, {'Expires': self.yesterday, 'ETag': 'foo'}),
        (200, {'Expires': self.yesterday,
               'Last-Modified': self.yesterday}),
        (200, {'Expires': self.tomorrow, 'Age': '86405'}),
        (200, {'Cache-Control': 'max-age=86400', 'Age': '86405'}),
        # no-cache marks the response as expired; it is revalidated when
        # validators exist
        (200, {'Cache-Control': 'no-cache'}),
        (200, {'Cache-Control': 'no-cache', 'ETag': 'foo'}),
        (200, {'Cache-Control': 'no-cache',
               'Last-Modified': self.yesterday}),
        (200, {'Cache-Control': 'no-cache,must-revalidate',
               'Last-Modified': self.yesterday}),
        (200, {'Cache-Control': 'must-revalidate',
               'Expires': self.yesterday,
               'Last-Modified': self.yesterday}),
        (200, {'Cache-Control': 'max-age=86400,must-revalidate',
               'Age': '86405'}),
    ]
    with self._middleware() as mw:
        for position, (status, headers) in enumerate(stale_samples):
            request = Request('http://example-%d.com' % position)
            expired = Response(request.url, status=status, headers=headers)

            # First exchange: the (already expired) response is first-hand.
            stored = self._process_requestresponse(mw, request, expired)
            self.assertEqualResponse(stored, expired)
            assert 'cached' not in stored.flags

            # Second exchange: the cached copy is stale, so the new
            # response must come back instead of it.
            replacement = expired.replace(body=b'bar')
            refetched = self._process_requestresponse(
                mw, request, replacement)
            self.assertEqualResponse(refetched, replacement)
            assert 'cached' not in refetched.flags

            cache_control = headers.get('Cache-Control', '')
            if 'ETag' in headers or 'Last-Modified' in headers:
                # With validators present a 304 answer revalidates the
                # stored response, which is then returned as cached.
                not_modified = replacement.replace(status=304)
                revalidated = self._process_requestresponse(
                    mw, request, not_modified)
                self.assertEqualResponse(revalidated, replacement)
                assert 'cached' in revalidated.flags

                # A server error yields the stored response, except when
                # the stored response carries must-revalidate.
                server_error = replacement.replace(status=500)
                after_error = self._process_requestresponse(
                    mw, request, server_error)
                if 'must-revalidate' in cache_control:
                    assert 'cached' not in after_error.flags
                    self.assertEqualResponse(after_error, server_error)
                else:
                    assert 'cached' in after_error.flags
                    self.assertEqualResponse(after_error, replacement)

            # A max-stale request accepts the expired entry unless the
            # stored response has no-cache or must-revalidate.
            tolerant_request = request.replace(
                headers={'Cache-Control': 'max-stale'})
            lenient = self._process_requestresponse(
                mw, tolerant_request, replacement)
            self.assertEqualResponse(lenient, replacement)
            if 'no-cache' in cache_control or 'must-revalidate' in cache_control:
                assert 'cached' not in lenient.flags
            else:
                assert 'cached' in lenient.flags
def test_cached_and_fresh(self):
    """Verify that fresh cached responses are served from the cache.

    Each sample is cached by a first exchange, then must be returned with
    the ``cached`` flag when no network response is supplied, and must be
    revalidated (304) when the request carries ``max-age=0``.
    """
    sampledata = [
        (200, {'Date': self.yesterday, 'Expires': self.tomorrow}),
        (200, {'Date': self.yesterday, 'Cache-Control': 'max-age=86405'}),
        (200, {'Age': '299', 'Cache-Control': 'max-age=300'}),
        # Obey max-age if present over any others
        (200, {'Date': self.today,
               'Age': '86405',
               'Cache-Control': 'max-age=' + str(86400 * 3),
               'Expires': self.yesterday,
               'Last-Modified': self.yesterday,
               }),
        # obey Expires if max-age is not present
        (200, {'Date': self.yesterday,
               'Age': '86400',
               'Cache-Control': 'public',
               'Expires': self.tomorrow,
               'Last-Modified': self.yesterday,
               }),
        # Default missing Date header to right now
        (200, {'Expires': self.tomorrow}),
        # Firefox - Expires if age is greater than 10% of (Date -
        # Last-Modified).
        # Floor division keeps the Age value an integer string ('8639');
        # true division on Python 3 would send '8639.0', which is not a
        # valid Age value.
        (200, {'Date': self.today,
               'Last-Modified': self.yesterday,
               'Age': str(86400 // 10 - 1)}),
        # Firefox - Set one year maxage to permanent redirects missing
        # expiration info
        (300, {}), (301, {}), (308, {}),
    ]
    with self._middleware() as mw:
        for idx, (status, headers) in enumerate(sampledata):
            req0 = Request('http://example-%d.com' % idx)
            res0 = Response(req0.url, status=status, headers=headers)

            # cache fresh response
            res1 = self._process_requestresponse(mw, req0, res0)
            self.assertEqualResponse(res1, res0)
            assert 'cached' not in res1.flags

            # return fresh cached response without network interaction
            res2 = self._process_requestresponse(mw, req0, None)
            self.assertEqualResponse(res1, res2)
            assert 'cached' in res2.flags

            # validate cached response if request max-age set as 0
            req1 = req0.replace(headers={'Cache-Control': 'max-age=0'})
            res304 = res0.replace(status=304)
            assert mw.process_request(req1, self.spider) is None
            res3 = self._process_requestresponse(mw, req1, res304)
            self.assertEqualResponse(res1, res3)
            assert 'cached' in res3.flags
def test_response_cacheability(self):
    """Check which responses the middleware stores.

    Each case is ``(expected_to_be_cached, status, headers)``. The response
    is sent through the middleware twice; on the second pass a 304 is used
    when caching is expected, so that the stored copy must be returned.
    """
    cases = [
        # 304 responses are never stored, whatever headers they carry
        (False, 304, {}),
        (False, 304, {'Last-Modified': self.yesterday}),
        (False, 304, {'Expires': self.tomorrow}),
        (False, 304, {'Etag': 'bar'}),
        (False, 304, {'Cache-Control': 'max-age=3600'}),
        # no-store always wins, also when combined with expiration info
        (False, 200, {'Cache-Control': 'no-store'}),
        (False, 200, {'Cache-Control': 'no-store, max-age=300'}),
        (False, 200, {'Cache-Control': 'no-store',
                      'Expires': self.tomorrow}),
        # Without expiration and/or validation headers nothing is stored
        (False, 200, {}),
        (False, 302, {}),
        (False, 307, {}),
        (False, 404, {}),
        # Expiration and/or validation headers make a response storable
        (True, 200, {'Last-Modified': self.yesterday}),
        (True, 203, {'Last-Modified': self.yesterday}),
        (True, 300, {'Last-Modified': self.yesterday}),
        (True, 301, {'Last-Modified': self.yesterday}),
        (True, 401, {'Last-Modified': self.yesterday}),
        (True, 404, {'Cache-Control': 'public, max-age=600'}),
        (True, 302, {'Expires': self.tomorrow}),
        (True, 200, {'Etag': 'foo'}),
    ]
    with self._middleware() as mw:
        for position, (expect_cached, status, headers) in enumerate(cases):
            request = Request('http://example-%d.com' % position)
            original = Response(request.url, status=status, headers=headers)
            first = self._process_requestresponse(mw, request, original)

            not_modified = original.replace(status=304)
            if expect_cached:
                second_reply = not_modified
            else:
                second_reply = original
            second = self._process_requestresponse(
                mw, request, second_reply)

            self.assertEqualResponse(first, original)
            self.assertEqualResponse(second, original)

            stored = mw.storage.retrieve_response(self.spider, request)
            if expect_cached:
                self.assertEqualResponse(stored, first)
                assert 'cached' in second.flags and second.status != 304
            else:
                self.assertFalse(stored)
                assert 'cached' not in second.flags
def test_cached_and_fresh(self):
    """Check that responses which are still fresh are answered from cache.

    For every sample: the first exchange stores the response, a second
    exchange with no network response must return the stored copy flagged
    ``cached``, and a request with ``max-age=0`` must go to the network
    and accept a 304 as validation of the stored copy.
    """
    sampledata = [
        (200, {'Date': self.yesterday, 'Expires': self.tomorrow}),
        (200, {'Date': self.yesterday, 'Cache-Control': 'max-age=86405'}),
        (200, {'Age': '299', 'Cache-Control': 'max-age=300'}),
        # Obey max-age if present over any others
        (200, {'Date': self.today,
               'Age': '86405',
               'Cache-Control': 'max-age=' + str(86400 * 3),
               'Expires': self.yesterday,
               'Last-Modified': self.yesterday,
               }),
        # obey Expires if max-age is not present
        (200, {'Date': self.yesterday,
               'Age': '86400',
               'Cache-Control': 'public',
               'Expires': self.tomorrow,
               'Last-Modified': self.yesterday,
               }),
        # Default missing Date header to right now
        (200, {'Expires': self.tomorrow}),
        # Firefox - Expires if age is greater than 10% of (Date - Last-Modified)
        # ``//`` is used so the header is the integer string '8639'; with
        # ``/`` Python 3 produces '8639.0', which is not a valid Age value.
        (200, {'Date': self.today,
               'Last-Modified': self.yesterday,
               'Age': str(86400 // 10 - 1)}),
        # Firefox - Set one year maxage to permanent redirects missing expiration info
        (300, {}), (301, {}), (308, {}),
    ]
    with self._middleware() as mw:
        for idx, (status, headers) in enumerate(sampledata):
            req0 = Request('http://example-%d.com' % idx)
            res0 = Response(req0.url, status=status, headers=headers)

            # cache fresh response
            res1 = self._process_requestresponse(mw, req0, res0)
            self.assertEqualResponse(res1, res0)
            assert 'cached' not in res1.flags

            # return fresh cached response without network interaction
            res2 = self._process_requestresponse(mw, req0, None)
            self.assertEqualResponse(res1, res2)
            assert 'cached' in res2.flags

            # validate cached response if request max-age set as 0
            req1 = req0.replace(headers={'Cache-Control': 'max-age=0'})
            res304 = res0.replace(status=304)
            assert mw.process_request(req1, self.spider) is None
            res3 = self._process_requestresponse(mw, req1, res304)
            self.assertEqualResponse(res1, res3)
            assert 'cached' in res3.flags
def test_response_cacheability(self):
    """Verify the storing decision for a table of status/header pairs.

    Every row states whether the response should end up in the storage.
    The response goes through the middleware twice; when storing is
    expected the second network answer is a 304, so the stored copy has to
    be returned instead of it.
    """
    expectations = [
        # A 304 is never stored, regardless of its headers
        (False, 304, {}),
        (False, 304, {'Last-Modified': self.yesterday}),
        (False, 304, {'Expires': self.tomorrow}),
        (False, 304, {'Etag': 'bar'}),
        (False, 304, {'Cache-Control': 'max-age=3600'}),
        # no-store is always obeyed
        (False, 200, {'Cache-Control': 'no-store'}),
        # invalid combination: no-store together with max-age
        (False, 200, {'Cache-Control': 'no-store, max-age=300'}),
        # invalid combination: no-store together with Expires
        (False, 200, {'Cache-Control': 'no-store',
                      'Expires': self.tomorrow}),
        # Responses lacking expiration and/or validation headers are ignored
        (False, 200, {}),
        (False, 302, {}),
        (False, 307, {}),
        (False, 404, {}),
        # Responses with expiration and/or validation headers are stored
        (True, 200, {'Last-Modified': self.yesterday}),
        (True, 203, {'Last-Modified': self.yesterday}),
        (True, 300, {'Last-Modified': self.yesterday}),
        (True, 301, {'Last-Modified': self.yesterday}),
        (True, 401, {'Last-Modified': self.yesterday}),
        (True, 404, {'Cache-Control': 'public, max-age=600'}),
        (True, 302, {'Expires': self.tomorrow}),
        (True, 200, {'Etag': 'foo'}),
    ]
    with self._middleware() as mw:
        for number, (storable, status, headers) in enumerate(expectations):
            request = Request('http://example-%d.com' % number)
            served = Response(request.url, status=status, headers=headers)
            first_pass = self._process_requestresponse(mw, request, served)

            revalidation = served.replace(status=304)
            network_reply = revalidation if storable else served
            second_pass = self._process_requestresponse(
                mw, request, network_reply)

            self.assertEqualResponse(first_pass, served)
            self.assertEqualResponse(second_pass, served)

            in_storage = mw.storage.retrieve_response(self.spider, request)
            if not storable:
                self.assertFalse(in_storage)
                assert 'cached' not in second_pass.flags
            else:
                self.assertEqualResponse(in_storage, first_pass)
                assert 'cached' in second_pass.flags and second_pass.status != 304
def test_cached_and_stale(self):
    """Verify that stale cached responses are refetched or revalidated.

    For every sample the expired response is cached first; a second
    exchange must return the new network response rather than the stale
    copy, and, when a validator (``ETag`` / ``Last-Modified``) is present,
    a 304 answer must yield the stored response flagged ``cached``.
    """
    sampledata = [
        (200, {"Date": self.today, "Expires": self.yesterday}),
        (200, {"Date": self.today,
               "Expires": self.yesterday,
               "Last-Modified": self.yesterday}),
        (200, {"Expires": self.yesterday}),
        (200, {"Expires": self.yesterday, "ETag": "foo"}),
        (200, {"Expires": self.yesterday, "Last-Modified": self.yesterday}),
        (200, {"Expires": self.tomorrow, "Age": "86405"}),
        (200, {"Cache-Control": "max-age=86400", "Age": "86405"}),
        # no-cache forces expiration, also revalidation if validators exists
        (200, {"Cache-Control": "no-cache"}),
        (200, {"Cache-Control": "no-cache", "ETag": "foo"}),
        (200, {"Cache-Control": "no-cache", "Last-Modified": self.yesterday}),
    ]
    with self._middleware() as mw:
        for idx, (status, headers) in enumerate(sampledata):
            req0 = Request("http://example-%d.com" % idx)
            res0a = Response(req0.url, status=status, headers=headers)

            # cache expired response
            res1 = self._process_requestresponse(mw, req0, res0a)
            self.assertEqualResponse(res1, res0a)
            assert "cached" not in res1.flags

            # Same request but as cached response is stale a new response must
            # be returned
            # The body is a bytes literal: Response bodies must be bytes,
            # and a str body raises TypeError on Python 3.
            res0b = res0a.replace(body=b"bar")
            res2 = self._process_requestresponse(mw, req0, res0b)
            self.assertEqualResponse(res2, res0b)
            assert "cached" not in res2.flags

            # Previous response expired too, subsequent request to same
            # resource must revalidate and succeed on 304 if validators
            # are present
            if "ETag" in headers or "Last-Modified" in headers:
                res0c = res0b.replace(status=304)
                res3 = self._process_requestresponse(mw, req0, res0c)
                self.assertEqualResponse(res3, res0b)
                assert "cached" in res3.flags
def test_cached_and_stale(self):
    """Run the stale-response scenarios against the cache middleware.

    Every sample is a response that is already expired when stored. The
    test asserts that it is not served from cache on the next exchange,
    that validators allow a 304 to revive the stored copy, that a 500
    falls back to the stored copy unless ``must-revalidate`` is present,
    and that a ``max-stale`` request can still obtain the stale copy.
    """
    yesterday = self.yesterday
    samples = [
        (200, {"Date": self.today, "Expires": yesterday}),
        (200, {"Date": self.today,
               "Expires": yesterday,
               "Last-Modified": yesterday}),
        (200, {"Expires": yesterday}),
        (200, {"Expires": yesterday, "ETag": "foo"}),
        (200, {"Expires": yesterday, "Last-Modified": yesterday}),
        (200, {"Expires": self.tomorrow, "Age": "86405"}),
        (200, {"Cache-Control": "max-age=86400", "Age": "86405"}),
        # no-cache expires the response at once; validators, when present,
        # allow it to be revalidated
        (200, {"Cache-Control": "no-cache"}),
        (200, {"Cache-Control": "no-cache", "ETag": "foo"}),
        (200, {"Cache-Control": "no-cache", "Last-Modified": yesterday}),
        (200, {"Cache-Control": "no-cache,must-revalidate",
               "Last-Modified": yesterday}),
        (200, {"Cache-Control": "must-revalidate",
               "Expires": yesterday,
               "Last-Modified": yesterday}),
        (200, {"Cache-Control": "max-age=86400,must-revalidate",
               "Age": "86405"}),
    ]
    with self._middleware() as mw:
        for n, (status, headers) in enumerate(samples):
            request = Request("http://example-%d.com" % n)
            initial = Response(request.url, status=status, headers=headers)

            # Store the expired response.
            out_first = self._process_requestresponse(mw, request, initial)
            self.assertEqualResponse(out_first, initial)
            assert "cached" not in out_first.flags

            # The stored copy is stale, so the new response is what comes
            # back for the same request.
            newer = initial.replace(body=b"bar")
            out_second = self._process_requestresponse(mw, request, newer)
            self.assertEqualResponse(out_second, newer)
            assert "cached" not in out_second.flags

            directives = headers.get("Cache-Control", "")
            must_revalidate = "must-revalidate" in directives

            # With validators present, a 304 revalidates the stored copy.
            if "ETag" in headers or "Last-Modified" in headers:
                reply_304 = newer.replace(status=304)
                out_304 = self._process_requestresponse(
                    mw, request, reply_304)
                self.assertEqualResponse(out_304, newer)
                assert "cached" in out_304.flags

                # On a server error the stored copy is returned, unless the
                # stored response demands revalidation.
                reply_500 = newer.replace(status=500)
                out_500 = self._process_requestresponse(
                    mw, request, reply_500)
                if must_revalidate:
                    assert "cached" not in out_500.flags
                    self.assertEqualResponse(out_500, reply_500)
                else:
                    assert "cached" in out_500.flags
                    self.assertEqualResponse(out_500, newer)

            # max-stale requests may get the expired copy, except when the
            # stored response has no-cache or must-revalidate.
            max_stale_request = request.replace(
                headers={"Cache-Control": "max-stale"})
            out_stale = self._process_requestresponse(
                mw, max_stale_request, newer)
            self.assertEqualResponse(out_stale, newer)
            if "no-cache" not in directives and not must_revalidate:
                assert "cached" in out_stale.flags
            else:
                assert "cached" not in out_stale.flags
def test_cached_and_fresh(self):
    """Verify that still-fresh responses are returned from the cache.

    Each sample is stored by a first exchange, must then be served with
    the ``cached`` flag when no network response is available, and must be
    validated through a 304 when the request asks for ``max-age=0``.
    """
    sampledata = [
        (200, {"Date": self.yesterday, "Expires": self.tomorrow}),
        (200, {"Date": self.yesterday, "Cache-Control": "max-age=86405"}),
        (200, {"Age": "299", "Cache-Control": "max-age=300"}),
        # Obey max-age if present over any others
        (
            200,
            {
                "Date": self.today,
                "Age": "86405",
                "Cache-Control": "max-age=" + str(86400 * 3),
                "Expires": self.yesterday,
                "Last-Modified": self.yesterday,
            },
        ),
        # obey Expires if max-age is not present
        (
            200,
            {
                "Date": self.yesterday,
                "Age": "86400",
                "Cache-Control": "public",
                "Expires": self.tomorrow,
                "Last-Modified": self.yesterday,
            },
        ),
        # Default missing Date header to right now
        (200, {"Expires": self.tomorrow}),
        # Firefox - Expires if age is greater than 10% of (Date - Last-Modified)
        # Floor division yields the integer string "8639"; true division
        # would yield "8639.0" on Python 3, which is not a valid Age value.
        (
            200,
            {
                "Date": self.today,
                "Last-Modified": self.yesterday,
                "Age": str(86400 // 10 - 1),
            },
        ),
        # Firefox - Set one year maxage to permanent redirects missing expiration info
        (300, {}),
        (301, {}),
        (308, {}),
    ]
    with self._middleware() as mw:
        for idx, (status, headers) in enumerate(sampledata):
            req0 = Request("http://example-%d.com" % idx)
            res0 = Response(req0.url, status=status, headers=headers)

            # cache fresh response
            res1 = self._process_requestresponse(mw, req0, res0)
            self.assertEqualResponse(res1, res0)
            assert "cached" not in res1.flags

            # return fresh cached response without network interaction
            res2 = self._process_requestresponse(mw, req0, None)
            self.assertEqualResponse(res1, res2)
            assert "cached" in res2.flags

            # validate cached response if request max-age set as 0
            req1 = req0.replace(headers={"Cache-Control": "max-age=0"})
            res304 = res0.replace(status=304)
            assert mw.process_request(req1, self.spider) is None
            res3 = self._process_requestresponse(mw, req1, res304)
            self.assertEqualResponse(res1, res3)
            assert "cached" in res3.flags
def test_response_cacheability(self):
    """Check the storing decision with default settings and ALWAYS_STORE.

    The first pass uses a default middleware and the expectation column of
    the table. The second pass enables ``HTTPCACHE_ALWAYS_STORE`` and
    expects everything to be stored except ``no-store`` responses and 304s.
    In both passes each response goes through the middleware twice; when
    storing is expected the second network answer is a 304.
    """
    table = [
        # A 304 is never stored, whatever the server sends with it
        (False, 304, {}),
        (False, 304, {"Last-Modified": self.yesterday}),
        (False, 304, {"Expires": self.tomorrow}),
        (False, 304, {"Etag": "bar"}),
        (False, 304, {"Cache-Control": "max-age=3600"}),
        # no-store is always obeyed, also in contradictory combinations
        (False, 200, {"Cache-Control": "no-store"}),
        (False, 200, {"Cache-Control": "no-store, max-age=300"}),
        (False, 200, {"Cache-Control": "no-store",
                      "Expires": self.tomorrow}),
        # No expiration and/or validation headers: not stored
        (False, 200, {}),
        (False, 302, {}),
        (False, 307, {}),
        (False, 404, {}),
        # Expiration and/or validation headers present: stored
        (True, 200, {"Last-Modified": self.yesterday}),
        (True, 203, {"Last-Modified": self.yesterday}),
        (True, 300, {"Last-Modified": self.yesterday}),
        (True, 301, {"Last-Modified": self.yesterday}),
        (True, 308, {"Last-Modified": self.yesterday}),
        (True, 401, {"Last-Modified": self.yesterday}),
        (True, 404, {"Cache-Control": "public, max-age=600"}),
        (True, 302, {"Expires": self.tomorrow}),
        (True, 200, {"Etag": "foo"}),
    ]

    with self._middleware() as mw:
        for row, (expect_stored, status, headers) in enumerate(table):
            request = Request("http://example-%d.com" % row)
            original = Response(request.url, status=status, headers=headers)
            first = self._process_requestresponse(mw, request, original)

            not_modified = original.replace(status=304)
            if expect_stored:
                second_reply = not_modified
            else:
                second_reply = original
            second = self._process_requestresponse(
                mw, request, second_reply)

            self.assertEqualResponse(first, original)
            self.assertEqualResponse(second, original)

            stored = mw.storage.retrieve_response(self.spider, request)
            if expect_stored:
                self.assertEqualResponse(stored, first)
                assert "cached" in second.flags and second.status != 304
            else:
                self.assertFalse(stored)
                assert "cached" not in second.flags

    # With ALWAYS_STORE everything is stored, except responses containing
    # no-store and 304 responses.
    with self._middleware(HTTPCACHE_ALWAYS_STORE=True) as mw:
        for row, (_, status, headers) in enumerate(table):
            has_no_store = "no-store" in headers.get("Cache-Control", "")
            expect_stored = not has_no_store and status != 304

            request = Request("http://example2-%d.com" % row)
            original = Response(request.url, status=status, headers=headers)
            first = self._process_requestresponse(mw, request, original)

            not_modified = original.replace(status=304)
            if expect_stored:
                second_reply = not_modified
            else:
                second_reply = original
            second = self._process_requestresponse(
                mw, request, second_reply)

            self.assertEqualResponse(first, original)
            self.assertEqualResponse(second, original)

            stored = mw.storage.retrieve_response(self.spider, request)
            if expect_stored:
                self.assertEqualResponse(stored, first)
                assert "cached" in second.flags and second.status != 304
            else:
                self.assertFalse(stored)
                assert "cached" not in second.flags