예제 #1
0
    def _retry(self, request, reason, spider):
        """Build the retry request, or a sentinel response once retries run out.

        Args:
            request: the failed request. Its ``meta['retry_times']`` counter
                (0 when absent) is incremented to get the attempt number.
            reason: the failure cause. An ``Exception`` instance is replaced
                by ``global_object_name`` of its class before it is used in
                the per-reason stats key.
            spider: supplies the logger and the ``crawler.stats`` collector.

        Returns:
            While the attempt number does not exceed ``self.max_retry_times``:
            a copy of ``request`` carrying the new ``retry_times``, with
            ``dont_filter`` enabled and its priority shifted by
            ``self.priority_adjust``. Otherwise: a ``Response`` built for an
            empty URL whose ``status`` is set to 12138.
        """
        retries = request.meta.get('retry_times', 0) + 1
        stats = spider.crawler.stats
        # Captured before ``reason`` may be rewritten below, so the log line
        # always shows the original failure object.
        log_args = {'request': request, 'retries': retries, 'reason': reason}

        if retries <= self.max_retry_times:
            spider.logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                                log_args,
                                extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            return retryreq

        stats.inc_value('retry/max_reached')
        spider.logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                            log_args,
                            extra={'spider': spider})
        # A freshly built Response is returned as-is: replace() hands back a
        # new object instead of modifying the receiver, so calling it and
        # dropping the result (as was done here before) had no effect.
        response = Response('')
        # NOTE(review): 12138 is not a real HTTP status; presumably callers
        # treat it as a "retries exhausted" marker - confirm.
        response.status = 12138
        return response
 def test_request_cacheability(self):
     """Check how request-side ``Cache-Control`` directives affect caching.

     Three requests for the same URL are used: one sent with ``no-store``
     (nothing may end up in storage), one without directives (stored, then
     answered from cache) and one with ``no-cache`` (the cached copy is not
     returned, but the new response is stored).
     """
     plain_req = Request('http://example.com')
     no_store_req = plain_req.replace(headers={'Cache-Control': 'no-store'})
     no_cache_req = plain_req.replace(headers={'Cache-Control': 'no-cache'})
     origin_res = Response(
         self.request.url,
         status=200,
         headers={'Expires': self.tomorrow},
     )

     with self._middleware() as mw:
         # no-store: the response comes back unchanged and storage stays empty
         passthrough = self._process_requestresponse(mw, no_store_req, origin_res)
         self.assertEqualResponse(passthrough, origin_res)
         assert mw.storage.retrieve_response(self.spider, no_store_req) is None

         # the same URL without no-store is not flagged as cached at first ...
         first_hit = self._process_requestresponse(mw, plain_req, origin_res)
         assert 'cached' not in first_hit.flags
         # ... but process_request alone then yields the cached copy
         from_cache = mw.process_request(plain_req, self.spider)
         assert 'cached' in from_cache.flags
         self.assertEqualResponse(first_hit, from_cache)

         # no-cache: the cached copy must not be returned, yet the new
         # response may be stored
         updated_res = origin_res.replace(body=b'foo')
         bypassed = self._process_requestresponse(mw, no_cache_req, updated_res)
         self.assertEqualResponse(bypassed, updated_res)
         assert 'cached' not in bypassed.flags
         refreshed = self._process_requestresponse(mw, plain_req, None)
         self.assertEqualResponse(refreshed, updated_res)
         assert 'cached' in refreshed.flags
예제 #3
0
 def test_request_cacheability(self):
     """Check how request-side ``Cache-Control`` directives affect caching.

     A ``no-store`` request must leave storage empty, a request without
     directives must be stored and then served from cache, and a
     ``no-cache`` request must not receive the cached copy while still
     allowing the new response to be stored.
     """
     res0 = Response(self.request.url,
                     status=200,
                     headers={'Expires': self.tomorrow})
     req0 = Request('http://example.com')
     req1 = req0.replace(headers={'Cache-Control': 'no-store'})
     req2 = req0.replace(headers={'Cache-Control': 'no-cache'})
     with self._middleware() as mw:
         # response for a request with no-store must not be cached
         res1 = self._process_requestresponse(mw, req1, res0)
         self.assertEqualResponse(res1, res0)
         assert mw.storage.retrieve_response(self.spider, req1) is None
         # Re-do request without no-store and expect it to be cached
         res2 = self._process_requestresponse(mw, req0, res0)
         assert 'cached' not in res2.flags
         res3 = mw.process_request(req0, self.spider)
         assert 'cached' in res3.flags
         self.assertEqualResponse(res2, res3)
         # request with no-cache directive must not return cached response
         # but it allows new response to be stored
         # The body is given as a bytes literal so it has the same type on
         # Python 2 and Python 3 (a plain 'foo' is text on Python 3).
         res0b = res0.replace(body=b'foo')
         res4 = self._process_requestresponse(mw, req2, res0b)
         self.assertEqualResponse(res4, res0b)
         assert 'cached' not in res4.flags
         res5 = self._process_requestresponse(mw, req0, None)
         self.assertEqualResponse(res5, res0b)
         assert 'cached' in res5.flags
예제 #4
0
 def parse_videos(self, response: Response):
     """Yield one ``self.lomLoader.parse`` result per item of a video list.

     The response body is decoded as JSON and must have ``kind`` equal to
     ``"youtube#videoListResponse"``. For every entry under ``"items"`` a
     copy of the response is made with the URL from ``self.get_video_url``
     and the entry is placed in ``meta["item"]`` before it is handed to the
     loader.
     """
     payload = json.loads(response.body)
     assert payload["kind"] == "youtube#videoListResponse"
     for video in payload["items"]:
         video_url = self.get_video_url(video)
         video_response = response.replace(url=video_url)
         # NOTE(review): if the copy shares ``meta`` with the original
         # response, each iteration overwrites the same "item" slot - confirm.
         video_response.meta["item"] = video
         yield self.lomLoader.parse(video_response)
 def test_cached_and_stale(self):
     """Check how already-expired cache entries are replaced, revalidated or reused.

     Each sample is a ``(status, headers)`` pair. For every sample the test
     sends the same request several times and checks the ``'cached'`` flag
     and the returned response: after a replacement response, after a 304
     and a 500 (only when validators are present), and for a request that
     carries ``max-stale``.
     """
     stale_samples = [
         (200, {'Date': self.today, 'Expires': self.yesterday}),
         (200, {
             'Date': self.today,
             'Expires': self.yesterday,
             'Last-Modified': self.yesterday,
         }),
         (200, {'Expires': self.yesterday}),
         (200, {'Expires': self.yesterday, 'ETag': 'foo'}),
         (200, {'Expires': self.yesterday, 'Last-Modified': self.yesterday}),
         (200, {'Expires': self.tomorrow, 'Age': '86405'}),
         (200, {'Cache-Control': 'max-age=86400', 'Age': '86405'}),
         # no-cache forces expiration, also revalidation if validators exist
         (200, {'Cache-Control': 'no-cache'}),
         (200, {'Cache-Control': 'no-cache', 'ETag': 'foo'}),
         (200, {'Cache-Control': 'no-cache', 'Last-Modified': self.yesterday}),
         (200, {
             'Cache-Control': 'no-cache,must-revalidate',
             'Last-Modified': self.yesterday,
         }),
         (200, {
             'Cache-Control': 'must-revalidate',
             'Expires': self.yesterday,
             'Last-Modified': self.yesterday,
         }),
         (200, {'Cache-Control': 'max-age=86400,must-revalidate', 'Age': '86405'}),
     ]
     with self._middleware() as mw:
         for idx, (status, headers) in enumerate(stale_samples):
             request = Request('http://example-%d.com' % idx)
             expired = Response(request.url, status=status, headers=headers)

             # the expired response itself is returned, not a cached copy
             first = self._process_requestresponse(mw, request, expired)
             self.assertEqualResponse(first, expired)
             assert 'cached' not in first.flags

             # the stored entry is stale, so a repeat of the request must
             # yield the new response
             replacement = expired.replace(body=b'bar')
             second = self._process_requestresponse(mw, request, replacement)
             self.assertEqualResponse(second, replacement)
             assert 'cached' not in second.flags

             cache_control = headers.get('Cache-Control', '')
             has_validators = 'ETag' in headers or 'Last-Modified' in headers
             if has_validators:
                 # that entry expired too: with validators present, a 304
                 # must resolve to the stored response
                 not_modified = replacement.replace(status=304)
                 revalidated = self._process_requestresponse(mw, request, not_modified)
                 self.assertEqualResponse(revalidated, replacement)
                 assert 'cached' in revalidated.flags

                 # on a server error the cached response is returned unless
                 # the cached response carries must-revalidate
                 server_error = replacement.replace(status=500)
                 after_error = self._process_requestresponse(mw, request, server_error)
                 if 'must-revalidate' not in cache_control:
                     assert 'cached' in after_error.flags
                     self.assertEqualResponse(after_error, replacement)
                 else:
                     assert 'cached' not in after_error.flags
                     self.assertEqualResponse(after_error, server_error)

             # a max-stale request may receive the expired cached response
             # unless that response has no-cache or must-revalidate
             tolerant_request = request.replace(headers={'Cache-Control': 'max-stale'})
             tolerant = self._process_requestresponse(mw, tolerant_request, replacement)
             self.assertEqualResponse(tolerant, replacement)
             if 'no-cache' not in cache_control and 'must-revalidate' not in cache_control:
                 assert 'cached' in tolerant.flags
             else:
                 assert 'cached' not in tolerant.flags
예제 #6
0
 def test_cached_and_stale(self):
     """Verify the middleware's treatment of cache entries that are stale.

     For each ``(status, headers)`` sample the same request is replayed
     against a replacement response, then (if the sample has an ``ETag`` or
     ``Last-Modified`` header) against a 304 and a 500, and finally as a
     ``max-stale`` request; the ``'cached'`` flag and the returned response
     are asserted at every step.
     """
     yesterday_expiry = [
         (200, {'Date': self.today, 'Expires': self.yesterday}),
         (200, {'Date': self.today, 'Expires': self.yesterday,
                'Last-Modified': self.yesterday}),
         (200, {'Expires': self.yesterday}),
         (200, {'Expires': self.yesterday, 'ETag': 'foo'}),
         (200, {'Expires': self.yesterday, 'Last-Modified': self.yesterday}),
     ]
     aged_out = [
         (200, {'Expires': self.tomorrow, 'Age': '86405'}),
         (200, {'Cache-Control': 'max-age=86400', 'Age': '86405'}),
     ]
     # no-cache forces expiration, also revalidation if validators exist
     forced_revalidation = [
         (200, {'Cache-Control': 'no-cache'}),
         (200, {'Cache-Control': 'no-cache', 'ETag': 'foo'}),
         (200, {'Cache-Control': 'no-cache', 'Last-Modified': self.yesterday}),
         (200, {'Cache-Control': 'no-cache,must-revalidate',
                'Last-Modified': self.yesterday}),
         (200, {'Cache-Control': 'must-revalidate', 'Expires': self.yesterday,
                'Last-Modified': self.yesterday}),
         (200, {'Cache-Control': 'max-age=86400,must-revalidate', 'Age': '86405'}),
     ]
     # Concatenation keeps the samples in their original order, so every
     # sample gets the same index (and therefore the same URL) as before.
     sampledata = yesterday_expiry + aged_out + forced_revalidation

     with self._middleware() as mw:
         for idx, (status, headers) in enumerate(sampledata):
             req = Request('http://example-%d.com' % idx)
             res_initial = Response(req.url, status=status, headers=headers)

             # step 1: the expired response is handed back as received
             out1 = self._process_requestresponse(mw, req, res_initial)
             self.assertEqualResponse(out1, res_initial)
             assert 'cached' not in out1.flags

             # step 2: the stored copy is stale, so the new response wins
             res_new = res_initial.replace(body=b'bar')
             out2 = self._process_requestresponse(mw, req, res_new)
             self.assertEqualResponse(out2, res_new)
             assert 'cached' not in out2.flags

             cc = headers.get('Cache-Control', '')
             must_revalidate = 'must-revalidate' in cc

             # step 3 (validators only): a 304 must resolve to the stored
             # response, and a 500 falls back to it unless must-revalidate
             if 'ETag' in headers or 'Last-Modified' in headers:
                 out3 = self._process_requestresponse(
                     mw, req, res_new.replace(status=304))
                 self.assertEqualResponse(out3, res_new)
                 assert 'cached' in out3.flags

                 res_error = res_new.replace(status=500)
                 out4 = self._process_requestresponse(mw, req, res_error)
                 if must_revalidate:
                     assert 'cached' not in out4.flags
                     self.assertEqualResponse(out4, res_error)
                 else:
                     assert 'cached' in out4.flags
                     self.assertEqualResponse(out4, res_new)

             # step 4: max-stale requests may get the expired cached copy
             # unless the cached response has no-cache or must-revalidate
             req_max_stale = req.replace(headers={'Cache-Control': 'max-stale'})
             out5 = self._process_requestresponse(mw, req_max_stale, res_new)
             self.assertEqualResponse(out5, res_new)
             if 'no-cache' in cc or must_revalidate:
                 assert 'cached' not in out5.flags
             else:
                 assert 'cached' in out5.flags
 def test_cached_and_fresh(self):
     """Check that fresh cache entries are served from cache and can be revalidated.

     Each sample is a ``(status, headers)`` pair. For every sample the test
     stores the response, expects a cached copy when no new response is
     supplied, and then checks that a ``max-age=0`` request is not answered
     by ``process_request`` but resolves to the cached copy after a 304.
     """
     sampledata = [
         (200, {'Date': self.yesterday, 'Expires': self.tomorrow}),
         (200, {'Date': self.yesterday, 'Cache-Control': 'max-age=86405'}),
         (200, {'Age': '299', 'Cache-Control': 'max-age=300'}),
         # Obey max-age if present over any others
         (200, {'Date': self.today,
                'Age': '86405',
                'Cache-Control': 'max-age=' + str(86400 * 3),
                'Expires': self.yesterday,
                'Last-Modified': self.yesterday,
                }),
         # obey Expires if max-age is not present
         (200, {'Date': self.yesterday,
                'Age': '86400',
                'Cache-Control': 'public',
                'Expires': self.tomorrow,
                'Last-Modified': self.yesterday,
                }),
         # Default missing Date header to right now
         (200, {'Expires': self.tomorrow}),
         # Firefox - Expires if age is greater than 10% of (Date -
         # Last-Modified)
         # Floor division keeps Age a whole number of seconds ('8639'); true
         # division would produce the string '8639.0' on Python 3.
         (200,
          {'Date': self.today,
           'Last-Modified': self.yesterday,
           'Age': str(86400 // 10 - 1)}),
         # Firefox - Set one year maxage to permanent redirects missing
         # expiration info
         (300, {}), (301, {}), (308, {}),
     ]
     with self._middleware() as mw:
         for idx, (status, headers) in enumerate(sampledata):
             req0 = Request('http://example-%d.com' % idx)
             res0 = Response(req0.url, status=status, headers=headers)
             # cache fresh response
             res1 = self._process_requestresponse(mw, req0, res0)
             self.assertEqualResponse(res1, res0)
             assert 'cached' not in res1.flags
             # return fresh cached response without network interaction
             res2 = self._process_requestresponse(mw, req0, None)
             self.assertEqualResponse(res1, res2)
             assert 'cached' in res2.flags
             # validate cached response if request max-age set as 0
             req1 = req0.replace(headers={'Cache-Control': 'max-age=0'})
             res304 = res0.replace(status=304)
             assert mw.process_request(req1, self.spider) is None
             res3 = self._process_requestresponse(mw, req1, res304)
             self.assertEqualResponse(res1, res3)
             assert 'cached' in res3.flags
 def test_response_cacheability(self):
     """Check which responses end up in cache storage.

     Each case is ``(shouldcache, status, headers)``. The response is
     processed once, then the request is repeated (answered with a 304 when
     the case is expected to be cached, otherwise with the same response)
     and both the storage content and the ``'cached'`` flag are asserted.
     """
     never_cached = [
         # 304 is not cacheable no matter what the server sends
         (False, 304, {}),
         (False, 304, {'Last-Modified': self.yesterday}),
         (False, 304, {'Expires': self.tomorrow}),
         (False, 304, {'Etag': 'bar'}),
         (False, 304, {'Cache-Control': 'max-age=3600'}),
         # Always obey no-store cache control
         (False, 200, {'Cache-Control': 'no-store'}),
         (False, 200, {'Cache-Control': 'no-store, max-age=300'}),  # invalid
         (False, 200, {'Cache-Control': 'no-store', 'Expires': self.tomorrow}),  # invalid
         # Ignore responses missing expiration and/or validation headers
         (False, 200, {}),
         (False, 302, {}),
         (False, 307, {}),
         (False, 404, {}),
     ]
     # Cache responses with expiration and/or validation headers
     cached = [
         (True, 200, {'Last-Modified': self.yesterday}),
         (True, 203, {'Last-Modified': self.yesterday}),
         (True, 300, {'Last-Modified': self.yesterday}),
         (True, 301, {'Last-Modified': self.yesterday}),
         (True, 401, {'Last-Modified': self.yesterday}),
         (True, 404, {'Cache-Control': 'public, max-age=600'}),
         (True, 302, {'Expires': self.tomorrow}),
         (True, 200, {'Etag': 'foo'}),
     ]
     # Same overall order as a single list, so indices (and URLs) match.
     cases = never_cached + cached

     with self._middleware() as mw:
         for idx, (shouldcache, status, headers) in enumerate(cases):
             request = Request('http://example-%d.com' % idx)
             original = Response(request.url, status=status, headers=headers)
             first = self._process_requestresponse(mw, request, original)
             not_modified = original.replace(status=304)
             if shouldcache:
                 second_input = not_modified
             else:
                 second_input = original
             second = self._process_requestresponse(mw, request, second_input)
             self.assertEqualResponse(first, original)
             self.assertEqualResponse(second, original)
             stored = mw.storage.retrieve_response(self.spider, request)
             if not shouldcache:
                 self.assertFalse(stored)
                 assert 'cached' not in second.flags
             else:
                 self.assertEqualResponse(stored, first)
                 assert 'cached' in second.flags and second.status != 304
 def test_cached_and_fresh(self):
     """Check that fresh cache entries are served from cache and can be revalidated.

     For each ``(status, headers)`` sample the response is stored, a cached
     copy is expected when no new response is supplied, and a ``max-age=0``
     request is expected to skip ``process_request`` and resolve to the
     cached copy once a 304 comes back.
     """
     sampledata = [
         (200, {'Date': self.yesterday, 'Expires': self.tomorrow}),
         (200, {'Date': self.yesterday, 'Cache-Control': 'max-age=86405'}),
         (200, {'Age': '299', 'Cache-Control': 'max-age=300'}),
         # Obey max-age if present over any others
         (200, {'Date': self.today,
                'Age': '86405',
                'Cache-Control': 'max-age=' + str(86400 * 3),
                'Expires': self.yesterday,
                'Last-Modified': self.yesterday,
                }),
         # obey Expires if max-age is not present
         (200, {'Date': self.yesterday,
                'Age': '86400',
                'Cache-Control': 'public',
                'Expires': self.tomorrow,
                'Last-Modified': self.yesterday,
                }),
         # Default missing Date header to right now
         (200, {'Expires': self.tomorrow}),
         # Firefox - Expires if age is greater than 10% of (Date - Last-Modified)
         # Floor division keeps Age a whole number of seconds ('8639'); true
         # division would produce the string '8639.0' on Python 3.
         (200, {'Date': self.today, 'Last-Modified': self.yesterday, 'Age': str(86400 // 10 - 1)}),
         # Firefox - Set one year maxage to permanent redirects missing expiration info
         (300, {}), (301, {}), (308, {}),
     ]
     with self._middleware() as mw:
         for idx, (status, headers) in enumerate(sampledata):
             req0 = Request('http://example-%d.com' % idx)
             res0 = Response(req0.url, status=status, headers=headers)
             # cache fresh response
             res1 = self._process_requestresponse(mw, req0, res0)
             self.assertEqualResponse(res1, res0)
             assert 'cached' not in res1.flags
             # return fresh cached response without network interaction
             res2 = self._process_requestresponse(mw, req0, None)
             self.assertEqualResponse(res1, res2)
             assert 'cached' in res2.flags
             # validate cached response if request max-age set as 0
             req1 = req0.replace(headers={'Cache-Control': 'max-age=0'})
             res304 = res0.replace(status=304)
             assert mw.process_request(req1, self.spider) is None
             res3 = self._process_requestresponse(mw, req1, res304)
             self.assertEqualResponse(res1, res3)
             assert 'cached' in res3.flags
예제 #10
0
 def test_response_cacheability(self):
     """Verify which status/header combinations get stored by the middleware.

     Cases are ``(shouldcache, status, headers)`` tuples. Each response is
     processed, the request is then replayed (with a 304 for cases expected
     to be cached, else with the same response), and the test asserts on
     the stored entry and on the ``'cached'`` flag of the second result.
     """
     cases = [
         # 304 is not cacheable no matter what the server sends
         (False, 304, {}),
         (False, 304, {'Last-Modified': self.yesterday}),
         (False, 304, {'Expires': self.tomorrow}),
         (False, 304, {'Etag': 'bar'}),
         (False, 304, {'Cache-Control': 'max-age=3600'}),
         # Always obey no-store cache control
         (False, 200, {'Cache-Control': 'no-store'}),
         # invalid
         (False, 200, {'Cache-Control': 'no-store, max-age=300'}),
         # invalid
         (False, 200, {
             'Cache-Control': 'no-store',
             'Expires': self.tomorrow,
         }),
         # Ignore responses missing expiration and/or validation headers
         (False, 200, {}),
         (False, 302, {}),
         (False, 307, {}),
         (False, 404, {}),
         # Cache responses with expiration and/or validation headers
         (True, 200, {'Last-Modified': self.yesterday}),
         (True, 203, {'Last-Modified': self.yesterday}),
         (True, 300, {'Last-Modified': self.yesterday}),
         (True, 301, {'Last-Modified': self.yesterday}),
         (True, 401, {'Last-Modified': self.yesterday}),
         (True, 404, {'Cache-Control': 'public, max-age=600'}),
         (True, 302, {'Expires': self.tomorrow}),
         (True, 200, {'Etag': 'foo'}),
     ]
     with self._middleware() as mw:
         for idx, case in enumerate(cases):
             shouldcache, status, headers = case
             req = Request('http://example-%d.com' % idx)
             res = Response(req.url, status=status, headers=headers)
             out_first = self._process_requestresponse(mw, req, res)
             res_304 = res.replace(status=304)
             # replay: a 304 stands in for the server when the response is
             # expected to be cached already
             out_second = self._process_requestresponse(
                 mw, req, res_304 if shouldcache else res)
             self.assertEqualResponse(out_first, res)
             self.assertEqualResponse(out_second, res)
             in_storage = mw.storage.retrieve_response(self.spider, req)
             if shouldcache:
                 self.assertEqualResponse(in_storage, out_first)
                 assert 'cached' in out_second.flags and out_second.status != 304
             else:
                 self.assertFalse(in_storage)
                 assert 'cached' not in out_second.flags
 def test_cached_and_stale(self):
     """Check that stale cache entries are replaced and revalidated on 304.

     For each ``(status, headers)`` sample the expired response is
     processed, the request is repeated with a new response (which must be
     returned uncached), and, when the sample has an ``ETag`` or
     ``Last-Modified`` header, a 304 must resolve to the stored response.
     """
     sampledata = [
         (200, {"Date": self.today, "Expires": self.yesterday}),
         (200, {"Date": self.today, "Expires": self.yesterday, "Last-Modified": self.yesterday}),
         (200, {"Expires": self.yesterday}),
         (200, {"Expires": self.yesterday, "ETag": "foo"}),
         (200, {"Expires": self.yesterday, "Last-Modified": self.yesterday}),
         (200, {"Expires": self.tomorrow, "Age": "86405"}),
         (200, {"Cache-Control": "max-age=86400", "Age": "86405"}),
         # no-cache forces expiration, also revalidation if validators exists
         (200, {"Cache-Control": "no-cache"}),
         (200, {"Cache-Control": "no-cache", "ETag": "foo"}),
         (200, {"Cache-Control": "no-cache", "Last-Modified": self.yesterday}),
     ]
     with self._middleware() as mw:
         for idx, (status, headers) in enumerate(sampledata):
             req0 = Request("http://example-%d.com" % idx)
             res0a = Response(req0.url, status=status, headers=headers)
             # cache expired response
             res1 = self._process_requestresponse(mw, req0, res0a)
             self.assertEqualResponse(res1, res0a)
             assert "cached" not in res1.flags
             # Same request but as cached response is stale a new response must
             # be returned
             # The body is given as a bytes literal so it has the same type
             # on Python 2 and Python 3 (a plain "bar" is text on Python 3).
             res0b = res0a.replace(body=b"bar")
             res2 = self._process_requestresponse(mw, req0, res0b)
             self.assertEqualResponse(res2, res0b)
             assert "cached" not in res2.flags
             # Previous response expired too, subsequent request to same
             # resource must revalidate and succeed on 304 if validators
             # are present
             if "ETag" in headers or "Last-Modified" in headers:
                 res0c = res0b.replace(status=304)
                 res3 = self._process_requestresponse(mw, req0, res0c)
                 self.assertEqualResponse(res3, res0b)
                 assert "cached" in res3.flags
 def test_cached_and_stale(self):
     """Check how already-expired cache entries are replaced, revalidated or reused.

     Every ``(status, headers)`` sample is replayed through the same steps:
     initial store, replacement by a new response, 304 and 500 handling
     when validators are present, and a ``max-stale`` request. The
     ``"cached"`` flag and the returned response are asserted at each step.
     """
     yday, today, tmrw = self.yesterday, self.today, self.tomorrow
     sampledata = [
         (200, {"Date": today, "Expires": yday}),
         (200, {"Date": today, "Expires": yday, "Last-Modified": yday}),
         (200, {"Expires": yday}),
         (200, {"Expires": yday, "ETag": "foo"}),
         (200, {"Expires": yday, "Last-Modified": yday}),
         (200, {"Expires": tmrw, "Age": "86405"}),
         (200, {"Cache-Control": "max-age=86400", "Age": "86405"}),
         # no-cache forces expiration, also revalidation if validators exist
         (200, {"Cache-Control": "no-cache"}),
         (200, {"Cache-Control": "no-cache", "ETag": "foo"}),
         (200, {"Cache-Control": "no-cache", "Last-Modified": yday}),
         (200, {"Cache-Control": "no-cache,must-revalidate", "Last-Modified": yday}),
         (200, {"Cache-Control": "must-revalidate", "Expires": yday, "Last-Modified": yday}),
         (200, {"Cache-Control": "max-age=86400,must-revalidate", "Age": "86405"}),
     ]
     with self._middleware() as mw:
         for idx, (status, headers) in enumerate(sampledata):
             request = Request("http://example-%d.com" % idx)
             expired = Response(request.url, status=status, headers=headers)

             # the expired response is returned as received
             got = self._process_requestresponse(mw, request, expired)
             self.assertEqualResponse(got, expired)
             assert "cached" not in got.flags

             # the stored entry is stale, so the new response must be returned
             fresh_body = expired.replace(body=b"bar")
             got = self._process_requestresponse(mw, request, fresh_body)
             self.assertEqualResponse(got, fresh_body)
             assert "cached" not in got.flags

             cc = headers.get("Cache-Control", "")
             if "ETag" in headers or "Last-Modified" in headers:
                 # validators present: a 304 must resolve to the stored
                 # response
                 got = self._process_requestresponse(
                     mw, request, fresh_body.replace(status=304))
                 self.assertEqualResponse(got, fresh_body)
                 assert "cached" in got.flags

                 # server error: the cached response is returned unless the
                 # cached response carries must-revalidate
                 failure = fresh_body.replace(status=500)
                 got = self._process_requestresponse(mw, request, failure)
                 if "must-revalidate" not in cc:
                     assert "cached" in got.flags
                     self.assertEqualResponse(got, fresh_body)
                 else:
                     assert "cached" not in got.flags
                     self.assertEqualResponse(got, failure)

             # max-stale requests can fetch expired cached responses unless
             # the cached response has no-cache or must-revalidate
             lenient = request.replace(headers={"Cache-Control": "max-stale"})
             got = self._process_requestresponse(mw, lenient, fresh_body)
             self.assertEqualResponse(got, fresh_body)
             if "no-cache" not in cc and "must-revalidate" not in cc:
                 assert "cached" in got.flags
             else:
                 assert "cached" not in got.flags
 def test_cached_and_fresh(self):
     """Check that fresh cache entries are served from cache and can be revalidated.

     For each ``(status, headers)`` sample the response is stored, a cached
     copy is expected when no new response is supplied, and a ``max-age=0``
     request is expected to skip ``process_request`` and resolve to the
     cached copy once a 304 comes back.
     """
     sampledata = [
         (200, {
             "Date": self.yesterday,
             "Expires": self.tomorrow
         }),
         (200, {
             "Date": self.yesterday,
             "Cache-Control": "max-age=86405"
         }),
         (200, {
             "Age": "299",
             "Cache-Control": "max-age=300"
         }),
         # Obey max-age if present over any others
         (
             200,
             {
                 "Date": self.today,
                 "Age": "86405",
                 "Cache-Control": "max-age=" + str(86400 * 3),
                 "Expires": self.yesterday,
                 "Last-Modified": self.yesterday,
             },
         ),
         # obey Expires if max-age is not present
         (
             200,
             {
                 "Date": self.yesterday,
                 "Age": "86400",
                 "Cache-Control": "public",
                 "Expires": self.tomorrow,
                 "Last-Modified": self.yesterday,
             },
         ),
         # Default missing Date header to right now
         (200, {
             "Expires": self.tomorrow
         }),
         # Firefox - Expires if age is greater than 10% of (Date - Last-Modified)
         (
             200,
             {
                 "Date": self.today,
                 "Last-Modified": self.yesterday,
                 # Floor division keeps Age a whole number of seconds
                 # ("8639"); true division would produce the string
                 # "8639.0" on Python 3.
                 "Age": str(86400 // 10 - 1),
             },
         ),
         # Firefox - Set one year maxage to permanent redirects missing expiration info
         (300, {}),
         (301, {}),
         (308, {}),
     ]
     with self._middleware() as mw:
         for idx, (status, headers) in enumerate(sampledata):
             req0 = Request("http://example-%d.com" % idx)
             res0 = Response(req0.url, status=status, headers=headers)
             # cache fresh response
             res1 = self._process_requestresponse(mw, req0, res0)
             self.assertEqualResponse(res1, res0)
             assert "cached" not in res1.flags
             # return fresh cached response without network interaction
             res2 = self._process_requestresponse(mw, req0, None)
             self.assertEqualResponse(res1, res2)
             assert "cached" in res2.flags
             # validate cached response if request max-age set as 0
             req1 = req0.replace(headers={"Cache-Control": "max-age=0"})
             res304 = res0.replace(status=304)
             assert mw.process_request(req1, self.spider) is None
             res3 = self._process_requestresponse(mw, req1, res304)
             self.assertEqualResponse(res1, res3)
             assert "cached" in res3.flags
    def test_response_cacheability(self):
        """Check which responses get stored, with and without HTTPCACHE_ALWAYS_STORE.

        Cases are ``(shouldcache, status, headers)`` tuples. They are run
        once against a default middleware using the listed expectation, and
        once against a middleware created with ``HTTPCACHE_ALWAYS_STORE=True``
        where caching is expected unless the response has ``no-store`` or is
        a 304.
        """
        yesterday, tomorrow = self.yesterday, self.tomorrow
        responses = [
            # 304 is not cacheable no matter what the server sends
            (False, 304, {}),
            (False, 304, {"Last-Modified": yesterday}),
            (False, 304, {"Expires": tomorrow}),
            (False, 304, {"Etag": "bar"}),
            (False, 304, {"Cache-Control": "max-age=3600"}),
            # Always obey no-store cache control
            (False, 200, {"Cache-Control": "no-store"}),
            (False, 200, {"Cache-Control": "no-store, max-age=300"}),  # invalid
            (False, 200, {"Cache-Control": "no-store", "Expires": tomorrow}),  # invalid
            # Ignore responses missing expiration and/or validation headers
            (False, 200, {}),
            (False, 302, {}),
            (False, 307, {}),
            (False, 404, {}),
            # Cache responses with expiration and/or validation headers
            (True, 200, {"Last-Modified": yesterday}),
            (True, 203, {"Last-Modified": yesterday}),
            (True, 300, {"Last-Modified": yesterday}),
            (True, 301, {"Last-Modified": yesterday}),
            (True, 308, {"Last-Modified": yesterday}),
            (True, 401, {"Last-Modified": yesterday}),
            (True, 404, {"Cache-Control": "public, max-age=600"}),
            (True, 302, {"Expires": tomorrow}),
            (True, 200, {"Etag": "foo"}),
        ]

        def check_case(mw, url, shouldcache, status, headers):
            # Process the response, replay the request (with a 304 when the
            # response is expected to be cached), then assert on storage
            # content and on the 'cached' flag of the second result.
            request = Request(url)
            original = Response(request.url, status=status, headers=headers)
            first = self._process_requestresponse(mw, request, original)
            not_modified = original.replace(status=304)
            second = self._process_requestresponse(
                mw, request, not_modified if shouldcache else original)
            self.assertEqualResponse(first, original)
            self.assertEqualResponse(second, original)
            stored = mw.storage.retrieve_response(self.spider, request)
            if not shouldcache:
                self.assertFalse(stored)
                assert "cached" not in second.flags
            else:
                self.assertEqualResponse(stored, first)
                assert "cached" in second.flags and second.status != 304

        with self._middleware() as mw:
            for idx, (shouldcache, status, headers) in enumerate(responses):
                check_case(mw, "http://example-%d.com" % idx,
                           shouldcache, status, headers)

        # cache unconditionally unless response contains no-store or is a 304
        with self._middleware(HTTPCACHE_ALWAYS_STORE=True) as mw:
            for idx, (_, status, headers) in enumerate(responses):
                cache_control = headers.get("Cache-Control", "")
                shouldcache = status != 304 and "no-store" not in cache_control
                check_case(mw, "http://example2-%d.com" % idx,
                           shouldcache, status, headers)