def test_is_gzipped_case_insensitive(self):
    """Gzip Content-Type values are recognised regardless of letter case.

    The second value also carries whitespace and a ``charset`` parameter
    after the media type.
    """
    content_types = (
        "Application/X-Gzip",
        "application/X-GZIP ; charset=utf-8",
    )
    for content_type in content_types:
        headers = Headers({"Content-Type": content_type})
        response = Response("http://www.example.com", headers=headers)
        self.assertTrue(is_gzipped(response))
def process_response(self, request, response, spider):
    """Decode the body of a response that declares a Content-Encoding.

    The response is returned untouched when the request method is HEAD,
    when it is not a ``Response`` instance, when it has no
    Content-Encoding header, or when ``is_gzipped`` reports it as gzipped
    content. Responses whose URL ends in ``.xml.gz`` first get their
    Content-Encoding forced to ``gzip`` and their Content-Type forced to
    ``application/xml``.

    Otherwise the last listed encoding is popped, the body is passed
    through ``self._decode`` and a replacement response (of the class
    chosen by ``responsetypes.from_args``) is returned.
    """
    if request.method == 'HEAD' or not isinstance(response, Response):
        return response

    if response.url.endswith('.xml.gz'):
        response.headers.setlist('Content-Encoding', [b'gzip'])
        response.headers.setlist('Content-Type', [b'application/xml'])

    pending_encodings = response.headers.getlist('Content-Encoding')
    if not pending_encodings or is_gzipped(response):
        return response

    last_encoding = pending_encodings.pop()
    body = self._decode(response.body, last_encoding.lower())
    respcls = responsetypes.from_args(headers=response.headers,
                                      url=response.url)
    replace_kwargs = {'cls': respcls, 'body': body}
    if issubclass(respcls, TextResponse):
        # force recalculating the encoding until we make sure the
        # responsetypes guessing is reliable
        replace_kwargs['encoding'] = None
    response = response.replace(**replace_kwargs)

    # Drop the header once the popped encoding was the only one listed.
    if not pending_encodings:
        del response.headers['Content-Encoding']
    return response
def _get_sitemap_body(self, response):
    """Extract the sitemap payload from ``response``.

    Returns the raw body for ``XmlResponse`` instances and for URLs ending
    in ``.xml``; returns the gunzipped body when ``is_gzipped`` accepts the
    response or the URL ends in ``.xml.gz``. Returns None when none of
    these checks match, i.e. the response is not treated as a sitemap.
    """
    if isinstance(response, XmlResponse):
        return response.body
    if is_gzipped(response):
        return gunzip(response.body)
    if response.url.endswith('.xml'):
        return response.body
    if response.url.endswith('.xml.gz'):
        return gunzip(response.body)
    return None
def _get_sitemap_body(self, response):
    """Return the sitemap body contained in the given response, or None if
    the response is not a sitemap.

    ``XmlResponse`` instances and URLs ending in ``.xml`` yield the raw
    body; responses accepted by ``is_gzipped`` and URLs ending in
    ``.xml.gz`` yield the gunzipped body.

    Any exception raised while inspecting or decompressing the response is
    logged through ``self.log`` and None is returned instead of
    propagating the error.
    """
    try:
        if isinstance(response, XmlResponse):
            return response.body
        elif is_gzipped(response):
            return gunzip(response.body)
        elif response.url.endswith('.xml'):
            return response.body
        elif response.url.endswith('.xml.gz'):
            return gunzip(response.body)
    # ``except X as e`` is valid on Python 2.6+ and 3.x; the former
    # ``except X, e`` spelling is a SyntaxError on Python 3.
    except Exception as e:
        # Best-effort: log the failure and fall through to return None.
        self.log("Error %s ungzip %s" % (response.url, e))
    return None
def process_response(self, request, response, spider):
    """Decode the body of a response that declares a Content-Encoding.

    The response is returned untouched when the request method is HEAD,
    when it is not a ``Response`` instance, when it has no
    Content-Encoding header, or when ``is_gzipped`` reports it as gzipped
    content.

    Otherwise the last listed encoding is popped, the body is passed
    through ``self._decode`` and a replacement response (of the class
    chosen by ``responsetypes.from_args``) is returned.
    """
    # A HEAD response may advertise Content-Encoding while carrying no
    # body, so there is nothing to decode.
    if request.method == 'HEAD':
        return response

    if isinstance(response, Response):
        content_encoding = response.headers.getlist('Content-Encoding')
        if content_encoding and not is_gzipped(response):
            encoding = content_encoding.pop()
            decoded_body = self._decode(response.body, encoding.lower())
            respcls = responsetypes.from_args(headers=response.headers,
                                              url=response.url)
            kwargs = dict(cls=respcls, body=decoded_body)
            if issubclass(respcls, TextResponse):
                # force recalculating the encoding until we make sure the
                # responsetypes guessing is reliable
                kwargs['encoding'] = None
            response = response.replace(**kwargs)
            # Drop the header once the popped encoding was the only one.
            if not content_encoding:
                del response.headers['Content-Encoding']

    return response
def test_is_x_gzipped_right(self):
    """A plain ``application/x-gzip`` Content-Type counts as gzipped."""
    headers = Headers({"Content-Type": "application/x-gzip"})
    response = Response("http://www.example.com", headers=headers)
    self.assertTrue(is_gzipped(response))
def test_is_gzipped_empty(self):
    """A response built without any headers is not reported as gzipped."""
    response = Response("http://www.example.com")
    self.assertFalse(is_gzipped(response))
def test_is_gzipped_with_charset(self):
    """A ``charset`` parameter after the media type does not prevent detection."""
    headers = Headers({"Content-Type": "application/x-gzip;charset=utf-8"})
    response = Response("http://www.example.com", headers=headers)
    self.assertTrue(is_gzipped(response))
def test_is_gzipped_wrong(self):
    """An unrelated Content-Type (JavaScript) is not reported as gzipped."""
    headers = Headers({"Content-Type": "application/javascript"})
    response = Response("http://www.example.com", headers=headers)
    self.assertFalse(is_gzipped(response))
def test_is_gzipped_not_quite(self):
    """A media type that merely starts with ``gzip`` is not reported as gzipped."""
    headers = Headers({"Content-Type": "application/gzippppp"})
    response = Response("http://www.example.com", headers=headers)
    self.assertFalse(is_gzipped(response))