def test_gunzip_truncated_short(self):
    """Check gunzip() on the 'truncated-crc-error-short.gz' sample.

    The raw sample must be detected as gzip data, and the gunzipped
    body must end with ``</html>`` and no longer be detected as gzip.
    """
    with open(join(SAMPLEDIR, 'truncated-crc-error-short.gz'), 'rb') as f:
        r1 = Response("http://www.example.com", body=f.read())
    self.assertTrue(gzip_magic_number(r1))

    r2 = Response("http://www.example.com", body=gunzip(r1.body))
    # Use a unittest assertion rather than a bare ``assert``: bare asserts
    # are stripped under ``python -O`` and the rest of the test already
    # relies on the TestCase assertion helpers.
    self.assertTrue(r2.body.endswith(b'</html>'))
    self.assertFalse(gzip_magic_number(r2))
def test_gunzip_basic(self):
    """Check gunzip() on the 'feed-sample1.xml.gz' sample.

    The raw sample must be detected as gzip data; the gunzipped body must
    not be, and must be exactly 9950 bytes long.
    """
    sample_path = join(SAMPLEDIR, 'feed-sample1.xml.gz')
    with open(sample_path, 'rb') as sample:
        compressed = Response("http://www.example.com", body=sample.read())
    self.assertTrue(gzip_magic_number(compressed))

    plain_body = gunzip(compressed.body)
    decompressed = Response("http://www.example.com", body=plain_body)
    self.assertFalse(gzip_magic_number(decompressed))
    self.assertEqual(len(decompressed.body), 9950)
def _get_sitemap_body(self, response):
    """Return the sitemap body contained in the given response,
    or None if the response is not a sitemap.

    Handled cases, in order:

    * ``XmlResponse`` objects: the body is returned as is.
    * gzip-compressed bodies: the gunzipped body is returned.
    * URLs ending in ``.xml`` or ``.xml.gz``: the body is returned as is.
    * URLs ending in ``sitemap.txt``: the body is decoded as UTF-8, read as
      one URL per line and converted into an XML ``<urlset>`` document,
      returned as UTF-8 encoded bytes.

    Raises ``UnicodeDecodeError`` if a ``sitemap.txt`` body is not valid
    UTF-8.
    """
    if isinstance(response, XmlResponse):
        return response.body
    if gzip_magic_number(response):
        return gunzip(response.body)

    # Actual gzipped sitemap files are decompressed above;
    # if we are here (response body is not gzipped)
    # and have a response for .xml.gz,
    # it usually means that it was already gunzipped
    # by HttpCompression middleware,
    # the HTTP response being sent with "Content-Encoding: gzip"
    # without actually being a .xml.gz file in the first place,
    # merely XML gzip-compressed on the fly;
    # in other words, here, we have plain XML.
    if response.url.endswith(('.xml', '.xml.gz')):
        return response.body

    if response.url.endswith('sitemap.txt'):
        # Local import so this block does not depend on a file-level import.
        from xml.sax.saxutils import escape

        text = response.body.decode("utf-8")
        # splitlines() copes with "\r\n", "\n" and "\r" line endings alike;
        # blank lines are skipped so they do not yield empty <loc> entries.
        urls = [line.strip() for line in text.splitlines() if line.strip()]
        # URLs frequently contain "&", which must be escaped in XML text.
        entries = ''.join(
            '<url><loc>' + escape(url) + '</loc></url>' for url in urls
        )
        body = (
            '<?xml version="1.0" encoding="UTF-8"?>'
            '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
            + entries
            + '</urlset>'
        )
        return body.encode("utf-8")

    return None
def _get_sitemap_body(self, response):
    """Return the sitemap body contained in the given response,
    or None if the response is not a sitemap.

    Handled cases, in order:

    * ``XmlResponse`` objects: the body is returned as is.
    * gzip-compressed bodies: the gunzipped body is returned.
    * URLs ending in ``.xml`` or ``.xml.gz``: the body is returned as is.
    * Anything else: the body is returned only if it parses as XML.
    """
    if isinstance(response, XmlResponse):
        return response.body
    if gzip_magic_number(response):
        return gunzip(response.body)
    if response.url.endswith(('.xml', '.xml.gz')):
        return response.body

    # Last resort: sniff the content. Only a parse failure means "not a
    # sitemap"; a bare ``except`` would also hide KeyboardInterrupt,
    # SystemExit and genuine programming errors.
    try:
        ET.fromstring(response.body)
    except ET.ParseError:
        return None
    return response.body
def _get_sitemap_body(self, response):
    """Extract the sitemap payload from ``response``.

    Returns the response body (gunzipped when it is gzip data), or None
    when the response is not recognised as a sitemap.
    """
    if isinstance(response, XmlResponse):
        return response.body

    if gzip_magic_number(response):
        return gunzip(response.body)

    # Real gzipped sitemap files were handled by the branch above. Reaching
    # this point with a ".xml.gz" URL usually means the HttpCompression
    # middleware already gunzipped the body: the server sent plain XML with
    # "Content-Encoding: gzip" (compressed on the fly) rather than an actual
    # .xml.gz file, so what we hold here is plain XML.
    looks_like_xml_url = response.url.endswith(('.xml', '.xml.gz'))
    if looks_like_xml_url:
        return response.body

    return None
def test_is_gzipped_empty(self):
    """A response created without a body must not be detected as gzip."""
    bodyless = Response("http://www.example.com")
    self.assertFalse(gzip_magic_number(bodyless))