def get_sitemap_body(response):
    """Extract the sitemap payload from ``response``.

    The checks run in order: an ``XmlResponse`` instance, a response that
    ``is_gzipped`` reports as gzipped, a URL ending in ``.xml``, and a URL
    ending in ``.xml.gz``. Gzipped payloads are passed through ``gunzip``
    before being returned.

    Returns the (possibly decompressed) body, or ``None`` when none of the
    checks match, i.e. the response is not treated as a sitemap.
    """
    if isinstance(response, XmlResponse):
        return response.body
    if is_gzipped(response):
        return gunzip(response.body)

    # Neither the response class nor the gzip check matched: fall back to
    # guessing from the URL suffix.
    url = response.url
    if url.endswith('.xml'):
        return response.body
    if url.endswith('.xml.gz'):
        return gunzip(response.body)
    return None
def _decode(self, body, encoding, max_length=0): if encoding == 'gzip' or encoding == 'x-gzip': body = gunzip(body, max_length) elif encoding == 'deflate': try: if max_length: dobj = zlib.decompressobj() body = dobj.decompress(body, max_length) if dobj.unconsumed_tail: raise DecompressSizeError( 'Response exceeded %s bytes' % max_length) else: body = zlib.decompress(body) except zlib.error: # ugly hack to work with raw deflate content that may # be sent by microsoft servers. For more information, see: # http://carsten.codimi.de/gzip.yaws/ # http://www.port80software.com/200ok/archive/2005/10/31/868.aspx # http://www.gzip.org/zlib/zlib_faq.html#faq38 if max_length: dobj = zlib.decompressobj(-15) body = dobj.decompress(body, max_length) if dobj.unconsumed_tail: raise DecompressSizeError( 'Response exceeded %s bytes' % max_length) else: body = zlib.decompress(body, -15) return body
def test_max_length(self):
    """gunzip honours ``max_length``: 9950 succeeds, 9949 raises."""
    sample_path = join(SAMPLE_DIR, 'feed-sample1.xml.gz')
    with open(sample_path, 'rb') as sample:
        compressed = sample.read()

    # A limit of 9950 yields exactly 9950 bytes of output...
    unpacked = gunzip(compressed, 9950)
    self.assertEqual(len(unpacked), 9950)
    # ...while a limit one byte smaller must be rejected.
    self.assertRaises(DecompressSizeError, gunzip, compressed, 9949)
def test_gunzip_truncated_short(self):
    """gunzip of the 'truncated-crc-error-short.gz' sample ends with </html>."""
    sample_path = join(SAMPLE_DIR, 'truncated-crc-error-short.gz')
    with open(sample_path, 'rb') as sample:
        compressed = sample.read()

    text = gunzip(compressed)
    has_closing_tag = text.endswith('</html>')
    self.assertTrue(has_closing_tag)
def test_gunzip_basic(self):
    """gunzip of the 'feed-sample1.xml.gz' sample yields 9950 bytes."""
    sample_path = join(SAMPLE_DIR, 'feed-sample1.xml.gz')
    with open(sample_path, 'rb') as sample:
        compressed = sample.read()

    unpacked = gunzip(compressed)
    self.assertEqual(len(unpacked), 9950)