def _get_sitemap_body(self, response):
    """Return the sitemap body contained in the given response,
    or None if the response is not a sitemap.
    """
    if isinstance(response, XmlResponse):
        return response.body
    elif is_gzipped(response):
        return gunzip(response.body)
    elif response.url.endswith('.xml'):
        return response.body
    elif response.url.endswith('.xml.gz'):
        return gunzip(response.body)

def _get_sitemap_body(self, response):
    """Return the sitemap body contained in the given response,
    or None if the response is not a sitemap.
    """
    try:
        if isinstance(response, XmlResponse):
            return response.body
        elif is_gzipped(response):
            return gunzip(response.body)
        elif response.url.endswith('.xml'):
            return response.body
        elif response.url.endswith('.xml.gz'):
            return gunzip(response.body)
    except Exception as e:
        self.log("Error gunzipping %s: %s" % (response.url, e))

def test_gunzip_illegal_eof(self):
    with open(join(SAMPLEDIR, 'unexpected-eof.gz'), 'rb') as f:
        text = html_to_unicode('charset=cp1252', gunzip(f.read()))[1]
        with open(join(SAMPLEDIR, 'unexpected-eof-output.txt'), 'rb') as o:
            expected_text = o.read().decode("utf-8")
            self.assertEqual(len(text), len(expected_text))
            self.assertEqual(text, expected_text)

def parse(self, response):
    body = gunzip(response.body)
    s = Sitemap(body)
    for sitelink in s:
        url = sitelink['loc']
        yield scrapy.Request(url, callback=self.parse_details)

def test_gunzip_illegal_eof(self):
    with open(join(SAMPLEDIR, "unexpected-eof.gz"), "rb") as f:
        text = html_to_unicode("charset=cp1252", gunzip(f.read()))[1]
        with open(join(SAMPLEDIR, "unexpected-eof-output.txt"), "rb") as o:
            expected_text = o.read().decode("utf-8")
            self.assertEqual(len(text), len(expected_text))
            self.assertEqual(text, expected_text)

def _decode(self, body, encoding):
    if encoding == b'gzip' or encoding == b'x-gzip':
        body = gunzip(body)

    if encoding == b'deflate':
        try:
            body = zlib.decompress(body)
        except zlib.error:
            # ugly hack to work with raw deflate content that may
            # be sent by microsoft servers. For more information, see:
            # http://carsten.codimi.de/gzip.yaws/
            # http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
            # http://www.gzip.org/zlib/zlib_faq.html#faq38
            body = zlib.decompress(body, -15)

    if encoding == b'br':
        if b'br' in ACCEPTED_ENCODINGS:
            body = brotli.decompress(body)
        else:
            raise ImportError('brotlipy is not installed')

    if encoding == b'zstd' and b'zstd' in ACCEPTED_ENCODINGS:
        # Using its streaming API since its simple API could handle only cases
        # where there is content size data embedded in the frame
        reader = zstandard.ZstdDecompressor().stream_reader(io.BytesIO(body))
        body = reader.read()
    return body

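# A minimal round-trip sketch (standalone, stdlib only -- an assumption, not
# part of the middleware above) showing the payload shapes the gzip and
# deflate branches of _decode() have to handle.
import gzip
import zlib

plain = b"<html><body>hello</body></html>"

# "Content-Encoding: gzip": a full gzip member, header and CRC included;
# this is what gunzip() unpacks
gz_body = gzip.compress(plain)
assert gzip.decompress(gz_body) == plain

# "Content-Encoding: deflate" per the RFC: a zlib-wrapped stream
assert zlib.decompress(zlib.compress(plain)) == plain

# raw deflate stream without the zlib header, as some servers send it;
# this is the case the zlib.decompress(body, -15) fallback covers
co = zlib.compressobj(wbits=-15)
raw_deflate = co.compress(plain) + co.flush()
assert zlib.decompress(raw_deflate, -15) == plain
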
def _get_sitemap_body(self, response):
    """Return the sitemap body contained in the given response,
    or None if the response is not a sitemap.
    """
    if isinstance(response, XmlResponse):
        return response.body
    elif gzip_magic_number(response):
        return gunzip(response.body)
    # actual gzipped sitemap files are decompressed above;
    # if we are here (response body is not gzipped)
    # and have a response for .xml.gz,
    # it usually means that it was already gunzipped
    # by HttpCompression middleware,
    # the HTTP response being sent with "Content-Encoding: gzip"
    # without actually being a .xml.gz file in the first place,
    # merely XML gzip-compressed on the fly;
    # in other words, here we have plain XML
    elif response.url.endswith('.xml') or response.url.endswith('.xml.gz'):
        return response.body
    elif response.url.endswith('sitemap.txt'):
        # wrap a plain-text sitemap (one URL per line) in a minimal
        # <urlset> document so it can be parsed like an XML sitemap
        body = ('<?xml version="1.0" encoding="UTF-8"?>'
                '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')
        for line in response.body.decode("utf-8").splitlines():
            if line:
                body += '<url><loc>' + line + '</loc></url>'
        body += '</urlset>'
        return body.encode()

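# Hedged alternative sketch for the sitemap.txt branch above: building the
# <urlset> with xml.etree escapes URLs containing '&' or '<' that would
# otherwise produce invalid XML. The helper name is illustrative, not a
# Scrapy API.
import xml.etree.ElementTree as ET

def text_sitemap_to_xml(body):
    urlset = ET.Element(
        'urlset', xmlns='http://www.sitemaps.org/schemas/sitemap/0.9')
    for line in body.decode('utf-8').splitlines():
        line = line.strip()
        if line:
            ET.SubElement(ET.SubElement(urlset, 'url'), 'loc').text = line
    # with encoding='utf-8', tostring() returns bytes with an XML declaration
    return ET.tostring(urlset, encoding='utf-8')
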
def test_gunzip_truncated_short(self):
    with open(join(SAMPLEDIR, 'truncated-crc-error-short.gz'), 'rb') as f:
        r1 = Response("http://www.example.com", body=f.read())
    self.assertTrue(gzip_magic_number(r1))
    r2 = Response("http://www.example.com", body=gunzip(r1.body))
    assert r2.body.endswith(b'</html>')
    self.assertFalse(gzip_magic_number(r2))

def test_gunzip_basic(self):
    with open(join(SAMPLEDIR, 'feed-sample1.xml.gz'), 'rb') as f:
        r1 = Response("http://www.example.com", body=f.read())
    self.assertTrue(gzip_magic_number(r1))
    r2 = Response("http://www.example.com", body=gunzip(r1.body))
    self.assertFalse(gzip_magic_number(r2))
    self.assertEqual(len(r2.body), 9950)

def parse(self, response):
    # Selector(text=...) expects str, so decode the gunzipped bytes
    body = gunzip(response.body).decode('utf-8')
    body = scrapy.Selector(text=body)
    body.remove_namespaces()
    urls = body.xpath('//url/loc/text()').extract()
    for url in urls:
        if url.count('/') >= 6:
            yield scrapy.Request(url=url, callback=self.parse_store)

def should_cache_response(self, response, request):
    # all gzipped bodies start with these magic bytes
    gzip_line_start = b'\037\213'
    body = response.body
    if body.startswith(gzip_line_start):
        body = gz.gunzip(body)
    if b'.images-amazon.com/captcha/' in body:
        return False
    return super(CustomCachePolicy, self).should_cache_response(response, request)

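# Sketch of the magic-number test used above, on raw bytes: every gzip
# stream starts with 0x1f 0x8b ('\037\213' in octal escapes). Scrapy also
# ships scrapy.utils.gz.gzip_magic_number(response) for the same check.
import gzip

GZIP_MAGIC = b'\x1f\x8b'

def looks_gzipped(body):
    return body[:2] == GZIP_MAGIC

assert looks_gzipped(gzip.compress(b'payload'))
assert not looks_gzipped(b'payload')
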
def parse(self, response):
    # Selector(text=...) expects str, so decode the gunzipped bytes
    body = gunzip(response.body).decode('utf-8')
    body = scrapy.Selector(text=body)
    body.remove_namespaces()
    urls = body.xpath('//url/loc/text()').extract()
    store_url = re.compile(
        r'https://www.academy.com/shop/storelocator/.+?/.+?/store-\d+')
    for path in urls:
        if re.search(store_url, path):
            yield scrapy.Request(path.strip(), callback=self.parse_store)

def is_cached_response_fresh(self, response, request):
    if super().is_cached_response_fresh(response, request):
        body = gunzip(response.body)
        h = HtmlResponse(url=response.url, body=body)
        s = Selector(h)
        return len(
            s.xpath("//table[contains(@class, 'listTable')]"
                    "//tr[not(@class)][not(@id)]")) == 50
    else:
        return False

def parse(self, response):
    sitemap = gunzip(response.body)
    regex = re.compile(r'https://locator.chase.com/\w+/\S+(?=</loc>)')
    city_urls = re.findall(regex, str(sitemap))
    for path in city_urls:
        yield scrapy.Request(
            path.strip(),
            callback=self.parse_store,
        )

def is_cached_response_fresh(self, response, request):
    if super().is_cached_response_fresh(response, request):
        try:
            body = gunzip(response.body)
        except OSError:
            body = response.body
        h = HtmlResponse(url=response.url, body=body)
        s = Selector(h)
        company_name = s.css("h2 > span:first-child::text").extract()
        return company_name and company_name[0].strip()
    else:
        return False

def parse(self, response):
    sitemap = gunzip(response.body)
    regex = re.compile(r'http://eatpdq.qatserver.com/locations/\S+(?=</loc>)')
    city_urls = re.findall(regex, str(sitemap))
    for path in city_urls:
        if path.strip() == "http://eatpdq.qatserver.com/locations/find-a-location":
            continue
        yield scrapy.Request(
            path.strip(),
            callback=self.parse_store,
        )

def _decode(self, body, encoding):
    if encoding == 'gzip' or encoding == 'x-gzip':
        body = gunzip(body)

    if encoding == 'deflate':
        try:
            body = zlib.decompress(body)
        except zlib.error:
            # ugly hack to work with raw deflate content that may
            # be sent by microsoft servers. For more information, see:
            # http://carsten.codimi.de/gzip.yaws/
            # http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
            # http://www.gzip.org/zlib/zlib_faq.html#faq38
            body = zlib.decompress(body, -15)
    return body

def _get_sitemap_body(self, response):
    """Return the sitemap body contained in the given response,
    or None if the response is not a sitemap.
    """
    if isinstance(response, XmlResponse):
        return response.body
    elif gzip_magic_number(response):
        return gunzip(response.body)
    elif response.url.endswith('.xml') or response.url.endswith('.xml.gz'):
        return response.body
    # last resort: accept the body if it at least parses as XML
    try:
        ET.fromstring(response.body)
        return response.body
    except ET.ParseError:
        pass

def test_process_response_gzipped_gzip_file(self):
    """Test that a gzip Content-Encoded .gz file is gunzipped
    only once by the middleware, leaving gunzipping of the file
    to upper layers.
    """
    headers = {
        'Content-Type': 'application/gzip',
        'Content-Encoding': 'gzip',
    }
    # build a gzipped file (here, a sitemap)
    f = BytesIO()
    plainbody = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
    <url>
        <loc>http://www.example.com/</loc>
        <lastmod>2009-08-16</lastmod>
        <changefreq>daily</changefreq>
        <priority>1</priority>
    </url>
    <url>
        <loc>http://www.example.com/Special-Offers.html</loc>
        <lastmod>2009-08-16</lastmod>
        <changefreq>weekly</changefreq>
        <priority>0.8</priority>
    </url>
</urlset>"""
    gz_file = GzipFile(fileobj=f, mode='wb')
    gz_file.write(plainbody)
    gz_file.close()

    # build a gzipped response body containing this gzipped file
    r = BytesIO()
    gz_resp = GzipFile(fileobj=r, mode='wb')
    gz_resp.write(f.getvalue())
    gz_resp.close()

    response = Response("http://www.example.com/", headers=headers, body=r.getvalue())
    request = Request("http://www.example.com/")

    newresponse = self.mw.process_response(request, response, self.spider)
    self.assertEqual(gunzip(newresponse.body), plainbody)
    self.assertStatsEqual('httpcompression/response_count', 1)
    self.assertStatsEqual('httpcompression/response_bytes', 230)

def _get_sitemap_body(self, response):
    """Return the sitemap body contained in the given response,
    or None if the response is not a sitemap.
    """
    if isinstance(response, XmlResponse):
        return response.body
    elif gzip_magic_number(response):
        return gunzip(response.body)
    # actual gzipped sitemap files are decompressed above;
    # if we are here (response body is not gzipped)
    # and have a response for .xml.gz,
    # it usually means that it was already gunzipped
    # by HttpCompression middleware,
    # the HTTP response being sent with "Content-Encoding: gzip"
    # without actually being a .xml.gz file in the first place,
    # merely XML gzip-compressed on the fly;
    # in other words, here we have plain XML
    elif response.url.endswith('.xml') or response.url.endswith('.xml.gz'):
        return response.body

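# Hedged usage sketch of the branches above. SitemapSpider is real Scrapy;
# instantiating it bare like this (and calling a private method) is only
# for illustration.
import gzip
from scrapy.http import Response, XmlResponse
from scrapy.spiders import SitemapSpider

spider = SitemapSpider(name='demo')
xml = b'<?xml version="1.0" encoding="UTF-8"?><urlset/>'

# parsed as XML by the downloader -> body returned as-is
assert spider._get_sitemap_body(
    XmlResponse('https://example.com/sitemap.xml', body=xml)) == xml

# still gzip-compressed (magic number present) -> gunzipped here
gz = Response('https://example.com/sitemap.xml.gz', body=gzip.compress(xml))
assert spider._get_sitemap_body(gz) == xml
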
def log(self, response, request, spider):
    try:
        body = gunzip(response.body)
    except IOError:
        body = response.body
    pp = pprint.PrettyPrinter(indent=2)
    msg_template = self._get_template()
    data = [
        request.method,
        request.url,
        pp.pformat(request.headers),
        pp.pformat(request.body),
        response.status,
        pp.pformat(response.headers),
        pp.pformat(body[:1024]),
    ]
    self.logger.debug(msg_template.format(*data))

def test_process_response_gzipped_gzip_file(self):
    """Test that a gzip Content-Encoded .gz file is gunzipped
    only once by the middleware, leaving gunzipping of the file
    to upper layers.
    """
    headers = {
        'Content-Type': 'application/gzip',
        'Content-Encoding': 'gzip',
    }
    # build a gzipped file (here, a sitemap)
    f = BytesIO()
    plainbody = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
    <url>
        <loc>http://www.example.com/</loc>
        <lastmod>2009-08-16</lastmod>
        <changefreq>daily</changefreq>
        <priority>1</priority>
    </url>
    <url>
        <loc>http://www.example.com/Special-Offers.html</loc>
        <lastmod>2009-08-16</lastmod>
        <changefreq>weekly</changefreq>
        <priority>0.8</priority>
    </url>
</urlset>"""
    gz_file = GzipFile(fileobj=f, mode='wb')
    gz_file.write(plainbody)
    gz_file.close()

    # build a gzipped response body containing this gzipped file
    r = BytesIO()
    gz_resp = GzipFile(fileobj=r, mode='wb')
    gz_resp.write(f.getvalue())
    gz_resp.close()

    response = Response("http://www.example.com/", headers=headers, body=r.getvalue())
    request = Request("http://www.example.com/")

    newresponse = self.mw.process_response(request, response, self.spider)
    self.assertEqual(gunzip(newresponse.body), plainbody)

def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.body):
            yield Request(url, callback=self._parse_sitemap)
    else:
        if isinstance(response, XmlResponse):
            body = response.body
        elif is_gzipped(response):
            body = gunzip(response.body)
        else:
            log.msg("Ignoring non-XML sitemap: %s" % response, log.WARNING)
            return

        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(s):
                for r, c in self._cbs:
                    if r.search(loc):
                        yield Request(loc, callback=c)
                        break

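# Minimal sketch of the iterloc() helper the method above relies on,
# consistent with how it is used here; later Scrapy versions extend it to
# also yield alternate-language links. This is an assumption-level
# reconstruction, not the verbatim Scrapy source.
def iterloc(it):
    for d in it:
        yield d['loc']
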
def test_gunzip_truncated(self):
    with open(join(SAMPLEDIR, 'truncated-crc-error.gz'), 'rb') as f:
        text = gunzip(f.read())
    # the sample is truncated, so the closing '>' is missing by design
    assert text.endswith(b'</html')

def cached_page(site, url_path, spider_name='toc'):
    handle_client_ip()
    site = base64.standard_b64decode(site.encode()).decode()
    url_path = base64.standard_b64decode(url_path.encode()).decode()
    url_site = SiteSchemas.get(site).get(SSK.URL)
    url = url_site + url_path
    origin_encoding = SiteSchemas.get(site).get(SSK.ENCODING, 'utf-8')
    aid = request.args.get('aid', default=None, type=int)

    from moltspider.consts import Schemas
    from moltspider.parser import iter_items
    from scrapy.utils.misc import load_object
    from scrapy.utils.project import get_project_settings
    from scrapy.http.request import Request
    from scrapy.http.response.html import HtmlResponse
    from scrapy.utils.gz import gunzip
    from scrapy.downloadermiddlewares.httpcompression import ACCEPTED_ENCODINGS
    try:
        import brotli
    except ImportError:
        pass
    import zlib

    settings = get_project_settings()
    storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
    body = None
    spider_req = Request(url)

    if spider_name == Spiders.META:
        from moltspider.spiders.meta import MetaSpider
        spider = MetaSpider()
        schema_name = Schemas.META_PAGE
    elif spider_name == Spiders.TOC:
        from moltspider.spiders.toc import TocSpider
        spider = TocSpider
        schema_name = Schemas.TOC_PAGE
    else:
        raise Exception('No support for spider "%s"\'s cache page' % spider_name)

    cachedresponse = storage.retrieve_response(spider, spider_req)
    if cachedresponse:
        content_encoding = cachedresponse.headers.getlist('Content-Encoding')
        if content_encoding:
            encoding = content_encoding.pop()
            if encoding == b'gzip' or encoding == b'x-gzip':
                body = gunzip(cachedresponse.body)
            if encoding == b'deflate':
                try:
                    body = zlib.decompress(cachedresponse.body)
                except zlib.error:
                    # ugly hack to work with raw deflate content that may
                    # be sent by microsoft servers. For more information, see:
                    # http://carsten.codimi.de/gzip.yaws/
                    # http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
                    # http://www.gzip.org/zlib/zlib_faq.html#faq38
                    body = zlib.decompress(cachedresponse.body, -15)
            if encoding == b'br' and b'br' in ACCEPTED_ENCODINGS:
                body = brotli.decompress(cachedresponse.body)

    if body:
        if spider_name == Spiders.TOC and aid:
            sb = []
            colspan = 4
            i = 0
            scrapy_resp = HtmlResponse(url)
            scrapy_resp = scrapy_resp.replace(body=body, encoding=origin_encoding)
            sb.append('<table width="1000px" align="center"><tr>')
            for item in iter_items(spider, scrapy_resp, [site, ], schema_name):
                if i % colspan == 0:
                    sb.append('</tr><tr>')
                item['_'] = url_site
                sb.append('<td><a href="%(_)s%(url)s">%(name)s</a></td>' % item)
                del item['_']
                i += 1
            sb.append('</tr></table>')
            body = '\n'.join(sb)
            body = render_template_string(template_page, content=body)
        else:
            body = body.decode(encoding=origin_encoding)
    else:
        body = '%s (%s) not found in cache.' % (url, origin_encoding)

    resp = make_response(body)
    resp.headers['Content-Type'] = 'text/html; charset=utf-8'
    return resp

def test_gunzip_basic(self):
    with open(join(SAMPLEDIR, 'feed-sample1.xml.gz'), 'rb') as f:
        text = gunzip(f.read())
    self.assertEqual(len(text), 9950)

def _decompress(self, body):
    # keep gunzipping until the payload no longer looks like gzip data,
    # to unwrap bodies that were compressed more than once
    while True:
        try:
            body = gunzip(body)
        except IOError:
            return body

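# Usage sketch for the loop above (assumed context: doubly-compressed
# payloads, e.g. a .xml.gz file served with Content-Encoding: gzip and
# cached without decoding):
import gzip

once = gzip.compress(b'<urlset/>')
twice = gzip.compress(once)
# self._decompress(twice) would gunzip twice, then return b'<urlset/>'
# when gunzip() raises on the plain-XML remainder.
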
def test_gunzip_truncated_short(self):
    with open(join(SAMPLEDIR, 'truncated-crc-error-short.gz'), 'rb') as f:
        text = gunzip(f.read())
    assert text.endswith(b'</html>')

def test_gunzip_truncated_short(self):
    with open(join(SAMPLEDIR, "truncated-crc-error-short.gz"), "rb") as f:
        text = gunzip(f.read())
    assert text.endswith(b"</html>")