Exemplo n.º 1
0
Arquivo: sitemap.py Projeto: DT021/wau
 def _get_sitemap_body(self, response):
     """Extract the sitemap payload carried by ``response``.

     The body is returned unchanged for an ``XmlResponse`` or a URL ending
     in ``.xml``; it is gunzipped when ``is_gzipped`` accepts the response
     or the URL ends in ``.xml.gz``.  Any other response yields ``None``.
     """
     if isinstance(response, XmlResponse):
         return response.body
     if is_gzipped(response):
         return gunzip(response.body)

     # Fall back on the URL suffix when the response type gave no answer.
     url = response.url
     if url.endswith('.xml'):
         return response.body
     if url.endswith('.xml.gz'):
         return gunzip(response.body)
     return None
Exemplo n.º 2
0
	def _get_sitemap_body(self, response):
		"""Return the sitemap bytes found in ``response``, or ``None``.

		Checks run in order: ``XmlResponse`` instance (body as is),
		``is_gzipped`` (gunzipped body), URL ending in ``.xml`` (body as is),
		URL ending in ``.xml.gz`` (gunzipped body).
		"""
		if isinstance(response, XmlResponse):
			return response.body
		if is_gzipped(response):
			return gunzip(response.body)

		location = response.url
		if location.endswith('.xml'):
			return response.body
		if location.endswith('.xml.gz'):
			return gunzip(response.body)
		# Not recognised as a sitemap.
		return None
Exemplo n.º 3
0
 def _get_sitemap_body(self, response):
     """Return the sitemap body contained in the given response.

     ``XmlResponse`` instances and URLs ending in ``.xml`` yield the body
     unchanged; responses accepted by ``is_gzipped`` and URLs ending in
     ``.xml.gz`` yield the gunzipped body.

     Returns ``None`` when the response is not recognised as a sitemap or
     when extracting the body raised; in the latter case the error is
     reported through ``self.log``.
     """
     try:
         if isinstance(response, XmlResponse):
             return response.body
         elif is_gzipped(response):
             return gunzip(response.body)
         elif response.url.endswith('.xml'):
             return response.body
         elif response.url.endswith('.xml.gz'):
             return gunzip(response.body)
     except Exception as e:
         # ``except X as e`` is valid on Python 2.6+ and Python 3, whereas
         # the comma form is a SyntaxError on Python 3.  The handler stays
         # broad on purpose: a failed extraction is logged, not raised.
         self.log("Error %s ungzip %s" % (response.url, e))
     return None
Exemplo n.º 4
0
 def test_gunzip_illegal_eof(self):
     """Gunzip the ``unexpected-eof.gz`` sample and compare the decoded
     text with the stored ``unexpected-eof-output.txt`` reference.
     """
     with open(join(SAMPLEDIR, 'unexpected-eof.gz'), 'rb') as gz_file:
         compressed = gz_file.read()
     text = html_to_unicode('charset=cp1252', gunzip(compressed))[1]

     with open(join(SAMPLEDIR, 'unexpected-eof-output.txt'), 'rb') as ref_file:
         expected_text = ref_file.read().decode("utf-8")

     # Compare lengths first so a size mismatch is reported on its own.
     self.assertEqual(len(text), len(expected_text))
     self.assertEqual(text, expected_text)
 def parse(self, response):
     """Gunzip the response body, read it as a ``Sitemap`` and yield one
     request per entry, using the entry's ``loc`` as the URL and
     ``self.parse_details`` as the callback.
     """
     sitemap = Sitemap(gunzip(response.body))
     for entry in sitemap:
         yield scrapy.Request(entry['loc'], callback=self.parse_details)
Exemplo n.º 6
0
 def test_gunzip_illegal_eof(self):
     """Check that gunzipping ``unexpected-eof.gz`` and decoding it gives
     the text stored in ``unexpected-eof-output.txt``.
     """
     source_path = join(SAMPLEDIR, "unexpected-eof.gz")
     reference_path = join(SAMPLEDIR, "unexpected-eof-output.txt")

     with open(source_path, "rb") as source:
         inflated = gunzip(source.read())
     text = html_to_unicode("charset=cp1252", inflated)[1]

     with open(reference_path, "rb") as reference:
         expected_text = reference.read().decode("utf-8")

     self.assertEqual(len(text), len(expected_text))
     self.assertEqual(text, expected_text)
Exemplo n.º 7
0
    def _decode(self, body, encoding):
        if encoding == b'gzip' or encoding == b'x-gzip':
            body = gunzip(body)

        if encoding == b'deflate':
            try:
                body = zlib.decompress(body)
            except zlib.error:
                # ugly hack to work with raw deflate content that may
                # be sent by microsoft servers. For more information, see:
                # http://carsten.codimi.de/gzip.yaws/
                # http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
                # http://www.gzip.org/zlib/zlib_faq.html#faq38
                body = zlib.decompress(body, -15)
        if encoding == b'br':
            if b'br' in ACCEPTED_ENCODINGS:
                body = brotli.decompress(body)
            else:
                raise ImportError('brotlipy is not installed')
        if encoding == b'zstd' and b'zstd' in ACCEPTED_ENCODINGS:
            # Using its streaming API since its simple API could handle only cases
            # where there is content size data embedded in the frame
            reader = zstandard.ZstdDecompressor().stream_reader(
                io.BytesIO(body))
            body = reader.read()
        return body
Exemplo n.º 8
0
    def _get_sitemap_body(self, response):
        """Return the sitemap body contained in the given response,
        or None if the response is not a sitemap.

        Besides XML and gzipped sitemaps, a plain-text ``sitemap.txt``
        (one URL per line) is converted into an equivalent ``<urlset>``
        XML document, returned as UTF-8 bytes.
        """
        # Imported locally so this method does not depend on a file-level
        # import being present.
        from xml.sax.saxutils import escape

        if isinstance(response, XmlResponse):
            return response.body
        elif gzip_magic_number(response):
            return gunzip(response.body)
        # actual gzipped sitemap files are decompressed above ;
        # if we are here (response body is not gzipped)
        # and have a response for .xml.gz,
        # it usually means that it was already gunzipped
        # by HttpCompression middleware,
        # the HTTP response being sent with "Content-Encoding: gzip"
        # without actually being a .xml.gz file in the first place,
        # merely XML gzip-compressed on the fly,
        # in other word, here, we have plain XML
        elif response.url.endswith(('.xml', '.xml.gz')):
            return response.body
        elif response.url.endswith('sitemap.txt'):
            # splitlines() copes with both "\n" and "\r\n" line endings;
            # blank lines are skipped so no empty <loc> is produced, and
            # each URL is XML-escaped because "&" is common in query strings.
            urls = (line.strip() for line in response.body.decode("utf-8").splitlines())
            entries = [
                '<url><loc>' + escape(url) + '</loc></url>'
                for url in urls
                if url
            ]
            body = (
                '<?xml version="1.0" encoding="UTF-8"?>'
                '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
                + ''.join(entries)
                + '</urlset>'
            )
            return body.encode("utf-8")
        return None
Exemplo n.º 9
0
    def test_gunzip_truncated_short(self):
        """Gunzip the ``truncated-crc-error-short.gz`` sample: the raw body
        must carry the gzip magic number, the inflated body must end with
        ``</html>`` and must no longer carry the magic number.
        """
        sample_path = join(SAMPLEDIR, 'truncated-crc-error-short.gz')
        with open(sample_path, 'rb') as sample:
            compressed = Response("http://www.example.com", body=sample.read())

        self.assertTrue(gzip_magic_number(compressed))

        inflated = Response("http://www.example.com",
                            body=gunzip(compressed.body))
        assert inflated.body.endswith(b'</html>')
        self.assertFalse(gzip_magic_number(inflated))
Exemplo n.º 10
0
    def test_gunzip_basic(self):
        """Gunzip ``feed-sample1.xml.gz``: the raw body carries the gzip
        magic number, the inflated body does not and is 9950 bytes long.
        """
        sample_path = join(SAMPLEDIR, 'feed-sample1.xml.gz')
        with open(sample_path, 'rb') as sample:
            compressed = Response("http://www.example.com", body=sample.read())

        self.assertTrue(gzip_magic_number(compressed))

        inflated = Response("http://www.example.com",
                            body=gunzip(compressed.body))
        self.assertFalse(gzip_magic_number(inflated))
        self.assertEqual(len(inflated.body), 9950)
Exemplo n.º 11
0
    def parse(self, response):
        """Gunzip the response body, collect every ``//url/loc`` text node
        (namespaces removed) and yield a request with callback
        ``self.parse_store`` for each URL containing at least six slashes.
        """
        selector = scrapy.Selector(text=gunzip(response.body))
        selector.remove_namespaces()

        for url in selector.xpath('//url/loc/text()').extract():
            if url.count('/') < 6:
                # Fewer than six slashes: skipped.
                continue
            yield scrapy.Request(url=url, callback=self.parse_store)
Exemplo n.º 12
0
 def should_cache_response(self, response, request):
     """Refuse to cache responses whose body references the Amazon
     captcha image host; defer to the parent policy otherwise.

     The body is gunzipped first when it starts with the gzip magic
     bytes.  Returns ``False`` for captcha pages, else whatever the
     parent class's ``should_cache_response`` returns.
     """
     # Bytes literals: they equal plain ``str`` on Python 2 and match the
     # bytes body on Python 3, where str/bytes mixing raises TypeError.
     gzip_magic = b'\037\213'
     body = response.body
     if body.startswith(gzip_magic):
         body = gz.gunzip(body)
     if b'.images-amazon.com/captcha/' in body:
         return False
     return super(CustomCachePolicy,
                  self).should_cache_response(response, request)
Exemplo n.º 13
0
 def parse(self, response):
     """Gunzip the response body, read every ``//url/loc`` text node
     (namespaces removed) and yield a request with callback
     ``self.parse_store`` for each URL matching the store-page pattern.
     """
     # Compiled once up front instead of once per URL inside the loop.
     store_url = re.compile(
         r'https://www.academy.com/shop/storelocator/.+?/.+?/store-\d+')

     selector = scrapy.Selector(text=gunzip(response.body))
     selector.remove_namespaces()
     for path in selector.xpath('//url/loc/text()').extract():
         if store_url.search(path):
             yield scrapy.Request(path.strip(), callback=self.parse_store)
Exemplo n.º 14
0
    def is_cached_response_fresh(self, response, request):
        """Treat a cached response as fresh only when the parent policy
        agrees and the gunzipped page holds exactly 50 matching table rows.
        """
        if not super().is_cached_response_fresh(response, request):
            return False

        page = HtmlResponse(url=response.url, body=gunzip(response.body))
        rows = Selector(page).xpath(
            "//table[contains(@class, 'listTable')]"
            "//tr[not(@class)][not(@id)]")
        return len(rows) == 50
Exemplo n.º 15
0
    def parse(self, response):
        """Gunzip the sitemap in ``response`` and yield one request, with
        callback ``self.parse_store``, per locator URL found before a
        closing ``</loc>`` tag.
        """
        sitemap = gunzip(response.body)
        # Decode instead of calling str(): on Python 3, str(bytes) gives the
        # "b'...'" repr, in which newlines become the two characters "\n",
        # so the \S+ in the pattern could run across several entries.
        if isinstance(sitemap, bytes):
            sitemap = sitemap.decode('utf-8', 'replace')

        regex = re.compile(r'https://locator.chase.com/\w+/\S+(?=</loc>)')
        for path in regex.findall(sitemap):
            yield scrapy.Request(
                path.strip(),
                callback=self.parse_store,
            )
Exemplo n.º 16
0
    def is_cached_response_fresh(self, response, request):
        """Treat a cached response as fresh when the parent policy agrees
        and the page has non-empty text in ``h2 > span:first-child``.

        Returns ``False`` when the parent policy says stale; otherwise the
        (possibly empty) extraction result or the stripped first match,
        which callers are expected to use for its truthiness.
        """
        if not super().is_cached_response_fresh(response, request):
            return False

        try:
            body = gunzip(response.body)
        except OSError:
            # gunzip failed: use the body as stored.
            body = response.body

        page = HtmlResponse(url=response.url, body=body)
        company_name = Selector(page).css(
            "h2 > span:first-child::text").extract()
        if not company_name:
            return company_name
        return company_name[0].strip()
Exemplo n.º 17
0
    def parse(self, response):
        """Gunzip the sitemap in ``response`` and yield one request, with
        callback ``self.parse_store``, per location URL found before a
        closing ``</loc>`` tag, skipping the "find-a-location" page.
        """
        sitemap = gunzip(response.body)
        # Decode instead of calling str(): on Python 3, str(bytes) gives the
        # "b'...'" repr, in which newlines become the two characters "\n",
        # so the \S+ in the pattern could run across several entries.
        if isinstance(sitemap, bytes):
            sitemap = sitemap.decode('utf-8', 'replace')

        regex = re.compile(r'http://eatpdq.qatserver.com/locations/\S+(?=</loc>)')
        for path in regex.findall(sitemap):
            url = path.strip()
            if url == "http://eatpdq.qatserver.com/locations/find-a-location":
                continue
            yield scrapy.Request(
                url,
                callback=self.parse_store,
            )
Exemplo n.º 18
0
    def _decode(self, body, encoding):
        if encoding == 'gzip' or encoding == 'x-gzip':
            body = gunzip(body)

        if encoding == 'deflate':
            try:
                body = zlib.decompress(body)
            except zlib.error:
                # ugly hack to work with raw deflate content that may
                # be sent by microsoft servers. For more information, see:
                # http://carsten.codimi.de/gzip.yaws/
                # http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
                # http://www.gzip.org/zlib/zlib_faq.html#faq38
                body = zlib.decompress(body, -15)
        return body
Exemplo n.º 19
0
    def _decode(self, body, encoding):
        if encoding == 'gzip' or encoding == 'x-gzip':
            body = gunzip(body)

        if encoding == 'deflate':
            try:
                body = zlib.decompress(body)
            except zlib.error:
                # ugly hack to work with raw deflate content that may
                # be sent by microsoft servers. For more information, see:
                # http://carsten.codimi.de/gzip.yaws/
                # http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
                # http://www.gzip.org/zlib/zlib_faq.html#faq38
                body = zlib.decompress(body, -15)
        return body
Exemplo n.º 20
0
    def _get_sitemap_body(self, response):
        """Return the sitemap body contained in the given response,
        or None if the response is not a sitemap.

        Recognition order: ``XmlResponse`` instance, gzip magic number
        (body is gunzipped), URL ending in ``.xml``/``.xml.gz``, and
        finally any body that parses as XML.
        """
        if isinstance(response, XmlResponse):
            return response.body
        elif gzip_magic_number(response):
            return gunzip(response.body)
        elif response.url.endswith(('.xml', '.xml.gz')):
            return response.body

        # Last resort: accept the body if it parses as XML.  Only parse
        # failures are swallowed, so unrelated errors (and
        # KeyboardInterrupt/SystemExit) are no longer hidden.
        try:
            ET.fromstring(response.body)
        except ET.ParseError:
            return None
        return response.body
Exemplo n.º 21
0
    def test_process_response_gzipped_gzip_file(self):
        """Test that a gzip Content-Encoded .gz file is gunzipped
        only once by the middleware, leaving gunzipping of the file
        to upper layers.
        """
        headers = {
            'Content-Type': 'application/gzip',
            'Content-Encoding': 'gzip',
        }
        # build a gzipped file (here, a sitemap)
        f = BytesIO()
        plainbody = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
  <url>
    <loc>http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url>
    <loc>http://www.example.com/Special-Offers.html</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.8</priority>
  </url>
</urlset>"""
        gz_file = GzipFile(fileobj=f, mode='wb')
        gz_file.write(plainbody)
        gz_file.close()

        # build a gzipped response body containing this gzipped file
        r = BytesIO()
        gz_resp = GzipFile(fileobj=r, mode='wb')
        gz_resp.write(f.getvalue())
        gz_resp.close()

        response = Response("http;//www.example.com/",
                            headers=headers,
                            body=r.getvalue())
        request = Request("http://www.example.com/")

        newresponse = self.mw.process_response(request, response, self.spider)
        self.assertEqual(gunzip(newresponse.body), plainbody)
        self.assertStatsEqual('httpcompression/response_count', 1)
        self.assertStatsEqual('httpcompression/response_bytes', 230)
Exemplo n.º 22
0
 def _get_sitemap_body(self, response):
     """Extract the sitemap bytes from ``response``.

     Returns the body for an ``XmlResponse``, the gunzipped body when the
     gzip magic number is detected, the body for URLs ending in ``.xml``
     or ``.xml.gz``, and ``None`` for anything else.
     """
     if isinstance(response, XmlResponse):
         return response.body
     if gzip_magic_number(response):
         return gunzip(response.body)

     # Real gzipped sitemap files were decompressed just above.  Reaching
     # this point with a ``.xml.gz`` URL usually means HttpCompression
     # middleware already gunzipped the body: the server sent plain XML
     # with "Content-Encoding: gzip" (compressed on the fly) rather than
     # an actual .xml.gz file, so what we hold here is plain XML.
     if response.url.endswith(('.xml', '.xml.gz')):
         return response.body
     return None
Exemplo n.º 23
0
    def log(self, response, request, spider):
        """Emit a debug log entry describing ``request`` and ``response``.

        The template from ``self._get_template()`` is filled, in order,
        with the request method, URL, headers and body, then the response
        status, headers and the first 1024 items of the body (gunzipped
        when possible).  ``spider`` is not used.
        """
        try:
            body = gunzip(response.body)
        except IOError:
            # gunzip failed: log the body as received.
            body = response.body

        printer = pprint.PrettyPrinter(indent=2)
        template = self._get_template()
        message = template.format(
            request.method,
            request.url,
            printer.pformat(request.headers),
            printer.pformat(request.body),
            response.status,
            printer.pformat(response.headers),
            printer.pformat(body[:1024]),
        )
        self.logger.debug(message)
Exemplo n.º 24
0
 def _get_sitemap_body(self, response):
     """Return the sitemap payload of ``response``, or ``None`` when the
     response is not recognised as a sitemap.
     """
     if isinstance(response, XmlResponse):
         return response.body
     if gzip_magic_number(response):
         return gunzip(response.body)

     # Genuine gzipped sitemap files have been handled above.  A
     # non-gzipped body behind a ``.xml.gz`` URL usually means the
     # HttpCompression middleware already inflated it: the response was
     # plain XML sent with "Content-Encoding: gzip", not a real .xml.gz
     # file.  Either way the body at this point is plain XML.
     looks_like_xml_url = response.url.endswith(('.xml', '.xml.gz'))
     if looks_like_xml_url:
         return response.body
     return None
    def test_process_response_gzipped_gzip_file(self):
        """Test that a gzip Content-Encoded .gz file is gunzipped
        only once by the middleware, leaving gunzipping of the file
        to upper layers.
        """
        headers = {
            'Content-Type': 'application/gzip',
            'Content-Encoding': 'gzip',
        }
        # build a gzipped file (here, a sitemap)
        f = BytesIO()
        plainbody = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
  <url>
    <loc>http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url>
    <loc>http://www.example.com/Special-Offers.html</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.8</priority>
  </url>
</urlset>"""
        gz_file = GzipFile(fileobj=f, mode='wb')
        gz_file.write(plainbody)
        gz_file.close()

        # build a gzipped response body containing this gzipped file
        r = BytesIO()
        gz_resp = GzipFile(fileobj=r, mode='wb')
        gz_resp.write(f.getvalue())
        gz_resp.close()

        response = Response("http;//www.example.com/", headers=headers, body=r.getvalue())
        request = Request("http://www.example.com/")

        newresponse = self.mw.process_response(request, response, self.spider)
        self.assertEqual(gunzip(newresponse.body), plainbody)
Exemplo n.º 26
0
    def _parse_sitemap(self, response):
        """Yield follow-up requests for a robots.txt or sitemap response.

        For a robots.txt URL, every sitemap URL it lists is requested with
        this method as callback.  Otherwise the body (gunzipped if needed)
        is parsed as a ``Sitemap``: a ``sitemapindex`` yields requests for
        locations matching any ``self._follow`` pattern, a ``urlset``
        yields each location with the first matching callback from
        ``self._cbs``.  Non-XML responses are logged and ignored.
        """
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
            return

        if isinstance(response, XmlResponse):
            body = response.body
        elif is_gzipped(response):
            body = gunzip(response.body)
        else:
            log.msg("Ignoring non-XML sitemap: %s" % response, log.WARNING)
            return

        sitemap = Sitemap(body)
        if sitemap.type == 'sitemapindex':
            for loc in iterloc(sitemap):
                if any(pattern.search(loc) for pattern in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif sitemap.type == 'urlset':
            for loc in iterloc(sitemap):
                for pattern, callback in self._cbs:
                    if pattern.search(loc):
                        # Only the first matching rule handles a location.
                        yield Request(loc, callback=callback)
                        break
Exemplo n.º 27
0
    def _parse_sitemap(self, response):
        """Generate requests from a robots.txt or sitemap response.

        robots.txt: request each listed sitemap URL, re-entering this
        method.  Sitemap (XML or gzipped): for a ``sitemapindex`` follow
        locations matching ``self._follow``; for a ``urlset`` dispatch each
        location to the first rule in ``self._cbs`` whose pattern matches.
        Anything else is logged as a warning and produces nothing.
        """
        is_robots = response.url.endswith('/robots.txt')
        if is_robots:
            for sitemap_url in sitemap_urls_from_robots(response.body):
                yield Request(sitemap_url, callback=self._parse_sitemap)
            return

        if isinstance(response, XmlResponse):
            body = response.body
        elif is_gzipped(response):
            body = gunzip(response.body)
        else:
            log.msg("Ignoring non-XML sitemap: %s" % response, log.WARNING)
            return

        parsed = Sitemap(body)
        if parsed.type == 'sitemapindex':
            for loc in iterloc(parsed):
                if any(rule.search(loc) for rule in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif parsed.type == 'urlset':
            for loc in iterloc(parsed):
                for rule, handler in self._cbs:
                    if rule.search(loc):
                        yield Request(loc, callback=handler)
                        # Stop at the first rule that matches.
                        break
Exemplo n.º 28
0
 def test_gunzip_truncated(self):
     """Gunzip the ``truncated-crc-error.gz`` sample and check that the
     recovered output ends with ``</html`` (no closing ``>``).
     """
     with open(join(SAMPLEDIR, 'truncated-crc-error.gz'), 'rb') as f:
         text = gunzip(f.read())
         # Bytes literal: equal to a plain str on Python 2, and required on
         # Python 3 where bytes.endswith() rejects a str argument.
         assert text.endswith(b'</html')
Exemplo n.º 29
0
def cached_page(site, url_path, spider_name='toc'):
    """Serve a page out of the Scrapy HTTP cache as an HTML response.

    Args:
        site: Base64-encoded site key, used to look up ``SiteSchemas``.
        url_path: Base64-encoded path appended to the site's base URL.
        spider_name: Spider whose cache is read; only ``Spiders.META`` and
            ``Spiders.TOC`` are supported.

    Returns:
        The object built by ``make_response``: the cached page (or, for
        the TOC spider with an ``aid`` query argument, a rendered table of
        its items), or a "not found in cache" message.

    Raises:
        Exception: if ``spider_name`` is not one of the supported spiders.
    """
    handle_client_ip()

    site = base64.standard_b64decode(site.encode()).decode()
    url_path = base64.standard_b64decode(url_path.encode()).decode()
    url_site = SiteSchemas.get(site).get(SSK.URL)
    url = url_site + url_path
    origin_encoding = SiteSchemas.get(site).get(SSK.ENCODING, 'utf-8')
    aid = request.args.get('aid', default=None, type=int)

    from moltspider.consts import Schemas
    from moltspider.parser import iter_items
    from scrapy.utils.misc import load_object
    from scrapy.utils.project import get_project_settings
    from scrapy.http.request import Request
    from scrapy.http.response.html import HtmlResponse
    from scrapy.utils.gz import gunzip
    from scrapy.downloadermiddlewares.httpcompression import ACCEPTED_ENCODINGS
    try:
        import brotli
    except ImportError:
        # brotli is optional; without it "br" bodies cannot be decoded.
        brotli = None
    import zlib
    settings = get_project_settings()
    storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)

    body = None
    spider_req = Request(url)
    if spider_name == Spiders.META:
        from moltspider.spiders.meta import MetaSpider
        spider = MetaSpider()
        schema_name = Schemas.META_PAGE
    elif spider_name == Spiders.TOC:
        from moltspider.spiders.toc import TocSpider
        # NOTE(review): the META branch instantiates its spider while this
        # one passes the class itself - confirm this is intentional.
        spider = TocSpider
        schema_name = Schemas.TOC_PAGE
    else:
        raise Exception('No support for spider "%s"\'s cache page' %
                        spider_name)

    cachedresponse = storage.retrieve_response(spider, spider_req)
    if cachedresponse:
        # Start from the raw cached bytes so that pages stored without a
        # Content-Encoding header are served too, and so that every
        # decoder below works on the actual payload rather than on None.
        body = cachedresponse.body
        content_encoding = cachedresponse.headers.getlist('Content-Encoding')
        if content_encoding:
            encoding = content_encoding.pop()
            if encoding in (b'gzip', b'x-gzip'):
                body = gunzip(body)
            elif encoding == b'deflate':
                try:
                    body = zlib.decompress(body)
                except zlib.error:
                    # ugly hack to work with raw deflate content that may
                    # be sent by microsoft servers. For more information, see:
                    # http://carsten.codimi.de/gzip.yaws/
                    # http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
                    # http://www.gzip.org/zlib/zlib_faq.html#faq38
                    body = zlib.decompress(body, -15)
            elif (encoding == b'br' and brotli is not None
                    and b'br' in ACCEPTED_ENCODINGS):
                body = brotli.decompress(body)
            else:
                # Encoding we cannot undo: report the page as unavailable
                # rather than serving compressed bytes.
                body = None

    if body:
        if spider_name == Spiders.TOC and aid:
            sb = []
            colspan = 4
            i = 0
            scrapy_resp = HtmlResponse(url)
            scrapy_resp = scrapy_resp.replace(body=body,
                                              encoding=origin_encoding)
            sb.append('<table width="1000px" align="center"><tr>')
            for item in iter_items(spider, scrapy_resp, [
                    site,
            ], schema_name):
                # Start a new table row every ``colspan`` items.
                if i % colspan == 0:
                    sb.append('</tr><tr>')
                # Temporary key so the base URL is available to the
                # %-format below; removed again right after.
                item['_'] = url_site
                sb.append('<td><a href="%(_)s%(url)s">%(name)s</a></td>' %
                          item)
                del item['_']
                i += 1
            sb.append('</tr></table>')
            body = '\n'.join(sb)
            body = render_template_string(template_page, content=body)
        else:
            body = body.decode(encoding=origin_encoding)
    else:
        body = '%s (%s) not found in cache.' % (url, origin_encoding)

    resp = make_response(body)
    resp.headers['Content-Type'] = 'text/html; charset=utf-8'
    return resp
Exemplo n.º 30
0
 def test_gunzip_basic(self):
     """Gunzip ``feed-sample1.xml.gz`` and check the output is 9950 long."""
     sample_path = join(SAMPLEDIR, 'feed-sample1.xml.gz')
     with open(sample_path, 'rb') as sample:
         compressed = sample.read()
     text = gunzip(compressed)
     self.assertEqual(len(text), 9950)
Exemplo n.º 31
0
 def _decompress(self, body):
     while True:
         try:
             body = gunzip(body)
         except IOError:
             return body
Exemplo n.º 32
0
 def test_gunzip_truncated_short(self):
     """Gunzip ``truncated-crc-error-short.gz`` and check the output ends
     with ``</html>``.
     """
     sample_path = join(SAMPLEDIR, 'truncated-crc-error-short.gz')
     with open(sample_path, 'rb') as sample:
         compressed = sample.read()
     text = gunzip(compressed)
     assert text.endswith(b'</html>')
Exemplo n.º 33
0
 def test_gunzip_basic(self):
     """Check that gunzipping ``feed-sample1.xml.gz`` gives 9950 items."""
     with open(join(SAMPLEDIR, 'feed-sample1.xml.gz'), 'rb') as gz_file:
         raw = gz_file.read()
     inflated = gunzip(raw)
     self.assertEqual(len(inflated), 9950)
Exemplo n.º 34
0
 def test_gunzip_truncated_short(self):
     """Check that the gunzipped ``truncated-crc-error-short.gz`` sample
     ends with ``</html>``.
     """
     with open(join(SAMPLEDIR, "truncated-crc-error-short.gz"), "rb") as gz_file:
         raw = gz_file.read()
     inflated = gunzip(raw)
     assert inflated.endswith(b"</html>")