# Example #1
# 0
def cached_page(site, url_path, spider_name='toc'):
    """Serve a page straight from scrapy's HTTP cache.

    :param site: base64-encoded site key into ``SiteSchemas``.
    :param url_path: base64-encoded path appended to the site's base URL.
    :param spider_name: which spider's cache to read (``Spiders.META`` or
        ``Spiders.TOC``); anything else raises.
    :return: a Flask response with the cached page body (TOC pages with an
        ``aid`` query arg are re-rendered as a link table), or a plain
        "not found in cache" message.
    :raises Exception: when *spider_name* is not a supported spider.
    """
    handle_client_ip()

    # Route parameters arrive base64-encoded; decode before lookup.
    site = base64.standard_b64decode(site.encode()).decode()
    url_path = base64.standard_b64decode(url_path.encode()).decode()
    url_site = SiteSchemas.get(site).get(SSK.URL)
    url = url_site + url_path
    origin_encoding = SiteSchemas.get(site).get(SSK.ENCODING, 'utf-8')
    aid = request.args.get('aid', default=None, type=int)

    # Imports kept local so the heavy scrapy machinery is only loaded when
    # this view is actually hit.
    from moltspider.consts import Schemas
    from moltspider.parser import iter_items
    from scrapy.utils.misc import load_object
    from scrapy.utils.project import get_project_settings
    from scrapy.http.request import Request
    from scrapy.http.response.html import HtmlResponse
    from scrapy.utils.gz import gunzip
    from scrapy.downloadermiddlewares.httpcompression import ACCEPTED_ENCODINGS
    try:
        # Optional: only needed for brotli-compressed cache entries.  If it
        # is missing, scrapy won't have b'br' in ACCEPTED_ENCODINGS either,
        # so the br branch below is never reached.
        import brotli
    except ImportError:
        brotli = None
    import zlib

    settings = get_project_settings()
    storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)

    body = None
    spider_req = Request(url)
    if spider_name == Spiders.META:
        from moltspider.spiders.meta import MetaSpider
        spider = MetaSpider()
        schema_name = Schemas.META_PAGE
    elif spider_name == Spiders.TOC:
        from moltspider.spiders.toc import TocSpider
        # Instantiate for consistency with MetaSpider above (the original
        # passed the class itself, which only works by accident for APIs
        # that don't touch instance state).
        spider = TocSpider()
        schema_name = Schemas.TOC_PAGE
    else:
        raise Exception('No support for spider "%s"\'s cache page' %
                        spider_name)

    cachedresponse = storage.retrieve_response(spider, spider_req)
    if cachedresponse:
        # Default to the raw cached bytes; an uncompressed cached page
        # (no Content-Encoding header) must still be served, and the
        # deflate/br branches must decompress the cached body, not None.
        body = cachedresponse.body
        content_encoding = cachedresponse.headers.getlist('Content-Encoding')
        if content_encoding:
            encoding = content_encoding.pop()
            if encoding in (b'gzip', b'x-gzip'):
                body = gunzip(body)
            elif encoding == b'deflate':
                try:
                    body = zlib.decompress(body)
                except zlib.error:
                    # ugly hack to work with raw deflate content that may
                    # be sent by microsoft servers. For more information, see:
                    # http://carsten.codimi.de/gzip.yaws/
                    # http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
                    # http://www.gzip.org/zlib/zlib_faq.html#faq38
                    body = zlib.decompress(body, -15)
            elif encoding == b'br' and b'br' in ACCEPTED_ENCODINGS:
                body = brotli.decompress(body)

    if body:
        if spider_name == Spiders.TOC and aid:
            # Re-render the TOC page as a simple link table, `colspan`
            # links per row, each link rebased onto the site's base URL.
            sb = []
            colspan = 4
            i = 0
            scrapy_resp = HtmlResponse(url)
            scrapy_resp = scrapy_resp.replace(body=body,
                                              encoding=origin_encoding)
            sb.append('<table width="1000px" align="center"><tr>')
            for item in iter_items(spider, scrapy_resp, [
                    site,
            ], schema_name):
                if i % colspan == 0:
                    sb.append('</tr><tr>')
                # '_' temporarily carries the base URL into the format dict.
                item['_'] = url_site
                sb.append('<td><a href="%(_)s%(url)s">%(name)s</a></td>' %
                          item)
                del item['_']
                i += 1
            sb.append('</tr></table>')
            body = '\n'.join(sb)
            body = render_template_string(template_page, content=body)
        else:
            # Serve the cached page as-is, decoded from the site's
            # original character encoding.
            body = body.decode(encoding=origin_encoding)
    else:
        body = '%s (%s) not found in cache.' % (url, origin_encoding)

    resp = make_response(body)
    resp.headers['Content-Type'] = 'text/html; charset=utf-8'
    return resp