import codecs
import datetime
import functools
import hashlib
import logging
import os
import re
import time
from urlparse import urljoin, urlparse

from flask import abort, request
from lxml import etree
from lxml.cssselect import CSSSelector
from mincss.processor import Processor

# download(), mkdir(), CACHE_DIR, add_collect_stats_qs() and
# summorize_stats_html() are module-level helpers defined elsewhere;
# the two simplest are sketched after the second variant of this function.


def proxy(path):
    if path == 'favicon.ico':
        abort(404)
    url = path
    if not path.count('://'):
        url = 'http://' + url
    query = urlparse(request.url).query
    if query:
        url += '?%s' % query
    logging.info('Downloading %s' % url)
    t0 = time.time()
    html = download(url)
    t1 = time.time()
    print('%.4f seconds to download' % (t1 - t0))

    p = Processor(debug=False, optimize_lookup=True)
    # since we've already downloaded the HTML
    t0 = time.time()
    p.process_html(html, url)
    t1 = time.time()
    p.process()
    t2 = time.time()
    print('%.4f seconds to parse and process' % (t2 - t1))

    collect_stats = request.args.get('MINCSS_STATS', False)
    stats = []
    css_url_regex = re.compile(r'url\(([^\)]+)\)')

    def css_url_replacer(match, href=None):
        filename = match.groups()[0]
        bail = match.group()
        if (
            (filename.startswith('"') and filename.endswith('"')) or
            (filename.startswith("'") and filename.endswith("'"))
        ):
            filename = filename[1:-1]
        if 'data:image' in filename or '://' in filename:
            return bail
        if filename == '.':
            # this is a known IE hack in CSS
            return bail
        new_filename = urljoin(url, filename)
        return 'url("%s")' % new_filename

    for i, each in enumerate(p.inlines):
        # this should be using CSSSelector instead
        new_inline = each.after
        new_inline = css_url_regex.sub(
            functools.partial(css_url_replacer, href=url),
            new_inline
        )
        stats.append(
            ('inline %s' % (i + 1), each.before, each.after)
        )
        html = html.replace(each.before, new_inline)

    parser = etree.HTMLParser()
    stripped = html.strip()
    tree = etree.fromstring(stripped, parser).getroottree()
    page = tree.getroot()
    # lxml inserts a doctype if none exists, so only include it in
    # the root if it was in the original html.
    was_doctype = tree.docinfo.doctype

    links = dict((x.href, x) for x in p.links)
    for link in CSSSelector('link')(page):
        if (
            link.attrib.get('rel', '') == 'stylesheet' or
            link.attrib['href'].lower().split('?')[0].endswith('.css')
        ):
            hash_ = hashlib.md5(url + link.attrib['href']).hexdigest()[:7]
            now = datetime.date.today()
            destination_dir = os.path.join(
                CACHE_DIR,
                str(now.year),
                str(now.month),
                str(now.day),
            )
            mkdir(destination_dir)
            new_css = links[link.attrib['href']].after
            stats.append((
                link.attrib['href'],
                links[link.attrib['href']].before,
                links[link.attrib['href']].after
            ))
            new_css = css_url_regex.sub(
                functools.partial(
                    css_url_replacer,
                    href=link.attrib['href']
                ),
                new_css
            )
            destination = os.path.join(destination_dir, hash_ + '.css')
            with codecs.open(destination, 'w', 'utf-8') as f:
                f.write(new_css)
            link.attrib['href'] = (
                '/cache%s' % destination.replace(CACHE_DIR, '')
            )

    for img in CSSSelector('img, script')(page):
        if 'src' in img.attrib:
            orig_src = urljoin(url, img.attrib['src'])
            img.attrib['src'] = orig_src

    for a in CSSSelector('a')(page):
        if 'href' not in a.attrib:
            continue
        href = a.attrib['href']
        if (
            '://' in href or
            href.startswith('#') or
            href.startswith('javascript:')
        ):
            continue
        if href.startswith('/'):
            a.attrib['href'] = (
                '/' +
                urljoin(url, a.attrib['href']).replace('http://', '')
            )
        if collect_stats:
            a.attrib['href'] = add_collect_stats_qs(
                a.attrib['href'],
                collect_stats
            )

    html = etree.tostring(page, method='html')
    if collect_stats:
        html = re.sub(
            '<body[^>]*>',
            lambda m: m.group() + summorize_stats_html(stats),
            html,
            flags=re.I | re.M,
            count=1
        )
    return (was_doctype or '') + '\n' + html

def proxy(path):
    if path == 'favicon.ico':
        abort(404)
    url = path
    if not path.count('://'):
        url = 'http://' + url
    query = urlparse.urlparse(request.url).query
    if query:
        url += '?%s' % query
    logging.info('Downloading %s' % url)
    t0 = time.time()
    html = download(url)
    t1 = time.time()
    print "%.4f seconds to download" % (t1 - t0)

    p = Processor(debug=False, optimize_lookup=True)
    # since we've already downloaded the HTML
    t0 = time.time()
    p.process_html(html, url)
    t1 = time.time()
    p.process()
    t2 = time.time()
    print "%.4f seconds to parse and process" % (t2 - t1)

    collect_stats = request.args.get('MINCSS_STATS', False)
    stats = []
    css_url_regex = re.compile(r'url\(([^\)]+)\)')

    def css_url_replacer(match, href=None):
        filename = match.groups()[0]
        bail = match.group()
        if ((filename.startswith('"') and filename.endswith('"')) or
                (filename.startswith("'") and filename.endswith("'"))):
            filename = filename[1:-1]
        if 'data:image' in filename or '://' in filename:
            return bail
        if filename == '.':
            # this is a known IE hack in CSS
            return bail
        new_filename = urlparse.urljoin(url, filename)
        return 'url("%s")' % new_filename

    for i, each in enumerate(p.inlines):
        # this should be using CSSSelector instead
        new_inline = each.after
        new_inline = css_url_regex.sub(
            functools.partial(css_url_replacer, href=url),
            new_inline)
        stats.append(('inline %s' % (i + 1), each.before, each.after))
        html = html.replace(each.before, new_inline)

    parser = etree.HTMLParser()
    stripped = html.strip()
    tree = etree.fromstring(stripped, parser).getroottree()
    page = tree.getroot()
    # lxml inserts a doctype if none exists, so only include it in
    # the root if it was in the original html.
    was_doctype = tree.docinfo.doctype

    links = dict((x.href, x) for x in p.links)
    for link in CSSSelector('link')(page):
        if (link.attrib.get('rel', '') == 'stylesheet' or
                link.attrib['href'].lower().split('?')[0].endswith('.css')):
            hash_ = hashlib.md5(url + link.attrib['href']).hexdigest()[:7]
            now = datetime.date.today()
            destination_dir = os.path.join(
                CACHE_DIR,
                str(now.year),
                str(now.month),
                str(now.day),
            )
            mkdir(destination_dir)
            new_css = links[link.attrib['href']].after
            stats.append(
                (link.attrib['href'],
                 links[link.attrib['href']].before,
                 links[link.attrib['href']].after))
            new_css = css_url_regex.sub(
                functools.partial(css_url_replacer,
                                  href=link.attrib['href']),
                new_css)
            destination = os.path.join(destination_dir, hash_ + '.css')
            with codecs.open(destination, 'w', 'utf-8') as f:
                f.write(new_css)
            link.attrib['href'] = (
                '/cache%s' % destination.replace(CACHE_DIR, ''))

    for img in CSSSelector('img, script')(page):
        if 'src' in img.attrib:
            orig_src = urlparse.urljoin(url, img.attrib['src'])
            img.attrib['src'] = orig_src

    for a in CSSSelector('a')(page):
        if 'href' not in a.attrib:
            continue
        href = a.attrib['href']
        if ('://' in href or
                href.startswith('#') or
                href.startswith('javascript:')):
            continue
        if href.startswith('/'):
            a.attrib['href'] = (
                '/' +
                urlparse.urljoin(url, a.attrib['href'])
                .replace('http://', ''))
        if collect_stats:
            a.attrib['href'] = add_collect_stats_qs(
                a.attrib['href'], collect_stats)

    html = etree.tostring(page, method='html')
    if collect_stats:
        html = re.sub(
            '<body[^>]*>',
            lambda m: m.group() + summorize_stats_html(stats),
            html,
            flags=re.I | re.M,
            count=1)
    return (was_doctype or '') + '\n' + html

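# A minimal sketch of the helpers proxy() assumes. Both versions above call
# download(), mkdir(), CACHE_DIR, add_collect_stats_qs() and
# summorize_stats_html() without defining them; the two simplest are sketched
# below, with names and behavior inferred purely from the call sites, not
# taken from the original module. In the original Flask app, proxy() is
# presumably registered with a catch-all route such as
# @app.route('/<path:path>').

import os
import urllib2

CACHE_DIR = '/tmp/proxy-cache'  # assumed; the real value is defined elsewhere


def download(url):
    # Hypothetical: fetch the URL and return the body as a unicode string,
    # matching how proxy() hands the result straight to process_html().
    # Assumes UTF-8; a real implementation would sniff the charset.
    return unicode(urllib2.urlopen(url).read(), 'utf-8')


def mkdir(path):
    # Hypothetical: os.makedirs() that tolerates an already-existing
    # directory, since proxy() calls it unconditionally for each date-dir.
    if not os.path.isdir(path):
        os.makedirs(path)
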
import logging

# The None checks below imply these imports are optional, along these lines:
try:
    from mincss.processor import Processor
except ImportError:
    Processor = None
try:
    import cssmin
except ImportError:
    cssmin = None


def mincss_response(response, request):
    if Processor is None or cssmin is None:
        logging.info("No mincss_response() possible")
        return response

    html = unicode(response.content, 'utf-8')
    p = Processor()
    p.process_html(html, request.build_absolute_uri())
    p.process()
    combined_css = []
    _total_before = 0
    _requests_before = 0
    for link in p.links:
        _total_before += len(link.before)
        _requests_before += 1
        # combined_css.append('/* %s */' % link.href)
        combined_css.append(link.after)
    for inline in p.inlines:
        _total_before += len(inline.before)
        combined_css.append(inline.after)

    if p.inlines:
        html = _style_regex.sub('', html)

    found_link_hrefs = [x.href for x in p.links]

    def link_remover(m):
        bail = m.group()
        for each in found_link_hrefs:
            if each in bail:
                return ''
        return bail

    html = _link_regex.sub(link_remover, html)

    _total_after = sum(len(x) for x in combined_css)
    combined_css = [cssmin.cssmin(x) for x in combined_css]
    _total_after_min = sum(len(x) for x in combined_css)

    stats_css = (
        """
/* Stats about using github.com/peterbe/mincss
   -------------------------------------------
   Requests:         %s (now: 0)
   Before:           %.fKb
   After:            %.fKb
   After (minified): %.fKb
   Saving:           %.fKb
*/"""
        % (_requests_before,
           _total_before / 1024.,
           _total_after / 1024.,
           _total_after_min / 1024.,
           (_total_before - _total_after) / 1024.)
    )
    combined_css.insert(0, stats_css)

    new_style = (
        '<style type="text/css">\n%s\n</style>' %
        ('\n'.join(combined_css)).strip()
    )
    html = html.replace(
        '</head>',
        new_style + '\n</head>'
    )
    response.content = html.encode('utf-8')
    return response

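# mincss_response() above refers to module-level _style_regex and _link_regex
# without defining them. A plausible sketch, assuming they are meant to match
# whole <style> blocks and <link> tags respectively (inferred from how they
# are used, not the original definitions):

import re

_style_regex = re.compile(r'<style.*?</style>', re.M | re.DOTALL)
_link_regex = re.compile(r'<link\s+[^>]*>', re.M)
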
def mincss_response(response, request):
    if Processor is None or cssmin is None:
        logging.info("No mincss_response() possible")
        return response

    abs_uri = request.build_absolute_uri()
    if abs_uri.startswith('http://testserver'):
        return response

    html = unicode(response.content, 'utf-8')
    p = Processor(
        preserve_remote_urls=True,
    )
    p.process_html(html, abs_uri)
    p.process()
    combined_css = []
    _total_before = 0
    _requests_before = 0
    for link in p.links:
        _total_before += len(link.before)
        _requests_before += 1
        # combined_css.append('/* %s */' % link.href)

        # Here's some ugly special-casing.
        # The autocomplete CSS is almost entirely based on javascript
        # events that mincss can't be aware of.
        # And since it's (going to be) a third-party tool, we can't
        # pepper it with /*no mincss*/ in every selector.
        if 'autocompeter' in link.href:
            # leave as is
            combined_css.append(link.before)
        else:
            combined_css.append(link.after)
    for inline in p.inlines:
        _total_before += len(inline.before)
        combined_css.append(inline.after)

    if p.inlines:
        html = _style_regex.sub('', html)

    found_link_hrefs = [x.href for x in p.links]

    def link_remover(m):
        bail = m.group()
        for each in found_link_hrefs:
            if each in bail:
                return ''
        return bail

    html = _link_regex.sub(link_remover, html)

    _total_after = sum(len(x) for x in combined_css)
    combined_css = [cssmin.cssmin(x) for x in combined_css]
    _total_after_min = sum(len(x) for x in combined_css)

    stats_css = (
        """
/* Stats about using github.com/peterbe/mincss
   -------------------------------------------
   Requests:         %s (now: 0)
   Before:           %.fKb
   After:            %.fKb
   After (minified): %.fKb
   Saving:           %.fKb
*/"""
        % (_requests_before,
           _total_before / 1024.,
           _total_after / 1024.,
           _total_after_min / 1024.,
           (_total_before - _total_after) / 1024.)
    )
    combined_css.insert(0, stats_css)

    new_style = (
        '<style type="text/css">\n%s\n</style>' %
        ('\n'.join(combined_css)).strip()
    )
    html = html.replace(
        '</head>',
        new_style + '\n</head>'
    )
    response.content = html.encode('utf-8')
    return response

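# A sketch of how mincss_response() might be mounted. Its
# (response, request) -> response signature fits Django's classic
# process_response() middleware hook; the middleware class below is an
# assumption, not the original wiring:


class MincssMiddleware(object):

    def process_response(self, request, response):
        # Only post-process successful HTML responses; everything else
        # passes through untouched.
        if (response.status_code == 200 and
                'text/html' in response.get('Content-Type', '')):
            return mincss_response(response, request)
        return response
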
from django.core.cache import cache  # assumed: the snippet uses Django's cache

logger = logging.getLogger(__name__)  # assumed: the module's logger isn't shown


def _mincss_response(response, request):
    if Processor is None or cssmin is None:
        logging.info("No mincss_response() possible")
        return response

    abs_uri = request.build_absolute_uri()
    if abs_uri.startswith('http://testserver'):
        return response

    lock_key = 'lock:' + hashlib.md5(request.path).hexdigest()
    if cache.get(lock_key):
        # we're actively busy prepping this one
        print "Bailing because mincss_response is already busy for: %s" % (
            request.path + request.META.get('QUERY_STRING'),
        )
        return response
    cache.set(lock_key, True, 200)
    print "Starting to mincss for: %s" % (
        request.path + request.META.get('QUERY_STRING'),
    )

    html = unicode(response.content, 'utf-8')
    t0 = time.time()
    p = Processor(
        preserve_remote_urls=True,
    )
    p.process_html(html, abs_uri)
    p.process()
    t1 = time.time()

    combined_css = []
    _total_before = 0
    _requests_before = 0
    for link in p.links:
        _total_before += len(link.before)
        _requests_before += 1
        combined_css.append(link.after)
    for inline in p.inlines:
        _total_before += len(inline.before)
        combined_css.append(inline.after)

    if p.inlines:
        html = _style_regex.sub('', html)

    found_link_hrefs = [x.href for x in p.links]

    def link_remover(m):
        bail = m.group()
        for each in found_link_hrefs:
            if each in bail:
                return ''
        return bail

    html = _link_regex.sub(link_remover, html)

    _total_after = sum(len(x) for x in combined_css)
    combined_css = [cssmin.cssmin(x) for x in combined_css]
    _total_after_min = sum(len(x) for x in combined_css)
    t2 = time.time()

    template = """
/* Stats about using github.com/peterbe/mincss
   -------------------------------------------
   Requests:         %s (now: 0)
   Before:           %.fKb
   After:            %.fKb
   After (minified): %.fKb
   Saving:           %.fKb
*/"""
    stats_css = template % (
        _requests_before,
        _total_before / 1024.,
        _total_after / 1024.,
        _total_after_min / 1024.,
        (_total_before - _total_after) / 1024.
    )
    combined_css.insert(0, stats_css)

    new_style = (
        '<style type="text/css">\n%s\n</style>' %
        ('\n'.join(combined_css)).strip()
    )
    html = html.replace(
        '</head>',
        new_style + '\n</head>'
    )

    logger.info('Took %.2fms to process with mincss' % (
        (t1 - t0) * 1000,
    ))
    logger.info('Took %.2fms to post-process remaining CSS' % (
        (t2 - t1) * 1000,
    ))

    _save_mincssed_html(
        request.path + request.META.get('QUERY_STRING'),
        html
    )
    response.content = html.encode('utf-8')
    return response

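# _mincss_response() above calls _save_mincssed_html(), which isn't shown.
# A plausible sketch, assuming it parks the rewritten HTML in Django's cache
# for a later request to serve; the key scheme and timeout are guesses:

import hashlib

from django.core.cache import cache


def _save_mincssed_html(path, html):
    # Hypothetical: hash the path+querystring the same way as the lock key
    # above so the result can be looked up symmetrically.
    key = 'mincssed:' + hashlib.md5(path).hexdigest()
    cache.set(key, html, 60 * 60)  # assumed one-hour lifetime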