import codecs
import datetime
import functools
import hashlib
import logging
import os
import re
import time
from urlparse import urljoin, urlparse

from flask import abort, request
from lxml import etree
from lxml.cssselect import CSSSelector
from mincss.processor import Processor

# download(), mkdir(), CACHE_DIR, add_collect_stats_qs() and
# summorize_stats_html() are module-level helpers defined elsewhere;
# the two simplest are sketched after the second variant of this function.


def proxy(path):
    if path == 'favicon.ico':
        abort(404)
    url = path
    if not path.count('://'):
        url = 'http://' + url
    query = urlparse(request.url).query
    if query:
        url += '?%s' % query
    logging.info('Downloading %s' % url)
    t0 = time.time()
    html = download(url)
    t1 = time.time()
    print('%.4f seconds to download' % (t1 - t0))

    p = Processor(debug=False, optimize_lookup=True)
    # since we've already downloaded the HTML
    t0 = time.time()
    p.process_html(html, url)
    t1 = time.time()
    p.process()
    t2 = time.time()
    print('%.4f seconds to parse and process' % (t2 - t1))

    collect_stats = request.args.get('MINCSS_STATS', False)
    stats = []
    css_url_regex = re.compile(r'url\(([^\)]+)\)')

    def css_url_replacer(match, href=None):
        filename = match.groups()[0]
        bail = match.group()
        if (
            (filename.startswith('"') and filename.endswith('"')) or
            (filename.startswith("'") and filename.endswith("'"))
        ):
            filename = filename[1:-1]
        if 'data:image' in filename or '://' in filename:
            return bail
        if filename == '.':
            # this is a known IE hack in CSS
            return bail
        new_filename = urljoin(url, filename)
        return 'url("%s")' % new_filename

    for i, each in enumerate(p.inlines):
        # this should be using CSSSelector instead
        new_inline = each.after
        new_inline = css_url_regex.sub(
            functools.partial(css_url_replacer, href=url),
            new_inline
        )
        stats.append(
            ('inline %s' % (i + 1), each.before, each.after)
        )
        html = html.replace(each.before, new_inline)

    parser = etree.HTMLParser()
    stripped = html.strip()
    tree = etree.fromstring(stripped, parser).getroottree()
    page = tree.getroot()
    # lxml inserts a doctype if none exists, so only include it in
    # the root if it was in the original html.
    was_doctype = tree.docinfo.doctype

    links = dict((x.href, x) for x in p.links)
    for link in CSSSelector('link')(page):
        if (
            link.attrib.get('rel', '') == 'stylesheet' or
            link.attrib['href'].lower().split('?')[0].endswith('.css')
        ):
            hash_ = hashlib.md5(url + link.attrib['href']).hexdigest()[:7]
            now = datetime.date.today()
            destination_dir = os.path.join(
                CACHE_DIR,
                str(now.year),
                str(now.month),
                str(now.day),
            )
            mkdir(destination_dir)
            new_css = links[link.attrib['href']].after
            stats.append((
                link.attrib['href'],
                links[link.attrib['href']].before,
                links[link.attrib['href']].after
            ))
            new_css = css_url_regex.sub(
                functools.partial(
                    css_url_replacer,
                    href=link.attrib['href']
                ),
                new_css
            )
            destination = os.path.join(destination_dir, hash_ + '.css')
            with codecs.open(destination, 'w', 'utf-8') as f:
                f.write(new_css)
            link.attrib['href'] = (
                '/cache%s' % destination.replace(CACHE_DIR, '')
            )

    for img in CSSSelector('img, script')(page):
        if 'src' in img.attrib:
            orig_src = urljoin(url, img.attrib['src'])
            img.attrib['src'] = orig_src

    for a in CSSSelector('a')(page):
        if 'href' not in a.attrib:
            continue
        href = a.attrib['href']
        if (
            '://' in href or
            href.startswith('#') or
            href.startswith('javascript:')
        ):
            continue
        if href.startswith('/'):
            a.attrib['href'] = (
                '/' +
                urljoin(url, a.attrib['href']).replace('http://', '')
            )
        if collect_stats:
            a.attrib['href'] = add_collect_stats_qs(
                a.attrib['href'],
                collect_stats
            )

    html = etree.tostring(page, method='html')
    if collect_stats:
        html = re.sub(
            '<body[^>]*>',
            lambda m: m.group() + summorize_stats_html(stats),
            html,
            flags=re.I | re.M,
            count=1
        )
    return (was_doctype or '') + '\n' + html

def proxy(path):
    if path == 'favicon.ico':
        abort(404)
    url = path
    if not path.count('://'):
        url = 'http://' + url
    query = urlparse.urlparse(request.url).query
    if query:
        url += '?%s' % query
    logging.info('Downloading %s' % url)
    t0 = time.time()
    html = download(url)
    t1 = time.time()
    print "%.4f seconds to download" % (t1 - t0)

    p = Processor(debug=False, optimize_lookup=True)
    # since we've already downloaded the HTML
    t0 = time.time()
    p.process_html(html, url)
    t1 = time.time()
    p.process()
    t2 = time.time()
    print "%.4f seconds to parse and process" % (t2 - t1)

    collect_stats = request.args.get('MINCSS_STATS', False)
    stats = []
    css_url_regex = re.compile(r'url\(([^\)]+)\)')

    def css_url_replacer(match, href=None):
        filename = match.groups()[0]
        bail = match.group()
        if ((filename.startswith('"') and filename.endswith('"')) or
                (filename.startswith("'") and filename.endswith("'"))):
            filename = filename[1:-1]
        if 'data:image' in filename or '://' in filename:
            return bail
        if filename == '.':
            # this is a known IE hack in CSS
            return bail
        new_filename = urlparse.urljoin(url, filename)
        return 'url("%s")' % new_filename

    for i, each in enumerate(p.inlines):
        # this should be using CSSSelector instead
        new_inline = each.after
        new_inline = css_url_regex.sub(
            functools.partial(css_url_replacer, href=url),
            new_inline)
        stats.append(('inline %s' % (i + 1), each.before, each.after))
        html = html.replace(each.before, new_inline)

    parser = etree.HTMLParser()
    stripped = html.strip()
    tree = etree.fromstring(stripped, parser).getroottree()
    page = tree.getroot()
    # lxml inserts a doctype if none exists, so only include it in
    # the root if it was in the original html.
    was_doctype = tree.docinfo.doctype

    links = dict((x.href, x) for x in p.links)
    for link in CSSSelector('link')(page):
        if (link.attrib.get('rel', '') == 'stylesheet' or
                link.attrib['href'].lower().split('?')[0].endswith('.css')):
            hash_ = hashlib.md5(url + link.attrib['href']).hexdigest()[:7]
            now = datetime.date.today()
            destination_dir = os.path.join(
                CACHE_DIR,
                str(now.year),
                str(now.month),
                str(now.day),
            )
            mkdir(destination_dir)
            new_css = links[link.attrib['href']].after
            stats.append(
                (link.attrib['href'],
                 links[link.attrib['href']].before,
                 links[link.attrib['href']].after))
            new_css = css_url_regex.sub(
                functools.partial(css_url_replacer,
                                  href=link.attrib['href']),
                new_css)
            destination = os.path.join(destination_dir, hash_ + '.css')
            with codecs.open(destination, 'w', 'utf-8') as f:
                f.write(new_css)
            link.attrib['href'] = (
                '/cache%s' % destination.replace(CACHE_DIR, ''))

    for img in CSSSelector('img, script')(page):
        if 'src' in img.attrib:
            orig_src = urlparse.urljoin(url, img.attrib['src'])
            img.attrib['src'] = orig_src

    for a in CSSSelector('a')(page):
        if 'href' not in a.attrib:
            continue
        href = a.attrib['href']
        if ('://' in href or
                href.startswith('#') or
                href.startswith('javascript:')):
            continue
        if href.startswith('/'):
            a.attrib['href'] = (
                '/' +
                urlparse.urljoin(url, a.attrib['href'])
                .replace('http://', ''))
        if collect_stats:
            a.attrib['href'] = add_collect_stats_qs(
                a.attrib['href'], collect_stats)

    html = etree.tostring(page, method='html')
    if collect_stats:
        html = re.sub(
            '<body[^>]*>',
            lambda m: m.group() + summorize_stats_html(stats),
            html,
            flags=re.I | re.M,
            count=1)
    return (was_doctype or '') + '\n' + html

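# A minimal sketch of the helpers proxy() assumes. Both versions above call
# download(), mkdir(), CACHE_DIR, add_collect_stats_qs() and
# summorize_stats_html() without defining them; the two simplest are sketched
# below, with names and behavior inferred purely from the call sites, not
# taken from the original module. In the original Flask app, proxy() is
# presumably registered with a catch-all route such as
# @app.route('/<path:path>').

import os
import urllib2

CACHE_DIR = '/tmp/proxy-cache'  # assumed; the real value is defined elsewhere


def download(url):
    # Hypothetical: fetch the URL and return the body as a unicode string,
    # matching how proxy() hands the result straight to process_html().
    # Assumes UTF-8; a real implementation would sniff the charset.
    return unicode(urllib2.urlopen(url).read(), 'utf-8')


def mkdir(path):
    # Hypothetical: os.makedirs() that tolerates an already-existing
    # directory, since proxy() calls it unconditionally for each date-dir.
    if not os.path.isdir(path):
        os.makedirs(path)
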
import logging

# The None checks below imply these imports are optional, along these lines:
try:
    from mincss.processor import Processor
except ImportError:
    Processor = None
try:
    import cssmin
except ImportError:
    cssmin = None


def mincss_response(response, request):
    if Processor is None or cssmin is None:
        logging.info("No mincss_response() possible")
        return response

    html = unicode(response.content, 'utf-8')
    p = Processor()
    p.process_html(html, request.build_absolute_uri())
    p.process()
    combined_css = []
    _total_before = 0
    _requests_before = 0
    for link in p.links:
        _total_before += len(link.before)
        _requests_before += 1
        # combined_css.append('/* %s */' % link.href)
        combined_css.append(link.after)
    for inline in p.inlines:
        _total_before += len(inline.before)
        combined_css.append(inline.after)

    if p.inlines:
        html = _style_regex.sub('', html)

    found_link_hrefs = [x.href for x in p.links]

    def link_remover(m):
        bail = m.group()
        for each in found_link_hrefs:
            if each in bail:
                return ''
        return bail

    html = _link_regex.sub(link_remover, html)

    _total_after = sum(len(x) for x in combined_css)
    combined_css = [cssmin.cssmin(x) for x in combined_css]
    _total_after_min = sum(len(x) for x in combined_css)

    stats_css = (
        """
/* Stats about using github.com/peterbe/mincss
   -------------------------------------------
   Requests:         %s (now: 0)
   Before:           %.fKb
   After:            %.fKb
   After (minified): %.fKb
   Saving:           %.fKb
*/"""
        % (_requests_before,
           _total_before / 1024.,
           _total_after / 1024.,
           _total_after_min / 1024.,
           (_total_before - _total_after) / 1024.)
    )
    combined_css.insert(0, stats_css)

    new_style = (
        '<style type="text/css">\n%s\n</style>' %
        ('\n'.join(combined_css)).strip()
    )
    html = html.replace(
        '</head>',
        new_style + '\n</head>'
    )
    response.content = html.encode('utf-8')
    return response

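# mincss_response() above refers to module-level _style_regex and _link_regex
# without defining them. A plausible sketch, assuming they are meant to match
# whole <style> blocks and <link> tags respectively (inferred from how they
# are used, not the original definitions):

import re

_style_regex = re.compile(r'<style.*?</style>', re.M | re.DOTALL)
_link_regex = re.compile(r'<link\s+[^>]*>', re.M)
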
def mincss_response(response, request):
    if Processor is None or cssmin is None:
        logging.info("No mincss_response() possible")
        return response

    abs_uri = request.build_absolute_uri()
    if abs_uri.startswith('http://testserver'):
        return response

    html = unicode(response.content, 'utf-8')
    p = Processor(
        preserve_remote_urls=True,
    )
    p.process_html(html, abs_uri)
    p.process()
    combined_css = []
    _total_before = 0
    _requests_before = 0
    for link in p.links:
        _total_before += len(link.before)
        _requests_before += 1
        # combined_css.append('/* %s */' % link.href)

        # Here's some ugly special-casing.
        # The autocomplete CSS is almost entirely based on javascript
        # events that mincss can't be aware of.
        # And since it's (going to be) a third-party tool, we can't
        # pepper it with /*no mincss*/ in every selector.
        if 'autocompeter' in link.href:
            # leave as is
            combined_css.append(link.before)
        else:
            combined_css.append(link.after)
    for inline in p.inlines:
        _total_before += len(inline.before)
        combined_css.append(inline.after)

    if p.inlines:
        html = _style_regex.sub('', html)

    found_link_hrefs = [x.href for x in p.links]

    def link_remover(m):
        bail = m.group()
        for each in found_link_hrefs:
            if each in bail:
                return ''
        return bail

    html = _link_regex.sub(link_remover, html)

    _total_after = sum(len(x) for x in combined_css)
    combined_css = [cssmin.cssmin(x) for x in combined_css]
    _total_after_min = sum(len(x) for x in combined_css)

    stats_css = (
        """
/* Stats about using github.com/peterbe/mincss
   -------------------------------------------
   Requests:         %s (now: 0)
   Before:           %.fKb
   After:            %.fKb
   After (minified): %.fKb
   Saving:           %.fKb
*/"""
        % (_requests_before,
           _total_before / 1024.,
           _total_after / 1024.,
           _total_after_min / 1024.,
           (_total_before - _total_after) / 1024.)
    )
    combined_css.insert(0, stats_css)

    new_style = (
        '<style type="text/css">\n%s\n</style>' %
        ('\n'.join(combined_css)).strip()
    )
    html = html.replace(
        '</head>',
        new_style + '\n</head>'
    )
    response.content = html.encode('utf-8')
    return response

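# A sketch of how mincss_response() might be mounted. Its
# (response, request) -> response signature fits Django's classic
# process_response() middleware hook; the middleware class below is an
# assumption, not the original wiring:


class MincssMiddleware(object):

    def process_response(self, request, response):
        # Only post-process successful HTML responses; everything else
        # passes through untouched.
        if (response.status_code == 200 and
                'text/html' in response.get('Content-Type', '')):
            return mincss_response(response, request)
        return response
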
from django.core.cache import cache  # assumed: the snippet uses Django's cache

logger = logging.getLogger(__name__)  # assumed: the module's logger isn't shown


def _mincss_response(response, request):
    if Processor is None or cssmin is None:
        logging.info("No mincss_response() possible")
        return response

    abs_uri = request.build_absolute_uri()
    if abs_uri.startswith('http://testserver'):
        return response

    lock_key = 'lock:' + hashlib.md5(request.path).hexdigest()
    if cache.get(lock_key):
        # we're actively busy prepping this one
        print "Bailing because mincss_response is already busy for: %s" % (
            request.path + request.META.get('QUERY_STRING'),
        )
        return response
    cache.set(lock_key, True, 200)
    print "Starting to mincss for: %s" % (
        request.path + request.META.get('QUERY_STRING'),
    )

    html = unicode(response.content, 'utf-8')
    t0 = time.time()
    p = Processor(
        preserve_remote_urls=True,
    )
    p.process_html(html, abs_uri)
    p.process()
    t1 = time.time()

    combined_css = []
    _total_before = 0
    _requests_before = 0
    for link in p.links:
        _total_before += len(link.before)
        _requests_before += 1
        combined_css.append(link.after)
    for inline in p.inlines:
        _total_before += len(inline.before)
        combined_css.append(inline.after)

    if p.inlines:
        html = _style_regex.sub('', html)

    found_link_hrefs = [x.href for x in p.links]

    def link_remover(m):
        bail = m.group()
        for each in found_link_hrefs:
            if each in bail:
                return ''
        return bail

    html = _link_regex.sub(link_remover, html)

    _total_after = sum(len(x) for x in combined_css)
    combined_css = [cssmin.cssmin(x) for x in combined_css]
    _total_after_min = sum(len(x) for x in combined_css)
    t2 = time.time()

    template = """
/* Stats about using github.com/peterbe/mincss
   -------------------------------------------
   Requests:         %s (now: 0)
   Before:           %.fKb
   After:            %.fKb
   After (minified): %.fKb
   Saving:           %.fKb
*/"""
    stats_css = template % (
        _requests_before,
        _total_before / 1024.,
        _total_after / 1024.,
        _total_after_min / 1024.,
        (_total_before - _total_after) / 1024.
    )
    combined_css.insert(0, stats_css)

    new_style = (
        '<style type="text/css">\n%s\n</style>' %
        ('\n'.join(combined_css)).strip()
    )
    html = html.replace(
        '</head>',
        new_style + '\n</head>'
    )

    logger.info('Took %.2fms to process with mincss' % (
        (t1 - t0) * 1000,
    ))
    logger.info('Took %.2fms to post-process remaining CSS' % (
        (t2 - t1) * 1000,
    ))

    _save_mincssed_html(
        request.path + request.META.get('QUERY_STRING'),
        html
    )
    response.content = html.encode('utf-8')
    return response

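# _mincss_response() above calls _save_mincssed_html(), which isn't shown.
# A plausible sketch, assuming it parks the rewritten HTML in Django's cache
# for a later request to serve; the key scheme and timeout are guesses:

import hashlib

from django.core.cache import cache


def _save_mincssed_html(path, html):
    # Hypothetical: hash the path+querystring the same way as the lock key
    # above so the result can be looked up symmetrically.
    key = 'mincssed:' + hashlib.md5(path).hexdigest()
    cache.set(key, html, 60 * 60)  # assumed one-hour lifetime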