def test_just_inline(self):
    """Processing one.html should shrink its single inline <style> block."""
    html_path = os.path.join(HERE, 'one.html')
    processor = Processor()
    processor.process('file://' + html_path)
    # one.html only has 1 block of inline CSS; it starts on line 7
    inline = processor.inlines[0]
    eq_(inline.line, 7)
    ok_(len(inline.after) < len(inline.before))
    expect = '''
    h1, h2, h3 { text-align: center; }
    h3 { font-family: serif; }
    h2 { color:red }
    '''
    # compare the minified result line by line
    after_lines = inline.after.strip().splitlines()
    for index, expected_line in enumerate(expect.strip().splitlines()):
        eq_(expected_line.strip(), after_lines[index].strip())
def test_just_one_link(self):
    """Processing two.html should minify the one stylesheet it links to."""
    html_path = os.path.join(HERE, 'two.html')
    processor = Processor()
    processor.process('file://' + html_path)
    # two.html only has 1 link CSS ref
    link = processor.links[0]
    eq_(link.href, 'two.css')
    #eq_(link.url, url.replace('.html', '.css'))
    ok_(len(link.after) < len(link.before))
    expect = '''
    body, html { margin: 0; }
    h1, h2, h3 { text-align: center; }
    h3 { font-family: serif; }
    h2 { color:red }
    '''
    # compare the minified result line by line
    after_lines = link.after.splitlines()
    for index, expected_line in enumerate(expect.strip().splitlines()):
        eq_(expected_line.strip(), after_lines[index].strip())
def test_make_absolute_url(self):
    """make_absolute_url resolves relative, root- and protocol-relative hrefs."""
    processor = Processor()
    # (base URL, href, expected absolute URL)
    cases = [
        ('http://www.com/', './style.css', 'http://www.com/style.css'),
        ('http://www.com', './style.css', 'http://www.com/style.css'),
        ('http://www.com', '//cdn.com/style.css', 'http://cdn.com/style.css'),
        ('http://www.com/', '//cdn.com/style.css', 'http://cdn.com/style.css'),
        ('http://www.com/', '/style.css', 'http://www.com/style.css'),
        ('http://www.com/elsewhere', '/style.css', 'http://www.com/style.css'),
        ('http://www.com/elsewhere/', '/style.css', 'http://www.com/style.css'),
        ('http://www.com/elsewhere/', './style.css',
         'http://www.com/elsewhere/style.css'),
        ('http://www.com/elsewhere', './style.css', 'http://www.com/style.css'),
    ]
    for base, href, expected in cases:
        eq_(processor.make_absolute_url(base, href), expected)
def run(args):
    """Run mincss over ``args.url`` and report/dump the results.

    Prints the before/after for every inline style block, writes each linked
    stylesheet (both the original and the minified version) into
    ``args.outputdir``, and returns 0 as a shell exit code.
    """
    options = {'debug': args.verbose}
    if args.phantomjs_path:
        options['phantomjs'] = args.phantomjs_path
    elif args.phantomjs:
        options['phantomjs'] = True
    p = Processor(**options)
    t0 = time.time()
    p.process(args.url)
    t1 = time.time()
    print("TOTAL TIME ", t1 - t0)
    for inline in p.inlines:
        print("ON", inline.url)
        print("AT line", inline.line)
        print("BEFORE ".ljust(79, '-'))
        print(inline.before)
        print("AFTER ".ljust(79, '-'))
        print(inline.after)
        print()
    output_dir = args.outputdir
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    for link in p.links:
        print("FOR", link.href)
        #print("BEFORE ".ljust(79, '-'))
        #print(link.before)
        #print("AFTER ".ljust(79, '-'))
        #print(link.after)
        orig_name = link.href.split('/')[-1]
        # use an explicit encoding so non-ASCII CSS round-trips regardless
        # of the locale default
        with open(os.path.join(output_dir, orig_name),
                  'w', encoding='utf-8') as f:
            f.write(link.after)
        before_name = 'before_' + link.href.split('/')[-1]
        # BUG FIX: previously wrote link.before.encode('utf-8') (bytes) into
        # a text-mode file, which raises TypeError on Python 3; write the
        # string and let the file's encoding handle it
        with open(os.path.join(output_dir, before_name),
                  'w', encoding='utf-8') as f:
            f.write(link.before)
        print("Files written to", output_dir)
        print()
        print('(from %d to %d saves %d)' % (len(link.before), len(
            link.after), len(link.before) - len(link.after)))
    return 0
def test_download_with_phantomjs(self):
    """Downloading via phantomjs should give the same inline-CSS result."""
    html_path = os.path.join(HERE, 'one.html')
    processor = Processor(
        phantomjs=PHANTOMJS,
        phantomjs_options={'cookies-file': 'bla'},
    )
    processor.process('file://' + html_path)
    # one.html only has 1 block of inline CSS; it starts on line 7
    inline = processor.inlines[0]
    eq_(inline.line, 7)
    ok_(len(inline.after) < len(inline.before))
    expect = '''
    h1, h2, h3 { text-align: center; }
    h3 { font-family: serif; }
    h2 { color:red }
    '''
    # compare the minified result line by line
    after_lines = inline.after.strip().splitlines()
    for index, expected_line in enumerate(expect.strip().splitlines()):
        eq_(expected_line.strip(), after_lines[index].strip())
def test_one_link_two_different_pages(self):
    """Selectors used by EITHER of two processed pages must be kept."""
    first_url = 'file://' + os.path.join(HERE, 'two.html')
    second_url = 'file://' + os.path.join(HERE, 'two_half.html')
    processor = Processor()
    processor.process(first_url, second_url)
    # two.html only has 1 link CSS ref
    link = processor.links[0]
    eq_(link.href, 'two.css')
    ok_(len(link.after) < len(link.before))
    expect = '''
    body, html { margin: 0; }
    h1, h2, h3 { text-align: center; }
    h3 { font-family: serif; }
    .foobar { delete:me }
    .foobar, h2 { color:red }
    '''
    # compare the minified result line by line
    after_lines = link.after.splitlines()
    for index, expected_line in enumerate(expect.strip().splitlines()):
        eq_(expected_line.strip(), after_lines[index].strip())
def test_pseudo_selectors_hell(self):
    """Pseudo-class/-element selectors are kept or dropped as appropriate."""
    html_path = os.path.join(HERE, 'three.html')
    processor = Processor(preserve_remote_urls=False)
    processor.process('file://' + html_path)
    # three.html only has 1 link CSS ref
    after = processor.links[0].after
    # (CSS snippet, whether it should survive minification),
    # checked in the same order as before
    checks = [
        ('a.three:hover', True),
        ('a.hundred:link', False),
        ('.container > a.one', True),
        ('.container > a.notused', False),
        ('input[type="button"]', False),
        ('input[type="search"]::-webkit-search-decoration', True),
        ('input[type="reset"]::-webkit-search-decoration', False),
        ('input[type="search"]::-webkit-search-decoration', True),
        ('textarea:-moz-autofill', False),
        (':-moz-autofill', False),
        ('@media (max-width: 900px)', True),
        ('.container .two', True),
        ('a.four', False),
        ('::-webkit-input-placeholder', True),
        (':-moz-placeholder {', True),
        ('div::-moz-focus-inner', True),
        ('button::-moz-focus-inner', False),
        ('@-webkit-keyframes progress-bar-stripes', True),
        ('from {', True),
        # some day perhaps this can be untangled and parsed too
        ('@import url(other.css)', True),
    ]
    for snippet, expected_present in checks:
        ok_((snippet in after) == expected_present)
def test_html_with_totally_empty_style_tag(self):
    """A completely empty <style> tag should yield no inline results."""
    page_url = 'file://' + os.path.join(HERE, 'one-3.html')
    processor = Processor()
    processor.process(page_url)
    eq_(processor.inlines, [])
def test_no_mincss_inline(self):
    """An inline block opted out of mincss must pass through untouched."""
    page_url = 'file://' + os.path.join(HERE, 'no-mincss-inline.html')
    processor = Processor()
    processor.process(page_url)
    first = processor.inlines[0]
    eq_(first.before, first.after)
def test_ignore_inline(self):
    """Ignored inline styles should not be collected at all."""
    page_url = 'file://' + os.path.join(HERE, 'ignore-inline.html')
    processor = Processor()
    processor.process(page_url)
    assert not processor.inlines
def test_ignore_link(self):
    """Ignored stylesheet links should not be collected at all."""
    page_url = 'file://' + os.path.join(HERE, 'ignore-link.html')
    processor = Processor()
    processor.process(page_url)
    assert not processor.links
def proxy(path):
    """Fetch *path* as a URL, run mincss over it, and return the rewritten page.

    Minified CSS for each linked stylesheet is written to a dated directory
    under CACHE_DIR and the <link> href is rewritten to /cache/...; inline
    styles are replaced in place. img/script srcs are made absolute, and
    same-site anchors are rewritten so navigation keeps going through the
    proxy. Optionally injects a stats summary when MINCSS_STATS is requested.
    """
    if path == 'favicon.ico':
        abort(404)
    url = path
    # default to http:// when the client passed a bare host/path
    if not path.count('://'):
        url = 'http://' + url
    # carry the original request's query string over to the upstream URL
    query = urlparse.urlparse(request.url).query
    if query:
        url += '?%s' % query
    logging.info('Downloading %s' % url)
    t0 = time.time()
    html = download(url)
    t1 = time.time()
    print "%.4f seconds to download" % (t1 - t0)
    p = Processor(debug=False, optimize_lookup=True)
    # since we've already download the HTML
    t0 = time.time()
    p.process_html(html, url)
    t1 = time.time()
    p.process()
    t2 = time.time()
    print "%.4f seconds to parse and process" % (t2 - t1)
    collect_stats = request.args.get('MINCSS_STATS', False)
    stats = []
    # matches url(...) tokens inside CSS text; group 1 is the bare reference
    css_url_regex = re.compile('url\(([^\)]+)\)')

    def css_url_replacer(match, href=None):
        # Rewrite one CSS url(...) token to an absolute URL.
        # NOTE(review): href is only referenced by the commented-out code
        # below; the current implementation resolves against the page `url`.
        filename = match.groups()[0]
        bail = match.group()  # the untouched original token
        if ((filename.startswith('"') and filename.endswith('"')) or
                (filename.startswith("'") and filename.endswith("'"))):
            filename = filename[1:-1]
        # leave data URIs and already-absolute URLs alone
        if 'data:image' in filename or '://' in filename:
            return bail
        if filename == '.':
            # this is a known IE hack in CSS
            return bail
        #if not filename.startswith('/'):
        #    filename = os.path.normpath(
        #        os.path.join(
        #            os.path.dirname(href),
        #            filename
        #        )
        #    )
        new_filename = urlparse.urljoin(url, filename)
        return 'url("%s")' % new_filename

    for i, each in enumerate(p.inlines):
        # this should be using CSSSelector instead
        new_inline = each.after
        new_inline = css_url_regex.sub(
            functools.partial(css_url_replacer, href=url),
            new_inline
        )
        stats.append(('inline %s' % (i + 1), each.before, each.after))
        # swap the original inline CSS for the minified version
        html = html.replace(each.before, new_inline)

    parser = etree.HTMLParser()
    stripped = html.strip()
    tree = etree.fromstring(stripped, parser).getroottree()
    page = tree.getroot()
    # lxml inserts a doctype if none exists, so only include it in
    # the root if it was in the original html.
    was_doctype = tree.docinfo.doctype
    #root = tree if stripped.startswith(tree.docinfo.doctype) else page
    # index the processed stylesheets by their href for quick lookup
    links = dict((x.href, x) for x in p.links)
    #all_lines = html.splitlines()
    for link in CSSSelector('link')(page):
        if (link.attrib.get('rel', '') == 'stylesheet' or
                link.attrib['href'].lower().split('?')[0].endswith('.css')):
            # short hash of (page URL + href) names the cached file
            hash_ = hashlib.md5(url + link.attrib['href']).hexdigest()[:7]
            now = datetime.date.today()
            # dated subdirectory, e.g. CACHE_DIR/2024/1/31
            destination_dir = os.path.join(
                CACHE_DIR,
                str(now.year),
                str(now.month),
                str(now.day),
            )
            mkdir(destination_dir)
            new_css = links[link.attrib['href']].after
            stats.append((
                link.attrib['href'],
                links[link.attrib['href']].before,
                links[link.attrib['href']].after
            ))
            new_css = css_url_regex.sub(
                functools.partial(css_url_replacer,
                                  href=link.attrib['href']),
                new_css
            )
            destination = os.path.join(destination_dir, hash_ + '.css')
            with codecs.open(destination, 'w', 'utf-8') as f:
                f.write(new_css)
            # point the <link> at our cached, minified copy
            link.attrib['href'] = (
                '/cache%s' % destination.replace(CACHE_DIR, '')
            )
    # make img/script srcs absolute so they still load from the origin
    for img in CSSSelector('img, script')(page):
        if 'src' in img.attrib:
            orig_src = urlparse.urljoin(url, img.attrib['src'])
            img.attrib['src'] = orig_src
    for a in CSSSelector('a')(page):
        if 'href' not in a.attrib:
            continue
        href = a.attrib['href']
        # leave absolute, fragment-only and javascript: links untouched
        if ('://' in href or href.startswith('#') or
                href.startswith('javascript:')):
            continue
        if href.startswith('/'):
            # rewrite root-relative links so navigation stays on the proxy
            a.attrib['href'] = (
                '/' +
                urlparse.urljoin(url, a.attrib['href'])
                .replace('http://', '')
            )
        #else:
        if collect_stats:
            a.attrib['href'] = add_collect_stats_qs(
                a.attrib['href'],
                collect_stats
            )
    html = etree.tostring(page, method='html')
    if collect_stats:
        # inject the stats summary right after the opening <body> tag
        html = re.sub(
            '<body[^>]*>',
            lambda m: m.group() + summorize_stats_html(stats),
            html,
            flags=re.I | re.M,
            count=1
        )
    # prepend the original doctype (if any) that lxml stripped off
    return (was_doctype and was_doctype or '') + '\n' + html
def process(urls):
    """Run a Processor over *urls* and return it for inspection."""
    processor = Processor()
    processor.process(*urls)
    return processor
def test_make_absolute_url(self): p = Processor()