Exemplo n.º 1
0
def parse_article(fn):
    """Print the processed ``src`` of every ``<img>`` element in an HTML file.

    The file *fn* is parsed with ``etree.HTMLParser``; for each ``<img>``
    found under the root, its ``src`` attribute is passed through
    ``layout.norm_ext_img_url`` and the result is printed to stdout.

    A ``UnicodeEncodeError`` raised while converting or printing a URL is
    reported on stderr (with the file name and the raw ``src``) and the
    loop moves on to the next image.

    :param fn: path of the HTML file to scan.
    """
    parser = etree.HTMLParser()
    # Use a context manager so the file handle is closed as soon as parsing
    # is done instead of lingering until garbage collection.
    with open(fn) as f:
        tree = etree.parse(f, parser)
    e = tree.getroot()
    for i in e.xpath('.//img'):
        # NOTE(review): ``src`` is None for an <img> without that attribute;
        # whether norm_ext_img_url accepts None is not visible here.
        src = i.get('src')
        try:
            print(layout.norm_ext_img_url(src))
        except UnicodeEncodeError:
            sys.stderr.write('UnicodeError %s %r \n' % (fn, src))
Exemplo n.º 2
0
def parse_css(css, cssimagedir):
    """Download the images referenced by ``url(...)`` in *css*.

    Every ``url(...)`` target found in *css* that does not start with
    ``data:``, plus the entries of ``extra``, is converted with the
    ``layout`` helpers into a remote URL, a local URL and a file name.
    The remote resource is fetched and stored under *cssimagedir*, and
    occurrences of the original target in *css* are replaced by the local
    URL. A download that fails with ``urllib2.URLError`` is reported on
    stdout and that target is left unchanged.

    NOTE(review): ``extra`` is not defined in this function; presumably it
    is a module-level list -- confirm.
    NOTE(review): the rewritten ``css`` is not returned within the lines
    shown here; callers only observe the downloaded files unless the value
    is returned elsewhere -- confirm.

    :param css: stylesheet text to scan.
    :param cssimagedir: directory that receives the downloaded files.
    """
    urls = [u for u in re.findall(r'url\((.*?)\)', css) if not u.startswith('data:')] + extra
    for u in urls:
        url = layout.norm_ext_img_url(u)
        lurl = layout.ext_img_url2local_cssimg_url(url)
        fn = layout.ext_img_url2fn(url, keep_ext=False)
        ofn = os.path.join(cssimagedir, fn)
        # Fetch first, so that a failed download does not leave an empty
        # file behind, and always release the connection.
        try:
            response = urllib2.urlopen(url)
            try:
                data = response.read()
            finally:
                response.close()
        except urllib2.URLError as e:
            print('ERR %s %s' % (e, url))
            continue
        # Binary mode: the payload is image data and must be written
        # without newline translation.
        with open(ofn, 'wb') as out:
            out.write(data)
        css = css.replace(u, lurl)