Example #1
    def save_attachments(self, html, document, prefix, tmpdir):
        """ Place attachments needed by the html of this document into tmpdir. Only attachments
        referenced using the given prefix are saved.
        """
        html = lxml.html.fromstring(html)
        prefix_len = len(prefix)

        # gather up the attachments that occur in the html
        imgs = [
            img for img in html.iter('img')
            if img.get('src', '').startswith(prefix)
        ]
        fnames = set(img.get('src')[prefix_len:] for img in imgs)

        # ensure the media directory exists
        media_dir = os.path.join(tmpdir, prefix)
        os.makedirs(media_dir, exist_ok=True)

        for attachment in document.attachments.all():
            # the src attribute values in fnames are URL-quoted
            if urllib.parse.quote(attachment.filename) in fnames:
                # save the attachment into tmpdir
                fname = os.path.join(media_dir, attachment.filename)
                with open(fname, "wb") as f:
                    shutil.copyfileobj(attachment.file, f)

        # make img references absolute
        # see https://github.com/wkhtmltopdf/wkhtmltopdf/issues/2660
        for img in imgs:
            img.set('src', os.path.join(tmpdir, img.get('src')))

        return lxml.html.tostring(html, encoding='unicode')
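A minimal, self-contained sketch of the prefix-based <img> filtering idiom used above; the HTML fragment and the 'media/' prefix are illustrative assumptions, not part of the original project.

import lxml.html

doc = lxml.html.fromstring(
    '<div><img src="media/a.png"><img src="https://example.com/b.png"></div>')
prefix = 'media/'
# keep only images whose src starts with the prefix, then strip the prefix
fnames = {img.get('src')[len(prefix):]
          for img in doc.iter('img')
          if img.get('src', '').startswith(prefix)}
print(fnames)  # {'a.png'}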
Example #2
    def save_attachments(self, html, document, prefix, tmpdir):
        """ Place attachments needed by the html of this document into tmpdir. Only attachments
        referenced using the given prefix are saved.
        """
        html = lxml.html.fromstring(html)
        prefix_len = len(prefix)

        # gather up the attachments that occur in the html
        fnames = set(
            img.get('src')[prefix_len:]
            for img in html.iter('img')
            if img.get('src', '').startswith(prefix)
        )

        # ensure the media directory exists
        media_dir = os.path.join(tmpdir, prefix)
        os.makedirs(media_dir, exist_ok=True)

        for attachment in document.attachments.all():
            # the src attribute values in fnames are URL-quoted
            if urllib.parse.quote(attachment.filename) in fnames:
                # save the attachment into tmpdir
                fname = os.path.join(media_dir, attachment.filename)
                with open(fname, "wb") as f:
                    shutil.copyfileobj(attachment.file, f)
Example #3
def make_body_images_inline(body):
    """Looks for images inside the body and make them inline.

    Before sending a message in HTML format, it is necessary to find
    all img tags contained in the body in order to rewrite them. For
    example, icons provided by CKeditor are stored on the server
    filesystem and not accessible from the outside. We must embed
    them as parts of the MIME message if we want recipients to
    display them correctly.

    :param body: the HTML body to parse
    """
    html = lxml.html.fromstring(body)
    parts = []
    for tag in html.iter("img"):
        src = tag.get("src")
        if src is None:
            continue
        o = urlparse(src)
        path = urllib.unquote(os.path.join(settings.BASE_DIR, o.path[1:]))
        if not os.path.exists(path):
            continue
        fname = os.path.basename(path)
        cid = "%s@modoboa" % os.path.splitext(fname)[0]
        tag.set("src", "cid:%s" % cid)
        with open(path, "rb") as fp:
            part = MIMEImage(fp.read())
        part["Content-ID"] = "<%s>" % cid
        part.replace_header("Content-Type",
                            '%s; name="%s"' % (part["Content-Type"], fname))
        part["Content-Disposition"] = "inline"
        parts.append(part)
    return lxml.html.tostring(html), parts
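As a hedged usage sketch (not part of the original module), the returned (html, parts) pair would typically be assembled into a multipart/related message so that each cid: reference resolves to its inline image; the calling code below is an assumption.

from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

# `body` is assumed to hold the original HTML string
new_body, image_parts = make_body_images_inline(body)
msg = MIMEMultipart(_subtype="related")
# on Python 3, lxml.html.tostring() returns bytes, so decode new_body first
msg.attach(MIMEText(new_body, _subtype="html"))
for part in image_parts:
    msg.attach(part)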
Example #4
def sanitize_html(html_file):
  html = lxml.html.fromstring(html_file)

  for element in html.xpath("//script|//style|//meta|//link|//option|//iframe"):
    element.getparent().remove(element) 

  return (h for h in html.iter() if h.tag != etree.Comment)  
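A small usage sketch with an illustrative document: the unwanted tags are dropped and the generator yields the remaining elements, skipping comment nodes.

page = ('<html><head><script>track()</script></head>'
        '<body><!-- note --><p>hello</p></body></html>')
for element in sanitize_html(page):
    print(element.tag)  # html, head, body, p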
Example #5
def get_html_attributes(html):
    """ Take an lxml ElementTree; return Counter of attributes; count once. """
    attributes = set()
    for element in html.iter():
        for attribute in element.keys():
            element_attribute = ' '.join([element.tag, attribute])
            attributes.add(element_attribute)
    return Counter(attributes)
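Illustrative call (the markup is made up): each tag/attribute pair is collected into a set first, so it is counted once per document no matter how many elements carry it.

import lxml.html

tree = lxml.html.fromstring('<div id="a"><p class="x">1</p><p class="y">2</p></div>')
print(get_html_attributes(tree))  # both pairs counted once: Counter({'div id': 1, 'p class': 1})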
Example #6
    def _fix_html(self, value):
        html = lxml.html.fromstring(value)

        for elem in html.iter():
            if elem.text:
                elem.text = self._fix_text(elem.text)
            if elem.tail:
                elem.tail = self._fix_text(elem.tail)

        return lxml.html.tostring(html).decode()
Example #7
def convert(doc: str):
    output = []
    html = lxml.html.fromstring(doc)

    for element in html.iter():
        res = fsm(element)
        if res:
            output.append(res)

    return '\n'.join(output)
Example #8
def getLastPage(url):
    html = lxml.html.fromstring(openUrl(url).read())
    for element in html.iter():
        if element.tag == 'a':
            try:
                if element.attrib['class'] == 'last':
                    lastPage = 'http://wtcdata.nist.gov%s' % element.attrib['href']
                    lastPage = lastPage.split('=')[-1]
                    return lastPage
            except KeyError:
                pass
Example #9
def convert_google_sheet(sid, gid, options):
    html = parse_google_document(
        'https://docs.google.com/spreadsheets/d/{sid}/htmlembed/sheet?gid={gid}&{options}'
            .format(sid=sid, gid=gid, options=options),
        errhelp={'sid' : sid, 'gid' : gid} )
    for script in html.iter('script'):
        v = script.get('src')
        if v is None:
            #pass #script.getparent().remove(script)
            script.text = script.text.replace("CHARTS_EXPORT_URI.push('","CHARTS_EXPORT_URI.push('https://docs.google.com")
        else:
            script.set('src',"https://docs.google.com"+v)
        
    html.find('head/link').rewrite_links(
        lambda s: 'https://docs.google.com' + s )
    html.find('head').append(lxml.html.Element( 'link',
        rel='stylesheet', href=url_for('static', filename='metatable.css'),
    ))
    html.find('body').append(lxml.html.Element( 'script',
        src="https://ajax.googleapis.com/ajax/libs/jquery/3.1.1/jquery.min.js"
    ))
    html.find('body').append(lxml.html.Element( 'script',
        src=url_for('static', filename='metatable.js')
    ))
    script = lxml.html.Element('script')
    script.text = ( "$(init); "
        "function init() { "
            "$('body').css('overflow', 'hidden'); "
            "var $table = $('#sheets-viewport table').detach(); "
            "var $metatable = create_metatable($table); "
            "$('body').empty().append($metatable); "
            "$metatable.resize(); "
        " }" 
        "$('.row-header-wrapper').remove();"  
        #"$('td').css('min-width', '100px');"
        "$(window).bind('load', function() {"
        "i=1;"
        "tableWidth=0;"
        "while (true) {  idStr = '#0C'+i.toString(); obj = $(idStr); if (obj[0]==undefined) {break;}; wstr=obj[0].style.width.replace('px', ''); tableWidth+=parseInt(wstr); i++; }"
        "tblList = $('table.waffle');"
        "tblList[1].style.width=tableWidth.toString()+'px';"   
        "tblList[3].style.width=tableWidth.toString()+'px';"   
        "initCharts();"

        "});"
        )
    html.find('body').append(script)
    # with open("output.txt", "w") as text_file:
    #     text_file.write(lxml.html.tostring(html, encoding='utf-8'))
    
    return b'<!DOCTYPE html>\n<meta charset="UTF-8">\n' + \
        lxml.html.tostring(html, encoding='utf-8')
Example #10
def parse_to_raw_body(infilename, rewritten_input, make_transclusions):
    assert not rewritten_input, "no input file rewriting for .html input"
    transclusions = make_transclusions({})
    infile = (open(infilename, 'rb')
              if isinstance(infilename, basestring) else infilename)
    try:
        s = infile.read()
    finally:
        infile.close()
    html = parse_html(s)
    lang = next(html.iter()).attrib.get('lang', None)  #pylint: disable=W0612
    handle_data_url = transclusions.add_data_url
    raw_body = parse_body(html.find('body'), handle_data_url=handle_data_url)
    return raw_body, transclusions, []
Example #11
def convert(mode, url):

    url_translator = URLTranslator(mode, url)

    # get the page and parse it with lxml
    html_str = requests.get(url).text
    html_str = UnicodeDammit(html_str).unicode
    html = lxml.html.document_fromstring(html_str, base_url=url)

    # clean up the page
    for el in html.iter():
        # support the HTML 'base' tag
        if el.tag == 'base' and el.get('href'):
            url_translator.register_base(el.get('href'))

        # remove comments
        if isinstance(el, lxml.html.HtmlComment):
            el.getparent().remove(el)
            continue

        # completely remove bad tags
        if el.tag in ('img', 'link', 'script', 'style', 'meta', 'iframe'):
            el.getparent().remove(el)
            continue

        # remove bad/useless attributes
        #   bad = causes extra downloads and/or adds bloat
        #   useless = doesn't do anything useful w/o CSS/JS
        # note that we don't remove ids since they can be used as URL anchors
        # iterate over a snapshot of the keys so attributes can be deleted safely
        for attr in list(el.attrib):
            if attr.startswith('on') or attr in ('style', 'class'):
                del el.attrib[attr]

        # translate input[type=image] to submit
        if el.tag == 'input' and el.get('type') == 'image':
            el.attrib['type'] = 'submit'
            if el.get('src'):
                del el.attrib['src']
            if el.get('alt'):
                el.attrib['value'] = el.get('alt')
                del el.attrib['alt']

        # TODO: translate <noscript> to regular text

        # translate URL-containing attributes
        for attr in ('src', 'href'):
            if attr in el.attrib and el.attrib[attr]:
                el.attrib[attr] = url_translator(el.attrib[attr])

    return lxml.html.tostring(html, encoding='utf-8')
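A minimal, standalone sketch (with made-up markup) of the attribute-stripping step above: iterating over a snapshot of the attribute keys lets the loop delete entries safely.

import lxml.html

el = lxml.html.fromstring('<a href="/x" onclick="go()" class="btn" style="color:red">x</a>')
for attr in list(el.attrib):
    if attr.startswith('on') or attr in ('style', 'class'):
        del el.attrib[attr]
print(lxml.html.tostring(el))  # -> <a href="/x">x</a>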
Example #12
def parse_to_raw_body(infilename, rewritten_input, make_transclusions):
    assert not rewritten_input, "no input file rewriting for .html input"
    transclusions = make_transclusions({})
    infile = (open(infilename, 'rb') if isinstance(infilename, basestring)
              else infilename)
    try:
        s = infile.read()
    finally:
        infile.close()
    html = parse_html(s)
    lang = next(html.iter()).attrib.get('lang', None) #pylint: disable=W0612
    handle_data_url = transclusions.add_data_url
    raw_body = parse_body(html.find('body'), handle_data_url=handle_data_url)
    return raw_body, transclusions, []
Example #13
def google_spreadsheet_data(sid):
    html = parse_google_document(
        'https://docs.google.com/spreadsheets/d/{sid}/pubhtml?widget=true&range=1:70'
        .format(sid=sid),
        errhelp={'sid': sid})

    title = html.find('head/title').text
    sheets = []
    for script in html.iter('script'):
        if script.text is None:
            continue
        for match in SHEET_PATTERN.finditer(script.text):
            sheets.append(match.groupdict())
        if sheets:
            break
    return title, sheets
Example #14
def getFullJpg(itemHTML):
    ### Get link to image:
    links = lxml.html.iterlinks(itemHTML)
    for element, attribute, link, pos in links:
        if link.endswith('?g2_imageViewsIndex=1'):
            link = "http://wtcdata.nist.gov%s" % link
            html = lxml.html.fromstring(openUrl(link).read())
            for element in html.iter():
                try:
                    if element.tag == 'img':
                        if element.attrib['id'] == 'IFid1':
                            mediaUrl = element.attrib['src']
                            mediaUrl = link = "http://wtcdata.nist.gov%s" % mediaUrl
                            return mediaUrl
                except KeyError:
                    pass
Example #15
def get_article(data,
                url=None,
                encoding_in=None,
                encoding_out='unicode',
                debug=False,
                threshold=5):
    " Input a raw html string, returns a raw html string of the article "

    html = parse(data, encoding_in)
    score_all(html)

    # rank all nodes (largest to smallest)
    ranked_nodes = sorted(html.iter(),
                          key=lambda x: get_score(x),
                          reverse=True)

    # minimum threshold
    if not len(ranked_nodes) or get_score(ranked_nodes[0]) < threshold:
        return None

    # take the common ancestor of the two highest rated nodes
    if len(ranked_nodes) > 1:
        best = lowest_common_ancestor(ranked_nodes[0], ranked_nodes[1], 3)

    else:
        best = ranked_nodes[0]

    # clean up
    if not debug:
        keep_threshold = get_score(ranked_nodes[0]) * 3 / 4
        clean_root(best, keep_threshold)

    # check for spammy content (links only)
    wc = count_words(best.text_content())
    wca = count_words(' '.join(
        [x.text_content() for x in best.findall('.//a')]))

    if not debug and (wc - wca < 50 or float(wca) / wc > 0.3):
        return None

    # fix urls
    if url:
        best.make_links_absolute(url)

    return lxml.etree.tostring(best if not debug else html,
                               method='html',
                               encoding=encoding_out)
Example #16
def convert_google_sheet(sid, gid):
    html = parse_google_document(
        'https://docs.google.com/spreadsheets/d/{sid}/pubhtml/sheet?gid={gid}'.
        format(sid=sid, gid=gid),
        errhelp={
            'sid': sid,
            'gid': gid
        })
    for script in html.iter('script'):
        script.getparent().remove(script)
    for link in html.find('head').iter('link'):
        link.rewrite_links(lambda s: 'https:' + s if s.startswith('//') else
                           'https://docs.google.com' + s)
    html.find('head').append(
        lxml.html.Element(
            'link',
            rel='stylesheet',
            href=url_for('static', filename='metatable.css'),
        ))
    html.find('body').append(
        lxml.html.Element(
            'script',
            src=
            "https://ajax.googleapis.com/ajax/libs/jquery/3.6.0/jquery.min.js")
    )
    html.find('body').append(
        lxml.html.Element('script',
                          src=url_for('static', filename='metatable.js')))
    script = lxml.html.Element('script')
    script.text = ("$(init); "
                   "function init() { "
                   "$('body').css('overflow', 'hidden'); "
                   "var $viewport = $('#sheets-viewport').detach(); "
                   "var $table = $viewport.find('table').detach(); "
                   "var $svgs = $viewport.find('svg'); "
                   "var $metatable = create_metatable($table); "
                   "$('body').empty(); "
                   "$('body').append($svgs); "
                   "$('body').append($metatable); "
                   "$viewport.remove(); "
                   "$metatable.resize(); "
                   " }")
    html.find('body').append(script)
    return b'<!DOCTYPE html>\n<meta charset="UTF-8">\n' + \
        lxml.html.tostring(html, encoding='utf-8')
Example #17
def html2plaintext(content):
    """HTML to plain text translation

    :param content: some HTML content
    """
    html = lxml.html.fromstring(content)
    plaintext = ""
    for ch in html.iter():
        p = None
        if ch.text is not None:
            p = ch.text.strip('\r\t\n')
        if ch.tag == "img":
            p = ch.get("alt")
        if p is None:
            continue
        plaintext += p + "\n"

    return plaintext
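A quick illustrative call: text nodes are emitted line by line, and <img> tags contribute their alt text instead of a source reference.

print(html2plaintext('<div>Hello<img src="x.png" alt="logo"><p>world</p></div>'))
# Hello
# logo
# world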
Example #19
def find_images_in_body(body):
    """Looks for images inside a HTML body

    Before sending a message in HTML format, it is necessary to find
    all img tags contained in the body in order to rewrite them. For
    example, icons provided by CKeditor are stored on the server
    filesystem and not accessible from the outside. We must embed
    them as parts of the MIME message if we want recipients to
    display them correctly.

    :param body: the HTML body to parse
    """
    from email.mime.image import MIMEImage
    from urlparse import urlparse

    html = lxml.html.fromstring(body)
    parts = []
    for tag in html.iter("img"):
        src = tag.get("src")
        if src is None:
            continue
        o = urlparse(src)
        fname = os.path.basename(o.path)
        path = os.path.join(settings.MEDIA_ROOT, "webmail", fname)
        if not os.path.exists(path):
            continue
        cid = "%s@modoboa" % os.path.splitext(fname)[0]
        tag.set("src", "cid:%s" % cid)
        fp = open(path, "rb")
        p = MIMEImage(fp.read())
        fp.close()
        p["Content-ID"] = "<%s>" % cid
        ct = p["Content-Type"]
        p.replace_header("Content-Type",
                         '%s; name="%s"' % (ct, os.path.basename(fname)))
        p["Content-Disposition"] = "inline"
        parts.append(p)

    return lxml.html.tostring(html), parts
Example #20
def find_images_in_body(body):
    """Looks for images inside a HTML body

    Before sending a message in HTML format, it is necessary to find
    all img tags contained in the body in order to rewrite them. For
    example, icons provided by CKeditor are stored on the server
    filesystem and not accessible from the outside. We must embed
    them as parts of the MIME message if we want recipients to
    display them correctly.

    :param body: the HTML body to parse
    """
    from email.mime.image import MIMEImage
    from urlparse import urlparse

    html = lxml.html.fromstring(body)
    parts = []
    for tag in html.iter("img"):
        src = tag.get("src")
        if src is None:
            continue
        o = urlparse(src)
        fname = os.path.basename(o.path)
        path = os.path.join(settings.MEDIA_ROOT, "webmail", fname)
        if not os.path.exists(path):
            continue
        cid = "%s@modoboa" % os.path.splitext(fname)[0]
        tag.set("src", "cid:%s" % cid)
        fp = open(path, "rb")
        p = MIMEImage(fp.read())
        fp.close()
        p["Content-ID"] = "<%s>" % cid
        ct = p["Content-Type"]
        p.replace_header("Content-Type", '%s; name="%s"'
                         % (ct, os.path.basename(fname)))
        p["Content-Disposition"] = "inline"
        parts.append(p)

    return lxml.html.tostring(html), parts
Example #21
# And dump their timezones to ecfzones.json

import collections
import json
import lxml.html
import re
import urllib2

from ecftimezone import ECFTimezone

PACERLINKS = 'http://www.pacer.gov/psco/cgi-bin/links.pl'

courts = set()

html = lxml.html.parse(urllib2.urlopen(PACERLINKS))
for e in html.iter('a'):
    link = e.get('href')
    if not link:
        continue
    match = re.match(r'^https?://ecf\.([^.]+)\.uscourts\.gov', link)
    if match:
        courts.add(match.group(1))

f = open('ecfdomains.txt', 'w')
for c in sorted(courts):
    f.write(c + "\n")
f.close()

e = ECFTimezone()
d = {c: e.timezone(c) for c in courts}
od = collections.OrderedDict(sorted(d.items()))
Example #22
combined_name=os.path.join(os.path.dirname(directory_name), 'dealeron.json')

dealers = []

for file_name in file_names:
    with open(file_name, 'r') as fd:
        url = os.path.splitext(os.path.basename(file_name))[0]
        text = fd.read()
        if not '.dlron.' in text:
            print 'Dealer %s has moved on ...' % url
            continue
        try:
            html = lxml.html.document_fromstring(text)
            data = {'url': url, 'address': {}, 'geo': {}, 'departments': []}
            for meta in html.iter('meta'):
                name    = meta.get('name')
                content = meta.get('content')
                if name is not None:
                    if name == 'geo.position':
                        lat, lng = content.split(',')
                        data['geo']['latitude']  = lat
                        data['geo']['longitude'] = lng
                    elif name == 'geo.placement':
                        data['address']['addressLocality'] = content
                    elif name == 'geo.region':
                        data['address']['addressRegion'] = content
            for div in html.find_class('hours-page'):
                for span in div.iter('span'):
                    itemprop = span.get('itemprop')
                    content  = span.text_content()
Example #23
            full_url = urljoin(url, i['src'])
            print("image URL: " + full_url)
            y = urllib.request.urlopen(full_url)
            t = t + 1
            with open('downloaded_images/' + str(t) + '.png', "wb") as code:
                code.write(y.read())
            counter += 1

    print('finished')
    return 0


print('Введите путь')  # prompt: "Enter the path"
url = input()
parse(url)
response = requests.get(url)
html = html.fromstring(response.text)
f = open('links.txt', 'w')
i = 0
for a in html.iter("a"):
    if (i < 10):
        link = urljoin(url, a.get("href"))
        print(link)
        parse(link)
        f.write(link)
        f.write('\n')
        i += 1

f.close()
input()
Example #24
                            data['address']['addressRegion']   = state
                            data['address']['postalCode']      = zip_code
                        break
                    break
                for anchor in html.find_class('get-directions-event-target'):
                    href = anchor.get('href')
                    if href:
                        text = href[href.index('?q=')+3:]
                        if len(text) >= 3:
                            lat, lng = get_lat_lng(text)
                            if len(lat) > 0 and len(lng) > 0:
                                data['geo'] = {'latitude': lat, 'longitude': lng }
                                break
                    break
            if len(data['geo']) == 0:
                for iframe in html.iter('iframe'):
                    src = iframe.get('src')
                    uri = urlparse(src)
                    if uri.hostname == 'maps.google.com':
                        src = src.replace('&amp;', '&')
                        val = parse_qs(uri.query)
                        if 'll' in val:
                            lat, lng = get_lat_lng(val['ll'][0])
                            if len(lat) > 0 and len(lng) > 0:
                                data['geo'] = {'latitude': lat, 'longitude': lng }
                                break
            dealers.append(data)
        except ValueError, e:
            pass

with open(combined_name, 'wb') as fd:
Example #25
def main():
    # TODO: combine command-line and option file.
    # TODO: option to generate a default configuration file
    parser = argparse.ArgumentParser()  # TODO: doc
    parser.add_argument("-s", "--standalone", action="store_true")  # TODO: doc
    args = parser.parse_args()
    standalone = args.standalone

    conf = json.load((DATA / "artdoc.js").open())

    if Path("artdoc.js").exists():
        user_conf = json.load(Path("artdoc.js").open())
        conf.update(user_conf)

    info("Document:")
    doc_patterns = conf["doc"]
    if isinstance(doc_patterns, basestring):
        doc_patterns = [doc_patterns]
    docs = []
    for pattern in doc_patterns:
        matches = list(WORKDIR.glob(pattern))
        #subinfo("matching {!r}:".format(pattern))
        for match in matches:
            subinfo(str(match))
        docs.extend(matches)
    if not docs:
        sys.exit("error: no document found")

#    info("HTML template:")
#    template_file = HTML / "index.html"
#    subinfo(str(template_file))
#    template = template_file.open().read().encode("utf-8")

    info("Bibliography:")
    bib_patterns = conf["bib"]
    if isinstance(bib_patterns, basestring):
        bib_patterns = [bib_patterns]
    bibs = []
    for pattern in bib_patterns:
        matches = list(WORKDIR.glob(pattern))
        #subinfo("matching {!r}:".format(pattern))
        for match in matches:
            subinfo(str(match))
        bibs.extend(matches)
    if not bibs:
        print()

    info("JS:")
    cmd = coffee["-c", str(JS / "main.coffee")]
    subinfo(cmd)
    cmd()

    info("CSS:")
    cmd = stylus[str(CSS / "style.styl")]
    subinfo(str(cmd))
    cmd()

    # TODO: copy only what is required.
    shutil.copytree(str(DATA), str(ARTDOC))

    for doc in docs:
        pass

        info("PANDOC: generate JSON file")
        args = ["-t", "json", "--smart"]
        for bib in bibs:
            args.extend(["--bibliography", str(bib)])
        args.append(str(doc))
        cmd = pandoc[args]
        subinfo(cmd, "> json")
        json_str = cmd()

        info("Convert raw TeX to raw HTML")
        cmd = local[str(BIN / "rawHTML.hs")]
        subinfo(cmd, "< json > json")
        json_str = (cmd << json_str)()

        #        info("Flag/Box Proofs")
        #        cmd = local[str(BIN / "proof.hs")]
        #        subinfo(cmd, "< json > json")
        #        try:
        #            json_str = (cmd << json_str)()
        #        except Exception as error:
        #            print(repr(error))

        #        info("Wrap Section-Like Sequence of Blocks")
        #        cmd = local[str(BIN / "div.hs")]
        #        subinfo(cmd, "< json > json")
        #        try:
        #            json_str = (cmd << json_str)()
        #        except Exception as error:
        #            print(repr(error))

        info("Wrap Section-Like Sequence of Blocks")
        cmd = local[str(BIN / "section.hs")]
        subinfo(cmd, "< json > json")
        try:
            json_str = (cmd << json_str)()
        except Exception as error:
            print(repr(error))

        info("Flag Tombstones (end of proofs)")
        cmd = local[str(BIN / "tombstone.hs")]
        subinfo(cmd, "< json > json")
        try:
            json_str = (cmd << json_str)()
        except Exception as error:
            print(repr(error))

        info("Convert Images to SVG Images")
        cmd = local[str(BIN / "svg.hs")]
        subinfo(cmd, "< json > json")
        json_str = (cmd << json_str)()

        info("Generate HTML body from markdown")
        args = [
            "--email-obfuscation", "none", "-f", "json", "--mathjax", "-t",
            "html5", "--section-divs"
        ]
        cmd = pandoc[args]
        subinfo(cmd, "< json > body")
        pandoc_body_str = (cmd << json_str)()
        pandoc_html = lxml.html.document_fromstring(pandoc_body_str)
        pandoc_body = pandoc_html.cssselect("body")[0]

        info("Generate standalone HTML doc")
        html = HTML.html(HTML.head, HTML.body)
        body = html.cssselect("body")[0]
        head = html.cssselect("head")[0]
        head.append(HTML.meta(charset="utf-8"))
        body.attrib.update(pandoc_body.attrib)
        body.extend(pandoc_body[:])

        # ----------------------------------------------------------------------
        info("Add JQuery")
        head.extend(jquery(standalone=standalone))

        # ----------------------------------------------------------------------
        info("Add Velocity")
        head.extend(velocity(standalone=standalone))

        # ----------------------------------------------------------------------
        info("Add Clipboard.js")
        head.extend(clipboard(standalone=standalone))

        # ----------------------------------------------------------------------
        info("Add Highlight.js")
        head.extend(highlight(standalone=standalone))

        # ----------------------------------------------------------------------
        info("Add Google Fonts support")
        head.extend(
            google_fonts(["Alegreya", "Alegreya SC"], standalone=standalone))

        # ----------------------------------------------------------------------
        info("Add Mathjax support")
        head.extend(mathjax(standalone=standalone))

        # ----------------------------------------------------------------------
        info("Add Font Awesome support")
        head.extend(font_awesome(standalone=standalone))

        # ----------------------------------------------------------------------
        info("Add artdoc css & js files")
        head.extend(artdoc())

        # ----------------------------------------------------------------------
        info("Setting language to english (required for hyphens)")
        html.set("lang", "en")

        # ----------------------------------------------------------------------
        info("Ensure ids uniqueness")
        id_count = {}
        for elt in html.iter():
            _id = elt.get("id")
            if _id is not None:
                count = id_count.get(_id, 0)
                if count > 0:
                    elt.set("id", _id + "-" + str(count))
                id_count[_id] = count + 1

        # ----------------------------------------------------------------------
        info("Turning headers into self-links")
        sections = html.cssselect("section")
        for section in sections:
            id_ = section.get("id")
            heading = None
            if len(section):
                first = section[0]
                if first.tag in "h1 h2 h3 h4 h5 h6".split():
                    heading = first
            if id_ and heading is not None:
                contents = [heading.text or ""] + heading[:]
                heading.text, heading[:] = None, []
                href = {"href": "#" + id_}
                link = HTML.a(href, *contents)
                heading.insert(0, link)

        # ----------------------------------------------------------------------

        # TODO: deal with metadata & insert a document header with:
        #   - title,
        #   - date (format: Month Day, Year), autoformat, autogen ?
        #   - author(s) (with mail & affiliation when available ?).
        #     Assume custom metadata or parse the author field ?
        #     Representation of multiple authors ? MMm eLIFEsciences use
        #     popup for author info. Ex: http://elifesciences.org/content/4/e06356 !
        #     here, use hints from http://dtd.nlm.nih.gov/book/tag-library/:
        #
        #       - name (don't be more precise)
        #       - affiliation (concatenate)
        #       - address ???
        #       - email  --> Font Awesome Icon
        #       - url / uri ?
        #       - form of ID ? (like HAL ? or ZBlatt ?)

        # TODO: look at the rendering of
        #       http://kieranhealy.org/blog/archives/2014/01/23/plain-text/:
        #         - small grey date on top, bold title, bold author name,
        #           italics affiliation, repeat.

        metadata = get_metadata(str(doc))

        items = []

        date = parse_html(metadata.get("date"))
        if date is not None:
            items.append(HTML.p({"class": "date"}, *date))

#        def textify(item):
#          if isinstance(item, basestring):
#              return item
#          elif hasattr(item, "text"):
#              return item.text
#          else:
#              return "".join([textify(it) or "" for it in item])

        title = parse_html(metadata.get("title"))
        title_id = None
        if title is not None:
            #title_id = textify(title).lower().replace(" ", "-")
            items.append(
                HTML.h1({"class": "title"}, HTML.a({"href": "#"}, *title)))
            head.insert(0, HTML.title(*title))

        authors = metadata.get("author") or []

        for author in authors:
            if isinstance(author, basestring):
                name = parse_html(author)
                email = None
                affiliation = None
            else:
                name = parse_html(author.get("name"))
                email = parse_html(author.get("email"))
                affiliation = parse_html(author.get("affiliation"))

            if name is not None:
                if email is not None:
                    name = [HTML.a({"href": "mailto:" + email[0]}, *name)]
                name = HTML.p({"class": "author"}, *name)
                items.append(name)
                if affiliation is not None:
                    affiliation = HTML.p({"class": "affiliation"},
                                         *affiliation)
                    items.append(affiliation)

        header_attr = {"class": "main"}
        #        if title_id is not None:
        #          header_attr["id"] = title_id
        header = HTML.header(header_attr, *items)
        #        print("HEADER", lxml.html.tostring(header))
        body.insert(0, header)
        #        print("BODY", lxml.html.tostring(body))
        #        print("HTML", lxml.html.tostring(html))

        # ----------------------------------------------------------------------
        info("Generate the standalone HTML file")
        html_str = lxml.html.tostring(html,
                                      encoding="utf-8",
                                      doctype="<!DOCTYPE html>")
        doc.with_suffix(".html").open("wb").write(html_str)

    sys.exit(0)
Example #26
    def get_event(self):
        d = {}
        d['id'] = id_from_path(self.url)
        d['date'] = date_from_id(d['id'])
        d['datetime'] = date_from_id(d['id'])
        d['url'] = self.url

        html = lxml.html.fromstring(self.document)
        for div in html.iter('div'):
            if div.get('id') == 'bodyContent':
                break

        tags = [t for t in div if not callable(t.tag) and not t.get('id') and 'footer' not in t.get('class', '')]
        parts = [t.text_content().strip().replace('\n', ' ') for t in tags]
        description = '\n'.join(parts)
        summary = description.split('\n', 1)[0]

        self.div = div
        if not summary:
            return None

        d['summary'] = summary
        d['description'] = description

        for n, p in enumerate(parts):
            match = re.search(r'\b(\d\d?)h(\d\d)?\b', p)
            if match:
                d['hour'] = time(int(match.group(1)), int(match.group(2) or '0'))
                d['datetime'] = combine(d['date'], d['hour'])
                parts[n] = p[:match.start(0)] + p[match.end(0):]
                break

        for n, p in enumerate(parts):
            match = re.search(ur'\b(\d+([,.]\d+)?)\s*(euros\b|euro\b|€)', p)
            if match:
                d['price'] = float(match.group(1).replace(',', '.'))
                parts[n] = p[:match.start(0)] + p[match.end(0):]
                break

        address = []
        for n, p in enumerate(parts):
            match = re.search(r'\d+[\s,]+(rue|boulevard|avenue)\s+.+', p, re.I)
            if match:
                address.append(match.group(0))
                p = parts[n] = p[:match.start(0)] + p[match.end(0):]
            match = re.search(r'\b(75|92|93|94|78|77|95|91)\d\d\d\b.*', p)
            if match:
                address.append(match.group(0))
                p = parts[n] = p[:match.start(0)] + p[match.end(0):]
            match = re.search(r'\b(m.tro|rer)\b.*', p, re.I)
            if match:
                address.append(match.group(0))
                p = parts[n] = p[:match.start(0)] + p[match.end(0):]
            match = re.search(r'@\s+\w+(\s+[^.]+.*)?', p) # refuse '@foo' or '@ foo . plop'
            if match:
                address.append(match.group(0))
                p = parts[n] = p[:match.start(0)] + p[match.end(0):]

        if address:
            d['address'] = ' '.join(address)

        return d
Example #27
def get_html_elements(html):
    """ Take an lxml ElementTree; return Counter of elements; count once. """
    return Counter({element.tag for element in html.iter()})
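Illustrative call (made-up markup): tags are gathered into a set comprehension, so each element type is counted once regardless of how often it appears.

import lxml.html

tree = lxml.html.fromstring('<ul><li>a</li><li>b</li></ul>')
print(get_html_elements(tree))  # Counter({'ul': 1, 'li': 1})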
Example #28
                    break
                for geo in vcard.find_class('geo'):
                    data['geo'] = {}
                    for lat_el in geo.find_class('latitude'):
                        for value in lat_el.find_class('value-title'):
                            data['geo']['latitude'] = value.get('title')
                    for lng_el in geo.find_class('longitude'):
                        for value in lng_el.find_class('value-title'):
                            data['geo']['longitude'] = value.get('title')
                break
            for maps in html.find_class('google-map'):
                latlng = maps.get('data-markers-list')
                if latlng:
                    lat, lng = get_lat_lng(latlng)
                    data['geo'] = {'latitude': lat, 'longitude': lng }
            for link in html.iter('link'):
                if link.get('rel') == 'publisher':
                    data['google_plus'] = link.get('href')
#            for meta in html.iter('meta'):
#                name = meta.get('name')
#                if name == 'geo.position' or name == 'ICBM':
#                    latlng = meta.get('content')
#                    if latlng:
#                        lat, lng = get_lat_lng(latlng)
#                        data['geo'] = {'latitude': lat, 'longitude': lng }
            dealers.append(data)
        except ValueError, e:
            pass

with open(combined_name, 'wb') as fd:
    json.dump(list(dealers), fd)
Example #29
def parse_news_articles(php_directory, download_directory, file_name, query):
	# Note: Assumes that path is stored as <query>.php/
	inpath = php_directory + file_name + "/"
	file_list = [ f for f in listdir(inpath) if isfile(join(inpath,f)) ]

	# For each file, get the article Titles and URLs
	for file in file_list:
		# Clear out any variables from last file
		articleURL = articleTitle = articleSource = summaryText = keywords = score = code = ""
	
		try:	
			intext = open(inpath + file, 'r').read()
			html = etree.HTML(intext)
		except lxml.etree.XMLSyntaxError:
			print "ERROR: XMLSyntaxError when reading " + inpath + file
			break

		for element in html.iter():
			if (element.tag == "p" and element.text == "News Result"):
				# Do nothing
				pass
	
			elif (element.tag == "a"):
				articleURL = element.attrib["href"]
				articleTitle = element.text

			elif (element.tag == "br"):
				if (element.tail != None):
					summaryText = element.tail

			elif (element.tag == "strong"):
				if (element.tail != "\n"):
					articleSource = element.tail

			elif (element.tag == "p"):
				# Check to see if article already exists using URL. If it exists, don't do anything
				if (articles.find_one({ "url": articleURL }) is not None):
					print "INFO: Duplicate article found"
				else:
					print "Processing: " + articleURL			
	
					# For each URL, assign its md5 as a unique identifier
					#code = base64.urlsafe_b64encode(os.urandom(18))
					m = hashlib.md5()
					m.update(articleURL)
					code = m.hexdigest()
        				first_level = code[0:2]
					second_level = code[2:4]
					
					# This code also becomes the filename for the full file path
					#articleFileDirectory = php_directory + file + "--news/"
					articleFileDirectory = download_directory + first_level + "/" + second_level + "/"
					articleFilePath = articleFileDirectory + code

					# Download full article and use full-text (if available) for keyword extraction
					fullArticleText = download_article_file(articleURL, articleFileDirectory, code)
			
					if (fullArticleText is not None):
						keyword_set = textrank(fullArticleText) 
						#articleFeatures = get_article_features(fullArticleText, articleURL)
						articleFeatures = None
						guessed_date = guess_date(fullArticleText)
					else:
						keyword_set = textrank(summaryText)
						#articleFeatures = get_article_features(summaryText, articleURL)
						articleFeatures = None
						guessed_date = guess_date(summaryText)
				
					keywords = list(keyword_set)
                        		
					processed_date = datetime.now().strftime("%Y-%m-%d")
					if (guessed_date is not None):
						publish_date = guessed_date
					else:
						publish_date = processed_date
	
					article = [{
					"q": query,
					"c": code,
					"f": articleFeatures,
					"pubd": publish_date,
					"procd": processed_date,
					"url": articleURL,
					"t": articleTitle,
					"abs": summaryText,
					"sr": articleSource,
					"k": keywords,
					"fp": articleFilePath,
					"m": None
					}]
		
                        		# Write article to MongoDB collection
		                        try:
		                                article_id = articles.insert(article)
		                        except MongoException.DuplicateKey:
		                                print "Duplicate key: " + code

		                        #print "Inserted into articles: " + articleTitle.encode('ascii', 'ignore')

					if (fullArticleText is None):
						fullArticleText = summaryText

					# Insert into ElasticSearch	
		                        json_str = mk_es_json(code, fullArticleText, articleURL, articleTitle, summaryText, publish_date)
                		        #print json_str
		                        index = 'article'
              				index_type = 'text'
        		                es_url = 'http://localhost:9200'
                        		r = post_to_elastic_search(es_url, index, index_type, code, json_str)
		                        print r
Example #30
    def get_event(self):
        d = {}
        d['id'] = id_from_path(self.url)
        d['date'] = date_from_id(d['id'])
        d['datetime'] = date_from_id(d['id'])
        d['url'] = self.url

        html = lxml.html.fromstring(self.document)
        for div in html.iter('div'):
            if div.get('id') == 'bodyContent':
                break

        tags = [
            t for t in div if not callable(t.tag) and not t.get('id')
            and 'footer' not in t.get('class', '')
        ]
        parts = [t.text_content().strip().replace('\n', ' ') for t in tags]
        description = '\n'.join(parts)
        summary = description.split('\n', 1)[0]

        self.div = div
        if not summary:
            return None

        d['summary'] = summary
        d['description'] = description

        for n, p in enumerate(parts):
            match = re.search(r'\b(\d\d?)h(\d\d)?\b', p)
            if match:
                d['hour'] = time(int(match.group(1)),
                                 int(match.group(2) or '0'))
                d['datetime'] = combine(d['date'], d['hour'])
                parts[n] = p[:match.start(0)] + p[match.end(0):]
                break

        for n, p in enumerate(parts):
            match = re.search(ur'\b(\d+([,.]\d+)?)\s*(euros\b|euro\b|€)', p)
            if match:
                d['price'] = float(match.group(1).replace(',', '.'))
                parts[n] = p[:match.start(0)] + p[match.end(0):]
                break

        address = []
        for n, p in enumerate(parts):
            match = re.search(r'\d+[\s,]+(rue|boulevard|avenue)\s+.+', p, re.I)
            if match:
                address.append(match.group(0))
                p = parts[n] = p[:match.start(0)] + p[match.end(0):]
            match = re.search(r'\b(75|92|93|94|78|77|95|91)\d\d\d\b.*', p)
            if match:
                address.append(match.group(0))
                p = parts[n] = p[:match.start(0)] + p[match.end(0):]
            match = re.search(r'\b(m.tro|rer)\b.*', p, re.I)
            if match:
                address.append(match.group(0))
                p = parts[n] = p[:match.start(0)] + p[match.end(0):]
            match = re.search(r'@\s+\w+(\s+[^.]+.*)?',
                              p)  # refuse '@foo' or '@ foo . plop'
            if match:
                address.append(match.group(0))
                p = parts[n] = p[:match.start(0)] + p[match.end(0):]

        if address:
            d['address'] = ' '.join(address)

        return d
Example #31
def extract_content(html):
    topnodes = {}

    for tag in ('p', 'li', 'dd', 'dt'):
        for p in html.iter(tag):
            parent, val = p.getparent(), valuate(p)
            topnodes[parent] = sumval(topnodes.get(parent, (0, 0, 0)), val)

    for p in html.iter('img'):
        l = categorise(p)
        if l > 0:
            parent = p.getparent()
            topnodes[parent] = sumval(topnodes.get(parent, (0, 0, 0)), (l, 1, 1))

    toplist = list(map(lambda x: (x[0], getval(x[1])), topnodes.items()))
    if not toplist:
        return []

    toplist.sort(key=lambda x: x[1], reverse=True)
    if toplist[0][0].tag in ('dl', 'ol', 'ul'):
        weighing = 4
    else:
        weighing = 2

    paths, article = {}, None
    for top, l in filter(lambda x: weighing*x[1] >= toplist[0][1], toplist):
        node, nesting = top.getparent(), 2
        while node is not None:
            if node.tag == 'article':
                article, artnesting = node, nesting

            info = paths.get(node, (0, 0))
            paths[node] = (info[0] + 1, max(info[1], nesting))
            node, nesting = node.getparent(), nesting + 1

    pathlist = list(paths.items())
    pathlist.sort(key=lambda x: x[1])
    maxp = pathlist[-1][1][0]

    if (article != None) and (10*valuate(top)[2] >= 8*valuate(article)[2]):
        top, nesting = article, artnesting
    else:
        if maxp > 1:
            pathlist = list(filter(lambda x: x[1][0] >= (maxp + 1) // 2, pathlist))

            top, info = pathlist[0]
            pathnr, nesting = info
            if info[0] == maxp // 2:
                for top, info in pathlist[1:]:
                    if info[0] != pathnr:
                        pathnr, nesting = info
                        break
        else:
            nesting = 1

    highesthdr, content, visited = 7, [], {}

    for p in top.iter():
        if p == top:
            if p.tag in ('dl', 'ol', 'ul'):
                p.tail = ''
                content.append(p)
                break
            else:
                continue

        if categorise(p) <= 0:
            continue

        if p.tag == 'img':
            parent = p.getparent()
            if parent != top:
                p = parent

        if p.tag.startswith('h'):
            towrite = True
            highesthdr = min(highesthdr, int(p.tag[1]))
        else:
            towrite = False

        encl, parent, i = p, p.getparent(), nesting
        while parent is not None and parent is not top:
            encl, parent = parent, parent.getparent()
            i -= 1

        if not towrite:
            towrite = i > 0

        if towrite:
            if not visited.get(encl):
                for elem in encl.iter():
                    visited[elem] = True

                encl.tail = ''
                content.append(encl)

    remove_after(top)
    if top.getparent() is not None:
        parent = top.getparent()
        parent.remove(top)
    else:
        parent = None

    lowesthdr, headers = None, []

    for i in range(1, highesthdr):
        elem = None
        for elem in html.iter('h%d' % (i,)):
            pass

        if elem is not None:
            elem.tail = ''
            headers.append(elem)
            remove_before(elem)
            elem.getparent().remove(elem)
            lowesthdr = i
            break


    if lowesthdr:
        for elem in html.iter():
            if elem.tag in ('h2', 'h3', 'h4', 'h5', 'h6'):
                elem.tail = ''
                headers.append(elem)
                elem.getparent().remove(elem)

        if parent is not None:
            for elem in parent:
                if type(elem.tag) == type(''):
                    elem.tail = ''
                    headers.append(elem)

    headers.extend(content)
    return list(map(clean_imgs, map(html_cleaner.clean_html, headers)))
Example #32
def main():
    # TODO: combine command-line and option file.
    # TODO: option to generate a default configuration file
    parser = argparse.ArgumentParser() # TODO: doc
    parser.add_argument("-s", "--standalone", action="store_true") # TODO: doc
    args = parser.parse_args()
    standalone = args.standalone

    conf = json.load((DATA / "artdoc.js").open())

    if Path("artdoc.js").exists():
        user_conf = json.load(Path("artdoc.js").open())
        conf.update(user_conf)

    info("Document:")
    doc_patterns = conf["doc"]
    if isinstance(doc_patterns, basestring):
        doc_patterns = [doc_patterns]
    docs = []
    for pattern in doc_patterns:
        matches = list(WORKDIR.glob(pattern))
        #subinfo("matching {!r}:".format(pattern))
        for match in matches:
            subinfo(str(match))
        docs.extend(matches)
    if not docs:
        sys.exit("error: no document found")

#    info("HTML template:")
#    template_file = HTML / "index.html"
#    subinfo(str(template_file))
#    template = template_file.open().read().encode("utf-8")

    info("Bibliography:")
    bib_patterns = conf["bib"]
    if isinstance(bib_patterns, basestring):
        bib_patterns = [bib_patterns]
    bibs = []
    for pattern in bib_patterns:
        matches = list(WORKDIR.glob(pattern))
        #subinfo("matching {!r}:".format(pattern))
        for match in matches:
          subinfo(str(match))
        bibs.extend(matches)
    if not bibs:
        print()

    info("JS:")
    cmd = coffee["-c", str(JS / "main.coffee")]
    subinfo(cmd)
    cmd()    

    info("CSS:")
    cmd = stylus[str(CSS / "style.styl")]
    subinfo(str(cmd))
    cmd()

    # TODO: copy only what is required.
    shutil.copytree(str(DATA), str(ARTDOC))

    for doc in docs:
        pass

        info("PANDOC: generate JSON file")
        args = ["-t", "json", "--smart"]
        for bib in bibs:
            args.extend(["--bibliography", str(bib)])
        args.append(str(doc))
        cmd = pandoc[args]
        subinfo(cmd, "> json")
        json_str = cmd()

        info("Convert raw TeX to raw HTML")
        cmd = local[str(BIN / "rawHTML.hs")]
        subinfo(cmd, "< json > json")
        json_str = (cmd << json_str)()

#        info("Flag/Box Proofs")
#        cmd = local[str(BIN / "proof.hs")]
#        subinfo(cmd, "< json > json")
#        try:
#            json_str = (cmd << json_str)()
#        except Exception as error:
#            print(repr(error))

#        info("Wrap Section-Like Sequence of Blocks")
#        cmd = local[str(BIN / "div.hs")]
#        subinfo(cmd, "< json > json")
#        try:
#            json_str = (cmd << json_str)()
#        except Exception as error:
#            print(repr(error))

        info("Wrap Section-Like Sequence of Blocks")
        cmd = local[str(BIN / "section.hs")]
        subinfo(cmd, "< json > json")
        try:
            json_str = (cmd << json_str)()
        except Exception as error:
            print(repr(error))

        info("Flag Tombstones (end of proofs)")
        cmd = local[str(BIN / "tombstone.hs")]
        subinfo(cmd, "< json > json")
        try:
            json_str = (cmd << json_str)()
        except Exception as error:
            print(repr(error))

        info("Convert Images to SVG Images")
        cmd = local[str(BIN / "svg.hs")]
        subinfo(cmd, "< json > json")
        json_str = (cmd << json_str)()

        info("Generate HTML body from markdown")
        args = ["--email-obfuscation", "none",
                "-f", "json", 
                "--mathjax", 
                "-t", "html5", "--section-divs"]
        cmd = pandoc[args]
        subinfo(cmd, "< json > body")
        pandoc_body_str = (cmd << json_str)()
        pandoc_html = lxml.html.document_fromstring(pandoc_body_str)
        pandoc_body = pandoc_html.cssselect("body")[0]

        info("Generate standalone HTML doc")
        html = HTML.html(HTML.head, HTML.body)
        body = html.cssselect("body")[0]
        head = html.cssselect("head")[0]
        head.append(HTML.meta(charset="utf-8"))
        body.attrib.update(pandoc_body.attrib)
        body.extend(pandoc_body[:])

        # ----------------------------------------------------------------------
        info("Add JQuery")
        head.extend(jquery(standalone=standalone))

        # ----------------------------------------------------------------------
        info("Add Velocity")
        head.extend(velocity(standalone=standalone))

        # ----------------------------------------------------------------------
        info("Add Clipboard.js")
        head.extend(clipboard(standalone=standalone))

        # ----------------------------------------------------------------------
        info("Add Highlight.js")
        head.extend(highlight(standalone=standalone))

        # ----------------------------------------------------------------------
        info("Add Google Fonts support")
        head.extend(google_fonts(["Alegreya", "Alegreya SC"], standalone=standalone))

        # ----------------------------------------------------------------------
        info("Add Mathjax support")
        head.extend(mathjax(standalone=standalone))

        # ----------------------------------------------------------------------
        info("Add Font Awesome support")
        head.extend(font_awesome(standalone=standalone))

        # ----------------------------------------------------------------------
        info("Add artdoc css & js files")
        head.extend(artdoc())

        # ----------------------------------------------------------------------
        info("Setting language to english (required for hyphens)")
        html.set("lang", "en") 

        # ----------------------------------------------------------------------
        info("Ensure ids uniqueness")
        id_count = {}
        for elt in html.iter():
          _id = elt.get("id")
          if _id is not None:
             count = id_count.get(_id, 0)
             if count > 0:
                 elt.set("id", _id + "-" + str(count))
             id_count[_id] = count + 1

        # ----------------------------------------------------------------------
        info("Turning headers into self-links")
        sections = html.cssselect("section")
        for section in sections:
            id_ = section.get("id")
            heading = None
            if len(section):
                first = section[0]
                if first.tag in "h1 h2 h3 h4 h5 h6".split():
                    heading = first
            if id_ and heading is not None:
                contents = [heading.text or ""] + heading[:]
                heading.text, heading[:] = None, []
                href = {"href": "#" + id_}
                link = HTML.a(href, *contents)
                heading.insert(0, link)

        # ----------------------------------------------------------------------


        # TODO: deal with metadata & insert a document header with:
        #   - title, 
        #   - date (format: Month Day, Year), autoformat, autogen ? 
        #   - author(s) (with mail & affiliation when available ?).
        #     Assume custom metadata or parse the author field ?
        #     Representation of multiple authors ? MMm eLIFEsciences use
        #     popup for author info. Ex: http://elifesciences.org/content/4/e06356 !
        #     here, use hints from http://dtd.nlm.nih.gov/book/tag-library/:
        #
        #       - name (don't be more precise)
        #       - affiliation (concatenate)
        #       - address ???
        #       - email  --> Font Awesome Icon
        #       - url / uri ?
        #       - form of ID ? (like HAL ? or ZBlatt ?)



        # TODO: look at the rendering of
        #       http://kieranhealy.org/blog/archives/2014/01/23/plain-text/:
        #         - small grey date on top, bold title, bold author name,
        #           italics affiliation, repeat.


        metadata = get_metadata(str(doc))

        items = []

        date = parse_html(metadata.get("date"))
        if date is not None:
            items.append(HTML.p({"class": "date"}, *date))

#        def textify(item):
#          if isinstance(item, basestring):
#              return item
#          elif hasattr(item, "text"):
#              return item.text
#          else:
#              return "".join([textify(it) or "" for it in item])

        title = parse_html(metadata.get("title"))
        title_id = None
        if title is not None:
            #title_id = textify(title).lower().replace(" ", "-")
            items.append(
              HTML.h1(
                {"class": "title"}, 
                HTML.a(
                  {"href": "#"},
                  *title
                )
              )
            )
            head.insert(0, HTML.title(*title))

        authors = metadata.get("author") or []

        for author in authors:
            if isinstance(author, basestring):
                name = parse_html(author)
                email = None
                affiliation = None
            else:
                name = parse_html(author.get("name"))
                email = parse_html(author.get("email"))
                affiliation = parse_html(author.get("affiliation"))

            if name is not None:
                if email is not None:
                    name = [HTML.a({"href": "mailto:" + email[0]}, *name)]
                name = HTML.p({"class": "author"}, *name)
                items.append(name)
                if affiliation is not None:
                    affiliation = HTML.p({"class": "affiliation"}, *affiliation)
                    items.append(affiliation)
        
        header_attr = {"class": "main"}
#        if title_id is not None:
#          header_attr["id"] = title_id
        header = HTML.header(header_attr, *items)
#        print("HEADER", lxml.html.tostring(header))
        body.insert(0, header)
#        print("BODY", lxml.html.tostring(body))
#        print("HTML", lxml.html.tostring(html))


        # ----------------------------------------------------------------------
        info("Generate the standalone HTML file")
        html_str = lxml.html.tostring(html, encoding="utf-8", doctype="<!DOCTYPE html>")
        doc.with_suffix(".html").open("wb").write(html_str)

    sys.exit(0)