Exemplo n.º 1
0
    def parse(self, response):
        """Scrapy callback handling both WeChat article pages and list pages.

        Dispatches on ``self.page_count``: once at least one matching article
        has been counted, responses are treated as article pages and yielded
        as WechatItem; otherwise the response is treated as an article list
        and matching entries are counted.
        """
        # parse article page
        if self.page_count > 0:
            item = WechatItem()
            item['date'] = self.today
            item['title'] = response.xpath(
                'normalize-space(//h2[@class="rich_media_title"]/text())'
            ).extract()[0]
            item['html'] = response.xpath(
                '//div[@class="rich_media_content "]').extract()[0]
            # fix incomplete tags: html5lib closes any unbalanced markup
            html = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
            xml = html.parse(item['html']).toxml()
            # convert to markdown
            h2m = Html2Markdown()
            h2m.feed(xml)
            h2m.close()
            item['markdown'] = h2m.output
            yield item

        else:  # parse article list
            for x in response.xpath('//div[@class="weui_media_bd"]'):
                # only get today articles
                date = x.xpath(
                    'normalize-space(p[@class="weui_media_extra_info"]/text())'
                ).extract()[0]
                # Normalize the Chinese date format (YYYY年M月D日 -> YYYY/M/D).
                date = date.replace('年', '/')
                date = date.replace('月', '/')
                date = date.replace('日', '')
                # NOTE(review): the comment above says "today" but this compares
                # against a hard-coded date rather than self.today — confirm
                # whether '2019/7/2' should be derived from self.today.
                if date == '2019/7/2':
                    self.page_count += 1

        # whether need to request the next page
        # NOTE(review): this re-requests the same URL (response.url); relies on
        # scrapy's dupefilter settings to allow it — verify against spider config.
        if self.page_index < self.page_count:
            yield Request(url=response.url, callback=self.parse)
def parse_for_footnotes(article_or_page_generator):
    """Convert ``[ref]...[/ref]`` markers in generated content into numbered
    footnotes with back-links, appending an <ol class="simple-footnotes">.

    Mutates ``article._content`` in place for every article/draft/page the
    generator exposes (presumably a Pelican plugin hook — confirm caller).
    """
    # Collect whichever content collections this generator actually has.
    all_content = [
        getattr(article_or_page_generator, attr, None) \
        for attr in [u'articles', u'drafts', u'pages']]
    all_content = [x for x in all_content if x is not None]
    for article in sequence_gen(all_content):
        if u"[ref]" in article._content and u"[/ref]" in article._content:
            # Temporarily turn the markers into a parseable custom element.
            content = article._content.replace(u"[ref]", u"<x-simple-footnote>").replace(u"[/ref]",
                                                                                         u"</x-simple-footnote>")
            parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder(u"dom"))
            dom = parser.parse(content)
            endnotes = []
            count = 0
            for footnote in dom.getElementsByTagName(u"x-simple-footnote"):
                # Walk ancestors; leave footnotes alone inside raw containers
                # (whatever RAW_FOOTNOTE_CONTAINERS lists, defined elsewhere).
                pn = footnote
                leavealone = False
                while pn:
                    if pn.nodeName in RAW_FOOTNOTE_CONTAINERS:
                        leavealone = True
                        break
                    pn = pn.parentNode
                if leavealone:
                    continue
                count += 1
                # Per-article unique ids for the footnote and its back-link.
                fnid = u"sf-%s-%s" % (article.slug, count)
                fnbackid = u"%s-back" % (fnid,)
                endnotes.append((footnote, fnid, fnbackid))
                # Insert a superscript reference number before the footnote.
                number = dom.createElement(u"sup")
                number.setAttribute(u"id", fnbackid)
                numbera = dom.createElement(u"a")
                numbera.setAttribute(u"href", u"#%s" % fnid)
                numbera.setAttribute(u"class", u"simple-footnote")
                numbera.appendChild(dom.createTextNode(six.text_type(count)))
                txt = getText(footnote, recursive=True).replace(u"\n", u" ")
                numbera.setAttribute(u"title", txt)
                number.appendChild(numbera)
                footnote.parentNode.insertBefore(number, footnote)
            if endnotes:
                # Build the footnote list: move each footnote's children into
                # an <li>, append a U+21A9 return-arrow back-link, and remove
                # the emptied placeholder element from the body.
                ol = dom.createElement(u"ol")
                ol.setAttribute(u"class", u"simple-footnotes")
                for e, fnid, fnbackid in endnotes:
                    li = dom.createElement(u"li")
                    li.setAttribute(u"id", fnid)
                    while e.firstChild:
                        li.appendChild(e.firstChild)
                    backlink = dom.createElement(u"a")
                    backlink.setAttribute(u"href", u"#%s" % fnbackid)
                    backlink.setAttribute(u"class", u"simple-footnote-back")
                    backlink.appendChild(dom.createTextNode(u'\u21a9'))
                    li.appendChild(dom.createTextNode(u" "))
                    li.appendChild(backlink)
                    ol.appendChild(li)
                    e.parentNode.removeChild(e)
                dom.getElementsByTagName(u"body")[0].appendChild(ol)
                # Serialize only the <body>, strip the wrapper tags, and turn
                # any skipped placeholders back into [ref]/[/ref] markers.
                s = html5lib.serializer.HTMLSerializer(omit_optional_tags=False, quote_attr_values='legacy')
                output_generator = s.serialize(
                    html5lib.treewalkers.getTreeWalker(u"dom")(dom.getElementsByTagName(u"body")[0]))
                article._content = u"".join(list(output_generator)).replace(
                    u"<x-simple-footnote>", u"[ref]").replace(u"</x-simple-footnote>", u"[/ref]").replace(
                    u"<body>", u"").replace(u"</body>", u"")
Exemplo n.º 3
0
async def taskTx(sock, message, mtype):  # a poor implementation of an output coroutine.
    """Send *message* to the websocket *sock*.

    Control messages are handled first: b"200" says goodbye and closes the
    socket; b"202" acknowledges admin authentication.  Any other message is
    parsed as an HTML fragment, sanitized with html5lib, and sent either raw
    (when the ``revertProtocol`` flag is set) or wrapped in a JSON envelope
    tagged with *mtype*.
    """
    global revertProtocol
    # Handle control messages before any sanitization work; the original
    # computed the sanitized text even on these paths, where it is unused.
    if message == b"200":
        await sock.send("Goodbye.")
        await sock.close()
        return
    if message == b"202":
        await sock.send("Authentication Successful, you are now the admin terminal.")
        return
    # Parse the message as a fragment and strip unsafe markup.
    builder = html5lib.getTreeBuilder("dom")
    parser = html5lib.HTMLParser(tree=builder)
    walker = html5lib.getTreeWalker("dom")
    parsed_fragment = parser.parseFragment(message)
    clean_stream = sanitizer.Filter(walker(parsed_fragment))
    html_serializer = html5lib.serializer.HTMLSerializer()
    # join() instead of repeated += avoids quadratic string building.
    tx = ''.join(html_serializer.serialize(clean_stream))
    if revertProtocol:
        await sock.send(tx)
    else:
        await sock.send(json.dumps({"MSG_TYPE": mtype, "MSG": tx}))
Exemplo n.º 4
0
def parse_for_links(article_generator):
    """Append a printable list of external link targets to each article.

    Every external <a> target in an article body gets a superscript
    reference number (prefix 'L'); the targets are then listed in an
    ordered list appended to the article so printed copies keep the URLs.
    Mutates ``article._content`` in place.
    """
    prefix = 'L'

    for article in article_generator.articles:
        links = []
        parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
        dom = parser.parse(article._content)

        for link in dom.getElementsByTagName("a"):
            href = link.getAttribute('href')

            if len(href) == 0 or href[0] == '#':
                continue  # do not print internal links

            # Repeated targets reuse their first reference number.
            if href in links:
                index = links.index(href) + 1
            else:
                links.append(href)
                index = len(links)

            sup = dom.createElement("sup")
            sup.setAttribute("class", "print")
            sup.appendChild(dom.createTextNode(prefix + str(index)))

            # Place the marker immediately after the link element.
            if link.nextSibling:
                link.parentNode.insertBefore(sup, link.nextSibling)
            else:
                link.parentNode.appendChild(sup)

        if links == []:
            continue

        # Links Title
        links_title = dom.createElement("h2")
        links_title.setAttribute("class", "print")
        links_title.appendChild(dom.createTextNode("Links"))
        dom.getElementsByTagName("body")[0].appendChild(links_title)

        # Actual Links
        links_div = dom.createElement("div")
        links_div.setAttribute("class", "print")
        links_div.setAttribute("style", "margin-left: 2.0em;")
        link_list = dom.createElement("ol")
        link_list.setAttribute("class", "print-links")
        for link in links:
            li = dom.createElement("li")
            li.appendChild(dom.createTextNode(link))
            link_list.appendChild(li)

        links_div.appendChild(link_list)
        dom.getElementsByTagName("body")[0].appendChild(links_div)

        # Produce the output: serialize only the <body> and strip its tags.
        s = html5lib.serializer.HTMLSerializer(omit_optional_tags=False)
        output_generator = s.serialize(
            html5lib.treewalkers.getTreeWalker("dom")(
                dom.getElementsByTagName("body")[0]))
        article._content = "".join(list(output_generator)).replace(
            "<body>", "").replace("</body>", "")
Exemplo n.º 5
0
def _has_element(tag, file_string):
    """Return True if the parsed HTML document contains at least one *tag* element."""
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder('dom'))
    document = parser.parse(file_string)
    return bool(document.getElementsByTagName(tag))
Exemplo n.º 6
0
def doc4url(url):
    """Fetch *url* and parse the response body as HTML into an lxml tree.

    Returns the lxml root element, or None if the fetch or parse fails.
    """
    builder = html5lib.getTreeBuilder('lxml')
    parser = html5lib.HTMLParser(builder, namespaceHTMLElements=False)
    try:
        doc = parser.parse(urllib2.urlopen(url).read())
    except Exception:
        # Narrowed from a bare ``except:`` so SystemExit/KeyboardInterrupt
        # still propagate; network and parse failures map to None.
        return None
    return doc.getroot()
Exemplo n.º 7
0
def parse_html(html, wrapper_element='div', wrapper_class='diff'):
    """Parse an HTML fragment into a Genshi stream."""
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder('etree'))
    fragment = parser.parseFragment(html)
    # Rename the fragment's synthetic root and optionally class it.
    fragment.tag = wrapper_element
    if wrapper_class is not None:
        fragment.set('class', wrapper_class)
    return ET(fragment)
Exemplo n.º 8
0
def html_parse(text):
    """Strictly parse *text* as HTML5.

    Returns ``(document, ())`` on success, or ``(None, errors)`` where
    *errors* lazily yields human-readable messages built from the parser's
    recorded error tuples.
    """
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"),
                                 strict=True)
    try:
        return (parser.parse(text), tuple())
    except Exception:
        # Narrowed from a bare ``except:`` so SystemExit/KeyboardInterrupt
        # propagate; the strict parser raises on the first error and
        # parser.errors then holds everything recorded.
        return (None, ('Line: {:d} Character: {:d} Error: {}'.format(
            e[0][0], e[0][1], html5lib.constants.E[e[1]] % e[2])
                       for e in parser.errors))
Exemplo n.º 9
0
def parse_html(html, wrapper_element='div', wrapper_class='diff'):
    """Parse an HTML fragment into a Genshi stream."""
    tree_builder = html5lib.getTreeBuilder('etree')
    fragment = html5lib.HTMLParser(tree=tree_builder).parseFragment(html)
    fragment.tag = wrapper_element
    if wrapper_class is not None:
        fragment.set('class', wrapper_class)
    return ET(fragment)
Exemplo n.º 10
0
def parse_for_links(article_generator):
    """Append a printable list of external link targets to each article.

    Every external <a> target gets a superscript reference number
    (prefix 'L'); the targets are then listed in an ordered list appended
    to the article so printed copies keep the URLs.  Mutates
    ``article._content`` in place.
    """
    prefix = 'L'

    for article in article_generator.articles:
        links = []
        parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
        dom = parser.parse(article._content)

        # (Indentation normalized to 4 spaces; the original mixed 3- and
        # 4-space indents inside this loop.)
        for link in dom.getElementsByTagName("a"):
            href = link.getAttribute('href')

            if len(href) == 0 or href[0] == '#':
                continue  # do not print internal links

            # Repeated targets reuse their first reference number.
            if href in links:
                index = links.index(href) + 1
            else:
                links.append(href)
                index = len(links)

            sup = dom.createElement("sup")
            sup.setAttribute("class", "print")
            sup.appendChild(dom.createTextNode(prefix + str(index)))

            if link.nextSibling:
                link.parentNode.insertBefore(sup, link.nextSibling)
            else:
                link.parentNode.appendChild(sup)

        if links == []:
            continue

        # Links Title
        links_title = dom.createElement("h2")
        links_title.setAttribute("class", "print")
        links_title.appendChild(dom.createTextNode("Links"))
        dom.getElementsByTagName("body")[0].appendChild(links_title)

        # Actual Links
        links_div = dom.createElement("div")
        links_div.setAttribute("class", "print")
        links_div.setAttribute("style", "margin-left: 2.0em;")
        link_list = dom.createElement("ol")
        link_list.setAttribute("class", "print-links")
        for link in links:
            li = dom.createElement("li")
            li.appendChild(dom.createTextNode(link))
            link_list.appendChild(li)

        links_div.appendChild(link_list)
        dom.getElementsByTagName("body")[0].appendChild(links_div)

        # Produce the output.  Use the public HTMLSerializer name (the
        # html5lib.serializer.htmlserializer module path is deprecated) and
        # the 'legacy' quoting mode that replaced quote_attr_values=True,
        # matching the other serializer calls in this file.
        s = html5lib.serializer.HTMLSerializer(omit_optional_tags=False,
                                               quote_attr_values='legacy')
        output_generator = s.serialize(
            html5lib.treewalkers.getTreeWalker("dom")(
                dom.getElementsByTagName("body")[0]))
        article._content = "".join(list(output_generator)).replace(
            "<body>", "").replace("</body>", "")
Exemplo n.º 11
0
def extract():
    """Extract the canvas section from a local WHATWG spec snapshot
    ('current-work') and write it out as 'current-work-canvas.xhtml'."""
    parser = html5lib.html5parser.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
    doc = parser.parse(open('current-work', "r"), transport_encoding='utf-8')

    # Drop <script> elements from <head>.
    # NOTE(review): assumes every head child is an element (text nodes have no
    # tagName) and removes nodes from the list being iterated — confirm the
    # input markup has no whitespace between head children.
    head = doc.getElementsByTagName('head')[0]
    for n in head.childNodes:
        if n.tagName == 'script':
            head.removeChild(n)

    header = doc.getElementsByTagName('header')[0]
    #thecanvas = doc.getElementById('the-canvas') # doesn't work (?!)
    thecanvas = [ n for n in doc.getElementsByTagName('h4') if n.getAttribute('id') == 'the-canvas-element' ][0]

    # Add copyright from https://html.spec.whatwg.org/multipage/acknowledgements.html#acknowledgments
    copy = doc.createElement('p')
    copy.setAttribute('class', 'copyright')
    copy.appendChild(doc.createTextNode(u'Parts of this specification are \xA9 Copyright 2004-2014 Apple Inc., Mozilla Foundation, and Opera Software ASA. You are granted a license to use, reproduce and create derivative works of this document.'))
    header.appendChild(copy)

    # Keep only the header and the canvas section: everything from the
    # canvas <h4> up to (not including) the next <nav> sibling.
    keep = [header, thecanvas]
    node = thecanvas.nextSibling
    while node.nodeName != 'nav':
        keep.append(node)
        node = node.nextSibling
    p = thecanvas.parentNode
    for n in p.childNodes[:]:
        if n not in keep:
            p.removeChild(n)

    # Trim the header's children to its outer framing nodes.
    # NOTE(review): the 3:-4 slice encodes one specific snapshot's structure.
    for n in header.childNodes[3:-4]:
        header.removeChild(n)

    def make_absolute(url):
        # Leave absolute URLs and fragment references untouched; rebase
        # root-relative and page-relative URLs onto html.spec.whatwg.org.
        match = re.match(r'(\w+:|#)', url)
        if match:
            return url
        elif url[0] == '/':
            return 'https://html.spec.whatwg.org' + url
        else:
            return 'https://html.spec.whatwg.org/multipage/' + url

    # Fix relative URLs
    for e in doc.getElementsByTagName('script'):
        e.setAttribute('src', make_absolute(e.getAttribute('src')))
    for e in doc.getElementsByTagName('iframe'):
        e.setAttribute('src', make_absolute(e.getAttribute('src')))
    for e in doc.getElementsByTagName('img'):
        e.setAttribute('src', make_absolute(e.getAttribute('src')))
    for e in doc.getElementsByTagName('a'):
        e.setAttribute('href', make_absolute(e.getAttribute('href')))

    # Convert to XHTML, because it's quicker to re-parse than HTML5
    doc.documentElement.setAttribute('xmlns', 'http://www.w3.org/1999/xhtml')
    doc.removeChild(doc.firstChild) # remove the DOCTYPE

    # NOTE(review): toxml(encoding=...) returns bytes; writing them to a
    # text-mode file only works on Python 2 — confirm the target runtime.
    open('current-work-canvas.xhtml', 'w').write(doc.toxml(encoding = 'UTF-8'))
Exemplo n.º 12
0
    def parse_html(cls, html_string):
        """
        Parse the given HTML string and return a Genshi ET object wrapping
        the resulting element tree.
        :param html_string: HTML fragment text
        :return: Genshi ET object
        """
        # TODO: take care of self._encoding
        parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder('etree'))
        fragment = parser.parseFragment(html_string)
        return ET(fragment)
Exemplo n.º 13
0
def get_options(template):
    """Read print options from data attributes on the template's root element.

    ``data-gbclient-media-size`` sets the 'media' option (default '62mm');
    ``data-gbclient-orientation``, when present, sets 'orientation-requested'.
    """
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
    document = parser.parse(template)
    root_attrs = document.documentElement.attributes

    size_attr = root_attrs.get('data-gbclient-media-size')
    orientation_attr = root_attrs.get('data-gbclient-orientation')

    options = {'media': size_attr.value if size_attr else '62mm'}
    if orientation_attr:
        options['orientation-requested'] = orientation_attr.value

    return options
Exemplo n.º 14
0
def parse(text):
    """Render Markdown *text* to HTML and sanitize the result with html5lib.

    NOTE(review): relies on the pre-1.0 html5lib API where
    ``sanitizer.HTMLSanitizer`` is passed as a tokenizer; html5lib 1.0
    removed it — confirm the pinned html5lib version.
    """

    # Whitelist iframe embeds and their attributes.
    # NOTE(review): this mutates the sanitizer class globally and re-extends
    # the lists on every call, so entries accumulate across calls.
    sanitizer.HTMLSanitizer.allowed_elements.extend(['iframe'])
    sanitizer.HTMLSanitizer.allowed_attributes.extend(
        ['scrolling', 'allowfullscreen', 'frameborder'])

    # First run through the Markdown parser
    text = markdown.markdown(text, extensions=["extra"], safe_mode=False)

    # Sanitize using html5lib
    bits = []
    parser = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
                                    tree=getTreeBuilder("dom"))
    for token in parser.parseFragment(text).childNodes:
        bits.append(token.toxml())
    return "".join(bits)
Exemplo n.º 15
0
 def save(self, *args, **kwargs):
     """Wrap the first character of the lead paragraph in a dropcap <span>.

     Runs only once: if the lead already contains a <span>, it is left
     untouched.  Then delegates to the parent class's save().
     """
     tree = html5lib.getTreeBuilder('dom')
     parser = html5lib.HTMLParser(tree=tree)
     dom = parser.parse(self.lead)
     if len(dom.getElementsByTagName('span')) == 0:
         element = dom.getElementsByTagName('p')[0]
         value = element.firstChild.nodeValue
         # Split the first character out of the paragraph text.
         txt = dom.createTextNode(value[1:])
         dropcap = dom.createTextNode(value[0])
         span = dom.createElement('span')
         span.appendChild(dropcap)
         span.setAttribute('class', 'dropcap')
         element.childNodes[0] = span
         element.appendChild(txt)
         self.lead = element.toxml()
     # Bug fix: super() returns a bound method, so ``self`` must not be
     # passed explicitly — the original called save(self, *args, **kwargs)
     # and handed self to the parent as its first positional argument.
     super(ArticleIntro, self).save(*args, **kwargs)
def get_html_errors(filename):
    """Validate *filename* with html5lib's strict parser.

    Returns an empty dict when the file parses cleanly, or a dict with a
    "message" key holding one line per recorded parse error.
    """
    errors = {}
    with open(filename, "r") as f:
        parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"),
                                     strict=True)
        err = ""
        try:
            # The parsed document itself is not needed; only the error log is.
            parser.parse(f)
        except Exception:
            # Narrowed from a bare ``except:``; strict parsing raises on the
            # first error, and parser.errors then holds everything recorded.
            for e in parser.errors:
                err += "Line {0}: {1}: {2} \n".format(e[0][0], e[1], e[2])

        if err:
            errors["message"] = err

    return errors
Exemplo n.º 17
0
 def save(self, *args, **kwargs):
     """Wrap the first character of the lead paragraph in a dropcap <span>.

     Runs only once: if the lead already contains a <span>, it is left
     untouched.  Then delegates to the parent class's save().
     """
     tree = html5lib.getTreeBuilder('dom')
     parser = html5lib.HTMLParser(tree=tree)
     dom = parser.parse(self.lead)
     if len(dom.getElementsByTagName('span')) == 0:
         element = dom.getElementsByTagName('p')[0]
         value = element.firstChild.nodeValue
         # Split the first character out of the paragraph text.
         txt = dom.createTextNode(value[1:])
         dropcap = dom.createTextNode(value[0])
         span = dom.createElement('span')
         span.appendChild(dropcap)
         span.setAttribute('class', 'dropcap')
         element.childNodes[0] = span
         element.appendChild(txt)
         self.lead = element.toxml()
     # Bug fix: super() returns a bound method, so ``self`` must not be
     # passed explicitly — the original called save(self, *args, **kwargs)
     # and handed self to the parent as its first positional argument.
     super(ArticleIntro, self).save(*args, **kwargs)
Exemplo n.º 18
0
def parseModel(url):
    """Print "label : value" spec rows scraped from the product page at *url*.

    NOTE(review): the chained child indexing below encodes one specific page
    layout; any markup change will raise IndexError.
    """
    with urlopen(url) as f:
        parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("lxml"))
        document = parser.parse(
            f, transport_encoding=f.info().get_content_charset())
        # XPath over the XHTML namespace html5lib assigns to parsed elements.
        specs = etree.XPath(
            "//html:div[@class='productDetailSpec specifications']",
            namespaces={"html": "http://www.w3.org/1999/xhtml"})
        #print(len(find_btn(document)))
        # Drop the first and last children of the specs container
        # (presumably header/footer rows — confirm against the page).
        cuttedList = specs(document)[0][1:-1]
        #print(cuttedList)
        for item in cuttedList:
            # NOTE(review): ``text`` (the section heading) is assigned but
            # never used — dead store kept as-is.
            text = item[0][0][0][0].text
            table = item[0][1][0]
            for row in table:
                print(row[0].text + " : " + row[1][1].text)
Exemplo n.º 19
0
def clean(text,
          tags=ALLOWED_TAGS,
          attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES,
          protocols=ALLOWED_PROTOCOLS,
          strip=False,
          strip_comments=True):
    """Clean an HTML fragment of malicious content and return it

    This function is a security-focused function whose sole purpose is to
    remove malicious content from a string such that it can be displayed as
    content in a web page.

    This function is not designed to use to transform content to be used in
    non-web-page contexts.

    :arg text: the text to clean
    :arg tags: whitelist of allowed tags; defaults to
        ``bleach.ALLOWED_TAGS``
    :arg attributes: whitelist of allowed attributes; defaults to
        ``bleach.ALLOWED_ATTRIBUTES``
    :arg styles: whitelist of allowed css; defaults to
        ``bleach.ALLOWED_STYLES``
    :arg protocols: whitelist of allowed protocols for links; defaults
        to ``bleach.ALLOWED_PROTOCOLS``
    :arg strip: whether or not to strip disallowed elements
    :arg strip_comments: whether or not to strip HTML comments

    """
    if not text:
        return ''

    text = force_unicode(text)

    # Bind this call's whitelists into a one-off sanitizer subclass that
    # html5lib instantiates as its tokenizer.
    class _CallSanitizer(BleachSanitizer):
        allowed_elements = tags
        allowed_attributes = attributes
        allowed_css_properties = styles
        allowed_protocols = protocols
        strip_disallowed_elements = strip
        strip_html_comments = strip_comments

    parser = html5lib.HTMLParser(tokenizer=_CallSanitizer,
                                 tree=html5lib.getTreeBuilder("lxml"))

    return _render(parser.parseFragment(text))
Exemplo n.º 20
0
def parse_referat(html):
    """Split a referat (essay) HTML page into its labeled blocks.

    The first <div>'s text becomes ``topic``, the first <strong>'s text
    becomes ``title``, and each <p> contributes one entry to ``content``.
    """
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
    document = parser.parse(html)

    topic = document.getElementsByTagName('div')[0].firstChild.nodeValue
    title = document.getElementsByTagName('strong')[0].firstChild.nodeValue
    content = [p.firstChild.nodeValue
               for p in document.getElementsByTagName('p')]

    return {'topic': topic, 'title': title, 'content': content}
Exemplo n.º 21
0
def collect_remote_info() -> Dict[str, str]:
    """Scrape package-name -> version pairs from the table at ``domain_url``."""
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
    response = requests.get(domain_url)
    document = parser.parse(response.text)
    table_body = document.getElementsByTagName('tbody')[0]
    remote_data = {}
    seen_header = False
    for row in table_body.childNodes:
        if row.nodeType == 3:
            continue  # skip whitespace text nodes between rows
        if not seen_header:
            seen_header = True  # first element row is the table header
            continue
        # row <tr> -> <td> -> <a> -> text node
        name = row.childNodes[1].childNodes[0].childNodes[0].nodeValue
        version = row.childNodes[3].childNodes[0].childNodes[0].nodeValue
        remote_data[name] = version
    return remote_data
Exemplo n.º 22
0
def collect_remote_info():
    """Scrape package-name -> version pairs from the table at ``domain_url``."""
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
    page = requests.get(domain_url)
    document = parser.parse(page.text)
    rows = document.getElementsByTagName('tbody')[0].childNodes
    remote_data = {}
    header_skipped = False
    for row in rows:
        if row.nodeType == 3:
            continue  # skip whitespace text nodes between rows
        if not header_skipped:
            header_skipped = True  # first element row is the table header
            continue
        # row <tr> -> <td> -> <a> -> text node
        pkg_name = row.childNodes[1].childNodes[0].childNodes[0].nodeValue
        pkg_ver = row.childNodes[3].childNodes[0].childNodes[0].nodeValue
        remote_data[pkg_name] = pkg_ver
    return remote_data
Exemplo n.º 23
0
def grpoupsParser():
    '''Занесение все пользователей из списка групп в таблицу users'''
    community = ('foto_history', 'ru_foto', 'prophotos_ru', 'foto_history', 'ru_travel')
    url = 'livejournal.com'
    args = '/profile/friendlist?socconns=friends&mode_full_socconns=1'

    connection = sqlite3.connect(getScriptPwd() + 'livejournal.db')
    #connection.execute('delete from users')
    cursor = connection.cursor()

    for comm in community:
        print 'Community: %s' % comm
        print 'http://%s.%s%s' % (comm, url, args)
        res = urlopen('http://%s.%s%s' % (comm, url, args))
        print 'Parsing html'
        parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("lxml"))
        dom = parser.parse(res)
        root = dom.getroot()
        counter = 0
        skipped = 0

        print 'Adding users to DB'

        for node in root.findall(".//html:a",
                                namespaces={"html": "http://www.w3.org/1999/xhtml"}):
            try:
                #print node.get('href')
                cursor.execute('''INSERT INTO users
                    (name) VALUES ('?')''', node.text)
                counter += 1
            except:
                skipped += 1

        print '%s users added' % counter
        print '%s users skipped' % skipped

        connection.commit()

    print 'Total users', connection.execute('select count(*) from users').fetchone()

    cursor.close()
    connection.close()
Exemplo n.º 24
0
    :copyright: Copyright 2007-2018 by the Sphinx team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re
import xml.etree.cElementTree as ElementTree
from hashlib import md5

import pytest
from html5lib import getTreeBuilder, HTMLParser
from test_build_html import flat_dict, tail_check, check_xpath

from sphinx.util.docutils import is_html5_writer_available

TREE_BUILDER = getTreeBuilder('etree', implementation=ElementTree)
HTML_PARSER = HTMLParser(TREE_BUILDER, namespaceHTMLElements=False)


etree_cache = {}


@pytest.mark.skipif(not is_html5_writer_available(), reason='HTML5 writer is not available')
@pytest.fixture(scope='module')
def cached_etree_parse():
    def parse(fname):
        if fname in etree_cache:
            return etree_cache[fname]
        with (fname).open('rb') as fp:
            etree = HTML_PARSER.parse(fp)
            etree_cache.clear()
Exemplo n.º 25
0
def parse_for_footnotes(article_or_page_generator):
    """Convert ``[ref]...[/ref]`` markers in generated content into numbered
    footnotes with back-links, appending an <ol class="simple-footnotes">.

    Python 2 variant (uses the ``unicode`` builtin).  Mutates
    ``article._content`` in place for every article/draft/page the
    generator exposes.
    """
    # Collect whichever content collections this generator actually has.
    all_content = [
      getattr(article_or_page_generator, attr, None) \
      for attr in [u'articles',u'drafts',u'pages'] ]
    all_content = [x for x in all_content if x is not None]
    for article in sequence_gen(all_content):
        if u"[ref]" in article._content and u"[/ref]" in article._content:
            # Temporarily turn the markers into a parseable custom element.
            content = article._content.replace(u"[ref]",
                                               u"<x-simple-footnote>").replace(
                                                   u"[/ref]",
                                                   u"</x-simple-footnote>")
            parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder(u"dom"))
            dom = parser.parse(content)
            endnotes = []
            count = 0
            for footnote in dom.getElementsByTagName(u"x-simple-footnote"):
                # Walk ancestors; leave footnotes alone inside raw containers
                # (whatever RAW_FOOTNOTE_CONTAINERS lists, defined elsewhere).
                pn = footnote
                leavealone = False
                while pn:
                    if pn.nodeName in RAW_FOOTNOTE_CONTAINERS:
                        leavealone = True
                        break
                    pn = pn.parentNode
                if leavealone:
                    continue
                count += 1
                # Per-article unique ids for the footnote and its back-link.
                fnid = u"sf-%s-%s" % (article.slug, count)
                fnbackid = u"%s-back" % (fnid, )
                endnotes.append((footnote, fnid, fnbackid))
                # Insert a superscript reference number before the footnote.
                number = dom.createElement(u"sup")
                number.setAttribute(u"id", fnbackid)
                numbera = dom.createElement(u"a")
                numbera.setAttribute(u"href", u"#%s" % fnid)
                numbera.setAttribute(u"class", u"simple-footnote")
                numbera.appendChild(dom.createTextNode(unicode(count)))
                txt = getText(footnote, recursive=True).replace(u"\n", u" ")
                numbera.setAttribute(u"title", txt)
                number.appendChild(numbera)
                footnote.parentNode.insertBefore(number, footnote)
            if endnotes:
                # Build the footnote list: move each footnote's children into
                # an <li>, append a U+21A9 return-arrow back-link, and remove
                # the emptied placeholder element from the body.
                ol = dom.createElement(u"ol")
                ol.setAttribute(u"class", u"simple-footnotes")
                for e, fnid, fnbackid in endnotes:
                    li = dom.createElement(u"li")
                    li.setAttribute(u"id", fnid)
                    while e.firstChild:
                        li.appendChild(e.firstChild)
                    backlink = dom.createElement(u"a")
                    backlink.setAttribute(u"href", u"#%s" % fnbackid)
                    backlink.setAttribute(u"class", u"simple-footnote-back")
                    backlink.appendChild(dom.createTextNode(u'\u21a9'))
                    li.appendChild(dom.createTextNode(u" "))
                    li.appendChild(backlink)
                    ol.appendChild(li)
                    e.parentNode.removeChild(e)
                dom.getElementsByTagName(u"body")[0].appendChild(ol)
                # Serialize only the <body>, strip the wrapper tags, and turn
                # any skipped placeholders back into [ref]/[/ref] markers.
                s = html5lib.serializer.HTMLSerializer(
                    omit_optional_tags=False, quote_attr_values='legacy')
                output_generator = s.serialize(
                    html5lib.treewalkers.getTreeWalker(u"dom")(
                        dom.getElementsByTagName(u"body")[0]))
                article._content = u"".join(list(output_generator)).replace(
                    u"<x-simple-footnote>", u"[ref]").replace(
                        u"</x-simple-footnote>",
                        u"[/ref]").replace(u"<body>",
                                           u"").replace(u"</body>", u"")
Exemplo n.º 26
0
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import html5lib
import pytest

from flask import render_template_string

from udata.frontend.markdown import md, parse_html, EXCERPT_TOKEN
from udata.utils import faker

parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))


def assert_md_equal(value, expected):
    """Assert that *value* is *expected* wrapped in the standard markdown div."""
    __tracebackhide__ = True  # hide this helper frame from pytest tracebacks
    wrapped = '<div class="markdown">{0}</div>'.format(expected)
    assert value.strip() == wrapped


@pytest.mark.frontend
class MarkdownTest:
    def test_excerpt_is_not_removed(self, app):
        with app.test_request_context('/'):
            assert_md_equal(md(EXCERPT_TOKEN), EXCERPT_TOKEN)

    def test_markdown_filter_with_none(self, app):
        '''Markdown filter should not fails with None'''
        text = None
        with app.test_request_context('/'):
            result = render_template_string('{{ text|markdown }}', text=text)
Exemplo n.º 27
0
def fromstring(s):
    """Parse *s* with html5lib into an lxml document (no XHTML namespacing)."""
    builder = html5lib.getTreeBuilder("lxml", implementation=etree)
    parser = html5lib.HTMLParser(builder, namespaceHTMLElements=False)
    return parser.parse(s)
Exemplo n.º 28
0
    :copyright: Copyright 2007-2016 by the Sphinx team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import os
import re

from six import PY3, iteritems

from sphinx import __display_version__
from util import remove_unicode_literals, gen_with_app, with_app, strip_escseq
from etree13 import ElementTree
from html5lib import getTreeBuilder, HTMLParser


TREE_BUILDER = getTreeBuilder("etree", implementation=ElementTree)
HTML_PARSER = HTMLParser(TREE_BUILDER, namespaceHTMLElements=False)

ENV_WARNINGS = """\
(%(root)s/autodoc_fodder.py:docstring of autodoc_fodder.MarkupError:\\d+: \
WARNING: duplicate object description of autodoc_fodder.MarkupError, other \
instance in %(root)s/autodoc.rst, use :noindex: for one of them
)?%(root)s/autodoc_fodder.py:docstring of autodoc_fodder.MarkupError:\\d+: \
WARNING: Explicit markup ends without a blank line; unexpected unindent.
%(root)s/index.rst:\\d+: WARNING: Encoding 'utf-8-sig' used for reading included \
file u'%(root)s/wrongenc.inc' seems to be wrong, try giving an :encoding: option
%(root)s/index.rst:\\d+: WARNING: image file not readable: foo.png
%(root)s/index.rst:\\d+: WARNING: nonlocal image URI found: http://www.python.org/logo.png
%(root)s/index.rst:\\d+: WARNING: download file not readable: %(root)s/nonexisting.png
%(root)s/index.rst:\\d+: WARNING: invalid single index entry u''
%(root)s/undecodable.rst:\\d+: WARNING: undecodable source characters, replacing \
Exemplo n.º 29
0
def fromstring(s):
    """Parse *s* with html5lib into an lxml document (no XHTML namespacing)."""
    lxml_builder = html5lib.getTreeBuilder("lxml", implementation=etree)
    return html5lib.HTMLParser(lxml_builder, namespaceHTMLElements=False).parse(s)
Exemplo n.º 30
0
def parse_for_footnotes(article_generator):
    """Turn ``[ref]...[/ref]`` markers in each article into numbered footnotes.

    Each ``[ref]`` span is replaced by a superscript link, and its content is
    moved into an ``<ol class="simple-footnotes">`` appended at the end of the
    article body, with a back-link from each note to its reference mark.
    Markers nested inside tags listed in RAW_FOOTNOTE_CONTAINERS (e.g. raw
    code containers) are left untouched.  Mutates ``article._content`` in place.
    """
    for article in article_generator.articles:
        if "[ref]" in article._content and "[/ref]" in article._content:
            # Rewrite the markers as a custom element so an HTML parser can
            # locate them as DOM nodes.
            content = article._content.replace("[ref]", "<x-simple-footnote>").replace("[/ref]",
                                                                                      "</x-simple-footnote>")
            parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
            dom = parser.parse(content)
            endnotes = []
            count = 0
            for footnote in dom.getElementsByTagName("x-simple-footnote"):
                # Walk up the ancestor chain; skip footnotes inside containers
                # whose raw content must be preserved.
                pn = footnote
                leavealone = False
                while pn:
                    if pn.nodeName in RAW_FOOTNOTE_CONTAINERS:
                        leavealone = True
                        break
                    pn = pn.parentNode
                if leavealone:
                    continue
                count += 1
                fnid = "sf-%s-%s" % (article.slug, count)
                fnbackid = "%s-back" % (fnid,)
                endnotes.append((footnote, fnid, fnbackid))
                # Insert the clickable superscript number just before the
                # footnote node; its title holds the footnote's plain text.
                number = dom.createElement("sup")
                number.setAttribute("id", fnbackid)
                numbera = dom.createElement("a")
                numbera.setAttribute("href", "#%s" % fnid)
                numbera.setAttribute("class", "simple-footnote")
                numbera.appendChild(dom.createTextNode(str(count)))
                txt = getText(footnote, recursive=True).replace("\n", " ")
                numbera.setAttribute("title", txt)
                number.appendChild(numbera)
                footnote.parentNode.insertBefore(number, footnote)
            if endnotes:
                # Build the endnote list, moving each footnote's children into
                # a <li> with a back-link, then drop the emptied marker node.
                ol = dom.createElement("ol")
                ol.setAttribute("class", "simple-footnotes")
                ol.appendChild(dom.createTextNode('Notes:'))
                for e, fnid, fnbackid in endnotes:
                    li = dom.createElement("li")
                    li.setAttribute("id", fnid)
                    while e.firstChild:
                        li.appendChild(e.firstChild)
                    backlink = dom.createElement("a")
                    backlink.setAttribute("href", "#%s" % fnbackid)
                    backlink.setAttribute("class", "simple-footnote-back")
                    backlink.appendChild(dom.createTextNode(u'\u21a9'))
                    li.appendChild(dom.createTextNode(" "))
                    li.appendChild(backlink)
                    ol.appendChild(li)
                    e.parentNode.removeChild(e)
                dom.getElementsByTagName("body")[0].appendChild(ol)
                # NOTE(review): the ``htmlserializer`` submodule and the boolean
                # ``quote_attr_values`` are pre-1.0 html5lib APIs -- confirm the
                # pinned html5lib version before upgrading this call.
                s = html5lib.serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False, quote_attr_values=True)
                output_generator = s.serialize(html5lib.treewalkers.getTreeWalker("dom")(dom.getElementsByTagName("body")[0]))
                # Restore the original marker syntax and strip the <body> wrapper
                # that html5lib adds around fragments.
                article._content = "".join(list(output_generator)).replace(
                    "<x-simple-footnote>", "[ref]").replace("</x-simple-footnote>", "[/ref]").replace(
                    "<body>", "").replace("</body>", "")
        # Removed: an unreachable ``if False:`` block that referenced an
        # undefined name ``footnotes`` (dead legacy implementation).
Exemplo n.º 31
0
Arquivo: corpus.py Projeto: fnl/libfnl
    'pre',
    'style',
    'script',
    'textarea',
})


# Return *e* itself if its tag already is *tag*, otherwise its first
# matching subelement (via Element.find).
_ensure = lambda e, tag: e.find(tag) if e.tag != tag else e

# HTML5 serialization setup
# Shared walker/serializer pair: keep optional tags and entities as written.
_tree_walker = html5lib.getTreeWalker("etree", implementation=etree)
_serializer = html5lib.serializer.HTMLSerializer(omit_optional_tags=False,
                                                 resolve_entities=False)

# HTML5 parsing setup
# Shared parser building ElementTree trees without the XHTML namespace.
_tree_builder = html5lib.getTreeBuilder("etree", implementation=etree)
_parser = html5lib.HTMLParser(_tree_builder, namespaceHTMLElements=False)
# FIX for HTMLParser.reset():
if not hasattr(_parser, "innerHTMLMode"):
    # add the missing attribute, as otherwise calling .reset() would raise an AttributeError
    _parser.innerHTMLMode = None


def Root(title=None, encoding=None) -> Element:
    root = Element(ROOT_TAG)
    head = SubElement(root, HEAD_TAG)

    if title is not None:
        assert isinstance(title, str), 'title not a string'
        SubElement(head, TITLE_TAG).text = title
Exemplo n.º 32
0
def purify_html(input_html, obj):
    """Clean up *input_html* so that it can (maybe) be converted to Markdown.

    Returns ``(result, convert_to_markdown)``: *result* is the serialized
    contents of <body> when the DOM was modified and conversion is still
    allowed, otherwise the original *input_html* unchanged;
    *convert_to_markdown* tells the caller whether Markdown conversion is
    considered safe.  *obj* is only used to label diagnostic messages.

    NOTE(review): Python 2 code -- uses print statements.
    """
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
    document = parser.parse(input_html)

    # Assume conversion is possible until a blocker is found below.
    convert_to_markdown = True
    dom_modified = False

    # Group consecutive orphaned <li>s inside a <ul>
    for li in document.getElementsByTagName('li'):
        # Walk up to <body> checking whether the <li> sits inside a list.
        node = li
        has_proper_parent = False
        while node.tagName != 'body' and node.parentNode:
            node = node.parentNode
            if node.tagName in {'ul', 'ol'}:
                has_proper_parent = True

        if not has_proper_parent:
            # Collect this <li> and its run of consecutive <li> siblings.
            sibling_items = []
            sibling = li
            while sibling and sibling.nodeType == minidom.Node.ELEMENT_NODE and sibling.tagName == 'li':
                sibling_items.append(sibling)
                sibling = sibling.nextSibling
                # jump over empty nodes
                while sibling and sibling.nodeType == minidom.Node.TEXT_NODE and sibling.data.isspace(
                ):
                    sibling = sibling.nextSibling

            # Swap the first orphan for a fresh <ul>, then move the whole run
            # of siblings inside it.
            container = document.createElement('ul')
            li.parentNode.replaceChild(container, li)
            print "!! Adding missing ul", obj
            container.appendChild(li)
            for child in sibling_items:
                if child != li:
                    container.appendChild(child)
            dom_modified = True

    # Handle missing <li>
    for ul in document.getElementsByTagName('ul'):
        for child in ul.childNodes:
            # Wrap bare comment/non-blank-text children of <ul> in their own <li>.
            if isinstance(child, minidom.Comment) or isinstance(
                    child, minidom.Text) and not child.data.isspace():
                print "!! Adding missing li", obj
                li = document.createElement('li')
                ul.insertBefore(li, child)
                ul.removeChild(child)
                li.appendChild(child)
                dom_modified = True

    # Markdown doesn't allow paragraph inside a list
    for ul in chain(document.getElementsByTagName('ul'),
                    document.getElementsByTagName('ol')):
        for li in ul.childNodes:
            if not isinstance(li, (minidom.Text, minidom.Comment)):
                if li.getElementsByTagName('p'):
                    print "!! Cannot convert to markdown because a list contains a paragraph:", obj
                    convert_to_markdown = False

    # Markdown doesn't like a bold or italic section to start or end with a whitespace
    for element in chain(document.getElementsByTagName('em'),
                         document.getElementsByTagName('strong'),
                         document.getElementsByTagName('i'),
                         document.getElementsByTagName('b')):
        # Move leading whitespace out of the emphasis element.
        first_text_node = _get_first_text_node(element)
        if first_text_node is not None:
            whitespace, text = _split_leading_whitespace(first_text_node.data)
            if whitespace:
                print "!! Moving leading whitespace outside of the element:", obj
                first_text_node.data = text
                element.parentNode.insertBefore(
                    document.createTextNode(whitespace), element)
                dom_modified = True
        # Move trailing whitespace out of the emphasis element.
        last_text_node = _get_last_text_node(element)
        if last_text_node is not None:
            text, whitespace = _split_trailing_whitespace(last_text_node.data)
            if whitespace:
                print "!! Moving trailing whitespace outside of the element:", obj, whitespace.__repr__(
                )
                last_text_node.data = text
                _insertAfter(document.createTextNode(whitespace), element)
                dom_modified = True

    # Our markdown render don't handle * and _ followed by a whitespace after a newline correctly.
    # The descriptions using those sequences of characters will be left as HTML. When we use a
    # spec-compliant Markdown renderer those descriptions could be converted to Markdown
    #
    # For instance the following is not rendered properly:
    # Something
    # * a
    # * b
    # * c
    if _contains_problematic_space_near_delimiter("".join([
            x.toxml()
            for x in document.getElementsByTagName('body')[0].childNodes
    ])):
        print "!! Cannot convert to markdown because the description contains '_ ' or '* ' " \
              "at a position which is very likely to trigger bugs in our markdown renderer." \
              "(It is likely that this description is not being rendered properly in its current form.)", obj
        # I'm pretty sure that converting such descriptions now yields CommonMark-compliant Markdown but proper
        # testing with a compliant python parser has not been performed. I'm leaving those description as
        # HTML for now which should make it easier to find them when switching to a compliant Markdown renderer.
        convert_to_markdown = False

    if convert_to_markdown and dom_modified:
        # Serialize only the children of <body> (drops html5lib's wrapper tags).
        result = "".join([
            x.toxml()
            for x in document.getElementsByTagName('body')[0].childNodes
        ])
    else:
        result = input_html

    return result, convert_to_markdown
Exemplo n.º 33
0
#!/usr/local/bin/python
# Scrape an ex.ua listing page and print its download links as a
# PLS-style playlist ("[playlist]" header, FileN/TitleN pairs).
# NOTE(review): Python 2 script (urllib2, print statement).

import html5lib
import urllib2
import lxml
from lxml.html import tostring
from lxml import etree

# NOTE(review): `parser` is created but never used -- the html5lib.parse
# call below does the actual parsing.
parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("lxml"))
f = urllib2.urlopen("http://ex.ua/view/14475479").read()
doc = html5lib.parse(f,treebuilder="lxml" , namespaceHTMLElements=False)
root = doc.getroot()

# XPath selecting the download anchors inside the page's listing table.
fnd_p = etree.XPath("/html/body/table/tbody/tr/td/table[@class='list']/tbody/tr/td/a[@rel='nofollow']", namespaces = {"html": "http://www.w3.org/1999/xhtml"})
print "[playlist]"
item = 1;
for a in fnd_p(root):
    # One FileN/TitleN pair per anchor; hrefs are site-relative.
    print("File"+str(item)+"=http://ex.ua"+a.get('href'));
    print("Title"+str(item)+"="+a.get("title"))
    item=item+1;
Exemplo n.º 34
0
# Demo: parsing and querying HTML with html5lib backed by lxml trees.
document = '<html><head><title>Test</title></head><body><h1 align="center">Big data news</h1><h1 align="center">AI news</h1><h1 align="right">2018.8.1</h1></body></html>'
# Call html5lib.parse directly, building the tree with lxml
content = html5lib.parse(document,
                         treebuilder="lxml",
                         namespaceHTMLElements=False)
# XPath to the tags containing the content to extract
rows = content.xpath('/html/body/h1')
for row in rows:
    t = row.xpath('./text()')[0]  # once at the tag node, extract its content via text()
    print(t)

print('通过指定tree来解析:')
document = '<html><head><title>Test</title></head><body><h1 align="center">Big data news</h1><h1 align="center">AI news</h1><h1 align="right">2018.8.1</h1></body></html>'
# Build an HTMLParser instance configured to construct an lxml tree
p = html5lib.HTMLParser(strict=False,
                        tree=html5lib.getTreeBuilder('lxml'),
                        namespaceHTMLElements=False)
# Parse the HTML document
t = p.parse(document)
rows = t.xpath('/html/body/h1')
for row in rows:
    t = row.xpath('./text()')[0]
    print(t)

print('通过指定tree来提取超链接:')
document = '<html><head><title>Test</title></head><body><a href="www.baidu.com">baidu</body></html>'
p = html5lib.HTMLParser(strict=False,
                        tree=html5lib.getTreeBuilder('lxml'),
                        namespaceHTMLElements=False)
t = p.parse(document)
# Find all nodes with tag name 'a' via findall
# NOTE(review): the snippet is truncated here -- the findall code is missing.
Exemplo n.º 35
0
# HTML tags whose whitespace is significant per HTML semantics.
SPACE_PRESERVING_TAGS = frozenset({
    'pre',
    'style',
    'script',
    'textarea',
})

# Return *e* itself if its tag already is *tag*, otherwise its first
# matching subelement (via Element.find).
_ensure = lambda e, tag: e.find(tag) if e.tag != tag else e

# HTML5 serialization setup
# Shared walker/serializer pair: keep optional tags and entities as written.
_tree_walker = html5lib.getTreeWalker("etree", implementation=etree)
_serializer = html5lib.serializer.HTMLSerializer(omit_optional_tags=False,
                                                 resolve_entities=False)

# HTML5 parsing setup
# Shared parser building ElementTree trees without the XHTML namespace.
_tree_builder = html5lib.getTreeBuilder("etree", implementation=etree)
_parser = html5lib.HTMLParser(_tree_builder, namespaceHTMLElements=False)
# FIX for HTMLParser.reset():
if not hasattr(_parser, "innerHTMLMode"):
    # add the missing attribute, as otherwise calling .reset() would raise an AttributeError
    _parser.innerHTMLMode = None


def Root(title=None, encoding=None) -> Element:
    root = Element(ROOT_TAG)
    head = SubElement(root, HEAD_TAG)

    if title is not None:
        assert isinstance(title, str), 'title not a string'
        SubElement(head, TITLE_TAG).text = title
Exemplo n.º 36
0
# Demo of the main html5lib entry points: parse, tree builders, walkers,
# the serializer, and the sanitizer filter.
import html5lib

# Parse a string into the default (etree) tree.
document1 = html5lib.parse("<p>Hello World!</p>")
print(document1)

from urllib.request import urlopen

# Parse a live HTTP response, forwarding the charset the server declared.
with urlopen("http://www.google.com/") as f:
    document2 = html5lib.parse(
        f, transport_encoding=f.info().get_content_charset())
    print(document2)

# Same parse via an explicit HTMLParser that builds a DOM (minidom) tree.
document3 = html5lib.HTMLParser(
    tree=html5lib.getTreeBuilder("dom")).parse("<p>Hello World!</p>")
print(document3)

# Round-trip: parse, walk the tree, and re-serialize chunk by chunk.
element = html5lib.parse('<p>Hello World!</p>')
walker = html5lib.getTreeWalker("etree")
stream = walker(element)
s = html5lib.serializer.HTMLSerializer().serialize(stream)
for i in s:
    print(i)

from html5lib.filters import sanitizer

# Wrap the token stream in the sanitizer filter.  Note this prints the
# (lazy) filter object itself, not the sanitized markup.
dom = html5lib.parse("<script>alert('warning!')</script>", treebuilder="dom")
walker = html5lib.getTreeWalker("dom")
clean_stream = sanitizer.Filter(walker(dom))
print(clean_stream)
Exemplo n.º 37
0
def parse_for_footnotes(article_generator):
    """Turn ``[ref]...[/ref]`` markers in each article into numbered footnotes.

    Each ``[ref]`` span is replaced by a superscript link, and its content is
    moved into an ``<ol class="simple-footnotes">`` appended at the end of the
    article body, with a back-link from each note to its reference mark.
    Markers nested inside tags listed in RAW_FOOTNOTE_CONTAINERS are left
    untouched.  Mutates ``article._content`` in place.
    """
    for article in article_generator.articles:
        if "[ref]" in article._content and "[/ref]" in article._content:
            # Rewrite the markers as a custom element so an HTML parser can
            # locate them as DOM nodes.
            content = article._content.replace("[ref]",
                                               "<x-simple-footnote>").replace(
                                                   "[/ref]",
                                                   "</x-simple-footnote>")
            parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
            dom = parser.parse(content)
            endnotes = []
            count = 0
            for footnote in dom.getElementsByTagName("x-simple-footnote"):
                # Walk up the ancestor chain; skip footnotes inside containers
                # whose raw content must be preserved.
                pn = footnote
                leavealone = False
                while pn:
                    if pn.nodeName in RAW_FOOTNOTE_CONTAINERS:
                        leavealone = True
                        break
                    pn = pn.parentNode
                if leavealone:
                    continue
                count += 1
                fnid = "sf-%s-%s" % (article.slug, count)
                fnbackid = "%s-back" % (fnid, )
                endnotes.append((footnote, fnid, fnbackid))
                # Insert the clickable superscript number just before the
                # footnote node; its title holds the footnote's plain text.
                number = dom.createElement("sup")
                number.setAttribute("id", fnbackid)
                numbera = dom.createElement("a")
                numbera.setAttribute("href", "#%s" % fnid)
                numbera.setAttribute("class", "simple-footnote")
                numbera.appendChild(dom.createTextNode(str(count)))
                txt = getText(footnote, recursive=True).replace("\n", " ")
                numbera.setAttribute("title", txt)
                number.appendChild(numbera)
                footnote.parentNode.insertBefore(number, footnote)
            if endnotes:
                # Build the endnote list, moving each footnote's children into
                # a <li> with a back-link, then drop the emptied marker node.
                ol = dom.createElement("ol")
                ol.setAttribute("class", "simple-footnotes")
                for e, fnid, fnbackid in endnotes:
                    li = dom.createElement("li")
                    li.setAttribute("id", fnid)
                    while e.firstChild:
                        li.appendChild(e.firstChild)
                    backlink = dom.createElement("a")
                    backlink.setAttribute("href", "#%s" % fnbackid)
                    backlink.setAttribute("class", "simple-footnote-back")
                    backlink.appendChild(dom.createTextNode(u'\u21a9'))
                    li.appendChild(dom.createTextNode(" "))
                    li.appendChild(backlink)
                    ol.appendChild(li)
                    e.parentNode.removeChild(e)
                dom.getElementsByTagName("body")[0].appendChild(ol)
                # NOTE(review): the ``htmlserializer`` submodule and the
                # boolean ``quote_attr_values`` are pre-1.0 html5lib APIs --
                # confirm the pinned html5lib version before upgrading.
                s = html5lib.serializer.htmlserializer.HTMLSerializer(
                    omit_optional_tags=False, quote_attr_values=True)
                output_generator = s.serialize(
                    html5lib.treewalkers.getTreeWalker("dom")(
                        dom.getElementsByTagName("body")[0]))
                # Restore the original marker syntax and strip the <body>
                # wrapper that html5lib adds around fragments.
                article._content = "".join(list(output_generator)).replace(
                    "<x-simple-footnote>",
                    "[ref]").replace("</x-simple-footnote>", "[/ref]").replace(
                        "<body>", "").replace("</body>", "")
        # Removed: an unreachable ``if False:`` block that referenced an
        # undefined name ``footnotes`` (dead legacy implementation).
Exemplo n.º 38
0
    https://github.com/sphinx-doc/sphinx/pull/2805/files

    :copyright: Copyright 2007-2017 by the Sphinx team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import xml.etree.cElementTree as ElementTree

import pytest
from html5lib import getTreeBuilder, HTMLParser

from sphinx.util.docutils import is_html5_writer_available

from test_build_html import flat_dict, tail_check, check_xpath

# Shared html5lib parser that builds cElementTree trees without the
# XHTML namespace, so tests can use simple un-prefixed element lookups.
TREE_BUILDER = getTreeBuilder('etree', implementation=ElementTree)
HTML_PARSER = HTMLParser(TREE_BUILDER, namespaceHTMLElements=False)

# Cache of parsed trees keyed by file name (used by cached_etree_parse).
etree_cache = {}


@pytest.mark.skipif(not is_html5_writer_available(),
                    reason='HTML5 writer is not available')
@pytest.fixture(scope='module')
def cached_etree_parse():
    def parse(fname):
        if fname in etree_cache:
            return etree_cache[fname]
        with (fname).open('rb') as fp:
            etree = HTML_PARSER.parse(fp)
            etree_cache.clear()
Exemplo n.º 39
0
def purify_html(input_html, obj):
    """Clean up *input_html* so that it can (maybe) be converted to Markdown.

    Returns ``(result, convert_to_markdown)``: *result* is the serialized
    contents of <body> when the DOM was modified and conversion is still
    allowed, otherwise the original *input_html* unchanged;
    *convert_to_markdown* tells the caller whether Markdown conversion is
    considered safe.  *obj* is only used to label diagnostic messages.

    NOTE(review): Python 2 code -- uses print statements.
    """
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
    document = parser.parse(input_html)

    # Assume conversion is possible until a blocker is found below.
    convert_to_markdown = True
    dom_modified = False

    # Group consecutive orphaned <li>s inside a <ul>
    for li in document.getElementsByTagName('li'):
        # Walk up to <body> checking whether the <li> sits inside a list.
        node = li
        has_proper_parent = False
        while node.tagName != 'body' and node.parentNode:
            node = node.parentNode
            if node.tagName in {'ul', 'ol'}:
                has_proper_parent = True

        if not has_proper_parent:
            # Collect this <li> and its run of consecutive <li> siblings.
            sibling_items = []
            sibling = li
            while sibling and sibling.nodeType == minidom.Node.ELEMENT_NODE and sibling.tagName == 'li':
                sibling_items.append(sibling)
                sibling = sibling.nextSibling
                # jump over empty nodes
                while sibling and sibling.nodeType == minidom.Node.TEXT_NODE and sibling.data.isspace():
                    sibling = sibling.nextSibling

            # Swap the first orphan for a fresh <ul>, then move the whole run
            # of siblings inside it.
            container = document.createElement('ul')
            li.parentNode.replaceChild(container, li)
            print "!! Adding missing ul", obj
            container.appendChild(li)
            for child in sibling_items:
                if child != li:
                    container.appendChild(child)
            dom_modified = True

    # Handle missing <li>
    for ul in document.getElementsByTagName('ul'):
        for child in ul.childNodes:
            # Wrap bare comment/non-blank-text children of <ul> in their own <li>.
            if isinstance(child, minidom.Comment) or isinstance(child, minidom.Text) and not child.data.isspace():
                print "!! Adding missing li", obj
                li = document.createElement('li')
                ul.insertBefore(li, child)
                ul.removeChild(child)
                li.appendChild(child)
                dom_modified = True

    # Markdown doesn't allow paragraph inside a list
    for ul in chain(document.getElementsByTagName('ul'), document.getElementsByTagName('ol')):
        for li in ul.childNodes:
            if not isinstance(li, (minidom.Text, minidom.Comment)):
                if li.getElementsByTagName('p'):
                    print "!! Cannot convert to markdown because a list contains a paragraph:", obj
                    convert_to_markdown = False

    # Markdown doesn't like a bold or italic section to start or end with a whitespace
    for element in chain(document.getElementsByTagName('em'), document.getElementsByTagName('strong'),
                         document.getElementsByTagName('i'),  document.getElementsByTagName('b')):
        # Move leading whitespace out of the emphasis element.
        first_text_node = _get_first_text_node(element)
        if first_text_node is not None:
            whitespace, text = _split_leading_whitespace(first_text_node.data)
            if whitespace:
                print "!! Moving leading whitespace outside of the element:", obj
                first_text_node.data = text
                element.parentNode.insertBefore(document.createTextNode(whitespace), element)
                dom_modified = True
        # Move trailing whitespace out of the emphasis element.
        last_text_node = _get_last_text_node(element)
        if last_text_node is not None:
            text, whitespace = _split_trailing_whitespace(last_text_node.data)
            if whitespace:
                print "!! Moving trailing whitespace outside of the element:", obj, whitespace.__repr__()
                last_text_node.data = text
                _insertAfter(document.createTextNode(whitespace), element)
                dom_modified = True

    # Our markdown render don't handle * and _ followed by a whitespace after a newline correctly.
    # The descriptions using those sequences of characters will be left as HTML. When we use a
    # spec-compliant Markdown renderer those descriptions could be converted to Markdown
    #
    # For instance the following is not rendered properly:
    # Something
    # * a
    # * b
    # * c
    if _contains_problematic_space_near_delimiter("".join([x.toxml() for x
                                                           in document.getElementsByTagName('body')[0].childNodes])):
        print "!! Cannot convert to markdown because the description contains '_ ' or '* ' " \
              "at a position which is very likely to trigger bugs in our markdown renderer." \
              "(It is likely that this description is not being rendered properly in its current form.)", obj
        # I'm pretty sure that converting such descriptions now yields CommonMark-compliant Markdown but proper
        # testing with a compliant python parser has not been performed. I'm leaving those description as
        # HTML for now which should make it easier to find them when switching to a compliant Markdown renderer.
        convert_to_markdown = False

    if convert_to_markdown and dom_modified:
        # Serialize only the children of <body> (drops html5lib's wrapper tags).
        result = "".join([x.toxml() for x in document.getElementsByTagName('body')[0].childNodes])
    else:
        result = input_html

    return result, convert_to_markdown
Exemplo n.º 40
0
                print(row[0].text + " : " + row[1][1].text)


def parseModelList(url):
    """Fetch the JSON model feed at *url* and process every model in it.

    Prints each model's name and hands its detail-page URL to parseModel.
    Assumes the feed decodes as UTF-8 and has a top-level 'models' list
    whose entries carry 'model_name' and a site-relative 'detail_url'.
    """
    print()
    with urlopen(url) as f:
        string = f.read().decode('utf-8')
        json_obj = json.loads(string)
        for model in json_obj['models']:
            model_url = 'http://www.cat.com/' + model['detail_url']
            print(model['model_name'])
            parseModel(model_url)


# Crawl the cat.com equipment listing: for each category selector found on
# the page, derive the category's JSON feed URL and process its models.
with urlopen("http://www.cat.com/en_GB/products/new/equipment.html") as f:
    # Parse with the charset declared by the server.
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("lxml"))
    document = parser.parse(f,
                            transport_encoding=f.info().get_content_charset())
    # Category selector blocks (XHTML-namespaced divs).
    find_btn = etree.XPath(
        "//html:div[@class='span3 selector class-selector']",
        namespaces={"html": "http://www.w3.org/1999/xhtml"})
    for item in find_btn(document):
        url = "http://www.cat.com" + item[0].attrib['href']
        # Category label pulled from a fixed position in the markup --
        # fragile if the page layout changes.
        text = item[0][0][0][1].text
        #print(url)
        print(text)
        # Derive the category name from the last path segment of the URL.
        url_arr = str.split(url, '/')
        name = str.split(url_arr[-1], '.')[0]
        model_url = 'http://www.cat.com/en_GB/products/new/equipment/' + name + '/_jcr_content.feed.json'
        #print(name)
        parseModelList(model_url)
Exemplo n.º 41
0
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import html5lib

from flask import render_template_string

from .. import TestCase, WebTestMixin

from udata.frontend.markdown import md, init_app, EXCERPT_TOKEN

parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))


class MarkdownTestCase(TestCase, WebTestMixin):
    """Tests for the udata markdown filter and helpers."""

    def create_app(self):
        """Create the Flask test app with the markdown extension initialized."""
        app = super(MarkdownTestCase, self).create_app()
        init_app(app)
        return app

    def test_excerpt_is_not_removed(self):
        """The EXCERPT_TOKEN marker must survive markdown rendering."""
        with self.app.test_request_context('/'):
            self.assertEqual(md(EXCERPT_TOKEN).strip(), EXCERPT_TOKEN)

    def test_markdown_filter_with_none(self):
        '''Markdown filter should not fail with None'''
        text = None
        with self.app.test_request_context('/'):
            result = render_template_string('{{ text|markdown }}', text=text)

        self.assertEqual(result, '')