def parse(self, response):
    # Parse an article page.
    if self.page_count > 0:
        item = WechatItem()
        item['date'] = self.today
        item['title'] = response.xpath(
            'normalize-space(//h2[@class="rich_media_title"]/text())'
        ).extract()[0]
        item['html'] = response.xpath(
            '//div[@class="rich_media_content "]').extract()[0]
        # Re-parse with html5lib to fix incomplete tags.
        parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
        xml = parser.parse(item['html']).toxml()
        # Convert the cleaned-up XML to Markdown.
        h2m = Html2Markdown()
        h2m.feed(xml)
        h2m.close()
        item['markdown'] = h2m.output
        yield item
    else:
        # Parse the article list.
        for x in response.xpath('//div[@class="weui_media_bd"]'):
            # Only keep today's articles.
            date = x.xpath(
                'normalize-space(p[@class="weui_media_extra_info"]/text())'
            ).extract()[0]
            date = date.replace('年', '/').replace('月', '/').replace('日', '')
            if date == '2019/7/2':
                self.page_count += 1
        # Request the next page if needed.
        if self.page_index < self.page_count:
            yield Request(url=response.url, callback=self.parse)
def parse_for_footnotes(article_or_page_generator):
    all_content = [
        getattr(article_or_page_generator, attr, None)
        for attr in [u'articles', u'drafts', u'pages']]
    all_content = [x for x in all_content if x is not None]
    for article in sequence_gen(all_content):
        if u"[ref]" in article._content and u"[/ref]" in article._content:
            content = article._content.replace(u"[ref]", u"<x-simple-footnote>").replace(
                u"[/ref]", u"</x-simple-footnote>")
            parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder(u"dom"))
            dom = parser.parse(content)
            endnotes = []
            count = 0
            for footnote in dom.getElementsByTagName(u"x-simple-footnote"):
                # Skip footnotes nested inside raw containers (e.g. <pre>).
                pn = footnote
                leavealone = False
                while pn:
                    if pn.nodeName in RAW_FOOTNOTE_CONTAINERS:
                        leavealone = True
                        break
                    pn = pn.parentNode
                if leavealone:
                    continue
                count += 1
                fnid = u"sf-%s-%s" % (article.slug, count)
                fnbackid = u"%s-back" % (fnid,)
                endnotes.append((footnote, fnid, fnbackid))
                # Insert a superscript link to the footnote before its marker.
                number = dom.createElement(u"sup")
                number.setAttribute(u"id", fnbackid)
                numbera = dom.createElement(u"a")
                numbera.setAttribute(u"href", u"#%s" % fnid)
                numbera.setAttribute(u"class", u"simple-footnote")
                numbera.appendChild(dom.createTextNode(six.text_type(count)))
                txt = getText(footnote, recursive=True).replace(u"\n", u" ")
                numbera.setAttribute(u"title", txt)
                number.appendChild(numbera)
                footnote.parentNode.insertBefore(number, footnote)
            if endnotes:
                # Move the footnote bodies into an ordered list at the end of <body>.
                ol = dom.createElement(u"ol")
                ol.setAttribute(u"class", u"simple-footnotes")
                for e, fnid, fnbackid in endnotes:
                    li = dom.createElement(u"li")
                    li.setAttribute(u"id", fnid)
                    while e.firstChild:
                        li.appendChild(e.firstChild)
                    backlink = dom.createElement(u"a")
                    backlink.setAttribute(u"href", u"#%s" % fnbackid)
                    backlink.setAttribute(u"class", u"simple-footnote-back")
                    backlink.appendChild(dom.createTextNode(u'\u21a9'))
                    li.appendChild(dom.createTextNode(u" "))
                    li.appendChild(backlink)
                    ol.appendChild(li)
                    e.parentNode.removeChild(e)
                dom.getElementsByTagName(u"body")[0].appendChild(ol)
            s = html5lib.serializer.HTMLSerializer(omit_optional_tags=False,
                                                   quote_attr_values='legacy')
            output_generator = s.serialize(
                html5lib.treewalkers.getTreeWalker(u"dom")(
                    dom.getElementsByTagName(u"body")[0]))
            article._content = u"".join(list(output_generator)).replace(
                u"<x-simple-footnote>", u"[ref]").replace(u"</x-simple-footnote>", u"[/ref]").replace(
                u"<body>", u"").replace(u"</body>", u"")
async def taskTx(sock, message, mtype):
    # A simple output coroutine: sanitize the message before sending it.
    global revertProtocol
    tb = html5lib.getTreeBuilder("dom")
    parser = html5lib.HTMLParser(tree=tb)
    tw = html5lib.getTreeWalker("dom")
    parsedTX = parser.parseFragment(message)
    cleanTX = sanitizer.Filter(tw(parsedTX))
    s = html5lib.serializer.HTMLSerializer()
    tx = ''.join(s.serialize(cleanTX))
    if message == b"200":
        await sock.send("Goodbye.")
        await sock.close()
        return
    if message == b"202":
        await sock.send("Authentication Successful, you are now the admin terminal.")
    elif revertProtocol:
        await sock.send(tx)
    else:
        await sock.send(json.dumps({"MSG_TYPE": mtype, "MSG": tx}))
def parse_for_links(article_generator):
    prefix = 'L'
    for article in article_generator.articles:
        links = []
        parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
        dom = parser.parse(article._content)
        for link in dom.getElementsByTagName("a"):
            href = link.getAttribute('href')
            if len(href) == 0 or href[0] == '#':
                continue  # do not print internal links
            if href in links:
                index = links.index(href) + 1
            else:
                links.append(href)
                index = len(links)
            sup = dom.createElement("sup")
            sup.setAttribute("class", "print")
            sup.appendChild(dom.createTextNode(prefix + str(index)))
            if link.nextSibling:
                link.parentNode.insertBefore(sup, link.nextSibling)
            else:
                link.parentNode.appendChild(sup)
        if links == []:
            continue

        # Links Title
        links_title = dom.createElement("h2")
        links_title.setAttribute("class", "print")
        links_title.appendChild(dom.createTextNode("Links"))
        dom.getElementsByTagName("body")[0].appendChild(links_title)

        # Actual Links
        links_div = dom.createElement("div")
        links_div.setAttribute("class", "print")
        links_div.setAttribute("style", "margin-left: 2.0em;")
        link_list = dom.createElement("ol")
        link_list.setAttribute("class", "print-links")
        for link in links:
            li = dom.createElement("li")
            li.appendChild(dom.createTextNode(link))
            link_list.appendChild(li)
        links_div.appendChild(link_list)
        dom.getElementsByTagName("body")[0].appendChild(links_div)

        # Produce the output
        s = html5lib.serializer.HTMLSerializer(omit_optional_tags=False)
        output_generator = s.serialize(
            html5lib.treewalkers.getTreeWalker("dom")(
                dom.getElementsByTagName("body")[0]))
        article._content = "".join(list(output_generator)).replace(
            "<body>", "").replace("</body>", "")
def _has_element(tag, file_string):
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder('dom'))
    minidom_document = parser.parse(file_string)
    return bool(minidom_document.getElementsByTagName(tag))
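# A minimal usage sketch for _has_element; the markup strings below are
# illustrative, not taken from the original project.
assert _has_element('p', '<div><p>hi</p></div>')
assert not _has_element('table', '<div><p>hi</p></div>')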
def doc4url(url):
    # Parse the page at `url` into an lxml tree and return its root element.
    builder = html5lib.getTreeBuilder('lxml')
    parser = html5lib.HTMLParser(builder, namespaceHTMLElements=False)
    try:
        doc = parser.parse(urllib2.urlopen(url).read())
    except Exception:
        return None
    return doc.getroot()
def parse_html(html, wrapper_element='div', wrapper_class='diff'):
    """Parse an HTML fragment into a Genshi stream."""
    builder = html5lib.getTreeBuilder('etree')
    parser = html5lib.HTMLParser(tree=builder)
    tree = parser.parseFragment(html)
    tree.tag = wrapper_element
    if wrapper_class is not None:
        tree.set('class', wrapper_class)
    return ET(tree)
def html_parse(text):
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"), strict=True)
    try:
        return (parser.parse(text), tuple())
    except html5lib.html5parser.ParseError:
        # In strict mode the first error raises; parser.errors holds
        # ((line, col), error_code, datavars) tuples.
        return (None, ('Line: {:d} Character: {:d} Error: {}'.format(
            e[0][0], e[0][1], html5lib.constants.E[e[1]] % e[2])
            for e in parser.errors))
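# Hedged usage sketch: in strict mode even a missing doctype counts as a parse
# error, so a fully explicit document is needed for a clean result.
doc, errors = html_parse(
    '<!DOCTYPE html><html><head><title>t</title></head><body></body></html>')
assert doc is not None and errors == tuple()

bad_doc, bad_errors = html_parse('<p>stray</p></div>')  # unexpected </div>
assert bad_doc is None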
def extract():
    parser = html5lib.html5parser.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
    doc = parser.parse(open('current-work', "rb"), transport_encoding='utf-8')

    head = doc.getElementsByTagName('head')[0]
    # Iterate over a copy, since removeChild mutates childNodes; also skip
    # text nodes, which have no tagName.
    for n in list(head.childNodes):
        if n.nodeType == n.ELEMENT_NODE and n.tagName == 'script':
            head.removeChild(n)

    header = doc.getElementsByTagName('header')[0]
    # doc.getElementById('the-canvas') doesn't work (?!), so scan the <h4>s instead
    thecanvas = [n for n in doc.getElementsByTagName('h4')
                 if n.getAttribute('id') == 'the-canvas-element'][0]

    # Add copyright from https://html.spec.whatwg.org/multipage/acknowledgements.html#acknowledgments
    copy = doc.createElement('p')
    copy.setAttribute('class', 'copyright')
    copy.appendChild(doc.createTextNode(u'Parts of this specification are \xA9 Copyright 2004-2014 Apple Inc., Mozilla Foundation, and Opera Software ASA. You are granted a license to use, reproduce and create derivative works of this document.'))
    header.appendChild(copy)

    keep = [header, thecanvas]
    node = thecanvas.nextSibling
    while node.nodeName != 'nav':
        keep.append(node)
        node = node.nextSibling
    p = thecanvas.parentNode
    for n in p.childNodes[:]:
        if n not in keep:
            p.removeChild(n)
    for n in header.childNodes[3:-4]:
        header.removeChild(n)

    def make_absolute(url):
        match = re.match(r'(\w+:|#)', url)
        if match:
            return url
        elif url[0] == '/':
            return 'https://html.spec.whatwg.org' + url
        else:
            return 'https://html.spec.whatwg.org/multipage/' + url

    # Fix relative URLs
    for e in doc.getElementsByTagName('script'):
        e.setAttribute('src', make_absolute(e.getAttribute('src')))
    for e in doc.getElementsByTagName('iframe'):
        e.setAttribute('src', make_absolute(e.getAttribute('src')))
    for e in doc.getElementsByTagName('img'):
        e.setAttribute('src', make_absolute(e.getAttribute('src')))
    for e in doc.getElementsByTagName('a'):
        e.setAttribute('href', make_absolute(e.getAttribute('href')))

    # Convert to XHTML, because it's quicker to re-parse than HTML5
    doc.documentElement.setAttribute('xmlns', 'http://www.w3.org/1999/xhtml')
    doc.removeChild(doc.firstChild)  # remove the DOCTYPE

    # toxml(encoding=...) returns bytes, so write in binary mode
    open('current-work-canvas.xhtml', 'wb').write(doc.toxml(encoding='UTF-8'))
def parse_html(cls, html_string):
    """
    Parse the given HTML string and return a Genshi ET object containing the DOM tree.

    :param html_string:
    :return:
    """
    # TODO: take care of self._encoding
    builder = html5lib.getTreeBuilder('etree')
    parser = html5lib.HTMLParser(tree=builder)
    tree = parser.parseFragment(html_string)
    return ET(tree)
def get_options(template):
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
    doc = parser.parse(template)
    options = {}
    # Read print options from data attributes on the root <html> element.
    media_size = doc.documentElement.attributes.get('data-gbclient-media-size')
    options['media'] = media_size.value if media_size else '62mm'
    media_orientation = doc.documentElement.attributes.get('data-gbclient-orientation')
    if media_orientation:
        options['orientation-requested'] = media_orientation.value
    return options
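# Hedged usage sketch for get_options; the template markup is illustrative:
opts = get_options(
    '<!DOCTYPE html><html data-gbclient-media-size="29mm" '
    'data-gbclient-orientation="4"><body></body></html>')
assert opts == {'media': '29mm', 'orientation-requested': '4'}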
def parse(text):
    # Allow embedded iframes through the sanitizer.
    sanitizer.HTMLSanitizer.allowed_elements.extend(['iframe'])
    sanitizer.HTMLSanitizer.allowed_attributes.extend(
        ['scrolling', 'allowfullscreen', 'frameborder'])
    # First run through the Markdown parser
    text = markdown.markdown(text, extensions=["extra"], safe_mode=False)
    # Sanitize using html5lib (note: this uses the pre-1.0 html5lib
    # sanitizer-tokenizer API)
    bits = []
    parser = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
                                    tree=getTreeBuilder("dom"))
    for token in parser.parseFragment(text).childNodes:
        bits.append(token.toxml())
    return "".join(bits)
def save(self, *args, **kwargs):
    tree = html5lib.getTreeBuilder('dom')
    parser = html5lib.HTMLParser(tree=tree)
    dom = parser.parse(self.lead)
    if len(dom.getElementsByTagName('span')) == 0:
        # Wrap the first character of the lead paragraph in a dropcap <span>.
        element = dom.getElementsByTagName('p')[0]
        value = element.firstChild.nodeValue
        txt = dom.createTextNode(value[1:])
        dropcap = dom.createTextNode(value[0])
        span = dom.createElement('span')
        span.appendChild(dropcap)
        span.setAttribute('class', 'dropcap')
        element.replaceChild(span, element.firstChild)
        element.appendChild(txt)
        self.lead = element.toxml()
    super(ArticleIntro, self).save(*args, **kwargs)
def get_html_errors(filename):
    errors = {}
    with open(filename, "r") as f:
        parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"),
                                     strict=True)
        err = ""
        try:
            parser.parse(f)
        except html5lib.html5parser.ParseError:
            # parser.errors holds ((line, col), error_code, datavars) tuples.
            for e in parser.errors:
                err += "Line {0}: {1}: {2} \n".format(e[0][0], e[1], e[2])
        if err:
            errors["message"] = err
    return errors
def parseModel(url):
    with urlopen(url) as f:
        parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("lxml"))
        document = parser.parse(
            f, transport_encoding=f.info().get_content_charset())
    specs = etree.XPath(
        "//html:div[@class='productDetailSpec specifications']",
        namespaces={"html": "http://www.w3.org/1999/xhtml"})
    # Skip the first and last child of the specifications block.
    cuttedList = specs(document)[0][1:-1]
    for item in cuttedList:
        text = item[0][0][0][0].text
        table = item[0][1][0]
        for row in table:
            print(row[0].text + " : " + row[1][1].text)
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
          strip_comments=True):
    """Clean an HTML fragment of malicious content and return it.

    This function is a security-focused function whose sole purpose is to
    remove malicious content from a string such that it can be displayed as
    content in a web page.

    This function is not designed to transform content for use in
    non-web-page contexts.

    :arg text: the text to clean
    :arg tags: whitelist of allowed tags; defaults to ``bleach.ALLOWED_TAGS``
    :arg attributes: whitelist of allowed attributes; defaults to
        ``bleach.ALLOWED_ATTRIBUTES``
    :arg styles: whitelist of allowed css; defaults to ``bleach.ALLOWED_STYLES``
    :arg protocols: whitelist of allowed protocols for links; defaults to
        ``bleach.ALLOWED_PROTOCOLS``
    :arg strip: whether or not to strip disallowed elements
    :arg strip_comments: whether or not to strip HTML comments

    """
    if not text:
        return ''

    text = force_unicode(text)

    class s(BleachSanitizer):
        allowed_elements = tags
        allowed_attributes = attributes
        allowed_css_properties = styles
        allowed_protocols = protocols
        strip_disallowed_elements = strip
        strip_html_comments = strip_comments

    parser = html5lib.HTMLParser(tokenizer=s,
                                 tree=html5lib.getTreeBuilder("lxml"))
    return _render(parser.parseFragment(text))
def parse_referat(html):
    """
    Using the html5lib library (https://html5lib.readthedocs.io/en/latest/),
    split the text of the referat into blocks and put them in a dict:
    the contents of the <div> tag go into 'topic' (the topic),
    the contents of the <strong> tag go into 'title' (the title),
    and the contents of the <p> tags go, as separate strings, into the
    'content' list (the body text).
    """
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
    dom_tree = parser.parse(html)
    topic = dom_tree.getElementsByTagName('div')[0].firstChild.nodeValue
    title = dom_tree.getElementsByTagName('strong')[0].firstChild.nodeValue
    content = []
    p_tags = dom_tree.getElementsByTagName('p')
    for p_tag in p_tags:
        content.append(p_tag.firstChild.nodeValue)
    return {'topic': topic, 'title': title, 'content': content}
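# Hedged usage sketch; the referat markup below is made up for illustration:
referat = parse_referat(
    '<div>History</div><strong>On Parsing</strong><p>First.</p><p>Second.</p>')
assert referat == {'topic': 'History', 'title': 'On Parsing',
                   'content': ['First.', 'Second.']}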
def collect_remote_info() -> Dict[str, str]:
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
    domain_data = requests.get(domain_url)
    parsed = parser.parse(domain_data.text)
    nodes = parsed.getElementsByTagName('tbody')[0]
    nodes = nodes.childNodes
    head = True
    remote_data = {}
    for node in nodes:
        # Skip whitespace text nodes between rows.
        if node.nodeType == 3:
            continue
        # Skip the header row.
        if head:
            head = False
            continue
        # node <tr> -> <td> -> <a> -> text node
        pkg_name = node.childNodes[1].childNodes[0].childNodes[0].nodeValue
        pkg_ver = node.childNodes[3].childNodes[0].childNodes[0].nodeValue
        remote_data[pkg_name] = pkg_ver
    return remote_data
def groupsParser():
    '''Insert all users from the community list into the users table.'''
    community = ('foto_history', 'ru_foto', 'prophotos_ru', 'foto_history', 'ru_travel')
    url = 'livejournal.com'
    args = '/profile/friendlist?socconns=friends&mode_full_socconns=1'
    connection = sqlite3.connect(getScriptPwd() + 'livejournal.db')
    cursor = connection.cursor()
    for comm in community:
        print 'Community: %s' % comm
        print 'http://%s.%s%s' % (comm, url, args)
        res = urlopen('http://%s.%s%s' % (comm, url, args))
        print 'Parsing html'
        parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("lxml"))
        dom = parser.parse(res)
        root = dom.getroot()
        counter = 0
        skipped = 0
        print 'Adding users to DB'
        for node in root.findall(".//html:a",
                                 namespaces={"html": "http://www.w3.org/1999/xhtml"}):
            try:
                # Use an unquoted placeholder and pass parameters as a tuple;
                # a quoted '?' would insert the literal string '?'.
                cursor.execute('INSERT INTO users (name) VALUES (?)', (node.text,))
                counter += 1
            except sqlite3.Error:
                skipped += 1
        print '%s users added' % counter
        print '%s users skipped' % skipped
        connection.commit()
    print 'Total users', connection.execute('select count(*) from users').fetchone()
    cursor.close()
    connection.close()
    :copyright: Copyright 2007-2018 by the Sphinx team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re
import xml.etree.cElementTree as ElementTree
from hashlib import md5

import pytest
from html5lib import getTreeBuilder, HTMLParser

from test_build_html import flat_dict, tail_check, check_xpath

from sphinx.util.docutils import is_html5_writer_available

TREE_BUILDER = getTreeBuilder('etree', implementation=ElementTree)
HTML_PARSER = HTMLParser(TREE_BUILDER, namespaceHTMLElements=False)

etree_cache = {}


@pytest.mark.skipif(not is_html5_writer_available(),
                    reason='HTML5 writer is not available')
@pytest.fixture(scope='module')
def cached_etree_parse():
    def parse(fname):
        if fname in etree_cache:
            return etree_cache[fname]
        with (fname).open('rb') as fp:
            etree = HTML_PARSER.parse(fp)
            etree_cache[fname] = etree
            return etree
    yield parse
    etree_cache.clear()
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import html5lib
import pytest

from flask import render_template_string

from udata.frontend.markdown import md, parse_html, EXCERPT_TOKEN
from udata.utils import faker

parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))


def assert_md_equal(value, expected):
    __tracebackhide__ = True
    expected = '<div class="markdown">{0}</div>'.format(expected)
    assert value.strip() == expected


@pytest.mark.frontend
class MarkdownTest:
    def test_excerpt_is_not_removed(self, app):
        with app.test_request_context('/'):
            assert_md_equal(md(EXCERPT_TOKEN), EXCERPT_TOKEN)

    def test_markdown_filter_with_none(self, app):
        '''Markdown filter should not fail with None'''
        text = None
        with app.test_request_context('/'):
            result = render_template_string('{{ text|markdown }}', text=text)
            assert result == ''
def fromstring(s):
    # Parse an HTML string into an lxml document without namespacing the tags.
    tb = html5lib.getTreeBuilder("lxml", implementation=etree)
    p = html5lib.HTMLParser(tb, namespaceHTMLElements=False)
    return p.parse(s)
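# Hedged usage sketch: with namespaceHTMLElements=False the returned lxml
# document can be queried with plain, un-namespaced XPath.
doc = fromstring('<p class="x">hello</p>')
assert doc.xpath('//p[@class="x"]/text()') == ['hello']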
    :copyright: Copyright 2007-2016 by the Sphinx team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import os
import re

from six import PY3, iteritems

from sphinx import __display_version__
from util import remove_unicode_literals, gen_with_app, with_app, strip_escseq
from etree13 import ElementTree
from html5lib import getTreeBuilder, HTMLParser

TREE_BUILDER = getTreeBuilder("etree", implementation=ElementTree)
HTML_PARSER = HTMLParser(TREE_BUILDER, namespaceHTMLElements=False)

ENV_WARNINGS = """\
(%(root)s/autodoc_fodder.py:docstring of autodoc_fodder.MarkupError:\\d+: \
WARNING: duplicate object description of autodoc_fodder.MarkupError, other \
instance in %(root)s/autodoc.rst, use :noindex: for one of them
)?%(root)s/autodoc_fodder.py:docstring of autodoc_fodder.MarkupError:\\d+: \
WARNING: Explicit markup ends without a blank line; unexpected unindent.
%(root)s/index.rst:\\d+: WARNING: Encoding 'utf-8-sig' used for reading included \
file u'%(root)s/wrongenc.inc' seems to be wrong, try giving an :encoding: option
%(root)s/index.rst:\\d+: WARNING: image file not readable: foo.png
%(root)s/index.rst:\\d+: WARNING: nonlocal image URI found: http://www.python.org/logo.png
%(root)s/index.rst:\\d+: WARNING: download file not readable: %(root)s/nonexisting.png
%(root)s/index.rst:\\d+: WARNING: invalid single index entry u''
%(root)s/undecodable.rst:\\d+: WARNING: undecodable source characters, replacing \
def parse_for_footnotes(article_generator):
    for article in article_generator.articles:
        if "[ref]" in article._content and "[/ref]" in article._content:
            content = article._content.replace("[ref]", "<x-simple-footnote>").replace(
                "[/ref]", "</x-simple-footnote>")
            parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
            dom = parser.parse(content)
            endnotes = []
            count = 0
            for footnote in dom.getElementsByTagName("x-simple-footnote"):
                # Skip footnotes nested inside raw containers.
                pn = footnote
                leavealone = False
                while pn:
                    if pn.nodeName in RAW_FOOTNOTE_CONTAINERS:
                        leavealone = True
                        break
                    pn = pn.parentNode
                if leavealone:
                    continue
                count += 1
                fnid = "sf-%s-%s" % (article.slug, count)
                fnbackid = "%s-back" % (fnid,)
                endnotes.append((footnote, fnid, fnbackid))
                number = dom.createElement("sup")
                number.setAttribute("id", fnbackid)
                numbera = dom.createElement("a")
                numbera.setAttribute("href", "#%s" % fnid)
                numbera.setAttribute("class", "simple-footnote")
                numbera.appendChild(dom.createTextNode(str(count)))
                txt = getText(footnote, recursive=True).replace("\n", " ")
                numbera.setAttribute("title", txt)
                number.appendChild(numbera)
                footnote.parentNode.insertBefore(number, footnote)
            if endnotes:
                ol = dom.createElement("ol")
                ol.setAttribute("class", "simple-footnotes")
                ol.appendChild(dom.createTextNode('Notes:'))
                for e, fnid, fnbackid in endnotes:
                    li = dom.createElement("li")
                    li.setAttribute("id", fnid)
                    while e.firstChild:
                        li.appendChild(e.firstChild)
                    backlink = dom.createElement("a")
                    backlink.setAttribute("href", "#%s" % fnbackid)
                    backlink.setAttribute("class", "simple-footnote-back")
                    backlink.appendChild(dom.createTextNode(u'\u21a9'))
                    li.appendChild(dom.createTextNode(" "))
                    li.appendChild(backlink)
                    ol.appendChild(li)
                    e.parentNode.removeChild(e)
                dom.getElementsByTagName("body")[0].appendChild(ol)
            # Note: this uses the pre-1.0 html5lib serializer API
            # (serializer.htmlserializer, boolean quote_attr_values).
            s = html5lib.serializer.htmlserializer.HTMLSerializer(
                omit_optional_tags=False, quote_attr_values=True)
            output_generator = s.serialize(
                html5lib.treewalkers.getTreeWalker("dom")(
                    dom.getElementsByTagName("body")[0]))
            article._content = "".join(list(output_generator)).replace(
                "<x-simple-footnote>", "[ref]").replace("</x-simple-footnote>", "[/ref]").replace(
                "<body>", "").replace("</body>", "")

        # Disabled legacy string-based implementation, kept for reference.
        if False:
            count = 0
            endnotes = []
            for f in footnotes:
                count += 1
                fnstr = '<a class="simple-footnote" name="%s-%s-back" href="#%s-%s"><sup>%s</a>' % (
                    article.slug, count, article.slug, count, count)
                endstr = '<li id="%s-%s">%s <a href="#%s-%s-back">↑</a></li>' % (
                    article.slug, count, f[len("[ref]"):-len("[/ref]")], article.slug, count)
                content = content.replace(f, fnstr)
                endnotes.append(endstr)
            content += '<h4>Footnotes</h4><ol class="simple-footnotes">%s</ol>' % (
                "\n".join(endnotes),)
            article._content = content
def purify_html(input_html, obj):
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
    document = parser.parse(input_html)
    convert_to_markdown = True
    dom_modified = False

    # Group consecutive orphaned <li>s inside a <ul>
    for li in document.getElementsByTagName('li'):
        node = li
        has_proper_parent = False
        while node.tagName != 'body' and node.parentNode:
            node = node.parentNode
            if node.tagName in {'ul', 'ol'}:
                has_proper_parent = True
        if not has_proper_parent:
            sibling_items = []
            sibling = li
            while sibling and sibling.nodeType == minidom.Node.ELEMENT_NODE and sibling.tagName == 'li':
                sibling_items.append(sibling)
                sibling = sibling.nextSibling
                # jump over empty nodes
                while sibling and sibling.nodeType == minidom.Node.TEXT_NODE and sibling.data.isspace():
                    sibling = sibling.nextSibling
            container = document.createElement('ul')
            li.parentNode.replaceChild(container, li)
            print "!! Adding missing ul", obj
            container.appendChild(li)
            for child in sibling_items:
                if child != li:
                    container.appendChild(child)
            dom_modified = True

    # Handle missing <li>
    for ul in document.getElementsByTagName('ul'):
        for child in ul.childNodes:
            if isinstance(child, minidom.Comment) or isinstance(child, minidom.Text) and not child.data.isspace():
                print "!! Adding missing li", obj
                li = document.createElement('li')
                ul.insertBefore(li, child)
                ul.removeChild(child)
                li.appendChild(child)
                dom_modified = True

    # Markdown doesn't allow a paragraph inside a list
    for ul in chain(document.getElementsByTagName('ul'),
                    document.getElementsByTagName('ol')):
        for li in ul.childNodes:
            if not isinstance(li, (minidom.Text, minidom.Comment)):
                if li.getElementsByTagName('p'):
                    print "!! Cannot convert to markdown because a list contains a paragraph:", obj
                    convert_to_markdown = False

    # Markdown doesn't like a bold or italic section that starts or ends with whitespace
    for element in chain(document.getElementsByTagName('em'),
                         document.getElementsByTagName('strong'),
                         document.getElementsByTagName('i'),
                         document.getElementsByTagName('b')):
        first_text_node = _get_first_text_node(element)
        if first_text_node is not None:
            whitespace, text = _split_leading_whitespace(first_text_node.data)
            if whitespace:
                print "!! Moving leading whitespace outside of the element:", obj
                first_text_node.data = text
                element.parentNode.insertBefore(document.createTextNode(whitespace), element)
                dom_modified = True
        last_text_node = _get_last_text_node(element)
        if last_text_node is not None:
            text, whitespace = _split_trailing_whitespace(last_text_node.data)
            if whitespace:
                print "!! Moving trailing whitespace outside of the element:", obj, whitespace.__repr__()
                last_text_node.data = text
                _insertAfter(document.createTextNode(whitespace), element)
                dom_modified = True

    # Our markdown renderer doesn't handle * and _ followed by a whitespace after
    # a newline correctly. Descriptions using those character sequences are left
    # as HTML. Once we use a spec-compliant Markdown renderer, those descriptions
    # could be converted to Markdown.
    #
    # For instance the following is not rendered properly:
    # Something
    # * a
    # * b
    # * c
    if _contains_problematic_space_near_delimiter("".join(
            [x.toxml() for x in document.getElementsByTagName('body')[0].childNodes])):
        print "!! Cannot convert to markdown because the description contains '_ ' or '* ' " \
              "at a position which is very likely to trigger bugs in our markdown renderer. " \
              "(It is likely that this description is not being rendered properly in its current form.)", obj
        # Converting such descriptions now probably yields CommonMark-compliant
        # Markdown, but this has not been tested with a compliant Python parser.
        # Leave them as HTML for now, which should make them easier to find when
        # switching to a compliant Markdown renderer.
        convert_to_markdown = False

    if convert_to_markdown and dom_modified:
        result = "".join([x.toxml()
                          for x in document.getElementsByTagName('body')[0].childNodes])
    else:
        result = input_html
    return result, convert_to_markdown
#!/usr/local/bin/python
import html5lib
import urllib2
from lxml import etree

f = urllib2.urlopen("http://ex.ua/view/14475479").read()
doc = html5lib.parse(f, treebuilder="lxml", namespaceHTMLElements=False)
root = doc.getroot()
fnd_p = etree.XPath(
    "/html/body/table/tbody/tr/td/table[@class='list']/tbody/tr/td/a[@rel='nofollow']",
    namespaces={"html": "http://www.w3.org/1999/xhtml"})

print "[playlist]"
item = 1
for a in fnd_p(root):
    print("File" + str(item) + "=http://ex.ua" + a.get('href'))
    print("Title" + str(item) + "=" + a.get("title"))
    item += 1
document = '<html><head><title>Test</title></head><body><h1 align="center">Big data news</h1><h1 align="center">AI news</h1><h1 align="right">2018.8.1</h1></body></html>'
# Call html5lib.parse directly, building the tree with lxml
content = html5lib.parse(document, treebuilder="lxml", namespaceHTMLElements=False)
# Give the tag path of the content to extract
rows = content.xpath('/html/body/h1')
for row in rows:
    # After locating the tag node, extract its content with text()
    t = row.xpath('./text()')[0]
    print(t)

print('Parsing by specifying the tree:')
document = '<html><head><title>Test</title></head><body><h1 align="center">Big data news</h1><h1 align="center">AI news</h1><h1 align="right">2018.8.1</h1></body></html>'
# Build an HTMLParser instance that constructs an lxml tree
p = html5lib.HTMLParser(strict=False, tree=html5lib.getTreeBuilder('lxml'),
                        namespaceHTMLElements=False)
# Parse the HTML document
t = p.parse(document)
rows = t.xpath('/html/body/h1')
for row in rows:
    t = row.xpath('./text()')[0]
    print(t)

print('Extracting hyperlinks by specifying the tree:')
document = '<html><head><title>Test</title></head><body><a href="www.baidu.com">baidu</a></body></html>'
p = html5lib.HTMLParser(strict=False, tree=html5lib.getTreeBuilder('lxml'),
                        namespaceHTMLElements=False)
t = p.parse(document)
# Find all nodes whose tag name is "a" via findall
SPACE_PRESERVING_TAGS = frozenset({
    'pre', 'style', 'script', 'textarea',
})

_ensure = lambda e, tag: e.find(tag) if e.tag != tag else e

# HTML5 serialization setup
_tree_walker = html5lib.getTreeWalker("etree", implementation=etree)
_serializer = html5lib.serializer.HTMLSerializer(omit_optional_tags=False,
                                                 resolve_entities=False)

# HTML5 parsing setup
_tree_builder = html5lib.getTreeBuilder("etree", implementation=etree)
_parser = html5lib.HTMLParser(_tree_builder, namespaceHTMLElements=False)
# FIX for HTMLParser.reset(): add the missing attribute, as otherwise calling
# .reset() would raise an AttributeError
if not hasattr(_parser, "innerHTMLMode"):
    _parser.innerHTMLMode = None


def Root(title=None, encoding=None) -> Element:
    root = Element(ROOT_TAG)
    head = SubElement(root, HEAD_TAG)
    if title is not None:
        assert isinstance(title, str), 'title not a string'
        SubElement(head, TITLE_TAG).text = title
import html5lib

# Parse with the default treebuilder (etree)
document1 = html5lib.parse("<p>Hello World!</p>")
print(document1)

from urllib.request import urlopen

# Parse a remote page, passing the declared charset through
with urlopen("http://www.google.com/") as f:
    document2 = html5lib.parse(
        f, transport_encoding=f.info().get_content_charset())
print(document2)

# The equivalent explicit HTMLParser, here with a minidom treebuilder
document3 = html5lib.HTMLParser(
    tree=html5lib.getTreeBuilder("dom")).parse("<p>Hello World!</p>")
print(document3)

# Serialize a parsed tree back to HTML
element = html5lib.parse('<p>Hello World!</p>')
walker = html5lib.getTreeWalker("etree")
stream = walker(element)
s = html5lib.serializer.HTMLSerializer().serialize(stream)
for i in s:
    print(i)

# Strip unsafe markup with the sanitizer filter
from html5lib.filters import sanitizer

dom = html5lib.parse("<script>alert('warning!')</script>", treebuilder="dom")
walker = html5lib.getTreeWalker("dom")
clean_stream = sanitizer.Filter(walker(dom))
print(clean_stream)
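# Hedged follow-up sketch: printing the Filter object only shows its repr.
# Rendering the filtered stream shows the sanitized markup itself; by default
# the sanitizer escapes disallowed elements rather than dropping them.
cleaned = html5lib.serializer.HTMLSerializer().render(clean_stream)
print(cleaned)  # the <script> tag comes back HTML-escaped, so it cannot execute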
def parseModelList(url):
    print()
    with urlopen(url) as f:
        string = f.read().decode('utf-8')
    json_obj = json.loads(string)
    for model in json_obj['models']:
        model_url = 'http://www.cat.com/' + model['detail_url']
        print(model['model_name'])
        parseModel(model_url)


with urlopen("http://www.cat.com/en_GB/products/new/equipment.html") as f:
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("lxml"))
    document = parser.parse(f, transport_encoding=f.info().get_content_charset())

find_btn = etree.XPath(
    "//html:div[@class='span3 selector class-selector']",
    namespaces={"html": "http://www.w3.org/1999/xhtml"})

for item in find_btn(document):
    url = "http://www.cat.com" + item[0].attrib['href']
    text = item[0][0][0][1].text
    print(text)
    url_arr = url.split('/')
    name = url_arr[-1].split('.')[0]
    model_url = 'http://www.cat.com/en_GB/products/new/equipment/' + name + '/_jcr_content.feed.json'
    parseModelList(model_url)
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import html5lib

from flask import render_template_string

from .. import TestCase, WebTestMixin

from udata.frontend.markdown import md, init_app, EXCERPT_TOKEN

parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))


class MarkdownTestCase(TestCase, WebTestMixin):
    def create_app(self):
        app = super(MarkdownTestCase, self).create_app()
        init_app(app)
        return app

    def test_excerpt_is_not_removed(self):
        with self.app.test_request_context('/'):
            self.assertEqual(md(EXCERPT_TOKEN).strip(), EXCERPT_TOKEN)

    def test_markdown_filter_with_none(self):
        '''Markdown filter should not fail with None'''
        text = None
        with self.app.test_request_context('/'):
            result = render_template_string('{{ text|markdown }}', text=text)
            self.assertEqual(result, '')