def _serialize(domtree):
    """Render an etree-based DOM tree to an HTML string.

    The tree is walked with html5lib's ``'etree'`` tree walker and rendered
    with attribute values quoted, attributes in alphabetical order, and
    optional tags kept in the output.
    """
    token_stream = html5lib.treewalkers.getTreeWalker('etree')(domtree)
    return HTMLSerializer(
        quote_attr_values=True,
        alphabetical_attributes=True,
        omit_optional_tags=False
    ).render(token_stream)
def serialize(self, output):
    """Write ``self.document`` to ``output`` as UTF-8 encoded HTML.

    The document is walked with html5lib's ``"dom"`` tree walker and
    serialized with optional tags omitted. Each serialized chunk is encoded
    to UTF-8 and passed to ``output.write`` as it is produced.
    """
    token_stream = html5lib.treewalkers.getTreeWalker("dom")(self.document)
    chunks = HTMLSerializer(omit_optional_tags=True).serialize(token_stream)
    for chunk in chunks:
        output.write(chunk.encode('utf-8'))
def app_filter_html_path_inplace(path, filters, log=None):
    """Filter the given HTML file (in-place) based on "app-*" class attributes.

    For example, the HTML might contain something like:

        <div class="app-ide">
            ...ide info...
        </div>
        <div class="app-edit">
            ...edit info...
        </div>

    If there are no filters, then the HTML is not changed (the file is not
    even opened). If the filters include "ide" but not "edit", then the ide
    div remains and the edit div is removed.

    Args:
        path: Path of the HTML file to filter; rewritten only if at least
            one element was dropped.
        filters: A ``set`` of filter names. A falsy value means "do nothing".
        log: Optional callable invoked as ``log(format, *args)`` to report
            what is being filtered.
    """
    if not filters:
        return
    if log:
        log("app-filter `%s'", path)

    # Parse the HTML file.
    with open(path) as f:
        tree = html5lib.parse(f, namespaceHTMLElements=False)

    # Filter out the unwanted elements.
    filtered = False
    assert isinstance(filters, set)
    # `Element.iter()` replaces `getiterator()`, which was removed from
    # xml.etree.ElementTree in Python 3.9.
    for elem in tree.iter():
        indices_to_drop = []
        # Iterating the element directly replaces `getchildren()` (also
        # removed in Python 3.9). Nothing is deleted inside this loop.
        for i, child in enumerate(elem):
            if not _should_drop_elem(child, filters, "class", "app-"):
                continue
            indices_to_drop.append(i)
            filtered = True
            if log:
                attrs = ''.join(
                    ' %s="%s"' % (n, v) for n, v in child.attrib.items())
                tag_str = "<%s%s>" % (child.tag, attrs)
                if len(tag_str) > 50:
                    tag_str = tag_str[:47] + '...'
                log("... filter out %s", tag_str)
        # Delete from the end so earlier indices stay valid.
        for idx in reversed(indices_to_drop):
            del elem[idx]

    # Write out any changes.
    if filtered:
        walker = treewalkers.getTreeWalker("etree", ET)
        content = ''.join(HTMLSerializer().serialize(walker(tree)))
        # `with` guarantees the handle is closed even if the first write
        # fails (the doctype write used to sit outside the try/finally).
        with open(path, 'w') as out:
            out.write("""<!DOCTYPE html> """)
            out.write(content)
def to_unicode(self):
    """Return the unicode serialization of myself.

    ``self._root`` is rendered with attribute values quoted and optional
    tags kept, then the leading ``len(CONTAINER_TAG) + 2`` characters and
    the trailing ``len(CONTAINER_TAG) + 3`` characters are sliced off.
    """
    # "<tag>" is two characters longer than the tag name; the slice drops
    # one more character at the end than at the start.
    open_len = len(self.CONTAINER_TAG) + 2
    close_len = open_len + 1

    token_stream = getTreeWalker(self.TREEBUILDER)(self._root)
    rendered = HTMLSerializer(
        quote_attr_values=True,
        omit_optional_tags=False
    ).render(token_stream)
    return rendered[open_len:-close_len]
def clean_nl(string):
    """Normalize newlines around block-level HTML so nl2br can be applied.

    The input is parsed as an HTML fragment. For ``blockquote``, ``ol``,
    ``li`` and ``ul`` elements, newlines directly inside the element and the
    first newline directly after it are removed. The tree is then serialized
    back to HTML. A falsy ``string`` is returned unchanged.
    """
    if not string:
        return string

    xhtml_ns = '{http://www.w3.org/1999/xhtml}'
    block_tags = [xhtml_ns + name for name in ('blockquote', 'ol', 'li', 'ul')]

    def strip_block_newlines(node):
        # An etree node carries:
        #   - text: the text before its first child,
        #   - tail: the text right after the node, before its next sibling,
        #   - children.
        # e.g. "<div>text <b>child text</b> child tail</div> tail".
        is_block = node.tag in block_tags

        if is_block:
            if node.text:
                # Leading newlines inside the block always go; trailing ones
                # go here only when there are no children (otherwise the
                # children's tails are handled in the loop below).
                text = node.text.lstrip('\n')
                if len(node) == 0:
                    text = text.rstrip('\n')
                node.text = text

            # Drop a single newline immediately following the block.
            tail = node.tail
            if tail and tail.startswith('\n'):
                node.tail = tail[1:]

        for child in node:
            # Inside a block, trailing newlines of each child's tail go.
            if is_block and child.tail:
                child.tail = child.tail.rstrip('\n')
            strip_block_newlines(child)
        return node

    fragment = strip_block_newlines(html5lib.parseFragment(string))

    # Turn the cleaned tree back into HTML.
    token_stream = html5lib.treewalkers.getTreeWalker('etree')(fragment)
    return HTMLSerializer(
        quote_attr_values=True,
        omit_optional_tags=False
    ).render(token_stream)
def sanitize_html(html):
    """Sanitize an HTML fragment and return it re-serialized.

    Falsy input yields ``u''``. ``bytes`` input is decoded as UTF-8 with
    undecodable bytes replaced. The fragment is parsed with html5lib using
    the ``HTMLSanitizer`` tokenizer and the lxml tree builder, then rendered
    with attribute values quoted, attribute order left as-is, and optional
    tags kept.
    """
    if not html:
        return u''

    text = html.decode('utf-8', 'replace') if isinstance(html, bytes) else html

    import html5lib
    from html5lib.sanitizer import HTMLSanitizer
    from html5lib.serializer.htmlserializer import HTMLSerializer
    from html5lib.treebuilders.etree_lxml import TreeBuilder
    from html5lib.treewalkers.lxmletree import TreeWalker

    sanitizing_parser = html5lib.HTMLParser(
        tokenizer=HTMLSanitizer, tree=TreeBuilder)
    fragment = sanitizing_parser.parseFragment(text)

    html_serializer = HTMLSerializer(
        quote_attr_values=True,
        alphabetical_attributes=False,
        omit_optional_tags=False
    )
    return html_serializer.render(TreeWalker(fragment))
def clean_nl(string):
    """
    Clean up newlines so that nl2br can properly be called on the cleaned
    text.

    The input is parsed as an HTML fragment, newlines around the block-level
    tags listed in ``html_blocks`` are trimmed, and the result is serialized
    back to HTML. A falsy ``string`` is returned unchanged; if parsing yields
    no child nodes, an empty string is returned.
    """

    # Tag names treated as block-level for newline stripping.
    html_blocks = ['blockquote', 'ol', 'li', 'ul']

    if not string:
        return string

    def parse_html(tree):
        # Name of the previously visited sibling, used to detect text that
        # immediately follows a block-level element.
        prev_tag = ''
        for i, node in enumerate(tree.childNodes):
            if node.type == 4:  # Text node
                value = node.value

                # Strip new lines directly inside block level elements.
                if node.parent.name in html_blocks:
                    value = value.strip('\n')

                # Remove the first new line after a block level element.
                if (prev_tag in html_blocks and value.startswith('\n')):
                    value = value[1:]

                tree.childNodes[i].value = value
            else:
                # Recurse into the child, then re-insert the processed node
                # before itself and remove one occurrence of it.
                # NOTE(review): this mutates tree.childNodes while it is
                # being enumerated; presumably the insert/remove pair nets
                # out to an unchanged child list -- confirm against the
                # simpletree implementation before reordering.
                tree.insertBefore(parse_html(node), node)
                tree.removeChild(node)

            prev_tag = node.name
        return tree

    parse = parse_html(html5lib.parseFragment(string))
    if not parse.childNodes:
        # The parser couldn't make sense of the given html, eg bad markup.
        return ''

    # Serialize the cleaned tree back to HTML.
    walker = html5lib.treewalkers.getTreeWalker('simpletree')
    stream = walker(parse)
    serializer = HTMLSerializer(quote_attr_values=True,
                                omit_optional_tags=False)
    return serializer.render(stream)
def sanitize_html_fragment(fragment):
    """
    Sanitize an HTML ``fragment`` and return the cleaned-up serialization.

    The fragment is parsed with html5lib's ``HTMLSanitizer`` tokenizer and
    rendered back with attribute values quoted and optional tags kept.
    A falsy ``fragment`` is returned unchanged.
    """
    if not fragment:
        return fragment

    import html5lib
    from html5lib.sanitizer import HTMLSanitizer
    from html5lib.serializer.htmlserializer import HTMLSerializer

    sanitized_tree = html5lib.HTMLParser(
        tokenizer=HTMLSanitizer).parseFragment(fragment)
    token_stream = html5lib.treewalkers.getTreeWalker('etree')(sanitized_tree)
    return HTMLSerializer(
        quote_attr_values=True,
        omit_optional_tags=False
    ).render(token_stream)
def independentize_html_path(src, dst, css_dir=None, log=None):
    """Process the `src' HTML path to `dst' making it independent.

    - favicon links are removed
    - CSS references are updated (if `css_dir' is given), else removed.
    - Relative links are de-linkified.

    Args:
        src: Path of the HTML file to read.
        dst: Path the processed HTML is written to.
        css_dir: Optional directory that stylesheet hrefs are re-pointed to
            (only the basename of the original href is kept). When not
            given, stylesheet links are dropped.
        log: Optional logger-like object; its ``info`` and ``debug`` methods
            are used to report progress.
    """
    if log:
        log.info("independentize %s %s", src, dst)

    # Parse the HTML file.
    with open(src) as f:
        tree = html5lib.parse(f, namespaceHTMLElements=False)

    # - Drop favicon links.
    # - Update or drop CSS links.
    head = tree.find("head")
    # Iterate over a snapshot because links are removed along the way.
    # `list(head)` replaces `getchildren()`, removed in Python 3.9.
    for link in list(head):
        if link.tag != "link":
            continue
        rel = link.get("rel", "").split()
        if "icon" in rel:  # this is a favicon link
            if log:
                log.debug("%s: remove <link rel='%s'/>", dst, link.get("rel"))
            head.remove(link)
        elif "stylesheet" in rel:  # this is a css ref
            # `elif`: a link already removed as a favicon must not be
            # removed a second time (that raised ValueError).
            href = link.get("href")
            if css_dir:  # update the css dir
                # A stylesheet link without an href has nothing to
                # re-point; leave it alone instead of crashing.
                if href is not None:
                    href = posixpath.join(css_dir, posixpath.basename(href))
                    link.set("href", href)
                    if log:
                        log.debug("%s: update to <link href='%s'/>",
                                  dst, href)
            else:
                if log:
                    log.debug("%s: remove <link href='%s'/>", dst, href)
                head.remove(link)

    # De-linkify local references within the full docset.
    # TODO: Eventually would like to normalize these to point
    # to online version of the docs.
    body = tree.find("body")
    # `Element.iter()` replaces `getiterator()`, removed in Python 3.9.
    for elem in body.iter():
        if elem.tag != "a":
            continue
        href = elem.get("href")
        if not href:
            continue
        scheme, netloc, path = urlparse(href)[:3]
        if scheme or netloc:  # externals href
            continue
        if path:
            if log:
                log.debug("%s: de-linkify <a href='%s'>", dst, href)
            elem.tag = u"span"  # de-linkify

    # Write out massaged doc.
    walker = treewalkers.getTreeWalker("etree", ET)
    content = ''.join(HTMLSerializer().serialize(walker(tree)))
    with open(dst, 'w') as out:
        out.write(content)