async def taskTx(sock, message, mtype):
    """Output coroutine: sanitize *message* and send it over *sock*.

    Control payloads b"200" (disconnect) and b"202" (auth success) get
    fixed replies; anything else is sent either raw or wrapped in a JSON
    envelope depending on the global ``revertProtocol`` flag.
    """
    global revertProtocol
    # Sanitize the outgoing payload: parse as a DOM fragment, strip unsafe
    # tokens with html5lib's sanitizer filter, and re-serialize to a string.
    builder = html5lib.getTreeBuilder("dom")
    parser = html5lib.HTMLParser(tree=builder)
    walker = html5lib.getTreeWalker("dom")
    fragment = parser.parseFragment(message)
    clean_stream = sanitizer.Filter(walker(fragment))
    # ''.join is linear; the original += accumulation loop was quadratic
    # in the worst case.
    tx = ''.join(html5lib.serializer.HTMLSerializer().serialize(clean_stream))

    if message == b"200":
        # Client requested disconnect: acknowledge and close the socket.
        await sock.send("Goodbye.")
        await sock.close()
        return
    if message == b"202":
        await sock.send("Authentication Successful, you are now the admin terminal.")
        return
    # Normal payload: raw text on the legacy protocol, JSON envelope otherwise.
    if revertProtocol:
        await sock.send(tx)
    else:
        await sock.send(json.dumps({"MSG_TYPE": mtype, "MSG": tx}))
def sanitize_html(html):
    """
    Make the given HTML string safe to display in a Yarrharr page.
    """
    tree = html5lib.parseFragment(html)
    stream = html5lib.getTreeWalker("etree")(tree)
    # Apply each rewrite filter to the token stream, in order.
    for transform in (
        _strip_attrs,
        _drop_empty_tags,
        _ReplaceObjectFilter,
        _ElideFilter,
        _ReplaceYoutubeEmbedFilter,
        _ExtractTitleTextFilter,
        _adjust_links,
        _video_attrs,
        _wp_smileys,
    ):
        stream = transform(stream)
    stream = sanitizer.Filter(
        stream,
        allowed_elements=sanitizer.allowed_elements
        | frozenset(
            [
                # https://github.com/html5lib/html5lib-python/pull/423
                (namespaces["html"], "summary"),
                # https://github.com/html5lib/html5lib-python/pull/395
                (namespaces["html"], "wbr"),
            ]
        ),
    )
    return html5lib.serializer.HTMLSerializer().render(stream)
def strip_style_and_script(input):
    """Serialize *input* HTML with <script>/<style> subtrees removed."""
    tree = html5lib.parseFragment(input, treebuilder="dom")
    token_stream = html5lib.getTreeWalker("dom")(tree)
    filtered = NoChildTagFilter(token_stream, ("script", "style"))
    return html5lib.serializer.HTMLSerializer().render(filtered)
def run(self, text):
    """Parse *text* through the forge sanitizer and return serialized HTML
    with attributes in alphabetical order."""
    tree = html5lib.HTMLParser(tokenizer=ForgeHTMLSanitizer).parse(text)
    stream = html5lib.getTreeWalker("etree")(tree)
    stream = html5lib.filters.alphabeticalattributes.Filter(stream)
    return ''.join(html5lib.serializer.HTMLSerializer().serialize(stream))
def test_htmlserialized ():
    # Round-trip a small document through html5lib's parser, the etree tree
    # walker and the project's HTMLSerializer, pinning the exact serialized
    # output (comment removal and whitespace handling) in one string.
    # NOTE(review): both the source literal and the expected value depend on
    # exact newline placement inside the triple-quoted string, which may have
    # been mangled in transit — verify against the original file before editing.
    document = html5lib.parse (StringIO ("""<html><body> <p>Hello & <!-- comment -->Wörld! ♣ ∖</p> </body></html>"""))
    walker = html5lib.getTreeWalker("etree")
    stream = walker (document)
    s = HTMLSerializer()
    assert ''.join (s.serialize(stream)) == ' Hello & Wörld! ♣ \u2216\n\n '
def html_tree_to_text(html_tree):
    """Serialize an html5lib etree document back to a unicode string."""
    html_serializer = html5lib.serializer.HTMLSerializer(
        quote_attr_values='always',
        use_trailing_solidus=True,
        space_before_trailing_solidus=True,
    )
    token_stream = html5lib.getTreeWalker('etree')(html_tree)
    return u''.join(html_serializer.serialize(token_stream))
def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
             styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
             strip_comments=True, filters=None):
    """Initializes a Cleaner

    :arg list tags: allowed list of tags; defaults to
        ``bleach.sanitizer.ALLOWED_TAGS``
    :arg dict attributes: allowed attributes; can be a callable, list or dict;
        defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
    :arg list styles: allowed list of css styles; defaults to
        ``bleach.sanitizer.ALLOWED_STYLES``
    :arg list protocols: allowed list of protocols for links; defaults to
        ``bleach.sanitizer.ALLOWED_PROTOCOLS``
    :arg bool strip: whether or not to strip disallowed elements
    :arg bool strip_comments: whether or not to strip HTML comments
    :arg list filters: list of html5lib Filter classes to pass streamed
        content through

        .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

        .. Warning::

           Using filters changes the output of ``bleach.Cleaner.clean``.
           Make sure the way the filters change the output are secure.

    """
    # Parsing/serialization machinery shared by every clean() call.
    self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
    self.walker = html5lib.getTreeWalker('etree')
    self.serializer = HTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,
        # Bleach has its own sanitizer, so don't use the html5lib one
        sanitize=False,
        # Bleach sanitizer alphabetizes already, so don't use the html5lib one
        alphabetical_attributes=False,
    )

    # Sanitization policy knobs.
    self.tags = tags
    self.attributes = attributes
    self.styles = styles
    self.protocols = protocols
    self.strip = strip
    self.strip_comments = strip_comments
    self.filters = filters or []
def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
             styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
             strip_comments=True, filters=None):
    """Initializes a Cleaner

    :arg list tags: allowed list of tags; defaults to
        ``bleach.sanitizer.ALLOWED_TAGS``
    :arg dict attributes: allowed attributes; can be a callable, list or dict;
        defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
    :arg list styles: allowed list of css styles; defaults to
        ``bleach.sanitizer.ALLOWED_STYLES``
    :arg list protocols: allowed list of protocols for links; defaults to
        ``bleach.sanitizer.ALLOWED_PROTOCOLS``
    :arg bool strip: whether or not to strip disallowed elements
    :arg bool strip_comments: whether or not to strip HTML comments
    :arg list filters: list of html5lib Filter classes to pass streamed
        content through

        .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

        .. Warning::

           Using filters changes the output of ``bleach.Cleaner.clean``.
           Make sure the way the filters change the output are secure.

    """
    # Bleach-specific parser and serializer shared by every clean() call.
    self.parser = BleachHTMLParser(namespaceHTMLElements=False)
    self.walker = html5lib.getTreeWalker('etree')
    self.serializer = BleachHTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,
        escape_lt_in_attrs=True,
        # We want to leave entities as they are without escaping or
        # resolving or expanding
        resolve_entities=False,
        # Bleach has its own sanitizer, so don't use the html5lib one
        sanitize=False,
        # Bleach sanitizer alphabetizes already, so don't use the html5lib one
        alphabetical_attributes=False,
    )

    # Sanitization policy knobs.
    self.tags = tags
    self.attributes = attributes
    self.styles = styles
    self.protocols = protocols
    self.strip = strip
    self.strip_comments = strip_comments
    self.filters = filters or []
def test_iterable(etree):
    """The truncation filter keeps its source tree and yields a fresh,
    self-returning iterator."""
    token_stream = getTreeWalker('etree')(etree)
    truncated = TruncationFilter(token_stream, 98, end='...')
    assert truncated.tree is etree
    it = iter(truncated)
    assert it is not truncated
    assert iter(it) is it
def html_tree_to_text(html_tree):
    """Render an html5lib etree tree to unicode, with always-quoted
    attributes and spaced trailing solidi."""
    opts = dict(
        quote_attr_values='always',
        use_trailing_solidus=True,
        space_before_trailing_solidus=True,
    )
    walk = html5lib.getTreeWalker('etree')
    chunks = html5lib.serializer.HTMLSerializer(**opts).serialize(walk(html_tree))
    return u''.join(chunks)
def obfuscate_emails(content):
    """Rewrite e-mail addresses found in *content*'s HTML in place.

    Static content carries no HTML to rewrite and is skipped.
    """
    if isinstance(content, contents.Static):
        return
    tree = html5lib.parseFragment(content._content, treebuilder="etree")
    stream = ObfuscateEmailsFilter(html5lib.getTreeWalker("etree")(tree))
    html_serializer = html5lib.serializer.HTMLSerializer(
        quote_attr_values="always", omit_optional_tags=False)
    content._content = html_serializer.render(stream)
def write_node(node, out):
    """Serialize DOM *node* as HTML5 and write each chunk to *out*."""
    html_serializer = html5lib.serializer.HTMLSerializer(
        quote_attr_values='always',
        minimize_boolean_attributes=False,
        use_best_quote_char=True,
        omit_optional_tags=False,
    )
    token_stream = html5lib.getTreeWalker("dom")(node)
    for chunk in html_serializer.serialize(token_stream):
        out.write(chunk)
def test_with_serializer():
    """Verify filter works in the context of everything else"""
    dom = html5lib.HTMLParser().parseFragment(
        '<svg><pattern xlink:href="#patt2" id="patt1"></svg>')
    ser = HTMLSerializer(alphabetical_attributes=True,
                         quote_attr_values='always')
    # FIXME(willkg): The "xlink" namespace gets dropped by the serializer. When
    # that gets fixed, we can fix this expected result.
    rendered = ser.render(html5lib.getTreeWalker('etree')(dom))
    assert rendered == '<svg><pattern id="patt1" href="#patt2"></pattern></svg>'
def test_with_serializer():
    """Verify filter works in the context of everything else"""
    fragment = html5lib.HTMLParser().parseFragment('<svg><pattern xlink:href="#patt2" id="patt1"></svg>')
    walk = html5lib.getTreeWalker('etree')
    html_serializer = HTMLSerializer(
        alphabetical_attributes=True,
        quote_attr_values='always',
    )
    # FIXME(willkg): The "xlink" namespace gets dropped by the serializer. When
    # that gets fixed, we can fix this expected result.
    expected = '<svg><pattern id="patt1" href="#patt2"></pattern></svg>'
    assert html_serializer.render(walk(fragment)) == expected
def trim_html(html):
    """Run a Markup document through TrimFilter and return trimmed Markup.

    Raises TypeError for any non-Markup argument.
    """
    if not isinstance(html, Markup):
        raise TypeError("trim_html: expected Markup, got {!r}".format(type(html)))
    # TODO i think this could be combined with the bleach.clean call to avoid a
    # double parse? filters apply during serialization, bleach applies during
    # tokenization
    # TODO alternatively, could this apply during tokenization to avoid
    # bothering with any markup we're not even going to show?
    document = html5lib.parse(html)
    stream = TrimFilter(html5lib.getTreeWalker("etree")(document))
    rendered = u"".join(html5lib.serializer.HTMLSerializer().serialize(stream))
    return Markup(rendered.strip())
def trim_html(html):
    """Trim a Markup HTML document via TrimFilter; TypeError otherwise."""
    if not isinstance(html, Markup):
        raise TypeError("trim_html: expected Markup, got {!r}".format(type(html)))
    # TODO i think this could be combined with the bleach.clean call to avoid a
    # double parse? filters apply during serialization, bleach applies during
    # tokenization
    # TODO alternatively, could this apply during tokenization to avoid
    # bothering with any markup we're not even going to show?
    walk = html5lib.getTreeWalker('etree')
    trimmed = TrimFilter(walk(html5lib.parse(html)))
    html_serializer = html5lib.serializer.HTMLSerializer()
    return Markup(u''.join(html_serializer.serialize(trimmed)).strip())
def __init__(self, ignore_headers=True, raise_invalid_tags=False):
    """
    :param ignore_headers: If true, ignores text inside of the tags included
        in HEADER_ELEMENTS. This defaults to true because the text inside of
        these "header elements" is typically not a sentence.
    :param raise_invalid_tags: If true, raises an InvalidTagError when
        parsing a tag not in INLINE_ELEMENTS, BLOCK_LEVEL_ELEMENTS (which
        includes the elements of HEADER_ELEMENTS), SKIPPED_ELEMENTS,
        EMPTY_ELEMENTS, or SENTENCE_VOID_ELEMENTS. If false, ignores this
        tag and all of its children. (Sentences descending from it will not
        be included in the value returned from feed)
    """
    # html5lib's HTMLParser builds an etree tree by default.
    self.parser = html5lib.HTMLParser()
    self.walker = html5lib.getTreeWalker("etree")
    self.ignore_header_text = ignore_headers
    self.raise_invalid_tags = raise_invalid_tags
    self.sentences = []
    self.reset()
def truncate(html, truncated_message, suffix, max_entities=None, max_length=None):
    """Truncate *html* for Telegram, using *truncated_message* and *suffix*
    streams and optional entity/length budgets."""
    walker = html5lib.getTreeWalker('etree')

    def as_stream(markup):
        # Parse a fragment and walk it into an html5lib token stream.
        return walker(html5lib.parseFragment(markup, treebuilder='etree'))

    truncated = TelegramTruncator(
        as_stream(html),
        truncated_message=as_stream(truncated_message),
        suffix=as_stream(suffix),
        max_entities=max_entities,
        max_length=max_length,
    )
    return HTMLSerializer().render(truncated).strip('\n')
def sanitize_html(html):
    """
    Make the given HTML string safe to display in a Yarrharr page.
    """
    source = html5lib.getTreeWalker('etree')(html5lib.parseFragment(html))
    # Chain the rewrite filters over the token stream, in order.
    for transform in (_strip_attrs, _drop_empty_tags, _ReplaceObjectFilter,
                      _ElideFilter, _ReplaceYoutubeEmbedFilter,
                      _ExtractTitleTextFilter, _adjust_links, _video_attrs,
                      _wp_smileys):
        source = transform(source)
    # sanitize=True makes the serializer run html5lib's sanitizer filter.
    return html5lib.serializer.HTMLSerializer(sanitize=True).render(source)
def filterEpub (item):
    """ epub reader: yield each document of the book at path *item* as HTML. """
    book = epub.read_epub (item.rstrip ())
    logging.debug (f'reading ebook {item}')
    # Use a distinct loop variable: the original reused ``item`` and shadowed
    # the path argument, which made the log message above misleading and the
    # code harder to follow.
    for doc in book.get_items_of_type (ebooklib.ITEM_DOCUMENT):
        logging.debug (f'got item {doc.get_name ()}')
        # XXX: in theory html5lib should be able to detect the encoding of
        # bytes(), but it does not.
        document = html5lib.parse (doc.get_content ().decode ('utf-8'))
        walker = html5lib.getTreeWalker("etree")
        stream = walker (document)
        s = HTMLSerializer()
        yield ''.join (s.serialize (stream))
    # It looks like ebooklib is leaking ZipFile instances somewhere, which
    # can be prevented by resetting the book before the GC grabs it.
    book.reset ()
    del book
def typogrify(html):
    """Apply typography filters (whitespace, medor, figures) to an HTML
    fragment and return the re-serialized markup.

    Using etree is important here because it does not suffer from a bug
    where a text featuring entities is split into various adjacent text
    nodes (thanks html5lib folks for the tip).
    See <https://github.com/html5lib/html5lib-python/issues/208>
    """
    dom = html5lib.parseFragment(html, treebuilder="etree")
    stream = html5lib.getTreeWalker("etree")(dom)
    stream = whitespace.Filter(stream)
    stream = medor.Filter(stream)
    stream = figures.Filter(stream)
    # quote_attr_values expects "legacy"/"spec"/"always" in current html5lib;
    # the old boolean True is not a recognized value and silently fell back
    # to the default quoting behaviour.
    s = html5lib.serializer.HTMLSerializer(quote_attr_values="always",
                                           omit_optional_tags=False)
    return s.render(stream)
def _hyphenate_html(html):
    """Hyphenate every text node of an HTML fragment and return the result."""

    def hyphen_gen(stream):
        # Rewrite only character tokens; pass all other tokens through.
        # (The original assigned el["data"] to a temporary and immediately
        # overwrote it — dead code, removed.)
        for el in stream:
            if el["type"] == "Characters":
                el["data"] = _hyphenate(el["data"])
            yield el

    doc = html5lib.parseFragment(html, namespaceHTMLElements=False)
    stream = hyphen_gen(html5lib.getTreeWalker('etree')(doc))
    return html5lib.serializer.HTMLSerializer().render(stream)
def wbr_serialize(self):
    """Returns concatenated HTML code with WBR tag. This is still experimental.

    Returns:
      The organized HTML code. (str)
    """
    doc = ET.Element('span')
    doc.attrib['style'] = 'word-break: keep-all'
    for chunk in self:
        # Element.getchildren() was removed in Python 3.9; list(doc) is the
        # supported way to get the child elements.
        children = list(doc)
        if chunk.has_cjk() and doc.text:
            ele = ET.Element('wbr')
            doc.append(ele)
            ele.tail = chunk.word
        elif children:
            # add word without span tag for non-CJK text (e.g. English)
            # by appending it after the last element
            last = children[-1]
            if last.tail is None:
                last.tail = chunk.word
            else:
                last.tail += chunk.word
        else:
            if doc.text is None:
                doc.text = chunk.word
            else:
                doc.text += chunk.word
    content = ET.tostring(doc, encoding='utf-8').decode('utf-8')
    # Sanitize the generated markup, additionally allowing <wbr> and the
    # word-break CSS property used above.
    dom = html5lib.parseFragment(content)
    stream = getTreeWalker('etree')(dom)
    serializer = html5lib.serializer.HTMLSerializer(
        quote_attr_values='always')
    allowed_elements = set(sanitizer.allowed_elements)
    allowed_elements.add((namespaces['html'], 'wbr'))
    allowed_css_properties = set(sanitizer.allowed_css_properties)
    allowed_css_properties.add('word-break')
    result = serializer.render(
        sanitizer.Filter(
            stream,
            allowed_elements=allowed_elements,
            allowed_css_properties=allowed_css_properties,
        ))
    return result
def apply_linkification(
    html: str,
    skip_tags: Optional[List[str]] = None,
) -> str:
    """Apply custom linkification filter to convert text patterns to links."""
    tree = HTMLParser(namespaceHTMLElements=False).parseFragment(html)
    stream = LinkifyFilter(html5lib.getTreeWalker('etree')(tree), skip_tags)
    # Linkification does its own sanitizing and attribute ordering.
    html_serializer = HTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,
        sanitize=False,
        alphabetical_attributes=False,
    )
    return html_serializer.render(stream)
def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False,
             url_re=URL_RE, email_re=EMAIL_RE):
    """Creates a Linker instance

    :arg list callbacks: list of callbacks to run when adjusting tag attributes;
        defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
    :arg list skip_tags: list of tags that you don't want to linkify the
        contents of; for example, you could set this to ``['pre']`` to skip
        linkifying contents of ``pre`` tags
    :arg bool parse_email: whether or not to linkify email addresses
    :arg re url_re: url matching regex
    :arg re email_re: email matching regex
    :returns: linkified text as unicode
    """
    # Parsing/serialization machinery.
    self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
    self.walker = html5lib.getTreeWalker('etree')
    self.serializer = HTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,
        # linkify does not sanitize
        sanitize=False,
        # linkify alphabetizes
        alphabetical_attributes=False,
    )

    # Linkification policy.
    self.callbacks = callbacks
    self.skip_tags = skip_tags
    self.parse_email = parse_email
    self.url_re = url_re
    self.email_re = email_re
def fix_french(html):
    """Apply French typography fixes (whitespace, medor, hyphenation) to an
    HTML fragment and return the re-serialized markup.

    Using etree is important here because it does not suffer from a bug
    where a text featuring entities is split into various adjacent text
    nodes (thanks html5lib folks for the tip).
    See <https://github.com/html5lib/html5lib-python/issues/208>
    """
    dom = html5lib.parseFragment(html, treebuilder="etree")
    stream = html5lib.getTreeWalker("etree")(dom)
    stream = whitespace.Filter(stream)
    stream = medor.Filter(stream)
    #stream = figures.Filter(stream)
    stream = hyphenate.Filter(stream, min_len=9, left=4, right=5)
    # quote_attr_values expects "legacy"/"spec"/"always" in current html5lib;
    # the old boolean True is not a recognized value and silently fell back
    # to the default quoting behaviour.
    serializer = html5lib.serializer.HTMLSerializer(
        quote_attr_values="always",
        alphabetical_attributes=True,
        omit_optional_tags=False)
    return serializer.render(stream)
def html_sanitize(text):
    """Sanitize *text*: strip unsafe HTML, remove presentational attributes
    (keeping a reduced font-weight/style/decoration subset of ``style``),
    and return the <body> contents of the prettified result."""
    if not text:
        return ''
    # First pass: html5lib's sanitizing tokenizer drops unsafe markup.
    element = HTMLParser(tokenizer=sanitizer.HTMLSanitizer).parseFragment(text)
    stream = getTreeWalker("etree")(element)
    text = serializer.HTMLSerializer().render(stream)
    text = UnicodeDammit(text, ["utf-8"])
    # Second pass: strip presentational/scripting attributes with BeautifulSoup.
    REMOVE_ATTRIBUTES = [
        'lang', 'language', 'onmouseover', 'onmouseout', 'script', 'font', 'style',
        'dir', 'face', 'size', 'color', 'style', 'class', 'width', 'height', 'hspace',
        'border', 'valign', 'align', 'background', 'bgcolor', 'text', 'link', 'vlink',
        'alink', 'cellpadding', 'cellspacing', 'id']
    soup = BeautifulSoup(text.unicode_markup)
    for attribute in REMOVE_ATTRIBUTES:
        for tag in soup.findAll():
            if attribute == 'style':
                # Keep only a whitelisted subset of inline style directives.
                new_style = ''
                style = tag.attrs.get('style', None)
                if style:
                    if style.find('normal') != -1:
                        new_style += " font-weight:normal; "
                    elif style.find('bold') != -1:
                        new_style += " font-weight:bold; "
                    if style.find('italic') != -1:
                        new_style += " font-style: italic; "
                    if style.find('underline') != -1:
                        new_style += " text-decoration: underline; "
                    tag.attrs['style'] = new_style
            else:
                del(tag[attribute])
    # prettify('utf-8') returns bytes; matching bytes against a str regex
    # raises TypeError on Python 3, so decode back to text before searching.
    html = soup.prettify('utf-8').decode('utf-8')
    try:
        body = re.findall(r'<body>(.*)</body>', html, re.S)[0].strip()
    except IndexError:
        body = html
    return body
def truncate_html(html, *args, **kwargs):
    """Truncates HTML string.

    :param html: The HTML string or parsed element tree (with
                 :func:`html5lib.parse`).
    :param kwargs: Similar with :class:`.filters.TruncationFilter`.
    :return: The truncated HTML string.
    """
    if hasattr(html, 'getchildren'):
        etree = html
    else:
        # Build an lxml tree explicitly: the default ("etree") tree produced
        # by html5lib.parse cannot be iterated by the 'lxml' tree walker
        # used below — the original call omitted the treebuilder and broke
        # the string-input path.
        etree = html5lib.parse(html, treebuilder='lxml')
    walker = html5lib.getTreeWalker('lxml')
    stream = walker(etree)
    stream = TruncationFilter(stream, *args, **kwargs)
    serializer = html5lib.serializer.HTMLSerializer()
    serialized = serializer.serialize(stream)
    return u''.join(serialized).strip()
def truncate_html(html, *args, **kwargs):
    """Truncates HTML string.

    :param html: The HTML string or parsed element tree (with
                 :func:`html5lib.parse`).
    :param kwargs: Similar with :class:`.filters.TruncationFilter`.
    :return: The truncated HTML string.
    """
    if isinstance(html, (str, bytes)):
        etree = html5lib.parse(html)
    else:
        # Already a parsed tree.  The original duck-typed on .getchildren(),
        # which was removed from stdlib Element in Python 3.9, so parsed
        # trees were misclassified as raw markup and re-fed to the parser.
        etree = html
    walker = html5lib.getTreeWalker('etree')
    stream = TruncationFilter(walker(etree), *args, **kwargs)
    serialized = html5lib.serializer.HTMLSerializer().serialize(stream)
    return u''.join(serialized).strip()
def __init__(self, ignore_headers=True, raise_invalid_tags=False): """ :param ignore_headers: If true, ignores text inside of the tags included in HEADER_ELEMENTS. This defaults to true because the text inside of these "header elements" is typically not a sentence. :param raise_invalid_tags: If true, raises an InvalidTagError when parsing a tag not in INLINE_ELEMENTS, BLOCK_LEVEL_ELEMENTS (which includes the elements of HEADER_ELEMENTS), SKIPPED_ELEMENTS, EMPTY_ELEMENTS, or SENTENCE_VOID_ELEMENTS. If false, ignores this tag and all of its children. (Sentences descending from it will not be included in the value returned from feed) """ # self.parser is an etree parser by default. self.parser = html5lib.HTMLParser() self.walker = html5lib.getTreeWalker("etree") self.sentences = [] self.ignored_parent_count = 0 self.current_string = '' self.ignore_header_text = ignore_headers self.raise_invalid_tags = raise_invalid_tags punkt_param = PunktParameters() abbreviations = [ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'Adj', 'Adm', 'Adv', 'Asst', 'Bart', 'Bldg', 'Brig', 'Bros', 'Capt', 'Cmdr', 'Col', 'Comdr', 'Con', 'Corp', 'Cpl', 'DR', 'Dr', 'Drs', 'Ens', 'Gen', 'Gov', 'Hon', 'Hr', 'Hosp', 'Insp', 'Lt', 'MM', 'MR', 'MRS', 'MS', 'Maj', 'Messrs', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms', 'Msgr', 'Op', 'Ord', 'Pfc', 'Ph', 'Prof', 'Pvt', 'Rep', 'Reps', 'Res', 'Rev', 'Rt', 'Sen', 'Sens', 'Sfc', 'Sgt', 'Sr', 'St', 'Supt', 'Surg', 'v', 'vs', 'i.e', 'inc', 'rev', 'e.g', 'etc', 'Nos', 'Nr', 'pp', 'Jan', 'Feb', 'Mar', 'Apr', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec' ] punkt_param.abbrev_types = set(abbreviations) self.tokenizer = PunktSentenceTokenizer(punkt_param) logging.basicConfig(filename='html-tokenizer.log', level=logging.WARNING, format='[%(asctime)s] [%(levelname)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
# the ihatexml module emits data loss warnings. This in our case is okay # because we are willing to accept the data loss that happens on the way # from HTML to XML as we never go in reverse direction. In particular the # problem is XML namespaces which are not supported in HTML. warnings.filterwarnings('ignore', category=DataLossWarning) class ProcessingError(Exception): pass def compile_selector(sel): return CSSSelector(sel, translator='html') tree_walker = html5lib.getTreeWalker('lxml') class Processor(object): def __init__(self, title_cleanup_regex=None, content_selectors=None, ignore=None, no_default_ignores=False): self.content_selectors = [compile_selector(sel) for sel in content_selectors or ('body',)] if title_cleanup_regex is not None: title_cleanup_regex = re.compile(title_cleanup_regex, re.UNICODE) self.title_cleanup_regex = title_cleanup_regex self.ignore = [compile_selector(sel) for sel in ignore or ()] if not self.ignore and not no_default_ignores:
def serialize_fragment(h):
    # urgh
    token_stream = html5lib.getTreeWalker("etree")(h)
    rendered = ''.join(html5lib.serializer.HTMLSerializer().serialize(token_stream))
    # Slice off the wrapper element around the fragment — presumably a
    # 5-char opening and 6-char closing tag such as <div>…</div>; verify
    # against the callers before changing.
    return rendered[5:-6]
'wbr',
    'xmp',
})  # NOTE(review): closes a frozenset literal whose opening is above this chunk.

# Tags whose whitespace must be preserved verbatim when serializing.
SPACE_PRESERVING_TAGS = frozenset({
    'pre',
    'style',
    'script',
    'textarea',
})

# Return e itself when it already is <tag>, otherwise its first <tag> child.
_ensure = lambda e, tag: e.find(tag) if e.tag != tag else e

# HTML5 serialization setup
_tree_walker = html5lib.getTreeWalker("etree", implementation=etree)
_serializer = html5lib.serializer.HTMLSerializer(omit_optional_tags=False,
                                                 resolve_entities=False)

# HTML5 parsing setup
_tree_builder = html5lib.getTreeBuilder("etree", implementation=etree)
_parser = html5lib.HTMLParser(_tree_builder, namespaceHTMLElements=False)

# FIX for HTMLParser.reset():
if not hasattr(_parser, "innerHTMLMode"):
    # add the missing attribute, as otherwise calling .reset() would raise an AttributeError
    _parser.innerHTMLMode = None


def Root(title=None, encoding=None) -> Element:
    # Build an empty document skeleton.  NOTE(review): definition continues
    # beyond this chunk.
    root = Element(ROOT_TAG)
    head = SubElement(root, HEAD_TAG)
'video',
    'wbr',
    'xmp',
})  # NOTE(review): closes a frozenset literal whose opening is above this chunk.

# Tags whose whitespace must be preserved verbatim when serializing.
SPACE_PRESERVING_TAGS = frozenset({
    'pre',
    'style',
    'script',
    'textarea',
})

# Return e itself when it already is <tag>, otherwise its first <tag> child.
_ensure = lambda e, tag: e.find(tag) if e.tag != tag else e

# HTML5 serialization setup
_tree_walker = html5lib.getTreeWalker("etree", implementation=etree)
_serializer = html5lib.serializer.HTMLSerializer(omit_optional_tags=False,
                                                 resolve_entities=False)

# HTML5 parsing setup
_tree_builder = html5lib.getTreeBuilder("etree", implementation=etree)
_parser = html5lib.HTMLParser(_tree_builder, namespaceHTMLElements=False)

# FIX for HTMLParser.reset():
if not hasattr(_parser, "innerHTMLMode"):
    # add the missing attribute, as otherwise calling .reset() would raise an AttributeError
    _parser.innerHTMLMode = None


def Root(title=None, encoding=None) -> Element:
    # Build an empty document skeleton.  NOTE(review): definition continues
    # beyond this chunk.
    root = Element(ROOT_TAG)
    head = SubElement(root, HEAD_TAG)
import html5lib

# Parse the test page into an etree document.
with open("test_site/test.html", "rb") as handle:
    document = html5lib.parse(handle)

# Walk the tree and dump each serialized chunk for inspection.
token_stream = html5lib.getTreeWalker("etree")(document)
for chunk in html5lib.serializer.HTMLSerializer().serialize(token_stream):
    print("%r" % chunk)
def filterHtml (selectFunc, fd):
    """Parse *fd* as HTML and yield the serialization of the tokens kept
    by *selectFunc*."""
    tree = html5lib.parse(fd)
    token_stream = html5lib.getTreeWalker("etree")(tree)
    selected = Select(token_stream, selectFunc)
    yield ''.join(HTMLSerializer().serialize(selected))
def sanitize_html(text):
    """Sanitize *text* via the lxml walker and the module-level serializer."""
    document = html5lib.parse(text, treebuilder='lxml')
    token_stream = html5lib.getTreeWalker('lxml')(document)
    return _html_serializer.render(_html_sanitizer_stream(token_stream))
import html5lib

# Parse a string into the default (etree) document.
document1 = html5lib.parse("<p>Hello World!</p>")
print(document1)

from urllib.request import urlopen

# Parse a live page, forwarding the transport encoding from the HTTP headers.
with urlopen("http://www.google.com/") as f:
    document2 = html5lib.parse(
        f, transport_encoding=f.info().get_content_charset())
print(document2)

# Equivalent parse using an explicit DOM tree builder.
document3 = html5lib.HTMLParser(
    tree=html5lib.getTreeBuilder("dom")).parse("<p>Hello World!</p>")
print(document3)

# Serialize a parsed fragment back to HTML, chunk by chunk.
element = html5lib.parse('<p>Hello World!</p>')
walker = html5lib.getTreeWalker("etree")
stream = walker(element)
s = html5lib.serializer.HTMLSerializer().serialize(stream)
for i in s:
    print(i)

from html5lib.filters import sanitizer

# Wrap a walker in the sanitizer filter.  Note this prints the filter
# object itself, not sanitized markup — the stream is never serialized here.
dom = html5lib.parse("<script>alert('warning!')</script>", treebuilder="dom")
walker = html5lib.getTreeWalker("dom")
clean_stream = sanitizer.Filter(walker(dom))
print(clean_stream)
def print_tokens(html):
    """Pretty-print the tree-walker token stream for an HTML fragment."""
    fragment = html5lib.parseFragment(html)
    walk = html5lib.getTreeWalker("etree")
    print("Tokens for", html)
    for tok in walk(fragment):
        pprint(tok)
from copy import deepcopy
from dataclasses import dataclass
from pathlib import Path
from typing import Literal

import html5lib
import unicodedata2
from lxml.etree import _Element as Element

# All parsing/walking/serialization uses the lxml tree implementation.
TREE_TYPE = "lxml"

parser = html5lib.HTMLParser(
    tree=html5lib.treebuilders.getTreeBuilder(TREE_TYPE),
    namespaceHTMLElements=False,
)
walker = html5lib.getTreeWalker(TREE_TYPE)
serializer = html5lib.serializer.HTMLSerializer()

# Directory containing this file.
repo = Path(__file__).parent


@dataclass
class Builder:
    # Maps each HTML file's path (relative to the source dir) to its parsed tree.
    path_to_html: dict[Path, Element]

    @classmethod
    def from_source_dir(cls, directory: Path, /) -> Builder:
        # NOTE(review): the bare `-> Builder` forward reference needs
        # `from __future__ import annotations` (or quoting) — confirm it is
        # imported above this chunk.  Definition continues beyond this chunk.
        return cls(
            path_to_html={
                i.relative_to(directory): parser.parse(i.read_bytes())
def string_from_doc(doc):
    """Serialize an lxml document to HTML and unescape quoted question marks."""
    token_stream = html5lib.getTreeWalker("lxml")(doc)
    rendered = html5lib.serializer.HTMLSerializer().render(token_stream)
    return unescape_qmarks(rendered)
def setUp(self):
    """Create a fresh XML parser, lxml tree walker and HTML serializer."""
    # resolve_entities=False avoids expanding entity definitions from input.
    self.parser = etree.XMLParser(resolve_entities=False)
    self.treewalker = html5lib.getTreeWalker("lxml")
    self.serializer = serializer.HTMLSerializer()
def render_text(s):
    # Normalize newlines, split into paragraphs, and wrap each in <p>.
    s = RE_NEWLINES.sub('\n', s)
    paras = RE_LINE_SPLIT.split(s)
    paras = ['<p>%s</p>' % _process_text(p) for p in paras]
    return ''.join(paras)


if html5lib is None:
    def render_html(s):
        # html5lib is an optional dependency; fail loudly when it is missing.
        raise RuntimeError('Please install html5lib for "html" renderer')
else:
    # NOTE(review): html5lib.sanitizer.HTMLSanitizer is the pre-0.99999999
    # API; current html5lib moved sanitization to a tree-walker filter —
    # confirm the pinned html5lib version before upgrading.
    _html_parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder('dom'),
        tokenizer=html5lib.sanitizer.HTMLSanitizer,
    )
    _html_walker = html5lib.getTreeWalker('dom')
    _html_serializer = html5lib.serializer.HTMLSerializer()

    def render_html(s):
        # Parse (sanitizing in the tokenizer), walk, and re-serialize.
        stream = _html_walker(_html_parser.parse(s))
        return u''.join(_html_serializer.serialize(stream)).strip()


# Dispatch table from markup format name to renderer.
renderers = {
    'markdown': render_markdown,
    'html': render_html,
    'text': render_text,
}


def markup(s):