def transform_collapsibles(text):
    """Find simple collapsible elements and transform them to full html."""
    tree = parseFragment(text, container='div', treebuilder='etree',
                         namespaceHTMLElements=False)
    base_id = ''.join(filter(str.isdigit, str(time.time())))
    collapsibles = tree.findall('./div[@class="collapsible-item"]')
    for i, collapsible in enumerate(collapsibles):
        title = collapsible.find('./div[@class="collapsible-item-title"]')
        body = collapsible.find('./div[@class="collapsible-item-body"]')
        if title is not None and body is not None:
            title.tag = 'span'
            del title.attrib['class']
            body.tag = 'div'
            del body.attrib['class']
            final_html = render_to_string(
                'a4ckeditor/collapsible_fragment.html',
                dict(id='a4ckeditor-collapsible-{}_{}'.format(base_id, i),
                     title=serialize(title),
                     body=serialize(body))
            )
            collapsible.clear()
            collapsible.append(parseFragment(final_html, treebuilder='etree',
                                             namespaceHTMLElements=False))
    return serialize(tree)
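# Usage sketch (not from the original source): assumes the Django template
# 'a4ckeditor/collapsible_fragment.html' referenced above exists; the input
# markup below is hypothetical.
source = (
    '<div class="collapsible-item">'
    '<div class="collapsible-item-title">Details</div>'
    '<div class="collapsible-item-body"><p>Hidden text</p></div>'
    '</div>'
)
expanded = transform_collapsibles(source)  # full collapsible HTML, serialized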
def parse_comments(self, root, raw):
    ans = ''
    ns = tuple(self.selector('#bookDescription_feature_div noscript'))
    if ns:
        ns = ns[0]
        if len(ns) == 0 and ns.text:
            import html5lib
            # html5lib parsed noscript as CDATA
            ns = html5lib.parseFragment('<div>%s</div>' % (ns.text), treebuilder='lxml',
                                        namespaceHTMLElements=False)[0]
        else:
            ns.tag = 'div'
        ans = self._render_comments(ns)
    else:
        desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
        if desc:
            ans = self._render_comments(desc[0])

    desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
    if desc:
        ans += self._render_comments(desc[0])
    else:
        # Idiot chickens from amazon strike again. This data is now stored
        # URL-encoded in a JS variable inside a script tag.
        m = re.search(rb'var\s+iframeContent\s*=\s*"([^"]+)"', raw)
        if m is not None:
            try:
                text = unquote(m.group(1)).decode('utf-8')
                nr = html5lib.parse(text, treebuilder='lxml',
                                    namespaceHTMLElements=False)
                desc = nr.xpath('//div[@id="productDescription"]/*[@class="content"]')
                if desc:
                    ans += self._render_comments(desc[0])
            except Exception as e:
                self.log.warn('Parsing of obfuscated product description failed with error: %s' % as_unicode(e))
    return ans
def load_html(fn):
    # Open file.
    with open(fn) as f:
        doc = f.read()

    # Parse DOM. It's a fragment so we need to use parseFragment,
    # which returns a list which we re-assemble into a node.
    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fragment = html5lib.parseFragment(doc, treebuilder="lxml")
    dom = lxml.etree.Element("div")
    for node in fragment:
        dom.append(node)

    ## Remove comments - xml_diff can't handle that.
    ## They seem to already be stripped by the HTML
    ## sanitization.
    # for node in dom.xpath("//comment()"):
    #     node.getparent().remove(node)

    # Take everything out of the HTML namespace so
    # that when we serialize at the end there are no
    # namespaces and it's plain HTML.
    for node in dom.xpath("//*"):
        node.tag = node.tag.replace("{http://www.w3.org/1999/xhtml}", "")

    return (doc, dom)
def parse_harlowe_html(s):
    """
    Parse a string containing the HTML of a Twine game written using Harlowe.

    Args:
        s (str): The Harlowe source.

    Returns:
        (dict, list, OrderedDict): A dictionary of the attributes on the
        top-level tw-storydata element, a list of non-passage elements in the
        game (as etree.ElementTree.Element), and a dict whose keys are the
        passages' names and whose values are the corresponding HarlowePassage
        objects.
    """
    passages = OrderedDict()  # So that we keep the original room order in source code
    other_elems = list()

    # The story uses HTML5 custom elements, and so requires an HTML5-aware parser
    story_elem = html5lib.parseFragment(s, treebuilder='lxml',
                                        namespaceHTMLElements=False)[0]

    if story_elem is None or story_elem.tag != _STORY_TAG:
        raise RuntimeError('No properly-formatted story tag (' + _STORY_TAG + ') found')

    for elem in story_elem:
        if elem.tag == _PASSAGE_TAG:
            passage = HarlowePassage.from_element(elem)
            passages[passage.name] = passage
        else:
            other_elems.append(elem)

    return story_elem.attrib, other_elems, passages
def warcToText(url):
    # Request the url of the warc.gz file.
    resp = requests.get(url, stream=True)

    # Content types we treat as HTML pages.
    html_content_types = (
        'text/html',
        'text/html; charset=UTF-8',
        'text/html; charset=utf-8',
        'text/html; charset=ISO-8859-1',
        'charset=iso-8859-1',
    )

    fail = 0
    succeed = 0
    # Iterate through the archive.
    for record in ArchiveIterator(resp.raw, arc2warc=True):
        # A 'response' record holds a fetched HTML page.
        if record.rec_type == 'response':
            # Check that the record carries HTTP headers.
            if record.http_headers is not None:
                # Process only the known HTML content types.
                if record.http_headers.get_header('Content-Type') in html_content_types:
                    try:
                        html = record.content_stream().read()
                        # From html to plain text.
                        html_parse = html5lib.parseFragment(html)
                        s = ''.join(html_parse.itertext())
                        print(s)
                        succeed += 1
                    except Exception:
                        fail += 1
                        continue
    print('fail: %s' % fail)
    print('succeed: %s' % succeed)
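# Usage sketch (not from the original source): the WARC URL is hypothetical.
# Streams the archive over HTTP and prints the text of each HTML response.
warcToText('https://example.org/crawls/sample.warc.gz')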
def html(self):
    try:
        import html5lib
        self.html5lib = html5lib
        return html5lib.parseFragment(self.content)
    except ImportError as err:
        raise ImproperlyConfigured("Error while importing html5lib: %s" % err)
def truncate(html, truncated_message, suffix, max_entities=None, max_length=None):
    walker = html5lib.getTreeWalker('etree')
    html_stream = walker(html5lib.parseFragment(html, treebuilder='etree'))
    truncated_message_stream = walker(
        html5lib.parseFragment(truncated_message, treebuilder='etree'))
    suffix_stream = walker(html5lib.parseFragment(suffix, treebuilder='etree'))
    truncated = TelegramTruncator(html_stream,
                                  truncated_message=truncated_message_stream,
                                  suffix=suffix_stream,
                                  max_entities=max_entities,
                                  max_length=max_length)
    return HTMLSerializer().render(truncated).strip('\n')
def sanitize_html(html):
    """
    Make the given HTML string safe to display in a Yarrharr page.
    """
    tree = html5lib.parseFragment(html)
    serializer = html5lib.serializer.HTMLSerializer()
    source = html5lib.getTreeWalker("etree")(tree)
    source = _strip_attrs(source)
    source = _drop_empty_tags(source)
    source = _ReplaceObjectFilter(source)
    source = _ElideFilter(source)
    source = _ReplaceYoutubeEmbedFilter(source)
    source = _ExtractTitleTextFilter(source)
    source = _adjust_links(source)
    source = _video_attrs(source)
    source = _wp_smileys(source)
    source = sanitizer.Filter(
        source,
        allowed_elements=sanitizer.allowed_elements | frozenset([
            (namespaces["html"], "summary"),  # https://github.com/html5lib/html5lib-python/pull/423
            (namespaces["html"], "wbr"),  # https://github.com/html5lib/html5lib-python/pull/395
        ]),
    )
    return serializer.render(source)
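# Minimal sketch of the walker -> filter -> serializer pipeline used above,
# built from stock html5lib parts only; the project-specific filters
# (_strip_attrs, _ElideFilter, etc.) are omitted.
import html5lib
from html5lib.filters import sanitizer

tree = html5lib.parseFragment('<p onclick="evil()">hi</p>')
stream = html5lib.getTreeWalker("etree")(tree)
stream = sanitizer.Filter(stream)  # drops the disallowed onclick attribute
print(html5lib.serializer.HTMLSerializer(omit_optional_tags=False).render(stream))
# <p>hi</p>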
def strip_style_and_script(input):
    dom = html5lib.parseFragment(input, treebuilder="dom")
    walker = html5lib.getTreeWalker("dom")
    stream = walker(dom)
    s = html5lib.serializer.HTMLSerializer()
    return s.render(NoChildTagFilter(stream, ("script", "style")))
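# Usage sketch (not from the original source): NoChildTagFilter is the
# project's own filter, so this only illustrates the intended effect.
dirty = '<p>keep</p><script>alert(1)</script><style>p {}</style>'
print(strip_style_and_script(dirty))  # script/style contents removed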
def get_title(self):
    document = parseFragment(self.content, treebuilder='etree',
                             namespaceHTMLElements=False, encoding='utf-8')
    try:
        text = ' '.join([w for w in document.find('.//h1').itertext()])
        return text.encode('utf-8')
    except AttributeError:
        return None
def test_sanitizer(expected, input):
    parsed = parseFragment(expected)
    expected = serialize(parsed,
                         omit_optional_tags=False,
                         use_trailing_solidus=True,
                         space_before_trailing_solidus=False,
                         quote_attr_values="always",
                         quote_char='"',
                         alphabetical_attributes=True)
    assert expected == sanitize_html(input)
def runSanitizerTest(_, expected, input):
    parsed = parseFragment(expected)
    expected = serialize(parsed,
                         omit_optional_tags=False,
                         use_trailing_solidus=True,
                         space_before_trailing_solidus=False,
                         quote_attr_values="always",
                         quote_char='"',
                         alphabetical_attributes=True)
    assert expected == sanitize_html(input)
def clean_nl(string):
    """
    This will clean up newlines so that nl2br can properly be called on the
    cleaned text.
    """
    html_blocks = [
        '{http://www.w3.org/1999/xhtml}blockquote',
        '{http://www.w3.org/1999/xhtml}ol',
        '{http://www.w3.org/1999/xhtml}li',
        '{http://www.w3.org/1999/xhtml}ul',
    ]

    if not string:
        return string

    def parse_html(tree):
        # In etree, a tag may have:
        # - some text content (piece of text before its first child)
        # - a tail (piece of text just after the tag, and before a sibling)
        # - children
        # Eg: "<div>text <b>children's text</b> children's tail</div> tail".

        # Strip new lines directly inside block level elements: first new lines
        # from the text, and:
        # - last new lines from the tail of the last child if there's children
        #   (done in the children loop below).
        # - or last new lines from the text itself.
        if tree.tag in html_blocks:
            if tree.text:
                tree.text = tree.text.lstrip('\n')
                if not len(tree):  # No children.
                    tree.text = tree.text.rstrip('\n')

            # Remove the first new line after a block level element.
            if tree.tail and tree.tail.startswith('\n'):
                tree.tail = tree.tail[1:]

        for child in tree:  # Recurse down the tree.
            if tree.tag in html_blocks:
                # Strip new lines directly inside block level elements: remove
                # the last new lines from the children's tails.
                if child.tail:
                    child.tail = child.tail.rstrip('\n')
            parse_html(child)
        return tree

    parse = parse_html(html5lib.parseFragment(string))

    # Serialize the parsed tree back to html.
    walker = html5lib.treewalkers.getTreeWalker('etree')
    stream = walker(parse)
    serializer = HTMLSerializer(quote_attr_values='always',
                                omit_optional_tags=False)
    return serializer.render(stream)
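# Usage sketch (not from the original source): newlines hugging block-level
# elements are stripped so a later nl2br pass adds no spurious <br> tags.
print(clean_nl('<blockquote>\nquoted\n</blockquote>\nafter'))
# expected, roughly: <blockquote>quoted</blockquote>after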
def _process_post(self, post):
    fetchables = []
    if post['type'] == 'photo':
        for photo in post['photos']:
            # Seems that the first alt size is the biggest
            url = photo['alt_sizes'][0]['url']
            fetchables.append(Image(url, self.sfh))
    elif post['type'] == 'video':
        # Video type: youtube, vimeo, unknown
        # source_url is only present sometimes
        # To do: download the video.
        # Perhaps use https://github.com/NFicano/pytube to download Youtube
        # Youtube and Vimeo are embedded with iframe, where src is link to video
        video_url = "None"
        if post['video_type'] in ('youtube', 'vimeo'):
            # May have videos that do not have players. (I think they are reblogs of videos.)
            # Parse the embed_code
            embed_code = post['player'][0]['embed_code']
            if embed_code:
                player_fragment = html5lib.parseFragment(embed_code)
                video_url = player_fragment[0].attrib['src']
                # Vimeo omits http
                if video_url.startswith("//"):
                    video_url = "http:" + video_url
                if post['video_type'] == 'youtube':
                    fetchables.append(youtube.Video(video_url, self.sfh))
                elif post['video_type'] == 'vimeo':
                    fetchables.append(vimeo.Video(video_url, self.sfh))
    elif post['type'] == 'text':
        # Parse body
        body_fragment = html5lib.parseFragment(post['body'],
                                               namespaceHTMLElements=False)
        # Extract links
        for a_elem in body_fragment.findall(".//a[@href]"):
            fetchables.append(UnknownResource(a_elem.attrib['href'], self.sfh))
        # Extract images
        for img_elem in body_fragment.findall(".//img[@src]"):
            fetchables.append(Image(img_elem.attrib['src'], self.sfh))
        # TODO: Consider whether there are other elements that should be parsed.
        # Also, need to test: if the original is markdown, do we get html or markdown?
    # TODO: Other post types
    return fetchables
def sanitize(string):
    """
    Ensure that the text does not contain any malicious HTML code which
    might break the page.
    """
    from html5lib import parseFragment, serialize

    parsed = parseFragment(string)
    clean = serialize(parsed, sanitize=True, omit_optional_tags=False,
                      quote_attr_values='always')
    return clean
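# Usage sketch (not from the original source): with sanitize=True the
# serializer escapes markup that is not on html5lib's allow list.
print(sanitize('<p>hi</p><script>alert(1)</script>'))
# roughly: <p>hi</p>&lt;script&gt;alert(1)&lt;/script&gt;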
def test_linkify(self):
    tmpl = env.from_string('{{ "http://test.example.com"|linkify}}')
    rendered = tmpl.render()
    el = html5lib.parseFragment(rendered)
    children = list(el)
    self.assertEqual(len(children), 1)
    el = children[0]
    self.assertEqual(el.tag, u'{http://www.w3.org/1999/xhtml}a')
    self.assertEqual(el.text, u'http://test.example.com')
    self.assertEqual(sorted(el.items()),
                     [(u'href', u'http://test.example.com'),
                      (u'rel', u'nofollow')])
def typo_html(data, out=None):
    import io

    if data and not isinstance(data, str):
        raise RuntimeError("`typo_html` requires unicode text")
    return_value = False
    if not out:
        out = io.StringIO()
        return_value = True
    fragment = html5lib.parseFragment(data)
    TypoWalker(fragment, out)
    if return_value:
        return out.getvalue()
def sanitize_html(stream):
    parsed = parseFragment(stream)
    serialized = serialize(parsed,
                           sanitize=True,
                           omit_optional_tags=False,
                           use_trailing_solidus=True,
                           space_before_trailing_solidus=False,
                           quote_attr_values="always",
                           quote_char='"',
                           alphabetical_attributes=True)
    return serialized
def obfuscate_emails(content):
    if isinstance(content, contents.Static):
        return
    dom = html5lib.parseFragment(content._content, treebuilder="etree")
    walker = html5lib.getTreeWalker("etree")
    stream = walker(dom)
    stream = ObfuscateEmailsFilter(stream)
    s = html5lib.serializer.HTMLSerializer(quote_attr_values="always",
                                           omit_optional_tags=False)
    content._content = s.render(stream)
def request(self, **options):
    """
    Perform a remote theater program request and return the html5 document
    with the results. You need to extract the details yourself.
    """
    fp = urlopen(self.base_url + urlencode(options))
    data = '<div>' + fp.read() + '</div>'
    fp.close()
    return html5.parseFragment(data, 'div', 'lxml', 'utf-8', False).pop()
def clean_nl(string):
    """
    This will clean up newlines so that nl2br can properly be called on the
    cleaned text.
    """
    html_blocks = [
        "{http://www.w3.org/1999/xhtml}blockquote",
        "{http://www.w3.org/1999/xhtml}ol",
        "{http://www.w3.org/1999/xhtml}li",
        "{http://www.w3.org/1999/xhtml}ul",
    ]

    if not string:
        return string

    def parse_html(tree):
        # In etree, a tag may have:
        # - some text content (piece of text before its first child)
        # - a tail (piece of text just after the tag, and before a sibling)
        # - children
        # Eg: "<div>text <b>children's text</b> children's tail</div> tail".

        # Strip new lines directly inside block level elements: first new lines
        # from the text, and:
        # - last new lines from the tail of the last child if there's children
        #   (done in the children loop below).
        # - or last new lines from the text itself.
        if tree.tag in html_blocks:
            if tree.text:
                tree.text = tree.text.lstrip("\n")
                if not len(tree):  # No children.
                    tree.text = tree.text.rstrip("\n")

            # Remove the first new line after a block level element.
            if tree.tail and tree.tail.startswith("\n"):
                tree.tail = tree.tail[1:]

        for child in tree:  # Recurse down the tree.
            if tree.tag in html_blocks:
                # Strip new lines directly inside block level elements: remove
                # the last new lines from the children's tails.
                if child.tail:
                    child.tail = child.tail.rstrip("\n")
            parse_html(child)
        return tree

    parse = parse_html(html5lib.parseFragment(string))

    # Serialize the parsed tree back to html.
    walker = html5lib.treewalkers.getTreeWalker("etree")
    stream = walker(parse)
    serializer = HTMLSerializer(quote_attr_values="always",
                                omit_optional_tags=False)
    return serializer.render(stream)
def _html_serialize(self, chunks, attributes, max_length):
    """Returns concatenated HTML code with SPAN tag.

    Args:
        chunks: The list of chunks to be processed. (ChunkList)
        attributes: If a dictionary, it should be a map of name-value pairs
            for attributes of output SPAN tags. If a string, it should be a
            class name of output SPAN tags. If an array, it should be a list
            of class names of output SPAN tags. (str or dict or list of str)
        max_length: Maximum length of span enclosed chunk. (int, optional)

    Returns:
        The organized HTML code. (str)
    """
    doc = ET.Element('span')
    for chunk in chunks:
        if chunk.is_space():
            if len(doc):
                if doc[-1].tail is None:
                    doc[-1].tail = ' '
                else:
                    doc[-1].tail += ' '
            else:
                if doc.text is not None:
                    # We want to preserve space in cases like "Hello 你好"
                    # But the space in " 你好" can be discarded.
                    doc.text += ' '
        else:
            if chunk.has_cjk() and not (max_length and len(chunk.word) > max_length):
                ele = ET.Element('span')
                ele.text = chunk.word
                for k, v in attributes.items():
                    ele.attrib[k] = v
                doc.append(ele)
            else:
                # Add word without span tag for non-CJK text (e.g. English)
                # by appending it after the last element.
                if len(doc):
                    if doc[-1].tail is None:
                        doc[-1].tail = chunk.word
                    else:
                        doc[-1].tail += chunk.word
                else:
                    if doc.text is None:
                        doc.text = chunk.word
                    else:
                        doc.text += chunk.word
    result = ET.tostring(doc, encoding='utf-8').decode('utf-8')
    result = html5lib.serialize(html5lib.parseFragment(result), sanitize=True,
                                quote_attr_values="always")
    return result
def test_parse_fragment_etree():
    """
    Parsing a fragment to an etree produces a fragment root element that
    directly contains the given HTML.
    """
    fragment = parseFragment("<p>...</p><div>...</div>", treebuilder="etree")

    assert fragment.tag == 'DOCUMENT_FRAGMENT'
    [p, div] = fragment
    assert p.tag == "{http://www.w3.org/1999/xhtml}p"
    assert p.text == "..."
    assert div.tag == "{http://www.w3.org/1999/xhtml}div"
    assert div.text == "..."
def truncate(html, length, killwords=False, end='...'):
    """
    Return a slice of ``html`` <= length chars.

    killwords and end are currently ignored.
    """
    tree = html5lib.parseFragment(html, encoding='utf-8')
    if text_length(tree) <= length:
        return jinja2.Markup(html)
    else:
        short, _ = trim(tree, length, killwords, end)
        return jinja2.Markup(force_unicode(short.toxml()))
def test_parse_fragment_lxml():
    """
    Parsing a fragment to an lxml etree produces a list of the elements in
    the fragment.
    """
    fragment = parseFragment("<p>...</p><div>...</div>", treebuilder="lxml")

    assert isinstance(fragment, list)
    [p, div] = fragment
    assert p.tag == "{http://www.w3.org/1999/xhtml}p"
    assert p.text == "..."
    assert div.tag == "{http://www.w3.org/1999/xhtml}div"
    assert div.text == "..."
def test_linkify():
    tmpl = env.from_string('{{ "http://test.example.com"|linkify}}')
    rendered = tmpl.render()
    el = html5lib.parseFragment(rendered)
    children = list(el)
    assert len(children) == 1
    el = children[0]
    assert el.tag == "{http://www.w3.org/1999/xhtml}a"
    assert el.text == "http://test.example.com"
    assert sorted(el.items()) == [
        ("href", "http://test.example.com"),
        ("rel", "nofollow"),
    ]
def run(self, text):
    parsed = html5lib.parseFragment(text)
    # if we didn't have to customize our sanitization, could just do:
    #   return html5lib.serialize(parsed, sanitize=True)
    # instead we do the same steps as that function, but add our
    # ForgeHTMLSanitizerFilter instead of sanitize=True which would use the
    # standard one
    TreeWalker = html5lib.treewalkers.getTreeWalker("etree")
    walker = TreeWalker(parsed)
    walker = ForgeHTMLSanitizerFilter(walker)  # this is our custom step
    s = html5lib.serializer.HTMLSerializer()
    return s.render(walker)
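# Minimal sketch of a custom sanitizer step like ForgeHTMLSanitizerFilter,
# using only stock html5lib: subclass the standard filter and extend the
# allow list (here with <details>, purely as an example).
from html5lib.filters import sanitizer


class ExtraTagsSanitizerFilter(sanitizer.Filter):
    def __init__(self, source):
        allowed = sanitizer.allowed_elements | frozenset([
            (sanitizer.namespaces["html"], "details"),
        ])
        super().__init__(source, allowed_elements=allowed)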
def get_excerpt(self):
    """
    Look in the body text to find the ‘chapeau’, the lead text, that can be
    used as a description.
    """
    dom = html5lib.parseFragment(self.lead, treebuilder="etree",
                                 namespaceHTMLElements=False)
    for el in dom:
        if el.tag == "p":
            head = el.text or ""
            # el.text does not return the entire text if you have
            # <p>Text with <em>child</em> tags</p>
            # cf http://stackoverflow.com/a/380717
            return "".join([head] + [ElementTree.tostring(e, encoding="unicode")
                                     for e in el])
    return u""
def preprocess(source):
    """Removes unnecessary break lines and white spaces.

    Args:
        source (str): Input sentence.

    Returns:
        Preprocessed sentence. (str)
    """
    doc = html5lib.parseFragment(source)
    source = ET.tostring(doc, encoding='utf-8', method='text').decode('utf-8')
    source = source.replace(u'\n', u'').strip()
    source = re.sub(r'\s\s+', u' ', source)
    return source
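# Usage sketch (not from the original source): tags are dropped and runs of
# whitespace collapse to a single space.
print(preprocess('<p>Hello\n  world</p>'))  # Hello world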
def test_attrib_no_toolbar(self, name='form-0-code', value='<html></html>'):
    ace_widget = django_ace.AceWidget(toolbar=False)
    content = ace_widget.render(name, value)
    root = html5lib.parseFragment(content, namespaceHTMLElements=False)

    editor = root[0]
    self.assertEqual(len(editor), 2)
    self.assertEqual(editor.attrib['class'], 'django-ace-editor')
    self.assertEqual(editor[0].tag, 'div')
    self.assertEqual(editor[0].attrib['class'], 'django-ace-widget loading')
    self.assertEqual(editor[1].tag, 'textarea')
def truncate(html, length, killwords=False, end='...'):
    """
    Return a slice of ``html`` <= length chars.

    killwords and end are currently ignored. ONLY USE FOR KNOWN-SAFE HTML.
    """
    tree = html5lib.parseFragment(html, encoding='utf-8')
    if text_length(tree) <= length:
        return jinja2.Markup(html)
    else:
        short, _ = trim(tree, length, killwords, end)
        return jinja2.Markup(force_unicode(short.toxml()))
def test_green(self):
    r = Run()
    r.id = 1
    rv = showrun(r)
    ok_(isinstance(rv, SafeString))
    frag = parseFragment(rv)
    eq_(len(frag.childNodes), 1)
    a = frag.childNodes[0]
    eq_(a.attributes, {'data-errors': '0',
                       'data-total': '0',
                       'data-missing': '0',
                       'href': '/dashboard/compare?run=1',
                       'data-warnings': '0'})
    text = a.childNodes[0].value
    ok_('green' in text)
def get(self, **kwargs):
    user = kwargs.get('_api_user')
    fmt = kwargs.get('format', 'object')
    part = kwargs.get('part', 'all')
    if fmt not in self._valid_formats:
        raise APISyntaxError("Unknown format: {0}".format(fmt))
    if part not in self._valid_parts:
        raise APISyntaxError("Unknown part: {0}".format(part))
    try:
        page = confluence_session.getPageById(kwargs.get('id'))
    except RemoteException:
        raise NotFound('Page not found')
    require_access(user, auth.Permissions.READ, page.space)
    logger.debug("Access checked")
    page.short_url = confluence_session.make_short_url(page.shortcode)
    if not page.current:
        raise Gone(config.get('Text', 'deleted_article', 'Article deleted.'))

    # Make a copy so we don't clobber the class one
    marshal_fields = self._fields.copy()
    render_kwargs = {'page_id': page.id}
    if part == 'excerpt':
        if page.excerpt is None:
            raise NotFound('The article has no excerpt.')
        del marshal_fields['content']
        marshal_fields['excerpt'] = fields.String
        render_kwargs['content'] = page.excerpt

    # Why do we not simply always pass 'content' to renderContent, and just
    # decide between the page content or the excerpt? Because when a page is
    # rendered by page_id alone, it can be (and is) cached. When a page is
    # rendered by arbitrary content, it is not.
    if fmt == 'html':
        return {'html': confluence_session.renderContent(**render_kwargs)}
    if fmt == 'div':
        html = confluence_session.renderContent(style='clean', **render_kwargs)
        parsed = html5lib.parseFragment(html, treebuilder='etree',
                                        namespaceHTMLElements=False)
        for el in parsed.findall(".//img"):
            if el.get('src').startswith('/confluence'):
                el.set('src', 'http://kb.mit.edu' + el.get('src'))
        for el in parsed.findall(".//a"):
            if el.get('href', '').startswith('/confluence'):
                el.set('href', 'http://kb.mit.edu' + el.get('href'))
        cleaned = xmletree.tostring(parsed[0], method='html')
        return {'html': cleaned}
    return {'page': marshal(page, marshal_fields)}
def get_image(self):
    """
    Look in the body text for the first image.
    Try to find the associated filer object so we can make thumbnails.
    """
    dom = html5lib.parseFragment(self.body, treebuilder="etree",
                                 namespaceHTMLElements=False)
    images = dom.findall('.//img')
    if images:
        img = images[0].get('src')
        # e.g. u'https://medor.coop/media/filer_public/cb/1b/cb1b0760-5931-4766-b062-6ea821ba33c6/gent-cropped.png'
        img_path = urlparse(img).path
        # e.g. u'/media/filer_public/cb/1b/cb1b0760-5931-4766-b062-6ea821ba33c6/gent-cropped.png'
        img_filename = basename(img_path)
        # e.g. u'gent-cropped.png'
        for image in Image.objects.filter(original_filename__iexact=img_filename):
            if image.url == img_path:
                return image
    return None
def generate_slug(html):
    """Generates a URL slug for a HTML fragment."""
    document = parseFragment(html, treebuilder='etree',
                             namespaceHTMLElements=False, encoding='utf-8')
    try:
        text = ' '.join([t for t in document.find('.//h1').itertext()])
    except AttributeError:
        text = ' '.join([t for t in document.itertext()])
    text = get_first_sentence(text)
    text = unidecode(text).lower()
    allowed = ('abcdefghijklmnopqrstuvwxyz'
               '1234567890+- ')
    text = ''.join([c for c in text if c in allowed])
    text = '-'.join(text.split())
    return text.encode('utf-8')
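# Usage sketch (not from the original source): the slug is built from the
# <h1> when one exists; assumes get_first_sentence returns the whole string
# for this short input.
generate_slug('<h1>Héllo, Wörld!</h1><p>body</p>')  # b'hello-world'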
def test_green(self):
    r = Run()
    r.id = 1
    rv = showrun(r)
    ok_(isinstance(rv, SafeUnicode))
    frag = parseFragment(rv)
    childNodes = list(frag)
    eq_(len(childNodes), 1)
    a = childNodes[0]
    eq_(a.attrib, {'data-errors': '0',
                   'data-total': '0',
                   'data-missing': '0',
                   'href': '/dashboard/compare?run=1',
                   'data-warnings': '0'})
    text = a.text
    ok_('green' in text)
def test_green(self):
    r = Run()
    r.id = 1
    rv = showrun(r)
    self.assertIsInstance(rv, SafeUnicode)
    frag = parseFragment(rv)
    childNodes = list(frag)
    self.assertEqual(len(childNodes), 1)
    a = childNodes[0]
    self.assertDictEqual(
        a.attrib,
        {'data-errors': '0',
         'data-total': '0',
         'data-missing': '0',
         'href': '/dashboard/compare?run=1',
         'data-warnings': '0'})
    text = a.text
    self.assertIn('green', text)
def runtest(self):
    input = self.test["input"]
    expected = self.test["output"]
    parsed = parseFragment(input)
    serialized = serialize(parsed,
                           sanitize=True,
                           omit_optional_tags=False,
                           use_trailing_solidus=True,
                           space_before_trailing_solidus=False,
                           quote_attr_values="always",
                           quote_char="'",
                           alphabetical_attributes=True)
    errorMsg = "\n".join(["\n\nInput:", input,
                          "\nExpected:", expected,
                          "\nReceived:", serialized])
    assert expected == serialized, errorMsg
def test_attrib_options(self, name='form-0-code', value='<html></html>'):
    ace_widget = django_ace.AceWidget(
        mode='html',
        theme='twilight',
        wordwrap=True,
        showinvisibles=True,
        minlines=8,
        maxlines=16,
        tabsize=4,
        fontsize=12,
    )
    content = ace_widget.render(name, value)
    root = html5lib.parseFragment(content, namespaceHTMLElements=False)

    editor = root[0]
    widget = editor[1]
    self.assertEqual(widget.tag, 'div')
    self.assertEqual(len(widget.attrib.keys()), 16)
    self.assertEqual(
        sorted(widget.attrib.keys()),
        sorted([
            'class',
            'style',
            'data-mode',
            'data-theme',
            'data-wordwrap',
            'data-minlines',
            'data-maxlines',
            'data-tabsize',
            'data-fontsize',
            'data-behaviours',
            'data-readonly',
            'data-showgutter',
            'data-showinvisibles',
            'data-showprintmargin',
            'data-usesofttabs',
            'data-use-worker',
        ]))
    self.assertEqual(widget.attrib['data-mode'], 'html')
    self.assertEqual(widget.attrib['data-theme'], 'twilight')
    self.assertEqual(widget.attrib['data-wordwrap'], '')
    self.assertEqual(widget.attrib['data-minlines'], '8')
    self.assertEqual(widget.attrib['data-maxlines'], '16')
    self.assertEqual(widget.attrib['data-tabsize'], '4')
    self.assertEqual(widget.attrib['data-fontsize'], '12')
def sanitize_html(html):
    """
    Make the given HTML string safe to display in a Yarrharr page.
    """
    tree = html5lib.parseFragment(html)
    serializer = html5lib.serializer.HTMLSerializer(sanitize=True)
    source = html5lib.getTreeWalker('etree')(tree)
    source = _strip_attrs(source)
    source = _drop_empty_tags(source)
    source = _ReplaceObjectFilter(source)
    source = _ElideFilter(source)
    source = _ReplaceYoutubeEmbedFilter(source)
    source = _ExtractTitleTextFilter(source)
    source = _adjust_links(source)
    source = _video_attrs(source)
    source = _wp_smileys(source)
    return serializer.render(source)
def typogrify(html):
    # Using etree is important here because it does not suffer from a bug
    # where a text featuring entities is split into various adjacent text
    # nodes (thanks html5lib folks for the tip).
    # See <https://github.com/html5lib/html5lib-python/issues/208>
    dom = html5lib.parseFragment(html, treebuilder="etree")
    walker = html5lib.getTreeWalker("etree")
    stream = walker(dom)
    stream = whitespace.Filter(stream)
    stream = medor.Filter(stream)
    stream = figures.Filter(stream)
    s = html5lib.serializer.HTMLSerializer(quote_attr_values="always",
                                           omit_optional_tags=False)
    return s.render(stream)
def parse_comments(self, root, raw):
    ans = ''
    ns = root.xpath('//div[@class="descrip"]')
    if ns:
        ns = ns[0]
        if len(ns) == 0 and ns.text:
            import html5lib
            # html5lib parsed noscript as CDATA
            ns = html5lib.parseFragment('<div>%s</div>' % (ns.text), treebuilder='lxml',
                                        namespaceHTMLElements=False)[0]
        ans = self._render_comments(ns)
    return ans
def _render_comments(self, desc):
    from calibre.library.comments import sanitize_comments_html
    import html5lib
    # html5lib parsed noscript as CDATA
    desc = html5lib.parseFragment(
        '<div>%s</div>' % (self.totext(desc).replace('textarea', 'div')),
        treebuilder='lxml', namespaceHTMLElements=False)[0]

    matches = desc.xpath('descendant::*[contains(text(), "内容提要") '
                         'or contains(text(), "内容推荐") or contains(text(), "编辑推荐") '
                         'or contains(text(), "内容简介") or contains(text(), "基本信息")]'
                         '/../*[self::p or self::div or self::span]')
    if matches:
        if len(matches) > 1:
            desc = matches[-1]
        for item in matches:
            content_len = len(self.totext(item))
            if content_len > 50 and content_len < 200:
                desc = item
                break

    for c in desc.xpath('descendant::noscript'):
        c.getparent().remove(c)
    for c in desc.xpath('descendant::*[@class="seeAll" or'
                        ' @class="emptyClear" or @id="collapsePS" or'
                        ' @id="expandPS"]'):
        c.getparent().remove(c)
    # for a in desc.xpath('descendant::a[@href]'):
    #     del a.attrib['href']
    #     a.tag = 'span'
    desc = self.tostring(desc, method='text', encoding=unicode).strip()
    # return desc

    # Encoding bug in Amazon data U+fffd (replacement char)
    # in some examples it is present in place of '
    desc = desc.replace('\ufffd', "'")
    # remove all attributes from tags
    desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
    # Collapse whitespace
    desc = re.sub('\n+', '\n', desc)
    desc = re.sub(' +', ' ', desc)
    # Remove the notice about text referring to out of print editions
    desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
    # Remove comments
    desc = re.sub(r'(?s)<!--.*?-->', '', desc)
    return sanitize_comments_html(desc)
def test_attrib_default(self, name='form-0-code', value='<html></html>'):
    ace_widget = django_ace.AceWidget()
    content = ace_widget.render(name, value)
    root = html5lib.parseFragment(content, namespaceHTMLElements=False)

    editor = root[0]
    self.assertEqual(len(editor), 3)
    self.assertEqual(editor.tag, 'div')
    self.assertEqual(editor.attrib['class'], 'django-ace-editor')

    toolbar = editor[0]
    self.assertEqual(toolbar.tag, 'div')
    self.assertEqual(sorted(toolbar.attrib.keys()), ['class', 'style'])
    self.assertEqual(toolbar.attrib['class'], 'django-ace-toolbar')
    self.assertEqual(toolbar.attrib['style'], 'width: 500px')
    self.assertEqual(toolbar[0].tag, 'a')
    self.assertEqual(sorted(toolbar[0].attrib.keys()), ['class', 'href'])
    self.assertEqual(toolbar[0].attrib['class'], 'django-ace-max_min')
    self.assertEqual(toolbar[0].attrib['href'], './')

    widget = editor[1]
    self.assertEqual(widget.tag, 'div')
    self.assertEqual(len(widget.attrib.keys()), 8)
    self.assertEqual(
        sorted(widget.attrib.keys()),
        sorted([
            'class',
            'style',
            'data-behaviours',
            'data-readonly',
            'data-showgutter',
            'data-showprintmargin',
            'data-usesofttabs',
            'data-use-worker',
        ]))
    self.assertEqual(widget.attrib['class'], 'django-ace-widget loading')
    self.assertEqual(widget.attrib['style'], 'width:500px; height:300px')
    self.assertEqual(widget.attrib['data-showprintmargin'], '')
    self.assertEqual(widget.attrib['data-usesofttabs'], '')
    self.assertEqual(widget.attrib['data-use-worker'], '')

    textarea = editor[2]
    self.assertEqual(textarea.tag, 'textarea')
    self.assertEqual(textarea.attrib['name'], name)
    self.assertEqual(textarea.text, value)
def find_iter(skeleton, document):
    """
    Return an iterator that yields elements from the document that match
    the given skeleton. See `find_all` for details.
    """
    if is_string(document):
        document = html5lib.parse(document)
    if is_string(skeleton):
        fragment = html5lib.parseFragment(skeleton)
        if len(fragment) != 1:
            raise ValueError("Skeleton must have exactly one root element.")
        skeleton = fragment[0]

    for element in document.iter():
        if node_matches_bone(element, skeleton):
            yield element
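# Usage sketch (not from the original source): the matching semantics live in
# the project's node_matches_bone, so the skeleton below is only illustrative.
doc = '<body><a class="x" href="/">one</a><a href="/">two</a></body>'
for el in find_iter('<a class="x"></a>', doc):
    print(el.text)  # presumably just 'one'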
def clean_nl(string):
    """
    This will clean up newlines so that nl2br can properly be called on the
    cleaned text.
    """
    html_blocks = ['blockquote', 'ol', 'li', 'ul']

    if not string:
        return string

    def parse_html(tree):
        prev_tag = ''
        for i, node in enumerate(tree.childNodes):
            if node.type == 4:  # Text node
                value = node.value
                # Strip new lines directly inside block level elements.
                if node.parent.name in html_blocks:
                    value = value.strip('\n')
                # Remove the first new line after a block level element.
                if prev_tag in html_blocks and value.startswith('\n'):
                    value = value[1:]
                tree.childNodes[i].value = value
            else:
                tree.insertBefore(parse_html(node), node)
                tree.removeChild(node)
            prev_tag = node.name
        return tree

    parse = parse_html(html5lib.parseFragment(string))
    if not parse.childNodes:
        # The parser couldn't make sense of the given html, eg bad markup.
        return ''

    walker = html5lib.treewalkers.getTreeWalker('simpletree')
    stream = walker(parse)
    serializer = HTMLSerializer(quote_attr_values=True,
                                omit_optional_tags=False)
    return serializer.render(stream)
def parse_comments(self, root):
    ans = ''
    ns = CSSSelect('#bookDescription_feature_div noscript')(root)
    if ns:
        ns = ns[0]
        if len(ns) == 0 and ns.text:
            import html5lib
            # html5lib parsed noscript as CDATA
            ns = html5lib.parseFragment('<div>%s</div>' % (ns.text), treebuilder='lxml',
                                        namespaceHTMLElements=False)[0]
        else:
            ns.tag = 'div'
        ans = self._render_comments(ns)
    else:
        desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
        if desc:
            ans = self._render_comments(desc[0])

    desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
    if desc:
        ans += self._render_comments(desc[0])
    return ans
def truncate(html, length, killwords=False, end='...'):
    """
    Return a slice of ``html`` <= length chars.

    killwords and end are currently ignored. ONLY USE FOR KNOWN-SAFE HTML.
    """
    tree = html5lib.parseFragment(html)
    if text_length(tree) <= length:
        return jinja2.Markup(html)
    else:
        # Get a truncated version of the tree.
        short, _ = trim(tree, length, killwords, end)

        # Serialize the parsed tree back to html.
        walker = html5lib.treewalkers.getTreeWalker('etree')
        stream = walker(short)
        serializer = html5lib.serializer.htmlserializer.HTMLSerializer(
            quote_attr_values=True, omit_optional_tags=False)
        return jinja2.Markup(force_unicode(serializer.render(stream)))
def clean_nl(string):
    """
    This will clean up newlines so that nl2br can properly be called on the
    cleaned text.
    """
    html_blocks = ["blockquote", "ol", "li", "ul"]

    if not string:
        return string

    def parse_html(tree):
        prev_tag = ""
        for i, node in enumerate(tree.childNodes):
            if node.type == 4:  # Text node
                value = node.value
                # Strip new lines directly inside block level elements.
                if node.parent.name in html_blocks:
                    value = value.strip("\n")
                # Remove the first new line after a block level element.
                if prev_tag in html_blocks and value.startswith("\n"):
                    value = value[1:]
                tree.childNodes[i].value = value
            else:
                tree.insertBefore(parse_html(node), node)
                tree.removeChild(node)
            prev_tag = node.name
        return tree

    parse = parse_html(html5lib.parseFragment(string))
    walker = html5lib.treewalkers.getTreeWalker("simpletree")
    stream = walker(parse)
    serializer = HTMLSerializer(quote_attr_values=True,
                                omit_optional_tags=False)
    return serializer.render(stream)
def html_to_text(html):
    """
    Convert HTML to representative text.

    All HTML tags are dropped. The content of non-visible tags like
    ``<script>`` and ``<style>`` tags is dropped. Other elements are replaced
    by their textual content. A single space is injected between `non-phrasing
    content <https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content>`_.

    Whitespace is normalized to approximate what CSS's ``white-space: normal``
    `would do on display <https://www.w3.org/TR/CSS2/text.html#white-space-model>`_
    to minimize the size of the resulting string. Leading and trailing
    whitespace is dropped.

    :param str html: HTML string

    :returns: Plain text
    """
    tree = html5lib.parseFragment(html)
    buf = StringIO()

    def visit(el):
        needs_ws = el.tag not in _NO_WHITESPACE_TAGS
        if el.tag == _IMG_TAG:
            buf.write(el.get('alt', '🖼️'))
        elif el.tag not in _DROP_TAGS:
            if el.text is not None:
                if needs_ws:
                    buf.write(' ')
                buf.write(el.text)
            for child in el:
                visit(child)
        if el.tail is not None:
            if needs_ws:
                buf.write(' ')
            buf.write(el.tail)

    visit(tree)
    return _WHITESPACE_RE.sub(' ', buf.getvalue()).strip()
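# Usage sketch (not from the original source): tags are dropped, alt text
# stands in for images, and whitespace is collapsed.
html_to_text('<p>Hello <b>world</b></p> <img alt="pic">')
# roughly: 'Hello world pic'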
def parse_comments(self, root):
    ans = ""
    ns = tuple(self.selector("#bookDescription_feature_div noscript"))
    if ns:
        ns = ns[0]
        if len(ns) == 0 and ns.text:
            import html5lib

            # html5lib parsed noscript as CDATA
            ns = html5lib.parseFragment(
                "<div>%s</div>" % (ns.text), treebuilder="lxml",
                namespaceHTMLElements=False
            )[0]
        else:
            ns.tag = "div"
        ans = self._render_comments(ns)
    else:
        desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
        if desc:
            ans = self._render_comments(desc[0])

    desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
    if desc:
        ans += self._render_comments(desc[0])
    return ans