def clean_html(self, elemtree):
    """
    Return a cleaned copy of an HTML tree.

    The tree is serialized with ``tostring``, passed through
    ``HtmlEntityDecoder.decode_htmlentities`` (HTML entity decoding) and
    ``Normalize.translate_national`` (national characters to normal
    form), every ``<br>`` tag is replaced by a single space, and the
    resulting markup is parsed again with ``fromstring``.

    Warning: the result is a newly built ElementTree, not the object
    that was passed in.

    :param elemtree: HTML page; the expected format is
        ``lxml.etree._ElementTree``.
    :returns: ``ElementTree`` wrapping the re-parsed, cleaned markup.
    """
    decoder = HtmlEntityDecoder()
    html = tostring(elemtree)
    html = decoder.decode_htmlentities(html)
    html = Normalize.translate_national(html)
    # Raw string: "\/" inside a plain literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+). The compiled pattern is unchanged.
    # NOTE(review): the pattern matches any tag whose name starts with
    # "br", not only <br> -- confirm that is acceptable for the input.
    html = re.sub(r"<[bB][rR][^>]*\/?>", " ", html)
    return ElementTree(fromstring(html))
Пример #2
0
 def clean_html(self, elemtree):
     """
     Return a cleaned copy of an HTML tree.

     The tree is serialized with ``tostring``, passed through
     ``HtmlEntityDecoder.decode_htmlentities`` (HTML entity decoding) and
     ``Normalize.translate_national`` (national characters to normal
     form), every ``<br>`` tag is replaced by a single space, and the
     resulting markup is parsed again with ``fromstring``.

     Warning: the result is a newly built ElementTree, not the object
     that was passed in.

     :param elemtree: HTML page; the expected format is
         ``lxml.etree._ElementTree``.
     :returns: ``ElementTree`` wrapping the re-parsed, cleaned markup.
     """
     decoder = HtmlEntityDecoder()
     html = tostring(elemtree)
     html = decoder.decode_htmlentities(html)
     html = Normalize.translate_national(html)
     # Raw string: "\/" inside a plain literal is an invalid escape sequence
     # (SyntaxWarning on Python 3.12+). The compiled pattern is unchanged.
     # NOTE(review): the pattern matches any tag whose name starts with
     # "br", not only <br> -- confirm that is acceptable for the input.
     html = re.sub(r"<[bB][rR][^>]*\/?>", " ", html)
     return ElementTree(fromstring(html))