def clean_html(self, elemtree):
    """
    Clean an HTML page given as an lxml.etree._ElementTree.

    The tree is serialized to markup, HTML entities are decoded, national
    characters are translated into their normal form and every <br> tag
    is replaced by a single space. The resulting markup is parsed again.

    Warning! This method creates a new ElementTree instead of the old one!

    :param elemtree: tree to clean.
    :return: a new ElementTree built from the cleaned markup.
    """
    ed = HtmlEntityDecoder()
    html = tostring(elemtree)
    html = ed.decode_htmlentities(html)
    html = Normalize.translate_national(html)
    # Raw string: the former "\/" was an invalid escape sequence. The
    # lookahead requires the tag name to end right after "br", so tags that
    # merely start with "br" keep their opening tag. An optional "/" before
    # ">" is already covered by [^>]*.
    html = re.sub(r"<[bB][rR](?=[\s/>])[^>]*>", " ", html)
    return ElementTree(fromstring(html))
def clean_html(self, elemtree):
    """
    Clean an HTML page given as an lxml.etree._ElementTree.

    The tree is serialized to markup, HTML entities are decoded, national
    characters are translated into their normal form and every <br> tag
    is replaced by a single space. The resulting markup is parsed again.

    Warning! This method creates a new ElementTree instead of the old one!

    :param elemtree: tree to clean.
    :return: a new ElementTree built from the cleaned markup.
    """
    ed = HtmlEntityDecoder()
    html = tostring(elemtree)
    html = ed.decode_htmlentities(html)
    html = Normalize.translate_national(html)
    # Raw string: the former "\/" was an invalid escape sequence. The
    # lookahead requires the tag name to end right after "br", so tags that
    # merely start with "br" keep their opening tag. An optional "/" before
    # ">" is already covered by [^>]*.
    html = re.sub(r"<[bB][rR](?=[\s/>])[^>]*>", " ", html)
    return ElementTree(fromstring(html))