async def example(q):
    """Fetch a French Wikipedia page and return the text extracted from it.

    ``q`` is appended verbatim to ``https://fr.wikipedia.org/wiki/``; no URL
    escaping is applied here, so the caller is responsible for passing a
    value that is valid inside a URL path.

    The response body is parsed with ``html_text.parse_html``, passed
    through ``html_text.cleaner.clean_html`` and converted to text with
    ``html_text.etree_to_text``. The resulting text is printed and then
    returned.
    """
    response = await asks.get('https://fr.wikipedia.org/wiki/' + q)
    tree = html_text.parse_html(response.text)
    cleaned_tree = html_text.cleaner.clean_html(tree)
    # Convert the tree once and reuse the result for both the print and the
    # return value instead of walking the same tree twice.
    text = html_text.etree_to_text(cleaned_tree)
    print(text)
    return text
def test_webpages(page, extracted):
    """Check that the text extracted from ``page`` matches ``extracted``.

    Both arguments are handed to ``_load_file``. The expected text is
    compared against two extraction paths: ``extract_text`` applied to the
    HTML string directly, and ``etree_to_text`` applied to the tree obtained
    from ``parse_html`` followed by ``cleaner.clean_html``.
    """
    source_html = _load_file(page)
    if not six.PY3:
        # FIXME: produces '\xa0' in Python 2, but ' ' in Python 3
        # this difference is ignored in this test.
        # What is the correct behavior?
        # NOTE(review): as written, both arguments of this replace() appear
        # to be the same character, which would make the call a no-op --
        # confirm the first literal is what the FIXME intends.
        source_html = source_html.replace(' ', ' ')
    expected_text = _load_file(extracted)

    # Path 1: the one-shot helper working on the HTML string.
    assert extract_text(source_html) == expected_text

    # Path 2: explicit parse -> clean -> convert pipeline.
    parsed = parse_html(source_html)
    cleaned = cleaner.clean_html(parsed)
    assert etree_to_text(cleaned) == expected_text
def _extract_textContent(self, node):
    """Return the text of ``node`` after cleaning it.

    ``node`` is passed through ``cleaner.clean_html`` and the result is
    converted to text with ``html_text.etree_to_text``.
    """
    return html_text.etree_to_text(cleaner.clean_html(node))