예제 #1
0
async def example(q):
    """Fetch a French Wikipedia page and return its extracted text.

    Args:
        q: Page title, appended verbatim to
            ``https://fr.wikipedia.org/wiki/`` (no URL escaping is applied).

    Returns:
        The text produced by ``html_text.etree_to_text`` from the cleaned
        HTML tree of the response body. The same text is also printed.
    """
    r = await asks.get('https://fr.wikipedia.org/wiki/' + q)
    tree = html_text.parse_html(r.text)
    cleaned_tree = html_text.cleaner.clean_html(tree)
    # Convert the tree to text once and reuse the result for both the
    # printed output and the return value.
    text = html_text.etree_to_text(cleaned_tree)
    print(text)
    return text
예제 #2
0
def test_webpages(page, extracted):
    """Check text extraction of a stored page against its expected output.

    The expected text must be produced both by ``extract_text`` and by the
    explicit ``parse_html`` -> ``cleaner.clean_html`` -> ``etree_to_text``
    pipeline.

    Args:
        page: Name passed to ``_load_file`` to load the input HTML.
        extracted: Name passed to ``_load_file`` to load the expected text.
    """
    html = _load_file(page)
    if not six.PY3:
        # FIXME: &nbsp; produces '\xa0' in Python 2, but ' ' in Python 3
        # this difference is ignored in this test.
        # What is the correct behavior?
        html = html.replace('&nbsp;', ' ')
    expected = _load_file(extracted)
    # One-call API.
    assert extract_text(html) == expected

    # Step-by-step pipeline must agree with the one-call API.
    tree = cleaner.clean_html(parse_html(html))
    assert etree_to_text(tree) == expected
예제 #3
0
 def _extract_textContent(self, node):
     """Return the text of ``node`` after passing it through ``cleaner``.

     The node is first run through ``cleaner.clean_html`` and the result
     is converted to text with ``html_text.etree_to_text``.
     """
     return html_text.etree_to_text(cleaner.clean_html(node))