def test_html_to_text():
    """Check web.html_to_text() with each converter and with a bad method name."""
    # Lynx output must contain the page title and the "References" section.
    lynx_text = web.html_to_text(GOOGLE_HTML, method=cfg.LYNX)
    assert 'Google' in lynx_text
    assert 'References' in lynx_text

    # html2text output must at least contain the page title.
    html2text_text = web.html_to_text(GOOGLE_HTML, method=cfg.HTML2TEXT)
    assert 'Google' in html2text_text

    # An unrecognised method name yields None.
    unknown_result = web.html_to_text(GOOGLE_HTML, method='unknown method')
    assert unknown_result is None
def main(url):
    """Fetch *url*, convert the page to text and hand it to print_result().

    The page is requested through ``web.get_page`` with ``user_agent=True``
    and converted with ``web.html_to_text`` using the ``web.HTML2TEXT`` method.
    """
    page_source = web.get_page(url, user_agent=True)
    page_text = web.html_to_text(page_source, method=web.HTML2TEXT)
    # NOTE: the ASCII clean-up steps (unicode_to_ascii, middle-dot replacement,
    # non-ASCII stripping) were commented out in the original and stay
    # disabled; the converted text is passed on unchanged.
    print_result(page_text)
def process(word):
    """Fetch the page for *word*, reduce it to ASCII text and print it.

    The URL is built by filling ``word`` into the module-level ``_template``.
    The page is fetched with ``web.get_page``, converted with
    ``web.html_to_text`` and decoded as UTF-8. After clean-up the
    ASCII-encoded result is passed to ``print_result``.
    """
    url = _template.format(word=word)
    html = web.get_page(url, user_agent=True)
    txt = web.html_to_text(html).decode("utf-8")
    # Turn the middle dot (U+00B7) into a hyphen before non-ASCII characters
    # are stripped, so it is not simply dropped.
    txt = txt.replace(u"\xb7", "-")
    txt = ascii.remove_non_ascii(txt)
    # Remove "[...gif]" markers (presumably image placeholders left by the
    # HTML-to-text conversion). Raw string avoids the invalid "\[" escape,
    # and the dot before "gif" is escaped so only a literal ".gif" matches.
    # The substitution runs before encoding so a str pattern is never applied
    # to a bytes object.
    txt = re.sub(r"\[.*?\.gif\]", "", txt)
    print_result(txt.encode("ascii"))
def process(word):
    """Fetch the page for *word*, reduce it to ASCII text and print it.

    The URL is built by filling ``word`` into the module-level ``_template``.
    The page is fetched with ``web.get_page``, converted with
    ``web.html_to_text`` and decoded as UTF-8. After clean-up the
    ASCII-encoded result is passed to ``print_result``.
    """
    url = _template.format(word=word)
    html = web.get_page(url, user_agent=True)
    txt = web.html_to_text(html).decode('utf-8')
    # Turn the middle dot (U+00B7) into a hyphen before non-ASCII characters
    # are stripped, so it is not simply dropped.
    txt = txt.replace(u'\xb7', '-')
    txt = ascii.remove_non_ascii(txt)
    # Remove "[...gif]" markers (presumably image placeholders left by the
    # HTML-to-text conversion). Raw string avoids the invalid "\[" escape,
    # and the dot before "gif" is escaped so only a literal ".gif" matches.
    # The substitution runs before encoding so a str pattern is never applied
    # to a bytes object.
    txt = re.sub(r'\[.*?\.gif\]', '', txt)
    print_result(txt.encode('ascii'))