예제 #1
0
def test_html_to_text():
    """Exercise web.html_to_text with the Lynx, html2text and an unknown method."""
    # Lynx conversion: the output must mention both 'Google' and 'References'.
    lynx_text = web.html_to_text(GOOGLE_HTML, method=cfg.LYNX)
    assert 'Google' in lynx_text
    assert 'References' in lynx_text

    # html2text conversion: only 'Google' is checked.
    html2text_text = web.html_to_text(GOOGLE_HTML, method=cfg.HTML2TEXT)
    assert 'Google' in html2text_text

    # An unrecognised method name is expected to produce None.
    unknown_result = web.html_to_text(GOOGLE_HTML, method='unknown method')
    assert unknown_result is None
예제 #2
0
def main(url):
    """Fetch the page at `url`, convert it to text and pass it to print_result.

    The page is requested with ``user_agent=True`` and converted using the
    ``web.HTML2TEXT`` method.
    """
    page_source = web.get_page(url, user_agent=True)
    page_text = web.html_to_text(page_source, method=web.HTML2TEXT)

    # NOTE: an ASCII clean-up step (unicode_to_ascii, replacing u'\xb7' with
    # '-', remove_non_ascii + encode('ascii')) used to sit here but is
    # disabled; the converted text is forwarded unchanged.
    print_result(page_text)
예제 #3
0
def main(url):
    """Download `url`, turn its HTML into text and give it to print_result.

    ``web.get_page`` is called with ``user_agent=True`` and the conversion
    uses the ``web.HTML2TEXT`` method.
    """
    raw_html = web.get_page(url, user_agent=True)
    converted = web.html_to_text(raw_html, method=web.HTML2TEXT)

    # NOTE: the ASCII normalisation that was once applied at this point
    # (unicode_to_ascii, u'\xb7' -> '-', remove_non_ascii + encode('ascii'))
    # is switched off, so the text is passed on as converted.
    print_result(converted)
예제 #4
0
def process(word):
    """Fetch the page for `word`, clean up its text and pass it to print_result.

    The URL is built from the module-level ``_template``. The page is fetched
    with ``user_agent=True``, converted to text and decoded as UTF-8. The
    text is then reduced to ASCII, and bracketed image placeholders ending in
    ``.gif]`` are removed before the result is encoded to ASCII bytes.
    """
    url = _template.format(word=word)
    html = web.get_page(url, user_agent=True)
    txt = web.html_to_text(html).decode("utf-8")

    # Turn the middle dot (U+00B7) into a hyphen first, so it is not affected
    # by the non-ASCII clean-up that follows.
    txt = txt.replace(u"\xb7", "-")
    txt = ascii.remove_non_ascii(txt)

    # Strip the image placeholders while the value is still text: on Python 3
    # a str pattern cannot be applied to the bytes returned by encode().
    # The pattern is a raw string (no invalid "\[" escape) and the dot before
    # "gif" is escaped so it only matches a literal ".gif" suffix.
    txt = re.sub(r"\[.*?\.gif\]", "", txt)
    txt = txt.encode("ascii")

    print_result(txt)
예제 #5
0
def process(word):
    """Fetch the page for `word`, clean up its text and pass it to print_result.

    The URL is built from the module-level ``_template``. The page is fetched
    with ``user_agent=True``, converted to text and decoded as UTF-8. The
    text is then reduced to ASCII, and bracketed image placeholders ending in
    ``.gif]`` are removed before the result is encoded to ASCII bytes.
    """
    url = _template.format(word=word)
    html = web.get_page(url, user_agent=True)
    txt = web.html_to_text(html).decode('utf-8')

    # Turn the middle dot (U+00B7) into a hyphen first, so it is not affected
    # by the non-ASCII clean-up that follows.
    txt = txt.replace(u'\xb7', '-')
    txt = ascii.remove_non_ascii(txt)

    # Strip the image placeholders while the value is still text: on Python 3
    # a str pattern cannot be applied to the bytes returned by encode().
    # The pattern is a raw string (no invalid '\[' escape) and the dot before
    # 'gif' is escaped so it only matches a literal '.gif' suffix.
    txt = re.sub(r'\[.*?\.gif\]', '', txt)
    txt = txt.encode('ascii')

    print_result(txt)