示例#1
0
def get_main_text(html):
    """Return the plain text of the main content extracted from ``html``.

    The markup goes through ``Document(html).summary()``, the result is
    converted to text with BeautifulSoup, every run of consecutive newline
    characters is collapsed into a single newline, and one leading newline
    (if present) is dropped.

    NOTE(review): ``Document`` is presumably readability's article
    extractor -- confirm against the imports at the top of the file.

    :param html: markup handed unchanged to ``Document``.
    :return: the extracted text after newline clean-up.
    """
    summary_markup = Document(html).summary()
    # NOTE(review): no parser is named here, so the parser used presumably
    # depends on what is installed -- confirm before relying on exact output.
    text = BeautifulSoup(summary_markup).getText()

    # Collapse blank lines: any run of "\n" becomes exactly one "\n".
    # Lines containing only spaces or tabs are not touched by this pattern.
    text = re.sub(r'\n+', '\n', text, flags=re.MULTILINE | re.DOTALL)

    # Drop the newline at the very start of the text, if there is one.
    if text.startswith('\n'):
        text = text[1:]

    return text