Example #1
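This example, like all the ones that follow, assumes the prelude below. fragment_to_text and html_to_article are the project's own helpers rather than library functions; their roles here are inferred from how the code uses them:

import json
import re

from bs4 import BeautifulSoup

# Assumed project-local helpers (roles inferred from the calls below):
# fragment_to_text(fragment)   -> plain text rendered from an HTML fragment
# html_to_article(html, lang)  -> main article markup extracted from a full page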
def extract_woman(html):
    soup = BeautifulSoup(html, 'lxml')

    # Drop article metadata and tag blocks so they do not leak into the text.
    for node in soup('div', {'class': 'article-info'}):
        node.extract()

    for node in soup('div', {'class': 'article__tags'}):
        node.extract()

    content = []

    # Join all <h1> headers, separated by line breaks.
    header = [str(node) for node in soup('h1')]
    header = '<br><br><br>'.join(header)
    content.append(header)

    # Collect the lead paragraph, article body and comments; the long run
    # of <br> keeps the blocks apart in the plain-text output.
    for node in soup('div', {'class': 'article__lead-paragraph'}):
        content.append(str(node))
        content.append('<br>' * 10)

    for node in soup('div', {'itemprop': 'articleBody'}):
        content.append(str(node))
        content.append('<br>' * 10)

    for node in soup('div', {'class': 'container__content-text'}):
        content.append(str(node))
        content.append('<br>' * 10)

    for node in soup('div', {'class': 'card__comment'}):
        content.append(str(node))
        content.append('<br>' * 10)

    # Wrap everything in a single root element and render it as plain text.
    content = '<div>{}</div>'.format(''.join(content))

    return fragment_to_text(content)
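A minimal usage sketch, the same for every extractor in this listing; the URL is made up, and requests is only an assumed way to fetch the page:

import requests

html = requests.get('https://example.com/some-article').text  # hypothetical URL
print(extract_woman(html))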
Example #2
def extract_pikabu(html):
    soup = BeautifulSoup(html, 'lxml')

    for node in soup('div', {'class': 'sidebar'}):
        node.extract()

    content = []

    header = [str(node) for node in soup('h1')]
    header = '<br><br><br>'.join(header)
    content.append(header)

    for node in soup('div', {'class': 'story__description'}):
        content.append(str(node))
        content.append('<br>' * 10)

    for node in soup('div', {'class': 'story__content'}):
        content.append(str(node))
        content.append('<br>' * 10)

    for node in soup('div', {'class': 'comment__content'}):
        content.append(str(node))
        content.append('<br>' * 10)

    content = '<div>{}</div>'.format(''.join(content))

    return fragment_to_text(content)
Example #3
def extract_article(html):
    # Empty or whitespace-only input: nothing to extract.
    if not html.strip():
        return ''

    soup = BeautifulSoup(html, 'lxml')

    header = [str(node) for node in soup('h1')]
    header = '<br><br><br>'.join(header)

    # html_to_article() runs the project's generic article extraction;
    # 'ru' is presumably the content language.
    article = html_to_article(html, 'ru')
    text = fragment_to_text('<div>' + header + '<br>' * 3 + article + '</div>')

    return text
Example #4
def extract_sport(html):
    soup = BeautifulSoup(html, 'lxml')

    content = []

    header = [str(node) for node in soup('h1')]
    header = '<br><br><br>'.join(header)
    content.append(header)

    for node in soup('div', {'class': 'article_text'}):
        content.append(str(node))
        content.append('<br>' * 10)

    content = '<div>{}</div>'.format(''.join(content))

    return fragment_to_text(content)
Example #5
def extract_ask(html):
    soup = BeautifulSoup(html, 'lxml')

    for node in soup('p', {'class': 'readMore'}):
        node.extract()

    content = []

    for node in soup('header', {'class': 'streamItem_header'}):
        content.append(str(node))
        content.append('<br>' * 10)

    for node in soup('div', {'class': 'streamItem_content'}):
        content.append(str(node))
        content.append('<br>' * 10)

    content = '<div>{}</div>'.format(''.join(content))

    return fragment_to_text(content)
Example #6
def extract_otvet(html):
    # Only parse complete HTML documents; fragments are returned untouched.
    if '</head>' in html and '</body>' in html:
        soup = BeautifulSoup(html, 'lxml')

        content = []

        header = [str(node) for node in soup('h1')]
        header = '<br><br><br>'.join(header)
        content.append(header)

        for node in soup('div', {'itemprop': 'text'}):
            content.append(str(node))
            content.append('<br>' * 10)

        content = '<div>{}</div>'.format(''.join(content))

        return fragment_to_text(content)
    else:
        return html
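Because of the guard, anything that is not a complete HTML document passes through unchanged, for example:

assert extract_otvet('already plain text') == 'already plain text'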
Example #7
def extract_lurk(html):
    # The wiki reports "В базе данных не найдено" ("not found in the
    # database") for missing articles; there is nothing to extract.
    if 'В базе данных не найдено' in html:
        return ''

    soup = BeautifulSoup(html, 'lxml')

    # Strip wiki chrome: plaques, the table of contents, button bars,
    # no-print blocks and collapsible sections.
    for node in soup('table', {'class': 'lm-plashka'}):
        node.extract()
    for node in soup('table', {'id': 'toc'}):
        node.extract()
    for node in soup('div', {'class': 'buttons-line'}):
        node.extract()
    for node in soup('div', {'class': 'noprint'}):
        node.extract()
    # A tag name of None matches any tag with the given class.
    for node in soup(None, {'class': 'mw-collapsible'}):
        node.extract()

    content = []

    header = [str(node) for node in soup('h1')]
    header = '<br><br><br>'.join(header)
    content.append(header)

    # Skip MediaWiki service pages, identified by namespace prefixes
    # in the page title.
    for bad_title in [
            'User:', 'Mediawiki:', 'Special:', 'Lurkmore:', 'Участник:',
            'Служебная:', 'Обсуждение:', 'Категория:', 'Портал:',
            'Обсуждение портала:', 'Шаблон:', 'Обсуждение участника:', 'Файл:',
            'Обсуждение категории:', 'Обсуждение шаблона:',
            'Обсуждение копипасты:', 'Обсуждение смехуечков:',
            'Обсуждение файла:', 'Смехуечки:', 'Обсуждение MediaWiki:'
    ]:
        if bad_title in header:
            return ''

    for node in soup('div', {'id': 'mw-content-text'}):
        content.append(str(node))
        content.append('<br>' * 10)

    content = '<div>{}</div>'.format(''.join(content))

    return fragment_to_text(content)
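A quick check of the namespace filter above; the markup and title are invented:

assert extract_lurk('<html><body><h1>Участник:Vasya</h1></body></html>') == ''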
Example #8
def extract_irec(html):
    soup = BeautifulSoup(html, 'lxml')

    content = []

    header = [str(node) for node in soup('h2', {'class': 'reviewTitle'})]
    header = '<br><br><br>'.join(header)
    content.append(header)

    for node in soup('div', {'itemprop': 'reviewBody'}):
        content.append(str(node))
        content.append('<br>' * 10)

    for node in soup('div', {'class': 'cmntreply-text'}):
        content.append(str(node))
        content.append('<br>' * 10)

    content = '<div>{}</div>'.format(''.join(content))

    return fragment_to_text(content)
Example #9
def extract_mk(html):
    soup = BeautifulSoup(html, 'lxml')

    content = []

    header = [str(node) for node in soup('h1')]
    header = '<br><br><br>'.join(header)
    content.append(header)

    for node in soup('div', {'itemprop': 'description'}):
        content.append(str(node))
        content.append('<br>' * 10)

    for node in soup('div', {'itemprop': 'articleBody'}):
        content.append(str(node))
        content.append('<br>' * 10)

    content = '<div>{}</div>'.format(''.join(content))

    return fragment_to_text(content)
Example #10
def extract_habr(html):
    soup = BeautifulSoup(html, 'lxml')

    content = []

    header = [str(node) for node in soup('h1')]
    header = '<br><br><br>'.join(header)
    content.append(header)

    for node in soup('div', {'class': 'post__text'}):
        content.append(str(node))
        content.append('<br>' * 10)

    for node in soup('div', {'class': 'comment__message'}):
        content.append(str(node))
        content.append('<br>' * 3)

    content = '<div>{}</div>'.format(''.join(content))

    return fragment_to_text(content)
Example #11
def extract_rbc(html):
    soup = BeautifulSoup(html, 'lxml')

    for node in soup('div', {'class': 'article__main-image'}):
        node.extract()

    content = []

    header = [str(node) for node in soup('h1')]
    header = '<br><br><br>'.join(header)
    content.append(header)

    for node in soup('div', {'class': 'article__header__subtitle'}):
        content.append(str(node))
        content.append('<br>' * 10)

    for node in soup('div', {'itemprop': 'articleBody'}):
        content.append(str(node))
        content.append('<br>' * 10)

    content = '<div>{}</div>'.format(''.join(content))

    return fragment_to_text(content)
Example #12
def extract_lj(html):
    comments = []
    try:
        # The page embeds its metadata as a "Site.page = {...};" JSON blob;
        # the three captured groups re-join into a parseable JSON string.
        meta_text = re.findall(r'\sSite\.page = (\{")([\s\S]+?)(\});\s+Site',
                               html)
        if len(meta_text) == 1 and len(meta_text[0]) == 3:
            meta_dict = json.loads(''.join(meta_text[0]))
            if 'comments' in meta_dict:
                for c in meta_dict['comments']:
                    # Skip anonymous comments and bot accounts.
                    if 'uname' not in c or 'bot' in c['uname']:
                        continue
                    # Skip comments without a text body.
                    if 'article' not in c or not c['article']:
                        continue
                    comments.append(c['article'])
    except Exception as e:
        print(e)

    if comments:
        comments = '<br><br><br>'.join(comments)
        comments = fragment_to_text(comments)
    else:
        comments = ''

    # The post body itself goes through the generic extract_article() above.
    return extract_article(html) + '\n\n\n' + comments
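For reference, a sketch of the embedded Site.page blob that the regex above targets; the uname and article values here are invented:

sample = (' Site.page = {"comments":'
          ' [{"uname": "alice", "article": "Nice post"}]};\n Site')
groups = re.findall(r'\sSite\.page = (\{")([\s\S]+?)(\});\s+Site', sample)
meta = json.loads(''.join(groups[0]))
assert meta['comments'][0]['article'] == 'Nice post'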
Example #13
def extract_drive2(html):
    soup = BeautifulSoup(html, 'lxml')

    content = []

    header = [str(node) for node in soup('h1')]
    header = '<br><br><br>'.join(header)
    content.append(header)

    for node in soup('div', {'itemprop': 'articleBody'}):
        content.append(str(node))
        content.append('<br>' * 10)

    for node in soup('div', {'itemprop': 'reviewBody'}):
        content.append(str(node))
        content.append('<br>' * 10)

    for node in soup('div', {'class': 'c-comment__text'}):
        content.append(str(node))
        content.append('<br>' * 3)

    content = '<div>{}</div>'.format(''.join(content))

    return fragment_to_text(content)