Example #1
import re
import wikitextparser as wtp  # provides wtp.parse(...).plain_text()

# remove_double_curly, remove_double_brackets, and htt are helpers
# defined elsewhere in this example's source file
def dewiki(text):
    text = remove_double_curly(text)     # strip {{...}} template markup
    text = remove_double_brackets(text)  # strip [[...]] link markup
    text = wtp.parse(text).plain_text()  # wiki markup to plain text
    text = htt(text)                     # strip leftover HTML
    text = text.replace('\\n', ' ')      # replace escaped newlines
    text = re.sub(r'\[\[', ' ', text)    # remove any remnant opening brackets
    text = re.sub(r'\]\]', ' ', text)    # remove any remnant closing brackets
    text = re.sub(r'\s+', ' ', text)     # collapse excess whitespace
    return text
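remove_double_curly and remove_double_brackets are not shown in this example. Judging only by their names, they strip {{...}} template markup and unwrap [[...]] links; the sketches below are hypothetical stand-ins, not the original implementations:

import re

def remove_double_curly(text):
    # repeatedly strip innermost {{...}} templates so nesting unwinds
    while re.search(r'\{\{[^{}]*\}\}', text):
        text = re.sub(r'\{\{[^{}]*\}\}', ' ', text)
    return text

def remove_double_brackets(text):
    # keep the display text of [[target|display]] links, drop the markup
    return re.sub(r'\[\[(?:[^|\]]*\|)?([^\]]*)\]\]', r'\1', text)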
Example #2
def analyze_chunk(text):
    # text is one <page>...</page> chunk of a Wikipedia XML dump;
    # htt and dewiki are defined alongside this function
    try:
        if '<redirect title="' in text:  # this is not the main article
            return None
        else:
            title = text.split('<title>')[1].split('</title>')[0]
            if ':' in title:  # this is a talk, category, or other (not a real article)
                return None
            title = htt(title).strip()
        serial = text.split('<id>')[1].split('</id>')[0]
        content = text.split('</text')[0].split('<text')[1].split(
            '>', maxsplit=1)[1]
        content = dewiki(content)
        return {'title': title, 'text': content, 'id': serial}
    except Exception:  # malformed chunk; skip it
        return None
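To make the expected input concrete, here is analyze_chunk fed a simplified <page> chunk. The sample XML is invented for illustration and far sparser than a real Wikipedia dump, and it assumes the companion dewiki and htt helpers are in scope:

sample = """<page>
  <title>Alan Turing</title>
  <id>42</id>
  <text>'''Alan Turing''' was a mathematician.</text>
</page>"""
print(analyze_chunk(sample))
# roughly: {'title': 'Alan Turing', 'text': 'Alan Turing was a mathematician.', 'id': '42'}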
Example #3
def dewiki(text):
    # the remove_* helpers and htt are defined elsewhere in this file
    text = remove_simple_links(text)
    text = remove_pictures(text)
    text = remove_audio(text)
    text = remove_compound_links(text)
    text = remove_references(text)
    text = remove_citations(text)
    text = remove_categories(text)
    text = remove_all_links(text)
    text = remove_urls(text)
    text = remove_wikitables(text)  # TODO preserve this data somehow

    text = wtp.parse(text).plain_text()  # wiki to plaintext whatever is left
    text = htt(text)  # de-HTML text

    # text = re.sub(r'\]\]', ' ', text)  # remove any remnant brackets
    text = text.replace('\\n', ' ')  # replace escaped newlines
    text = re.sub(r'\s+', ' ', text)  # collapse excess whitespace
    return text
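The remove_wikitables helper is also defined outside this snippet. Assuming it simply drops {| ... |} table blocks, which is consistent with the TODO about preserving the table data, a hypothetical version might look like:

import re

def remove_wikitables(text):
    # drop {| ... |} table markup wholesale; this loses the table data the
    # TODO wants to preserve, and it does not handle nested tables
    return re.sub(r'\{\|.*?\|\}', ' ', text, flags=re.DOTALL)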
Example #4
def dewiki(text):
    text = text.replace('\\n', ' ')  # replace escaped newlines
    text = re.sub(r'\s+', ' ', text)  # collapse excess whitespace
    text = remove_audio(text)
    text = remove_references(text)
    text = remove_citations(text)
    text = remove_categories(text)
    text = remove_simple_links(text)
    text = remove_compound_links(text)
    text = remove_pictures(text)
    text = remove_all_links(text)
    text = remove_urls(text)
    # TODO handle class="sortable wikitable" and class="wikitable"

    text = wtp.parse(text).plain_text()  # wiki to plaintext whatever is left
    text = htt(text)  # de-HTML text

    text = re.sub(r'\]\]', ' ', text)  # remove any remnant brackets
    text = re.sub(r'\s+', ' ', text)  # collapse whitespace once more
    return text
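Two more of the unshown helpers are guessable from their names alone; again, these sketches are hypothetical stand-ins, not the original code:

import re

def remove_categories(text):
    # drop [[Category:...]] tags
    return re.sub(r'\[\[Category:[^\]]*\]\]', ' ', text)

def remove_urls(text):
    # drop bare http(s):// URLs
    return re.sub(r'https?://\S+', ' ', text)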
Example #5
def analyze_chunk(text):
    try:
        if '<redirect title="' in text:  # this is not the main article
            return None
        if '(disambiguation)' in text:  # this is not an article
            return None
        else:
            title = text.split('<title>')[1].split('</title>')[0]
            title = htt(title)
            if ':' in title:  # titles containing ':' are usually talk/category/etc., not real articles
                return None
        serial = text.split('<id>')[1].split('</id>')[0]
        content = text.split('</text')[0].split('<text')[1].split(
            '>', maxsplit=1)[1]
        content = dewiki(content)
        return {
            'title': title.strip(),
            'text': content.strip(),
            'id': serial.strip()
        }
    except Exception as oops:
        print(oops)
        return None
Example #6
def dewiki(text):
    text = wtp.parse(text).plain_text()  # wiki to plaintext
    text = htt(text)  # remove any HTML
    text = text.replace('\\n', ' ')  # replace escaped newlines
    text = re.sub(r'\s+', ' ', text)  # collapse excess whitespace
    return text
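Every example above leans on htt, a project-local de-HTML helper whose definition is not shown. A minimal standard-library stand-in, assuming htt just strips tags and keeps the text nodes (hypothetical, not the original):

from html.parser import HTMLParser

class _HTMLToText(HTMLParser):
    def __init__(self):
        super().__init__()
        self.parts = []

    def handle_data(self, data):
        # collect only the text nodes, skipping tags and comments
        self.parts.append(data)

def htt(raw):
    f = _HTMLToText()
    f.feed(raw)
    return ''.join(f.parts)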