Python normalize_whitespace示例

编程语言: Python

命名空间/包名称: cleantext

方法/功能: normalize_whitespace

hotexamples.com的示例: 3

Python normalize_whitespace - 共找到 3 个示例。以下是从开源项目中提取的、评分最高的 cleantext.normalize_whitespace 真实 Python 代码示例。您可以为这些示例评分，帮助我们提高示例质量。

示例#1

显示文件

文件： test_clean.py 项目： raoden1/clean-text

def test_normalize_whitespace():
    """Check ``cleantext.normalize_whitespace`` with line breaks kept and removed."""
    # With line breaks kept, the expected value shows runs of spaces/tabs
    # reduced to one space, runs of line breaks reduced to a single "\n",
    # and the trailing space dropped.
    raw = "Hello, world!  Hello...\t \tworld?\n\nHello:\r\n\n\nWorld. "
    expected = "Hello, world! Hello... world?\nHello:\nWorld."
    with_breaks = cleantext.normalize_whitespace(raw, no_line_breaks=False)
    assert with_breaks == expected

    # With line breaks removed, the inner newline becomes a plain space.
    flattened = cleantext.normalize_whitespace(" dd\nd  ", no_line_breaks=True)
    assert flattened == "dd d"

示例#2

显示文件

文件： scraper.py 项目： rechtegewalt/raa-sachsen-scraper

def split_date_county(t):
    """Split a "date | county" string into a parsed date and a county.

    Args:
        t: Raw text; whitespace is normalized (line breaks removed) first.

    Returns:
        A ``(date, county)`` tuple. ``date`` is whatever ``parse`` returns for
        the date part with the "Vorfall vom" prefix removed (parsed with
        German as the language). ``county`` is the part after "|" passed
        through ``strip_lk``, or ``None`` when the text contains no "|".
    """
    cleaned = normalize_whitespace(t, no_line_breaks=True)

    # Default: the whole text is the date and there is no county.
    raw_date, county = cleaned, None
    if "|" in cleaned:
        # Unpacking expects exactly one separator.
        raw_date, raw_county = cleaned.split("|")
        county = strip_lk(raw_county)

    without_prefix = raw_date.replace("Vorfall vom", "")
    return parse(without_prefix, languages=["de"]), county

示例#3

显示文件

文件： scraper.py 项目： rechtegewalt/raa-sachsen-scraper

def process_text_list(entry):
    """Extract sources, title, description and county from one entry node.

    Text fragments are gathered from several XPath queries on ``entry``
    (fragments already collected are skipped, first-seen order is kept),
    whitespace-normalized, and empty results are dropped. Boilerplate lines,
    source lines and "Landkreis:" lines are then separated from the body text.

    Args:
        entry: Object exposing ``xpath()``; presumably an lxml element for a
            single scraped entry -- confirm against the caller.

    Returns:
        ``None`` when no text is found at all, otherwise the tuple
        ``(sources, title, description, another_county)``. ``sources``,
        ``title`` and ``another_county`` are ``None`` when not detected;
        ``description`` is the remaining fragments joined by blank lines.

    Raises:
        ValueError: If more than one fragment looks like a source line.
    """
    # getting main text, consider special cases
    text_list = entry.xpath(
        ".//div[contains(@class, 'content-element summary')]//p//text()")

    fallback_queries = (
        ".//div[contains(@class, 'content-element summary')]/text()",
        ".//p//text()",
        ".//div[contains(@class, 'content-model--text')]//text()",
        # ".//div[contains(@class, 'content-element')]//text()",
    )
    for query in fallback_queries:
        for t in entry.xpath(query):
            if t not in text_list:
                text_list.append(t)

    text_list = [
        normalize_whitespace(t, no_line_breaks=True) for t in text_list
    ]
    text_list = [t for t in text_list if t]

    # skip if really no text
    if not text_list:
        return None

    sources = None
    boilerplate = clean("Alle Beiträge sehen")
    source_prefixes = ("Quelle:", "Quellen:", "Quelle ")

    # Build a new list instead of calling remove() on the list being iterated:
    # removing during iteration shifts the remaining items, so the fragment
    # right after each removed one would be skipped and never inspected.
    remaining = []
    for t in text_list:
        if clean(t) == boilerplate:
            continue
        if t.startswith(source_prefixes):
            if sources is not None:
                raise ValueError("found sources twice?")
            sources = (t.replace("Quelle:",
                                 "").replace("Quellen:",
                                             "").replace("Quelle ",
                                                         "").strip())
            continue
        remaining.append(t)
    text_list = remaining

    if sources is None:
        # old format
        old_sources = entry.xpath(
            ".//p[@style='text-align: right;']//text()")
        if old_sources:
            sources = old_sources[0]
            stripped_source = sources.strip()
            # Drop the source line from the body text.
            text_list = [t for t in text_list if t.strip() != stripped_source]

    # Same filtering approach as above; the last "Landkreis:" line wins.
    another_county = None
    body = []
    for t in text_list:
        if t.startswith("Landkreis:"):
            another_county = strip_lk(t)
        else:
            body.append(t)
    text_list = body

    # not sure about this heuristic...
    # A first fragment less than half as long as the second is taken as title.
    title = None
    if len(text_list) > 1 and len(text_list[0]) * 2 < len(text_list[1]):
        title = text_list[0]
        text_list = text_list[1:]

    description = "\n\n".join(text_list)

    if DEBUG:
        print()
        print(sources)
        print(title)
        print(description)
        print()
    return sources, title, description, another_county