Exemplo n.º 1
0
def test_normalize_whitespace():
    """Check ``cleantext.normalize_whitespace`` with and without line breaks.

    The first case keeps line breaks (``no_line_breaks=False``); the second
    asks for them to be removed (``no_line_breaks=True``).
    """
    raw = "Hello, world!  Hello...\t \tworld?\n\nHello:\r\n\n\nWorld. "
    expected = "Hello, world! Hello... world?\nHello:\nWorld."

    # Runs of spaces/tabs become one space, runs of line breaks become one
    # "\n", and the trailing space is dropped.
    kept_breaks = cleantext.normalize_whitespace(raw, no_line_breaks=False)
    assert kept_breaks == expected

    # With line breaks disabled the "\n" is expected to turn into a space.
    flattened = cleantext.normalize_whitespace(" dd\nd  ", no_line_breaks=True)
    assert flattened == "dd d"
Exemplo n.º 2
0
def split_date_county(t):
    """Split a "date | county" header string into its two parts.

    The input is whitespace-normalised onto a single line first. If it
    contains a ``|``, the text before it is treated as the date and the text
    after it is passed through ``strip_lk`` to obtain the county; otherwise
    the whole string is the date and the county is ``None``.

    Args:
        t: Raw header text.

    Returns:
        A ``(date, county)`` tuple. ``date`` is whatever ``parse`` returns
        for the date text (called with ``languages=["de"]``); ``county`` is
        the result of ``strip_lk`` or ``None``.

    Raises:
        ValueError: If the text contains more than one ``|``, because the
            split result no longer unpacks into exactly two parts.
    """
    flattened = normalize_whitespace(t, no_line_breaks=True)

    county = None
    date_part = flattened
    if "|" in flattened:
        # Exactly one separator is expected; anything else fails the unpack.
        date_part, county_part = flattened.split("|")
        county = strip_lk(county_part)

    # Drop the German "Vorfall vom" label before handing the text to parse.
    date_text = date_part.replace("Vorfall vom", "")
    return parse(date_text, languages=["de"]), county
Exemplo n.º 3
0
def process_text_list(entry):
    """Extract sources, title, description and an extra county from an entry.

    Text nodes are gathered from several XPath queries on ``entry``,
    de-duplicated in first-seen order, whitespace-normalised, and stripped of
    empty strings. Special lines are then pulled out of the list:

    * a line equal (after ``clean``) to "Alle Beiträge sehen" is discarded;
    * a line starting with "Quelle:", "Quellen:" or "Quelle " becomes the
      sources string;
    * if no such line exists, the first text node of a right-aligned ``<p>``
      is used as the sources (old page format) and removed from the text;
    * a line starting with "Landkreis:" is passed to ``strip_lk`` and
      returned separately (the last such line wins).

    Filtering builds new lists rather than removing items from the list being
    iterated, so consecutive special lines are all handled.

    Args:
        entry: Object exposing ``xpath(query)`` that returns a list of text
            nodes (presumably an lxml element; verify against the caller).

    Returns:
        ``None`` if no non-empty text was found, otherwise a tuple
        ``(sources, title, description, another_county)``. ``sources``,
        ``title`` and ``another_county`` may each be ``None``;
        ``description`` is the remaining lines joined by blank lines.

    Raises:
        ValueError: If more than one sources line is found.
    """
    # getting main text, consider special cases
    text_list = list(
        entry.xpath(
            ".//div[contains(@class, 'content-element summary')]//p//text()"))

    fallback_queries = (
        ".//div[contains(@class, 'content-element summary')]/text()",
        ".//p//text()",
        ".//div[contains(@class, 'content-model--text')]//text()",
        # ".//div[contains(@class, 'content-element')]//text()" was disabled
        # in the original code and stays disabled here.
    )
    for query in fallback_queries:
        for t in entry.xpath(query):
            # Keep first-seen order while skipping exact duplicates.
            if t not in text_list:
                text_list.append(t)

    normalized = (
        normalize_whitespace(t, no_line_breaks=True) for t in text_list)
    text_list = [t for t in normalized if t]

    # skip if really no text
    if not text_list:
        return

    skip_marker = clean("Alle Beiträge sehen")
    source_prefixes = ("Quelle:", "Quellen:", "Quelle ")

    sources = None
    remaining = []
    for t in text_list:
        if clean(t) == skip_marker:
            continue

        if t.startswith(source_prefixes):
            if sources is not None:
                raise ValueError("found sources twice?")
            sources = (t.replace("Quelle:",
                                 "").replace("Quellen:",
                                             "").replace("Quelle ",
                                                         "").strip())
            continue

        remaining.append(t)
    text_list = remaining

    if sources is None:
        # old format
        old_sources = entry.xpath(".//p[@style='text-align: right;']//text()")
        if old_sources:
            sources = old_sources[0]
            text_list = [t for t in text_list if t.strip() != sources.strip()]

    another_county = None
    remaining = []
    for t in text_list:
        if t.startswith("Landkreis:"):
            another_county = strip_lk(t)
        else:
            remaining.append(t)
    text_list = remaining

    # not sure about this heuristic...
    # A first line less than half as long as the second is taken as a title.
    title = None
    if len(text_list) > 1 and len(text_list[0]) * 2 < len(text_list[1]):
        title = text_list[0]
        text_list = text_list[1:]

    description = "\n\n".join(text_list)

    if DEBUG:
        print()
        print(sources)
        print(title)
        print(description)
        print()
    return sources, title, description, another_county