def test_normalize_whitespace():
    """Exercise ``cleantext.normalize_whitespace`` in both line-break modes."""
    raw = "Hello, world! Hello...\t \tworld?\n\nHello:\r\n\n\nWorld. "
    expected = "Hello, world! Hello... world?\nHello:\nWorld."

    # With line breaks kept: tab/space runs become one space, repeated
    # newlines become one newline, and the trailing blank is dropped.
    kept_breaks = cleantext.normalize_whitespace(raw, no_line_breaks=False)
    assert kept_breaks == expected

    # With line breaks removed: the newline turns into a single space and
    # the surrounding whitespace is stripped.
    single_line = cleantext.normalize_whitespace(" dd\nd ", no_line_breaks=True)
    assert single_line == "dd d"
def split_date_county(t):
    """Split a ``"<date> | <county>"`` string into its two parts.

    The input is first flattened to a single line.  When it contains a
    ``|`` the text before it is treated as the date and the text after it
    is passed through ``strip_lk`` to obtain the county; without a ``|``
    the whole string is the date and the county is ``None``.  The literal
    ``"Vorfall vom"`` is removed from the date text before it is handed to
    ``parse`` with German as the only language.

    Args:
        t: raw text containing the date and, optionally, the county.

    Returns:
        A ``(date, county)`` tuple; ``date`` is whatever ``parse`` returns
        (presumably dateparser's ``parse`` -- confirm against the imports).

    Raises:
        ValueError: if the text contains more than one ``|`` (the two-way
            unpacking of the split result fails).
    """
    flat = normalize_whitespace(t, no_line_breaks=True)

    date_text = flat
    county = None
    if "|" in flat:
        # Exactly one separator is expected; anything else fails loudly.
        date_text, county_text = flat.split("|")
        county = strip_lk(county_text)

    without_label = date_text.replace("Vorfall vom", "")
    return parse(without_label, languages=["de"]), county
def process_text_list(entry):
    """Extract sources, title, description and county from one entry.

    Text nodes are collected from several XPath queries (summary paragraphs
    first, then broader fallbacks), de-duplicated, whitespace-normalised to
    single lines, and emptied of blank strings.  From the resulting list the
    function pulls out:

    * the source line (starting with ``Quelle:``, ``Quellen:`` or
      ``Quelle ``), or -- if none is present -- the first text node of a
      right-aligned paragraph (the "old format");
    * a county line starting with ``Landkreis:`` (run through ``strip_lk``);
    * a title, guessed heuristically when the first remaining line is less
      than half as long as the second.

    Everything left over is joined with blank lines into the description.

    Args:
        entry: element exposing ``xpath()`` that returns lists of strings
            (presumably an lxml element -- confirm against the caller).

    Returns:
        ``None`` when the entry contains no text at all, otherwise the tuple
        ``(sources, title, description, another_county)``; ``sources``,
        ``title`` and ``another_county`` are ``None`` when not found.

    Raises:
        ValueError: if more than one source line is found.
    """
    # getting main text, consider special cases
    text_list = entry.xpath(
        ".//div[contains(@class, 'content-element summary')]//p//text()")
    fallback_queries = (
        ".//div[contains(@class, 'content-element summary')]/text()",
        ".//p//text()",
        ".//div[contains(@class, 'content-model--text')]//text()",
        # disabled in the original:
        # ".//div[contains(@class, 'content-element')]//text()",
    )
    for query in fallback_queries:
        for t in entry.xpath(query):
            if t not in text_list:
                text_list.append(t)

    text_list = [
        normalize_whitespace(t, no_line_breaks=True) for t in text_list
    ]
    text_list = [t for t in text_list if t]

    # skip if really no text
    if not text_list:
        return None

    # Pull out the source line and drop the "show all posts" link text.
    # A new list is built instead of calling ``text_list.remove`` inside the
    # loop: removing while iterating skips the element that follows each
    # removed one, so e.g. a source line directly after the link text was
    # never inspected.
    sources = None
    link_text = clean("Alle Beiträge sehen")
    source_prefixes = ("Quelle:", "Quellen:", "Quelle ")
    kept = []
    for t in text_list:
        if clean(t) == link_text:
            continue
        if t.startswith(source_prefixes):
            if sources is not None:
                raise ValueError("found sources twice?")
            sources = (t.replace("Quelle:", "").replace("Quellen:",
                                                        "").replace("Quelle ", "").strip())
            continue
        kept.append(t)
    text_list = kept

    if sources is None:
        # old format
        old_sources = entry.xpath(".//p[@style='text-align: right;']//text()")
        if old_sources:
            sources = old_sources[0]
            # Only filter when a source was actually found; calling
            # ``.strip()`` on ``None`` would raise AttributeError.
            source_text = sources.strip()
            text_list = [t for t in text_list if t.strip() != source_text]

    # Extract the county line; as before, the last match wins.  Again a new
    # list is built so that no element is skipped.
    another_county = None
    kept = []
    for t in text_list:
        if t.startswith("Landkreis:"):
            another_county = strip_lk(t)
        else:
            kept.append(t)
    text_list = kept

    # not sure about this heuristic...
    # A first line less than half as long as the second is taken as title.
    title = None
    if len(text_list) > 1 and len(text_list[0]) * 2 < len(text_list[1]):
        title = text_list[0]
        text_list = text_list[1:]

    description = "\n\n".join(text_list)

    if DEBUG:
        print()
        print(sources)
        print(title)
        print(description)
        print()

    return sources, title, description, another_county