removes numbers in complex format, but not if a . is followed as it introduces the end of a sentence. run after DateRemover() Examples: 15.10 Uhr OR 3,5 bis 4 stunden. OR 100 000 euro. OR 20?000 förderanträge OR um 2025/2030 OR OR abc 18.000. a OR abc. 18.000. a OR abc 18. a OR abc 7.8.14. a OR abc 7. 14. 18. a OR abc 1970er. a OR abc 20?()/&!%000. a OR abc 2,9-3,5. a OR abc . 18. a OR abc . 7.8.14. a OR abc . 7. 14. 18. a OR abc 1790er OR abc . 20?()/&!%000 a OR abc . 2,9-3,5 a OR abc 45, 59 a OR abc . 14 z OR abc 1. e OR abc v. 2 a """ string = re.sub('(?<!\w)(\d+)([\W\s]+|)|([\W\s]+)\d+', ' ', string) # TODO: check later # Alternative: ((\d+)(.|\s{1,3}|)\d+)(.|\s)(?! er) return string nlp = German() sbd = nlp.create_pipe('sentencizer') nlp.add_pipe(sbd) def Sentencizer(string, verbose=False): """ requires from importing language from spacy and loading of sentence boundary detection: from spacy.lang.de import German nlp = German() sbd = nlp.create_pipe('sentencizer') nlp.add_pipe(sbd) for some single strings nlp() cannot process (rare, e.g. 'nan'), exclude those; except pass solve later """ sents_list = [] try: doc = nlp(string)
def getSentences(text):
    """Split *text* into sentences with spaCy's rule-based sentencizer.

    Builds a fresh, model-free German pipeline on every call, so this is
    convenient but not fast for bulk use.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list of str
        The whitespace-stripped sentences, in document order.
    """
    pipeline = German()
    pipeline.add_pipe(pipeline.create_pipe('sentencizer'))
    parsed = pipeline(text)
    sentences = []
    for sentence in parsed.sents:
        # Span.string (spaCy 2) keeps trailing whitespace; strip it off.
        sentences.append(sentence.string.strip())
    return sentences
import spacy
from spacy.lang.de import German
import pandas as pd
import time

# Model-free German pipeline with only the rule-based sentence splitter.
nlp = German()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
texts = pd.read_csv('../data/cleaned-text-dump.csv', low_memory=False)


def sentencizer(raw_text, nlp):
    """Split *raw_text* into stripped sentences using the given spaCy pipeline.

    Parameters
    ----------
    raw_text : str
        Text to segment.
    nlp : spacy.language.Language
        A pipeline containing a sentencizer component.

    Returns
    -------
    list of str
        Stripped sentence strings in document order.
    """
    doc = nlp(raw_text)
    sentences = [sent.string.strip() for sent in doc.sents]
    return(sentences)


def fix_wrong_splits(sentences):
    """Re-join sentences that the sentencizer split after an abbreviation.

    If a sentence ends with a known (mostly German/medical) abbreviation, or
    is suspiciously short (< 10 chars), it is merged with the following
    sentence. The list is modified in place and also returned for
    convenience.

    Parameters
    ----------
    sentences : list of str
        Sentence strings to repair; mutated in place.

    Returns
    -------
    list of str
        The same (repaired) list object.
    """
    # BUG FIX: the original tuple was missing two commas, so Python's
    # implicit string-literal concatenation silently fused
    # 'Vd.a.' 'i.v'  -> 'Vd.a.i.v'  and  'a.e.' 'I.' -> 'a.e.I.',
    # meaning 'Vd.a.', 'i.v', 'a.e.' and 'I.' never matched at all.
    abbreviations = (
        'Z.n.', 'V.a.', 'v.a.', 'Vd.a.', 'i.v', ' re.', ' li.',
        'und 4.', 'bds.', 'Bds.', 'Pat.', 'i.p.', 'i.P.', 'b.w.',
        'i.e.L.', ' pect.', 'Ggfs.', 'ggf.', 'Ggf.', 'z.B.', 'a.e.',
        'I.', 'II.', 'III.', 'IV.', 'V.', 'VI.', 'VII.', 'VIII.',
        'IX.', 'X.', 'XI.', 'XII.',
    )
    i = 0
    # NOTE(review): the bound len(sentences) - 2 leaves the last two
    # entries unexamined, and i advances even after a merge (the merged
    # sentence is not re-checked). Preserved as-is — confirm whether
    # that is intentional before tightening.
    while i < (len(sentences) - 2):
        if sentences[i].endswith(abbreviations):
            # Wrong split after an abbreviation: glue i and i+1 together.
            sentences[i:i + 2] = [' '.join(sentences[i:i + 2])]
        elif len(sentences[i]) < 10:
            # Very short fragments are almost always split artifacts.
            sentences[i:i + 2] = [' '.join(sentences[i:i + 2])]
        i += 1
    return sentences
from spacy.lang.de import German

# Bare German pipeline — no statistical model, just the language data.
pipeline = German()
# The rule-based sentencizer is the only component we need here.
splitter = pipeline.create_pipe("sentencizer")
pipeline.add_pipe(splitter)

# Segment a two-stanza hymn excerpt and print one sentence per line.
parsed = pipeline(u""" 14. Davon ich allzeit froehlich sei, Zu springen, singen immer frei Das rechte Susannine* schon, Mit Herzen Lust den suessen Ton. 15. Lob, Ehr sei Gott im hoechsten Thron, Der uns schenkt seinen ein'gen Sohn, Des freuen sich der Engel Schaar Und singen uns solch's neues Jahr. """)
for sentence in parsed.sents:
    print(sentence.text)
# Capital-city lookup table; `f` is a file handle opened earlier in the file.
CAPITALS = json.loads(f.read())

nlp = German()
matcher = PhraseMatcher(nlp.vocab)
country_patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *country_patterns)


def countries_component(doc):
    """Create an entity span labelled "LOC" for every matcher hit on *doc*."""
    hits = matcher(doc)
    spans = []
    for match_id, start, end in hits:
        spans.append(Span(doc, start, end, label="LOC"))
    doc.ents = spans
    return doc


# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)


# Getter that resolves a span's text against the capitals lexicon;
# returns None for countries missing from CAPITALS.
def get_capital(span):
    return CAPITALS.get(span.text)


# Register the span extension "capital" backed by get_capital.
Span.set_extension("capital", getter=get_capital)

# Process the text, then print text, label and capital for each entity.
doc = nlp(
    "Tschechien könnte der Slowakei dabei helfen, ihren Luftraum zu schützen")
entity_rows = [(entity.text, entity.label_, entity._.capital)
               for entity in doc.ents]
print(entity_rows)