def test_pipe_factories_language_specific():
    """Test that language sub-classes can have their own factories, with
    fallbacks to the base factories.

    Registers one factory on the ``Language`` base class, an override on
    ``English``, and a German-only factory, then checks lookup/fallback
    behaviour of ``has_factory`` and ``create_pipe``.
    """
    name1 = "specific_component1"
    name2 = "specific_component2"
    # Base factory plus language-specific registrations.
    Language.component(name1, func=lambda: "base")
    English.component(name1, func=lambda: "en")
    German.component(name2, func=lambda: "de")

    # Sub-classes see their own factories and fall back to Language's;
    # the base class never sees sub-class-only factories.
    assert Language.has_factory(name1)
    assert not Language.has_factory(name2)
    assert English.has_factory(name1)
    assert not English.has_factory(name2)
    assert German.has_factory(name1)
    assert German.has_factory(name2)

    # create_pipe resolves to the most specific factory available and
    # raises ValueError for unknown names.
    nlp = Language()
    assert nlp.create_pipe(name1)() == "base"
    with pytest.raises(ValueError):
        nlp.create_pipe(name2)
    nlp_en = English()
    assert nlp_en.create_pipe(name1)() == "en"
    with pytest.raises(ValueError):
        nlp_en.create_pipe(name2)
    nlp_de = German()
    assert nlp_de.create_pipe(name1)() == "base"
    assert nlp_de.create_pipe(name2)() == "de"


# Module-level spaCy pipeline: a blank German language object (no
# statistical model) carrying only the rule-based sentence-boundary
# detector, as required by Sentencizer() below.
nlp = German()
sbd = nlp.create_pipe('sentencizer')
nlp.add_pipe(sbd)


def Sentencizer(string, verbose=False):
    """
    Split *string* into sentences using spaCy's rule-based sentencizer.

    Requires a module-level German pipeline with the sentencizer added:
        from spacy.lang.de import German
        nlp = German()
        sbd = nlp.create_pipe('sentencizer')
        nlp.add_pipe(sbd)

    For some single strings nlp() cannot process (rare, e.g. 'nan'); those
    are excluded via the try/except — TODO: solve properly later.

    NOTE(review): this definition is truncated in this chunk — the body of
    the ``try`` block (and the rest of the function) is missing, so the
    return value cannot be documented from here.
    """
    sents_list = []
    try:
# Пример #3 (Example #3) — scrape-artifact separator left over from the
# source the snippet below was copied from; commented out so it cannot
# be mistaken for code.
from spacy.lang.de import German

# Demo: sentence-split two hymn verses using only the rule-based
# sentencizer — no statistical model is loaded or needed.
nlp = German()  # just the language with no model
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)
doc = nlp(u"""
14. Davon ich allzeit froehlich sei,
Zu springen, singen immer frei
Das rechte Susannine* schon,
Mit Herzen Lust den suessen Ton.

15. Lob, Ehr sei Gott im hoechsten Thron,
Der uns schenkt seinen ein'gen Sohn,
Des freuen sich der Engel Schaar
Und singen uns solch's neues Jahr.
""")
# Print each detected sentence on its own line.
for sent in doc.sents:
    print(sent.text)
def getSentences(text):
    """Return the sentences of *text* as whitespace-stripped strings.

    Builds a blank German pipeline with only the rule-based sentencizer,
    runs *text* through it, and collects each sentence span's text.
    """
    pipeline = German()
    pipeline.add_pipe(pipeline.create_pipe('sentencizer'))
    parsed = pipeline(text)
    stripped = []
    for sentence in parsed.sents:
        stripped.append(sentence.string.strip())
    return stripped
import spacy
from spacy.lang.de import German
import pandas as pd
import time

# Blank German pipeline with only the rule-based sentencizer added.
nlp = German()
nlp.add_pipe(nlp.create_pipe('sentencizer')) 

# Load the pre-cleaned text dump; low_memory=False makes pandas read the
# whole file in one pass so column dtypes are inferred consistently.
texts = pd.read_csv('../data/cleaned-text-dump.csv', low_memory=False) 

def sentencizer(raw_text, nlp):
    """Run *raw_text* through the *nlp* pipeline and return its sentences
    as a list of whitespace-stripped strings."""
    parsed = nlp(raw_text)
    result = []
    for sentence in parsed.sents:
        result.append(sentence.string.strip())
    return result

def fix_wrong_splits(sentences): 
    """
    Merge sentence fragments that the sentencizer split incorrectly.

    Walks the list in place: whenever an entry ends with a known German
    clinical abbreviation, or is suspiciously short (< 10 characters), it
    is re-joined with the following entry.

    NOTE(review): the definition appears truncated in this chunk — there
    is no visible ``return``, so whether the list is returned or only
    mutated in place cannot be confirmed from here.
    """
    i=0
    
    while i < (len(sentences)-2): 
        # NOTE(review): two commas are missing in the tuple below, so
        # Python implicitly concatenates adjacent literals:
        #   'Vd.a.' 'i.v'  ->  'Vd.a.i.v'
        #   'a.e.' 'I.'    ->  'a.e.I.'
        # Neither of the intended suffixes is matched as written — confirm
        # and add the commas.
        if sentences[i].endswith(('Z.n.','V.a.','v.a.', 'Vd.a.' 'i.v', ' re.', 
                                  ' li.', 'und 4.', 'bds.', 'Bds.', 'Pat.', 
                                  'i.p.', 'i.P.', 'b.w.', 'i.e.L.', ' pect.', 
                                  'Ggfs.', 'ggf.', 'Ggf.',  'z.B.', 'a.e.'
                                  'I.', 'II.', 'III.', 'IV.', 'V.', 'VI.', 'VII.', 
                                  'VIII.', 'IX.', 'X.', 'XI.', 'XII.')):
            # Abbreviation at the break point: merge with the next entry.
            sentences[i:i+2] = [' '.join(sentences[i:i+2])]

        elif len(sentences[i]) < 10: 
            # Very short entries are assumed to be split artifacts.
            sentences[i:i+2] = [' '.join(sentences[i:i+2])]

        i+=1