Example No. 1
def read_mtl_file(domain, filename):
    X = []
    Y = []
    if domain == 'en':
        # tokenizer = WordPunctTokenizer()
        tokenizer = English().Defaults.create_tokenizer()
    elif domain == 'fr':
        # tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
        tokenizer = French().Defaults.create_tokenizer()
    elif domain == 'de':
        # tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
        tokenizer = German().Defaults.create_tokenizer()
    with open(filename, 'r', encoding='utf-8') as inf:
        for line in inf.readlines():
            parts = line.split('\t')
            if len(parts) == 3:  # labeled
                Y.append(int(float(parts[1])))
            elif len(parts) == 2:  # unlabeled
                Y.append(0)
            else:
                raise Exception('Unknown format')
            clean = clean_sentence(parts[-1])
            # if domain is 'en':
            #     words = word_tokenize(clean, language='english')
            # elif domain is 'fr':
            #     words = word_tokenize(clean, language='french')
            # elif domain is 'de':
            #     words = word_tokenize(clean, language='german')
            words = [str(e) for e in tokenizer(clean)]
            tmp = {}
            tmp['tokens'] = words
            tmp['sent'] = clean
            X.append(tmp)
    #Y = torch.LongTensor(Y).to(opt.device)
    return (X, Y)
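A minimal usage sketch (the file name and field layout below are assumptions inferred from the parsing logic above, not taken from the original project):

# Hypothetical usage; each labeled line looks like "<field0>\t<label>\t<sentence>"
# and each unlabeled line like "<field0>\t<sentence>" (the first field is ignored).
X, Y = read_mtl_file('en', 'reviews_en.tsv')
print(Y[0], X[0]['tokens'][:5])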
Example No. 2
    def __init__(self):
        if LANGUAGE == "DE":
            from spacy.lang.de.stop_words import STOP_WORDS
            self.nlp = spacy.load('de_core_news_sm')
            self.domain_stopwords = ["Ausschreibung", "Bekanntmachung"]
            from spacy.lang.de import German
            self.parser = German()
        elif LANGUAGE == "EN":
            from spacy.lang.en.stop_words import STOP_WORDS
            self.nlp = spacy.load('en')
            self.domain_stopwords = [
                "contract", "system", "service", "tender", "company", "notice",
                "procurement", "work", "include", "support", "approximately",
                "management", "agreement", "office", "solution", "manage",
                "product", "design", "program", "project", "supply", "trust",
                "equipment"
            ]
            from spacy.lang.en import English
            self.parser = English()
        else:
            raise Exception("unknown language")

        self.stopwords = list(STOP_WORDS)
        self.stopwords.extend(self.domain_stopwords)
        self.pipe = None
Example No. 3
def test_pipe_factories_from_source_language_subclass():
    class CustomEnglishDefaults(English.Defaults):
        stop_words = set(["custom", "stop"])

    @registry.languages("custom_en")
    class CustomEnglish(English):
        lang = "custom_en"
        Defaults = CustomEnglishDefaults

    source_nlp = English()
    source_nlp.add_pipe("tagger")

    # custom subclass
    nlp = CustomEnglish()
    nlp.add_pipe("tagger", source=source_nlp)
    assert "tagger" in nlp.pipe_names

    # non-subclass
    nlp = German()
    nlp.add_pipe("tagger", source=source_nlp)
    assert "tagger" in nlp.pipe_names

    # mismatched vectors
    nlp = English()
    nlp.vocab.vectors.resize((1, 4))
    nlp.vocab.vectors.add("cat", vector=[1, 2, 3, 4])
    logger = logging.getLogger("spacy")
    with mock.patch.object(logger, "warning") as mock_warning:
        nlp.add_pipe("tagger", source=source_nlp)
        mock_warning.assert_called()
Example No. 4
def test_issue3002():
    """Test that the tokenizer doesn't hang on a long list of dots"""
    nlp = German()
    doc = nlp(
        '880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl'
    )
    assert len(doc) == 5
Example No. 5
def bleu_scores_europarl(input_texts, target_texts, predict, parser=German()):
    assert len(input_texts) == len(target_texts)
    N = len(input_texts)

    # to handle short sequences, see also
    # http://www.nltk.org/_modules/nltk/translate/bleu_score.html#SmoothingFunction.method3
    chencherry = SmoothingFunction()

    def remove_spaces_and_puncts(tokens):
        return [
            token.orth_ for token in tokens
            if not (token.is_space or token.is_punct)
        ]

    bleu_scores = np.zeros(N)

    for i in tqdm(range(N)):
        ref_tokens = remove_spaces_and_puncts(parser(target_texts.iloc[i]))
        pred_tokens = remove_spaces_and_puncts(
            parser(predict(input_texts.iloc[i])))
        bleu_scores[i] = sentence_bleu([ref_tokens],
                                       pred_tokens,
                                       smoothing_function=chencherry.method3)

    return bleu_scores
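A small usage sketch with made-up data; it assumes pandas Series inputs (the function indexes with .iloc) and any callable for predict:

import pandas as pd

# Hypothetical data; in practice `predict` would be a trained translation model.
inputs = pd.Series(["the house is big", "thank you very much"])
targets = pd.Series(["das Haus ist gross", "vielen Dank"])
scores = bleu_scores_europarl(inputs, targets, predict=lambda s: "vielen Dank")
print(scores.mean())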
Example No. 6
def test_pipe_factories_from_source_language_subclass():
    class CustomEnglishDefaults(English.Defaults):
        stop_words = set(["custom", "stop"])

    @registry.languages("custom_en")
    class CustomEnglish(English):
        lang = "custom_en"
        Defaults = CustomEnglishDefaults

    source_nlp = English()
    source_nlp.add_pipe("tagger")

    # custom subclass
    nlp = CustomEnglish()
    nlp.add_pipe("tagger", source=source_nlp)
    assert "tagger" in nlp.pipe_names

    # non-subclass
    nlp = German()
    nlp.add_pipe("tagger", source=source_nlp)
    assert "tagger" in nlp.pipe_names

    # mismatched vectors
    nlp = English()
    nlp.vocab.vectors.resize((1, 4))
    nlp.vocab.vectors.add("cat", vector=[1, 2, 3, 4])
    with pytest.warns(UserWarning):
        nlp.add_pipe("tagger", source=source_nlp)
Example No. 7
    def init_resources(self):
        self.punctuation_pattern = re.compile("|".join(PUNCTUATION))
        self.stemmer = None
        stopwords_path = os.path.join(
            os.path.dirname(assistant_dialog_skill_analysis.__file__),
            "resources",
            self.language_code,
            "stopwords",
        )
        if self.language_code == "en":
            from spacy.lang.en import English

            self.tokenizer = Tokenizer(English().vocab)
            self.stemmer = SnowballStemmer(language="english")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "fr":
            from spacy.lang.fr import French

            self.tokenizer = Tokenizer(French().vocab)
            self.stemmer = SnowballStemmer(language="french")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "de":
            from spacy.lang.de import German

            self.tokenizer = Tokenizer(German().vocab)
            self.stemmer = SnowballStemmer(language="german")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "it":
            from spacy.lang.it import Italian

            self.tokenizer = Tokenizer(Italian().vocab)
            self.stemmer = SnowballStemmer(language="italian")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "cs":
            from spacy.lang.cs import Czech

            self.tokenizer = Tokenizer(Czech().vocab)
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "pt":
            from spacy.lang.pt import Portuguese

            self.tokenizer = Tokenizer(Portuguese().vocab)
            self.stemmer = SnowballStemmer(language="portuguese")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "es":
            from spacy.lang.es import Spanish

            self.tokenizer = Tokenizer(Spanish().vocab)
            self.stemmer = SnowballStemmer(language="spanish")
            self.stop_words = self.load_stop_words(stopwords_path)
        else:
            raise Exception("language code %s is not supported",
                            self.language_code)
Example No. 8
def get_spacy_base_model(lang: str):
    if lang == 'english':
        return English()
    elif lang == 'german':
        return German()
    else:
        raise AttributeError(
            f'Language {lang} not supported for spacy-based tokenization')
Example No. 9
 def __init__(self):
     self.domain_stopwords = ["Ausschreibung", "Bekanntmachung"]
     self.parser = German()
     self.stopwords = list(STOP_WORDS)
     self.stopwords.extend(self.domain_stopwords)
     self.fast_text_model = None
     self.svm_average_model = None
     self.create_new_model()
def test_pipe_factories_language_specific():
    """Test that language sub-classes can have their own factories, with
    fallbacks to the base factories."""
    name1 = "specific_component1"
    name2 = "specific_component2"
    Language.component(name1, func=lambda: "base")
    English.component(name1, func=lambda: "en")
    German.component(name2, func=lambda: "de")

    assert Language.has_factory(name1)
    assert not Language.has_factory(name2)
    assert English.has_factory(name1)
    assert not English.has_factory(name2)
    assert German.has_factory(name1)
    assert German.has_factory(name2)

    nlp = Language()
    assert nlp.create_pipe(name1)() == "base"
    with pytest.raises(ValueError):
        nlp.create_pipe(name2)
    nlp_en = English()
    assert nlp_en.create_pipe(name1)() == "en"
    with pytest.raises(ValueError):
        nlp_en.create_pipe(name2)
    nlp_de = German()
    assert nlp_de.create_pipe(name1)() == "base"
    assert nlp_de.create_pipe(name2)() == "de"
Example No. 11
    def __init__(self, filename):

        self._nlp = German()
        self._myfile = open(filename, encoding="utf8")
        raw = self._myfile.read().replace('\n', '').replace('\t', '').replace('  ', '')
        self._text = self._nlp(re.sub(r'[^a-zA-Z_\s_\t]+', '', raw))
Example No. 12
 def __get_nlp__(self):
     if self._blank_model != '':  # i.e. the pipeline has only the tokenizer; all other pipes have to be added manually
         return spacy.blank(self._blank_model)
     if self._model == 'en':
         return English()
     elif self._model == 'de':
         return German()
     else:
         return spacy.load(
             'en_core_web_sm') if self._model == '' else spacy.load(
                 self._model)
Example No. 13
def main():
    nlp = English()
    doc = nlp("This is a sentence.")
    print(doc.text)

    nlp = German()
    doc = nlp('Liebe Grüße!')
    print(doc.text)

    nlp = Spanish()
    doc = nlp('¿Cómo estás?')
    print(doc.text)
def main():
    nlp = spacy.load('en_core_web_sm')

    cat_hash = nlp.vocab.strings['cat']
    print(cat_hash)

    cat_string = nlp.vocab.strings[cat_hash]
    print(cat_string)

    nlp = English()
    nlp_de = German()

    bowie_id = nlp.vocab.strings['Bowie']
    print(bowie_id)
Example No. 15
    def __init__(self):
        self.stopwords = list(STOP_WORDS)
        self.domain_stopwords = ["Ausschreibung", "Bekanntmachung"]
        self.stopwords.extend(self.domain_stopwords)
        self.parser = German()
        self.punctuations = string.punctuation
        self.domain_stopwords = ["contract", "system", "service", "tender", "company", "notice", "procurement",
                                 "work", "include", "support", "approximately", "management", "agreement",
                                 "office", "solution", "manage", "product", "design", "program", "project",
                                 "supply", "trust", "equipment"]

        self.stopwords = list(STOP_WORDS)
        self.stopwords.extend(self.domain_stopwords)
        self.create_new_model()
Example No. 16
 def get_tokenizers(self, lang):
     os.environ['TOKENIZERS_PARALLELISM'] = "True"
     if lang == 'de':
         spacy = German()
         bert = "deepset/gbert-base"
     elif lang == 'fr':
         spacy = French()
         bert = "camembert/camembert-base-ccnet"
     elif lang == 'it':
         spacy = Italian()
         bert = "dbmdz/bert-base-italian-cased"
     else:
         raise ValueError(
             f"Please choose one of the following languages: {self.languages}"
         )
     return spacy.tokenizer, AutoTokenizer.from_pretrained(bert)
Example No. 17
 def get_tokenizer(lang):
     if lang == "zh":
         # nlp = spacy.load("zh_core_web_sm")
         nlp = Chinese()
     elif lang == "en":
         # nlp = spacy.load("en_core_web_sm")
         nlp = English()
     elif lang == "cs":
         nlp = Czech()
     elif lang == "de":
         # nlp = spacy.load("de_core_web_sm")
         nlp = German()
     elif lang == "ru":
         nlp = Russian()
     else:
         raise Exception("Unacceptable language.")
     return nlp
def main():
    # create an English and German nlp object
    nlp = English()  # spacy.load('en_core_web_sm')
    nlp_de = German()  # nlp = spacy.load("de_core_news_sm")

    # look up a string and hash using in the Vocab
    print('\nShared vocab and String Store -----------------')
    doc = nlp("I love coffee")
    print('hash value:', doc.vocab.strings['coffee'])
    print('string value:', doc.vocab.strings[3197928453018144401])

    # lexemes: entries in the vocabulary
    # a lexeme object is an entry in the vocabulary that
    # contains the context-independent information about a word
    # rem: orth means hash
    print('\nLexemes: entries in the Vocabulary -----------------')
    lexeme = doc.vocab['coffee']
    print('word: ', lexeme.text)
    print('hash: ', lexeme.orth)
    print('alphanumeric?: ', lexeme.is_alpha)

    # practice 1: look up a string in vocab to get the hash
    print('\npractice 1: English nlp obj ----------------------')
    doc = nlp('My favorite guitar is a parlor guitar by Art and Lutherie.')
    guitar_hash = doc.vocab.strings['guitar']
    print('guitar hash:\t', guitar_hash)

    guitar_string = doc.vocab.strings[13533102915073649304]
    print('guitar string:\t', guitar_string)

    # practice 2:
    print('\npractice 2: insert into EN vocab but not DE vocab ------')
    #    get the ID for the string 'Jazz'
    jazz_id = nlp.vocab.strings['Jazz']
    print(jazz_id)
    # print(nlp.vocab.strings[jazz_id]) # throws error
    jazz_doc = nlp('Jazz')
    print(jazz_doc)
    print(jazz_doc.vocab.strings[16658944612980789447])

    #    look up the ID for 'Jazz' in the vocab
    #print('Jazz in EN vocab: ', nlp.vocab.strings[jazz_id])

    # end program
    print('\nDone.')
Example No. 19
def lang_change(language):
    if language == 'en':
        from spacy.lang.en import English
        from spacy.lang.en.stop_words import STOP_WORDS
        parser = English()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_en.yaml")
    elif language == 'de':
        from spacy.lang.de import German
        from spacy.lang.de.stop_words import STOP_WORDS
        parser = German()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_de.yaml")
    elif language == 'es':
        from spacy.lang.es import Spanish
        from spacy.lang.es.stop_words import STOP_WORDS
        parser = Spanish()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_es.yaml")
    elif language == 'pt':
        from spacy.lang.pt import Portuguese
        from spacy.lang.pt.stop_words import STOP_WORDS
        parser = Portuguese()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_pt.yaml")
    elif language == 'fr':
        from spacy.lang.fr import French
        from spacy.lang.fr.stop_words import STOP_WORDS
        parser = French()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_fr.yaml")
    elif language == 'it':
        from spacy.lang.it import Italian
        from spacy.lang.it.stop_words import STOP_WORDS
        parser = Italian()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_it.yaml")
    elif language == 'nl':
        from spacy.lang.nl import Dutch
        from spacy.lang.nl.stop_words import STOP_WORDS
        parser = Dutch()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_nl.yaml")
    else:
        raise ValueError('Unsupported language: %s' % language)

    return parser, STOP_WORDS, configfile_path
def prepare_twitter_data(data_file, type_of_analysis):
    labels = []
    text_fake, text_normal = '', ''
    df = pd.read_csv(data_file, sep='|', encoding='utf-8', keep_default_na=False)
    print('removing duplicates')
    df = utils.remove_duplicates(df)
    print('getting preprocessed train articles')
    idx = 0
    for key, item in enumerate(df['article_text']):
        idx += 1
        if df['is_fake'].values[key] == 1:
            text_fake += get_preprocessed_text(item)
            labels.append('FAKE')
        else:
            text_normal += get_preprocessed_text(item)
            labels.append('NOT_FAKE')
        if idx % 100 == 0:
            print('got {} of {} preprocessed train articles'.format(idx, len(df)))

    print('Finished gathering train text items')

    train = pd.DataFrame()
    train['data'] = df[type_of_analysis]
    train['labels'] = df['is_fake']


    #  TOPIC MODELLING
    nlp = German()
    stop_words = get_stop_words('de')
    stop_words.append('foto')
    stop_words.append('⬅')

    for stopword in stop_words:
        lexeme = nlp.vocab[stopword]
        lexeme.is_stop = True

    texts = get_spacy_corpus(train['data'], nlp, logging=True, topic_modelling=True)
    bigram = gensim.models.Phrases(texts)
    texts = [bigram[line] for line in texts]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
    print(ldamodel.show_topics())
    return do_create_twitter(train, None)
Example No. 21
def create_p3ml_vocab(fdir='', odir=''):
    nlp = spacy.load('de_core_news_sm')
    deTokenizer = German().Defaults.create_tokenizer(nlp)
    i = 0
    for fn in os.listdir(fdir):
        print(i, fn)
        i += 1
        with codecs.open(os.path.join(fdir, fn), 'r', 'utf-8-sig') as fh:
            txt = fh.read()
            # txtLst = list(set(re.sub(r'[^\w\s]',' ', txt).split()))
            txtLst = [str(s) for s in list(deTokenizer(txt))]
            counter = collections.Counter(txtLst)
        ofile = 'VOC_'+fn
        codecs.open(os.path.join(odir, ofile), 'w').close()
        with codecs.open(os.path.join(odir, ofile), 'a+', 'utf-8-sig') as ofh:
            keys = list(counter.keys())
            keys.sort()
            for key in keys:
                if isinstance(key, str):
                    ofh.write(' '.join([key, str(counter[key])])+'\n')
Example No. 22
def tokenize(document, language, punctuation):
    if language == 'fr':
        nlp = French()
    elif language == 'de':
        nlp = German()
    elif language == 'en':
        nlp = English()
    elif language == 'es':
        nlp = Spanish()
    else:
        raise ValueError('Unsupported language: %s' % language)
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    doc = nlp(document)
    if punctuation:
        sentences = [[str(word) for word in sent if str(word) != '\n']
                     for sent in doc.sents]
    else:
        sentences = [[
            str(word) for word in sent
            if ((str(word) != '\n') and (str(word).isalpha()))
        ] for sent in doc.sents]
    return sentences
Example No. 23
    def get_nlp(self, language):

        """"
        this method returns the corresponding spacy language model when 
        provided with a language. To do so it also does the required 
        import. This is certainly not the standard approach. 
        But as this endpoint will be deployed to Heroku (space limitation)
        and only be invoked rarely it is the fastest approach.
        """

        if language == "en":

            from spacy.lang.en import English
            return English()

        elif language == "fr":

            from spacy.lang.fr import French
            return French()

        elif language == "de":

            from spacy.lang.de import German
            return German()

        elif language == "es":

            from spacy.lang.es import Spanish
            return Spanish()

        elif language == "pt":

            from spacy.lang.pt import Portuguese
            return Portuguese()

        else:

            return {"error": "invalid or not supported language entered"}
Example No. 24
import json
from spacy.lang.de import German
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/de/countries.json") as f:
    COUNTRIES = json.loads(f.read())

with open("exercises/en/capitals.json") as f:
    CAPITALS = json.loads(f.read())

nlp = German()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    # Create an entity Span with the label "LOC" for all matches
    matches = matcher(doc)
    doc.ents = [____(____, ____, ____, label=____) for match_id, start, end in matches]
    return doc


# Add the component to the pipeline
____.____(____)
print(nlp.pipe_names)

# Getter function that looks up the span's text in the dictionary of capitals
get_capital = lambda span: CAPITALS.get(span.text)

# Register the Span extension "capital" with the getter function get_capital
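A possible completed version of this exercise (a sketch, not necessarily the official solution), using the spaCy v2-style API seen in the other examples here:

def countries_component(doc):
    # Create an entity Span with the label "LOC" for each match
    matches = matcher(doc)
    doc.ents = [Span(doc, start, end, label="LOC") for match_id, start, end in matches]
    return doc

# Add the component to the pipeline (spaCy v2 style)
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

# Register the Span extension "capital" with the getter get_capital
Span.set_extension("capital", getter=get_capital)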
print(device)

#Reading the English-German sentences pairs from the file
with open("../deu.txt", "r+") as file:
    deu = [x[:-1] for x in file.readlines()]
en = []
de = []
for line in deu:
    en.append(line.split("\t")[0])
    de.append(line.split("\t")[1])

#Setting the number of training sentences we'll use
training_examples = 10000
#We'll be using the spaCy's English and German tokenizers
spacy_en = English()
spacy_de = German()

en_words = Counter()
de_words = Counter()
en_inputs = []
de_inputs = []

#Tokenizing the English and German sentences and creating our word banks for both languages
for i in tqdm_notebook(range(training_examples)):
    en_tokens = spacy_en(en[i])
    de_tokens = spacy_de(de[i])
    if len(en_tokens) == 0 or len(de_tokens) == 0:
        continue
    for token in en_tokens:
        en_words.update([token.text.lower()])
    en_inputs.append([token.text.lower() for token in en_tokens] + ['_EOS'])
def NumberComplexRemover(string):
    """
    Removes numbers in complex formats, but not a trailing '.' when it marks the end of a sentence.
    Run after DateRemover().
    Examples: 15.10 Uhr OR 3,5 bis 4 stunden. OR 100 000 euro. OR 20?000 förderanträge OR um 2025/2030 OR
    OR abc 18.000. a OR abc. 18.000. a OR abc 18. a  OR abc 7.8.14. a  OR abc 7. 14. 18. a OR abc 1970er. a
    OR abc 20?()/&!%000. a  OR abc 2,9-3,5. a OR abc . 18. a OR abc . 7.8.14. a OR abc . 7. 14. 18. a OR abc 1790er
    OR abc . 20?()/&!%000 a  OR abc . 2,9-3,5 a OR abc 45, 59 a OR abc . 14 z OR abc  1. e OR abc  v. 2 a
    """
    string = re.sub(r'(?<!\w)(\d+)([\W\s]+|)|([\W\s]+)\d+', ' ',
                    string)  # TODO: check later
    # Alternative: ((\d+)(.|\s{1,3}|)\d+)(.|\s)(?! er)
    return string


nlp = German()
sbd = nlp.create_pipe('sentencizer')
nlp.add_pipe(sbd)


def Sentencizer(string, verbose=False):
    """
    requires importing the language from spaCy and loading the sentence boundary detector:
    from spacy.lang.de import German
    nlp = German()
    sbd = nlp.create_pipe('sentencizer')
    nlp.add_pipe(sbd)

    some individual strings (rare, e.g. 'nan') cannot be processed by nlp(); exclude those (try/except pass; to be solved properly later)
    """
    sents_list = []
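The function body is cut off at this point in the extract; a plausible completion following the docstring (skip the rare strings that nlp() cannot process) might look like this:

def Sentencizer(string, verbose=False):
    # Hypothetical completion sketch; not the original body.
    sents_list = []
    try:
        doc = nlp(string)
        sents_list = [sent.text.strip() for sent in doc.sents]
    except Exception:
        # rare inputs that nlp() cannot process (e.g. 'nan') are skipped
        pass
    if verbose:
        print(sents_list)
    return sents_list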
Example No. 27
import spacy

# python -m spacy download de_core_news_sm --user
from spacy.lang.de import German

nlp = spacy.load('de_core_news_sm')

tokenizer = German().Defaults.create_tokenizer(nlp)

doc = nlp(
    'Hallo Welt, ich   finde es hier so spannend. der die das. dwwomdowmd 404')
for token in doc:
    if token.text in nlp.vocab:
        print('In the vocabulary')
    print(token, token.lemma, token.lemma_)

# for x in tokenizer('Hello world!'):
#     print(x)
Example No. 28
import json
from spacy.lang.de import German

with open("exercises/de/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

nlp = German()
doc = nlp("Tschechien könnte der Slowakei dabei helfen, ihren Luftraum zu schützen")

# Import the PhraseMatcher and initialize it
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)

# Create the pattern Doc objects and add them to the matcher
# This is the faster version of: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Apply the matcher to the test document and print the result
matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches])
Example No. 29
# Import the German class and create the nlp object
from spacy.lang.de import German

nlp = German()

# Process the text
doc = nlp("Ich mag niedliche Katzen und Faultiere.")

# Select the first token
erster_token = doc[0]

# Print the text of the first token
print(erster_token.text)
def getSentences(text):
    nlp = German()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    document = nlp(text)
    return [sent.text.strip() for sent in document.sents]