Пример #1
0
 def test_identity_lemmatizer(self):
     """Check IdentityLemmatizer on a short Latin sentence.

     The sentence is lowercased, passed through ``JVReplacer.replace`` and
     split with the ``'latin'`` ``WordTokenizer`` before being lemmatized.
     The expected result pairs every token with itself.
     """
     identity = IdentityLemmatizer()
     words = ('ceterum', 'antequam', 'destinata', 'componam')
     expected = [(word, word) for word in words]
     replacer = JVReplacer()
     latin_tokenizer = WordTokenizer('latin')
     normalized = replacer.replace('Ceterum antequam destinata componam'.lower())
     actual = identity.lemmatize(latin_tokenizer.tokenize(normalized))
     self.assertEqual(actual, expected)
Пример #2
0
 def test_identity_lemmatizer(self):
     """Run a lowercased, JV-replaced Latin sentence through IdentityLemmatizer.

     The assertion expects each token to be returned paired with itself.
     """
     lemmatizer = IdentityLemmatizer()
     sentence = 'Ceterum antequam destinata componam'
     expected_pairs = [
         ('ceterum', 'ceterum'),
         ('antequam', 'antequam'),
         ('destinata', 'destinata'),
         ('componam', 'componam'),
     ]
     replacer = JVReplacer()
     word_tokenizer = WordTokenizer('latin')
     # Normalise case first, then apply the replacer, then tokenize.
     prepared = replacer.replace(sentence.lower())
     self.assertEqual(
         lemmatizer.lemmatize(word_tokenizer.tokenize(prepared)),
         expected_pairs,
     )
Пример #3
0
 def _define_lemmatizer(self: object):
     """Wire up the Greek backoff lemmatizer chain.

     Every stage is kept on the instance as ``self.backoffN`` and is passed
     as the ``backoff`` argument of the stage built after it. The last stage
     built is also published as ``self.lemmatizer``.
     """
     # Suggested backoff chain--should be tested for optimal order
     self.backoff0 = None
     verbose = self.VERBOSE

     # Innermost stage: constructed without a backoff of its own.
     self.backoff1 = identity = IdentityLemmatizer(verbose=verbose)

     self.backoff2 = old_model = DictLemmatizer(
         lemmas=self.GREEK_OLD_MODEL,
         source="Morpheus Lemmas",
         backoff=identity,
         verbose=verbose,
     )

     self.backoff3 = regex_stage = RegexpLemmatizer(
         self.greek_sub_patterns,
         source="CLTK Greek Regex Patterns",
         backoff=old_model,
         verbose=verbose,
     )

     self.backoff4 = unigram_stage = UnigramLemmatizer(
         self.train_sents,
         source="CLTK Sentence Training Data",
         backoff=regex_stage,
         verbose=verbose,
     )

     self.backoff5 = model_stage = DictLemmatizer(
         lemmas=self.GREEK_MODEL,
         source="Greek Model",
         backoff=unigram_stage,
         verbose=verbose,
     )

     # The outermost stage is the entry point callers use.
     self.lemmatizer = model_stage
Пример #4
0
 def _define_lemmatizer(self):
     """Build the two-stage backoff chain and expose it as ``self.lemmatizer``.

     An ``IdentityLemmatizer`` (``self.backoff1``) serves as the backoff of a
     ``DictLemmatizer`` (``self.backoff2``) that is fed ``self.token_to_lemmata``.
     """
     self.backoff0 = None
     verbose = self.verbose

     # Innermost stage: constructed without a backoff of its own.
     self.backoff1 = identity = IdentityLemmatizer(verbose=verbose)

     self.backoff2 = lemmata_lookup = DictLemmatizer(
         lemmas=self.token_to_lemmata,
         source='ReferenzKorpus Mittelhochdeutsch Lemmata',
         backoff=identity,
         verbose=verbose,
     )

     self.lemmatizer = lemmata_lookup
Пример #5
0
 def _define_lemmatizer(self: object):
     """Assemble the Latin backoff lemmatizer chain.

     Stages are built one after another; each is stored as ``self.backoffN``
     and handed to the following stage through its ``backoff`` argument. The
     final stage is additionally stored as ``self.lemmatizer``.
     """
     # Suggested backoff chain--should be tested for optimal order
     self.backoff0 = None
     verbose = self.VERBOSE

     # ``chain`` always refers to the most recently built stage.
     chain = IdentityLemmatizer(verbose=verbose)
     self.backoff1 = chain

     chain = DictLemmatizer(
         lemmas=self.LATIN_OLD_MODEL,
         source='Morpheus Lemmas',
         backoff=chain,
         verbose=verbose,
     )
     self.backoff2 = chain

     chain = RegexpLemmatizer(
         self.latin_sub_patterns,
         source='CLTK Latin Regex Patterns',
         backoff=chain,
         verbose=verbose,
     )
     self.backoff3 = chain

     chain = UnigramLemmatizer(
         self.train_sents,
         source='CLTK Sentence Training Data',
         backoff=chain,
         verbose=verbose,
     )
     self.backoff4 = chain

     chain = DictLemmatizer(
         lemmas=self.LATIN_MODEL,
         source='Latin Model',
         backoff=chain,
         verbose=verbose,
     )
     self.backoff5 = chain

     self.lemmatizer = chain