def test_identity_lemmatizer(self):
    """Run IdentityLemmatizer over a short Latin sentence.

    The sentence is lowercased, passed through ``JVReplacer.replace`` and
    tokenized with the 'latin' ``WordTokenizer``; the lemmatizer output is
    then compared with the expected (token, lemma) pairs, in which every
    lemma equals its token.
    """
    identity = IdentityLemmatizer()
    sentence = 'Ceterum antequam destinata componam'
    expected = [
        ('ceterum', 'ceterum'),
        ('antequam', 'antequam'),
        ('destinata', 'destinata'),
        ('componam', 'componam'),
    ]
    replacer = JVReplacer()
    word_tokenizer = WordTokenizer('latin')

    # Normalize first (lowercase, then JV replacement), then tokenize.
    normalized = replacer.replace(sentence.lower())
    result = identity.lemmatize(word_tokenizer.tokenize(normalized))

    self.assertEqual(result, expected)
def _define_lemmatizer(self: object):
    """Assemble the Greek backoff chain and store it on the instance.

    Stages are created in order and each one receives the previously
    created stage as its ``backoff`` argument:

    1. ``IdentityLemmatizer``
    2. ``DictLemmatizer`` over ``self.GREEK_OLD_MODEL``
    3. ``RegexpLemmatizer`` over ``self.greek_sub_patterns``
    4. ``UnigramLemmatizer`` over ``self.train_sents``
    5. ``DictLemmatizer`` over ``self.GREEK_MODEL``

    Every stage is kept as ``self.backoff1`` .. ``self.backoff5``
    (``self.backoff0`` is ``None``) and the last stage becomes
    ``self.lemmatizer``.

    NOTE: this is a suggested ordering only; it should be tested to find
    the optimal order.
    """
    verbose = self.VERBOSE

    self.backoff0 = None
    self.backoff1 = identity = IdentityLemmatizer(verbose=verbose)
    self.backoff2 = old_model = DictLemmatizer(
        lemmas=self.GREEK_OLD_MODEL,
        source="Morpheus Lemmas",
        backoff=identity,
        verbose=verbose,
    )
    self.backoff3 = regexp = RegexpLemmatizer(
        self.greek_sub_patterns,
        source="CLTK Greek Regex Patterns",
        backoff=old_model,
        verbose=verbose,
    )
    self.backoff4 = unigram = UnigramLemmatizer(
        self.train_sents,
        source="CLTK Sentence Training Data",
        backoff=regexp,
        verbose=verbose,
    )
    self.backoff5 = head = DictLemmatizer(
        lemmas=self.GREEK_MODEL,
        source="Greek Model",
        backoff=unigram,
        verbose=verbose,
    )
    self.lemmatizer = head
def _define_lemmatizer(self):
    """Assemble the two-stage backoff chain and store it on the instance.

    An ``IdentityLemmatizer`` is created first and handed as ``backoff`` to
    a ``DictLemmatizer`` built from ``self.token_to_lemmata``. The stages
    are kept as ``self.backoff1`` and ``self.backoff2`` (``self.backoff0``
    is ``None``); the dictionary stage becomes ``self.lemmatizer``.
    """
    verbose = self.verbose

    self.backoff0 = None
    self.backoff1 = identity = IdentityLemmatizer(verbose=verbose)
    self.backoff2 = dictionary = DictLemmatizer(
        lemmas=self.token_to_lemmata,
        source='ReferenzKorpus Mittelhochdeutsch Lemmata',
        backoff=identity,
        verbose=verbose,
    )
    self.lemmatizer = dictionary
def _define_lemmatizer(self: object):
    """Assemble the Latin backoff chain and store it on the instance.

    Stages are created in order and each one receives the previously
    created stage as its ``backoff`` argument:

    1. ``IdentityLemmatizer``
    2. ``DictLemmatizer`` over ``self.LATIN_OLD_MODEL``
    3. ``RegexpLemmatizer`` over ``self.latin_sub_patterns``
    4. ``UnigramLemmatizer`` over ``self.train_sents``
    5. ``DictLemmatizer`` over ``self.LATIN_MODEL``

    Every stage is kept as ``self.backoff1`` .. ``self.backoff5``
    (``self.backoff0`` is ``None``) and the last stage becomes
    ``self.lemmatizer``.

    NOTE: this is a suggested ordering only; it should be tested to find
    the optimal order.
    """
    verbose = self.VERBOSE

    self.backoff0 = None
    self.backoff1 = identity = IdentityLemmatizer(verbose=verbose)
    self.backoff2 = old_model = DictLemmatizer(
        lemmas=self.LATIN_OLD_MODEL,
        source='Morpheus Lemmas',
        backoff=identity,
        verbose=verbose,
    )
    self.backoff3 = regexp = RegexpLemmatizer(
        self.latin_sub_patterns,
        source='CLTK Latin Regex Patterns',
        backoff=old_model,
        verbose=verbose,
    )
    self.backoff4 = unigram = UnigramLemmatizer(
        self.train_sents,
        source='CLTK Sentence Training Data',
        backoff=regexp,
        verbose=verbose,
    )
    self.backoff5 = head = DictLemmatizer(
        lemmas=self.LATIN_MODEL,
        source='Latin Model',
        backoff=unigram,
        verbose=verbose,
    )
    self.lemmatizer = head