def test_regex_lemmatizer(self):
    """Test regex_lemmatizer()

    Verify that a single suffix-substitution pattern maps the
    future-tense form 'amabimus' to its lemma 'amo'.
    """
    substitutions = [("(.)ab(o|is|it|imus|itis|unt)$", r"\1o")]
    regex_lemmatizer = RegexpLemmatizer(substitutions)
    word = "amabimus"
    expected = [("amabimus", "amo")]
    word_tokenizer = LatinWordTokenizer()
    # Normalize: lowercase, then collapse j/v orthographic variants.
    normalized = replace_jv(word.lower())
    token_list = word_tokenizer.tokenize(normalized)
    self.assertEqual(regex_lemmatizer.lemmatize(token_list), expected)
def test_regex_lemmatizer(self):
    """Test regex_lemmatizer()

    Verify that a single suffix-substitution pattern maps the
    future-tense form 'amabimus' to its lemma 'amo'.
    """
    substitutions = [('(.)ab(o|is|it|imus|itis|unt)$', r'\1o')]
    regex_lemmatizer = RegexpLemmatizer(substitutions)
    word = 'amabimus'
    expected = [('amabimus', 'amo')]
    replacer = JVReplacer()
    word_tokenizer = WordTokenizer('latin')
    # Normalize: lowercase, then collapse j/v orthographic variants.
    normalized = replacer.replace(word.lower())
    token_list = word_tokenizer.tokenize(normalized)
    self.assertEqual(regex_lemmatizer.lemmatize(token_list), expected)
def __init__(self: object, default: str = None, backoff: object = None):
    """RomanNumeralLemmatizer

    :type default: str
    :param default: Default replacement for lemma; 'NUM' in given pattern
    :param backoff: Next lemmatizer consulted when this one has no match.
    """
    # Two lookahead-based patterns: one for upper-case numerals, one for
    # lower-case. Each asserts (1) only Roman-numeral letters appear and
    # (2) they occur in a well-formed numeral order; 'u' is accepted as a
    # variant spelling of 'v'.
    substitution_pairs = [
        (r'(?=^[MDCLXVUI]+$)(?=^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|IU|V?I{0,3}|U?I{0,3})$)', 'NUM'),
        (r'(?=^[mdclxvui]+$)(?=^m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|iu|v?i{0,3}|u?i{0,3})$)', 'NUM'),
    ]
    RegexpLemmatizer.__init__(self, substitution_pairs, backoff)
    # Pre-compile once so lemmatization avoids recompiling per token.
    self._regexs = [(re.compile(pat), repl) for pat, repl in substitution_pairs]
    self.default = default
def _define_lemmatizer(self: object):
    """Assemble the Greek backoff lemmatizer chain.

    NOTE: suggested ordering only — should be tested for optimal order.
    """
    verbose = self.VERBOSE
    self.backoff0 = None
    self.backoff1 = IdentityLemmatizer(verbose=verbose)
    self.backoff2 = DictLemmatizer(lemmas=self.GREEK_OLD_MODEL, source="Morpheus Lemmas", backoff=self.backoff1, verbose=verbose)
    self.backoff3 = RegexpLemmatizer(self.greek_sub_patterns, source="CLTK Greek Regex Patterns", backoff=self.backoff2, verbose=verbose)
    self.backoff4 = UnigramLemmatizer(self.train_sents, source="CLTK Sentence Training Data", backoff=self.backoff3, verbose=verbose)
    self.backoff5 = DictLemmatizer(lemmas=self.GREEK_MODEL, source="Greek Model", backoff=self.backoff4, verbose=verbose)
    # Entry point is the last (highest-priority) stage of the chain.
    self.lemmatizer = self.backoff5
def _define_lemmatizer(self: object):
    """Assemble the Latin backoff lemmatizer chain.

    NOTE: suggested ordering only — should be tested for optimal order.
    """
    self.backoff0 = None
    self.backoff1 = IdentityLemmatizer(verbose=self.VERBOSE)
    self.backoff2 = DictLemmatizer(
        lemmas=self.LATIN_OLD_MODEL,
        source='Morpheus Lemmas',
        backoff=self.backoff1,
        verbose=self.VERBOSE,
    )
    self.backoff3 = RegexpLemmatizer(
        self.latin_sub_patterns,
        source='CLTK Latin Regex Patterns',
        backoff=self.backoff2,
        verbose=self.VERBOSE,
    )
    self.backoff4 = UnigramLemmatizer(
        self.train_sents,
        source='CLTK Sentence Training Data',
        backoff=self.backoff3,
        verbose=self.VERBOSE,
    )
    self.backoff5 = DictLemmatizer(
        lemmas=self.LATIN_MODEL,
        source='Latin Model',
        backoff=self.backoff4,
        verbose=self.VERBOSE,
    )
    # Entry point is the last (highest-priority) stage of the chain.
    self.lemmatizer = self.backoff5
def evaluate(self: object):
    """Score the backoff chain against held-out test sentences.

    :return: Accuracy as reported by the underlying tagger's evaluate().
    :raises AssertionError: when verbose mode is enabled — per-token
        source reporting changes the lemmatize() output shape and is
        incompatible with evaluation.
    """
    if self.VERBOSE:
        raise AssertionError("evaluate() method only works when verbose: bool = False")
    return self.lemmatizer.evaluate(self.test_sents)


def __repr__(self: object):
    # Fixed: was a placeholder-free f-string (ruff F541); plain literal
    # yields the identical value.
    return '<BackoffLatinLemmatizer v0.2>'


if __name__ == '__main__':
    from pprint import pprint

    # Demo: hand-built four-stage backoff chain, most-specific stage last.
    l1 = DefaultLemmatizer('UNK', verbose=True)
    l2 = DictLemmatizer(lemmas={'arma': 'arma', 'uirum': 'uir'}, backoff=l1, verbose=True)
    l3 = UnigramLemmatizer(train=[[('cano', 'cano'), ('.', 'punc')]], backoff=l2, verbose=True)
    l4 = RegexpLemmatizer(regexps=[('(.)tat(is|i|em|e|es|um|ibus)$', r'\1tas')], backoff=l3, verbose=True)
    lemmas = l4.lemmatize('arma uirum -que cano nobilitatis .'.split())
    pprint(lemmas)
    # Expected output (sample):
    # [('arma', 'arma', <UnigramLemmatizer: [[('res', 'res'), ...], ...]>),
    #  ('uirum', 'uir', <UnigramLemmatizer: [[('res', 'res'), ...], ...]>),
    #  ('-que', '-que', <DictLemmatizer: {'!': 'punc', ...}>),
    #  ('cano', 'cano', <DictLemmatizer: {'-nam': 'nam', ...}>),
    #  ('nobilitatis',
    #   'nobilitas',
    #   <RegexpLemmatizer: [('(bil)(is|i|e...es|ium|ibus)$', '\\1is'), ...]>),
    #  ('.', 'punc', <DictLemmatizer: {'!': 'punc', ...}>)]
    print('\n')
    bll = BackoffLatinLemmatizer(seed=5, verbose=False)