def test_tag_ner_list_str_latin(self):
    """Test tag_ner(): list input, str output."""
    text_list = ["ut", "Venus", "Sirius"]
    text_list_iu = [replace_jv(x) for x in text_list]
    text = ner.tag_ner("lat", input_text=text_list_iu, output_type=str)
    target = " ut Uenus/Entity Sirius/Entity"
    self.assertEqual(text, target)
def test_tag_ner_str_str_latin(self):
    """Test tag_ner(): str input, str output."""
    text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse magnitudinis."""
    text_str_iu = replace_jv(text_str)
    text = ner.tag_ner("lat", input_text=text_str_iu, output_type=str)
    target = " ut Uenus/Entity, ut Sirius/Entity, ut Spica/Entity, ut aliae quae primae dicuntur esse magnitudinis."
    self.assertEqual(text, target)
def test_tag_ner_str_list_latin(self):
    """Test tag_ner(): str input, list output."""
    text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse magnitudinis."""
    text_str_iu = replace_jv(text_str)
    tokens = ner.tag_ner("lat", input_text=text_str_iu, output_type=list)
    target = [
        ("ut",),
        ("Uenus", "Entity"),
        (",",),
        ("ut",),
        ("Sirius", "Entity"),
        (",",),
        ("ut",),
        ("Spica", "Entity"),
        (",",),
        ("ut",),
        ("aliae",),
        ("quae",),
        ("primae",),
        ("dicuntur",),
        ("esse",),
        ("magnitudinis",),
        (".",),
    ]
    self.assertEqual(tokens, target)
def test_tag_ner_list_list_latin(self):
    """Test tag_ner(): list input, list output."""
    text_list = ["ut", "Venus", "Sirius"]
    text_list_iu = [replace_jv(x) for x in text_list]
    tokens = ner.tag_ner("lat", input_text=text_list_iu, output_type=list)
    target = [("ut",), ("Uenus", "Entity"), ("Sirius", "Entity")]
    self.assertEqual(tokens, target)
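# Taken together, the four tests above exercise the 2x2 interface of
# tag_ner(): input_text may be a plain string or a pre-tokenized list,
# and output_type=str rejoins the tokens (entities suffixed with
# "/Entity") while output_type=list yields one tuple per token, with a
# second element present only for recognized entities.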
def test_backoff_latin_lemmatizer_verbose(self):
    """Test LatinBackoffLemmatizer with verbose output."""
    # NB: `train` is defined for reference only; it is never passed to
    # the lemmatizer, which loads its own training data and dictionaries.
    train = [
        [
            ("ceterum", "ceterus"),
            ("antequam", "antequam"),
            ("destinata", "destino"),
            ("componam", "compono"),
        ]
    ]
    lemmatizer = LatinBackoffLemmatizer(verbose=True)
    test_str = """Ceterum antequam destinata componam"""
    target = [
        (
            "ceterum",
            "ceterum",
            "<UnigramLemmatizer: CLTK Sentence Training Data>",
        ),
        (
            "antequam",
            "antequam",
            "<UnigramLemmatizer: CLTK Sentence Training Data>",
        ),
        (
            "destinata",
            "destino",
            "<UnigramLemmatizer: CLTK Sentence Training Data>",
        ),
        ("componam", "compono", "<DictLemmatizer: Morpheus Lemmas>"),
    ]
    tokenizer = LatinWordTokenizer()
    test_str = test_str.lower()
    test_str = replace_jv(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
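# With verbose=True each result is a 3-tuple whose last element names the
# backoff stage that resolved the lemma: here the unigram model trained on
# CLTK sentence data handles the first three tokens, and the Morpheus
# dictionary lookup catches "componam".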
def test_regex_lemmatizer(self):
    """Test RegexpLemmatizer()."""
    sub = [("(.)ab(o|is|it|imus|itis|unt)$", r"\1o")]
    lemmatizer = RegexpLemmatizer(sub)
    test_str = "amabimus"
    target = [("amabimus", "amo")]
    tokenizer = LatinWordTokenizer()
    test_str = test_str.lower()
    test_str = replace_jv(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
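# A minimal sketch of the substitution above, assuming RegexpLemmatizer
# applies each (pattern, replacement) pair to a token like a plain re.sub:
#
#   >>> import re
#   >>> re.sub(r"(.)ab(o|is|it|imus|itis|unt)$", r"\1o", "amabimus")
#   'amo'
#
# The pattern strips the first-conjugation future endings (-abo, -abis,
# -abit, -abimus, -abitis, -abunt) back to the first-person present -o,
# so "amabimus" ("we will love") lemmatizes to "amo".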
def test_backoff_latin_lemmatizer(self):
    """Test LatinBackoffLemmatizer."""
    # NB: `train` is defined for reference only; it is never passed to
    # the lemmatizer, which loads its own training data and dictionaries.
    train = [
        [
            ("ceterum", "ceterus"),
            ("antequam", "antequam"),
            ("destinata", "destino"),
            ("componam", "compono"),
        ]
    ]
    lemmatizer = LatinBackoffLemmatizer()
    test_str = """Ceterum antequam destinata componam"""
    target = [
        ("ceterum", "ceterum"),
        ("antequam", "antequam"),
        ("destinata", "destino"),
        ("componam", "compono"),
    ]
    tokenizer = LatinWordTokenizer()
    test_str = test_str.lower()
    test_str = replace_jv(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
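# Same chain as test_backoff_latin_lemmatizer_verbose above, but with the
# default verbose=False the provenance string is omitted and each result
# is a plain (token, lemma) pair.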
def test_dict_lemmatizer(self):
    """Test DictLemmatizer()."""
    lemmas = {
        "ceterum": "ceterus",
        "antequam": "antequam",
        "destinata": "destino",
        "componam": "compono",
    }
    lemmatizer = DictLemmatizer(lemmas=lemmas)
    test_str = "Ceterum antequam destinata componam"
    target = [
        ("ceterum", "ceterus"),
        ("antequam", "antequam"),
        ("destinata", "destino"),
        ("componam", "compono"),
    ]
    tokenizer = LatinWordTokenizer()
    test_str = test_str.lower()
    test_str = replace_jv(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def test_unigram_lemmatizer(self):
    """Test UnigramLemmatizer()."""
    train = [
        [
            ("ceterum", "ceterus"),
            ("antequam", "antequam"),
            ("destinata", "destino"),
            ("componam", "compono"),
        ]
    ]
    lemmatizer = UnigramLemmatizer(train=train)
    test_str = """Ceterum antequam destinata componam"""
    target = [
        ("ceterum", "ceterus"),
        ("antequam", "antequam"),
        ("destinata", "destino"),
        ("componam", "compono"),
    ]
    tokenizer = LatinWordTokenizer()
    test_str = test_str.lower()
    test_str = replace_jv(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
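# UnigramLemmatizer picks, for each token, the lemma it saw most often in
# the training sentences (a design along the lines of NLTK's UnigramTagger,
# on which CLTK's sequential backoff lemmatizers appear to be modeled);
# all four tokens here occur in `train`, so every lemma resolves.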