Example #1
 def test_tag_ner_list_str_latin(self):
     """Test make_ner(), list, str."""
     text_list = ["ut", "Venus", "Sirius"]
     text_list_iu = [replace_jv(x) for x in text_list]
     text = ner.tag_ner("lat", input_text=text_list_iu, output_type=str)
     target = " ut Uenus/Entity Sirius/Entity"
     self.assertEqual(text, target)
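
These snippets are test methods from the CLTK test suite (they belong to a unittest.TestCase subclass) and omit their imports. A plausible header for running them, assuming CLTK 1.x-style module paths — treat the exact paths as assumptions, since they have been reorganized between CLTK versions:

# Assumed imports for the snippets below (CLTK 1.x-style paths; verify
# against the installed CLTK version before use).
from unittest import TestCase

from cltk.alphabet.lat import replace_jv  # i/j and u/v normalization
from cltk.lemmatize.backoff import (
    DictLemmatizer,
    RegexpLemmatizer,
    UnigramLemmatizer,
)
from cltk.lemmatize.lat import LatinBackoffLemmatizer
from cltk.ner import ner
from cltk.tokenizers.lat.lat import LatinWordTokenizer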
Example #2
 def test_tag_ner_str_str_latin(self):
     """Test make_ner(), str, str."""
     text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
     text_str_iu = replace_jv(text_str)
     text = ner.tag_ner("lat", input_text=text_str_iu, output_type=str)
     target = " ut Uenus/Entity, ut Sirius/Entity, ut Spica/Entity, ut aliae quae primae dicuntur esse mangitudinis."
     self.assertEqual(text, target)
Example #3
 def test_tag_ner_str_list_latin(self):
     """Test make_ner(), str, list."""
     text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
     text_str_iu = replace_jv(text_str)
     tokens = ner.tag_ner("lat", input_text=text_str_iu, output_type=list)
     target = [
         ("ut", ),
         ("Uenus", "Entity"),
         (",", ),
         ("ut", ),
         ("Sirius", "Entity"),
         (",", ),
         ("ut", ),
         ("Spica", "Entity"),
         (",", ),
         ("ut", ),
         ("aliae", ),
         ("quae", ),
         ("primae", ),
         ("dicuntur", ),
         ("esse", ),
         ("mangitudinis", ),
         (".", ),
     ]
     self.assertEqual(tokens, target)
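
As the expected list shows, tag_ner() with output_type=list returns one tuple per token: a 1-tuple for untagged tokens and a ("token", "Entity") pair for recognized names. A follow-on sketch for collecting just the entities — the unpacking idiom here is mine, not part of the CLTK API:

# Tokens in the shape returned by tag_ner(..., output_type=list).
tokens = [("ut",), ("Uenus", "Entity"), (",",), ("Sirius", "Entity")]

# Keep only the tokens that carry the "Entity" tag.
entities = [tok for tok, *tag in tokens if tag == ["Entity"]]
print(entities)  # ['Uenus', 'Sirius']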
Example #4
 def test_tag_ner_list_list_latin(self):
     """Test make_ner(), list, list."""
     text_list = ["ut", "Venus", "Sirius"]
     text_list_iu = [replace_jv(x) for x in text_list]
     tokens = ner.tag_ner("lat", input_text=text_list_iu, output_type=list)
     target = [("ut", ), ("Uenus", "Entity"), ("Sirius", "Entity")]
     self.assertEqual(tokens, target)
Example #5
 def test_backoff_latin_lemmatizer_verbose(self):
     """Test LatinBackoffLemmatizer"""
     lemmatizer = LatinBackoffLemmatizer(verbose=True)
     test_str = """Ceterum antequam destinata componam"""
     target = [
         ("ceterum", "ceterum",
          "<UnigramLemmatizer: CLTK Sentence Training Data>"),
         (
             "antequam",
             "antequam",
             "<UnigramLemmatizer: CLTK Sentence Training Data>",
         ),
         (
             "destinata",
             "destino",
             "<UnigramLemmatizer: CLTK Sentence Training Data>",
         ),
         ("componam", "compono", "<DictLemmatizer: Morpheus Lemmas>"),
     ]
     tokenizer = LatinWordTokenizer()
     test_str = test_str.lower()
     test_str = replace_jv(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
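
With verbose=True, each result is a 3-tuple whose final element is the repr of the backoff stage that produced the lemma: here the unigram model trained on CLTK sentence data resolves the first three tokens, and the Morpheus lemma dictionary resolves "componam". A sketch for stripping that provenance field back to plain pairs:

# Drop the lemmatizer repr from the (token, lemma, source) triples.
plain = [(token, lemma) for token, lemma, _source in lemmas]
# plain == [("ceterum", "ceterum"), ("antequam", "antequam"), ...]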
Example #6
 def test_regex_lemmatizer(self):
     """Test regex_lemmatizer()"""
     sub = [("(.)ab(o|is|it|imus|itis|unt)$", r"\1o")]
     lemmatizer = RegexpLemmatizer(sub)
     test_str = "amabimus"
     target = [("amabimus", "amo")]
     tokenizer = LatinWordTokenizer()
     test_str = test_str.lower()
     test_str = replace_jv(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
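
The substitution pair follows Python's re semantics: the pattern captures the stem character before "ab", matches a future-tense ending anchored at the end of the word, and the replacement \1o rewrites the whole match to a first-conjugation lemma. The same rule checked directly with re.sub, outside the lemmatizer:

import re

pattern, repl = r"(.)ab(o|is|it|imus|itis|unt)$", r"\1o"
# "amabimus" matches as "(m)ab(imus)", so "mabimus" -> "mo" -> "amo"
print(re.sub(pattern, repl, "amabimus"))  # amo
print(re.sub(pattern, repl, "amabit"))    # amo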
Example #7
 def test_backoff_latin_lemmatizer(self):
     """Test LatinBackoffLemmatizer"""
     lemmatizer = LatinBackoffLemmatizer()
     test_str = """Ceterum antequam destinata componam"""
     target = [
         ("ceterum", "ceterum"),
         ("antequam", "antequam"),
         ("destinata", "destino"),
         ("componam", "compono"),
     ]
     tokenizer = LatinWordTokenizer()
     test_str = test_str.lower()
     test_str = replace_jv(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #8
 def test_dict_lemmatizer(self):
     """Test model_lemmatizer()"""
     lemmas = {
         "ceterum": "ceterus",
         "antequam": "antequam",
         "destinata": "destino",
         "componam": "compono",
     }
     lemmatizer = DictLemmatizer(lemmas=lemmas)
     test_str = "Ceterum antequam destinata componam"
     target = [
         ("ceterum", "ceterus"),
         ("antequam", "antequam"),
         ("destinata", "destino"),
         ("componam", "compono"),
     ]
     tokenizer = LatinWordTokenizer()
     test_str = test_str.lower()
     test_str = replace_jv(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #9
 def test_unigram_lemmatizer(self):
     """Test unigram_lemmatizer()"""
     train = [
         [
             ("ceterum", "ceterus"),
             ("antequam", "antequam"),
             ("destinata", "destino"),
             ("componam", "compono"),
         ]
     ]
     lemmatizer = UnigramLemmatizer(train=train)
     test_str = """Ceterum antequam destinata componam"""
     target = [
         ("ceterum", "ceterus"),
         ("antequam", "antequam"),
         ("destinata", "destino"),
         ("componam", "compono"),
     ]
     tokenizer = LatinWordTokenizer()
     test_str = test_str.lower()
     test_str = replace_jv(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
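
These backoff lemmatizers follow NLTK's sequential backoff tagger design, so the components tested above can be chained. A minimal sketch, assuming the NLTK-style backoff keyword — the chaining itself is my illustration, not taken from these tests: try the trained unigram model first, then fall back to a dictionary lookup for unseen tokens.

# Assumes the NLTK-style `backoff` keyword on CLTK's backoff lemmatizers.
train = [[("ceterum", "ceterus"), ("antequam", "antequam")]]
fallback = DictLemmatizer(lemmas={"componam": "compono"})
lemmatizer = UnigramLemmatizer(train=train, backoff=fallback)
print(lemmatizer.lemmatize(["ceterum", "componam"]))
# expected: [('ceterum', 'ceterus'), ('componam', 'compono')]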