def test_hierosylima(self):
    """Lemmatise proper nouns declined on hand-built models.

    Builds an empty lemmatizer, registers the ``uita``, ``roma``, ``doctus``
    and ``aureus`` models by hand, adds two proper-noun lemmas, and asserts
    the full analysis (morph, radical, desinence) of ``Hierosolymam`` and
    ``Lycaonios``.
    """
    test = " Hierosolyma"  # NOTE(review): unused local, kept as-is
    x = Lemmatiseur(load=False)  # empty lemmatizer: no data loaded from disk
    parser = Parser(x)
    parser.ajMorphos()       # load morphology descriptions
    parser.ajContractions()  # load contraction rules
    parser.ajAssims()        # load assimilation rules
    load_mod_vars(x)
    # First-declension feminine model, used as parent of `roma`.
    parser.register_modele(
        parser.parse_modele("""modele:uita
R:1:1,0
des:1-12:1:$uita""".split("\n")))
    # `roma` overrides only the locative-like ending 413.
    parser.register_modele(
        parser.parse_modele("""modele:roma
pere:uita
des:413:1:āe""".split("\n")))
    # First/second-declension adjective model with comparative/superlative radicals.
    parser.register_modele(
        parser.parse_modele("""modele:doctus
R:0:2,0
R:1:2,ĭ
R:2:2,īssĭm
des:13-48:0:$lupus;$uita;$templum
des:49-84:1:$compar;$compar;ŭs;ŭs;ŭs;ōrĭs;ōrī;ōrĕ;ōră;ōră;ōră;ōrŭm;ōrĭbŭs
des:85-120:2:$lupus;$uita;$templum""".split("\n")))
    # `aureus`: like doctus but no comparative/superlative (abs:49-120).
    parser.register_modele(
        parser.parse_modele("""modele:aureus
pere:doctus
des:14:0:ŭs
abs:49-120""".split("\n")))
    parser.parse_lemme("Hĭĕrŏsŏlўma|roma|||ae, f.|5")
    parser.parse_lemme("Lўcāŏnĭus|aureus|||a, um|5")
    self.assertEqual(list(x.lemmatise("Hierosolymam")), [{
        'lemma': 'Hierosolyma',
        'form': 'hierosolymam',
        'morph': 'accusatif singulier',
        'radical': 'Hierosolym',
        "desinence": "am"
    }])
    self.assertEqual(list(x.lemmatise("Lycaonios")), [{
        'morph': 'accusatif masculin pluriel',
        'lemma': 'Lycaonius',
        'form': 'lycaonios',
        'radical': 'Lycaoni',
        "desinence": "os"
    }])
def test_dump_and_load(self):
    """Compile the lemmatizer to disk, reload it, and check analyses survive the round trip."""
    lemmatizer = Lemmatiseur()
    lemmatizer.compile()  # serialize the compiled data to disk
    del lemmatizer        # drop the in-memory instance to force a real reload
    lemmatizer = Lemmatiseur.load()
    results = lemmatizer.lemmatise_multiple("mihi Romanorum", pos=True)
    self.maxDiff = 5000  # show full diffs on failure (expected data is large)
    self.assertLemmatisationMultipleEqual(
        results,
        [[{
            'form': 'mihi',
            'morph': 'datif féminin singulier',
            'lemma': 'ego',
            'pos': 'p',
            "radical": "",
            "desinence": "mihi"
        }, {
            'form': 'mihi',
            'morph': 'datif masculin singulier',
            'lemma': 'ego',
            'pos': 'p',
            "radical": "",
            "desinence": "mihi"
        }], [
            {'form': 'romanorum', 'pos': 'n', 'morph': 'génitif pluriel',
             'lemma': 'Romani', "radical": "Roman", "desinence": "orum"},
            {'form': 'romanorum', 'pos': 'n', 'morph': 'génitif pluriel',
             'lemma': 'Romanus', "radical": "Roman", "desinence": "orum"},
            {'form': 'romanorum', 'pos': 'a', 'morph': 'génitif masculin pluriel',
             'lemma': 'Romanus', "radical": "Roman", "desinence": "orum"},
            {'form': 'romanorum', 'pos': 'a', 'morph': 'génitif neutre pluriel',
             'lemma': 'Romanus', "radical": "Roman", "desinence": "orum"},
        ]])
def test_romanorum(self):
    """Homograph handling: ``Romanorum`` must map to three lemma/POS pairs."""
    x = Lemmatiseur(load=False)
    parser = Parser(x)
    parser.ajMorphos()
    load_mod_vars(x)
    # Second-declension masculine noun model.
    lupus = parser.parse_modele(
        ["modele:lupus", "R:1:2,0", "des:1-12:1:$lupus", "pos:n"])
    x._modeles[lupus.gr()] = lupus
    # Adjective model with comparative/superlative radicals.
    doctus = parser.parse_modele([
        "modele:doctus", "R:0:2,0", "R:1:2,ĭ", "R:2:2,īssĭm",
        "des:13-48:0:$lupus;$uita;$templum",
        "des:49-84:1:$compar;$compar;ŭs;ŭs;ŭs;ōrĭs;ōrī;ōrĕ;ōră;ōră;ōră;ōrŭm;ōrĭbŭs",
        "des:85-120:2:$lupus;$uita;$templum", "pos:a"
    ])
    x._modeles[doctus.gr()] = doctus
    # Plural-only noun model: singular endings 1-6 are absent.
    liberi = parser.parse_modele(
        ["modele:liberi", "pere:lupus", "R:1:1,0", "abs:1-6", "pos:n"])
    x._modeles[liberi.gr()] = liberi
    parser.parse_lemme("Rōmānus|doctus|||a, um|2392", origin=0)
    parser.parse_lemme("Rōmānus2|lupus|||i, m.|8", origin=0)
    Romani = parser.parse_lemme("Rōmāni|liberi|||orum, m.|262", origin=0)
    # "Romanorum" is ambiguous between the adjective and the two nouns.
    self.assertEqual(
        sorted(
            list(
                set([
                    r["lemma"] + "|" + r["pos"]
                    for r in x.lemmatise("Romanorum", pos=True)
                ]))), ["Romani|n", "Romanus|a", "Romanus|n"])
    # "Romana" only fits the adjective.
    self.assertEqual(
        sorted(
            list(
                set([
                    r["lemma"] + "|" + r["pos"]
                    for r in x.lemmatise("Romana", pos=True)
                ]))), ["Romanus|a"])
def test_invariables(self):
    """Invariable words ("nec", "ergo") lemmatise to themselves with an empty desinence."""
    lemmatizer = Lemmatiseur(load=False)
    reader = Parser(lemmatizer)
    reader.ajMorphos()
    # Single invariable model: one radical, one "-" morphology.
    invariable = reader.parse_modele(["modele:inv", "R:0:0,0", "des:416:0:-"])
    lemmatizer._modeles[invariable.gr()] = invariable
    for raw_lemma in ("nĕc|inv|||adv.|6689", "ergō=ērgō|inv|||conj.|1450"):
        reader.parse_lemme(raw_lemma, origin=0)
    expected = {
        "nec": [{'lemma': 'nec', 'morph': '-', 'form': 'nec',
                 "radical": "nec", "desinence": ""}],
        "ergo": [{'lemma': 'ergo', 'morph': '-', 'form': 'ergo',
                  'radical': 'ergo', "desinence": ""}],
    }
    for form, analyses in expected.items():
        self.assertEqual(list(lemmatizer.lemmatise(form)), analyses)
def test_sequens(self):
    """All possible forms of "sequens" (model chain infans < fortis < doctus) are generated."""
    x = Lemmatiseur(load=False)
    parser = Parser(x)
    parser.ajMorphos()
    parser.ajContractions()
    parser.ajAssims()
    load_mod_vars(x)
    # Base adjective model with comparative/superlative radicals.
    doctus = parser.parse_modele("""modele:doctus
R:0:2,0
R:1:2,ĭ
R:2:2,īssĭm
des:13-48:0:$lupus;$uita;$templum
des:49-84:1:$compar;$compar;ŭs;ŭs;ŭs;ōrĭs;ōrī;ōrĕ;ōră;ōră;ōră;ōrŭm;ōrĭbŭs
des:85-120:2:$lupus;$uita;$templum""".split("\n"))
    x._modeles[doctus.gr()] = doctus
    # Third-declension adjective model derived from doctus.
    fortis = parser.parse_modele("""modele:fortis
pere:doctus
R:0:2,0
R:4:K
R:5:2,ĭ
des:13,14,25,26:4:-;-;-;-
des:15-24:1:ĕm;ĭs;ī;ī;ēs;ēs;ēs,īs;ĭŭm;ĭbŭs
des:27-36:1:ĕm;ĭs;ī;ī;ēs;ēs;ēs,īs;ĭŭm;ĭbŭs
des:37-48:1:ĕ;ĕ;ĕ;ĭs;ī;ī;ĭă;ĭă;ĭă;ĭŭm;ĭbŭs
des:49-84:1:ĭ$compar;ĭ$compar;ĭŭs;ĭŭs;ĭŭs;ĭōrĭs;ĭōrī;ĭōrĕ;ĭōră;ĭōră;ĭōră;ĭōrŭm;ĭōrĭbŭs
des:85-120:1:īssĭm$lupus;īssĭm$uita;īssĭm$templum""".split("\n"))
    x._modeles[fortis.gr()] = fortis
    # `infans`: like fortis plus an extra genitive plural variant in -um.
    infans = parser.parse_modele("""modele:infans
pere:fortis
des+:22,34,46:1:ŭm3""".split("\n"))
    x._modeles[infans.gr()] = infans
    lemma = parser.parse_lemme("sĕquens=sĕquēns|infans|sĕquēnt||entis|44")
    self.assertEqual(
        sorted(lemma.possible_forms()),
        sorted([
            'sequentioris', 'sequentissimae', 'sequentissimam',
            'sequentissimum', 'sequentiores', 'sequentissimorum',
            'sequentissimi', 'sequentissime', 'sequentis', 'sequentior',
            'sequentem', 'sequentiori', 'sequentum', 'sequentissimas',
            'sequentiorem', 'sequens', 'sequentiora', 'sequentius',
            'sequentissimos', 'sequentibus', 'sequenti', 'sequentiorum',
            'sequentissimis', 'sequentes', 'sequente', 'sequentioribus',
            'sequentissimus', 'sequentiore', 'sequentissima',
            'sequentissimarum', 'sequentia', 'sequentissimo', 'sequentium'
        ]))
def align(lemma_file, dictionary_file, collatinus=False, collatinus_dic=None):
    """ Align the lemma file with the dictionary file to create a dictionary of sure genders

    Requires installing unidecode (only when `collatinus_dic` is given).
    Writes the matches to ``result.tsv`` in the current directory and prints
    match statistics plus the list of unmatched lemmas.

    :param lemma_file: _lemma.txt or _noun_lemma.txt (tab-separated lemma/declension)
    :param dictionary_file: Dictionary file from CIRCE/LEMLAT3
    :param collatinus: Use a PyCollatinus lemmatizer to relemmatize unmatched forms
    :param collatinus_dic: Path to a Collatinus lemma file used as a secondary gender source
    """
    # Primary ("O"-sourced) and secondary gender databases keyed by lemma.
    database = {}
    secondary_db = {}
    with open(dictionary_file) as f:
        db = csv.DictReader(f, delimiter="\t")
        for line in db:
            if line["src"] == "O":
                database[line["lemma"]] = {
                    "gen": line["gen"],
                    "pos": line["upostag"]
                }
            else:
                secondary_db[line["lemma"]] = {
                    "gen": line["gen"],
                    "pos": line["upostag"]
                }

    if collatinus_dic:
        from unidecode import unidecode
        # Extract the gender letter from morphologies such as "ae, f.".
        gender = re.compile(r"^\w+, (\w)\..*$")
        with open(collatinus_dic) as f:
            for line in f:
                if len(line.strip()) > 0 and not line.startswith("!"):
                    content = unidecode(line.strip())
                    parts = content.split("|")
                    lemma = parts[0].split("=")[0]
                    morph = parts[-2]
                    if gender.match(morph):
                        secondary_db[lemma] = {
                            "gen": gender.findall(morph)[0],
                            "pos": "NOUN"
                        }

    if collatinus:
        from pycollatinus import Lemmatiseur
        collatinus_lemmatiseur = Lemmatiseur()

    matches = []
    unmatched = []
    maps = {}  # lemma -> form actually looked up in the databases
    # Counters for the statistics printed at the end.
    sec = 0
    proper_nouns = 0
    disambiguate = 0
    relemmatized = 0
    verb_substantived = 0
    adje_substantived = 0
    deduction = 0

    with open(lemma_file) as f:
        for line in f.readlines():
            lemma, decl = tuple(line.strip().split("\t"))
            lemma = lemma.strip().lower().replace("v", "u")
            if decl == "7":
                continue
            elif lemma in database:
                matches.append(lemma)
            elif lemma in secondary_db:
                matches.append(lemma)
                sec += 1
            elif lemma.replace("_n", "") in database or lemma.replace(
                    "_n", "") in secondary_db:
                # Proper noun marked with a "_n" suffix.
                matches.append(lemma)
                maps[lemma] = lemma.replace("_n", "")
                proper_nouns += 1
                sec += int(lemma.replace("_n", "") in secondary_db)
            # BUG FIX: an unreachable duplicate `elif lemma in secondary_db`
            # branch used to sit here; the identical test above already
            # catches it, so it has been removed.
            elif lemma.split("_")[0] in database or lemma.split(
                    "_")[0] in secondary_db:
                # Disambiguated homograph such as "lemma_2".
                matches.append(lemma)
                maps[lemma] = lemma.split("_")[0]
                disambiguate += 1
            else:
                unmatched.append(lemma)
                if collatinus:
                    form = lemma.split("_")[0]
                    # Keep only nouns that are nominatif
                    results = list(
                        filter(
                            lambda res: res["pos"] == "n" and res["morph"].
                            startswith("nominatif"),
                            collatinus_lemmatiseur.lemmatise(
                                form, pos=True, get_lemma_object=False)))
                    # Neuter nominative verb forms (substantived participles etc.).
                    vs = list(
                        filter(
                            lambda res: res["pos"] == "v" and "nominatif" in
                            res["morph"] and "neutre" in res["morph"],
                            collatinus_lemmatiseur.lemmatise(
                                form, pos=True, get_lemma_object=False)))
                    # Any nominative reading, used as a last resort for adjectives.
                    adjs = list(
                        filter(
                            lambda res: "nominatif" in res["morph"],
                            collatinus_lemmatiseur.lemmatise(
                                form, pos=True, get_lemma_object=False)))
                    if len(results):
                        uniques = list(set(map(lambda x: x["lemma"], results)))
                        if len(uniques) == 1:
                            matches.append(lemma)
                            maps[lemma] = form
                            relemmatized += 1
                            unmatched.pop()
                        elif len(uniques) > 1:
                            # Several candidate lemmas: accept only if they
                            # agree on a single gender in the secondary db.
                            gs = list(
                                set([
                                    secondary_db[lem]["gen"]
                                    for lem in uniques if lem in secondary_db
                                ]))
                            if len(gs) == 1:
                                secondary_db[form] = {
                                    "gen": gs[0],
                                    "pos": "NOUN"
                                }
                                matches.append(lemma)
                                maps[lemma] = form
                                relemmatized += 1
                                unmatched.pop()
                    elif len(vs) > 0:
                        # Substantived neuter verb form.
                        verb_substantived += 1
                        secondary_db[form] = {"gen": "n", "pos": "NOUN"}
                        matches.append(lemma)
                        maps[lemma] = form
                        unmatched.pop()
                    else:
                        # Substantived adjective: accept if all readings agree
                        # on the gender (second token of the morph string).
                        gs = list(
                            set(map(lambda x: x["morph"].split()[1], adjs)))
                        if len(gs) == 1:
                            adje_substantived += 1
                            secondary_db[form] = {"gen": gs[0], "pos": "NOUN"}
                            matches.append(lemma)
                            maps[lemma] = form
                            unmatched.pop()
                # Still unmatched: deduce the gender from the ending.
                if len(unmatched) and unmatched[-1] == lemma:
                    form = lemma.split("_n")[0]
                    if form.endswith("i") or form.endswith(
                            "es") and not form.endswith("des"):
                        # Romani, aethiopes -> Masc et Fem
                        deduction += 1
                        secondary_db[lemma] = {"gen": "3", "pos": "NOUN"}
                        matches.append(lemma)
                        maps[lemma] = lemma
                        unmatched.pop()
                    elif form.endswith("us"):  # Romanus -> Masc
                        deduction += 1
                        secondary_db[lemma] = {"gen": "4", "pos": "NOUN"}
                        matches.append(lemma)
                        maps[lemma] = lemma
                        unmatched.pop()
                    elif form.endswith("a") or form.endswith(
                            "ae"):  # Albina -> Fem
                        deduction += 1
                        secondary_db[lemma] = {"gen": "2", "pos": "NOUN"}
                        matches.append(lemma)
                        maps[lemma] = lemma
                        unmatched.pop()

    total = max(len(unmatched) + len(matches), 1)  # avoid division by zero
    print(
        "{percent:.2f} % of matched lemma over {total} lemma, leaving {unm} unmatched "
        "\n\t- {sec:.2f}% from secondary db"
        "\n\t- {prop:.2f}% remapped proper nouns"
        "\n\t- {undesi:.2f}% undesambiguated nouns"
        "\n\t- {relem:.2f}% relemmatized nouns"
        "\n\t- {verb:.2f}% relemmatized substantived neutral verbs"
        "\n\t- {adje:.2f}% relemmatized substantived adjective"
        "\n\t- {deducted:.2f}% deducted genders".format(
            unm=len(unmatched),
            percent=len(matches) / total * 100,
            total=total,
            sec=sec / total * 100,
            prop=proper_nouns / total * 100,
            undesi=disambiguate / total * 100,
            relem=relemmatized / total * 100,
            verb=verb_substantived / total * 100,
            adje=adje_substantived / total * 100,
            deducted=deduction / total * 100,
        ))
    # Dump "lemma<TAB>gender" for every match; "???" when no gender is known.
    with open("result.tsv", "w") as f:
        for lemma in matches:
            form = lemma
            if lemma in maps:
                form = maps[lemma]
            f.write("\t".join([
                lemma,
                database.get(form, secondary_db.get(form, {"gen": "???"}))
                ["gen"]
            ]) + "\n")
    print("\n".join(unmatched))
def text2matrixNdictFreq(path, lemma=False):
    """Build a preposition/word co-occurrence *frequency* dictionary for a text file.

    :param path: path of the text file, read through ``getText``
    :param lemma: when True, lemmatize each word with PyCollatinus; otherwise
        stem it with three passes of ``removeEndings``
    :return: result of ``DictLatCrossFreq`` over prepositions, the unique stem
        list and the full stem sequence
    """
    text = getText(path)
    # Drop paragraphs containing editorial-note markers, lowercase the rest
    # and keep only latin letters and spaces.
    paragraphs = []
    for raw_line in text.split('\n'):
        if '|' in raw_line or 'HN' in raw_line or 'PHV' in raw_line or 'PNV' in raw_line:
            continue
        cleaned = re.sub(r'[^a-z ]+', '', raw_line.lower())
        # BUG FIX: the emptiness test previously checked the *raw* line
        # instead of the cleaned one.
        if not cleaned:
            continue
        paragraphs.append(cleaned)

    words = []
    for paragraph in paragraphs:
        words += paragraph.split()

    # Roman numerals must be filtered out (note from 2021-01-01); the
    # lemmatizer also capitalizes all proper nouns, which this pattern covers.
    roman_numeral = re.compile(
        r"^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$")

    latinstem = []
    if lemma:
        F = createLemmatiser(Lemmatiseur())
        for word in words:
            y = F(word)
            if not y or roman_numeral.search(y):
                continue
            latinstem.append(y)
    else:
        for word in words:
            # Three stemming passes, as endings can be nested.
            y = removeEndings(removeEndings(removeEndings(word)))
            if not y or roman_numeral.search(y):
                continue
            latinstem.append(y)

    # Unique, sorted stem vocabulary; keep only stems longer than 2 chars.
    # (Removed dead debug loops that computed and discarded length statistics.)
    listForDict = sorted(set(latinstem))
    listForDict = [stem for stem in listForDict if len(stem) > 2]

    ListOfPrepRem = [removeEndings(prep) for prep in ListOfPrep]
    return DictLatCrossFreq(ListOfPrepRem, listForDict, latinstem)
def text2matrixNdict(path, lemma=False):
    """Build a preposition/word co-occurrence dictionary for a text file.

    Same pipeline as ``text2matrixNdictFreq`` but finishes with
    ``DictLatCross`` instead of the frequency variant.

    :param path: path of the text file, read through ``getText``
    :param lemma: when True, lemmatize each word with PyCollatinus; otherwise
        stem it with three passes of ``removeEndings``
    :return: result of ``DictLatCross`` over prepositions, the unique stem
        list and the full stem sequence
    """
    text = getText(path)
    # Drop paragraphs containing editorial-note markers, lowercase the rest
    # and keep only latin letters and spaces.
    paragraphs = []
    for raw_line in text.split('\n'):
        if '|' in raw_line or 'HN' in raw_line or 'PHV' in raw_line or 'PNV' in raw_line:
            continue
        cleaned = re.sub(r'[^a-z ]+', '', raw_line.lower())
        # BUG FIX: the emptiness test previously checked the *raw* line
        # instead of the cleaned one.
        if not cleaned:
            continue
        paragraphs.append(cleaned)

    words = []
    for paragraph in paragraphs:
        words += paragraph.split()

    # Roman numerals must be filtered out; the lemmatizer capitalizes all
    # proper nouns, which this pattern covers.
    roman_numeral = re.compile(
        r"^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$")

    latinstem = []
    if lemma:
        F = createLemmatiser(Lemmatiseur())
        for word in words:
            y = F(word)
            if not y or roman_numeral.search(y):
                continue
            latinstem.append(y)
    else:
        for word in words:
            # Three stemming passes, as endings can be nested.
            y = removeEndings(removeEndings(removeEndings(word)))
            if not y or roman_numeral.search(y):
                continue
            latinstem.append(y)

    # Unique, sorted stem vocabulary; keep only stems longer than 2 chars.
    # (Removed dead debug loops that computed and discarded length statistics.)
    listForDict = sorted(set(latinstem))
    listForDict = [stem for stem in listForDict if len(stem) > 2]

    ListOfPrepRem = [removeEndings(prep) for prep in ListOfPrep]
    return DictLatCross(ListOfPrepRem, listForDict, latinstem)
class TestSentences(ExtendedTestCase):
    """End-to-end lemmatisation tests running against the full Collatinus data set."""

    # Shared fully-loaded lemmatizer: loading is expensive, so build it once
    # at class level and reuse it across tests.
    lemmatizer = Lemmatiseur()
    parser = Parser(lemmatizer)

    def test_cogito_ergo_sum(self):
        """Each word of "cogito ergo sum" must receive all its competing analyses."""
        results = TestSentences.lemmatizer.lemmatise_multiple("cogito ergo sum")
        self.assertLemmatisationMultipleEqual(
            results,
            [
                [{'lemma': 'cogo', 'morph': '2ème singulier impératif futur actif', 'form': 'cogito', 'radical': 'cog', 'desinence': 'ito'},
                 {'lemma': 'cogo', 'morph': '3ème singulier impératif futur actif', 'form': 'cogito', 'radical': 'cog', 'desinence': 'ito'},
                 {'lemma': 'cogito', 'morph': '1ère singulier indicatif présent actif', 'form': 'cogito', 'radical': 'cogit', 'desinence': 'o'},
                 {'lemma': 'cogito', 'morph': '1ère singulier indicatif présent actif', 'form': 'cogito', 'radical': 'cogit', 'desinence': 'o'}],
                [{'lemma': 'ergo', 'morph': '1ère singulier indicatif présent actif', 'form': 'ergo', 'radical': 'erg', 'desinence': 'o'},
                 {'lemma': 'ergo', 'morph': 'positif', 'form': 'ergo', 'radical': 'ergo', 'desinence': ''},
                 {'lemma': 'ergo', 'morph': '-', 'form': 'ergo', 'radical': 'ergo', 'desinence': ''}],
                [{'lemma': 'sum', 'morph': '1ère singulier indicatif présent actif', 'form': 'sum', 'radical': 's', 'desinence': 'um'}]
            ],
            "Ergo, sum and cogito should be recognized"
        )

    def test_ego_romanus(self):
        """Pronoun "mihi" and the ambiguous "Romanorum" get full POS-tagged analyses."""
        results = TestSentences.lemmatizer.lemmatise_multiple("mihi Romanorum", pos=True)
        self.maxDiff = 5000  # show full diffs: expected data is large
        self.assertLemmatisationMultipleEqual(results, [
            [
                {'form': 'mihi', 'morph': 'datif féminin singulier', 'lemma': 'ego', 'pos': 'p', "radical": "", "desinence": "mihi"},
                {'form': 'mihi', 'morph': 'datif masculin singulier', 'lemma': 'ego', 'pos': 'p', "radical": "", "desinence": "mihi"}
            ],
            [
                {'form': 'romanorum', 'pos': 'n', 'morph': 'génitif pluriel', 'lemma': 'Romani', "radical": "Roman", "desinence": "orum"},
                {'form': 'romanorum', 'pos': 'n', 'morph': 'génitif pluriel', 'lemma': 'Romanus', "radical": "Roman", "desinence": "orum"},
                {'form': 'romanorum', 'pos': 'a', 'morph': 'génitif masculin pluriel', 'lemma': 'Romanus', "radical": "Roman", "desinence": "orum"},
                {'form': 'romanorum', 'pos': 'a', 'morph': 'génitif neutre pluriel', 'lemma': 'Romanus', "radical": "Roman", "desinence": "orum"},
            ]
        ])

    def test_nec_aliud_sequenti_quadriduo(self):
        """ Check that aliud, an irregular form, is well behaving as well as nec, an invariable """
        results = TestSentences.lemmatizer.lemmatise_multiple("nec aliud sequenti quadriduo")
        expected = [
            [{'lemma': 'nec', 'morph': '-', 'form': 'nec', 'radical': 'nec', 'desinence': ''}],
            [
                # Irregular forms carry no radical/desinence split (None).
                {'form': 'aliud', 'morph': 'nominatif neutre singulier', 'lemma': 'aliud', 'radical': None, 'desinence': None},
                {'form': 'aliud', 'morph': 'vocatif neutre singulier', 'lemma': 'aliud', 'radical': None, 'desinence': None},
                {'form': 'aliud', 'morph': 'accusatif neutre singulier', 'lemma': 'aliud', 'radical': None, 'desinence': None},
            ],
            [
                {'form': 'sequenti', 'morph': 'datif masculin singulier participe présent actif', 'lemma': 'sequor', 'radical': 'sequ', 'desinence': 'enti'},
                {'form': 'sequenti', 'morph': 'datif féminin singulier participe présent actif', 'lemma': 'sequor', 'radical': 'sequ', 'desinence': 'enti'},
                {'form': 'sequenti', 'morph': 'datif neutre singulier participe présent actif', 'lemma': 'sequor', 'radical': 'sequ', 'desinence': 'enti'},
                {'form': 'sequenti', 'morph': 'datif masculin singulier participe présent actif', 'lemma': 'sequo', 'radical': 'sequ', 'desinence': 'enti'},
                {'form': 'sequenti', 'morph': 'datif féminin singulier participe présent actif', 'lemma': 'sequo', 'radical': 'sequ', 'desinence': 'enti'},
                {'form': 'sequenti', 'morph': 'datif neutre singulier participe présent actif', 'lemma': 'sequo', 'radical': 'sequ', 'desinence': 'enti'},
                {'form': 'sequenti', 'morph': 'datif masculin singulier', 'lemma': 'sequens', 'radical': 'sequent', 'desinence': 'i'},
                {'form': 'sequenti', 'morph': 'ablatif masculin singulier', 'lemma': 'sequens', 'radical': 'sequent', 'desinence': 'i'},
                {'form': 'sequenti', 'morph': 'datif féminin singulier', 'lemma': 'sequens', 'radical': 'sequent', 'desinence': 'i'},
                {'form': 'sequenti', 'morph': 'ablatif féminin singulier', 'lemma': 'sequens', 'radical': 'sequent', 'desinence': 'i'},
                {'form': 'sequenti', 'morph': 'datif neutre singulier', 'lemma': 'sequens', 'radical': 'sequent', 'desinence': 'i'},
                {'form': 'sequenti', 'morph': 'ablatif neutre singulier', 'lemma': 'sequens', 'radical': 'sequent', 'desinence': 'i'},
            ],
            [
                {'form': 'quadriduo', 'morph': 'datif singulier', 'lemma': 'quadriduum', 'radical': 'quadridu', 'desinence': 'o'},
                {'form': 'quadriduo', 'morph': 'ablatif singulier', 'lemma': 'quadriduum', 'radical': 'quadridu', 'desinence': 'o'}
            ]
        ]
        self.assertLemmatisationMultipleEqual(
            results, expected,
            "Invar should be correctly recognized"
        )

    def test_possible_forms(self):
        """The lemma object of "bellus" must enumerate all its inflected forms."""
        self.assertEqual(
            sorted(list(self.lemmatizer.lemmatise("bellus", get_lemma_object=True))[0]["lemma"].possible_forms()),
            sorted([
                'belliora', 'bellae', 'bellam', 'bellissimis', 'bellioris',
                'bellissimarum', 'bellissime', 'bellorum', 'bellissimum',
                'belliori', 'bellissimorum', 'bellissima', 'bellum', 'bellus',
                'bellissimae', 'bellis', 'belli', 'belliores', 'bellissimo',
                'bellissimas', 'bellioribus', 'bellas', 'bellior', 'belliore',
                'bellarum', 'bella', 'bellissimus', 'bellissimos', 'belliorum',
                'belle', 'bellos', 'belliorem', 'bellissimam', 'bello',
                'bellissimi', 'bellius'
            ])
        )

    def test_assimilations(self):
        """ Check that lemmatizer handles correctly assimilations """
        # "adprehendant" -> "apprehendant", "expectari" -> "exspectari".
        results = TestSentences.lemmatizer.lemmatise_multiple("adprehendant expectari")
        self.assertLemmatisationMultipleEqual(
            results,
            [
                [{'lemma': 'apprehendo', 'form': 'apprehendant', 'morph': '3ème pluriel subjonctif présent actif', "radical": "apprehend", "desinence": "ant"}],
                [{'lemma': 'exspecto', 'form': 'exspectari', 'morph': 'infinitif présent passif', "radical": "exspect", "desinence": "ari"}]
            ]
        )

    def test_contractions(self):
        """ Check that the lemmatizer handles correctly contractions """
        # "exspirasset" -> "exspirauisset", "legarat" -> "legauerat".
        results = TestSentences.lemmatizer.lemmatise_multiple("exspirasset legarat legerat", get_lemma_object=True)
        self.assertLemmatisationMultipleEqual(
            results,
            [
                [
                    {'form': 'exspirauisset', 'morph': '3ème singulier subjonctif PQP actif', 'lemma': TestSentences.lemmatizer.lemme("exspiro"), 'radical': 'exspirav', 'desinence': 'isset'}
                ],
                [
                    {'lemma': TestSentences.lemmatizer.lemme("lego2"), 'morph': '3ème singulier indicatif PQP actif', 'form': 'legauerat', 'radical': 'legav', 'desinence': 'erat'}
                ],
                [
                    {'lemma': TestSentences.lemmatizer.lemme("lego"), 'morph': '3ème singulier indicatif PQP actif', 'form': 'legerat', 'radical': 'leg', 'desinence': 'erat'}
                ]
            ],
            _lemma_obj=True
        )

    def test_lower_case(self):
        """Capitalized input must match both capitalized and lowercase lemmas."""
        results = TestSentences.lemmatizer.lemmatise("Christi", get_lemma_object=True)
        self.assertLemmatisationEqual(
            results,
            [
                {'lemma': TestSentences.lemmatizer.lemme("Christus"), 'form': 'christi', 'morph': 'génitif singulier', 'radical': 'Christ', 'desinence': 'i'},
                {'lemma': TestSentences.lemmatizer.lemme("Christus"), 'form': 'christi', 'morph': 'nominatif pluriel', 'radical': 'Christ', 'desinence': 'i'},
                {'lemma': TestSentences.lemmatizer.lemme("Christus"), 'form': 'christi', 'morph': 'vocatif pluriel', 'radical': 'Christ', 'desinence': 'i'},
                {'lemma': TestSentences.lemmatizer.lemme("christus2"), 'form': 'christi', 'morph': 'génitif masculin singulier', 'radical': 'christ', 'desinence': 'i'},
                {'lemma': TestSentences.lemmatizer.lemme("christus2"), 'form': 'christi', 'morph': 'nominatif masculin pluriel', 'radical': 'christ', 'desinence': 'i'},
                {'lemma': TestSentences.lemmatizer.lemme("christus2"), 'form': 'christi', 'morph': 'vocatif masculin pluriel', 'radical': 'christ', 'desinence': 'i'},
                {'lemma': TestSentences.lemmatizer.lemme("christus2"), 'form': 'christi', 'morph': 'génitif neutre singulier', 'radical': 'christ', 'desinence': 'i'}
            ],
            _lemma_obj=True
        )

    def test_roman_num(self):
        """Roman numerals are recognized as invariable numeral adjectives, case kept."""
        results = TestSentences.lemmatizer.lemmatise_multiple("XIV MDCXXIV xiv", get_lemma_object=True, as_list=True)
        self.assertLemmatisationMultipleEqual(
            results,
            [
                [
                    {'lemma': TestSentences.parser.parse_lemme("XIV|inv|||adj. num.|1", 0, _deramise=False), 'form': 'XIV', 'morph': '', 'radical': None, 'desinence': None},
                ],
                [
                    {'lemma': TestSentences.parser.parse_lemme("MDCXXIV|inv|||adj. num.|1", 0, _deramise=False), 'form': 'MDCXXIV', 'morph': '', 'radical': None, 'desinence': None},
                ],
                [
                    {'lemma': TestSentences.parser.parse_lemme("XIV|inv|||adj. num.|1", 0, _deramise=False), 'form': 'XIV', 'morph': '', 'radical': None, 'desinence': None},
                ]
            ],
            _lemma_obj=True
        )

    def test_when_there_is_a_non_word_char(self):
        """Punctuation and extra spaces must not produce extra tokens."""
        results = TestSentences.lemmatizer.lemmatise_multiple(
            "Qui, quae , quod ! ", pos=True
        )
        self.assertEqual(len(results), 3, "Splitting should be operational")

    def test_when_there_is_a_suffixe(self):
        """Enclitic suffixes such as "-que" ("gravesque") must be stripped before analysis."""
        results = TestSentences.lemmatizer.lemmatise_multiple(
            "Et flavescit haphe gravesque draucis ", pos=True
        )
        self.assertLemmatisationMultipleEqual(
            results,
            [[{'pos': 'cd', 'form': 'et', 'lemma': 'et', 'morph': '-', 'desinence': '', 'radical': 'et'}],
             [{'pos': 'v', 'form': 'flauescit', 'lemma': 'flavesco', 'morph': '3ème singulier indicatif présent actif', 'desinence': 'it', 'radical': 'flavesc'}],
             [{'pos': 'n', 'form': 'haphe', 'lemma': 'haphe', 'morph': 'nominatif singulier', 'desinence': 'e', 'radical': 'haph'},
              {'pos': 'n', 'form': 'haphe', 'lemma': 'haphe', 'morph': 'vocatif singulier', 'desinence': 'e', 'radical': 'haph'},
              {'pos': 'n', 'form': 'haphe', 'lemma': 'haphe', 'morph': 'ablatif singulier', 'desinence': 'e', 'radical': 'haph'}],
             [{'pos': 'a', 'form': 'graues', 'lemma': 'gravis', 'morph': 'nominatif masculin pluriel', 'desinence': 'es', 'radical': 'grav'},
              {'pos': 'a', 'form': 'graues', 'lemma': 'gravis', 'morph': 'vocatif masculin pluriel', 'desinence': 'es', 'radical': 'grav'},
              {'pos': 'a', 'form': 'graues', 'lemma': 'gravis', 'morph': 'accusatif masculin pluriel', 'desinence': 'es', 'radical': 'grav'},
              {'pos': 'a', 'form': 'graues', 'lemma': 'gravis', 'morph': 'nominatif féminin pluriel', 'desinence': 'es', 'radical': 'grav'},
              {'pos': 'a', 'form': 'graues', 'lemma': 'gravis', 'morph': 'vocatif féminin pluriel', 'desinence': 'es', 'radical': 'grav'},
              {'pos': 'a', 'form': 'graues', 'lemma': 'gravis', 'morph': 'accusatif féminin pluriel', 'desinence': 'es', 'radical': 'grav'},
              {'pos': 'v', 'form': 'graues', 'lemma': 'grauo', 'morph': '2ème singulier subjonctif présent actif', 'desinence': 'es', 'radical': 'grau'}
              ],
             [{'pos': 'n', 'form': 'draucis', 'lemma': 'Draucus', 'morph': 'datif pluriel', 'desinence': 'is', 'radical': 'Drauc'},
              {'pos': 'n', 'form': 'draucis', 'lemma': 'Draucus', 'morph': 'ablatif pluriel', 'desinence': 'is', 'radical': 'Drauc'},
              {'pos': 'n', 'form': 'draucis', 'lemma': 'draucus', 'morph': 'datif pluriel', 'desinence': 'is', 'radical': 'drauc'},
              {'pos': 'n', 'form': 'draucis', 'lemma': 'draucus', 'morph': 'ablatif pluriel', 'desinence': 'is', 'radical': 'drauc'}
              ]
             ]
        )
# Minimal usage example: lemmatise a short Latin sentence with PyCollatinus.
from pycollatinus import Lemmatiseur

analyzer = Lemmatiseur()  # loads the full Collatinus data (slow on first run)
print(analyzer.lemmatise_multiple("arma virum cano"))
def test_contraction(self):
    """Contracted perfect forms ("legarat") must expand to their full analysis ("legauerat").

    Registers the full first-conjugation model (``amo``) plus its second- and
    third-conjugation derivatives (``moneo``, ``lego``), then checks that the
    contracted "legarat" is expanded while the regular "legerat" stays as-is.
    """
    x = Lemmatiseur(load=False)
    parser = Parser(x)
    parser.ajMorphos()
    parser.ajContractions()
    parser.ajAssims()
    load_mod_vars(x)
    # First conjugation model, complete active/passive paradigm.
    amo = parser.parse_modele("""modele:amo
R:0:1,0
R:1:1,āv
R:2:1,āt
des:121-126:0:ō̆;ās;ăt;āmŭs;ātĭs;ānt
des:127-132:0:ābăm;ābās;ābăt;ābāmŭs;ābātĭs;ābānt
des:133-138:0:ābō̆;ābĭs;ābĭt;ābĭmŭs;ābĭtĭs;ābūnt
des:139-144:1:ī;īstī;ĭt;ĭmŭs;īstĭs;ērūnt,ērĕ
des:145-150:1:ĕrăm;ĕrās;ĕrăt;ĕrāmŭs;ĕrātĭs;ĕrānt
des:151-156:1:ĕrō̆;ĕrī̆s;ĕrĭt;ĕrī̆mŭs;ĕrī̆tĭs;ĕrīnt
des:157-162:0:$em
des:163-168:0:ārĕm;ārēs;ārĕt;ārēmŭs;ārētĭs;ārēnt
des:169-174:1:ĕrĭm;ĕrī̆s;ĕrĭt;ĕrī̆mŭs;ĕrī̆tĭs;ĕrīnt
des:175-180:1:īssĕm;īssēs;īssĕt;īssēmŭs;īssētĭs;īssēnt
des:181-186:0:ā;ātĕ;ātō;ātō;ātōtĕ;āntō
des:187:0:ārĕ
des:188:1:īssĕ
des:188:0:āssĕ
des:189-200:0:āns;āns;āntĕm;āntĭs;āntī;āntĕ;āntēs;āntēs;āntēs;āntĭŭm,āntŭm;āntĭbŭs;āntĭbŭs
des:201-212:0:āns;āns;āntĕm;āntĭs;āntī;āntĕ;āntēs;āntēs;āntēs;āntĭŭm,āntŭm;āntĭbŭs;āntĭbŭs
des:213-224:0:āns;āns;āns;āntĭs;āntī;āntĕ;āntĭă;āntĭă;āntĭă;āntĭŭm,āntŭm;āntĭbŭs;āntĭbŭs
des:225-236:2:ūr$lupus
des:237-248:2:ūr$uita
des:249-260:2:ūr$templum
des:261-264:0:āndŭm;āndī;āndō;āndō
des:265,266:2:ŭm;ū
des:267-272:0:ŏr;ārĭs,ārĕ;ātŭr;āmŭr;āmĭnī;āntŭr
des:273-278:0:ābăr;ābārĭs,ābārĕ;ābātŭr;ābāmŭr;ābāmĭnī;ābāntŭr
des:279-284:0:ābŏr;ābĕrĭs,ābĕrĕ;ābĭtŭr;ābĭmŭr;ābĭmĭnī;ābūntŭr
des:285-290:0:ĕr;ērĭs,ērĕ;ētŭr;ēmŭr;ēmĭnī;ēntŭr
des:291-296:0:ārĕr;ārērĭs,ārērĕ;ārētŭr;ārēmŭr;ārēmĭnī;ārēntŭr
des:297,298:0:ārĕ;āmĭnī
des:299-301:0:ātŏr;ātŏr;āntŏr
des:302:0:ārī
des:303-314:2:$lupus
des:315-326:2:$uita
des:327-338:2:$templum
des:339-350:0:ānd$lupus
des:351-362:0:ānd$uita
des:363-374:0:ānd$templum""".split("\n"))
    x._modeles[amo.gr()] = amo
    # Second conjugation model, inherits from amo and overrides endings.
    moneo = parser.parse_modele("""modele:moneo
pere:amo
R:0:2,0
R:2:-
des:121-126:0:ĕō̆;ēs;ĕt;ēmŭs;ētĭs;ēnt
des:127-132:0:ēbăm;ēbās;ēbăt;ēbāmŭs;ēbātĭs;ēbānt
des:133-138:0:ēbō̆;ēbĭs;ēbĭt;ēbĭmŭs;ēbĭtĭs;ēbūnt
des:157-162:0:ĕăm;ĕās;ĕăt;ĕāmŭs;ĕātĭs;ĕānt
des:163-168:0:ērĕm;ērēs;ērĕt;ērēmŭs;ērētĭs;ērēnt
des:181-186:0:ē;ētĕ;ētō;ētō;ētōtĕ;ēntō
des:187:0:ērĕ
des:188:1:īssĕ
des:189-200:0:$ens
des:201-212:0:$ens
des:213-224:0:ēns;ēns;ēns;ēntĭs;ēntī;ēntĕ;ēntĭă;ēntĭă;ēntĭă;ēntĭŭm,ēntŭm;ēntĭbŭs;ēntĭbŭs
des:261-264:0:ēndŭm;ēndī;ēndō;ēndō
des:267-272:0:ĕŏr;ērĭs,ērĕ;ētŭr;ēmŭr;ēmĭnī;ēntŭr
des:273-278:0:ēbăr;ēbārĭs,ēbārĕ;ēbātŭr;ēbāmŭr;ēbāmĭnī;ēbāntŭr
des:279-284:0:ēbŏr;ēbĕrĭs,ēbĕrĕ;ēbĭtŭr;ēbĭmŭr;ēbĭmĭnī;ēbūntŭr
des:285-290:0:ĕăr;ĕārĭs,ĕārĕ;ĕātŭr;ĕāmŭr;ĕāmĭnī;ĕāntŭr
des:291-296:0:ērĕr;ērērĭs,ērērĕ;ērētŭr;ērēmŭr;ērēmĭnī;ērēntŭr
des:297,298:0:ērĕ;ēmĭnī
des:299-301:0:ētŏr;ētŏr;ēntŏr
des:302:0:ērī
des:339-350:0:ēndŭs;ēndĕ;ēndŭm;ēndī;ēndō;ēndō;ēndī;ēndī;ēndōs;ēndōrŭm;ēndīs;ēndīs
des:351-362:0:ēndă;ēndă;ēndăm;ēndāe;ēndāe;ēndā;ēndāe;ēndāe;ēndās;ēndārŭm;ēndīs;ēndīs
des:363-374:0:ēndŭm;ēndŭm;ēndŭm;ēndī;ēndō;ēndō;ēndă;ēndă;ēndă;ēndōrŭm;ēndīs;ēndīs"""
                                .split("\n"))
    x._modeles[moneo.gr()] = moneo
    # Third conjugation model, inherits from moneo.
    lego = parser.parse_modele("""modele:lego
pere:moneo
R:0:1,0
des:121-126:0:ō̆;ĭs;ĭt;ĭmŭs;ĭtĭs;ūnt
des:133-138:0:ăm;ēs;ĕt;ēmŭs;ētĭs;ēnt
des:157-162:0:ăm;ās;ăt;āmŭs;ātĭs;ānt
des:163-168:0:ĕrĕm;ĕrēs;ĕrĕt;ĕrēmŭs;ĕrētĭs;ĕrēnt
des:181-186:0:ĕ;ĭtĕ;ĭtō;ĭtō;ĭtōtĕ;ūntō
des:187:0:ĕrĕ
des:267-272:0:ŏr;ĕrĭs,ĕrĕ;ĭtŭr;ĭmŭr;ĭmĭnī;ūntŭr
des:279-284:0:ăr;ērĭs,ērĕ;ētŭr;ēmŭr;ēmĭnī;ēntŭr
des:285-290:0:ăr;ārĭs,ārĕ;ātŭr;āmŭr;āmĭnī;āntŭr
des:291-296:0:ĕrĕr;ĕrērĭs,ĕrērĕ;ĕrētŭr;ĕrēmŭr;ĕrēmĭnī;ĕrēntŭr
des:297,298:0:ĕrĕ;ĭmĭnī
des:299-301:0:ĭtŏr;ĭtŏr;ūntŏr
des:302:0:ī""".split("\n"))
    x._modeles[lego.gr()] = lego
    # Two homograph lemmas: lego2 (1st conj.) contracts, lego (3rd conj.) does not.
    parser.parse_lemme("lēgo2|amo|||as, are|34", origin=0)
    parser.parse_lemme("lĕgo|lego|lēg|lēct|is, ere, legi, lectum|619",
                       origin=0)
    self.assertEqual(
        x.lemmatise_multiple("legarat legerat", get_lemma_object=True),
        [[{
            'lemma': x.lemme("lego2"),
            'morph': '3ème singulier indicatif PQP actif',
            'form': 'legauerat',
            "radical": "legav",
            "desinence": "erat"
        }], [{
            'lemma': x.lemme("lego"),
            'morph': '3ème singulier indicatif PQP actif',
            'form': 'legerat',
            "radical": "leg",
            "desinence": "erat"
        }]])