def test_hierosylima(self):
        """Lemmatise forms of two hand-registered lemmas.

        Builds a minimal lemmatiser from scratch: registers the ``uita``,
        ``roma``, ``doctus`` and ``aureus`` models, then the lemmas
        ``Hierosolyma`` and ``Lycaonius``, and checks that one inflected
        form of each yields exactly the expected analysis
        (lemma / form / morph / radical / desinence).
        """
        # Fix: removed the unused local `test = " Hierosolyma"` left over
        # from an earlier version of this test.
        x = Lemmatiseur(load=False)
        parser = Parser(x)
        parser.ajMorphos()
        parser.ajContractions()
        parser.ajAssims()
        load_mod_vars(x)
        # Base first-declension model used as parent by `roma`.
        parser.register_modele(
            parser.parse_modele("""modele:uita
R:1:1,0
des:1-12:1:$uita""".split("\n")))
        parser.register_modele(
            parser.parse_modele("""modele:roma
pere:uita
des:413:1:āe""".split("\n")))
        # Adjective model with comparative/superlative radicals.
        parser.register_modele(
            parser.parse_modele("""modele:doctus
R:0:2,0
R:1:2,ĭ
R:2:2,īssĭm
des:13-48:0:$lupus;$uita;$templum
des:49-84:1:$compar;$compar;ŭs;ŭs;ŭs;ōrĭs;ōrī;ōrĕ;ōră;ōră;ōră;ōrŭm;ōrĭbŭs
des:85-120:2:$lupus;$uita;$templum""".split("\n")))
        # Child of doctus without comparison degrees (abs:49-120).
        parser.register_modele(
            parser.parse_modele("""modele:aureus
pere:doctus
des:14:0:ŭs
abs:49-120""".split("\n")))

        parser.parse_lemme("Hĭĕrŏsŏlўma|roma|||ae, f.|5")
        parser.parse_lemme("Lўcāŏnĭus|aureus|||a, um|5")
        self.assertEqual(list(x.lemmatise("Hierosolymam")), [{
            'lemma': 'Hierosolyma',
            'form': 'hierosolymam',
            'morph': 'accusatif singulier',
            'radical': 'Hierosolym',
            "desinence": "am"
        }])
        self.assertEqual(list(x.lemmatise("Lycaonios")), [{
            'morph': 'accusatif masculin pluriel',
            'lemma': 'Lycaonius',
            'form': 'lycaonios',
            'radical': 'Lycaoni',
            "desinence": "os"
        }])
示例#2
0
    def test_dump_and_load(self):
        """Round-trip test: compile a lemmatiser to disk, reload it and
        verify that "mihi Romanorum" is analysed exactly as before."""
        compiled = Lemmatiseur()
        compiled.compile()

        # Drop the in-memory instance so the next call must hit the dump.
        del compiled

        reloaded = Lemmatiseur.load()
        self.maxDiff = 5000
        expected = [
            [
                {'form': 'mihi', 'morph': 'datif féminin singulier',
                 'lemma': 'ego', 'pos': 'p',
                 'radical': '', 'desinence': 'mihi'},
                {'form': 'mihi', 'morph': 'datif masculin singulier',
                 'lemma': 'ego', 'pos': 'p',
                 'radical': '', 'desinence': 'mihi'},
            ],
            [
                {'form': 'romanorum', 'pos': 'n',
                 'morph': 'génitif pluriel', 'lemma': 'Romani',
                 'radical': 'Roman', 'desinence': 'orum'},
                {'form': 'romanorum', 'pos': 'n',
                 'morph': 'génitif pluriel', 'lemma': 'Romanus',
                 'radical': 'Roman', 'desinence': 'orum'},
                {'form': 'romanorum', 'pos': 'a',
                 'morph': 'génitif masculin pluriel', 'lemma': 'Romanus',
                 'radical': 'Roman', 'desinence': 'orum'},
                {'form': 'romanorum', 'pos': 'a',
                 'morph': 'génitif neutre pluriel', 'lemma': 'Romanus',
                 'radical': 'Roman', 'desinence': 'orum'},
            ],
        ]
        self.assertLemmatisationMultipleEqual(
            reloaded.lemmatise_multiple("mihi Romanorum", pos=True),
            expected)
    def test_romanorum(self):
        """The ambiguous form 'Romanorum' must resolve to every matching
        lemma (Romani noun, Romanus noun, Romanus adjective), while
        'Romana' resolves to the adjective alone."""
        lemmatiseur = Lemmatiseur(load=False)
        parser = Parser(lemmatiseur)
        parser.ajMorphos()
        load_mod_vars(lemmatiseur)

        # Register the three models in dependency order
        # (liberi declares lupus as its parent).
        model_sources = [
            ["modele:lupus", "R:1:2,0", "des:1-12:1:$lupus", "pos:n"],
            [
                "modele:doctus", "R:0:2,0", "R:1:2,ĭ", "R:2:2,īssĭm",
                "des:13-48:0:$lupus;$uita;$templum",
                "des:49-84:1:$compar;$compar;ŭs;ŭs;ŭs;ōrĭs;ōrī;ōrĕ;ōră;ōră;ōră;ōrŭm;ōrĭbŭs",
                "des:85-120:2:$lupus;$uita;$templum", "pos:a"
            ],
            ["modele:liberi", "pere:lupus", "R:1:1,0", "abs:1-6", "pos:n"],
        ]
        for source in model_sources:
            model = parser.parse_modele(source)
            lemmatiseur._modeles[model.gr()] = model

        parser.parse_lemme("Rōmānus|doctus|||a, um|2392", origin=0)
        parser.parse_lemme("Rōmānus2|lupus|||i, m.|8", origin=0)
        parser.parse_lemme("Rōmāni|liberi|||orum, m.|262", origin=0)

        def lemma_pos_pairs(form):
            # Unique "lemma|pos" strings over every analysis of `form`.
            return sorted({
                analysis["lemma"] + "|" + analysis["pos"]
                for analysis in lemmatiseur.lemmatise(form, pos=True)
            })

        self.assertEqual(lemma_pos_pairs("Romanorum"),
                         ["Romani|n", "Romanus|a", "Romanus|n"])
        self.assertEqual(lemma_pos_pairs("Romana"), ["Romanus|a"])
    def test_invariables(self):
        """Invariable words (nec, ergo) must lemmatise to themselves with
        morph '-' and an empty desinence."""
        lemmatiseur = Lemmatiseur(load=False)
        parser = Parser(lemmatiseur)
        parser.ajMorphos()
        invariable = parser.parse_modele(
            ["modele:inv", "R:0:0,0", "des:416:0:-"])
        lemmatiseur._modeles[invariable.gr()] = invariable

        parser.parse_lemme("nĕc|inv|||adv.|6689", origin=0)
        parser.parse_lemme("ergō=ērgō|inv|||conj.|1450", origin=0)

        # For an invariable, lemma == form == radical and the
        # desinence is empty.
        for form in ("nec", "ergo"):
            self.assertEqual(list(lemmatiseur.lemmatise(form)), [{
                'lemma': form,
                'morph': '-',
                'form': form,
                'radical': form,
                'desinence': '',
            }])
    def test_sequens(self):
        """Check every generated form of the lemma ``sequens``.

        Registers the third-declension adjective chain
        doctus -> fortis -> infans (``des+`` adds extra endings on top of
        the inherited fortis paradigm — presumably the gen. pl. variant
        'sequentum'; confirm against the morphos indices), then compares
        the full set of possible forms, comparatives and superlatives
        included, against a hand-written list.
        """
        x = Lemmatiseur(load=False)
        parser = Parser(x)
        parser.ajMorphos()
        parser.ajContractions()
        parser.ajAssims()
        load_mod_vars(x)

        # Base adjective model with positive/comparative/superlative radicals.
        doctus = parser.parse_modele("""modele:doctus
R:0:2,0
R:1:2,ĭ
R:2:2,īssĭm
des:13-48:0:$lupus;$uita;$templum
des:49-84:1:$compar;$compar;ŭs;ŭs;ŭs;ōrĭs;ōrī;ōrĕ;ōră;ōră;ōră;ōrŭm;ōrĭbŭs
des:85-120:2:$lupus;$uita;$templum""".split("\n"))
        x._modeles[doctus.gr()] = doctus
        # Third-declension adjective; R:4:K keeps the lemma form itself.
        fortis = parser.parse_modele("""modele:fortis
pere:doctus
R:0:2,0
R:4:K
R:5:2,ĭ
des:13,14,25,26:4:-;-;-;-
des:15-24:1:ĕm;ĭs;ī;ī;ēs;ēs;ēs,īs;ĭŭm;ĭbŭs
des:27-36:1:ĕm;ĭs;ī;ī;ēs;ēs;ēs,īs;ĭŭm;ĭbŭs
des:37-48:1:ĕ;ĕ;ĕ;ĭs;ī;ī;ĭă;ĭă;ĭă;ĭŭm;ĭbŭs
des:49-84:1:ĭ$compar;ĭ$compar;ĭŭs;ĭŭs;ĭŭs;ĭōrĭs;ĭōrī;ĭōrĕ;ĭōră;ĭōră;ĭōră;ĭōrŭm;ĭōrĭbŭs
des:85-120:1:īssĭm$lupus;īssĭm$uita;īssĭm$templum""".split("\n"))
        x._modeles[fortis.gr()] = fortis
        infans = parser.parse_modele("""modele:infans
pere:fortis
des+:22,34,46:1:ŭm3""".split("\n"))
        x._modeles[infans.gr()] = infans
        lemma = parser.parse_lemme("sĕquens=sĕquēns|infans|sĕquēnt||entis|44")
        # Order-insensitive comparison: both sides sorted.
        self.assertEqual(
            sorted(lemma.possible_forms()),
            sorted([
                'sequentioris', 'sequentissimae', 'sequentissimam',
                'sequentissimum', 'sequentiores', 'sequentissimorum',
                'sequentissimi', 'sequentissime', 'sequentis', 'sequentior',
                'sequentem', 'sequentiori', 'sequentum', 'sequentissimas',
                'sequentiorem', 'sequens', 'sequentiora', 'sequentius',
                'sequentissimos', 'sequentibus', 'sequenti', 'sequentiorum',
                'sequentissimis', 'sequentes', 'sequente', 'sequentioribus',
                'sequentissimus', 'sequentiore', 'sequentissima',
                'sequentissimarum', 'sequentia', 'sequentissimo', 'sequentium'
            ]))
示例#6
0
def align(lemma_file, dictionary_file, collatinus=False, collatinus_dic=None):
    """ Align the lemma file with the dictionary file to create
    a dictionary of sure genders

    Requires installing unidecode (when collatinus_dic is given) and
    pycollatinus (when collatinus is True)

    :param lemma_file: _lemma.txt or _noun_lemma.txt
    :param dictionary_file: Dictionary file from CIRCE/LEMLAT3
    :param collatinus: Relemmatize unmatched lemmas with pycollatinus
    :param collatinus_dic: Optional Collatinus lemma file used as an
        additional secondary gender source

    Side effects: writes matched lemmas with their gender to
    ``result.tsv`` in the working directory and prints a summary plus
    the unmatched lemmas.
    """
    # Primary ("O" source rows) and secondary gender databases, keyed by lemma.
    database = {}
    secondary_db = {}
    with open(dictionary_file) as f:
        db = csv.DictReader(f, delimiter="\t")
        for line in db:
            target = database if line["src"] == "O" else secondary_db
            target[line["lemma"]] = {
                "gen": line["gen"],
                "pos": line["upostag"]
            }

    if collatinus_dic:
        from unidecode import unidecode
        # Morphology fields look like "ae, f." -> capture the gender letter.
        gender = re.compile(r"^\w+, (\w)\..*$")
        with open(collatinus_dic) as f:
            for line in f:
                if len(line.strip()) > 0 and not line.startswith("!"):
                    content = unidecode(line.strip())
                    parts = content.split("|")
                    lemma = parts[0].split("=")[0]
                    morph = parts[-2]
                    if gender.match(morph):
                        secondary_db[lemma] = {
                            "gen": gender.findall(morph)[0],
                            "pos": "NOUN"
                        }

    if collatinus:
        from pycollatinus import Lemmatiseur
        collatinus_lemmatiseur = Lemmatiseur()

    matches = []    # lemmas whose gender was resolved
    unmatched = []  # lemmas left without a gender
    maps = {}       # lemma -> key actually used in the databases
    # Counters for the summary printed at the end.
    sec = 0
    proper_nouns = 0
    disambiguate = 0
    relemmatized = 0
    verb_substantived = 0
    adje_substantived = 0
    deduction = 0

    with open(lemma_file) as f:
        for line in f.readlines():
            lemma, decl = tuple(line.strip().split("\t"))
            lemma = lemma.strip().lower().replace("v", "u")
            if decl == "7":
                continue
            elif lemma in database:
                matches.append(lemma)
            elif lemma in secondary_db:
                matches.append(lemma)
                sec += 1
            elif lemma.replace("_n", "") in database or lemma.replace(
                    "_n", "") in secondary_db:
                # Proper noun marked with "_n": retry without the marker.
                matches.append(lemma)
                maps[lemma] = lemma.replace("_n", "")
                proper_nouns += 1
                sec += int(lemma.replace("_n", "") in secondary_db)
            # NOTE: a duplicate, unreachable `elif lemma in secondary_db`
            # branch was removed here — the identical condition is already
            # tested above, so it could never fire.
            elif lemma.split("_")[0] in database or lemma.split(
                    "_")[0] in secondary_db:
                # Disambiguated lemma such as "foo_2": retry the base form.
                matches.append(lemma)
                maps[lemma] = lemma.split("_")[0]
                disambiguate += 1
            else:

                unmatched.append(lemma)

                if collatinus:
                    form = lemma.split("_")[0]
                    # One lemmatiser call filtered three ways (the original
                    # issued three identical calls).
                    analyses = list(
                        collatinus_lemmatiseur.lemmatise(
                            form, pos=True, get_lemma_object=False))
                    # Keep only nouns that are nominatif.
                    results = [
                        res for res in analyses
                        if res["pos"] == "n"
                        and res["morph"].startswith("nominatif")
                    ]
                    # Neuter nominative verb forms (substantived verbs).
                    vs = [
                        res for res in analyses
                        if res["pos"] == "v" and "nominatif" in res["morph"]
                        and "neutre" in res["morph"]
                    ]
                    # Anything nominative (substantived adjectives).
                    adjs = [
                        res for res in analyses
                        if "nominatif" in res["morph"]
                    ]
                    if len(results):
                        uniques = list(set(map(lambda x: x["lemma"], results)))
                        if len(uniques) == 1:
                            matches.append(lemma)
                            maps[lemma] = form
                            relemmatized += 1
                            unmatched.pop()
                        elif len(uniques) > 1:
                            # Several candidate lemmas: accept only when all
                            # known candidates agree on the gender.
                            gs = list(
                                set([
                                    secondary_db[lem]["gen"] for lem in uniques
                                    if lem in secondary_db
                                ]))
                            if len(gs) == 1:
                                secondary_db[form] = {
                                    "gen": gs[0],
                                    "pos": "NOUN"
                                }
                                matches.append(lemma)
                                maps[lemma] = form
                                relemmatized += 1
                                unmatched.pop()
                    elif len(vs) > 0:
                        verb_substantived += 1
                        secondary_db[form] = {"gen": "n", "pos": "NOUN"}
                        matches.append(lemma)
                        maps[lemma] = form
                        unmatched.pop()
                    else:
                        gs = list(
                            set(map(lambda x: x["morph"].split()[1], adjs)))
                        if len(gs) == 1:
                            adje_substantived += 1
                            secondary_db[form] = {"gen": gs[0], "pos": "NOUN"}
                            matches.append(lemma)
                            maps[lemma] = form
                            unmatched.pop()
                # Last resort: deduce the gender from the word ending.
                if len(unmatched) and unmatched[-1] == lemma:
                    form = lemma.split("_n")[0]
                    if form.endswith("i") or form.endswith(
                            "es") and not form.endswith("des"):
                        # Romani, aethiopes -> Masc et Fem
                        deduction += 1
                        secondary_db[lemma] = {"gen": "3", "pos": "NOUN"}
                        matches.append(lemma)
                        maps[lemma] = lemma
                        unmatched.pop()
                    elif form.endswith("us"):  # Romanus -> Masc
                        deduction += 1
                        secondary_db[lemma] = {"gen": "4", "pos": "NOUN"}
                        matches.append(lemma)
                        maps[lemma] = lemma
                        unmatched.pop()
                    elif form.endswith("a") or form.endswith(
                            "ae"):  # Albina -> Fem
                        deduction += 1
                        secondary_db[lemma] = {"gen": "2", "pos": "NOUN"}
                        matches.append(lemma)
                        maps[lemma] = lemma
                        unmatched.pop()

    # Guard against an empty input file (division by zero below).
    total = max(len(unmatched) + len(matches), 1)

    print(
        "{percent:.2f} % of matched lemma over {total} lemma, leaving {unm} unmatched "
        "\n\t- {sec:.2f}% from secondary db"
        "\n\t- {prop:.2f}% remapped proper nouns"
        "\n\t- {undesi:.2f}% undesambiguated nouns"
        "\n\t- {relem:.2f}% relemmatized nouns"
        "\n\t- {verb:.2f}% relemmatized substantived neutral verbs"
        "\n\t- {adje:.2f}% relemmatized substantived adjective"
        "\n\t- {deducted:.2f}% deducted genders".format(
            unm=len(unmatched),
            percent=len(matches) / total * 100,
            total=total,
            sec=sec / total * 100,
            prop=proper_nouns / total * 100,
            undesi=disambiguate / total * 100,
            relem=relemmatized / total * 100,
            verb=verb_substantived / total * 100,
            adje=adje_substantived / total * 100,
            deducted=deduction / total * 100,
        ))
    with open("result.tsv", "w") as f:
        for lemma in matches:
            # Look the lemma up under its remapped key when one exists.
            form = maps.get(lemma, lemma)
            f.write("\t".join([
                lemma,
                database.get(form, secondary_db.get(form, {"gen": "???"}))
                ["gen"]
            ]) + "\n")
    print("\n".join(unmatched))
def text2matrixNdictFreq(path, lemma=False):
    """Build the frequency cross-dictionary for the text at *path*.

    Paragraphs containing annotation markers ('|', 'HN', 'PHV', 'PNV' —
    used for footnotes) are discarded; the rest are lowercased and
    stripped of non-letter characters.  Words are then reduced either
    with the Collatinus lemmatiser (``lemma=True``) or by three passes of
    ``removeEndings``, Roman numerals are filtered out, and the result is
    handed to ``DictLatCrossFreq`` with the stemmed preposition list.

    :param path: path of the text file to process
    :param lemma: lemmatise words instead of stripping endings
    :return: the dictionary produced by DictLatCrossFreq
    """
    # Roman numerals must be filtered out of the word list (see note in
    # the original source, 01.01.2021).  Compiled once, used in the loop.
    roman_numeral = re.compile(
        r"^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$")

    paragraphs = []
    for raw in getText(path).split('\n'):
        # Drop paragraphs carrying editorial markers (footnotes etc.).
        if '|' in raw or 'HN' in raw or 'PHV' in raw or 'PNV' in raw:
            continue
        cleaned = re.sub(r'[^a-z ]+', '', raw.lower())
        # BUG FIX: the original tested the *raw* line here (`if not x`),
        # so lines emptied by the regex still slipped into the list.
        if not cleaned:
            continue
        paragraphs.append(cleaned)

    words = []
    for paragraph in paragraphs:
        words += paragraph.split()

    latinstem = []
    if lemma:
        # NOTE: the lemmatiser capitalizes proper nouns.
        lemmatise = createLemmatiser(Lemmatiseur())
        for word in words:
            stem = lemmatise(word)
            if stem and not roman_numeral.search(stem):
                latinstem.append(stem)
    else:
        for word in words:
            # Three passes strip stacked endings.
            stem = removeEndings(removeEndings(removeEndings(word)))
            if stem and not roman_numeral.search(stem):
                latinstem.append(stem)

    # Removed a dead length-histogram loop that computed `mylen` and
    # discarded it (its print statements were commented out).
    list_for_dict = sorted(set(latinstem))
    # Keep only stems longer than two characters.
    list_for_dict = [stem for stem in list_for_dict if len(stem) > 2]

    prepositions = [removeEndings(prep) for prep in ListOfPrep]
    return DictLatCrossFreq(prepositions, list_for_dict, latinstem)
def text2matrixNdict(path, lemma=False):
    """Build the cross-dictionary matrix for the text at *path*.

    Same pipeline as ``text2matrixNdictFreq`` but ends with
    ``DictLatCross`` instead of ``DictLatCrossFreq``: paragraphs with
    annotation markers ('|', 'HN', 'PHV', 'PNV') are discarded, words are
    cleaned and reduced (lemmatiser or triple ``removeEndings``), Roman
    numerals are filtered out.

    :param path: path of the text file to process
    :param lemma: lemmatise words instead of stripping endings
    :return: the dictionary produced by DictLatCross
    """
    # Roman numerals must be filtered out; compiled once, used in the loop.
    roman_numeral = re.compile(
        r"^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$")

    paragraphs = []
    for raw in getText(path).split('\n'):
        # Drop paragraphs carrying editorial markers (footnotes etc.).
        if '|' in raw or 'HN' in raw or 'PHV' in raw or 'PNV' in raw:
            continue
        cleaned = re.sub(r'[^a-z ]+', '', raw.lower())
        # BUG FIX: the original tested the *raw* line here (`if not x`),
        # so lines emptied by the regex still slipped into the list.
        if not cleaned:
            continue
        paragraphs.append(cleaned)

    words = []
    for paragraph in paragraphs:
        words += paragraph.split()

    latinstem = []
    if lemma:
        lemmatise = createLemmatiser(Lemmatiseur())
        for word in words:
            stem = lemmatise(word)
            if stem and not roman_numeral.search(stem):
                latinstem.append(stem)
    else:
        for word in words:
            # Three passes strip stacked endings.
            stem = removeEndings(removeEndings(removeEndings(word)))
            if stem and not roman_numeral.search(stem):
                latinstem.append(stem)

    # Removed a dead length-histogram loop that computed `mylen` and
    # discarded it (its print statements were commented out).
    list_for_dict = sorted(set(latinstem))
    # Keep only stems longer than two characters.
    list_for_dict = [stem for stem in list_for_dict if len(stem) > 2]

    prepositions = [removeEndings(prep) for prep in ListOfPrep]
    return DictLatCross(prepositions, list_for_dict, latinstem)
示例#9
0
class TestSentences(ExtendedTestCase):
    """End-to-end lemmatisation tests against the fully loaded lexicon.

    The lemmatiser and parser are built once as class attributes because
    loading the complete data set is expensive.
    """

    # Shared, fully loaded lemmatiser used by every test in this class.
    lemmatizer = Lemmatiseur()
    parser = Parser(lemmatizer)

    def test_cogito_ergo_sum(self):
        """Each token of "cogito ergo sum" yields its full analysis set."""
        results = TestSentences.lemmatizer.lemmatise_multiple("cogito ergo sum")
        self.assertLemmatisationMultipleEqual(
            results,
            [
                [{'lemma': 'cogo', 'morph': '2ème singulier impératif futur actif', 'form': 'cogito',
                  'radical': 'cog', 'desinence': 'ito'},
                 {'lemma': 'cogo', 'morph': '3ème singulier impératif futur actif', 'form': 'cogito',
                  'radical': 'cog', 'desinence': 'ito'},
                 {'lemma': 'cogito', 'morph': '1ère singulier indicatif présent actif', 'form': 'cogito',
                  'radical': 'cogit', 'desinence': 'o'},
                 {'lemma': 'cogito', 'morph': '1ère singulier indicatif présent actif', 'form': 'cogito',
                  'radical': 'cogit', 'desinence': 'o'}],
                [{'lemma': 'ergo', 'morph': '1ère singulier indicatif présent actif', 'form': 'ergo',
                  'radical': 'erg', 'desinence': 'o'},
                 {'lemma': 'ergo', 'morph': 'positif', 'form': 'ergo',
                  'radical': 'ergo', 'desinence': ''},
                 {'lemma': 'ergo', 'morph': '-', 'form': 'ergo',
                  'radical': 'ergo', 'desinence': ''}],
                [{'lemma': 'sum', 'morph': '1ère singulier indicatif présent actif', 'form': 'sum',
                  'radical': 's', 'desinence': 'um'}]
            ],
            "Ergo, sum and cogito should be recognized"
        )

    def test_ego_romanus(self):
        """'mihi' maps to the pronoun ego; 'Romanorum' to noun and adjective lemmas."""
        results = TestSentences.lemmatizer.lemmatise_multiple("mihi Romanorum", pos=True)
        self.maxDiff = 5000
        self.assertLemmatisationMultipleEqual(results, [
            [
                {'form': 'mihi', 'morph': 'datif féminin singulier', 'lemma': 'ego', 'pos': 'p', "radical": "", "desinence": "mihi"},
                {'form': 'mihi', 'morph': 'datif masculin singulier', 'lemma': 'ego', 'pos': 'p', "radical": "", "desinence": "mihi"}
            ],
            [
                {'form': 'romanorum', 'pos': 'n', 'morph': 'génitif pluriel', 'lemma': 'Romani', "radical": "Roman", "desinence": "orum"},
                {'form': 'romanorum', 'pos': 'n', 'morph': 'génitif pluriel', 'lemma': 'Romanus', "radical": "Roman", "desinence": "orum"},
                {'form': 'romanorum', 'pos': 'a', 'morph': 'génitif masculin pluriel', 'lemma': 'Romanus', "radical": "Roman", "desinence": "orum"},
                {'form': 'romanorum', 'pos': 'a', 'morph': 'génitif neutre pluriel', 'lemma': 'Romanus', "radical": "Roman", "desinence": "orum"},
            ]
        ])

    def test_nec_aliud_sequenti_quadriduo(self):
        """ Check that aliud, an irregular form, is well behaving as well as nec, an invariable """
        results = TestSentences.lemmatizer.lemmatise_multiple("nec aliud sequenti quadriduo")
        expected = [
            [{'lemma': 'nec', 'morph': '-', 'form': 'nec', 'radical': 'nec', 'desinence': ''}],
            [
                {'form': 'aliud', 'morph': 'nominatif neutre singulier', 'lemma': 'aliud',
                                                                                  'radical': None, 'desinence': None},
                {'form': 'aliud', 'morph': 'vocatif neutre singulier', 'lemma': 'aliud',
                                                                                  'radical': None, 'desinence': None},
                {'form': 'aliud', 'morph': 'accusatif neutre singulier', 'lemma': 'aliud',
                                                                                  'radical': None, 'desinence': None},
            ],
            [
                {'form': 'sequenti', 'morph': 'datif masculin singulier participe présent actif', 'lemma': 'sequor',
                 'radical': 'sequ', 'desinence': 'enti'},
                {'form': 'sequenti', 'morph': 'datif féminin singulier participe présent actif', 'lemma': 'sequor',
                 'radical': 'sequ', 'desinence': 'enti'},
                {'form': 'sequenti', 'morph': 'datif neutre singulier participe présent actif', 'lemma': 'sequor',
                 'radical': 'sequ', 'desinence': 'enti'},
                {'form': 'sequenti', 'morph': 'datif masculin singulier participe présent actif', 'lemma': 'sequo',
                 'radical': 'sequ', 'desinence': 'enti'},
                {'form': 'sequenti', 'morph': 'datif féminin singulier participe présent actif', 'lemma': 'sequo',
                 'radical': 'sequ', 'desinence': 'enti'},
                {'form': 'sequenti', 'morph': 'datif neutre singulier participe présent actif', 'lemma': 'sequo',
                 'radical': 'sequ', 'desinence': 'enti'},
                {'form': 'sequenti', 'morph': 'datif masculin singulier', 'lemma': 'sequens',
                 'radical': 'sequent', 'desinence': 'i'},
                {'form': 'sequenti', 'morph': 'ablatif masculin singulier', 'lemma': 'sequens',
                 'radical': 'sequent', 'desinence': 'i'},
                {'form': 'sequenti', 'morph': 'datif féminin singulier', 'lemma': 'sequens',
                 'radical': 'sequent', 'desinence': 'i'},
                {'form': 'sequenti', 'morph': 'ablatif féminin singulier', 'lemma': 'sequens',
                 'radical': 'sequent', 'desinence': 'i'},
                {'form': 'sequenti', 'morph': 'datif neutre singulier', 'lemma': 'sequens',
                 'radical': 'sequent', 'desinence': 'i'},
                {'form': 'sequenti', 'morph': 'ablatif neutre singulier', 'lemma': 'sequens',
                 'radical': 'sequent', 'desinence': 'i'},
            ],
            [
                {'form': 'quadriduo', 'morph': 'datif singulier', 'lemma': 'quadriduum'
                    , 'radical': 'quadridu', 'desinence': 'o'},
                {'form': 'quadriduo', 'morph': 'ablatif singulier', 'lemma': 'quadriduum'
                    , 'radical': 'quadridu', 'desinence': 'o'}
            ]
        ]
        self.assertLemmatisationMultipleEqual(
            results, expected, "Invar should be correctly recognized"
        )

    def test_possible_forms(self):
        """All inflected forms of 'bellus' (degrees included) are generated."""
        self.assertEqual(
            sorted(list(self.lemmatizer.lemmatise("bellus", get_lemma_object=True))[0]["lemma"].possible_forms()),
            sorted([
                'belliora',
                'bellae',
                'bellam',
                'bellissimis',
                'bellioris',
                'bellissimarum',
                'bellissime',
                'bellorum',
                'bellissimum',
                'belliori',
                'bellissimorum',
                'bellissima',
                'bellum',
                'bellus',
                'bellissimae',
                'bellis',
                'belli',
                'belliores',
                'bellissimo',
                'bellissimas',
                'bellioribus',
                'bellas',
                'bellior',
                'belliore',
                'bellarum',
                'bella',
                'bellissimus',
                'bellissimos',
                'belliorum',
                'belle',
                'bellos',
                'belliorem',
                'bellissimam',
                'bello',
                'bellissimi',
                'bellius'
            ])
        )

    def test_assimilations(self):
        """ Check that lemmatizer handles correctly assimilations """
        results = TestSentences.lemmatizer.lemmatise_multiple("adprehendant expectari")
        self.assertLemmatisationMultipleEqual(
            results,
            [
                [{'lemma': 'apprehendo', 'form': 'apprehendant', 'morph': '3ème pluriel subjonctif présent actif',
                  "radical": "apprehend", "desinence": "ant"}],
                [{'lemma': 'exspecto', 'form': 'exspectari', 'morph': 'infinitif présent passif', "radical": "exspect",
                  "desinence": "ari"}]
            ]
        )

    def test_contractions(self):
        """ Check that the lemmatizer handles correctly contractions """
        results = TestSentences.lemmatizer.lemmatise_multiple("exspirasset legarat legerat", get_lemma_object=True)
        self.assertLemmatisationMultipleEqual(
            results,
            [
                [
                    {'form': 'exspirauisset', 'morph': '3ème singulier subjonctif PQP actif',
                     'lemma': TestSentences.lemmatizer.lemme("exspiro"),
                     'radical': 'exspirav', 'desinence': 'isset'}
                ],
                [
                    {'lemma': TestSentences.lemmatizer.lemme("lego2"),
                     'morph': '3ème singulier indicatif PQP actif', 'form': 'legauerat',
                     'radical': 'legav', 'desinence': 'erat'}
                ],
                [
                    {'lemma': TestSentences.lemmatizer.lemme("lego"),
                     'morph': '3ème singulier indicatif PQP actif', 'form': 'legerat',
                     'radical': 'leg', 'desinence': 'erat'}
                ]
            ],
            _lemma_obj=True
        )

    def test_lower_case(self):
        """Capitalized input matches both capitalized and lower-cased lemmas."""
        results = TestSentences.lemmatizer.lemmatise("Christi", get_lemma_object=True)
        self.assertLemmatisationEqual(
            results,
            [
                {'lemma': TestSentences.lemmatizer.lemme("Christus"), 'form': 'christi', 'morph': 'génitif singulier',
                 'radical': 'Christ', 'desinence': 'i'},
                {'lemma': TestSentences.lemmatizer.lemme("Christus"), 'form': 'christi', 'morph': 'nominatif pluriel',
                 'radical': 'Christ', 'desinence': 'i'},
                {'lemma': TestSentences.lemmatizer.lemme("Christus"), 'form': 'christi', 'morph': 'vocatif pluriel',
                 'radical': 'Christ', 'desinence': 'i'},
                {'lemma': TestSentences.lemmatizer.lemme("christus2"), 'form': 'christi', 'morph': 'génitif masculin singulier',
                 'radical': 'christ', 'desinence': 'i'},
                {'lemma': TestSentences.lemmatizer.lemme("christus2"), 'form': 'christi', 'morph': 'nominatif masculin pluriel',
                 'radical': 'christ', 'desinence': 'i'},
                {'lemma': TestSentences.lemmatizer.lemme("christus2"), 'form': 'christi', 'morph': 'vocatif masculin pluriel',
                 'radical': 'christ', 'desinence': 'i'},
                {'lemma': TestSentences.lemmatizer.lemme("christus2"), 'form': 'christi', 'morph': 'génitif neutre singulier',
                 'radical': 'christ', 'desinence': 'i'}
            ], _lemma_obj=True
        )

    def test_roman_num(self):
        """Roman numerals are recognized as invariable numeral adjectives."""
        results = TestSentences.lemmatizer.lemmatise_multiple("XIV MDCXXIV xiv", get_lemma_object=True, as_list=True)
        self.assertLemmatisationMultipleEqual(
            results,
            [
                [
                    {'lemma': TestSentences.parser.parse_lemme("XIV|inv|||adj. num.|1", 0, _deramise=False),
                     'form': 'XIV', 'morph': '', 'radical': None, 'desinence': None},
                ],
                [
                    {'lemma': TestSentences.parser.parse_lemme("MDCXXIV|inv|||adj. num.|1", 0, _deramise=False),
                     'form': 'MDCXXIV', 'morph': '', 'radical': None, 'desinence': None},
                ],
                [
                    {'lemma': TestSentences.parser.parse_lemme("XIV|inv|||adj. num.|1", 0, _deramise=False),
                     'form': 'XIV', 'morph': '', 'radical': None, 'desinence': None},
                ]
            ], _lemma_obj=True
        )

    def test_when_there_is_a_non_word_char(self):
        """Punctuation and extra whitespace do not create spurious tokens."""
        results = TestSentences.lemmatizer.lemmatise_multiple(
            "Qui, quae , quod ! ", pos=True
        )
        self.assertEqual(len(results), 3, "Splitting should be operational")

    def test_when_there_is_a_suffixe(self):
        """Suffixed forms (e.g. '-que') still resolve to the right lemmas."""
        results = TestSentences.lemmatizer.lemmatise_multiple(
            "Et flavescit haphe gravesque draucis ", pos=True
        )
        self.assertLemmatisationMultipleEqual(
            results,
            [[{'pos': 'cd', 'form': 'et', 'lemma': 'et', 'morph': '-',
               'desinence': '', 'radical': 'et'}],
             [{'pos': 'v', 'form': 'flauescit', 'lemma': 'flavesco', 'morph': '3ème singulier indicatif présent actif',
               'desinence': 'it', 'radical': 'flavesc'}],
             [{'pos': 'n', 'form': 'haphe', 'lemma': 'haphe', 'morph': 'nominatif singulier',
               'desinence': 'e', 'radical': 'haph'},
              {'pos': 'n', 'form': 'haphe', 'lemma': 'haphe', 'morph': 'vocatif singulier',
               'desinence': 'e', 'radical': 'haph'},
              {'pos': 'n', 'form': 'haphe', 'lemma': 'haphe', 'morph': 'ablatif singulier',
               'desinence': 'e', 'radical': 'haph'}],
             [{'pos': 'a', 'form': 'graues', 'lemma': 'gravis', 'morph': 'nominatif masculin pluriel',
                 'desinence': 'es', 'radical': 'grav'},
              {'pos': 'a', 'form': 'graues', 'lemma': 'gravis', 'morph': 'vocatif masculin pluriel',
                 'desinence': 'es', 'radical': 'grav'},
              {'pos': 'a', 'form': 'graues', 'lemma': 'gravis', 'morph': 'accusatif masculin pluriel',
                 'desinence': 'es', 'radical': 'grav'},
              {'pos': 'a', 'form': 'graues', 'lemma': 'gravis', 'morph': 'nominatif féminin pluriel',
                 'desinence': 'es', 'radical': 'grav'},
              {'pos': 'a', 'form': 'graues', 'lemma': 'gravis', 'morph': 'vocatif féminin pluriel',
                 'desinence': 'es', 'radical': 'grav'},
              {'pos': 'a', 'form': 'graues', 'lemma': 'gravis', 'morph': 'accusatif féminin pluriel',
                 'desinence': 'es', 'radical': 'grav'},
              {'pos': 'v', 'form': 'graues', 'lemma': 'grauo', 'morph': '2ème singulier subjonctif présent actif',
                 'desinence': 'es', 'radical': 'grau'}
              ],
             [{'pos': 'n', 'form': 'draucis', 'lemma': 'Draucus', 'morph': 'datif pluriel',
               'desinence': 'is', 'radical': 'Drauc'},
              {'pos': 'n', 'form': 'draucis', 'lemma': 'Draucus', 'morph': 'ablatif pluriel',
               'desinence': 'is', 'radical': 'Drauc'},
              {'pos': 'n', 'form': 'draucis', 'lemma': 'draucus', 'morph': 'datif pluriel',
               'desinence': 'is', 'radical': 'drauc'},
              {'pos': 'n', 'form': 'draucis', 'lemma': 'draucus', 'morph': 'ablatif pluriel',
               'desinence': 'is', 'radical': 'drauc'}
              ]
             ]
        )
示例#10
0
# Minimal usage example: lemmatise a short Latin phrase.
from pycollatinus import Lemmatiseur

lemmatizer = Lemmatiseur()
results = lemmatizer.lemmatise_multiple("arma virum cano")
print(results)
Example #11
0
    def test_contraction(self):
        """Check that syncopated (contracted) perfect forms are expanded.

        Builds a minimal in-memory lemmatiser with three conjugation models
        (``amo``, ``moneo``, ``lego``) and two look-alike lemmas, then
        verifies that the contracted form ``legarat`` is resolved to the
        full form ``legauerat`` (lemma ``lego2``) while the regular form
        ``legerat`` maps to ``lego`` unchanged.
        """

        # Fresh lemmatiser with no precompiled data; everything is parsed
        # below from inline model/lemma definitions.
        x = Lemmatiseur(load=False)
        parser = Parser(x)
        parser.ajMorphos()
        # Contraction rules are the feature under test here.
        parser.ajContractions()
        parser.ajAssims()
        load_mod_vars(x)
        # Model definitions use Collatinus' model syntax:
        #   R:<idx>:<cut>,<suffix>  radical derivation rules,
        #   des:<morph-ids>:<radical-idx>:<endings>  desinence tables,
        #   pere:<model>  inheritance from a parent model.
        # (Presumably mirroring upstream Collatinus .la data files —
        # NOTE(review): confirm against the bundled data format.)
        amo = parser.parse_modele("""modele:amo
R:0:1,0
R:1:1,āv
R:2:1,āt
des:121-126:0:ō̆;ās;ăt;āmŭs;ātĭs;ānt
des:127-132:0:ābăm;ābās;ābăt;ābāmŭs;ābātĭs;ābānt
des:133-138:0:ābō̆;ābĭs;ābĭt;ābĭmŭs;ābĭtĭs;ābūnt
des:139-144:1:ī;īstī;ĭt;ĭmŭs;īstĭs;ērūnt,ērĕ
des:145-150:1:ĕrăm;ĕrās;ĕrăt;ĕrāmŭs;ĕrātĭs;ĕrānt
des:151-156:1:ĕrō̆;ĕrī̆s;ĕrĭt;ĕrī̆mŭs;ĕrī̆tĭs;ĕrīnt
des:157-162:0:$em
des:163-168:0:ārĕm;ārēs;ārĕt;ārēmŭs;ārētĭs;ārēnt
des:169-174:1:ĕrĭm;ĕrī̆s;ĕrĭt;ĕrī̆mŭs;ĕrī̆tĭs;ĕrīnt
des:175-180:1:īssĕm;īssēs;īssĕt;īssēmŭs;īssētĭs;īssēnt
des:181-186:0:ā;ātĕ;ātō;ātō;ātōtĕ;āntō
des:187:0:ārĕ
des:188:1:īssĕ
des:188:0:āssĕ
des:189-200:0:āns;āns;āntĕm;āntĭs;āntī;āntĕ;āntēs;āntēs;āntēs;āntĭŭm,āntŭm;āntĭbŭs;āntĭbŭs
des:201-212:0:āns;āns;āntĕm;āntĭs;āntī;āntĕ;āntēs;āntēs;āntēs;āntĭŭm,āntŭm;āntĭbŭs;āntĭbŭs
des:213-224:0:āns;āns;āns;āntĭs;āntī;āntĕ;āntĭă;āntĭă;āntĭă;āntĭŭm,āntŭm;āntĭbŭs;āntĭbŭs
des:225-236:2:ūr$lupus
des:237-248:2:ūr$uita
des:249-260:2:ūr$templum
des:261-264:0:āndŭm;āndī;āndō;āndō
des:265,266:2:ŭm;ū
des:267-272:0:ŏr;ārĭs,ārĕ;ātŭr;āmŭr;āmĭnī;āntŭr
des:273-278:0:ābăr;ābārĭs,ābārĕ;ābātŭr;ābāmŭr;ābāmĭnī;ābāntŭr
des:279-284:0:ābŏr;ābĕrĭs,ābĕrĕ;ābĭtŭr;ābĭmŭr;ābĭmĭnī;ābūntŭr
des:285-290:0:ĕr;ērĭs,ērĕ;ētŭr;ēmŭr;ēmĭnī;ēntŭr
des:291-296:0:ārĕr;ārērĭs,ārērĕ;ārētŭr;ārēmŭr;ārēmĭnī;ārēntŭr
des:297,298:0:ārĕ;āmĭnī
des:299-301:0:ātŏr;ātŏr;āntŏr
des:302:0:ārī
des:303-314:2:$lupus
des:315-326:2:$uita
des:327-338:2:$templum
des:339-350:0:ānd$lupus
des:351-362:0:ānd$uita
des:363-374:0:ānd$templum""".split("\n"))
        # Models are registered directly on the lemmatiser's model table
        # (other tests in this file go through parser.register_modele —
        # NOTE(review): confirm the two registration paths are equivalent).
        x._modeles[amo.gr()] = amo
        # Second conjugation; inherits from amo and overrides the tables
        # that differ (pere:amo).
        moneo = parser.parse_modele("""modele:moneo
pere:amo
R:0:2,0
R:2:-
des:121-126:0:ĕō̆;ēs;ĕt;ēmŭs;ētĭs;ēnt
des:127-132:0:ēbăm;ēbās;ēbăt;ēbāmŭs;ēbātĭs;ēbānt
des:133-138:0:ēbō̆;ēbĭs;ēbĭt;ēbĭmŭs;ēbĭtĭs;ēbūnt
des:157-162:0:ĕăm;ĕās;ĕăt;ĕāmŭs;ĕātĭs;ĕānt
des:163-168:0:ērĕm;ērēs;ērĕt;ērēmŭs;ērētĭs;ērēnt
des:181-186:0:ē;ētĕ;ētō;ētō;ētōtĕ;ēntō
des:187:0:ērĕ
des:188:1:īssĕ
des:189-200:0:$ens
des:201-212:0:$ens
des:213-224:0:ēns;ēns;ēns;ēntĭs;ēntī;ēntĕ;ēntĭă;ēntĭă;ēntĭă;ēntĭŭm,ēntŭm;ēntĭbŭs;ēntĭbŭs
des:261-264:0:ēndŭm;ēndī;ēndō;ēndō
des:267-272:0:ĕŏr;ērĭs,ērĕ;ētŭr;ēmŭr;ēmĭnī;ēntŭr
des:273-278:0:ēbăr;ēbārĭs,ēbārĕ;ēbātŭr;ēbāmŭr;ēbāmĭnī;ēbāntŭr
des:279-284:0:ēbŏr;ēbĕrĭs,ēbĕrĕ;ēbĭtŭr;ēbĭmŭr;ēbĭmĭnī;ēbūntŭr
des:285-290:0:ĕăr;ĕārĭs,ĕārĕ;ĕātŭr;ĕāmŭr;ĕāmĭnī;ĕāntŭr
des:291-296:0:ērĕr;ērērĭs,ērērĕ;ērētŭr;ērēmŭr;ērēmĭnī;ērēntŭr
des:297,298:0:ērĕ;ēmĭnī
des:299-301:0:ētŏr;ētŏr;ēntŏr
des:302:0:ērī
des:339-350:0:ēndŭs;ēndĕ;ēndŭm;ēndī;ēndō;ēndō;ēndī;ēndī;ēndōs;ēndōrŭm;ēndīs;ēndīs
des:351-362:0:ēndă;ēndă;ēndăm;ēndāe;ēndāe;ēndā;ēndāe;ēndāe;ēndās;ēndārŭm;ēndīs;ēndīs
des:363-374:0:ēndŭm;ēndŭm;ēndŭm;ēndī;ēndō;ēndō;ēndă;ēndă;ēndă;ēndōrŭm;ēndīs;ēndīs"""
                                    .split("\n"))
        x._modeles[moneo.gr()] = moneo
        # Third conjugation; inherits from moneo.
        lego = parser.parse_modele("""modele:lego
pere:moneo
R:0:1,0
des:121-126:0:ō̆;ĭs;ĭt;ĭmŭs;ĭtĭs;ūnt
des:133-138:0:ăm;ēs;ĕt;ēmŭs;ētĭs;ēnt
des:157-162:0:ăm;ās;ăt;āmŭs;ātĭs;ānt
des:163-168:0:ĕrĕm;ĕrēs;ĕrĕt;ĕrēmŭs;ĕrētĭs;ĕrēnt
des:181-186:0:ĕ;ĭtĕ;ĭtō;ĭtō;ĭtōtĕ;ūntō
des:187:0:ĕrĕ
des:267-272:0:ŏr;ĕrĭs,ĕrĕ;ĭtŭr;ĭmŭr;ĭmĭnī;ūntŭr
des:279-284:0:ăr;ērĭs,ērĕ;ētŭr;ēmŭr;ēmĭnī;ēntŭr
des:285-290:0:ăr;ārĭs,ārĕ;ātŭr;āmŭr;āmĭnī;āntŭr
des:291-296:0:ĕrĕr;ĕrērĭs,ĕrērĕ;ĕrētŭr;ĕrēmŭr;ĕrēmĭnī;ĕrēntŭr
des:297,298:0:ĕrĕ;ĭmĭnī
des:299-301:0:ĭtŏr;ĭtŏr;ūntŏr
des:302:0:ī""".split("\n"))
        x._modeles[lego.gr()] = lego

        # Two near-homograph lemmas: lēgo2 follows the first-conjugation
        # model (amo), lĕgo the third-conjugation model (lego).
        parser.parse_lemme("lēgo2|amo|||as, are|34", origin=0)
        parser.parse_lemme("lĕgo|lego|lēg|lēct|is, ere, legi, lectum|619",
                           origin=0)

        # "legarat" is a contracted (syncopated) pluperfect: the lemmatiser
        # is expected to report the expanded form "legauerat" under lego2,
        # while the uncontracted "legerat" resolves normally to lego.
        self.assertEqual(
            x.lemmatise_multiple("legarat legerat", get_lemma_object=True),
            [[{
                'lemma': x.lemme("lego2"),
                'morph': '3ème singulier indicatif PQP actif',
                'form': 'legauerat',
                "radical": "legav",
                "desinence": "erat"
            }],
             [{
                 'lemma': x.lemme("lego"),
                 'morph': '3ème singulier indicatif PQP actif',
                 'form': 'legerat',
                 "radical": "leg",
                 "desinence": "erat"
             }]])