#!/usr/bin/env python3

from morphgnt.utils import load_yaml, load_wordset, sorted_items

# Cross-check the word list against the lexeme metadata: print one
# pipe-delimited row for every lexeme that is already in the word list, has
# dodson-pos "N-PRI", or has mounce-morphcat "n-3g(2)".
lexemes = load_yaml("lexemes.yaml")
already = load_wordset("nominal-indeclinable.txt")

for lexeme, metadata in sorted_items(lexemes):
    danker = metadata.get("danker-entry", "")
    dodson_pos = metadata.get("dodson-pos", "")
    mounce_morphcat = metadata.get("mounce-morphcat", "")

    listed = lexeme in already
    flagged = dodson_pos == "N-PRI" or mounce_morphcat == "n-3g(2)"
    if not (listed or flagged):
        continue

    row = (lexeme, danker, dodson_pos, mounce_morphcat,
           "yes" if listed else "no")
    print("{:20}|{:45}|{:10}|{:10}|{:5}".format(*row))

    # tick off word-list entries as they are reported
    if listed:
        already.remove(lexeme)

# whatever is left was in the word list but never matched a lexeme key
print(already)
# Example #2
# 0
#!/usr/bin/env python3

from morphgnt.utils import load_yaml, sorted_items

# Table of lexemes; the loop below reads each entry's optional
# "full-citation-form" value.
lexemes = load_yaml("lexemes.yaml")

# skip these for now until we work out how to handle them
SKIP = ["Ἀππίου", "Λιμήν", "Πάγος", "Πόλις", "Ταβέρνη", "Φόρον"]

# Print each lexeme's full citation form and assert that every
# " / "-separated alternative in it has one of the expected layouts.
for lexeme, metadata in sorted_items(lexemes):
    if "full-citation-form" in metadata and lexeme not in SKIP:
        # compare against the part of the key before the first "/"
        lexeme = lexeme.split("/")[0]
        citation_form = metadata["full-citation-form"]
        print("{}: {}".format(lexeme, citation_form))
        for alt in citation_form.split(" / "):
            components = alt.split(", ")
            assert len(components) <= 6
            if len(components) == 1:
                # the lexeme alone
                assert components[0] == lexeme
            elif len(components) == 2:
                # lexeme plus one of the listed tokens (presumably the
                # article marking gender — confirm)
                assert components[0] == lexeme
                assert components[1] in ["ὁ", "ἡ", "τό"]
            elif len(components) == 3:
                if components[2].startswith(("acc.", "dat.", "pl.")):
                    # third component is an "acc."/"dat."/"pl." note, so the
                    # listed token is expected in second position
                    assert components[0] == lexeme
                    assert components[1] in ["ὁ", "ἡ", "τό", "τά"]
                else:
                    # otherwise the listed token is expected in third
                    # position; the second component is not checked
                    assert components[0] == lexeme
                    assert components[2] in [
                        "ὁ", "ἡ", "τό", "ὁ/ἡ", "ὁ/τό", "οἱ", "αἱ", "τά"
                    ]
            # NOTE(review): alternatives with 4-6 components pass the length
            # assertion but are not checked by any branch visible here --
            # TODO confirm whether further branches exist.
        elif mood in "DSO":
            pass
        elif mood in "P":
            pass
        elif mood in "N":
            pass
        else:
            raise ValueError


# Lemmas that the loop below passes over without processing.
SKIP_LIST = [
    "σαβαχθάνι",
    "χρή",
]

# For every lemma not in SKIP_LIST, walk its tense/voice keys in sorted order
# and try each candidate entry of ENDINGS[tense_voice] in turn.
for lemma, form_dict in sorted_items(forms):
    if lemma in SKIP_LIST:
        continue

    # blank line before each lemma's output
    print()
    for tense_voice in sorted(form_dict):
        first_singular_forms = set()
        for endings in ENDINGS[tense_voice]:
            fail = False
            stems = []
            num = endings["num"]
            first_singular = endings["1S"]
            for person_number, ending in sorted(endings.items()):
                # "num" is read separately above, so it is not treated as an
                # ending here
                if person_number == "num":
                    continue
                # split "/"-separated alternatives and put them in sorted order
                ending = sorted(ending.split("/"))
                # NOTE(review): the loop body appears to continue beyond this
                # excerpt; fail, stems, num, first_singular and
                # first_singular_forms are not used in the lines visible here.
# Example #4
# 0
                gender = row["ccat-parse"][6]
                case_number = row["ccat-parse"][4:6]
                form_list = forms.setdefault(lemma, {}).setdefault(tense_voice_mood, {}).setdefault(gender, {}).setdefault(case_number, {}).setdefault("forms", [])
                if {"form": form} not in form_list:
                    form_list.append({"form": form})
            else:
                print >>sys.stderr, "*** can't handle mood {}".format(mood)
        elif lexeme["pos"] in ["P", "X"]:
            form_list = forms.setdefault(lemma, {}).setdefault("forms", [])
            if {"form": form} not in form_list:
                form_list.append({"form": form})
    else:
        print >>sys.stderr, "lexemes file doesn't have {}".format(row["lemma"])


# Print the collected forms as nested, indented text, one top-level entry per
# key of `forms` (Python 2 print statements; values are UTF-8 encoded first).
for form, metadata in sorted_items(forms):
    print "{}:".format(form.encode("utf-8"))
    pos = lexemes[form]["pos"]
    if pos in ["RA", "A", "N", "RR"]:
        # these entries are nested by gender, then by case/number
        for gender in ["M", "F", "N"]:
            if gender in metadata:
                print "    {}:".format(gender)
                for case_number in ["NS", "AS", "GS", "DS", "VS", "NP", "AP", "GP", "DP", "VP"]:
                    if case_number in metadata[gender]:
                        print "        {}:".format(case_number)
                        print "            forms:"
                        # NOTE(review): this reuses the outer loop's name
                        # `form`; the outer value is not read again in the
                        # lines visible here.
                        for form in metadata[gender][case_number]["forms"]:
                            print "                -"
                            print "                    form: {}".format(form["form"].encode("utf-8"))
    elif pos in ["RP1"]:
        # NOTE(review): the body of this loop is not visible in this excerpt.
        for case_number in ["NS", "AS", "GS", "DS", "VS", "NP", "AP", "GP", "DP", "VP"]:
                strip_accents(row["norm"]))
        elif mood in "DSO":
            pass
        elif mood in "P":
            pass
        elif mood in "N":
            pass
        else:
            raise ValueError

# Lemmas that the loop below passes over without processing.
SKIP_LIST = [
    "σαβαχθάνι",
    "χρή",
]

# For every lemma not in SKIP_LIST, walk its tense/voice keys in sorted order
# and try each candidate entry of ENDINGS[tense_voice] in turn.
for lemma, form_dict in sorted_items(forms):
    if lemma in SKIP_LIST:
        continue

    # blank line before each lemma's output
    print()
    for tense_voice in sorted(form_dict):
        first_singular_forms = set()
        for endings in ENDINGS[tense_voice]:
            fail = False
            stems = []
            num = endings["num"]
            first_singular = endings["1S"]
            for person_number, ending in sorted(endings.items()):
                # "num" is read separately above, so it is not treated as an
                # ending here
                if person_number == "num":
                    continue
                # split "/"-separated alternatives and put them in sorted order
                ending = sorted(ending.split("/"))
                # NOTE(review): the loop body appears to continue beyond this
                # excerpt; fail, stems, num, first_singular and
                # first_singular_forms are not used in the lines visible here.
        # syncretism

        if parse[2] == "N":
            if parse[0] == "N":
                forms[norm].add("A" + parse[1:])
            if parse[0] == "A":
                forms[norm].add("N" + parse[1:])
        if "n-1a" in mounce:
            if parse == "APF":
                forms[norm].add("GSF")
            if parse == "GSF":
                forms[norm].add("APF")
        if "n-1d" in mounce:
            if parse == "NSM":
                forms[norm].add("APM")
            if parse == "APM":
                forms[norm].add("NSM")
        if "n-3e(3)" in mounce or "n-3e(5b)" in mounce:
            if parse[:2] == "AP":
                forms[norm].add("NP" + parse[2])
            if parse[:2] == "NP":
                forms[norm].add("AP" + parse[2])

# Emit one pipe-delimited line per form: the form, its categories joined with
# ":" (or "@@@" when there are none), its sorted data joined with ":", and
# its count.
for form, data in sorted_items(forms):
    categories = cats[form]
    cat_string = ":".join(categories) if categories else "@@@"
    data_string = ":".join(sorted(data))
    print("{}|{}|{}|{}".format(form, cat_string, data_string, counts[form]))
        if parse[2] == "N":
            if parse[0] == "N":
                forms[norm].add("A" + parse[1:])
            if parse[0] == "A":
                forms[norm].add("N" + parse[1:])
        if "n-1a" in mounce:
            if parse == "APF":
                forms[norm].add("GSF")
            if parse == "GSF":
                forms[norm].add("APF")
        if "n-1d" in mounce:
            if parse == "NSM":
                forms[norm].add("APM")
            if parse == "APM":
                forms[norm].add("NSM")
        if "n-3e(3)" in mounce or "n-3e(5b)" in mounce:
            if parse[:2] == "AP":
                forms[norm].add("NP" + parse[2])
            if parse[:2] == "NP":
                forms[norm].add("AP" + parse[2])


# Emit one pipe-delimited line per form: the form, its categories joined with
# ":" (or "@@@" when there are none), its sorted data joined with ":", and
# its count.
for form, data in sorted_items(forms):
    categories = cats[form]
    cat_string = ":".join(categories) if categories else "@@@"
    data_string = ":".join(sorted(data))
    print("{}|{}|{}|{}".format(form, cat_string, data_string, counts[form]))
# Example #8
# 0
#!/usr/bin/env python
# coding: utf-8

import sys

from morphgnt.utils import load_yaml, sorted_items

# Per-lexeme records; each one's "components" string is read in the loop below.
danker = load_yaml("../data-cleanup/danker-concise-lexicon/components.yaml")

# Existing derivation entries, looked up by lexeme.
derivation = load_yaml("derivation.yaml")

# NOTE(review): these counters are initialised here but not updated in the
# lines visible in this excerpt.
skipped = 0
existing = 0
added = 0
# Print a "lexeme:" header for every lexeme (Python 2 print statements).
for lexeme, metadata in sorted_items(danker):
    components = metadata["components"].strip()

    print "{}:".format(lexeme.encode("utf-8"))
    if lexeme in derivation:
        # only non-empty derivation entries get the helper below
        if derivation[lexeme]:

            # Print derivation[lexeme][key], if present, as an indented entry:
            # a list value becomes a "key:" header plus one "- item" line per
            # element; any other value is printed inline as "key: value".
            # Values are UTF-8 encoded before printing; `derivation` and
            # `lexeme` are read from the enclosing scope.
            def q(key):
                if key in derivation[lexeme]:
                    if isinstance(derivation[lexeme][key], list):
                        print "    {}:".format(key)
                        for item in derivation[lexeme][key]:
                            print "        - {}".format(item.encode("utf-8"))
                    else:
                        print "    {}: {}".format(
                            key, derivation[lexeme][key].encode("utf-8"))
#!/usr/bin/env python
# coding: utf-8

import sys

from morphgnt.utils import load_yaml, sorted_items

danker = load_yaml("../data-cleanup/danker-concise-lexicon/components.yaml")

derivation = load_yaml("derivation.yaml")

skipped = 0
existing = 0
added = 0
for lexeme, metadata in sorted_items(danker):
    components = metadata["components"].strip()

    print "{}:".format(lexeme.encode("utf-8"))
    if lexeme in derivation:
        if derivation[lexeme]:

            # Print derivation[lexeme][key], if present, as an indented entry:
            # a list value becomes a "key:" header plus one "- item" line per
            # element; any other value is printed inline as "key: value".
            # Values are UTF-8 encoded before printing; `derivation` and
            # `lexeme` are read from the enclosing scope.
            def q(key):
                if key in derivation[lexeme]:
                    if isinstance(derivation[lexeme][key], list):
                        print "    {}:".format(key)
                        for item in derivation[lexeme][key]:
                            print "        - {}".format(item.encode("utf-8"))
                    else:
                        print "    {}: {}".format(key, derivation[lexeme][key].encode("utf-8"))

            q("derivation")