#!/usr/bin/env python3
"""Report lexemes that are, or look like, indeclinable nominals.

A lexeme is listed when it is already in nominal-indeclinable.txt, or
when its Dodson part-of-speech / Mounce morphological category suggests
an indeclinable noun.  Entries of the wordset never matched against
lexemes.yaml are printed at the end.
"""

from morphgnt.utils import load_yaml, load_wordset, sorted_items

lexemes = load_yaml("lexemes.yaml")
already = load_wordset("nominal-indeclinable.txt")

# column layout: lexeme | danker entry | dodson pos | mounce cat | in wordset?
ROW_TEMPLATE = "{:20}|{:45}|{:10}|{:10}|{:5}"

for lexeme, metadata in sorted_items(lexemes):
    danker_entry = metadata.get("danker-entry", "")
    dodson_pos = metadata.get("dodson-pos", "")
    mounce_morphcat = metadata.get("mounce-morphcat", "")
    in_wordset = lexeme in already

    # skip lexemes with no indication of indeclinability
    if not (in_wordset or dodson_pos == "N-PRI" or mounce_morphcat == "n-3g(2)"):
        continue

    print(ROW_TEMPLATE.format(
        lexeme,
        danker_entry,
        dodson_pos,
        mounce_morphcat,
        "yes" if in_wordset else "no",
    ))
    if in_wordset:
        # tick off matched entries so only leftovers remain below
        already.remove(lexeme)

# whatever is left was in the wordset but absent from lexemes.yaml
print(already)
#!/usr/bin/env python3 from morphgnt.utils import load_yaml, sorted_items lexemes = load_yaml("lexemes.yaml") # skip these for now until we work out how to handle them SKIP = ["Ἀππίου", "Λιμήν", "Πάγος", "Πόλις", "Ταβέρνη", "Φόρον"] for lexeme, metadata in sorted_items(lexemes): if "full-citation-form" in metadata and lexeme not in SKIP: lexeme = lexeme.split("/")[0] citation_form = metadata["full-citation-form"] print("{}: {}".format(lexeme, citation_form)) for alt in citation_form.split(" / "): components = alt.split(", ") assert len(components) <= 6 if len(components) == 1: assert components[0] == lexeme elif len(components) == 2: assert components[0] == lexeme assert components[1] in ["ὁ", "ἡ", "τό"] elif len(components) == 3: if components[2].startswith(("acc.", "dat.", "pl.")): assert components[0] == lexeme assert components[1] in ["ὁ", "ἡ", "τό", "τά"] else: assert components[0] == lexeme assert components[2] in [ "ὁ", "ἡ", "τό", "ὁ/ἡ", "ὁ/τό", "οἱ", "αἱ", "τά" ]
# NOTE(review): chunk starts mid-construct — the earlier branches of
# this mood dispatch and its enclosing loop are outside the visible
# source; the indentation below is reconstructed.
        elif mood in "DSO":
            # single-letter mood codes — these moods intentionally skipped
            pass
        elif mood in "P":
            pass
        elif mood in "N":
            pass
        else:
            # unrecognised mood code
            raise ValueError

SKIP_LIST = [
    "σαβαχθάνι",  # presumably unparadigmable (transliteration) — confirm
    "χρή",
]

for lemma, form_dict in sorted_items(forms):
    if lemma in SKIP_LIST:
        continue
    # blank line between lemmas in the output
    print()
    for tense_voice in sorted(form_dict):
        first_singular_forms = set()
        # try each candidate set of endings for this tense/voice
        for endings in ENDINGS[tense_voice]:
            fail = False
            stems = []
            num = endings["num"]
            first_singular = endings["1S"]
            for person_number, ending in sorted(endings.items()):
                if person_number == "num":
                    # metadata key, not a person/number ending
                    continue
                # an ending may list "/"-separated alternatives
                ending = sorted(ending.split("/"))
# NOTE(review): chunk is truncated here.
# NOTE(review): chunk starts mid-construct and is Python 2 (print
# statements).  The enclosing loop and outer conditionals are outside
# the visible source; indentation below is reconstructed so the
# else/elif branches pair up plausibly.
                gender = row["ccat-parse"][6]
                case_number = row["ccat-parse"][4:6]
                # nested mapping: lemma -> tense/voice/mood -> gender ->
                # case+number -> {"forms": [...]}
                form_list = forms.setdefault(lemma, {}).setdefault(tense_voice_mood, {}).setdefault(gender, {}).setdefault(case_number, {}).setdefault("forms", [])
                if {"form": form} not in form_list:
                    form_list.append({"form": form})
            else:
                print >>sys.stderr, "*** can't handle mood {}".format(mood)
        elif lexeme["pos"] in ["P", "X"]:
            # uninflected parts of speech: flat list of forms per lemma
            form_list = forms.setdefault(lemma, {}).setdefault("forms", [])
            if {"form": form} not in form_list:
                form_list.append({"form": form})
    else:
        print >>sys.stderr, "lexemes file doesn't have {}".format(row["lemma"])

# dump collected forms as indented, YAML-like output
for form, metadata in sorted_items(forms):
    print "{}:".format(form.encode("utf-8"))
    pos = lexemes[form]["pos"]
    if pos in ["RA", "A", "N", "RR"]:
        # gendered paradigms
        for gender in ["M", "F", "N"]:
            if gender in metadata:
                print " {}:".format(gender)
                for case_number in ["NS", "AS", "GS", "DS", "VS", "NP", "AP", "GP", "DP", "VP"]:
                    if case_number in metadata[gender]:
                        print " {}:".format(case_number)
                        print " forms:"
                        for form in metadata[gender][case_number]["forms"]:
                            print " -"
                            print " form: {}".format(form["form"].encode("utf-8"))
    elif pos in ["RP1"]:
        # first-person pronoun: no gender dimension
        for case_number in ["NS", "AS", "GS", "DS", "VS", "NP", "AP", "GP", "DP", "VP"]:
# NOTE(review): chunk is truncated here.
# NOTE(review): chunk starts mid-expression — this closing call belongs
# to a statement begun outside the visible source; indentation below is
# reconstructed.
                strip_accents(row["norm"]))
        elif mood in "DSO":
            # single-letter mood codes — these moods intentionally skipped
            pass
        elif mood in "P":
            pass
        elif mood in "N":
            pass
        else:
            # unrecognised mood code
            raise ValueError

SKIP_LIST = [
    "σαβαχθάνι",  # presumably unparadigmable (transliteration) — confirm
    "χρή",
]

for lemma, form_dict in sorted_items(forms):
    if lemma in SKIP_LIST:
        continue
    # blank line between lemmas in the output
    print()
    for tense_voice in sorted(form_dict):
        first_singular_forms = set()
        # try each candidate set of endings for this tense/voice
        for endings in ENDINGS[tense_voice]:
            fail = False
            stems = []
            num = endings["num"]
            first_singular = endings["1S"]
            for person_number, ending in sorted(endings.items()):
                if person_number == "num":
                    # metadata key, not a person/number ending
                    continue
                # an ending may list "/"-separated alternatives
                ending = sorted(ending.split("/"))
# NOTE(review): chunk is truncated here.
# syncretism if parse[2] == "N": if parse[0] == "N": forms[norm].add("A" + parse[1:]) if parse[0] == "A": forms[norm].add("N" + parse[1:]) if "n-1a" in mounce: if parse == "APF": forms[norm].add("GSF") if parse == "GSF": forms[norm].add("APF") if "n-1d" in mounce: if parse == "NSM": forms[norm].add("APM") if parse == "APM": forms[norm].add("NSM") if "n-3e(3)" in mounce or "n-3e(5b)" in mounce: if parse[:2] == "AP": forms[norm].add("NP" + parse[2]) if parse[:2] == "NP": forms[norm].add("AP" + parse[2]) for form, data in sorted_items(forms): if cats[form]: cat_string = ":".join(cats[form]) else: cat_string = "@@@" print("{}|{}|{}|{}".format(form, cat_string, ":".join(sorted(data)), counts[form]))
# NOTE(review): chunk starts mid-loop — `parse`, `norm`, `mounce` and
# `forms` come from an enclosing iteration outside the visible source;
# indentation below is reconstructed.  (Syncretism rules: also record
# parses whose surface form is shared.)
    if parse[2] == "N":
        # neuter: nominative and accusative share forms
        if parse[0] == "N":
            forms[norm].add("A" + parse[1:])
        if parse[0] == "A":
            forms[norm].add("N" + parse[1:])
    if "n-1a" in mounce:
        # first-declension class: APF and GSF treated as shared
        if parse == "APF":
            forms[norm].add("GSF")
        if parse == "GSF":
            forms[norm].add("APF")
    if "n-1d" in mounce:
        # NSM and APM treated as shared for this class
        if parse == "NSM":
            forms[norm].add("APM")
        if parse == "APM":
            forms[norm].add("NSM")
    if "n-3e(3)" in mounce or "n-3e(5b)" in mounce:
        # nominative/accusative plural treated as shared for these classes
        if parse[:2] == "AP":
            forms[norm].add("NP" + parse[2])
        if parse[:2] == "NP":
            forms[norm].add("AP" + parse[2])

for form, data in sorted_items(forms):
    # "@@@" is the placeholder when no categories were recorded
    if cats[form]:
        cat_string = ":".join(cats[form])
    else:
        cat_string = "@@@"
    print("{}|{}|{}|{}".format(
        form, cat_string, ":".join(sorted(data)), counts[form]))
#!/usr/bin/env python # coding: utf-8 import sys from morphgnt.utils import load_yaml, sorted_items danker = load_yaml("../data-cleanup/danker-concise-lexicon/components.yaml") derivation = load_yaml("derivation.yaml") skipped = 0 existing = 0 added = 0 for lexeme, metadata in sorted_items(danker): components = metadata["components"].strip() print "{}:".format(lexeme.encode("utf-8")) if lexeme in derivation: if derivation[lexeme]: def q(key): if key in derivation[lexeme]: if isinstance(derivation[lexeme][key], list): print " {}:".format(key) for item in derivation[lexeme][key]: print " - {}".format(item.encode("utf-8")) else: print " {}: {}".format( key, derivation[lexeme][key].encode("utf-8"))
#!/usr/bin/env python # coding: utf-8 import sys from morphgnt.utils import load_yaml, sorted_items danker = load_yaml("../data-cleanup/danker-concise-lexicon/components.yaml") derivation = load_yaml("derivation.yaml") skipped = 0 existing = 0 added = 0 for lexeme, metadata in sorted_items(danker): components = metadata["components"].strip() print "{}:".format(lexeme.encode("utf-8")) if lexeme in derivation: if derivation[lexeme]: def q(key): if key in derivation[lexeme]: if isinstance(derivation[lexeme][key], list): print " {}:".format(key) for item in derivation[lexeme][key]: print " - {}".format(item.encode("utf-8")) else: print " {}: {}".format(key, derivation[lexeme][key].encode("utf-8")) q("derivation")