Exemplo n.º 1
0
#!/usr/bin/env python3

from morphgnt.utils import load_yaml, sorted_items

lexemes = load_yaml("lexemes.yaml")

# skip these for now until we work out how to handle them
SKIP = ["Ἀππίου", "Λιμήν", "Πάγος", "Πόλις", "Ταβέρνη", "Φόρον"]

# Validate the shape of each lexeme's "full-citation-form" field.
# A citation form may list alternatives separated by " / "; each
# alternative is a comma-separated list whose validation depends on
# how many components it has.
for lexeme, metadata in sorted_items(lexemes):
    if "full-citation-form" in metadata and lexeme not in SKIP:
        # drop any "/..." disambiguation suffix from the lexeme key
        lexeme = lexeme.split("/")[0]
        citation_form = metadata["full-citation-form"]
        print("{}: {}".format(lexeme, citation_form))
        for alt in citation_form.split(" / "):
            components = alt.split(", ")
            assert len(components) <= 6
            if len(components) == 1:
                # bare form: must equal the lexeme itself
                assert components[0] == lexeme
            elif len(components) == 2:
                # form + article
                assert components[0] == lexeme
                assert components[1] in ["ὁ", "ἡ", "τό"]
            elif len(components) == 3:
                if components[2].startswith(("acc.", "dat.", "pl.")):
                    # form + article + usage note
                    assert components[0] == lexeme
                    assert components[1] in ["ὁ", "ἡ", "τό", "τά"]
                else:
                    # form + second component (unchecked here) + article
                    # NOTE(review): branches for 4-6 components are not
                    # visible in this excerpt — confirm they exist below.
                    assert components[0] == lexeme
                    assert components[2] in [
                        "ὁ", "ἡ", "τό", "ὁ/ἡ", "ὁ/τό", "οἱ", "αἱ", "τά"
                    ]
Exemplo n.º 2
0
#!/usr/bin/env python

import sys

from pyuca import Collator
collator = Collator()

from morphgnt.utils import load_yaml
from morphgnt.utils import nfkc_normalize as n

# NOTE(review): this is Python 2 code (print statements, str.decode);
# it will not run under Python 3.

danker = load_yaml("../data-cleanup/danker-concise-lexicon/components.yaml")

# Index the Greenlee morpheme file by the first comma-separated field of
# each entry, keeping the full entry line and its components string.
greenlee = {}
with open("../data-cleanup/greenlee-morphology/morphemes-utf8.txt") as f:
    for line in f:
        key, value = line.strip().split("\t")
        greenlee[n(key.decode("utf-8")).split(",")[0]] = {
            "full-entry": n(key.decode("utf-8")),
            "components": n(value.decode("utf-8")),
        }

# union of headwords from both sources, NFKC-normalized
words = [n(word) for word in set(danker.keys()).union(set(greenlee.keys()))]

# emit a merged YAML-ish report, sorted with Unicode collation (pyuca)
count = 0
for word in sorted(words, key=collator.sort_key):
    count += 1
    print "{}:".format(word.encode("utf-8"))
    if word in danker:
        print "    danker-full-entry: \"{}\"".format(danker[word]["full-entry"].encode("utf-8"))
        print "    danker-components: \"{}\"".format(danker[word]["components"].encode("utf-8"))
    if word in greenlee:
Exemplo n.º 3
0
#!/usr/bin/env python

import sys

from morphgnt import filesets
from morphgnt.utils import load_yaml, sorted_items

# NOTE(review): Python 2 code (print >> syntax, str.decode).

lexemes = load_yaml("lexemes.yaml")
forms = load_yaml("forms.yaml")
fs = filesets.load("filesets.yaml")

# Collect normalized surface forms from the SBLGNT rows into the nested
# `forms` structure, keyed by lemma and parse information.
for row in fs["sblgnt-lexemes"].rows():
    lemma = row["lemma"].decode("utf-8")
    lexeme = lexemes.get(lemma)
    if lexeme is None:
        # retry with a "lemma/POS" key used to disambiguate homographs
        lemma = "{}/{}".format(row["lemma"], row["ccat-pos"].strip("-")).decode("utf-8")
        lexeme = lexemes.get(lemma)
    if lexeme:
        form = row["norm"].decode("utf-8")
        if isinstance(lexeme["pos"], list):
            # unexpected list-valued pos: dump the lexeme for inspection
            print >> sys.stderr, lexeme
        if lexeme["pos"] in ["RA", "A", "N", "RR"]:
            # nominals: bucket by gender, then case+number
            gender = row["ccat-parse"][6]
            case_number = row["ccat-parse"][4:6]
            form_list = forms.setdefault(lemma, {}).setdefault(gender, {}).setdefault(case_number, {}).setdefault("forms", [])
            if {"form": form} not in form_list:
                form_list.append({"form": form})
        elif lexeme["pos"] in ["RP1"]:
            # first-person pronoun: no gender level in the nesting
            case_number = row["ccat-parse"][4:6]
            form_list = forms.setdefault(lemma, {}).setdefault(case_number, {}).setdefault("forms", [])
            if {"form": form} not in form_list:
#!/usr/bin/env python3

from morphgnt.utils import load_yaml, load_wordset, sorted_items

lexemes = load_yaml("lexemes.yaml")
already = load_wordset("nominal-indeclinable.txt")

# Report lexemes that look nominal-indeclinable — already in the wordset,
# tagged N-PRI by Dodson, or n-3g(2) by Mounce — and tick matches off
# `already` so the leftovers can be printed at the end.
for headword, meta in sorted_items(lexemes):
    in_wordset = headword in already
    dodson = meta.get("dodson-pos", "")
    mounce = meta.get("mounce-morphcat", "")

    if not (in_wordset or dodson == "N-PRI" or mounce == "n-3g(2)"):
        continue

    print("{:20}|{:45}|{:10}|{:10}|{:5}".format(
        headword,
        meta.get("danker-entry", ""),
        dodson,
        mounce,
        "yes" if in_wordset else "no",
    ))

    if in_wordset:
        already.remove(headword)

# whatever remains was in the wordset but never matched above
print(already)
#!/usr/bin/env python3
# coding: utf-8

from difflib import ndiff
import sys
import unicodedata

from morphgnt.utils import load_yaml, sorted_items

derivation = load_yaml("derivation.yaml")
lexemes = load_yaml("lexemes.yaml")


def strip_accents(s):
    """Return *s* with every combining mark (Unicode category Mn) removed."""
    decomposed = unicodedata.normalize("NFD", s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(kept)


def diff(word1, word2):
    """Diff two accent-stripped words (body truncated in this excerpt).

    NOTE(review): the parameters ``word1``/``word2`` are never used —
    the loop below diffs the module-level names ``lexeme`` and
    ``other`` instead. This looks like a bug; confirm against the call
    sites (not visible here).
    """
    result = ""
    state = 0
    add = ""  # accumulates inserted characters
    sub = ""  # accumulates deleted characters
    for x in ndiff(strip_accents(lexeme), strip_accents(other)):
        if state == 0:
            if x[:2] == "  ":
                # common character: record a "." placeholder
                result += "."
                state = 1
            elif x[:2] == "- ":
                sub += x[2:]
                state = 2
            elif x[:2] == "+ ":
Exemplo n.º 6
0
#!/usr/bin/env python3
# coding: utf-8

from difflib import ndiff
import sys
import unicodedata

from morphgnt.utils import load_yaml, sorted_items

derivation = load_yaml("derivation.yaml")
lexemes = load_yaml("lexemes.yaml")


def strip_accents(s):
    """Strip all combining characters (category "Mn") from *s*."""
    nfd = unicodedata.normalize("NFD", s)
    return "".join(c for c in nfd if unicodedata.category(c) != "Mn")


def diff(word1, word2):
    """Diff two accent-stripped words (body truncated in this excerpt).

    NOTE(review): ``word1``/``word2`` are unused; the loop reads the
    module-level names ``lexeme`` and ``other`` — likely a bug, confirm
    against callers (not visible here).
    """
    result = ""
    state = 0
    add = ""  # accumulates inserted characters
    sub = ""  # accumulates deleted characters
    for x in ndiff(strip_accents(lexeme), strip_accents(other)):
        if state == 0:
            if x[:2] == "  ":
                # common character: record a "." placeholder
                result += "."
                state = 1
            elif x[:2] == "- ":
                sub += x[2:]
                state = 2
Exemplo n.º 7
0
#!/usr/bin/env python3

import sys

from morphgnt.utils import load_yaml, sorted_items

lexemes = load_yaml("lexemes.yaml")
full_citation = load_yaml("../greek-vocab-assessment/headwords.txt")

# accumulators for a summary (the tail of this script is not visible
# in this excerpt)
n_missed = []
non_n_found = []
used = []

for lexeme, metadata in sorted_items(lexemes):
    print("{}:".format(lexeme))

    # print metadata[metadata_name] as an indented YAML-ish line, if present
    def q(metadata_name):
        if metadata_name in metadata:
            print("    {}: {}".format(metadata_name, metadata[metadata_name]))

    q("pos")

    if "full-citation-form" in metadata:
        print("    full-citation-form: {}".format(
            metadata["full-citation-form"]))
    else:
        # fall back to the external headwords file and record its use
        if lexeme in full_citation:
            print("    full-citation-form: {}".format(full_citation[lexeme]))
            used.append(lexeme)
            if metadata["pos"] != "N":
                non_n_found.append(lexeme)
#!/usr/bin/env python3

import sys

from morphgnt.utils import load_yaml, sorted_items

lexemes = load_yaml("lexemes.yaml")
full_citation = load_yaml("../greek-vocab-assessment/headwords.txt")

# accumulators for a summary (the tail of this script is not visible
# in this excerpt)
n_missed = []
non_n_found = []
used = []

for lexeme, metadata in sorted_items(lexemes):
    print("{}:".format(lexeme))

    # print metadata[metadata_name] as an indented YAML-ish line, if present
    def q(metadata_name):
        if metadata_name in metadata:
            print(
                "    {}: {}".format(
                    metadata_name,
                    metadata[metadata_name]
                )
            )

    q("pos")

    if "full-citation-form" in metadata:
        print(
            "    full-citation-form: {}".format(
                metadata["full-citation-form"]
#!/usr/bin/env python

import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items

# NOTE(review): Python 2 code (print statements, unicode(), .encode).

lexemes = load_yaml("lexemes.yaml")
danker = load_yaml("../data-cleanup/danker-concise-lexicon/danker_headwords.yaml")
missing_danker = load_wordset("missing_danker.txt")

# accumulators for a summary (the tail of this script is not visible here)
problems = []
skipped = 0

for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    # print metadata[metadata_name] as an indented YAML-ish line, if present
    def q(metadata_name):
        if metadata_name in metadata:
            print "    {}: {}".format(metadata_name, unicode(metadata[metadata_name]).encode("utf-8"))

    q("pos")
    q("bdag-headword")

    if "danker-entry" in metadata:
        print "    {}: {}".format("danker-entry", metadata["danker-entry"].encode("utf-8"))
    else:
        if lexeme in missing_danker:
            skipped += 1
        else:
            if lexeme in danker:
                entry = danker[lexeme]
#!/usr/bin/env python3

import re
import unicodedata

from morphgnt.utils import load_yaml, sorted_items

from citation_form_data import CITATION_FORMS

lexemes = load_yaml("../../lexemes.yaml")

# Combining accent marks to remove (Unicode combining characters).
ACUTE = u"\u0301"
GRAVE = u"\u0300"
CIRCUMFLEX = u"\u0342"  # combining Greek perispomeni


def strip_accents(w):
    """Return *w* with acute, grave, and circumflex marks removed.

    Each character is decomposed (NFD), the three listed accent marks
    are filtered out, and the remainder is recomposed (NFC) character
    by character, so any other diacritics in the decomposition are kept.
    """
    stripped = []
    for ch in w:
        decomposed = unicodedata.normalize("NFD", ch)
        kept = "".join(c for c in decomposed
                       if c not in (ACUTE, GRAVE, CIRCUMFLEX))
        stripped.append(unicodedata.normalize("NFC", kept))
    return "".join(stripped)


DODSON_OVERRIDES = {
    "ἀφθορία": "N:F",
    "δοκιμασία": "N:F",
    "εἰδέα": "N:F",
    "οἰκετεία": "N:F",
    "ὀλιγοπιστία": "N:F",
Exemplo n.º 11
0
#!/usr/bin/env python

import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items

# NOTE(review): Python 2 code (print statements, unicode(), .encode).

lexemes = load_yaml("lexemes.yaml")
danker = load_yaml(
    "../data-cleanup/danker-concise-lexicon/danker_headwords.yaml")
missing_danker = load_wordset("missing_danker.txt")

# accumulators for a summary (the tail of this script is not visible here)
problems = []
skipped = 0

for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    # print metadata[metadata_name] as an indented YAML-ish line, if present
    def q(metadata_name):
        if metadata_name in metadata:
            print "    {}: {}".format(
                metadata_name,
                unicode(metadata[metadata_name]).encode("utf-8"))

    q("pos")
    q("bdag-headword")

    if "danker-entry" in metadata:
        print "    {}: {}".format("danker-entry",
                                  metadata["danker-entry"].encode("utf-8"))
    else:
        if lexeme in missing_danker: