#!/usr/bin/env python

from collections import defaultdict
import sys

from morphgnt.utils import load_yaml, sorted_items, load_wordset
from morphgnt.utils import nfkc_normalize as n

lexemes = load_yaml("lexemes.yaml")
missing_mounce = load_wordset("missing_mounce.txt")

# Report accumulators -- NOTE(review): filled by the lexeme loop below,
# whose body continues past this excerpt.
problems = []
skipped = 0

# Mounce morphcat table keyed by integer GK number -> list of
# NFKC-normalized Greek headwords carrying that number.
mounce = defaultdict(list)
with open("../data-cleanup/mounce-morphcat/mounce-tauber-morphcat-utf8.txt"
          ) as f:
    for line in f:
        # Each data line is "GK:greek:morphcat", UTF-8 encoded (Python 2 bytes).
        gk, greek, morphcat = line.strip().decode("utf-8").split(":")
        # The GK field may carry a trailing "?..." qualifier; key on the
        # integer prefix only.
        mounce[int(gk.split("?")[0])].append(n(greek))

for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
            print "    {}: {}".format(
                metadata_name,
                unicode(metadata[metadata_name]).encode("utf-8"))
            return True
Example #2
0
#!/usr/bin/env python

import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt.utils import nfkc_normalize as n

lexemes = load_yaml("lexemes.yaml")
missing_bdag = load_wordset("missing_bdag.txt")

# Set of NFKC-normalized BDAG headwords, one per line of the data file.
with open("../data-cleanup/bdag-headwords/bdag_headwords.txt") as f:
    headwords = {n(raw.strip().decode("utf-8")) for raw in f}

existing_not_in_headwords = []
missing_not_in_headwords = []
added = []
for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
            print "    {}: {}".format(
                metadata_name,
                unicode(metadata[metadata_name]).encode("utf-8"))

    q("pos")

    if "bdag-headword" in metadata:
        print "    bdag-headword: {}".format(
Example #3
0
#!/usr/bin/env python

from collections import defaultdict
import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt.utils import nfkc_normalize as n

lexemes = load_yaml("lexemes.yaml")
missing_dodson = load_wordset("missing_dodson.txt")

# Dodson lexicon: NFKC-normalized headword (first comma-separated alternative
# of the greek field) -> list of entry dicts.
dodson = defaultdict(list)
with open("../data-cleanup/dodson-lexicon/dodson_lexicon.txt") as f:
    for raw in f:
        # Tab-separated, UTF-8 encoded (Python 2 bytes).
        fields = raw.strip().decode("utf-8").split("\t")
        strongs, gk, pos, greek, short_gloss, long_gloss = fields
        head_word = n(greek.split(",")[0])
        entry = {
            "strongs": strongs,
            "gk": gk,
            "pos": pos,
            "greek": n(greek),
            "short-gloss": short_gloss,
            "long-gloss": long_gloss
        }
        dodson[head_word].append(entry)

not_in_dodson = set()
for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
Example #4
0
ACUTE = u"\u0301"
GRAVE = u"\u0300"
CIRCUMFLEX = u"\u0342"


def strip_accents(w):
    """Return w with acute, grave, and circumflex marks removed per character.

    Each character is decomposed (NFD), the three accent combining marks are
    dropped, and the remainder is recomposed (NFC), so other diacritics such
    as breathings are preserved.
    """
    stripped = []
    for ch in w:
        decomposed = unicodedata.normalize("NFD", ch)
        kept = "".join(c for c in decomposed
                       if c not in (ACUTE, GRAVE, CIRCUMFLEX))
        stripped.append(unicodedata.normalize("NFC", kept))
    return "".join(stripped)


# Indeclinable nominals are excluded from form collection below.
INDECLINABLE = load_wordset("nominal-indeclinable.txt")

# lemma -> case_number -> set of forms
# NOTE(review): this comment originally read "person_number", but the loop
# below derives `case_number` from ccat-parse[4:6] -- confirm intended key.
forms = defaultdict(lambda: defaultdict(set))

fs = filesets.load("filesets.yaml")

for row in fs["sblgnt-lexemes"].rows():
    if row["lemma"] in INDECLINABLE:
        continue
    if row["ccat-pos"] in ["N-", "A-", "RA", "RD", "RI", "RP", "RR"]:
        case_number = row["ccat-parse"][4:6]
        if row["ccat-pos"] == "N-":
            key = row["lemma"]
        else:
            key = "{} ({}:{})".format(row["lemma"], row["ccat-parse"][6],
from pyuca import Collator
import yaml

from characters import strip_accents
from morphgnt.utils import load_wordset

from collections import defaultdict
import re
import sys

# pyuca collator -- presumably used for Unicode-aware sorting of the Greek
# output; usage is outside this excerpt.
collator = Collator()


# we will ignore indeclinable nominals

IGNORE_SET = load_wordset("../../nominal-indeclinable.txt")


# we also have to ignore the following nominals because they are combinations
# of words that both inflect independently

IGNORE_SET.update({
    "ὅδε",
    "τοιόσδε",
    "ὅστις",
})


# we need to change some lemmas

LEMMA_OVERRIDE = {
#!/usr/bin/env python

import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt.utils import nfkc_normalize as n

lexemes = load_yaml("lexemes.yaml")
missing_bdag = load_wordset("missing_bdag.txt")


headwords = set()
with open("../data-cleanup/bdag-headwords/bdag_headwords.txt") as f:
    for line in f:
        headwords.add(n(line.strip().decode("utf-8")))

existing_not_in_headwords = []
missing_not_in_headwords = []
added = []
for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
            print "    {}: {}".format(metadata_name, unicode(metadata[metadata_name]).encode("utf-8"))

    q("pos")

    if "bdag-headword" in metadata:
        print "    bdag-headword: {}".format(metadata["bdag-headword"].encode("utf-8"))
        if metadata["bdag-headword"] not in headwords:

ACUTE = u"\u0301"
GRAVE = u"\u0300"
CIRCUMFLEX = u"\u0342"


def strip_accents(w):
    """Strip acute, grave, and circumflex accents from every character of w.

    Works per character: NFD-decompose, drop the three accent marks,
    NFC-recompose.  Breathings and other diacritics survive.
    """
    def _strip_char(ch):
        kept = [c for c in unicodedata.normalize("NFD", ch)
                if c not in (ACUTE, GRAVE, CIRCUMFLEX)]
        return unicodedata.normalize("NFC", "".join(kept))

    return "".join(_strip_char(ch) for ch in w)


# Indeclinable nominals are excluded from form collection below.
INDECLINABLE = load_wordset("nominal-indeclinable.txt")


# lemma -> case_number -> set of forms
# NOTE(review): comment originally said "person_number"; the loop below
# computes `case_number` from ccat-parse[4:6] -- confirm which is intended.
forms = defaultdict(lambda: defaultdict(set))

fs = filesets.load("filesets.yaml")

for row in fs["sblgnt-lexemes"].rows():
    if row["lemma"] in INDECLINABLE:
        continue
    if row["ccat-pos"] in ["N-", "A-", "RA", "RD", "RI", "RP", "RR"]:
        case_number = row["ccat-parse"][4:6]
        if row["ccat-pos"] == "N-":
            key = row["lemma"]
        else:
#!/usr/bin/env python

from collections import defaultdict
import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt.utils import nfkc_normalize as n

lexemes = load_yaml("lexemes.yaml")
missing_morphcat = load_wordset("missing_morphcat.txt")

# Mounce morphcat table: NFKC-normalized Greek headword -> list of
# {"gk": ..., "morphcat": ...} records.
mounce = defaultdict(list)
with open("../data-cleanup/mounce-morphcat/mounce-tauber-morphcat-utf8.txt"
          ) as f:
    for line in f:
        # "GK:greek:morphcat", UTF-8 encoded (Python 2 bytes).
        gk, greek, morphcat = line.strip().decode("utf-8").split(":")
        greek = n(greek)
        mounce[greek].append({
            "gk": gk,
            "morphcat": morphcat,
        })

# Report accumulators -- filled by the lexeme loop that follows.
problems = []
skipped = 0
for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
            print "    {}: {}".format(
                metadata_name,
#!/usr/bin/env python

from collections import defaultdict
import sys

from morphgnt.utils import load_yaml, sorted_items, load_wordset
from morphgnt.utils import nfkc_normalize as n

lexemes = load_yaml("lexemes.yaml")
missing_mounce = load_wordset("missing_mounce.txt")

problems = []
skipped = 0

# Integer GK number (any "?..." qualifier stripped) -> list of
# NFKC-normalized Greek headwords.
mounce = defaultdict(list)
with open("../data-cleanup/mounce-morphcat/mounce-tauber-morphcat-utf8.txt") as f:
    for raw in f:
        gk_field, greek_form, _morphcat = raw.strip().decode("utf-8").split(":")
        mounce[int(gk_field.split("?")[0])].append(n(greek_form))

for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
            print "    {}: {}".format(metadata_name, unicode(metadata[metadata_name]).encode("utf-8"))
            return True

    q("pos")
    q("bdag-headword")
    q("danker-entry")
#!/usr/bin/env python

import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items

lexemes = load_yaml("lexemes.yaml")
danker = load_yaml("../data-cleanup/danker-concise-lexicon/danker_headwords.yaml")
missing_danker = load_wordset("missing_danker.txt")

problems = []
skipped = 0

for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
            print "    {}: {}".format(metadata_name, unicode(metadata[metadata_name]).encode("utf-8"))

    q("pos")
    q("bdag-headword")

    if "danker-entry" in metadata:
        print "    {}: {}".format("danker-entry", metadata["danker-entry"].encode("utf-8"))
    else:
        if lexeme in missing_danker:
            skipped += 1
        else:
            if lexeme in danker:
                entry = danker[lexeme]
#!/usr/bin/env python3

from collections import defaultdict

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt import filesets

lexemes = load_yaml("lexemes.yaml")
indeclinables = load_wordset("nominal-indeclinable.txt")

SKIP = [
    "ἄγαμος",
    "ἀλάβαστρος",
    "ἅλα",
    "ἄρκος",
    "βάτος",
    "γείτων",
    "διάκονος",
    "θεός",
    "θυρωρός",
    "κάμηλος",
    "κοινωνός",
    "λιμός",
    "ὄνος",
    "ὄρνις",
    "παῖς",
    "παρθένος",
    "Πάτμος",
    "στάμνος",
    "ὕαλος",
    "ὕσσωπος",
#!/usr/bin/env python3

from collections import defaultdict

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt import filesets

lexemes = load_yaml("lexemes.yaml")
indeclinables = load_wordset("nominal-indeclinable.txt")

SKIP = [
    "ἄγαμος",
    "ἀλάβαστρος",
    "ἅλα",
    "ἄρκος",
    "βάτος",
    "γείτων",
    "διάκονος",
    "θεός",
    "θυρωρός",
    "κάμηλος",
    "κοινωνός",
    "λιμός",
    "ὄνος",
    "ὄρνις",
    "παῖς",
    "παρθένος",
    "Πάτμος",
    "στάμνος",
    "ὕαλος",
    "ὕσσωπος",
#!/usr/bin/env python

from collections import defaultdict
import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt.utils import nfkc_normalize as n

lexemes = load_yaml("lexemes.yaml")
missing_morphcat = load_wordset("missing_morphcat.txt")

# Mounce morphcat table: NFKC-normalized headword -> list of
# {"gk": ..., "morphcat": ...} records.
mounce = defaultdict(list)
with open("../data-cleanup/mounce-morphcat/mounce-tauber-morphcat-utf8.txt") as f:
    for raw_line in f:
        # "GK:greek:morphcat", UTF-8 encoded (Python 2 bytes).
        gk_num, headword, category = raw_line.strip().decode("utf-8").split(":")
        record = {
            "gk": gk_num,
            "morphcat": category,
        }
        mounce[n(headword)].append(record)

# Report accumulators -- filled by the lexeme loop that follows.
problems = []
skipped = 0
for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
            print "    {}: {}".format(metadata_name, unicode(metadata[metadata_name]).encode("utf-8"))
            return True
Example #14
0
from pysblgnt import morphgnt_rows
from pyuca import Collator
import yaml

from characters import strip_accents
from morphgnt.utils import load_wordset

from collections import defaultdict
import re
import sys

# pyuca collator -- presumably for Unicode-aware sorting; usage is outside
# this excerpt.
collator = Collator()

# we will ignore indeclinable nominals

IGNORE_SET = load_wordset("../../nominal-indeclinable.txt")

# we also have to ignore the following nominals because they are combinations
# of words that both inflect independently

IGNORE_SET.update({
    "ὅδε",
    "τοιόσδε",
    "ὅστις",
})

# we need to change some lemmas

# Maps an ambiguous lemma to a disambiguated replacement key.
LEMMA_OVERRIDE = {
    "μήν": "μήν/N",
}
Example #15
0
#!/usr/bin/env python

from collections import defaultdict
import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt.utils import nfkc_normalize as n

lexemes = load_yaml("lexemes.yaml")
missing_dodson = load_wordset("missing_dodson.txt")

# Dodson lexicon entries keyed by NFKC-normalized headword (the first
# comma-separated alternative of the `greek` field).
dodson = defaultdict(list)
with open("../data-cleanup/dodson-lexicon/dodson_lexicon.txt") as f:
    for line in f:
        # Tab-separated: Strong's number, GK number, part of speech,
        # Greek forms, short gloss, long gloss (UTF-8, Python 2 bytes).
        strongs, gk, pos, greek, short_gloss, long_gloss = line.strip().decode("utf-8").split("\t")
        head_word = n(greek.split(",")[0])
        dodson[head_word].append(
            {
                "strongs": strongs,
                "gk": gk,
                "pos": pos,
                "greek": n(greek),
                "short-gloss": short_gloss,
                "long-gloss": long_gloss,
            }
        )

not_in_dodson = set()
for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))
#!/usr/bin/env python3

from morphgnt.utils import load_yaml, load_wordset, sorted_items

lexemes = load_yaml("lexemes.yaml")
already = load_wordset("nominal-indeclinable.txt")

# Report every lexeme that looks indeclinable: either already listed, or
# tagged N-PRI by Dodson, or classed n-3g(2) by Mounce.
for lexeme, metadata in sorted_items(lexemes):
    danker = metadata.get("danker-entry", "")
    dodson_pos = metadata.get("dodson-pos", "")
    mounce_morphcat = metadata.get("mounce-morphcat", "")

    was_listed = lexeme in already
    if not (was_listed or dodson_pos == "N-PRI" or mounce_morphcat == "n-3g(2)"):
        continue

    print("{:20}|{:45}|{:10}|{:10}|{:5}".format(
        lexeme,
        danker,
        dodson_pos,
        mounce_morphcat,
        "yes" if was_listed else "no",
    ))

    if was_listed:
        already.remove(lexeme)

# Whatever remains in `already` never matched a lexeme above.
print(already)
Example #17
0
#!/usr/bin/env python

import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items

lexemes = load_yaml("lexemes.yaml")
# Danker concise-lexicon headwords produced by the data-cleanup step.
danker = load_yaml(
    "../data-cleanup/danker-concise-lexicon/danker_headwords.yaml")
# Lexemes known to be absent from Danker -- presumably counted as skipped
# rather than problems by the loop below; loop body is truncated here.
missing_danker = load_wordset("missing_danker.txt")

problems = []
skipped = 0

for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
            print "    {}: {}".format(
                metadata_name,
                unicode(metadata[metadata_name]).encode("utf-8"))

    q("pos")
    q("bdag-headword")

    if "danker-entry" in metadata:
        print "    {}: {}".format("danker-entry",
                                  metadata["danker-entry"].encode("utf-8"))
    else:
        if lexeme in missing_danker: