Пример #1
0
#!/usr/bin/env python

import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt.utils import nfkc_normalize as n

# Project data: the lexeme table and the word set from missing_bdag.txt.
lexemes = load_yaml("lexemes.yaml")
missing_bdag = load_wordset("missing_bdag.txt")

# One headword per line: strip, decode from UTF-8, then normalize with n
# (the imported alias of nfkc_normalize).
with open("../data-cleanup/bdag-headwords/bdag_headwords.txt") as src:
    headwords = {n(raw.strip().decode("utf-8")) for raw in src}

# Result lists, all empty at this point.
existing_not_in_headwords, missing_not_in_headwords, added = [], [], []
for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
            print "    {}: {}".format(
                metadata_name,
                unicode(metadata[metadata_name]).encode("utf-8"))

    q("pos")

    if "bdag-headword" in metadata:
        print "    bdag-headword: {}".format(
Пример #2
0
import sys

from pyuca import Collator
collator = Collator()

from morphgnt.utils import load_yaml
from morphgnt.utils import nfkc_normalize as n

danker = load_yaml("../data-cleanup/danker-concise-lexicon/components.yaml")

# Index the Greenlee file: every line holds two tab-separated columns,
# the full entry and its components.
greenlee = {}
with open("../data-cleanup/greenlee-morphology/morphemes-utf8.txt") as src:
    for raw_line in src:
        # Any column count other than two makes this unpacking raise.
        raw_entry, raw_components = raw_line.strip().split("\t")
        full_entry = n(raw_entry.decode("utf-8"))
        # The lookup key is the part of the entry before its first comma.
        greenlee[full_entry.split(",")[0]] = {
            "full-entry": full_entry,
            "components": n(raw_components.decode("utf-8")),
        }

# Every key found in either lexicon, each passed through n (nfkc_normalize).
# NOTE(review): the membership tests further down look these normalized
# words up in danker directly -- presumably the danker keys are already
# NFKC-normalized; confirm, otherwise such entries would be missed.
words = [n(word) for word in set(danker.keys()).union(set(greenlee.keys()))]

# Incremented once per word by the loop that follows.
count = 0
for word in sorted(words, key=collator.sort_key):
    count += 1
    print "{}:".format(word.encode("utf-8"))
    if word in danker:
        print "    danker-full-entry: \"{}\"".format(danker[word]["full-entry"].encode("utf-8"))
        print "    danker-components: \"{}\"".format(danker[word]["components"].encode("utf-8"))
    if word in greenlee:
        print "    greenlee-full-entry: \"{}\"".format(greenlee[word]["full-entry"].encode("utf-8"))
Пример #3
0
from collections import defaultdict
import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt.utils import nfkc_normalize as n

# Project data: the lexeme table and the word set from missing_dodson.txt.
lexemes = load_yaml("lexemes.yaml")
missing_dodson = load_wordset("missing_dodson.txt")

# Group the Dodson lexicon rows by normalized head word; several rows may
# share a head word, hence the list values.
dodson = defaultdict(list)
with open("../data-cleanup/dodson-lexicon/dodson_lexicon.txt") as src:
    for raw_line in src:
        fields = raw_line.strip().decode("utf-8").split("\t")
        # Six tab-separated columns per row; any other count raises here.
        strongs, gk, pos, greek, short_gloss, long_gloss = fields
        # The head word is the text before the first comma of the Greek
        # column.
        head_word = n(greek.partition(",")[0])
        entry = {
            "strongs": strongs,
            "gk": gk,
            "pos": pos,
            "greek": n(greek),
            "short-gloss": short_gloss,
            "long-gloss": long_gloss,
        }
        dodson[head_word].append(entry)

not_in_dodson = set()
for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
from morphgnt.utils import load_yaml, sorted_items, load_wordset
from morphgnt.utils import nfkc_normalize as n

# Project data: the lexeme table and the word set from missing_mounce.txt.
lexemes = load_yaml("lexemes.yaml")
missing_mounce = load_wordset("missing_mounce.txt")

problems = []
skipped = 0

# Map each integer GK number to the normalized Greek forms listed for it.
mounce = defaultdict(list)
with open(
    "../data-cleanup/mounce-morphcat/mounce-tauber-morphcat-utf8.txt"
) as src:
    for raw_line in src:
        # Three colon-separated columns per row; any other count raises.
        gk, greek, morphcat = raw_line.strip().decode("utf-8").split(":")
        # Only the part of the GK column before the first "?" is numeric.
        gk_number = int(gk.partition("?")[0])
        mounce[gk_number].append(n(greek))

for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
            print "    {}: {}".format(
                metadata_name,
                unicode(metadata[metadata_name]).encode("utf-8"))
            return True

    q("pos")
    q("bdag-headword")
    q("danker-entry")
    q("dodson-entry")
#!/usr/bin/env python

import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt.utils import nfkc_normalize as n

# Project data: the lexeme table and the word set from missing_bdag.txt.
lexemes = load_yaml("lexemes.yaml")
missing_bdag = load_wordset("missing_bdag.txt")


# Each line of the headword file is stripped, decoded from UTF-8 and
# normalized with n (the imported alias of nfkc_normalize).
headwords = set()
with open("../data-cleanup/bdag-headwords/bdag_headwords.txt") as src:
    headwords.update(n(raw.strip().decode("utf-8")) for raw in src)

# Result lists, all empty at this point.
existing_not_in_headwords = []
missing_not_in_headwords = []
added = []
for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
            print "    {}: {}".format(metadata_name, unicode(metadata[metadata_name]).encode("utf-8"))

    q("pos")

    if "bdag-headword" in metadata:
        print "    bdag-headword: {}".format(metadata["bdag-headword"].encode("utf-8"))
        if metadata["bdag-headword"] not in headwords:
#!/usr/bin/env python

from collections import defaultdict
import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt.utils import nfkc_normalize as n

# Project data: the lexeme table and the word set from missing_dodson.txt.
lexemes = load_yaml("lexemes.yaml")
missing_dodson = load_wordset("missing_dodson.txt")

# Group the Dodson lexicon rows by normalized head word; several rows may
# share a head word, hence the list values.
dodson = defaultdict(list)
with open("../data-cleanup/dodson-lexicon/dodson_lexicon.txt") as src:
    for raw_line in src:
        fields = raw_line.strip().decode("utf-8").split("\t")
        # Six tab-separated columns per row; any other count raises here.
        strongs, gk, pos, greek, short_gloss, long_gloss = fields
        # The head word is the text before the first comma of the Greek column.
        head_word = n(greek.partition(",")[0])
        entry = {
            "strongs": strongs,
            "gk": gk,
            "pos": pos,
            "greek": n(greek),
            "short-gloss": short_gloss,
            "long-gloss": long_gloss,
        }
        dodson[head_word].append(entry)

not_in_dodson = set()
for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))
import sys

from morphgnt.utils import load_yaml, sorted_items, load_wordset
from morphgnt.utils import nfkc_normalize as n

# Project data: the lexeme table and the word set from missing_mounce.txt.
lexemes = load_yaml("lexemes.yaml")
missing_mounce = load_wordset("missing_mounce.txt")

problems = []
skipped = 0

# Map each integer GK number to the normalized Greek forms listed for it.
mounce = defaultdict(list)
with open("../data-cleanup/mounce-morphcat/mounce-tauber-morphcat-utf8.txt") as src:
    for raw_line in src:
        # Three colon-separated columns per row; any other count raises.
        gk, greek, morphcat = raw_line.strip().decode("utf-8").split(":")
        # Only the part of the GK column before the first "?" is numeric.
        gk_number = int(gk.partition("?")[0])
        mounce[gk_number].append(n(greek))

for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
            print "    {}: {}".format(metadata_name, unicode(metadata[metadata_name]).encode("utf-8"))
            return True

    q("pos")
    q("bdag-headword")
    q("danker-entry")
    q("dodson-entry")

    if not q("mounce-headword"):
#!/usr/bin/env python

from collections import defaultdict
import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt.utils import nfkc_normalize as n

# Project data: the lexeme table and the word set from missing_morphcat.txt.
lexemes = load_yaml("lexemes.yaml")
missing_morphcat = load_wordset("missing_morphcat.txt")

# Group the (gk, morphcat) pairs of the Mounce file by normalized Greek form.
mounce = defaultdict(list)
with open("../data-cleanup/mounce-morphcat/mounce-tauber-morphcat-utf8.txt") as src:
    for raw_line in src:
        # Three colon-separated columns per row; any other count raises.
        gk, greek, morphcat = raw_line.strip().decode("utf-8").split(":")
        normalized_greek = n(greek)
        record = {
            "gk": gk,
            "morphcat": morphcat,
        }
        mounce[normalized_greek].append(record)

problems = []
skipped = 0
for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
            print "    {}: {}".format(metadata_name, unicode(metadata[metadata_name]).encode("utf-8"))
            return True
from collections import defaultdict
import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt.utils import nfkc_normalize as n

# Project data: the lexeme table and the word set from missing_morphcat.txt.
lexemes = load_yaml("lexemes.yaml")
missing_morphcat = load_wordset("missing_morphcat.txt")

# Group the (gk, morphcat) pairs of the Mounce file by normalized Greek
# form.
mounce = defaultdict(list)
with open(
    "../data-cleanup/mounce-morphcat/mounce-tauber-morphcat-utf8.txt"
) as src:
    for raw_line in src:
        # Three colon-separated columns per row; any other count raises.
        gk, greek, morphcat = raw_line.strip().decode("utf-8").split(":")
        normalized_greek = n(greek)
        record = {
            "gk": gk,
            "morphcat": morphcat,
        }
        mounce[normalized_greek].append(record)

problems = []
skipped = 0
for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
            print "    {}: {}".format(
                metadata_name,
                unicode(metadata[metadata_name]).encode("utf-8"))