#!/usr/bin/env python

from collections import defaultdict
import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt.utils import nfkc_normalize as n

lexemes = load_yaml("lexemes.yaml")
missing_dodson = load_wordset("missing_dodson.txt")

# index the Dodson lexicon entries by NFKC-normalized head-word
dodson = defaultdict(list)
with open("../data-cleanup/dodson-lexicon/dodson_lexicon.txt") as f:
    for line in f:
        strongs, gk, pos, greek, short_gloss, long_gloss = line.strip().decode("utf-8").split("\t")
        head_word = n(greek.split(",")[0])
        dodson[head_word].append({
            "strongs": strongs,
            "gk": gk,
            "pos": pos,
            "greek": n(greek),
            "short-gloss": short_gloss,
            "long-gloss": long_gloss,
        })

not_in_dodson = set()

for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
            print " {}: {}".format(metadata_name, unicode(metadata[metadata_name]).encode("utf-8"))
import unicodedata
from collections import defaultdict

from morphgnt.utils import load_wordset
from morphgnt import filesets

ACUTE = u"\u0301"
GRAVE = u"\u0300"
CIRCUMFLEX = u"\u0342"


def strip_accents(w):
    return "".join(
        unicodedata.normalize("NFC", "".join(
            component for component in unicodedata.normalize("NFD", ch)
            if component not in [ACUTE, GRAVE, CIRCUMFLEX]
        )) for ch in w
    )


INDECLINABLE = load_wordset("nominal-indeclinable.txt")

# lemma -> case_number -> set of forms
forms = defaultdict(lambda: defaultdict(set))

fs = filesets.load("filesets.yaml")

for row in fs["sblgnt-lexemes"].rows():
    if row["lemma"] in INDECLINABLE:
        continue
    if row["ccat-pos"] in ["N-", "A-", "RA", "RD", "RI", "RP", "RR"]:
        case_number = row["ccat-parse"][4:6]
        if row["ccat-pos"] == "N-":
            key = row["lemma"]
        else:
            key = "{} ({}:{})".format(row["lemma"], row["ccat-parse"][6],
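# A minimal usage sketch for strip_accents (separate from the fragment above;
# the sample words are arbitrary): the acute, grave, and circumflex combining
# marks are dropped during the NFD/NFC round-trip, while breathing marks and
# everything else are recomposed untouched.
assert strip_accents(u"\u03bb\u03cc\u03b3\u03bf\u03c2") == u"\u03bb\u03bf\u03b3\u03bf\u03c2"  # λόγος -> λογος
assert strip_accents(u"\u1f00\u03b3\u03ac\u03c0\u03b7") == u"\u1f00\u03b3\u03b1\u03c0\u03b7"  # ἀγάπη -> ἀγαπη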
#!/usr/bin/env python

import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt.utils import nfkc_normalize as n

lexemes = load_yaml("lexemes.yaml")
missing_bdag = load_wordset("missing_bdag.txt")

headwords = set()
with open("../data-cleanup/bdag-headwords/bdag_headwords.txt") as f:
    for line in f:
        headwords.add(n(line.strip().decode("utf-8")))

existing_not_in_headwords = []
missing_not_in_headwords = []
added = []

for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
            print " {}: {}".format(metadata_name, unicode(metadata[metadata_name]).encode("utf-8"))

    q("pos")

    if "bdag-headword" in metadata:
        print " bdag-headword: {}".format(metadata["bdag-headword"].encode("utf-8"))
        if metadata["bdag-headword"] not in headwords:
#!/usr/bin/env python

from collections import defaultdict
import sys

from morphgnt.utils import load_yaml, sorted_items, load_wordset
from morphgnt.utils import nfkc_normalize as n

lexemes = load_yaml("lexemes.yaml")
missing_mounce = load_wordset("missing_mounce.txt")

problems = []
skipped = 0

# index the Mounce head-words by Goodrick-Kohlenberger (GK) number
mounce = defaultdict(list)
with open("../data-cleanup/mounce-morphcat/mounce-tauber-morphcat-utf8.txt") as f:
    for line in f:
        gk, greek, morphcat = line.strip().decode("utf-8").split(":")
        mounce[int(gk.split("?")[0])].append(n(greek))

for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
            print " {}: {}".format(metadata_name, unicode(metadata[metadata_name]).encode("utf-8"))
            return True

    q("pos")
    q("bdag-headword")
    q("danker-entry")
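# A small sketch (separate from the script above) of how the GK-number key is
# derived. The values are hypothetical; the only thing assumed from the data
# file is the "gk:greek:morphcat" line shape, where the GK field may carry a
# trailing "?" (apparently marking an uncertain mapping). Either way the
# lookup key is the bare integer.
assert int("3364".split("?")[0]) == 3364
assert int("3364?".split("?")[0]) == 3364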
#!/usr/bin/env python

import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items

lexemes = load_yaml("lexemes.yaml")
danker = load_yaml("../data-cleanup/danker-concise-lexicon/danker_headwords.yaml")
missing_danker = load_wordset("missing_danker.txt")

problems = []
skipped = 0

for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
            print " {}: {}".format(metadata_name, unicode(metadata[metadata_name]).encode("utf-8"))

    q("pos")
    q("bdag-headword")

    if "danker-entry" in metadata:
        print " {}: {}".format("danker-entry", metadata["danker-entry"].encode("utf-8"))
    else:
        if lexeme in missing_danker:
            skipped += 1
        else:
            if lexeme in danker:
                entry = danker[lexeme]
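# For orientation (an assumption about the intended workflow, not taken from
# the script above): scripts in this family echo each lexeme back in the same
# YAML-ish shape as lexemes.yaml, e.g.
#
#   <lexeme>:
#    pos: ...
#    bdag-headword: ...
#    danker-entry: ...
#
# so stdout can be reviewed, or merged back into lexemes.yaml once the new
# field has been filled in.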
#!/usr/bin/env python3

from collections import defaultdict

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt import filesets

lexemes = load_yaml("lexemes.yaml")
indeclinables = load_wordset("nominal-indeclinable.txt")

SKIP = [
    "ἄγαμος",
    "ἀλάβαστρος",
    "ἅλα",
    "ἄρκος",
    "βάτος",
    "γείτων",
    "διάκονος",
    "θεός",
    "θυρωρός",
    "κάμηλος",
    "κοινωνός",
    "λιμός",
    "ὄνος",
    "ὄρνις",
    "παῖς",
    "παρθένος",
    "Πάτμος",
    "στάμνος",
    "ὕαλος",
    "ὕσσωπος",
#!/usr/bin/env python

from collections import defaultdict
import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt.utils import nfkc_normalize as n

lexemes = load_yaml("lexemes.yaml")
missing_morphcat = load_wordset("missing_morphcat.txt")

# index the Mounce morphological categories by NFKC-normalized head-word
mounce = defaultdict(list)
with open("../data-cleanup/mounce-morphcat/mounce-tauber-morphcat-utf8.txt") as f:
    for line in f:
        gk, greek, morphcat = line.strip().decode("utf-8").split(":")
        greek = n(greek)
        mounce[greek].append({
            "gk": gk,
            "morphcat": morphcat,
        })

problems = []
skipped = 0

for lexeme, metadata in sorted_items(lexemes):
    print "{}:".format(lexeme.encode("utf-8"))

    def q(metadata_name):
        if metadata_name in metadata:
            print " {}: {}".format(metadata_name, unicode(metadata[metadata_name]).encode("utf-8"))
            return True
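# A short sketch (separate from the script above) of why head-words are run
# through n() before being used as dict keys. It assumes nfkc_normalize simply
# applies NFKC normalization: visually identical Greek strings can differ at
# the codepoint level (precomposed vs. combining accents), and normalization
# makes them compare equal.
import unicodedata
precomposed = u"\u03cc"        # GREEK SMALL LETTER OMICRON WITH TONOS
combining = u"\u03bf\u0301"    # omicron + combining acute
assert precomposed != combining
assert unicodedata.normalize("NFKC", precomposed) == unicodedata.normalize("NFKC", combining)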
from pysblgnt import morphgnt_rows
from pyuca import Collator
import yaml

from characters import strip_accents
from morphgnt.utils import load_wordset

from collections import defaultdict
import re
import sys

collator = Collator()

# we will ignore indeclinable nominals
IGNORE_SET = load_wordset("../../nominal-indeclinable.txt")

# we also have to ignore the following nominals because they are combinations
# of words that both inflect independently
IGNORE_SET.update({
    "ὅδε",
    "τοιόσδε",
    "ὅστις",
})

# we need to change some lemmas
LEMMA_OVERRIDE = {
    "μήν": "μήν/N",
}
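# A small usage sketch of the collator defined above: pyuca sorts Greek
# head-words by Unicode collation order rather than by raw codepoint, so that
# accented forms from the Greek Extended block sort next to their base
# letters. The word list is arbitrary.
example = ["βῆτα", "γάμμα", "ἄλφα"]
print(sorted(example, key=collator.sort_key))  # expected to put ἄλφα first
print(sorted(example))                         # codepoint order puts ἄλφα last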
#!/usr/bin/env python3

from morphgnt.utils import load_yaml, load_wordset, sorted_items

lexemes = load_yaml("lexemes.yaml")
already = load_wordset("nominal-indeclinable.txt")

for lexeme, metadata in sorted_items(lexemes):
    danker = metadata.get("danker-entry", "")
    dodson_pos = metadata.get("dodson-pos", "")
    mounce_morphcat = metadata.get("mounce-morphcat", "")

    if (
        lexeme in already or
        dodson_pos == "N-PRI" or
        mounce_morphcat == "n-3g(2)"
    ):
        print("{:20}|{:45}|{:10}|{:10}|{:5}".format(
            lexeme,
            danker,
            dodson_pos,
            mounce_morphcat,
            "yes" if lexeme in already else "no",
        ))
        if lexeme in already:
            already.remove(lexeme)

# anything still left in `already` was listed as indeclinable but never
# matched above
print(already)