Пример #1
0
    def get_stem_set(self, parse, norm, test_length):
        """Return the candidate stems of ``norm`` under the rules for ``parse``.

        Each rule in ``stemming_rules[parse]`` is a string of the shape
        ``s1|s2>s3<s4|s5``.  A rule applies when ``norm`` (after
        ``debreath``) matches ``(.*s1)s3s5$``; the candidate stem is then the
        text captured by the group followed by ``s2``, passed through
        ``strip_accents`` and ``rebreath``.

        A rule table entry may instead be a dict with a ``"ref"`` key naming
        another parse whose rules should be used; such references are
        followed until a non-ref entry is reached.

        Args:
            parse: key into ``stemming_rules``.
            norm: the form to analyse.
            test_length: when falsy, every rule is passed through
                ``strip_length`` before it is interpreted.

        Returns:
            A set of stems (possibly empty), or ``None`` when ``parse`` has
            no entry in ``stemming_rules``.

        Raises:
            Exception: when a ``"ref"`` names a parse that is not in
                ``stemming_rules``.
        """
        norm = debreath(norm)

        if parse not in stemming_rules:
            return None

        pairs = stemming_rules[parse]
        while isinstance(pairs, dict) and "ref" in pairs:
            ref = pairs["ref"]
            if ref not in stemming_rules:
                raise Exception("ref to {} which doesn't exist".format(ref))
            pairs = stemming_rules[ref]

        stem_set = set()
        for entry in pairs:
            if not test_length:
                entry = strip_length(entry)
            s1, s234, s5 = entry.split("|")
            s2, s34 = s234.split(">")
            s3, _ = s34.split("<")  # s4 plays no part in stem recovery here

            # Only parentheses are escaped; s1 is interpolated verbatim.
            # NOTE(review): other regex metacharacters in s3/s5 stay live --
            # confirm that is intended before switching to re.escape.
            s3 = s3.replace("(", "\\(").replace(")", "\\)")
            s5 = s5.replace("(", "\\(").replace(")", "\\)")

            match = re.match("(.*{}){}{}$".format(s1, s3, s5), norm)
            if match is None:
                continue

            # Build the stem from this single match rather than via re.sub:
            # re.sub replaces *every* match, and a pattern that can match the
            # empty string (s1, s3 and s5 all empty) matches a second time at
            # end-of-string on Python 3.7+, which would append s2 twice.
            # Plain concatenation also keeps s2 from being interpreted as
            # replacement-template syntax.  Anything after the match (only a
            # trailing newline is possible, because of "$") is preserved.
            stem = match.group(1) + s2 + norm[match.end():]
            stem_set.add(rebreath(strip_accents(stem)))

        return stem_set
Пример #2
0
    def ending_info(self, form, parse, test_length=False):
        """Return stem/ending analyses of ``form`` under the rules for ``parse``.

        Each rule in ``self.endings[parse]`` is a string of the shape
        ``s1|s2>s3<s4|s5``.  A rule applies when ``form`` (after
        ``debreath``) matches ``(.*s1)s3s5$``; the stem is then the text
        captured by the group followed by ``s2``, passed through
        ``strip_accents`` and ``rebreath``.

        An entry of ``self.endings`` may instead be a dict with a ``"ref"``
        key naming another parse whose rules should be used; such references
        are followed until a non-ref entry is reached.

        Args:
            form: the form to analyse.
            parse: key into ``self.endings``.
            test_length: when falsy (the default), every rule is passed
                through ``strip_length`` before it is interpreted.

        Returns:
            A set of ``EndingInfo(stem, (s1, s2, s3, s4, s5))`` objects
            (possibly empty), or ``None`` when ``parse`` has no entry in
            ``self.endings``.  The ``s3`` and ``s5`` components stored are
            the parenthesis-escaped versions used to build the regex.

        Raises:
            Exception: when a ``"ref"`` names a parse that is not in
                ``self.endings``.
        """
        form = debreath(form)

        if parse not in self.endings:
            return None

        pairs = self.endings[parse]
        while isinstance(pairs, dict) and "ref" in pairs:
            ref = pairs["ref"]
            if ref not in self.endings:
                raise Exception("ref to {} which doesn't exist".format(ref))
            pairs = self.endings[ref]

        stem_set = set()
        for entry in pairs:
            if not test_length:
                entry = strip_length(entry)
            s1, s234, s5 = entry.split("|")
            s2, s34 = s234.split(">")
            s3, s4 = s34.split("<")

            # Only parentheses are escaped; s1 is interpolated verbatim.
            # NOTE(review): other regex metacharacters in s3/s5 stay live --
            # confirm that is intended before switching to re.escape.
            s3 = s3.replace("(", "\\(").replace(")", "\\)")
            s5 = s5.replace("(", "\\(").replace(")", "\\)")

            match = re.match("(.*{}){}{}$".format(s1, s3, s5), form)
            if match is None:
                continue

            # Build the stem from this single match rather than via re.sub:
            # re.sub replaces *every* match, and a pattern that can match the
            # empty string (s1, s3 and s5 all empty) matches a second time at
            # end-of-string on Python 3.7+, which would append s2 twice.
            # Plain concatenation also keeps s2 from being interpreted as
            # replacement-template syntax.  Anything after the match (only a
            # trailing newline is possible, because of "$") is preserved.
            stem = rebreath(
                strip_accents(match.group(1) + s2 + form[match.end():]))

            stem_set.add(EndingInfo(stem, (s1, s2, s3, s4, s5)))

        return stem_set
Пример #3
0
            lexeme = lexemes[lemma]
            try:
                mounce_cat = lexeme["mounce-morphcat"]
            except:
                error("{} has no mounce-morphcat".format(lemma))
        if not isinstance(mounce_cat, list):
            mounce_cat = [mounce_cat]

        for cat in mounce_cat:
            mounce_by_lemma[lemma].add(cat)

        new_mounce_cat = map_non_noun_categories(mounce_cat, aspect_voice,
                                                 gender, lemma)

        orig_norm = norm
        norm = decompose_breathing(strip_accents(norm))

        success = False
        for ending_and_class_regex in noun_endings[case_number + gender]:
            try:
                ending, class_regex, explanation = ending_and_class_regex.split(
                )
            except ValueError:
                error("{}\n{} {}".format(row["bcv"], case_number + gender,
                                         ending_and_class_regex))

            if norm.endswith(ending.replace(".", "")):
                success = set()
                for cat in new_mounce_cat:
                    if re.match(class_regex, cat):
                        success.add(cat)
from collections import defaultdict

from characters import strip_accents, strip_breathing

CELLS = defaultdict(lambda: defaultdict(set))
GENDER = defaultdict(set)
LEMMAS = defaultdict(set)

with open("nominals.txt") as f:
    for line in f:
        lemma, mounce1, aspect_voice, gender, mounce2, theme1, case_number, norm, theme2, distinguisher, explanation = line.strip(
        ).split()

        CELLS[mounce2][case_number + gender].add(
            (strip_breathing(strip_accents(distinguisher)), explanation))
        GENDER[mounce2].add(gender)
        LEMMAS[mounce2 + " " + gender].add(lemma)

for mounce in sorted(CELLS, key=lambda x: x[0] + x[2:]):
    for gender in ["M", "F", "N", "-"]:
        if gender in GENDER[mounce]:
            print("\n\n{} {} ({}):".format(mounce, gender,
                                           len(LEMMAS[mounce + " " + gender])))
            for case_number in [
                    "NS", "GS", "DS", "AS", "VS", "NP", "VP", "GP", "DP", "AP"
            ]:
                if case_number + gender in CELLS[mounce]:
                    if len(CELLS[mounce][case_number + gender]) == 1:
                        cell = CELLS[mounce][case_number + gender].pop()
                        print("    {}:   {:10} {{{}}}".format(
        else:
            lexeme = lexemes[lemma]
            try:
                mounce_cat = lexeme["mounce-morphcat"]
            except:
                error("{} has no mounce-morphcat".format(lemma))
        if not isinstance(mounce_cat, list):
            mounce_cat = [mounce_cat]

        for cat in mounce_cat:
            mounce_by_lemma[lemma].add(cat)

        new_mounce_cat = map_non_noun_categories(mounce_cat, aspect_voice, gender, lemma)

        orig_norm = norm
        norm = decompose_breathing(strip_accents(norm))

        success = False
        for ending_and_class_regex in noun_endings[case_number + gender]:
            try:
                ending, class_regex, explanation = ending_and_class_regex.split()
            except ValueError:
                error("{}\n{} {}".format(row["bcv"], case_number + gender, ending_and_class_regex))

            if norm.endswith(ending.replace(".", "")):
                success = set()
                for cat in new_mounce_cat:
                    if re.match(class_regex, cat):
                        success.add(cat)
                if success:
                    break
Пример #6
0
    def generate(self, lemma, parse, allow_form_override=True, context=None):
        """Build the inflected form(s) of ``lemma`` for ``parse``.

        Steps, in order:

        1. If the lexicon entry lists an explicit form for ``parse`` (and
           ``allow_form_override`` is true) that form is returned as-is.
        2. Stems come from ``self.regex_list`` as a ``/``-separated string.
        3. For every stem, each rule ``s1|s2>s3<s4|s5`` of
           ``stemming_rules[parse]`` whose ``s1 + s2`` (accents stripped)
           ends the stem yields a base (the stem minus ``s2``) and the
           ending ``s3 + s5`` -- or the lexicon's ``endings`` override.
           Rules with an empty ``s1 + s2`` are used only when no other rule
           matched that stem.
        4. Every base + ending is accented by the branch table below.

        Args:
            lemma: key into ``self.lexicon``.
            parse: parse code; also the key into ``stemming_rules``.
            allow_form_override: consult the lexicon's ``forms`` table first.
            context: forwarded to ``self.regex_list``.

        Returns:
            The distinct generated forms joined with ``/``; the lexicon's
            explicit form when one applies; or ``None`` when the lemma is
            unknown, ``regex_list`` returns ``None``, ``parse`` has no
            stemming rules, or a rule ``ref`` points at a missing parse.
        """
        if lemma not in self.lexicon:
            return None
        lexeme = self.lexicon[lemma]

        if allow_form_override:
            listed_form = lexeme.get("forms", {}).get(parse)
            if listed_form:
                return listed_form

        stems = self.regex_list(lemma, parse, context)

        # Accent information is keyed by the part of the parse before the
        # first "."; the value "enclitic" is a flag, anything else is passed
        # to persistent() as the accent override.
        accent_override = None
        is_enclitic = False
        if "." in parse:
            accents = lexeme.get("accents", {}).get(parse.partition(".")[0])
            is_enclitic = accents == "enclitic"
            if not is_enclitic:
                accent_override = accents

        ending_override = lexeme.get("endings", {}).get(parse)

        if stems is None:
            return None
        stem_list = stems.split("/")

        if parse not in stemming_rules:
            return None

        answers = []
        tense_voice_mood = parse[0:3]

        for stem in stem_list:
            stem = debreath(stem)

            # Follow {"ref": other_parse} indirections to the real rule list.
            rules = stemming_rules[parse]
            while isinstance(rules, dict) and "ref" in rules:
                target = rules["ref"]
                if target not in stemming_rules:
                    # @@@ raise error?
                    return None
                rules = stemming_rules[target]

            specific = []  # rules that constrain the end of the stem
            fallback = []  # rules with empty s1 + s2
            for rule in rules:
                s1, s234, s5 = rule.split("|")
                s2, s34 = s234.split(">")
                s3, s4 = s34.split("<")

                if not stem.endswith(strip_accents(s1 + s2)):
                    continue
                base = stem[:-len(s2)] if s2 else stem

                if ending_override:
                    ending_list = ending_override.split("/")
                else:
                    ending_list = [s3 + s5]

                bucket = specific if s1 + s2 else fallback
                bucket.append((base, ending_list))

            # only use the fallback rules if there are no other options
            candidates = specific or fallback

            for base, ending_list in candidates:
                for ending in ending_list:
                    word = base + ending

                    if accent(ending):
                        answers.append(word.replace("|", ""))
                    elif is_enclitic:
                        answers.append(make_oxytone(word).replace("|", ""))
                    elif parse[2] == "P":
                        if accent_override:
                            answers.append(persistent(word, accent_override))
                        elif (
                            (parse == "AAP.NSM" and ending in ("ων", "_3+ς"))
                            or (parse == "PAP.NSM" and ending == "_3+ς")
                        ):
                            answers.append(make_oxytone(word).replace("|", ""))
                        elif tense_voice_mood == "AAP" and parse != "AAP.NSM":
                            # accent follows the NSM when it ends in one of
                            # the listed suffixes, otherwise the lemma
                            nsm_forms = self.generate(
                                lemma, "AAP.NSM", context=context).split("/")
                            for nsm in nsm_forms:
                                model = nsm if nsm.endswith(("ών", "ούς")) else lemma
                                answers.append(persistent(word, model))
                        elif tense_voice_mood == "PAP" and parse != "PAP.NSM":
                            # accent follows the NSM
                            # NOTE(review): unlike the AAP branch, context is
                            # not forwarded here -- confirm that is intended.
                            nsm_forms = self.generate(lemma, "PAP.NSM").split("/")
                            answers.extend(
                                persistent(word, strip_length(nsm))
                                for nsm in nsm_forms)
                        else:
                            answers.append(recessive(word, default_short=True))
                    elif (
                        tense_voice_mood in ["AAN", "XAN", "XMN", "XPN"]
                        or (tense_voice_mood == "PAN" and stem.endswith("!"))
                    ):
                        answers.append(on_penult(word, default_short=True))
                    else:
                        answers.append(recessive(word, default_short=True))

        return "/".join(remove_duplicates(rebreath(w) for w in answers))
                        "F": "n-1b",
                        "N": "n-2c",
                    }[gender]
                elif aspect_voice == "FP":
                    cat = {
                        "M": "n-2a",
                        "F": "n-1b",
                        "N": "n-2c",
                    }[gender]
                else:
                    assert False, aspect_voice

            new_mounce_cat.append(cat)

        orig_norm = norm
        norm = strip_accents(norm)
        norm = norm.replace("ἡ", "hη")
        norm = norm.replace("ὁ", "hο")
        norm = norm.replace("οὑ", "hου")
        norm = norm.replace("οἱ", "hοι")
        norm = norm.replace("αἱ", "hαι")
        norm = norm.replace("εἱ", "hει")
        norm = norm.replace("ἁ", "hα")
        norm = norm.replace("ἑ", "hε")
        norm = norm.replace("ὡ", "hω")
        norm = norm.replace("ὑ", "hυ")
        norm = norm.replace("ᾑ", "hῃ")
        norm = norm.replace("ᾡ", "hῳ")
        norm = norm.replace("οὐ", "ου")
        norm = norm.replace("ὠ", "ω")
        norm = norm.replace("ὀ", "ο")