예제 #1
0
    def __init__(self, manuscript_id: str, source: str, grammemes: List[str]):
        """
        Build a word from its raw transcription and its annotation columns.
        :param manuscript_id: stored unchanged on the instance
        :param source: raw token text; may carry a correction in the form
                       ``~error <correction>`` and/or a leading ``*`` marker
        :param grammemes: annotation columns handed to ``tagset_factory``;
                          an empty list or an empty first column leaves the
                          word without a tagset
        :raises ValueError: if the text contains ``<`` but no ``>``
        """
        self.manuscript_id = manuscript_id
        # NOTE(review): presumably maps Latin look-alike letters to their
        # Cyrillic counterparts - confirm against replace_chars
        self.source = replace_chars(source, latin_homoglyphs,
                                    cyrillic_homoglyphs)
        self.error = None

        # Extract correction with break characters removed. The check is made
        # on the same string that is indexed below, so the two cannot disagree
        if "<" in self.source:
            opening = self.source.index("<")
            closing = self.source.index(">")
            self.error, self.source = (
                self.source[1:opening - 1],  # Text between ~ and <
                self.source[opening + 1:closing],  # Text between < and >
            )

        self.is_proper = self.source.startswith("*")
        if self.is_proper:
            self.source = self.source[1:]  # Remove property marker

        # Check that tagset is not empty before assignment; an empty column
        # list is treated the same way as an empty first column
        has_tagset = bool(grammemes) and bool(grammemes[0])
        self.tagset = tagset_factory(grammemes) if has_tagset else None

        # Add POS alias for convenience
        self.pos = self.tagset.pos if self.tagset is not None else None
        self.norm: Optional[str] = None
        self.lemma: Optional[str] = None
예제 #2
0
    def normalize(cls, word: Word) -> str:
        """
        Produce the normalized form of a word's source text.
        :param word: word whose ``source``, ``tagset`` and number predicates
                     are consulted
        :return: the part-of-speech tag for cardinal numbers, the string form
                 of ``Number`` for ordinal numbers, otherwise the normalized
                 text with ``#``, ``(`` and ``)`` stripped
        """
        res = word.source.strip().upper()
        tagset = word.tagset

        # Remove yer before linebreak unless tagged otherwise. A word may have
        # no tagset at all (it is guarded against below as well); then there
        # is no note to consult and the step is skipped instead of failing
        note = tagset.note if tagset is not None else None
        if note is not None and not ("+ъ" in note or "+ь" in note):
            res = cls._replace_yer_before_linebreak(res)

        # Remove milestones
        res = re.sub(Milestone.REGEX, "", res)

        if word.is_cardinal_number():
            return tagset.pos  # Non-spelled out numerals
        if word.is_ordinal_number():
            # Spelled-out numerals: drop parentheses and spaces first
            digits = res.replace("(", "").replace(")", "").replace(" ", "")
            return str(Number(digits))

        res = replace_chars(
            res,
            characters.latin_special_characters,
            characters.cyrillic_special_characters,
        )

        # Izhitsa positional replacement. Positions are collected before any
        # substitution is made.
        # NOTE(review): assumes _replace_izhitsa returns exactly one
        # character, otherwise later positions would shift - confirm
        for idx in [idx for idx, char in enumerate(res) if char == "V"]:
            res = res[:idx] + cls._replace_izhitsa(res, idx) + res[idx + 1:]

        # Orthography normalization
        res = modif(res, tagset.pos if tagset is not None else "")

        return res.replace("#", "").replace("(", "").replace(")", "")
예제 #3
0
    def __init__(self, pos: str, grammemes: List[str]):
        """
        Fill in verb-specific properties from the annotation columns.

        The column layout depends on the mood found in the first column:
        ``изъяв`` reads tense, person-or-gender, number and class-or-role;
        ``сосл`` reads person-or-gender, number and role; any other mood
        reads person, number and class.
        :param pos: part-of-speech tag, passed on to the parent initializer
        :param grammemes: annotation columns following the part of speech
        """
        super().__init__(pos)

        def store_person_or_gender(value: str) -> None:
            # A numeric column is kept as person, anything else as gender
            if value.isnumeric():
                self.person = value
            else:
                self.gender = value

        # Additional property
        self.is_reflexive = self.pos.endswith("/в")

        # The misspelling with a soft sign is folded into the canonical one
        mood = replace_chars(grammemes[0], latin_lowercase_homoglyphs,
                             cyrillic_lowercase_homoglyphs)
        self.mood = mood.replace("изьяв", "изъяв")

        if self.mood == "изъяв":
            self.tense = grammemes[1]
            # Only the part after the last slash is kept
            self.number = grammemes[3].rsplit("/", 1)[-1]
            store_person_or_gender(grammemes[2])

            # The last column holds either a numeric class (before the first
            # slash) or, failing that, a role stored as is
            leading = grammemes[4].partition("/")[0]
            if leading.isnumeric():
                self.cls = leading
            else:
                self.role = grammemes[4]
        elif self.mood == "сосл":
            store_person_or_gender(grammemes[1])
            self.number = grammemes[2].rsplit("/", 1)[-1]
            self.role = grammemes[3]
        else:
            self.person = grammemes[1]
            self.number = grammemes[2]
            self.cls = grammemes[3].partition("/")[0]
예제 #4
0
 def __replace_overline_chars(cls, match: re.Match) -> str:
     """
     Substitution callback for characters written in parentheses.
     :param match: match object whose first group holds the characters
                   found between the parentheses
     :return: the upper-cased group mapped from OVERLINE_ASCII to
              OVERLINE_UNICODE, prefixed with a combining mark when the
              group was written in lower case
     """
     captured = match.group(1)
     converted = replace_chars(
         captured.upper(), cls.OVERLINE_ASCII, cls.OVERLINE_UNICODE)
     if captured.islower():
         return "҇" + converted
     return converted
예제 #5
0
    def convert(cls, text: str) -> str:
        """
        Convert an ASCII transcription into its Unicode representation.
        :param text: transcription; parenthesised groups are handled by the
                     overline replacer and ``#`` requests a combining mark
        :return: converted text in lower case
        """
        # Parenthesised groups first, then the remaining inline characters
        converted = re.sub(r"\((.+?)\)", cls.__replace_overline_chars, text)
        converted = replace_chars(converted, cls.INLINE_ASCII,
                                  cls.INLINE_UNICODE)

        if "#" in converted:
            converted = converted.replace("#", "")
            # 1 when the character count exceeds one, otherwise 0; the value
            # is handed back to the counter to locate the insertion point
            number = int(cls.__count_chars(converted) > 1)
            cut = cls.__count_chars(converted, number) + 1
            converted = converted[:cut] + "҃" + converted[cut:]

        # Fold the two-codepoint sequence into its single-character form
        return converted.replace("ѡⷮ", "ѿ").lower()
예제 #6
0
def parse_line(manuscript_id: str, line: str) -> Tuple[List[Row], List[str]]:
    """
    Split one input line into rows and trailing number strings.

    Everything after the last ``/`` is whitespace-split and returned as is.
    Everything before it is tokenized: tag-like pieces (``</...>`` or
    ``<x..."">``-style openings ending in ``">``) stay separate, the rest is
    split on whitespace and then glued back together where a token continues
    over a break, carries a correction, or is followed by punctuation.
    :param manuscript_id: passed unchanged to every created row
    :param line: raw line of text
    :return: tuple of (rows, number strings); a token that both starts with
             ``<`` and ends with ``>`` becomes an ``XMLRow``, any other a
             ``WordRow``
    :raises IndexError: if a break marker or an unclosed correction runs past
                        the end of the token list
    """
    slash_at = line.rfind("/")
    nums = line[slash_at + 1:].split()

    # re.split yields None/empty entries around the optional capture group;
    # they are dropped, and only non-tag tokens get homoglyphs replaced
    pieces = re.split(r'(</.+?>|<[a-z].+?">)|\s+', line[:slash_at])
    tokens = [
        piece if piece.startswith("</") or piece.endswith('">') else
        replace_chars(piece, latin_homoglyphs, cyrillic_homoglyphs)
        for piece in pieces if piece
    ]

    trailing_marks = re.compile(r"[.,:;\]&\\]+")
    pos = 0

    # Assemble tokens out of multiple pieces
    while pos < len(tokens):
        # Page breaks: a lone marker absorbs the token that follows it
        if tokens[pos] == "Z":
            follower = tokens[pos + 1]
            tokens[pos:pos + 2] = [" ".join((tokens[pos], follower))]

        # Breaks *before* errors: cf. '~АБВZ -123 ГДЖ <АБВZ -123 ГДЕ>'
        elif tokens[pos].endswith("Z"):
            first, second = tokens[pos + 1], tokens[pos + 2]
            tokens[pos:pos + 3] = [" ".join((tokens[pos], first, second))]

        # Erroneous spellings: the next token opens with "<" but is not a tag
        if (pos + 1 < len(tokens) and tokens[pos + 1].startswith("<")
                and not tokens[pos + 1].startswith("</")
                and not tokens[pos + 1].endswith('">')):
            correction = tokens[pos + 1]
            del tokens[pos + 1]

            # A correction may span several tokens; collect until it closes
            while ">" not in correction:
                correction += " " + tokens[pos + 1]
                del tokens[pos + 1]

            tokens[pos] = " ".join((tokens[pos], correction))

        # Hanging punctuation on the right and minor breaks
        if pos + 1 < len(tokens) and trailing_marks.match(tokens[pos + 1]):
            tokens[pos] += tokens[pos + 1]
            del tokens[pos + 1]

        pos += 1

    rows = []
    for token in tokens:
        is_tag = token.startswith("<") and token.endswith(">")
        row_type = XMLRow if is_tag else WordRow
        # Each row gets the token followed by six empty columns
        rows.append(row_type(manuscript_id, [token] + [""] * 6))

    return rows, nums
예제 #7
0
def tagset_factory(columns: List[str]):
    """
    Pick and build the tagset class matching the part of speech.
    :param columns: annotation columns; the first one is the part of speech,
                    the remaining ones are handed to the tagset constructor
    :return: a ``NounTagset``, ``PronounTagset``, ``VerbTagset`` or
             ``ParticipleTagset`` for the recognised tags, otherwise a plain
             ``Tagset`` built from the tag alone
    """
    pos = replace_chars(columns[0], latin_lowercase_homoglyphs,
                        cyrillic_lowercase_homoglyphs)

    # Ignore ambiguous part of speech: unless the slash introduces one of the
    # recognised suffixes, only the part after the last slash is kept
    if "/" in pos and not re.search(r"/([внп]|ср)$", pos):
        pos = pos.rsplit("/", 1)[-1]

    grammemes = columns[1:]

    if pos == "мест":
        # Only pronouns marked "личн" in the next column get PronounTagset
        tagset_type = PronounTagset if columns[1] == "личн" else NounTagset
        return tagset_type(pos, grammemes)
    if pos in ("сущ", "прил", "прил/ср", "числ", "числ/п"):
        return NounTagset(pos, grammemes)
    if pos in ("гл", "гл/в"):
        return VerbTagset(pos, grammemes)
    if pos in ("прич", "прич/в"):
        return ParticipleTagset(pos, grammemes)

    return Tagset(pos)
예제 #8
0
    def __init__(self, pos: str, grammemes: List[str]):
        """
        Fill in participle-specific properties from the annotation columns.
        :param pos: part-of-speech tag, passed on to the parent initializer
        :param grammemes: columns read in the order declension, tense, case,
                          number, gender
        """
        super().__init__(pos)

        # Additional property
        self.is_reflexive = self.pos.endswith("/в")

        # A slash-separated declension is split into its parts; a single
        # value is stored twice
        raw_declension = replace_chars(grammemes[0],
                                       cyrillic_lowercase_homoglyphs,
                                       latin_lowercase_homoglyphs)
        if "/" not in raw_declension:
            self.declension = [raw_declension, raw_declension]
        else:
            self.declension = raw_declension.split("/")

        self.tense = grammemes[1]
        # For case, number and gender only the part after the last slash
        # is kept
        self.case = grammemes[2].rsplit("/", 1)[-1]
        self.number = grammemes[3].rsplit("/", 1)[-1]

        gender_column = grammemes[4]
        if gender_column == "0":
            self.gender = "м"  # "0" falls back to this default
        else:
            self.gender = gender_column.rsplit("/", 1)[-1]

        # Additional property derived from the first declension value
        if self.declension[0] in ("a", "o", "тв"):
            self.voice = "пас"
        else:
            self.voice = "акт"
예제 #9
0
    def __init__(self, pos: str, grammemes: List[str]):
        """
        Fill in nominal properties from the annotation columns.
        :param pos: part-of-speech tag, passed on to the parent initializer
        :param grammemes: columns read in the order declension, case, number,
                          gender, note
        """
        super().__init__(pos)

        # Save both the etymologic and morphological declension type.
        # Pronouns ("мест") keep the column as is; everything else goes
        # through homoglyph replacement first
        if self.pos == "мест":
            declension = grammemes[0]
        else:
            declension = replace_chars(grammemes[0],
                                       cyrillic_lowercase_homoglyphs,
                                       latin_lowercase_homoglyphs)

        # NOTE(review): the "р/скл" literals below are Cyrillic - confirm the
        # replacement above leaves these letters untouched for non-pronouns
        slashes = declension.count("/")
        if slashes == 0 or declension == "р/скл":
            # A single type (including the bare "р/скл") is stored twice
            self.declension = [declension, declension]
        elif slashes == 1:
            self.declension = declension.split("/")
        elif declension.startswith("р/скл"):
            # Keep the leading "р/скл" intact: split at the last slash
            self.declension = declension.rsplit("/", 1)
        else:
            # Otherwise split at the first slash only
            self.declension = declension.split("/", 1)

        # Save the factual case only
        self.case = grammemes[1].rsplit("/", 1)[-1]

        # Additional property
        number_column = grammemes[2]
        self.is_plurale_tantum = number_column == "pt"
        if self.is_plurale_tantum:
            self.number = "мн"
        elif number_column == "0":
            self.number = "ед"  # "0" falls back to this default
        else:
            self.number = number_column.rsplit("/", 1)[-1]

        gender_column = grammemes[3]
        if gender_column == "0":
            self.gender = "м"  # "0" falls back to this default
        else:
            self.gender = gender_column.rsplit("/", 1)[-1]

        # Additional property for morphonological notes
        self.note = grammemes[4]