def __init__(self, manuscript_id: str, source: str, grammemes: List[str]):
    """Create a word from its raw transcription and grammeme columns.

    :param manuscript_id: Identifier stored on the instance as-is
    :param source: Raw token text; Latin homoglyphs are replaced with
        their Cyrillic counterparts before any further processing
    :param grammemes: Annotation columns; the first one decides whether
        a tagset is built at all
    """
    self.manuscript_id = manuscript_id

    text = replace_chars(source, latin_homoglyphs, cyrillic_homoglyphs)
    erroneous = None

    # A token carrying a correction looks like "~wrong <right>": the part
    # before "<" (minus the leading marker and the character preceding
    # "<") is kept as the error, the part inside the brackets becomes
    # the actual source.
    if "<" in source:
        opening = text.index("<")
        closing = text.index(">")
        erroneous = text[1:opening - 1]
        text = text[opening + 1:closing]

    self.error = erroneous

    # A leading asterisk flags the word and is stripped from the text
    self.is_proper = text.startswith("*")
    if self.is_proper:
        text = text[1:]
    self.source = text

    # Only build a tagset when the first annotation column is non-empty
    self.tagset = None
    if grammemes[0]:
        self.tagset = tagset_factory(grammemes)

    # POS alias for convenience
    self.pos = None if self.tagset is None else self.tagset.pos

    self.norm: Optional[str] = None
    self.lemma: Optional[str] = None
def normalize(cls, word: Word) -> str:
    """Return the normalized form of a word's source text.

    The source is stripped and upper-cased, milestones are removed,
    numerals are short-circuited, special characters are converted and
    the result is passed through ``modif`` together with the part of
    speech.

    :param word: Word to normalize; its ``tagset`` may be ``None``
    :return: Normalized string (or the numeral representation for
        cardinal and ordinal numbers)
    """
    res = word.source.strip().upper()

    # The tagset may be missing altogether, and not every tagset carries
    # a ``note`` attribute, so look it up defensively instead of
    # dereferencing ``word.tagset.note`` directly.
    note = getattr(word.tagset, "note", None)

    # Remove yer before linebreak unless tagged otherwise
    # NOTE(review): with no note at all the yer is left untouched;
    # confirm this matches the intent of "unless tagged otherwise".
    if note is not None and not ("+ъ" in note or "+ь" in note):
        res = cls._replace_yer_before_linebreak(res)

    # Remove milestones
    res = re.sub(Milestone.REGEX, "", res)

    if word.is_cardinal_number():
        return word.tagset.pos  # Non-spelled out numerals

    if word.is_ordinal_number():
        # Spelled-out numerals: drop brackets and spaces before parsing
        bare = res.replace("(", "").replace(")", "").replace(" ", "")
        return str(Number(bare))

    res = replace_chars(
        res,
        characters.latin_special_characters,
        characters.cyrillic_special_characters,
    )

    # Izhitsa positional replacement; positions are collected up front
    # because ``res`` is rebuilt on every iteration
    for idx in [idx for idx, char in enumerate(res) if char == "V"]:
        res = res[:idx] + cls._replace_izhitsa(res, idx) + res[idx + 1:]

    # Orthography normalization
    res = modif(res, word.tagset.pos if word.tagset is not None else "")

    return res.replace("#", "").replace("(", "").replace(")", "")
def __init__(self, pos: str, grammemes: List[str]):
    """Build a verb tagset from the part of speech and grammeme columns.

    The first grammeme is the mood; it determines how the remaining
    columns are interpreted and which attributes get set.

    :param pos: Part of speech tag, passed to the parent initializer
    :param grammemes: Annotation columns following the part of speech
    """
    super().__init__(pos)

    # Additional property derived from the "/в" suffix of the POS tag
    self.is_reflexive = self.pos.endswith("/в")

    mood = replace_chars(
        grammemes[0],
        latin_lowercase_homoglyphs,
        cyrillic_lowercase_homoglyphs,
    )
    # Unify the spelling variant with the soft sign
    self.mood = mood.replace("изьяв", "изъяв")

    if self.mood == "изъяв":
        self.tense = grammemes[1]
        self.number = grammemes[3].split("/")[-1]

        # The third column holds either a person (numeric) or a gender
        person_or_gender = grammemes[2]
        if person_or_gender.isnumeric():
            self.person = person_or_gender
        else:
            self.gender = person_or_gender

        # The fifth column holds either a numeric class or a role
        leading = grammemes[4].split("/")[0]
        if leading.isnumeric():
            self.cls = leading
        else:
            self.role = grammemes[4]
        return

    if self.mood == "сосл":
        person_or_gender = grammemes[1]
        if person_or_gender.isnumeric():
            self.person = person_or_gender
        else:
            self.gender = person_or_gender

        self.number = grammemes[2].split("/")[-1]
        self.role = grammemes[3]
        return

    # Any other mood: person, number and class in fixed positions
    self.person = grammemes[1]
    self.number = grammemes[2]
    self.cls = grammemes[3].split("/")[0]
def __replace_overline_chars(cls, match: re.Match) -> str:
    """
    Replacement callback for characters written in parentheses.

    :param match: Match object whose first group is the text in parentheses
    :return: The group upper-cased and mapped from ``OVERLINE_ASCII`` to
        ``OVERLINE_UNICODE``, prefixed with the combining mark when the
        original group was lower case
    """
    inner = match.group(1)
    converted = replace_chars(
        inner.upper(), cls.OVERLINE_ASCII, cls.OVERLINE_UNICODE)

    if inner.islower():
        return "҇" + converted
    return converted
def convert(cls, text: str) -> str:
    """Convert an ASCII transcription to its Unicode representation.

    Parenthesized groups are handled by the overline replacer, inline
    characters are mapped from ``INLINE_ASCII`` to ``INLINE_UNICODE``,
    and a ``#`` marker is replaced by a combining sign inserted at a
    position computed by ``__count_chars``.

    :param text: Transcription to convert
    :return: Lower-cased converted text
    """
    converted = re.sub(r"\((.+?)\)", cls.__replace_overline_chars, text)
    converted = replace_chars(converted, cls.INLINE_ASCII, cls.INLINE_UNICODE)

    if "#" in converted:
        converted = converted.replace("#", "")
        # 1 when the count exceeds one, otherwise 0; fed back into the
        # counter to obtain the insertion offset
        number = int(cls.__count_chars(converted) > 1)
        cut = cls.__count_chars(converted, number) + 1
        converted = converted[:cut] + "҃" + converted[cut:]

    return converted.replace("ѡⷮ", "ѿ").lower()
def parse_line(manuscript_id: str, line: str) -> Tuple[List[Row], List[str]]:
    """Split a raw line into row objects and trailing numbers.

    Everything after the last ``/`` is split on whitespace and returned
    as the second element; everything before it is tokenized, glued back
    together where a token spans several pieces, and wrapped into rows.

    :param manuscript_id: Identifier passed to every created row
    :param line: Raw input line
    :return: Tuple of (rows, whitespace-separated items after the last "/")
    """
    slash = line.rfind("/")
    nums = line[slash + 1:].split()

    # XML tags are captured as separate tokens, everything else is split
    # on whitespace; empty strings and None pieces are discarded
    pieces = re.split(r'(</.+?>|<[a-z].+?">)|\s+', line[:slash])
    toks = [
        piece
        if piece.startswith("</") or piece.endswith('">')
        else replace_chars(piece, latin_homoglyphs, cyrillic_homoglyphs)
        for piece in pieces
        if piece
    ]

    # Assemble tokens out of multiple pieces
    i = 0
    while i < len(toks):
        current = toks[i]

        if current == "Z":
            # Page breaks: glue the break marker to the piece after it
            toks[i] = current + " " + toks.pop(i + 1)
        elif current.endswith("Z"):
            # Breaks *before* errors: cf. '~АБВZ -123 ГДЖ <АБВZ -123 ГДЕ>'
            second, third = toks[i + 1], toks[i + 2]
            toks[i] = f"{current} {second} {third}"
            del toks[i + 1:i + 3]

        # Erroneous spellings followed by a correction in angle brackets
        if len(toks) > i + 1:
            following = toks[i + 1]
            opens_correction = (
                following.startswith("<")
                and not following.startswith("</")
                and not following.endswith('">')
            )
            if opens_correction:
                corr = toks.pop(i + 1)
                # A correction may span several pieces
                while ">" not in corr:
                    corr = corr + " " + toks.pop(i + 1)
                toks[i] = toks[i] + " " + corr

        # Hanging punctuation on the right and minor breaks
        if len(toks) > i + 1 and re.match(r"[.,:;\]&\\]+", toks[i + 1]):
            toks[i] += toks.pop(i + 1)

        i += 1

    def build_row(tok: str):
        # Tokens fully enclosed in angle brackets become XML rows
        is_xml = tok.startswith("<") and tok.endswith(">")
        row_type = XMLRow if is_xml else WordRow
        return row_type(manuscript_id, [tok] + [""] * 6)

    return [build_row(tok) for tok in toks], nums
def tagset_factory(columns: List[str]):
    """Create the tagset object matching the part of speech in *columns*.

    :param columns: Annotation columns; the first one is the part of
        speech, the rest are passed on to the specialized tagset
    :return: A noun, pronoun, verb or participle tagset, or a plain
        ``Tagset`` for any other part of speech
    """
    pos = replace_chars(
        columns[0],
        latin_lowercase_homoglyphs,
        cyrillic_lowercase_homoglyphs,
    )

    # Ignore ambiguous part of speech: unless the tag ends in one of the
    # recognized suffixes, keep only what follows the last slash
    if "/" in pos and not re.search(r"/([внп]|ср)$", pos):
        pos = pos.split("/")[-1]

    grammemes = columns[1:]

    if pos in ("сущ", "прил", "прил/ср", "числ", "числ/п"):
        tagset_type = NounTagset
    elif pos == "мест":
        # Personal pronouns get their own tagset, the rest decline as nouns
        tagset_type = PronounTagset if columns[1] == "личн" else NounTagset
    elif pos in ("гл", "гл/в"):
        tagset_type = VerbTagset
    elif pos in ("прич", "прич/в"):
        tagset_type = ParticipleTagset
    else:
        return Tagset(pos)

    return tagset_type(pos, grammemes)
def __init__(self, pos: str, grammemes: List[str]):
    """Build a participle tagset from the part of speech and grammemes.

    :param pos: Part of speech tag, passed to the parent initializer
    :param grammemes: Columns in the order declension, tense, case,
        number, gender
    """
    super().__init__(pos)

    # Additional property derived from the "/в" suffix of the POS tag
    self.is_reflexive = self.pos.endswith("/в")

    declension = replace_chars(
        grammemes[0],
        cyrillic_lowercase_homoglyphs,
        latin_lowercase_homoglyphs,
    )
    # A single value is duplicated so the attribute always has two slots
    # when no slash is present
    parts = declension.split("/")
    self.declension = parts if "/" in declension else [declension, declension]

    self.tense = grammemes[1]
    # For slash-separated alternatives only the last one is kept
    self.case = grammemes[2].split("/")[-1]
    self.number = grammemes[3].split("/")[-1]

    gender = grammemes[4]
    self.gender = "м" if gender == "0" else gender.split("/")[-1]

    # Additional property: voice follows from the first declension value
    is_passive = self.declension[0] in ("a", "o", "тв")
    self.voice = "пас" if is_passive else "акт"
def __init__(self, pos: str, grammemes: List[str]):
    """Build a nominal tagset from the part of speech and grammemes.

    :param pos: Part of speech tag, passed to the parent initializer
    :param grammemes: Columns in the order declension, case, number,
        gender, note
    """
    super().__init__(pos)

    # Save both the etymologic and morphological declension type;
    # homoglyph replacement is skipped when the part of speech is "мест"
    raw = grammemes[0]
    if self.pos == "мест":
        declension = raw
    else:
        declension = replace_chars(
            raw,
            cyrillic_lowercase_homoglyphs,
            latin_lowercase_homoglyphs,
        )

    slashes = declension.count("/")
    if slashes == 0:
        self.declension = [declension, declension]
    elif slashes > 1:
        # With several slashes, split off only one side so that the
        # slash-containing "р/скл" value stays in one piece
        if declension.startswith("р/скл"):
            self.declension = declension.rsplit("/", 1)
        else:
            self.declension = declension.split("/", 1)
    elif declension == "р/скл":
        self.declension = [declension, declension]
    else:
        self.declension = declension.split("/")

    # Save the factual case only
    self.case = grammemes[1].split("/")[-1]

    # Additional property
    number = grammemes[2]
    self.is_plurale_tantum = number == "pt"
    if self.is_plurale_tantum:
        self.number = "мн"
    elif number == "0":
        self.number = "ед"
    else:
        self.number = number.split("/")[-1]

    gender = grammemes[3]
    self.gender = "м" if gender == "0" else gender.split("/")[-1]

    # Additional property for morphonological notes
    self.note = grammemes[4]