def toString(self, short_variant: bool, lang: 'MorphLang' = None, lev: int = 0) -> str:
    """Build the display string for this unit: its name followed by its power.

    Args:
        short_variant: when true the name is taken from an ATTR_NAME slot,
            otherwise from an ATTR_FULLNAME slot.
        lang: optional language preference. On the first pass over the slots
            a value is accepted only if ``lang.is_ru`` agrees with
            ``LanguageHelper.isCyrillic(value)``; the second pass accepts the
            first slot of the wanted type regardless of script.
        lev: when greater than zero the power is omitted and only the name
            is returned.

    Returns:
        The name alone when there is no power (or ``lev > 0``); otherwise
        ``name + power``, or ``name<power>`` when the power starts with '-'.
        A missing name is rendered as "?". For the long variant of a unit
        whose ``is_unknown`` is set, the result is prefixed with "(?)".
    """
    wanted_slot = (UnitReferent.ATTR_NAME if short_variant
                   else UnitReferent.ATTR_FULLNAME)
    nam = None
    for l_ in range(2):
        for s in self.slots:
            if s.type_name != wanted_slot:
                continue
            val = Utils.asObjectOrNull(s.value, str)
            # The script filter applies on the first pass only, so the second
            # pass can fall back to a name in any script.
            if lang is not None and l_ == 0:
                if lang.is_ru != LanguageHelper.isCyrillic(val):
                    continue
            nam = val
            break
        if nam is not None:
            break
    if nam is None:
        nam = self.getStringValue(UnitReferent.ATTR_NAME)
    # Substitute the placeholder before any formatting: str.format() would
    # otherwise render a missing name as the literal text "None".
    nam = Utils.ifNotNull(nam, "?")
    pow0_ = self.getStringValue(UnitReferent.ATTR_POW)
    if Utils.isNullOrEmpty(pow0_) or lev > 0:
        return nam
    if pow0_[0] != '-':
        res = "{0}{1}".format(nam, pow0_)
    else:
        # Powers starting with '-' are wrapped in angle brackets.
        res = "{0}<{1}>".format(nam, pow0_)
    if not short_variant and self.is_unknown:
        res = "(?)" + res
    return res
def run(self, text: str, only_tokenizing: bool, dlang: 'MorphLang', progress: EventHandler, good_text: bool) -> typing.List['MorphToken']:
    """ Perform morphological analysis of a text.

    The text is split into tokens, a default language is chosen from
    per-language word counts, every distinct word is analysed once, and the
    token list is then post-processed (token merging, character-case info,
    surname variants).

    Args:
        text(str): source text
        only_tokenizing(bool): when true no word forms are looked up and the
            function returns right after the character-info pass
        dlang(MorphLang): initial value of the default language; it is kept
            only when the word counts below do not select another language
        progress(EventHandler): optional handler, forwarded to __onProgress
            once per distinct word
        good_text(bool): when true the source fragments are used as terms
            unchanged, the token-merging pass and the case flags are skipped,
            and the function returns right after the character-info pass

    Returns:
        typing.List[MorphToken]: resulting token sequence, or None when the
        text is null or empty
    """
    if (Utils.isNullOrEmpty(text)):
        return None
    twr = TextWrapper(text, good_text)
    twrch = twr.chars
    res = list()
    # Distinct term -> UniLexWrap, so that each word is analysed only once.
    uni_lex = dict()
    term0 = None
    # pure_*: tokens whose detected language equals exactly that language.
    pure_rus_words = 0
    pure_ukr_words = 0
    pure_by_words = 0
    pure_kz_words = 0
    # tot_*: tokens whose detected language mask includes that language.
    tot_rus_words = 0
    tot_ukr_words = 0
    tot_by_words = 0
    tot_kz_words = 0
    # --- Pass 1: tokenization and per-language word counting ---
    i = 0
    first_pass2708 = True
    while True:
        if first_pass2708: first_pass2708 = False
        else: i += 1
        if (not (i < twr.length)): break
        ty = InnerMorphology._getCharTyp(twrch[i])
        # Type 0 characters produce no token.
        if (ty == 0):
            continue
        if (ty > 2):
            # Types above 2 always form a single-character token.
            j = (i + 1)
        else:
            # Types 1 and 2 are grouped into a run of same-typed characters.
            j = (i + 1)
            while j < twr.length:
                if (InnerMorphology._getCharTyp(twrch[j]) != ty):
                    break
                j += 1
        # Equivalent to text[i:j].
        wstr = text[i:i + j - i]
        term = None
        if (good_text):
            term = wstr
        else:
            trstr = LanguageHelper.transliteralCorrection(wstr, term0, False)
            term = LanguageHelper.correctWord(trstr)
        if (Utils.isNullOrEmpty(term)):
            # Skip the whole run; the loop header adds 1 on the next turn.
            i = (j - 1)
            continue
        lang = InnerMorphology.__detectLang(twr, i, j - 1, term)
        if (lang == MorphLang.UA):
            pure_ukr_words += 1
        elif (lang == MorphLang.RU):
            pure_rus_words += 1
        elif (lang == MorphLang.BY):
            pure_by_words += 1
        elif (lang == MorphLang.KZ):
            pure_kz_words += 1
        if ((((lang) & MorphLang.RU)) != MorphLang.UNKNOWN):
            tot_rus_words += 1
        if ((((lang) & MorphLang.UA)) != MorphLang.UNKNOWN):
            tot_ukr_words += 1
        if ((((lang) & MorphLang.BY)) != MorphLang.UNKNOWN):
            tot_by_words += 1
        if ((((lang) & MorphLang.KZ)) != MorphLang.UNKNOWN):
            tot_kz_words += 1
        # Only type 1 tokens are remembered as the previous term and receive
        # a lexeme wrapper (presumably type 1 means "letters" - confirm
        # against _getCharTyp).
        if (ty == 1):
            term0 = term
        lemmas = None
        if (ty == 1 and not only_tokenizing):
            wraplemmas7 = RefOutArgWrapper(None)
            inoutres8 = Utils.tryGetValue(uni_lex, term, wraplemmas7)
            lemmas = wraplemmas7.value
            if (not inoutres8):
                lemmas = InnerMorphology.UniLexWrap._new6(lang)
                uni_lex[term] = lemmas
        tok = MorphToken()
        tok.term = term
        tok.begin_char = i
        # No-op; looks like a leftover debugger breakpoint anchor.
        if (i == 733860):
            pass
        tok.end_char = (j - 1)
        # The wrapper is parked in tag and replaced by word forms further down.
        tok.tag = (lemmas)
        res.append(tok)
        i = (j - 1)
    # --- Choice of the default language from the counters ---
    # The branch order sets the priority: RU, then UA, then KZ, then BY, each
    # tried first on the "pure" counts and then on the "total" counts.
    def_lang = MorphLang(dlang)
    if (pure_rus_words > pure_ukr_words and pure_rus_words > pure_by_words and pure_rus_words > pure_kz_words):
        def_lang = MorphLang.RU
    elif (tot_rus_words > tot_ukr_words and tot_rus_words > tot_by_words and tot_rus_words > tot_kz_words):
        def_lang = MorphLang.RU
    elif (pure_ukr_words > pure_rus_words and pure_ukr_words > pure_by_words and pure_ukr_words > pure_kz_words):
        def_lang = MorphLang.UA
    elif (tot_ukr_words > tot_rus_words and tot_ukr_words > tot_by_words and tot_ukr_words > tot_kz_words):
        def_lang = MorphLang.UA
    elif (pure_kz_words > pure_rus_words and pure_kz_words > pure_ukr_words and pure_kz_words > pure_by_words):
        def_lang = MorphLang.KZ
    elif (tot_kz_words > tot_rus_words and tot_kz_words > tot_ukr_words and tot_kz_words > tot_by_words):
        def_lang = MorphLang.KZ
    elif (pure_by_words > pure_rus_words and pure_by_words > pure_ukr_words and pure_by_words > pure_kz_words):
        def_lang = MorphLang.BY
    elif (tot_by_words > tot_rus_words and tot_by_words > tot_ukr_words and tot_by_words > tot_kz_words):
        # A lead in total counts alone selects BY only with a clear margin
        # over RU.
        if (tot_rus_words > 10 and tot_by_words > (tot_rus_words + 20)):
            def_lang = MorphLang.BY
        elif (tot_rus_words == 0 or tot_by_words >= (tot_rus_words * 2)):
            def_lang = MorphLang.BY
    if (((def_lang.is_undefined or def_lang.is_ua)) and tot_rus_words > 0):
        if (((tot_ukr_words > tot_rus_words and InnerMorphology.M_ENGINE_UA.language.is_ua)) or ((tot_by_words > tot_rus_words and InnerMorphology.M_ENGINE_BY.language.is_by)) or ((tot_kz_words > tot_rus_words and InnerMorphology.M_ENGINE_KZ.language.is_kz))):
            # Ambiguous case: recount using real word analysis, stopping once
            # 100 Cyrillic words have been seen.
            cou0 = 0
            # Chained reset: all four counters become 0.
            tot_kz_words = 0
            tot_ukr_words = tot_kz_words
            tot_by_words = tot_ukr_words
            tot_rus_words = tot_by_words
            for kp in uni_lex.items():
                lang = MorphLang()
                wraplang9 = RefOutArgWrapper(lang)
                kp[1].word_forms = self.__processOneWord(kp[0], wraplang9)
                lang = wraplang9.value
                if (kp[1].word_forms is not None):
                    for wf in kp[1].word_forms:
                        lang |= wf.language
                kp[1].lang = lang
                if (lang.is_ru):
                    tot_rus_words += 1
                if (lang.is_ua):
                    tot_ukr_words += 1
                if (lang.is_by):
                    tot_by_words += 1
                if (lang.is_kz):
                    tot_kz_words += 1
                if (lang.is_cyrillic):
                    cou0 += 1
                if (cou0 >= 100):
                    break
            # A language wins when it exceeds half of each competitor's count.
            if (tot_rus_words > ((math.floor(tot_by_words / 2))) and tot_rus_words > ((math.floor(tot_ukr_words / 2)))):
                def_lang = MorphLang.RU
            elif (tot_ukr_words > ((math.floor(tot_rus_words / 2))) and tot_ukr_words > ((math.floor(tot_by_words / 2)))):
                def_lang = MorphLang.UA
            elif (tot_by_words > ((math.floor(tot_rus_words / 2))) and tot_by_words > ((math.floor(tot_ukr_words / 2)))):
                def_lang = MorphLang.BY
        # NOTE(review): this elif is taken to pair with the engine test above,
        # not with the RU/UA/BY chain - confirm against the upstream source.
        elif (def_lang.is_undefined):
            def_lang = MorphLang.RU
    # --- Pass 2: analyse every distinct word ---
    cou = 0
    # Chained reset: all four counters become 0.
    tot_kz_words = 0
    tot_ukr_words = tot_kz_words
    tot_by_words = tot_ukr_words
    tot_rus_words = tot_by_words
    for kp in uni_lex.items():
        lang = def_lang
        if (lang.is_undefined):
            # No default language: use whichever language leads among the
            # words analysed so far in this loop.
            if (tot_rus_words > tot_by_words and tot_rus_words > tot_ukr_words and tot_rus_words > tot_kz_words):
                lang = MorphLang.RU
            elif (tot_ukr_words > tot_rus_words and tot_ukr_words > tot_by_words and tot_ukr_words > tot_kz_words):
                lang = MorphLang.UA
            elif (tot_by_words > tot_rus_words and tot_by_words > tot_ukr_words and tot_by_words > tot_kz_words):
                lang = MorphLang.BY
            elif (tot_kz_words > tot_rus_words and tot_kz_words > tot_ukr_words and tot_kz_words > tot_by_words):
                lang = MorphLang.KZ
        wraplang10 = RefOutArgWrapper(lang)
        kp[1].word_forms = self.__processOneWord(kp[0], wraplang10)
        lang = wraplang10.value
        kp[1].lang = lang
        if ((((lang) & MorphLang.RU)) != MorphLang.UNKNOWN):
            tot_rus_words += 1
        if ((((lang) & MorphLang.UA)) != MorphLang.UNKNOWN):
            tot_ukr_words += 1
        if ((((lang) & MorphLang.BY)) != MorphLang.UNKNOWN):
            tot_by_words += 1
        if ((((lang) & MorphLang.KZ)) != MorphLang.UNKNOWN):
            tot_kz_words += 1
        if (progress is not None):
            self.__onProgress(cou, len(uni_lex), progress)
        cou += 1
    # debug_token is assigned below but never read in this function.
    debug_token = None
    # One shared empty list is reused for every token without word forms.
    empty_list = None
    # --- Move the analysis results from the wrappers onto the tokens ---
    for r in res:
        uni = Utils.asObjectOrNull(r.tag, InnerMorphology.UniLexWrap)
        r.tag = None
        if (uni is None or uni.word_forms is None or len(uni.word_forms) == 0):
            if (empty_list is None):
                empty_list = list()
            r.word_forms = empty_list
            if (uni is not None):
                r.language = uni.lang
        else:
            r.word_forms = uni.word_forms
        if (r.begin_char == 733860):
            debug_token = r
    # --- Pass 3 (skipped for good_text): merge tokens split by mistake ---
    if (not good_text):
        i = 0
        first_pass2709 = True
        while True:
            if first_pass2709: first_pass2709 = False
            else: i += 1
            if (not (i < (len(res) - 2))): break
            # First characters of the current token and the two after it.
            ui0 = twrch[res[i].begin_char]
            ui1 = twrch[res[i + 1].begin_char]
            ui2 = twrch[res[i + 2].begin_char]
            if (ui1.is_quot):
                p = res[i + 1].begin_char
                # A quote between Б/Т and Е/Я/Ё is tried as a hard sign (Ъ).
                if ((p >= 2 and "БбТт".find(text[p - 1]) >= 0 and ((p + 3) < len(text))) and "ЕеЯяЁё".find(text[p + 1]) >= 0):
                    wstr = LanguageHelper.transliteralCorrection(LanguageHelper.correctWord("{0}Ъ{1}".format(res[i].getSourceText(text), res[i + 2].getSourceText(text))), None, False)
                    li = self.__processOneWord0(wstr)
                    if (li is not None and len(li) > 0 and li[0].is_in_dictionary):
                        res[i].end_char = res[i + 2].end_char
                        res[i].term = wstr
                        res[i].word_forms = li
                        del res[i + 1:i + 1 + 2]
                # NOTE(review): this arm uses p, so it is placed inside the
                # is_quot branch; it fires only if a character can be both
                # is_quot and is_apos - confirm.
                elif ((ui1.is_apos and p > 0 and str.isalpha(text[p - 1])) and ((p + 1) < len(text)) and str.isalpha(text[p + 1])):
                    # Apostrophe between letters: glue the parts when the
                    # default language or either part is Ukrainian.
                    if (def_lang == MorphLang.UA or (((res[i].language) & MorphLang.UA)) != MorphLang.UNKNOWN or (((res[i + 2].language) & MorphLang.UA)) != MorphLang.UNKNOWN):
                        wstr = LanguageHelper.transliteralCorrection(LanguageHelper.correctWord("{0}{1}".format(res[i].getSourceText(text), res[i + 2].getSourceText(text))), None, False)
                        li = self.__processOneWord0(wstr)
                        # okk is always True, so the merge is unconditional,
                        # even when the glued word is not in the dictionary.
                        okk = True
                        if (okk):
                            res[i].end_char = res[i + 2].end_char
                            res[i].term = wstr
                            if (li is None):
                                li = list()
                            res[i].word_forms = li
                            if (li is not None and len(li) > 0):
                                res[i].language = li[0].language
                            del res[i + 1:i + 1 + 2]
            elif (((ui1.uni_char == '3' or ui1.uni_char == '4')) and res[i + 1].length == 1):
                # A lone digit 3 or 4 touching Cyrillic text is tried as the
                # letter З or Ч.
                src = ("З" if ui1.uni_char == '3' else "Ч")
                # [i0, i1] is the token range that would be merged.
                i0 = i + 1
                if ((res[i].end_char + 1) == res[i + 1].begin_char and ui0.is_cyrillic):
                    i0 -= 1
                    src = (res[i0].getSourceText(text) + src)
                i1 = i + 1
                if ((res[i + 1].end_char + 1) == res[i + 2].begin_char and ui2.is_cyrillic):
                    i1 += 1
                    src += res[i1].getSourceText(text)
                if (len(src) > 2):
                    wstr = LanguageHelper.transliteralCorrection(LanguageHelper.correctWord(src), None, False)
                    li = self.__processOneWord0(wstr)
                    if (li is not None and len(li) > 0 and li[0].is_in_dictionary):
                        res[i0].end_char = res[i1].end_char
                        res[i0].term = wstr
                        res[i0].word_forms = li
                        # Removes the i1 - i0 tokens that follow res[i0].
                        del res[i0 + 1:i0 + 1 + i1 - i0]
            elif ((ui1.is_hiphen and ui0.is_letter and ui2.is_letter) and res[i].end_char > res[i].begin_char and res[i + 2].end_char > res[i + 2].begin_char):
                # Two multi-character words with a hyphen between them:
                # decide whether this is a hyphenated line break.
                newline = False
                # sps counts whitespace characters after the hyphen.
                sps = 0
                j = (res[i + 1].end_char + 1)
                while j < res[i + 2].begin_char:
                    if (text[j] == '\r' or text[j] == '\n'):
                        newline = True
                        sps += 1
                    elif (not Utils.isWhitespace(text[j])):
                        break
                    else:
                        sps += 1
                    j += 1
                full_word = LanguageHelper.correctWord(res[i].getSourceText(text) + res[i + 2].getSourceText(text))
                if (not newline):
                    # Without a real line break, other hints can still mark
                    # the pair as a merge candidate.
                    if (full_word in uni_lex or full_word == "ИЗЗА"):
                        newline = True
                    elif (text[res[i + 1].begin_char] == (chr(0x00AD))):
                        # U+00AD is the soft hyphen.
                        newline = True
                    elif (LanguageHelper.endsWithEx(res[i].getSourceText(text), "О", "о", None, None) and len(res[i + 2].word_forms) > 0 and res[i + 2].word_forms[0].is_in_dictionary):
                        # First part ends in "о" and the second part is a
                        # dictionary word: merge only across a '¬' sign.
                        if (text[res[i + 1].begin_char] == '¬'):
                            li = self.__processOneWord0(full_word)
                            if (li is not None and len(li) > 0 and li[0].is_in_dictionary):
                                newline = True
                    elif ((res[i].end_char + 2) == res[i + 2].begin_char):
                        # The hyphen is the only character between the words.
                        if (not str.isupper(text[res[i + 2].begin_char]) and (sps < 2) and len(full_word) > 4):
                            newline = True
                            # A further hyphen after the second word cancels
                            # the merge.
                            if ((i + 3) < len(res)):
                                ui3 = twrch[res[i + 3].begin_char]
                                if (ui3.is_hiphen):
                                    newline = False
                    elif (((res[i].end_char + 1) == res[i + 1].begin_char and sps > 0 and (sps < 3)) and len(full_word) > 4):
                        newline = True
                if (newline):
                    li = self.__processOneWord0(full_word)
                    if (li is not None and len(li) > 0 and ((li[0].is_in_dictionary or full_word in uni_lex))):
                        res[i].end_char = res[i + 2].end_char
                        res[i].term = full_word
                        res[i].word_forms = li
                        del res[i + 1:i + 1 + 2]
                else:
                    pass
            elif ((ui1.is_letter and ui0.is_letter and res[i].length > 2) and res[i + 1].length > 1):
                # Two adjacent Cyrillic words of the same case separated by a
                # line break: merge if the joined word already occurs in the
                # text and is in the dictionary.
                if (ui0.is_upper != ui1.is_upper):
                    continue
                if (not ui0.is_cyrillic or not ui1.is_cyrillic):
                    continue
                newline = False
                j = (res[i].end_char + 1)
                while j < res[i + 1].begin_char:
                    if (twrch[j].code == 0xD or twrch[j].code == 0xA):
                        newline = True
                        break
                    j += 1
                if (not newline):
                    continue
                full_word = LanguageHelper.correctWord(res[i].getSourceText(text) + res[i + 1].getSourceText(text))
                if (not full_word in uni_lex):
                    continue
                li = self.__processOneWord0(full_word)
                if (li is not None and len(li) > 0 and li[0].is_in_dictionary):
                    res[i].end_char = res[i + 1].end_char
                    res[i].term = full_word
                    res[i].word_forms = li
                    del res[i + 1]
    # --- Pass 4: fill char_info (letter kind and case pattern) ---
    i = 0
    first_pass2710 = True
    while True:
        if first_pass2710: first_pass2710 = False
        else: i += 1
        if (not (i < len(res))): break
        mt = res[i]
        mt.char_info = CharsInfo()
        ui0 = twrch[mt.begin_char]
        # ui00 describes the first character of the (possibly corrected) term,
        # ui0 the first letter found in the source span.
        ui00 = UnicodeInfo.ALL_CHARS[ord((res[i].term[0]))]
        j = (mt.begin_char + 1)
        # Advance ui0 to the first letter of the token, if there is one.
        while j <= mt.end_char:
            if (ui0.is_letter):
                break
            ui0 = twrch[j]
            j += 1
        if (ui0.is_letter):
            res[i].char_info.is_letter = True
            if (ui00.is_latin):
                res[i].char_info.is_latin_letter = True
            elif (ui00.is_cyrillic):
                res[i].char_info.is_cyrillic_letter = True
            # NOTE(review): placed inside the is_letter branch; the nesting
            # is not recoverable from the collapsed source - confirm.
            if (res[i].language == MorphLang.UNKNOWN):
                if (LanguageHelper.isCyrillic(mt.term)):
                    res[i].language = (MorphLang.RU if def_lang.is_undefined else def_lang)
        # For good_text the case flags below are not computed.
        if (good_text):
            continue
        all_up = True
        all_lo = True
        j = mt.begin_char
        # Digits count as upper case here.
        while j <= mt.end_char:
            if (twrch[j].is_upper or twrch[j].is_digit):
                all_lo = False
            else:
                all_up = False
            j += 1
        if (all_up):
            mt.char_info.is_all_upper = True
        elif (all_lo):
            mt.char_info.is_all_lower = True
        elif (((ui0.is_upper or twrch[mt.begin_char].is_digit)) and mt.end_char > mt.begin_char):
            # Mixed case starting with an upper-case letter or a digit.
            all_lo = True
            j = (mt.begin_char + 1)
            while j <= mt.end_char:
                if (twrch[j].is_upper or twrch[j].is_digit):
                    all_lo = False
                    break
                j += 1
            if (all_lo):
                mt.char_info.is_capital_upper = True
            elif (twrch[mt.end_char].is_lower and (mt.end_char - mt.begin_char) > 1):
                # Last character lower case and no lower-case character
                # before it.
                all_up = True
                j = mt.begin_char
                while j < mt.end_char:
                    if (twrch[j].is_lower):
                        all_up = False
                        break
                    j += 1
                if (all_up):
                    mt.char_info.is_last_lower = True
        if (mt.char_info.is_last_lower and mt.length > 2 and mt.char_info.is_cyrillic_letter):
            # pref is the source text without its last character; make sure a
            # word form with that normal form exists.
            pref = text[mt.begin_char:mt.begin_char + mt.end_char - mt.begin_char]
            ok = False
            for wf in mt.word_forms:
                if (wf.normal_case == pref or wf.normal_full == pref):
                    ok = True
                    break
            if (not ok):
                # Copy first: the list may be shared with other tokens
                # (uni.word_forms or empty_list).
                mt.word_forms = list(mt.word_forms)
                mt.word_forms.insert(0, MorphWordForm._new11(pref, MorphClass.NOUN, 1))
    if (good_text or only_tokenizing):
        return res
    # --- Pass 5: single Latin C / A / P glued to Cyrillic text ---
    i = 0
    first_pass2711 = True
    while True:
        if first_pass2711: first_pass2711 = False
        else: i += 1
        if (not (i < len(res))): break
        if (res[i].length == 1 and res[i].char_info.is_latin_letter):
            ch = res[i].term[0]
            if (ch == 'C' or ch == 'A' or ch == 'P'):
                pass
            else:
                continue
            is_rus = False
            # Look left through directly adjacent tokens for the nearest
            # letter token.
            for ii in range(i - 1, -1, -1):
                if ((res[ii].end_char + 1) != res[ii + 1].begin_char):
                    break
                elif (res[ii].char_info.is_letter):
                    is_rus = res[ii].char_info.is_cyrillic_letter
                    break
            if (not is_rus):
                # Same search to the right.
                ii = i + 1
                while ii < len(res):
                    if ((res[ii - 1].end_char + 1) != res[ii].begin_char):
                        break
                    elif (res[ii].char_info.is_letter):
                        is_rus = res[ii].char_info.is_cyrillic_letter
                        break
                    ii += 1
            if (is_rus):
                res[i].term = LanguageHelper.transliteralCorrection(res[i].term, None, True)
                # Both flags end up set on the converted token.
                res[i].char_info.is_cyrillic_letter = True
                res[i].char_info.is_latin_letter = True
    # --- Pass 6: add surname variants to capitalised Cyrillic words ---
    for r in res:
        if (r.char_info.is_all_upper or r.char_info.is_capital_upper):
            if (r.language.is_cyrillic):
                ok = False
                for wf in r.word_forms:
                    if (wf.class0_.is_proper_surname):
                        ok = True
                        break
                if (not ok):
                    # Copy first: the list may be shared with other tokens.
                    r.word_forms = list(r.word_forms)
                    InnerMorphology.M_ENGINE_RU.processSurnameVariants(r.term, r.word_forms)
    # Word forms without a normal form get the token's term.
    for r in res:
        for mv in r.word_forms:
            if (mv.normal_case is None):
                mv.normal_case = r.term
    # --- Pass 7: one upper-case Latin letter + quote + Latin word ---
    i = 0
    while i < (len(res) - 2):
        if (res[i].char_info.is_latin_letter and res[i].char_info.is_all_upper and res[i].length == 1):
            if (twrch[res[i + 1].begin_char].is_quot and res[i + 2].char_info.is_latin_letter and res[i + 2].length > 2):
                # All three tokens must be directly adjacent.
                if ((res[i].end_char + 1) == res[i + 1].begin_char and (res[i + 1].end_char + 1) == res[i + 2].begin_char):
                    wstr = "{0}{1}".format(res[i].term, res[i + 2].term)
                    li = self.__processOneWord0(wstr)
                    if (li is not None):
                        res[i].word_forms = li
                    # NOTE(review): the merge is taken to happen whether or
                    # not word forms were found - confirm against upstream.
                    res[i].end_char = res[i + 2].end_char
                    res[i].term = wstr
                    if (res[i + 2].char_info.is_all_lower):
                        res[i].char_info.is_all_upper = False
                        res[i].char_info.is_capital_upper = True
                    elif (not res[i + 2].char_info.is_all_upper):
                        res[i].char_info.is_all_upper = False
                    del res[i + 1:i + 1 + 2]
        i += 1
    # --- Pass 8: merge a pair of adjacent hyphen tokens into one ---
    i = 0
    first_pass2712 = True
    while True:
        if first_pass2712: first_pass2712 = False
        else: i += 1
        if (not (i < (len(res) - 1))): break
        if (not res[i].char_info.is_letter and not res[i + 1].char_info.is_letter and (res[i].end_char + 1) == res[i + 1].begin_char):
            if (twrch[res[i].begin_char].is_hiphen and twrch[res[i + 1].begin_char].is_hiphen):
                # Only exactly two hyphens: no hyphen token before or after.
                if (i == 0 or not twrch[res[i - 1].begin_char].is_hiphen):
                    pass
                else:
                    continue
                if ((i + 2) == len(res) or not twrch[res[i + 2].begin_char].is_hiphen):
                    pass
                else:
                    continue
                res[i].end_char = res[i + 1].end_char
                del res[i + 1]
    return res