def __init__(self, source: 'MorphCollection' = None) -> None:
    """Create an empty collection, or a copy of ``source``.

    Args:
        source: optional collection to copy. When omitted the new
            collection has no items, default aggregate values and is
            flagged as needing recalculation.
    """
    super().__init__()
    # State of an empty collection: aggregates are defaults and marked stale.
    self.__m_class = MorphClass()
    self.__m_gender = MorphGender.UNDEFINED
    self.__m_number = MorphNumber.UNDEFINED
    self.__m_case = MorphCase()
    self.__m_language = MorphLang()
    self.__m_voice = MorphVoice.UNDEFINED
    self.__m_need_recalc = True
    self.__m_items = None
    if source is None:
        return

    # Each item is copied into a fresh object so the two collections do not
    # share item instances. Word forms keep their extra fields.
    for original in source.items:
        if isinstance(original, MorphWordForm):
            duplicate = MorphWordForm()
            duplicate.copy_from_word_form(
                Utils.asObjectOrNull(original, MorphWordForm))
        else:
            duplicate = MorphBaseInfo()
            duplicate.copy_from(original)
        # The list is created lazily, so it stays None for an empty source.
        if self.__m_items is None:
            self.__m_items = list()
        self.__m_items.append(duplicate)

    # Take over the source's cached aggregates and mark them as up to date.
    self.__m_class = MorphClass._new53(source.__m_class.value)
    self.__m_gender = source.__m_gender
    self.__m_case = MorphCase._new29(source.__m_case.value)
    self.__m_number = source.__m_number
    self.__m_language = MorphLang._new56(source.__m_language.value)
    self.__m_voice = source.__m_voice
    self.__m_need_recalc = False
def _deserialize(self, str0_: 'ByteArrayWrapper', pos: int) -> bool:
    """Fill this object's fields from a serialized byte source.

    Values are read in this order: misc-info id (short), class (short),
    gender (byte), number (byte), case (byte), normal tail (string),
    full normal tail (string).

    Args:
        str0_: the source the values are read from.
        pos: position argument forwarded unchanged to every read call.

    Returns:
        False when the leading id is not positive (nothing else is read
        in that case), True otherwise.
    """
    misc_id = str0_.deserialize_short(pos)
    if misc_id <= 0:
        return False
    self.misc_info_id = misc_id

    class_bits = str0_.deserialize_short(pos)
    morph_class = MorphClass()
    morph_class.value = class_bits
    # A class flagged both "misc" and "proper" is stored without "misc".
    if morph_class.is_misc and morph_class.is_proper:
        morph_class.is_misc = False
    self.class0_ = morph_class

    self.gender = Utils.valToEnum(str0_.deserialize_byte(pos), MorphGender)
    self.number = Utils.valToEnum(str0_.deserialize_byte(pos), MorphNumber)

    case_bits = str0_.deserialize_byte(pos)
    morph_case = MorphCase()
    morph_case.value = case_bits
    self.case_ = morph_case

    self.normal_tail = str0_.deserialize_string(pos)
    self.full_normal_tail = str0_.deserialize_string(pos)
    return True
def copy_from(self, src: 'MorphBaseInfo') -> None:
    """Copy class, gender, number, case and language from ``src``.

    Class, case and language are re-created as new objects carrying the
    same ``value``, so this object does not share them with ``src``.

    Args:
        src: the object whose morphological attributes are copied.
    """
    class_copy = MorphClass()
    class_copy.value = src.class0_.value
    self.class0_ = class_copy

    self.gender = src.gender
    self.number = src.number

    case_copy = MorphCase()
    case_copy.value = src.case_.value
    self.case_ = case_copy

    lang_copy = MorphLang()
    lang_copy.value = src.language.value
    self.language = lang_copy
def deserializeDerivateGroup(str0_ : 'ByteArrayWrapper', dg : 'DerivateGroup') -> None:
    """Fill a derivate group from a serialized byte source.

    Layout, in read order: attribute flags (short), prefix (string),
    word count (short) followed by the words, then the count (short) of
    "next" entries followed by (string key, short case value) pairs.

    Args:
        str0_: the source the values are read from.
        dg: the group that receives the values.
    """
    flags = str0_.deserializeShort()
    if flags & 1:
        dg.is_dummy = True
    if flags & 2:
        dg.not_generate = True
    # Bits 4 and 8 are tested independently; if both are set the later
    # assignment (1) wins.
    if flags & 4:
        dg.m_transitive = 0
    if flags & 8:
        dg.m_transitive = 1
    dg.prefix = str0_.deserializeString()

    for _ in range(str0_.deserializeShort()):
        word = DerivateWord(dg)
        word.spelling = str0_.deserializeString()
        word.class0_ = MorphClass()
        word.class0_.value = str0_.deserializeShort()
        word.lang = MorphLang._new5(str0_.deserializeShort())
        word.attrs.value = str0_.deserializeShort()
        dg.words.append(word)

    for _ in range(str0_.deserializeShort()):
        key = Utils.ifNotNull(str0_.deserializeString(), "")
        case = MorphCase()
        case.value = str0_.deserializeShort()
        # The dictionary is only created once there is something to store.
        if dg.nexts is None:
            dg.nexts = dict()
        dg.nexts[key] = case
def __deserialize_item(self, stream: Stream) -> 'MorphBaseInfo':
    """Read one collection item from ``stream``.

    The first byte selects the item type: 0 produces a ``MorphBaseInfo``,
    anything else a ``MorphWordForm`` with its additional fields.

    Args:
        stream: the stream to read from.

    Returns:
        The item that was read.
    """
    from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
    kind = stream.readbyte()
    item = MorphBaseInfo() if kind == 0 else MorphWordForm()

    # Fields common to both item types.
    read_short = SerializerHelper.deserialize_short
    item.class0_ = MorphClass._new53(read_short(stream))
    item.case_ = MorphCase._new29(read_short(stream))
    item.gender = Utils.valToEnum(read_short(stream), MorphGender)
    item.number = Utils.valToEnum(read_short(stream), MorphNumber)
    item.language = MorphLang._new56(read_short(stream))
    if kind == 0:
        return item

    # Word-form specific fields.
    word_form = Utils.asObjectOrNull(item, MorphWordForm)
    word_form.normal_case = SerializerHelper.deserialize_string(stream)
    word_form.normal_full = SerializerHelper.deserialize_string(stream)
    word_form.undef_coef = read_short(stream)
    attr_count = SerializerHelper.deserialize_int(stream)
    for _ in range(attr_count):
        # misc is left as None when there are no attributes to store.
        if word_form.misc is None:
            word_form.misc = MorphMiscInfo()
        word_form.misc.attrs.append(
            SerializerHelper.deserialize_string(stream))
    return item
def add(self, val: str, shortval: str, gen: 'MorphGender',
        add_other_gender_var: bool = False) -> None:
    """Add a value variant for the given gender.

    Args:
        val: the value to add; None is ignored.
        shortval: short value stored together with ``val``.
        gen: gender of the variant. For any gender other than masculine
            or feminine the value is added once for each of those two.
        add_other_gender_var: when True, also add the word form of
            ``val`` for the opposite gender if the morphology service
            returns one.
    """
    if val is None:
        return
    # The head is set once, from the first value: its first three characters.
    if self.head is None:
        self.head = val[:3]

    if gen != MorphGender.MASCULINE and gen != MorphGender.FEMINIE:
        self.add(val, shortval, MorphGender.MASCULINE, False)
        self.add(val, shortval, MorphGender.FEMINIE, False)
        return

    # Do not store the same (value, gender) pair twice.
    for existing in self.items:
        if existing.value == val and existing.gender == gen:
            return
    self.items.append(
        PersonMorphCollection.PersonMorphVariant._new2591(val, gen, shortval))
    if not add_other_gender_var:
        return

    if gen == MorphGender.FEMINIE:
        other_gender = MorphGender.MASCULINE
    else:
        other_gender = MorphGender.FEMINIE
    other_form = MorphologyService.get_wordform(
        val, MorphBaseInfo._new193(MorphClass._new2572(True), other_gender))
    if other_form is not None:
        self.items.append(
            PersonMorphCollection.PersonMorphVariant._new2591(
                other_form, other_gender, shortval))
def __init__(self, bi: 'MorphBaseInfo' = None) -> None:
    """Create an object with default attributes, optionally copied from ``bi``.

    Args:
        bi: optional source object; when given, its ``copy_to`` is
            called with the new object as the destination.
    """
    # Defaults: empty class/case/language wrappers, undefined gender and number.
    self.__m_cla = MorphClass()
    self.__gender = MorphGender.UNDEFINED
    self.__number = MorphNumber.UNDEFINED
    self.__m_cas = MorphCase()
    self.__m_lang = MorphLang()
    if bi is None:
        return
    bi.copy_to(self)
def __deserializeMorphRuleVariant(str0_: 'ByteArrayWrapper', me: 'MorphEngine') -> 'MorphRuleVariant':
    """Read one rule variant from a serialized byte source.

    Args:
        str0_: the source the values are read from.
        me: engine whose ``_m_vars`` list is indexed by the stored id.

    Returns:
        The variant, or None when the stored 1-based id does not refer
        to an element of ``me._m_vars`` (nothing else is read then).
    """
    # The id is stored 1-based; convert it to a list index.
    index = str0_.deserializeShort() - 1
    if not (0 <= index < len(me._m_vars)):
        return None
    variant = MorphRuleVariant._new36(me._m_vars[index])

    morph_class = MorphClass()
    morph_class.value = str0_.deserializeShort()
    # A class flagged both "misc" and "proper" is stored without "misc".
    if morph_class.is_misc and morph_class.is_proper:
        morph_class.is_misc = False
    variant.class0_ = morph_class

    variant.gender = Utils.valToEnum(str0_.deserializeByte(), MorphGender)
    variant.number = Utils.valToEnum(str0_.deserializeByte(), MorphNumber)

    morph_case = MorphCase()
    morph_case.value = str0_.deserializeByte()
    variant.case_ = morph_case

    variant.normal_tail = str0_.deserializeString()
    variant.full_normal_tail = str0_.deserializeString()
    return variant
def deserialize_derivate_group(str0_: 'ByteArrayWrapper', dg: 'DerivateGroup', pos: int) -> None:
    """Fill a derivate group from a serialized byte source.

    Layout, in read order: attribute flags (short), ``questions`` and
    ``questions_ref`` (shorts), prefix (string), the word list, then the
    ``nexts`` and ``nexts_ref`` maps. Each list or map is preceded by its
    element count (short).

    Args:
        str0_: the source the values are read from.
        dg: the group that receives the values.
        pos: position argument forwarded unchanged to every read call.
    """
    flags = str0_.deserialize_short(pos)
    if flags & 1:
        dg.is_dummy = True
    if flags & 2:
        dg.not_generate = True
    # The bits below are tested independently; when several are set, the
    # assignment made last wins.
    if flags & 4:
        dg.m_transitive = 0
    if flags & 8:
        dg.m_transitive = 1
    if flags & 0x10:
        dg.m_rev_agent_case = 0
    if flags & 0x20:
        dg.m_rev_agent_case = 1
    if flags & 0x40:
        dg.m_rev_agent_case = 2

    dg.questions = Utils.valToEnum(
        str0_.deserialize_short(pos), NextModelQuestion)
    dg.questions_ref = Utils.valToEnum(
        str0_.deserialize_short(pos), NextModelQuestion)
    dg.prefix = str0_.deserialize_string(pos)

    for _ in range(str0_.deserialize_short(pos)):
        word = DerivateWord(dg)
        word.spelling = str0_.deserialize_string(pos)
        word.class0_ = MorphClass()
        word.class0_.value = str0_.deserialize_short(pos)
        word.lang = MorphLang._new10(str0_.deserialize_short(pos))
        word.attrs.value = str0_.deserialize_short(pos)
        dg.words.append(word)

    # Both maps are created only when they have at least one entry.
    for _ in range(str0_.deserialize_short(pos)):
        key = Utils.ifNotNull(str0_.deserialize_string(pos), "")
        case = MorphCase()
        case.value = str0_.deserialize_short(pos)
        if dg.nexts is None:
            dg.nexts = dict()
        dg.nexts[key] = case

    for _ in range(str0_.deserialize_short(pos)):
        key = Utils.ifNotNull(str0_.deserialize_string(pos), "")
        case = MorphCase()
        case.value = str0_.deserialize_short(pos)
        if dg.nexts_ref is None:
            dg.nexts_ref = dict()
        dg.nexts_ref[key] = case
def get_word_base_info(word: str, lang: 'MorphLang' = None,
                       is_case_nominative: bool = False,
                       in_dict_only: bool = False) -> 'MorphBaseInfo':
    """ Get gender / number / case for a word form

    Args:
        word(str): the word form
        lang(MorphLang): possible language
        is_case_nominative(bool): the source word is in the nominative case
            (otherwise any case is assumed)
        in_dict_only(bool): when true, do not build hypotheses for
            non-dictionary words

    Returns:
        MorphBaseInfo: basic morphological information
    """
    tokens = Morphology.__m_inner.run(word, False, lang, None, False)
    info = MorphWordForm()
    merged_class = MorphClass()
    if tokens is not None and len(tokens) > 0:
        # Pass 0 merges dictionary word forms only; pass 1 merges the
        # non-dictionary ones and is reached only if pass 0 matched nothing
        # and in_dict_only is not set.
        for pass_no in range(2):
            matched = False
            dictionary_pass = pass_no == 0
            for form in tokens[0].word_forms:
                if dictionary_pass:
                    if not form.is_in_dictionary:
                        continue
                elif form.is_in_dictionary:
                    continue
                if (is_case_nominative and not form.case_.is_nominative
                        and not form.case_.is_undefined):
                    continue
                # Accumulate the attributes of every accepted form.
                merged_class.value |= form.class0_.value
                info.gender = Utils.valToEnum(
                    (info.gender) | (form.gender), MorphGender)
                info.case_ = (info.case_) | form.case_
                info.number = Utils.valToEnum(
                    (info.number) | (form.number), MorphNumber)
                # Only the first available misc info is kept.
                if form.misc is not None and info.misc is None:
                    info.misc = form.misc
                matched = True
            if matched or in_dict_only:
                break
    info.class0_ = merged_class
    return info
def clone(self) -> 'MorphCollection':
    """ Create a copy of this collection.

    The item list is copied shallowly: the clone gets its own list, but
    the list refers to the same item objects. Cached aggregate values
    are carried over only when they are up to date; otherwise the clone
    keeps the defaults set by the constructor.

    Returns:
        MorphCollection: the new collection
    """
    res = MorphCollection()
    if self.__m_items is not None:
        # Plain list copy. The previous try/except around list.extend hid
        # any error behind a silent "pass".
        res.__m_items = list(self.__m_items)
    if not self.__m_need_recalc:
        res.__m_class = MorphClass._new53(self.__m_class.value)
        res.__m_gender = self.__m_gender
        res.__m_case = MorphCase._new29(self.__m_case.value)
        res.__m_number = self.__m_number
        res.__m_language = MorphLang._new56(self.__m_language.value)
        res.__m_need_recalc = False
        res.__m_voice = self.__m_voice
    return res
def _deserialize(self, str0_ : 'ByteArrayWrapper', pos : int) -> None:
    """Fill this group from a serialized byte source.

    Layout, in read order: attribute flags (short), prefix (string), the
    ``model``, ``cm`` and ``cm_rev`` sub-objects (each reads itself), then
    the word count (short) followed by the words.

    Args:
        str0_: the source the values are read from.
        pos: position argument forwarded unchanged to every read call.
    """
    flags = str0_.deserialize_short(pos)
    if flags & 1:
        self.is_dummy = True
    if flags & 2:
        self.not_generate = True
    self.prefix = str0_.deserialize_string(pos)
    self.model._deserialize(str0_, pos)
    self.cm._deserialize(str0_, pos)
    self.cm_rev._deserialize(str0_, pos)

    for _ in range(str0_.deserialize_short(pos)):
        word = DerivateWord()
        word.spelling = str0_.deserialize_string(pos)

        raw = str0_.deserialize_short(pos)
        word.class0_ = MorphClass()
        word.class0_.value = raw

        raw = str0_.deserialize_short(pos)
        word.lang = MorphLang()
        word.lang.value = raw

        word.attrs.value = str0_.deserialize_short(pos)
        word.aspect = Utils.valToEnum(str0_.deserialize_byte(pos), MorphAspect)
        word.tense = Utils.valToEnum(str0_.deserialize_byte(pos), MorphTense)
        word.voice = Utils.valToEnum(str0_.deserialize_byte(pos), MorphVoice)

        # A one-byte count precedes the list of "next words".
        next_count = str0_.deserialize_byte(pos)
        for _ in range(next_count):
            following = str0_.deserialize_string(pos)
            # The list is created even when the string read is None;
            # None strings themselves are not stored.
            if word.next_words is None:
                word.next_words = list()
            if following is not None:
                word.next_words.append(following)
        self.words.append(word)
def _deserialize(self, stream: Stream) -> None:
    """Restore this collection from ``stream``.

    Reads the aggregate class, case, gender, number, voice and language
    (one short each, in that order), then the item count (int) and the
    items themselves. Afterwards the aggregates are marked as up to date.

    Args:
        stream: the stream to read from.
    """
    from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
    read_short = SerializerHelper.deserialize_short
    self.__m_class = MorphClass._new53(read_short(stream))
    self.__m_case = MorphCase._new29(read_short(stream))
    self.__m_gender = Utils.valToEnum(read_short(stream), MorphGender)
    self.__m_number = Utils.valToEnum(read_short(stream), MorphNumber)
    self.__m_voice = Utils.valToEnum(read_short(stream), MorphVoice)
    self.__m_language = MorphLang._new56(read_short(stream))

    item_count = SerializerHelper.deserialize_int(stream)
    self.__m_items = list()
    for _ in range(item_count):
        item = self.__deserialize_item(stream)
        if item is not None:
            self.__m_items.append(item)
    self.__m_need_recalc = False
def __recalc(self) -> None:
    """Recompute the aggregate attributes from the items.

    Class, gender, number and case are reset and rebuilt as the union
    (bitwise OR) of the corresponding item attributes. The language is
    merged from the items only when it is currently missing or undefined.
    The voice is the union of the voices of verb word forms, but is
    reset to UNDEFINED if any such form has no defined voice.

    With no items the method only clears the "needs recalculation" flag
    and leaves the current aggregates untouched.
    """
    self.__m_need_recalc = False
    if not self.__m_items:
        return
    self.__m_class = MorphClass()
    self.__m_gender = MorphGender.UNDEFINED
    self.__m_number = MorphNumber.UNDEFINED
    self.__m_case = MorphCase()
    merge_case = self.__m_case.is_undefined
    # A missing language used to crash below on ".value |= ..."; start from
    # an empty one instead and merge the items' languages into it.
    if self.__m_language is None:
        self.__m_language = MorphLang()
        merge_language = True
    else:
        merge_language = self.__m_language.is_undefined
    self.__m_voice = MorphVoice.UNDEFINED
    verb_has_undef = False
    for it in self.__m_items:
        self.__m_class.value |= it.class0_.value
        self.__m_gender = Utils.valToEnum(
            (self.__m_gender) | (it.gender), MorphGender)
        if merge_case:
            self.__m_case |= it.case_
        self.__m_number = Utils.valToEnum(
            (self.__m_number) | (it.number), MorphNumber)
        if merge_language:
            self.__m_language.value |= it.language.value
        if it.class0_.is_verb and isinstance(it, MorphWordForm):
            # A word form may carry no misc info at all; treat that the
            # same way as an undefined voice instead of failing.
            if it.misc is None:
                v = MorphVoice.UNDEFINED
            else:
                v = it.misc.voice
            if v == MorphVoice.UNDEFINED:
                verb_has_undef = True
            else:
                self.__m_voice = Utils.valToEnum(
                    (self.__m_voice) | (v), MorphVoice)
    if verb_has_undef:
        self.__m_voice = MorphVoice.UNDEFINED
def process(self, word : str) -> typing.List['MorphWordForm']:
    """ Process a single word

    The word is first looked up in the prefix tree (``m_root``): at every
    node reached, the node's rules are asked for variants matching the
    remaining tail of the word. If the collected forms do not look
    conclusive, the reverse tree (``m_root_reverce``) is walked from the
    end of the word to add hypothetical forms based on the word ending.

    Args:
        word(str): the word; must be in upper case

    Returns:
        typing.List[MorphWordForm]: the word forms found, or None when the
        word is empty, has more than one character but no vowel, or nothing
        was found. When a reverse lookup is wanted but there is no reverse
        tree, the list collected so far is returned as is (it may be None
        or empty).
    """
    if (Utils.isNullOrEmpty(word)):
        return None
    res = None
    if (len(word) > 1):
        # A multi-character word without a single vowel is not analysed.
        i = 0
        while i < len(word):
            ch = word[i]
            if (LanguageHelper.isCyrillicVowel(ch) or LanguageHelper.isLatinVowel(ch)):
                break
            i += 1
        if (i >= len(word)):
            return res
    # Walk the prefix tree; i is the length of the prefix consumed so far.
    tn = self.m_root
    i = 0
    while i <= len(word):
        if (tn.lazy_pos > 0):
            self.__loadTreeNode(tn)
        if (tn.rules is not None):
            word_begin = None
            word_end = None
            if (i == 0):
                word_end = word
            elif (i < len(word)):
                word_end = word[i:]
            else:
                word_end = ""
            if (res is None):
                res = list()
            for r in tn.rules:
                wrapmvs14 = RefOutArgWrapper(None)
                inoutres15 = Utils.tryGetValue(r.variants, word_end, wrapmvs14)
                mvs = wrapmvs14.value
                if (inoutres15):
                    # The prefix string is built lazily, on the first match.
                    if (word_begin is None):
                        if (i == len(word)):
                            word_begin = word
                        elif (i > 0):
                            word_begin = word[0:0+i]
                        else:
                            word_begin = ""
                    r.processResult(res, word_begin, mvs)
        if (tn.nodes is None or i >= len(word)):
            break
        ch = ord(word[i])
        wraptn16 = RefOutArgWrapper(None)
        inoutres17 = Utils.tryGetValue(tn.nodes, ch, wraptn16)
        tn = wraptn16.value
        if (not inoutres17):
            break
        i += 1
    # Decide whether forms guessed from the word ending are still needed.
    need_test_unknown_vars = True
    if (res is not None):
        for r in res:
            if ((r.class0_.is_pronoun or r.class0_.is_noun or r.class0_.is_adjective) or (r.class0_.is_misc and r.class0_.is_conjunction) or r.class0_.is_preposition):
                need_test_unknown_vars = False
            elif (r.class0_.is_adverb and r.normal_case is not None):
                if (not LanguageHelper.endsWithEx(r.normal_case, "О", "А", None, None)):
                    need_test_unknown_vars = False
                elif (r.normal_case == "МНОГО"):
                    need_test_unknown_vars = False
            elif (r.class0_.is_verb and len(res) > 1):
                # A verb next to a form of another class is conclusive,
                # unless the word ends with "ИМ".
                ok = False
                for rr in res:
                    if (rr != r and rr.class0_ != r.class0_):
                        ok = True
                        break
                if (ok and not LanguageHelper.endsWith(word, "ИМ")):
                    need_test_unknown_vars = False
    if (need_test_unknown_vars and LanguageHelper.isCyrillicChar(word[0])):
        # Count Cyrillic vowels (gl) against all other characters (sog);
        # guessing needs at least two of each.
        gl = 0
        sog = 0
        j = 0
        while j < len(word):
            if (LanguageHelper.isCyrillicVowel(word[j])):
                gl += 1
            else:
                sog += 1
            j += 1
        if ((gl < 2) or (sog < 2)):
            need_test_unknown_vars = False
    if (need_test_unknown_vars and res is not None and len(res) == 1):
        # A single form with certain attribute combinations is accepted
        # without guessing (attribute codes are the dictionary's own labels).
        if (res[0].class0_.is_verb):
            if ("н.вр." in res[0].misc.attrs and "нес.в." in res[0].misc.attrs and not "страд.з." in res[0].misc.attrs):
                need_test_unknown_vars = False
            elif ("б.вр." in res[0].misc.attrs and "сов.в." in res[0].misc.attrs):
                need_test_unknown_vars = False
            elif (res[0].normal_case is not None and LanguageHelper.endsWith(res[0].normal_case, "СЯ")):
                need_test_unknown_vars = False
        if (res[0].class0_.is_undefined and "прдктв." in res[0].misc.attrs):
            need_test_unknown_vars = False
    if (need_test_unknown_vars):
        if (self.m_root_reverce is None):
            return res
        # Walk the reverse tree from the last character towards the first
        # until a node carrying reverse variants is found.
        tn = self.m_root_reverce
        tn0 = None
        for i in range(len(word) - 1, -1, -1):
            if (tn.lazy_pos > 0):
                self.__loadTreeNode(tn)
            ch = ord(word[i])
            if (tn.nodes is None):
                break
            wrapnext18 = RefOutArgWrapper(None)
            inoutres19 = Utils.tryGetValue(tn.nodes, ch, wrapnext18)
            next0_ = wrapnext18.value
            if (not inoutres19):
                break
            tn = next0_
            if (tn.lazy_pos > 0):
                self.__loadTreeNode(tn)
            if (tn.reverce_variants is not None):
                tn0 = tn
                break
        else:
            # The loop consumed the whole word: i is read below and must
            # then be "one before the first character".
            i = -1
        if (tn0 is not None):
            # The part of the word before the matched ending must contain
            # a vowel, unless the ending starts within the first 4 characters.
            glas = i < 4
            while i >= 0:
                if (LanguageHelper.isCyrillicVowel(word[i]) or LanguageHelper.isLatinVowel(word[i])):
                    glas = True
                    break
                i -= 1
            if (glas):
                # The prefix-tree walk may have found no rules at all, in
                # which case res is still None; the loops below need a list.
                if (res is None):
                    res = list()
                for mv in tn0.reverce_variants:
                    if (((not mv.class0_.is_verb and not mv.class0_.is_adjective and not mv.class0_.is_noun) and not mv.class0_.is_proper_surname and not mv.class0_.is_proper_geo) and not mv.class0_.is_proper_secname):
                        continue
                    # Skip the hypothesis when a dictionary form already
                    # covers it.
                    ok = False
                    for rr in res:
                        if (rr.is_in_dictionary):
                            if (rr.class0_ == mv.class0_ or rr.class0_.is_noun):
                                ok = True
                                break
                            if (not mv.class0_.is_adjective and rr.class0_.is_verb):
                                ok = True
                                break
                    if (ok):
                        continue
                    if (len(mv.tail) > 0 and not LanguageHelper.endsWith(word, mv.tail)):
                        continue
                    r = MorphWordForm(mv, word)
                    if (not MorphWordForm._hasMorphEquals(res, r)):
                        r.undef_coef = mv.coef
                        res.append(r)
    if (word == "ПРИ" and res is not None):
        # Special case: drop geographic-name readings of this word.
        for i in range(len(res) - 1, -1, -1):
            if (res[i].class0_.is_proper_geo):
                del res[i]
    if (res is None or len(res) == 0):
        return None
    MorphEngine.__sort(res, word)
    for v in res:
        if (v.normal_case is None):
            v.normal_case = word
        if (v.class0_.is_verb):
            if (v.normal_full is None and LanguageHelper.endsWith(v.normal_case, "ТЬСЯ")):
                # Full normal form is the normal form without the last
                # two characters.
                v.normal_full = v.normal_case[0:0+len(v.normal_case) - 2]
        v.language = self.language
        if (v.class0_.is_preposition):
            v.normal_case = LanguageHelper.normalizePreposition(v.normal_case)
    # Remove some guessed adjective forms and collect the classes of the
    # dictionary forms; iterating backwards keeps indexes valid on deletion.
    mc = MorphClass()
    for i in range(len(res) - 1, -1, -1):
        if (not res[i].is_in_dictionary and res[i].class0_.is_adjective and len(res) > 1):
            if ("к.ф." in res[i].misc.attrs or "неизм." in res[i].misc.attrs):
                del res[i]
                continue
        if (res[i].is_in_dictionary):
            mc.value |= res[i].class0_.value
    if (mc == MorphClass.VERB and len(res) > 1):
        for r in res:
            if (r.undef_coef > (100) and r.class0_ == MorphClass.ADJECTIVE):
                r.undef_coef = (0)
    if (len(res) == 0):
        return None
    return res
def __init__(self) -> None:
    """Create an object with empty class, case and language and with
    undefined gender and number."""
    self.__m_cla = MorphClass()
    self.__gender, self.__number = MorphGender.UNDEFINED, MorphNumber.UNDEFINED
    self.__m_cas = MorphCase()
    self.__m_lang = MorphLang()
def get_morph_class_in_dictionary(self) -> 'MorphClass':
    """Combine the classes of all dictionary word forms of this object.

    Returns:
        The union (``|``) of ``class0_`` over every item of ``self.morph``
        that is a ``MorphWordForm`` with ``is_in_dictionary`` set; an empty
        ``MorphClass`` when there is none.
    """
    combined = MorphClass()
    for item in self.morph.items:
        if not isinstance(item, MorphWordForm):
            continue
        if item.is_in_dictionary:
            combined |= item.class0_
    return combined
def get_normal_case_text(self, mc: 'MorphClass' = None,
                         num: 'MorphNumber' = MorphNumber.UNDEFINED,
                         gender: 'MorphGender' = MorphGender.UNDEFINED,
                         keep_chars: bool = False) -> str:
    """Return the normalized text of this token.

    The first morphological variant that is a word form and passes the
    class and gender filters supplies the result. If no word form
    qualifies, the token's own term is used, possibly converted to the
    singular by the morphology service.

    Args:
        mc: class the variant must intersect with; for a preposition the
            normalized preposition is returned immediately.
        num: requested number; SINGULAR enables the singular conversions.
        gender: requested gender; UNDEFINED disables gender filtering.
        keep_chars: when True, the result is re-cased according to
            ``self.chars`` (all lower case, or capitalized).

    Returns:
        The normalized text, or None when no word form qualified but at
        least one accepted variant has a defined case.
    """
    from pullenti.ner.core.MiscHelper import MiscHelper
    empty = True
    if (mc is not None and mc.is_preposition):
        return LanguageHelper.normalize_preposition(self.term)
    for it in self.morph.items:
        if (mc is not None and not mc.is_undefined):
            cc = (it.class0_) & mc
            if (cc.is_undefined):
                continue
            if (cc.is_misc and not cc.is_proper and mc != it.class0_):
                continue
        wf = Utils.asObjectOrNull(it, MorphWordForm)
        normal_full = False
        if (gender != MorphGender.UNDEFINED):
            if (((it.gender) & (gender)) == (MorphGender.UNDEFINED)):
                # Gender mismatch: still accepted for a masculine request
                # when a full normal form is available, or for personal
                # pronouns; otherwise the variant is skipped.
                if ((gender == MorphGender.MASCULINE and ((it.gender != MorphGender.UNDEFINED or it.number == MorphNumber.PLURAL)) and wf is not None) and wf.normal_full is not None):
                    normal_full = True
                elif (gender == MorphGender.MASCULINE and it.class0_.is_personal_pronoun):
                    pass
                else:
                    continue
        if (not it.case_.is_undefined):
            empty = False
        if (wf is None):
            continue
        if (num == MorphNumber.SINGULAR and it.number == MorphNumber.PLURAL and wf.normal_full is not None):
            # Keep normal_case only when it is normal_full plus a
            # two-character "СЯ" ending; otherwise use normal_full.
            # normal_case may be missing (it is guarded below as well), so
            # it is checked before its length is taken.
            normal_case = wf.normal_case
            if (normal_case is not None and len(normal_case) == (len(wf.normal_full) + 2) and len(normal_case) > 4 and normal_case.endswith("СЯ")):
                res = normal_case
            else:
                res = wf.normal_full
        else:
            res = (wf.normal_full if normal_full else (Utils.ifNotNull(wf.normal_case, self.term)))
        if (num == MorphNumber.SINGULAR and mc is not None and mc == MorphClass.NOUN):
            # Irregular singular.
            if (res == "ДЕТИ"):
                res = "РЕБЕНОК"
        if (keep_chars):
            if (self.chars.is_all_lower):
                res = res.lower()
            elif (self.chars.is_capital_upper):
                res = MiscHelper.convert_first_char_upper_and_other_lower(res)
        return res
    if (not empty):
        return None
    # No word form qualified: fall back to the term itself.
    te = None
    if (num == MorphNumber.SINGULAR and mc is not None):
        bi = MorphBaseInfo._new492(MorphClass._new53(mc.value), gender, MorphNumber.SINGULAR, self.morph.language)
        vars0_ = MorphologyService.get_wordform(self.term, bi)
        if (vars0_ is not None):
            te = vars0_
    if (te is None):
        te = self.term
    if (keep_chars):
        if (self.chars.is_all_lower):
            return te.lower()
        elif (self.chars.is_capital_upper):
            return MiscHelper.convert_first_char_upper_and_other_lower(te)
    return te
def copy_to(self, dst: 'MorphBaseInfo') -> None:
    """Copy class, gender, number, case and language into ``dst``.

    Class, case and language are passed through their constructors, so
    ``dst`` receives new objects rather than references to this
    object's ones.

    Args:
        dst: the object that receives the attributes.
    """
    class_copy = MorphClass(self.class0_)
    dst.class0_ = class_copy
    dst.gender = self.gender
    dst.number = self.number
    case_copy = MorphCase(self.case_)
    dst.case_ = case_copy
    lang_copy = MorphLang(self.language)
    dst.language = lang_copy
def get_normal_case_text(self, mc: 'MorphClass' = None, num: 'MorphNumber' = MorphNumber.UNDEFINED, gender: 'MorphGender' = MorphGender.UNDEFINED, keep_chars: bool = False) -> str:
    """Return the normalized text of this item.

    The morphological variants (``NounPhraseItemTextVar`` items of
    ``self.morph``) are scanned for the best normal value; if none yields
    a result, the text is taken from the underlying tokens.

    Args:
        mc: requested class; several branches only apply to adjectives.
        num: requested number (SINGULAR / PLURAL influence the choice).
        gender: requested gender, used when re-inflecting adjectives.
        keep_chars: passed on to the helpers that restore the casing.

    Returns:
        The normalized text; "?" when nothing could be produced.
    """
    # A single referent token normalizes itself.
    if ((isinstance(self.begin_token, ReferentToken)) and self.begin_token == self.end_token):
        return self.begin_token.get_normal_case_text(
            mc, num, gender, keep_chars)
    res = None
    # max_coef: undef_coef of the last variant taken into account.
    # def_coef: best score among variants with undef_coef == 0 (-1 = none yet).
    max_coef = 0
    def_coef = -1
    for it in self.morph.items:
        v = Utils.asObjectOrNull(it, NounPhraseItemTextVar)
        if (v is None):
            continue
        # A variant with a positive undef_coef is skipped when its coefficient
        # is below the current one or a zero-coefficient variant was accepted.
        if (v.undef_coef > 0 and (((v.undef_coef < max_coef) or def_coef >= 0))):
            continue
        if (num == MorphNumber.SINGULAR and v.single_number_value is not None):
            # Singular requested and a ready singular value exists; for a
            # neuter/feminine adjective it is re-inflected by the service.
            if (mc is not None and ((gender == MorphGender.NEUTER or gender == MorphGender.FEMINIE)) and mc.is_adjective):
                bi = MorphBaseInfo._new401(MorphClass._new53(mc.value), gender, MorphNumber.SINGULAR, MorphCase.NOMINATIVE, self.morph.language)
                str0_ = MorphologyService.get_wordform(
                    v.single_number_value, bi)
                if (str0_ is not None):
                    res = str0_
            else:
                res = v.single_number_value
            if (v.undef_coef == 0):
                break
            max_coef = v.undef_coef
            continue
        if (Utils.isNullOrEmpty(v.normal_value)):
            continue
        if (str.isdigit(v.normal_value[0]) and mc is not None and mc.is_adjective):
            # A numeric value requested as an adjective is converted by
            # NumberHelper.get_number_adjective.
            val = 0
            wrapval402 = RefOutArgWrapper(0)
            inoutres403 = Utils.tryParseInt(v.normal_value, wrapval402)
            val = wrapval402.value
            if (inoutres403):
                str0_ = NumberHelper.get_number_adjective(
                    val, gender, (MorphNumber.SINGULAR if num == MorphNumber.SINGULAR or val == 1 else MorphNumber.PLURAL))
                if (str0_ is not None):
                    res = str0_
                    if (v.undef_coef == 0):
                        break
                    max_coef = v.undef_coef
                    continue
        res1 = it.normal_value
        if (num == MorphNumber.SINGULAR):
            # Irregular singulars.
            if (res1 == "ДЕТИ"):
                res1 = "РЕБЕНОК"
            elif (res1 == "ЛЮДИ"):
                res1 = "ЧЕЛОВЕК"
        max_coef = v.undef_coef
        if (v.undef_coef > 0):
            res = res1
            continue
        # Score a zero-coefficient variant: +1 when it equals the source
        # term in nominative singular (not for adjectives), +3 when plural
        # is requested and the variant is plural.
        def_co = 0
        if (mc is not None and mc.is_adjective and v.undef_coef == 0):
            pass
        elif (
                ((isinstance(self.begin_token, TextToken)) and res1 == self.begin_token.term and it.case_.is_nominative) and
                it.number == MorphNumber.SINGULAR):
            def_co = 1
        if (num == MorphNumber.PLURAL and ((v.number) & (MorphNumber.PLURAL)) == (MorphNumber.PLURAL)):
            def_co += 3
        if (res is None or def_co > def_coef):
            res = res1
            def_coef = def_co
            if (def_co > 0):
                break
    if (res is not None):
        return self.__corr_chars(res, keep_chars)
    # No variant produced a value: fall back to the underlying tokens.
    if (res is None and self.begin_token == self.end_token):
        res = self.begin_token.get_normal_case_text(
            mc, num, gender, keep_chars)
    elif (res is None):
        # Several tokens: normalize the first one and append the text of
        # the rest, or take the text of the whole span if that fails.
        res = self.begin_token.get_normal_case_text(
            mc, num, gender, keep_chars)
        if (res is None):
            res = MiscHelper.get_text_value_of_meta_token(
                self, (GetTextAttr.KEEPREGISTER if keep_chars else GetTextAttr.NO))
        else:
            res = "{0} {1}".format(
                res,
                MiscHelper.get_text_value(
                    self.begin_token.next0_, self.end_token,
                    (GetTextAttr.KEEPREGISTER if keep_chars else GetTextAttr.NO)))
    return Utils.ifNotNull(res, "?")
def getNormalCaseText(self, mc: 'MorphClass' = None, single_number: bool = False, gender: 'MorphGender' = MorphGender.UNDEFINED, keep_chars: bool = False) -> str:
    """Return the normalized text of this item.

    The morphological variants (``NounPhraseItemTextVar`` items of
    ``self.morph``) are scanned for the best normal value; if none yields
    a result and the item covers a single token, that token's own
    normalization is used.

    Args:
        mc: requested class; several branches only apply to adjectives.
        single_number: when True, prefer singular values.
        gender: requested gender, used when re-inflecting adjectives.
        keep_chars: passed on to the helpers that restore the casing.

    Returns:
        The normalized text; "?" when nothing could be produced.
    """
    # A single referent token normalizes itself.
    if ((isinstance(self.begin_token, ReferentToken)) and self.begin_token == self.end_token):
        return self.begin_token.getNormalCaseText(mc, single_number, gender, keep_chars)
    res = None
    # max_coef: undef_coef of the last variant taken into account.
    # def_coef: best score among variants with undef_coef == 0 (-1 = none yet).
    max_coef = 0
    def_coef = -1
    for it in self.morph.items:
        v = Utils.asObjectOrNull(it, NounPhraseItemTextVar)
        # Items of another type have none of the fields used below; skip
        # them instead of failing on a None dereference.
        if (v is None):
            continue
        if (v.undef_coef > 0 and (((v.undef_coef < max_coef) or def_coef >= 0))):
            continue
        if (single_number and v.single_number_value is not None):
            # A ready singular value exists; for a neuter/feminine adjective
            # it is re-inflected by the morphology service.
            if (mc is not None and ((gender == MorphGender.NEUTER or gender == MorphGender.FEMINIE)) and mc.is_adjective):
                bi = MorphBaseInfo._new467(MorphClass(mc), gender, MorphNumber.SINGULAR, MorphCase.NOMINATIVE, self.morph.language)
                str0_ = Morphology.getWordform(v.single_number_value, bi)
                if (str0_ is not None):
                    res = str0_
            else:
                res = v.single_number_value
            if (v.undef_coef == 0):
                break
            max_coef = v.undef_coef
            continue
        if (Utils.isNullOrEmpty(v.normal_value)):
            continue
        if (str.isdigit(v.normal_value[0]) and mc is not None and mc.is_adjective):
            # A numeric value requested as an adjective is converted by
            # NumberHelper.getNumberAdjective.
            wrapval468 = RefOutArgWrapper(0)
            inoutres469 = Utils.tryParseInt(v.normal_value, wrapval468)
            val = wrapval468.value
            if (inoutres469):
                str0_ = NumberHelper.getNumberAdjective(
                    val, gender, (MorphNumber.SINGULAR if single_number or val == 1 else MorphNumber.PLURAL))
                if (str0_ is not None):
                    res = str0_
                    if (v.undef_coef == 0):
                        break
                    max_coef = v.undef_coef
                    continue
        res1 = (it).normal_value
        if (single_number):
            # Irregular singulars.
            if (res1 == "ДЕТИ"):
                res1 = "РЕБЕНОК"
            elif (res1 == "ЛЮДИ"):
                res1 = "ЧЕЛОВЕК"
        max_coef = v.undef_coef
        if (v.undef_coef > 0):
            res = res1
            continue
        # Score a zero-coefficient variant: 1 when it equals the source term
        # in nominative singular (not applied to adjectives), else 0.
        def_co = 0
        if (mc is not None and mc.is_adjective and v.undef_coef == 0):
            pass
        elif (((isinstance(self.begin_token, TextToken)) and res1 == (self.begin_token).term and it.case_.is_nominative) and it.number == MorphNumber.SINGULAR):
            def_co = 1
        if (res is None or def_co > def_coef):
            res = res1
            def_coef = def_co
            if (def_co > 0):
                break
    if (res is not None):
        return self.__corrChars(res, keep_chars)
    # No variant produced a value: a single-token item falls back to the
    # token's own normalization.
    if (res is None and self.begin_token == self.end_token):
        res = self.begin_token.getNormalCaseText(mc, single_number, gender, keep_chars)
    return Utils.ifNotNull(res, "?")
def process(self, word : str) -> typing.List['MorphWordForm']:
    """ Process a single word

    The word is first looked up in the prefix tree (``m_root``): at every
    node reached, the rules referenced by the node are asked for variants
    matching the remaining tail of the word. If the collected forms do not
    look conclusive, the reverse tree (``m_root_reverce``) is walked from
    the end of the word to add hypothetical forms based on the word ending.

    Args:
        word(str): the word to analyse (compared against upper-case
            literals, so presumably expected in upper case)

    Returns:
        typing.List[MorphWordForm]: the word forms found, or None when the
        word is empty, has more than one character but no vowel, or nothing
        was found. When a reverse lookup is wanted but there is no reverse
        tree, the list collected so far is returned as is (it may be None
        or empty).
    """
    if (Utils.isNullOrEmpty(word)):
        return None
    res = None
    if (len(word) > 1):
        # A multi-character word without a single vowel is not analysed.
        i = 0
        while i < len(word):
            ch = word[i]
            if (LanguageHelper.is_cyrillic_vowel(ch) or LanguageHelper.is_latin_vowel(ch)):
                break
            i += 1
        if (i >= len(word)):
            return res
    # Walk the prefix tree; i is the length of the prefix consumed so far.
    tn = self.m_root
    i = 0
    while i <= len(word):
        if (tn.lazy_pos > 0):
            self.__load_tree_node(tn)
        if (tn.rule_ids is not None):
            word_begin = None
            word_end = None
            if (i == 0):
                word_end = word
            elif (i < len(word)):
                word_end = word[i:]
            else:
                word_end = ""
            if (res is None):
                res = list()
            for rid in tn.rule_ids:
                r = self.get_rule(rid)
                mvs = r.get_vars(word_end)
                if (mvs is None):
                    continue
                # The prefix string is built lazily, on the first match.
                if (word_begin is None):
                    if (i == len(word)):
                        word_begin = word
                    elif (i > 0):
                        word_begin = word[0:0+i]
                    else:
                        word_begin = ""
                self.__process_result(res, word_begin, mvs)
        if (tn.nodes is None or i >= len(word)):
            break
        ch = ord(word[i])
        wraptn9 = RefOutArgWrapper(None)
        inoutres10 = Utils.tryGetValue(tn.nodes, ch, wraptn9)
        tn = wraptn9.value
        if (not inoutres10):
            break
        i += 1
    # Decide whether forms guessed from the word ending are still needed.
    need_test_unknown_vars = True
    if (res is not None):
        for r in res:
            if ((r.class0_.is_pronoun or r.class0_.is_noun or r.class0_.is_adjective) or (r.class0_.is_misc and r.class0_.is_conjunction) or r.class0_.is_preposition):
                need_test_unknown_vars = False
            elif (r.class0_.is_adverb and r.normal_case is not None):
                if (not LanguageHelper.ends_with_ex(r.normal_case, "О", "А", None, None)):
                    need_test_unknown_vars = False
                elif (r.normal_case == "МНОГО"):
                    need_test_unknown_vars = False
            elif (r.class0_.is_verb and len(res) > 1):
                # A verb next to a form of another class is conclusive,
                # unless the word ends with "ИМ".
                ok = False
                for rr in res:
                    if (rr != r and rr.class0_ != r.class0_):
                        ok = True
                        break
                if (ok and not LanguageHelper.ends_with(word, "ИМ")):
                    need_test_unknown_vars = False
    if (need_test_unknown_vars and LanguageHelper.is_cyrillic_char(word[0])):
        # Count Cyrillic vowels (gl) against all other characters (sog);
        # guessing needs at least two of each.
        gl = 0
        sog = 0
        j = 0
        while j < len(word):
            if (LanguageHelper.is_cyrillic_vowel(word[j])):
                gl += 1
            else:
                sog += 1
            j += 1
        if ((gl < 2) or (sog < 2)):
            need_test_unknown_vars = False
    if (need_test_unknown_vars and res is not None and len(res) == 1):
        # A single form with certain attribute combinations is accepted
        # without guessing (attribute codes are the dictionary's own labels).
        if (res[0].class0_.is_verb):
            if ("н.вр." in res[0].misc.attrs and "нес.в." in res[0].misc.attrs and not "страд.з." in res[0].misc.attrs):
                need_test_unknown_vars = False
            elif ("б.вр." in res[0].misc.attrs and "сов.в." in res[0].misc.attrs):
                need_test_unknown_vars = False
            elif ("инф." in res[0].misc.attrs and "сов.в." in res[0].misc.attrs):
                need_test_unknown_vars = False
            elif (res[0].normal_case is not None and LanguageHelper.ends_with(res[0].normal_case, "СЯ")):
                need_test_unknown_vars = False
        if (res[0].class0_.is_undefined and "прдктв." in res[0].misc.attrs):
            need_test_unknown_vars = False
    if (need_test_unknown_vars):
        if (self.m_root_reverce is None):
            return res
        # Walk the reverse tree from the last character towards the first
        # until a node carrying reverse variants is found. tn0 stays equal
        # to the root when no such node exists.
        tn = self.m_root_reverce
        tn0 = self.m_root_reverce
        for i in range(len(word) - 1, -1, -1):
            if (tn.lazy_pos > 0):
                self.__load_tree_node(tn)
            ch = ord(word[i])
            if (tn.nodes is None):
                break
            if (not ch in tn.nodes):
                break
            tn = tn.nodes[ch]
            if (tn.lazy_pos > 0):
                self.__load_tree_node(tn)
            if (tn.reverce_variants is not None):
                tn0 = tn
                break
        else:
            # The loop consumed the whole word: i is read below and must
            # then be "one before the first character".
            i = -1
        if (tn0 != self.m_root_reverce):
            # The part of the word before the matched ending must contain
            # a vowel, unless the ending starts within the first 4 characters.
            glas = i < 4
            while i >= 0:
                if (LanguageHelper.is_cyrillic_vowel(word[i]) or LanguageHelper.is_latin_vowel(word[i])):
                    glas = True
                    break
                i -= 1
            if (glas):
                # The prefix-tree walk may have found no rules at all, in
                # which case res is still None; the loops below need a list.
                if (res is None):
                    res = list()
                for mvref in tn0.reverce_variants:
                    mv = self.get_rule_var(mvref.rule_id, mvref.variant_id)
                    if (mv is None):
                        continue
                    if (((not mv.class0_.is_verb and not mv.class0_.is_adjective and not mv.class0_.is_noun) and not mv.class0_.is_proper_surname and not mv.class0_.is_proper_geo) and not mv.class0_.is_proper_secname):
                        continue
                    # Skip the hypothesis when a dictionary form already
                    # covers it.
                    ok = False
                    for rr in res:
                        if (rr.is_in_dictionary):
                            if (rr.class0_ == mv.class0_ or rr.class0_.is_noun):
                                ok = True
                                break
                            if (not mv.class0_.is_adjective and rr.class0_.is_verb):
                                ok = True
                                break
                    if (ok):
                        continue
                    if (len(mv.tail) > 0 and not LanguageHelper.ends_with(word, mv.tail)):
                        continue
                    r = MorphWordForm(mv, word, self.get_misc_info(mv.misc_info_id))
                    if (not r._has_morph_equals(res)):
                        r.undef_coef = mvref.coef
                        res.append(r)
    if (word == "ПРИ" and res is not None):
        # Special case: drop geographic-name readings of this word.
        for i in range(len(res) - 1, -1, -1):
            if (res[i].class0_.is_proper_geo):
                del res[i]
    if (res is None or len(res) == 0):
        return None
    self.__sort(res, word)
    for v in res:
        if (v.normal_case is None):
            v.normal_case = word
        if (v.class0_.is_verb):
            if (v.normal_full is None and LanguageHelper.ends_with(v.normal_case, "ТЬСЯ")):
                # Full normal form is the normal form without the last
                # two characters.
                v.normal_full = v.normal_case[0:0+len(v.normal_case) - 2]
        v.language = self.language
        if (v.class0_.is_preposition):
            v.normal_case = LanguageHelper.normalize_preposition(v.normal_case)
    # Remove some guessed adjective forms and collect the classes of the
    # dictionary forms; iterating backwards keeps indexes valid on deletion.
    mc = MorphClass()
    for i in range(len(res) - 1, -1, -1):
        if (not res[i].is_in_dictionary and res[i].class0_.is_adjective and len(res) > 1):
            if ("к.ф." in res[i].misc.attrs or "неизм." in res[i].misc.attrs):
                del res[i]
                continue
        if (res[i].is_in_dictionary):
            mc.value |= res[i].class0_.value
    if (mc == MorphClass.VERB and len(res) > 1):
        for r in res:
            if (r.undef_coef > (100) and r.class0_ == MorphClass.ADJECTIVE):
                r.undef_coef = (0)
    if (len(res) == 0):
        return None
    return res
def getNormalCaseText(self, mc: 'MorphClass' = None, single_number: bool = False, gender: 'MorphGender' = MorphGender.UNDEFINED, keep_chars: bool = False) -> str:
    """Return the normalized text of this token.

    The first morphological variant that is a word form and passes the
    class and gender filters supplies the result. If no word form
    qualifies, the token's own term is used, possibly converted to the
    singular by the morphology service or shortened by a simple ending
    heuristic for Cyrillic terms.

    Args:
        mc: class the variant must intersect with; for a preposition the
            normalized preposition is returned immediately.
        single_number: when True, singular conversions are enabled.
        gender: requested gender; UNDEFINED disables gender filtering.
        keep_chars: when True, the result is re-cased according to
            ``self.chars`` (all lower case, or capitalized).

    Returns:
        The normalized text, or None when no word form qualified but at
        least one accepted variant has a defined case.
    """
    from pullenti.ner.core.MiscHelper import MiscHelper
    empty = True
    if (mc is not None and mc.is_preposition):
        return LanguageHelper.normalizePreposition(self.term)
    for it in self.morph.items:
        if (mc is not None and not mc.is_undefined):
            cc = (it.class0_.value) & (mc.value)
            if (cc == 0):
                continue
            if (MorphClass.isMiscInt(cc) and not MorphClass.isProperInt(cc) and mc.value != it.class0_.value):
                continue
        wf = Utils.asObjectOrNull(it, MorphWordForm)
        normal_full = False
        if (gender != MorphGender.UNDEFINED):
            if ((((it.gender) & (gender))) == (MorphGender.UNDEFINED)):
                # Gender mismatch: still accepted for a masculine request
                # when a full normal form is available, or for personal
                # pronouns; otherwise the variant is skipped.
                if ((gender == MorphGender.MASCULINE and ((it.gender != MorphGender.UNDEFINED or it.number == MorphNumber.PLURAL)) and wf is not None) and wf.normal_full is not None):
                    normal_full = True
                elif (gender == MorphGender.MASCULINE and it.class0_.is_personal_pronoun):
                    pass
                else:
                    continue
        if (not it.case_.is_undefined):
            empty = False
        if (wf is None):
            continue
        if (single_number and it.number == MorphNumber.PLURAL and wf.normal_full is not None):
            # Keep normal_case only when it is normal_full plus a
            # two-character "СЯ" ending; otherwise use normal_full.
            # normal_case may be missing (it is guarded below as well), so
            # it is checked before its length is taken.
            normal_case = wf.normal_case
            if (normal_case is not None and len(normal_case) == (len(wf.normal_full) + 2) and len(normal_case) > 4 and normal_case.endswith("СЯ")):
                res = normal_case
            else:
                res = wf.normal_full
        else:
            res = (wf.normal_full if normal_full else (Utils.ifNotNull(wf.normal_case, self.term)))
        if (single_number and mc is not None and mc == MorphClass.NOUN):
            # Irregular singular.
            if (res == "ДЕТИ"):
                res = "РЕБЕНОК"
        if (keep_chars):
            if (self.chars.is_all_lower):
                res = res.lower()
            elif (self.chars.is_capital_upper):
                res = MiscHelper.convertFirstCharUpperAndOtherLower(res)
        return res
    if (not empty):
        return None
    # No word form qualified: fall back to the term itself.
    te = None
    if (single_number and mc is not None):
        bi = MorphBaseInfo._new549(MorphClass(mc), gender, MorphNumber.SINGULAR, self.morph.language)
        vars0_ = Morphology.getWordform(self.term, bi)
        if (vars0_ is not None):
            te = vars0_
    if (self.chars.is_cyrillic_letter and te is None and len(self.term) > 3):
        # Ending heuristic: drop a final "ОМ"/"АМ", or a final vowel that
        # follows a non-vowel.
        ch0 = self.term[len(self.term) - 1]
        ch1 = self.term[len(self.term) - 2]
        if (ch0 == 'М' and ((ch1 == 'О' or ch1 == 'А'))):
            te = self.term[0:0 + len(self.term) - 2]
        elif (not LanguageHelper.isCyrillicVowel(ch1) and LanguageHelper.isCyrillicVowel(ch0)):
            te = self.term[0:0 + len(self.term) - 1]
    if (te is None):
        te = self.term
    if (keep_chars):
        if (self.chars.is_all_lower):
            return te.lower()
        elif (self.chars.is_capital_upper):
            return MiscHelper.convertFirstCharUpperAndOtherLower(te)
    return te