def try_parse(t: 'Token') -> 'DelimToken':
    """Try to extract a delimiter token starting at the given token.

    Returns a DelimToken (possibly spanning a leading ", and" and a doubled
    delimiter of the same type), or None when no delimiter starts here.
    """
    if not isinstance(t, TextToken):
        return None
    if t.is_comma_and:
        # ", and <delim>": parse the delimiter after the comma and
        # stretch its span back to cover the comma itself.
        after = DelimToken.try_parse(t.next0_)
        if after is None:
            return None
        after.begin_token = t
        return after
    tok = DelimToken.__m_onto.try_parse(t, TerminParseAttr.NO)
    if tok is None:
        return None
    res = DelimToken(t, tok.end_token)
    res.typ = (Utils.valToEnum(tok.termin.tag, DelimType))
    res.doublt = tok.termin.tag2 is not None
    # Merge an immediately following delimiter of the same type into one token.
    follow = DelimToken.try_parse(res.end_token.next0_)
    if follow is not None and follow.typ == res.typ:
        res.end_token = follow.end_token
        res.doublt = False
    if t.morph.class0_.is_pronoun:
        # A pronoun that actually heads a longer noun phrase is not a delimiter.
        npt = NounPhraseHelper.try_parse(t, NounPhraseParseAttr.PARSEADVERBS, 0, None)
        if npt is not None and npt.end_char > res.end_char:
            return None
    return res
def try_parse(t : 'Token', prev : 'WeaponItemToken', after_conj : bool, attach_high : bool=False) -> 'WeaponItemToken':
    """Try to extract a weapon item starting at t.

    First tries a direct parse; if that fails, retries from the noun head of a
    noun phrase starting at t (so leading adjectives become part of an
    alternative value). For NAME items, a following parenthesized transliteration
    may be absorbed as alt_value.
    """
    res = WeaponItemToken.__try_parse(t, prev, after_conj, attach_high)
    if (res is None):
        # Direct parse failed: if t starts a noun phrase whose noun head lies
        # further right (i.e. there are adjectives before it), retry from the noun.
        npt = NounPhraseHelper.try_parse(t, NounPhraseParseAttr.NO, 0, None)
        if (npt is not None and npt.noun.begin_char > npt.begin_char):
            res = WeaponItemToken.__try_parse(npt.noun.begin_token, prev, after_conj, attach_high)
            if (res is not None):
                if (res.typ == WeaponItemToken.Typs.NOUN):
                    str0_ = npt.get_normal_case_text(None, MorphNumber.SINGULAR, MorphGender.UNDEFINED, False)
                    # Fix a known bad singular normalization ("hand grenade").
                    if (str0_ == "РУЧНОЙ ГРАНАТ"):
                        str0_ = "РУЧНАЯ ГРАНАТА"
                    if ((Utils.ifNotNull(str0_, "")).endswith(res.value)):
                        if (res.alt_value is None):
                            res.alt_value = str0_
                        else:
                            # Keep only the adjective part before the matched value.
                            str0_ = str0_[0:0+len(str0_) - len(res.value)].strip()
                            res.alt_value = "{0} {1}".format(str0_, res.alt_value)
                        res.begin_token = t
                        return res
        return None
    if (res.typ == WeaponItemToken.Typs.NAME):
        # A short parenthesized sequence right after the name may be a
        # Cyrillic/Latin transliteration of the same name.
        br = BracketHelper.try_parse(res.end_token.next0_, BracketParseAttr.NO, 100)
        if (br is not None and br.is_char('(')):
            alt = MiscHelper.get_text_value_of_meta_token(br, GetTextAttr.NO)
            if (MiscHelper.can_be_equal_cyr_and_latss(res.value, alt)):
                res.alt_value = alt
                res.end_token = br.end_token
    return res
def try_parse(t: 'Token') -> 'ConjunctionToken':
    """Try to extract a conjunction starting at the given token.

    Args:
        t(Token): the initial token
    Returns:
        ConjunctionToken: the result, or None
    """
    if (not (isinstance(t, TextToken))):
        return None
    if (t.is_comma):
        # A comma may glue to a following conjunction ("…, а …") or stand alone.
        ne = ConjunctionHelper.try_parse(t.next0_)
        if (ne is not None):
            ne.begin_token = t
            ne.is_simple = False
            return ne
        return ConjunctionToken._new478(t, t, ConjunctionType.COMMA, True, ",")
    tok = ConjunctionHelper.__m_ontology.try_parse(t, TerminParseAttr.NO)
    if (tok is not None):
        if (t.is_value("ТО", None)):
            # "ТО" may instead open a noun phrase ("то обстоятельство…") — reject then.
            npt = NounPhraseHelper.try_parse(t, NounPhraseParseAttr.PARSEADVERBS, 0, None)
            if (npt is not None and npt.end_char > tok.end_token.end_char):
                return None
        if (tok.termin.tag2 is not None):
            # tag2 marks ambiguous entries that need an extra morphological check.
            if (not (isinstance(tok.end_token, TextToken))):
                return None
            if (tok.end_token.get_morph_class_in_dictionary().is_verb):
                if (not tok.end_token.term.endswith("АЯ")):
                    return None
        return ConjunctionToken._new479(t, tok.end_token, tok.termin.canonic_text, Utils.valToEnum(tok.termin.tag, ConjunctionType))
    if (not t.get_morph_class_in_dictionary().is_conjunction):
        return None
    if (t.is_and or t.is_or):
        res = ConjunctionToken._new480(t, t, t.term, True, (ConjunctionType.OR if t.is_or else ConjunctionType.AND))
        # Absorb "(или)" and "/или" variants: "и (или)", "и/или".
        if (((t.next0_ is not None and t.next0_.is_char('(') and (isinstance(t.next0_.next0_, TextToken))) and t.next0_.next0_.is_or and t.next0_.next0_.next0_ is not None) and t.next0_.next0_.next0_.is_char(')')):
            res.end_token = t.next0_.next0_.next0_
        elif ((t.next0_ is not None and t.next0_.is_char_of("\\/") and (isinstance(t.next0_.next0_, TextToken))) and t.next0_.next0_.is_or):
            res.end_token = t.next0_.next0_
        return res
    term = t.term
    if (term == "НИ"):
        return ConjunctionToken._new479(t, t, term, ConjunctionType.NOT)
    if ((term == "А" or term == "НО" or term == "ЗАТО") or term == "ОДНАКО"):
        return ConjunctionToken._new479(t, t, term, ConjunctionType.BUT)
    return None
def check_unknown_region(t: 'Token') -> 'Token':
    """Check whether a noun phrase starting at t names an "unknown region".

    Returns the final token of the noun phrase when its head word matches the
    internal unknown-regions ontology, otherwise None.
    """
    from pullenti.ner.geo.internal.TerrItemToken import TerrItemToken
    if not isinstance(t, TextToken):
        return None
    npt = NounPhraseHelper.try_parse(t, NounPhraseParseAttr.NO, 0, None)
    if npt is None:
        return None
    hit = TerrItemToken._m_unknown_regions.try_parse(npt.end_token, TerminParseAttr.FULLWORDSONLY)
    return npt.end_token if hit is not None else None
def try_attach(t0: 'Token') -> 'PhoneItemToken':
    """Try to attach a phone item at t0.

    If the item is a PREFIX ("тел.", "факс" …), extend it rightward over
    explanatory words (noun phrases, proper names, prepositions) up to a
    colon, a line break, or a non-prefix phone item.
    """
    res = PhoneItemToken.__try_attach(t0)
    if (res is None):
        return None
    if (res.item_type != PhoneItemToken.PhoneItemType.PREFIX):
        return res
    t = res.end_token.next0_
    first_pass3388 = True
    while True:
        # Transpiler-emulated do-while: advance t on every pass but the first.
        if first_pass3388: first_pass3388 = False
        else: t = t.next0_
        if (not (t is not None)): break
        if (t.is_table_control_char):
            break
        if (t.is_newline_before):
            break
        res2 = PhoneItemToken.__try_attach(t)
        if (res2 is not None):
            if (res2.item_type == PhoneItemToken.PhoneItemType.PREFIX):
                # Merge consecutive prefixes; keep the first defined kind.
                if (res.kind == PhoneKind.UNDEFINED):
                    res.kind = res2.kind
                res.end_token = res2.end_token
                t = res.end_token
                continue
            # A non-prefix phone item ends the prefix span.
            break
        if (t.is_char(':')):
            res.end_token = t
            break
        if (not (isinstance(t, TextToken))):
            break
        if (t0.length_char == 1):
            # Single-character prefixes ("т") take no trailing words.
            break
        npt = NounPhraseHelper.try_parse(t, NounPhraseParseAttr.NO, 0, None)
        if (npt is not None):
            t = npt.end_token
            if (t.is_value("ПОСЕЛЕНИЕ", None)):
                # "поселение" means this is an address context, not a phone.
                return None
            res.end_token = t
            continue
        if (t.get_morph_class_in_dictionary().is_proper):
            res.end_token = t
            continue
        if (t.morph.class0_.is_preposition):
            continue
        break
    return res
def __site_before(t: 'Token') -> 'Token':
    """Scan backwards from t for a word introducing a web-site reference.

    Returns the first token of the introducing phrase ("сайт", "web-сайт",
    "эл. адрес" …), or None when none is found. t is expected to be the
    token just before the URI (possibly a ':').
    """
    if (t is not None and t.is_char(':')):
        t = t.previous
    if (t is None):
        return None
    if ((t.is_value("ВЕБСАЙТ", None) or t.is_value("WEBSITE", None) or t.is_value("WEB", None)) or t.is_value("WWW", None)):
        return t
    t0 = None
    if (t.is_value("САЙТ", None) or t.is_value("SITE", None)):
        t0 = t
        t = t.previous
    elif (t.is_value("АДРЕС", None)):
        # "эл. адрес" / "электронный адрес" — only these forms are accepted.
        t0 = t.previous
        if (t0 is not None and t0.is_char('.')):
            t0 = t0.previous
        if (t0 is not None):
            if (t0.is_value("ЭЛ", None) or t0.is_value("ЭЛЕКТРОННЫЙ", None)):
                return t0
        return None
    else:
        return None
    # Only the "САЙТ"/"SITE" branch falls through to here, so t0 is not None.
    if (t is not None and t.is_hiphen):
        t = t.previous
    if (t is None):
        return t0
    if (t.is_value("WEB", None) or t.is_value("ВЕБ", None)):
        t0 = t
    # An adjacent adjective ("официальный сайт") extends the phrase leftward.
    if (t0.previous is not None and t0.previous.morph.class0_.is_adjective and (t0.whitespaces_before_count < 3)):
        npt = NounPhraseHelper.try_parse(t0.previous, NounPhraseParseAttr.NO, 0, None)
        if (npt is not None):
            t0 = npt.begin_token
    return t0
def try_attach(t : 'Token', p1 : 'InstrumentParticipantReferent'=None, p2 : 'InstrumentParticipantReferent'=None, is_contract : bool=False) -> 'ParticipantToken':
    """Try to attach a contract-participant token at t.

    Handles several patterns: a list of same-typed referents later named
    "сторонами" (NAMEDASPARTS); a person/organization referent followed by
    '(далее именуемый …)' naming (NAMEDAS); an ontology role word followed by
    a referent; and matching against the known types of p1/p2 (PURE).
    Returns None when nothing attaches.
    """
    if (t is None):
        return None
    tt = t
    br = False
    if (p1 is None and p2 is None and is_contract):
        r1 = t.get_referent()
        # Pattern: "X, Y и Z, именуемые … сторонами" — a comma/and list of
        # referents of the same type later called "the parties".
        if ((r1 is not None and t.next0_ is not None and t.next0_.is_comma_and) and (isinstance(t.next0_.next0_, ReferentToken))):
            r2 = t.next0_.next0_.get_referent()
            if (r1.type_name == r2.type_name):
                ttt = t.next0_.next0_.next0_
                refs = list()
                refs.append(r1)
                refs.append(r2)
                first_pass3282 = True
                while True:
                    # collect further ", <same-type referent>" items
                    if first_pass3282: first_pass3282 = False
                    else: ttt = ttt.next0_
                    if (not (ttt is not None)): break
                    if ((ttt.is_comma_and and ttt.next0_ is not None and ttt.next0_.get_referent() is not None) and ttt.next0_.get_referent().type_name == r1.type_name):
                        ttt = ttt.next0_
                        if (not ttt.get_referent() in refs):
                            refs.append(ttt.get_referent())
                        continue
                    break
                first_pass3283 = True
                while True:
                    # skip filler words up to "… сторонами" (plural)
                    if first_pass3283: first_pass3283 = False
                    else: ttt = ttt.next0_
                    if (not (ttt is not None)): break
                    if (ttt.is_comma or ttt.morph.class0_.is_preposition):
                        continue
                    if ((ttt.is_value("ИМЕНОВАТЬ", None) or ttt.is_value("ДАЛЬНЕЙШИЙ", None) or ttt.is_value("ДАЛЕЕ", None)) or ttt.is_value("ТЕКСТ", None)):
                        continue
                    if (ttt.is_value("ДОГОВАРИВАТЬСЯ", None)):
                        continue
                    npt = NounPhraseHelper.try_parse(ttt, NounPhraseParseAttr.NO, 0, None)
                    if (npt is not None and npt.noun.is_value("СТОРОНА", None) and npt.morph.number != MorphNumber.SINGULAR):
                        re = ParticipantToken._new1573(t, npt.end_token, ParticipantToken.Kinds.NAMEDASPARTS)
                        re.parts = refs
                        return re
                    break
        if ((isinstance(r1, OrganizationReferent)) or (isinstance(r1, PersonReferent))):
            has_br = False
            has_named = False
            if (isinstance(r1, PersonReferent)):
                # "лицо, …" before a person referent means it is not a participant.
                if (t.previous is not None and t.previous.is_value("ЛИЦО", None)):
                    return None
            elif (t.previous is not None and ((t.previous.is_value("ВЫДАВАТЬ", None) or t.previous.is_value("ВЫДАТЬ", None)))):
                return None
            # First look INSIDE the referent token for "(именуемый …)" naming.
            ttt = t.begin_token
            while ttt is not None and (ttt.end_char < t.end_char):
                if (ttt.is_char('(')):
                    has_br = True
                elif ((ttt.is_value("ИМЕНОВАТЬ", None) or ttt.is_value("ДАЛЬНЕЙШИЙ", None) or ttt.is_value("ДАЛЕЕ", None)) or ttt.is_value("ТЕКСТ", None)):
                    has_named = True
                elif ((ttt.is_comma or ttt.morph.class0_.is_preposition or ttt.is_hiphen) or ttt.is_char(':')):
                    pass
                elif (isinstance(ttt, ReferentToken)):
                    pass
                elif (has_br or has_named):
                    npt = NounPhraseHelper.try_parse(ttt, NounPhraseParseAttr.REFERENTCANBENOUN, 0, None)
                    if (npt is None):
                        break
                    if (has_br):
                        if (npt.end_token.next0_ is None or not npt.end_token.next0_.is_char(')')):
                            break
                    if (not has_named):
                        if (ParticipantToken.M_ONTOLOGY.try_parse(ttt, TerminParseAttr.NO) is None):
                            break
                    re = ParticipantToken._new1573(t, t, ParticipantToken.Kinds.NAMEDAS)
                    re.typ = npt.get_normal_case_text(None, MorphNumber.SINGULAR, MorphGender.UNDEFINED, False)
                    re.parts = list()
                    re.parts.append(r1)
                    return re
                ttt = ttt.next0_
            # Then look AFTER the referent for naming / identity details.
            has_br = False
            has_named = False
            end_side = None
            brr = None
            add_refs = None
            ttt = t.next0_
            first_pass3284 = True
            while True:
                if first_pass3284: first_pass3284 = False
                else: ttt = ttt.next0_
                if (not (ttt is not None)): break
                if ((isinstance(ttt, NumberToken)) and (isinstance(ttt.next0_, TextToken)) and ttt.next0_.term == "СТОРОНЫ"):
                    # "… с одной стороны" marker
                    ttt = ttt.next0_
                    end_side = ttt
                    if (ttt.next0_ is not None and ttt.next0_.is_comma):
                        ttt = ttt.next0_
                    if (ttt.next0_ is not None and ttt.next0_.is_and):
                        break
                if (brr is not None and ttt.begin_char > brr.end_char):
                    brr = (None)
                if (BracketHelper.can_be_start_of_sequence(ttt, False, False)):
                    brr = BracketHelper.try_parse(ttt, BracketParseAttr.NO, 100)
                    if (brr is not None and (brr.length_char < 7) and ttt.is_char('(')):
                        # tiny parenthesized insert — just skip it
                        ttt = brr.end_token
                        brr = (None)
                    continue
                elif ((ttt.is_value("ИМЕНОВАТЬ", None) or ttt.is_value("ДАЛЬНЕЙШИЙ", None) or ttt.is_value("ДАЛЕЕ", None)) or ttt.is_value("ТЕКСТ", None)):
                    has_named = True
                elif ((ttt.is_comma or ttt.morph.class0_.is_preposition or ttt.is_hiphen) or ttt.is_char(':')):
                    pass
                elif (brr is not None or has_named):
                    if (BracketHelper.can_be_start_of_sequence(ttt, True, False)):
                        ttt = ttt.next0_
                    npt = NounPhraseHelper.try_parse(ttt, NounPhraseParseAttr.REFERENTCANBENOUN, 0, None)
                    typ22 = None
                    if (npt is not None):
                        ttt = npt.end_token
                        if (npt.end_token.is_value("ДОГОВОР", None)):
                            continue
                    else:
                        ttok = None
                        if (isinstance(ttt, MetaToken)):
                            ttok = ParticipantToken.M_ONTOLOGY.try_parse(ttt.begin_token, TerminParseAttr.NO)
                        if (ttok is not None):
                            typ22 = ttok.termin.canonic_text
                        elif (has_named and ttt.morph.class0_.is_adjective):
                            typ22 = ttt.get_normal_case_text(MorphClass.ADJECTIVE, MorphNumber.UNDEFINED, MorphGender.UNDEFINED, False)
                        elif (brr is not None):
                            continue
                        else:
                            break
                    if (BracketHelper.can_be_end_of_sequence(ttt.next0_, True, None, False)):
                        ttt = ttt.next0_
                    if (brr is not None):
                        if (ttt.next0_ is None):
                            ttt = brr.end_token
                            continue
                        ttt = ttt.next0_
                    if (not has_named and typ22 is None):
                        if (ParticipantToken.M_ONTOLOGY.try_parse(npt.begin_token, TerminParseAttr.NO) is None):
                            break
                    re = ParticipantToken._new1573(t, ttt, ParticipantToken.Kinds.NAMEDAS)
                    re.typ = (Utils.ifNotNull(typ22, npt.get_normal_case_text(None, MorphNumber.SINGULAR, MorphGender.UNDEFINED, False)))
                    re.parts = list()
                    re.parts.append(r1)
                    return re
                elif ((ttt.is_value("ЗАРЕГИСТРИРОВАННЫЙ", None) or ttt.is_value("КАЧЕСТВО", None) or ttt.is_value("ПРОЖИВАЮЩИЙ", None)) or ttt.is_value("ЗАРЕГ", None)):
                    pass
                elif (ttt.get_referent() == r1):
                    pass
                elif ((isinstance(ttt.get_referent(), PersonIdentityReferent)) or (isinstance(ttt.get_referent(), AddressReferent))):
                    # passport / address details belong to this participant
                    if (add_refs is None):
                        add_refs = list()
                    add_refs.append(ttt.get_referent())
                else:
                    prr = ttt.kit.process_referent("PERSONPROPERTY", ttt)
                    if (prr is not None):
                        ttt = prr.end_token
                        continue
                    if (isinstance(ttt.get_referent(), GeoReferent)):
                        continue
                    npt = NounPhraseHelper.try_parse(ttt, NounPhraseParseAttr.NO, 0, None)
                    if (npt is not None):
                        if ((npt.noun.is_value("МЕСТО", None) or npt.noun.is_value("ЖИТЕЛЬСТВО", None) or npt.noun.is_value("ПРЕДПРИНИМАТЕЛЬ", None)) or npt.noun.is_value("ПОЛ", None) or npt.noun.is_value("РОЖДЕНИЕ", None)):
                            ttt = npt.end_token
                            continue
                    if (ttt.is_newline_before):
                        break
                    if (ttt.length_char < 3):
                        continue
                    mc = ttt.get_morph_class_in_dictionary()
                    if (mc.is_adverb or mc.is_adjective):
                        continue
                    if (ttt.chars.is_all_upper):
                        continue
                    break
            if (end_side is not None or ((add_refs is not None and t.previous is not None and t.previous.is_and))):
                re = ParticipantToken._new1573(t, Utils.ifNotNull(end_side, t), ParticipantToken.Kinds.NAMEDAS)
                re.typ = (None)
                re.parts = list()
                re.parts.append(r1)
                if (add_refs is not None):
                    re.parts.extend(add_refs)
                return re
    # Pattern: ontology role word ("Заказчик" …) followed by a referent.
    too = ParticipantToken.M_ONTOLOGY.try_parse(t, TerminParseAttr.NO)
    if (too is not None):
        if ((isinstance(t.previous, TextToken)) and t.previous.is_value("ЛИЦО", None)):
            too = (None)
    if (too is not None and too.termin.tag is not None and too.termin.canonic_text != "СТОРОНА"):
        tt1 = too.end_token.next0_
        if (tt1 is not None):
            if (tt1.is_hiphen or tt1.is_char(':')):
                tt1 = tt1.next0_
        if (isinstance(tt1, ReferentToken)):
            r1 = tt1.get_referent()
            if ((isinstance(r1, PersonReferent)) or (isinstance(r1, OrganizationReferent))):
                re = ParticipantToken._new1573(t, tt1, ParticipantToken.Kinds.NAMEDAS)
                re.typ = too.termin.canonic_text
                re.parts = list()
                re.parts.append(r1)
                return re
    # Pattern: plain type word(s), matched against p1/p2 types or the ontology.
    add_typ1 = (None if p1 is None else p1.typ)
    add_typ2 = (None if p2 is None else p2.typ)
    if (BracketHelper.can_be_start_of_sequence(tt, False, False) and tt.next0_ is not None):
        br = True
        tt = tt.next0_
    term1 = None
    term2 = None
    if (add_typ1 is not None and add_typ1.find(' ') > 0 and not add_typ1.startswith("СТОРОНА")):
        term1 = Termin(add_typ1)
    if (add_typ2 is not None and add_typ2.find(' ') > 0 and not add_typ2.startswith("СТОРОНА")):
        term2 = Termin(add_typ2)
    named = False
    typ_ = None
    t1 = None
    t0 = tt
    first_pass3285 = True
    while True:
        if first_pass3285: first_pass3285 = False
        else: tt = tt.next0_
        if (not (tt is not None)): break
        if (tt.morph.class0_.is_preposition and typ_ is not None):
            continue
        if (tt.is_char_of("(:)") or tt.is_hiphen):
            continue
        if (tt.is_table_control_char):
            break
        if (tt.is_newline_before and tt != t0):
            if (isinstance(tt, NumberToken)):
                break
            if ((isinstance(tt, TextToken)) and (isinstance(tt.previous, TextToken))):
                if (tt.previous.is_value(tt.term, None)):
                    break
        if (BracketHelper.is_bracket(tt, False)):
            continue
        tok = (ParticipantToken.M_ONTOLOGY.try_parse(tt, TerminParseAttr.NO) if ParticipantToken.M_ONTOLOGY is not None else None)
        if (tok is not None and (isinstance(tt.previous, TextToken))):
            if (tt.previous.is_value("ЛИЦО", None)):
                return None
        if (tok is None):
            # Not an ontology term: try to match the known participant types.
            if (add_typ1 is not None and ((MiscHelper.is_not_more_than_one_error(add_typ1, tt) or (((isinstance(tt, MetaToken)) and tt.begin_token.is_value(add_typ1, None)))))):
                if (typ_ is not None):
                    if (not ParticipantToken.__is_types_equal(add_typ1, typ_)):
                        break
                typ_ = add_typ1
                t1 = tt
                continue
            if (add_typ2 is not None and ((MiscHelper.is_not_more_than_one_error(add_typ2, tt) or (((isinstance(tt, MetaToken)) and tt.begin_token.is_value(add_typ2, None)))))):
                if (typ_ is not None):
                    if (not ParticipantToken.__is_types_equal(add_typ2, typ_)):
                        break
                typ_ = add_typ2
                t1 = tt
                continue
            if (tt.chars.is_letter):
                if (term1 is not None):
                    tok1 = term1.try_parse(tt, TerminParseAttr.NO)
                    if (tok1 is not None):
                        if (typ_ is not None):
                            if (not ParticipantToken.__is_types_equal(add_typ1, typ_)):
                                break
                        typ_ = add_typ1
                        tt = tok1.end_token
                        t1 = tt
                        continue
                if (term2 is not None):
                    tok2 = term2.try_parse(tt, TerminParseAttr.NO)
                    if (tok2 is not None):
                        if (typ_ is not None):
                            if (not ParticipantToken.__is_types_equal(add_typ2, typ_)):
                                break
                        typ_ = add_typ2
                        tt = tok2.end_token
                        t1 = tt
                        continue
                if (named and tt.get_morph_class_in_dictionary().is_noun):
                    if (not tt.chars.is_all_lower or BracketHelper.is_bracket(tt.previous, True)):
                        if (DecreeToken.is_keyword(tt, False) is None):
                            val = tt.get_normal_case_text(MorphClass.NOUN, MorphNumber.SINGULAR, MorphGender.UNDEFINED, False)
                            if (typ_ is not None):
                                if (not ParticipantToken.__is_types_equal(typ_, val)):
                                    break
                            typ_ = val
                            t1 = tt
                            continue
                if (named and typ_ is None and is_contract):
                    # Inside a contract a capitalized Cyrillic word may be the role name.
                    if ((isinstance(tt, TextToken)) and tt.chars.is_cyrillic_letter and tt.chars.is_capital_upper):
                        dc = tt.get_morph_class_in_dictionary()
                        if (dc.is_undefined or dc.is_noun):
                            dt = DecreeToken.try_attach(tt, None, False)
                            ok = True
                            if (dt is not None):
                                ok = False
                            elif (tt.is_value("СТОРОНА", None)):
                                ok = False
                            if (ok):
                                typ_ = tt.lemma
                                t1 = tt
                                continue
                        if (dc.is_adjective):
                            npt = NounPhraseHelper.try_parse(tt, NounPhraseParseAttr.NO, 0, None)
                            if (npt is not None and len(npt.adjectives) > 0 and npt.noun.get_morph_class_in_dictionary().is_noun):
                                typ_ = npt.get_normal_case_text(None, MorphNumber.SINGULAR, MorphGender.UNDEFINED, False)
                                t1 = npt.end_token
                                continue
            if (tt == t):
                break
            if ((isinstance(tt, NumberToken)) or tt.is_char('.')):
                break
            if (tt.length_char < 4):
                if (typ_ is not None):
                    continue
            break
        if (tok.termin.tag is None):
            # tag is None marks "именуемый"-style naming words
            named = True
        else:
            if (typ_ is not None):
                break
            if (tok.termin.canonic_text == "СТОРОНА"):
                # "Сторона - 1" / "Сторона 2"
                tt1 = tt.next0_
                if (tt1 is not None and tt1.is_hiphen):
                    tt1 = tt1.next0_
                if (not (isinstance(tt1, NumberToken))):
                    break
                if (tt1.is_newline_before):
                    break
                typ_ = "{0} {1}".format(tok.termin.canonic_text, tt1.value)
                t1 = tt1
            else:
                typ_ = tok.termin.canonic_text
                t1 = tok.end_token
            break
        tt = tok.end_token
    if (typ_ is None):
        return None
    if (not named and t1 != t and not typ_.startswith("СТОРОНА")):
        if (not ParticipantToken.__is_types_equal(typ_, add_typ1) and not ParticipantToken.__is_types_equal(typ_, add_typ2)):
            return None
    # absorb surrounding quotes/brackets
    if (BracketHelper.can_be_end_of_sequence(t1.next0_, False, None, False)):
        t1 = t1.next0_
        if (not t.is_whitespace_before and BracketHelper.can_be_start_of_sequence(t.previous, False, False)):
            t = t.previous
    elif (BracketHelper.can_be_start_of_sequence(t, False, False) and BracketHelper.can_be_end_of_sequence(t1.next0_, True, t, True)):
        t1 = t1.next0_
    if (br and t1.next0_ is not None and BracketHelper.can_be_end_of_sequence(t1.next0_, False, None, False)):
        t1 = t1.next0_
    res = ParticipantToken._new1578(t, t1, (ParticipantToken.Kinds.NAMEDAS if named else ParticipantToken.Kinds.PURE), typ_)
    if (t.is_char(':')):
        res.begin_token = t.next0_
    return res
def __try_attach_contract_ground(t : 'Token', ip : 'InstrumentParticipantReferent', can_be_passport : bool=False) -> 'Token':
    """Look for the legal ground of a participant ("действующий на основании …").

    Scans forward from t; on success sets ip.ground (a DecreeReferent, a
    PersonIdentityReferent, "УСТАВ"-like text, or "ДОВЕРЕННОСТЬ") and returns
    the last consumed token (or an embedded ReferentToken for a registered
    power-of-attorney decree). Returns None when no ground is found.
    """
    ok = False
    first_pass3289 = True
    while True:
        # Transpiler-emulated do-while over the token chain.
        if first_pass3289: first_pass3289 = False
        else: t = t.next0_
        if (not (t is not None)): break
        if (t.is_char(',') or t.morph.class0_.is_preposition):
            continue
        if (t.is_char('(')):
            br = BracketHelper.try_parse(t, BracketParseAttr.NO, 100)
            if (br is not None):
                t = br.end_token
            continue
        if (t.is_value("ОСНОВАНИЕ", None) or t.is_value("ДЕЙСТВОВАТЬ", None) or t.is_value("ДЕЙСТВУЮЩИЙ", None)):
            ok = True
            if (t.next0_ is not None and t.next0_.is_char('(')):
                br = BracketHelper.try_parse(t.next0_, BracketParseAttr.NO, 100)
                if (br is not None and (br.length_char < 10)):
                    t = br.end_token
            continue
        dr = Utils.asObjectOrNull(t.get_referent(), DecreeReferent)
        if (dr is not None):
            ip.ground = dr
            return t
        pir = Utils.asObjectOrNull(t.get_referent(), PersonIdentityReferent)
        if (pir is not None and can_be_passport):
            # NOTE(review): accepts the identity document only when its type does
            # NOT contain "паспорт" — looks inverted given can_be_passport;
            # verify against the C# original before changing.
            if (pir.typ is not None and not "паспорт" in pir.typ):
                ip.ground = pir
                return t
        if (t.is_value("УСТАВ", None)):
            ip.ground = t.get_normal_case_text(MorphClass.NOUN, MorphNumber.SINGULAR, MorphGender.UNDEFINED, False)
            return t
        if (t.is_value("ДОВЕРЕННОСТЬ", None)):
            # Power of attorney: try to pick up its number/date details.
            dts = DecreeToken.try_attach_list(t.next0_, None, 10, False)
            if (dts is None):
                has_spec = False
                ttt = t.next0_
                first_pass3290 = True
                while True:
                    # look ahead up to ~200 chars for notarization wording + number/date
                    if first_pass3290: first_pass3290 = False
                    else: ttt = ttt.next0_
                    if (not (ttt is not None and ((ttt.end_char - t.end_char) < 200))): break
                    if (ttt.is_comma):
                        continue
                    if (ttt.is_value("УДОСТОВЕРИТЬ", None) or ttt.is_value("УДОСТОВЕРЯТЬ", None)):
                        has_spec = True
                        continue
                    dt = DecreeToken.try_attach(ttt, None, False)
                    if (dt is not None):
                        if (dt.typ == DecreeToken.ItemType.DATE or dt.typ == DecreeToken.ItemType.NUMBER):
                            dts = DecreeToken.try_attach_list(ttt, None, 10, False)
                        break
                    npt = NounPhraseHelper.try_parse(ttt, NounPhraseParseAttr.NO, 0, None)
                    if (npt is not None):
                        if (npt.end_token.is_value("НОТАРИУС", None)):
                            ttt = npt.end_token
                            has_spec = True
                            continue
                    if (ttt.get_referent() is not None):
                        if (has_spec):
                            continue
                    break
            if (dts is not None and len(dts) > 0):
                # register a DOVERENNOST decree referent and embed it as a token
                t0 = t
                dr = DecreeReferent()
                dr.typ = "ДОВЕРЕННОСТЬ"
                for d in dts:
                    if (d.typ == DecreeToken.ItemType.DATE):
                        dr._add_date(d)
                        t = d.end_token
                    elif (d.typ == DecreeToken.ItemType.NUMBER):
                        dr._add_number(d)
                        t = d.end_token
                    else:
                        break
                ad = t.kit.get_analyzer_data_by_analyzer_name(InstrumentAnalyzer.ANALYZER_NAME)
                ip.ground = ad.register_referent(dr)
                rt = ReferentToken(Utils.asObjectOrNull(ip.ground, Referent), t0, t)
                t.kit.embed_token(rt)
                return rt
            ip.ground = "ДОВЕРЕННОСТЬ"
            return t
        break
    return None
def try_parse(t : 'Token', attrs : 'BracketParseAttr'=BracketParseAttr.NO, max_tokens : int=100) -> 'BracketSequenceToken':
    """Try to restore a sequence framed by quotes or brackets.

    Supports nesting, a possibly missing closing element, and other
    irregularities.

    Args:
        t(Token): initial token
        attrs(BracketParseAttr): parsing options
        max_tokens(int): maximum tokens to scan (in case the closing quote
            was forgotten)
    Returns:
        BracketSequenceToken: the metatoken, or None
    """
    t0 = t
    cou = 0
    if (not BracketHelper.can_be_start_of_sequence(t0, False, False)):
        return None
    br_list = list()
    br_list.append(BracketHelper.Bracket(t0))
    cou = 0
    crlf = 0
    last = None
    lev = 1
    # asymmetric bracket pair (e.g. '(' … ')') other than '«'
    is_assim = br_list[0].char0_ != '«' and BracketHelper.M_ASSYMOPEN_CHARS.find(br_list[0].char0_) >= 0
    gen_case = False
    # Phase 1: walk the token chain collecting bracket/quote characters.
    t = t0.next0_
    first_pass3057 = True
    while True:
        if first_pass3057: first_pass3057 = False
        else: t = t.next0_
        if (not (t is not None)): break
        if (t.is_table_control_char):
            break
        last = t
        if (t.is_char_of(BracketHelper.M_OPEN_CHARS) or t.is_char_of(BracketHelper.M_CLOSE_CHARS)):
            if (t.is_newline_before and (((attrs) & (BracketParseAttr.CANBEMANYLINES))) == (BracketParseAttr.NO)):
                if (t.whitespaces_before_count > 10 or BracketHelper.can_be_start_of_sequence(t, False, False)):
                    if (t.is_char('(') and not t0.is_char('(')):
                        pass
                    else:
                        last = t.previous
                        break
            bb = BracketHelper.Bracket(t)
            br_list.append(bb)
            if (len(br_list) > 20):
                break
            if ((len(br_list) == 3 and br_list[1].can_be_open and bb.can_be_close) and BracketHelper.__must_be_close_char(bb.char0_, br_list[1].char0_) and BracketHelper.__must_be_close_char(bb.char0_, br_list[0].char0_)):
                # a third bracket that closes both opens: check whether the outer
                # close still appears further on
                ok = False
                tt = t.next0_
                while tt is not None:
                    if (tt.is_newline_before):
                        break
                    if (tt.is_char(',')):
                        break
                    if (tt.is_char('.')):
                        tt = tt.next0_
                        while tt is not None:
                            if (tt.is_newline_before):
                                break
                            elif (tt.is_char_of(BracketHelper.M_OPEN_CHARS) or tt.is_char_of(BracketHelper.M_CLOSE_CHARS)):
                                bb2 = BracketHelper.Bracket(tt)
                                if (BracketHelper.can_be_end_of_sequence(tt, False, None, False) and BracketHelper.__can_be_close_char(bb2.char0_, br_list[0].char0_)):
                                    ok = True
                                    break
                            tt = tt.next0_
                        break
                    # NOTE(review): this tests `t` (the bracket already seen), not the
                    # loop variable `tt` — looks suspicious; verify against the C# original.
                    if (t.is_char_of(BracketHelper.M_OPEN_CHARS) or t.is_char_of(BracketHelper.M_CLOSE_CHARS)):
                        ok = True
                        break
                    tt = tt.next0_
                if (not ok):
                    break
            if (is_assim):
                # track nesting level for (..(..)..) style pairs
                if (bb.can_be_open and not bb.can_be_close and bb.char0_ == br_list[0].char0_):
                    lev += 1
                elif (bb.can_be_close and not bb.can_be_open and BracketHelper.M_OPEN_CHARS.find(br_list[0].char0_) == BracketHelper.M_CLOSE_CHARS.find(bb.char0_)):
                    lev -= 1
                    if (lev == 0):
                        break
        else:
            cou += 1
            if (cou > max_tokens):
                break
            if ((((attrs) & (BracketParseAttr.CANCONTAINSVERBS))) == (BracketParseAttr.NO)):
                # a finite verb usually means we ran out of the quoted name
                if (t.morph.language.is_cyrillic):
                    if (t.get_morph_class_in_dictionary() == MorphClass.VERB):
                        if (not t.morph.class0_.is_adjective and not t.morph.contains_attr("страд.з.", None)):
                            if (t.chars.is_all_lower):
                                norm = t.get_normal_case_text(None, MorphNumber.UNDEFINED, MorphGender.UNDEFINED, False)
                                if (not LanguageHelper.ends_with(norm, "СЯ")):
                                    if (len(br_list) > 1):
                                        break
                                    if (br_list[0].char0_ != '('):
                                        break
                elif (t.morph.language.is_en):
                    if (t.morph.class0_ == MorphClass.VERB and t.chars.is_all_lower):
                        break
                r = t.get_referent()
                if (r is not None and r.type_name == "ADDRESS"):
                    if (not t0.is_char('(')):
                        break
            if ((((attrs) & (BracketParseAttr.CANBEMANYLINES))) != (BracketParseAttr.NO)):
                if (t.is_newline_before):
                    if (t.newlines_before_count > 1):
                        break
                    crlf += 1
                continue
            if (t.is_newline_before):
                if (t.whitespaces_before_count > 15):
                    last = t.previous
                    break
                crlf += 1
                if (not t.chars.is_all_lower):
                    if (MiscHelper.can_be_start_of_sentence(t)):
                        # a new sentence inside the sequence — accept only if a
                        # close bracket still follows on the same line
                        has = False
                        tt = t.next0_
                        while tt is not None:
                            if (tt.is_newline_before):
                                break
                            elif (tt.length_char == 1 and tt.is_char_of(BracketHelper.M_OPEN_CHARS) and tt.is_whitespace_before):
                                break
                            elif (tt.length_char == 1 and tt.is_char_of(BracketHelper.M_CLOSE_CHARS) and not tt.is_whitespace_before):
                                has = True
                                break
                            tt = tt.next0_
                        if (not has):
                            last = t.previous
                            break
                if ((isinstance(t.previous, MetaToken)) and BracketHelper.can_be_end_of_sequence(t.previous.end_token, False, None, False)):
                    last = t.previous
                    break
                if (crlf > 1):
                    if (len(br_list) > 1):
                        break
                    if (crlf > 10):
                        break
            if (t.is_char(';') and t.is_newline_after):
                break
            npt = NounPhraseHelper.try_parse(t, NounPhraseParseAttr.NO, 0, None)
            if (npt is not None):
                if (t.is_newline_before):
                    gen_case = npt.morph.case_.is_genitive
                t = npt.end_token
                last = t
    # Phase 2: build the result from the collected brackets.
    if ((len(br_list) == 1 and br_list[0].can_be_open and (isinstance(last, MetaToken))) and last.is_newline_after):
        if (BracketHelper.can_be_end_of_sequence(last.end_token, False, None, False)):
            return BracketSequenceToken(t0, last)
    if ((len(br_list) == 1 and br_list[0].can_be_open and gen_case) and last.is_newline_after and crlf <= 2):
        return BracketSequenceToken(t0, last)
    if (len(br_list) < 1):
        return None
    i = 1
    while i < (len(br_list) - 1):
        if (br_list[i].char0_ == '<' and br_list[i + 1].char0_ == '>'):
            br_list[i].can_be_open = True
            br_list[i + 1].can_be_close = True
        i += 1
    internals = None
    # collapse trailing matched inner pairs
    while len(br_list) > 3:
        i = len(br_list) - 1
        if ((br_list[i].can_be_close and br_list[i - 1].can_be_open and not BracketHelper.__can_be_close_char(br_list[i].char0_, br_list[0].char0_)) and BracketHelper.__can_be_close_char(br_list[i].char0_, br_list[i - 1].char0_)):
            del br_list[len(br_list) - 2:len(br_list) - 2+2]
            continue
        break
    # extract nested sequences as internals
    while len(br_list) >= 4:
        changed = False
        i = 1
        while i < (len(br_list) - 2):
            if ((br_list[i].can_be_open and not br_list[i].can_be_close and br_list[i + 1].can_be_close) and not br_list[i + 1].can_be_open):
                ok = False
                if (BracketHelper.__must_be_close_char(br_list[i + 1].char0_, br_list[i].char0_) or br_list[i].char0_ != br_list[0].char0_):
                    ok = True
                if ((i == 1 and ((i + 2) < len(br_list)) and br_list[i + 2].char0_ == ')') and br_list[i + 1].char0_ != ')' and BracketHelper.__can_be_close_char(br_list[i + 1].char0_, br_list[i - 1].char0_)):
                    br_list[i + 2] = br_list[i + 1]
                elif (i > 1 and ((i + 2) < len(br_list)) and BracketHelper.__must_be_close_char(br_list[i + 2].char0_, br_list[i - 1].char0_)):
                    ok = True
                if (ok):
                    if (internals is None):
                        internals = list()
                    internals.append(BracketSequenceToken(br_list[i].source, br_list[i + 1].source))
                    del br_list[i:i+2]
                    changed = True
                    break
            i += 1
        if (not changed):
            break
    res = None
    if ((len(br_list) >= 4 and br_list[1].can_be_open and br_list[2].can_be_close) and br_list[3].can_be_close and not br_list[3].can_be_open):
        if (BracketHelper.__can_be_close_char(br_list[3].char0_, br_list[0].char0_)):
            res = BracketSequenceToken(br_list[0].source, br_list[3].source)
            if (br_list[0].source.next0_ != br_list[1].source or br_list[2].source.next0_ != br_list[3].source):
                res.internal.append(BracketSequenceToken(br_list[1].source, br_list[2].source))
                if (internals is not None):
                    res.internal.extend(internals)
    if ((res is None and len(br_list) >= 3 and br_list[2].can_be_close) and not br_list[2].can_be_open):
        if ((((attrs) & (BracketParseAttr.NEARCLOSEBRACKET))) != (BracketParseAttr.NO)):
            if (BracketHelper.__can_be_close_char(br_list[1].char0_, br_list[0].char0_)):
                return BracketSequenceToken(br_list[0].source, br_list[1].source)
        ok = True
        if (BracketHelper.__can_be_close_char(br_list[2].char0_, br_list[0].char0_) and BracketHelper.__can_be_close_char(br_list[1].char0_, br_list[0].char0_) and br_list[1].can_be_close):
            # decide whether bracket #1 or #2 really closes the sequence
            t = br_list[1].source
            while t != br_list[2].source and t is not None:
                if (t.is_newline_before):
                    ok = False
                    break
                if (t.chars.is_letter and t.chars.is_all_lower):
                    ok = False
                    break
                npt = NounPhraseHelper.try_parse(t, NounPhraseParseAttr.NO, 0, None)
                if (npt is not None):
                    t = npt.end_token
                t = t.next0_
            if (ok):
                t = br_list[0].source.next0_
                while t != br_list[1].source and t is not None:
                    if (t.is_newline_before):
                        return BracketSequenceToken(br_list[0].source, t.previous)
                    t = t.next0_
                # look backwards for an unbalanced close before the opening bracket
                lev1 = 0
                tt = br_list[0].source.previous
                first_pass3058 = True
                while True:
                    if first_pass3058: first_pass3058 = False
                    else: tt = tt.previous
                    if (not (tt is not None)): break
                    if (tt.is_newline_after or tt.is_table_control_char):
                        break
                    if (not (isinstance(tt, TextToken))):
                        continue
                    if (tt.chars.is_letter or tt.length_char > 1):
                        continue
                    ch = tt.term[0]
                    if (BracketHelper.__can_be_close_char(ch, br_list[0].char0_)):
                        lev1 += 1
                    elif (BracketHelper.__can_be_close_char(br_list[1].char0_, ch)):
                        lev1 -= 1
                        if (lev1 < 0):
                            return BracketSequenceToken(br_list[0].source, br_list[1].source)
        if (ok and BracketHelper.__can_be_close_char(br_list[2].char0_, br_list[0].char0_)):
            intern = BracketSequenceToken(br_list[1].source, br_list[2].source)
            res = BracketSequenceToken(br_list[0].source, br_list[2].source)
            res.internal.append(intern)
        elif (ok and BracketHelper.__can_be_close_char(br_list[2].char0_, br_list[1].char0_) and br_list[0].can_be_open):
            if (BracketHelper.__can_be_close_char(br_list[2].char0_, br_list[0].char0_)):
                intern = BracketSequenceToken(br_list[1].source, br_list[2].source)
                res = BracketSequenceToken(br_list[0].source, br_list[2].source)
                res.internal.append(intern)
        elif (len(br_list) == 3):
            return None
    if (res is None and len(br_list) > 1 and br_list[1].can_be_close):
        res = BracketSequenceToken(br_list[0].source, br_list[1].source)
    if (res is None and len(br_list) > 1 and BracketHelper.__can_be_close_char(br_list[1].char0_, br_list[0].char0_)):
        res = BracketSequenceToken(br_list[0].source, br_list[1].source)
    if (res is None and len(br_list) == 2 and br_list[0].char0_ == br_list[1].char0_):
        res = BracketSequenceToken(br_list[0].source, br_list[1].source)
    if (res is not None and internals is not None):
        for i in internals:
            if (i.begin_char < res.end_char):
                res.internal.append(i)
    if (res is None):
        # last resort: a metatoken ending with a suitable close char
        cou = 0
        tt = t0.next0_
        first_pass3059 = True
        while True:
            if first_pass3059: first_pass3059 = False
            else: tt = tt.next0_; cou += 1
            if (not (tt is not None)): break
            if (tt.is_table_control_char):
                break
            if (MiscHelper.can_be_start_of_sentence(tt)):
                break
            if (max_tokens > 0 and cou > max_tokens):
                break
            mt = Utils.asObjectOrNull(tt, MetaToken)
            if (mt is None):
                continue
            if (isinstance(mt.end_token, TextToken)):
                if (mt.end_token.is_char_of(BracketHelper.M_CLOSE_CHARS)):
                    bb = BracketHelper.Bracket(Utils.asObjectOrNull(mt.end_token, TextToken))
                    if (bb.can_be_close and BracketHelper.__can_be_close_char(bb.char0_, br_list[0].char0_)):
                        return BracketSequenceToken(t0, tt)
    return res
def try_attach(t: 'Token') -> 'TitleItemToken':
    """Try to recognize a title element starting at token *t*.

    Recognized elements include: an explicit theme marker ("ТЕМА"/"ПО ТЕМЕ"),
    a translation mark ("ПЕРЕВОД С ..."), a section header ("СЕКЦИЯ"/"SECTION"),
    a speciality reference ("СПЕЦИАЛЬНОСТЬ ..."), and document-type terms from
    the M_TERMINS dictionary (with special handling of dissertations/reports).

    Returns a TitleItemToken spanning the matched fragment, or None.

    NOTE(review): this body was re-indented from a flattened source; the token
    stream is unchanged, but nesting was reconstructed — verify against the
    upstream generated file if exact behavior matters.
    """
    tt = Utils.asObjectOrNull(t, TextToken)
    if (tt is not None):
        t1 = tt
        # "ТЕМА" ("theme"): may be followed by a typed title item and/or ':'
        if (tt.term == "ТЕМА"):
            tit = TitleItemToken.try_attach(tt.next0_)
            if (tit is not None and tit.typ == TitleItemToken.Types.TYP):
                t1 = tit.end_token
                if (t1.next0_ is not None and t1.next0_.is_char(':')):
                    t1 = t1.next0_
                # combined "type + theme" result carries the type's value
                return TitleItemToken._new2655(t, t1, TitleItemToken.Types.TYPANDTHEME, tit.value)
            if (tt.next0_ is not None and tt.next0_.is_char(':')):
                t1 = tt.next0_
            return TitleItemToken(tt, t1, TitleItemToken.Types.THEME)
        # prepositional theme: "ПО ТЕМЕ" / "НА ТЕМУ"
        if (tt.term == "ПО" or tt.term == "НА"):
            if (tt.next0_ is not None and tt.next0_.is_value("ТЕМА", None)):
                t1 = tt.next0_
                if (t1.next0_ is not None and t1.next0_.is_char(':')):
                    t1 = t1.next0_
                return TitleItemToken(tt, t1, TitleItemToken.Types.THEME)
        # translation mark: "ПЕРЕВОД С ..." / abbreviated "ПЕР. С ..."
        if (tt.term == "ПЕРЕВОД" or tt.term == "ПЕР"):
            tt2 = tt.next0_
            if (tt2 is not None and tt2.is_char('.')):
                tt2 = tt2.next0_
            if (isinstance(tt2, TextToken)):
                # both Latin 'C' and Cyrillic 'С' accepted as the preposition
                if (tt2.term == "C" or tt2.term == "С"):
                    tt2 = tt2.next0_
                    if (isinstance(tt2, TextToken)):
                        return TitleItemToken(t, tt2, TitleItemToken.Types.TRANSLATE)
        # section header; the remainder of the line is consumed as DUST
        if (tt.term == "СЕКЦИЯ" or tt.term == "SECTION" or tt.term == "СЕКЦІЯ"):
            t1 = tt.next0_
            if (t1 is not None and t1.is_char(':')):
                t1 = t1.next0_
            br = BracketHelper.try_parse(t1, BracketParseAttr.NO, 100)
            if (br is not None):
                t1 = br.end_token
            elif (t1 != tt.next0_):
                # advance to end of line
                while t1 is not None:
                    if (t1.is_newline_after):
                        break
                    t1 = t1.next0_
            if (t1 is None):
                return None
            if (t1 != tt.next0_):
                return TitleItemToken(tt, t1, TitleItemToken.Types.DUST)
        # speciality reference: "СПЕЦИАЛЬНОСТЬ ...", "ПО СПЕЦИАЛЬНОСТИ ..." or
        # a newline-starting '/' (presumably a speciality-code line — TODO confirm)
        t1 = (None)
        if (tt.is_value("СПЕЦИАЛЬНОСТЬ", "СПЕЦІАЛЬНІСТЬ")):
            t1 = tt.next0_
        elif (tt.morph.class0_.is_preposition and tt.next0_ is not None and tt.next0_.is_value("СПЕЦИАЛЬНОСТЬ", "СПЕЦІАЛЬНІСТЬ")):
            t1 = tt.next0_.next0_
        elif (tt.is_char('/') and tt.is_newline_before):
            t1 = tt.next0_
        if (t1 is not None):
            if (t1.is_char_of(":") or t1.is_hiphen):
                t1 = t1.next0_
            spec = TitleItemToken.__try_attach_speciality(t1, True)
            if (spec is not None):
                spec.begin_token = t
                return spec
    # a bare speciality code without the keyword
    sss = TitleItemToken.__try_attach_speciality(t, False)
    if (sss is not None):
        return sss
    if (isinstance(t, ReferentToken)):
        return None
    # noun phrase whose head is a known document-type term
    npt = NounPhraseHelper.try_parse(t, NounPhraseParseAttr.NO, 0, None)
    if (npt is not None):
        s = npt.get_normal_case_text(None, MorphNumber.UNDEFINED, MorphGender.UNDEFINED, False)
        tok = TitleItemToken.M_TERMINS.try_parse(npt.end_token, TerminParseAttr.NO)
        if (tok is not None):
            ty = Utils.valToEnum(tok.termin.tag, TitleItemToken.Types)
            if (ty == TitleItemToken.Types.TYP):
                tit = TitleItemToken.try_attach(tok.end_token.next0_)
                if (tit is not None and tit.typ == TitleItemToken.Types.THEME):
                    return TitleItemToken._new2655(npt.begin_token, tit.end_token, TitleItemToken.Types.TYPANDTHEME, s)
            # too generic to stand alone as a type
            if (s == "РАБОТА" or s == "РОБОТА" or s == "ПРОЕКТ"):
                return None
            t1 = tok.end_token
            # dissertation: scan forward to refine into doctoral / candidate /
            # master variants, tolerating up to 3 unrelated tokens
            if (s == "ДИССЕРТАЦИЯ" or s == "ДИСЕРТАЦІЯ"):
                err = 0
                ttt = t1.next0_
                first_pass3394 = True
                while True:
                    if first_pass3394: first_pass3394 = False
                    else: ttt = ttt.next0_
                    if (not (ttt is not None)): break
                    if (ttt.morph.class0_.is_preposition):
                        continue
                    if (ttt.is_value("СОИСКАНИЕ", "")):
                        continue
                    npt1 = NounPhraseHelper.try_parse(ttt, NounPhraseParseAttr.NO, 0, None)
                    if (npt1 is not None and npt1.noun.is_value("СТЕПЕНЬ", "СТУПІНЬ")):
                        ttt = npt1.end_token
                        t1 = ttt
                        continue
                    # try a PERSONPROPERTY-like referent ("доктор наук", ...)
                    rt = t1.kit.process_referent("PERSON", ttt)
                    if (rt is not None and (isinstance(rt.referent, PersonPropertyReferent))):
                        ppr = Utils.asObjectOrNull(rt.referent, PersonPropertyReferent)
                        if (ppr.name == "доктор наук"):
                            t1 = rt.end_token
                            s = "ДОКТОРСКАЯ ДИССЕРТАЦИЯ"
                            break
                        elif (ppr.name == "кандидат наук"):
                            t1 = rt.end_token
                            s = "КАНДИДАТСКАЯ ДИССЕРТАЦИЯ"
                            break
                        elif (ppr.name == "магистр"):
                            t1 = rt.end_token
                            s = "МАГИСТЕРСКАЯ ДИССЕРТАЦИЯ"
                            break
                    if (ttt.is_value("ДОКТОР", None) or ttt.is_value("КАНДИДАТ", None) or ttt.is_value("МАГИСТР", "МАГІСТР")):
                        t1 = ttt
                        npt1 = NounPhraseHelper.try_parse(ttt.next0_, NounPhraseParseAttr.NO, 0, None)
                        if (npt1 is not None and npt1.end_token.is_value("НАУК", None)):
                            t1 = npt1.end_token
                        s = ("МАГИСТЕРСКАЯ ДИССЕРТАЦИЯ" if ttt.is_value("МАГИСТР", "МАГІСТР") else ("ДОКТОРСКАЯ ДИССЕРТАЦИЯ" if ttt.is_value("ДОКТОР", None) else "КАНДИДАТСКАЯ ДИССЕРТАЦИЯ"))
                        break
                    err += 1
                    if (err > 3):
                        break
            # swallow a trailing abbreviation dot
            if (t1.next0_ is not None and t1.next0_.is_char('.')):
                t1 = t1.next0_
            # report subject: "ОТЧЕТ О <prepositional noun phrase>"
            if (s.endswith("ОТЧЕТ") and t1.next0_ is not None and t1.next0_.is_value("О", None)):
                npt1 = NounPhraseHelper.try_parse(t1.next0_, NounPhraseParseAttr.PARSEPREPOSITION, 0, None)
                if (npt1 is not None and npt1.morph.case_.is_prepositional):
                    t1 = npt1.end_token
            return TitleItemToken._new2655(npt.begin_token, t1, ty, s)
    # bare dictionary term at *t*
    tok1 = TitleItemToken.M_TERMINS.try_parse(t, TerminParseAttr.NO)
    if (tok1 is not None):
        t1 = tok1.end_token
        re = TitleItemToken(tok1.begin_token, t1, Utils.valToEnum(tok1.termin.tag, TitleItemToken.Types))
        return re
    # dictionary term wrapped in brackets/quotes
    if (BracketHelper.can_be_start_of_sequence(t, False, False)):
        tok1 = TitleItemToken.M_TERMINS.try_parse(t.next0_, TerminParseAttr.NO)
        if (tok1 is not None and BracketHelper.can_be_end_of_sequence(tok1.end_token.next0_, False, None, False)):
            t1 = tok1.end_token.next0_
            return TitleItemToken(tok1.begin_token, t1, Utils.valToEnum(tok1.termin.tag, TitleItemToken.Types))
    return None
def try_attach_territory(li: typing.List['TerrItemToken'], ad: 'AnalyzerData', attach_always: bool = False, cits: typing.List['CityItemToken'] = None, exists: typing.List['GeoReferent'] = None) -> 'ReferentToken':
    """Try to assemble a territory GeoReferent from a parsed item list *li*.

    The list is scanned into three roles: a known ontology object (ex_obj),
    a territory noun like "район"/"область" (noun), and a proper-name part
    (new_name); adjectives accumulate in adj_list.  Depending on which roles
    are present, a GeoReferent is built (or an existing one reused/merged)
    and wrapped in a ReferentToken.

    Params: attach_always relaxes several plausibility checks; cits gives
    city items seen just before (context hint); exists lists already-known
    GeoReferents used to accept otherwise-doubtful names.
    Returns the ReferentToken, or None when the sequence is rejected.

    NOTE(review): re-indented from a flattened generated source; tokens are
    unchanged but nesting was reconstructed — verify against upstream.
    """
    if (li is None or len(li) == 0):
        return None
    ex_obj = None
    new_name = None
    adj_list = list()
    noun = None
    add_noun = None
    # special case: Moscow administrative okrugs
    rt = TerrAttachHelper.__try_attach_moscowao(li, ad)
    if (rt is not None):
        return rt
    if (li[0].termin_item is not None and li[0].termin_item.canonic_text == "ТЕРРИТОРИЯ"):
        res2 = TerrAttachHelper.__try_attach_pure_terr(li, ad)
        return res2
    # special case: railway ("РЖД") direction pairs, in either order
    if (len(li) == 2):
        if (li[0].rzd is not None and li[1].rzd_dir is not None):
            rzd = GeoReferent()
            rzd._add_name(li[1].rzd_dir)
            rzd._add_typ_ter(li[0].kit.base_language)
            rzd.add_slot(GeoReferent.ATTR_REF, li[0].rzd.referent, False, 0)
            rzd.add_ext_referent(li[0].rzd)
            return ReferentToken(rzd, li[0].begin_token, li[1].end_token)
        if (li[1].rzd is not None and li[0].rzd_dir is not None):
            rzd = GeoReferent()
            rzd._add_name(li[0].rzd_dir)
            rzd._add_typ_ter(li[0].kit.base_language)
            rzd.add_slot(GeoReferent.ATTR_REF, li[1].rzd.referent, False, 0)
            rzd.add_ext_referent(li[1].rzd)
            return ReferentToken(rzd, li[0].begin_token, li[1].end_token)
    can_be_city_before = False
    adj_terr_before = False
    if (cits is not None):
        if (cits[0].typ == CityItemToken.ItemType.CITY):
            can_be_city_before = True
        elif (cits[0].typ == CityItemToken.ItemType.NOUN and len(cits) > 1):
            can_be_city_before = True
    # scan items, assigning roles; k ends at the first item that breaks the pattern
    k = 0
    k = 0
    while k < len(li):
        if (li[k].onto_item is not None):
            if (ex_obj is not None or new_name is not None):
                break
            if (noun is not None):
                if (k == 1):
                    if (noun.termin_item.canonic_text == "РАЙОН" or noun.termin_item.canonic_text == "ОБЛАСТЬ" or noun.termin_item.canonic_text == "СОЮЗ"):
                        if (isinstance(li[k].onto_item.referent, GeoReferent)):
                            # a state after "район/область/союз" is implausible
                            if (li[k].onto_item.referent.is_state):
                                break
                        ok = False
                        tt = li[k].end_token.next0_
                        if (tt is None):
                            ok = True
                        elif (tt.is_char_of(",.")):
                            ok = True
                        if (not ok):
                            ok = MiscLocationHelper.check_geo_object_before(li[0].begin_token)
                        if (not ok):
                            adr = AddressItemToken.try_parse(tt, None, False, False, None)
                            if (adr is not None):
                                if (adr.typ == AddressItemToken.ItemType.STREET):
                                    ok = True
                        if (not ok):
                            break
                if (li[k].onto_item is not None):
                    # "МО"/"ЛО" acronym nouns are too ambiguous with onto objects
                    if (noun.begin_token.is_value("МО", None) or noun.begin_token.is_value("ЛО", None)):
                        return None
            ex_obj = li[k]
        elif (li[k].termin_item is not None):
            if (noun is not None):
                break
            if (li[k].termin_item.is_always_prefix and k > 0):
                break
            if (k > 0 and li[k].is_doubt):
                # "ЗАО" is more likely a company acronym than a territory
                if (li[k].begin_token == li[k].end_token and li[k].begin_token.is_value("ЗАО", None)):
                    break
            if (li[k].termin_item.is_adjective or li[k].is_geo_in_dictionary):
                adj_list.append(li[k])
            else:
                if (ex_obj is not None):
                    geo_ = Utils.asObjectOrNull(ex_obj.onto_item.referent, GeoReferent)
                    if (geo_ is None):
                        break
                    if (ex_obj.is_adjective and ((li[k].termin_item.canonic_text == "СОЮЗ" or li[k].termin_item.canonic_text == "ФЕДЕРАЦИЯ"))):
                        str0_ = str(ex_obj.onto_item)
                        if (not li[k].termin_item.canonic_text in str0_):
                            return None
                    if (li[k].termin_item.canonic_text == "РАЙОН" or li[k].termin_item.canonic_text == "ОКРУГ" or li[k].termin_item.canonic_text == "КРАЙ"):
                        # collect the existing object's type slots and require
                        # the new noun to be compatible with them
                        tmp = io.StringIO()
                        for s in geo_.slots:
                            if (s.type_name == GeoReferent.ATTR_TYPE):
                                print("{0};".format(s.value), end="", file=tmp, flush=True)
                        if (not li[k].termin_item.canonic_text in Utils.toStringStringIO(tmp).upper()):
                            if (k != 1 or new_name is not None):
                                break
                            # re-interpret the onto item as an adjective name
                            new_name = li[0]
                            new_name.is_adjective = True
                            new_name.onto_item = (None)
                            ex_obj = (None)
                noun = li[k]
                if (k == 0):
                    tt = TerrItemToken.try_parse(li[k].begin_token.previous, None, True, False, None)
                    if (tt is not None and tt.morph.class0_.is_adjective):
                        adj_terr_before = True
        else:
            if (ex_obj is not None):
                break
            if (new_name is not None):
                break
            new_name = li[k]
        k += 1
    name = None
    alt_name = None
    full_name = None
    morph_ = None
    if (ex_obj is not None):
        # known ontology object: validate context before accepting
        if (ex_obj.is_adjective and not ex_obj.morph.language.is_en and noun is None):
            if (attach_always and ex_obj.end_token.next0_ is not None):
                npt = NounPhraseHelper.try_parse(ex_obj.begin_token, NounPhraseParseAttr.NO, 0, None)
                if (ex_obj.end_token.next0_.is_comma_and):
                    pass
                elif (npt is None):
                    pass
                else:
                    str0_ = StreetItemToken.try_parse(ex_obj.end_token.next0_, None, False, None, False)
                    if (str0_ is not None):
                        # adjective actually modifies a street noun — reject
                        if (str0_.typ == StreetItemType.NOUN and str0_.end_token == npt.end_token):
                            return None
            else:
                cit = CityItemToken.try_parse(ex_obj.end_token.next0_, None, False, None)
                if (cit is not None and ((cit.typ == CityItemToken.ItemType.NOUN or cit.typ == CityItemToken.ItemType.CITY))):
                    npt = NounPhraseHelper.try_parse(ex_obj.begin_token, NounPhraseParseAttr.NO, 0, None)
                    if (npt is not None and npt.end_token == cit.end_token):
                        pass
                    else:
                        return None
                elif (ex_obj.begin_token.is_value("ПОДНЕБЕСНЫЙ", None)):
                    pass
                else:
                    return None
        if (noun is None and ex_obj.can_be_city):
            cit0 = CityItemToken.try_parse_back(ex_obj.begin_token.previous)
            if (cit0 is not None and cit0.typ != CityItemToken.ItemType.PROPERNAME):
                return None
        if (ex_obj.is_doubt and noun is None):
            # doubtful bare name: require supporting geo context around it
            ok2 = False
            if (TerrAttachHelper.__can_be_geo_after(ex_obj.end_token.next0_)):
                ok2 = True
            elif (not ex_obj.can_be_surname and not ex_obj.can_be_city):
                if ((ex_obj.end_token.next0_ is not None and ex_obj.end_token.next0_.is_char(')') and ex_obj.begin_token.previous is not None) and ex_obj.begin_token.previous.is_char('(')):
                    ok2 = True
            elif (ex_obj.chars.is_latin_letter and ex_obj.begin_token.previous is not None):
                if (ex_obj.begin_token.previous.is_value("IN", None)):
                    ok2 = True
                elif (ex_obj.begin_token.previous.is_value("THE", None) and ex_obj.begin_token.previous.previous is not None and ex_obj.begin_token.previous.previous.is_value("IN", None)):
                    ok2 = True
            if (not ok2):
                cit0 = CityItemToken.try_parse_back(ex_obj.begin_token.previous)
                if (cit0 is not None and cit0.typ != CityItemToken.ItemType.PROPERNAME):
                    pass
                elif (MiscLocationHelper.check_geo_object_before(ex_obj.begin_token.previous)):
                    pass
                else:
                    return None
        name = ex_obj.onto_item.canonic_text
        morph_ = ex_obj.morph
    elif (new_name is not None):
        # a new (unknown) name requires an accompanying territory noun
        if (noun is None):
            return None
        j = 1
        while j < k:
            if (li[j].is_newline_before and not li[0].is_newline_before):
                if (BracketHelper.can_be_start_of_sequence(li[j].begin_token, False, False)):
                    pass
                else:
                    return None
            j += 1
        morph_ = noun.morph
        if (new_name.is_adjective):
            if (noun.termin_item.acronym == "АО"):
                if (noun.begin_token != noun.end_token):
                    return None
                # "АО" (autonomous okrug) takes a feminine adjective name
                if (new_name.morph.gender != MorphGender.FEMINIE):
                    return None
            geo_before = None
            tt0 = li[0].begin_token.previous
            if (tt0 is not None and tt0.is_comma_and):
                tt0 = tt0.previous
            if (not li[0].is_newline_before and tt0 is not None):
                geo_before = (Utils.asObjectOrNull(tt0.get_referent(), GeoReferent))
            if (Utils.indexOfList(li, noun, 0) < Utils.indexOfList(li, new_name, 0)):
                # noun precedes name ("район Одинцовский")
                if (noun.termin_item.is_state):
                    return None
                if (new_name.can_be_surname and geo_before is None):
                    # cases must agree, else it is likely a person's surname
                    if (((noun.morph.case_) & new_name.morph.case_).is_undefined):
                        return None
                if (MiscHelper.is_exists_in_dictionary(new_name.begin_token, new_name.end_token, (MorphClass.ADJECTIVE) | MorphClass.PRONOUN | MorphClass.VERB)):
                    if (noun.begin_token != new_name.begin_token):
                        if (geo_before is None):
                            if (len(li) == 2 and TerrAttachHelper.__can_be_geo_after(li[1].end_token.next0_)):
                                pass
                            elif (len(li) == 3 and li[2].termin_item is not None and TerrAttachHelper.__can_be_geo_after(li[2].end_token.next0_)):
                                pass
                            elif (new_name.is_geo_in_dictionary):
                                pass
                            elif (new_name.end_token.is_newline_after):
                                pass
                            else:
                                return None
                npt = NounPhraseHelper.try_parse(new_name.end_token, NounPhraseParseAttr.PARSEPRONOUNS, 0, None)
                if (npt is not None and npt.end_token != new_name.end_token):
                    if (len(li) >= 3 and li[2].termin_item is not None and npt.end_token == li[2].end_token):
                        add_noun = li[2]
                    else:
                        return None
                # reject if the name parses as a person
                rtp = new_name.kit.process_referent("PERSON", new_name.begin_token)
                if (rtp is not None):
                    return None
                name = ProperNameHelper.get_name_ex(new_name.begin_token, new_name.end_token, MorphClass.ADJECTIVE, MorphCase.UNDEFINED, noun.termin_item.gender, False, False)
            else:
                # name precedes noun ("Одинцовский район"): gather evidence
                ok = False
                if (((k + 1) < len(li)) and li[k].termin_item is None and li[k + 1].termin_item is not None):
                    ok = True
                elif ((k < len(li)) and li[k].onto_item is not None):
                    ok = True
                elif (k == len(li) and not new_name.is_adj_in_dictionary):
                    ok = True
                elif (MiscLocationHelper.check_geo_object_before(li[0].begin_token) or can_be_city_before):
                    ok = True
                elif (MiscLocationHelper.check_geo_object_after(li[k - 1].end_token, False)):
                    ok = True
                elif (len(li) == 3 and k == 2):
                    cit = CityItemToken.try_parse(li[2].begin_token, None, False, None)
                    if (cit is not None):
                        if (cit.typ == CityItemToken.ItemType.CITY or cit.typ == CityItemToken.ItemType.NOUN):
                            ok = True
                elif (len(li) == 2):
                    ok = TerrAttachHelper.__can_be_geo_after(li[len(li) - 1].end_token.next0_)
                if (not ok and not li[0].is_newline_before and not li[0].chars.is_all_lower):
                    rt00 = li[0].kit.process_referent("PERSONPROPERTY", li[0].begin_token.previous)
                    if (rt00 is not None):
                        ok = True
                if (noun.termin_item is not None and noun.termin_item.is_strong and new_name.is_adjective):
                    ok = True
                if (noun.is_doubt and len(adj_list) == 0 and geo_before is None):
                    return None
                name = ProperNameHelper.get_name_ex(new_name.begin_token, new_name.end_token, MorphClass.ADJECTIVE, MorphCase.UNDEFINED, noun.termin_item.gender, False, False)
                if (not ok and not attach_always):
                    # common adjective/pronoun/verb: only accept if a known
                    # referent with this name already exists
                    if (MiscHelper.is_exists_in_dictionary(new_name.begin_token, new_name.end_token, (MorphClass.ADJECTIVE) | MorphClass.PRONOUN | MorphClass.VERB)):
                        if (exists is not None):
                            for e0_ in exists:
                                if (e0_.find_slot(GeoReferent.ATTR_NAME, name, True) is not None):
                                    ok = True
                                    break
                        if (not ok):
                            return None
                full_name = "{0} {1}".format(ProperNameHelper.get_name_ex(li[0].begin_token, noun.begin_token.previous, MorphClass.ADJECTIVE, MorphCase.UNDEFINED, noun.termin_item.gender, False, False), noun.termin_item.canonic_text)
        else:
            # non-adjective proper name
            if (not attach_always or ((noun.termin_item is not None and noun.termin_item.canonic_text == "ФЕДЕРАЦИЯ"))):
                is_latin = noun.chars.is_latin_letter and new_name.chars.is_latin_letter
                if (Utils.indexOfList(li, noun, 0) > Utils.indexOfList(li, new_name, 0)):
                    if (not is_latin):
                        return None
                if (not new_name.is_district_name and not BracketHelper.can_be_start_of_sequence(new_name.begin_token, False, False)):
                    if (len(adj_list) == 0 and MiscHelper.is_exists_in_dictionary(new_name.begin_token, new_name.end_token, (MorphClass.NOUN) | MorphClass.PRONOUN)):
                        if (len(li) == 2 and noun.is_city_region and (noun.whitespaces_after_count < 2)):
                            pass
                        else:
                            return None
                    if (not is_latin):
                        if ((noun.termin_item.is_region and not attach_always and ((not adj_terr_before or new_name.is_doubt))) and not noun.is_city_region and not noun.termin_item.is_specific_prefix):
                            if (not MiscLocationHelper.check_geo_object_before(noun.begin_token)):
                                if (not noun.is_doubt and noun.begin_token != noun.end_token):
                                    pass
                                elif ((noun.termin_item.is_always_prefix and len(li) == 2 and li[0] == noun) and li[1] == new_name):
                                    pass
                                else:
                                    return None
                        if (noun.is_doubt and len(adj_list) == 0):
                            if (noun.termin_item.acronym == "МО" or noun.termin_item.acronym == "ЛО"):
                                if (k == (len(li) - 1) and li[k].termin_item is not None):
                                    add_noun = li[k]
                                    k += 1
                                elif (len(li) == 2 and noun == li[0] and str(new_name).endswith("совет")):
                                    pass
                                else:
                                    return None
                            else:
                                return None
                        # a personal name is a stronger reading — reject
                        pers = new_name.kit.process_referent("PERSON", new_name.begin_token)
                        if (pers is not None):
                            return None
            name = MiscHelper.get_text_value(new_name.begin_token, new_name.end_token, GetTextAttr.NO)
            if (new_name.begin_token != new_name.end_token):
                # trim a trailing repeated territory word from the name
                ttt = new_name.begin_token.next0_
                while ttt is not None and ttt.end_char <= new_name.end_char:
                    if (ttt.chars.is_letter):
                        ty = TerrItemToken.try_parse(ttt, None, False, False, None)
                        if ((ty is not None and ty.termin_item is not None and noun is not None) and ((noun.termin_item.canonic_text in ty.termin_item.canonic_text or ty.termin_item.canonic_text in noun.termin_item.canonic_text))):
                            name = MiscHelper.get_text_value(new_name.begin_token, ttt.previous, GetTextAttr.NO)
                            break
                    ttt = ttt.next0_
        if (len(adj_list) > 0):
            # adjectives + noun form an alternative normalized name
            npt = NounPhraseHelper.try_parse(adj_list[0].begin_token, NounPhraseParseAttr.NO, 0, None)
            if (npt is not None and npt.end_token == noun.end_token):
                alt_name = "{0} {1}".format(npt.get_normal_case_text(None, MorphNumber.UNDEFINED, MorphGender.UNDEFINED, False), name)
    else:
        # neither onto object nor new name: a lone noun may still bind to an
        # adjacent, already extracted GeoReferent
        if ((len(li) == 1 and noun is not None and noun.end_token.next0_ is not None) and (isinstance(noun.end_token.next0_.get_referent(), GeoReferent))):
            g = Utils.asObjectOrNull(noun.end_token.next0_.get_referent(), GeoReferent)
            if (noun.termin_item is not None):
                tyy = noun.termin_item.canonic_text.lower()
                ooo = False
                if (g.find_slot(GeoReferent.ATTR_TYPE, tyy, True) is not None):
                    ooo = True
                elif (tyy.endswith("район") and g.find_slot(GeoReferent.ATTR_TYPE, "район", True) is not None):
                    ooo = True
                if (ooo):
                    return ReferentToken._new734(g, noun.begin_token, noun.end_token.next0_, noun.begin_token.morph)
        if ((len(li) == 1 and noun == li[0] and li[0].termin_item is not None) and TerrItemToken.try_parse(li[0].end_token.next0_, None, True, False, None) is None and TerrItemToken.try_parse(li[0].begin_token.previous, None, True, False, None) is None):
            if (li[0].morph.number == MorphNumber.PLURAL):
                return None
            # search backwards (bounded) for a GeoReferent of the same type
            cou = 0
            str0_ = li[0].termin_item.canonic_text.lower()
            tt = li[0].begin_token.previous
            first_pass3158 = True
            while True:
                if first_pass3158: first_pass3158 = False
                else: tt = tt.previous
                if (not (tt is not None)): break
                if (tt.is_newline_after):
                    cou += 10
                else:
                    cou += 1
                if (cou > 500):
                    break
                g = Utils.asObjectOrNull(tt.get_referent(), GeoReferent)
                if (g is None):
                    continue
                # make sure no other territory item follows (bounded forward scan)
                ok = True
                cou = 0
                tt = li[0].end_token.next0_
                first_pass3159 = True
                while True:
                    if first_pass3159: first_pass3159 = False
                    else: tt = tt.next0_
                    if (not (tt is not None)): break
                    if (tt.is_newline_before):
                        cou += 10
                    else:
                        cou += 1
                    if (cou > 500):
                        break
                    tee = TerrItemToken.try_parse(tt, None, True, False, None)
                    if (tee is None):
                        continue
                    ok = False
                    break
                if (ok):
                    # climb up to 3 levels of the geo hierarchy for a type match
                    ii = 0
                    while g is not None and (ii < 3):
                        if (g.find_slot(GeoReferent.ATTR_TYPE, str0_, True) is not None):
                            return ReferentToken._new734(g, li[0].begin_token, li[0].end_token, noun.begin_token.morph)
                        g = g.higher
                        ii += 1
                break
        return None
    # build (or reuse) the resulting GeoReferent
    ter = None
    if (ex_obj is not None and (isinstance(ex_obj.tag, GeoReferent))):
        ter = (Utils.asObjectOrNull(ex_obj.tag, GeoReferent))
    else:
        ter = GeoReferent()
        if (ex_obj is not None):
            geo_ = Utils.asObjectOrNull(ex_obj.onto_item.referent, GeoReferent)
            if (geo_ is not None and not geo_.is_city):
                ter._merge_slots2(geo_, li[0].kit.base_language)
            else:
                ter._add_name(name)
            if (noun is None and ex_obj.can_be_city):
                ter._add_typ_city(li[0].kit.base_language)
            else:
                pass
        elif (new_name is not None):
            ter._add_name(name)
            if (alt_name is not None):
                ter._add_name(alt_name)
        if (noun is not None):
            if (noun.termin_item.canonic_text == "АО"):
                ter._add_typ(("АВТОНОМНИЙ ОКРУГ" if li[0].kit.base_language.is_ua else "АВТОНОМНЫЙ ОКРУГ"))
            elif (noun.termin_item.canonic_text == "МУНИЦИПАЛЬНОЕ СОБРАНИЕ" or noun.termin_item.canonic_text == "МУНІЦИПАЛЬНЕ ЗБОРИ"):
                ter._add_typ(("МУНІЦИПАЛЬНЕ УТВОРЕННЯ" if li[0].kit.base_language.is_ua else "МУНИЦИПАЛЬНОЕ ОБРАЗОВАНИЕ"))
            elif (noun.termin_item.acronym == "МО" and add_noun is not None):
                ter._add_typ(add_noun.termin_item.canonic_text)
            else:
                if (noun.termin_item.canonic_text == "СОЮЗ" and ex_obj is not None and ex_obj.end_char > noun.end_char):
                    return ReferentToken._new734(ter, ex_obj.begin_token, ex_obj.end_token, ex_obj.morph)
                ter._add_typ(noun.termin_item.canonic_text)
                if (noun.termin_item.is_region and ter.is_state):
                    ter._add_typ_reg(li[0].kit.base_language)
        if (ter.is_state and ter.is_region):
            for a in adj_list:
                if (a.termin_item.is_region):
                    ter._add_typ_reg(li[0].kit.base_language)
                    break
        if (ter.is_state):
            if (full_name is not None):
                ter._add_name(full_name)
    res = ReferentToken(ter, li[0].begin_token, li[k - 1].end_token)
    if (noun is not None and noun.morph.class0_.is_noun):
        res.morph = noun.morph
    else:
        # aggregate morphology from all consumed items; adjectives are
        # re-classed as nouns when a territory noun is present
        res.morph = MorphCollection()
        ii = 0
        while ii < k:
            for v in li[ii].morph.items:
                bi = MorphBaseInfo()
                bi.copy_from(v)
                if (noun is not None):
                    if (bi.class0_.is_adjective):
                        bi.class0_ = MorphClass.NOUN
                res.morph.add_item(bi)
            ii += 1
    if (li[0].termin_item is not None and li[0].termin_item.is_specific_prefix):
        res.begin_token = li[0].end_token.next0_
    if (add_noun is not None and add_noun.end_char > res.end_char):
        res.end_token = add_noun.end_token
    # "АР <republic>" — include the preceding "АР" abbreviation in the span
    if ((isinstance(res.begin_token.previous, TextToken)) and (res.whitespaces_before_count < 2)):
        tt = Utils.asObjectOrNull(res.begin_token.previous, TextToken)
        if (tt.term == "АР"):
            for ty in ter.typs:
                if ("республика" in ty or "республіка" in ty):
                    res.begin_token = tt
                    break
    return res
def __try_attach(t: 'Token', prev: typing.List['DateItemToken'], detail_regime: bool) -> 'DateItemToken': from pullenti.ner.measure.internal.MeasureToken import MeasureToken if (t is None): return None nt = Utils.asObjectOrNull(t, NumberToken) begin = t end = t is_in_brack = False if ((BracketHelper.can_be_start_of_sequence(t, False, False) and t.next0_ is not None and (isinstance(t.next0_, NumberToken))) and BracketHelper.can_be_end_of_sequence( t.next0_.next0_, False, None, False)): nt = (Utils.asObjectOrNull(t.next0_, NumberToken)) end = t.next0_.next0_ is_in_brack = True if ((t.is_newline_before and BracketHelper.is_bracket(t, False) and (isinstance(t.next0_, NumberToken))) and BracketHelper.is_bracket(t.next0_.next0_, False)): nt = (Utils.asObjectOrNull(t.next0_, NumberToken)) end = t.next0_.next0_ is_in_brack = True if (nt is not None): if (nt.int_value is None): return None if (nt.typ == NumberSpellingType.WORDS): if (nt.morph.class0_.is_noun and not nt.morph.class0_.is_adjective): if (t.next0_ is not None and ((t.next0_.is_value("КВАРТАЛ", None) or t.next0_.is_value("ПОЛУГОДИЕ", None) or t.next0_.is_value("ПІВРІЧЧЯ", None)))): pass else: return None if (NumberHelper.try_parse_age(nt) is not None): return None tt = None res = DateItemToken._new628(begin, end, DateItemToken.DateItemType.NUMBER, nt.int_value, nt.morph) if ((res.int_value == 20 and (isinstance(nt.next0_, NumberToken)) and nt.next0_.int_value is not None) and nt.next0_.length_char == 2 and prev is not None): num = 2000 + nt.next0_.int_value if ((num < 2030) and len(prev) > 0 and prev[len(prev) - 1].typ == DateItemToken.DateItemType.MONTH): ok = False if (nt.whitespaces_after_count == 1): ok = True elif (nt.is_newline_after and nt.is_newline_after): ok = True if (ok): nt = (Utils.asObjectOrNull(nt.next0_, NumberToken)) res.end_token = nt res.int_value = num if (res.int_value == 20 or res.int_value == 201): tt = t.next0_ if (tt is not None and tt.is_char('_')): while tt is not None: if (not 
tt.is_char('_')): break tt = tt.next0_ tt = DateItemToken.__test_year_rus_word(tt, False) if (tt is not None): res.int_value = 0 res.end_token = tt res.typ = DateItemToken.DateItemType.YEAR return res if (res.int_value <= 12 and t.next0_ is not None and (t.whitespaces_after_count < 3)): tt = t.next0_ if (tt.is_value("ЧАС", None)): if (((isinstance(t.previous, TextToken)) and not t.previous.chars.is_letter and not t.is_whitespace_before) and (isinstance(t.previous.previous, NumberToken)) and not t.previous.is_whitespace_before): pass else: res.typ = DateItemToken.DateItemType.HOUR res.end_token = tt tt = tt.next0_ if (tt is not None and tt.is_char('.')): res.end_token = tt tt = tt.next0_ first_pass3072 = True while True: if first_pass3072: first_pass3072 = False else: tt = tt.next0_ if (not (tt is not None)): break if (tt.is_value("УТРО", "РАНОК")): res.end_token = tt res.typ = DateItemToken.DateItemType.HOUR return res if (tt.is_value("ВЕЧЕР", "ВЕЧІР")): res.end_token = tt res.int_value += 12 res.typ = DateItemToken.DateItemType.HOUR return res if (tt.is_value("ДЕНЬ", None)): res.end_token = tt if (res.int_value < 10): res.int_value += 12 res.typ = DateItemToken.DateItemType.HOUR return res if (tt.is_value("НОЧЬ", "НІЧ")): res.end_token = tt if (res.int_value == 12): res.int_value = 0 elif (res.int_value > 9): res.int_value += 12 res.typ = DateItemToken.DateItemType.HOUR return res if (tt.is_comma or tt.morph.class0_.is_adverb): continue break if (res.typ == DateItemToken.DateItemType.HOUR): return res can_be_year_ = True if (prev is not None and len(prev) > 0 and prev[len(prev) - 1].typ == DateItemToken.DateItemType.MONTH): pass elif ((prev is not None and len(prev) >= 4 and prev[len(prev) - 1].typ == DateItemToken.DateItemType.DELIM) and prev[len(prev) - 2].can_by_month): pass elif (nt.next0_ is not None and ((nt.next0_.is_value("ГОД", None) or nt.next0_.is_value("РІК", None)))): if (res.int_value < 1000): can_be_year_ = False tt = 
DateItemToken.__test_year_rus_word(nt.next0_, False) if (tt is not None and DateItemToken.__is_new_age(tt.next0_)): res.typ = DateItemToken.DateItemType.YEAR res.end_token = tt elif (can_be_year_): if (res.can_be_year or res.typ == DateItemToken.DateItemType.NUMBER): tt = DateItemToken.__test_year_rus_word( nt.next0_, res.is_newline_before) if ((tt) is not None): if ((tt.is_value("Г", None) and not tt.is_whitespace_before and t.previous is not None) and ((t.previous.is_value("КОРПУС", None) or t.previous.is_value("КОРП", None)))): pass elif ( (((nt.next0_.is_value("Г", None) and (t.whitespaces_before_count < 3) and t.previous is not None) and t.previous.is_value("Я", None) and t.previous.previous is not None) and t.previous.previous.is_char_of("\\/") and t.previous.previous.previous is not None) and t.previous.previous.previous.is_value( "А", None)): return None elif (nt.next0_.length_char == 1 and not res.can_be_year and ((prev is None or ((len(prev) > 0 and prev[len(prev) - 1].typ != DateItemToken.DateItemType.DELIM))))): pass else: res.end_token = tt res.typ = DateItemToken.DateItemType.YEAR res.lang = tt.morph.language elif (tt is not None and (nt.whitespaces_after_count < 2) and (nt.end_char - nt.begin_char) == 1): res.end_token = tt res.typ = DateItemToken.DateItemType.YEAR res.lang = tt.morph.language if (nt.previous is not None): if (nt.previous.is_value("В", "У") or nt.previous.is_value("К", None) or nt.previous.is_value("ДО", None)): tt = DateItemToken.__test_year_rus_word(nt.next0_, False) if ((tt) is not None): ok = False if ((res.int_value < 100) and (isinstance(tt, TextToken)) and ((tt.term == "ГОДА" or tt.term == "РОКИ"))): pass else: ok = True if (nt.previous.is_value("ДО", None) and nt.next0_.is_value("Г", None)): cou = 0 ttt = nt.previous.previous while ttt is not None and (cou < 10): mt = MeasureToken.try_parse( ttt, None, False, False, False, False) if (mt is not None and mt.end_char > nt.end_char): ok = False break ttt = ttt.previous cou += 1 
if (ok): res.end_token = tt res.typ = DateItemToken.DateItemType.YEAR res.lang = tt.morph.language res.begin_token = nt.previous elif (((nt.previous.is_value("IN", None) or nt.previous.is_value("SINCE", None))) and res.can_be_year): uu = (NumbersWithUnitToken.try_parse( nt, None, False, False, False, False) if nt.previous.is_value("IN", None) else None) if (uu is not None and len(uu.units) > 0): pass else: res.typ = DateItemToken.DateItemType.YEAR res.begin_token = nt.previous elif (nt.previous.is_value("NEL", None) or nt.previous.is_value("DEL", None)): if (res.can_be_year): res.typ = DateItemToken.DateItemType.YEAR res.lang = MorphLang.IT res.begin_token = nt.previous elif (nt.previous.is_value("IL", None) and res.can_be_day): res.lang = MorphLang.IT res.begin_token = nt.previous t1 = res.end_token.next0_ if (t1 is not None): if (t1.is_value("ЧАС", "ГОДИНА") or t1.is_value("HOUR", None)): if ((((prev is not None and len(prev) == 2 and prev[0].can_be_hour) and prev[1].typ == DateItemToken.DateItemType.DELIM and not prev[1].is_whitespace_after) and not prev[1].is_whitespace_after and res.int_value >= 0) and (res.int_value < 59)): prev[0].typ = DateItemToken.DateItemType.HOUR res.typ = DateItemToken.DateItemType.MINUTE res.end_token = t1 elif (res.int_value < 24): if (t1.next0_ is not None and t1.next0_.is_char('.')): t1 = t1.next0_ res.typ = DateItemToken.DateItemType.HOUR res.end_token = t1 elif ((res.int_value < 60) and ((t1.is_value("МИНУТА", "ХВИЛИНА") or t1.is_value( "МИН", None) or t.is_value("MINUTE", None)))): if (t1.next0_ is not None and t1.next0_.is_char('.')): t1 = t1.next0_ res.typ = DateItemToken.DateItemType.MINUTE res.end_token = t1 elif ( (res.int_value < 60) and ((t1.is_value("СЕКУНДА", None) or t1.is_value("СЕК", None) or t1.is_value("SECOND", None)))): if (t1.next0_ is not None and t1.next0_.is_char('.')): t1 = t1.next0_ res.typ = DateItemToken.DateItemType.SECOND res.end_token = t1 elif ((res.int_value < 30) and ((t1.is_value("ВЕК", "ВІК") or 
t1.is_value("СТОЛЕТИЕ", "СТОЛІТТЯ")))): res.typ = DateItemToken.DateItemType.CENTURY res.end_token = t1 elif (res.int_value <= 4 and t1.is_value("КВАРТАЛ", None)): res.typ = DateItemToken.DateItemType.QUARTAL res.end_token = t1 elif (res.int_value <= 2 and ((t1.is_value("ПОЛУГОДИЕ", None) or t1.is_value("ПІВРІЧЧЯ", None)))): res.typ = DateItemToken.DateItemType.HALFYEAR res.end_token = t1 return res t0 = Utils.asObjectOrNull(t, TextToken) if (t0 is None): return None txt = t0.get_source_text() if ((txt[0] == 'I' or txt[0] == 'X' or txt[0] == 'Х') or txt[0] == 'V'): lat = NumberHelper.try_parse_roman(t) if (lat is not None and lat.end_token.next0_ is not None and lat.int_value is not None): val = lat.int_value tt = lat.end_token.next0_ if (tt.is_value("КВАРТАЛ", None) and val > 0 and val <= 4): return DateItemToken._new629( t, tt, DateItemToken.DateItemType.QUARTAL, val) if (tt.is_value("ПОЛУГОДИЕ", "ПІВРІЧЧЯ") and val > 0 and val <= 2): return DateItemToken._new629( t, lat.end_token.next0_, DateItemToken.DateItemType.HALFYEAR, val) if (tt.is_value("ВЕК", "ВІК") or tt.is_value("СТОЛЕТИЕ", "СТОЛІТТЯ")): return DateItemToken._new629( t, lat.end_token.next0_, DateItemToken.DateItemType.CENTURY, val) if (tt.is_value("В", None) and tt.next0_ is not None and tt.next0_.is_char('.')): if (prev is not None and len(prev) > 0 and prev[len(prev) - 1].typ == DateItemToken.DateItemType.POINTER): return DateItemToken._new629( t, tt.next0_, DateItemToken.DateItemType.CENTURY, val) if (DateItemToken.__is_new_age(tt.next0_.next0_)): return DateItemToken._new629( t, tt.next0_, DateItemToken.DateItemType.CENTURY, val) if (tt.is_hiphen): lat2 = NumberHelper.try_parse_roman(tt.next0_) if (lat2 is not None and lat2.int_value is not None and lat2.end_token.next0_ is not None): if (lat2.end_token.next0_.is_value("ВЕК", "ВІК") or lat2.end_token.next0_.is_value( "СТОЛЕТИЕ", "СТОЛІТТЯ")): ddd = DateItemToken.try_attach( tt.next0_, None, False) return DateItemToken._new634( t, lat.end_token, 
DateItemToken.DateItemType.CENTURY, val, ((ddd.new_age if ddd is not None else 0))) if (t is not None and t.is_value("НАПРИКІНЦІ", None)): return DateItemToken._new635(t, t, DateItemToken.DateItemType.POINTER, "конец") if (t is not None and t.is_value("ДОНЕДАВНА", None)): return DateItemToken._new635(t, t, DateItemToken.DateItemType.POINTER, "сегодня") if (prev is None): if (t is not None): if (t.is_value("ОКОЛО", "БІЛЯ") or t.is_value("ПРИМЕРНО", "ПРИБЛИЗНО") or t.is_value("ABOUT", None)): return DateItemToken._new635( t, t, DateItemToken.DateItemType.POINTER, "около") if (t.is_value("ОК", None) or t.is_value("OK", None)): if (t.next0_ is not None and t.next0_.is_char('.')): return DateItemToken._new635( t, t.next0_, DateItemToken.DateItemType.POINTER, "около") return DateItemToken._new635( t, t, DateItemToken.DateItemType.POINTER, "около") tok = DateItemToken.M_SEASONS.try_parse(t, TerminParseAttr.NO) if ((tok is not None and (Utils.valToEnum(tok.termin.tag, DatePointerType)) == DatePointerType.SUMMER and t.morph.language.is_ru) and (isinstance(t, TextToken))): str0_ = t.term if (str0_ != "ЛЕТОМ" and str0_ != "ЛЕТА" and str0_ != "ЛЕТО"): tok = (None) if (tok is not None): return DateItemToken._new629( t, tok.end_token, DateItemToken.DateItemType.POINTER, Utils.valToEnum(tok.termin.tag, DatePointerType)) npt = NounPhraseHelper.try_parse(t, NounPhraseParseAttr.NO, 0, None) if (npt is not None): tok = DateItemToken.M_SEASONS.try_parse(npt.end_token, TerminParseAttr.NO) if ((tok is not None and (Utils.valToEnum(tok.termin.tag, DatePointerType)) == DatePointerType.SUMMER and t.morph.language.is_ru) and (isinstance(t, TextToken))): str0_ = t.term if (str0_ != "ЛЕТОМ" and str0_ != "ЛЕТА" and str0_ != "ЛЕТО"): tok = (None) if (tok is not None): return DateItemToken._new629( t, tok.end_token, DateItemToken.DateItemType.POINTER, Utils.valToEnum(tok.termin.tag, DatePointerType)) typ_ = DateItemToken.DateItemType.NUMBER if (npt.noun.is_value("КВАРТАЛ", None)): typ_ = 
DateItemToken.DateItemType.QUARTAL elif (npt.end_token.is_value("ПОЛУГОДИЕ", None) or npt.end_token.is_value("ПІВРІЧЧЯ", None)): typ_ = DateItemToken.DateItemType.HALFYEAR elif (npt.end_token.is_value("НАЧАЛО", None) or npt.end_token.is_value("ПОЧАТОК", None)): return DateItemToken._new635( t, npt.end_token, DateItemToken.DateItemType.POINTER, "начало") elif (npt.end_token.is_value("СЕРЕДИНА", None)): return DateItemToken._new635( t, npt.end_token, DateItemToken.DateItemType.POINTER, "середина") elif (npt.end_token.is_value("КОНЕЦ", None) or npt.end_token.is_value("КІНЕЦЬ", None) or npt.end_token.is_value("НАПРИКІНЕЦЬ", None)): return DateItemToken._new635( t, npt.end_token, DateItemToken.DateItemType.POINTER, "конец") elif (npt.end_token.is_value("ВРЕМЯ", None) and len(npt.adjectives) > 0 and npt.end_token.previous.is_value("НАСТОЯЩЕЕ", None)): return DateItemToken._new635( t, npt.end_token, DateItemToken.DateItemType.POINTER, "сегодня") elif (npt.end_token.is_value("ЧАС", None) and len(npt.adjectives) > 0 and npt.end_token.previous.is_value("ДАНИЙ", None)): return DateItemToken._new635( t, npt.end_token, DateItemToken.DateItemType.POINTER, "сегодня") if (typ_ != DateItemToken.DateItemType.NUMBER or detail_regime): delta = 0 if (len(npt.adjectives) > 0): if (npt.adjectives[0].is_value("ПОСЛЕДНИЙ", "ОСТАННІЙ")): return DateItemToken._new629( t0, npt.end_token, typ_, (4 if typ_ == DateItemToken.DateItemType.QUARTAL else 2)) if (npt.adjectives[0].is_value("ПРЕДЫДУЩИЙ", "ПОПЕРЕДНІЙ") or npt.adjectives[0].is_value("ПРОШЛЫЙ", None)): delta = -1 elif (npt.adjectives[0].is_value("СЛЕДУЮЩИЙ", None) or npt.adjectives[0].is_value("ПОСЛЕДУЮЩИЙ", None) or npt.adjectives[0].is_value("НАСТУПНИЙ", None)): delta = 1 else: return None cou = 0 tt = t.previous first_pass3073 = True while True: if first_pass3073: first_pass3073 = False else: tt = tt.previous if (not (tt is not None)): break if (cou > 200): break dr = Utils.asObjectOrNull(tt.get_referent(), DateRangeReferent) if (dr is 
None): continue if (typ_ == DateItemToken.DateItemType.QUARTAL): ii = dr.quarter_number if (ii < 1): continue ii += delta if ((ii < 1) or ii > 4): continue return DateItemToken._new629(t0, npt.end_token, typ_, ii) if (typ_ == DateItemToken.DateItemType.HALFYEAR): ii = dr.halfyear_number if (ii < 1): continue ii += delta if ((ii < 1) or ii > 2): continue return DateItemToken._new629(t0, npt.end_token, typ_, ii) term = t0.term if (not str.isalnum(term[0])): if (t0.is_char_of(".\\/:") or t0.is_hiphen): return DateItemToken._new635(t0, t0, DateItemToken.DateItemType.DELIM, term) elif (t0.is_char(',')): return DateItemToken._new635(t0, t0, DateItemToken.DateItemType.DELIM, term) else: return None if (term == "O" or term == "О"): if ((isinstance(t.next0_, NumberToken)) and not t.is_whitespace_after and len(t.next0_.value) == 1): return DateItemToken._new629(t, t.next0_, DateItemToken.DateItemType.NUMBER, t.next0_.int_value) if (str.isalpha(term[0])): inf = DateItemToken.M_MONTHES.try_parse(t, TerminParseAttr.NO) if (inf is not None and inf.termin.tag is None): inf = DateItemToken.M_MONTHES.try_parse( inf.end_token.next0_, TerminParseAttr.NO) if (inf is not None and (isinstance(inf.termin.tag, int))): return DateItemToken._new653(inf.begin_token, inf.end_token, DateItemToken.DateItemType.MONTH, inf.termin.tag, inf.termin.lang) return None
def __try_noun_name(li: typing.List['CityItemToken'], oi: 'IntOntologyItem', always: bool) -> 'ReferentToken':
    """Try to build a city ReferentToken from an item sequence starting with a
    type noun (e.g. "город", "поселок") or MISC marker followed by a name.

    li: parsed city items; li[0] must be NOUN or MISC, li[1..] the name part.
        NOTE: li is mutated in place (trailing items deleted, MISC head removed).
    oi: out-parameter; oi.value receives the matched ontology item (or None).
    always: when True, attach even if the confidence checks below fail.
    Returns a ReferentToken wrapping a GeoReferent, or None on rejection.
    """
    oi.value = (None)
    # Need at least a type word plus a name, and the head must be NOUN/MISC.
    if (li is None or (len(li) < 2) or ((li[0].typ != CityItemToken.ItemType.NOUN and li[0].typ != CityItemToken.ItemType.MISC))):
        return None
    ok = not li[0].doubtful
    if (ok and li[0].typ == CityItemToken.ItemType.MISC):
        ok = False
    typ = (None if li[0].typ == CityItemToken.ItemType.MISC else li[0].value)
    typ2 = (None if li[0].typ == CityItemToken.ItemType.MISC else li[0].alt_value)
    prob_adj = None
    i1 = 1
    org0_ = None  # NOTE(review): assigned but never used below
    # Двойной тип: "поселок/село/деревня" + second NOUN ("станция" etc.) before the name.
    if ((typ is not None and li[i1].typ == CityItemToken.ItemType.NOUN and ((i1 + 1) < len(li))) and li[0].whitespaces_after_count <= 1 and (((LanguageHelper.ends_with(typ, "ПОСЕЛОК") or LanguageHelper.ends_with(typ, "СЕЛИЩЕ") or typ == "ДЕРЕВНЯ") or typ == "СЕЛО"))):
        if (li[i1].begin_token == li[i1].end_token):
            # A single-token second noun that is really an organization → reject.
            ooo = AddressItemToken.try_attach_org(li[i1].begin_token)
            if (ooo is not None and ooo.ref_token is not None):
                return None
        typ2 = li[i1].value
        # "ст." may be "станция" or abbreviated "старая/старый/старое" — prepare
        # a probable adjective variant agreed with the following name's morphology.
        if (typ2 == "СТАНЦИЯ" and li[i1].begin_token.is_value("СТ", None) and ((i1 + 1) < len(li))):
            m = li[i1 + 1].morph
            if (m.number == MorphNumber.PLURAL):
                prob_adj = "СТАРЫЕ"
            elif (m.gender == MorphGender.FEMINIE):
                prob_adj = "СТАРАЯ"
            elif (m.gender == MorphGender.MASCULINE):
                prob_adj = "СТАРЫЙ"
            else:
                prob_adj = "СТАРОЕ"
        i1 += 1
    name = Utils.ifNotNull(li[i1].value, ((None if li[i1].onto_item is None else li[i1].onto_item.canonic_text)))
    alt_name = li[i1].alt_value
    if (name is None):
        return None
    mc = li[0].morph
    if (i1 == 1 and li[i1].typ == CityItemToken.ItemType.CITY and ((li[0].value == "ГОРОД" or li[0].value == "МІСТО" or li[0].typ == CityItemToken.ItemType.MISC))):
        # Known city from the ontology after an explicit "город"/MISC marker.
        if (typ is None and ((i1 + 1) < len(li)) and li[i1 + 1].typ == CityItemToken.ItemType.NOUN):
            return None
        oi.value = li[i1].onto_item
        if (oi.value is not None):
            name = oi.value.canonic_text
            if (len(name) > 2 or oi.value.misc_attr is not None):
                if (not li[1].doubtful or ((oi.value is not None and oi.value.misc_attr is not None))):
                    ok = True
                elif (not ok and not li[1].is_newline_before):
                    # Contextual confirmations for a doubtful known city name.
                    if (li[0].geo_object_before or li[1].geo_object_after):
                        ok = True
                    elif (StreetDefineHelper.check_street_after(li[1].end_token.next0_)):
                        ok = True
                    elif (li[1].end_token.next0_ is not None and (isinstance(li[1].end_token.next0_.get_referent(), DateReferent))):
                        ok = True
                    elif ((li[1].whitespaces_before_count < 2) and li[1].onto_item is not None):
                        # NOTE(review): both branches assign True — kept as in source.
                        if (li[1].is_newline_after):
                            ok = True
                        else:
                            ok = True
        if (li[1].doubtful and li[1].end_token.next0_ is not None and li[1].end_token.chars == li[1].end_token.next0_.chars):
            ok = False
        if (li[0].begin_token.previous is not None and li[0].begin_token.previous.is_value("В", None)):
            ok = True
        if (not ok):
            ok = CityAttachHelper.check_year_after(li[1].end_token.next0_)
        if (not ok):
            ok = CityAttachHelper.check_city_after(li[1].end_token.next0_)
    elif ((li[i1].typ == CityItemToken.ItemType.PROPERNAME or li[i1].typ == CityItemToken.ItemType.CITY)):
        # Name is a proper name (not necessarily a known city).
        if (((li[0].value == "АДМИНИСТРАЦИЯ" or li[0].value == "АДМІНІСТРАЦІЯ")) and i1 == 1):
            return None
        if (li[i1].is_newline_before):
            if (len(li) != 2):
                return None
        if (not li[0].doubtful):
            ok = True
            if (len(name) < 2):
                ok = False
            elif ((len(name) < 3) and li[0].morph.number != MorphNumber.SINGULAR):
                ok = False
            if (li[i1].doubtful and not li[i1].geo_object_after and not li[0].geo_object_before):
                # A doubtful genitive name needs geo/house context around it.
                if (li[i1].morph.case_.is_genitive):
                    if (li[i1].end_token.next0_ is None or MiscLocationHelper.check_geo_object_after(li[i1].end_token.next0_, False) or AddressItemToken.check_house_after(li[i1].end_token.next0_, False, True)):
                        pass
                    elif (li[0].begin_token.previous is None or MiscLocationHelper.check_geo_object_before(li[0].begin_token)):
                        pass
                    else:
                        ok = False
            if (ok):
                # Reject if the sequence is actually a person property ("мэр города N").
                rt0 = li[i1].kit.process_referent("PERSONPROPERTY", li[0].begin_token.previous)
                if (rt0 is not None):
                    rt1 = li[i1].kit.process_referent("PERSON", li[i1].begin_token)
                    if (rt1 is not None):
                        ok = False
                npt = NounPhraseHelper.try_parse(li[i1].begin_token, NounPhraseParseAttr.NO, 0, None)
                if (npt is not None):
                    if (npt.end_token.end_char > li[i1].end_char and len(npt.adjectives) > 0 and not npt.adjectives[0].end_token.next0_.is_comma):
                        ok = False
                    elif (TerrItemToken._m_unknown_regions.try_parse(npt.end_token, TerminParseAttr.FULLWORDSONLY) is not None):
                        # Name looks like an unknown region word — require a non-city
                        # geo referent adjacent (before or after) to accept it.
                        ok1 = False
                        if (li[0].begin_token.previous is not None):
                            ttt = li[0].begin_token.previous
                            if (ttt.is_comma and ttt.previous is not None):
                                ttt = ttt.previous
                            geo_ = Utils.asObjectOrNull(ttt.get_referent(), GeoReferent)
                            if (geo_ is not None and not geo_.is_city):
                                ok1 = True
                        if (npt.end_token.next0_ is not None):
                            ttt = npt.end_token.next0_
                            if (ttt.is_comma and ttt.next0_ is not None):
                                ttt = ttt.next0_
                            geo_ = Utils.asObjectOrNull(ttt.get_referent(), GeoReferent)
                            if (geo_ is not None and not geo_.is_city):
                                ok1 = True
                        if (not ok1):
                            return None
            if (li[0].value == "ПОРТ"):
                if (li[i1].chars.is_all_upper or li[i1].chars.is_latin_letter):
                    return None
        elif (li[0].geo_object_before):
            ok = True
        elif (li[i1].geo_object_after and not li[i1].is_newline_after):
            ok = True
        else:
            ok = CityAttachHelper.check_year_after(li[i1].end_token.next0_)
            if (not ok):
                ok = CityAttachHelper.check_street_after(li[i1].end_token.next0_)
            if (not ok and li[0].begin_token.previous is not None and li[0].begin_token.previous.is_value("В", None)):
                ok = True
    else:
        return None
    if (not ok and not always):
        if (MiscLocationHelper.check_near_before(li[0].begin_token.previous) is None):
            return None
    # Truncate the item list so it covers exactly noun + name.
    if (len(li) > (i1 + 1)):
        del li[i1 + 1:i1 + 1 + len(li) - i1 - 1]
    city = GeoReferent()
    if (oi.value is not None and oi.value.referent is not None):
        # Clone the ontology referent rather than sharing its occurrence list.
        city = (Utils.asObjectOrNull(oi.value.referent.clone(), GeoReferent))
        city.occurrence.clear()
    if (not li[0].morph.case_.is_undefined and li[0].morph.gender != MorphGender.UNDEFINED):
        # Re-derive an adjectival one-word name agreed with the noun's case/gender.
        if (li[i1].end_token.morph.class0_.is_adjective and li[i1].begin_token == li[i1].end_token):
            nam = ProperNameHelper.get_name_ex(li[i1].begin_token, li[i1].end_token, MorphClass.ADJECTIVE, li[0].morph.case_, li[0].morph.gender, False, False)
            if (nam is not None and nam != name):
                name = nam
    if (li[0].morph.case_.is_nominative):
        if (alt_name is not None):
            city._add_name(alt_name)
        alt_name = (None)
    city._add_name(name)
    if (prob_adj is not None):
        city._add_name(prob_adj + " " + name)
    if (alt_name is not None):
        city._add_name(alt_name)
        if (prob_adj is not None):
            city._add_name(prob_adj + " " + alt_name)
    if (typ is not None):
        city._add_typ(typ)
    elif (not city.is_city):
        city._add_typ_city(li[0].kit.base_language)
    if (typ2 is not None):
        city._add_typ(typ2.lower())
    if (li[0].higher_geo is not None and GeoOwnerHelper.can_be_higher(li[0].higher_geo, city)):
        city.higher = li[0].higher_geo
    if (li[0].typ == CityItemToken.ItemType.MISC):
        del li[0]
    res = ReferentToken._new734(city, li[0].begin_token, li[len(li) - 1].end_token, mc)
    # Absorb a trailing "-<small digit>" suffix into every stored name slot.
    if (res.end_token.next0_ is not None and res.end_token.next0_.is_hiphen and (isinstance(res.end_token.next0_.next0_, NumberToken))):
        num = Utils.asObjectOrNull(res.end_token.next0_.next0_, NumberToken)
        if ((num.typ == NumberSpellingType.DIGIT and not num.morph.class0_.is_adjective and num.int_value is not None) and (num.int_value < 50)):
            for s in city.slots:
                if (s.type_name == GeoReferent.ATTR_NAME):
                    city.upload_slot(s, "{0}-{1}".format(s.value, num.value))
            res.end_token = num
    # "городок" followed by a house number is an address part, not a city.
    if (li[0].begin_token == li[0].end_token and li[0].begin_token.is_value("ГОРОДОК", None)):
        if (AddressItemToken.check_house_after(res.end_token.next0_, True, False)):
            return None
    return res
def __try_attach(self, t : 'Token', key_word : bool) -> 'ReferentToken':
    """Scan forward from t collecting bank requisites (account URIs, bank
    organization, correspondent bank) into a single BankDataReferent.

    t: first token of the candidate requisites area.
    key_word: True when an introductory keyword was already seen, which
        relaxes the requirement to meet a bank organization/URI early.
    Returns a ReferentToken spanning the collected area, or None.
    """
    if (t is None):
        return None
    t0 = t
    t1 = t
    uris_keys = None   # schemes collected so far (e.g. "Р/С", "БИК", "ИНН")
    uris = None        # UriReferent requisites in order of appearance
    org0_ = None       # the bank organization itself
    cor_org = None     # correspondent bank ("в <банк>" after К/С)
    org_is_bank = False
    empty = 0          # count of consecutive uninformative tokens
    last_uri = None
    # Transpiled loop: first_pass* emulates C# "for(;;t=t.Next)" semantics.
    first_pass3017 = True
    while True:
        if first_pass3017: first_pass3017 = False
        else: t = t.next0_
        if (not (t is not None)): break
        if (t.is_table_control_char and t != t0):
            break
        if (t.is_comma or t.morph.class0_.is_preposition or t.is_char_of("/\\")):
            continue
        bank_keyword = False
        # Skip "полное наименование/название" preambles.
        if (t.is_value("ПОЛНЫЙ", None) and t.next0_ is not None and ((t.next0_.is_value("НАИМЕНОВАНИЕ", None) or t.next0_.is_value("НАЗВАНИЕ", None)))):
            t = t.next0_.next0_
            if (t is None):
                break
        if (t.is_value("БАНК", None)):
            if ((isinstance(t, ReferentToken)) and t.get_referent().type_name == "ORGANIZATION"):
                bank_keyword = True
            tt = t.next0_
            npt = NounPhraseHelper.try_parse(tt, NounPhraseParseAttr.NO, 0, None)
            if (npt is not None):
                tt = npt.end_token.next0_
            if (tt is not None and tt.is_char(':')):
                tt = tt.next0_
            if (tt is not None):
                if (not bank_keyword):
                    t = tt
                    bank_keyword = True
                elif (tt.get_referent() is not None and tt.get_referent().type_name == "ORGANIZATION"):
                    t = tt
        r = t.get_referent()
        if (r is not None and r.type_name == "ORGANIZATION"):
            # Walk up to 4 parent levels looking for KIND == "Bank".
            is_bank = False
            kk = 0
            rr = r
            while rr is not None and (kk < 4):
                is_bank = Utils.compareStrings(Utils.ifNotNull(rr.get_string_value("KIND"), ""), "Bank", True) == 0
                if (is_bank):
                    break
                rr = rr.parent_referent; kk += 1
            if (not is_bank and bank_keyword):
                is_bank = True
            if (not is_bank and uris is not None and "ИНН" in uris_keys):
                return None
            # "к/с ... в <банк>" → correspondent bank.
            if ((last_uri is not None and last_uri.scheme == "К/С" and t.previous is not None) and t.previous.is_value("В", None)):
                cor_org = r
                t1 = t
            elif (org0_ is None or ((not org_is_bank and is_bank))):
                org0_ = r
                t1 = t
                org_is_bank = is_bank
                if (is_bank):
                    continue
                if (uris is None and not key_word):
                    return None
            continue
        if (isinstance(r, UriReferent)):
            u = Utils.asObjectOrNull(r, UriReferent)
            if (uris is None):
                # First requisite must be a bank-related scheme.
                if (not BankAnalyzer.__is_bank_req(u.scheme)):
                    return None
                if (u.scheme == "ИНН" and t.is_newline_after):
                    return None
                uris = list()
                uris_keys = list()
            else:
                if (not BankAnalyzer.__is_bank_req(u.scheme)):
                    break
                if (u.scheme in uris_keys):
                    break
                if (u.scheme == "ИНН"):
                    if (empty > 0):
                        break
            uris_keys.append(u.scheme)
            uris.append(u)
            last_uri = u
            t1 = t
            empty = 0
            continue
        elif (uris is None and not key_word and not org_is_bank):
            return None
        if (r is not None and ((r.type_name == "GEO" or r.type_name == "ADDRESS"))):
            empty += 1
            continue
        if (isinstance(t, TextToken)):
            if (t.is_value("ПОЛНЫЙ", None) or t.is_value("НАИМЕНОВАНИЕ", None) or t.is_value("НАЗВАНИЕ", None)):
                pass
            elif (t.chars.is_letter):
                tok = BankAnalyzer.__m_ontology.try_parse(t, TerminParseAttr.NO)
                if (tok is not None):
                    t = tok.end_token
                    empty = 0
                else:
                    empty += 1
                    # A new line starting a "<noun phrase>:" heading ends the area.
                    if (t.is_newline_before):
                        nnn = NounPhraseHelper.try_parse(t, NounPhraseParseAttr.NO, 0, None)
                        if (nnn is not None and nnn.end_token.next0_ is not None and nnn.end_token.next0_.is_char(':')):
                            break
        if (uris is None):
            break
        if (empty > 2):
            break
        if (empty > 0 and t.is_char(':') and t.is_newline_after):
            break
        if (((isinstance(t, NumberToken)) and t.is_newline_before and t.next0_ is not None) and not t.next0_.chars.is_letter):
            break
    if (uris is None):
        return None
    # A settlement or personal account is mandatory.
    if (not "Р/С" in uris_keys and not "Л/С" in uris_keys):
        return None
    ok = False  # NOTE(review): assigned but never used below
    if ((len(uris) < 2) and org0_ is None):
        return None
    bdr = BankDataReferent()
    for u in uris:
        bdr.add_slot(BankDataReferent.ATTR_ITEM, u, False, 0)
    if (org0_ is not None):
        bdr.add_slot(BankDataReferent.ATTR_BANK, org0_, False, 0)
    if (cor_org is not None):
        bdr.add_slot(BankDataReferent.ATTR_CORBANK, cor_org, False, 0)
    # Pull in bank requisites already attached to an organization just before t0.
    org0 = (None if t0.previous is None else t0.previous.get_referent())
    if (org0 is not None and org0.type_name == "ORGANIZATION"):
        for s in org0.slots:
            if (isinstance(s.value, UriReferent)):
                u = Utils.asObjectOrNull(s.value, UriReferent)
                if (BankAnalyzer.__is_bank_req(u.scheme)):
                    if (not u.scheme in uris_keys):
                        bdr.add_slot(BankDataReferent.ATTR_ITEM, u, False, 0)
    return ReferentToken(bdr, t0, t1)
def process(self, kit : 'AnalysisKit') -> None:
    # Main analyzer entry point. (The original Russian comment said
    # "выделение телефонов" / phone extraction — apparently a copy-paste
    # artifact; this pass extracts keyword referents. TODO confirm.)
    ad = kit.get_analyzer_data(self)
    # Run denomination analysis first if no such analyzer is in the pipeline.
    has_denoms = False
    for a in kit.processor.analyzers:
        if ((isinstance(a, DenominationAnalyzer)) and not a.ignore_this_analyzer):
            has_denoms = True
    if (not has_denoms):
        a = DenominationAnalyzer()
        a.process(kit)
    li = list()
    tmp = io.StringIO()
    tmp2 = list()
    # Count tokens; max0_ is used for rank normalization in __set_rank.
    max0_ = 0
    t = kit.first_token
    while t is not None:
        max0_ += 1
        t = t.next0_
    cur = 0
    t = kit.first_token
    # Pass 1: turn verbs (predicates) and noun phrases (objects) into
    # KeywordReferent tokens embedded into the token chain.
    first_pass3292 = True
    while True:
        if first_pass3292: first_pass3292 = False
        else: t = t.next0_; cur += 1
        if (not (t is not None)): break
        r = t.get_referent()
        if (r is not None):
            t = self.__add_referents(ad, t, cur, max0_)
            continue
        if (not (isinstance(t, TextToken))):
            continue
        if (not t.chars.is_letter or (t.length_char < 3)):
            continue
        term = t.term
        if (term == "ЕСТЬ"):
            # Keep "ЕСТЬ" only right after another verb.
            if ((isinstance(t.previous, TextToken)) and t.previous.morph.class0_.is_verb):
                pass
            else:
                continue
        npt = None
        npt = NounPhraseHelper.try_parse(t, Utils.valToEnum((NounPhraseParseAttr.ADJECTIVECANBELAST) | (NounPhraseParseAttr.PARSEPREPOSITION), NounPhraseParseAttr), 0, None)
        if (npt is None):
            # Not a noun phrase — try to register a verb as a PREDICATE keyword.
            mc = t.get_morph_class_in_dictionary()
            if (mc.is_verb and not mc.is_preposition):
                if (t.is_verb_be):
                    continue
                if (t.is_value("МОЧЬ", None) or t.is_value("WOULD", None)):
                    continue
                kref = KeywordReferent._new1595(KeywordType.PREDICATE)
                norm = t.get_normal_case_text(MorphClass.VERB, MorphNumber.SINGULAR, MorphGender.UNDEFINED, False)
                if (norm is None):
                    norm = t.lemma
                # Strip the reflexive suffix "-СЯ".
                if (norm.endswith("ЬСЯ")):
                    norm = norm[0:0+len(norm) - 2]
                kref.add_slot(KeywordReferent.ATTR_VALUE, norm, False, 0)
                drv = DerivateService.find_derivates(norm, True, t.morph.language)
                KeywordAnalyzer.__add_normals(kref, drv, norm)
                kref = (Utils.asObjectOrNull(ad.register_referent(kref), KeywordReferent))
                KeywordAnalyzer.__set_rank(kref, cur, max0_)
                rt1 = ReferentToken._new734(ad.register_referent(kref), t, t, t.morph)
                kit.embed_token(rt1)
                t = (rt1)
                continue
            continue
        if (npt.internal_noun is not None):
            continue
        # Skip fixed expressions "в целом", "в частности", "со стороны".
        if (npt.end_token.is_value("ЦЕЛОМ", None) or npt.end_token.is_value("ЧАСТНОСТИ", None)):
            if (npt.preposition is not None):
                t = npt.end_token
                continue
        if (npt.end_token.is_value("СТОРОНЫ", None) and npt.preposition is not None and npt.preposition.normal == "С"):
            t = npt.end_token
            continue
        if (npt.begin_token == npt.end_token):
            mc = t.get_morph_class_in_dictionary()
            if (mc.is_preposition):
                continue
            elif (mc.is_adverb):
                if (t.is_value("ПОТОМ", None)):
                    continue
                else:
                    pass
        # Register each meaningful word of the noun phrase as an OBJECT keyword.
        li.clear()
        t0 = t
        tt = t
        first_pass3293 = True
        while True:
            if first_pass3293: first_pass3293 = False
            else: tt = tt.next0_
            if (not (tt is not None and tt.end_char <= npt.end_char)): break
            if (not (isinstance(tt, TextToken))):
                continue
            if (tt.is_value("NATURAL", None)):
                pass
            if ((tt.length_char < 3) or not tt.chars.is_letter):
                continue
            mc = tt.get_morph_class_in_dictionary()
            if ((mc.is_preposition or mc.is_pronoun or mc.is_personal_pronoun) or mc.is_conjunction):
                if (tt.is_value("ОТНОШЕНИЕ", None)):
                    pass
                else:
                    continue
            if (mc.is_misc):
                if (MiscHelper.is_eng_article(tt)):
                    continue
            kref = KeywordReferent._new1595(KeywordType.OBJECT)
            norm = tt.lemma
            kref.add_slot(KeywordReferent.ATTR_VALUE, norm, False, 0)
            if (norm != "ЕСТЬ"):
                drv = DerivateService.find_derivates(norm, True, tt.morph.language)
                KeywordAnalyzer.__add_normals(kref, drv, norm)
            kref = (Utils.asObjectOrNull(ad.register_referent(kref), KeywordReferent))
            KeywordAnalyzer.__set_rank(kref, cur, max0_)
            rt1 = ReferentToken._new734(kref, tt, tt, tt.morph)
            kit.embed_token(rt1)
            if (tt == t and len(li) == 0):
                t0 = (rt1)
                t = (rt1)
            li.append(kref)
        # Multi-word phrase: build a compound OBJECT keyword referencing the parts.
        if (len(li) > 1):
            kref = KeywordReferent._new1595(KeywordType.OBJECT)
            Utils.setLengthStringIO(tmp, 0)
            tmp2.clear()
            has_norm = False
            for kw in li:
                s = kw.get_string_value(KeywordReferent.ATTR_VALUE)
                if (tmp.tell() > 0):
                    print(' ', end="", file=tmp)
                print(s, end="", file=tmp)
                n = kw.get_string_value(KeywordReferent.ATTR_NORMAL)
                if (n is not None):
                    has_norm = True
                    tmp2.append(n)
                else:
                    tmp2.append(s)
                kref.add_slot(KeywordReferent.ATTR_REF, kw, False, 0)
            val = npt.get_normal_case_text(None, MorphNumber.SINGULAR, MorphGender.UNDEFINED, False)
            kref.add_slot(KeywordReferent.ATTR_VALUE, val, False, 0)
            # Normal form = sorted normals of the parts, space-joined.
            Utils.setLengthStringIO(tmp, 0)
            tmp2.sort()
            for s in tmp2:
                if (tmp.tell() > 0):
                    print(' ', end="", file=tmp)
                print(s, end="", file=tmp)
            norm = Utils.toStringStringIO(tmp)
            if (norm != val):
                kref.add_slot(KeywordReferent.ATTR_NORMAL, norm, False, 0)
            kref = (Utils.asObjectOrNull(ad.register_referent(kref), KeywordReferent))
            KeywordAnalyzer.__set_rank(kref, cur, max0_)
            rt1 = ReferentToken._new734(kref, t0, t, npt.morph)
            kit.embed_token(rt1)
            t = (rt1)
    # Pass 2: merge adjacent OBJECT keywords joined by genitive or "of".
    cur = 0
    t = kit.first_token
    first_pass3294 = True
    while True:
        if first_pass3294: first_pass3294 = False
        else: t = t.next0_; cur += 1
        if (not (t is not None)): break
        kw = Utils.asObjectOrNull(t.get_referent(), KeywordReferent)
        if (kw is None or kw.typ != KeywordType.OBJECT):
            continue
        if (t.next0_ is None or kw.child_words > 2):
            continue
        t1 = t.next0_
        if (t1.is_value("OF", None) and (t1.whitespaces_after_count < 3) and t1.next0_ is not None):
            t1 = t1.next0_
            if ((isinstance(t1, TextToken)) and MiscHelper.is_eng_article(t1) and t1.next0_ is not None):
                t1 = t1.next0_
        elif (not t1.morph.case_.is_genitive or t.whitespaces_after_count > 1):
            continue
        kw2 = Utils.asObjectOrNull(t1.get_referent(), KeywordReferent)
        if (kw2 is None):
            continue
        if (kw == kw2):
            continue
        if (kw2.typ != KeywordType.OBJECT or (kw.child_words + kw2.child_words) > 3):
            continue
        kw_un = KeywordReferent()
        kw_un._union(kw, kw2, MiscHelper.get_text_value(t1, t1, GetTextAttr.NO))
        kw_un = (Utils.asObjectOrNull(ad.register_referent(kw_un), KeywordReferent))
        KeywordAnalyzer.__set_rank(kw_un, cur, max0_)
        rt1 = ReferentToken._new734(kw_un, t, t1, t.morph)
        kit.embed_token(rt1)
        t = (rt1)
    if (KeywordAnalyzer.SORT_KEYWORDS_BY_RANK):
        all0_ = list(ad.referents)
        all0_.sort(key=operator.attrgetter('rank'), reverse=True)
        ad.referents = all0_
    if (KeywordAnalyzer.ANNOTATION_MAX_SENTENCES > 0):
        ano = AutoannoSentToken.create_annotation(kit, KeywordAnalyzer.ANNOTATION_MAX_SENTENCES)
        if (ano is not None):
            ad.register_referent(ano)
def try_parse(t: 'Token', items: typing.List['NounPhraseItem'], attrs: 'NounPhraseParseAttr') -> 'NounPhraseItem':
    """Try to read one noun-phrase item (adjective or noun candidate) at t.

    t: current token (TextToken, NumberToken, or ReferentToken).
    items: items already accepted for the phrase under construction
        (used for case/number/gender accord checks); may be None/empty.
    attrs: NounPhraseParseAttr flag set controlling participles, pronouns, etc.
    Returns a NounPhraseItem with can_be_noun/can_be_adj and the collected
    morphological variants, or None if t cannot be part of a noun phrase.
    """
    if (t is None):
        return None
    t0 = t
    _can_be_surname = False
    _is_doubt_adj = False
    rt = Utils.asObjectOrNull(t, ReferentToken)
    # A one-token ReferentToken: parse its inner text token, then rebind.
    if (rt is not None and rt.begin_token == rt.end_token and (isinstance(rt.begin_token, TextToken))):
        res = NounPhraseItem.try_parse(rt.begin_token, items, attrs)
        if (res is not None):
            res.begin_token = res.end_token = t
            res.can_be_noun = True
            return res
    if (rt is not None):
        # Multi-token referent: always a noun with its string value as normal form.
        res = NounPhraseItem(t, t)
        for m in t.morph.items:
            v = NounPhraseItemTextVar(m, None)
            v.normal_value = str(t.get_referent())
            res.noun_morph.append(v)
        res.can_be_noun = True
        return res
    if (isinstance(t, NumberToken)):
        pass
    has_legal_verb = False
    if (isinstance(t, TextToken)):
        if (not t.chars.is_letter):
            return None
        str0_ = t.term
        # Words ending in А/О are often verb forms — filter them out early.
        if (str0_[len(str0_) - 1] == 'А' or str0_[len(str0_) - 1] == 'О'):
            for wf in t.morph.items:
                if ((isinstance(wf, MorphWordForm)) and wf.is_in_dictionary):
                    if (wf.class0_.is_verb):
                        mc = t.get_morph_class_in_dictionary()
                        if (not mc.is_noun and (((attrs) & (NounPhraseParseAttr.IGNOREPARTICIPLES))) == (NounPhraseParseAttr.NO)):
                            if (not LanguageHelper.ends_with_ex(str0_, "ОГО", "ЕГО", None, None)):
                                return None
                        has_legal_verb = True
                    if (wf.class0_.is_adverb):
                        if (t.next0_ is None or not t.next0_.is_hiphen):
                            if ((str0_ == "ВСЕГО" or str0_ == "ДОМА" or str0_ == "НЕСКОЛЬКО") or str0_ == "МНОГО" or str0_ == "ПОРЯДКА"):
                                pass
                            else:
                                return None
                    if (wf.class0_.is_adjective):
                        # Short-form adjectives ("к.ф.") are doubtful candidates.
                        if (wf.contains_attr("к.ф.", None)):
                            if (t.get_morph_class_in_dictionary() == MorphClass.ADJECTIVE):
                                pass
                            else:
                                _is_doubt_adj = True
        mc0 = t.morph.class0_
        # Probable surname forms (-ИН/-ЕН/-ЫН/-ОВ) handling.
        if (mc0.is_proper_surname and not t.chars.is_all_lower):
            for wf in t.morph.items:
                if (wf.class0_.is_proper_surname and wf.number != MorphNumber.PLURAL):
                    wff = Utils.asObjectOrNull(wf, MorphWordForm)
                    if (wff is None):
                        continue
                    s = Utils.ifNotNull((Utils.ifNotNull(wff.normal_full, wff.normal_case)), "")
                    if (LanguageHelper.ends_with_ex(s, "ИН", "ЕН", "ЫН", None)):
                        if (not wff.is_in_dictionary):
                            _can_be_surname = True
                        else:
                            return None
                    if (wff.is_in_dictionary and LanguageHelper.ends_with(s, "ОВ")):
                        _can_be_surname = True
        # Probable first-name forms: accept only in restricted contexts.
        if (mc0.is_proper_name and not t.chars.is_all_lower):
            for wff in t.morph.items:
                wf = Utils.asObjectOrNull(wff, MorphWordForm)
                if (wf is None):
                    continue
                if (wf.normal_case == "ГОР"):
                    continue
                if (wf.class0_.is_proper_name and wf.is_in_dictionary):
                    if (wf.normal_case is None or not wf.normal_case.startswith("ЛЮБ")):
                        if (mc0.is_adjective and t.morph.contains_attr("неизм.", None)):
                            pass
                        elif ((((attrs) & (NounPhraseParseAttr.REFERENTCANBENOUN))) == (NounPhraseParseAttr.REFERENTCANBENOUN)):
                            pass
                        else:
                            if (items is None or (len(items) < 1)):
                                return None
                            if (not items[0].is_std_adjective):
                                return None
        # Comparative-degree adjectives are rejected.
        if (mc0.is_adjective and t.morph.items_count == 1):
            if (t.morph.get_indexer_item(0).contains_attr("в.ср.ст.", None)):
                return None
        mc1 = t.get_morph_class_in_dictionary()
        if (mc1 == MorphClass.VERB and t.morph.case_.is_undefined):
            return None
        # IGNOREPARTICIPLES: skip active-voice participles except reflexive "-СЯ".
        if (((((attrs) & (NounPhraseParseAttr.IGNOREPARTICIPLES))) == (NounPhraseParseAttr.IGNOREPARTICIPLES) and t.morph.class0_.is_verb and not t.morph.class0_.is_noun) and not t.morph.class0_.is_proper):
            for wf in t.morph.items:
                if (wf.class0_.is_verb):
                    if (wf.contains_attr("дейст.з.", None)):
                        if (LanguageHelper.ends_with(t.term, "СЯ")):
                            pass
                        else:
                            return None
    t1 = None
    # Two attempts: k=0 may extend over a hyphenated pair ("ХХХ-ууу"),
    # k=1 retries with t1 set (e.g. "БИЗНЕС-..." continuation).
    for k in range(2):
        t = (Utils.ifNotNull(t1, t0))
        if (k == 0):
            if (((isinstance(t0, TextToken)) and t0.next0_ is not None and t0.next0_.is_hiphen) and t0.next0_.next0_ is not None):
                if (not t0.is_whitespace_after and not t0.morph.class0_.is_pronoun and not (isinstance(t0.next0_.next0_, NumberToken))):
                    if (not t0.next0_.is_whitespace_after):
                        t = t0.next0_.next0_
                    elif (t0.next0_.next0_.chars.is_all_lower and LanguageHelper.ends_with(t0.term, "О")):
                        t = t0.next0_.next0_
        it = NounPhraseItem._new404(t0, t, _can_be_surname)
        if (t0 == t and (isinstance(t0, ReferentToken))):
            it.can_be_noun = True
        it.morph = MorphCollection(t0.morph)
        can_be_prepos = False
        # Classify each morphological variant as adjective and/or noun reading.
        for v in t.morph.items:
            wf = Utils.asObjectOrNull(v, MorphWordForm)
            if (v.class0_.is_verb and not v.case_.is_undefined):
                it.can_be_adj = True
                it.adj_morph.append(NounPhraseItemTextVar(v, t))
                continue
            if (v.class0_.is_preposition):
                can_be_prepos = True
            if (v.class0_.is_adjective or ((v.class0_.is_pronoun and not v.class0_.is_personal_pronoun and not v.contains_attr("неизм.", None))) or ((v.class0_.is_noun and (isinstance(t, NumberToken))))):
                # Adjective reading: must accord with the items accepted so far.
                if (NounPhraseItem.try_accord_variant(items, (0 if items is None else len(items)), v, False)):
                    is_doub = False  # NOTE(review): assigned but never used
                    if (v.contains_attr("к.ф.", None)):
                        continue
                    if (v.contains_attr("собир.", None) and not (isinstance(t, NumberToken))):
                        if (wf is not None and wf.is_in_dictionary):
                            return None
                        continue
                    if (v.contains_attr("сравн.", None)):
                        continue
                    ok = True
                    if (isinstance(t, TextToken)):
                        s = t.term
                        if (s == "ПРАВО" or s == "ПРАВА"):
                            ok = False
                        elif (LanguageHelper.ends_with(s, "ОВ") and t.get_morph_class_in_dictionary().is_noun):
                            ok = False
                    elif (isinstance(t, NumberToken)):
                        if (v.class0_.is_noun and t.morph.class0_.is_adjective):
                            ok = False
                        elif (t.morph.class0_.is_noun and ((((attrs) & (NounPhraseParseAttr.PARSENUMERICASADJECTIVE))) == (NounPhraseParseAttr.NO))):
                            ok = False
                    if (ok):
                        it.adj_morph.append(NounPhraseItemTextVar(v, t))
                        it.can_be_adj = True
                        if (_is_doubt_adj and t0 == t):
                            it.is_doubt_adjective = True
                        if (has_legal_verb and wf is not None and wf.is_in_dictionary):
                            it.can_be_noun = True
                        if (wf is not None and wf.class0_.is_pronoun):
                            it.can_be_noun = True
                            it.noun_morph.append(NounPhraseItemTextVar(v, t))
            # Noun reading.
            can_be_noun_ = False
            if (isinstance(t, NumberToken)):
                pass
            elif (v.class0_.is_noun or ((wf is not None and wf.normal_case == "САМ"))):
                can_be_noun_ = True
            elif (v.class0_.is_personal_pronoun):
                if (items is None or len(items) == 0):
                    can_be_noun_ = True
                else:
                    for it1 in items:
                        if (it1.is_verb):
                            if (len(items) == 1 and not v.case_.is_nominative):
                                can_be_noun_ = True
                            else:
                                return None
                    if (len(items) == 1):
                        if (items[0].can_be_adj_for_personal_pronoun):
                            can_be_noun_ = True
            elif ((v.class0_.is_pronoun and ((items is None or len(items) == 0 or ((len(items) == 1 and items[0].can_be_adj_for_personal_pronoun)))) and wf is not None) and (((((wf.normal_case == "ТОТ" or wf.normal_full == "ТО" or wf.normal_case == "ТО") or wf.normal_case == "ЭТО" or wf.normal_case == "ВСЕ") or wf.normal_case == "ЧТО" or wf.normal_case == "КТО") or wf.normal_full == "КОТОРЫЙ" or wf.normal_case == "КОТОРЫЙ"))):
                if (wf.normal_case == "ВСЕ"):
                    # "все равно" is a fixed expression, not a noun phrase.
                    if (t.next0_ is not None and t.next0_.is_value("РАВНО", None)):
                        return None
                can_be_noun_ = True
            elif (wf is not None and ((Utils.ifNotNull(wf.normal_full, wf.normal_case))) == "КОТОРЫЙ" and (((attrs) & (NounPhraseParseAttr.PARSEPRONOUNS))) == (NounPhraseParseAttr.NO)):
                return None
            elif (v.class0_.is_proper and (isinstance(t, TextToken))):
                if (t.length_char > 4 or v.class0_.is_proper_name):
                    can_be_noun_ = True
            if (can_be_noun_):
                added = False
                # MULTINOUNS: allow "X, Y и Z <noun>" where every prior item is conj-joined.
                if (items is not None and len(items) > 1 and (((attrs) & (NounPhraseParseAttr.MULTINOUNS))) != (NounPhraseParseAttr.NO)):
                    ok1 = True
                    ii = 1
                    while ii < len(items):
                        if (not items[ii].conj_before):
                            ok1 = False
                            break
                        ii += 1
                    if (ok1):
                        if (NounPhraseItem.try_accord_variant(items, (0 if items is None else len(items)), v, True)):
                            it.noun_morph.append(NounPhraseItemTextVar(v, t))
                            it.can_be_noun = True
                            it.multi_nouns = True
                            added = True
                if (not added):
                    if (NounPhraseItem.try_accord_variant(items, (0 if items is None else len(items)), v, False)):
                        it.noun_morph.append(NounPhraseItemTextVar(v, t))
                        it.can_be_noun = True
                        # Invariant personal pronouns may double as adjectives.
                        if (v.class0_.is_personal_pronoun and t.morph.contains_attr("неизм.", None) and not it.can_be_adj):
                            itt = NounPhraseItemTextVar(v, t)
                            itt.case_ = MorphCase.ALL_CASES
                            itt.number = MorphNumber.UNDEFINED
                            if (itt.normal_value is None):
                                pass
                            it.adj_morph.append(itt)
                            it.can_be_adj = True
                    elif ((len(items) > 0 and len(items[0].adj_morph) > 0 and items[0].adj_morph[0].number == MorphNumber.PLURAL) and not ((items[0].adj_morph[0].case_) & v.case_).is_undefined and not items[0].adj_morph[0].class0_.is_verb):
                        # Plural adjective + "noun, and <noun phrase>": accept when
                        # a compatible second phrase follows the comma/and.
                        if (t.next0_ is not None and t.next0_.is_comma_and and (isinstance(t.next0_.next0_, TextToken))):
                            npt2 = NounPhraseHelper.try_parse(t.next0_.next0_, attrs, 0, None)
                            if (npt2 is not None and npt2.preposition is None and not ((npt2.morph.case_) & v.case_ & items[0].adj_morph[0].case_).is_undefined):
                                it.noun_morph.append(NounPhraseItemTextVar(v, t))
                                it.can_be_noun = True
        if (t0 != t):
            # Hyphenated pair: prepend the first part to each variant's normal form.
            for v in it.adj_morph:
                v.correct_prefix(Utils.asObjectOrNull(t0, TextToken), False)
            for v in it.noun_morph:
                v.correct_prefix(Utils.asObjectOrNull(t0, TextToken), True)
        if (k == 1 and it.can_be_noun and not it.can_be_adj):
            if (t1 is not None):
                it.end_token = t1
            else:
                it.end_token = t0.next0_.next0_
            for v in it.noun_morph:
                if (v.normal_value is not None and (v.normal_value.find('-') < 0)):
                    v.normal_value = "{0}-{1}".format(v.normal_value, it.end_token.get_normal_case_text(None, MorphNumber.UNDEFINED, MorphGender.UNDEFINED, False))
        if (it.can_be_adj):
            if (NounPhraseItem.__m_std_adjectives.try_parse(it.begin_token, TerminParseAttr.NO) is not None):
                it.is_std_adjective = True
        # A token that may be a preposition: reject if a longer phrase starting
        # at/after it parses with a compatible case.
        if (can_be_prepos and it.can_be_noun):
            if (items is not None and len(items) > 0):
                npt1 = NounPhraseHelper.try_parse(t, Utils.valToEnum((NounPhraseParseAttr.PARSEPREPOSITION) | (NounPhraseParseAttr.PARSEPRONOUNS) | (NounPhraseParseAttr.PARSEVERBS), NounPhraseParseAttr), 0, None)
                if (npt1 is not None and npt1.end_char > t.end_char):
                    return None
            else:
                npt1 = NounPhraseHelper.try_parse(t.next0_, Utils.valToEnum((NounPhraseParseAttr.PARSEPRONOUNS) | (NounPhraseParseAttr.PARSEVERBS), NounPhraseParseAttr), 0, None)
                if (npt1 is not None):
                    mc = LanguageHelper.get_case_after_preposition(t.lemma)
                    if (not ((mc) & npt1.morph.case_).is_undefined):
                        return None
        if (it.can_be_noun or it.can_be_adj or k == 1):
            # Absorb pronoun particles ("же", "бы", "ли", "-нибудь", "-либо", "-то").
            if (it.begin_token.morph.class0_.is_pronoun):
                tt2 = it.end_token.next0_
                if ((tt2 is not None and tt2.is_hiphen and not tt2.is_whitespace_after) and not tt2.is_whitespace_before):
                    tt2 = tt2.next0_
                if (isinstance(tt2, TextToken)):
                    ss = tt2.term
                    if ((ss == "ЖЕ" or ss == "БЫ" or ss == "ЛИ") or ss == "Ж"):
                        it.end_token = tt2
                    elif (ss == "НИБУДЬ" or ss == "ЛИБО" or (((ss == "ТО" and tt2.previous.is_hiphen)) and it.can_be_adj)):
                        it.end_token = tt2
                        for m in it.adj_morph:
                            m.normal_value = "{0}-{1}".format(m.normal_value, ss)
                            if (m.single_number_value is not None):
                                m.single_number_value = "{0}-{1}".format(m.single_number_value, ss)
            return it
        if (t0 == t):
            # "БИЗНЕС <same-cased word>": retry with the pair as one item.
            if (t0.is_value("БИЗНЕС", None) and t0.next0_ is not None and t0.next0_.chars == t0.chars):
                t1 = t0.next0_
                continue
            return it
    return None
def try_parse(t : 'Token', add_units : 'TerminCollection', can_be_set : bool=True, can_units_absent : bool=False, is_resctriction : bool=False, is_subval : bool=False) -> 'MeasureToken':
    """Try to extract a measure construction ("<name> ... <numbers> <units>")
    starting at token *t*; returns a MeasureToken or None.

    Params (as used below):
      t               -- first token of the candidate; must be a TextToken.
      add_units       -- extra, user-supplied unit terms forwarded to
                         UnitToken / NumbersWithUnitToken parsers.
      can_be_set      -- when True, a second value after a comma/and may be
                         attached as a value "set" (see tail of the function).
      can_units_absent-- when True, a missing unit after the number is tolerated.
      is_resctriction -- when True, a ':'/','/'_' separator aborts parsing
                         (sic: parameter name kept as in the original port).
      is_subval       -- True when called recursively for sub-values after ':'.

    NOTE(review): this block arrived with its physical line structure
    collapsed; the indentation below was reconstructed from the token stream
    and should be verified against the upstream pullenti sources.
    """
    if (not (isinstance(t, TextToken))):
        return None
    if (t.is_table_control_char):
        return None
    t0 = t
    whd = None
    minmax = 0
    # Detect a leading "min"/"max" marker; minmax is returned via an
    # out-argument wrapper (translator artifact of C# ref/out parameters).
    wrapminmax1625 = RefOutArgWrapper(minmax)
    tt = NumbersWithUnitToken._is_min_or_max(t0, wrapminmax1625)
    minmax = wrapminmax1625.value
    if (tt is not None):
        t = tt.next0_
    # The measure "name" is normally a noun phrase at the start.
    npt = NounPhraseHelper.try_parse(t, Utils.valToEnum((NounPhraseParseAttr.PARSEPREPOSITION) | (NounPhraseParseAttr.IGNOREBRACKETS), NounPhraseParseAttr), 0, None)
    if (npt is None):
        # Fallbacks when no noun phrase is found: width/height/depth markers,
        # "КПД" (efficiency), unknown long words, lowercase "t" (temperature?).
        whd = NumbersWithUnitToken._try_parsewhl(t)
        if (whd is not None):
            npt = NounPhraseToken(t0, whd.end_token)
        elif (t0.is_value("КПД", None)):
            npt = NounPhraseToken(t0, t0)
        elif ((isinstance(t0, TextToken)) and t0.length_char > 3 and t0.get_morph_class_in_dictionary().is_undefined):
            npt = NounPhraseToken(t0, t0)
        elif (t0.is_value("T", None) and t0.chars.is_all_lower):
            npt = NounPhraseToken(t0, t0)
            t = t0
            if (t.next0_ is not None and t.next0_.is_char('=')):
                npt.end_token = t.next0_
        elif ((isinstance(t0, TextToken)) and t0.chars.is_letter and is_subval):
            # Sub-value mode: accumulate a free-form name up to the numbers.
            if (NumbersWithUnitToken.try_parse(t, add_units, False, False, False, False) is not None):
                return None
            npt = NounPhraseToken(t0, t0)
            t = t0.next0_
            while t is not None:
                if (t.whitespaces_before_count > 2):
                    break
                elif (not (isinstance(t, TextToken))):
                    break
                elif (not t.chars.is_letter):
                    br = BracketHelper.try_parse(t, BracketParseAttr.NO, 100)
                    if (br is not None):
                        t = br.end_token
                        npt.end_token = t
                    else:
                        break
                elif (NumbersWithUnitToken.try_parse(t, add_units, False, False, False, False) is not None):
                    break
                else:
                    npt.end_token = t
                t = t.next0_
        else:
            return None
    elif (NumberHelper.try_parse_real_number(t, True, False) is not None):
        # A bare number here means there is no textual "name" — not a measure.
        return None
    else:
        # A date at this position disqualifies the candidate.
        dtok = DateItemToken.try_attach(t, None, False)
        if (dtok is not None):
            return None
    t1 = npt.end_token
    t = npt.end_token
    name_ = MetaToken._new509(npt.begin_token, npt.end_token, npt.morph)
    units = None
    units2 = None
    internals_ = list()
    not0_ = False
    # Main scan: extend the name, collect nested measures ("internals_"),
    # pre-announced units and width/height/depth info until the numbers start.
    tt = t1.next0_
    first_pass3305 = True
    while True:
        if first_pass3305: first_pass3305 = False
        else: tt = tt.next0_
        if (not (tt is not None)): break
        if (tt.is_newline_before):
            break
        if (tt.is_table_control_char):
            break
        wrapminmax1617 = RefOutArgWrapper(minmax)
        tt2 = NumbersWithUnitToken._is_min_or_max(tt, wrapminmax1617)
        minmax = wrapminmax1617.value
        if (tt2 is not None):
            tt = tt2
            t = tt
            t1 = t
            continue
        # Linking verbs ("быть", "должен", "может", "составлять") are skipped;
        # a preceding "не" flips the negation flag.
        if ((tt.is_value("БЫТЬ", None) or tt.is_value("ДОЛЖЕН", None) or tt.is_value("ДОЛЖНЫЙ", None)) or tt.is_value("МОЖЕТ", None) or ((tt.is_value("СОСТАВЛЯТЬ", None) and not tt.get_morph_class_in_dictionary().is_adjective))):
            t = tt
            t1 = t
            if (tt.previous.is_value("НЕ", None)):
                not0_ = True
            continue
        www = NumbersWithUnitToken._try_parsewhl(tt)
        if (www is not None):
            whd = www
            tt = www.end_token
            t = tt
            t1 = t
            continue
        # "при <condition>" introduces a nested (internal) measure.
        if (tt.is_value("ПРИ", None)):
            mt1 = MeasureToken.try_parse(tt.next0_, add_units, False, False, True, False)
            if (mt1 is not None):
                internals_.append(mt1)
                tt = mt1.end_token
                t = tt
                t1 = t
                continue
            n1 = NumbersWithUnitToken.try_parse(tt.next0_, add_units, False, False, False, False)
            if (n1 is not None and len(n1.units) > 0):
                mt1 = MeasureToken._new1612(n1.begin_token, n1.end_token, n1)
                internals_.append(mt1)
                tt = mt1.end_token
                t = tt
                t1 = t
                continue
        if (tt.is_value("ПО", None) and tt.next0_ is not None and tt.next0_.is_value("U", None)):
            tt = tt.next0_
            t = tt
            t1 = t
            continue
        if (len(internals_) > 0):
            if (tt.is_char(':')):
                break
            mt1 = MeasureToken.try_parse(tt.next0_, add_units, False, False, True, False)
            if (mt1 is not None and mt1.reliable):
                internals_.append(mt1)
                tt = mt1.end_token
                t = tt
                t1 = t
                continue
        # A number spelled in words may act as an adjective inside the name.
        if ((isinstance(tt, NumberToken)) and tt.typ == NumberSpellingType.WORDS):
            npt3 = NounPhraseHelper.try_parse(tt, NounPhraseParseAttr.PARSENUMERICASADJECTIVE, 0, None)
            if (npt3 is not None):
                tt = npt3.end_token
                t1 = tt
                if (len(internals_) == 0):
                    name_.end_token = t1
                continue
        # Model-designation patterns like "ABC-123": keep them in the name.
        if (((tt.is_hiphen and not tt.is_whitespace_before and not tt.is_whitespace_after) and (isinstance(tt.next0_, NumberToken)) and (isinstance(tt.previous, TextToken))) and tt.previous.chars.is_all_upper):
            t = tt.next0_
            tt = t
            t1 = tt
            if (len(internals_) == 0):
                name_.end_token = t1
            continue
        if (((isinstance(tt, NumberToken)) and not tt.is_whitespace_before and (isinstance(tt.previous, TextToken))) and tt.previous.chars.is_all_upper):
            t = tt
            t1 = t
            if (len(internals_) == 0):
                name_.end_token = t1
            continue
        if ((((isinstance(tt, NumberToken)) and not tt.is_whitespace_after and tt.next0_.is_hiphen) and not tt.next0_.is_whitespace_after and (isinstance(tt.next0_.next0_, TextToken))) and tt.next0_.next0_.length_char > 2):
            tt = tt.next0_.next0_
            t = tt
            t1 = t
            npt1 = NounPhraseHelper.try_parse(tt, NounPhraseParseAttr.NO, 0, None)
            if (npt1 is not None and npt1.end_char > tt.end_char):
                tt = npt1.end_token
                t = tt
                t1 = t
            if (len(internals_) == 0):
                name_.end_token = t1
            continue
        if ((isinstance(tt, NumberToken)) and tt.previous is not None):
            # Special-case "USB <number>..." — consume the glued designation.
            if (tt.previous.is_value("USB", None)):
                t = tt
                t1 = t
                if (len(internals_) == 0):
                    name_.end_token = t1
                ttt = tt.next0_
                while ttt is not None:
                    if (ttt.is_whitespace_before):
                        break
                    if (ttt.is_char_of(",:")):
                        break
                    tt = ttt
                    t = tt
                    t1 = t
                    if (len(internals_) == 0):
                        name_.end_token = t1
                    ttt = ttt.next0_
                continue
            mt0 = NumbersWithUnitToken.try_parse(tt, add_units, False, False, False, False)
            if (mt0 is not None):
                npt1 = NounPhraseHelper.try_parse(tt, Utils.valToEnum((NounPhraseParseAttr.PARSENUMERICASADJECTIVE) | (NounPhraseParseAttr.PARSEPREPOSITION), NounPhraseParseAttr), 0, None)
                if (npt1 is not None and npt1.end_char > mt0.end_char):
                    tt = npt1.end_token
                    t = tt
                    t1 = t
                    if (len(internals_) == 0):
                        name_.end_token = t1
                    continue
                # The numbers proper start here — leave the name-scan loop.
                break
        # Units announced in advance: ", <units>" or "(<units>)".
        if (((tt.is_comma or tt.is_char('('))) and tt.next0_ is not None):
            www = NumbersWithUnitToken._try_parsewhl(tt.next0_)
            if (www is not None):
                whd = www
                tt = www.end_token
                t = tt
                t1 = t
                if (tt.next0_ is not None and tt.next0_.is_comma):
                    tt = tt.next0_
                    t1 = tt
                if (tt.next0_ is not None and tt.next0_.is_char(')')):
                    tt = tt.next0_
                    t1 = tt
                continue
            uu = UnitToken.try_parse_list(tt.next0_, add_units, False)
            if (uu is not None):
                t = uu[len(uu) - 1].end_token
                t1 = t
                units = uu
                if (tt.is_char('(') and t1.next0_ is not None and t1.next0_.is_char(')')):
                    tt = t1.next0_
                    t = tt
                    t1 = t
                    continue
                elif (t1.next0_ is not None and t1.next0_.is_char('(')):
                    # A second, parenthesized unit list (alternative units).
                    uu = UnitToken.try_parse_list(t1.next0_.next0_, add_units, False)
                    if (uu is not None and uu[len(uu) - 1].end_token.next0_ is not None and uu[len(uu) - 1].end_token.next0_.is_char(')')):
                        units2 = uu
                        tt = uu[len(uu) - 1].end_token.next0_
                        t = tt
                        t1 = t
                        continue
                    www = NumbersWithUnitToken._try_parsewhl(t1.next0_)
                    if (www is not None):
                        whd = www
                        tt = www.end_token
                        t = tt
                        t1 = t
                        continue
                if (uu is not None and len(uu) > 0 and not uu[0].is_doubt):
                    break
                if (t1.next0_ is not None):
                    if (t1.next0_.is_table_control_char or t1.is_newline_after):
                        break
                # Doubtful units — discard and keep scanning.
                units = (None)
        if (BracketHelper.can_be_start_of_sequence(tt, False, False) and not (isinstance(tt.next0_, NumberToken))):
            br = BracketHelper.try_parse(tt, BracketParseAttr.NO, 100)
            if (br is not None):
                tt = br.end_token
                t = tt
                t1 = t
                continue
        if (tt.is_value("НЕ", None) and tt.next0_ is not None):
            mc = tt.next0_.get_morph_class_in_dictionary()
            if (mc.is_adverb or mc.is_misc):
                break
            continue
        if (tt.is_value("ЯМЗ", None)):
            pass
        npt2 = NounPhraseHelper.try_parse(tt, Utils.valToEnum((NounPhraseParseAttr.PARSEPREPOSITION) | (NounPhraseParseAttr.IGNOREBRACKETS) | (NounPhraseParseAttr.PARSEPRONOUNS), NounPhraseParseAttr), 0, None)
        if (npt2 is None):
            if (tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction):
                to = NumbersWithUnitToken.M_TERMINS.try_parse(tt, TerminParseAttr.NO)
                if (to is not None):
                    if ((isinstance(to.end_token.next0_, TextToken)) and to.end_token.next0_.is_letters):
                        pass
                    else:
                        break
                t1 = tt
                continue
            mc = tt.get_morph_class_in_dictionary()
            if (((isinstance(tt, TextToken)) and tt.chars.is_letter and tt.length_char > 1) and (((tt.chars.is_all_upper or mc.is_adverb or mc.is_undefined) or mc.is_adjective))):
                uu = UnitToken.try_parse_list(tt, add_units, False)
                if (uu is not None):
                    if (uu[0].length_char > 1 or len(uu) > 1):
                        units = uu
                        t = uu[len(uu) - 1].end_token
                        t1 = t
                        break
                t = tt
                t1 = t
                if (len(internals_) == 0):
                    name_.end_token = tt
                continue
            if (tt.is_comma):
                continue
            if (tt.is_char('.')):
                if (not MiscHelper.can_be_start_of_sentence(tt.next0_)):
                    continue
                uu = UnitToken.try_parse_list(tt.next0_, add_units, False)
                if (uu is not None):
                    if (uu[0].length_char > 2 or len(uu) > 1):
                        units = uu
                        t = uu[len(uu) - 1].end_token
                        t1 = t
                        break
            break
        tt = npt2.end_token
        t = tt
        t1 = t
        if (len(internals_) > 0):
            pass
        elif (t.is_value("ПРЕДЕЛ", None) or t.is_value("ГРАНИЦА", None) or t.is_value("ДИАПАЗОН", None)):
            pass
        elif (t.chars.is_letter):
            name_.end_token = t1
    # Second scan: skip separators (':' ',' '_'), trailing unit lists and
    # dash fillers between the name and the numeric part; a ':' may also
    # introduce a list of sub-measures parsed recursively.
    t11 = t1
    t1 = t1.next0_
    first_pass3306 = True
    while True:
        if first_pass3306: first_pass3306 = False
        else: t1 = t1.next0_
        if (not (t1 is not None)): break
        if (t1.is_table_control_char):
            pass
        elif (t1.is_char_of(":,_")):
            if (is_resctriction):
                return None
            www = NumbersWithUnitToken._try_parsewhl(t1.next0_)
            if (www is not None):
                whd = www
                t = www.end_token
                t1 = t
                continue
            uu = UnitToken.try_parse_list(t1.next0_, add_units, False)
            if (uu is not None):
                if (uu[0].length_char > 1 or len(uu) > 1):
                    units = uu
                    t = uu[len(uu) - 1].end_token
                    t1 = t
                    continue
            if (t1.is_char(':')):
                # "name: sub1; sub2; ..." — a list of sub-measures.
                li = list()
                ttt = t1.next0_
                first_pass3307 = True
                while True:
                    if first_pass3307: first_pass3307 = False
                    else: ttt = ttt.next0_
                    if (not (ttt is not None)): break
                    if (ttt.is_hiphen or ttt.is_table_control_char):
                        continue
                    if ((isinstance(ttt, TextToken)) and not ttt.chars.is_letter):
                        continue
                    mt1 = MeasureToken.try_parse(ttt, add_units, True, True, False, True)
                    if (mt1 is None):
                        break
                    li.append(mt1)
                    ttt = mt1.end_token
                    if (ttt.next0_ is not None and ttt.next0_.is_char(';')):
                        ttt = ttt.next0_
                    if (ttt.is_char(';')):
                        pass
                    elif (ttt.is_newline_after and mt1.is_newline_before):
                        pass
                    else:
                        break
                if (len(li) > 1):
                    # Wrap the sub-measures; each gets "name (own-name)".
                    res0 = MeasureToken._new1618(t0, li[len(li) - 1].end_token, li, True)
                    if (internals_ is not None and len(internals_) > 0):
                        res0.internal_ex = internals_[0]
                    nam = MiscHelper.get_text_value_of_meta_token(name_, GetTextAttr.FIRSTNOUNGROUPTONOMINATIVE)
                    li[0].begin_token = t0
                    for v in li:
                        v.name = "{0} ({1})".format(nam, Utils.ifNotNull(v.name, "")).strip()
                        if (v.nums is not None and len(v.nums.units) == 0 and units is not None):
                            v.nums.units = units
                    return res0
        elif (t1.is_hiphen and t1.is_whitespace_after and t1.is_whitespace_before):
            pass
        elif (t1.is_hiphen and t1.next0_ is not None and t1.next0_.is_char('(')):
            pass
        else:
            break
    if (t1 is None):
        return None
    # Parse the numeric part (possibly several values).
    mts = NumbersWithUnitToken.try_parse_multi(t1, add_units, False, not0_, True, is_resctriction)
    if (mts is None):
        if (units is not None and len(units) > 0):
            # NOTE(review): the "t1 is None" tests below are unreachable after
            # the early return above — apparently a translator artifact kept
            # from the C# original.
            if (t1 is None or t1.previous.is_char(':')):
                mts = list()
                if (t1 is None):
                    t1 = t11
                    while t1 is not None and t1.next0_ is not None:
                        pass
                        t1 = t1.next0_
                else:
                    t1 = t1.previous
                mts.append(NumbersWithUnitToken._new1619(t0, t1, math.nan))
        if (mts is None):
            return None
    mt = mts[0]
    if (mt.begin_token == mt.end_token and not (isinstance(mt.begin_token, NumberToken))):
        return None
    if (not is_subval and name_.begin_token.morph.class0_.is_preposition):
        name_.begin_token = name_.begin_token.next0_
    if (mt.whl is not None):
        whd = mt.whl
    # Trim width/height/depth markers and unit lists off the end of the name
    # (bounded to 10 passes).
    for kk in range(10):
        if (whd is not None and whd.end_token == name_.end_token):
            name_.end_token = whd.begin_token.previous
            continue
        if (units is not None):
            if (units[len(units) - 1].end_token == name_.end_token):
                name_.end_token = units[0].begin_token.previous
                continue
        break
    if (len(mts) > 1 and len(internals_) == 0):
        # Several values, e.g. dimensions — emit one parent with internals.
        if (len(mt.units) == 0):
            if (units is not None):
                for m in mts:
                    m.units = units
        res1 = MeasureToken._new1620(t0, mts[len(mts) - 1].end_token, name_.morph, True)
        res1.name = MiscHelper.get_text_value_of_meta_token(name_, GetTextAttr.FIRSTNOUNGROUPTONOMINATIVE)
        k = 0
        while k < len(mts):
            ttt = MeasureToken._new1612(mts[k].begin_token, mts[k].end_token, mts[k])
            if (whd is not None):
                nams = Utils.asObjectOrNull(whd.tag, list)
                if (k < len(nams)):
                    ttt.name = nams[k]
            res1.internals.append(ttt)
            k += 1
        tt1 = res1.end_token.next0_
        # Optional trailing tolerance: "± <number> <units>".
        if (tt1 is not None and tt1.is_char('±')):
            nn = NumbersWithUnitToken._try_parse(tt1, add_units, True, False, False)
            if (nn is not None and nn.plus_minus_percent):
                res1.end_token = nn.end_token
                res1.nums = nn
                if (len(nn.units) > 0 and units is None and len(mt.units) == 0):
                    for m in mts:
                        m.units = nn.units
        return res1
    if (not mt.is_whitespace_before):
        if (mt.begin_token.previous is None):
            return None
        if (mt.begin_token.previous.is_char_of(":),") or mt.begin_token.previous.is_table_control_char or mt.begin_token.previous.is_value("IP", None)):
            pass
        elif (mt.begin_token.is_hiphen and len(mt.units) > 0 and not mt.units[0].is_doubt):
            pass
        else:
            return None
    if (len(mt.units) == 0 and units is not None):
        mt.units = units
        # Move negative-power units (e.g. the "per ..." part) to the divisor.
        if (mt.div_num is not None and len(units) > 1 and len(mt.div_num.units) == 0):
            i = 1
            while i < len(units):
                if (units[i].pow0_ == -1):
                    j = i
                    while j < len(units):
                        mt.div_num.units.append(units[j])
                        units[j].pow0_ = (- units[j].pow0_)
                        j += 1
                    del mt.units[i:i+len(units) - i]
                    break
                i += 1
    # A leading min/max marker turns a single value into a bound.
    if ((minmax < 0) and mt.single_val is not None):
        mt.from_val = mt.single_val
        mt.from_include = True
        mt.single_val = (None)
    if (minmax > 0 and mt.single_val is not None):
        mt.to_val = mt.single_val
        mt.to_include = True
        mt.single_val = (None)
    if (len(mt.units) == 0):
        units = UnitToken.try_parse_list(mt.end_token.next0_, add_units, True)
        if (units is None):
            if (can_units_absent):
                pass
            else:
                return None
        else:
            mt.units = units
    res = MeasureToken._new1622(t0, mt.end_token, name_.morph, internals_)
    # Glue a hyphenated prefix word back onto the name ("xxx-<name>").
    if (((not t0.is_whitespace_before and t0.previous is not None and t0 == name_.begin_token) and t0.previous.is_hiphen and not t0.previous.is_whitespace_before) and (isinstance(t0.previous.previous, TextToken))):
        name_.begin_token = res.begin_token = name_.begin_token.previous.previous
    res.name = MiscHelper.get_text_value_of_meta_token(name_, (GetTextAttr.FIRSTNOUNGROUPTONOMINATIVE if not is_subval else GetTextAttr.NO))
    res.nums = mt
    for u in res.nums.units:
        if (u.keyword is not None):
            if (u.keyword.begin_char >= res.begin_char):
                res.reliable = True
    res.__parse_internals(add_units)
    if (len(res.internals) > 0 or not can_be_set):
        return res
    # Optionally attach a second value with different units as a "set".
    t1 = res.end_token.next0_
    if (t1 is not None and t1.is_comma_and):
        t1 = t1.next0_
    mts1 = NumbersWithUnitToken.try_parse_multi(t1, add_units, False, False, False, False)
    if ((mts1 is not None and len(mts1) == 1 and (t1.whitespaces_before_count < 3)) and len(mts1[0].units) > 0 and not UnitToken.can_be_equals(mts[0].units, mts1[0].units)):
        res.is_set = True
        res.nums = (None)
        res.internals.append(MeasureToken._new1612(mt.begin_token, mt.end_token, mt))
        res.internals.append(MeasureToken._new1612(mts1[0].begin_token, mts1[0].end_token, mts1[0]))
        res.end_token = mts1[0].end_token
    return res
def __get_name_without_brackets(begin: 'Token', end: 'Token', normalize_first_noun_group: bool = False, normal_first_group_single: bool = False, ignore_geo_referent: bool = False) -> str:
    """Build a textual name from the token span [begin..end], stripping an
    enclosing bracket pair and optionally normalizing the leading noun group
    to nominative case.

    Params:
      normalize_first_noun_group -- normalize the first noun phrase via
        get_normal_case_text before appending the rest of the span verbatim.
      normal_first_group_single  -- normalize that phrase to singular number.
      ignore_geo_referent        -- forwarded to ProperNameHelper.get_name_ex.
    Returns the assembled string, or None for an empty/whitespace-only result.

    NOTE(review): indentation reconstructed from a collapsed source; verify
    against upstream pullenti.
    """
    res = None
    # Drop a bracket pair that encloses the whole span.
    if (BracketHelper.can_be_start_of_sequence(begin, False, False) and BracketHelper.can_be_end_of_sequence(end, False, begin, False)):
        begin = begin.next0_
        end = end.previous
    if (normalize_first_noun_group and not begin.morph.class0_.is_preposition):
        npt = NounPhraseHelper.try_parse(begin, NounPhraseParseAttr.REFERENTCANBENOUN, 0, None)
        if (npt is not None):
            # Reject a phrase whose noun is out-of-dictionary and bare.
            if (npt.noun.get_morph_class_in_dictionary().is_undefined and len(npt.adjectives) == 0):
                npt = (None)
        if (npt is not None and npt.end_token.end_char > end.end_char):
            npt = (None)
        if (npt is not None):
            res = npt.get_normal_case_text(None, (MorphNumber.SINGULAR if normal_first_group_single else MorphNumber.UNDEFINED), MorphGender.UNDEFINED, False)
            te = npt.end_token.next0_
            # Handle ", <participle>" agreeing with the phrase in gender,
            # case and number: normalize it as an adjective and append.
            if (((te is not None and te.next0_ is not None and te.is_comma) and (isinstance(te.next0_, TextToken)) and te.next0_.end_char <= end.end_char) and te.next0_.morph.class0_.is_verb and te.next0_.morph.class0_.is_adjective):
                for it in te.next0_.morph.items:
                    if (it.gender == npt.morph.gender or ((it.gender) & (npt.morph.gender)) != (MorphGender.UNDEFINED)):
                        if (not ((it.case_) & npt.morph.case_).is_undefined):
                            if (it.number == npt.morph.number or ((it.number) & (npt.morph.number)) != (MorphNumber.UNDEFINED)):
                                var = te.next0_.term
                                if (isinstance(it, MorphWordForm)):
                                    var = it.normal_case
                                # Re-inflect as an adjective with the phrase's
                                # gender/number/language.
                                bi = MorphBaseInfo._new492(MorphClass.ADJECTIVE, npt.morph.gender, npt.morph.number, npt.morph.language)
                                var = MorphologyService.get_wordform(var, bi)
                                if (var is not None):
                                    res = "{0}, {1}".format(res, var)
                                    te = te.next0_.next0_
                                break
            # Append the remainder of the span verbatim.
            if (te is not None and te.end_char <= end.end_char):
                s = ProperNameHelper.get_name_ex(te, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, ignore_geo_referent)
                if (not Utils.isNullOrEmpty(s)):
                    if (not str.isalnum(s[0])):
                        res = "{0}{1}".format(res, s)
                    else:
                        res = "{0} {1}".format(res, s)
        elif ((isinstance(begin, TextToken)) and begin.chars.is_cyrillic_letter):
            # No noun phrase: normalize a single dictionary word instead.
            mm = begin.get_morph_class_in_dictionary()
            if (not mm.is_undefined):
                res = begin.get_normal_case_text(mm, MorphNumber.UNDEFINED, MorphGender.UNDEFINED, False)
                if (begin.end_char < end.end_char):
                    res = "{0} {1}".format(res, ProperNameHelper.get_name_ex(begin.next0_, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, False))
    if (res is None):
        res = ProperNameHelper.get_name_ex(begin, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, ignore_geo_referent)
    if (not Utils.isNullOrEmpty(res)):
        # Strip trailing '*' and whitespace characters.
        k = 0
        i = len(res) - 1
        while i >= 0:
            if (res[i] == '*' or Utils.isWhitespace(res[i])):
                pass
            else:
                break
            i -= 1
            k += 1
        if (k > 0):
            if (k == len(res)):
                return None
            res = res[0:0 + len(res) - k]
    return res
def create(t: 'Token', names: 'TerminCollection') -> 'BlockLine':
    """Analyze one physical line starting at *t* and classify it as a
    document-structure element (chapter, introduction, conclusion,
    literature list, table of contents, ...).

    Params:
      t     -- first token of the line (None yields None).
      names -- optional collection of known block names.
    Returns a BlockLine with typ/number_end/word statistics filled in.

    NOTE(review): indentation reconstructed from a collapsed source; verify
    against upstream pullenti.
    """
    if (t is None):
        return None
    res = BlockLine(t, t)
    # The line extends to the next newline.
    tt = t
    while tt is not None:
        if (tt != t and tt.is_newline_before):
            break
        else:
            res.end_token = tt
        tt = tt.next0_
    # Consume a leading section number: arabic or roman numerals,
    # optionally followed by a dot.
    nums = 0
    while t is not None and t.next0_ is not None and t.end_char <= res.end_char:
        if (isinstance(t, NumberToken)):
            pass
        else:
            rom = NumberHelper.try_parse_roman(t)
            if (rom is not None and rom.end_token.next0_ is not None):
                t = rom.end_token
            else:
                break
        if (t.next0_.is_char('.')):
            pass
        elif ((isinstance(t.next0_, TextToken)) and not t.next0_.chars.is_all_lower):
            pass
        else:
            break
        res.number_end = t
        t = t.next0_
        if (t.is_char('.') and t.next0_ is not None):
            res.number_end = t
            t = t.next0_
        if (t.is_newline_before):
            return res
        nums += 1
    # Try the internal ontology of block-heading keywords.
    tok = BlockLine.__m_ontology.try_parse(t, TerminParseAttr.NO)
    if (tok is None):
        npt1 = NounPhraseHelper.try_parse(t, NounPhraseParseAttr.NO, 0, None)
        if (npt1 is not None and npt1.end_token != npt1.begin_token):
            tok = BlockLine.__m_ontology.try_parse(npt1.noun.begin_token, TerminParseAttr.NO)
    if (tok is not None):
        # A preceding ':' means this is not a heading.
        if (t.previous is not None and t.previous.is_char(':')):
            tok = (None)
    if (tok is not None):
        typ_ = Utils.valToEnum(tok.termin.tag, BlkTyps)
        if (typ_ == BlkTyps.CONSLUSION):
            # "Conclusion" must stand alone or be "<prep> <chapter-word>".
            if (t.is_newline_after):
                pass
            elif (t.next0_ is not None and t.next0_.morph.class0_.is_preposition and t.next0_.next0_ is not None):
                tok2 = BlockLine.__m_ontology.try_parse(t.next0_.next0_, TerminParseAttr.NO)
                if (tok2 is not None and (Utils.valToEnum(tok2.termin.tag, BlkTyps)) == BlkTyps.CHAPTER):
                    pass
                else:
                    tok = (None)
            else:
                tok = (None)
        if (t.kit.base_language != t.morph.language):
            tok = (None)
        # "Index/contents" heading must not be a genitive noun phrase start.
        if (typ_ == BlkTyps.INDEX and not t.is_value("ОГЛАВЛЕНИЕ", None)):
            if (not t.is_newline_after and t.next0_ is not None):
                npt = NounPhraseHelper.try_parse(t.next0_, NounPhraseParseAttr.NO, 0, None)
                if (npt is not None and npt.is_newline_after and npt.morph.case_.is_genitive):
                    tok = (None)
                elif (npt is None):
                    tok = (None)
        if ((typ_ == BlkTyps.INTRO and tok is not None and not tok.is_newline_after) and t.is_value("ВВЕДЕНИЕ", None)):
            npt = NounPhraseHelper.try_parse(t.next0_, NounPhraseParseAttr.NO, 0, None)
            if (npt is not None and npt.morph.case_.is_genitive):
                tok = (None)
    if (tok is not None):
        if (res.number_end is None):
            res.number_end = tok.end_token
            if (res.number_end.end_char > res.end_char):
                res.end_token = res.number_end
        res.typ = typ_
        t = tok.end_token
        if (t.next0_ is not None and t.next0_.is_char_of(":.")):
            t = t.next0_
            res.end_token = t
        if (t.is_newline_after or t.next0_ is None):
            return res
        t = t.next0_
        # "§ <number>" after the keyword marks a chapter.
        if (t.is_char('§') and (isinstance(t.next0_, NumberToken))):
            res.typ = BlkTyps.CHAPTER
            res.number_end = t
            t = t.next0_
        if (names is not None):
            tok2 = names.try_parse(t, TerminParseAttr.NO)
            if (tok2 is not None and tok2.end_token.is_newline_after):
                res.end_token = tok2.end_token
                res.is_exist_name = True
        if (res.typ == BlkTyps.UNDEFINED):
            # Recurse past the number to refine an undefined heading type.
            li2 = BlockLine.create((None if res.number_end is None else res.number_end.next0_), None)
            if (li2 is not None and ((li2.typ == BlkTyps.LITERATURE or li2.typ == BlkTyps.INTRO or li2.typ == BlkTyps.CONSLUSION))):
                res.typ = li2.typ
            else:
                res.typ = BlkTyps.CHAPTER
        return res
    # No ontology keyword: collect word statistics for the line.
    t1 = res.end_token
    if ((((isinstance(t1, NumberToken)) or t1.is_char('.'))) and t1.previous is not None):
        t1 = t1.previous
        if (t1.is_char('.')):
            # A "..... <page>" tail marks a table-of-contents item.
            res.has_content_item_tail = True
            while t1 is not None and t1.begin_char > res.begin_char:
                if (not t1.is_char('.')):
                    break
                t1 = t1.previous
    res.is_all_upper = True
    while t is not None and t.end_char <= t1.end_char:
        if (not (isinstance(t, TextToken)) or not t.chars.is_letter):
            res.not_words += 1
        else:
            mc = t.get_morph_class_in_dictionary()
            if (mc.is_undefined):
                res.not_words += 1
            elif (t.length_char > 2):
                res.words += 1
            if (not t.chars.is_all_upper):
                res.is_all_upper = False
            if (t.is_pure_verb):
                # "...ING" exclusion — presumably filters English gerunds.
                if (not t.term.endswith("ING")):
                    res.has_verb = True
        t = t.next0_
    if (res.typ == BlkTyps.UNDEFINED):
        npt = NounPhraseHelper.try_parse((res.begin_token if res.number_end is None else res.number_end.next0_), NounPhraseParseAttr.NO, 0, None)
        if (npt is not None):
            # "Characteristics/Contents of <genitives>..." => introduction.
            if (npt.noun.is_value("ХАРАКТЕРИСТИКА", None) or npt.noun.is_value("СОДЕРЖАНИЕ", "ЗМІСТ")):
                ok = True
                tt = npt.end_token.next0_
                first_pass3032 = True
                while True:
                    if first_pass3032: first_pass3032 = False
                    else: tt = tt.next0_
                    if (not (tt is not None and tt.end_char <= res.end_char)): break
                    if (tt.is_char('.')):
                        continue
                    npt2 = NounPhraseHelper.try_parse(tt, NounPhraseParseAttr.NO, 0, None)
                    if (npt2 is None or not npt2.morph.case_.is_genitive):
                        ok = False
                        break
                    tt = npt2.end_token
                    if (tt.end_char > res.end_char):
                        res.end_token = tt
                        if (not tt.is_newline_after):
                            while res.end_token.next0_ is not None:
                                if (res.end_token.is_newline_after):
                                    break
                                res.end_token = res.end_token.next0_
                if (ok):
                    res.typ = BlkTyps.INTRO
                    res.is_exist_name = True
            # "Conclusions / research results ..." => conclusion.
            elif (npt.noun.is_value("ВЫВОД", "ВИСНОВОК") or npt.noun.is_value("РЕЗУЛЬТАТ", "ДОСЛІДЖЕННЯ")):
                ok = True
                tt = npt.end_token.next0_
                first_pass3033 = True
                while True:
                    if first_pass3033: first_pass3033 = False
                    else: tt = tt.next0_
                    if (not (tt is not None and tt.end_char <= res.end_char)): break
                    if (tt.is_char_of(",.") or tt.is_and):
                        continue
                    npt1 = NounPhraseHelper.try_parse(tt, NounPhraseParseAttr.NO, 0, None)
                    if (npt1 is not None):
                        if (npt1.noun.is_value("РЕЗУЛЬТАТ", "ДОСЛІДЖЕННЯ") or npt1.noun.is_value("РЕКОМЕНДАЦИЯ", "РЕКОМЕНДАЦІЯ") or npt1.noun.is_value("ИССЛЕДОВАНИЕ", "ДОСЛІДЖЕННЯ")):
                            tt = npt1.end_token
                            if (tt.end_char > res.end_char):
                                res.end_token = tt
                                if (not tt.is_newline_after):
                                    while res.end_token.next0_ is not None:
                                        if (res.end_token.is_newline_after):
                                            break
                                        res.end_token = res.end_token.next0_
                            continue
                    ok = False
                    break
                if (ok):
                    res.typ = BlkTyps.CONSLUSION
                    res.is_exist_name = True
        # "List / index / publications ..." => literature section.
        if (res.typ == BlkTyps.UNDEFINED and npt is not None and npt.end_char <= res.end_char):
            ok = False
            publ = 0
            if (BlockLine.__is_pub(npt)):
                ok = True
                publ = 1
            elif ((npt.noun.is_value("СПИСОК", None) or npt.noun.is_value("УКАЗАТЕЛЬ", "ПОКАЖЧИК") or npt.noun.is_value("ПОЛОЖЕНИЕ", "ПОЛОЖЕННЯ")) or npt.noun.is_value("ВЫВОД", "ВИСНОВОК") or npt.noun.is_value("РЕЗУЛЬТАТ", "ДОСЛІДЖЕННЯ")):
                if (npt.end_char == res.end_char):
                    return None
                ok = True
            if (ok):
                if (npt.begin_token == npt.end_token and npt.noun.is_value("СПИСОК", None) and npt.end_char == res.end_char):
                    ok = False
                tt = npt.end_token.next0_
                first_pass3034 = True
                while True:
                    if first_pass3034: first_pass3034 = False
                    else: tt = tt.next0_
                    if (not (tt is not None and tt.end_char <= res.end_char)): break
                    if (tt.is_char_of(",.:") or tt.is_and or tt.morph.class0_.is_preposition):
                        continue
                    if (tt.is_value("ОТРАЖЕНЫ", "ВІДОБРАЖЕНІ")):
                        continue
                    npt = NounPhraseHelper.try_parse(tt, NounPhraseParseAttr.NO, 0, None)
                    if (npt is None):
                        ok = False
                        break
                    if (((BlockLine.__is_pub(npt) or npt.noun.is_value("РАБОТА", "РОБОТА") or npt.noun.is_value("ИССЛЕДОВАНИЕ", "ДОСЛІДЖЕННЯ")) or npt.noun.is_value("АВТОР", None) or npt.noun.is_value("ТРУД", "ПРАЦЯ")) or npt.noun.is_value("ТЕМА", None) or npt.noun.is_value("ДИССЕРТАЦИЯ", "ДИСЕРТАЦІЯ")):
                        tt = npt.end_token
                        if (BlockLine.__is_pub(npt)):
                            publ += 1
                        if (tt.end_char > res.end_char):
                            res.end_token = tt
                            if (not tt.is_newline_after):
                                while res.end_token.next0_ is not None:
                                    if (res.end_token.is_newline_after):
                                        break
                                    res.end_token = res.end_token.next0_
                        continue
                    ok = False
                    break
                if (ok):
                    res.typ = BlkTyps.LITERATURE
                    res.is_exist_name = True
                    # Without explicit publication words, only the last third
                    # of the document may host a literature section.
                    if (publ == 0 and (res.end_char < ((math.floor((len(res.kit.sofa.text) * 2) / 3))))):
                        if (res.number_end is not None):
                            res.typ = BlkTyps.MISC
                        else:
                            res.typ = BlkTyps.UNDEFINED
    return res
def try_parse(t: 'Token', add_units: 'TerminCollection', prev: 'UnitToken', parse_unknown_units: bool = False) -> 'UnitToken':
    """Try to recognize one measurement unit at token *t*.

    Params:
      t                   -- start token (None yields None).
      add_units           -- optional user-defined unit terms.
      prev                -- unit parsed immediately before, if any; enables
                             context rules ('в' as divider, 'м' as minutes).
      parse_unknown_units -- when True, accept a short unknown word as a unit.
    Returns a UnitToken (pow0_ is negative for divider units), or None.

    NOTE(review): indentation reconstructed from a collapsed source; verify
    against upstream pullenti.
    """
    if (t is None):
        return None
    t0 = t
    pow0__ = 1
    is_neg = False
    # Divider markers: '/', '\\', "на", "of", "per" => negative power.
    if ((t.is_char_of("\\/") or t.is_value("НА", None) or t.is_value("OF", None)) or t.is_value("PER", None)):
        is_neg = True
        t = t.next0_
    elif (t.is_value("В", None) and prev is not None):
        # Russian "в" acts as a divider only after a previous unit.
        is_neg = True
        t = t.next0_
    elif (MeasureHelper.is_mult_char(t)):
        t = t.next0_
    tt = Utils.asObjectOrNull(t, TextToken)
    if (tt is None):
        return None
    # "кв."/"квадратный" => power 2; "куб."/"кубический" => power 3.
    if (tt.term == "КВ" or tt.term == "КВАДР" or tt.is_value("КВАДРАТНЫЙ", None)):
        pow0__ = 2
        tt = (Utils.asObjectOrNull(tt.next0_, TextToken))
        if (tt is not None and tt.is_char('.')):
            tt = (Utils.asObjectOrNull(tt.next0_, TextToken))
        if (tt is None):
            return None
    elif (tt.term == "КУБ" or tt.term == "КУБИЧ" or tt.is_value("КУБИЧЕСКИЙ", None)):
        pow0__ = 3
        tt = (Utils.asObjectOrNull(tt.next0_, TextToken))
        if (tt is not None and tt.is_char('.')):
            tt = (Utils.asObjectOrNull(tt.next0_, TextToken))
        if (tt is None):
            return None
    elif (tt.term == "µ"):
        # 'µ' prefix: parse the base unit, then swap in its micro- variant
        # ("мк" + cyrillic name).
        res = UnitToken.try_parse(tt.next0_, add_units, prev, False)
        if (res is not None):
            for u in UnitsHelper.UNITS:
                if (u.factor == UnitsFactors.MICRO and Utils.compareStrings("мк" + u.name_cyr, res.unit.name_cyr, True) == 0):
                    res.unit = u
                    res.begin_token = tt
                    res.pow0_ = pow0__
                    if (is_neg):
                        res.pow0_ = (-pow0__)
                    return res
    # Main path: match against the global unit-term dictionary.
    toks = UnitsHelper.TERMINS.try_parse_all(tt, TerminParseAttr.NO)
    if (toks is not None):
        if ((prev is not None and tt == t0 and len(toks) == 1) and t.is_whitespace_before):
            return None
        # A one-token match that is also a preposition is suspicious:
        # reject if a noun phrase / number / another unit follows.
        if (toks[0].begin_token == toks[0].end_token and tt.morph.class0_.is_preposition and (tt.whitespaces_after_count < 3)):
            if (NounPhraseHelper.try_parse(tt, NounPhraseParseAttr.PARSEPREPOSITION, 0, None) is not None):
                return None
            if (isinstance(tt.next0_, NumberToken)):
                if (tt.next0_.typ != NumberSpellingType.DIGIT):
                    return None
            nex = UnitToken.try_parse(tt.next0_, add_units, None, False)
            if (nex is not None):
                return None
        # Lowercase "м"/"m" after a length unit means minutes, not meters.
        if (toks[0].begin_token == toks[0].end_token and ((toks[0].begin_token.is_value("М", None) or toks[0].begin_token.is_value("M", None))) and toks[0].begin_token.chars.is_all_lower):
            if (prev is not None and prev.unit is not None and prev.unit.kind == MeasureKind.LENGTH):
                res = UnitToken._new1626(t0, toks[0].end_token, UnitsHelper.UMINUTE)
                res.pow0_ = pow0__
                if (is_neg):
                    res.pow0_ = (-pow0__)
                return res
        uts = list()
        for tok in toks:
            res = UnitToken._new1626(t0, tok.end_token, Utils.asObjectOrNull(tok.termin.tag, Unit))
            res.pow0_ = pow0__
            if (is_neg):
                res.pow0_ = (-pow0__)
            # Lowercase first letter on a mega-unit (x1e6) suggests the
            # milli- variant was actually meant (e.g. "мВ" vs "МВ").
            if (res.unit.base_multiplier == 1000000 and (isinstance(t0, TextToken)) and str.islower(t0.get_source_text()[0])):
                for u in UnitsHelper.UNITS:
                    if (u.factor == UnitsFactors.MILLI and Utils.compareStrings(u.name_cyr, res.unit.name_cyr, True) == 0):
                        res.unit = u
                        break
            res.__correct()
            res.__check_doubt()
            uts.append(res)
        # Prefer the variant whose keyword lies closest to the unit.
        max0_ = 0
        best = None
        for ut in uts:
            if (ut.keyword is not None):
                if (ut.keyword.begin_char >= max0_):
                    max0_ = ut.keyword.begin_char
                    best = ut
        if (best is not None):
            return best
        for ut in uts:
            if (not ut.is_doubt):
                return ut
        return uts[0]
    # Degrees: '°'/'º' or the OCR-style "<О>" markup.
    t1 = None
    if (t.is_char_of("º°")):
        t1 = t
    elif ((t.is_char('<') and t.next0_ is not None and t.next0_.next0_ is not None) and t.next0_.next0_.is_char('>') and ((t.next0_.is_value("О", None) or t.next0_.is_value("O", None) or (((isinstance(t.next0_, NumberToken)) and t.next0_.value == "0"))))):
        t1 = t.next0_.next0_
    if (t1 is not None):
        res = UnitToken._new1626(t0, t1, UnitsHelper.UGRADUS)
        res.__check_doubt()
        t = t1.next0_
        if (t is not None and t.is_comma):
            t = t.next0_
        if (t is not None and t.is_value("ПО", None)):
            t = t.next0_
        # Optional Celsius / Fahrenheit qualifier.
        if (isinstance(t, TextToken)):
            vv = t.term
            if (vv == "C" or vv == "С" or vv.startswith("ЦЕЛЬС")):
                res.unit = UnitsHelper.UGRADUSC
                res.is_doubt = False
                res.end_token = t
            if (vv == "F" or vv.startswith("ФАР")):
                res.unit = UnitsHelper.UGRADUSF
                res.is_doubt = False
                res.end_token = t
        return res
    # "оС"/"oC" written without the degree sign.
    if ((isinstance(t, TextToken)) and ((t.is_value("ОС", None) or t.is_value("OC", None)))):
        str0_ = t.get_source_text()
        if (str0_ == "оС" or str0_ == "oC"):
            res = UnitToken._new1738(t, t, UnitsHelper.UGRADUSC, False)
            return res
    # '%' — possibly "% об." (alcohol by volume).
    if (t.is_char('%')):
        tt1 = t.next0_
        if (tt1 is not None and tt1.is_char('(')):
            tt1 = tt1.next0_
        if ((isinstance(tt1, TextToken)) and tt1.term.startswith("ОБ")):
            re = UnitToken._new1626(t, tt1, UnitsHelper.UALCO)
            if (re.end_token.next0_ is not None and re.end_token.next0_.is_char('.')):
                re.end_token = re.end_token.next0_
            if (re.end_token.next0_ is not None and re.end_token.next0_.is_char(')') and t.next0_.is_char('(')):
                re.end_token = re.end_token.next0_
            return re
        return UnitToken._new1626(t, t, UnitsHelper.UPERCENT)
    # User-supplied units.
    if (add_units is not None):
        tok = add_units.try_parse(t, TerminParseAttr.NO)
        if (tok is not None):
            res = UnitToken._new1741(t0, tok.end_token, Utils.asObjectOrNull(tok.termin.tag, UnitReferent))
            if (tok.end_token.next0_ is not None and tok.end_token.next0_.is_char('.')):
                tok.end_token = tok.end_token.next0_
            res.pow0_ = pow0__
            if (is_neg):
                res.pow0_ = (-pow0__)
            res.__correct()
            return res
    # Last resort: accept a short unknown word as a unit, if allowed.
    if (not parse_unknown_units):
        return None
    if ((t.whitespaces_before_count > 2 or not t.chars.is_letter or t.length_char > 5) or not (isinstance(t, TextToken))):
        return None
    if (MiscHelper.can_be_start_of_sentence(t)):
        return None
    t1 = t
    # NOTE(review): this branch assigns t1 = t again, so a trailing '.' is
    # never absorbed — looks like it was meant to be t1 = t.next0_; confirm
    # against the upstream source before changing.
    if (t.next0_ is not None and t.next0_.is_char('.')):
        t1 = t
    ok = False
    if (t1.next0_ is None or t1.whitespaces_after_count > 2):
        ok = True
    elif (t1.next0_.is_comma or t1.next0_.is_char_of("\\/") or t1.next0_.is_table_control_char):
        ok = True
    elif (MeasureHelper.is_mult_char(t1.next0_)):
        ok = True
    if (not ok):
        return None
    mc = t.get_morph_class_in_dictionary()
    if (mc.is_undefined):
        pass
    elif (t.length_char > 7):
        return None
    res1 = UnitToken._new1742(t0, t1, pow0__, True)
    res1.unknown_name = t.get_source_text()
    res1.__correct()
    return res1
def main(args: typing.List[str]) -> None:
    """Demo of the Pullenti engine: empty processor, full processor, keyword analyzer.

    Initializes the RU/EN engine once, prints the noun groups found in a sample
    text, then runs the full analysis and the keyword analyzer and dumps the
    resulting entities and keywords to stdout.
    """
    sw = Stopwatch()
    # One-time initialization: must be done once before any text processing.
    print("Initializing ... ", end="", flush=True)
    # Bring up the engine and all available analyzers for Russian + English.
    Sdk.initialize((MorphLang.RU) | MorphLang.EN)
    sw.stop()
    print("OK (by {0} ms), version {1}".format(sw.elapsedMilliseconds, ProcessorService.get_version()), flush=True)
    # The sample text to analyze.
    txt = "Единственным конкурентом «Трансмаша» на этом сомнительном тендере было ООО «Плассер Алека Рейл Сервис», основным владельцем которого является австрийская компания «СТЦ-Холдинг ГМБХ». До конца 2011 г. эта же фирма была совладельцем «Трансмаша» вместе с «Тако» Краснова. Зато совладельцем «Плассера», также до конца 2011 г., был тот самый Карл Контрус, который имеет четверть акций «Трансмаша». "
    print("Text: {0}".format(txt), flush=True)
    # Run on an empty processor (no NER analyzers attached).
    are = ProcessorService.get_empty_processor().process(SourceOfAnalysis(txt), None, None)
    print("Noun groups: ", end="", flush=True)
    t = are.first_token
    # Walk the token chain, extracting noun phrases as we go.
    while t is not None:
        # Try to read a noun group starting at the current token.
        # NOTE: aligned with the 4-argument try_parse form used everywhere else
        # in this codebase (the trailing argument is an optional default noun).
        npt = NounPhraseHelper.try_parse(t, NounPhraseParseAttr.NO, 0, None)
        if npt is not None:
            # Print it in normalized form.  MorphNumber.SINGULAR replaces a
            # legacy boolean flag, matching the get_normal_case_text calls in
            # the rest of this code.
            print("[{0}=>{1}] ".format(npt.get_source_text(), npt.get_normal_case_text(None, MorphNumber.SINGULAR, MorphGender.UNDEFINED, False)), end="", flush=True)
            # Jump to the last token of the group before advancing.
            t = npt.end_token
        t = t.next0_
    with ProcessorService.create_processor() as proc:
        # Full analysis with all analyzers.
        ar = proc.process(SourceOfAnalysis(txt), None, None)
        # Dump the resulting entities and their slots.
        print("\r\n==========================================\r\nEntities: ", flush=True)
        for e0_ in ar.entities:
            print("{0}: {1}".format(e0_.type_name, str(e0_)), flush=True)
            for s in e0_.slots:
                print(" {0}: {1}".format(s.type_name, s.value), flush=True)
        # Noun-group extraction over the analyzed token chain.
        print("\r\n==========================================\r\nNoun groups: ", flush=True)
        t = ar.first_token
        while t is not None:
            # Tokens already bound to entities are skipped.
            if t.get_referent() is None:
                npt = NounPhraseHelper.try_parse(t, NounPhraseParseAttr.ADJECTIVECANBELAST, 0, None)
                if npt is not None:
                    print(npt, flush=True)
                    # Move to the last token of the group.
                    t = npt.end_token
            t = t.next0_
    with ProcessorService.create_specific_processor(KeywordAnalyzer.ANALYZER_NAME) as proc:
        ar = proc.process(SourceOfAnalysis(txt), None, None)
        print("\r\n==========================================\r\nKeywords1: ", flush=True)
        for e0_ in ar.entities:
            if isinstance(e0_, KeywordReferent):
                print(e0_, flush=True)
        print("\r\n==========================================\r\nKeywords2: ", flush=True)
        t = ar.first_token
        while t is not None:
            if isinstance(t, ReferentToken):
                kw = Utils.asObjectOrNull(t.get_referent(), KeywordReferent)
                if kw is not None:
                    kwstr = MiscHelper.get_text_value_of_meta_token(Utils.asObjectOrNull(t, ReferentToken), Utils.valToEnum((GetTextAttr.FIRSTNOUNGROUPTONOMINATIVESINGLE) | (GetTextAttr.KEEPREGISTER), GetTextAttr))
                    print("{0} = {1}".format(kwstr, kw), flush=True)
            t = t.next0_
    print("Over!", flush=True)
def main(args: typing.List[str]) -> None:
    """Demo of the fully-initialized Pullenti SDK on a sample Russian text.

    Lists the registered analyzers, extracts noun groups on an empty
    processor, then runs the full analysis and the keyword analyzer,
    printing entities and keywords to stdout.
    """
    timer = Stopwatch()
    # One-time SDK initialization -- must happen before any processing.
    print("Initializing SDK Pullenti ver {0} ({1}) ... ".format(Sdk.get_version(), Sdk.get_version_date()), end="", flush=True)
    # Bring up the engine together with every bundled analyzer.
    Sdk.initialize_all()
    timer.stop()
    print("OK (by {0} ms), version {1}".format(timer.elapsedMilliseconds, ProcessorService.get_version()), flush=True)
    # Show which analyzers are available.
    for analyzer in ProcessorService.get_analyzers():
        kind = "Specific analyzer" if analyzer.is_specific else "Common analyzer"
        print(" {0} {1} \"{2}\"".format(kind, analyzer.name, analyzer.caption), flush=True)
    # The sample text to analyze.
    txt = "Система разрабатывается с 2011 года российским программистом Михаилом Жуковым, проживающим в Москве на Красной площади в доме номер один на втором этаже. Конкурентов у него много: Abbyy, Yandex, ООО \"Russian Context Optimizer\" (RCO) и другие компании. Он планирует продать SDK за 1.120.000.001,99 (миллиард сто двадцать миллионов один рубль 99 копеек) рублей, без НДС."
    print("Text: {0}".format(txt), flush=True)
    # Run on an empty processor (no NER analyzers attached).
    raw = ProcessorService.get_empty_processor().process(SourceOfAnalysis(txt), None, None)
    print("Noun groups: ", end="", flush=True)
    tok = raw.first_token
    # Walk the token chain and pull out noun phrases.
    while tok is not None:
        group = NounPhraseHelper.try_parse(tok, NounPhraseParseAttr.NO, 0, None)
        if group is not None:
            # Found one -- print it in normalized singular form and skip past it.
            print("[{0}=>{1}] ".format(group.get_source_text(), group.get_normal_case_text(None, MorphNumber.SINGULAR, MorphGender.UNDEFINED, False)), end="", flush=True)
            tok = group.end_token
        tok = tok.next0_
    with ProcessorService.create_processor() as processor:
        # Full analysis with all analyzers.
        result = processor.process(SourceOfAnalysis(txt), None, None)
        # Dump the extracted entities with their slots.
        print("\r\n==========================================\r\nEntities: ", flush=True)
        for ent in result.entities:
            print("{0}: {1}".format(ent.type_name, str(ent)), flush=True)
            for slot in ent.slots:
                print(" {0}: {1}".format(slot.type_name, slot.value), flush=True)
        # Noun-group extraction over the analyzed chain.
        print("\r\n==========================================\r\nNoun groups: ", flush=True)
        tok = result.first_token
        while tok is not None:
            # Skip tokens that are already bound to entities.
            if tok.get_referent() is None:
                group = NounPhraseHelper.try_parse(tok, NounPhraseParseAttr.ADJECTIVECANBELAST, 0, None)
                if group is not None:
                    print(group, flush=True)
                    # Advance past the whole group.
                    tok = group.end_token
            tok = tok.next0_
    with ProcessorService.create_specific_processor(KeywordAnalyzer.ANALYZER_NAME) as processor:
        result = processor.process(SourceOfAnalysis(txt), None, None)
        print("\r\n==========================================\r\nKeywords1: ", flush=True)
        for ent in result.entities:
            if isinstance(ent, KeywordReferent):
                print(ent, flush=True)
        print("\r\n==========================================\r\nKeywords2: ", flush=True)
        tok = result.first_token
        while tok is not None:
            if isinstance(tok, ReferentToken):
                keyw = Utils.asObjectOrNull(tok.get_referent(), KeywordReferent)
                if keyw is not None:
                    kwstr = MiscHelper.get_text_value_of_meta_token(Utils.asObjectOrNull(tok, ReferentToken), Utils.valToEnum((GetTextAttr.FIRSTNOUNGROUPTONOMINATIVESINGLE) | (GetTextAttr.KEEPREGISTER), GetTextAttr))
                    print("{0} = {1}".format(kwstr, keyw), flush=True)
            tok = tok.next0_
    print("Over!", flush=True)
def try_attach_list(t: 'Token', max_count: int = 20) -> typing.List['DateItemToken']:
    """Collect the longest run of date elements (numbers, month names,
    delimiters, pointers, ...) starting at token ``t``.

    Returns a list of DateItemToken, or None when no plausible date-like
    sequence starts here.  ``max_count`` > 0 caps the run length.
    """
    p = DateItemToken.try_attach(t, None, False)
    if (p is None):
        return None
    # a sequence may not open with a bare delimiter
    if (p.typ == DateItemToken.DateItemType.DELIM):
        return None
    res = list()
    res.append(p)
    tt = p.end_token.next0_
    while tt is not None:
        if (isinstance(tt, TextToken)):
            # skip filler words (see M_EMPTY_WORDS)
            if (tt.check_value(DateItemToken.M_EMPTY_WORDS) is not None):
                tt = tt.next0_
                continue
        p0 = DateItemToken.try_attach(tt, res, False)
        if (p0 is None):
            if (tt.is_newline_before):
                break
            if (tt.chars.is_latin_letter):
                break
            # tolerate adjectives/pronouns interleaved between date items
            if (tt.morph is not None and tt.morph.check((MorphClass.ADJECTIVE) | MorphClass.PRONOUN)):
                tt = tt.next0_
                continue
            break
        if (tt.is_newline_before):
            # only "month <newline> year" or "day <newline> month" may cross a line break
            if (p.typ == DateItemToken.DateItemType.MONTH and p0.can_be_year):
                pass
            elif (p.typ == DateItemToken.DateItemType.NUMBER and p.can_be_day and p0.typ == DateItemToken.DateItemType.MONTH):
                pass
            else:
                break
        if (p0.can_be_year and p0.typ == DateItemToken.DateItemType.NUMBER):
            # promote a plain number to a year after a halfyear/quartal item
            # or after a pointer when the value looks like a modern year
            if (p.typ == DateItemToken.DateItemType.HALFYEAR or p.typ == DateItemToken.DateItemType.QUARTAL):
                p0.typ = DateItemToken.DateItemType.YEAR
            elif (p.typ == DateItemToken.DateItemType.POINTER and p0.int_value > 1990):
                p0.typ = DateItemToken.DateItemType.YEAR
        p = p0
        res.append(p)
        if (max_count > 0 and len(res) >= max_count):
            break
        tt = p.end_token.next0_
    # strip trailing delimiters
    for i in range(len(res) - 1, -1, -1):
        if (res[i].typ == DateItemToken.DateItemType.DELIM):
            del res[i]
        else:
            break
    if (len(res) > 0 and res[len(res) - 1].typ == DateItemToken.DateItemType.NUMBER):
        # a final number carrying a measure postfix is not a date part
        # (kept only in "hh:mm"-like sequences)
        nex = NumberHelper.try_parse_number_with_postfix(res[len(res) - 1].begin_token)
        if (nex is not None and nex.ex_typ != NumberExType.HOUR):
            if (len(res) > 3 and res[len(res) - 2].typ == DateItemToken.DateItemType.DELIM and res[len(res) - 2].string_value == ":"):
                pass
            else:
                del res[len(res) - 1]
    if (len(res) == 0):
        return None
    i = 1
    while i < (len(res) - 1):
        # drop the comma in a trailing "month , year" pattern
        if (res[i].typ == DateItemToken.DateItemType.DELIM and res[i].begin_token.is_comma):
            if ((i == 1 and res[i - 1].typ == DateItemToken.DateItemType.MONTH and res[i + 1].can_be_year) and (i + 1) == (len(res) - 1)):
                del res[i]
        i += 1
    if (res[len(res) - 1].typ == DateItemToken.DateItemType.NUMBER):
        rr = res[len(res) - 1]
        # a final number that actually opens a noun phrase is not a date part
        npt = NounPhraseHelper.try_parse(rr.begin_token, NounPhraseParseAttr.NO, 0, None)
        if (npt is not None and npt.end_char > rr.end_char):
            del res[len(res) - 1]
    if (len(res) > 0 and res[len(res) - 1].typ == DateItemToken.DateItemType.DELIM):
        del res[len(res) - 1]
    if (len(res) == 0):
        return None
    if (len(res) == 2 and not res[0].is_whitespace_after):
        # two glued items with no surrounding whitespace look like a code, not a date
        if (not res[0].is_whitespace_before and not res[1].is_whitespace_after):
            return None
    return res
def try_parse(t: 'Token') -> 'DefinitionWithNumericToken': if (not MiscHelper.can_be_start_of_sentence(t)): return None tt = t noun_ = None num = None first_pass3146 = True while True: if first_pass3146: first_pass3146 = False else: tt = tt.next0_ if (not (tt is not None)): break if (tt != t and MiscHelper.can_be_start_of_sentence(tt)): return None if (not (isinstance(tt, NumberToken))): continue if (tt.whitespaces_after_count > 2 or tt == t): continue if (tt.morph.class0_.is_adjective): continue nn = NounPhraseHelper.try_parse(tt.next0_, NounPhraseParseAttr.NO, 0, None) if (nn is None): continue num = (Utils.asObjectOrNull(tt, NumberToken)) noun_ = nn break if (num is None or num.int_value is None): return None res = DefinitionWithNumericToken(t, noun_.end_token) res.number = num.int_value res.number_begin_char = num.begin_char res.number_end_char = num.end_char res.noun = noun_.get_normal_case_text(None, MorphNumber.SINGULAR, MorphGender.UNDEFINED, False) res.nouns_genetive = (Utils.ifNotNull( noun_.get_morph_variant(MorphCase.GENITIVE, True), (res.noun if res is not None else None))) res.text = MiscHelper.get_text_value( t, num.previous, Utils.valToEnum( (GetTextAttr.KEEPQUOTES) | (GetTextAttr.KEEPREGISTER), GetTextAttr)) if (num.is_whitespace_before): res.text += " " res.number_substring = MiscHelper.get_text_value( num, noun_.end_token, Utils.valToEnum( (GetTextAttr.KEEPQUOTES) | (GetTextAttr.KEEPREGISTER), GetTextAttr)) res.text += res.number_substring tt = noun_.end_token while tt is not None: if (MiscHelper.can_be_start_of_sentence(tt)): break res.end_token = tt tt = tt.next0_ if (res.end_token != noun_.end_token): if (noun_.is_whitespace_after): res.text += " " res.text += MiscHelper.get_text_value( noun_.end_token.next0_, res.end_token, Utils.valToEnum( (GetTextAttr.KEEPQUOTES) | (GetTextAttr.KEEPREGISTER), GetTextAttr)) return res
def __try_parse_ru(first: 'Token', typ: 'NounPhraseParseAttr', max_char_pos: int, def_noun: 'NounPhraseItem' = None) -> 'NounPhraseToken':
    """Core Russian noun-phrase parser (machine-translated from C#).

    Accumulates candidate items (adjectives/nouns) from ``first`` onward,
    handles commas, conjunctions, brackets, embedded prepositional groups and
    participle ("verb-adjective") constructions, then picks the noun and
    reconciles morphology between the noun and its adjectives.
    Returns a NounPhraseToken or None.

    NOTE(review): indentation of this generated code was reconstructed from a
    whitespace-mangled source; nesting of a few blocks (flagged below) should
    be confirmed against the original generator output.
    """
    if (first is None):
        return None
    items = None
    adverbs = None
    prep = None
    kak = False
    t0 = first
    # "КАК ..." ("as a ...") prefix: optionally swallow a preposition after it
    if ((((typ) & (NounPhraseParseAttr.PARSEPREPOSITION))) != (NounPhraseParseAttr.NO) and t0.is_value("КАК", None)):
        t0 = t0.next0_
        prep = PrepositionHelper.try_parse(t0)
        if (prep is not None):
            t0 = prep.end_token.next0_
        kak = True
    internal_noun_prase = None
    conj_before = False
    t = t0
    # main accumulation loop (first_pass emulates the C# for-loop increment)
    first_pass3041 = True
    while True:
        if first_pass3041:
            first_pass3041 = False
        else:
            t = t.next0_
        if (not (t is not None)):
            break
        if (max_char_pos > 0 and t.begin_char > max_char_pos):
            break
        # pure conjunction token (not doubling as adj/pronoun/noun)
        if ((t.morph.class0_.is_conjunction and not t.morph.class0_.is_adjective and not t.morph.class0_.is_pronoun) and not t.morph.class0_.is_noun):
            if (conj_before):
                break
            if ((((typ) & (NounPhraseParseAttr.CANNOTHASCOMMAAND))) != (NounPhraseParseAttr.NO)):
                break
            if (items is not None and ((t.is_and or t.is_or))):
                conj_before = True
                # swallow "и/или" written with slashes or parentheses
                if ((t.next0_ is not None and t.next0_.is_char_of("\\/") and t.next0_.next0_ is not None) and t.next0_.next0_.is_or):
                    t = t.next0_.next0_
                if (((t.next0_ is not None and t.next0_.is_char('(') and t.next0_.next0_ is not None) and t.next0_.next0_.is_or and t.next0_.next0_.next0_ is not None) and t.next0_.next0_.next0_.is_char(')')):
                    t = t.next0_.next0_.next0_
                continue
            break
        elif (t.is_comma):
            if (conj_before or items is None):
                break
            if ((((typ) & (NounPhraseParseAttr.CANNOTHASCOMMAAND))) != (NounPhraseParseAttr.NO)):
                break
            mc = t.previous.get_morph_class_in_dictionary()
            # a comma right after a surname ends the phrase
            if (mc.is_proper_surname or mc.is_proper_secname):
                break
            conj_before = True
            # "как ..., так и ..." construction
            if (kak and t.next0_ is not None and t.next0_.is_value("ТАК", None)):
                t = t.next0_
                if (t.next0_ is not None and t.next0_.is_and):
                    t = t.next0_
                pr = PrepositionHelper.try_parse(t.next0_)
                if (pr is not None):
                    t = pr.end_token
            if (items[len(items) - 1].can_be_noun and items[len(items) - 1].end_token.morph.class0_.is_pronoun):
                break
            continue
        elif (t.is_char('(')):
            if (items is None):
                return None
            brr = BracketHelper.try_parse(t, BracketParseAttr.NO, 100)
            if (brr is None):
                break
            if (brr.length_char > 100):
                break
            # skip a short bracketed insertion entirely
            t = brr.end_token
            continue
        if (isinstance(t, ReferentToken)):
            if ((((typ) & (NounPhraseParseAttr.REFERENTCANBENOUN))) == (NounPhraseParseAttr.NO)):
                break
        elif (t.chars.is_latin_letter):
            break
        it = NounPhraseItem.try_parse(t, items, typ)
        if (it is None or ((not it.can_be_adj and not it.can_be_noun))):
            # unknown capitalized word right after adjectives: accept as the noun
            if (((it is not None and items is not None and t.chars.is_capital_upper) and (t.whitespaces_before_count < 3) and t.length_char > 3) and not t.get_morph_class_in_dictionary().is_noun and not t.get_morph_class_in_dictionary().is_adjective):
                it.can_be_noun = True
                items.append(it)
                break
            # collect adverbs when requested
            if ((((typ) & (NounPhraseParseAttr.PARSEADVERBS))) != (NounPhraseParseAttr.NO) and (isinstance(t, TextToken)) and t.morph.class0_.is_adverb):
                if (adverbs is None):
                    adverbs = list()
                adverbs.append(Utils.asObjectOrNull(t, TextToken))
                continue
            break
        it.conj_before = conj_before
        conj_before = False
        if (not it.can_be_adj and not it.can_be_noun):
            break
        # newline inside the phrase: only allowed in MULTILINES mode or when
        # capitalization stays compatible
        if (t.is_newline_before and t != first):
            if ((((typ) & (NounPhraseParseAttr.MULTILINES))) != (NounPhraseParseAttr.NO)):
                pass
            elif (items is not None and t.chars != items[len(items) - 1].chars):
                if (t.chars.is_all_lower and items[len(items) - 1].chars.is_capital_upper):
                    pass
                else:
                    break
        if (items is None):
            items = list()
        else:
            it0 = items[len(items) - 1]
            # a personal pronoun already collected restricts what may follow
            if (it0.can_be_noun and it0.is_personal_pronoun):
                if (it.is_pronoun):
                    break
                if ((it0.begin_token.previous is not None and it0.begin_token.previous.get_morph_class_in_dictionary().is_verb and not it0.begin_token.previous.get_morph_class_in_dictionary().is_adjective) and not it0.begin_token.previous.get_morph_class_in_dictionary().is_preposition):
                    if (t.morph.case_.is_nominative or t.morph.case_.is_accusative):
                        pass
                    else:
                        break
            # a verb-looking noun is accepted only at a clause boundary
            if (it.can_be_noun and it.is_verb):
                if (it0.previous is None):
                    pass
                elif ((isinstance(it0.previous, TextToken)) and not it0.previous.chars.is_letter):
                    pass
                else:
                    break
        items.append(it)
        t = it.end_token
        if (t.is_newline_after and not t.chars.is_all_lower):
            mc = t.get_morph_class_in_dictionary()
            if (mc.is_proper_surname):
                break
            if (t.morph.class0_.is_proper_surname and mc.is_undefined):
                break
    if (items is None):
        return None
    tt1 = None
    # single adjective followed by "и/или <adj> <noun>": merge into that phrase
    if (len(items) == 1 and items[0].can_be_adj):
        and0_ = False
        tt1 = items[0].end_token.next0_
        first_pass3042 = True
        while True:
            if first_pass3042:
                first_pass3042 = False
            else:
                tt1 = tt1.next0_
            if (not (tt1 is not None)):
                break
            if (tt1.is_and or tt1.is_or):
                and0_ = True
                break
            if (tt1.is_comma or tt1.is_value("НО", None) or tt1.is_value("ТАК", None)):
                continue
            break
        if (and0_):
            if (items[0].can_be_noun and items[0].is_personal_pronoun):
                and0_ = False
        if (and0_):
            tt2 = tt1.next0_
            if (tt2 is not None and tt2.morph.class0_.is_preposition):
                tt2 = tt2.next0_
            npt1 = _NounPraseHelperInt.__try_parse_ru(tt2, typ, max_char_pos, None)
            if (npt1 is not None and len(npt1.adjectives) > 0):
                ok1 = False
                for av in items[0].adj_morph:
                    for v in npt1.noun.noun_morph:
                        if (v.check_accord(av, False, False)):
                            items[0].morph.add_item(av)
                            ok1 = True
                if (ok1):
                    npt1.begin_token = items[0].begin_token
                    npt1.end_token = tt1.previous
                    npt1.adjectives.clear()
                    npt1.adjectives.append(items[0])
                    return npt1
    if (def_noun is not None):
        items.append(def_noun)
    last1 = items[len(items) - 1]
    check = True
    for it in items:
        if (not it.can_be_adj):
            check = False
            break
        elif (it.can_be_noun and it.is_personal_pronoun):
            check = False
            break
    tt1 = last1.end_token.next0_
    # all items are adjectives and an internal prepositional/instrumental
    # group follows: try "<adjs> <internal group> <noun phrase>"
    if ((tt1 is not None and check and ((tt1.morph.class0_.is_preposition or tt1.morph.case_.is_instrumental))) and (tt1.whitespaces_before_count < 2)):
        inp = NounPhraseHelper.try_parse(tt1, Utils.valToEnum((typ) | (NounPhraseParseAttr.PARSEPREPOSITION), NounPhraseParseAttr), max_char_pos, None)
        if (inp is not None):
            tt1 = inp.end_token.next0_
            npt1 = _NounPraseHelperInt.__try_parse_ru(tt1, typ, max_char_pos, None)
            if (npt1 is not None):
                ok = True
                ii = 0
                first_pass3043 = True
                while True:
                    if first_pass3043:
                        first_pass3043 = False
                    else:
                        ii += 1
                    if (not (ii < len(items))):
                        break
                    it = items[ii]
                    if (NounPhraseItem.try_accord_adj_and_noun(it, Utils.asObjectOrNull(npt1.noun, NounPhraseItem))):
                        continue
                    if (ii > 0):
                        inp2 = NounPhraseHelper.try_parse(it.begin_token, typ, max_char_pos, None)
                        if (inp2 is not None and inp2.end_token == inp.end_token):
                            del items[ii:ii + len(items) - ii]
                            inp = inp2
                            break
                    ok = False
                    break
                if (ok):
                    if (npt1.morph.case_.is_genitive and not inp.morph.case_.is_instrumental):
                        ok = False
                if (ok):
                    i = 0
                    while i < len(items):
                        npt1.adjectives.insert(i, items[i])
                        i += 1
                    npt1.internal_noun = inp
                    mmm = MorphCollection(npt1.morph)
                    for it in items:
                        mmm.remove_items(it.adj_morph[0], False)
                    if (mmm.gender != MorphGender.UNDEFINED or mmm.number != MorphNumber.UNDEFINED or not mmm.case_.is_undefined):
                        npt1.morph = mmm
                    if (adverbs is not None):
                        if (npt1.adverbs is None):
                            npt1.adverbs = adverbs
                        else:
                            npt1.adverbs[0:0] = adverbs
                    npt1.begin_token = first
                    return npt1
            if (tt1 is not None and tt1.morph.class0_.is_noun and not tt1.morph.case_.is_genitive):
                it = NounPhraseItem.try_parse(tt1, items, typ)
                if (it is not None and it.can_be_noun):
                    internal_noun_prase = inp
                    inp.begin_token = items[0].end_token.next0_
                    items.append(it)
    # participle construction: "<adj-verb> <dependent group> <noun phrase>"
    i = 0
    first_pass3044 = True
    while True:
        if first_pass3044:
            first_pass3044 = False
        else:
            i += 1
        if (not (i < len(items))):
            break
        if (items[i].can_be_adj and items[i].begin_token.morph.class0_.is_verb):
            it = items[i].begin_token
            if (not it.get_morph_class_in_dictionary().is_verb):
                continue
            if (it.is_value("УПОЛНОМОЧЕННЫЙ", None)):
                continue
            if ((((typ) & (NounPhraseParseAttr.PARSEVERBS))) == (NounPhraseParseAttr.NO)):
                continue
            inp = _NounPraseHelperInt.__try_parse_ru(items[i].end_token.next0_, NounPhraseParseAttr.NO, max_char_pos, None)
            if (inp is None):
                continue
            if (inp.anafor is not None and i == (len(items) - 1) and NounPhraseItem.try_accord_adj_and_noun(items[i], Utils.asObjectOrNull(inp.noun, NounPhraseItem))):
                inp.begin_token = first
                ii = 0
                while ii < len(items):
                    inp.adjectives.insert(ii, items[ii])
                    ii += 1
                return inp
            if (inp.end_token.whitespaces_after_count > 3):
                continue
            npt1 = _NounPraseHelperInt.__try_parse_ru(inp.end_token.next0_, NounPhraseParseAttr.NO, max_char_pos, None)
            if (npt1 is None):
                continue
            ok = True
            j = 0
            while j <= i:
                if (not NounPhraseItem.try_accord_adj_and_noun(items[j], Utils.asObjectOrNull(npt1.noun, NounPhraseItem))):
                    ok = False
                    break
                j += 1
            if (not ok):
                continue
            verb = VerbPhraseHelper.try_parse(it, True, False, False)
            if (verb is None):
                continue
            vlinks = SemanticHelper.try_create_links(verb, inp, None)
            nlinks = SemanticHelper.try_create_links(inp, npt1, None)
            if (len(vlinks) == 0 and len(nlinks) > 0):
                continue
            j = 0
            while j <= i:
                npt1.adjectives.insert(j, items[j])
                j += 1
            items[i].end_token = inp.end_token
            mmm = MorphCollection(npt1.morph)
            bil = list()
            j = 0
            while j <= i:
                bil.clear()
                for m in items[j].adj_morph:
                    bil.append(m)
                mmm.remove_items_list_cla(bil, None)
                j += 1
            if (mmm.gender != MorphGender.UNDEFINED or mmm.number != MorphNumber.UNDEFINED or not mmm.case_.is_undefined):
                npt1.morph = mmm
            if (adverbs is not None):
                if (npt1.adverbs is None):
                    npt1.adverbs = adverbs
                else:
                    npt1.adverbs[0:0] = adverbs
            npt1.begin_token = first
            return npt1
    # ADJECTIVECANBELAST: an adjective may follow the noun ("<noun> <adj>")
    ok2 = False
    if ((len(items) == 1 and (((typ) & (NounPhraseParseAttr.ADJECTIVECANBELAST))) != (NounPhraseParseAttr.NO) and (items[0].whitespaces_after_count < 3)) and not items[0].is_adverb):
        if (not items[0].can_be_adj):
            ok2 = True
        elif (items[0].is_personal_pronoun and items[0].can_be_noun):
            ok2 = True
    if (ok2):
        it = NounPhraseItem.try_parse(items[0].end_token.next0_, None, typ)
        if (it is not None and it.can_be_adj and it.begin_token.chars.is_all_lower):
            ok2 = True
            if (it.is_adverb or it.is_verb):
                ok2 = False
            if (it.is_pronoun and items[0].is_pronoun):
                ok2 = False
            if (it.can_be_adj_for_personal_pronoun and items[0].is_personal_pronoun):
                ok2 = True
            if (ok2 and NounPhraseItem.try_accord_adj_and_noun(it, items[0])):
                npt1 = _NounPraseHelperInt.__try_parse_ru(it.begin_token, typ, max_char_pos, None)
                if (npt1 is not None and ((npt1.end_char > it.end_char or len(npt1.adjectives) > 0))):
                    pass
                else:
                    items.insert(0, it)
    # choose the noun: rightmost item that can be a noun and whose left
    # neighbours can all serve as its adjectives
    noun = None
    adj_after = None
    for i in range(len(items) - 1, -1, -1):
        if (items[i].can_be_noun):
            if (items[i].conj_before):
                continue
            if (i > 0 and not items[i - 1].can_be_adj):
                continue
            if (i > 0 and items[i - 1].can_be_noun):
                if (items[i - 1].is_doubt_adjective):
                    continue
                if (items[i - 1].is_pronoun and items[i].is_pronoun):
                    if (items[i].is_pronoun and items[i - 1].can_be_adj_for_personal_pronoun):
                        pass
                    else:
                        continue
            noun = items[i]
            del items[i:i + len(items) - i]
            if (adj_after is not None):
                items.append(adj_after)
            elif (len(items) > 0 and items[0].can_be_noun and not items[0].can_be_adj):
                noun = items[0]
                items.clear()
            break
    if (noun is None):
        return None
    res = NounPhraseToken._new466(first, noun.end_token, prep)
    if (adverbs is not None):
        for a in adverbs:
            if (a.begin_char < noun.begin_char):
                if (len(items) == 0 and prep is None):
                    return None
                if (res.adverbs is None):
                    res.adverbs = list()
                res.adverbs.append(a)
    res.noun = (noun)
    res.multi_nouns = noun.multi_nouns
    if (kak):
        res.multi_nouns = True
    res.internal_noun = internal_noun_prase
    for v in noun.noun_morph:
        noun.morph.add_item(v)
    res.morph = noun.morph
    # a preposition before the phrase rules out the nominative reading
    if (res.morph.case_.is_nominative and first.previous is not None and first.previous.morph.class0_.is_preposition):
        res.morph.case_ = (res.morph.case_) ^ MorphCase.NOMINATIVE
    if ((((typ) & (NounPhraseParseAttr.PARSEPRONOUNS))) == (NounPhraseParseAttr.NO) and ((res.morph.class0_.is_pronoun or res.morph.class0_.is_personal_pronoun))):
        return None
    # stat: frequency of final letters of adjective normal forms, used later
    # to pick the preferred morph variant
    stat = None
    if (len(items) > 1):
        stat = dict()
    need_update_morph = False
    # keep only noun morph variants that agree with every adjective
    if (len(items) > 0):
        ok_list = list()
        is_num_not = False
        for vv in noun.noun_morph:
            i = 0
            v = vv
            i = 0
            while i < len(items):
                ok = False
                for av in items[i].adj_morph:
                    if (v.check_accord(av, False, False)):
                        ok = True
                        if (not ((av.case_) & v.case_).is_undefined and av.case_ != v.case_):
                            v.case_ = av.case_ = (av.case_) & v.case_
                        break
                if (not ok):
                    # numeric adjective ("две", "пять" ...) forces plural
                    if (items[i].can_be_numeric_adj and items[i].try_accord_var(v, False)):
                        ok = True
                        v1 = NounPhraseItemTextVar()
                        v1.copy_from_item(v)
                        v1.number = MorphNumber.PLURAL
                        is_num_not = True
                        v1.case_ = MorphCase()
                        for a in items[i].adj_morph:
                            v1.case_ = (v1.case_) | a.case_
                        v = v1
                    else:
                        break
                i += 1
            if (i >= len(items)):
                ok_list.append(v)
        if (len(ok_list) > 0 and (((len(ok_list) < res.morph.items_count) or is_num_not))):
            res.morph = MorphCollection()
            for v in ok_list:
                res.morph.add_item(v)
            if (not is_num_not):
                noun.morph = res.morph
    # attach adjectives, reconciling cases between each adjective and the noun
    i = 0
    first_pass3045 = True
    while True:
        if first_pass3045:
            first_pass3045 = False
        else:
            i += 1
        if (not (i < len(items))):
            break
        for av in items[i].adj_morph:
            for v in noun.noun_morph:
                if (v.check_accord(av, False, False)):
                    if (not ((av.case_) & v.case_).is_undefined and av.case_ != v.case_):
                        v.case_ = av.case_ = (av.case_) & v.case_
                        need_update_morph = True
                    items[i].morph.add_item(av)
                    if (stat is not None and av.normal_value is not None and len(av.normal_value) > 1):
                        last = av.normal_value[len(av.normal_value) - 1]
                        if (not last in stat):
                            stat[last] = 1
                        else:
                            stat[last] += 1
        if (items[i].is_pronoun or items[i].is_personal_pronoun):
            res.anafor = items[i].begin_token
            if ((((typ) & (NounPhraseParseAttr.PARSEPRONOUNS))) == (NounPhraseParseAttr.NO)):
                continue
        tt = Utils.asObjectOrNull(items[i].begin_token, TextToken)
        if (tt is not None and not tt.term.startswith("ВЫСШ")):
            err = False
            for wf in tt.morph.items:
                if (wf.class0_.is_adjective):
                    # "прев." = superlative attribute, "к.ф." = short form
                    if (wf.contains_attr("прев.", None)):
                        if ((((typ) & (NounPhraseParseAttr.IGNOREADJBEST))) != (NounPhraseParseAttr.NO)):
                            err = True
                    if (wf.contains_attr("к.ф.", None) and tt.morph.class0_.is_personal_pronoun):
                        return None
            if (err):
                continue
        if (res.morph.case_.is_nominative):
            v = MiscHelper.get_text_value_of_meta_token(items[i], GetTextAttr.KEEPQUOTES)
            if (not Utils.isNullOrEmpty(v)):
                if (items[i].get_normal_case_text(None, MorphNumber.UNDEFINED, MorphGender.UNDEFINED, False) != v):
                    wf = NounPhraseItemTextVar(items[i].morph, None)
                    wf.normal_value = v
                    wf.class0_ = MorphClass.ADJECTIVE
                    wf.case_ = res.morph.case_
                    # NOTE: FEMINIE is the SDK's actual enum member spelling
                    if (res.morph.case_.is_prepositional or res.morph.gender == MorphGender.NEUTER or res.morph.gender == MorphGender.FEMINIE):
                        items[i].morph.add_item(wf)
                    else:
                        items[i].morph.insert_item(0, wf)
        res.adjectives.append(items[i])
        if (items[i].end_char > res.end_char):
            res.end_token = items[i].end_token
    # sanity: big gaps / capitalization jumps between adjacent adjectives
    i = 0
    first_pass3046 = True
    while True:
        if first_pass3046:
            first_pass3046 = False
        else:
            i += 1
        if (not (i < (len(res.adjectives) - 1))):
            break
        if (res.adjectives[i].whitespaces_after_count > 5):
            if (res.adjectives[i].chars != res.adjectives[i + 1].chars):
                if (not res.adjectives[i + 1].chars.is_all_lower):
                    return None
                if (res.adjectives[i].chars.is_all_upper and res.adjectives[i + 1].chars.is_capital_upper):
                    return None
                if (res.adjectives[i].chars.is_capital_upper and res.adjectives[i + 1].chars.is_all_upper):
                    return None
            if (res.adjectives[i].whitespaces_after_count > 10):
                if (res.adjectives[i].newlines_after_count == 1):
                    if (res.adjectives[i].chars.is_capital_upper and i == 0 and res.adjectives[i + 1].chars.is_all_lower):
                        continue
                    if (res.adjectives[i].chars == res.adjectives[i + 1].chars):
                        continue
                return None
    if (need_update_morph):
        noun.morph = MorphCollection()
        for v in noun.noun_morph:
            noun.morph.add_item(v)
        res.morph = noun.morph
    # commas/conjunctions between adjectives must form a consistent list
    if (len(res.adjectives) > 0):
        if (noun.begin_token.previous is not None):
            if (noun.begin_token.previous.is_comma_and):
                if (res.adjectives[0].begin_char > noun.begin_char):
                    pass
                else:
                    return None
        zap = 0
        and0_ = 0
        cou = 0
        last_and = False
        i = 0
        while i < (len(res.adjectives) - 1):
            te = res.adjectives[i].end_token.next0_
            if (te is None):
                return None
            if (te.is_char('(')):
                pass
            elif (te.is_comma):
                zap += 1
                last_and = False
            elif (te.is_and or te.is_or):
                and0_ += 1
                last_and = True
            if (not res.adjectives[i].begin_token.morph.class0_.is_pronoun):
                cou += 1
            i += 1
        if ((zap + and0_) > 0):
            if (and0_ > 1):
                return None
            elif (and0_ == 1 and not last_and):
                return None
            if ((zap + and0_) != cou):
                if (and0_ == 1):
                    pass
                else:
                    return None
            last = Utils.asObjectOrNull(res.adjectives[len(res.adjectives) - 1], NounPhraseItem)
            if (last.is_pronoun and not last_and):
                return None
    # reorder each adjective's morph variants by final-letter frequency
    if (stat is not None):
        for adj in items:
            if (adj.morph.items_count > 1):
                w1 = Utils.asObjectOrNull(adj.morph.get_indexer_item(0), NounPhraseItemTextVar)
                w2 = Utils.asObjectOrNull(adj.morph.get_indexer_item(1), NounPhraseItemTextVar)
                if ((len(w1.normal_value) < 2) or (len(w2.normal_value) < 2)):
                    break
                l1 = w1.normal_value[len(w1.normal_value) - 1]
                l2 = w2.normal_value[len(w2.normal_value) - 1]
                i1 = 0
                i2 = 0
                wrapi1468 = RefOutArgWrapper(0)
                Utils.tryGetValue(stat, l1, wrapi1468)
                i1 = wrapi1468.value
                wrapi2467 = RefOutArgWrapper(0)
                Utils.tryGetValue(stat, l2, wrapi2467)
                i2 = wrapi2467.value
                if (i1 < i2):
                    adj.morph.remove_item(1)
                    adj.morph.insert_item(0, w2)
    # a verb-looking first token after "<noun>, ..." probably is a verb, not
    # an adjective: reject the instrumental reading in that case
    if (res.begin_token.get_morph_class_in_dictionary().is_verb and len(items) > 0):
        if (not res.begin_token.chars.is_all_lower or res.begin_token.previous is None):
            pass
        elif (res.begin_token.previous.morph.class0_.is_preposition):
            pass
        else:
            comma = False
            tt = res.begin_token.previous
            first_pass3047 = True
            while True:
                if first_pass3047:
                    first_pass3047 = False
                else:
                    tt = tt.previous
                if (not (tt is not None and tt.end_char <= res.end_char)):
                    break
                if (tt.morph.class0_.is_adverb):
                    continue
                if (tt.is_char_of(".;")):
                    break
                if (tt.is_comma):
                    comma = True
                    continue
                if (tt.is_value("НЕ", None)):
                    continue
                # NOTE(review): placement of the final break (loop level vs.
                # inside the for) reconstructed -- confirm against generator output
                if (((tt.morph.class0_.is_noun or tt.morph.class0_.is_proper)) and comma):
                    for it in res.begin_token.morph.items:
                        if (it.class0_.is_verb and (isinstance(it, MorphWordForm))):
                            if (tt.morph.check_accord(it, False, False)):
                                if (res.morph.case_.is_instrumental):
                                    return None
                break
    # lone-token result that is primarily an adverb is rejected
    if (res.begin_token == res.end_token):
        mc = res.begin_token.get_morph_class_in_dictionary()
        if (mc.is_adverb):
            if (res.begin_token.previous is not None and res.begin_token.previous.morph.class0_.is_preposition):
                pass
            elif (mc.is_noun and not mc.is_preposition and not mc.is_conjunction):
                pass
            elif (res.begin_token.is_value("ВЕСЬ", None)):
                pass
            else:
                return None
    if (def_noun is not None and def_noun.end_token == res.end_token and len(res.adjectives) > 0):
        res.end_token = res.adjectives[len(res.adjectives) - 1].end_token
    return res
def __calc_rank_and_value(self, min_newlines_count: int) -> bool:
    """Score the candidate title span and locate its name boundaries.

    Walks the tokens from ``self.begin_token`` to ``self.end_token``,
    accumulating a heuristic score in ``self.rank`` (words add, non-words,
    verbs, referents, extra newlines subtract) and tracking the start/end
    of the actual name part in ``tstart``/``tend``.

    :param min_newlines_count: newline budget; every newline beyond this
        before a token is penalized point-for-point.
    :return: True when the span is accepted as a title name (also sets
        ``self.begin_name_token`` / ``self.end_name_token``); False when
        rejected outright.
    """
    self.rank = 0
    # A title starting in all lowercase is immediately penalized.
    if (self.begin_token.chars.is_all_lower):
        self.rank -= 30
    words = 0
    up_words = 0
    notwords = 0
    line_number = 0
    tstart = self.begin_token
    tend = self.end_token
    t = self.begin_token
    # first_passNNNN guard: translated C# for-loop (skip increment on pass 1).
    first_pass3396 = True
    while True:
        if first_pass3396: first_pass3396 = False
        else: t = t.next0_
        if (not (t != self.end_token.next0_ and t is not None and t.end_char <= self.end_token.end_char)): break
        if (t.is_newline_before):
            # no-op branch kept from the original mechanical translation
            pass
        tit = TitleItemToken.try_attach(t)
        if (tit is not None):
            if (tit.typ == TitleItemToken.Types.THEME or tit.typ == TitleItemToken.Types.TYPANDTHEME):
                # A theme marker restarts counting: everything before it was preamble.
                if (t != self.begin_token):
                    if (line_number > 0):
                        return False
                    notwords = 0
                    up_words = notwords
                    words = up_words
                    tstart = tit.end_token.next0_
                t = tit.end_token
                if (t.next0_ is None):
                    return False
                # Lowercase continuation is a weaker signal than capitalized text.
                if (t.next0_.chars.is_letter and t.next0_.chars.is_all_lower):
                    self.rank += 20
                else:
                    self.rank += 100
                tstart = t.next0_
                if (tit.typ == TitleItemToken.Types.TYPANDTHEME):
                    self.type_value = tit.value
                continue
            if (tit.typ == TitleItemToken.Types.TYP):
                if (t == self.begin_token):
                    if (tit.end_token.is_newline_after):
                        self.type_value = tit.value
                        self.rank += 5
                        tstart = tit.end_token.next0_
                t = tit.end_token
                words += 1
                if (tit.begin_token != tit.end_token):
                    words += 1
                if (tit.chars.is_all_upper):
                    up_words += 1
                continue
            if (tit.typ == TitleItemToken.Types.DUST or tit.typ == TitleItemToken.Types.SPECIALITY):
                # "Dust" / speciality markers may not open a title.
                if (t == self.begin_token):
                    return False
                self.rank -= 20
                if (tit.typ == TitleItemToken.Types.SPECIALITY):
                    self.speciality = tit.value
                t = tit.end_token
                continue
            if (tit.typ == TitleItemToken.Types.CONSULTANT or tit.typ == TitleItemToken.Types.BOSS or tit.typ == TitleItemToken.Types.EDITOR):
                t = tit.end_token
                if (t.next0_ is not None and ((t.next0_.is_char_of(":") or t.next0_.is_hiphen or t.whitespaces_after_count > 4))):
                    self.rank -= 10
                else:
                    self.rank -= 2
                continue
            # Any other title-item type disqualifies the span.
            return False
        blt = BookLinkToken.try_parse(t, 0)
        if (blt is not None):
            # NOTE(review): BookLinkTyp.N is tested in both branches, so the
            # elif arm can never fire on N — looks like a copy/paste slip in
            # the original; preserved as-is. TODO confirm intended typ.
            if (blt.typ == BookLinkTyp.MISC or blt.typ == BookLinkTyp.N or blt.typ == BookLinkTyp.PAGES):
                self.rank -= 10
            elif (blt.typ == BookLinkTyp.N or blt.typ == BookLinkTyp.PAGERANGE):
                self.rank -= 20
        if (t == self.begin_token and BookLinkToken.try_parse_author(t, FioTemplateType.UNDEFINED) is not None):
            self.rank -= 20
        if (t.is_newline_before and t != self.begin_token):
            line_number += 1
            if (line_number > 4):
                return False
            if (t.chars.is_all_lower):
                self.rank += 10
            elif (t.previous.is_char('.')):
                self.rank -= 10
            elif (t.previous.is_char_of(",-")):
                self.rank += 10
            else:
                # Reward a line break that does not split a noun phrase.
                npt = NounPhraseHelper.try_parse(t.previous, NounPhraseParseAttr.NO, 0, None)
                if (npt is not None and npt.end_char >= t.end_char):
                    self.rank += 10
        if (t != self.begin_token and t.newlines_before_count > min_newlines_count):
            self.rank -= (t.newlines_before_count - min_newlines_count)
        bst = BracketHelper.try_parse(t, BracketParseAttr.NO, 100)
        # A quoted sequence fully inside the span is a strong title signal.
        if (bst is not None and bst.is_quote_type and bst.end_token.end_char <= self.end_token.end_char):
            if (words == 0):
                tstart = bst.begin_token
                self.rank += 10
                if (bst.end_token == self.end_token):
                    tend = self.end_token
                    self.rank += 10
        rli = t.get_referents()
        if (rli is not None):
            for r in rli:
                if (isinstance(r, OrganizationReferent)):
                    if (t.is_newline_before):
                        self.rank -= 10
                    else:
                        self.rank -= 4
                    continue
                if ((isinstance(r, GeoReferent)) or (isinstance(r, PersonReferent))):
                    if (t.is_newline_before):
                        self.rank -= 5
                    if (t.is_newline_after or t.next0_ is None):
                        self.rank -= 20
                    elif (t.next0_.is_hiphen or (isinstance(t.next0_, NumberToken)) or (isinstance(t.next0_.get_referent(), DateReferent))):
                        self.rank -= 20
                    elif (t != self.begin_token):
                        self.rank -= 20
                    continue
                if ((isinstance(r, GeoReferent)) or (isinstance(r, DenominationReferent))):
                    continue
                # URIs and phone numbers never belong to a title.
                if ((isinstance(r, UriReferent)) or (isinstance(r, PhoneReferent))):
                    return False
                if (t.is_newline_before):
                    self.rank -= 4
                else:
                    self.rank -= 2
            if (t == self.begin_token and (isinstance(self.end_token.get_referent(), PersonReferent))):
                self.rank -= 10
            words += 1
            if (t.chars.is_all_upper):
                up_words += 1
            if (t == self.begin_token):
                if (t.is_newline_after):
                    self.rank -= 10
                elif (t.next0_ is not None and t.next0_.is_char('.') and t.next0_.is_newline_after):
                    self.rank -= 10
            continue
        if (isinstance(t, NumberToken)):
            # Numbers spelled as words count as words; digits count as noise.
            if (t.typ == NumberSpellingType.WORDS):
                words += 1
                if (t.chars.is_all_upper):
                    up_words += 1
            else:
                notwords += 1
            continue
        pat = PersonAttrToken.try_attach(t, None, PersonAttrToken.PersonAttrAttachAttrs.NO)
        if (pat is not None):
            if (t.is_newline_before):
                if (not pat.morph.case_.is_undefined and not pat.morph.case_.is_nominative):
                    pass
                elif (pat.chars.is_all_upper):
                    pass
                else:
                    self.rank -= 20
            elif (t.chars.is_all_lower):
                self.rank -= 1
            # Count every token covered by the person attribute.
            while t is not None:
                words += 1
                if (t.chars.is_all_upper):
                    up_words += 1
                if (t == pat.end_token):
                    break
                t = t.next0_
            continue
        oitt = OrgItemTypeToken.try_attach(t, True, None)
        if (oitt is not None):
            if (oitt.morph.number != MorphNumber.PLURAL and not oitt.is_doubt_root_word):
                # Nominative/undefined case suggests an org heading, not a title.
                if (not oitt.morph.case_.is_undefined and not oitt.morph.case_.is_nominative):
                    words += 1
                    if (t.chars.is_all_upper):
                        up_words += 1
                else:
                    self.rank -= 4
                    if (t == self.begin_token):
                        self.rank -= 5
            else:
                words += 1
                if (t.chars.is_all_upper):
                    up_words += 1
            t = oitt.end_token
            continue
        tt = Utils.asObjectOrNull(t, TextToken)
        if (tt is not None):
            if (tt.is_char('©')):
                self.rank -= 10
            if (tt.is_char('_')):
                self.rank -= 1
            if (tt.chars.is_letter):
                if (tt.length_char > 2):
                    words += 1
                    if (t.chars.is_all_upper):
                        up_words += 1
            elif (not tt.is_char(',')):
                notwords += 1
            # A pure verb form ends the scan: titles are nominal phrases.
            if (tt.is_pure_verb):
                self.rank -= 30
                words -= 1
                break
            if (tt == self.end_token):
                if (tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction):
                    self.rank -= 10
                elif (tt.is_char('.')):
                    self.rank += 5
                elif (tt.is_char_of("._")):
                    self.rank -= 5
    self.rank += words
    self.rank -= notwords
    if ((words < 1) and (self.rank < 50)):
        return False
    if (tstart is None or tend is None):
        return False
    if (tstart.end_char > tend.end_char):
        return False
    # A type/speciality marker right after the span supports the hypothesis.
    tit1 = TitleItemToken.try_attach(self.end_token.next0_)
    if (tit1 is not None and ((tit1.typ == TitleItemToken.Types.TYP or tit1.typ == TitleItemToken.Types.SPECIALITY))):
        if (tit1.end_token.is_newline_after):
            self.rank += 15
        else:
            self.rank += 10
        if (tit1.typ == TitleItemToken.Types.SPECIALITY):
            self.speciality = tit1.value
    # Mostly-uppercase span preceded by a person referent: likely a title block.
    if (up_words > 4 and up_words > (math.floor((0.8 * (words))))):
        if (tstart.previous is not None and (isinstance(tstart.previous.get_referent(), PersonReferent))):
            self.rank += (5 + up_words)
    self.begin_name_token = tstart
    self.end_name_token = tend
    return True
def process(self, kit: 'AnalysisKit') -> None:
    """Main URI-analysis pass: scan the whole token chain of *kit* and embed
    ``ReferentToken``s wrapping ``UriReferent``s for everything URI-like.

    Recognized forms (dispatched on the scheme-terminology tag ``i``; the tag
    semantics below are inferred from the branch bodies — confirm against the
    _m_schemes initializer):
      * i == 0  — explicit ``scheme:...`` / ``scheme|...`` prefixes;
      * i == 10 — scheme + ``://...`` URLs, optional ``WWW.`` prefix;
      * i == 2  — bare ``WWW.domain`` style addresses (registered as http);
      * i == 1  — standard codes: ISBN, RFC/ISO/ОКФС/ОКОПФ, ГОСТ, ТУ, ББК;
      * i == 3  — Skype ids;  i == 4 — ICQ numbers;
      * i == 5/6 — bank-account-like digit sequences (incl. IBAN);
    plus e-mail addresses around ``@``, free-standing URLs, and Lotus-style
    ``Name/Org`` addresses.

    :param kit: the analysis kit whose token chain is scanned and mutated
        (matched spans are replaced via ``kit.embed_token``).
    """
    ad = kit.get_analyzer_data(self)
    t = kit.first_token
    # first_passNNNN guard: translated C# for-loop (skip increment on pass 1).
    first_pass3419 = True
    while True:
        if first_pass3419: first_pass3419 = False
        else: t = t.next0_
        if (not (t is not None)): break
        tt = t
        i = 0
        tok = UriAnalyzer._m_schemes.try_parse(t, TerminParseAttr.NO)
        if (tok is not None):
            i = (tok.termin.tag)
            tt = tok.end_token
            # Collapse a repeated scheme in parentheses, e.g. "http (http)".
            if (tt.next0_ is not None and tt.next0_.is_char('(')):
                tok1 = UriAnalyzer._m_schemes.try_parse(tt.next0_.next0_, TerminParseAttr.NO)
                if ((tok1 is not None and tok1.termin.canonic_text == tok.termin.canonic_text and tok1.end_token.next0_ is not None) and tok1.end_token.next0_.is_char(')')):
                    tt = tok1.end_token.next0_
            if (i == 0):
                # Explicit "scheme:content" — content must follow with no gap.
                if ((tt.next0_ is None or ((not tt.next0_.is_char_of(":|") and not tt.is_table_control_char)) or tt.next0_.is_whitespace_before) or tt.next0_.whitespaces_after_count > 2):
                    continue
                t1 = tt.next0_.next0_
                while t1 is not None and t1.is_char_of("/\\"):
                    t1 = t1.next0_
                if (t1 is None or t1.whitespaces_before_count > 2):
                    continue
                ut = UriItemToken.attach_uri_content(t1, False)
                if (ut is None):
                    continue
                ur = Utils.asObjectOrNull(ad.register_referent(UriReferent._new2714(tok.termin.canonic_text.lower(), ut.value)), UriReferent)
                rt = ReferentToken(ad.register_referent(ur), t, ut.end_token)
                # Include a preceding "site:"-like prefix when present.
                rt.begin_token = Utils.ifNotNull(UriAnalyzer.__site_before(t.previous), t)
                if (rt.end_token.next0_ is not None and rt.end_token.next0_.is_char_of("/\\")):
                    rt.end_token = rt.end_token.next0_
                kit.embed_token(rt)
                t = (rt)
                continue
            if (i == 10):
                # "scheme://host/..." URLs.
                tt = tt.next0_
                if (tt is None or not tt.is_char(':')):
                    continue
                tt = tt.next0_
                while tt is not None:
                    if (tt.is_char_of("/\\")):
                        pass
                    else:
                        break
                    tt = tt.next0_
                if (tt is None):
                    continue
                if (tt.is_value("WWW", None) and tt.next0_ is not None and tt.next0_.is_char('.')):
                    tt = tt.next0_.next0_
                if (tt is None or tt.is_newline_before):
                    continue
                ut = UriItemToken.attach_uri_content(tt, True)
                if (ut is None):
                    continue
                if (len(ut.value) < 4):
                    continue
                ur = Utils.asObjectOrNull(ad.register_referent(UriReferent._new2714(tok.termin.canonic_text.lower(), ut.value)), UriReferent)
                rt = ReferentToken(ad.register_referent(ur), t, ut.end_token)
                rt.begin_token = Utils.ifNotNull(UriAnalyzer.__site_before(t.previous), t)
                if (rt.end_token.next0_ is not None and rt.end_token.next0_.is_char_of("/\\")):
                    rt.end_token = rt.end_token.next0_
                kit.embed_token(rt)
                t = (rt)
                continue
            if (i == 2):
                # "WWW.domain" style — registered under the "http" scheme.
                if (tt.next0_ is None or not tt.next0_.is_char('.') or tt.next0_.is_whitespace_before):
                    continue
                if (tt.next0_.is_whitespace_after and tok.termin.canonic_text != "WWW"):
                    continue
                ut = UriItemToken.attach_uri_content(tt.next0_.next0_, True)
                if (ut is None):
                    continue
                ur = Utils.asObjectOrNull(ad.register_referent(UriReferent._new2714("http", ut.value)), UriReferent)
                rt = ReferentToken(ur, t, ut.end_token)
                rt.begin_token = Utils.ifNotNull(UriAnalyzer.__site_before(t.previous), t)
                if (rt.end_token.next0_ is not None and rt.end_token.next0_.is_char_of("/\\")):
                    rt.end_token = rt.end_token.next0_
                kit.embed_token(rt)
                t = (rt)
                continue
            if (i == 1):
                # Standard document/classifier codes (ISBN, ГОСТ, ISO, ББК…).
                sch = tok.termin.canonic_text
                ut = None
                if (sch == "ISBN"):
                    ut = UriItemToken.attachisbn(tt.next0_)
                    # Handle "(ISBN)" with the number placed before the marker.
                    if ((ut is None and t.previous is not None and t.previous.is_char('(')) and t.next0_ is not None and t.next0_.is_char(')')):
                        tt0 = t.previous.previous
                        while tt0 is not None:
                            if (tt0.whitespaces_after_count > 2):
                                break
                            if (tt0.is_whitespace_before):
                                ut = UriItemToken.attachisbn(tt0)
                                if (ut is not None and ut.end_token.next0_ != t.previous):
                                    ut = (None)
                                break
                            tt0 = tt0.previous
                elif ((sch == "RFC" or sch == "ISO" or sch == "ОКФС") or sch == "ОКОПФ"):
                    ut = UriItemToken.attachisocontent(tt.next0_, ":")
                elif (sch == "ГОСТ"):
                    ut = UriItemToken.attachisocontent(tt.next0_, "-.")
                elif (sch == "ТУ"):
                    if (tok.chars.is_all_upper):
                        ut = UriItemToken.attachisocontent(tt.next0_, "-.")
                        if (ut is not None and (ut.length_char < 10)):
                            ut = (None)
                else:
                    ut = UriItemToken.attachbbk(tt.next0_)
                if (ut is None):
                    continue
                ur = Utils.asObjectOrNull(ad.register_referent(UriReferent._new2717(ut.value, sch)), UriReferent)
                rt = None
                if (ut.begin_char < t.begin_char):
                    rt = ReferentToken(ur, ut.begin_token, t)
                    if (t.next0_ is not None and t.next0_.is_char(')')):
                        rt.end_token = t.next0_
                else:
                    rt = ReferentToken(ur, t, ut.end_token)
                if (t.previous is not None and t.previous.is_value("КОД", None)):
                    rt.begin_token = t.previous
                if (ur.scheme.startswith("ОК")):
                    UriAnalyzer.__check_detail(rt)
                kit.embed_token(rt)
                t = (rt)
                # ОК* classifiers may continue as a comma-separated list.
                if (ur.scheme.startswith("ОК")):
                    while t.next0_ is not None:
                        if (t.next0_.is_comma_and and (isinstance(t.next0_.next0_, NumberToken))):
                            pass
                        else:
                            break
                        ut = UriItemToken.attachbbk(t.next0_.next0_)
                        if (ut is None):
                            break
                        ur = (Utils.asObjectOrNull(ad.register_referent(UriReferent._new2717(ut.value, sch)), UriReferent))
                        rt = ReferentToken(ur, t.next0_.next0_, ut.end_token)
                        UriAnalyzer.__check_detail(rt)
                        kit.embed_token(rt)
                        t = (rt)
                continue
            if (i == 3):
                # Skype identifiers after optional separators.
                t0 = tt.next0_
                while t0 is not None:
                    if (t0.is_char_of(":|") or t0.is_table_control_char or t0.is_hiphen):
                        t0 = t0.next0_
                    else:
                        break
                if (t0 is None):
                    continue
                ut = UriItemToken.attach_skype(t0)
                if (ut is None):
                    continue
                ur = Utils.asObjectOrNull(ad.register_referent(UriReferent._new2717(ut.value.lower(), ("skype" if tok.termin.canonic_text == "SKYPE" else tok.termin.canonic_text))), UriReferent)
                rt = ReferentToken(ur, t, ut.end_token)
                kit.embed_token(rt)
                t = (rt)
                continue
            if (i == 4):
                # ICQ numbers.
                t0 = tt.next0_
                if (t0 is not None and ((t0.is_char(':') or t0.is_hiphen))):
                    t0 = t0.next0_
                if (t0 is None):
                    continue
                ut = UriItemToken.attach_icq_content(t0)
                if (ut is None):
                    continue
                ur = Utils.asObjectOrNull(ad.register_referent(UriReferent._new2717(ut.value, "ICQ")), UriReferent)
                rt = ReferentToken(ur, t, t0)
                kit.embed_token(rt)
                t = (rt)
                continue
            if (i == 5 or i == 6):
                # Bank-account-like numbers (р/с, л/с, ИНН, БИК, ОГРН, IBAN…).
                t0 = tt.next0_
                has_tab_cel = False
                is_iban = False
                first_pass3420 = True
                # Skip filler words ("банк", "руб", "№"…) before the digits.
                while True:
                    if first_pass3420: first_pass3420 = False
                    else: t0 = t0.next0_
                    if (not (t0 is not None)): break
                    if ((((t0.is_value("БАНК", None) or t0.morph.class0_.is_preposition or t0.is_hiphen) or t0.is_char_of(".:") or t0.is_value("РУБЛЬ", None)) or t0.is_value("РУБ", None) or t0.is_value("ДОЛЛАР", None)) or t0.is_value("№", None) or t0.is_value("N", None)):
                        pass
                    elif (t0.is_table_control_char):
                        has_tab_cel = True
                    elif (t0.is_char_of("\\/") and t0.next0_ is not None and t0.next0_.is_value("IBAN", None)):
                        is_iban = True
                        t0 = t0.next0_
                    elif (t0.is_value("IBAN", None)):
                        is_iban = True
                    elif (isinstance(t0, TextToken)):
                        npt = NounPhraseHelper.try_parse(t0, NounPhraseParseAttr.NO, 0, None)
                        if (npt is not None and npt.morph.case_.is_genitive):
                            t0 = npt.end_token
                            continue
                        break
                    else:
                        break
                if (t0 is None):
                    continue
                ur2 = None
                ur2begin = None
                ur2end = None
                t00 = t0
                val = t0.get_source_text()
                if (str.isdigit(val[0]) and ((((i == 6 or tok.termin.canonic_text == "ИНН" or tok.termin.canonic_text == "БИК") or tok.termin.canonic_text == "ОГРН" or tok.termin.canonic_text == "СНИЛС") or tok.termin.canonic_text == "ОКПО"))):
                    if (t0.chars.is_letter):
                        continue
                    if (Utils.isNullOrEmpty(val) or not str.isdigit(val[0])):
                        continue
                    if (t0.length_char < 9):
                        # Glue digit groups split by spaces/hyphens/dots into
                        # one string, then validate its length per code type.
                        tmp = io.StringIO()
                        print(val, end="", file=tmp)
                        ttt = t0.next0_
                        first_pass3421 = True
                        while True:
                            if first_pass3421: first_pass3421 = False
                            else: ttt = ttt.next0_
                            if (not (ttt is not None)): break
                            if (ttt.whitespaces_before_count > 1):
                                break
                            if (isinstance(ttt, NumberToken)):
                                print(ttt.get_source_text(), end="", file=tmp)
                                t0 = ttt
                                continue
                            if (ttt.is_hiphen or ttt.is_char('.')):
                                if (ttt.next0_ is None or not (isinstance(ttt.next0_, NumberToken))):
                                    break
                                if (ttt.is_whitespace_after or ttt.is_whitespace_before):
                                    break
                                continue
                            break
                        val = (None)
                        # Expected lengths: 20 for accounts, 9 for БИК,
                        # 10/12 for ИНН, >=11 for ОГРН/СНИЛС, >=15 for Л/С.
                        if (tmp.tell() == 20):
                            val = Utils.toStringStringIO(tmp)
                        elif (tmp.tell() == 9 and tok.termin.canonic_text == "БИК"):
                            val = Utils.toStringStringIO(tmp)
                        elif (((tmp.tell() == 10 or tmp.tell() == 12)) and tok.termin.canonic_text == "ИНН"):
                            val = Utils.toStringStringIO(tmp)
                        elif (tmp.tell() >= 15 and tok.termin.canonic_text == "Л/С"):
                            val = Utils.toStringStringIO(tmp)
                        elif (tmp.tell() >= 11 and ((tok.termin.canonic_text == "ОГРН" or tok.termin.canonic_text == "СНИЛС"))):
                            val = Utils.toStringStringIO(tmp)
                        elif (tok.termin.canonic_text == "ОКПО"):
                            val = Utils.toStringStringIO(tmp)
                        if (val is None):
                            continue
                elif (not (isinstance(t0, NumberToken))):
                    if ((isinstance(t0, TextToken)) and is_iban):
                        # IBAN: letters + digits, hyphens ignored, max 34 chars.
                        tmp1 = io.StringIO()
                        t1 = None
                        ttt = t0
                        first_pass3422 = True
                        while True:
                            if first_pass3422: first_pass3422 = False
                            else: ttt = ttt.next0_
                            if (not (ttt is not None)): break
                            if (ttt.is_newline_before and ttt != t0):
                                break
                            if (ttt.is_hiphen):
                                continue
                            if (not (isinstance(ttt, NumberToken))):
                                if (not (isinstance(ttt, TextToken)) or not ttt.chars.is_latin_letter):
                                    break
                            print(ttt.get_source_text(), end="", file=tmp1)
                            t1 = ttt
                            if (tmp1.tell() >= 34):
                                break
                        if (tmp1.tell() < 10):
                            continue
                        ur1 = UriReferent._new2717(Utils.toStringStringIO(tmp1), tok.termin.canonic_text)
                        ur1.add_slot(UriReferent.ATTR_DETAIL, "IBAN", False, 0)
                        rt1 = ReferentToken(ad.register_referent(ur1), t, t1)
                        kit.embed_token(rt1)
                        t = (rt1)
                        continue
                    # "р/с ... /к/с ..." pair: scheme repeated after a slash.
                    if (not t0.is_char_of("/\\") or t0.next0_ is None):
                        continue
                    tok2 = UriAnalyzer._m_schemes.try_parse(t0.next0_, TerminParseAttr.NO)
                    if (tok2 is None or not (isinstance(tok2.termin.tag, int)) or (tok2.termin.tag) != i):
                        continue
                    t0 = tok2.end_token.next0_
                    while t0 is not None:
                        if (t0.is_char_of(":N№")):
                            t0 = t0.next0_
                        elif (t0.is_table_control_char):
                            t0 = t0.next0_
                            t00 = t0
                            has_tab_cel = True
                        else:
                            break
                    if (not (isinstance(t0, NumberToken))):
                        continue
                    tmp = io.StringIO()
                    while t0 is not None:
                        if (not (isinstance(t0, NumberToken))):
                            break
                        print(t0.get_source_text(), end="", file=tmp)
                        t0 = t0.next0_
                    if (t0 is None or not t0.is_char_of("/\\,") or not (isinstance(t0.next0_, NumberToken))):
                        continue
                    val = Utils.toStringStringIO(tmp)
                    Utils.setLengthStringIO(tmp, 0)
                    ur2begin = t0.next0_
                    t0 = t0.next0_
                    while t0 is not None:
                        if (not (isinstance(t0, NumberToken))):
                            break
                        if (t0.whitespaces_before_count > 4 and tmp.tell() > 0):
                            break
                        print(t0.get_source_text(), end="", file=tmp)
                        ur2end = t0
                        t0 = t0.next0_
                    ur2 = (Utils.asObjectOrNull(ad.register_referent(UriReferent._new2714(tok2.termin.canonic_text, Utils.toStringStringIO(tmp))), UriReferent))
                if (len(val) < 5):
                    continue
                ur = Utils.asObjectOrNull(ad.register_referent(UriReferent._new2717(val, tok.termin.canonic_text)), UriReferent)
                rt = ReferentToken(ur, t, (t0 if ur2begin is None else ur2begin.previous))
                if (has_tab_cel):
                    rt.begin_token = t00
                if (ur.scheme.startswith("ОК")):
                    UriAnalyzer.__check_detail(rt)
                # Pull a preceding "номер"/"код" label into the span.
                ttt = t.previous
                first_pass3423 = True
                while True:
                    if first_pass3423: first_pass3423 = False
                    else: ttt = ttt.previous
                    if (not (ttt is not None)): break
                    if (ttt.is_table_control_char):
                        break
                    if (ttt.morph.class0_.is_preposition):
                        continue
                    if (ttt.is_value("ОРГАНИЗАЦИЯ", None)):
                        continue
                    if (ttt.is_value("НОМЕР", None) or ttt.is_value("КОД", None)):
                        rt.begin_token = ttt
                        t = rt.begin_token
                    break
                kit.embed_token(rt)
                t = (rt)
                if (ur2 is not None):
                    rt2 = ReferentToken(ur2, ur2begin, ur2end)
                    kit.embed_token(rt2)
                    t = (rt2)
                # Same-length digit sequences after commas: sibling accounts.
                while ((t.next0_ is not None and t.next0_.is_comma_and and (isinstance(t.next0_.next0_, NumberToken))) and t.next0_.next0_.length_char == len(val) and t.next0_.next0_.typ == NumberSpellingType.DIGIT):
                    val2 = t.next0_.next0_.get_source_text()
                    ur2 = UriReferent()
                    ur2.scheme = ur.scheme
                    ur2.value = val2
                    ur2 = (Utils.asObjectOrNull(ad.register_referent(ur2), UriReferent))
                    rt2 = ReferentToken(ur2, t.next0_, t.next0_.next0_)
                    kit.embed_token(rt2)
                    t = (rt2)
                continue
            continue
        if (t.is_char('@')):
            # E-mail: users to the left of '@', domain to the right.
            u1s = UriItemToken.attach_mail_users(t.previous)
            if (u1s is None):
                continue
            u2 = UriItemToken.attach_domain_name(t.next0_, False, True)
            if (u2 is None):
                continue
            for ii in range(len(u1s) - 1, -1, -1):
                ur = Utils.asObjectOrNull(ad.register_referent(UriReferent._new2717("{0}@{1}".format(u1s[ii].value, u2.value).lower(), "mailto")), UriReferent)
                b = u1s[ii].begin_token
                t0 = b.previous
                if (t0 is not None and t0.is_char(':')):
                    t0 = t0.previous
                if (t0 is not None and ii == 0):
                    # Scan left for an "e-mail"/"эл. почта" label to include.
                    br = False
                    ttt = t0
                    first_pass3424 = True
                    while True:
                        if first_pass3424: first_pass3424 = False
                        else: ttt = ttt.previous
                        if (not (ttt is not None)): break
                        if (not (isinstance(ttt, TextToken))):
                            break
                        if (ttt != t0 and ttt.whitespaces_after_count > 1):
                            break
                        if (ttt.is_char(')')):
                            br = True
                            continue
                        if (ttt.is_char('(')):
                            if (not br):
                                break
                            br = False
                            continue
                        if (ttt.is_value("EMAIL", None) or ttt.is_value("MAILTO", None)):
                            b = ttt
                            break
                        if (ttt.is_value("MAIL", None)):
                            b = ttt
                            if ((ttt.previous is not None and ttt.previous.is_hiphen and ttt.previous.previous is not None) and ((ttt.previous.previous.is_value("E", None) or ttt.previous.previous.is_value("Е", None)))):
                                b = ttt.previous.previous
                            break
                        if (ttt.is_value("ПОЧТА", None) or ttt.is_value("АДРЕС", None)):
                            b = t0
                            ttt = ttt.previous
                            if (ttt is not None and ttt.is_char('.')):
                                ttt = ttt.previous
                            if (ttt is not None and ((t0.is_value("ЭЛ", None) or ttt.is_value("ЭЛЕКТРОННЫЙ", None)))):
                                b = ttt
                            if (b.previous is not None and b.previous.is_value("АДРЕС", None)):
                                b = b.previous
                            break
                        if (ttt.morph.class0_.is_preposition):
                            continue
                rt = ReferentToken(ur, b, (u2.end_token if ii == (len(u1s) - 1) else u1s[ii].end_token))
                kit.embed_token(rt)
                t = (rt)
            continue
        if (not t.chars.is_cyrillic_letter):
            # Free-standing URL not introduced by a scheme keyword.
            if (t.is_whitespace_before or ((t.previous is not None and t.previous.is_char_of(",(")))):
                u1 = UriItemToken.attach_url(t)
                if (u1 is not None):
                    if (u1.is_whitespace_after or u1.end_token.next0_ is None or not u1.end_token.next0_.is_char('@')):
                        if (u1.end_token.next0_ is not None and u1.end_token.next0_.is_char_of("\\/")):
                            u2 = UriItemToken.attach_uri_content(t, False)
                            if (u2 is not None):
                                u1 = u2
                        ur = Utils.asObjectOrNull(ad.register_referent(UriReferent._new2714("http", u1.value)), UriReferent)
                        rt = ReferentToken(ur, u1.begin_token, u1.end_token)
                        rt.begin_token = Utils.ifNotNull(UriAnalyzer.__site_before(u1.begin_token.previous), u1.begin_token)
                        kit.embed_token(rt)
                        t = (rt)
                        continue
        if ((isinstance(t, TextToken)) and not t.is_whitespace_after and t.length_char > 2):
            # Domain after a "site"-like word, e.g. "сайт example.com".
            if (UriAnalyzer.__site_before(t.previous) is not None):
                ut = UriItemToken.attach_uri_content(t, True)
                if (ut is None or ut.value.find('.') <= 0 or ut.value.find('@') > 0):
                    continue
                ur = Utils.asObjectOrNull(ad.register_referent(UriReferent._new2714("http", ut.value)), UriReferent)
                rt = ReferentToken(ur, t, ut.end_token)
                rt.begin_token = UriAnalyzer.__site_before(t.previous)
                if (rt.end_token.next0_ is not None and rt.end_token.next0_.is_char_of("/\\")):
                    rt.end_token = rt.end_token.next0_
                kit.embed_token(rt)
                t = (rt)
                continue
            # Lotus-Notes-style "Name/Org" addresses.
            if ((t.chars.is_latin_letter and not t.chars.is_all_lower and t.next0_ is not None) and not t.is_whitespace_after):
                if (t.next0_.is_char('/')):
                    rt = UriAnalyzer.__try_attach_lotus(Utils.asObjectOrNull(t, TextToken))
                    if (rt is not None):
                        rt.referent = ad.register_referent(rt.referent)
                        kit.embed_token(rt)
                        t = (rt)
                        continue
def __try_name_exist(li: typing.List['CityItemToken'], oi: 'IntOntologyItem', always: bool) -> 'ReferentToken':
    """Decide whether the first item of *li* really names a city and, if so,
    build a ``ReferentToken`` wrapping a ``GeoReferent`` for it.

    Plausibility evidence checked: known ontology entry, characteristic
    suffixes ("…ГРАД", "…СК", "…TOWN", "SAN…"), an "in/from" preposition
    before a Latin name, or another GeoReferent nearby in the same sentence.
    Street and person readings veto the city interpretation.

    :param li: parsed city-item candidates; only ``li[0]`` is validated here.
    :param oi: out-parameter — its ``value`` is set to the matched ontology
        item (or None).
    :param always: force acceptance even without positive evidence (may be
        withdrawn if the token looks like a surname).
    :return: the wrapping ReferentToken, or None when rejected.
    """
    oi.value = (None)
    if (li is None or li[0].typ != CityItemToken.ItemType.CITY):
        return None
    oi.value = li[0].onto_item
    tt = Utils.asObjectOrNull(li[0].begin_token, TextToken)
    if (tt is None):
        return None
    ok = False
    nam = (li[0].value if oi.value is None else oi.value.canonic_text)
    if (nam is None):
        return None
    if (nam == "РИМ"):
        # Special-casing: "РИМ" is also a Russian male given name.
        if (tt.term == "РИМ"):
            if ((isinstance(tt.next0_, TextToken)) and tt.next0_.get_morph_class_in_dictionary().is_proper_secname):
                pass
            else:
                ok = True
        elif (tt.previous is not None and tt.previous.is_value("В", None) and tt.term == "РИМЕ"):
            ok = True
    elif (oi.value is not None and oi.value.referent is not None and oi.value.owner.is_ext_ontology):
        ok = True
    elif (nam.endswith("ГРАД") or nam.endswith("СК")):
        ok = True
    elif (nam.endswith("TOWN") or nam.startswith("SAN")):
        ok = True
    elif (li[0].chars.is_latin_letter and li[0].begin_token.previous is not None and ((li[0].begin_token.previous.is_value("IN", None) or li[0].begin_token.previous.is_value("FROM", None)))):
        ok = True
    else:
        # No intrinsic evidence: look right, then left, for a neighbouring
        # geo referent in the same alphabet within the sentence.
        tt2 = li[0].end_token.next0_
        first_pass3150 = True
        while True:
            if first_pass3150: first_pass3150 = False
            else: tt2 = tt2.next0_
            if (not (tt2 is not None)): break
            if (tt2.is_newline_before):
                break
            if ((tt2.is_char_of(",(") or tt2.morph.class0_.is_preposition or tt2.morph.class0_.is_conjunction) or tt2.morph.class0_.is_misc):
                continue
            if ((isinstance(tt2.get_referent(), GeoReferent)) and tt2.chars.is_cyrillic_letter == li[0].chars.is_cyrillic_letter):
                ok = True
            break
        if (not ok):
            tt2 = li[0].begin_token.previous
            first_pass3151 = True
            while True:
                if first_pass3151: first_pass3151 = False
                else: tt2 = tt2.previous
                if (not (tt2 is not None)): break
                if (tt2.is_newline_after):
                    break
                if ((tt2.is_char_of(",)") or tt2.morph.class0_.is_preposition or tt2.morph.class0_.is_conjunction) or tt2.morph.class0_.is_misc):
                    continue
                if ((isinstance(tt2.get_referent(), GeoReferent)) and tt2.chars.is_cyrillic_letter == li[0].chars.is_cyrillic_letter):
                    ok = True
                if (ok):
                    # Veto if the tokens parse at least as well as a street.
                    sits = StreetItemToken.try_parse_list(li[0].begin_token, None, 10)
                    if (sits is not None and len(sits) > 1):
                        ss = StreetDefineHelper._try_parse_street(sits, False, False)
                        if (ss is not None):
                            del sits[0]
                            if (StreetDefineHelper._try_parse_street(sits, False, False) is None):
                                ok = False
                if (ok):
                    # Veto likely person names / adjectives / org attachments.
                    if (len(li) > 1 and li[1].typ == CityItemToken.ItemType.PROPERNAME and (li[1].whitespaces_before_count < 3)):
                        ok = False
                    else:
                        mc = li[0].begin_token.get_morph_class_in_dictionary()
                        if (mc.is_proper_name or mc.is_proper_surname or mc.is_adjective):
                            ok = False
                        else:
                            npt = NounPhraseHelper.try_parse(li[0].begin_token, NounPhraseParseAttr.NO, 0, None)
                            if (npt is not None and npt.end_char > li[0].end_char):
                                ok = False
                    if (AddressItemToken.try_attach_org(li[0].begin_token) is not None):
                        ok = False
                    break
                break
    if (always):
        # Withdraw the forced acceptance if this reads as a surname
        # and a person can be parsed here.
        if (li[0].whitespaces_before_count > 3 and li[0].doubtful and li[0].begin_token.get_morph_class_in_dictionary().is_proper_surname):
            pp = li[0].kit.process_referent("PERSON", li[0].begin_token)
            if (pp is not None):
                always = False
    if (li[0].begin_token.chars.is_latin_letter and li[0].begin_token == li[0].end_token):
        # A short capitalized Latin token after the name (e.g. a state code)
        # suggests this is part of something else.
        tt1 = li[0].end_token.next0_
        if (tt1 is not None and tt1.is_char(',')):
            tt1 = tt1.next0_
        if (((isinstance(tt1, TextToken)) and tt1.chars.is_latin_letter and (tt1.length_char < 3)) and not tt1.chars.is_all_lower):
            ok = False
    if (not ok and not always):
        return None
    city = None
    if (oi.value is not None and (isinstance(oi.value.referent, GeoReferent)) and not oi.value.owner.is_ext_ontology):
        city = (Utils.asObjectOrNull(oi.value.referent.clone(), GeoReferent))
        city.occurrence.clear()
    else:
        city = GeoReferent()
        city._add_name(nam)
        if (oi.value is not None and (isinstance(oi.value.referent, GeoReferent))):
            city._merge_slots2(Utils.asObjectOrNull(oi.value.referent, GeoReferent), li[0].kit.base_language)
        if (not city.is_city):
            city._add_typ_city(li[0].kit.base_language)
    return ReferentToken._new734(city, li[0].begin_token, li[0].end_token, li[0].morph)
def try_attach(t: 'Token') -> 'ParenthesisToken':
    """Try to recognize a parenthetical/introductory construction starting
    at *t* (e.g. adverbial openers, "в соответствии с …", "согласно …",
    "в силу …", "не в … очередь", "ведь").

    :param t: first token of the candidate construction (may be None).
    :return: a ParenthesisToken covering the construction, or None.
    """
    if (t is None):
        return None
    # Fast path: fixed expressions from the terminology dictionary.
    tok = ParenthesisToken.__m_termins.try_parse(t, TerminParseAttr.NO)
    if (tok is not None):
        res = ParenthesisToken(t, tok.end_token)
        return res
    if (not (isinstance(t, TextToken))):
        return None
    mc = t.get_morph_class_in_dictionary()
    ok = False
    t1 = None
    if (mc.is_adverb):
        ok = True
    elif (mc.is_adjective):
        # Comparative qualitative adjective used adverbially.
        if (t.morph.contains_attr("сравн.", None) and t.morph.contains_attr("кач.прил.", None)):
            ok = True
    if (ok and t.next0_ is not None):
        if (t.next0_.is_char(',')):
            return ParenthesisToken(t, t)
        t1 = t.next0_
        if (t1.get_morph_class_in_dictionary() == MorphClass.VERB):
            # Adverb + present-tense imperfective active verb.
            if (t1.morph.contains_attr("н.вр.", None) and t1.morph.contains_attr("нес.в.", None) and t1.morph.contains_attr("дейст.з.", None)):
                return ParenthesisToken(t, t1)
        t1 = (None)
    # Prepositional openers; t1 is set to the token after the opener.
    if ((t.is_value("В", None) and t.next0_ is not None and t.next0_.is_value("СООТВЕТСТВИЕ", None)) and t.next0_.next0_ is not None and t.next0_.next0_.morph.class0_.is_preposition):
        t1 = t.next0_.next0_.next0_
    elif (t.is_value("СОГЛАСНО", None)):
        t1 = t.next0_
    elif (t.is_value("В", None) and t.next0_ is not None):
        if (t.next0_.is_value("СИЛА", None)):
            t1 = t.next0_.next0_
        elif (t.next0_.morph.class0_.is_adjective or t.next0_.morph.class0_.is_pronoun):
            npt = NounPhraseHelper.try_parse(t.next0_, NounPhraseParseAttr.NO, 0, None)
            if (npt is not None):
                if (npt.noun.is_value("ВИД", None) or npt.noun.is_value("СЛУЧАЙ", None) or npt.noun.is_value("СФЕРА", None)):
                    return ParenthesisToken(t, npt.end_token)
    if (t1 is not None):
        if (t1.next0_ is not None):
            # Skip an intervening "норма/положение/указание" noun phrase.
            npt1 = NounPhraseHelper.try_parse(t1, NounPhraseParseAttr.NO, 0, None)
            if (npt1 is not None):
                if (npt1.noun.is_value("НОРМА", None) or npt1.noun.is_value("ПОЛОЖЕНИЕ", None) or npt1.noun.is_value("УКАЗАНИЕ", None)):
                    t1 = npt1.end_token.next0_
        r = t1.get_referent()
        if (r is not None):
            # Opener ends at a referent (law, document, etc.).
            res = ParenthesisToken._new1115(t, t1, r)
            if (t1.next0_ is not None and t1.next0_.is_comma):
                # Extend over a ", вступившего в силу/действие ..." clause.
                sila = False
                ttt = t1.next0_.next0_
                first_pass3133 = True
                while True:
                    if first_pass3133: first_pass3133 = False
                    else: ttt = ttt.next0_
                    if (not (ttt is not None)): break
                    if (ttt.is_value("СИЛА", None) or ttt.is_value("ДЕЙСТВИЕ", None)):
                        sila = True
                        continue
                    if (ttt.is_comma):
                        if (sila):
                            res.end_token = ttt.previous
                        break
                    if (BracketHelper.can_be_start_of_sequence(ttt, False, False)):
                        break
            return res
        npt = NounPhraseHelper.try_parse(t1, NounPhraseParseAttr.NO, 0, None)
        if (npt is not None):
            return ParenthesisToken(t, npt.end_token)
    tt = t
    # NOTE(review): both guards below test for None *after* the value was
    # already used/advanced ("and t is not None" / "and tt is not None"
    # ordering); tt can become None before tt.morph is read. Latent hazard
    # preserved from the original translation — confirm against upstream.
    if (tt.is_value("НЕ", None) and t is not None):
        tt = tt.next0_
    if (tt.morph.class0_.is_preposition and tt is not None):
        tt = tt.next0_
    npt1 = NounPhraseHelper.try_parse(tt, NounPhraseParseAttr.NO, 0, None)
    if (npt1 is not None):
        tt = npt1.end_token
        if (tt.next0_ is not None and tt.next0_.is_comma):
            return ParenthesisToken(t, tt.next0_)
        if (npt1.noun.is_value("ОЧЕРЕДЬ", None)):
            return ParenthesisToken(t, tt)
    if (t.is_value("ВЕДЬ", None)):
        return ParenthesisToken(t, t)
    return None
def try_attach(t: 'Token', must_has_prefix: bool = False) -> 'OrgItemEponymToken':
    """Try to recognize an eponym ("named after ...") fragment of an
    organization name starting at token *t*.

    Covers: "имени"/"им." + person name, memorial dates ("8 МАРТА"),
    anniversaries ("<N>-летия ..."), saints ("св. ..."), geo referents,
    and bare surnames right after "фонд"/"храм"/"церковь".

    :param t: first token of the candidate fragment
    :param must_has_prefix: when True, reject matches that lack an
        explicit "имени"-style prefix
    :return: the recognized OrgItemEponymToken, or None
    """
    from pullenti.ner.org.internal.OrgItemNameToken import OrgItemNameToken
    tt = Utils.asObjectOrNull(t, TextToken)
    if (tt is None):
        # Not a plain text token: try date referents and age numbers.
        if (t is None):
            return None
        r1 = t.get_referent()
        if (r1 is not None and r1.type_name == "DATE"):
            str0_ = str(r1).upper()
            # Well-known memorial dates used directly as organization names.
            if ((str0_ == "1 МАЯ" or str0_ == "7 ОКТЯБРЯ" or str0_ == "9 МАЯ") or str0_ == "8 МАРТА"):
                dt = OrgItemEponymToken._new1797(t, t, list())
                dt.eponyms.append(str0_)
                return dt
        # Age number immediately followed by a capitalized cyrillic word.
        age = NumberHelper.try_parse_age(t)
        if ((age is not None and (((isinstance(age.end_token.next0_, TextToken)) or (isinstance(age.end_token.next0_, ReferentToken)))) and (age.whitespaces_after_count < 3)) and not age.end_token.next0_.chars.is_all_lower and age.end_token.next0_.chars.is_cyrillic_letter):
            dt = OrgItemEponymToken._new1797(t, age.end_token.next0_, list())
            dt.eponyms.append("{0} {1}".format(age.value, dt.end_token.get_source_text().upper()))
            return dt
        return None
    t1 = None          # first token of the eponym body (after the prefix)
    full = False       # True when an explicit "имени"/"им." prefix is present
    has_name = False
    if (tt.term == "ИМЕНИ" or tt.term == "ІМЕНІ"):
        t1 = t.next0_
        full = True
        has_name = True
    elif (((tt.term == "ИМ" or tt.term == "ІМ")) and tt.next0_ is not None):
        if (tt.next0_.is_char('.')):
            # abbreviated prefix "им."
            t1 = tt.next0_.next0_
            full = True
        elif ((isinstance(tt.next0_, TextToken)) and tt.chars.is_all_lower and not tt.next0_.chars.is_all_lower):
            # "им Пушкина" without the dot
            t1 = tt.next0_
            has_name = True
    elif (tt.previous is not None and ((tt.previous.is_value("ФОНД", None) or tt.previous.is_value("ХРАМ", None) or tt.previous.is_value("ЦЕРКОВЬ", "ЦЕРКВА")))):
        # Bare name right after "фонд"/"храм"/"церковь": accept only a single
        # capitalized cyrillic word that is not a standard name component.
        if ((not tt.chars.is_cyrillic_letter or tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction) or not tt.chars.is_letter):
            return None
        if (tt.whitespaces_before_count != 1):
            return None
        if (tt.chars.is_all_lower):
            return None
        if (tt.morph.class0_.is_adjective):
            npt = NounPhraseHelper.try_parse(tt, NounPhraseParseAttr.NO, 0, None)
            if (npt is not None and npt.begin_token != npt.end_token):
                # adjective opening a longer noun phrase - not an eponym
                return None
        na = OrgItemNameToken.try_attach(tt, None, False, True)
        if (na is not None):
            if (na.is_empty_word or na.is_std_name or na.is_std_tail):
                return None
        t1 = (tt)
    if (t1 is None or ((t1.is_newline_before and not full))):
        return None
    if (tt.previous is not None and tt.previous.morph.class0_.is_preposition):
        return None
    if (must_has_prefix and not has_name):
        return None
    r = t1.get_referent()
    # "имени <day+month date>" with no year, e.g. "имени 9 января".
    if ((r is not None and r.type_name == "DATE" and full) and r.find_slot("DAY", None, True) is not None and r.find_slot("YEAR", None, True) is None):
        dt = OrgItemEponymToken._new1797(t, t1, list())
        dt.eponyms.append(str(r).upper())
        return dt
    holy = False
    # Saint prefix: "святой/св./свят." before the name.
    if ((t1.is_value("СВЯТОЙ", None) or t1.is_value("СВЯТИЙ", None) or t1.is_value("СВ", None)) or t1.is_value("СВЯТ", None)):
        t1 = t1.next0_
        holy = True
    if (t1 is not None and t1.is_char('.')):
        t1 = t1.next0_
    if (t1 is None):
        return None
    cl = t1.get_morph_class_in_dictionary()
    if (cl.is_noun or cl.is_adjective):
        # Delegate to the PERSON analyzer; accept only when the match ends
        # exactly on the person's surname.
        rt = t1.kit.process_referent("PERSON", t1)
        if (rt is not None and rt.referent.type_name == "PERSON" and rt.begin_token != rt.end_token):
            e0_ = rt.referent.get_string_value("LASTNAME")
            if (e0_ is not None):
                if (rt.end_token.is_value(e0_, None)):
                    re = OrgItemEponymToken(t, rt.end_token)
                    re.eponyms.append(rt.end_token.get_source_text())
                    return re
    # Anniversary: "<N>-лет(ия) <noun phrase>" ("РОКІВ" for Ukrainian).
    nt = NumberHelper.try_parse_anniversary(t1)
    if (nt is not None and nt.typ == NumberSpellingType.AGE):
        npt = NounPhraseHelper.try_parse(nt.end_token.next0_, NounPhraseParseAttr.NO, 0, None)
        if (npt is not None):
            s = "{0}-{1} {2}".format(nt.value, ("РОКІВ" if t.kit.base_language.is_ua else "ЛЕТ"), MiscHelper.get_text_value(npt.begin_token, npt.end_token, GetTextAttr.NO))
            res = OrgItemEponymToken(t, npt.end_token)
            res.eponyms.append(s)
            return res
    its = OrgItemEponymToken.PersonItemToken.try_attach(t1)
    if (its is None):
        # No person items: a geo referent can still act as an eponym.
        if ((isinstance(t1, ReferentToken)) and (isinstance(t1.get_referent(), GeoReferent))):
            s = MiscHelper.get_text_value(t1, t1, GetTextAttr.NO)
            re = OrgItemEponymToken(t, t1)
            re.eponyms.append(s)
            return re
        return None
    eponims = list()
    i = 0
    j = 0  # NOTE(review): j is never used below - looks vestigial; confirm before removing
    if (its[i].typ == OrgItemEponymToken.PersonItemType.LOCASEWORD):
        i += 1
    if (i >= len(its)):
        return None
    if (not full):
        # Without the explicit prefix a plain adjective is too ambiguous.
        if (its[i].begin_token.morph.class0_.is_adjective and not its[i].begin_token.morph.class0_.is_proper_surname):
            return None
    if (its[i].typ == OrgItemEponymToken.PersonItemType.INITIAL):
        # Pattern: initials + surname, possibly repeated via "и"
        # ("А.С. Пушкина и Н.В. Гоголя").
        i += 1
        while True:
            if ((i < len(its)) and its[i].typ == OrgItemEponymToken.PersonItemType.INITIAL):
                i += 1
            if (i >= len(its) or ((its[i].typ != OrgItemEponymToken.PersonItemType.SURNAME and its[i].typ != OrgItemEponymToken.PersonItemType.NAME))):
                break
            eponims.append(its[i].value)
            t1 = its[i].end_token
            if ((i + 2) >= len(its) or its[i + 1].typ != OrgItemEponymToken.PersonItemType.AND or its[i + 2].typ != OrgItemEponymToken.PersonItemType.INITIAL):
                break
            i += 3
    elif (((i + 1) < len(its)) and its[i].typ == OrgItemEponymToken.PersonItemType.NAME and its[i + 1].typ == OrgItemEponymToken.PersonItemType.SURNAME):
        # Pattern: first name + surname (+ optional "и" + second pair).
        eponims.append(its[i + 1].value)
        t1 = its[i + 1].end_token
        i += 2
        if ((((i + 2) < len(its)) and its[i].typ == OrgItemEponymToken.PersonItemType.AND and its[i + 1].typ == OrgItemEponymToken.PersonItemType.NAME) and its[i + 2].typ == OrgItemEponymToken.PersonItemType.SURNAME):
            eponims.append(its[i + 2].value)
            t1 = its[i + 2].end_token
    elif (its[i].typ == OrgItemEponymToken.PersonItemType.SURNAME):
        # Pattern: surname alone (possibly a two-part surname, or "X и Y").
        if (len(its) == (i + 2) and its[i].chars == its[i + 1].chars):
            # merge a double surname written as two items
            its[i].value += (" " + its[i + 1].value)
            its[i].end_token = its[i + 1].end_token
            del its[i + 1]
        eponims.append(its[i].value)
        if (((i + 1) < len(its)) and its[i + 1].typ == OrgItemEponymToken.PersonItemType.NAME):
            if ((i + 2) == len(its)):
                i += 1
            elif (its[i + 2].typ != OrgItemEponymToken.PersonItemType.SURNAME):
                i += 1
        elif (((i + 1) < len(its)) and its[i + 1].typ == OrgItemEponymToken.PersonItemType.INITIAL):
            if ((i + 2) == len(its)):
                i += 1
            elif (its[i + 2].typ == OrgItemEponymToken.PersonItemType.INITIAL and (i + 3) == len(its)):
                i += 2
        elif (((i + 2) < len(its)) and its[i + 1].typ == OrgItemEponymToken.PersonItemType.AND and its[i + 2].typ == OrgItemEponymToken.PersonItemType.SURNAME):
            # "X и Y": accept the second surname only if its noun phrase
            # is genitive (or case is undefined).
            ok = True
            npt = NounPhraseHelper.try_parse(its[i + 2].begin_token, NounPhraseParseAttr.NO, 0, None)
            if (npt is not None and not npt.morph.case_.is_genitive and not npt.morph.case_.is_undefined):
                ok = False
            if (ok):
                eponims.append(its[i + 2].value)
                i += 2
        t1 = its[i].end_token
    elif (its[i].typ == OrgItemEponymToken.PersonItemType.NAME and holy):
        # Saint's name after the "св." prefix, optionally two words.
        t1 = its[i].end_token
        sec = False
        if (((i + 1) < len(its)) and its[i].chars == its[i + 1].chars and its[i + 1].typ != OrgItemEponymToken.PersonItemType.INITIAL):
            sec = True
            t1 = its[i + 1].end_token
        if (sec):
            eponims.append("СВЯТ.{0} {1}".format(its[i].value, its[i + 1].value))
        else:
            eponims.append("СВЯТ.{0}".format(its[i].value))
    elif (full and (i + 1) == len(its) and ((its[i].typ == OrgItemEponymToken.PersonItemType.NAME or its[i].typ == OrgItemEponymToken.PersonItemType.SURNAME))):
        # With the explicit prefix a single name/surname item suffices.
        t1 = its[i].end_token
        eponims.append(its[i].value)
    elif ((its[i].typ == OrgItemEponymToken.PersonItemType.NAME and len(its) == 3 and its[i + 1].typ == OrgItemEponymToken.PersonItemType.NAME) and its[i + 2].typ == OrgItemEponymToken.PersonItemType.SURNAME):
        # Full triple: first name + patronymic + surname.
        t1 = its[i + 2].end_token
        eponims.append("{0} {1} {2}".format(its[i].value, its[i + 1].value, its[i + 2].value))
        i += 2
    if (len(eponims) == 0):
        return None
    return OrgItemEponymToken._new1797(t, t1, eponims)