def tryAttachToExist(t: 'Token', p1: 'InstrumentParticipant', p2: 'InstrumentParticipant') -> 'ReferentToken':
    """Try to bind the token span starting at t to one of two already-known
    document participants (p1/p2).

    Matching is attempted two ways:
      * via a referent (person / geo / organization) already attached at t
        that one of the participants contains;
      * via a "pure" ParticipantToken whose type equals a participant's type.

    Returns a ReferentToken covering the matched span (possibly extended over
    a following ':' requisites block and over trailing organization referents
    that are already slots of the matched participant), or None.
    """
    # NOTE: removed a leftover debugging trap from the original
    # ("if t.begin_char >= 7674 and t.begin_char < 7680: pass") - it was a
    # dead no-op used as a breakpoint anchor.
    if (t is None):
        return None
    pp = ParticipantToken.tryAttach(t, p1, p2, False)
    p = None
    rt = None
    if (pp is None or pp.kind != ParticipantToken.Kinds.PURE):
        # No pure participant keyword here: try to match a referent already
        # extracted at t against the participants' contained references.
        pers = t.getReferent()
        if ((isinstance(pers, PersonReferent)) or (isinstance(pers, GeoReferent)) or (isinstance(pers, OrganizationReferent))):
            if (p1 is not None and p1._containsRef(pers)):
                p = p1
            elif (p2 is not None and p2._containsRef(pers)):
                p = p2
        if (p is not None):
            rt = ReferentToken(p, t, t)
    else:
        # Pure participant keyword: match participants by type.
        if (p1 is not None and ParticipantToken.__isTypesEqual(pp.typ, p1.typ)):
            p = p1
        elif (p2 is not None and ParticipantToken.__isTypesEqual(pp.typ, p2.typ)):
            p = p2
        if (p is not None):
            rt = ReferentToken(p, pp.begin_token, pp.end_token)
            # Include a leading "ОТ" ("from") preposition in the span.
            if (rt.begin_token.previous is not None and rt.begin_token.previous.isValue("ОТ", None)):
                rt.begin_token = rt.begin_token.previous
    if (rt is None):
        return None
    if (rt.end_token.next0_ is not None and rt.end_token.next0_.isChar(':')):
        # "participant:" may be followed by requisites; prefer that longer match.
        rt1 = ParticipantToken.tryAttachRequisites(rt.end_token.next0_.next0_, p, (p2 if p == p1 else p1), False)
        if (rt1 is not None):
            rt1.begin_token = rt.begin_token
            return rt1
        rt.end_token = rt.end_token.next0_
    # Absorb following organization referents that the matched participant
    # already references in one of its slots.
    while rt.end_token.next0_ is not None and (isinstance(rt.end_token.next0_.getReferent(), OrganizationReferent)):
        org0_ = Utils.asObjectOrNull(rt.end_token.next0_.getReferent(), OrganizationReferent)
        if (rt.referent.findSlot(None, org0_, True) is not None):
            rt.end_token = rt.end_token.next0_
            continue
        break
    return rt
def process_ontology_item(self, begin: 'Token') -> 'ReferentToken':
    """Parse one ontology line into a GoodAttributeReferent.

    An optional leading latin keyword (KEYWORD / CHARACTER / PROPER / MODEL)
    sets the attribute type and is consumed; the remainder is one or more
    ';'-separated values, each stored as an ATTR_VALUE slot.

    Args:
        begin: first token of the ontology item (may be None).

    Returns:
        ReferentToken over the parsed span, or None for empty input.
    """
    if (begin is None):
        return None
    ga = GoodAttributeReferent()
    if (begin.chars.is_latin_letter):
        # A latin first token may be a type marker; consume it if recognized.
        if (begin.is_value("KEYWORD", None)):
            ga.typ = GoodAttrType.KEYWORD
            begin = begin.next0_
        elif (begin.is_value("CHARACTER", None)):
            ga.typ = GoodAttrType.CHARACTER
            begin = begin.next0_
        elif (begin.is_value("PROPER", None)):
            ga.typ = GoodAttrType.PROPER
            begin = begin.next0_
        elif (begin.is_value("MODEL", None)):
            ga.typ = GoodAttrType.MODEL
            begin = begin.next0_
    if (begin is None):
        return None
    res = ReferentToken(ga, begin, begin)
    t = begin
    while t is not None:
        if (t.is_char(';')):
            # Close the current value and start a new one after ';'.
            ga.add_slot(GoodAttributeReferent.ATTR_VALUE, MiscHelper.get_text_value(begin, t.previous, GetTextAttr.NO), False, 0)
            begin = t.next0_
        else:
            res.end_token = t
        t = t.next0_
    # FIX: when the text ends with ';' the loop leaves begin = None; the
    # original then crashed with AttributeError on begin.begin_char.
    if (begin is not None):
        if (res.end_char > begin.begin_char):
            # Flush the trailing (unterminated) value.
            ga.add_slot(GoodAttributeReferent.ATTR_VALUE, MiscHelper.get_text_value(begin, res.end_token, GetTextAttr.NO), False, 0)
        if (ga.typ == GoodAttrType.UNDEFINED):
            # No explicit marker: capitalized values are treated as proper names.
            if (not begin.chars.is_all_lower):
                ga.typ = GoodAttrType.PROPER
    return res
def process(self, kit : 'AnalysisKit') -> None:
    """Run URI extraction over the kit's token chain.

    Recognizes scheme-prefixed URIs (http://... etc.), web addresses,
    standard codes (ISBN, RFC/ISO/ГОСТ/ТУ, ОК*-classifiers), bank/account
    requisites (ИНН, БИК, ОГРН, СНИЛС, account pairs split by '/'), Skype
    and ICQ identifiers, e-mail addresses and bare URLs. Each hit is
    registered via the analyzer data and embedded back into the chain as a
    ReferentToken.

    Args:
        kit: analysis kit whose token chain is scanned and mutated in place.
    """
    ad = kit.getAnalyzerData(self)
    t = kit.first_token
    # Converter-generated do-while: first_pass skips the advance once.
    first_pass3149 = True
    while True:
        if first_pass3149: first_pass3149 = False
        else: t = t.next0_
        if (not (t is not None)): break
        tt = t
        tok = UriAnalyzer.__m_schemes.tryParse(t, TerminParseAttr.NO)
        if (tok is not None):
            # tok.termin.tag appears to encode the scheme category
            # (0 = scheme://..., 10 = scheme:..., 2 = www-style site,
            # 1 = standards/codes, 3 = skype, 4 = ICQ, 5/6 = requisites) -
            # inferred from the branches below; TODO confirm against the
            # scheme dictionary initialization.
            i = (tok.termin.tag)
            tt = tok.end_token
            # Handle "SCHEME (SCHEME)" duplication in brackets.
            if (tt.next0_ is not None and tt.next0_.isChar('(')):
                tok1 = UriAnalyzer.__m_schemes.tryParse(tt.next0_.next0_, TerminParseAttr.NO)
                if ((tok1 is not None and tok1.termin.canonic_text == tok.termin.canonic_text and tok1.end_token.next0_ is not None) and tok1.end_token.next0_.isChar(')')):
                    tt = tok1.end_token.next0_
            if (i == 0):
                # Full "scheme://content" form.
                if ((tt.next0_ is None or ((not tt.next0_.isCharOf(":|") and not tt.is_table_control_char)) or tt.next0_.is_whitespace_before) or tt.next0_.whitespaces_after_count > 2):
                    continue
                t1 = tt.next0_.next0_
                # Skip the slashes after "scheme:".
                while t1 is not None and t1.isCharOf("/\\"):
                    t1 = t1.next0_
                if (t1 is None or t1.whitespaces_before_count > 2):
                    continue
                ut = UriItemToken.attachUriContent(t1, False)
                if (ut is None):
                    continue
                ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2557(tok.termin.canonic_text.lower(), ut.value)), UriReferent)
                rt = ReferentToken(ad.registerReferent(ur), t, ut.end_token)
                rt.begin_token = Utils.ifNotNull(UriAnalyzer.__siteBefore(t.previous), t)
                if (rt.end_token.next0_ is not None and rt.end_token.next0_.isCharOf("/\\")):
                    rt.end_token = rt.end_token.next0_
                kit.embedToken(rt)
                t = (rt)
                continue
            if (i == 10):
                # "scheme:" followed by optional slashes and optional "www.".
                tt = tt.next0_
                if (tt is None or not tt.isChar(':')):
                    continue
                tt = tt.next0_
                while tt is not None:
                    if (tt.isCharOf("/\\")):
                        pass
                    else:
                        break
                    tt = tt.next0_
                if (tt is None):
                    continue
                if (tt.isValue("WWW", None) and tt.next0_ is not None and tt.next0_.isChar('.')):
                    tt = tt.next0_.next0_
                if (tt is None or tt.is_newline_before):
                    continue
                ut = UriItemToken.attachUriContent(tt, True)
                if (ut is None):
                    continue
                if (len(ut.value) < 4):
                    continue
                ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2557(tok.termin.canonic_text.lower(), ut.value)), UriReferent)
                rt = ReferentToken(ad.registerReferent(ur), t, ut.end_token)
                rt.begin_token = Utils.ifNotNull(UriAnalyzer.__siteBefore(t.previous), t)
                if (rt.end_token.next0_ is not None and rt.end_token.next0_.isCharOf("/\\")):
                    rt.end_token = rt.end_token.next0_
                kit.embedToken(rt)
                t = (rt)
                continue
            if (i == 2):
                # "www.something" style site - registered with http scheme.
                if (tt.next0_ is None or not tt.next0_.isChar('.') or tt.next0_.is_whitespace_before):
                    continue
                if (tt.next0_.is_whitespace_after and tok.termin.canonic_text != "WWW"):
                    continue
                ut = UriItemToken.attachUriContent(tt.next0_.next0_, True)
                if (ut is None):
                    continue
                ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2557("http", ut.value)), UriReferent)
                rt = ReferentToken(ur, t, ut.end_token)
                rt.begin_token = Utils.ifNotNull(UriAnalyzer.__siteBefore(t.previous), t)
                if (rt.end_token.next0_ is not None and rt.end_token.next0_.isCharOf("/\\")):
                    rt.end_token = rt.end_token.next0_
                kit.embedToken(rt)
                t = (rt)
                continue
            if (i == 1):
                # Standard/code identifiers: ISBN, RFC/ISO/ОКФС/ОКОПФ, ГОСТ, ТУ, BBK-like.
                sch = tok.termin.canonic_text
                ut = None
                if (sch == "ISBN"):
                    ut = UriItemToken.attachISBN(tt.next0_)
                    if ((ut is None and t.previous is not None and t.previous.isChar('(')) and t.next0_ is not None and t.next0_.isChar(')')):
                        # "(ISBN)" after the number: scan backwards for the value.
                        tt0 = t.previous.previous
                        while tt0 is not None:
                            if (tt0.whitespaces_after_count > 2):
                                break
                            if (tt0.is_whitespace_before):
                                ut = UriItemToken.attachISBN(tt0)
                                if (ut is not None and ut.end_token.next0_ != t.previous):
                                    ut = (None)
                                break
                            tt0 = tt0.previous
                elif ((sch == "RFC" or sch == "ISO" or sch == "ОКФС") or sch == "ОКОПФ"):
                    ut = UriItemToken.attachISOContent(tt.next0_, ":")
                elif (sch == "ГОСТ"):
                    ut = UriItemToken.attachISOContent(tt.next0_, "-.")
                elif (sch == "ТУ"):
                    if (tok.chars.is_all_upper):
                        ut = UriItemToken.attachISOContent(tt.next0_, "-.")
                        if (ut is not None and (ut.length_char < 10)):
                            ut = (None)
                else:
                    ut = UriItemToken.attachBBK(tt.next0_)
                if (ut is None):
                    continue
                ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2560(ut.value, sch)), UriReferent)
                if (ut.begin_char < t.begin_char):
                    # Value found before the scheme keyword.
                    rt = ReferentToken(ur, ut.begin_token, t)
                    if (t.next0_ is not None and t.next0_.isChar(')')):
                        rt.end_token = t.next0_
                else:
                    rt = ReferentToken(ur, t, ut.end_token)
                if (t.previous is not None and t.previous.isValue("КОД", None)):
                    rt.begin_token = t.previous
                if (ur.scheme.startswith("ОК")):
                    UriAnalyzer.__checkDetail(rt)
                kit.embedToken(rt)
                t = (rt)
                if (ur.scheme.startswith("ОК")):
                    # Comma-separated list of additional classifier codes.
                    while t.next0_ is not None:
                        if (t.next0_.is_comma_and and (isinstance(t.next0_.next0_, NumberToken))):
                            pass
                        else:
                            break
                        ut = UriItemToken.attachBBK(t.next0_.next0_)
                        if (ut is None):
                            break
                        ur = (Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2560(ut.value, sch)), UriReferent))
                        rt = ReferentToken(ur, t.next0_.next0_, ut.end_token)
                        UriAnalyzer.__checkDetail(rt)
                        kit.embedToken(rt)
                        t = (rt)
                continue
            if (i == 3):
                # Skype identifier.
                t0 = tt.next0_
                while t0 is not None:
                    if (t0.isCharOf(":|") or t0.is_table_control_char or t0.is_hiphen):
                        t0 = t0.next0_
                    else:
                        break
                if (t0 is None):
                    continue
                ut = UriItemToken.attachSkype(t0)
                if (ut is None):
                    continue
                ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2560(ut.value.lower(), ("skype" if tok.termin.canonic_text == "SKYPE" else tok.termin.canonic_text))), UriReferent)
                rt = ReferentToken(ur, t, ut.end_token)
                kit.embedToken(rt)
                t = (rt)
                continue
            if (i == 4):
                # ICQ number.
                t0 = tt.next0_
                if (t0 is not None and ((t0.isChar(':') or t0.is_hiphen))):
                    t0 = t0.next0_
                if (t0 is None):
                    continue
                ut = UriItemToken.attachIcqContent(t0)
                if (ut is None):
                    continue
                ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2560(ut.value, "ICQ")), UriReferent)
                rt = ReferentToken(ur, t, t0)
                kit.embedToken(rt)
                t = (rt)
                continue
            if (i == 5 or i == 6):
                # Bank/organization requisites (accounts, ИНН, БИК, ОГРН, СНИЛС, ОКПО, IBAN).
                t0 = tt.next0_
                has_tab_cel = False
                is_iban = False
                # Skip filler words ("БАНК", prepositions, currency words, '№')
                # up to the actual value.
                first_pass3150 = True
                while True:
                    if first_pass3150: first_pass3150 = False
                    else: t0 = t0.next0_
                    if (not (t0 is not None)): break
                    if ((((t0.isValue("БАНК", None) or t0.morph.class0_.is_preposition or t0.is_hiphen) or t0.isCharOf(".:") or t0.isValue("РУБЛЬ", None)) or t0.isValue("РУБ", None) or t0.isValue("ДОЛЛАР", None)) or t0.isValue("№", None) or t0.isValue("N", None)):
                        pass
                    elif (t0.is_table_control_char):
                        has_tab_cel = True
                    elif (t0.isCharOf("\\/") and t0.next0_ is not None and t0.next0_.isValue("IBAN", None)):
                        is_iban = True
                        t0 = t0.next0_
                    elif (t0.isValue("IBAN", None)):
                        is_iban = True
                    elif (isinstance(t0, TextToken)):
                        npt = NounPhraseHelper.tryParse(t0, NounPhraseParseAttr.NO, 0)
                        if (npt is not None and npt.morph.case_.is_genitive):
                            t0 = npt.end_token
                            continue
                        break
                    else:
                        break
                if (t0 is None):
                    continue
                ur2 = None
                ur2begin = None
                ur2end = None
                t00 = t0
                val = t0.getSourceText()
                if (str.isdigit(val[0]) and ((((i == 6 or tok.termin.canonic_text == "ИНН" or tok.termin.canonic_text == "БИК") or tok.termin.canonic_text == "ОГРН" or tok.termin.canonic_text == "СНИЛС") or tok.termin.canonic_text == "ОКПО"))):
                    if (t0.chars.is_letter):
                        continue
                    if (Utils.isNullOrEmpty(val) or not str.isdigit(val[0])):
                        continue
                    if (t0.length_char < 9):
                        # Value split over several number tokens: glue digits and
                        # validate the total length against the code kind.
                        tmp = io.StringIO()
                        print(val, end="", file=tmp)
                        ttt = t0.next0_
                        first_pass3151 = True
                        while True:
                            if first_pass3151: first_pass3151 = False
                            else: ttt = ttt.next0_
                            if (not (ttt is not None)): break
                            if (ttt.whitespaces_before_count > 1):
                                break
                            if (isinstance(ttt, NumberToken)):
                                print(ttt.getSourceText(), end="", file=tmp)
                                t0 = ttt
                                continue
                            if (ttt.is_hiphen or ttt.isChar('.')):
                                if (ttt.next0_ is None or not ((isinstance(ttt.next0_, NumberToken)))):
                                    break
                                if (ttt.is_whitespace_after or ttt.is_whitespace_before):
                                    break
                                continue
                            break
                        val = (None)
                        # Expected digit counts: 20 = account, 9 = БИК,
                        # 10/12 = ИНН, >=15 = Л/С, >=11 = ОГРН/СНИЛС.
                        if (tmp.tell() == 20):
                            val = Utils.toStringStringIO(tmp)
                        elif (tmp.tell() == 9 and tok.termin.canonic_text == "БИК"):
                            val = Utils.toStringStringIO(tmp)
                        elif (((tmp.tell() == 10 or tmp.tell() == 12)) and tok.termin.canonic_text == "ИНН"):
                            val = Utils.toStringStringIO(tmp)
                        elif (tmp.tell() >= 15 and tok.termin.canonic_text == "Л/С"):
                            val = Utils.toStringStringIO(tmp)
                        elif (tmp.tell() >= 11 and ((tok.termin.canonic_text == "ОГРН" or tok.termin.canonic_text == "СНИЛС"))):
                            val = Utils.toStringStringIO(tmp)
                        elif (tok.termin.canonic_text == "ОКПО"):
                            val = Utils.toStringStringIO(tmp)
                        if (val is None):
                            continue
                elif (not ((isinstance(t0, NumberToken)))):
                    if ((isinstance(t0, TextToken)) and is_iban):
                        # IBAN: mixed latin letters and digits, up to 34 chars.
                        tmp1 = io.StringIO()
                        t1 = None
                        ttt = t0
                        first_pass3152 = True
                        while True:
                            if first_pass3152: first_pass3152 = False
                            else: ttt = ttt.next0_
                            if (not (ttt is not None)): break
                            if (ttt.is_newline_before and ttt != t0):
                                break
                            if (ttt.is_hiphen):
                                continue
                            if (not ((isinstance(ttt, NumberToken)))):
                                if (not ((isinstance(ttt, TextToken))) or not ttt.chars.is_latin_letter):
                                    break
                            print(ttt.getSourceText(), end="", file=tmp1)
                            t1 = ttt
                            if (tmp1.tell() >= 34):
                                break
                        if (tmp1.tell() < 10):
                            continue
                        ur1 = UriReferent._new2560(Utils.toStringStringIO(tmp1), tok.termin.canonic_text)
                        ur1.addSlot(UriReferent.ATTR_DETAIL, "IBAN", False, 0)
                        rt1 = ReferentToken(ad.registerReferent(ur1), t, t1)
                        kit.embedToken(rt1)
                        t = (rt1)
                        continue
                    # "X / Y" form: a second scheme keyword of the same kind
                    # after a slash (paired requisites, e.g. р/с / к/с).
                    if (not t0.isCharOf("/\\") or t0.next0_ is None):
                        continue
                    tok2 = UriAnalyzer.__m_schemes.tryParse(t0.next0_, TerminParseAttr.NO)
                    if (tok2 is None or not ((isinstance(tok2.termin.tag, int))) or (tok2.termin.tag) != i):
                        continue
                    t0 = tok2.end_token.next0_
                    while t0 is not None:
                        if (t0.isCharOf(":N№")):
                            t0 = t0.next0_
                        elif (t0.is_table_control_char):
                            t0 = t0.next0_
                            t00 = t0
                            has_tab_cel = True
                        else:
                            break
                    if (not ((isinstance(t0, NumberToken)))):
                        continue
                    # First number sequence = value of the first requisite.
                    tmp = io.StringIO()
                    while t0 is not None:
                        if (not ((isinstance(t0, NumberToken)))):
                            break
                        print(t0.getSourceText(), end="", file=tmp)
                        t0 = t0.next0_
                    if (t0 is None or not t0.isCharOf("/\\,") or not ((isinstance(t0.next0_, NumberToken)))):
                        continue
                    val = Utils.toStringStringIO(tmp)
                    Utils.setLengthStringIO(tmp, 0)
                    # Second number sequence = value of the second requisite.
                    ur2begin = t0.next0_
                    t0 = t0.next0_
                    while t0 is not None:
                        if (not ((isinstance(t0, NumberToken)))):
                            break
                        if (t0.whitespaces_before_count > 4 and tmp.tell() > 0):
                            break
                        print(t0.getSourceText(), end="", file=tmp)
                        ur2end = t0
                        t0 = t0.next0_
                    ur2 = (Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2557(tok2.termin.canonic_text, Utils.toStringStringIO(tmp))), UriReferent))
                if (len(val) < 5):
                    continue
                ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2560(val, tok.termin.canonic_text)), UriReferent)
                rt = ReferentToken(ur, t, (t0 if ur2begin is None else ur2begin.previous))
                if (has_tab_cel):
                    rt.begin_token = t00
                if (ur.scheme.startswith("ОК")):
                    UriAnalyzer.__checkDetail(rt)
                # Extend the span backwards over "НОМЕР"/"КОД" (+ fillers).
                ttt = t.previous
                first_pass3153 = True
                while True:
                    if first_pass3153: first_pass3153 = False
                    else: ttt = ttt.previous
                    if (not (ttt is not None)): break
                    if (ttt.is_table_control_char):
                        break
                    if (ttt.morph.class0_.is_preposition):
                        continue
                    if (ttt.isValue("ОРГАНИЗАЦИЯ", None)):
                        continue
                    if (ttt.isValue("НОМЕР", None) or ttt.isValue("КОД", None)):
                        rt.begin_token = ttt
                        t = rt.begin_token
                    break
                kit.embedToken(rt)
                t = (rt)
                if (ur2 is not None):
                    rt2 = ReferentToken(ur2, ur2begin, ur2end)
                    kit.embedToken(rt2)
                    t = (rt2)
                continue
            continue
        if (t.isChar('@')):
            # E-mail: users before '@', domain after; possibly several users
            # sharing one domain ("a, b@dom.com").
            u1s = UriItemToken.attachMailUsers(t.previous)
            if (u1s is None):
                continue
            u2 = UriItemToken.attachDomainName(t.next0_, False, True)
            if (u2 is None):
                continue
            for ii in range(len(u1s) - 1, -1, -1):
                ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2560("{0}@{1}".format(u1s[ii].value, u2.value).lower(), "mailto")), UriReferent)
                b = u1s[ii].begin_token
                t0 = b.previous
                if (t0 is not None and t0.isChar(':')):
                    t0 = t0.previous
                if (t0 is not None and ii == 0):
                    # For the first user, scan backwards for an introducing
                    # label ("EMAIL", "E-MAIL", "ЭЛ. ПОЧТА", "АДРЕС", ...).
                    br = False
                    ttt = t0
                    first_pass3154 = True
                    while True:
                        if first_pass3154: first_pass3154 = False
                        else: ttt = ttt.previous
                        if (not (ttt is not None)): break
                        if (not ((isinstance(ttt, TextToken)))):
                            break
                        if (ttt != t0 and ttt.whitespaces_after_count > 1):
                            break
                        if (ttt.isChar(')')):
                            br = True
                            continue
                        if (ttt.isChar('(')):
                            if (not br):
                                break
                            br = False
                            continue
                        if (ttt.isValue("EMAIL", None) or ttt.isValue("MAILTO", None)):
                            b = ttt
                            break
                        if (ttt.isValue("MAIL", None)):
                            b = ttt
                            if ((ttt.previous is not None and ttt.previous.is_hiphen and ttt.previous.previous is not None) and ((ttt.previous.previous.isValue("E", None) or ttt.previous.previous.isValue("Е", None)))):
                                b = ttt.previous.previous
                            break
                        if (ttt.isValue("ПОЧТА", None) or ttt.isValue("АДРЕС", None)):
                            b = t0
                            ttt = ttt.previous
                            if (ttt is not None and ttt.isChar('.')):
                                ttt = ttt.previous
                            # NOTE(review): 't0.isValue("ЭЛ", ...)' here (not
                            # ttt) looks suspicious - verify against intent.
                            if (ttt is not None and ((t0.isValue("ЭЛ", None) or ttt.isValue("ЭЛЕКТРОННЫЙ", None)))):
                                b = ttt
                            if (b.previous is not None and b.previous.isValue("АДРЕС", None)):
                                b = b.previous
                            break
                        if (ttt.morph.class0_.is_preposition):
                            continue
                rt = ReferentToken(ur, b, (u2.end_token if ii == (len(u1s) - 1) else u1s[ii].end_token))
                kit.embedToken(rt)
                t = (rt)
            continue
        if (not t.morph.language.is_cyrillic):
            # Bare URL without an explicit scheme keyword.
            if (t.is_whitespace_before or ((t.previous is not None and t.previous.isCharOf(",(")))):
                u1 = UriItemToken.attachUrl(t)
                if (u1 is not None):
                    if (u1.is_whitespace_after or u1.end_token.next0_ is None or not u1.end_token.next0_.isChar('@')):
                        ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2557("http", u1.value)), UriReferent)
                        rt = ReferentToken(ur, u1.begin_token, u1.end_token)
                        rt.begin_token = Utils.ifNotNull(UriAnalyzer.__siteBefore(u1.begin_token.previous), u1.begin_token)
                        kit.embedToken(rt)
                        t = (rt)
                        continue
        if ((isinstance(t, TextToken)) and not t.is_whitespace_after and t.length_char > 2):
            if (UriAnalyzer.__siteBefore(t.previous) is not None):
                # "сайт ..." style introduction before a plain domain token.
                ut = UriItemToken.attachUriContent(t, True)
                if (ut is None or ut.value.find('.') <= 0 or ut.value.find('@') > 0):
                    continue
                ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2557("http", ut.value)), UriReferent)
                rt = ReferentToken(ur, t, ut.end_token)
                rt.begin_token = UriAnalyzer.__siteBefore(t.previous)
                if (rt.end_token.next0_ is not None and rt.end_token.next0_.isCharOf("/\\")):
                    rt.end_token = rt.end_token.next0_
                kit.embedToken(rt)
                t = (rt)
                continue
            if ((t.chars.is_latin_letter and not t.chars.is_all_lower and t.next0_ is not None) and not t.is_whitespace_after):
                if (t.next0_.isChar('/')):
                    # Lotus-Notes style address "Name/..." .
                    rt = UriAnalyzer.__TryAttachLotus(Utils.asObjectOrNull(t, TextToken))
                    if (rt is not None):
                        rt.referent = ad.registerReferent(rt.referent)
                        kit.embedToken(rt)
                        t = (rt)
                        continue
def try_attach_territory(li: typing.List['TerrItemToken'], ad: 'AnalyzerData', attach_always: bool=False, cits: typing.List['CityItemToken']=None, exists: typing.List['GeoReferent']=None) -> 'ReferentToken':
    """Try to build a territory GeoReferent from a sequence of TerrItemTokens.

    The tokens are classified into an ontology hit (ex_obj), a type noun
    ("область", "район", ...), adjectives, and/or a new proper name; the
    combination decides whether a GeoReferent is produced.

    Args:
        li: parsed territory item tokens (must be non-empty).
        ad: analyzer data used to register/lookup referents.
        attach_always: relax several plausibility checks when True.
        cits: optional city items seen just before (influence plausibility).
        exists: optional already-known GeoReferents used to confirm doubtful names.

    Returns:
        ReferentToken with the new/merged GeoReferent, or None.
    """
    if (li is None or len(li) == 0):
        return None
    ex_obj = None
    new_name = None
    adj_list = list()
    noun = None
    add_noun = None
    # Special cases first: Moscow administrative okrugs and pure "территория ...".
    rt = TerrAttachHelper.__try_attach_moscowao(li, ad)
    if (rt is not None):
        return rt
    if (li[0].termin_item is not None and li[0].termin_item.canonic_text == "ТЕРРИТОРИЯ"):
        res2 = TerrAttachHelper.__try_attach_pure_terr(li, ad)
        return res2
    if (len(li) == 2):
        # Railway ("РЖД") + direction pair, in either order.
        if (li[0].rzd is not None and li[1].rzd_dir is not None):
            rzd = GeoReferent()
            rzd._add_name(li[1].rzd_dir)
            rzd._add_typ_ter(li[0].kit.base_language)
            rzd.add_slot(GeoReferent.ATTR_REF, li[0].rzd.referent, False, 0)
            rzd.add_ext_referent(li[0].rzd)
            return ReferentToken(rzd, li[0].begin_token, li[1].end_token)
        if (li[1].rzd is not None and li[0].rzd_dir is not None):
            rzd = GeoReferent()
            rzd._add_name(li[0].rzd_dir)
            rzd._add_typ_ter(li[0].kit.base_language)
            rzd.add_slot(GeoReferent.ATTR_REF, li[1].rzd.referent, False, 0)
            rzd.add_ext_referent(li[1].rzd)
            return ReferentToken(rzd, li[0].begin_token, li[1].end_token)
    can_be_city_before = False
    adj_terr_before = False
    if (cits is not None):
        if (cits[0].typ == CityItemToken.ItemType.CITY):
            can_be_city_before = True
        elif (cits[0].typ == CityItemToken.ItemType.NOUN and len(cits) > 1):
            can_be_city_before = True
    # Classification pass over li: fills ex_obj / noun / adj_list / new_name.
    k = 0
    k = 0
    # (duplicate assignment kept from the generated source)
    while k < len(li):
        if (li[k].onto_item is not None):
            if (ex_obj is not None or new_name is not None):
                break
            if (noun is not None):
                if (k == 1):
                    # "<noun> <known object>": accept only in plausible contexts.
                    if (noun.termin_item.canonic_text == "РАЙОН" or noun.termin_item.canonic_text == "ОБЛАСТЬ" or noun.termin_item.canonic_text == "СОЮЗ"):
                        if (isinstance(li[k].onto_item.referent, GeoReferent)):
                            if (li[k].onto_item.referent.is_state):
                                break
                        ok = False
                        tt = li[k].end_token.next0_
                        if (tt is None):
                            ok = True
                        elif (tt.is_char_of(",.")):
                            ok = True
                        if (not ok):
                            ok = MiscLocationHelper.check_geo_object_before(li[0].begin_token)
                        if (not ok):
                            adr = AddressItemToken.try_parse(tt, None, False, False, None)
                            if (adr is not None):
                                if (adr.typ == AddressItemToken.ItemType.STREET):
                                    ok = True
                        if (not ok):
                            break
                if (li[k].onto_item is not None):
                    # "МО"/"ЛО" abbreviations are too ambiguous before a known object.
                    if (noun.begin_token.is_value("МО", None) or noun.begin_token.is_value("ЛО", None)):
                        return None
            ex_obj = li[k]
        elif (li[k].termin_item is not None):
            if (noun is not None):
                break
            if (li[k].termin_item.is_always_prefix and k > 0):
                break
            if (k > 0 and li[k].is_doubt):
                if (li[k].begin_token == li[k].end_token and li[k].begin_token.is_value("ЗАО", None)):
                    break
            if (li[k].termin_item.is_adjective or li[k].is_geo_in_dictionary):
                adj_list.append(li[k])
            else:
                if (ex_obj is not None):
                    # A type noun after an ontology hit: check compatibility.
                    geo_ = Utils.asObjectOrNull(ex_obj.onto_item.referent, GeoReferent)
                    if (geo_ is None):
                        break
                    if (ex_obj.is_adjective and ((li[k].termin_item.canonic_text == "СОЮЗ" or li[k].termin_item.canonic_text == "ФЕДЕРАЦИЯ"))):
                        str0_ = str(ex_obj.onto_item)
                        if (not li[k].termin_item.canonic_text in str0_):
                            return None
                    if (li[k].termin_item.canonic_text == "РАЙОН" or li[k].termin_item.canonic_text == "ОКРУГ" or li[k].termin_item.canonic_text == "КРАЙ"):
                        tmp = io.StringIO()
                        for s in geo_.slots:
                            if (s.type_name == GeoReferent.ATTR_TYPE):
                                print("{0};".format(s.value), end="", file=tmp, flush=True)
                        if (not li[k].termin_item.canonic_text in Utils.toStringStringIO(tmp).upper()):
                            # Type mismatch: re-interpret the first item as a new
                            # adjective name instead of an ontology hit.
                            if (k != 1 or new_name is not None):
                                break
                            new_name = li[0]
                            new_name.is_adjective = True
                            new_name.onto_item = (None)
                            ex_obj = (None)
                noun = li[k]
                if (k == 0):
                    tt = TerrItemToken.try_parse(li[k].begin_token.previous, None, True, False, None)
                    if (tt is not None and tt.morph.class0_.is_adjective):
                        adj_terr_before = True
        else:
            if (ex_obj is not None):
                break
            if (new_name is not None):
                break
            new_name = li[k]
        k += 1
    name = None
    alt_name = None
    full_name = None
    morph_ = None
    if (ex_obj is not None):
        # Known ontology object: validate the surrounding context.
        if (ex_obj.is_adjective and not ex_obj.morph.language.is_en and noun is None):
            if (attach_always and ex_obj.end_token.next0_ is not None):
                npt = NounPhraseHelper.try_parse(ex_obj.begin_token, NounPhraseParseAttr.NO, 0, None)
                if (ex_obj.end_token.next0_.is_comma_and):
                    pass
                elif (npt is None):
                    pass
                else:
                    str0_ = StreetItemToken.try_parse(ex_obj.end_token.next0_, None, False, None, False)
                    if (str0_ is not None):
                        if (str0_.typ == StreetItemType.NOUN and str0_.end_token == npt.end_token):
                            return None
            else:
                cit = CityItemToken.try_parse(ex_obj.end_token.next0_, None, False, None)
                if (cit is not None and ((cit.typ == CityItemToken.ItemType.NOUN or cit.typ == CityItemToken.ItemType.CITY))):
                    npt = NounPhraseHelper.try_parse(ex_obj.begin_token, NounPhraseParseAttr.NO, 0, None)
                    if (npt is not None and npt.end_token == cit.end_token):
                        pass
                    else:
                        return None
                elif (ex_obj.begin_token.is_value("ПОДНЕБЕСНЫЙ", None)):
                    pass
                else:
                    return None
        if (noun is None and ex_obj.can_be_city):
            cit0 = CityItemToken.try_parse_back(ex_obj.begin_token.previous)
            if (cit0 is not None and cit0.typ != CityItemToken.ItemType.PROPERNAME):
                return None
        if (ex_obj.is_doubt and noun is None):
            # Doubtful hit without a type noun: require extra confirmation.
            ok2 = False
            if (TerrAttachHelper.__can_be_geo_after(ex_obj.end_token.next0_)):
                ok2 = True
            elif (not ex_obj.can_be_surname and not ex_obj.can_be_city):
                if ((ex_obj.end_token.next0_ is not None and ex_obj.end_token.next0_.is_char(')') and ex_obj.begin_token.previous is not None) and ex_obj.begin_token.previous.is_char('(')):
                    ok2 = True
            elif (ex_obj.chars.is_latin_letter and ex_obj.begin_token.previous is not None):
                if (ex_obj.begin_token.previous.is_value("IN", None)):
                    ok2 = True
                elif (ex_obj.begin_token.previous.is_value("THE", None) and ex_obj.begin_token.previous.previous is not None and ex_obj.begin_token.previous.previous.is_value("IN", None)):
                    ok2 = True
            if (not ok2):
                cit0 = CityItemToken.try_parse_back(ex_obj.begin_token.previous)
                if (cit0 is not None and cit0.typ != CityItemToken.ItemType.PROPERNAME):
                    pass
                elif (MiscLocationHelper.check_geo_object_before(ex_obj.begin_token.previous)):
                    pass
                else:
                    return None
        name = ex_obj.onto_item.canonic_text
        morph_ = ex_obj.morph
    elif (new_name is not None):
        # A new (unknown) name requires a type noun.
        if (noun is None):
            return None
        j = 1
        while j < k:
            if (li[j].is_newline_before and not li[0].is_newline_before):
                if (BracketHelper.can_be_start_of_sequence(li[j].begin_token, False, False)):
                    pass
                else:
                    return None
            j += 1
        morph_ = noun.morph
        if (new_name.is_adjective):
            if (noun.termin_item.acronym == "АО"):
                if (noun.begin_token != noun.end_token):
                    return None
                # Library enum spelling "FEMINIE" kept as-is.
                if (new_name.morph.gender != MorphGender.FEMINIE):
                    return None
            geo_before = None
            tt0 = li[0].begin_token.previous
            if (tt0 is not None and tt0.is_comma_and):
                tt0 = tt0.previous
            if (not li[0].is_newline_before and tt0 is not None):
                geo_before = (Utils.asObjectOrNull(tt0.get_referent(), GeoReferent))
            if (Utils.indexOfList(li, noun, 0) < Utils.indexOfList(li, new_name, 0)):
                # Noun precedes the adjective name ("область Московская").
                if (noun.termin_item.is_state):
                    return None
                if (new_name.can_be_surname and geo_before is None):
                    if (((noun.morph.case_) & new_name.morph.case_).is_undefined):
                        return None
                if (MiscHelper.is_exists_in_dictionary(new_name.begin_token, new_name.end_token, (MorphClass.ADJECTIVE) | MorphClass.PRONOUN | MorphClass.VERB)):
                    if (noun.begin_token != new_name.begin_token):
                        if (geo_before is None):
                            if (len(li) == 2 and TerrAttachHelper.__can_be_geo_after(li[1].end_token.next0_)):
                                pass
                            elif (len(li) == 3 and li[2].termin_item is not None and TerrAttachHelper.__can_be_geo_after(li[2].end_token.next0_)):
                                pass
                            elif (new_name.is_geo_in_dictionary):
                                pass
                            elif (new_name.end_token.is_newline_after):
                                pass
                            else:
                                return None
                npt = NounPhraseHelper.try_parse(new_name.end_token, NounPhraseParseAttr.PARSEPRONOUNS, 0, None)
                if (npt is not None and npt.end_token != new_name.end_token):
                    if (len(li) >= 3 and li[2].termin_item is not None and npt.end_token == li[2].end_token):
                        add_noun = li[2]
                    else:
                        return None
                rtp = new_name.kit.process_referent("PERSON", new_name.begin_token)
                if (rtp is not None):
                    return None
                name = ProperNameHelper.get_name_ex(new_name.begin_token, new_name.end_token, MorphClass.ADJECTIVE, MorphCase.UNDEFINED, noun.termin_item.gender, False, False)
            else:
                # Adjective name precedes the noun ("Московская область").
                ok = False
                if (((k + 1) < len(li)) and li[k].termin_item is None and li[k + 1].termin_item is not None):
                    ok = True
                elif ((k < len(li)) and li[k].onto_item is not None):
                    ok = True
                elif (k == len(li) and not new_name.is_adj_in_dictionary):
                    ok = True
                elif (MiscLocationHelper.check_geo_object_before(li[0].begin_token) or can_be_city_before):
                    ok = True
                elif (MiscLocationHelper.check_geo_object_after(li[k - 1].end_token, False)):
                    ok = True
                elif (len(li) == 3 and k == 2):
                    cit = CityItemToken.try_parse(li[2].begin_token, None, False, None)
                    if (cit is not None):
                        if (cit.typ == CityItemToken.ItemType.CITY or cit.typ == CityItemToken.ItemType.NOUN):
                            ok = True
                elif (len(li) == 2):
                    ok = TerrAttachHelper.__can_be_geo_after(li[len(li) - 1].end_token.next0_)
                if (not ok and not li[0].is_newline_before and not li[0].chars.is_all_lower):
                    rt00 = li[0].kit.process_referent("PERSONPROPERTY", li[0].begin_token.previous)
                    if (rt00 is not None):
                        ok = True
                if (noun.termin_item is not None and noun.termin_item.is_strong and new_name.is_adjective):
                    ok = True
                if (noun.is_doubt and len(adj_list) == 0 and geo_before is None):
                    return None
                name = ProperNameHelper.get_name_ex(new_name.begin_token, new_name.end_token, MorphClass.ADJECTIVE, MorphCase.UNDEFINED, noun.termin_item.gender, False, False)
                if (not ok and not attach_always):
                    # Common-dictionary adjectives are accepted only if an
                    # existing referent already carries this name.
                    if (MiscHelper.is_exists_in_dictionary(new_name.begin_token, new_name.end_token, (MorphClass.ADJECTIVE) | MorphClass.PRONOUN | MorphClass.VERB)):
                        if (exists is not None):
                            for e0_ in exists:
                                if (e0_.find_slot(GeoReferent.ATTR_NAME, name, True) is not None):
                                    ok = True
                                    break
                        if (not ok):
                            return None
                full_name = "{0} {1}".format(ProperNameHelper.get_name_ex(li[0].begin_token, noun.begin_token.previous, MorphClass.ADJECTIVE, MorphCase.UNDEFINED, noun.termin_item.gender, False, False), noun.termin_item.canonic_text)
        else:
            # Non-adjective new name.
            if (not attach_always or ((noun.termin_item is not None and noun.termin_item.canonic_text == "ФЕДЕРАЦИЯ"))):
                is_latin = noun.chars.is_latin_letter and new_name.chars.is_latin_letter
                if (Utils.indexOfList(li, noun, 0) > Utils.indexOfList(li, new_name, 0)):
                    if (not is_latin):
                        return None
                if (not new_name.is_district_name and not BracketHelper.can_be_start_of_sequence(new_name.begin_token, False, False)):
                    if (len(adj_list) == 0 and MiscHelper.is_exists_in_dictionary(new_name.begin_token, new_name.end_token, (MorphClass.NOUN) | MorphClass.PRONOUN)):
                        if (len(li) == 2 and noun.is_city_region and (noun.whitespaces_after_count < 2)):
                            pass
                        else:
                            return None
                if (not is_latin):
                    if ((noun.termin_item.is_region and not attach_always and ((not adj_terr_before or new_name.is_doubt))) and not noun.is_city_region and not noun.termin_item.is_specific_prefix):
                        if (not MiscLocationHelper.check_geo_object_before(noun.begin_token)):
                            if (not noun.is_doubt and noun.begin_token != noun.end_token):
                                pass
                            elif ((noun.termin_item.is_always_prefix and len(li) == 2 and li[0] == noun) and li[1] == new_name):
                                pass
                            else:
                                return None
                    if (noun.is_doubt and len(adj_list) == 0):
                        if (noun.termin_item.acronym == "МО" or noun.termin_item.acronym == "ЛО"):
                            if (k == (len(li) - 1) and li[k].termin_item is not None):
                                add_noun = li[k]
                                k += 1
                            elif (len(li) == 2 and noun == li[0] and str(new_name).endswith("совет")):
                                pass
                            else:
                                return None
                        else:
                            return None
                    pers = new_name.kit.process_referent("PERSON", new_name.begin_token)
                    if (pers is not None):
                        return None
            name = MiscHelper.get_text_value(new_name.begin_token, new_name.end_token, GetTextAttr.NO)
            if (new_name.begin_token != new_name.end_token):
                # Trim a trailing type word duplicated inside the name span.
                ttt = new_name.begin_token.next0_
                while ttt is not None and ttt.end_char <= new_name.end_char:
                    if (ttt.chars.is_letter):
                        ty = TerrItemToken.try_parse(ttt, None, False, False, None)
                        if ((ty is not None and ty.termin_item is not None and noun is not None) and ((noun.termin_item.canonic_text in ty.termin_item.canonic_text or ty.termin_item.canonic_text in noun.termin_item.canonic_text))):
                            name = MiscHelper.get_text_value(new_name.begin_token, ttt.previous, GetTextAttr.NO)
                            break
                    ttt = ttt.next0_
        if (len(adj_list) > 0):
            # "adjective + noun" noun-phrase forms an alternative name.
            npt = NounPhraseHelper.try_parse(adj_list[0].begin_token, NounPhraseParseAttr.NO, 0, None)
            if (npt is not None and npt.end_token == noun.end_token):
                alt_name = "{0} {1}".format(npt.get_normal_case_text(None, MorphNumber.UNDEFINED, MorphGender.UNDEFINED, False), name)
    else:
        # Neither an ontology hit nor a new name: a lone type noun may still
        # bind to an adjacent/nearby GeoReferent of the same type.
        if ((len(li) == 1 and noun is not None and noun.end_token.next0_ is not None) and (isinstance(noun.end_token.next0_.get_referent(), GeoReferent))):
            g = Utils.asObjectOrNull(noun.end_token.next0_.get_referent(), GeoReferent)
            if (noun.termin_item is not None):
                tyy = noun.termin_item.canonic_text.lower()
                ooo = False
                if (g.find_slot(GeoReferent.ATTR_TYPE, tyy, True) is not None):
                    ooo = True
                elif (tyy.endswith("район") and g.find_slot(GeoReferent.ATTR_TYPE, "район", True) is not None):
                    ooo = True
                if (ooo):
                    return ReferentToken._new734(g, noun.begin_token, noun.end_token.next0_, noun.begin_token.morph)
        if ((len(li) == 1 and noun == li[0] and li[0].termin_item is not None) and TerrItemToken.try_parse(li[0].end_token.next0_, None, True, False, None) is None and TerrItemToken.try_parse(li[0].begin_token.previous, None, True, False, None) is None):
            if (li[0].morph.number == MorphNumber.PLURAL):
                return None
            # Search backwards (bounded scan) for a GeoReferent of this type.
            cou = 0
            str0_ = li[0].termin_item.canonic_text.lower()
            tt = li[0].begin_token.previous
            first_pass3158 = True
            while True:
                if first_pass3158: first_pass3158 = False
                else: tt = tt.previous
                if (not (tt is not None)): break
                if (tt.is_newline_after):
                    cou += 10
                else:
                    cou += 1
                if (cou > 500):
                    break
                g = Utils.asObjectOrNull(tt.get_referent(), GeoReferent)
                if (g is None):
                    continue
                # Make sure no other territory item follows within range.
                ok = True
                cou = 0
                tt = li[0].end_token.next0_
                first_pass3159 = True
                while True:
                    if first_pass3159: first_pass3159 = False
                    else: tt = tt.next0_
                    if (not (tt is not None)): break
                    if (tt.is_newline_before):
                        cou += 10
                    else:
                        cou += 1
                    if (cou > 500):
                        break
                    tee = TerrItemToken.try_parse(tt, None, True, False, None)
                    if (tee is None):
                        continue
                    ok = False
                    break
                if (ok):
                    # Walk up the geo hierarchy (max 3 levels) for a type match.
                    ii = 0
                    while g is not None and (ii < 3):
                        if (g.find_slot(GeoReferent.ATTR_TYPE, str0_, True) is not None):
                            return ReferentToken._new734(g, li[0].begin_token, li[0].end_token, noun.begin_token.morph)
                        g = g.higher
                        ii += 1
                break
        return None
    # Build (or reuse) the resulting GeoReferent.
    ter = None
    if (ex_obj is not None and (isinstance(ex_obj.tag, GeoReferent))):
        ter = (Utils.asObjectOrNull(ex_obj.tag, GeoReferent))
    else:
        ter = GeoReferent()
        if (ex_obj is not None):
            geo_ = Utils.asObjectOrNull(ex_obj.onto_item.referent, GeoReferent)
            if (geo_ is not None and not geo_.is_city):
                ter._merge_slots2(geo_, li[0].kit.base_language)
            else:
                ter._add_name(name)
            if (noun is None and ex_obj.can_be_city):
                ter._add_typ_city(li[0].kit.base_language)
            else:
                pass
        elif (new_name is not None):
            ter._add_name(name)
            if (alt_name is not None):
                ter._add_name(alt_name)
    if (noun is not None):
        if (noun.termin_item.canonic_text == "АО"):
            ter._add_typ(("АВТОНОМНИЙ ОКРУГ" if li[0].kit.base_language.is_ua else "АВТОНОМНЫЙ ОКРУГ"))
        elif (noun.termin_item.canonic_text == "МУНИЦИПАЛЬНОЕ СОБРАНИЕ" or noun.termin_item.canonic_text == "МУНІЦИПАЛЬНЕ ЗБОРИ"):
            ter._add_typ(("МУНІЦИПАЛЬНЕ УТВОРЕННЯ" if li[0].kit.base_language.is_ua else "МУНИЦИПАЛЬНОЕ ОБРАЗОВАНИЕ"))
        elif (noun.termin_item.acronym == "МО" and add_noun is not None):
            ter._add_typ(add_noun.termin_item.canonic_text)
        else:
            if (noun.termin_item.canonic_text == "СОЮЗ" and ex_obj is not None and ex_obj.end_char > noun.end_char):
                return ReferentToken._new734(ter, ex_obj.begin_token, ex_obj.end_token, ex_obj.morph)
            ter._add_typ(noun.termin_item.canonic_text)
            if (noun.termin_item.is_region and ter.is_state):
                ter._add_typ_reg(li[0].kit.base_language)
    if (ter.is_state and ter.is_region):
        for a in adj_list:
            if (a.termin_item.is_region):
                ter._add_typ_reg(li[0].kit.base_language)
                break
    if (ter.is_state):
        if (full_name is not None):
            ter._add_name(full_name)
    res = ReferentToken(ter, li[0].begin_token, li[k - 1].end_token)
    if (noun is not None and noun.morph.class0_.is_noun):
        res.morph = noun.morph
    else:
        # Aggregate morphology from the items; adjectives become nouns
        # when a type noun is present.
        res.morph = MorphCollection()
        ii = 0
        while ii < k:
            for v in li[ii].morph.items:
                bi = MorphBaseInfo()
                bi.copy_from(v)
                if (noun is not None):
                    if (bi.class0_.is_adjective):
                        bi.class0_ = MorphClass.NOUN
                res.morph.add_item(bi)
            ii += 1
    if (li[0].termin_item is not None and li[0].termin_item.is_specific_prefix):
        res.begin_token = li[0].end_token.next0_
    if (add_noun is not None and add_noun.end_char > res.end_char):
        res.end_token = add_noun.end_token
    # "АР <республика ...>" (e.g. Autonomous Republic abbreviation) extends the span.
    if ((isinstance(res.begin_token.previous, TextToken)) and (res.whitespaces_before_count < 2)):
        tt = Utils.asObjectOrNull(res.begin_token.previous, TextToken)
        if (tt.term == "АР"):
            for ty in ter.typs:
                if ("республика" in ty or "республіка" in ty):
                    res.begin_token = tt
                    break
    return res
def tryAttachOrg(t : 'Token', can_be_cyr : bool=False) -> 'ReferentToken': 
    """Try to recognize an organization mention starting at token t.

    The name is expected to be mostly Latin-script and to carry an English
    legal-form item (detected via OrgItemEngItem, e.g. "Ltd"/"Inc"-style
    suffixes); Cyrillic tokens are allowed only when can_be_cyr is True.

    Returns a ReferentToken wrapping a freshly built OrganizationReferent,
    or None when no organization can be attached here.
    """
    from pullenti.ner.org.internal.OrgItemNameToken import OrgItemNameToken
    if (t is None): 
        return None
    # br: the candidate started right after '(' — the closing ')' is then required at the end.
    br = False
    if (t.isChar('(') and t.next0_ is not None): 
        t = t.next0_
        br = True
    if (isinstance(t, NumberToken)): 
        # A number may start a name only when spelled in words, adjective-like
        # and capitalized (e.g. "First ..."); otherwise bail out.
        if ((t).typ == NumberSpellingType.WORDS and t.morph.class0_.is_adjective and t.chars.is_capital_upper): 
            pass
        else: 
            return None
    else: 
        if (t.chars.is_all_lower): 
            return None
        if ((t.length_char < 3) and not t.chars.is_letter): 
            return None
        if (not t.chars.is_latin_letter): 
            if (not can_be_cyr or not t.chars.is_cyrillic_letter): 
                return None
    t0 = t
    # t1: last token accepted as part of the name; nam_wo: count of plain text words in it.
    t1 = t0
    nam_wo = 0
    tok = None
    geo_ = None
    add_typ = None
    # Emulated do-while (translated from C#): scan forward collecting name tokens
    # until an org legal-form item (tok) or a stop condition is met.
    first_pass3043 = True
    while True:
        if first_pass3043: first_pass3043 = False
        else: t = t.next0_
        if (not (t is not None)): break
        if (t != t0 and t.whitespaces_before_count > 1): 
            break
        if (t.isChar(')')): 
            break
        if (t.isChar('(') and t.next0_ is not None): 
            # Parenthesized geo ("(USA)") — remember and skip.
            if ((isinstance(t.next0_.getReferent(), GeoReferent)) and t.next0_.next0_ is not None and t.next0_.next0_.isChar(')')): 
                geo_ = (Utils.asObjectOrNull(t.next0_.getReferent(), GeoReferent))
                t = t.next0_.next0_
                continue
            # Parenthesized Latin org type — remember as additional type and skip.
            typ = OrgItemTypeToken.tryAttach(t.next0_, True, None)
            if ((typ is not None and typ.end_token.next0_ is not None and typ.end_token.next0_.isChar(')')) and typ.chars.is_latin_letter): 
                add_typ = typ
                t = typ.end_token.next0_
                continue
            # Single capitalized word in parentheses — keep it inside the name.
            if (((isinstance(t.next0_, TextToken)) and t.next0_.next0_ is not None and t.next0_.next0_.isChar(')')) and t.next0_.chars.is_capital_upper): 
                t = t.next0_.next0_
                t1 = t
                continue
            break
        # Legal-form item terminates the name part; also probe after "." / ",".
        tok = OrgItemEngItem.tryAttach(t, can_be_cyr)
        if (tok is None and t.isCharOf(".,") and t.next0_ is not None): 
            tok = OrgItemEngItem.tryAttach(t.next0_, can_be_cyr)
            if (tok is None and t.next0_.isCharOf(",.")): 
                tok = OrgItemEngItem.tryAttach(t.next0_.next0_, can_be_cyr)
        if (tok is not None): 
            if (tok.length_char == 1 and t0.chars.is_cyrillic_letter): 
                return None
            break
        # Tight hyphen glues name parts together.
        if (t.is_hiphen and not t.is_whitespace_after and not t.is_whitespace_before): 
            continue
        if (t.isCharOf("&+") or t.is_and): 
            continue
        if (t.isChar('.')): 
            # Dot after a one-letter token looks like an initial/abbreviation — keep going.
            if (t.previous is not None and t.previous.length_char == 1): 
                continue
            elif (MiscHelper.canBeStartOfSentence(t.next0_)): 
                break
        if (not t.chars.is_latin_letter): 
            if (not can_be_cyr or not t.chars.is_cyrillic_letter): 
                break
        if (t.chars.is_all_lower): 
            # Lowercase words are tolerated only as function words, or anywhere inside brackets.
            if (t.morph.class0_.is_preposition or t.morph.class0_.is_conjunction): 
                continue
            if (br): 
                continue
            break
        mc = t.getMorphClassInDictionary()
        if (mc.is_verb): 
            if (t.next0_ is not None and t.next0_.morph.class0_.is_preposition): 
                break
        if (t.next0_ is not None and t.next0_.isValue("OF", None)): 
            break
        if (isinstance(t, TextToken)): 
            nam_wo += 1
        t1 = t
    if (tok is None): 
        return None
    if (t0 == tok.begin_token): 
        # Legal form came first ("Ltd «Name»") — the quoted bracket sequence is the name.
        br2 = BracketHelper.tryParse(tok.end_token.next0_, BracketParseAttr.NO, 100)
        if (br2 is not None): 
            org1 = OrganizationReferent()
            if (tok.short_value is not None): 
                org1.addTypeStr(tok.short_value)
            org1.addTypeStr(tok.full_value)
            nam1 = MiscHelper.getTextValue(br2.begin_token, br2.end_token, GetTextAttr.NO)
            if (nam1 is not None): 
                org1.addName(nam1, True, None)
            return ReferentToken(org1, t0, br2.end_token)
        return None
    org0_ = OrganizationReferent()
    te = tok.end_token
    if (tok.is_bank): 
        t1 = tok.end_token
    # "... Holding Company": try to swallow a second legal-form item right after "company".
    if (tok.full_value == "company" and (tok.whitespaces_after_count < 3)): 
        tok1 = OrgItemEngItem.tryAttach(tok.end_token.next0_, can_be_cyr)
        if (tok1 is not None): 
            t1 = tok.end_token
            tok = tok1
            te = tok.end_token
    if (tok.full_value == "company"): 
        if (nam_wo == 0): 
            return None
    nam = MiscHelper.getTextValue(t0, t1, GetTextAttr.IGNOREARTICLES)
    if (nam == "STOCK" and tok.full_value == "company"): 
        return None
    alt_nam = None
    if (Utils.isNullOrEmpty(nam)): 
        return None
    # A parenthesized insertion inside the name: keep the full string as an
    # alternative name and strip the brackets from the main one.
    if (nam.find('(') > 0): 
        i1 = nam.find('(')
        i2 = nam.find(')')
        if (i1 < i2): 
            alt_nam = nam
            tai = None
            if ((i2 + 1) < len(nam)): 
                tai = nam[i2:].strip()
            nam = nam[0:0+i1].strip()
            if (tai is not None): 
                nam = "{0} {1}".format(nam, tai)
    if (tok.is_bank): 
        org0_.addTypeStr(("bank" if tok.kit.base_language.is_en else "банк"))
        org0_.addProfile(OrgProfile.FINANCE)
        # "Bank OF <Name>" — extend the name past the "OF".
        if ((t1.next0_ is not None and t1.next0_.isValue("OF", None) and t1.next0_.next0_ is not None) and t1.next0_.next0_.chars.is_latin_letter): 
            nam0 = OrgItemNameToken.tryAttach(t1.next0_, None, False, False)
            if (nam0 is not None): 
                te = nam0.end_token
            else: 
                te = t1.next0_.next0_
            nam = MiscHelper.getTextValue(t0, te, GetTextAttr.NO)
            if (isinstance(te.getReferent(), GeoReferent)): 
                org0_._addGeoObject(Utils.asObjectOrNull(te.getReferent(), GeoReferent))
        elif (t0 == t1): 
            return None
    else: 
        if (tok.short_value is not None): 
            org0_.addTypeStr(tok.short_value)
        org0_.addTypeStr(tok.full_value)
    if (Utils.isNullOrEmpty(nam)): 
        return None
    org0_.addName(nam, True, None)
    if (alt_nam is not None): 
        org0_.addName(alt_nam, True, None)
    res = ReferentToken(org0_, t0, te)
    t = te
    # Skip trailing punctuation and try one more legal-form item ("..., Inc.").
    while t.next0_ is not None: 
        if (t.next0_.isCharOf(",.")): 
            t = t.next0_
        else: 
            break
    if (t.whitespaces_after_count < 2): 
        # NOTE(review): t.next0_ may be None here — assumes tryAttach tolerates None; confirm.
        tok = OrgItemEngItem.tryAttach(t.next0_, can_be_cyr)
        if (tok is not None): 
            if (tok.short_value is not None): 
                org0_.addTypeStr(tok.short_value)
            org0_.addTypeStr(tok.full_value)
            res.end_token = tok.end_token
    if (geo_ is not None): 
        org0_._addGeoObject(geo_)
    if (add_typ is not None): 
        org0_.addType(add_typ, False)
    if (not br): 
        return res
    # The fragment was opened by '(' — require (and consume) the closing ')'.
    t = res.end_token
    if (t.next0_ is None or t.next0_.isChar(')')): 
        res.end_token = t.next0_
    else: 
        return None
    return res
def tryAttach(t: 'Token') -> 'ReferentToken': 
    """Try to build a FundsReferent from a run of FundsItemToken items starting at t.

    Collects consecutive items (noun, count, percent, sum, price, source org),
    fills a new FundsReferent from them, and — when no source organization was
    found inline — searches backwards in the text for a plausible source.

    Returns a ReferentToken over the consumed span, or None.
    """
    if (t is None): 
        return None
    f = FundsItemToken.tryParse(t, None)
    if (f is None): 
        return None
    # A bare organization item cannot start a funds mention.
    if (f.typ == FundsItemTyp.ORG): 
        return None
    if (f.typ == FundsItemTyp.PRICE or f.typ == FundsItemTyp.PERCENT or f.typ == FundsItemTyp.COUNT): 
        # Preceded by "<number>." / "<number>," — likely a list item, not funds.
        if (t.previous is not None and t.previous.isCharOf(",.") and (isinstance(t.previous.previous, NumberToken))): 
            return None
    li = list()
    li.append(f)
    is_in_br = False
    # Emulated do-while (translated from C#): gather the item sequence.
    tt = f.end_token.next0_
    first_pass2768 = True
    while True:
        if first_pass2768: first_pass2768 = False
        else: tt = tt.next0_
        if (not (tt is not None)): break
        # New sentence starting with a capital after "." — stop.
        if ((tt.is_whitespace_before and tt.previous is not None and tt.previous.isChar('.')) and tt.chars.is_capital_upper): 
            break
        f0 = FundsItemToken.tryParse(tt, f)
        if (f0 is not None): 
            # Inside brackets a CAPITAL item inherits the kind of the noun item seen so far.
            if (f0.kind == FundsKind.CAPITAL and is_in_br): 
                for l_ in li: 
                    if (l_.typ == FundsItemTyp.NOUN): 
                        f0.kind = l_.kind
                        break
            f = f0
            li.append(f)
            tt = f.end_token
            continue
        if (tt.isChar('(')): 
            is_in_br = True
            continue
        if (tt.isChar(')')): 
            if (is_in_br or ((t.previous is not None and t.previous.isChar('(')))): 
                is_in_br = False
                li[len(li) - 1].end_token = tt
                continue
        # Verbs/adverbs between items are skipped; anything else ends the sequence.
        if (tt.morph.class0_.is_verb or tt.morph.class0_.is_adverb): 
            continue
        break
    funds = FundsReferent()
    res = ReferentToken(funds, t, t)
    # org_prob: organization embedded in the noun item — fallback source candidate.
    org_prob = None
    i = 0
    while i < len(li): 
        if (li[i].typ == FundsItemTyp.NOUN): 
            funds.kind = li[i].kind
            if (li[i].string_val is not None): 
                funds.typ = li[i].string_val
            if (isinstance(li[i].ref, OrganizationReferent)): 
                org_prob = (Utils.asObjectOrNull(li[i].ref, OrganizationReferent))
            res.end_token = li[i].end_token
        elif (li[i].typ == FundsItemTyp.COUNT): 
            # Each attribute may be set only once; a second occurrence ends the mention.
            if (funds.count > 0 or li[i].num_val is None or li[i].num_val.int_value is None): 
                break
            funds.count = li[i].num_val.int_value
            res.end_token = li[i].end_token
        elif (li[i].typ == FundsItemTyp.ORG): 
            if (funds.source is not None and funds.source != li[i].ref): 
                break
            funds.source = Utils.asObjectOrNull(li[i].ref, OrganizationReferent)
            res.end_token = li[i].end_token
        elif (li[i].typ == FundsItemTyp.PERCENT): 
            if (funds.percent > 0 or li[i].num_val is None or li[i].num_val.real_value == 0): 
                break
            funds.percent = li[i].num_val.real_value
            res.end_token = li[i].end_token
        elif (li[i].typ == FundsItemTyp.SUM): 
            if (funds.sum0_ is not None): 
                break
            funds.sum0_ = Utils.asObjectOrNull(li[i].ref, MoneyReferent)
            res.end_token = li[i].end_token
        elif (li[i].typ == FundsItemTyp.PRICE): 
            if (funds.price is not None): 
                break
            funds.price = Utils.asObjectOrNull(li[i].ref, MoneyReferent)
            res.end_token = li[i].end_token
        else: 
            break
        i += 1
    # "X% of <org>" with no explicit kind is treated as stock.
    if (funds.percent > 0 and funds.source is not None and funds.kind == FundsKind.UNDEFINED): 
        funds.kind = FundsKind.STOCK
    if (not funds._checkCorrect()): 
        return None
    # Fallback 1: borrow the source from a previous FundsReferent mentioned nearby
    # (newlines weigh 10 so the search effectively stays within ~the same paragraph).
    if (funds.source is None): 
        cou = 0
        tt = res.begin_token.previous
        while tt is not None: 
            cou += 1
            if ((cou) > 500): 
                break
            if (tt.is_newline_after): 
                cou += 10
            fr = Utils.asObjectOrNull(tt.getReferent(), FundsReferent)
            if (fr is not None and fr.source is not None): 
                funds.source = fr.source
                break
            tt = tt.previous
    # Fallback 2: organization that was embedded in the noun item itself.
    if (funds.source is None and org_prob is not None): 
        funds.source = org_prob
    # Fallback 3: nearest preceding organization, excluding courts/government bodies.
    if (funds.source is None): 
        cou = 0
        tt = res.begin_token.previous
        while tt is not None: 
            cou += 1
            if ((cou) > 300): 
                break
            if (tt.is_newline_after): 
                cou += 10
            refs = tt.getReferents()
            if (refs is not None): 
                for r in refs: 
                    if (isinstance(r, OrganizationReferent)): 
                        ki = (r).kind
                        if (ki == OrganizationKind.JUSTICE or ki == OrganizationKind.GOVENMENT): 
                            continue
                        funds.source = Utils.asObjectOrNull(r, OrganizationReferent)
                        # cou = 10000 forces the outer while to stop on its next check.
                        cou = 10000
                        break
            tt = tt.previous
    return res
def attach_first(self, p : 'InstrumentParticipantReferent', min_char : int, max_char : int) -> 'ReferentToken': 
    """Attach the first mention of a contract participant p around this token span.

    Scans backwards (down to min_char) to pick up referents that identify the
    participant (organization, person, address, phone, etc.), then forwards
    (up to max_char, 0 = unbounded) to collect further requisites, delegates
    and contract-ground fragments.

    Returns a ReferentToken for p covering the whole collected span.
    """
    t = None
    tt0 = self.begin_token
    refs = list()
    # --- backward pass: collect identifying referents placed before the mention ---
    t = tt0.previous
    first_pass3287 = True
    while True:
        if first_pass3287: first_pass3287 = False
        else: t = t.previous
        if (not (t is not None and t.begin_char >= min_char)): break
        if (t.is_newline_after): 
            if (t.newlines_after_count > 1): 
                break
            # A number on the next line looks like a list item boundary.
            if (isinstance(t.next0_, NumberToken)): 
                break
        tt = ParticipantToken.__try_attach_contract_ground(t, p, False)
        if (tt is not None): 
            continue
        r = t.get_referent()
        if (((((isinstance(r, OrganizationReferent)) or (isinstance(r, PhoneReferent)) or (isinstance(r, PersonReferent))) or (isinstance(r, PersonPropertyReferent)) or (isinstance(r, AddressReferent))) or (isinstance(r, UriReferent)) or (isinstance(r, PersonIdentityReferent))) or (isinstance(r, BankDataReferent))): 
            if (not r in refs): 
                # Prepend to keep document order; extend the span start.
                refs.insert(0, r)
                tt0 = t
    if (len(refs) > 0): 
        for r in refs: 
            # When the participant is an organization, accompanying persons act as delegates.
            if (r != refs[0] and (isinstance(refs[0], OrganizationReferent)) and (((isinstance(r, PersonReferent)) or (isinstance(r, PersonPropertyReferent))))): 
                p.add_slot(InstrumentParticipantReferent.ATTR_DELEGATE, r, False, 0)
            else: 
                p.add_slot(InstrumentParticipantReferent.ATTR_REF, r, False, 0)
    rt = ReferentToken(p, tt0, self.end_token)
    # --- forward pass: requisites after the mention ---
    t = self.end_token.next0_
    if (BracketHelper.is_bracket(t, False)): 
        t = t.next0_
    if (t is not None and t.is_char(',')): 
        t = t.next0_
    first_pass3288 = True
    while True:
        if first_pass3288: first_pass3288 = False
        else: t = t.next0_
        if (not (t is not None and ((max_char == 0 or t.begin_char <= max_char)))): break
        # "СТОРОНА" ("party") starts the next participant — stop here.
        if (t.is_value("СТОРОНА", None)): 
            break
        r = t.get_referent()
        if (((((isinstance(r, OrganizationReferent)) or (isinstance(r, PhoneReferent)) or (isinstance(r, PersonReferent))) or (isinstance(r, PersonPropertyReferent)) or (isinstance(r, AddressReferent))) or (isinstance(r, UriReferent)) or (isinstance(r, PersonIdentityReferent))) or (isinstance(r, BankDataReferent))): 
            # "<property>, <person>" — fold the property into the person and use the person.
            if ((((isinstance(r, PersonPropertyReferent)) and t.next0_ is not None and t.next0_.is_comma) and (isinstance(t.next0_.next0_, ReferentToken)) and (isinstance(t.next0_.next0_.get_referent(), PersonReferent))) and not t.next0_.is_newline_after): 
                pe = Utils.asObjectOrNull(t.next0_.next0_.get_referent(), PersonReferent)
                pe.add_slot(PersonReferent.ATTR_ATTR, r, False, 0)
                r = (pe)
                t = t.next0_.next0_
            # "в лице ..." / "от имени ..." mark the following person as a delegate.
            is_delegate = False
            if (t.previous.is_value("ЛИЦО", None) or t.previous.is_value("ИМЯ", None)): 
                is_delegate = True
            if (t.previous.is_value("КОТОРЫЙ", None) and t.previous.previous is not None and ((t.previous.previous.is_value("ИМЯ", None) or t.previous.previous.is_value("ЛИЦО", None)))): 
                is_delegate = True
            p.add_slot((InstrumentParticipantReferent.ATTR_DELEGATE if (((isinstance(r, PersonReferent)) or (isinstance(r, PersonPropertyReferent)))) and is_delegate else InstrumentParticipantReferent.ATTR_REF), r, False, 0)
            rt.end_token = t
            continue
        tt = ParticipantToken.__try_attach_contract_ground(t, p, False)
        if (tt is not None): 
            rt.end_token = tt
            t = rt.end_token
            if (rt.begin_char == tt.begin_char): 
                rt.begin_token = tt
            continue
        # Connector phrases "в лице", "от имени", "по поручению" — skip their first word.
        if (t.is_value("В", None) and t.next0_ is not None and t.next0_.is_value("ЛИЦО", None)): 
            t = t.next0_
            continue
        if (t.is_value("ОТ", None) and t.next0_ is not None and t.next0_.is_value("ИМЯ", None)): 
            t = t.next0_
            continue
        if (t.is_value("ПО", None) and t.next0_ is not None and t.next0_.is_value("ПОРУЧЕНИЕ", None)): 
            t = t.next0_
            continue
        if (t.is_newline_before): 
            break
        if (t.get_morph_class_in_dictionary() == MorphClass.VERB): 
            # Only a fixed whitelist of verbs is tolerated inside requisites.
            if ((not t.is_value("УДОСТОВЕРЯТЬ", None) and not t.is_value("ПРОЖИВАТЬ", None) and not t.is_value("ЗАРЕГИСТРИРОВАТЬ", None)) and not t.is_value("ДЕЙСТВОВАТЬ", None)): 
                break
        if (t.is_and and t.previous is not None and t.previous.is_comma): 
            break
        if (t.is_and and t.next0_.get_referent() is not None): 
            if (isinstance(t.next0_.get_referent(), OrganizationReferent)): 
                break
            # "and <person>" where the person is an individual entrepreneur starts a new participant.
            pe = Utils.asObjectOrNull(t.next0_.get_referent(), PersonReferent)
            if (pe is not None): 
                has_ip = False
                for s in pe.slots: 
                    if (s.type_name == PersonReferent.ATTR_ATTR): 
                        if (str(s.value).startswith("индивидуальный предприниматель")): 
                            has_ip = True
                            break
                if (has_ip): 
                    break
    # --- final pass: re-scan the accepted span for contract-ground fragments ---
    t = rt.begin_token
    while t is not None and t.end_char <= rt.end_char: 
        tt = ParticipantToken.__try_attach_contract_ground(t, p, True)
        if (tt is not None): 
            if (tt.end_char > rt.end_char): 
                rt.end_token = tt
            t = tt
        t = t.next0_
    return rt
def try_attach_requisites(t : 'Token', cur : 'InstrumentParticipantReferent', other : 'InstrumentParticipantReferent', cant_be_empty : bool=False) -> 'ReferentToken': 
    """Collect requisites (addresses, phones, bank data, identities, ...) for
    participant cur starting at t, typically in a signature/requisites section.

    other is the opposite contract party: hitting a referent that belongs to it
    stops the scan. When cant_be_empty is True, a span with no attached
    referents is not reported.

    Returns a ReferentToken over cur covering the requisites, or None.
    """
    if (t is None or cur is None): 
        return None
    if (t.is_table_control_char): 
        return None
    # err counts unrecognized text tokens; too many of them aborts the scan.
    err = 0
    spec_chars = 0
    rt = None
    t0 = t
    # Look ahead (bounded) for a table control char: inside a table cell the
    # error/whitespace limits are relaxed.
    is_in_tab_cell = False
    cou = 0
    tt = t.next0_
    while tt is not None and (cou < 300): 
        if (tt.is_table_control_char): 
            is_in_tab_cell = True
            break
        tt = tt.next0_; cou += 1
    # Emulated do-while (translated from C#).
    first_pass3286 = True
    while True:
        if first_pass3286: first_pass3286 = False
        else: t = t.next0_
        if (not (t is not None)): break
        # NOTE(review): no-op positional check — looks like a leftover debugging anchor.
        if (t.begin_char == 8923): 
            pass
        if (t.is_table_control_char): 
            if (t != t0): 
                if (rt is not None): 
                    rt.end_token = t.previous
                elif (not cant_be_empty): 
                    rt = ReferentToken(cur, t0, t.previous)
                break
            else: 
                continue
        # Skip ':' '.' and "М"/"M"/"П" — presumably the "М.П." seal placeholder; verify.
        if ((t.is_char_of(":.") or t.is_value("М", None) or t.is_value("M", None)) or t.is_value("П", None)): 
            if (rt is not None): 
                rt.end_token = t
            continue
        # A repeated mention of a participant: ours extends the span, the other's stops it.
        pp = ParticipantToken.try_attach_to_exist(t, cur, other)
        if (pp is not None): 
            if (pp.referent != cur): 
                break
            if (rt is None): 
                rt = ReferentToken(cur, t, t)
            rt.end_token = pp.end_token
            err = 0
            continue
        if (t.is_newline_before): 
            # An appendix heading ends the requisites block.
            iii = InstrToken.parse(t, 0, None)
            if (iii is not None): 
                if (iii.typ == ILTypes.APPENDIX): 
                    break
        if (t.whitespaces_before_count > 25 and not is_in_tab_cell): 
            if (t != t0): 
                if (t.previous is not None and t.previous.is_char_of(",;")): 
                    pass
                elif (t.newlines_before_count > 1): 
                    break
        # A person/org that does not belong to cur means we ran into foreign text.
        if ((isinstance(t.get_referent(), PersonReferent)) or (isinstance(t.get_referent(), OrganizationReferent))): 
            if (not cur._contains_ref(t.get_referent())): 
                break
        if ((t.is_char_of(";:,.") or t.is_hiphen or t.morph.class0_.is_preposition) or t.morph.class0_.is_conjunction): 
            continue
        # Underscores/slashes: signature lines; enough of them alone justify a span.
        if (t.is_char_of("_/\\")): 
            spec_chars += 1
            if (spec_chars > 10 and rt is None): 
                rt = ReferentToken(cur, t0, t)
            if (rt is not None): 
                rt.end_token = t
            continue
        if (t.is_newline_before and (isinstance(t, NumberToken))): 
            break
        # "ОФИС" ("office"): swallow a following quoted sequence or capitalized word.
        if (t.is_value("ОФИС", None)): 
            if (BracketHelper.can_be_start_of_sequence(t.next0_, True, False)): 
                br = BracketHelper.try_parse(t.next0_, BracketParseAttr.NO, 100)
                if (br is not None): 
                    t = br.end_token
                    continue
            if ((isinstance(t.next0_, TextToken)) and not t.next0_.chars.is_all_lower): 
                t = t.next0_
            continue
        r = t.get_referent()
        if ((((isinstance(r, PersonReferent)) or (isinstance(r, AddressReferent)) or (isinstance(r, UriReferent))) or (isinstance(r, OrganizationReferent)) or (isinstance(r, PhoneReferent))) or (isinstance(r, PersonIdentityReferent)) or (isinstance(r, BankDataReferent))): 
            # Belongs to the other party (URIs excepted) — stop.
            if (other is not None and other.find_slot(None, r, True) is not None): 
                if (not (isinstance(r, UriReferent))): 
                    break
            if (rt is None): 
                rt = ReferentToken(cur, t, t)
            # Don't duplicate a referent already attached as a delegate.
            if (cur.find_slot(InstrumentParticipantReferent.ATTR_DELEGATE, r, True) is not None): 
                pass
            else: 
                cur.add_slot(InstrumentParticipantReferent.ATTR_REF, r, False, 0)
            rt.end_token = t
            err = 0
        else: 
            if ((isinstance(t, TextToken)) and t.length_char > 1): 
                err += 1
        # Error budget: generous inside a table cell, tight otherwise.
        if (is_in_tab_cell and rt is not None): 
            if (err > 300): 
                break
        elif (err > 4): 
            break
    return rt
def __tryParse(t: 'Token', is_in_lit: bool, max_char: int = 0) -> typing.List['ReferentToken']: if (t is None): return None is_bracket_regime = False if (t.previous is not None and t.previous.isChar('(')): is_bracket_regime = True blt = BookLinkToken.tryParse(t, 0) if (blt is None): blt = BookLinkToken.tryParseAuthor(t, FioTemplateType.UNDEFINED) if (blt is None and not is_bracket_regime): return None t0 = t coef = 0 is_electr_res = False decree = None regtyp = BookLinkAnalyzer.RegionTyp.UNDEFINED num = None spec_see = None book_prev = None if (is_bracket_regime): regtyp = BookLinkAnalyzer.RegionTyp.AUTHORS elif (blt.typ == BookLinkTyp.PERSON): if (not is_in_lit): return None regtyp = BookLinkAnalyzer.RegionTyp.AUTHORS elif (blt.typ == BookLinkTyp.NUMBER): num = blt.value t = blt.end_token.next0_ if (t is None or t.is_newline_before): return None if (not t.is_whitespace_before): if (isinstance(t, NumberToken)): n = (t).value if ((((n == "3" or n == "0")) and not t.is_whitespace_after and (isinstance(t.next0_, TextToken))) and t.next0_.chars.is_all_lower): pass else: return None elif (not ((isinstance(t, TextToken))) or t.chars.is_all_lower): r = t.getReferent() if (isinstance(r, PersonReferent)): pass elif (is_in_lit and r is not None and r.type_name == "DECREE"): pass else: return None first_pass2757 = True while True: if first_pass2757: first_pass2757 = False else: t = t.next0_ if (not (t is not None)): break if (isinstance(t, NumberToken)): break if (not ((isinstance(t, TextToken)))): break if (BracketHelper.canBeStartOfSequence(t, True, False)): break if (not t.chars.is_letter): continue bbb = BookLinkToken.tryParse(t, 0) if (bbb is not None): if (bbb.typ == BookLinkTyp.TAMZE): spec_see = bbb t = bbb.end_token.next0_ break if (bbb.typ == BookLinkTyp.SEE): t = bbb.end_token continue break if (spec_see is not None and spec_see.typ == BookLinkTyp.TAMZE): coef += 1 max0_ = 1000 tt = t0 while tt is not None and max0_ > 0: if (isinstance(tt.getReferent(), 
BookLinkRefReferent)): book_prev = (tt.getReferent()).book break tt = tt.previous max0_ -= 1 blt1 = BookLinkToken.tryParseAuthor(t, FioTemplateType.UNDEFINED) if (blt1 is not None and blt1.typ == BookLinkTyp.PERSON): regtyp = BookLinkAnalyzer.RegionTyp.AUTHORS else: ok = False tt = t first_pass2758 = True while True: if first_pass2758: first_pass2758 = False else: tt = (None if tt is None else tt.next0_) if (not (tt is not None)): break if (tt.is_newline_before): break if (is_in_lit and tt.getReferent() is not None and tt.getReferent().type_name == "DECREE"): ok = True decree = tt break bbb = BookLinkToken.tryParse(tt, 0) if (bbb is None): continue if (bbb.typ == BookLinkTyp.ELECTRONRES): is_electr_res = True ok = True break if (bbb.typ == BookLinkTyp.DELIMETER): tt = bbb.end_token.next0_ if (BookLinkToken.tryParseAuthor( tt, FioTemplateType.UNDEFINED) is not None): ok = True break bbb = BookLinkToken.tryParse(tt, 0) if (bbb is not None): if (bbb.typ == BookLinkTyp.EDITORS or bbb.typ == BookLinkTyp.TRANSLATE or bbb.typ == BookLinkTyp.SOSTAVITEL): ok = True break if (not ok and not is_in_lit): if (BookLinkToken.checkLinkBefore(t0, num)): pass else: return None regtyp = BookLinkAnalyzer.RegionTyp.NAME else: return None res = BookLinkReferent() corr_authors = list() t00 = t blt00 = None start_of_name = None prev_pers_templ = FioTemplateType.UNDEFINED if (regtyp == BookLinkAnalyzer.RegionTyp.AUTHORS): first_pass2759 = True while True: if first_pass2759: first_pass2759 = False else: t = t.next0_ if (not (t is not None)): break if (max_char > 0 and t.begin_char >= max_char): break if (t.isCharOf(".;") or t.is_comma_and): continue if (t.isChar('/')): break if ((t.isChar('(') and t.next0_ is not None and t.next0_.isValue("EDS", None)) and t.next0_.next0_ is not None and t.next0_.next0_.isChar(')')): t = t.next0_.next0_.next0_ break blt = BookLinkToken.tryParseAuthor(t, prev_pers_templ) if (blt is None and t.previous is not None and t.previous.is_and): blt = 
BookLinkToken.tryParseAuthor( t.previous, FioTemplateType.UNDEFINED) if (blt is None): if ((isinstance(t.getReferent(), OrganizationReferent)) and blt00 is not None): bbb2 = BookLinkToken.tryParse(t.next0_, 0) if (bbb2 is not None): if (bbb2.typ == BookLinkTyp.YEAR): res.addSlot(BookLinkReferent.ATTR_AUTHOR, t.getReferent(), False, 0) res.year = int(bbb2.value) coef += .5 t = bbb2.end_token.next0_ break if (blt.typ == BookLinkTyp.PERSON): tt2 = blt.end_token.next0_ bbb2 = BookLinkToken.tryParse(tt2, 0) if (bbb2 is not None): if (bbb2.typ == BookLinkTyp.YEAR): res.year = int(bbb2.value) coef += .5 blt.end_token = bbb2.end_token blt00 = (None) if (blt00 is not None and ((blt00.end_token.next0_ == blt.begin_token or blt.begin_token.previous.isChar('.')))): tt11 = blt.end_token.next0_ nex = BookLinkToken.tryParse(tt11, 0) if (nex is not None and nex.typ == BookLinkTyp.ANDOTHERS): pass else: if (tt11 is None): break if (tt11.isChar('/') and tt11.next0_ is not None and tt11.next0_.isChar('/')): break if (tt11.isChar(':')): break if ((str(blt).find('.') < 0) and str(blt00).find('.') > 0): break if ((isinstance(tt11, TextToken)) and tt11.chars.is_all_lower): break if (tt11.isCharOf(",.;") and tt11.next0_ is not None): tt11 = tt11.next0_ nex = BookLinkToken.tryParse(tt11, 0) if (nex is not None and nex.typ != BookLinkTyp.PERSON and nex.typ != BookLinkTyp.ANDOTHERS): break elif ( (blt00 is not None and blt00.person_template != FioTemplateType.UNDEFINED and blt.person_template != blt00.person_template) and blt.person_template == FioTemplateType.NAMESURNAME): if (blt.end_token.next0_ is None or not blt.end_token.next0_.is_comma_and): break if (BookLinkToken.tryParseAuthor( blt.end_token.next0_.next0_, FioTemplateType.UNDEFINED) is not None): pass else: break if (blt00 is None and blt.person_template == FioTemplateType.NAMESURNAME): tt = blt.end_token.next0_ if (tt is not None and tt.is_hiphen): tt = tt.next0_ if (isinstance(tt, NumberToken)): break 
BookLinkAnalyzer.__addAuthor(res, blt) coef += 1 t = blt.end_token if (isinstance(t.getReferent(), PersonReferent)): corr_authors.append( Utils.asObjectOrNull(t, ReferentToken)) blt00 = blt prev_pers_templ = blt.person_template start_of_name = blt.start_of_name if ((start_of_name) is not None): t = t.next0_ break continue if (blt.typ == BookLinkTyp.ANDOTHERS): coef += .5 t = blt.end_token.next0_ res.authors_and_other = True break break if (t is None): return None if ((t.is_newline_before and t != t0 and num is None) and res.findSlot( BookLinkReferent.ATTR_AUTHOR, None, True) is None): return None if (start_of_name is None): if (t.chars.is_all_lower): coef -= (1) if (t.chars.is_latin_letter and not is_electr_res and num is None): if (res.getSlotValue(BookLinkReferent.ATTR_AUTHOR) is None): return None tn0 = t tn1 = None uri = None next_num = None wrapnn393 = RefOutArgWrapper(0) inoutres394 = Utils.tryParseInt(Utils.ifNotNull(num, ""), wrapnn393) nn = wrapnn393.value if (inoutres394): next_num = str((nn + 1)) br = (BracketHelper.tryParse( t, Utils.valToEnum( (BracketParseAttr.CANCONTAINSVERBS) | (BracketParseAttr.CANBEMANYLINES), BracketParseAttr), 100) if BracketHelper.canBeStartOfSequence(t, True, False) else None) if (br is not None): t = t.next0_ pages = None first_pass2760 = True while True: if first_pass2760: first_pass2760 = False else: t = t.next0_ if (not (t is not None)): break if (max_char > 0 and t.begin_char >= max_char): break if (br is not None and br.end_token == t): tn1 = t break tit = TitleItemToken.tryAttach(t) if (tit is not None): if ((tit.typ == TitleItemToken.Types.TYP and tn0 == t and br is None) and BracketHelper.canBeStartOfSequence( tit.end_token.next0_, True, False)): br = BracketHelper.tryParse(tit.end_token.next0_, BracketParseAttr.NO, 100) if (br is not None): coef += (1) if (num is not None): coef += 1 tn0 = br.begin_token tn1 = br.end_token res.typ = tit.value.lower() t = br.end_token.next0_ break if (t.is_newline_before and t != 
tn0): if (br is not None and (t.end_char < br.end_char)): pass elif (not MiscHelper.canBeStartOfSentence(t)): pass else: if (t.newlines_before_count > 1): break if ((isinstance(t, NumberToken)) and num is not None and (t).int_value is not None): if (num == str(((t).int_value - 1))): break elif (num is not None): pass else: nnn = NounPhraseHelper.tryParse( t.previous, Utils.valToEnum( ((NounPhraseParseAttr.PARSEPREPOSITION) | (NounPhraseParseAttr.PARSEADVERBS) | (NounPhraseParseAttr.PARSENUMERICASADJECTIVE)) | (NounPhraseParseAttr.MULTILINES), NounPhraseParseAttr), 0) if (nnn is not None and nnn.end_char >= t.end_char): pass else: break if (t.isCharOf(".;") and t.whitespaces_after_count > 0): tit = TitleItemToken.tryAttach(t.next0_) if ((tit) is not None): if (tit.typ == TitleItemToken.Types.TYP): break stop = True words = 0 notwords = 0 tt = t.next0_ first_pass2761 = True while True: if first_pass2761: first_pass2761 = False else: tt = tt.next0_ if (not (tt is not None)): break blt0 = BookLinkToken.tryParse(tt, 0) if (blt0 is None): if (tt.is_newline_before): break if ((isinstance(tt, TextToken)) and not tt.getMorphClassInDictionary().is_undefined ): words += 1 else: notwords += 1 if (words > 6 and words > (notwords * 4)): stop = False break continue if ((blt0.typ == BookLinkTyp.DELIMETER or blt0.typ == BookLinkTyp.TRANSLATE or blt0.typ == BookLinkTyp.TYPE) or blt0.typ == BookLinkTyp.GEO or blt0.typ == BookLinkTyp.PRESS): stop = False break if (br is not None and br.end_token.previous.end_char > t.end_char): stop = False if (stop): break if (t == decree): t = t.next0_ break blt = BookLinkToken.tryParse(t, 0) if (blt is None): tn1 = t continue if (blt.typ == BookLinkTyp.DELIMETER): break if (((blt.typ == BookLinkTyp.MISC or blt.typ == BookLinkTyp.TRANSLATE or blt.typ == BookLinkTyp.NAMETAIL) or blt.typ == BookLinkTyp.TYPE or blt.typ == BookLinkTyp.VOLUME) or blt.typ == BookLinkTyp.PAGERANGE or blt.typ == BookLinkTyp.PAGES): coef += 1 break if (blt.typ == 
BookLinkTyp.GEO or blt.typ == BookLinkTyp.PRESS): if (t.previous.is_hiphen or t.previous.isCharOf(".;") or blt.add_coef > 0): break if (blt.typ == BookLinkTyp.YEAR): if (t.previous is not None and t.previous.is_comma): break if (blt.typ == BookLinkTyp.ELECTRONRES): is_electr_res = True break if (blt.typ == BookLinkTyp.URL): if (t == tn0 or t.previous.isCharOf(":.")): is_electr_res = True break tn1 = t if (tn1 is None and start_of_name is None): if (is_electr_res): uri_re = BookLinkReferent() rt0 = ReferentToken(uri_re, t00, t) rts0 = list() bref0 = BookLinkRefReferent._new389(uri_re) if (num is not None): bref0.number = num rt01 = ReferentToken(bref0, t0, rt0.end_token) ok = False while t is not None: if (t.is_newline_before): break blt0 = BookLinkToken.tryParse(t, 0) if (blt0 is not None): if (isinstance(blt0.ref, UriReferent)): uri_re.addSlot( BookLinkReferent.ATTR_URL, Utils.asObjectOrNull(blt0.ref, UriReferent), False, 0) ok = True t = blt0.end_token rt0.end_token = rt01.end_token = t t = t.next0_ if (ok): rts0.append(rt01) rts0.append(rt0) return rts0 if (decree is not None and num is not None): rts0 = list() bref0 = BookLinkRefReferent._new389(decree.getReferent()) if (num is not None): bref0.number = num rt01 = ReferentToken(bref0, t0, decree) t = decree.next0_ while t is not None: if (t.is_newline_before): break if (isinstance(t, TextToken)): if ((t).is_pure_verb): return None rt01.end_token = t t = t.next0_ rts0.append(rt01) return rts0 if (book_prev is not None): tt = t while tt is not None and ((tt.isCharOf(",.") or tt.is_hiphen)): tt = tt.next0_ blt0 = BookLinkToken.tryParse(tt, 0) if (blt0 is not None and blt0.typ == BookLinkTyp.PAGERANGE): rts0 = list() bref0 = BookLinkRefReferent._new389(book_prev) if (num is not None): bref0.number = num bref0.pages = blt0.value rt00 = ReferentToken(bref0, t0, blt0.end_token) rts0.append(rt00) return rts0 return None if (br is not None and ((tn1 == br.end_token or tn1 == br.end_token.previous))): tn0 = tn0.next0_ 
tn1 = tn1.previous if (start_of_name is None): while tn0 is not None: if (tn0.isCharOf(":,~")): tn0 = tn0.next0_ else: break while tn1 is not None and tn1.begin_char > tn0.begin_char: if (tn1.isCharOf(".;,:(~") or tn1.is_hiphen or tn1.isValue("РЕД", None)): pass else: break tn1 = tn1.previous nam = MiscHelper.getTextValue( tn0, tn1, Utils.valToEnum( (GetTextAttr.KEEPQUOTES) | (GetTextAttr.KEEPREGISTER), GetTextAttr)) if (start_of_name is not None): if (nam is None or (len(nam) < 3)): nam = start_of_name else: nam = "{0}{1}{2}".format( start_of_name, (" " if tn0.is_whitespace_before else ""), nam) if (nam is None): return None res.name = nam if (num is None and not is_in_lit): if (len(nam) < 20): return None coef -= (2) if (len(nam) > 500): coef -= (math.floor(len(nam) / 500)) if (is_bracket_regime): coef -= 1 if (len(nam) > 200): if (num is None): return None if (res.findSlot(BookLinkReferent.ATTR_AUTHOR, None, True) is None and not BookLinkToken.checkLinkBefore(t0, num)): return None en = 0 ru = 0 ua = 0 cha = 0 nocha = 0 chalen = 0 lt0 = tn0 lt1 = tn1 if (tn1 is None): if (t is None): return None lt0 = t0 lt1 = t tn1 = t.previous tt = lt0 while tt is not None and tt.end_char <= lt1.end_char: if ((isinstance(tt, TextToken)) and tt.chars.is_letter): if (tt.chars.is_latin_letter): en += 1 elif (tt.morph.language.is_ua): ua += 1 elif (tt.morph.language.is_ru): ru += 1 if (tt.length_char > 2): cha += 1 chalen += tt.length_char elif (not ((isinstance(tt, ReferentToken)))): nocha += 1 tt = tt.next0_ if (ru > (ua + en)): res.lang = "RU" elif (ua > (ru + en)): res.lang = "UA" elif (en > (ru + ua)): res.lang = "EN" if (nocha > 3 and nocha > cha and start_of_name is None): if (nocha > (math.floor(chalen / 3))): coef -= (2) if (res.lang == "EN"): tt = tn0.next0_ first_pass2762 = True while True: if first_pass2762: first_pass2762 = False else: tt = tt.next0_ if (not (tt is not None and (tt.end_char < tn1.end_char))): break if (tt.is_comma and tt.next0_ is not None and ((not 
tt.next0_.chars.is_all_lower or (isinstance(tt.next0_, ReferentToken))))): if (tt.next0_.next0_ is not None and tt.next0_.next0_.is_comma_and): if (isinstance(tt.next0_, ReferentToken)): pass else: continue nam = MiscHelper.getTextValue( tn0, tt.previous, Utils.valToEnum((GetTextAttr.KEEPQUOTES) | (GetTextAttr.KEEPREGISTER), GetTextAttr)) if (nam is not None and len(nam) > 15): res.name = nam break rt = ReferentToken(res, t00, tn1) authors = True edits = False br = (None) first_pass2763 = True while True: if first_pass2763: first_pass2763 = False else: t = t.next0_ if (not (t is not None)): break if (max_char > 0 and t.begin_char >= max_char): break if (BracketHelper.canBeStartOfSequence(t, False, False)): br = BracketHelper.tryParse(t, BracketParseAttr.CANBEMANYLINES, 100) if (br is not None and br.length_char > 300): br = (None) blt = BookLinkToken.tryParse(t, 0) if (t.is_newline_before and not t.isChar('/') and not t.previous.isChar('/')): if (blt is not None and blt.typ == BookLinkTyp.NUMBER): break if (t.previous.isCharOf(":")): pass elif (blt is not None and (( ((blt.typ == BookLinkTyp.DELIMETER or blt.typ == BookLinkTyp.PAGERANGE or blt.typ == BookLinkTyp.PAGES) or blt.typ == BookLinkTyp.GEO or blt.typ == BookLinkTyp.PRESS) or blt.typ == BookLinkTyp.N))): pass elif (num is not None and BookLinkToken.tryParseAuthor( t, FioTemplateType.UNDEFINED) is not None): pass elif (num is not None and blt is not None and blt.typ != BookLinkTyp.NUMBER): pass elif (br is not None and (t.end_char < br.end_char) and t.begin_char > br.begin_char): pass else: ok = False mmm = 50 tt = t.next0_ while tt is not None and mmm > 0: if (tt.is_newline_before): blt2 = BookLinkToken.tryParse(tt, 0) if (blt2 is not None and blt2.typ == BookLinkTyp.NUMBER and blt2.value == next_num): ok = True break if (blt2 is not None): if (blt2.typ == BookLinkTyp.PAGES or blt2.typ == BookLinkTyp.GEO or blt2.typ == BookLinkTyp.PRESS): ok = True break tt = tt.next0_ mmm -= 1 if (not ok): npt = 
NounPhraseHelper.tryParse( t.previous, Utils.valToEnum( ((NounPhraseParseAttr.MULTILINES) | (NounPhraseParseAttr.PARSEADVERBS) | (NounPhraseParseAttr.PARSEPREPOSITION)) | (NounPhraseParseAttr.PARSEVERBS) | (NounPhraseParseAttr.PARSEPRONOUNS), NounPhraseParseAttr), 0) if (npt is not None and npt.end_char >= t.end_char): ok = True if (not ok): break rt.end_token = t if (blt is not None): rt.end_token = blt.end_token if (t.isCharOf(".,") or t.is_hiphen): continue if (t.isValue("С", None)): pass if (regtyp == BookLinkAnalyzer.RegionTyp.FIRST and blt is not None and blt.typ == BookLinkTyp.EDITORS): edits = True t = blt.end_token coef += 1 continue if (regtyp == BookLinkAnalyzer.RegionTyp.FIRST and blt is not None and blt.typ == BookLinkTyp.SOSTAVITEL): edits = False t = blt.end_token coef += 1 continue if (regtyp == BookLinkAnalyzer.RegionTyp.FIRST and authors): blt2 = BookLinkToken.tryParseAuthor(t, prev_pers_templ) if (blt2 is not None and blt2.typ == BookLinkTyp.PERSON): prev_pers_templ = blt2.person_template if (not edits): BookLinkAnalyzer.__addAuthor(res, blt2) coef += 1 t = blt2.end_token continue if (blt2 is not None and blt2.typ == BookLinkTyp.ANDOTHERS): if (not edits): res.authors_and_other = True coef += 1 t = blt2.end_token continue authors = False if (blt is None): continue if (blt.typ == BookLinkTyp.ELECTRONRES or blt.typ == BookLinkTyp.URL): is_electr_res = True if (blt.typ == BookLinkTyp.ELECTRONRES): coef += 1.5 else: coef += .5 if (isinstance(blt.ref, UriReferent)): res.addSlot(BookLinkReferent.ATTR_URL, Utils.asObjectOrNull(blt.ref, UriReferent), False, 0) elif (blt.typ == BookLinkTyp.YEAR): if (res.year == 0): res.year = int(blt.value) coef += .5 elif (blt.typ == BookLinkTyp.DELIMETER): coef += 1 if (blt.length_char == 2): regtyp = BookLinkAnalyzer.RegionTyp.SECOND else: regtyp = BookLinkAnalyzer.RegionTyp.FIRST elif ( (((blt.typ == BookLinkTyp.MISC or blt.typ == BookLinkTyp.TYPE or blt.typ == BookLinkTyp.PAGES) or blt.typ == BookLinkTyp.NAMETAIL or 
blt.typ == BookLinkTyp.TRANSLATE) or blt.typ == BookLinkTyp.PRESS or blt.typ == BookLinkTyp.VOLUME) or blt.typ == BookLinkTyp.N): coef += 1 elif (blt.typ == BookLinkTyp.PAGERANGE): pages = blt coef += 1 if (is_bracket_regime and blt.end_token.next0_ is not None and blt.end_token.next0_.isChar(')')): coef += (2) if (res.name is not None and res.findSlot(BookLinkReferent.ATTR_AUTHOR, None, True) is not None): coef = (10) elif (blt.typ == BookLinkTyp.GEO and ((regtyp == BookLinkAnalyzer.RegionTyp.SECOND or regtyp == BookLinkAnalyzer.RegionTyp.FIRST))): coef += 1 elif (blt.typ == BookLinkTyp.GEO and t.previous is not None and t.previous.isChar('.')): coef += 1 elif (blt.typ == BookLinkTyp.ANDOTHERS): coef += 1 if (authors): res.authors_and_other = True coef += blt.add_coef t = blt.end_token if ((coef < 2.5) and num is not None): if (BookLinkToken.checkLinkBefore(t0, num)): coef += (2) elif (BookLinkToken.checkLinkAfter(rt.end_token, num)): coef += (1) if (rt.length_char > 500): return None if (is_in_lit): coef += 1 if (coef < 2.5): if (is_electr_res and uri is not None): pass elif (coef >= 2 and is_in_lit): pass else: return None for rr in corr_authors: pits0 = PersonItemToken.tryAttachList( rr.begin_token, None, PersonItemToken.ParseAttr.CANINITIALBEDIGIT, 10) if (pits0 is None or (len(pits0) < 2)): continue if (pits0[0].typ == PersonItemToken.ItemType.VALUE): exi = False for i in range(len(rr.referent.slots) - 1, -1, -1): s = rr.referent.slots[i] if (s.type_name == PersonReferent.ATTR_LASTNAME): ln = Utils.asObjectOrNull(s.value, str) if (ln is None): continue if (ln == pits0[0].value): exi = True continue if (ln.find('-') > 0): ln = ln[0:0 + ln.find('-')] if (pits0[0].begin_token.isValue(ln, None)): del rr.referent.slots[i] if (not exi): rr.referent.addSlot(PersonReferent.ATTR_LASTNAME, pits0[0].value, False, 0) rts = list() bref = BookLinkRefReferent._new389(res) if (num is not None): bref.number = num rt1 = ReferentToken(bref, t0, rt.end_token) if (pages is not 
None): if (pages.value is not None): bref.pages = pages.value rt.end_token = pages.begin_token.previous rts.append(rt1) rts.append(rt) return rts
def __try1(li: typing.List['CityItemToken'], oi: 'IntOntologyItem', ad: 'AnalyzerDataWithOntology') -> 'ReferentToken':
    """Try to assemble a city GeoReferent from a parsed CityItemToken sequence.

    Accepted shapes are a sequence starting with a CITY item, or exactly the
    two-token shape [PROPERNAME, NOUN]; everything else is rejected.

    Args:
        li: candidate city item tokens (may be truncated in place).
        oi: holder whose ``.value`` is used as an out-parameter and receives
            the ontology item matched by ``li[0]`` (or None) — presumably a
            converted C# ``ref`` argument; confirm against the caller.
        ad: analyzer data, used to check which ontology ``li[0]``'s item
            came from.

    Returns:
        A ReferentToken wrapping the (new or ontology-provided) GeoReferent
        covering the accepted tokens, or None when the sequence does not look
        like a trustworthy city mention.
    """
    # oi.value doubles as an output slot: reset it before any early return.
    oi.value = (None)
    if (li is None or (len(li) < 1)):
        return None
    elif (li[0].typ != CityItemToken.ItemType.CITY):
        # When the sequence does not start with a known CITY item, only the
        # exact two-token shape PROPERNAME + NOUN may proceed.
        if (len(li) != 2 or li[0].typ != CityItemToken.ItemType.PROPERNAME or li[1].typ != CityItemToken.ItemType.NOUN):
            return None
    # i counts the leading items accepted so far (li[0] is always taken).
    i = 1
    oi.value = li[0].onto_item
    ok = not li[0].doubtful
    if ((ok and li[0].onto_item is not None and li[0].onto_item.misc_attr is None) and ad is not None):
        # A match that came from a global ontology (neither the local one nor
        # an external one) is only trusted when preceded by "В" (Russian
        # preposition "in").
        if (li[0].onto_item.owner != ad.local_ontology and not li[0].onto_item.owner.is_ext_ontology):
            if (li[0].begin_token.previous is not None and li[0].begin_token.previous.isValue("В", None)):
                pass
            else:
                ok = False
    if (len(li) == 1 and li[0].begin_token.morph.class0_.is_adjective):
        # A lone adjective that also parses as "<adjective> <street noun>" is
        # more likely the start of a street name than a city — reject.
        sits = StreetItemToken.tryParseList(li[0].begin_token, None, 3)
        if (sits is not None and len(sits) == 2 and sits[1].typ == StreetItemType.NOUN):
            return None
    typ = None       # type word taken from a following NOUN item, if any
    alttyp = None    # its alternative spelling, if any
    mc = li[0].morph
    if (i < len(li)):
        if (li[i].typ == CityItemToken.ItemType.NOUN):
            # First check whether the noun actually opens a street/address
            # reading instead; if so, `at` stays non-None and the noun is
            # not consumed as a city type word.
            at = None
            if (not li[i].chars.is_all_lower and (li[i].whitespaces_after_count < 2)):
                sit = StreetItemToken.tryParse(li[i].end_token.next0_, None, False, None, False)
                if (sit is not None and sit.typ == StreetItemType.NOUN):
                    at = AddressItemToken.tryParse(li[i].begin_token, None, False, False, None)
                    if (at is not None):
                        at2 = AddressItemToken.tryParse(li[i].end_token.next0_, None, False, False, None)
                        if (at2 is not None and at2.typ == AddressItemToken.ItemType.STREET):
                            at = (None)
            if (at is None):
                typ = li[i].value
                alttyp = li[i].alt_value
                # All-uppercase "СТ" is an ambiguous abbreviation — reject.
                if (li[i].begin_token.isValue("СТ", None) and li[i].begin_token.chars.is_all_upper):
                    return None
                if ((i + 1) == len(li)):
                    # Noun is the last item: accept and take its morphology
                    # when it has a defined case.
                    ok = True
                    if (not li[i].morph.case_.is_undefined):
                        mc = li[i].morph
                    i += 1
                elif (ok):
                    i += 1
        else:
            # No type noun follows; accept anyway when the name is preceded
            # by a mayor-like word: МЭР/МЕР ("mayor"), ГЛАВА ("head"),
            # ГРАДОНАЧАЛЬНИК ("city governor").
            tt0 = li[0].begin_token.previous
            if ((isinstance(tt0, TextToken)) and (tt0.whitespaces_after_count < 3)):
                if (tt0.isValue("МЭР", "МЕР") or tt0.isValue("ГЛАВА", None) or tt0.isValue("ГРАДОНАЧАЛЬНИК", None)):
                    ok = True
                    i += 1
    # Dictionary names shorter than 4 characters are too risky unconfirmed.
    if (not ok and oi.value is not None and (len(oi.value.canonic_text) < 4)):
        return None
    if (not ok and li[0].begin_token.morph.class0_.is_proper_name):
        return None
    if (not ok):
        # Last-chance heuristics for an unconfirmed candidate.
        if (not MiscHelper.isExistsInDictionary(li[0].begin_token, li[0].end_token, (MorphClass.ADJECTIVE) | MorphClass.NOUN | MorphClass.PRONOUN)):
            # Unknown word: accept only when an adjacent geo object supports it.
            ok = (li[0].geo_object_before or li[i - 1].geo_object_after)
            if (ok and li[0].begin_token == li[0].end_token):
                mcc = li[0].begin_token.getMorphClassInDictionary()
                if (mcc.is_proper_name or mcc.is_proper_surname):
                    # Dictionary says it is a person first/last name — drop.
                    ok = False
                elif (li[0].geo_object_before and (li[0].whitespaces_after_count < 2)):
                    ad1 = AddressItemToken.tryParse(li[0].begin_token, None, False, False, None)
                    if (ad1 is not None and ad1.typ == AddressItemToken.ItemType.STREET):
                        # Street reading wins unless it vanishes right after
                        # this token.
                        ad2 = AddressItemToken.tryParse(li[0].end_token.next0_, None, False, False, None)
                        if (ad2 is None or ad2.typ != AddressItemToken.ItemType.STREET):
                            ok = False
                    elif (AddressItemToken.tryAttachOrg(li[0].begin_token) is not None):
                        ok = False
        if (ok):
            # A PERSON reading of the same token still wins over the city one.
            if (li[0].kit.processReferent("PERSON", li[0].begin_token) is not None):
                ok = False
    if (not ok):
        ok = CityAttachHelper.checkYearAfter(li[0].end_token.next0_)
    if (not ok and ((not li[0].begin_token.morph.class0_.is_adjective or li[0].begin_token != li[0].end_token))):
        ok = CityAttachHelper.checkCityAfter(li[0].end_token.next0_)
    if (not ok):
        return None
    if (i < len(li)):
        # Drop the unconsumed tail (converted C# RemoveRange(i, Count - i)).
        del li[i:i + len(li) - i]
    rt = None
    if (oi.value is None):
        if (li[0].value is not None and li[0].higher_geo is not None):
            # New city with a known enclosing geo object.
            cap = GeoReferent()
            cap._addName(li[0].value)
            cap._addTypCity(li[0].kit.base_language)
            cap.higher = li[0].higher_geo
            if (typ is not None):
                cap._addTyp(typ)
            if (alttyp is not None):
                cap._addTyp(alttyp)
            rt = ReferentToken(cap, li[0].begin_token, li[0].end_token)
        else:
            if (li[0].value is None):
                return None
            if (typ is None):
                # No type word: allowed only in a "<geo>-<name>" hyphen
                # context (previous token is a hyphen preceded by a geo
                # referent).
                if ((len(li) == 1 and li[0].begin_token.previous is not None and li[0].begin_token.previous.is_hiphen) and (isinstance(li[0].begin_token.previous.previous, ReferentToken)) and (isinstance(li[0].begin_token.previous.previous.getReferent(), GeoReferent))):
                    pass
                else:
                    return None
            else:
                # Only settlement-like type words may introduce an unknown
                # name: endings ПУНКТ/ПОСЕЛЕНИЕ/ПОСЕЛЕННЯ/ПОСЕЛОК
                # ("settlement"), "...CITY", or СТАНЦИЯ ("station")
                # directly after another geo object.
                if (not LanguageHelper.endsWithEx(typ, "ПУНКТ", "ПОСЕЛЕНИЕ", "ПОСЕЛЕННЯ", "ПОСЕЛОК")):
                    if (not LanguageHelper.endsWith(typ, "CITY")):
                        if (typ == "СТАНЦИЯ" and ((MiscLocationHelper.checkGeoObjectBefore(li[0].begin_token)))):
                            pass
                        elif (len(li) > 1 and li[1].typ == CityItemToken.ItemType.NOUN and li[0].typ == CityItemToken.ItemType.CITY):
                            pass
                        else:
                            return None
                if (li[0].begin_token.morph.class0_.is_adjective):
                    # Re-derive the name so the adjective agrees with the
                    # type noun's case and gender.
                    li[0].value = ProperNameHelper.getNameEx(li[0].begin_token, li[0].end_token, MorphClass.ADJECTIVE, li[1].morph.case_, li[1].morph.gender, False, False)
    elif (isinstance(oi.value.referent, GeoReferent)):
        # The ontology item already carries a ready GeoReferent — reuse it.
        rt = ReferentToken._new719(Utils.asObjectOrNull(oi.value.referent, GeoReferent), li[0].begin_token, li[len(li) - 1].end_token, mc)
    elif (typ is None):
        typ = oi.value.typ
    if (rt is None):
        # Build a fresh GeoReferent, preferring the ontology canonic name
        # when an ontology item matched.
        city = GeoReferent()
        city._addName((li[0].value if oi.value is None else oi.value.canonic_text))
        if (typ is not None):
            city._addTyp(typ)
        else:
            city._addTypCity(li[0].kit.base_language)
        if (alttyp is not None):
            city._addTyp(alttyp)
        rt = ReferentToken._new719(city, li[0].begin_token, li[len(li) - 1].end_token, mc)
    if ((isinstance(rt.referent, GeoReferent)) and len(li) == 1 and (rt.referent).is_city):
        # Absorb an adjacent "Г"/"Г." token into the span — presumably the
        # Russian abbreviation "г." for "город" (city); TODO confirm.
        if (rt.begin_token.previous is not None and rt.begin_token.previous.isValue("Г", None)):
            rt.begin_token = rt.begin_token.previous
        elif ((rt.begin_token.previous is not None and rt.begin_token.previous.isChar('.') and rt.begin_token.previous.previous is not None) and rt.begin_token.previous.previous.isValue("Г", None)):
            rt.begin_token = rt.begin_token.previous.previous
        elif (rt.end_token.next0_ is not None and (rt.whitespaces_after_count < 2) and rt.end_token.next0_.isValue("Г", None)):
            rt.end_token = rt.end_token.next0_
            # Also absorb the trailing dot of the abbreviation, if present.
            if (rt.end_token.next0_ is not None and rt.end_token.next0_.isChar('.')):
                rt.end_token = rt.end_token.next0_
    return rt