Пример #1
0
 def process(self, kit: 'AnalysisKit') -> None:
     """Walk the kit's token chain and embed a referent token for each parsed attribute.

     Progress is reported through ``self._on_progress`` each time the scan
     crosses another 100000-character boundary of the source text; a falsy
     result from that callback stops the scan immediately.

     Args:
         kit: analysis container supplying the analyzer data, the source
             text (``kit.sofa.text``), the first token and ``embed_token``.
     """
     analyzer_data = kit.get_analyzer_data(self)
     chunk_size = 100000
     # Number of progress steps: ceiling of text length / chunk size, at least one.
     total_chunks = max(1, (len(kit.sofa.text) + chunk_size - 1) // chunk_size)
     chunks_done = 0
     progress_threshold = 0
     token = kit.first_token
     while token is not None:
         if token.begin_char > progress_threshold:
             progress_threshold += chunk_size
             chunks_done += 1
             if not self._on_progress(chunks_done, total_chunks, kit):
                 break
         parsed = GoodAttrToken.try_parse(token, None, True, True)
         if parsed is not None:
             attr = parsed._create_attr()
             if attr is None:
                 # Nothing to register: resume scanning after the parsed span.
                 token = parsed.end_token
             else:
                 ref_token = ReferentToken(attr, parsed.begin_token, parsed.end_token)
                 ref_token.referent = analyzer_data.register_referent(attr)
                 kit.embed_token(ref_token)
                 # Continue from the freshly embedded token.
                 token = ref_token
         token = token.next0_
Пример #2
0
 def process(self, kit : 'AnalysisKit') -> None:
     """ Main entity-extraction pass: scan the token chain and embed URI-like referents.
     
     For every token the following recognisers are tried in order:
       1. a scheme keyword from ``UriAnalyzer.__m_schemes``; the integer tag of
          the matched termin selects the handling branch (0, 10, 2, 1, 3, 4, 5/6);
       2. an ``@`` character (e-mail addresses, scheme "mailto");
       3. a URL starting at a token whose language is not Cyrillic (scheme "http");
       4. URI content preceded by whatever ``__siteBefore`` recognises (scheme "http");
       5. whatever ``__TryAttachLotus`` recognises at a Latin token followed by '/'.
     Each hit is registered in the analyzer data and embedded into the chain
     as a ReferentToken; scanning then continues after the embedded token.
     
     Args:
         kit: analysis container providing the first token, the analyzer data
             and ``embedToken``.
     
     """
     ad = kit.getAnalyzerData(self)
     t = kit.first_token
     # Emulated for-loop: every `continue` below jumps to the loop head,
     # which advances t to t.next0_ on all iterations except the first.
     first_pass3149 = True
     while True:
         if first_pass3149: first_pass3149 = False
         else: t = t.next0_
         if (not (t is not None)): break
         tt = t
         # 1. Try to match a known scheme keyword starting at t.
         tok = UriAnalyzer.__m_schemes.tryParse(t, TerminParseAttr.NO)
         if (tok is not None): 
             # The termin tag selects which of the branches below applies.
             i = (tok.termin.tag)
             tt = tok.end_token
             # Skip a parenthesised repetition of the same scheme keyword: "X (X)".
             if (tt.next0_ is not None and tt.next0_.isChar('(')): 
                 tok1 = UriAnalyzer.__m_schemes.tryParse(tt.next0_.next0_, TerminParseAttr.NO)
                 if ((tok1 is not None and tok1.termin.canonic_text == tok.termin.canonic_text and tok1.end_token.next0_ is not None) and tok1.end_token.next0_.isChar(')')): 
                     tt = tok1.end_token.next0_
             # Tag 0: scheme, then a ':' or '|' separator, optional slashes, then URI content.
             if (i == 0): 
                 if ((tt.next0_ is None or ((not tt.next0_.isCharOf(":|") and not tt.is_table_control_char)) or tt.next0_.is_whitespace_before) or tt.next0_.whitespaces_after_count > 2): 
                     continue
                 t1 = tt.next0_.next0_
                 # Skip any slashes/backslashes following the separator.
                 while t1 is not None and t1.isCharOf("/\\"):
                     t1 = t1.next0_
                 if (t1 is None or t1.whitespaces_before_count > 2): 
                     continue
                 ut = UriItemToken.attachUriContent(t1, False)
                 if (ut is None): 
                     continue
                 # The scheme is stored as the lower-cased canonic text of the keyword.
                 ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2557(tok.termin.canonic_text.lower(), ut.value)), UriReferent)
                 rt = ReferentToken(ad.registerReferent(ur), t, ut.end_token)
                 # Start from the token returned by __siteBefore when it finds one, otherwise from t.
                 rt.begin_token = Utils.ifNotNull(UriAnalyzer.__siteBefore(t.previous), t)
                 # Absorb one trailing slash/backslash into the token.
                 if (rt.end_token.next0_ is not None and rt.end_token.next0_.isCharOf("/\\")): 
                     rt.end_token = rt.end_token.next0_
                 kit.embedToken(rt)
                 t = (rt)
                 continue
             # Tag 10: scheme, mandatory ':', optional slashes and "WWW.", then URI content
             # whose value must be at least 4 characters long.
             if (i == 10): 
                 tt = tt.next0_
                 if (tt is None or not tt.isChar(':')): 
                     continue
                 tt = tt.next0_
                 # Skip slashes/backslashes after the colon.
                 while tt is not None: 
                     if (tt.isCharOf("/\\")): 
                         pass
                     else: 
                         break
                     tt = tt.next0_
                 if (tt is None): 
                     continue
                 # Drop a leading "WWW." from the content.
                 if (tt.isValue("WWW", None) and tt.next0_ is not None and tt.next0_.isChar('.')): 
                     tt = tt.next0_.next0_
                 if (tt is None or tt.is_newline_before): 
                     continue
                 ut = UriItemToken.attachUriContent(tt, True)
                 if (ut is None): 
                     continue
                 if (len(ut.value) < 4): 
                     continue
                 ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2557(tok.termin.canonic_text.lower(), ut.value)), UriReferent)
                 rt = ReferentToken(ad.registerReferent(ur), t, ut.end_token)
                 rt.begin_token = Utils.ifNotNull(UriAnalyzer.__siteBefore(t.previous), t)
                 if (rt.end_token.next0_ is not None and rt.end_token.next0_.isCharOf("/\\")): 
                     rt.end_token = rt.end_token.next0_
                 kit.embedToken(rt)
                 t = (rt)
                 continue
             # Tag 2: keyword immediately followed by '.', then URI content; registered as "http".
             if (i == 2): 
                 if (tt.next0_ is None or not tt.next0_.isChar('.') or tt.next0_.is_whitespace_before): 
                     continue
                 # Whitespace after the dot is tolerated only for the "WWW" keyword.
                 if (tt.next0_.is_whitespace_after and tok.termin.canonic_text != "WWW"): 
                     continue
                 ut = UriItemToken.attachUriContent(tt.next0_.next0_, True)
                 if (ut is None): 
                     continue
                 ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2557("http", ut.value)), UriReferent)
                 rt = ReferentToken(ur, t, ut.end_token)
                 rt.begin_token = Utils.ifNotNull(UriAnalyzer.__siteBefore(t.previous), t)
                 if (rt.end_token.next0_ is not None and rt.end_token.next0_.isCharOf("/\\")): 
                     rt.end_token = rt.end_token.next0_
                 kit.embedToken(rt)
                 t = (rt)
                 continue
             # Tag 1: code-like schemes; the value parser is chosen by the keyword's canonic text.
             if (i == 1): 
                 sch = tok.termin.canonic_text
                 ut = None
                 if (sch == "ISBN"): 
                     ut = UriItemToken.attachISBN(tt.next0_)
                     # Nothing after the keyword and the keyword is written as "(ISBN)":
                     # search backwards for a value that ends right before the '('.
                     if ((ut is None and t.previous is not None and t.previous.isChar('(')) and t.next0_ is not None and t.next0_.isChar(')')): 
                         tt0 = t.previous.previous
                         while tt0 is not None: 
                             if (tt0.whitespaces_after_count > 2): 
                                 break
                             # Only the first token preceded by whitespace is tried as the value start.
                             if (tt0.is_whitespace_before): 
                                 ut = UriItemToken.attachISBN(tt0)
                                 if (ut is not None and ut.end_token.next0_ != t.previous): 
                                     ut = (None)
                                 break
                             tt0 = tt0.previous
                 elif ((sch == "RFC" or sch == "ISO" or sch == "ОКФС") or sch == "ОКОПФ"): 
                     ut = UriItemToken.attachISOContent(tt.next0_, ":")
                 elif (sch == "ГОСТ"): 
                     ut = UriItemToken.attachISOContent(tt.next0_, "-.")
                 elif (sch == "ТУ"): 
                     # "ТУ" is accepted only in all-upper-case and with a value of at least 10 chars.
                     if (tok.chars.is_all_upper): 
                         ut = UriItemToken.attachISOContent(tt.next0_, "-.")
                         if (ut is not None and (ut.length_char < 10)): 
                             ut = (None)
                 else: 
                     # Every other tag-1 scheme is parsed with attachBBK.
                     ut = UriItemToken.attachBBK(tt.next0_)
                 if (ut is None): 
                     continue
                 ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2560(ut.value, sch)), UriReferent)
                 if (ut.begin_char < t.begin_char): 
                     # The value precedes the keyword (the backward ISBN case):
                     # span from the value to the keyword and its closing ')'.
                     rt = ReferentToken(ur, ut.begin_token, t)
                     if (t.next0_ is not None and t.next0_.isChar(')')): 
                         rt.end_token = t.next0_
                 else: 
                     rt = ReferentToken(ur, t, ut.end_token)
                 # Include a preceding "КОД" word in the token.
                 if (t.previous is not None and t.previous.isValue("КОД", None)): 
                     rt.begin_token = t.previous
                 if (ur.scheme.startswith("ОК")): 
                     UriAnalyzer.__checkDetail(rt)
                 kit.embedToken(rt)
                 t = (rt)
                 # For "ОК*" schemes, further numbers joined by a comma/"and" token
                 # become additional referents of the same scheme.
                 if (ur.scheme.startswith("ОК")): 
                     while t.next0_ is not None:
                         if (t.next0_.is_comma_and and (isinstance(t.next0_.next0_, NumberToken))): 
                             pass
                         else: 
                             break
                         ut = UriItemToken.attachBBK(t.next0_.next0_)
                         if (ut is None): 
                             break
                         ur = (Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2560(ut.value, sch)), UriReferent))
                         rt = ReferentToken(ur, t.next0_.next0_, ut.end_token)
                         UriAnalyzer.__checkDetail(rt)
                         kit.embedToken(rt)
                         t = (rt)
                 continue
             # Tag 3: keyword followed by a value parsed with attachSkype; the value is
             # lower-cased and the scheme is "skype" for "SKYPE", else the canonic text.
             if (i == 3): 
                 t0 = tt.next0_
                 # Skip separators (':' / '|' / table control chars / hyphens).
                 while t0 is not None:
                     if (t0.isCharOf(":|") or t0.is_table_control_char or t0.is_hiphen): 
                         t0 = t0.next0_
                     else: 
                         break
                 if (t0 is None): 
                     continue
                 ut = UriItemToken.attachSkype(t0)
                 if (ut is None): 
                     continue
                 ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2560(ut.value.lower(), ("skype" if tok.termin.canonic_text == "SKYPE" else tok.termin.canonic_text))), UriReferent)
                 rt = ReferentToken(ur, t, ut.end_token)
                 kit.embedToken(rt)
                 t = (rt)
                 continue
             # Tag 4: ICQ number, optionally separated from the keyword by ':' or a hyphen.
             if (i == 4): 
                 t0 = tt.next0_
                 if (t0 is not None and ((t0.isChar(':') or t0.is_hiphen))): 
                     t0 = t0.next0_
                 if (t0 is None): 
                     continue
                 ut = UriItemToken.attachIcqContent(t0)
                 if (ut is None): 
                     continue
                 ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2560(ut.value, "ICQ")), UriReferent)
                 # NOTE(review): the token ends at t0, not ut.end_token as in the other
                 # branches — presumably the ICQ content is a single token; confirm.
                 rt = ReferentToken(ur, t, t0)
                 kit.embedToken(rt)
                 t = (rt)
                 continue
             # Tags 5 and 6: numeric identifiers; the code below special-cases the keywords
             # ИНН, БИК, ОГРН, СНИЛС, ОКПО, Л/С and the IBAN form.
             if (i == 5 or i == 6): 
                 t0 = tt.next0_
                 has_tab_cel = False
                 is_iban = False
                 # Skip filler between the keyword and the value: "БАНК", prepositions,
                 # punctuation, currency words, number signs, table cells, "IBAN" markers
                 # and genitive noun phrases.
                 first_pass3150 = True
                 while True:
                     if first_pass3150: first_pass3150 = False
                     else: t0 = t0.next0_
                     if (not (t0 is not None)): break
                     if ((((t0.isValue("БАНК", None) or t0.morph.class0_.is_preposition or t0.is_hiphen) or t0.isCharOf(".:") or t0.isValue("РУБЛЬ", None)) or t0.isValue("РУБ", None) or t0.isValue("ДОЛЛАР", None)) or t0.isValue("№", None) or t0.isValue("N", None)): 
                         pass
                     elif (t0.is_table_control_char): 
                         has_tab_cel = True
                     elif (t0.isCharOf("\\/") and t0.next0_ is not None and t0.next0_.isValue("IBAN", None)): 
                         is_iban = True
                         t0 = t0.next0_
                     elif (t0.isValue("IBAN", None)): 
                         is_iban = True
                     elif (isinstance(t0, TextToken)): 
                         npt = NounPhraseHelper.tryParse(t0, NounPhraseParseAttr.NO, 0)
                         if (npt is not None and npt.morph.case_.is_genitive): 
                             t0 = npt.end_token
                             continue
                         break
                     else: 
                         break
                 if (t0 is None): 
                     continue
                 # ur2* describe an optional second referent for the paired "A/B n1/n2" form.
                 ur2 = None
                 ur2begin = None
                 ur2end = None
                 t00 = t0
                 val = t0.getSourceText()
                 # Case A: the value starts with a digit and the scheme is tag 6 or one of
                 # ИНН / БИК / ОГРН / СНИЛС / ОКПО.
                 if (str.isdigit(val[0]) and ((((i == 6 or tok.termin.canonic_text == "ИНН" or tok.termin.canonic_text == "БИК") or tok.termin.canonic_text == "ОГРН" or tok.termin.canonic_text == "СНИЛС") or tok.termin.canonic_text == "ОКПО"))): 
                     if (t0.chars.is_letter): 
                         continue
                     if (Utils.isNullOrEmpty(val) or not str.isdigit(val[0])): 
                         continue
                     # A first token shorter than 9 chars may be one piece of a split number:
                     # glue on following number tokens, allowing tight '-' or '.' separators.
                     if (t0.length_char < 9): 
                         tmp = io.StringIO()
                         print(val, end="", file=tmp)
                         ttt = t0.next0_
                         first_pass3151 = True
                         while True:
                             if first_pass3151: first_pass3151 = False
                             else: ttt = ttt.next0_
                             if (not (ttt is not None)): break
                             if (ttt.whitespaces_before_count > 1): 
                                 break
                             if (isinstance(ttt, NumberToken)): 
                                 print(ttt.getSourceText(), end="", file=tmp)
                                 t0 = ttt
                                 continue
                             if (ttt.is_hiphen or ttt.isChar('.')): 
                                 if (ttt.next0_ is None or not ((isinstance(ttt.next0_, NumberToken)))): 
                                     break
                                 if (ttt.is_whitespace_after or ttt.is_whitespace_before): 
                                     break
                                 continue
                             break
                         # Keep the glued value only when its length (tmp.tell()) fits the scheme;
                         # otherwise val stays None and the candidate is dropped.
                         val = (None)
                         if (tmp.tell() == 20): 
                             val = Utils.toStringStringIO(tmp)
                         elif (tmp.tell() == 9 and tok.termin.canonic_text == "БИК"): 
                             val = Utils.toStringStringIO(tmp)
                         elif (((tmp.tell() == 10 or tmp.tell() == 12)) and tok.termin.canonic_text == "ИНН"): 
                             val = Utils.toStringStringIO(tmp)
                         elif (tmp.tell() >= 15 and tok.termin.canonic_text == "Л/С"): 
                             val = Utils.toStringStringIO(tmp)
                         elif (tmp.tell() >= 11 and ((tok.termin.canonic_text == "ОГРН" or tok.termin.canonic_text == "СНИЛС"))): 
                             val = Utils.toStringStringIO(tmp)
                         elif (tok.termin.canonic_text == "ОКПО"): 
                             val = Utils.toStringStringIO(tmp)
                     if (val is None): 
                         continue
                 # Case B: the value does not start with a number token.
                 elif (not ((isinstance(t0, NumberToken)))): 
                     # B1: IBAN — concatenate number and Latin-letter tokens (hyphens skipped)
                     # up to 34 characters, stopping at a new line; at least 10 are required.
                     if ((isinstance(t0, TextToken)) and is_iban): 
                         tmp1 = io.StringIO()
                         t1 = None
                         ttt = t0
                         first_pass3152 = True
                         while True:
                             if first_pass3152: first_pass3152 = False
                             else: ttt = ttt.next0_
                             if (not (ttt is not None)): break
                             if (ttt.is_newline_before and ttt != t0): 
                                 break
                             if (ttt.is_hiphen): 
                                 continue
                             if (not ((isinstance(ttt, NumberToken)))): 
                                 if (not ((isinstance(ttt, TextToken))) or not ttt.chars.is_latin_letter): 
                                     break
                             print(ttt.getSourceText(), end="", file=tmp1)
                             t1 = ttt
                             if (tmp1.tell() >= 34): 
                                 break
                         if (tmp1.tell() < 10): 
                             continue
                         ur1 = UriReferent._new2560(Utils.toStringStringIO(tmp1), tok.termin.canonic_text)
                         ur1.addSlot(UriReferent.ATTR_DETAIL, "IBAN", False, 0)
                         rt1 = ReferentToken(ad.registerReferent(ur1), t, t1)
                         kit.embedToken(rt1)
                         t = (rt1)
                         continue
                     # B2: paired form "SCHEME1/SCHEME2 num1/num2" — a slash must be followed
                     # by a second scheme keyword carrying the same tag.
                     if (not t0.isCharOf("/\\") or t0.next0_ is None): 
                         continue
                     tok2 = UriAnalyzer.__m_schemes.tryParse(t0.next0_, TerminParseAttr.NO)
                     if (tok2 is None or not ((isinstance(tok2.termin.tag, int))) or (tok2.termin.tag) != i): 
                         continue
                     t0 = tok2.end_token.next0_
                     # Skip ':', 'N', '№' and table control chars before the first number.
                     while t0 is not None:
                         if (t0.isCharOf(":N№")): 
                             t0 = t0.next0_
                         elif (t0.is_table_control_char): 
                             t0 = t0.next0_
                             t00 = t0
                             has_tab_cel = True
                         else: 
                             break
                     if (not ((isinstance(t0, NumberToken)))): 
                         continue
                     # First number: concatenate consecutive number tokens.
                     tmp = io.StringIO()
                     while t0 is not None: 
                         if (not ((isinstance(t0, NumberToken)))): 
                             break
                         print(t0.getSourceText(), end="", file=tmp)
                         t0 = t0.next0_
                     # The two numbers must be separated by '/', '\\' or ','.
                     if (t0 is None or not t0.isCharOf("/\\,") or not ((isinstance(t0.next0_, NumberToken)))): 
                         continue
                     val = Utils.toStringStringIO(tmp)
                     Utils.setLengthStringIO(tmp, 0)
                     ur2begin = t0.next0_
                     t0 = t0.next0_
                     # Second number: consecutive number tokens; a gap of more than 4
                     # whitespaces ends it once something has been collected.
                     while t0 is not None: 
                         if (not ((isinstance(t0, NumberToken)))): 
                             break
                         if (t0.whitespaces_before_count > 4 and tmp.tell() > 0): 
                             break
                         print(t0.getSourceText(), end="", file=tmp)
                         ur2end = t0
                         t0 = t0.next0_
                     ur2 = (Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2557(tok2.termin.canonic_text, Utils.toStringStringIO(tmp))), UriReferent))
                 # Values shorter than 5 characters are rejected.
                 if (len(val) < 5): 
                     continue
                 ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2560(val, tok.termin.canonic_text)), UriReferent)
                 # In the paired form the first token ends right before the second number.
                 rt = ReferentToken(ur, t, (t0 if ur2begin is None else ur2begin.previous))
                 if (has_tab_cel): 
                     rt.begin_token = t00
                 if (ur.scheme.startswith("ОК")): 
                     UriAnalyzer.__checkDetail(rt)
                 # Look backwards (within the table cell, skipping prepositions and
                 # "ОРГАНИЗАЦИЯ") for a leading "НОМЕР"/"КОД" word to include in the token.
                 ttt = t.previous
                 first_pass3153 = True
                 while True:
                     if first_pass3153: first_pass3153 = False
                     else: ttt = ttt.previous
                     if (not (ttt is not None)): break
                     if (ttt.is_table_control_char): 
                         break
                     if (ttt.morph.class0_.is_preposition): 
                         continue
                     if (ttt.isValue("ОРГАНИЗАЦИЯ", None)): 
                         continue
                     if (ttt.isValue("НОМЕР", None) or ttt.isValue("КОД", None)): 
                         rt.begin_token = ttt
                         # NOTE(review): t is overwritten with rt right after embedToken
                         # below without being read, so this assignment looks redundant.
                         t = rt.begin_token
                     break
                 kit.embedToken(rt)
                 t = (rt)
                 if (ur2 is not None): 
                     rt2 = ReferentToken(ur2, ur2begin, ur2end)
                     kit.embedToken(rt2)
                     t = (rt2)
                 continue
             # A scheme keyword with any other tag yields nothing.
             continue
         # 2. E-mail: user name(s) before '@' and a domain name after it.
         if (t.isChar('@')): 
             u1s = UriItemToken.attachMailUsers(t.previous)
             if (u1s is None): 
                 continue
             u2 = UriItemToken.attachDomainName(t.next0_, False, True)
             if (u2 is None): 
                 continue
             # Users are processed from last to first; each one yields its own
             # "mailto" referent with the shared domain.
             for ii in range(len(u1s) - 1, -1, -1):
                 ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2560("{0}@{1}".format(u1s[ii].value, u2.value).lower(), "mailto")), UriReferent)
                 b = u1s[ii].begin_token
                 t0 = b.previous
                 if (t0 is not None and t0.isChar(':')): 
                     t0 = t0.previous
                 # For the first user only: look backwards for a label such as EMAIL,
                 # MAILTO, (E-)MAIL, ПОЧТА or АДРЕС and start the token at that label.
                 if (t0 is not None and ii == 0): 
                     # br is True while inside a parenthesised group (scanning right to left).
                     br = False
                     ttt = t0
                     first_pass3154 = True
                     while True:
                         if first_pass3154: first_pass3154 = False
                         else: ttt = ttt.previous
                         if (not (ttt is not None)): break
                         if (not ((isinstance(ttt, TextToken)))): 
                             break
                         if (ttt != t0 and ttt.whitespaces_after_count > 1): 
                             break
                         if (ttt.isChar(')')): 
                             br = True
                             continue
                         if (ttt.isChar('(')): 
                             if (not br): 
                                 break
                             br = False
                             continue
                         if (ttt.isValue("EMAIL", None) or ttt.isValue("MAILTO", None)): 
                             b = ttt
                             break
                         if (ttt.isValue("MAIL", None)): 
                             b = ttt
                             # Extend over "E-" (Latin or Cyrillic letter E) in "E-MAIL".
                             if ((ttt.previous is not None and ttt.previous.is_hiphen and ttt.previous.previous is not None) and ((ttt.previous.previous.isValue("E", None) or ttt.previous.previous.isValue("Е", None)))): 
                                 b = ttt.previous.previous
                             break
                         if (ttt.isValue("ПОЧТА", None) or ttt.isValue("АДРЕС", None)): 
                             b = t0
                             ttt = ttt.previous
                             if (ttt is not None and ttt.isChar('.')): 
                                 ttt = ttt.previous
                             # NOTE(review): "ЭЛ" is tested on t0 while "ЭЛЕКТРОННЫЙ" is tested
                             # on ttt — possibly both were meant to test ttt; confirm.
                             if (ttt is not None and ((t0.isValue("ЭЛ", None) or ttt.isValue("ЭЛЕКТРОННЫЙ", None)))): 
                                 b = ttt
                             if (b.previous is not None and b.previous.isValue("АДРЕС", None)): 
                                 b = b.previous
                             break
                         if (ttt.morph.class0_.is_preposition): 
                             continue
                 # Only the last user's token extends through the domain name.
                 rt = ReferentToken(ur, b, (u2.end_token if ii == (len(u1s) - 1) else u1s[ii].end_token))
                 kit.embedToken(rt)
                 t = (rt)
             continue
         # 3. Bare URL at a non-Cyrillic token that follows whitespace, ',' or '('.
         if (not t.morph.language.is_cyrillic): 
             if (t.is_whitespace_before or ((t.previous is not None and t.previous.isCharOf(",(")))): 
                 u1 = UriItemToken.attachUrl(t)
                 if (u1 is not None): 
                     # Skip when '@' follows directly (that is left to the e-mail branch).
                     if (u1.is_whitespace_after or u1.end_token.next0_ is None or not u1.end_token.next0_.isChar('@')): 
                         ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2557("http", u1.value)), UriReferent)
                         rt = ReferentToken(ur, u1.begin_token, u1.end_token)
                         rt.begin_token = Utils.ifNotNull(UriAnalyzer.__siteBefore(u1.begin_token.previous), u1.begin_token)
                         kit.embedToken(rt)
                         t = (rt)
                         continue
         # 4. URI content right after something __siteBefore recognises; the value must
         # contain a '.' past position 0 and no '@' past position 0.
         if ((isinstance(t, TextToken)) and not t.is_whitespace_after and t.length_char > 2): 
             if (UriAnalyzer.__siteBefore(t.previous) is not None): 
                 ut = UriItemToken.attachUriContent(t, True)
                 if (ut is None or ut.value.find('.') <= 0 or ut.value.find('@') > 0): 
                     continue
                 ur = Utils.asObjectOrNull(ad.registerReferent(UriReferent._new2557("http", ut.value)), UriReferent)
                 rt = ReferentToken(ur, t, ut.end_token)
                 rt.begin_token = UriAnalyzer.__siteBefore(t.previous)
                 if (rt.end_token.next0_ is not None and rt.end_token.next0_.isCharOf("/\\")): 
                     rt.end_token = rt.end_token.next0_
                 kit.embedToken(rt)
                 t = (rt)
                 continue
         # 5. Latin token that is not all lower-case and is immediately followed by '/':
         # delegate to __TryAttachLotus.
         if ((t.chars.is_latin_letter and not t.chars.is_all_lower and t.next0_ is not None) and not t.is_whitespace_after): 
             if (t.next0_.isChar('/')): 
                 rt = UriAnalyzer.__TryAttachLotus(Utils.asObjectOrNull(t, TextToken))
                 if (rt is not None): 
                     rt.referent = ad.registerReferent(rt.referent)
                     kit.embedToken(rt)
                     t = (rt)
                     continue