def initialize() -> None:
    """Initialize the title-page analyzer and register it with the processor.

    Initializes the meta model and the internal term dictionary
    (TitleItemToken), then registers a TitlePageAnalyzer instance.

    :raises Exception: re-raises (wrapped via Utils.newException) any failure
        from the term-dictionary initialization.
    """
    MetaTitleInfo.initialize()
    try:
        Termin.ASSIGN_ALL_TEXTS_AS_NORMAL = True
        TitleItemToken.initialize()
    except Exception as ex:
        raise Utils.newException(str(ex), ex)
    finally:
        # BUGFIX: the original reset this global flag only on the success
        # path; on an exception it stayed True and leaked into every other
        # Termin consumer in the process. finally guarantees the reset.
        Termin.ASSIGN_ALL_TEXTS_AS_NORMAL = False
    ProcessorService.registerAnalyzer(TitlePageAnalyzer())
def parse(t0: 'Token', max_lines: int, max_chars: int, max_end_char: int) -> typing.List['Line']:
    """Split the token stream starting at t0 into consecutive Line objects.

    Collection stops when max_lines lines or max_chars summed characters are
    accumulated, when a token begins past max_end_char (if positive), when a
    KEYWORDS title item is met, or when a block title of a defined type starts.

    :param t0: first token of the region to split (may be None).
    :param max_lines: hard cap on the number of lines returned.
    :param max_chars: hard cap on the summed chars_count of all lines.
    :param max_end_char: absolute character position to stop at (0 = no limit).
    :return: list of collected Line objects (possibly empty).
    """
    res = list()
    total_chars = 0
    t = t0
    while t is not None:
        if (max_end_char > 0):
            if (t.begin_char > max_end_char):
                break
        # Advance t1 to the last token of the current "line": stop at a
        # newline that plausibly ends a sentence/heading.
        t1 = None
        t1 = t
        # NOTE(review): generated loop shape — the first_pass flag emulates a
        # C-style for(;;) whose increment runs at the top of each iteration.
        first_pass3393 = True
        while True:
            if first_pass3393: first_pass3393 = False
            else: t1 = t1.next0_
            if (not (t1 is not None and t1.next0_ is not None)): break
            if (t1.is_newline_after):
                # Newline followed by a possible sentence start ends the line.
                if (t1.next0_ is None or MiscHelper.can_be_start_of_sentence(t1.next0_)):
                    break
                # A person referent alone on its own line: also end the line
                # before a following capitalized word (likely a new heading).
                if (t1 == t and t.is_newline_before and (isinstance(t.get_referent(), PersonReferent))):
                    if (t1.next0_ is None):
                        continue
                    if ((isinstance(t1.next0_, TextToken)) and t1.next0_.chars.is_letter and not t1.next0_.chars.is_all_lower):
                        break
        if (t1 is None):
            t1 = t
        # A keywords section terminates line collection entirely.
        tit = TitleItemToken.try_attach(t)
        if (tit is not None):
            if (tit.typ == TitleItemToken.Types.KEYWORDS):
                break
        # So does the start of a recognized document block title.
        bl = BlockTitleToken.try_attach(t, False, None)
        if (bl is not None):
            if (bl.typ != BlkTyps.UNDEFINED):
                break
        l_ = Line(t, t1)
        res.append(l_)
        total_chars += l_.chars_count
        if (len(res) >= max_lines or total_chars >= max_chars):
            break
        t = t1
        t = t.next0_
    return res
def __calc_rank_and_value(self, min_newlines_count: int) -> bool:
    """Score the candidate title span and locate its name boundaries.

    Walks the tokens between self.begin_token and self.end_token, adjusting
    self.rank with heuristic bonuses/penalties (title items, book-link
    fragments, referents, person attributes, org types, plain words), and
    tracks tstart/tend — the actual name boundaries within the span.

    Side effects on success: sets self.begin_name_token / self.end_name_token,
    and may set self.type_value and self.speciality.

    :param min_newlines_count: typical blank-line count between lines of this
        document; larger gaps before a token are penalized proportionally.
    :return: False when the span is rejected outright, True otherwise
        (self.rank then holds the final score).
    """
    self.rank = 0
    if (self.begin_token.chars.is_all_lower):
        self.rank -= 30
    words = 0
    up_words = 0
    notwords = 0
    line_number = 0
    tstart = self.begin_token
    tend = self.end_token
    t = self.begin_token
    # Generated loop shape: the first_pass flag makes the increment run at
    # the top of every iteration except the first.
    first_pass3396 = True
    while True:
        if first_pass3396: first_pass3396 = False
        else: t = t.next0_
        if (not (t != self.end_token.next0_ and t is not None and t.end_char <= self.end_token.end_char)): break
        if (t.is_newline_before):
            pass
        # --- Title items (theme/type/speciality/roles) ---
        tit = TitleItemToken.try_attach(t)
        if (tit is not None):
            if (tit.typ == TitleItemToken.Types.THEME or tit.typ == TitleItemToken.Types.TYPANDTHEME):
                if (t != self.begin_token):
                    if (line_number > 0):
                        return False
                    # Restart counting: the real name begins after the theme marker.
                    notwords = 0
                    up_words = notwords
                    words = up_words
                    tstart = tit.end_token.next0_
                t = tit.end_token
                if (t.next0_ is None):
                    return False
                if (t.next0_.chars.is_letter and t.next0_.chars.is_all_lower):
                    self.rank += 20
                else:
                    self.rank += 100
                tstart = t.next0_
                if (tit.typ == TitleItemToken.Types.TYPANDTHEME):
                    self.type_value = tit.value
                continue
            if (tit.typ == TitleItemToken.Types.TYP):
                if (t == self.begin_token):
                    if (tit.end_token.is_newline_after):
                        self.type_value = tit.value
                        self.rank += 5
                        tstart = tit.end_token.next0_
                        t = tit.end_token
                words += 1
                if (tit.begin_token != tit.end_token):
                    words += 1
                if (tit.chars.is_all_upper):
                    up_words += 1
                continue
            if (tit.typ == TitleItemToken.Types.DUST or tit.typ == TitleItemToken.Types.SPECIALITY):
                if (t == self.begin_token):
                    return False
                self.rank -= 20
                if (tit.typ == TitleItemToken.Types.SPECIALITY):
                    self.speciality = tit.value
                t = tit.end_token
                continue
            if (tit.typ == TitleItemToken.Types.CONSULTANT or tit.typ == TitleItemToken.Types.BOSS or tit.typ == TitleItemToken.Types.EDITOR):
                t = tit.end_token
                if (t.next0_ is not None and ((t.next0_.is_char_of(":") or t.next0_.is_hiphen or t.whitespaces_after_count > 4))):
                    self.rank -= 10
                else:
                    self.rank -= 2
                continue
            return False
        # --- Bibliographic fragments are a bad sign inside a title ---
        # NOTE(review): BookLinkTyp.N is tested in both branches, so the
        # elif's -20 can never fire for N — looks like a latent oversight.
        blt = BookLinkToken.try_parse(t, 0)
        if (blt is not None):
            if (blt.typ == BookLinkTyp.MISC or blt.typ == BookLinkTyp.N or blt.typ == BookLinkTyp.PAGES):
                self.rank -= 10
            elif (blt.typ == BookLinkTyp.N or blt.typ == BookLinkTyp.PAGERANGE):
                self.rank -= 20
        if (t == self.begin_token and BookLinkToken.try_parse_author(t, FioTemplateType.UNDEFINED) is not None):
            self.rank -= 20
        # --- Line breaks inside the candidate span ---
        if (t.is_newline_before and t != self.begin_token):
            line_number += 1
            if (line_number > 4):
                return False
            if (t.chars.is_all_lower):
                self.rank += 10
            elif (t.previous.is_char('.')):
                self.rank -= 10
            elif (t.previous.is_char_of(",-")):
                self.rank += 10
            else:
                # A noun phrase continuing across the break supports cohesion.
                npt = NounPhraseHelper.try_parse(t.previous, NounPhraseParseAttr.NO, 0, None)
                if (npt is not None and npt.end_char >= t.end_char):
                    self.rank += 10
        # Penalize unusually large vertical gaps before this token.
        if (t != self.begin_token and t.newlines_before_count > min_newlines_count):
            self.rank -= (t.newlines_before_count - min_newlines_count)
        # --- Quoted fragment fully inside the span: likely the actual name ---
        bst = BracketHelper.try_parse(t, BracketParseAttr.NO, 100)
        if (bst is not None and bst.is_quote_type and bst.end_token.end_char <= self.end_token.end_char):
            if (words == 0):
                tstart = bst.begin_token
                self.rank += 10
                if (bst.end_token == self.end_token):
                    tend = self.end_token
                    self.rank += 10
        # --- Referents attached to this token ---
        rli = t.get_referents()
        if (rli is not None):
            for r in rli:
                if (isinstance(r, OrganizationReferent)):
                    if (t.is_newline_before):
                        self.rank -= 10
                    else:
                        self.rank -= 4
                    continue
                if ((isinstance(r, GeoReferent)) or (isinstance(r, PersonReferent))):
                    if (t.is_newline_before):
                        self.rank -= 5
                        if (t.is_newline_after or t.next0_ is None):
                            self.rank -= 20
                        elif (t.next0_.is_hiphen or (isinstance(t.next0_, NumberToken)) or (isinstance(t.next0_.get_referent(), DateReferent))):
                            self.rank -= 20
                    elif (t != self.begin_token):
                        self.rank -= 20
                    continue
                if ((isinstance(r, GeoReferent)) or (isinstance(r, DenominationReferent))):
                    continue
                if ((isinstance(r, UriReferent)) or (isinstance(r, PhoneReferent))):
                    return False
                if (t.is_newline_before):
                    self.rank -= 4
                else:
                    self.rank -= 2
                if (t == self.begin_token and (isinstance(self.end_token.get_referent(), PersonReferent))):
                    self.rank -= 10
            words += 1
            if (t.chars.is_all_upper):
                up_words += 1
            if (t == self.begin_token):
                if (t.is_newline_after):
                    self.rank -= 10
                elif (t.next0_ is not None and t.next0_.is_char('.') and t.next0_.is_newline_after):
                    self.rank -= 10
            continue
        # --- Numbers: spelled-out count as words, digits as non-words ---
        if (isinstance(t, NumberToken)):
            if (t.typ == NumberSpellingType.WORDS):
                words += 1
                if (t.chars.is_all_upper):
                    up_words += 1
            else:
                notwords += 1
            continue
        # --- Person attributes (job titles etc.) ---
        pat = PersonAttrToken.try_attach(t, None, PersonAttrToken.PersonAttrAttachAttrs.NO)
        if (pat is not None):
            if (t.is_newline_before):
                if (not pat.morph.case_.is_undefined and not pat.morph.case_.is_nominative):
                    pass
                elif (pat.chars.is_all_upper):
                    pass
                else:
                    self.rank -= 20
            elif (t.chars.is_all_lower):
                self.rank -= 1
            # Count every token of the attribute as a word.
            while t is not None:
                words += 1
                if (t.chars.is_all_upper):
                    up_words += 1
                if (t == pat.end_token):
                    break
                t = t.next0_
            continue
        # --- Organization type words ---
        oitt = OrgItemTypeToken.try_attach(t, True, None)
        if (oitt is not None):
            if (oitt.morph.number != MorphNumber.PLURAL and not oitt.is_doubt_root_word):
                if (not oitt.morph.case_.is_undefined and not oitt.morph.case_.is_nominative):
                    words += 1
                    if (t.chars.is_all_upper):
                        up_words += 1
                else:
                    self.rank -= 4
                    if (t == self.begin_token):
                        self.rank -= 5
            else:
                words += 1
                if (t.chars.is_all_upper):
                    up_words += 1
            t = oitt.end_token
            continue
        # --- Plain text tokens ---
        tt = Utils.asObjectOrNull(t, TextToken)
        if (tt is not None):
            if (tt.is_char('©')):
                self.rank -= 10
            if (tt.is_char('_')):
                self.rank -= 1
            if (tt.chars.is_letter):
                if (tt.length_char > 2):
                    words += 1
                    if (t.chars.is_all_upper):
                        up_words += 1
            elif (not tt.is_char(',')):
                notwords += 1
            # A finite verb strongly suggests running text, not a title.
            if (tt.is_pure_verb):
                self.rank -= 30
                words -= 1
                break
            if (tt == self.end_token):
                if (tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction):
                    self.rank -= 10
                elif (tt.is_char('.')):
                    self.rank += 5
            elif (tt.is_char_of("._")):
                self.rank -= 5
    self.rank += words
    self.rank -= notwords
    if ((words < 1) and (self.rank < 50)):
        return False
    if (tstart is None or tend is None):
        return False
    if (tstart.end_char > tend.end_char):
        return False
    # A type/speciality marker right after the span supports the candidate.
    tit1 = TitleItemToken.try_attach(self.end_token.next0_)
    if (tit1 is not None and ((tit1.typ == TitleItemToken.Types.TYP or tit1.typ == TitleItemToken.Types.SPECIALITY))):
        if (tit1.end_token.is_newline_after):
            self.rank += 15
        else:
            self.rank += 10
        if (tit1.typ == TitleItemToken.Types.SPECIALITY):
            self.speciality = tit1.value
    # Mostly-uppercase span right after a person referent: likely the title.
    if (up_words > 4 and up_words > (math.floor((0.8 * (words))))):
        if (tstart.previous is not None and (isinstance(tstart.previous.get_referent(), PersonReferent))):
            self.rank += (5 + up_words)
    self.begin_name_token = tstart
    self.end_name_token = tend
    return True
def _process(begin : 'Token', max_char_pos : int, kit : 'AnalysisKit', end_token : 'Token') -> 'TitlePageReferent':
    """Extract a TitlePageReferent (name, type, authors, org, city, date) from
    the token region starting at begin.

    Pipeline: split the region into lines; score candidate title names
    (TitleNameToken) over sliding windows of up to 5 lines; embed the best
    name; re-embed further occurrences of the name term; then sweep the
    region collecting title items and referents (persons with roles, dates,
    cities, organizations); finally fall back to a city+date pattern when no
    date was found.

    :param begin: first token of the region to analyze.
    :param max_char_pos: absolute character limit (0 = no limit).
    :param kit: analysis kit used to embed result tokens; may be None
        (then occurrences are recorded on the referent instead).
    :param end_token: out-parameter wrapper; its .value is advanced to the
        last token consumed by the extraction.
    :return: the populated referent, or None when nothing was extracted.
    """
    end_token.value = begin
    res = TitlePageReferent()
    term = None
    lines = Line.parse(begin, 30, 1500, max_char_pos)
    if (len(lines) < 1):
        return None
    cou = len(lines)
    # Estimate the typical inter-line gap (mode of newlines_before_count);
    # candidates separated by much larger gaps get penalized later.
    min_newlines_count = 10
    lines_count_stat = dict()
    i = 0
    while i < len(lines):
        if (TitleNameToken.can_be_start_of_text_or_content(lines[i].begin_token, lines[i].end_token)):
            cou = i
            break
        j = lines[i].newlines_before_count
        if (i > 0 and j > 0):
            if (not j in lines_count_stat):
                lines_count_stat[j] = 1
            else:
                lines_count_stat[j] += 1
        i += 1
    max0_ = 0
    for kp in lines_count_stat.items():
        if (kp[1] > max0_):
            max0_ = kp[1]
            min_newlines_count = kp[0]
    end_char = (lines[cou - 1].end_char if cou > 0 else 0)
    if (max_char_pos > 0 and end_char > max_char_pos):
        end_char = max_char_pos
    # Try every window of 1..5 consecutive lines as a candidate title name.
    names = list()
    i = 0
    while i < cou:
        if (i == 6):
            pass  # NOTE(review): no-op; presumably a leftover debug anchor
        j = i
        while (j < cou) and (j < (i + 5)):
            if (i == 6 and j == 8):
                pass  # NOTE(review): no-op debug anchor as well
            if (j > i):
                # Do not mix Russian and English lines in one candidate,
                # and do not jump across an unusually large gap.
                if (lines[j - 1].is_pure_en and lines[j].is_pure_ru):
                    break
                if (lines[j - 1].is_pure_ru and lines[j].is_pure_en):
                    break
                if (lines[j].newlines_before_count >= (min_newlines_count * 2)):
                    break
            ttt = TitleNameToken.try_parse(lines[i].begin_token, lines[j].end_token, min_newlines_count)
            if (ttt is not None):
                if (lines[i].is_pure_en):
                    ttt.morph.language = MorphLang.EN
                elif (lines[i].is_pure_ru):
                    ttt.morph.language = MorphLang.RU
                names.append(ttt)
            j += 1
        i += 1
    TitleNameToken.sort(names)
    # Take the best candidate; prefer a positively-ranked Russian one over
    # an English best when both exist.
    name_rt = None
    if (len(names) > 0):
        i0 = 0
        if (names[i0].morph.language.is_en):
            ii = 1
            while ii < len(names):
                if (names[ii].morph.language.is_ru and names[ii].rank > 0):
                    i0 = ii
                    break
                ii += 1
        term = res._add_name(names[i0].begin_name_token, names[i0].end_name_token)
        if (names[i0].type_value is not None):
            res._add_type(names[i0].type_value)
        if (names[i0].speciality is not None):
            res.speciality = names[i0].speciality
        rt = ReferentToken(res, names[i0].begin_token, names[i0].end_token)
        if (kit is not None):
            kit.embed_token(rt)
        else:
            res.add_occurence(TextAnnotation(rt.begin_token, rt.end_token))
        end_token.value = rt.end_token
        name_rt = rt
        if (begin.begin_char == rt.begin_char):
            begin = (rt)
    # Re-embed any further occurrences of the extracted name elsewhere in
    # the document (including surrounding quotes and a trailing period).
    if (term is not None and kit is not None):
        t = kit.first_token
        first_pass3397 = True
        while True:
            if first_pass3397: first_pass3397 = False
            else: t = t.next0_
            if (not (t is not None)): break
            tok = term.try_parse(t, TerminParseAttr.NO)
            if (tok is None):
                continue
            t0 = t
            t1 = tok.end_token
            if (t1.next0_ is not None and t1.next0_.is_char('.')):
                t1 = t1.next0_
            if (BracketHelper.can_be_start_of_sequence(t0.previous, False, False) and BracketHelper.can_be_end_of_sequence(t1.next0_, False, None, False)):
                t0 = t0.previous
                t1 = t1.next0_
            rt = ReferentToken(res, t0, t1)
            kit.embed_token(rt)
            t = (rt)
    # Sweep the region: collect title items (types, speciality, person
    # roles) and referents (persons, dates, cities, organizations).
    pr = PersonRelations()
    pers_typ = TitleItemToken.Types.UNDEFINED
    pers_types = pr.rel_types
    t = begin
    first_pass3398 = True
    while True:
        if first_pass3398: first_pass3398 = False
        else: t = t.next0_
        if (not (t is not None)): break
        if (max_char_pos > 0 and t.begin_char > max_char_pos):
            break
        if (t == name_rt):
            continue
        tpt = TitleItemToken.try_attach(t)
        if (tpt is not None):
            pers_typ = TitleItemToken.Types.UNDEFINED
            if (tpt.typ == TitleItemToken.Types.TYP):
                if (len(res.types) == 0):
                    res._add_type(tpt.value)
                elif (len(res.types) == 1):
                    # Special-case combining an existing "(авто)реферат"
                    # type with a dissertation type (RU and UA variants).
                    ty = res.types[0].upper()
                    if (ty == "РЕФЕРАТ"):
                        res._add_type(tpt.value)
                    elif (ty == "АВТОРЕФЕРАТ"):
                        if (tpt.value == "КАНДИДАТСКАЯ ДИССЕРТАЦИЯ"):
                            res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат кандидатской диссертации", True, 0)
                        elif (tpt.value == "ДОКТОРСКАЯ ДИССЕРТАЦИЯ"):
                            res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат докторской диссертации", True, 0)
                        elif (tpt.value == "МАГИСТЕРСКАЯ ДИССЕРТАЦИЯ"):
                            res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат магистерской диссертации", True, 0)
                        elif (tpt.value == "КАНДИДАТСЬКА ДИСЕРТАЦІЯ"):
                            res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат кандидатської дисертації", True, 0)
                        elif (tpt.value == "ДОКТОРСЬКА ДИСЕРТАЦІЯ"):
                            res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат докторської дисертації", True, 0)
                        elif (tpt.value == "МАГІСТЕРСЬКА ДИСЕРТАЦІЯ"):
                            res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат магістерської дисертації", True, 0)
                        else:
                            res._add_type(tpt.value)
                    elif (tpt.value == "РЕФЕРАТ" or tpt.value == "АВТОРЕФЕРАТ"):
                        if (not tpt.value in ty):
                            res._add_type(tpt.value)
            elif (tpt.typ == TitleItemToken.Types.SPECIALITY):
                if (res.speciality is None):
                    res.speciality = tpt.value
            elif (tpt.typ in pers_types):
                # Remember the role; it applies to persons that follow.
                pers_typ = tpt.typ
            t = tpt.end_token
            if (t.end_char > end_token.value.end_char):
                end_token.value = t
            if (t.next0_ is not None and t.next0_.is_char_of(":-")):
                t = t.next0_
            continue
        if (t.end_char > end_char):
            break
        rli = t.get_referents()
        if (rli is None):
            continue
        # Skip referents inside "named after ..." constructions (ИМЕНИ/ИМ.).
        if (not t.is_newline_before and (isinstance(t.previous, TextToken))):
            s = t.previous.term
            if (s == "ИМЕНИ" or s == "ИМ"):
                continue
            if (s == "." and t.previous.previous is not None and t.previous.previous.is_value("ИМ", None)):
                continue
        for r in rli:
            if (isinstance(r, PersonReferent)):
                if (r != rli[0]):
                    continue
                p = Utils.asObjectOrNull(r, PersonReferent)
                if (pers_typ != TitleItemToken.Types.UNDEFINED):
                    # A sentence boundary cancels the pending role.
                    if (t.previous is not None and t.previous.is_char('.')):
                        pers_typ = TitleItemToken.Types.UNDEFINED
                typ = pr.calc_typ_from_attrs(p)
                if (typ != TitleItemToken.Types.UNDEFINED):
                    pr.add(p, typ, 1)
                    pers_typ = typ
                elif (pers_typ != TitleItemToken.Types.UNDEFINED):
                    pr.add(p, pers_typ, 1)
                elif (t.previous is not None and t.previous.is_char('©')):
                    pers_typ = TitleItemToken.Types.WORKER
                    pr.add(p, pers_typ, 1)
                else:
                    # Role unknown: scan forward for evidence (the work
                    # itself, another attributed person, a type marker)...
                    tt = t.next0_
                    first_pass3399 = True
                    while True:
                        if first_pass3399: first_pass3399 = False
                        else: tt = tt.next0_
                        if (not (tt is not None)): break
                        rr = tt.get_referent()
                        if (rr == res):
                            pers_typ = TitleItemToken.Types.WORKER
                            break
                        if (isinstance(rr, PersonReferent)):
                            if (pr.calc_typ_from_attrs(Utils.asObjectOrNull(r, PersonReferent)) != TitleItemToken.Types.UNDEFINED):
                                break
                            else:
                                continue
                        if (rr is not None):
                            break
                        tpt = TitleItemToken.try_attach(tt)
                        if (tpt is not None):
                            if (tpt.typ != TitleItemToken.Types.TYP and tpt.typ != TitleItemToken.Types.TYPANDTHEME):
                                break
                            tt = tpt.end_token
                            if (tt.end_char > end_token.value.end_char):
                                end_token.value = tt
                            continue
                    # ...then backward for student/performer keywords.
                    if (pers_typ == TitleItemToken.Types.UNDEFINED):
                        tt = t.previous
                        while tt is not None:
                            rr = tt.get_referent()
                            if (rr == res):
                                pers_typ = TitleItemToken.Types.WORKER
                                break
                            if (rr is not None):
                                break
                            if ((tt.is_value("СТУДЕНТ", None) or tt.is_value("СТУДЕНТКА", None) or tt.is_value("СЛУШАТЕЛЬ", None)) or tt.is_value("ДИПЛОМНИК", None) or tt.is_value("ИСПОЛНИТЕЛЬ", None)):
                                pers_typ = TitleItemToken.Types.WORKER
                                break
                            tpt = TitleItemToken.try_attach(tt)
                            if (tpt is not None and tpt.typ != TitleItemToken.Types.TYP):
                                break
                            tt = tt.previous
                    if (pers_typ != TitleItemToken.Types.UNDEFINED):
                        pr.add(p, pers_typ, 1)
                    else:
                        # Undetermined role: record with half confidence.
                        pr.add(p, pers_typ, 0.5)
                if (t.end_char > end_token.value.end_char):
                    end_token.value = t
                continue
            if (r == rli[0]):
                pers_typ = TitleItemToken.Types.UNDEFINED
            if (isinstance(r, DateReferent)):
                if (res.date is None):
                    res.date = Utils.asObjectOrNull(r, DateReferent)
                    if (t.end_char > end_token.value.end_char):
                        end_token.value = t
            elif (isinstance(r, GeoReferent)):
                if (res.city is None and r.is_city):
                    res.city = Utils.asObjectOrNull(r, GeoReferent)
                    if (t.end_char > end_token.value.end_char):
                        end_token.value = t
            if (isinstance(r, OrganizationReferent)):
                org0_ = Utils.asObjectOrNull(r, OrganizationReferent)
                # "N курс" organization encodes the student's study year.
                if ("курс" in org0_.types and org0_.number is not None):
                    i = 0
                    wrapi2673 = RefOutArgWrapper(0)
                    inoutres2674 = Utils.tryParseInt(org0_.number, wrapi2673)
                    i = wrapi2673.value
                    if (inoutres2674):
                        if (i > 0 and (i < 8)):
                            res.student_year = i
                # Climb from a department to its owning organization.
                while org0_.higher is not None:
                    if (org0_.kind != OrganizationKind.DEPARTMENT):
                        break
                    org0_ = org0_.higher
                if (org0_.kind != OrganizationKind.DEPARTMENT):
                    if (res.org0_ is None):
                        res.org0_ = org0_
                    elif (OrganizationReferent.can_be_higher(res.org0_, org0_)):
                        res.org0_ = org0_
                if (t.end_char > end_token.value.end_char):
                    end_token.value = t
            if ((isinstance(r, UriReferent)) or (isinstance(r, GeoReferent))):
                if (t.end_char > end_token.value.end_char):
                    end_token.value = t
    # Materialize collected person/role relations as referent slots.
    for ty in pers_types:
        for p in pr.get_persons(ty):
            if (pr.get_attr_name_for_type(ty) is not None):
                res.add_slot(pr.get_attr_name_for_type(ty), p, False, 0)
    # No explicit author: promote the first role-less person, if any.
    if (res.get_slot_value(TitlePageReferent.ATTR_AUTHOR) is None):
        for p in pr.get_persons(TitleItemToken.Types.UNDEFINED):
            res.add_slot(TitlePageReferent.ATTR_AUTHOR, p, False, 0)
            break
    # No explicit city: take it from the organization's geo slot.
    if (res.city is None and res.org0_ is not None):
        s = res.org0_.find_slot(OrganizationReferent.ATTR_GEO, None, True)
        if (s is not None and (isinstance(s.value, GeoReferent))):
            if (s.value.is_city):
                res.city = Utils.asObjectOrNull(s.value, GeoReferent)
    # No date yet: look for the "City[,:] <date>" imprint pattern and run
    # the date analyzer on the tokens after the city.
    if (res.date is None):
        t = begin
        first_pass3400 = True
        while True:
            if first_pass3400: first_pass3400 = False
            else: t = t.next0_
            if (not (t is not None and t.end_char <= end_char)): break
            city = Utils.asObjectOrNull(t.get_referent(), GeoReferent)
            if (city is None):
                continue
            if (isinstance(t.next0_, TextToken)):
                if (t.next0_.is_char_of(":,") or t.next0_.is_hiphen):
                    t = t.next0_
            rt = t.kit.process_referent(DateAnalyzer.ANALYZER_NAME, t.next0_)
            if (rt is not None):
                rt.save_to_local_ontology()
                res.date = Utils.asObjectOrNull(rt.referent, DateReferent)
                if (kit is not None):
                    kit.embed_token(rt)
                break
    if (len(res.slots) == 0):
        return None
    else:
        return res
def __tryParse(t: 'Token', is_in_lit: bool, max_char: int = 0) -> typing.List['ReferentToken']: if (t is None): return None is_bracket_regime = False if (t.previous is not None and t.previous.isChar('(')): is_bracket_regime = True blt = BookLinkToken.tryParse(t, 0) if (blt is None): blt = BookLinkToken.tryParseAuthor(t, FioTemplateType.UNDEFINED) if (blt is None and not is_bracket_regime): return None t0 = t coef = 0 is_electr_res = False decree = None regtyp = BookLinkAnalyzer.RegionTyp.UNDEFINED num = None spec_see = None book_prev = None if (is_bracket_regime): regtyp = BookLinkAnalyzer.RegionTyp.AUTHORS elif (blt.typ == BookLinkTyp.PERSON): if (not is_in_lit): return None regtyp = BookLinkAnalyzer.RegionTyp.AUTHORS elif (blt.typ == BookLinkTyp.NUMBER): num = blt.value t = blt.end_token.next0_ if (t is None or t.is_newline_before): return None if (not t.is_whitespace_before): if (isinstance(t, NumberToken)): n = (t).value if ((((n == "3" or n == "0")) and not t.is_whitespace_after and (isinstance(t.next0_, TextToken))) and t.next0_.chars.is_all_lower): pass else: return None elif (not ((isinstance(t, TextToken))) or t.chars.is_all_lower): r = t.getReferent() if (isinstance(r, PersonReferent)): pass elif (is_in_lit and r is not None and r.type_name == "DECREE"): pass else: return None first_pass2757 = True while True: if first_pass2757: first_pass2757 = False else: t = t.next0_ if (not (t is not None)): break if (isinstance(t, NumberToken)): break if (not ((isinstance(t, TextToken)))): break if (BracketHelper.canBeStartOfSequence(t, True, False)): break if (not t.chars.is_letter): continue bbb = BookLinkToken.tryParse(t, 0) if (bbb is not None): if (bbb.typ == BookLinkTyp.TAMZE): spec_see = bbb t = bbb.end_token.next0_ break if (bbb.typ == BookLinkTyp.SEE): t = bbb.end_token continue break if (spec_see is not None and spec_see.typ == BookLinkTyp.TAMZE): coef += 1 max0_ = 1000 tt = t0 while tt is not None and max0_ > 0: if (isinstance(tt.getReferent(), 
BookLinkRefReferent)): book_prev = (tt.getReferent()).book break tt = tt.previous max0_ -= 1 blt1 = BookLinkToken.tryParseAuthor(t, FioTemplateType.UNDEFINED) if (blt1 is not None and blt1.typ == BookLinkTyp.PERSON): regtyp = BookLinkAnalyzer.RegionTyp.AUTHORS else: ok = False tt = t first_pass2758 = True while True: if first_pass2758: first_pass2758 = False else: tt = (None if tt is None else tt.next0_) if (not (tt is not None)): break if (tt.is_newline_before): break if (is_in_lit and tt.getReferent() is not None and tt.getReferent().type_name == "DECREE"): ok = True decree = tt break bbb = BookLinkToken.tryParse(tt, 0) if (bbb is None): continue if (bbb.typ == BookLinkTyp.ELECTRONRES): is_electr_res = True ok = True break if (bbb.typ == BookLinkTyp.DELIMETER): tt = bbb.end_token.next0_ if (BookLinkToken.tryParseAuthor( tt, FioTemplateType.UNDEFINED) is not None): ok = True break bbb = BookLinkToken.tryParse(tt, 0) if (bbb is not None): if (bbb.typ == BookLinkTyp.EDITORS or bbb.typ == BookLinkTyp.TRANSLATE or bbb.typ == BookLinkTyp.SOSTAVITEL): ok = True break if (not ok and not is_in_lit): if (BookLinkToken.checkLinkBefore(t0, num)): pass else: return None regtyp = BookLinkAnalyzer.RegionTyp.NAME else: return None res = BookLinkReferent() corr_authors = list() t00 = t blt00 = None start_of_name = None prev_pers_templ = FioTemplateType.UNDEFINED if (regtyp == BookLinkAnalyzer.RegionTyp.AUTHORS): first_pass2759 = True while True: if first_pass2759: first_pass2759 = False else: t = t.next0_ if (not (t is not None)): break if (max_char > 0 and t.begin_char >= max_char): break if (t.isCharOf(".;") or t.is_comma_and): continue if (t.isChar('/')): break if ((t.isChar('(') and t.next0_ is not None and t.next0_.isValue("EDS", None)) and t.next0_.next0_ is not None and t.next0_.next0_.isChar(')')): t = t.next0_.next0_.next0_ break blt = BookLinkToken.tryParseAuthor(t, prev_pers_templ) if (blt is None and t.previous is not None and t.previous.is_and): blt = 
BookLinkToken.tryParseAuthor( t.previous, FioTemplateType.UNDEFINED) if (blt is None): if ((isinstance(t.getReferent(), OrganizationReferent)) and blt00 is not None): bbb2 = BookLinkToken.tryParse(t.next0_, 0) if (bbb2 is not None): if (bbb2.typ == BookLinkTyp.YEAR): res.addSlot(BookLinkReferent.ATTR_AUTHOR, t.getReferent(), False, 0) res.year = int(bbb2.value) coef += .5 t = bbb2.end_token.next0_ break if (blt.typ == BookLinkTyp.PERSON): tt2 = blt.end_token.next0_ bbb2 = BookLinkToken.tryParse(tt2, 0) if (bbb2 is not None): if (bbb2.typ == BookLinkTyp.YEAR): res.year = int(bbb2.value) coef += .5 blt.end_token = bbb2.end_token blt00 = (None) if (blt00 is not None and ((blt00.end_token.next0_ == blt.begin_token or blt.begin_token.previous.isChar('.')))): tt11 = blt.end_token.next0_ nex = BookLinkToken.tryParse(tt11, 0) if (nex is not None and nex.typ == BookLinkTyp.ANDOTHERS): pass else: if (tt11 is None): break if (tt11.isChar('/') and tt11.next0_ is not None and tt11.next0_.isChar('/')): break if (tt11.isChar(':')): break if ((str(blt).find('.') < 0) and str(blt00).find('.') > 0): break if ((isinstance(tt11, TextToken)) and tt11.chars.is_all_lower): break if (tt11.isCharOf(",.;") and tt11.next0_ is not None): tt11 = tt11.next0_ nex = BookLinkToken.tryParse(tt11, 0) if (nex is not None and nex.typ != BookLinkTyp.PERSON and nex.typ != BookLinkTyp.ANDOTHERS): break elif ( (blt00 is not None and blt00.person_template != FioTemplateType.UNDEFINED and blt.person_template != blt00.person_template) and blt.person_template == FioTemplateType.NAMESURNAME): if (blt.end_token.next0_ is None or not blt.end_token.next0_.is_comma_and): break if (BookLinkToken.tryParseAuthor( blt.end_token.next0_.next0_, FioTemplateType.UNDEFINED) is not None): pass else: break if (blt00 is None and blt.person_template == FioTemplateType.NAMESURNAME): tt = blt.end_token.next0_ if (tt is not None and tt.is_hiphen): tt = tt.next0_ if (isinstance(tt, NumberToken)): break 
BookLinkAnalyzer.__addAuthor(res, blt) coef += 1 t = blt.end_token if (isinstance(t.getReferent(), PersonReferent)): corr_authors.append( Utils.asObjectOrNull(t, ReferentToken)) blt00 = blt prev_pers_templ = blt.person_template start_of_name = blt.start_of_name if ((start_of_name) is not None): t = t.next0_ break continue if (blt.typ == BookLinkTyp.ANDOTHERS): coef += .5 t = blt.end_token.next0_ res.authors_and_other = True break break if (t is None): return None if ((t.is_newline_before and t != t0 and num is None) and res.findSlot( BookLinkReferent.ATTR_AUTHOR, None, True) is None): return None if (start_of_name is None): if (t.chars.is_all_lower): coef -= (1) if (t.chars.is_latin_letter and not is_electr_res and num is None): if (res.getSlotValue(BookLinkReferent.ATTR_AUTHOR) is None): return None tn0 = t tn1 = None uri = None next_num = None wrapnn393 = RefOutArgWrapper(0) inoutres394 = Utils.tryParseInt(Utils.ifNotNull(num, ""), wrapnn393) nn = wrapnn393.value if (inoutres394): next_num = str((nn + 1)) br = (BracketHelper.tryParse( t, Utils.valToEnum( (BracketParseAttr.CANCONTAINSVERBS) | (BracketParseAttr.CANBEMANYLINES), BracketParseAttr), 100) if BracketHelper.canBeStartOfSequence(t, True, False) else None) if (br is not None): t = t.next0_ pages = None first_pass2760 = True while True: if first_pass2760: first_pass2760 = False else: t = t.next0_ if (not (t is not None)): break if (max_char > 0 and t.begin_char >= max_char): break if (br is not None and br.end_token == t): tn1 = t break tit = TitleItemToken.tryAttach(t) if (tit is not None): if ((tit.typ == TitleItemToken.Types.TYP and tn0 == t and br is None) and BracketHelper.canBeStartOfSequence( tit.end_token.next0_, True, False)): br = BracketHelper.tryParse(tit.end_token.next0_, BracketParseAttr.NO, 100) if (br is not None): coef += (1) if (num is not None): coef += 1 tn0 = br.begin_token tn1 = br.end_token res.typ = tit.value.lower() t = br.end_token.next0_ break if (t.is_newline_before and t != 
tn0): if (br is not None and (t.end_char < br.end_char)): pass elif (not MiscHelper.canBeStartOfSentence(t)): pass else: if (t.newlines_before_count > 1): break if ((isinstance(t, NumberToken)) and num is not None and (t).int_value is not None): if (num == str(((t).int_value - 1))): break elif (num is not None): pass else: nnn = NounPhraseHelper.tryParse( t.previous, Utils.valToEnum( ((NounPhraseParseAttr.PARSEPREPOSITION) | (NounPhraseParseAttr.PARSEADVERBS) | (NounPhraseParseAttr.PARSENUMERICASADJECTIVE)) | (NounPhraseParseAttr.MULTILINES), NounPhraseParseAttr), 0) if (nnn is not None and nnn.end_char >= t.end_char): pass else: break if (t.isCharOf(".;") and t.whitespaces_after_count > 0): tit = TitleItemToken.tryAttach(t.next0_) if ((tit) is not None): if (tit.typ == TitleItemToken.Types.TYP): break stop = True words = 0 notwords = 0 tt = t.next0_ first_pass2761 = True while True: if first_pass2761: first_pass2761 = False else: tt = tt.next0_ if (not (tt is not None)): break blt0 = BookLinkToken.tryParse(tt, 0) if (blt0 is None): if (tt.is_newline_before): break if ((isinstance(tt, TextToken)) and not tt.getMorphClassInDictionary().is_undefined ): words += 1 else: notwords += 1 if (words > 6 and words > (notwords * 4)): stop = False break continue if ((blt0.typ == BookLinkTyp.DELIMETER or blt0.typ == BookLinkTyp.TRANSLATE or blt0.typ == BookLinkTyp.TYPE) or blt0.typ == BookLinkTyp.GEO or blt0.typ == BookLinkTyp.PRESS): stop = False break if (br is not None and br.end_token.previous.end_char > t.end_char): stop = False if (stop): break if (t == decree): t = t.next0_ break blt = BookLinkToken.tryParse(t, 0) if (blt is None): tn1 = t continue if (blt.typ == BookLinkTyp.DELIMETER): break if (((blt.typ == BookLinkTyp.MISC or blt.typ == BookLinkTyp.TRANSLATE or blt.typ == BookLinkTyp.NAMETAIL) or blt.typ == BookLinkTyp.TYPE or blt.typ == BookLinkTyp.VOLUME) or blt.typ == BookLinkTyp.PAGERANGE or blt.typ == BookLinkTyp.PAGES): coef += 1 break if (blt.typ == 
BookLinkTyp.GEO or blt.typ == BookLinkTyp.PRESS): if (t.previous.is_hiphen or t.previous.isCharOf(".;") or blt.add_coef > 0): break if (blt.typ == BookLinkTyp.YEAR): if (t.previous is not None and t.previous.is_comma): break if (blt.typ == BookLinkTyp.ELECTRONRES): is_electr_res = True break if (blt.typ == BookLinkTyp.URL): if (t == tn0 or t.previous.isCharOf(":.")): is_electr_res = True break tn1 = t if (tn1 is None and start_of_name is None): if (is_electr_res): uri_re = BookLinkReferent() rt0 = ReferentToken(uri_re, t00, t) rts0 = list() bref0 = BookLinkRefReferent._new389(uri_re) if (num is not None): bref0.number = num rt01 = ReferentToken(bref0, t0, rt0.end_token) ok = False while t is not None: if (t.is_newline_before): break blt0 = BookLinkToken.tryParse(t, 0) if (blt0 is not None): if (isinstance(blt0.ref, UriReferent)): uri_re.addSlot( BookLinkReferent.ATTR_URL, Utils.asObjectOrNull(blt0.ref, UriReferent), False, 0) ok = True t = blt0.end_token rt0.end_token = rt01.end_token = t t = t.next0_ if (ok): rts0.append(rt01) rts0.append(rt0) return rts0 if (decree is not None and num is not None): rts0 = list() bref0 = BookLinkRefReferent._new389(decree.getReferent()) if (num is not None): bref0.number = num rt01 = ReferentToken(bref0, t0, decree) t = decree.next0_ while t is not None: if (t.is_newline_before): break if (isinstance(t, TextToken)): if ((t).is_pure_verb): return None rt01.end_token = t t = t.next0_ rts0.append(rt01) return rts0 if (book_prev is not None): tt = t while tt is not None and ((tt.isCharOf(",.") or tt.is_hiphen)): tt = tt.next0_ blt0 = BookLinkToken.tryParse(tt, 0) if (blt0 is not None and blt0.typ == BookLinkTyp.PAGERANGE): rts0 = list() bref0 = BookLinkRefReferent._new389(book_prev) if (num is not None): bref0.number = num bref0.pages = blt0.value rt00 = ReferentToken(bref0, t0, blt0.end_token) rts0.append(rt00) return rts0 return None if (br is not None and ((tn1 == br.end_token or tn1 == br.end_token.previous))): tn0 = tn0.next0_ 
tn1 = tn1.previous if (start_of_name is None): while tn0 is not None: if (tn0.isCharOf(":,~")): tn0 = tn0.next0_ else: break while tn1 is not None and tn1.begin_char > tn0.begin_char: if (tn1.isCharOf(".;,:(~") or tn1.is_hiphen or tn1.isValue("РЕД", None)): pass else: break tn1 = tn1.previous nam = MiscHelper.getTextValue( tn0, tn1, Utils.valToEnum( (GetTextAttr.KEEPQUOTES) | (GetTextAttr.KEEPREGISTER), GetTextAttr)) if (start_of_name is not None): if (nam is None or (len(nam) < 3)): nam = start_of_name else: nam = "{0}{1}{2}".format( start_of_name, (" " if tn0.is_whitespace_before else ""), nam) if (nam is None): return None res.name = nam if (num is None and not is_in_lit): if (len(nam) < 20): return None coef -= (2) if (len(nam) > 500): coef -= (math.floor(len(nam) / 500)) if (is_bracket_regime): coef -= 1 if (len(nam) > 200): if (num is None): return None if (res.findSlot(BookLinkReferent.ATTR_AUTHOR, None, True) is None and not BookLinkToken.checkLinkBefore(t0, num)): return None en = 0 ru = 0 ua = 0 cha = 0 nocha = 0 chalen = 0 lt0 = tn0 lt1 = tn1 if (tn1 is None): if (t is None): return None lt0 = t0 lt1 = t tn1 = t.previous tt = lt0 while tt is not None and tt.end_char <= lt1.end_char: if ((isinstance(tt, TextToken)) and tt.chars.is_letter): if (tt.chars.is_latin_letter): en += 1 elif (tt.morph.language.is_ua): ua += 1 elif (tt.morph.language.is_ru): ru += 1 if (tt.length_char > 2): cha += 1 chalen += tt.length_char elif (not ((isinstance(tt, ReferentToken)))): nocha += 1 tt = tt.next0_ if (ru > (ua + en)): res.lang = "RU" elif (ua > (ru + en)): res.lang = "UA" elif (en > (ru + ua)): res.lang = "EN" if (nocha > 3 and nocha > cha and start_of_name is None): if (nocha > (math.floor(chalen / 3))): coef -= (2) if (res.lang == "EN"): tt = tn0.next0_ first_pass2762 = True while True: if first_pass2762: first_pass2762 = False else: tt = tt.next0_ if (not (tt is not None and (tt.end_char < tn1.end_char))): break if (tt.is_comma and tt.next0_ is not None and ((not 
tt.next0_.chars.is_all_lower or (isinstance(tt.next0_, ReferentToken))))): if (tt.next0_.next0_ is not None and tt.next0_.next0_.is_comma_and): if (isinstance(tt.next0_, ReferentToken)): pass else: continue nam = MiscHelper.getTextValue( tn0, tt.previous, Utils.valToEnum((GetTextAttr.KEEPQUOTES) | (GetTextAttr.KEEPREGISTER), GetTextAttr)) if (nam is not None and len(nam) > 15): res.name = nam break rt = ReferentToken(res, t00, tn1) authors = True edits = False br = (None) first_pass2763 = True while True: if first_pass2763: first_pass2763 = False else: t = t.next0_ if (not (t is not None)): break if (max_char > 0 and t.begin_char >= max_char): break if (BracketHelper.canBeStartOfSequence(t, False, False)): br = BracketHelper.tryParse(t, BracketParseAttr.CANBEMANYLINES, 100) if (br is not None and br.length_char > 300): br = (None) blt = BookLinkToken.tryParse(t, 0) if (t.is_newline_before and not t.isChar('/') and not t.previous.isChar('/')): if (blt is not None and blt.typ == BookLinkTyp.NUMBER): break if (t.previous.isCharOf(":")): pass elif (blt is not None and (( ((blt.typ == BookLinkTyp.DELIMETER or blt.typ == BookLinkTyp.PAGERANGE or blt.typ == BookLinkTyp.PAGES) or blt.typ == BookLinkTyp.GEO or blt.typ == BookLinkTyp.PRESS) or blt.typ == BookLinkTyp.N))): pass elif (num is not None and BookLinkToken.tryParseAuthor( t, FioTemplateType.UNDEFINED) is not None): pass elif (num is not None and blt is not None and blt.typ != BookLinkTyp.NUMBER): pass elif (br is not None and (t.end_char < br.end_char) and t.begin_char > br.begin_char): pass else: ok = False mmm = 50 tt = t.next0_ while tt is not None and mmm > 0: if (tt.is_newline_before): blt2 = BookLinkToken.tryParse(tt, 0) if (blt2 is not None and blt2.typ == BookLinkTyp.NUMBER and blt2.value == next_num): ok = True break if (blt2 is not None): if (blt2.typ == BookLinkTyp.PAGES or blt2.typ == BookLinkTyp.GEO or blt2.typ == BookLinkTyp.PRESS): ok = True break tt = tt.next0_ mmm -= 1 if (not ok): npt = 
NounPhraseHelper.tryParse( t.previous, Utils.valToEnum( ((NounPhraseParseAttr.MULTILINES) | (NounPhraseParseAttr.PARSEADVERBS) | (NounPhraseParseAttr.PARSEPREPOSITION)) | (NounPhraseParseAttr.PARSEVERBS) | (NounPhraseParseAttr.PARSEPRONOUNS), NounPhraseParseAttr), 0) if (npt is not None and npt.end_char >= t.end_char): ok = True if (not ok): break rt.end_token = t if (blt is not None): rt.end_token = blt.end_token if (t.isCharOf(".,") or t.is_hiphen): continue if (t.isValue("С", None)): pass if (regtyp == BookLinkAnalyzer.RegionTyp.FIRST and blt is not None and blt.typ == BookLinkTyp.EDITORS): edits = True t = blt.end_token coef += 1 continue if (regtyp == BookLinkAnalyzer.RegionTyp.FIRST and blt is not None and blt.typ == BookLinkTyp.SOSTAVITEL): edits = False t = blt.end_token coef += 1 continue if (regtyp == BookLinkAnalyzer.RegionTyp.FIRST and authors): blt2 = BookLinkToken.tryParseAuthor(t, prev_pers_templ) if (blt2 is not None and blt2.typ == BookLinkTyp.PERSON): prev_pers_templ = blt2.person_template if (not edits): BookLinkAnalyzer.__addAuthor(res, blt2) coef += 1 t = blt2.end_token continue if (blt2 is not None and blt2.typ == BookLinkTyp.ANDOTHERS): if (not edits): res.authors_and_other = True coef += 1 t = blt2.end_token continue authors = False if (blt is None): continue if (blt.typ == BookLinkTyp.ELECTRONRES or blt.typ == BookLinkTyp.URL): is_electr_res = True if (blt.typ == BookLinkTyp.ELECTRONRES): coef += 1.5 else: coef += .5 if (isinstance(blt.ref, UriReferent)): res.addSlot(BookLinkReferent.ATTR_URL, Utils.asObjectOrNull(blt.ref, UriReferent), False, 0) elif (blt.typ == BookLinkTyp.YEAR): if (res.year == 0): res.year = int(blt.value) coef += .5 elif (blt.typ == BookLinkTyp.DELIMETER): coef += 1 if (blt.length_char == 2): regtyp = BookLinkAnalyzer.RegionTyp.SECOND else: regtyp = BookLinkAnalyzer.RegionTyp.FIRST elif ( (((blt.typ == BookLinkTyp.MISC or blt.typ == BookLinkTyp.TYPE or blt.typ == BookLinkTyp.PAGES) or blt.typ == BookLinkTyp.NAMETAIL or 
blt.typ == BookLinkTyp.TRANSLATE) or blt.typ == BookLinkTyp.PRESS or blt.typ == BookLinkTyp.VOLUME) or blt.typ == BookLinkTyp.N): coef += 1 elif (blt.typ == BookLinkTyp.PAGERANGE): pages = blt coef += 1 if (is_bracket_regime and blt.end_token.next0_ is not None and blt.end_token.next0_.isChar(')')): coef += (2) if (res.name is not None and res.findSlot(BookLinkReferent.ATTR_AUTHOR, None, True) is not None): coef = (10) elif (blt.typ == BookLinkTyp.GEO and ((regtyp == BookLinkAnalyzer.RegionTyp.SECOND or regtyp == BookLinkAnalyzer.RegionTyp.FIRST))): coef += 1 elif (blt.typ == BookLinkTyp.GEO and t.previous is not None and t.previous.isChar('.')): coef += 1 elif (blt.typ == BookLinkTyp.ANDOTHERS): coef += 1 if (authors): res.authors_and_other = True coef += blt.add_coef t = blt.end_token if ((coef < 2.5) and num is not None): if (BookLinkToken.checkLinkBefore(t0, num)): coef += (2) elif (BookLinkToken.checkLinkAfter(rt.end_token, num)): coef += (1) if (rt.length_char > 500): return None if (is_in_lit): coef += 1 if (coef < 2.5): if (is_electr_res and uri is not None): pass elif (coef >= 2 and is_in_lit): pass else: return None for rr in corr_authors: pits0 = PersonItemToken.tryAttachList( rr.begin_token, None, PersonItemToken.ParseAttr.CANINITIALBEDIGIT, 10) if (pits0 is None or (len(pits0) < 2)): continue if (pits0[0].typ == PersonItemToken.ItemType.VALUE): exi = False for i in range(len(rr.referent.slots) - 1, -1, -1): s = rr.referent.slots[i] if (s.type_name == PersonReferent.ATTR_LASTNAME): ln = Utils.asObjectOrNull(s.value, str) if (ln is None): continue if (ln == pits0[0].value): exi = True continue if (ln.find('-') > 0): ln = ln[0:0 + ln.find('-')] if (pits0[0].begin_token.isValue(ln, None)): del rr.referent.slots[i] if (not exi): rr.referent.addSlot(PersonReferent.ATTR_LASTNAME, pits0[0].value, False, 0) rts = list() bref = BookLinkRefReferent._new389(res) if (num is not None): bref.number = num rt1 = ReferentToken(bref, t0, rt.end_token) if (pages is not 
None): if (pages.value is not None): bref.pages = pages.value rt.end_token = pages.begin_token.previous rts.append(rt1) rts.append(rt) return rts