Пример #1
0
 def initialize(lang: 'MorphLang' = None) -> None:
     """ Инициализация сервиса. Каждый анализатор нужно аинициализировать отдельно.
     Если вызывается Sdk.Initialize(), то там инициализация сервиса и всех анализаторов делается.
     
     Args:
         lang(MorphLang): необходимые языки (по умолчанию, русский и английский)
     
     """
     from pullenti.ner.core.internal.NumberExHelper import NumberExHelper
     from pullenti.ner.core.internal.BlockLine import BlockLine
     from pullenti.ner.core.internal.NounPhraseItem import NounPhraseItem
     from pullenti.ner.core.PrepositionHelper import PrepositionHelper
     from pullenti.ner.core.ConjunctionHelper import ConjunctionHelper
     if (ProcessorService.__m_inited):
         return
     ProcessorService.__m_inited = True
     MorphologyService.initialize(lang)
     DerivateService.initialize(lang)
     Termin.ASSIGN_ALL_TEXTS_AS_NORMAL = True
     PrepositionHelper._initialize()
     ConjunctionHelper._initialize()
     NounPhraseItem._initialize()
     NumberHelper._initialize()
     NumberExHelper._initialize()
     BlockLine.initialize()
     Termin.ASSIGN_ALL_TEXTS_AS_NORMAL = False
Пример #2
0
 def parse_start_of_lit_block(t: 'Token') -> 'Token':
     if (t is None):
         return None
     bl = BlockLine.create(t, None)
     if (bl is not None and bl.typ == BlkTyps.LITERATURE):
         return bl.end_token
     return None
Пример #3
0
 def initialize(lang: 'MorphLang' = None) -> None:
     """ Инициализация сервиса.  
      Внимание! После этого нужно инициализровать анализаторы (см. документацию)
      <param name="lang">необходимые языки (по умолчанию, русский и английский)</param> """
     from pullenti.ner.core.internal.NumberExHelper import NumberExHelper
     from pullenti.ner.core.internal.NounPhraseItem import NounPhraseItem
     if (ProcessorService.__m_inited):
         return
     ProcessorService.__m_inited = True
     Morphology.initialize(lang)
     Explanatory.initialize(lang)
     Termin.ASSIGN_ALL_TEXTS_AS_NORMAL = True
     NounPhraseItem._initialize()
     NumberHelper._initialize()
     NumberExHelper._initialize()
     BlockLine.initialize()
     Termin.ASSIGN_ALL_TEXTS_AS_NORMAL = False
Пример #4
0
 def try_attach_list(t: 'Token') -> typing.List['BlockTitleToken']:
     content = None
     intro = None
     lits = None
     tt = t
     first_pass3035 = True
     while True:
         if first_pass3035: first_pass3035 = False
         else: tt = tt.next0_
         if (not (tt is not None)): break
         if (tt.is_newline_before):
             btt = BlockTitleToken.try_attach(tt, False, None)
             if (btt is None):
                 continue
             if (btt.typ == BlkTyps.INDEX):
                 content = btt
                 break
             if (btt.typ == BlkTyps.INTRO):
                 tt2 = btt.end_token.next0_
                 for k in range(5):
                     li = BlockLine.create(tt2, None)
                     if (li is None):
                         break
                     if (li.has_content_item_tail
                             or li.typ == BlkTyps.INDEXITEM):
                         content = btt
                         break
                     if (li.has_verb):
                         break
                     if (li.typ != BlkTyps.UNDEFINED):
                         if ((li.begin_char - btt.end_char) < 400):
                             content = btt
                             break
                     tt2 = li.end_token.next0_
                 if (content is None):
                     intro = btt
                 break
             if (btt.typ == BlkTyps.LITERATURE):
                 if (lits is None):
                     lits = list()
                 lits.append(btt)
     if (content is None and intro is None
             and ((lits is None or len(lits) != 1))):
         return None
     res = list()
     chapter_names = TerminCollection()
     t0 = None
     if (content is not None):
         res.append(content)
         cou = 0
         err = 0
         tt = content.end_token.next0_
         first_pass3036 = True
         while True:
             if first_pass3036: first_pass3036 = False
             else: tt = tt.next0_
             if (not (tt is not None)): break
             if (not tt.is_newline_before):
                 continue
             li = BlockLine.create(tt, None)
             if (li is None):
                 break
             if (li.has_verb):
                 if (li.end_token.is_char('.')):
                     break
                 if (li.length_char > 100):
                     break
             btt = BlockTitleToken.try_attach(tt, True, None)
             if (btt is None):
                 continue
             err = 0
             if (btt.typ == BlkTyps.INTRO):
                 if (content.typ == BlkTyps.INTRO or cou > 2):
                     break
             cou += 1
             content.end_token = btt.end_token
             tt = content.end_token
             if (btt.value is not None):
                 chapter_names.add_string(btt.value, None, None, False)
         content.typ = BlkTyps.INDEX
         t0 = content.end_token.next0_
     elif (intro is not None):
         t0 = intro.begin_token
     elif (lits is not None):
         t0 = t
     else:
         return None
     first = True
     tt = t0
     first_pass3037 = True
     while True:
         if first_pass3037: first_pass3037 = False
         else: tt = tt.next0_
         if (not (tt is not None)): break
         if (not tt.is_newline_before):
             continue
         if (tt.is_value("СЛАБОЕ", None)):
             pass
         btt = BlockTitleToken.try_attach(tt, False, chapter_names)
         if (btt is None):
             continue
         if (len(res) == 104):
             pass
         tt = btt.end_token
         if (content is not None and btt.typ == BlkTyps.INDEX):
             continue
         if (len(res) > 0 and res[len(res) - 1].typ == BlkTyps.LITERATURE):
             if (btt.typ != BlkTyps.APPENDIX and btt.typ != BlkTyps.MISC
                     and btt.typ != BlkTyps.LITERATURE):
                 if (btt.typ == BlkTyps.CHAPTER
                         and (res[len(res) - 1].end_char < (math.floor(
                             (len(tt.kit.sofa.text) * 3) / 4)))):
                     pass
                 else:
                     continue
         if (first):
             if ((tt.begin_char - t0.begin_char) > 300):
                 btt0 = BlockTitleToken(
                     t0, (t0 if t0.previous is None else t0.previous))
                 btt0.typ = BlkTyps.CHAPTER
                 btt0.value = "Похоже на начало"
                 res.append(btt0)
         res.append(btt)
         tt = btt.end_token
         first = False
     i = 0
     while i < (len(res) - 1):
         if (res[i].typ == BlkTyps.LITERATURE
                 and res[i + 1].typ == res[i].typ):
             del res[i + 1]
             i -= 1
         i += 1
     return res
Пример #5
0
 def try_attach(t: 'Token',
                is_content_item: bool = False,
                names: 'TerminCollection' = None) -> 'BlockTitleToken':
     if (t is None):
         return None
     if (not t.is_newline_before):
         return None
     if (t.chars.is_all_lower):
         return None
     li = BlockLine.create(t, names)
     if (li is None):
         return None
     if (li.words == 0 and li.typ == BlkTyps.UNDEFINED):
         return None
     if (li.typ == BlkTyps.INDEX):
         pass
     if (li.is_exist_name):
         return BlockTitleToken._new392(t, li.end_token, li.typ)
     if (li.end_token == li.number_end
             or ((li.end_token.is_char_of(".:")
                  and li.end_token.previous == li.number_end))):
         res2 = BlockTitleToken._new392(t, li.end_token, li.typ)
         if (li.typ == BlkTyps.CHAPTER or li.typ == BlkTyps.APPENDIX):
             li2 = BlockLine.create(li.end_token.next0_, names)
             if ((li2 is not None and li2.typ == BlkTyps.UNDEFINED
                  and li2.is_all_upper) and li2.words > 0):
                 res2.end_token = li2.end_token
                 tt = res2.end_token.next0_
                 while tt is not None:
                     li2 = BlockLine.create(tt, names)
                     if (li2 is None):
                         break
                     if (li2.typ != BlkTyps.UNDEFINED
                             or not li2.is_all_upper or li2.words == 0):
                         break
                     res2.end_token = li2.end_token
                     tt = res2.end_token
                     tt = tt.next0_
         return res2
     if (li.number_end is None):
         return None
     res = BlockTitleToken._new392(t, li.end_token, li.typ)
     if (res.typ == BlkTyps.UNDEFINED):
         if (li.words < 1):
             return None
         if (li.has_verb):
             return None
         if (not is_content_item):
             if (not li.is_all_upper or li.not_words >
                 (math.floor(li.words / 2))):
                 return None
         res.typ = BlkTyps.CHAPTER
         if ((li.number_end.end_char - t.begin_char) == 7
                 and li.number_end.next0_ is not None
                 and li.number_end.next0_.is_hiphen):
             res.typ = BlkTyps.UNDEFINED
     if (li.has_content_item_tail and is_content_item):
         res.typ = BlkTyps.INDEXITEM
     if (res.typ == BlkTyps.CHAPTER or res.typ == BlkTyps.APPENDIX):
         if (li.has_verb):
             return None
         if (li.not_words > li.words and not is_content_item):
             return None
         t = li.end_token.next0_
         while t is not None:
             li2 = BlockLine.create(t, names)
             if (li2 is None):
                 break
             if (li2.has_verb or (li2.words < 1)):
                 break
             if (not li2.is_all_upper and not is_content_item):
                 break
             if (li2.typ != BlkTyps.UNDEFINED
                     or li2.number_end is not None):
                 break
             res.end_token = li2.end_token
             t = res.end_token
             if (is_content_item and li2.has_content_item_tail):
                 res.typ = BlkTyps.INDEXITEM
                 break
             t = t.next0_
     tt = res.end_token
     while tt is not None and tt.begin_char > li.number_end.end_char:
         if ((isinstance(tt, TextToken)) and tt.chars.is_letter):
             res.value = MiscHelper.get_text_value(li.number_end.next0_, tt,
                                                   GetTextAttr.NO)
             break
         tt = tt.previous
     if ((res.typ == BlkTyps.INDEX or res.typ == BlkTyps.INTRO
          or res.typ == BlkTyps.CONSLUSION)
             or res.typ == BlkTyps.LITERATURE):
         if (res.value is not None and len(res.value) > 100):
             return None
         if (li.words < li.not_words):
             return None
     return res