Пример #1
0
 def morph(self) -> 'MorphCollection':
     """ Морфологическая информация
     
     """
     if (self.__m_morph is None):
         self.__m_morph = MorphCollection()
     return self.__m_morph
Пример #2
0
 def _serialize(self, stream: io.IOBase) -> None:
     from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
     SerializerHelper.serializeInt(stream, self.begin_char)
     SerializerHelper.serializeInt(stream, self.end_char)
     SerializerHelper.serializeInt(stream, self.__m_attrs)
     SerializerHelper.serializeInt(stream, self.chars.value)
     if (self.__m_morph is None):
         self.__m_morph = MorphCollection()
     self.__m_morph._serialize(stream)
Пример #3
0
 def __init__(self,
              source: 'MorphToken',
              kit_: 'AnalysisKit',
              bchar: int = -1,
              echar: int = -1) -> None:
     super().__init__(kit_, (bchar if bchar >= 0 else
                             (0 if source is None else source.begin_char)),
                      (echar if echar >= 0 else
                       (0 if source is None else source.end_char)))
     self.term = None
     self.lemma = None
     self.term0 = None
     self.invariant_prefix_length_of_morph_vars = 0
     self.max_length_of_morph_vars = 0
     if (source is None):
         return
     self.chars = source.char_info
     self.term = source.term
     self.lemma = (Utils.ifNotNull(source.get_lemma(), self.term))
     self.max_length_of_morph_vars = (len(self.term))
     self.morph = MorphCollection()
     if (source.word_forms is not None):
         for wf in source.word_forms:
             self.morph.add_item(wf)
             if (wf.normal_case is not None and
                 (self.max_length_of_morph_vars < len(wf.normal_case))):
                 self.max_length_of_morph_vars = (len(wf.normal_case))
             if (wf.normal_full is not None and
                 (self.max_length_of_morph_vars < len(wf.normal_full))):
                 self.max_length_of_morph_vars = (len(wf.normal_full))
     i = 0
     while i < len(self.term):
         ch = self.term[i]
         j = 0
         j = 0
         while j < self.morph.items_count:
             wf = Utils.asObjectOrNull(self.morph.get_indexer_item(j),
                                       MorphWordForm)
             if (wf.normal_case is not None):
                 if (i >= len(wf.normal_case)):
                     break
                 if (wf.normal_case[i] != ch):
                     break
             if (wf.normal_full is not None):
                 if (i >= len(wf.normal_full)):
                     break
                 if (wf.normal_full[i] != ch):
                     break
             j += 1
         if (j < self.morph.items_count):
             break
         self.invariant_prefix_length_of_morph_vars = ((i + 1))
         i += 1
     if (self.morph.language.is_undefined
             and not source.language.is_undefined):
         self.morph.language = source.language
Пример #4
0
 def _deserialize(self, stream: io.IOBase, kit_: 'AnalysisKit',
                  vers: int) -> None:
     from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
     self.kit = kit_
     self.begin_char = SerializerHelper.deserializeInt(stream)
     self.end_char = SerializerHelper.deserializeInt(stream)
     self.__m_attrs = (SerializerHelper.deserializeInt(stream))
     self.chars = CharsInfo._new2656(
         SerializerHelper.deserializeInt(stream))
     self.__m_morph = MorphCollection()
     self.__m_morph._deserialize(stream)
Пример #5
0
 def __tryParseRu(t: 'Token') -> 'VerbPhraseToken':
     res = None
     t0 = t
     not0_ = None
     has_verb = False
     first_pass2814 = True
     while True:
         if first_pass2814: first_pass2814 = False
         else: t = t.next0_
         if (not (t is not None)): break
         if (not ((isinstance(t, TextToken)))):
             break
         tt = Utils.asObjectOrNull(t, TextToken)
         if (tt.term == "НЕ"):
             not0_ = t
             continue
         ty = 0
         mc = tt.getMorphClassInDictionary()
         if (tt.term == "НЕТ"):
             ty = 1
         elif (mc.is_adverb):
             ty = 2
         elif (tt.is_pure_verb or tt.is_verb_be):
             ty = 1
         elif (mc.is_verb):
             if (mc.is_preposition or mc.is_misc):
                 pass
             elif (mc.is_noun):
                 if (tt.term == "СТАЛИ"):
                     ty = 1
                 elif (not tt.chars.is_all_lower
                       and not MiscHelper.canBeStartOfSentence(tt)):
                     ty = 1
             elif (mc.is_proper):
                 if (tt.chars.is_all_lower):
                     ty = 1
             else:
                 ty = 1
         if (ty == 0):
             break
         if (res is None):
             res = VerbPhraseToken(t0, t)
         res.end_token = t
         it = VerbPhraseItemToken._new638(t, t, MorphCollection(t.morph))
         if (not0_ is not None):
             it.begin_token = not0_
             it.not0_ = True
             not0_ = (None)
         it.is_adverb = ty == 2
         it.normal = t.getNormalCaseText(
             (MorphClass.ADVERB if ty == 2 else MorphClass.VERB), False,
             MorphGender.UNDEFINED, False)
         res.items.append(it)
         if (not has_verb and ty == 1):
             res.morph = it.morph
             has_verb = True
     if (not has_verb):
         return None
     return res
Пример #6
0
 def __init__(self, entity : 'Referent', begin : 'Token', end : 'Token', kit_ : 'AnalysisKit'=None) -> None:
     super().__init__(begin, end, kit_)
     self.referent = None;
     self.data = None;
     self.misc_attrs = 0
     self.referent = entity
     if (self.morph is None): 
         self.morph = MorphCollection()
 def __try_parse_ru(first: 'Token',
                    typ: 'NounPhraseParseAttr',
                    max_char_pos: int,
                    def_noun: 'NounPhraseItem' = None) -> 'NounPhraseToken':
     if (first is None):
         return None
     items = None
     adverbs = None
     prep = None
     kak = False
     t0 = first
     if ((((typ) & (NounPhraseParseAttr.PARSEPREPOSITION))) !=
         (NounPhraseParseAttr.NO) and t0.is_value("КАК", None)):
         t0 = t0.next0_
         prep = PrepositionHelper.try_parse(t0)
         if (prep is not None):
             t0 = prep.end_token.next0_
         kak = True
     internal_noun_prase = None
     conj_before = False
     t = t0
     first_pass3041 = True
     while True:
         if first_pass3041: first_pass3041 = False
         else: t = t.next0_
         if (not (t is not None)): break
         if (max_char_pos > 0 and t.begin_char > max_char_pos):
             break
         if ((t.morph.class0_.is_conjunction
              and not t.morph.class0_.is_adjective
              and not t.morph.class0_.is_pronoun)
                 and not t.morph.class0_.is_noun):
             if (conj_before):
                 break
             if ((((typ) & (NounPhraseParseAttr.CANNOTHASCOMMAAND))) !=
                 (NounPhraseParseAttr.NO)):
                 break
             if (items is not None and ((t.is_and or t.is_or))):
                 conj_before = True
                 if ((t.next0_ is not None and t.next0_.is_char_of("\\/")
                      and t.next0_.next0_ is not None)
                         and t.next0_.next0_.is_or):
                     t = t.next0_.next0_
                 if (((t.next0_ is not None and t.next0_.is_char('(')
                       and t.next0_.next0_ is not None)
                      and t.next0_.next0_.is_or
                      and t.next0_.next0_.next0_ is not None)
                         and t.next0_.next0_.next0_.is_char(')')):
                     t = t.next0_.next0_.next0_
                 continue
             break
         elif (t.is_comma):
             if (conj_before or items is None):
                 break
             if ((((typ) & (NounPhraseParseAttr.CANNOTHASCOMMAAND))) !=
                 (NounPhraseParseAttr.NO)):
                 break
             mc = t.previous.get_morph_class_in_dictionary()
             if (mc.is_proper_surname or mc.is_proper_secname):
                 break
             conj_before = True
             if (kak and t.next0_ is not None
                     and t.next0_.is_value("ТАК", None)):
                 t = t.next0_
                 if (t.next0_ is not None and t.next0_.is_and):
                     t = t.next0_
                 pr = PrepositionHelper.try_parse(t.next0_)
                 if (pr is not None):
                     t = pr.end_token
             if (items[len(items) - 1].can_be_noun
                     and items[len(items) -
                               1].end_token.morph.class0_.is_pronoun):
                 break
             continue
         elif (t.is_char('(')):
             if (items is None):
                 return None
             brr = BracketHelper.try_parse(t, BracketParseAttr.NO, 100)
             if (brr is None):
                 break
             if (brr.length_char > 100):
                 break
             t = brr.end_token
             continue
         if (isinstance(t, ReferentToken)):
             if ((((typ) & (NounPhraseParseAttr.REFERENTCANBENOUN))) == (
                     NounPhraseParseAttr.NO)):
                 break
         elif (t.chars.is_latin_letter):
             break
         it = NounPhraseItem.try_parse(t, items, typ)
         if (it is None or ((not it.can_be_adj and not it.can_be_noun))):
             if (((it is not None and items is not None
                   and t.chars.is_capital_upper) and
                  (t.whitespaces_before_count < 3) and t.length_char > 3)
                     and not t.get_morph_class_in_dictionary().is_noun and
                     not t.get_morph_class_in_dictionary().is_adjective):
                 it.can_be_noun = True
                 items.append(it)
                 break
             if ((((typ) & (NounPhraseParseAttr.PARSEADVERBS))) !=
                 (NounPhraseParseAttr.NO) and (isinstance(t, TextToken))
                     and t.morph.class0_.is_adverb):
                 if (adverbs is None):
                     adverbs = list()
                 adverbs.append(Utils.asObjectOrNull(t, TextToken))
                 continue
             break
         it.conj_before = conj_before
         conj_before = False
         if (not it.can_be_adj and not it.can_be_noun):
             break
         if (t.is_newline_before and t != first):
             if ((((typ) & (NounPhraseParseAttr.MULTILINES))) !=
                 (NounPhraseParseAttr.NO)):
                 pass
             elif (items is not None
                   and t.chars != items[len(items) - 1].chars):
                 if (t.chars.is_all_lower
                         and items[len(items) - 1].chars.is_capital_upper):
                     pass
                 else:
                     break
         if (items is None):
             items = list()
         else:
             it0 = items[len(items) - 1]
             if (it0.can_be_noun and it0.is_personal_pronoun):
                 if (it.is_pronoun):
                     break
                 if ((it0.begin_token.previous is not None
                      and it0.begin_token.previous.
                      get_morph_class_in_dictionary().is_verb
                      and not it0.begin_token.previous.
                      get_morph_class_in_dictionary().is_adjective)
                         and not it0.begin_token.previous.
                         get_morph_class_in_dictionary().is_preposition):
                     if (t.morph.case_.is_nominative
                             or t.morph.case_.is_accusative):
                         pass
                     else:
                         break
                 if (it.can_be_noun and it.is_verb):
                     if (it0.previous is None):
                         pass
                     elif ((isinstance(it0.previous, TextToken))
                           and not it0.previous.chars.is_letter):
                         pass
                     else:
                         break
         items.append(it)
         t = it.end_token
         if (t.is_newline_after and not t.chars.is_all_lower):
             mc = t.get_morph_class_in_dictionary()
             if (mc.is_proper_surname):
                 break
             if (t.morph.class0_.is_proper_surname and mc.is_undefined):
                 break
     if (items is None):
         return None
     tt1 = None
     if (len(items) == 1 and items[0].can_be_adj):
         and0_ = False
         tt1 = items[0].end_token.next0_
         first_pass3042 = True
         while True:
             if first_pass3042: first_pass3042 = False
             else: tt1 = tt1.next0_
             if (not (tt1 is not None)): break
             if (tt1.is_and or tt1.is_or):
                 and0_ = True
                 break
             if (tt1.is_comma or tt1.is_value("НО", None)
                     or tt1.is_value("ТАК", None)):
                 continue
             break
         if (and0_):
             if (items[0].can_be_noun and items[0].is_personal_pronoun):
                 and0_ = False
         if (and0_):
             tt2 = tt1.next0_
             if (tt2 is not None and tt2.morph.class0_.is_preposition):
                 tt2 = tt2.next0_
             npt1 = _NounPraseHelperInt.__try_parse_ru(
                 tt2, typ, max_char_pos, None)
             if (npt1 is not None and len(npt1.adjectives) > 0):
                 ok1 = False
                 for av in items[0].adj_morph:
                     for v in npt1.noun.noun_morph:
                         if (v.check_accord(av, False, False)):
                             items[0].morph.add_item(av)
                             ok1 = True
                 if (ok1):
                     npt1.begin_token = items[0].begin_token
                     npt1.end_token = tt1.previous
                     npt1.adjectives.clear()
                     npt1.adjectives.append(items[0])
                     return npt1
     if (def_noun is not None):
         items.append(def_noun)
     last1 = items[len(items) - 1]
     check = True
     for it in items:
         if (not it.can_be_adj):
             check = False
             break
         elif (it.can_be_noun and it.is_personal_pronoun):
             check = False
             break
     tt1 = last1.end_token.next0_
     if ((tt1 is not None and check and
          ((tt1.morph.class0_.is_preposition
            or tt1.morph.case_.is_instrumental)))
             and (tt1.whitespaces_before_count < 2)):
         inp = NounPhraseHelper.try_parse(
             tt1,
             Utils.valToEnum((typ) | (NounPhraseParseAttr.PARSEPREPOSITION),
                             NounPhraseParseAttr), max_char_pos, None)
         if (inp is not None):
             tt1 = inp.end_token.next0_
             npt1 = _NounPraseHelperInt.__try_parse_ru(
                 tt1, typ, max_char_pos, None)
             if (npt1 is not None):
                 ok = True
                 ii = 0
                 first_pass3043 = True
                 while True:
                     if first_pass3043: first_pass3043 = False
                     else: ii += 1
                     if (not (ii < len(items))): break
                     it = items[ii]
                     if (NounPhraseItem.try_accord_adj_and_noun(
                             it,
                             Utils.asObjectOrNull(npt1.noun,
                                                  NounPhraseItem))):
                         continue
                     if (ii > 0):
                         inp2 = NounPhraseHelper.try_parse(
                             it.begin_token, typ, max_char_pos, None)
                         if (inp2 is not None
                                 and inp2.end_token == inp.end_token):
                             del items[ii:ii + len(items) - ii]
                             inp = inp2
                             break
                     ok = False
                     break
                 if (ok):
                     if (npt1.morph.case_.is_genitive
                             and not inp.morph.case_.is_instrumental):
                         ok = False
                 if (ok):
                     i = 0
                     while i < len(items):
                         npt1.adjectives.insert(i, items[i])
                         i += 1
                     npt1.internal_noun = inp
                     mmm = MorphCollection(npt1.morph)
                     for it in items:
                         mmm.remove_items(it.adj_morph[0], False)
                     if (mmm.gender != MorphGender.UNDEFINED
                             or mmm.number != MorphNumber.UNDEFINED
                             or not mmm.case_.is_undefined):
                         npt1.morph = mmm
                     if (adverbs is not None):
                         if (npt1.adverbs is None):
                             npt1.adverbs = adverbs
                         else:
                             npt1.adverbs[0:0] = adverbs
                     npt1.begin_token = first
                     return npt1
             if (tt1 is not None and tt1.morph.class0_.is_noun
                     and not tt1.morph.case_.is_genitive):
                 it = NounPhraseItem.try_parse(tt1, items, typ)
                 if (it is not None and it.can_be_noun):
                     internal_noun_prase = inp
                     inp.begin_token = items[0].end_token.next0_
                     items.append(it)
     i = 0
     first_pass3044 = True
     while True:
         if first_pass3044: first_pass3044 = False
         else: i += 1
         if (not (i < len(items))): break
         if (items[i].can_be_adj
                 and items[i].begin_token.morph.class0_.is_verb):
             it = items[i].begin_token
             if (not it.get_morph_class_in_dictionary().is_verb):
                 continue
             if (it.is_value("УПОЛНОМОЧЕННЫЙ", None)):
                 continue
             if ((((typ) & (NounPhraseParseAttr.PARSEVERBS))) == (
                     NounPhraseParseAttr.NO)):
                 continue
             inp = _NounPraseHelperInt.__try_parse_ru(
                 items[i].end_token.next0_, NounPhraseParseAttr.NO,
                 max_char_pos, None)
             if (inp is None):
                 continue
             if (inp.anafor is not None and i == (len(items) - 1)
                     and NounPhraseItem.try_accord_adj_and_noun(
                         items[i],
                         Utils.asObjectOrNull(inp.noun, NounPhraseItem))):
                 inp.begin_token = first
                 ii = 0
                 while ii < len(items):
                     inp.adjectives.insert(ii, items[ii])
                     ii += 1
                 return inp
             if (inp.end_token.whitespaces_after_count > 3):
                 continue
             npt1 = _NounPraseHelperInt.__try_parse_ru(
                 inp.end_token.next0_, NounPhraseParseAttr.NO, max_char_pos,
                 None)
             if (npt1 is None):
                 continue
             ok = True
             j = 0
             while j <= i:
                 if (not NounPhraseItem.try_accord_adj_and_noun(
                         items[j],
                         Utils.asObjectOrNull(npt1.noun, NounPhraseItem))):
                     ok = False
                     break
                 j += 1
             if (not ok):
                 continue
             verb = VerbPhraseHelper.try_parse(it, True, False, False)
             if (verb is None):
                 continue
             vlinks = SemanticHelper.try_create_links(verb, inp, None)
             nlinks = SemanticHelper.try_create_links(inp, npt1, None)
             if (len(vlinks) == 0 and len(nlinks) > 0):
                 continue
             j = 0
             while j <= i:
                 npt1.adjectives.insert(j, items[j])
                 j += 1
             items[i].end_token = inp.end_token
             mmm = MorphCollection(npt1.morph)
             bil = list()
             j = 0
             while j <= i:
                 bil.clear()
                 for m in items[j].adj_morph:
                     bil.append(m)
                 mmm.remove_items_list_cla(bil, None)
                 j += 1
             if (mmm.gender != MorphGender.UNDEFINED
                     or mmm.number != MorphNumber.UNDEFINED
                     or not mmm.case_.is_undefined):
                 npt1.morph = mmm
             if (adverbs is not None):
                 if (npt1.adverbs is None):
                     npt1.adverbs = adverbs
                 else:
                     npt1.adverbs[0:0] = adverbs
             npt1.begin_token = first
             return npt1
     ok2 = False
     if ((len(items) == 1 and
          (((typ) & (NounPhraseParseAttr.ADJECTIVECANBELAST))) !=
          (NounPhraseParseAttr.NO) and
          (items[0].whitespaces_after_count < 3))
             and not items[0].is_adverb):
         if (not items[0].can_be_adj):
             ok2 = True
         elif (items[0].is_personal_pronoun and items[0].can_be_noun):
             ok2 = True
     if (ok2):
         it = NounPhraseItem.try_parse(items[0].end_token.next0_, None, typ)
         if (it is not None and it.can_be_adj
                 and it.begin_token.chars.is_all_lower):
             ok2 = True
             if (it.is_adverb or it.is_verb):
                 ok2 = False
             if (it.is_pronoun and items[0].is_pronoun):
                 ok2 = False
                 if (it.can_be_adj_for_personal_pronoun
                         and items[0].is_personal_pronoun):
                     ok2 = True
             if (ok2 and NounPhraseItem.try_accord_adj_and_noun(
                     it, items[0])):
                 npt1 = _NounPraseHelperInt.__try_parse_ru(
                     it.begin_token, typ, max_char_pos, None)
                 if (npt1 is not None and ((npt1.end_char > it.end_char
                                            or len(npt1.adjectives) > 0))):
                     pass
                 else:
                     items.insert(0, it)
     noun = None
     adj_after = None
     for i in range(len(items) - 1, -1, -1):
         if (items[i].can_be_noun):
             if (items[i].conj_before):
                 continue
             if (i > 0 and not items[i - 1].can_be_adj):
                 continue
             if (i > 0 and items[i - 1].can_be_noun):
                 if (items[i - 1].is_doubt_adjective):
                     continue
                 if (items[i - 1].is_pronoun and items[i].is_pronoun):
                     if (items[i].is_pronoun and
                             items[i - 1].can_be_adj_for_personal_pronoun):
                         pass
                     else:
                         continue
             noun = items[i]
             del items[i:i + len(items) - i]
             if (adj_after is not None):
                 items.append(adj_after)
             elif (len(items) > 0 and items[0].can_be_noun
                   and not items[0].can_be_adj):
                 noun = items[0]
                 items.clear()
             break
     if (noun is None):
         return None
     res = NounPhraseToken._new466(first, noun.end_token, prep)
     if (adverbs is not None):
         for a in adverbs:
             if (a.begin_char < noun.begin_char):
                 if (len(items) == 0 and prep is None):
                     return None
                 if (res.adverbs is None):
                     res.adverbs = list()
                 res.adverbs.append(a)
     res.noun = (noun)
     res.multi_nouns = noun.multi_nouns
     if (kak):
         res.multi_nouns = True
     res.internal_noun = internal_noun_prase
     for v in noun.noun_morph:
         noun.morph.add_item(v)
     res.morph = noun.morph
     if (res.morph.case_.is_nominative and first.previous is not None
             and first.previous.morph.class0_.is_preposition):
         res.morph.case_ = (res.morph.case_) ^ MorphCase.NOMINATIVE
     if ((((typ) &
           (NounPhraseParseAttr.PARSEPRONOUNS))) == (NounPhraseParseAttr.NO)
             and ((res.morph.class0_.is_pronoun
                   or res.morph.class0_.is_personal_pronoun))):
         return None
     stat = None
     if (len(items) > 1):
         stat = dict()
     need_update_morph = False
     if (len(items) > 0):
         ok_list = list()
         is_num_not = False
         for vv in noun.noun_morph:
             i = 0
             v = vv
             i = 0
             while i < len(items):
                 ok = False
                 for av in items[i].adj_morph:
                     if (v.check_accord(av, False, False)):
                         ok = True
                         if (not ((av.case_) & v.case_).is_undefined
                                 and av.case_ != v.case_):
                             v.case_ = av.case_ = (av.case_) & v.case_
                         break
                 if (not ok):
                     if (items[i].can_be_numeric_adj
                             and items[i].try_accord_var(v, False)):
                         ok = True
                         v1 = NounPhraseItemTextVar()
                         v1.copy_from_item(v)
                         v1.number = MorphNumber.PLURAL
                         is_num_not = True
                         v1.case_ = MorphCase()
                         for a in items[i].adj_morph:
                             v1.case_ = (v1.case_) | a.case_
                         v = v1
                     else:
                         break
                 i += 1
             if (i >= len(items)):
                 ok_list.append(v)
         if (len(ok_list) > 0 and
             (((len(ok_list) < res.morph.items_count) or is_num_not))):
             res.morph = MorphCollection()
             for v in ok_list:
                 res.morph.add_item(v)
             if (not is_num_not):
                 noun.morph = res.morph
     i = 0
     first_pass3045 = True
     while True:
         if first_pass3045: first_pass3045 = False
         else: i += 1
         if (not (i < len(items))): break
         for av in items[i].adj_morph:
             for v in noun.noun_morph:
                 if (v.check_accord(av, False, False)):
                     if (not ((av.case_) & v.case_).is_undefined
                             and av.case_ != v.case_):
                         v.case_ = av.case_ = (av.case_) & v.case_
                         need_update_morph = True
                     items[i].morph.add_item(av)
                     if (stat is not None and av.normal_value is not None
                             and len(av.normal_value) > 1):
                         last = av.normal_value[len(av.normal_value) - 1]
                         if (not last in stat):
                             stat[last] = 1
                         else:
                             stat[last] += 1
         if (items[i].is_pronoun or items[i].is_personal_pronoun):
             res.anafor = items[i].begin_token
             if ((((typ) & (NounPhraseParseAttr.PARSEPRONOUNS))) == (
                     NounPhraseParseAttr.NO)):
                 continue
         tt = Utils.asObjectOrNull(items[i].begin_token, TextToken)
         if (tt is not None and not tt.term.startswith("ВЫСШ")):
             err = False
             for wf in tt.morph.items:
                 if (wf.class0_.is_adjective):
                     if (wf.contains_attr("прев.", None)):
                         if ((((typ) &
                               (NounPhraseParseAttr.IGNOREADJBEST))) !=
                             (NounPhraseParseAttr.NO)):
                             err = True
                     if (wf.contains_attr("к.ф.", None)
                             and tt.morph.class0_.is_personal_pronoun):
                         return None
             if (err):
                 continue
         if (res.morph.case_.is_nominative):
             v = MiscHelper.get_text_value_of_meta_token(
                 items[i], GetTextAttr.KEEPQUOTES)
             if (not Utils.isNullOrEmpty(v)):
                 if (items[i].get_normal_case_text(
                         None, MorphNumber.UNDEFINED, MorphGender.UNDEFINED,
                         False) != v):
                     wf = NounPhraseItemTextVar(items[i].morph, None)
                     wf.normal_value = v
                     wf.class0_ = MorphClass.ADJECTIVE
                     wf.case_ = res.morph.case_
                     if (res.morph.case_.is_prepositional
                             or res.morph.gender == MorphGender.NEUTER
                             or res.morph.gender == MorphGender.FEMINIE):
                         items[i].morph.add_item(wf)
                     else:
                         items[i].morph.insert_item(0, wf)
         res.adjectives.append(items[i])
         if (items[i].end_char > res.end_char):
             res.end_token = items[i].end_token
     i = 0
     first_pass3046 = True
     while True:
         if first_pass3046: first_pass3046 = False
         else: i += 1
         if (not (i < (len(res.adjectives) - 1))): break
         if (res.adjectives[i].whitespaces_after_count > 5):
             if (res.adjectives[i].chars != res.adjectives[i + 1].chars):
                 if (not res.adjectives[i + 1].chars.is_all_lower):
                     return None
                 if (res.adjectives[i].chars.is_all_upper
                         and res.adjectives[i + 1].chars.is_capital_upper):
                     return None
                 if (res.adjectives[i].chars.is_capital_upper
                         and res.adjectives[i + 1].chars.is_all_upper):
                     return None
             if (res.adjectives[i].whitespaces_after_count > 10):
                 if (res.adjectives[i].newlines_after_count == 1):
                     if (res.adjectives[i].chars.is_capital_upper and i == 0
                             and res.adjectives[i + 1].chars.is_all_lower):
                         continue
                     if (res.adjectives[i].chars == res.adjectives[
                             i + 1].chars):
                         continue
                 return None
     if (need_update_morph):
         noun.morph = MorphCollection()
         for v in noun.noun_morph:
             noun.morph.add_item(v)
         res.morph = noun.morph
     if (len(res.adjectives) > 0):
         if (noun.begin_token.previous is not None):
             if (noun.begin_token.previous.is_comma_and):
                 if (res.adjectives[0].begin_char > noun.begin_char):
                     pass
                 else:
                     return None
         zap = 0
         and0_ = 0
         cou = 0
         last_and = False
         i = 0
         while i < (len(res.adjectives) - 1):
             te = res.adjectives[i].end_token.next0_
             if (te is None):
                 return None
             if (te.is_char('(')):
                 pass
             elif (te.is_comma):
                 zap += 1
                 last_and = False
             elif (te.is_and or te.is_or):
                 and0_ += 1
                 last_and = True
             if (not res.adjectives[i].begin_token.morph.class0_.is_pronoun
                 ):
                 cou += 1
             i += 1
         if ((zap + and0_) > 0):
             if (and0_ > 1):
                 return None
             elif (and0_ == 1 and not last_and):
                 return None
             if ((zap + and0_) != cou):
                 if (and0_ == 1):
                     pass
                 else:
                     return None
             last = Utils.asObjectOrNull(
                 res.adjectives[len(res.adjectives) - 1], NounPhraseItem)
             if (last.is_pronoun and not last_and):
                 return None
     if (stat is not None):
         for adj in items:
             if (adj.morph.items_count > 1):
                 w1 = Utils.asObjectOrNull(adj.morph.get_indexer_item(0),
                                           NounPhraseItemTextVar)
                 w2 = Utils.asObjectOrNull(adj.morph.get_indexer_item(1),
                                           NounPhraseItemTextVar)
                 if ((len(w1.normal_value) < 2)
                         or (len(w2.normal_value) < 2)):
                     break
                 l1 = w1.normal_value[len(w1.normal_value) - 1]
                 l2 = w2.normal_value[len(w2.normal_value) - 1]
                 i1 = 0
                 i2 = 0
                 wrapi1468 = RefOutArgWrapper(0)
                 Utils.tryGetValue(stat, l1, wrapi1468)
                 i1 = wrapi1468.value
                 wrapi2467 = RefOutArgWrapper(0)
                 Utils.tryGetValue(stat, l2, wrapi2467)
                 i2 = wrapi2467.value
                 if (i1 < i2):
                     adj.morph.remove_item(1)
                     adj.morph.insert_item(0, w2)
     if (res.begin_token.get_morph_class_in_dictionary().is_verb
             and len(items) > 0):
         if (not res.begin_token.chars.is_all_lower
                 or res.begin_token.previous is None):
             pass
         elif (res.begin_token.previous.morph.class0_.is_preposition):
             pass
         else:
             comma = False
             tt = res.begin_token.previous
             first_pass3047 = True
             while True:
                 if first_pass3047: first_pass3047 = False
                 else: tt = tt.previous
                 if (not (tt is not None and tt.end_char <= res.end_char)):
                     break
                 if (tt.morph.class0_.is_adverb):
                     continue
                 if (tt.is_char_of(".;")):
                     break
                 if (tt.is_comma):
                     comma = True
                     continue
                 if (tt.is_value("НЕ", None)):
                     continue
                 if (((tt.morph.class0_.is_noun
                       or tt.morph.class0_.is_proper)) and comma):
                     for it in res.begin_token.morph.items:
                         if (it.class0_.is_verb
                                 and (isinstance(it, MorphWordForm))):
                             if (tt.morph.check_accord(it, False, False)):
                                 if (res.morph.case_.is_instrumental):
                                     return None
                 break
     if (res.begin_token == res.end_token):
         mc = res.begin_token.get_morph_class_in_dictionary()
         if (mc.is_adverb):
             if (res.begin_token.previous is not None and
                     res.begin_token.previous.morph.class0_.is_preposition):
                 pass
             elif (mc.is_noun and not mc.is_preposition
                   and not mc.is_conjunction):
                 pass
             elif (res.begin_token.is_value("ВЕСЬ", None)):
                 pass
             else:
                 return None
     if (def_noun is not None and def_noun.end_token == res.end_token
             and len(res.adjectives) > 0):
         res.end_token = res.adjectives[len(res.adjectives) - 1].end_token
     return res
 def __try_parse_en(first: 'Token', typ: 'NounPhraseParseAttr',
                    max_char_pos: int) -> 'NounPhraseToken':
     if (first is None):
         return None
     items = None
     has_article = False
     has_prop = False
     has_misc = False
     if (first.previous is not None
             and first.previous.morph.class0_.is_preposition
             and (first.whitespaces_before_count < 3)):
         has_prop = True
     t = first
     first_pass3048 = True
     while True:
         if first_pass3048: first_pass3048 = False
         else: t = t.next0_
         if (not (t is not None)): break
         if (max_char_pos > 0 and t.begin_char > max_char_pos):
             break
         if (not t.chars.is_latin_letter):
             break
         if (t != first and t.whitespaces_before_count > 2):
             if ((((typ) & (NounPhraseParseAttr.MULTILINES))) !=
                 (NounPhraseParseAttr.NO)):
                 pass
             elif (MiscHelper.is_eng_article(t.previous)):
                 pass
             else:
                 break
         tt = Utils.asObjectOrNull(t, TextToken)
         if (t == first and tt is not None):
             if (MiscHelper.is_eng_article(tt)):
                 has_article = True
                 continue
         if (isinstance(t, ReferentToken)):
             if ((((typ) & (NounPhraseParseAttr.REFERENTCANBENOUN))) == (
                     NounPhraseParseAttr.NO)):
                 break
         elif (tt is None):
             break
         if ((t.is_value("SO", None) and t.next0_ is not None
              and t.next0_.is_hiphen) and t.next0_.next0_ is not None):
             if (t.next0_.next0_.is_value("CALL", None)):
                 t = t.next0_.next0_
                 continue
         mc = t.get_morph_class_in_dictionary()
         if (mc.is_conjunction or mc.is_preposition):
             break
         if (mc.is_pronoun or mc.is_personal_pronoun):
             if ((((typ) & (NounPhraseParseAttr.PARSEPRONOUNS))) == (
                     NounPhraseParseAttr.NO)):
                 break
         elif (mc.is_misc):
             if (t.is_value("THIS", None) or t.is_value("THAT", None)):
                 has_misc = True
                 if ((((typ) & (NounPhraseParseAttr.PARSEPRONOUNS))) == (
                         NounPhraseParseAttr.NO)):
                     break
         is_adj = False
         if (((has_article or has_prop or has_misc)) and items is None):
             pass
         elif (isinstance(t, ReferentToken)):
             pass
         else:
             if (not mc.is_noun and not mc.is_adjective):
                 if (mc.is_undefined and has_article):
                     pass
                 elif (items is None and mc.is_undefined
                       and t.chars.is_capital_upper):
                     pass
                 elif (mc.is_pronoun):
                     pass
                 elif (tt.term.endswith("EAN")):
                     is_adj = True
                 elif (MiscHelper.is_eng_adj_suffix(tt.next0_)):
                     pass
                 else:
                     break
             if (mc.is_verb):
                 if (t.next0_ is not None and t.next0_.morph.class0_.is_verb
                         and (t.whitespaces_after_count < 2)):
                     pass
                 elif (t.chars.is_capital_upper
                       and not MiscHelper.can_be_start_of_sentence(t)):
                     pass
                 elif ((t.chars.is_capital_upper and mc.is_noun and
                        (isinstance(t.next0_, TextToken)))
                       and t.next0_.chars.is_capital_upper):
                     pass
                 elif (isinstance(t, ReferentToken)):
                     pass
                 else:
                     break
         if (items is None):
             items = list()
         it = NounPhraseItem(t, t)
         if (mc.is_noun):
             it.can_be_noun = True
         if (mc.is_adjective or mc.is_pronoun or is_adj):
             it.can_be_adj = True
         items.append(it)
         t = it.end_token
         if (len(items) == 1):
             if (MiscHelper.is_eng_adj_suffix(t.next0_)):
                 mc.is_noun = False
                 mc.is_adjective = True
                 t = t.next0_.next0_
     if (items is None):
         return None
     noun = items[len(items) - 1]
     res = NounPhraseToken(first, noun.end_token)
     res.noun = (noun)
     res.morph = MorphCollection()
     for v in noun.end_token.morph.items:
         if (v.class0_.is_verb):
             continue
         if (v.class0_.is_proper and noun.begin_token.chars.is_all_lower):
             continue
         if (isinstance(v, MorphWordForm)):
             wf = MorphWordForm()
             wf.copy_from_word_form(Utils.asObjectOrNull(v, MorphWordForm))
             if (has_article and v.number != MorphNumber.SINGULAR):
                 wf.number = MorphNumber.SINGULAR
             res.morph.add_item(wf)
         else:
             bi = MorphBaseInfo()
             bi.copy_from(v)
             if (has_article and v.number != MorphNumber.SINGULAR):
                 bi.number = MorphNumber.SINGULAR
             res.morph.add_item(bi)
     if (res.morph.items_count == 0 and has_article):
         res.morph.add_item(
             MorphBaseInfo._new192(MorphClass.NOUN, MorphNumber.SINGULAR))
     i = 0
     while i < (len(items) - 1):
         res.adjectives.append(items[i])
         i += 1
     return res
Пример #9
0
 def try_parse(t: 'Token', items: typing.List['NounPhraseItem'],
               attrs: 'NounPhraseParseAttr') -> 'NounPhraseItem':
     if (t is None):
         return None
     t0 = t
     _can_be_surname = False
     _is_doubt_adj = False
     rt = Utils.asObjectOrNull(t, ReferentToken)
     if (rt is not None and rt.begin_token == rt.end_token
             and (isinstance(rt.begin_token, TextToken))):
         res = NounPhraseItem.try_parse(rt.begin_token, items, attrs)
         if (res is not None):
             res.begin_token = res.end_token = t
             res.can_be_noun = True
             return res
     if (rt is not None):
         res = NounPhraseItem(t, t)
         for m in t.morph.items:
             v = NounPhraseItemTextVar(m, None)
             v.normal_value = str(t.get_referent())
             res.noun_morph.append(v)
         res.can_be_noun = True
         return res
     if (isinstance(t, NumberToken)):
         pass
     has_legal_verb = False
     if (isinstance(t, TextToken)):
         if (not t.chars.is_letter):
             return None
         str0_ = t.term
         if (str0_[len(str0_) - 1] == 'А' or str0_[len(str0_) - 1] == 'О'):
             for wf in t.morph.items:
                 if ((isinstance(wf, MorphWordForm))
                         and wf.is_in_dictionary):
                     if (wf.class0_.is_verb):
                         mc = t.get_morph_class_in_dictionary()
                         if (not mc.is_noun and
                             (((attrs) &
                               (NounPhraseParseAttr.IGNOREPARTICIPLES)))
                                 == (NounPhraseParseAttr.NO)):
                             if (not LanguageHelper.ends_with_ex(
                                     str0_, "ОГО", "ЕГО", None, None)):
                                 return None
                         has_legal_verb = True
                     if (wf.class0_.is_adverb):
                         if (t.next0_ is None or not t.next0_.is_hiphen):
                             if ((str0_ == "ВСЕГО" or str0_ == "ДОМА"
                                  or str0_ == "НЕСКОЛЬКО")
                                     or str0_ == "МНОГО"
                                     or str0_ == "ПОРЯДКА"):
                                 pass
                             else:
                                 return None
                     if (wf.class0_.is_adjective):
                         if (wf.contains_attr("к.ф.", None)):
                             if (t.get_morph_class_in_dictionary() ==
                                     MorphClass.ADJECTIVE):
                                 pass
                             else:
                                 _is_doubt_adj = True
         mc0 = t.morph.class0_
         if (mc0.is_proper_surname and not t.chars.is_all_lower):
             for wf in t.morph.items:
                 if (wf.class0_.is_proper_surname
                         and wf.number != MorphNumber.PLURAL):
                     wff = Utils.asObjectOrNull(wf, MorphWordForm)
                     if (wff is None):
                         continue
                     s = Utils.ifNotNull((Utils.ifNotNull(
                         wff.normal_full, wff.normal_case)), "")
                     if (LanguageHelper.ends_with_ex(
                             s, "ИН", "ЕН", "ЫН", None)):
                         if (not wff.is_in_dictionary):
                             _can_be_surname = True
                         else:
                             return None
                     if (wff.is_in_dictionary
                             and LanguageHelper.ends_with(s, "ОВ")):
                         _can_be_surname = True
         if (mc0.is_proper_name and not t.chars.is_all_lower):
             for wff in t.morph.items:
                 wf = Utils.asObjectOrNull(wff, MorphWordForm)
                 if (wf is None):
                     continue
                 if (wf.normal_case == "ГОР"):
                     continue
                 if (wf.class0_.is_proper_name and wf.is_in_dictionary):
                     if (wf.normal_case is None
                             or not wf.normal_case.startswith("ЛЮБ")):
                         if (mc0.is_adjective
                                 and t.morph.contains_attr("неизм.", None)):
                             pass
                         elif (
                             (((attrs) &
                               (NounPhraseParseAttr.REFERENTCANBENOUN))
                              ) == (NounPhraseParseAttr.REFERENTCANBENOUN)):
                             pass
                         else:
                             if (items is None or (len(items) < 1)):
                                 return None
                             if (not items[0].is_std_adjective):
                                 return None
         if (mc0.is_adjective and t.morph.items_count == 1):
             if (t.morph.get_indexer_item(0).contains_attr(
                     "в.ср.ст.", None)):
                 return None
         mc1 = t.get_morph_class_in_dictionary()
         if (mc1 == MorphClass.VERB and t.morph.case_.is_undefined):
             return None
         if (((((attrs) & (NounPhraseParseAttr.IGNOREPARTICIPLES)))
              == (NounPhraseParseAttr.IGNOREPARTICIPLES)
              and t.morph.class0_.is_verb and not t.morph.class0_.is_noun)
                 and not t.morph.class0_.is_proper):
             for wf in t.morph.items:
                 if (wf.class0_.is_verb):
                     if (wf.contains_attr("дейст.з.", None)):
                         if (LanguageHelper.ends_with(t.term, "СЯ")):
                             pass
                         else:
                             return None
     t1 = None
     for k in range(2):
         t = (Utils.ifNotNull(t1, t0))
         if (k == 0):
             if (((isinstance(t0, TextToken)) and t0.next0_ is not None
                  and t0.next0_.is_hiphen)
                     and t0.next0_.next0_ is not None):
                 if (not t0.is_whitespace_after
                         and not t0.morph.class0_.is_pronoun and
                         not (isinstance(t0.next0_.next0_, NumberToken))):
                     if (not t0.next0_.is_whitespace_after):
                         t = t0.next0_.next0_
                     elif (t0.next0_.next0_.chars.is_all_lower
                           and LanguageHelper.ends_with(t0.term, "О")):
                         t = t0.next0_.next0_
         it = NounPhraseItem._new404(t0, t, _can_be_surname)
         if (t0 == t and (isinstance(t0, ReferentToken))):
             it.can_be_noun = True
             it.morph = MorphCollection(t0.morph)
         can_be_prepos = False
         for v in t.morph.items:
             wf = Utils.asObjectOrNull(v, MorphWordForm)
             if (v.class0_.is_verb and not v.case_.is_undefined):
                 it.can_be_adj = True
                 it.adj_morph.append(NounPhraseItemTextVar(v, t))
                 continue
             if (v.class0_.is_preposition):
                 can_be_prepos = True
             if (v.class0_.is_adjective
                     or ((v.class0_.is_pronoun
                          and not v.class0_.is_personal_pronoun
                          and not v.contains_attr("неизм.", None))) or
                 ((v.class0_.is_noun and (isinstance(t, NumberToken))))):
                 if (NounPhraseItem.try_accord_variant(
                         items, (0 if items is None else len(items)), v,
                         False)):
                     is_doub = False
                     if (v.contains_attr("к.ф.", None)):
                         continue
                     if (v.contains_attr("собир.", None)
                             and not (isinstance(t, NumberToken))):
                         if (wf is not None and wf.is_in_dictionary):
                             return None
                         continue
                     if (v.contains_attr("сравн.", None)):
                         continue
                     ok = True
                     if (isinstance(t, TextToken)):
                         s = t.term
                         if (s == "ПРАВО" or s == "ПРАВА"):
                             ok = False
                         elif (LanguageHelper.ends_with(s, "ОВ") and
                               t.get_morph_class_in_dictionary().is_noun):
                             ok = False
                     elif (isinstance(t, NumberToken)):
                         if (v.class0_.is_noun
                                 and t.morph.class0_.is_adjective):
                             ok = False
                         elif (t.morph.class0_.is_noun and ((
                             (attrs) &
                             (NounPhraseParseAttr.PARSENUMERICASADJECTIVE)))
                               == (NounPhraseParseAttr.NO)):
                             ok = False
                     if (ok):
                         it.adj_morph.append(NounPhraseItemTextVar(v, t))
                         it.can_be_adj = True
                         if (_is_doubt_adj and t0 == t):
                             it.is_doubt_adjective = True
                         if (has_legal_verb and wf is not None
                                 and wf.is_in_dictionary):
                             it.can_be_noun = True
                         if (wf is not None and wf.class0_.is_pronoun):
                             it.can_be_noun = True
                             it.noun_morph.append(
                                 NounPhraseItemTextVar(v, t))
             can_be_noun_ = False
             if (isinstance(t, NumberToken)):
                 pass
             elif (v.class0_.is_noun
                   or ((wf is not None and wf.normal_case == "САМ"))):
                 can_be_noun_ = True
             elif (v.class0_.is_personal_pronoun):
                 if (items is None or len(items) == 0):
                     can_be_noun_ = True
                 else:
                     for it1 in items:
                         if (it1.is_verb):
                             if (len(items) == 1
                                     and not v.case_.is_nominative):
                                 can_be_noun_ = True
                             else:
                                 return None
                     if (len(items) == 1):
                         if (items[0].can_be_adj_for_personal_pronoun):
                             can_be_noun_ = True
             elif (
                 (v.class0_.is_pronoun and
                  ((items is None or len(items) == 0 or
                    ((len(items) == 1
                      and items[0].can_be_adj_for_personal_pronoun))))
                  and wf is not None) and
                 (((((wf.normal_case == "ТОТ" or wf.normal_full == "ТО"
                      or wf.normal_case == "ТО") or wf.normal_case == "ЭТО"
                     or wf.normal_case == "ВСЕ") or wf.normal_case == "ЧТО"
                    or wf.normal_case == "КТО") or wf.normal_full
                   == "КОТОРЫЙ" or wf.normal_case == "КОТОРЫЙ"))):
                 if (wf.normal_case == "ВСЕ"):
                     if (t.next0_ is not None
                             and t.next0_.is_value("РАВНО", None)):
                         return None
                 can_be_noun_ = True
             elif (wf is not None and ((Utils.ifNotNull(
                     wf.normal_full, wf.normal_case))) == "КОТОРЫЙ"
                   and (((attrs) & (NounPhraseParseAttr.PARSEPRONOUNS)))
                   == (NounPhraseParseAttr.NO)):
                 return None
             elif (v.class0_.is_proper and (isinstance(t, TextToken))):
                 if (t.length_char > 4 or v.class0_.is_proper_name):
                     can_be_noun_ = True
             if (can_be_noun_):
                 added = False
                 if (items is not None and len(items) > 1 and
                     (((attrs) & (NounPhraseParseAttr.MULTINOUNS))) !=
                     (NounPhraseParseAttr.NO)):
                     ok1 = True
                     ii = 1
                     while ii < len(items):
                         if (not items[ii].conj_before):
                             ok1 = False
                             break
                         ii += 1
                     if (ok1):
                         if (NounPhraseItem.try_accord_variant(
                                 items,
                             (0 if items is None else len(items)), v,
                                 True)):
                             it.noun_morph.append(
                                 NounPhraseItemTextVar(v, t))
                             it.can_be_noun = True
                             it.multi_nouns = True
                             added = True
                 if (not added):
                     if (NounPhraseItem.try_accord_variant(
                             items, (0 if items is None else len(items)), v,
                             False)):
                         it.noun_morph.append(NounPhraseItemTextVar(v, t))
                         it.can_be_noun = True
                         if (v.class0_.is_personal_pronoun
                                 and t.morph.contains_attr("неизм.", None)
                                 and not it.can_be_adj):
                             itt = NounPhraseItemTextVar(v, t)
                             itt.case_ = MorphCase.ALL_CASES
                             itt.number = MorphNumber.UNDEFINED
                             if (itt.normal_value is None):
                                 pass
                             it.adj_morph.append(itt)
                             it.can_be_adj = True
                     elif ((len(items) > 0 and len(items[0].adj_morph) > 0
                            and items[0].adj_morph[0].number
                            == MorphNumber.PLURAL)
                           and not ((items[0].adj_morph[0].case_)
                                    & v.case_).is_undefined
                           and not items[0].adj_morph[0].class0_.is_verb):
                         if (t.next0_ is not None and t.next0_.is_comma_and
                                 and
                             (isinstance(t.next0_.next0_, TextToken))):
                             npt2 = NounPhraseHelper.try_parse(
                                 t.next0_.next0_, attrs, 0, None)
                             if (npt2 is not None
                                     and npt2.preposition is None
                                     and not ((npt2.morph.case_) & v.case_
                                              & items[0].adj_morph[0].case_
                                              ).is_undefined):
                                 it.noun_morph.append(
                                     NounPhraseItemTextVar(v, t))
                                 it.can_be_noun = True
         if (t0 != t):
             for v in it.adj_morph:
                 v.correct_prefix(Utils.asObjectOrNull(t0, TextToken),
                                  False)
             for v in it.noun_morph:
                 v.correct_prefix(Utils.asObjectOrNull(t0, TextToken), True)
         if (k == 1 and it.can_be_noun and not it.can_be_adj):
             if (t1 is not None):
                 it.end_token = t1
             else:
                 it.end_token = t0.next0_.next0_
             for v in it.noun_morph:
                 if (v.normal_value is not None
                         and (v.normal_value.find('-') < 0)):
                     v.normal_value = "{0}-{1}".format(
                         v.normal_value,
                         it.end_token.get_normal_case_text(
                             None, MorphNumber.UNDEFINED,
                             MorphGender.UNDEFINED, False))
         if (it.can_be_adj):
             if (NounPhraseItem.__m_std_adjectives.try_parse(
                     it.begin_token, TerminParseAttr.NO) is not None):
                 it.is_std_adjective = True
         if (can_be_prepos and it.can_be_noun):
             if (items is not None and len(items) > 0):
                 npt1 = NounPhraseHelper.try_parse(
                     t,
                     Utils.valToEnum((NounPhraseParseAttr.PARSEPREPOSITION)
                                     | (NounPhraseParseAttr.PARSEPRONOUNS) |
                                     (NounPhraseParseAttr.PARSEVERBS),
                                     NounPhraseParseAttr), 0, None)
                 if (npt1 is not None and npt1.end_char > t.end_char):
                     return None
             else:
                 npt1 = NounPhraseHelper.try_parse(
                     t.next0_,
                     Utils.valToEnum((NounPhraseParseAttr.PARSEPRONOUNS) |
                                     (NounPhraseParseAttr.PARSEVERBS),
                                     NounPhraseParseAttr), 0, None)
                 if (npt1 is not None):
                     mc = LanguageHelper.get_case_after_preposition(t.lemma)
                     if (not ((mc) & npt1.morph.case_).is_undefined):
                         return None
         if (it.can_be_noun or it.can_be_adj or k == 1):
             if (it.begin_token.morph.class0_.is_pronoun):
                 tt2 = it.end_token.next0_
                 if ((tt2 is not None and tt2.is_hiphen
                      and not tt2.is_whitespace_after)
                         and not tt2.is_whitespace_before):
                     tt2 = tt2.next0_
                 if (isinstance(tt2, TextToken)):
                     ss = tt2.term
                     if ((ss == "ЖЕ" or ss == "БЫ" or ss == "ЛИ")
                             or ss == "Ж"):
                         it.end_token = tt2
                     elif (ss == "НИБУДЬ" or ss == "ЛИБО"
                           or (((ss == "ТО" and tt2.previous.is_hiphen))
                               and it.can_be_adj)):
                         it.end_token = tt2
                         for m in it.adj_morph:
                             m.normal_value = "{0}-{1}".format(
                                 m.normal_value, ss)
                             if (m.single_number_value is not None):
                                 m.single_number_value = "{0}-{1}".format(
                                     m.single_number_value, ss)
             return it
         if (t0 == t):
             if (t0.is_value("БИЗНЕС", None) and t0.next0_ is not None
                     and t0.next0_.chars == t0.chars):
                 t1 = t0.next0_
                 continue
             return it
     return None
Пример #10
0
class TextToken(Token):
    """ Входной токен (после морфанализа) """
    def __init__(self, source: 'MorphToken', kit_: 'AnalysisKit') -> None:
        super().__init__(kit_, (0 if source is None else source.begin_char),
                         (0 if source is None else source.end_char))
        self.term = None
        self.lemma = None
        self.term0 = None
        self.invariant_prefix_length = 0
        self.max_length = 0
        if (source is None):
            return
        self.chars = source.char_info
        self.term = source.term
        self.lemma = (Utils.ifNotNull(source.lemma, self.term))
        self.max_length = (len(self.term))
        self.morph = MorphCollection()
        if (source.word_forms is not None):
            for wf in source.word_forms:
                self.morph.addItem(wf)
                if (wf.normal_case is not None
                        and (self.max_length < len(wf.normal_case))):
                    self.max_length = (len(wf.normal_case))
                if (wf.normal_full is not None
                        and (self.max_length < len(wf.normal_full))):
                    self.max_length = (len(wf.normal_full))
        i = 0
        while i < len(self.term):
            ch = self.term[i]
            j = 0
            while j < self.morph.items_count:
                wf = Utils.asObjectOrNull(self.morph.getIndexerItem(j),
                                          MorphWordForm)
                if (wf.normal_case is not None):
                    if (i >= len(wf.normal_case)):
                        break
                    if (wf.normal_case[i] != ch):
                        break
                if (wf.normal_full is not None):
                    if (i >= len(wf.normal_full)):
                        break
                    if (wf.normal_full[i] != ch):
                        break
                j += 1
            if (j < self.morph.items_count):
                break
            self.invariant_prefix_length = ((i + 1))
            i += 1
        if (self.morph.language.is_undefined
                and not source.language.is_undefined):
            self.morph.language = source.language

    def getLemma(self) -> str:
        """ Получить лемму (устарело, используйте Lemma)
        
        """
        return self.lemma

    def __str__(self) -> str:
        res = Utils.newStringIO(self.term)
        for l_ in self.morph.items:
            print(", {0}".format(str(l_)), end="", file=res, flush=True)
        return Utils.toStringStringIO(res)

    def checkValue(self, dict0_: typing.List[tuple]) -> object:
        """ Попробовать привязать словарь
        
        Args:
            dict0_(typing.List[tuple]): 
        
        """
        if (dict0_ is None):
            return None
        wrapres2699 = RefOutArgWrapper(None)
        inoutres2700 = Utils.tryGetValue(dict0_, self.term, wrapres2699)
        res = wrapres2699.value
        if (inoutres2700):
            return res
        if (self.morph is not None):
            for it in self.morph.items:
                mf = Utils.asObjectOrNull(it, MorphWordForm)
                if (mf is not None):
                    if (mf.normal_case is not None):
                        wrapres2695 = RefOutArgWrapper(None)
                        inoutres2696 = Utils.tryGetValue(
                            dict0_, mf.normal_case, wrapres2695)
                        res = wrapres2695.value
                        if (inoutres2696):
                            return res
                    if (mf.normal_full is not None
                            and mf.normal_case != mf.normal_full):
                        wrapres2697 = RefOutArgWrapper(None)
                        inoutres2698 = Utils.tryGetValue(
                            dict0_, mf.normal_full, wrapres2697)
                        res = wrapres2697.value
                        if (inoutres2698):
                            return res
        return None

    def getSourceText(self) -> str:
        return super().getSourceText()

    def isValue(self, term_: str, termua: str = None) -> bool:
        if (termua is not None and self.morph.language.is_ua):
            if (self.isValue(termua, None)):
                return True
        if (term_ is None):
            return False
        if (self.invariant_prefix_length > len(term_)):
            return False
        if (self.max_length >= len(self.term)
                and (self.max_length < len(term_))):
            return False
        if (term_ == self.term):
            return True
        for wf in self.morph.items:
            if ((wf).normal_case == term_ or (wf).normal_full == term_):
                return True
        return False

    @property
    def is_and(self) -> bool:
        """ Это соединительный союз И (на всех языках) """
        if (not self.morph.class0_.is_conjunction):
            if (self.length_char == 1 and self.isChar('&')):
                return True
            return False
        val = self.term
        if (val == "И" or val == "AND" or val == "UND"):
            return True
        if (self.morph.language.is_ua):
            if (val == "І" or val == "ТА"):
                return True
        return False

    @property
    def is_or(self) -> bool:
        """ Это соединительный союз ИЛИ (на всех языках) """
        if (not self.morph.class0_.is_conjunction):
            return False
        val = self.term
        if (val == "ИЛИ" or val == "OR"):
            return True
        if (self.morph.language.is_ua):
            if (val == "АБО"):
                return True
        return False

    @property
    def is_letters(self) -> bool:
        return str.isalpha(self.term[0])

    def getMorphClassInDictionary(self) -> 'MorphClass':
        res = MorphClass()
        for wf in self.morph.items:
            if ((isinstance(wf, MorphWordForm)) and (wf).is_in_dictionary):
                res |= wf.class0_
        return res

    def getNormalCaseText(self,
                          mc: 'MorphClass' = None,
                          single_number: bool = False,
                          gender: 'MorphGender' = MorphGender.UNDEFINED,
                          keep_chars: bool = False) -> str:
        from pullenti.ner.core.MiscHelper import MiscHelper
        empty = True
        if (mc is not None and mc.is_preposition):
            return LanguageHelper.normalizePreposition(self.term)
        for it in self.morph.items:
            if (mc is not None and not mc.is_undefined):
                cc = (it.class0_.value) & (mc.value)
                if (cc == 0):
                    continue
                if (MorphClass.isMiscInt(cc) and not MorphClass.isProperInt(cc)
                        and mc.value != it.class0_.value):
                    continue
            wf = Utils.asObjectOrNull(it, MorphWordForm)
            normal_full = False
            if (gender != MorphGender.UNDEFINED):
                if ((((it.gender) & (gender))) == (MorphGender.UNDEFINED)):
                    if ((gender == MorphGender.MASCULINE and
                         ((it.gender != MorphGender.UNDEFINED or it.number
                           == MorphNumber.PLURAL)) and wf is not None)
                            and wf.normal_full is not None):
                        normal_full = True
                    elif (gender == MorphGender.MASCULINE
                          and it.class0_.is_personal_pronoun):
                        pass
                    else:
                        continue
            if (not it.case_.is_undefined):
                empty = False
            if (wf is not None):
                if (single_number and it.number == MorphNumber.PLURAL
                        and wf.normal_full is not None):
                    le = len(wf.normal_case)
                    if ((le == (len(wf.normal_full) + 2) and le > 4
                         and wf.normal_case[le - 2] == 'С')
                            and wf.normal_case[le - 1] == 'Я'):
                        res = wf.normal_case
                    else:
                        res = (wf.normal_full
                               if normal_full else wf.normal_full)
                else:
                    res = (wf.normal_full if normal_full else
                           (Utils.ifNotNull(wf.normal_case, self.term)))
                if (single_number and mc is not None
                        and mc == MorphClass.NOUN):
                    if (res == "ДЕТИ"):
                        res = "РЕБЕНОК"
                if (keep_chars):
                    if (self.chars.is_all_lower):
                        res = res.lower()
                    elif (self.chars.is_capital_upper):
                        res = MiscHelper.convertFirstCharUpperAndOtherLower(
                            res)
                return res
        if (not empty):
            return None
        te = None
        if (single_number and mc is not None):
            bi = MorphBaseInfo._new549(MorphClass(mc), gender,
                                       MorphNumber.SINGULAR,
                                       self.morph.language)
            vars0_ = Morphology.getWordform(self.term, bi)
            if (vars0_ is not None):
                te = vars0_
        if (self.chars.is_cyrillic_letter and te is None
                and len(self.term) > 3):
            ch0 = self.term[len(self.term) - 1]
            ch1 = self.term[len(self.term) - 2]
            if (ch0 == 'М' and ((ch1 == 'О' or ch1 == 'А'))):
                te = self.term[0:0 + len(self.term) - 2]
            elif (not LanguageHelper.isCyrillicVowel(ch1)
                  and LanguageHelper.isCyrillicVowel(ch0)):
                te = self.term[0:0 + len(self.term) - 1]
        if (te is None):
            te = self.term
        if (keep_chars):
            if (self.chars.is_all_lower):
                return te.lower()
            elif (self.chars.is_capital_upper):
                return MiscHelper.convertFirstCharUpperAndOtherLower(te)
        return te

    @staticmethod
    def getSourceTextTokens(begin: 'Token',
                            end: 'Token') -> typing.List['TextToken']:
        from pullenti.ner.MetaToken import MetaToken
        res = list()
        t = begin
        while t is not None and t != end.next0_ and t.end_char <= end.end_char:
            if (isinstance(t, TextToken)):
                res.append(Utils.asObjectOrNull(t, TextToken))
            elif (isinstance(t, MetaToken)):
                res.extend(
                    TextToken.getSourceTextTokens((t).begin_token,
                                                  (t).end_token))
            t = t.next0_
        return res

    @property
    def is_pure_verb(self) -> bool:
        """ Признак того, что это чистый глагол """
        ret = False
        if ((self.isValue("МОЖНО", None) or self.isValue("МОЖЕТ", None)
             or self.isValue("ДОЛЖНЫЙ", None)) or self.isValue("НУЖНО", None)):
            return True
        for it in self.morph.items:
            if ((isinstance(it, MorphWordForm)) and (it).is_in_dictionary):
                if (it.class0_.is_verb and it.case_.is_undefined):
                    ret = True
                elif (not it.class0_.is_verb):
                    if (it.class0_.is_adjective
                            and it.containsAttr("к.ф.", None)):
                        pass
                    else:
                        return False
        return ret

    @property
    def is_verb_be(self) -> bool:
        """ Проверка, что это глагол типа БЫТЬ, ЯВЛЯТЬСЯ и т.п. """
        if ((self.isValue("БЫТЬ", None) or self.isValue("ЕСТЬ", None)
             or self.isValue("ЯВЛЯТЬ", None)) or self.isValue("BE", None)):
            return True
        if (self.term == "IS" or self.term == "WAS" or self.term == "BECAME"):
            return True
        if (self.term == "Є"):
            return True
        return False

    def _serialize(self, stream: io.IOBase) -> None:
        from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
        super()._serialize(stream)
        SerializerHelper.serializeString(stream, self.term)
        SerializerHelper.serializeString(stream, self.lemma)
        SerializerHelper.serializeShort(stream, self.invariant_prefix_length)
        SerializerHelper.serializeShort(stream, self.max_length)

    def _deserialize(self, stream: io.IOBase, kit_: 'AnalysisKit',
                     vers: int) -> None:
        from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
        super()._deserialize(stream, kit_, vers)
        self.term = SerializerHelper.deserializeString(stream)
        self.lemma = SerializerHelper.deserializeString(stream)
        self.invariant_prefix_length = SerializerHelper.deserializeShort(
            stream)
        self.max_length = SerializerHelper.deserializeShort(stream)

    @staticmethod
    def _new538(_arg1: 'MorphToken', _arg2: 'AnalysisKit',
                _arg3: str) -> 'TextToken':
        res = TextToken(_arg1, _arg2)
        res.term0 = _arg3
        return res

    @staticmethod
    def _new541(_arg1: 'MorphToken', _arg2: 'AnalysisKit', _arg3: 'CharsInfo',
                _arg4: int, _arg5: int, _arg6: str) -> 'TextToken':
        res = TextToken(_arg1, _arg2)
        res.chars = _arg3
        res.begin_char = _arg4
        res.end_char = _arg5
        res.term0 = _arg6
        return res
Пример #11
0
 def tryParse(
         self,
         t0: 'Token',
         pars: 'TerminParseAttr' = TerminParseAttr.NO) -> 'TerminToken':
     """ Попробовать привязать термин
     
     Args:
         t0(Token): 
         fullWordsOnly: 
     
     """
     from pullenti.ner.core.MiscHelper import MiscHelper
     from pullenti.ner.core.BracketHelper import BracketHelper
     if (t0 is None):
         return None
     term = None
     if (isinstance(t0, TextToken)):
         term = (t0).term
     if (self.acronym_smart is not None
             and (((pars) &
                   (TerminParseAttr.FULLWORDSONLY))) == (TerminParseAttr.NO)
             and term is not None):
         if (self.acronym_smart == term):
             if (t0.next0_ is not None and t0.next0_.isChar('.')
                     and not t0.is_whitespace_after):
                 return TerminToken._new606(t0, t0.next0_, self)
             else:
                 return TerminToken._new606(t0, t0, self)
         t1 = Utils.asObjectOrNull(t0, TextToken)
         tt = Utils.asObjectOrNull(t0, TextToken)
         i = 0
         while i < len(self.acronym):
             if (tt is None):
                 break
             term1 = tt.term
             if (len(term1) != 1 or tt.is_whitespace_after):
                 break
             if (i > 0 and tt.is_whitespace_before):
                 break
             if (term1[0] != self.acronym[i]):
                 break
             if (tt.next0_ is None or not tt.next0_.isChar('.')):
                 break
             t1 = (Utils.asObjectOrNull(tt.next0_, TextToken))
             tt = (Utils.asObjectOrNull(tt.next0_.next0_, TextToken))
             i += 1
         if (i >= len(self.acronym)):
             return TerminToken._new606(t0, t1, self)
     if (self.acronym is not None and term is not None
             and self.acronym == term):
         if (t0.chars.is_all_upper or self.acronym_can_be_lower
                 or ((not t0.chars.is_all_lower and len(term) >= 3))):
             return TerminToken._new606(t0, t0, self)
     if (self.acronym is not None and t0.chars.is_last_lower
             and t0.length_char > 3):
         if (t0.isValue(self.acronym, None)):
             return TerminToken._new606(t0, t0, self)
     cou = 0
     i = 0
     while i < len(self.terms):
         if (self.terms[i].is_hiphen):
             cou -= 1
         else:
             cou += 1
         i += 1
     if (len(self.terms) > 0
             and ((not self.ignore_terms_order or cou == 1))):
         t1 = t0
         tt = t0
         e0_ = None
         eup = None
         ok = True
         mc = None
         dont_change_mc = False
         i = 0
         first_pass2812 = True
         while True:
             if first_pass2812: first_pass2812 = False
             else: i += 1
             if (not (i < len(self.terms))): break
             if (self.terms[i].is_hiphen):
                 continue
             if (tt is not None and tt.is_hiphen and i > 0):
                 tt = tt.next0_
             if (i > 0 and tt is not None):
                 if ((((pars) & (TerminParseAttr.IGNOREBRACKETS))) !=
                     (TerminParseAttr.NO) and not tt.chars.is_letter
                         and BracketHelper.isBracket(tt, False)):
                     tt = tt.next0_
             if (((((pars) & (TerminParseAttr.CANBEGEOOBJECT))) !=
                  (TerminParseAttr.NO) and i > 0 and
                  (isinstance(tt, ReferentToken)))
                     and tt.getReferent().type_name == "GEO"):
                 tt = tt.next0_
             if ((isinstance(tt, ReferentToken)) and e0_ is None):
                 eup = tt
                 e0_ = (tt).end_token
                 tt = (tt).begin_token
             if (tt is None):
                 ok = False
                 break
             if (not self.terms[i].checkByToken(tt)):
                 if (tt.next0_ is not None and tt.isChar('.')
                         and self.terms[i].checkByToken(tt.next0_)):
                     tt = tt.next0_
                 elif (((i > 0 and tt.next0_ is not None and
                         (isinstance(tt, TextToken))) and
                        ((tt.morph.class0_.is_preposition
                          or MiscHelper.isEngArticle(tt)))
                        and self.terms[i].checkByToken(tt.next0_))
                       and not self.terms[i - 1].is_pattern_any):
                     tt = tt.next0_
                 else:
                     ok = False
                     if (((i + 2) < len(self.terms))
                             and self.terms[i + 1].is_hiphen
                             and self.terms[i + 2].checkByPrefToken(
                                 self.terms[i],
                                 Utils.asObjectOrNull(tt, TextToken))):
                         i += 2
                         ok = True
                     elif (((not tt.is_whitespace_after
                             and tt.next0_ is not None and
                             (isinstance(tt, TextToken))) and
                            (tt).length_char == 1
                            and tt.next0_.isCharOf("\"'`’“”"))
                           and not tt.next0_.is_whitespace_after
                           and (isinstance(tt.next0_.next0_, TextToken))):
                         if (self.terms[i].checkByStrPrefToken(
                             (tt).term,
                                 Utils.asObjectOrNull(
                                     tt.next0_.next0_, TextToken))):
                             ok = True
                             tt = tt.next0_.next0_
                     if (not ok):
                         if (i > 0 and
                             (((pars) &
                               (TerminParseAttr.IGNORESTOPWORDS))) !=
                             (TerminParseAttr.NO)):
                             if (isinstance(tt, TextToken)):
                                 if (not tt.chars.is_letter):
                                     tt = tt.next0_
                                     i -= 1
                                     continue
                                 mc1 = tt.getMorphClassInDictionary()
                                 if (mc1.is_conjunction
                                         or mc1.is_preposition):
                                     tt = tt.next0_
                                     i -= 1
                                     continue
                             if (isinstance(tt, NumberToken)):
                                 tt = tt.next0_
                                 i -= 1
                                 continue
                         break
             if (tt.morph.items_count > 0 and not dont_change_mc):
                 mc = MorphCollection(tt.morph)
                 if (((mc.class0_.is_noun or mc.class0_.is_verb))
                         and not mc.class0_.is_adjective):
                     if (((i + 1) < len(self.terms))
                             and self.terms[i + 1].is_hiphen):
                         pass
                     else:
                         dont_change_mc = True
             if (tt.morph.class0_.is_preposition
                     or tt.morph.class0_.is_conjunction):
                 dont_change_mc = True
             if (tt == e0_):
                 tt = eup
                 eup = (None)
                 e0_ = (None)
             if (e0_ is None):
                 t1 = tt
             tt = tt.next0_
         if (ok and i >= len(self.terms)):
             if (t1.next0_ is not None and t1.next0_.isChar('.')
                     and self.abridges is not None):
                 for a in self.abridges:
                     if (a.tryAttach(t0) is not None):
                         t1 = t1.next0_
                         break
             if (t0 != t1 and t0.morph.class0_.is_adjective):
                 npt = NounPhraseHelper.tryParse(t0, NounPhraseParseAttr.NO,
                                                 0)
                 if (npt is not None and npt.end_char <= t1.end_char):
                     mc = npt.morph
             return TerminToken._new611(t0, t1, mc)
     if (len(self.terms) > 1 and self.ignore_terms_order):
         terms_ = list(self.terms)
         t1 = t0
         tt = t0
         while len(terms_) > 0:
             if (tt != t0 and tt is not None and tt.is_hiphen):
                 tt = tt.next0_
             if (tt is None):
                 break
             j = 0
             while j < len(terms_):
                 if (terms_[j].checkByToken(tt)):
                     break
                 j += 1
             if (j >= len(terms_)):
                 if (tt != t0 and (((pars) &
                                    (TerminParseAttr.IGNORESTOPWORDS))) !=
                     (TerminParseAttr.NO)):
                     if (isinstance(tt, TextToken)):
                         if (not tt.chars.is_letter):
                             tt = tt.next0_
                             continue
                         mc1 = tt.getMorphClassInDictionary()
                         if (mc1.is_conjunction or mc1.is_preposition):
                             tt = tt.next0_
                             continue
                     if (isinstance(tt, NumberToken)):
                         tt = tt.next0_
                         continue
                 break
             del terms_[j]
             t1 = tt
             tt = tt.next0_
         for i in range(len(terms_) - 1, -1, -1):
             if (terms_[i].is_hiphen):
                 del terms_[i]
         if (len(terms_) == 0):
             return TerminToken(t0, t1)
     if (self.abridges is not None and
         (((pars) &
           (TerminParseAttr.FULLWORDSONLY))) == (TerminParseAttr.NO)):
         res = None
         for a in self.abridges:
             r = a.tryAttach(t0)
             if (r is None):
                 continue
             if (r.abridge_without_point and len(self.terms) > 0):
                 if (not ((isinstance(t0, TextToken)))):
                     continue
                 if (a.parts[0].value != (t0).term):
                     continue
             if (res is None or (res.length_char < r.length_char)):
                 res = r
         if (res is not None):
             return res
     return None
Пример #12
0
class Token:
    """ Базовый класс для всех токенов. Наследные классы - TextToken (конечная словоформа) и MetaToken (связный фрагмент других токенов).
    
    Токен
    """
    def __init__(self, kit_: 'AnalysisKit', begin: int, end: int) -> None:
        self.kit = None
        self.__m_begin_char = 0
        self.__m_end_char = 0
        self.tag = None
        self._m_previous = None
        self._m_next = None
        self.__m_morph = None
        self.chars = None
        self.__m_attrs = 0
        self.kit = kit_
        self.__m_begin_char = begin
        self.__m_end_char = end

    @property
    def begin_char(self) -> int:
        """ Позиция в тексте начального символа
        
        """
        return self.__m_begin_char

    @property
    def end_char(self) -> int:
        """ Позиция в тексте конечного символа
        
        """
        return self.__m_end_char

    @property
    def length_char(self) -> int:
        """ Длина в текстовых символах """
        return (self.end_char - self.begin_char) + 1

    @property
    def previous(self) -> 'Token':
        """ Предыдущий токен в цепочке токенов
        
        """
        return self._m_previous

    @previous.setter
    def previous(self, value) -> 'Token':
        self._m_previous = value
        if (value is not None):
            value._m_next = self
        self.__m_attrs = (0)
        return value

    @property
    def next0_(self) -> 'Token':
        """ Следующий токен в цепочке токенов
        
        """
        return self._m_next

    @next0_.setter
    def next0_(self, value) -> 'Token':
        self._m_next = value
        if (value is not None):
            value._m_previous = self
        self.__m_attrs = (0)
        return value

    @property
    def morph(self) -> 'MorphCollection':
        """ Морфологическая информация
        
        """
        if (self.__m_morph is None):
            self.__m_morph = MorphCollection()
        return self.__m_morph

    @morph.setter
    def morph(self, value) -> 'MorphCollection':
        self.__m_morph = value
        return value

    def __str__(self) -> str:
        return self.kit.sofa.text[self.begin_char:self.begin_char +
                                  (self.end_char + 1) - self.begin_char]

    def __get_attr(self, i: int) -> bool:
        ch = '\x00'
        if ((((self.__m_attrs) & 1)) == 0):
            self.__m_attrs = (1)
            if (self._m_previous is None):
                self._set_attr(1, True)
                self._set_attr(3, True)
            else:
                j = self._m_previous.end_char + 1
                while j < self.begin_char:
                    ch = self.kit.sofa.text[j]
                    if (Utils.isWhitespace((ch))):
                        self._set_attr(1, True)
                        if ((ord(ch)) == 0xD or (ord(ch)) == 0xA
                                or ch == '\f'):
                            self._set_attr(3, True)
                    j += 1
            if (self._m_next is None):
                self._set_attr(2, True)
                self._set_attr(4, True)
            else:
                j = self.end_char + 1
                while j < self._m_next.begin_char:
                    ch = self.kit.sofa.text[j]
                    if (Utils.isWhitespace(ch)):
                        self._set_attr(2, True)
                        if ((ord(ch)) == 0xD or (ord(ch)) == 0xA
                                or ch == '\f'):
                            self._set_attr(4, True)
                    j += 1
        return (((((self.__m_attrs) >> i)) & 1)) != 0

    def _set_attr(self, i: int, val: bool) -> None:
        if (val):
            self.__m_attrs |= ((1 << i))
        else:
            self.__m_attrs &= (~((1 << i)))

    @property
    def is_whitespace_before(self) -> bool:
        """ Наличие пробельных символов перед """
        return self.__get_attr(1)

    @is_whitespace_before.setter
    def is_whitespace_before(self, value) -> bool:
        self._set_attr(1, value)
        return value

    @property
    def is_whitespace_after(self) -> bool:
        """ Наличие пробельных символов после """
        return self.__get_attr(2)

    @is_whitespace_after.setter
    def is_whitespace_after(self, value) -> bool:
        self._set_attr(2, value)
        return value

    @property
    def is_newline_before(self) -> bool:
        """ Элемент начинается с новой строки.
        Для 1-го элемента всегда true. """
        return self.__get_attr(3)

    @is_newline_before.setter
    def is_newline_before(self, value) -> bool:
        self._set_attr(3, value)
        return value

    @property
    def is_newline_after(self) -> bool:
        """ Элемент заканчивает строку.
        Для последнего элемента всегда true. """
        return self.__get_attr(4)

    @is_newline_after.setter
    def is_newline_after(self, value) -> bool:
        self._set_attr(4, value)
        return value

    @property
    def inner_bool(self) -> bool:
        # Это используется внутренним образом
        return self.__get_attr(5)

    @inner_bool.setter
    def inner_bool(self, value) -> bool:
        self._set_attr(5, value)
        return value

    @property
    def not_noun_phrase(self) -> bool:
        # Это используется внутренним образом
        # (признак того, что здесь не начинается именная группа, чтобы повторно не пытаться выделять)
        return self.__get_attr(6)

    @not_noun_phrase.setter
    def not_noun_phrase(self, value) -> bool:
        self._set_attr(6, value)
        return value

    @property
    def whitespaces_before_count(self) -> int:
        """ Количество пробелов перед, переход на новую строку = 10, табуляция = 5 """
        if (self.previous is None):
            return 100
        if ((self.previous.end_char + 1) == self.begin_char):
            return 0
        return self.__calc_whitespaces(self.previous.end_char + 1,
                                       self.begin_char - 1)

    @property
    def newlines_before_count(self) -> int:
        """ Количество переходов на новую строку перед """
        ch0 = chr(0)
        res = 0
        txt = self.kit.sofa.text
        for p in range(self.begin_char - 1, -1, -1):
            ch = txt[p]
            if ((ord(ch)) == 0xA):
                res += 1
            elif ((ord(ch)) == 0xD and (ord(ch0)) != 0xA):
                res += 1
            elif (ch == '\f'):
                res += 10
            elif (not Utils.isWhitespace(ch)):
                break
            ch0 = ch
        return res

    @property
    def newlines_after_count(self) -> int:
        """ Количество переходов на новую строку перед """
        ch0 = chr(0)
        res = 0
        txt = self.kit.sofa.text
        p = self.end_char + 1
        while p < len(txt):
            ch = txt[p]
            if ((ord(ch)) == 0xD):
                res += 1
            elif ((ord(ch)) == 0xA and (ord(ch0)) != 0xD):
                res += 1
            elif (ch == '\f'):
                res += 10
            elif (not Utils.isWhitespace(ch)):
                break
            ch0 = ch
            p += 1
        return res

    @property
    def whitespaces_after_count(self) -> int:
        """ Количество пробелов перед, переход на новую строку = 10, табуляция = 5 """
        if (self.next0_ is None):
            return 100
        if ((self.end_char + 1) == self.next0_.begin_char):
            return 0
        return self.__calc_whitespaces(self.end_char + 1,
                                       self.next0_.begin_char - 1)

    def __calc_whitespaces(self, p0: int, p1: int) -> int:
        if ((p0 < 0) or p0 > p1 or p1 >= len(self.kit.sofa.text)):
            return -1
        res = 0
        i = p0
        while i <= p1:
            ch = self.kit.get_text_character(i)
            if (ch == '\r' or ch == '\n'):
                res += 10
                ch1 = self.kit.get_text_character(i + 1)
                if (ch != ch1 and ((ch1 == '\r' or ch1 == '\n'))):
                    i += 1
            elif (ch == '\t'):
                res += 5
            elif (ch == '\u0007'):
                res += 100
            elif (ch == '\f'):
                res += 100
            else:
                res += 1
            i += 1
        return res

    @property
    def is_hiphen(self) -> bool:
        """ Это символ переноса """
        ch = self.kit.sofa.text[self.begin_char]
        return LanguageHelper.is_hiphen(ch)

    @property
    def is_table_control_char(self) -> bool:
        """ Это спец-символы для табличных элементов (7h, 1Eh, 1Fh) """
        ch = self.kit.sofa.text[self.begin_char]
        return (ord(ch)) == 7 or (ord(ch)) == 0x1F or (ord(ch)) == 0x1E

    @property
    def is_and(self) -> bool:
        """ Это соединительный союз И (на всех языках) """
        return False

    @property
    def is_or(self) -> bool:
        """ Это соединительный союз ИЛИ (на всех языках) """
        return False

    @property
    def is_comma(self) -> bool:
        """ Это запятая """
        return self.is_char(',')

    @property
    def is_comma_and(self) -> bool:
        """ Это запятая или союз И """
        return self.is_comma or self.is_and

    def is_char(self, ch: 'char') -> bool:
        """ Токен состоит из конкретного символа
        
        Args:
            ch('char'): проверяемый символ
        
        """
        if (self.begin_char != self.end_char):
            return False
        return self.kit.sofa.text[self.begin_char] == ch

    def is_char_of(self, chars_: str) -> bool:
        """ Токен состоит из одного символа, который есть в указанной строке
        
        Args:
            chars_(str): строка возможных символов
        
        """
        if (self.begin_char != self.end_char):
            return False
        return chars_.find(self.kit.sofa.text[self.begin_char]) >= 0

    def is_value(self, term: str, termua: str = None) -> bool:
        """ Проверка конкретного значения слова
        
        Args:
            term(str): слово (проверяется значение TextToken.Term)
            termua(str): слово для проверки на украинском языке
        
        Returns:
            bool: да-нет
        """
        return False

    @property
    def is_letters(self) -> bool:
        """ Признак того, что это буквенный текстовой токен (TextToken) """
        return False

    def get_referent(self) -> 'Referent':
        """ Получить ссылку на сущность (не null только для ReferentToken)
        
        """
        return None

    def get_referents(self) -> typing.List['Referent']:
        """ Получить список ссылок на все сущности, скрывающиеся под элементом.
        Дело в том, что одни сущности могут накрывать другие (например, адрес накроет город).
        
        """
        return None

    def get_normal_case_text(self,
                             mc: 'MorphClass' = None,
                             num: 'MorphNumber' = MorphNumber.UNDEFINED,
                             gender: 'MorphGender' = MorphGender.UNDEFINED,
                             keep_chars: bool = False) -> str:
        """ Получить связанный с токеном текст в именительном падеже
        
        Args:
            mc(MorphClass): желательная часть речи
            num(MorphNumber): желательное число
            gender(MorphGender): желательный пол
            keep_chars(bool): сохранять регистр символов (по умолчанию, всё в верхний)
        
        Returns:
            str: строка текста
        """
        return str(self)

    def get_source_text(self) -> str:
        """ Получить фрагмент исходного текста, связанный с токеном
        
        Returns:
            str: фрагмент исходного текста
        """
        len0_ = (self.end_char + 1) - self.begin_char
        if ((len0_ < 1) or (self.begin_char < 0)):
            return None
        if ((self.begin_char + len0_) > len(self.kit.sofa.text)):
            return None
        return self.kit.sofa.text[self.begin_char:self.begin_char + len0_]

    def get_morph_class_in_dictionary(self) -> 'MorphClass':
        """ Проверка, что слово есть в словаре соответствующего языка
        
        Returns:
            MorphClass: части речи, если не из словаря, то IsUndefined
        """
        return self.morph.class0_

    def _serialize(self, stream: Stream) -> None:
        from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
        SerializerHelper.serialize_int(stream, self.begin_char)
        SerializerHelper.serialize_int(stream, self.end_char)
        SerializerHelper.serialize_int(stream, self.__m_attrs)
        SerializerHelper.serialize_int(stream, self.chars.value)
        if (self.__m_morph is None):
            self.__m_morph = MorphCollection()
        self.__m_morph._serialize(stream)

    def _deserialize(self, stream: Stream, kit_: 'AnalysisKit',
                     vers: int) -> None:
        from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
        self.kit = kit_
        self.__m_begin_char = SerializerHelper.deserialize_int(stream)
        self.__m_end_char = SerializerHelper.deserialize_int(stream)
        self.__m_attrs = (SerializerHelper.deserialize_int(stream))
        self.chars = CharsInfo._new2561(
            SerializerHelper.deserialize_int(stream))
        self.__m_morph = MorphCollection()
        self.__m_morph._deserialize(stream)
Пример #13
0
class TextToken(Token):
    """ Входной токен (после морфанализа)
    Текстовой токен
    """
    def __init__(self,
                 source: 'MorphToken',
                 kit_: 'AnalysisKit',
                 bchar: int = -1,
                 echar: int = -1) -> None:
        super().__init__(kit_, (bchar if bchar >= 0 else
                                (0 if source is None else source.begin_char)),
                         (echar if echar >= 0 else
                          (0 if source is None else source.end_char)))
        self.term = None
        self.lemma = None
        self.term0 = None
        self.invariant_prefix_length_of_morph_vars = 0
        self.max_length_of_morph_vars = 0
        if (source is None):
            return
        self.chars = source.char_info
        self.term = source.term
        self.lemma = (Utils.ifNotNull(source.get_lemma(), self.term))
        self.max_length_of_morph_vars = (len(self.term))
        self.morph = MorphCollection()
        if (source.word_forms is not None):
            for wf in source.word_forms:
                self.morph.add_item(wf)
                if (wf.normal_case is not None and
                    (self.max_length_of_morph_vars < len(wf.normal_case))):
                    self.max_length_of_morph_vars = (len(wf.normal_case))
                if (wf.normal_full is not None and
                    (self.max_length_of_morph_vars < len(wf.normal_full))):
                    self.max_length_of_morph_vars = (len(wf.normal_full))
        i = 0
        while i < len(self.term):
            ch = self.term[i]
            j = 0
            j = 0
            while j < self.morph.items_count:
                wf = Utils.asObjectOrNull(self.morph.get_indexer_item(j),
                                          MorphWordForm)
                if (wf.normal_case is not None):
                    if (i >= len(wf.normal_case)):
                        break
                    if (wf.normal_case[i] != ch):
                        break
                if (wf.normal_full is not None):
                    if (i >= len(wf.normal_full)):
                        break
                    if (wf.normal_full[i] != ch):
                        break
                j += 1
            if (j < self.morph.items_count):
                break
            self.invariant_prefix_length_of_morph_vars = ((i + 1))
            i += 1
        if (self.morph.language.is_undefined
                and not source.language.is_undefined):
            self.morph.language = source.language

    def __str__(self) -> str:
        res = Utils.newStringIO(self.term)
        for l_ in self.morph.items:
            print(", {0}".format(str(l_)), end="", file=res, flush=True)
        return Utils.toStringStringIO(res)

    def check_value(self, dict0_: typing.List[tuple]) -> object:
        """ Попробовать привязать словарь
        
        Args:
            dict0_(typing.List[tuple]): 
        
        """
        if (dict0_ is None):
            return None
        res = None
        wrapres2868 = RefOutArgWrapper(None)
        inoutres2869 = Utils.tryGetValue(dict0_, self.term, wrapres2868)
        res = wrapres2868.value
        if (inoutres2869):
            return res
        if (self.morph is not None):
            for it in self.morph.items:
                mf = Utils.asObjectOrNull(it, MorphWordForm)
                if (mf is not None):
                    if (mf.normal_case is not None):
                        wrapres2864 = RefOutArgWrapper(None)
                        inoutres2865 = Utils.tryGetValue(
                            dict0_, mf.normal_case, wrapres2864)
                        res = wrapres2864.value
                        if (inoutres2865):
                            return res
                    if (mf.normal_full is not None
                            and mf.normal_case != mf.normal_full):
                        wrapres2866 = RefOutArgWrapper(None)
                        inoutres2867 = Utils.tryGetValue(
                            dict0_, mf.normal_full, wrapres2866)
                        res = wrapres2866.value
                        if (inoutres2867):
                            return res
        return None

    def get_source_text(self) -> str:
        return super().get_source_text()

    def is_value(self, term_: str, termua: str = None) -> bool:
        if (termua is not None and self.morph.language.is_ua):
            if (self.is_value(termua, None)):
                return True
        if (term_ is None):
            return False
        if (self.invariant_prefix_length_of_morph_vars > len(term_)):
            return False
        if (self.max_length_of_morph_vars >= len(self.term)
                and (self.max_length_of_morph_vars < len(term_))):
            return False
        if (term_ == self.term):
            return True
        for wf in self.morph.items:
            if ((isinstance(wf, MorphWordForm)) and
                ((wf.normal_case == term_ or wf.normal_full == term_))):
                return True
        return False

    @property
    def is_and(self) -> bool:
        """ Это соединительный союз И (на всех языках) """
        if (not self.morph.class0_.is_conjunction):
            if (self.length_char == 1 and self.is_char('&')):
                return True
            return False
        val = self.term
        if (val == "И" or val == "AND" or val == "UND"):
            return True
        if (self.morph.language.is_ua):
            if (val == "І" or val == "ТА"):
                return True
        return False

    @property
    def is_or(self) -> bool:
        """ Это соединительный союз ИЛИ (на всех языках) """
        if (not self.morph.class0_.is_conjunction):
            return False
        val = self.term
        if (val == "ИЛИ" or val == "ЛИБО" or val == "OR"):
            return True
        if (self.morph.language.is_ua):
            if (val == "АБО"):
                return True
        return False

    @property
    def is_letters(self) -> bool:
        return str.isalpha(self.term[0])

    def get_morph_class_in_dictionary(self) -> 'MorphClass':
        res = MorphClass()
        for wf in self.morph.items:
            if ((isinstance(wf, MorphWordForm)) and wf.is_in_dictionary):
                res |= wf.class0_
        return res

    def get_normal_case_text(self,
                             mc: 'MorphClass' = None,
                             num: 'MorphNumber' = MorphNumber.UNDEFINED,
                             gender: 'MorphGender' = MorphGender.UNDEFINED,
                             keep_chars: bool = False) -> str:
        from pullenti.ner.core.MiscHelper import MiscHelper
        empty = True
        if (mc is not None and mc.is_preposition):
            return LanguageHelper.normalize_preposition(self.term)
        for it in self.morph.items:
            if (mc is not None and not mc.is_undefined):
                cc = (it.class0_) & mc
                if (cc.is_undefined):
                    continue
                if (cc.is_misc and not cc.is_proper and mc != it.class0_):
                    continue
            wf = Utils.asObjectOrNull(it, MorphWordForm)
            normal_full = False
            if (gender != MorphGender.UNDEFINED):
                if (((it.gender) & (gender)) == (MorphGender.UNDEFINED)):
                    if ((gender == MorphGender.MASCULINE and
                         ((it.gender != MorphGender.UNDEFINED or it.number
                           == MorphNumber.PLURAL)) and wf is not None)
                            and wf.normal_full is not None):
                        normal_full = True
                    elif (gender == MorphGender.MASCULINE
                          and it.class0_.is_personal_pronoun):
                        pass
                    else:
                        continue
            if (not it.case_.is_undefined):
                empty = False
            if (wf is not None):
                res = None
                if (num == MorphNumber.SINGULAR
                        and it.number == MorphNumber.PLURAL
                        and wf.normal_full is not None):
                    le = len(wf.normal_case)
                    if ((le == (len(wf.normal_full) + 2) and le > 4
                         and wf.normal_case[le - 2] == 'С')
                            and wf.normal_case[le - 1] == 'Я'):
                        res = wf.normal_case
                    else:
                        res = (wf.normal_full
                               if normal_full else wf.normal_full)
                else:
                    res = (wf.normal_full if normal_full else
                           (Utils.ifNotNull(wf.normal_case, self.term)))
                if (num == MorphNumber.SINGULAR and mc is not None
                        and mc == MorphClass.NOUN):
                    if (res == "ДЕТИ"):
                        res = "РЕБЕНОК"
                if (keep_chars):
                    if (self.chars.is_all_lower):
                        res = res.lower()
                    elif (self.chars.is_capital_upper):
                        res = MiscHelper.convert_first_char_upper_and_other_lower(
                            res)
                return res
        if (not empty):
            return None
        te = None
        if (num == MorphNumber.SINGULAR and mc is not None):
            bi = MorphBaseInfo._new492(MorphClass._new53(mc.value), gender,
                                       MorphNumber.SINGULAR,
                                       self.morph.language)
            vars0_ = MorphologyService.get_wordform(self.term, bi)
            if (vars0_ is not None):
                te = vars0_
        if (te is None):
            te = self.term
        if (keep_chars):
            if (self.chars.is_all_lower):
                return te.lower()
            elif (self.chars.is_capital_upper):
                return MiscHelper.convert_first_char_upper_and_other_lower(te)
        return te

    @staticmethod
    def get_source_text_tokens(begin: 'Token',
                               end: 'Token') -> typing.List['TextToken']:
        from pullenti.ner.MetaToken import MetaToken
        res = list()
        t = begin
        while t is not None and t != end.next0_ and t.end_char <= end.end_char:
            if (isinstance(t, TextToken)):
                res.append(Utils.asObjectOrNull(t, TextToken))
            elif (isinstance(t, MetaToken)):
                res.extend(
                    TextToken.get_source_text_tokens(t.begin_token,
                                                     t.end_token))
            t = t.next0_
        return res

    @property
    def is_pure_verb(self) -> bool:
        """ Признак того, что это чистый глагол """
        ret = False
        if ((self.is_value("МОЖНО", None) or self.is_value("МОЖЕТ", None)
             or self.is_value("ДОЛЖНЫЙ", None))
                or self.is_value("НУЖНО", None)):
            return True
        for it in self.morph.items:
            if ((isinstance(it, MorphWordForm)) and it.is_in_dictionary):
                if (it.class0_.is_verb and it.case_.is_undefined):
                    ret = True
                elif (not it.class0_.is_verb):
                    if (it.class0_.is_adjective
                            and it.contains_attr("к.ф.", None)):
                        pass
                    else:
                        return False
        return ret

    @property
    def is_verb_be(self) -> bool:
        """ Проверка, что это глагол типа БЫТЬ, ЯВЛЯТЬСЯ и т.п. """
        if ((self.is_value("БЫТЬ", None) or self.is_value("ЕСТЬ", None)
             or self.is_value("ЯВЛЯТЬ", None)) or self.is_value("BE", None)):
            return True
        if (self.term == "IS" or self.term == "WAS" or self.term == "BECAME"):
            return True
        if (self.term == "Є"):
            return True
        return False

    def _serialize(self, stream: Stream) -> None:
        from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
        super()._serialize(stream)
        SerializerHelper.serialize_string(stream, self.term)
        SerializerHelper.serialize_string(stream, self.lemma)
        SerializerHelper.serialize_short(
            stream, self.invariant_prefix_length_of_morph_vars)
        SerializerHelper.serialize_short(stream, self.max_length_of_morph_vars)

    def _deserialize(self, stream: Stream, kit_: 'AnalysisKit',
                     vers: int) -> None:
        from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
        super()._deserialize(stream, kit_, vers)
        self.term = SerializerHelper.deserialize_string(stream)
        self.lemma = SerializerHelper.deserialize_string(stream)
        self.invariant_prefix_length_of_morph_vars = SerializerHelper.deserialize_short(
            stream)
        self.max_length_of_morph_vars = SerializerHelper.deserialize_short(
            stream)

    @staticmethod
    def _new470(_arg1: 'MorphToken', _arg2: 'AnalysisKit', _arg3: int,
                _arg4: int, _arg5: str) -> 'TextToken':
        res = TextToken(_arg1, _arg2, _arg3, _arg4)
        res.term0 = _arg5
        return res

    @staticmethod
    def _new473(_arg1: 'MorphToken', _arg2: 'AnalysisKit', _arg3: int,
                _arg4: int, _arg5: 'CharsInfo', _arg6: str) -> 'TextToken':
        res = TextToken(_arg1, _arg2, _arg3, _arg4)
        res.chars = _arg5
        res.term0 = _arg6
        return res
Пример #14
0
 def __tryParseRu(first: 'Token', typ: 'NounPhraseParseAttr',
                  max_char_pos: int) -> 'NounPhraseToken':
     if (first is None):
         return None
     items = None
     adverbs = None
     internal_noun_prase = None
     conj_before = False
     t = first
     first_pass2788 = True
     while True:
         if first_pass2788: first_pass2788 = False
         else: t = t.next0_
         if (not (t is not None)): break
         if (max_char_pos > 0 and t.begin_char > max_char_pos):
             break
         if ((t.morph.class0_.is_conjunction
              and not t.morph.class0_.is_adjective
              and not t.morph.class0_.is_pronoun)
                 and not t.morph.class0_.is_noun):
             if (conj_before):
                 break
             if ((((typ) & (NounPhraseParseAttr.CANNOTHASCOMMAAND))) !=
                 (NounPhraseParseAttr.NO)):
                 break
             if (items is not None and t.is_and):
                 conj_before = True
                 if ((t.next0_ is not None and t.next0_.isCharOf("\\/")
                      and t.next0_.next0_ is not None)
                         and t.next0_.next0_.is_or):
                     t = t.next0_.next0_
                 continue
             break
         elif (t.is_comma):
             if (conj_before or items is None):
                 break
             if ((((typ) & (NounPhraseParseAttr.CANNOTHASCOMMAAND))) !=
                 (NounPhraseParseAttr.NO)):
                 break
             mc = t.previous.getMorphClassInDictionary()
             if (mc.is_proper_surname or mc.is_proper_secname):
                 break
             conj_before = True
             continue
         elif (t.isChar('(')):
             if (items is None):
                 return None
             if ((((typ) & (NounPhraseParseAttr.IGNOREBRACKETS))) !=
                 (NounPhraseParseAttr.IGNOREBRACKETS)):
                 break
             brr = BracketHelper.tryParse(t, BracketParseAttr.NO, 100)
             if (brr is None):
                 break
             if (brr.length_char > 100):
                 break
             t = brr.end_token
             continue
         if (isinstance(t, ReferentToken)):
             if ((((typ) & (NounPhraseParseAttr.REFERENTCANBENOUN))) == (
                     NounPhraseParseAttr.NO)):
                 break
         elif (t.chars.is_latin_letter):
             break
         it = NounPhraseItem.tryParse(t, items, typ)
         if (it is None or ((not it.can_be_adj and not it.can_be_noun))):
             if ((((typ) & (NounPhraseParseAttr.PARSEADVERBS))) !=
                 (NounPhraseParseAttr.NO) and (isinstance(t, TextToken))
                     and t.morph.class0_.is_adverb):
                 if (items is None):
                     if (t.previous is not None
                             and t.previous.morph.class0_.is_preposition):
                         pass
                     else:
                         return None
                 if (adverbs is None):
                     adverbs = list()
                 adverbs.append(Utils.asObjectOrNull(t, TextToken))
                 continue
             break
         it.conj_before = conj_before
         conj_before = False
         if (not it.can_be_adj and not it.can_be_noun):
             break
         if (t.is_newline_before and t != first):
             if ((((typ) & (NounPhraseParseAttr.MULTILINES))) !=
                 (NounPhraseParseAttr.NO)):
                 pass
             elif (items is not None
                   and t.chars != items[len(items) - 1].chars):
                 if (t.chars.is_all_lower
                         and items[len(items) - 1].chars.is_capital_upper):
                     pass
                 else:
                     break
         if (items is None):
             items = list()
         else:
             it0 = items[len(items) - 1]
             if (it0.can_be_noun and it0.is_personal_pronoun):
                 if (it.is_pronoun):
                     break
                 if ((it0.begin_token.previous is not None and
                      it0.begin_token.previous.getMorphClassInDictionary(
                      ).is_verb and not it0.begin_token.previous.
                      getMorphClassInDictionary().is_adjective)
                         and not it0.begin_token.previous.
                         getMorphClassInDictionary().is_preposition):
                     if (t.morph.case_.is_nominative
                             or t.morph.case_.is_accusative):
                         pass
                     else:
                         break
                 if (it.can_be_noun and it.is_verb):
                     break
         items.append(it)
         t = it.end_token
         if (t.is_newline_after and not t.chars.is_all_lower):
             mc = t.getMorphClassInDictionary()
             if (mc.is_proper_surname):
                 break
             if (t.morph.class0_.is_proper_surname and mc.is_undefined):
                 break
     if (items is None):
         return None
     if (len(items) == 1 and items[0].can_be_adj):
         and0_ = False
         tt1 = items[0].end_token.next0_
         first_pass2789 = True
         while True:
             if first_pass2789: first_pass2789 = False
             else: tt1 = tt1.next0_
             if (not (tt1 is not None)): break
             if (tt1.is_and or tt1.is_or):
                 and0_ = True
                 break
             if (tt1.is_comma or tt1.isValue("НО", None)
                     or tt1.isValue("ТАК", None)):
                 continue
             break
         if (and0_):
             if (items[0].can_be_noun and items[0].is_personal_pronoun):
                 and0_ = False
         if (and0_):
             tt2 = tt1.next0_
             if (tt2 is not None and tt2.morph.class0_.is_preposition):
                 tt2 = tt2.next0_
             npt1 = _NounPraseHelperInt.__tryParseRu(tt2, typ, max_char_pos)
             if (npt1 is not None and len(npt1.adjectives) > 0):
                 ok1 = False
                 for av in items[0].adj_morph:
                     for v in (npt1.noun).noun_morph:
                         if (v.checkAccord(av, False)):
                             items[0].morph.addItem(av)
                             ok1 = True
                 if (ok1):
                     npt1.begin_token = items[0].begin_token
                     npt1.end_token = tt1.previous
                     npt1.adjectives.clear()
                     npt1.adjectives.append(items[0])
                     return npt1
     last1 = items[len(items) - 1]
     check = True
     for it in items:
         if (not it.can_be_adj):
             check = False
             break
         elif (it.can_be_noun and it.is_personal_pronoun):
             check = False
             break
     tt1 = last1.end_token.next0_
     if ((tt1 is not None and check and
          ((tt1.morph.class0_.is_preposition
            or tt1.morph.case_.is_instrumental)))
             and (tt1.whitespaces_before_count < 2)):
         inp = NounPhraseHelper.tryParse(
             tt1,
             Utils.valToEnum((typ) | (NounPhraseParseAttr.PARSEPREPOSITION),
                             NounPhraseParseAttr), max_char_pos)
         if (inp is not None):
             tt1 = inp.end_token.next0_
             npt1 = _NounPraseHelperInt.__tryParseRu(tt1, typ, max_char_pos)
             if (npt1 is not None):
                 ok = True
                 for it in items:
                     if (not NounPhraseItem.tryAccordAdjAndNoun(
                             it,
                             Utils.asObjectOrNull(npt1.noun,
                                                  NounPhraseItem))):
                         ok = False
                         break
                 if (ok):
                     i = 0
                     while i < len(items):
                         npt1.adjectives.insert(i, items[i])
                         i += 1
                     npt1.internal_noun = inp
                     mmm = MorphCollection(npt1.morph)
                     for it in items:
                         mmm.removeItems(it.adj_morph[0], False)
                     if (mmm.gender != MorphGender.UNDEFINED
                             or mmm.number != MorphNumber.UNDEFINED
                             or not mmm.case_.is_undefined):
                         npt1.morph = mmm
                     if (adverbs is not None):
                         if (npt1.adverbs is None):
                             npt1.adverbs = adverbs
                         else:
                             npt1.adverbs[0:0] = adverbs
                     return npt1
             if (tt1 is not None and tt1.morph.class0_.is_noun):
                 it = NounPhraseItem.tryParse(tt1, items, typ)
                 if (it is not None and it.can_be_noun):
                     internal_noun_prase = inp
                     inp.begin_token = items[0].end_token.next0_
                     items.append(it)
     ok2 = False
     if ((len(items) == 1 and
          (((typ) & (NounPhraseParseAttr.ADJECTIVECANBELAST))) !=
          (NounPhraseParseAttr.NO) and
          (items[0].whitespaces_after_count < 3))
             and not items[0].is_adverb):
         if (not items[0].can_be_adj):
             ok2 = True
         elif (items[0].is_personal_pronoun and items[0].can_be_noun):
             ok2 = True
     if (ok2):
         it = NounPhraseItem.tryParse(items[0].end_token.next0_, None, typ)
         if (it is not None and it.can_be_adj
                 and it.begin_token.chars.is_all_lower):
             ok2 = True
             if (it.is_adverb or it.is_verb):
                 ok2 = False
             if (it.is_pronoun and items[0].is_pronoun):
                 ok2 = False
                 if (it.can_be_adj_for_personal_pronoun
                         and items[0].is_personal_pronoun):
                     ok2 = True
             if (ok2 and NounPhraseItem.tryAccordAdjAndNoun(it, items[0])):
                 npt1 = _NounPraseHelperInt.__tryParseRu(
                     it.begin_token, typ, max_char_pos)
                 if (npt1 is not None and ((npt1.end_char > it.end_char
                                            or len(npt1.adjectives) > 0))):
                     pass
                 else:
                     items.insert(0, it)
     noun = None
     adj_after = None
     for i in range(len(items) - 1, -1, -1):
         if (items[i].can_be_noun):
             if (items[i].conj_before):
                 continue
             if (i > 0 and not items[i - 1].can_be_adj):
                 continue
             if (i > 0 and items[i - 1].can_be_noun):
                 if (items[i - 1].is_doubt_adjective):
                     continue
                 if (items[i - 1].is_pronoun and items[i].is_pronoun):
                     if (items[i].is_pronoun and
                             items[i - 1].can_be_adj_for_personal_pronoun):
                         pass
                     else:
                         continue
             noun = items[i]
             del items[i:i + len(items) - i]
             if (adj_after is not None):
                 items.append(adj_after)
             break
     if (noun is None):
         return None
     res = NounPhraseToken(first, noun.end_token)
     if (adverbs is not None):
         for a in adverbs:
             if (a.begin_char < noun.begin_char):
                 if (res.adverbs is None):
                     res.adverbs = list()
                 res.adverbs.append(a)
     res.noun = (noun)
     res.internal_noun = internal_noun_prase
     for v in noun.noun_morph:
         noun.morph.addItem(v)
     res.morph = noun.morph
     if (res.morph.case_.is_nominative and first.previous is not None
             and first.previous.morph.class0_.is_preposition):
         res.morph.case_ = (res.morph.case_) ^ MorphCase.NOMINATIVE
     if ((((typ) &
           (NounPhraseParseAttr.PARSEPRONOUNS))) == (NounPhraseParseAttr.NO)
             and ((res.morph.class0_.is_pronoun
                   or res.morph.class0_.is_personal_pronoun))):
         return None
     stat = None
     if (len(items) > 1):
         stat = dict()
     need_update_morph = False
     if (len(items) > 0):
         ok_list = list()
         is_num_not = False
         for vv in noun.noun_morph:
             v = vv
             i = 0
             while i < len(items):
                 ok = False
                 for av in items[i].adj_morph:
                     if (v.checkAccord(av, False)):
                         ok = True
                         if (not ((av.case_) & v.case_).is_undefined
                                 and av.case_ != v.case_):
                             v.case_ = av.case_ = (av.case_) & v.case_
                         break
                 if (not ok):
                     if (items[i].can_be_numeric_adj
                             and items[i].tryAccordVar(v)):
                         ok = True
                         v = (Utils.asObjectOrNull(v.clone(),
                                                   NounPhraseItemTextVar))
                         v.number = MorphNumber.PLURAL
                         is_num_not = True
                         v.case_ = MorphCase()
                         for a in items[i].adj_morph:
                             v.case_ = (v.case_) | a.case_
                     else:
                         break
                 i += 1
             if (i >= len(items)):
                 ok_list.append(v)
         if (len(ok_list) > 0 and
             (((len(ok_list) < res.morph.items_count) or is_num_not))):
             res.morph = MorphCollection()
             for v in ok_list:
                 res.morph.addItem(v)
             if (not is_num_not):
                 noun.morph = res.morph
     i = 0
     first_pass2790 = True
     while True:
         if first_pass2790: first_pass2790 = False
         else: i += 1
         if (not (i < len(items))): break
         for av in items[i].adj_morph:
             for v in noun.noun_morph:
                 if (v.checkAccord(av, False)):
                     if (not ((av.case_) & v.case_).is_undefined
                             and av.case_ != v.case_):
                         v.case_ = av.case_ = (av.case_) & v.case_
                         need_update_morph = True
                     items[i].morph.addItem(av)
                     if (stat is not None and len(av.normal_value) > 1):
                         last = av.normal_value[len(av.normal_value) - 1]
                         if (not last in stat):
                             stat[last] = 1
                         else:
                             stat[last] += 1
         if (items[i].is_pronoun or items[i].is_personal_pronoun):
             res.anafor = items[i].begin_token
             if ((((typ) & (NounPhraseParseAttr.PARSEPRONOUNS))) == (
                     NounPhraseParseAttr.NO)):
                 continue
         tt = Utils.asObjectOrNull(items[i].begin_token, TextToken)
         if (tt is not None and not tt.term.startswith("ВЫСШ")):
             err = False
             for wf in tt.morph.items:
                 if (wf.class0_.is_adjective):
                     if (wf.containsAttr("прев.", None)):
                         if ((((typ) &
                               (NounPhraseParseAttr.IGNOREADJBEST))) !=
                             (NounPhraseParseAttr.NO)):
                             err = True
                     if (wf.containsAttr("к.ф.", None)
                             and tt.morph.class0_.is_personal_pronoun):
                         return None
             if (err):
                 continue
         if (res.morph.case_.is_nominative):
             v = MiscHelper.getTextValueOfMetaToken(items[i],
                                                    GetTextAttr.KEEPQUOTES)
             if (not Utils.isNullOrEmpty(v)):
                 if (items[i].getNormalCaseText(
                         None, False, MorphGender.UNDEFINED, False) != v):
                     wf = NounPhraseItemTextVar(items[i].morph, None)
                     wf.normal_value = v
                     wf.class0_ = MorphClass.ADJECTIVE
                     wf.case_ = res.morph.case_
                     if (res.morph.case_.is_prepositional
                             or res.morph.gender == MorphGender.NEUTER
                             or res.morph.gender == MorphGender.FEMINIE):
                         items[i].morph.addItem(wf)
                     else:
                         items[i].morph.insertItem(0, wf)
         res.adjectives.append(items[i])
         if (items[i].end_char > res.end_char):
             res.end_token = items[i].end_token
     i = 0
     first_pass2791 = True
     while True:
         if first_pass2791: first_pass2791 = False
         else: i += 1
         if (not (i < (len(res.adjectives) - 1))): break
         if (res.adjectives[i].whitespaces_after_count > 5):
             if (res.adjectives[i].chars != res.adjectives[i + 1].chars):
                 if (not res.adjectives[i + 1].chars.is_all_lower):
                     return None
                 if (res.adjectives[i].chars.is_all_upper
                         and res.adjectives[i + 1].chars.is_capital_upper):
                     return None
                 if (res.adjectives[i].chars.is_capital_upper
                         and res.adjectives[i + 1].chars.is_all_upper):
                     return None
             if (res.adjectives[i].whitespaces_after_count > 10):
                 if (res.adjectives[i].newlines_after_count == 1):
                     if (res.adjectives[i].chars.is_capital_upper and i == 0
                             and res.adjectives[i + 1].chars.is_all_lower):
                         continue
                     if (res.adjectives[i].chars == res.adjectives[
                             i + 1].chars):
                         continue
                 return None
     if (need_update_morph):
         noun.morph = MorphCollection()
         for v in noun.noun_morph:
             noun.morph.addItem(v)
         res.morph = noun.morph
     if (len(res.adjectives) > 0):
         if (noun.begin_token.previous is not None):
             if (noun.begin_token.previous.is_comma_and):
                 if (res.adjectives[0].begin_char > noun.begin_char):
                     pass
                 else:
                     return None
         zap = 0
         and0_ = 0
         cou = 0
         last_and = False
         i = 0
         while i < (len(res.adjectives) - 1):
             te = res.adjectives[i].end_token.next0_
             if (te is None):
                 return None
             if (te.isChar('(')):
                 pass
             elif (te.is_comma):
                 zap += 1
             elif (te.is_and):
                 and0_ += 1
                 if (i == (len(res.adjectives) - 2)):
                     last_and = True
             if (not res.adjectives[i].begin_token.morph.class0_.is_pronoun
                 ):
                 cou += 1
             i += 1
         if ((zap + and0_) > 0):
             if (and0_ > 1):
                 return None
             elif (and0_ == 1 and not last_and):
                 return None
             if ((zap + and0_) != cou):
                 if (and0_ == 1):
                     pass
                 else:
                     return None
             last = Utils.asObjectOrNull(
                 res.adjectives[len(res.adjectives) - 1], NounPhraseItem)
             if (last.is_pronoun and not last_and):
                 return None
     if (stat is not None):
         for adj in items:
             if (adj.morph.items_count > 1):
                 w1 = Utils.asObjectOrNull(adj.morph.getIndexerItem(0),
                                           NounPhraseItemTextVar)
                 w2 = Utils.asObjectOrNull(adj.morph.getIndexerItem(1),
                                           NounPhraseItemTextVar)
                 if ((len(w1.normal_value) < 2)
                         or (len(w2.normal_value) < 2)):
                     break
                 l1 = w1.normal_value[len(w1.normal_value) - 1]
                 l2 = w2.normal_value[len(w2.normal_value) - 1]
                 i1 = 0
                 i2 = 0
                 wrapi1534 = RefOutArgWrapper(0)
                 Utils.tryGetValue(stat, l1, wrapi1534)
                 i1 = wrapi1534.value
                 wrapi2533 = RefOutArgWrapper(0)
                 Utils.tryGetValue(stat, l2, wrapi2533)
                 i2 = wrapi2533.value
                 if (i1 < i2):
                     adj.morph.removeItem(1)
                     adj.morph.insertItem(0, w2)
     if (res.begin_token.getMorphClassInDictionary().is_verb
             and len(items) > 0):
         if (not res.begin_token.chars.is_all_lower
                 or res.begin_token.previous is None):
             pass
         elif (res.begin_token.previous.morph.class0_.is_preposition):
             pass
         else:
             comma = False
             tt = res.begin_token.previous
             first_pass2792 = True
             while True:
                 if first_pass2792: first_pass2792 = False
                 else: tt = tt.previous
                 if (not (tt is not None)): break
                 if (tt.morph.class0_.is_adverb):
                     continue
                 if (tt.isCharOf(".;")):
                     break
                 if (tt.is_comma):
                     comma = True
                     continue
                 if (tt.isValue("НЕ", None)):
                     continue
                 if (((tt.morph.class0_.is_noun
                       or tt.morph.class0_.is_proper)) and comma):
                     for it in res.begin_token.morph.items:
                         if (it.class0_.is_verb
                                 and (isinstance(it, MorphWordForm))):
                             if (tt.morph.checkAccord(it, False)):
                                 if (res.morph.case_.is_instrumental):
                                     return None
                                 ews = Explanatory.findDerivates(
                                     (it).normal_case, True,
                                     tt.morph.language)
                                 if (ews is not None):
                                     for ew in ews:
                                         if (ew.transitive > 0):
                                             if (res.morph.case_.is_genitive
                                                 ):
                                                 return None
                                         if (ew.nexts is not None):
                                             wrapcm535 = RefOutArgWrapper(
                                                 None)
                                             inoutres536 = Utils.tryGetValue(
                                                 ew.nexts, "", wrapcm535)
                                             cm = wrapcm535.value
                                             if (inoutres536):
                                                 if (not (
                                                     (cm) & res.morph.case_
                                                 ).is_undefined):
                                                     return None
                 break
     if (res.begin_token == res.end_token):
         mc = res.begin_token.getMorphClassInDictionary()
         if (mc.is_adverb):
             if (res.begin_token.previous is not None and
                     res.begin_token.previous.morph.class0_.is_preposition):
                 pass
             elif (mc.is_noun and not mc.is_preposition
                   and not mc.is_conjunction):
                 pass
             elif (res.begin_token.isValue("ВЕСЬ", None)):
                 pass
             else:
                 return None
     return res
Пример #15
0
 def parse_near_items(t : 'Token', t1 : 'Token', lev : int, prev : typing.List['SentItem']) -> typing.List['SentItem']:
     if (lev > 100): 
         return None
     if (t is None or t.begin_char > t1.end_char): 
         return None
     res = list()
     if (isinstance(t, ReferentToken)): 
         res.append(SentItem(Utils.asObjectOrNull(t, MetaToken)))
         return res
     delim = DelimToken.try_parse(t)
     if (delim is not None): 
         res.append(SentItem(delim))
         return res
     conj = ConjunctionHelper.try_parse(t)
     if (conj is not None): 
         res.append(SentItem(conj))
         return res
     prep_ = PrepositionHelper.try_parse(t)
     t111 = (t if prep_ is None else prep_.end_token.next0_)
     if ((isinstance(t111, NumberToken)) and ((t111.morph.class0_.is_adjective and not t111.morph.class0_.is_noun))): 
         t111 = (None)
     num = (None if t111 is None else NumbersWithUnitToken.try_parse(t111, None, False, False, False, False))
     if (num is not None): 
         if (len(num.units) == 0): 
             npt1 = NounPhraseHelper.try_parse(num.end_token.next0_, SentItem.__m_npt_attrs, 0, None)
             if (npt1 is None and num.end_token.next0_ is not None and num.end_token.next0_.is_value("РАЗ", None)): 
                 npt1 = NounPhraseToken(num.end_token.next0_, num.end_token.next0_)
                 npt1.noun = MetaToken(num.end_token.next0_, num.end_token.next0_)
             if (npt1 is not None and prep_ is not None): 
                 if (npt1.noun.end_token.is_value("РАЗ", None)): 
                     npt1.morph.remove_items(prep_.next_case, False)
                 elif (((npt1.morph.case_) & prep_.next_case).is_undefined): 
                     npt1 = (None)
                 else: 
                     npt1.morph.remove_items(prep_.next_case, False)
             if ((npt1 is not None and npt1.end_token.is_value("ОНИ", None) and npt1.preposition is not None) and npt1.preposition.normal == "ИЗ"): 
                 npt1.morph = MorphCollection(num.end_token.morph)
                 npt1.preposition = (None)
                 nn = str(num)
                 si1 = SentItem(npt1)
                 if (nn == "1" and (isinstance(num.end_token, NumberToken)) and num.end_token.end_token.is_value("ОДИН", None)): 
                     a = SemAttribute._new2946(SemAttributeType.ONEOF, num.end_token.end_token.get_normal_case_text(None, MorphNumber.SINGULAR, MorphGender.UNDEFINED, False))
                     aex = SemAttributeEx._new2945(num, a)
                     si1.attrs = list()
                     si1.attrs.append(aex)
                 else: 
                     si1.quant = SemQuantity(nn, num.begin_token, num.end_token)
                 if (prep_ is not None): 
                     si1.prep = prep_.normal
                 res.append(si1)
                 return res
             if (npt1 is not None): 
                 si1 = SentItem._new2948(npt1, SemQuantity(str(num), num.begin_token, num.end_token))
                 if (prep_ is not None): 
                     si1.prep = prep_.normal
                 if (npt1.end_token.is_value("РАЗ", None)): 
                     si1.typ = SentItemType.FORMULA
                 if (((npt1.morph.number) & (MorphNumber.PLURAL)) == (MorphNumber.UNDEFINED) and si1.quant.spelling != "1"): 
                     ok = False
                     if (si1.quant.spelling.endswith("1")): 
                         ok = True
                     elif (si1.typ == SentItemType.FORMULA): 
                         ok = True
                     elif (si1.quant.spelling.endswith("2") and npt1.morph.case_.is_genitive): 
                         ok = True
                     elif (si1.quant.spelling.endswith("3") and npt1.morph.case_.is_genitive): 
                         ok = True
                     elif (si1.quant.spelling.endswith("4") and npt1.morph.case_.is_genitive): 
                         ok = True
                     if (ok): 
                         npt1.morph = MorphCollection()
                         npt1.morph.number = MorphNumber.PLURAL
                 res.append(si1)
                 return res
         num.begin_token = t
         num.morph = MorphCollection(num.end_token.morph)
         si = SentItem(num)
         if (prep_ is not None): 
             si.prep = prep_.normal
         res.append(si)
         if (si.prep == "НА"): 
             aa = AdverbToken.try_parse(si.end_token.next0_)
             if (aa is not None and ((aa.typ == SemAttributeType.LESS or aa.typ == SemAttributeType.GREAT))): 
                 si.add_attr(aa)
                 si.end_token = aa.end_token
         return res
     mc = t.get_morph_class_in_dictionary()
     adv = AdverbToken.try_parse(t)
     npt = NounPhraseHelper.try_parse(t, SentItem.__m_npt_attrs, 0, None)
     if (npt is not None and (isinstance(npt.end_token, TextToken)) and npt.end_token.term == "БЫЛИ"): 
         npt = (None)
     if (npt is not None and adv is not None): 
         if (adv.end_char > npt.end_char): 
             npt = (None)
         elif (adv.end_char == npt.end_char): 
             res.append(SentItem(npt))
             res.append(SentItem(adv))
             return res
     if (npt is not None and len(npt.adjectives) == 0): 
         if (npt.end_token.is_value("КОТОРЫЙ", None) and t.previous is not None and t.previous.is_comma_and): 
             res1 = SentItem.__parse_subsent(npt, t1, lev + 1, prev)
             if (res1 is not None): 
                 return res1
         if (npt.end_token.is_value("СКОЛЬКО", None)): 
             tt1 = npt.end_token.next0_
             if (tt1 is not None and tt1.is_value("ВСЕГО", None)): 
                 tt1 = tt1.next0_
             npt1 = NounPhraseHelper.try_parse(tt1, NounPhraseParseAttr.NO, 0, None)
             if (npt1 is not None and not npt1.morph.case_.is_undefined and prep_ is not None): 
                 if (((prep_.next_case) & npt1.morph.case_).is_undefined): 
                     npt1 = (None)
                 else: 
                     npt1.morph.remove_items(prep_.next_case, False)
             if (npt1 is not None): 
                 npt1.begin_token = npt.begin_token
                 npt1.preposition = npt.preposition
                 npt1.adjectives.append(MetaToken(npt.end_token, npt.end_token))
                 npt = npt1
         if (npt.end_token.morph.class0_.is_adjective): 
             if (VerbPhraseHelper.try_parse(t, True, False, False) is not None): 
                 npt = (None)
     vrb = None
     if (npt is not None and len(npt.adjectives) > 0): 
         vrb = VerbPhraseHelper.try_parse(t, True, False, False)
         if (vrb is not None and vrb.first_verb.is_participle): 
             npt = (None)
     elif (adv is None or npt is not None): 
         vrb = VerbPhraseHelper.try_parse(t, True, False, False)
     if (npt is not None): 
         res.append(SentItem(npt))
     if (vrb is not None and not vrb.first_verb.is_participle and not vrb.first_verb.is_dee_participle): 
         vars0_ = list()
         for wf in vrb.first_verb.morph.items: 
             if (wf.class0_.is_verb and (isinstance(wf, MorphWordForm)) and wf.is_in_dictionary): 
                 vars0_.append(Utils.asObjectOrNull(wf, MorphWordForm))
         if (len(vars0_) < 2): 
             res.append(SentItem(vrb))
         else: 
             vrb.first_verb.verb_morph = vars0_[0]
             res.append(SentItem(vrb))
             i = 1
             while i < len(vars0_): 
                 vrb = VerbPhraseHelper.try_parse(t, False, False, False)
                 if (vrb is None): 
                     break
                 vrb.first_verb.verb_morph = vars0_[i]
                 res.append(SentItem(vrb))
                 i += 1
             if (vars0_[0].misc.mood == MorphMood.IMPERATIVE and vars0_[1].misc.mood != MorphMood.IMPERATIVE): 
                 rr = res[0]
                 res[0] = res[1]
                 res[1] = rr
         return res
     if (vrb is not None): 
         res1 = SentItem.__parse_participles(vrb, t1, lev + 1)
         if (res1 is not None): 
             res.extend(res1)
     if (len(res) > 0): 
         return res
     if (adv is not None): 
         if (adv.typ == SemAttributeType.OTHER): 
             npt1 = NounPhraseHelper.try_parse(adv.end_token.next0_, SentItem.__m_npt_attrs, 0, None)
             if (npt1 is not None and npt1.end_token.is_value("ОНИ", None) and npt1.preposition is not None): 
                 si1 = SentItem(npt1)
                 a = SemAttribute._new2946(SemAttributeType.OTHER, adv.end_token.get_normal_case_text(None, MorphNumber.UNDEFINED, MorphGender.UNDEFINED, False))
                 aex = SemAttributeEx._new2945(num, a)
                 si1.attrs = list()
                 si1.attrs.append(aex)
                 if (prep_ is not None): 
                     si1.prep = prep_.normal
                 res.append(si1)
                 return res
             for i in range(len(prev) - 1, -1, -1):
                 if (prev[i].attrs is not None): 
                     for a in prev[i].attrs: 
                         if (a.attr.typ == SemAttributeType.ONEOF): 
                             si1 = SentItem(prev[i].source)
                             aa = SemAttribute._new2946(SemAttributeType.OTHER, adv.end_token.get_normal_case_text(None, MorphNumber.UNDEFINED, MorphGender.UNDEFINED, False))
                             aex = SemAttributeEx._new2945(adv, aa)
                             si1.attrs = list()
                             si1.attrs.append(aex)
                             if (prep_ is not None): 
                                 si1.prep = prep_.normal
                             si1.begin_token = adv.begin_token
                             si1.end_token = adv.end_token
                             res.append(si1)
                             return res
         res.append(SentItem(adv))
         return res
     if (mc.is_adjective): 
         npt = NounPhraseToken._new2953(t, t, MorphCollection(t.morph))
         npt.noun = MetaToken(t, t)
         res.append(SentItem(npt))
         return res
     return None
Пример #16
0
 def try_attach_territory(
         li: typing.List['TerrItemToken'],
         ad: 'AnalyzerData',
         attach_always: bool = False,
         cits: typing.List['CityItemToken'] = None,
         exists: typing.List['GeoReferent'] = None) -> 'ReferentToken':
     if (li is None or len(li) == 0):
         return None
     ex_obj = None
     new_name = None
     adj_list = list()
     noun = None
     add_noun = None
     rt = TerrAttachHelper.__try_attach_moscowao(li, ad)
     if (rt is not None):
         return rt
     if (li[0].termin_item is not None
             and li[0].termin_item.canonic_text == "ТЕРРИТОРИЯ"):
         res2 = TerrAttachHelper.__try_attach_pure_terr(li, ad)
         return res2
     if (len(li) == 2):
         if (li[0].rzd is not None and li[1].rzd_dir is not None):
             rzd = GeoReferent()
             rzd._add_name(li[1].rzd_dir)
             rzd._add_typ_ter(li[0].kit.base_language)
             rzd.add_slot(GeoReferent.ATTR_REF, li[0].rzd.referent, False,
                          0)
             rzd.add_ext_referent(li[0].rzd)
             return ReferentToken(rzd, li[0].begin_token, li[1].end_token)
         if (li[1].rzd is not None and li[0].rzd_dir is not None):
             rzd = GeoReferent()
             rzd._add_name(li[0].rzd_dir)
             rzd._add_typ_ter(li[0].kit.base_language)
             rzd.add_slot(GeoReferent.ATTR_REF, li[1].rzd.referent, False,
                          0)
             rzd.add_ext_referent(li[1].rzd)
             return ReferentToken(rzd, li[0].begin_token, li[1].end_token)
     can_be_city_before = False
     adj_terr_before = False
     if (cits is not None):
         if (cits[0].typ == CityItemToken.ItemType.CITY):
             can_be_city_before = True
         elif (cits[0].typ == CityItemToken.ItemType.NOUN
               and len(cits) > 1):
             can_be_city_before = True
     k = 0
     k = 0
     while k < len(li):
         if (li[k].onto_item is not None):
             if (ex_obj is not None or new_name is not None):
                 break
             if (noun is not None):
                 if (k == 1):
                     if (noun.termin_item.canonic_text == "РАЙОН"
                             or noun.termin_item.canonic_text == "ОБЛАСТЬ"
                             or noun.termin_item.canonic_text == "СОЮЗ"):
                         if (isinstance(li[k].onto_item.referent,
                                        GeoReferent)):
                             if (li[k].onto_item.referent.is_state):
                                 break
                         ok = False
                         tt = li[k].end_token.next0_
                         if (tt is None):
                             ok = True
                         elif (tt.is_char_of(",.")):
                             ok = True
                         if (not ok):
                             ok = MiscLocationHelper.check_geo_object_before(
                                 li[0].begin_token)
                         if (not ok):
                             adr = AddressItemToken.try_parse(
                                 tt, None, False, False, None)
                             if (adr is not None):
                                 if (adr.typ ==
                                         AddressItemToken.ItemType.STREET):
                                     ok = True
                         if (not ok):
                             break
                     if (li[k].onto_item is not None):
                         if (noun.begin_token.is_value("МО", None)
                                 or noun.begin_token.is_value("ЛО", None)):
                             return None
             ex_obj = li[k]
         elif (li[k].termin_item is not None):
             if (noun is not None):
                 break
             if (li[k].termin_item.is_always_prefix and k > 0):
                 break
             if (k > 0 and li[k].is_doubt):
                 if (li[k].begin_token == li[k].end_token
                         and li[k].begin_token.is_value("ЗАО", None)):
                     break
             if (li[k].termin_item.is_adjective
                     or li[k].is_geo_in_dictionary):
                 adj_list.append(li[k])
             else:
                 if (ex_obj is not None):
                     geo_ = Utils.asObjectOrNull(ex_obj.onto_item.referent,
                                                 GeoReferent)
                     if (geo_ is None):
                         break
                     if (ex_obj.is_adjective and
                         ((li[k].termin_item.canonic_text == "СОЮЗ" or
                           li[k].termin_item.canonic_text == "ФЕДЕРАЦИЯ"))):
                         str0_ = str(ex_obj.onto_item)
                         if (not li[k].termin_item.canonic_text in str0_):
                             return None
                     if (li[k].termin_item.canonic_text == "РАЙОН"
                             or li[k].termin_item.canonic_text == "ОКРУГ"
                             or li[k].termin_item.canonic_text == "КРАЙ"):
                         tmp = io.StringIO()
                         for s in geo_.slots:
                             if (s.type_name == GeoReferent.ATTR_TYPE):
                                 print("{0};".format(s.value),
                                       end="",
                                       file=tmp,
                                       flush=True)
                         if (not li[k].termin_item.canonic_text
                                 in Utils.toStringStringIO(tmp).upper()):
                             if (k != 1 or new_name is not None):
                                 break
                             new_name = li[0]
                             new_name.is_adjective = True
                             new_name.onto_item = (None)
                             ex_obj = (None)
                 noun = li[k]
                 if (k == 0):
                     tt = TerrItemToken.try_parse(
                         li[k].begin_token.previous, None, True, False,
                         None)
                     if (tt is not None and tt.morph.class0_.is_adjective):
                         adj_terr_before = True
         else:
             if (ex_obj is not None):
                 break
             if (new_name is not None):
                 break
             new_name = li[k]
         k += 1
     name = None
     alt_name = None
     full_name = None
     morph_ = None
     if (ex_obj is not None):
         if (ex_obj.is_adjective and not ex_obj.morph.language.is_en
                 and noun is None):
             if (attach_always and ex_obj.end_token.next0_ is not None):
                 npt = NounPhraseHelper.try_parse(ex_obj.begin_token,
                                                  NounPhraseParseAttr.NO, 0,
                                                  None)
                 if (ex_obj.end_token.next0_.is_comma_and):
                     pass
                 elif (npt is None):
                     pass
                 else:
                     str0_ = StreetItemToken.try_parse(
                         ex_obj.end_token.next0_, None, False, None, False)
                     if (str0_ is not None):
                         if (str0_.typ == StreetItemType.NOUN
                                 and str0_.end_token == npt.end_token):
                             return None
             else:
                 cit = CityItemToken.try_parse(ex_obj.end_token.next0_,
                                               None, False, None)
                 if (cit is not None
                         and ((cit.typ == CityItemToken.ItemType.NOUN
                               or cit.typ == CityItemToken.ItemType.CITY))):
                     npt = NounPhraseHelper.try_parse(
                         ex_obj.begin_token, NounPhraseParseAttr.NO, 0,
                         None)
                     if (npt is not None
                             and npt.end_token == cit.end_token):
                         pass
                     else:
                         return None
                 elif (ex_obj.begin_token.is_value("ПОДНЕБЕСНЫЙ", None)):
                     pass
                 else:
                     return None
         if (noun is None and ex_obj.can_be_city):
             cit0 = CityItemToken.try_parse_back(
                 ex_obj.begin_token.previous)
             if (cit0 is not None
                     and cit0.typ != CityItemToken.ItemType.PROPERNAME):
                 return None
         if (ex_obj.is_doubt and noun is None):
             ok2 = False
             if (TerrAttachHelper.__can_be_geo_after(
                     ex_obj.end_token.next0_)):
                 ok2 = True
             elif (not ex_obj.can_be_surname and not ex_obj.can_be_city):
                 if ((ex_obj.end_token.next0_ is not None
                      and ex_obj.end_token.next0_.is_char(')')
                      and ex_obj.begin_token.previous is not None)
                         and ex_obj.begin_token.previous.is_char('(')):
                     ok2 = True
                 elif (ex_obj.chars.is_latin_letter
                       and ex_obj.begin_token.previous is not None):
                     if (ex_obj.begin_token.previous.is_value("IN", None)):
                         ok2 = True
                     elif (ex_obj.begin_token.previous.is_value(
                             "THE", None) and
                           ex_obj.begin_token.previous.previous is not None
                           and
                           ex_obj.begin_token.previous.previous.is_value(
                               "IN", None)):
                         ok2 = True
             if (not ok2):
                 cit0 = CityItemToken.try_parse_back(
                     ex_obj.begin_token.previous)
                 if (cit0 is not None
                         and cit0.typ != CityItemToken.ItemType.PROPERNAME):
                     pass
                 elif (MiscLocationHelper.check_geo_object_before(
                         ex_obj.begin_token.previous)):
                     pass
                 else:
                     return None
         name = ex_obj.onto_item.canonic_text
         morph_ = ex_obj.morph
     elif (new_name is not None):
         if (noun is None):
             return None
         j = 1
         while j < k:
             if (li[j].is_newline_before and not li[0].is_newline_before):
                 if (BracketHelper.can_be_start_of_sequence(
                         li[j].begin_token, False, False)):
                     pass
                 else:
                     return None
             j += 1
         morph_ = noun.morph
         if (new_name.is_adjective):
             if (noun.termin_item.acronym == "АО"):
                 if (noun.begin_token != noun.end_token):
                     return None
                 if (new_name.morph.gender != MorphGender.FEMINIE):
                     return None
             geo_before = None
             tt0 = li[0].begin_token.previous
             if (tt0 is not None and tt0.is_comma_and):
                 tt0 = tt0.previous
             if (not li[0].is_newline_before and tt0 is not None):
                 geo_before = (Utils.asObjectOrNull(tt0.get_referent(),
                                                    GeoReferent))
             if (Utils.indexOfList(li, noun, 0) < Utils.indexOfList(
                     li, new_name, 0)):
                 if (noun.termin_item.is_state):
                     return None
                 if (new_name.can_be_surname and geo_before is None):
                     if (((noun.morph.case_)
                          & new_name.morph.case_).is_undefined):
                         return None
                 if (MiscHelper.is_exists_in_dictionary(
                         new_name.begin_token, new_name.end_token,
                     (MorphClass.ADJECTIVE) | MorphClass.PRONOUN
                         | MorphClass.VERB)):
                     if (noun.begin_token != new_name.begin_token):
                         if (geo_before is None):
                             if (len(li) == 2 and
                                     TerrAttachHelper.__can_be_geo_after(
                                         li[1].end_token.next0_)):
                                 pass
                             elif (len(li) == 3
                                   and li[2].termin_item is not None
                                   and TerrAttachHelper.__can_be_geo_after(
                                       li[2].end_token.next0_)):
                                 pass
                             elif (new_name.is_geo_in_dictionary):
                                 pass
                             elif (new_name.end_token.is_newline_after):
                                 pass
                             else:
                                 return None
                 npt = NounPhraseHelper.try_parse(
                     new_name.end_token, NounPhraseParseAttr.PARSEPRONOUNS,
                     0, None)
                 if (npt is not None
                         and npt.end_token != new_name.end_token):
                     if (len(li) >= 3 and li[2].termin_item is not None
                             and npt.end_token == li[2].end_token):
                         add_noun = li[2]
                     else:
                         return None
                 rtp = new_name.kit.process_referent(
                     "PERSON", new_name.begin_token)
                 if (rtp is not None):
                     return None
                 name = ProperNameHelper.get_name_ex(
                     new_name.begin_token, new_name.end_token,
                     MorphClass.ADJECTIVE, MorphCase.UNDEFINED,
                     noun.termin_item.gender, False, False)
             else:
                 ok = False
                 if (((k + 1) < len(li)) and li[k].termin_item is None
                         and li[k + 1].termin_item is not None):
                     ok = True
                 elif ((k < len(li)) and li[k].onto_item is not None):
                     ok = True
                 elif (k == len(li) and not new_name.is_adj_in_dictionary):
                     ok = True
                 elif (MiscLocationHelper.check_geo_object_before(
                         li[0].begin_token) or can_be_city_before):
                     ok = True
                 elif (MiscLocationHelper.check_geo_object_after(
                         li[k - 1].end_token, False)):
                     ok = True
                 elif (len(li) == 3 and k == 2):
                     cit = CityItemToken.try_parse(li[2].begin_token, None,
                                                   False, None)
                     if (cit is not None):
                         if (cit.typ == CityItemToken.ItemType.CITY
                                 or cit.typ == CityItemToken.ItemType.NOUN):
                             ok = True
                 elif (len(li) == 2):
                     ok = TerrAttachHelper.__can_be_geo_after(
                         li[len(li) - 1].end_token.next0_)
                 if (not ok and not li[0].is_newline_before
                         and not li[0].chars.is_all_lower):
                     rt00 = li[0].kit.process_referent(
                         "PERSONPROPERTY", li[0].begin_token.previous)
                     if (rt00 is not None):
                         ok = True
                 if (noun.termin_item is not None
                         and noun.termin_item.is_strong
                         and new_name.is_adjective):
                     ok = True
                 if (noun.is_doubt and len(adj_list) == 0
                         and geo_before is None):
                     return None
                 name = ProperNameHelper.get_name_ex(
                     new_name.begin_token, new_name.end_token,
                     MorphClass.ADJECTIVE, MorphCase.UNDEFINED,
                     noun.termin_item.gender, False, False)
                 if (not ok and not attach_always):
                     if (MiscHelper.is_exists_in_dictionary(
                             new_name.begin_token, new_name.end_token,
                         (MorphClass.ADJECTIVE) | MorphClass.PRONOUN
                             | MorphClass.VERB)):
                         if (exists is not None):
                             for e0_ in exists:
                                 if (e0_.find_slot(GeoReferent.ATTR_NAME,
                                                   name, True) is not None):
                                     ok = True
                                     break
                         if (not ok):
                             return None
                 full_name = "{0} {1}".format(
                     ProperNameHelper.get_name_ex(li[0].begin_token,
                                                  noun.begin_token.previous,
                                                  MorphClass.ADJECTIVE,
                                                  MorphCase.UNDEFINED,
                                                  noun.termin_item.gender,
                                                  False, False),
                     noun.termin_item.canonic_text)
         else:
             if (not attach_always or
                 ((noun.termin_item is not None
                   and noun.termin_item.canonic_text == "ФЕДЕРАЦИЯ"))):
                 is_latin = noun.chars.is_latin_letter and new_name.chars.is_latin_letter
                 if (Utils.indexOfList(li, noun, 0) > Utils.indexOfList(
                         li, new_name, 0)):
                     if (not is_latin):
                         return None
                 if (not new_name.is_district_name
                         and not BracketHelper.can_be_start_of_sequence(
                             new_name.begin_token, False, False)):
                     if (len(adj_list) == 0
                             and MiscHelper.is_exists_in_dictionary(
                                 new_name.begin_token, new_name.end_token,
                                 (MorphClass.NOUN) | MorphClass.PRONOUN)):
                         if (len(li) == 2 and noun.is_city_region
                                 and (noun.whitespaces_after_count < 2)):
                             pass
                         else:
                             return None
                     if (not is_latin):
                         if ((noun.termin_item.is_region
                              and not attach_always and
                              ((not adj_terr_before or new_name.is_doubt)))
                                 and not noun.is_city_region and
                                 not noun.termin_item.is_specific_prefix):
                             if (not MiscLocationHelper.
                                     check_geo_object_before(
                                         noun.begin_token)):
                                 if (not noun.is_doubt and noun.begin_token
                                         != noun.end_token):
                                     pass
                                 elif ((noun.termin_item.is_always_prefix
                                        and len(li) == 2 and li[0] == noun)
                                       and li[1] == new_name):
                                     pass
                                 else:
                                     return None
                         if (noun.is_doubt and len(adj_list) == 0):
                             if (noun.termin_item.acronym == "МО"
                                     or noun.termin_item.acronym == "ЛО"):
                                 if (k == (len(li) - 1)
                                         and li[k].termin_item is not None):
                                     add_noun = li[k]
                                     k += 1
                                 elif (len(li) == 2 and noun == li[0]
                                       and str(new_name).endswith("совет")):
                                     pass
                                 else:
                                     return None
                             else:
                                 return None
                         pers = new_name.kit.process_referent(
                             "PERSON", new_name.begin_token)
                         if (pers is not None):
                             return None
             name = MiscHelper.get_text_value(new_name.begin_token,
                                              new_name.end_token,
                                              GetTextAttr.NO)
             if (new_name.begin_token != new_name.end_token):
                 ttt = new_name.begin_token.next0_
                 while ttt is not None and ttt.end_char <= new_name.end_char:
                     if (ttt.chars.is_letter):
                         ty = TerrItemToken.try_parse(
                             ttt, None, False, False, None)
                         if ((ty is not None and ty.termin_item is not None
                              and noun is not None)
                                 and ((noun.termin_item.canonic_text
                                       in ty.termin_item.canonic_text
                                       or ty.termin_item.canonic_text
                                       in noun.termin_item.canonic_text))):
                             name = MiscHelper.get_text_value(
                                 new_name.begin_token, ttt.previous,
                                 GetTextAttr.NO)
                             break
                     ttt = ttt.next0_
             if (len(adj_list) > 0):
                 npt = NounPhraseHelper.try_parse(adj_list[0].begin_token,
                                                  NounPhraseParseAttr.NO, 0,
                                                  None)
                 if (npt is not None and npt.end_token == noun.end_token):
                     alt_name = "{0} {1}".format(
                         npt.get_normal_case_text(None,
                                                  MorphNumber.UNDEFINED,
                                                  MorphGender.UNDEFINED,
                                                  False), name)
     else:
         if ((len(li) == 1 and noun is not None
              and noun.end_token.next0_ is not None) and (isinstance(
                  noun.end_token.next0_.get_referent(), GeoReferent))):
             g = Utils.asObjectOrNull(noun.end_token.next0_.get_referent(),
                                      GeoReferent)
             if (noun.termin_item is not None):
                 tyy = noun.termin_item.canonic_text.lower()
                 ooo = False
                 if (g.find_slot(GeoReferent.ATTR_TYPE, tyy, True)
                         is not None):
                     ooo = True
                 elif (tyy.endswith("район") and g.find_slot(
                         GeoReferent.ATTR_TYPE, "район", True) is not None):
                     ooo = True
                 if (ooo):
                     return ReferentToken._new734(g, noun.begin_token,
                                                  noun.end_token.next0_,
                                                  noun.begin_token.morph)
         if ((len(li) == 1 and noun == li[0]
              and li[0].termin_item is not None)
                 and TerrItemToken.try_parse(li[0].end_token.next0_, None,
                                             True, False, None) is None and
                 TerrItemToken.try_parse(li[0].begin_token.previous, None,
                                         True, False, None) is None):
             if (li[0].morph.number == MorphNumber.PLURAL):
                 return None
             cou = 0
             str0_ = li[0].termin_item.canonic_text.lower()
             tt = li[0].begin_token.previous
             first_pass3158 = True
             while True:
                 if first_pass3158: first_pass3158 = False
                 else: tt = tt.previous
                 if (not (tt is not None)): break
                 if (tt.is_newline_after):
                     cou += 10
                 else:
                     cou += 1
                 if (cou > 500):
                     break
                 g = Utils.asObjectOrNull(tt.get_referent(), GeoReferent)
                 if (g is None):
                     continue
                 ok = True
                 cou = 0
                 tt = li[0].end_token.next0_
                 first_pass3159 = True
                 while True:
                     if first_pass3159: first_pass3159 = False
                     else: tt = tt.next0_
                     if (not (tt is not None)): break
                     if (tt.is_newline_before):
                         cou += 10
                     else:
                         cou += 1
                     if (cou > 500):
                         break
                     tee = TerrItemToken.try_parse(tt, None, True, False,
                                                   None)
                     if (tee is None):
                         continue
                     ok = False
                     break
                 if (ok):
                     ii = 0
                     while g is not None and (ii < 3):
                         if (g.find_slot(GeoReferent.ATTR_TYPE, str0_, True)
                                 is not None):
                             return ReferentToken._new734(
                                 g, li[0].begin_token, li[0].end_token,
                                 noun.begin_token.morph)
                         g = g.higher
                         ii += 1
                 break
         return None
     ter = None
     if (ex_obj is not None and (isinstance(ex_obj.tag, GeoReferent))):
         ter = (Utils.asObjectOrNull(ex_obj.tag, GeoReferent))
     else:
         ter = GeoReferent()
         if (ex_obj is not None):
             geo_ = Utils.asObjectOrNull(ex_obj.onto_item.referent,
                                         GeoReferent)
             if (geo_ is not None and not geo_.is_city):
                 ter._merge_slots2(geo_, li[0].kit.base_language)
             else:
                 ter._add_name(name)
             if (noun is None and ex_obj.can_be_city):
                 ter._add_typ_city(li[0].kit.base_language)
             else:
                 pass
         elif (new_name is not None):
             ter._add_name(name)
             if (alt_name is not None):
                 ter._add_name(alt_name)
         if (noun is not None):
             if (noun.termin_item.canonic_text == "АО"):
                 ter._add_typ(
                     ("АВТОНОМНИЙ ОКРУГ" if li[0].kit.base_language.is_ua
                      else "АВТОНОМНЫЙ ОКРУГ"))
             elif (noun.termin_item.canonic_text == "МУНИЦИПАЛЬНОЕ СОБРАНИЕ"
                   or noun.termin_item.canonic_text
                   == "МУНІЦИПАЛЬНЕ ЗБОРИ"):
                 ter._add_typ(("МУНІЦИПАЛЬНЕ УТВОРЕННЯ"
                               if li[0].kit.base_language.is_ua else
                               "МУНИЦИПАЛЬНОЕ ОБРАЗОВАНИЕ"))
             elif (noun.termin_item.acronym == "МО"
                   and add_noun is not None):
                 ter._add_typ(add_noun.termin_item.canonic_text)
             else:
                 if (noun.termin_item.canonic_text == "СОЮЗ"
                         and ex_obj is not None
                         and ex_obj.end_char > noun.end_char):
                     return ReferentToken._new734(ter, ex_obj.begin_token,
                                                  ex_obj.end_token,
                                                  ex_obj.morph)
                 ter._add_typ(noun.termin_item.canonic_text)
                 if (noun.termin_item.is_region and ter.is_state):
                     ter._add_typ_reg(li[0].kit.base_language)
         if (ter.is_state and ter.is_region):
             for a in adj_list:
                 if (a.termin_item.is_region):
                     ter._add_typ_reg(li[0].kit.base_language)
                     break
         if (ter.is_state):
             if (full_name is not None):
                 ter._add_name(full_name)
     res = ReferentToken(ter, li[0].begin_token, li[k - 1].end_token)
     if (noun is not None and noun.morph.class0_.is_noun):
         res.morph = noun.morph
     else:
         res.morph = MorphCollection()
         ii = 0
         while ii < k:
             for v in li[ii].morph.items:
                 bi = MorphBaseInfo()
                 bi.copy_from(v)
                 if (noun is not None):
                     if (bi.class0_.is_adjective):
                         bi.class0_ = MorphClass.NOUN
                 res.morph.add_item(bi)
             ii += 1
     if (li[0].termin_item is not None
             and li[0].termin_item.is_specific_prefix):
         res.begin_token = li[0].end_token.next0_
     if (add_noun is not None and add_noun.end_char > res.end_char):
         res.end_token = add_noun.end_token
     if ((isinstance(res.begin_token.previous, TextToken))
             and (res.whitespaces_before_count < 2)):
         tt = Utils.asObjectOrNull(res.begin_token.previous, TextToken)
         if (tt.term == "АР"):
             for ty in ter.typs:
                 if ("республика" in ty or "республіка" in ty):
                     res.begin_token = tt
                     break
     return res
Пример #17
0
 def tryAttach(self, t0: 'Token') -> 'TerminToken':
     from pullenti.ner.Token import Token
     from pullenti.ner.TextToken import TextToken
     from pullenti.ner.MetaToken import MetaToken
     from pullenti.ner.MorphCollection import MorphCollection
     from pullenti.ner.core.TerminToken import TerminToken
     t1 = Utils.asObjectOrNull(t0, TextToken)
     if (t1 is None):
         return None
     if (t1.term != self.parts[0].value):
         if (len(self.parts) != 1
                 or not t1.isValue(self.parts[0].value, None)):
             return None
     if (self.tail is None):
         te = t1
         point = False
         if (te.next0_ is not None):
             if (te.next0_.isChar('.')):
                 te = te.next0_
                 point = True
             elif (len(self.parts) > 1):
                 while te.next0_ is not None:
                     if (te.next0_.isCharOf("\\/.")
                             or te.next0_.is_hiphen):
                         te = te.next0_
                         point = True
                     else:
                         break
         if (te is None):
             return None
         tt = te.next0_
         i = 1
         while i < len(self.parts):
             if (tt is not None and tt.whitespaces_before_count > 2):
                 return None
             if (tt is not None
                     and ((tt.is_hiphen or tt.isCharOf("\\/.")))):
                 tt = tt.next0_
             elif (not point and self.parts[i - 1].has_delim):
                 return None
             if (tt is None):
                 return None
             if (isinstance(tt, TextToken)):
                 tet = Utils.asObjectOrNull(tt, TextToken)
                 if (tet.term != self.parts[i].value):
                     if (not tet.isValue(self.parts[i].value, None)):
                         return None
             elif (isinstance(tt, MetaToken)):
                 mt = Utils.asObjectOrNull(tt, MetaToken)
                 if (mt.begin_token != mt.end_token):
                     return None
                 if (not mt.begin_token.isValue(self.parts[i].value,
                                                None)):
                     return None
             te = tt
             if (tt.next0_ is not None and
                 ((tt.next0_.isCharOf(".\\/") or tt.next0_.is_hiphen))):
                 tt = tt.next0_
                 point = True
                 if (tt is not None):
                     te = tt
             else:
                 point = False
             tt = tt.next0_
             i += 1
         res = TerminToken._new603(t0, te, t0 == te)
         if (point):
             res.morph = MorphCollection()
         return res
     t1 = (Utils.asObjectOrNull(t1.next0_, TextToken))
     if (t1 is None or not t1.isCharOf("-\\/")):
         return None
     t1 = (Utils.asObjectOrNull(t1.next0_, TextToken))
     if (t1 is None):
         return None
     if (t1.term[0] != self.tail[0]):
         return None
     return TerminToken(t0, t1)
Пример #18
0
class Token:
    """ Базовый класс для всех токенов """
    def __init__(self, kit_: 'AnalysisKit', begin: int, end: int) -> None:
        self.kit = None
        self.begin_char = 0
        self.end_char = 0
        self.tag = None
        self._m_previous = None
        self._m_next = None
        self.__m_morph = None
        self.chars = None
        self.__m_attrs = 0
        self.kit = kit_
        self.begin_char = begin
        self.end_char = end

    @property
    def length_char(self) -> int:
        """ Длина в исходных символах """
        return (self.end_char - self.begin_char) + 1

    @property
    def previous(self) -> 'Token':
        """ Предыдущий токен """
        return self._m_previous

    @previous.setter
    def previous(self, value) -> 'Token':
        self._m_previous = value
        if (value is not None):
            value._m_next = self
        self.__m_attrs = (0)
        return value

    @property
    def next0_(self) -> 'Token':
        """ Следующий токен """
        return self._m_next

    @next0_.setter
    def next0_(self, value) -> 'Token':
        self._m_next = value
        if (value is not None):
            value._m_previous = self
        self.__m_attrs = (0)
        return value

    @property
    def morph(self) -> 'MorphCollection':
        """ Морфологическая информация """
        if (self.__m_morph is None):
            self.__m_morph = MorphCollection()
        return self.__m_morph

    @morph.setter
    def morph(self, value) -> 'MorphCollection':
        self.__m_morph = value
        return value

    def __str__(self) -> str:
        return self.kit.sofa.text[self.begin_char:self.begin_char +
                                  (self.end_char + 1) - self.begin_char]

    def __getAttr(self, i: int) -> bool:
        if ((((self.__m_attrs) & 1)) == 0):
            self.__m_attrs = (1)
            if (self._m_previous is None):
                self._setAttr(1, True)
                self._setAttr(3, True)
            else:
                j = self._m_previous.end_char + 1
                while j < self.begin_char:
                    ch = self.kit.sofa.text[j]
                    if (Utils.isWhitespace((ch))):
                        self._setAttr(1, True)
                        if ((ord(ch)) == 0xD or (ord(ch)) == 0xA
                                or ch == '\f'):
                            self._setAttr(3, True)
                    j += 1
            if (self._m_next is None):
                self._setAttr(2, True)
                self._setAttr(4, True)
            else:
                j = self.end_char + 1
                while j < self._m_next.begin_char:
                    ch = self.kit.sofa.text[j]
                    if (Utils.isWhitespace(ch)):
                        self._setAttr(2, True)
                        if ((ord(ch)) == 0xD or (ord(ch)) == 0xA
                                or ch == '\f'):
                            self._setAttr(4, True)
                    j += 1
        return (((((self.__m_attrs) >> i)) & 1)) != 0

    def _setAttr(self, i: int, val: bool) -> None:
        if (val):
            self.__m_attrs |= ((1 << i))
        else:
            self.__m_attrs &= (~((1 << i)))

    @property
    def is_whitespace_before(self) -> bool:
        """ Наличие пробельных символов перед """
        return self.__getAttr(1)

    @is_whitespace_before.setter
    def is_whitespace_before(self, value) -> bool:
        self._setAttr(1, value)
        return value

    @property
    def is_whitespace_after(self) -> bool:
        """ Наличие пробельных символов после """
        return self.__getAttr(2)

    @is_whitespace_after.setter
    def is_whitespace_after(self, value) -> bool:
        self._setAttr(2, value)
        return value

    @property
    def is_newline_before(self) -> bool:
        """ Элемент начинается с новой строки.
         Для 1-го элемента всегда true. """
        return self.__getAttr(3)

    @is_newline_before.setter
    def is_newline_before(self, value) -> bool:
        self._setAttr(3, value)
        return value

    @property
    def is_newline_after(self) -> bool:
        """ Элемент заканчивает строку.
         Для последнего элемента всегда true. """
        return self.__getAttr(4)

    @is_newline_after.setter
    def is_newline_after(self, value) -> bool:
        self._setAttr(4, value)
        return value

    @property
    def inner_bool(self) -> bool:
        """ Это используется внутренним образом """
        return self.__getAttr(5)

    @inner_bool.setter
    def inner_bool(self, value) -> bool:
        self._setAttr(5, value)
        return value

    @property
    def not_noun_phrase(self) -> bool:
        """ Это используется внутренним образом 
         (признак того, что здесь не начинается именная группа, чтобы повторно не пытаться выделять) """
        return self.__getAttr(6)

    @not_noun_phrase.setter
    def not_noun_phrase(self, value) -> bool:
        self._setAttr(6, value)
        return value

    @property
    def whitespaces_before_count(self) -> int:
        """ Количество пробелов перед, переход на новую строку = 10, табуляция = 5 """
        if (self.previous is None):
            return 100
        if ((self.previous.end_char + 1) == self.begin_char):
            return 0
        return self.__calcWhitespaces(self.previous.end_char + 1,
                                      self.begin_char - 1)

    @property
    def newlines_before_count(self) -> int:
        """ Количество переходов на новую строку перед """
        ch0 = chr(0)
        res = 0
        txt = self.kit.sofa.text
        for p in range(self.begin_char - 1, -1, -1):
            ch = txt[p]
            if ((ord(ch)) == 0xA):
                res += 1
            elif ((ord(ch)) == 0xD and (ord(ch0)) != 0xA):
                res += 1
            elif (ch == '\f'):
                res += 10
            elif (not Utils.isWhitespace(ch)):
                break
            ch0 = ch
        return res

    @property
    def newlines_after_count(self) -> int:
        """ Количество переходов на новую строку перед """
        ch0 = chr(0)
        res = 0
        txt = self.kit.sofa.text
        p = self.end_char + 1
        while p < len(txt):
            ch = txt[p]
            if ((ord(ch)) == 0xD):
                res += 1
            elif ((ord(ch)) == 0xA and (ord(ch0)) != 0xD):
                res += 1
            elif (ch == '\f'):
                res += 10
            elif (not Utils.isWhitespace(ch)):
                break
            ch0 = ch
            p += 1
        return res

    @property
    def whitespaces_after_count(self) -> int:
        """ Количество пробелов перед, переход на новую строку = 10, табуляция = 5 """
        if (self.next0_ is None):
            return 100
        if ((self.end_char + 1) == self.next0_.begin_char):
            return 0
        return self.__calcWhitespaces(self.end_char + 1,
                                      self.next0_.begin_char - 1)

    def __calcWhitespaces(self, p0: int, p1: int) -> int:
        if ((p0 < 0) or p0 > p1 or p1 >= len(self.kit.sofa.text)):
            return -1
        res = 0
        i = p0
        while i <= p1:
            ch = self.kit.getTextCharacter(i)
            if (ch == '\r' or ch == '\n'):
                res += 10
                ch1 = self.kit.getTextCharacter(i + 1)
                if (ch != ch1 and ((ch1 == '\r' or ch1 == '\n'))):
                    i += 1
            elif (ch == '\t'):
                res += 5
            elif (ch == '\u0007'):
                res += 100
            elif (ch == '\f'):
                res += 100
            else:
                res += 1
            i += 1
        return res

    @property
    def is_hiphen(self) -> bool:
        """ Это символ переноса """
        ch = self.kit.sofa.text[self.begin_char]
        return LanguageHelper.isHiphen(ch)

    @property
    def is_table_control_char(self) -> bool:
        """ Это спец-символы для табличных элементов (7h, 1Eh, 1Fh) """
        ch = self.kit.sofa.text[self.begin_char]
        return (ord(ch)) == 7 or (ord(ch)) == 0x1F or (ord(ch)) == 0x1E

    @property
    def is_and(self) -> bool:
        """ Это соединительный союз И (на всех языках) """
        return False

    @property
    def is_or(self) -> bool:
        """ Это соединительный союз ИЛИ (на всех языках) """
        return False

    @property
    def is_comma(self) -> bool:
        """ Это запятая """
        return self.isChar(',')

    @property
    def is_comma_and(self) -> bool:
        """ Это запятая или союз И """
        return self.is_comma or self.is_and

    def isChar(self, ch: 'char') -> bool:
        """ Токен состоит из символа
        
        Args:
            ch('char'): проверяемый символ
        
        """
        if (self.begin_char != self.end_char):
            return False
        return self.kit.sofa.text[self.begin_char] == ch

    def isCharOf(self, chars_: str) -> bool:
        """ Токен состоит из одного символа, который есть в указанной строке
        
        Args:
            chars_(str): строка возможных символов
        
        """
        if (self.begin_char != self.end_char):
            return False
        return chars_.find(self.kit.sofa.text[self.begin_char]) >= 0

    def isValue(self, term: str, termua: str = None) -> bool:
        return False

    @property
    def is_letters(self) -> bool:
        """ Признак того, что это буквенный текстовой токен (TextToken) """
        return False

    @property
    def is_number(self) -> bool:
        """ Это число (в различных вариантах задания) """
        return False

    @property
    def is_referent(self) -> bool:
        """ Это сущность (Referent) """
        return False

    def getReferent(self) -> 'Referent':
        """ Ссылка на сущность (для ReferentToken) """
        return None

    def getReferents(self) -> typing.List['Referent']:
        """ Получить список ссылок на все сущности, скрывающиеся под элементом
         (дело в том, что одни сущности могут поглощать дркгие, например, адрес поглотит город)
        
        """
        return None

    def getNormalCaseText(self,
                          mc: 'MorphClass' = None,
                          single_number: bool = False,
                          gender: 'MorphGender' = MorphGender.UNDEFINED,
                          keep_chars: bool = False) -> str:
        """ Получить связанный с токеном текст в именительном падеже
        
        Args:
            mc(MorphClass): 
            single_number(bool): переводить ли в единственное число
        
        """
        return str(self)

    def getSourceText(self) -> str:
        """ Получить чистый фрагмент исходного текста
        
        """
        len0_ = (self.end_char + 1) - self.begin_char
        if ((len0_ < 1) or (self.begin_char < 0)):
            return None
        if ((self.begin_char + len0_) > len(self.kit.sofa.text)):
            return None
        return self.kit.sofa.text[self.begin_char:self.begin_char + len0_]

    def getMorphClassInDictionary(self) -> 'MorphClass':
        """ Проверка, что это текстовый токен и есть в словаре соотв. тип
        
        Args:
            cla: 
        
        """
        return self.morph.class0_

    def _serialize(self, stream: io.IOBase) -> None:
        from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
        SerializerHelper.serializeInt(stream, self.begin_char)
        SerializerHelper.serializeInt(stream, self.end_char)
        SerializerHelper.serializeInt(stream, self.__m_attrs)
        SerializerHelper.serializeInt(stream, self.chars.value)
        if (self.__m_morph is None):
            self.__m_morph = MorphCollection()
        self.__m_morph._serialize(stream)

    def _deserialize(self, stream: io.IOBase, kit_: 'AnalysisKit',
                     vers: int) -> None:
        from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
        self.kit = kit_
        self.begin_char = SerializerHelper.deserializeInt(stream)
        self.end_char = SerializerHelper.deserializeInt(stream)
        self.__m_attrs = (SerializerHelper.deserializeInt(stream))
        self.chars = CharsInfo._new2656(
            SerializerHelper.deserializeInt(stream))
        self.__m_morph = MorphCollection()
        self.__m_morph._deserialize(stream)
Пример #19
0
 def _createReferentToken(p : 'PersonReferent', begin : 'Token', end : 'Token', morph_ : 'MorphCollection', attrs : typing.List['PersonAttrToken'], ad : 'PersonAnalyzerData', for_attribute : bool, after_be_predicate : bool) -> 'ReferentToken':
     from pullenti.ner.person.internal.PersonIdentityToken import PersonIdentityToken
     if (p is None): 
         return None
     has_prefix = False
     if (attrs is not None): 
         for a in attrs: 
             if (a.typ == PersonAttrTerminType.BESTREGARDS): 
                 has_prefix = True
             else: 
                 if (a.begin_char < begin.begin_char): 
                     begin = a.begin_token
                 if (a.typ != PersonAttrTerminType.PREFIX): 
                     if (a.age is not None): 
                         p.addSlot(PersonReferent.ATTR_AGE, a.age, False, 0)
                     if (a.prop_ref is None): 
                         p.addSlot(PersonReferent.ATTR_ATTR, a.value, False, 0)
                     else: 
                         p.addSlot(PersonReferent.ATTR_ATTR, a, False, 0)
                 elif (a.gender == MorphGender.FEMINIE and not p.is_female): 
                     p.is_female = True
                 elif (a.gender == MorphGender.MASCULINE and not p.is_male): 
                     p.is_male = True
     elif ((isinstance(begin.previous, TextToken)) and (begin.whitespaces_before_count < 3)): 
         if ((begin.previous).term == "ИП"): 
             a = PersonAttrToken(begin.previous, begin.previous)
             a.prop_ref = PersonPropertyReferent()
             a.prop_ref.name = "индивидуальный предприниматель"
             p.addSlot(PersonReferent.ATTR_ATTR, a, False, 0)
             begin = begin.previous
     m0 = MorphCollection()
     for it in morph_.items: 
         bi = MorphBaseInfo(it)
         bi.number = MorphNumber.SINGULAR
         if (bi.gender == MorphGender.UNDEFINED): 
             if (p.is_male and not p.is_female): 
                 bi.gender = MorphGender.MASCULINE
             if (not p.is_male and p.is_female): 
                 bi.gender = MorphGender.FEMINIE
         m0.addItem(bi)
     morph_ = m0
     if ((attrs is not None and len(attrs) > 0 and not attrs[0].morph.case_.is_undefined) and morph_.case_.is_undefined): 
         morph_.case_ = attrs[0].morph.case_
         if (attrs[0].morph.number == MorphNumber.SINGULAR): 
             morph_.number = MorphNumber.SINGULAR
         if (p.is_male and not p.is_female): 
             morph_.gender = MorphGender.MASCULINE
         elif (p.is_female): 
             morph_.gender = MorphGender.FEMINIE
     if (begin.previous is not None): 
         ttt = begin.previous
         if (ttt.isValue("ИМЕНИ", "ІМЕНІ")): 
             for_attribute = True
         else: 
             if (ttt.isChar('.') and ttt.previous is not None): 
                 ttt = ttt.previous
             if (ttt.whitespaces_after_count < 3): 
                 if (ttt.isValue("ИМ", "ІМ")): 
                     for_attribute = True
     if (for_attribute): 
         return ReferentToken._new2329(p, begin, end, morph_, p._m_person_identity_typ)
     if ((begin.previous is not None and begin.previous.is_comma_and and (isinstance(begin.previous.previous, ReferentToken))) and (isinstance(begin.previous.previous.getReferent(), PersonReferent))): 
         rt00 = Utils.asObjectOrNull(begin.previous.previous, ReferentToken)
         ttt = rt00
         while ttt is not None: 
             if (ttt.previous is None or not ((isinstance(ttt.previous.previous, ReferentToken)))): 
                 break
             if (not ttt.previous.is_comma_and or not ((isinstance(ttt.previous.previous.getReferent(), PersonReferent)))): 
                 break
             rt00 = (Utils.asObjectOrNull(ttt.previous.previous, ReferentToken))
             ttt = (rt00)
         if (isinstance(rt00.begin_token.getReferent(), PersonPropertyReferent)): 
             ok = False
             if ((rt00.begin_token).end_token.next0_ is not None and (rt00.begin_token).end_token.next0_.isChar(':')): 
                 ok = True
             elif (rt00.begin_token.morph.number == MorphNumber.PLURAL): 
                 ok = True
             if (ok): 
                 p.addSlot(PersonReferent.ATTR_ATTR, rt00.begin_token.getReferent(), False, 0)
     if (ad is not None): 
         if (ad.overflow_level > 10): 
             return ReferentToken._new2329(p, begin, end, morph_, p._m_person_identity_typ)
         ad.overflow_level += 1
     attrs1 = None
     has_position = False
     open_br = False
     t = end.next0_
     first_pass3095 = True
     while True:
         if first_pass3095: first_pass3095 = False
         else: t = t.next0_
         if (not (t is not None)): break
         if (t.is_table_control_char): 
             break
         if (t.is_newline_before): 
             if (t.newlines_before_count > 2): 
                 break
             if (attrs1 is not None and len(attrs1) > 0): 
                 break
             ml = MailLine.parse(t, 0)
             if (ml is not None and ml.typ == MailLine.Types.FROM): 
                 break
             if (t.chars.is_capital_upper): 
                 attr1 = PersonAttrToken.tryAttach(t, (None if ad is None else ad.local_ontology), PersonAttrToken.PersonAttrAttachAttrs.NO)
                 ok1 = False
                 if (attr1 is not None): 
                     if (has_prefix or attr1.is_newline_after or ((attr1.end_token.next0_ is not None and attr1.end_token.next0_.is_table_control_char))): 
                         ok1 = True
                     else: 
                         tt2 = t.next0_
                         while tt2 is not None and tt2.end_char <= attr1.end_char: 
                             if (tt2.is_whitespace_before): 
                                 ok1 = True
                             tt2 = tt2.next0_
                 else: 
                     ttt = PersonHelper.__correctTailAttributes(p, t)
                     if (ttt is not None and ttt != t): 
                         t = ttt
                         end = t
                         continue
                 if (not ok1): 
                     break
         if (t.is_hiphen or t.isCharOf("_>|")): 
             continue
         if (t.isValue("МОДЕЛЬ", None)): 
             break
         tt = PersonHelper.__correctTailAttributes(p, t)
         if (tt != t and tt is not None): 
             t = tt
             end = t
             continue
         is_be = False
         if (t.isChar('(') and t == end.next0_): 
             open_br = True
             t = t.next0_
             if (t is None): 
                 break
             pit1 = PersonItemToken.tryAttach(t, None, PersonItemToken.ParseAttr.NO, None)
             if ((pit1 is not None and t.chars.is_capital_upper and pit1.end_token.next0_ is not None) and (isinstance(t, TextToken)) and pit1.end_token.next0_.isChar(')')): 
                 if (pit1.lastname is not None): 
                     inf = MorphBaseInfo._new2321(MorphCase.NOMINATIVE)
                     if (p.is_male): 
                         inf.gender = Utils.valToEnum((inf.gender) | (MorphGender.MASCULINE), MorphGender)
                     if (p.is_female): 
                         inf.gender = Utils.valToEnum((inf.gender) | (MorphGender.FEMINIE), MorphGender)
                     sur = PersonIdentityToken.createLastname(pit1, inf)
                     if (sur is not None): 
                         p._addFioIdentity(sur, None, None)
                         t = pit1.end_token.next0_
                         end = t
                         continue
         elif (t.is_comma): 
             t = t.next0_
             if ((isinstance(t, TextToken)) and (t).isValue("WHO", None)): 
                 continue
         elif ((isinstance(t, TextToken)) and (t).is_verb_be): 
             t = t.next0_
         elif (t.is_and and t.is_whitespace_after and not t.is_newline_after): 
             if (t == end.next0_): 
                 break
             t = t.next0_
         elif (t.is_hiphen and t == end.next0_): 
             t = t.next0_
         elif (t.isChar('.') and t == end.next0_ and has_prefix): 
             t = t.next0_
         ttt2 = PersonHelper.createNickname(p, t)
         if (ttt2 is not None): 
             end = ttt2
             t = end
             continue
         if (t is None): 
             break
         attr = None
         attr = PersonAttrToken.tryAttach(t, (None if ad is None else ad.local_ontology), PersonAttrToken.PersonAttrAttachAttrs.NO)
         if (attr is None): 
             if ((t is not None and t.getReferent() is not None and t.getReferent().type_name == "GEO") and attrs1 is not None and open_br): 
                 continue
             if ((t.chars.is_capital_upper and open_br and t.next0_ is not None) and t.next0_.isChar(')')): 
                 if (p.findSlot(PersonReferent.ATTR_LASTNAME, None, True) is None): 
                     p.addSlot(PersonReferent.ATTR_LASTNAME, t.getSourceText().upper(), False, 0)
                     t = t.next0_
                     end = t
             if (t is not None and t.isValue("КОТОРЫЙ", None) and t.morph.number == MorphNumber.SINGULAR): 
                 if (not p.is_female and t.morph.gender == MorphGender.FEMINIE): 
                     p.is_female = True
                     p._correctData()
                 elif (not p.is_male and t.morph.gender == MorphGender.MASCULINE): 
                     p.is_male = True
                     p._correctData()
             break
         if (attr.morph.number == MorphNumber.PLURAL): 
             break
         if (attr.typ == PersonAttrTerminType.BESTREGARDS): 
             break
         if (attr.is_doubt): 
             if (has_prefix): 
                 pass
             elif (t.is_newline_before and attr.is_newline_after): 
                 pass
             elif (t.previous is not None and ((t.previous.is_hiphen or t.previous.isChar(':')))): 
                 pass
             else: 
                 break
         if (not morph_.case_.is_undefined and not attr.morph.case_.is_undefined): 
             if (((morph_.case_) & attr.morph.case_).is_undefined and not is_be): 
                 break
         if (open_br): 
             if (PersonAnalyzer._tryAttachPerson(t, ad, False, 0, True) is not None): 
                 break
         if (attrs1 is None): 
             if (t.previous.is_comma and t.previous == end.next0_): 
                 ttt = attr.end_token.next0_
                 if (ttt is not None): 
                     if (ttt.morph.class0_.is_verb): 
                         if (MiscHelper.canBeStartOfSentence(begin)): 
                             pass
                         else: 
                             break
             attrs1 = list()
         attrs1.append(attr)
         if (attr.typ == PersonAttrTerminType.POSITION or attr.typ == PersonAttrTerminType.KING): 
             if (not is_be): 
                 has_position = True
         elif (attr.typ != PersonAttrTerminType.PREFIX): 
             if (attr.typ == PersonAttrTerminType.OTHER and attr.age is not None): 
                 pass
             else: 
                 attrs1 = (None)
                 break
         t = attr.end_token
     if (attrs1 is not None and has_position and attrs is not None): 
         te1 = attrs[len(attrs) - 1].end_token.next0_
         te2 = attrs1[0].begin_token
         if (te1.whitespaces_after_count > te2.whitespaces_before_count and (te2.whitespaces_before_count < 2)): 
             pass
         elif (attrs1[0].age is not None): 
             pass
         elif (((te1.is_hiphen or te1.isChar(':'))) and not attrs1[0].is_newline_before and ((te2.previous.is_comma or te2.previous == end))): 
             pass
         else: 
             for a in attrs: 
                 if (a.typ == PersonAttrTerminType.POSITION): 
                     te = attrs1[len(attrs1) - 1].end_token
                     if (te.next0_ is not None): 
                         if (not te.next0_.isChar('.')): 
                             attrs1 = (None)
                             break
     if (attrs1 is not None and not has_prefix): 
         attr = attrs1[len(attrs1) - 1]
         ok = False
         if (attr.end_token.next0_ is not None and attr.end_token.next0_.chars.is_capital_upper): 
             ok = True
         else: 
             rt = PersonAnalyzer._tryAttachPerson(attr.begin_token, ad, False, -1, False)
             if (rt is not None and (isinstance(rt.referent, PersonReferent))): 
                 ok = True
         if (ok): 
             if (attr.begin_token.whitespaces_before_count > attr.end_token.whitespaces_after_count): 
                 attrs1 = (None)
             elif (attr.begin_token.whitespaces_before_count == attr.end_token.whitespaces_after_count): 
                 rt1 = PersonAnalyzer._tryAttachPerson(attr.begin_token, ad, False, -1, False)
                 if (rt1 is not None): 
                     attrs1 = (None)
     if (attrs1 is not None): 
         for a in attrs1: 
             if (a.typ != PersonAttrTerminType.PREFIX): 
                 if (a.age is not None): 
                     p.addSlot(PersonReferent.ATTR_AGE, a.age, True, 0)
                 elif (a.prop_ref is None): 
                     p.addSlot(PersonReferent.ATTR_ATTR, a.value, False, 0)
                 else: 
                     p.addSlot(PersonReferent.ATTR_ATTR, a, False, 0)
                 end = a.end_token
                 if (a.gender != MorphGender.UNDEFINED and not p.is_female and not p.is_male): 
                     if (a.gender == MorphGender.MASCULINE and not p.is_male): 
                         p.is_male = True
                         p._correctData()
                     elif (a.gender == MorphGender.FEMINIE and not p.is_female): 
                         p.is_female = True
                         p._correctData()
         if (open_br): 
             if (end.next0_ is not None and end.next0_.isChar(')')): 
                 end = end.next0_
     crlf_cou = 0
     t = end.next0_
     first_pass3096 = True
     while True:
         if first_pass3096: first_pass3096 = False
         else: t = t.next0_
         if (not (t is not None)): break
         if (t.is_table_control_char): 
             break
         if (t.is_newline_before): 
             ml = MailLine.parse(t, 0)
             if (ml is not None and ml.typ == MailLine.Types.FROM): 
                 break
             crlf_cou += 1
         if (t.isCharOf(":,(") or t.is_hiphen): 
             continue
         if (t.isChar('.') and t == end.next0_): 
             continue
         r = t.getReferent()
         if (r is not None): 
             if (r.type_name == "PHONE" or r.type_name == "URI" or r.type_name == "ADDRESS"): 
                 ty = r.getStringValue("SCHEME")
                 if (r.type_name == "URI"): 
                     if ((ty != "mailto" and ty != "skype" and ty != "ICQ") and ty != "http"): 
                         break
                 p._addContact(r)
                 end = t
                 crlf_cou = 0
                 continue
         if (isinstance(r, PersonIdentityReferent)): 
             p.addSlot(PersonReferent.ATTR_IDDOC, r, False, 0)
             end = t
             crlf_cou = 0
             continue
         if (r is not None and r.type_name == "ORGANIZATION"): 
             if (t.next0_ is not None and t.next0_.morph.class0_.is_verb): 
                 break
             if (begin.previous is not None and begin.previous.morph.class0_.is_verb): 
                 break
             if (t.whitespaces_after_count == 1): 
                 break
             exist = False
             for s in p.slots: 
                 if (s.type_name == PersonReferent.ATTR_ATTR and (isinstance(s.value, PersonPropertyReferent))): 
                     pr = Utils.asObjectOrNull(s.value, PersonPropertyReferent)
                     if (pr.findSlot(PersonPropertyReferent.ATTR_REF, r, True) is not None): 
                         exist = True
                         break
                 elif (s.type_name == PersonReferent.ATTR_ATTR and (isinstance(s.value, PersonAttrToken))): 
                     pr = Utils.asObjectOrNull(s.value, PersonAttrToken)
                     if (pr.referent.findSlot(PersonPropertyReferent.ATTR_REF, r, True) is not None): 
                         exist = True
                         break
             if (not exist): 
                 pat = PersonAttrToken(t, t)
                 pat.prop_ref = PersonPropertyReferent._new2291("сотрудник")
                 pat.prop_ref.addSlot(PersonPropertyReferent.ATTR_REF, r, False, 0)
                 p.addSlot(PersonReferent.ATTR_ATTR, pat, False, 0)
             continue
         if (r is not None): 
             break
         if (not has_prefix or crlf_cou >= 2): 
             break
         rt = t.kit.processReferent("PERSON", t)
         if (rt is not None): 
             break
     if (ad is not None): 
         ad.overflow_level -= 1
     return ReferentToken._new2329(p, begin, end, morph_, p._m_person_identity_typ)
Пример #20
0
 def __try_parse_ru(t: 'Token', can_be_partition: bool,
                    can_be_adj_partition: bool,
                    force_parse: bool) -> 'VerbPhraseToken':
     res = None
     t0 = t
     not0_ = None
     has_verb = False
     verb_be_before = False
     prep = None
     first_pass3070 = True
     while True:
         if first_pass3070: first_pass3070 = False
         else: t = t.next0_
         if (not (t is not None)): break
         if (not (isinstance(t, TextToken))):
             break
         tt = Utils.asObjectOrNull(t, TextToken)
         is_participle = False
         if (tt.term == "НЕ"):
             not0_ = t
             continue
         ty = 0
         norm = None
         mc = tt.get_morph_class_in_dictionary()
         if (tt.term == "НЕТ"):
             if (has_verb):
                 break
             ty = 1
         elif (tt.term == "ДОПУСТИМО"):
             ty = 3
         elif (mc.is_adverb and not mc.is_verb):
             ty = 2
         elif (tt.is_pure_verb or tt.is_verb_be):
             ty = 1
             if (has_verb):
                 if (not tt.morph.contains_attr("инф.", None)):
                     if (verb_be_before):
                         pass
                     else:
                         break
         elif (mc.is_verb):
             if (mc.is_preposition or mc.is_misc or mc.is_pronoun):
                 pass
             elif (mc.is_noun):
                 if (tt.term == "СТАЛИ" or tt.term == "СТЕКЛО"
                         or tt.term == "БЫЛИ"):
                     ty = 1
                 elif (not tt.chars.is_all_lower
                       and not MiscHelper.can_be_start_of_sentence(tt)):
                     ty = 1
                 elif (mc.is_adjective and can_be_partition):
                     ty = 1
                 elif (force_parse):
                     ty = 1
             elif (mc.is_proper):
                 if (tt.chars.is_all_lower):
                     ty = 1
             else:
                 ty = 1
             if (mc.is_adjective):
                 is_participle = True
             if (not tt.morph.case_.is_undefined):
                 is_participle = True
             if (not can_be_partition and is_participle):
                 break
             if (has_verb):
                 if (tt.morph.contains_attr("инф.", None)):
                     pass
                 elif (not is_participle):
                     pass
                 else:
                     break
         elif ((mc.is_adjective and tt.morph.contains_attr("к.ф.", None)
                and tt.term.endswith("О")) and NounPhraseHelper.try_parse(
                    tt, NounPhraseParseAttr.NO, 0, None) is None):
             ty = 2
         elif (mc.is_adjective
               and ((can_be_partition or can_be_adj_partition))):
             if (tt.morph.contains_attr("к.ф.", None)
                     and not can_be_adj_partition):
                 break
             norm = tt.get_normal_case_text(MorphClass.ADJECTIVE,
                                            MorphNumber.SINGULAR,
                                            MorphGender.MASCULINE, False)
             if (norm.endswith("ЙШИЙ")):
                 pass
             else:
                 grs = DerivateService.find_derivates(norm, True, None)
                 if (grs is not None and len(grs) > 0):
                     hverb = False
                     hpart = False
                     for gr in grs:
                         for w in gr.words:
                             if (w.class0_.is_adjective
                                     and w.class0_.is_verb):
                                 if (w.spelling == norm):
                                     hpart = True
                             elif (w.class0_.is_verb):
                                 hverb = True
                     if (hpart and hverb):
                         ty = 3
                     elif (can_be_adj_partition):
                         ty = 3
                     if (ty != 3 and not Utils.isNullOrEmpty(grs[0].prefix)
                             and norm.startswith(grs[0].prefix)):
                         hverb = False
                         hpart = False
                         norm1 = norm[len(grs[0].prefix):]
                         grs = DerivateService.find_derivates(
                             norm1, True, None)
                         if (grs is not None and len(grs) > 0):
                             for gr in grs:
                                 for w in gr.words:
                                     if (w.class0_.is_adjective
                                             and w.class0_.is_verb):
                                         if (w.spelling == norm1):
                                             hpart = True
                                     elif (w.class0_.is_verb):
                                         hverb = True
                         if (hpart and hverb):
                             ty = 3
         if (ty == 0 and t == t0 and can_be_partition):
             prep = PrepositionHelper.try_parse(t)
             if (prep is not None):
                 t = prep.end_token
                 continue
         if (ty == 0):
             break
         if (res is None):
             res = VerbPhraseToken(t0, t)
         res.end_token = t
         it = VerbPhraseItemToken._new603(t, t, MorphCollection(t.morph))
         if (not0_ is not None):
             it.begin_token = not0_
             it.not0_ = True
             not0_ = (None)
         it.is_adverb = ty == 2
         if (prep is not None and not t.morph.case_.is_undefined
                 and len(res.items) == 0):
             if (((prep.next_case) & t.morph.case_).is_undefined):
                 return None
             it.morph.remove_items(prep.next_case, False)
             res.preposition = prep
         if (norm is None):
             norm = t.get_normal_case_text(
                 (MorphClass.ADJECTIVE if ty == 3 else
                  (MorphClass.ADVERB if ty == 2 else MorphClass.VERB)),
                 MorphNumber.SINGULAR, MorphGender.MASCULINE, False)
             if (ty == 1 and not tt.morph.case_.is_undefined):
                 mi = MorphWordForm._new604(MorphCase.NOMINATIVE,
                                            MorphNumber.SINGULAR,
                                            MorphGender.MASCULINE)
                 for mit in tt.morph.items:
                     if (isinstance(mit, MorphWordForm)):
                         mi.misc = mit.misc
                         break
                 nnn = MorphologyService.get_wordform("КК" + t.term, mi)
                 if (nnn is not None):
                     norm = nnn[2:]
         it.normal = norm
         res.items.append(it)
         if (not has_verb and ((ty == 1 or ty == 3))):
             res.morph = it.morph
             has_verb = True
         if (ty == 1 or ty == 3):
             if (ty == 1 and tt.is_verb_be):
                 verb_be_before = True
             else:
                 verb_be_before = False
     if (not has_verb):
         return None
     for i in range(len(res.items) - 1, 0, -1):
         if (res.items[i].is_adverb):
             del res.items[i]
             res.end_token = res.items[i - 1].end_token
         else:
             break
     return res