Exemplo n.º 1
0
 def deserialize(self, stream: io.IOBase, all0_: typing.List['Referent'],
                 sofa: 'SourceOfAnalysis') -> None:
     """Restore this referent's slots and text occurrences from *stream*.

     Args:
         stream: binary stream positioned at the serialized payload.
         all0_: pool of already-restored referents, used to resolve
             back-references (encoded as negative ids).
         sofa: source of analysis that owns the restored annotations.
     """
     typ = SerializerHelper.deserializeString(stream)
     slot_count = SerializerHelper.deserializeInt(stream)
     for _ in range(slot_count):
         typ = SerializerHelper.deserializeString(stream)
         c = SerializerHelper.deserializeInt(stream)
         id0_ = SerializerHelper.deserializeInt(stream)
         val = None
         if id0_ < 0:
             # Negative id: back-reference into the referent pool.
             val = all0_[(-id0_) - 1]
         elif id0_ > 0:
             # Positive id: the value is an inline string — rewind the
             # 4 bytes just consumed by the int and re-read as a string.
             stream.seek(stream.tell() - 4, io.SEEK_SET)
             val = SerializerHelper.deserializeString(stream)
         self.addSlot(typ, val, False, c)
     occ_count = SerializerHelper.deserializeInt(stream)
     self.__m_occurrence = list()
     for _ in range(occ_count):
         a = TextAnnotation._new2691(sofa, self)
         self.__m_occurrence.append(a)
         a.begin_char = SerializerHelper.deserializeInt(stream)
         a.end_char = SerializerHelper.deserializeInt(stream)
         attr = SerializerHelper.deserializeInt(stream)
         if (attr & 1) != 0:
             a.essential_for_occurence = True
Exemplo n.º 2
0
 def embed_token(self, mt: 'MetaToken') -> None:
     """ Embed a metatoken into the main token chain.
     
     Args:
         mt(MetaToken): the metatoken to embed
     
     """
     if (mt is None):
         return
     # Normalize an inverted span by swapping begin/end tokens.
     if (mt.begin_char > mt.end_char):
         bg = mt.begin_token
         mt.begin_token = mt.end_token
         mt.end_token = bg
     # Still inverted after the swap — the span is unusable, give up.
     if (mt.begin_char > mt.end_char):
         return
     # Splice mt into the token chain in place of the tokens it covers.
     # NOTE(review): 'previous' / 'next0_' appear to be properties that
     # relink both directions of the chain — confirm in the Token class.
     if (mt.begin_token == self.first_token):
         self.first_token = (mt)
     else:
         tp = mt.begin_token.previous
         mt.previous = tp
     tn = mt.end_token.next0_
     mt.next0_ = tn
     # A ReferentToken additionally records its span as a text
     # occurrence on the referent it wraps.
     if (isinstance(mt, ReferentToken)):
         if (mt.referent is not None):
             mt.referent.add_occurence(
                 TextAnnotation._new474(self.sofa, mt.begin_char,
                                        mt.end_char))
Exemplo n.º 3
0
 def deserialize(self, stream: Stream, all0_: typing.List['Referent'],
                 sofa: 'SourceOfAnalysis') -> None:
     """Restore this referent's slots and text occurrences from *stream*.

     Args:
         stream: stream positioned at the serialized referent payload.
         all0_: pool of already-restored referents used to resolve
             back-references (negative ids); may be None, and out-of-range
             ids are ignored.
         sofa: source of analysis that owns the restored annotations.
     """
     typ = SerializerHelper.deserialize_string(stream)
     slot_count = SerializerHelper.deserialize_int(stream)
     for _ in range(slot_count):
         typ = SerializerHelper.deserialize_string(stream)
         c = SerializerHelper.deserialize_int(stream)
         id0_ = SerializerHelper.deserialize_int(stream)
         val = None
         if id0_ < 0 and all0_ is not None:
             # Negative id: back-reference into the pool (bounds-checked).
             ref_index = (-id0_) - 1
             if ref_index < len(all0_):
                 val = all0_[ref_index]
         elif id0_ > 0:
             # Positive id: the value is an inline string — rewind the
             # 4 bytes just consumed by the int and re-read as a string.
             stream.position = stream.position - 4
             val = SerializerHelper.deserialize_string(stream)
         self.add_slot(typ, val, False, c)
     occ_count = SerializerHelper.deserialize_int(stream)
     self.__m_occurrence = list()
     for _ in range(occ_count):
         a = TextAnnotation._new2863(sofa, self)
         self.__m_occurrence.append(a)
         a.begin_char = SerializerHelper.deserialize_int(stream)
         a.end_char = SerializerHelper.deserialize_int(stream)
         attr = SerializerHelper.deserialize_int(stream)
         if (attr & 1) != 0:
             a.essential_for_occurence = True
Exemplo n.º 4
0
 def createAnnotation(kit_: 'AnalysisKit',
                      max_sents: int) -> 'KeywordReferent':
     """Build an auto-annotation referent from the best-ranked sentences.

     Args:
         kit_(AnalysisKit): analyzed document to summarize
         max_sents(int): upper bound on kept sentences; silently reduced
             to about a third of the candidate count when too large

     Returns:
         A KeywordReferent of type ANNOTATION, or None when fewer than
         two candidate sentences were found.
     """
     candidates = list()
     t = kit_.first_token
     while t is not None:
         sent = AutoannoSentToken.__tryParse(t)
         if sent is not None:
             if sent.rank > 0:
                 candidates.append(sent)
             t = sent.end_token
         t = t.next0_
     if len(candidates) < 2:
         return None
     # Weight earlier sentences higher: linear decay over position.
     total = len(candidates)
     for pos, sent in enumerate(candidates):
         sent.rank *= (total - pos) / total
     # Cap the requested size at roughly a third of the candidates.
     if (max_sents * 3) > len(candidates):
         max_sents = len(candidates) // 3
         if max_sents == 0:
             max_sents = 1
     # Prune the weakest sentence (the last one on rank ties) until the
     # requested size is reached.
     while len(candidates) > max_sents:
         worst = 0
         worst_rank = candidates[0].rank
         for pos in range(1, len(candidates)):
             if candidates[pos].rank <= worst_rank:
                 worst_rank = candidates[pos].rank
                 worst = pos
         del candidates[worst]
     ano = KeywordReferent()
     ano.typ = KeywordType.ANNOTATION
     tmp = io.StringIO()
     for sent in candidates:
         if tmp.tell() > 0:
             tmp.write(' ')
         tmp.write(str(sent.value))
         ano.occurrence.append(
             TextAnnotation._new1488(sent.begin_char, sent.end_char, ano,
                                     kit_.sofa))
     ano.addSlot(KeywordReferent.ATTR_VALUE, Utils.toStringStringIO(tmp),
                 True, 0)
     return ano
Exemplo n.º 5
0
 def save_to_local_ontology(self) -> None:
     """Register self.referent with the local ontology held in self.data.

     On success the referent is replaced by the registered instance and
     this token's span is recorded as a new text occurrence on it.
     The data reference is cleared in either case.
     """
     if self.data is None:
         return
     registered = self.data.register_referent(self.referent)
     self.data = None
     if registered is None:
         return
     self.referent = registered
     anno = TextAnnotation()
     anno.sofa = self.kit.sofa
     anno.occurence_of = self.referent
     anno.begin_char = self.begin_char
     anno.end_char = self.end_char
     self.referent.add_occurence(anno)
Exemplo n.º 6
0
 def addOccurence(self, anno: 'TextAnnotation') -> None:
     """Attach an annotation to this referent.

     Overlapping annotations are merged into an existing entry; otherwise
     the annotation is inserted so the list stays ordered by begin_char.

     Args:
         anno(TextAnnotation): annotation to record
     """
     for existing in self.occurrence:
         relation = existing._compareWith(anno)
         if relation == TextsCompareType.NONCOMPARABLE:
             continue
         if (relation == TextsCompareType.EQUIVALENT
                 or relation == TextsCompareType.CONTAINS):
             # Already covered by an existing annotation — nothing to do.
             return
         if (relation == TextsCompareType.IN
                 or relation == TextsCompareType.INTERSECT):
             existing._merge(anno)
             return
     if anno.occurence_of != self and anno.occurence_of is not None:
         # The annotation belongs to another referent: use a detached copy.
         anno = TextAnnotation._new2689(anno.begin_char, anno.end_char,
                                        anno.sofa)
     if self.__m_occurrence is None:
         self.__m_occurrence = list()
     anno.occurence_of = self
     occ = self.__m_occurrence
     if not occ:
         anno.essential_for_occurence = True
         occ.append(anno)
         return
     if anno.begin_char < occ[0].begin_char:
         occ.insert(0, anno)
         return
     if anno.begin_char >= occ[-1].begin_char:
         occ.append(anno)
         return
     for pos in range(len(occ) - 1):
         if occ[pos].begin_char <= anno.begin_char <= occ[pos + 1].begin_char:
             occ.insert(pos + 1, anno)
             return
     occ.append(anno)
Exemplo n.º 7
0
 def _process(begin : 'Token', max_char_pos : int, kit : 'AnalysisKit', end_token : 'Token') -> 'TitlePageReferent':
     """Analyse the title-page region starting at *begin* and build a
     TitlePageReferent (title, document type, persons with roles,
     organization, city, date, speciality).

     Args:
         begin: first token of the region to analyse.
         max_char_pos: hard right boundary for the scan (<= 0 means no limit).
         kit: analysis kit; when not None, recognized spans are embedded
             into its token chain.
         end_token: out-parameter wrapper whose .value is advanced to the
             last token consumed.

     Returns:
         The populated referent, or None when nothing was extracted.
     """
     end_token.value = begin
     res = TitlePageReferent()
     term = None
     # Phase 1: split the region into lines and gather statistics on the
     # blank-line gaps between them to estimate the typical paragraph gap.
     lines = Line.parse(begin, 30, 1500, max_char_pos)
     if (len(lines) < 1):
         return None
     cou = len(lines)
     min_newlines_count = 10
     lines_count_stat = dict()
     i = 0
     while i < len(lines):
         # Stop collecting once body text / table of contents begins.
         if (TitleNameToken.can_be_start_of_text_or_content(lines[i].begin_token, lines[i].end_token)):
             cou = i
             break
         j = lines[i].newlines_before_count
         if (i > 0 and j > 0):
             if (not j in lines_count_stat):
                 lines_count_stat[j] = 1
             else:
                 lines_count_stat[j] += 1
         i += 1
     # The most frequent gap size becomes the paragraph-separator estimate.
     max0_ = 0
     for kp in lines_count_stat.items():
         if (kp[1] > max0_):
             max0_ = kp[1]
             min_newlines_count = kp[0]
     end_char = (lines[cou - 1].end_char if cou > 0 else 0)
     if (max_char_pos > 0 and end_char > max_char_pos):
         end_char = max_char_pos
     # Phase 2: try every window of up to 5 consecutive lines as a
     # candidate document title.
     names = list()
     i = 0
     while i < cou:
         if (i == 6):
             # NOTE(review): no-op branch — looks like a leftover debug anchor.
             pass
         j = i
         while (j < cou) and (j < (i + 5)):
             if (i == 6 and j == 8):
                 # NOTE(review): no-op branch — looks like a leftover debug anchor.
                 pass
             if (j > i):
                 # Do not join lines across a RU/EN language switch or a
                 # gap wider than twice the typical paragraph separator.
                 if (lines[j - 1].is_pure_en and lines[j].is_pure_ru):
                     break
                 if (lines[j - 1].is_pure_ru and lines[j].is_pure_en):
                     break
                 if (lines[j].newlines_before_count >= (min_newlines_count * 2)):
                     break
             ttt = TitleNameToken.try_parse(lines[i].begin_token, lines[j].end_token, min_newlines_count)
             if (ttt is not None):
                 if (lines[i].is_pure_en):
                     ttt.morph.language = MorphLang.EN
                 elif (lines[i].is_pure_ru):
                     ttt.morph.language = MorphLang.RU
                 names.append(ttt)
             j += 1
         i += 1
     # Phase 3: pick the best-ranked candidate (preferring a ranked
     # Russian variant over an English one) and register it as the title.
     TitleNameToken.sort(names)
     name_rt = None
     if (len(names) > 0):
         i0 = 0
         if (names[i0].morph.language.is_en):
             ii = 1
             while ii < len(names):
                 if (names[ii].morph.language.is_ru and names[ii].rank > 0):
                     i0 = ii
                     break
                 ii += 1
         term = res._add_name(names[i0].begin_name_token, names[i0].end_name_token)
         if (names[i0].type_value is not None):
             res._add_type(names[i0].type_value)
         if (names[i0].speciality is not None):
             res.speciality = names[i0].speciality
         rt = ReferentToken(res, names[i0].begin_token, names[i0].end_token)
         if (kit is not None):
             kit.embed_token(rt)
         else:
             res.add_occurence(TextAnnotation(rt.begin_token, rt.end_token))
         end_token.value = rt.end_token
         name_rt = rt
         if (begin.begin_char == rt.begin_char):
             begin = (rt)
     # Phase 4: embed every further occurrence of the chosen title text,
     # extending each span over a trailing dot and enclosing brackets.
     if (term is not None and kit is not None):
         t = kit.first_token
         first_pass3397 = True
         while True:
             if first_pass3397: first_pass3397 = False
             else: t = t.next0_
             if (not (t is not None)): break
             tok = term.try_parse(t, TerminParseAttr.NO)
             if (tok is None):
                 continue
             t0 = t
             t1 = tok.end_token
             if (t1.next0_ is not None and t1.next0_.is_char('.')):
                 t1 = t1.next0_
             if (BracketHelper.can_be_start_of_sequence(t0.previous, False, False) and BracketHelper.can_be_end_of_sequence(t1.next0_, False, None, False)):
                 t0 = t0.previous
                 t1 = t1.next0_
             rt = ReferentToken(res, t0, t1)
             kit.embed_token(rt)
             t = (rt)
     # Phase 5: walk the tokens collecting document type, speciality,
     # persons (with their roles), dates, cities and organizations.
     pr = PersonRelations()
     pers_typ = TitleItemToken.Types.UNDEFINED
     pers_types = pr.rel_types
     t = begin
     first_pass3398 = True
     while True:
         if first_pass3398: first_pass3398 = False
         else: t = t.next0_
         if (not (t is not None)): break
         if (max_char_pos > 0 and t.begin_char > max_char_pos):
             break
         if (t == name_rt):
             continue
         tpt = TitleItemToken.try_attach(t)
         if (tpt is not None):
             pers_typ = TitleItemToken.Types.UNDEFINED
             if (tpt.typ == TitleItemToken.Types.TYP):
                 if (len(res.types) == 0):
                     res._add_type(tpt.value)
                 elif (len(res.types) == 1):
                     ty = res.types[0].upper()
                     if (ty == "РЕФЕРАТ"):
                         res._add_type(tpt.value)
                     elif (ty == "АВТОРЕФЕРАТ"):
                         # Combine "abstract" + dissertation kind into one
                         # canonical type string (RU and UK variants).
                         if (tpt.value == "КАНДИДАТСКАЯ ДИССЕРТАЦИЯ"):
                             res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат кандидатской диссертации", True, 0)
                         elif (tpt.value == "ДОКТОРСКАЯ ДИССЕРТАЦИЯ"):
                             res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат докторской диссертации", True, 0)
                         elif (tpt.value == "МАГИСТЕРСКАЯ ДИССЕРТАЦИЯ"):
                             res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат магистерской диссертации", True, 0)
                         elif (tpt.value == "КАНДИДАТСЬКА ДИСЕРТАЦІЯ"):
                             res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат кандидатської дисертації", True, 0)
                         elif (tpt.value == "ДОКТОРСЬКА ДИСЕРТАЦІЯ"):
                             res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат докторської дисертації", True, 0)
                         elif (tpt.value == "МАГІСТЕРСЬКА ДИСЕРТАЦІЯ"):
                             res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат магістерської дисертації", True, 0)
                         else:
                             res._add_type(tpt.value)
                     elif (tpt.value == "РЕФЕРАТ" or tpt.value == "АВТОРЕФЕРАТ"):
                         if (not tpt.value in ty):
                             res._add_type(tpt.value)
             elif (tpt.typ == TitleItemToken.Types.SPECIALITY):
                 if (res.speciality is None):
                     res.speciality = tpt.value
             elif (tpt.typ in pers_types):
                 # Remember the role keyword; it applies to persons that follow.
                 pers_typ = tpt.typ
             t = tpt.end_token
             if (t.end_char > end_token.value.end_char):
                 end_token.value = t
             if (t.next0_ is not None and t.next0_.is_char_of(":-")):
                 t = t.next0_
             continue
         if (t.end_char > end_char):
             break
         rli = t.get_referents()
         if (rli is None):
             continue
         # Skip referents inside "named after ..." constructions.
         if (not t.is_newline_before and (isinstance(t.previous, TextToken))):
             s = t.previous.term
             if (s == "ИМЕНИ" or s == "ИМ"):
                 continue
             if (s == "." and t.previous.previous is not None and t.previous.previous.is_value("ИМ", None)):
                 continue
         for r in rli:
             if (isinstance(r, PersonReferent)):
                 if (r != rli[0]):
                     continue
                 p = Utils.asObjectOrNull(r, PersonReferent)
                 if (pers_typ != TitleItemToken.Types.UNDEFINED):
                     # A sentence boundary resets the pending role keyword.
                     if (t.previous is not None and t.previous.is_char('.')):
                         pers_typ = TitleItemToken.Types.UNDEFINED
                 typ = pr.calc_typ_from_attrs(p)
                 if (typ != TitleItemToken.Types.UNDEFINED):
                     pr.add(p, typ, 1)
                     pers_typ = typ
                 elif (pers_typ != TitleItemToken.Types.UNDEFINED):
                     pr.add(p, pers_typ, 1)
                 elif (t.previous is not None and t.previous.is_char('©')):
                     pers_typ = TitleItemToken.Types.WORKER
                     pr.add(p, pers_typ, 1)
                 else:
                     # Role unknown: look ahead for evidence that this
                     # person authored the document.
                     tt = t.next0_
                     first_pass3399 = True
                     while True:
                         if first_pass3399: first_pass3399 = False
                         else: tt = tt.next0_
                         if (not (tt is not None)): break
                         rr = tt.get_referent()
                         if (rr == res):
                             pers_typ = TitleItemToken.Types.WORKER
                             break
                         if (isinstance(rr, PersonReferent)):
                             if (pr.calc_typ_from_attrs(Utils.asObjectOrNull(r, PersonReferent)) != TitleItemToken.Types.UNDEFINED):
                                 break
                             else:
                                 continue
                         if (rr is not None):
                             break
                         tpt = TitleItemToken.try_attach(tt)
                         if (tpt is not None):
                             if (tpt.typ != TitleItemToken.Types.TYP and tpt.typ != TitleItemToken.Types.TYPANDTHEME):
                                 break
                             tt = tpt.end_token
                             if (tt.end_char > end_token.value.end_char):
                                 end_token.value = tt
                             continue
                     # Still unknown: look backwards for a student/author
                     # keyword or the document referent itself.
                     if (pers_typ == TitleItemToken.Types.UNDEFINED):
                         tt = t.previous
                         while tt is not None:
                             rr = tt.get_referent()
                             if (rr == res):
                                 pers_typ = TitleItemToken.Types.WORKER
                                 break
                             if (rr is not None):
                                 break
                             if ((tt.is_value("СТУДЕНТ", None) or tt.is_value("СТУДЕНТКА", None) or tt.is_value("СЛУШАТЕЛЬ", None)) or tt.is_value("ДИПЛОМНИК", None) or tt.is_value("ИСПОЛНИТЕЛЬ", None)):
                                 pers_typ = TitleItemToken.Types.WORKER
                                 break
                             tpt = TitleItemToken.try_attach(tt)
                             if (tpt is not None and tpt.typ != TitleItemToken.Types.TYP):
                                 break
                             tt = tt.previous
                     if (pers_typ != TitleItemToken.Types.UNDEFINED):
                         pr.add(p, pers_typ, 1)
                     else:
                         # No evidence either way: record with half weight.
                         pr.add(p, pers_typ, 0.5)
                     if (t.end_char > end_token.value.end_char):
                         end_token.value = t
                 continue
             if (r == rli[0]):
                 pers_typ = TitleItemToken.Types.UNDEFINED
             if (isinstance(r, DateReferent)):
                 if (res.date is None):
                     res.date = Utils.asObjectOrNull(r, DateReferent)
                     if (t.end_char > end_token.value.end_char):
                         end_token.value = t
             elif (isinstance(r, GeoReferent)):
                 if (res.city is None and r.is_city):
                     res.city = Utils.asObjectOrNull(r, GeoReferent)
                     if (t.end_char > end_token.value.end_char):
                         end_token.value = t
             if (isinstance(r, OrganizationReferent)):
                 org0_ = Utils.asObjectOrNull(r, OrganizationReferent)
                 # A "course N" pseudo-organization yields the student year.
                 if ("курс" in org0_.types and org0_.number is not None):
                     i = 0
                     wrapi2673 = RefOutArgWrapper(0)
                     inoutres2674 = Utils.tryParseInt(org0_.number, wrapi2673)
                     i = wrapi2673.value
                     if (inoutres2674):
                         if (i > 0 and (i < 8)):
                             res.student_year = i
                 # Climb from a department up to its owning organization.
                 while org0_.higher is not None:
                     if (org0_.kind != OrganizationKind.DEPARTMENT):
                         break
                     org0_ = org0_.higher
                 if (org0_.kind != OrganizationKind.DEPARTMENT):
                     if (res.org0_ is None):
                         res.org0_ = org0_
                     elif (OrganizationReferent.can_be_higher(res.org0_, org0_)):
                         res.org0_ = org0_
                 if (t.end_char > end_token.value.end_char):
                     end_token.value = t
             if ((isinstance(r, UriReferent)) or (isinstance(r, GeoReferent))):
                 if (t.end_char > end_token.value.end_char):
                     end_token.value = t
     # Phase 6: turn accumulated person/role relations into result slots.
     for ty in pers_types:
         for p in pr.get_persons(ty):
             if (pr.get_attr_name_for_type(ty) is not None):
                 res.add_slot(pr.get_attr_name_for_type(ty), p, False, 0)
     # Fall back: promote one role-less person to author if none was found.
     if (res.get_slot_value(TitlePageReferent.ATTR_AUTHOR) is None):
         for p in pr.get_persons(TitleItemToken.Types.UNDEFINED):
             res.add_slot(TitlePageReferent.ATTR_AUTHOR, p, False, 0)
             break
     # Derive the city from the organization's geo attribute when missing.
     if (res.city is None and res.org0_ is not None):
         s = res.org0_.find_slot(OrganizationReferent.ATTR_GEO, None, True)
         if (s is not None and (isinstance(s.value, GeoReferent))):
             if (s.value.is_city):
                 res.city = Utils.asObjectOrNull(s.value, GeoReferent)
     # Last resort for the date: parse the tokens right after a city
     # mention (a typical "City, 2015" imprint line).
     if (res.date is None):
         t = begin
         first_pass3400 = True
         while True:
             if first_pass3400: first_pass3400 = False
             else: t = t.next0_
             if (not (t is not None and t.end_char <= end_char)): break
             city = Utils.asObjectOrNull(t.get_referent(), GeoReferent)
             if (city is None):
                 continue
             if (isinstance(t.next0_, TextToken)):
                 if (t.next0_.is_char_of(":,") or t.next0_.is_hiphen):
                     t = t.next0_
             rt = t.kit.process_referent(DateAnalyzer.ANALYZER_NAME, t.next0_)
             if (rt is not None):
                 rt.save_to_local_ontology()
                 res.date = Utils.asObjectOrNull(rt.referent, DateReferent)
                 if (kit is not None):
                     kit.embed_token(rt)
                 break
     if (len(res.slots) == 0):
         return None
     else:
         return res
Exemplo n.º 8
0
 def addOccurenceOfRefTok(self, rt: 'ReferentToken') -> None:
     """Record the span of *rt* as a text occurrence of this referent."""
     anno = TextAnnotation._new700(rt.kit.sofa, rt.begin_char,
                                   rt.end_char, rt.referent)
     self.addOccurence(anno)
Exemplo n.º 9
0
 def add_occurence_of_ref_tok(self, rt: 'ReferentToken') -> None:
     """Record the span of *rt* as a text occurrence of this referent."""
     anno = TextAnnotation._new714(rt.kit.sofa, rt.begin_char,
                                   rt.end_char, rt.referent)
     self.add_occurence(anno)