Exemplo n.º 1
0
 def __correctWordsByMerging(self, lang : 'MorphLang') -> None:
     t = self.first_token
     first_pass2798 = True
     while True:
         if first_pass2798: first_pass2798 = False
         else: t = t.next0_
         if (not (t is not None and t.next0_ is not None)): break
         if (not t.chars.is_letter or (t.length_char < 2)): 
             continue
         mc0 = t.getMorphClassInDictionary()
         if (t.morph.containsAttr("прдктв.", None)): 
             continue
         t1 = t.next0_
         if (t1.is_hiphen and t1.next0_ is not None and not t1.is_newline_after): 
             t1 = t1.next0_
         if (t1.length_char == 1): 
             continue
         if (not t1.chars.is_letter or not t.chars.is_letter or t1.chars.is_latin_letter != t.chars.is_latin_letter): 
             continue
         if (t1.chars.is_all_upper and not t.chars.is_all_upper): 
             continue
         elif (not t1.chars.is_all_lower): 
             continue
         elif (t.chars.is_all_upper): 
             continue
         if (t1.morph.containsAttr("прдктв.", None)): 
             continue
         mc1 = t1.getMorphClassInDictionary()
         if (not mc1.is_undefined and not mc0.is_undefined): 
             continue
         if ((len((t).term) + len((t1).term)) < 6): 
             continue
         corw = (t).term + (t1).term
         ccc = Morphology.process(corw, lang, None)
         if (ccc is None or len(ccc) != 1): 
             continue
         if (corw == "ПОСТ" or corw == "ВРЕД"): 
             continue
         tt = TextToken(ccc[0], self)
         if (tt.getMorphClassInDictionary().is_undefined): 
             continue
         tt.begin_char = t.begin_char
         tt.end_char = t1.end_char
         tt.chars = t.chars
         if (t == self.first_token): 
             self.first_token = (tt)
         else: 
             t.previous.next0_ = tt
         if (t1.next0_ is not None): 
             tt.next0_ = t1.next0_
         t = (tt)
Exemplo n.º 2
0
 def __init__(self, sofa_ : 'SourceOfAnalysis'=None, only_tokenizing : bool=False, lang : 'MorphLang'=None, progress : EventHandler=None) -> None:
     self._start_date = datetime.datetime(1, 1, 1, 0, 0, 0)
     self.corrected_tokens = None
     self.first_token = None;
     self.__m_entities = list()
     self.ontology = None;
     self.base_language = MorphLang()
     self.__m_sofa = None;
     self.statistics = None;
     self.__m_datas = dict()
     self.misc_data = dict()
     self.processor = None;
     self.recurse_level = 0
     self._m_analyzer_stack = list()
     if (sofa_ is None): 
         return
     self.__m_sofa = sofa_
     self._start_date = datetime.datetime.now()
     tokens = Morphology.process(sofa_.text, lang, None)
     t0 = None
     if (tokens is not None): 
         ii = 0
         while ii < len(tokens): 
             mt = tokens[ii]
             if (mt.begin_char == 733860): 
                 pass
             tt = TextToken(mt, self)
             if (sofa_.correction_dict is not None): 
                 wrapcorw539 = RefOutArgWrapper(None)
                 inoutres540 = Utils.tryGetValue(sofa_.correction_dict, mt.term, wrapcorw539)
                 corw = wrapcorw539.value
                 if (inoutres540): 
                     ccc = Morphology.process(corw, lang, None)
                     if (ccc is not None and len(ccc) == 1): 
                         tt1 = TextToken._new538(ccc[0], self, tt.term)
                         tt1.begin_char = tt.begin_char
                         tt1.end_char = tt.end_char
                         tt1.chars = tt.chars
                         tt = tt1
                         if (self.corrected_tokens is None): 
                             self.corrected_tokens = dict()
                         self.corrected_tokens[tt] = tt.getSourceText()
             if (t0 is None): 
                 self.first_token = (tt)
             else: 
                 t0.next0_ = tt
             t0 = (tt)
             ii += 1
     if (sofa_.clear_dust): 
         self.__clearDust()
     if (sofa_.do_words_merging_by_morph): 
         self.__correctWordsByMerging(lang)
     if (sofa_.do_word_correction_by_morph): 
         self.__correctWordsByMorph(lang)
     self.__mergeLetters()
     self.__defineBaseLanguage()
     t = self.first_token
     first_pass2794 = True
     while True:
         if first_pass2794: first_pass2794 = False
         else: t = t.next0_
         if (not (t is not None)): break
         nt = NumberHelper._tryParseNumber(t)
         if (nt is None): 
             continue
         self.embedToken(nt)
         t = (nt)
     if (only_tokenizing): 
         return
     t = self.first_token
     first_pass2795 = True
     while True:
         if first_pass2795: first_pass2795 = False
         else: t = t.next0_
         if (not (t is not None)): break
         if (t.morph.class0_.is_preposition): 
             continue
         mc = t.getMorphClassInDictionary()
         if (mc.is_undefined and t.chars.is_cyrillic_letter and t.length_char > 4): 
             tail = sofa_.text[t.end_char - 1:t.end_char - 1+2]
             tte = None
             tt = t.previous
             if (tt is not None and ((tt.is_comma_and or tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction))): 
                 tt = tt.previous
             if ((tt is not None and not tt.getMorphClassInDictionary().is_undefined and (((tt.morph.class0_.value) & (t.morph.class0_.value))) != 0) and tt.length_char > 4): 
                 tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1+2]
                 if (tail2 == tail): 
                     tte = tt
             if (tte is None): 
                 tt = t.next0_
                 if (tt is not None and ((tt.is_comma_and or tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction))): 
                     tt = tt.next0_
                 if ((tt is not None and not tt.getMorphClassInDictionary().is_undefined and (((tt.morph.class0_.value) & (t.morph.class0_.value))) != 0) and tt.length_char > 4): 
                     tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1+2]
                     if (tail2 == tail): 
                         tte = tt
             if (tte is not None): 
                 t.morph.removeItemsEx(tte.morph, tte.getMorphClassInDictionary())
         continue
     self.__createStatistics()