def __mergeLetters(self) -> None:
    """Collapse runs of spaced-out single letters into one word token.

    Walks the chain that starts at ``self.first_token``. A run begins at a
    one-character letter token preceded by more than two whitespaces (or by
    exactly two when the immediately preceding run was merged) and extends
    over following one-character tokens that each have exactly one
    whitespace before them. The run is replaced by a single ``TextToken``
    when it is long enough and its concatenated source text is analysed by
    ``Morphology.process`` as exactly one token having at least one word
    form flagged ``is_in_dictionary``.
    """
    buffer = io.StringIO()

    def try_merge(head, after_word):
        # Returns (token the outer loop resumes after, new "after word" flag).
        # NOTE(review): asObjectOrNull presumably yields None for tokens that
        # are not TextToken; there is no guard here, so this relies on the
        # chain holding only text tokens at this stage - confirm.
        letter = Utils.asObjectOrNull(head, TextToken)
        if not letter.chars.is_letter or letter.length_char != 1:
            return head, False

        gap = head.whitespaces_before_count
        if not (gap > 2 or (gap == 2 and after_word)):
            return head, False

        # Collect the source text of the first letter and of every follower.
        Utils.setLengthStringIO(buffer, 0)
        print(letter.getSourceText(), end="", file=buffer)
        tail = head
        followers = 0
        while tail.next0_ is not None:
            candidate = Utils.asObjectOrNull(tail.next0_, TextToken)
            if candidate.length_char != 1 or candidate.whitespaces_before_count != 1:
                break
            followers += 1
            print(candidate.getSourceText(), end="", file=buffer)
            tail = tail.next0_

        # Need more than three followers, or more than one right after a
        # previously merged word.
        if not (followers > 3 or (followers > 1 and after_word)):
            return head, False

        parsed = Morphology.process(Utils.toStringStringIO(buffer), None, None)
        if parsed is None or len(parsed) != 1:
            return tail, False
        if not any(form.is_in_dictionary for form in parsed[0].word_forms):
            return tail, False

        # Splice the merged token in place of the whole run.
        # NOTE(review): assumes assigning previous / next0_ keeps both
        # directions of the chain consistent - confirm against Token.
        merged = TextToken(parsed[0], self)
        if head == self.first_token:
            self.first_token = merged
        else:
            merged.previous = head.previous
        merged.next0_ = tail.next0_
        merged.begin_char = head.begin_char
        merged.end_char = tail.end_char
        return merged, True

    after_word = False
    current = self.first_token
    while current is not None:
        current, after_word = try_merge(current, after_word)
        current = current.next0_
def __correctWordsByMerging(self, lang : 'MorphLang') -> None:
    """Join a word with the word after it when the concatenation is known.

    For every token that still has a successor, the token and its neighbour
    (skipping one hyphen that is not followed by a line break) are
    concatenated. The pair is replaced by a single ``TextToken`` when the
    concatenation is analysed as exactly one token whose dictionary morph
    class is defined.

    Args:
        lang: language passed to ``Morphology.process`` for the joined word.
    """

    def try_join(left):
        # Returns the replacement token, or None when the pair is left alone.
        if not left.chars.is_letter or left.length_char < 2:
            return None
        left_class = left.getMorphClassInDictionary()
        if left.morph.containsAttr("прдктв.", None):
            return None

        right = left.next0_
        if right.is_hiphen and right.next0_ is not None and not right.is_newline_after:
            right = right.next0_
        if right.length_char == 1:
            return None

        # Both parts must be letters of the same script (Latin or not).
        if (not right.chars.is_letter
                or not left.chars.is_letter
                or right.chars.is_latin_letter != left.chars.is_latin_letter):
            return None

        # Case filter, same short-circuit order as the original if/elif chain.
        if ((right.chars.is_all_upper and not left.chars.is_all_upper)
                or not right.chars.is_all_lower
                or left.chars.is_all_upper):
            return None

        if right.morph.containsAttr("прдктв.", None):
            return None
        right_class = right.getMorphClassInDictionary()
        # If both halves have a defined dictionary class there is nothing to fix.
        if not right_class.is_undefined and not left_class.is_undefined:
            return None
        if len(left.term) + len(right.term) < 6:
            return None

        joined = left.term + right.term
        parsed = Morphology.process(joined, lang, None)
        if parsed is None or len(parsed) != 1:
            return None
        # NOTE(review): both excluded words are four letters long, so after
        # the length-6 check above this test looks unreachable - confirm.
        if joined in ("ПОСТ", "ВРЕД"):
            return None

        fused = TextToken(parsed[0], self)
        if fused.getMorphClassInDictionary().is_undefined:
            return None

        fused.begin_char = left.begin_char
        fused.end_char = right.end_char
        fused.chars = left.chars
        if left == self.first_token:
            self.first_token = fused
        else:
            left.previous.next0_ = fused
        if right.next0_ is not None:
            fused.next0_ = right.next0_
        return fused

    current = self.first_token
    while current is not None and current.next0_ is not None:
        replacement = try_join(current)
        if replacement is not None:
            current = replacement
        current = current.next0_
def __correctWordsByMorph(self, lang : 'MorphLang') -> None:
    """Replace out-of-dictionary words with the spelling proposed by morphology.

    Each ``TextToken`` of at least four characters whose dictionary morph
    class is undefined is passed to ``Morphology.correctWord``. When a
    correction comes back and is analysed as exactly one token, the original
    token is replaced in the chain and the replacement is recorded in
    ``self.corrected_tokens`` together with its source text.

    Args:
        lang: fallback language for the correction when the token's own
            language is undefined; also used to analyse the corrected word.
    """

    def try_respell(word):
        # Returns the replacement token, or None when the word is kept.
        if not isinstance(word, TextToken):
            return None
        if word.morph.containsAttr("прдктв.", None):
            return None
        if not word.getMorphClassInDictionary().is_undefined or word.length_char < 4:
            return None
        # Capitalised surnames and all-caps words are left untouched.
        if word.morph.class0_.is_proper_surname and not word.chars.is_all_lower:
            return None
        if word.chars.is_all_upper:
            return None

        if word.morph.language.is_undefined:
            word_lang = lang
        else:
            word_lang = word.morph.language
        suggestion = Morphology.correctWord(word.term, word_lang)
        if suggestion is None:
            return None
        parsed = Morphology.process(suggestion, lang, None)
        if parsed is None or len(parsed) != 1:
            return None

        fixed = TextToken._new541(parsed[0], self, word.chars, word.begin_char, word.end_char, word.term)
        if fixed.getMorphClassInDictionary().is_proper_surname:
            return None

        if word == self.first_token:
            self.first_token = fixed
        else:
            word.previous.next0_ = fixed
        fixed.next0_ = word.next0_

        if self.corrected_tokens is None:
            self.corrected_tokens = dict()
        self.corrected_tokens[fixed] = fixed.getSourceText()
        return fixed

    current = self.first_token
    while current is not None:
        replacement = try_respell(current)
        if replacement is not None:
            current = replacement
        current = current.next0_
def __init__(self, source: str = None, lang_: 'MorphLang' = None, source_is_normal: bool = False) -> None:
    """Create a termin from a string, adding all morphological spelling variants.

    Args:
        source(str): the source string; when None only the fields are initialised
        lang_(MorphLang): the possible language
        source_is_normal(bool): when true, morphological variants are not added
            (equivalent to calling initByNormalText)
    """
    self.terms = list()
    self.additional_vars = None
    self.__m_canonic_text = None
    self.ignore_terms_order = False
    self.acronym = None
    self.acronym_smart = None
    self.acronym_can_be_lower = False
    self.abridges = None
    self.lang = MorphLang()
    self.tag = None
    self.tag2 = None

    if source is None:
        return

    # Normalised text bypasses morphological analysis entirely.
    if source_is_normal or Termin.ASSIGN_ALL_TEXTS_AS_NORMAL:
        self.initByNormalText(source, lang_)
        return

    morph_tokens = Morphology.process(source, lang_, None)
    if morph_tokens is not None:
        for morph_token in morph_tokens:
            text_token = TextToken(morph_token, None)
            self.terms.append(Termin.Term(text_token, not source_is_normal))
    self.lang = MorphLang(lang_)
def __init__(self, sofa_ : 'SourceOfAnalysis'=None, only_tokenizing : bool=False, lang : 'MorphLang'=None, progress : EventHandler=None) -> None:
    """Build the token chain for a source text and run the tokenizer passes.

    Steps, in order: morphological tokenisation of ``sofa_.text`` (applying
    ``sofa_.correction_dict`` when present), the optional clean-up and
    correction passes selected by flags on ``sofa_``, letter merging, base
    language detection, embedding of number tokens, and finally a pass over
    out-of-dictionary Cyrillic words followed by statistics creation.

    Args:
        sofa_: source of analysis; when None only the fields are initialised.
        only_tokenizing: when true, return right after number tokens are
            embedded, skipping the final pass and statistics creation.
        lang: language passed through to every ``Morphology.process`` call
            and to the correction passes.
        progress: accepted for interface compatibility; not referenced by
            this constructor.
    """
    self._start_date = datetime.datetime(1, 1, 1, 0, 0, 0)
    self.corrected_tokens = None
    self.first_token = None
    self.__m_entities = list()
    self.ontology = None
    self.base_language = MorphLang()
    self.__m_sofa = None
    self.statistics = None
    self.__m_datas = dict()
    self.misc_data = dict()
    self.processor = None
    self.recurse_level = 0
    self._m_analyzer_stack = list()
    if sofa_ is None:
        return

    self.__m_sofa = sofa_
    self._start_date = datetime.datetime.now()

    # Pass 1: turn morph tokens into a linked chain of text tokens.
    morph_tokens = Morphology.process(sofa_.text, lang, None)
    chain_tail = None
    if morph_tokens is not None:
        for morph_token in morph_tokens:
            token = TextToken(morph_token, self)
            if sofa_.correction_dict is not None:
                found = RefOutArgWrapper(None)
                if Utils.tryGetValue(sofa_.correction_dict, morph_token.term, found):
                    respelled = Morphology.process(found.value, lang, None)
                    if respelled is not None and len(respelled) == 1:
                        # Keep the original position and character info on
                        # the corrected token.
                        fixed = TextToken._new538(respelled[0], self, token.term)
                        fixed.begin_char = token.begin_char
                        fixed.end_char = token.end_char
                        fixed.chars = token.chars
                        token = fixed
                        if self.corrected_tokens is None:
                            self.corrected_tokens = dict()
                        self.corrected_tokens[token] = token.getSourceText()
            if chain_tail is None:
                self.first_token = token
            else:
                chain_tail.next0_ = token
            chain_tail = token

    # Optional clean-up / correction passes selected by the source flags.
    if sofa_.clear_dust:
        self.__clearDust()
    if sofa_.do_words_merging_by_morph:
        self.__correctWordsByMerging(lang)
    if sofa_.do_word_correction_by_morph:
        self.__correctWordsByMorph(lang)
    self.__mergeLetters()
    self.__defineBaseLanguage()

    # Pass 2: embed number tokens; scanning resumes after the embedded token.
    t = self.first_token
    while t is not None:
        number = NumberHelper._tryParseNumber(t)
        if number is not None:
            self.embedToken(number)
            t = number
        t = t.next0_

    if only_tokenizing:
        return

    def agreeing_neighbour(word, ending, step):
        # Returns the token next to ``word`` in the direction given by
        # ``step`` (skipping one comma/"and", preposition or conjunction)
        # when it has a defined dictionary class, shares morph class bits
        # with ``word``, is longer than four characters and has the same
        # two-character ending slice; otherwise None.
        near = step(word)
        if near is not None and (near.is_comma_and
                                 or near.morph.class0_.is_preposition
                                 or near.morph.class0_.is_conjunction):
            near = step(near)
        if near is None:
            return None
        if near.getMorphClassInDictionary().is_undefined:
            return None
        if (near.morph.class0_.value & word.morph.class0_.value) == 0:
            return None
        if not near.length_char > 4:
            return None
        if sofa_.text[near.end_char - 1:near.end_char + 1] != ending:
            return None
        return near

    # Pass 3: narrow the morphology of unknown Cyrillic words using a
    # neighbouring known word with the same ending.
    t = self.first_token
    while t is not None:
        if not t.morph.class0_.is_preposition:
            word_class = t.getMorphClassInDictionary()
            if word_class.is_undefined and t.chars.is_cyrillic_letter and t.length_char > 4:
                # Two-character slice starting at end_char - 1 (presumably the
                # word's last two letters if end_char is inclusive - confirm).
                ending = sofa_.text[t.end_char - 1:t.end_char + 1]
                model = agreeing_neighbour(t, ending, lambda tok: tok.previous)
                if model is None:
                    model = agreeing_neighbour(t, ending, lambda tok: tok.next0_)
                if model is not None:
                    t.morph.removeItemsEx(model.morph, model.getMorphClassInDictionary())
        t = t.next0_

    self.__createStatistics()