def __str__(self) -> str:
    """Return the prefix followed by each country, every one preceded by a space.

    When ``self.countries`` is None the bare ``self.pref`` is returned.
    """
    countries = self.countries
    if countries is None:
        return self.pref
    out = Utils.newStringIO(self.pref)
    # Build the whole " <country>" tail first, then append it in one write.
    tail = "".join(" {0}".format(country) for country in countries)
    print(tail, end="", file=out, flush=True)
    return Utils.toStringStringIO(out)
def __str__(self) -> str:
    """Return a debug rendering: value, the set flags, preposition, chars and source text."""
    out = Utils.newStringIO(self.value)
    # (flag, label) pairs in the order the labels are emitted.
    labelled_flags = (
        (self.is_noun_phrase, " NounPrase"),
        (self.is_denomination, " Denom"),
        (self.is_in_dictionary, " InDictionary"),
        (self.is_after_conjunction, " IsAfterConjunction"),
        (self.is_std_tail, " IsStdTail"),
        (self.is_std_name, " IsStdName"),
        (self.is_ignored_part, " IsIgnoredPart"),
    )
    for is_set, label in labelled_flags:
        if is_set:
            print(label, end="", file=out)
    prep = self.preposition
    if prep is not None:
        print(" IsAfterPreposition '{0}'".format(prep), end="", file=out, flush=True)
    summary = " {0} ({1})".format(str(self.chars), self.getSourceText())
    print(summary, end="", file=out, flush=True)
    return Utils.toStringStringIO(out)
def attach_url(t0: 'Token') -> 'UriItemToken':
    """Try to read a URL (domain, optional port, path, query, fragment) starting at ``t0``.

    Args:
        t0: first token of the candidate URL.

    Returns:
        A ``UriItemToken`` spanning ``t0`` .. last consumed token with the
        assembled text as its value, or None when no domain name can be
        attached at ``t0`` or the assembled text contains no alphabetic
        character.
    """
    domain = UriItemToken.attach_domain_name(t0, True, False)
    if domain is None:
        return None

    text = Utils.newStringIO(domain.value)
    last = domain.end_token

    # Optional ":<number>" port, or the special "vk.com-<content>" form.
    follower = last.next0_
    if (follower is not None and follower.is_char(':')
            and isinstance(follower.next0_, NumberToken)):
        last = follower.next0_
        print(":{0}".format(last.value), end="", file=text, flush=True)
    elif (domain.value == "vk.com" and follower is not None
            and follower.is_hiphen and follower.next0_ is not None):
        last = follower.next0_
        content = UriItemToken.__attach_uri_content(last, ".-_+%", False)
        if content is not None:
            last = content.end_token
            print("/{0}".format(content.value), end="", file=text, flush=True)

    # Path: a run of "/<content>" segments with no whitespace in between.
    cur = last.next0_
    while cur is not None:
        if cur.is_whitespace_before or not cur.is_char('/'):
            break
        segment = None
        if not cur.is_whitespace_after:
            segment = UriItemToken.__attach_uri_content(cur.next0_, ".-_+%", False)
        if segment is None:
            # A trailing slash still belongs to the URL span.
            last = cur
            break
        last = segment.end_token
        print("/{0}".format(segment.value), end="", file=text, flush=True)
        cur = last.next0_

    # Query ("?...") and then fragment ("#..."), each glued to the previous token.
    for marker, allowed, template in (('?', ".-_+%=&", "?{0}"), ('#', ".-_+%", "#{0}")):
        follower = last.next0_
        if (follower is not None and follower.is_char(marker)
                and not follower.is_whitespace_after
                and not last.is_whitespace_after):
            part = UriItemToken.__attach_uri_content(follower.next0_, allowed, False)
            if part is not None:
                last = part.end_token
                print(template.format(part.value), end="", file=text, flush=True)

    # Reject a result that has no letter at all.
    pos = 0
    while pos < text.tell() and not str.isalpha(Utils.getCharAtStringIO(text, pos)):
        pos += 1
    if pos >= text.tell():
        return None
    return UriItemToken._new2706(t0, last, Utils.toStringStringIO(text))
def __str__(self) -> str:
    """Return the term re-cased according to ``char_info``, followed by its word forms.

    Returns the literal "Null" when the term is null or empty.
    """
    term = self.term
    if Utils.isNullOrEmpty(term):
        return "Null"

    info = self.char_info
    if info.is_all_lower:
        shown = term.lower()
    elif info.is_capital_upper and len(term) > 0:
        # Keep the first character, lower-case the rest.
        shown = "{0}{1}".format(term[0], term[1:].lower())
    elif info.is_last_lower:
        # Keep everything but the last character, which is lower-cased.
        shown = "{0}{1}".format(term[0:0 + len(term) - 1], term[len(term) - 1:].lower())
    else:
        shown = term

    forms = self.word_forms
    if forms is None:
        return shown
    out = Utils.newStringIO(shown)
    for form in forms:
        print(", {0}".format(str(form)), end="", file=out, flush=True)
    return Utils.toStringStringIO(out)
def __str__(self) -> str:
    """Return the caption (or the name as fallback) plus a multiplicity suffix.

    The suffix is emitted only when at least one bound is positive:
    ``[lo..*]`` when the upper bound is 0, ``[n]`` when both bounds are
    equal, otherwise ``[lo..hi]``.
    """
    out = Utils.newStringIO(Utils.ifNotNull(self.caption, self.name))
    upper = self.upper_bound
    lower = self.lower_bound
    if not (upper > 0 or lower > 0):
        return Utils.toStringStringIO(out)

    if upper == 0:
        suffix = "[{0}..*]".format(lower)
    elif upper == lower:
        suffix = "[{0}]".format(upper)
    else:
        suffix = "[{0}..{1}]".format(lower, upper)
    print(suffix, end="", file=out, flush=True)
    return Utils.toStringStringIO(out)
def to_string(self, short_variant: bool, lang: 'MorphLang' = None, lev: int = 0) -> str:
    """Render the measure from its template, values and units.

    Digit placeholders in ``self.template`` ('1' refers to the first
    collected value, '2' to the second, ...) are replaced by the values
    of the ATTR_VALUE slots; the text from ``out_units`` is appended.

    Args:
        short_variant: when false, the name, referenced measures and the
            kind are appended as well.
        lang: language forwarded to nested renderings and ``out_units``.
        lev: not used by this method.

    Returns:
        The assembled text.
    """
    out = Utils.newStringIO(self.template)

    # Collect substitution values in slot order.
    values = list()
    for slot in self.slots:
        if slot.type_name != MeasureReferent.ATTR_VALUE:
            continue
        if isinstance(slot.value, str):
            text = Utils.asObjectOrNull(slot.value, str)
            values.append("?" if text == "NaN" else text)
        elif isinstance(slot.value, Referent):
            values.append(slot.value.to_string(True, lang, 0))

    # Walk right-to-left so that inserted text does not shift pending positions.
    for pos in reversed(range(out.tell())):
        ch = Utils.getCharAtStringIO(out, pos)
        if not str.isdigit(ch):
            continue
        index = ord(ch) - ord('1')
        if 0 <= index < len(values):
            Utils.removeStringIO(out, pos, 1)
            Utils.insertStringIO(out, pos, values[index])

    print(self.out_units(lang), end="", file=out)

    if short_variant:
        return Utils.toStringStringIO(out)

    name = self.get_string_value(MeasureReferent.ATTR_NAME)
    if name is not None:
        print(" - {0}".format(name), end="", file=out, flush=True)
    for slot in self.slots:
        if slot.type_name == MeasureReferent.ATTR_REF and isinstance(slot.value, MeasureReferent):
            print(" / {0}".format(slot.value.to_string(True, lang, 0)), end="", file=out, flush=True)
    kind = self.kind
    if kind != MeasureKind.UNDEFINED:
        print(" ({0})".format(Utils.enumToString(kind).upper()), end="", file=out, flush=True)
    return Utils.toStringStringIO(out)
def toString(self, short_variant : bool, lang : 'MorphLang'=None, lev : int=0) -> str:
    """Render the measure from its template, values and units.

    Digit placeholders in ``self.template`` ('1' refers to the first
    collected value, '2' to the second, ...) are replaced by the values
    of the ATTR_VALUE slots. Units follow: the first as-is, the others
    joined with '*' or, for a power starting with '-', with '/' (and the
    power magnitude in angle brackets unless it is exactly "-1").

    Args:
        short_variant: when false, the name, referenced measures and the
            kind are appended as well.
        lang: language forwarded to nested renderings.
        lev: not used by this method.

    Returns:
        The assembled text.
    """
    out = Utils.newStringIO(self.template)

    # Collect substitution values in slot order.
    values = list()
    for slot in self.slots:
        if slot.type_name != MeasureReferent.ATTR_VALUE:
            continue
        if isinstance(slot.value, str):
            values.append(Utils.asObjectOrNull(slot.value, str))
        elif isinstance(slot.value, Referent):
            values.append((slot.value).toString(True, lang, 0))

    # Walk right-to-left so that inserted text does not shift pending positions.
    for pos in reversed(range(out.tell())):
        ch = Utils.getCharAtStringIO(out, pos)
        if not str.isdigit(ch):
            continue
        index = ord(ch) - ord('1')
        if 0 <= index < len(values):
            Utils.removeStringIO(out, pos, 1)
            Utils.insertStringIO(out, pos, values[index])

    units = self.units
    if len(units) > 0:
        print(units[0].toString(True, lang, 0), end="", file=out)
        k = 1
        while k < len(units):
            unit = units[k]
            power = unit.getStringValue(UnitReferent.ATTR_POW)
            is_divisor = not Utils.isNullOrEmpty(power) and power[0] == '-'
            if is_divisor:
                print("/{0}".format(unit.toString(True, lang, 1)), end="", file=out, flush=True)
                if power != "-1":
                    print("<{0}>".format(power[1:]), end="", file=out, flush=True)
            else:
                print("*{0}".format(unit.toString(True, lang, 0)), end="", file=out, flush=True)
            k += 1

    if short_variant:
        return Utils.toStringStringIO(out)

    name = self.getStringValue(MeasureReferent.ATTR_NAME)
    if name is not None:
        print(" - {0}".format(name), end="", file=out, flush=True)
    for slot in self.slots:
        if slot.type_name == MeasureReferent.ATTR_REF and isinstance(slot.value, MeasureReferent):
            print(" / {0}".format((slot.value).toString(True, lang, 0)), end="", file=out, flush=True)
    kind = self.kind
    if kind != MeasureKind.UNDEFINED:
        print(" ({0})".format(Utils.enumToString(kind).upper()), end="", file=out, flush=True)
    return Utils.toStringStringIO(out)
def __str__(self) -> str:
    """Return normal forms, the base-class rendering, misc info and the undef coefficient.

    Layout: ``<normal_case>[\\<normal_full>] <base str>[ <misc>][ (? <undef_coef>)]``;
    the separating space before the base rendering appears only when
    something was written before it.
    """
    case_form = self.normal_case
    full_form = self.normal_full
    out = Utils.newStringIO(Utils.ifNotNull(case_form, ""))
    if full_form is not None and full_form != case_form:
        print("\\{0}".format(full_form), end="", file=out, flush=True)
    if out.tell() > 0:
        print(' ', end="", file=out)
    print(super().__str__(), end="", file=out)

    misc_text = str(self.misc) if self.misc is not None else None
    if not Utils.isNullOrEmpty(misc_text):
        print(" {0}".format(misc_text), end="", file=out, flush=True)
    if self.undef_coef > 0:
        print(" (? {0})".format(self.undef_coef), end="", file=out, flush=True)
    return Utils.toStringStringIO(out)
def correct_word_by_morph(self, word: str) -> str:
    """Try to correct a word with a single-character edit checked against the morph tree.

    Three kinds of edit are probed in turn, a later one only when the
    earlier ones produced no candidate: replacing one character with
    '*', inserting '*', and deleting one character. Position 0 is never
    edited. Each edited pattern is passed to ``__check_corr_var``
    starting from ``self.m_root``.

    Args:
        word: the word to correct.

    Returns:
        The candidate when exactly one distinct candidate was found,
        otherwise None.
    """
    candidates = list()
    buf = Utils.newStringIO(len(word))

    def probe(stop, edit):
        # Apply ``edit`` at every position in [1, stop) to a fresh copy of the word.
        for pos in range(1, stop):
            Utils.setLengthStringIO(buf, 0)
            print(word, end="", file=buf)
            edit(pos)
            found = self.__check_corr_var(Utils.toStringStringIO(buf), self.m_root, 0)
            if found is not None and found not in candidates:
                candidates.append(found)

    probe(len(word), lambda pos: Utils.setCharAtStringIO(buf, pos, '*'))
    if not candidates:
        probe(len(word), lambda pos: Utils.insertStringIO(buf, pos, '*'))
    if not candidates:
        probe(len(word) - 1, lambda pos: Utils.removeStringIO(buf, pos, 1))

    return candidates[0] if len(candidates) == 1 else None
def transliteral_correction(value: str, prev_value: str, always: bool = False) -> str:
    """Transliteral correction of a word that mixes Cyrillic and Latin letters.

    The word is first classified character by character, then rewritten
    so that letters found in one of the ``_m_cyr_chars`` /
    ``_m_lat_chars`` tables are replaced by the letter at the same index
    of the other table.
    NOTE(review): the two tables are presumably parallel lists of
    visually identical letters - confirm against their definitions.

    Args:
        value(str): the word to correct.
        prev_value(str): preceding word; its first character breaks the
            tie when ``value`` consists only of table letters from both
            alphabets. May be None or empty.
        always(bool): allow a conversion even when the word has no letter
            that belongs unambiguously to the target alphabet.

    Returns:
        The corrected word, or ``value`` itself when it contains an
        unsupported character, unambiguous letters of both alphabets, or
        nothing to correct.
    """
    pure_cyr = 0   # Cyrillic letters absent from _m_cyr_chars
    pure_lat = 0   # Latin letters absent from _m_lat_chars
    ques_cyr = 0   # Cyrillic letters present in _m_cyr_chars
    ques_lat = 0   # Latin letters present in _m_lat_chars
    udar_cyr = 0   # letters present in __m_udar_chars
    y = False      # the pair 'Ь' + 'I' was seen (rewritten to 'Ы' below)
    udaren = False  # a non-letter flagged is_udaren was seen

    for i, ch in enumerate(value):
        ui = UnicodeInfo.ALL_CHARS[ord(ch)]
        if (not ui.is_letter):
            if (ui.is_udaren):
                udaren = True
                continue
            if (ui.is_apos and len(value) > 2):
                # Drop every occurrence of this character and start over.
                return LanguageHelper.transliteral_correction(
                    value.replace("{0}".format(ch), ""), prev_value, False)
            return value
        if (ui.is_cyrillic):
            if (LanguageHelper._m_cyr_chars.find(ch) >= 0):
                ques_cyr += 1
            else:
                pure_cyr += 1
        elif (ui.is_latin):
            if (LanguageHelper._m_lat_chars.find(ch) >= 0):
                ques_lat += 1
            else:
                pure_lat += 1
        elif (LanguageHelper.__m_udar_chars.find(ch) >= 0):
            udar_cyr += 1
        else:
            return value
        if (ch == 'Ь' and ((i + 1) < len(value)) and value[i + 1] == 'I'):
            y = True

    # Decide the direction of the conversion.
    to_rus = False
    to_lat = False
    if (pure_lat > 0 and pure_cyr > 0):
        return value
    if ((pure_lat > 0 or always) and ques_cyr > 0):
        to_lat = True
    elif ((pure_cyr > 0 or always) and ques_lat > 0):
        to_rus = True
    elif (pure_cyr == 0 and pure_lat == 0):
        if (ques_cyr > 0 and ques_lat > 0):
            # Only table letters: follow the previous word, else the majority.
            if (not Utils.isNullOrEmpty(prev_value)):
                if (LanguageHelper.is_cyrillic_char(prev_value[0])):
                    to_rus = True
                elif (LanguageHelper.is_latin_char(prev_value[0])):
                    to_lat = True
            if (not to_lat and not to_rus):
                if (ques_cyr > ques_lat):
                    to_rus = True
                elif (ques_cyr < ques_lat):
                    to_lat = True
    if (not to_rus and not to_lat):
        if (not y and not udaren and udar_cyr == 0):
            return value

    tmp = Utils.newStringIO(value)
    i = 0
    while i < tmp.tell():
        cur = Utils.getCharAtStringIO(tmp, i)
        if (cur == 'Ь' and ((i + 1) < tmp.tell())
                and Utils.getCharAtStringIO(tmp, i + 1) == 'I'):
            Utils.setCharAtStringIO(tmp, i, 'Ы')
            Utils.removeStringIO(tmp, i + 1, 1)
            i += 1
            continue
        cod = ord(cur)
        if (cod >= 0x300 and (cod < 0x370)):
            # Combining diacritical mark: delete it and stay on the same
            # index, because the next character has just shifted into it.
            # Advancing here would skip that character.
            Utils.removeStringIO(tmp, i, 1)
            continue
        if (to_rus):
            ii = LanguageHelper._m_lat_chars.find(cur)
            if (ii >= 0):
                Utils.setCharAtStringIO(tmp, i, LanguageHelper._m_cyr_chars[ii])
            else:
                ii = LanguageHelper.__m_udar_chars.find(cur)
                if (ii >= 0):
                    Utils.setCharAtStringIO(
                        tmp, i, LanguageHelper.__m_udar_cyr_chars[ii])
        elif (to_lat):
            ii = LanguageHelper._m_cyr_chars.find(cur)
            if (ii >= 0):
                Utils.setCharAtStringIO(tmp, i, LanguageHelper._m_lat_chars[ii])
            else:
                # NOTE(review): the Latin direction also maps through
                # __m_udar_cyr_chars, as in the original - confirm intended.
                ii = LanguageHelper.__m_udar_chars.find(cur)
                if (ii >= 0):
                    Utils.setCharAtStringIO(
                        tmp, i, LanguageHelper.__m_udar_cyr_chars[ii])
        i += 1
    return Utils.toStringStringIO(tmp)
def __str__(self) -> str:
    """Return the referent's text ("Null" when absent), followed by the morph info if any."""
    if self.referent is None:
        head = "Null"
    else:
        head = str(self.referent)
    out = Utils.newStringIO(head)
    morph = self.morph
    if morph is not None:
        print(" {0}".format(str(morph)), end="", file=out, flush=True)
    return Utils.toStringStringIO(out)
def __str__(self) -> str:
    """Return the term followed by every morph item, each introduced by ", "."""
    out = Utils.newStringIO(self.term)
    # Build the whole ", <item>" tail first, then append it in one write.
    tail = "".join(", {0}".format(str(item)) for item in self.morph.items)
    print(tail, end="", file=out, flush=True)
    return Utils.toStringStringIO(out)
def __doCrLfCorrection(self, txt: str) -> str:
    """Analyse forcibly (hard-wrapped) formatted text and join its wrapped lines.

    A first pass gathers statistics on lines that look like they were
    cut by a hard line break. If there are enough of them and their
    average length is plausible, a second pass replaces such line breaks
    with spaces.

    Args:
        txt(str): the source text.

    Returns:
        ``txt`` itself when the statistics do not indicate hard wrapping,
        otherwise the corrected text. ``self.crlf_corrected_count`` is
        incremented once per replaced line break.
    """
    cou = 0        # number of lines that look hard-wrapped
    total_len = 0  # summed length of those lines
    i = 0
    first_pass3166 = True
    while True:
        # Emulates ``for (i = 0; i < len(txt); i++)`` so that ``continue``
        # still performs the increment.
        if first_pass3166:
            first_pass3166 = False
        else:
            i += 1
        if (not (i < len(txt))):
            break
        ch = txt[i]
        # Only positions holding CR or LF start a measurement.
        if ((ord(ch)) != 0xD and (ord(ch)) != 0xA):
            continue
        # Measure the line that follows, up to the next CR/LF.
        len0_ = 0
        last_char = ch
        j = (i + 1)
        while j < len(txt):
            ch = txt[j]
            if ((ord(ch)) == 0xD or (ord(ch)) == 0xA):
                break
            elif ((ord(ch)) == 0x9):
                # A tab counts as 5 characters.
                len0_ += 5
            else:
                last_char = ch
                len0_ += 1
            j += 1
        # No further line break: nothing more to analyse.
        if (j >= len(txt)):
            break
        # Lines shorter than 30 are ignored.
        if (len0_ < 30):
            continue
        if (last_char != '.' and last_char != ':' and last_char != ';'):
            # Check whether the first non-whitespace character after the
            # break is a digit.
            next_is_dig = False
            k = j + 1
            while k < len(txt):
                if (not Utils.isWhitespace(txt[k])):
                    if (str.isdigit(txt[k])):
                        next_is_dig = True
                    break
                k += 1
            if (not next_is_dig):
                cou += 1
                total_len += len0_
        # Resume at the break that ended this line.
        i = j
    # Fewer than 4 candidate lines: leave the text unchanged.
    if (cou < 4):
        return txt
    # Average candidate length; outside 50..100 the text is left unchanged.
    total_len = math.floor(total_len / cou)
    if ((total_len < 50) or total_len > 100):
        return txt
    tmp = Utils.newStringIO(txt)
    i = 0
    while i < tmp.tell():
        ch = Utils.getCharAtStringIO(tmp, i)
        # Measure from i + 1 up to the next CR/LF (tab counts as 5).
        len0_ = 0
        last_char = ch
        j = (i + 1)
        while j < tmp.tell():
            ch = Utils.getCharAtStringIO(tmp, j)
            if ((ord(ch)) == 0xD or (ord(ch)) == 0xA):
                break
            elif ((ord(ch)) == 0x9):
                len0_ += 5
            else:
                last_char = ch
                len0_ += 1
            j += 1
        if (j >= tmp.tell()):
            break
        # Last non-whitespace character before the break; this overrides
        # the ``last_char`` gathered while measuring.
        for jj in range(j - 1, -1, -1):
            last_char = Utils.getCharAtStringIO(tmp, jj)
            if (not Utils.isWhitespace(last_char)):
                break
        else:
            jj = -1  # has no effect: jj is reassigned right below
        # ``not_single`` becomes True when only whitespace separates this
        # break from another CR/LF.
        not_single = False
        jj = (j + 1)
        # Step over the LF of a CR LF pair.
        if ((jj < tmp.tell()) and (ord(Utils.getCharAtStringIO(tmp, j))) == 0xD and (ord(Utils.getCharAtStringIO(tmp, jj))) == 0xA):
            jj += 1
        while jj < tmp.tell():
            ch = Utils.getCharAtStringIO(tmp, jj)
            if (not Utils.isWhitespace(ch)):
                break
            if ((ord(ch)) == 0xD or (ord(ch)) == 0xA):
                not_single = True
                break
            jj += 1
        # Join when the break is single, the length lies strictly between
        # average - 20 and average + 10, and the line does not end with
        # '.', ':' or ';'.
        if (((not not_single and len0_ > (total_len - 20) and (len0_ < (total_len + 10))) and last_char != '.' and last_char != ':') and last_char != ';'):
            Utils.setCharAtStringIO(tmp, j, ' ')
            self.crlf_corrected_count += 1
            if ((j + 1) < tmp.tell()):
                ch = Utils.getCharAtStringIO(tmp, j + 1)
                if ((ord(ch)) == 0xA):
                    # Blank out the LF that follows as well.
                    Utils.setCharAtStringIO(tmp, j + 1, ' ')
                    j += 1
        # Together with the increment below, the next iteration starts at j.
        i = (j - 1)
        i += 1
    return Utils.toStringStringIO(tmp)
def __calcTransliteralStatistics(txt: str, info: io.StringIO) -> int:
    """Run the transliteral-correction routine on a scratch copy of ``txt``.

    Args:
        txt: text to analyse; may be None.
        info: buffer passed through to ``__doTransliteralCorrection``.

    Returns:
        0 when ``txt`` is None, otherwise the value returned by
        ``__doTransliteralCorrection``.
    """
    if txt is not None:
        scratch = Utils.newStringIO(txt)
        return SourceOfAnalysis.__doTransliteralCorrection(scratch, info)
    return 0