def markup_choices_for_prereform_spelling(text):
    """Run ``Processor.process_text`` over ``text`` piecewise, keeping markup.

    The input is split into existing ``<choice ...>...</choice>`` elements
    and the plain text between them:

    * For a ``<choice>`` element ending in ``<corr>...</corr></choice>``,
      only the content of ``<corr>`` is processed and the element is
      rebuilt around the result.  Elements without such a ``<corr>`` part
      are returned unchanged (previously they raised ``AttributeError``).
    * Plain text is processed one whitespace-separated word at a time.
      Words containing ``>[digits]`` are copied through untouched.  Runs of
      whitespace inside a segment are collapsed to a single space, but the
      whitespace at the segment's edges is preserved so that words are not
      glued to a neighbouring ``<choice>`` element.

    Args:
        text: Source string, possibly containing ``<choice>`` elements.

    Returns:
        The reassembled string.
    """
    choice_pattern = re.compile(r'(<choice.*?>.*?</choice>)')
    corr_pattern = re.compile(r'<choice(.*?)<corr>(.*?)</corr></choice>')
    bracketed_number_pattern = re.compile(r'(>\[[0-9]*\])')

    def process(fragment):
        # Only the rewritten text is needed; the other two results are unused.
        result, _changes, _report = Processor.process_text(
            text=fragment,
            show=True,
            delimiters=['<choice><reg>', '</reg><orig>', '</orig></choice>'],
            check_brackets=False,
        )
        return result

    segments = choice_pattern.split(text)
    for i, segment in enumerate(segments):
        if choice_pattern.search(segment) is not None:
            corr_match = corr_pattern.search(segment)
            if corr_match is None:
                # A <choice> without a trailing <corr>: nothing to rewrite.
                continue
            attrs_and_prefix = corr_match.group(1)
            corrected = process(corr_match.group(2))
            segments[i] = (
                f'<choice{attrs_and_prefix}<corr>{corrected}</corr></choice>'
            )
            continue

        core = segment.strip()
        if not core:
            # Empty or whitespace-only separator: keep it verbatim.
            continue
        leading = segment[:len(segment) - len(segment.lstrip())]
        trailing = segment[len(segment.rstrip()):]
        words = [
            word if bracketed_number_pattern.search(word) else process(word)
            for word in core.split()
        ]
        segments[i] = leading + " ".join(words) + trailing

    return ''.join(segments)
def change_spelling(orig_text):
    """Return ``orig_text`` as rewritten by ``Processor.process_text``.

    The processor is called with ``show=True`` and the
    ``<choice><reg>...</reg><orig>...</orig></choice>`` delimiter triple;
    bracket checking is disabled.  Only the first element of the
    processor's three-part result is returned.

    NOTE(review): judging by the function name, the processor presumably
    modernises spelling -- confirm against ``Processor``.
    """
    choice_delimiters = ['<choice><reg>', '</reg><orig>', '</orig></choice>']
    converted, _changes, _report = Processor.process_text(
        text=orig_text,
        show=True,
        delimiters=choice_delimiters,
        check_brackets=False,
    )
    return converted
def get_date2(text):
    """Extract the date-like words from ``text`` and parse them.

    Steps for string input:

    1. Replace Cyrillic ``І`` with Latin ``I`` and run the text through
       ``Processor.process_text``.
    2. Expand abbreviated month names to their full genitive form.
    3. Lower-case the text and keep only words that contain a digit or a
       month-name stem.
    4. If the remaining text contains a four-digit number starting with
       ``1``, parse it with ``get_date``; otherwise, if it contains a
       month stem, parse it prefixed with ``date2parser``.

    Fixes relative to the previous version:

    * Abbreviations are expanded only when they are not followed by
      another lowercase Cyrillic letter.  Before, full month names were
      corrupted ("Октября" -> "Октябрября") and "Июн" became "Июляня".
    * "Н. " and "Д. " now expand with a trailing space, like "А. ", so
      the month is not glued to the number that follows it.
    * Regular expressions are raw strings (no invalid escape sequences),
      and the unused ``pattern`` regex was removed.

    Args:
        text: Raw date string; any non-string value yields ``None``.

    Returns:
        ``(date, filtered_text)`` where ``date`` is whatever ``get_date``
        returns, or ``None`` when the input is not a string or nothing
        date-like is found.
    """
    if not isinstance(text, str):
        return None

    year_pattern = re.compile(r"1[0-9]{3}")
    # The "iюн"/"iюл" alternatives use a Latin "i": the Cyrillic "І" is
    # replaced by Latin "I" below and then lower-cased.
    month_pattern2 = re.compile(
        r".*(январ.{0,1}|феврал.{0,1}|апрел.{0,1}|март.{0,1}|ма.{0,1}"
        r"|июн.{0,1}|июл.{0,1}|август.{0,1}|сентябр.{0,1}|октябр.{0,1}"
        r"|ноябр.{0,2}|декабр.{0,1}|iюн.{0,1}|iюл.{0,1})\s*([0-9]{0,2})"
    )
    digit_pattern = re.compile(r"\d+")

    text = text.replace("І", "I")
    text, _changes, _report = Processor.process_text(
        text=text,
        show=False,
        delimiters=False,
        check_brackets=False,
    )

    # An abbreviation must not be the beginning of a longer word, otherwise
    # an already complete month name would be expanded a second time.
    not_a_prefix = r"(?![а-яё])"
    abbreviation_expansions = (
        (r"Окт" + not_a_prefix, "Октября"),
        (r"(?:Авг|Ав)" + not_a_prefix, "Августа"),
        (r"Апр" + not_a_prefix, "Апреля"),
        (r"Дек" + not_a_prefix, "Декабря"),
        (r"Июн" + not_a_prefix, "Июня"),
        (r"Июл?" + not_a_prefix, "Июля"),
        (r"Ил" + not_a_prefix, "Июля"),
        (r"А\.\s", "Августа "),
        (r"Сент" + not_a_prefix, "Сентября"),
        (r"Н\.\s", "Ноября "),
        (r"Д\.\s", "Декабря "),
    )
    for abbreviation, full_name in abbreviation_expansions:
        text = re.sub(abbreviation, full_name, text)

    date_words = [
        word
        for word in text.lower().split()
        if digit_pattern.search(word) or month_pattern2.search(word)
    ]
    text = " ".join(date_words)

    if year_pattern.search(text):
        return get_date(text), text
    if month_pattern2.search(text):
        # No explicit year: let the parser see the text with the
        # externally defined ``date2parser`` prefix.
        return get_date(f"{date2parser} {text}"), text

    print(text, 3)
    return None
def _read_token(self, line):
    """Parse one separator-delimited line into a ``Token``.

    The first column is the token number and ``self._token_col_index``
    selects the token text.  The text is passed through
    ``Processor.process_text``; the unprocessed form is kept as
    ``original_text``.  Lemma, grammar value, POS tag, head and head tag
    are filled in only when their configured column index exists in the
    line; otherwise they stay ``None``.

    Args:
        line: A single line of the input, split on ``self._separator``.

    Returns:
        A ``Token`` built from the extracted fields.

    Raises:
        ValueError: If the head column is neither ``"_"`` nor an integer.
    """
    columns = line.split(self._separator)
    column_count = len(columns)

    number = columns[0]
    raw_text = columns[self._token_col_index]
    # Convert pre-reform orthography; the raw form is passed on separately.
    converted_text, _, _ = Processor.process_text(
        text=raw_text,
        show=False,
        delimiters=False,
        check_brackets=False,
    )

    lemma = None
    lemma_index = self._lemma_col_index
    if lemma_index is not None and lemma_index < column_count:
        lemma = columns[lemma_index]

    grammar_value = None
    grammar_indices = self._grammar_val_col_indices
    if grammar_indices is not None and all(
            index < column_count for index in grammar_indices):
        # Grammemes are sorted within each column, then all columns are
        # concatenated in configured order.
        grammeme_groups = (
            sorted(columns[index].split(self._grammemes_separator))
            for index in grammar_indices
        )
        grammar_value = '|'.join(chain.from_iterable(grammeme_groups))

    pos_tag = None
    if grammar_indices and grammar_indices[0] < column_count:
        # The first grammar column doubles as the POS tag.
        pos_tag = columns[grammar_indices[0]]

    head = None
    head_index = self._head_col_index
    if head_index is not None and head_index < column_count:
        head_field = columns[head_index]
        if head_field != "_":
            head = int(head_field)

    head_tag = None
    head_tag_index = self._head_tag_col_index
    if head_tag_index is not None and head_tag_index < column_count:
        head_tag = columns[head_tag_index]

    return Token(
        token_no=number,
        text=converted_text,
        original_text=raw_text,
        lemma=lemma,
        pos_tag=pos_tag,
        grammar_value=grammar_value,
        head=head,
        head_tag=head_tag,
    )
def get_date2(text):
    """Parse a date out of ``text``, assuming 1890-1900 when no year is given.

    For string input, Cyrillic ``І`` is replaced with Latin ``I`` and the
    text is run through ``Processor.process_text``.  If the result contains
    a four-digit number starting with ``1`` it is handed to ``get_date``
    as is; otherwise it is prefixed with ``"1890-1900 гг. "`` first.
    Diagnostic information is printed on every path.

    The unused ``pattern`` regex, which was built from a non-raw literal
    containing invalid escape sequences, has been removed.

    Args:
        text: Raw date string; any non-string value yields ``None``.

    Returns:
        Whatever ``get_date`` returns, or ``None`` for non-string input.
    """
    if not isinstance(text, str):
        print(type(text))
        return None

    year_pattern = re.compile(r"1[0-9]{3}")

    text = text.replace("І", "I")
    text, _changes, _report = Processor.process_text(
        text=text,
        show=False,
        delimiters=False,
        check_brackets=False,
    )

    if year_pattern.search(text):
        date = get_date(text)
        print(date, text, 1)
        return date

    # No explicit year: fall back to the hard-coded decade.
    date = get_date(f"1890-1900 гг. {text}")
    print(date, text, f"1890-1900 гг.{text}", 2)
    return date