예제 #1
0
def _mark_up_fragment(fragment):
    """Run *fragment* through ``Processor.process_text`` and return its text result.

    The call asks for ``<choice><reg>…</reg><orig>…</orig></choice>``
    delimiters; only the first element of the returned triple is used.
    """
    marked_up, _changes, _json = Processor.process_text(
        text=fragment,
        show=True,
        delimiters=['<choice><reg>', '</reg><orig>', '</orig></choice>'],
        check_brackets=False
    )
    return marked_up


def markup_choices_for_prereform_spelling(text):
    """Apply ``Processor.process_text`` markup to *text*, segment by segment.

    The text is split on ``<choice …>…</choice>`` elements:

    * inside a ``<choice>`` element that ends with ``<corr>…</corr>``, only
      the content of ``<corr>`` is processed and the element is rebuilt
      around the result;
    * a ``<choice>`` element without such a ``<corr>`` child is left
      unchanged (previously this raised ``AttributeError``);
    * every other segment is split on whitespace and processed word by
      word; words containing ``>[digits]`` are kept verbatim (presumably
      footnote references -- confirm against the input format).

    Args:
        text: Source string, possibly containing ``<choice>`` elements.

    Returns:
        The concatenation of the processed segments.
    """
    split_pattern = re.compile(r'(<choice.*?>.*?</choice>)')
    corr_pattern = re.compile(r'<choice(.*?)<corr>(.*?)</corr></choice>')
    verbatim_pattern = re.compile(r'(>\[[0-9]*\])')

    tokens = split_pattern.split(text)
    for i, token in enumerate(tokens):
        if split_pattern.search(token) is not None:
            matchobj = corr_pattern.search(token)
            if matchobj is None:
                # No <corr> child to rewrite: keep the element as it is.
                continue
            text_res = _mark_up_fragment(matchobj.group(2))
            tokens[i] = f'<choice{matchobj.group(1)}<corr>{text_res}</corr></choice>'
        else:
            # NOTE(review): split()/join collapses whitespace runs and drops
            # leading/trailing whitespace of the segment, so a word next to a
            # <choice> element ends up adjacent to it. Kept as in the original
            # -- confirm whether callers rely on this.
            words = [
                word if verbatim_pattern.search(word) else _mark_up_fragment(word)
                for word in token.split()
            ]
            tokens[i] = " ".join(words)
    return ''.join(tokens)
예제 #2
0
def change_spelling(orig_text):
    """Return the text produced by ``Processor.process_text`` for *orig_text*.

    The processor is invoked with ``show=True``, bracket checking disabled
    and ``<choice><reg>…</reg><orig>…</orig></choice>`` delimiters. Of the
    three values it returns, only the first (the text) is handed back.
    """
    choice_delimiters = ['<choice><reg>', '</reg><orig>', '</orig></choice>']
    converted, _changes, _report = Processor.process_text(
        text=orig_text,
        show=True,
        delimiters=choice_delimiters,
        check_brackets=False,
    )
    return converted
예제 #3
0
# Negative look-around for Cyrillic letters: an abbreviation is expanded only
# when it stands alone, so month names that are already spelled out
# ("Октября", "Июня", ...) are not expanded a second time.
_NOT_AFTER_LETTER = r"(?<![А-Яа-яЁё])"
_NOT_BEFORE_LETTER = r"(?![А-Яа-яЁё])"

# (pattern, replacement) pairs, applied in this order.
_MONTH_ABBREVIATIONS = (
    (_NOT_AFTER_LETTER + r"Окт" + _NOT_BEFORE_LETTER, "Октября"),
    (_NOT_AFTER_LETTER + r"(?:Авг|Ав)" + _NOT_BEFORE_LETTER, "Августа"),
    (_NOT_AFTER_LETTER + r"Апр" + _NOT_BEFORE_LETTER, "Апреля"),
    (_NOT_AFTER_LETTER + r"Дек" + _NOT_BEFORE_LETTER, "Декабря"),
    (_NOT_AFTER_LETTER + r"Июн" + _NOT_BEFORE_LETTER, "Июня"),
    (_NOT_AFTER_LETTER + r"Ию" + _NOT_BEFORE_LETTER, "Июля"),
    (_NOT_AFTER_LETTER + r"Ил" + _NOT_BEFORE_LETTER, "Июля"),
    (_NOT_AFTER_LETTER + r"А\.\s", "Августа "),
    (_NOT_AFTER_LETTER + r"Сент" + _NOT_BEFORE_LETTER, "Сентября"),
    # The trailing space keeps the month separated from the next token.
    (_NOT_AFTER_LETTER + r"Н\.\s", "Ноября "),
    (_NOT_AFTER_LETTER + r"Д\.\s", "Декабря "),
)


def _expand_month_abbreviations(text):
    """Replace stand-alone capitalised month abbreviations with full month names."""
    for abbreviation, month in _MONTH_ABBREVIATIONS:
        text = re.sub(abbreviation, month, text)
    return text


def get_date2(text):
    """Extract the date-like part of *text* and parse it with ``get_date``.

    Steps, for string input:

    1. apply the ``"І"`` -> ``"I"`` substitution and run the text through
       ``Processor.process_text`` (no delimiters, no bracket check);
    2. expand capitalised month abbreviations to full month names;
    3. lower-case the text and keep only the whitespace-separated words
       that contain a digit or match the month pattern;
    4. if the result contains a year of the form ``1ddd``, parse it as is;
       otherwise, if it still contains a month, parse it prefixed with the
       module-level ``date2parser`` value (defined outside this block).

    Args:
        text: Raw date description; non-string values are rejected.

    Returns:
        A ``(date, text)`` tuple, where ``date`` is whatever ``get_date``
        returns and ``text`` is the filtered, lower-cased string, or
        ``None`` when *text* is not a string or contains neither a year
        nor a month.
    """
    if not isinstance(text, str):
        return None

    year_pattern = re.compile(r"1[0-9]{3}")
    month_pattern2 = re.compile(
        r".*(январ.{0,1}|феврал.{0,1}|апрел.{0,1}|март.{0,1}|ма.{0,1}"
        r"|июн.{0,1}|июл.{0,1}|август.{0,1}|сентябр.{0,1}|октябр.{0,1}"
        r"|ноябр.{0,2}|декабр.{0,1}|iюн.{0,1}|iюл.{0,1})\s*([0-9]{0,2})"
    )

    text = re.sub("І", "I", text)
    text, _changes, _json = Processor.process_text(
        text=text,
        show=False,
        delimiters=False,
        check_brackets=False
    )
    text = _expand_month_abbreviations(text)

    # Keep only the words that can belong to a date: numbers and months.
    date_words = [
        word for word in text.lower().split()
        if re.search(r"\d+", word) or month_pattern2.search(word)
    ]
    text = " ".join(date_words)

    if year_pattern.search(text):
        return get_date(text), text
    if month_pattern2.search(text):
        # No explicit year: let the parser see the default context first.
        return get_date(f"{date2parser} {text}"), text

    print(text, 3)
    return None
예제 #4
0
    def _read_token(self, line):
        """Build a ``Token`` from one delimited line.

        The line is split on ``self._separator``. The first column is the
        token number and the column at ``self._token_col_index`` is the
        token text. The text is passed through ``Processor.process_text``
        (conversion of pre-reform orthography); the unconverted value is
        kept as ``original_text``. Lemma, grammar value, POS tag, head and
        head tag are filled in only when the corresponding column index is
        configured and present in the line; otherwise they stay ``None``.
        """
        columns = line.split(self._separator)
        column_count = len(columns)

        def column_or_none(index):
            # Read a column only if its index is configured and in range.
            if index is not None and index < column_count:
                return columns[index]
            return None

        token_no = columns[0]
        original_text = columns[self._token_col_index]

        # Conversion of pre-reform orthography.
        token_text, _, _ = Processor.process_text(
            text=original_text,
            show=False,
            delimiters=False,
            check_brackets=False)

        lemma = column_or_none(self._lemma_col_index)

        # Grammemes of every grammar column are sorted per column and
        # joined with '|'; all grammar columns must be present.
        grammar_value = None
        grammar_columns = self._grammar_val_col_indices
        if grammar_columns is not None and all(
                index < column_count for index in grammar_columns):
            grammemes = []
            for col_index in grammar_columns:
                grammemes.extend(
                    sorted(columns[col_index].split(self._grammemes_separator)))
            grammar_value = '|'.join(grammemes)

        # The POS tag is the raw content of the first grammar column.
        pos_tag = None
        if grammar_columns and grammar_columns[0] < column_count:
            pos_tag = columns[grammar_columns[0]]

        # "_" in the head column means "no head".
        head = None
        head_field = column_or_none(self._head_col_index)
        if head_field is not None and head_field != "_":
            head = int(head_field)

        head_tag = column_or_none(self._head_tag_col_index)

        return Token(token_no=token_no,
                     text=token_text,
                     original_text=original_text,
                     lemma=lemma,
                     pos_tag=pos_tag,
                     grammar_value=grammar_value,
                     head=head,
                     head_tag=head_tag)
예제 #5
0
def get_date2(text):
    """Parse a date out of *text* with ``get_date``, assuming 1890-1900 by default.

    For string input the ``"І"`` -> ``"I"`` substitution is applied and the
    text is run through ``Processor.process_text`` (no delimiters, no
    bracket check). If the result contains a year of the form ``1ddd`` it
    is parsed as is; otherwise it is parsed with the prefix
    ``"1890-1900 гг. "`` so that ``get_date`` (defined outside this block)
    receives a year range together with the text.

    Args:
        text: Raw date description.

    Returns:
        Whatever ``get_date`` returns, or ``None`` when *text* is not a
        string. Diagnostic values are printed to stdout on every path.
    """
    if not isinstance(text, str):
        print(type(text))
        return None

    year_pattern = re.compile(r"1[0-9]{3}")

    text = re.sub("І", "I", text)
    text, _changes, _json = Processor.process_text(
        text=text,
        show=False,
        delimiters=False,
        check_brackets=False
    )

    if year_pattern.search(text):
        date = get_date(text)
        print(date, text, 1)
        return date

    # No explicit year in the text: fall back to the default period.
    date = get_date(f"1890-1900 гг. {text}")
    print(date, text,f"1890-1900 гг.{text}", 2)
    return date