Example #1
    def test_split_simple(self):
        assert simple_word_tokenize('Мама мыла раму') == [
            'Мама', 'мыла', 'раму'
        ]
        assert simple_word_tokenize('Постой, паровоз!') == [
            'Постой', ',', 'паровоз', '!'
        ]
Example #2
def general_data():
    path_opencorpora = "." + os.sep + "!data" + os.sep + \
                       "newcorpus" + os.sep + "OpenCorpora_txt_clean"
    files = [
        item for item in os.listdir(path_opencorpora) if item.endswith(".txt")
    ]
    total_pars = []
    total_avs = []
    total_pars_count = 0
    for file in files:
        with open(path_opencorpora + os.sep + file, "r",
                  encoding="utf-8") as f:
            pars = f.readlines()
            total_pars_count += len(pars)
            total_pars.append(len(pars))
        try:
            av = []
            for par in pars:
                av.append(len(tokenizers.simple_word_tokenize(par)))
            total_avs.append(mean(av))
        except StatisticsError:
            # empty files: mean() raises on an empty list of paragraph lengths
            pass
    print("В среднем слов в абзаце: {:.2f}\n".format(mean(total_avs)) +
          "Всего абзацев: {:,}\n".format(total_pars_count).replace(",", " ") +
          "В среднем абзацев в тексте: {:.2f}".format(mean(total_pars)))
Example #3
    def test_exctract_words(self):
        text = '''Это  отразилось: на количественном,и на качествен_ном
                - росте карельско-финляндского сотрудничества - офигеть! кони+лошади=масло.
                -сказал кто-то --нет--'''

        assert simple_word_tokenize(text) == [
            'Это',
            'отразилось',
            ':',
            'на',
            'количественном',
            ',',
            'и',
            'на',
            'качествен_ном',
            '-',
            'росте',
            'карельско-финляндского',
            'сотрудничества',
            '-',
            'офигеть',
            '!',
            'кони',
            '+',
            'лошади',
            '=',
            'масло',
            '.',
            '-сказал',
            'кто-то',
            '--нет--',
        ]
Example #4
def lemmatize(s):
    l = [
        morph.parse(w)[0].normal_form for w in simple_word_tokenize(s)
        if not is_punctuation(w)
    ]
    l = [w for w in l if w not in stop_words]
    return " ".join(l)
Example #5
def api_morphy(request):
    query = request.json_body
    if query["all"]:
        words = simple_word_tokenize(query["phrase"])
        new_phrase = []
        for word in words:
            new_phrase.append(lean(word, case=query["case"]))
    return {"phrase": " ".join(new_phrase)}
Example #6
def pymorphy_simple(text):
    """
    Splits the text into words using a function from pymorphy2
    :param text: input text
    :return: list of words
    """
    words = simple_word_tokenize(text)
    return words
Example #7
def lemmatize_text(text):
    text = re.sub("\s+", " ", text)
    split_text = tokenizers.simple_word_tokenize(text)
    lemmatized_text = [lemmatize(t) for t in split_text]
    lemmas = [l[0] for l in lemmatized_text]
    t_tags = [l[1] for l in lemmatized_text]
    tags = [t_tags.count(gm) for gm in gram]
    words_known = sum([word_is_known(t) for t in split_text])
    return ' '.join(lemmas), words_known, tags
Example #8
def tokenize(definition: str) -> List[str]:
    """
    Splits a string into tokens
    :param definition: the definition string
    :return: tokens (punctuation marks removed)
    """
    return [
        x for x in simple_word_tokenize(definition) if x not in punctuation
    ]
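Assuming `punctuation` here is `string.punctuation` (the import is not shown in the excerpt), the filter drops single-character punctuation tokens; with the tokenizer behaviour asserted in Example #1, a quick check would give:

import string

punctuation = string.punctuation  # assumption: source of the excerpt's `punctuation`

tokenize('Постой, паровоз!')  # -> ['Постой', 'паровоз']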
Example #9
    def test_exctract_words(self):
        text = '''Это  отразилось: на количественном,и на качествен_ном
                - росте карельско-финляндского сотрудничества - офигеть! кони+лошади=масло.
                -сказал кто-то --нет--'''

        assert simple_word_tokenize(text) == [
            'Это', 'отразилось', ':', 'на', 'количественном', ',', 'и', 'на',
            'качествен_ном', '-', 'росте', 'карельско-финляндского',
            'сотрудничества', '-', 'офигеть', '!', 'кони', '+', 'лошади',
            '=', 'масло', '.', '-сказал', 'кто-то', '--нет--',
        ]
Example #10
def combine_sent(sent, lemma_seq, pos_seq):
    tokens = simple_word_tokenize(sent)
    norm_tokens = [token for token in tokens if token[0].isalpha()]
    if not isinstance(lemma_seq, str):
        return []
    lemmas = lemma_seq.split()
    pos = pos_seq.split()
    combined_list = []
    for i in range(len(pos)):
        combined_list.append(' ' + norm_tokens[i] + '|' + lemmas[i] + '|' +
                             pos[i])
    return combined_list
Example #11
File: ml.py  Project: zxsted/qa_bot
def str_handler(in_string):
    """Обработчик строк. Удаление лишних символов, детранслитерация, нормализация(приведение к инфинитиву).

    Keyword arguments:
    in_string -- строка для обработки

    """
    tokens = tokenizers.simple_word_tokenize(re.sub('[!?,.%]', '', in_string))
    new_string = ''
    for word_in in tokens:
        if re.search(latin_pattern, word_in):
            word_in = translit.detranslify(word_in)
        new_string += morph.parse(word_in)[0].normal_form + ' '
    return new_string
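The names `latin_pattern`, `translit` and `morph` come from elsewhere in the project; a plausible setup (an assumption on my part, with `pytils` as one library that provides `translit.detranslify`) might be:

import re
from pymorphy2 import MorphAnalyzer, tokenizers
from pytils import translit  # assumption: source of detranslify

latin_pattern = re.compile('[A-Za-z]')  # guess: "token contains Latin letters"
morph = MorphAnalyzer()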
Example #12
    def pos(self, line):
        self.pos_dict = {}
        # parsing
        if self.language == "rus":
            play_pos = [
                self.analyzer.parse(token)[0].tag.POS
                for token in simple_word_tokenize(line)
            ]
        elif self.language in self.spacy_analyzers.keys():
            play_pos = [token.pos_ for token in self.analyzer(line)]
        elif self.language in self.cltk_analyzers.keys():
            print(self.language)
            play_pos = self.analyzer.analyze(text=line).pos
        return play_pos
Example #13
def tokenize(text_data):
    text_data = str(text_data).lower()  # lowercase
    text_data = re.sub(r'\s+', ' ',
                       text_data)  # normalize all line breaks and tabs to a single space
    text_data = re.sub(r'\[id\d*\|\w*\]', 'username',
                       text_data)  # replace all user mentions
    text_data = re.sub('"', '', text_data)  # remove double quotes
    text_data = re.sub("'", '', text_data)  # remove apostrophes
    text_data = re.sub('ё', 'е', text_data)  # replace ё with е
    text_data = re.sub(r'(?<=\w)\*', 'ё',
                       text_data)  # replace * with ё when a letter precedes it
    tokens = simple_word_tokenize(text_data)  # tokenization
    return ' '.join(tokens)
Example #14
def tokenizeSingleText(text, configurations):
    minimal_words_in_sentence = 1
    if configurations is not None:
        minimal_words_in_sentence = configurations.get(
            "minimal_words_in_sentence", 1)

    remove_index_list = []

    for index, sentence in enumerate(text.original_sentences):
        if len(sentence) > 1:
            tokenized_sentence = tokenizers.simple_word_tokenize(sentence)
            updated_tokenized_sentence = []
            for word in tokenized_sentence:
                if word.isalpha() and len(word) > 1:
                    updated_tokenized_sentence.append(word)
            tokenized_sentence = updated_tokenized_sentence

            if len(tokenized_sentence) >= minimal_words_in_sentence:
                text.tokenized_sentences.append(tokenized_sentence)
            else:
                remove_index_list.append(index)
        else:
            remove_index_list.append(index)

    # Print the sentences before any are cut out
    # string_for_print = ''
    # for index, sentence in enumerate(text.original_sentences):
    #     string_for_print = string_for_print + str(index) + ')' + sentence + '\n'
    # writeStringToFile(string_for_print, 'output_files/preProcessing_before_cut.txt')
    #
    need_agresive_filtration = False
    if configurations is not None:
        need_agresive_filtration = configurations.get(
            "need_agresive_filtration", False)

    sorted_remove_index_list = sorted(remove_index_list, reverse=True)
    if need_agresive_filtration:
        for index in sorted_remove_index_list:
            text.original_sentences.pop(index)

    # Print the sentences after the cut
    # string_for_print = ''
    # for index, sentence in enumerate(text.original_sentences):
    #     string_for_print = string_for_print + str(index) + ')' + sentence + '\n'
    # writeStringToFile(string_for_print, 'output_files/preProcessing_after_cut.txt')

    return text.tokenized_sentences
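The `text` argument is a project-specific object; judging from the attributes accessed above it needs `original_sentences` and `tokenized_sentences`. A minimal stand-in for experimentation (purely illustrative, not the project's own class):

class Text:
    def __init__(self, original_sentences):
        self.original_sentences = list(original_sentences)
        self.tokenized_sentences = []


t = Text(['Мама мыла раму.', 'Ок'])
tokenizeSingleText(t, {'minimal_words_in_sentence': 2,
                       'need_agresive_filtration': True})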
Example #15
    def lemmatize(self, line):
        self.lemmas = []
        play_lemmas = []
        if self.language == "rus":
            play_lemmas = [
                self.analyzer.parse(token)[0].normal_form
                for token in simple_word_tokenize(line)
            ]
        elif self.language in self.spacy_analyzers.keys():
            play_lemmas = [token.lemma_ for token in self.analyzer(line)]
        elif self.language in self.cltk_analyzers.keys():
            print(self.language)
            play_lemmas = self.analyzer.analyze(text=line).lemmata
        self.lemmas += play_lemmas
        return " ".join(play_lemmas)
Example #16
def main():
    morph = MorphAnalyzer()
    texts_path = os.path.join('..', '..', '..', '..', 'RuCoref', 'rucoref_texts')
    for folder in os.listdir(texts_path):
        text_folder = texts_path + os.sep + folder
        for filename in os.listdir(text_folder):
            if filename.endswith('.txt'):
                with open(text_folder + os.sep + filename,
                          'r',
                          encoding='utf-8') as source_file:
                    source_text = source_file.read()
                    # get tokens
                    tokens = tokenizers.simple_word_tokenize(source_text)
                    # parse tokens
                    tokens_with_tags = morphology_features(tokens, morph)
                    # write tokens to new file
                    write_info(tokens_with_tags, text_folder, filename)
Example #17
def tokenize_me(file_text):
    file_text = file_text.lower()
    tokens = tokenizers.simple_word_tokenize(file_text)

    tokens = [morph.parse(w)[0].normal_form for w in tokens]

    #deleting punctuation symbols
    tokens = [i for i in tokens if (i not in string.punctuation)]

    #deleting stop_words
    stop_words = stopwords.words('russian')
    stop_words.extend([
        'что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на', '...'
    ])
    tokens = [i for i in tokens if (i not in stop_words)]

    return ' '.join(tokens)
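This function assumes `morph`, NLTK's stop-word corpus and the standard `string` module are already available; the imports (not shown in the excerpt, so this is an inference) would look roughly like:

import string
from pymorphy2 import MorphAnalyzer, tokenizers
from nltk.corpus import stopwords  # requires nltk.download('stopwords') once

morph = MorphAnalyzer()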
Example #18
    def _get_gender(self, profession: str) -> GENDER:
        """
        Predict gender, without using cache
        """
        if not profession.strip():
            # Empty string
            return GENDER.unknown

        toks = simple_word_tokenize(profession)

        observed_genders = [self.get_word_gender(tok) for tok in toks]

        if not observed_genders:
            # No observed gendered words - return neutral
            return GENDER.neutral

        # Return the most commonly observed gender
        return Counter(observed_genders).most_common()[0][0]
Example #19
def normalized(twit):
    twit = re.sub(r'\d+', '', twit)      # remove all digits
    twit = re.sub(r'RT ', '', twit)      # remove RT
    twit = re.sub(r'http\S+', '', twit)  # strip links
    twit = re.sub(r'@\S+', '', twit)     # strip @name mentions
    twit = re.sub(r'#\S+', '', twit)     # strip hashtags
    twit = twit.strip()
    tokens = simple_word_tokenize(twit)
    parses = [
        morph.parse(w)[0]
        for w in tokens
        if w.lower() not in stopwords
    ]
    parses = [
        p for p in parses
        if p.tag.POS not in {'PNCT', 'UNKN', 'NUMB', 'CONJ', 'LATN'}
    ]
    return [p.normal_form.lower() for p in parses]
Example #20
def api_morphy(request):
    query = request.json_body
    command = query["command"]
    phrase = query["phrase"]
    if command == "all":
        words = simple_word_tokenize(phrase)
        gender = russian.gender(words, "M", "F", "_")
        if gender == "_":
            new_phrase = []
            for word in words:
                new_phrase.append(lean(word, case=query["case"]))
        else:
            case_ = query["case"]
            c = TR.get(case_, case_)
            new_phrase = russian.make_human_case(words, c)
        return {"phrase": " ".join(new_phrase)}
    elif command == "gender":
        p = russian.gender(phrase, query["M"], query["F"], ";-)")
        return {"phrase": p}
Example #21
def preprocessing(sentence, clist):
    s = re.sub('[^а-яА-Яa-zA-Z]+', ' ', sentence).strip().lower()
    s = re.sub('ё', 'е', s)
    result = []
    for word in tokenizers.simple_word_tokenize(s):
        if word not in clist:
            if not morph.word_is_known(word):
                new_words = split_word(word)
            else:
                new_words = (word,)  # one-element tuple
            for new_word in new_words:
                parse = morph.parse(new_word)[0]
                pos = parse.tag.POS
                if pos is not None and pos not in [
                        'NPRO', 'PREP', 'CONJ', 'PRCL', 'INTJ'
                ]:
                    result.append(parse.normal_form)
        else:
            result.append(word)
    return ' '.join(result)
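`morph`, `split_word` and `clist` rely on context the excerpt omits. A sketch of that context, with a deliberately naive placeholder for the project's own `split_word` helper (both the placeholder and the analyzer setup are assumptions):

import re
from pymorphy2 import MorphAnalyzer, tokenizers

morph = MorphAnalyzer()


def split_word(word):
    # placeholder: the real project presumably splits unknown compound words;
    # here we simply hand the word back as a one-element tuple
    return (word,)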
Example #22
def normalize_text(text):
    lemmas = []
    for t in simple_word_tokenize(text):
        if t not in stops:
            lemmas.append(m.parse(t)[0].normal_form)
    return ' '.join(lemmas)
Example #23
def tokenize_if_needed(tokens):
    if not isinstance(tokens, (list, tuple)):
        return simple_word_tokenize(tokens)
    return tokens
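A quick illustration of both branches, reusing the tokenization asserted in Example #1:

tokenize_if_needed('Мама мыла раму')   # -> ['Мама', 'мыла', 'раму']
tokenize_if_needed(['уже', 'токены'])  # returned unchanged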
Example #24
    def predict(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            tokens = simple_word_tokenize(tokens)
        return [self.morph.parse(token)[0] for token in tokens]
Example #25
    def tokenize(self, text):
        return simple_word_tokenize(text)
Example #26
    def test_split_signs(self):
        assert simple_word_tokenize('a+b=c_1') == ['a', '+', 'b', '=', 'c_1']
Example #27
def read_text_lemmas(fileobj):
    m = MorphAnalyzer()
    for line in fileobj:
        yield ' '.join(m.parse(t)[0].normal_form
                       for t in simple_word_tokenize(line.decode('utf-8')))
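Note the `line.decode('utf-8')`: the generator expects a binary file object. Usage might look like this (the file name is only a placeholder):

with open('corpus.txt', 'rb') as fileobj:  # placeholder path
    for lemma_line in read_text_lemmas(fileobj):
        print(lemma_line)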
Example #28
    def tokenize(text):
        return tokenizers.simple_word_tokenize(text)
Example #29
    def test_split_simple(self):
        assert simple_word_tokenize('Мама мыла раму') == ['Мама', 'мыла', 'раму']
        assert simple_word_tokenize('Постой, паровоз!') == ['Постой', ',', 'паровоз', '!']
Example #30
def pos_tag(text):
    # parse each word once, then build "lemma_POS" strings
    parses = [m.parse(word)[0] for word in simple_word_tokenize(text)]
    return [f"{p.normal_form}_{p.tag.POS}" for p in parses]
Example #31
    def test_split_signs(self):
        assert simple_word_tokenize('a+b=c_1') == ['a', '+', 'b', '=', 'c_1']
Example #32
File: task234.py  Project: Aydar710/is2021
import os
import pickle
from pymorphy2 import MorphAnalyzer, tokenizers
from constants import PAGES_PATH

morph = MorphAnalyzer()

inverted_index = {}
page_occurrences = {}

pages = os.listdir(path=PAGES_PATH)

for index, page in enumerate(pages):
    with open(PAGES_PATH + page, 'r', encoding="utf-8") as file:
        text = file.read()

    tokens = tokenizers.simple_word_tokenize(text)

    page_occurrences[index] = len(tokens)

    for token in tokens:
        lemma = morph.parse(token)[0].normal_form.lower()
        value = inverted_index.get(lemma)

        if value is None:
            inverted_index[lemma] = {index: 1}
        elif inverted_index[lemma].get(index) is None:
            inverted_index[lemma][index] = 1
        else:
            inverted_index[lemma][index] += 1

inverted_index_file = open("inverted_index.pkl", "wb")
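The excerpt stops right after opening the output file; a likely continuation (an assumption, not shown in the source) would serialize the index with pickle:

# assumed continuation, not part of the original excerpt
pickle.dump(inverted_index, inverted_index_file)
inverted_index_file.close()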
Example #33
    def test_split_hyphen(self):
        assert simple_word_tokenize('Ростов-на-Дону') == ['Ростов-на-Дону']
        assert simple_word_tokenize('Ура - победа') == ['Ура', '-', 'победа']
Example #34
def get_sents(input_data):
    return [normalized(simple_word_tokenize(sent)) for sent in input_data]
Example #35
def _iter_tokens_tokenize(fp):
    """ Return an iterator of input tokens; each line is tokenized """
    return (token for line in fp for token in simple_word_tokenize(line))
Example #36
def lemmatize(text):
    return [
        m.parse(word)[0].normal_form for word in simple_word_tokenize(text)
    ]
Example #37
File: cli.py  Project: Koryakov/pymorphy2
def _iter_tokens_tokenize(fp):
    """ Return an iterator of input tokens; each line is tokenized """
    return (token for line in fp for token in simple_word_tokenize(line))
Example #38
    def test_split_hyphen(self):
        assert simple_word_tokenize('Ростов-на-Дону') == ['Ростов-на-Дону']
        assert simple_word_tokenize('Ура - победа') == ['Ура', '-', 'победа']
Example #39
def tokenize(line):
    return [token for token in simple_word_tokenize(line) if token not in string.punctuation]