def normalisation(new_data):
    """Lemmatize every text in *new_data* with Mystem.

    Parameters
    ----------
    new_data : iterable of str
        Raw texts to normalise.

    Returns
    -------
    list of str
        One lemmatized string per input text, with all newline
        characters removed.
    """
    m = Mystem()
    # Pre-launch the mystem binary so the first lemmatize() call is not slow.
    m.start()
    normalize_data = []
    for text in tqdm(new_data, desc="normalisation"):
        lemmas = m.lemmatize(text)
        # Mystem emits a trailing "\n" token; join the lemmas and strip newlines.
        normalize_data.append("".join(lemmas).replace("\n", ""))
    return normalize_data
# NOTE(review): this snippet is collapsed onto one line and truncated mid-expression
# (it ends with a dangling `+` inside the Namespace(...) call for `model=`).
# It also references `Namespace` and `os` without visible imports — presumably
# `from argparse import Namespace` and `import os` exist in the missing part; verify.
# Left byte-identical because the remainder of the statement is not visible here.
import torch from pymystem3 import Mystem from itertools import count import onmt.io import onmt.translate import onmt import onmt.ModelConstructor import onmt.modules import sys from onmt.io.IO import build_dataset_request m = Mystem() m.start() opt = Namespace( alpha=0.0, attn_debug=False, batch_size=1, beam_size=10, beta=-0.0, data_type='text', dump_beam='', dynamic_dict=False, gpu=-1, max_length=100, max_sent_length=None, min_length=0, model=os.path.dirname(os.path.abspath(__file__)) +
# NOTE(review): collapsed one-line snippet, truncated mid-function — the body of
# `processFileFantasy` stops at `for word in words:` with no loop body visible.
# Sets up Russian stopwords (stop_words), a Mystem lemmatizer, a punctuation
# character class, and a hand-picked list of high-frequency words to exclude,
# presumably for topic modelling with gensim — confirm against the missing part.
# Left byte-identical because the function body is not visible here.
import re, os import gensim from gensim import corpora, models import nltk from nltk import FreqDist from nltk.collocations import * from pymystem3 import Mystem from stop_words import get_stop_words ru_stop = get_stop_words('ru') mystem_object = Mystem() mystem_object.start() puncts = "[«–»—!\$%&'()*+,./:;<=>?@^_`{|}~']*-–—...]" extra_words = [ "понимать", "знать", "хотеть", "глаз", "рука", "голова", "увидеть", "что-то", "смотреть", "нога", "свой", 'видеть', 'становиться', 'остаться', 'давать', 'стоять', 'оставаться', 'оказываться', 'думать' ] #Fantasy def processFileFantasy(file): doc = [] with open(file, 'r', encoding='utf-8') as f: #print(file) text = f.read() #print(len(text)) words = text.split() for word in words:
def make_lemmantisation(text):
    """Run Mystem over *text* (coerced to str) and return the lemma list."""
    analyzer = Mystem()
    analyzer.start()
    return analyzer.lemmatize(str(text))
# NOTE(review): collapsed one-line snippet, truncated mid-function — `parse_doc`
# is cut off after `analysis = word_parse.get('analysis')`, so this span cannot
# be safely rewritten. Two issues to fix once the full source is available:
#   1. `line.strip().decode('utf8')` in build_re is Python-2-only; in Python 3,
#      str has no .decode — drop the call (files are already text).
#   2. `flags=re.U+re.M+re.S` works only because the flag values are distinct
#      bits; the conventional, safe form is `re.U | re.M | re.S`.
# Left byte-identical below.
parser = argparse.ArgumentParser(description='RE to CONLL') parser.add_argument('--re', type=str, help='REs to apply') parser.add_argument('--data_dir', type=str, help='Folder with docs') parser.add_argument('--file', type=str, help='Source file') parser.add_argument('--lines', action='store_true', help='Lines as docs') parser.add_argument('--bioes', action='store_true', help='Output BEOES encoding') args = parser.parse_args() def build_re(): patterns = filter(lambda line : line and not line.startswith('#') and not line.isspace(), open(args.re).readlines()) return map(lambda line: re.compile(line.strip().decode('utf8'), flags=re.U+re.M+re.S), patterns) from pymystem3 import Mystem mystem = Mystem(grammar_info=False, disambiguation=False) mystem.start() def parse_doc(mystem, text): morph_parse = mystem.analyze(text) current_pos = 0 offsets = [] lemmas = [] words = [] all_words = [] for word_parse in morph_parse: word = word_parse['text'] all_words.append(word) sword = word.strip(' ').replace('\n', u'\u2028') if re.search("\w", sword, flags=re.U): words.append(sword) analysis = word_parse.get('analysis')