def build_corpora(directory):
    signs = ['.', ',', '«', '»', '(', ')', '-', ':', ';', '?', '!', '@']
    # out_file, out_lemmatized, out_names, args_dictionary and
    # args_turbotopics are module-level settings.
    out = open(out_file, mode='w', encoding='utf-8')
    out_lem = open(out_lemmatized, mode='w', encoding='utf-8')
    out_filenames = open(out_names, mode='w', encoding='utf-8')

    # Print the plain corpora.
    print("Printing corpora")
    for entry in os.scandir("./" + directory):
        if entry.is_file() and entry.name != '.DS_Store':
            with open(entry.path, mode='r', encoding='utf-8') as f:
                text = f.read()
            out.write(entry.name + clean_text(text) + '\n')
            out_filenames.write(entry.name + '\n')

            # Print the lemmatized text.
            print("Printing lemmatized:", entry.name)
            lemmer = mystem.Mystem()
            lemmatized = lemmer.lemmatize(clean_text(text))
            out_lem.write(entry.name + ' ')
            for lemma in lemmatized:
                if lemma not in signs:
                    out_lem.write(lemma)
            out_lem.write(' ')

    # Build the morphological dictionary.
    print("Printing morphological dictionary")
    subprocess.run(args_dictionary)

    # Extract n-grams with TurboTopics.
    print("Printing ngrams")
    subprocess.run(args_turbotopics, stdout=None)
def save_bag_of_words(path: str):
    """Preprocess the text and save it as a bag of words."""
    stop_words = nltk.corpus.stopwords.words('russian')
    mystem = pymystem3.Mystem()
    file_object = open(BAG_OF_WORDS_PATH, 'w', encoding="utf-8")
    text = " "  # converterUTF8(filename)
    with codecs.open(path, encoding='UTF-8') as f_manager:
        for line in f_manager:
            if len(line) != 0:
                text = text + " " + line
    words = nltk.word_tokenize(text)
    words = [w.lower() for w in words if w.isalpha()]
    words = [w for w in words if w not in stop_words]
    lemmas = mystem.lemmatize(" ".join(words))
    lemmas = [w for w in lemmas if w.isalpha() and len(w) > 1]
    freq = nltk.FreqDist(lemmas)
    # Keep lemmas seen more than once, in "lemma:count" (Vowpal Wabbit) form.
    results = [key + ":" + str(val) for key, val in freq.items() if val > 1]
    file_object.write("|text" + " " + " ".join(results) + '\n')
    file_object.close()
    return freq.items()
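# A hedged usage sketch (not from the original): BAG_OF_WORDS_PATH and the
# input filename are assumptions, and the NLTK stopword and punkt data must
# already be downloaded.
BAG_OF_WORDS_PATH = 'bag_of_words.vw'

counts = save_bag_of_words('input_text.txt')
# Ten most frequent lemmas with their counts.
print(sorted(counts, key=lambda kv: kv[1], reverse=True)[:10])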
def prepare_group(mypath):
    files = []
    stem = pymystem3.Mystem()
    for r, d, f in os.walk(mypath):
        for file in f:
            files.append(os.path.join(r, file))
    docs = []
    full = ''
    wordset = set()
    for filepath in files:
        corpus = []
        with open(filepath, encoding='utf-8') as file:
            for line in file:
                corpus.append(line)
        # Drop blank lines, strip punctuation and the trailing newline.
        corpus = [el for el in corpus if el != '\n']
        corpus = [re.sub(r'[^\w\s]', '', el)[:-1] for el in corpus]
        for el in corpus:
            words = el.lower()
            proc = stem.lemmatize(words)
            proc = [w for w in proc if (w.strip() != '') and (w != '\n')]
            docs.append(proc)
            for word in proc:
                full += ' ' + word
                wordset.add(word)
        print(filepath)
    return docs, wordset, full
def __init__(self):
    # lemmatizer
    self.lemmatizer = pymystem3.Mystem()
    # nltk stopwords, extended with whitespace and punctuation tokens
    self.stopwords = stopwords.words('russian')
    self.stopwords.extend(['', ' ', '\n', '«', '»'])
    self.stopwords.extend(list(string.punctuation))
def get_lexemas_from_text(cursor, atext=""):
    term_extractor = TermExtractor()
    mystem = pymystem3.Mystem()
    lexemas = []
    for term in term_extractor(atext):
        for lexema in str(term.normalized).split(" "):
            # Take the lemma ('lex') of the first analysis variant; this
            # raises IndexError for tokens Mystem cannot analyze.
            lexema = mystem.analyze(lexema)[0]['analysis'][0]['lex']
            id_lexema = lexema_id_by_inf(cursor, lexema)
            lexemas.append(id_lexema)
    return lexemas
def test(model):
    while True:
        in_word = input("Введите слово или q: ")  # "Enter a word or q: "
        if in_word == "q":
            break
        # Note: a fresh Mystem process is spawned on every iteration.
        in_word = pymystem3.Mystem().lemmatize(in_word)[0]
        try:
            for word in model.most_similar(positive=[in_word], topn=10):
                print(word)
        except KeyError:
            print("word '{}' not in vocabulary".format(in_word))
def stem_russian_batches(batch_filename):
    # Cache the Mystem instance on the function object.
    if not hasattr(stem_russian_batches, "stemmer"):
        stem_russian_batches.stemmer = pymystem3.Mystem()
    batch_stem_path = './stem/'

    print(batch_filename + " loading...")
    batch = artm.library.Library().LoadBatch(batch_filename)
    print(batch_filename + " loading done.")

    batch_stem = artm.messages_pb2.Batch()

    # Lemmatize all tokens of the batch in a single Mystem call.
    token_list = list(batch.token)
    text = ' '.join(token_list)
    text_stem = stem_russian_batches.stemmer.lemmatize(text)
    token_stem_list = ''.join(text_stem).strip().split(' ')

    # Map original token ids to ids of the deduplicated stemmed tokens.
    token_id_to_token_stem_id = dict()
    token_stem_to_token_stem_id = dict()
    for (token_id, token_stem) in enumerate(token_stem_list):
        if token_stem not in token_stem_to_token_stem_id:
            token_stem_to_token_stem_id[token_stem] = len(batch_stem.token)
            batch_stem.token.append(token_stem)
        token_id_to_token_stem_id[token_id] = token_stem_to_token_stem_id[
            token_stem]
    print(batch_filename + " " + str(len(batch.token)) + " -> " +
          str(len(batch_stem.token)))

    # Convert items, merging counts of tokens that share a lemma.
    for item in batch.item:
        item_stem = batch_stem.item.add()
        item_stem.id = item.id
        item_stem.title = item.title
        for field in item.field:
            field_stem_dict = defaultdict(int)
            for token_num in range(len(field.token_id)):
                token_id = field.token_id[token_num]
                token_stem_id = token_id_to_token_stem_id[token_id]
                field_stem_dict[token_stem_id] += field.token_count[token_num]
            field_stem = item_stem.field.add()
            field_stem.name = field.name
            for token_stem_id in field_stem_dict:
                field_stem.token_id.append(token_stem_id)
                field_stem.token_count.append(field_stem_dict[token_stem_id])

    print(batch_filename + " saving result...")
    artm.library.Library().SaveBatch(batch_stem, batch_stem_path)
    print(batch_filename + " saving done.")
    return 0
def prep_words(words):
    lemmer = pymystem3.Mystem()
    stopwords = set(nltk.corpus.stopwords.words("russian") + ["весь", "это"])
    tokens = lemmer.lemmatize(' '.join(words).lower())
    tokens = [token.replace(' ', '') for token in tokens]
    # Drop stopwords and tokens made up entirely of punctuation or digits.
    tokens = [
        token for token in tokens
        if token and token not in stopwords and not all(
            char in punctuation or char.isnumeric() for char in token)
    ]
    return tokens
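# Hedged usage sketch (not from the original); the input tokens are made up.
lemmas = prep_words(["Коты", "спят", "на", "крыше!"])
print(lemmas)  # content-word lemmas such as 'кот', 'спать', 'крыша'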
def one_word_production(word, model, sets, fnames, tfidf, coef, coef2):
    primal = word
    stem = pymystem3.Mystem()
    proc = stem.lemmatize(word)[0]
    word = proc.strip()
    if word == '':
        return 0, primal
    if choose_by_tfidf(fnames, tfidf, word):
        chose, pos = select_by_cos(word, sets, model, tfidf, fnames, coef,
                                   coef2)
        return 1, chose
    else:
        return 0, primal
def __init__(self, config_file):
    self.config = json.load(open(config_file, 'r'))
    self.__tokenizer = TreebankWordTokenizer()
    # self.stemmer = SnowballStemmer('russian')
    self.__mystem = pymystem3.Mystem()
    self.__ft_c = pickle.load(open(self.config['word_embeddings'], 'rb'))
    self.__class_names = pickle.load(open(self.config['class_names'], 'rb'))
    self.__old2new = pickle.load(open(self.config['old2new'], 'rb'))
    self.__new2old = pickle.load(open(self.config['new2old'], 'rb'))
    self.__num2title = pickle.load(open(self.config['num2title'], 'rb'))
    self.__build_net(mtype='cnn')
def part_production(string, model, sets, fnames, tfidf, coef, coef2):
    res = []
    stem = pymystem3.Mystem()
    proc = stem.lemmatize(string)
    # Strip punctuation and empty tokens from the lemmatized text.
    corpus = [re.sub(r'[^\w\s]', '', el) for el in proc]
    corpus = [el for el in corpus if el.strip() != '']
    for i, word in enumerate(corpus):
        if choose_by_tfidf(fnames, tfidf, word):
            chose, pos = select_by_cos(word, sets, model, tfidf, fnames,
                                       coef, coef2)
            res.append([word, chose, i])
    return res
def __init__(self, settings, task_type=None):
    """
    Arguments
    ---------
    settings : dict
    task_type : str
    """
    self.mystem = pymystem3.Mystem(entire_input=False)
    self.settings = settings
    if task_type is not None:
        key = task_type + '_stop_words'
        self.task_specific_stop_words = self.settings[key]
    else:
        self.task_specific_stop_words = []
def preprocess_document(document, russian_stop_words):
    document = document.lower()
    document = re.sub(u'\xa0|\n', ' ', document)
    # Keep only Cyrillic and Latin letters and spaces.
    document = re.sub('[^а-яa-z ]', '', document)
    mystem = pymystem3.Mystem()
    tokens = mystem.lemmatize(document)
    tokens = [
        token for token in tokens
        if ((token not in russian_stop_words) and (
            token.strip() not in string.punctuation) and (len(token) > 2))
    ]
    document = ' '.join(tokens)
    return document
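# Hedged usage sketch (not from the original); the sample sentence is made up
# and nltk's Russian stopword list stands in for russian_stop_words.
russian_stop_words = nltk.corpus.stopwords.words('russian')
print(preprocess_document('Коты спят на крыше дома.', russian_stop_words))
# -> roughly 'кот спать крыша дом'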
def checkExecTimeMystemOneText(texts):
    # Split the list of texts into chunks of 1000.
    lol = lambda lst, sz: [lst[i:i + sz] for i in range(0, len(lst), sz)]
    txtpart = lol(texts, 1000)
    res = []
    for txtp in txtpart:
        # Join the texts with a ' br ' separator so that a single Mystem
        # call lemmatizes the whole chunk; split the result back on 'br'.
        alltexts = ' '.join([txt + ' br ' for txt in txtp])
        m = Stem.Mystem()
        words = m.lemmatize(alltexts)
        doc = []
        for txt in words:
            if txt != '\n' and txt.strip() != '':
                if txt == 'br':
                    res.append(doc)
                    doc = []
                else:
                    doc.append(" " + txt + " ")
    return res
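# Hedged usage sketch (not from the original): `Stem` is assumed to be the
# snippet's import alias for pymystem3, and the sample texts are made up.
import pymystem3 as Stem

docs = checkExecTimeMystemOneText(["Мама мыла раму.", "Кошка спит на диване."])
print(len(docs))  # one list of lemmas per input text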
def _soup_parsing(output_soup, input_soup, freq_dictionary):
    _stemmer = pymystem3.Mystem()
    _line_counter = 0
    _word_counter = -1
    _words_array = re.findall(r"[а-яА-ЯёЁ]+|\n", input_soup.text)
    _words_set = set(_words_array)
    _some_threads = []
    _word_annotation_dict = {}
    new_q = queue.Queue()
    # time.clock() was removed in Python 3.8; use perf_counter() instead.
    start = time.perf_counter()
    thread_num = 8
    # Annotate the unique words with a pool of worker threads.
    for i in range(thread_num):
        t = threading.Thread(target=_get_word_annotation,
                             args=(new_q, _word_annotation_dict, _stemmer))
        t.start()
        _some_threads.append(t)
    for word in _words_set:
        new_q.put(word)
    # for word in _words_array:
    #     if word == '\n':
    #         _line_counter += 1
    #         _append_to_tag(output_soup, "body", "p")
    #     else:
    #         _word_counter += 1
    #         t = threading.Thread(target=_add_word_with_annotation,
    #                              args=(_line_counter, _word_counter,
    #                                    word, _stemmer, output_soup,
    #                                    freq_dictionary))
    #         t.start()
    #         try:
    #             freq_dictionary[word.lower()] += 1
    #         except KeyError:
    #             freq_dictionary[word.lower()] = 1
    #         _some_threads.append(t)
    new_q.join()
    # Send one sentinel per worker so the threads can exit.
    for i in range(thread_num):
        new_q.put(None)
    for t in _some_threads:
        t.join()
    stop = time.perf_counter()
    print(stop - start)
    return output_soup
def utterance_to_bow(utterance):
    # Use a set: the original concatenated all stopwords into one string,
    # which made `word not in stoplist` a substring test.
    stoplist = set(nltk.corpus.stopwords.words('russian'))
    stoplist.update(["пожалуйста", "здравствуйте"])
    utterance = utterance.lower()
    # Normalize the different bank mentions to a single token.
    utterance = (utterance.replace("тк", "")
                 .replace("сбербанк", "банк")
                 .replace("сбер", "банк")
                 .replace("банка", "банк"))
    utterance = re.sub(r'[^а-яА-Я ]+', '', utterance)
    tokens = [word for word in utterance.split() if word not in stoplist]
    mystem = pymystem3.Mystem()
    utterance = [mystem.lemmatize(token)[0] for token in tokens]
    bow = dictionary.doc2bow(utterance)
    return bow
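# The function assumes a module-level gensim `dictionary`; a hedged sketch of
# how one could be built (the sample utterances are made up):
from gensim.corpora import Dictionary

dictionary = Dictionary([
    ["хотеть", "открыть", "счет", "банк"],
    ["заблокировать", "карта", "банк"],
])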
def _lemmatize_words(self, texts):
    mystem = pymystem3.Mystem()
    return [[mystem.lemmatize(token)[0] for token in text] for text in texts]
def __init__(self):
    # self.mystem_gr_tokens is expected to be defined on the class.
    self.mystem_gr_vocab = self.mystem_gr_tokens.split('|')
    self.mystemmer = pymystem3.Mystem()
    self.mystemmer_cache = {}
    self.mystem_gr_tokenizer = RegexpTokenizer(self.mystem_gr_tokens)
    self.mystem_gr_vectorizer = CountVectorizer(
        tokenizer=self.mystem_gr_tokenizer.tokenize,
        vocabulary=self.mystem_gr_vocab,
        binary=True)
def print_most_common(items_list):
    print('Lemma frequency:')
    print(',\n'.join("'{}' : {}".format(str(elt[0]), str(elt[1]))
                     for elt in items_list))


def print_csv(item_list):
    with open('word_bag_stat.csv', 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=';')
        for item in item_list:
            csv_writer.writerow(item)


file = open('word_bag_text.txt', 'r', encoding='utf-8')
text = file.read()
file.close()

stemmer = pymystem3.Mystem(entire_input=False, speedup=True)
statistics = Counter()

if text == '\n':
    print('Your text is empty.')
else:
    lemmas = stemmer.lemmatize(text)
    statistics += Counter(lemmas)
    print()
    stat_sorted_list = statistics.most_common()
    print_most_common(stat_sorted_list)
    print_csv(stat_sorted_list)
def setUpClass(cls):
    mystem = pymystem3.Mystem(entire_input=False, disambiguation=True)
    cls.text_analyzer = Analyzer(mystem)
It copes better with out-of-dictionary words and includes a morphological
analyzer. It does not resolve homonymy, but that was implemented by others:
the PyPI project rnnmorph."""
import re
from collections import defaultdict, namedtuple

import pandas as pd
import numpy as np
import pymystem3

import constants
import write_lemmas_to_file

MYSTEM = pymystem3.Mystem(entire_input=False, disambiguation=True)
# Frequency distribution of lemmas
LEMMAS = defaultdict(lambda: 0)
# Lemmas and their hashes
LEMMAS_HASHES = dict()
# Filter for non-proper nouns (S) and all verbs (V)
PAT = re.compile('([SV]),(?!имя,|фам,|сокр=|гео)')

# FIXME: lemmatization serves two tasks:
#  - building the keyword list
#  - classifying description lines based on those keywords

# Simply adds a column with lemmas to the data
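# A hedged sketch (not from the original) of how the PAT filter could be
# applied to Mystem's analysis output; `first_matching_lemma` is a
# hypothetical helper, not part of the module above.
def first_matching_lemma(word):
    """Return the lemma of the first analysis whose grammar matches PAT."""
    for parse in MYSTEM.analyze(word):
        for variant in parse.get('analysis', []):
            # Mystem grammar strings look like 'S,жен,неод=им,ед'.
            if PAT.search(variant['gr']):
                return variant['lex']
    return None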
def __init__(self):
    wikipedia.set_lang("en")
    self.stem = pymystem3.Mystem()
    self.sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    self.sparql.setReturnFormat(JSON)
    self.ner = ner_detector.NerDetector()
def __init__(self, vocab_filename=None):
    self.__tokenizer = TreebankWordTokenizer()
    self.__mystem = pymystem3.Mystem()
def init(self):
    if self._mystem is None:
        self._mystem = pymystem3.Mystem()
        # Start the underlying mystem process eagerly.
        self._mystem.start()
        else:
            index = doc.split('|')[0]
            prob = doc.split('|')[1]
            docs[int(index)].append(topic_name + '|' + prob)

# Three snapshots of the top tokens per topic. Note: if last_tokens returns
# a cached object, these names alias the same dict.
all_tokens = model_artm.score_tracker['top_tokens_score'].last_tokens
ready_tokens = model_artm.score_tracker['top_tokens_score'].last_tokens
ngrams_tokens = model_artm.score_tracker['top_tokens_score'].last_tokens
for topic in ngrams_tokens:
    ngrams_tokens[topic] = []

# for topic in all_tokens.keys():
#     tokens = all_tokens[topic]
#     ready_tokens[topic] = tokens[:10]

ngrams = open('adapted.txt', mode='r', encoding='utf-8').read().split('\n')
lemmer = mystem.Mystem()
topicfile = open('topics.txt', mode='w', encoding='utf-8')
tokens = []
# Assign an n-gram to a topic if every lemmatized word of the n-gram
# occurs among the topic's top tokens.
for ngram in ngrams:
    if ngram == '':
        continue
    for topic in all_tokens.keys():
        tokens = all_tokens[topic]
        all_in = True
        for word in ngram.strip('\n').split(' '):
            if lemmer.lemmatize(word)[0] not in tokens:
                all_in = False
                break
        if all_in:
            ngrams_tokens[topic].append(ngram)
def parsing(BATCH_SIZE, dsl_dict, short, dialect):
    m = pymystem3.Mystem()
    idx_word = []
    words = []
    infos = []
    pos_list = []

    # Headword lines in the DSL dictionary are not indented.
    for i in range(3, 60001):
        s = dsl_dict[i]
        if not s.startswith('\t'):
            idx_word.append(i)
            words.append(s.strip())

    # Drop reference markers from the abbreviation list. (The original
    # deleted items from `short` while indexing into it.)
    short = [w for w in short if w not in ('см.', 'что-л.')]

    print('Number of words:', len(words))

    for q in range(len(idx_word) - 1):
        if q % 100 == 0:
            print(q, datetime.now())
        # The gloss is the text between the second and third bracketed tags.
        s = ' '.join(
            [w.strip() for w in dsl_dict[idx_word[q] + 1:idx_word[q + 1]]])
        s = re.search(r'\[.*?\].*?\[.*?\](.*?)\[.*?\]', s).group(1)
        s_list = s.split(' ')
        start = 0
        end = 0
        st_ch = 0
        # The original condition `if '2)' and '2.' not in s_list` only
        # tested the second operand; check both sense markers explicitly.
        if '2)' not in s_list and '2.' not in s_list:
            for i in range(len(s_list) - 1):
                for d in dialect:
                    if s_list[i].startswith(d):
                        start = i
                        st_ch = 1
                for l in short:
                    if s_list[i].startswith(l):
                        start = i
                        st_ch = 1
            if 'см.' in s_list:
                end = s_list.index('см.')
        else:
            try:
                end = s_list.index('2)')
            except ValueError:
                end = s_list.index('2.')
            for i in range(end - 1):
                for d in dialect:
                    if s_list[i].startswith(d):
                        start = i
                        st_ch = 1
                for l in short:
                    if s_list[i].startswith(l):
                        start = i
                        st_ch = 1
            if 'см.' in s_list:
                if end > s_list.index('см.'):
                    end = s_list.index('см.')

        result = []
        if start == 0 and st_ch == 0:
            start = -1
        if end != 0:
            if start > end:
                start = -1
            for i in range(start + 1, end):
                result.append(s_list[i])
        else:
            for i in range(start + 1, len(s_list) - 1):
                result.append(s_list[i])
        # Drop everything up to a closing parenthesis, if present.
        for r in result:
            if r.endswith(')'):
                result = result[result.index(r) + 1:]
        info = ' '.join(result)
        info = info.split(';')[0]
        if info.startswith('3 '):
            info = info.split('3 ')[1]
        if info.startswith('1.'):
            info = info.split('1.')[1]
        if info.endswith('.'):
            info = info.split('.')[0]
        infos.append(info.strip())

        # Part-of-speech guess: interjection marker, verb suffix, or Mystem.
        if 'межд.' in s_list:
            pos = 'ij'
        elif words[len(infos) - 1].endswith('-мӣ'):
            pos = 'v'
        else:
            pos = detect_pos(m, info)
        pos_list.append(pos)

    # Re-run POS detection in batches for entries that got no tag.
    final_list = []
    work_list = [j for j in range(len(pos_list) - 1)
                 if pos_list[j] == "nothing"]
    work_list = list(chunk(work_list, BATCH_SIZE))
    for el in work_list:
        text = [infos[q].split(' ')[0] for q in el]
        pos_l = detect_pos(m, ' '.join(text))
        for p in range(len(pos_l)):
            pos_list[el[p]] = pos_l[p]

    for i in range(len(words) - 1):
        final_list.append(words[i] + '\t' + infos[i] + '\t' + pos_list[i])
    return final_list
# coding=utf-8
from SPARQLWrapper import SPARQLWrapper, JSON
import pymystem3
import requests
import json
import re

m = pymystem3.Mystem()
relations = dict()

# Fetch DBpedia properties of persons together with their Wikidata
# equivalents (owl:equivalentProperty).
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setQuery("""
    SELECT distinct ((SUBSTR(str(?property), 29)) as ?property)
           ((SUBSTR(str(?equals), 32)) as ?equals)
    WHERE {{
        ?instance a dbo:Person .
        ?instance ?property ?obj .
        ?property owl:equivalentProperty ?equals .
        FILTER (SUBSTR(str(?equals), 1, 31) = "http://www.wikidata.org/entity/") .
    }}
""")
sparql.addParameter("timeout", "30000")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

for z in results['results']['bindings']:
    relations[z['property']['value']] = z['equals']['value']
print(len(relations), relations)

counter = 0
def __init__(self):
    self.__analyzer = pymystem3.Mystem()
def stemming(text):
    lemmas = stemmer.lemmatize(text, speedup=True)
    lemma_stat = Counter(lemmas).most_common()
    for lemma, count in lemma_stat:
        print('{}: {}'.format(count, lemma))
    print('\n#####\n')
    over_file.write('Number of tokens: {}\n'.format(len(lemmas)))
    over_file.write('Number of lemmas: {}\n\n'.format(len(lemma_stat)))


texts_file = open('SampleRU.txt', 'r', encoding='utf-8')
over_file = open('overview.txt', 'w', encoding='utf-8')
stemmer = pymystem3.Mystem(disambiguation=False, entire_input=False)

new_text = True
text_str = ''
for line in texts_file:
    # When the separator between texts appears, process the finished text
    # and reset the accumulators.
    if line == '####\n':
        stemming(text_str)
        new_text = True
        text_str = ''
        continue
    # Write the title of the text.
    elif new_text:
        over_file.write(line.upper())
        new_text = False
    # Accumulate the text into a single string.
    'вилка', 'винт', 'горшок',  # single sense in the dictionary
    'вата', 'бык', 'байка', 'баян', 'бомба',  # really single sense
    'борщ', 'воск', 'бухгалтер',
]

MyStem = pymystem3.Mystem()


def load_stopwords():
    with open('stopwords.txt') as f:
        return {line.strip().split()[0] for line in f if line.strip()}


stopwords = load_stopwords()


def load_contexts(root, word, window=None):
    with open(os.path.join(root, '{}.txt'.format(word))) as f:
        contexts = []
        for line in f:
            left, _, right = line.split('\t')