def without_pronouns(directory):
    """Strip pronouns (SPRO/APRO) from the 'text' field of every JSON file in *directory*.

    Each non-backup file is loaded as JSON, its whitespace-split terms are
    re-analyzed with Mystem, pronoun terms are dropped, and the file is
    rewritten in place with the filtered text plus the original metadata.
    """
    # Skip editor backup files ("~" suffix); list() so the result survives
    # repeated use under Python 3, where filter() is a one-shot iterator.
    input_files = list(filter(lambda x: not x.endswith('~'), os.listdir(directory)))
    output_data = {}
    m = Mystem()
    # walk over the documents
    for input_file in input_files:
        with open(directory + '/' + input_file) as data_file:
            data = json.load(data_file)
        terms = filter(lambda x: x not in ('', ' ', '\n'), data['text'].split(' '))
        list_of_terms = []
        for term in terms:
            # Analyze once per term (the original called m.analyze() twice,
            # doubling the work for every analyzable token).
            analysis = m.analyze(term)[0].get(u'analysis')
            if analysis:
                # drop personal (SPRO) and adjectival (APRO) pronouns
                if not analysis[0][u'gr'].startswith((u'SPRO', u'APRO')):
                    list_of_terms.append(term)
            else:
                list_of_terms.append(term)
        text_of_output = ' '.join(['%s' % term for term in list_of_terms])
        output_data[input_file] = {}
        output_data[input_file]['id'] = data['id']
        output_data[input_file]['positive'] = data['positive']
        output_data[input_file]['sarcasm'] = data['sarcasm']
        output_data[input_file]['text'] = text_of_output
        with open(directory + '/' + input_file, 'w') as output_file:
            json.dump(output_data[input_file], output_file)
def result():
    """Flask view: for each analyzed word of the submitted sentence, emit the
    first word from the global word list that carries the same grammar tag.

    Reads the sentence from the request args, writes matches to
    'sentence.txt', then re-reads that file and renders it.
    NOTE(review): relies on a global `clear_words` iterable plus Flask's
    `request`/`render_template` — none of them are defined in this block.
    """
    morph = MorphAnalyzer()  # NOTE(review): created but never used below
    if request.args:
        sent = request.args['sentence']
        m = Mystem()
        ana = m.analyze(sent)
        # output file is rebuilt on every request
        new_sent = open('sentence.txt', 'w', encoding='utf-8')
        for word in ana:
            if 'analysis' in word:
                # full grammar tag of the query word, e.g. "S,муж,неод=им,ед"
                forma_slova = word['analysis'][0]['gr']
                sent2 = clear_words
                for w in sent2:
                    ana2 = m.analyze(w)
                    try:
                        an_word = ana2[0]
                        if 'analysis' in an_word:
                            print(an_word)
                            forma_slova2 = an_word['analysis'][0]['gr']
                            # take the first candidate whose full tag matches
                            if forma_slova == forma_slova2:
                                new_sent.write(w + ' ')
                                break
                    except IndexError:
                        # analyze() returned an empty result for this word
                        pass
        new_sent.close()
        with open('sentence.txt', 'r', encoding='utf-8') as f:
            read_sent = f.read()
        return render_template('result.html', sentence=read_sent)
    return render_template('result.html')
def paral2():
    """Count male first names in the global text2.

    For every word of every sentence, walk Mystem's analysis structure and
    tally words whose grammar string carries both "муж" (masculine) and
    "имя" (first name) into the global Names2 dict.
    NOTE(review): `st`, `wt`, `text2` and `Names2` are globals defined
    elsewhere in the file.
    """
    for sentence in st.sentences_from_text(text2):  # sentences of text2
        for word in wt.tokenize(sentence):  # words of the current sentence
            m = Mystem()
            analize = m.analyze(word)  # morphological analysis of the word
            print(m.analyze(word))
            for i in analize:  # dig into the returned structure
                for j in i:
                    for k in i[j]:
                        # `key` iterates dict keys ('lex', 'gr', ...) when k is
                        # an analysis dict; renamed from `m`, which shadowed
                        # the Mystem instance above.
                        for key in k:
                            if "gr" in k:
                                for o in k[key]:
                                    # BUGFIX: the original tested
                                    # `"муж" and "имя" in k[m]`, which by
                                    # operator precedence only checks "имя";
                                    # both markers must be present.
                                    if "муж" in k[key] and "имя" in k[key]:
                                        if Names2.get(word) is None:  # first sighting
                                            Names2.update({word: 1})
                                        else:
                                            Names2[word] += 1
                                    break  # one check per value is enough
def paral1(q2, q22):
    """Worker process: count male first names for the sentence indices pulled
    from queue *q22*, then put the resulting Names2 dict on queue *q2*.

    NOTE(review): `sentence`, `text2_`, `wt` and `Names2` are globals not
    defined in this block; `xNames` is assigned but never used.
    """
    _Kx = q22.get()  # list of sentence indices assigned to this process
    for k in _Kx:
        xNames = text2_[k]
        for word in wt.tokenize(sentence):  # words of the current sentence
            m = Mystem()
            analize = m.analyze(word)  # morphological analysis of the word
            print(m.analyze(word))
            for i in analize:  # dig into the returned structure
                for j in i:
                    # renamed loop vars: the original reused `k` and `m`,
                    # shadowing the index element and the Mystem instance
                    for kk in i[j]:
                        for key in kk:
                            if "gr" in kk:
                                for o in kk[key]:
                                    # BUGFIX: the original tested
                                    # `"муж" and "имя" in ...`, which only
                                    # checks "имя"; require both markers.
                                    if "муж" in kk[key] and "имя" in kk[key]:
                                        if Names2.get(word) is None:  # first sighting
                                            Names2.update({word: 1})
                                        else:
                                            Names2[word] += 1
                                    break  # one check per value is enough
    q2.put(Names2)  # hand the tally back to the parent process
def lmtze(textfile):
    """Lemmatize an XML-ish corpus file with Mystem and write a
    '<sent>'-wrapped, lemma_POS-tagged version next to it (``*.lem.txt``).

    Each input line is parsed as an XML fragment; plain text chunks become
    ``lemma_POS`` tokens, child-element text is lemmatized and re-wrapped in
    its original tag. Lines that fail to parse are skipped entirely.
    """
    m = Mystem()
    text = open(textfile, encoding='utf-8').readlines()
    newfile = open(textfile.replace('txt', 'lem.txt'), 'w', encoding='utf-8')
    result_full = []
    for line in text:
        try:
            element = etree.fromstring(line.strip('\n'))
            text_ = element.xpath('text()')   # plain-text chunks between child tags
            entities = element.xpath('*')     # child elements (named entities)
            result = ['<sent>']
            while text_:
                l = text_.pop(0)
                # open('temp.txt', 'w', encoding='utf-8').write(l)
                # subprocess.call(['C:\\Mystem\\mystem', 'i'])
                l = m.analyze(l)
                # print(l)
                for x in l:
                    if x.get('analysis') is not None:
                        if x.get('analysis') == []:
                            # no analysis available: keep the raw token
                            result.append(x['text'])
                        else:
                            # lemma + coarse POS (first tag before ',' / '=')
                            result.append(x['analysis'][0]['lex'] + '_' +
                                          x['analysis'][0]['gr'].split(',')[0].split('=')[0])
                    else:
                        continue
                # text chunks and entities alternate: after each text chunk
                # (except the last) comes one entity element
                if text_:
                    e = entities.pop(0)
                    e_ = m.analyze(e.text)
                    result.append('<' + e.tag + '>')
                    for x in e_:
                        if x.get('analysis') is not None:
                            if x.get('analysis') == []:
                                result.append(x['text'])
                            else:
                                result.append(x['analysis'][0]['lex'])
                        else:
                            continue
                    result.append('</' + e.tag + '>')
        except Exception:
            # malformed line: drop it and move on
            continue
        result.append('</sent>')
        result_full.append(result)
        result = []
    print(len(result_full), ' разобралось')
    # serialize: opening tags glue to the next token, words get single spaces
    for sent in result_full:
        prev = ''
        for x in sent:
            if '<' in x and '/' not in x:
                # opening tag: no space after it
                newfile.write(prev + x)
                prev = ''
            elif '_' in x or x.isalpha():
                # word token: separated by single spaces
                newfile.write(prev + x)
                prev = ' '
            else:
                newfile.write(x)
        newfile.write('\n')
def extract(self):
    """Compute per-document tf and corpus idf for every JSON file in the
    input directory and write ``<name>_tf-idf`` files to the output directory.

    Returns True on success, False if any step raises.
    """
    try:
        # list() is essential: a bare filter() iterator would be exhausted by
        # the first pass, so the idf pass below would silently do nothing and
        # len(input_files) would raise — both swallowed by `except Exception`.
        input_files = list(filter(lambda x: not x.endswith('~'),
                                  os.listdir(self.input_directory)))
        output_data = {}
        list_of_all_terms = {}
        m = Mystem()
        # first pass: lemmatize each document and count terms
        for file in input_files:
            with open(self.input_directory + '/' + file) as data_file:
                data = json.load(data_file)
            list_of_terms = filter(lambda x: x != "",
                                   re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text']))
            text = " ".join(["%s" % term for term in list_of_terms])
            list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text))
            my_list = list_of_terms
            list_of_terms = []
            for term in my_list:
                # analyze once per term (the original called m.analyze() twice)
                analysis = m.analyze(term)[0].get(u'analysis')
                if analysis:
                    # keep content words longer than one character...
                    if not analysis[0][u'gr'].startswith(self.service_parts_of_speech) and len(term) > 1:
                        list_of_terms.append(term)
                    # ...and the negation particle, which carries sentiment
                    if term == u'не':
                        list_of_terms.append(term)
                else:
                    list_of_terms.append(term)
            output_data[file] = {}
            output_data[file]['id'] = data['id']
            output_data[file]['positive'] = data['positive']
            output_data[file]['sarcasm'] = data['sarcasm']
            output_data[file]['terms'] = {}
            # collapse duplicates into per-document counts
            for term in list_of_terms:
                if term not in output_data[file]['terms']:
                    output_data[file]['terms'][term] = 1
                else:
                    output_data[file]['terms'][term] += 1
            for term in output_data[file]['terms'].keys():
                # document frequency for idf
                if term not in list_of_all_terms:
                    list_of_all_terms[term] = 1
                else:
                    list_of_all_terms[term] += 1
                # tf for this document
                count_of_terms = output_data[file]['terms'][term]
                output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms),
                                                    'idf': 0,
                                                    'count': count_of_terms}
        # second pass: idf + write out the result
        for file in input_files:
            for term in output_data[file]['terms'].keys():
                output_data[file]['terms'][term]['idf'] = \
                    math.log(float(len(input_files))/list_of_all_terms[term])
            with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file:
                json.dump(output_data[file], output_file)
    except Exception:
        return False
    else:
        return True
class PyMyStemTagger:
    """Thin wrapper around Mystem producing (token, grammar-tag) pairs."""

    def __init__(self):
        self.tagger = Mystem()

    @staticmethod
    def _pairs(analyzed):
        """Yield (stripped token, gr-tag or 'NONLEX') for non-blank tokens."""
        for item in analyzed:
            token = item['text'].strip()
            if token in {' ', ''}:
                continue
            has_gr = 'analysis' in item and item['analysis']
            yield (token, item['analysis'][0]['gr'] if has_gr else 'NONLEX')

    def parse(self, sentence):
        """Tag every token of *sentence*; also echoes the raw analysis."""
        result = self.tagger.analyze(sentence)
        print(result)
        return list(self._pairs(result))

    def tag_word(self, word):
        """Tag only the first token produced for *word*."""
        result = self.tagger.analyze(word)
        return list(self._pairs(result[:1]))
def verbs_statistics(text):
    '''
    Collect verb statistics for *text*.

    :param text: raw text
    :return: frequency data: all POS counts, verb count, verb ratio, verb
             lemmas, transitivity split and aspect split (via the *_counter
             helpers defined elsewhere in this file)
    '''
    analyzer = Mystem()  # analyzer instance
    ana = analyzer.analyze(text)
    print(ana)
    print(len(ana))
    # debug dump: show analyzable items, print 0 for the rest
    for item in ana:
        try:
            item['analysis']
        except KeyError:
            print(int())
        else:
            print(item)

    def _usable(item):
        # non-blank token that actually carries an analysis
        return bool(item['text'].strip()) and 'analysis' in item and item['analysis']

    # coarse POS tag of every usable token
    pos = [item['analysis'][0]['gr'].split('=')[0].split(',')[0]
           for item in ana if _usable(item)]
    # first reading of every usable token tagged as a verb
    verbs = [item['analysis'][0] for item in ana
             if _usable(item)
             and item['analysis'][0]['gr'].split('=')[0].split(',')[0] == 'V']
    print(verbs)
    print(type(pos))
    all_pos, v, ratio = pos_counter(pos)
    lemms = lemma_counter(verbs)
    tr, intr = trans_couter(verbs)
    s, ns, amb = aspect_counter(verbs)
    return all_pos, v, ratio, lemms, tr, intr, s, ns, amb
def __init__(self, path, doc_id, limit):
    """
    Build per-sentence POS-count rows for the first *limit* sentences.

    :param doc_id: numerical id of a document, pass manually
    """
    self.text = open(path).read().lower().replace('\n', '.')
    # need a better regex
    self.sentences = [s for s in re.split(r'(?:[.]\s*){3}|[.?!]', self.text)
                      if s and len(s.split()) > 2]
    self.pos_data = []
    self.testing_data = []
    self.id = doc_id
    mystem = Mystem()
    counter = Counter(DEFAULTS)
    # clamp the sentence limit to what is actually available
    if not limit or limit > len(self.sentences):
        limit = len(self.sentences)
    for sentence in self.sentences[:limit]:
        # POS of each analyzable word: strip '(' / ',' / '=' suffixes
        tags = []
        for word in mystem.analyze(sentence):
            analysis = word.get('analysis', None)
            if analysis:
                tags.append(analysis[0]['gr'].split('(')[0].split(',')[0].split('=')[0])
        counter.update(tags)
        self.pos_data.append([counter[key] for key in sorted(counter)])
        counter = Counter(DEFAULTS)  # reset for the next sentence
def mystem_normalizer(texts, batch_size=150, mapping=mystem2upos):
    """
    Normalizer (lemmatisation and PoS tagging) with Mystem backend.

    :param texts: documents; assumed to support .apply (e.g. a pandas Series)
    :param batch_size: number of documents analyzed per Mystem call
    :param mapping: tag-conversion table handed to pos_extractor
    :return: generator of normalized strings, one per input document
    """
    analyzer = Mystem()  # not very good place to store it.
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        # '$' is used as the document separator, so strip it from the texts
        joined = ' $ '.join(chunk.apply(lambda x: x.replace('\n', '').replace('$', '')))
        tokens = []
        for word in analyzer.analyze(joined):
            if word['text'] == '$':
                # document boundary: flush what was collected so far
                yield ' '.join(tokens)
                tokens = []
                continue
            try:
                token = word['analysis'][0]
            except (KeyError, IndexError):
                continue
            tokens.append(pos_extractor(token, mapping=mapping))
    yield ' '.join(tokens)
def index(name = None):
    """Flask view: find corpus jokes that mention every animate noun of the
    submitted joke.

    Extracts animate lexemes from the query via Mystem, scans 'corp.txt' for
    entries containing all of them, writes matches to 'ans.txt' and renders
    them.
    """
    if request.args:
        story = request.args['joke']
        mystem = Mystem()
        gramm = mystem.analyze(story)
        characters = set()
        for i in gramm:
            # fragile: searches the dict's *string repr* for the animacy
            # marker "од=" while excluding inanimate "неод=" —
            # NOTE(review): depends on the repr formatting of the analysis
            if (str(i).find("од=") != -1) and (str(i).find("неод=") == -1):
                # slice the lexeme out of "...'lex': '<lemma>'..."
                s1 = str(i)[str(i).find("'lex': '") + 8:]
                characters.add(s1[:s1.find( "'")])
        file = open("corp.txt", 'r', encoding = "UTF-8")
        # drop the first char (BOM), split the corpus into jokes
        f = file.read()[1:].split('\n\n')
        file.close()
        file = open("ans.txt", 'w', encoding = "UTF-8")
        for i in f:
            # bag of lower-cased words with punctuation stripped
            words = ((re.sub('[,\.\?\!\—\-\(\)\:\;]', '', i)).lower()).split(' ')
            # keep the joke only if it contains every extracted character
            if characters <= set(words):
                f = file.write(i + '\n\n')
        file.close()
        with open("ans.txt", "r", encoding='utf-8') as f:
            content = f.read().split('\n\n')
        return render_template("index.html", content=content)
    return render_template('index.html')
def mystem_new(file, msg):
    """Rewrite *file* replacing selected content words via external services.

    Walks the Mystem analysis of the file text; for each lower-case
    adjective/noun/verb it queries sketch_engine/dic_bin_codes and, when a
    candidate's code equals the head of *msg*, substitutes phpmorphy's form
    of that candidate and consumes one element of *msg* (mutating the
    caller's list). Output goes to ``<file minus extension>_output.txt``.
    """
    bigdata = {}  # NOTE(review): populated nowhere — dead variable
    f = open(file, 'r', encoding='utf-8')
    text = f.read()
    f.close()
    # Russian upper-case letters, used to skip capitalized words
    capitals = [
        'Й', 'Ц', 'У', 'К', 'Е', 'Н', 'Г', 'Ш', 'Щ', 'З', 'Х', 'Ф', 'Ы', 'В',
        'А', 'П', 'Р', 'О', 'Л', 'Д', 'Ж', 'Э', 'Я', 'Ч', 'С', 'М', 'И', 'Т',
        'Б', 'Ю'
    ]
    m = Mystem()
    f1 = open(file[:-4] + '_output.txt', 'w', encoding='utf-8')
    analyse = m.analyze(text)
    pos = ['A', 'S', 'V']  # adjectives, nouns, verbs
    for one in analyse:
        if len(msg) > 0:  # stop substituting once msg is exhausted
            if 'analysis' in one and len(one['analysis']) != 0 and one[
                    'analysis'][0]['gr'][0] in pos:
                if one['text'][0] not in capitals:
                    sub_dic = one['analysis']
                    for value in sub_dic:
                        if 'lex' in value:
                            short = sketch_engine(value['lex'])
                            if len(short) > 1:
                                tr_text = dic_bin_codes(short)
                                for smt in tr_text:
                                    if len(msg) > 0:
                                        # replace when the candidate's code
                                        # equals the next pending msg element
                                        if tr_text[smt] == msg[0]:
                                            print(one['text'])
                                            print(smt)
                                            one['text'] = phpmorphy(smt, one)
                                            msg.remove(msg[0])
                                            break
        # every token (replaced or not) is written back out
        f1.write(one['text'])
    f1.close()
def tag_mystem(text='Текст нужно передать функции в виде строки!'):
    """Lemmatize *text* with Mystem and return (tagged_lemmas, cleaned_text).

    Semicolon-separated segments containing 'итература' (bibliography lines)
    are removed first. Stopword lemmas are skipped; each remaining lemma is
    suffixed with its mapped POS tag, or '_X' for tags not in the mapping.
    """
    analyzer = Mystem()
    #print(text)
    # drop bibliography segments, then glue the rest back together
    text = ''.join([seg for seg in text.split(';') if ('итература' not in seg)])
    #print(text)
    tagged = []
    for w in analyzer.analyze(text):
        try:
            if not w["analysis"]:
                continue
            lemma = w["analysis"][0]["lex"].lower().strip()
            if lemma in russian_sw:
                continue
            pos = w["analysis"][0]["gr"].split(',')[0].split('=')[0].strip()
            # convert the tag; fall back to 'X' for unmapped tags
            suffix = mapping[pos] if pos in mapping else 'X'
            tagged.append(lemma + '_' + suffix)
        except KeyError:
            continue
    return tagged, text
def post(self):
    """Analyze the 'text' field of the JSON request body with Mystem and
    return the raw analysis as JSON."""
    payload = request.get_json(force=True)
    analysis = Mystem().analyze(payload['text'])
    return jsonify({"analysis": analysis})
def tag_mystem(mapping, text="Текст нужно передать функции в виде строки!"):
    """Return a list of 'lemma_POS' strings for *text*.

    Lemmas found in russian_stopwords are dropped; POS tags missing from
    *mapping* become 'X'. Tokens without an analysis are skipped (this also
    covers punctuation).
    """
    processed = Mystem().analyze(text)
    tagged = []
    for w in processed:
        try:
            analysis = w["analysis"]
            if not analysis:
                continue
            lemma = analysis[0]["lex"].lower().strip()
            # coarse POS: text before the first ',' and '='
            pos = analysis[0]["gr"].split(",")[0].split("=")[0].strip()
        except KeyError:
            # missing 'analysis'/'lex'/'gr' key — skip the token
            continue
        if lemma in set(russian_stopwords):
            continue
        # convert the tag; 'X' marks tags absent from the mapping
        tagged.append(lemma + "_" + (mapping[pos] if pos in mapping else "X"))
    return tagged
def from_freq(string):
    '''
    take a word
    return [ipm, r, d] from the 2011 frequency dictionary, or ['', '', '']
    when the word is missing from the list or cannot be analyzed
    '''
    big_ru = {}
    start = time.time()
    # NOTE: the dictionary is re-read on every call; hoist it if this gets hot
    with open('./Freq2011/freqrnc2011.csv') as rus:
        ru = rus.readlines()[1:]  # skip the header row
    for line in ru:
        lemma, pos, ipm, r, d, doc = line.split('\t')
        big_ru[lemma + ',' + pos] = [ipm, r, d]
    print("dictionary: %s seconds" % (time.time() - start))
    start_time = time.time()
    mystem = Mystem()
    mystemmed = mystem.analyze(string)
    print("pymystem: %s seconds" % (time.time() - start_time))
    try:
        # BUGFIX: the original indexed ['analysis'][0] outside any guard, so
        # punctuation / unknown tokens crashed instead of returning defaults;
        # the bare `except:` is also narrowed to the lookups that can fail.
        lemma_mystem = mystemmed[0]['analysis'][0]['lex']
        pos_mystem = mystemmed[0]['analysis'][0]['gr'].split('=')[0].split(
            ',')[0].lower()
        return big_ru[lemma_mystem + ',' + pos_mystem]
    except (KeyError, IndexError):
        # unanalyzable word, or not present in the frequency list
        return ['', '', '']
def __init__(self, path):
    """Load *path* and build a per-sentence POS-count matrix in self.data.

    Column order: [A, ADV, PR, S, V]; tags are matched by substring on
    Mystem's 'gr' string, first hit wins (so e.g. 'SPRO' lands in the 'S'
    bucket — original behavior, preserved).
    """
    self.text = open(path).read().lower()
    self.sentences = [s for s in re.split(r'(?:[.]\s*){3}|[.?!]', self.text)
                      if len(s) > 1]
    self.pos_data = []
    analyzer = Mystem()
    # ordered substring checks, replicating the original if/elif chain
    order = ('S', 'ADV', 'A', 'V', 'PR')
    slot = {'S': 3, 'ADV': 1, 'A': 0, 'V': 4, 'PR': 2}
    for sentence in self.sentences:
        counts = [0, 0, 0, 0, 0]
        for word in analyzer.analyze(sentence):
            analysis = word.get('analysis', None)
            if not analysis:
                continue
            gr = analysis[0]['gr']  # most probable reading's tag
            for tag in order:
                if tag in gr:
                    counts[slot[tag]] += 1
                    break
        self.pos_data.append(counts)
    self.data = np.array(self.pos_data)
def get_patterns():
    """Extract recurring POS-tag patterns from 'expert_phrases.txt'.

    Analyzes the file with Mystem, converts each newline-delimited phrase
    into a space-joined sequence of coarse POS tags, writes all sequences to
    'phrases_morphologe.txt', and returns (patterns, counts) restricted to
    sequences that occur more than once.
    """
    with open('expert_phrases.txt') as f:
        t = f.read()
    my = Mystem()
    analyz = my.analyze(t)
    fras = []   # finished tag sequences, one per phrase
    word = []   # running list of tags across all phrases
    k = 0       # total tags emitted so far
    l = 0       # tags emitted for the current phrase
    # Mystem output alternates word / separator items, hence the step of 2.
    for i in range(0, len(analyz), 2):
        if analyz[i].get('analysis', 1) == 1:
            # item without an 'analysis' key: phrase-boundary bookkeeping only
            if analyz[i + 1].get('text').find('\n') != -1:
                fras.append(' '.join(word[k - l:k]))
                l = 0
            continue
        if analyz[i].get('analysis') != []:
            a = analyz[i].get('analysis')[0].get('gr')
            # coarse POS = text before the first ',' or '=', whichever comes first
            if a.find(',') != -1:
                if a.find(',') < a.find('='):
                    word.append(a[0:a.find(',')])
                    k += 1
                    l += 1
                else:
                    word.append(a[0:a.find('=')])
                    k += 1
                    l += 1
            else:
                word.append(a[0:a.find('=')])
                k += 1
                l += 1
        # a newline in the following separator closes the current phrase
        if analyz[i + 1].get('text').find('\n') != -1:
            fras.append(' '.join(word[k - l:k]))
            l = 0
    f = open('phrases_morphologe.txt', 'w')
    for i in range(len(fras)):
        f.write(fras[i] + '\n')
    f.close()
    print('phrases_morphologe.txt are written')
    fras.sort()
    k = 1
    paterns = []
    kol = []
    # count identical adjacent sequences in the sorted list
    for i in range(len(fras) - 1):
        if fras[i] == fras[i + 1]:
            k += 1
        else:
            paterns.append(fras[i])
            kol.append(k)
            k = 1
    i = 0
    # keep only sequences that occurred at least twice
    while i < len(paterns):
        if kol[i] == 1:
            paterns.pop(i)
            kol.pop(i)
        else:
            i += 1
    return paterns, kol
def normalize(self, texts, path):
    '''Normalize texts in DataFrame object.

    Lower-cases each text, strips markup and non-word characters, lemmatizes
    with Mystem, removes stopwords/punctuation, keeps only Russian nouns
    plus English tokens, and returns the cleaned strings as a list.
    '''
    # load the two stopword lists; failures are reported but not fatal
    try:
        with open(path + 'rus_stop_dict.txt') as f:
            russian_stopwords = [line.rstrip('\n') for line in f]
    except Exception as err:
        print(err)
    try:
        with open(path + 'eng_stop_dict.txt') as f:
            english_stopwords = [line.rstrip('\n') for line in f]
    except Exception as err:
        print(err)
    mystem = Mystem()
    rus_r = re.compile("[а-я]+")
    eng_r = re.compile("[a-z]+")
    text_list = []
    for text in texts:
        # strip HTML comments, then collapse digits/non-word chars to spaces
        cleaned = re.sub("(\\d|\\W)+", " ", re.sub("<!--?.*?-->", "", text.lower()))
        tokens = mystem.lemmatize(cleaned)
        rus_tokens = [w for w in filter(rus_r.match, tokens)
                      if w not in russian_stopwords and w != " "
                      and w.strip() not in punctuation]
        eng_tokens = [w for w in filter(eng_r.match, tokens)
                      if w not in english_stopwords and w != " "
                      and w.strip() not in punctuation]
        # take only substantives (nouns) among the Russian tokens
        rus_sub_tokens = []
        for token in rus_tokens:
            try:
                if mystem.analyze(token)[0]['analysis'][0]['gr'][0] == 'S':
                    rus_sub_tokens.append(token)
            except:
                pass  # token without analysis — drop it
        text_list.append(" ".join(rus_sub_tokens) + ' ' + " ".join(eng_tokens))
    return text_list
def get_inf(text):
    """Return the lemma of every analyzable token in *text*."""
    lemmas = []
    for item in Mystem().analyze(text):
        try:
            lemmas.append(item['analysis'][0]['lex'])
        except:
            pass  # no analysis (punctuation, whitespace, unknown word)
    return lemmas
def make_clear_text(text):
    """Return the surface forms of tokens Mystem recognizes as words.

    Drops punctuation, digits and other special symbols, since only items
    carrying an 'analysis' key are kept.
    """
    analyzed = Mystem().analyze(text.lower())
    return [item['text'] for item in analyzed if 'analysis' in item]
def with_not(directory):
    """Merge the negation particle 'не' into the following word for every
    JSON document in *directory*, rewriting each file in place.

    'не' attaches to the next word unless that word is an intensifier from
    *helping_words*, in which case it attaches to the word after it.
    Nouns (gr starting with 'S,') are never merged.
    """
    # list() so the result is indexable and len()-able under Python 3
    input_files = list(filter(lambda x: not x.endswith('~'), os.listdir(directory)))
    output_data = {}
    m = Mystem()
    # walk over the documents
    for input_file in input_files:
        with open(directory + '/' + input_file) as data_file:
            data = json.load(data_file)
        list_of_terms = list(filter(lambda x: x not in ('', ' ', '\n'),
                                    data['text'].split(' ')))
        # collect (particle_index, target_index) pairs for "не + word"
        nums_of_bigrams = []
        helping_words = [u'совсем', u'очень', u'слишком', u'самый']
        n = len(list_of_terms)
        for i in range(0, n):
            if list_of_terms[i] != u'не':
                continue
            # BUGFIX: the original indexed i+1 / i+2 without bounds checks and
            # raised IndexError when 'не' appeared at the end of a document.
            if i + 1 < n and list_of_terms[i+1] not in helping_words:
                if m.analyze(list_of_terms[i+1])[0].get(u'analysis'):
                    if not m.analyze(list_of_terms[i+1])[0][u'analysis'][0][u'gr'].startswith(u'S,'):
                        nums_of_bigrams.append((i, i+1))
            elif i + 2 < n and list_of_terms[i+1] in helping_words:
                if m.analyze(list_of_terms[i+2])[0].get(u'analysis'):
                    if not m.analyze(list_of_terms[i+2])[0][u'analysis'][0][u'gr'].startswith(u'S,'):
                        nums_of_bigrams.append((i, i+2))
        # merge each pair and blank out the consumed slots
        for first, second in nums_of_bigrams:
            if first + 1 == second:
                list_of_terms[first] = list_of_terms[first] + list_of_terms[second]
                list_of_terms[second] = ''
            elif first + 2 == second:
                list_of_terms[first] = list_of_terms[first] + list_of_terms[second]
                list_of_terms[second - 1] = ''
                list_of_terms[second] = ''
        list_of_terms = filter(lambda x: x != '', list_of_terms)
        text_of_output = ' '.join(['%s' % term for term in list_of_terms])
        output_data[input_file] = {}
        output_data[input_file]['id'] = data['id']
        output_data[input_file]['positive'] = data['positive']
        output_data[input_file]['sarcasm'] = data['sarcasm']
        output_data[input_file]['text'] = text_of_output
        with open(directory + '/' + input_file, 'w') as output_file:
            json.dump(output_data[input_file], output_file)
def mystem(sentence):
    """Lemmatize *sentence* and keep only lemmas Mystem can analyze.

    Each lemma from lemmatize() is re-analyzed individually, which filters
    out the whitespace/punctuation tokens lemmatize() also emits.
    """
    analyzer = Mystem()
    kept = []
    for lemma in analyzer.lemmatize(sentence):
        for item in analyzer.analyze(lemma):
            if 'analysis' in item:
                kept.append(lemma)
    return kept
def pos_analyze(text):
    """Return the coarse POS tag of each analyzable token in *text*."""
    key = 'analysis'
    tags = []
    for record in Mystem().analyze(text):
        if record.get(key):
            grammar = record[key][0]['gr']
            # POS is everything before the first ',' or '='
            tags.append(re.split(',|=', grammar)[0])
    return tags
def getA(text):
    '''Return lemmas of tokens whose grammar tag starts with 'A'.

    Note: only the first character of 'gr' is checked, so this matches
    adjectives and also ADV/ANUM/APRO tags — original behavior, preserved.
    '''
    lemmas = []
    for item in Mystem().analyze(text):
        try:
            first_char = item['analysis'][0]['gr'][0]
        except:
            first_char = ''  # no analysis for this token
        if first_char == 'A':
            lemmas.append(item['analysis'][0]['lex'])
    return lemmas
def ms(text):
    """Return "word POS,grammar" strings for every analyzable token in *text*.

    Each entry is the lower-cased surface form, a space, the coarse POS tag,
    a comma, and Mystem's full 'gr' string.
    """
    ana = Mystem().analyze(text)
    arr = []
    for word in ana:
        if 'analysis' in word and len(word['analysis']) > 0:
            gr = word['analysis'][0]['gr']
            pos = gr.split('=')[0]
            # BUGFIX: the original also computed gr.split('=')[1] into an
            # unused local, which raised IndexError on tags without '=';
            # the dead computation is removed.
            arr.append(word['text'].lower() + ' ' + pos + ',' + gr)
    return arr
def pos_counter(text):
    '''
    Count parts of speech in *text*.

    :param text: raw text to analyze
    :return: Counter mapping coarse POS tag -> frequency
    '''
    analyzed = Mystem().analyze(text)
    tags = []
    for item in analyzed:
        # skip blanks and tokens without an analysis
        if item['text'].strip() and 'analysis' in item and item['analysis']:
            tags.append(item['analysis'][0]['gr'].split('=')[0].split(',')[0])
    return Counter(tags)
class MystemTextAnalyzer(TextAnalyzer):
    """TextAnalyzer backed by Mystem: fills cas.tokens with TokenAnnotations
    carrying lemma, POS and the grammeme set parsed from Mystem's 'gr' tag."""

    def __init__(self):
        self._mystem = Mystem()

    def process(self, cas):
        """Analyze cas.input_text and store one TokenAnnotation per token
        (whitespace-only tokens are dropped)."""
        src_text = cas.input_text
        infos = self._mystem.analyze(src_text)
        cas.tokens = [
            tanno
            for tanno in (self._extract_token_anno(i) for i in infos) if tanno
        ]

    # splits a 'gr' tag like "S,муж,неод=им,ед" into individual grammemes
    _GRAMMEME_SEP_RE = re.compile('[,=|()]')
    # POS is the leading word of the tag
    _POS_RE = re.compile('^\w+')

    @classmethod
    def _extract_token_anno(cls, info):
        """Build a TokenAnnotation from one Mystem item, or None for blanks.

        Falls back to the raw (lower-cased) text when Mystem returned no
        analysis or an empty analysis list.
        """
        if 'analysis' in info:
            morph_arr = info['analysis']
            if morph_arr:
                # take the first (most probable) reading
                morph_item = morph_arr[0]
                lemma = morph_item['lex']
                tag = morph_item['gr']
                pos = None
                if tag is not None:
                    pos = cls._extract_pos(tag)
                else:
                    # tag is None, so we should force the lower case
                    lemma = lemma.lower()
                # lemma = cls._join_lemma_pos(lemma, tag)
                token_anno = TokenAnnotation(lemma, pos)
                if tag:
                    token_anno.grammemes = set(cls._GRAMMEME_SEP_RE.split(tag))
                    # splitting can yield an empty string at tag boundaries
                    if '' in token_anno.grammemes:
                        token_anno.grammemes.remove('')
                else:
                    token_anno.grammemes = set()
                return token_anno
        # in other cases: no analysis OR empty analysis results => fallback to original text
        lemma = info['text'].strip()
        if lemma:
            lemma = lemma.lower()
            token_anno = TokenAnnotation(lemma, None)
            token_anno.grammemes = set()
            return token_anno
        else:
            # whitespace-only token: produce nothing
            return None

    @classmethod
    def _extract_pos(cls, tag):
        """Return the leading POS word of *tag*, or None if absent."""
        pos_match = cls._POS_RE.search(tag)
        return pos_match.group(0) if pos_match else None
def analyzer():
    """Print every distinct word from parsing() that Mystem tags as a verb."""
    source = set()
    for elem in parsing():
        source.add(elem.lower())
    mystem = Mystem()
    for item in source:
        result = mystem.analyze(item)
        # BUGFIX: the original serialized the analysis with json.dumps and
        # tested `'V' in line`, which also matched ADV/ADVPRO/... tags, and
        # raised KeyError for words without an 'analysis' entry. Check the
        # POS field of the first reading instead.
        if not result:
            continue
        analysis = result[0].get('analysis')
        if not analysis:
            continue
        pos = analysis[0]['gr'].split('=')[0].split(',')[0]
        if pos == 'V':
            print(item)
def processNews(source_path, source_name, headers, start_date, end_date, filesFolderPathPrefix, table_path):
    """Crawl daily article links between two dates, save raw and Mystem-
    processed article texts into per-month folders, and append one metadata
    row per article to the global `table` file handle.

    NOTE(review): `table` is a global file object (the table_path parameter
    is unused) and it is closed at the end, so the function is single-use.
    """
    for single_date in daterange(start_date, end_date):
        date = single_date.strftime("%Y/%m/%d")
        links = getLinks(source_path, headers, date)
        for link in links:
            url = source_path + link
            try:
                f = urllib.request.urlopen(url)
            except urllib.request.HTTPError as e:
                # dead link: skip. NOTE(review): non-404 errors fall through
                # with `f` unbound and raise NameError below — confirm intent
                if e.code == 404:
                    continue
            html_page = f.read().decode('utf-8')
            tree = lxml.html.fromstring(html_page)
            author, title, article = getArticleInfo(tree)
            # word count, ignoring bare em-dashes
            tokens = article.split()
            words = []
            for token in tokens:
                if token != "—":
                    words.append(token)
            wordcount = len(words)
            # raw article text under <prefix>/<YYYY/MM>/<title>.txt
            path = filesFolderPathPrefix + date[0:7]
            if not os.path.exists(path):
                os.makedirs(path)
            file = open(path + "/" + title + ".txt", "w", encoding="utf-8")
            file.write(article)
            file.close()
            # lemmatized text + full analysis dump in a parallel 'mystem' folder
            m = Mystem()
            lemmas = m.lemmatize(article)
            article_lemmatized = ''.join(lemmas)
            article_analyzed = m.analyze(article)
            path2 = filesFolderPathPrefix + date[0:7] + ' ' + 'mystem'
            if not os.path.exists(path2):
                os.makedirs(path2)
            file_mystem = open(path2 + "/" + title + ".txt", 'w', encoding="utf-8")
            file_mystem.write(article_lemmatized + '\n' + '\n' + '\n' + str(article_analyzed))
            file_mystem.close()
            # one CSV-ish metadata row per article
            table.write(";".join(
                [source_name, path, author, date, title, url,
                 str(wordcount)]) + "\n")
    table.close()
def remove_verbs(text_ru):
    """Return *text_ru* with every verb token removed.

    Non-word items (spaces, punctuation) carry no analysis and are always
    kept, so the original spacing is preserved.
    """
    kept = []
    for element in Mystem().analyze(text_ru):
        analysis = element.get('analysis')
        # a token is a verb when its first reading's tag starts with 'V'
        is_verb = bool(analysis) and analysis[0]['gr'][0] == 'V'
        if not is_verb:
            kept.append(element['text'])
    return "".join(kept)
def mystem_tokenize(text):
    """Tokenize *text* with a lazily-created, module-cached Mystem instance.

    Returns the parsed chunks aligned back onto the original text.
    """
    from pymystem3 import Mystem
    global MYSTEM
    if not MYSTEM:
        # shared instance; options trimmed down to plain tokenization
        MYSTEM = Mystem(
            grammar_info=False,
            entire_input=True,
            disambiguation=False,
            weight=False
        )
    analyzed = MYSTEM.analyze(text)
    return find_substrings(parse_mystem(analyzed), text)
class MystemTokenizer:
    """Callable tokenizer backed by a Mystem instance configured for plain
    tokenization (no grammar info, no disambiguation, no weights)."""

    label = 'mystem'

    def __init__(self):
        from pymystem3 import Mystem
        self.analyzer = Mystem(grammar_info=False,
                               entire_input=True,
                               disambiguation=False,
                               weight=False)

    def __call__(self, text):
        parsed = parse_mystem(self.analyzer.analyze(text))
        return find_substrings(parsed, text)
def process_mystem(words, lang):
    """Write a token/lemma/POS TSV for *words* to '<lang>_processed.txt'.

    Whitespace items and tokens without an analysis are skipped silently.
    """
    analysis = Mystem().analyze(words)
    with open(lang + '_processed.txt', 'w', encoding='utf-8') as file:
        for elem in analysis:
            token = elem['text']
            if token == ' ' or token == '\n':
                continue
            try:
                lemma = elem['analysis'][0]['lex']
                pos_tag = elem['analysis'][0]['gr'].split(',')[0].split(
                    '=')[0]
            except:
                continue  # no analysis for this token
            file.write('%s\t%s\t%s\n' % (token, lemma, pos_tag))
def count_words(posts, needed_pos=None):
    '''
    Count lemmas in *posts*.

    :param posts: text to analyze
    :param needed_pos: parts of speech whose lemmas should be counted
    :type posts: str
    :type needed_pos: list
    :return: frequency dict {lemma: count}, sorted by descending frequency
    :rtype: dict
    '''
    analyzed = Mystem().analyze(posts)
    # first reading of every non-blank, analyzable token
    words = []
    for item in analyzed:
        if item['text'].strip() and 'analysis' in item and item['analysis']:
            words.append(item['analysis'][0])
    if needed_pos is not None:
        # keep only readings whose coarse POS is requested
        words = [w for w in words
                 if w['gr'].split('=')[0].split(',')[0] in needed_pos]
    return lemma_counter(words)
def pos_bi(text):
    """Return per-sentence lists of lower-cased POS tags for *text*.

    Sentences come from sent_tokenize; each word's tag is Mystem's 'gr'
    string truncated at the first '=', '|' or ','.
    """
    pos_tags = []
    m = Mystem()
    sents = sent_tokenize(text)
    for sent in sents:
        sent_an = []
        analy = m.analyze(sent)
        for x in analy:
            try:
                if 'analysis' in x.keys():
                    tag = x['analysis'][0]['gr']
                    sent_an.append(re.sub(r'[=|,].*', '', tag).lower())
            except IndexError:
                # present-but-empty analysis list
                pass
        pos_tags.append(sent_an)
    # BUGFIX: the original ended with `return pos_bi`, returning the function
    # object itself instead of the collected tags.
    return pos_tags
def build_pos(self):
    """Append one sorted POS-count vector per document to self.pos_data."""
    analyzer = Mystem()
    for doc in self.documents:
        counter = Counter(DEFAULTS)  # fresh counter per document
        tags = []
        for word in analyzer.analyze(doc.text):
            analysis = word.get('analysis', None)
            if analysis:
                # coarse tag: strip '(' / ',' / '=' suffixes
                tags.append(analysis[0]['gr'].split('(')[0].split(',')[0].split('=')[0])
        counter.update(tags)
        self.pos_data.append([counter[key] for key in sorted(counter)])
def produce_lemmas(connection, tableName, outputTableName):
    """Re-populate *outputTableName* with Mystem lemma/grammar rows derived
    from every concept in *tableName*.

    NOTE(review): table names and values are interpolated straight into SQL
    strings (%-formatting via prepare_content) — safe only if the names and
    prepare_content's escaping are fully trusted.
    """
    mystem = Mystem()
    cursor = connection.cursor()
    inserter = connection.cursor()
    # start from a clean slate
    query = 'DELETE FROM `%s`' % outputTableName
    inserter.execute(query)
    connection.commit()
    query = 'SELECT * FROM `%s`' % tableName
    cursor.execute(query)
    # template: resolves the word-class id by abbreviation while inserting
    query = 'INSERT INTO `' + outputTableName + '` (`' + tableName + '_id`, `word_class_id`, `lex`, `gr`)' \
            'SELECT %i, `id`, "%s", "%s" FROM `word_classes` WHERE `abbr`="%s"'
    for id, concept, scheme in cursor:
        lemmas = mystem.analyze(concept)
        for lemma in lemmas:
            # one row per alternative reading; items without 'analysis' yield none
            for analysis in lemma.get('analysis', []):
                inserter.execute(query % prepare_content(id, analysis))
        connection.commit()
    cursor.close()
def fill_mystem():
    """Populate sentence.mystem for every sentence (Python 2 code).

    Each token becomes one line "<text> <lex1> <gr1> [<lex2> <gr2> ...]";
    tokens Mystem could not analyze become "<text> <text> ?" (multi-word
    chunks are split into one such line per fragment).
    """
    from pymystem3 import Mystem
    m = Mystem()
    for sentence in get_sentences(1):
        lemmas = m.analyze(sentence.source)
        items = list()
        for lemma in lemmas:
            text = lemma['text']
            analysis = lemma.get('analysis')
            if not analysis:
                # unanalyzed chunk: whitespace, delimiters, or several
                # space-separated fragments
                text = text.strip()
                if not len(text):
                    print 'spaces = "%s"' % text
                    continue
                if ' ' in text:
                    # split a multi-word chunk into individual '?' items
                    for item in re.split('\s+', text):
                        items.append("%s %s ?" % (item, item))
                    print 'several =', "|".join(re.split('\s+', text))
                    continue
                print 'delimiter = "%s"' % text
                items.append("%s %s ?" % (text, text))
                continue
            # sanity checks: analyzed tokens are non-empty single words
            if not len(text.strip()):
                raise Exception('Impossible')
            if ' ' in text:
                raise Exception('Impossible')
            lexemes = list()
            for lexeme in analysis:
                print 'lex=', lexeme.get('lex', '-')
                print 'gr=', lexeme.get('gr', '-')
                lexemes.append("%s %s" % (lexeme['lex'], lexeme['gr']))
            items.append("%s %s" % (text, ' '.join(lexemes)))
        sentence.mystem = '\n'.join(items)
        sentence.save()
__author__ = 'Bogdan' # encoding=utf-8 from pprint import pprint from pymystem3 import Mystem import codecs, re mystem = Mystem() fulltext = '' textout = '' f = codecs.open('1.txt', 'r', 'utf-8') for line in f: fulltext+=line lemmas = mystem.analyze(fulltext) for lemm in lemmas: for k,v in lemm.items(): if k == 'analysis': for new in v: for n1,n2 in new.items(): #print n1,n2 textout += n1+' '+n2 textout += '\r\n' print textout fout = codecs.open('out.txt', 'w', 'utf-8') m = re.findall('lex\s(\w+)', textout, flags=re.U) a = m[:999] print len(set(a))
class MystemOCTagger(object):
    """Runs Mystem over an OpenCorpora-style XML corpus and aligns the
    analyses back onto the corpus' own token ids, emitting CSV rows of
    "<token_id>, <grammar>"."""

    def __init__(self):
        self.mystem_inst = Mystem()

    def run_and_convert(self, input_file, output_file, strict_match = False):
        """Stream <sentence> elements from *input_file*, match Mystem output
        against the sentence's <tokens> ids, append CSV rows to *output_file*."""
        f_in = open(input_file, 'rb')
        f_out = open(output_file, 'w+')
        context = etree.iterparse(f_in, tag='sentence')
        for event, sentence_elem in context:
            sentence = sentence_elem.find('source')
            analyzed = self.analyze_sentence(sentence.text)
            tokens_tree = sentence_elem.find('tokens')
            tokens = self.extract_tokens(tokens_tree)
            matched = self.match_analyzed_tokens(tokens, analyzed, strict_match)
            result = self.analyzed_to_csv_list(matched)
            for s in result:
                f_out.write(s+'\n')
            sentence_elem.clear()  # free memory while streaming

    def analyze_sentence(self, sentence):
        return self.mystem_inst.analyze(sentence)

    # builds word-index mapping, indices sorted in order of appearance
    def extract_tokens(self, tokens_tree):
        tokens_dict = {}
        for t in tokens_tree.iter('token'):
            idx = t.get('id')
            token = t.get('text')
            token = strip_word(token)
            if (len(token) > 0):
                # the same word may occur several times: keep all ids in order
                if token in tokens_dict:
                    tokens_dict.get(token).append(idx)
                else:
                    tokens_dict[token] = [idx]
        return tokens_dict

    # matches analysis with original tokens indices
    def match_analyzed_tokens(self, tokens_index, analyzed, strict_match = False):
        """Map token id -> Mystem analysis; unless *strict_match*, fall back
        to suffix matching for tokens the exact pass could not pair."""
        analysis_indexed = {}
        unindexed = []
        for t in analyzed:
            t_text = t.get('text')
            t_text = strip_word(t_text)
            if len(t_text) > 0:
                if t_text in tokens_index:
                    # consume ids first-in-first-out
                    idx = tokens_index.get(t_text).pop(0)
                    if (len(tokens_index.get(t_text)) == 0):
                        tokens_index.pop(t_text)
                    analysis_indexed[idx] = t.get('analysis')
                else:
                    unindexed.append(t)
        if (not strict_match):
            analysis_not_strict = {}
            if len(tokens_index) > 0:
                analysis_not_strict = self.match_not_strict(tokens_index, unindexed)
            analysis_indexed.update(analysis_not_strict)
            not_analyzed = []
            if len(tokens_index) > 0:
                for t in tokens_index:
                    not_analyzed.append(t)
            # if len(not_analyzed) > 0:
            #     f_unindexed = open('mismatch.txt', 'a+')
            #     f_unindexed.write('oc ')
            #     f_unindexed.write(str(not_analyzed)+' ')
            #
            # if len(unindexed) > 0:
            #     f_unindexed = open('mismatch.txt', 'a+')
            #     for u in unindexed:
            #         f_unindexed.write(' ')
            #         f_unindexed.write(str(u.get('text')))
            #     f_unindexed.write('\n')
        return analysis_indexed

    def match_not_strict(self, tokens_index, analyzed):
        """Suffix-match leftover corpus tokens against leftover Mystem tokens
        (handles tokenization differences such as attached punctuation)."""
        analysis_indexed = {}
        for t_indexed, idx_list in tokens_index.items():
            for idx in idx_list:
                for i in range(0, len(analyzed)):
                    t_analyzed = analyzed[i]
                    if t_indexed.endswith(t_analyzed.get('text')):
                        analysis_indexed[idx] = t_analyzed.get('analysis')
                        #print(t_analyzed.get('text')+' '+t_indexed)
                        analyzed.pop(i)
                        idx_list.remove(idx)
                        break
        # drop tokens whose id lists were fully consumed
        idx_copy = tokens_index.copy()
        for t, i in idx_copy.items():
            if len(i) == 0:
                del tokens_index[t]
        return analysis_indexed

    def analyzed_to_csv_list(self, analyzed):
        """Format "<id>, <gr>" rows, ordered by token id."""
        out = []
        for idx, analysis in sorted(analyzed.items()):
            if analysis and len(analysis) > 0:
                #do we need only grammar?
                s = str(idx) + ', ' + str(analysis[0].get('gr'))
                out.append(s)
        return out
def __init__(self, path):
    """Load the text at *path* and precompute per-sentence feature vectors.

    Produces ``self.data``: one row per sentence with length/letter/vowel
    counts, per-word medians, POS counts (column order: A, ADV, PR, S, V)
    and word-length statistics.
    NOTE(review): PUNCT and VOWELS are module-level constants defined
    elsewhere in this file — confirm they are strings/sets of characters.
    """
    # `with` closes the handle promptly; the original leaked it.
    with open(path) as f:
        self.text = f.read().lower()
    # Sentence boundaries: an ellipsis written as three (spaced) dots, or .?!
    self.sentences = [sentence for sentence in
                      re.split(r'(?:[.]\s*){3}|[.?!]', self.text)
                      if len(sentence) > 1]
    self.pos_data = []
    # length of sentences in letters
    self.sentence_lengths = [len([char for char in sentence if char not in PUNCT])
                             for sentence in self.sentences]
    # number of different letters in the sentence
    self.sentence_letters = [len(set(char for char in sentence if char not in PUNCT))
                             for sentence in self.sentences]
    # number of vowels in a sentence
    self.sentence_vowels = [len([char for char in sentence if char in VOWELS])
                            for sentence in self.sentences]
    # median of letters in a word
    self.median_letters = [np.median([len(word.strip(PUNCT)) for word in sentence.split()])
                           for sentence in self.sentences]
    # median of vowels in a word
    self.median_vowels = [np.median([len([char for char in word if char in VOWELS])
                                     for word in sentence.split()])
                          for sentence in self.sentences]
    # word lengths per sentence (the len > 1 filter is redundant here — the
    # sentences list is already filtered — kept for parity with the original)
    self.sentlens = [[len(word) for word in sentence.split()]
                     for sentence in self.sentences if len(sentence) > 1]
    # POS counts per sentence via Mystem; counter order: [A, ADV, PR, S, V].
    m = Mystem()
    for sentence in self.sentences:
        counter = [0, 0, 0, 0, 0]
        for word in m.analyze(sentence):
            analysis = word.get('analysis', None)
            if not analysis:
                continue
            gr = analysis[0]['gr']
            # Substring tests, checked in this order: e.g. a tag containing
            # 'S' counts as a noun even if it also contains 'A'.
            if 'S' in gr:
                counter[3] += 1
            elif 'ADV' in gr:
                counter[1] += 1
            elif 'A' in gr:
                counter[0] += 1
            elif 'V' in gr:
                counter[4] += 1
            elif 'PR' in gr:
                counter[2] += 1
        self.pos_data.append(counter)
    # Assemble the feature matrix, one row per sentence.
    self.data = np.array(list(zip(
        self.sentence_lengths,
        self.sentence_letters,
        self.sentence_vowels,
        self.median_letters,
        self.median_vowels,
        [item[0] for item in self.pos_data],
        [item[1] for item in self.pos_data],
        [item[2] for item in self.pos_data],
        [item[3] for item in self.pos_data],
        [item[4] for item in self.pos_data],
        [len(sentence) for sentence in self.sentlens],
        [np.mean(sentence) for sentence in self.sentlens],
        [np.median(sentence) for sentence in self.sentlens],
    )))
class CsvHandler:
    """Builds a lemma frequency list from a TSV dump of wiki articles."""

    INPUTFILE = 'wiki_noxml_full.txt'
    OUTPUTFILE = 'my_frequency_list.csv'

    def __init__(self):
        self.file_name = self.INPUTFILE
        self.csvlength = 0
        self.lemmatiser = Mystem()
        # lemma -> occurrence count. The original used defaultdict(dict) with a
        # manual membership check; defaultdict(int) is the counting idiom.
        self.fd = defaultdict(int)

    def do_cprofile(func):
        """Decorator: run *func* under cProfile and print stats afterwards."""
        def profiled_func(*args, **kwargs):
            profile = cProfile.Profile()
            try:
                profile.enable()
                result = func(*args, **kwargs)
                profile.disable()
                return result
            finally:
                profile.print_stats()
        return profiled_func

    def get_freq_dict(self, filename):
        """Tally, over all articles (text in column 3 of the TSV), how often each lemma occurs."""
        t0 = time.time()
        print("Start freq dict")
        with open(filename, 'r') as csvfile:
            datareader = csv.reader(csvfile, delimiter='\t')
            for ln, row in enumerate(datareader):
                if ln % 100 == 0:
                    print(ln, "articles processed")
                # Each lemma counts once per article (get_lem_set returns a set).
                for lemma in self.get_lem_set(row[2]):
                    self.fd[lemma] += 1
        t1 = time.time()
        for a, b in self.fd.items():
            print(a, b)
        print("Finished. Get input file processing time %2.2f secs, whoosh !" % (t1 - t0))

    def get_lem_set(self, text):
        """Return the set of adjective/noun/verb lemmas (length > 1) found in *text*."""
        return_set = set()
        # Mystem grammar strings start 'A=', 'S,' or 'V=' for adj/noun/verb.
        POS = ['A=', 'S,', 'V=']
        for el in self.lemmatiser.analyze(text):
            analysis = el.get('analysis', None)
            if analysis:
                if (analysis[0].get('gr')[0:2] in POS) and (len(analysis[0].get('lex')) > 1):
                    return_set.add(analysis[0].get('lex'))
        return return_set

    def output_dict(self, filename, output_dictionary, threshold):
        """Write 'first, second, freq' CSV rows for pair keys ('w1:::w2') with freq > *threshold*."""
        t0 = time.time()
        # `with` handles flush/close; the explicit calls in the original were redundant.
        with open(filename, 'w', newline='', encoding="UTF-8") as csv_file:
            csv_writer = csv.writer(csv_file, dialect='excel')
            csv_writer.writerow(["First word", "Second word", "Frequency"])
            for key, freq in output_dictionary.items():
                if freq > threshold:
                    words = key.split(':::')
                    csv_writer.writerow([words[0], words[1], freq])
        t1 = time.time()
        print("Finished. Get output file processing time %2.2f secs, whoosh !" % (t1 - t0))

    def process(self):
        self.get_freq_dict(self.file_name)
def main(argv):
    """Build a subject/verb/modifier graph for the text in argv[1]; render PNG to img/<argv[2]>.

    Pipeline: split the text into sentences, run the external Tomita parser
    on each one (input via input.txt/config.proto, output scraped from
    pretty.html), assemble a TREE of noun -> verb/modifier relations,
    flatten it to triples in PARS and draw them with graphviz.
    NOTE(review): depends on tomitaparser.exe and config.proto in the cwd.
    """
    with open(argv[1], encoding='utf-8') as f:
        s = re.sub(r'\s+', ' ', f.read(), flags=re.M)
    # Sentence split: terminal punctuation followed by a space.
    f = re.split(r'(?<=[.!?…]) ', s)
    sentens = []
    for i, t in enumerate(f):
        sentens.append(t)
        print(str(i), " ", t)
    morph = pymorphy2.MorphAnalyzer()
    # Tokens to ignore ("/n" looks like a typo for "\n" — left as-is).
    ZnakiP = [",", "!", "/n", ".", ":", ";", '"', "'", "\n", "...", "?", "!", "(", ")", "-", " ", " "]
    t = Mystem()  # NOTE(review): `t` is later shadowed by loop variables.
    PARS = []
    for sent in sentens:
        input_file = open("input.txt", "w", encoding="utf-8")
        input_file.write(sent)
        input_file.close()
        # Run the syntactic analysis of the sentence: the Tomita parser finds
        # the grammatical cores and writes its report to pretty.html.
        process = subprocess.Popen('tomitaparser.exe config.proto', stdout=subprocess.PIPE, shell=True)
        process.communicate()
        process.wait()
        predicate = []
        Nouns = []
        DOP = []
        DOP.append({})
        OPR = []
        # Scrape the parser's HTML report: one <table> per extracted fact.
        with open("pretty.html", encoding='utf8') as fp:
            soup = BeautifulSoup(fp, "html.parser")
            par_f = soup.find_all('table')
            for table in par_f:
                th = table.find('th')
                if (th.text == "Noun1"):
                    slovo = th.find_parent("table").find('a').text
                    Nouns.append(slovo)
                if (th.text == "Verb1"):
                    slovo = th.find_parent("table").find('a').text
                    predicate.append(slovo)
                if (th.text == "OPR1"):
                    sl = th.find_parent("table").find_all('a')
                    for slovo in sl:
                        OPR.append(slovo.text)
                if (th.text == "DOP1"):
                    sl = th.find_parent("table").find_all('a')
                    for slovo in sl:
                        # Maps lower-cased phrase -> text four nodes ahead in the DOM.
                        DOP[0][slovo.text.lower()] = slovo.next_element.next_element.next_element.next_element
        TREE = {}
        TREE[Nouns[0]] = {}  # assumes the parser found at least one noun — TODO confirm
        for v in predicate:
            TREE[Nouns[0]][v] = {}
        if (OPR != []):
            for temp in OPR:
                for noun in TREE:
                    if (len(re.split(r"[,' ']", temp)) == 1):
                        # Single-word modifier: attach it with its grammar tag.
                        TREE[Nouns[0]][temp] = t.analyze(temp)[0]['analysis'][0]['gr']
                    else:
                        m2 = []
                        for f in re.split(r"[,' ']", temp):
                            if (f != ''):
                                m2.append(f)
                        if (noun in m2):
                            mk = t.analyze(temp)
                            wsp = []
                            for tr in mk:
                                if (not tr['text'] in ZnakiP):
                                    # Skip conjunctions inside the phrase.
                                    if (not 'CONJ' in tr['analysis'][0]['gr']):
                                        wsp.append(tr['text'])
                            for tl in wsp:
                                if (tl != noun):
                                    TREE[Nouns[0]][tl] = t.analyze(tl)[0]['analysis'][0]['gr']
        for temp in TREE[Nouns[0]]:
            if (temp in DOP[0].values()):
                for sp in DOP[0]:
                    if (DOP[0][sp] == temp):
                        m2 = []
                        for f in re.split(r"[,' ']", sp):
                            if (f != ''):
                                m2.append(f)
                        for rg in m2:
                            TREE[Nouns[0]][temp][rg] = {}
                            for _opr in OPR:
                                reg = re.split(r"[,' ']", temp)
                                # NOTE(review): `noun` here is the leftover value
                                # from the OPR loop above — looks unintended; verify.
                                if (noun in reg):
                                    mk = t.analyze(_opr)
                                    wsp = []
                                    for tr in mk:
                                        if (not tr['text'] in ZnakiP):
                                            if (not 'CONJ' in tr['analysis'][0]['gr']):
                                                wsp.append(tr['text'])
                                    for tl in wsp:
                                        if (tl != rg):
                                            TREE[Nouns[0]][temp][rg][tl] = t.analyze(tl)[0]['analysis'][0]['gr']
        # Flatten TREE into [subject, link, object] triples in PARS.
        for noun in TREE:
            d1 = [noun]
            for verb in TREE[noun]:
                if (morph.parse(verb)[0].tag.POS == 'ADJF'):
                    # Adjectival predicate: noun — 'быть' — adjective.
                    d2 = [noun, 'быть']
                    d2.append(verb)
                    if (not d2 in PARS):
                        PARS.append(d2.copy())
                    d2.pop()
                else:
                    d4 = [verb, "может быть"]
                    d1.append(verb)
                    for temp in TREE[noun][verb]:
                        if (morph.parse(temp)[0].tag.POS == 'NOUN'):
                            d1.append(morph.parse(temp)[0].normal_form)
                            if (not d1 in PARS):
                                PARS.append(d1.copy())
                            d1.pop()
                            d3 = [temp, 'быть']
                            for temp2 in TREE[noun][verb][temp]:
                                d3.append(temp2)
                                PARS.append(d3.copy())
                                d3.pop()
                        else:
                            d4.append(temp)
                            if (not d4 in PARS):
                                PARS.append(d4.copy())
                            d4.pop()
    obj = PARS.copy()
    # Draw the triples: verbs as ellipses, everything else as grey rectangles.
    g1 = gv.Digraph(format='png')
    for temp in obj:
        a = morph.parse(temp[0])[0].tag.POS
        if (a == 'VERB' or a == 'INFN'):
            for t in obj:
                if (t[1] == temp[0]):
                    g1.node(t[0], shape='rect', style='filled', fillcolor='#cccccc')
                    g1.node(temp[0])
                    g1.node(temp[2], shape='rect', style='filled', fillcolor='#cccccc')
                    g1.edge(t[0], temp[0])
                    g1.edge(temp[0], temp[2], label=temp[1])
                    g1.edge(temp[0], t[2])
        else:
            g1.node(temp[0], shape='rect', style='filled', fillcolor='#cccccc')
            g1.node(temp[2], shape='rect', style='filled', fillcolor='#cccccc')
            g1.edge(temp[0], temp[2], label=temp[1])
    print(g1.source)
    g1.render('img/' + argv[2])
"""Walk the no_marks corpus tree, run Mystem on every text file and write the
analysis alongside it as JSON (marks tree) and XML (xml tree)."""
import os, json, dicttoxml
from pymystem3 import Mystem

m = Mystem()
top = 'C:\\Users\\John\\Desktop\\py_files\\питон\\korpus\\no_marks'
for root, dirs, files in os.walk(top):
    for name in files:
        loc = os.path.join(root, name)
        # Path components; indices 8-10 assume the fixed depth of `top` — TODO confirm.
        loc_list = loc.split('\\')
        # Strip the '\no_marks\<a>\<b>\<file>' suffix to recover the corpus root.
        new_root = loc.replace('\\no_marks\\{0}\\{1}\\{2}'.format(loc_list[8], loc_list[9], loc_list[10]), '')
        # Mirror directories for the JSON and XML outputs.
        dir_marks = os.path.join(new_root + '\\marks\\{0}\\{1}'.format(loc_list[8], loc_list[9]))
        dir_xml = os.path.join(new_root + '\\xml\\{0}\\{1}'.format(loc_list[8], loc_list[9]))
        new_name = name.replace('.txt', '')
        # exist_ok avoids the check-then-create race in the original.
        os.makedirs(dir_marks, exist_ok=True)
        os.makedirs(dir_xml, exist_ok=True)
        with open(loc, "r", encoding='utf-8') as doc:
            text_doc = doc.read()
            # (dead `doc.readlines()` after read() removed — it always returned [])
        # Grammatical + lemma info serialized as JSON.
        info = json.dumps(m.analyze(text_doc), ensure_ascii=False)
        with open("{0}\\{1}.json".format(dir_marks, new_name), 'w', encoding='utf-8') as doc_marks:
            doc_marks.write(info)
        xml = dicttoxml.dicttoxml(info).decode('utf-8')  # JSON string -> XML
        with open("{0}\\{1}.xml".format(dir_xml, new_name), 'w', encoding='utf-8') as doc_xml:
            doc_xml.write(xml)
# coding: utf-8
"""Demo: print Mystem lemma/grammar info for each token of a sample phrase.

Converted from Python 2 print statements to Python 3 calls for consistency
with the rest of the project; the printed output is unchanged.
"""
from pymystem3 import Mystem

text = "слив воды"

m = Mystem()
lemmas = m.analyze(text)
for lemma in lemmas:
    print('#"%s"' % lemma['text'])
    a = lemma.get('analysis')
    if a:
        for b in a:
            print('lex=', b.get('lex', '-'))
            print('gr=', b.get('gr', '-'))
    print()
class CsvHandler:
    """Builds a lemma co-occurrence frequency table from a TSV dump of wiki articles."""

    INPUTFILE = 'wiki_noxml_full.txt'
    OUTPUTFILE = 'my_output-large.csv'

    def __init__(self):
        self.file_name = self.INPUTFILE
        self.csvlength = 0
        self.lemmatiser = Mystem()
        # lemma -> {co-occurring lemma -> count}
        self.fd = defaultdict(dict)

    def do_cprofile(func):
        """Decorator: run *func* under cProfile and print stats afterwards."""
        def profiled_func(*args, **kwargs):
            profile = cProfile.Profile()
            try:
                profile.enable()
                result = func(*args, **kwargs)
                profile.disable()
                return result
            finally:
                profile.print_stats()
        return profiled_func

    def get_freq_dict(self, filename):
        """Count co-occurrences of lemma pairs within each article (text in TSV column 3)."""
        t0 = time.time()
        with open(filename, 'r') as csvfile:
            datareader = csv.reader(csvfile, delimiter='\t')
            for ln, row in enumerate(datareader):
                if ln % 100 == 0:
                    print(ln, "articles processed")
                lemmas = self.get_lem_set(row[2])
                # Count each unordered pair once (i < j over the set's iteration order).
                for i, li in enumerate(lemmas):
                    for j, lj in enumerate(lemmas):
                        if i < j:
                            self.fd[li][lj] = self.fd[li].get(lj, 0) + 1
        t1 = time.time()
        for a in self.fd:
            for b in self.fd[a]:
                print(a, b, self.fd[a][b])
        print("Finished. Get input file processing time %2.2f secs, whoosh !" % (t1 - t0))

    def get_lem_set(self, text):
        """Return the set of adjective/noun/verb lemmas (length > 1) found in *text*."""
        return_set = set()
        # Mystem grammar strings start 'A=', 'S,' or 'V=' for adj/noun/verb.
        POS = ['A=', 'S,', 'V=']
        for el in self.lemmatiser.analyze(text):
            analysis = el.get('analysis', None)
            if analysis:
                if (analysis[0].get('gr')[0:2] in POS) and (len(analysis[0].get('lex')) > 1):
                    return_set.add(analysis[0].get('lex'))
        return return_set

    def output_dict(self, filename, output_dictionary, threshold):
        """Write 'first, second, freq' CSV rows for pair keys ('w1:::w2') with freq > *threshold*."""
        t0 = time.time()
        # `with` handles flush/close; the explicit calls in the original were redundant.
        with open(filename, 'w', newline='', encoding="UTF-8") as csv_file:
            csv_writer = csv.writer(csv_file, dialect='excel')
            csv_writer.writerow(["First word", "Second word", "Frequency"])
            for key, freq in output_dictionary.items():
                if freq > threshold:
                    words = key.split(':::')
                    csv_writer.writerow([words[0], words[1], freq])
        t1 = time.time()
        print("Finished. Get output file processing time %2.2f secs, whoosh !" % (t1 - t0))

    def process(self):
        self.get_freq_dict(self.file_name)
# NOTE(review): fragment — the enclosing generator (apparently `parse_gr`,
# yielding grammar-string variants) begins before this chunk; `options`,
# `title`, `gr` and `m` are defined in the missing part.
    if options:
        title = options.group(1)
        # Expand "(a|b|c)" alternatives in the grammar string into separate variants.
        for stuff in title.split('|'):
            yield gr.replace("(" + title + ")", stuff)
    else:
        yield gr


lines = set([])
with open("data/test.txt", "r") as input_file:
    logging.info("file opened")
    for line in input_file:
        for w in m.analyze(line):
            if 'analysis' in w:
                for item in w['analysis']:
                    for gramm_info in parse_gr(item['gr']):
                        # Tab-separated: grammar, lemma, lower-cased surface form.
                        # NOTE(review): bytes + str concatenation implies Python 2 —
                        # this line would TypeError on Python 3.
                        lines.add("\t".join(
                            [gramm_info, item['lex'], w['text'].lower()]).encode("utf-8") + "\n")

with open("data/pairs_with_grammar.tsv", "w+") as f:
    for line in lines:
        f.write(line)

dict = {}  # NOTE(review): shadows the builtin `dict`
# The rest of this loop body is cut off in this chunk.
for line in open("data/pairs_with_grammar.tsv", "r+"):
    if line.strip():
output_fpath = sys.argv[1] if len(sys.argv) > 1 else "output.txt" tic = time() with codecs.open(output_fpath, "w", "utf-8") as output: m = Mystem() i = 0 for line in sys.stdin: try: i += 1 if i % 1000 == 0: print i f = line.split("\t") url, title, text = f[0], f[1], ' '.join(f[2:]) print >> output, "<doc url='%s' title='%s'>" % (url, title.decode("utf-8")) res = m.analyze(text) for r in res: if "analysis" not in r or "text" not in r: continue if len(r["analysis"]) < 1 or "lex" not in r["analysis"][0] or "gr" not in r["analysis"][0]: print >> output, "%s\t%s\t%s" % (r["text"], r["text"], "?") else: pos = re.split('=|,', r["analysis"][0]["gr"])[0] print >> output, "%s\t%s\t%s" % (r["text"], r["analysis"][0]["lex"], pos) print >> output, "</doc>" except: print "Bad line: '%s'" % line print "Error:", traceback.format_exc() print "Fields num:", len(line.split("\t")) print "Elapsed:", time() - tic, "sec."