def textToWordList(txt):
    p_stemmer = RussianStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    stop_w = [p_stemmer.stem(i) for i in get_stop_words('ru')]
    r = re.compile('^[а-я]+$')
    badword = [
        'дом', 'город', 'дорог', 'час', 'ноч', 'слов', 'утр',
        'стран', 'пут', 'путешеств', 'мест', 'нов', 'друз', 'добр',
    ]
    txt = txt.lower().replace('<br>', '\n')
    tokens = [p_stemmer.stem(i) for i in tokenizer.tokenize(txt)]
    tokens = [i for i in tokens
              if i not in stop_w and r.match(i) and i not in badword]
    return tokens
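A minimal usage sketch for textToWordList. The sample string and the stems shown are illustrative assumptions; exact output depends on the Snowball rules and the stop-word list.

import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import RussianStemmer
from stop_words import get_stop_words

sample = u"Новый день!<br>Мы гуляли по набережной."
# 'нов' is in the badword list and the pronouns are stop words, so only
# content stems survive, e.g. ['ден', 'гуля', 'набережн'].
print(textToWordList(sample))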
class Tokenizer(object):
    def __init__(self):
        self.cache = {}
        self.r_stemmer = RussianStemmer()
        self.e_stemmer = EnglishStemmer()

    def process_word(self, w):
        if w in self.cache:
            return self.cache[w]
        struct = check_structure(w)
        if struct == 'TRASH':
            w_proc = ''
        elif struct == 'WORD':
            if is_ascii(w):
                w_proc = self.e_stemmer.stem(w)
            else:
                w_proc = self.r_stemmer.stem(w)
        elif struct == 'NUMBER':
            w_proc = ''
        elif struct == 'COMPLEX':
            w_proc = w
        else:
            # Guard against an unexpected structure label; previously
            # w_proc would have been unbound here.
            w_proc = ''
        self.cache[w] = w_proc
        return w_proc

    def tokenize(self, text):
        text = preprosess_text(text)
        words = text.split(' ')
        tokens = [self.process_word(w) for w in words]
        return [t for t in tokens if len(t)]
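A hypothetical usage sketch. The real check_structure, is_ascii and preprosess_text live elsewhere in the original module, so minimal stand-ins (assumptions, not the originals) are sketched here only to make the example self-contained.

def is_ascii(w):
    # Stand-in: true when every character is 7-bit ASCII.
    return all(ord(c) < 128 for c in w)

def check_structure(w):
    # Stand-in classifier with the labels process_word expects.
    if w.isdigit():
        return 'NUMBER'
    if w.isalpha():
        return 'WORD'
    return 'COMPLEX'

def preprosess_text(text):
    # Stand-in: the original preprocessing is defined elsewhere.
    return text.lower()

tok = Tokenizer()
# e.g. ['машин', 'обучен', 'and', 'machin', 'learn']; the number is dropped.
print(tok.tokenize('Машинное обучение and machine learning 2024'))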
def learn(self, class_name):
    self.classes.add(class_name)
    print class_name
    self.words_freq[class_name] = {}
    # 'is' compares identity, not string equality; use '=='.
    if class_name == "internet":
        dir_name = learn_internet
    else:
        dir_name = learn_nointernet
    stemmer = RussianStemmer()  # create once, not per word
    for file_name in os.listdir(dir_name):
        print "processing", file_name
        text = open(dir_name + "/" + file_name, "r").read().decode("utf-8")
        words = [word.lower() for word in tokenizers.extract_words(text)]
        self.docs_number += 1
        self.unique_words_set = self.unique_words_set | set(words)
        for word in words:
            stemmed = stemmer.stem(word)
            if stemmed in self.words_freq[class_name]:
                self.words_freq[class_name][stemmed] += 1
            else:
                self.words_freq[class_name][stemmed] = 1
        if class_name in self.words_in_class:
            self.words_in_class[class_name] += len(words)
            self.docs_in_class[class_name] += 1
        else:
            self.words_in_class[class_name] = len(words)
            self.docs_in_class[class_name] = 1
def parse(self, fname):
    """
    Parse the text of a file.
    :param fname: file name
    :return: (<file_name>, keyword density, fraud flag)
    """
    density, fraud = 0, 0
    with codecs.open(fname, "r", encoding="utf-8") as f:
        text = f.read()
    # 'A-z' also matches the punctuation between 'Z' and 'a'; use 'A-Za-z'.
    tknz = RegexpTokenizer(pattern="[А-Яа-яA-Za-zё]+")
    txt_list = tknz.tokenize(text)
    if txt_list:
        for i, word in enumerate(txt_list):
            new_word = self.check_word(word)
            if new_word:
                txt_list[i] = new_word
                fraud += 1
        txt_list = [word.lower() for word in txt_list
                    if word.lower() not in self.sw]
        stemmer_ru = RussianStemmer()
        txt_list = [stemmer_ru.stem(token.lower())
                    for token in txt_list if len(token) > 1]
        dict_w = Counter(txt_list)
        top5 = heapq.nlargest(5, dict_w, key=dict_w.get)
        top5_count = sum(dict_w[word] for word in top5)
        density = top5_count / len(txt_list)
    # The (fraud > 2) threshold was chosen from testing on the available
    # sample: ads often contain strings like "ШxДхВ" (WxDxH) that cannot be
    # classified unambiguously, so a few suspicious words are tolerated.
    return fname, density, fraud > 2
class PhraseStemmer(PhraseSplitter):
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.stemmer = RussianStemmer()

    def tokenize(self, phrase):
        return [self.stemmer.stem(w)
                for w in self.tokenizer.tokenize(phrase)
                if len(w.strip()) > 0]
def parse_text(self, text):
    # Replace every non-Cyrillic character with a space.
    text = list(text)
    for i in range(len(text)):
        is_cyrillic_symbol = ('А' <= text[i] <= 'Я') or ('а' <= text[i] <= 'я')
        if not is_cyrillic_symbol:
            text[i] = ' '
    text = ''.join(text).split()
    stop = set(stopwords.words('russian'))  # build once instead of per word
    filtered_words = [word for word in text
                      if word not in stop and word not in self.badwords]
    stemmer = RussianStemmer()
    return [stemmer.stem(word) for word in filtered_words]
def textrank(self, text, similar='serense'):
    text = treatment_text(text)
    # Keep only sentences longer than six words.
    text = '.'.join(s for s in text.split('.') if len(s.split()) > 6)
    sentences = sent_tokenize(text)
    tokenizer = RegexpTokenizer(r'\w+')
    lmtzr = RussianStemmer()
    words = [set(lmtzr.stem(word)
                 for word in tokenizer.tokenize(sentence.lower()))
             for sentence in sentences]
    pairs = combinations(range(len(sentences)), 2)
    if similar == 'serense':
        scores = [(i, j, self.similarity_1(words[i], words[j]))
                  for i, j in pairs]
    elif similar == 'cos':
        scores = [(i, j, self.similarity_2(words[i], words[j]))
                  for i, j in pairs]
    else:
        # Previously 'scores' was unbound for any other value.
        raise ValueError("similar must be 'serense' or 'cos'")
    scores = [s for s in scores if s[2]]
    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)
    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True)
def stem_corpus(input_path, output_path):
    stem = RussianStemmer()
    last_word = ''
    i = 0
    with open(output_path, 'w', encoding='utf8') as o:
        with open(input_path, 'r', encoding='utf8') as f:
            while True:
                s = f.read(1024 * 1024)
                if not s:
                    o.write(last_word)
                    break
                words = s.split(' ')
                # Prepend the carried-over fragment. If the chunk starts with
                # a space, words[0] is '' and last_word passes through intact;
                # the original only merged when the chunk did NOT start with a
                # space and silently dropped last_word otherwise.
                words[0] = last_word + words[0]
                for word in words[:-1]:
                    o.write(stem.stem(word) + ' ')
                i += 1
                print('Stemmed {} MBs'.format(i))
                last_word = words[-1]
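Example invocation (hypothetical file names; assumes a UTF-8 corpus of space-separated words):

from nltk.stem.snowball import RussianStemmer  # required by stem_corpus

stem_corpus('corpus_raw.txt', 'corpus_stemmed.txt')  # streams the corpus in 1 MB chunks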
def fillDicts(self, maxDocs=0):
    self.classes = set()
    self.documentsInClass = dict()  # number of documents per class
    self.documentsNumber = 0        # total number of documents
    self.uniqueWords = set()        # set of unique words
    self.wordsInClass = dict()      # number of words per class
    self.wordsFreq = dict()         # per-class word frequencies
    word_re = re.compile(u"[а-яА-Яa-zA-Z0-9]+")  # '+' avoids empty matches
    stemmer = RussianStemmer()
    i = 0
    for document in self.collection.find():
        i += 1
        if i > maxDocs and maxDocs > 0:
            break
        if i % 100 == 0:
            print "Processed " + str(i) + " documents"
        self.classes.add(document['topic'])
        match = word_re.findall(document['body'])
        self.documentsNumber += 1
        self.uniqueWords = self.uniqueWords | set(match)
        wordsFreq = dict()
        for _match in match:
            stemmed = stemmer.stem(_match)
            wordsFreq[stemmed] = wordsFreq.get(stemmed, 0) + 1
        if document['topic'] in self.wordsInClass:
            self.wordsInClass[document['topic']] += len(match)
            # dict.update would overwrite existing counts; sum them instead.
            freq = self.wordsFreq[document['topic']]
            for word, count in wordsFreq.items():
                freq[word] = freq.get(word, 0) + count
            self.documentsInClass[document['topic']] += 1
        else:
            self.wordsInClass[document['topic']] = len(match)
            self.wordsFreq[document['topic']] = wordsFreq
            self.documentsInClass[document['topic']] = 1
def stemming_sent(sent):
    pattern = re.compile('[a-zA-Zа-яА-Я]+')
    words = pattern.findall(sent)
    stemmer = RussianStemmer()
    # ' '.join also handles an empty word list, where reduce would raise.
    return ' '.join(stemmer.stem(word) for word in words)
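A quick sanity check (assumes the imports used above; the exact stems depend on the Snowball rules):

print(stemming_sent('машины едут по дорогам'))  # e.g. 'машин едут по дорог'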
def stemming(corpus):
    stemmer = RussianStemmer()
    stems = []
    for comment in corpus:
        s = [stemmer.stem(word) for word in comment.split()]
        stems.append(' '.join(s))
    return stems
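Illustrative usage on a tiny hypothetical corpus (assumes RussianStemmer is imported as above; the stems shown are approximate):

print(stemming(['отличный фильм', 'ужасная игра актёров']))
# e.g. ['отличн фильм', 'ужасн игр актер']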
def method2(tokens):
    print("The way 2")
    stemmer = RussianStemmer(ignore_stopwords=False)
    dictionary = dict()
    for word in tokens:
        normal_form = stemmer.stem(word)
        dictionary[normal_form] = dictionary.get(normal_form, 0) + 1
    printDic(dictionary, 2)
def preprocessing(sentence):
    stemmer = RussianStemmer()  # renamed from 'porter': it is a Snowball Russian stemmer
    punctuation = string.punctuation + "«»—•’"
    stop = stopwords.words('russian')
    for p in punctuation:
        sentence = sentence.replace(p, "")
    return [stemmer.stem(word)
            for word in sentence.split() if word not in stop]
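A one-line sanity check (assumes `import string` and `from nltk.corpus import stopwords` as used above; the sample sentence is illustrative):

print(preprocessing('Сегодня отличная погода, и мы идём гулять!'))
# punctuation stripped, stop words dropped, remaining words stemmed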
def build_stemmer_morphology(data_filename, output_filename):
    vocab = load_vocab(data_filename)
    print 'Total words in vocab: %d' % len(vocab)
    prefix_map = defaultdict(set)
    stemmer = RussianStemmer()
    for w in vocab:
        prefix_map[stemmer.stem(w)].add(w)
    print 'Total lemm groups: %d' % len(prefix_map)
    write_morphology(prefix_map, output_filename)
class findSubject:
    SUBJECTS_NAME = []
    SUBJECTS_REAL_NAME = []
    IS_LOADED_SUBJECTS = False

    def __init__(self):
        self.stemer = RussianStemmer()
        self.regex = re.compile('[^а-яА-Я ]')
        self.load_subjects('textParsing/data/subjects.csv')

    def get_stem(self, token, checkHash=True):
        # checkHash is accepted for API compatibility but not used here.
        token = self.regex.sub('', token).lower()
        return self.stemer.stem(token)

    def load_subjects(self, filepath):
        pd_subjects = pd.read_csv(filepath, delimiter=';')
        self.SUBJECTS_NAME = list(np.array(pd_subjects[['name']]))
        self.SUBJECTS_REAL_NAME = list(np.array(pd_subjects[['subject']]))
        for ind in range(len(self.SUBJECTS_NAME)):
            self.SUBJECTS_NAME[ind] = self.get_stem(
                str(self.SUBJECTS_NAME[ind][0]), False)
        self.IS_LOADED_SUBJECTS = True

    def get(self, text):
        sent = text.split(' ')
        # Find the first preposition 'по', as in "экзамен по физике".
        find_fst_po = -1
        for ind, word in enumerate(sent):
            if word == 'по':
                find_fst_po = ind
                break
        if find_fst_po == -1:
            return None
        subjects = set()
        for ind, word in enumerate(sent):
            if ind > find_fst_po:
                word = self.get_stem(word, False)
                if word in self.SUBJECTS_NAME:
                    subjects.add(str(
                        self.SUBJECTS_REAL_NAME[self.SUBJECTS_NAME.index(word)]))
        if len(subjects) == 0:
            return None
        return subjects
def stemWord(self, word, lng):
    """Separates the word's changeable part with a '|' for wordfast."""
    if lng == 'ru':
        stemmer = RussianStemmer()
    elif lng == 'en':
        stemmer = PorterStemmer()
    elif lng == 'de':
        stemmer = GermanStemmer()
    else:
        print('Language error. Exiting...')
        sys.exit(1)
    word = word.lower()  # otherwise the stemmer fails
    if len(word) <= 3:
        return word
    stem_len = len(stemmer.stem(word))  # stem once instead of three times
    if len(word) == stem_len:
        # Nothing was stripped: split off the last letter by convention.
        return "{0}|{1}".format(word[:-1], word[-1])
    return "{0}|{1}".format(word[:stem_len], word[stem_len:])
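A standalone sketch of the same splitting idea (the sample word and the expected output are illustrative; the exact split depends on the Snowball rules):

from nltk.stem.snowball import RussianStemmer

stemmer = RussianStemmer()
word = 'собаками'
stem_len = len(stemmer.stem(word))
print('{0}|{1}'.format(word[:stem_len], word[stem_len:]))  # e.g. 'собак|ами'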
def nltk_preprocessor(sentences):
    """Tokenization + stemming."""
    tokenizer = RegexpTokenizer(r'\w+')
    lmtzr = RussianStemmer()  # stems down to the word base
    words = [set(lmtzr.stem(word)                                   # stemming
                 for word in tokenizer.tokenize(sentence.lower()))  # tokenization
             for sentence in sentences]
    return words
class NN:
    def __init__(self):
        self._stem_cache = {}
        self._validator_regex = re.compile(r'[^А-яЁё]')
        self._stemmer = RussianStemmer()
        self.vocabulary = self._load_vocabulary()
        self.model = self._load_model()

    def _get_stem(self, token):
        stem = self._stem_cache.get(token, None)
        if stem:
            return stem
        token = self._validator_regex.sub('', token).lower()
        stem = self._stemmer.stem(token)
        self._stem_cache[token] = stem
        return stem

    def _load_vocabulary(self):
        with open('data/vocabulary.txt') as f:
            vocabulary_arr = f.read().split('\n')
        return {vocabulary_arr[i]: i for i in range(len(vocabulary_arr))}

    def message_to_vector(self, message):
        vector = numpy.zeros(len(self.vocabulary), dtype=numpy.byte)
        tokenizer = TweetTokenizer()
        for token in tokenizer.tokenize(message):
            stem = self._get_stem(token)
            idx = self.vocabulary.get(stem, None)
            if idx is not None:
                vector[idx] = 1
        return vector

    def _build_model(self):
        tensorflow.reset_default_graph()
        net = tflearn.input_data([None, len(self.vocabulary)])
        # tflearn resolves activation names in lowercase; 'RelU' would fail.
        net = tflearn.fully_connected(net, 125, activation='relu')
        net = tflearn.fully_connected(net, 25, activation='relu')
        net = tflearn.fully_connected(net, 2, activation='softmax')
        return tflearn.DNN(net)

    def _load_model(self):
        model = self._build_model()
        model.load('data/model/model')
        return model

    def take_answer(self, message):
        vector = [self.message_to_vector(message)]
        return self.model.predict(vector)[0][1] >= 0.5
def detect_cheat_in_text(text):
    """Detect cheats in text."""
    new_text = []
    is_cheat = False
    for word in text:
        is_cheated_word, recovery_token = detect_cheat(word)
        if is_cheated_word:
            is_cheat = True
            new_text.append(recovery_token)
    stop_words = set(stopwords.words('russian'))
    st = RussianStemmer()
    new_text = [word for word in new_text if word not in stop_words]
    return is_cheat, [st.stem(word) for word in new_text]
def stem_words(self, words):
    """
    Stem words with the Snowball (or Ukrainian) stemmer and join to one string.
    Assumes self.lang is one of 'uk', 'ru', 'en'.
    """
    if self.lang == 'uk':
        return ' '.join(
            UkrainianStemmer(word).stem_word() for word in words)
    elif self.lang == 'ru':
        stemmer = RussianStemmer()
    elif self.lang == 'en':
        stemmer = EnglishStemmer()
    return ' '.join(stemmer.stem(word) for word in words)
def wrk_words_wt_no(sent):
    """Stemming with negation handling."""
    # morph = pymorphy2.MorphAnalyzer()
    stemmer = RussianStemmer()
    words = word_tokenize(sent.lower())
    try:
        arr = []
        for i in range(len(words)):
            if re.search(u'[а-яА-Я]', words[i]):
                arr.append(stemmer.stem(words[i]))  # stemming
                # arr.append(morph.parse(words[i])[0].normal_form)  # lemmatization
        words1 = [w for w in arr if w not in russian_stops]
        words1 = No_with_word(words1)
        return words1
    except TypeError:
        pass
def textrank(text):
    sentences = sent_tokenize(text)
    tokenizer = RegexpTokenizer(r'\w+')
    lmtzr = RussianStemmer()
    words = [set(lmtzr.stem(word)
                 for word in tokenizer.tokenize(sentence.lower()))
             for sentence in sentences]
    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = [s for s in scores if s[2]]  # drop zero-similarity pairs
    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)
    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True)
def stem_keyword(self):
    """Stem the keyword with the appropriate Snowball (or Ukrainian) stemmer."""
    if self.language == 'uk':
        self.keyword = UkrainianStemmer(self.keyword).stem_word()
        return
    elif self.language == 'ru':
        stemmer = RussianStemmer()
    elif self.language == 'en':
        stemmer = EnglishStemmer()
    else:
        return
    self.keyword = stemmer.stem(self.keyword)
def cleanText(textToClean):
    myPunctuation = u'–«»—…'
    exclude = set(string.punctuation + myPunctuation)
    # textToClean = unicode(textToClean, "utf-8")
    textToClean = ''.join(ch for ch in textToClean if ch not in exclude)
    textToClean = ''.join(i for i in textToClean if not i.isdigit())
    stop_words = get_stop_words('ru')
    words_after_deleting_stop_words = [
        w for w in textToClean.split()
        if w not in stop_words and w in model.vocab
    ]
    rs = RussianStemmer()
    words_after_stemming = [rs.stem(w) for w in words_after_deleting_stop_words]
    text_after_cleaning = ' '.join(words_after_stemming)
    # text_after_cleaning = text_after_cleaning.replace(u'кпм', '').replace(u'пмп', '')
    if text_after_cleaning:
        return text_after_cleaning
class TextStemmerStage(PipelineStage):
    def __init__(self):
        self.english_stemmer = PorterStemmer()
        self.russian_stemmer = RussianStemmer()

    def accept(self, consumer_input: PipedInput):
        text = consumer_input.get_text().lower()
        token_words = word_tokenize(text)
        result = []
        for token in token_words:
            token = token.strip()
            if is_russian(token) or is_belarusian(token):
                result.append(self.russian_stemmer.stem(token))
            if is_english(token):
                result.append(self.english_stemmer.stem(token))
        return consumer_input.new(text=" ".join(result))

    def dump(self):
        pass
class SearchForm(hay_forms.SearchForm):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        choices = hay_forms.model_choices()
        self.fields["models"] = forms.ChoiceField(
            choices=choices, required=False, label='Искать',
            widget=forms.RadioSelect, initial=choices[0][0])
        self.stopwords = set(stopwords.words('russian'))
        self.stemmer = RussianStemmer()
        self.tokenizer = RegexpTokenizer(r'\w+')

    def get_model(self):
        if self.is_valid():
            model = self.cleaned_data['models']
            if model:
                return haystack_get_model(*model.split("."))
        return None

    def prepare_query(self, query_string):
        words = self.tokenizer.tokenize(query_string.lower())
        words = [self.stemmer.stem(word) for word in words
                 if word not in self.stopwords]
        return ' '.join(words)

    def search(self):
        if not (self.is_valid() and self.cleaned_data.get('q')):
            return self.no_query_found()
        query = self.prepare_query(self.cleaned_data['q'])
        sqs = self.searchqueryset.filter(content__contains=query)
        if self.load_all:
            sqs = sqs.load_all()
        search_model = self.get_model()
        if search_model:
            return sqs.models(search_model)
        return sqs
class country_russian_stemmer:
    def __init__(self):
        self.stemmer = RussianStemmer()

    def stem_helper(self, word):
        stem = self.stemmer.stem(word)
        min_stem_length = min(4, len(word))
        stem_length = len(stem)
        if stem_length < min_stem_length:
            # The stemmer only cuts suffixes, so a prefix of the original
            # word is safe to take.
            return word[0:min_stem_length]
        return word[0:stem_length]

    def stem(self, word):
        words = word.split()
        l = []
        for word in words:
            # TO UNDO
            l.append(self.stem_helper(word.decode('utf-8')).encode('utf-8'))
        return " ".join(l)
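A quick illustration of the minimum-stem-length rule above, in Python 3 spelling (without the py2 encode/decode round-trip; sample words are illustrative):

from nltk.stem.snowball import RussianStemmer

stemmer = RussianStemmer()
for w in [u'мир', u'российская']:
    stem = stemmer.stem(w)
    # Keep at least min(4, len(word)) leading characters of the original word.
    cut = max(len(stem), min(4, len(w)))
    print(w, '->', w[:cut])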
def textrank(text):
    """
    TextRank algorithm for text summarization.
    https://gist.github.com/igor-shevchenko/5821166
    """
    sentences = sent_tokenize(text)
    tokenizer = RegexpTokenizer(r'\w+')
    lmtzr = RussianStemmer()
    words = [set(lmtzr.stem(word)
                 for word in tokenizer.tokenize(sentence.lower()))
             for sentence in sentences]
    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = [s for s in scores if s[2]]  # drop zero-similarity pairs
    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)
    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True)
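A hedged usage sketch. `similarity` is defined elsewhere in the original; the normalized-overlap version below is only one plausible stand-in, and the sample text is illustrative. The nltk/networkx imports used by textrank above are assumed to be in scope.

def similarity(s1, s2):
    # Stand-in: normalized overlap of the two stem sets (an assumption,
    # not necessarily the original definition).
    if not s1 or not s2:
        return 0.0
    return len(s1 & s2) / float(len(s1) + len(s2))

long_text = ('Погода сегодня солнечная и тёплая. '
             'Синоптики обещают солнечную погоду всю неделю. '
             'Вчера весь день шёл дождь.')
ranked = textrank(long_text)
print(' '.join(s for _, _, s in ranked[:2]))  # top-2 sentences by PageRank score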
def tokenize_and_stem(self, text):
    from nltk.stem.snowball import RussianStemmer
    stemmer = RussianStemmer()
    # First tokenize by sentence, then by word, so that punctuation
    # is caught as its own token.
    tokens = [word
              for sent in self.nltk.sent_tokenize(text, self.locale)
              for word in self.nltk.word_tokenize(sent, self.locale)]
    # tokens = [word for word in self.nltk.word_tokenize(text[1])]  # , language=self.locale
    # Filter out tokens without letters (numeric tokens, raw punctuation)
    # and stop words.
    filtered_tokens = []
    for token in tokens:
        if self.re.search('[a-zA-Zа-яА-Я]', token):
            if token not in self.stopwords:
                filtered_tokens.append(token)
    return [stemmer.stem(t) for t in filtered_tokens]
def stem_text(self):
    """Stem every document in the corpus with the Snowball (or Ukrainian) stemmer."""
    stemmer = None
    if self.lang == 'ru':
        stemmer = RussianStemmer()
    elif self.lang == 'en':
        stemmer = EnglishStemmer()
    for i in range(len(self.__corpora)):
        words = self.__corpora[i].split()
        if self.lang == 'uk':
            self.__corpora[i] = ' '.join(
                UkrainianStemmer(word).stem_word() for word in words)
        else:
            self.__corpora[i] = ' '.join(
                stemmer.stem(word) for word in words)
def clean_col(self, column, data, stem=False):
    punct = "!#$«%&\'.()*+-<=>?@[\\]^_`°{|}"
    spec = "/\n"
    regex = re.compile('[%s]' % re.escape(punct))
    regexspec = re.compile('[%s]' % re.escape(spec))
    digits = re.compile(r"\d")  # raw string; "\d" is an invalid escape in newer Pythons
    toktok = ToktokTokenizer()
    rs = RussianStemmer(ignore_stopwords=True)
    # Note: rs.stem() is applied to the whole sentence string (as in the
    # original), not to individual tokens.
    if stem:
        res = data[column].fillna("__NA__") \
            .map(lambda sent: regex.sub("", sent)) \
            .map(lambda sent: digits.sub("#", sent)) \
            .map(lambda sent: regexspec.sub(" ", sent)) \
            .map(lambda sent: rs.stem(sent)) \
            .map(lambda sent: toktok.tokenize(sent.lower())).tolist()
    else:
        res = data[column].fillna("__NA__") \
            .map(lambda sent: regex.sub("", sent)) \
            .map(lambda sent: digits.sub("#", sent)) \
            .map(lambda sent: regexspec.sub(" ", sent)) \
            .map(lambda sent: toktok.tokenize(sent.lower())).tolist()
    return res
def learn(self, class_name):
    dir_name = "."
    file_name = "tweets_by_trend.xml"
    self.classes.add(class_name)
    self.words_freq[class_name] = {}
    # 'is' compares identity, not string equality; use '=='.
    if class_name == "negative":
        code = 0
    else:
        code = 1
    print "processing", file_name
    tree = ET.parse(dir_name + "/" + file_name)
    root = tree.getroot()
    stemmer = RussianStemmer()  # create once, not per tweet
    for tweet in root.findall('tweet'):
        sent = int(tweet.find('sent').text)
        if sent == code:
            text = tweet.find('text').text
            words = [word.lower() for word in tokenizers.extract_words(text)]
            self.docs_number += 1
            self.unique_words_set = self.unique_words_set | set(words)
            for word in words:
                stemmed = stemmer.stem(word)
                if stemmed in self.words_freq[class_name]:
                    self.words_freq[class_name][stemmed] += 1
                else:
                    self.words_freq[class_name][stemmed] = 1
            if class_name in self.words_in_class:
                self.words_in_class[class_name] += len(words)
                self.docs_in_class[class_name] += 1
            else:
                self.words_in_class[class_name] = len(words)
                self.docs_in_class[class_name] = 1
class SentimentClassifier(object):
    def __init__(self):
        self.model = joblib.load("./models/clf.pkl")
        self.vectorizer = joblib.load("./models/vectorizer.pkl")
        self.classes_dict = {
            0: "отрицательный",   # negative
            1: "положительный",   # positive
            -1: "ошибка"          # error
        }
        self.numbers_str = '0123456789'
        self.punc_translator = str.maketrans(
            string.punctuation, ' ' * len(string.punctuation))
        self.num_translator = str.maketrans(
            self.numbers_str, ' ' * len(self.numbers_str))
        self.short_word_len = 1
        self.stemmer = RussianStemmer()
        self.stop_words = stopwords.words('russian') + ['br']

    def predprocess_text(self, text):
        text = text.lower().translate(self.punc_translator).translate(
            self.num_translator)
        text = ' '.join(self.stemmer.stem(word) for word in text.split())
        return text.strip()

    def predict_text(self, text):
        text = self.predprocess_text(text)
        try:
            print(text)
            # The original preprocessed the text a second time here;
            # once is enough.
            vectorized = self.vectorizer.transform([text])
            return self.model.predict(vectorized)[0]
        except Exception:  # a bare 'except' also swallows SystemExit etc.
            print("prediction error")
            return -1

    def get_prediction_message(self, text):
        prediction = self.predict_text(text)
        return self.classes_dict[prediction]
def cleanText(text):
    '''
    Checks and repairs words with hidden Latin characters among Cyrillic
    ones (and vice versa), assuming the text contains only Latin and
    Cyrillic characters.
    '''
    ad = AlphabetDetector()
    st = RussianStemmer()
    is_broken = False
    clean_text = []
    for word in text:
        if ad.only_alphabet_chars(word, 'CYRILLIC'):
            clean_text.append(word)
        elif ad.only_alphabet_chars(word, 'LATIN'):
            clean_text.append(word)
        else:
            is_broken = True
            clean_text.append(letterSwap(word))
    clean_text = [st.stem(word) for word in clean_text]
    return clean_text, is_broken
import pandas as pd  # missing in the original excerpt; needed for read_excel/Series
from sklearn.svm import SVC  # missing in the original excerpt
from sklearn.neural_network import BernoulliRBM
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import TruncatedSVD, NMF, FactorAnalysis, PCA
from sklearn.pipeline import Pipeline
from nltk import word_tokenize
from nltk.tokenize.api import StringTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import RussianStemmer  # imported once instead of twice
import nltk
import numpy

st = RussianStemmer()
libra = pd.read_excel('libra.xls')[['body', 'ticket_queue_id']].dropna()
# Note: stem() is applied to each whole message body, not to individual words.
libra.body = pd.Series(st.stem(x) for x in libra.body)
libra = libra.dropna()
classifier = SVC(probability=True, kernel='linear')
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 21 18:10:41 2016

@author: asamoylov
"""
from nltk.stem.snowball import RussianStemmer

stemmer = RussianStemmer()  # renamed from 'mystem': this is Snowball, not Yandex Mystem
str0 = "поздно"
print stemmer.stem(str0.decode("utf-8"))
class LSI(object):
    def __init__(self, stopwords, ignorechars, docs):
        self.stemmer = RussianStemmer()
        self.wdict = {}
        self.dictionary = []
        self.stopwords = stopwords
        if type(ignorechars) == unicode:
            ignorechars = ignorechars.encode('utf-8')
        self.ignorechars = ignorechars
        self.docss = []
        self.docs = docs
        for doc in docs:
            self.add_doc(doc)

    def prepare(self):
        self.build()
        self.calc()

    def dic(self, word, add=False):
        if type(word) == unicode:
            word = word.encode('utf-8')
        word = word.lower().translate(None, self.ignorechars)
        word = word.decode('utf-8')
        word = self.stemmer.stem(word)
        if word in self.dictionary:
            return self.dictionary.index(word)
        elif add:
            self.dictionary.append(word)
            return len(self.dictionary) - 1
        else:
            return None

    def add_doc(self, doc):
        words = [self.dic(word, True) for word in doc.lower().split()]
        self.docss.append(words)
        for word in words:
            if word in self.stopwords:
                continue
            elif word in self.wdict:
                self.wdict[word].append(len(self.docs) - 1)
            else:
                self.wdict[word] = [len(self.docs) - 1]

    def build(self):
        self.keys = [k for k in self.wdict.keys() if len(self.wdict[k]) > 0]
        self.keys.sort()
        self.A = np.zeros([len(self.keys), len(self.docs)])
        for i, k in enumerate(self.keys):
            for d in self.wdict[k]:
                self.A[i, d] += 1

    def calc(self):
        self.U, self.S, self.Vt = svd(self.A)

    def TFIDF(self):
        # Use numpy's axis-aware sums; the builtin sum() has no 'axis' argument.
        wordsPerDoc = np.sum(self.A, axis=0)
        docsPerWord = np.sum(np.asarray(self.A > 0, 'i'), axis=1)
        rows, cols = self.A.shape
        for i in range(rows):
            for j in range(cols):
                self.A[i, j] = (self.A[i, j] / wordsPerDoc[j]) * log(float(cols) / docsPerWord[i])

    def dump_src(self):
        self.prepare()
        print 'The computed term-document matrix:'
        for i, row in enumerate(self.A):
            print self.dictionary[i], row

    def print_svd(self):
        self.prepare()
        print 'Singular values:'
        print self.S
        print 'First 3 columns of the U matrix:'
        for i, row in enumerate(self.U):
            print self.dictionary[self.keys[i]], row[0:3]
        print 'First 3 rows of the Vt matrix:'
        print -1 * self.Vt[0:3, :]

    def find(self, word):
        self.prepare()
        idx = self.dic(word)
        # 'if not idx' would also reject index 0; test for None explicitly.
        if idx is None:
            print 'the word does not occur'
            return []
        if idx not in self.keys:
            print 'the word was dropped as insignificant (via stopwords)'
            return []
        idx = self.keys.index(idx)
        print 'word --- ', word, '=', self.dictionary[self.keys[idx]], '.\n'
        # Coordinates of the word in the 2D latent space.
        wx, wy = (-1 * self.U[:, 1:3])[idx]
        print 'word {}\t{:0.2f}\t{:0.2f}\t{}\n'.format(idx, wx, wy, word)
        arts = []
        xx, yy = -1 * self.Vt[1:3, :]
        for k, v in enumerate(self.docs):
            ax, ay = xx[k], yy[k]
            dx, dy = float(wx - ax), float(wy - ay)
            arts.append((k, v, ax, ay, sqrt(dx * dx + dy * dy)))
        return sorted(arts, key=lambda a: a[4])
# analyze statuses
status_stats = dict()
tokenizer = RegexpTokenizer(r"[A-Za-zА-Яа-я]+")
stemmer = RussianStemmer()

users_file_name = sys.argv[2]
with open(users_file_name, "r") as users_file:
    for line in users_file:
        user = json.loads(line)
        uid = str(user["_id"])
        if uid in pazans_groups:
            pazan_groups = pazans_groups[uid]
            status_text = user.get("status", "")
            # Stems are concatenated without separators to form a normalized key.
            filtered_status_text = "".join(
                stemmer.stem(token).lower()
                for token in tokenizer.tokenize(status_text))
            if len(filtered_status_text) > 1:
                status_stats_item = status_stats.get(filtered_status_text, {
                    "full": status_text,
                    "count-boys": 0,
                    "count-girls": 0,
                })
                if user["sex"] == 2:
                    status_stats_item["count-boys"] += len(pazan_groups)
                if user["sex"] == 1:
                    status_stats_item["count-girls"] += len(pazan_groups)
                status_stats[filtered_status_text] = status_stats_item

# print result
dest_file_name = sys.argv[3]
with open(dest_file_name, "w", encoding="utf-8") as f_out:
    pass  # the body of this block is truncated in the source
# 'mystem' (a lemmatizer) and 'porter' (a stemmer) are initialized elsewhere.
WordsMyStem.create_table()
WordsPorter.create_table()
stop = stopwords.words('russian')
for article in articles:
    text = " ".join([article.title.lower(),
                     article.content.lower(),
                     article.keywords.lower()])
    for p in punctuation:
        text = text.replace(p, "")
    text = text.replace("\\n", "")
    text = re.sub(' +', ' ', text)
    text = [word for word in text.split() if word not in stop]
    text = " ".join(text)

    # Mystem
    mystem_words = mystem.lemmatize(text)
    raw_words = text.split()

    # Porter Stemmer
    porter_words = [porter.stem(word) for word in raw_words]

    for word in porter_words:
        data_porter = WordsPorter(id=uuid.uuid4(), term=word,
                                  article_id=article.id)
        data_porter.save(force_insert=True)

    for word in mystem_words:
        if word != ' ':
            data_mystem = WordsMyStem(id=uuid.uuid4(), term=word,
                                      article_id=article.id)
            data_mystem.save(force_insert=True)
def stemData(posts):
    global happy
    global sad
    global invert
    global shouldStemData
    statHap = {}
    statSad = {}
    statAll = {}
    from nltk.stem.snowball import RussianStemmer
    from nltk import word_tokenize, sent_tokenize
    from gensim.models.doc2vec import LabeledSentence
    import string
    stemmer = RussianStemmer()
    toRet = []
    curI = 0
    if shouldStemData:
        # renew smiles
        happy = stemmer.stem(happy)
        sad = stemmer.stem(sad)
    positives = []
    negatives = []
    for i in range(0, len(posts)):
        if i % 10000 == 0:
            print i
        sentences = sent_tokenize(posts[i])
        for j in range(0, len(sentences)):
            words = word_tokenize(sentences[j])
            for k in range(0, len(words)):
                try:
                    if shouldStemData and words[k] not in invert:
                        words[k] = unicode(stemmer.stem(words[k]))
                        # words[k] = cyr_to_r(words[k]).encode('utf8')
                    letters = u'абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'
                    words[k] = filter(
                        lambda x: x in letters + string.letters + string.digits + '.!?',
                        words[k])
                except Exception:
                    print 'failed word: ' + words[k]
                    raise Exception('')
            try:
                if words == []:
                    # Warning (kept as in the original): deleting while
                    # iterating over range(len(sentences)) can skip elements
                    # or raise IndexError near the end.
                    del sentences[j]
                    continue
                if words == [happy, '.']:
                    sentences[j] = LabeledSentence(words=words, tags=[happy])
                    if j > 0:
                        positives += [curI - 1]
                elif words == [sad, '.']:
                    sentences[j] = LabeledSentence(words=words, tags=[sad])
                    if j > 0:
                        negatives += [curI - 1]
                else:
                    for word in words:
                        statAll[word] = statAll.get(word, 0) + 1
                    if happy in words:
                        positives += [curI]
                        while happy in words:
                            words.remove(happy)
                        for word in words:
                            statHap[word] = statHap.get(word, 0) + 1
                    if sad in words:
                        negatives += [curI]
                        while sad in words:
                            words.remove(sad)
                        for word in words:
                            statSad[word] = statSad.get(word, 0) + 1
                    sentences[j] = LabeledSentence(words=words, tags=[str(curI)])
                    curI += 1
            except Exception, e:
                print words
                sentences[j] = ['']
                raise e
        toRet += sentences
class KareninaParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.inside_dd = False
        self.doc_id = 0
        self.token_count = 0
        self.token_sum_len = 0
        self.iindex = {}
        self.paragraphs = []
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = RussianStemmer()

    def handle_starttag(self, tag, attrs):
        if tag == "dd":
            self.inside_dd = True
            self.doc_id += 1
        else:
            self.inside_dd = False

    def handle_data(self, data):
        if self.inside_dd:
            self.paragraphs.append(data)
            terms = set()
            for token in self.tokenizer.tokenize(unicode(data.lower(), 'utf-8')):
                if token[0] in string.punctuation:
                    continue
                self.token_count += 1
                self.token_sum_len += len(token)
                term = self.stemmer.stem(token)
                if term not in terms:
                    terms.add(term)
                    # Record each doc_id at most once per term and paragraph.
                    if term in self.iindex:
                        self.iindex[term].append(self.doc_id)
                    else:
                        self.iindex[term] = [self.doc_id]

    def dump_iindex(self, output_name):
        output = open(output_name, 'wb')
        pickle.dump(self.iindex, output)
        output.close()

    def dump_paragraphs(self, output_name):
        output = open(output_name, 'wb')
        pickle.dump(self.paragraphs, output)
        output.close()

    def get_stat(self):
        term_sum_len = 0
        for term in self.iindex.keys():
            term_sum_len += len(term)
        term_count = len(self.iindex.keys())
        if not (term_count and self.token_count):
            self.stat = {}
        else:
            self.stat = {
                'token_count': self.token_count,
                'token_avg_len': self.token_sum_len / float(self.token_count),
                'term_count': term_count,
                'term_avg_len': term_sum_len / float(term_count),
            }
        return self.stat

    def print_iindex(self):
        for term in sorted(self.iindex.keys()):
            posting_list = self.iindex[term]
            print term
            print len(posting_list)
            print posting_list
            print '---------------------'
import csv     # missing in the original excerpt
import string  # missing in the original excerpt
import sys     # missing in the original excerpt; needed for reload(sys)
from decimal import *
from nltk.stem.snowball import RussianStemmer  # missing in the original excerpt
reload(sys)
sys.setdefaultencoding("utf-8")
from stop_words import get_stop_words

# the next line deletes the file content
open('text_after_cleaning.csv', 'w').close()

with open('text_after_cleaning.csv', 'w') as data_csv:
    fieldnames = ['post_text', 'stars']
    writer = csv.DictWriter(data_csv, fieldnames=fieldnames)
    writer.writeheader()
    with open('items.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        myPunctuation = u'–«»'
        exclude = set(string.punctuation + myPunctuation)  # defined but never applied in the original
        rs = RussianStemmer()                 # create once for all rows
        stop_words = get_stop_words('ru')     # load once for all rows
        for row in reader:
            post_text = unicode(row['post_text'], "utf-8")
            post_text = ''.join(i for i in post_text if not i.isdigit())
            words_after_deleting_stop_words = [
                w for w in post_text.split() if w not in stop_words]
            words_after_stemming = [
                rs.stem(w) for w in words_after_deleting_stop_words]
            text_after_cleaning = ' '.join(words_after_stemming)
            if text_after_cleaning:
                writer.writerow({'post_text': text_after_cleaning,
                                 'stars': row['stars']})