def _preprocess_string(news):
    _id, con, label = news
    preprocessed_con = []
    if isinstance(con, tuple):
        for _con in con:
            preprocessed_con.append(preprocess_string(str(_con)))
        preprocessed_con = tuple(preprocessed_con)
    else:
        preprocessed_con = preprocess_string(con)
    return _id, preprocessed_con, label
def _get_vocabulary(texts):
    # Tokenize every text with gensim's default preprocessing filters.
    sentences = [preprocess_string(text) for text in texts]
    # Count word frequencies across all tokenized texts (no set(), so
    # most_common() really orders words by frequency).
    count_words = Counter(word for lst in sentences for word in lst)
    sorted_words = count_words.most_common(len(count_words))
    # Map each word to an integer id, starting at 1 so that 0 stays free
    # for padding/unknown tokens.
    vocab_to_int = {w: i + 1 for i, (w, c) in enumerate(sorted_words)}
    return vocab_to_int
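# Hedged usage sketch for _get_vocabulary; the sample texts below are illustrative,
# not from the original project. It assumes `from collections import Counter` and
# `from gensim.parsing.preprocessing import preprocess_string` are in scope.
texts = [
    "Oranges and apples are fruits.",
    "People buy oranges at the market.",
]
vocab_to_int = _get_vocabulary(texts)
print(vocab_to_int)  # e.g. {'orang': 1, 'peopl': 2, ...}; 0 stays free for padding/unknown words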
def cleaning_pipe(document):
    transform_to_lower = lambda s: s.lower()
    remove_single_char = lambda s: re.sub(r'\s+\w{1}\s+', '', s)

    # Filters to be executed in pipeline
    CLEAN_FILTERS = [strip_tags,
                     strip_numeric,
                     strip_punctuation,
                     strip_multiple_whitespaces,
                     transform_to_lower,
                     remove_stopwords,
                     remove_single_char]

    # Invoking gensim.parsing.preprocess_string method with set of filters
    processed_words = preprocess_string(document, CLEAN_FILTERS)

    return processed_words
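# A minimal, self-contained sketch of calling cleaning_pipe; the sample document is
# illustrative. It assumes the built-in filters are imported from
# gensim.parsing.preprocessing and that `re` is imported for remove_single_char.
import re

from gensim.parsing.preprocessing import (preprocess_string, remove_stopwords,
                                           strip_multiple_whitespaces, strip_numeric,
                                           strip_punctuation, strip_tags)

tokens = cleaning_pipe("<p>The 3 quick brown foxes jumped over a lazy dog!</p>")
print(tokens)  # custom callables run in order alongside gensim's built-in filters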
def __iter__(self):
    with self.db_object as conn:
        cursor = conn.cursor()
        cursor.execute(self.__dbSQL.sql)
        record = cursor.fetchone()
        while record:
            document_text = record[self.__dbSQL.doc_location]
            if self.preprocessor:
                document_text = self.preprocessor.clean_text(document_text)
            yield parsing.preprocess_string(document_text)
            record = cursor.fetchone()
def save_word_dict(text):
    proc_text = []
    sentences = tokenize.sent_tokenize(text)
    for sentence in sentences:
        proc_sentence = preprocess_string(sentence)
        if len(proc_sentence) == 0:
            continue
        proc_text.append(proc_sentence)
    dictionary = corpora.Dictionary(proc_text)
    return [dictionary, proc_text, sentences]
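# Hedged usage sketch for save_word_dict; the sample text is illustrative. It assumes
# `from nltk import tokenize`, `from gensim import corpora`, and
# `from gensim.parsing.preprocessing import preprocess_string` are in scope, with the
# NLTK 'punkt' sentence tokenizer downloaded.
sample_text = ("Gensim builds dictionaries from tokenized text. "
               "Each sentence becomes a list of stemmed tokens.")
dictionary, proc_text, sentences = save_word_dict(sample_text)
print(dictionary)    # gensim Dictionary mapping each token to an integer id
print(proc_text[0])  # preprocessed tokens of the first sentence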
def print_cluster(self, cluster_id_list):
    """Prints the clusters in the given cluster list and performs coherence calculation.

    Args:
        cluster_id_list (list): A list of clusters that make up a coherent chain.
    """
    cluster_content = []
    print("----- Cluster -----")
    for cid in sorted(cluster_id_list):
        cur_cluster = self.clusters[cid - self.cluster_const]
        print(cur_cluster[7])
        print(cur_cluster[8])
        print(utils.load_nyt_by_article_id(str(cur_cluster[9])))
        print("Distance: ", cur_cluster[10])
        article_list = cur_cluster[4].strip("[]").split(", ")
        tmp = ""
        for article_id in article_list:
            res = utils.load_nyt_by_article_id(article_id)
            print(res[0][0] + " # " + res[0][4] + " # " + res[0][2])
            tmp += res[0][0] + " " + res[0][4] + " "
        cluster_content.append(parsing.preprocess_string(
            str.lower(str(tmp)),
            filters=[parsing.strip_tags, parsing.strip_punctuation,
                     parsing.strip_multiple_whitespaces,
                     parsing.strip_numeric, parsing.remove_stopwords]
        ))
        print()

    # Coherence calculation: the minimum word overlap between consecutive clusters.
    cluster_content = [list(set(x)) for x in cluster_content]
    coherence = sys.maxsize
    for i in range(0, len(cluster_content) - 1):
        cnt = 0
        tmp = []
        for word in cluster_content[i]:
            if word in cluster_content[i + 1]:
                tmp.append(word)
                cnt += 1
        print(tmp)
        coherence = min(coherence, cnt)
    print("Coherence: " + str(coherence))
def __iter__(self):
    db = self.dict_source.db_object
    dbsql = self.dict_source.db_sql
    with db as conn:
        cursor = conn.cursor()
        cursor.execute(dbsql.sql)
        record = cursor.fetchone()
        while record:
            self._length += 1
            document_text = record[dbsql.doc_location]
            pk, title = record[dbsql.unique_key_location], record[dbsql.title_location]
            if self.preprocessor:
                title = self.preprocessor.clean_text(title)
                document_text = self.preprocessor.clean_text(document_text)
            self.record_identifiers.append((pk, title))
            tokens = parsing.preprocess_string(document_text)
            yield self._dictionary.doc2bow(tokens)
            record = cursor.fetchone()
def tokenize_dictionary_content(self, licenses_dict):
    for name, content in licenses_dict.items():
        licenses_dict[name] = parsing.preprocess_string(content)
def searchIndbFacebookSaved(search_value):
    # Drop a few common stop words from the raw query string.
    for x in "and or it is the a".split():
        search_value = search_value.replace(" " + x + " ", " ")

    result = dbFacebookSaved.query.filter(
        dbFacebookSaved.title.ilike("%" + search_value.replace(" ", "%") + "%"))
    idList = [result.order_by(dbFacebookSaved.date)[count - 1].id
              for count in range(result.count(), 0, -1)]
    idDict = dict()
    idDict = adding_weight_to_dict(idDict, idList, 1)
    print(".ilike")
    print(idDict)

    stemmer = PorterStemmer()
    search_value = search_value.split()
    search_valueRaw = list(search_value)

    # Expand multi-word queries with terms similar to the summed fastText vector.
    if len(search_value) > 1:
        sumVector = model3['car'] * 0  # zero vector with the model's dimensionality
        for searchTerm in search_valueRaw:
            if searchTerm.lower() in model3.vocab:
                sumVector = sumVector + model3[searchTerm.lower()]
        similarList = model3.similar_by_vector(sumVector)
        print("similarList (sumVector)")
        print(similarList)
        print("New search value after sumVec:")
        search_value += [similarList[i][0] for i in range(min(5, len(similarList)))
                         if similarList[i][1] >= 0.72 and similarList[i][0] not in search_value]
        print(search_value)

    # Expand each query term with neighbours from word2vec (model) and fastText CBOW (model2).
    search_valueR = []
    for searchTerm in search_valueRaw:
        for i, mdl in enumerate([model, model2]):
            if searchTerm.lower() in mdl.vocab:
                similarList = mdl.most_similar(searchTerm.lower())
                listLength = 3 if i == 0 else 5
                scoreThreshold = 0.5 if i == 0 else 0.55
                tempText = (" from gensim_word2vec for relating to " if i == 0
                            else " from fasttext(CBOW) for relating to ")
                for j in range(min(listLength, len(similarList))):
                    if similarList[j][1] >= scoreThreshold and similarList[j][0] not in search_value:
                        search_value.append(similarList[j][0])
                        search_valueR.append(similarList[j][0])
                        print("append " + similarList[j][0] + tempText + searchTerm)

    print(search_value)
    for word in search_value:
        if word == stemmer.stem(word) or not stemmer.stem(word) in search_value:
            result = dbFacebookSaved.query.filter(dbFacebookSaved.title.contains(word))
            resultKwd = dbFacebookSaved.query.filter(dbFacebookSaved.keywords.contains(word))
            resultSummary = dbFacebookSaved.query.filter(dbFacebookSaved.summary.contains(word))

            # Down-weight expansion terms and words that gensim strips entirely.
            weight = 1
            if len(preprocess_string(word)) == 0:
                weight = 0.1
            elif word in search_valueR:
                weight = 0.5

            idList = [read_db_data_to_article(
                          result.order_by(dbFacebookSaved.date)[count - 1])['id']
                      for count in range(result.count(), 0, -1)]
            idDict = adding_weight_to_dict(idDict, idList, 1 * weight)
            print(".title.contains(" + word + ")")
            print(idDict)

            idList = [read_db_data_to_article(
                          resultKwd.order_by(dbFacebookSaved.date)[count - 1])['id']
                      for count in range(resultKwd.count(), 0, -1)]
            idDict = adding_weight_to_dict(idDict, idList, 0.5 * weight)
            print(".keywords.contains(" + word + ")")
            print(idDict)

            idList = []
            for count in range(resultSummary.count(), 0, -1):
                if (not resultSummary.order_by(dbFacebookSaved.date)[count - 1].id in idList
                        and len(preprocess_string(word)) > 0):
                    article = read_db_data_to_article(
                        resultSummary.order_by(dbFacebookSaved.date)[count - 1])
                    idList.append(article['id'])
                    cumsum = 0
                    # preprocess_string is a gensim function that preprocesses a string,
                    # e.g. people -> peopl, Oranges -> orang
                    word = preprocess_string(word)[0]
                    for w in article['text']:
                        if len(preprocess_string(w)) > 0:
                            w = preprocess_string(w)
                            if cumsum <= 0.6 and word in w:
                                idDict[article['id']] = idDict.get(article['id'], 0) + 0.2 * weight
                                cumsum = cumsum + 0.2 * weight
            print(".summary.contains(" + word + ")")
            print(idDict)
        else:
            print("ignore " + word + " for " + stemmer.stem(word))
    return idDict
from numpy import *
from sklearn.datasets import fetch_20newsgroups
from gensim.parsing import preprocess_string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from time import time

random.seed(0)

df = fetch_20newsgroups()
N = len(df.data)  # number of training documents

# Preprocessing (Porter stemmer etc.)
corpus = list(map(lambda s: " ".join(preprocess_string(s)), df.data))

# Use 90% of the documents for training and check whether the classes of the
# remaining 10% can be predicted correctly
indices = arange(N)
random.shuffle(indices)
train_indices = indices[:9 * N // 10]
test_indices = indices[9 * N // 10:]

# Load the corpus and build the feature vectorizer
vec = CountVectorizer()
vec.fit([corpus[i] for i in train_indices])

# Convert all documents into feature vectors
X = vec.transform(corpus)

# Naive Bayes (Bernoulli)
start = time()
def create_query(self, query_content_filepath, corpora_dict, lsi_model):
    query_content = self.get_file_content(query_content_filepath)
    tokenized_content = parsing.preprocess_string(query_content)
    vec_bag_of_words = corpora_dict.doc2bow(tokenized_content)
    vec_lsi = lsi_model[vec_bag_of_words]
    return vec_lsi
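# Standalone sketch mirroring the create_query flow outside the class; the documents,
# query string and num_topics are illustrative assumptions. It shows how the returned
# LSI vector can be scored against a similarity index.
from gensim import corpora, models, similarities
from gensim.parsing import preprocess_string

documents = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system"]
tokenized = [preprocess_string(doc) for doc in documents]
corpora_dict = corpora.Dictionary(tokenized)
corpus = [corpora_dict.doc2bow(tokens) for tokens in tokenized]
lsi_model = models.LsiModel(corpus, id2word=corpora_dict, num_topics=2)

vec_lsi = lsi_model[corpora_dict.doc2bow(preprocess_string("user interface for the computer system"))]
index = similarities.MatrixSimilarity(lsi_model[corpus])
print(sorted(enumerate(index[vec_lsi]), key=lambda item: -item[1]))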
def process_articles(self):
    final_articles = []
    for article in self.articles:
        final_articles.append(parsing.preprocess_string(article))
    return final_articles
def normalize_with_gensim(text):
    custom_filters = [strip_multiple_whitespaces, strip_numeric,
                      strip_punctuation, strip_short]
    indonesian_stopwords = set(stopwords.words('indonesian'))
    text = preprocess_string(to_unicode(text).lower(), custom_filters)
    text = [word for word in text if word not in indonesian_stopwords]
    return text, len(text)
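# Hedged usage sketch for normalize_with_gensim; the Indonesian sentence ("I am
# reading 2 books in the library.") is illustrative. It assumes the filters and
# preprocess_string come from gensim.parsing.preprocessing, to_unicode from
# gensim.utils, and stopwords from nltk.corpus (with the stopwords corpus downloaded).
tokens, n_tokens = normalize_with_gensim("Saya sedang membaca 2 buku di perpustakaan.")
print(tokens, n_tokens)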
def preprocess_new_document(doc):
    return preprocess_string(doc)
def find_similar(doc, model, *args, **kwargs):
    cleaned_doc = preprocess_string(doc)
    inferred_vector = model.infer_vector(cleaned_doc)
    sims = model.docvecs.most_similar([inferred_vector], **kwargs)
    return sims
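# Minimal Doc2Vec sketch for exercising find_similar; the corpus strings, vector_size
# and epochs are illustrative assumptions. Note that find_similar as written targets
# gensim 3.x, where the document vectors live under model.docvecs (gensim 4.x exposes
# them as model.dv).
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import preprocess_string

raw_docs = ["Cats chase mice around the house.",
            "Dogs love playing fetch in the park.",
            "Kittens and cats enjoy sleeping all day."]
tagged = [TaggedDocument(preprocess_string(doc), [i]) for i, doc in enumerate(raw_docs)]
model = Doc2Vec(tagged, vector_size=20, min_count=1, epochs=40)

print(find_similar("My cat is chasing a mouse.", model, topn=2))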
def encode(self, texts, seq_length):
    processed_texts = [preprocess_string(text) for text in texts]
    encoded_texts = []
    for text in processed_texts:
        encoded_texts.append([self.vocab.get(w, 0) for w in text])
    return self._pad_features(encoded_texts, seq_length)
import os

import numpy as np
import pandas as pd
from gensim.parsing import preprocess_string
from sklearn.preprocessing import LabelEncoder

data_path = '.'
rating = pd.read_feather(os.path.join(data_path, 'ratings.feather'))
print(rating.iidx.min())
user_num, item_num = rating.uidx.max() + 1, rating.iidx.max() + 1

word_set = set()
genre_set = set()
data = []
with open(os.path.join(data_path, 'movies.dat')) as f:
    for line in f:
        iidx_raw, title_raw, genre_raw = line.strip().split('::')
        iidx = int(iidx_raw)
        title_feat = preprocess_string(title_raw)  # tokenize/stem the movie title
        word_set.update(title_feat)
        genre_list = genre_raw.strip().split('|')
        genre_set.update(genre_list)
        data.append((iidx, title_feat, genre_list))

word_encoder = LabelEncoder().fit(list(word_set))
genre_encoder = LabelEncoder().fit(list(genre_set))
bow_title = np.zeros((item_num, len(word_set)))
bow_genre = np.zeros((item_num, len(genre_set)))
for iidx, word_list, genre_list in data:
    word_idx_list = word_encoder.transform(word_list)
    genre_idx_list = genre_encoder.transform(genre_list)
    bow_title[iidx, word_idx_list] += 1
def __iter__(self):
    for url, doc in scrape(self.testing):
        yield doc2vec.TaggedDocument(preprocess_string(doc), [url])
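# Hedged sketch of consuming such a TaggedDocument stream for Doc2Vec training.
# ScrapedCorpus and the in-memory pages below are hypothetical stand-ins for the
# original scrape()-backed iterator.
from gensim.models import doc2vec
from gensim.parsing.preprocessing import preprocess_string

class ScrapedCorpus(object):
    def __init__(self, pages):
        self.pages = pages  # list of (url, document text) pairs

    def __iter__(self):
        for url, doc in self.pages:
            yield doc2vec.TaggedDocument(preprocess_string(doc), [url])

corpus = ScrapedCorpus([("https://example.com/a", "First scraped article about cats."),
                        ("https://example.com/b", "Second scraped article about dogs.")])
model = doc2vec.Doc2Vec(vector_size=20, min_count=1, epochs=40)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)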