from gensim import corpora

# create a toy corpus of 2 documents, as a plain Python list
corpus = [[(1, 0.5)], []]

# serialise to disk
corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)

# read from disk
corpus = corpora.MmCorpus('/tmp/corpus.mm')
print(corpus)
print(list(corpus))
# or
for doc in corpus:
    print(doc)

# other ways to serialise to disk
# corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)
# corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
# corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)
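# Hedged aside (not in the original snippet): the alternative serialisers shown
# above round-trip the same way; e.g. an SVMlight file written with
# SvmLightCorpus.serialize can be streamed back with corpora.SvmLightCorpus.
corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)
svm_corpus = corpora.SvmLightCorpus('/tmp/corpus.svmlight')
print(list(svm_corpus))  # same documents, format-independent (id, weight) tuples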
# compile corpus (vectors of how many times each element appears)
raw_corpus = [dictionary.doc2bow(t) for t in tweets]
print("Then convert tokenized documents to vectors: %s" % type(raw_corpus))
corpora.MmCorpus.serialize('/tmp/tweets.mm', raw_corpus)  # store to disk
print("Save the vectorized corpus as a .mm file")
print()

# STEP 2 : similarity between corpora
print("STEP 2 : Transform and compute similarity between corpora")
print('-' * 10)
dictionary = corpora.Dictionary.load('/tmp/tweets.dict')
print("We load our dictionary : %s" % type(dictionary))
corpus = corpora.MmCorpus('/tmp/tweets.mm')
print("We load our vector corpus : %s " % type(corpus))

# Transform text with TF-IDF
tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model
print("We initialize our TF-IDF transformation tool : %s" % type(tfidf))
# corpus tf-idf
corpus_tfidf = tfidf[corpus]
print("We convert our vector corpus to TF-IDF space : %s" % type(corpus_tfidf))
print()

# STEP 3 : Create similarity matrix of all docs
print("STEP 3 : Create similarity matrix of all docs")
print('-' * 10)
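# The snippet above announces STEP 3 but cuts off before the code. A minimal
# sketch of what that step usually looks like in gensim (the index construction
# is an assumption here, not part of the original):
from gensim import similarities

index = similarities.MatrixSimilarity(corpus_tfidf, num_features=len(dictionary))
for doc_sims in index:  # one row of pairwise cosine similarities per document
    print(doc_sims)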
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim import corpora, models, similarities

dictionary = corpora.Dictionary.load('./tmp/deerwester.dict')
corpus = corpora.MmCorpus("./tmp/deerwester.mm")
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

doc = "Human computer interaction"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
# print(vec_lsi)

index = similarities.MatrixSimilarity(lsi[corpus])
index.save('./tmp/deerwester.index')

sims = index[vec_lsi]
# print(list(enumerate(sims)))
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims)
def load_corpus(corpus_file):
    corpus_ = corpora.MmCorpus(corpus_file)
    return corpus_
import sqlite3
import pandas as pd
import numpy as np
import time
import pickle
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize.moses import MosesDetokenizer
from stop_words import get_stop_words
from collections import defaultdict
import gensim
from gensim import corpora, models, similarities
from gensim.models.doc2vec import TaggedDocument, TaggedLineDocument
from gensim.models import Doc2Vec
import gensim.models.doc2vec

print('loading...')
corpus = corpora.MmCorpus('/volume/models/corpus.mm')
dictionary = corpora.Dictionary.load('/volume/models/dictionary.dict')

print('training lsi...')
start_time = time.time()
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=10)
lsi.save('/volume/models/tax_model.lsi')
print("--- %s seconds ---" % (time.time() - start_time))

print('training lda...')
start_time = time.time()
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10)
lda.save('/volume/models/tax_model.lda')
print("--- %s seconds ---" % (time.time() - start_time))
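# Brief follow-on sketch (not part of the original script): inspect what the two
# models learned before relying on the saved files.
lsi.print_topics(10)
lda.print_topics(10)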
dictionary.save(os.path.join(__location__, 'data/KeyVis.dict'))  # store dictionary for future reference
# dictionary = corpora.Dictionary.load(os.path.join(__location__, 'data/KeyVis.dict'))

"""Initialize corpus"""
class MyCorpus(object):
    def __iter__(self):
        for line in open(os.path.join(__location__, 'KeyVisCorpora', 'abstracts.txt'), 'rU'):
            line = unicode(line, errors='ignore')
            lowers = line.lower()
            tokenList = lowers.split()
            output = [stem(word, stemmer=LEMMA) for word in tokenList]
            # assume there's one document per line, tokens separated by space
            yield dictionary.doc2bow([x.strip() for x in output])

corpus = MyCorpus()

"""tf-idf transformation; tfidf is a read-only object that converts any vector
from the old representation to the new representation"""
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# save in Matrix Market format
corpora.MmCorpus.serialize(os.path.join(__location__, 'data/KeyVis_tfidf.mm'), corpus_tfidf)
mm = corpora.MmCorpus(os.path.join(__location__, 'data/KeyVis_tfidf.mm'))
print "DONE:", mm
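# Hedged aside (not in the original): because tfidf is read-only, it can also be
# applied on the fly to a single new bag-of-words vector; the example tokens here
# are made up.
new_doc_bow = dictionary.doc2bow(['visualization', 'graph'])
print(tfidf[new_doc_bow])  # list of (token_id, tfidf_weight) tuples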
def loadCorpus(p):
    c_path = os.path.join(PROJECT_ROOT, "data/{0}_corpus.mm".format(p))
    c = corpora.MmCorpus(c_path)
    return c
def lda(input_file=sys.argv[1]):
    # List of stop words to remove.
    slothlib_stopwords = []
    with open("./slothlib.txt", "r") as f:
        slothlib_stopwords = [line.strip() for line in f]

    separated_document_list = documents_wakati(input_file)

    # Remove stop words.
    separated_document_list = [[word for word in doc if word not in slothlib_stopwords]
                               for doc in separated_document_list]

    # Generate a dictionary.
    dictionary = corpora.Dictionary(separated_document_list)
    dictionary.filter_extremes(no_below=2, no_above=0.3)  # (provisional)
    dictionary.save_as_text('dict.txt')

    # Generate a corpus.
    corpus = [dictionary.doc2bow(text) for text in separated_document_list]
    corpora.MmCorpus.serialize('cop.mm', corpus)

    dictionary = gensim.corpora.Dictionary.load_from_text('dict.txt')
    corpus = corpora.MmCorpus('cop.mm')

    # Create a model by Hierarchical Dirichlet Process.
    #topic_N = 150
    #model = gensim.models.hdpmodel.HdpModel(corpus=corpus, id2word=dictionary)

    # Create a model by Latent Dirichlet Allocation.
    topic_N = 20
    model = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=topic_N, id2word=dictionary)

    # Topics (max. 150), and the words that constitute each topic.
    #topics_list = model.print_topics(num_topics=-1, num_words=999999)
    topics_list = model.print_topics(num_topics=topic_N, num_words=999999)

    # Count the topics estimated above.
    estimated_topicnum_list = []
    topics = [model[c] for c in corpus]
    json_data = {}
    #with open("models/lda20_2_30per.json", "r") as f:
    #    lda_result = json.load(f)
    with open("lda20_2_30per.json", "w") as f:
        for i in range(len(topics)):
            if len(topics[i]) == 0:
                continue
            else:
                print(i, u"-th document's topics: ", topics[i])  # [(topic_index, topic_weight), ...]
                for topic_and_prob_tuple in topics[i]:
                    estimated_topicnum_list.append(topic_and_prob_tuple[0])
                index = [j[0] for j in topics[i]]
                c = [j[1] for j in topics[i]]
                top_n_word = []  # words (= N) related to the topic
                top_n = 10
                counter = 0
                words_list = topics_list[index[np.argmax(c)]][1].split("+")
                for w in words_list:
                    # 0.000*"hoge" -> hoge
                    w = w.replace("*", "").replace(" ", "").replace('"', '')[5:]
                    if w in separated_document_list[i]:
                        if counter == top_n:
                            break
                        #if w not in lda_result[str(i)]:
                        #    top_n_word.append(w)
                        #    counter += 1
                        top_n_word.append(w)
                        counter += 1
                print(i, top_n_word)
                json_data[i] = top_n_word
        json.dump(json_data, f, indent=4, sort_keys=True, separators=(',', ': '))

    estimated_topicnum_dict = collections.Counter(estimated_topicnum_list)
    print(u"Number of estimated topics: ", len(estimated_topicnum_dict))

    # Calculate the occurrence probability of each word in the topics that best
    # represent the input document.
    input_text_topics = topics[len(separated_document_list) - 1]  # [(topic's index, occProb), ...]
    #word_prob_in_topic = model.print_topics(num_topics=-1, num_words=len(dictionary))  # [(topic's index, u"'occProb*word', ..."), ...]
    word_prob_in_topic = model.print_topics(num_topics=topic_N, num_words=len(dictionary))  # [(topic's index, u"'occProb*word', ..."), ...]
    word_prob_in_topic_dic = {}  # {"word": "prob", ...}
    for input_text_topic in input_text_topics:
        word_prob_in_topic_list = word_prob_in_topic[input_text_topic[0]][1].split(",")
        for factors in word_prob_in_topic_list:
            factor = factors.split("*")
            if len(factor) == 2:
                if factor[1] in word_prob_in_topic_dic:
                    word_prob_in_topic_dic[factor[1]] = str(
                        float(word_prob_in_topic_dic[factor[1]]) + float(factor[0]))
                else:
                    word_prob_in_topic_dic[factor[1]] = factor[0]

    return word_prob_in_topic_dic
def load_corpus(self):
    """Load self.corpus from a file it was saved to earlier."""
    self.corpus = corpora.MmCorpus(self.corpus_filepath)
class EmotionShot(object):
    """Compute a shot's emotion according to the algorithm in the paper."""

    """Class attributes"""
    print "*****************************Loading class attributes**********************************"
    print 'Loading TopicInfo'
    ldaParameter = {'topicNum': 20, 'iteration': 500}
    print 'ldaParameter:', ldaParameter
    topicFile = GLOBAL_generatedFiles + '/txtall_t' + str(
        ldaParameter['topicNum']) + '_it' + str(ldaParameter['iteration']) + '.txt'
    et = EmotionTopic(topicFile)
    _TopicsInfo = et.topicsInfo()

    print '\nLoading the Dictionary, LDA and list_corpora, index ...'
    # load the corpus and the LDA model
    myDictLocation = GLOBAL_generatedFiles + '/' + GLOBAL_dictionaryName
    myCorporaLocation = GLOBAL_generatedFiles + '/' + GLOBAL_corporaTfidfName
    myLDALocation = GLOBAL_generatedFiles + '/' + 'topics' + str(
        ldaParameter['topicNum']) + '___iterations' + str(ldaParameter['iteration']) + '___.lda'
    _dictionary = corpora.Dictionary.load(myDictLocation)
    _list_corpus = corpora.MmCorpus(myCorporaLocation)
    _lda = models.LdaModel.load(myLDALocation)

    print '\nCommunicating the index...'
    # _index is used later to retrieve similar shots; not needed when analysing a single shot
    _index = similarities.MatrixSimilarity(_lda[_list_corpus])

    print "\nCalculating all Movie Emotion Vectors...."
    em = EmotionMovie()
    em.calculateEmoMovieVector()
    MoviesVectors = em.allEmoMovie  # dict: allEmoMovie:{movieName1:<Counter>, movieName2:<Counter>}
    print "*****************************Loading class attributes END********************************"
    """Class attributes"""

    def __init__(self, shotLocation):
        """
        Initialization (see SimilarityUtil.py): dictionary, LDA
        :return:
        """
        # location of the shot (window)
        self.shotLocation = shotLocation
        # movie this shot belongs to
        self.belongedMovie = extractMovieName(shotLocation=shotLocation)
        self.wordsCount = 0  # number of words in the shot
        self.numofMaxTopic = -1
        self.maxTopicWeight = 0
        # shot emotion vector
        self.shotVector = Counter({
            'surprise': 0,
            'sorrow': 0,
            'love': 0,
            'joy': 0,
            'hate': 0,
            'expect': 0,
            'anxiety': 0,
            'anger': 0,
        })

    def emoCalculate4OneShot(self):
        """
        Compute the emotion vector of shot_i.
        :return:
        """
        if self.shotLocation.find('.txt') == -1:
            print 'It is not a windowTXT.'
            return
        fr = open(self.shotLocation, 'r')
        windowContent = fr.read().decode('utf-8', 'ignore').encode('utf-8')  # ignore traditional Chinese characters
        windowCounter = Counter()
        fill_windowCounter(windowContent, windowCounter)  # InitialCorporaUtil.fill_windowCounter
        listWindowContent = []  # the words in this shot
        # print 'The words in this Shot:'
        for key, count in windowCounter.items():
            for x in xrange(count):
                # print key,
                listWindowContent.append(key)
        # print the number of words in the shot
        # print '\nnumber of shotwords:', len(listWindowContent)
        self.wordsCount = len(listWindowContent)

        vec_bow = EmotionShot._dictionary.doc2bow(listWindowContent)
        vec_lda = EmotionShot._lda[vec_bow]  # topic distribution of this document (may be a new document)
        self.maxTopicWeight = 0
        maxtopic = -1  # index of the topic with the largest weight in this shot
        for tuple_topic in vec_lda:
            if tuple_topic[1] > self.maxTopicWeight:
                self.maxTopicWeight = tuple_topic[1]
                maxtopic = tuple_topic[0]
        self.numofMaxTopic = maxtopic
        # print '\nMaxTopic:', self.numofMaxTopic, self.maxTopicWeight
        # print this shot's maxTopicInfo
        # print '***************maxTopicInfo*****************'
        # EmotionShot.et.show_topic(self.numofMaxTopic)

        # compute the shot's emotion vector
        self.calculateShotVector(shotWordsList=listWindowContent)

    def calculateShotVector(self, shotWordsList):
        """
        Called by emoCalculate4OneShot; computes the shot's emotion vector.
        :param shotWordsList: the words of this shot: list<string>
        :return:
        """
        # print '*********************calculating shotVector**********************'
        maxtopicInfo = EmotionShot._TopicsInfo[self.numofMaxTopic]
        count4shotDictWord = 0  # words found in the dictionary
        count4shotTopicWord = 0  # words not in the dictionary but found in the max topic
        count4for = 0
        # two counters: the total vector of dictionary words, and the total vector of topic words
        shotVector_dict = Counter()
        shotVector_topic = Counter()
        for shotword in shotWordsList:  # iterate over the words in the shot
            count4for += 1
            flagfind = False
            for emoword in EmotionShot.et.Emodictionary:
                if emoword.word == shotword:
                    flagfind = True
                    count4shotDictWord += 1
                    # self.shotVector.update(emoword.emotionVector)
                    # print 'indict:', emoword.word, emoword.emotionVector
                    shotVector_dict.update(emoword.emotionVector)
                    break
            if not flagfind:  # not found in the emotion dictionary
                # equation (4) in the paper
                for topicItem in maxtopicInfo['topicWords']:
                    if shotword in topicItem:
                        count4shotTopicWord += 1
                        alpha, topicword = topicItem.split('*')
                        alpha = float(alpha)
                        # equation (4), otherwise case: vector for words not found in the dictionary
                        wordVector = calculateWordVectorInMaxTopic(
                            maxTopicVector=maxtopicInfo['topicVector'],
                            maxTopicWeight=self.maxTopicWeight,
                            wordWeightInMaxTopic=alpha)
                        # print ' intopic:', shotword, wordVector
                        # self.shotVector.update(wordVector)
                        shotVector_topic.update(wordVector)
                        break
        # print info on the two word types: inDict and inTopic
        # print 'count4shotDictWord:', count4shotDictWord
        # print 'count4shotTopicWord:', count4shotTopicWord
        # print 'shotVector_dict:', shotVector_dict
        # print 'shotVector_topic', shotVector_topic
        self.shotVector.update(shotVector_dict)
        self.shotVector.update(shotVector_topic)
        # print 'shotBelongedMovie:', self.belongedMovie
        # print 'shotVector:', self.shotVector

    @property
    def list_corpus(self):
        return self._list_corpus
corpus)  # store to disk, for later use
print(corpus[:10])

# In[5]:

from gensim import models, similarities
import os

if os.path.exists('/Users/jordanchisam/Desktop/ProgrammingTextAnalysis/corpora/practicenovel.dict'):
    dictionary = corpora.Dictionary.load(
        '/Users/jordanchisam/Desktop/ProgrammingTextAnalysis/corpora/practicenovel.dict')
    corpus = corpora.MmCorpus(
        "/Users/jordanchisam/Desktop/ProgrammingTextAnalysis/corpora/corpusnovel.mm")
    print("Lets get to work")
else:
    print("Invalid data set provided")

# ## Model
#
# Decided to continue with the TF-IDF model, as opposed to raw word counts or other
# frequency-weighting methods, because TF-IDF works well for measuring the significance
# of words. Additionally, it properly shows the similarities and differences between texts.

# In[6]:

tfidf = models.TfidfModel(corpus)

# In[7]:
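# A small follow-on sketch (not in the original notebook) showing what the model
# produces: tf-idf weights for the first document, mapped back to words.
first_bow = next(iter(corpus))
print([(dictionary[term_id], round(weight, 3)) for term_id, weight in tfidf[first_bow]])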
def load(dic_path, cor_path):
    dic = corpora.Dictionary.load_from_text(dic_path)
    cor = corpora.MmCorpus(cor_path)
    return dic, cor
    groupdic = {
        d[0]: d[1]
        for d in [i.split(' ')
                  for i in open('texts/' + prefix + '/metadata_extra.txt', 'r').read().strip().split('\n')]
    }
    groups = [groupdic[i] for i in labels]
else:
    groups = [l.split('.')[0] for l in labels]

# step 1: prepare corpus
# prepare_corpus([open('texts/' + section, 'r').read() for section in book])
dictionary = corpora.Dictionary.load('texts/' + prefix + '/dictionary.dict')
corpus = corpora.MmCorpus('texts/' + prefix + "/corpus.mm")

# step 2: create tf-idf model
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]  # convert vector space to tf-idf space

# step 2.1: create LDA model
numTopics = 50
lda = models.ldamodel.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=numTopics)
topics = lda.show_topics(num_topics=numTopics)
# print topics

for text in corpus:
    for id, freq in lda[text]:
        print(dictionary[id], freq)
    # Create the term dictionary of our corpus, where every unique term is assigned an index.
    dictionary = corpora.Dictionary(doc_clean)
    # Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.5)
    dictionary.save('./models/dictionary.dict')
else:
    dictionary = corpora.Dictionary.load('./models/dictionary.dict')
    print("preprocessed dictionary loaded...")

if not os.path.exists('./models/corpus.mm'):
    # Convert the list of documents (corpus) into a document-term matrix using the dictionary prepared above. [Bag of Words]
    corpus = [dictionary.doc2bow(doc) for doc in doc_clean]
    # Save the matrix in Matrix Market format.
    corpora.MmCorpus.serialize('./models/corpus.mm', corpus)
else:
    corpus = corpora.MmCorpus('./models/corpus.mm')
    print("document-term matrix loaded...")

# Use TF-IDF model
tfidf = gensim.models.TfidfModel(corpus, normalize=True)
corpus_tfidf = tfidf[corpus]
print("de-normalize tf-idf corpus...")
corpus_tfidf = map(lambda x: map(lambda y: (y[0], round(y[1] * 200, 1)), x), corpus_tfidf)
# pprint(dictionary[237])
print("tfidf weights of the first document after de-normalization")
# print("BOW of the first document")
# pprint(map(lambda x: (dictionary[x[0]], x[1]), corpus[0]))
# pprint(len(corpus[0]))
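# The print above announces the first document's de-normalized weights, but the
# line that would show them is commented out. A hedged sketch of that display,
# materializing the lazy map objects first (Python 3's map is a one-shot iterator):
from pprint import pprint
corpus_tfidf = [list(doc) for doc in corpus_tfidf]
pprint([(dictionary[term_id], weight) for term_id, weight in corpus_tfidf[0]])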
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)  # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
lsi.print_topics(2)

todas = []
for doc in corpus_lsi:  # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    todas.append(doc)
todas

from gensim import corpora, models, similarities

dictionary = corpora.Dictionary.load('/tmp/deerwester4.dict')
corpus = corpora.MmCorpus('/tmp/deerwester4.mm')  # comes from the first tutorial, "From strings to vectors"
print(corpus)
np.array(corpus).shape

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

p = []
for i in range(0, len(documents)):
    doc1 = documents[i]
    vec_bow2 = dictionary.doc2bow(doc1.lower().split())
    vec_lsi2 = lsi[vec_bow2]  # convert the query to LSI space
    p.append(vec_lsi2)

index = similarities.MatrixSimilarity(
    raw_corpus)
    corpora.MmCorpus.serialize('/tmp/' + str(i) + '.mm', raw_corpus)  # store to disk
    print "Save the vectorized corpus as a .mm file"
    print

# STEP 2 : similarity between corpora
print "STEP 2 : Transform and compute similarity between corpora"
print '-' * 10
t0 = time()
for i, t in enumerate(to_vectorize):
    dictionary = corpora.Dictionary.load('/tmp/' + str(i) + '.dict')
    print "Load our dictionary : %s" % type(dictionary)
    corpus = corpora.MmCorpus('/tmp/' + str(i) + '.mm')
    print "Load our vector corpus : %s " % type(corpus)

    # Train the TF-IDF model
    tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model
    print "Initialize our TF-IDF transformation tool : %s" % type(tfidf)
    # corpus tf-idf
    corpus_tfidf = tfidf[corpus]
    print "Convert our vectors corpus to TF-IDF space : %s" % type(corpus_tfidf)

    print "Save the transformed corpus"
    corpus_tfidf.save('/tmp/' + str(i) + '.trans')
    print
        # Following other students, filter out meaningless punctuation, particles, etc.
        word = words[i]
        if '/w' not in word and '/y' not in word and '/u' not in word \
                and '/c' not in word:
            doc.append(word)
    documents.append(doc)

# Keep keywords that occur more than once as the bag of words; vectorize the
# documents on that basis, using TF-IDF as the word weights
fre = {}
for doc in documents:
    for word in doc:
        if word in fre:
            fre[word] += 1
        else:
            fre[word] = 1
documents = [[word for word in doc if fre[word] > 1] for doc in documents]
bag = corpora.Dictionary(documents)  # bag of words
corpus = [bag.doc2bow(doc) for doc in documents]  # list of count-based document vectors
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# convert to a list of TF-IDF document vectors and persist it
corpora.MmCorpus.serialize('tmp/corpus_tfidf.mm', corpus_tfidf)
corpus_tfidf = corpora.MmCorpus('tmp/corpus_tfidf.mm')

# Build a document-similarity index for queries, then query with the document list
# itself (cosine similarity by default)
index = similarities.SparseMatrixSimilarity(corpus_tfidf)
with open('result.csv', 'w') as f_out:
    for sims in index[corpus_tfidf]:
        f_out.write(','.join(map(str, sims)) + '\n')

print("Total time (seconds): " + str(time.time() - start_time))
all_tokens = sum(cleaned_comment, [])
token_set = set(all_tokens)
tokens_once = set(word for word in token_set if all_tokens.count(word) == 1)
comment_tokens = [[word for word in text if word not in tokens_once]
                  for text in cleaned_comment]
pickle.dump(comment_tokens, open('../data/comment_tokens.pkl', 'wb'))

print "making dict"
dictionary = corpora.Dictionary(comment_tokens)
dictionary.save('../model/comments_' + sys.argv[1] + '.dict')  # store the dictionary, for future reference
print(dictionary)

print "making corpus"
corpus = [dictionary.doc2bow(text) for text in comment_tokens]
corpora.MmCorpus.serialize('../model/comments' + sys.argv[1] + '.mm', corpus)

print "loading corpus"
mm = corpora.MmCorpus('../model/comments' + sys.argv[1] + '.mm')
print mm

#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
#print "training LDA model"
#lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=dictionary, num_topics=int(sys.argv[1]), update_every=1, chunksize=10000, passes=5)
#lda.print_topics()
#save_name = '../model/lda_model_' + sys.argv[1] + '.pkl'
#print 'saving model ', save_name
#pickle.dump(lda, open(save_name, 'wb'))
def calc_similarity(self, prefix: str, sysno: int, text: str):
    """Compute similarity; returns matching record ids and an update flag.

    Arguments:
        prefix {str} -- model name prefix
        sysno {int} -- record number of the text being checked
        text {str} -- text data
    Only records whose cosine similarity exceeds self.keep_val are returned.
    """
    dictionary = corpora.Dictionary.load('./models/{}_dict.dic'.format(prefix))  # load the dictionary
    corpus = corpora.MmCorpus('./models/{}_corpuse.mm'.format(prefix))  # load the corpus
    tfidf_model = models.TfidfModel.load("./models/{}_tfidf_model.model".format(prefix))  # load the TF-IDF model
    corpus_tfidf = tfidf_model[corpus]
    lsi = models.LsiModel(corpus_tfidf)
    corpus_lsi = lsi[corpus_tfidf]
    similarity_lsi = similarities.Similarity('./models/similarity-lsi-index',
                                             corpus_lsi,
                                             num_features=400,
                                             num_best=1)

    cut_raw = self.segment(text)  # 1. segment the text
    corpus = dictionary.doc2bow(cut_raw)  # 2. convert to a BoW vector
    corpus_tfidf = tfidf_model[corpus]  # 3. compute TF-IDF values
    corpus_lsi = lsi[corpus_tfidf]  # 4. compute LSI values
    sims = similarity_lsi[corpus_lsi]

    def find_idx(x):
        dtc = self.mongo_db.find_one("{}_idx".format(prefix), {"_id": int(x)})
        val = None
        if dtc is not None:
            val = dtc["data"]
        return val

    ids_dic = []
    if sims is not None:
        # take the indexes
        index_dic = [(idx + 1) for idx, val in sims if val > self.keep_val]
        # look up the record numbers
        for x in index_dic:
            tt = find_idx(x)
            if tt is not None:
                ids_dic.append(tt)

    # check whether the record numbers exist
    idxs = self.mongo_db.find("{}_idx".format(prefix), {"data": {"$in": ids_dic}}).sort([("_id", -1)])
    ids = self.mongo_db.find("{}_idx".format(prefix), {"data": sysno})
    if len(ids_dic) > 0:
        # latest record
        _id = idxs[0]["_id"]
        if ids.count() > 0:  # the record number exists
            is_update = False
            if _id not in index_dic:
                # the latest index is not among the returned indexes: the text duplicates
                # previously edited content, so update the model
                ids_dic = []
                is_update = True
        else:
            # the record number does not exist: the text duplicates earlier content
            is_update = False
    else:
        # record exists and content is not a duplicate: this is an edit, update the model;
        # record does not exist and content is not a duplicate: this is new, update the model
        is_update = True
    return ids_dic, is_update
# remove stop words and words that appear only once
dictionary.filter_tokens(once_ids)
# remove gaps in id sequence after words that were removed
dictionary.compactify()
print(dictionary)
dictionary.save(os.path.join('C:\\Users\\soapk\\Desktop', saved_filename + ".dict"))
# print(len(dictionary))

if os.path.exists(os.path.join('C:\\Users\\soapk\\Desktop', saved_filename + ".mm")):
    corpus = corpora.MmCorpus(os.path.join('C:\\Users\\soapk\\Desktop', saved_filename + ".mm"))
else:
    corpus = [dictionary.doc2bow(title) for title in so_titles]
    corpora.MmCorpus.serialize(
        os.path.join('C:\\Users\\soapk\\Desktop', saved_filename + ".mm"), corpus)

tfidf = models.TfidfModel(corpus)

if os.path.exists(os.path.join('C:\\Users\\soapk\\Desktop', saved_filename + ".index")):
    index = similarities.SparseMatrixSimilarity.load(
        os.path.join('C:\\Users\\soapk\\Desktop', saved_filename + ".index"))
else:
    corpus_tfidf = tfidf[corpus]
from nltk.stem import SnowballStemmer
from gensim import corpora, models, similarities
import logging
import glob
import errno
import re
import os

'''log info that is otherwise not printed'''
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

'''read generated files'''
if (os.path.exists("D:/HOME/TCSS456/NLP-TCSS-456-A-Winter-2018/2017/tmp/dictionary.dic")):
    dictionary = corpora.Dictionary.load(
        'D:/HOME/TCSS456/NLP-TCSS-456-A-Winter-2018/2017/tmp/dictionary.dic')
    corpus = corpora.MmCorpus(
        'D:/HOME/TCSS456/NLP-TCSS-456-A-Winter-2018/2017/tmp/corpus.cop')
    print("Used files generated")
else:
    print("Please generate data set")

##print(list(corpus))
##print(dictionary)
##print(dictionary.token2id)

'''TF-IDF'''
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
##for doc in corpus_tfidf:
##    print(doc)

print('10----------------------------------------------------------------------------------------------------------------')
""" train baseline LDA model. """ import UbuntuCorpus as UC from gensim import corpora, models, similarities import logging num_topics = 100 logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) dictionary = corpora.Dictionary.load_from_text('tmp/dialogs4.dict') corpus = corpora.MmCorpus('tmp/dialogs4-corpus.mm') # compute tfidf tfidf = models.TfidfModel(corpus) # convert the corpus to tfidf representation corpus_tfidf = tfidf[corpus] lda = models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=num_topics, update_every=1, eta=0.02, chunksize=10000, passes=10) print("****TOP TOPICS****") lda.print_topics(10)
    # Create the term dictionary of our corpus, where every unique term is assigned an index.
    dictionary = corpora.Dictionary(doc_clean)
    # Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.5)
    dictionary.save('models/3/new-rc-lda.dict')
else:
    dictionary = corpora.Dictionary.load('models/3/new-rc-lda.dict')

if not os.path.exists('models/3/new-doc-term.mm'):
    # Convert the list of documents (corpus) into a document-term matrix using the dictionary prepared above. [Bag of Words]
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    # Save the matrix in Matrix Market format.
    corpora.MmCorpus.serialize('models/3/new-doc-term.mm', doc_term_matrix)
else:
    doc_term_matrix = corpora.MmCorpus('models/3/new-doc-term.mm')
# pprint(doc_term_matrix)

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Save the LDA model
if not os.path.exists('models/3/new-model.lda'):
    # Create the object for the LDA model using the gensim library
    Lda = gensim.models.ldamodel.LdaModel
    # Run and train the LDA model on the document-term matrix.
    ldamodel = Lda(doc_term_matrix, num_topics=100, id2word=dictionary, passes=50)
    ldamodel.save('models/3/new-model.lda')
else:
# store to disk, for later use
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)

class MyCorpus(object):
    def __iter__(self):
        for line in open('mycorpus.txt'):
            # assume there's one document per line, tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())

corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory!
print(corpus_memory_friendly)

dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
corpus = corpora.MmCorpus('/tmp/deerwester.mm')
print(corpus)

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

# convert the query to LSI space
doc = "Human computer interaction"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
print(vec_lsi)

# transform corpus to LSI space and index it
index = similarities.MatrixSimilarity(lsi[corpus])

# Saving and loading index
index.save('/tmp/deerwester.index')
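# Companion sketch completing the "Saving and loading index" comment above: the
# saved index can be reloaded later and queried with the LSI vector built earlier.
index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')
sims = index[vec_lsi]  # cosine similarity of the query against every indexed document
print(sorted(enumerate(sims), key=lambda item: -item[1]))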
from gensim.models.ldaseqmodel import LdaSeqModel
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from nltk import word_tokenize
from tqdm import tqdm
from csv import DictReader
from collections import defaultdict
import pprint

pp = pprint.PrettyPrinter(indent=4)

id2word = corpora.Dictionary.load('tokens.dict')
mm = corpora.MmCorpus('messages.mm')
ldaseq = LdaModel(corpus=mm, id2word=id2word, num_topics=15)
pp.pprint(ldaseq.print_topics())
ldaseq.save("lda_model")
# GLOBAL calculation for weights (same for all recommendations)
# Weights: reciprocal ranks of the 500 items in each list (hd2v and bm25)
hybrid_weights = [1 / (i + 1) for i in range(500)]
hybrid_weights.extend(hybrid_weights)
hybrid_weights = np.array(hybrid_weights)
# Convert to probabilities
hybrid_weights = hybrid_weights / hybrid_weights.sum()

# GLOBAL num_items_to_pick (with replacement) -- high number: one million
num_picks = 1000000

# LOAD MODELS
loadmodstart = time()
id2word_dictionary = corpora.Dictionary.load('/home/ashwath/Programs/ACLAAn/LDA/aclmag.dict')
corpus = corpora.MmCorpus('/home/ashwath/Programs/ACLAAn/LDA/aclmag_bow_corpus.mm')
try:
    ldamallet = LdaMallet.load('/home/ashwath/Programs/ACLAAn/LDA/lda_model.model')
    vec_bow_test = id2word_dictionary.doc2bow(['test'])
    vec_ldamallet = ldamallet[vec_bow_test]
except subprocess.CalledProcessError:
    print("LDA MALLET COULDN'T READ INSTANCE FILE. USING NORMAL LDA INSTEAD")
    ldamallet = LdaModel.load('/home/ashwath/Programs/ACLAAn/LDA/ldanormal_acl.model')

#index = similarities.MatrixSimilarity(ldamallet[corpus])
#index.save("simIndex.index")
malletindex = similarities.MatrixSimilarity.load('/home/ashwath/Programs/ACLAAn/LDA/simIndexAcl.index')
with open(
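# The comments above describe turning reciprocal ranks into a sampling
# distribution and drawing num_picks items with replacement. A minimal sketch of
# that draw (the candidate id array is illustrative, not from the original):
candidate_ids = np.arange(len(hybrid_weights))  # stand-in for the 1000 ranked items
picks = np.random.choice(candidate_ids, size=num_picks, replace=True, p=hybrid_weights)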
corr_phi_tot = np.zeros([1, nbrun])
cosine_theta_tot = np.zeros([1, nbrun])
cosine_phi_tot = np.zeros([1, nbrun])
KL_theta_tot = np.zeros([1, nbrun])
KL_phi_tot = np.zeros([1, nbrun])

num_topics = 3
num_docs = 100
term_per_doc = 100
voc_size = 1000
beta = [0.01 for i in range(voc_size)]
alpha = [1 for i in range(num_topics)]

for i in range(nbrun):
    X, p_generate, theta_generate, phi_generate, data = generate_data(
        num_topics, num_docs, term_per_doc, voc_size, alpha, beta)
    dct = corpora.Dictionary.load('dct.dict')
    corpus = corpora.MmCorpus('corpus.mm')
    num_words = len(dct)
    print("Current corpus: ", i)
    if nbrun == 1:
        phi_gensim, corr_theta_gensim, corr_phi_gensim, cosine_theta_gensim, cosine_phi_gensim, KL_theta_gensim, KL_phi_gensim = lda_train(
            p_generate, theta_generate, phi_generate, num_topics, num_docs)
        phi_cgs, corr_theta, cosine_theta, KL_theta, corr_phi, cosine_phi, KL_phi = gibbs_vanilla(
            X, p_generate, theta_generate, phi_generate, num_topics, num_docs)
        words_id = np.arange(num_words, dtype=float)
        ymax = max(np.max(phi_generate), np.max(phi_gensim), np.max(phi_cgs))
        fig1 = plt.figure()
        for i in range(num_topics):
            plt.subplot(1, num_topics, i + 1)
            plt.bar(words_id, phi_generate[i, :], label="Generated", color="r")
            #plt.subplot(3, num_topics, num_topics + i + 1)
            plt.bar(words_id, phi_gensim[i, :], label="Gensim", color="g")
import os
import sys

from gensim import corpora
from gensim.corpora import dictionary
from gensim import models

if len(sys.argv) > 1:
    fname_suffix = sys.argv[1]
else:
    fname_suffix = ''

# In[6]:

corpus_fname = 'corpus' + fname_suffix + '.mm'
tfidf_corpus_fname = 'tfidf_corpus' + fname_suffix + '.mm'
my_dict = dictionary.Dictionary.load(os.path.join(settings.PERSIST_DIR, 'my_dict'))
corpus = corpora.MmCorpus(os.path.join(settings.PERSIST_DIR, corpus_fname))

# In[8]:

tfidf = models.TfidfModel(corpus)

# In[10]:

tfidf_corpus = tfidf[corpus]
tfidf.save(os.path.join(settings.PERSIST_DIR, 'tfidf_model' + fname_suffix))

# In[11]:

corpora.MmCorpus.serialize(os.path.join(settings.PERSIST_DIR, tfidf_corpus_fname), tfidf_corpus)
## import some packages
#################################################################################################
print('Loading modules ... ')
from gensim import corpora, models, similarities
import logging
import os.path
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#################################################################################################
## load dictionary, corpus - bag of words representation
#################################################################################################
print('Loading dictionary and corpus ... ')
root = os.path.dirname(os.getcwd()) + '\\OBJ\\LSI\\'
dictionary = corpora.Dictionary.load(root + 'fullDictionary.dict')
corpus = corpora.MmCorpus(root + 'fullCorpus.mm')
print(corpus)

#################################################################################################
## Tfidf model (term frequency, inverse document frequency)
#################################################################################################
print('\nGenerating Tfidf model ... ')
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

#################################################################################################
## LSI model with Nfeat features
#################################################################################################
Nfeat = 100
print('\nGenerating LSI model with {} features ... '.format(Nfeat))
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary,
def classify_stock_news(self,
                        unseen_raw_document,
                        database_name,
                        collection_name,
                        label_name="60DaysLabel",
                        topic_model_type="lda",
                        classifier_model="svm",
                        ori_dict_path=None,
                        bowvec_save_path=None,
                        is_saved_bow_vector=False):
    historical_raw_documents_list = []
    Y = []
    for row in self.database.get_collection(database_name, collection_name).find():
        if label_name in row.keys():
            if row[label_name] != "":
                historical_raw_documents_list.append(row["Article"])
                Y.append(row[label_name])
    logging.info(
        "fetch symbol '{}' historical news with label '{}' from [DB:'{}' - COL:'{}'] ... "
        .format(collection_name, label_name, database_name, collection_name))

    le = preprocessing.LabelEncoder()
    Y = le.fit_transform(Y)
    logging.info("encode historical label list by sklearn preprocessing for training ... ")
    label_name_list = le.classes_  # ['中性' '利好' '利空'] (neutral / bullish / bearish) -> [0, 1, 2]

    # Build a dictionary from the historical news database and compute the bag-of-words
    # vector of each historical news item; if the dictionary built from the historical
    # database already exists, load it into memory and update it with the unseen news tokens.
    if not os.path.exists(ori_dict_path):
        if not os.path.exists(bowvec_save_path):
            _, _, historical_bow_vec = self.create_bag_of_word_representation(
                historical_raw_documents_list,
                new_dict_path=ori_dict_path,
                bow_vector_save_path=bowvec_save_path,
                is_saved_dict=True)
            logging.info(
                "create dictionary of historical news, and serialized in path -> {} ... "
                .format(ori_dict_path))
            logging.info(
                "create bow-vector of historical news, and serialized in path -> {} ... "
                .format(bowvec_save_path))
        else:
            _, _, _ = self.create_bag_of_word_representation(
                historical_raw_documents_list,
                new_dict_path=ori_dict_path,
                is_saved_dict=True)
            logging.info(
                "create dictionary of historical news, and serialized in path -> {} ... "
                .format(ori_dict_path))
    else:
        if not os.path.exists(bowvec_save_path):
            _, _, historical_bow_vec = self.create_bag_of_word_representation(
                historical_raw_documents_list,
                new_dict_path=ori_dict_path,
                bow_vector_save_path=bowvec_save_path,
                is_saved_dict=True)
            logging.info(
                "historical news dictionary existed, which saved in path -> {}, but not the historical bow-vector"
                " ... ".format(ori_dict_path))
        else:
            historical_bow_vec_mmcorpus = corpora.MmCorpus(bowvec_save_path)  # type -> <gensim.corpora.mmcorpus.MmCorpus>
            historical_bow_vec = []
            for _bow in historical_bow_vec_mmcorpus:
                historical_bow_vec.append(_bow)
            logging.info(
                "both historical news dictionary and bow-vector existed, load historical bow-vector to memory ... ")

    start_time = time.time()
    updated_dictionary_with_old_and_unseen_news, unseen_documents_token_list = self.renew_dictionary(
        ori_dict_path, [unseen_raw_document], is_saved=True)
    end_time = time.time()
    logging.info(
        "renew dictionary with unseen news tokens, and serialized in path -> {}, "
        "which took {} mins ... ".format(ori_dict_path, (end_time - start_time) / 60))

    unseen_bow_vector = [
        updated_dictionary_with_old_and_unseen_news.doc2bow(doc_token)
        for doc_token in unseen_documents_token_list
    ]
    updated_bow_vector_with_old_and_unseen_news = []
    updated_bow_vector_with_old_and_unseen_news.extend(historical_bow_vec)
    updated_bow_vector_with_old_and_unseen_news.extend(unseen_bow_vector)
    # updated_bow_vector_with_old_and_unseen_news starts out as a list, but after the
    # serialization below it would be reloaded as gensim.corpora.mmcorpus.MmCorpus
    if is_saved_bow_vector and bowvec_save_path:
        # save the updated BoW vectors, i.e. the BoW vector set covering both old and new news
        corpora.MmCorpus.serialize(bowvec_save_path, updated_bow_vector_with_old_and_unseen_news)
    logging.info(
        "combined bow vector(type -> 'list') generated by historical news with unseen bow "
        "vector to create a new one ... ")

    if topic_model_type == "lsi":
        start_time = time.time()
        updated_tfidf_model_vector = self.transform_vectorized_corpus(
            updated_dictionary_with_old_and_unseen_news,
            updated_bow_vector_with_old_and_unseen_news,
            model_type="tfidf")  # type -> <gensim.interfaces.TransformedCorpus object>
        end_time = time.time()
        logging.info(
            "regenerated TF-IDF model vector by updated dictionary and updated bow-vector, "
            "which took {} mins ... ".format((end_time - start_time) / 60))

        start_time = time.time()
        model = models.LsiModel(
            updated_tfidf_model_vector,
            id2word=updated_dictionary_with_old_and_unseen_news,
            num_topics=config.TOPIC_NUMBER)  # initialize the model
        model_vector = model[updated_tfidf_model_vector]  # type -> <gensim.interfaces.TransformedCorpus object>
        end_time = time.time()
        logging.info(
            "regenerated LSI model vector space by updated TF-IDF model vector space, "
            "which took {} mins ... ".format((end_time - start_time) / 60))
    elif topic_model_type == "lda":
        start_time = time.time()
        model_vector = self.transform_vectorized_corpus(
            updated_dictionary_with_old_and_unseen_news,
            updated_bow_vector_with_old_and_unseen_news,
            model_type="lda")
        end_time = time.time()
        logging.info(
            "regenerated LDA model vector space by updated dictionary and bow-vector, "
            "which took {} mins ... ".format((end_time - start_time) / 60))

    # Convert the LSI/LDA model vectors (gensim.interfaces.TransformedCorpus) to a numpy matrix
    start_time = time.time()
    latest_matrix = corpus2dense(model_vector, num_terms=model_vector.obj.num_terms).T
    end_time = time.time()
    logging.info(
        "transform {} model vector space to numpy.ndarray, "
        "which took {} mins ... ".format(topic_model_type.upper(), (end_time - start_time) / 60))

    # Use the topic-model vectors (features) of the historical data to train the news classifier
    start_time = time.time()
    train_x, train_y, test_x, test_y = utils.generate_training_set(latest_matrix[:-1, :], Y)
    clf = self.classifier.train(train_x, train_y, test_x, test_y, model_type=classifier_model)
    end_time = time.time()
    logging.info(
        "finished training by sklearn {} using latest {} model vector space, which took {} mins ... "
        .format(classifier_model.upper(), topic_model_type.upper(), (end_time - start_time) / 60))

    label_id = clf.predict(latest_matrix[-1, :].reshape(1, -1))[0]
    return label_name_list[label_id]