def build_lsi(docs):
    """Build an LSI model from the documents whose topics need to be extracted."""
    logging.info('There are {} documents'.format(docs.count()))

    # build the dictionary
    logging.info('Building the dictionary...')
    dictionary = Dict.build_dict(docs)
    corpus = [i for i in get_corpus(dictionary)]  # freeze the corpus into a list
    logging.info('corpus size: {}'.format(len(corpus)))
    logging.info('Construction completed.')

    # build the tf-idf model
    logging.info('Building the tf-idf model...')
    tfidf_model = TfidfModel(corpus, normalize=True)
    corpus_tfidf = tfidf_model[corpus]
    logging.info('Construction completed.')

    # build the LSI model
    logging.info('Building the LSI model...')
    lsi_model = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    corpus_lsi = lsi_model[corpus_tfidf]
    logging.info('Construction completed.')

    lsi_model.show_topics()
    return lsi_model
def train_lsa(is_tfidf, num_topics):
    # create the corpus
    print('Creating corpus')
    corpus = doc_processor.create_corpus(dictionary, doc_list, is_tfidf)

    # set training parameters
    chunksize = 20000

    start = time.time()
    temp = dictionary[0]  # access one item so that dictionary.id2token is populated
    id2word = dictionary.id2token

    print('Starting LSI training')
    lsi_model = LsiModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        chunksize=chunksize,
    )
    lsi_model.show_topics()

    ir_method = 'tfidf' if is_tfidf else 'bow'
    lsi_model.save('saved_models/lsi_model_%s_%s' % (ir_method, num_topics))
    print('LSA for %s %s done in %.1f seconds' % (ir_method, num_topics, time.time() - start))
def latent_semantic_indexing(corpus, num_topics, id2word):
    """
    Latent Semantic Indexing.

    Advantage of LSI: it ranks topics by itself and outputs them in ranked order.
    Requires a num_topics parameter (200 by default) that sets the number of
    latent dimensions kept after the SVD.
    """
    print('Latent Semantic Indexing')
    lsi_model = LsiModel(corpus=corpus, num_topics=num_topics, id2word=id2word)
    lsi_model.show_topics(num_topics=num_topics)
    lsi_topic = lsi_model.show_topics(formatted=False)
    return lsi_model
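# Since LSI ranks topics by singular value (as the docstring above notes), the
# ranking can be inspected directly. A minimal sketch, assuming a trained
# lsi_model as returned by latent_semantic_indexing: gensim exposes the
# singular values on lsi_model.projection.s, largest first.
for rank, sigma in enumerate(lsi_model.projection.s):
    print('topic %d: singular value %.4f' % (rank, sigma))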
def topicsLSI(self, num_topics=10, num_words=10, num_iterations=2000, chunksize=20000, decay=0.5, onepass=False):
    # LsiModel(corpus=None, num_topics=200, id2word=None, chunksize=20000, decay=1.0,
    #          distributed=False, onepass=True, power_iters=2, extra_samples=100)
    lsi = LsiModel(corpus=self.corpus, num_topics=num_topics, id2word=self.id2word,
                   chunksize=chunksize, onepass=onepass, power_iters=num_iterations, decay=decay)

    # count documents per topic
    if self.doc2class:
        doc_idx = 0
        for line in lsi[self.corpus]:
            # get the topic with the maximum weight
            if line:
                topic_idx = max(line, key=lambda item: item[1])[0]
            else:
                # if there is no topic, assign a random one
                topic_idx = random.randint(0, num_topics - 1)
            # build the per-class topic histogram
            if self.doc2topicLSI.get(self.doc2class[doc_idx]) is None:
                self.doc2topicLSI[self.doc2class[doc_idx]] = {}
                for i in range(num_topics):
                    self.doc2topicLSI[self.doc2class[doc_idx]][i] = 0
            self.doc2topicLSI[self.doc2class[doc_idx]][topic_idx] += 1
            doc_idx += 1
        print(self.doc2topicLSI)

    # show_topics(num_topics=-1, num_words=10, log=False, formatted=True)
    # Returns the num_topics most significant topics (all by default).
    # For each topic, shows the num_words most significant words (10 by default).
    # Topics are returned as a list: strings if formatted is True, otherwise
    # lists of (weight, word) 2-tuples. If log is True, the result is also logged.
    return lsi.show_topics(num_topics=num_topics, num_words=num_words, formatted=False)
class LSITransformation:
    def __init__(self, input_space_vectors_map):
        self.input_space_vectors = input_space_vectors_map.values()
        self.transform()

    def transform(self):
        self.space = LSISpace(self.input_space_vectors)
        # TODO: handle saner reduction
        self.reduced_space = 15
        input_BOWs = [self.space.doc2bow(vector) for vector in self.input_space_vectors]
        self.lsi_model = LsiModel(corpus=input_BOWs, num_topics=self.reduced_space,
                                  id2word=self.space.id2Word())
        return self.lsi_model

    def dissimilarity_score(self, tokens, other_tokens):
        bows = self.space.doc2bow(tokens)
        other_bows = self.space.doc2bow(other_tokens)
        vector = self.infer_and_vectorize(bows)
        other_vector = self.infer_and_vectorize(other_bows)
        similarity = CosineSimilarity().calculate(vector, other_vector)
        return 1 - similarity

    def infer_and_vectorize(self, bows):
        # densify the sparse LSI vector, filling absent dimensions with 0.0
        transformed_bow = defaultdict(float)
        transformed_bow.update(dict(self.lsi_model[bows]))
        return [transformed_bow[dimension] for dimension in range(self.reduced_space)]

    def print_transformation(self):
        topics = self.lsi_model.show_topics(num_words=self.space.length(), formatted=False)
        for topic in topics:
            print([(round(value, 4), token) for value, token in topic])
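# A minimal usage sketch for the class above (LSISpace and CosineSimilarity are
# project-specific helpers assumed from the original code; the token lists here
# are illustrative):
vectors_by_doc_id = {'d1': ['cat', 'sat', 'mat'], 'd2': ['dog', 'bit', 'mailman']}
transformation = LSITransformation(vectors_by_doc_id)
transformation.print_transformation()
print(transformation.dissimilarity_score(['cat', 'mat'], ['dog', 'mailman']))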
def get_topic(text):
    np.random.seed(100)
    nlp = spacy.load('en')
    my_stop_words = [u'say', u"'s", u'Mr', u'be', u'said', u'says', u'saying', u'get']
    for stopword in my_stop_words:
        lexeme = nlp.vocab[stopword]
        lexeme.is_stop = True
    doc = nlp(text)
    article = []
    texts = []
    for w in doc:
        # if it's not a stop word, punctuation mark or number, add it to the article
        if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num:
            # add the lemmatized version of the word
            article.append(w.lemma_)
    texts.append(article)

    # get bigrams out of the words using gensim
    bigram = gensim.models.Phrases(texts)
    texts = [bigram[line] for line in texts]

    # create the dictionary and corpus
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(i) for i in texts]

    # apply the LSI and LDA models
    lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)
    ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
    lsitopics = [[word for word, prob in topic]
                 for topicid, topic in lsimodel.show_topics(formatted=False)]
    ldatopics = [[word for word, prob in topic]
                 for topicid, topic in ldamodel.show_topics(formatted=False)]

    # keep the top LDA word per topic, then drop verbs, adjectives and adverbs,
    # since topics are generally nouns
    topics = [topic[0] for topic in ldatopics]
    tags = nltk.pos_tag(topics)
    excluded_pos = {'VB', 'VBD', 'VBN', 'VBP', 'VBZ', 'VBG', 'JJ', 'RB'}
    lfinaltopics = [word for word, pos in tags if pos not in excluded_pos]
    ldafinaltopics = list(set(lfinaltopics))

    lstopics = [word for topic in lsitopics for word in topic]
    ltags = nltk.pos_tag(lstopics)
    lsifinaltopics = [word for word, pos in ltags if pos not in excluded_pos]

    # intersect the results from both models
    finaltopics = list(set(ldafinaltopics) & set(lsifinaltopics))
    final_topics = [topic for topic in finaltopics if len(topic) >= 2]
    return final_topics
def lsi(corpus, dictionary):
    lsi_model = LsiModel(corpus, id2word=dictionary, num_topics=100)
    lsi_corpus = [lsi_model[doc] for doc in corpus]
    lsi_similarity_matrix = MatrixSimilarity(lsi_corpus)
    print(lsi_model.show_topics())
    return lsi_similarity_matrix
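# A minimal sketch of querying the returned index (illustrative, not from the
# original; note that a query must pass through the same dictionary and LSI
# model, so in practice lsi() usually needs to return lsi_model as well):
query_bow = dictionary.doc2bow(['some', 'query', 'tokens'])
sims = lsi_similarity_matrix[lsi_model[query_bow]]  # cosine similarity to every document
print(sorted(enumerate(sims), key=lambda x: -x[1])[:5])  # top five matches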
def comparison(texts):
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lsimodel = LsiModel(corpus=corpus, num_topics=2, id2word=dictionary)
    print('LSI model output')
    print(lsimodel.show_topics())

    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    print('HDP model output')
    print(hdpmodel.show_topics())

    ldamodel = LdaModel(corpus=corpus, num_topics=2, id2word=dictionary)
    print('LDA model output')
    print(ldamodel.show_topics())

    pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

    lsitopics = [[word for word, prob in topic]
                 for topicid, topic in lsimodel.show_topics(formatted=False)]
    hdptopics = [[word for word, prob in topic]
                 for topicid, topic in hdpmodel.show_topics(formatted=False)]
    ldatopics = [[word for word, prob in topic]
                 for topicid, topic in ldamodel.show_topics(formatted=False)]

    lsi_coherence = CoherenceModel(topics=lsitopics[:10], texts=texts,
                                   dictionary=dictionary, window_size=10).get_coherence()
    hdp_coherence = CoherenceModel(topics=hdptopics[:10], texts=texts,
                                   dictionary=dictionary, window_size=10).get_coherence()
    lda_coherence = CoherenceModel(topics=ldatopics, texts=texts,
                                   dictionary=dictionary, window_size=10).get_coherence()

    def evaluate_bar_graph(coherences, indices):
        assert len(coherences) == len(indices)
        n = len(coherences)
        x = np.arange(n)
        plt.bar(x, coherences, width=0.2, tick_label=indices, align='center')
        plt.xlabel('Models')
        plt.ylabel('Coherence Value')
        plt.show()

    evaluate_bar_graph([lsi_coherence, hdp_coherence, lda_coherence], ['LSI', 'HDP', 'LDA'])
def topicsLSI(self, num_topics=10, num_words=10):
    # LsiModel(corpus=None, num_topics=200, id2word=None, chunksize=20000, decay=1.0,
    #          distributed=False, onepass=True, power_iters=2, extra_samples=100)
    lsi = LsiModel(corpus=self.corpus, num_topics=num_topics, id2word=self.id2word)
    # show_topics(num_topics=-1, num_words=10, log=False, formatted=True)
    # Returns the num_topics most significant topics (all by default).
    # For each topic, shows the num_words most significant words (10 by default).
    # Topics are returned as a list: strings if formatted is True, otherwise
    # lists of (weight, word) 2-tuples. If log is True, the result is also logged.
    return lsi.show_topics(num_words=num_words, formatted=False)
def create_gensim_lsa_model(doc_clean, number_of_topics, words):
    """
    Input  : cleaned documents, number of topics, and number of words per topic
    Purpose: create an LSA model using gensim
    Output : the trained LSA model
    """
    dictionary, doc_term_matrix = prepare_corpus(doc_clean)
    # generate and train the LSA model
    lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word=dictionary)
    print(lsamodel.show_topics(number_of_topics, words))
    return lsamodel
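# Hypothetical usage of the helper above (prepare_corpus is assumed to build a
# gensim Dictionary and bag-of-words corpus from the cleaned token lists; the
# documents here are illustrative):
docs = [['cat', 'sat', 'mat'], ['dog', 'barked', 'mailman'], ['cat', 'chased', 'dog']]
model = create_gensim_lsa_model(docs, number_of_topics=2, words=3)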
def algorithm_lsi(self, category_id, objs, goldstandards):
    numTopics = self.calculate_k_using_firstnames(objs)
    print("Using k = " + str(numTopics))

    texts = [get_categorizedproduct_content(obj) for obj in objs]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]  # bag of words

    print("Creating models")
    lsi_model = LsiModel(corpus, id2word=dictionary, num_topics=numTopics)
    corpus_lsi = lsi_model[corpus]
    print("Done creating models")

    results = []
    labels = []
    cont = 0
    for probabilities, obj in zip(corpus_lsi, objs):
        # pick the topic with the maximum weight
        if probabilities:
            max_prop = max(probabilities, key=lambda item: item[1])[0]
        else:
            max_prop = "WARNING " + str(texts[cont])
        labels.append(max_prop)
        results.append(str(max_prop) + " # " + obj['name'])
        cont += 1
    results.sort()
    for r in results:
        print(r)

    topic_id = 0
    for topic in lsi_model.show_topics(num_words=5):
        print("TOPIC (LSI) " + str(topic_id) + " : " + str(topic))
        topic_id += 1

    if numTopics > 1:
        self.calculate_metrics(category_id, objs, labels, goldstandards)
    else:
        print("number of clusters equal to or lower than 1, skipping metrics")
def get_best_model(token_list, min_topic_num=3, max_topic_num=14, coherence_metric="c_v", model_type="lsi"):
    model_list = []
    coherence_values = []
    # create the corpus for the model
    corpus, tfidf_vect = create_corpus_and_vectorizer(token_list)
    for topics_num in range(min_topic_num, max_topic_num + 1):
        # create models with an increasing number of topics
        # (`dataset` is assumed to be a module-level gensim Dictionary)
        if model_type == "nmf":
            model = nmf.Nmf(tfidf_vect[corpus], id2word=dataset, num_topics=topics_num)
        else:
            model = LsiModel(tfidf_vect[corpus], id2word=dataset, num_topics=topics_num)
        model_list.append(model)
        topics_model = [[word for word, prob in topic]
                        for topicid, topic in model.show_topics(formatted=False)]
        # create the CoherenceModel and evaluate its score
        coherence_model = CoherenceModel(topics=topics_model, texts=token_list,
                                         dictionary=dataset, coherence=coherence_metric,
                                         window_size=30)
        coherence_values.append(coherence_model.get_coherence())
    try:
        index_value = coherence_values.index(max(coherence_values))
    except ValueError:
        index_value = 0
    best_model = model_list[index_value]
    return best_model, corpus
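# Hypothetical usage of the selector above (token_list is a list of token
# lists; create_corpus_and_vectorizer and the module-level `dataset`
# Dictionary are assumed from the original code and must be set up first):
token_list = [['cat', 'sat', 'mat'], ['dog', 'barked', 'mailman'], ['cat', 'chased', 'dog']]
best_model, corpus = get_best_model(token_list, min_topic_num=2, max_topic_num=4)
print(best_model.show_topics(num_topics=2))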
print ("创建模型") tfidf_model = TfidfModel(corpus)#转换成局部/全局加权TF_IDF矩阵,它可以将一个简单的计数表示成TFIDF空间。 # tfidf = TfidfModel(corpus) # print(tfidf[some_doc])#输出模型 # tfidf.save('/tmp/foo.tfidf_model')#保存模型 lsi_model = LsiModel(corpus) #LSA(latent semantic analysis)潜在语义分析,也被称为LSI(latent semantic index), #是一种新的索引和检索方法。该方法和传统向量空间模型(vector space model)一样使用向量来表示词(terms)和文档(documents), #并通过向量间的关系(如夹角)来判断词及文档间的关系;而不同的是,LSA将词和文档映射到潜在语义空间。 #同义词和多义词如何导致传统向量空间模型检索精确度的下降。 #LSA潜在语义分析的目的,就是要找出词(terms)在文档和查询中真正的含义,也就是潜在语义,从而解决上节所描述的问题。 topic_id = 0 for topic in lsi_model.show_topics(): topic_id+=1 print ("TOPIC (LSI) " + str(topic_id) + " : ", topic) print('#'*50) print(lsi_model.num_topics) for i in range(0, lsi_model.num_topics-1): if lsi_model.print_topic(i): print (lsi_model.print_topic(i)) corpus_tfidf = tfidf_model[corpus] corpus_lsi = lsi_model[corpus] lsi_model_2 = LsiModel(corpus_tfidf, id2word=corpus.dictionary, num_topics=300) corpus_lsi_2 = lsi_model_2[corpus] print ('完成创建模型')
class LSA(object):
    def __init__(self, stopwords, ignorechars):
        self.ignorechars = ignorechars
        self.wdict = {}
        self.dcount = 0

    def createStopwords(self, stopword_path):
        with open(stopword_path, 'r') as file1:
            temp = file1.read()
            self.stopwords = temp.split()

    def parse_dic_bow(self, seg_post):
        self.posts = [post for post in seg_post.values()]
        logger.info("BOW process...")
        logger.debug("original posts:")
        logger.debug(self.posts)
        # join the segmented tokens of each post into one string per post
        self.mergeLineForOnePost = [" ".join(post) for post in self.posts]
        # split back into token lists, e.g. [['human', 'interface'], ['survey', 'user']]
        self.texts = [[word for word in post.split()] for post in self.mergeLineForOnePost]
        self.dictionary = gensim.corpora.Dictionary(self.texts)
        self.postIdList = [str(postId) for postId in seg_post.keys()]
        logger.debug("original dictionary (%d tokens) and post-id list: %s %s",
                     len(self.dictionary), self.dictionary, self.postIdList)

        # preprocessing: remove stopwords (and optionally hapaxes)
        stop_ids = [self.dictionary.token2id[stopword] for stopword in self.stopwords
                    if stopword in self.dictionary.token2id]
        once_ids = [tokenid for tokenid, docfreq in self.dictionary.dfs.items()
                    if docfreq == 1]
        # removing hapaxes can leave TOO FEW words to cluster, so it stays disabled
        # self.dictionary.filter_tokens(once_ids)
        self.dictionary.filter_tokens(stop_ids)
        logger.info("removed stopwords...")
        logger.debug("%s (%d tokens)", self.dictionary, len(self.dictionary))
        self.dictionary.compactify()
        self.new_vec = [self.dictionary.doc2bow(post) for post in self.texts]

    def store(self):
        logger.info("store process starts")
        self.dictionary.save(testDictionary)
        self.dictionary.save_as_text(testDictionaryString)
        corpora.MmCorpus.serialize(testBOWCorpus, self.new_vec)  # store to disk for later use
        # corpus = corpora.MmCorpus(testBOWCorpus)               # load from the store
        # dictionary = corpora.Dictionary.load(testDictionary)   # load from the store

    def TFIDF(self):
        logger.info("TFIDF process starts")
        self.tfidf = TfidfModel(self.new_vec)
        self.corpus_tfidf = self.tfidf[self.new_vec]

    def printInfo(self):
        print('show dictionary:')
        print(self.dictionary)
        print('show BOW:')
        for bow in self.new_vec:
            print(bow)
        print('show corpus_tfidf model:')
        print(self.tfidf)
        print('show corpus_tfidf:')
        for i in self.corpus_tfidf:
            print(i)
        print('show LSA assignment of each post:')
        # both bow->tfidf and tfidf->lsi transformations are executed here, on the fly
        for doc, postId in zip(self.corpus_lsi, self.postIdList):
            print('post: {0}'.format(postId))
            print(doc)
            # 1 means: find the single dimension with the largest absolute weight
            theLarge = nlargest(1, doc, key=lambda e: abs(e[1]))
            if theLarge:
                print("the topic with the largest absolute value:", theLarge[0][0])
            else:
                print("cannot find it!")
print "LSA Topics : " print self.topics print "Break down : " for i in self.topics: print i print type(i) def build(self): ### need to find out a way to pick the proper number of the cluster - may be based on the number of POST self.lsi_model = LsiModel(self.corpus_tfidf, id2word=self.dictionary, num_topics=3) self.corpus_lsi = self.lsi_model[self.corpus_tfidf] ##self.topics = self.lsi_model.print_topics(num_topics=5, num_words=4) #print "topics difference" #print self.lsi_model.print_topic(2, topn=4) self.topics = self.lsi_model.show_topics(num_topics=5, num_words=4, log=False, formatted=False) #print "tuple!@!" #print ss def repaserForOutput(self): ### post_assignment = {post_id:topic} Ex. {"p1":"t1"} ### topic_assignment = {topic_id:[keywords]} Ex. {"t1":["秘密", "飛行器", "新華", "任務"] #print "start to extact info for post_assignment" self.post_assignment = {} self.topic_assignment = {} for doc, postId in zip( self.corpus_lsi, self.postIdList ): #self.postIdList // ['p2', 'p3', 'p1', 'p6', 'p7', 'p4', 'p5', 'p8'] theTopic = nlargest(1, doc, key=lambda e: abs(e[1])) if theTopic: self.post_assignment[postId] = theTopic[0][0] else: self.post_assignment[postId] = "NB" #self.post_assignment[postId] = theTopic[0] self.num = len(self.topics) for topic, num in zip(self.topics, range(self.num)): topicWords = [] for each in topic: #covert from string to unicode topicWords.append(each[1].decode('utf8')) #topicWords.append(each[1]) ## just exact the first topic content, for example, use "秘密" in ["秘密", "飛行器", "新華", "任務"] #self.topic_assignment[str(num)] = topicWords[0] self.topic_assignment[str(num)] = topicWords #matchObj = re.match( r'(.*) are(\.*)', line) #rerurn(self.post_assignment,self.topic_assignment) return (self.post_assignment, self.topic_assignment) def create_result(self, seg_post): logger.info('LSA main process starts.....') self.createStopwords(stopword_path) self.parse_dic_bow(seg_post) self.TFIDF() self.build() self.store() def get_result(self): self.printInfo() return (self.repaserForOutput())
print(vector)

topics = 200
num_clusters = 4

print("Creating models")
lsi_model = LsiModel(corpus, id2word=corpus.dictionary, num_topics=topics)
corpus_lsi = lsi_model[corpus]
print("Done creating models")

topic_id = 0
for topic in lsi_model.show_topics(num_words=5):
    print("TOPIC (LSI) " + str(topic_id) + " : " + str(topic))
    topic_id += 1

# for doc in corpus_lsi:  # the bow->lsi transformation is executed here, on the fly
#     print("Doc " + str(doc))

# densify for scikit-learn: corpus2dense returns a (num_terms, num_docs) matrix
corpus_lsi_dense = corpus2dense(corpus_lsi, topics)
print("Dense matrix shape " + str(corpus_lsi_dense.shape))

# scikit-learn integration: cluster the documents with k-means
# (transpose so that rows are documents, as KMeans expects samples in rows)
km = KMeans(num_clusters, init='random', max_iter=100, n_init=1, verbose=1)
km.fit(corpus_lsi_dense.T)
# In[29]:

lsi2 = LsiModel(bows, num_topics=2, id2word=vocab, extra_samples=100, power_iters=2)
lsi2

# In[30]:

lsi.save(os.path.join(DATA_PATH, 'lsi100'))
lsi2.save(os.path.join(DATA_PATH, 'lsi2'))

# In[16]:

lsi2.show_topics()

# In[23]:

# for topic in lsi.show_topics():
#     print(topic)
lsi.show_topic(0, 100)

# ## Hold onto your hat
# This will take a lot of RAM!
# (and CPU)

# In[31]:
def get_topics(model, num_topics):
    word_dict = {}
    for i in range(num_topics):
        words = model.show_topic(i, topn=20)
        word_dict['Topic # ' + '{:02d}'.format(i + 1)] = [w[0] for w in words]
    return pd.DataFrame(word_dict)

get_topics(lda, num_topics)

pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda, corpus, id2word)

lsitopics = [[word for word, prob in topic]
             for topicid, topic in lsimodel.show_topics(formatted=False)]
ldatopics = [[word for word, prob in topic]
             for topicid, topic in lda.show_topics(formatted=False)]

lsi_coherence = CoherenceModel(model=lsimodel, topics=lsitopics, dictionary=id2word,
                               texts=train_headlines, window_size=10).get_coherence()
lda_coherence = CoherenceModel(model=lda, topics=ldatopics, dictionary=id2word,
                               texts=train_headlines, window_size=10).get_coherence()
# lda_coherence = CoherenceModel(model=lsimodel, corpus=corpus, coherence='u_mass').get_coherence()

def evaluate_bar_graph(coherences, indices):
    """
    Plot a bar graph of model coherences.

    coherences: list of coherence values
    indices: labels for the bars; must be the same length as coherences
    """
    assert len(coherences) == len(indices)
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# We're now done with a very important part of any text analysis: the data
# cleaning and the setting up of the corpus. Keep in mind that we created the
# corpus the way we did because that's how gensim requires it - most algorithms
# still need the data set cleaned this way: stop words and numbers removed, the
# lemmatized form of each word added, and bigrams detected.

# ### LSI
#
# LSI stands for Latent Semantic Indexing - a popular information retrieval
# method that works by decomposing the original matrix of words to keep the
# key topics. Gensim's implementation uses an SVD.

# In[11]:

lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)

# In[12]:

lsimodel.show_topics(num_topics=5)  # showing only the top 5 topics

# ### HDP
#
# HDP, the Hierarchical Dirichlet Process, is an unsupervised topic model that
# figures out the number of topics on its own.

# In[13]:

hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

# In[14]:

hdpmodel.show_topics()

# ### LDA
#
# for i in model.show_topics():
#     print(i)
from gensim.models import LsiModel
from gensim import corpora, models
import jieba

file_dir = "../corpora/test1"
documents = []
with open(file_dir, "r", encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        seg_list = jieba.cut(line, cut_all=False)
        sentence = [word for word in seg_list]
        documents.append(sentence)

Dict = corpora.Dictionary(documents)
corpus = [Dict.doc2bow(doc) for doc in documents]
tf_idf = models.TfidfModel(corpus)
lsimodel = LsiModel(corpus=tf_idf[corpus], id2word=Dict, num_topics=4)
# for i in lsimodel[tf_idf[corpus]]:
#     print(i)
for i in lsimodel.show_topics():
    print(i)

# add documents to update the model online
lsimodel.add_documents([[(1, 2), (2, 1)]])
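# A minimal sketch of what add_documents buys (illustrative, not from the
# original): the updated model keeps its topic space, so fresh text can be
# projected without retraining from scratch.
new_doc = [word for word in jieba.cut("新的测试文档", cut_all=False)]
new_bow = Dict.doc2bow(new_doc)   # unseen words are simply dropped
print(lsimodel[tf_idf[new_bow]])  # (topic_id, weight) coordinates of the new document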
print(len(texts_list))
# print("TDF Vectorizer, matrix and texts loaded from file")

######

bigram = gensim.models.Phrases(texts_list)  # for bigram collocation detection
stops = set(stopwords.words('english'))  # nltk stopwords list
texts_list = process_texts(texts_list)
dictionary = Dictionary(texts_list)
corpus = [dictionary.doc2bow(text) for text in texts_list if text != [] and text != [[]]]
print(len(corpus))

### LSI
lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)
print(lsimodel.show_topics(num_topics=5))  # showing only the top 5 topics
lsitopics = lsimodel.show_topics(formatted=False)

### HDP
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
print(hdpmodel.show_topics())
hdptopics = hdpmodel.show_topics(formatted=False)

### LDA
ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
# In[ ]:

texts = [bigram[line] for line in texts]

# In[ ]:

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# In[ ]:

lsimodel = LsiModel(corpus=corpus, num_topics=5, id2word=dictionary)

# In[ ]:

lsimodel.show_topics(num_topics=5)  # showing only the top 5 topics

# In[ ]:

hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

# In[ ]:

hdpmodel.show_topics()

# In[ ]:

ldamodel = LdaModel(corpus=corpus, num_topics=5, id2word=dictionary)

# In[ ]:
# In[100]:

# create the dictionary and corpus
dictionary = Dictionary(cleaned_tweets)
corpus = [dictionary.doc2bow(clean_tweet) for clean_tweet in cleaned_tweets]

# In[101]:

# LSI model: essentially SVD / principal component analysis of the term-document matrix
lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)

# In[102]:

lsimodel.show_topics(num_topics=5)

# In[103]:

# HDP: Hierarchical Dirichlet Process
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
hdpmodel.show_topics()

# In[123]:

# LDA
ldamodel = LdaModel(corpus=corpus, num_topics=20, id2word=dictionary)
ldamodel.show_topics()

# In[105]:
results = hdp.retrieveText(pn)

bigram = gensim.models.Phrases(results)
# train_texts = process_texts(train_texts)
train_texts = process_texts(results)
preProcsText(results)

dictionary = Dictionary(train_texts)
corpus = [dictionary.doc2bow(text) for text in train_texts]

for i in range(10, 100, 10):
    lsimodel = LsiModel(corpus=corpus, num_topics=i, id2word=dictionary)
    lsitopics = lsimodel.show_topics(num_topics=i)
    result_dict = addTotalTermResults(lsitopics)
    addToResults(result_dict)
    printResults(i, 'lsi')
    del listResults[:]

    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    hdpmodel.show_topics()
    hdptopics = hdpmodel.show_topics(num_topics=i)
    result_dict = addTotalTermResults(hdptopics)
    # add results to the total kept in a list
def topicmodiling():
    # flatten each dataframe's text into one document, newline-separated
    l = []
    text = ''
    for i in range(len(dfs)):
        for j in dfs[i]:
            if j == '\n':
                j = ' '
            text = text + j
        l.append(text)
        text = ''
    text = ''
    for i in l:
        text = text + i + "\n"

    nlp = English()
    doc = nlp(text)
    texts, article = [], []
    for w in doc:
        # if it's not a stop word, punctuation mark, number or email, add it to the article
        if w.is_stop == False and w.is_punct == False and w.like_num == False and w.like_email == False:
            # add the lemmatized version of the word
            article.append(w.lemma_)
        # a newline means we're onto the next document
        if w.text == '\n':
            texts.append(article)
            article = []

    bigram = gensim.models.Phrases(texts)
    texts = [bigram[line] for line in texts]
    for i in texts:
        for j in i:
            if j == '\n':
                i.remove(j)

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lsimodel = LsiModel(corpus=corpus, num_topics=5, id2word=dictionary)
    a = lsimodel.show_topics(num_topics=5)  # showing only the top 5 topics

    # split each "weight*word + ..." topic string into (weight, word) pairs,
    # keeping the first five terms of the first four topics
    b = [a[i][1].split('+') for i in range(len(a))]
    k = [b[i][0:5] for i in range(len(b))]
    tops = [[k[t][i].split('*') for i in range(5)] for t in range(4)]

    df1 = DataFrame(tops[0], columns=['Topic 1 weight', 'Topic 1 words'])
    df2 = DataFrame(tops[1], columns=['Topic 2 weight', 'Topic 2 words'])
    df3 = DataFrame(tops[2], columns=['Topic 3 weight', 'Topic 3 words'])
    df4 = DataFrame(tops[3], columns=['Topic 4 weight', 'Topic 4 words'])
    result = pd.concat([df1, df2, df3, df4], axis=1)
    for col in result.columns:
        result[col] = result[col].str.replace('"', '')
        result[col] = result[col].str.replace('-', '')
    return result
def main():
    # --- arguments ---
    (dataset, version, _, _, nbs_topics, _, _,
     cache_in_memory, use_callbacks, tfidf, args) = parse_args()

    model_class = 'LSImodel'
    _split_ = "_split" if use_callbacks else ""
    data_name = f'{dataset}_{version}_{tfidf}'
    data_dir = join(LDA_PATH, version, tfidf)

    # --- logging ---
    logger = init_logging(name=data_name, basic=False, to_stdout=True, to_file=True)
    logg = logger.info
    log_args(logger, args)

    # --- load dict ---
    logg('Loading dictionary')
    data_file = join(data_dir, f'{data_name}.dict')
    dictionary = Dictionary.load(data_file)

    # --- load corpus ---
    logg('Loading corpus')
    data_file = join(data_dir, f'{data_name}.mm')
    corpus = MmCorpus(data_file)
    if cache_in_memory:
        logg('Reading corpus into RAM')
        corpus = list(corpus)
    if use_callbacks:
        train, test = split_corpus(corpus)
    else:
        train, test = corpus, []
    logg(f'size of... train_set={len(train)}, test_set={len(test)}')

    # --- train ---
    topn = 20
    columns = [f'term{x}' for x in range(topn)] + [f'weight{x}' for x in range(topn)]
    for nbtopics in nbs_topics:
        gc.collect()
        logg(f'Running {model_class} with {nbtopics} topics')
        model = LsiModel(corpus=train, num_topics=nbtopics, id2word=dictionary)

        model_dir = join(LSI_PATH, version, tfidf, f'{_split_}')
        model_path = join(model_dir, f'{dataset}_{model_class}{_split_}_{nbtopics}')
        if not exists(model_dir):
            makedirs(model_dir)

        # --- save topics ---
        topics = model.show_topics(num_words=topn, formatted=False)
        topics = [list(chain(*zip(*topic[1]))) for topic in topics]
        topics = pd.DataFrame(topics, columns=columns)
        logg(f'Saving topics to {model_path}.csv')
        topics.to_csv(f'{model_path}.csv')

        # --- save model ---
        logg(f'Saving model to {model_path}')
        model.save(model_path)

    # --- done ---
    logg(f'\n'
         f'----- end -----\n'
         f'----- {dataset.upper()} -----\n'
         f'{"#" * 50}\n')
# combine bigrams and add them to the corpus
bigram = gensim.models.Phrases(texts)
texts = [bigram[line] for line in texts]

dictionary = Dictionary(texts)
# each document becomes a list of (word id, count) pairs
corpus = [dictionary.doc2bow(text) for text in texts]

# latent semantic indexing, a popular information retrieval method, which works
# by decomposing the original matrix of words to maintain key topics;
# gensim's implementation uses an SVD
lsi_model = LsiModel(corpus=corpus, num_topics=2, id2word=dictionary)
lsi_topics = lsi_model.show_topics(num_topics=5)
print(lsi_topics)

# the hierarchical Dirichlet process is an unsupervised topic model which
# determines the number of topics on its own
hdp_model = HdpModel(corpus=corpus, id2word=dictionary)
hdp_topics = hdp_model.show_topics()
print(hdp_topics)

# latent Dirichlet allocation
lda_model = LdaModel(corpus=corpus, num_topics=2, id2word=dictionary)
lda_topics = lda_model.show_topics()
print(lda_topics)

lsi_topics_clean = [[ word for word, prob in topic
# HDP (Hierarchical Dirichlet Process): an unsupervised method that determines the number of topics itself
print('HDP Hierarchical Dirichlet Process')
hdp_model = HdpModel(corpus=corpus, id2word=dictionary)
with open('./tm_results.txt', 'w') as f:
    f.write('Without Spelling Correction\nHDP\n')
    for topic in hdp_model.show_topics(formatted=True):
        f.write('{}\t{}\n'.format(topic[0], topic[1]))
num_topics = len(hdp_model.show_topics())

# LSI (Latent Semantic Indexing)
print('LSI Latent Semantic Indexing')
lsi_model = LsiModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
with open('./tm_results.txt', 'a') as f:
    f.write('LSI\n')
    for topic in lsi_model.show_topics(formatted=True):
        f.write('{}\t{}\n'.format(topic[0], topic[1]))

# LDA (Latent Dirichlet Allocation)
print('LDA Latent Dirichlet Allocation')
lda_model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
with open('./tm_results.txt', 'a') as f:
    f.write('LDA\n')
    for topic in lda_model.show_topics(formatted=True):
        f.write('{}\t{}\n'.format(topic[0], topic[1]))

# HDP again, now on the spelling-corrected corpus
print('HDP Hierarchical Dirichlet Process')
hdp_model = HdpModel(corpus=corpus_spell, id2word=dictionary_spell)
with open('./tm_results.txt', 'a') as f:
    f.write('\nWith Spelling Correction\nHDP\n')
train_corpus = norm_corpus.apply(remove_stopwords)

# Bigrams would be necessary for joining words like new_york so they don't skew the model
# bigram = gensim.models.Phrases(train_corpus)
# train_corpus = [bigram[line] for line in train_corpus]

dictionary = Dictionary(train_corpus)
final_corpus = [dictionary.doc2bow(text) for text in train_corpus]

# unsupervised learning approach to estimate the number of topics in this dataset
hdpmodel = HdpModel(corpus=final_corpus, id2word=dictionary)
print(hdpmodel.show_topics())

# latent semantic indexing, a popular information retrieval method which works by
# decomposing the original matrix of words to maintain key topics
lsimodel = LsiModel(corpus=final_corpus, num_topics=10, id2word=dictionary)
print(lsimodel.show_topics())

# latent Dirichlet allocation, the best-known topic modelling algorithm
ldamodel = LdaModel(corpus=final_corpus, num_topics=10, chunksize=100,
                    update_every=1, id2word=dictionary, minimum_probability=0)
print(ldamodel.show_topics())

# topic coherence to identify which model is doing better
lsitopics = [[word for word, prob in topic]
             for topicid, topic in lsimodel.show_topics(formatted=False)]
hdptopics = [[word for word, prob in topic]
             for topicid, topic in hdpmodel.show_topics(formatted=False)]
########################################

print(corpus)
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

########################################
## Applying LSI
########################################

lsi = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=400,
               decay=1, onepass=False, extra_samples=20)
corpus_lsi = lsi[corpus_tfidf]
print(lsi.show_topics(num_topics=10))
lsitopics = lsi.show_topics(formatted=False)

########################################
## Applying LDA
########################################

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=400,
               update_every=1, chunksize=100, passes=1)
print(lda.show_topics(num_topics=10))
ldatopics = lda.show_topics(formatted=False)

########################################
config_file = "/home/rohola/Codes/Python/topic_modeling_visualization-master/configs/lsi_config.json"
config = LSIConfig.from_json_file(config_file)

corpus_manager = CorpusManager()
corpus, dictionary = corpus_manager.read_corpus(config.dataset_dir)

tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# initialize an LSI transformation
lsi = LsiModel(
    corpus_tfidf,
    id2word=dictionary,
    num_topics=config.num_topics,
    power_iters=config.power_iters)

topic_words = lsi.show_topics(config.num_topics_to_show,
                              num_words=config.num_words, formatted=False)
topic_words = [words for (topic_id, words) in topic_words]

if config.dimension == 2:
    visualize_method = 'plotly'
elif config.dimension == 3:
    visualize_method = 'plotly3d'
else:
    raise ValueError("Wrong dimension, can accept only 2 or 3")

topic_modeling_semantic_network.visualize_semantic_netwrok(
    config, topic_words, visualize_method=visualize_method)
texts = [bigram[line] for line in texts]
texts[10]

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
corpus[100]

lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)
lsimodel.show_topics(num_topics=5)  # showing only the top 5 topics

hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
hdpmodel.show_topics()

ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
ldamodel.show_topics()

pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

lsitopics = [[word for word, prob in topic]
             for topicid, topic in lsimodel.show_topics(formatted=False)]