def select_lda(self, journal, year, num_topics):
    self.build_corpus(journal, year)
    # output
    lda = LdaModel(corpus=self.corpus_a, id2word=self.dictionary, num_topics=num_topics,
                   passes=2, update_every=0, alpha='auto', iterations=500)
    output_path = self.abspath + '/data/lda_topic/' + journal + '/'
    output_filename = year + '.txt'
    with open(output_path + output_filename, 'w', newline='', encoding='UTF-8') as f:
        for i in range(num_topics):
            topic = lda.show_topic(i, topn=30)
            input_str = ','.join(word + ':' + str(prob) for word, prob in topic)
            f.write(input_str + '\n')
    self.select_over_msg.emit(journal, year, num_topics)
def Lda_topic_model(docs, dictionary, nb_topics, true_labels):
    k = 5
    lda = LdaModel(docs, num_topics=k, id2word=dictionary, passes=10)

    top_words = [[word[::-1] for word, _ in lda.show_topic(topic_id, topn=50)]
                 for topic_id in range(lda.num_topics)]
    top_betas = [[beta for _, beta in lda.show_topic(topic_id, topn=50)]
                 for topic_id in range(lda.num_topics)]

    nb_words = 12
    f, ax = plt.subplots(3, 2, figsize=(20, 15))
    for i in range(nb_topics):
        # ax = plt.subplot(gs[i])
        m, n = np.unravel_index(i, shape=(3, 2))
        ax[m, n].barh(range(nb_words), top_betas[i][:nb_words],
                      align='center', color='green', ecolor='black')
        ax[m, n].invert_yaxis()
        ax[m, n].set_yticks(range(nb_words))
        ax[m, n].set_yticklabels(top_words[i][:nb_words])
        ax[m, n].set_title("Topic " + str(i))
    plt.show()

    # get distribution of docs on topics.
    dist_on_topics = lda.get_document_topics(docs)
    topic_predict = []
    for d in dist_on_topics:
        p = 0
        win_topic = 0
        print(d)
        for i, t in enumerate(d):
            if t[1] > p:
                p = t[1]
                win_topic = t[0]
        print(win_topic)
        topic_predict.append(win_topic)

    mat = confusion_matrix(true_labels, topic_predict)
    print(mat)
    cluster_to_class = {}
    for i in range(5):
        cluster_to_class[i] = np.argmax(mat[:, i])
    custom_labels = [cluster_to_class[c] for c in topic_predict]
    print("accuracy:", accuracy_score(true_labels, custom_labels))
    print("f1_score micro: ", f1_score(true_labels, custom_labels, average='micro'))
    print("f1_score: macro", f1_score(true_labels, custom_labels, average='macro'))
    print("NMI", NMI(true_labels, custom_labels))
def exercise4(filename):
    """ Topic Modelling """
    articles = []
    stopWords = set(stopwords.words('english'))
    stopWords = stopWords | {'</H1>', 'The', 'In', 'For', 'was', 'be', 'will', '<H1>'}
    text = open(filename, 'r').read().split()
    index_start = list(np.where(np.array(text) == "<DOC")[0])
    for i in range(len(index_start) - 1):
        start_art = index_start[i] + 2
        end_art = index_start[i + 1]
        article = text[start_art:end_art]
        article = [word for word in article if word not in stopWords]
        articles.append(article)

    common_dictionary = corpora.Dictionary(articles)
    common_corpus = [common_dictionary.doc2bow(a) for a in articles]  # each doc to BOW

    n_topics = 2
    lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=n_topics, passes=200)
    for k in range(n_topics):
        top_words = lda.show_topic(k, topn=5)
        print("Top words in topic {}: {}\n".format(k + 1, top_words))
class CorpusLdaModelWrapper:
    def __init__(self, corpus, dictionary, doc_labels, preprocessing_pipeline, numtopics):
        self.corpus = corpus
        self.dictionary = dictionary
        self.doc_labels = doc_labels
        self.pipeline = preprocessing_pipeline
        self.numtopics = numtopics
        self.trained = False

    def train(self):
        # training
        self.model = LdaModel(self.corpus, id2word=self.dictionary, num_topics=self.numtopics)
        self.index = MatrixSimilarity(self.model[self.corpus])

        # flag
        self.trained = True

    def convertTextToReducedVector(self, text):
        if not self.trained:
            raise exceptions.ModelNotTrainedException()
        tokens = word_tokenize(prep.preprocess_text(text, self.pipeline))
        tokens = [token for token in tokens if token in self.dictionary.token2id]
        bow = self.dictionary.doc2bow(tokens)
        return self.model[bow]

    def queryDoc(self, text):
        reducedVec = self.convertTextToReducedVector(text)
        sims = self.index[reducedVec]
        simtuples = zip(range(len(sims)), sims) if self.doc_labels is None else zip(self.doc_labels, sims)
        simtuples = sorted(simtuples, key=lambda item: item[1], reverse=True)
        return simtuples

    def show_topic(self, id):
        return self.model.show_topic(id)
def get(self, s, e):
    # Load the raw data and extract the requested episode
    dataObject = getDatas()
    data = dataObject.get()
    dataEpisode = dataObject.getDataEpisode(data, s, e)

    # Preprocess the words of the episode
    tokenEpisode = []
    tokenEpisode.append([token for token in self.preprocessEpisode(dataEpisode)])
    dictionnaryEpisode = Dictionary(tokenEpisode)

    # Create the model corpus
    model_corpus = []
    for episode in tokenEpisode:
        model_corpus.append(dictionnaryEpisode.doc2bow(episode))

    # Build the list of topics with the LDA model
    topicsList = []
    string = "Voici les sujets recurrents pour l'episode " + e + " de la saison " + s
    topicsList.append(string)
    lda_model = LdaModel(
        corpus=model_corpus, id2word=dictionnaryEpisode, num_topics=3
    )  # We choose to keep only the 3 most significant topics
    for topic_id, topic_keywords in lda_model.show_topics(formatted=False):
        string = "=== Pour le sujet au mot cle principal '" + str(
            lda_model.show_topic(topic_id, topn=1)[0][0]) + "', les mots clefs representatifs sont ==="
        topicsList.append(string)
        # Browse the keywords of each topic
        for keyword in topic_keywords:
            string = "-> " + str(keyword[0]) + " (" + str(keyword[1]) + ")"
            topicsList.append(string)

    # Return the list of topics
    return topicsList
def __theme_re_weight(self, tokens):
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(text) for text in tokens]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=20)
    topic = []
    topic.append(lda.show_topic(topicid=0, topn=8))
    topic.append(lda.show_topic(topicid=1, topn=8))
    return topic
def format_term_search_results(model: LdaModel, search_results: dict):
    temp_list = []
    for key, value in search_results.items():
        sorted_value = sorted(value, key=lambda x: x[1], reverse=True)
        for i in sorted_value:
            topic_id, topic_prob = i
            wp = model.show_topic(topic_id)
            topic_keywords = ", ".join([word for word, prop in wp])
            temp_list.append([key, topic_id, topic_prob, topic_keywords])
    return pd.DataFrame(
        temp_list,
        columns=['Search_Term', 'Topic_ID', 'Topic_Prob', 'Topic_Keywords'])
def train_lda(self, cache_path):
    print(cache_path)
    trainBatchIter = BatchIterBert(self.trainDataIter, filling_last_batch=False,
                                   postProcessor=batchPostProcessor, batch_size=1)
    bow_list = []
    for item in trainBatchIter:
        bow = item[1].squeeze().detach().numpy().tolist()
        bow_list.append(self.bow_2_gensim(bow))
    print(len(bow_list))
    #print(self.dictProcess.common_dictionary.id2token)
    lda = LdaModel(np.array(bow_list), num_topics=50, passes=200,
                   chunksize=len(bow_list), id2word=self.dictProcess.common_dictionary)
    #print(lda.show_topic(1, topn=10))
    output_topic_line = ''
    for topic_id in range(50):
        current_topic_list = []
        current_topic = lda.show_topic(topic_id, topn=10)
        for topic_tuple in current_topic:
            current_topic_list.append(topic_tuple[0])
        output_topic_line += ' '.join(current_topic_list) + '\n'
        #print(current_topic_list)
    topic_file = os.path.join(cache_path, 'ldatopic.txt')
    with open(topic_file, 'w') as fo:
        fo.write(output_topic_line)

    testBatchIter = BatchIterBert(self.testDataIter, filling_last_batch=False,
                                  postProcessor=batchPostProcessor, batch_size=1)
    test_bow_list = []
    word_count = 0
    for item in testBatchIter:
        bow = item[1].squeeze().detach().numpy().tolist()
        word_count += sum(bow)
        test_bow_list.append(self.bow_2_gensim(bow))
    print(word_count)
    ppl = lda.log_perplexity(test_bow_list, len(test_bow_list))
    print(ppl)
    bound = lda.bound(test_bow_list)
    print(bound / word_count)
    print(np.exp2(-bound / word_count))
def topicModeling(corpus, dictionary, texts):
    ldamodel = LdaModel(corpus=corpus, num_topics=3, id2word=dictionary, passes=5)
    x = ldamodel.show_topics()  # show generated topics
    # ----------------------------------------------------------
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series(
                    [int(topic_num), round(prop_topic, 4), topic_keywords]),
                    ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)

    # -------Generate Visualization------------------------------
    pyLDAvis.enable_notebook()
    topicModel = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    pyLDAvis.save_html(
        topicModel,
        '/Users/[email protected]/Documents/projects/PEM/elon.html')
    pyLDAvis.show(topicModel)
    return x, sent_topics_df
def ldavis(importfile, num_topic, outputfile):
    review_df = pd.read_csv(importfile, encoding='cp949')
    review_df['review_txt'] = review_df['review_txt'].str.replace("\n", "")
    review_df['review_txt'] = review_df['review_txt'].str.replace(r'[0-9]', "")
    review_df['review_txt'] = review_df['review_txt'].str.replace(r'(\.)', "")
    review_df['review_txt'] = review_df['review_txt'].str.replace(r"[ㄱ-ㅎㅏ-ㅣ]+", "")
    review_df['review_txt'] = review_df['review_txt'].str.replace(r"[-=.#/★^&*)~?(:$}]", "")
    review_df['review_txt'] = review_df['review_txt'].str.replace(r"[잼]", "재미")
    review_df['review_txt'] = review_df['review_txt'].str.replace("겜", "게임")
    review_df['review_txt'] = review_df['review_txt'].str.replace("게임", "")
    review_df['review_txt'] = review_df['review_txt'].str.replace("너무", "")
    review_df['review_txt'] = review_df['review_txt'].str.replace("진짜", "")
    review_df['review_txt'] = review_df['review_txt'].str.replace("정말", "")
    review_df['review_txt'] = review_df['review_txt'].str.replace(
        r"[" + str(importfile[:3]) + "]+", "")

    # For each review, keep only the nouns (list of lists of str, split on whitespace)
    okt = Okt()
    texts = []
    for i in range(review_df.shape[0]):
        review_noun = [
            noun_ for noun_ in okt.nouns(review_df.iloc[i, 1]) if len(noun_) > 1
        ]
        texts.append(review_noun)

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    NUM_TOPICS = int(num_topic)  # This is an assumption.
    ldamodel = LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary,
                        passes=20)  # This might take some time.

    word_dict = {}
    for i in range(NUM_TOPICS):
        words = ldamodel.show_topic(i, topn=20)
        word_dict['Topic # ' + '{:02d}'.format(i + 1)] = [w[0] for w in words]
    topic_df = pd.DataFrame(word_dict)
    topic_df.to_csv(outputfile + ".csv", index=False)

    prepared_data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    print("LDA topic modeling ...")
    pyLDAvis.save_html(prepared_data, outputfile + ".html")
def topicsLDA(self, num_topics=10, num_iterations=10000, num_words=10):
    # LdaModel(corpus=None, num_topics=100, id2word=None, distributed=False, chunksize=2000,
    #          passes=1, update_every=1, alpha='symmetric', eta=None, decay=0.5, offset=1.0,
    #          eval_every=10, iterations=50, gamma_threshold=0.001)
    try:
        lda = LdaModel(corpus=self.corpus, num_topics=num_topics,
                       id2word=self.id2word, iterations=num_iterations)
        result = {}
        tpd = lda[self.corpus]  # topic probability distribution
        for topics in tpd:
            for elem in topics:
                if result.get(elem[0], -1) == -1:
                    words = lda.show_topic(elem[0], topn=num_words)
                    result[elem[0]] = {'weight': elem[1], 'words': words}
                else:
                    result[elem[0]]['weight'] += elem[1]
        return result
    except Exception as e:
        print(e)
        return None
class TopicModel(object):
    def __init__(self, documents, cut=True, num_topics=10, min_length=1):
        from cla.util.util import CutDocument
        from gensim.corpora import Dictionary
        from gensim.models import LdaModel

        self.document = CutDocument(documents, cut, cleanup=True, min_length=min_length)
        self.dictionary = Dictionary(self.document)
        self.model = LdaModel(BowCorpus(self.document, self.dictionary),
                              id2word=self.dictionary, num_topics=num_topics)

    def topic_words(self, topic_id, limit=10):
        return self.model.show_topic(topicid=topic_id, topn=limit)

    def identify_topic(self, words):
        return self.model.get_document_topics(self.dictionary.doc2bow(words))
def topic_modelling(files=['114.txt', '100.txt', '465.txt', '059.txt']):
    """ perform topic modelling for a given list of files """
    ntopics = 2
    articles = []
    stop_words = set(stopwords.words('english')) | {'Mr', 'The', '-', 'said'}
    for f in files:
        fp = path.join(data_dir, f)
        with open(fp) as f:
            text = f.read().split()  # word_tokenize(
            articles.append([word for word in text if word not in stop_words])

    dictionary = corpora.Dictionary(articles)
    corpus = [dictionary.doc2bow(a) for a in articles]  # doc to BOW
    lda = LdaModel(corpus, id2word=dictionary, num_topics=ntopics, passes=500)
    for i in range(ntopics):
        topwords = lda.show_topic(i, topn=5)
        print("Top words in topic {}: {}\n".format(i + 1, topwords))
def predict_and_format_topics(ldamodel: LdaModel, corpus, texts, doc_id: list = None, n_topics=5):
    """Predict top n topics of corpus and format results in a pandas DataFrame

    DataFrame has the following columns: 'Document_No', and 'Topic_Id', 'Topic_Prob'
    and 'Topic Keywords' for each of the n topics.

    TODO: Refactor code to optimize prediction speed
    """
    df = pd.DataFrame()
    # Get main topic in each document
    for row in ldamodel[corpus]:
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the top n topics and topic probability for each document
        temp_list = []
        for topic_num, prob_topic in row[:n_topics]:
            wp = ldamodel.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            temp_list = temp_list + [int(topic_num), round(prob_topic, 4), topic_keywords]
        df = df.append(pd.Series(temp_list), ignore_index=True)

    # Add original text to the end of the output
    # Code commented out for backward compatibility.
    # Uncomment below line to enable concat of original text
    # df = pd.concat([df, pd.Series(texts)], axis=1)

    if doc_id:
        df.insert(0, 'Document_No', doc_id)
    else:
        df.reset_index(inplace=True)
    df.columns = ['Document_No'] + np.array(
        [(f'Dominant_Topic_{i+1}', f'Topic_Prob_{i+1}', 'Topic Keywords')
         for i in range(n_topics)]).flatten().tolist()
    return df
    for seg in seg_list:
        seg = ''.join(seg.split())
        if len(seg) > 1 and seg not in skiplist and seg not in stopwords:
            result.append(seg)
    train.append(result)

print('Starting gensim module')
dictionary = Dictionary(train)
corpus = [dictionary.doc2bow(text) for text in train]
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lda_model = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=50)
corpus_lda = lda_model[corpus_tfidf]

topic_list = []
for i in range(lda_model.num_topics):
    topic_list.append(lda_model.show_topic(i))

word_index = {}
for i in range(len(pagelist)):
    theme, dist = sorted(corpus_lda[i], key=lambda x: x[1], reverse=True)[0]
    # print(lda_model.print_topic(theme))
    weight = pages[pagelist[i]] * 1e5
    for topic_word, likelihood in topic_list[theme]:
        if topic_word in word_stats.keys():
            word_stats[topic_word] += likelihood * weight
            word_index[topic_word].append((dist * likelihood, pagelist[i]))
        else:
            word_stats[topic_word] = likelihood * weight
            word_index[topic_word] = [(dist * likelihood, pagelist[i])]

cloud = WordCloud(
    font_path='simhei.ttf',
def upload_file(): """ Upload csv files and create: * ~/out/corpus.dict * ~/out/corpus.lda * ~/out/corpus.lda.state * ~/out/corpus.mm * ~/out/corpus.mm.index * ~/out/corpus_doclabels.txt * ~/out/corpus_topics.txt * ~/mycorpus.txt As well as (for example): * ~/swcorp/Doyle_AStudyinScarlet.txt * ~/swcorp/Lovecraft_AttheMountainofMadness.txt * etc. """ # INPUT # columns to read from csv file columns = ['ParagraphId', 'TokenId', 'Lemma', 'CPOS', 'NamedEntity'] # parts-of-speech to include into the model pos_tags = ['ADJ', 'NN', 'V'] # stopwords regex = re.compile('\w+') stopwords = request.files['stoplist'] stopwords = str(stopwords.readlines()) stopwords = regex.findall(stopwords) stopwords.extend(("'", "'d", "'s")) # temporary solution print(stopwords) # document size (in words) doc_size = 1000 # uses the pipeline's ParagraphId to split text into documents, # overrides doc_size - 1: on, 0: off doc_split = 0 # no. of topics to be generated no_of_topics = 30 # no. of lda iterations - usually, the more the better, but # increases computing time no_of_passes = 1 # perplexity estimation every n chunks - # the smaller the better, but increases computing time eval = 1 # documents to process at once chunk = 100 # "symmetric", "asymmetric", "auto", or array # (default: a symmetric 1.0/num_topics prior) affects sparsity of # the document-topic (theta) distribution alpha = "symmetric" # custom alpha may increase topic coherence, but may also produce # more topics with zero probability alpha = np.array([ 0.02, 0.02, # 0.02, 0.03, 0.03, 0.03, 0.04, 0.04, 0.04, 0.05, 0.05, 0.04, 0.04, # 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02]) # can be a number (int/float), an array, or None # affects topic-word (lambda) distribution - not necessarily # beneficial to topic coherence eta = None # PREPROCESSING files = request.files.getlist('files') docs = [] doc_labels = [] print("\n reading files ...\n") for file in files: file_label = secure_filename(file.filename).split('.')[0] df = pd.read_csv(file, sep="\t", quoting=csv.QUOTE_NONE) df = df[columns] df = df.groupby('CPOS') doc = pd.DataFrame() for p in pos_tags: # collect only the specified parts-of-speech doc = doc.append(df.get_group(p)) # construct documents if doc_split: # size according to paragraph id doc = doc.groupby('ParagraphId') for para_id, para in doc: docs.append(para['Lemma'].values.astype(str)) doc_labels.append( ''.join([file_label, " #", str(para_id)])) else: # size according to doc_size doc = doc.sort_values(by='TokenId') i = 1 while(doc_size < doc.shape[0]): docs.append( doc[:doc_size]['Lemma'].values.astype(str)) doc_labels.append( ''.join([file_label, " #", str(i)])) doc = doc.drop(doc.index[:doc_size]) i += 1 docs.append(doc['Lemma'].values.astype(str)) doc_labels.append(''.join([file_label, " #", str(i)])) if not os.path.exists(os.path.join(os.getcwd(), "swcorp")): os.makedirs(os.path.join(os.getcwd(), "swcorp")) swpath = os.path.join('swcorp', "".join(file_label)) with open(swpath + ".txt", 'w', encoding="utf-8") as text: text.write(" ".join( word for word in doc['Lemma'].values.astype(str) if word not in stopwords)) print("\n normalizing and vectorizing ...\n") # texts = [ # [word for word in doc if word not in stopwords] for doc in docs] print("\n stopwords removed ...\n") print("\n writing mastercorpus ...\n") mastercorpus = os.path.join(os.getcwd(), 'mycorpus.txt') with open(mastercorpus, 'w', encoding="utf-8") as data: folder = glob.glob("swcorp/*") for text in folder: with open(text, 'r', encoding="utf-8") as text: textline = [re.sub( 
r'\\n\\r', '', document) for document in ' '.join( text.read().split())] if text != folder[-1]: data.write("".join(textline) + "\n") else: data.write("".join(textline)) # MAIN PART mastercorpus = os.path.join(os.getcwd(), 'mycorpus.txt') dictionary = corpora.Dictionary( line.lower().split() for line in open( mastercorpus, encoding="utf-8")) class MyCorpus(object): def __iter__(self): for line in open('mycorpus.txt'): # assume there's one document per line, tokens # separated by whitespace yield dictionary.doc2bow(line.lower().split()) # corpus = buildCorpus(mastercorpus, dictionary) corpus = MyCorpus() # corpus = glob.glob("swcorpus/*") if not os.path.exists("out"): os.makedirs("out") # if not os.path.exists(os.path.join(os.path.join(os.getcwd(), # 'out'), foldername)): os.makedirs(os.path.join # (os.path.join(os.getcwd(), 'out'), foldername)) MmCorpus.serialize( os.path.join(os.path.join(os.getcwd(), "out"), '.'.join( ['corpus.mm'])), corpus) mm = MmCorpus('out/corpus.mm') print(mm) # doc_labels = glob.glob("corpus/*") print("fitting the model ...\n") model = LdaModel( corpus=mm, id2word=dictionary, num_topics=no_of_topics, passes=no_of_passes, eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta) # model = LdaMulticore(corpus=corpus, id2word=dictionary, # num_topics=no_of_topics, passes=no_of_passes, # eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta) print(model, "\n") topics = model.show_topics(num_topics=no_of_topics) for item, i in zip(topics, enumerate(topics)): print("topic #"+str(i[0])+": "+str(item)+"\n") print("saving ...\n") if not os.path.exists("out"): os.makedirs("out") # if not os.path.exists(os.path.join(os.path.join(os.getcwd(), # 'out'), foldername)): # os.makedirs(os.path.join(os.path.join(os.getcwd(), 'out'), # foldername)) with open( os.path.join(os.path.join(os.getcwd(), "out"), ''.join( ["corpus_doclabels.txt"])), "w", encoding="utf-8") as f: for item in doc_labels: f.write(item + "\n") with open( os.path.join(os.path.join(os.getcwd(), "out"), ''.join( ["corpus_topics.txt"])), "w", encoding="utf-8") as f: for item, i in zip(topics, enumerate(topics)): f.write( "".join(["topic #", str(i[0]), ": ", str(item), "\n"])) dictionary.save( os.path.join(os.path.join(os.getcwd(), "out"), '.'.join( ['corpus', 'dict']))) # MmCorpus.serialize( # os.path.join(os.path.join(os.getcwd(), "out"), '.'.join( # [foldername, 'mm'])), corpus) model.save( os.path.join(os.path.join(os.getcwd(), "out"), '.'.join( ['corpus', 'lda']))) print("\n ta-daaaa ...\n") # VISUALIZATION no_of_topics = model.num_topics no_of_docs = len(doc_labels) doc_topic = np.zeros((no_of_docs, no_of_topics)) for doc, i in zip(corpus, range(no_of_docs)): # topic_dist is a list of tuples (topic_id, topic_prob) topic_dist = model.__getitem__(doc) for topic in topic_dist: doc_topic[i][topic[0]] = topic[1] # get plot labels topic_labels = [] for i in range(no_of_topics): # show_topic() returns tuples (word_prob, word) topic_terms = [x[0] for x in model.show_topic(i, topn=3)] topic_labels.append(" ".join(topic_terms)) # cf. https://de.dariah.eu/tatom/topic_model_visualization.html if no_of_docs > 20 or no_of_topics > 20: plt.figure(figsize=(20, 20)) # if many items, enlarge figure plt.pcolor(doc_topic, norm=None, cmap='Reds') plt.yticks(np.arange(doc_topic.shape[0])+1.0, doc_labels) plt.xticks( np.arange(doc_topic.shape[1])+0.5, topic_labels, rotation='90') plt.gca().invert_yaxis() plt.colorbar(cmap='Reds') plt.tight_layout() plt.savefig("./static/corpus_heatmap.svg") return render_template('success.html')
def main():
    # How to create a dictionary from a list of sentences?
    documents = read_course_descriptions()

    # Tokenize (split) the sentences into words
    texts = [[text for text in doc.split()] for doc in documents]

    # Create dictionary
    dictionary = corpora.Dictionary(texts)

    # Get information about the dictionary
    print(dictionary)
    # print(dictionary.token2id)

    # Tokenize the docs
    stopword_nltk = stopwords.words('english')
    tokenized_list = [preprocess(doc, stopword_nltk) for doc in documents]

    # Create the Corpus
    mydict = corpora.Dictionary()
    mycorpus = [mydict.doc2bow(doc, allow_update=True) for doc in tokenized_list]
    # pprint(mycorpus)
    # > [[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(4, 4)]]

    word_counts = [[(mydict[id], count) for id, count in line] for line in mycorpus]
    pprint(word_counts)

    # Create the TF-IDF model
    tfidf = models.TfidfModel(mycorpus, smartirs='ntc')

    # Show the TF-IDF weights
    # for doc in tfidf[mycorpus]:
    #     print([[mydict[id], np.around(freq, decimals=2)] for id, freq in doc])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)

    lda_model = LdaModel(corpus=tfidf[mycorpus], id2word=mydict, random_state=100,
                         num_topics=30, passes=100, chunksize=1000,
                         alpha='asymmetric', decay=0.5, offset=64, eta=None,
                         eval_every=0, iterations=1000, gamma_threshold=0.001,
                         per_word_topics=True)

    # save the model
    lda_model.save('lda_model.model')

    # See the topics
    # lda_model.print_topics(-1)
    lda_model.show_topic(0)
class TopicModel: def __init__(self, topicCollection, string): if string.lower() == "nmf": self.model = "NMF" print("Topic Extraction Model: sklearn.NMF") else: self.model = "LDA" print("Topic Extraction Model: gensim.LDAModel") self.stemmer = PorterStemmer() #Train the LDA model on the current discussion def train(self, sentences): if self.model == "NMF": self.sentenceData = [] for sentence in sentences: self.sentenceData.append(preprocess(sentence, self.stemmer)) self.tfidf_vectorizer = TfidfVectorizer( max_features=1500, ngram_range=(1, 2), preprocessor=' '.join, stop_words='english' ) tfidf = self.tfidf_vectorizer.fit_transform(self.sentenceData) self.nmf = NMF(n_components=2, solver="mu") self.W = self.nmf.fit_transform(tfidf) self.H = self.nmf.components_ else: sentenceData = [] for sentence in sentences: sentenceData.append(preprocess(sentence, self.stemmer)) self.dictionary = Dictionary(sentenceData) bow_corpus = [self.dictionary.doc2bow(doc) for doc in sentenceData] self.lda_model = LdaModel(bow_corpus, num_topics=2, id2word=self.dictionary, passes=10) #Classify a given sentence to one of the topics found in training def classify(self, sentence): if self.model == "NMF": index = self.sentenceData.index(preprocess(sentence, self.stemmer)) topic = self.W.argmax(axis=1)[index] return "Topic " + str(topic) else: bow_vector = self.dictionary.doc2bow(preprocess(sentence, self.stemmer)) return "Topic " + str(sorted(self.lda_model[bow_vector], key=lambda tup: -1*tup[1])[0][0]) #Shows the terms of a given topic def showTerms(self, topic): if self.model == "NMF": terms = "" top_features = [] tfidf_feature_names = self.tfidf_vectorizer.get_feature_names() for topic_idx, topicID in enumerate(self.H): if topic_idx == int(topic.split(' ')[-1]): top_features_ind = topicID.argsort()[:-20 - 1:-1] top_features = [tfidf_feature_names[i] for i in top_features_ind] weights = topicID[top_features_ind] for term in top_features: terms += term + ", " print(topic.split(' ')[-1] + " " + terms) return terms else: terms = "" topic = int(topic.split(" ")[-1]) for term in self.lda_model.show_topic(topic): terms += term[0] + ", " print(str(topic) + " " + terms) return terms #Gets the probability or the coefficient of the given term in the topic def getCoeff(self, topic, term): if self.model == "NMF": weights = [] top_features = [] tfidf_feature_names = self.tfidf_vectorizer.get_feature_names() for topic_idx, topicID in enumerate(self.H): if topic_idx == topic: top_features_ind = topicID.argsort()[:-20 - 1:-1] top_features = [tfidf_feature_names[i] for i in top_features_ind] weights = topicID[top_features_ind] for coeff, terms in zip(weights, top_features): if terms == term: return coeff else: topic = int(topic.split(" ")[-1]) for terms in self.lda_model.show_topic(topic): if terms[0] == term: return terms[1] #Shows all the topics found in training def showTopics(self): if self.model == "NMF": ret = [] for topic_idx, topicID in enumerate(self.H): ret.append("Topic " + str(topic_idx)) return ret else: topics = self.lda_model.print_topics() ret = [] for topic in topics: ret.append("Topic " + str(topic[0])) return ret #Returns a flag to check what model is deployed at the moment def getModel(self): return self.model
        except:
            continue
        writer.writerow(new_sentence)
        new_sentences.append(new_sentence)

# Build a dictionary mapping words to word IDs
dictionary = Dictionary(new_sentences)
# Convert to the bag-of-words format that LdaModel can read
corpus = [dictionary.doc2bow(text) for text in new_sentences]
# Train the model with the chosen number of topics
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=9,
               minimum_probability=0.001, passes=20, update_every=0, chunksize=10000)

with open('output_topics.csv', 'w', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    for i in range(9):
        writer.writerow([f'------------topic{i}------------'])
        print("\n")
        print("=" * 80)
        print("TOPIC {0}\n".format(i))
        topic = lda.show_topic(i, topn=20)
        for t in topic:
            print("{0:20s}{1}".format(t[0], t[1]))
            writer.writerow([t[0], t[1]])
# WordCloud
# Download a Japanese font and place it under work/
fig, axs = plt.subplots(ncols=2, nrows=math.ceil(lda_model.num_topics / 2), figsize=(16, 20))
axs = axs.flatten()


def color_func(word, font_size, position, orientation, random_state, font_path):
    return 'darkturquoise'


for i, t in enumerate(range(lda_model.num_topics)):
    x = dict(lda_model.show_topic(t, 30))
    im = WordCloud(background_color='black',
                   color_func=color_func,
                   max_words=4000,
                   width=300, height=300,
                   random_state=0,
                   font_path='./work/ipaexg.ttf').generate_from_frequencies(x)
    axs[i].imshow(im.recolor(colormap='Paired_r', random_state=244), alpha=0.98)
    axs[i].axis('off')
    axs[i].set_title('Topic ' + str(t))

# vis
plt.tight_layout()
plt.show()
print('Building bag-of-words corpus ...')
bow_corpus = [dictionary.doc2bow(t) for t in texts]

print('Serializing corpus (%s) ...' % BOW)
MmCorpus.serialize(BOW, bow_corpus)

# Hold out the last 10% of documents for perplexity evaluation
size = len(bow_corpus) * 9 // 10
training = bow_corpus[:size]
testing = bow_corpus[size:]

t0 = time()
print('Training LDA w/ %d topics on first %d texts ...' % (Num_Topics, len(training)))
lda = LdaModel(training, id2word=dictionary, num_topics=Num_Topics, passes=5)
print("done in %0.3fs." % (time() - t0))

print('Saving LDA model (%s) ...' % NSFLDA)
lda.save(NSFLDA)

print('Random subset of topics:')
print('\n'.join(str(t) for t in lda.print_topics()))

print('Computing perplexity on %d held-out documents ...' % len(testing))
perplexity = 2 ** -(lda.log_perplexity(testing))
print('Perplexity: %.2f' % perplexity)

for i in range(Num_Topics):
    temp = lda.show_topic(i, 10)
    terms = [term[0] for term in temp]  # show_topic yields (word, probability) pairs
    print("Top 10 terms for topic #" + str(i) + ": " + ", ".join(terms))
largest = pairwise.max()
for ti in range(len(topics)):
    pairwise[ti, ti] = largest + 1


def closest_to(doc_id):
    return pairwise[doc_id].argmin()


counts = np.zeros(100)
for doc_top in topics:
    for ti, _ in doc_top:
        counts[ti] += 1

words = lda_model.show_topic(counts.argmax(), 64)
print(words)

# # plot
#
# for ti in xrange(84):
#     words = lda_model.show_topic(ti, 64)
#     tf = sum(f for f, w in words)
#     print('\n'.join('{}:{}'.format(w, int(1000. * f / tf)) for f, w in words))
#     print()
#     print()
#     print()
#
# thetas = [lda_model[c] for c in corpus_lda]
def get_topics(raw_text, ngram=1, vocab_binary=True, nwords=30, ntopics=1):
    # Enable logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    print('Number of documents: ' + str(len(raw_text)))
    print('\nTokenizing documents..')
    tokens = []
    for doc in raw_text:
        try:
            doc = doc.encode("utf-8")
            token = word_tokenize(str(doc))
            clean_token = [
                i.lower() for i in token
                if i.strip() not in stop_words and i[:-2].strip() not in stop_words
                and i[:-1].strip() not in stop_words and i.strip().isalpha()
                and len(i.strip()) > 1
            ]
            ngram_tokens = []
            ngram_tokens.extend([x[0] for x in ngrams(clean_token, 1)])
            if ngram > 1:
                for i in range(2, ngram + 1):
                    ngram_tokens.extend([' '.join(x) for x in ngrams(clean_token, i)])
            if vocab_binary:
                tokens.append(set(ngram_tokens))
            else:
                tokens.append(ngram_tokens)
        except Exception as e:
            print(e)

    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(tokens)
    # dictionary.filter_extremes(no_above=1.0, keep_n=None)
    len(dictionary)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in tokens]
    lm = LdaModel(corpus=corpus, id2word=dictionary, passes=1, num_topics=ntopics)

    # get topic 0 words (Topic: Default) for foam-tree viz
    words = lm.show_topic(0, nwords)
    output = []
    weights = [x[1] for x in words]
    scaled_weights = scale_range(weights, 5, 60)
    scaled_weights = numpy.nan_to_num(scaled_weights, 5)
    for i, word in enumerate(words):
        output.append({'label': word[0], 'weight': int(scaled_weights[i])})
    return output
class WordEmbeddingRewarder(): def __init__(self): self.word_embedding = KeyedVectors.load_word2vec_format(W2V_PATH) def __call__(self, docs, summaries): tfs = [] df = OrderedDict() weights = [] entities = [] for doc in docs: tf = OrderedDict() token_found = set() doc_token = [] for sent in doc[1]: sent = sent2tokens_wostop(sent, set(stopwords.words(LANGUAGE)), LANGUAGE) for token in sent: if token in tf: tf[token] += 1 else: tf[token] = 1 if token not in token_found: token_found.add(token) if token in df: df[token] += 1 else: df[token] = 1 embedding = np.zeros(300) try: embedding += self.word_embedding[token] except KeyError: pass embedding /= len(embedding) weights.append(embedding) entities.append(str(len(entities))) tfs.append(tf) id2word = {i:word for i, word in enumerate(df.keys())} word2id = {id2word[id]:id for id in id2word.keys()} corpora = [[(word2id[token], tf[token]) for token in tf.keys()] for tf in tfs] self.doc_entities = [] for i, tf in enumerate(tfs): divisor = sum([tf[token]/df[token] for token in tf.keys()]) embedding = [] for token in tf.keys(): try: embedding.append(self.word_embedding[token]*tf[token]/df[token]) except KeyError: pass embedding = np.sum(np.array(embedding), 0)/(len(embedding)*divisor) weights.append(embedding) entities.append('d'+str(i)) self.doc_entities.append('d'+str(i)) self.lda = LdaModel(corpus=corpora, num_topics=10, id2word=id2word, passes=10) self.topic_entities = [] for i in range(10): topic_words = self.lda.show_topic(i, topn=30) embedding = [] divisor = sum([w_p_pair[1]for w_p_pair in topic_words]) for w_p_pair in topic_words: try: embedding.append(self.word_embedding[w_p_pair[0]]*w_p_pair[1]/divisor) except KeyError: pass embedding = np.sum(np.array(embedding), 0)/len(embedding) weights.append(embedding) entities.append('t'+str(i)) self.topic_entities.append('t'+str(i)) self.sent_embedding = WordEmbeddingsKeyedVectors(300) self.sent_embedding.add(entities, np.array(weights), replace=True) return self.distributional_semantic_similarity(summaries), self.topic_relevance(summaries), self.coherence(summaries) def distributional_semantic_similarity(self, summaries): results = [] for summ in summaries: sent_entities = list(map(str, summ)) wmd = self.sent_embedding.wmdistance(sent_entities , self.doc_entities) results.append(wmd) return results def topic_relevance(self, summaries): results = [] for summ in summaries: sent_entities = list(map(str, summ)) wmd = self.sent_embedding.wmdistance(sent_entities , self.topic_entities) results.append(wmd) return results def coherence(self, summaries): results = [] for summ in summaries: sim = [] for i in range(len(summ)-1): s = cosine(self.sent_embedding[str(summ[i])], self.sent_embedding[str(summ[i+1])]) if not np.isnan(s): sim.append(s) sim = np.array(sim) results.append([np.mean(sim), np.std(sim)]) return results
with open(os.path.join(path, 'data.tsv'), encoding='utf8') as f:
    reader = csv.reader(f, delimiter="\t")
    for line in reader:
        labels = line[0].split(', ')
        multi_hot_labels.append(labels)
        c = line[1:]
        c = clean_data(c)
        context.extend(c)

# convert to multi-hot encoding
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(multi_hot_labels)
label_list = list(mlb.classes_)

token_context = [word_tokenize(x) for x in context]
token_list = []
for x in token_context:
    temp = [i for i in x if i not in stop_words]
    token_list.append(temp)
token_context = [clean_data(x) for x in token_list]
del token_list

common_dictionary = Dictionary(token_context)
common_corpus = [common_dictionary.doc2bow(text) for text in token_context]

# Train the model on the corpus.
lda = LdaModel(common_corpus, id2word=common_dictionary, alpha='auto',
               num_topics=3, passes=5)
print(lda.show_topic(2, 20))
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] >= 1] for text in texts]

from pprint import pprint  # pretty-printer

dictionary = corpora.Dictionary(texts)
# dictionary.save('/tmp/deerwester.dict')  # store the dictionary, for future reference
# print(dictionary)

corpus = [dictionary.doc2bow(text) for text in texts]
# corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)

lda = LdaModel(corpus, num_topics=2)

# on a new document:
new_doc = "pretty obvious that when i write my tellall memoir someday there will be four to six"
new_vec = dictionary.doc2bow(new_doc.lower().split())

print(lda.print_topic(0))
print(lda.show_topic(1))
print(lda.get_document_topics(new_vec))
import pickle

from gensim.models import CoherenceModel

ldamodel = pickle.load(
    open("\\Users\\hamed\\Desktop\\ECE 143 Project Data Files\\ldamodel_100_QAT.pkl", "rb"))
ldamulticore = pickle.load(
    open("\\Users\\hamed\\Desktop\\ECE 143 Project Data Files\\ldamulticore_100_QAT.pkl", "rb"))

# In[ ]:

# Get top significant terms and their probabilities for each topic using LDA multicore
topics_ldam = [[(term, round(wt, 3)) for term, wt in ldamodel.show_topic(n, topn=20)]
               for n in range(0, ldamodel.num_topics)]

# In[ ]:

# 20 most probable words for each topic for LDA
topics_df = pd.DataFrame(
    [[term for term, wt in topic] for topic in topics_ldamulticore],
    columns=['Term' + str(i) for i in range(1, 21)],
    index=['Topic ' + str(t) for t in range(1, ldamulticore.num_topics + 1)]).T
topics_df.head()

# In[ ]:

# 5 most probable words for each topic for LDA
topics_df = pd.DataFrame(
class LdaModelHelper: status_scheduled = 'scheduled' status_computing = 'computing' status_completed = 'completed' status_error = 'killed' default_use_lemmer = True default_min_df = 2 default_max_df = 0.8 def __init__(self, training_number_of_topics_to_extract, language, training_use_lemmer=True, training_min_df=2, training_max_df=0.8, chunksize=2000, passes=2): """ :rtype: LdaModelHelper :param training_use_lemmer: :param training_min_df: int or float, min document frequency / document proportion (if float < 1) to consider a term in the model :param training_max_df: int or float, max document frequency / document proportion (if float < 1) to consider a term in the model """ self.language = language self.analysis_use_lemmer = LdaModelHelper.default_use_lemmer self.analysis_min_df = LdaModelHelper.default_min_df self.analysis_max_df = LdaModelHelper.default_max_df self.analysis_corpus = None self.analysis_features_names = None self.analysis_documents = None self.training_number_of_topics_to_extract = training_number_of_topics_to_extract self.training_use_lemmer = training_use_lemmer self.training_min_df = training_min_df self.training_max_df = training_max_df self.chunksize = chunksize self.passes = passes self.training_corpus = None self.training_features_names = None self.analysis_documents = None self.training_documents = None self.lda_model = None self.model_computation_time = None self.topic_labels = None self.topic_assignment = None def set_analysis_parameters(self, analysis_use_lemmer=True, analysis_min_df=2, analysis_max_df=0.8): self.analysis_use_lemmer = analysis_use_lemmer self.analysis_min_df = analysis_min_df self.analysis_max_df = analysis_max_df # reset related fields self.topic_assignment = None self.topic_labels = None self.analysis_corpus = None self.analysis_features_names = None self.analysis_documents = None def generate_model_filename(self): return "_".join([ str(time.time()), str(self.training_number_of_topics_to_extract), str(self.training_min_df), str(self.training_max_df), str(self.training_use_lemmer) ]).replace('.', '') def set_lda_model(self, lda_model): self.lda_model = lda_model ##################### # Model computation ##################### def compute_lda_model(self, texts): """ Compute the lda model :return: """ if self.training_corpus is None: self.compute_corpus(texts, parameters='training') if self.training_corpus is None or len(self.training_corpus) == 0: raise Exception( 'The training corpus is empty. Tune model computation parameters.' ) start = time.time() if self.passes == 2: passes = 10 if (len(self.training_corpus) / self.chunksize) < 10 else 2 else: passes = self.passes id2word = {k: v for k, v in enumerate(self.training_features_names)} self.lda_model = LdaModel( self.training_corpus, id2word=id2word, num_topics=self.training_number_of_topics_to_extract, eval_every=1, passes=passes, chunksize=self.chunksize) end = time.time() self.model_computation_time = end - start def save_model_to_file(self, file_path): """ :type file_path: str :param file_path: the path of the models file :return: """ if self.lda_model is None: logging.error('The model has not been computed yet.') return False else: self.lda_model.save(file_path) def load_model_from_file(self, input_filepath): """ :param input_folder: :return: """ self.lda_model = LdaModel.load(input_filepath) def compute_corpus(self, texts, parameters='training'): """ Compute the corpus in gensim format considering the specified set of parameters 'training' or 'analysis'. 
:param parameters: :param texts: :return: """ if parameters == 'training': tf_matrix, tf_matrix_features_names, tf_matrix_docs_ids = self.compute_tf_matrix( texts, parameters) if tf_matrix_features_names is None or len( tf_matrix_features_names) == 0: return [] self.training_corpus = matutils.Sparse2Corpus( tf_matrix, documents_columns=False) self.training_features_names = tf_matrix_features_names self.training_documents = tf_matrix_docs_ids return self.training_corpus elif parameters == 'analysis': if self.lda_model is None: logging.error('The model has not been computed yet.') return None else: # Note: words not included in the model are ignored tf_matrix, tf_matrix_features_names, tf_matrix_docs_ids = self.compute_tf_matrix( texts, parameters) if len(tf_matrix_features_names) == 0: return [] corpus = [None] * tf_matrix.shape[0] if len(tf_matrix_features_names) != 0: word2id = { self.lda_model.id2word[id]: id for id in self.lda_model.id2word.keys() } for i in range(tf_matrix.shape[0]): doc = tf_matrix.getrow(i) _, cols = doc.nonzero() corpus[i] = [None] * len(cols) count = 0 for col in cols: if tf_matrix_features_names[col] in word2id.keys(): corpus[i][count] = (int( word2id[tf_matrix_features_names[col]]), int(tf_matrix[i, col])) count += 1 corpus[i] = corpus[i][:count] self.analysis_corpus = corpus self.analysis_features_names = tf_matrix_features_names self.analysis_documents = tf_matrix_docs_ids return self.analysis_corpus else: logging.error( "Value not allowed for argument parameters. Allowed values are 'training' or 'analysis'." ) return None def compute_corpus_single_query(self, text): """ Compute the corpus in gensim format for a single query (this implies using special parameters for preprocessing) :param text: :return: """ if self.lda_model is None: logging.error('The model has not been computed or loaded yet.') return None, None else: # Note: words not included in the model are ignored stopwords_list = lda_utils.get_stopwords(self.language) tf_matrix, tf_matrix_features_names = lda_utils.compute_tf( [text], stopwords_list, self.language, True, 1, 1.0) if len(tf_matrix_features_names) == 0: return [], tf_matrix_features_names corpus = [None] * tf_matrix.shape[0] if len(tf_matrix_features_names) != 0: word2id = { self.lda_model.id2word[id]: id for id in self.lda_model.id2word.keys() } for i in range(tf_matrix.shape[0]): doc = tf_matrix.getrow(i) _, cols = doc.nonzero() corpus[i] = [None] * len(cols) count = 0 for col in cols: if tf_matrix_features_names[col] in word2id.keys(): corpus[i][count] = (int( word2id[tf_matrix_features_names[col]]), int(tf_matrix[i, col])) count += 1 corpus[i] = corpus[i][:count] return corpus, tf_matrix_features_names def compute_tf_matrix(self, texts, parameters='training'): """ Compute the tf matrix using the specified set of parameters ('training' or 'analysis'). If texts is not specified the system tries to retrieve data directly from the associated db. :param parameters: 'training' or 'analysis' :param texts: list of strings representing texts to transform. 
:return: """ tf_matrix_docs_id = None if parameters == 'training' or parameters == 'analysis': stopwords_list = lda_utils.get_stopwords(self.language) if parameters == 'training': use_lemmer = self.training_use_lemmer min_df = self.training_min_df max_df = self.training_max_df else: use_lemmer = self.analysis_use_lemmer min_df = self.analysis_min_df max_df = self.analysis_max_df tf_matrix, tf_matrix_features_names = lda_utils.compute_tf( texts, stopwords_list, self.language, use_lemmer, min_df, max_df) else: logging.error( "Value not allowed for argument parameters. Allowed values are 'training' or 'analysis'." ) return None return tf_matrix, tf_matrix_features_names, tf_matrix_docs_id def compute_topic_assignment(self, texts): """ Computes the topics assignment for each document w.r.t the specified topic_model Example of output = [[(25, 0.1174058544855012), (49, 0.82926081218116554)], [(6, 0.29928250617927882), (49, 0.59405082715405444)]] :param texts: :return: """ corpus = self.compute_corpus(texts, parameters='analysis') if len(corpus) == 0: raise Exception( 'The corpus is empty. Tune analysis parameters and check stopwords.' ) computed_assignment = self.lda_model[corpus] if texts is not None: # is the corpus related to analysis parameters self.topic_assignment = computed_assignment return computed_assignment def compute_topic_assignment_for_query(self, text): corpus, _ = self.compute_corpus_single_query(text) if corpus is None or len(corpus) == 0: raise Exception( 'The corpus is empty. Tune analysis parameters and check stopwords.' ) computed_assignment = self.lda_model[corpus] return computed_assignment ####################### # Print functions ####################### def print_topic_assignment(self, topic_assignment): """ Print a topic assignment in a human readable format :param topic_assignment: :return: """ print('\tTopic importance\tTopic description') for i, doc in enumerate(topic_assignment): print('Document {0}'.format(i)) for a in doc: print() string_topic = a[ 0] if self.lda_model is None else self.lda_model.print_topic( a[0]) print('\t{1:2f}\t\t{0}'.format(string_topic, a[1])) def print_all_topics(self, num_topics=10, num_words=20, try_to_disambiguate=False, min_word_probabity_for_disambiguation=0.010): """ Print topics from a given LdaModel """ print('Print {0} topics'.format(num_topics)) print('------------') for t in self.lda_model.show_topics(num_topics=num_topics, num_words=num_words, formatted=False): if try_to_disambiguate: possible_labels = self.__class__.label_topic_by_probability( self.lda_model.show_topic(t[0]), min_word_probability=min_word_probabity_for_disambiguation )[:2] print('{0}:\t{1}\n'.format(t[0], possible_labels)) print('{0}\n'.format(t[1])) else: print('{0}:\t{1}\n'.format(t[0], t[1])) def get_topic_description(self, topic_id, num_words=20): """ Print topics from a given LdaModel """ if self.lda_model is None: logging.error('The model has not been computed yet.') else: return self.lda_model.show_topic(topic_id, num_words) ####################### # Labeling functions ####################### def compute_topic_labels(self, labeling_mode='mixed', min_word_probability=0.01, max_number_of_words_per_query=6, n_words_to_label=3): """ The labeling is performed querying wikipedia with a set of representative words for the topic. 
The words are chosen with the parameter labeling_mode: - 'based_on_probability': considers all words with a weight (probability) greater than 0.010 - 'based_on_top_words': considers the 3 most probable words for the topic - 'mixed': try with 'based_on_probability', if there are no results try with 'based_on_top_words' """ if self.lda_model is None: logging.error('No LDA model loaded.') n_labels_to_save = 3 self.topic_labels = {} # label topics for t in self.lda_model.show_topics( num_topics=self.training_number_of_topics_to_extract, num_words=40, formatted=False): topic_id = t[0] possible_labels = [] if labeling_mode == 'mixed' or labeling_mode == 'based_on_probability': possible_labels = self.__class__.label_topic_by_probability( self.lda_model.show_topic(topic_id), min_word_probability=min_word_probability, max_words=max_number_of_words_per_query)[:n_labels_to_save] if len(possible_labels) == 0: # try to disambiguate by n_words possible_labels = self.__class__.label_topic_by_number_of_words( self.lda_model.show_topic(topic_id), n_words=n_words_to_label)[:n_labels_to_save] for i in range(len(possible_labels), n_labels_to_save): # fill empty labels possible_labels.append('') self.topic_labels[topic_id] = possible_labels time.sleep(0.5) def get_topic_labels(self): if self.topic_labels is None: self.compute_topic_labels() return self.topic_labels def get_all_topics(self): """ Return a dictionary where keys are topic ids (integers) and values are words distributions. Words distribution should be a dictionary where keys are words and values are words weights within the topic :rtype: dict :return: """ topics = {} for t in self.lda_model.show_topics( num_topics=self.training_number_of_topics_to_extract, num_words=config.max_number_of_words_per_topic, formatted=False): topic_id = t[0] topic_distr = self.get_word_frequencies( self.lda_model.show_topic( topic_id, config.max_number_of_words_per_topic)) topics[topic_id] = topic_distr return topics def _get_words_distribution(self, topic_id): """ Return a a dictionary where keys are words and values are words weights within the topic :param topic_id: the topic index :rtype: dict :return: """ topic_description = self.lda_model.show_topic( topic_id, config.max_number_of_words_per_topic) return self.__class__.get_word_frequencies(topic_description) @classmethod def delete_model_files(cls, folder_path, files_prefix): """ Delete all files related to a model that have the specified file prefix :param folder_path: :param files_prefix: :rtype: :return: 200 if all files have been removed, 404 if files does not exist """ if os.path.exists(os.path.join(folder_path, files_prefix)): files_to_remove = [ files_prefix, files_prefix + ".state", files_prefix + ".expElogbeta.npy", files_prefix + ".id2word", ] for f in files_to_remove: os.remove(os.path.join(folder_path, f)) return 200 else: logging.error('[ERROR] Model files does not exists.') return 404 ####################### # Topic labeling ####################### @classmethod def label_topic_by_probability(cls, topic_description, min_word_probability=0.010, max_words=6): """ Try to disambiguate a topic considering all words with a weight greater than min_word_probability :param max_words: :param topic_description: is a list of pairs (word, word_probability) :param min_word_probability: is the minimum probability for words :return: list of strings, possible wikipedia pages """ words = [w for w, p in topic_description if p >= min_word_probability] words = words[:max_words] if len(words) == 0: # if no words are over 
the threshold return empty res = [] else: res = wikipedia.search(' '.join(words)) return res @classmethod def label_topic_by_number_of_words(cls, topic_description, n_words=5): """ Try to disambiguate a topic considering top k words in its description :param n_words: :param topic_description: is a list of pairs (word, word_probability) :return: list of strings, possible wikipedia pages """ words = [t[0] for i, t in enumerate(topic_description) if i < n_words] if len(words) == 0: # if no words are over the threshold, take the first words = [topic_description[0][0]] res = wikipedia.search(' '.join(words)) return res @classmethod def get_word_frequencies(cls, topic_description): """ Given a topic description, returns the corresponding dictionary with words as keys and frequencies (weight * 1000) as values. :param topic_description: list of pairs (word, word_weight) :return: """ frequencies = {w: f for w, f in topic_description} return frequencies
obj = lda.get_topics()
a = lda.inference(corpus)
print(doc_distribution[:853])

# training corpus document by topic matrix
doc_topic_dist_corpus = np.array([[tup[1] for tup in lst] for lst in lda[corpus]])

save_obj(lda, 'LDA_MODEL_APPLICATION')

#%%
lda = load_obj('LDA_MODEL_APPLICATION')

fig, axes = plt.subplots(2, 3, figsize=(20, 10), sharex=True, sharey=True)
for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    plt.imshow(WordCloud(background_color="white").fit_words(dict(lda.show_topic(i, 200))))
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')

plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

#%%
# finding dominant topics in the corpus for each document
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()
files = ["deprecated_libraries", "explicit_mention"]

for filename in files:
    # read in the data
    filepath = "data/filtered/tokenized/" + filename + "_lda.pkl"
    df = read_pickle(filepath)

    num_topics = 20
    chunksize = 300
    dictionary = Dictionary(df['tokenized'])
    corpus = [dictionary.doc2bow(doc) for doc in df['tokenized']]

    # low alpha means each document is only represented by a small number of topics, and vice versa
    # low eta means each topic is only represented by a small number of words, and vice versa
    model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                     alpha=1e-2, eta=0.5e-2, chunksize=chunksize, passes=5)

    # save the model
    model_name = "models/" + filename + "_ldamodel"
    model.save(model_name)

    print("\n====\n")
    print(filename.upper())
    for topic_id in range(model.num_topics):
        topic = model.show_topic(topic_id, 10)
        topic_words = [w for w, _ in topic]
        print('{}: {}'.format(topic_id, ' '.join(topic_words)))
class LDA(object): def __init__(self, source=None, corpus=None, client=None, corpus_specs=None): self.corpus = corpus # if sum([t is not None for t in [source, corpus, client]]) != 1: # raise NameError( # 'You need to provide one and only one of those (source, corpus, client)') if source: client = get_instance(**source) if client: corpus_specs = corpus_specs or {} self.corpus = client.get_corpus(**corpus_specs) self.native_model = None self.dictionary = None def train(self, num_topics, alpha="auto", passes=10, eta="auto", **kargs): """Train Model""" kargs = kargs or {} prev_mode = self.corpus.mode self.corpus.mode = "bow" self.native_model = LdaModel(self.corpus, id2word=self.corpus.get_dictionary(), num_topics=num_topics, alpha=alpha, passes=passes, eta=eta, **kargs) self.corpus.mode = prev_mode @classmethod def __display(cls, data): template_file = os.path.join( os.path.dirname(os.path.abspath(__file__)), "TopicSheet.html.jinja2") with open(template_file) as file_: template = Template(file_.read()) html = template.render(data) display(HTML(html)) def save(self, model_path): self.native_model.save(model_path) def load(self, model_path): self.native_model = LdaModel.load(model_path) self.dictionary = self.native_model.id2word if self.corpus: self.corpus.dictionary = self.native_model.id2word def get_topics_matrix(self, topn=10): data = [] for i in range(self.native_model.num_topics): data.append((i, self.native_model.show_topic(i, topn=topn))) return data def get_dictionary(self): if self.dictionary: return self.dictionary if self.corpus: return self.corpus.get_dictionary() if self.native_model.id2word: return self.native_model.id2word return None def predict(self, text): text = self.corpus.doc2bow(text) result = sorted(self.native_model[text], key=lambda d: d[1], reverse=True) return result def get_topic_words(self, topic, topn=10): return self.native_model.show_topic(topic, topn=topn) def compute_uniquiness_score(self, matrix, sort=False): num_topic = self.native_model.num_topics temp_dic = {} for topic in matrix: for word in topic[1]: w = word[0] count = temp_dic.get(w, 0) temp_dic[w] = count + 1 for topic_idx in range(len(matrix)): topic = matrix[topic_idx] words_topic = topic[1] for word_idx in range(len(words_topic)): word = matrix[topic_idx][1][word_idx] words_topic[word_idx] = (*word, 1 - temp_dic[word[0]] / num_topic) if sort: words_topic = sorted(words_topic, key=lambda w: w[1] * w[2], reverse=True) topic = (topic[0], words_topic) matrix[topic_idx] = topic return matrix def display_topics(self, topn=20, sort_by_uniquiness=False): matrix = self.get_topics_matrix(topn=topn) matrix = self.compute_uniquiness_score(matrix, sort=sort_by_uniquiness) self.__display({"topics": matrix}) return matrix def display_topic(self, topic_id, topn=20): matrix = self.get_topics_matrix(topn=topn) topic = [self.compute_uniquiness_score(matrix, sort=True)[0]] self.__display({"topics": topic})
    id2word=dictionary,
    num_topics=num_topics,
    iterations=5,
    passes=10,
    alpha='auto'
)

# EXPORT DATA TO FILES
word_dict = {}
today = date.today()
today_path = '../data/topic_today_EN.csv'
hist_path = '../data/topic_history_EN.csv'

for i in range(num_topics):
    words = lda_model.show_topic(i, topn=10)
    word_dict['date'] = today
    word_dict['Topic'] = [w[0] for w in words]

topic_today = pd.DataFrame(word_dict)
topic_today.to_csv(today_path, index=False)

if os.path.isfile(hist_path):
    topic_hist = pd.read_csv(hist_path)
    topic_hist = pd.concat([topic_hist, topic_today])
    topic_hist.to_csv(hist_path, index=False)
else:
    topic_today.to_csv(hist_path, index=False)
# topic distribution for the given document bow, as a list of (topic_id, topic_probability) 2-tuples.
test = dct.doc2bow("I love Kitten".lower().strip().split())
print(lda.get_document_topics(test))
print(lda[test])

# arguments: (word_id, minimum_probability=None)
# topics associated with the given word.
# Each topic is represented as a tuple of (topic_id, term_probability).
print(lda.get_term_topics(0))

# ----- print the composition of a given topic -----
# arguments: (topicid, topn=10)
# output: a list, format: [(word_id, probability), ...].
print(lda.get_topic_terms(0))

# arguments: (topicno, topn=10)
# output: a list, format: [(word, probability), ...].
print(lda.show_topic(0))

# output: a string, format: '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + ...'.
# arguments: (topicno, topn=10)
print(lda.print_topic(0))

# ----- print the composition of all topics -----
# default arguments: (num_topics=10, num_words=10, log=False, formatted=True)
# output: a list of strings, format: [(0, '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + ...'), ...]
print(lda.show_topics())

# [num_topics, vocabulary_size] array of floats (self.dtype)
# which represents the term topic matrix learned during inference.
print(lda.get_topics())

# ----- save and load model -----
lda.save(fname="lda_model")
lda.load(fname="lda_model")