def refine_metadata(metadata_file_name, new_metadata_file_name, graph_file_name,
                    lda_preamble=True, null_inference=True):
    """Refine an existing metadata file with topic and null-value inference.

    For every entry under the ``"files"`` key of the input JSON:

    * if ``"lda"`` is not among its extractors, a topic mixture is computed
      with ``topic_mixture`` and stored under ``"topics"``; the words of the
      highest-weighted topic are stored under ``"tags"``;
    * if ``"columnar"`` is among its extractors and ``null_inference`` is
      true, and ``inferred_nulls`` reports any non-zero value, the file is
      re-extracted and its ``"columns"`` entry replaced.

    The refined list of entries is written as JSON to
    ``new_metadata_file_name``. The output file is only opened after all
    entries were processed, so a failure part-way through no longer leaves a
    truncated output file behind.

    :param metadata_file_name: (str) path of the JSON metadata file to read
    :param new_metadata_file_name: (str) path of the JSON file to write
    :param graph_file_name: (str) path of a pickled object passed to
        ``topic_mixture``
    :param lda_preamble: (bool) forwarded to ``extract_columnar_metadata``
    :param null_inference: (bool) whether to re-extract columnar files whose
        inferred nulls are not all zero
    """
    # NOTE(security): unpickling can execute arbitrary code; only load graph
    # files that come from a trusted source.
    with open(graph_file_name, "rb") as graph_file:
        G = pkl.load(graph_file)

    with open(metadata_file_name, "r") as metadata_file:
        metadata = json.load(metadata_file)["files"]

    for item in metadata:
        system = item["system"]
        full_path = os.path.join(system["path"], system["file"])
        print("refining metadata for {}".format(full_path))

        if "lda" not in system["extractors"]:
            topics = topic_mixture(item, metadata, G)
            item["topics"] = topics
            # each entry of topics is indexed as (topic id, weight)
            max_topic = max(topics, key=lambda pair: pair[1])[0]
            item["tags"] = [
                str(w[0]) for w in LdaModel.show_topic(lda_model, max_topic)
            ]

        if "columnar" in system["extractors"] and null_inference:
            nulls = inferred_nulls(item)
            if any(null != 0 for null in nulls):
                with open(full_path, "r") as file_handle:
                    item["columns"] = extract_columnar_metadata(
                        file_handle,
                        pass_fail=False,
                        lda_preamble=lda_preamble,
                        null_inference=True,
                    )["columns"]

    with open(new_metadata_file_name, "w") as new_metadata_file:
        json.dump(metadata, new_metadata_file)
def train_lda(n_topics, id2word_dictionary=None, documents=None, corpus=None):
    """
    Train an LDA model and return it together with its vocabulary mappings.

    documents is a list of lists of words/tokens. When no dictionary is
    passed, one is built from documents; when documents are given and no
    corpus is passed, the bag-of-words corpus is built from documents too.

    Returns the tuple (lda_model, id2word_dictionary, word2idx_dictionary,
    topics), where topics holds one show_topic() result per topic.
    """
    # Build the id -> word dictionary unless the caller supplied one
    if not id2word_dictionary:
        id2word_dictionary = corpora.Dictionary(documents)

    # Reverse mapping: word -> id
    word2idx_dictionary = {
        word: idx for idx, word in id2word_dictionary.items()
    }

    # Build the bag-of-words corpus unless the caller supplied one
    if documents and not corpus:
        to_bow = id2word_dictionary.doc2bow
        corpus = [to_bow(tokens) for tokens in documents]

    # Cluster the documents into n_topics topics using LDA
    lda_model = LdaModel(
        corpus=corpus,
        id2word=id2word_dictionary,
        num_topics=n_topics,
        update_every=1,
        chunksize=10000,
        passes=1,
    )

    # The default topn (number of top words shown per topic) is 10. A high
    # enough value should return the words covering most or all of the
    # probability mass.
    topics = []
    for topic_id in range(n_topics):
        topics.append(lda_model.show_topic(topic_id, topn=50000))

    return lda_model, id2word_dictionary, word2idx_dictionary, topics
class LMDL_LDA():
    """LDA topic model built over the word lists of an ``LMDL_Corpus``."""

    def __init__(self):
        """Load the corpus, build dictionary and bag-of-words, train LDA."""
        self.lmdl = LMDL_Corpus()
        self.texts = self.lmdl.get_corpus_texts_words()
        self.dictionary = Dictionary(self.texts)
        to_bow = self.dictionary.doc2bow
        self.corpus = [to_bow(words) for words in self.texts]
        self.lda = LdaModel(
            self.corpus,
            num_topics=LDA_NUM_TOPICS,
            id2word=self.dictionary,
        )

    def print_topics(self):
        """Return the model's ``print_topics`` output for all topics."""
        return self.lda.print_topics(LDA_NUM_TOPICS)

    def get_document_topics(self, document_name):
        """Return the top-10 word list of every topic found in a document.

        The document is looked up by name through the corpus object,
        converted to bag-of-words, and passed to the model; one
        ``show_topic`` result is returned per topic the model reports.
        """
        tokens = self.lmdl.token_list_processed(document_name)
        bag_of_words = self.dictionary.doc2bow(tokens)
        document_topics = self.lda.get_document_topics(
            bag_of_words,
            minimum_probability=None,
            minimum_phi_value=None,
            per_word_topics=False,
        )
        return [
            self.lda.show_topic(entry[0], topn=10)
            for entry in document_topics
        ]

    def top_topics(self):
        """Return the model's ``top_topics`` result using u_mass coherence."""
        return self.lda.top_topics(
            corpus=self.corpus,
            texts=self.texts,
            dictionary=self.dictionary,
            window_size=None,
            coherence='u_mass',
            topn=20,
            processes=-1,
        )
class topicExtract_lda:
    """Topic extraction over one corpus with both NMF and LDA.

    Attributes set by ``__init__``:
        wordDict: gensim ``Dictionary`` built from the tokenized documents.
        corpus_docs: bag-of-words representation of every document.
        nmf / W: the fitted ``NMF`` model and the matrix returned by
            ``fit_transform`` on the ``corpus2csc`` matrix.
        topics: ``'Topic <i>'`` -> space-joined top NMF words of component i.
        lda2: ``LdaModel`` trained on the same bag-of-words corpus.
    """

    def __init__(self, docs, nTopic=20):
        """
        extract topic using LDA
        input: list of list of words, each list is a token of a doc;
               plain strings are tokenized with word_tokenize first.
               The caller's list is left untouched.
        """
        docs = self._as_token_lists(docs)
        self.wordDict = Dictionary(docs)
        self.corpus_docs = [self.wordDict.doc2bow(doc) for doc in docs]
        corpus_csc = corpus2csc(self.corpus_docs)
        self.nmf = NMF(n_components=nTopic, random_state=42)
        self.W = self.nmf.fit_transform(corpus_csc)
        self.topics = {
            'Topic ' + str(i): ' '.join(list(self.get_topic_words(i)[1].values()))
            for i in range(nTopic)
        }
        self.lda2 = LdaModel(corpus=self.corpus_docs, id2word=self.wordDict,
                             num_topics=nTopic, update_every=1,
                             chunksize=1000, passes=4, random_state=24)

    @staticmethod
    def _as_token_lists(docs):
        """Return a new list in which every str document is tokenized."""
        return [
            word_tokenize(doc) if isinstance(doc, str) else doc
            for doc in docs
        ]

    def get_topic_words(self, component_number):
        """
        NMF topics with a gensim corpus represented by component vectors

        Returns (sorted_idx, component_words): the row indices of the five
        largest entries in column ``component_number`` of ``W``, and a dict
        mapping each of those weights to the dictionary word at that index.
        Because the dict is keyed by weight, entries with identical weights
        collapse into one.
        """
        weights = self.W[:, component_number]
        sorted_idx = np.argsort(weights)[::-1][:5]
        component_words = {
            weights[number]: self.wordDict[number] for number in sorted_idx
        }
        return sorted_idx, component_words

    def get_doc_components(self, doc_number):
        """Map the three highest-weighted component indices of a document
        (column ``doc_number`` of ``nmf.components_``) to their weights."""
        column = self.nmf.components_[:, doc_number]
        sorted_idx = np.argsort(column)[::-1][0:3]
        return {number: column[number] for number in sorted_idx}

    def get_document_details(self, doc_number):
        """Print the top components of a document and return their words."""
        results = []
        for item, val in self.get_doc_components(doc_number).items():
            print("document is composed of topic %d with weight %.4f" % (item, val))
            results.append(self.get_topic_words(item)[1])
        return results

    def show_lda(self, doc_num, threshold=0.05, nWord=5):
        """Return the LDA topics of a document whose weight exceeds
        ``threshold``, each as ``{(topic, weight): top nWord words}``."""
        return [
            {(topic, weight): self.lda2.show_topic(topic, topn=nWord)}
            for topic, weight in self.lda2[self.corpus_docs[doc_num]]
            if weight > threshold
        ]

    def showTopics(self, nWord=4):
        """Print every LDA topic with its ``nWord`` top words."""
        output = self.lda2.show_topics(num_topics=-1, num_words=nWord)
        for i in output:
            print(i)
def allot_topics(self, topic_num: int, review_dict_list: list) -> OrderedDict:
    """Train an LDA model on the reviews and return its topics.

    The corpus and dictionary come from ``self._make_property``. The result
    maps each topic id in ``range(topic_num)``, in ascending order, to that
    topic's ``show_topic`` output.
    """
    corpus, dictionary = self._make_property(review_dict_list)
    lda = LdaModel(corpus=corpus, num_topics=topic_num, id2word=dictionary)
    return OrderedDict(
        (topic_id, lda.show_topic(topic_id)) for topic_id in range(topic_num)
    )
def ldamodel(doc_clean, n_topics, n_words, description, tfidfmodel=False, unseen_docs=None):
    """Train an LDA model on ``doc_clean``, print diagnostics and save it.

    Steps: tokenize every document with ``min_char(doc).split()``, build the
    dictionary and bag-of-words corpus, run ``compute_coherence_values``
    (its result is not used here), optionally re-weight the corpus with
    TF-IDF, train the model, print the topics, bound, perplexity and c_v
    coherence, optionally score ``unseen_docs``, call ``print_result`` and
    finally persist corpus, dictionary and model.

    :param doc_clean: iterable of documents accepted by ``min_char``
    :param n_topics: number of topics to print and to pass to ``print_result``
    :param n_words: number of words printed per topic
    :param description: prefix of the three output file names
    :param tfidfmodel: when true, train on the TF-IDF weighted corpus
    :param unseen_docs: optional list of token lists to run inference on
    """
    doc_clean = [min_char(doc).split() for doc in doc_clean]
    dictionary = corpora.Dictionary(doc_clean)

    # Converting list of documents (corpus) into Document Term Matrix using
    # dictionary prepared above.
    corpus = [dictionary.doc2bow(doc) for doc in doc_clean]
    compute_coherence_values(dictionary=dictionary, corpus=corpus,
                             texts=doc_clean, start=2, limit=40, step=6)

    if tfidfmodel:
        tfidf = TfidfModel(corpus, id2word=dictionary, smartirs='ntc')
        corpus = tfidf[corpus]

    # NOTE(review): the model is always trained with 16 topics while the
    # loop below iterates over n_topics; show_topic will fail for
    # n_topics > 16. Confirm whether num_topics should be n_topics.
    model = LdaModel(corpus, num_topics=16, id2word=dictionary,
                     random_state=1, passes=50, per_word_topics=True)

    print("#Tópicos LDA")
    for topic_id in range(n_topics):
        terms = model.show_topic(topic_id, n_words)
        print("Topic #" + str(topic_id) + ": ",
              ", ".join([term + '*' + str(weight) for term, weight in terms]))

    print('Bound: ', model.bound(corpus))
    # Compute Perplexity
    print('Perplexity: ', model.log_perplexity(corpus))
    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=model, texts=doc_clean,
                                         dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    if unseen_docs:
        corpus_new = [dictionary.doc2bow(doc) for doc in unseen_docs]
        for raw_doc, unseen_doc in zip(unseen_docs, corpus_new):
            topic = None
            score = 0
            # with per_word_topics=True the first element of the result is
            # the list of (topic id, score) pairs
            inference_doc = model[unseen_doc]
            print(raw_doc)
            for index, tmpScore in inference_doc[0]:
                if tmpScore > score:
                    score = tmpScore
                    topic = model.print_topic(index, 5)
            print ("Score: {}\t Topic: {}".format(score, topic))
        print("Log perplexity for new corpus is", model.log_perplexity(corpus_new))

    print_result(model, doc_clean, corpus, n_topics, description)

    # close the pickle file deterministically instead of leaking the handle
    with open(description + '.pkl', 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)
    dictionary.save(description + 'dictionary.gensim')
    model.save(description + '_ldamodel.gensim')
def getPageID2TopicDist(docs, n_topics):
    """
    Train LDA on the documents and return each document's topic distribution.

    :docs=[(docID,[word1,word2,..]),...]
    :param n_topics: Number of topics to generate from the data
    :return: [docid]=[0.9,0.1] # sequential topic probability; topics the
        model does not report for a document stay at 0.0
    """
    # prepare the corpus in Gensim format
    texts = [doc[1] for doc in docs]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # apply LDA on the corpus
    lda = LdaModel(corpus, id2word=dictionary, num_topics=n_topics, passes=4)

    # print the second element of each of the top 8 show_topic entries
    # NOTE(review): whether that element is the term or its weight depends
    # on the gensim version's tuple order — confirm.
    for i in range(lda.num_topics):
        print([tup[1] for tup in lda.show_topic(topicid=i, topn=8)])

    # get the topic distribution in each document
    pageID2TopicDist = dict()
    for doc, doc_bow in zip(docs, corpus):
        dlist = [0.0] * n_topics
        for tup in lda[doc_bow]:
            dlist[tup[0]] = tup[1]
        pageID2TopicDist[doc[0]] = dlist

    return pageID2TopicDist
def topic_choosing(number):
    """Return the average pairwise cosine distance between LDA topics.

    An LDA model with ``number`` topics is trained on the module-level
    ``bow_corpus`` / ``word_dictionary``. Each topic is represented by the
    weights of its 25 top terms (0 for terms that are not among a topic's
    top 25), and the cosine distance is computed between every pair of
    topic vectors.

    Two defects of the previous version are fixed:

    * the topic count was taken from ``len(print_topics())``, which returns
      at most 20 topics by default, so models with more topics were only
      partly compared;
    * the vectors were reshaped to ``(-1, 1)``, which made
      ``cosine_distances`` compare every single term with every other term
      instead of comparing the two topic vectors.

    :param number: number of topics to train the model with
    :return: mean of the pairwise topic distances (``nan`` when there are
        fewer than two topics, because there are no pairs to average)
    """
    lda_temp = LdaModel(bow_corpus, num_topics=number, id2word=word_dictionary)

    # weight of the top 25 terms of every topic
    topic_terms = {
        topic_number: dict(lda_temp.show_topic(topic_number, topn=25))
        for topic_number in range(lda_temp.num_topics)
    }
    X = pd.DataFrame(topic_terms).fillna(0)

    # distance between every pair of topics over the shared term axis
    distance = []
    for first, second in combinations(X.columns, 2):
        x1 = np.asarray(X[first]).reshape(1, -1)
        x2 = np.asarray(X[second]).reshape(1, -1)
        distance.append(cosine_distances(x1, x2))

    # average distance over all topic pairs
    average_distance = np.average(distance)
    return average_distance
def extract_topics(model: LdaModel) -> list:
    """
    Deduce, for every topic of the model, a label that covers the content of
    the topic's words. Candidates are selected through the wikipedia API and
    rated according to the following paper:
    http://www.aclweb.org/anthology/P11-1154

    Args:
        model: The trained Lda model

    Returns:
        List of topic labels ordered by topic cluster
    """

    def label_for(topic_id):
        """Pick and print the best label for one topic."""
        words = model.show_topic(topic_id, topn=10)
        titles = get_wikipedia_titles([entry[0] for entry in words])  # TODO alter?
        possible_labels = chunk(titles)
        print(possible_labels)
        labels_content = retrieve_content(possible_labels)
        best_topic = rate_labels(possible_labels, labels_content, words)
        print(best_topic, words)
        print('-' * 80)
        return best_topic

    return [label_for(topic_id) for topic_id in range(model.num_topics)]
def extract_topic(file_handle, pass_fail=False):
    """Create free-text metadata JSON from file indicating topic and some
    human-readable indication of its content.

    :param file_handle: open, readable text file object (``.read()`` is
        called on it)
    :param pass_fail: (bool) whether to exit after ascertaining file class
    :returns: (dict) metadata dictionary with the normalized ``"topics"``
        and the ``"tags"`` (words of the highest-weighted topic)
    :raises ExtractionFailed: if the file yields no tokens, or no token is
        known to the training corpus
    :raises ExtractionPassed: if ``pass_fail`` is true and tokens were found
    """
    tokenizer = RegexpTokenizer(r'[a-zA-Z]{3,}')
    # NOTE(review): '<.+>' is greedy, so on a line with several tags
    # everything between the first '<' and the last '>' is removed —
    # confirm this is intended.
    tag_remover = re.compile('<.+>')

    doc = tag_remover.sub('', file_handle.read())
    doc = tokenizer.tokenize(doc)

    # if the doc is an empty list, it clearly can't be topic modeled
    if not doc:
        raise ExtractionFailed
    elif pass_fail:
        raise ExtractionPassed

    doc_bow = dictionary.doc2bow(doc)
    topics = lda_model[doc_bow]

    # if no words are common to the training corpus, topics will be an
    # empty list
    if not topics:
        raise ExtractionFailed

    # normalize topics to sum to 1, as they are usually just short
    sum_topics = sum(prob for _, prob in topics)
    topics = [[topic_num, prob / sum_topics] for topic_num, prob in topics]

    max_topic = max(topics, key=lambda pair: pair[1])[0]
    topic_words = [
        str(w[0]) for w in LdaModel.show_topic(lda_model, max_topic)
    ]

    return {"topics": topics, "tags": topic_words}
#for topic in ldamodel.print_topics(num_topics=4, num_words=30): # print (topic[0]+1, " ", topic[1],"\n") #ldamodel.save('topic_comments_lda.model') #MAIN PLOT viz = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary) pyLDAvis.save_html(viz, '650000_20t.html') #ldamodel.save('TM_lda_1000_4t.model') #FUGURES fig = plt.figure(figsize=(15, 30)) for i in range(20): df = pd.DataFrame(ldamodel.show_topic(i), columns=['term', 'prob']).set_index('term') # df=df.sort_values('prob') plt.subplot(10, 2, i + 1) plt.title('topic ' + str(i + 1)) sns.barplot(x='prob', y=df.index, data=df, label='Cities', palette='GnBu_d') plt.xlabel('probability') #plt.show() fig.savefig('650000_20t.png')
# Train a 2-topic LDA model on the TF-IDF weighted corpus.
n_topics = 2
lda = LdaModel(corpus_tfidf, id2word=dictionary, iterations=50, num_topics=n_topics)
#lda.save('/Users/roysourish/Desktop/SENG 607/corpus_apps/batphone_0.01.model')
#plt.plot(lda,n_topics)
#plt.show()
#run_model('data.json', field='abstract', model='lda_online', r_ldavis=True, output_file=True)
#names.to_csv(r'/Users/roysourish/Desktop/SENG 607/np.txt', header=None, index=None, sep=' ', mode='a')

## word lists
# Print the first element of each of the 5 top entries of every topic.
for i in range(0, n_topics):
    temp = lda.show_topic(i, 5)
    terms = []
    for term in temp:
        terms.append(term)
    print("Top 5 terms for topic #" + str(i) + ": " + ", ".join(str(i[0]) for i in terms))
#lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

# Query document: lower-cased and split on whitespace before doc2bow.
doc = "maven-release-plugin], release, prepare, zxing-2.2, rollback, changes, c++, german, inspection, remove, issue, pdf417, add, test, remove, update, c++, port, issue, fix,issue, fix, add, use, remove"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lda[vec_bow]  # project the query into the LDA topic space (the name is a leftover from an LSI version)
print(vec_lsi)
# NOTE(review): the model was trained on corpus_tfidf, but the index is
# built from `corpus` and the query is a plain bag-of-words — confirm.
index = similarities.MatrixSimilarity(lda[corpus])
def LDA_pd(data=data_path,
           list_keys=keywords,
           num_topics=num_topics,
           iterations=iterations,
           alpha=alpha,
           eta=eta,
           embeddings=embeddings,
           top=topn,
           output_path=output,
           use_keywords=use_keywords):
    """Run (partial-data) LDA and write a report, a CSV and return the model.

    With ``use_keywords`` the word lists come from ``word_lists(data,
    list_keys)``; otherwise LDA is performed on all data via
    ``word_lists_no_keywords(data)``. The report is written to
    ``output_path + '.output'``; topics and scores go to the module-level
    ``output_csv`` path. Bigrams and coherence reporting are controlled by
    the module-level ``bigrams`` and ``coherence`` flags.

    The function now honours its own ``data``, ``list_keys`` and ``top``
    arguments (the previous body silently read the module globals
    ``data_path``, ``keywords`` and ``topn`` instead) and closes the report
    file when done.

    :param data: data source handed to the word-list helpers
    :param list_keys: initial keywords
    :param num_topics: number of LDA topics
    :param iterations: LDA ``iterations`` argument
    :param alpha: LDA ``alpha`` argument
    :param eta: LDA ``eta`` argument
    :param embeddings: when true, supplement the keywords with similar words
        from the embedding model at ``model_path``
    :param top: number of similar words requested per keyword
    :param output_path: prefix of the report file name
    :param use_keywords: whether to restrict the data by keywords
    :return: the trained ``LdaModel``
    """
    # number of keywords before any supplementing, as used in the report
    n_keywords = len(list_keys) if list_keys else 0

    with open(output_path + '.output', 'w') as report:
        report.write("Generating {} topics from {} initial keywords \n".format(
            num_topics, n_keywords))
        report.write(
            "LDA model parameters:\n(1) alpha {}\n(2) eta {}\n(3) running {} iterations. \n"
            .format(alpha, eta, iterations))

        if use_keywords:  # if false, LDA is performed on all data (NOT Partial Data LDA)
            # NOTE(review): the word lists are built before the keyword list
            # is supplemented below, so the added words only appear in the
            # report — confirm this order is intended.
            data_words = list(word_lists(data, list_keys))
            report.write("Standard set of keywords includes:\n" +
                         ', '.join(i for i in list_keys))
            if embeddings:
                display_log("Loading word embeddings")
                model = load_model(model_path)
                most_similar = grab_most_similar(list_keys, model=model, top=top)
                list_keys = add_similar(list_keys, most_similar)
                report.write("Supplemented keyword list includes:\n" +
                             ', '.join(i for i in list_keys))
                report.write('\n')
                report.write(
                    "Top {} most similar words added from word emeddings (if found) \n"
                    .format(top))
        else:
            data_words = list(word_lists_no_keywords(data))
        display_log("Created data word list of size {}".format(
            str(len(data_words))))

        # generate bigrams
        if bigrams:
            data_words = make_bigrams(data_words)
            display_log("Created bigrams word list")
            report.write("Topic integrates bigrams.\n\n")

        # create dictionary
        id2word = corpora.Dictionary(data_words)
        display_log("Created dictionary")

        # TDF
        corpus = [id2word.doc2bow(text) for text in data_words]
        display_log("Created corpus")

        # LDA model
        lda_model = LdaModel(corpus=corpus,
                             id2word=id2word,
                             num_topics=num_topics,
                             random_state=100,
                             update_every=1,
                             chunksize=60,
                             passes=25,
                             alpha=alpha,
                             eta=eta,
                             iterations=iterations)
        display_log("Created LDA model")

        topic_header = ["Topic " + str(i + 1) for i in range(num_topics)]
        # after the transpose, index 0 holds the words and index 1 the scores
        topic_array = np.array(
            [lda_model.show_topic(i) for i in range(num_topics)]).T

        report.write("Topics\n-----------------------\n")
        report.write(
            tabulate(topic_array[0], headers=topic_header, tablefmt='github'))
        report.write("\n\n")
        report.write("Similarity Scores\n-----------------------\n")
        report.write(
            tabulate(topic_array[1], headers=topic_header, tablefmt='github'))
        report.write("\n\n")
        display_log("printed table into output file " + output_path)

        # one (word, score) column pair per topic, side by side
        df_all = pd.DataFrame()
        topics_transposed = topic_array.T
        for i in range(num_topics):
            new = pd.DataFrame(topics_transposed[i],
                               columns=['Topic ' + str(i), 'score'])
            df_all = pd.concat([df_all, new], axis=1)
        df_all.to_csv(output_csv, index=False, encoding='utf-16')
        display_log("Exported topics and scores into csv file " + output_csv +
                    '.csv')

        # coherence for LDA-PDs
        if coherence:
            coherence_model_lda = CoherenceModel(model=lda_model,
                                                 texts=data_words,
                                                 dictionary=id2word,
                                                 coherence='c_v')
            coherence_lda = coherence_model_lda.get_coherence()
            report.write(
                "Coherence and Preplexity Scores\n-----------------------\n")
            report.write(
                "LDA-PD Model with {} keywords: \n Perplexity: {} \n Coherence: {}"
                .format(n_keywords, lda_model.log_perplexity(corpus),
                        coherence_lda))
            display_log("Coherence and Perplexity calculated, see " +
                        output_path + '.output')

    display_log("Log saved in " + output_path + '.log')
    display_log("Output saved in " + output_path + '.output')
    display_log("Topics saved in " + output_path + '.csv')
    return lda_model
# print keywords in n topics sorted(model_lda.print_topics(), key=lambda x: x[1]) # In[87]: # print keywords in n topics sorted(model_lda.print_topics(), key=lambda x: x[0]) # In[88]: # show_topic() returns n most important/relevant words, and their weights, that comprise given topic pprint(model_lda.show_topic(1, topn=10)) # In[89]: pprint(model_lda.show_topics(num_topics=5, num_words=10)) # ### Evaluate - model #1 # In[91]: # calculate perplexity metrics perplexity = model_lda.log_perplexity(corpus_train)
class LdaPersistent(object):
    """
    - save LDA model
    - rank words within topics
    - write ranked words to db

    The corpus object must be a CreateCorpus instance; its ``dictionary``
    attribute is used as the model's id2word mapping.
    """

    def __init__(self, corpus, num_topics):
        """Train an LDA model with ``num_topics`` topics on ``corpus``.

        :raises TypeError: if ``corpus`` is not a CreateCorpus instance
        """
        if not isinstance(corpus, CreateCorpus):
            raise TypeError("Initialization object must be of class CreateCorpus")
        self.corpus = corpus
        self.num_topics = num_topics
        print("Generating LDA Model...")
        self.model = LdaModel(
            self.corpus,
            num_topics=self.num_topics,
            id2word=self.corpus.dictionary)

    def saveLda(self, model_file_path, dictionary_file_path):
        """
        save model object for loading later

        A falsy path selects the default location ('./lda-model' for the
        model, './lda-dictionary' for the dictionary).
        """
        print("Pickling model object...")
        self.model.save(model_file_path or './lda-model')
        self.corpus.dictionary.save(dictionary_file_path or './lda-dictionary')

    def _dbConnect(self, connect_file, database):
        """
        connect to specified MongoDb database
        requires json document of form:
        {"connect-string": "mongodb://<connect-string-here>"}

        On failure the message is printed and the exception is re-raised;
        previously execution continued and failed later on an unbound name.
        """
        with open(connect_file, 'rb') as cnf:
            connection_string = json.load(cnf)['connect-string']
        try:
            connection = pymongo.MongoClient(connection_string)
        except ValueError:
            print("Connection to remote MongoDB client failed")
            raise
        try:
            self.db = getattr(connection, database)
        except AttributeError:
            print("Specified database not found in MongoDB Client")
            raise

    def _topic_collection(self, topic):
        """Return the db collection that stores the words of one topic."""
        return getattr(self.db, 'topic_' + str(topic))

    def _topicCollectionsGen(self):
        """
        generator function yielding (word, score) for every topic
        warning - this iteratively updates a database connection attribute
        (``self._db_collection`` points at the current topic's collection)
        """
        topn = len(self.corpus.dictionary)
        for topic in range(self.num_topics):
            self._db_collection = self._topic_collection(topic)
            # show_topic entries are unpacked as (score, word) here
            for score, word in self.model.show_topic(topicid=topic, topn=topn):
                yield word, score

    def dbInsert(self, connect_file, database):
        """
        Insert the word and word-score for each topic into MongoDB collection
        Each topic has its own collection that scores each word in the
        Dictionary. Every topic collection is indexed on "word" afterwards.
        """
        print("Loading words and word-ranks into MongoDB...")
        self._dbConnect(connect_file, database)
        # NOTE(review): Collection.insert is a legacy pymongo call; consider
        # insert_one once the minimum supported pymongo version is known.
        for word, score in self._topicCollectionsGen():
            self._db_collection.insert({"word": word, "score": score})
        # index each topic collection once instead of relying on whichever
        # collection the generator touched last
        for topic in range(self.num_topics):
            self._topic_collection(topic).create_index("word")

    def _wordCollectionsGen(self):
        """Yield one document per dictionary word holding its score in every
        topic collection, and point ``self._db_collection`` at the
        ``word_topic_mappings`` collection."""
        self._db_collection = getattr(self.db, "word_topic_mappings")
        for word in self.corpus.dictionary.values():
            doc = {"word": word}
            for topic in range(self.num_topics):
                subdoc = {}
                # if several rows match, the last one wins
                for row in self._topic_collection(topic).find({"word": word}):
                    subdoc["score"] = row["score"]
                doc['topic_' + str(topic)] = subdoc
            yield doc

    def dbETL(self):
        """Write the per-word documents into ``word_topic_mappings`` and
        index that collection on "word"."""
        print("Loading data into new collection indexed by word...")
        for doc in self._wordCollectionsGen():
            self._db_collection.insert(doc)
        self._db_collection.create_index("word")
class LDA_parser():
    """
    This class implements a wrapper pipeline for text preprocessing and LDA
    parsing of an input corpus in the form ['str','str','str', ... ].
    """

    def __init__(self,
                 corpus='',
                 language='english',
                 preprocessor_type="spacy",
                 tags=["DET", "PUNCT", "NUM", "SYM", "SPACE"],
                 custom_filter=[],
                 lemmatize=False,
                 stem=False,
                 min_len=2,
                 num_topics=10,
                 passes=100):
        """
        Parses the input text into a suitable format, then performs all LDA
        extraction tasks. It expects the input corpus to be a list of texts.
        A single str is treated as a raw corpus and handed to the
        preprocessor's str-corpus routine. An empty str (the default) only
        prints a warning and leaves the parser unfitted.

        @ params:
            @ corpus: Input corpus in ['str','str','str', ... ] format, where
              each entry is a document of type str. Alternatively, a str
              format input (not recommended).
            @ preprocessor_type: Use nltk-based ("nltk") or spaCy-based
              ("spacy") preprocessor
            @ language: language to use in the preprocessor
            @ tags: POS tags to filter (accepted but not used by this class
              as written)
            @ custom_filter: words to filter (accepted but not used by this
              class as written)
            @ lemmatize: use lemmatization when preprocessing a raw str corpus
            @ stem: use stemming when preprocessing a raw str corpus
            @ min_len: minimum length of words kept by the preprocessor
            @ num_topics: number of topics in the LDA algorithm
            @ passes: number of training epochs in the LDA
        """
        print("Initializing model...\n")
        if preprocessor_type == "nltk":
            print("NLTK preprocessor selected.")
            self.preprocessor = nltk_preprocessor(language=language)
        if preprocessor_type == "spacy":
            print("spaCy preprocessor selected.")
            self.preprocessor = spacy_preprocessor(language=language)

        self.language = language  # input language
        self.raw_corpus = ""  # simply stores the input if in str type
        self.clean_corpus = []  # preprocessed corpus, one entry per document
        self.dictionary = None  # holds a corpora.Dictionary representation of corpus
        self.doc2bow_corpus = None  # doc2bow vector of each document in the corpus
        self.lda_model = None  # LDA model trained on the input corpus
        self.topic_mixtures = []  # show_topics() output: (topic id, mixture string)
        self.topics = {}  # topic id -> [(word, probability)], set by extract_topics
        self.topic_words = {}  # topic id -> [word], set by extract_topic_words

        # The empty-corpus check has to come first: '' is a str as well, so
        # it used to fall into the raw-string branch and was fitted.
        if isinstance(corpus, str) and corpus == '':
            print("***WARNING***\nNull Corpus")
        elif isinstance(corpus, str):
            print(
                "***WARNING***\nRaw input (str) received. Text will be sentence-tokenized and parsed accordingly."
            )
            print("Make sure this is intended. \n")
            self.raw_corpus = str(corpus)  # transform input to string
            self.fit(corpus,
                     raw=True,
                     language=language,
                     stem=stem,
                     lemmatize=lemmatize,
                     num_topics=num_topics,
                     passes=passes,
                     min_len=min_len)  # fit corpus as raw
        else:
            # assume input corpus is in the right format
            self.fit(corpus,
                     language=language,
                     stem=stem,
                     lemmatize=lemmatize,
                     num_topics=num_topics,
                     passes=passes,
                     min_len=min_len)

    def fit(self,
            corpus,
            raw=False,
            language='english',
            stem=False,
            lemmatize=False,
            num_topics=10,
            passes=100,
            min_len=2,
            echo_corpus=False):
        """
        Preprocesses the corpus, builds dictionary and doc2bow corpus and
        trains the LDA model. Assumes input corpus is in the right format.

        @args:
            @ corpus = input corpus
            @ raw = treat corpus as one raw str instead of a list of texts
            @ language = input language
            @ stem/lemmatize = if true, stem or lemmatize a raw str corpus
            @ num_topics = number of topics to choose in the algorithm
            @ passes = number of epochs of the LDA
            @ min_len = minimum length of words to consider when
              preprocessing words
            @ echo_corpus = print the corpus before fitting
        """
        if echo_corpus:
            print("CORPUS: {}".format(corpus))

        t0 = time.time()
        print("Fitting LDA topic modelling...")
        self.raw_corpus = corpus  # input corpus as is
        self.language = language  # in case initial language changed

        if raw:
            print("Preprocessing corpus...(raw)")
            self.clean_corpus = self.preprocessor.preprocess_str_corpus(
                corpus, stem=stem, lemmatize=lemmatize, min_len=min_len)
        else:
            print("Preprocessing corpus...")
            # honour the caller's min_len (it used to be hard-coded to 2)
            self.clean_corpus = self.preprocessor.preprocess_texts(
                self.raw_corpus, min_len=min_len)  # preprocess text list

        print("Creating corpora dictionary...")
        self.dictionary = corpora.Dictionary(self.clean_corpus)

        print("Translating doc2bow corpus...")
        self.doc2bow_corpus = [
            self.dictionary.doc2bow(text) for text in self.clean_corpus
        ]

        print("Running LDA...")
        self.lda_model = LdaModel(self.doc2bow_corpus,
                                  num_topics=num_topics,
                                  id2word=self.dictionary,
                                  passes=passes)

        # string representation of the mixtures of all topics
        self.topic_mixtures = self.lda_model.show_topics(num_topics=-1,
                                                         num_words=10)
        t1 = time.time()
        print("\nDone in {:.3f} seconds.".format(t1 - t0))

    def print_topics(self, words_per_topic=5):
        """
        Displays the topics in string format
        """
        for topic in self.lda_model.print_topics(num_words=words_per_topic):
            print(topic)

    def extract_topics(self, max_words_per_topic=50, threshold=0.005):
        """
        Returns all topics as a dictionary, where the key is the topic
        number, and the value is a list of (word, probability) tuples for at
        most max_words_per_topic words whose probability is at least
        threshold. The result is also stored in self.topics.

        @params:
            @ max_words_per_topic: Maximum topic mixture component words to
              consider.
            @ threshold: select words whose density is at least this value
        """
        topics = {
            entry[0]: [
                tup for tup in self.lda_model.show_topic(
                    entry[0], topn=max_words_per_topic)
                if tup[1] >= threshold
            ]
            for entry in self.topic_mixtures
        }
        self.topics = topics  # update attribute
        return topics

    def extract_topic_words(self, max_words_per_topic=50, threshold=0.005):
        """
        Returns all topics as a dictionary, where the key is the topic
        number, and the value is a list of at most max_words_per_topic words
        with probability at least as high as threshold. The result is also
        stored in self.topic_words.
        """
        topics = {
            entry[0]: [
                tup[0] for tup in self.lda_model.show_topic(
                    entry[0], topn=max_words_per_topic)
                if tup[1] >= threshold
            ]
            for entry in self.topic_mixtures
        }
        self.topic_words = topics  # update attribute
        return topics

    def parse_new(self,
                  new_text,
                  top_n_topics=100,
                  top_n_w=30,
                  max_words_per_topic=50,
                  threshold=0.005,
                  verbose=True):
        """
        Parses a new text by obtaining the most likely topics for the new
        input, as well as the respective words. This function should be used
        only after the LDA parser has been fitted.

        @params:
            @ new_text: new input text
            @ top_n_topics: top n topics with largest densities p(topic)
            @ top_n_w: top n words with largest densities
              p(word) = p(word|topic)*p(topic)
            @ verbose: display information
            @ max_words_per_topic: maximum words per topic
            @ threshold: only consider words with density greater than
              threshold

        @returns: a tuple of
            @ n_most_likely_words: the top_n_w words ranked by
              p(word|topic)*p(topic), taken from the 10 top words of every
              document topic
            @ top_n_topics: the top n (topic, probability) pairs
            @ top_n_words: de-duplicated words of those top topics
              (unordered)
            @ doc_topics: all topics related to the document
            @ doc_topic_words: all words from all topics associated with
              the document
        """
        # extract topic words to ensure they are there
        self.extract_topic_words(max_words_per_topic, threshold)

        new_text_clean = self.preprocessor.preprocess_sentence(new_text)
        new_doc_bow = self.dictionary.doc2bow(new_text_clean)
        doc_topics = self.lda_model.get_document_topics(new_doc_bow)

        # all words from every topic of the document
        doc_topic_words = [
            word for tup in doc_topics for word in self.topic_words[tup[0]]
        ]

        best_topics = nlargest(top_n_topics, list(doc_topics),
                               key=lambda x: x[1])
        top_n_words = list(
            set(word for tup in best_topics
                for word in self.topic_words[tup[0]]))

        # weight every topic word by the probability of its topic:
        # p(w) = p(w|topic)p(topic)
        words_with_probs = []
        for topic_tup in doc_topics:
            topic_id = topic_tup[0]
            topic_prob = topic_tup[1]
            for word_tup in self.lda_model.show_topic(topic_id, topn=10):
                words_with_probs.append((word_tup[0], word_tup[1] * topic_prob))

        # the n most likely words according to their individual probabilities
        n_most_likely_words = [
            tup[0]
            for tup in nlargest(top_n_w, words_with_probs, key=lambda x: x[1])
        ]

        if verbose:
            print("\nLOGS: \n")
            print("*** Most likely topic: ***\n", best_topics)
            print("*** Words for most likely topic: ***\n", top_n_words)
            print("*** All topics: ***\n", doc_topics)
            print("*** All topics words: ***\n", doc_topic_words)

        return n_most_likely_words, best_topics, top_n_words, doc_topics, doc_topic_words

    def pickle_save(self, savename="full_LDA_parser.pkl"):
        """ Saves the full model object in pkl format """
        with open(savename, 'wb') as handle:
            pickle.dump(self, handle)

    def save_model(self, name="LDA_model"):
        """
        Saves the LDA model, doc2bow_corpus and dictionary.
        These parameters can be used to instantiate a gensim model, so there
        is no load in this class.
        """
        dictionary_name = name + "_dictionary.gensim"
        corpus_name = name + "_doc2bow_corpus.pkl"
        model_name = name + ".gensim"

        with open(corpus_name, 'wb') as handle:
            pickle.dump(self.doc2bow_corpus, handle)  # save the doc2bow_corpus
        self.dictionary.save(dictionary_name)  # save corpus dictionary mapping
        self.lda_model.save(model_name)  # save the full model
plt.ylabel('Number of topics') fim.savefig('50t_Topic_Popularity_on_Articles.png') #plt.show() #FIGURE SHOWING THE MOST POPULAR TOPICS: import gensim import warnings import pandas as pd import seaborn as sns import matplotlib.pyplot as plt warnings.filterwarnings("ignore") fip = plt.figure(figsize=(15, 30)) for i in range(10): df = pd.DataFrame(ldamodel.show_topic((q.iloc[i]['topics_id'].astype(int)), topn=12), columns=['term', 'prob']).set_index('term') df = df.loc[df['prob'] > 0.005] df plt.subplot(5, 2, i + 1) plt.title('topic ' + str((q.iloc[i]['topics_id'].astype(int)))) sns.barplot(x='prob', y=df.index, data=df, label='Cities', palette='Reds_d') plt.xlabel('probability') fip.savefig('The_Most_Popular_Topics_on_Articles.png') #plt.show() p = result.groupby(['author', 'topics_id']).size().reset_index(
class Lda(Analysis):
    """Topic-modelling analysis built around a gensim ``LdaModel``.

    The instance keeps the vocabulary and corpus produced by ``create_corpus``,
    the trained model, and a set of "selected" topic ids that
    ``interpretation`` renders as a text table.
    """

    def __init__(self):
        self.times_fitted = 0
        self.corpus = None
        self.vocabulary = None
        self.lda = None
        self.nb_docs_trained = 0
        self.selected_clusters = set()

    def fit(self, documents, add_prediction='', field='text', nb_topics=20, normalizing='stem',
            language=DEFAULTLANGUAGE, **kwargs):
        """
        Train the Lda model on the textual data extracted from the given documents (dictionaries)
        under the selected field key, inferring ``nb_topics`` topics/clusters.

        The model is built with ``alpha='auto'``, i.e. the document-topic prior is learned from the
        data; every other gensim hyperparameter (``eta``, ``decay``, ``offset``, ...) keeps the
        library default. ``decay`` and ``offset`` are the Kappa and Tau_0 of Hoffman et al.

        :param documents: the documents (dictionaries) to train on
        :type documents: iterable
        :param add_prediction: when not empty, every training document is mutated by storing the
            string form of its predicted topic distribution under this key (see ``predict``).
            This needs ``documents`` to be re-iterable; a one-shot generator is already exhausted
            by the time predictions are added.
        :type add_prediction: str
        :param field: the requested dictionary/document key pointing to the data. If 'all' is given
            then the concatenation of all the dictionary values with newlines is used
        :type field: str
        :param nb_topics: the number of clusters/topics to assume. Controls granularity
        :type nb_topics: int
        :param normalizing: 'lemmatize' performs word net lemmatization with the default pos noun
            ('n') (NOTE: only supported for english), 'stem' performs stemming with the porter
            stemmer, anything else uses the input words as they are
        :type normalizing: str
        :param language: language of the documents, important for preprocessing
        :type language: str
        :param kwargs: accepted for interface compatibility; currently not used

        :References:
            * https://radimrehurek.com/gensim/models/ldamodel.html : gensim.models.ldamodel
            * https://www.di.ens.fr/~fbach/mdhnips2010.pdf : Hoffman et al
        """
        self.vocabulary, self.corpus = create_corpus(documents, field=field, normalizing=normalizing,
                                                     language=language)
        print('Training Lda model ...')
        # alpha can be also set to 'symmetric' or to an explicit array
        self.lda = LdaModel(corpus=self.corpus, num_topics=nb_topics, alpha='auto')
        self.nb_docs_trained = len(self.corpus)
        # Honour the documented switch: annotate the training documents with
        # the freshly trained model's predictions.
        if add_prediction != '':
            self.predict(documents, add_prediction=add_prediction, field=field)

    def predict(self, documents, add_prediction='', field='text'):
        """
        Infer the topic distribution of every document with the trained model.

        :param documents: the documents (dictionaries) to run inference on
        :type documents: iterable
        :param add_prediction: when not empty, each document is mutated by storing the string form
            of its topic distribution under this key
        :type add_prediction: str
        :param field: the dictionary/document key pointing to the text data
        :type field: str
        :return: one model output per document, in input order
        :rtype: list
        """
        docs_lda = []
        for doc in documents:
            docs_lda.append(self.lda[get_bow(extract_data(doc, field=field), self.corpus)])
            if add_prediction != '':
                doc[add_prediction] = str(docs_lda[-1])
        return docs_lda

    def update(self, documents, field='text'):
        """Placeholder for incremental training; currently does nothing."""
        pass
        # corp = CorpusCreator.create_corpus(documents, field=field, normalizing=self.corpus.normalizer)
        # print('Updating model ...')
        # self.lda.update((get_bow(text_data, corp) for text_data in get_data_generator(documents, field=field)))

    def interpretation(self, prec=3):
        """
        Render the currently selected topics as a text table, one column per topic.

        :param prec: number of decimals used when printing probabilities
        :type prec: int
        :return: a header line with the topic ids followed by one line per top word; only the
            header's newline when no topic is selected
        :rtype: str
        """
        ordered_selected_clusters = [_id for _id in range(self.lda.num_topics)
                                     if _id in self.selected_clusters]
        body, max_len = self.get_rows([self.lda.show_topic(i) for i in ordered_selected_clusters],
                                      prob_precision=prec)
        # Each id is padded so it spans the width of its column: token + ' ' + '0.' + prec digits.
        header = ' - '.join('{}'.format(idd) + ' ' * (3 + prec + max_len - len(str(idd)))
                            for idd in ordered_selected_clusters) + '\n'
        return header + body

    def get_rows(self, top, prob_precision=3):
        """
        Format the top words of several topics side by side.

        :param top: one sequence of (token id, probability) pairs per topic; the number of rows is
            taken from the first topic
        :type top: list
        :param prob_precision: number of decimals used when printing probabilities
        :type prob_precision: int
        :return: the formatted rows (each ending in a newline) and the length of the longest token;
            ``('', 0)`` when there is nothing to show
        :rtype: tuple
        """
        # Nothing selected (or empty topics): avoid max() on an empty sequence.
        if not top or not top[0]:
            return '', 0

        nb_rows = len(top[0])

        def token_of(entry):
            # The model yields token ids; the vocabulary maps them back to words.
            return self.vocabulary[int(entry[0])]

        max_token_len = max(len(token_of(topic[i])) for topic in top for i in range(nb_rows))

        def cell(entry):
            token = token_of(entry)
            padded = str(token) + ' ' * (max_token_len - len(token))
            return '{} '.format(padded) + "{1:.{0}f}".format(prob_precision, entry[1])

        body = ''.join(' | '.join(cell(topic[i]) for topic in top) + '\n' for i in range(nb_rows))
        return body, max_token_len

    def select_topics(self, topic_ids):
        """Use this method to indicate which topics/clusters you are interested in for "selecting"
        (i.e. interpreting, visualizing) by providing your desired numerical ids. Note that it only
        adds the new ids to the set of currently "selected" topics. To clear the set before
        selecting use "clear_selected_topics" first.

        :param topic_ids: the numerical ids of the topics/clusters to add to the "selected" set
        :type topic_ids: list
        """
        self.selected_clusters.update(topic_ids)

    def deselect_topics(self, topic_ids):
        """Use this method to indicate which topics/clusters you are NOT interested in "selecting"
        (i.e. for interpreting, visualizing..) by providing the numerical ids to exclude from the
        selected set. It removes the found ids from the set.

        :param topic_ids: the numerical ids of the topics/clusters to remove from the "selected" set
        :type topic_ids: list
        """
        self.selected_clusters.difference_update(topic_ids)

    def select_all_topics(self):
        """Use this method in case you want to "select" (i.e. for interpreting, visualizing..) all
        topics/clusters inferred by the model. It adds all numerical topic/cluster ids to the
        "selected" set."""
        self.selected_clusters.update(range(self.lda.num_topics))

    def clear_selected_topics(self):
        """Use this method to clear the selection of topics/clusters. It removes all numerical ids
        from the "selected" set."""
        self.selected_clusters.clear()
# Bag-of-words vector for every tokenised document.
corpus = [words.doc2bow(tokens) for tokens in doc_list]

# Create LDA model
lda = LdaModel(
    corpus=corpus,
    id2word=words,
    num_topics=3,
    random_state=2,
    update_every=1,
    passes=20,
    alpha='auto',
    per_word_topics=True,
)

# Pickle that too
# with open('pickles/movie_lda_1.pickle', 'wb') as f:
#     pickle.dump(lda, f)

# NOTE: from here on ``lda`` is the previously pickled model; the model
# trained just above is replaced by it.
with open('pickles/movie_lda_1.pickle', 'rb') as model_file:
    lda = pickle.load(model_file)

topic_summaries = lda.print_topics(num_words=10)
pprint(topic_summaries)

# Generate wordclouds: one figure per topic, built from its 200 top words.
for t in range(lda.num_topics):
    word_weights = dict(lda.show_topic(t, 200))
    wc = WordCloud(width=800, height=400)
    wc.fit_words(word_weights)
    plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
# for topic in ldamodel.print_topics(num_topics=4, num_words=30):
#     print (topic[0]+1, " ", topic[1],"\n")
# ldamodel.save('topic_comments_lda.model')

# Main plot: interactive pyLDAvis view exported as a standalone HTML page.
viz = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
pyLDAvis.save_html(viz, '50t.html')
# ldamodel.save('TM_lda_1000_4t.model')

# Figures: a 10 x 5 grid with one bar plot per topic (50 topics).
fig = plt.figure(figsize=(30, 60))
for i in range(50):
    top_terms = ldamodel.show_topic(i, topn=15)
    df = pd.DataFrame(top_terms, columns=['term', 'prob']).set_index('term')
    # df = df.sort_values('prob')
    # Drop terms whose probability is too small to be worth a bar.
    is_visible = df['prob'] > 0.005
    df = df.loc[is_visible]
    plt.subplot(10, 5, i + 1)
    # Panels are titled 1-based although topic ids are 0-based.
    plt.title('topic {}'.format(i + 1))
    sns.barplot(x='prob', y=df.index, data=df, label='Cities', palette='GnBu_d')
    plt.xlabel('probability')
# plt.show()
fig.savefig('50t.png')
# for i in tweets[list(tweets.keys())[2]].split("|||"):
#     words =[word for word in nltk.word_tokenize(i) if word not in STOPWORDS and word.isalnum() and len(word)>=2]
#     words_list.append(words)

num_topics = 3

# Vocabulary and bag-of-words corpus built from the tokenised documents.
dictionary = corpora.Dictionary(words_list)
corpus = [dictionary.doc2bow(tokens) for tokens in words_list]

lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics)

### output1: topics and corresponding words
pp = pprint.PrettyPrinter(indent=4)
topic_summaries = lda.print_topics(num_words=10)
pp.pprint(topic_summaries)

### output2: 2 ways of showing one topic and corresponding words
lda.print_topic(topicno=0)
lda.show_topic(1)

### output3: show topic of one user (even new user), most probable topic first
user_topics = lda.get_document_topics(
    corpus[100],
    minimum_probability=0,
    per_word_topics=False,
)
sorted(user_topics, key=lambda pair: pair[1], reverse=True)

### output4: visualize LDA
lda_display = pyLDAvis.gensim.prepare(
    lda,
    corpus,
    dictionary,
    R=15,
    sort_topics=False,
)
pyLDAvis.display(lda_display)
print (word_list) print ('********************************************************************************************************') print (tweet_clean_fin) print (len(tweet_clean_fin)) from gensim.models.ldamodel import LdaModel from gensim.corpora.dictionary import Dictionary dictionary = Dictionary(tweet_clean_fin) print("\n --- dictionary \n",dictionary) bow_vectors = [dictionary.doc2bow(text) for text in tweet_clean_fin] goodLdaModel=LdaModel(corpus=bow_vectors,id2word=dictionary,iterations=50,num_topics=6) print('\n --- goodLdaModel: all topics in result ordered by significance \n') all_goos_topics=goodLdaModel.print_topics(-1) print(all_goos_topics) print("\n---goodLdaModel.print_topics(num_topics=6,num_words=12 \n") print(goodLdaModel.print_topics(num_topics=6,num_words=16)) %%time import warnings import pandas as pd import seaborn as sns import matplotlib.pyplot as plt warnings.filterwarnings("ignore") fiz=plt.figure(figsize=(30,60)) for i in range(6): df=pd.DataFrame(goodLdaModel.show_topic(i,16),columns=['term','prob']).set_index('term') plt.subplot(6,3,i+1) plt.title('topic'+str(i+1)) sns.barplot(x='prob',y=df.index,data=df,label='Cities',palette='Reds_d') plt.xlabel('probability') plt.show()
class LDAWDF:
    """LDA topic model over the text contents exposed by the database wrapper.

    The wrapper passed to the constructor is used as a context manager
    (``with self.mysql as db``) whenever the database is read or written.
    """
    mysql: mysql.MySQL
    ldamodel: LdaModel
    dictionary = None
    corpus = None

    def __init__(self, mysql):
        """Store the database wrapper and the locations used by save/load."""
        self.mysql = mysql
        self.dataFolder = './data/'
        self.saveFile = 'lda_model'
        self.saveFileDict = 'lda_model_dict'

    def trainFromStart(self):
        """Build the dictionary and corpus from all contents and train a new model."""
        with self.mysql as db:
            content = db.getContentsText()
            # Whitespace tokenisation of every stored content.
            documents = [item['content'].split() for item in content]

        self.dictionary = corpora.Dictionary(documents)
        # Drop tokens present in fewer than 5 documents or in more than half of them.
        self.dictionary.filter_extremes(no_below=5, no_above=0.5)
        doc_term_matrix = [self.dictionary.doc2bow(doc) for doc in documents]
        self.corpus = doc_term_matrix

        # Running and Training LDA model on the document term matrix.
        print("Starting to train LDA Model...")
        self.ldamodel = LdaModel(
            doc_term_matrix,
            num_topics=200,
            id2word=self.dictionary,
            passes=100)

    def printTest(self):
        """Print 10 topics with their 5 top words."""
        print(self.ldamodel.print_topics(num_topics=10, num_words=5))

    def save(self):
        """Persist the model and the dictionary under ``dataFolder``."""
        self.ldamodel.save(self.dataFolder + self.saveFile)
        self.dictionary.save(self.dataFolder + self.saveFileDict)

    def canLoad(self):
        """Return True when both the model file and the dictionary file exist."""
        my_file = Path(self.dataFolder + self.saveFile)
        my_file_dict = Path(self.dataFolder + self.saveFileDict)
        return my_file.is_file() and my_file_dict.is_file()

    def update(self, corpus):
        """Continue training the current model with the given bag-of-words corpus."""
        self.ldamodel.update(corpus)

    def load(self, subfolder=None):
        """Load model and dictionary from ``dataFolder``, optionally from a sub-folder of it."""
        sf = subfolder + '/' if subfolder else ''
        self.ldamodel = LdaModel.load(self.dataFolder + sf + self.saveFile)
        self.dictionary = gensim.corpora.Dictionary.load(self.dataFolder + sf + self.saveFileDict)

    def fillDb(self):
        """Recompute the per-url topics and the per-topic terms and store them in the database."""
        nbTopics = self.ldamodel.get_topics().shape[0]

        # "Old": label every topic with its 3 strongest terms.
        topics = {}
        for topicId in range(0, nbTopics):
            topicTerms = sorted(self.ldamodel.get_topic_terms(topicId, 3),
                                key=lambda x: x[1], reverse=True)
            topics[topicId] = ' '.join(self.dictionary.get(topicTerm[0]) for topicTerm in topicTerms)

        # NOTE(review): `result` (url -> label of the best topic) is built but
        # never written to the database; confirm whether a setter call is missing.
        result = []
        result2 = []
        # Use the wrapper instance; the bare name `mysql` is the imported
        # module, which is not a context manager.
        with self.mysql as db:
            contentsText = db.getContentsText()
            for element in contentsText:
                bow = self.dictionary.doc2bow(element['content'].split())
                docTopics = sorted(
                    self.ldamodel.get_document_topics(bow, minimum_probability=0.05),
                    key=lambda x: x[1], reverse=True)
                if docTopics:
                    result.append((element['url'], topics[docTopics[0][0]]))
                for docTopic in docTopics:
                    result2.append((element['url'], docTopic[0], str(docTopic[1])))
            db.emptyUrlsTopic()
            db.emptyCurrentUrlsTopic()
            db.emptyCurrentUserTags()
            db.setCurrentUrlsTopic(result2)
            db.setPrecalcTopics()

        # "New": store the 5 strongest terms of every topic with their weights.
        terms = []
        for topicId in range(0, nbTopics):
            topicTerms = sorted(self.ldamodel.get_topic_terms(topicId, 5),
                                key=lambda x: x[1], reverse=True)
            for topicTerm in topicTerms:
                terms.append((topicId, self.dictionary.get(topicTerm[0]), str(topicTerm[1])))
        with self.mysql as db:
            db.emptyLdaTopics()
            db.setLdaTopics(terms)

    def get_terms_topics(self, keywords):
        """
        Look up the topics related to the given keywords.

        Only the first 30 keywords are considered, and only those known to the dictionary
        contribute.

        :param keywords: list of tokens
        :return: dict with 'topics' (topic id -> ``show_topic`` output) and 'keywords'
            (token id -> {'word': token, 'topics': ``get_term_topics`` output})
        """
        bow = self.dictionary.doc2bow(keywords[:30])
        topics = {}
        keywordsResult = {}
        for word in bow:
            wordId = word[0]
            wordTopics = self.ldamodel.get_term_topics(wordId, 0.05)
            keywordsResult[wordId] = {'word': self.dictionary.get(wordId), 'topics': wordTopics}
            for wordTopic in wordTopics:
                wordTopicId = wordTopic[0]
                # Describe each topic only once, however many keywords hit it.
                if wordTopicId not in topics:
                    topics[wordTopicId] = self.ldamodel.show_topic(wordTopicId)
        return {'topics': topics, 'keywords': keywordsResult}
# Create word cloud in grey scale at 300 dpi for publication def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs): return "hsl(0, 0%%, %d%%)" % np.random.randint(60, 95) for t in range(lda_model.num_topics): plt.figure() wc = WordCloud(background_color='black', color_func=grey_color_func).fit_words( dict(lda_model.show_topic(t, 200))) plt.imshow(wc) plt.axis("off") plt.title("Topic #" + str(t)) # plt.show() plt.savefig(os.path.join('./results/wordcloud_' + str(num_topics) + '_' + str(t) + '.png'), format='png', dpi=300) else: pass if inference: # Infer topic distribution in the data to find examples # lda_model = last model from above # unpickle another model that you prefer