def train(self):
    split_archives = [article.tokens for article in self.articles]

    # create dictionary and corpus
    dictionary = corpora.Dictionary(split_archives)
    dictionary.filter_extremes(no_above=self.words_no_above)
    corpus = [dictionary.doc2bow(article) for article in split_archives]
    logger.info('Created dictionary and corpus')

    # get eta to force topics
    eta = get_eta(self.num_topics, dictionary)

    # create lda model with gensim
    lda_progress = LDAProgress(self.passes)
    ldamodel = LdaMulticore(corpus,
                            num_topics=self.num_topics,
                            id2word=dictionary,
                            passes=self.passes,
                            per_word_topics=True,
                            iterations=self.iterations,
                            eta=eta,
                            workers=cpu_count())
    lda_progress.close()
    logger.info('Created Topics model')

    # print the topics (debug)
    logger.debug('Topics:')
    topics = ldamodel.print_topics(num_words=5)
    for topic in topics:
        logger.debug(topic)

    self.model = ldamodel
    self.dictionary = dictionary
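# `get_eta` above is project-specific and not shown here. A minimal sketch of
# what such a topic-seeding prior builder could look like, assuming it returns
# a (num_topics x vocab_size) eta matrix that boosts hand-picked seed words;
# the `seed_words` mapping and the boost/base values are hypothetical:
import numpy as np

def get_eta(num_topics, dictionary, seed_words=None, boost=0.9, base=0.01):
    eta = np.full((num_topics, len(dictionary)), base)  # symmetric base prior
    for topic_id, words in (seed_words or {}).items():
        for word in words:
            if word in dictionary.token2id:
                # raise this word's prior in this topic to "force" it
                eta[topic_id, dictionary.token2id[word]] = boost
    return eta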
def createlda(num_topics, filename):
    dumppick(filename)
    texts, texts_tf_idf, dictionary = loadpcik()

    # Topic classification with LSI
    """
    print("**************LSI*************")
    lsi = models.lsimodel.LsiModel(corpus=texts, id2word=dictionary, num_topics=20)  # initialize an LSI transformation
    texts_lsi = lsi[texts_tf_idf]  # transform the corpus in the vector space
    print(lsi.print_topics(num_topics=20, num_words=10))
    """

    # Topic classification with LDA
    print("**************LDA*************")
    #ppl = []
    #for i in range(1, 50, 1):
    #texts = shuffle(texts)
    #texts_train = texts[:int(24012 * 0.9)]
    #texts_vad = texts[int(24012 * 0.9):]
    lda = LdaMulticore(corpus=texts, iterations=1000, id2word=dictionary,
                       num_topics=num_topics, passes=200, per_word_topics=True)
    #texts_lda = lda[texts_tf_idf]
    out = open("./ldamd/{}tpc-tpc".format(num_topics), mode="w", encoding="utf8")
    print(lda.print_topics(num_topics=num_topics, num_words=10), file=out)
    lda.save("./ldamd/{}tpc+{}".format(num_topics, filename[9:18]))
    #ppl.append(np.exp2(-lda.log_perplexity(texts_vad)) / i)
    return lda, texts, texts_tf_idf, dictionary
class LdaProcessor(object):
    def __init__(self, token_docs, **filter_extremes_args):
        """
        token_docs : a list of lists of word, n-gram, or sentence tokens.
            E.g., [['the', 'crazy', 'cat'], ['that', 'doggone', 'dog']]
        """
        self.token_docs = token_docs
        self.id2word = corpora.Dictionary(token_docs)
        if filter_extremes_args:
            print('filtering words with extreme frequencies')
            self.id2word.filter_extremes(**filter_extremes_args)
        # initialize the bow_corpus
        self.reset_bow_corpus(token_docs)
        print('Got %i total tokens (words)' % len(self.id2word))

    def reset_bow_corpus(self, documents):
        """Set or reset the corpus with the given documents."""
        self.bow_corpus = [self.id2word.doc2bow(doc) for doc in documents]
        return None

    def train_lda(self, num_topics, **kwargs):
        print('training LDA...')
        self.lda = LdaMulticore(self.bow_corpus, id2word=self.id2word,
                                num_topics=num_topics, **kwargs)
        return self

    def word_topics(self, num_words=10):
        return [topic[1] for topic in
                self.lda.print_topics(num_topics=self.lda.num_topics,
                                      num_words=num_words)]

    # utility functions
    def significant_topic_terms(self, topicid):
        raise NotImplementedError()
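# A quick usage sketch for LdaProcessor; the toy documents are invented:
docs = [['the', 'crazy', 'cat'], ['that', 'doggone', 'dog'],
        ['crazy', 'dog', 'crazy', 'cat']]
processor = LdaProcessor(docs).train_lda(num_topics=2, passes=10)
print(processor.word_topics(num_words=3))  # top 3 weighted words per topic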
def generate_tags(tokens: list) -> list:
    """Perform LDA topic modelling to acquire tags.

    Args:
        tokens (list): List of token lists.

    Returns:
        tags_list (list): List of appropriate tags for the given tokens.
    """
    id2word = Dictionary(tokens)
    corpus = [id2word.doc2bow(d) for d in tokens]
    model = LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        random_state=42,
        num_topics=10,
        passes=2,
        workers=1
    )
    words = [re.findall(r'"([^"]*)"', t[1]) for t in model.print_topics()]
    wordcount = Counter(words[0] + words[1] + words[2] + words[3] + words[4])
    tags = pd.DataFrame.from_dict(
        wordcount, orient='index', columns=['number']
    )
    tags = tags.drop(tags[tags['number'] <= 1].index)
    tags = tags.sort_values(by=['number'], ascending=False).T
    tags_list = [word for word in tags.columns]
    return tags_list
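# Hedged usage sketch for generate_tags; the token lists are invented, and a
# real corpus would need many more documents to produce stable topics:
sample_tokens = [['economy', 'market', 'stocks', 'trade'],
                 ['market', 'trade', 'economy', 'growth'],
                 ['health', 'vaccine', 'trial', 'market']]
print(generate_tags(sample_tokens))  # e.g. ['market', 'economy', ...]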
def print_terms(self, model: LdaMulticore):
    topics = []
    for topic in model.print_topics(num_topics=self.n_topics, num_words=10):
        topics.append([(s.split('*\"')[1].split('\"')[0],
                        float(s.split('*\"')[0]))
                       for s in str(topic[1]).split('+ ')])
    pprint(topics)
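# The parser above assumes gensim's print_topics string format; a tiny check
# with a hand-written topic tuple:
sample = (0, '0.067*"sleep" + 0.043*"dream" + 0.021*"night"')
print([(s.split('*\"')[1].split('\"')[0], float(s.split('*\"')[0]))
       for s in str(sample[1]).split('+ ')])
# [('sleep', 0.067), ('dream', 0.043), ('night', 0.021)]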
class LdaMaker:
    def __init__(self, corpora, num_topics, print_topics=True):
        self.num_topics = num_topics
        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()
        self.stemmer = nltk.stem.snowball.RussianStemmer()

        corpora_tokenized = [
            self.tokenizer.tokenize(
                self._keep_only_russian_chars(str(doc).lower()))
            for doc in corpora
        ]
        corpora_stemmed = []
        for doc in corpora_tokenized:
            # drop stopwords both before and after stemming
            stemmed_doc = [
                self.stemmer.stem(token) for token in doc
                if token not in ru_stopwords
            ]
            stemmed_doc = [
                token for token in stemmed_doc if token not in ru_stopwords
            ]
            corpora_stemmed.append(stemmed_doc)

        self.dictionary = gensim.corpora.Dictionary(corpora_stemmed)
        corpora_bow = [self.dictionary.doc2bow(doc) for doc in corpora_stemmed]
        # self.tfidf = gensim.models.TfidfModel(corpora_bow)
        # corpora_tfidf = self.tfidf[corpora_bow]
        self.lda = LdaMulticore(num_topics=self.num_topics,
                                corpus=corpora_bow,
                                id2word=self.dictionary)
        if print_topics:
            for s in self.lda.print_topics():
                print(s)

    def get(self, doc):
        doc = self.tokenizer.tokenize(
            self._keep_only_russian_chars(doc.lower()))
        doc = [
            self.stemmer.stem(token) for token in doc
            if token not in ru_stopwords
        ]
        doc = [token for token in doc if token not in ru_stopwords]
        doc = self.dictionary.doc2bow(doc)
        # doc = self.tfidf[doc]
        return self.lda[doc]

    @staticmethod
    def _keep_only_russian_chars(s):
        new_s = ''
        for c in s:
            if 'а' <= c <= 'я' or 'А' <= c <= 'Я':
                new_s += c
            else:
                new_s += ' '
        return new_s
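# Usage sketch for LdaMaker with an invented two-document Russian corpus
# (assumes `ru_stopwords` and the NLTK snowball data are available):
docs = ['кошка спит на диване весь день', 'собака играет во дворе с мячом']
maker = LdaMaker(docs, num_topics=2, print_topics=False)
print(maker.get('кошка спит'))  # [(topic_id, probability), ...]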
def main():
    corpus = []
    dictionary = corpora.Dictionary()
    #tokenized_doc = pd.Series()
    print("start")
    idx = 1
    print("Topics:", NUM_TOPICS)
    print("Docs:", NUM_DOCS - idx)
    while True:
        if idx > NUM_DOCS:
            break
        print("##", idx, "~", idx + DOC_SPLIT - 1, "docs")
        print("docs loading...")
        news_df = get_posts_df(get_coll(), idx, DOC_SPLIT)
        print("docs tokenizing...")
        tokenized_doc = news_df['text'].apply(lambda x: tkn_func(x, idx))
        print("make Dict...")
        dictionary.add_documents(tokenized_doc)
        print("Token to Corpus...")
        corpus += [dictionary.doc2bow(text) for text in tokenized_doc]
        idx += DOC_SPLIT
        get_time()
        print()

    ## single core
    # ldamodel = gensim.models.ldamodel.LdaModel(
    #     corpus,
    #     num_topics=NUM_TOPICS,
    #     id2word=dictionary,
    #     passes=20)  # passes: number of training iterations

    ## multicore
    get_time()
    print("Model Learning...")
    ldamodel = LdaMulticore(corpus,
                            num_topics=NUM_TOPICS,
                            id2word=dictionary,
                            passes=20,
                            workers=4)
    topics = ldamodel.print_topics(num_words=5)  # limit words per topic

    # topics and each word's contribution to them
    for topic in topics:
        print(topic)

    for i, topic_list in enumerate(ldamodel[corpus]):
        if i == 5:
            break
        print(i, "th document's topic proportions:", topic_list)

    get_time()
    print("model saving...")
    save_model(ldamodel, dictionary)
    visual(ldamodel, corpus, dictionary)
    print("end")
def write_results(lda_model: lda.LdaMulticore, df_topic_doc_keywords,
                  topicsFile: str, topicToDocFile: str):
    # Format
    df_dominant_topic = df_topic_doc_keywords.reset_index()
    df_dominant_topic.columns = [
        'Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords',
        'Text'
    ]
    df_selected = df_dominant_topic[['Document_No', 'Dominant_Topic']]
    #print(df_selected.head(100))
    np.savetxt(topicToDocFile, df_selected.values, fmt='%s')
    with open(topicsFile, "w") as file:
        pprint(lda_model.print_topics(-1, 10), file)
def main():
    corpus = []
    dictionary = corpora.Dictionary()
    print("start")
    print("docs loading...")
    df = pd.read_csv("news_data.csv")
    idx = 0
    last = len(df)
    while True:
        if idx >= last:
            break
        print("##", idx, "docs")
        news_df = df.loc[idx:idx, :]
        print("docs tokenizing...")
        tokenized_doc = news_df['text'].apply(lambda x: tkn_func(x, idx))
        print("make Dict...")
        dictionary.add_documents(tokenized_doc)
        print("Token to Corpus...")
        corpus += [dictionary.doc2bow(text) for text in tokenized_doc]
        idx += 1
        get_time()
        print()

    ## single core
    # ldamodel = gensim.models.ldamodel.LdaModel(
    #     corpus,
    #     num_topics=NUM_TOPICS,
    #     id2word=dictionary,
    #     passes=20)  # passes: number of training iterations

    ## multicore
    get_time()
    print("Model Learning...")
    ldamodel = LdaMulticore(corpus,
                            num_topics=NUM_TOPICS,
                            id2word=dictionary,
                            passes=20,
                            workers=4)
    topics = ldamodel.print_topics(num_words=5)  # limit words per topic

    # topics and each word's contribution to them
    for topic in topics:
        print(topic)

    for i, topic_list in enumerate(ldamodel[corpus]):
        if i == 5:
            break
        print(i, "th document's topic proportions:", topic_list)

    get_time()
    print("model saving...")
    save_model(ldamodel, dictionary)
    visual(ldamodel, corpus, dictionary)
    print("end")
def test_lda_model():
    dictionary = Dictionary(TOKEN_SETS)
    bags_of_words = [dictionary.doc2bow(tokens) for tokens in TOKEN_SETS]
    lda = LdaMulticore(corpus=bags_of_words, id2word=dictionary,
                       random_state=723812, passes=10, workers=4)
    response = lda.print_topics()
    assert isinstance(response, list)
    assert isinstance(response[0], tuple)
    assert isinstance(response[0][0], np.int64)
    assert isinstance(response[0][1], str)
    topic_strings = [topic_str for topic_str in response[0][1].split(" + ")]
    assert topic_strings[0] == '0.067*"sleep"'
def learn(corpus):
    dictionary = Dictionary.load('lda.dict')
    lda = LdaMulticore(corpus=corpus, id2word=dictionary,
                       num_topics=NUM_TOPICS, chunksize=10000, passes=5)
    for line in lda.print_topics(NUM_TOPICS):
        print(line)
    lda.save('lda.gensim')
# topic_number, number_of_articles, top_words
#==============================================================================
def get_topic(n):
    doc_lda = model[doc_list[n]]
    current_prob = 0
    for var in doc_lda:
        if var[1] > current_prob:
            current_prob = var[1]
            topic_num = var[0]
    return topic_num, re.sub(r'[+.0123456789\*]', '', topic[topic_num])

doc_list = []
for var in matutils.Sparse2Corpus(X, documents_columns=False):
    doc_list.append(var)

topic = model.print_topics(num_topics=topic_number, num_words=50)

# store topic with probability
with open(folder_name + 'topic_with_prob_' + str(topic_number) + '_topics.txt', 'w') as new:
    for i in range(topic_number):
        new.write('{}\t{}\n'.format(str(i), topic[i]))

fin_sum = []
for i in range(len(doc_list)):
    fin_sum.append(get_topic(i)[0])

topic_count = co.Counter(fin_sum)

#path = '/Users/royyang/Desktop/trending_project/re_categorization_ehow/top_words_28topics.txt'
path = folder_name + 'top_words_for_' + str(topic_number) + '_topics.txt'
stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

# add tokens to list
other_texts.append(stemmed_tokens)
other_corpus = [dictionary.doc2bow(text) for text in other_texts]
# unseen_doc = other_corpus[2]
# vector = ldamodel[unseen_doc]
# print(vector)

# generate LDA model -------------------------------------------------------
my_loop_num_topics = [2, 5, 8, 10, 15, 20, 25, 30, 35, 40, 45, 50, 100]
for i in my_loop_num_topics:
    my_num_topics = i
    print(my_num_topics)
    # ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=my_num_topics, id2word=dictionary, passes=20)
    myldamodel = LdaMulticore(corpus, num_topics=my_num_topics,
                              id2word=dictionary, workers=3,
                              alpha=1e-5, eta=5e-1)
    print(myldamodel.print_topics(num_topics=my_num_topics, num_words=5))
    print(myldamodel.log_perplexity(corpus))
    print(myldamodel.log_perplexity(other_corpus))
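# gensim's log_perplexity returns a per-word likelihood bound; the usual
# conversion to a perplexity figure (as gensim's docs describe) is 2**(-bound):
import numpy as np
print("perplexity:", np.exp2(-myldamodel.log_perplexity(other_corpus)))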
# dictionary = Dictionary(token_stream(NOVELS_DIRPATH))
dictionary.filter_extremes(no_below=10, no_above=0.66)  # excludes terms like "the", "to", "and", "of", "i", etc.
print("-------------")
print("TOKENS", len(dictionary.token2id), list(dictionary.token2id.items())[0:4], "...")

bags_of_words = [dictionary.doc2bow(tokens) for tokens in token_stream(NOVELS_DIRPATH)]
print("-------------")
print("BAGS OF WORDS (CORPUS)", len(bags_of_words), bags_of_words[0])

lda = LdaMulticore(corpus=bags_of_words, id2word=dictionary,
                   random_state=723812, num_topics=15, passes=10, workers=4)
print("-------------")
print("LDA MODEL", type(lda))

results = lda.print_topics()
print("-------------")
print("TOPICS (RAW RESULTS)...")
print(results)

parsed_topics = parse_topics(lda)
print("-------------")
print("TOPICS (PARSED RESULTS)...")
pprint(parsed_topics)

# h/t: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#11createthedictionaryandcorpusneededfortopicmodeling
topics = lda[bags_of_words]
print(topics[0])
#> [(4, 0.3149784), (7, 0.47801575), (13, 0.20485382)]

# a measure of how good the model is. lower the better.
print("Perplexity:", lda.log_perplexity(bags_of_words))
#> -7.74115184561741
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model
my_num_topics = 30
# ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=my_num_topics, id2word=dictionary, passes=20)
ldamodel = LdaMulticore(corpus, num_topics=my_num_topics, id2word=dictionary,
                        workers=3, alpha=1e-5, eta=5e-1)
print(ldamodel.print_topics(num_topics=my_num_topics, num_words=5))
print(corpus[0])
print(corpus[1])
print(corpus[2])
print(ldamodel[corpus[0]])
print(ldamodel[corpus[1]])
print(ldamodel[corpus[2]])
print(ldamodel.print_topics(20))

#----------------------------------------------------------------------
new_texts_set = [
    'comedy collection comedy favorites',
    'alternative punk blue',
    'human game computer house'
]
print(type(dictionary), type(corpus))

# path where the dtm binary is installed
dtm_path = "/home/ankit081190/NLP/dtm/dtm/dtm"
#model = DtmModel(dtm_path, corpus, time_seq, num_topics=1,
#                 id2word=corpus.dictionary, initialize_lda=True)
model = LdaMulticore(corpus, num_topics=10, id2word=dictionary)
model.save("DTModelMultiCore_" + files + ".model")

# Gives top 25 topics
tp = model.show_topics(num_topics=25, log=False, formatted=True)
print(model.print_topics(num_topics=25))

data = pyLDAvis.gensim.prepare(model, corpus, dictionary)
pyLDAvis.save_html(data, 'index_lda_' + files + '.html')

cnt = Counter(tp)
with codecs.open("topicsMultiLDA" + files + ".txt", "w", "utf-8") as f:
    for i, j in cnt:
        print(i, j)
        f.write("\nFor Topic Number " + str(i) + ":\n" + str(j) + "\n")

#for i, j in cnt:
#    print("\nFor topic number: ", i, "\n")
#    print(j)
#for i in range(0, model.num_topics-1):
start_time = time.time()
model = LdaMulticore(
    matutils.Sparse2Corpus(X, documents_columns=False),
    num_topics=7,
    passes=10,
    id2word=dict([(i, s) for i, s in enumerate(vocab)]),
    workers=7,
)
print("--- %s seconds ---" % (time.time() - start_time))

# Get all topics from training
doc_list = []
for var in matutils.Sparse2Corpus(X, documents_columns=False):
    doc_list.append(var)

topic = model.print_topics(num_topics=7, num_words=10)

fin_sum = []
for i in range(len(doc_list)):
    fin_sum.append(get_topic(i)[0])

topic_count = co.Counter(fin_sum)

for i, var in enumerate(topic):
    [i, str(re.sub(r'[+.0123456789\*]', '', var)), topic_count[i]]

# [topic, topic_words, doc_title]
for i in range(100):
    [get_topic(i), titles[i]]

#help(model)
def start(num_topics, kind):
    data = loader.load_data(kind)
    df = pd.DataFrame(data)
    cleaner.clean(df)

    nlps = {
        'it': spacy.load('it_core_news_lg'),
        'en': spacy.load('en_core_web_lg'),
        'fr': spacy.load('fr'),
        'de': spacy.load('de')
    }
    tokenizers = {
        'it': Tokenizer(nlps['it'].vocab),
        'en': Tokenizer(nlps['en'].vocab),
        'fr': Tokenizer(nlps['fr'].vocab),
        'de': Tokenizer(nlps['de'].vocab)
    }

    # Customize stop words by adding to the default list
    stop_words = []
    stop_words += nlps['it'].Defaults.stop_words
    stop_words += nlps['en'].Defaults.stop_words
    stop_words += nlps['fr'].Defaults.stop_words
    stop_words += nlps['de'].Defaults.stop_words
    stop_words += s.ALL_STOPWORDS
    stop_words = set(stop_words)

    # ALL_STOP_WORDS = spacy + gensim + wordcloud
    ALL_STOP_WORDS = stop_words.union(SW).union(stopwords)

    cleaner.remove_stopwords(df, tokenizers, ALL_STOP_WORDS)
    cleaner.lemmas(df, nlps)
    tok.tokenize_text(df)

    # Create an id2word dictionary
    id2word = Dictionary(df['lemma_tokens'])
    print(len(id2word))

    # Filter extremes
    id2word.filter_extremes(no_below=2, no_above=.99)
    print(len(id2word))

    # Create a corpus object
    corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]

    # Instantiate a base LDA model
    base_model = LdaMulticore(corpus=corpus, num_topics=num_topics,
                              id2word=id2word, workers=12, passes=5)

    # Extract the topic words
    words = [re.findall(r'"([^"]*)"', t[1]) for t in base_model.print_topics()]

    # Create topics
    topics = [' '.join(t[0:10]) for t in words]

    # Print the topics
    for id, t in enumerate(topics):
        print(f"------ Topic {id} ------")
        print(t, end="\n\n")

    # Compute perplexity: a measure of how good the model is; lower is better
    base_perplexity = base_model.log_perplexity(corpus)
    print('\nPerplexity: ', base_perplexity)

    # Compute coherence score
    coherence_model = CoherenceModel(model=base_model,
                                     texts=df['lemma_tokens'],
                                     dictionary=id2word,
                                     coherence='c_v')
    coherence_lda_model_base = coherence_model.get_coherence()
    print('\nCoherence Score: ', coherence_lda_model_base)

    lda_display = pyLDAvis.gensim.prepare(base_model, corpus, id2word)
    d = pyLDAvis.display(lda_display)

    today = date.today()
    directory_path = f"/home/marco/Scrivania/tirocinio-unicredit/lda-html/{kind}/{today}/"
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)
    with open(f"{directory_path}{num_topics}.html", 'w') as f:
        f.write(d.data)

    vectorizer = CountVectorizer()
    data_vectorized = vectorizer.fit_transform(df['lemmas_back_to_text'])

    # Define search params
    search_params = {
        'n_components': [10, 15, 20, 25, 30],
        'learning_decay': [.5, .7, .9]
    }

    # Init the model
    lda = LatentDirichletAllocation()

    # Init the grid search class
    model = GridSearchCV(lda, param_grid=search_params)

    # Do the grid search
    model.fit(data_vectorized)

    # Best model
    best_lda_model = model.best_estimator_

    # Model parameters
    print("Best Model's Params: ", model.best_params_)

    # Log likelihood score
    print("Best Log Likelihood Score: ", model.best_score_)

    # Perplexity
    print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
class LDA():
    corpus = None
    model = None
    dictionary = None
    util = None
    loaded = False
    topicLabelling = defaultdict(int)

    def __init__(self, utilObj=None, logfilename=None):
        if utilObj is not None:
            self.util = utilObj
        elif logfilename is not None:
            self.util = Utilities.Utility()
            self.util.setupLogFileLoc(logfilename)
        self.util.startTimeTrack()

    def labelTopics(self, modelFilename):
        if os.path.exists(modelFilename + '.label'):
            f = open(modelFilename + '.label', "rb")
            self.topicLabelling = pickle.load(f)
            f.close()
        else:
            # Label file not available, performing manual labelling. (One-time operation)
            topics = self.model.show_topics(num_topics=100, num_words=20)
            print('You will be shown a series of words and asked to label the topic in the form of an integer\n')
            for topic in topics:
                print('The words affiliated with this topic are as follows\n', topic[1])
                print('\033[92m' + 'Please label as one of these \n(0) EDUCATION\n(1) SKILLS\n(2) PERSONAL DETAILS\n(3) WORK EXPERIENCE' + '\033[0m')
                mappedTopicInt = input('Please enter a new integer for this topic: ')
                self.topicLabelling[topic[0]] = mappedTopicInt
            f = open(modelFilename + '.label', "wb")
            pickle.dump(self.topicLabelling, f)
            f.close()

    def buildCorpus(self, folderListOfCorpus=None, maxdocs=-1):
        """
        For each folder
            for each cvd2v in folder
                get tokens from Utility, tokenise, and form them into a string
                append the string to a list (this forms a document)
        """
        self.util.logDebug('LDA', 'Building and fitting corpus ')
        documentList = []
        maxDocPerFolder = int(maxdocs / len(folderListOfCorpus.split(',')))
        docCounter = 0
        for folder in folderListOfCorpus.split(','):
            self.util.logDebug('LDA', 'Processing ' + folder)
            for filename in sorted(glob.iglob(folder + '/*.cvd2v')):
                if docCounter <= maxDocPerFolder:
                    fileContent = self.util.tokensToStr(
                        self.util.tokenize(
                            self.util.readFileContent(filename=filename),
                            removeStopwords=True,
                            toLowercase=True,
                            replaceSlash=True,
                            flatEmail=True,
                            flatMonth=True,
                            flatNumber=True,
                            lemmatize=True), ' ')
                    documentList.append(fileContent)
                    docCounter = docCounter + 1
                else:
                    docCounter = 0
                    break
        self.util.logDebug('LDA', str(len(documentList)) + ' documents loaded in ' + self.util.stopTimeTrack())
        texts = [[word for word in document.lower().split()] for document in documentList]
        self.util.logDebug('LDA', 'No of vocab words: ' + str(len(texts)))
        self.util.logDebug('LDA', 'Text example: ' + str(texts[0]))
        self.dictionary = Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]
        self.util.logDebug('LDA', 'Corpus built in ' + self.util.stopTimeTrack())

    def trainModel(self, noOfTopics=4, dstFilename=None):
        workers = 30
        eval_every = 10
        iterations = 400
        passes = 20
        self.util.logDebug('LDA', 'Training model...')
        self.model = LdaMulticore(self.corpus,
                                  workers=workers,
                                  num_topics=noOfTopics,
                                  id2word=self.dictionary,
                                  eval_every=None,
                                  iterations=iterations,
                                  passes=passes)
        self.util.logDebug('LDA', 'Model trained in ' + self.util.stopTimeTrack())
        print(self.model.print_topics())
        self.saveModel(dstFilename)
        self.loaded = True

    def saveModel(self, filename):
        self.util.logDebug('LDA', 'Saving model to ' + filename)
        self.model.save(filename)
        self.dictionary.save(filename + '.dict')
        MmCorpus.serialize(filename + '.corpus', self.corpus)
        self.util.logDebug('LDA', 'Saved in ' + self.util.stopTimeTrack())

    def loadModel(self, filename):
        self.util.logDebug('LDA', 'Loading model from ' + filename)
        self.model = LdaMulticore.load(fname=filename)
        self.dictionary = Dictionary.load(fname=filename + '.dict')
        self.corpus = MmCorpus(filename + '.corpus')
        print(self.dictionary)
        print(self.model.print_topic(0, topn=5))
        print(self.model.print_topic(1, topn=5))
        print(self.model.print_topic(2, topn=5))
        print(self.model.print_topic(3, topn=5))
        self.loaded = True
        self.util.logDebug('LDA', 'Model loaded in ' + self.util.stopTimeTrack())
        self.labelTopics(filename)

    def getTopTopic(self, inferenceOutput):
        thisDict = defaultdict(int)
        probList = []
        for topic, prob in inferenceOutput:
            thisDict[str(prob)] = topic
            probList.append(prob)
        largestProb = max(probList)
        mostLikelyTopic = thisDict[str(largestProb)]
        return mostLikelyTopic

    def infer_topic_proba(self, string):
        import numpy as np
        prediction = [0.0, 0.0, 0.0, 0.0]
        if self.loaded:
            bow = self.dictionary.doc2bow(self.util.tokenize(string))
            results = self.model.get_document_topics(bow)
            for result in results:
                prediction[result[0]] = result[1]
        else:
            self.util.logError('LDA', 'Model is not loaded, cannot infer')
        prediction = np.array(prediction)
        return prediction

    def infer_topic(self, string):
        results = None
        if self.loaded:
            bow = self.dictionary.doc2bow(self.util.tokenize(string))
            results = self.model.get_document_topics(bow)
        else:
            self.util.logError('LDA', 'Model is not loaded, cannot infer')
        results = self.getTopTopic(results)
        return results

    def visualizeLDA(self, filename):
        dictionary = Dictionary.load(filename + '.dict')
        corpus = MmCorpus(filename + '.corpus')
        lda = LdaMulticore.load(filename)
        self.util.logDebug('LDA', 'Preparing HTML ')
        ldavis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
        self.util.logDebug('LDA', 'HTML prepared in ' + self.util.stopTimeTrack())
        pyLDAvis.save_html(ldavis, filename + '.html')
        self.util.logDebug('LDA', 'HTML saved in ' + self.util.stopTimeTrack())

# lda = LDA(logfilename='/home/kah1/test.log')
# lda.loadModel('/u01/bigdata/02d_d2vModel1/CvLda4TopicModel.model')
# lda.labelTopics()
#==============================================================================
def get_topic(n):
    doc_lda = model[doc_list[n]]
    current_prob = 0
    for var in doc_lda:
        if var[1] > current_prob:
            current_prob = var[1]
            topic_num = var[0]
    return topic_num, re.sub(r'[+.0123456789\*]', '', topic[topic_num])

doc_list = []
for var in matutils.Sparse2Corpus(X, documents_columns=False):
    doc_list.append(var)

topic = model.print_topics(num_topics=9, num_words=50)

# store topic with probability
with open('/Users/royyang/Desktop/trending_project/re_categorization_ls/topic_with_prob.txt', 'w') as new:
    for i in range(9):
        new.write('{}\t{}\n'.format(str(i), topic[i]))

fin_sum = []
for i in range(len(doc_list)):
    fin_sum.append(get_topic(i)[0])

topic_count = co.Counter(fin_sum)

#path = '/Users/royyang/Desktop/trending_project/re_categorization_ehow/top_words_28topics.txt'
path = '/Users/royyang/Desktop/trending_project/re_categorization_ls/top_words_9topics.txt'
#     # if x in idmap:
#     #     return x
#     # else:
#     #     return -1
#     for idx, (doc_id, document) in enumerate(corpus.documents.items()):
#         if idx % 1000 == 0:
#             logger.info("remapping: %d documents finished" % idx)
#         # corpus.documents[doc_id] = [check_and_replace(oldid) for oldid in document]
#         corpus.documents[doc_id] = [idmap[oldid] for oldid in document if oldid in idmap]

corpus.save_tbmm_corpus(args.corpus_filename)

if args.train_lda:
    # from gensim.models.ldamodel import LdaModel
    from gensim.models.ldamulticore import LdaMulticore
    # setting metadata to False is required because of the way the
    # log-perplexity code requires the output of get_texts to be.
    corpus.metadata = False
    lda = LdaMulticore(workers=19, corpus=corpus, id2word=corpus.dictionary,
                       num_topics=20, eval_every=100, chunksize=100, passes=5)
    lda.print_topics(20)
    lda.save(args.corpus_filename + ".tbmm_lda.model")
def main():
    df = read_forum_json('json/levergunscommunity.com.json')
    corpus, dictionary = generate_corpus(df)
    lda = LdaMulticore(corpus, num_topics=20, id2word=dictionary, workers=3)
    lda.print_topics(num_topics=20, num_words=20)
#                           workers=3,
#                           eval_every=eval_every)

# Build LDA model
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                             id2word=id2word,
#                                             num_topics=35,
#                                             random_state=100,
#                                             update_every=1,
#                                             chunksize=100,
#                                             passes=10,
#                                             alpha='auto',
#                                             per_word_topics=True)

pprint(model.print_topics())
doc_lda = model[corpus]
doc_lda[4]
model.get_document_topics(corpus)[1]

# Compute perplexity: a measure of how good the model is; lower is better.
print('\nPerplexity: ', model.log_perplexity(corpus))

# Compute coherence score
coherence_model_lda = CoherenceModel(model=model, texts=texts,
                                     dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_multicore, corpus, dictionary)
vis
tokens.append(doc_tokens)

# Makes tokens column
df['tokens'] = tokens

id2word = Dictionary(df['tokens'])
id2word.filter_extremes(no_below=2, no_above=.99)
corpus = [id2word.doc2bow(d) for d in df['tokens']]

# Instantiating a base LDA model
base_model = LdaMulticore(corpus=corpus, num_topics=10, id2word=id2word,
                          workers=12, passes=5)

words = [re.findall(r'"([^"]*)"', t[1]) for t in base_model.print_topics()]
topics = [' '.join(t[0:10]) for t in words]

# Getting the topics
for id, t in enumerate(topics):
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

p = pyLDAvis.gensim.prepare(base_model, corpus, id2word)
pyLDAvis.save_html(p, 'biden_lda.html')
base_model.save('biden_model.gensim')
biden_df = df
sentence_length = [len(tokens) for tokens in clean]
#len([i for i in sentence_length if i > 100])

# LDA model - select key concerns
dictionary = corpora.Dictionary(clean)
dictionary.filter_extremes(no_below=100, no_above=0.5, keep_n=10000)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in clean]
tfidf = models.TfidfModel(doc_term_matrix)
corpus_tfidf = tfidf[doc_term_matrix]

ldamodel = LdaMulticore(corpus_tfidf, num_topics=7, id2word=dictionary, passes=100)
print(*ldamodel.print_topics(num_topics=7, num_words=20), sep='\n')

lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False)

# keywords dictionary
# key_lda = {'bonus',
#            'business',
#            'career',
#            'change',
#            'collaboration',
#            'communication',
#            'consumer',
#            'cost',
#            'customer',