def get_lda_model(X, y):
    from sklearn.decomposition import LatentDirichletAllocation

    # Build LDA Model
    lda_model = LatentDirichletAllocation(
        n_components=20,          # Number of topics
        max_iter=10,              # Max learning iterations
        learning_method='online',
        random_state=100,         # Random state
        batch_size=128,           # n docs in each learning iter
        evaluate_every=-1,        # compute perplexity every n iters, default: don't
        n_jobs=-1,                # Use all available CPUs
    )
    lda_output = lda_model.fit_transform(X, y)
    print(lda_model)  # Model attributes

    from pprint import pprint

    # Log likelihood: the higher the better
    print("Log Likelihood: ", lda_model.score(X, y))

    # Perplexity: the lower the better. Perplexity = exp(-1. * log-likelihood per word)
    # Note: perplexity() takes only the document-term matrix; passing y as well raises an error.
    print("Perplexity: ", lda_model.perplexity(X))

    # See model parameters
    pprint(lda_model.get_params())

    return lda_model
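# Illustrative call to get_lda_model (not part of the original code): the toy
# documents and CountVectorizer settings below are assumptions used only to show
# how the document-term matrix X might be produced.
from sklearn.feature_extraction.text import CountVectorizer

example_docs = [
    "topic models uncover latent themes in text",
    "latent dirichlet allocation assigns words to topics",
    "count vectorizers build the document term matrix",
]
example_vectorizer = CountVectorizer(stop_words='english')
example_X = example_vectorizer.fit_transform(example_docs)
example_model = get_lda_model(example_X, y=None)  # y is accepted but ignored by LDA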
def applyLDA2(self, number_of_clusters, country_specific_tweets):
    train, feature_names = self.extractFeatures(country_specific_tweets, False)

    name = "lda"
    if self.results:
        print("Fitting LDA model with tfidf", end=" - ")
    t0 = time()

    lda = LatentDirichletAllocation(n_components=number_of_clusters,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(train)

    if self.results:
        print("done in %0.3fs." % (time() - t0))

    parameters = lda.get_params()
    topics = lda.components_
    doc_topic = lda.transform(train)
    top10, labels = self.printTopicCluster(topics, doc_topic, feature_names)
    labels = numpy.asarray(labels)

    if self.results:
        print("Silhouette Coefficient {0}: {1}".format(name, metrics.silhouette_score(train, labels)))

    return name, parameters, top10, labels
def LDA_sklearn(text_data, num_topics, iterations, visualization=False, gridsearch=False):
    vectorizer = OwnCountVectorizer(max_df=0.95,
                                    min_df=2,
                                    stop_words='english',
                                    lowercase=True,
                                    token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}',
                                    ngram_range=(2, 3),
                                    decode_error='ignore')
    vectorized_text_data = vectorizer.fit_transform(text_data)

    lda_model = LatentDirichletAllocation(n_components=num_topics,
                                          max_iter=iterations,
                                          learning_method='online',
                                          random_state=100,
                                          batch_size=120,
                                          evaluate_every=-1,
                                          n_jobs=-1)
    lda_output = lda_model.fit_transform(vectorized_text_data)
    print(lda_model)  # model attributes
    print('Log likelihood: ', lda_model.score(vectorized_text_data))   # log-likelihood: the higher the better
    print('Perplexity: ', lda_model.perplexity(vectorized_text_data))  # perplexity = exp(-1. * log-likelihood per word): the lower the better
    pprint(lda_model.get_params())  # see model parameters

    # GridSearch the best model
    search_params = {'n_components': [41, 45, 50, 55, 60],
                     'learning_decay': [.5, .7, .9]}
    lda = LatentDirichletAllocation()                    # initialize the model
    model = GridSearchCV(lda, param_grid=search_params)  # initialize the gridsearch class
    model.fit(vectorized_text_data)                      # do the grid search
    best_lda_model = model.best_estimator_               # best model
    print('Best parameters: ', model.best_params_)       # best parameters
    print('Best Log-likelihood score: ', model.best_score_)
    print('Model perplexity: ', best_lda_model.perplexity(vectorized_text_data))

    # Compare LDA model performance scores
    # Get log-likelihoods from the gridsearch output, one series per learning_decay value
    n_topics = [41, 45, 50, 55, 60]
    cv_scores = model.cv_results_['mean_test_score']
    cv_params = model.cv_results_['params']
    log_likelihoods_5 = [round(score) for score, p in zip(cv_scores, cv_params) if p['learning_decay'] == 0.5]
    log_likelihoods_7 = [round(score) for score, p in zip(cv_scores, cv_params) if p['learning_decay'] == 0.7]
    log_likelihoods_9 = [round(score) for score, p in zip(cv_scores, cv_params) if p['learning_decay'] == 0.9]

    # Show graph
    plt.figure(figsize=(10, 8))
    plt.plot(n_topics, log_likelihoods_5, label='0.5')
    plt.plot(n_topics, log_likelihoods_7, label='0.7')
    plt.plot(n_topics, log_likelihoods_9, label='0.9')
    plt.title('Gridsearch output on choosing optimal LDA model')
    plt.xlabel('Number of topics')
    plt.ylabel('Log likelihood scores')
    plt.legend(title='Learning decay', loc='best')
    plt.show()

    if visualization:
        panel = pyLDAvis.sklearn.prepare(lda_model, vectorized_text_data, vectorizer, mds='tsne')
        pyLDAvis.show(panel)
    else:
        return lda_output[0]  # for verification that it works
def embeddings_LDA(data):
    n_features = 1000

    # use scikit-learn implementation
    # https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=0.2,
                                    max_features=n_features,
                                    stop_words=None)
    tf = tf_vectorizer.fit_transform(data)
    lda = LatentDirichletAllocation(n_components=3, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(tf)

    print("\nTopics in LDA model:")
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words=100)

    params = lda.get_params()
    print(params)

    # Show topic distribution over words
    # https://stackoverflow.com/questions/44208501/getting-topic-word-distribution-from-lda-in-scikit-learn
    topic_embeddings = lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]
    print(topic_embeddings)

    # Project with t-SNE.
    # t-SNE is fine for user embeddings; it will be harder for product embeddings (too many dimensions).
    # perplexity must be smaller than the number of samples, and there are only n_components topic rows here.
    tsne = TSNE(n_components=2, verbose=0,
                perplexity=min(40, topic_embeddings.shape[0] - 1),
                n_iter=300)
    tsne_results = tsne.fit_transform(topic_embeddings)

    N = 10000
    df = pd.DataFrame(tsne_results)
    rndperm = np.random.permutation(df.shape[0])
    df_subset = df.loc[rndperm[:N], :].copy()
    df_subset['tsne-one'] = tsne_results[:, 0]
    df_subset['tsne-two'] = tsne_results[:, 1]

    plt.figure(figsize=(16, 4))
    ax = plt.subplot(1, 3, 3)
    # No label column is available here, so no hue mapping is passed to seaborn.
    sns.scatterplot(
        x="tsne-one", y="tsne-two",
        data=df_subset,
        legend="full",
        alpha=0.3,
        ax=ax
    )
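# print_top_words is referenced above but not defined in this snippet; the sketch
# below follows the helper from the scikit-learn topic-extraction example linked
# in the comment, and is only an assumed implementation.
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()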
def perform_lda_analysis(txtDir='', numOfTxts=None, numOfTopics=5, maxIter=20,
                         learningMode='online', randomState=100, batchSize=128,
                         evaluateEvery=-1, nJobs=-1):
    """
    :param txtDir: directory containing the cleaned text files
    :param numOfTxts: an integer or None for selecting all files
    :param numOfTopics: number of topics
    :param maxIter: maximum number of learning iterations
    :param learningMode: 'online' or 'batch'
    :param randomState: random seed
    :param batchSize: number of documents in each learning iteration
    :param evaluateEvery: compute perplexity every n iterations (-1: don't)
    :param nJobs: number of CPUs to use (-1: all available)
    :return: dict with the LDA output, log likelihood, perplexity and model parameters
    """
    warnings.simplefilter("ignore", DeprecationWarning)

    txtLst = []
    for fname in os.listdir(txtDir)[:numOfTxts]:
        with codecs.open(os.path.join(cfg.pwc['cleanTxtDir'], fname), 'r', 'utf-8-sig') as fh:
            txt = get_content_words(fh.read())
            txtLst.append(txt)

    vectorizer = CountVectorizer(analyzer='word',
                                 min_df=4,
                                 lowercase=True,
                                 token_pattern='[a-zA-Z0-9]{3,}')
    dataVector = vectorizer.fit_transform(txtLst)
    dataDense = dataVector.todense()
    print("Sparsity: ", ((dataDense > 0).sum() / dataDense.size) * 100, "%")

    lda_model = LatentDirichletAllocation(n_components=numOfTopics,
                                          max_iter=maxIter,
                                          learning_method=learningMode,
                                          random_state=randomState,
                                          batch_size=batchSize,
                                          evaluate_every=evaluateEvery,
                                          n_jobs=nJobs)
    lda_result = lda_model.fit_transform(dataVector)

    results = {
        'result': lda_result,
        'logLikelihood': lda_model.score(dataVector),    # the higher the better
        'perplexity': lda_model.perplexity(dataVector),  # the lower the better
        'params': lda_model.get_params()
    }
    pprint(results)
    return results
def get_model_metrics(model: LatentDirichletAllocation, doc_mat: np.array):
    """
    Print basic diagnostics for a fitted LDA model.

    Args:
        model (LatentDirichletAllocation): fitted LDA model
        doc_mat (np.array): document-term matrix the model was fitted on

    Returns:
        None
    """
    print(doc_mat.shape)
    print('Perplexity: ', model.perplexity(doc_mat))
    print('Log likelihood', model.score(doc_mat))
    print('Params', model.get_params())
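# Illustrative call to get_model_metrics (assumed setup, not from the original
# code): fit an LDA model on a small synthetic count matrix just to show the call.
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

rng = np.random.RandomState(0)
toy_counts = rng.randint(0, 5, size=(50, 200))  # 50 documents x 200 terms, synthetic
toy_model = LatentDirichletAllocation(n_components=5, random_state=0).fit(toy_counts)
get_model_metrics(toy_model, toy_counts)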
def _learn_lda(data, **kwargs):
    from sklearn.decomposition import LatentDirichletAllocation

    if hasattr(data.retention, 'datatype') and data.retention.datatype == 'features':
        features = data.copy()
    else:
        if 'ngram_range' not in kwargs:
            kwargs.update({'ngram_range': (1, 2)})
        features = data.retention.extract_features(**kwargs)

    # keep only the kwargs that LatentDirichletAllocation actually accepts
    lda_filter = LatentDirichletAllocation().get_params()
    if 'random_state' not in kwargs:
        kwargs.update({'random_state': 0})
    kwargs = {i: j for i, j in kwargs.items() if i in lda_filter}

    lda = LatentDirichletAllocation(**kwargs)
    lda.fit(features)
    mech_desc = pd.DataFrame(lda.components_, columns=features.columns)
    return mech_desc, lda
def analyser(data):
    _, data_vectorized = get_vectorized_data(data)

    # Build LDA Model
    lda_model = LatentDirichletAllocation(
        n_components=20,          # Number of topics
        max_iter=10,              # Max learning iterations
        learning_method='online',
        random_state=100,         # Random state
        batch_size=128,           # n docs in each learning iter
        evaluate_every=-1,        # compute perplexity every n iters, default: don't
        n_jobs=-1,                # Use all available CPUs
    )
    lda_output = lda_model.fit_transform(data_vectorized)
    print(lda_output)

    # Log likelihood: the higher the better
    print("Log Likelihood: ", lda_model.score(data_vectorized))

    # Perplexity: the lower the better. Perplexity = exp(-1. * log-likelihood per word)
    print("Perplexity: ", lda_model.perplexity(data_vectorized))

    # See model parameters
    pprint(lda_model.get_params())
timestamp = time.time()
print("Loading stopword list...")
load_stopwords(swlist)
print("Loading the stopword list took:", time.time() - timestamp, "s")

timestamp = time.time()
print("Tokenizing...")
lemmatizer = WordNetLemmatizer()
load_corpus(expected_tags, lemmatizer, swlist, corpus)
print("Tokenization took:", time.time() - timestamp, "s")

tf_vectorizer = CountVectorizer(stop_words="english", lowercase=False)
word_freq = tf_vectorizer.fit_transform(corpus)
tf_feature_names = tf_vectorizer.get_feature_names()
print("Total number of words in the corpus:", len(tf_feature_names))

lda = LatentDirichletAllocation(max_iter=50, doc_topic_prior=0.5,
                                topic_word_prior=0.1, learning_method="batch",
                                random_state=0)
for n_topics in [5, 10, 20]:
    lda.set_params(n_components=n_topics)
    params = lda.get_params(False)
    print("\nLDA model parameters:")
    for key, value in params.items():
        print(key, "<-", value)

    timestamp = time.time()
    print("Training LDA (n_components = " + str(n_topics) + ")...")
    lda.fit(word_freq)
    print("Training LDA (n_components = " + str(n_topics) + ") took:",
          time.time() - timestamp, "s")

    print("Writing results to " + "../../output_python/topic" + str(n_topics) +
          "/topic-top" + str(n_top_words) + "keywords.txt")
    save_top_topciwords(lda, tf_feature_names, n_top_words)

print("\nFinished at:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
def getDecomposition(self):
    # cluster images into a dictionary
    # number has to be finalised after testing
    dictionary_size = 25
    h, w = self.dataMatrix.shape
    # print(w)
    # print(h)

    kmeans = MiniBatchKMeans(n_clusters=dictionary_size, init='k-means++',
                             batch_size=250, random_state=0, verbose=0)
    kmeans.fit(self.dataMatrix)
    kmeans.get_params()
    kmeans.cluster_centers_
    labels = kmeans.labels_

    # histogram of labels for each image = term-document matrix
    num_train_images = h
    # num_kps needs to be calculated dynamically
    num_kps = 192
    A = np.zeros((dictionary_size, num_train_images))
    ii = 0
    jj = 0
    for img_idx in range(num_train_images):
        if img_idx == 0:
            A[:, img_idx], bins = np.histogram(labels[0:num_kps], bins=range(dictionary_size + 1))
        else:
            ii = int(ii + num_kps)
            jj = int(ii + num_kps)
            A[:, img_idx], bins = np.histogram(labels[ii:jj], bins=range(dictionary_size + 1))
        # print(str(ii) + ':' + str(jj))
    # end for

    # plt.figure()
    # plt.spy(A.T, cmap='gray')
    # plt.gca().set_aspect('auto')
    # plt.title('AP tf-idf corpus')
    # plt.xlabel('dictionary')
    # plt.ylabel('documents')
    # plt.show()

    # print(self.dataMatrix)
    # Needs to be finalised
    num_topics = 25
    lda_vb = LatentDirichletAllocation(n_components=num_topics, max_iter=10,
                                       learning_method='online', batch_size=512,
                                       random_state=0, n_jobs=1)
    lda_vb.fit(self.dataMatrix.T)
    lda_vb.get_params()
    topics = lda_vb.components_
    H = lda_vb.transform(self.dataMatrix.T)
    # print(topics)
    # print(H.T)
    return topics, H.T
# fit LDA model
print("Fitting LDA model...")
lda_vb = LatentDirichletAllocation(n_components=num_topics, max_iter=10,
                                   learning_method='online', batch_size=512,
                                   random_state=0, n_jobs=-1)

tic = time()
lda_vb.fit(A_tfidf_sp)  # online VB
toc = time()
print("elapsed time: %.4f sec" % (toc - tic))
print("LDA params")
print(lda_vb.get_params())

print("number of EM iter: %d" % lda_vb.n_batch_iter_)
print("number of dataset sweeps: %d" % lda_vb.n_iter_)

# topic matrix W: K x V
# components[i,j]: topic i, word j
topics = lda_vb.components_

f = plt.figure()
plt.matshow(topics, cmap='gray')
plt.gca().set_aspect('auto')
plt.title('learned topic matrix')
plt.ylabel('topics')
plt.xlabel('dictionary')
plt.show()
class NewsBias:
    def __init__(self):
        self.tf_vectorizer = []
        self.tf = []
        self.lda_model = []
        self.feature_names = []
        self.topics_mat = []
        self.sentiment_by_topic = []

    def fix_sites(self, mongo_db):
        fix_cnn(mongo_db)
        fix_huffpo(mongo_db)

    def from_mongo(self, db_name):
        df = get_df(db_name)
        df = clean_df(df)
        df = df[pd.notnull(df['processed_text'])]
        df = df[df['processed_text'] != '']
        return df

    def from_csv(self, csv_name):
        try:
            df = pd.read_csv('data/' + csv_name, parse_dates=False)
            return df
        except:
            print('CSV file does not exist!')
            print('Make sure CSV file is in data folder.')
            return False

    def to_csv(self, df, filename):
        filename = 'data/' + filename
        df.to_csv(filename, index=False)
        print('CSV file saved to: ' + filename)

    def update_from_bucket(self, filename):
        path = os.getcwd()
        # Example filename: 'dsiprojectdata/rss_feeds_new.tar'
        result = from_bucket(filename, path)
        if not result:
            print('Error updating data from bucket!')
            print('Make sure you include folder and file in filename from bucket.')

    def update_to_bucket(self, filename, bucketname, mongo_db=False):
        # If mongo database then just give database name as filename
        if mongo_db:
            cwd = os.getcwd()
            # Give permission to bash file then run
            p1 = subprocess.Popen(['chmod', '+x', 'backup.sh'],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            out1, err1 = p1.communicate()
            p2 = subprocess.Popen([cwd + '/backup.sh', filename],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            out2, err2 = p2.communicate()
        else:
            p = subprocess.Popen(['/usr/bin/aws', 's3', 'cp', filename,
                                  's3://' + bucketname + '/'],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            out, err = p.communicate()

    def run_lda(self, df, max_features=1000, n_topics=20):
        df = df[pd.notnull(df['processed_text'])]
        processed_text = df['processed_text'].values.tolist()
        # Include quotes and tweets in LDA
        processed_quote = df['processed_quote'].values.tolist()
        processed_tweet = df['processed_tweet'].values.tolist()
        processed_all = []
        for text, quote, tweet in zip(processed_text, processed_quote, processed_tweet):
            # Check if quote or tweet is nan
            if type(quote) == float:
                quote = ''
            if type(tweet) == float:
                tweet = ''
            processed_all.append(text + quote + tweet)
        try:
            self.tf_vectorizer = CountVectorizer(max_df=0.95,
                                                 min_df=0.05,
                                                 max_features=max_features,
                                                 stop_words='english')
            self.tf = self.tf_vectorizer.fit_transform(processed_all)
        except:
            import pdb
            pdb.set_trace()
        self.lda_model = LatentDirichletAllocation(n_components=n_topics,
                                                   max_iter=5,
                                                   learning_method='online',
                                                   learning_offset=50.,
                                                   random_state=0,
                                                   n_jobs=-1)
        self.lda_model.fit(self.tf)
        self.feature_names = np.array(self.tf_vectorizer.get_feature_names())
        self.topics_mat = self.lda_model.components_
        return self.lda_model

    def run_gensim_lda(self, df, n_topics=20):
        self.lda_model = gensim_lda(df, n_topics)

    def get_top_word_by_topic(self, topic, n_words):
        return self.feature_names[np.argsort(self.topics_mat[topic, :])[::-1]][:n_words]

    def visualize_lda(self, df, display=False):
        if self.lda_model == []:
            self.run_lda(df)
        max_features = self.tf_vectorizer.get_params()['max_features']
        n_topics = self.lda_model.get_params()['n_components']
        vis_data = pyLDAvis.sklearn.prepare(self.lda_model,
                                            self.tf,
                                            self.tf_vectorizer,
                                            R=n_topics,
                                            n_jobs=-1)
        pyLDAvis.save_html(vis_data,
                           'plots/pyLDAvis_' + str(max_features) + 'feats_' +
                           str(n_topics) + 'topics.html')
        if display:
            pyLDAvis.show(vis_data)

    def get_sentiment_of_words(self, df):
        sentiment_of_words = sentiment_of_words_wordnet(df)
        return sentiment_of_words

    def get_sentiment_by_topic(self, df, display=False):
        n_topics = self.lda_model.get_params()['n_components']
        self.sentiment_by_topic = sentiment_by_topic_wordnet(df, self.topics_mat, self.feature_names)
        if display:
            for i, site in enumerate(self.sentiment_by_topic.keys()):
                plt.subplot(3, 4, i + 1)
                score = []
                for topic in range(n_topics):
                    score.append(self.sentiment_by_topic[site][topic][3])
                score = np.array(score)
                score /= sum(np.abs(score))
                plt.bar(np.arange(len(score)), score, align='center')
                plt.ylabel('Score')
                plt.title('Score by Topic for ' + site)
            plt.subplots_adjust(hspace=0.4, wspace=0.4)
            plt.show()
        return self.sentiment_by_topic

    def length_of_articles_hist(self, df):
        for i, site in enumerate(df['source'].unique()):
            plt.subplot(3, 4, i + 1)
            new_df = df[df['source'] == site]
            article_len = [len(article.split(' ')) for article in new_df['article_text']]
            plt.hist(article_len, density=True)
            plt.xlabel('Length of Article')
            plt.ylabel('# of Articles')
            plt.title('Length of articles for ' + site)
        plt.subplots_adjust(hspace=0.4, wspace=0.4)
        plt.show()

    def pickle_everything(self):
        filename = '../pickles/lda_model.pkl'
        pickle.dump(self.lda_model, open(filename, 'wb'), protocol=2)
        filename = '../pickles/tf_vectorizer.pkl'
        pickle.dump(self.tf_vectorizer, open(filename, 'wb'), protocol=2)
# random_state=100, topic_word_prior=None,
# total_samples=1000000.0, verbose=0)

lda = LatentDirichletAllocation(n_components=no_topics, max_iter=100,
                                learning_method='online', learning_offset=50.,
                                random_state=0).fit(tf)

# Log likelihood: the higher the better
print("Log Likelihood: ", lda.score(tf))

# Perplexity: the lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda.perplexity(tf))

# See model parameters
print(lda.get_params())

no_top_words = 15
# display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)

# text = "S'han detectat interrupcions degut a incidencies. Accident a Torredembarra."
# # LDA
# x = lda.transform(tf_vectorizer.transform([text]))[0]
# print("Pel primer text, LDA es: ", x)

# text2 = "Exemple dun tweet que no te res a veure amb el tema i espero que no generi correlacions amb topics entrenats."
# # LDA
# x = lda.transform(tf_vectorizer.transform([text2]))[0]
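# display_topics is defined elsewhere in the original script; the sketch below is
# an assumed, typical implementation of such a helper, shown here only for context.
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))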
print(feature)
print(" ")

# In[86]:

print("Log Likelihood", ldavect.score(dtm))  # higher the better

# In[87]:

# Perplexity: lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", ldavect.perplexity(dtm))

# In[88]:

# See model parameters
print(ldavect.get_params())

# In[89]:

# create a transformed dataframe for the LDA model
ldadf_tr = ldavect.fit_transform(dtm)
ldadf_tr
print("Completed in %0.4fs." % (time() - t0))

# In[90]:

# Now we will optimize using grid search and find the best parameters to model the topics using LDA
grid_params = {'n_components': [5, 8, 10, 15], 'learning_decay': [.5, .7, .9]}

# In[91]:
def lda_topic_modeling(csv_file, n_topics):
    # Import dataset
    df = pd.read_csv(csv_file, delimiter=';',
                     usecols=['post_name', 'post_description', 'post_tagline'])
    print(df.head(15))

    # Remove email and new line characters
    data = remove_email_and_new_line_chars(df.post_tagline)
    print('\n')
    pprint(data[:1])
    data = remove_email_and_new_line_chars(df.post_description)
    print('\n')
    pprint(data[:1])

    # Tokenize and clean up text
    data_words = list(sent_to_words(data))
    print('\n', data_words[:1])

    # Lemmatization
    # Initialize spacy 'en' model, keeping only the tagger component (for efficiency)
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    # Do lemmatization, keeping only nouns, adjectives, verbs and adverbs
    data_lemmatized = lemmatization(data_words, ['NOUN', 'ADJ', 'VERB', 'ADV'], nlp)
    print('\n', data_lemmatized[:2])

    # Create the document-word matrix
    vectorizer = CountVectorizer(
        analyzer='word',
        min_df=10,                        # minimum required occurrences of a word
        stop_words='english',             # remove English stop words
        lowercase=True,                   # convert all words to lowercase
        token_pattern='[a-zA-Z0-9]{3,}',  # words must have at least 3 chars
    )
    data_vectorized = vectorizer.fit_transform(data_lemmatized)

    # Check the sparsity
    data_dense = data_vectorized.todense()  # Materialize the sparse data
    print("\nSparsity: ", ((data_dense > 0).sum() / data_dense.size) * 100,
          "%")  # Compute sparsity = percentage of non-zero cells

    # Build LDA model with sklearn
    lda_model = LatentDirichletAllocation(
        n_components=10,    # Number of topics
        max_iter=10,        # Max learning iterations
        learning_method='online',
        random_state=100,   # Random state
        batch_size=128,     # number of documents in each learning iter
        evaluate_every=-1,  # compute perplexity every n iters, default: don't
        n_jobs=-1,          # Use all available CPUs
    )
    lda_output = lda_model.fit_transform(data_vectorized)
    print('\n', lda_model)  # Model attributes

    # Diagnose model performance with perplexity and log-likelihood
    print("\nLog Likelihood: ", lda_model.score(data_vectorized))  # Log likelihood: higher the better
    print("Perplexity: ", lda_model.perplexity(data_vectorized))   # Perplexity: lower the better.
    # Perplexity = exp(-1. * log-likelihood per word)
    pprint(lda_model.get_params())  # See model parameters

    # GridSearch the best LDA model
    search_params = {
        'n_components': [n_topics],
        'learning_decay': [.5, .7, .9]
    }  # Define search params
    model = GridSearchCV(lda_model,
                         param_grid=search_params,
                         n_jobs=1,
                         cv=3,
                         error_score='raise')  # Init grid search class
    model.fit(data_vectorized)  # Do the grid search

    # Find the best topic model and its parameters
    best_lda_model = model.best_estimator_                     # Best model
    print("\nBest Model's Params: ", model.best_params_)       # Model parameters
    print("Best Log Likelihood Score: ", model.best_score_)    # Log likelihood score
    print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))  # Perplexity

    # See the dominant topic in each document
    df_document_topic = create_document_topic_matrix(best_lda_model, data_vectorized, data)
    n_documents = 15  # Number of documents for which to visualize the dominant topic
    df_document_topics = df_document_topic.head(n_documents).style.applymap(
        color_green).applymap(make_bold).highlight_max(color='yellow', axis=1)
    print('\n', df_document_topics)

    # Get the top 15 keywords for each topic, in order of highest probability
    topic_keywords = show_topics(vectorizer, best_lda_model, 15)
    df_topic_keywords = pd.DataFrame(topic_keywords)  # Topic-keywords dataframe
    df_topic_keywords.columns = ['Word ' + str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic ' + str(i) for i in range(df_topic_keywords.shape[0])]
    print('\n', df_topic_keywords)

    return df_document_topic
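# show_topics is defined elsewhere in the original code; the sketch below is an
# assumed implementation matching the call signature used above (vectorizer,
# fitted model, number of keywords), shown only for context.
import numpy as np

def show_topics(vectorizer, lda_model, n_words=15):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords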
class LDA(GenericModel): def __init__(self, **kwargs): self._corpus_matrix = None self._query_vector = None self.vectorizer = None self.lda_model = LatentDirichletAllocation(n_jobs=-1) super().__init__() self.similarity_measure = None self.set_basic_params(**kwargs) self.set_vectorizer(**kwargs) self.set_lda_model(**kwargs) def set_name(self, name): super().set_name(name) def set_model_gen_name(self, gen_name): super().set_model_gen_name(gen_name) def set_basic_params(self, **kwargs): self.set_name('LDA' if LDA_Model_Hyperp.NAME.value not in kwargs.keys() else kwargs[LDA_Model_Hyperp.NAME.value]) self.set_model_gen_name('lda') self.set_similarity_measure( sm.SimilarityMeasure.COSINE if LDA_Model_Hyperp.SIMILARITY_MEASURE. value not in kwargs.keys() else kwargs[LDA_Model_Hyperp. SIMILARITY_MEASURE.value]) def set_similarity_measure(self, sim_measure): self.similarity_measure = sim_measure def set_vectorizer(self, **kwargs): self.vectorizer = TfidfVectorizer( stop_words='english', use_idf=True, smooth_idf=True ) if LDA_Model_Hyperp.VECTORIZER.value not in kwargs.keys( ) else kwargs[LDA_Model_Hyperp.VECTORIZER.value] vec_params = { key.split('__')[2]: kwargs[key] for key, val in kwargs.items() if '__vectorizer__' in key } self.vectorizer.set_params(**vec_params) def set_lda_model(self, **kwargs): lda_model_params = { key.split('__')[2]: kwargs[key] for key, val in kwargs.items() if '__lda_model__' in key } self.lda_model.set_params(**lda_model_params) def recover_links(self, corpus, query, test_cases_names, bug_reports_names): self._corpus_matrix = self.vectorizer.fit_transform(corpus) self._query_vector = self.vectorizer.transform(query) self.out_1 = self.lda_model.fit_transform(self._corpus_matrix) self.out_2 = self.lda_model.transform(self._query_vector) metric = self.similarity_measure if metric == sm.SimilarityMeasure.COSINE: self._sim_matrix = pairwise.cosine_similarity(X=self.out_1, Y=self.out_2) elif metric == sm.SimilarityMeasure.JSD: self._sim_matrix = pairwise_distances(X=self.out_1, Y=self.out_2, metric=SimilarityMeasure.jsd) elif metric == sm.SimilarityMeasure.EUCLIDIAN_DISTANCE: self._sim_matrix = pairwise_distances(X=self.out_1, Y=self.out_2, metric='euclidean') #self._sim_matrix = super().normalize_sim_matrix(self._sim_matrix) self._sim_matrix = pd.DataFrame(data=self._sim_matrix, index=test_cases_names, columns=bug_reports_names) self._record_docs_feats(corpus, query, test_cases_names, bug_reports_names) def _record_docs_feats(self, corpus, query, test_cases_names, bug_reports_names): self.mrw_tcs = self._recover_mrw_list(test_cases_names, corpus) self.mrw_brs = self._recover_mrw_list(bug_reports_names, query) self.dl_tcs = self._recover_dl_list(test_cases_names, corpus) self.dl_brs = self._recover_dl_list(bug_reports_names, query) index = list(test_cases_names) + list(bug_reports_names) self.docs_feats_df = pd.DataFrame(index=index, columns=['mrw', 'dl']) for tc_name, mrw in self.mrw_tcs: self.docs_feats_df.at[tc_name, 'mrw'] = mrw for tc_name, dl in self.dl_tcs: self.docs_feats_df.at[tc_name, 'dl'] = dl for br_name, mrw in self.mrw_brs: self.docs_feats_df.at[br_name, 'mrw'] = mrw for br_name, dl in self.dl_brs: self.docs_feats_df.at[br_name, 'dl'] = dl def _recover_dl_list(self, artf_names, artf_descs): tokenizer = PorterStemmerBased_Tokenizer() dl_list = [] for artf_name, artf_desc in zip(artf_names, artf_descs): dl_list.append((artf_name, len(tokenizer.__call__(artf_desc)))) return dl_list def _recover_mrw_list(self, artf_names, artf_descs): N_REL_WORDS = 6 mrw_list = [] 
# list of tuples (artf_name, mrw_list={}) for artf_name, artf_desc in zip(artf_names, artf_descs): X = self.vectorizer.transform([artf_desc]) df1 = pd.DataFrame(X.T.toarray()) df1['token'] = self.vectorizer.get_feature_names() df1.sort_values(by=0, ascending=False, inplace=True) mrw = list(df1.iloc[0:N_REL_WORDS, 1].values) mrw_list.append((artf_name, mrw)) return mrw_list def model_setup(self): return { "Setup": [{ "Name": self.get_name() }, { "Similarity Measure and Minimum Threshold": self.get_sim_measure_min_threshold() }, { "Top Value": self.get_top_value() }, { "LDA Model": self.lda_model.get_params() }, { "Vectorizer": self.vectorizer.get_params() }, { "Vectorizer Type": type(self.vectorizer) }] } def get_name(self): return super().get_name() def get_model_gen_name(self): return super().get_model_gen_name() def get_similarity_measure(self): return self.similarity_measure def get_sim_matrix(self): return super().get_sim_matrix() def get_tokenizer_type(self): return type(self.tokenizer) def save_sim_matrix(self): super().save_sim_matrix() def get_query_vector(self): return self._query_vector def get_corpus_matrix(self): return self._corpus_matrix def get_vectorizer_type(self): return type(self.vectorizer) def print_topics(self): feature_names = self.vectorizer.get_feature_names() n_top_words = 10 for topic_idx, topic in enumerate(self.lda_model.components_): message = "Topic #%d: " % topic_idx message += " ".join([ feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1] ]) print(message)
class LDA(object): """ Class for Latent Dirichlet Allocation model """ def __init__(self, filename): self.filename = filename self.vectorized_data = None self.df_topic_keywords = None self.fitted = False # load fitted model if exists try: self.load_model(self.filename) self.fitted = True except IOError: self.vectorizer = CountVectorizer(lowercase=False) self.model = LatentDirichletAllocation() def __str__(self): print_string = 'LDA model. Params:\n' params = self.model.get_params() for key in params: print_string += '{0}: {1}\n'.format(key, params[key]) return print_string def check_model(self): if not self.fitted or self.model is None: raise ValueError('Model is not fitted or not created') def fit(self, corpora): """ Fit LDA model by texts collection :param corpora: list of str """ self.vectorized_data = self.vectorizer.fit_transform(corpora) search_params = {'n_components': [10, 15, 20, 25, 30]} model = GridSearchCV(self.model, param_grid=search_params, cv=3) model.fit(self.vectorized_data) self.model = model.best_estimator_ self.fitted = True self.construct_df_topics() self.save_model() def predict(self, text, distribution_only=True): """ Predict most relevant words for given text :param text: str :param distribution_only: bool, compute only document-topics distribution :return: list of (keyword, probability score) """ self.check_model() if distribution_only: return self.model.transform(self.vectorizer.transform(text)) topic_probability_scores = self.model.transform( self.vectorizer.transform(text))[0] topics = self.df_topic_keywords.iloc[ argmax(topic_probability_scores), :].values.tolist() topics = list(zip(topics, topic_probability_scores)) return sorted(topics, key=lambda x: x[1], reverse=True) def compute_similarity(self, text1, text2): """ Compute the Jensen-Shannon distance between probability arrays of two texts :param text1: list of str :param text2: list of str :return: float in [0, 1], bigger - less similar """ text1_dist = self.predict(text1)[0] text2_dist = self.predict(text2)[0] return jensenshannon(text1_dist, text2_dist) def construct_df_topics(self, n_words=20): """ Construct pd.DataFrame with top %n_words% keywords for each topic """ self.check_model() topic_keywords = [] keywords = array(self.vectorizer.get_feature_names()) for topic_weights in self.model.components_: top_keyword_locs = (-topic_weights).argsort()[:n_words] topic_keywords.append(keywords.take(top_keyword_locs)) self.df_topic_keywords = pd.DataFrame(topic_keywords) self.df_topic_keywords.columns = [ 'Word ' + str(i) for i in range(self.df_topic_keywords.shape[1]) ] self.df_topic_keywords.index = [ 'Topic ' + str(i) for i in range(self.df_topic_keywords.shape[0]) ] def stats(self): self.check_model() print('Log Likelihood:', self.model.score(self.vectorized_data)) print('Perplexity:', self.model.perplexity(self.vectorized_data)) def visualize(self): """ Start local web-server and display LDA fitted model """ self.check_model() show( prepare(self.model, self.vectorized_data, self.vectorizer, mds='tsne')) def load_model(self, filename): """ Load LDA model, CountVectorizer instance and term-document matrix from binary file """ with open(filename, 'rb') as file: model_dict = pickle.load(file) self.model = model_dict['model'] self.vectorizer = model_dict['vec'] self.vectorized_data = model_dict['vec_data'] self.df_topic_keywords = model_dict['df'] def save_model(self): """ Save fitted LDA model by pickle """ self.check_model() with open(self.filename, 'wb') as file: pickle.dump( { 'model': self.model, 'vec': 
self.vectorizer, 'vec_data': self.vectorized_data, 'df': self.df_topic_keywords }, file)
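# Hypothetical usage of the LDA wrapper class above; the file name and corpus here
# are illustrative assumptions, not part of the original code.
toy_corpus = [
    "markets rallied on strong earnings",
    "the team won the championship game",
    "new study examines climate change",
    "central bank raises interest rates",
    "scientists publish research on vaccines",
]
lda = LDA('lda_model.pkl')   # loads a pickled model if the file already exists
if not lda.fitted:
    lda.fit(toy_corpus)      # grid-searches n_components, then pickles the result
lda.stats()                  # log likelihood and perplexity on the training matrix
print(lda.predict(["a new article about interest rates"], distribution_only=False))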
print "number of docs: %d" %A_tfidf_sp.shape[0] print "dictionary size: %d" %A_tfidf_sp.shape[1] #tf-idf dictionary tfidf_dict = tfidf.get_feature_names() #fit LDA model print "Fitting LDA model..." lda_vb = LatentDirichletAllocation(n_topics = num_topics, max_iter=10, learning_method='online', batch_size = 512, random_state=0, n_jobs=-1) tic = time() lda_vb.fit(A_tfidf_sp) #online VB toc = time() print "elapsed time: %.4f sec" %(toc - tic) print "LDA params" print lda_vb.get_params() print "number of EM iter: %d" % lda_vb.n_batch_iter_ print "number of dataset sweeps: %d" % lda_vb.n_iter_ #topic matrix W: K x V #components[i,j]: topic i, word j topics = lda_vb.components_ f = plt.figure() plt.matshow(topics, cmap = 'gray') plt.gca().set_aspect('auto') plt.title('learned topic matrix') plt.ylabel('topics') plt.xlabel('dictionary') plt.show()
# Save the dropped sentences
drop_doc_path = "selected_%d_dropped.txt" % total
with open("data/" + drop_doc_path, "w") as doc:
    for line in doc_dropped:
        doc.write(line + "\n")

# Training LDA
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=None,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(doc_dropped)
lda = LatentDirichletAllocation(n_components=10, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)
vocab_dict = tf_vectorizer.vocabulary_
components = lda.components_
component_names = tf_vectorizer.get_feature_names()
lda_params = lda.get_params(deep=True)

with open('data/params/lda_params.save', 'wb') as f:
    pickle.dump(lda_params, f, protocol=pickle.HIGHEST_PROTOCOL)

# Save topics and their associated words
for topic_idx, topic in enumerate(components):
    message = "topic_%d" % topic_idx
    print(message)
    idx = topic.argsort()
    with open("data/topics/" + message + ".txt", "w") as doc:
        for i in idx:
            doc.write(component_names[i] + "\n")

tf_sentence_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                         max_features=None,
                                         stop_words='english',
fileTfVector = fileVector.fit_transform(data)
print(fileTfVector.shape)

# LDA training: use 16 novels as the training set
topic = args.k
model = LDA(n_components=topic, max_iter=50, learning_method='batch')
docres = model.fit_transform(fileTfVector[:16])
# print(docres)
value, indices = torch.max(torch.tensor(docres), 1)
print(indices)
print("{} topics requested, {} topics actually identified".format(topic, len(list(set(indices.tolist())))))
# print(len(model.components_))

res = model.transform(fileTfVector)
assert len(res) == len(labels)
df_labels = pd.DataFrame(labels)
# df_labels.to_excel("labels.xlsx")
df_res = pd.DataFrame(res)
# df_res.to_excel("ldaVector.xlsx")
df = pd.concat([df_labels, df_res], axis=1)
df.to_excel("labels_with_vector.xlsx")

with open("history.txt", "a", encoding="utf-8") as f:
    print(topic, file=f)
    print(model.get_params(), file=f)
    print("perplexity:", model.perplexity(fileTfVector[:16]), file=f)
    print("", file=f)
import pandas as pd columnMap = pd.read_csv(dataDirectory + dataFile + "-columnMap.txt",header=None, names=("Idx","Term")) targetMap = pd.read_csv(dataDirectory + dataFile + "-targetMap.txt",header=None, names=("Target","Idx")) NUM_TOPICS=5 from sklearn.decomposition import LatentDirichletAllocation # Build a Latent Dirichlet Allocation Model lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=400, random_state=10,learning_method='batch', learning_decay=0, evaluate_every=1, perp_tol=0.01 , topic_word_prior=1/1000, verbose=1) #lda_hr= lda_model.fit_transform(X) lda_hr= lda_model.fit(X) # checking model specification lda_model.get_params() def print_topics(model, feature_names, top_n=10): for idx, topic in enumerate(model.components_): print("Topic %d:" % (idx)) print([(feature_names[i], round(topic[i],2)) for i in topic.argsort()[:-top_n -1:-1]]) print("LDA Model:") print_topics(lda_hr, columnMap['Term']) print("=" * 40) # grid search from sklearn.model_selection import GridSearchCV search_params = {'n_components': [2, 3, 4, 5, 6, 7]}
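# The snippet stops right after defining search_params; a plausible continuation,
# mirroring the grid-search pattern used elsewhere in these examples (assumed,
# not part of the original code):
grid = GridSearchCV(LatentDirichletAllocation(max_iter=400, learning_method='batch',
                                              random_state=10),
                    param_grid=search_params)
grid.fit(X)
print("Best params:", grid.best_params_)
print("Best log-likelihood:", grid.best_score_)
print_topics(grid.best_estimator_, columnMap['Term'])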
                                      batch_size=128,      # No of docs in each iter
                                      evaluate_every=-1,   # Compute perplexity every n iters
                                      n_jobs=-1)           # Use all available CPUs

lda_output = lda_model.fit_transform(samples)
print(lda_model)

# Diagnose model performance with perplexity and log-likelihood
# Log likelihood: higher the better
print("Log Likelihood: ", lda_model.score(samples))

# Perplexity: lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(samples))

# See model parameters
pprint(lda_model.get_params())

# Perform GridSearch for the best LDA model
# Define search params
search_params = {
    'n_components': [6, 7, 8, 9],  # candidate numbers of topics
    'learning_decay': [0.5, 0.7, 0.9],
    'max_iter': [6, 7, 8, 9],
    'random_state': [2018]
}

# Init the model
lda = LatentDirichletAllocation()

# Init grid search class
model = GridSearchCV(lda, param_grid=search_params)
                                      batch_size=128,      # n docs in each learning iter
                                      evaluate_every=-1,   # compute perplexity every n iters, default: don't
                                      n_jobs=-1,           # Use all available CPUs
                                      )
lda_output = lda_model.fit_transform(data_vectorized)  # takes time...
print(lda_model)  # print the model attributes

###############################################################################
# 4bis. Diagnose model performance with perplexity and log-likelihood
print("Log Likelihood: ", lda_model.score(data_vectorized))  # Higher the better
# out: Log Likelihood: -8509466.557993239
print("Perplexity: ", lda_model.perplexity(data_vectorized))  # Lower the better. Perplexity = exp(-1. * log-likelihood per word)
# out: Perplexity: 1039.767935888455
print(lda_model.get_params())  # print the LDA parameters

###############################################################################
# 5. Use GridSearchCV .fit and .best_estimator_ to find the best LDA model: which n_components?
# Define grid search params
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}
lda = LatentDirichletAllocation(max_iter=5, learning_method='online',
                                learning_offset=50., random_state=0)
model = GridSearchCV(lda, param_grid=search_params)
model.fit(data_vectorized)  # takes time!
# Grid search constructs multiple LDA models for all possible combinations of param values;
# fitting echoes the estimator configuration, e.g.:
# GridSearchCV(cv=None, error_score='raise',
#              estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
#                                                  evaluate_every=-1, learning_decay=0.7,
#                                                  learning_method=None,