def lda_tuner(ingroup_otu, best_models):
    """Grid-search LDA priors (doc-topic and topic-word) on a 50/50 split.

    For each (n_topics, dtp, twp) cell, fits an LDA model on half the data
    and scores the held-out half; configurations that improve the best
    log-likelihood so far are printed and appended to ``best_models``.
    After the last grid cell for a topic count, the symmetric 1/k prior is
    also evaluated.

    :param ingroup_otu: DataFrame-like with ``.values`` (OTU count matrix)
    :param best_models: list that improving configurations are appended to
    :return: ``best_models`` (mutated in place)

    Fixes vs original: the 1/k-prior evaluation duplicated the whole grid
    body — factored into one helper; Python-2 prints converted to
    ``print()``; the two slightly different report format strings unified.
    """
    best_score = -np.inf
    dtp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    twp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    topic_series = [3]
    X = ingroup_otu.values
    eval_counter = 0

    def _evaluate(topics, dtp, twp):
        """Fit one configuration on a fresh split; record it if it improves."""
        nonlocal best_score, eval_counter
        eval_counter += 1
        X_train, X_test = train_test_split(X, test_size=0.5)
        lda = LatentDirichletAllocation(n_topics=topics,
                                        doc_topic_prior=dtp,
                                        topic_word_prior=twp,
                                        learning_method='batch',
                                        random_state=42, max_iter=20)
        lda.fit(X_train)
        this_score = lda.score(X_test)
        this_perplexity = lda.perplexity(X_test)
        # NOTE(review): the collapsed original is ambiguous on whether the
        # report/append were inside this `if`; recording improvements only
        # matches the "best_models" naming — confirm against history.
        if this_score > best_score:
            best_score = this_score
            print("New Max Likelihood: {}".format(best_score))
            print("#{}: n:{}, dtp:{}, twp:{}, score:{}, perp:{}".format(
                eval_counter, topics, dtp, twp, this_score, this_perplexity))
            best_models.append({'n': topics, 'dtp': dtp, 'twp': twp,
                                'score': this_score,
                                'perp': this_perplexity})

    for topics in topic_series:
        for dtp in dtp_series:
            for twp in twp_series:
                _evaluate(topics, dtp, twp)
                # after the final grid cell, also try symmetric 1/k priors
                if dtp == dtp_series[-1] and twp == twp_series[-1]:
                    _evaluate(topics, 1. / topics, 1. / topics)
    return best_models
def test_lda_score():
    # Score should not decrease with more training iterations, for both
    # learning methods.
    n_topics, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        common = dict(n_topics=n_topics, learning_method=method,
                      total_samples=100, random_state=0)
        lda_1 = LatentDirichletAllocation(max_iter=1, **common)
        lda_2 = LatentDirichletAllocation(max_iter=10, **common)
        lda_1.fit_transform(X)
        score_1 = lda_1.score(X)
        lda_2.fit_transform(X)
        score_2 = lda_2.score(X)
        assert_greater_equal(score_2, score_1)
def get_score(filepath, min_word_count, num_topics, max_df_, min_df_):
    """Fit an LDA model on a corpus and return its per-token score.

    :param filepath: path handed to ``load_corpus``
    :param min_word_count: minimum word count for corpus loading
    :param num_topics: number of LDA topics to fit
    :param max_df_: CountVectorizer max_df
    :param min_df_: CountVectorizer min_df
    :return: (per-token log-likelihood score,
              (num text segments, num raw segments), num features)

    Fixes vs original: the ``num_topics`` argument was silently ignored
    (the global ``NUM_TOPICS`` was used instead), and ``feature_names``
    was computed twice.
    """
    text_corpus, raw_corpus, filepath = load_corpus(min_word_count, filepath)
    num_segs = len(text_corpus)
    # Create CountVectorizer to get the Document-Term matrix
    vectorizer = CountVectorizer(stop_words='english', lowercase=True,
                                 max_df=max_df_, min_df=min_df_,
                                 tokenizer=LemmaTokenizer())
    dt_matrix = vectorizer.fit_transform(text_corpus)
    feature_names = vectorizer.get_feature_names()
    num_features = len(feature_names)
    # was n_components=NUM_TOPICS: the num_topics parameter was ignored
    lda = LatentDirichletAllocation(n_components=num_topics, max_iter=5,
                                    learning_method='batch')
    lda.fit(dt_matrix)
    # normalize total log-likelihood by the corpus token count
    score = lda.score(dt_matrix) / get_num_tokens(dt_matrix)
    return score, (num_segs, len(raw_corpus)), num_features
def computeLDA(analyzer, xCol: str, nWords: int, n_topics: int, file: str):
    """ Compute LDA process for 1 file."""
    print(f'processing LDA for {file} and [{n_topics}] topics...')
    # results for this file/topic-count pair go in their own directory
    outputDir = createOutputDirectory(file, n_topics)
    corpus = readCorpus(file)
    # prepareXyTrain returns (X, y); only X is needed here
    docs = prepareXyTrain(corpus, xCol, corpus.columns[0])[0]
    vec = getVectorizer('lda', analyzer)
    docTerm = vec.fit_transform(docs)
    # LDA model with default parameters
    model = LatentDirichletAllocation(n_components=n_topics, n_jobs=-1)
    # shape (n_documents, n_topics); values are topic probabilities
    topicProbas = pd.DataFrame(model.fit_transform(docTerm))
    # two dataframes for visualizing frequencies and percentages
    countsDf, normalizedDf = prepareDfs(topicProbas)
    prepareBarPLots(countsDf, normalizedDf, outputDir)
    # row-normalize components (shape (n_topics, n_features)) and round
    normed = model.components_ / model.components_.sum(axis=1)[:, np.newaxis]
    rounded = np.apply_along_axis(lambda row: np.round(row, 4), 1, normed)
    # produce and save a dataframe with the top features per topic
    colHeaders = ["topic_" + str(c) for c in topicProbas.columns]
    topDf = dfTopFeatures(vec.get_feature_names(), colHeaders, rounded,
                          ['word', 'proba'], 'topic', nWords)
    topDf.to_csv(f'{outputDir}topFeatures.csv')
    return [model.score(docTerm), model.perplexity(docTerm)]
def test_lda_score(method):
    # With the same seed, training longer must not lower the score.
    n_components, X = _build_sparse_mtx()
    scores = []
    for iters in (1, 10):
        model = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=iters,
                                          learning_method=method,
                                          total_samples=100,
                                          random_state=0)
        model.fit_transform(X)
        scores.append(model.score(X))
    assert scores[1] >= scores[0]
def main():
    """Run the LDA concept-detection pipeline: load and vectorize the
    corpus, fit a batch LDA model, report per-token score, print topics
    and run the elbow analysis."""
    print("\n-----LDA CONCEPT DETECITON-----")
    text_corpus, text_corpus_ids, raw_corpus, raw_corpus_ids, filepath = load_corpus(
        'v')
    # text_corpus_lemma = lemmatize_corpus(text_corpus, 'v')
    concepts_raw = load_document(CONCEPTS_PATH)
    concepts = parse_concepts(concepts_raw)
    num_segs = len(text_corpus)
    print("MAX_DF: " + str(MAX_DF))
    print("MIN_DF: " + str(MIN_DF))
    print("Number of Segs: %d/%d" % (len(text_corpus), len(raw_corpus)))
    #Create CountVectorizer to get Document-Term matrix
    vectorizer = CountVectorizer(stop_words='english', lowercase=True,
                                 max_df=MAX_DF, min_df=MIN_DF,
                                 tokenizer=LemmaTokenizer())
    #train vectorizer on corpus
    dt_matrix = vectorizer.fit_transform(text_corpus)
    feature_names = vectorizer.get_feature_names()
    print("Number of Features: " + str(len(feature_names)))
    #initialize model
    print("initialize model")
    lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=400,
                                    learning_method='batch')
    #train the model on the corpus and get a document topic matrix for the corpus
    print('fit model to corpus')
    doc_topic_matrix = lda.fit_transform(dt_matrix)
    topic_term_matrix = lda.components_
    # print("visualizing")
    # visualize(doc_topic_matrix)
    # per-token log-likelihood (total score normalized by token count)
    print("Score: " + str(lda.score(dt_matrix) / get_num_tokens(dt_matrix)))
    print("running elbow")
    #print topics, 10 is the number of words in the topic dist to display (e.g. top 10)
    topic_str_list = print_topics(lda, feature_names, 10)
    run_elbow(lda, feature_names)
    # Disabled concept-query reporting, kept for reference:
    #
    # for i in range(0, len(concepts)):
    #     query_list = concepts[i]
    #     topicid_list = get_topics_w_query(topic_term_matrix, TOP_N_WORDS, feature_names, query_list)
    #     seg_list, num_rel_segs = get_segs_w_query(doc_topic_matrix, topicid_list, 10, query_list)
    #
    #     if len(seg_list) > 0:
    #         write_output_file_xlsx(query_list, topic_str_list, topicid_list, filepath, num_segs, seg_list, num_rel_segs, text_corpus)
    #
    return 0
class LDATopicGen:
    """Thin wrapper around sklearn LDA: fit a document-term matrix,
    report perplexity/log-likelihood, and plot the topic-word weights."""

    def __init__(self, data, topics=5):
        # data: document-term matrix; topics: number of LDA components
        self.data = data
        self.components = topics
        self.model = None

    def fit_predict(self):
        """Fit the LDA model and return the document-topic matrix."""
        self.model = LatentDirichletAllocation(n_components=self.components,
                                               random_state=0)
        doc_topics = self.model.fit_transform(self.data)
        print("LDA Perplexity Score %s" % self.model.perplexity(self.data))
        print("LDA Log Likelihood Score %s" % self.model.score(self.data))
        return doc_topics

    def plot(self):
        """Clustermap of the row-normalized topic-word distributions."""
        norm = matplotlib.colors.Normalize(-1, 1)
        anchors = [[norm(-1.0), "midnightblue"],
                   [norm(-0.5), "seagreen"],
                   [norm(0.5), "mediumspringgreen"],
                   [norm(1.0), "yellow"]]
        cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", anchors)
        weights = self.model.components_
        sns.clustermap(weights / weights.sum(axis=1)[:, np.newaxis],
                       linewidth=0.5, cmap=cmap)
        plt.show()
def get_lda_model(X, y):
    """Fit a 20-topic online LDA model on X and print its diagnostics.

    :param X: document-term matrix
    :param y: ignored by LDA; accepted for pipeline compatibility
    :return: the fitted LatentDirichletAllocation model

    Fixes vs original: ``score`` and ``perplexity`` take only X — the
    original passed ``y`` to both, which sklearn's ``perplexity``
    interpreted as ``doc_topic_distr`` and raised (the "this is giving
    some error" the author commented out).
    """
    from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD

    # Build LDA Model
    lda_model = LatentDirichletAllocation(
        n_components=20,      # Number of topics
        max_iter=10,          # Max learning iterations
        learning_method='online',
        random_state=100,     # Random state
        batch_size=128,       # n docs in each learning iter
        evaluate_every=-1,    # compute perplexity every n iters, default: Don't
        n_jobs=-1,            # Use all available CPUs
    )
    lda_output = lda_model.fit_transform(X, y)
    print(lda_model)

    from pprint import pprint
    # Log Likelihood: higher is better
    print("Log Likelihood: ", lda_model.score(X))
    # Perplexity: lower is better. Perplexity = exp(-1. * log-likelihood per word)
    print("Perplexity: ", lda_model.perplexity(X))
    # See model parameters
    pprint(lda_model.get_params())
    return lda_model
def generate_topics():
    # For each configured domain: load its corpus, fit an online LDA
    # model, print evaluation metrics, then persist the model and the
    # topic assignments to the database.  (Python 2 code.)
    db, cursor = dbConnect()
    for domain in c.domains:
        start_time = time.time()
        papers, tf, feature_names = load_corpus(domain, db)
        #lda,feature_names=load_model(domain,c.domain_topics[domain])
        lda = LatentDirichletAllocation(n_topics=c.domain_topics[domain],
                                        max_iter=5,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0)
        lda.fit(tf)
        #---------- MODEL EVALUATION PARAMETERS --------------------------
        # NOTE(review): _e_step is a private sklearn API; the second
        # perplexity/score calls feed it the E-step doc-topic distribution.
        # This is version-fragile — confirm against the pinned sklearn.
        perplexity1 = lda.perplexity(tf)
        perplexity2 = lda.perplexity(tf, lda._e_step(tf, False, False)[0])
        score = lda.score(tf, lda._e_step(tf, False, False)[0])
        topic_paper_dist = lda.transform(tf)
        print "for", c.domain_topics[
            domain], domain, "topics ==> perplexity:", perplexity2, "log likelihood:", score
        save_model(lda, domain, c.domain_topics[domain], feature_names)
        #lda,feature_names=load_model(domain,c.domain_topics[domain])
        store_in_db(db, lda, topic_paper_dist, papers, feature_names, domain)
        print "--- time for " + domain + ": " + str(
            (time.time() - start_time) / 60) + " minutes ---"
def train_model(self, n_components, learning_offset=10.0, learning_decay=0.7,
                max_doc_update_iter=100, n_jobs=-1):
    '''
    Train an LDA model.
    :param n_components: number of LDA topics
    :param learning_offset: downweights early online-learning iterations
    :param learning_decay: online learning rate decay
    :param max_doc_update_iter: max iterations per document in the E-step
    :param n_jobs: number of parallel jobs (-1 = all CPUs)
    :return: lda model
    '''
    ldaModel = LatentDirichletAllocation(
        n_components=n_components,
        learning_decay=learning_decay,
        learning_offset=learning_offset,
        max_doc_update_iter=max_doc_update_iter,
        n_jobs=n_jobs)
    ldaModel.fit(self.tfVector)
    # NOTE(review): the model is fitted on self.tfVector but scored on
    # self._get_tf_vector() — presumably these are the same matrix;
    # confirm, otherwise the reported metrics are for different data.
    print('The Log Likelihood Score:{}'.format(
        np.round(ldaModel.score(self._get_tf_vector()), 3)))
    print('The Perplexity:{}'.format(
        np.round(ldaModel.perplexity(self._get_tf_vector()), 3)))
    return ldaModel
def cluster_sk_latent_dirichlet_allocation(content):
    """Run sklearn LDA from a request dict and return the JSON-wrapped
    transform result plus model attributes and metrics."""
    model = LatentDirichletAllocation(
        n_components=content['n_components'],
        doc_topic_prior=None,
        topic_word_prior=None,
        learning_method=content['learning_method'],
        learning_decay=content['learning_decay'],
        learning_offset=content['learning_offset'],
        max_iter=10,
        batch_size=128,
        mean_change_tol=content['mean_change_tol'],
        n_jobs=-1)
    data = content['data']
    transformed = model.fit(data).transform(data)
    payload = {
        'result': transformed.tolist(),
        'components': model.components_.tolist(),
        'batchIter': model.n_batch_iter_,
        'nIter': model.n_iter_,
        'perplexity': model.perplexity(data),
        'score': model.score(data)
    }
    return httpWrapper(json.dumps(payload, ignore_nan=True))
def topicmodel( comments ):
    # Fit LDA models over a range of topic counts, keep the one with the
    # highest log-likelihood, and return comments grouped by their most
    # probable topic.  (Python 2 code.)
    _texts = []
    texts = []
    for c in comments:
        c = c['text']
        _texts.append( c )
        texts.append( c )
    tf_vectorizer = CountVectorizer( max_df=.20, min_df=10, stop_words = stopwords )
    texts = tf_vectorizer.fit_transform( texts )
    ## test between 2 and 20 topics
    # NOTE(review): the comment says 2..20 but the range is 2..9 — confirm
    # which was intended.
    topics = {}
    for k in range(2, 10):
        print "Testing", k
        model = LatentDirichletAllocation( n_topics= k , max_iter=5, learning_method='batch', learning_offset=50., random_state=0 )
        model.fit( texts )
        ll = model.score( texts )
        # keyed by log-likelihood so max(keys) selects the best model
        topics[ ll ] = model
    topic = max( topics.keys() )
    ret = collections.defaultdict( list )
    ## ugly, rewrite some day
    model = topics[ topic ]
    ## for debug pront chosen models' names
    feature_names = tf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print "Topic #%d:" % topic_idx
        # top-5 words per topic, highest weight first
        print " ".join( [feature_names[i].encode('utf8') for i in topic.argsort()[:-5 - 1:-1]])
        print
    # group each comment under its argmax topic
    for i, topic in enumerate( model.transform( texts ) ):
        topic = numpy.argmax( topic )
        text = _texts[ i ].encode('utf8')
        ret[ topic ].append( text )
    return ret
def test_lda_score():
    # Batch/online training: more iterations should not lower the score.
    n_topics, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        results = []
        for iters in (1, 10):
            model = LatentDirichletAllocation(n_topics=n_topics,
                                              max_iter=iters,
                                              learning_method=method,
                                              total_samples=100,
                                              random_state=0)
            model.fit_transform(X)
            results.append(model.score(X))
        assert_greater_equal(results[1], results[0])
def wybor(topics, data):
    """For each dataset, fit LDA at each candidate topic count and collect
    log-likelihood and perplexity.

    :param topics: iterable of topic counts to try
    :param data: iterable of document-term matrices
    :return: list with one (loglikelihood_list, perplexity_list) pair per
        dataset, each list parallel to ``topics``

    Fixes vs original: the loop variables shadowed the ``topics`` and
    ``data`` parameters (breaking every iteration after the first); the
    model was never fitted before ``score``/``perplexity`` (which raises
    NotFittedError); and all results were discarded — they are now
    returned.
    """
    results = []
    for dataset in data:
        loglikelihood = []
        perplexity = []
        for k in topics:
            lda = LatentDirichletAllocation(n_topics=k,
                                            learning_method="batch",
                                            max_iter=25,
                                            random_state=0)
            lda.fit(dataset)  # was missing: scoring an unfitted model raises
            loglikelihood.append(lda.score(dataset))
            perplexity.append(lda.perplexity(dataset))
        results.append((loglikelihood, perplexity))
    return results
def main():
    """Run LDA concept detection: vectorize the corpus, fit a batch LDA
    model, print topics with prevalence, and list the segments that
    contain each topic's keywords."""
    print("\n-----LDA CONCEPT DETECITON-----")
    text_corpus, raw_corpus = load_corpus()
    print("MAX_DF: " + str(MAX_DF))
    print("MIN_DF: " + str(MIN_DF))
    print("Number of Segs: %d/%d" % (len(text_corpus), len(raw_corpus)))
    #Create CountVectorizer to get Document-Term matrix
    vectorizer = CountVectorizer(stop_words='english', lowercase=True,
                                 max_df=MAX_DF, min_df=MIN_DF,
                                 tokenizer=LemmaTokenizer())
    #train vectorizer on corpus
    dt_matrix = vectorizer.fit_transform(text_corpus)
    feature_names = vectorizer.get_feature_names()
    print("Number of Features: " + str(len(feature_names)))
    #initialize model
    lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=1000,
                                    learning_method='batch')
    #train the model on the corpus and get a document topic matrix for the corpus
    doc_topic_matrix = lda.fit_transform(dt_matrix)
    topic_term_matrix = lda.components_
    # per-token log-likelihood (total score normalized by token count)
    print("Score: " + str(lda.score(dt_matrix) / get_num_tokens(dt_matrix)))
    #get freq of topics in corpus
    topic_prev = get_topic_prevelance(doc_topic_matrix, NUM_TOPICS,
                                      len(text_corpus))
    #print topics
    print_topics(lda, feature_names, 10, topic_prev)
    #get top segs assoc with each topic
    top_segs = get_top_segs_threshold(NUM_TOPICS, doc_topic_matrix,
                                      TOPIC_PRESSENCE_THRESHOLD)
    #print_top_segs(top_segs, text_corpus)
    kw_per_topic = get_key_words(NUM_TOPICS)
    kw_segs = get_kw_segs(kw_per_topic, top_segs, text_corpus)
    print("--------------SEGMENTS CONTAINING KW--------------")
    for i in range(0, len(kw_segs)):
        print("\nTOPIC: %d\n" % (i))
        for j in range(0, len(kw_segs[i])):
            print("--------------")
            print("Seg: " + str(kw_segs[i][j]))
    return 0
def test_lda_score_perplexity():
    # perplexity(X) must equal exp(-score(X) / total token count)
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components,
                                    max_iter=10, random_state=0)
    lda.fit(X)
    reported = lda.perplexity(X, sub_sampling=False)
    derived = np.exp(-1. * (lda.score(X) / np.sum(X.data)))
    assert_almost_equal(reported, derived)
def test_lda_score_perplexity():
    # Check the score/perplexity identity: perp = exp(-score per token).
    n_components, X = _build_sparse_mtx()
    model = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=10, random_state=0)
    model.fit(X)
    token_total = np.sum(X.data)
    from_model = model.perplexity(X, sub_sampling=False)
    from_score = np.exp(-1. * (model.score(X) / token_total))
    assert_almost_equal(from_model, from_score)
def lda_operation(data_samples, num_features: int=400, num_topics: int=6)-> Tuple:
    """Perform Latent Dirichlet Allocation on a list of text samples.

    Args:
        data_samples List[str]: text of each Piazza post
        num_features (int): max number of features kept by term frequency
        num_topics (int): number of topics

    Returns:
        tuple: trained LDA model and the fitted CountVectorizer
    """
    vectorizer = CountVectorizer(max_df=.85, min_df=.05,
                                 max_features=num_features,
                                 stop_words='english',
                                 token_pattern=u'(?ui)\\b\\w\w*[a-z]+\\w*\\b')
    term_counts = vectorizer.fit_transform(data_samples)
    vectorizer.get_feature_names()
    model = LatentDirichletAllocation(n_components=num_topics,
                                      max_iter=100,
                                      learning_method='online',
                                      learning_offset=10.,
                                      random_state=1).fit(term_counts)
    model.score(term_counts)
    return model, vectorizer
def LDA_sklearn(text_data, num_topics, iterations, visualization=False, gridsearch=False):
    """Fit an online LDA model, grid-search n_components/learning_decay,
    plot the grid-search log-likelihoods, and either show a pyLDAvis
    panel or return the first document's topic distribution.

    Fixes vs original:
    - `g.score.parameters['learning_decay' == 0.5]` was a NameError plus a
      boolean subscript; replaced with a proper filter over
      ``model.cv_results_`` (a dict of parallel arrays, not objects);
    - undefined name ``visualize`` replaced by the ``visualization``
      parameter;
    - Python-2 print statements converted to ``print()``.
    """
    vectorizer = OwnCountVectorizer(max_df=0.95,
                                    min_df=2,
                                    stop_words='english',
                                    lowercase=True,
                                    token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}',
                                    ngram_range=(2, 3),
                                    decode_error='ignore')
    vectorized_text_data = vectorizer.fit_transform(text_data)

    lda_model = LatentDirichletAllocation(n_topics=num_topics,
                                          max_iter=iterations,
                                          learning_method='online',
                                          random_state=100,
                                          batch_size=120,
                                          evaluate_every=-1,
                                          n_jobs=-1)
    lda_output = lda_model.fit_transform(vectorized_text_data)
    print(lda_model)
    # log-likelihood: the higher the better
    print('Log likelihood: ', lda_model.score(vectorized_text_data))
    # perplexity = exp(-1. * log-likelihood per word), the lower the better
    print('Perplexity: ', lda_model.perplexity(vectorized_text_data))
    pprint(lda_model.get_params())  # see model parameters

    # GridSearch the best model
    search_params = {'n_components': [41, 45, 50, 55, 60],
                     'learning_decay': [.5, .7, .9]}
    lda = LatentDirichletAllocation()
    model = GridSearchCV(lda, param_grid=search_params)
    model.fit(vectorized_text_data)
    best_lda_model = model.best_estimator_
    print('Best parameters: ', model.best_params_)
    print('Best Log-likelihood score: ', model.best_score_)
    print('Model perplexity: ',
          best_lda_model.perplexity(vectorized_text_data))

    # Compare LDA model performance scores per learning_decay value.
    # cv_results_ is a dict of parallel arrays keyed by result name.
    n_topics = [41, 45, 50, 55, 60]
    results = model.cv_results_

    def _scores_for(decay):
        """Mean test log-likelihoods for one learning_decay setting."""
        return [round(mean_score)
                for mean_score, params in zip(results['mean_test_score'],
                                              results['params'])
                if params['learning_decay'] == decay]

    log_likelihoods_5 = _scores_for(0.5)
    log_likelihoods_7 = _scores_for(0.7)
    log_likelihoods_9 = _scores_for(0.9)

    # Show graph
    plt.figure(figsize=(10, 8))
    plt.plot(n_topics, log_likelihoods_5, label='0.5')
    plt.plot(n_topics, log_likelihoods_7, label='0.7')
    plt.plot(n_topics, log_likelihoods_9, label='0.9')
    plt.title('Gridsearch output on choosing optimal LDA model')
    plt.xlabel('Number of topics')
    plt.ylabel('Log likelihood scores')
    plt.legend(title='Learning decay', loc='best')
    plt.show()

    if visualization:
        panel = pyLDAvis.sklearn.prepare(lda_model, vectorized_text_data,
                                         vectorizer, mds='tsne')
        pyLDAvis.show(panel)
    else:
        return lda_output[0]  # for verification that it works
def RandScore(CountsMatrix, K, no_iter):
    """
    Calculates score for observed data with LDA model fitted to randomized matrix
    CountsMatrix - numpy array. Counts matrix (Document x terms matrix) for our real data
    K - number of clusters
    """
    # shuffle the observed counts to build a null model
    shuffled = RandomizeMatrix(CountsMatrix)
    # fit LDA on the randomized matrix ...
    null_model = LatentDirichletAllocation(n_topics=K,
                                           learning_method='online',
                                           max_iter=no_iter).fit(shuffled)
    # ... and score the *observed* data under it
    return null_model.score(CountsMatrix)
def _test_LDA(data_samples=None, term=7, random_state=1, max_iter=100, **l):
    """Shuffle the samples, vectorize them, fit an online LDA model and
    return its log-likelihood score on the training matrix.

    :param data_samples: list of text documents (not mutated)
    :param term: unused; kept for interface compatibility
    :param random_state: LDA random seed
    :param max_iter: LDA max iterations
    :param l: extra kwargs forwarded to LatentDirichletAllocation
    :return: lda.score(tf)

    Fixes vs original: ``data_samples=[]`` was a mutable default argument,
    and ``shuffle`` mutated the caller's list in place — a copy is
    shuffled instead.
    """
    data_samples = [] if data_samples is None else list(data_samples)
    shuffle(data_samples)
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(data_samples)
    lda1 = LatentDirichletAllocation(max_iter=max_iter,
                                     learning_method='online',
                                     random_state=random_state, **l)
    lda1.fit_transform(tf)
    return lda1.score(tf)
def elbowplotlda(listoftopics, vectorizedcorpus):
    """Fit LDA at each candidate topic count and save an elbow plot of
    the log-likelihoods."""
    perplexitylst = []
    log_likelihood = []
    for k in listoftopics:
        lda = LatentDirichletAllocation(n_jobs=-1, n_components=k)
        lda.fit(vectorizedcorpus)
        log_likelihood.append(lda.score(vectorizedcorpus))
        perplexitylst.append(lda.perplexity(vectorizedcorpus))
    print(log_likelihood)
    print(perplexitylst)
    plt.plot(listoftopics, log_likelihood, '-', label="log_likelihood")
    #plt.plot(listoftopics, perplexitylst, '-', label='perplexitylst')
    plt.legend()
    plt.savefig("../images/elbowplot")
    plt.show()
def perform_lda_analysis(txtDir='', numOfTxts=None, numOfTopics=5, maxIter=20,
                         learningMode='online', randomState=100, batchSize=128,
                         evaluateEvery=-1, nJobs=-1):
    """Vectorize the text files in ``txtDir`` and fit an LDA model.

    :param txtDir: directory whose files are read
    :param numOfTxts: an integer or None for selecting all files
    :param numOfTopics: number of LDA topics
    :param maxIter: max learning iterations
    :param learningMode: 'online' or 'batch'
    :param randomState: random seed
    :param batchSize: docs per online-learning batch
    :param evaluateEvery: compute perplexity every n iters (-1: never)
    :param nJobs: parallel jobs (-1 = all CPUs)
    :return: dict with the transform result, log-likelihood (higher is
        better), perplexity (lower is better) and the model params

    Fixes vs original: files were listed from ``txtDir`` but opened from
    ``cfg.pwc['cleanTxtDir']`` — both now use ``txtDir``; the no-op
    ``txtLst = txtLst`` was removed.
    """
    warnings.simplefilter("ignore", DeprecationWarning)
    txtLst = []
    for fname in os.listdir(txtDir)[:numOfTxts]:
        # was: os.path.join(cfg.pwc['cleanTxtDir'], fname) — inconsistent
        # with the directory being listed
        with codecs.open(os.path.join(txtDir, fname), 'r',
                         'utf-8-sig') as fh:
            txtLst.append(get_content_words(fh.read()))

    vectorizer = CountVectorizer(analyzer='word', min_df=4, lowercase=True,
                                 token_pattern='[a-zA-Z0-9]{3,}')
    dataVector = vectorizer.fit_transform(txtLst)
    dataDense = dataVector.todense()
    print("Sparsicity: ", ((dataDense > 0).sum() / dataDense.size) * 100, "%")

    lda_model = LatentDirichletAllocation(n_topics=numOfTopics,
                                          max_iter=maxIter,
                                          learning_method=learningMode,
                                          random_state=randomState,
                                          batch_size=batchSize,
                                          evaluate_every=evaluateEvery,
                                          n_jobs=nJobs)
    lda_result = lda_model.fit_transform(dataVector)
    results = {
        'result': lda_result,
        'logLikelyhood': lda_model.score(dataVector),   # the higher the better
        'perplexity': lda_model.perplexity(dataVector), # the lower the better
        'params': lda_model.get_params()
    }
    pprint(results)
    return results
def main():
    """Run LDA concept detection with verbose batch training: vectorize
    the corpus, fit the model, and print iteration count, per-token
    score and the top-10 words per topic."""
    print("\n-----LDA CONCEPT DETECITON-----")
    text_corpus, text_corpus_ids, raw_corpus, raw_corpus_ids, filepath = load_corpus(
        'v')
    # text_corpus_lemma = lemmatize_corpus(text_corpus, 'v')
    concepts_raw = load_document(CONCEPTS_PATH)
    concepts = parse_concepts(concepts_raw)
    num_segs = len(text_corpus)
    print("MAX_DF: " + str(MAX_DF))
    print("MIN_DF: " + str(MIN_DF))
    print("Number of Segs: %d/%d" % (len(text_corpus), len(raw_corpus)))
    #Create CountVectorizer to get Document-Term matrix
    vectorizer = CountVectorizer(stop_words='english', lowercase=True,
                                 max_df=MAX_DF, min_df=MIN_DF,
                                 tokenizer=LemmaTokenizer())
    #train vectorizer on corpus
    dt_matrix = vectorizer.fit_transform(text_corpus)
    feature_names = vectorizer.get_feature_names()
    print("Number of Features: " + str(len(feature_names)))
    num_iter = 200
    #initialize model
    lda = LatentDirichletAllocation(n_components=NUM_TOPICS,
                                    max_iter=num_iter,
                                    learning_method='batch', verbose=1,
                                    random_state=55, evaluate_every=5)
    #train the model on the corpus and get a document topic matrix for the corpus
    doc_topic_matrix = lda.fit_transform(dt_matrix)
    topic_term_matrix = lda.components_
    print("Number of Iterations: ", lda.n_iter_)
    # per-token log-likelihood (total score normalized by token count)
    print("Score: " + str(lda.score(dt_matrix) / get_num_tokens(dt_matrix)))
    print_topics(lda, feature_names, 10)
    return 0
def __init__(self, X, features, Klist=None, random_state=0):
    """Fit one LDA model per candidate topic count and record each
    model's perplexity and log-likelihood.

    :param X: document-term matrix
    :param features: feature (vocabulary) names, stored for later use
    :param Klist: topic counts to try; defaults to range(1, 10)
    :param random_state: LDA random seed

    Fix vs original: ``Klist=list(range(1, 10))`` was a mutable default
    argument; ``None`` is used as the sentinel instead.
    """
    self.Klist = list(range(1, 10)) if Klist is None else Klist
    self.features = features
    self.random_state = random_state
    self.X = X
    self.lda = []      # fitted models, parallel to self.Klist
    self.perplex = []  # perplexity per K (lower is better)
    self.score = []    # log-likelihood per K (higher is better)
    for k in self.Klist:
        lda = LatentDirichletAllocation(n_components=k,
                                        random_state=random_state)
        lda.fit(X)
        self.lda.append(lda)
        px = lda.perplexity(X)
        ll = lda.score(X)
        self.perplex.append(px)
        self.score.append(ll)
        print('K = %i, perplex = %f, log-like = %f' % (k, px, ll))
def test_topic_ks(text, ck = 80):
    """Sweep topic counts from 40 up to ck, printing log-likelihood and
    perplexity for each fitted LDA model.  `text` is a list of documents."""
    count_vectorizer = CountVectorizer(stop_words='english')
    count_data = count_vectorizer.fit_transform(text)
    print("testing Ks...")
    # same as the original: drop the first 40 candidate values
    candidate_ks = range(ck)[40:]
    for number_topics in candidate_ks:
        print("K =", number_topics)
        model = LDA(n_components=number_topics, n_jobs=-1)
        model.fit(count_data)
        # Log Likelihood: Higher the better
        print("---> Log Likelihood: ", model.score(count_data))
        # Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
        print("---> Perplexity: ", model.perplexity(count_data))
def get_model_metrics(model: LatentDirichletAllocation, doc_mat: np.array):
    """
    Print diagnostic metrics for a fitted LDA model.

    Args:
        model (): fitted LatentDirichletAllocation instance
        doc_mat (): document-term matrix to evaluate the model on

    Returns:
        None; prints the matrix shape, perplexity (lower is better),
        log-likelihood (higher is better) and the model's parameters.
    """
    print(doc_mat.shape)
    print('Perplexity: ', model.perplexity(doc_mat))
    print('Log likelihood', model.score(doc_mat))
    print('Params', model.get_params())
def k_grid_search(X, test_size=0.25, gridval=[10, 100, 10], n_iter=30, seed=23):
    """Sweep topic counts over range(*gridval), fitting online LDA on a
    train split and scoring the test split.

    Returns (topic counts, held-out log-likelihoods, held-out perplexities).
    """
    X_train, X_test = train_test_split(
        X,
        test_size=test_size,
        random_state=seed
    )
    grid = range(gridval[0], gridval[1], gridval[2])
    loglik = []
    perplex = []
    for k in grid:
        print("Estimating model at k: {}".format(k))
        model = LatentDirichletAllocation(
            n_components=k,
            max_iter=n_iter,
            learning_method='online',
            learning_offset=50.,
            random_state=seed,
            n_jobs=6
        )
        model.fit(X_train)
        loglik.append(model.score(X_test))
        perplex.append(model.perplexity(X_test))
        model = None  # drop the reference before the next fit
    return list(grid), loglik, perplex
def objective(space):
    """Hyperopt objective: fit online LDA with the sampled hyperparameters
    on the global `data_vectorized` and return the negated log-likelihood
    (minimizing -score == maximizing score)."""
    print(space)
    global data_vectorized
    model = LatentDirichletAllocation(
        n_components=int(space['n_topics']),      # number of topics
        learning_decay=space['learning_decay'],   # online learning rate control
        max_iter=10,                              # max learning iterations
        learning_method='online',                 # mini-batch training
        batch_size=128,                           # docs per learning iter
        n_jobs=-1,                                # use all available CPUs
    )
    model.fit_transform(data_vectorized)
    log_likelihood = model.score(data_vectorized)
    print("SCORE:", log_likelihood)
    return {'loss': -log_likelihood, 'status': STATUS_OK}
def lda_decomp(t, n_components, learning_method="online",
               learning_offset=10.0, max_iter=20, random_state=1):
    """Fit an LDA model on matrix ``t`` and return (model, transformed t).

    :param t: document-term (e.g. TF/IDF) matrix
    :param n_components: number of topics
    :return: tuple (fitted LatentDirichletAllocation, doc-topic matrix)

    Fix vs original: ``lda.score(t)`` and ``lda.perplexity(t)`` were
    computed into unused locals — both are full-corpus passes — and then
    discarded; the dead computations are removed.
    """
    lda = LatentDirichletAllocation(n_components=n_components,
                                    max_iter=max_iter,
                                    learning_method=learning_method,
                                    learning_offset=learning_offset,
                                    random_state=random_state).fit(t)
    t_lda = lda.transform(t)
    # Diagnostics, if ever needed:
    #   lda.score(t)       approximate log likelihood (higher the better)
    #   lda.perplexity(t)  approximate perplexity (lower the better)
    return (lda, t_lda)
def LDA_SK(data_vectorized, vectorizer):
    """Fit a 10-topic online LDA model, print its diagnostics, and return
    the document-topic matrix.

    :param data_vectorized: document-term matrix
    :param vectorizer: unused; kept for interface compatibility
    :return: lda_output (doc-topic matrix from fit_transform)

    Fix vs original: the constructor was passed both ``n_components=10``
    and the deprecated ``n_topics=10`` — modern sklearn rejects
    ``n_topics`` with a TypeError, and older versions warn and let it
    override ``n_components``. Only ``n_components`` is passed now (the
    values were identical, so behavior is unchanged).
    """
    lda_model = LatentDirichletAllocation(batch_size=128,
                                          doc_topic_prior=None,
                                          evaluate_every=-1,
                                          learning_decay=0.7,
                                          learning_method='online',
                                          learning_offset=10.0,
                                          max_doc_update_iter=100,
                                          max_iter=10,
                                          mean_change_tol=0.001,
                                          n_components=10,
                                          n_jobs=-1,
                                          perp_tol=0.1,
                                          random_state=100,
                                          topic_word_prior=None,
                                          total_samples=1000000.0,
                                          verbose=0)
    lda_output = lda_model.fit_transform(data_vectorized)
    # Log Likelihood: higher the better
    print("Log Likelihood: ", lda_model.score(data_vectorized))
    # Perplexity: lower the better. Perplexity = exp(-1. * log-likelihood per word)
    print("Perplexity: ", lda_model.perplexity(data_vectorized))
    return lda_output
def analyser(data):
    """Vectorize the input, fit a 20-topic online LDA model and print the
    doc-topic output, log-likelihood, perplexity and model parameters."""
    _, data_vectorized = get_vectorized_data(data)
    # Build LDA Model
    model = LatentDirichletAllocation(
        n_components=20,    # number of topics
        max_iter=10,        # max learning iterations
        learning_method='online',
        random_state=100,   # random seed
        batch_size=128,     # docs per learning iteration
        evaluate_every=-1,  # never compute perplexity during fit
        n_jobs=-1,          # use all available CPUs
    )
    doc_topics = model.fit_transform(data_vectorized)
    print(doc_topics)
    # Log likelihood: higher is better
    print("Log Likelihood: ", model.score(data_vectorized))
    # Perplexity: lower is better; exp(-1 * log-likelihood per word)
    print("Perplexity: ", model.perplexity(data_vectorized))
    # See model parameters
    pprint(model.get_params())
# Build LDA model
lda_model = LatentDirichletAllocation(
    n_components=10,    # Number or topics
    max_iter=10,        # Max learning iterations
    random_state=100,   # Random state (seed)
    learning_method='online',
    batch_size=128,     # No of docs in each iter
    evaluate_every=-1,  # Compute perplexity every n iters
    n_jobs=-1)          # Use all available CPUs
lda_output = lda_model.fit_transform(samples)
print(lda_model)

# Diagnose model performance with perplexity and log-likelihood
# Log Likelyhood: Higher the better
# Fix: this was a Python-2 `print "..."` statement, a SyntaxError under
# Python 3 and inconsistent with the surrounding print() calls.
print("Log Likelihood: ", lda_model.score(samples))
# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(samples))

# See model parameters
pprint(lda_model.get_params())

# Perform GridSearch for the best LDA model
# Define Search Param
search_params = {
    'n_components': [6, 7, 8, 9],  # take 10 topics
    'learning_decay': [0.5, 0.7, 0.9],
    'max_iter': [6, 7, 8, 9],
    'random_state': [2018]
}
print(X[1], data_lemmatized[1])

# ## Latent Dirichlet Allocation
# In[11]:

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import make_multilabel_classification

lda = LatentDirichletAllocation(n_jobs=10, n_components=20, random_state=0)
lda.fit(X)
matriz_topics = lda.transform(X)
# Fix: LatentDirichletAllocation has no decision_function(); the original
# `samples = lda.decision_function(X)` raised AttributeError. The model's
# total log-likelihood is the closest available diagnostic.
samples = lda.score(X)
acurracy = lda.score(X)
columna_nueva_fecha = np.array(df['FECHA SCRAPING'])
X_final = np.column_stack((columna_nueva_fecha, matriz_topics))
# Fix: the original `print(score)` referenced an undefined name
# (NameError); the value is stored in `acurracy`.
print(acurracy)
print(samples)
print(lda.transform(X[:17]))

# In[65]:
#pd.DataFrame(X_final).to_csv('MatrizFrecuencia_LDA_index_fecha.csv',sep=',',header=None)
df_final = pd.DataFrame(matriz_topics, dtype='float')
df_final['FECHA'] = columna_nueva_fecha
#df_final.insert(0, 'id', df_final.index)
#df_final.rename(columns={'Unnamed: 0':'ID', 0: 'FECHA'}, inplace=True)
# Notebook-style Python 2 script: sweep topic counts 1..19, recording the
# score and perplexity of each fitted LDA model, then fit a final 5-topic
# model and inspect one document.
X = vectorizer.fit_transform(df.text)
vectorizer.get_feature_names()
vect_df = pd.DataFrame(X.toarray(), columns=[vectorizer.get_feature_names()])
vect_df.shape  # bare expressions: notebook cell output
vect_df.head()
lda_range= range(1,20)
lda_eval = []
for n in lda_range:
    # NOTE(review): n_topics is the deprecated sklearn parameter name
    # (removed in modern releases in favor of n_components) — confirm the
    # pinned sklearn version.
    lda = LatentDirichletAllocation(n_topics=n, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50., random_state=0)
    lda.fit(vect_df)
    score = lda.score(vect_df)            # higher is better
    perplexity = lda.perplexity(vect_df)  # lower is better
    print n,score,perplexity
    lda_eval.append({'topics':n,'score':score,'perplexity':perplexity})
for item in lda_eval:
    print item
lda = LatentDirichletAllocation(n_topics=5, n_jobs=-1)
topics = lda.fit_transform(vect_df)
lda.perplexity(vect_df)
lda.score(vect_df)
topics[2545]
# NOTE(review): DataFrame.ix was removed from pandas — .loc/.iloc is the
# modern equivalent; confirm the pinned pandas version.
df.ix[2545].text
# Script section: fit a 10-topic online LDA model on the article corpus,
# print the top words per topic, then vectorize the titles and build a
# second model with the same settings.
n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20
# NOTE(review): n_topics is the deprecated sklearn parameter name
# (modern releases use n_components) — confirm the pinned version.
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50., random_state=0)
lda.fit(corpusVect)
tf_feature_names = vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)
lda.score(corpusVect)       # bare expressions: notebook cell output
lda.perplexity(corpusVect)

#### Titles
corp2 = dataWeek.title
CleanTextTransformer().fit(corp2)
corpCTT2 = CleanTextTransformer().transform(corp2)
corpCTTvect = vectorizer.fit_transform(corpCTT2)
corpusTitlesVect = pd.DataFrame(corpCTTvect.todense(),
                                columns=vectorizer.get_feature_names())
lda2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                 learning_method='online',
                                 learning_offset=50., random_state=0)
                               stop_words='english')  # closes a CountVectorizer(...) call opened above this chunk
tf = tf_vectorizer.fit_transform(blogs.article_body)
# Python 2 script: sweep a wide range of topic counts, recording score
# (higher is better) and perplexity (lower is better) for each model.
lda_eval2 = []
ldaRANGE = [9,10,11,12,13,14,15,16,17,18,19,20,30,40,50,60,70,80,90,100,150,200,300]
for n in ldaRANGE:
    # NOTE(review): n_topics is the deprecated sklearn parameter name —
    # confirm the pinned sklearn version.
    lda = LatentDirichletAllocation(n_topics=n, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50., random_state=0)
    lda.fit(tf)
    score = lda.score(tf)
    perplexity = lda.perplexity(tf)
    print n,score,perplexity
    lda_eval2.append({'topics':n,'score':score,'perplexity':perplexity})
for item in lda_eval2:
    print item
lda_eval22 = pd.DataFrame(lda_eval2)
lda_eval22  # bare expression: notebook cell output
import matplotlib.pyplot as plt
lda_eval22
plt.style.use('ggplot')
test_perplexities = [] # size: (max_iter / valid_iter) * (n_splits) for i in range(int(max_iter / valid_iter)): train_s = [] test_s = [] train_p = [] test_p = [] print '\ntraining ', i * valid_iter + 1, '-th iteration' for train_index, test_index in splited_index: train_data, test_data = dataset[train_index], dataset[test_index] lda_model.partial_fit(train_data) train_s.append(lda_model.score(train_data)) test_s.append(lda_model.score(test_data)) train_p.append(lda_model.perplexity(train_data)) test_p.append(lda_model.perplexity(test_data)) train_scores.append(train_s) test_scores.append(test_s) train_perplexities.append(train_p) test_perplexities.append(test_p) print "train_scores: ", train_scores[i], " test_scores: ", test_scores[i], " train_perplexities: ", train_perplexities[i], " test_perplexities: ", test_perplexities[i] dict_num_topic[str(n_component) + '_topics'] = { "max_iter": max_iter, "valid_iter": valid_iter,