def pipeline_lda_bigartm(lines, n_clusters, ngram_range, topnwords,
                         LOGS_DATA_PATH="plsa.txt", TARGET_FOLDER="plsa"):
    # Dump the documents to a Vowpal Wabbit file and turn it into BigARTM batches
    make_file(lines, ngram_range, LOGS_DATA_PATH)
    bv = artm.BatchVectorizer(data_path=LOGS_DATA_PATH,
                              data_format='vowpal_wabbit',
                              target_folder=TARGET_FOLDER)
    # Fit a plain LDA model and collect the top words of every topic
    lda = artm.LDA(num_topics=n_clusters, alpha=0.01, beta=0.001,
                   cache_theta=True, dictionary=bv.dictionary)
    lda.fit_offline(batch_vectorizer=bv)
    top_tokens = lda.get_top_tokens(num_tokens=topnwords)
    topic_names = {i: token_list for i, token_list in enumerate(top_tokens)}
    return label_after_bigarm(lda), topic_names
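# Hedged usage sketch for pipeline_lda_bigartm (not part of the original code):
# `docs` is a hypothetical list of raw documents, and make_file / label_after_bigarm
# are assumed to be the project's own helpers for writing the Vowpal Wabbit file
# and mapping the fitted LDA back to per-document cluster labels.
docs = ["gas prices rise again", "new telescope finds exoplanet", "team wins the final"]
labels, topics = pipeline_lda_bigartm(docs, n_clusters=3, ngram_range=(1, 1), topnwords=5)
for topic_id, tokens in topics.items():
    print(topic_id, tokens)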
def artm_lda(batch_vectorizer, topics, dictionary):
    model_lda = artm.LDA(num_topics=topics, num_processors=cpu_count(),
                         cache_theta=True, num_document_passes=1)
    model_lda.initialize(dictionary=dictionary)
    model_lda.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=50)
    # perplexity_last_value holds the score from the final collection pass
    print("\nPerplexity for BigARTM LDA: ", model_lda.perplexity_last_value)
def compute_lda(num_topics, alpha, beta, dictionary, batch_vectorizer, score_computer):
    lda_model = artm.LDA(num_topics=num_topics, alpha=alpha, beta=beta,
                         cache_theta=True, num_document_passes=5,
                         dictionary=dictionary)
    lda_model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=10)
    theta_lda = lda_model.get_theta()
    lda_predicts = get_df_clusters_predicted(theta_lda, url_list)
    score = score_computer.compute_score(lda_predicts["story_id_predicted"])
    logging.info("num_topics={}, alpha={}, beta={}, "
                 "LDA score = {}".format(num_topics, alpha, beta, score))
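# Hedged sketch (assumed, not from the source) of how compute_lda might be driven:
# a small grid over alpha/beta with a dictionary, batch_vectorizer and
# score_computer already built by the surrounding pipeline.
for num_topics in (10, 20, 30):
    for alpha in (0.01, 0.1):
        for beta in (0.001, 0.01):
            compute_lda(num_topics, alpha, beta,
                        dictionary, batch_vectorizer, score_computer)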
def experiment(filename, tau_phi, tau_theta):
    batch_vectorizer = artm.BatchVectorizer(data_path=filename,
                                            data_format='vowpal_wabbit',
                                            target_folder='batches')
    dictionary = batch_vectorizer.dictionary
    topic_num = 30
    tokens_num = 100
    print("ARTM training")
    topic_names = ['topic_{}'.format(i) for i in range(topic_num)]

    model_artm = artm.ARTM(topic_names=topic_names, dictionary=dictionary,
                           cache_theta=True)
    model_plsa = artm.ARTM(topic_names=topic_names, cache_theta=True,
                           scores=[artm.PerplexityScore(name='PerplexityScore',
                                                        dictionary=dictionary)])
    model_lda = artm.LDA(num_topics=topic_num)

    model_artm.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    model_artm.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_artm.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_artm.scores.add(artm.TopTokensScore(name='top_tokens_score', num_tokens=tokens_num))
    model_artm.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
    model_artm.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_artm.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_artm.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

    model_plsa.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    model_plsa.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_plsa.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_plsa.scores.add(artm.TopTokensScore(name='top_tokens_score'))
    model_plsa.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
    model_plsa.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_plsa.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_plsa.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

    model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
    model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
    model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))
    model_artm.regularizers['sparse_phi_regularizer'].tau = tau_phi
    model_artm.regularizers['sparse_theta_regularizer'].tau = tau_theta
    model_artm.regularizers['decorrelator_phi_regularizer'].tau = 1e+3

    model_plsa.initialize(dictionary=dictionary)
    model_artm.initialize(dictionary=dictionary)
    model_lda.initialize(dictionary=dictionary)

    passes = 100
    model_plsa.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
    model_lda.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)

    print_measures(model_plsa, model_artm, model_lda)
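# Hedged example call (assumed, not from the source): sparsifying regularizers in
# BigARTM usually get negative tau values, so a run over a Vowpal Wabbit file
# prepared elsewhere might look like this.
experiment('lemmed.txt', tau_phi=-0.5, tau_theta=-0.5)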
import time

import artm
import tf_idf_builder

start = time.time()
batch_vectorizer = artm.BatchVectorizer(data_path='lemmed.txt',
                                        data_format='vowpal_wabbit',
                                        target_folder='batches')
dictionary = batch_vectorizer.dictionary
topic_num = 10
tokens_num = 100
print("ARTM training")
topic_names = ['topic_{}'.format(i) for i in range(topic_num)]

model_artm = artm.ARTM(topic_names=topic_names, dictionary=dictionary, cache_theta=True)
model_plsa = artm.ARTM(topic_names=topic_names, cache_theta=True,
                       scores=[artm.PerplexityScore(name='PerplexityScore',
                                                    dictionary=dictionary)])
model_lda = artm.LDA(num_topics=topic_num)

model_artm.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
model_artm.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
model_artm.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
model_artm.scores.add(artm.TopTokensScore(name='top_tokens_score', num_tokens=tokens_num))
model_artm.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
model_artm.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
model_artm.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
model_artm.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

model_plsa.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
model_plsa.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
model_plsa.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
model_plsa.scores.add(artm.TopTokensScore(name='top_tokens_score'))
model_plsa.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
start = time.time()
bv = artm.BatchVectorizer(data_path=BATCHES_DIR, data_format="batches")
dictionary = artm.Dictionary()
dictionary.load(DICTIONARY_FILE)

cooc_dict = artm.Dictionary()
cooc_dict.gather(data_path=BATCHES_DIR,
                 cooc_file_path=COOC_FILE,
                 vocab_file_path=VOCAB_FILE,
                 symmetric_cooc_values=True)
coherence_score = artm.TopTokensScore(name='TopTokensCoherenceScore',
                                      dictionary=cooc_dict,
                                      num_tokens=15)

model_artm = artm.LDA(num_topics=N_TOPICS)
model_artm._internal_model.scores.add(artm.TopTokensScore(name="top_words", num_tokens=10))
model_artm._internal_model.scores.add(coherence_score)
model_artm._internal_model.scores.add(artm.PerplexityScore(name='perplexity_score',
                                                           dictionary=bv.dictionary))
model_artm._internal_model.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
model_artm._internal_model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
model_artm.initialize(dictionary=dictionary)
print("Initializing time: {}".format(time.time() - start))
start = time.time()
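# Hedged continuation (assumed, not from the source): fit the model and read back
# the scores attached above; score_tracker access goes through the same
# _internal_model ARTM instance that the scores were added to.
model_artm.fit_offline(batch_vectorizer=bv, num_collection_passes=20)
print("Training time: {}".format(time.time() - start))
tracker = model_artm._internal_model.score_tracker
print("Perplexity:", tracker['perplexity_score'].last_value)
print("Sparsity Phi:", tracker['sparsity_phi_score'].last_value)
print("Sparsity Theta:", tracker['sparsity_theta_score'].last_value)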
def test_func():
    # constants
    num_tokens = 15
    alpha = 0.01
    beta = 0.02
    num_collection_passes = 15
    num_document_passes = 1
    num_topics = 15
    vocab_size = 6906
    num_docs = 3430
    zero_eps = 0.001

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()
    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)
        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)

        model_artm = artm.ARTM(num_topics=num_topics, dictionary=dictionary,
                               cache_theta=True, reuse_theta=True)
        model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=beta))
        model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=alpha))
        model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
        model_artm.scores.add(artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary))
        model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
        model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore', num_tokens=num_tokens))

        model_lda = artm.LDA(num_topics=num_topics, alpha=alpha, beta=beta,
                             dictionary=dictionary, cache_theta=True)
        model_lda.initialize(dictionary=dictionary)

        model_artm.num_document_passes = num_document_passes
        model_lda.num_document_passes = num_document_passes

        model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                               num_collection_passes=num_collection_passes)
        model_lda.fit_offline(batch_vectorizer=batch_vectorizer,
                              num_collection_passes=num_collection_passes)

        for i in range(num_collection_passes):
            assert abs(model_artm.score_tracker['SparsityPhiScore'].value[i] -
                       model_lda.sparsity_phi_value[i]) < zero_eps

        for i in range(num_collection_passes):
            assert abs(model_artm.score_tracker['SparsityThetaScore'].value[i] -
                       model_lda.sparsity_theta_value[i]) < zero_eps

        for i in range(num_collection_passes):
            assert abs(model_artm.score_tracker['PerplexityScore'].value[i] -
                       model_lda.perplexity_value[i]) < zero_eps

        lda_tt = model_lda.get_top_tokens(num_tokens=num_tokens)
        assert len(lda_tt) == num_topics
        for i in range(num_topics):
            for j in range(num_tokens):
                assert model_artm.score_tracker['TopTokensScore'].last_tokens[
                    model_artm.topic_names[i]][j] == lda_tt[i][j]

        lda_tt = model_lda.get_top_tokens(num_tokens=num_tokens, with_weights=True)
        for i in range(num_tokens):
            assert abs(model_artm.score_tracker['TopTokensScore'].last_weights[
                model_artm.topic_names[0]][i] - lda_tt[0][i][1]) < zero_eps

        model_lda.fit_online(batch_vectorizer=batch_vectorizer)

        phi = model_lda.phi_
        assert phi.shape == (vocab_size, num_topics)
        theta = model_lda.get_theta()
        assert theta.shape == (num_topics, num_docs)

        assert model_lda.library_version.count('.') == 2  # major.minor.patch

        model_lda = artm.LDA(num_topics=num_topics, alpha=alpha,
                             beta=([0.1] * num_topics),
                             dictionary=dictionary, cache_theta=True)
        assert model_lda._internal_model.regularizers.size() == num_topics + 1
    finally:
        shutil.rmtree(batches_folder)
import artm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from numpy import array

cv = CountVectorizer(max_features=1000, stop_words='english')
n_wd = array(cv.fit_transform(fetch_20newsgroups().data).todense()).T
vocabulary = cv.get_feature_names()

bv = artm.BatchVectorizer(data_format='bow_n_wd',
                          n_wd=n_wd,  # word x document matrix
                          vocabulary=vocabulary)

model = artm.LDA(num_topics=15, dictionary=bv.dictionary)
model.fit_offline(bv, num_collection_passes=20)

# contents of the topics
for t in model.get_top_tokens(8):
    print(t)

# training a simple LDA model from UCI-format data
batch_vectorizer = artm.BatchVectorizer(data_path='.', data_format='bow_uci',
                                        collection_name='kos',
                                        target_folder='kos_batches')

# per-topic regularization
beta = [0.001] * 15
lda = artm.LDA(num_topics=15, alpha=0.01, beta=beta,
               dictionary=batch_vectorizer.dictionary)  # the original snippet breaks off here; beta and dictionary arguments are assumed
topics = len(set(lenta["topic"]))
corp = CorpusDocuments(lenta, 17)

lda = LDA(corp, topics, 100)
lda.lda_method()
plsa = PLSA(corp, topics, 30)
plsa.plsa_method()
print("Perplexity of myLDA: ",
      Perplexity(corp, lda.get_phi(), lda.get_theta(), topics).perplexity())
print("Perplexity of myPLSA: ",
      Perplexity(corp, plsa.get_phi(), plsa.get_theta(), topics).perplexity())

batch_vectorizer = artm.BatchVectorizer(data_path="lenta.txt",
                                        data_format="vowpal_wabbit",
                                        target_folder="profstandards_batches",
                                        batch_size=10)
topic_names = ["sbj" + str(i) for i in range(topics - 1)] + ["bcg"]
model_artm = artm.ARTM(num_topics=topics, topic_names=topic_names,
                       num_processors=2, class_ids={"text": 1},
                       reuse_theta=True, cache_theta=True)
np.random.seed(1)
dictionary = artm.Dictionary("dictionary")
dictionary.gather(batch_vectorizer.data_path)
model_artm.initialize(dictionary=dictionary)
model_artm.scores.add(artm.PerplexityScore("perplexity", class_ids=["text"],
                                           dictionary=dictionary))
model_artm.num_document_passes = 1
model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=45)
print("Perplexity of artmPLSA: ", model_artm.score_tracker["perplexity"].value[-1])

model_lda = artm.LDA(num_topics=topics, num_processors=2, cache_theta=True)
model_lda.initialize(dictionary=dictionary)
model_lda.num_document_passes = 1
model_lda.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=45)
print("Perplexity of artmLDA: ", model_lda.perplexity_last_value)