Example #1
def pipeline_lda_bigartm(lines,
                         n_clusters,
                         ngram_range,
                         topnwords,
                         LOGS_DATA_PATH="plsa.txt",
                         TARGET_FOLDER="plsa"):

    make_file(lines, ngram_range, LOGS_DATA_PATH)

    bv = artm.BatchVectorizer(data_path=LOGS_DATA_PATH,
                              data_format='vowpal_wabbit',
                              target_folder=TARGET_FOLDER)

    lda = artm.LDA(num_topics=n_clusters,
                   alpha=0.01,
                   beta=0.001,
                   cache_theta=True,
                   dictionary=bv.dictionary)
    lda.fit_offline(batch_vectorizer=bv)

    top_tokens = lda.get_top_tokens(num_tokens=topnwords)
    topic_names = {}
    for i, token_list in enumerate(top_tokens):
        topic_names[i] = token_list

    return label_after_bigarm(lda), topic_names
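
A minimal usage sketch for the pipeline above; the input documents and parameter values are hypothetical, and make_file and label_after_bigarm are assumed to be defined elsewhere in the same project:

docs = ["first preprocessed document", "second preprocessed document"]  # hypothetical input
labels, topic_names = pipeline_lda_bigartm(docs,
                                           n_clusters=5,
                                           ngram_range=(1, 2),
                                           topnwords=10)
for topic_id, tokens in topic_names.items():
    print(topic_id, tokens)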
Example #2
def artm_lda(batch_vectorizer, topics, dictionary):
    model_lda = artm.LDA(num_topics=topics,
                         num_processors=cpu_count(),
                         cache_theta=True,
                         num_document_passes=1)
    model_lda.initialize(dictionary=dictionary)
    model_lda.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=50)
    print "\nPerplexity for BigARTM LDA: ", model_lda.perplexity_last_value
Example #3
def compute_lda(num_topics, alpha, beta, dictionary, batch_vectorizer, score_computer):
    lda_model = artm.LDA(num_topics=num_topics, alpha=alpha, beta=beta, cache_theta=True,
                         num_document_passes=5, dictionary=dictionary)
    lda_model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=10)
    theta_lda = lda_model.get_theta()
    lda_predicts = get_df_clusters_predicted(theta_lda, url_list)
    score = score_computer.compute_score(lda_predicts["story_id_predicted"])
    logging.info("num_topics={}, alpha={}, beta={}, "
                 "LDA score = {}".format(num_topics, alpha, beta, score))
Example #4
def experiment(filename, tau_phi, tau_theta):
    batch_vectorizer = artm.BatchVectorizer(data_path=filename, data_format='vowpal_wabbit',
                                            target_folder='batches')

    dictionary = batch_vectorizer.dictionary

    topic_num = 30
    tokens_num = 100
    print("ARTM training")
    topic_names = ['topic_{}'.format(i) for i in range(topic_num)]
    model_artm = artm.ARTM(topic_names=topic_names, dictionary=dictionary, cache_theta=True)
    model_plsa = artm.ARTM(topic_names=topic_names, cache_theta=True,
                           scores=[artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary)])
    model_lda = artm.LDA(num_topics=topic_num)

    model_artm.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    model_artm.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_artm.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_artm.scores.add(artm.TopTokensScore(name='top_tokens_score', num_tokens=tokens_num))
    model_artm.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
    model_artm.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_artm.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_artm.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

    model_plsa.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    model_plsa.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_plsa.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_plsa.scores.add(artm.TopTokensScore(name='top_tokens_score'))
    model_plsa.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
    model_plsa.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_plsa.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_plsa.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

    model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
    model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
    model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))

    model_artm.regularizers['sparse_phi_regularizer'].tau = tau_phi
    model_artm.regularizers['sparse_theta_regularizer'].tau = tau_theta
    model_artm.regularizers['decorrelator_phi_regularizer'].tau = 1e+3

    model_plsa.initialize(dictionary=dictionary)
    model_artm.initialize(dictionary=dictionary)
    model_lda.initialize(dictionary=dictionary)

    passes = 100
    model_plsa.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
    model_lda.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)

    print_measures(model_plsa, model_artm, model_lda)
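
print_measures is not shown here; a rough sketch of what it could report, limited to scores actually attached above (the helper body is an assumption, not the author's code):

def print_measures(model_plsa, model_artm, model_lda):
    # ARTM-based models expose their metrics through score_tracker
    for name, model in (("PLSA", model_plsa), ("ARTM", model_artm)):
        print(name,
              "perplexity:", model.score_tracker['perplexity_score'].last_value,
              "sparsity phi:", model.score_tracker['sparsity_phi_score'].last_value,
              "sparsity theta:", model.score_tracker['sparsity_theta_score'].last_value)
    # the LDA wrapper keeps the same metrics as plain attributes
    print("LDA",
          "perplexity:", model_lda.perplexity_last_value,
          "sparsity phi:", model_lda.sparsity_phi_last_value,
          "sparsity theta:", model_lda.sparsity_theta_last_value)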
Example #5
import tf_idf_builder

start = time.time()

batch_vectorizer = artm.BatchVectorizer(data_path='lemmed.txt', data_format='vowpal_wabbit', target_folder='batches')

dictionary = batch_vectorizer.dictionary

topic_num = 10
tokens_num = 100
print("ARTM training")
topic_names = ['topic_{}'.format(i) for i in range(topic_num)]
model_artm = artm.ARTM(topic_names=topic_names, dictionary=dictionary, cache_theta=True)
model_plsa = artm.ARTM(topic_names=topic_names, cache_theta=True,
                       scores=[artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary)])
model_lda = artm.LDA(num_topics=topic_num)

model_artm.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
model_artm.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
model_artm.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
model_artm.scores.add(artm.TopTokensScore(name='top_tokens_score', num_tokens=tokens_num))
model_artm.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
model_artm.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
model_artm.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
model_artm.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

model_plsa.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
model_plsa.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
model_plsa.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
model_plsa.scores.add(artm.TopTokensScore(name='top_tokens_score'))
model_plsa.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
Example #6
start = time.time()
bv = artm.BatchVectorizer(data_path=BATCHES_DIR, data_format="batches")
dictionary = artm.Dictionary()
dictionary.load(DICTIONARY_FILE)

cooc_dict = artm.Dictionary()
cooc_dict.gather(data_path=BATCHES_DIR,
                 cooc_file_path=COOC_FILE,
                 vocab_file_path=VOCAB_FILE,
                 symmetric_cooc_values=True)

coherence_score = artm.TopTokensScore(name='TopTokensCoherenceScore',
                                      dictionary=cooc_dict,
                                      num_tokens=15)

model_artm = artm.LDA(num_topics=N_TOPICS)

model_artm._internal_model.scores.add(
    artm.TopTokensScore(name="top_words", num_tokens=10))
model_artm._internal_model.scores.add(coherence_score)
model_artm._internal_model.scores.add(
    artm.PerplexityScore(name='perplexity_score', dictionary=bv.dictionary))
model_artm._internal_model.scores.add(
    artm.SparsityPhiScore(name='sparsity_phi_score'))
model_artm._internal_model.scores.add(
    artm.SparsityThetaScore(name='sparsity_theta_score'))

model_artm.initialize(dictionary=dictionary)
print("Initializing time: {}".format(time.time() - start))

start = time.time()
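
The snippet ends right after initialization; a plausible continuation (hedged: the pass count is a placeholder) fits the model and reads the attached scores back through the wrapped ARTM instance:

model_artm.fit_offline(batch_vectorizer=bv, num_collection_passes=30)
print("Fitting time: {}".format(time.time() - start))

tracker = model_artm._internal_model.score_tracker
print("Perplexity:", tracker['perplexity_score'].last_value)
for topic, tokens in tracker['top_words'].last_tokens.items():
    print(topic, tokens)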
Example #7
def test_func():
    # constants
    num_tokens = 15
    alpha = 0.01
    beta = 0.02
    num_collection_passes = 15
    num_document_passes = 1
    num_topics = 15
    vocab_size = 6906
    num_docs = 3430
    zero_eps = 0.001

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)

        model_artm = artm.ARTM(num_topics=num_topics,
                               dictionary=dictionary,
                               cache_theta=True,
                               reuse_theta=True)

        model_artm.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=beta))
        model_artm.regularizers.add(
            artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=alpha))

        model_artm.scores.add(
            artm.SparsityThetaScore(name='SparsityThetaScore'))
        model_artm.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 dictionary=dictionary))
        model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
        model_artm.scores.add(
            artm.TopTokensScore(name='TopTokensScore', num_tokens=num_tokens))

        model_lda = artm.LDA(num_topics=num_topics,
                             alpha=alpha,
                             beta=beta,
                             dictionary=dictionary,
                             cache_theta=True)
        model_lda.initialize(dictionary=dictionary)

        model_artm.num_document_passes = num_document_passes
        model_lda.num_document_passes = num_document_passes

        model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                               num_collection_passes=num_collection_passes)
        model_lda.fit_offline(batch_vectorizer=batch_vectorizer,
                              num_collection_passes=num_collection_passes)

        for i in range(num_collection_passes):
            assert abs(model_artm.score_tracker['SparsityPhiScore'].value[i] -
                       model_lda.sparsity_phi_value[i]) < zero_eps

        for i in range(num_collection_passes):
            assert abs(
                model_artm.score_tracker['SparsityThetaScore'].value[i] -
                model_lda.sparsity_theta_value[i]) < zero_eps

        for i in range(num_collection_passes):
            assert abs(model_artm.score_tracker['PerplexityScore'].value[i] -
                       model_lda.perplexity_value[i]) < zero_eps

        lda_tt = model_lda.get_top_tokens(num_tokens=num_tokens)
        assert len(lda_tt) == num_topics

        for i in range(num_topics):
            for j in range(num_tokens):
                assert model_artm.score_tracker['TopTokensScore'].last_tokens[
                    model_artm.topic_names[i]][j] == lda_tt[i][j]

        lda_tt = model_lda.get_top_tokens(num_tokens=num_tokens,
                                          with_weights=True)
        for i in range(num_tokens):
            assert abs(model_artm.score_tracker['TopTokensScore'].last_weights[
                model_artm.topic_names[0]][i] - lda_tt[0][i][1]) < zero_eps

        model_lda.fit_online(batch_vectorizer=batch_vectorizer)

        phi = model_lda.phi_
        assert phi.shape == (vocab_size, num_topics)
        theta = model_lda.get_theta()
        assert theta.shape == (num_topics, num_docs)

        assert model_lda.library_version.count('.') == 2  # major.minor.patch

        model_lda = artm.LDA(num_topics=num_topics,
                             alpha=alpha,
                             beta=([0.1] * num_topics),
                             dictionary=dictionary,
                             cache_theta=True)
        assert model_lda._internal_model.regularizers.size() == num_topics + 1
    finally:
        shutil.rmtree(batches_folder)
Example #8
import artm

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from numpy import array

cv = CountVectorizer(max_features=1000, stop_words='english')
n_wd = array(cv.fit_transform(fetch_20newsgroups().data).todense()).T
vocabulary = cv.get_feature_names()

bv = artm.BatchVectorizer(
    data_format='bow_n_wd',
    n_wd=n_wd,  # word-by-document matrix
    vocabulary=vocabulary)

model = artm.LDA(num_topics=15, dictionary=bv.dictionary)
model.fit_offline(bv, num_collection_passes=20)

# topic contents
for t in model.get_top_tokens(8):
    print(t)
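
The fitted word-topic matrix can also be inspected directly, as in Example #7 above (a brief hedged addition; note that get_theta() would additionally require cache_theta=True on this model):

phi = model.phi_   # pandas DataFrame, words x topics
print(phi.shape)   # expected (1000, 15) with max_features=1000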

# training a simple LDA model from UCI-format data
batch_vectorizer = artm.BatchVectorizer(data_path='.',
                                        data_format='bow_uci',
                                        collection_name='kos',
                                        target_folder='kos_batches')
# per-topic regularization
beta = [0.001] * 15
lda = artm.LDA(num_topics=15,
               alpha=0.01,
Example #9
topics = len(set(lenta["topic"]))
corp = CorpusDocuments(lenta, 17)
lda = LDA(corp, topics, 100)
lda.lda_method()
plsa = PLSA(corp, topics, 30)
plsa.plsa_method()
print("Perplexity myLDA: ", Perplexity(corp, lda.get_phi(), lda.get_theta(), topics).perplexity())
print("Perplexity myPLSA: ", Perplexity(corp, plsa.get_phi(), plsa.get_theta(), topics).perplexity())
batch_vectorizer = artm.BatchVectorizer(data_path="lenta.txt",
                                        data_format="vowpal_wabbit",
                                        target_folder="profstandards_batches",
                                        batch_size=10)
topic_names = ["sbj" + str(i) for i in range(topics - 1)] + ["bcg"]
model_artm = artm.ARTM(num_topics=topics, topic_names=topic_names,
                       num_processors=2, class_ids={"text": 1},
                       reuse_theta=True, cache_theta=True)
np.random.seed(1)
dictionary = artm.Dictionary("dictionary")
dictionary.gather(batch_vectorizer.data_path)
model_artm.initialize(dictionary=dictionary)
model_artm.scores.add(artm.PerplexityScore("perplexity", class_ids=["text"], dictionary=dictionary))
model_artm.num_document_passes = 1
model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                       num_collection_passes=45)
print("Perplexity artmPLSA: ", model_artm.score_tracker["perplexity"].value[-1])
model_lda = artm.LDA(num_topics=topics, num_processors=2, cache_theta=True)
model_lda.initialize(dictionary=dictionary)
model_lda.num_document_passes = 1
model_lda.fit_offline(batch_vectorizer=batch_vectorizer,
                      num_collection_passes=45)
print("Perplexity artmLDA: ", model_lda.perplexity_last_value)