def add_complex_scores_to_model(artm_model,
                                dictionary,
                                n_top_tokens,
                                p_mass_threshold,
                                common_topics,
                                subject_topics,
                                class_name,
                                _debug_print=False):
    if _debug_print:
        print('[{}] adding scores'.format(datetime.now()))
    # subject
    artm_model.scores.add(
        artm.PerplexityScore(name='perplexity_score_subject',
                             dictionary=dictionary,
                             topic_names=subject_topics))
    artm_model.scores.add(
        artm.SparsityPhiScore(name='ss_phi_score_subject',
                              class_id=class_name,
                              topic_names=subject_topics))
    artm_model.scores.add(
        artm.SparsityThetaScore(name='ss_theta_score_subject',
                                topic_names=subject_topics))
    artm_model.scores.add(
        artm.TopicKernelScore(name='topic_kernel_score_subject',
                              class_id=class_name,
                              topic_names=subject_topics,
                              probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(
        artm.TopTokensScore(name='top_tokens_score_subject',
                            class_id=class_name,
                            topic_names=subject_topics,
                            num_tokens=n_top_tokens))

    # common
    artm_model.scores.add(
        artm.PerplexityScore(name='perplexity_score_common',
                             dictionary=dictionary,
                             topic_names=common_topics))
    artm_model.scores.add(
        artm.SparsityPhiScore(name='ss_phi_score_common',
                              class_id=class_name,
                              topic_names=common_topics))
    artm_model.scores.add(
        artm.SparsityThetaScore(name='ss_theta_score_common',
                                topic_names=common_topics))
    artm_model.scores.add(
        artm.TopicKernelScore(name='topic_kernel_score_common',
                              class_id=class_name,
                              topic_names=common_topics,
                              probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(
        artm.TopTokensScore(name='top_tokens_score_common',
                            class_id=class_name,
                            topic_names=common_topics,
                            num_tokens=n_top_tokens))
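# A minimal usage sketch (not from the original source): the vectorizer path,
# topic-name prefixes, and '@default_class' modality are illustrative
# assumptions. The model must be built over the union of both topic groups.
import artm

batch_vectorizer = artm.BatchVectorizer(data_path='data.vw',
                                        data_format='vowpal_wabbit',
                                        target_folder='batches')
subject_topics = ['subject_{}'.format(i) for i in range(20)]
common_topics = ['common_{}'.format(i) for i in range(5)]
model = artm.ARTM(topic_names=subject_topics + common_topics,
                  dictionary=batch_vectorizer.dictionary)
add_complex_scores_to_model(model,
                            batch_vectorizer.dictionary,
                            n_top_tokens=10,
                            p_mass_threshold=0.3,
                            common_topics=common_topics,
                            subject_topics=subject_topics,
                            class_name='@default_class')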
Example #2
def experiment(filename, tau_phi, tau_theta):
    batch_vectorizer = artm.BatchVectorizer(data_path=filename, data_format='vowpal_wabbit',
                                            target_folder='batches')

    dictionary = batch_vectorizer.dictionary

    topic_num = 30
    tokens_num = 100
    print("ARTM training")
    topic_names = ['topic_{}'.format(i) for i in range(topic_num)]
    model_artm = artm.ARTM(topic_names=topic_names, dictionary=dictionary, cache_theta=True)
    model_plsa = artm.ARTM(topic_names=topic_names, cache_theta=True,
                           scores=[artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary)])
    model_lda = artm.LDA(num_topics=topic_num)

    model_artm.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    model_artm.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_artm.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_artm.scores.add(artm.TopTokensScore(name='top_tokens_score', num_tokens=tokens_num))
    model_artm.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
    model_artm.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_artm.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_artm.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

    model_plsa.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    model_plsa.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_plsa.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_plsa.scores.add(artm.TopTokensScore(name='top_tokens_score'))
    model_plsa.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
    model_plsa.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_plsa.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_plsa.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

    model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
    model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
    model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))

    model_artm.regularizers['sparse_phi_regularizer'].tau = tau_phi
    model_artm.regularizers['sparse_theta_regularizer'].tau = tau_theta
    model_artm.regularizers['decorrelator_phi_regularizer'].tau = 1e+3

    model_plsa.initialize(dictionary=dictionary)
    model_artm.initialize(dictionary=dictionary)
    model_lda.initialize(dictionary=dictionary)

    passes = 100
    model_plsa.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
    model_lda.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)

    print_measures(model_plsa, model_artm, model_lda)
Example #3
def add_standard_scores(model,
                        dictionary,
                        main_modality="@lemmatized",
                        all_modalities=("@lemmatized", "@ngramms")):
    """
    Adds standard scores for the model.

    """
    assert main_modality in all_modalities, "main_modality must be part of all_modalities"

    model.scores.add(
        artm.scores.PerplexityScore(name='PerplexityScore@all',
                                    class_ids=all_modalities))

    model.scores.add(artm.scores.SparsityThetaScore(name='SparsityThetaScore'))

    for modality in all_modalities:
        model.scores.add(
            artm.scores.SparsityPhiScore(name=f'SparsityPhiScore{modality}',
                                         class_id=modality))
        model.scores.add(
            artm.scores.PerplexityScore(name=f'PerplexityScore{modality}',
                                        class_ids=[modality]))
        model.scores.add(
            artm.TopicKernelScore(name=f'TopicKernel{modality}',
                                  probability_mass_threshold=0.3,
                                  class_id=modality))
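# A usage sketch under stated assumptions: the model's class_ids must cover
# the same modalities the loop above iterates over; the data path and the
# 0.5 weight are illustrative.
import artm

batch_vectorizer = artm.BatchVectorizer(data_path='data.vw',
                                        data_format='vowpal_wabbit',
                                        target_folder='batches')
model = artm.ARTM(num_topics=20,
                  class_ids={'@lemmatized': 1.0, '@ngramms': 0.5},
                  dictionary=batch_vectorizer.dictionary)
add_standard_scores(model,
                    batch_vectorizer.dictionary,
                    main_modality='@lemmatized',
                    all_modalities=('@lemmatized', '@ngramms'))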
Example #4
def define_model(n_topics: int, dictionary: artm.Dictionary,
                 sparse_theta: float, sparse_phi: float,
                 decorrelator_phi: float) -> artm.artm_model.ARTM:
    """
    Define the ARTM model.
    :param n_topics: number of topics.
    :param dictionary: batch vectorizer dictionary.
    :param sparse_theta: sparse theta parameter.
    :param sparse_phi: sparse phi parameter.
    :param decorrelator_phi: decorrelator phi parameter.
    :return: ARTM model.
    """
    print("Defining the model.")
    topic_names = ["topic_{}".format(i) for i in range(1, n_topics + 1)]
    model_artm = artm.ARTM(
        topic_names=topic_names,
        cache_theta=True,
        scores=[
            artm.PerplexityScore(name="PerplexityScore",
                                 dictionary=dictionary),
            artm.SparsityPhiScore(name="SparsityPhiScore"),
            artm.SparsityThetaScore(name="SparsityThetaScore"),
            artm.TopicKernelScore(name="TopicKernelScore",
                                  probability_mass_threshold=0.3),
            artm.TopTokensScore(name="TopTokensScore", num_tokens=15)
        ],
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name="SparseTheta",
                                              tau=sparse_theta),
            artm.SmoothSparsePhiRegularizer(name="SparsePhi", tau=sparse_phi),
            artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi",
                                            tau=decorrelator_phi)
        ])
    return model_artm
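# A short training sketch for the model defined above; the data path, pass
# count, and tau values are assumptions chosen to mirror other examples on
# this page.
import artm

batch_vectorizer = artm.BatchVectorizer(data_path='data.vw',
                                        data_format='vowpal_wabbit',
                                        target_folder='batches')
model = define_model(n_topics=20,
                     dictionary=batch_vectorizer.dictionary,
                     sparse_theta=-0.15,
                     sparse_phi=-0.1,
                     decorrelator_phi=1.5e+5)
model.initialize(dictionary=batch_vectorizer.dictionary)
model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=20)
print(model.score_tracker['PerplexityScore'].last_value)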
Example #5
    def set_scores(self):

        self.model.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 dictionary=self.dictionary))

        self.model.scores.add(
            artm.SparsityPhiScore(name='SparsityPhiScore',
                                  class_id='@default_class',
                                  topic_names=self.specific))
        self.model.scores.add(
            artm.SparsityThetaScore(name='SparsityThetaScore',
                                    topic_names=self.specific))

        # Fraction of background words in the whole collection
        self.model.scores.add(
            artm.BackgroundTokensRatioScore(name='BackgroundTokensRatioScore',
                                            class_id='@default_class'))

        # Kernel characteristics
        self.model.scores.add(
            artm.TopicKernelScore(name='TopicKernelScore',
                                  class_id='@default_class',
                                  topic_names=self.specific,
                                  probability_mass_threshold=0.5,
                                  dictionary=self.dictionary))

        # Looking at top tokens
        self.model.scores.add(
            artm.TopTokensScore(name='TopTokensScore',
                                class_id='@default_class',
                                num_tokens=100))
Example #6
def create_and_learn_ARTM_decorPhi_modal(name="",
                                         topic_number=750,
                                         num_collection_passes=1,
                                         weigths=[1., 1., 1., 1.],
                                         decorTau=1.0):

    batch_vectorizer_train = None
    batch_vectorizer_train = artm.BatchVectorizer(data_path='./' + name,
                                                  data_format='vowpal_wabbit',
                                                  target_folder='folder' +
                                                  name)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer_train.data_path)
    topic_names = ['topic_{}'.format(i) for i in range(topic_number)]

    model = artm.ARTM(topic_names=topic_names,
                      class_ids={
                          '@text': weights[0],
                          '@first': weights[1],
                          '@second': weights[2],
                          '@third': weights[3]
                      },
                      cache_theta=True,
                      theta_columns_naming='title',
                      scores=[
                          artm.PerplexityScore(name='PerplexityScore',
                                               dictionary=dictionary)
                      ])
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='DecorrelatorPhi_modals',
            tau=decorTau,
            class_ids=['@first', '@second', '@third']))

    model.initialize(dictionary=dictionary)

    model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model.scores.add(
        artm.TopicKernelScore(name='TopicKernelScore',
                              class_id='@text',
                              probability_mass_threshold=0.3))
    model.scores.add(
        artm.TopTokensScore(name='TopTokensScore',
                            num_tokens=6,
                            class_id='@text'))
    model.scores.add(
        artm.SparsityPhiScore(name='sparsity_phi_score', class_id='@third'))

    model.num_document_passes = 1

    model.fit_offline(batch_vectorizer=batch_vectorizer_train,
                      num_collection_passes=num_collection_passes)

    theta_train = model.transform(batch_vectorizer=batch_vectorizer_train)

    return model, theta_train
Example #7
    def _get_corpus_model(self,
                          corpus_vector_spaced,
                          clustering_method='artm'):
        if 'gensim' == clustering_method:
            return self._get_model_LSI(corpus_vector_spaced)
        elif 'sklearn' == clustering_method:
            return self._get_model_LDA(corpus_vector_spaced)
        elif 'artm' == clustering_method:
            batch_vectorizer = corpus_vector_spaced['batch_vectorizer']
            dictionary = corpus_vector_spaced['dictionary']

            topic_names = [
                'topic_{}'.format(i) for i in range(self.num_of_clusters)
            ]

            model_artm = artm.ARTM(
                topic_names=topic_names,
                cache_theta=True,
                scores=[
                    artm.PerplexityScore(name='PerplexityScore',
                                         dictionary=dictionary)
                ],
                regularizers=[
                    artm.SmoothSparseThetaRegularizer(name='SparseTheta',
                                                      tau=-0.15)
                ])

            model_artm.scores.add(
                artm.SparsityPhiScore(name='SparsityPhiScore'))
            model_artm.scores.add(
                artm.SparsityThetaScore(name='SparsityThetaScore'))
            model_artm.scores.add(
                artm.TopicKernelScore(name='TopicKernelScore',
                                      probability_mass_threshold=0.3))
            model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore',
                                                      num_tokens=10),
                                  overwrite=True)

            model_artm.regularizers.add(
                artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1))
            model_artm.regularizers['SparseTheta'].tau = -0.2
            model_artm.regularizers.add(
                artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                                tau=1.5e+5))

            model_artm.num_document_passes = 1

            model_artm.initialize(dictionary)
            model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                                   num_collection_passes=30)

            return model_artm.transform(batch_vectorizer=batch_vectorizer).T
Example #8
def add_standard_scores(
    model: artm.ARTM,
    dictionary: artm.Dictionary = None,
    main_modality: str = "@lemmatized",
    all_modalities: List[str] = ("@lemmatized", "@ngramms")
) -> None:
    """
    Adds standard scores for the model.

    Parameters
    ----------
    model
    dictionary
        Obsolete parameter, not used
    main_modality
    all_modalities
    """
    assert main_modality in all_modalities, "main_modality must be part of all_modalities"

    if dictionary is not None:
        warnings.warn('Parameter `dictionary` is obsolete:'
                      ' it is not used in the function "add_standard_scores"!')

    model.scores.add(
        artm.scores.PerplexityScore(
            name='PerplexityScore@all',
            class_ids=all_modalities,
        ))

    model.scores.add(artm.scores.SparsityThetaScore(name='SparsityThetaScore'))

    for modality in all_modalities:
        model.scores.add(
            artm.scores.SparsityPhiScore(
                name=f'SparsityPhiScore{modality}',
                class_id=modality,
            ))
        model.scores.add(
            artm.scores.PerplexityScore(
                name=f'PerplexityScore{modality}',
                class_ids=[modality],
            ))
        model.scores.add(
            artm.TopicKernelScore(
                name=f'TopicKernel{modality}',
                probability_mass_threshold=0.3,
                class_id=modality,
            ))
Example #9
def add_scores_to_model(current_dictionary, artm_model, n_top_tokens,
                        p_mass_threshold):
    artm_model.scores.add(
        artm.PerplexityScore(name='perplexity_score',
                             use_unigram_document_model=False,
                             dictionary=current_dictionary))
    artm_model.scores.add(
        artm.SparsityPhiScore(name='sparsity_phi_score', class_id='ngramm'))
    artm_model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    artm_model.scores.add(
        artm.TopicKernelScore(name='topic_kernel_score',
                              class_id='ngramm',
                              probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(
        artm.TopTokensScore(name='top_tokens_score',
                            class_id='ngramm',
                            num_tokens=n_top_tokens))
Example #10
def create_and_learn_PLSA(name="", topic_number=750, num_collection_passes=1):

    batch_vectorizer_train = artm.BatchVectorizer(data_path='./' + name,
                                                  data_format='vowpal_wabbit',
                                                  target_folder='folder' + name)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer_train.data_path)
    topic_names = ['topic_{}'.format(i) for i in range(topic_number)]

    model_plsa = artm.ARTM(topic_names=topic_names,
                           class_ids={
                               '@text': 1.0,
                               '@first': 1.0,
                               '@second': 1.0,
                               '@third': 1.0
                           },
                           cache_theta=True,
                           theta_columns_naming='title',
                           scores=[
                               artm.PerplexityScore(name='PerplexityScore',
                                                    dictionary=dictionary)
                           ])

    model_plsa.initialize(dictionary=dictionary)

    model_plsa.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model_plsa.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model_plsa.scores.add(
        artm.TopicKernelScore(name='TopicKernelScore',
                              class_id='@text',
                              probability_mass_threshold=0.3))
    model_plsa.scores.add(
        artm.TopTokensScore(name='TopTokensScore',
                            num_tokens=6,
                            class_id='@text'))

    model_plsa.num_document_passes = 1

    model_plsa.fit_offline(batch_vectorizer=batch_vectorizer_train,
                           num_collection_passes=num_collection_passes)

    theta_train = model_plsa.transform(batch_vectorizer=batch_vectorizer_train)

    return model_plsa, theta_train
Example #11
def create_model_with_background(dictionary, num_tokens, num_document_passes):

    sm_phi_tau = 0.0001 * 1e-4
    sp_phi_tau = 0.0001 * 1e-4  # magnitude only; applied with a negative sign below

    decor_phi_tau = 1  # unused while the decorrelator below stays commented out

    specific_topics = ['topic {}'.format(i) for i in range(1, 20)]
    topic_names = specific_topics + ["background"]
    scores = [
        artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary),
        artm.TopTokensScore(
            name='TopTokensScore', num_tokens=10, class_id='plain_text'
        ),  # web version of Palmetto works only with <= 10 tokens
        artm.SparsityPhiScore(name='SparsityPhiScore'),
        artm.SparsityThetaScore(name='SparsityThetaScore'),
        artm.TopicKernelScore(name='TopicKernelScore',
                              probability_mass_threshold=0.3,
                              class_id='plain_text')
    ]

    model = artm.ARTM(topic_names=topic_names,
                      regularizers=[],
                      cache_theta=True,
                      scores=scores,
                      class_ids={'plain_text': 1.0})

    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='SparsePhi',
                                        tau=-sp_phi_tau,
                                        topic_names=specific_topics))
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='SmoothPhi',
                                        tau=sm_phi_tau,
                                        topic_names=["background"]))
    # model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=decor_phi_tau))

    model.initialize(dictionary=dictionary)
    model.num_document_passes = num_document_passes

    return model
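# A usage sketch (the path and pass count are assumptions, and the batches
# must carry the 'plain_text' modality used above): fit the model, then
# inspect the dedicated background topic via the TopTokensScore registered
# in the constructor.
import artm

batch_vectorizer = artm.BatchVectorizer(data_path='data.vw',
                                        data_format='vowpal_wabbit',
                                        target_folder='batches')
model = create_model_with_background(batch_vectorizer.dictionary,
                                     num_tokens=10,
                                     num_document_passes=5)
model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=20)
print(model.score_tracker['TopTokensScore'].last_tokens['background'])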
Example #12
def add_scores_to_model(artm_model,
                        dictionary,
                        n_top_tokens,
                        p_mass_threshold,
                        class_name,
                        _debug_print=False):
    if _debug_print:
        print('[{}] adding scores'.format(datetime.now()))
    artm_model.scores.add(
        artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    artm_model.scores.add(
        artm.SparsityPhiScore(name='ss_phi_score', class_id=class_name))
    artm_model.scores.add(artm.SparsityThetaScore(name='ss_theta_score'))
    artm_model.scores.add(
        artm.TopicKernelScore(name='topic_kernel_score',
                              class_id=class_name,
                              probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(
        artm.TopTokensScore(name='top_tokens_score',
                            class_id=class_name,
                            num_tokens=n_top_tokens))
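# Once the model is fitted, each score registered above can be read back from
# the score_tracker; a sketch with an illustrative path and modality name:
import artm

batch_vectorizer = artm.BatchVectorizer(data_path='data.vw',
                                        data_format='vowpal_wabbit',
                                        target_folder='batches')
model = artm.ARTM(num_topics=20, dictionary=batch_vectorizer.dictionary)
add_scores_to_model(model,
                    batch_vectorizer.dictionary,
                    n_top_tokens=10,
                    p_mass_threshold=0.3,
                    class_name='@default_class')
model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=10)
print(model.score_tracker['perplexity_score'].value)  # one entry per pass
print(model.score_tracker['top_tokens_score'].last_tokens)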
Example #13
def init_score_tracker(model_artm, my_dictionary, class_id='text'):
    model_artm.scores.add(artm.PerplexityScore(name='PerplexityScore',
                                               dictionary=my_dictionary),
                          overwrite=True)

    model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore',
                                                class_id=class_id),
                          overwrite=True)

    model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'),
                          overwrite=True)

    model_artm.scores.add(artm.TopTokensScore(name="top_words",
                                              num_tokens=200,
                                              class_id=class_id),
                          overwrite=True)

    model_artm.scores.add(artm.TopicKernelScore(
        name='TopicKernelScore',
        class_id=class_id,
        probability_mass_threshold=0.6),
                          overwrite=True)
    print('Scores are set!')
Example #14
dictionary = batch_vectorizer.dictionary

topic_num = 10
tokens_num = 100
print("ARTM training")
topic_names = ['topic_{}'.format(i) for i in range(topic_num)]
model_artm = artm.ARTM(topic_names=topic_names, dictionary=dictionary, cache_theta=True)
model_plsa = artm.ARTM(topic_names=topic_names, cache_theta=True,
                       scores=[artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary)])
model_lda = artm.LDA(num_topics=topic_num)

model_artm.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
model_artm.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
model_artm.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
model_artm.scores.add(artm.TopTokensScore(name='top_tokens_score', num_tokens=tokens_num))
model_artm.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
model_artm.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
model_artm.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
model_artm.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

model_plsa.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
model_plsa.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
model_plsa.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
model_plsa.scores.add(artm.TopTokensScore(name='top_tokens_score'))
model_plsa.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
model_plsa.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
model_plsa.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
model_plsa.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
Example #15
                       ],
                       cache_theta=True)

if not os.path.isfile(filename + '/dictionary.dict'):
    dictionary.gather(data_path=batch_vectorizer.data_path)
    dictionary.save(dictionary_path=filename + '/dictionary.dict')

dictionary.load(dictionary_path=(filename + '/dictionary.dict'))

model_artm.initialize(dictionary=dictionary)

model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
model_artm.scores.add(
    artm.TopicKernelScore(name='TopicKernelScore',
                          probability_mass_threshold=0.3))

model_artm.regularizers.add(
    artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1))
model_artm.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=1.5e+5))
model_artm.regularizers.add(
    artm.TopicSelectionThetaRegularizer(name='TopicSelection', tau=0.25))

model_artm.regularizers['SparsePhi'].tau = -0.5
model_artm.regularizers['SparseTheta'].tau = -0.5
model_artm.regularizers['DecorrelatorPhi'].tau = 1e+5

model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore',
                                          num_tokens=10))
Example #16
def topic_model_clf(X, y, topic_num=30):
    labels_decreasing_size_order = list(y.value_counts().index)

    (X_train, X_test, y_train, y_test) = train_test_split(X,
                                                          y,
                                                          test_size=0.2,
                                                          stratify=y,
                                                          random_state=42)

    file_train = 'temp_files/X_train.txt'
    file_test = 'temp_files/X_test.txt'

    temp_df = pd.DataFrame()
    temp_df['text'] = X_train
    temp_df['class_label'] = y_train
    write_vw(temp_df, X_train.index, file_train)

    temp_df = pd.DataFrame()
    temp_df['text'] = X_test
    write_vw(temp_df, X_test.index, file_test)

    if len(glob.glob(os.path.join('batches_train', '*.batch'))) < 1:
        batch_vectorizer_train = artm.BatchVectorizer(
            data_path=file_train,
            data_format='vowpal_wabbit',
            target_folder='batches_train',
            gather_dictionary=True)
    else:
        batch_vectorizer_train = artm.BatchVectorizer(
            data_path='batches_train',
            data_format='batches',
            gather_dictionary=True)

    if len(glob.glob(os.path.join('batches_test', '*.batch'))) < 1:
        batch_vectorizer_test = artm.BatchVectorizer(
            data_path=file_test,
            data_format='vowpal_wabbit',
            target_folder='batches_test',
            gather_dictionary=True)
    else:
        batch_vectorizer_test = artm.BatchVectorizer(data_path='batches_test',
                                                     data_format='batches',
                                                     gather_dictionary=True)

    model = artm.ARTM(num_topics=topic_num,
                      class_ids={
                          '@text': 5.0,
                          '@class_label': 100.0
                      },
                      cache_theta=True,
                      dictionary=batch_vectorizer_train.dictionary,
                      theta_columns_naming='title')

    scores = [
        artm.PerplexityScore(name='Perplexity',
                             dictionary=batch_vectorizer_train.dictionary,
                             class_ids=['@text']),
        artm.SparsityPhiScore(name='SparsityPhiText', class_id='@text'),
        artm.SparsityPhiScore(name='SparsityPhiClasses',
                              class_id='@class_label'),
        artm.SparsityThetaScore(name='SparsityTheta'),
        artm.TopicKernelScore(name='TopicKernelText',
                              probability_mass_threshold=0.1,
                              class_id='@text'),
        artm.TopTokensScore(name='TopTokensText',
                            class_id='@text',
                            num_tokens=20),
        artm.TopTokensScore(name='TopTokensClasses',
                            class_id='@class_label',
                            num_tokens=10)
    ]

    regularizers = [
        artm.DecorrelatorPhiRegularizer(name='DeccorText',
                                        class_ids=['@text'],
                                        tau=10000),
        artm.SmoothSparsePhiRegularizer(name='SmoothPhiText',
                                        class_ids=['@text'],
                                        tau=0),
        artm.SmoothSparsePhiRegularizer(name='SmoothPhiClasses',
                                        class_ids=['@class_label'],
                                        tau=-1),
        # artm.SmoothSparsePhiRegularizer(name='SmoothBackgroundPhi', tau=100, topic_names=['background_topic']),
        artm.SmoothSparseThetaRegularizer(name='SmoothTheta', tau=-1.5),
        # artm.SmoothSparseThetaRegularizer(name='SmoothBackgroundTheta', tau=100, topic_names=['background_topic'])
    ]

    for r in regularizers:
        model.regularizers.add(r)
    for s in scores:
        model.scores.add(s)

    for i in tqdm(range(35)):
        model.fit_offline(batch_vectorizer=batch_vectorizer_train,
                          num_collection_passes=1)

    p_cd = model.transform(batch_vectorizer=batch_vectorizer_test,
                           predict_class_id='@class_label')

    # TODO: optimize this part
    y_pred = p_cd.idxmax(axis=0).astype(int)[[str(x)
                                              for x in X_test.index]].values
    # y_pred = p_cd[[str(x) for x in X_test.index]].idxmax(axis=0).values

    # metrics_visualization(target_pred=y_pred, target_true=y_test,
    #                       top_tokens_class=model.score_tracker['TopTokensClasses'],
    #                       top_tokens_text=model.score_tracker['TopTokensText'],
    #                       score_tracker=model.score_tracker,
    #                       scores_names=['Perplexity', 'SparsityPhiClasses',
    #                                     'SparsityPhiText', 'SparsityTheta'])

    print('Accuracy_score: {}'.format(accuracy_score(y_test, y_pred)))
    plt.hist(y_pred, color='g', label='pred')
    plt.hist(y_test, color='b', alpha=0.7, label='true')
    plt.title('Topic Model')
    plt.show()
    # print(classification_report(y_test, y_pred, labels=labels_decreasing_size_order))

    create_confusion_matrix(y_test,
                            y_pred,
                            labels=labels_decreasing_size_order).savefig(
                                '../../reports/topic_model_conf_matrix.png')

    micro_roc_auc = roc_auc_score(label_binarize(y_test,
                                                 classes=list(range(0, 17))),
                                  p_cd.T,
                                  average='micro')
    macro_roc_auc = roc_auc_score(label_binarize(y_test,
                                                 classes=list(range(0, 17))),
                                  p_cd.T,
                                  average='macro')
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    micro_f1 = f1_score(y_test, y_pred, average='micro')
    log_loss_score = log_loss(y_test, p_cd.T)

    return (micro_roc_auc, macro_roc_auc, micro_f1, macro_f1, log_loss_score,
            precision_recall_fscore_support(
                y_test, y_pred, labels=labels_decreasing_size_order))
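# A usage sketch with hypothetical inputs: X is a pandas Series of document
# texts and y a Series of integer class labels; write_vw and the temp_files/
# folder are assumed to exist, as in the function body above.
#
# results = topic_model_clf(X, y, topic_num=30)
# micro_roc_auc, macro_roc_auc, micro_f1, macro_f1, log_loss_score, prfs = results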
Example #17
def calc_coeffs():
    batch_vectorizer = artm.BatchVectorizer(data_path='lemmed.txt', data_format='vowpal_wabbit',
                                            target_folder='batches')

    dictionary = batch_vectorizer.dictionary

    topic_num = 10
    topic_names = ['topic_{}'.format(i) for i in range(topic_num)]
    model_artm = artm.ARTM(topic_names=topic_names, dictionary=dictionary, cache_theta=True)

    model_artm.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    model_artm.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_artm.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_artm.scores.add(artm.TopTokensScore(name='top_tokens_score'))
    model_artm.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))

    model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
    model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
    model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))

    best_tau_phi = -5.0
    best_tau_theta = -5.0
    best_perplexity = 1000000

    print("Started parameters choosing")

    for i in range(-20, 20, 5):
        for j in range(-20, 20, 5):
            model_artm.regularizers['sparse_phi_regularizer'].tau = (i / 10.0)
            model_artm.regularizers['sparse_theta_regularizer'].tau = (j / 10.0)
            model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=100)
            if model_artm.score_tracker['perplexity_score'].last_value < best_perplexity:
                best_perplexity = model_artm.score_tracker['perplexity_score'].last_value
                best_tau_phi = (i / 10.0)
                best_tau_theta = (j / 10.0)
                print(best_perplexity, " ", best_tau_phi, " ", best_tau_theta)

    print("RESULT 1 ", best_perplexity, " ", best_tau_phi, " ", best_tau_theta)

    for i in range(int(10 * best_tau_phi) - 5, int(10 * best_tau_phi) + 5, 1):
        for j in range(int(10 * best_tau_theta) - 5, int(10 * best_tau_theta) + 5, 1):
            model_artm.regularizers['sparse_phi_regularizer'].tau = (i / 10.0)
            model_artm.regularizers['sparse_theta_regularizer'].tau = (j / 10.0)
            model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=100)
            if model_artm.score_tracker['perplexity_score'].last_value < best_perplexity:
                best_perplexity = model_artm.score_tracker['perplexity_score'].last_value
                best_tau_phi = (i / 10.0)
                best_tau_theta = (j / 10.0)
                print(best_perplexity, " ", best_tau_phi, " ", best_tau_theta)

    print("RESULT 2 ", best_perplexity, " ", best_tau_phi, " ", best_tau_theta)

    for i in range(int(100 * best_tau_phi) - 10, int(100 * best_tau_phi) + 10, 1):
        for j in range(int(100 * best_tau_theta) - 10, int(100 * best_tau_theta) + 10, 1):
            model_artm.regularizers['sparse_phi_regularizer'].tau = (i / 100.0)
            model_artm.regularizers['sparse_theta_regularizer'].tau = (j / 100.0)
            model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=100)
            if model_artm.score_tracker['perplexity_score'].last_value < best_perplexity:
                best_perplexity = model_artm.score_tracker['perplexity_score'].last_value
                best_tau_phi = (i / 100.0)
                best_tau_theta = (j / 100.0)
                print(best_perplexity, " ", best_tau_phi, " ", best_tau_theta)

    print("RESULT 3 ", best_perplexity, " ", best_tau_phi, " ", best_tau_theta)
    return {"tau_phi": best_tau_phi, "tau_theta": best_tau_theta}
Example #18
batch_vectorizer = artm.BatchVectorizer(data_path=path + "\\" + subd + "\\" +
                                        "batches_pos",
                                        data_format='batches')

modelPLSA = artm.ARTM(topic_names=['topic_{}'.format(i) for i in range(100)],
                      scores=[
                          artm.PerplexityScore(
                              name='PerplexityScore',
                              use_unigram_document_model=False,
                              dictionary=batch_vectorizer.dictionary,
                              class_ids=["text"]),
                          artm.SparsityPhiScore(name='SparsityPhiScore',
                                                class_id="text"),
                          artm.SparsityThetaScore(name='SparsityThetaScore'),
                          artm.TopicKernelScore(name='TopicKernelScore',
                                                probability_mass_threshold=0.3,
                                                class_id="text"),
                          artm.TopTokensScore(name='TopTokensScore',
                                              num_tokens=100,
                                              class_id="text")
                      ],
                      cache_theta=True)

modelPLSA.initialize(dictionary=batch_vectorizer.dictionary)

modelPLSA.num_document_passes = 5

modelPLSA.fit_offline(batch_vectorizer=batch_vectorizer,
                      num_collection_passes=30)

print "===========================PLSA PerplexityScore start===================================="
Example #19
def test_func():
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()
    dump_folder = tempfile.mkdtemp()

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

        model_1 = artm.ARTM(num_processors=7,
                            cache_theta=True,
                            num_document_passes=5,
                            reuse_theta=True,
                            seed=10,
                            num_topics=15,
                            class_ids={'@default_class': 1.0},
                            theta_name='THETA',
                            dictionary=batch_vectorizer.dictionary)

        model_2 = artm.ARTM(num_processors=7,
                            cache_theta=False,
                            num_document_passes=5,
                            reuse_theta=False,
                            seed=10,
                            num_topics=15,
                            class_ids={'@default_class': 1.0},
                            dictionary=batch_vectorizer.dictionary)

        for model in [model_1, model_2]:
            model.scores.add(
                artm.PerplexityScore(name='perp',
                                     dictionary=batch_vectorizer.dictionary))
            model.scores.add(artm.SparsityThetaScore(name='sp_theta', eps=0.1))
            model.scores.add(artm.TopTokensScore(name='top_tok',
                                                 num_tokens=10))
            model.scores.add(
                artm.SparsityPhiScore(name='sp_nwt',
                                      model_name=model.model_nwt))
            model.scores.add(
                artm.TopicKernelScore(name='kernel',
                                      topic_names=model.topic_names[0:5],
                                      probability_mass_threshold=0.4))

            topic_pairs = {}
            for topic_name_1 in model.topic_names:
                for topic_name_2 in model.topic_names:
                    if topic_name_1 not in topic_pairs:
                        topic_pairs[topic_name_1] = {}
                    topic_pairs[topic_name_1][
                        topic_name_2] = numpy.random.randint(0, 3)

            model.regularizers.add(
                artm.DecorrelatorPhiRegularizer(name='decor',
                                                tau=100000.0,
                                                topic_pairs=topic_pairs))
            model.regularizers.add(
                artm.SmoothSparsePhiRegularizer(
                    name='smsp_phi',
                    tau=-0.5,
                    gamma=0.3,
                    dictionary=batch_vectorizer.dictionary))
            model.regularizers.add(
                artm.SmoothSparseThetaRegularizer(name='smsp_theta',
                                                  tau=0.1,
                                                  doc_topic_coef=[2.0] *
                                                  model.num_topics))
            model.regularizers.add(
                artm.SmoothPtdwRegularizer(name='sm_ptdw', tau=0.1))

            # learn first model and dump it on disc
            model.fit_offline(batch_vectorizer, num_collection_passes=10)
            model.fit_online(batch_vectorizer, update_every=1)

            model.dump_artm_model(os.path.join(dump_folder, 'target'))

            params = {}
            with open(os.path.join(dump_folder, 'target', 'parameters.json'),
                      'r') as fin:
                params = json.load(fin)
            _assert_json_params(params)

            # create second model from the dump and check the results are equal
            model_new = artm.load_artm_model(
                os.path.join(dump_folder, 'target'))

            _assert_params_equality(model, model_new)
            _assert_scores_equality(model, model_new)
            _assert_regularizers_equality(model, model_new)
            _assert_score_values_equality(model, model_new)
            _assert_matrices_equality(model, model_new)

            # continue learning of both models
            model.fit_offline(batch_vectorizer, num_collection_passes=3)
            model.fit_online(batch_vectorizer, update_every=1)

            model_new.fit_offline(batch_vectorizer, num_collection_passes=3)
            model_new.fit_online(batch_vectorizer, update_every=1)

            # check new results are also equal
            _assert_params_equality(model, model_new)
            _assert_scores_equality(model, model_new)
            _assert_regularizers_equality(model, model_new)
            _assert_score_values_equality(model, model_new)
            _assert_matrices_equality(model, model_new)

            shutil.rmtree(os.path.join(dump_folder, 'target'))
    finally:
        shutil.rmtree(batches_folder)
        shutil.rmtree(dump_folder)
Example #20
def test_func():
    # constants
    dictionary_name = 'dictionary'
    num_tokens = 11
    probability_mass_threshold = 0.9
    sp_reg_tau = -0.1
    decor_tau = 1.5e+5
    num_collection_passes = 15
    num_document_passes = 1
    num_topics = 15
    vocab_size = 6906
    num_docs = 3430

    data_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    batches_folder = tempfile.mkdtemp()

    sp_zero_eps = 0.001
    sparsity_phi_value = [
        0.034, 0.064, 0.093, 0.120, 0.145, 0.170, 0.194, 0.220, 0.246, 0.277,
        0.312, 0.351, 0.390, 0.428, 0.464
    ]

    sparsity_theta_value = [0.0] * num_collection_passes

    perp_zero_eps = 2.0
    perplexity_value = [
        6873, 2590, 2685, 2578, 2603, 2552, 2536, 2481, 2419, 2331, 2235, 2140,
        2065, 2009, 1964
    ]

    top_zero_eps = 0.0001
    top_tokens_num_tokens = [num_tokens * num_topics] * num_collection_passes
    top_tokens_topic_0_tokens = [
        u'party', u'state', u'campaign', u'tax', u'political', u'republican',
        u'senate', u'candidate', u'democratic', u'court', u'president'
    ]
    top_tokens_topic_0_weights = [
        0.0209, 0.0104, 0.0094, 0.0084, 0.0068, 0.0067, 0.0065, 0.0058, 0.0053,
        0.0053, 0.0051
    ]

    ker_zero_eps = 0.01
    topic_kernel_topic_0_contrast = 0.96
    topic_kernel_topic_0_purity = 0.014
    topic_kernel_topic_0_size = 18.0
    topic_kernel_average_size = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.13, 0.53, 1.6, 3.33, 7.13, 12.067,
        19.53, 27.8
    ]
    topic_kernel_average_contrast = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.12, 0.25, 0.7, 0.96, 0.96, 0.96,
        0.96, 0.97
    ]
    topic_kernel_average_purity = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.01, 0.015, 0.017, 0.02,
        0.03, 0.04, 0.05
    ]

    len_last_document_ids = 10

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

        model = artm.ARTM(
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            cache_theta=True)

        model.gather_dictionary(dictionary_name, batch_vectorizer.data_path)
        model.initialize(dictionary_name=dictionary_name)

        model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=sp_reg_tau))
        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                            tau=decor_tau))

        model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
        model.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 use_unigram_document_model=False,
                                 dictionary_name=dictionary_name))
        model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
        model.scores.add(
            artm.TopTokensScore(name='TopTokensScore', num_tokens=num_tokens))
        model.scores.add(
            artm.TopicKernelScore(
                name='TopicKernelScore',
                probability_mass_threshold=probability_mass_threshold))
        model.scores.add(artm.ThetaSnippetScore(name='ThetaSnippetScore'))

        model.num_document_passes = num_document_passes
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityPhiScore'].value[i] -
                       sparsity_phi_value[i]) < sp_zero_eps

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityThetaScore'].value[i] -
                       sparsity_theta_value[i]) < sp_zero_eps

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['PerplexityScore'].value[i] -
                       perplexity_value[i]) < perp_zero_eps

        for i in range(num_collection_passes):
            assert model.score_tracker['TopTokensScore'].num_tokens[
                i] == top_tokens_num_tokens[i]

        for i in range(num_tokens):
            assert model.score_tracker['TopTokensScore'].last_tokens[
                model.topic_names[0]][i] == top_tokens_topic_0_tokens[i]
            assert abs(model.score_tracker['TopTokensScore'].last_weights[
                model.topic_names[0]][i] -
                       top_tokens_topic_0_weights[i]) < top_zero_eps

        assert len(model.score_tracker['TopicKernelScore'].last_tokens[
            model.topic_names[0]]) > 0

        assert abs(topic_kernel_topic_0_contrast -
                   model.score_tracker['TopicKernelScore'].last_contrast[
                       model.topic_names[0]]) < ker_zero_eps
        assert abs(topic_kernel_topic_0_purity -
                   model.score_tracker['TopicKernelScore'].last_purity[
                       model.topic_names[0]]) < ker_zero_eps
        assert abs(topic_kernel_topic_0_size -
                   model.score_tracker['TopicKernelScore'].last_size[
                       model.topic_names[0]]) < ker_zero_eps

        for i in range(num_collection_passes):
            assert abs(
                model.score_tracker['TopicKernelScore'].average_size[i] -
                topic_kernel_average_size[i]) < ker_zero_eps
            assert abs(
                model.score_tracker['TopicKernelScore'].average_contrast[i] -
                topic_kernel_average_contrast[i]) < ker_zero_eps
            assert abs(
                model.score_tracker['TopicKernelScore'].average_purity[i] -
                topic_kernel_average_purity[i]) < ker_zero_eps

        model.fit_online(batch_vectorizer=batch_vectorizer)

        info = model.info
        assert info is not None
        assert len(info.config.topic_name) == num_topics
        assert len(info.score) == len(model.score_tracker)
        assert len(info.regularizer) == len(model.regularizers.data)
        assert len(info.cache_entry) > 0

        temp = model.score_tracker['ThetaSnippetScore'].last_document_ids
        assert len_last_document_ids == len(temp)
        assert len(model.score_tracker['ThetaSnippetScore'].last_snippet[
            temp[0]]) == num_topics

        phi = model.get_phi()
        assert phi.shape == (vocab_size, num_topics)
        theta = model.get_theta()
        assert theta.shape == (num_topics, num_docs)
    finally:
        shutil.rmtree(batches_folder)
Example #21
def test_func():
    # constants
    num_tokens = 11
    probability_mass_threshold = 0.9
    sp_reg_tau = -0.1
    decor_tau = 1.5e+5
    decor_rel_tau = 0.3
    num_collection_passes = 15
    num_document_passes = 1
    num_topics = 15
    vocab_size = 6906
    num_docs = 3430

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()

    sp_zero_eps = 0.001
    sparsity_phi_value = [
        0.034, 0.064, 0.093, 0.120, 0.145, 0.170, 0.194, 0.220, 0.246, 0.277,
        0.312, 0.351, 0.390, 0.428, 0.464
    ]

    sparsity_phi_rel_value = [
        0.442, 0.444, 0.444, 0.446, 0.448, 0.449, 0.458, 0.468, 0.476, 0.488,
        0.501, 0.522, 0.574, 0.609, 0.670
    ]

    sparsity_theta_value = [0.0] * num_collection_passes

    perp_zero_eps = 2.0
    perplexity_value = [
        6873, 2590, 2685, 2578, 2603, 2552, 2536, 2481, 2419, 2331, 2235, 2140,
        2065, 2009, 1964
    ]

    perplexity_rel_value = [
        6873, 2667, 2458, 2323, 2150, 2265, 2015, 1967, 1807, 1747, 1713, 1607,
        1632, 1542, 1469
    ]

    top_zero_eps = 0.0001
    top_tokens_num_tokens = [num_tokens * num_topics] * num_collection_passes
    top_tokens_topic_0_tokens = [
        u'party', u'state', u'campaign', u'tax', u'political', u'republican',
        u'senate', u'candidate', u'democratic', u'court', u'president'
    ]
    top_tokens_topic_0_weights = [
        0.0209, 0.0104, 0.0094, 0.0084, 0.0068, 0.0067, 0.0065, 0.0058, 0.0053,
        0.0053, 0.0051
    ]

    ker_zero_eps = 0.02
    topic_kernel_topic_0_contrast = 0.96
    topic_kernel_topic_0_purity = 0.014
    topic_kernel_topic_0_size = 18.0
    topic_kernel_average_size = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.13, 0.6, 1.6, 3.53, 7.15, 12.6,
        20.4, 29.06
    ]
    topic_kernel_average_contrast = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.12, 0.31, 0.7, 0.96, 0.96, 0.96,
        0.96, 0.97
    ]
    topic_kernel_average_purity = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.01, 0.015, 0.017, 0.02,
        0.03, 0.04, 0.05
    ]

    len_last_document_ids = 10

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)

        model = artm.ARTM(
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary=dictionary.name,
            cache_theta=True)

        model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=sp_reg_tau))
        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                            tau=decor_tau))

        model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
        model.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 use_unigram_document_model=False,
                                 dictionary=dictionary))
        model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
        model.scores.add(
            artm.TopTokensScore(name='TopTokensScore', num_tokens=num_tokens))
        model.scores.add(
            artm.TopicKernelScore(
                name='TopicKernelScore',
                probability_mass_threshold=probability_mass_threshold))
        model.scores.add(artm.ThetaSnippetScore(name='ThetaSnippetScore'))

        model.num_document_passes = num_document_passes
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityPhiScore'].value[i] -
                       sparsity_phi_value[i]) < sp_zero_eps

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityThetaScore'].value[i] -
                       sparsity_theta_value[i]) < sp_zero_eps

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['PerplexityScore'].value[i] -
                       perplexity_value[i]) < perp_zero_eps

        for i in range(num_collection_passes):
            assert model.score_tracker['TopTokensScore'].num_tokens[
                i] == top_tokens_num_tokens[i]

        for i in range(num_tokens):
            assert model.score_tracker['TopTokensScore'].last_tokens[
                model.topic_names[0]][i] == top_tokens_topic_0_tokens[i]
            assert abs(model.score_tracker['TopTokensScore'].last_weights[
                model.topic_names[0]][i] -
                       top_tokens_topic_0_weights[i]) < top_zero_eps

        assert len(model.score_tracker['TopicKernelScore'].last_tokens[
            model.topic_names[0]]) > 0

        assert abs(topic_kernel_topic_0_contrast -
                   model.score_tracker['TopicKernelScore'].last_contrast[
                       model.topic_names[0]]) < ker_zero_eps
        assert abs(topic_kernel_topic_0_purity -
                   model.score_tracker['TopicKernelScore'].last_purity[
                       model.topic_names[0]]) < ker_zero_eps
        assert abs(topic_kernel_topic_0_size -
                   model.score_tracker['TopicKernelScore'].last_size[
                       model.topic_names[0]]) < ker_zero_eps

        for i in range(num_collection_passes):
            assert abs(
                model.score_tracker['TopicKernelScore'].average_size[i] -
                topic_kernel_average_size[i]) < ker_zero_eps
            assert abs(
                model.score_tracker['TopicKernelScore'].average_contrast[i] -
                topic_kernel_average_contrast[i]) < ker_zero_eps
            assert abs(
                model.score_tracker['TopicKernelScore'].average_purity[i] -
                topic_kernel_average_purity[i]) < ker_zero_eps

        model.fit_online(batch_vectorizer=batch_vectorizer)

        info = model.info
        assert info is not None
        assert len(info.config.topic_name) == num_topics
        assert len(info.score) >= len(model.score_tracker)
        assert len(info.regularizer) == len(model.regularizers.data)
        assert len(info.cache_entry) > 0

        temp = model.score_tracker['ThetaSnippetScore'].last_document_ids
        assert len_last_document_ids == len(temp)
        assert len(model.score_tracker['ThetaSnippetScore'].last_snippet[
            temp[0]]) == num_topics

        phi = model.get_phi()
        assert phi.shape == (vocab_size, num_topics)
        theta = model.get_theta()
        assert theta.shape == (num_topics, num_docs)

        assert model.library_version.count('.') == 2  # major.minor.patch

        # test relative coefficients for Phi matrix regularizers
        model = artm.ARTM(num_topics=num_topics,
                          dictionary=dictionary.name,
                          cache_theta=False)

        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                            tau=decor_rel_tau))
        model.regularizers['DecorrelatorPhi'].gamma = 0.0

        model.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 use_unigram_document_model=False,
                                 dictionary=dictionary))
        model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))

        model.num_document_passes = num_document_passes
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityPhiScore'].value[i] -
                       sparsity_phi_rel_value[i]) < sp_zero_eps

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['PerplexityScore'].value[i] -
                       perplexity_rel_value[i]) < perp_zero_eps
    finally:
        shutil.rmtree(batches_folder)
Example #22
def model_train(batches_folder, models_folder_name, perform_actualize,
                tm_index, regularization_params, name, name_translit,
                index_tm):
    import artm
    import os
    import datetime
    import numpy as np

    from util.constants import BASE_DAG_DIR

    from nlpmonitor.settings import ES_CLIENT

    print("Initializing vectorizer, model")
    batch_vectorizer = artm.BatchVectorizer(data_path=batches_folder,
                                            data_format='batches')
    model_folder = os.path.join(BASE_DAG_DIR, models_folder_name)
    model_artm = artm.ARTM(num_topics=tm_index.number_of_topics,
                           class_ids={"text": 1},
                           theta_columns_naming="title",
                           reuse_theta=True,
                           cache_theta=True,
                           num_processors=4)
    if not perform_actualize:
        dictionary = artm.Dictionary()
        if "scopus" in name and os.path.exists(
                os.path.join("/big_data/", "scopus250k.dict")):
            print("Loading dictionary")
            dictionary.load(os.path.join("/big_data/", "scopus250k.dict"))
        else:
            print("Gathering dictionary")
            dictionary.gather(batch_vectorizer.data_path,
                              symmetric_cooc_values=True)
            print("Filtering dictionary")
            dictionary.filter(max_dictionary_size=250_000)
            if "scopus" in name and not os.path.exists(
                    os.path.join("/big_data/", "scopus250k.dict")):
                print("Saving dictionary")
                dictionary.save(os.path.join("/big_data/", "scopus250k.dict"))

        print("Model - initial settings")
        model_artm.initialize(dictionary)
        # Add scores
        model_artm.scores.add(artm.PerplexityScore(name='PerplexityScore'))
        model_artm.scores.add(
            artm.TopicKernelScore(name='TopicKernelScore',
                                  class_id='text',
                                  probability_mass_threshold=0.3))
        # Regularize
        model_artm.regularizers.add(
            artm.SmoothSparseThetaRegularizer(
                name='SparseTheta',
                tau=regularization_params['SmoothSparseThetaRegularizer']))
        model_artm.regularizers.add(
            artm.SmoothSparsePhiRegularizer(
                name='SparsePhi',
                tau=regularization_params['SmoothSparsePhiRegularizer']))
        model_artm.regularizers.add(
            artm.DecorrelatorPhiRegularizer(
                name='DecorrelatorPhi',
                tau=regularization_params['DecorrelatorPhiRegularizer']))
        model_artm.regularizers.add(
            artm.ImproveCoherencePhiRegularizer(
                name='ImproveCoherencePhi',
                tau=regularization_params['ImproveCoherencePhiRegularizer']))

        print("!!!", "Start model train", datetime.datetime.now())
        # Fit model
        model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                               num_collection_passes=10)
        if not os.path.exists(model_folder):
            os.mkdir(model_folder)
        model_artm.save(
            os.path.join(
                model_folder,
                f"model_{name if not name_translit else name_translit}.model"))

        print("!!!", "Get topics", datetime.datetime.now())
        # Create topics in ES
        topics = []
        phi = model_artm.get_phi()
        for topic in phi:
            phi_filtered = phi[phi[topic] > 0.0001]
            topic_words = [{
                "word": ind[1],
                "weight": float(phi[topic][ind])
            } for ind in phi_filtered[topic].index]
            topic_words = sorted(topic_words,
                                 key=lambda x: x['weight'],
                                 reverse=True)[:100]
            topics.append({
                "id": topic,
                "topic_words": topic_words,
                "name": ", ".join([w['word'] for w in topic_words[:5]])
            })

        # Add metrics
        purity = np.mean(
            model_artm.score_tracker['TopicKernelScore'].last_average_purity)
        contrast = np.mean(
            model_artm.score_tracker['TopicKernelScore'].last_average_contrast)
        coherence = np.mean(
            model_artm.score_tracker['TopicKernelScore'].average_coherence)
        perplexity = model_artm.score_tracker['PerplexityScore'].last_value
        print("!!!", "Write topics", datetime.datetime.now())
        update_body = {
            "topics": topics,
            "purity": purity,
            "contrast": contrast,
            "coherence": coherence,
            "perplexity": perplexity,
            "tau_smooth_sparse_theta": regularization_params['SmoothSparseThetaRegularizer'],
            "tau_smooth_sparse_phi": regularization_params['SmoothSparsePhiRegularizer'],
            "tau_decorrelator_phi": regularization_params['DecorrelatorPhiRegularizer'],
            "tau_coherence_phi": regularization_params['ImproveCoherencePhiRegularizer'],
        }
        ES_CLIENT.update(index=index_tm,
                         id=tm_index.meta.id,
                         body={"doc": update_body})
    else:
        print("!!!", "Loading existing model")
        # Monkey-patch to work around a BigARTM model-loading bug
        model_artm.load = load_monkey_patch
        model_artm.load(
            model_artm,
            os.path.join(
                model_folder,
                f"model_{name if not name_translit else name_translit}.model"))
    return model_artm, batch_vectorizer