Пример #1
0
def two_experiment_enviroments(request):
    """
    Build two identically configured topic models, each in its own experiment.

    Both ARTM models use 5 topics, one processor, one document pass, cached
    theta and the same two scores, and share the dictionary taken from the
    test dataset.

    Parameters
    ----------
    request
        not used by the body (presumably the pytest ``request`` object)

    Returns
    -------
    tuple
        ``(tm_1, experiment_1, tm_2, experiment_2, dataset, dictionary)``
    """
    with warnings.catch_warnings():
        # Silence the W_DIFF_BATCHES_1 warning while the dataset is created.
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset('tests/test_data/test_dataset.csv')
        dictionary = dataset.get_dictionary()

    def _make_artm():
        # Every call creates fresh score objects, so the models share nothing
        # except the dictionary.
        tracked_scores = [
            artm.PerplexityScore(name='PerplexityScore'),
            artm.SparsityPhiScore(name='SparsityPhiScore', class_id=MAIN_MODALITY),
        ]
        return artm.ARTM(
            num_processors=1,
            num_topics=5,
            cache_theta=True,
            num_document_passes=1,
            dictionary=dictionary,
            scores=tracked_scores,
        )

    first_artm = _make_artm()
    second_artm = _make_artm()

    tm_1 = TopicModel(first_artm, model_id='new_id_1')
    tm_2 = TopicModel(second_artm, model_id='new_id_2')

    experiment_1 = Experiment(
        topic_model=tm_1, experiment_id="test_1", save_path="tests/experiments",
    )
    experiment_2 = Experiment(
        topic_model=tm_2, experiment_id="test_2", save_path="tests/experiments",
    )

    return tm_1, experiment_1, tm_2, experiment_2, dataset, dictionary
Пример #2
0
    def init_hierarchical_model(class_ids):
        """Create an ``artm.hARTM`` with a fixed set of scores and regularizers.

        :param class_ids: forwarded unchanged to ``artm.hARTM(class_ids=...)``.
        :return: the configured ``artm.hARTM`` instance; every regularizer
            starts with ``tau=0``.
        """
        # NOTE(review): the class ids are not spelled uniformly below
        # ('bigrams' vs 'bigram', 'body' vs 'words') -- confirm against the
        # modalities actually present in the data.
        perplexity_scores = [
            artm.PerplexityScore(name='perplexity_words', class_ids=['body']),
            artm.PerplexityScore(name='perplexity_bigrams', class_ids=['bigrams']),
        ]
        top_token_scores = [
            artm.TopTokensScore(name='top_words', num_tokens=15, class_id='body'),
            artm.TopTokensScore(name='top_bigrams', num_tokens=10, class_id='bigrams'),
        ]
        sparsity_scores = [
            artm.SparsityThetaScore(name='sparsity_theta', eps=1e-6),
            artm.SparsityPhiScore(name='sparsity_phi_words', class_id='words', eps=1e-6),
            artm.SparsityPhiScore(name='sparsity_phi_bigrams', class_id='bigrams', eps=1e-6),
        ]

        # Decorrelators first, then the Theta smoother, then the Phi smoothers.
        all_regularizers = [
            artm.DecorrelatorPhiRegularizer(tau=0, class_ids=[modality], name=reg_name)
            for modality, reg_name in (
                ('body', 'decorr_words'),
                ('bigram', 'decorr_bigrams'),
                ('categories', 'decorr_categories'),
            )
        ]
        all_regularizers.append(
            artm.SmoothSparseThetaRegularizer(tau=0, name='sparsity_theta'))
        all_regularizers.extend(
            artm.SmoothSparsePhiRegularizer(tau=0, class_ids=[modality], name=reg_name)
            for modality, reg_name in (
                ('body', 'sparsity_words'),
                ('bigram', 'sparsity_bigrams'),
            )
        )

        return artm.hARTM(
            class_ids=class_ids,
            cache_theta=True,
            reuse_theta=True,
            scores=perplexity_scores + top_token_scores + sparsity_scores,
            regularizers=all_regularizers,
            theta_columns_naming='title',
        )
def add_complex_scores_to_model(artm_model,
                                n_top_tokens,
                                p_mass_threshold,
                                common_topics,
                                subject_topics,
                                class_name,
                                _debug_print=False):
    """Attach the same five scores for the subject topics and the common topics.

    For each of the two topic groups the following scores are added to
    ``artm_model.scores`` (``<group>`` is ``subject`` or ``common``):
    ``perplexity_score_<group>``, ``ss_phi_score_<group>``,
    ``ss_theta_score_<group>``, ``topic_kernel_score_<group>`` and
    ``top_tokens_score_<group>``. Subject scores are added first.

    :param artm_model: model whose ``scores`` collection is extended.
    :param n_top_tokens: ``num_tokens`` of the top-tokens scores.
    :param p_mass_threshold: ``probability_mass_threshold`` of the kernel scores.
    :param common_topics: topic names used for the ``*_common`` scores.
    :param subject_topics: topic names used for the ``*_subject`` scores.
    :param class_name: ``class_id`` of the Phi-sparsity, kernel and top-tokens scores.
    :param _debug_print: when true, print a timestamped progress line.
    """
    if _debug_print:
        # Call form works on both Python 2 and Python 3 (the statement form
        # is a SyntaxError on Python 3).
        print('[{}] adding scores'.format(datetime.now()))

    # NOTE(review): ``dictionary`` is not a parameter; it is resolved from the
    # enclosing/module scope -- confirm it is defined where this function is used.
    for group, topics in (('subject', subject_topics),
                          ('common', common_topics)):
        artm_model.scores.add(
            artm.PerplexityScore(name='perplexity_score_' + group,
                                 dictionary=dictionary,
                                 topic_names=topics))
        artm_model.scores.add(
            artm.SparsityPhiScore(name='ss_phi_score_' + group,
                                  class_id=class_name,
                                  topic_names=topics))
        artm_model.scores.add(
            artm.SparsityThetaScore(name='ss_theta_score_' + group,
                                    topic_names=topics))
        artm_model.scores.add(
            artm.TopicKernelScore(name='topic_kernel_score_' + group,
                                  class_id=class_name,
                                  topic_names=topics,
                                  probability_mass_threshold=p_mass_threshold))
        artm_model.scores.add(
            artm.TopTokensScore(name='top_tokens_score_' + group,
                                class_id=class_name,
                                topic_names=topics,
                                num_tokens=n_top_tokens))
Пример #4
0
def test_phi_matrix_after_lda_sampled_regularizer(experiment_enviroment):
    """
    Check that fitting with ``TopicPriorSampledRegularizer`` changes Phi.

    Two identically configured ARTM models are fitted for 10 iterations on
    the same batches; only the first one carries the custom regularizer.
    The test passes when at least one Phi entry differs between the models.

    Parameters
    ----------
    experiment_enviroment
        requested but not read by the body
    """
    with warnings.catch_warnings():
        # Silence the W_DIFF_BATCHES_1 warning while the dataset is created.
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset(DATA_PATH)
        dictionary = dataset.get_dictionary()
        batch_vectorizer = dataset.get_batch_vectorizer()

    topic_prior_reg = TopicPriorSampledRegularizer(
        name='topic_prior',
        tau=5,
        num_topics=5,
        beta_prior=[10, 1, 100, 2, 1000])

    model_artm_1 = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        class_ids={
            MAIN_MODALITY: 1.0,
            NGRAM_MODALITY: 1.0
        },
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore', )],
    )
    model_artm_2 = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        class_ids={
            MAIN_MODALITY: 1.0,
            NGRAM_MODALITY: 1.0
        },
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore', )],
    )

    # Only the first model gets the custom regularizer.
    tm_1 = TopicModel(
        model_artm_1,
        model_id='new_id_1',
        custom_regularizers={topic_prior_reg.name: topic_prior_reg})
    tm_2 = TopicModel(model_artm_2, model_id='new_id_2')

    tm_1._fit(batch_vectorizer, 10)
    tm_2._fit(batch_vectorizer, 10)

    phi_first = tm_1.get_phi()
    phi_second = tm_2.get_phi()

    # The comparison yields an element-wise boolean frame. The builtin
    # ``any()`` would iterate over its column labels (always truthy), so the
    # frame is reduced along both axes instead.
    assert (phi_first != phi_second).any().any(), \
        'Phi matrices are the same after regularization.'
Пример #5
0
def experiment(filename, tau_phi, tau_theta):
    """Train a PLSA-like, a regularized ARTM and an LDA model and report measures.

    :param filename: Vowpal Wabbit file; batches are written to ``batches``.
    :param tau_phi: tau assigned to ``sparse_phi_regularizer``.
    :param tau_theta: tau assigned to ``sparse_theta_regularizer``.

    All three models use 30 topics and are fitted offline for 100 collection
    passes; the result is handed to ``print_measures``. Nothing is returned.
    """
    vectorizer = artm.BatchVectorizer(
        data_path=filename,
        data_format='vowpal_wabbit',
        target_folder='batches',
    )
    dictionary = vectorizer.dictionary

    topic_num = 30
    tokens_num = 100
    print("ARTM training")
    topic_names = list(map('topic_{}'.format, range(topic_num)))

    model_artm = artm.ARTM(topic_names=topic_names, dictionary=dictionary, cache_theta=True)
    model_plsa = artm.ARTM(
        topic_names=topic_names,
        cache_theta=True,
        scores=[artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary)],
    )
    model_lda = artm.LDA(num_topics=topic_num)

    def _attach_scores(model, **top_tokens_options):
        # Same eight scores for both ARTM models; only the options of the
        # top-tokens score differ between them.
        for score in (
            artm.PerplexityScore(name='perplexity_score', dictionary=dictionary),
            artm.SparsityPhiScore(name='sparsity_phi_score'),
            artm.SparsityThetaScore(name='sparsity_theta_score'),
            artm.TopTokensScore(name='top_tokens_score', **top_tokens_options),
            artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3),
            artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'),
            artm.ClassPrecisionScore(name='class_precision_score'),
            artm.TopicMassPhiScore(name='topic_mass_phi_score'),
        ):
            model.scores.add(score)

    _attach_scores(model_artm, num_tokens=tokens_num)
    _attach_scores(model_plsa)

    # Regularizers go to the ARTM model only: all are added first, then the
    # tau coefficients are assigned.
    regularizer_plan = (
        (artm.SmoothSparsePhiRegularizer, 'sparse_phi_regularizer', tau_phi),
        (artm.SmoothSparseThetaRegularizer, 'sparse_theta_regularizer', tau_theta),
        (artm.DecorrelatorPhiRegularizer, 'decorrelator_phi_regularizer', 1e+3),
    )
    for reg_class, reg_name, _ in regularizer_plan:
        model_artm.regularizers.add(reg_class(name=reg_name))
    for _, reg_name, reg_tau in regularizer_plan:
        model_artm.regularizers[reg_name].tau = reg_tau

    training_order = (model_plsa, model_artm, model_lda)
    for model in training_order:
        model.initialize(dictionary=dictionary)

    passes = 100
    for model in training_order:
        model.fit_offline(batch_vectorizer=vectorizer, num_collection_passes=passes)

    print_measures(model_plsa, model_artm, model_lda)
Пример #6
0
    def set_scores(self):
        """Register the quality scores on ``self.model``.

        Reads ``self.dictionary`` and ``self.specific`` (topic names used for
        the sparsity and kernel scores). Returns nothing.
        """
        add_score = self.model.scores.add
        default_class = '@default_class'

        perplexity = artm.PerplexityScore(name='PerplexityScore',
                                          dictionary=self.dictionary)
        add_score(perplexity)

        # Sparsity of both matrices, limited to the specific topics
        phi_sparsity = artm.SparsityPhiScore(name='SparsityPhiScore',
                                             class_id=default_class,
                                             topic_names=self.specific)
        add_score(phi_sparsity)
        theta_sparsity = artm.SparsityThetaScore(name='SparsityThetaScore',
                                                 topic_names=self.specific)
        add_score(theta_sparsity)

        # Share of background words in the whole collection
        background_ratio = artm.BackgroundTokensRatioScore(
            name='BackgroundTokensRatioScore',
            class_id=default_class)
        add_score(background_ratio)

        # Topic kernel characteristics
        kernel = artm.TopicKernelScore(name='TopicKernelScore',
                                       class_id=default_class,
                                       topic_names=self.specific,
                                       probability_mass_threshold=0.5,
                                       dictionary=self.dictionary)
        add_score(kernel)

        # Top tokens of every topic
        top_tokens = artm.TopTokensScore(name='TopTokensScore',
                                         class_id=default_class,
                                         num_tokens=100)
        add_score(top_tokens)
Пример #7
0
def test_fancy_fit_is_ok(experiment_enviroment):
    """
    Fit a multimodal model with a Theta regularizer, then save and reload it.

    Verifies that the regularizer is listed in the jsonable parameters and
    that the Phi matrix of the reloaded model equals the original one.
    """
    _, dataset, _, dictionary = experiment_enviroment

    modality_weights = {
        MAIN_MODALITY: 1,
        NGRAM_MODALITY: 1,
        EXTRA_MODALITY: 1,
        '@psyduck': 42
    }
    theta_smoother = artm.SmoothSparseThetaRegularizer(name='smooth_theta', tau=10.0)
    model_artm = artm.ARTM(
        num_topics=5,
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
        theta_columns_naming='title',
        class_ids=modality_weights,
        regularizers=[theta_smoother],
    )

    tm = TopicModel(
        model_artm,
        model_id='absolutely_new_id',
        custom_scores={'mean_kernel_size': ScoreExample()},
    )

    num_iterations = 10
    tm._fit(dataset.get_batch_vectorizer(), num_iterations)

    jsonable = tm.get_jsonable_from_parameters()
    assert "smooth_theta" in jsonable["regularizers"]

    # Round trip through the disk and compare Phi element-wise.
    save_dir = "tests/experiments/save_standalone/"
    tm.save(save_dir)
    reloaded = TopicModel.load(save_dir)
    phi_matches = tm.get_phi() == reloaded.get_phi()
    assert phi_matches.all().all()
Пример #8
0
def define_model(n_topics: int, dictionary: artm.Dictionary,
                 sparse_theta: float, sparse_phi: float,
                 decorrelator_phi: float) -> artm.artm_model.ARTM:
    """
    Build an ARTM model with five scores and three regularizers.

    Topics are named ``topic_1`` ... ``topic_<n_topics>``.
    :param n_topics: number of topics.
    :param dictionary: dictionary passed to the perplexity score.
    :param sparse_theta: tau of the ``SparseTheta`` regularizer.
    :param sparse_phi: tau of the ``SparsePhi`` regularizer.
    :param decorrelator_phi: tau of the ``DecorrelatorPhi`` regularizer.
    :return: the configured ARTM model.
    """
    print("Defining the model.")
    names = list(map("topic_{}".format, range(1, n_topics + 1)))

    tracked_scores = [
        artm.PerplexityScore(name="PerplexityScore", dictionary=dictionary),
        artm.SparsityPhiScore(name="SparsityPhiScore"),
        artm.SparsityThetaScore(name="SparsityThetaScore"),
        artm.TopicKernelScore(name="TopicKernelScore",
                              probability_mass_threshold=0.3),
        artm.TopTokensScore(name="TopTokensScore", num_tokens=15),
    ]
    applied_regularizers = [
        artm.SmoothSparseThetaRegularizer(name="SparseTheta", tau=sparse_theta),
        artm.SmoothSparsePhiRegularizer(name="SparsePhi", tau=sparse_phi),
        artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi",
                                        tau=decorrelator_phi),
    ]

    return artm.ARTM(
        topic_names=names,
        cache_theta=True,
        scores=tracked_scores,
        regularizers=applied_regularizers,
    )
Пример #9
0
def fit():
    """Build a one-batch collection from the request, fit a model and save it.

    Reads ``terms`` (an iterable of documents, each an iterable of terms) and
    ``topics`` (number of topics) from ``request.json``. A directory named
    after a fresh UUID receives the serialized batch (``batch.batch``) and
    the fitted model (``model``).

    :return: ``jsonify({"id": batch_id})`` with the generated identifier.
    """
    batch_id = str(uuid.uuid4())
    app.logger.info("batch %s", batch_id)

    rjson = request.json
    terms = rjson['terms']
    topics_cnt = rjson['topics']

    # term -> index into ``all_terms``; indices follow first appearance.
    term_to_id = {}
    all_terms = []

    batch = artm.messages.Batch()
    batch.id = batch_id

    for i, doc in enumerate(terms):
        item = batch.item.add()
        item.id = i
        field = item.field.add()
        for term in doc:
            if term not in term_to_id:
                term_to_id[term] = len(all_terms)
                all_terms.append(term)
            # One entry per occurrence, each with count 1.
            field.token_id.append(term_to_id[term])
            field.token_count.append(1)

    for t in all_terms:
        batch.token.append(t)

    os.mkdir(batch_id)
    with open(os.path.join(batch_id, "batch.batch"), 'wb') as fout:
        fout.write(batch.SerializeToString())

    app.logger.info("batch %s is created", batch_id)

    dictionary = artm.Dictionary()
    dictionary.gather(batch_id)

    model_artm = artm.ARTM(
        # ``range`` works on both Python 2 and 3; ``xrange`` exists only on 2.
        topic_names=['topic_{}'.format(i) for i in range(topics_cnt)],
        scores=[
            artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary)
        ],
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.15)
        ],
        show_progress_bars=False)

    batch_vectorizer = artm.BatchVectorizer(data_path=batch_id,
                                            data_format="batches")

    model_artm.initialize(dictionary=dictionary)
    app.logger.info("model is starting to fit")
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=1)
    app.logger.info("mode was fitted")

    model_artm.save(os.path.join(batch_id, "model"))

    return jsonify({"id": batch_id})
Пример #10
0
def experiment_enviroment(request):
    """
    Prepare a topic model with a custom score inside a fresh experiment.

    Parameters
    ----------
    request
        not used by the body (presumably the pytest ``request`` object)

    Returns
    -------
    tuple
        ``(tm, dataset, experiment, dictionary)``
    """
    with warnings.catch_warnings():
        # Silence the W_DIFF_BATCHES_1 warning while the dataset is created.
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset('tests/test_data/test_dataset.csv')
        dictionary = dataset.get_dictionary()

    artm_settings = dict(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore', )],
    )
    model_artm = artm.ARTM(**artm_settings)

    # The sparsity scores are attached after the model has been constructed.
    phi_sparsity = artm.SparsityPhiScore(name='SparsityPhiScore')
    model_artm.scores.add(phi_sparsity)
    theta_sparsity = artm.SparsityThetaScore(name='SparsityThetaScore')
    model_artm.scores.add(theta_sparsity)

    example_score = ScoreExample()
    tm = TopicModel(
        model_artm,
        model_id='new_id',
        custom_scores={'example_score': example_score},
    )
    experiment = Experiment(
        tm,
        experiment_id="test_cube_creator",
        save_path="tests/experiments",
    )
    return tm, dataset, experiment, dictionary
Пример #11
0
def experiment_enviroment(request):
    """
    Prepare a three-modality topic model in an experiment, with cleanup.

    A finalizer registered on ``request`` removes ``tests/experiments`` and
    the dataset's internal folder.

    Returns
    -------
    tuple
        ``(tm, dataset, experiment, dictionary)``
    """
    with warnings.catch_warnings():
        # Silence the W_DIFF_BATCHES_1 warning while the dataset is created.
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset('tests/test_data/test_dataset.csv')
        dictionary = dataset.get_dictionary()

    modality_weights = {
        MAIN_MODALITY: 1.0,
        NGRAM_MODALITY: 1.0,
        EXTRA_MODALITY: 1.0,
    }
    model_artm = artm.ARTM(
        num_topics=5,
        class_ids=modality_weights,
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore', )],
        theta_columns_naming='title',
    )

    tm = TopicModel(
        model_artm,
        model_id='new_id',
        custom_scores={'mean_kernel_size': ScoreExample()},
    )
    experiment = Experiment(
        topic_model=tm,
        experiment_id="test",
        save_path="tests/experiments",
    )

    def _remove_artifacts():
        """Delete the experiment folder, then the dataset internals."""
        for folder in ("tests/experiments", dataset._internals_folder_path):
            shutil.rmtree(folder)

    request.addfinalizer(_remove_artifacts)

    return tm, dataset, experiment, dictionary
Пример #12
0
def experiment_enviroment(request):
    """
    Prepare a two-modality topic model inside the ``test_cubes`` experiment.

    Parameters
    ----------
    request
        not used by the body (presumably the pytest ``request`` object)

    Returns
    -------
    tuple
        ``(tm, dataset, experiment, dictionary)``
    """
    with warnings.catch_warnings():
        # Silence the W_DIFF_BATCHES_1 warning while the dataset is created.
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset(DATA_PATH)
        dictionary = dataset.get_dictionary()

    modality_weights = {MAIN_MODALITY: 1.0, NGRAM_MODALITY: 1.0}
    perplexity = artm.PerplexityScore(name='PerplexityScore', )
    model_artm = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        class_ids=modality_weights,
        num_document_passes=1,
        dictionary=dictionary,
        scores=[perplexity],
    )

    tm = TopicModel(model_artm, model_id='new_id')
    experiment = Experiment(
        topic_model=tm,
        experiment_id="test_cubes",
        save_path="tests/experiments",
    )

    return tm, dataset, experiment, dictionary
Пример #13
0
    def setup_class(cls):
        """
        Fit one shared topic model and load the raw text lines for the class.

        Sets ``cls.topic_model`` (fitted for ``NUM_ITERATIONS`` iterations)
        and ``cls.raw_data`` (every line of ``tests/test_data/test_vw.txt``
        split on single spaces).
        """
        with warnings.catch_warnings():
            # Silence the W_DIFF_BATCHES_1 warning while the data is prepared.
            warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
            dataset = Dataset('tests/test_data/test_dataset.csv')
            with open('tests/test_data/test_vw.txt', encoding='utf-8') as vw_file:
                split_lines = [vw_line.split(' ') for vw_line in vw_file]
            dictionary = dataset.get_dictionary()
            vectorizer = dataset.get_batch_vectorizer()

        modality_weights = dict.fromkeys(CLASS_IDS, 1.0)
        perplexity = artm.PerplexityScore(name='PerplexityScore')
        model_artm = artm.ARTM(
            num_topics=NUM_TOPICS,
            class_ids=modality_weights,
            topic_names=TOPIC_NAMES,
            cache_theta=True,
            num_document_passes=NUM_DOCUMENT_PASSES,
            dictionary=dictionary,
            scores=[perplexity],
        )

        fitted = TopicModel(model_artm, model_id='model_id')
        cls.topic_model = fitted
        fitted._fit(vectorizer, num_iterations=NUM_ITERATIONS)
        cls.raw_data = split_lines
Пример #14
0
    def init_model(self, dictionary_path=None):
        """Prepare ``self.dictionary`` and build ``self.model``.

        dictionary_path: optional path of a text dictionary to load (used
            with a pretrained model). When omitted, the dictionary is gathered
            from ``self.batches_path``, filtered with ``min_tf=10`` and
            ``max_df_rate=0.1``, and saved as text under
            ``{self.dir_path}/dicts/``.

        The model gets three scores and three regularizers.
        """
        self.dictionary = artm.Dictionary()
        if dictionary_path is not None:
            self.dictionary.load_text(dictionary_path)
        else:
            self.dictionary.gather(data_path=self.batches_path)
            self.dictionary.filter(min_tf=10, max_df_rate=0.1)
            saved_dictionary = f"{self.dir_path}/dicts/dict_{self.name_dataset}.txt"
            self.dictionary.save_text(saved_dictionary)

        self.model = artm.ARTM(
            num_topics=self.n_topics,
            dictionary=self.dictionary,
            show_progress_bars=True,
        )

        # scores
        add_score = self.model.scores.add
        add_score(artm.PerplexityScore(name="PerplexityScore",
                                       dictionary=self.dictionary))
        add_score(artm.SparsityThetaScore(name="SparsityThetaScore"))
        add_score(artm.SparsityPhiScore(name="SparsityPhiScore"))

        # regularizers
        add_regularizer = self.model.regularizers.add
        add_regularizer(
            artm.SmoothSparsePhiRegularizer(name="SparsePhi", tau=-0.1))
        add_regularizer(
            artm.SmoothSparseThetaRegularizer(name="SparseTheta", tau=-0.5))
        add_regularizer(
            artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi", tau=1.5e5))
Пример #15
0
def test_custom_regularizer_cubed_controlled(experiment_enviroment,
                                             thread_flag, by_name):
    """
    Check the tau of a custom regularizer driven by RegularizationControllerCube.

    The cube uses the converter ``prev_tau * 2`` for 10 iterations; every
    restored model is expected to end with ``5 * 2 ** 10``. With ``by_name``
    the regularizer is attached to the model and referenced by name,
    otherwise the regularizer object itself is given to the cube.
    """
    _, dataset, _, dictionary = experiment_enviroment
    multiplier = 2
    initial_tau = 5
    num_iter = 10

    custom_reg = TopicPriorSampledRegularizer(
        name='topic_prior',
        tau=initial_tau,
        num_topics=5,
        beta_prior=[10, 1, 100, 2, 1000],
    )

    modality_weights = {MAIN_MODALITY: 1.0, NGRAM_MODALITY: 1.0}
    model_artm = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        class_ids=modality_weights,
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore', )],
    )

    if by_name:
        attached_regularizers = {custom_reg.name: custom_reg}
    else:
        attached_regularizers = {}
    tm = TopicModel(
        model_artm,
        model_id='new_id_1',
        custom_regularizers=attached_regularizers)
    experiment = Experiment(  # noqa: F841
        tm,
        experiment_id="cubed_controlled_reg",
        save_path="tests/experiments")

    parameters = {
        "score_to_track": None,
        "tau_converter": f"prev_tau * {multiplier}",
        "user_value_grid": [0.3],
        "max_iters": float("inf")
    }
    # The regularizer is referenced either by its name or as an object.
    if by_name:
        reference_key, reference = "reg_name", custom_reg.name
    else:
        reference_key, reference = "regularizer", custom_reg
    parameters[reference_key] = reference

    cube = RegularizationControllerCube(
        num_iter=num_iter,
        parameters=parameters,
        reg_search="grid",
        use_relative_coefficients=False,
        separate_thread=thread_flag,
    )
    dummies = cube(tm, dataset)

    restored_models = [dummy.restore() for dummy in dummies]

    for restored in restored_models:
        final_tau = restored.all_regularizers[custom_reg.name].tau

        assert final_tau == initial_tau * (multiplier**num_iter)
Пример #16
0
def create_and_learn_ARTM_decorPhi_modal(name="",
                                         topic_number=750,
                                         num_collection_passes=1,
                                         weigths=(1., 1., 1., 1.),
                                         decorTau=1.0):
    """Train a four-modality ARTM model with a Phi decorrelator.

    :param name: Vowpal Wabbit file name; read from ``'./' + name``, batches
        are written to ``'folder' + name``.
    :param topic_number: number of topics (named ``topic_0`` ...).
    :param num_collection_passes: passes for ``fit_offline``.
    :param weigths: indexable with at least four weights, used in order for
        the ``@text``, ``@first``, ``@second`` and ``@third`` modalities.
        The default is an immutable tuple so it cannot be shared and mutated
        across calls.
    :param decorTau: tau of the decorrelator, which is applied to
        ``@first``, ``@second`` and ``@third``.
    :return: ``(model, theta_train)`` where ``theta_train`` is the result of
        ``model.transform`` on the training batches.
    """
    batch_vectorizer_train = artm.BatchVectorizer(data_path='./' + name,
                                                  data_format='vowpal_wabbit',
                                                  target_folder='folder' +
                                                  name)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer_train.data_path)
    topic_names = ['topic_{}'.format(i) for i in range(topic_number)]

    model = artm.ARTM(topic_names=topic_names,
                      class_ids={
                          '@text': weigths[0],
                          '@first': weigths[1],
                          '@second': weigths[2],
                          '@third': weigths[3]
                      },
                      cache_theta=True,
                      theta_columns_naming='title',
                      scores=[
                          artm.PerplexityScore(name='PerplexityScore',
                                               dictionary=dictionary)
                      ])
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='DecorrelatorPhi_modals',
            tau=decorTau,
            class_ids=['@first', '@second', '@third']))

    model.initialize(dictionary=dictionary)

    # The remaining scores are attached after initialization.
    model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model.scores.add(
        artm.TopicKernelScore(name='TopicKernelScore',
                              class_id='@text',
                              probability_mass_threshold=0.3))
    model.scores.add(
        artm.TopTokensScore(name='TopTokensScore',
                            num_tokens=6,
                            class_id='@text'))
    model.scores.add(
        artm.SparsityPhiScore(name='sparsity_phi_score', class_id='@third'))

    model.num_document_passes = 1

    model.fit_offline(batch_vectorizer=batch_vectorizer_train,
                      num_collection_passes=num_collection_passes)

    theta_train = model.transform(batch_vectorizer=batch_vectorizer_train)

    return model, theta_train
Пример #17
0
def experiment_enviroment(request):
    """
    Prepare a topic model, an experiment and settings for a two-cube pipeline.

    The settings describe a ``CubeCreator`` stage over two seeds followed by
    a ``RegularizersModifierCube`` stage over a tau grid, each with its own
    selection criterion.

    Parameters
    ----------
    request
        not used by the body (presumably the pytest ``request`` object)

    Returns
    -------
    tuple
        ``(tm, dataset, experiment, dictionary, cube_settings)``
    """
    with warnings.catch_warnings():
        # Silence the W_DIFF_BATCHES_1 warning while the dataset is created.
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset('tests/test_data/test_dataset.csv')
        dictionary = dataset.get_dictionary()

    model_artm = artm.ARTM(
        num_processors=3,
        num_topics=5,
        cache_theta=True,
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
    )
    # The sparsity scores are attached after the model has been constructed.
    phi_sparsity = artm.SparsityPhiScore(name='SparsityPhiScore')
    model_artm.scores.add(phi_sparsity)
    theta_sparsity = artm.SparsityThetaScore(name='SparsityThetaScore')
    model_artm.scores.add(theta_sparsity)

    example_score = ScoreExample()
    tm = TopicModel(
        model_artm,
        model_id='new_id',
        custom_scores={'example_score': example_score},
    )
    experiment = Experiment(
        tm,
        experiment_id="test_pipeline",
        save_path="tests/experiments",
    )

    seed_stage = {
        'CubeCreator': {
            'num_iter': 10,
            'parameters': [
                {'name': 'seed', 'values': [82019, 322]},
            ],
            'reg_search': 'grid',
            'separate_thread': USE_MULTIPROCESS,
        },
        'selection': [
            'model.seed = 82019 and PerplexityScore -> min COLLECT 2',
        ],
    }
    tau_stage = {
        'RegularizersModifierCube': {
            'num_iter': 10,
            'regularizer_parameters': {
                "regularizer": artm.regularizers.SmoothSparsePhiRegularizer(),
                "tau_grid": [0.1, 0.5, 1, 5, 10],
            },
            'reg_search': 'grid',
            'use_relative_coefficients': False,
            'separate_thread': USE_MULTIPROCESS,
        },
        'selection': [
            'PerplexityScore -> max COLLECT 2',
        ],
    }
    cube_settings = [seed_stage, tau_stage]

    return tm, dataset, experiment, dictionary, cube_settings
Пример #18
0
def test_custom_regularizer_cubed(experiment_enviroment, thread_flag, by_name):
    """
    Check that RegularizersModifierCube applies each tau of the grid.

    One model is expected per value in ``[1, 0, -1]``, in grid order, and the
    custom regularizer of each restored model must carry that tau. With
    ``by_name`` the regularizer is attached to the model and referenced by
    name, otherwise the regularizer object itself is given to the cube.
    """
    _, dataset, _, dictionary = experiment_enviroment
    tau_grid = [1, 0, -1]

    custom_reg = TopicPriorSampledRegularizer(
        name='topic_prior',
        tau=5,
        num_topics=5,
        beta_prior=[10, 1, 100, 2, 1000],
    )

    modality_weights = {MAIN_MODALITY: 1.0, NGRAM_MODALITY: 1.0}
    model_artm = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        class_ids=modality_weights,
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore', )],
    )

    if by_name:
        attached_regularizers = {custom_reg.name: custom_reg}
    else:
        attached_regularizers = {}
    tm = TopicModel(
        model_artm,
        model_id='new_id_1',
        custom_regularizers=attached_regularizers)
    experiment = Experiment(  # noqa: F841
        tm,
        experiment_id="cubed_reg",
        save_path="tests/experiments")

    # The regularizer is referenced either by its name or as an object.
    if by_name:
        reference_key, reference = "name", custom_reg.name
    else:
        reference_key, reference = "regularizer", custom_reg
    regularizer_parameters = {reference_key: reference, "tau_grid": tau_grid}

    cube = RegularizersModifierCube(
        num_iter=10,
        regularizer_parameters=regularizer_parameters,
        reg_search="grid",
        use_relative_coefficients=False,
        separate_thread=thread_flag,
    )
    dummies = cube(tm, dataset)

    restored_models = [dummy.restore() for dummy in dummies]

    assert len(restored_models) == len(tau_grid)
    for expected_tau, restored in zip(tau_grid, restored_models):
        assert restored.all_regularizers[custom_reg.name].tau == expected_tau
Пример #19
0
def compute_big_artm(num_topics, tau, dictionary, batch_vectorizer, score_computer):
    """Fit an ARTM model and log the clustering score derived from its theta matrix.

    The model is fitted offline for 10 collection passes with a single
    theta regularizer of strength ``tau``. Nothing is returned; the score is
    only written to the log.
    """
    perplexity = artm.PerplexityScore(name='s1')
    theta_regularizer = artm.SmoothSparseThetaRegularizer(name='r1', tau=tau)
    model = artm.ARTM(
        num_topics=num_topics,
        num_document_passes=5,
        dictionary=dictionary,
        scores=[perplexity],
        regularizers=[theta_regularizer],
        cache_theta=True,
    )
    model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=10)

    # NOTE: url_list is not a parameter; it is read from the enclosing module scope.
    predictions = get_df_clusters_predicted(model.get_theta(), url_list)
    score = score_computer.compute_score(predictions["story_id_predicted"])

    message = "num_topics={}, tau={},bigARTM score = {}".format(num_topics, tau, score)
    logging.info(message)
 def create_topic_model(self, topic_model_name: str,
                        batch_vectorizer: artm.BatchVectorizer,
                        dictionary: artm.Dictionary) -> artm.ARTM:
     """Train an ARTM model in rounds and return the best saved checkpoint.

     The model is fitted for 10 rounds of 3 collection passes each. After
     every round the perplexity is compared with the best value seen so
     far, and the model is saved under ``topic_model_name`` whenever it
     improves. The first round is always saved, so a checkpoint from this
     run exists when it is reloaded at the end (previously the first round
     was never saved, and a run whose perplexity never improved afterwards
     left nothing to load).

     :param topic_model_name: name passed to ``save_topic_model`` and
         ``load_topic_model``.
     :param batch_vectorizer: batches used for fitting.
     :param dictionary: dictionary used to build the model and the
         perplexity score.
     :return: the result of ``self.load_topic_model`` applied to a fresh
         ARTM model and ``topic_model_name``.
     """
     rounds = 10
     passes_per_round = 3

     topic_model = artm.ARTM(num_topics=self.number_of_topics,
                             dictionary=dictionary,
                             cache_theta=False)
     topic_model.scores.add(
         artm.PerplexityScore(name='perplexity_score',
                              dictionary=dictionary))
     topic_model.scores.add(
         artm.SparsityPhiScore(name='sparsity_phi_score'))
     topic_model.scores.add(
         artm.SparsityThetaScore(name='sparsity_theta_score'))
     topic_model.num_document_passes = 5
     # os.cpu_count() returns None when the CPU count cannot be determined;
     # fall back to a single processor instead of raising TypeError.
     topic_model.num_processors = max(1, (os.cpu_count() or 1) - 1)
     topic_model.regularizers.add(
         artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer',
                                         tau=-1.0))
     topic_model.regularizers.add(
         artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer',
                                           tau=-0.5))
     topic_model.regularizers.add(
         artm.DecorrelatorPhiRegularizer(
             name='decorrelator_phi_regularizer', tau=1e+5))

     best_score = None
     keyword_extraction_logger.info(
         'epoch  perplexity_score  sparsity_phi_score  sparsity_theta_score'
     )
     for restart_index in range(rounds):
         topic_model.fit_offline(batch_vectorizer=batch_vectorizer,
                                 num_collection_passes=passes_per_round)
         tracker = topic_model.score_tracker
         perplexity = tracker['perplexity_score'].last_value
         # Save on the first round as well, so the final load always finds
         # a checkpoint produced by this run.
         if best_score is None or perplexity < best_score:
             best_score = perplexity
             self.save_topic_model(topic_model, topic_model_name)
         keyword_extraction_logger.info(
             '{0:5}  {1:16.9}  {2:18.9}  {3:20.9}'.format(
                 (restart_index + 1) * passes_per_round,
                 perplexity,
                 tracker['sparsity_phi_score'].last_value,
                 tracker['sparsity_theta_score'].last_value))
     # Drop the trained model before loading the checkpoint into a fresh one.
     del topic_model
     return self.load_topic_model(
         artm.ARTM(num_topics=self.number_of_topics,
                   dictionary=dictionary,
                   cache_theta=False), topic_model_name)
Пример #21
0
    def _get_corpus_model(self,
                          corpus_vector_spaced,
                          clustering_method='artm'):
        """Build the corpus model with the backend selected by ``clustering_method``.

        ``'gensim'`` and ``'sklearn'`` delegate to ``_get_model_LSI`` and
        ``_get_model_LDA``. ``'artm'`` fits an ARTM model on the batch
        vectorizer and dictionary stored in ``corpus_vector_spaced`` and
        returns the transposed result of ``transform``. Any other value
        yields ``None``.
        """
        if clustering_method == 'gensim':
            return self._get_model_LSI(corpus_vector_spaced)
        if clustering_method == 'sklearn':
            return self._get_model_LDA(corpus_vector_spaced)
        if clustering_method != 'artm':
            return None

        batch_vectorizer = corpus_vector_spaced['batch_vectorizer']
        dictionary = corpus_vector_spaced['dictionary']

        names = ['topic_{}'.format(i) for i in range(self.num_of_clusters)]

        perplexity = artm.PerplexityScore(name='PerplexityScore',
                                          dictionary=dictionary)
        sparse_theta = artm.SmoothSparseThetaRegularizer(name='SparseTheta',
                                                         tau=-0.15)
        model_artm = artm.ARTM(topic_names=names,
                               cache_theta=True,
                               scores=[perplexity],
                               regularizers=[sparse_theta])

        score_registry = model_artm.scores
        score_registry.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
        score_registry.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
        score_registry.add(
            artm.TopicKernelScore(name='TopicKernelScore',
                                  probability_mass_threshold=0.3))
        score_registry.add(
            artm.TopTokensScore(name='TopTokensScore', num_tokens=10),
            overwrite=True)

        regularizer_registry = model_artm.regularizers
        regularizer_registry.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1))
        # The theta regularizer created with tau=-0.15 is adjusted to -0.2 here.
        regularizer_registry['SparseTheta'].tau = -0.2
        regularizer_registry.add(
            artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                            tau=1.5e+5))

        model_artm.num_document_passes = 1

        model_artm.initialize(dictionary)
        model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                               num_collection_passes=30)

        transformed = model_artm.transform(batch_vectorizer=batch_vectorizer)
        return transformed.T
def topic_model(class_ids, dictionary, num_of_topics, num_back, tau, tf):
    """Build an ARTM model whose last ``num_back`` topics are treated separately.

    The dictionary is filtered in place by ``min_tf=tf`` for the
    ``subjects``, ``objects`` and ``pairs`` modalities. Topics are named
    ``"0" .. str(num_of_topics - 1)``. The last ``num_back`` topics get a
    Phi regularizer with ``+tau`` and a Theta regularizer with ``+tau``;
    the remaining topics get a Phi regularizer with ``-tau`` and a
    decorrelator with ``tau``.

    :param class_ids: modalities passed to the model and to the Phi
        regularizers.
    :param dictionary: dictionary to filter and to build the model from.
    :param num_of_topics: total number of topics.
    :param num_back: number of trailing topics handled separately.
        NOTE(review): the slicing assumes ``num_back >= 1``; with 0 the
        ``[:-num_back]`` slice is empty -- confirm callers never pass 0.
    :param tau: regularization strength shared by all regularizers.
    :param tf: minimal term frequency used for dictionary filtering.
    :return: the configured (not yet fitted) model.
    """
    names_of_topics = [str(x) for x in range(num_of_topics)]
    for modality in ('subjects', 'objects', 'pairs'):
        dictionary.filter(min_tf=tf, class_id=modality)

    model = artm.ARTM(
        num_topics=num_of_topics,
        cache_theta=True,
        topic_names=names_of_topics,
        class_ids=class_ids,
        dictionary=dictionary)

    main_topics = model.topic_names[:-num_back]
    back_topics = model.topic_names[-num_back:]

    model.scores.add(
        artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary))

    model.scores.add(
        artm.SparsityPhiScore(name='SparcityPhiScore',
                              topic_names=main_topics))

    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(
            name='SparsePhiRegularizer',
            class_ids=class_ids,
            topic_names=main_topics,
            tau=-tau))
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(
            name='SmoothPhiRegularizer',
            class_ids=class_ids,
            topic_names=back_topics,
            tau=tau))

    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='DecorrelatorRegularizer',
            class_ids=class_ids,
            topic_names=main_topics,
            tau=tau))
    # Pass the slice of trailing topics, not the single element at index
    # -num_back: indexing covered only one topic (and handed the
    # regularizer a bare string) whenever num_back > 1.
    model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(
            name='SparseThetaRegularizer',
            topic_names=back_topics,
            tau=tau))
    return model
Пример #23
0
def create_thematic_model(checked_list, num_topics, num_tokens, phi_tau,
                          theta_tau, decorr_tau):
    """Fit an ARTM model on the glued collection and summarise it.

    Returns a 4-tuple: last perplexity, last Phi sparsity, last Theta
    sparsity, and an ``OrderedDict`` mapping each topic name to a list of
    ``"<token>-<weight rounded to 3 digits>"`` strings.
    """
    gluing_bag_of_words(checked_list)

    document_count = len(checked_list)

    batch_vectorizer = artm.BatchVectorizer(data_path=COLLECTION_PATH,
                                            data_format='vowpal_wabbit',
                                            target_folder=TARGET_FOLDER,
                                            batch_size=document_count)
    dictionary = artm.Dictionary(data_path=TARGET_FOLDER)

    regularizers = [
        artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer',
                                        tau=phi_tau),
        artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer',
                                          tau=theta_tau),
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer',
                                        tau=decorr_tau),
    ]
    scores = [
        artm.PerplexityScore(name='perplexity_score', dictionary=dictionary),
        artm.SparsityPhiScore(name='sparsity_phi_score'),
        artm.SparsityThetaScore(name='sparsity_theta_score'),
        artm.TopTokensScore(name='top_tokens_score', num_tokens=num_tokens),
    ]
    # The number of checked items is used for both the document passes and
    # the collection passes.
    model = artm.ARTM(num_topics=num_topics,
                      num_document_passes=document_count,
                      dictionary=dictionary,
                      regularizers=regularizers,
                      scores=scores)

    model.fit_offline(batch_vectorizer=batch_vectorizer,
                      num_collection_passes=document_count)

    top_tokens = model.score_tracker['top_tokens_score']

    topic_dictionary = OrderedDict()
    for topic_name in model.topic_names:
        pairs = zip(top_tokens.last_tokens[topic_name],
                    top_tokens.last_weights[topic_name])
        topic_dictionary[str(topic_name)] = [
            token + '-' + str(round(weight, 3)) for token, weight in pairs
        ]

    perplexity = model.score_tracker['perplexity_score'].last_value
    phi_sparsity = model.score_tracker['sparsity_phi_score'].last_value
    theta_sparsity = model.score_tracker['sparsity_theta_score'].last_value
    return perplexity, phi_sparsity, theta_sparsity, topic_dictionary
def test_func():
    """Fit ARTM with a TopicSelection regularizer on 'kos' and check topic mass and perplexity."""
    topic_selection_tau = 0.5
    num_collection_passes = 3
    num_document_passes = 10
    num_topics = 15

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()

    perplexity_eps = 0.1
    expected_perplexity = [
        6676.941798754971, 2534.963709464024, 2463.1544861984794
    ]

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

        dictionary = artm.Dictionary(data_path=batches_folder)
        model = artm.ARTM(num_topics=num_topics,
                          dictionary=dictionary,
                          num_document_passes=num_document_passes)

        selection = artm.TopicSelectionThetaRegularizer(
            name='TopicSelection', tau=topic_selection_tau)
        model.regularizers.add(selection)
        model.scores.add(artm.PerplexityScore(name='PerplexityScore'))
        model.scores.add(
            artm.TopicMassPhiScore(name='TopicMass',
                                   model_name=model.model_nwt))
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)

        # Count the topics whose mass is exactly zero after fitting; the
        # test expects 8 of them.
        topic_mass = model.get_score('TopicMass').topic_mass
        zero_mass_topics = sum(mass == 0 for mass in topic_mass)
        assert zero_mass_topics == 8

        # The following assertion fails on travis-ci builds, but passes locally.
        for pass_index, expected in enumerate(expected_perplexity):
            observed = model.score_tracker['PerplexityScore'].value[pass_index]
            assert abs(observed - expected) < perplexity_eps

        model.fit_online(batch_vectorizer=batch_vectorizer)
    finally:
        # Always remove the temporary batches folder.
        shutil.rmtree(batches_folder)
Пример #25
0
def create_and_learn_PLSA(name="", topic_number=750, num_collection_passes=1):
    """Fit a model without regularizers on a vowpal wabbit collection.

    The collection is read from ``'./' + name`` and its batches are written
    to ``'folder' + name``. Returns the fitted model together with the
    result of ``transform`` on the training batches.
    """
    source_path = './' + name
    batches_folder = 'folder' + name
    batch_vectorizer_train = artm.BatchVectorizer(data_path=source_path,
                                                  data_format='vowpal_wabbit',
                                                  target_folder=batches_folder)

    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer_train.data_path)

    topic_names = ['topic_{}'.format(i) for i in range(topic_number)]
    modality_weights = {
        '@text': 1.0,
        '@first': 1.0,
        '@second': 1.0,
        '@third': 1.0,
    }
    perplexity = artm.PerplexityScore(name='PerplexityScore',
                                      dictionary=dictionary)

    model_plsa = artm.ARTM(topic_names=topic_names,
                           class_ids=modality_weights,
                           cache_theta=True,
                           theta_columns_naming='title',
                           scores=[perplexity])
    model_plsa.initialize(dictionary=dictionary)

    # Additional scores are attached after initialization.
    add_score = model_plsa.scores.add
    add_score(artm.SparsityPhiScore(name='SparsityPhiScore'))
    add_score(artm.SparsityThetaScore(name='SparsityThetaScore'))
    add_score(artm.TopicKernelScore(name='TopicKernelScore',
                                    class_id='@text',
                                    probability_mass_threshold=0.3))
    add_score(artm.TopTokensScore(name='TopTokensScore',
                                  num_tokens=6,
                                  class_id='@text'))

    model_plsa.num_document_passes = 1

    model_plsa.fit_offline(batch_vectorizer=batch_vectorizer_train,
                           num_collection_passes=num_collection_passes)

    theta_train = model_plsa.transform(batch_vectorizer=batch_vectorizer_train)
    return model_plsa, theta_train
Пример #26
0
def add_scores_to_model(current_dictionary, artm_model, n_top_tokens,
                        p_mass_threshold):
    """Attach perplexity, sparsity, topic-kernel and top-tokens scores to ``artm_model``.

    The Phi sparsity, topic-kernel and top-tokens scores are bound to the
    ``'ngramm'`` modality. The model is modified in place; nothing is
    returned.
    """
    modality = 'ngramm'
    register = artm_model.scores.add

    perplexity = artm.PerplexityScore(name='perplexity_score',
                                      use_unigram_document_model=False,
                                      dictionary=current_dictionary)
    register(perplexity)

    phi_sparsity = artm.SparsityPhiScore(name='sparsity_phi_score',
                                         class_id=modality)
    register(phi_sparsity)

    theta_sparsity = artm.SparsityThetaScore(name='sparsity_theta_score')
    register(theta_sparsity)

    kernel = artm.TopicKernelScore(name='topic_kernel_score',
                                   class_id=modality,
                                   probability_mass_threshold=p_mass_threshold)
    register(kernel)

    top_tokens = artm.TopTokensScore(name='top_tokens_score',
                                     class_id=modality,
                                     num_tokens=n_top_tokens)
    register(top_tokens)
Пример #27
0
def artm_plsa(batch_vectorizer, topics, topic_names, dictionary):
    """Fit an ARTM model without regularizers and print its final perplexity.

    The model uses the single modality ``"text"``, is initialized from
    ``dictionary`` and fitted offline for 50 collection passes. The last
    tracked perplexity value is printed; nothing is returned.

    :param batch_vectorizer: batches used for fitting.
    :param topics: number of topics.
    :param topic_names: names of the topics.
    :param dictionary: dictionary used for initialization and perplexity.
    """
    model_artm = artm.ARTM(num_topics=topics,
                           topic_names=topic_names,
                           num_processors=cpu_count(),
                           class_ids={"text": 1},
                           reuse_theta=True,
                           cache_theta=True,
                           num_document_passes=1)
    model_artm.initialize(dictionary=dictionary)
    model_artm.scores.add(
        artm.PerplexityScore("perplexity",
                             class_ids=["text"],
                             dictionary=dictionary))
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=50)

    final_perplexity = model_artm.score_tracker["perplexity"].value[-1]
    # The Python 2 print statement is a SyntaxError on Python 3. A single
    # pre-formatted argument keeps the printed text the same on both
    # (two spaces: the literal's trailing space plus print's separator).
    print("\nPeprlexity for BigARTM PLSA:  {}".format(final_perplexity))
Пример #28
0
def pipeline_plsa_bigartm(lines,
                          TOPIC_NUMBER,
                          ngram_range,
                          topnwords,
                          LOGS_DATA_PATH="plsa.txt",
                          TARGET_FOLDER="plsa"):
    """Write ``lines`` to a vowpal wabbit file, fit ARTM on it and collect top tokens.

    Returns a 2-tuple: a fixed placeholder string and a dict mapping each
    topic name to its last top tokens.
    """
    make_file(lines, ngram_range, LOGS_DATA_PATH)

    batch_vectorizer = artm.BatchVectorizer(data_path=LOGS_DATA_PATH,
                                            data_format='vowpal_wabbit',
                                            target_folder=TARGET_FOLDER)

    model_artm = artm.ARTM(num_topics=TOPIC_NUMBER, cache_theta=True)
    model_artm.initialize(dictionary=batch_vectorizer.dictionary)

    add_regularizer = model_artm.regularizers.add
    add_regularizer(
        artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=0.05))
    add_regularizer(
        artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=1.5e+5))
    add_regularizer(
        artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.01))

    add_score = model_artm.scores.add
    add_score(artm.SparsityPhiScore(name='SparsityPhiScore'))
    add_score(artm.SparsityThetaScore(name='SparsityThetaScore'))
    add_score(artm.TopTokensScore(name='TopTokensScore',
                                  num_tokens=topnwords),
              overwrite=True)
    add_score(artm.PerplexityScore(name='PerplexityScore',
                                   dictionary=batch_vectorizer.dictionary))

    model_artm.num_document_passes = 2
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=15)

    topic_names = {
        topic_name:
        model_artm.score_tracker['TopTokensScore'].last_tokens[topic_name]
        for topic_name in model_artm.topic_names
    }

    # The first element is a placeholder; only the topic tokens carry data.
    return "nothing, sorry", topic_names
Пример #29
0
    def setup_class(cls):
        """Prepare the shared dataset, a fitted topic model, its theta matrix and the viewer."""
        with warnings.catch_warnings():
            # Silence the known warning matched by W_DIFF_BATCHES_1 while
            # the dataset and its derived objects are being built.
            warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
            dataset = Dataset('tests/test_data/test_dataset.csv')
            cls.dataset = dataset
            dictionary = dataset.get_dictionary()
            batch_vectorizer = dataset.get_batch_vectorizer()

        perplexity = artm.PerplexityScore(name='PerplexityScore')
        model_artm = artm.ARTM(
            num_topics=NUM_TOPICS,
            cache_theta=True,
            num_document_passes=NUM_DOCUMENT_PASSES,
            dictionary=dictionary,
            scores=[perplexity],
        )

        fitted = TopicModel(model_artm, model_id='model_id')
        cls.topic_model = fitted
        fitted._fit(batch_vectorizer, num_iterations=NUM_ITERATIONS)
        cls.theta = fitted.get_theta(dataset=cls.dataset)

        cls.top_documents_viewer = top_documents_viewer.TopDocumentsViewer(
            model=cls.topic_model)
Пример #30
0
def create_model_with_background(dictionary, num_tokens, num_document_passes):
    """Create an initialized ARTM model with 19 specific topics and one background topic.

    The specific topics (``'topic 1'`` .. ``'topic 19'``) get a Phi
    regularizer with a negative tau and the ``'background'`` topic gets one
    with a positive tau. The model works on the ``'plain_text'`` modality
    only.

    :param dictionary: dictionary used for the perplexity score and for
        model initialization.
    :param num_tokens: currently unused; the top-tokens score is fixed at
        10 tokens (see the inline note). Kept for interface compatibility.
    :param num_document_passes: value assigned to the model's
        ``num_document_passes``.
    :return: the initialized (not yet fitted) model.
    """
    smooth_phi_tau = 0.0001 * 1e-4
    sparse_phi_tau = -0.0001 * 1e-4

    specific_topics = ['topic {}'.format(i) for i in range(1, 20)]
    background_topics = ["background"]
    topic_names = specific_topics + background_topics

    scores = [
        artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary),
        artm.TopTokensScore(
            name='TopTokensScore', num_tokens=10, class_id='plain_text'
        ),  # web version of Palmetto works only with <= 10 tokens
        artm.SparsityPhiScore(name='SparsityPhiScore'),
        artm.SparsityThetaScore(name='SparsityThetaScore'),
        artm.TopicKernelScore(name='TopicKernelScore',
                              probability_mass_threshold=0.3,
                              class_id='plain_text')
    ]

    model = artm.ARTM(topic_names=topic_names,
                      regularizers=[],
                      cache_theta=True,
                      scores=scores,
                      class_ids={'plain_text': 1.0})

    # sparse_phi_tau is already negative. It used to be negated again when
    # passed, which gave the "SparsePhi" regularizer a positive tau.
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='SparsePhi',
                                        tau=sparse_phi_tau,
                                        topic_names=specific_topics))
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='SmoothPhi',
                                        tau=smooth_phi_tau,
                                        topic_names=background_topics))

    model.initialize(dictionary=dictionary)
    model.num_document_passes = num_document_passes

    return model