def two_experiment_enviroments(request):
    """Create two identical topic models wrapped in two separate experiments."""
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset('tests/test_data/test_dataset.csv')
        dictionary = dataset.get_dictionary()

        model_artm_1 = artm.ARTM(
            num_processors=1,
            num_topics=5,
            cache_theta=True,
            num_document_passes=1,
            dictionary=dictionary,
            scores=[artm.PerplexityScore(name='PerplexityScore'),
                    artm.SparsityPhiScore(name='SparsityPhiScore', class_id=MAIN_MODALITY)]
        )
        model_artm_2 = artm.ARTM(
            num_processors=1,
            num_topics=5,
            cache_theta=True,
            num_document_passes=1,
            dictionary=dictionary,
            scores=[artm.PerplexityScore(name='PerplexityScore'),
                    artm.SparsityPhiScore(name='SparsityPhiScore', class_id=MAIN_MODALITY)]
        )

    tm_1 = TopicModel(model_artm_1, model_id='new_id_1')
    tm_2 = TopicModel(model_artm_2, model_id='new_id_2')

    experiment_1 = Experiment(
        experiment_id="test_1", save_path="tests/experiments", topic_model=tm_1
    )
    experiment_2 = Experiment(
        experiment_id="test_2", save_path="tests/experiments", topic_model=tm_2
    )

    return tm_1, experiment_1, tm_2, experiment_2, dataset, dictionary
def init_hierarchical_model(class_ids):
    # perplexity is tracked separately for the word and bigram modalities
    score = [artm.PerplexityScore(name='perplexity_words',
                                  class_ids=['body']),
             artm.PerplexityScore(name='perplexity_bigrams',
                                  class_ids=['bigrams'])]

    top_tokens = [artm.TopTokensScore(name='top_words',
                                      num_tokens=15, class_id='body'),
                  artm.TopTokensScore(name='top_bigrams',
                                      num_tokens=10, class_id='bigrams')]

    sparsity = [artm.SparsityThetaScore(name='sparsity_theta', eps=1e-6),
                artm.SparsityPhiScore(name='sparsity_phi_words',
                                      class_id='body', eps=1e-6),
                artm.SparsityPhiScore(name='sparsity_phi_bigrams',
                                      class_id='bigrams', eps=1e-6)]

    # all regularizers start switched off (tau=0) so they can be tuned later
    regularizers = [artm.DecorrelatorPhiRegularizer(tau=0, class_ids=['body'],
                                                    name='decorr_words'),
                    artm.DecorrelatorPhiRegularizer(tau=0, class_ids=['bigrams'],
                                                    name='decorr_bigrams'),
                    artm.DecorrelatorPhiRegularizer(tau=0, class_ids=['categories'],
                                                    name='decorr_categories'),
                    artm.SmoothSparseThetaRegularizer(tau=0, name='sparsity_theta'),
                    artm.SmoothSparsePhiRegularizer(tau=0, class_ids=['body'],
                                                    name='sparsity_words'),
                    artm.SmoothSparsePhiRegularizer(tau=0, class_ids=['bigrams'],
                                                    name='sparsity_bigrams')]

    hmodel = artm.hARTM(class_ids=class_ids,
                        cache_theta=True,
                        reuse_theta=True,
                        scores=score + top_tokens + sparsity,
                        regularizers=regularizers,
                        theta_columns_naming='title')
    return hmodel
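# A minimal usage sketch for the hierarchy above (hedged: `batch_vectorizer`
# and the modality weights are assumptions, not part of the original code).
# hARTM grows level by level; each added level is a regular ARTM model whose
# topics are linked to the topics of the parent level.
hmodel = init_hierarchical_model(
    class_ids={'body': 1.0, 'bigrams': 0.5, 'categories': 1.0})
hmodel.add_level(num_topics=10)                          # coarse parent level
hmodel.add_level(num_topics=50, parent_level_weight=1)   # fine-grained child level
hmodel.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=20)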
def add_complex_scores_to_model(artm_model, dictionary, n_top_tokens,
                                p_mass_threshold, common_topics,
                                subject_topics, class_name,
                                _debug_print=False):
    if _debug_print:
        print('[{}] adding scores'.format(datetime.now()))

    # subject topics
    artm_model.scores.add(
        artm.PerplexityScore(name='perplexity_score_subject',
                             dictionary=dictionary,
                             topic_names=subject_topics))
    artm_model.scores.add(
        artm.SparsityPhiScore(name='ss_phi_score_subject',
                              class_id=class_name,
                              topic_names=subject_topics))
    artm_model.scores.add(
        artm.SparsityThetaScore(name='ss_theta_score_subject',
                                topic_names=subject_topics))
    artm_model.scores.add(
        artm.TopicKernelScore(name='topic_kernel_score_subject',
                              class_id=class_name,
                              topic_names=subject_topics,
                              probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(
        artm.TopTokensScore(name='top_tokens_score_subject',
                            class_id=class_name,
                            topic_names=subject_topics,
                            num_tokens=n_top_tokens))

    # common (background) topics
    artm_model.scores.add(
        artm.PerplexityScore(name='perplexity_score_common',
                             dictionary=dictionary,
                             topic_names=common_topics))
    artm_model.scores.add(
        artm.SparsityPhiScore(name='ss_phi_score_common',
                              class_id=class_name,
                              topic_names=common_topics))
    artm_model.scores.add(
        artm.SparsityThetaScore(name='ss_theta_score_common',
                                topic_names=common_topics))
    artm_model.scores.add(
        artm.TopicKernelScore(name='topic_kernel_score_common',
                              class_id=class_name,
                              topic_names=common_topics,
                              probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(
        artm.TopTokensScore(name='top_tokens_score_common',
                            class_id=class_name,
                            topic_names=common_topics,
                            num_tokens=n_top_tokens))
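# A hedged usage sketch for the helper above: the model, dictionary, topic
# names and modality name are placeholders invented for illustration.
subject_topics = ['topic_{}'.format(i) for i in range(15)]
common_topics = ['background_0', 'background_1']
model = artm.ARTM(topic_names=subject_topics + common_topics,
                  dictionary=dictionary)
add_complex_scores_to_model(model, dictionary,
                            n_top_tokens=10, p_mass_threshold=0.3,
                            common_topics=common_topics,
                            subject_topics=subject_topics,
                            class_name='@default_class')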
def test_phi_matrix_after_lda_sampled_regularizer(experiment_enviroment):
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset(DATA_PATH)
        dictionary = dataset.get_dictionary()
        batch_vectorizer = dataset.get_batch_vectorizer()

    topic_prior_reg = TopicPriorSampledRegularizer(
        name='topic_prior', tau=5,
        num_topics=5, beta_prior=[10, 1, 100, 2, 1000])

    model_artm_1 = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        class_ids={MAIN_MODALITY: 1.0, NGRAM_MODALITY: 1.0},
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
    )
    model_artm_2 = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        class_ids={MAIN_MODALITY: 1.0, NGRAM_MODALITY: 1.0},
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
    )

    tm_1 = TopicModel(
        model_artm_1, model_id='new_id_1',
        custom_regularizers={topic_prior_reg.name: topic_prior_reg})
    tm_2 = TopicModel(model_artm_2, model_id='new_id_2')

    tm_1._fit(batch_vectorizer, 10)
    tm_2._fit(batch_vectorizer, 10)

    phi_first = tm_1.get_phi()
    phi_second = tm_2.get_phi()

    # compare element-wise: the regularized model must differ somewhere
    assert (phi_first != phi_second).values.any(), \
        'Phi matrices are the same after regularization.'
def experiment(filename, tau_phi, tau_theta):
    batch_vectorizer = artm.BatchVectorizer(data_path=filename,
                                            data_format='vowpal_wabbit',
                                            target_folder='batches')
    dictionary = batch_vectorizer.dictionary

    topic_num = 30
    tokens_num = 100
    print("ARTM training")

    topic_names = ['topic_{}'.format(i) for i in range(topic_num)]

    model_artm = artm.ARTM(topic_names=topic_names, dictionary=dictionary,
                           cache_theta=True)
    model_plsa = artm.ARTM(topic_names=topic_names, cache_theta=True,
                           scores=[artm.PerplexityScore(name='PerplexityScore',
                                                        dictionary=dictionary)])
    model_lda = artm.LDA(num_topics=topic_num)

    model_artm.scores.add(artm.PerplexityScore(name='perplexity_score',
                                               dictionary=dictionary))
    model_artm.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_artm.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_artm.scores.add(artm.TopTokensScore(name='top_tokens_score',
                                              num_tokens=tokens_num))
    model_artm.scores.add(artm.TopicKernelScore(name='topic_kernel_score',
                                                probability_mass_threshold=0.3))
    model_artm.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_artm.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_artm.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

    model_plsa.scores.add(artm.PerplexityScore(name='perplexity_score',
                                               dictionary=dictionary))
    model_plsa.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_plsa.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_plsa.scores.add(artm.TopTokensScore(name='top_tokens_score'))
    model_plsa.scores.add(artm.TopicKernelScore(name='topic_kernel_score',
                                                probability_mass_threshold=0.3))
    model_plsa.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_plsa.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_plsa.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

    model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
    model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
    model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))

    model_artm.regularizers['sparse_phi_regularizer'].tau = tau_phi
    model_artm.regularizers['sparse_theta_regularizer'].tau = tau_theta
    model_artm.regularizers['decorrelator_phi_regularizer'].tau = 1e+3

    model_plsa.initialize(dictionary=dictionary)
    model_artm.initialize(dictionary=dictionary)
    model_lda.initialize(dictionary=dictionary)

    passes = 100
    model_plsa.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=passes)
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=passes)
    model_lda.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=passes)

    print_measures(model_plsa, model_artm, model_lda)
def set_scores(self):
    self.model.scores.add(
        artm.PerplexityScore(name='PerplexityScore',
                             dictionary=self.dictionary))
    self.model.scores.add(
        artm.SparsityPhiScore(name='SparsityPhiScore',
                              class_id='@default_class',
                              topic_names=self.specific))
    self.model.scores.add(
        artm.SparsityThetaScore(name='SparsityThetaScore',
                                topic_names=self.specific))
    # Fraction of background words in the whole collection
    self.model.scores.add(
        artm.BackgroundTokensRatioScore(name='BackgroundTokensRatioScore',
                                        class_id='@default_class'))
    # Kernel characteristics
    self.model.scores.add(
        artm.TopicKernelScore(name='TopicKernelScore',
                              class_id='@default_class',
                              topic_names=self.specific,
                              probability_mass_threshold=0.5,
                              dictionary=self.dictionary))
    # Looking at top tokens
    self.model.scores.add(
        artm.TopTokensScore(name='TopTokensScore',
                            class_id='@default_class',
                            num_tokens=100))
def test_fancy_fit_is_ok(experiment_enviroment):
    tm, dataset, experiment, dictionary = experiment_enviroment

    model_artm = artm.ARTM(
        num_topics=5,
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
        theta_columns_naming='title',
        class_ids={MAIN_MODALITY: 1, NGRAM_MODALITY: 1,
                   EXTRA_MODALITY: 1, '@psyduck': 42},
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name='smooth_theta', tau=10.0),
        ])

    custom_scores = {'mean_kernel_size': ScoreExample()}
    tm = TopicModel(model_artm, model_id='absolutely_new_id',
                    custom_scores=custom_scores)

    num_iterations = 10
    tm._fit(dataset.get_batch_vectorizer(), num_iterations)

    params = tm.get_jsonable_from_parameters()
    assert "smooth_theta" in params["regularizers"]

    PATH = "tests/experiments/save_standalone/"
    tm.save(PATH)
    tm2 = TopicModel.load(PATH)

    assert (tm.get_phi() == tm2.get_phi()).all().all()
def define_model(n_topics: int, dictionary: artm.Dictionary,
                 sparse_theta: float, sparse_phi: float,
                 decorrelator_phi: float) -> artm.artm_model.ARTM:
    """
    Define the ARTM model.

    :param n_topics: number of topics.
    :param dictionary: batch vectorizer dictionary.
    :param sparse_theta: sparse theta parameter.
    :param sparse_phi: sparse phi parameter.
    :param decorrelator_phi: decorrelator phi parameter.
    :return: ARTM model.
    """
    print("Defining the model.")
    topic_names = ["topic_{}".format(i) for i in range(1, n_topics + 1)]
    model_artm = artm.ARTM(
        topic_names=topic_names,
        cache_theta=True,
        scores=[
            artm.PerplexityScore(name="PerplexityScore", dictionary=dictionary),
            artm.SparsityPhiScore(name="SparsityPhiScore"),
            artm.SparsityThetaScore(name="SparsityThetaScore"),
            artm.TopicKernelScore(name="TopicKernelScore",
                                  probability_mass_threshold=0.3),
            artm.TopTokensScore(name="TopTokensScore", num_tokens=15)
        ],
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name="SparseTheta",
                                              tau=sparse_theta),
            artm.SmoothSparsePhiRegularizer(name="SparsePhi", tau=sparse_phi),
            artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi",
                                            tau=decorrelator_phi)
        ])
    return model_artm
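# A minimal sketch of driving define_model, assuming a Vowpal Wabbit corpus;
# the file path, tau values, and pass count are placeholders, not part of
# the original code.
batch_vectorizer = artm.BatchVectorizer(data_path="corpus.vw",
                                        data_format="vowpal_wabbit",
                                        target_folder="batches")
dictionary = batch_vectorizer.dictionary
model = define_model(n_topics=20, dictionary=dictionary,
                     sparse_theta=-0.5, sparse_phi=-0.1,
                     decorrelator_phi=1e5)
model.initialize(dictionary=dictionary)
model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=30)
print(model.score_tracker["PerplexityScore"].last_value)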
def fit():
    batch_id = str(uuid.uuid4())
    app.logger.info("batch %s", batch_id)

    rjson = request.json
    terms = rjson['terms']
    topics_cnt = rjson['topics']

    term_to_id = {}
    all_terms = []

    batch = artm.messages.Batch()
    batch.id = batch_id

    # build one BigARTM batch from the incoming documents
    for i, doc in enumerate(terms):
        item = batch.item.add()
        item.id = i
        field = item.field.add()
        for term in doc:
            if term not in term_to_id:
                term_to_id[term] = len(all_terms)
                all_terms.append(term)
            field.token_id.append(term_to_id[term])
            field.token_count.append(1)

    for t in all_terms:
        batch.token.append(t)

    os.mkdir(batch_id)
    with open(os.path.join(batch_id, "batch.batch"), 'wb') as fout:
        fout.write(batch.SerializeToString())
    app.logger.info("batch %s is created", batch_id)

    dictionary = artm.Dictionary()
    dictionary.gather(batch_id)

    model_artm = artm.ARTM(
        topic_names=['topic_{}'.format(i) for i in range(topics_cnt)],
        scores=[
            artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary)
        ],
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.15)
        ],
        show_progress_bars=False)

    batch_vectorizer = artm.BatchVectorizer(data_path=batch_id,
                                            data_format="batches")
    model_artm.initialize(dictionary=dictionary)
    app.logger.info("model is starting to fit")
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=1)
    app.logger.info("model was fitted")

    model_artm.save(os.path.join(batch_id, "model"))
    return jsonify({"id": batch_id})
def experiment_enviroment(request):
    """Create a topic model with custom scores inside a fresh experiment."""
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset('tests/test_data/test_dataset.csv')
        dictionary = dataset.get_dictionary()

        model_artm = artm.ARTM(
            num_processors=1,
            num_topics=5,
            cache_theta=True,
            num_document_passes=1,
            dictionary=dictionary,
            scores=[artm.PerplexityScore(name='PerplexityScore')],
        )

    model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))

    ex_score = ScoreExample()
    tm = TopicModel(model_artm, model_id='new_id',
                    custom_scores={'example_score': ex_score})

    experiment = Experiment(tm, experiment_id="test_cube_creator",
                            save_path="tests/experiments")
    return tm, dataset, experiment, dictionary
def experiment_enviroment(request):
    """Create a topic model and an experiment; clean both up on teardown."""
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset('tests/test_data/test_dataset.csv')
        dictionary = dataset.get_dictionary()

        model_artm = artm.ARTM(
            num_topics=5,
            class_ids={MAIN_MODALITY: 1.0, NGRAM_MODALITY: 1.0,
                       EXTRA_MODALITY: 1.0},
            num_document_passes=1,
            dictionary=dictionary,
            scores=[artm.PerplexityScore(name='PerplexityScore')],
            theta_columns_naming='title',
        )

    custom_scores = {'mean_kernel_size': ScoreExample()}
    tm = TopicModel(model_artm, model_id='new_id', custom_scores=custom_scores)
    experiment = Experiment(experiment_id="test",
                            save_path="tests/experiments",
                            topic_model=tm)

    def resource_teardown():
        """Remove experiment artifacts and the dataset's internal files."""
        shutil.rmtree("tests/experiments")
        shutil.rmtree(dataset._internals_folder_path)

    request.addfinalizer(resource_teardown)

    return tm, dataset, experiment, dictionary
def experiment_enviroment(request):
    """Create a two-modality topic model inside a fresh experiment."""
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset(DATA_PATH)
        dictionary = dataset.get_dictionary()

        model_artm = artm.ARTM(
            num_processors=1,
            num_topics=5,
            cache_theta=True,
            class_ids={MAIN_MODALITY: 1.0, NGRAM_MODALITY: 1.0},
            num_document_passes=1,
            dictionary=dictionary,
            scores=[artm.PerplexityScore(name='PerplexityScore')],
        )

    tm = TopicModel(model_artm, model_id='new_id')
    experiment = Experiment(experiment_id="test_cubes",
                            save_path="tests/experiments",
                            topic_model=tm)
    return tm, dataset, experiment, dictionary
def setup_class(cls):
    """Fit a multimodal topic model and keep the raw VW data for the tests."""
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset('tests/test_data/test_dataset.csv')

        raw_data = []
        with open('tests/test_data/test_vw.txt', encoding='utf-8') as file:
            for line in file:
                raw_data.append(line.split(' '))

        dictionary = dataset.get_dictionary()
        batch_vectorizer = dataset.get_batch_vectorizer()

    model_artm = artm.ARTM(
        num_topics=NUM_TOPICS,
        class_ids=dict.fromkeys(CLASS_IDS, 1.0),
        topic_names=TOPIC_NAMES,
        cache_theta=True,
        num_document_passes=NUM_DOCUMENT_PASSES,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
    )

    cls.topic_model = TopicModel(model_artm, model_id='model_id')
    cls.topic_model._fit(batch_vectorizer, num_iterations=NUM_ITERATIONS)
    cls.raw_data = raw_data
def init_model(self, dictionary_path=None):
    """dictionary_path: optional, used with pretrained model"""
    self.dictionary = artm.Dictionary()
    if dictionary_path is None:
        self.dictionary.gather(data_path=self.batches_path)
        self.dictionary.filter(min_tf=10, max_df_rate=0.1)
        self.dictionary.save_text(
            f"{self.dir_path}/dicts/dict_{self.name_dataset}.txt")
    else:
        self.dictionary.load_text(dictionary_path)

    self.model = artm.ARTM(
        num_topics=self.n_topics,
        dictionary=self.dictionary,
        show_progress_bars=True,
    )

    # scores
    self.model.scores.add(
        artm.PerplexityScore(name="PerplexityScore",
                             dictionary=self.dictionary))
    self.model.scores.add(
        artm.SparsityThetaScore(name="SparsityThetaScore"))
    self.model.scores.add(artm.SparsityPhiScore(name="SparsityPhiScore"))

    # regularizers
    self.model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name="SparsePhi", tau=-0.1))
    self.model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name="SparseTheta", tau=-0.5))
    self.model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi", tau=1.5e5))
def test_custom_regularizer_cubed_controlled(experiment_enviroment,
                                             thread_flag, by_name):
    """Check that a controller cube multiplies the custom regularizer's tau."""
    _, dataset, _, dictionary = experiment_enviroment
    multiplier = 2
    initial_tau = 5

    custom_reg = TopicPriorSampledRegularizer(
        name='topic_prior', tau=initial_tau,
        num_topics=5, beta_prior=[10, 1, 100, 2, 1000])

    model_artm = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        class_ids={MAIN_MODALITY: 1.0, NGRAM_MODALITY: 1.0},
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
    )
    tm = TopicModel(
        model_artm, model_id='new_id_1',
        custom_regularizers={custom_reg.name: custom_reg} if by_name else {})
    experiment = Experiment(  # noqa: F841
        tm, experiment_id="cubed_controlled_reg",
        save_path="tests/experiments")

    parameters = {
        "score_to_track": None,
        "tau_converter": f"prev_tau * {multiplier}",
        "user_value_grid": [0.3],
        "max_iters": float("inf")
    }
    if by_name:
        parameters["reg_name"] = custom_reg.name
    else:
        parameters["regularizer"] = custom_reg

    num_iter = 10
    cube = RegularizationControllerCube(num_iter=num_iter,
                                        parameters=parameters,
                                        reg_search="grid",
                                        use_relative_coefficients=False,
                                        separate_thread=thread_flag)
    dummies = cube(tm, dataset)
    tmodels = [dummy.restore() for dummy in dummies]

    for one_model in tmodels:
        actual_tau = one_model.all_regularizers[custom_reg.name].tau
        assert actual_tau == initial_tau * (multiplier ** num_iter)
def create_and_learn_ARTM_decorPhi_modal(name="", topic_number=750,
                                         num_collection_passes=1,
                                         weights=(1., 1., 1., 1.),
                                         decorTau=1.0):
    batch_vectorizer_train = artm.BatchVectorizer(data_path='./' + name,
                                                  data_format='vowpal_wabbit',
                                                  target_folder='folder' + name)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer_train.data_path)
    topic_names = ['topic_{}'.format(i) for i in range(topic_number)]

    model = artm.ARTM(topic_names=topic_names,
                      class_ids={'@text': weights[0],
                                 '@first': weights[1],
                                 '@second': weights[2],
                                 '@third': weights[3]},
                      cache_theta=True,
                      theta_columns_naming='title',
                      scores=[artm.PerplexityScore(name='PerplexityScore',
                                                   dictionary=dictionary)])

    # decorrelate only the auxiliary modalities
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='DecorrelatorPhi_modals',
            tau=decorTau,
            class_ids=['@first', '@second', '@third']))

    model.initialize(dictionary=dictionary)

    model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model.scores.add(
        artm.TopicKernelScore(name='TopicKernelScore',
                              class_id='@text',
                              probability_mass_threshold=0.3))
    model.scores.add(
        artm.TopTokensScore(name='TopTokensScore',
                            num_tokens=6, class_id='@text'))
    model.scores.add(
        artm.SparsityPhiScore(name='sparsity_phi_score', class_id='@third'))

    model.num_document_passes = 1
    model.fit_offline(batch_vectorizer=batch_vectorizer_train,
                      num_collection_passes=num_collection_passes)
    theta_train = model.transform(batch_vectorizer=batch_vectorizer_train)

    return model, theta_train
def experiment_enviroment(request):
    """Create a topic model, an experiment, and a two-cube pipeline config."""
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset('tests/test_data/test_dataset.csv')
        dictionary = dataset.get_dictionary()

        model_artm = artm.ARTM(
            num_processors=3,
            num_topics=5,
            cache_theta=True,
            num_document_passes=1,
            dictionary=dictionary,
            scores=[artm.PerplexityScore(name='PerplexityScore')])

    model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))

    ex_score = ScoreExample()
    tm = TopicModel(model_artm, model_id='new_id',
                    custom_scores={'example_score': ex_score})

    experiment = Experiment(tm, experiment_id="test_pipeline",
                            save_path="tests/experiments")

    cube_settings = [{
        'CubeCreator': {
            'num_iter': 10,
            'parameters': [
                {'name': 'seed', 'values': [82019, 322]},
            ],
            'reg_search': 'grid',
            'separate_thread': USE_MULTIPROCESS,
        },
        'selection': [
            'model.seed = 82019 and PerplexityScore -> min COLLECT 2',
        ]
    }, {
        'RegularizersModifierCube': {
            'num_iter': 10,
            'regularizer_parameters': {
                "regularizer": artm.regularizers.SmoothSparsePhiRegularizer(),
                "tau_grid": [0.1, 0.5, 1, 5, 10]
            },
            'reg_search': 'grid',
            'use_relative_coefficients': False,
            'separate_thread': USE_MULTIPROCESS,
        },
        'selection': [
            'PerplexityScore -> max COLLECT 2',
        ]
    }]

    return tm, dataset, experiment, dictionary, cube_settings
def test_custom_regularizer_cubed(experiment_enviroment, thread_flag, by_name):
    """Check that a modifier cube sets the custom regularizer's tau from a grid."""
    _, dataset, _, dictionary = experiment_enviroment
    tau_grid = [1, 0, -1]

    custom_reg = TopicPriorSampledRegularizer(
        name='topic_prior', tau=5,
        num_topics=5, beta_prior=[10, 1, 100, 2, 1000])

    model_artm = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        class_ids={MAIN_MODALITY: 1.0, NGRAM_MODALITY: 1.0},
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
    )
    tm = TopicModel(
        model_artm, model_id='new_id_1',
        custom_regularizers={custom_reg.name: custom_reg} if by_name else {})
    experiment = Experiment(  # noqa: F841
        tm, experiment_id="cubed_reg", save_path="tests/experiments")

    if by_name:
        regularizer_parameters = {
            "name": custom_reg.name,
            "tau_grid": tau_grid
        }
    else:
        regularizer_parameters = {
            "regularizer": custom_reg,
            "tau_grid": tau_grid
        }

    cube = RegularizersModifierCube(
        num_iter=10,
        regularizer_parameters=regularizer_parameters,
        reg_search="grid",
        use_relative_coefficients=False,
        separate_thread=thread_flag)

    dummies = cube(tm, dataset)
    tmodels = [dummy.restore() for dummy in dummies]

    assert len(tmodels) == len(tau_grid)
    for tau, one_model in zip(tau_grid, tmodels):
        assert one_model.all_regularizers[custom_reg.name].tau == tau
def compute_big_artm(num_topics, tau, dictionary,
                     batch_vectorizer, score_computer):
    artm_model = artm.ARTM(
        num_topics=num_topics,
        num_document_passes=5,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='s1')],
        regularizers=[artm.SmoothSparseThetaRegularizer(name='r1', tau=tau)],
        cache_theta=True)
    artm_model.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=10)

    theta_bigartm = artm_model.get_theta()
    bigartm_predicts = get_df_clusters_predicted(theta_bigartm, url_list)
    score = score_computer.compute_score(bigartm_predicts["story_id_predicted"])
    logging.info("num_topics={}, tau={}, "
                 "bigARTM score = {}".format(num_topics, tau, score))
def create_topic_model(self, topic_model_name: str,
                       batch_vectorizer: artm.BatchVectorizer,
                       dictionary: artm.Dictionary) -> artm.ARTM:
    topic_model = artm.ARTM(num_topics=self.number_of_topics,
                            dictionary=dictionary, cache_theta=False)
    topic_model.scores.add(
        artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    topic_model.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    topic_model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))

    topic_model.num_document_passes = 5
    topic_model.num_processors = max(1, os.cpu_count() - 1)

    topic_model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
    topic_model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
    topic_model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))
    topic_model.regularizers['sparse_phi_regularizer'].tau = -1.0
    topic_model.regularizers['sparse_theta_regularizer'].tau = -0.5
    topic_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+5

    best_score = None
    keyword_extraction_logger.info(
        'epoch perplexity_score sparsity_phi_score sparsity_theta_score')
    for restart_index in range(10):
        topic_model.fit_offline(batch_vectorizer=batch_vectorizer,
                                num_collection_passes=3)
        last_perplexity = topic_model.score_tracker[
            'perplexity_score'].last_value
        # save the model whenever perplexity improves (including the first fit)
        if best_score is None or best_score > last_perplexity:
            best_score = last_perplexity
            self.save_topic_model(topic_model, topic_model_name)
        keyword_extraction_logger.info(
            '{0:5} {1:16.9} {2:18.9} {3:20.9}'.format(
                (restart_index + 1) * 3,
                topic_model.score_tracker['perplexity_score'].last_value,
                topic_model.score_tracker['sparsity_phi_score'].last_value,
                topic_model.score_tracker['sparsity_theta_score'].last_value))

    del topic_model
    return self.load_topic_model(
        artm.ARTM(num_topics=self.number_of_topics,
                  dictionary=dictionary, cache_theta=False),
        topic_model_name)
def _get_corpus_model(self, corpus_vector_spaced,
                      clustering_method='artm'):
    if 'gensim' == clustering_method:
        return self._get_model_LSI(corpus_vector_spaced)
    elif 'sklearn' == clustering_method:
        return self._get_model_LDA(corpus_vector_spaced)
    elif 'artm' == clustering_method:
        batch_vectorizer = corpus_vector_spaced['batch_vectorizer']
        dictionary = corpus_vector_spaced['dictionary']

        topic_names = ['topic_{}'.format(i)
                       for i in range(self.num_of_clusters)]

        model_artm = artm.ARTM(
            topic_names=topic_names,
            cache_theta=True,
            scores=[artm.PerplexityScore(name='PerplexityScore',
                                         dictionary=dictionary)],
            regularizers=[artm.SmoothSparseThetaRegularizer(name='SparseTheta',
                                                            tau=-0.15)])

        model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
        model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
        model_artm.scores.add(
            artm.TopicKernelScore(name='TopicKernelScore',
                                  probability_mass_threshold=0.3))
        model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore',
                                                  num_tokens=10),
                              overwrite=True)

        model_artm.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1))
        model_artm.regularizers['SparseTheta'].tau = -0.2
        model_artm.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                            tau=1.5e+5))

        model_artm.num_document_passes = 1
        model_artm.initialize(dictionary)
        model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                               num_collection_passes=30)

        return model_artm.transform(batch_vectorizer=batch_vectorizer).T
def topic_model(class_ids, dictionary, num_of_topics, num_back, tau, tf):
    names_of_topics = [str(x) for x in range(num_of_topics)]
    dictionary.filter(min_tf=tf, class_id='subjects')
    dictionary.filter(min_tf=tf, class_id='objects')
    dictionary.filter(min_tf=tf, class_id='pairs')

    model = artm.ARTM(
        num_topics=num_of_topics,
        # reuse_theta=True,
        cache_theta=True,
        topic_names=names_of_topics,
        class_ids=class_ids,
        # regularizers=regularizers_artm,
        dictionary=dictionary)

    model.scores.add(
        artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary))
    model.scores.add(
        artm.SparsityPhiScore(name='SparsityPhiScore',
                              topic_names=model.topic_names[:-num_back]))

    # sparse the subject topics, smooth the last num_back background topics
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(
            name='SparsePhiRegularizer',
            class_ids=class_ids,
            topic_names=model.topic_names[:-num_back],
            tau=-tau))
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(
            name='SmoothPhiRegularizer',
            class_ids=class_ids,
            topic_names=model.topic_names[-num_back:],
            tau=tau))
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='DecorrelatorRegularizer',
            class_ids=class_ids,
            topic_names=model.topic_names[:-num_back],
            tau=tau))
    model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(
            name='SparseThetaRegularizer',
            topic_names=model.topic_names[-num_back:],
            tau=tau))
    return model
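# A hedged example of building and fitting the model above; the class-id
# weights, dictionary, batch_vectorizer, and hyperparameter values are
# placeholders for the caller's data.
class_ids = {'subjects': 1.0, 'objects': 1.0, 'pairs': 1.0}
model = topic_model(class_ids, dictionary,
                    num_of_topics=50, num_back=5, tau=0.1, tf=3)
model.fit_offline(batch_vectorizer=batch_vectorizer,
                  num_collection_passes=20)
print(model.score_tracker['PerplexityScore'].last_value)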
def create_thematic_model(checked_list, num_topics, num_tokens,
                          phi_tau, theta_tau, decorr_tau):
    """ Create a thematic model """
    gluing_bag_of_words(checked_list)

    batch_vectorizer = artm.BatchVectorizer(data_path=COLLECTION_PATH,
                                            data_format='vowpal_wabbit',
                                            target_folder=TARGET_FOLDER,
                                            batch_size=len(checked_list))
    dictionary = artm.Dictionary(data_path=TARGET_FOLDER)

    model = artm.ARTM(
        num_topics=num_topics,
        num_document_passes=len(checked_list),
        dictionary=dictionary,
        regularizers=[
            artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer',
                                            tau=phi_tau),
            artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer',
                                              tau=theta_tau),
            artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer',
                                            tau=decorr_tau),
        ],
        scores=[
            artm.PerplexityScore(name='perplexity_score',
                                 dictionary=dictionary),
            artm.SparsityPhiScore(name='sparsity_phi_score'),
            artm.SparsityThetaScore(name='sparsity_theta_score'),
            artm.TopTokensScore(name='top_tokens_score',
                                num_tokens=num_tokens)
        ])

    model.fit_offline(batch_vectorizer=batch_vectorizer,
                      num_collection_passes=len(checked_list))

    top_tokens = model.score_tracker['top_tokens_score']
    topic_dictionary = OrderedDict()
    for topic_name in model.topic_names:
        list_name = []
        for (token, weight) in zip(top_tokens.last_tokens[topic_name],
                                   top_tokens.last_weights[topic_name]):
            list_name.append(token + '-' + str(round(weight, 3)))
        topic_dictionary[str(topic_name)] = list_name

    return (model.score_tracker['perplexity_score'].last_value,
            model.score_tracker['sparsity_phi_score'].last_value,
            model.score_tracker['sparsity_theta_score'].last_value,
            topic_dictionary)
def test_func():
    topic_selection_tau = 0.5
    num_collection_passes = 3
    num_document_passes = 10
    num_topics = 15

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()

    perplexity_eps = 0.1
    perplexity_value = [
        6676.941798754971, 2534.963709464024, 2463.1544861984794
    ]

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)
        dictionary = artm.Dictionary(data_path=batches_folder)
        model = artm.ARTM(num_topics=num_topics, dictionary=dictionary,
                          num_document_passes=num_document_passes)

        model.regularizers.add(
            artm.TopicSelectionThetaRegularizer(name='TopicSelection',
                                                tau=topic_selection_tau))
        model.scores.add(artm.PerplexityScore(name='PerplexityScore'))
        model.scores.add(
            artm.TopicMassPhiScore(name='TopicMass',
                                   model_name=model.model_nwt))

        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)

        # Verify that the TopicSelection regularizer zeroed out 8 of the topics
        zeroed_topics = sum(x == 0
                            for x in model.get_score('TopicMass').topic_mass)
        assert 8 == zeroed_topics

        # the following assertion fails on travis-ci builds, but passes locally
        for i in range(num_collection_passes):
            assert abs(model.score_tracker['PerplexityScore'].value[i]
                       - perplexity_value[i]) < perplexity_eps

        model.fit_online(batch_vectorizer=batch_vectorizer)
    finally:
        shutil.rmtree(batches_folder)
def create_and_learn_PLSA(name="", topic_number=750, num_collection_passes=1):
    batch_vectorizer_train = artm.BatchVectorizer(data_path='./' + name,
                                                  data_format='vowpal_wabbit',
                                                  target_folder='folder' + name)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer_train.data_path)
    topic_names = ['topic_{}'.format(i) for i in range(topic_number)]

    model_plsa = artm.ARTM(topic_names=topic_names,
                           class_ids={'@text': 1.0,
                                      '@first': 1.0,
                                      '@second': 1.0,
                                      '@third': 1.0},
                           cache_theta=True,
                           theta_columns_naming='title',
                           scores=[artm.PerplexityScore(name='PerplexityScore',
                                                        dictionary=dictionary)])
    model_plsa.initialize(dictionary=dictionary)

    model_plsa.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model_plsa.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model_plsa.scores.add(
        artm.TopicKernelScore(name='TopicKernelScore',
                              class_id='@text',
                              probability_mass_threshold=0.3))
    model_plsa.scores.add(
        artm.TopTokensScore(name='TopTokensScore',
                            num_tokens=6, class_id='@text'))

    model_plsa.num_document_passes = 1
    model_plsa.fit_offline(batch_vectorizer=batch_vectorizer_train,
                           num_collection_passes=num_collection_passes)
    theta_train = model_plsa.transform(batch_vectorizer=batch_vectorizer_train)

    return model_plsa, theta_train
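# Hedged driver for the two training helpers above ("train.vw" is a
# placeholder Vowpal Wabbit file with @text/@first/@second/@third modalities;
# the topic counts and tau are illustrative, not from the original code).
plsa_model, plsa_theta = create_and_learn_PLSA(
    name="train.vw", topic_number=200, num_collection_passes=10)
artm_model, artm_theta = create_and_learn_ARTM_decorPhi_modal(
    name="train.vw", topic_number=200, num_collection_passes=10,
    weights=(1., 0.5, 0.5, 0.5), decorTau=1e4)
print(plsa_model.score_tracker['PerplexityScore'].last_value)
print(artm_model.score_tracker['PerplexityScore'].last_value)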
def add_scores_to_model(current_dictionary, artm_model,
                        n_top_tokens, p_mass_threshold):
    artm_model.scores.add(
        artm.PerplexityScore(name='perplexity_score',
                             use_unigram_document_model=False,
                             dictionary=current_dictionary))
    artm_model.scores.add(
        artm.SparsityPhiScore(name='sparsity_phi_score', class_id='ngramm'))
    artm_model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    artm_model.scores.add(
        artm.TopicKernelScore(name='topic_kernel_score',
                              class_id='ngramm',
                              probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(
        artm.TopTokensScore(name='top_tokens_score',
                            class_id='ngramm',
                            num_tokens=n_top_tokens))
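# Hedged usage sketch: attach the 'ngramm'-modality scores to a fresh model.
# The dictionary, topic count, and thresholds are assumptions for illustration.
model = artm.ARTM(num_topics=20, dictionary=current_dictionary,
                  class_ids={'ngramm': 1.0})
add_scores_to_model(current_dictionary, model,
                    n_top_tokens=10, p_mass_threshold=0.25)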
def artm_plsa(batch_vectorizer, topics, topic_names, dictionary):
    model_artm = artm.ARTM(num_topics=topics,
                           topic_names=topic_names,
                           num_processors=cpu_count(),
                           class_ids={"text": 1},
                           reuse_theta=True,
                           cache_theta=True,
                           num_document_passes=1)
    model_artm.initialize(dictionary=dictionary)
    model_artm.scores.add(
        artm.PerplexityScore("perplexity",
                             class_ids=["text"],
                             dictionary=dictionary))
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=50)
    print("\nPerplexity for BigARTM PLSA: ",
          model_artm.score_tracker["perplexity"].value[-1])
def pipeline_plsa_bigartm(lines, TOPIC_NUMBER, ngram_range, topnwords,
                          LOGS_DATA_PATH="plsa.txt", TARGET_FOLDER="plsa"):
    make_file(lines, ngram_range, LOGS_DATA_PATH)

    batch_vectorizer = artm.BatchVectorizer(data_path=LOGS_DATA_PATH,
                                            data_format='vowpal_wabbit',
                                            target_folder=TARGET_FOLDER)

    model_artm = artm.ARTM(num_topics=TOPIC_NUMBER, cache_theta=True)
    model_artm.initialize(dictionary=batch_vectorizer.dictionary)

    model_artm.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=0.05))
    model_artm.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=1.5e+5))
    model_artm.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.01))

    model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore',
                                              num_tokens=topnwords),
                          overwrite=True)
    model_artm.scores.add(
        artm.PerplexityScore(name='PerplexityScore',
                             dictionary=batch_vectorizer.dictionary))

    model_artm.num_document_passes = 2
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=15)

    topic_names = {}
    for topic_name in model_artm.topic_names:
        topic_names[topic_name] = model_artm.score_tracker[
            'TopTokensScore'].last_tokens[topic_name]

    # return label_after_bigarm(model_artm), topic_names
    return "nothing, sorry", topic_names
def setup_class(cls):
    """Fit a topic model and build a top-documents viewer for the tests."""
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        cls.dataset = Dataset('tests/test_data/test_dataset.csv')
        dictionary = cls.dataset.get_dictionary()
        batch_vectorizer = cls.dataset.get_batch_vectorizer()

    model_artm = artm.ARTM(
        num_topics=NUM_TOPICS,
        cache_theta=True,
        num_document_passes=NUM_DOCUMENT_PASSES,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
    )

    cls.topic_model = TopicModel(model_artm, model_id='model_id')
    cls.topic_model._fit(batch_vectorizer, num_iterations=NUM_ITERATIONS)
    cls.theta = cls.topic_model.get_theta(dataset=cls.dataset)
    cls.top_documents_viewer = top_documents_viewer.TopDocumentsViewer(
        model=cls.topic_model)
def create_model_with_background(dictionary, num_tokens, num_document_passes):
    sm_phi_tau = 0.0001 * 1e-4
    sp_phi_tau = -0.0001 * 1e-4
    decor_phi_tau = 1

    specific_topics = ['topic {}'.format(i) for i in range(1, 20)]
    topic_names = specific_topics + ["background"]

    scores = [
        artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary),
        artm.TopTokensScore(
            name='TopTokensScore', num_tokens=10, class_id='plain_text'
        ),  # web version of Palmetto works only with <= 10 tokens
        artm.SparsityPhiScore(name='SparsityPhiScore'),
        artm.SparsityThetaScore(name='SparsityThetaScore'),
        artm.TopicKernelScore(name='TopicKernelScore',
                              probability_mass_threshold=0.3,
                              class_id='plain_text')
    ]

    model = artm.ARTM(topic_names=topic_names,
                      regularizers=[],
                      cache_theta=True,
                      scores=scores,
                      class_ids={'plain_text': 1.0})

    # separate phi regularizers for the specific and the background topics
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='SparsePhi',
                                        tau=-sp_phi_tau,
                                        topic_names=specific_topics))
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='SmoothPhi',
                                        tau=sm_phi_tau,
                                        topic_names=["background"]))
    # model.regularizers.add(
    #     artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
    #                                     tau=decor_phi_tau))

    model.initialize(dictionary=dictionary)
    model.num_document_passes = num_document_passes
    return model
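# Hedged usage sketch: fit the background-aware model above and read back its
# scores. The batch vectorizer and pass counts are placeholders, not part of
# the original code.
model = create_model_with_background(dictionary, num_tokens=10,
                                     num_document_passes=5)
model.fit_offline(batch_vectorizer=batch_vectorizer,
                  num_collection_passes=20)
print(model.score_tracker['PerplexityScore'].last_value)
print(model.score_tracker['SparsityPhiScore'].last_value)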