import os
import shutil
import tempfile

import artm


def test_func():
    # constants
    num_tokens = 15
    parent_level_weight = 1
    num_collection_passes = 15
    num_document_passes = 10
    num_topics_level0 = 15
    num_topics_level1 = 50
    regularizer_tau = 10 ** 5
    vocab_size = 6906
    num_docs = 3430
    zero_eps = 0.001

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()
    parent_batch_folder = tempfile.mkdtemp()
    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)
        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)

        hier = artm.hARTM(dictionary=dictionary,
                          cache_theta=True,
                          num_document_passes=num_document_passes)

        level0 = hier.add_level(num_topics=num_topics_level0)
        level0.initialize(dictionary=dictionary)
        level0.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=num_collection_passes)

        hier.tmp_files_path = parent_batch_folder
        level1 = hier.add_level(num_topics=num_topics_level1,
                                parent_level_weight=parent_level_weight)
        level1.initialize(dictionary=dictionary)
        level1.regularizers.add(
            artm.HierarchySparsingThetaRegularizer(name="HierSp",
                                                   tau=regularizer_tau))
        level1.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=num_collection_passes)

        phi = hier.get_level(1).get_phi()
        assert phi.shape == (vocab_size, num_topics_level1)

        # theta = hier.get_level(1).get_theta()
        # assert theta.shape == (num_topics_level1, num_docs)

        psi = hier.get_level(1).get_psi()
        support = psi.values.max(axis=1).min()

        # This test gives different results on python27 and python35. Authors need to investigate.
        on_python_27 = abs(support - 0.0978) < zero_eps
        on_python_35 = abs(support - 0.1522) < zero_eps
        assert on_python_27 or on_python_35

        assert level1.clone() is not None
        assert hier.clone() is not None
    finally:
        shutil.rmtree(batches_folder)
        shutil.rmtree(parent_batch_folder)

def init_hierarchical_model(class_ids):
    score = [artm.PerplexityScore(name='perplexity_words',
                                  class_ids=['body']),
             artm.PerplexityScore(name='perplexity_bigrams',
                                  class_ids=['bigrams'])]

    top_tokens = [artm.TopTokensScore(name='top_words',
                                      num_tokens=15,
                                      class_id='body'),
                  artm.TopTokensScore(name='top_bigrams',
                                      num_tokens=10,
                                      class_id='bigrams')]

    sparsity = [artm.SparsityThetaScore(name='sparsity_theta', eps=1e-6),
                artm.SparsityPhiScore(name='sparsity_phi_words',
                                      class_id='words', eps=1e-6),
                artm.SparsityPhiScore(name='sparsity_phi_bigrams',
                                      class_id='bigrams', eps=1e-6)]

    regularizers = [artm.DecorrelatorPhiRegularizer(tau=0,
                                                    class_ids=['body'],
                                                    name='decorr_words'),
                    artm.DecorrelatorPhiRegularizer(tau=0,
                                                    class_ids=['bigram'],
                                                    name='decorr_bigrams'),
                    artm.DecorrelatorPhiRegularizer(tau=0,
                                                    class_ids=['categories'],
                                                    name='decorr_categories'),
                    artm.SmoothSparseThetaRegularizer(tau=0,
                                                      name='sparsity_theta'),
                    artm.SmoothSparsePhiRegularizer(tau=0,
                                                    class_ids=['body'],
                                                    name='sparsity_words'),
                    artm.SmoothSparsePhiRegularizer(tau=0,
                                                    class_ids=['bigram'],
                                                    name='sparsity_bigrams')]

    hmodel = artm.hARTM(class_ids=class_ids,
                        cache_theta=True,
                        reuse_theta=True,
                        scores=score + top_tokens + sparsity,
                        regularizers=regularizers,
                        theta_columns_naming='title')
    return hmodel

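# A minimal usage sketch for init_hierarchical_model, assuming batches have
# already been built for the collection. The folder name, modality weights,
# topic counts, and pass counts below are illustrative, not from the original
# code; scores and regularizers passed to the hARTM constructor are shared by
# the levels created afterwards.
batch_vectorizer = artm.BatchVectorizer(data_path='batches', data_format='batches')
dictionary = artm.Dictionary()
dictionary.gather(data_path='batches')

hmodel = init_hierarchical_model(class_ids={'body': 1.0, 'bigrams': 0.5, 'categories': 0.1})

# Levels are created and fitted one by one, coarse to fine.
for num_topics in (10, 50):
    level = hmodel.add_level(num_topics=num_topics)
    level.initialize(dictionary=dictionary)
    level.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=15)
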
def create_simple(self, iter_count, regularizers={}):
    self.log("Creating simple model...")
    layers_count = self.layers_count
    num_topics = [int(x) for x in self.topics_count.split()]
    batch_vectorizer, dictionary = self.dataset.get_batches()

    model = artm.hARTM(num_document_passes=iter_count,
                       theta_columns_naming="id")
    model.cache_theta = True

    layers = [0 for i in range(layers_count)]
    layers[0] = model.add_level(
        num_topics=num_topics[1],
        topic_names=["topic" + str(t) for t in range(num_topics[1])])
    layers[0].initialize(dictionary=dictionary)
    self.log("Layer 0 initialized.")

    if regularizers:
        # Build the regularizer calls as source code and exec() them.
        reg_code = ""
        for name, params in regularizers.items():
            params_init = []
            for pname, value in params.items():
                if len(value) > 10:
                    raise RuntimeError(
                        "Too long value for parameter %s.%s" % (name, pname))
                params_init.append(pname + "=" + value)
            reg_code += "layers[0].regularizers.add(artm.%s(%s))\n" % (
                name, ", ".join(params_init))
        self.log("Regularizers to be applied:<br>" +
                 reg_code.replace("\n", "<br>"))
        exec(reg_code)

    layers[0].fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=iter_count)
    self.log("Layer 0 fitted.")

    for layer_id in range(1, layers_count):
        layers[layer_id] = model.add_level(
            parent_level_weight=0.1,
            num_topics=num_topics[layer_id + 1],
            topic_names=[
                "topic" + str(t) for t in range(num_topics[layer_id + 1])
            ])
        layers[layer_id].initialize(dictionary=dictionary)
        self.log("Layer " + str(layer_id) + " initialized.")
        layers[layer_id].fit_offline(batch_vectorizer=batch_vectorizer,
                                     num_collection_passes=iter_count)
        self.log("Layer " + str(layer_id) + " fitted.")

    self.log("Model built.")
    return model

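# A hedged usage sketch for create_simple: the `regularizers` mapping goes from
# an artm regularizer class name to parameter strings that are spliced verbatim
# into generated code, so each value is a Python source fragment (hence the
# quoted name, and note the 10-character limit per value). `builder` is a
# hypothetical instance of the class this method belongs to.
model = builder.create_simple(
    iter_count=20,
    regularizers={
        "SmoothSparsePhiRegularizer": {"name": "'ssp'", "tau": "-0.1"},
        "DecorrelatorPhiRegularizer": {"name": "'dec'", "tau": "1e4"},
    })
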
def __init__(self, dictionary, class_ids,
             tmp_files_path='', theta_columns_naming='title', cache_theta=True,
             num_levels=None, level_names=None,
             num_topics=None, topic_names=None,
             num_backgrounds=None, background_names=None,
             smooth_background_tau=None, decorrelate_phi_tau=None,
             parent_topics_proportion=None, spars_psi_tau=None,
             smooth_theta_fit=1.0,
             num_collection_passes=1, num_tokens=10):
    self.model = artm.hARTM(dictionary=dictionary,
                            class_ids=class_ids,
                            theta_columns_naming=theta_columns_naming,
                            tmp_files_path=tmp_files_path,
                            cache_theta=cache_theta)

    self.level_names = _generate_names(num_levels, level_names, 'level')
    topic_names = _generate_names_levels(len(self.level_names),
                                         num_topics, topic_names, 'topic')
    background_names = _generate_names_levels(len(self.level_names),
                                              num_backgrounds, background_names,
                                              'background')

    # Each level holds its subject topics plus its background topics.
    for topic_names_level, background_names_level in zip(topic_names, background_names):
        topic_names_level = topic_names_level + background_names_level
        level = self.model.add_level(num_topics=len(topic_names_level),
                                     topic_names=topic_names_level)

    if smooth_background_tau is not None:
        for level, background_names_level in zip(self.model, background_names):
            level.regularizers.add(
                artm.SmoothSparsePhiRegularizer('SPhi_back',
                                                tau=smooth_background_tau,
                                                gamma=0,
                                                topic_names=background_names_level))

    if decorrelate_phi_tau is not None:
        for level in self.model:
            level.regularizers.add(
                artm.DecorrelatorPhiRegularizer('DPhi',
                                                tau=decorrelate_phi_tau,
                                                gamma=0))

    if (parent_topics_proportion is not None) and (spars_psi_tau is not None):
        # The sparsing regularizer only applies below the root level.
        for level, parent_topics_proportion_level in zip(self.model[1:],
                                                         parent_topics_proportion):
            for topic_name, parent_topic_proportion in parent_topics_proportion_level.items():
                level.regularizers.add(
                    artm.HierarchySparsingThetaRegularizer(
                        name=f'HSTheta_{topic_name}',
                        topic_names=topic_name,
                        tau=spars_psi_tau,
                        parent_topic_proportion=parent_topic_proportion))

    self.smooth_theta_fit = smooth_theta_fit
    self.num_collection_passes = num_collection_passes

    for level in self.model:
        for class_id, weight in class_ids.items():
            if weight > 0:
                level.scores.add(artm.TopTokensScore(name=f'TT_{class_id}',
                                                     class_id=class_id,
                                                     num_tokens=num_tokens))

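# The constructor above only assembles the hierarchy; smooth_theta_fit and
# num_collection_passes are stored for a later training call. A minimal sketch
# of what such a companion fit method could look like (this method is assumed,
# not part of the original class): each level is fitted in turn, with a theta
# smoothing regularizer weighted by the stored smooth_theta_fit.
def fit(self, batch_vectorizer):
    for level in self.model:
        level.regularizers.add(
            artm.SmoothSparseThetaRegularizer('STheta_fit',
                                              tau=self.smooth_theta_fit))
        level.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=self.num_collection_passes)
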
def _extract_hierarchical_relationship(
        self,
        bank_phi: pd.DataFrame,
        new_model_phi: pd.DataFrame,
        psi_threshold: float = None) -> Tuple[List[int], Dict[int, List[int]]]:

    if bank_phi.shape[1] == 0:
        return list(range(new_model_phi.shape[1])), dict()

    assert bank_phi.shape[0] == new_model_phi.shape[0]

    # TODO: think about bank_phi.shape[1] == 1: alright to proceed?

    _logger.debug('Creating hARTM')
    hierarchy = artm.hARTM(num_processors=1)

    _logger.debug(f'Creating first level with {bank_phi.shape[1]} topics')
    level0 = hierarchy.add_level(num_topics=bank_phi.shape[1])
    level0.initialize(dictionary=self._dictionary)

    _logger.debug(f'Copying phi for the first level.'
                  f' Phi shape: {bank_phi.shape}.'
                  f' First words: {bank_phi.index[:10]}')
    phi_ref0 = _safe_copy_phi(level0, bank_phi, self._dataset,
                              small_num_fit_iterations=1)

    _logger.debug(f'Creating second level with {new_model_phi.shape[1]} topics')
    level1 = hierarchy.add_level(num_topics=new_model_phi.shape[1],
                                 parent_level_weight=1)
    level1.initialize(dictionary=self._dictionary)

    # Regularizer may help to refine new topics a bit
    # in search of parent-child relationship.
    # However, the regularizer won't affect the topics themselves,
    # only the ARTM hierarchy defined here.
    _logger.debug('Adding HierarchySparsingThetaRegularizer to second level')
    # TODO: or smaller tau? or without regularizer at all? or change the real topics?
    level1.regularizers.add(
        artm.HierarchySparsingThetaRegularizer(name='sparse_hierarchy', tau=1.0))

    _logger.debug(f'Copying phi for the second level.'
                  f' Phi shape: {new_model_phi.shape}.'
                  f' First words: {new_model_phi.index[:10]}')
    phi_ref1 = _safe_copy_phi(level1, new_model_phi, self._dataset,
                              small_num_fit_iterations=3)

    psi = level1.get_psi()

    assert psi.shape[0] == new_model_phi.shape[1]
    assert psi.shape[1] == bank_phi.shape[1]

    if psi_threshold is None:
        psi_threshold = 1.0 / psi.shape[0]

    topics_for_append: List[int] = list()
    topics_for_update: Dict[int, List[int]] = defaultdict(list)

    _logger.debug('Analyzing Psi for parent-child relationship')

    for new_topic in range(level1.get_phi().shape[1]):
        psi_row = psi.iloc[new_topic, :]
        parents = np.where(psi_row > psi_threshold)[0]

        if len(parents) > 1:
            pass  # linearly dependent -> skip
        elif len(parents) == 0:
            topics_for_append.append(new_topic)
        elif len(parents) == 1:
            topics_for_update[parents[0]].append(new_topic)
        else:
            assert False

    _logger.debug('Deleting hARTM')
    hierarchy.del_level(1)
    hierarchy.del_level(0)
    del phi_ref1
    del phi_ref0
    del hierarchy
    gc.collect()

    return topics_for_append, topics_for_update

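# A small worked example of the Psi-threshold logic above, with made-up numbers:
# each row of Psi holds one child topic's weights across parent topics, and the
# default threshold is uniform over children (1 / number of child topics). Here
# children 0 and 2 each exceed the threshold for exactly one parent and would be
# recorded in topics_for_update; child 1 exceeds it for two parents and is
# skipped; a row of uniformly small values would land in topics_for_append.
import numpy as np
import pandas as pd

psi = pd.DataFrame([[0.9, 0.05, 0.05],   # child 0 -> parent 0
                    [0.5, 0.45, 0.05],   # child 1 -> two parents, skipped
                    [0.1, 0.1, 0.8]])    # child 2 -> parent 2
psi_threshold = 1.0 / psi.shape[0]       # 1/3 here

for child in range(psi.shape[0]):
    parents = np.where(psi.iloc[child, :] > psi_threshold)[0]
    print(child, parents)                # -> 0 [0], 1 [0 1], 2 [2]
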
def test_func():
    # constants
    num_documents = 3430
    vocabulary_size = 6906
    num_document_passes = 10
    num_collection_passes = 15
    num_topics_level_0 = 15
    num_topics_level_1 = 50
    parent_level_weight = 1
    regularizer_tau = 10 ** 5
    zero_eps = 0.001

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()
    parent_batch_folder = tempfile.mkdtemp()
    hierarchy_model_folder = tempfile.mkdtemp()
    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)
        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)

        hierarchy = artm.hARTM(dictionary=dictionary,
                               cache_theta=True,
                               num_document_passes=num_document_passes,
                               tmp_files_path=parent_batch_folder,
                               theta_columns_naming="title")

        level_0 = hierarchy.add_level(num_topics=num_topics_level_0)
        level_0.initialize(dictionary=dictionary)
        level_0.fit_offline(batch_vectorizer=batch_vectorizer,
                            num_collection_passes=num_collection_passes)

        phi_0 = hierarchy.get_level(0).get_phi()
        assert phi_0.shape == (vocabulary_size, num_topics_level_0)

        theta_0 = hierarchy.get_level(0).get_theta()
        assert theta_0.shape == (num_topics_level_0, num_documents)

        level_1 = hierarchy.add_level(num_topics=num_topics_level_1,
                                      parent_level_weight=parent_level_weight)
        level_1.initialize(dictionary=dictionary)
        level_1.regularizers.add(
            artm.HierarchySparsingThetaRegularizer(name="HierSparsTheta",
                                                   tau=regularizer_tau))
        level_1.fit_offline(batch_vectorizer=batch_vectorizer,
                            num_collection_passes=num_collection_passes)

        phi_1 = hierarchy.get_level(1).get_phi()
        assert phi_1.shape == (vocabulary_size, num_topics_level_1)

        theta_1 = hierarchy.get_level(1).get_theta()
        assert theta_1.shape == (num_topics_level_1, num_documents)

        psi = hierarchy.get_level(1).get_psi()
        assert psi.shape == (num_topics_level_1, num_topics_level_0)

        support = psi.values.max(axis=1).min()

        # This test gives different results on python27 and python35. Authors need to investigate.
        on_python_27 = abs(support - 0.0978) < zero_eps
        on_python_35 = abs(support - 0.1522) < zero_eps
        assert on_python_27 or on_python_35

        assert level_0.clone() is not None
        assert level_1.clone() is not None
        assert hierarchy.clone() is not None

        # Test save and load methods
        hierarchy.save(hierarchy_model_folder)

        hierarchy_load = artm.hARTM()
        hierarchy_load.load(hierarchy_model_folder)

        assert level_0.num_topics == hierarchy_load.get_level(0).num_topics
        assert (phi_0 - hierarchy_load.get_level(0).get_phi()).abs().max().max() < 1e-3

        assert level_1.num_topics == hierarchy_load.get_level(1).num_topics
        assert (phi_1 - hierarchy_load.get_level(1).get_phi()).abs().max().max() < 1e-3

        # Test the add_level method when topic_names is given but num_topics is not
        hierarchy_new = artm.hARTM(dictionary=dictionary,
                                   cache_theta=True,
                                   num_document_passes=num_document_passes,
                                   tmp_files_path=parent_batch_folder,
                                   theta_columns_naming="title")

        level_0_new = hierarchy_new.add_level(topic_names=level_0.topic_names)
        level_0_new.initialize(dictionary=dictionary)
        level_0_new.fit_offline(batch_vectorizer=batch_vectorizer,
                                num_collection_passes=num_collection_passes)

        phi_0_new = hierarchy_new.get_level(0).get_phi()
        assert (phi_0 - phi_0_new).abs().max().max() < 1e-3

        theta_0_new = hierarchy_new.get_level(0).get_theta()
        assert (theta_0 - theta_0_new).abs().max().max() < 1e-3

        level_1_new = hierarchy_new.add_level(topic_names=level_1.topic_names)
        level_1_new.initialize(dictionary=dictionary)
        level_1_new.regularizers.add(
            artm.HierarchySparsingThetaRegularizer(name="HierSparsTheta",
                                                   tau=regularizer_tau))
        level_1_new.fit_offline(batch_vectorizer=batch_vectorizer,
                                num_collection_passes=num_collection_passes)

        phi_1_new = hierarchy_new.get_level(1).get_phi()
        assert (phi_1 - phi_1_new).abs().max().max() < 1e-3

        theta_1_new = hierarchy_new.get_level(1).get_theta()
        assert (theta_1 - theta_1_new).abs().max().max() < 1e-3

        psi_new = hierarchy_new.get_level(1).get_psi()
        assert (psi - psi_new).abs().max().max() < 1e-3

        # Test the same functionality with plain ARTM, and validate that the resulting psi matrix is exactly the same
        level_0_plain = artm.ARTM(num_topics=num_topics_level_0,
                                  num_document_passes=num_document_passes,
                                  cache_theta=True,
                                  seed=level_0.seed,
                                  theta_columns_naming="title")
        level_0_plain.initialize(dictionary=dictionary)
        level_0_plain.fit_offline(num_collection_passes=num_collection_passes,
                                  batch_vectorizer=batch_vectorizer)

        phi_0_plain = level_0_plain.get_phi()
        assert (phi_0 - phi_0_plain).abs().max().max() < 1e-3

        theta_0_plain = level_0_plain.get_theta()
        assert (theta_0 - theta_0_plain).abs().max().max() < 1e-3

        level_1_plain = artm.ARTM(num_topics=num_topics_level_1,
                                  num_document_passes=num_document_passes,
                                  parent_model=level_0_plain,
                                  parent_model_weight=parent_level_weight,
                                  cache_theta=True,
                                  seed=level_1.seed,
                                  theta_columns_naming="title")
        level_1_plain.initialize(dictionary=dictionary)
        level_1_plain.regularizers.add(
            artm.HierarchySparsingThetaRegularizer(name="HierSparsTheta",
                                                   tau=regularizer_tau))
        level_1_plain.fit_offline(num_collection_passes=num_collection_passes,
                                  batch_vectorizer=batch_vectorizer)

        phi_1_plain = level_1_plain.get_phi()
        assert (phi_1 - phi_1_plain).abs().max().max() < 1e-3

        theta_1_plain = level_1_plain.get_theta()
        assert (theta_1 - theta_1_plain).abs().max().max() < 1e-3

        psi_plain = level_1_plain.get_parent_psi()
        assert (psi - psi_plain).abs().max().max() < 1e-3
    finally:
        shutil.rmtree(batches_folder)
        shutil.rmtree(parent_batch_folder)
        shutil.rmtree(hierarchy_model_folder)