def _load_phi(self, num_topics: int, subsample_number: int) -> pd.DataFrame:
    """Load a saved topic model and return the result of its ``get_phi()``.

    The model location is resolved by ``self._folder_path_model`` from the
    number of topics and the subsample number.

    Parameters
    ----------
    num_topics
        Number of topics of the model to load.
    subsample_number
        Number of the subsample the model belongs to
        (forwarded to ``_folder_path_model`` as a keyword argument).

    Returns
    -------
    pd.DataFrame
        Whatever ``TopicModel.get_phi()`` returns for the loaded model.
    """
    model_folder_path = self._folder_path_model(
        num_topics, subsample_number=subsample_number
    )

    return TopicModel.load(model_folder_path).get_phi()
示例#2
0
    def test_optimize_for_model(self, keep_in_memory, model_family):
        """Check the folders and scores produced by ``OptimizeScoresMethod``.

        The optimizer is run over a tiny grid of topic numbers with several
        restarts. Afterwards the test verifies that:

        * the experiment folder holds one subfolder per restart, each named
          starting with the experiment name;
        * every restart subfolder holds one saved model per grid point;
        * every saved model has both scores, the perplexity score has one
          value per fit iteration, the diversity score has a single value,
          and all recorded values are numbers.

        ``keep_in_memory`` is forwarded to the dataset settings of the text
        collection; ``model_family`` selects the model family and, through
        its ``value``, the experiment name. Both are presumably supplied by
        test parametrization -- confirm against the decorators of this test.
        """
        # Thetaless currently fails
        # see https://github.com/machine-intelligence-laboratory/TopicNet/issues/79

        perplexity_name = 'perplexity_score'
        perplexity = PerplexityScore(
            name=perplexity_name,
            class_ids=[self.main_modality, self.other_modality],
        )

        diversity_name = 'diversity_score'
        diversity = DiversityScore(
            diversity_name, class_ids=self.main_modality
        )

        self.text_collection._set_dataset_kwargs(keep_in_memory=keep_in_memory)

        fewest_topics, most_topics, topics_step = 1, 2, 1
        fit_iterations = 3
        restarts = 3
        # One saved model is expected per point of the topic-number grid.
        models_per_restart = len(
            range(fewest_topics, most_topics + 1, topics_step)
        )
        run_name = model_family.value
        output_folder = self.working_folder_path

        optimizer = OptimizeScoresMethod(
            scores=[perplexity, diversity],
            model_family=model_family,
            min_num_topics=fewest_topics,
            max_num_topics=most_topics,
            num_topics_interval=topics_step,
            num_fit_iterations=fit_iterations,
            num_restarts=restarts,
            one_model_num_processors=1,
            separate_thread=False,
            experiment_name=run_name,
            experiment_directory=output_folder,
        )
        optimizer.search_for_optimum(text_collection=self.text_collection)

        restart_dirs = os.listdir(output_folder)

        assert len(restart_dirs) == restarts

        score_names = (perplexity_name, diversity_name)

        for restart_dir in restart_dirs:
            assert restart_dir.startswith(run_name)

            restart_path = os.path.join(output_folder, restart_dir)
            model_dirs = os.listdir(restart_path)

            assert len(model_dirs) == models_per_restart

            for model_dir in model_dirs:
                model = TopicModel.load(os.path.join(restart_path, model_dir))

                for score_name in score_names:
                    assert score_name in model.scores

                # The perplexity score is recorded on every fit iteration,
                # while the diversity score ends up with a single value.
                assert len(model.scores[perplexity_name]) == fit_iterations
                assert len(model.scores[diversity_name]) == 1

                for score_name in score_names:
                    assert all(
                        isinstance(value, Number)
                        for value in model.scores[score_name]
                    )