Example #1
0
def test_infer_topics_data(method):
    """Train a model via `create_model_data` and verify the predicted topics data."""
    pytest.importorskip("gensim")

    min_probability: float = 0.001
    top_n_tokens: int = 5

    train_corpus, inferred_model = create_model_data(method)

    topics_data: tm.InferredTopicsData = tm.predict_topics(
        topic_model=inferred_model.topic_model,
        corpus=train_corpus.corpus,
        id2token=train_corpus.id2token,
        document_index=train_corpus.document_index,
        minimum_probability=min_probability,
        n_tokens=top_n_tokens,
    )

    assert topics_data is not None

    # All tabular members must be pandas data frames.
    tabular_members = (
        topics_data.document_index,
        topics_data.dictionary,
        topics_data.topic_token_weights,
        topics_data.topic_token_overview,
        topics_data.document_topic_weights,
    )
    assert all(isinstance(member, pd.DataFrame) for member in tabular_members)

    expected_topic_ids = {0, 1, 2, 3}
    assert topics_data.year_period == (2019, 2020)
    assert set(topics_data.topic_ids) == expected_topic_ids
    assert len(topics_data.document_index) == 5
    assert list(topics_data.topic_token_weights.topic_id.unique()) == sorted(expected_topic_ids)
    assert list(topics_data.topic_token_overview.index) == sorted(expected_topic_ids)
    assert set(topics_data.document_topic_weights.topic_id.unique()) == expected_topic_ids
Example #2
0
    def predict(
        self: TopicModelMixinProtocol,
        *,
        inferred_model: tm.InferredModel,
        id2token: dict,
        corpus: corpora.Sparse2Corpus | pc.VectorizedCorpus,
        document_index: pd.DataFrame,
        target_folder: str,
        n_tokens: int,
        minimum_probability: float,
        **kwargs,
    ) -> tm.InferredTopicsData:
        """Predict document-topic distributions and persist the result to disk.

        Args:
            inferred_model (tm.InferredModel): Trained model; its `topic_model`
                attribute is passed to `tm.predict_topics`.
            id2token (dict): Token-id to token mapping (overridden by the corpus'
                own `id2token` when the corpus is a `VectorizedCorpus`).
            corpus (Sparse2Corpus | VectorizedCorpus): Corpus to predict topics
                for; other types are coerced to a vectorized corpus (see below).
            document_index (pd.DataFrame): Document metadata (overridden by the
                corpus' own `document_index` when the corpus is a
                `VectorizedCorpus`).
            target_folder (str): Folder where model options and the resulting
                topics data are stored.
            n_tokens (int): Forwarded to `tm.predict_topics`. Defaults to 200
                at call sites elsewhere — presumably tokens kept per topic.
            minimum_probability (float): Forwarded to `tm.predict_topics`.
                Defaults to 0.001 at call sites elsewhere.
            **kwargs: Extra keyword arguments forwarded to `tm.predict_topics`.

        Returns:
            tm.InferredTopicsData: The predicted topics data (also stored in
            `target_folder` before returning).
        """
        if not isinstance(corpus, (pc.VectorizedCorpus, corpora.Sparse2Corpus)):
            # raise ValueError(f"predict: corpus type {type(corpus)} not supported in predict (use sparse instead)")
            # NOTE(review): unsupported corpus types are coerced rather than rejected.
            # `id2token2token2id` presumably inverts the id2token mapping into a
            # token2id mapping — confirm; the incoming `corpus` argument itself is
            # NOT used here, the instream is re-vectorized instead.
            corpus = self.instream_to_vectorized_corpus(token2id=id2token2token2id(id2token))

        if isinstance(corpus, pc.VectorizedCorpus):
            # Prefer the corpus' own metadata/vocabulary over the passed-in arguments.
            """Make sure we use corpus' own data"""
            document_index = corpus.document_index
            id2token = corpus.id2token

        topics_data: tm.InferredTopicsData = tm.predict_topics(
            inferred_model.topic_model,
            corpus=corpus,
            id2token=id2token,
            document_index=document_index,
            n_tokens=n_tokens,
            minimum_probability=minimum_probability,
            **kwargs,
        )
        # Persist model options and predicted topics side by side in the target folder.
        inferred_model.store_options(target_folder)
        topics_data.store(target_folder)
        return topics_data
Example #3
0
        def compute_topic_model_handler(*_):
            """Widget callback: train a topic model, predict topics, store results,
            update shared state, and display the topic/token overview."""

            self.model_widgets.output.clear_output()

            # Show busy indicator while the (long-running) training executes.
            buzy(True)

            # Only surface gensim's internal logging when trace display is enabled.
            gensim_logger.setLevel(logging.INFO if self.model_widgets.
                                   show_trace.value else logging.WARNING)

            with self.model_widgets.output:

                try:

                    # Unique name doubles as the model's target sub-folder name.
                    name: str = str(uuid.uuid1())

                    # FIXME: Move code block out of GUI (to workflows)
                    target_folder = os.path.join(self.data_folder, name)
                    # NOTE(review): target_folder is never created with makedirs here;
                    # presumably `topic_model.save` below requires it to exist — confirm.

                    vectorizer_args = dict(
                        apply_idf=self.model_widgets.apply_idf.value)

                    topic_modeller_args = dict(
                        n_topics=self.model_widgets.n_topics.value,
                        max_iter=self.model_widgets.max_iter.value,
                        learning_method='online',
                        n_jobs=1,
                    )

                    method = self.model_widgets.method.value

                    # NOTE(review): `corpus` is taken from the enclosing scope (not
                    # visible in this block) — verify it is bound before this handler runs.
                    train_corpus = tm.TrainingCorpus(
                        corpus=list(self.get_corpus_terms(corpus)),
                        document_index=self.document_index,
                        vectorizer_args=vectorizer_args,
                    )

                    trained_model: tm.InferredModel = tm.train_model(
                        train_corpus=train_corpus,
                        method=method,
                        engine_args=topic_modeller_args)

                    # Persist the raw gensim model, the wrapped model, and the corpus.
                    trained_model.topic_model.save(
                        os.path.join(target_folder, 'gensim.model'))
                    trained_model.store(folder=target_folder,
                                        store_compressed=True)
                    train_corpus.store(folder=target_folder)

                    inferred_topics: tm.InferredTopicsData = tm.predict_topics(
                        topic_model=trained_model.topic_model,
                        corpus=train_corpus.corpus,
                        id2token=train_corpus.id2token,
                        document_index=train_corpus.document_index,
                        n_tokens=self.n_tokens,
                        minimum_probability=self.minimum_probability,
                    )

                    inferred_topics.store(target_folder=target_folder,
                                          pickled=False)

                    # Publish results to shared GUI state for other views.
                    self.state.update(trained_model=trained_model,
                                      inferred_topics=inferred_topics,
                                      train_corpus=train_corpus)

                    # NOTE(review): reads `self.state.topic_model` / `self.inferred_topics`
                    # rather than the local `trained_model` / `inferred_topics` — presumably
                    # the state.update above makes these equivalent; confirm.
                    topics: pd.DataFrame = get_topics_unstacked(
                        self.state.topic_model,
                        n_tokens=100,
                        id2term=self.inferred_topics.id2term,
                        topic_ids=self.inferred_topics.topic_ids,
                    )

                    display(topics)

                except Exception as ex:
                    # Log, clear stale topics from shared state, and re-raise so the
                    # failure is visible in the output widget.
                    logger.error(ex)
                    self.state.update(inferred_topics=None)
                    raise
                finally:
                    buzy(False)
Example #4
0
File: train.py  Project: humlab/penelope
def compute(
    name: str = None,
    corpus_folder: str = None,
    corpus_source: str = None,
    engine: str = "gensim_lda-multicore",
    engine_args: dict = None,
    filename_field: str = None,
    minimum_probability: float = 0.001,
    n_tokens: int = 200,
    store_corpus: bool = False,
    compressed: bool = True,
):
    """Train a topic model on a text corpus and store model plus predicted topics.

    Reads `*.txt` files from `corpus_source`, trains a topic model with the given
    `engine`, and writes the gensim model, the wrapped inferred model, and the
    predicted topics data into `<corpus_folder>/<name>`.

    Args:
        name: Target sub-folder name for all stored artifacts.
        corpus_folder: Output parent folder; derived from `corpus_source`'s
            directory when not given.
        corpus_source: Source corpus filename/folder to read documents from.
        engine: Topic-modeling engine id; must be in `SUPPORTED_ENGINES`.
        engine_args: Engine-specific training arguments.
        filename_field: Filename field specification(s) used to extract document
            metadata from filenames; must be non-empty.
        minimum_probability: Minimum document-topic probability to retain.
        n_tokens: Number of tokens to keep per topic.
        store_corpus: If True, also store the training corpus.
        compressed: If True, store the inferred model compressed.

    Raises:
        ValueError: If the engine is unsupported, if neither `corpus_source`
            nor `corpus_folder` is given, or if `filename_field` is empty.
    """
    if engine not in SUPPORTED_ENGINES:
        raise ValueError(f"Engine {engine} not supported or deprecated")

    if corpus_source is None and corpus_folder is None:
        # Fixed: previous message was just "corpus filename", which did not explain the problem.
        raise ValueError("either corpus_source or corpus_folder must be specified")

    if not filename_field:
        raise ValueError("at least one corpus filename field must be specified")

    if corpus_folder is None:
        # Default the output parent folder to the source file's directory.
        corpus_folder, _ = os.path.split(os.path.abspath(corpus_source))

    target_folder = os.path.join(corpus_folder, name)

    os.makedirs(target_folder, exist_ok=True)

    reader_opts = TextReaderOpts(
        filename_pattern="*.txt",
        filename_filter=None,
        filename_fields=filename_field,
    )

    transform_opts = TextTransformOpts(fix_whitespaces=False,
                                       fix_hyphenation=True)

    tokens_reader = TextTokenizer(
        source=corpus_source,
        transform_opts=transform_opts,
        reader_opts=reader_opts,
    )

    corpus: TokenizedCorpus = TokenizedCorpus(reader=tokens_reader,
                                              transform_opts=None)

    # Record the reader/transform options alongside the corpus for reproducibility.
    train_corpus: tm.TrainingCorpus = tm.TrainingCorpus(
        corpus=corpus,
        corpus_options=dict(
            reader_opts=reader_opts.props,
            transform_opts=transform_opts.props,
        ),
    )

    inferred_model: tm.InferredModel = tm.train_model(
        train_corpus=train_corpus,
        method=engine,
        engine_args=engine_args,
    )

    # Persist the raw gensim model and the wrapped inferred model.
    inferred_model.topic_model.save(
        os.path.join(target_folder, 'gensim.model.gz'))

    inferred_model.store(target_folder, store_compressed=compressed)

    if store_corpus:
        train_corpus.store(target_folder)

    inferred_topics: tm.InferredTopicsData = tm.predict_topics(
        inferred_model.topic_model,
        corpus=train_corpus.corpus,
        id2token=train_corpus.id2token,
        document_index=train_corpus.document_index,
        minimum_probability=minimum_probability,
        n_tokens=n_tokens,
    )

    inferred_topics.store(target_folder)
Example #5
0
def compute(
    *,
    target_name: str = None,
    corpus_source: str = None,
    target_folder: str = None,
    reader_opts: TextReaderOpts = None,
    text_transform_opts: TextTransformOpts = None,
    transform_opts: TokensTransformOpts = None,
    engine: str = "gensim_lda-multicore",
    engine_args: dict = None,
    store_corpus: bool = False,
    store_compressed: bool = True,
    n_tokens: int = 200,
    minimum_probability: float = 0.001,
):
    """Train a topic model on a tokenized text corpus and store all artifacts.

    Reads documents from `corpus_source`, trains a topic model with `engine`,
    predicts document topics, and writes the gensim model, the inferred model,
    and the topics data into `target_folder`.

    Args:
        target_name: Logical tag for the run, returned in the result dict.
        corpus_source: Source corpus filename/folder to read documents from.
        target_folder: Folder where all artifacts are stored.
        reader_opts: Text reader options (filename pattern/fields); optional.
        text_transform_opts: Text-level transforms applied while reading.
        transform_opts: Token-level transforms applied to the tokenized corpus.
        engine: Topic-modeling engine id.
        engine_args: Engine-specific training arguments.
        store_corpus: If True, also store the training corpus.
        store_compressed: If True, store the inferred model compressed.
        n_tokens: Number of tokens to keep per topic.
        minimum_probability: Minimum document-topic probability to retain.

    Returns:
        dict: `{'folder': target_folder, 'tag': target_name}`.
    """

    tokens_reader = TextTokenizer(
        source=corpus_source,
        transform_opts=text_transform_opts,
        reader_opts=reader_opts,
    )

    corpus: TokenizedCorpus = TokenizedCorpus(reader=tokens_reader,
                                              transform_opts=transform_opts)

    # Record the reader/transform options for reproducibility. Fixed: guard the
    # `.props` access so that leaving `reader_opts`/`transform_opts` at their
    # `None` defaults no longer raises AttributeError.
    train_corpus: tm.TrainingCorpus = tm.TrainingCorpus(
        corpus=corpus,
        document_index=corpus.document_index,
        token2id=corpus.token2id,
        corpus_options=dict(
            reader_opts=reader_opts.props if reader_opts is not None else None,
            transform_opts=transform_opts.props if transform_opts is not None else None,
        ),
    )

    inferred_model: tm.InferredModel = tm.train_model(
        train_corpus=train_corpus,
        method=engine,
        engine_args=engine_args,
    )

    # Persist the raw gensim model and the wrapped inferred model.
    inferred_model.topic_model.save(jj(target_folder, 'gensim.model.gz'))

    inferred_model.store(target_folder, store_compressed=store_compressed)

    if store_corpus:
        train_corpus.store(target_folder)

    inferred_topics: tm.InferredTopicsData = tm.predict_topics(
        inferred_model.topic_model,
        corpus=train_corpus.corpus,
        id2token=train_corpus.id2token,
        document_index=train_corpus.document_index,
        n_tokens=n_tokens,
        minimum_probability=minimum_probability,
    )

    inferred_topics.store(target_folder)

    return dict(folder=target_folder, tag=target_name)