示例#1
0
文件: bogger.py 项目: humlab/penelope
def debug_main(
    config_filename: str = None,
    corpus_source: str = None,
    lemmatize: bool = True,
    pos_includes: str = '',
    pos_excludes: str = '',
    to_lower: bool = True,
    remove_stopwords: str = None,
    min_word_length: int = 2,
    max_word_length: int = None,
    keep_symbols: bool = False,
    keep_numerals: bool = False,
    only_any_alphanumeric: bool = False,
    only_alphabetic: bool = False,
    min_tf: int = None,
):
    """Vectorize id-tagged frames into a DTM and dump it to ./data/bogger.

    The corpus config is loaded with `load_config`. When `corpus_source` is
    falsy, `config.pipeline_payload.source` is used as the pipeline source.

    `remove_stopwords` does double duty: its value is forwarded as the
    `language` of the token transform, and stopword removal is enabled only
    when it is not None.

    The resulting corpus is passed through `slice_by_tf(5)` before being
    dumped with tag 'bogger', and its shape is printed.
    """
    output_folder = './data/bogger'

    config: pipeline.CorpusConfig = load_config(config_filename, corpus_source)

    token_opts: pc.TokensTransformOpts = pc.TokensTransformOpts(
        to_lower=to_lower,
        to_upper=False,
        min_len=min_word_length,
        max_len=max_word_length,
        remove_accents=False,
        remove_stopwords=(remove_stopwords is not None),
        stopwords=None,
        extra_stopwords=None,
        language=remove_stopwords,
        keep_numerals=keep_numerals,
        keep_symbols=keep_symbols,
        only_alphabetic=only_alphabetic,
        only_any_alphanumeric=only_any_alphanumeric,
    )

    tagged_opts = pc.ExtractTaggedTokensOpts(
        lemmatize=lemmatize, pos_includes=pos_includes, pos_excludes=pos_excludes
    )
    # Keep whatever `set_numeric_names` returns, exactly as a chained call would.
    tagged_opts = tagged_opts.set_numeric_names()

    dtm_opts: pc.VectorizeOpts = pc.VectorizeOpts(already_tokenized=True, min_tf=min_tf, max_tokens=100000)

    dtm_pipeline = id_tagged_frame_to_DTM_pipeline(
        corpus_config=config,
        # An explicit source wins; otherwise fall back to the configured one.
        corpus_source=corpus_source or config.pipeline_payload.source,
        file_pattern='**/prot-*.feather',
        extract_opts=tagged_opts,
        transform_opts=token_opts,
        vectorize_opts=dtm_opts,
    )
    dtm: pc.VectorizedCorpus = dtm_pipeline.value().slice_by_tf(5)

    os.makedirs(output_folder, exist_ok=True)
    dtm.dump(tag='bogger', folder=output_folder, mode='files')

    print(f"Stored corpus of shape {dtm.data.shape}")
示例#2
0
def test_workflow_to_dtm_step_by_step(config: pipeline.CorpusConfig):
    """Run the tagged-frame part of the DTM workflow step by step.

    Builds a `ComputeOpts`, then (inside `inline_code`) assembles a pipeline
    that loads text, runs spaCy, converts to a PoS-tagged frame and
    checkpoints it, optionally adds a feather checkpoint, and finally
    exhausts the pipeline.
    """

    # uuid1() returns a UUID object; the tag is declared (and used elsewhere) as a string.
    corpus_tag: str = f'{uuid.uuid1()}'
    target_folder: str = "./tests/output"
    corpus_source: str = './tests/test_data/legal_instrument_five_docs_test.zip'
    tagged_corpus_source: str = f"./tests/output/{uuid.uuid1()}_pos_csv.zip"

    args: ComputeOpts = ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder=target_folder,
        corpus_type=pipeline.CorpusType.SpacyCSV,
        transform_opts=corpora.TokensTransformOpts(language='english', remove_stopwords=True, to_lower=True),
        text_reader_opts=corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1']),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_excludes='|PUNCT|EOL|SPACE|',
            **config.pipeline_payload.tagged_columns_names,
            filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
        ),
        create_subfolder=False,
        persist=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            min_tf=1,
            max_tokens=None,
        ),
        enable_checkpoint=True,
        force_checkpoint=True,
    )
    with inline_code(spacy_pipeline.to_tagged_frame_pipeline):

        # The fallback is never taken in this test because `tagged_corpus_source`
        # is always set above; it is kept as part of the inlined code.
        tagged_frame_filename: str = tagged_corpus_source or utility.path_add_suffix(
            config.pipeline_payload.source, '_pos_csv'
        )

        p: pipeline.CorpusPipeline = (
            pipeline.CorpusPipeline(config=config)
            .set_spacy_model(config.pipeline_payload.memory_store['spacy_model'])
            .load_text(
                reader_opts=config.text_reader_opts,
                transform_opts=None,
                source=corpus_source,
            )
            .text_to_spacy()
            .spacy_to_pos_tagged_frame()
            .checkpoint(filename=tagged_frame_filename, force_checkpoint=args.force_checkpoint)
        )

        if args.enable_checkpoint:
            p = p.checkpoint_feather(folder=config.get_feather_folder(corpus_source), force=args.force_checkpoint)

        p.exhaust()
示例#3
0
 def to_dtm(
     self: pipelines.CorpusPipeline,
     vectorize_opts: pc.VectorizeOpts = None,
     tagged_column: str = None,
 ) -> pipelines.CorpusPipeline:
     """Add a `ToDTM` task to this pipeline: (filename, TEXT => DTM).

     A falsy `vectorize_opts` is replaced by a default `pc.VectorizeOpts()`.
     Returns whatever `self.add` returns.
     """
     effective_opts: pc.VectorizeOpts = vectorize_opts or pc.VectorizeOpts()
     dtm_task = tasks.ToDTM(vectorize_opts=effective_opts, tagged_column=tagged_column)
     return self.add(dtm_task)
示例#4
0
def test_from_tokenized_corpus(document_index):
    """Check that `convert.from_tokenized_corpus` yields the expected 5x3 DTM."""
    # NOTE(review): the source handed to the converter is None here -- confirm
    # this is intentional and not a lost fixture.
    tokenized_source: pc.TokenizedCorpus = None
    opts: pc.VectorizeOpts = pc.VectorizeOpts(already_tokenized=True)

    dtm: pc.VectorizedCorpus = convert.from_tokenized_corpus(
        source=tokenized_source,
        document_index=document_index,
        vectorize_opts=opts,
    )

    assert dtm is not None
    assert dtm.shape == (5, 3)

    # Compare the integer-cast matrix, as plain nested lists, with the expected values.
    dense_rows = dtm.data.astype(int).todense().tolist()
    assert dense_rows == EXPECTED_DENSE_VALUES
示例#5
0
文件: train.py 项目: humlab/penelope
def train(
    train_corpus: TrainingCorpus,
    method: str,
    engine_args: Dict[str, Any],
    **kwargs,
) -> InferredModel:
    """Fit a textaCy topic model on the given training corpus.

    Parameters
    ----------
    train_corpus : TrainingCorpus
        Container providing `corpus`, `token2id`, `document_index` and `id2token`.
        Its `corpus` attribute is replaced by the translated corpus.
    method : str
        Underscore-separated method key; the second segment is handed to
        `textacy_api.TopicModel` as its first argument.
    engine_args : Dict[str, Any]
        Keyword arguments passed on to `textacy_api.TopicModel`.
    kwargs : Dict[str, Any], optional
        Forwarded to `pc.VectorizeOpts().update(...)` and recorded as
        `extra_options` in the result.

    Returns
    -------
    InferredModel
        topic_model         The fitted topic model
        id2token            Taken from `train_corpus.id2token`
        options             Dict with `method`, `perplexity_score` (None),
                            `coherence_score` (None), `engine_options` and
                            `extra_options`
    """
    source_corpus = train_corpus.corpus
    vocabulary = train_corpus.token2id.data
    doc_index = train_corpus.document_index

    vectorized: pc.VectorizedCorpus = convert.TranslateCorpus.translate(
        source_corpus,
        token2id=vocabulary,
        document_index=doc_index,
        vectorize_opts=pc.VectorizeOpts().update(**kwargs),
    )

    # The part after the first underscore selects the model type.
    algorithm = method.split('_')[1]
    topic_model = textacy_api.TopicModel(algorithm, **engine_args)
    topic_model.fit(vectorized.data)

    # Hand the translated corpus back to the caller via the container.
    train_corpus.corpus = vectorized

    run_options = {
        'method': method,
        'perplexity_score': None,
        'coherence_score': None,
        'engine_options': engine_args,
        'extra_options': kwargs,
    }
    return InferredModel(topic_model=topic_model, id2token=train_corpus.id2token, options=run_options)
示例#6
0
def noun_dtm_pipeline(min_tf: int = 1, max_tokens: int = None) -> pp.CorpusPipeline:
    """Return `noun_pipeline(id_to_token=False)` extended with a DTM step.

    The DTM step uses the 'token_id' tagged column, with vectorize options
    for already tokenized input, no lowercasing, and the given `min_tf` and
    `max_tokens`.
    """
    opts: pc.VectorizeOpts = pc.VectorizeOpts(
        already_tokenized=True,
        lowercase=False,
        min_tf=min_tf,
        max_tokens=max_tokens,
    )
    return noun_pipeline(id_to_token=False).to_dtm(vectorize_opts=opts, tagged_column='token_id')
示例#7
0
def test_from_stream_of_filename_tokens(document_index, token2id):
    """Check that a (filename, tokens) stream is converted to the expected 5x3 DTM."""
    token_stream: Iterable[Tuple[str, Iterable[str]]] = SIMPLE_CORPUS_ABC_5DOCS
    opts: pc.VectorizeOpts = pc.VectorizeOpts(already_tokenized=True)

    dtm: pc.VectorizedCorpus = convert.from_stream_of_filename_tokens(
        source=token_stream,
        token2id=token2id,
        document_index=document_index,
        vectorize_opts=opts,
    )

    assert dtm is not None
    assert dtm.shape == (5, 3)

    # Compare the integer-cast matrix, as plain nested lists, with the expected values.
    dense_rows = dtm.data.astype(int).todense().tolist()
    assert dense_rows == EXPECTED_DENSE_VALUES
示例#8
0
def test_workflow_to_dtm():
    """Compute a DTM for the riksprot corpus, then dump, reload and group it by year.

    The corpus config is read from the test data folder; the corpus source is
    an absolute path outside the repository.
    """
    config: pipeline.CorpusConfig = pipeline.CorpusConfig.load('./tests/test_data/riksprot-kb-parlaclarin.yml')

    tag = f'{uuid.uuid1()}'
    token_opts = corpora.TokensTransformOpts(to_lower=True, only_alphabetic=True)
    tagged_opts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='',
        pos_excludes='|MID|MAD|PAD|',
        **config.pipeline_payload.tagged_columns_names,
    )
    dtm_opts = corpora.VectorizeOpts(already_tokenized=True, lowercase=False, min_tf=1, max_tokens=None)

    opts: interface.ComputeOpts = interface.ComputeOpts(
        corpus_tag=tag,
        corpus_source='/data/westac/riksdagen_corpus_data/riksprot_parlaclarin_basic_protocol_stanza.csv.zip',
        corpus_type=pipeline.CorpusType.SparvCSV,
        target_folder='./data',
        transform_opts=token_opts,
        text_reader_opts=config.text_reader_opts,
        extract_opts=tagged_opts,
        vectorize_opts=dtm_opts,
        create_subfolder=True,
        persist=True,
        enable_checkpoint=True,
        force_checkpoint=True,
        tf_threshold=5,
        tf_threshold_mask=True,
    )

    dtm = workflow.compute(args=opts, corpus_config=config)

    # Start from a clean slate, then store the corpus under the same tag.
    dtm.remove(tag=opts.corpus_tag, folder=opts.target_folder)
    dtm.dump(tag=opts.corpus_tag, folder=opts.target_folder)

    assert corpora.VectorizedCorpus.dump_exists(tag=opts.corpus_tag, folder=opts.target_folder)

    reloaded = corpora.VectorizedCorpus.load(tag=opts.corpus_tag, folder=opts.target_folder)
    assert reloaded is not None

    by_year = dtm.group_by_year()
    assert by_year is not None

    # Best-effort cleanup: any error while removing the dump is ignored.
    with contextlib.suppress(Exception):
        dtm.remove(tag=opts.corpus_tag, folder=opts.target_folder)
示例#9
0
def test_spaCy_co_occurrence_pipeline3(config):
    """Run the co-occurrence workflow and check that its artifacts exist.

    After `workflow.compute` the tagged corpus file and the target folder
    must both exist; they are removed again at the end of the test.
    """
    source_path = './tests/test_data/legal_instrument_five_docs_test.zip'
    tagged_path = f'./tests/output/{uuid.uuid1()}_pos.csv.zip'

    tag = f'{uuid.uuid1()}'
    output_folder = f'./tests/output/{uuid.uuid1()}'

    token_opts = corpora.TokensTransformOpts(language='english', remove_stopwords=True, to_lower=True)
    reader_opts = corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1'])
    tagged_opts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|NOUN|PROPN|VERB|',
        pos_excludes='|PUNCT|EOL|SPACE|',
        **config.pipeline_payload.tagged_columns_names,
        filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
    )
    dtm_opts = corpora.VectorizeOpts(already_tokenized=True, lowercase=False, min_tf=1, max_tokens=None)
    window_opts = co_occurrence.ContextOpts(
        context_width=4,
        concept=set(),
        ignore_concept=False,
        partition_keys=['document_id'],
    )

    opts: ComputeOpts = ComputeOpts(
        corpus_tag=tag,
        corpus_source=source_path,
        target_folder=output_folder,
        corpus_type=pipeline.CorpusType.SpacyCSV,
        transform_opts=token_opts,
        text_reader_opts=reader_opts,
        extract_opts=tagged_opts,
        create_subfolder=False,
        persist=True,
        vectorize_opts=dtm_opts,
        enable_checkpoint=True,
        force_checkpoint=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        context_opts=window_opts,
    )

    workflow.compute(args=opts, corpus_config=config, tagged_corpus_source=tagged_path)

    assert os.path.isfile(tagged_path)
    assert os.path.isdir(opts.target_folder)

    # Remove what the workflow produced.
    shutil.rmtree(opts.target_folder, ignore_errors=True)
    os.remove(tagged_path)
示例#10
0
def test_workflow_to_dtm(config: pipeline.CorpusConfig):
    """Compute a DTM from the five-document test corpus, then dump, reload and group it by year."""
    token_opts = corpora.TokensTransformOpts(language='english', remove_stopwords=True, to_lower=True)
    reader_opts = corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1'])
    tagged_opts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|NOUN|PROPN|VERB|',
        pos_excludes='|PUNCT|EOL|SPACE|',
        **config.pipeline_payload.tagged_columns_names,
        filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
    )
    dtm_opts = corpora.VectorizeOpts(already_tokenized=True, lowercase=False, min_tf=1, max_tokens=None)

    opts: ComputeOpts = ComputeOpts(
        corpus_tag=f'{uuid.uuid1()}',
        corpus_source='./tests/test_data/legal_instrument_five_docs_test.zip',
        corpus_type=pipeline.CorpusType.Text,
        target_folder='./tests/output/',
        transform_opts=token_opts,
        text_reader_opts=reader_opts,
        extract_opts=tagged_opts,
        vectorize_opts=dtm_opts,
        create_subfolder=False,
        persist=True,
        enable_checkpoint=True,
        force_checkpoint=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        tagged_corpus_source='./tests/output/legal_instrument_five_docs_test_pos_csv.zip',
    )

    dtm = workflow.compute(args=opts, corpus_config=config)

    # Start from a clean slate, then store the corpus under the same tag.
    dtm.remove(tag=opts.corpus_tag, folder=opts.target_folder)
    dtm.dump(tag=opts.corpus_tag, folder=opts.target_folder)

    assert corpora.VectorizedCorpus.dump_exists(tag=opts.corpus_tag, folder=opts.target_folder)

    reloaded = corpora.VectorizedCorpus.load(tag=opts.corpus_tag, folder=opts.target_folder)
    assert reloaded is not None

    by_year = dtm.group_by_year()
    assert by_year is not None

    # Best-effort cleanup: any error while removing the dump is ignored.
    with contextlib.suppress(Exception):
        dtm.remove(tag=opts.corpus_tag, folder=opts.target_folder)
示例#11
0
def run_workflow():
    """Run `workflow.compute` for the 'APA' corpus tag with a fixed set of options.

    The corpus config is loaded from CONFIG_FILENAME, pointed at
    CORPUS_FILENAME, and set to use 4 deserialize processes. The result of
    the computation is discarded.
    """
    corpus_config = pipeline.CorpusConfig.load(CONFIG_FILENAME).folders(DATA_FOLDER)
    corpus_config.pipeline_payload.files(source=CORPUS_FILENAME, document_index_source=None)
    corpus_config.checkpoint_opts.deserialize_processes = 4

    output_folder = jj(OUTPUT_FOLDER, 'APA')

    token_opts = corpora.TokensTransformOpts(
        to_lower=True,
        to_upper=False,
        min_len=1,
        max_len=None,
        remove_accents=False,
        remove_stopwords=False,
        stopwords=None,
        extra_stopwords=None,
        language='swedish',
        keep_numerals=True,
        keep_symbols=True,
        only_alphabetic=False,
        only_any_alphanumeric=False,
    )

    # Patterns for the `year`, `year2` and `number` filename fields.
    filename_fields = [
        'year:prot\\_(\\d{4}).*',
        'year2:prot_\\d{4}(\\d{2})__*',
        'number:prot_\\d+[afk_]{0,4}__(\\d+).*',
    ]
    reader_opts = corpora.TextReaderOpts(
        filename_pattern='*.csv',
        filename_filter=None,
        filename_fields=filename_fields,
        index_field=None,
        as_binary=False,
        sep='\t',
        quoting=3,
    )

    tagged_opts = corpora.ExtractTaggedTokensOpts(
        pos_includes='NN|PM',
        pos_excludes='MAD|MID|PAD',
        pos_paddings='AB|DT|HA|HD|HP|HS|IE|IN|JJ|KN|PC|PL|PN|PP|PS|RG|RO|SN|UO|VB',
        lemmatize=True,
        append_pos=False,
        global_tf_threshold=1,
        global_tf_threshold_mask=False,
        **corpus_config.pipeline_payload.tagged_columns_names,
    )

    dtm_opts = corpora.VectorizeOpts(
        already_tokenized=True,
        lowercase=False,
        stop_words=None,
        max_df=1.0,
        min_df=1,
        min_tf=1,
    )

    window_opts = ContextOpts(
        context_width=2,
        concept={'kammare'},
        ignore_concept=False,
        partition_keys=['document_name'],
        processes=4,
        chunksize=10,
    )

    compute_opts = ComputeOpts(
        corpus_type=pipeline.CorpusType.SparvCSV,
        corpus_source=CORPUS_FILENAME,
        target_folder=output_folder,
        corpus_tag='APA',
        transform_opts=token_opts,
        text_reader_opts=reader_opts,
        extract_opts=tagged_opts,
        vectorize_opts=dtm_opts,
        tf_threshold=1,
        tf_threshold_mask=False,
        create_subfolder=True,
        persist=True,
        context_opts=window_opts,
        enable_checkpoint=False,
        force_checkpoint=False,
    )

    workflow.compute(
        args=compute_opts,
        corpus_config=corpus_config,
        tagged_corpus_source=jj(OUTPUT_FOLDER, 'test.zip'),
    )
示例#12
0
def main(
    config_filename: Optional[str] = None,
    corpus_source: Optional[str] = None,
    filename_pattern: str = None,
    train_corpus_folder: Optional[str] = None,
    trained_model_folder: Optional[str] = None,
    target_mode: Literal['train', 'predict', 'both'] = 'both',
    target_folder: Optional[str] = None,
    target_name: Optional[str] = None,
    lemmatize: bool = True,
    pos_includes: str = '',
    pos_excludes: str = '',
    to_lower: bool = True,
    max_tokens: int = None,
    tf_threshold: int = None,
    # remove_stopwords: Optional[str] = None,
    # min_word_length: int = 2,
    # max_word_length: int = None,
    # keep_symbols: bool = False,
    # keep_numerals: bool = False,
    alpha: str = 'asymmetric',
    chunk_size: int = 2000,
    engine: str = "gensim_lda-multicore",
    max_iter: int = None,
    num_top_words: int = None,
    minimum_probability: float = None,
    n_topics: int = 50,
    passes: int = None,
    per_word_topics: bool = False,
    random_seed: int = None,
    update_every: int = 1,
    workers: int = None,
    store_corpus: bool = True,
    store_compressed: bool = True,
):
    """Validate the arguments and run the topic modelling workflow.

    Exits with status 1, after echoing an error message, when:
      * the config file is not given or does not exist,
      * `target_name` or `target_folder` is not given,
      * `target_mode` is 'predict' and `InferredModel.exists` reports no
        model in `trained_model_folder`,
      * neither `corpus_source` nor the config supplies a corpus source,
      * the config has no `topic_modeling_pipeline` pipeline key.

    Otherwise the extract, vectorize and engine options are assembled and
    passed to `workflow.compute`; completion is logged.

    Note: `to_lower` is currently overridden to False, so the vectorizer's
    `lowercase` option is always False regardless of the argument.
    Engine arguments whose value is None are dropped via `remove_none`.
    """
    to_lower = False  # for now...

    # `sys.exit` raises SystemExit by itself, so no `raise` is needed.
    if not config_filename or not os.path.isfile(config_filename):
        click.echo("error: config file not specified/found")
        sys.exit(1)

    if target_name is None:
        click.echo("error: target_name not specified")
        sys.exit(1)

    # The engine work folder is built from target_folder, so it must be set;
    # os.path.join would otherwise fail with a TypeError further down.
    if target_folder is None:
        click.echo("error: target_folder not specified")
        sys.exit(1)

    if target_mode == 'predict' and not InferredModel.exists(trained_model_folder):
        click.echo("error: trained model folder not specified")
        sys.exit(1)

    config: pipeline.CorpusConfig = load_config(config_filename, corpus_source)

    if corpus_source is None and config.pipeline_payload.source is None:
        click.echo("usage: corpus source must be specified")
        sys.exit(1)

    if not config.pipeline_key_exists("topic_modeling_pipeline"):
        click.echo("config error: `topic_modeling_pipeline` not specified")
        sys.exit(1)

    # transform_opts: pc.TokensTransformOpts = None

    extract_opts: pc.ExtractTaggedTokensOpts = pc.ExtractTaggedTokensOpts(
        lemmatize=lemmatize,
        pos_includes=pos_includes,
        pos_excludes=pos_excludes,
        pos_column='pos_id',
        lemma_column='lemma_id',
        text_column='token_id',
    )

    vectorize_opts: pc.VectorizeOpts = pc.VectorizeOpts(
        already_tokenized=True,
        lowercase=to_lower,
        max_tokens=max_tokens,
        min_tf=tf_threshold,
    )

    engine_args = remove_none(
        dict(
            alpha=alpha,
            chunk_size=chunk_size,
            max_iter=max_iter,
            num_top_words=num_top_words,
            minimum_probability=minimum_probability,
            n_topics=n_topics,
            passes=passes,
            per_word_topics=per_word_topics,
            random_seed=random_seed,
            update_every=update_every,
            work_folder=os.path.join(target_folder, target_name),
            workers=workers,
        )
    )

    value: dict = workflow.compute(
        corpus_config=config,
        corpus_source=corpus_source,
        filename_pattern=filename_pattern,
        train_corpus_folder=train_corpus_folder,
        trained_model_folder=trained_model_folder,
        target_mode=target_mode,
        target_folder=target_folder,
        target_name=target_name,
        extract_opts=extract_opts,
        vectorize_opts=vectorize_opts,
        engine=engine,
        engine_args=engine_args,
        store_corpus=store_corpus,
        store_compressed=store_compressed,
        # transform_opts=transform_opts,
    )

    logger.info(
        f"workflow completed: model {value.get('target_name')} stored in {value.get('target_folder')}"
    )
示例#13
0
def predict_topics(
    topic_model: Any,
    *,
    corpus: gensim_corpora.Sparse2Corpus | pc.VectorizedCorpus,
    id2token: Mapping[int, str] | dict | pc.Token2Id,
    document_index: pc.DocumentIndex = None,
    n_tokens: int = 200,
    minimum_probability: float = 0.001,
    **kwargs,
) -> InferredTopicsData:
    """Predict topics for `corpus` and bundle the outcome as InferredTopicsData.

    Args:
        topic_model (Any): Model used to select the engine via `get_engine_by_model_type`.
        corpus (Sparse2Corpus | VectorizedCorpus): Corpus to predict; translated to a VectorizedCorpus first.
        id2token: id-to-token mapping, converted with `pc.id2token2token2id`.
        document_index (DocumentIndex): Document index; token counts are updated from the translated corpus.
        n_tokens (int, optional): Passed to the engine's topic-token weight and overview calls. Defaults to 200.
        minimum_probability (float, optional): Passed to `engine.predict`. Defaults to 0.001.
    Kwargs:
        topic_token_weights (pd.DataFrame, optional): Used instead of the engine's weights when not None.
        topic_token_overview (pd.DataFrame, optional): Used instead of the engine's overview when not None.
        topic_diagnostics, topic_token_diagnostics (optional): Used instead of the engine's attributes when present.
        All kwargs are also forwarded to `pc.VectorizeOpts().update` and to `engine.predict`.
    """
    token2id = pc.id2token2token2id(id2token)
    vectorized_corpus: pc.VectorizedCorpus = dtm.TranslateCorpus.translate(
        corpus,
        token2id=token2id,
        document_index=document_index,
        vectorize_opts=pc.VectorizeOpts().update(**kwargs),
    )

    engine: ITopicModelEngine = get_engine_by_model_type(topic_model)

    doc_topic_weights: DocumentTopicsWeightsIter = engine.predict(
        vectorized_corpus, minimum_probability=minimum_probability, **kwargs
    )

    # Caller-supplied frames win; only ask the engine when none (or None) was given.
    token_weights: pd.DataFrame = kwargs.get('topic_token_weights')
    if token_weights is None:
        token_weights = engine.get_topic_token_weights(vocabulary=vectorized_corpus.id2token, n_tokens=n_tokens)

    token_overview: pd.DataFrame = kwargs.get('topic_token_overview')
    if token_overview is None:
        token_overview = engine.get_topic_token_overview(token_weights, n_tokens=n_tokens)

    # Unlike the two frames above, an explicit None passed for these keys is kept as-is.
    diagnostics: pd.DataFrame = kwargs.get('topic_diagnostics', engine.topic_diagnostics)
    token_diagnostics: pd.DataFrame = kwargs.get('topic_token_diagnostics', engine.topic_token_diagnostics)

    updated_index: pd.DataFrame = pc.update_document_index_token_counts_by_corpus(document_index, vectorized_corpus)

    dictionary = pc.Token2Id.id2token_to_dataframe(vectorized_corpus.id2token)
    doc_topic_frame = to_dataframe(updated_index, doc_topic_weights)

    return InferredTopicsData(
        dictionary=dictionary,
        topic_token_weights=token_weights,
        topic_token_overview=token_overview,
        document_index=updated_index,
        document_topic_weights=doc_topic_frame,
        topic_diagnostics=diagnostics,
        token_diagnostics=token_diagnostics,
    )