def debug_main(
    config_filename: str = None,
    corpus_source: str = None,
    lemmatize: bool = True,
    pos_includes: str = '',
    pos_excludes: str = '',
    to_lower: bool = True,
    remove_stopwords: str = None,
    min_word_length: int = 2,
    max_word_length: int = None,
    keep_symbols: bool = False,
    keep_numerals: bool = False,
    only_any_alphanumeric: bool = False,
    only_alphabetic: bool = False,
    min_tf: int = None,
):
    """Run the id-tagged-frame-to-DTM pipeline and dump the result (debug helper).

    Loads the corpus config, assembles token-transform, extract and vectorize
    options from the arguments, runs `id_tagged_frame_to_DTM_pipeline` over
    files matching '**/prot-*.feather', slices the resulting corpus with
    `slice_by_tf(5)` and dumps it with tag 'bogger' under './data/bogger'.

    `remove_stopwords` serves two purposes: stopword removal is enabled when
    it is not None, and its value is passed on as the `language` option.
    `corpus_source` falls back to the source given in the config when falsy.
    """
    dump_folder = './data/bogger'

    config = load_config(config_filename, corpus_source)

    stopwords_enabled = remove_stopwords is not None
    token_opts = pc.TokensTransformOpts(
        to_lower=to_lower,
        to_upper=False,
        min_len=min_word_length,
        max_len=max_word_length,
        remove_accents=False,
        remove_stopwords=stopwords_enabled,
        stopwords=None,
        extra_stopwords=None,
        language=remove_stopwords,
        keep_numerals=keep_numerals,
        keep_symbols=keep_symbols,
        only_alphabetic=only_alphabetic,
        only_any_alphanumeric=only_any_alphanumeric,
    )

    tagged_opts = pc.ExtractTaggedTokensOpts(
        lemmatize=lemmatize,
        pos_includes=pos_includes,
        pos_excludes=pos_excludes,
    ).set_numeric_names()

    dtm_opts = pc.VectorizeOpts(already_tokenized=True, min_tf=min_tf, max_tokens=100000)

    # An explicitly passed source wins over the one stored in the config.
    source = corpus_source or config.pipeline_payload.source

    dtm_pipeline = id_tagged_frame_to_DTM_pipeline(
        corpus_config=config,
        corpus_source=source,
        file_pattern='**/prot-*.feather',
        extract_opts=tagged_opts,
        transform_opts=token_opts,
        vectorize_opts=dtm_opts,
    )
    corpus = dtm_pipeline.value().slice_by_tf(5)

    os.makedirs(dump_folder, exist_ok=True)
    corpus.dump(tag='bogger', folder=dump_folder, mode='files')

    print(f"Stored corpus of shape {corpus.data.shape}")
def test_workflow_to_dtm_step_by_step(config: pipeline.CorpusConfig):
    """Run the spaCy tagging part of the DTM workflow step by step.

    Builds the compute options, then assembles and exhausts a pipeline that
    loads text, tags it with spaCy, converts it to a PoS-tagged frame and
    checkpoints the result (plus a feather checkpoint when enabled).
    """
    # uuid.uuid1() returns a UUID object; stringify it so that `corpus_tag`
    # really is the `str` it is annotated as (sibling tests pass a string).
    corpus_tag: str = str(uuid.uuid1())
    target_folder: str = "./tests/output"
    corpus_source: str = './tests/test_data/legal_instrument_five_docs_test.zip'
    tagged_corpus_source: str = f"./tests/output/{uuid.uuid1()}_pos_csv.zip"

    args: ComputeOpts = ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder=target_folder,
        corpus_type=pipeline.CorpusType.SpacyCSV,
        # pos_scheme: PoS_Tag_Scheme = PoS_Tag_Schemes.Universal
        transform_opts=corpora.TokensTransformOpts(language='english', remove_stopwords=True, to_lower=True),
        text_reader_opts=corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1']),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_excludes='|PUNCT|EOL|SPACE|',
            **config.pipeline_payload.tagged_columns_names,
            filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
        ),
        create_subfolder=False,
        persist=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            min_tf=1,
            max_tokens=None,
        ),
        enable_checkpoint=True,
        force_checkpoint=True,
    )

    with inline_code(spacy_pipeline.to_tagged_frame_pipeline):

        # `tagged_corpus_source` is always set above; the fallback derives the
        # name from the config's source with a '_pos_csv' suffix.
        tagged_frame_filename: str = tagged_corpus_source or utility.path_add_suffix(
            config.pipeline_payload.source, '_pos_csv'
        )

        p: pipeline.CorpusPipeline = (
            pipeline.CorpusPipeline(config=config)
            .set_spacy_model(config.pipeline_payload.memory_store['spacy_model'])
            .load_text(
                reader_opts=config.text_reader_opts,
                transform_opts=None,
                source=corpus_source,
            )
            .text_to_spacy()
            .spacy_to_pos_tagged_frame()
            .checkpoint(filename=tagged_frame_filename, force_checkpoint=args.force_checkpoint)
        )

        if args.enable_checkpoint:
            p = p.checkpoint_feather(folder=config.get_feather_folder(corpus_source), force=args.force_checkpoint)

        p.exhaust()
def to_dtm(
    self: pipelines.CorpusPipeline,
    vectorize_opts: pc.VectorizeOpts = None,
    tagged_column: str = None,
) -> pipelines.CorpusPipeline:
    """Add a `ToDTM` task to the pipeline: (filename, TEXT => DTM).

    A default-constructed `VectorizeOpts` is used when `vectorize_opts` is
    None (or otherwise falsy). Returns whatever `self.add` returns.
    """
    effective_opts = vectorize_opts if vectorize_opts else pc.VectorizeOpts()
    dtm_task = tasks.ToDTM(vectorize_opts=effective_opts, tagged_column=tagged_column)
    return self.add(dtm_task)
def test_from_tokenized_corpus(document_index):
    """Check that `convert.from_tokenized_corpus` yields the expected 5x3 DTM."""
    # NOTE(review): the source passed to the converter is None here -- confirm
    # that this is intentional and not a placeholder for a real corpus.
    tokenized_source: pc.TokenizedCorpus = None
    opts = pc.VectorizeOpts(already_tokenized=True)

    dtm = convert.from_tokenized_corpus(
        source=tokenized_source,
        document_index=document_index,
        vectorize_opts=opts,
    )

    assert dtm is not None
    assert dtm.shape == (5, 3)

    dense_rows = dtm.data.astype(int).todense().tolist()
    assert dense_rows == EXPECTED_DENSE_VALUES
def train(
    train_corpus: TrainingCorpus,
    method: str,
    engine_args: Dict[str, Any],
    **kwargs,
) -> InferredModel:
    """Fit a textaCy topic model on the training corpus.

    Parameters
    ----------
    train_corpus : TrainingCorpus
        Container for the training data; its `corpus`, `token2id.data`,
        `document_index` and `id2token` attributes are read, and `corpus`
        is replaced by the translated vectorized corpus after fitting.
    method : str
        Method key; the part after the first underscore is passed to
        `textacy_api.TopicModel` as the model name.
    engine_args : Dict[str, Any]
        Keyword arguments forwarded to `textacy_api.TopicModel`.
    kwargs : Dict[str, Any], optional
        Forwarded to `pc.VectorizeOpts().update(...)` and stored in the
        result as `extra_options`.

    Returns
    -------
    InferredModel
        topic_model         The fitted textaCy topic model
        id2token            Taken from `train_corpus.id2token`
        options             Dict with `method`, `engine_options`,
                            `extra_options`; `perplexity_score` and
                            `coherence_score` are always None here
    """
    translate_args = dict(
        token2id=train_corpus.token2id.data,
        document_index=train_corpus.document_index,
        vectorize_opts=pc.VectorizeOpts().update(**kwargs),
    )
    dtm_corpus: pc.VectorizedCorpus = convert.TranslateCorpus.translate(train_corpus.corpus, **translate_args)

    # The model name is the second '_'-separated segment of `method`.
    model_name = method.split('_')[1]
    topic_model = textacy_api.TopicModel(model_name, **engine_args)
    topic_model.fit(dtm_corpus.data)

    # Only publish the translated corpus once fitting has succeeded.
    train_corpus.corpus = dtm_corpus

    run_options = dict(
        method=method,
        perplexity_score=None,
        coherence_score=None,
        engine_options=engine_args,
        extra_options=kwargs,
    )
    return InferredModel(
        topic_model=topic_model,
        id2token=train_corpus.id2token,
        options=run_options,
    )
def noun_dtm_pipeline(min_tf: int = 1, max_tokens: int = None) -> pp.CorpusPipeline:
    """Extend `noun_pipeline(id_to_token=False)` with a DTM step on the 'token_id' column.

    `min_tf` and `max_tokens` are passed to the vectorizer options; input is
    declared as already tokenized and lowercasing is switched off.
    """
    dtm_opts = pc.VectorizeOpts(
        already_tokenized=True,
        lowercase=False,
        min_tf=min_tf,
        max_tokens=max_tokens,
    )
    nouns = noun_pipeline(id_to_token=False)
    return nouns.to_dtm(vectorize_opts=dtm_opts, tagged_column='token_id')
def test_from_stream_of_filename_tokens(document_index, token2id):
    """Check that `convert.from_stream_of_filename_tokens` yields the expected 5x3 DTM."""
    token_stream: Iterable[Tuple[str, Iterable[str]]] = SIMPLE_CORPUS_ABC_5DOCS
    opts = pc.VectorizeOpts(already_tokenized=True)

    dtm = convert.from_stream_of_filename_tokens(
        source=token_stream,
        token2id=token2id,
        document_index=document_index,
        vectorize_opts=opts,
    )

    assert dtm is not None
    assert dtm.shape == (5, 3)

    dense_rows = dtm.data.astype(int).todense().tolist()
    assert dense_rows == EXPECTED_DENSE_VALUES
def test_workflow_to_dtm():
    """Compute a DTM with the riksprot config, then dump, reload and group it by year.

    The dumped files are removed again at the end on a best-effort basis.
    """
    config: pipeline.CorpusConfig = pipeline.CorpusConfig.load('./tests/test_data/riksprot-kb-parlaclarin.yml')

    token_opts = corpora.TokensTransformOpts(to_lower=True, only_alphabetic=True)
    tagged_opts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='',
        pos_excludes='|MID|MAD|PAD|',
        **config.pipeline_payload.tagged_columns_names,
    )
    dtm_opts = corpora.VectorizeOpts(
        already_tokenized=True,
        lowercase=False,
        min_tf=1,
        max_tokens=None,
    )

    # NOTE(review): `corpus_source` is an absolute path under /data/westac --
    # presumably only present on specific machines; confirm before relying on it.
    args: interface.ComputeOpts = interface.ComputeOpts(
        corpus_tag=f'{uuid.uuid1()}',
        corpus_source='/data/westac/riksdagen_corpus_data/riksprot_parlaclarin_basic_protocol_stanza.csv.zip',
        corpus_type=pipeline.CorpusType.SparvCSV,
        target_folder='./data',
        transform_opts=token_opts,
        # The reader options come from the config rather than being spelled out here.
        text_reader_opts=config.text_reader_opts,
        extract_opts=tagged_opts,
        vectorize_opts=dtm_opts,
        create_subfolder=True,
        persist=True,
        enable_checkpoint=True,
        force_checkpoint=True,
        tf_threshold=5,
        tf_threshold_mask=True,
    )
    dump_location = dict(tag=args.corpus_tag, folder=args.target_folder)

    corpus = workflow.compute(args=args, corpus_config=config)

    # Start from a clean slate, then persist the freshly computed corpus.
    corpus.remove(**dump_location)
    corpus.dump(**dump_location)

    assert corpora.VectorizedCorpus.dump_exists(**dump_location)

    reloaded = corpora.VectorizedCorpus.load(**dump_location)
    assert reloaded is not None

    yearly = corpus.group_by_year()
    assert yearly is not None

    # Best-effort cleanup: a failure to remove the dump must not fail the test.
    with contextlib.suppress(Exception):
        corpus.remove(**dump_location)
def test_spaCy_co_occurrence_pipeline3(config):
    """Run the co-occurrence workflow and check that its output artifacts exist.

    Asserts that the tagged corpus file and the target folder were created,
    then deletes both.
    """
    corpus_source = './tests/test_data/legal_instrument_five_docs_test.zip'
    tagged_corpus_source = f'./tests/output/{uuid.uuid1()}_pos.csv.zip'

    token_opts = corpora.TokensTransformOpts(language='english', remove_stopwords=True, to_lower=True)
    reader_opts = corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1'])
    tagged_opts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|NOUN|PROPN|VERB|',
        pos_excludes='|PUNCT|EOL|SPACE|',
        **config.pipeline_payload.tagged_columns_names,
        filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
    )
    dtm_opts = corpora.VectorizeOpts(
        already_tokenized=True,
        lowercase=False,
        min_tf=1,
        max_tokens=None,
    )
    window_opts = co_occurrence.ContextOpts(
        context_width=4,
        concept=set(),
        ignore_concept=False,
        partition_keys=['document_id'],
    )

    args: ComputeOpts = ComputeOpts(
        corpus_tag=f'{uuid.uuid1()}',
        corpus_source=corpus_source,
        target_folder=f'./tests/output/{uuid.uuid1()}',
        corpus_type=pipeline.CorpusType.SpacyCSV,
        # pos_scheme: utility.PoS_Tag_Scheme = utility.PoS_Tag_Schemes.Universal
        transform_opts=token_opts,
        text_reader_opts=reader_opts,
        extract_opts=tagged_opts,
        create_subfolder=False,
        persist=True,
        vectorize_opts=dtm_opts,
        enable_checkpoint=True,
        force_checkpoint=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        context_opts=window_opts,
    )

    workflow.compute(
        args=args,
        corpus_config=config,
        tagged_corpus_source=tagged_corpus_source,
    )

    assert os.path.isfile(tagged_corpus_source)
    assert os.path.isdir(args.target_folder)

    # Tidy up the artifacts this run produced.
    shutil.rmtree(args.target_folder, ignore_errors=True)
    os.remove(tagged_corpus_source)
def test_workflow_to_dtm(config: pipeline.CorpusConfig):
    """Compute a DTM from the five-document test corpus, then dump, reload and group it by year.

    The dumped files are removed again at the end on a best-effort basis.
    """
    token_opts = corpora.TokensTransformOpts(language='english', remove_stopwords=True, to_lower=True)
    reader_opts = corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1'])
    tagged_opts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|NOUN|PROPN|VERB|',
        pos_excludes='|PUNCT|EOL|SPACE|',
        **config.pipeline_payload.tagged_columns_names,
        filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
    )
    dtm_opts = corpora.VectorizeOpts(
        already_tokenized=True,
        lowercase=False,
        min_tf=1,
        max_tokens=None,
    )

    args: ComputeOpts = ComputeOpts(
        corpus_tag=f'{uuid.uuid1()}',
        corpus_source='./tests/test_data/legal_instrument_five_docs_test.zip',
        corpus_type=pipeline.CorpusType.Text,
        target_folder='./tests/output/',
        transform_opts=token_opts,
        text_reader_opts=reader_opts,
        extract_opts=tagged_opts,
        vectorize_opts=dtm_opts,
        create_subfolder=False,
        persist=True,
        enable_checkpoint=True,
        force_checkpoint=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        tagged_corpus_source='./tests/output/legal_instrument_five_docs_test_pos_csv.zip',
    )
    dump_location = dict(tag=args.corpus_tag, folder=args.target_folder)

    corpus = workflow.compute(args=args, corpus_config=config)

    # Start from a clean slate, then persist the freshly computed corpus.
    corpus.remove(**dump_location)
    corpus.dump(**dump_location)

    assert corpora.VectorizedCorpus.dump_exists(**dump_location)

    reloaded = corpora.VectorizedCorpus.load(**dump_location)
    assert reloaded is not None

    yearly = corpus.group_by_year()
    assert yearly is not None

    # Best-effort cleanup: a failure to remove the dump must not fail the test.
    with contextlib.suppress(Exception):
        corpus.remove(**dump_location)
def run_workflow():
    """Run the co-occurrence workflow for the 'APA' corpus tag with fixed options.

    Loads the corpus config from `CONFIG_FILENAME`, points it at
    `CORPUS_FILENAME`, builds the compute options and calls
    `workflow.compute`, writing the tagged corpus to 'test.zip' under
    `OUTPUT_FOLDER`. The workflow's return value is discarded.
    """
    corpus_config = pipeline.CorpusConfig.load(CONFIG_FILENAME).folders(DATA_FOLDER)
    corpus_config.pipeline_payload.files(source=CORPUS_FILENAME, document_index_source=None)
    corpus_config.checkpoint_opts.deserialize_processes = 4

    token_opts = corpora.TokensTransformOpts(
        to_lower=True,
        to_upper=False,
        min_len=1,
        max_len=None,
        remove_accents=False,
        remove_stopwords=False,
        stopwords=None,
        extra_stopwords=None,
        language='swedish',
        keep_numerals=True,
        keep_symbols=True,
        only_alphabetic=False,
        only_any_alphanumeric=False,
    )

    # Patterns that pull `year`, `year2` and `number` fields out of the filenames.
    filename_field_specs = [
        'year:prot\\_(\\d{4}).*',
        'year2:prot_\\d{4}(\\d{2})__*',
        'number:prot_\\d+[afk_]{0,4}__(\\d+).*',
    ]
    reader_opts = corpora.TextReaderOpts(
        filename_pattern='*.csv',
        filename_filter=None,
        filename_fields=filename_field_specs,
        index_field=None,
        as_binary=False,
        sep='\t',
        quoting=3,
    )

    tagged_opts = corpora.ExtractTaggedTokensOpts(
        pos_includes='NN|PM',
        pos_excludes='MAD|MID|PAD',
        pos_paddings='AB|DT|HA|HD|HP|HS|IE|IN|JJ|KN|PC|PL|PN|PP|PS|RG|RO|SN|UO|VB',
        lemmatize=True,
        append_pos=False,
        global_tf_threshold=1,
        global_tf_threshold_mask=False,
        **corpus_config.pipeline_payload.tagged_columns_names,
    )

    dtm_opts = corpora.VectorizeOpts(
        already_tokenized=True,
        lowercase=False,
        stop_words=None,
        max_df=1.0,
        min_df=1,
        min_tf=1,
    )

    window_opts = ContextOpts(
        context_width=2,
        concept={'kammare'},
        ignore_concept=False,
        partition_keys=['document_name'],
        processes=4,
        chunksize=10,
    )

    compute_opts = ComputeOpts(
        corpus_type=pipeline.CorpusType.SparvCSV,
        corpus_source=CORPUS_FILENAME,
        target_folder=jj(OUTPUT_FOLDER, 'APA'),
        corpus_tag='APA',
        transform_opts=token_opts,
        text_reader_opts=reader_opts,
        extract_opts=tagged_opts,
        vectorize_opts=dtm_opts,
        tf_threshold=1,
        tf_threshold_mask=False,
        create_subfolder=True,
        persist=True,
        context_opts=window_opts,
        enable_checkpoint=False,
        force_checkpoint=False,
    )

    workflow.compute(
        args=compute_opts,
        corpus_config=corpus_config,
        tagged_corpus_source=jj(OUTPUT_FOLDER, 'test.zip'),
    )
def main(
    config_filename: Optional[str] = None,
    corpus_source: Optional[str] = None,
    filename_pattern: str = None,
    train_corpus_folder: Optional[str] = None,
    trained_model_folder: Optional[str] = None,
    target_mode: Literal['train', 'predict', 'both'] = 'both',
    target_folder: Optional[str] = None,
    target_name: Optional[str] = None,
    lemmatize: bool = True,
    pos_includes: str = '',
    pos_excludes: str = '',
    to_lower: bool = True,
    max_tokens: int = None,
    tf_threshold: int = None,
    # Token-transform options are currently disabled:
    # remove_stopwords: Optional[str] = None,
    # min_word_length: int = 2,
    # max_word_length: int = None,
    # keep_symbols: bool = False,
    # keep_numerals: bool = False,
    alpha: str = 'asymmetric',
    chunk_size: int = 2000,
    engine: str = "gensim_lda-multicore",
    max_iter: int = None,
    num_top_words: int = None,
    minimum_probability: float = None,
    n_topics: int = 50,
    passes: int = None,
    per_word_topics: bool = False,
    random_seed: int = None,
    update_every: int = 1,
    workers: int = None,
    store_corpus: bool = True,
    store_compressed: bool = True,
):
    """Validate the arguments and run the topic-modeling workflow.

    Prints an error with `click.echo` and exits with status 1 when:
      * the config file is missing or not specified,
      * `target_name` or `target_folder` is not specified,
      * `target_mode` is 'predict' and no trained model exists in
        `trained_model_folder`,
      * neither `corpus_source` nor the config provides a corpus source,
      * the config has no `topic_modeling_pipeline` key.

    Otherwise it builds the extract, vectorize and engine options and calls
    `workflow.compute`, then logs where the model was stored.

    Notes:
        `to_lower` is currently ignored (forced to False).
        Engine arguments that are None are dropped via `remove_none`; the
        engine's `work_folder` is `target_folder/target_name`.

    Raises:
        SystemExit: with status 1 on any of the validation failures above.
    """
    to_lower = False  # for now...

    if not config_filename or not os.path.isfile(config_filename):
        click.echo("error: config file not specified/found")
        sys.exit(1)

    if target_name is None:
        click.echo("error: target_name not specified")
        sys.exit(1)

    # `target_folder` is joined with `target_name` below; os.path.join would
    # raise a TypeError on None, so fail early with a readable message instead.
    if target_folder is None:
        click.echo("error: target_folder not specified")
        sys.exit(1)

    if target_mode == 'predict' and not InferredModel.exists(trained_model_folder):
        click.echo("error: trained model folder not specified")
        sys.exit(1)

    config: pipeline.CorpusConfig = load_config(config_filename, corpus_source)

    if corpus_source is None and config.pipeline_payload.source is None:
        click.echo("usage: corpus source must be specified")
        sys.exit(1)

    if not config.pipeline_key_exists("topic_modeling_pipeline"):
        click.echo("config error: `topic_modeling_pipeline` not specified")
        sys.exit(1)

    # transform_opts: pc.TokensTransformOpts = None

    extract_opts: pc.ExtractTaggedTokensOpts = pc.ExtractTaggedTokensOpts(
        lemmatize=lemmatize,
        pos_includes=pos_includes,
        pos_excludes=pos_excludes,
        pos_column='pos_id',
        lemma_column='lemma_id',
        text_column='token_id',
    )

    vectorize_opts: pc.VectorizeOpts = pc.VectorizeOpts(
        already_tokenized=True,
        lowercase=to_lower,
        max_tokens=max_tokens,
        min_tf=tf_threshold,
    )

    # Unset (None) engine options are dropped by `remove_none`.
    engine_args = remove_none(
        dict(
            alpha=alpha,
            chunk_size=chunk_size,
            max_iter=max_iter,
            num_top_words=num_top_words,
            minimum_probability=minimum_probability,
            n_topics=n_topics,
            passes=passes,
            per_word_topics=per_word_topics,
            random_seed=random_seed,
            update_every=update_every,
            work_folder=os.path.join(target_folder, target_name),
            workers=workers,
        )
    )

    value: dict = workflow.compute(
        corpus_config=config,
        corpus_source=corpus_source,
        filename_pattern=filename_pattern,
        train_corpus_folder=train_corpus_folder,
        trained_model_folder=trained_model_folder,
        target_mode=target_mode,
        target_folder=target_folder,
        target_name=target_name,
        extract_opts=extract_opts,
        vectorize_opts=vectorize_opts,
        engine=engine,
        engine_args=engine_args,
        store_corpus=store_corpus,
        store_compressed=store_compressed,
        # transform_opts=transform_opts,
    )

    logger.info(f"workflow completed: model {value.get('target_name')} stored in {value.get('target_folder')}")
def predict_topics(
    topic_model: Any,
    *,
    corpus: gensim_corpora.Sparse2Corpus | pc.VectorizedCorpus,
    id2token: Mapping[int, str] | dict | pc.Token2Id,
    document_index: pc.DocumentIndex = None,
    n_tokens: int = 200,
    minimum_probability: float = 0.001,
    **kwargs,
) -> InferredTopicsData:
    """Predict topics for `corpus` and bundle the result as InferredTopicsData.

    The corpus is first translated to a vectorized corpus, the engine that
    matches `topic_model` is looked up, and its predictions are combined
    with topic/token weights, overview and diagnostics.

    Args:
        topic_model (Any): Model used to select the engine via `get_engine_by_model_type`.
        corpus (Sparse2Corpus | VectorizedCorpus): Corpus to be predicted.
        id2token (Mapping[int, str] | dict | Token2Id): Id-to-token mapping.
        document_index (DocumentIndex): Document index; token counts are updated from the corpus.
        n_tokens (int, optional): Number of tokens per topic to keep. Defaults to 200.
        minimum_probability (float, optional): Passed to the engine's `predict`. Defaults to 0.001.

    Kwargs (also forwarded to `VectorizeOpts().update` and `engine.predict`):
        topic_token_weights (pd.DataFrame, optional): Existing topic token distribution,
            used instead of the engine's when not None.
        topic_token_overview (pd.DataFrame, optional): Existing overview, used instead
            of the engine's when not None.
        topic_diagnostics (optional): Overrides `engine.topic_diagnostics`.
        topic_token_diagnostics (optional): Overrides `engine.topic_token_diagnostics`.

    Returns:
        InferredTopicsData: The assembled prediction result.
    """
    vectorized_corpus: pc.VectorizedCorpus = dtm.TranslateCorpus.translate(
        corpus,
        token2id=pc.id2token2token2id(id2token),
        document_index=document_index,
        vectorize_opts=pc.VectorizeOpts().update(**kwargs),
    )

    engine: ITopicModelEngine = get_engine_by_model_type(topic_model)

    document_topic_weights: DocumentTopicsWeightsIter = engine.predict(
        vectorized_corpus, minimum_probability=minimum_probability, **kwargs
    )

    # Caller-supplied weights win; otherwise ask the engine for them.
    topic_token_weights: pd.DataFrame = kwargs.get('topic_token_weights')
    if topic_token_weights is None:
        topic_token_weights = engine.get_topic_token_weights(
            vocabulary=vectorized_corpus.id2token, n_tokens=n_tokens
        )

    # Same rule for the overview.
    topic_token_overview: pd.DataFrame = kwargs.get('topic_token_overview')
    if topic_token_overview is None:
        topic_token_overview = engine.get_topic_token_overview(topic_token_weights, n_tokens=n_tokens)

    topic_diagnostics: pd.DataFrame = kwargs.get('topic_diagnostics', engine.topic_diagnostics)
    topic_token_diagnostics: pd.DataFrame = kwargs.get('topic_token_diagnostics', engine.topic_token_diagnostics)

    document_index = pc.update_document_index_token_counts_by_corpus(document_index, vectorized_corpus)

    dictionary = pc.Token2Id.id2token_to_dataframe(vectorized_corpus.id2token)
    document_topics = to_dataframe(document_index, document_topic_weights)

    return InferredTopicsData(
        dictionary=dictionary,
        topic_token_weights=topic_token_weights,
        topic_token_overview=topic_token_overview,
        document_index=document_index,
        document_topic_weights=document_topics,
        topic_diagnostics=topic_diagnostics,
        token_diagnostics=topic_token_diagnostics,
    )