def test_pipeline_text_to_dtm_succeeds(config: pipeline.CorpusConfig):
    target_tag: str = f"{uuid.uuid1()}"
    tagged_corpus_source: str = os.path.join(CORPUS_FOLDER, 'checkpoint_pos_tagged_test.zip')
    extract_opts: corpora.ExtractTaggedTokensOpts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|NOUN|',
        pos_paddings=None,
        **config.pipeline_payload.tagged_columns_names,
        filter_opts=dict(is_punct=False),
    )
    corpus: corpora.VectorizedCorpus = (
        (
            pipeline.CorpusPipeline(config=config)
            .checkpoint(tagged_corpus_source, force_checkpoint=False)
            .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=None)
            .tokens_transform(transform_opts=corpora.TokensTransformOpts())
            .tokens_to_text()
            .tqdm()
            .to_dtm()
        )
        .single()
        .content
    )

    corpus.dump(tag=target_tag, folder=OUTPUT_FOLDER)

    assert isinstance(corpus, corpora.VectorizedCorpus)
    assert corpus.data.shape[0] == 5
    assert len(corpus.token2id) == corpus.data.shape[1]

    corpus.remove(tag=target_tag, folder=OUTPUT_FOLDER)

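# A minimal sketch (not part of the original tests) of reloading a corpus dumped with
# corpus.dump(tag=..., folder=...). The helper name is hypothetical; it only reuses calls
# that appear elsewhere in this suite (VectorizedCorpus.dump_exists / VectorizedCorpus.load).
def _reload_dumped_corpus_sketch(tag: str, folder: str) -> corpora.VectorizedCorpus:
    # Guard against a missing dump before attempting to load it.
    assert corpora.VectorizedCorpus.dump_exists(tag=tag, folder=folder)
    return corpora.VectorizedCorpus.load(tag=tag, folder=folder)
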
def test_workflow_to_dtm_step_by_step(config: pipeline.CorpusConfig):
    corpus_tag: str = f"{uuid.uuid1()}"
    target_folder: str = "./tests/output"
    corpus_source: str = './tests/test_data/legal_instrument_five_docs_test.zip'
    tagged_corpus_source: str = f"./tests/output/{uuid.uuid1()}_pos_csv.zip"

    args: ComputeOpts = ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder=target_folder,
        corpus_type=pipeline.CorpusType.SpacyCSV,
        # pos_scheme: PoS_Tag_Scheme = PoS_Tag_Schemes.Universal
        transform_opts=corpora.TokensTransformOpts(language='english', remove_stopwords=True, to_lower=True),
        text_reader_opts=corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1']),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_excludes='|PUNCT|EOL|SPACE|',
            **config.pipeline_payload.tagged_columns_names,
            filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
        ),
        create_subfolder=False,
        persist=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            min_tf=1,
            max_tokens=None,
        ),
        enable_checkpoint=True,
        force_checkpoint=True,
    )

    with inline_code(spacy_pipeline.to_tagged_frame_pipeline):

        tagged_frame_filename: str = tagged_corpus_source or utility.path_add_suffix(
            config.pipeline_payload.source, '_pos_csv'
        )

        p: pipeline.CorpusPipeline = (
            pipeline.CorpusPipeline(config=config)
            .set_spacy_model(config.pipeline_payload.memory_store['spacy_model'])
            .load_text(
                reader_opts=config.text_reader_opts,
                transform_opts=None,
                source=corpus_source,
            )
            .text_to_spacy()
            .spacy_to_pos_tagged_frame()
            .checkpoint(filename=tagged_frame_filename, force_checkpoint=args.force_checkpoint)
        )

        if args.enable_checkpoint:
            p = p.checkpoint_feather(folder=config.get_feather_folder(corpus_source), force=args.force_checkpoint)

        p.exhaust()

def debug_main(
    config_filename: str = None,
    corpus_source: str = None,
    lemmatize: bool = True,
    pos_includes: str = '',
    pos_excludes: str = '',
    to_lower: bool = True,
    remove_stopwords: str = None,
    min_word_length: int = 2,
    max_word_length: int = None,
    keep_symbols: bool = False,
    keep_numerals: bool = False,
    only_any_alphanumeric: bool = False,
    only_alphabetic: bool = False,
    min_tf: int = None,
):
    config: pipeline.CorpusConfig = load_config(config_filename, corpus_source)

    transform_opts: pc.TokensTransformOpts = pc.TokensTransformOpts(
        to_lower=to_lower,
        to_upper=False,
        min_len=min_word_length,
        max_len=max_word_length,
        remove_accents=False,
        remove_stopwords=(remove_stopwords is not None),
        stopwords=None,
        extra_stopwords=None,
        language=remove_stopwords,
        keep_numerals=keep_numerals,
        keep_symbols=keep_symbols,
        only_alphabetic=only_alphabetic,
        only_any_alphanumeric=only_any_alphanumeric,
    )

    extract_opts = pc.ExtractTaggedTokensOpts(
        lemmatize=lemmatize,
        pos_includes=pos_includes,
        pos_excludes=pos_excludes,
    ).set_numeric_names()

    vectorize_opts: pc.VectorizeOpts = pc.VectorizeOpts(already_tokenized=True, min_tf=min_tf, max_tokens=100000)

    corpus_source = corpus_source or config.pipeline_payload.source

    corpus: pc.VectorizedCorpus = id_tagged_frame_to_DTM_pipeline(
        corpus_config=config,
        corpus_source=corpus_source,
        file_pattern='**/prot-*.feather',
        extract_opts=extract_opts,
        transform_opts=transform_opts,
        vectorize_opts=vectorize_opts,
    ).value()

    corpus = corpus.slice_by_tf(5)

    os.makedirs('./data/bogger', exist_ok=True)
    corpus.dump(tag='bogger', folder='./data/bogger', mode='files')

    print(f"Stored corpus of shape {corpus.data.shape}")

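# Hypothetical invocation of debug_main above, for illustration only; the config path and
# corpus source are placeholders, not files that ship with the repository.
if __name__ == '__main__':
    debug_main(
        config_filename='./path/to/corpus-config.yml',  # placeholder
        corpus_source='./path/to/id-tagged-frames',     # placeholder
        lemmatize=True,
        pos_includes='',
        pos_excludes='',
        min_tf=1,
    )
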
def test_workflow_to_dtm():
    config: pipeline.CorpusConfig = pipeline.CorpusConfig.load('./tests/test_data/riksprot-kb-parlaclarin.yml')

    args: interface.ComputeOpts = interface.ComputeOpts(
        corpus_tag=f'{uuid.uuid1()}',
        corpus_source='/data/westac/riksdagen_corpus_data/riksprot_parlaclarin_basic_protocol_stanza.csv.zip',
        corpus_type=pipeline.CorpusType.SparvCSV,
        target_folder='./data',
        transform_opts=corpora.TokensTransformOpts(to_lower=True, only_alphabetic=True),
        # text_reader_opts=corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1']),
        text_reader_opts=config.text_reader_opts,
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='',
            pos_excludes='|MID|MAD|PAD|',
            **config.pipeline_payload.tagged_columns_names,
        ),
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            min_tf=1,
            max_tokens=None,
        ),
        create_subfolder=True,
        persist=True,
        enable_checkpoint=True,
        force_checkpoint=True,
        tf_threshold=5,
        tf_threshold_mask=True,
    )

    corpus = workflow.compute(args=args, corpus_config=config)

    corpus.remove(tag=args.corpus_tag, folder=args.target_folder)
    corpus.dump(tag=args.corpus_tag, folder=args.target_folder)

    assert corpora.VectorizedCorpus.dump_exists(tag=args.corpus_tag, folder=args.target_folder)

    corpus_loaded = corpora.VectorizedCorpus.load(tag=args.corpus_tag, folder=args.target_folder)
    assert corpus_loaded is not None

    y_corpus = corpus.group_by_year()
    assert y_corpus is not None

    with contextlib.suppress(Exception):
        corpus.remove(tag=args.corpus_tag, folder=args.target_folder)

def test_spaCy_co_occurrence_pipeline3(config):
    corpus_source = './tests/test_data/legal_instrument_five_docs_test.zip'
    tagged_corpus_source = f'./tests/output/{uuid.uuid1()}_pos.csv.zip'

    args: ComputeOpts = ComputeOpts(
        corpus_tag=f'{uuid.uuid1()}',
        corpus_source=corpus_source,
        target_folder=f'./tests/output/{uuid.uuid1()}',
        corpus_type=pipeline.CorpusType.SpacyCSV,
        # pos_scheme: utility.PoS_Tag_Scheme = utility.PoS_Tag_Schemes.Universal
        transform_opts=corpora.TokensTransformOpts(language='english', remove_stopwords=True, to_lower=True),
        text_reader_opts=corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1']),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_excludes='|PUNCT|EOL|SPACE|',
            **config.pipeline_payload.tagged_columns_names,
            filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
        ),
        create_subfolder=False,
        persist=True,
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            min_tf=1,
            max_tokens=None,
        ),
        enable_checkpoint=True,
        force_checkpoint=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        context_opts=co_occurrence.ContextOpts(
            context_width=4,
            concept=set(),
            ignore_concept=False,
            partition_keys=['document_id'],
        ),
    )

    workflow.compute(
        args=args,
        corpus_config=config,
        tagged_corpus_source=tagged_corpus_source,
    )

    assert os.path.isfile(tagged_corpus_source)
    assert os.path.isdir(args.target_folder)

    shutil.rmtree(args.target_folder, ignore_errors=True)
    os.remove(tagged_corpus_source)

def test_workflow_to_dtm(config: pipeline.CorpusConfig):
    args: ComputeOpts = ComputeOpts(
        corpus_tag=f'{uuid.uuid1()}',
        corpus_source='./tests/test_data/legal_instrument_five_docs_test.zip',
        corpus_type=pipeline.CorpusType.Text,
        target_folder='./tests/output/',
        transform_opts=corpora.TokensTransformOpts(language='english', remove_stopwords=True, to_lower=True),
        text_reader_opts=corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1']),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_excludes='|PUNCT|EOL|SPACE|',
            **config.pipeline_payload.tagged_columns_names,
            filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
        ),
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            min_tf=1,
            max_tokens=None,
        ),
        create_subfolder=False,
        persist=True,
        enable_checkpoint=True,
        force_checkpoint=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        tagged_corpus_source='./tests/output/legal_instrument_five_docs_test_pos_csv.zip',
    )

    corpus = workflow.compute(args=args, corpus_config=config)

    corpus.remove(tag=args.corpus_tag, folder=args.target_folder)
    corpus.dump(tag=args.corpus_tag, folder=args.target_folder)

    assert corpora.VectorizedCorpus.dump_exists(tag=args.corpus_tag, folder=args.target_folder)

    corpus_loaded = corpora.VectorizedCorpus.load(tag=args.corpus_tag, folder=args.target_folder)
    assert corpus_loaded is not None

    y_corpus = corpus.group_by_year()
    assert y_corpus is not None

    with contextlib.suppress(Exception):
        corpus.remove(tag=args.corpus_tag, folder=args.target_folder)

def test_spaCy_co_occurrence_workflow(config: pipeline.CorpusConfig):
    """Note: Use the output from this test case to update the tests/test_data/VENUS test data VENUS-TESTDATA"""
    os.makedirs('./tests/output', exist_ok=True)

    config.pipeline_payload.source = './tests/test_data/legal_instrument_five_docs_test.zip'
    config.pipeline_payload.document_index_source = './tests/test_data/legal_instrument_five_docs_test.csv'
    config.checkpoint_opts.feather_folder = f'tests/output/{uuid.uuid1()}'

    corpus_tag: str = 'VENUS'
    target_folder: str = f'./tests/output/{uuid.uuid1()}'
    tagged_corpus_source: str = "./tests/output/co_occurrence_test_pos_csv.zip"

    bundle: co_occurrence.Bundle = spaCy_co_occurrence_pipeline(
        corpus_config=config,
        corpus_source=None,
        transform_opts=corpora.TokensTransformOpts(language='english', remove_stopwords=True, to_lower=True),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_excludes='|PUNCT|EOL|SPACE|',
            **config.pipeline_payload.tagged_columns_names,
            filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
        ),
        context_opts=co_occurrence.ContextOpts(
            context_width=4, ignore_concept=True, partition_keys=['document_id'], processes=None
        ),
        global_threshold_count=1,
        tagged_corpus_source=tagged_corpus_source,
    ).value()

    assert bundle.corpus is not None
    assert bundle.token2id is not None
    assert bundle.document_index is not None

    bundle.tag = corpus_tag
    bundle.folder = target_folder
    bundle.co_occurrences = bundle.corpus.to_co_occurrences(bundle.token2id)
    bundle.store()

    shutil.rmtree(bundle.folder, ignore_errors=True)
    # The tagged corpus source is a zip file, not a folder, so remove it with os.remove
    with contextlib.suppress(FileNotFoundError):
        os.remove(tagged_corpus_source)
    shutil.rmtree(config.checkpoint_opts.feather_folder, ignore_errors=True)

def test_spaCy_co_occurrence_pipeline(config: pipeline.CorpusConfig):
    os.makedirs('./tests/output', exist_ok=True)

    tagged_corpus_source: str = "./tests/test_data/legal_instrument_five_docs_test_pos_csv.zip"
    target_filename = './tests/output/SSI-co-occurrence-JJVBNN-window-9.csv'
    if os.path.isfile(target_filename):
        os.remove(target_filename)

    # .folder(folder='./tests/test_data')
    pos_scheme: utility.PoS_Tag_Scheme = utility.PoS_Tag_Schemes.Universal
    transform_opts: corpora.TokensTransformOpts = corpora.TokensTransformOpts()
    extract_opts: corpora.ExtractTaggedTokensOpts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes=utility.pos_tags_to_str(pos_scheme.Adjective + pos_scheme.Verb + pos_scheme.Noun),
        pos_paddings=utility.pos_tags_to_str(pos_scheme.Conjunction),
        **config.pipeline_payload.tagged_columns_names,
        filter_opts=dict(is_punct=False),
    )
    context_opts: co_occurrence.ContextOpts = co_occurrence.ContextOpts(
        context_width=4,
        partition_keys=['document_id'],
    )
    global_threshold_count: int = 1

    value: co_occurrence.Bundle = spaCy_co_occurrence_pipeline(
        corpus_config=config,
        corpus_source=config.pipeline_payload.source,
        transform_opts=transform_opts,
        context_opts=context_opts,
        extract_opts=extract_opts,
        global_threshold_count=global_threshold_count,
        tagged_corpus_source=tagged_corpus_source,
    ).value()

    value.co_occurrences.to_csv(target_filename, sep='\t')

    assert os.path.isfile(target_filename)

    os.remove(target_filename)

def run_workflow():
    corpus_config = pipeline.CorpusConfig.load(CONFIG_FILENAME)  # .folders(DATA_FOLDER)
    # corpus_config.pipeline_payload.files(source=CORPUS_FILENAME, document_index_source=None)
    # corpus_config.checkpoint_opts.deserialize_processes = 3

    transform_opts: corpora.TokensTransformOpts = corpora.TokensTransformOpts(
        to_lower=False,
        to_upper=False,
        min_len=1,
        max_len=None,
        remove_accents=False,
        remove_stopwords=False,
        stopwords=None,
        extra_stopwords=None,
        language='swedish',
        keep_numerals=True,
        keep_symbols=True,
        only_alphabetic=False,
        only_any_alphanumeric=False,
    )
    extract_opts: corpora.ExtractTaggedTokensOpts = corpora.ExtractTaggedTokensOpts(
        pos_includes=None,
        pos_excludes=None,
        pos_paddings=None,
        lemmatize=True,
        append_pos=False,
        global_tf_threshold=1,
        global_tf_threshold_mask=False,
        **corpus_config.pipeline_payload.tagged_columns_names,
    )
    engine_args = {
        'n_topics': 4,
        'passes': 1,
        'random_seed': 42,
        'alpha': 'symmetric',
        'workers': 1,
        'max_iter': 500,
        'work_folder': './tests/output/',
    }

    # NOTE: the detailed opts constructed above are overridden here (kept for reference)
    extract_opts = "lemma"
    transform_opts = None

    _ = (
        CorpusPipeline(config=corpus_config)
        .load_id_tagged_frame(
            folder=CORPUS_FOLDER,
            file_pattern='**/prot-*.feather',
            id_to_token=True,
        )
        .tagged_frame_to_tokens(
            extract_opts=extract_opts,
            transform_opts=transform_opts,
        )
        .to_topic_model(
            target_mode="both",
            target_folder="./tests/output",
            target_name="APA",
            engine="gensim_lda-multicore",
            engine_args=engine_args,
            store_corpus=True,
            store_compressed=True,
        )
    ).value()

def run_workflow():
    corpus_config = pipeline.CorpusConfig.load(CONFIG_FILENAME).folders(DATA_FOLDER)
    corpus_config.pipeline_payload.files(source=CORPUS_FILENAME, document_index_source=None)
    corpus_config.checkpoint_opts.deserialize_processes = 4

    compute_opts = ComputeOpts(
        corpus_type=pipeline.CorpusType.SparvCSV,
        corpus_source=CORPUS_FILENAME,
        target_folder=jj(OUTPUT_FOLDER, 'APA'),
        corpus_tag='APA',
        transform_opts=corpora.TokensTransformOpts(
            to_lower=True,
            to_upper=False,
            min_len=1,
            max_len=None,
            remove_accents=False,
            remove_stopwords=False,
            stopwords=None,
            extra_stopwords=None,
            language='swedish',
            keep_numerals=True,
            keep_symbols=True,
            only_alphabetic=False,
            only_any_alphanumeric=False,
        ),
        text_reader_opts=corpora.TextReaderOpts(
            filename_pattern='*.csv',
            filename_filter=None,
            filename_fields=[
                'year:prot\\_(\\d{4}).*',
                'year2:prot_\\d{4}(\\d{2})__*',
                'number:prot_\\d+[afk_]{0,4}__(\\d+).*',
            ],
            index_field=None,
            as_binary=False,
            sep='\t',
            quoting=3,
        ),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            pos_includes='NN|PM',
            pos_excludes='MAD|MID|PAD',
            pos_paddings='AB|DT|HA|HD|HP|HS|IE|IN|JJ|KN|PC|PL|PN|PP|PS|RG|RO|SN|UO|VB',
            lemmatize=True,
            append_pos=False,
            global_tf_threshold=1,
            global_tf_threshold_mask=False,
            **corpus_config.pipeline_payload.tagged_columns_names,
        ),
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            stop_words=None,
            max_df=1.0,
            min_df=1,
            min_tf=1,
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        create_subfolder=True,
        persist=True,
        context_opts=ContextOpts(
            context_width=2,
            concept=set(['kammare']),
            ignore_concept=False,
            partition_keys=['document_name'],
            processes=4,
            chunksize=10,
        ),
        enable_checkpoint=False,
        force_checkpoint=False,
    )

    _ = workflow.compute(
        args=compute_opts,
        corpus_config=corpus_config,
        tagged_corpus_source=jj(OUTPUT_FOLDER, 'test.zip'),
    )

def main(
    config_filename: Optional[str] = None,
    corpus_source: Optional[str] = None,
    train_corpus_folder: Optional[str] = None,
    trained_model_folder: Optional[str] = None,
    target_mode: Literal['train', 'predict', 'both'] = 'both',
    target_folder: Optional[str] = None,
    target_name: Optional[str] = None,
    lemmatize: bool = True,
    pos_includes: str = '',
    pos_excludes: str = '',
    to_lower: bool = True,
    max_tokens: int = None,
    tf_threshold: int = None,
    remove_stopwords: Optional[str] = None,
    min_word_length: int = 2,
    max_word_length: int = None,
    keep_symbols: bool = False,
    keep_numerals: bool = False,
    alpha: str = 'asymmetric',
    chunk_size: int = 2000,
    engine: str = "gensim_lda-multicore",
    max_iter: int = None,
    minimum_probability: float = None,
    n_topics: int = 50,
    passes: int = None,
    per_word_topics: bool = False,
    random_seed: int = None,
    update_every: int = 1,
    workers: int = None,
    store_corpus: bool = True,
    store_compressed: bool = True,
    only_any_alphanumeric: bool = False,
    only_alphabetic: bool = False,
    fix_hyphenation: bool = True,
    fix_accents: bool = True,
    enable_checkpoint: bool = True,
    force_checkpoint: bool = False,
    passthrough_column: Optional[str] = None,
):
    if not config_filename or not os.path.isfile(config_filename):
        click.echo("error: config file not specified/found")
        sys.exit(1)

    if target_name is None:
        click.echo("error: target_name not specified")
        sys.exit(1)

    if target_mode == 'predict' and not InferredModel.exists(trained_model_folder):
        click.echo("error: trained model folder not specified")
        sys.exit(1)

    config: pipeline.CorpusConfig = load_config(config_filename, corpus_source)

    if corpus_source is None and config.pipeline_payload.source is None:
        click.echo("usage: corpus source must be specified")
        sys.exit(1)

    if not config.pipeline_key_exists("topic_modeling_pipeline"):
        click.echo("config error: `topic_modeling_pipeline` not specified")
        sys.exit(1)

    text_transform_opts: pc.TextTransformOpts = pc.TextTransformOpts()

    if fix_accents:
        text_transform_opts.fix_accents = True

    if fix_hyphenation:
        """Replace default dehyphen function"""
        # fix_hyphens: Callable[[str], str] = (
        #     remove_hyphens_fx(config.text_reader_opts.dehyphen_expr)
        #     if config.text_reader_opts.dehyphen_expr is not None
        #     else remove_hyphens
        # )
        text_transform_opts.fix_hyphenation = False
        text_transform_opts.extra_transforms.append(pc.remove_hyphens)

    transform_opts: pc.TokensTransformOpts = pc.TokensTransformOpts(
        to_lower=to_lower,
        to_upper=False,
        min_len=min_word_length,
        max_len=max_word_length,
        remove_accents=False,
        remove_stopwords=(remove_stopwords is not None),
        stopwords=None,
        extra_stopwords=None,
        language=remove_stopwords,
        keep_numerals=keep_numerals,
        keep_symbols=keep_symbols,
        only_alphabetic=only_alphabetic,
        only_any_alphanumeric=only_any_alphanumeric,
    )

    extract_opts: pc.ExtractTaggedTokensOpts = pc.ExtractTaggedTokensOpts(
        lemmatize=lemmatize,
        pos_includes=pos_includes,
        pos_excludes=pos_excludes,
        **config.pipeline_payload.tagged_columns_names,
    )

    if passthrough_column is not None:
        # Pass tokens through as-is from the named column instead of extracting/transforming
        extract_opts = passthrough_column
        text_transform_opts = None
        transform_opts = None

    engine_args: dict = remove_none(
        dict(
            alpha=alpha,
            chunk_size=chunk_size,
            max_iter=max_iter,
            minimum_probability=minimum_probability,
            n_topics=n_topics,
            passes=passes,
            per_word_topics=per_word_topics,
            random_seed=random_seed,
            update_every=update_every,
            work_folder=os.path.join(target_folder, target_name),
            workers=workers,
        )
    )

    _: dict = config.get_pipeline(
        pipeline_key="topic_modeling_pipeline",
        config=config,
        corpus_source=corpus_source,
        train_corpus_folder=train_corpus_folder,
        trained_model_folder=trained_model_folder,
        target_mode=target_mode,
        target_folder=target_folder,
        target_name=target_name,
        text_transform_opts=text_transform_opts,
        extract_opts=extract_opts,
        transform_opts=transform_opts,
        engine=engine,
        engine_args=engine_args,
        store_corpus=store_corpus,
        store_compressed=store_compressed,
        enable_checkpoint=enable_checkpoint,
        force_checkpoint=force_checkpoint,
    ).value()

def debug_main(
    config_filename: str = None,
    target_name: str = None,
    corpus_source: str = None,
    train_corpus_folder: str = None,
    target_folder: str = None,
    fix_hyphenation: bool = True,
    fix_accents: bool = True,
    lemmatize: bool = True,
    pos_includes: str = '',
    pos_excludes: str = '',
    to_lower: bool = True,
    remove_stopwords: str = None,
    min_word_length: int = 2,
    max_word_length: int = None,
    keep_symbols: bool = False,
    keep_numerals: bool = False,
    only_any_alphanumeric: bool = False,
    only_alphabetic: bool = False,
    max_tokens: int = None,
    alpha: str = 'asymmetric',
    chunk_size: int = 2000,
    engine: str = "gensim_lda-multicore",
    max_iter: int = None,
    minimum_probability: float = None,
    n_topics: int = 50,
    passes: int = None,
    per_word_topics: bool = False,
    random_seed: int = None,
    update_every: int = 1,
    workers: int = None,
    store_corpus: bool = True,
    store_compressed: bool = True,
    passthrough_column: str = None,
):
    config: pipeline.CorpusConfig = load_config(config_filename, corpus_source)

    if passthrough_column is None:

        text_transform_opts: pc.TextTransformOpts = pc.TextTransformOpts()

        if fix_accents:
            text_transform_opts.fix_accents = True

        if fix_hyphenation:
            """Replace default dehyphen function"""
            # fix_hyphens: Callable[[str], str] = (
            #     remove_hyphens_fx(config.text_reader_opts.dehyphen_expr)
            #     if config.text_reader_opts.dehyphen_expr is not None
            #     else remove_hyphens
            # )
            text_transform_opts.fix_hyphenation = False
            text_transform_opts.extra_transforms.append(pc.remove_hyphens)

        transform_opts: pc.TokensTransformOpts = pc.TokensTransformOpts(
            to_lower=to_lower,
            to_upper=False,
            min_len=min_word_length,
            max_len=max_word_length,
            remove_accents=False,
            remove_stopwords=(remove_stopwords is not None),
            stopwords=None,
            extra_stopwords=None,
            language=remove_stopwords,
            keep_numerals=keep_numerals,
            keep_symbols=keep_symbols,
            only_alphabetic=only_alphabetic,
            only_any_alphanumeric=only_any_alphanumeric,
        )

        extract_opts = pc.ExtractTaggedTokensOpts(
            lemmatize=lemmatize,
            pos_includes=pos_includes,
            pos_excludes=pos_excludes,
        ).set_numeric_names()

    else:
        # Pass tokens through as-is so that extract_opts/transform_opts are defined on this path as well
        extract_opts = passthrough_column
        text_transform_opts = None
        transform_opts = None

    engine_args = remove_none(
        {
            'n_topics': n_topics,
            'passes': passes,
            'random_seed': random_seed,
            'alpha': alpha,
            'workers': workers,
            'max_iter': max_iter,
            'work_folder': os.path.join(target_folder, target_name),
            'chunk_size': chunk_size,
            'update_every': 2,
        }
    )

    vectorize_opts: VectorizeOpts = VectorizeOpts(
        already_tokenized=True,
        max_tokens=max_tokens,
        lowercase=False,
    )

    corpus_source = corpus_source or config.pipeline_payload.source

    _: dict = from_id_tagged_frame_pipeline(
        corpus_config=config,
        corpus_source=corpus_source,
        file_pattern='**/*.feather',
        extract_opts=extract_opts,
        transform_opts=transform_opts,
        vectorize_opts=vectorize_opts,
        target_name=target_name,
        train_corpus_folder=train_corpus_folder,
        target_folder=target_folder,
        engine=engine,
        engine_args=engine_args,
        store_corpus=store_corpus,
        store_compressed=store_compressed,
    ).value()

def main(
    corpus_source: str = None,
    config_filename: str = None,
    model_folder: str = None,
    model_name: str = None,
    target_name: str = None,
    target_folder: str = None,
    lemmatize: bool = True,
    pos_includes: str = '',
    pos_excludes: str = '',
    to_lower: bool = True,
    remove_stopwords: str = None,
    min_word_length: int = 2,
    max_word_length: int = None,
    keep_symbols: bool = False,
    keep_numerals: bool = False,
    only_any_alphanumeric: bool = False,
    only_alphabetic: bool = False,
    minimum_probability: float = 0.001,
    n_tokens: int = 200,
    enable_checkpoint: bool = True,
    force_checkpoint: bool = False,
):
    config: pipeline.CorpusConfig = pipeline.CorpusConfig.load(path=config_filename)

    transform_opts: penelope.TokensTransformOpts = penelope.TokensTransformOpts(
        to_lower=to_lower,
        to_upper=False,
        min_len=min_word_length,
        max_len=max_word_length,
        remove_accents=False,
        remove_stopwords=(remove_stopwords is not None),
        stopwords=None,
        extra_stopwords=None,
        language=remove_stopwords,
        keep_numerals=keep_numerals,
        keep_symbols=keep_symbols,
        only_alphabetic=only_alphabetic,
        only_any_alphanumeric=only_any_alphanumeric,
    )

    extract_opts = penelope.ExtractTaggedTokensOpts(
        lemmatize=lemmatize,
        pos_includes=pos_includes,
        pos_excludes=pos_excludes,
        **config.pipeline_payload.tagged_columns_names,
    )

    tag, folder = workflow(
        config=config,
        model_name=model_name,
        model_folder=model_folder,
        target_name=target_name,
        target_folder=target_folder,
        corpus_source=corpus_source,
        extract_opts=extract_opts,
        transform_opts=transform_opts,
        minimum_probability=minimum_probability,
        n_tokens=n_tokens,
        enable_checkpoint=enable_checkpoint,
        force_checkpoint=force_checkpoint,
    )

    logger.info(f"Done! Model {tag} stored in {folder}")

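# Hypothetical invocation of the main() entry point defined above, for illustration only;
# every path and name below is a placeholder, not a file or model that ships with the repository.
if __name__ == '__main__':
    main(
        config_filename='./path/to/corpus-config.yml',  # placeholder
        corpus_source='./path/to/corpus.zip',           # placeholder
        model_folder='./data/models',                   # placeholder
        model_name='my_trained_model',                  # placeholder
        target_name='my_prediction',                    # placeholder
        target_folder='./data/predictions',             # placeholder
        lemmatize=True,
        pos_includes='|NOUN|PROPN|VERB|',
        to_lower=True,
    )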