def test_workflow_to_dtm_step_by_step(config: pipeline.CorpusConfig):
    corpus_tag: str = str(uuid.uuid1())
    target_folder: str = "./tests/output"
    corpus_source: str = './tests/test_data/legal_instrument_five_docs_test.zip'
    tagged_corpus_source: str = f"./tests/output/{uuid.uuid1()}_pos_csv.zip"
    args: ComputeOpts = ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder=target_folder,
        corpus_type=pipeline.CorpusType.SpacyCSV,
        # pos_scheme: PoS_Tag_Scheme = PoS_Tag_Schemes.Universal
        transform_opts=corpora.TokensTransformOpts(language='english', remove_stopwords=True, to_lower=True),
        text_reader_opts=corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1']),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_excludes='|PUNCT|EOL|SPACE|',
            **config.pipeline_payload.tagged_columns_names,
            filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
        ),
        create_subfolder=False,
        persist=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            min_tf=1,
            max_tokens=None,
        ),
        enable_checkpoint=True,
        force_checkpoint=True,
    )

    with inline_code(spacy_pipeline.to_tagged_frame_pipeline):

        tagged_frame_filename: str = tagged_corpus_source or utility.path_add_suffix(
            config.pipeline_payload.source, '_pos_csv'
        )

        p: pipeline.CorpusPipeline = (
            pipeline.CorpusPipeline(config=config)
            .set_spacy_model(config.pipeline_payload.memory_store['spacy_model'])
            .load_text(
                reader_opts=config.text_reader_opts,
                transform_opts=None,
                source=corpus_source,
            )
            .text_to_spacy()
            .spacy_to_pos_tagged_frame()
            .checkpoint(filename=tagged_frame_filename, force_checkpoint=args.force_checkpoint)
        )

        if args.enable_checkpoint:
            p = p.checkpoint_feather(folder=config.get_feather_folder(corpus_source), force=args.force_checkpoint)

        p.exhaust()
def compute_callback(args: interface.ComputeOpts, corpus_config: pipeline.CorpusConfig) -> VectorizedCorpus:
    global LAST_ARGS, LAST_CORPUS_CONFIG

    LAST_ARGS = args
    LAST_CORPUS_CONFIG = corpus_config

    if args.dry_run:
        print(args.command_line("vectorize_corpus"))
        return None

    corpus: VectorizedCorpus = workflow.compute(args=args, corpus_config=corpus_config)

    return corpus
def ComputeOptsSparvCSV(
    *,
    corpus_tag: str = 'TELLUS',
    corpus_source: str = './tests/test_data/tranströmer_corpus_export.sparv4.csv.zip',
) -> ComputeOpts:  # pylint: disable=too-many-instance-attributes
    return ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder="./tests/output",
        corpus_type=CorpusType.SparvCSV,
        transform_opts=TokensTransformOpts(
            to_lower=True,
            min_len=1,
            remove_stopwords=None,
            keep_symbols=True,
            keep_numerals=True,
            only_alphabetic=False,
            only_any_alphanumeric=False,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_fields=('year:_:1',),
            index_field=None,  # use filename
            as_binary=False,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            pos_includes=None,
            pos_excludes='|MAD|MID|PAD|',
            pos_paddings=None,
            lemmatize=False,
            **SPARV_TAGGED_COLUMNS,
            filter_opts=dict(
                is_alpha=False,
                is_punct=False,
                is_digit=None,
                is_stop=None,
                is_space=False,
            ),
        ),
        create_subfolder=False,
        persist=True,
        context_opts=ContextOpts(
            concept=('jag',),
            context_width=2,
            partition_keys=['document_id'],
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=VectorizeOpts(already_tokenized=True, min_tf=1, max_tokens=None),
    )
def test_spaCy_co_occurrence_pipeline3(config):
    corpus_source = './tests/test_data/legal_instrument_five_docs_test.zip'
    tagged_corpus_source = f'./tests/output/{uuid.uuid1()}_pos.csv.zip'
    args: ComputeOpts = ComputeOpts(
        corpus_tag=f'{uuid.uuid1()}',
        corpus_source=corpus_source,
        target_folder=f'./tests/output/{uuid.uuid1()}',
        corpus_type=pipeline.CorpusType.SpacyCSV,
        # pos_scheme: utility.PoS_Tag_Scheme = utility.PoS_Tag_Schemes.Universal
        transform_opts=corpora.TokensTransformOpts(language='english', remove_stopwords=True, to_lower=True),
        text_reader_opts=corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1']),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_excludes='|PUNCT|EOL|SPACE|',
            **config.pipeline_payload.tagged_columns_names,
            filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
        ),
        create_subfolder=False,
        persist=True,
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            min_tf=1,
            max_tokens=None,
        ),
        enable_checkpoint=True,
        force_checkpoint=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        context_opts=co_occurrence.ContextOpts(
            context_width=4,
            concept=set(),
            ignore_concept=False,
            partition_keys=['document_id'],
        ),
    )

    workflow.compute(
        args=args,
        corpus_config=config,
        tagged_corpus_source=tagged_corpus_source,
    )

    assert os.path.isfile(tagged_corpus_source)
    assert os.path.isdir(args.target_folder)

    shutil.rmtree(args.target_folder, ignore_errors=True)
    os.remove(tagged_corpus_source)
def test_workflow_to_dtm(config: pipeline.CorpusConfig):
    args: ComputeOpts = ComputeOpts(
        corpus_tag=f'{uuid.uuid1()}',
        corpus_source='./tests/test_data/legal_instrument_five_docs_test.zip',
        corpus_type=pipeline.CorpusType.Text,
        target_folder='./tests/output/',
        transform_opts=corpora.TokensTransformOpts(language='english', remove_stopwords=True, to_lower=True),
        text_reader_opts=corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1']),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_excludes='|PUNCT|EOL|SPACE|',
            **config.pipeline_payload.tagged_columns_names,
            filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
        ),
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            min_tf=1,
            max_tokens=None,
        ),
        create_subfolder=False,
        persist=True,
        enable_checkpoint=True,
        force_checkpoint=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        tagged_corpus_source='./tests/output/legal_instrument_five_docs_test_pos_csv.zip',
    )

    corpus = workflow.compute(args=args, corpus_config=config)

    corpus.remove(tag=args.corpus_tag, folder=args.target_folder)
    corpus.dump(tag=args.corpus_tag, folder=args.target_folder)

    assert corpora.VectorizedCorpus.dump_exists(tag=args.corpus_tag, folder=args.target_folder)

    corpus_loaded = corpora.VectorizedCorpus.load(tag=args.corpus_tag, folder=args.target_folder)
    assert corpus_loaded is not None

    y_corpus = corpus.group_by_year()
    assert y_corpus is not None

    with contextlib.suppress(Exception):
        corpus.remove(tag=args.corpus_tag, folder=args.target_folder)
def compute(
    args: interface.ComputeOpts,
    corpus_config: CorpusConfig,
    tagged_frame_pipeline: pipeline.CorpusPipeline = None,
) -> VectorizedCorpus:
    """Resolves a tagged-frame pipeline, appends the DTM vectorization steps, then optionally slices and persists the resulting corpus."""
    try:
        assert args.is_satisfied()

        if tagged_frame_pipeline is None:
            tagged_frame_pipeline = corpus_config.get_pipeline(
                "tagged_frame_pipeline",
                corpus_source=args.corpus_source,
                enable_checkpoint=args.enable_checkpoint,
                force_checkpoint=args.force_checkpoint,
                tagged_corpus_source=args.tagged_corpus_source,
            )

        corpus: VectorizedCorpus = (
            tagged_frame_pipeline
            + wildcard_to_DTM_pipeline(
                transform_opts=args.transform_opts,
                extract_opts=args.extract_opts,
                vectorize_opts=args.vectorize_opts,
            )
        ).value()

        if (args.tf_threshold or 1) > 1:
            corpus = corpus.slice_by_tf(args.tf_threshold)

        if args.persist:
            store_corpus_bundle(corpus, args)

        return corpus

    except Exception as ex:
        raise ex
def run_workflow():
    corpus_config = pipeline.CorpusConfig.load(CONFIG_FILENAME).folders(DATA_FOLDER)
    corpus_config.pipeline_payload.files(source=CORPUS_FILENAME, document_index_source=None)
    corpus_config.checkpoint_opts.deserialize_processes = 4

    compute_opts = ComputeOpts(
        corpus_type=pipeline.CorpusType.SparvCSV,
        corpus_source=CORPUS_FILENAME,
        target_folder=jj(OUTPUT_FOLDER, 'APA'),
        corpus_tag='APA',
        transform_opts=corpora.TokensTransformOpts(
            to_lower=True,
            to_upper=False,
            min_len=1,
            max_len=None,
            remove_accents=False,
            remove_stopwords=False,
            stopwords=None,
            extra_stopwords=None,
            language='swedish',
            keep_numerals=True,
            keep_symbols=True,
            only_alphabetic=False,
            only_any_alphanumeric=False,
        ),
        text_reader_opts=corpora.TextReaderOpts(
            filename_pattern='*.csv',
            filename_filter=None,
            filename_fields=[
                'year:prot\\_(\\d{4}).*',
                'year2:prot_\\d{4}(\\d{2})__*',
                'number:prot_\\d+[afk_]{0,4}__(\\d+).*',
            ],
            index_field=None,
            as_binary=False,
            sep='\t',
            quoting=3,
        ),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            pos_includes='NN|PM',
            pos_excludes='MAD|MID|PAD',
            pos_paddings='AB|DT|HA|HD|HP|HS|IE|IN|JJ|KN|PC|PL|PN|PP|PS|RG|RO|SN|UO|VB',
            lemmatize=True,
            append_pos=False,
            global_tf_threshold=1,
            global_tf_threshold_mask=False,
            **corpus_config.pipeline_payload.tagged_columns_names,
        ),
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            stop_words=None,
            max_df=1.0,
            min_df=1,
            min_tf=1,
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        create_subfolder=True,
        persist=True,
        context_opts=ContextOpts(
            context_width=2,
            concept=set(['kammare']),
            ignore_concept=False,
            partition_keys=['document_name'],
            processes=4,
            chunksize=10,
        ),
        enable_checkpoint=False,
        force_checkpoint=False,
    )

    _ = workflow.compute(
        args=compute_opts,
        corpus_config=corpus_config,
        tagged_corpus_source=jj(OUTPUT_FOLDER, 'test.zip'),
    )
def ComputeOptsSpacyCSV(
    *,
    corpus_tag: str = 'MARS',
    corpus_source: str = './tests/test_data/legal_instrument_five_docs_test.zip',
) -> ComputeOpts:  # pylint: disable=too-many-instance-attributes
    return ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder="./tests/output",
        corpus_type=CorpusType.SpacyCSV,
        # pos_scheme: PoS_Tag_Scheme = PoS_Tag_Schemes.Universal
        transform_opts=TokensTransformOpts(
            extra_stopwords=[],
            keep_numerals=True,
            keep_symbols=True,
            language='english',
            max_len=None,
            min_len=1,
            only_alphabetic=False,
            only_any_alphanumeric=False,
            remove_accents=False,
            remove_stopwords=True,
            stopwords=None,
            to_lower=True,
            to_upper=False,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_fields=['year:_:1'],
            index_field=None,  # use filename
            as_binary=False,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            lemmatize=True,
            target_override=None,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_paddings=None,
            pos_excludes='|PUNCT|EOL|SPACE|',
            passthrough_tokens=[],
            block_tokens=[],
            append_pos=False,
            global_tf_threshold=1,
            global_tf_threshold_mask=False,
            **SPACY_TAGGED_COLUMNS,
            filter_opts=dict(
                is_alpha=False,
                is_punct=False,
                is_digit=None,
                is_stop=None,
                is_space=False,
            ),
        ),
        create_subfolder=False,
        persist=True,
        context_opts=ContextOpts(
            context_width=4,
            concept=set(),
            ignore_concept=False,
            partition_keys=['document_id'],
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            max_df=1.0,
            min_df=1,
            min_tf=1,
            max_tokens=None,
        ),
    )
def compute(
    args: interface.ComputeOpts,
    corpus_config: pipeline.CorpusConfig,
    tagged_corpus_source: Optional[str] = None,
) -> co_occurrence.Bundle:
    """Creates and stores a concept co-occurrence bundle using specified options."""
    try:
        assert args.is_satisfied()

        target_filename = co_occurrence.to_filename(folder=args.target_folder, tag=args.corpus_tag)

        os.makedirs(args.target_folder, exist_ok=True)

        tagged_corpus_source: Optional[str] = tagged_corpus_source or jj(
            dirname(args.corpus_source), f"{args.corpus_tag}{POS_TAGGED_FRAME_FILENAME_POSTFIX}"
        )

        tagged_frame_pipeline: pipeline.CorpusPipeline = corpus_config.get_pipeline(
            "tagged_frame_pipeline",
            corpus_source=args.corpus_source,
            tagged_corpus_source=tagged_corpus_source,
            enable_checkpoint=args.enable_checkpoint,
            force_checkpoint=args.force_checkpoint,
        )

        args.extract_opts.passthrough_tokens = args.context_opts.concept
        args.extract_opts.block_tokens = []
        # args.extract_opts.block_chars = ''
        args.extract_opts.global_tf_threshold = args.tf_threshold
        args.extract_opts.global_tf_threshold_mask = args.tf_threshold_mask

        p: pipeline.CorpusPipeline = (
            tagged_frame_pipeline
            + pipeline.wildcard_to_partition_by_document_co_occurrence_pipeline(
                transform_opts=args.transform_opts,
                extract_opts=args.extract_opts,
                context_opts=args.context_opts,
                global_tf_threshold=args.tf_threshold,
            )
        )

        bundle: co_occurrence.Bundle = p.value()

        if bundle.corpus is None:
            raise co_occurrence.ZeroComputeError()

        bundle.tag = args.corpus_tag
        bundle.folder = args.target_folder

        try:
            bundle.co_occurrences = bundle.corpus.to_co_occurrences(bundle.token2id)
        except ValueError as ex:
            logger.error("fatal: to_co_occurrences failed (skipping)")
            logger.exception(ex)

        bundle.compute_options = compile_compute_options(args, target_filename)

        bundle.store()

        return bundle

    except (ValueError, FileNotFoundError, PermissionError) as ex:
        logger.error(ex)
        raise
    except Exception as ex:
        logger.error(ex)
        raise