def create_test_data_bundles():
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    config: CorpusConfig = CorpusConfig.load('./tests/test_data/SSI.yml')
    config.pipeline_payload.source = './tests/test_data/legal_instrument_five_docs_test.zip'
    config.pipeline_payload.document_index_source = './tests/test_data/legal_instrument_five_docs_test.csv'

    tag: str = 'VENUS-CONCEPT'
    create_bundle_by_spaCy_pipeline(
        config=config,
        context_opts=ContextOpts(
            context_width=4, concept={"cultural"}, ignore_concept=True, partition_keys=['document_id']
        ),
        tag=tag,
    ).store()

    tag: str = 'VENUS'
    create_bundle_by_spaCy_pipeline(
        config=config,
        context_opts=ContextOpts(context_width=4, concept={}, ignore_concept=True, partition_keys=['document_id']),
        tag=tag,
    ).store()

    tag: str = 'ABCDEFG_7DOCS'
    create_bundle_by_spaCy_pipeline(
        config=config,
        context_opts=ContextOpts(context_width=4, concept={}, ignore_concept=True, partition_keys=['document_id']),
        tag=tag,
    ).store()

    tag: str = 'ABCDEFG_7DOCS'
    create_simple_bundle_by_pipeline(
        data=SIMPLE_CORPUS_ABCDEFG_7DOCS,
        context_opts=ContextOpts(concept={}, ignore_concept=False, context_width=2),
        tag=tag,
        folder=DATA_FOLDER,
    ).store()

    tag: str = 'ABCDEFG_7DOCS_CONCEPT'
    create_simple_bundle_by_pipeline(
        data=SIMPLE_CORPUS_ABCDEFG_7DOCS,
        context_opts=ContextOpts(concept={'g'}, ignore_concept=False, context_width=2),
        tag=tag,
        folder=DATA_FOLDER,
    ).store()

def wildcard_to_partition_by_document_co_occurrence_pipeline(
    *,
    extract_opts: ExtractTaggedTokensOpts = None,
    transform_opts: TokensTransformOpts = None,
    context_opts: ContextOpts = None,
    global_tf_threshold: int = None,
    **kwargs,  # pylint: disable=unused-argument
) -> CorpusPipeline:
    passthroughs: set = context_opts.get_concepts().union(extract_opts.get_passthrough_tokens())
    pipeline: pipelines.CorpusPipeline = (
        pipelines.wildcard()
        .vocabulary(
            lemmatize=extract_opts.lemmatize,
            progress=True,
            tf_threshold=extract_opts.global_tf_threshold,
            tf_keeps=passthroughs,
            close=True,
        )
        .tagged_frame_to_tokens(
            extract_opts=extract_opts,  # .clear_tf_threshold(),
            transform_opts=transform_opts,
        )
        # .tokens_transform(transform_opts=transform_opts)
        .to_document_co_occurrence(context_opts=context_opts)
        # .tqdm(desc="Processing documents")
        .to_corpus_co_occurrence(
            context_opts=context_opts,
            global_threshold_count=global_tf_threshold,
        )
    )
    return pipeline

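# Illustrative usage of the factory above (not part of the test suite): all option values
# below are hypothetical, and SPACY_TAGGED_COLUMNS is assumed to be available as in the
# other test fixtures. The returned wildcard pipeline must still be resolved against a
# concrete corpus config/payload before it is executed.
def example_wildcard_pipeline_usage() -> CorpusPipeline:
    return wildcard_to_partition_by_document_co_occurrence_pipeline(
        extract_opts=ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NOUN|VERB|',
            pos_excludes='|PUNCT|',
            global_tf_threshold=1,
            **SPACY_TAGGED_COLUMNS,
        ),
        transform_opts=TokensTransformOpts(to_lower=True),
        context_opts=ContextOpts(concept={'culture'}, context_width=2, partition_keys=['document_id']),
        global_tf_threshold=1,
    )
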
def test_tasks_pool_tokens_to_ttm():
    corpus: TokenizedCorpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_3DOCS)
    token2id: dict = corpus.token2id
    context_opts: ContextOpts = ContextOpts(
        concept={'d'},
        ignore_concept=False,
        ignore_padding=False,
        context_width=1,
        processes=None,
    )
    token2id[context_opts.pad] = len(token2id)
    concept_ids = {token2id[x] for x in context_opts.concept}
    ignore_ids = set()

    filename, tokens = next(corpus)

    # doc_info = corpus.document_index[corpus.document_index.filename == filename].to_dict('records')[0]
    doc_info = faster_to_dict_records(corpus.document_index[corpus.document_index.filename == filename])[0]

    token_ids = [token2id[t] for t in tokens]
    pad_id = token2id[context_opts.pad]

    args = (
        doc_info['document_id'],
        doc_info['document_name'],
        doc_info['filename'],
        token_ids,
        pad_id,
        context_opts,
        concept_ids,
        ignore_ids,
        len(token2id),
    )

    item: dict = tokens_to_ttm(args)

    assert item is not None

def test_compress_corpus():
    context_opts: ContextOpts = ContextOpts(
        concept={'d'}, ignore_concept=False, context_width=1, processes=None, ignore_padding=False
    )
    bundle: Bundle = create_simple_bundle_by_pipeline(
        data=SIMPLE_CORPUS_ABCDE_3DOCS, context_opts=context_opts, compress=False
    )
    concept_corpus: VectorizedCorpus = bundle.concept_corpus

    assert (
        (
            concept_corpus.data.todense()
            == np.matrix(
                [
                    [0, 0, 0, 0, 5, 0, 0, 0, 1, 1, 0, 0, 0],
                    [0, 0, 0, 0, 3, 0, 0, 0, 1, 1, 2, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 1, 0, 0],
                ],
                dtype=np.int32,
            )
        )
        .all()
        .all()
    )

    _, ids_translation, keep_ids = concept_corpus.compress(tf_threshold=1, extra_keep_ids=[1], inplace=True)

    assert (
        (
            concept_corpus.data.todense()
            == np.matrix(
                [
                    [0, 5, 0, 1, 1, 0],
                    [0, 3, 0, 1, 1, 2],
                    [0, 0, 1, 0, 3, 1],
                ],
                dtype=np.int32,
            )
        )
        .all()
        .all()
    )
    assert keep_ids.tolist() == [1, 4, 7, 8, 9, 10]
    assert ids_translation == {1: 0, 4: 1, 7: 2, 8: 3, 9: 4, 10: 5}

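# Self-contained sketch of what `compress` is asserted to do above: keep columns whose term
# frequency meets the threshold (plus any explicitly kept ids) and build an old-id -> new-id
# translation. Helper name and return order are illustrative only, not the VectorizedCorpus
# implementation.
def compress_sketch(dtm, tf_threshold: int, extra_keep_ids=()) -> tuple:
    """Return (compressed dtm, ids_translation, keep_ids); illustrative only."""
    import numpy as np  # local import keeps the sketch self-contained

    term_frequencies = np.asarray(dtm.sum(axis=0)).ravel()
    keep_ids = np.flatnonzero(term_frequencies >= tf_threshold)
    keep_ids = np.unique(np.concatenate([keep_ids, np.asarray(extra_keep_ids, dtype=int)]))
    ids_translation = {int(old_id): new_id for new_id, old_id in enumerate(keep_ids)}
    return dtm[:, keep_ids], ids_translation, keep_ids
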
def context_opts(self) -> ContextOpts:
    return ContextOpts(
        concept=self.concept_tokens,
        context_width=self._context_width.value,
        ignore_concept=self._ignore_concept.value,
        ignore_padding=self._ignore_padding.value,
        partition_keys=[self._partition_key.value],
    )

def create_simple_bundle() -> Bundle:
    simple_corpus = very_simple_corpus(SIMPLE_CORPUS_ABCDEFG_3DOCS)
    context_opts: ContextOpts = ContextOpts(concept=set(), ignore_concept=False, context_width=2)
    bundle: Bundle = create_simple_bundle_by_pipeline(
        data=simple_corpus,
        context_opts=context_opts,
    )
    return bundle

def ComputeOptsSparvCSV(
    *,
    corpus_tag: str = 'TELLUS',
    corpus_source: str = './tests/test_data/tranströmer_corpus_export.sparv4.csv.zip',
) -> ComputeOpts:  # pylint: disable=too-many-instance-attributes
    return ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder="./tests/output",
        corpus_type=CorpusType.SparvCSV,
        transform_opts=TokensTransformOpts(
            to_lower=True,
            min_len=1,
            remove_stopwords=None,
            keep_symbols=True,
            keep_numerals=True,
            only_alphabetic=False,
            only_any_alphanumeric=False,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_fields=('year:_:1',),
            index_field=None,  # use filename
            as_binary=False,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            pos_includes=None,
            pos_excludes='|MAD|MID|PAD|',
            pos_paddings=None,
            lemmatize=False,
            **SPARV_TAGGED_COLUMNS,
            filter_opts=dict(
                is_alpha=False,
                is_punct=False,
                is_digit=None,
                is_stop=None,
                is_space=False,
            ),
        ),
        create_subfolder=False,
        persist=True,
        context_opts=ContextOpts(
            concept=('jag',),
            context_width=2,
            partition_keys=['document_id'],
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=VectorizeOpts(already_tokenized=True, min_tf=1, max_tokens=None),
    )

def test_HAL_cwr_corpus_burgess_litmus():
    data = [('document_01.txt', 'The Horse Raced Past The Barn Fell .'.lower().split())]
    context_opts: ContextOpts = ContextOpts(
        context_width=2,
        concept=set(),
        ignore_padding=False,
    )
    bundle: Bundle = create_simple_bundle_by_pipeline(data, context_opts)
    hal_cwr_corpus: VectorizedCorpus = bundle.corpus.to_HAL_cwr_keyness()
    assert hal_cwr_corpus is not None

def create_keyness_test_bundle(
    data: Any,
    *,
    concept: str = 'd',
    ignore_padding=True,
    context_width: int = 1,
    processes: int = 2,
) -> Bundle:
    context_opts: ContextOpts = ContextOpts(
        concept={concept},
        ignore_concept=False,
        ignore_padding=ignore_padding,
        context_width=context_width,
        processes=processes,
    )
    bundle: Bundle = create_simple_bundle_by_pipeline(data=data, context_opts=context_opts)
    return bundle

def test_compute_hal_score_by_co_occurrence_matrix_burgess_litmus():
    data = [('document_01.txt', 'The Horse Raced Past The Barn Fell .'.lower().split())]
    context_opts: ContextOpts = ContextOpts(
        context_width=2,
        concept=set(),
    )
    bundle: Bundle = create_simple_bundle_by_pipeline(data, context_opts)
    co_occurrences = bundle.co_occurrences
    co_occurrences['cwr'] = compute_hal_score_by_co_occurrence_matrix(
        bundle.co_occurrences, bundle.corpus.window_counts.document_term_window_counts
    )
    assert 'cwr' in co_occurrences.columns

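# Rough aide-mémoire for the score used above: a "common windows ratio" (CWR) style measure.
# The exact formula and normalisation inside compute_hal_score_by_co_occurrence_matrix /
# to_HAL_cwr_keyness may differ; treat this as an assumption, not the library's definition.
def cwr_sketch(nw_xy: int, nw_x: int, nw_y: int) -> float:
    """Share of windows common to both tokens: nw_xy / (nw_x + nw_y - nw_xy)."""
    denominator = nw_x + nw_y - nw_xy
    return nw_xy / denominator if denominator > 0 else 0.0


# Example: if 'horse' occurs in 3 windows, 'barn' in 2, and they share 1 window,
# the sketch gives 1 / (3 + 2 - 1) = 0.25.
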
def test_tokens_to_windows():
    context_opts: ContextOpts = ContextOpts(
        concept=set(), context_width=1, ignore_padding=False, pad="*", min_window_size=0
    )
    tokens: Iterable[Token] = ["a", "*", "c", "a", "e", "*", "*", "h"]
    token2id: Token2Id = Token2Id().ingest([context_opts.pad] + tokens)

    expected_windows = [
        ['*', 'a', '*'],
        ['a', '*', 'c'],
        ['*', 'c', 'a'],
        ['c', 'a', 'e'],
        ['a', 'e', '*'],
        ['e', '*', '*'],
        ['*', '*', 'h'],
        ['*', 'h', '*'],
    ]
    windows: Iterable[Iterable[str]] = generate_windows(
        token_ids=[token2id[t] for t in tokens],
        context_width=context_opts.context_width,
        pad_id=token2id[context_opts.pad],
        ignore_pads=False,
    )
    assert list(windows) == [[token2id[t] for t in w] for w in expected_windows]

    expected_windows = [
        ['a'],
        ['a', 'c'],
        ['c', 'a'],
        ['c', 'a', 'e'],
        ['a', 'e'],
        ['e'],
        ['h'],
        ['h'],
    ]
    windows: Iterable[Iterable[str]] = generate_windows(
        token_ids=[token2id[t] for t in tokens],
        context_width=context_opts.context_width,
        pad_id=token2id[context_opts.pad],
        ignore_pads=True,
    )
    assert list(windows) == [[token2id[t] for t in w] for w in expected_windows]

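# Pure-Python sketch of the windowing behaviour exercised above: pad the sequence with
# `context_width` pads on each side, then slide a (2 * context_width + 1)-wide window centred
# on every original token. This illustrates the expected output only; the real generate_windows
# also handles min_window_size, pad filtering, and other options.
def sliding_windows_sketch(token_ids, context_width: int, pad_id: int):
    """Return one centred window per original token, padding the edges."""
    padded = [pad_id] * context_width + list(token_ids) + [pad_id] * context_width
    size = 2 * context_width + 1
    return [padded[i : i + size] for i in range(len(token_ids))]


# For context_width=1 and pad '*', this reproduces the first expected_windows above.
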
def test_generate_cli_opts():
    compute_opts = interface.ComputeOpts(
        corpus_type=interface.CorpusType.SparvCSV,
        corpus_source="apa.txt",
        target_folder='./tests/output',
        corpus_tag='APA',
        transform_opts=TokensTransformOpts(
            only_alphabetic=False,
            only_any_alphanumeric=False,
            to_lower=True,
            to_upper=False,
            min_len=1,
            max_len=None,
            remove_accents=False,
            remove_stopwords=True,
            stopwords=None,
            extra_stopwords=['örn'],
            language='swedish',
            keep_numerals=True,
            keep_symbols=True,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_filter=None,
            filename_fields=[
                'year:prot\\_(\\d{4}).*',
                'year2:prot_\\d{4}(\\d{2})__*',
                'number:prot_\\d+[afk_]{0,4}__(\\d+).*',
            ],
            index_field=None,
            as_binary=False,
            sep='\t',
            quoting=3,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            lemmatize=True,
            target_override=None,
            pos_includes="NN",
            pos_excludes=None,
            pos_paddings="MID|MAD|PAD",
            passthrough_tokens=[],
            block_tokens=[],
            append_pos=False,
            global_tf_threshold=1,
            global_tf_threshold_mask=False,
            **SPARV_TAGGED_COLUMNS,
        ),
        vectorize_opts=VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            stop_words=None,
            max_df=1.0,
            min_df=1,
            min_tf=1,
            max_tokens=None,
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        create_subfolder=True,
        persist=True,
        context_opts=ContextOpts(
            context_width=1,
            concept=["apa"],
            ignore_concept=False,
            partition_keys=['document_id'],
        ),
        enable_checkpoint=True,
        force_checkpoint=False,
    )
    cli_command: str = compute_opts.command_line("apa")
    assert cli_command is not None

def run_workflow():
    corpus_config = pipeline.CorpusConfig.load(CONFIG_FILENAME).folders(DATA_FOLDER)
    corpus_config.pipeline_payload.files(source=CORPUS_FILENAME, document_index_source=None)
    corpus_config.checkpoint_opts.deserialize_processes = 4

    compute_opts = ComputeOpts(
        corpus_type=pipeline.CorpusType.SparvCSV,
        corpus_source=CORPUS_FILENAME,
        target_folder=jj(OUTPUT_FOLDER, 'APA'),
        corpus_tag='APA',
        transform_opts=corpora.TokensTransformOpts(
            to_lower=True,
            to_upper=False,
            min_len=1,
            max_len=None,
            remove_accents=False,
            remove_stopwords=False,
            stopwords=None,
            extra_stopwords=None,
            language='swedish',
            keep_numerals=True,
            keep_symbols=True,
            only_alphabetic=False,
            only_any_alphanumeric=False,
        ),
        text_reader_opts=corpora.TextReaderOpts(
            filename_pattern='*.csv',
            filename_filter=None,
            filename_fields=[
                'year:prot\\_(\\d{4}).*',
                'year2:prot_\\d{4}(\\d{2})__*',
                'number:prot_\\d+[afk_]{0,4}__(\\d+).*',
            ],
            index_field=None,
            as_binary=False,
            sep='\t',
            quoting=3,
        ),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            pos_includes='NN|PM',
            pos_excludes='MAD|MID|PAD',
            pos_paddings='AB|DT|HA|HD|HP|HS|IE|IN|JJ|KN|PC|PL|PN|PP|PS|RG|RO|SN|UO|VB',
            lemmatize=True,
            append_pos=False,
            global_tf_threshold=1,
            global_tf_threshold_mask=False,
            **corpus_config.pipeline_payload.tagged_columns_names,
        ),
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            stop_words=None,
            max_df=1.0,
            min_df=1,
            min_tf=1,
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        create_subfolder=True,
        persist=True,
        context_opts=ContextOpts(
            context_width=2,
            concept=set(['kammare']),
            ignore_concept=False,
            partition_keys=['document_name'],
            processes=4,
            chunksize=10,
        ),
        enable_checkpoint=False,
        force_checkpoint=False,
    )

    _ = workflow.compute(
        args=compute_opts,
        corpus_config=corpus_config,
        tagged_corpus_source=jj(OUTPUT_FOLDER, 'test.zip'),
    )

def ComputeOptsSpacyCSV(
    *,
    corpus_tag: str = 'MARS',
    corpus_source: str = './tests/test_data/legal_instrument_five_docs_test.zip',
) -> ComputeOpts:  # pylint: disable=too-many-instance-attributes
    return ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder="./tests/output",
        corpus_type=CorpusType.SpacyCSV,
        # pos_scheme: PoS_Tag_Scheme = PoS_Tag_Schemes.Universal
        transform_opts=TokensTransformOpts(
            extra_stopwords=[],
            keep_numerals=True,
            keep_symbols=True,
            language='english',
            max_len=None,
            min_len=1,
            only_alphabetic=False,
            only_any_alphanumeric=False,
            remove_accents=False,
            remove_stopwords=True,
            stopwords=None,
            to_lower=True,
            to_upper=False,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_fields=['year:_:1'],
            index_field=None,  # use filename
            as_binary=False,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            lemmatize=True,
            target_override=None,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_paddings=None,
            pos_excludes='|PUNCT|EOL|SPACE|',
            passthrough_tokens=[],
            block_tokens=[],
            append_pos=False,
            global_tf_threshold=1,
            global_tf_threshold_mask=False,
            **SPACY_TAGGED_COLUMNS,
            filter_opts=dict(
                is_alpha=False,
                is_punct=False,
                is_digit=None,
                is_stop=None,
                is_space=False,
            ),
        ),
        create_subfolder=False,
        persist=True,
        context_opts=ContextOpts(
            context_width=4,
            concept=set(),
            ignore_concept=False,
            partition_keys=['document_id'],
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            max_df=1.0,
            min_df=1,
            min_tf=1,
            max_tokens=None,
        ),
    )

def process_co_ocurrence(
    corpus_config: str = None,
    input_filename: str = None,
    output_filename: str = None,
    filename_pattern: str = None,
    concept: List[str] = None,
    ignore_concept: bool = False,
    ignore_padding: bool = False,
    context_width: int = None,
    compute_processes: int = None,
    compute_chunk_size: int = 10,
    partition_key: Sequence[str] = None,
    phrase: Sequence[str] = None,
    phrase_file: str = None,
    create_subfolder: bool = True,
    pos_includes: str = None,
    pos_paddings: str = None,
    pos_excludes: str = None,
    append_pos: bool = False,
    to_lower: bool = True,
    lemmatize: bool = True,
    remove_stopwords: str = None,
    min_word_length: int = 2,
    max_word_length: int = None,
    keep_symbols: bool = False,
    keep_numerals: bool = False,
    only_any_alphanumeric: bool = False,
    only_alphabetic: bool = False,
    tf_threshold: int = 1,
    tf_threshold_mask: bool = False,
    enable_checkpoint: bool = True,
    force_checkpoint: bool = False,
    deserialize_processes: int = 4,
):
    try:
        output_folder, output_tag = to_folder_and_tag(output_filename)
        corpus_config: CorpusConfig = CorpusConfig.load(corpus_config)
        phrases = parse_phrases(phrase_file, phrase)

        if pos_excludes is None:
            pos_excludes = pos_tags_to_str(corpus_config.pos_schema.Delimiter)

        # guard against pos_paddings being None (its default value)
        if pos_paddings and pos_paddings.upper() in ["FULL", "ALL", "PASSTHROUGH"]:
            pos_paddings = pos_tags_to_str(corpus_config.pos_schema.all_types_except(pos_includes))
            logger.info(f"PoS paddings expanded to: {pos_paddings}")

        text_reader_opts: TextReaderOpts = corpus_config.text_reader_opts.copy()

        if filename_pattern is not None:
            text_reader_opts.filename_pattern = filename_pattern

        corpus_config.checkpoint_opts.deserialize_processes = max(1, deserialize_processes)

        tagged_columns: dict = corpus_config.pipeline_payload.tagged_columns_names

        args: interface.ComputeOpts = interface.ComputeOpts(
            corpus_type=corpus_config.corpus_type,
            corpus_source=input_filename,
            target_folder=output_folder,
            corpus_tag=output_tag,
            transform_opts=TokensTransformOpts(
                to_lower=to_lower,
                to_upper=False,
                min_len=min_word_length,
                max_len=max_word_length,
                remove_accents=False,
                remove_stopwords=(remove_stopwords is not None),
                stopwords=None,
                extra_stopwords=None,
                language=remove_stopwords,
                keep_numerals=keep_numerals,
                keep_symbols=keep_symbols,
                only_alphabetic=only_alphabetic,
                only_any_alphanumeric=only_any_alphanumeric,
            ),
            text_reader_opts=text_reader_opts,
            extract_opts=ExtractTaggedTokensOpts(
                pos_includes=pos_includes,
                pos_paddings=pos_paddings,
                pos_excludes=pos_excludes,
                lemmatize=lemmatize,
                phrases=phrases,
                append_pos=append_pos,
                global_tf_threshold=tf_threshold,
                global_tf_threshold_mask=tf_threshold_mask,
                **tagged_columns,
            ),
            vectorize_opts=VectorizeOpts(already_tokenized=True, max_tokens=None),
            tf_threshold=tf_threshold,
            tf_threshold_mask=tf_threshold_mask,
            create_subfolder=create_subfolder,
            persist=True,
            context_opts=ContextOpts(
                context_width=context_width,
                concept=set(concept or []),
                ignore_concept=ignore_concept,
                ignore_padding=ignore_padding,
                partition_keys=partition_key,
                processes=compute_processes,
                chunksize=compute_chunk_size,
            ),
            enable_checkpoint=enable_checkpoint,
            force_checkpoint=force_checkpoint,
        )

        workflow.compute(args=args, corpus_config=corpus_config)

        logger.info('Done!')

    except Exception as ex:  # pylint: disable=try-except-raise, unused-variable
        logger.exception(ex)
        click.echo(ex)
        # sys.exit(1)
        raise

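# Hypothetical invocation of the processing function above; every path and option value is
# illustrative only. Note that the output folder and tag are derived from output_filename
# via to_folder_and_tag.
def example_process_co_ocurrence_call():
    process_co_ocurrence(
        corpus_config='./tests/test_data/SSI.yml',
        input_filename='./tests/test_data/legal_instrument_five_docs_test.zip',
        output_filename='./tests/output/MARS/MARS_co-occurrence.csv.zip',
        concept=['culture'],
        context_width=2,
        partition_key=['document_id'],
        pos_includes='|NOUN|VERB|',
        pos_paddings='PASSTHROUGH',
        lemmatize=True,
    )
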
def test_step_by_step_compress_with_simple_corpus():
    context_opts: ContextOpts = ContextOpts(
        concept={'d'}, ignore_concept=False, context_width=1, ignore_padding=False
    )
    bundle: Bundle = create_simple_bundle_by_pipeline(
        data=SIMPLE_CORPUS_ABCDE_3DOCS, context_opts=context_opts, compress=False
    )

    token2id = dict(bundle.token2id.data)
    assert token2id == {'*': 0, 'd': 1, '__low-tf__': 2, 'a': 3, 'b': 4, 'c': 5, 'e': 6}

    windows = [
        [
            [bundle.token2id.id2token[x] for x in window]
            for window in generate_windows(
                token_ids=[bundle.token2id[t] for t in tokens],
                context_width=context_opts.context_width,
                pad_id=bundle.token2id[context_opts.pad],
                ignore_pads=False,
            )
        ]
        for _, tokens in SIMPLE_CORPUS_ABCDE_3DOCS
    ]
    assert windows == [
        [
            ['*', 'a', 'b'],
            ['a', 'b', 'c'],
            ['b', 'c', 'c'],
            ['c', 'c', 'd'],
            ['c', 'd', 'c'],
            ['d', 'c', 'e'],
            ['c', 'e', '*'],
        ],
        [
            ['*', 'a', 'a'],
            ['a', 'a', 'c'],
            ['a', 'c', 'e'],
            ['c', 'e', 'c'],
            ['e', 'c', 'd'],
            ['c', 'd', 'd'],
            ['d', 'd', '*'],
        ],
        [
            ['*', 'd', 'e'],
            ['d', 'e', 'e'],
            ['e', 'e', 'b'],
            ['e', 'b', '*'],
        ],
    ]

    concept_windows = [[window for window in document if 'd' in window] for document in windows]
    assert concept_windows == [
        [
            ['c', 'c', 'd'],
            ['c', 'd', 'c'],
            ['d', 'c', 'e'],
        ],
        [
            ['e', 'c', 'd'],
            ['c', 'd', 'd'],
            ['d', 'd', '*'],
        ],
        [
            ['*', 'd', 'e'],
            ['d', 'e', 'e'],
        ],
    ]

    co_occurrence_dtm = bundle.concept_corpus.data.todense()
    assert (
        (
            co_occurrence_dtm
            == np.matrix(
                [
                    [0, 0, 0, 0, 5, 0, 0, 0, 1, 1, 0, 0, 0],
                    [0, 0, 0, 0, 3, 0, 0, 0, 1, 1, 2, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 1, 0, 0],
                ],
                dtype=np.int32,
            )
        )
        .all()
        .all()
    )

    # id2pair = bundle.corpus.id2token
    # co_occurrence_dtm_frame = pd.DataFrame(co_occurrence_dtm, columns=[id2pair[i] for i in range(0, len(id2pair))])
    # assert co_occurrence_dtm_frame is not None
    """
    print(co_occurrence_dtm_frame)

            0    1    2    3    4    5    6    7    8    9   10   11   12
          */a  a/b  */b  */c  d/c  b/c  a/c  */e  c/e  d/e  */d  a/e  b/e
         ----------------------------------------------------------------
    0       0    0    0    0    5    0    0    0    1    1    0    0    0
    1       0    0    0    0    3    0    0    0    1    1    2    0    0
    2       0    0    0    0    0    0    0    1    0    3    1    0    0
         ----------------------------------------------------------------
    SUM     0    0    0    0    8    0    0    1    2    5    3    0    0
    """

    """Compress concept corpus (inlined code)"""
    concept_corpus: VectorizedCorpus = bundle.concept_corpus

    # _, ids_translation, kept_pair_ids = concept_corpus.compress(tf_threshold=1, inplace=True)
    extra_keep_ids = []
    keep_ids = concept_corpus.term_frequencies_greater_than_or_equal_to_threshold(1, keep_indices=extra_keep_ids)
    assert keep_ids.tolist() == [4, 7, 8, 9, 10]

    extra_keep_ids = [1]
    keep_ids = concept_corpus.term_frequencies_greater_than_or_equal_to_threshold(1, keep_indices=extra_keep_ids)
    assert keep_ids.tolist() == [1, 4, 7, 8, 9, 10]

    ids_translation = {old_id: new_id for new_id, old_id in enumerate(keep_ids)}
    assert ids_translation == {1: 0, 4: 1, 7: 2, 8: 3, 9: 4, 10: 5}

    concept_corpus.slice_by_indices(keep_ids, inplace=True)
    assert concept_corpus.token2id == {'a/b': 0, 'd/c': 1, '*/e': 2, 'c/e': 3, 'd/e': 4, '*/d': 5}

    #   1    4    7    8    9   10
    # a/b  d/c  */e  c/e  d/e  */d
    assert (
        (
            concept_corpus.data.todense()
            == np.matrix(
                [
                    #  0  1  2  3  4  5
                    [0, 5, 0, 1, 1, 0],
                    [0, 3, 0, 1, 1, 2],
                    [0, 0, 1, 0, 3, 1],
                ],
                dtype=np.int32,
            )
        )
        .all()
        .all()
    )
    assert concept_corpus.term_frequency.tolist() == [0, 8, 1, 2, 5, 3]
    assert concept_corpus.overridden_term_frequency is None

    """Slice full corpus"""
    corpus = bundle.corpus

    # pp(corpus.data.todense())
    assert (
        (
            corpus.data.todense()
            == np.matrix(
                [
                    #  0  1  2  3  4  5  6  7  8  9 10 11 12
                    [1, 2, 1, 1, 5, 3, 1, 1, 2, 1, 0, 0, 0],
                    [2, 0, 0, 0, 3, 0, 3, 0, 4, 1, 2, 1, 0],
                    [0, 0, 1, 0, 0, 0, 0, 2, 0, 3, 1, 0, 3],
                ],
                dtype=np.int32,
            )
        )
        .all()
        .all()
    )

    corpus.slice_by_indices(keep_ids, inplace=True)

    # pp(corpus.data.todense())
    assert (
        (
            corpus.data.todense()
            == np.matrix(
                [
                    #  1  4  7  8  9 10
                    # ----------------
                    #  0  1  2  3  4  5
                    # ----------------
                    [2, 5, 1, 2, 1, 0],
                    [0, 3, 0, 4, 1, 2],
                    [0, 0, 2, 0, 3, 1],
                    # ----------------
                    #  2  8  3  6  5  3
                ],
                dtype=np.int32,
            )
        )
        .all()
        .all()
    )
    assert corpus.token2id == {'a/b': 0, 'd/c': 1, '*/e': 2, 'c/e': 3, 'd/e': 4, '*/d': 5}
    assert corpus.term_frequency.tolist() == [2, 8, 3, 6, 5, 3]
    assert corpus.overridden_term_frequency is None

    """Update token count and token2id"""

    def _token_ids_to_keep(kept_pair_ids: Set[int]) -> List[int]:
        """Returns the sorted token IDs that the given co-occurrence pair IDs correspond to"""
        token_ids_in_kept_pairs: Set[int] = set(
            flatten((k for k, pair_id in bundle.token_ids_2_pair_id.items() if pair_id in kept_pair_ids))
        )
        kept_token_ids: List[int] = sorted(
            list(token_ids_in_kept_pairs.union(set(bundle.token2id.magic_token_ids)))
        )
        return kept_token_ids

    """Inlined calls"""
    token_ids_in_kept_pairs: Set[int] = set(
        flatten((k for k, pair_id in bundle.token_ids_2_pair_id.items() if pair_id in keep_ids))
    )
    assert token_ids_in_kept_pairs == {0, 1, 3, 4, 5, 6}  # all except masked token

    kept_token_ids = sorted(list(token_ids_in_kept_pairs.union(set(bundle.token2id.magic_token_ids))))
    assert kept_token_ids == [0, 1, 2, 3, 4, 5, 6]

    """Equals function call"""
    assert kept_token_ids == _token_ids_to_keep(set(keep_ids))

    assert (
        (
            corpus.window_counts.document_term_window_counts.todense()
            == np.matrix(
                [
                    #  *  -  a  b  c  d  e
                    [2, 3, 0, 2, 3, 6, 2],
                    [2, 3, 0, 3, 0, 5, 3],
                    [2, 2, 0, 0, 2, 0, 4],
                ],
                dtype=np.int32,
            )
        )
        .all()
        .all()
    )

    corpus.window_counts.slice(kept_token_ids, inplace=True)

    """Nothing is changed since all original tokens are kept"""
    assert corpus.window_counts.document_term_window_counts.shape == (3, 7)

    """Simulate removed token `b`"""
    wc: TokenWindowCountMatrix = corpus.window_counts.slice([x for x in kept_token_ids if x != 3], inplace=False)
    assert (
        (
            wc.document_term_window_counts.todense()
            == np.matrix(
                [
                    [2, 3, 0, 3, 6, 2],
                    [2, 3, 0, 0, 5, 3],
                    [2, 2, 0, 2, 0, 4],
                ],
                dtype=np.int32,
            )
        )
        .all()
        .all()
    )

    wc: TokenWindowCountMatrix = corpus.window_counts
    wc: TokenWindowCountMatrix = concept_corpus.window_counts
    assert (
        (
            wc.slice(kept_token_ids, inplace=False).document_term_window_counts.todense()
            == np.matrix(
                [
                    [0, 3, 0, 0, 0, 3, 1],
                    [1, 3, 0, 0, 0, 2, 1],
                    [1, 2, 0, 0, 0, 0, 2],
                ],
                dtype=np.int32,
            )
        )
        .all()
        .all()
    )

    assert ids_translation == {1: 0, 4: 1, 7: 2, 8: 3, 9: 4, 10: 5}

    translated_token2id = bundle.token2id.translate(ids_translation, inplace=False)
    assert translated_token2id is not None

    bundle._token_ids_2_pair_id = {
        pair: pair_id for pair, pair_id in bundle._token_ids_2_pair_id.items() if pair_id in ids_translation
    }

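# Both slice_by_indices on the DTM and window_counts.slice above boil down to keeping a
# subset of columns. A minimal scipy sketch of that idea (helper name is illustrative,
# not the library's implementation):
def slice_columns_sketch(matrix, kept_ids):
    """Return a copy of `matrix` keeping only the columns listed in `kept_ids`."""
    import scipy.sparse as sp  # local import keeps the sketch self-contained

    return sp.csr_matrix(matrix)[:, list(kept_ids)]
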
def test_tasks_pool_tokens_to_ttm_step_by_step():
    # Arrange
    context_opts: ContextOpts = ContextOpts(
        concept={'d'},
        ignore_concept=False,
        ignore_padding=False,
        context_width=1,
        processes=None,
    )
    corpus: TokenizedCorpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_3DOCS)

    pad_id = len(corpus.token2id)
    corpus.token2id[context_opts.pad] = pad_id

    token2id: dict = corpus.token2id
    id2token: dict = corpus.id2token

    filename, tokens = next(corpus)

    # doc_info = corpus.document_index[corpus.document_index.filename == filename].to_dict('records')[0]
    doc_info = faster_to_dict_records(corpus.document_index[corpus.document_index.filename == filename])[0]
    document_id = doc_info['document_id']

    token_ids = [token2id[t] for t in tokens]
    concept_ids = {token2id[x] for x in context_opts.concept}

    # Act
    windows: Iterable[Iterable[int]] = generate_windows(
        token_ids=token_ids,
        context_width=context_opts.context_width,
        pad_id=pad_id,
        ignore_pads=context_opts.ignore_padding,
    )

    # Assert
    windows = [w for w in windows]
    assert windows == [[5, 0, 1], [0, 1, 2], [1, 2, 2], [2, 2, 3], [2, 3, 2], [3, 2, 4], [2, 4, 5]]
    assert [[id2token[i] for i in w] for w in windows] == [
        ['*', 'a', 'b'],
        ['a', 'b', 'c'],
        ['b', 'c', 'c'],
        ['c', 'c', 'd'],
        ['c', 'd', 'c'],
        ['d', 'c', 'e'],
        ['c', 'e', '*'],
    ]

    # ['a', 'b', 'c', 'c', 'd', 'c', 'e']
    ttm_map: Mapping[VectorizeType, VectorizedTTM] = windows_to_ttm(
        document_id=document_id,
        windows=windows,
        concept_ids=concept_ids,
        ignore_ids=set(),
        vocab_size=len(token2id),
    )

    expected_normal_ttm = [
        #  a  b  c  d  e  *
        [0, 2, 1, 0, 0, 1],  # a
        [0, 0, 3, 0, 0, 1],  # b
        [0, 0, 0, 5, 2, 1],  # c
        [0, 0, 0, 0, 1, 0],  # d
        [0, 0, 0, 0, 0, 1],  # e
        [0, 0, 0, 0, 0, 0],  # *
    ]
    assert (ttm_map[VectorizeType.Normal].term_term_matrix.todense() == expected_normal_ttm).all()

    expected_concept_ttm = [
        #  a  b  c  d  e  *
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 5, 1, 0],
        [0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
    ]
    assert (ttm_map[VectorizeType.Concept].term_term_matrix.todense() == expected_concept_ttm).all()

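# Sketch of the co-occurrence counting that the expected matrices above encode: build a
# windows-by-terms count matrix, then take the upper triangle (diagonal excluded) of its
# Gram matrix. Helper name is illustrative; the real windows_to_ttm additionally splits
# counts by VectorizeType (Normal vs Concept windows).
def windows_to_ttm_sketch(windows, vocab_size: int):
    """Return an upper-triangular term-term co-occurrence matrix for the given windows."""
    import numpy as np
    import scipy.sparse as sp

    window_term_counts = np.zeros((len(windows), vocab_size), dtype=np.int32)
    for row, window in enumerate(windows):
        for token_id in window:
            window_term_counts[row, token_id] += 1
    gram = window_term_counts.T @ window_term_counts
    return sp.triu(sp.csr_matrix(gram), k=1)


# For the seven windows asserted above this reproduces expected_normal_ttm.
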
def test_pipeline_to_co_occurrence_can_create_co_occurrence_bundle():
    context_opts: ContextOpts = ContextOpts(
        context_width=2, concept={}, ignore_concept=False, ignore_padding=False, processes=None
    )
    tokenized_corpus: TokenizedCorpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)
    config: CorpusConfig = CorpusConfig(
        corpus_name=uuid.uuid1(),
        corpus_type=CorpusType.Tokenized,
        corpus_pattern=None,
        checkpoint_opts=None,
        text_reader_opts=None,
        pipelines=None,
        pipeline_payload=PipelinePayload(),
        language="swedish",
    )

    """Expected windows generated for corpus"""
    # print({k: [x for x in generate_windows(tokens=tokens, context_opts=context_opts)] for k, tokens in expected_tokens.items()})
    document_windows = {
        'tran_2019_01_test.txt': [
            ['*', '*', 'a', 'b', 'c'],
            ['*', 'a', 'b', 'c', 'c'],
            ['a', 'b', 'c', 'c', '*'],
            ['b', 'c', 'c', '*', '*'],
        ],
        'tran_2019_02_test.txt': [
            ['*', '*', 'a', 'a', 'b'],
            ['*', 'a', 'a', 'b', 'd'],
            ['a', 'a', 'b', 'd', '*'],
            ['a', 'b', 'd', '*', '*'],
        ],
        'tran_2019_03_test.txt': [
            ['*', '*', 'a', 'e', 'e'],
            ['*', 'a', 'e', 'e', 'b'],
            ['a', 'e', 'e', 'b', '*'],
            ['e', 'e', 'b', '*', '*'],
        ],
        'tran_2020_01_test.txt': [
            ['*', '*', 'c', 'c', 'd'],
            ['*', 'c', 'c', 'd', 'a'],
            ['c', 'c', 'd', 'a', '*'],
            ['c', 'd', 'a', '*', '*'],
        ],
        'tran_2020_02_test.txt': [
            ['*', '*', 'a', 'b', 'b'],
            ['*', 'a', 'b', 'b', 'e'],
            ['a', 'b', 'b', 'e', '*'],
            ['b', 'b', 'e', '*', '*'],
        ],
    }

    """Expected co-occurrences from windows above"""
    expected_TTMs = {filename: simple_co_occurrence(document_windows[filename]) for filename in document_windows}

    def verify_tokens_payload(
        p: CorpusPipeline, payload: DocumentPayload, *_  # pylint: disable=unused-argument
    ) -> bool:
        # expected_tokens: dict = {k: v for k, v in SIMPLE_CORPUS_ABCDE_5DOCS}
        expected_tokens: dict = {
            'tran_2019_01_test.txt': ['a', 'b', 'c', 'c'],
            'tran_2019_02_test.txt': ['a', 'a', 'b', 'd'],
            'tran_2019_03_test.txt': ['a', 'e', 'e', 'b'],
            'tran_2020_01_test.txt': ['c', 'c', 'd', 'a'],
            'tran_2020_02_test.txt': ['a', 'b', 'b', 'e'],
        }
        return payload.content == expected_tokens.get(payload.filename)

    def verify_expected_vocabulary(p: CorpusPipeline, *_) -> bool:
        return list(p.payload.token2id.keys()) == ['*', '__low-tf__', 'a', 'b', 'c', 'd', 'e']

    def verify_co_occurrence_document_TTM_payload(
        p: CorpusPipeline, payload: DocumentPayload, *_
    ) -> bool:  # pylint: disable=unused-argument
        fg = p.payload.token2id.id2token.get
        assert isinstance(payload.content, CoOccurrencePayload)
        TTM: sp.spmatrix = payload.content.ttm_data_map.get(VectorizeType.Normal).term_term_matrix.tocoo()
        document_TTM_data = {(fg(TTM.row[i]), fg(TTM.col[i])): TTM.data[i] for i in range(0, len(TTM.data))}
        assert expected_TTMs[payload.filename] == document_TTM_data
        return True

    bundle: Bundle = (
        CorpusPipeline(config=config)
        .load_corpus(tokenized_corpus)
        .assert_on_payload(payload_test=verify_tokens_payload)
        .vocabulary(lemmatize=True)
        .assert_on_exit(exit_test=verify_expected_vocabulary)
        .to_document_co_occurrence(context_opts=context_opts)
        .assert_on_payload(payload_test=verify_co_occurrence_document_TTM_payload)
        .to_corpus_co_occurrence(context_opts=context_opts, global_threshold_count=1)
        .single()
        .content
    )

    for filename in expected_TTMs:
        document_id = int(bundle.document_index[bundle.document_index.filename == filename].document_id)
        for (i, j), ij in bundle.token_ids_2_pair_id.items():
            pair = (bundle.token2id.id2token[i], bundle.token2id.id2token[j])
            if pair in expected_TTMs[filename]:
                assert bundle.corpus.data[document_id, ij] == expected_TTMs[filename][pair]
            else:
                assert bundle.corpus.data[document_id, ij] == 0

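# The fixture helper simple_co_occurrence is not shown in this module; a plausible sketch,
# assuming it mirrors the pairwise window counting verified in the step-by-step test above
# (key ordering and treatment of the pad token are assumptions):
def simple_co_occurrence_sketch(windows) -> dict:
    """Count token pairs per window by multiplying within-window counts; illustrative only."""
    from collections import Counter
    from itertools import combinations

    pair_counts: Counter = Counter()
    for window in windows:
        counts = Counter(window)
        for a, b in combinations(sorted(counts), 2):
            pair_counts[(a, b)] += counts[a] * counts[b]
    return dict(pair_counts)
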
import os
from typing import Iterable

import pytest
import scipy.sparse as sp

from penelope.co_occurrence import Bundle, ContextOpts, VectorizeType
from penelope.corpus import ClosedVocabularyError, TokenizedCorpus, VectorizedCorpus
from penelope.pipeline import CorpusConfig, CorpusPipeline, DocumentPayload, PipelinePayload
from penelope.pipeline.co_occurrence.tasks import CoOccurrencePayload
from penelope.pipeline.config import CorpusType

from ..fixtures import SIMPLE_CORPUS_ABCDE_5DOCS, very_simple_corpus

jj = os.path.join

CONTEXT_OPTS: ContextOpts = ContextOpts(context_width=2, concept={}, ignore_concept=False, ignore_padding=False)


@pytest.mark.skip(reason="ingest is now prohibited if vocabulary is closed")
def test_pipeline_to_co_occurrence_ingest_prohobited_if_vocabulary_exists():
    tokenized_corpus: TokenizedCorpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)
    config: CorpusConfig = CorpusConfig.tokenized_corpus_config()

    with pytest.raises(ClosedVocabularyError):
        _: Bundle = (
            CorpusPipeline(config=config)
            .load_corpus(tokenized_corpus)
            .vocabulary(lemmatize=False)
            .to_document_co_occurrence(context_opts=CONTEXT_OPTS)
            .to_corpus_co_occurrence(context_opts=CONTEXT_OPTS, global_threshold_count=1)
        )