def test_spacy_pipeline_load_text_to_spacy_doc_resolves(en_nlp):
    pytest.importorskip("spacy")
    reader_opts = TextReaderOpts(filename_pattern="*.txt", filename_fields="year:_:1")
    source = dummy_source()
    config = Mock(
        spec=CorpusConfig,
        pipeline_payload=PipelinePayload(source=source).put2(pos_column="pos_"),
    )
    pipeline = (
        CorpusPipeline(config=config)
        .set_spacy_model(en_nlp)
        .load_text(reader_opts=reader_opts)
        .text_to_spacy()
    )
    payloads = [x.content for x in pipeline.resolve()]
    assert all(isinstance(x, spacy_api.Doc) for x in payloads)

def test_spacy_pipeline_load_text_resolves():
    reader_opts = TextReaderOpts(filename_pattern="*.txt", filename_fields="year:_:1")
    source = dummy_source()
    config = Mock(spec=CorpusConfig, pipeline_payload=PipelinePayload(source=source))
    pipeline = CorpusPipeline(config=config).load_text(reader_opts=reader_opts)
    payloads = [x.content for x in pipeline.resolve()]
    assert payloads == [x[1] for x in source]
    assert len(pipeline.payload.document_index) == len(source)
    assert all(pipeline.payload.document_index.filename == [x[0] for x in source])

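# The two tests above rely on a `dummy_source` helper that is not defined in this
# module. A minimal sketch, assuming it returns (filename, text) pairs, as the
# assertions in test_spacy_pipeline_load_text_resolves imply; filenames here are
# hypothetical but follow the "year:_:1" filename-field convention used above:
def dummy_source_sketch() -> List[Tuple[str, str]]:
    """Hypothetical in-memory source: a list of (filename, text) tuples."""
    return [
        ('mary_2019_01.txt', "Mary had a little lamb."),
        ('mary_2020_01.txt', "Its fleece was white as snow."),
    ]
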
def probe_checkpoint_document_index(pipe: pipeline.CorpusPipeline) -> Optional[pd.DataFrame]:
    with contextlib.suppress(Exception):
        task: tasks.CheckpointFeather = pipe.find(tasks.CheckpointFeather)
        if task:
            return cp.feather.read_document_index(task.folder)
    with contextlib.suppress(Exception):
        task = pipe.find(tasks.LoadTaggedCSV)
        if task:
            return cp.feather.read_document_index(task.checkpoint_opts.feather_folder)
    return None

def create_simple_bundle_by_pipeline(
    data: Union[TokenizedCorpus, List[Tuple[str, List[str]]]],
    context_opts: ContextOpts,
    tag: str = "TERRA",
    folder: str = None,
    compress: bool = False,
) -> Bundle:
    folder = folder or OUTPUT_FOLDER
    if folder.startswith('./tests') and folder != OUTPUT_FOLDER:
        shutil.rmtree(folder, ignore_errors=True)
    if not isinstance(data, TokenizedCorpus):
        data = very_simple_corpus(data)
    config: CorpusConfig = CorpusConfig.tokenized_corpus_config()
    bundle: Bundle = (
        CorpusPipeline(config=config)
        .load_corpus(data)
        .vocabulary(lemmatize=False)
        .to_document_co_occurrence(context_opts=context_opts)
        .to_corpus_co_occurrence(context_opts=context_opts, global_threshold_count=1, compress=compress)
        .single()
        .content
    )
    bundle.folder = folder
    bundle.tag = tag
    return bundle

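# A usage sketch of the helper above. SIMPLE_CORPUS_ABCDE_5DOCS and CONTEXT_OPTS
# come from the co-occurrence tests further down; the function name and output
# folder here are hypothetical examples:
def example_create_simple_bundle() -> Bundle:
    return create_simple_bundle_by_pipeline(
        data=SIMPLE_CORPUS_ABCDE_5DOCS,
        context_opts=CONTEXT_OPTS,
        tag="EXAMPLE",
        folder='./tests/output/example_bundle',
    )
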
def test_spacy_pipeline_load_checkpoint_archive(checkpoint_opts: CheckpointOpts):
    tagged_corpus_source = os.path.join(TEST_DATA_FOLDER, "checkpoint_mary_lamb_pos_csv.zip")
    pipeline_payload = PipelinePayload(
        source=TEST_CORPUS,
        document_index_source=None,
        pos_schema_name="Universal",
        memory_store={'spacy_model': "en_core_web_sm", 'nlp': None, 'lang': 'en'},
    )
    config = MagicMock(spec=CorpusConfig, pipeline_payload=pipeline_payload)
    pipeline = (
        CorpusPipeline(config=config)
        .checkpoint(tagged_corpus_source, checkpoint_opts=checkpoint_opts, force_checkpoint=False)
        .to_content()
    )
    df_docs = pipeline.resolve()
    assert next(df_docs) is not None

def test_to_spacy_doc_to_tagged_frame(test_payload):
    payload = DocumentPayload(content_type=ContentType.TEXT, filename='hello.txt', content=SAMPLE_TEXT)
    config: CorpusConfig = CorpusConfig.load('./tests/test_data/SSI.yml')
    pipeline: CorpusPipeline = CorpusPipeline(config=config, tasks=[], payload=payload).setup()
    prior = MagicMock(spec=ITask, outstream=lambda: [payload])
    task = ToSpacyDocToTaggedFrame(pipeline=pipeline, prior=prior, attributes=POS_ATTRIBUTES)
    task.register_pos_counts = lambda p: p
    _ = patch_spacy_pipeline(test_payload).add([SetSpacyModel(name_or_nlp="en_core_web_sm"), task]).setup()
    payload_next = task.process_payload(payload)
    assert payload_next.content_type == ContentType.TAGGED_FRAME

def test_predict_topics(method: str):
    # Train a model that will be used in prediction
    target_folder: str = './tests/output'
    train_target_name: str = f'train_{str(uuid.uuid1())[:8]}'
    payload: DocumentPayload = tranströmer_topic_model_payload(
        method=method, target_folder=target_folder, target_name=train_target_name
    )
    model_folder: str = os.path.join(payload.content.get("target_folder"), payload.content.get("target_name"))

    # Predict using the trained model
    config: CorpusConfig = CorpusConfig.load('./tests/test_data/tranströmer.yml')
    corpus_source: str = './tests/test_data/tranströmer_corpus_pos_csv.zip'
    minimum_probability: float = 0.001
    n_tokens: int = 100
    predict_target_name: str = f'predict_{str(uuid.uuid1())[:8]}'
    transform_opts = TokensTransformOpts()
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='',
        pos_excludes='MAD|MID|PAD',
        **config.checkpoint_opts.tagged_columns,
    )
    vectorize_opts: VectorizeOpts = VectorizeOpts(already_tokenized=True)
    payload = (
        CorpusPipeline(config=config)
        .load_tagged_frame(
            filename=corpus_source,
            checkpoint_opts=config.checkpoint_opts,
            extra_reader_opts=config.text_reader_opts,
        )
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=transform_opts)
        .to_dtm(vectorize_opts=vectorize_opts)
        .predict_topics(
            model_folder=model_folder,
            target_folder=target_folder,
            target_name=predict_target_name,
            minimum_probability=minimum_probability,
            n_tokens=n_tokens,
        )
    ).single()

    assert payload is not None

    model_infos = find_models('./tests/output')
    assert any(m['name'] == predict_target_name for m in model_infos)
    model_info = next(m for m in model_infos if m['name'] == predict_target_name)
    assert 'method' in model_info['options']

def test_pipeline_can_be_saved_in_feather(config: CorpusConfig):
    tagged_corpus_source: str = os.path.join(CORPUS_FOLDER, 'legal_instrument_five_docs_test_pos_csv.zip')
    pipeline = CorpusPipeline(config=config).checkpoint(tagged_corpus_source, force_checkpoint=False)
    for payload in pipeline.resolve():
        tagged_frame: pd.DataFrame = payload.content
        filename = os.path.join(OUTPUT_FOLDER, replace_extension(payload.filename, ".feather"))
        tagged_frame.reset_index(drop=True).to_feather(filename, compression="lz4")
        assert os.path.isfile(filename)
        loaded_frame = pd.read_feather(filename)
        assert loaded_frame is not None

def test_pipeline_to_co_occurrence_ingest_prohibited_if_vocabulary_exists():
    tokenized_corpus: TokenizedCorpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)
    config: CorpusConfig = CorpusConfig.tokenized_corpus_config()
    with pytest.raises(ClosedVocabularyError):
        _: Bundle = (
            CorpusPipeline(config=config)
            .load_corpus(tokenized_corpus)
            .vocabulary(lemmatize=False)
            .to_document_co_occurrence(context_opts=CONTEXT_OPTS)
            .to_corpus_co_occurrence(context_opts=CONTEXT_OPTS, global_threshold_count=1)
            .single()
            .content
        )

def test_pipeline_to_co_occurrence_can_create_new_vocabulary():
    tokenized_corpus: TokenizedCorpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)
    config: CorpusConfig = CorpusConfig.tokenized_corpus_config()
    bundle: Bundle = (
        CorpusPipeline(config=config)
        .load_corpus(tokenized_corpus)
        .vocabulary(lemmatize=False)
        .to_document_co_occurrence(context_opts=CONTEXT_OPTS)
        .to_corpus_co_occurrence(context_opts=CONTEXT_OPTS, global_threshold_count=1)
        .single()
        .content
    )
    assert isinstance(bundle, Bundle)
    corpus: VectorizedCorpus = bundle.corpus
    assert isinstance(corpus, VectorizedCorpus)

def test_spacy_pipeline_load_text_to_spacy_to_dataframe_resolves(en_nlp):
    pytest.importorskip("spacy")
    reader_opts = TextReaderOpts(filename_pattern="*.txt", filename_fields="year:_:1")
    reader = TextReader.create(MARY_TEST_CORPUS, reader_opts=reader_opts)
    config = Mock(
        spec=CorpusConfig,
        pipeline_payload=PipelinePayload(source=reader).put2(pos_column="pos_"),
    )
    attributes = ['text', 'lemma_', 'pos_']
    pipeline = (
        CorpusPipeline(config=config)
        .set_spacy_model(en_nlp)
        .load_text(reader_opts=reader_opts)
        .text_to_spacy()
        .spacy_to_tagged_frame(attributes=attributes)
    )
    payloads = [x.content for x in pipeline.resolve()]
    assert all(isinstance(x, pd.DataFrame) for x in payloads)
    assert all(x.columns.tolist() == attributes for x in payloads)

def tranströmer_topic_model_payload(method: str, target_folder: str, target_name: str) -> DocumentPayload:
    transform_opts: TokensTransformOpts = TokensTransformOpts()
    extract_opts: ExtractTaggedTokensOpts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='',
        pos_excludes='MAD|MID|PAD',
        text_column='token',
        lemma_column='baseform',
        pos_column='pos',
    )
    default_engine_args: dict = {
        'n_topics': 4,
        'passes': 1,
        'random_seed': 42,
        'workers': 1,
        'max_iter': 100,
        'work_folder': os.path.join(target_folder, target_name),
    }
    config: CorpusConfig = CorpusConfig.load('./tests/test_data/tranströmer.yml')
    corpus_source: str = './tests/test_data/tranströmer_corpus_pos_csv.zip'
    p: CorpusPipeline = (
        CorpusPipeline(config=config)
        .load_tagged_frame(
            filename=corpus_source,
            checkpoint_opts=config.checkpoint_opts,
            extra_reader_opts=config.text_reader_opts,
        )
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=transform_opts)
        .to_dtm(VectorizeOpts(already_tokenized=True))
        .to_topic_model(
            target_mode='both',
            target_folder=target_folder,
            target_name=target_name,
            engine=method,
            engine_args=default_engine_args,
            store_corpus=True,
            store_compressed=True,
        )
    )
    payload: DocumentPayload = p.single()
    return payload

def test_spacy_pipeline_load_text_to_spacy_to_dataframe_to_tokens_to_text_to_dtm(en_nlp):
    pytest.importorskip("spacy")
    reader_opts = TextReaderOpts(filename_pattern="*.txt", filename_fields="year:_:1")
    text_transform_opts = TextTransformOpts()
    reader = TextReader.create(MARY_TEST_CORPUS, reader_opts=reader_opts, transform_opts=text_transform_opts)
    attributes = ['text', 'lemma_', 'pos_', 'is_punct']
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|VERB|NOUN|',
        pos_paddings=None,
        **SPACY_TAGGED_COLUMNS,
        filter_opts=dict(is_punct=False),
    )
    transform_opts = None
    vectorize_opts = VectorizeOpts()
    config = Mock(
        spec=CorpusConfig,
        pipeline_payload=PipelinePayload(source=reader).put2(**SPACY_TAGGED_COLUMNS),
    )
    pipeline = (
        CorpusPipeline(config=config)
        .load_text(reader_opts=reader_opts, transform_opts=text_transform_opts)
        .set_spacy_model(en_nlp)
        .text_to_spacy()
        .spacy_to_tagged_frame(attributes=attributes)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=transform_opts)
        .tokens_to_text()
        .to_dtm(vectorize_opts)
    )
    corpus = pipeline.value()
    assert corpus is not None
    assert isinstance(corpus, VectorizedCorpus)

def test_spacy_pipeline(checkpoint_opts: CheckpointOpts):
    tagged_corpus_source = os.path.join(TEST_OUTPUT_FOLDER, "checkpoint_mary_lamb_pos_csv.zip")
    pathlib.Path(tagged_corpus_source).unlink(missing_ok=True)
    text_reader_opts = TextReaderOpts(
        filename_fields=["doc_id:_:2", "year:_:1"],
        index_field=None,  # use filename
        filename_filter=None,
        filename_pattern="*.txt",
        as_binary=False,
    )
    pipeline_payload = PipelinePayload(
        source=TEST_CORPUS,
        document_index_source=None,
        pos_schema_name="Universal",
        memory_store={
            'spacy_model': "en_core_web_sm",
            'nlp': None,
            'lang': 'en',
            'pos_column': 'pos_',
        },
    )
    config = Mock(spec=CorpusConfig, pipeline_payload=pipeline_payload)
    pipeline = (
        CorpusPipeline(config=config)
        .set_spacy_model(pipeline_payload.memory_store['spacy_model'])
        .load_text(reader_opts=text_reader_opts, transform_opts=TextTransformOpts())
        .text_to_spacy()
        .passthrough()
        .spacy_to_pos_tagged_frame()
        .checkpoint(tagged_corpus_source, checkpoint_opts=checkpoint_opts, force_checkpoint=True)
        .to_content()
    )
    df_docs = pipeline.resolve()
    assert next(df_docs) is not None
    assert os.path.isfile(tagged_corpus_source)

def test_pipeline_to_co_occurrence_can_create_co_occurrence_bundle():
    context_opts: ContextOpts = ContextOpts(
        context_width=2, concept={}, ignore_concept=False, ignore_padding=False, processes=None
    )
    tokenized_corpus: TokenizedCorpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)
    config: CorpusConfig = CorpusConfig(
        corpus_name=uuid.uuid1(),
        corpus_type=CorpusType.Tokenized,
        corpus_pattern=None,
        checkpoint_opts=None,
        text_reader_opts=None,
        pipelines=None,
        pipeline_payload=PipelinePayload(),
        language="swedish",
    )

    # Expected windows generated for corpus:
    # print({k: list(generate_windows(tokens=tokens, context_opts=context_opts)) for k, tokens in expected_tokens.items()})
    document_windows = {
        'tran_2019_01_test.txt': [
            ['*', '*', 'a', 'b', 'c'],
            ['*', 'a', 'b', 'c', 'c'],
            ['a', 'b', 'c', 'c', '*'],
            ['b', 'c', 'c', '*', '*'],
        ],
        'tran_2019_02_test.txt': [
            ['*', '*', 'a', 'a', 'b'],
            ['*', 'a', 'a', 'b', 'd'],
            ['a', 'a', 'b', 'd', '*'],
            ['a', 'b', 'd', '*', '*'],
        ],
        'tran_2019_03_test.txt': [
            ['*', '*', 'a', 'e', 'e'],
            ['*', 'a', 'e', 'e', 'b'],
            ['a', 'e', 'e', 'b', '*'],
            ['e', 'e', 'b', '*', '*'],
        ],
        'tran_2020_01_test.txt': [
            ['*', '*', 'c', 'c', 'd'],
            ['*', 'c', 'c', 'd', 'a'],
            ['c', 'c', 'd', 'a', '*'],
            ['c', 'd', 'a', '*', '*'],
        ],
        'tran_2020_02_test.txt': [
            ['*', '*', 'a', 'b', 'b'],
            ['*', 'a', 'b', 'b', 'e'],
            ['a', 'b', 'b', 'e', '*'],
            ['b', 'b', 'e', '*', '*'],
        ],
    }

    # Expected co-occurrences from the windows above:
    expected_TTMs = {filename: simple_co_occurrence(document_windows[filename]) for filename in document_windows}

    def verify_tokens_payload(p: CorpusPipeline, payload: DocumentPayload, *_) -> bool:  # pylint: disable=unused-argument
        # expected_tokens: dict = {k: v for k, v in SIMPLE_CORPUS_ABCDE_5DOCS}
        expected_tokens: dict = {
            'tran_2019_01_test.txt': ['a', 'b', 'c', 'c'],
            'tran_2019_02_test.txt': ['a', 'a', 'b', 'd'],
            'tran_2019_03_test.txt': ['a', 'e', 'e', 'b'],
            'tran_2020_01_test.txt': ['c', 'c', 'd', 'a'],
            'tran_2020_02_test.txt': ['a', 'b', 'b', 'e'],
        }
        return payload.content == expected_tokens.get(payload.filename)

    def verify_expected_vocabulary(p: CorpusPipeline, *_) -> bool:
        return list(p.payload.token2id.keys()) == ['*', '__low-tf__', 'a', 'b', 'c', 'd', 'e']

    def verify_co_occurrence_document_TTM_payload(
        p: CorpusPipeline, payload: DocumentPayload, *_
    ) -> bool:  # pylint: disable=unused-argument
        fg = p.payload.token2id.id2token.get
        assert isinstance(payload.content, CoOccurrencePayload)
        TTM: sp.spmatrix = payload.content.ttm_data_map.get(VectorizeType.Normal).term_term_matrix.tocoo()
        document_TTM_data = {(fg(TTM.row[i]), fg(TTM.col[i])): TTM.data[i] for i in range(len(TTM.data))}
        assert expected_TTMs[payload.filename] == document_TTM_data
        return True

    bundle: Bundle = (
        CorpusPipeline(config=config)
        .load_corpus(tokenized_corpus)
        .assert_on_payload(payload_test=verify_tokens_payload)
        .vocabulary(lemmatize=True)
        .assert_on_exit(exit_test=verify_expected_vocabulary)
        .to_document_co_occurrence(context_opts=context_opts)
        .assert_on_payload(payload_test=verify_co_occurrence_document_TTM_payload)
        .to_corpus_co_occurrence(context_opts=context_opts, global_threshold_count=1)
        .single()
        .content
    )

    for filename in expected_TTMs:
        document_id = int(bundle.document_index[bundle.document_index.filename == filename].document_id)
        for (i, j), ij in bundle.token_ids_2_pair_id.items():
            pair = (bundle.token2id.id2token[i], bundle.token2id.id2token[j])
            if pair in expected_TTMs[filename]:
                assert bundle.corpus.data[document_id, ij] == expected_TTMs[filename][pair]
            else:
                assert bundle.corpus.data[document_id, ij] == 0

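# `simple_co_occurrence` is a helper defined elsewhere. A plausible sketch, assuming
# the windowed count model (upper triangle of X.T @ X, where X is the window-by-term
# count matrix, so a pair's count is the product of the two tokens' counts in each
# window, summed over windows); the name `_sketch` avoids shadowing the real helper:
import itertools
from collections import Counter

def simple_co_occurrence_sketch(windows: List[List[str]]) -> dict:
    """Count co-occurring token pairs over all windows, keyed in sorted token order."""
    counts: Counter = Counter()
    for window in windows:
        token_counts = Counter(window)
        # Distinct unordered pairs only; self co-occurrence (the diagonal) is excluded.
        for w1, w2 in itertools.combinations(sorted(token_counts), 2):
            counts[(w1, w2)] += token_counts[w1] * token_counts[w2]
    return dict(counts)
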
def create_very_simple_tokens_pipeline(data: List[Tuple[str, List[str]]]) -> CorpusPipeline:
    corpus: TokenizedCorpus = very_simple_corpus(data)
    p: CorpusPipeline = CorpusPipeline(config=None).load_corpus(corpus)
    return p

def test_spacy_pipeline_load_text_to_spacy_to_dataframe_to_tokens_resolves(en_nlp):
    pytest.importorskip("spacy")
    reader_opts = TextReaderOpts(filename_pattern="*.txt", filename_fields="year:_:1")
    text_transform_opts = TextTransformOpts()
    reader = TextReader.create(MARY_TEST_CORPUS, reader_opts=reader_opts, transform_opts=text_transform_opts)
    config = Mock(
        spec=CorpusConfig,
        pipeline_payload=PipelinePayload(source=reader).put2(**SPACY_TAGGED_COLUMNS),
    )
    attributes = ['text', 'lemma_', 'pos_']
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|VERB|NOUN|',
        pos_paddings='|ADJ|',
        **SPACY_TAGGED_COLUMNS,
        filter_opts=dict(is_punct=False),
    )
    transform_opts = None
    pipeline = (
        CorpusPipeline(config=config)
        .load_text(reader_opts=reader_opts)
        .set_spacy_model(en_nlp)
        .text_to_spacy()
        .spacy_to_tagged_frame(attributes=attributes)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=transform_opts)
    )
    payloads = [x.content for x in pipeline.resolve()]
    assert payloads == [
        ['*', 'sea', 'ocean', 'life'],
        ['atmosphere', 'blow'],
        ['*', 'activity', 'surface', 'cease'],
        ['*', 'planet'],
        ['volcano', 'erupt', 'year'],
        ['eruption', 'occur', 'year', 'region', 'call'],
        ['know', '*', 'eruption'],
        ['volcano', 'erupt', 'surface', '*', 'interval'],
    ]
    assert set(pipeline.payload.document_index.columns) == {
        'filename', 'year', 'document_id', 'document_name',
        'Adverb', 'Conjunction', 'Delimiter', 'Noun', 'Other', 'Preposition',
        'n_tokens', 'n_raw_tokens', 'Pronoun', 'Verb', 'Adjective', 'Numeral',
    }

def patch_spacy_pipeline(payload: PipelinePayload):
    config: MagicMock = MagicMock(spec=CorpusConfig, pipeline_payload=payload)
    pipeline: CorpusPipeline = CorpusPipeline(config=config, tasks=[], payload=payload).setup()
    return pipeline

def patch_spacy_pipeline(task):
    pipeline = CorpusPipeline(config=fake_config(), tasks=[task]).setup()
    return pipeline

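# `fake_config` is a helper defined elsewhere. A minimal sketch, assuming it simply
# wraps an empty PipelinePayload in a mocked CorpusConfig, mirroring how the other
# tests in this section build their configs:
def fake_config_sketch() -> MagicMock:
    return MagicMock(spec=CorpusConfig, pipeline_payload=PipelinePayload())
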
def test_passthrough_process_succeeds():
    task = tasks.Passthrough(
        pipeline=CorpusPipeline(config=Mock(spec=CorpusConfig, pipeline_payload=PipelinePayload()))
    ).setup()
    current_payload = DocumentPayload()
    next_payload = task.process(current_payload)
    assert current_payload == next_payload

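# Conceptual sketch of the contract the test above asserts: a passthrough task
# forwards each payload unchanged. The real implementation is tasks.Passthrough;
# this class only illustrates the expected behaviour, not the library's code:
class PassthroughSketch:
    def process(self, payload: DocumentPayload) -> DocumentPayload:
        # No transformation: the same payload object flows downstream.
        return payload
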
def test_set_spacy_model_setup_succeeds():
    pytest.importorskip("spacy")
    pipeline = CorpusPipeline(config=fake_config())
    _ = spacy_tasks.SetSpacyModel(pipeline=pipeline, name_or_nlp="en_core_web_sm").setup()
    assert pipeline.get("spacy_nlp", None) is not None