Example #1
def test_spacy_pipeline_load_text_to_spacy_doc_resolves(en_nlp):
    pytest.importorskip("spacy")
    reader_opts = TextReaderOpts(filename_pattern="*.txt",
                                 filename_fields="year:_:1")
    source = dummy_source()
    config = Mock(spec=CorpusConfig,
                  pipeline_payload=PipelinePayload(source=source).put2(
                      pos_column="pos_"))
    pipeline = (
        CorpusPipeline(config=config)
        .set_spacy_model(en_nlp)
        .load_text(reader_opts=reader_opts)
        .text_to_spacy()
    )

    payloads = [x.content for x in pipeline.resolve()]

    assert all(isinstance(x, spacy_api.Doc) for x in payloads)
Example #2
def test_spacy_pipeline_load_text_resolves():
    reader_opts = TextReaderOpts(filename_pattern="*.txt",
                                 filename_fields="year:_:1")
    source = dummy_source()
    config = Mock(spec=CorpusConfig,
                  pipeline_payload=PipelinePayload(source=source))
    pipeline = CorpusPipeline(config=config).load_text(reader_opts=reader_opts)

    payloads = [x.content for x in pipeline.resolve()]

    assert payloads == [x[1] for x in source]
    assert len(pipeline.payload.document_index) == len(source)
    assert all(
        pipeline.payload.document_index.filename == [x[0] for x in source])
Example #3
def probe_checkpoint_document_index(
        pipe: pipeline.CorpusPipeline) -> Optional[pd.DataFrame]:

    with contextlib.suppress(Exception):
        task: tasks.CheckpointFeather = pipe.find(tasks.CheckpointFeather)
        if task:
            return cp.feather.read_document_index(task.folder)

    with contextlib.suppress(Exception):
        task = pipe.find(tasks.LoadTaggedCSV)
        if task:
            return cp.feather.read_document_index(
                task.checkpoint_opts.feather_folder)

    return None
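A minimal usage sketch of the probe above; the pipe variable and the document_id column check are illustrative assumptions only.

# Hypothetical usage: returns the cached document index if the pipeline has a
# feather checkpoint (or a LoadTaggedCSV task with a feather folder), else None.
document_index = probe_checkpoint_document_index(pipe)
if document_index is not None:
    assert 'document_id' in document_index.columns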
Example #4
def create_simple_bundle_by_pipeline(
    data: Union[TokenizedCorpus, List[Tuple[str, List[str]]]],
    context_opts: ContextOpts,
    tag: str = "TERRA",
    folder: str = None,
    compress: bool = False,
):
    folder = folder or OUTPUT_FOLDER
    if folder.startswith('./tests') and folder != OUTPUT_FOLDER:
        shutil.rmtree(folder, ignore_errors=True)

    if not isinstance(data, TokenizedCorpus):
        data: TokenizedCorpus = very_simple_corpus(data)

    config: CorpusConfig = CorpusConfig.tokenized_corpus_config()

    bundle: Bundle = (
        CorpusPipeline(config=config)
        .load_corpus(data)
        .vocabulary(lemmatize=False)
        .to_document_co_occurrence(context_opts=context_opts)
        .to_corpus_co_occurrence(context_opts=context_opts, global_threshold_count=1, compress=compress)
        .single()
        .content
    )
    bundle.folder = folder
    bundle.tag = tag
    return bundle
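A hedged usage sketch of the helper above; the toy documents and tag are hypothetical, while the filename pattern and ContextOpts arguments mirror example #15.

# Hypothetical call: build a co-occurrence bundle from an in-memory toy corpus.
toy_docs = [
    ('tran_2021_01_test.txt', ['a', 'b', 'a', 'c']),
    ('tran_2021_02_test.txt', ['b', 'c', 'd']),
]
bundle = create_simple_bundle_by_pipeline(
    data=toy_docs,
    context_opts=ContextOpts(
        context_width=2, concept={}, ignore_concept=False, ignore_padding=False, processes=None
    ),
    tag="TOY",
)
assert isinstance(bundle, Bundle)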
Example #5
def test_spacy_pipeline_load_checkpoint_archive(
        checkpoint_opts: CheckpointOpts):

    tagged_corpus_source = os.path.join(TEST_DATA_FOLDER,
                                        "checkpoint_mary_lamb_pos_csv.zip")

    pipeline_payload = PipelinePayload(
        source=TEST_CORPUS,
        document_index_source=None,
        pos_schema_name="Universal",
        memory_store={
            'spacy_model': "en_core_web_sm",
            'nlp': None,
            'lang': 'en'
        },
    )
    config = MagicMock(spec=CorpusConfig, pipeline_payload=pipeline_payload)
    pipeline = (
        CorpusPipeline(config=config)
        .checkpoint(
            tagged_corpus_source,
            checkpoint_opts=checkpoint_opts,
            force_checkpoint=False,
        )
        .to_content()
    )

    df_docs = pipeline.resolve()
    assert next(df_docs) is not None
Example #6
def test_to_spacy_doc_to_tagged_frame(test_payload):
    payload = DocumentPayload(content_type=ContentType.TEXT, filename='hello.txt', content=SAMPLE_TEXT)
    config: CorpusConfig = CorpusConfig.load('./tests/test_data/SSI.yml')
    pipeline: CorpusPipeline = CorpusPipeline(config=config, tasks=[], payload=payload).setup()
    prior = MagicMock(spec=ITask, outstream=lambda: [payload])
    task = ToSpacyDocToTaggedFrame(pipeline=pipeline, prior=prior, attributes=POS_ATTRIBUTES)
    task.register_pos_counts = lambda p: p
    _ = patch_spacy_pipeline(test_payload).add([SetSpacyModel(name_or_nlp="en_core_web_sm"), task]).setup()
    payload_next = task.process_payload(payload)
    assert payload_next.content_type == ContentType.TAGGED_FRAME
Example #7
def test_predict_topics(method: str):

    """Train a model that will be used in prediction"""

    target_folder: str = './tests/output'
    train_target_name: str = f'train_{str(uuid.uuid1())[:8]}'
    payload: DocumentPayload = tranströmer_topic_model_payload(
        method=method, target_folder=target_folder, target_name=train_target_name
    )
    model_folder: str = os.path.join(payload.content.get("target_folder"), payload.content.get("target_name"))

    """Predict using trained model"""

    config: CorpusConfig = CorpusConfig.load('./tests/test_data/tranströmer.yml')
    corpus_source: str = './tests/test_data/tranströmer_corpus_pos_csv.zip'
    minimum_probability: float = 0.001
    n_tokens: int = 100
    predict_target_name: str = f'predict_{str(uuid.uuid1())[:8]}'
    transform_opts = TokensTransformOpts()
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='',
        pos_excludes='MAD|MID|PAD',
        **config.checkpoint_opts.tagged_columns,
    )
    vectorize_opts: VectorizeOpts = VectorizeOpts(already_tokenized=True)
    payload: DocumentPayload = (
        CorpusPipeline(config=config)
        .load_tagged_frame(
            filename=corpus_source,
            checkpoint_opts=config.checkpoint_opts,
            extra_reader_opts=config.text_reader_opts,
        )
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=transform_opts)
        .to_dtm(vectorize_opts=vectorize_opts)
        .predict_topics(
            model_folder=model_folder,
            target_folder=target_folder,
            target_name=predict_target_name,
            minimum_probability=minimum_probability,
            n_tokens=n_tokens,
        )
    ).single()

    assert payload is not None

    model_infos = find_models('./tests/output')
    assert any(m['name'] == predict_target_name for m in model_infos)
    model_info = next(m for m in model_infos if m['name'] == predict_target_name)
    assert 'method' in model_info['options']
Example #8
def test_pipeline_can_be_saved_in_feather(config: CorpusConfig):

    tagged_corpus_source: str = os.path.join(
        CORPUS_FOLDER, 'legal_instrument_five_docs_test_pos_csv.zip')

    pipeline = CorpusPipeline(config=config).checkpoint(tagged_corpus_source,
                                                        force_checkpoint=False)

    for payload in pipeline.resolve():

        tagged_frame: pd.DataFrame = payload.content

        filename = os.path.join(
            OUTPUT_FOLDER, replace_extension(payload.filename, ".feather"))

        tagged_frame.reset_index(drop=True).to_feather(filename,
                                                       compression="lz4")

        assert os.path.isfile(filename)

        roundtripped = pd.read_feather(filename)

        assert roundtripped is not None
Example #9
def test_pipeline_to_co_occurrence_ingest_prohibited_if_vocabulary_exists():

    tokenized_corpus: TokenizedCorpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)
    config: CorpusConfig = CorpusConfig.tokenized_corpus_config()

    with pytest.raises(ClosedVocabularyError):
        _: Bundle = (
            CorpusPipeline(config=config)
            .load_corpus(tokenized_corpus)
            .vocabulary(lemmatize=False)
            .to_document_co_occurrence(context_opts=CONTEXT_OPTS)
            .to_corpus_co_occurrence(context_opts=CONTEXT_OPTS, global_threshold_count=1)
            .single()
            .content
        )
Example #10
def test_pipeline_to_co_occurrence_can_create_new_vocabulary():

    tokenized_corpus: TokenizedCorpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)
    config: CorpusConfig = CorpusConfig.tokenized_corpus_config()

    bundle: Bundle = (
        CorpusPipeline(config=config)
        .load_corpus(tokenized_corpus)
        .vocabulary(lemmatize=False)
        .to_document_co_occurrence(context_opts=CONTEXT_OPTS)
        .to_corpus_co_occurrence(context_opts=CONTEXT_OPTS, global_threshold_count=1)
        .single()
        .content
    )
    assert isinstance(bundle, Bundle)
    corpus: VectorizedCorpus = bundle.corpus
    assert isinstance(corpus, VectorizedCorpus)
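A small follow-up sketch; it assumes, as example #15 suggests, that the bundle keeps the source document index and that the co-occurrence corpus rows are indexed by document_id.

    # Each document in the source corpus should map to one row in the co-occurrence DTM.
    assert bundle.corpus.data.shape[0] == len(bundle.document_index)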
Example #11
def test_spacy_pipeline_load_text_to_spacy_to_dataframe_resolves(en_nlp):
    pytest.importorskip("spacy")
    reader_opts = TextReaderOpts(filename_pattern="*.txt",
                                 filename_fields="year:_:1")
    reader = TextReader.create(MARY_TEST_CORPUS, reader_opts=reader_opts)
    config = Mock(spec=CorpusConfig,
                  pipeline_payload=PipelinePayload(source=reader).put2(
                      pos_column="pos_"))
    attributes = ['text', 'lemma_', 'pos_']
    pipeline = (
        CorpusPipeline(config=config)
        .set_spacy_model(en_nlp)
        .load_text(reader_opts=reader_opts)
        .text_to_spacy()
        .spacy_to_tagged_frame(attributes=attributes)
    )

    payloads = [x.content for x in pipeline.resolve()]

    assert all(isinstance(x, pd.DataFrame) for x in payloads)
    assert all(x.columns.tolist() == attributes for x in payloads)
Example #12
def tranströmer_topic_model_payload(method: str, target_folder: str, target_name: str) -> DocumentPayload:
    transform_opts: TokensTransformOpts = TokensTransformOpts()
    extract_opts: ExtractTaggedTokensOpts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='',
        pos_excludes='MAD|MID|PAD',
        text_column='token',
        lemma_column='baseform',
        pos_column='pos',
    )
    default_engine_args: dict = {
        'n_topics': 4,
        'passes': 1,
        'random_seed': 42,
        'workers': 1,
        'max_iter': 100,
        'work_folder': os.path.join(target_folder, target_name),
    }
    config: CorpusConfig = CorpusConfig.load('./tests/test_data/tranströmer.yml')
    corpus_source: str = './tests/test_data/tranströmer_corpus_pos_csv.zip'
    p: CorpusPipeline = (
        CorpusPipeline(config=config)
        .load_tagged_frame(
            filename=corpus_source,
            checkpoint_opts=config.checkpoint_opts,
            extra_reader_opts=config.text_reader_opts,
        )
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=transform_opts)
        .to_dtm(VectorizeOpts(already_tokenized=True))
        .to_topic_model(
            target_mode='both',
            target_folder=target_folder,
            target_name=target_name,
            engine=method,
            engine_args=default_engine_args,
            store_corpus=True,
            store_compressed=True,
        )
    )

    payload: DocumentPayload = p.single()

    return payload
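Hypothetical usage, mirroring how test_predict_topics (example #7) consumes this helper; the method value is a placeholder for whichever topic-modelling engine the caller selects, and the target folder/name keys are the same ones read back in example #7.

# Train a throw-away model and locate the folder it was stored in.
payload = tranströmer_topic_model_payload(
    method=method,  # placeholder: any engine name accepted by .to_topic_model()
    target_folder='./tests/output',
    target_name=f'train_{str(uuid.uuid1())[:8]}',
)
model_folder = os.path.join(payload.content.get("target_folder"), payload.content.get("target_name"))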
Example #13
def test_spacy_pipeline_load_text_to_spacy_to_dataframe_to_tokens_to_text_to_dtm(
        en_nlp):
    pytest.importorskip("spacy")

    reader_opts = TextReaderOpts(filename_pattern="*.txt",
                                 filename_fields="year:_:1")
    text_transform_opts = TextTransformOpts()
    reader = TextReader.create(MARY_TEST_CORPUS,
                               reader_opts=reader_opts,
                               transform_opts=text_transform_opts)

    attributes = ['text', 'lemma_', 'pos_', 'is_punct']
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|VERB|NOUN|',
        pos_paddings=None,
        **SPACY_TAGGED_COLUMNS,
        filter_opts=dict(is_punct=False),
    )
    transform_opts = None

    vectorize_opts = VectorizeOpts()

    config = Mock(
        spec=CorpusConfig,
        pipeline_payload=PipelinePayload(source=reader).put2(
            **SPACY_TAGGED_COLUMNS),
    )

    pipeline = (
        CorpusPipeline(config=config)
        .load_text(reader_opts=reader_opts, transform_opts=text_transform_opts)
        .set_spacy_model(en_nlp)
        .text_to_spacy()
        .spacy_to_tagged_frame(attributes=attributes)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=transform_opts)
        .tokens_to_text()
        .to_dtm(vectorize_opts)
    )

    corpus = pipeline.value()
    assert corpus is not None
    assert isinstance(corpus, VectorizedCorpus)
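A brief follow-up sketch; the row-count check is an assumption (that .data holds one row per resolved document, as example #15 suggests when it indexes corpus.data by document_id).

    # Sketch only: the DTM should have one row per document in the pipeline's document index.
    assert corpus.data.shape[0] == len(pipeline.payload.document_index)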
Example #14
def test_spacy_pipeline(checkpoint_opts: CheckpointOpts):

    tagged_corpus_source = os.path.join(TEST_OUTPUT_FOLDER,
                                        "checkpoint_mary_lamb_pos_csv.zip")

    pathlib.Path(tagged_corpus_source).unlink(missing_ok=True)

    text_reader_opts = TextReaderOpts(
        filename_fields=["doc_id:_:2", "year:_:1"],
        index_field=None,  # use filename
        filename_filter=None,
        filename_pattern="*.txt",
        as_binary=False,
    )

    pipeline_payload = PipelinePayload(
        source=TEST_CORPUS,
        document_index_source=None,
        pos_schema_name="Universal",
        memory_store={
            'spacy_model': "en_core_web_sm",
            'nlp': None,
            'lang': 'en',
            'pos_column': 'pos_'
        },
    )
    config = Mock(spec=CorpusConfig, pipeline_payload=pipeline_payload)
    pipeline = (
        CorpusPipeline(config=config)
        .set_spacy_model(pipeline_payload.memory_store['spacy_model'])
        .load_text(reader_opts=text_reader_opts, transform_opts=TextTransformOpts())
        .text_to_spacy()
        .passthrough()
        .spacy_to_pos_tagged_frame()
        .checkpoint(
            tagged_corpus_source,
            checkpoint_opts=checkpoint_opts,
            force_checkpoint=True,
        )
        .to_content()
    )

    df_docs = pipeline.resolve()
    assert next(df_docs) is not None
    assert os.path.isfile(tagged_corpus_source)
Example #15
def test_pipeline_to_co_occurrence_can_create_co_occurrence_bundle():
    context_opts: ContextOpts = ContextOpts(
        context_width=2, concept={}, ignore_concept=False, ignore_padding=False, processes=None
    )
    tokenized_corpus: TokenizedCorpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)
    config: CorpusConfig = CorpusConfig(
        corpus_name=uuid.uuid1(),
        corpus_type=CorpusType.Tokenized,
        corpus_pattern=None,
        checkpoint_opts=None,
        text_reader_opts=None,
        pipelines=None,
        pipeline_payload=PipelinePayload(),
        language="swedish",
    )

    """Expected windows generated for corpus"""
    # print({ k: [x for x in generate_windows(tokens=tokens, context_opts=context_opts)] for k, tokens in expected_tokens.items() })
    document_windows = {
        'tran_2019_01_test.txt': [
            ['*', '*', 'a', 'b', 'c'],
            ['*', 'a', 'b', 'c', 'c'],
            ['a', 'b', 'c', 'c', '*'],
            ['b', 'c', 'c', '*', '*'],
        ],
        'tran_2019_02_test.txt': [
            ['*', '*', 'a', 'a', 'b'],
            ['*', 'a', 'a', 'b', 'd'],
            ['a', 'a', 'b', 'd', '*'],
            ['a', 'b', 'd', '*', '*'],
        ],
        'tran_2019_03_test.txt': [
            ['*', '*', 'a', 'e', 'e'],
            ['*', 'a', 'e', 'e', 'b'],
            ['a', 'e', 'e', 'b', '*'],
            ['e', 'e', 'b', '*', '*'],
        ],
        'tran_2020_01_test.txt': [
            ['*', '*', 'c', 'c', 'd'],
            ['*', 'c', 'c', 'd', 'a'],
            ['c', 'c', 'd', 'a', '*'],
            ['c', 'd', 'a', '*', '*'],
        ],
        'tran_2020_02_test.txt': [
            ['*', '*', 'a', 'b', 'b'],
            ['*', 'a', 'b', 'b', 'e'],
            ['a', 'b', 'b', 'e', '*'],
            ['b', 'b', 'e', '*', '*'],
        ],
    }

    """Expected co-occurrences from windows above"""
    expected_TTMs = {filename: simple_co_occurrence(document_windows[filename]) for filename in document_windows}

    def verify_tokens_payload(
        p: CorpusPipeline, payload: DocumentPayload, *_  # pylint: disable=unused-argument
    ) -> bool:
        # expected_tokens: dict = { k: v for k, v in SIMPLE_CORPUS_ABCDE_5DOCS}

        expected_tokens: dict = {
            'tran_2019_01_test.txt': ['a', 'b', 'c', 'c'],
            'tran_2019_02_test.txt': ['a', 'a', 'b', 'd'],
            'tran_2019_03_test.txt': ['a', 'e', 'e', 'b'],
            'tran_2020_01_test.txt': ['c', 'c', 'd', 'a'],
            'tran_2020_02_test.txt': ['a', 'b', 'b', 'e'],
        }

        return payload.content == expected_tokens.get(payload.filename)

    def verify_expected_vocabulary(p: CorpusPipeline, *_) -> bool:
        return list(p.payload.token2id.keys()) == ['*', '__low-tf__', 'a', 'b', 'c', 'd', 'e']

    def verify_co_occurrence_document_TTM_payload(
        p: CorpusPipeline, payload: DocumentPayload, *_
    ) -> bool:  # pylint: disable=unused-argument

        fg = p.payload.token2id.id2token.get

        assert isinstance(payload.content, CoOccurrencePayload)

        TTM: sp.spmatrix = payload.content.ttm_data_map.get(VectorizeType.Normal).term_term_matrix.tocoo()

        document_TTM_data = {(fg(TTM.row[i]), fg(TTM.col[i])): TTM.data[i] for i in range(0, len(TTM.data))}

        assert expected_TTMs[payload.filename] == document_TTM_data

        return True

    bundle: Bundle = (
        CorpusPipeline(config=config)
        .load_corpus(tokenized_corpus)
        .assert_on_payload(payload_test=verify_tokens_payload)
        .vocabulary(lemmatize=True)
        .assert_on_exit(exit_test=verify_expected_vocabulary)
        .to_document_co_occurrence(context_opts=context_opts)
        .assert_on_payload(payload_test=verify_co_occurrence_document_TTM_payload)
        .to_corpus_co_occurrence(context_opts=context_opts, global_threshold_count=1)
        .single()
        .content
    )

    for filename in expected_TTMs:
        document_id = int(bundle.document_index[bundle.document_index.filename == filename].document_id)
        for (i, j), ij in bundle.token_ids_2_pair_id.items():
            pair = (bundle.token2id.id2token[i], bundle.token2id.id2token[j])
            if pair in expected_TTMs[filename]:
                assert bundle.corpus.data[document_id, ij] == expected_TTMs[filename][pair]
            else:
                assert bundle.corpus.data[document_id, ij] == 0
Example #16
def create_very_simple_tokens_pipeline(data: List[Tuple[str, List[str]]]) -> CorpusPipeline:
    corpus: TokenizedCorpus = very_simple_corpus(data)
    p: CorpusPipeline = CorpusPipeline(config=None).load_corpus(corpus)
    return p
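A minimal usage sketch of the helper above; the toy document is hypothetical, and the payload.content check relies on the token-list payloads seen in example #15.

p = create_very_simple_tokens_pipeline([('tran_2021_01_test.txt', ['a', 'b', 'a'])])
payloads = [payload.content for payload in p.resolve()]
assert payloads == [['a', 'b', 'a']]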
Example #17
def test_spacy_pipeline_load_text_to_spacy_to_dataframe_to_tokens_resolves(
        en_nlp):
    pytest.importorskip("spacy")

    reader_opts = TextReaderOpts(filename_pattern="*.txt",
                                 filename_fields="year:_:1")
    text_transform_opts = TextTransformOpts()
    reader = TextReader.create(MARY_TEST_CORPUS,
                               reader_opts=reader_opts,
                               transform_opts=text_transform_opts)

    config = Mock(
        spec=CorpusConfig,
        pipeline_payload=PipelinePayload(source=reader).put2(
            **SPACY_TAGGED_COLUMNS),
    )
    attributes = ['text', 'lemma_', 'pos_']
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|VERB|NOUN|',
        pos_paddings='|ADJ|',
        **SPACY_TAGGED_COLUMNS,
        filter_opts=dict(is_punct=False),
    )
    transform_opts = None

    pipeline = (
        CorpusPipeline(config=config)
        .load_text(reader_opts=reader_opts)
        .set_spacy_model(en_nlp)
        .text_to_spacy()
        .spacy_to_tagged_frame(attributes=attributes)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=transform_opts)
    )

    payloads = [x.content for x in pipeline.resolve()]

    assert payloads == [
        ['*', 'sea', 'ocean', 'life'],
        ['atmosphere', 'blow'],
        ['*', 'activity', 'surface', 'cease'],
        ['*', 'planet'],
        ['volcano', 'erupt', 'year'],
        ['eruption', 'occur', 'year', 'region', 'call'],
        ['know', '*', 'eruption'],
        ['volcano', 'erupt', 'surface', '*', 'interval'],
    ]

    assert set(list(pipeline.payload.document_index.columns)) == set([
        'filename',
        'year',
        'document_id',
        'document_name',
        'Adverb',
        'Conjunction',
        'Delimiter',
        'Noun',
        'Other',
        'Preposition',
        'n_tokens',
        'n_raw_tokens',
        'Pronoun',
        'Verb',
        'Adjective',
        'Numeral',
    ])
Example #18
def patch_spacy_pipeline(payload: PipelinePayload):
    config: MagicMock = MagicMock(spec=CorpusConfig, pipeline_payload=payload)
    pipeline: CorpusPipeline = CorpusPipeline(config=config, tasks=[], payload=payload).setup()
    return pipeline
Example #19
def patch_spacy_pipeline(task):
    pipeline = CorpusPipeline(config=fake_config(), tasks=[task]).setup()
    return pipeline
Example #20
def test_passthrough_process_succeeds():
    task = tasks.Passthrough(
        pipeline=CorpusPipeline(config=Mock(spec=CorpusConfig, pipeline_payload=PipelinePayload()))
    ).setup()
    current_payload = DocumentPayload()
    next_payload = task.process(current_payload)
    assert current_payload == next_payload
Example #21
def test_set_spacy_model_setup_succeeds():
    pytest.importorskip("spacy")
    pipeline = CorpusPipeline(config=fake_config())
    _ = spacy_tasks.SetSpacyModel(pipeline=pipeline,
                                  name_or_nlp="en_core_web_sm").setup()
    assert pipeline.get("spacy_nlp", None) is not None