Example #1
def create_test_data_bundles():
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    config: CorpusConfig = CorpusConfig.load('./tests/test_data/SSI.yml')

    config.pipeline_payload.source = './tests/test_data/legal_instrument_five_docs_test.zip'
    config.pipeline_payload.document_index_source = './tests/test_data/legal_instrument_five_docs_test.csv'

    tag: str = 'VENUS-CONCEPT'
    create_bundle_by_spaCy_pipeline(
        config=config,
        context_opts=ContextOpts(context_width=4,
                                 concept={"cultural"},
                                 ignore_concept=True,
                                 partition_keys=['document_id']),
        tag=tag,
    ).store()

    tag: str = 'VENUS'
    create_bundle_by_spaCy_pipeline(
        config=config,
        context_opts=ContextOpts(context_width=4,
                                 concept=set(),
                                 ignore_concept=True,
                                 partition_keys=['document_id']),
        tag=tag,
    ).store()

    tag: str = 'ABCDEFG_7DOCS'
    create_bundle_by_spaCy_pipeline(
        config=config,
        context_opts=ContextOpts(context_width=4,
                                 concept=set(),
                                 ignore_concept=True,
                                 partition_keys=['document_id']),
        tag=tag,
    ).store()

    tag: str = 'ABCDEFG_7DOCS'
    create_simple_bundle_by_pipeline(
        data=SIMPLE_CORPUS_ABCDEFG_7DOCS,
        context_opts=ContextOpts(concept=set(),
                                 ignore_concept=False,
                                 context_width=2),
        tag=tag,
        folder=DATA_FOLDER,
    ).store()

    tag: str = 'ABCDEFG_7DOCS_CONCEPT'
    create_simple_bundle_by_pipeline(
        data=SIMPLE_CORPUS_ABCDEFG_7DOCS,
        context_opts=ContextOpts(concept={'g'},
                                 ignore_concept=False,
                                 context_width=2),
        tag=tag,
        folder=DATA_FOLDER,
    ).store()
Example #2
def wildcard_to_partition_by_document_co_occurrence_pipeline(
        *,
        extract_opts: ExtractTaggedTokensOpts = None,
        transform_opts: TokensTransformOpts = None,
        context_opts: ContextOpts = None,
        global_tf_threshold: int = None,
        **kwargs,  # pylint: disable=unused-argument
) -> CorpusPipeline:

    passthroughs: set = context_opts.get_concepts().union(
        extract_opts.get_passthrough_tokens())
    pipeline: pipelines.CorpusPipeline = (
        pipelines.wildcard().vocabulary(
            lemmatize=extract_opts.lemmatize,
            progress=True,
            tf_threshold=extract_opts.global_tf_threshold,
            tf_keeps=passthroughs,
            close=True,
        ).tagged_frame_to_tokens(
            extract_opts=extract_opts,  # .clear_tf_threshold(),
            transform_opts=transform_opts,
        )
        # .tokens_transform(transform_opts=transform_opts)
        .to_document_co_occurrence(context_opts=context_opts)
        # .tqdm(desc="Processing documents")
        .to_corpus_co_occurrence(
            context_opts=context_opts,
            global_threshold_count=global_tf_threshold,
        ))

    return pipeline
Example #3
def test_tasks_pool_tokens_to_ttm():
    corpus: TokenizedCorpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_3DOCS)
    token2id: dict = corpus.token2id
    context_opts: ContextOpts = ContextOpts(
        concept={'d'},
        ignore_concept=False,
        ignore_padding=False,
        context_width=1,
        processes=None,
    )
    token2id[context_opts.pad] = len(token2id)
    concept_ids = {token2id[x] for x in context_opts.concept}
    ignore_ids = set()
    filename, tokens = next(corpus)
    # doc_info = corpus.document_index[corpus.document_index.filename == filename].to_dict('records')[0]
    doc_info = faster_to_dict_records(corpus.document_index[corpus.document_index.filename == filename])[0]
    token_ids = [token2id[t] for t in tokens]
    pad_id = token2id[context_opts.pad]
    args = (
        doc_info['document_id'],
        doc_info['document_name'],
        doc_info['filename'],
        token_ids,
        pad_id,
        context_opts,
        concept_ids,
        ignore_ids,
        len(token2id),
    )

    item: dict = tokens_to_ttm(args)
    assert item is not None
Example #4
def test_compress_corpus():

    context_opts: ContextOpts = ContextOpts(concept={'d'},
                                            ignore_concept=False,
                                            context_width=1,
                                            processes=None,
                                            ignore_padding=False)

    bundle: Bundle = create_simple_bundle_by_pipeline(
        data=SIMPLE_CORPUS_ABCDE_3DOCS,
        context_opts=context_opts,
        compress=False)
    concept_corpus: VectorizedCorpus = bundle.concept_corpus

    assert ((concept_corpus.data.todense() == np.matrix(
        [
            [0, 0, 0, 0, 5, 0, 0, 0, 1, 1, 0, 0, 0],
            [0, 0, 0, 0, 3, 0, 0, 0, 1, 1, 2, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 1, 0, 0],
        ],
        dtype=np.int32,
    )).all().all())

    _, ids_translation, keep_ids = concept_corpus.compress(tf_threshold=1,
                                                           extra_keep_ids=[1],
                                                           inplace=True)

    assert ((concept_corpus.data.todense() == np.matrix(
        [[0, 5, 0, 1, 1, 0], [0, 3, 0, 1, 1, 2], [0, 0, 1, 0, 3, 1]],
        dtype=np.int32,
    )).all().all())
    assert keep_ids.tolist() == [1, 4, 7, 8, 9, 10]
    assert ids_translation == {1: 0, 4: 1, 7: 2, 8: 3, 9: 4, 10: 5}
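
The assertions above pin down what compress does at the matrix level: every column whose total term frequency reaches tf_threshold is kept, columns listed in extra_keep_ids are kept regardless, and ids_translation maps each surviving old column id to its new position. A minimal NumPy sketch of that rule, inferred from the assertions above rather than taken from the VectorizedCorpus implementation:

import numpy as np

def sketch_compress(dtm, tf_threshold, extra_keep_ids):
    """Keep columns whose summed term frequency >= tf_threshold, plus extra_keep_ids."""
    column_tf = dtm.sum(axis=0)
    keep_ids = sorted(set(np.flatnonzero(column_tf >= tf_threshold).tolist()) | set(extra_keep_ids))
    ids_translation = {old_id: new_id for new_id, old_id in enumerate(keep_ids)}
    return dtm[:, keep_ids], ids_translation, keep_ids

dtm = np.array(
    [
        [0, 0, 0, 0, 5, 0, 0, 0, 1, 1, 0, 0, 0],
        [0, 0, 0, 0, 3, 0, 0, 0, 1, 1, 2, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 1, 0, 0],
    ],
    dtype=np.int32,
)
compressed, ids_translation, keep_ids = sketch_compress(dtm, tf_threshold=1, extra_keep_ids=[1])
assert keep_ids == [1, 4, 7, 8, 9, 10]
assert ids_translation == {1: 0, 4: 1, 7: 2, 8: 3, 9: 4, 10: 5}
assert (compressed == np.array([[0, 5, 0, 1, 1, 0], [0, 3, 0, 1, 1, 2], [0, 0, 1, 0, 3, 1]])).all()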
Example #5
def context_opts(self) -> ContextOpts:
    return ContextOpts(
        concept=self.concept_tokens,
        context_width=self._context_width.value,
        ignore_concept=self._ignore_concept.value,
        ignore_padding=self._ignore_padding.value,
        partition_keys=[self._partition_key.value],
    )
Example #6
def create_simple_bundle() -> Bundle:
    simple_corpus = very_simple_corpus(SIMPLE_CORPUS_ABCDEFG_3DOCS)
    context_opts: ContextOpts = ContextOpts(concept=set(), ignore_concept=False, context_width=2)
    bundle: Bundle = create_simple_bundle_by_pipeline(
        data=simple_corpus,
        context_opts=context_opts,
    )
    return bundle
Example #7
def ComputeOptsSparvCSV(
    *,
    corpus_tag: str = 'TELLUS',
    corpus_source: str = './tests/test_data/tranströmer_corpus_export.sparv4.csv.zip',
) -> ComputeOpts:  # pylint: disable=too-many-instance-attributes

    return ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder="./tests/output",
        corpus_type=CorpusType.SparvCSV,
        transform_opts=TokensTransformOpts(
            to_lower=True,
            min_len=1,
            remove_stopwords=None,
            keep_symbols=True,
            keep_numerals=True,
            only_alphabetic=False,
            only_any_alphanumeric=False,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_fields=('year:_:1', ),
            index_field=None,  # use filename
            as_binary=False,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            pos_includes=None,
            pos_excludes='|MAD|MID|PAD|',
            pos_paddings=None,
            lemmatize=False,
            **SPARV_TAGGED_COLUMNS,
            filter_opts=dict(
                is_alpha=False,
                is_punct=False,
                is_digit=None,
                is_stop=None,
                is_space=False,
            ),
        ),
        create_subfolder=False,
        persist=True,
        context_opts=ContextOpts(
            concept=('jag', ),
            context_width=2,
            partition_keys=['document_id'],
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=VectorizeOpts(already_tokenized=True,
                                     min_tf=1,
                                     max_tokens=None),
    )
Example #8
def test_HAL_cwr_corpus_burgess_litmus():
    data = [('document_01.txt', 'The Horse Raced Past The Barn Fell .'.lower().split())]
    context_opts: ContextOpts = ContextOpts(
        context_width=2,
        concept=set(),
        ignore_padding=False,
    )
    bundle: Bundle = create_simple_bundle_by_pipeline(data, context_opts)

    hal_cwr_corpus: VectorizedCorpus = bundle.corpus.to_HAL_cwr_keyness()

    assert hal_cwr_corpus is not None
Example #9
def create_keyness_test_bundle(
    data: Any, *, concept: str = 'd', ignore_padding=True, context_width: int = 1, processes: int = 2
) -> Bundle:
    context_opts: ContextOpts = ContextOpts(
        concept={concept},
        ignore_concept=False,
        ignore_padding=ignore_padding,
        context_width=context_width,
        processes=processes,
    )
    bundle: Bundle = create_simple_bundle_by_pipeline(data=data, context_opts=context_opts)
    return bundle
Example #10
def test_compute_hal_score_by_co_occurrence_matrix_burgess_litmus():
    data = [('document_01.txt', 'The Horse Raced Past The Barn Fell .'.lower().split())]
    context_opts: ContextOpts = ContextOpts(
        context_width=2,
        concept=set(),
    )
    bundle: Bundle = create_simple_bundle_by_pipeline(data, context_opts)
    co_occurrences = bundle.co_occurrences
    co_occurrences['cwr'] = compute_hal_score_by_co_occurrence_matrix(
        bundle.co_occurrences, bundle.corpus.window_counts.document_term_window_counts
    )
    assert 'cwr' in co_occurrences.columns
Example #11
def test_tokens_to_windows():

    context_opts: ContextOpts = ContextOpts(concept=set(),
                                            context_width=1,
                                            ignore_padding=False,
                                            pad="*",
                                            min_window_size=0)

    tokens: Iterable[Token] = ["a", "*", "c", "a", "e", "*", "*", "h"]
    token2id: Token2Id = Token2Id().ingest([context_opts.pad] + tokens)

    expected_windows = [
        ['*', 'a', '*'],
        ['a', '*', 'c'],
        ['*', 'c', 'a'],
        ['c', 'a', 'e'],
        ['a', 'e', '*'],
        ['e', '*', '*'],
        ['*', '*', 'h'],
        ['*', 'h', '*'],
    ]

    windows: Iterable[Iterable[str]] = generate_windows(
        token_ids=[token2id[t] for t in tokens],
        context_width=context_opts.context_width,
        pad_id=token2id[context_opts.pad],
        ignore_pads=False,
    )

    assert list(windows) == [[token2id[t] for t in w]
                             for w in expected_windows]

    expected_windows = [
        ['a'],
        ['a', 'c'],
        ['c', 'a'],
        ['c', 'a', 'e'],
        ['a', 'e'],
        ['e'],
        ['h'],
        ['h'],
    ]

    windows: Iterable[Iterable[str]] = generate_windows(
        token_ids=[token2id[t] for t in tokens],
        context_width=context_opts.context_width,
        pad_id=token2id[context_opts.pad],
        ignore_pads=True,
    )

    assert list(windows) == [[token2id[t] for t in w]
                             for w in expected_windows]
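
Taken together, the two assertion blocks describe the windowing rule: every token gets one window of width 2 * context_width + 1 centred on it, positions beyond the sequence ends are filled with the pad id, and ignore_pads=True simply drops the pad id from each window. A standalone sketch of that rule, as an illustration of the behaviour asserted above rather than the generate_windows implementation:

def sketch_generate_windows(token_ids, context_width, pad_id, ignore_pads=False):
    """Yield one window per token: width 2*context_width + 1, padded at both ends."""
    padded = [pad_id] * context_width + list(token_ids) + [pad_id] * context_width
    size = 2 * context_width + 1
    for i in range(len(token_ids)):
        window = padded[i:i + size]
        yield [t for t in window if t != pad_id] if ignore_pads else window

# With context_width=1 and "*" as pad this reproduces the window shapes asserted above.
assert list(sketch_generate_windows(["a", "*", "c"], 1, "*")) == [['*', 'a', '*'], ['a', '*', 'c'], ['*', 'c', '*']]
assert list(sketch_generate_windows(["a", "*", "c"], 1, "*", ignore_pads=True)) == [['a'], ['a', 'c'], ['c']]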
Example #12
def test_generate_cli_opts():
    compute_opts = interface.ComputeOpts(
        corpus_type=interface.CorpusType.SparvCSV,
        corpus_source="apa.txt",
        target_folder='./tests/output',
        corpus_tag='APA',
        transform_opts=TokensTransformOpts(
            only_alphabetic=False,
            only_any_alphanumeric=False,
            to_lower=True,
            to_upper=False,
            min_len=1,
            max_len=None,
            remove_accents=False,
            remove_stopwords=True,
            stopwords=None,
            extra_stopwords=['örn'],
            language='swedish',
            keep_numerals=True,
            keep_symbols=True,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_filter=None,
            filename_fields=[
                'year:prot\\_(\\d{4}).*',
                'year2:prot_\\d{4}(\\d{2})__*',
                'number:prot_\\d+[afk_]{0,4}__(\\d+).*',
            ],
            index_field=None,
            as_binary=False,
            sep='\t',
            quoting=3,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            lemmatize=True,
            target_override=None,
            pos_includes="NN",
            pos_excludes=None,
            pos_paddings="MID|MAD|PAD",
            passthrough_tokens=[],
            block_tokens=[],
            append_pos=False,
            global_tf_threshold=1,
            global_tf_threshold_mask=False,
            **SPARV_TAGGED_COLUMNS,
        ),
        vectorize_opts=VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            stop_words=None,
            max_df=1.0,
            min_df=1,
            min_tf=1,
            max_tokens=None,
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        create_subfolder=True,
        persist=True,
        context_opts=ContextOpts(
            context_width=1,
            concept=["apa"],
            ignore_concept=False,
            partition_keys=['document_id'],
        ),
        enable_checkpoint=True,
        force_checkpoint=False,
    )

    cli_command: str = compute_opts.command_line("apa")

    assert cli_command is not None
Example #13
def run_workflow():
    corpus_config = pipeline.CorpusConfig.load(CONFIG_FILENAME).folders(DATA_FOLDER)
    corpus_config.pipeline_payload.files(source=CORPUS_FILENAME, document_index_source=None)
    corpus_config.checkpoint_opts.deserialize_processes = 4

    compute_opts = ComputeOpts(
        corpus_type=pipeline.CorpusType.SparvCSV,
        corpus_source=CORPUS_FILENAME,
        target_folder=jj(OUTPUT_FOLDER, 'APA'),
        corpus_tag='APA',
        transform_opts=corpora.TokensTransformOpts(
            to_lower=True,
            to_upper=False,
            min_len=1,
            max_len=None,
            remove_accents=False,
            remove_stopwords=False,
            stopwords=None,
            extra_stopwords=None,
            language='swedish',
            keep_numerals=True,
            keep_symbols=True,
            only_alphabetic=False,
            only_any_alphanumeric=False,
        ),
        text_reader_opts=corpora.TextReaderOpts(
            filename_pattern='*.csv',
            filename_filter=None,
            filename_fields=[
                'year:prot\\_(\\d{4}).*',
                'year2:prot_\\d{4}(\\d{2})__*',
                'number:prot_\\d+[afk_]{0,4}__(\\d+).*',
            ],
            index_field=None,
            as_binary=False,
            sep='\t',
            quoting=3,
        ),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            pos_includes='NN|PM',
            pos_excludes='MAD|MID|PAD',
            pos_paddings='AB|DT|HA|HD|HP|HS|IE|IN|JJ|KN|PC|PL|PN|PP|PS|RG|RO|SN|UO|VB',
            lemmatize=True,
            append_pos=False,
            global_tf_threshold=1,
            global_tf_threshold_mask=False,
            **corpus_config.pipeline_payload.tagged_columns_names,
        ),
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            stop_words=None,
            max_df=1.0,
            min_df=1,
            min_tf=1,
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        create_subfolder=True,
        persist=True,
        context_opts=ContextOpts(
            context_width=2,
            concept=set(['kammare']),
            ignore_concept=False,
            partition_keys=['document_name'],
            processes=4,
            chunksize=10,
        ),
        enable_checkpoint=False,
        force_checkpoint=False,
    )

    _ = workflow.compute(
        args=compute_opts,
        corpus_config=corpus_config,
        tagged_corpus_source=jj(OUTPUT_FOLDER, 'test.zip'),
    )
Example #14
def ComputeOptsSpacyCSV(
    *,
    corpus_tag: str = 'MARS',
    corpus_source: str = './tests/test_data/legal_instrument_five_docs_test.zip',
) -> ComputeOpts:  # pylint: disable=too-many-instance-attributes

    return ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder="./tests/output",
        corpus_type=CorpusType.SpacyCSV,
        # pos_scheme: PoS_Tag_Scheme = PoS_Tag_Schemes.Universal
        transform_opts=TokensTransformOpts(
            extra_stopwords=[],
            keep_numerals=True,
            keep_symbols=True,
            language='english',
            max_len=None,
            min_len=1,
            only_alphabetic=False,
            only_any_alphanumeric=False,
            remove_accents=False,
            remove_stopwords=True,
            stopwords=None,
            to_lower=True,
            to_upper=False,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_fields=['year:_:1'],
            index_field=None,  # use filename
            as_binary=False,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            lemmatize=True,
            target_override=None,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_paddings=None,
            pos_excludes='|PUNCT|EOL|SPACE|',
            passthrough_tokens=[],
            block_tokens=[],
            append_pos=False,
            global_tf_threshold=1,
            global_tf_threshold_mask=False,
            **SPACY_TAGGED_COLUMNS,
            filter_opts=dict(
                is_alpha=False,
                is_punct=False,
                is_digit=None,
                is_stop=None,
                is_space=False,
            ),
        ),
        create_subfolder=False,
        persist=True,
        context_opts=ContextOpts(
            context_width=4,
            concept=set(),
            ignore_concept=False,
            partition_keys=['document_id'],
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            max_df=1.0,
            min_df=1,
            min_tf=1,
            max_tokens=None,
        ),
    )
Example #15
def process_co_ocurrence(
    corpus_config: str = None,
    input_filename: str = None,
    output_filename: str = None,
    filename_pattern: str = None,
    concept: List[str] = None,
    ignore_concept: bool = False,
    ignore_padding: bool = False,
    context_width: int = None,
    compute_processes: int = None,
    compute_chunk_size: int = 10,
    partition_key: Sequence[str] = None,
    phrase: Sequence[str] = None,
    phrase_file: str = None,
    create_subfolder: bool = True,
    pos_includes: str = None,
    pos_paddings: str = None,
    pos_excludes: str = None,
    append_pos: bool = False,
    to_lower: bool = True,
    lemmatize: bool = True,
    remove_stopwords: str = None,
    min_word_length: int = 2,
    max_word_length: int = None,
    keep_symbols: bool = False,
    keep_numerals: bool = False,
    only_any_alphanumeric: bool = False,
    only_alphabetic: bool = False,
    tf_threshold: int = 1,
    tf_threshold_mask: bool = False,
    enable_checkpoint: bool = True,
    force_checkpoint: bool = False,
    deserialize_processes: int = 4,
):

    try:
        output_folder, output_tag = to_folder_and_tag(output_filename)
        corpus_config: CorpusConfig = CorpusConfig.load(corpus_config)
        phrases = parse_phrases(phrase_file, phrase)

        if pos_excludes is None:
            pos_excludes = pos_tags_to_str(corpus_config.pos_schema.Delimiter)

        if pos_paddings and pos_paddings.upper() in ["FULL", "ALL", "PASSTHROUGH"]:
            pos_paddings = pos_tags_to_str(corpus_config.pos_schema.all_types_except(pos_includes))
            logger.info(f"PoS paddings expanded to: {pos_paddings}")

        text_reader_opts: TextReaderOpts = corpus_config.text_reader_opts.copy()

        if filename_pattern is not None:
            text_reader_opts.filename_pattern = filename_pattern

        corpus_config.checkpoint_opts.deserialize_processes = max(1, deserialize_processes)

        tagged_columns: dict = corpus_config.pipeline_payload.tagged_columns_names
        args: interface.ComputeOpts = interface.ComputeOpts(
            corpus_type=corpus_config.corpus_type,
            corpus_source=input_filename,
            target_folder=output_folder,
            corpus_tag=output_tag,
            transform_opts=TokensTransformOpts(
                to_lower=to_lower,
                to_upper=False,
                min_len=min_word_length,
                max_len=max_word_length,
                remove_accents=False,
                remove_stopwords=(remove_stopwords is not None),
                stopwords=None,
                extra_stopwords=None,
                language=remove_stopwords,
                keep_numerals=keep_numerals,
                keep_symbols=keep_symbols,
                only_alphabetic=only_alphabetic,
                only_any_alphanumeric=only_any_alphanumeric,
            ),
            text_reader_opts=text_reader_opts,
            extract_opts=ExtractTaggedTokensOpts(
                pos_includes=pos_includes,
                pos_paddings=pos_paddings,
                pos_excludes=pos_excludes,
                lemmatize=lemmatize,
                phrases=phrases,
                append_pos=append_pos,
                global_tf_threshold=tf_threshold,
                global_tf_threshold_mask=tf_threshold_mask,
                **tagged_columns,
            ),
            vectorize_opts=VectorizeOpts(already_tokenized=True, max_tokens=None),
            tf_threshold=tf_threshold,
            tf_threshold_mask=tf_threshold_mask,
            create_subfolder=create_subfolder,
            persist=True,
            context_opts=ContextOpts(
                context_width=context_width,
                concept=set(concept or []),
                ignore_concept=ignore_concept,
                ignore_padding=ignore_padding,
                partition_keys=partition_key,
                processes=compute_processes,
                chunksize=compute_chunk_size,
            ),
            enable_checkpoint=enable_checkpoint,
            force_checkpoint=force_checkpoint,
        )

        workflow.compute(args=args, corpus_config=corpus_config)

        logger.info('Done!')

    except Exception as ex:  # pylint: disable=try-except-raise, unused-variable
        logger.exception(ex)
        click.echo(ex)
        # sys.exit(1)
        raise
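
For reference, a hypothetical call to process_co_ocurrence; every path, tag and option value below is made up for illustration and only mirrors the parameters defined above:

# All file paths and option values here are illustrative placeholders.
process_co_ocurrence(
    corpus_config='./tests/test_data/SSI.yml',
    input_filename='./tests/test_data/legal_instrument_five_docs_test.zip',
    output_filename='./tests/output/MARS_co-occurrence.csv.zip',
    concept=['cultural'],
    ignore_concept=False,
    context_width=4,
    partition_key=['document_id'],
    pos_includes='|NOUN|PROPN|',
    pos_paddings=None,
    pos_excludes='|PUNCT|EOL|SPACE|',
    lemmatize=True,
    tf_threshold=1,
)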
Example #16
def test_step_by_step_compress_with_simple_corpus():

    context_opts: ContextOpts = ContextOpts(concept={'d'},
                                            ignore_concept=False,
                                            context_width=1,
                                            ignore_padding=False)

    bundle: Bundle = create_simple_bundle_by_pipeline(
        data=SIMPLE_CORPUS_ABCDE_3DOCS,
        context_opts=context_opts,
        compress=False)

    token2id = dict(bundle.token2id.data)
    assert token2id == {
        '*': 0,
        'd': 1,
        '__low-tf__': 2,
        'a': 3,
        'b': 4,
        'c': 5,
        'e': 6
    }

    windows = [[[bundle.token2id.id2token[x] for x in window]
                for window in generate_windows(
                    token_ids=[bundle.token2id[t] for t in tokens],
                    context_width=context_opts.context_width,
                    pad_id=bundle.token2id[context_opts.pad],
                    ignore_pads=False,
                )] for _, tokens in SIMPLE_CORPUS_ABCDE_3DOCS]
    assert windows == [
        [
            ['*', 'a', 'b'],
            ['a', 'b', 'c'],
            ['b', 'c', 'c'],
            ['c', 'c', 'd'],
            ['c', 'd', 'c'],
            ['d', 'c', 'e'],
            ['c', 'e', '*'],
        ],
        [
            ['*', 'a', 'a'],
            ['a', 'a', 'c'],
            ['a', 'c', 'e'],
            ['c', 'e', 'c'],
            ['e', 'c', 'd'],
            ['c', 'd', 'd'],
            ['d', 'd', '*'],
        ],
        [
            ['*', 'd', 'e'],
            ['d', 'e', 'e'],
            ['e', 'e', 'b'],
            ['e', 'b', '*'],
        ],
    ]

    concept_windows = [[window for window in document if 'd' in window]
                       for document in windows]
    assert concept_windows == [
        [
            ['c', 'c', 'd'],
            ['c', 'd', 'c'],
            ['d', 'c', 'e'],
        ],
        [
            ['e', 'c', 'd'],
            ['c', 'd', 'd'],
            ['d', 'd', '*'],
        ],
        [
            ['*', 'd', 'e'],
            ['d', 'e', 'e'],
        ],
    ]

    co_occurrence_dtm = bundle.concept_corpus.data.todense()
    assert ((co_occurrence_dtm == np.matrix(
        [
            [0, 0, 0, 0, 5, 0, 0, 0, 1, 1, 0, 0, 0],
            [0, 0, 0, 0, 3, 0, 0, 0, 1, 1, 2, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 1, 0, 0],
        ],
        dtype=np.int32,
    )).all().all())

    # id2pair = bundle.corpus.id2token
    # co_occurrence_dtm_frame = pd.DataFrame(co_occurrence_dtm, columns=[id2pair[i] for i in range(0, len(id2pair))])
    # assert co_occurrence_dtm_frame is not None
    """
        print(co_occurrence_dtm_frame)

                             *              *    *    *    *
         0    1    2    3    4    5    6    7    8    9   10   11   12
        */a  a/b  */b  */c  d/c  b/c  a/c  */e  c/e  d/e  */d  a/e  b/e
        --------------------------------------------------------------
    0    0    0    0    0    5    0    0    0    1    1    0    0    0
    1    0    0    0    0    3    0    0    0    1    1    2    0    0
    2    0    0    0    0    0    0    0    1    0    3    1    0    0
        --------------------------------------------------------------
  SUM    0    0    0    0    8    0    0    1    2    5    3    0    0


    """
    """ Compress concept corpus (inlined code) """

    concept_corpus: VectorizedCorpus = bundle.concept_corpus

    # _, ids_translation, kept_pair_ids = concept_corpus.compress(tf_threshold=1, inplace=True)

    extra_keep_ids = []
    keep_ids = concept_corpus.term_frequencies_greater_than_or_equal_to_threshold(
        1, keep_indices=extra_keep_ids)
    assert keep_ids.tolist() == [4, 7, 8, 9, 10]

    extra_keep_ids = [1]
    keep_ids = concept_corpus.term_frequencies_greater_than_or_equal_to_threshold(
        1, keep_indices=extra_keep_ids)

    assert keep_ids.tolist() == [1, 4, 7, 8, 9, 10]

    ids_translation = {
        old_id: new_id
        for new_id, old_id in enumerate(keep_ids)
    }
    assert ids_translation == {1: 0, 4: 1, 7: 2, 8: 3, 9: 4, 10: 5}

    concept_corpus.slice_by_indices(keep_ids, inplace=True)
    assert concept_corpus.token2id == {
        'a/b': 0,
        'd/c': 1,
        '*/e': 2,
        'c/e': 3,
        'd/e': 4,
        '*/d': 5
    }

    #  1    4    7    8    9   10
    # a/b  d/c  */e  c/e  d/e  */d

    assert ((
        concept_corpus.data.todense() == np.matrix(
            [
                # 0  1  2  3  4  5
                [0, 5, 0, 1, 1, 0],
                [0, 3, 0, 1, 1, 2],
                [0, 0, 1, 0, 3, 1],
            ],
            dtype=np.int32,
        )).all().all())

    assert concept_corpus.term_frequency.tolist() == [0, 8, 1, 2, 5, 3]
    assert concept_corpus.overridden_term_frequency is None
    """ Slice full corpus """
    corpus = bundle.corpus

    # pp(corpus.data.todense())
    assert ((
        corpus.data.todense() == np.matrix(
            [
                # 0  1  2  3  4  5  6  7  8  9 10 11 12
                [1, 2, 1, 1, 5, 3, 1, 1, 2, 1, 0, 0, 0],
                [2, 0, 0, 0, 3, 0, 3, 0, 4, 1, 2, 1, 0],
                [0, 0, 1, 0, 0, 0, 0, 2, 0, 3, 1, 0, 3],
            ],
            dtype=np.int32,
        )).all().all())

    corpus.slice_by_indices(keep_ids, inplace=True)

    # pp(corpus.data.todense())
    assert ((
        corpus.data.todense() == np.matrix(
            [
                # 1  6  7  8  9 10
                # ----------------
                # 0  1  2  3  4  5
                # ----------------
                [2, 5, 1, 2, 1, 0],
                [0, 3, 0, 4, 1, 2],
                [0, 0, 2, 0, 3, 1],
                # ----------------
                # 2  8  3  5  6  3
            ],
            dtype=np.int32,
        )).all().all())

    assert corpus.token2id == {
        'a/b': 0,
        'd/c': 1,
        '*/e': 2,
        'c/e': 3,
        'd/e': 4,
        '*/d': 5
    }
    assert corpus.term_frequency.tolist() == [2, 8, 3, 6, 5, 3]
    assert corpus.overridden_term_frequency is None
    """Update token count and token2id"""
    def _token_ids_to_keep(kept_pair_ids: Set[int]) -> List[int]:
        """Returns sorted token IDs that given co-occurrence pair IDs corresponds to """
        token_ids_in_kept_pairs: Set[int] = set(
            flatten((k for k, pair_id in bundle.token_ids_2_pair_id.items()
                     if pair_id in kept_pair_ids)))
        kept_token_ids: List[int] = sorted(
            list(
                token_ids_in_kept_pairs.union(
                    set(bundle.token2id.magic_token_ids))))
        return kept_token_ids

    """" Inlined calls """
    token_ids_in_kept_pairs: Set[int] = set(
        flatten((k for k, pair_id in bundle.token_ids_2_pair_id.items()
                 if pair_id in keep_ids)))
    assert token_ids_in_kept_pairs == {0, 1, 3, 4, 5, 6}  # all except the masked '__low-tf__' token

    kept_token_ids = sorted(
        list(
            token_ids_in_kept_pairs.union(set(
                bundle.token2id.magic_token_ids))))
    assert kept_token_ids == [0, 1, 2, 3, 4, 5, 6]
    """" Equals function call """
    assert kept_token_ids == _token_ids_to_keep(set(keep_ids))

    assert ((
        corpus.window_counts.document_term_window_counts.todense() ==
        np.matrix(
            [
                # *  -  a  b  c  d  e
                [2, 3, 0, 2, 3, 6, 2],
                [2, 3, 0, 3, 0, 5, 3],
                [2, 2, 0, 0, 2, 0, 4],
            ],
            dtype=np.int32,
        )).all().all())

    corpus.window_counts.slice(kept_token_ids, inplace=True)
    """Nothing is changed since all original tokens are kepts"""
    assert corpus.window_counts.document_term_window_counts.shape == (3, 7)
    """Simulate removed token `b` """
    wc: TokenWindowCountMatrix = corpus.window_counts.slice(
        [x for x in kept_token_ids if x != 3], inplace=False)

    assert ((wc.document_term_window_counts.todense() == np.matrix(
        [[2, 3, 0, 3, 6, 2], [2, 3, 0, 0, 5, 3], [2, 2, 0, 2, 0, 4]],
        dtype=np.int32,
    )).all().all())

    wc: TokenWindowCountMatrix = corpus.window_counts

    wc: TokenWindowCountMatrix = concept_corpus.window_counts
    assert ((wc.slice(
        kept_token_ids,
        inplace=False).document_term_window_counts.todense() == np.matrix(
            [[0, 3, 0, 0, 0, 3, 1], [1, 3, 0, 0, 0, 2, 1],
             [1, 2, 0, 0, 0, 0, 2]],
            dtype=np.int32,
        )).all().all())
    assert ids_translation == {1: 0, 4: 1, 7: 2, 8: 3, 9: 4, 10: 5}
    translated_token2id = bundle.token2id.translate(ids_translation,
                                                    inplace=False)
    assert translated_token2id is not None

    bundle._token_ids_2_pair_id = {
        pair: pair_id
        for pair, pair_id in bundle._token_ids_2_pair_id.items()
        if pair_id in ids_translation
    }
Example #17
def test_tasks_pool_tokens_to_ttm_step_by_step():

    # Arrange
    context_opts: ContextOpts = ContextOpts(
        concept={'d'},
        ignore_concept=False,
        ignore_padding=False,
        context_width=1,
        processes=None,
    )
    corpus: TokenizedCorpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_3DOCS)

    pad_id = len(corpus.token2id)
    corpus.token2id[context_opts.pad] = pad_id

    token2id: dict = corpus.token2id
    id2token: dict = corpus.id2token

    filename, tokens = next(corpus)
    # doc_info = corpus.document_index[corpus.document_index.filename == filename].to_dict('records')[0]
    doc_info = faster_to_dict_records(corpus.document_index[corpus.document_index.filename == filename])[0]
    document_id = doc_info['document_id']
    token_ids = [token2id[t] for t in tokens]
    concept_ids = {token2id[x] for x in context_opts.concept}

    # Act

    windows: Iterable[Iterable[int]] = generate_windows(
        token_ids=token_ids,
        context_width=context_opts.context_width,
        pad_id=pad_id,
        ignore_pads=context_opts.ignore_padding,
    )

    # Assert
    windows = [w for w in windows]
    assert windows == [[5, 0, 1], [0, 1, 2], [1, 2, 2], [2, 2, 3], [2, 3, 2], [3, 2, 4], [2, 4, 5]]
    assert [[id2token[i] for i in w] for w in windows] == [
        ['*', 'a', 'b'],
        ['a', 'b', 'c'],
        ['b', 'c', 'c'],
        ['c', 'c', 'd'],
        ['c', 'd', 'c'],
        ['d', 'c', 'e'],
        ['c', 'e', '*'],
    ]
    # ['a', 'b', 'c', 'c', 'd', 'c', 'e']
    ttm_map: Mapping[VectorizeType, VectorizedTTM] = windows_to_ttm(
        document_id=document_id,
        windows=windows,
        concept_ids=concept_ids,
        ignore_ids=set(),
        vocab_size=len(token2id),
    )
    expected_normal_ttm = [
        # a  b  c  d  e  *
        [0, 2, 1, 0, 0, 1],  # a
        [0, 0, 3, 0, 0, 1],  # b
        [0, 0, 0, 5, 2, 1],  # c
        [0, 0, 0, 0, 1, 0],  # d
        [0, 0, 0, 0, 0, 1],  # e
        [0, 0, 0, 0, 0, 0],  # *
    ]
    assert (ttm_map[VectorizeType.Normal].term_term_matrix.todense() == expected_normal_ttm).all()

    expected_concept_ttm = [
        # a  b  c  d  e  *
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 5, 1, 0],
        [0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
    ]

    assert (ttm_map[VectorizeType.Concept].term_term_matrix.todense() == expected_concept_ttm).all()
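
The expected matrices make the counting rule explicit: inside every window, each unordered pair of positions holding distinct token ids contributes one co-occurrence, and the Concept matrix applies the same count only to windows that contain a concept id ('d'). A small standalone sketch of that rule, as an illustration rather than the windows_to_ttm implementation, checked against a few of the cells asserted above:

from collections import Counter
from itertools import combinations

def sketch_window_pair_counts(windows, concept_ids=None):
    """Count unordered pairs of distinct token ids per window; optionally keep only
    windows that contain at least one concept id (the Concept variant above)."""
    counts = Counter()
    for window in windows:
        if concept_ids and not concept_ids.intersection(window):
            continue
        for i, j in combinations(window, 2):
            if i != j:
                counts[(min(i, j), max(i, j))] += 1
    return counts

windows = [[5, 0, 1], [0, 1, 2], [1, 2, 2], [2, 2, 3], [2, 3, 2], [3, 2, 4], [2, 4, 5]]
normal = sketch_window_pair_counts(windows)
assert normal[(0, 1)] == 2 and normal[(1, 2)] == 3 and normal[(2, 3)] == 5  # a/b, b/c, c/d
concept = sketch_window_pair_counts(windows, concept_ids={3})
assert concept[(2, 3)] == 5 and concept[(2, 4)] == 1 and concept[(3, 4)] == 1  # c/d, c/e, d/e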
Example #18
def test_pipeline_to_co_occurrence_can_create_co_occurrence_bundle():
    context_opts: ContextOpts = ContextOpts(
        context_width=2, concept=set(), ignore_concept=False, ignore_padding=False, processes=None
    )
    tokenized_corpus: TokenizedCorpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)
    config: CorpusConfig = CorpusConfig(
        corpus_name=uuid.uuid1(),
        corpus_type=CorpusType.Tokenized,
        corpus_pattern=None,
        checkpoint_opts=None,
        text_reader_opts=None,
        pipelines=None,
        pipeline_payload=PipelinePayload(),
        language="swedish",
    )

    """Expected windows generated for corpus"""
    # print({ k: [x for x in generate_windows(tokens=tokens, context_opts=context_opts)] for k, tokens in expected_tokens.items() })
    document_windows = {
        'tran_2019_01_test.txt': [
            ['*', '*', 'a', 'b', 'c'],
            ['*', 'a', 'b', 'c', 'c'],
            ['a', 'b', 'c', 'c', '*'],
            ['b', 'c', 'c', '*', '*'],
        ],
        'tran_2019_02_test.txt': [
            ['*', '*', 'a', 'a', 'b'],
            ['*', 'a', 'a', 'b', 'd'],
            ['a', 'a', 'b', 'd', '*'],
            ['a', 'b', 'd', '*', '*'],
        ],
        'tran_2019_03_test.txt': [
            ['*', '*', 'a', 'e', 'e'],
            ['*', 'a', 'e', 'e', 'b'],
            ['a', 'e', 'e', 'b', '*'],
            ['e', 'e', 'b', '*', '*'],
        ],
        'tran_2020_01_test.txt': [
            ['*', '*', 'c', 'c', 'd'],
            ['*', 'c', 'c', 'd', 'a'],
            ['c', 'c', 'd', 'a', '*'],
            ['c', 'd', 'a', '*', '*'],
        ],
        'tran_2020_02_test.txt': [
            ['*', '*', 'a', 'b', 'b'],
            ['*', 'a', 'b', 'b', 'e'],
            ['a', 'b', 'b', 'e', '*'],
            ['b', 'b', 'e', '*', '*'],
        ],
    }

    """Expected co-occurrences from windows above"""
    expected_TTMs = {filename: simple_co_occurrence(document_windows[filename]) for filename in document_windows}

    def verify_tokens_payload(
        p: CorpusPipeline, payload: DocumentPayload, *_  # pylint: disable=unused-argument
    ) -> bool:
        # expected_tokens: dict = { k: v for k, v in SIMPLE_CORPUS_ABCDE_5DOCS}

        expected_tokens: dict = {
            'tran_2019_01_test.txt': ['a', 'b', 'c', 'c'],
            'tran_2019_02_test.txt': ['a', 'a', 'b', 'd'],
            'tran_2019_03_test.txt': ['a', 'e', 'e', 'b'],
            'tran_2020_01_test.txt': ['c', 'c', 'd', 'a'],
            'tran_2020_02_test.txt': ['a', 'b', 'b', 'e'],
        }

        return payload.content == expected_tokens.get(payload.filename)

    def verify_expected_vocabulary(p: CorpusPipeline, *_) -> bool:
        return list(p.payload.token2id.keys()) == ['*', '__low-tf__', 'a', 'b', 'c', 'd', 'e']

    def verify_co_occurrence_document_TTM_payload(
        p: CorpusPipeline, payload: DocumentPayload, *_
    ) -> bool:  # pylint: disable=unused-argument

        fg = p.payload.token2id.id2token.get

        assert isinstance(payload.content, CoOccurrencePayload)

        TTM: sp.spmatrix = payload.content.ttm_data_map.get(VectorizeType.Normal).term_term_matrix.tocoo()

        document_TTM_data = {(fg(TTM.row[i]), fg(TTM.col[i])): TTM.data[i] for i in range(0, len(TTM.data))}

        assert expected_TTMs[payload.filename] == document_TTM_data

        return True

    bundle: Bundle = (
        CorpusPipeline(config=config)
        .load_corpus(tokenized_corpus)
        .assert_on_payload(payload_test=verify_tokens_payload)
        .vocabulary(lemmatize=True)
        .assert_on_exit(exit_test=verify_expected_vocabulary)
        .to_document_co_occurrence(context_opts=context_opts)
        .assert_on_payload(payload_test=verify_co_occurrence_document_TTM_payload)
        .to_corpus_co_occurrence(context_opts=context_opts, global_threshold_count=1)
        .single()
        .content
    )

    for filename in expected_TTMs:
        document_id = int(bundle.document_index[bundle.document_index.filename == filename].document_id)
        for (i, j), ij in bundle.token_ids_2_pair_id.items():
            pair = (bundle.token2id.id2token[i], bundle.token2id.id2token[j])
            if pair in expected_TTMs[filename]:
                assert bundle.corpus.data[document_id, ij] == expected_TTMs[filename][pair]
            else:
                assert bundle.corpus.data[document_id, ij] == 0
Example #19
import os
from typing import Iterable

import pytest
import scipy.sparse as sp

from penelope.co_occurrence import Bundle, ContextOpts, VectorizeType
from penelope.corpus import ClosedVocabularyError, TokenizedCorpus, VectorizedCorpus
from penelope.pipeline import CorpusConfig, CorpusPipeline, DocumentPayload, PipelinePayload
from penelope.pipeline.co_occurrence.tasks import CoOccurrencePayload
from penelope.pipeline.config import CorpusType

from ..fixtures import SIMPLE_CORPUS_ABCDE_5DOCS, very_simple_corpus

jj = os.path.join

CONTEXT_OPTS: ContextOpts = ContextOpts(context_width=2, concept=set(), ignore_concept=False, ignore_padding=False)


@pytest.mark.skip(reason="ingest is now prohibited if vocabulary is closed")
def test_pipeline_to_co_occurrence_ingest_prohibited_if_vocabulary_exists():

    tokenized_corpus: TokenizedCorpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)
    config: CorpusConfig = CorpusConfig.tokenized_corpus_config()

    with pytest.raises(ClosedVocabularyError):
        _: Bundle = (
            CorpusPipeline(config=config)
            .load_corpus(tokenized_corpus)
            .vocabulary(lemmatize=False)
            .to_document_co_occurrence(context_opts=CONTEXT_OPTS)
            .to_corpus_co_occurrence(context_opts=CONTEXT_OPTS, global_threshold_count=1)