Example #1
def id_tagged_frame_to_DTM_pipeline(
    corpus_config: CorpusConfig,
    corpus_source: str = None,
    id_to_token: bool = False,
    file_pattern: str = '**/prot-*.feather',
    transform_opts: TokensTransformOpts = None,
    extract_opts: ExtractTaggedTokensOpts = None,
    vectorize_opts: VectorizeOpts = None,
) -> CorpusPipeline:
    if corpus_source is None:
        corpus_source = corpus_config.pipeline_payload.source

    # Switch the extract step to numeric (id-based) column names.
    extract_opts.set_numeric_names()

    # Defer global TF filtering to the vectorizer: move the threshold into
    # `min_df` and disable it in the extract step.
    vectorize_opts.min_df = extract_opts.global_tf_threshold
    extract_opts.global_tf_threshold = 1

    p: CorpusPipeline = (
        CorpusPipeline(config=corpus_config)
        .load_id_tagged_frame(
            folder=corpus_source,
            id_to_token=id_to_token,
            file_pattern=file_pattern,
        )
        .filter_tagged_frame(
            extract_opts=extract_opts,
            pos_schema=corpus_config.pos_schema,
            transform_opts=transform_opts,
        )
        .to_dtm(
            vectorize_opts=vectorize_opts,
            tagged_column=extract_opts.target_column,
        )
    )
    return p
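A minimal usage sketch for this factory, assuming a folder of id-tagged feather frames; the paths and option values below are placeholders, and the pipeline is resolved with .value() as in Example #4:

corpus_config = CorpusConfig.load('./corpus.yml')  # hypothetical config file
pipeline: CorpusPipeline = id_tagged_frame_to_DTM_pipeline(
    corpus_config=corpus_config,
    corpus_source='./data/tagged_frames',  # placeholder folder of prot-*.feather files
    extract_opts=ExtractTaggedTokensOpts(lemmatize=True, **SPARV_TAGGED_COLUMNS),
    transform_opts=TokensTransformOpts(to_lower=True),
    vectorize_opts=VectorizeOpts(already_tokenized=True),
)
corpus = pipeline.value()  # executes the lazy pipeline and returns the DTM corpus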
Example #2
def vectorize_opts(self) -> VectorizeOpts:
    # FIXME: Add UI elements for max_tokens
    return VectorizeOpts(
        already_tokenized=True,
        lowercase=False,
        max_df=1.0,
        min_df=1,
        min_tf=self.tf_threshold,
    )
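The FIXME above points at max_tokens, which VectorizeOpts accepts in the other examples. A sketch of the property once such a control exists, where self.max_tokens is a hypothetical UI-backed attribute:

def vectorize_opts(self) -> VectorizeOpts:
    return VectorizeOpts(
        already_tokenized=True,
        lowercase=False,
        max_df=1.0,
        min_df=1,
        min_tf=self.tf_threshold,
        max_tokens=self.max_tokens,  # hypothetical attribute, wired like min_tf above
    )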
Example #3
def ComputeOptsSparvCSV(
    *,
    corpus_tag: str = 'TELLUS',
    corpus_source: str = './tests/test_data/tranströmer_corpus_export.sparv4.csv.zip',
) -> ComputeOpts:  # pylint: disable=too-many-instance-attributes

    return ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder="./tests/output",
        corpus_type=CorpusType.SparvCSV,
        transform_opts=TokensTransformOpts(
            to_lower=True,
            min_len=1,
            remove_stopwords=None,
            keep_symbols=True,
            keep_numerals=True,
            only_alphabetic=False,
            only_any_alphanumeric=False,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_fields=('year:_:1', ),
            index_field=None,  # use filename
            as_binary=False,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            pos_includes=None,
            pos_excludes='|MAD|MID|PAD|',
            pos_paddings=None,
            lemmatize=False,
            **SPARV_TAGGED_COLUMNS,
            filter_opts=dict(
                is_alpha=False,
                is_punct=False,
                is_digit=None,
                is_stop=None,
                is_space=False,
            ),
        ),
        create_subfolder=False,
        persist=True,
        context_opts=ContextOpts(
            concept=('jag', ),
            context_width=2,
            partition_keys=['document_id'],
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=VectorizeOpts(already_tokenized=True,
                                     min_tf=1,
                                     max_tokens=None),
    )
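All arguments are keyword-only with defaults, so the factory can be called bare or with overrides; the tag and path below are placeholders:

opts: ComputeOpts = ComputeOptsSparvCSV()  # defaults: 'TELLUS' tag, bundled test corpus
custom: ComputeOpts = ComputeOptsSparvCSV(
    corpus_tag='MY_TAG',                       # placeholder tag
    corpus_source='./data/my_export.csv.zip',  # placeholder Sparv CSV export
)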
Example #4
def test_spacy_pipeline_load_text_to_spacy_to_dataframe_to_tokens_to_text_to_dtm(en_nlp):
    pytest.importorskip("spacy")

    reader_opts = TextReaderOpts(filename_pattern="*.txt", filename_fields="year:_:1")
    text_transform_opts = TextTransformOpts()
    reader = TextReader.create(
        MARY_TEST_CORPUS, reader_opts=reader_opts, transform_opts=text_transform_opts
    )

    attributes = ['text', 'lemma_', 'pos_', 'is_punct']
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|VERB|NOUN|',
        pos_paddings=None,
        **SPACY_TAGGED_COLUMNS,
        filter_opts=dict(is_punct=False),
    )
    transform_opts = None

    vectorize_opts = VectorizeOpts()

    config = Mock(
        spec=CorpusConfig,
        pipeline_payload=PipelinePayload(source=reader).put2(**SPACY_TAGGED_COLUMNS),
    )

    pipeline = (
        CorpusPipeline(config=config)
        .load_text(reader_opts=reader_opts, transform_opts=text_transform_opts)
        .set_spacy_model(en_nlp)
        .text_to_spacy()
        .spacy_to_tagged_frame(attributes=attributes)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=transform_opts)
        .tokens_to_text()
        .to_dtm(vectorize_opts)
    )

    corpus = pipeline.value()
    assert corpus is not None
    assert isinstance(corpus, VectorizedCorpus)
Example #5
def process(
    corpus_config: Optional[str] = None,
    input_filename: Optional[str] = None,
    output_folder: Optional[str] = None,
    output_tag: Optional[str] = None,
    filename_pattern: Optional[str] = None,
    phrase: Optional[Sequence[str]] = None,
    phrase_file: Optional[str] = None,
    create_subfolder: bool = True,
    pos_includes: Optional[str] = None,
    pos_paddings: Optional[str] = None,
    pos_excludes: Optional[str] = None,
    append_pos: bool = False,
    to_lower: bool = True,
    lemmatize: bool = True,
    remove_stopwords: Optional[str] = None,
    min_word_length: int = 2,
    max_word_length: Optional[int] = None,
    keep_symbols: bool = False,
    keep_numerals: bool = False,
    only_any_alphanumeric: bool = False,
    only_alphabetic: bool = False,
    tf_threshold: int = 1,
    tf_threshold_mask: bool = False,
    max_tokens: Optional[int] = None,
    enable_checkpoint: bool = True,
    force_checkpoint: bool = False,
    deserialize_processes: int = 4,
):

    try:
        corpus_config: CorpusConfig = CorpusConfig.load(corpus_config)
        phrases = parse_phrases(phrase_file, phrase)

        if pos_excludes is None:
            pos_excludes = pos_tags_to_str(corpus_config.pos_schema.Delimiter)

        # Guard against the default None before upper-casing.
        if pos_paddings and pos_paddings.upper() in ["FULL", "ALL", "PASSTHROUGH"]:
            pos_paddings = pos_tags_to_str(corpus_config.pos_schema.all_types_except(pos_includes))
            logger.info(f"PoS paddings expanded to: {pos_paddings}")

        text_reader_opts: TextReaderOpts = corpus_config.text_reader_opts.copy()

        if filename_pattern is not None:
            text_reader_opts.filename_pattern = filename_pattern

        corpus_config.checkpoint_opts.deserialize_processes = max(1, deserialize_processes)

        tagged_columns: dict = corpus_config.pipeline_payload.tagged_columns_names
        args: interface.ComputeOpts = interface.ComputeOpts(
            corpus_type=corpus_config.corpus_type,
            corpus_source=input_filename,
            target_folder=output_folder,
            corpus_tag=output_tag,
            transform_opts=TokensTransformOpts(
                to_lower=to_lower,
                to_upper=False,
                min_len=min_word_length,
                max_len=max_word_length,
                remove_accents=False,
                remove_stopwords=(remove_stopwords is not None),
                stopwords=None,
                extra_stopwords=None,
                language=remove_stopwords,
                keep_numerals=keep_numerals,
                keep_symbols=keep_symbols,
                only_alphabetic=only_alphabetic,
                only_any_alphanumeric=only_any_alphanumeric,
            ),
            text_reader_opts=text_reader_opts,
            extract_opts=ExtractTaggedTokensOpts(
                pos_includes=pos_includes,
                pos_paddings=pos_paddings,
                pos_excludes=pos_excludes,
                lemmatize=lemmatize,
                phrases=phrases,
                append_pos=append_pos,
                global_tf_threshold=tf_threshold,
                global_tf_threshold_mask=tf_threshold_mask,
                **tagged_columns,
            ),
            vectorize_opts=VectorizeOpts(
                already_tokenized=True,
                min_tf=tf_threshold,
                max_tokens=max_tokens,
            ),
            tf_threshold=tf_threshold,
            tf_threshold_mask=tf_threshold_mask,
            create_subfolder=create_subfolder,
            persist=True,
            enable_checkpoint=enable_checkpoint,
            force_checkpoint=force_checkpoint,
        )

        workflow.compute(args=args, corpus_config=corpus_config)

        logger.info('Done!')

    except Exception as ex:
        logger.exception(ex)
        click.echo(ex)
        sys.exit(1)
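A call sketch for this entry point; every keyword comes from the signature above, while the paths, tag, and PoS filter are placeholders:

process(
    corpus_config='./corpus.yml',        # path handed to CorpusConfig.load
    input_filename='./data/corpus.zip',  # placeholder corpus archive
    output_folder='./output',
    output_tag='MY_RUN',                 # placeholder tag
    pos_includes='|NN|VB|',              # placeholder PoS filter
    tf_threshold=5,
)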
Example #6
def test_generate_cli_opts():
    compute_opts = interface.ComputeOpts(
        corpus_type=interface.CorpusType.SparvCSV,
        corpus_source="apa.txt",
        target_folder='./tests/output',
        corpus_tag='APA',
        transform_opts=TokensTransformOpts(
            only_alphabetic=False,
            only_any_alphanumeric=False,
            to_lower=True,
            to_upper=False,
            min_len=1,
            max_len=None,
            remove_accents=False,
            remove_stopwords=True,
            stopwords=None,
            extra_stopwords=['örn'],
            language='swedish',
            keep_numerals=True,
            keep_symbols=True,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_filter=None,
            filename_fields=[
                'year:prot\\_(\\d{4}).*',
                'year2:prot_\\d{4}(\\d{2})__*',
                'number:prot_\\d+[afk_]{0,4}__(\\d+).*',
            ],
            index_field=None,
            as_binary=False,
            sep='\t',
            quoting=3,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            lemmatize=True,
            target_override=None,
            pos_includes="NN",
            pos_excludes=None,
            pos_paddings="MID|MAD|PAD",
            passthrough_tokens=[],
            block_tokens=[],
            append_pos=False,
            global_tf_threshold=1,
            global_tf_threshold_mask=False,
            **SPARV_TAGGED_COLUMNS,
        ),
        vectorize_opts=VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            stop_words=None,
            max_df=1.0,
            min_df=1,
            min_tf=1,
            max_tokens=None,
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        create_subfolder=True,
        persist=True,
        context_opts=ContextOpts(
            context_width=1,
            concept=["apa"],
            ignore_concept=False,
            partition_keys=['document_id'],
        ),
        enable_checkpoint=True,
        force_checkpoint=False,
    )

    cli_command: str = compute_opts.command_line("apa")

    assert cli_command is not None
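command_line serializes the options back into an invocation string for the named command ("apa" here); a quick way to inspect the result, with the exact flags depending on the library:

print(compute_opts.command_line("apa"))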
Example #7
def ComputeOptsSpacyCSV(
    *,
    corpus_tag: str = 'MARS',
    corpus_source: str = './tests/test_data/legal_instrument_five_docs_test.zip',
) -> ComputeOpts:  # pylint: disable=too-many-instance-attributes

    return ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder="./tests/output",
        corpus_type=CorpusType.SpacyCSV,
        # pos_scheme: PoS_Tag_Scheme = PoS_Tag_Schemes.Universal
        transform_opts=TokensTransformOpts(
            extra_stopwords=[],
            keep_numerals=True,
            keep_symbols=True,
            language='english',
            max_len=None,
            min_len=1,
            only_alphabetic=False,
            only_any_alphanumeric=False,
            remove_accents=False,
            remove_stopwords=True,
            stopwords=None,
            to_lower=True,
            to_upper=False,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_fields=['year:_:1'],
            index_field=None,  # use filename
            as_binary=False,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            lemmatize=True,
            target_override=None,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_paddings=None,
            pos_excludes='|PUNCT|EOL|SPACE|',
            passthrough_tokens=[],
            block_tokens=[],
            append_pos=False,
            global_tf_threshold=1,
            global_tf_threshold_mask=False,
            **SPACY_TAGGED_COLUMNS,
            filter_opts=dict(
                is_alpha=False,
                is_punct=False,
                is_digit=None,
                is_stop=None,
                is_space=False,
            ),
        ),
        create_subfolder=False,
        persist=True,
        context_opts=ContextOpts(
            context_width=4,
            concept=set(),
            ignore_concept=False,
            partition_keys=['document_id'],
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            max_df=1.0,
            min_df=1,
            min_tf=1,
            max_tokens=None,
        ),
    )