Example #1
File: utils.py Project: humlab/penelope
def create_tokens_reader(
    source_path=TEST_CORPUS_FILENAME,
    as_binary: bool = False,
    filename_fields=None,
    index_field=None,
    filename_filter: str = None,
    filename_pattern: str = "*.txt",
    fix_hyphenation: bool = True,
    fix_whitespaces: bool = False,
    chunk_size: int = None,
    tokenize: Callable = None,
) -> TextTokenizer:
    reader_opts = TextReaderOpts(
        filename_pattern=filename_pattern,
        filename_filter=filename_filter,
        filename_fields=filename_fields,
        index_field=index_field,
        as_binary=as_binary,
    )
    transform_opts = TextTransformOpts(fix_whitespaces=fix_whitespaces,
                                       fix_hyphenation=fix_hyphenation)
    reader = TextTokenizer(
        source_path,
        reader_opts=reader_opts,
        transform_opts=transform_opts,
        tokenize=tokenize,
        chunk_size=chunk_size,
    )
    return reader
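A minimal driver for the factory above, offered only as a sketch: it assumes the returned TextTokenizer is iterable and yields (filename, tokens) pairs, the same shape the SparvCsvTokenizer examples below unpack with next(tokens_reader).

def iterate_default_test_corpus():
    # Hypothetical usage sketch, not project code: build a reader over the
    # default TEST_CORPUS_FILENAME and walk its assumed (filename, tokens) stream.
    reader = create_tokens_reader(fix_hyphenation=True, fix_whitespaces=True)
    for filename, tokens in reader:
        print(filename, len(tokens))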
Example #2
def test_load_data_frame_succeeds():
    pipeline = Mock(
        spec=CorpusPipeline,
        **{
            'payload.set_reader_index': monkey_patch,
        },
    )
    prior = MagicMock(spec=ITask, outstream=lambda: fake_data_frame_stream(1))

    task: tasks.LoadTaggedCSV = tasks.LoadTaggedCSV(
        pipeline=pipeline,
        filename="dummy.zip",
        prior=prior,
        extra_reader_opts=TextReaderOpts(),
        checkpoint_opts=CheckpointOpts(feather_folder=None),
    )

    task.register_pos_counts = lambda _: task
    fake_data: CheckpointData = patch_load_archive()
    fake_data.create_stream = lambda: fake_data_frame_stream(2)
    task.load_archive = lambda: fake_data

    task.setup()

    for payload in task.outstream():
        assert payload.content_type == ContentType.TAGGED_FRAME
Example #3
def test_sparv_csv_tokenizer_interface():

    assert issubclass(readers.SparvCsvTokenizer, ICorpusReader)

    instance = readers.SparvCsvTokenizer(SPARV_CSV_EXPORT_FILENAME_SMALL,
                                         reader_opts=TextReaderOpts())
    assert isinstance(instance, ICorpusReader)
Example #4
def reader_opts():
    return TextReaderOpts(
        filename_fields=["file_id:_:2", "year:_:1"],
        index_field=None,
        filename_filter=None,
        filename_pattern="*.txt",
        as_binary=False,
    )
Example #5
def very_simple_corpus(data: List[Tuple[str, List[str]]]) -> TokenizedCorpus:

    reader = tng.CorpusReader(
        source=tng.InMemorySource(data),
        reader_opts=TextReaderOpts(filename_fields="year:_:1"),
        transformer=None,  # already tokenized
    )
    corpus = TokenizedCorpus(reader=reader)
    return corpus
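A hypothetical call of very_simple_corpus, sketched under two assumptions: the data argument is a list of (filename, tokens) pairs as the type hint says, and the resulting TokenizedCorpus iterates as (filename, tokens) pairs like the corpora in the later examples. The document names below are made up but follow the year:_:1 naming convention used throughout.

corpus = very_simple_corpus([
    ('doc_2019_01.txt', ['a', 'b', 'c', 'c']),
    ('doc_2020_01.txt', ['a', 'c', 'd']),
])
for filename, tokens in corpus:  # assumed iteration protocol
    print(filename, tokens)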
Example #6
def create_test_corpus() -> SparvTokenizedCsvCorpus:

    corpus = SparvTokenizedCsvCorpus(
        SPARV_ZIPPED_CSV_EXPORT_FILENAME,
        reader_opts=TextReaderOpts(filename_fields="year:_:1", ),
        extract_opts=ExtractTaggedTokensOpts(lemmatize=True,
                                             **SPARV_TAGGED_COLUMNS),
    )

    return corpus
Example #7
def ComputeOptsSparvCSV(
    *,
    corpus_tag: str = 'TELLUS',
    corpus_source: str = './tests/test_data/tranströmer_corpus_export.sparv4.csv.zip',
) -> ComputeOpts:  # pylint: disable=too-many-instance-attributes

    return ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder="./tests/output",
        corpus_type=CorpusType.SparvCSV,
        transform_opts=TokensTransformOpts(
            to_lower=True,
            min_len=1,
            remove_stopwords=None,
            keep_symbols=True,
            keep_numerals=True,
            only_alphabetic=False,
            only_any_alphanumeric=False,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_fields=('year:_:1', ),
            index_field=None,  # use filename
            as_binary=False,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            pos_includes=None,
            pos_excludes='|MAD|MID|PAD|',
            pos_paddings=None,
            lemmatize=False,
            **SPARV_TAGGED_COLUMNS,
            filter_opts=dict(
                is_alpha=False,
                is_punct=False,
                is_digit=None,
                is_stop=None,
                is_space=False,
            ),
        ),
        create_subfolder=False,
        persist=True,
        context_opts=ContextOpts(
            concept=('jag', ),
            context_width=2,
            partition_keys=['document_id'],
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=VectorizeOpts(already_tokenized=True,
                                     min_tf=1,
                                     max_tokens=None),
    )
Example #8
def test_spacy_pipeline_load_text_to_spacy_doc_resolves(en_nlp):
    pytest.importorskip("spacy")
    reader_opts = TextReaderOpts(filename_pattern="*.txt",
                                 filename_fields="year:_:1")
    source = dummy_source()
    config = Mock(spec=CorpusConfig,
                  pipeline_payload=PipelinePayload(source=source).put2(
                      pos_column="pos_"))
    pipeline = (
        CorpusPipeline(config=config)
        .set_spacy_model(en_nlp)
        .load_text(reader_opts=reader_opts)
        .text_to_spacy()
    )

    payloads = [x.content for x in pipeline.resolve()]

    assert all(isinstance(x, spacy_api.Doc) for x in payloads)
Example #9
def test_spacy_pipeline_load_text_resolves():
    reader_opts = TextReaderOpts(filename_pattern="*.txt",
                                 filename_fields="year:_:1")
    source = dummy_source()
    config = Mock(spec=CorpusConfig,
                  pipeline_payload=PipelinePayload(source=source))
    pipeline = CorpusPipeline(config=config).load_text(reader_opts=reader_opts)

    payloads = [x.content for x in pipeline.resolve()]

    assert payloads == [x[1] for x in source]
    assert len(pipeline.payload.document_index) == len(source)
    assert all(
        pipeline.payload.document_index.filename == [x[0] for x in source])
Example #10
def test_reader_when_lemmatized_nn_vb_returns_lemmatized_nn_vb():

    tokens_reader = readers.SparvCsvTokenizer(
        source=SPARV_CSV_EXPORT_FILENAME_SMALL,
        reader_opts=TextReaderOpts(),
        extract_opts=ExtractTaggedTokensOpts(pos_includes='NN|VB',
                                             pos_paddings=None,
                                             pos_excludes=None,
                                             lemmatize=True,
                                             **SPARV_TAGGED_COLUMNS),
    )

    expected = "rödräv vara hunddjur ha utbredning halvklot".split()

    filename, tokens = next(tokens_reader)

    assert filename == os.path.split(filename)[1]
    assert expected == tokens
Example #11
def test_spacy_pipeline_load_text_to_spacy_to_dataframe_resolves(en_nlp):
    pytest.importorskip("spacy")
    reader_opts = TextReaderOpts(filename_pattern="*.txt",
                                 filename_fields="year:_:1")
    reader = TextReader.create(MARY_TEST_CORPUS, reader_opts=reader_opts)
    config = Mock(spec=CorpusConfig,
                  pipeline_payload=PipelinePayload(source=reader).put2(
                      pos_column="pos_"))
    attributes = ['text', 'lemma_', 'pos_']
    pipeline = (
        CorpusPipeline(config=config)
        .set_spacy_model(en_nlp)
        .load_text(reader_opts=reader_opts)
        .text_to_spacy()
        .spacy_to_tagged_frame(attributes=attributes)
    )

    payloads = [x.content for x in pipeline.resolve()]

    assert all(isinstance(x, pd.DataFrame) for x in payloads)
    assert all(x.columns.tolist() == attributes for x in payloads)
Example #12
def test_reader_when_no_transforms_returns_source_tokens():

    tokens_reader = readers.SparvCsvTokenizer(
        source=SPARV_CSV_EXPORT_FILENAME_SMALL,
        reader_opts=TextReaderOpts(),
        extract_opts=ExtractTaggedTokensOpts(pos_includes=None,
                                             pos_paddings=None,
                                             pos_excludes=None,
                                             lemmatize=False,
                                             **SPARV_TAGGED_COLUMNS),
    )

    expected = "Rödräven är ett hunddjur som har en mycket vidsträckt utbredning över norra halvklotet .".split(
    )

    filename, tokens = next(tokens_reader)

    assert filename == os.path.split(filename)[1]
    assert expected == tokens
Example #13
def test_spacy_pipeline_load_text_to_spacy_to_dataframe_to_tokens_to_text_to_dtm(
        en_nlp):
    pytest.importorskip("spacy")

    reader_opts = TextReaderOpts(filename_pattern="*.txt",
                                 filename_fields="year:_:1")
    text_transform_opts = TextTransformOpts()
    reader = TextReader.create(MARY_TEST_CORPUS,
                               reader_opts=reader_opts,
                               transform_opts=text_transform_opts)

    attributes = ['text', 'lemma_', 'pos_', 'is_punct']
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|VERB|NOUN|',
        pos_paddings=None,
        **SPACY_TAGGED_COLUMNS,
        filter_opts=dict(is_punct=False),
    )
    transform_opts = None

    vectorize_opts = VectorizeOpts()

    config = Mock(
        spec=CorpusConfig,
        pipeline_payload=PipelinePayload(source=reader).put2(
            **SPACY_TAGGED_COLUMNS),
    )

    pipeline = (
        CorpusPipeline(config=config)
        .load_text(reader_opts=reader_opts, transform_opts=text_transform_opts)
        .set_spacy_model(en_nlp)
        .text_to_spacy()
        .spacy_to_tagged_frame(attributes=attributes)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=transform_opts)
        .tokens_to_text()
        .to_dtm(vectorize_opts)
    )

    corpus = pipeline.value()
    assert corpus is not None
    assert isinstance(corpus, VectorizedCorpus)
Example #14
def test_spacy_pipeline(checkpoint_opts: CheckpointOpts):

    tagged_corpus_source = os.path.join(TEST_OUTPUT_FOLDER,
                                        "checkpoint_mary_lamb_pos_csv.zip")

    pathlib.Path(tagged_corpus_source).unlink(missing_ok=True)

    text_reader_opts = TextReaderOpts(
        filename_fields=["doc_id:_:2", "year:_:1"],
        index_field=None,  # use filename
        filename_filter=None,
        filename_pattern="*.txt",
        as_binary=False,
    )

    pipeline_payload = PipelinePayload(
        source=TEST_CORPUS,
        document_index_source=None,
        pos_schema_name="Universal",
        memory_store={
            'spacy_model': "en_core_web_sm",
            'nlp': None,
            'lang': 'en',
            'pos_column': 'pos_'
        },
    )
    config = Mock(spec=CorpusConfig, pipeline_payload=pipeline_payload)
    pipeline = (
        CorpusPipeline(config=config)
        .set_spacy_model(pipeline_payload.memory_store['spacy_model'])
        .load_text(reader_opts=text_reader_opts, transform_opts=TextTransformOpts())
        .text_to_spacy()
        .passthrough()
        .spacy_to_pos_tagged_frame()
        .checkpoint(tagged_corpus_source, checkpoint_opts=checkpoint_opts, force_checkpoint=True)
        .to_content()
    )

    df_docs = pipeline.resolve()
    assert next(df_docs) is not None
    assert os.path.isfile(tagged_corpus_source)
Example #15
def test_reader_when_source_is_zipped_archive_succeeds():

    expected_documents = [
        ['rödräv', 'hunddjur', 'utbredning', 'halvklot'],
    ]
    expected_names = ["sparv_1978_001.txt"]

    tokens_reader = readers.SparvCsvTokenizer(
        SPARV_ZIPPED_CSV_EXPORT_FILENAME,
        reader_opts=TextReaderOpts(),
        extract_opts=ExtractTaggedTokensOpts(
            pos_includes='|NN|',
            pos_paddings=None,
            lemmatize=True,
            **SPARV_TAGGED_COLUMNS,
        ),
        chunk_size=None,
    )

    for i, (filename, tokens) in enumerate(tokens_reader):

        assert expected_documents[i] == list(tokens)
        assert expected_names[i] == filename
Example #16
def test_corpus_apply_when_looping_through_partition_groups_filter_outs_other_groups(
):

    expected_groups = {
        2019: ['tran_2019_01_test', 'tran_2019_02_test', 'tran_2019_03_test'],
        2020: ['tran_2020_01_test', 'tran_2020_02_test'],
    }

    expected_tokens = {
        2019: [
            [
                'KYRKA',
                'TURIST',
                'HALVMÖRKER',
                'VALV',
                'VALV',
                'ÖVERBLICK',
                'LJUSLÅGA',
                'ÄNGEL',
                'ANSIKTE',
                'KROPP',
                'MÄNNISKA',
                'VALV',
                'VALV',
                'TÅR',
                'PIAZZA',
                'MR',
                'MRS',
                'HERR',
                'SIGNORA',
                'VALV',
                'VALV',
            ],
            [
                'KÖR',
                'NATT',
                'HUS',
                'STRÅLKASTARSKEN',
                'HUS',
                'LADA',
                'FORDON',
                'NU',
                'LIV',
                'MÄNNISKA',
                'DEL',
                'ANLETSDRAG',
                'TRÄNING',
                'EVIGHET',
                'ALLT',
                'SÖMN',
                'BOM',
                'MYSTERIUM',
            ],
            [
                'SKOG',
                'GLÄNTA',
                'GLÄNTA',
                'OMSLUT',
                'SKOG',
                'SJÄLV',
                'STAM',
                'LAV',
                'SKÄGGSTUBB',
                'TRÄD',
                'TOPP',
                'KVIST',
                'LJUS',
                'SKUGGA',
                'SKUGGA',
                'KÄRR',
                'PLATS',
                'GRÄS',
                'STEN',
                'VARA',
                'GRUNDSTEN',
                'HUS',
                'HÄR',
                'UPPLYSNING',
                'NAMN',
                'ARKIV',
                'ARKIV',
                'TRADITION',
                'DÖD',
                'MINNE',
                'ZIGENARSTAMMEN',
                'MEN',
                'TORP',
                'RÖST',
                'VÄRLD',
                'CENTRUM',
                'INVÅNARE',
                'KRÖNIKA',
                'ÖDE',
                'ÅR',
                'TORP',
                'SFINX',
                'GRUNDSTEN',
                'SÄTT',
                'MÅSTE',
                'NU',
                'SNÅR',
                'SIDA',
                'STEG',
                'GÅNGSTIG',
                'KOMMUNIKATIONSNÄT',
                'KRAFTLEDNINGSSTOLPEN',
                'SKALBAGGE',
                'SOL',
                'SKÖLD',
                'FLYGVINGARNA',
                'FALLSKÄRM',
                'EXPERT',
            ],
        ],
        2020: [
            [
                'VRAK',
                'KRETSANDE',
                'PUNKT',
                'STILLHET',
                'HAV',
                'LJUS',
                'BETSEL',
                'TÅNG',
                'STRAND',
                'JORD',
                'MÖRKER',
                'FLADDERMUS',
                'VRAK',
                'STJÄRNA',
            ],
            [
                'ÅR',
                'STÖVEL',
                'SOL',
                'TRÄD',
                'VIND',
                'FRIHET',
                'BERG',
                'FOT',
                'BARRSKOGSBRÄNNINGEN',
                'MEN',
                'SOMMAR',
                'DYNING',
                'TRÄD',
                'TOPP',
                'ÖGONBLICK',
                'KUST',
            ],
        ],
    }

    corpus = SparvTokenizedCsvCorpus(
        SPARV_ZIPPED_CSV_EXPORT_FILENAME,
        reader_opts=TextReaderOpts(filename_fields="year:_:1", ),
        extract_opts=ExtractTaggedTokensOpts(lemmatize=True,
                                             pos_includes='|NN|',
                                             pos_paddings=None,
                                             **SPARV_TAGGED_COLUMNS),
        transform_opts=TokensTransformOpts(
            min_len=2,
            to_upper=True,
        ),
    )

    partitions = corpus.partition_documents('year')

    for key in partitions:

        corpus.reader.apply_filter(partitions[key])
        assert expected_groups[key] == corpus.document_names

        tokens = [x for x in corpus.terms]
        assert expected_tokens[key] == tokens
Example #17
def test_create_text_tokenizer_smoke_test():
    reader = TextTokenizer(TEST_CORPUS_FILENAME, reader_opts=TextReaderOpts())
    assert reader is not None
    assert next(reader) is not None
Example #18
def test_zip_text_iterator_interface():

    assert issubclass(readers.ZipTextIterator, ICorpusReader)
    instance = readers.ZipTextIterator(TEST_CORPUS_FILENAME,
                                       reader_opts=TextReaderOpts())
    assert isinstance(instance, ICorpusReader)
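Beyond the interface check, a rough usage sketch (an assumption, not project code): ZipTextIterator is taken here to yield (filename, text) pairs when iterated, like the other ICorpusReader sources in these examples.

# Hypothetical sketch: stream documents straight out of the zipped test corpus.
for filename, text in readers.ZipTextIterator(TEST_CORPUS_FILENAME, reader_opts=TextReaderOpts()):
    print(filename, len(text))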
Example #19
def ComputeOptsSpacyCSV(
    *,
    corpus_tag: str = 'MARS',
    corpus_source: str = './tests/test_data/legal_instrument_five_docs_test.zip',
) -> ComputeOpts:  # pylint: disable=too-many-instance-attributes

    return ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder="./tests/output",
        corpus_type=CorpusType.SpacyCSV,
        # pos_scheme: PoS_Tag_Scheme = PoS_Tag_Schemes.Universal
        transform_opts=TokensTransformOpts(
            extra_stopwords=[],
            keep_numerals=True,
            keep_symbols=True,
            language='english',
            max_len=None,
            min_len=1,
            only_alphabetic=False,
            only_any_alphanumeric=False,
            remove_accents=False,
            remove_stopwords=True,
            stopwords=None,
            to_lower=True,
            to_upper=False,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_fields=['year:_:1'],
            index_field=None,  # use filename
            as_binary=False,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            lemmatize=True,
            target_override=None,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_paddings=None,
            pos_excludes='|PUNCT|EOL|SPACE|',
            passthrough_tokens=[],
            block_tokens=[],
            append_pos=False,
            global_tf_threshold=1,
            global_tf_threshold_mask=False,
            **SPACY_TAGGED_COLUMNS,
            filter_opts=dict(
                is_alpha=False,
                is_punct=False,
                is_digit=None,
                is_stop=None,
                is_space=False,
            ),
        ),
        create_subfolder=False,
        persist=True,
        context_opts=ContextOpts(
            context_width=4,
            concept=set(),
            ignore_concept=False,
            partition_keys=['document_id'],
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            max_df=1.0,
            min_df=1,
            min_tf=1,
            max_tokens=None,
        ),
    )
Example #20
def test_tokenize_when_vb_lemmatized_upper_returns_correct_tokens():

    expected = [
        (
            'tran_2019_01_test.txt',
            [
                'tränga',
                'gapa',
                'fladdra',
                'omfamna',
                'viska',
                'skämmas',
                'vara',
                'öppna',
                'bli',
                'vara',
                'skola',
                'öppna',
            ],
        ),
        (
            'tran_2019_02_test.txt',
            [
                'stiga',
                'vara',
                'vilja',
                'dricka',
                'skylta',
                'vara',
                'iklä',
                'sova',
                'kunna',
                'sova',
                'ha',
                'föra',
                'våga',
                'släppa',
                'vara',
                'vila',
                'dra',
            ],
        ),
        (
            'tran_2019_03_test.txt',
            [
                'finna',
                'kunna',
                'hitta',
                'gå',
                'vara',
                'kväva',
                'sammanskruvade',
                'vara',
                'ända',
                'vidröra',
                'ruva',
                'växa',
                'öppna',
                'vara',
                'ligga',
                'måste',
                'leva',
                'kunna',
                'ge',
                'finna',
                'öppna',
                'vara',
                'hålla',
                'vara',
                'minna',
                'glömma',
                'anteckna',
                'glömma',
                'sorla',
                'vara',
                'dö',
                'flytta',
                'upphöra',
                'stå',
                'bli',
                'vara',
                'ha',
                'vara',
                'gå',
                'dyka',
                'gå',
                'tränga',
                'stiga',
                'glesna',
                'ljusna',
                'bli',
                'smyga',
                'vara',
                'sitta',
                'ligga',
                'hopvecklade',
            ],
        ),
        ('tran_2020_01_test.txt',
         ['rulla', 'tugga', 'frusta', 'hölja', 'pejla', 'stanna', 'bli']),
        (
            'tran_2020_02_test.txt',
            [
                'sparka', 'klänga', 'lövas', 'fylla', 'segla', 'stå', 'komma',
                'dra', 'vila', 'sjunka', 'stå'
            ],
        ),
    ]

    corpus = SparvTokenizedCsvCorpus(
        SPARV_ZIPPED_CSV_EXPORT_FILENAME,
        extract_opts=ExtractTaggedTokensOpts(pos_includes='|VB|',
                                             pos_paddings=None,
                                             lemmatize=True,
                                             **SPARV_TAGGED_COLUMNS),
        reader_opts=TextReaderOpts(),
        chunk_size=None,
        transform_opts=TokensTransformOpts(to_lower=True, ),
    )

    for i, (filename, tokens) in enumerate(corpus):

        assert filename == expected[i][0]
        assert tokens == expected[i][1]
Example #21
def __init__(self):
    # tran_2019_02_test.txt
    meta_fields = ["year:_:1", "year_serial_id:_:2"]
    super().__init__('./tests/test_data/tranströmer.txt',
                     TextReaderOpts(filename_fields=meta_fields))
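The field specifications above ("year:_:1", "year_serial_id:_:2") read as name:separator:position extractors applied to the file name, which is what the # tran_2019_02_test.txt comment hints at. A small stand-alone sketch of that reading (a hypothetical helper, not part of penelope):

def parse_filename_field(filename: str, spec: str) -> tuple:
    # Hypothetical: 'year:_:1' -> split the stem 'tran_2019_02_test' on '_' and take token 1.
    name, sep, index = spec.split(':')
    stem = filename.rsplit('.', 1)[0]
    return name, stem.split(sep)[int(index)]

print(parse_filename_field('tran_2019_02_test.txt', 'year:_:1'))            # ('year', '2019')
print(parse_filename_field('tran_2019_02_test.txt', 'year_serial_id:_:2'))  # ('year_serial_id', '02')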
Example #22
def test_spacy_pipeline_load_text_to_spacy_to_dataframe_to_tokens_resolves(
        en_nlp):
    pytest.importorskip("spacy")

    reader_opts = TextReaderOpts(filename_pattern="*.txt",
                                 filename_fields="year:_:1")
    text_transform_opts = TextTransformOpts()
    reader = TextReader.create(MARY_TEST_CORPUS,
                               reader_opts=reader_opts,
                               transform_opts=text_transform_opts)

    config = Mock(
        spec=CorpusConfig,
        pipeline_payload=PipelinePayload(source=reader).put2(
            **SPACY_TAGGED_COLUMNS),
    )
    attributes = ['text', 'lemma_', 'pos_']
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|VERB|NOUN|',
        pos_paddings='|ADJ|',
        **SPACY_TAGGED_COLUMNS,
        filter_opts=dict(is_punct=False),
    )
    transform_opts = None

    pipeline = (
        CorpusPipeline(config=config)
        .load_text(reader_opts=reader_opts)
        .set_spacy_model(en_nlp)
        .text_to_spacy()
        .spacy_to_tagged_frame(attributes=attributes)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=transform_opts)
    )

    payloads = [x.content for x in pipeline.resolve()]

    assert payloads == [
        ['*', 'sea', 'ocean', 'life'],
        ['atmosphere', 'blow'],
        ['*', 'activity', 'surface', 'cease'],
        ['*', 'planet'],
        ['volcano', 'erupt', 'year'],
        ['eruption', 'occur', 'year', 'region', 'call'],
        ['know', '*', 'eruption'],
        ['volcano', 'erupt', 'surface', '*', 'interval'],
    ]

    assert set(list(pipeline.payload.document_index.columns)) == set([
        'filename',
        'year',
        'document_id',
        'document_name',
        'Adverb',
        'Conjunction',
        'Delimiter',
        'Noun',
        'Other',
        'Preposition',
        'n_tokens',
        'n_raw_tokens',
        'Pronoun',
        'Verb',
        'Adjective',
        'Numeral',
    ])