def create_tokens_reader(
    source_path=TEST_CORPUS_FILENAME,
    as_binary: bool = False,
    filename_fields=None,
    index_field=None,
    filename_filter: str = None,
    filename_pattern: str = "*.txt",
    fix_hyphenation: bool = True,
    fix_whitespaces: bool = False,
    chunk_size: int = None,
    tokenize: Callable = None,
) -> TextTokenizer:
    reader_opts = TextReaderOpts(
        filename_pattern=filename_pattern,
        filename_filter=filename_filter,
        filename_fields=filename_fields,
        index_field=index_field,
        as_binary=as_binary,
    )
    transform_opts = TextTransformOpts(fix_whitespaces=fix_whitespaces, fix_hyphenation=fix_hyphenation)
    reader = TextTokenizer(
        source_path,
        reader_opts=reader_opts,
        transform_opts=transform_opts,
        tokenize=tokenize,
        chunk_size=chunk_size,
    )
    return reader
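# Illustrative sketch (not an original fixture): how the factory above might be used.
# It assumes, as the TextTokenizer smoke test further down does, that the reader can
# be advanced with next() and yields one document per step; the (filename, tokens)
# unpacking mirrors the other readers in this suite and is an assumption here.
def _example_create_tokens_reader_usage():
    reader = create_tokens_reader(fix_whitespaces=True, chunk_size=None)
    filename, tokens = next(reader)  # assumed reader protocol: one (filename, tokens) pair per step
    return filename, tokens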
def test_load_data_frame_succeeds():
    pipeline = Mock(
        spec=CorpusPipeline,
        **{
            'payload.set_reader_index': monkey_patch,
        },
    )
    prior = MagicMock(spec=ITask, outstream=lambda: fake_data_frame_stream(1))
    task: tasks.LoadTaggedCSV = tasks.LoadTaggedCSV(
        pipeline=pipeline,
        filename="dummy.zip",
        prior=prior,
        extra_reader_opts=TextReaderOpts(),
        checkpoint_opts=CheckpointOpts(feather_folder=None),
    )
    task.register_pos_counts = lambda _: task
    fake_data: CheckpointData = patch_load_archive()
    fake_data.create_stream = lambda: fake_data_frame_stream(2)
    task.load_archive = lambda: fake_data
    task.setup()
    for payload in task.outstream():
        assert payload.content_type == ContentType.TAGGED_FRAME
def test_sparv_csv_tokenizer_interface():
    assert issubclass(readers.SparvCsvTokenizer, ICorpusReader)
    instance = readers.SparvCsvTokenizer(SPARV_CSV_EXPORT_FILENAME_SMALL, reader_opts=TextReaderOpts())
    assert isinstance(instance, ICorpusReader)
def reader_opts():
    return TextReaderOpts(
        filename_fields=["file_id:_:2", "year:_:1"],
        index_field=None,
        filename_filter=None,
        filename_pattern="*.txt",
        as_binary=False,
    )
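# Note (an assumption drawn from the filenames used elsewhere in these tests, not a
# documented contract): a field spec such as "year:_:1" appears to mean "split the
# filename on '_' and take part 1 as the field value", so a document named
# "tran_2019_01_test.txt" would be indexed with year=2019 and file_id="01".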
def very_simple_corpus(data: List[Tuple[str, List[str]]]) -> TokenizedCorpus:
    reader = tng.CorpusReader(
        source=tng.InMemorySource(data),
        reader_opts=TextReaderOpts(filename_fields="year:_:1"),
        transformer=None,  # already tokenized
    )
    corpus = TokenizedCorpus(reader=reader)
    return corpus
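# Hypothetical usage (sample data invented for illustration): each item is a
# (filename, tokens) pair, and the filename must carry the year expected by the
# "year:_:1" field spec above.
def _example_very_simple_corpus() -> TokenizedCorpus:
    return very_simple_corpus(
        [
            ("doc_2019_01.txt", ["mary", "had", "a", "lamb"]),
            ("doc_2020_01.txt", ["its", "fleece", "was", "white"]),
        ]
    )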
def create_test_corpus() -> SparvTokenizedCsvCorpus:
    corpus = SparvTokenizedCsvCorpus(
        SPARV_ZIPPED_CSV_EXPORT_FILENAME,
        reader_opts=TextReaderOpts(filename_fields="year:_:1"),
        extract_opts=ExtractTaggedTokensOpts(lemmatize=True, **SPARV_TAGGED_COLUMNS),
    )
    return corpus
def ComputeOptsSparvCSV(
    *,
    corpus_tag: str = 'TELLUS',
    corpus_source: str = './tests/test_data/tranströmer_corpus_export.sparv4.csv.zip',
) -> ComputeOpts:  # pylint: disable=too-many-instance-attributes
    return ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder="./tests/output",
        corpus_type=CorpusType.SparvCSV,
        transform_opts=TokensTransformOpts(
            to_lower=True,
            min_len=1,
            remove_stopwords=None,
            keep_symbols=True,
            keep_numerals=True,
            only_alphabetic=False,
            only_any_alphanumeric=False,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_fields=('year:_:1',),
            index_field=None,  # use filename
            as_binary=False,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            pos_includes=None,
            pos_excludes='|MAD|MID|PAD|',
            pos_paddings=None,
            lemmatize=False,
            **SPARV_TAGGED_COLUMNS,
            filter_opts=dict(
                is_alpha=False,
                is_punct=False,
                is_digit=None,
                is_stop=None,
                is_space=False,
            ),
        ),
        create_subfolder=False,
        persist=True,
        context_opts=ContextOpts(
            concept=('jag',),
            context_width=2,
            partition_keys=['document_id'],
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=VectorizeOpts(already_tokenized=True, min_tf=1, max_tokens=None),
    )
def test_spacy_pipeline_load_text_to_spacy_doc_resolves(en_nlp):
    pytest.importorskip("spacy")
    reader_opts = TextReaderOpts(filename_pattern="*.txt", filename_fields="year:_:1")
    source = dummy_source()
    config = Mock(spec=CorpusConfig, pipeline_payload=PipelinePayload(source=source).put2(pos_column="pos_"))
    pipeline = CorpusPipeline(config=config).set_spacy_model(en_nlp).load_text(reader_opts=reader_opts).text_to_spacy()
    payloads = [x.content for x in pipeline.resolve()]
    assert all(isinstance(x, spacy_api.Doc) for x in payloads)
def test_spacy_pipeline_load_text_resolves():
    reader_opts = TextReaderOpts(filename_pattern="*.txt", filename_fields="year:_:1")
    source = dummy_source()
    config = Mock(spec=CorpusConfig, pipeline_payload=PipelinePayload(source=source))
    pipeline = CorpusPipeline(config=config).load_text(reader_opts=reader_opts)
    payloads = [x.content for x in pipeline.resolve()]
    assert payloads == [x[1] for x in source]
    assert len(pipeline.payload.document_index) == len(source)
    assert all(pipeline.payload.document_index.filename == [x[0] for x in source])
def test_reader_when_lemmatized_nn_vb_returns_lemmatized_nn_vb():
    tokens_reader = readers.SparvCsvTokenizer(
        source=SPARV_CSV_EXPORT_FILENAME_SMALL,
        reader_opts=TextReaderOpts(),
        extract_opts=ExtractTaggedTokensOpts(
            pos_includes='NN|VB',
            pos_paddings=None,
            pos_excludes=None,
            lemmatize=True,
            **SPARV_TAGGED_COLUMNS,
        ),
    )
    expected = "rödräv vara hunddjur ha utbredning halvklot".split()
    filename, tokens = next(tokens_reader)
    assert filename == os.path.split(filename)[1]
    assert expected == tokens
def test_spacy_pipeline_load_text_to_spacy_to_dataframe_resolves(en_nlp):
    pytest.importorskip("spacy")
    reader_opts = TextReaderOpts(filename_pattern="*.txt", filename_fields="year:_:1")
    reader = TextReader.create(MARY_TEST_CORPUS, reader_opts=reader_opts)
    config = Mock(spec=CorpusConfig, pipeline_payload=PipelinePayload(source=reader).put2(pos_column="pos_"))
    attributes = ['text', 'lemma_', 'pos_']
    pipeline = (
        CorpusPipeline(config=config)
        .set_spacy_model(en_nlp)
        .load_text(reader_opts=reader_opts)
        .text_to_spacy()
        .spacy_to_tagged_frame(attributes=attributes)
    )
    payloads = [x.content for x in pipeline.resolve()]
    assert all(isinstance(x, pd.DataFrame) for x in payloads)
    assert all(x.columns.tolist() == attributes for x in payloads)
def test_reader_when_no_transforms_returns_source_tokens():
    tokens_reader = readers.SparvCsvTokenizer(
        source=SPARV_CSV_EXPORT_FILENAME_SMALL,
        reader_opts=TextReaderOpts(),
        extract_opts=ExtractTaggedTokensOpts(
            pos_includes=None,
            pos_paddings=None,
            pos_excludes=None,
            lemmatize=False,
            **SPARV_TAGGED_COLUMNS,
        ),
    )
    expected = "Rödräven är ett hunddjur som har en mycket vidsträckt utbredning över norra halvklotet .".split()
    filename, tokens = next(tokens_reader)
    assert filename == os.path.split(filename)[1]
    assert expected == tokens
def test_spacy_pipeline_load_text_to_spacy_to_dataframe_to_tokens_to_text_to_dtm(en_nlp):
    pytest.importorskip("spacy")
    reader_opts = TextReaderOpts(filename_pattern="*.txt", filename_fields="year:_:1")
    text_transform_opts = TextTransformOpts()
    reader = TextReader.create(MARY_TEST_CORPUS, reader_opts=reader_opts, transform_opts=text_transform_opts)
    attributes = ['text', 'lemma_', 'pos_', 'is_punct']
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|VERB|NOUN|',
        pos_paddings=None,
        **SPACY_TAGGED_COLUMNS,
        filter_opts=dict(is_punct=False),
    )
    transform_opts = None
    vectorize_opts = VectorizeOpts()
    config = Mock(
        spec=CorpusConfig,
        pipeline_payload=PipelinePayload(source=reader).put2(**SPACY_TAGGED_COLUMNS),
    )
    pipeline = (
        CorpusPipeline(config=config)
        .load_text(reader_opts=reader_opts, transform_opts=text_transform_opts)
        .set_spacy_model(en_nlp)
        .text_to_spacy()
        .spacy_to_tagged_frame(attributes=attributes)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=transform_opts)
        .tokens_to_text()
        .to_dtm(vectorize_opts)
    )
    corpus = pipeline.value()
    assert corpus is not None
    assert isinstance(corpus, VectorizedCorpus)
def test_spacy_pipeline(checkpoint_opts: CheckpointOpts):
    tagged_corpus_source = os.path.join(TEST_OUTPUT_FOLDER, "checkpoint_mary_lamb_pos_csv.zip")
    pathlib.Path(tagged_corpus_source).unlink(missing_ok=True)
    text_reader_opts = TextReaderOpts(
        filename_fields=["doc_id:_:2", "year:_:1"],
        index_field=None,  # use filename
        filename_filter=None,
        filename_pattern="*.txt",
        as_binary=False,
    )
    pipeline_payload = PipelinePayload(
        source=TEST_CORPUS,
        document_index_source=None,
        pos_schema_name="Universal",
        memory_store={
            'spacy_model': "en_core_web_sm",
            'nlp': None,
            'lang': 'en',
            'pos_column': 'pos_',
        },
    )
    config = Mock(spec=CorpusConfig, pipeline_payload=pipeline_payload)
    pipeline = (
        CorpusPipeline(config=config)
        .set_spacy_model(pipeline_payload.memory_store['spacy_model'])
        .load_text(reader_opts=text_reader_opts, transform_opts=TextTransformOpts())
        .text_to_spacy()
        .passthrough()
        .spacy_to_pos_tagged_frame()
        .checkpoint(tagged_corpus_source, checkpoint_opts=checkpoint_opts, force_checkpoint=True)
        .to_content()
    )
    df_docs = pipeline.resolve()
    assert next(df_docs) is not None
    assert os.path.isfile(tagged_corpus_source)
def test_reader_when_source_is_zipped_archive_succeeds():
    expected_documents = [
        ['rödräv', 'hunddjur', 'utbredning', 'halvklot'],
    ]
    expected_names = ["sparv_1978_001.txt"]
    tokens_reader = readers.SparvCsvTokenizer(
        SPARV_ZIPPED_CSV_EXPORT_FILENAME,
        reader_opts=TextReaderOpts(),
        extract_opts=ExtractTaggedTokensOpts(
            pos_includes='|NN|',
            pos_paddings=None,
            lemmatize=True,
            **SPARV_TAGGED_COLUMNS,
        ),
        chunk_size=None,
    )
    for i, (filename, tokens) in enumerate(tokens_reader):
        assert expected_documents[i] == list(tokens)
        assert expected_names[i] == filename
def test_corpus_apply_when_looping_through_partition_groups_filter_outs_other_groups():
    expected_groups = {
        2019: ['tran_2019_01_test', 'tran_2019_02_test', 'tran_2019_03_test'],
        2020: ['tran_2020_01_test', 'tran_2020_02_test'],
    }
    expected_tokens = {
        2019: [
            [
                'KYRKA', 'TURIST', 'HALVMÖRKER', 'VALV', 'VALV', 'ÖVERBLICK', 'LJUSLÅGA', 'ÄNGEL',
                'ANSIKTE', 'KROPP', 'MÄNNISKA', 'VALV', 'VALV', 'TÅR', 'PIAZZA', 'MR', 'MRS', 'HERR',
                'SIGNORA', 'VALV', 'VALV',
            ],
            [
                'KÖR', 'NATT', 'HUS', 'STRÅLKASTARSKEN', 'HUS', 'LADA', 'FORDON', 'NU', 'LIV',
                'MÄNNISKA', 'DEL', 'ANLETSDRAG', 'TRÄNING', 'EVIGHET', 'ALLT', 'SÖMN', 'BOM',
                'MYSTERIUM',
            ],
            [
                'SKOG', 'GLÄNTA', 'GLÄNTA', 'OMSLUT', 'SKOG', 'SJÄLV', 'STAM', 'LAV', 'SKÄGGSTUBB',
                'TRÄD', 'TOPP', 'KVIST', 'LJUS', 'SKUGGA', 'SKUGGA', 'KÄRR', 'PLATS', 'GRÄS', 'STEN',
                'VARA', 'GRUNDSTEN', 'HUS', 'HÄR', 'UPPLYSNING', 'NAMN', 'ARKIV', 'ARKIV', 'TRADITION',
                'DÖD', 'MINNE', 'ZIGENARSTAMMEN', 'MEN', 'TORP', 'RÖST', 'VÄRLD', 'CENTRUM', 'INVÅNARE',
                'KRÖNIKA', 'ÖDE', 'ÅR', 'TORP', 'SFINX', 'GRUNDSTEN', 'SÄTT', 'MÅSTE', 'NU', 'SNÅR',
                'SIDA', 'STEG', 'GÅNGSTIG', 'KOMMUNIKATIONSNÄT', 'KRAFTLEDNINGSSTOLPEN', 'SKALBAGGE',
                'SOL', 'SKÖLD', 'FLYGVINGARNA', 'FALLSKÄRM', 'EXPERT',
            ],
        ],
        2020: [
            [
                'VRAK', 'KRETSANDE', 'PUNKT', 'STILLHET', 'HAV', 'LJUS', 'BETSEL', 'TÅNG', 'STRAND',
                'JORD', 'MÖRKER', 'FLADDERMUS', 'VRAK', 'STJÄRNA',
            ],
            [
                'ÅR', 'STÖVEL', 'SOL', 'TRÄD', 'VIND', 'FRIHET', 'BERG', 'FOT', 'BARRSKOGSBRÄNNINGEN',
                'MEN', 'SOMMAR', 'DYNING', 'TRÄD', 'TOPP', 'ÖGONBLICK', 'KUST',
            ],
        ],
    }
    corpus = SparvTokenizedCsvCorpus(
        SPARV_ZIPPED_CSV_EXPORT_FILENAME,
        reader_opts=TextReaderOpts(filename_fields="year:_:1"),
        extract_opts=ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NN|',
            pos_paddings=None,
            **SPARV_TAGGED_COLUMNS,
        ),
        transform_opts=TokensTransformOpts(
            min_len=2,
            to_upper=True,
        ),
    )
    partitions = corpus.partition_documents('year')
    for key in partitions:
        corpus.reader.apply_filter(partitions[key])
        assert expected_groups[key] == corpus.document_names
        tokens = [x for x in corpus.terms]
        assert expected_tokens[key] == tokens
def test_create_text_tokenizer_smoke_test():
    reader = TextTokenizer(TEST_CORPUS_FILENAME, reader_opts=TextReaderOpts())
    assert reader is not None
    assert next(reader) is not None
def test_zip_text_iterator_interface():
    assert issubclass(readers.ZipTextIterator, ICorpusReader)
    instance = readers.ZipTextIterator(TEST_CORPUS_FILENAME, reader_opts=TextReaderOpts())
    assert isinstance(instance, ICorpusReader)
def ComputeOptsSpacyCSV(
    *,
    corpus_tag: str = 'MARS',
    corpus_source: str = './tests/test_data/legal_instrument_five_docs_test.zip',
) -> ComputeOpts:  # pylint: disable=too-many-instance-attributes
    return ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder="./tests/output",
        corpus_type=CorpusType.SpacyCSV,
        # pos_scheme: PoS_Tag_Scheme = PoS_Tag_Schemes.Universal
        transform_opts=TokensTransformOpts(
            extra_stopwords=[],
            keep_numerals=True,
            keep_symbols=True,
            language='english',
            max_len=None,
            min_len=1,
            only_alphabetic=False,
            only_any_alphanumeric=False,
            remove_accents=False,
            remove_stopwords=True,
            stopwords=None,
            to_lower=True,
            to_upper=False,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_fields=['year:_:1'],
            index_field=None,  # use filename
            as_binary=False,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            lemmatize=True,
            target_override=None,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_paddings=None,
            pos_excludes='|PUNCT|EOL|SPACE|',
            passthrough_tokens=[],
            block_tokens=[],
            append_pos=False,
            global_tf_threshold=1,
            global_tf_threshold_mask=False,
            **SPACY_TAGGED_COLUMNS,
            filter_opts=dict(
                is_alpha=False,
                is_punct=False,
                is_digit=None,
                is_stop=None,
                is_space=False,
            ),
        ),
        create_subfolder=False,
        persist=True,
        context_opts=ContextOpts(
            context_width=4,
            concept=set(),
            ignore_concept=False,
            partition_keys=['document_id'],
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            max_df=1.0,
            min_df=1,
            min_tf=1,
            max_tokens=None,
        ),
    )
def test_tokenize_when_vb_lemmatized_upper_returns_correct_tokens():
    expected = [
        (
            'tran_2019_01_test.txt',
            [
                'tränga', 'gapa', 'fladdra', 'omfamna', 'viska', 'skämmas', 'vara', 'öppna', 'bli',
                'vara', 'skola', 'öppna',
            ],
        ),
        (
            'tran_2019_02_test.txt',
            [
                'stiga', 'vara', 'vilja', 'dricka', 'skylta', 'vara', 'iklä', 'sova', 'kunna', 'sova',
                'ha', 'föra', 'våga', 'släppa', 'vara', 'vila', 'dra',
            ],
        ),
        (
            'tran_2019_03_test.txt',
            [
                'finna', 'kunna', 'hitta', 'gå', 'vara', 'kväva', 'sammanskruvade', 'vara', 'ända',
                'vidröra', 'ruva', 'växa', 'öppna', 'vara', 'ligga', 'måste', 'leva', 'kunna', 'ge',
                'finna', 'öppna', 'vara', 'hålla', 'vara', 'minna', 'glömma', 'anteckna', 'glömma',
                'sorla', 'vara', 'dö', 'flytta', 'upphöra', 'stå', 'bli', 'vara', 'ha', 'vara', 'gå',
                'dyka', 'gå', 'tränga', 'stiga', 'glesna', 'ljusna', 'bli', 'smyga', 'vara', 'sitta',
                'ligga', 'hopvecklade',
            ],
        ),
        (
            'tran_2020_01_test.txt',
            ['rulla', 'tugga', 'frusta', 'hölja', 'pejla', 'stanna', 'bli'],
        ),
        (
            'tran_2020_02_test.txt',
            [
                'sparka', 'klänga', 'lövas', 'fylla', 'segla', 'stå', 'komma', 'dra', 'vila', 'sjunka',
                'stå',
            ],
        ),
    ]
    corpus = SparvTokenizedCsvCorpus(
        SPARV_ZIPPED_CSV_EXPORT_FILENAME,
        extract_opts=ExtractTaggedTokensOpts(
            pos_includes='|VB|',
            pos_paddings=None,
            lemmatize=True,
            **SPARV_TAGGED_COLUMNS,
        ),
        reader_opts=TextReaderOpts(),
        chunk_size=None,
        transform_opts=TokensTransformOpts(to_lower=True),
    )
    for i, (filename, tokens) in enumerate(corpus):
        assert filename == expected[i][0]
        assert tokens == expected[i][1]
def __init__(self):
    # tran_2019_02_test.txt
    meta_fields = ["year:_:1", "year_serial_id:_:2"]
    super().__init__('./tests/test_data/tranströmer.txt', TextReaderOpts(filename_fields=meta_fields))
def test_spacy_pipeline_load_text_to_spacy_to_dataframe_to_tokens_resolves(en_nlp):
    pytest.importorskip("spacy")
    reader_opts = TextReaderOpts(filename_pattern="*.txt", filename_fields="year:_:1")
    text_transform_opts = TextTransformOpts()
    reader = TextReader.create(MARY_TEST_CORPUS, reader_opts=reader_opts, transform_opts=text_transform_opts)
    config = Mock(
        spec=CorpusConfig,
        pipeline_payload=PipelinePayload(source=reader).put2(**SPACY_TAGGED_COLUMNS),
    )
    attributes = ['text', 'lemma_', 'pos_']
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|VERB|NOUN|',
        pos_paddings='|ADJ|',
        **SPACY_TAGGED_COLUMNS,
        filter_opts=dict(is_punct=False),
    )
    transform_opts = None
    pipeline = (
        CorpusPipeline(config=config)
        .load_text(reader_opts=reader_opts)
        .set_spacy_model(en_nlp)
        .text_to_spacy()
        .spacy_to_tagged_frame(attributes=attributes)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=transform_opts)
    )
    payloads = [x.content for x in pipeline.resolve()]
    assert payloads == [
        ['*', 'sea', 'ocean', 'life'],
        ['atmosphere', 'blow'],
        ['*', 'activity', 'surface', 'cease'],
        ['*', 'planet'],
        ['volcano', 'erupt', 'year'],
        ['eruption', 'occur', 'year', 'region', 'call'],
        ['know', '*', 'eruption'],
        ['volcano', 'erupt', 'surface', '*', 'interval'],
    ]
    assert set(list(pipeline.payload.document_index.columns)) == set(
        [
            'filename', 'year', 'document_id', 'document_name', 'Adverb', 'Conjunction', 'Delimiter',
            'Noun', 'Other', 'Preposition', 'n_tokens', 'n_raw_tokens', 'Pronoun', 'Verb', 'Adjective',
            'Numeral',
        ]
    )