Example #1
def test_sparv_extract_and_store_when_only_nouns_and_source_is_sparv3_succeeds(
):

    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    target_filename = os.path.join(OUTPUT_FOLDER, f'{uuid.uuid1()}.zip')

    sparv_corpus.sparv_xml_extract_and_store(
        SPARV3_ZIPPED_XML_EXPORT_FILENAME,
        target_filename,
        version=3,
        extract_opts=ExtractTaggedTokensOpts(pos_includes='|NN|',
                                             pos_paddings=None,
                                             lemmatize=False),
        transform_opts=TokensTransformOpts(to_lower=True,
                                           min_len=2,
                                           stopwords=['<text>']),
    )

    expected_document_start = "utredningar justitiedepartementet förslag utlänningslag angående om- händertagande förläggning års gere ide to lm \rstatens utredningar förteckning betänkande förslag utlänningslag lag omhändertagande utlänning anstalt förläggning tryckort tryckorten bokstäverna fetstil begynnelse- bokstäverna departement"

    test_filename = "sou_1945_1.txt"

    content = zip_utils.read_file_content(zip_or_filename=target_filename,
                                          filename=test_filename,
                                          as_binary=False)

    assert content.startswith(expected_document_start)

    os.remove(target_filename)
Example #2
    def test_to_dataframe_of_term_matrix_gives_expected_result(self):

        # Arrange
        reader = PandasCorpusReader(self.create_test_dataframe())
        corpus = TokenizedCorpus(
            reader,
            # Pre-compute transform options:
            transform_opts=TokensTransformOpts(
                only_any_alphanumeric=False,
                to_lower=False,
                remove_accents=False,
                min_len=1,
                max_len=None,
                keep_numerals=False,
            ),
        )

        term_term_matrix = CorpusVectorizer().fit_transform(corpus, already_tokenized=True).co_occurrence_matrix()

        # Act
        id2w = corpus.id2token.get
        co_occurrences = term_term_matrix_to_co_occurrences(term_term_matrix, threshold_count=1, ignore_ids=set())
        co_occurrences['w1'] = co_occurrences.w1_id.apply(id2w)
        co_occurrences['w2'] = co_occurrences.w2_id.apply(id2w)

        # Assert
        assert 2 == int(co_occurrences[((co_occurrences.w1 == 'A') & (co_occurrences.w2 == 'B'))].value)
        assert 0 == len(co_occurrences[((co_occurrences.w1 == 'C') & (co_occurrences.w2 == 'F'))])
Example #3
def test_reader_store_result():

    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    expected_documents = [
        ['rödräv', 'hunddjur', 'utbredning', 'halvklot'],
        [
            'fjällräv', 'fjällvärld', 'liv', 'fjällräv', 'vinter', 'men',
            'variant', 'år'
        ],
    ]
    expected_names = ["document_001.txt", "document_002.txt"]

    target_filename = os.path.join(OUTPUT_FOLDER,
                                   'test_reader_store_result.zip')

    sparv_corpus.sparv_xml_extract_and_store(
        SPARV_ZIPPED_XML_EXPORT_FILENAME,
        target_filename,
        version=4,
        extract_opts=ExtractTaggedTokensOpts(pos_includes='|NN|',
                                             pos_paddings=None,
                                             lemmatize=True),
        transform_opts=TokensTransformOpts(to_lower=True),
    )

    for i in range(0, len(expected_names)):

        content = zip_utils.read_file_content(zip_or_filename=target_filename,
                                              filename=expected_names[i],
                                              as_binary=False)

        assert ' '.join(expected_documents[i]) == content

    os.remove(target_filename)
Example #4
def test_n_tokens_when_exhausted_iterater_returns_expected_count():
    reader = create_reader()
    corpus = corpora.TokenizedCorpus(
        reader,
        transform_opts=TokensTransformOpts(only_any_alphanumeric=False))
    _ = [x for x in corpus]
    n_tokens = list(corpus.document_index.n_tokens)
    expected = [22, 16, 26, 45, 21]
    assert expected == n_tokens
Example #5
def test_next_document_when_only_any_alphanumeric_true_skips_deliminators_using_defaults(
):
    reader = create_tokens_reader(filename_fields=None,
                                  fix_whitespaces=True,
                                  fix_hyphenation=True)
    corpus = corpora.TokenizedCorpus(
        reader, transform_opts=TokensTransformOpts(only_any_alphanumeric=True))
    _, tokens = next(corpus)
    expected = "Tre svarta ekar ur snön Så grova men fingerfärdiga Ur deras väldiga flaskor ska grönskan skumma i vår"
    assert expected.split() == tokens
Example #6
def test_n_tokens_when_exhausted_and_only_any_alphanumeric_is_true_returns_expected_count(
):
    reader = create_tokens_reader(filename_fields=None,
                                  fix_whitespaces=True,
                                  fix_hyphenation=True)
    corpus = corpora.TokenizedCorpus(
        reader, transform_opts=TokensTransformOpts(only_any_alphanumeric=True))
    _ = [x for x in corpus]
    n_tokens = list(corpus.document_index.n_tokens)
    expected = [18, 14, 24, 42, 18]
    assert expected == n_tokens
Example #7
def test_n_tokens_when_exhausted_and_only_any_alphanumeric_min_len_two_returns_expected_count(
):
    reader = create_reader()
    corpus = corpora.TokenizedCorpus(
        reader,
        transform_opts=TokensTransformOpts(only_any_alphanumeric=True,
                                           min_len=2))
    n_expected = [17, 13, 21, 42, 18]
    _ = [x for x in corpus]
    n_tokens = list(corpus.document_index.n_tokens)
    assert n_expected == n_tokens
Example #8
def ComputeOptsSparvCSV(
    *,
    corpus_tag: str = 'TELLUS',
    corpus_source:
    str = './tests/test_data/tranströmer_corpus_export.sparv4.csv.zip',
) -> ComputeOpts:  # pylint: disable=too-many-instance-attributes

    return ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder="./tests/output",
        corpus_type=CorpusType.SparvCSV,
        transform_opts=TokensTransformOpts(
            to_lower=True,
            min_len=1,
            remove_stopwords=None,
            keep_symbols=True,
            keep_numerals=True,
            only_alphabetic=False,
            only_any_alphanumeric=False,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_fields=('year:_:1', ),
            index_field=None,  # use filename
            as_binary=False,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            pos_includes=None,
            pos_excludes='|MAD|MID|PAD|',
            pos_paddings=None,
            lemmatize=False,
            **SPARV_TAGGED_COLUMNS,
            filter_opts=dict(
                is_alpha=False,
                is_punct=False,
                is_digit=None,
                is_stop=None,
                is_space=False,
            ),
        ),
        create_subfolder=False,
        persist=True,
        context_opts=ContextOpts(
            concept=('jag', ),
            context_width=2,
            partition_keys=['document_id'],
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=VectorizeOpts(already_tokenized=True,
                                     min_tf=1,
                                     max_tokens=None),
    )
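The options object returned by ComputeOptsSparvCSV is intended to be handed to a compute workflow. A minimal, illustrative sketch (not part of the original example) that mirrors the wiring shown in Example #28 below; the corpus configuration path is borrowed from the other test fixtures here and is an assumption:

args = ComputeOptsSparvCSV(corpus_tag='TELLUS')
corpus_config = CorpusConfig.load('./tests/test_data/tranströmer.yml')  # assumed fixture path, taken from the other examples
workflow.compute(args=args, corpus_config=corpus_config)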
Example #9
def test_corpus_can_be_reiterated():

    reader = create_tokens_reader(filename_fields=None,
                                  fix_whitespaces=True,
                                  fix_hyphenation=True)

    corpus = corpora.TokenizedCorpus(
        reader, transform_opts=TokensTransformOpts(only_any_alphanumeric=True))
    for _ in range(0, 4):
        n_tokens = [len(x) for x in corpus.terms]
        expected = [18, 14, 24, 42, 18]
        assert expected == n_tokens  # , f"iteration{i}"
Example #10
def test_next_document_when_token_corpus_returns_tokenized_document():
    reader = create_tokens_reader(filename_fields=None,
                                  fix_whitespaces=True,
                                  fix_hyphenation=True)
    corpus = corpora.TokenizedCorpus(
        reader,
        transform_opts=TokensTransformOpts(only_any_alphanumeric=False))
    _, tokens = next(corpus)
    expected = (
        "Tre svarta ekar ur snön . Så grova , men fingerfärdiga . Ur deras väldiga flaskor ska grönskan skumma i vår ."
    )
    assert expected.split() == tokens
Example #11
def test_next_document_when_max_len_is_six_returns_filter_out_longer_words():
    reader = create_reader()
    transform_opts = TokensTransformOpts(only_any_alphanumeric=True,
                                         to_lower=True,
                                         remove_accents=False,
                                         min_len=2,
                                         max_len=6,
                                         keep_numerals=True)
    corpus = corpora.TokenizedCorpus(reader, transform_opts=transform_opts)
    _, tokens = next(corpus)
    expected = "tre svarta ekar ur snön så grova men ur deras ska skumma vår"
    assert expected.split() == tokens
Example #12
def create_corpus():
    reader = create_reader()
    transform_opts = TokensTransformOpts(
        only_any_alphanumeric=True,
        to_lower=True,
        remove_accents=False,
        min_len=2,
        max_len=None,
        keep_numerals=False,
    )
    corpus = TokenizedCorpus(reader, transform_opts=transform_opts)
    return corpus
Example #13
def test_next_document_when_to_lower_is_true_returns_all_lowercase():
    reader = create_reader()
    transform_opts = TokensTransformOpts(only_any_alphanumeric=True,
                                         to_lower=True,
                                         remove_accents=False,
                                         min_len=1,
                                         max_len=None,
                                         keep_numerals=True)
    corpus = corpora.TokenizedCorpus(reader, transform_opts=transform_opts)
    _, tokens = next(corpus)
    expected = "tre svarta ekar ur snön så grova men fingerfärdiga ur deras väldiga flaskor ska grönskan skumma i vår"
    assert expected.split() == tokens
Example #14
def test_get_index_when_extract_passed_returns_expected_count():
    reader = create_reader()
    transform_opts = TokensTransformOpts(
        only_any_alphanumeric=False,
        to_lower=False,
        remove_accents=False,
        min_len=2,
        max_len=None,
        keep_numerals=True,
    )
    corpus = corpora.TokenizedCorpus(reader, transform_opts=transform_opts)
    result = corpus.metadata
    assert 5 == len(result)
Example #15
def test_next_document_when_only_any_alphanumeric_true_skips_deliminators():
    reader = create_reader()
    corpus = corpora.TokenizedCorpus(
        reader,
        transform_opts=TokensTransformOpts(only_any_alphanumeric=True,
                                           to_lower=False,
                                           remove_accents=False,
                                           min_len=1,
                                           keep_numerals=True),
    )
    _, tokens = next(corpus)
    expected = "Tre svarta ekar ur snön Så grova men fingerfärdiga Ur deras väldiga flaskor ska grönskan skumma i vår"
    assert expected.split() == tokens
Example #16
    def test_processed_corpus_token_stream(self):
        df = self.create_test_dataframe()
        reader = PandasCorpusReader(df)
        corpus = TokenizedCorpus(reader, transform_opts=TokensTransformOpts())
        result = [x for x in corpus]
        expected = [
            ('document_0.txt', ['A', 'B', 'C']),
            ('document_1.txt', ['B', 'C', 'D']),
            ('document_2.txt', ['C', 'B']),
            ('document_3.txt', ['A', 'B', 'F']),
            ('document_4.txt', ['E', 'B']),
            ('document_5.txt', ['F', 'E', 'E']),
        ]
        self.assertEqual(expected, result)
Example #17
    def transform_opts(self) -> TokensTransformOpts:

        opts = TokensTransformOpts(
            keep_numerals=True,
            keep_symbols=True,
            language=self._config.language,
            max_len=None,
            min_len=1,
            only_alphabetic=self._only_alphabetic.value,
            only_any_alphanumeric=self._only_any_alphanumeric.value,
            remove_accents=False,
            remove_stopwords=self._remove_stopwords.value,
            stopwords=None,
            to_lower=self._to_lowercase.value,
            to_upper=False,
        )

        if self._extra_stopwords.value.strip() != '':
            _words = [x for x in map(str.strip, self._extra_stopwords.value.strip().split()) if x != '']
            if len(_words) > 0:
                opts.extra_stopwords = _words

        return opts
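For comparison, the same options can also be built in a single call instead of assigning extra_stopwords afterwards. A minimal sketch with assumed values (the language and stopword lists are placeholders, not taken from the widget above):

opts = TokensTransformOpts(
    to_lower=True,
    remove_stopwords=True,
    language='swedish',              # assumed value; the widget reads it from its config
    extra_stopwords=['och', 'att'],  # hypothetical extra stopwords
)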
Example #18
def test_predict_topics(method: str):

    """Train a model that will be used in prediction"""

    target_folder: str = './tests/output'
    train_target_name: str = f'train_{str(uuid.uuid1())[:8]}'
    payload: DocumentPayload = tranströmer_topic_model_payload(
        method=method, target_folder=target_folder, target_name=train_target_name
    )
    model_folder: str = os.path.join(payload.content.get("target_folder"), payload.content.get("target_name"))

    """Predict using trained model"""

    config: CorpusConfig = CorpusConfig.load('./tests/test_data/tranströmer.yml')
    corpus_source: str = './tests/test_data/tranströmer_corpus_pos_csv.zip'
    minimum_probability: float = 0.001
    n_tokens: int = 100
    predict_target_name: str = f'predict_{str(uuid.uuid1())[:8]}'
    transform_opts = TokensTransformOpts()
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='',
        pos_excludes='MAD|MID|PAD',
        **config.checkpoint_opts.tagged_columns,
    )
    vectorize_opts: VectorizeOpts = VectorizeOpts(already_tokenized=True)
    payload: DocumentPayload = (
        CorpusPipeline(config=config)
        .load_tagged_frame(
            filename=corpus_source,
            checkpoint_opts=config.checkpoint_opts,
            extra_reader_opts=config.text_reader_opts,
        )
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=transform_opts)
        .to_dtm(vectorize_opts=vectorize_opts)
        .predict_topics(
            model_folder=model_folder,
            target_folder=target_folder,
            target_name=predict_target_name,
            minimum_probability=minimum_probability,
            n_tokens=n_tokens,
        )
    ).single()

    assert payload is not None

    model_infos = find_models('./tests/output')
    assert any(m['name'] == predict_target_name for m in model_infos)
    model_info = next(m for m in model_infos if m['name'] == predict_target_name)
    assert 'method' in model_info['options']
Example #19
    def test_tokenized_document_token_counts_is_empty_if_enumerable_not_exhausted(self):
        corpus = self.create_simple_test_corpus(
            transform_opts=TokensTransformOpts(
                keep_symbols=False,
                only_any_alphanumeric=True,
                to_lower=True,
                remove_accents=False,
                min_len=0,
                max_len=None,
                keep_numerals=True,
                stopwords=None,
            )
        )
        self.assertTrue('n_raw_tokens' not in corpus.document_index.columns)
        self.assertTrue('n_tokens' not in corpus.document_index.columns)
Example #20
def text_corpus() -> TokenizedCorpus:
    filename_fields = dict(year=r".{5}(\d{4})_.*", serial_no=r".{9}_(\d+).*")
    reader = create_tokens_reader(filename_fields=filename_fields,
                                  fix_whitespaces=True,
                                  fix_hyphenation=True)
    transform_opts = TokensTransformOpts(
        only_any_alphanumeric=True,
        to_lower=True,
        remove_accents=False,
        min_len=2,
        max_len=None,
        keep_numerals=False,
    )
    corpus = TokenizedCorpus(reader, transform_opts=transform_opts)
    return corpus
Example #21
    def test_tokenized_document_token_counts_is_not_empty_if_enumerable_is_exhausted(self):
        # Note: Symbols are always removed by reader - hence "keep_symbols" filter has no effect
        corpus = self.create_simple_test_corpus(
            transform_opts=TokensTransformOpts(
                keep_symbols=False,
                only_any_alphanumeric=True,
                to_lower=True,
                remove_accents=False,
                min_len=0,
                max_len=None,
                keep_numerals=True,
                stopwords=None,
            )
        )
        for _ in corpus:
            pass
        self.assertTrue('n_raw_tokens' in corpus.document_index.columns)
        self.assertTrue('n_tokens' in corpus.document_index.columns)
Example #22
def test_next_document_when_only_any_alphanumeric_is_false_returns_all_tokens(
):
    reader = create_reader()
    transform_opts = TokensTransformOpts(
        only_any_alphanumeric=False,
        to_lower=False,
        remove_accents=False,
        min_len=1,
        max_len=None,
        keep_numerals=True,
        only_alphabetic=False,
    )
    corpus = corpora.TokenizedCorpus(reader, transform_opts=transform_opts)
    _, tokens = next(corpus)
    expected = (
        "Tre svarta ekar ur snön . Så grova , men fingerfärdiga . Ur deras väldiga flaskor ska grönskan skumma i vår ."
    )
    assert expected.split() == tokens
Example #23
    def test_tokenized_document_where_symbols_and_numerals_are_filtered_out(self):
        corpus = self.create_simple_test_corpus(
            transform_opts=TokensTransformOpts(
                keep_symbols=False,
                only_any_alphanumeric=True,
                to_lower=False,
                remove_accents=False,
                min_len=0,
                max_len=None,
                keep_numerals=False,
                stopwords=None,
            )
        )
        result = [x for x in corpus]
        expected = [
            ('document_0.txt', ['Detta', 'är', 'en', 'mening', 'med', 'token', 'siffror', 'och', 'symboler']),
            ('document_1.txt', ['Är', 'det', 'i', 'denna', 'mening', 'en', 'mening']),
        ]
        self.assertEqual(expected, result)
Example #24
def tranströmer_topic_model_payload(method: str, target_folder: str, target_name: str) -> DocumentPayload:
    transform_opts: TokensTransformOpts = TokensTransformOpts()
    extract_opts: ExtractTaggedTokensOpts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='',
        pos_excludes='MAD|MID|PAD',
        text_column='token',
        lemma_column='baseform',
        pos_column='pos',
    )
    default_engine_args: dict = {
        'n_topics': 4,
        'passes': 1,
        'random_seed': 42,
        'workers': 1,
        'max_iter': 100,
        'work_folder': os.path.join(target_folder, target_name),
    }
    config: CorpusConfig = CorpusConfig.load('./tests/test_data/tranströmer.yml')
    corpus_source: str = './tests/test_data/tranströmer_corpus_pos_csv.zip'
    p: CorpusPipeline = (
        CorpusPipeline(config=config)
        .load_tagged_frame(
            filename=corpus_source,
            checkpoint_opts=config.checkpoint_opts,
            extra_reader_opts=config.text_reader_opts,
        )
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=transform_opts)
        .to_dtm(VectorizeOpts(already_tokenized=True))
        .to_topic_model(
            target_mode='both',
            target_folder=target_folder,
            target_name=target_name,
            engine=method,
            engine_args=default_engine_args,
            store_corpus=True,
            store_compressed=True,
        )
    )

    payload: DocumentPayload = p.single()

    return payload
Example #25
    def test_fit_transform_gives_document_term_matrix(self):
        # Arrange
        reader = PandasCorpusReader(self.create_test_dataframe())
        corpus = TokenizedCorpus(
            reader,
            transform_opts=TokensTransformOpts(
                only_any_alphanumeric=False,
                to_lower=False,
                remove_accents=False,
                min_len=1,
                max_len=None,
                keep_numerals=False,
            ),
        )
        v_corpus = CorpusVectorizer().fit_transform(corpus)

        term_term_matrix = v_corpus.co_occurrence_matrix()
        token2id = v_corpus.token2id

        assert 2 == term_term_matrix.todense()[token2id['A'], token2id['B']]
        assert 0 == term_term_matrix.todense()[token2id['C'], token2id['F']]
Example #26
def test_fit_transform_when_given_a_vocabulary_returns_same_vocabulary():

    corpus = TokenizedCorpus(
        reader=create_reader(),
        transform_opts=TokensTransformOpts(to_lower=True, min_len=10),
    )

    vocabulary = CorpusVectorizer().fit_transform(
        corpus, already_tokenized=True).token2id

    assert corpus.token2id == vocabulary

    expected_vocabulary_reversed = {
        k: abs(v - 5)
        for k, v in corpus.token2id.items()
    }

    vocabulary = (CorpusVectorizer().fit_transform(
        corpus,
        already_tokenized=True,
        vocabulary=expected_vocabulary_reversed).token2id)

    assert expected_vocabulary_reversed == vocabulary
Example #27
def test_corpus_apply_when_looping_through_partition_groups_filter_outs_other_groups(
):

    expected_groups = {
        2019: ['tran_2019_01_test', 'tran_2019_02_test', 'tran_2019_03_test'],
        2020: ['tran_2020_01_test', 'tran_2020_02_test'],
    }

    expected_tokens = {
        2019: [
            [
                'KYRKA',
                'TURIST',
                'HALVMÖRKER',
                'VALV',
                'VALV',
                'ÖVERBLICK',
                'LJUSLÅGA',
                'ÄNGEL',
                'ANSIKTE',
                'KROPP',
                'MÄNNISKA',
                'VALV',
                'VALV',
                'TÅR',
                'PIAZZA',
                'MR',
                'MRS',
                'HERR',
                'SIGNORA',
                'VALV',
                'VALV',
            ],
            [
                'KÖR',
                'NATT',
                'HUS',
                'STRÅLKASTARSKEN',
                'HUS',
                'LADA',
                'FORDON',
                'NU',
                'LIV',
                'MÄNNISKA',
                'DEL',
                'ANLETSDRAG',
                'TRÄNING',
                'EVIGHET',
                'ALLT',
                'SÖMN',
                'BOM',
                'MYSTERIUM',
            ],
            [
                'SKOG',
                'GLÄNTA',
                'GLÄNTA',
                'OMSLUT',
                'SKOG',
                'SJÄLV',
                'STAM',
                'LAV',
                'SKÄGGSTUBB',
                'TRÄD',
                'TOPP',
                'KVIST',
                'LJUS',
                'SKUGGA',
                'SKUGGA',
                'KÄRR',
                'PLATS',
                'GRÄS',
                'STEN',
                'VARA',
                'GRUNDSTEN',
                'HUS',
                'HÄR',
                'UPPLYSNING',
                'NAMN',
                'ARKIV',
                'ARKIV',
                'TRADITION',
                'DÖD',
                'MINNE',
                'ZIGENARSTAMMEN',
                'MEN',
                'TORP',
                'RÖST',
                'VÄRLD',
                'CENTRUM',
                'INVÅNARE',
                'KRÖNIKA',
                'ÖDE',
                'ÅR',
                'TORP',
                'SFINX',
                'GRUNDSTEN',
                'SÄTT',
                'MÅSTE',
                'NU',
                'SNÅR',
                'SIDA',
                'STEG',
                'GÅNGSTIG',
                'KOMMUNIKATIONSNÄT',
                'KRAFTLEDNINGSSTOLPEN',
                'SKALBAGGE',
                'SOL',
                'SKÖLD',
                'FLYGVINGARNA',
                'FALLSKÄRM',
                'EXPERT',
            ],
        ],
        2020: [
            [
                'VRAK',
                'KRETSANDE',
                'PUNKT',
                'STILLHET',
                'HAV',
                'LJUS',
                'BETSEL',
                'TÅNG',
                'STRAND',
                'JORD',
                'MÖRKER',
                'FLADDERMUS',
                'VRAK',
                'STJÄRNA',
            ],
            [
                'ÅR',
                'STÖVEL',
                'SOL',
                'TRÄD',
                'VIND',
                'FRIHET',
                'BERG',
                'FOT',
                'BARRSKOGSBRÄNNINGEN',
                'MEN',
                'SOMMAR',
                'DYNING',
                'TRÄD',
                'TOPP',
                'ÖGONBLICK',
                'KUST',
            ],
        ],
    }

    corpus = SparvTokenizedCsvCorpus(
        SPARV_ZIPPED_CSV_EXPORT_FILENAME,
        reader_opts=TextReaderOpts(filename_fields="year:_:1", ),
        extract_opts=ExtractTaggedTokensOpts(lemmatize=True,
                                             pos_includes='|NN|',
                                             pos_paddings=None,
                                             **SPARV_TAGGED_COLUMNS),
        transform_opts=TokensTransformOpts(
            min_len=2,
            to_upper=True,
        ),
    )

    partitions = corpus.partition_documents('year')

    for key in partitions:

        corpus.reader.apply_filter(partitions[key])
        assert expected_groups[key] == corpus.document_names

        tokens = [x for x in corpus.terms]
        assert expected_tokens[key] == tokens
Example #28
def process(
    corpus_config: Optional[str] = None,
    input_filename: Optional[str] = None,
    output_folder: Optional[str] = None,
    output_tag: Optional[str] = None,
    filename_pattern: Optional[str] = None,
    phrase: Sequence[str] = None,
    phrase_file: Optional[str] = None,
    create_subfolder: bool = True,
    pos_includes: Optional[str] = None,
    pos_paddings: Optional[str] = None,
    pos_excludes: Optional[str] = None,
    append_pos: bool = False,
    to_lower: bool = True,
    lemmatize: bool = True,
    remove_stopwords: Optional[str] = None,
    min_word_length: int = 2,
    max_word_length: int = None,
    keep_symbols: bool = False,
    keep_numerals: bool = False,
    only_any_alphanumeric: bool = False,
    only_alphabetic: bool = False,
    tf_threshold: int = 1,
    tf_threshold_mask: bool = False,
    max_tokens: int = None,
    enable_checkpoint: bool = True,
    force_checkpoint: bool = False,
    deserialize_processes: int = 4,
):

    try:
        corpus_config: CorpusConfig = CorpusConfig.load(corpus_config)
        phrases = parse_phrases(phrase_file, phrase)

        if pos_excludes is None:
            pos_excludes = pos_tags_to_str(corpus_config.pos_schema.Delimiter)

        if pos_paddings and pos_paddings.upper() in ["FULL", "ALL", "PASSTHROUGH"]:
            pos_paddings = pos_tags_to_str(corpus_config.pos_schema.all_types_except(pos_includes))
            logger.info(f"PoS paddings expanded to: {pos_paddings}")

        text_reader_opts: TextReaderOpts = corpus_config.text_reader_opts.copy()

        if filename_pattern is not None:
            text_reader_opts.filename_pattern = filename_pattern

        corpus_config.checkpoint_opts.deserialize_processes = max(1, deserialize_processes)

        tagged_columns: dict = corpus_config.pipeline_payload.tagged_columns_names
        args: interface.ComputeOpts = interface.ComputeOpts(
            corpus_type=corpus_config.corpus_type,
            corpus_source=input_filename,
            target_folder=output_folder,
            corpus_tag=output_tag,
            transform_opts=TokensTransformOpts(
                to_lower=to_lower,
                to_upper=False,
                min_len=min_word_length,
                max_len=max_word_length,
                remove_accents=False,
                remove_stopwords=(remove_stopwords is not None),
                stopwords=None,
                extra_stopwords=None,
                language=remove_stopwords,
                keep_numerals=keep_numerals,
                keep_symbols=keep_symbols,
                only_alphabetic=only_alphabetic,
                only_any_alphanumeric=only_any_alphanumeric,
            ),
            text_reader_opts=text_reader_opts,
            extract_opts=ExtractTaggedTokensOpts(
                pos_includes=pos_includes,
                pos_paddings=pos_paddings,
                pos_excludes=pos_excludes,
                lemmatize=lemmatize,
                phrases=phrases,
                append_pos=append_pos,
                global_tf_threshold=tf_threshold,
                global_tf_threshold_mask=tf_threshold_mask,
                **tagged_columns,
            ),
            vectorize_opts=VectorizeOpts(
                already_tokenized=True,
                min_tf=tf_threshold,
                max_tokens=max_tokens,
            ),
            tf_threshold=tf_threshold,
            tf_threshold_mask=tf_threshold_mask,
            create_subfolder=create_subfolder,
            persist=True,
            enable_checkpoint=enable_checkpoint,
            force_checkpoint=force_checkpoint,
        )

        workflow.compute(args=args, corpus_config=corpus_config)

        logger.info('Done!')

    except Exception as ex:  # pylint: disable=try-except-raise
        logger.exception(ex)
        click.echo(ex)
        sys.exit(1)
Example #29
def filter_tagged_frame(
    tagged_frame: pd.DataFrame,
    *,
    extract_opts: ExtractTaggedTokensOpts,
    token2id: Token2Id = None,
    pos_schema: PoS_Tag_Scheme = None,
    normalize_column_names: bool = True,
    transform_opts: TokensTransformOpts = None,
) -> pd.DataFrame:
    """Filters tagged frame (text or numeric). Returns tagged frame

    Args:
        tagged_frame ([pd.DataFrame]): Document frame to be filtered, can be text or numeric
        extract_opts (ExtractTaggedTokensOpts): PoS and lemma extract/filter opts
        token2id (Token2Id, optional): Vocabulary. Defaults to None.
        pos_schema (PoS_Tag_Scheme, optional): PoS schema. Defaults to None.
        transform_opts (TokensTransformOpts, optional): Filters and transforms. Defaults to None.
        normalize_column_names (bool, optional): If text, rename columns to `token` and `pos`. Defaults to True.

    Raises:
        Token2IdMissingError: Token2Id is mandatory if frame is numeric.
        PoSTagSchemaMissingError: PoS-schema is mandatory if frame is numeric.
        TaggedFrameColumnNameError: Missing target column (corrupt data)

    Returns:
        pd.DataFrame: Filtered and transformed document frame.
    """
    if len(tagged_frame) == 0:
        return []

    is_numeric_frame: bool = is_encoded_tagged_frame(tagged_frame)
    to_lower: bool = transform_opts and transform_opts.to_lower

    if is_numeric_frame:

        if token2id is None:
            raise Token2IdMissingError(
                "filter_tagged_frame: cannot filter tagged id frame without vocabulary"
            )

        if pos_schema is None:
            raise PoSTagSchemaMissingError(
                "filter_tagged_frame: cannot filter tagged id frame without pos_schema"
            )

        if to_lower:
            logger.warning(
                "lowercasing not implemented for numeric tagged frames")
            to_lower = False

    if not is_numeric_frame and extract_opts.lemmatize is None and extract_opts.target_override is None:
        raise ValueError("a valid target not supplied (no lemmatize or target")

    target_column: str = extract_opts.target_column
    pos_column: str = extract_opts.pos_column

    if target_column not in tagged_frame.columns:
        raise TaggedFrameColumnNameError(
            f"{target_column} is not valid target for given document (missing column)"
        )

    if pos_column not in tagged_frame.columns:
        raise ValueError(f"configuration error: {pos_column} not in document")

    passthroughs: Set[str] = extract_opts.get_passthrough_tokens()
    blocks: Set[str] = extract_opts.get_block_tokens().union('')

    if is_numeric_frame:
        passthroughs = token2id.to_id_set(passthroughs)
        blocks = token2id.to_id_set(blocks)

    if not is_numeric_frame and (extract_opts.lemmatize or to_lower):
        tagged_frame[target_column] = tagged_frame[target_column].str.lower()
        # pd.Series([x.lower() for x in tagged_frame[target_column]])
        passthroughs = {x.lower() for x in passthroughs}

    # if extract_opts.block_chars:
    #     for char in extract_opts.block_chars:
    #         doc[target] = doc[target].str.replace(char, '', regex=False)
    """ Phrase detection """
    if extract_opts.phrases:
        if is_numeric_frame:
            logger.warning(
                "phrase detection not implemented for numeric tagged frames")
            extract_opts.phrases = None
        else:
            found_phrases = detect_phrases(tagged_frame[target_column],
                                           extract_opts.phrases,
                                           ignore_case=to_lower)
            if found_phrases:
                tagged_frame = merge_phrases(tagged_frame,
                                             found_phrases,
                                             target_column=target_column,
                                             pad=PHRASE_PAD)
                passthroughs = passthroughs.union(
                    {'_'.join(x[1])
                     for x in found_phrases})

    mask = np.repeat(True, len(tagged_frame.index))
    if extract_opts.filter_opts and extract_opts.filter_opts.data:
        mask &= extract_opts.filter_opts.mask(tagged_frame)

    pos_includes: Set[str] = extract_opts.get_pos_includes()
    pos_excludes: Set[str] = extract_opts.get_pos_excludes()
    pos_paddings: Set[str] = extract_opts.get_pos_paddings()

    if is_numeric_frame:
        pg = pos_schema.pos_to_id.get
        pos_includes = {pg(x) for x in pos_includes}
        pos_excludes = {pg(x) for x in pos_excludes}
        pos_paddings = {pg(x) for x in pos_paddings}

    if pos_includes:
        """Don't filter if PoS-include is empty - and don't filter out PoS tokens that should be padded"""
        mask &= tagged_frame[pos_column].isin(pos_includes.union(pos_paddings))

    if pos_excludes:
        mask &= ~(tagged_frame[pos_column].isin(pos_excludes))

    if transform_opts and transform_opts.has_effect:
        mask &= transform_opts.mask(tagged_frame[target_column],
                                    token2id=token2id)

    if len(passthroughs) > 0:
        mask |= tagged_frame[target_column].isin(passthroughs)

    if len(blocks) > 0:
        mask &= ~tagged_frame[target_column].isin(blocks)

    filtered_data: pd.DataFrame = tagged_frame.loc[mask][[
        target_column, pos_column
    ]]

    if extract_opts.global_tf_threshold > 1:
        if token2id is None or token2id.tf is None:
            logger.error(
                "Cannot apply TF filter since token2id has no term frequencies"
            )
            extract_opts.global_tf_threshold = 1
        else:
            filtered_data = filter_tagged_frame_by_term_frequency(
                tagged_frame=filtered_data,
                target_column=target_column,
                token2id=token2id,
                extract_opts=extract_opts,
                passthroughs=passthroughs,
            )

    if not is_numeric_frame and normalize_column_names:

        filtered_data.rename(columns={
            target_column: 'token',
            pos_column: 'pos'
        },
                             inplace=True)

    return filtered_data
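A minimal, illustrative sketch of calling filter_tagged_frame on a small in-memory tagged frame; it assumes the Sparv-style column names used in Example #24 and that the relevant imports are available. The tokens, lemmas, and PoS tags below are made-up sample data; the call keeps only nouns and lowercases the lemma column:

import pandas as pd

tagged_frame = pd.DataFrame(
    {
        'token': ['Tre', 'svarta', 'ekar', '.'],
        'baseform': ['tre', 'svart', 'ek', '.'],
        'pos': ['RG', 'JJ', 'NN', 'MAD'],
    }
)
extract_opts = ExtractTaggedTokensOpts(
    lemmatize=True,
    pos_includes='|NN|',
    pos_excludes='|MAD|MID|PAD|',
    text_column='token',
    lemma_column='baseform',
    pos_column='pos',
)
filtered = filter_tagged_frame(
    tagged_frame,
    extract_opts=extract_opts,
    transform_opts=TokensTransformOpts(to_lower=True),
)
# Expected to keep only the noun lemma 'ek', with columns renamed to token/pos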
Example #30
def test_transform_smoke_test():
    transformer = TokensTransformer(transform_opts=TokensTransformOpts())

    assert transformer is not None
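To close the loop, here is a minimal end-to-end sketch (illustrative only) combining the pieces that appear throughout these examples: transform options, a tokenized corpus, and vectorization. It assumes one of the create_reader helpers used in the tests above is available:

reader = create_reader()  # any of the token readers used in the tests above
transform_opts = TokensTransformOpts(to_lower=True, min_len=2, keep_numerals=False)
corpus = TokenizedCorpus(reader, transform_opts=transform_opts)
v_corpus = CorpusVectorizer().fit_transform(corpus, already_tokenized=True)
print(v_corpus.token2id)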