Example #1
def test_tagged_frame_to_tokens_with_tf_threshold_and_not_threshold_tf_mask(tagged_frame: pd.DataFrame):

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes=None,
        pos_excludes=None,
        global_tf_threshold=2,
        global_tf_threshold_mask=False,
        **SPARV_TAGGED_COLUMNS,
    )
    """ Alternative #1: tagged_frame_to_tokens does the filtering """

    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(tagged_frame.baseform)
    expected_count = len(
        tagged_frame[
            tagged_frame.baseform.apply(lambda x: token2id.tf[token2id[x]] >= extract_opts.global_tf_threshold)
        ]
    )

    df: pd.DataFrame = tagged_frame.copy()
    tokens = tagged_frame_to_tokens(df, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == expected_count
    assert set(tokens) == set(['i', 'i', '.', 'valv', 'valv', '.', '.'])

    """ Alternative #2: Use token2id to mask low TF tokens"""
    df: pd.DataFrame = tagged_frame.copy()
    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(df.baseform)
    """Note that translation must be used to map token-ids if used elsewhere"""
    _, translation = token2id.compress(tf_threshold=2, inplace=True)  # pylint: disable=unused-variable
    token2id.close()
    tokens = tagged_frame_to_tokens(df, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == expected_count
    assert set(tokens) == set(['i', 'i', '.', 'valv', 'valv', '.', '.'])
Example #2
def test_tagged_frame_to_tokens_with_tf_threshold_and_threshold_tf_mask(tagged_frame: pd.DataFrame):

    extract_opts = ExtractTaggedTokensOpts(lemmatize=True, pos_includes=None, pos_excludes=None, **SPARV_TAGGED_COLUMNS)

    """ Alternative #1: tagged_frame_to_tokens does the filtering """

    df: pd.DataFrame = tagged_frame.copy()
    extract_opts.global_tf_threshold = 2
    extract_opts.global_tf_threshold_mask = True
    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(df.baseform)

    tokens = tagged_frame_to_tokens(df, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == len(df)
    assert set(tokens) == set([GLOBAL_TF_THRESHOLD_MASK_TOKEN, 'i', 'i', '.', 'valv', 'valv', '.', '.'])

    """ Alternative #2: Use token2id to mask low TF tokens"""
    df: pd.DataFrame = tagged_frame.copy()
    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(df.baseform)

    """Note that translation must be used to map token-ids if used elsewhere"""
    token2id.compress(tf_threshold=2, inplace=True)
    token2id.close()

    tokens = tagged_frame_to_tokens(df, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == len(df)
    assert set(tokens) == set([GLOBAL_TF_THRESHOLD_MASK_TOKEN, 'i', 'i', '.', 'valv', 'valv', '.', '.'])
Example #3
def test_token2id_store_and_load():

    os.makedirs('./tests/output', exist_ok=True)

    token2id: Token2Id = Token2Id().ingest(TEST_TOKENS_STREAM1)

    filename = './tests/output/test_vocabulary.zip'
    tf_filename = path_add_suffix(filename, "_tf", new_extension=".pbz2")

    token2id.store(filename=filename)

    assert os.path.isfile(filename) and os.path.isfile(tf_filename)

    token2id_loaded: Token2Id = Token2Id.load(filename=filename)

    assert token2id_loaded is not None
    assert token2id_loaded.tf is not None

    assert token2id_loaded.data == {
        'adam': 0,
        'anton': 1,
        'beatrice': 2,
        'felicia': 3,
        'niklas': 4
    }
    assert dict(token2id_loaded.tf) == {0: 3, 1: 2, 2: 1, 3: 1, 4: 1}
Example #4
    def translate_id_pair_to_token(self, pair2id: Token2Id, token2id: Token2Id) -> None:
        """Translates `id pairs` (w1_id, w2_id) to pair-token `w1/w2`"""
        _single_without_sep = {
            w_id: w.replace(WORD_PAIR_DELIMITER, '')
            for w_id, w in token2id.id2token.items()
        }
        sg = _single_without_sep.get
        # `sj` is a module-level helper (presumably joining tokens with WORD_PAIR_DELIMITER)
        pair2id.replace(
            data={
                sj([sg(w1_id), sg(w2_id)]): pair_id
                for (w1_id, w2_id), pair_id in pair2id.data.items()
            }
        )
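A minimal standalone sketch of the same rekeying idea (hypothetical helper; it assumes WORD_PAIR_DELIMITER is "/" and uses plain dicts instead of Token2Id):

WORD_PAIR_DELIMITER = "/"  # assumed delimiter, for illustration only

def id_pairs_to_pair_tokens(pair2id: dict, id2token: dict) -> dict:
    """Rekey {(w1_id, w2_id): pair_id} as {'w1/w2': pair_id}."""
    # strip the delimiter from single tokens so the pair token stays unambiguous
    single = {w_id: w.replace(WORD_PAIR_DELIMITER, '') for w_id, w in id2token.items()}
    return {
        WORD_PAIR_DELIMITER.join((single[w1_id], single[w2_id])): pair_id
        for (w1_id, w2_id), pair_id in pair2id.items()
    }

assert id_pairs_to_pair_tokens({(0, 1): 0}, {0: 'valv', 1: 'ljus'}) == {'valv/ljus': 0}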
Example #5
def test_interfaces_token2id_store():
    os.makedirs('./tests/output', exist_ok=True)

    filename: str = './tests/output/test_interfaces_token2id_store.zip'
    token2id = Token2Id()

    token2id.ingest(['apa', 'banan', 'soffa'])
    token2id.store(filename)

    assert pathlib.Path(filename).exists()

    token2id_loaded: Token2Id = Token2Id.load(filename)

    assert token2id.data == token2id_loaded.data
Example #6
def test_term_term_matrix_to_co_occurrences_with_ignore_ids():

    text_corpus = very_simple_corpus(data=[
        ('tran_2019_01_test.txt', ['*', 'b', 'c', 'c']),
        ('tran_2019_02_test.txt', ['a', '*', '*', 'd']),
        ('tran_2019_03_test.txt', ['a', 'e', 'e', 'b']),
        ('tran_2020_01_test.txt', ['*', 'c', 'd', 'a']),
        ('tran_2020_02_test.txt', ['a', 'b', '*', '*']),
    ])
    token2id: Token2Id = Token2Id(text_corpus.token2id)

    term_term_matrix = (dtm.CorpusVectorizer().fit_transform(
        text_corpus, already_tokenized=True,
        vocabulary=text_corpus.token2id).co_occurrence_matrix())

    pad_id = token2id['*']

    co_occurrences = term_term_matrix_to_co_occurrences(
        term_term_matrix=term_term_matrix,
        threshold_count=1,
        ignore_ids=set([pad_id]),
    )

    assert not (co_occurrences.w1_id == pad_id).any()
    assert not (co_occurrences.w2_id == pad_id).any()
Example #7
    def get_topic_token_weights(
            self,
            vocabulary: Any,
            n_tokens: int = 200,
            minimum_probability: float = 0.000001) -> pd.DataFrame:
        """Compile document topic weights. Return DataFrame."""
        id2token: dict = Token2Id.any_to_id2token(vocabulary)
        topic_data: list[tuple[
            int, TokenWeights]] = self.get_topic_token_weights_data(
                n_tokens=n_tokens, id2term=id2token)

        topic_token_weights: pd.DataFrame = pd.DataFrame(
            [(topic_id, token, weight) for topic_id, tokens in topic_data
             for token, weight in tokens if weight > minimum_probability],
            columns=['topic_id', 'token', 'weight'],
        )

        topic_token_weights['topic_id'] = topic_token_weights.topic_id.astype(
            np.uint16)

        fg = {v: k for k, v in id2token.items()}.get

        topic_token_weights['token_id'] = topic_token_weights.token.apply(fg)

        return topic_token_weights[['topic_id', 'token_id', 'token', 'weight']]
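The token_id column above comes from inverting id2token into a token-to-id lookup; a small self-contained sketch of just that step (plain pandas, made-up data):

import pandas as pd

id2token = {0: 'apa', 1: 'banan'}
weights = pd.DataFrame({'topic_id': [0, 0], 'token': ['banan', 'apa'], 'weight': [0.6, 0.4]})

fg = {v: k for k, v in id2token.items()}.get  # token -> token_id
weights['token_id'] = weights.token.apply(fg)

assert weights.token_id.tolist() == [1, 0]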
Example #8
def test_interfaces_token2id_get():

    token2id = Token2Id()

    token_id = token2id['apa']

    assert token_id == 0
    assert 'apa' in token2id
Example #9
def test_interfaces_token2id_reverse():

    token2id: Token2Id = Token2Id()

    id2token = token2id.ingest(['apa', 'banan', 'soffa']).id2token
    assert id2token[0] == 'apa'
    assert id2token[1] == 'banan'
    assert id2token[2] == 'soffa'
Example #10
def test_interfaces_token2id_ingest():

    token2id = Token2Id()

    token2id.ingest(['apa', 'banan', 'soffa'])

    assert 'apa' in token2id
    assert 'banan' in token2id
    assert 'soffa' in token2id
Example #11
    def _token2id(self) -> Optional[Token2Id]:
        """Returns dictionary stored in archive, or None if not found in archive"""

        if DICTIONARY_FILENAME not in self.namelist():
            return None

        return Token2Id(
            zip_utils.read_json(zip_or_filename=self,
                                filename=DICTIONARY_FILENAME))
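Roughly what the archive lookup amounts to, sketched with the standard library (the value of DICTIONARY_FILENAME and the JSON layout are assumptions; zip_utils.read_json is assumed to behave like this):

import json
import zipfile
from typing import Optional

DICTIONARY_FILENAME = 'dictionary.json'  # assumed archive member name

def read_stored_token2id(archive_name: str) -> Optional[dict]:
    """Return the stored token -> id mapping, or None if the archive has no dictionary member."""
    with zipfile.ZipFile(archive_name) as zf:
        if DICTIONARY_FILENAME not in zf.namelist():
            return None
        return json.loads(zf.read(DICTIONARY_FILENAME).decode('utf-8'))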
Example #12
    def process_stream(self) -> Iterable[DocumentPayload]:

        if self.document_index is None:
            raise CoOccurrenceError("expected a document index, but none was found")

        token2id: Token2Id = self.pipeline.payload.token2id
        pair2id: Token2Id = Token2Id()

        normal_builder: CoOccurrenceCorpusBuilder = CoOccurrenceCorpusBuilder(
            VectorizeType.Normal, self.document_index, pair2id, token2id)

        concept_builder: CoOccurrenceCorpusBuilder = (
            CoOccurrenceCorpusBuilder(VectorizeType.Concept,
                                      self.document_index, pair2id, token2id)
            if self.context_opts.concept else None)

        coo_payloads: Iterable[CoOccurrencePayload] = (
            payload.content
            for payload in self.prior.outstream(desc="Ingest",
                                                total=len(self.document_index))
            if payload.content is not None)

        for coo_payload in coo_payloads:
            normal_builder.ingest_pairs(coo_payload).add(payload=coo_payload)
            if concept_builder:
                concept_builder.add(payload=coo_payload)

        pair2id.close()
        """Translation between id-pair (single vocab IDs) and pair-pid (pair vocab IDs)"""
        token_ids_2_pair_id: Mapping[Tuple[int, int], int] = dict(pair2id.data)

        self.translate_id_pair_to_token(pair2id, token2id)

        concept_corpus: VectorizedCorpus = (concept_builder.corpus.remember(
            window_counts=self.get_window_counts(concept_builder))
                                            if concept_builder else None)

        corpus: VectorizedCorpus = normal_builder.corpus.remember(
            window_counts=self.get_window_counts(normal_builder))

        bundle: Bundle = Bundle(
            corpus=corpus,
            token2id=token2id,
            document_index=self.document_index,
            concept_corpus=concept_corpus,
            compute_options=self.pipeline.payload.stored_opts(),
            vocabs_mapping=token_ids_2_pair_id,
        )

        if self.compress:
            bundle.compress()

        payload: DocumentPayload = DocumentPayload(content=bundle)

        yield payload
Example #13
    def setup(self) -> ITask:
        self.target = self.get_column_name(self.token_type)

        # if self.pipeline.get_next_to(self).in_content_type == ContentType.TAGGED_FRAME:
        #     if self.token_type is None:
        #         raise ValueError("token_type text or lemma not specfied")

        self.token2id: Token2Id = self.token2id or Token2Id()
        self.pipeline.payload.token2id = self.token2id
        return self
Example #14
def test_replace():

    tokens = ['a', 'a', 'b', 'c']

    ingested: Token2Id = Token2Id().ingest(tokens)

    token2id: Token2Id = Token2Id()
    token2id.replace(data={'a': 0, 'b': 1, 'c': 2}, tf=Counter({0: 2, 1: 1, 2: 1}))

    assert dict(token2id.data) == dict(ingested.data)
    assert dict(token2id.tf) == dict(ingested.tf)
Example #15
def test_tagged_frame_to_tokens_with_global_tf_threshold(tagged_frame: pd.DataFrame):

    tagged_frame: pd.DataFrame = tagged_frame.copy()

    expected_counts: dict = {
        '.': 3,
        'bakom': 1,
        'den': 1,
        'fladdra_omkring': 1,
        'gapa': 1,
        'halvmörker': 1,
        'i': 2,
        'ingen': 1,
        'inne': 1,
        'kyrka': 1,
        'ljuslåga': 1,
        'någon': 1,
        'och': 1,
        'romansk': 1,
        'tränga': 1,
        'turist': 1,
        'valv': 2,
        'väldig': 1,
        'överblick': 1,
    }

    extract_opts = ExtractTaggedTokensOpts(lemmatize=True, pos_includes=None, pos_excludes=None, **SPARV_TAGGED_COLUMNS)
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert set(expected_counts.keys()) == set(tokens)

    """TF threshold resets to 1 if token2id not supplied (i.e. token2id.TF is needed)"""
    extract_opts.global_tf_threshold = 2
    tokens = tagged_frame_to_tokens(tagged_frame, token2id=None, extract_opts=extract_opts)
    assert extract_opts.global_tf_threshold == 1

    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(tagged_frame.baseform)

    extract_opts.global_tf_threshold = 2
    extract_opts.global_tf_threshold_mask = False
    tokens = tagged_frame_to_tokens(tagged_frame, token2id=token2id, extract_opts=extract_opts)
    assert tokens == ['i', 'i', '.', 'valv', 'valv', '.', '.']

    extract_opts.global_tf_threshold = 2
    extract_opts.global_tf_threshold_mask = True
    tokens = tagged_frame_to_tokens(tagged_frame, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == len(tagged_frame)
    assert set(tokens) == set([GLOBAL_TF_THRESHOLD_MASK_TOKEN, 'i', 'i', '.', 'valv', 'valv', '.', '.'])

    extract_opts.global_tf_threshold = 2
    extract_opts.global_tf_threshold_mask = True
    extract_opts.passthrough_tokens = {'överblick'}
    tokens = tagged_frame_to_tokens(tagged_frame, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == len(tagged_frame)
    assert set(tokens) == set([GLOBAL_TF_THRESHOLD_MASK_TOKEN, 'i', 'i', '.', 'valv', 'valv', '.', '.', 'överblick'])
Example #16
def test_token2id_compress_with_ingested_mask_token_and_threshold_has_correct_magic_token_sum(
):
    token2id: Token2Id = Token2Id().ingest(TEST_TOKENS_STREAM2).ingest(
        [GLOBAL_TF_THRESHOLD_MASK_TOKEN]).close()
    _, translation = token2id.compress(tf_threshold=2, inplace=True)
    assert dict(token2id.data) == {
        'adam': 0,
        'anton': 1,
        'beata': 2,
        GLOBAL_TF_THRESHOLD_MASK_TOKEN: 3
    }
    assert dict(token2id.tf) == {0: 3, 1: 2, 2: 2, 3: 4}
    assert translation == {0: 0, 1: 1, 5: 2, 6: 3}
Example #17
def test_token2id_find():

    token2id: Token2Id = Token2Id({
        'adam': 0,
        'anton': 1,
        'beatrice': 2,
        'felicia': 3,
        'niklas': 4
    })

    assert set(token2id.find(what='adam')) == set([0])
    assert set(token2id.find(what='a*')) == set([0, 1])
    assert set(token2id.find(what=['a*', 'f*'])) == set([0, 1, 3])
    assert set(token2id.find(what=['a*', 'beatrice'])) == set([0, 1, 2])
Example #18
def test_interfaces_token2id_close():

    token2id = Token2Id()

    token2id.ingest(['apa', 'banan', 'soffa'])
    token2id.close()

    with pytest.raises(KeyError):
        _ = token2id['hängmatta']

    token2id.open()
    _ = token2id['hängmatta']

    assert 'hängmatta' in token2id
Example #19
def test_TTM_to_co_occurrence_DTM_using_LIL_matrix():

    source_corpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)
    token2id = Token2Id(source_corpus.token2id)
    document_index: DocumentIndex = source_corpus.document_index

    stream: Iterable[CoOccurrencePayload] = (CoOccurrencePayload(
        document_id,
        document_name="-",
        ttm_data_map={
            VectorizeType.Normal:
            VectorizedTTM(
                vectorize_type=VectorizeType.Normal,
                term_term_matrix=CorpusVectorizer().fit_transform(
                    [doc], already_tokenized=True,
                    vocabulary=token2id.data).co_occurrence_matrix(),
                term_window_counts={},
                document_id=document_id,
            )
        },
    ) for document_id, doc in enumerate(source_corpus))

    pair2id: Token2Id = Token2Id()

    builder: CoOccurrenceCorpusBuilder = CoOccurrenceCorpusBuilder(
        vectorize_type=VectorizeType.Normal,
        document_index=document_index,
        pair2id=pair2id,
        token2id=token2id,
    )

    for payload in stream:
        builder.ingest_pairs(payload).add(payload)

    corpus: VectorizedCorpus = builder.corpus

    assert corpus is not None
Example #20
def test_translation():
    data = {'*': 0, '__low-tf__': 1, 'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6}
    tf = {0: 1, 1: 1, 2: 5, 3: 2, 4: 5, 5: 2, 6: 4}
    token2id = Token2Id(data=data, tf=tf)
    _, translation = token2id.compress(tf_threshold=4, inplace=True)

    assert dict(token2id.data) == {
        '*': 0,
        '__low-tf__': 1,
        'a': 2,
        'c': 3,
        'e': 4
    }
    assert dict(token2id.tf) == {0: 1, 1: 5, 2: 5, 3: 5, 4: 4}
    assert dict(translation) == {0: 0, 1: 1, 2: 2, 4: 3, 6: 4}
Example #21
def test_translate():

    data = {'*': 0, '__low-tf__': 1, 'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6}
    tf = {0: 1, 1: 1, 2: 5, 3: 2, 4: 5, 5: 2, 6: 4}
    ids_translation = {0: 0, 1: 1, 2: 2, 4: 3, 6: 4}
    token2id = Token2Id(data=data, tf=tf)

    token2id.translate(ids_translation=ids_translation, inplace=True)

    assert dict(token2id.data) == {
        '*': 0,
        '__low-tf__': 1,
        'a': 2,
        'c': 3,
        'e': 4
    }
    """Note that translate doesn't add LF-counts to LF-marker"""
    assert dict(token2id.tf) == {0: 1, 1: 1, 2: 5, 3: 5, 4: 4}
Example #22
def test_token2id_compress_with_threshold_and_keeps_adds_masked_magic_token_with_correct_sum(
):

    mask_token: str = GLOBAL_TF_THRESHOLD_MASK_TOKEN
    token2id: Token2Id = Token2Id().ingest(TEST_TOKENS_STREAM2).close()
    token2id_compressed, _ = token2id.compress(tf_threshold=2,
                                               keeps={4},
                                               inplace=False)

    assert mask_token not in token2id
    assert mask_token in token2id_compressed

    sum_of_masked_tokens = sum([
        v for k, v in EXPECTED_COUNTS2.items() if k not in token2id_compressed
    ])

    assert token2id_compressed.tf[
        token2id_compressed[mask_token]] == sum_of_masked_tokens
Example #23
def test_token2id_inplace_compress_with_threshold_and_no_keeps():

    token2id: Token2Id = Token2Id().ingest(TEST_TOKENS_STREAM2).close()
    token2id_compressed, translation = token2id.compress(tf_threshold=2,
                                                         inplace=False)
    assert dict(token2id_compressed.data) == {
        'adam': 0,
        'anton': 1,
        'beata': 2,
        GLOBAL_TF_THRESHOLD_MASK_TOKEN: 3
    }
    assert dict(token2id_compressed.tf) == {0: 3, 1: 2, 2: 2, 3: 3}
    assert translation == {0: 0, 1: 1, 5: 2}
    assert token2id.fallback_token_id is None
    assert token2id_compressed.fallback_token_id is not None
    assert token2id_compressed["roger"] == token2id_compressed[
        GLOBAL_TF_THRESHOLD_MASK_TOKEN]
    assert "roger" not in token2id_compressed
Example #24
def test_token2id_compress_with_no_threshold_and_no_keeps_returns_self():

    token2id: Token2Id = Token2Id().ingest(TEST_TOKENS_STREAM2).close()

    assert token2id.data == {
        'adam': 0,
        'anton': 1,
        'beatrice': 2,
        'felicia': 3,
        'niklas': 4,
        'beata': 5
    }
    assert dict(token2id.tf) == {0: 3, 1: 2, 2: 1, 3: 1, 4: 1, 5: 2}

    token2id_compressed, translation = token2id.compress(tf_threshold=1,
                                                         inplace=False,
                                                         keeps=None)
    assert token2id_compressed is token2id
    assert translation is None
Example #25
def test_co_occurrences_to_co_occurrence_corpus():

    folder, tag = './tests/test_data/ABCDEFG_7DOCS_CONCEPT', "ABCDEFG_7DOCS_CONCEPT"

    co_occurrences: CoOccurrenceDataFrame = co_occurrence.load_co_occurrences(
        co_occurrence_filename(folder, tag))
    document_index: DocumentIndex = DocumentIndexHelper.load(
        document_index_filename(folder, tag)).document_index
    token2id: Token2Id = Token2Id.load(vocabulary_filename(folder, tag))

    corpus = LegacyCoOccurrenceMixIn.from_co_occurrences(
        co_occurrences=co_occurrences,
        document_index=document_index,
        token2id=token2id,
    )

    assert corpus.data.sum() == co_occurrences.value.sum()
    assert corpus.data.shape[0] == len(document_index)
    assert corpus.data.shape[1] == len(co_occurrences[["w1_id", "w2_id"
                                                       ]].drop_duplicates())
Example #26
def test_token2id_compress_with_threshold_and_keeps_succeeds():
    token2id: Token2Id = Token2Id().ingest(TEST_TOKENS_STREAM2).close()
    _, translation = token2id.compress(tf_threshold=2,
                                       inplace=True,
                                       keeps={token2id["felicia"]})
    assert dict(token2id.data) == {
        'adam': 0,
        'anton': 1,
        'felicia': 2,
        'beata': 3,
        GLOBAL_TF_THRESHOLD_MASK_TOKEN: 4
    }
    assert tf_to_string(token2id) == {
        'adam': 3,
        'anton': 2,
        'felicia': 1,
        'beata': 2,
        '__low-tf__': 2
    }
    assert translation == {0: 0, 1: 1, 3: 2, 5: 3}
Example #27
def test_token2id_ingest():

    token2id: Token2Id = Token2Id().ingest(TEST_TOKENS_STREAM1)

    assert token2id.data == {
        'adam': 0,
        'anton': 1,
        'beatrice': 2,
        'felicia': 3,
        'niklas': 4
    }
    assert token2id.tf is not None
    assert dict(token2id.tf) == {0: 3, 1: 2, 2: 1, 3: 1, 4: 1}
    assert tf_to_string(token2id) == {
        'adam': 3,
        'anton': 2,
        'beatrice': 1,
        'felicia': 1,
        'niklas': 1
    }
Example #28
def filter_tagged_frame(
    tagged_frame: pd.DataFrame,
    *,
    extract_opts: ExtractTaggedTokensOpts,
    token2id: Token2Id = None,
    pos_schema: PoS_Tag_Scheme = None,
    normalize_column_names: bool = True,
    transform_opts: TokensTransformOpts = None,
) -> pd.DataFrame:
    """Filters tagged frame (text or numeric). Returns tagged frame

    Args:
        tagged_frame ([pd.DataFrame]): Document frame to be filtered, can be text or numeric
        extract_opts (ExtractTaggedTokensOpts): PoS and lemma extract/filter opts
        token2id (Token2Id, optional): Vocabulary. Defaults to None.
        pos_schema (PoS_Tag_Scheme, optional): PoS schema. Defaults to None.
        transform_opts (TokensTransformOpts, optional): Filters and transforms. Defaults to None.
        normalize_column_names (bool, optional): If text, rename columns to `token` and `pos`. Defaults to True.

    Raises:
        Token2IdMissingError: Token2Id is mandatory if frame is numeric.
        PoSTagSchemaMissingError: PoS-schema is mandatory if frame is numeric.
        TaggedFrameColumnNameError: Missing target column (corrupt data)

    Returns:
        pd.DataFrame: Filtered and transformed document frame.
    """
    if len(tagged_frame) == 0:
        return []

    is_numeric_frame: bool = is_encoded_tagged_frame(tagged_frame)
    to_lower: bool = transform_opts and transform_opts.to_lower

    if is_numeric_frame:

        if token2id is None:
            raise Token2IdMissingError(
                "filter_tagged_frame: cannot filter tagged id frame without vocabulary"
            )

        if pos_schema is None:
            raise PoSTagSchemaMissingError(
                "filter_tagged_frame: cannot filter tagged id frame without pos_schema"
            )

        if to_lower:
            logger.warning(
                "lowercasing not implemented for numeric tagged frames")
            to_lower = False

    if not is_numeric_frame and extract_opts.lemmatize is None and extract_opts.target_override is None:
        raise ValueError("a valid target not supplied (no lemmatize or target")

    target_column: str = extract_opts.target_column
    pos_column: str = extract_opts.pos_column

    if target_column not in tagged_frame.columns:
        raise TaggedFrameColumnNameError(
            f"{target_column} is not valid target for given document (missing column)"
        )

    if pos_column not in tagged_frame.columns:
        raise ValueError(f"configuration error: {pos_column} not in document")

    passthroughs: Set[str] = extract_opts.get_passthrough_tokens()
    blocks: Set[str] = extract_opts.get_block_tokens().union('')

    if is_numeric_frame:
        passthroughs = token2id.to_id_set(passthroughs)
        blocks = token2id.to_id_set(blocks)

    if not is_numeric_frame and (extract_opts.lemmatize or to_lower):
        tagged_frame[target_column] = tagged_frame[target_column].str.lower()
        # pd.Series([x.lower() for x in tagged_frame[target_column]])
        passthroughs = {x.lower() for x in passthroughs}

    # if extract_opts.block_chars:
    #     for char in extract_opts.block_chars:
    #         doc[target] = doc[target].str.replace(char, '', regex=False)
    """ Phrase detection """
    if extract_opts.phrases:
        if is_numeric_frame:
            logger.warning(
                "phrase detection not implemented for numeric tagged frames")
            extract_opts.phrases = None
        else:
            found_phrases = detect_phrases(tagged_frame[target_column],
                                           extract_opts.phrases,
                                           ignore_case=to_lower)
            if found_phrases:
                tagged_frame = merge_phrases(tagged_frame,
                                             found_phrases,
                                             target_column=target_column,
                                             pad=PHRASE_PAD)
                passthroughs = passthroughs.union(
                    {'_'.join(x[1])
                     for x in found_phrases})

    mask = np.repeat(True, len(tagged_frame.index))
    if extract_opts.filter_opts and extract_opts.filter_opts.data:
        mask &= extract_opts.filter_opts.mask(tagged_frame)

    pos_includes: Set[str] = extract_opts.get_pos_includes()
    pos_excludes: Set[str] = extract_opts.get_pos_excludes()
    pos_paddings: Set[str] = extract_opts.get_pos_paddings()

    if is_numeric_frame:
        pg = pos_schema.pos_to_id.get
        pos_includes = {pg(x) for x in pos_includes}
        pos_excludes = {pg(x) for x in pos_excludes}
        pos_paddings = {pg(x) for x in pos_paddings}

    if pos_includes:
        """Don't filter if PoS-include is empty - and don't filter out PoS tokens that should be padded"""
        mask &= tagged_frame[pos_column].isin(pos_includes.union(pos_paddings))

    if pos_excludes:
        mask &= ~(tagged_frame[pos_column].isin(pos_excludes))

    if transform_opts and transform_opts.has_effect:
        mask &= transform_opts.mask(tagged_frame[target_column],
                                    token2id=token2id)

    if len(passthroughs) > 0:
        mask |= tagged_frame[target_column].isin(passthroughs)

    if len(blocks) > 0:
        mask &= ~tagged_frame[target_column].isin(blocks)

    filtered_data: pd.DataFrame = tagged_frame.loc[mask][[
        target_column, pos_column
    ]]

    if extract_opts.global_tf_threshold > 1:
        if token2id is None or token2id.tf is None:
            logger.error(
                "Cannot apply TF filter since token2id has no term frequencies"
            )
            extract_opts.global_tf_threshold = 1
        else:
            filtered_data = filter_tagged_frame_by_term_frequency(
                tagged_frame=filtered_data,
                target_column=target_column,
                token2id=token2id,
                extract_opts=extract_opts,
                passthroughs=passthroughs,
            )

    if not is_numeric_frame and normalize_column_names:

        filtered_data.rename(columns={target_column: 'token', pos_column: 'pos'}, inplace=True)

    return filtered_data
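A hypothetical call illustrating the non-numeric path (the column names token/baseform/pos, and the assumption that SPARV_TAGGED_COLUMNS maps to them, are mine; only the call pattern mirrors the tests above):

df = pd.DataFrame({
    'token':    ['Valven', 'gapade', '.'],
    'baseform': ['Valv', 'gapa', '.'],
    'pos':      ['NN', 'VB', 'MAD'],
})
opts = ExtractTaggedTokensOpts(lemmatize=True, pos_includes=None, pos_excludes=None, **SPARV_TAGGED_COLUMNS)
filtered = filter_tagged_frame(df, extract_opts=opts)
# expected: all three rows kept, the lemma column lower-cased and renamed to `token`, the PoS column renamed to `pos`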
Example #29
def tagged_frame_to_tokens(  # pylint: disable=too-many-arguments, too-many-statements
    doc: pd.DataFrame,
    extract_opts: ExtractTaggedTokensOpts,
    token2id: Token2Id = None,
    transform_opts: TokensTransformOpts = None,
    pos_schema: PoS_Tag_Scheme = None,
) -> Iterable[str | int]:
    """Extracts tokens from a tagged document represented as a Pandas data frame.

    Args:
        extract_opts (ExtractTaggedTokensOpts): Part-of-speech/lemma extract options (e.g. PoS-filter)

    Returns:
        Iterable[str | int]: Sequence of extracted tokens (token ids if the frame is encoded)
    """

    if len(doc) == 0:
        return []

    is_numeric_frame: bool = is_encoded_tagged_frame(doc)

    if isinstance(extract_opts, str):
        passthrough_column: str = extract_opts
        """If extract_opts is a string, simply return the column with that name"""
        if transform_opts is not None:
            raise ValueError("transform_opts must be None when passthrough is specified")
        return doc[passthrough_column].tolist()

    # if is_numeric_frame:
    #     raise NotImplementedError("tagged_frame_to_tokens cannot handle encoded frames yet")

    pad: str = "*"
    pos_paddings: Set[str] = extract_opts.get_pos_paddings()
    phrase_pad: str = PHRASE_PAD

    filtered_data = filter_tagged_frame(
        tagged_frame=doc,
        extract_opts=extract_opts,
        token2id=token2id,
        transform_opts=transform_opts,
        pos_schema=pos_schema,
    )

    target_columns = ['token_id', 'pos_id'
                      ] if is_numeric_frame else ['token', 'pos']
    token_pos_tuples = filtered_data[target_columns].itertuples(index=False,
                                                                name=None)

    if len(pos_paddings) > 0:

        passthroughs: Set[str] = extract_opts.get_passthrough_tokens()

        if is_numeric_frame:
            pos_paddings = {pos_schema.pos_to_id.get(x) for x in pos_paddings}
            passthroughs = token2id.to_id_set(passthroughs)

        token_pos_tuples = ((pad, x[1]) if x[1] in pos_paddings
                            and x[0] not in passthroughs else x
                            for x in token_pos_tuples)

    if extract_opts.append_pos:

        if is_numeric_frame:
            raise NotImplementedError(
                "tagged_frame_to_tokens: PoS tag on encoded frame")

        return [
            pad if x[0] == pad else f"{x[0].replace(' ', '_')}@{x[1]}"
            for x in token_pos_tuples if x[0] != phrase_pad
        ]

    if not is_numeric_frame and extract_opts.phrases and len(
            extract_opts.phrases) > 0:
        return [
            x[0].replace(' ', '_') for x in token_pos_tuples
            if x[0] != phrase_pad
        ]

    return [x[0] for x in token_pos_tuples]
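When extract_opts is a plain string the function short-circuits and returns that column as-is; a minimal sketch of that passthrough behaviour (the baseform column is a stand-in):

df = pd.DataFrame({'baseform': ['valv', 'gapa', '.'], 'pos': ['NN', 'VB', 'MAD']})
assert tagged_frame_to_tokens(df, 'baseform') == ['valv', 'gapa', '.']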
Example #30
def filter_tagged_frame_by_term_frequency(  # pylint: disable=too-many-arguments, too-many-statements
    tagged_frame: pd.DataFrame,
    target_column: str,
    token2id: Token2Id,
    extract_opts: ExtractTaggedTokensOpts,
    passthroughs: Set[str] = None,
) -> pd.DataFrame:
    """Filter tagged frame `tagged_frame` based on `extract_opts`.
    Return tagged frame with columns `token` and `pos`.
    The `token` column holds the lemmatized or the source word, depending on `extract_opts.lemmatize`.

    Args:
        tagged_frame (pd.DataFrame): tagged frame to be filtered
        extract_opts (ExtractTaggedTokensOpts): Part-of-speech/lemma extract options (e.g. PoS-filter)
        token2id (Token2Id, optional): Vocabulary.

    Returns:
        pd.DataFrame: Filtered tagged frame
    """

    is_numeric_frame: bool = is_encoded_tagged_frame(tagged_frame)

    if extract_opts.global_tf_threshold <= 1:
        return tagged_frame

    if token2id is None or token2id.tf is None:
        raise ValueError("token2id or token2id.tf is not defined")

    if target_column not in tagged_frame.columns:
        raise ValueError(
            f"{target_column} is not valid target for given document (missing column)"
        )
    """
    If global_tf_threshold_mask then filter out tokens below threshold
    Otherwise replace token with `GLOBAL_TF_THRESHOLD_MASK_TOKEN`

    Alternativ implementation:
        1. Compress Token2Id (remove low frequency words)
        2. Remove or mask tokens not in compressed token2id
    """

    tg = token2id.get

    mask_token_id: int = tg(GLOBAL_TF_THRESHOLD_MASK_TOKEN)
    mask_token: str | int = mask_token_id if is_numeric_frame else GLOBAL_TF_THRESHOLD_MASK_TOKEN
    """Set low TF to 0"""
    mask_token_id_tf: int = token2id.tf.get(mask_token_id, 0)
    token2id.tf[mask_token_id] = 0

    cg = token2id.tf.get

    low_frequency_mask: pd.Series = (
        tagged_frame[target_column].apply(tg).apply(cg)
        if not is_numeric_frame else tagged_frame[target_column].apply(cg)
    ).fillna(0) < extract_opts.global_tf_threshold
    """Reset low TF count"""
    token2id.tf[mask_token_id] = mask_token_id_tf

    if passthroughs:
        low_frequency_mask &= ~tagged_frame[target_column].isin(passthroughs)

    if extract_opts.global_tf_threshold_mask:
        """Mask low frequency terms"""
        tagged_frame[target_column] = tagged_frame[target_column].where(
            ~low_frequency_mask, mask_token)
    else:
        """Filter out low frequency terms"""
        tagged_frame = tagged_frame[~low_frequency_mask]

    return tagged_frame
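A hedged sketch of the two modes, reusing names from the tests above (the exact frame layout is an assumption):

df = pd.DataFrame({'baseform': ['valv', 'valv', 'gapa'], 'pos': ['NN', 'NN', 'VB']})
token2id = Token2Id().ingest([GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(df.baseform)

opts = ExtractTaggedTokensOpts(lemmatize=True, pos_includes=None, pos_excludes=None, **SPARV_TAGGED_COLUMNS)
opts.global_tf_threshold = 2

opts.global_tf_threshold_mask = False  # drop rows whose token falls below the threshold
dropped = filter_tagged_frame_by_term_frequency(df.copy(), 'baseform', token2id, opts)
# expected: only the two 'valv' rows remain

opts.global_tf_threshold_mask = True   # keep the row count, replace low-TF tokens instead
masked = filter_tagged_frame_by_term_frequency(df.copy(), 'baseform', token2id, opts)
# expected: 'gapa' replaced by GLOBAL_TF_THRESHOLD_MASK_TOKEN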