def test_tagged_frame_to_tokens_with_tf_threshold_and_not_threshold_tf_mask(tagged_frame: pd.DataFrame):
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes=None,
        pos_excludes=None,
        global_tf_threshold=2,
        global_tf_threshold_mask=False,
        **SPARV_TAGGED_COLUMNS,
    )

    """ Alternative #1: tagged_frame_to_tokens does the filtering """
    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(tagged_frame.baseform)
    expected_count = len(
        tagged_frame[
            tagged_frame.baseform.apply(lambda x: token2id.tf[token2id[x]] >= extract_opts.global_tf_threshold)
        ]
    )
    df: pd.DataFrame = tagged_frame.copy()
    tokens = tagged_frame_to_tokens(df, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == expected_count
    assert set(tokens) == set(['i', 'i', '.', 'valv', 'valv', '.', '.'])

    """ Alternative #2: Use token2id to mask low TF tokens """
    df: pd.DataFrame = tagged_frame.copy()
    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(df.baseform)

    """Note that translation must be used to map token-ids if used elsewhere"""
    _, translation = token2id.compress(tf_threshold=2, inplace=True)  # pylint: disable=unused-variable
    token2id.close()
    tokens = tagged_frame_to_tokens(df, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == expected_count
    assert set(tokens) == set(['i', 'i', '.', 'valv', 'valv', '.', '.'])

def test_tagged_frame_to_tokens_with_tf_threshold_and_threshold_tf_mask(tagged_frame: pd.DataFrame):
    extract_opts = ExtractTaggedTokensOpts(lemmatize=True, pos_includes=None, pos_excludes=None, **SPARV_TAGGED_COLUMNS)

    """ Alternative #1: tagged_frame_to_tokens does the filtering """
    df: pd.DataFrame = tagged_frame.copy()
    extract_opts.global_tf_threshold = 2
    extract_opts.global_tf_threshold_mask = True
    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(df.baseform)
    tokens = tagged_frame_to_tokens(df, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == len(df)
    assert set(tokens) == set([GLOBAL_TF_THRESHOLD_MASK_TOKEN, 'i', 'i', '.', 'valv', 'valv', '.', '.'])

    """ Alternative #2: Use token2id to mask low TF tokens """
    df: pd.DataFrame = tagged_frame.copy()
    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(df.baseform)

    """Note that translation must be used to map token-ids if used elsewhere"""
    token2id.compress(tf_threshold=2, inplace=True)
    token2id.close()
    tokens = tagged_frame_to_tokens(df, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == len(df)
    assert set(tokens) == set([GLOBAL_TF_THRESHOLD_MASK_TOKEN, 'i', 'i', '.', 'valv', 'valv', '.', '.'])

def test_token2id_store_and_load():
    os.makedirs('./tests/output', exist_ok=True)

    token2id: Token2Id = Token2Id().ingest(TEST_TOKENS_STREAM1)

    filename = './tests/output/test_vocabulary.zip'
    tf_filename = path_add_suffix(filename, "_tf", new_extension=".pbz2")

    token2id.store(filename=filename)

    assert os.path.isfile(filename) and os.path.isfile(tf_filename)

    token2id_loaded: Token2Id = Token2Id.load(filename=filename)

    assert token2id_loaded is not None
    assert token2id_loaded.tf is not None
    assert token2id_loaded.data == {'adam': 0, 'anton': 1, 'beatrice': 2, 'felicia': 3, 'niklas': 4}
    assert dict(token2id_loaded.tf) == {0: 3, 1: 2, 2: 1, 3: 1, 4: 1}

def translate_id_pair_to_token(self, pair2id: Token2Id, token2id: Token2Id) -> None:
    """Translates `id pairs` (w1_id, w2_id) to pair-token `w1/w2`"""
    _single_without_sep = {w_id: w.replace(WORD_PAIR_DELIMITER, '') for w_id, w in token2id.id2token.items()}
    sg = _single_without_sep.get
    sj = WORD_PAIR_DELIMITER.join  # assumed helper: `sj` was undefined as given; joins single tokens into a `w1/w2` pair token
    pair2id.replace(
        data={sj([sg(w1_id), sg(w2_id)]): pair_id for (w1_id, w2_id), pair_id in pair2id.data.items()}
    )

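# A minimal, self-contained illustration (not part of the library) of the mapping performed by
# translate_id_pair_to_token above: pair-vocabulary keys (w1_id, w2_id) are rewritten to `w1/w2`
# tokens by looking up the single-token ids. Plain dicts and a literal '/' delimiter are used
# here for brevity; the method itself works on Token2Id instances and WORD_PAIR_DELIMITER.
def test_translate_id_pair_to_token_sketch():
    id2token = {0: 'a', 1: 'b', 2: 'c'}
    pair_data = {(0, 1): 0, (0, 2): 1}

    sg = id2token.get
    translated = {'/'.join([sg(w1_id), sg(w2_id)]): pair_id for (w1_id, w2_id), pair_id in pair_data.items()}

    assert translated == {'a/b': 0, 'a/c': 1}
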
def test_interfaces_token2id_store():
    os.makedirs('./tests/output', exist_ok=True)
    filename: str = './tests/output/test_interfaces_token2id_store.zip'

    token2id = Token2Id()
    token2id.ingest(['apa', 'banan', 'soffa'])
    token2id.store(filename)

    assert pathlib.Path(filename).exists()

    token2id_loaded: Token2Id = Token2Id.load(filename)

    assert token2id.data == token2id_loaded.data

def test_term_term_matrix_to_co_occurrences_with_ignore_ids():
    text_corpus = very_simple_corpus(
        data=[
            ('tran_2019_01_test.txt', ['*', 'b', 'c', 'c']),
            ('tran_2019_02_test.txt', ['a', '*', '*', 'd']),
            ('tran_2019_03_test.txt', ['a', 'e', 'e', 'b']),
            ('tran_2020_01_test.txt', ['*', 'c', 'd', 'a']),
            ('tran_2020_02_test.txt', ['a', 'b', '*', '*']),
        ]
    )
    token2id: Token2Id = Token2Id(text_corpus.token2id)
    term_term_matrix = (
        dtm.CorpusVectorizer()
        .fit_transform(text_corpus, already_tokenized=True, vocabulary=text_corpus.token2id)
        .co_occurrence_matrix()
    )
    pad_id = token2id['*']

    co_occurrences = term_term_matrix_to_co_occurrences(
        term_term_matrix=term_term_matrix,
        threshold_count=1,
        ignore_ids=set([pad_id]),
    )

    assert not (co_occurrences.w1_id == pad_id).any()
    assert not (co_occurrences.w2_id == pad_id).any()

def get_topic_token_weights(
    self, vocabulary: Any, n_tokens: int = 200, minimum_probability: float = 0.000001
) -> pd.DataFrame:
    """Compile topic token weights. Return DataFrame."""
    id2token: dict = Token2Id.any_to_id2token(vocabulary)
    topic_data: list[tuple[int, TokenWeights]] = self.get_topic_token_weights_data(n_tokens=n_tokens, id2term=id2token)

    topic_token_weights: pd.DataFrame = pd.DataFrame(
        [
            (topic_id, token, weight)
            for topic_id, tokens in topic_data
            for token, weight in tokens
            if weight > minimum_probability
        ],
        columns=['topic_id', 'token', 'weight'],
    )

    topic_token_weights['topic_id'] = topic_token_weights.topic_id.astype(np.uint16)

    fg = {v: k for k, v in id2token.items()}.get
    topic_token_weights['token_id'] = topic_token_weights.token.apply(fg)

    return topic_token_weights[['topic_id', 'token_id', 'token', 'weight']]

def test_interfaces_token2id_get():
    token2id = Token2Id()
    token_id = token2id['apa']

    assert token_id == 0
    assert 'apa' in token2id

def test_interfaces_token2id_reverse():
    token2id: Token2Id = Token2Id()
    id2token = token2id.ingest(['apa', 'banan', 'soffa']).id2token

    assert id2token[0] == 'apa'
    assert id2token[1] == 'banan'
    assert id2token[2] == 'soffa'

def test_interfaces_token2id_ingest():
    token2id = Token2Id()
    token2id.ingest(['apa', 'banan', 'soffa'])

    assert 'apa' in token2id
    assert 'banan' in token2id
    assert 'soffa' in token2id

def _token2id(self) -> Optional[Token2Id]:
    """Returns dictionary stored in archive, or None if not found in archive"""
    if DICTIONARY_FILENAME not in self.namelist():
        return None
    return Token2Id(zip_utils.read_json(zip_or_filename=self, filename=DICTIONARY_FILENAME))

def process_stream(self) -> Iterable[DocumentPayload]:

    if self.document_index is None:
        raise CoOccurrenceError("expected document index found no such thing")

    token2id: Token2Id = self.pipeline.payload.token2id
    pair2id: Token2Id = Token2Id()

    normal_builder: CoOccurrenceCorpusBuilder = CoOccurrenceCorpusBuilder(
        VectorizeType.Normal, self.document_index, pair2id, token2id
    )

    concept_builder: CoOccurrenceCorpusBuilder = (
        CoOccurrenceCorpusBuilder(VectorizeType.Concept, self.document_index, pair2id, token2id)
        if self.context_opts.concept
        else None
    )

    coo_payloads: Iterable[CoOccurrencePayload] = (
        payload.content
        for payload in self.prior.outstream(desc="Ingest", total=len(self.document_index))
        if payload.content is not None
    )

    for coo_payload in coo_payloads:
        normal_builder.ingest_pairs(coo_payload).add(payload=coo_payload)
        if concept_builder:
            concept_builder.add(payload=coo_payload)

    pair2id.close()

    """Translation between id-pair (single vocab IDs) and pair-id (pair vocab IDs)"""
    token_ids_2_pair_id: Mapping[Tuple[int, int], int] = dict(pair2id.data)

    self.translate_id_pair_to_token(pair2id, token2id)

    concept_corpus: VectorizedCorpus = (
        concept_builder.corpus.remember(window_counts=self.get_window_counts(concept_builder))
        if concept_builder
        else None
    )

    corpus: VectorizedCorpus = normal_builder.corpus.remember(window_counts=self.get_window_counts(normal_builder))

    bundle: Bundle = Bundle(
        corpus=corpus,
        token2id=token2id,
        document_index=self.document_index,
        concept_corpus=concept_corpus,
        compute_options=self.pipeline.payload.stored_opts(),
        vocabs_mapping=token_ids_2_pair_id,
    )

    if self.compress:
        bundle.compress()

    payload: DocumentPayload = DocumentPayload(content=bundle)

    yield payload

def setup(self) -> ITask:
    self.target = self.get_column_name(self.token_type)
    # if self.pipeline.get_next_to(self).in_content_type == ContentType.TAGGED_FRAME:
    #     if self.token_type is None:
    #         raise ValueError("token_type text or lemma not specified")
    self.token2id: Token2Id = self.token2id or Token2Id()
    self.pipeline.payload.token2id = self.token2id
    return self

def test_replace():
    tokens = ['a', 'a', 'b', 'c']
    ingested: Token2Id = Token2Id().ingest(tokens)

    token2id: Token2Id = Token2Id()
    token2id.replace(data={'a': 0, 'b': 1, 'c': 2}, tf=Counter({0: 2, 1: 1, 2: 1}))

    assert dict(token2id.data) == dict(ingested.data)
    assert dict(token2id.tf) == dict(ingested.tf)

def test_tagged_frame_to_tokens_with_global_tf_threshold(tagged_frame: pd.DataFrame):
    tagged_frame: pd.DataFrame = tagged_frame.copy()

    expected_counts: dict = {
        '.': 3,
        'bakom': 1,
        'den': 1,
        'fladdra_omkring': 1,
        'gapa': 1,
        'halvmörker': 1,
        'i': 2,
        'ingen': 1,
        'inne': 1,
        'kyrka': 1,
        'ljuslåga': 1,
        'någon': 1,
        'och': 1,
        'romansk': 1,
        'tränga': 1,
        'turist': 1,
        'valv': 2,
        'väldig': 1,
        'överblick': 1,
    }

    extract_opts = ExtractTaggedTokensOpts(lemmatize=True, pos_includes=None, pos_excludes=None, **SPARV_TAGGED_COLUMNS)

    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert set(expected_counts.keys()) == set(tokens)

    """TF threshold resets to 1 if token2id not supplied (i.e. token2id.TF is needed)"""
    extract_opts.global_tf_threshold = 2
    tokens = tagged_frame_to_tokens(tagged_frame, token2id=None, extract_opts=extract_opts)
    assert extract_opts.global_tf_threshold == 1

    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(tagged_frame.baseform)

    extract_opts.global_tf_threshold = 2
    extract_opts.global_tf_threshold_mask = False
    tokens = tagged_frame_to_tokens(tagged_frame, token2id=token2id, extract_opts=extract_opts)
    assert tokens == ['i', 'i', '.', 'valv', 'valv', '.', '.']

    extract_opts.global_tf_threshold = 2
    extract_opts.global_tf_threshold_mask = True
    tokens = tagged_frame_to_tokens(tagged_frame, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == len(tagged_frame)
    assert set(tokens) == set([GLOBAL_TF_THRESHOLD_MASK_TOKEN, 'i', 'i', '.', 'valv', 'valv', '.', '.'])

    extract_opts.global_tf_threshold = 2
    extract_opts.global_tf_threshold_mask = True
    extract_opts.passthrough_tokens = {'överblick'}
    tokens = tagged_frame_to_tokens(tagged_frame, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == len(tagged_frame)
    assert set(tokens) == set([GLOBAL_TF_THRESHOLD_MASK_TOKEN, 'i', 'i', '.', 'valv', 'valv', '.', '.', 'överblick'])

def test_token2id_compress_with_ingested_mask_token_and_threshold_has_correct_magic_token_sum():
    token2id: Token2Id = Token2Id().ingest(TEST_TOKENS_STREAM2).ingest([GLOBAL_TF_THRESHOLD_MASK_TOKEN]).close()

    _, translation = token2id.compress(tf_threshold=2, inplace=True)

    assert dict(token2id.data) == {'adam': 0, 'anton': 1, 'beata': 2, GLOBAL_TF_THRESHOLD_MASK_TOKEN: 3}
    assert dict(token2id.tf) == {0: 3, 1: 2, 2: 2, 3: 4}
    assert translation == {0: 0, 1: 1, 5: 2, 6: 3}

def test_token2id_find():
    token2id: Token2Id = Token2Id({'adam': 0, 'anton': 1, 'beatrice': 2, 'felicia': 3, 'niklas': 4})

    assert set(token2id.find(what='adam')) == set([0])
    assert set(token2id.find(what='a*')) == set([0, 1])
    assert set(token2id.find(what=['a*', 'f*'])) == set([0, 1, 3])
    assert set(token2id.find(what=['a*', 'beatrice'])) == set([0, 1, 2])

def test_interfaces_token2id_close():
    token2id = Token2Id()
    token2id.ingest(['apa', 'banan', 'soffa'])
    token2id.close()

    with pytest.raises(KeyError):
        _ = token2id['hängmatta']

    token2id.open()
    _ = token2id['hängmatta']

    assert 'hängmatta' in token2id

def test_TTM_to_co_occurrence_DTM_using_LIL_matrix():
    source_corpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)
    token2id = Token2Id(source_corpus.token2id)
    document_index: DocumentIndex = source_corpus.document_index

    stream: Iterable[CoOccurrencePayload] = (
        CoOccurrencePayload(
            document_id,
            document_name="-",
            ttm_data_map={
                VectorizeType.Normal: VectorizedTTM(
                    vectorize_type=VectorizeType.Normal,
                    term_term_matrix=CorpusVectorizer()
                    .fit_transform([doc], already_tokenized=True, vocabulary=token2id.data)
                    .co_occurrence_matrix(),
                    term_window_counts={},
                    document_id=document_id,
                )
            },
        )
        for document_id, doc in enumerate(source_corpus)
    )

    pair2id: Token2Id = Token2Id()

    builder: CoOccurrenceCorpusBuilder = CoOccurrenceCorpusBuilder(
        vectorize_type=VectorizeType.Normal,
        document_index=document_index,
        pair2id=pair2id,
        token2id=token2id,
    )

    for payload in stream:
        builder.ingest_pairs(payload).add(payload)

    corpus: VectorizedCorpus = builder.corpus

    assert corpus is not None

def test_translation():
    data = {'*': 0, '__low-tf__': 1, 'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6}
    tf = {0: 1, 1: 1, 2: 5, 3: 2, 4: 5, 5: 2, 6: 4}

    token2id = Token2Id(data=data, tf=tf)

    _, translation = token2id.compress(tf_threshold=4, inplace=True)

    tf = {0: 1, 1: 1, 2: 5, 3: 2, 4: 5, 5: 2, 6: 4}

    assert dict(token2id.data) == {'*': 0, '__low-tf__': 1, 'a': 2, 'c': 3, 'e': 4}
    assert dict(token2id.tf) == {0: 1, 1: 5, 2: 5, 3: 5, 4: 4}
    assert dict(translation) == {0: 0, 1: 1, 2: 2, 4: 3, 6: 4}

def test_translate():
    data = {'*': 0, '__low-tf__': 1, 'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6}
    tf = {0: 1, 1: 1, 2: 5, 3: 2, 4: 5, 5: 2, 6: 4}
    ids_translation = {0: 0, 1: 1, 2: 2, 4: 3, 6: 4}

    token2id = Token2Id(data=data, tf=tf)
    token2id.translate(ids_translation=ids_translation, inplace=True)

    assert dict(token2id.data) == {'*': 0, '__low-tf__': 1, 'a': 2, 'c': 3, 'e': 4}

    """Note that translate doesn't add low-TF counts to the low-TF marker"""
    assert dict(token2id.tf) == {0: 1, 1: 1, 2: 5, 3: 5, 4: 4}

def test_token2id_compress_with_threshold_and_keeps_adds_masked_magic_token_with_correct_sum():
    mask_token: str = GLOBAL_TF_THRESHOLD_MASK_TOKEN
    token2id: Token2Id = Token2Id().ingest(TEST_TOKENS_STREAM2).close()

    token2id_compressed, _ = token2id.compress(tf_threshold=2, keeps={4}, inplace=False)

    assert mask_token not in token2id
    assert mask_token in token2id_compressed

    sum_of_masked_tokens = sum([v for k, v in EXPECTED_COUNTS2.items() if k not in token2id_compressed])

    assert token2id_compressed.tf[token2id_compressed[mask_token]] == sum_of_masked_tokens

def test_token2id_inplace_compress_with_threshold_and_no_keeps():
    token2id: Token2Id = Token2Id().ingest(TEST_TOKENS_STREAM2).close()

    token2id_compressed, translation = token2id.compress(tf_threshold=2, inplace=False)

    assert dict(token2id_compressed.data) == {'adam': 0, 'anton': 1, 'beata': 2, GLOBAL_TF_THRESHOLD_MASK_TOKEN: 3}
    assert dict(token2id_compressed.tf) == {0: 3, 1: 2, 2: 2, 3: 3}
    assert translation == {0: 0, 1: 1, 5: 2}

    assert token2id.fallback_token_id is None
    assert token2id_compressed.fallback_token_id is not None
    assert token2id_compressed["roger"] == token2id_compressed[GLOBAL_TF_THRESHOLD_MASK_TOKEN]
    assert "roger" not in token2id_compressed

def test_token2id_compress_with_no_threshold_and_no_keeps_returns_self():
    token2id: Token2Id = Token2Id().ingest(TEST_TOKENS_STREAM2).close()

    assert token2id.data == {'adam': 0, 'anton': 1, 'beatrice': 2, 'felicia': 3, 'niklas': 4, 'beata': 5}
    assert dict(token2id.tf) == {0: 3, 1: 2, 2: 1, 3: 1, 4: 1, 5: 2}

    token2id_compressed, translation = token2id.compress(tf_threshold=1, inplace=False, keeps=None)

    assert token2id_compressed is token2id
    assert translation is None

def test_co_occurrences_to_co_occurrence_corpus():
    folder, tag = './tests/test_data/ABCDEFG_7DOCS_CONCEPT', "ABCDEFG_7DOCS_CONCEPT"
    co_occurrences: CoOccurrenceDataFrame = co_occurrence.load_co_occurrences(co_occurrence_filename(folder, tag))
    document_index: DocumentIndex = DocumentIndexHelper.load(document_index_filename(folder, tag)).document_index
    token2id: Token2Id = Token2Id.load(vocabulary_filename(folder, tag))

    corpus = LegacyCoOccurrenceMixIn.from_co_occurrences(
        co_occurrences=co_occurrences,
        document_index=document_index,
        token2id=token2id,
    )

    assert corpus.data.sum() == co_occurrences.value.sum()
    assert corpus.data.shape[0] == len(document_index)
    assert corpus.data.shape[1] == len(co_occurrences[["w1_id", "w2_id"]].drop_duplicates())

def test_token2id_compress_with_threshold_and_keeps_succeeds():
    token2id: Token2Id = Token2Id().ingest(TEST_TOKENS_STREAM2).close()

    _, translation = token2id.compress(tf_threshold=2, inplace=True, keeps={token2id["felicia"]})

    assert dict(token2id.data) == {'adam': 0, 'anton': 1, 'felicia': 2, 'beata': 3, GLOBAL_TF_THRESHOLD_MASK_TOKEN: 4}
    assert tf_to_string(token2id) == {'adam': 3, 'anton': 2, 'felicia': 1, 'beata': 2, '__low-tf__': 2}
    assert translation == {0: 0, 1: 1, 3: 2, 5: 3}

def test_token2id_ingest():
    token2id: Token2Id = Token2Id().ingest(TEST_TOKENS_STREAM1)

    assert token2id.data == {'adam': 0, 'anton': 1, 'beatrice': 2, 'felicia': 3, 'niklas': 4}
    assert token2id.tf is not None
    assert dict(token2id.tf) == {0: 3, 1: 2, 2: 1, 3: 1, 4: 1}
    assert tf_to_string(token2id) == {'adam': 3, 'anton': 2, 'beatrice': 1, 'felicia': 1, 'niklas': 1}

def filter_tagged_frame(
    tagged_frame: pd.DataFrame,
    *,
    extract_opts: ExtractTaggedTokensOpts,
    token2id: Token2Id = None,
    pos_schema: PoS_Tag_Scheme = None,
    normalize_column_names: bool = True,
    transform_opts: TokensTransformOpts = None,
) -> pd.DataFrame:
    """Filters tagged frame (text or numeric). Returns tagged frame.

    Args:
        tagged_frame (pd.DataFrame): Document frame to be filtered, can be text or numeric
        extract_opts (ExtractTaggedTokensOpts): PoS and lemma extract/filter opts
        token2id (Token2Id, optional): Vocabulary. Defaults to None.
        pos_schema (PoS_Tag_Scheme, optional): PoS schema. Defaults to None.
        transform_opts (TokensTransformOpts, optional): Filters and transforms. Defaults to None.
        normalize_column_names (bool, optional): If text, rename columns to `token` and `pos`. Defaults to True.

    Raises:
        Token2IdMissingError: Token2Id is mandatory if frame is numeric.
        PoSTagSchemaMissingError: PoS-schema is mandatory if frame is numeric.
        TaggedFrameColumnNameError: Missing target column (corrupt data)

    Returns:
        pd.DataFrame: Filtered and transformed document frame.
    """
    if len(tagged_frame) == 0:
        return []

    is_numeric_frame: bool = is_encoded_tagged_frame(tagged_frame)

    to_lower: bool = transform_opts and transform_opts.to_lower

    if is_numeric_frame:

        if token2id is None:
            raise Token2IdMissingError("filter_tagged_frame: cannot filter tagged id frame without vocabulary")

        if pos_schema is None:
            raise PoSTagSchemaMissingError("filter_tagged_frame: cannot filter tagged id frame without pos_schema")

        if to_lower:
            logger.warning("lowercasing not implemented for numeric tagged frames")
            to_lower = False

    if not is_numeric_frame and extract_opts.lemmatize is None and extract_opts.target_override is None:
        raise ValueError("a valid target not supplied (no lemmatize or target)")

    target_column: str = extract_opts.target_column
    pos_column: str = extract_opts.pos_column

    if target_column not in tagged_frame.columns:
        raise TaggedFrameColumnNameError(f"{target_column} is not valid target for given document (missing column)")

    if pos_column not in tagged_frame.columns:
        raise ValueError(f"configuration error: {pos_column} not in document")

    passthroughs: Set[str] = extract_opts.get_passthrough_tokens()
    blocks: Set[str] = extract_opts.get_block_tokens().union('')

    if is_numeric_frame:
        passthroughs = token2id.to_id_set(passthroughs)
        blocks = token2id.to_id_set(blocks)

    if not is_numeric_frame and (extract_opts.lemmatize or to_lower):
        tagged_frame[target_column] = tagged_frame[target_column].str.lower()
        # pd.Series([x.lower() for x in tagged_frame[target_column]])
        passthroughs = {x.lower() for x in passthroughs}

    # if extract_opts.block_chars:
    #     for char in extract_opts.block_chars:
    #         doc[target] = doc[target].str.replace(char, '', regex=False)

    """ Phrase detection """
    if extract_opts.phrases:
        if is_numeric_frame:
            logger.warning("phrase detection not implemented for numeric tagged frames")
            extract_opts.phrases = None
        else:
            found_phrases = detect_phrases(tagged_frame[target_column], extract_opts.phrases, ignore_case=to_lower)
            if found_phrases:
                tagged_frame = merge_phrases(tagged_frame, found_phrases, target_column=target_column, pad=PHRASE_PAD)
                passthroughs = passthroughs.union({'_'.join(x[1]) for x in found_phrases})

    mask = np.repeat(True, len(tagged_frame.index))

    if extract_opts.filter_opts and extract_opts.filter_opts.data:
        mask &= extract_opts.filter_opts.mask(tagged_frame)

    pos_includes: Set[str] = extract_opts.get_pos_includes()
    pos_excludes: Set[str] = extract_opts.get_pos_excludes()
    pos_paddings: Set[str] = extract_opts.get_pos_paddings()

    if is_numeric_frame:
        pg = pos_schema.pos_to_id.get
        pos_includes = {pg(x) for x in pos_includes}
        pos_excludes = {pg(x) for x in pos_excludes}
        pos_paddings = {pg(x) for x in pos_paddings}

    if pos_includes:
        """Don't filter if PoS-include is empty - and don't filter out PoS tokens that should be padded"""
        mask &= tagged_frame[pos_column].isin(pos_includes.union(pos_paddings))

    if pos_excludes:
        mask &= ~(tagged_frame[pos_column].isin(pos_excludes))

    if transform_opts and transform_opts.has_effect:
        mask &= transform_opts.mask(tagged_frame[target_column], token2id=token2id)

    if len(passthroughs) > 0:
        mask |= tagged_frame[target_column].isin(passthroughs)

    if len(blocks) > 0:
        mask &= ~tagged_frame[target_column].isin(blocks)

    filtered_data: pd.DataFrame = tagged_frame.loc[mask][[target_column, pos_column]]

    if extract_opts.global_tf_threshold > 1:
        if token2id is None or token2id.tf is None:
            logger.error("Cannot apply TF filter since token2id has no term frequencies")
            extract_opts.global_tf_threshold = 1
        else:
            filtered_data = filter_tagged_frame_by_term_frequency(
                tagged_frame=filtered_data,
                target_column=target_column,
                token2id=token2id,
                extract_opts=extract_opts,
                passthroughs=passthroughs,
            )

    if not is_numeric_frame and normalize_column_names:
        filtered_data.rename(columns={target_column: 'token', pos_column: 'pos'}, inplace=True)

    return filtered_data

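# Hedged usage sketch of filter_tagged_frame (assumes the same `tagged_frame` fixture and
# SPARV_TAGGED_COLUMNS used by the tests above; the PoS include string 'NN|VB' is only an
# illustrative value). With normalize_column_names left at its default, the result is a text
# frame with normalized `token` and `pos` columns and at most as many rows as the input.
def test_filter_tagged_frame_usage_sketch(tagged_frame: pd.DataFrame):
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True, pos_includes='NN|VB', pos_excludes=None, **SPARV_TAGGED_COLUMNS
    )
    filtered: pd.DataFrame = filter_tagged_frame(tagged_frame.copy(), extract_opts=extract_opts)

    assert set(filtered.columns) == {'token', 'pos'}
    assert len(filtered) <= len(tagged_frame)
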
def tagged_frame_to_tokens(  # pylint: disable=too-many-arguments, too-many-statements
    doc: pd.DataFrame,
    extract_opts: ExtractTaggedTokensOpts,
    token2id: Token2Id = None,
    transform_opts: TokensTransformOpts = None,
    pos_schema: PoS_Tag_Scheme = None,
) -> Iterable[str | int]:
    """Extracts tokens from a tagged document represented as a Pandas data frame.

    Args:
        extract_opts (ExtractTaggedTokensOpts): Part-of-speech/lemma extract options (e.g. PoS-filter)

    Returns:
        Iterable[str]: Sequence of extracted tokens
    """
    if len(doc) == 0:
        return []

    is_numeric_frame: bool = is_encoded_tagged_frame(doc)

    if isinstance(extract_opts, str):
        passthrough_column: str = extract_opts
        """If extract_opts is a string: return the column with that name"""
        if transform_opts is not None:
            raise ValueError("transform_opts must be None when passthrough is specified")
        return doc[passthrough_column].tolist()

    # if is_numeric_frame:
    #     raise NotImplementedError("tagged_frame_to_tokens cannot handle encoded frames yet")

    pad: str = "*"
    pos_paddings: Set[str] = extract_opts.get_pos_paddings()
    phrase_pad: str = PHRASE_PAD

    filtered_data = filter_tagged_frame(
        tagged_frame=doc,
        extract_opts=extract_opts,
        token2id=token2id,
        transform_opts=transform_opts,
        pos_schema=pos_schema,
    )

    target_columns = ['token_id', 'pos_id'] if is_numeric_frame else ['token', 'pos']

    token_pos_tuples = filtered_data[target_columns].itertuples(index=False, name=None)

    if len(pos_paddings) > 0:

        passthroughs: Set[str] = extract_opts.get_passthrough_tokens()

        if is_numeric_frame:
            pos_paddings = {pos_schema.pos_to_id.get(x) for x in pos_paddings}
            passthroughs = token2id.to_id_set(passthroughs)

        token_pos_tuples = (
            (pad, x[1]) if x[1] in pos_paddings and x[0] not in passthroughs else x for x in token_pos_tuples
        )

    if extract_opts.append_pos:

        if is_numeric_frame:
            raise NotImplementedError("tagged_frame_to_tokens: PoS tag on encoded frame")

        return [
            pad if x[0] == pad else f"{x[0].replace(' ', '_')}@{x[1]}" for x in token_pos_tuples if x[0] != phrase_pad
        ]

    if not is_numeric_frame and extract_opts.phrases and len(extract_opts.phrases) > 0:
        return [x[0].replace(' ', '_') for x in token_pos_tuples if x[0] != phrase_pad]

    return [x[0] for x in token_pos_tuples]

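# Hedged sketch of the passthrough short-circuit in tagged_frame_to_tokens above: when
# `extract_opts` is given as a plain column name (a string) the function simply returns
# that column as a list. Assumes the `tagged_frame` fixture with a sparv `baseform` column,
# as in the tests earlier in this module.
def test_tagged_frame_to_tokens_passthrough_column_sketch(tagged_frame: pd.DataFrame):
    tokens = tagged_frame_to_tokens(tagged_frame.copy(), extract_opts='baseform')

    assert tokens == tagged_frame.baseform.tolist()
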
def filter_tagged_frame_by_term_frequency(  # pylint: disable=too-many-arguments, too-many-statements
    tagged_frame: pd.DataFrame,
    target_column: str,
    token2id: Token2Id,
    extract_opts: ExtractTaggedTokensOpts,
    passthroughs: Set[str] = None,
) -> pd.DataFrame:
    """Filters tagged frame `tagged_frame` by global term frequency based on `extract_opts`.
    Tokens in `target_column` whose term frequency in `token2id` is below
    `extract_opts.global_tf_threshold` are either masked or removed.

    Args:
        tagged_frame (pd.DataFrame): tagged frame to be filtered
        target_column (str): name of the column holding the tokens (or token ids)
        token2id (Token2Id): Vocabulary with term frequencies.
        extract_opts (ExtractTaggedTokensOpts): Part-of-speech/lemma extract options (e.g. PoS-filter)
        passthroughs (Set[str], optional): tokens kept regardless of frequency. Defaults to None.

    Returns:
        pd.DataFrame: Filtered tagged frame
    """
    is_numeric_frame: bool = is_encoded_tagged_frame(tagged_frame)

    if extract_opts.global_tf_threshold <= 1:
        return tagged_frame

    if token2id is None or token2id.tf is None:
        raise ValueError("token2id or token2id.tf is not defined")

    if target_column not in tagged_frame.columns:
        raise ValueError(f"{target_column} is not valid target for given document (missing column)")

    """
    If global_tf_threshold_mask then replace tokens below threshold with `GLOBAL_TF_THRESHOLD_MASK_TOKEN`
    Otherwise filter out tokens below threshold

    Alternative implementation:
        1. Compress Token2Id (remove low frequency words)
        2. Remove or mask tokens not in compressed token2id
    """
    tg = token2id.get

    mask_token_id: int = tg(GLOBAL_TF_THRESHOLD_MASK_TOKEN)
    mask_token: str | int = mask_token_id if is_numeric_frame else GLOBAL_TF_THRESHOLD_MASK_TOKEN

    """Temporarily set the mask token's TF to 0"""
    mask_token_id_tf: int = token2id.tf.get(mask_token_id, 0)
    token2id.tf[mask_token_id] = 0

    cg = token2id.tf.get

    low_frequency_mask: pd.Series = (
        tagged_frame[target_column].apply(tg).apply(cg)
        if not is_numeric_frame
        else tagged_frame[target_column].apply(cg)
    ).fillna(0) < extract_opts.global_tf_threshold

    """Restore the mask token's TF count"""
    token2id.tf[mask_token_id] = mask_token_id_tf

    if passthroughs:
        low_frequency_mask &= ~tagged_frame[target_column].isin(passthroughs)

    if extract_opts.global_tf_threshold_mask:
        """Mask low frequency terms"""
        tagged_frame[target_column] = tagged_frame[target_column].where(~low_frequency_mask, mask_token)
    else:
        """Filter out low frequency terms"""
        tagged_frame = tagged_frame[~low_frequency_mask]

    return tagged_frame

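# Hedged usage sketch of filter_tagged_frame_by_term_frequency: the TF filter is applied
# directly on the sparv `baseform` column (same fixture and options as the threshold tests
# above). With global_tf_threshold_mask=False, rows whose lemma falls below the threshold
# are dropped, so every surviving lemma has a term frequency of at least 2.
def test_filter_tagged_frame_by_term_frequency_sketch(tagged_frame: pd.DataFrame):
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes=None,
        pos_excludes=None,
        global_tf_threshold=2,
        global_tf_threshold_mask=False,
        **SPARV_TAGGED_COLUMNS,
    )
    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(tagged_frame.baseform)

    filtered: pd.DataFrame = filter_tagged_frame_by_term_frequency(
        tagged_frame=tagged_frame.copy(),
        target_column='baseform',
        token2id=token2id,
        extract_opts=extract_opts,
    )

    assert len(filtered) < len(tagged_frame)
    assert all(token2id.tf[token2id[w]] >= 2 for w in filtered.baseform)
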