def test_tagged_frame_to_tokens_with_tf_threshold_and_not_threshold_tf_mask(tagged_frame: pd.DataFrame):
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes=None,
        pos_excludes=None,
        global_tf_threshold=2,
        global_tf_threshold_mask=False,
        **SPARV_TAGGED_COLUMNS,
    )

    """Alternative #1: tagged_frame_to_tokens does the filtering"""
    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(tagged_frame.baseform)
    expected_count = len(
        tagged_frame[
            tagged_frame.baseform.apply(lambda x: token2id.tf[token2id[x]] >= extract_opts.global_tf_threshold)
        ]
    )
    df: pd.DataFrame = tagged_frame.copy()
    tokens = tagged_frame_to_tokens(df, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == expected_count
    assert set(tokens) == set(['i', 'i', '.', 'valv', 'valv', '.', '.'])

    """Alternative #2: use token2id to mask low-TF tokens"""
    df: pd.DataFrame = tagged_frame.copy()
    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(df.baseform)

    """Note that the returned translation must be used to map token-ids if they are used elsewhere
       (see the sketch following this test)."""
    _, translation = token2id.compress(tf_threshold=2, inplace=True)  # pylint: disable=unused-variable
    token2id.close()
    tokens = tagged_frame_to_tokens(df, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == expected_count
    assert set(tokens) == set(['i', 'i', '.', 'valv', 'valv', '.', '.'])


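# The `translation` returned by Token2Id.compress() above maps each pre-compress
# token-id to its new id. A minimal, hedged sketch of how such a remap could be
# applied to ids stored elsewhere; the helper and `old_ids` are hypothetical, and
# `translation` is assumed to behave like a dict of old-id -> new-id:
def _remap_compressed_ids(old_ids: List[int], translation: dict) -> List[int]:
    """Drop ids that were compressed away; map the rest to the new id space."""
    return [translation[token_id] for token_id in old_ids if token_id in translation]

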
def test_phrased_tagged_frame():
    os.makedirs('./tests/output', exist_ok=True)

    tagged_corpus_source: str = "./tests/test_data/tranströmer_corpus_export.sparv4.csv.zip"
    checkpoint_opts: checkpoint.CheckpointOpts = None
    data = checkpoint.load_archive(source_name=tagged_corpus_source, checkpoint_opts=checkpoint_opts, reader_opts=None)
    payload = next(data.create_stream())

    tokens = tagged_frame_to_tokens(
        payload.content,
        ExtractTaggedTokensOpts(lemmatize=False, **SPARV_TAGGED_COLUMNS),
    )
    assert tokens is not None

    phrases = {'United_Nations': 'United Nations'.split()}
    phrased_tokens = tagged_frame_to_tokens(
        payload.content,
        ExtractTaggedTokensOpts(lemmatize=False, phrases=phrases, **SPARV_TAGGED_COLUMNS),
    )
    assert phrased_tokens[:9] == [
        'Constitution',
        'of',
        'the',
        'United_Nations',
        'Educational',
        ',',
        'Scientific',
        'and',
        'Cultural',
    ]


def test_tagged_frame_to_tokens_with_tf_threshold_and_threshold_tf_mask(tagged_frame: pd.DataFrame):
    extract_opts = ExtractTaggedTokensOpts(lemmatize=True, pos_includes=None, pos_excludes=None, **SPARV_TAGGED_COLUMNS)

    """Alternative #1: tagged_frame_to_tokens does the filtering"""
    df: pd.DataFrame = tagged_frame.copy()
    extract_opts.global_tf_threshold = 2
    extract_opts.global_tf_threshold_mask = True
    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(df.baseform)
    tokens = tagged_frame_to_tokens(df, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == len(df)
    assert set(tokens) == set([GLOBAL_TF_THRESHOLD_MASK_TOKEN, 'i', 'i', '.', 'valv', 'valv', '.', '.'])

    """Alternative #2: use token2id to mask low-TF tokens"""
    df: pd.DataFrame = tagged_frame.copy()
    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(df.baseform)

    """Note that the translation returned by compress() must be used to map token-ids if they are used elsewhere."""
    token2id.compress(tf_threshold=2, inplace=True)
    token2id.close()
    tokens = tagged_frame_to_tokens(df, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == len(df)
    assert set(tokens) == set([GLOBAL_TF_THRESHOLD_MASK_TOKEN, 'i', 'i', '.', 'valv', 'valv', '.', '.'])


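# Read together with test_tagged_frame_to_tokens_with_tf_threshold_and_not_threshold_tf_mask
# above: global_tf_threshold_mask=False drops below-threshold tokens (the output
# shrinks), while global_tf_threshold_mask=True preserves sequence length and
# substitutes GLOBAL_TF_THRESHOLD_MASK_TOKEN for each below-threshold token.

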
def test_tagged_frame_to_tokens_with_global_tf_threshold(tagged_frame: pd.DataFrame):
    tagged_frame: pd.DataFrame = tagged_frame.copy()

    expected_counts: dict = {
        '.': 3,
        'bakom': 1,
        'den': 1,
        'fladdra_omkring': 1,
        'gapa': 1,
        'halvmörker': 1,
        'i': 2,
        'ingen': 1,
        'inne': 1,
        'kyrka': 1,
        'ljuslåga': 1,
        'någon': 1,
        'och': 1,
        'romansk': 1,
        'tränga': 1,
        'turist': 1,
        'valv': 2,
        'väldig': 1,
        'överblick': 1,
    }

    extract_opts = ExtractTaggedTokensOpts(lemmatize=True, pos_includes=None, pos_excludes=None, **SPARV_TAGGED_COLUMNS)
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert set(expected_counts.keys()) == set(tokens)

    """The TF threshold resets to 1 if no token2id is supplied (token2id.tf is needed to apply it)"""
    extract_opts.global_tf_threshold = 2
    tokens = tagged_frame_to_tokens(tagged_frame, token2id=None, extract_opts=extract_opts)
    assert extract_opts.global_tf_threshold == 1

    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(tagged_frame.baseform)

    extract_opts.global_tf_threshold = 2
    extract_opts.global_tf_threshold_mask = False
    tokens = tagged_frame_to_tokens(tagged_frame, token2id=token2id, extract_opts=extract_opts)
    assert tokens == ['i', 'i', '.', 'valv', 'valv', '.', '.']

    extract_opts.global_tf_threshold = 2
    extract_opts.global_tf_threshold_mask = True
    tokens = tagged_frame_to_tokens(tagged_frame, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == len(tagged_frame)
    assert set(tokens) == set([GLOBAL_TF_THRESHOLD_MASK_TOKEN, 'i', 'i', '.', 'valv', 'valv', '.', '.'])

    extract_opts.global_tf_threshold = 2
    extract_opts.global_tf_threshold_mask = True
    extract_opts.passthrough_tokens = {'överblick'}
    tokens = tagged_frame_to_tokens(tagged_frame, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == len(tagged_frame)
    assert set(tokens) == set([GLOBAL_TF_THRESHOLD_MASK_TOKEN, 'i', 'i', '.', 'valv', 'valv', '.', '.', 'överblick'])


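# The expected_counts table above can be derived from the fixture rather than
# hand-typed; a minimal, hedged helper sketch (not part of the library), assuming
# the fixture's `baseform` column already holds one plain lemma per row, as the
# ingest() calls in these tests do:
def _observed_baseform_counts(frame: pd.DataFrame) -> dict:
    """Tally the baseform column, e.g. to cross-check expected_counts."""
    return frame.baseform.value_counts().to_dict()

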
def test_to_tagged_frame_SUC_pos_with_phrase_detection():
    os.makedirs('./tests/output', exist_ok=True)

    data_str: str = """token	pos	baseform
Herr	NN	|herr|
talman	NN	|talman|
!	MAD	|
Jag	PN	|jag|
ber	VB	|be|
få	VB	|få|
hemställa	VB	|hemställa|
,	MID	|
att	IE	|att|
kammaren	NN	|kammare|
måtte	VB	|må|
besluta	VB	|besluta|
att	IE	|att|
välja	VB	|välja|
suppleanter	NN	|suppleant|
i	PL	|
de	PN	|de|
ständiga	JJ	|ständig|
utskotten	NN	|utskott|
.	MAD	|
"""
    tagged_frame: pd.DataFrame = pd.read_csv(StringIO(data_str), sep='\t', index_col=None)

    phrases = {'herr_talman': 'Herr talman'.split()}
    phrased_tokens = tagged_frame_to_tokens(
        tagged_frame,
        extract_opts=ExtractTaggedTokensOpts(lemmatize=False, phrases=phrases, **SPARV_TAGGED_COLUMNS),
    )

    assert phrased_tokens[:9] == ['herr_talman', '!', 'Jag', 'ber', 'få', 'hemställa', ',', 'att', 'kammaren']


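# A minimal, hedged sketch of the phrase-merge rule the two phrase tests rely on
# (dict form as above: merged-token name -> list of constituent tokens). This is
# an illustration of the behaviour the assertions pin down, not penelope's
# implementation:
def _merge_phrases(tokens: List[str], phrases: dict) -> List[str]:
    """Replace each consecutive run matching a phrase with the phrase's name."""
    result: List[str] = []
    i = 0
    while i < len(tokens):
        for name, phrase in phrases.items():
            if tokens[i : i + len(phrase)] == phrase:
                result.append(name)
                i += len(phrase)
                break
        else:
            result.append(tokens[i])
            i += 1
    return result


# e.g. _merge_phrases(['Herr', 'talman', '!'], {'herr_talman': ['Herr', 'talman']})
# yields ['herr_talman', '!'].

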
def transform_frame(tagged_frame: pd.DataFrame, transform_opts: TokensTransformOpts) -> List[str]:
    tokens = tagged_frame_to_tokens(
        tagged_frame,
        extract_opts=ExtractTaggedTokensOpts(lemmatize=False, **SPARV_TAGGED_COLUMNS),
        transform_opts=transform_opts,
    )
    return tokens


def test_tagged_frame_to_tokens_detect_phrases(tagged_frame: pd.DataFrame):
    expected_tokens = tagged_frame.baseform[:4].tolist() + ['romansk_kyrka'] + tagged_frame.baseform[6:].tolist()

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True, pos_includes=None, phrases=[["romansk", "kyrka"]], **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == expected_tokens


def test_tagged_frame_to_tokens_with_append_pos_true(tagged_frame: pd.DataFrame):
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=False, pos_includes='VB', pos_excludes=None, append_pos=True, **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == ['trängdes@VB', 'gapade@VB', 'fladdrade@VB']

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes="JJ",
        pos_excludes='MID|MAD|PAD',
        pos_paddings="VB|NN",
        append_pos=True,
        **SPARV_TAGGED_COLUMNS,
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == ['väldig@JJ', 'romansk@JJ', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*']


def test_tagged_frame_to_tokens_with_passthrough(tagged_frame: pd.DataFrame):
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=False,
        pos_includes='VB',
        pos_excludes=None,
        passthrough_tokens=['kyrkan', 'ljuslågor'],
        **SPARV_TAGGED_COLUMNS,
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == ['kyrkan', 'trängdes', 'gapade', 'ljuslågor', 'fladdrade']


def test_tagged_frame_to_tokens_replace_pos(tagged_frame: pd.DataFrame):
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True, pos_includes="NN", pos_excludes='MID|MAD|PAD', pos_paddings="VB", **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == ["kyrka", "*", "turist", "halvmörker", "valv", "*", "valv", "överblick", "ljuslåga", "*"]

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True, pos_includes=None, pos_excludes='MID|MAD|PAD', pos_paddings="VB|NN", **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == (
        ['inne', 'i', 'den', 'väldig', 'romansk', '*', '*', '*', 'i', '*', '*', '*']
        + ['bakom', '*', 'och', 'ingen', '*', 'någon', '*', '*']
    )

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True, pos_includes="JJ", pos_excludes='MID|MAD|PAD', pos_paddings="VB|NN", **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == ['väldig', 'romansk', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*']


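# The padding rule the replace-pos tests above exercise, in miniature: tokens whose
# POS is in pos_includes keep their form, tokens whose POS is in pos_paddings are
# kept positionally but replaced by the '*' pad symbol, and all other tokens are
# dropped. A hedged, standalone sketch (not penelope's implementation):
def _pad_by_pos(tokens: List[str], pos_tags: List[str], includes: set, paddings: set, pad: str = '*') -> List[str]:
    return [
        token if pos in includes else pad
        for token, pos in zip(tokens, pos_tags)
        if pos in includes or pos in paddings
    ]


# e.g. _pad_by_pos(['väldig', 'kyrka', 'tränga'], ['JJ', 'NN', 'VB'], {'JJ'}, {'NN', 'VB'})
# yields ['väldig', '*', '*'].

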
def test_tagged_frame_to_tokens_pos_and_lemma(tagged_frame: pd.DataFrame):
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=False, pos_includes=None, pos_excludes=None, **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == tagged_frame.token.tolist()

    extract_opts = ExtractTaggedTokensOpts(lemmatize=True, pos_includes=None, pos_excludes=None, **SPARV_TAGGED_COLUMNS)
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == tagged_frame.apply(lambda x: to_lemma_form(x['token'], x['baseform']), axis=1).tolist()

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=False, pos_includes='VB', pos_excludes=None, **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == ['trängdes', 'gapade', 'fladdrade']

    extract_opts = ExtractTaggedTokensOpts(lemmatize=True, pos_includes='VB', pos_excludes=None, **SPARV_TAGGED_COLUMNS)
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == ['tränga', 'gapa', 'fladdra_omkring']

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True, pos_includes='|VB|', pos_excludes=None, **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == tagged_frame[tagged_frame.pos.isin(['VB'])].baseform.tolist()

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True, pos_includes='|VB|NN|', pos_excludes=None, **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == tagged_frame[tagged_frame.pos.isin(['VB', 'NN'])].baseform.tolist()

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True, pos_includes=None, pos_excludes='MID|MAD|PAD', **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == tagged_frame[~tagged_frame.pos.isin(['MID', 'MAD'])].baseform.tolist()