Example #1
def test_tagged_frame_to_tokens_with_tf_threshold_and_not_threshold_tf_mask(tagged_frame: pd.DataFrame):

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes=None,
        pos_excludes=None,
        global_tf_threshold=2,
        global_tf_threshold_mask=False,
        **SPARV_TAGGED_COLUMNS,
    )
    """ Alternative #1: tagged_frame_to_tokens does the filtering """

    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(tagged_frame.baseform)
    expected_count = len(
        tagged_frame[
            tagged_frame.baseform.apply(lambda x: token2id.tf[token2id[x]] >= extract_opts.global_tf_threshold)
        ]
    )

    df: pd.DataFrame = tagged_frame.copy()
    tokens = tagged_frame_to_tokens(df, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == expected_count
    assert set(tokens) == {'i', '.', 'valv'}

    """ Alternative #2: Use token2id to mask low TF tokens"""
    df: pd.DataFrame = tagged_frame.copy()
    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(df.baseform)
    """Note that translation must be used to map token-ids if used elsewhere"""
    _, translation = token2id.compress(tf_threshold=2, inplace=True)  # pylint: disable=unused-variable
    token2id.close()
    tokens = tagged_frame_to_tokens(df, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == expected_count
    assert set(tokens) == {'i', '.', 'valv'}
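
# A minimal sketch (illustration only) of why the `translation` returned by
# compress() matters: assuming it maps old token-ids to the new, compressed
# ids, any id sequence produced before compress() must be remapped before it
# is used again. `old_ids` is a hypothetical input.
def remap_ids(old_ids, translation):
    """Drop ids removed by compress() and map survivors to their new ids."""
    return [translation[i] for i in old_ids if i in translation]
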
Example #2
def test_phrased_tagged_frame():

    os.makedirs('./tests/output', exist_ok=True)

    tagged_corpus_source: str = "./tests/test_data/tranströmer_corpus_export.sparv4.csv.zip"
    checkpoint_opts: checkpoint.CheckpointOpts = None
    data = checkpoint.load_archive(
        source_name=tagged_corpus_source, checkpoint_opts=checkpoint_opts, reader_opts=None
    )
    payload = next(data.create_stream())

    tokens = tagged_frame_to_tokens(
        payload.content,
        ExtractTaggedTokensOpts(lemmatize=False, **SPARV_TAGGED_COLUMNS),
    )
    assert tokens is not None
    phrases = {'United Nations': 'United_Nations', 'United': 'United'}
    phrased_tokens = tagged_frame_to_tokens(
        payload.content,
        ExtractTaggedTokensOpts(lemmatize=False, phrases=phrases, **SPARV_TAGGED_COLUMNS),
    )
    assert phrased_tokens[:9] == [
        'Constitution',
        'of',
        'the',
        'United_Nations',
        'Educational',
        ',',
        'Scientific',
        'and',
        'Cultural',
    ]
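
# A hedged sketch of the phrase-merging idea exercised above: consecutive
# tokens matching a known phrase are collapsed into one underscore-joined
# token. This illustrates the observed behaviour, not the library's code.
def merge_phrases(tokens, phrases):
    """phrases: sequences of tokens to merge, e.g. [['United', 'Nations']]."""
    result, i = [], 0
    while i < len(tokens):
        match = next((p for p in phrases if tokens[i : i + len(p)] == list(p)), None)
        if match:
            result.append('_'.join(match))
            i += len(match)
        else:
            result.append(tokens[i])
            i += 1
    return result

assert merge_phrases(['the', 'United', 'Nations', 'charter'], [['United', 'Nations']]) == [
    'the', 'United_Nations', 'charter'
]
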
Example #3
def test_tagged_frame_to_tokens_with_tf_threshold_and_threshold_tf_mask(tagged_frame: pd.DataFrame):

    extract_opts = ExtractTaggedTokensOpts(lemmatize=True, pos_includes=None, pos_excludes=None, **SPARV_TAGGED_COLUMNS)

    """ Alternative #1: tagged_frame_to_tokens does the filtering """

    df: pd.DataFrame = tagged_frame.copy()
    extract_opts.global_tf_threshold = 2
    extract_opts.global_tf_threshold_mask = True
    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(df.baseform)

    tokens = tagged_frame_to_tokens(df, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == len(df)
    assert set(tokens) == {GLOBAL_TF_THRESHOLD_MASK_TOKEN, 'i', '.', 'valv'}

    """ Alternative #2: Use token2id to mask low TF tokens"""
    df: pd.DataFrame = tagged_frame.copy()
    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(df.baseform)

    """Note that translation must be used to map token-ids if used elsewhere"""
    token2id.compress(tf_threshold=2, inplace=True)
    token2id.close()

    tokens = tagged_frame_to_tokens(df, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == len(df)
    assert set(tokens) == {GLOBAL_TF_THRESHOLD_MASK_TOKEN, 'i', '.', 'valv'}
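
# A minimal pure-Python sketch of the masking behaviour tested above: with
# global_tf_threshold_mask=True, low-frequency tokens are not dropped but
# replaced by the mask token, so the sequence length is preserved. The mask
# value used here is a placeholder, not the actual constant.
from collections import Counter

def mask_low_tf(tokens, threshold, mask='__mask__'):
    tf = Counter(tokens)
    return [t if tf[t] >= threshold else mask for t in tokens]

assert mask_low_tf(['a', 'b', 'a'], threshold=2) == ['a', '__mask__', 'a']
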
Example #4
def test_tagged_frame_to_tokens_with_global_tf_threshold(tagged_frame: pd.DataFrame):

    tagged_frame: pd.DataFrame = tagged_frame.copy()

    expected_counts: dict = {
        '.': 3,
        'bakom': 1,
        'den': 1,
        'fladdra_omkring': 1,
        'gapa': 1,
        'halvmörker': 1,
        'i': 2,
        'ingen': 1,
        'inne': 1,
        'kyrka': 1,
        'ljuslåga': 1,
        'någon': 1,
        'och': 1,
        'romansk': 1,
        'tränga': 1,
        'turist': 1,
        'valv': 2,
        'väldig': 1,
        'överblick': 1,
    }

    extract_opts = ExtractTaggedTokensOpts(lemmatize=True, pos_includes=None, pos_excludes=None, **SPARV_TAGGED_COLUMNS)
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert set(expected_counts.keys()) == set(tokens)

    """TF threshold resets to 1 if token2id not supplied (i.e. token2id.TF is needed)"""
    extract_opts.global_tf_threshold = 2
    tokens = tagged_frame_to_tokens(tagged_frame, token2id=None, extract_opts=extract_opts)
    assert extract_opts.global_tf_threshold == 1

    token2id: Token2Id = Token2Id().ingest(["*", GLOBAL_TF_THRESHOLD_MASK_TOKEN]).ingest(tagged_frame.baseform)

    extract_opts.global_tf_threshold = 2
    extract_opts.global_tf_threshold_mask = False
    tokens = tagged_frame_to_tokens(tagged_frame, token2id=token2id, extract_opts=extract_opts)
    assert tokens == ['i', 'i', '.', 'valv', 'valv', '.', '.']

    extract_opts.global_tf_threshold = 2
    extract_opts.global_tf_threshold_mask = True
    tokens = tagged_frame_to_tokens(tagged_frame, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == len(tagged_frame)
    assert set(tokens) == {GLOBAL_TF_THRESHOLD_MASK_TOKEN, 'i', '.', 'valv'}

    extract_opts.global_tf_threshold = 2
    extract_opts.global_tf_threshold_mask = True
    extract_opts.passthrough_tokens = {'överblick'}
    tokens = tagged_frame_to_tokens(tagged_frame, token2id=token2id, extract_opts=extract_opts)
    assert len(tokens) == len(tagged_frame)
    assert set(tokens) == {GLOBAL_TF_THRESHOLD_MASK_TOKEN, 'i', '.', 'valv', 'överblick'}
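
# A sketch of how passthrough_tokens appears to interact with the threshold
# above: a passthrough token is kept verbatim even when its frequency falls
# below the threshold ('överblick' occurs once, yet survives). Illustration
# only, under the same placeholder mask assumption as before.
from collections import Counter

def mask_low_tf_with_passthrough(tokens, threshold, passthrough, mask='__mask__'):
    tf = Counter(tokens)
    return [t if tf[t] >= threshold or t in passthrough else mask for t in tokens]

assert mask_low_tf_with_passthrough(['a', 'b', 'a'], 2, passthrough={'b'}) == ['a', 'b', 'a']
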
Example #5
def test_to_tagged_frame_SUC_pos_with_phrase_detection():

    os.makedirs('./tests/output', exist_ok=True)
    data_str: str = """token	pos	baseform
Herr	NN	|herr|
talman	NN	|talman|
!	MAD	|
Jag	PN	|jag|
ber	VB	|be|
få	VB	|få|
hemställa	VB	|hemställa|
,	MID	|
att	IE	|att|
kammaren	NN	|kammare|
måtte	VB	|må|
besluta	VB	|besluta|
att	IE	|att|
välja	VB	|välja|
suppleanter	NN	|suppleant|
i	PL	|
de	PN	|de|
ständiga	JJ	|ständig|
utskotten	NN	|utskott|
.	MAD	|
"""

    tagged_frame: pd.DataFrame = pd.read_csv(StringIO(data_str), sep='\t', index_col=None)

    phrases = {'herr_talman': 'Herr talman'.split()}
    phrased_tokens = tagged_frame_to_tokens(
        tagged_frame,
        extract_opts=ExtractTaggedTokensOpts(lemmatize=False, phrases=phrases, **SPARV_TAGGED_COLUMNS),
    )
    assert phrased_tokens[:9] == ['herr_talman', '!', 'Jag', 'ber', 'få', 'hemställa', ',', 'att', 'kammaren']
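
# The Sparv baseform column stores pipe-delimited lemma candidates ('|herr|',
# or a bare '|' when no lemma is known). A hedged sketch of the fallback this
# implies, presumably what to_lemma_form does in the other examples:
def lemma_or_token(token, baseform):
    candidates = [x for x in baseform.split('|') if x]
    return candidates[0] if candidates else token

assert lemma_or_token('ber', '|be|') == 'be'
assert lemma_or_token('!', '|') == '!'
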
Example #6
def transform_frame(tagged_frame: pd.DataFrame, transform_opts: TokensTransformOpts) -> List[str]:

    tokens = tagged_frame_to_tokens(
        tagged_frame,
        extract_opts=ExtractTaggedTokensOpts(lemmatize=False, **SPARV_TAGGED_COLUMNS),
        transform_opts=transform_opts,
    )
    return tokens
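
# A minimal sketch of the transform step that transform_opts drives: each
# configured transform is applied to the extracted token stream. The options
# shown here (lowercasing, minimum length) are illustrative assumptions, not
# the actual TokensTransformOpts fields.
def apply_transforms(tokens, to_lower=False, min_len=1):
    if to_lower:
        tokens = [t.lower() for t in tokens]
    return [t for t in tokens if len(t) >= min_len]

assert apply_transforms(['Herr', 'talman', '!'], to_lower=True, min_len=2) == ['herr', 'talman']
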
Example #7
def test_tagged_frame_to_tokens_detect_phrases(tagged_frame: pd.DataFrame):

    expected_tokens = tagged_frame.baseform[:4].tolist() + ['romansk_kyrka'] + tagged_frame.baseform[6:].tolist()
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True, pos_includes=None, phrases=[["romansk", "kyrka"]], **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == expected_tokens
Example #8
def test_tagged_frame_to_tokens_with_append_pos_true(tagged_frame: pd.DataFrame):

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=False, pos_includes='VB', pos_excludes=None, append_pos=True, **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == ['trängdes@VB', 'gapade@VB', 'fladdrade@VB']

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes="JJ",
        pos_excludes='MID|MAD|PAD',
        pos_paddings="VB|NN",
        append_pos=True,
        **SPARV_TAGGED_COLUMNS,
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == ['väldig@JJ', 'romansk@JJ', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*']
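
# A sketch of the append_pos output format seen above: each extracted token
# is suffixed with '@' plus its POS tag, while padded positions keep the pad
# marker '*'. Illustration of the observed format, not the library code.
def with_pos(token, pos, pad='*'):
    return token if token == pad else f'{token}@{pos}'

assert with_pos('gapade', 'VB') == 'gapade@VB'
assert with_pos('*', 'VB') == '*'
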
Example #9
def test_tagged_frame_to_tokens_with_passthrough(tagged_frame: pd.DataFrame):

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=False,
        pos_includes='VB',
        pos_excludes=None,
        passthrough_tokens=['kyrkan', 'ljuslågor'],
        **SPARV_TAGGED_COLUMNS,
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == ['kyrkan', 'trängdes', 'gapade', 'ljuslågor', 'fladdrade']
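
# A sketch of the passthrough semantics tested above: passthrough tokens are
# kept even though their POS ('NN') falls outside pos_includes='VB'. A
# minimal illustration over (token, pos) pairs:
def filter_by_pos(pairs, includes, passthrough=()):
    return [t for t, pos in pairs if pos in includes or t in passthrough]

pairs = [('kyrkan', 'NN'), ('trängdes', 'VB'), ('ljuslågor', 'NN')]
assert filter_by_pos(pairs, includes={'VB'}, passthrough={'kyrkan', 'ljuslågor'}) == [
    'kyrkan', 'trängdes', 'ljuslågor'
]
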
Example #10
def test_tagged_frame_to_tokens_replace_pos(tagged_frame: pd.DataFrame):

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True, pos_includes="NN", pos_excludes='MID|MAD|PAD', pos_paddings="VB", **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == ["kyrka", "*", "turist", "halvmörker", "valv", "*", "valv", "överblick", "ljuslåga", "*"]

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True, pos_includes=None, pos_excludes='MID|MAD|PAD', pos_paddings="VB|NN", **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == (
        ['inne', 'i', 'den', 'väldig', 'romansk', '*', '*', '*', 'i', '*', '*', '*']
        + ['bakom', '*', 'och', 'ingen', '*', 'någon', '*', '*']
    )

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True, pos_includes="JJ", pos_excludes='MID|MAD|PAD', pos_paddings="VB|NN", **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == ['väldig', 'romansk', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*']
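
# A sketch of how pos_includes, pos_paddings and pos_excludes appear to
# interact in the assertions above: excluded POS are dropped, padded POS are
# replaced by '*', and (when pos_includes is set) only included POS keep
# their token. Illustration only:
def extract_by_pos(pairs, includes, paddings=(), excludes=(), pad='*'):
    out = []
    for token, pos in pairs:
        if pos in excludes:
            continue
        if pos in paddings:
            out.append(pad)
        elif includes is None or pos in includes:
            out.append(token)
    return out

pairs = [('väldig', 'JJ'), ('kyrka', 'NN'), ('.', 'MAD')]
assert extract_by_pos(pairs, includes={'JJ'}, paddings={'NN'}, excludes={'MAD'}) == ['väldig', '*']
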
Example #11
def test_tagged_frame_to_tokens_pos_and_lemma(tagged_frame: pd.DataFrame):

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=False, pos_includes=None, pos_excludes=None, **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == tagged_frame.token.tolist()

    extract_opts = ExtractTaggedTokensOpts(lemmatize=True, pos_includes=None, pos_excludes=None, **SPARV_TAGGED_COLUMNS)
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == tagged_frame.apply(lambda x: to_lemma_form(x['token'], x['baseform']), axis=1).tolist()

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=False, pos_includes='VB', pos_excludes=None, **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == ['trängdes', 'gapade', 'fladdrade']

    extract_opts = ExtractTaggedTokensOpts(lemmatize=True, pos_includes='VB', pos_excludes=None, **SPARV_TAGGED_COLUMNS)
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == ['tränga', 'gapa', 'fladdra_omkring']

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True, pos_includes='|VB|', pos_excludes=None, **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == tagged_frame[tagged_frame.pos.isin(['VB'])].baseform.tolist()

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True, pos_includes='|VB|NN|', pos_excludes=None, **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == tagged_frame[tagged_frame.pos.isin(['VB', 'NN'])].baseform.tolist()

    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True, pos_includes=None, pos_excludes='MID|MAD|PAD', **SPARV_TAGGED_COLUMNS
    )
    tokens = tagged_frame_to_tokens(tagged_frame, extract_opts=extract_opts)
    assert tokens == tagged_frame[~tagged_frame.pos.isin(['MID', 'MAD'])].baseform.tolist()

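# The assertions above show that pos_includes accepts both plain ('VB') and
# pipe-delimited ('|VB|', '|VB|NN|') notation with identical results. A
# sketch of the normalisation this implies (illustration only):
def parse_pos_spec(spec):
    return None if not spec else {p for p in spec.split('|') if p}

assert parse_pos_spec('VB') == {'VB'}
assert parse_pos_spec('|VB|NN|') == {'VB', 'NN'}
assert parse_pos_spec(None) is None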