Example #1
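All five examples assume the test module's shared imports, which the excerpts omit. A minimal sketch (the utils and tokenizer module paths are assumptions; adjust them to your project layout):

import pandas as pd
import pandas.testing as pdt
from pyspark.sql import functions

from myproject.spark import utils             # assumed path; provides make_session
from myproject.spark import tokenizer as tkn  # assumed path; provides Tokenizer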
def test_tokenizer_no_key():
    sess = utils.make_session("test.tokenizer.noKey")
    test_df = pd.DataFrame({"name": ["Alice", "Bob"]})
    # No key supplied: constructing and applying the Tokenizer must still succeed.
    _apply_tokenizer(
        sess,
        test_df,
        tkn.Tokenizer(max_token_len=None, key=None),
        col_to_rename="to_token(name)",
    )
Example #2
def test_tokenizer_is_linkable():
    sess = utils.make_session("test.tokenizer.isLinkable")
    test_df = pd.DataFrame({"name": ["Alice", "Bob"]})
    key1 = "secret_key"
    key2 = "secret_key"
    df1 = _apply_tokenizer(
        sess,
        test_df,
        tkn.Tokenizer(max_token_len=None, key=key1),
        col_to_rename="to_token(name)",
    )
    df2 = _apply_tokenizer(
        sess,
        test_df,
        tkn.Tokenizer(max_token_len=None, key=key2),
        col_to_rename="to_token(name)",
    )
    pdt.assert_frame_equal(df1, df2)
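Determinism under a shared key is exactly what makes tokens linkable: df1 and df2 hold identical tokens, so independently tokenized datasets can be joined on the token column. A hypothetical continuation of the test above:

# df1 and df2 are the pandas frames produced by the test above.
linked = df1.merge(df2, on="name")
assert len(linked) == len(df1)  # every token in df1 finds its match in df2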
Example #3
def test_tokenizer_with_max_token_len():
    sess = utils.make_session("test.tokenizer.maxTokenLen")
    test_df = pd.DataFrame({"name": ["Alice", "Bob"]})
    expected = pd.DataFrame({"name": ["70a4b1a987", "dd4532a296"]})
    max_token_len = 10
    key = "secret_key"
    df = _apply_tokenizer(
        sess,
        test_df,
        tkn.Tokenizer(max_token_len=max_token_len, key=key),
        col_to_rename="to_token(name)",
    )
    pdt.assert_frame_equal(df, expected)
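These 10-character tokens are the first 10 characters of the 64-character digests in Example #5, so max_token_len is a plain truncation of the full keyed digest. A minimal sketch of that behavior (the hypothetical _toy_tokenize is illustrative only, not the library's actual implementation):

import hmac
from hashlib import sha256

def _toy_tokenize(value, key, max_token_len=None):
    # A keyed digest is deterministic: the same (value, key) pair
    # always yields the same token.
    token = hmac.new(key.encode(), value.encode(), sha256).hexdigest()
    # max_token_len simply truncates the hex digest.
    return token if max_token_len is None else token[:max_token_len]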
Example #4
def test_tokenizer_is_not_linkable():
    sess = utils.make_session("test.tokenizer.isNotLinkable")
    test_df = pd.DataFrame({"name": ["Alice", "Bob"]})
    key1 = "secret_key"
    key2 = "not_your_secret_key"
    df1 = _apply_tokenizer(
        sess,
        test_df,
        tkn.Tokenizer(max_token_len=None, key=key1),
        col_to_rename="to_token(name)",
    )
    df2 = _apply_tokenizer(
        sess,
        test_df,
        tkn.Tokenizer(max_token_len=None, key=key2),
        col_to_rename="to_token(name)",
    )
    # assert_frame_equal must raise: different keys must produce different tokens.
    try:
        pdt.assert_frame_equal(df1, df2)
    except AssertionError:
        pass
    else:
        raise AssertionError("tokens produced with different keys should not match")
Example #5
def test_tokenizer_simple():
    sess = utils.make_session("test.tokenizer.simple")
    test_df = pd.DataFrame({"name": ["Alice", "Bob"]})
    expected = pd.DataFrame({
        "name": [
            "70a4b1a987767abf36463cd3e3f2b37144132e572fbb9b39f28bcaafe10d9b24",
            "dd4532a296deb4f114b1e7e88faefe4fb2b32c559ac15a8c6bcbdbcbc2aa4d4b",
        ]
    })
    key = "secret_key"
    df = _apply_tokenizer(
        sess,
        test_df,
        tkn.Tokenizer(max_token_len=None, key=key),
        col_to_rename="to_token(name)",
    )
    pdt.assert_frame_equal(df, expected)
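The full tokens are 64 hex characters, i.e. a 256-bit digest, and Example #3's output is their 10-character prefix. A quick self-check against the constants above:

full = "70a4b1a987767abf36463cd3e3f2b37144132e572fbb9b39f28bcaafe10d9b24"
assert full.startswith("70a4b1a987")  # Example #3's truncated token for "Alice"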
def _apply_tokenizer(sess, df, tokenizer, col_to_rename):
    # Lift the pandas frame into Spark, tokenize the "name" column, and
    # return the result as pandas for easy comparison with the expected frames.
    spark_df = sess.createDataFrame(df, schema=["name"])
    result_df = spark_df.select(tokenizer(functions.col("name")))
    return result_df.withColumnRenamed(col_to_rename, "name").toPandas()
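utils.make_session is not shown in the excerpt; a minimal sketch, assuming it only needs to build a local Spark session named after the test:

from pyspark.sql import SparkSession

def make_session(name):
    # Assumed helper: a single-threaded local session is plenty for these tiny frames.
    return SparkSession.builder.master("local[1]").appName(name).getOrCreate()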