def test_reversible_tokenizer():
    # Tokenize with the reversible tokenizer, then reverse with the same key
    # and check that the original plaintext is recovered.
    sess = utils.make_session("test.tokenizer.reversibleTokenizer")
    key = b"5" * 32
    plaintext = pd.DataFrame({"name": ["Alice", "Bob"]})
    tokenized = _apply_tokenizer(
        sess,
        plaintext,
        tkn.ReversibleTokenizer(key=key),
        col_to_rename="to_token(name)",
    )
    tokenized_expected = pd.DataFrame(
        {
            "name": [
                "c8c7e80144304276183e5bcd589db782bc5ff95309",
                "e0f40aea0d5c21b35967c4231b98b5b3e5338e",
            ]
        }
    )
    pdt.assert_frame_equal(tokenized, tokenized_expected)

    recovered = _apply_tokenizer(
        sess,
        tokenized,
        tkn.TokenReverser(key=key),
        col_to_rename="from_token(name)",
    )
    pdt.assert_frame_equal(recovered, plaintext)


def test_tokenizer_with_max_token_len():
    sess = utils.make_session("test.tokenizer.maxTokenLen")
    test_df = pd.DataFrame({"name": ["Alice", "Bob"]})
    expected = pd.DataFrame({"name": ["70a4b1a987", "dd4532a296"]})
    max_token_len = 10
    key = "secret_key"
    df = _make_and_apply_tokenizer(sess, test_df, max_token_len=max_token_len, key=key)
    pdt.assert_frame_equal(df, expected)


def test_tokenizer_is_linkable():
    # Tokenizing the same data with the same key must produce identical tokens,
    # so the results remain linkable.
    sess = utils.make_session("test.tokenizer.isLinkable")
    test_df = pd.DataFrame({"name": ["Alice", "Bob"]})
    key1 = "secret_key"
    key2 = "secret_key"
    df1 = _make_and_apply_tokenizer(sess, test_df, max_token_len=None, key=key1)
    df2 = _make_and_apply_tokenizer(sess, test_df, max_token_len=None, key=key2)
    pdt.assert_frame_equal(df1, df2)


def test_truncate_date():
    sess = utils.make_session("test.truncation.date")
    test_df, expected = _make_date_data(sess)
    truncate = rnd.DateTruncation("month")
    result_df = test_df.select(truncate(test_df.data)).toPandas()
    result = result_df.values
    assert result.dtype == expected.dtype
    np.testing.assert_equal(result, expected)


def test_rounding_long():
    precision = -2
    sess = utils.make_session("test.rounding.integer")
    test_df, expected = _make_integer_data(np.int64, precision)
    result_df = _make_and_apply_rounder(sess, test_df, dtypes.Long, precision)
    result = result_df.values
    assert result.dtype == expected.dtype
    np.testing.assert_almost_equal(result, expected)


def test_rounding_float():
    precision = 0
    sess = utils.make_session("test.rounding.float")
    test_df, expected = _make_float_data(np.float32, precision)
    result_df = _make_and_apply_rounder(sess, test_df, dtypes.Float, precision)
    result = result_df.values
    assert result.dtype == expected.dtype
    np.testing.assert_almost_equal(result, expected)


def test_column_redact():
    sess = utils.make_session("test.redaction.column")
    df = pd.DataFrame(np.ones((5, 3)), columns=["a", "b", "c"])
    expected = pd.DataFrame(np.ones((5,)), columns=["a"])
    test_df = sess.createDataFrame(df, schema=["a", "b", "c"])
    redact = rdc.ColumnRedact(["b", "c"])
    result = redact(test_df).toPandas()
    pdt.assert_frame_equal(result, expected)


def test_tokenizer_no_key():
    sess = utils.make_session("test.tokenizer.noKey")
    test_df = pd.DataFrame({"name": ["Alice", "Bob"]})
    # No key supplied: only check that tokenization runs without error, since
    # there are no fixed expected tokens to compare against.
    _apply_tokenizer(
        sess,
        test_df,
        tkn.Tokenizer(max_token_len=None, key=None),
        col_to_rename="to_token(name)",
    )


def test_row_redact():
    sess = utils.make_session("test.redaction.row")
    df = pd.DataFrame(np.ones((5, 2)), columns=["a", "b"])
    # Use .loc instead of chained assignment so the writes are applied reliably.
    df.loc[0, "a"] = 6
    df.loc[3, "a"] = 6
    # Rows where a > 5 are redacted, leaving three rows of ones.
    expected = pd.DataFrame(np.ones((3, 2)), columns=["a", "b"])
    test_df = sess.createDataFrame(df, schema=["a", "b"])
    redact = rdc.RowRedact("a > 5")
    result = redact(test_df).toPandas()
    pdt.assert_frame_equal(result, expected)


def test_tokenizer_is_not_linkable():
    # Tokenizing the same data with different keys must produce different
    # tokens, so the two frames are expected to differ.
    sess = utils.make_session("test.tokenizer.isNotLinkable")
    test_df = pd.DataFrame({"name": ["Alice", "Bob"]})
    key1 = "secret_key"
    key2 = "not_your_secret_key"
    df1 = _make_and_apply_tokenizer(sess, test_df, max_token_len=None, key=key1)
    df2 = _make_and_apply_tokenizer(sess, test_df, max_token_len=None, key=key2)
    try:
        pdt.assert_frame_equal(df1, df2)
        # If the frames compare equal, the tokenizer is linkable: fail the test.
        raise NotImplementedError
    except AssertionError:
        pass
    except NotImplementedError:
        raise AssertionError


def test_tokenizer_simple():
    sess = utils.make_session("test.tokenizer.simple")
    test_df = pd.DataFrame({"name": ["Alice", "Bob"]})
    expected = pd.DataFrame(
        {
            "name": [
                "70a4b1a987767abf36463cd3e3f2b37144132e572fbb9b39f28bcaafe10d9b24",
                "dd4532a296deb4f114b1e7e88faefe4fb2b32c559ac15a8c6bcbdbcbc2aa4d4b",
            ]
        }
    )
    key = "secret_key"
    df = _make_and_apply_tokenizer(sess, test_df, max_token_len=None, key=key)
    pdt.assert_frame_equal(df, expected)


def test_tokenizer_no_key_via_helper():
    # Renamed from test_tokenizer_no_key to avoid shadowing the test of the
    # same name above; exercises the no-key path through the helper instead.
    sess = utils.make_session("test.tokenizer.noKey")
    test_df = pd.DataFrame({"name": ["Alice", "Bob"]})
    _make_and_apply_tokenizer(sess, test_df, max_token_len=None, key=None)


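# Illustrative sketch, not part of the test suite: the helpers used above
# (_apply_tokenizer, _make_and_apply_tokenizer) are defined elsewhere in this
# module. Based only on the call patterns visible in these tests, applying a
# tokenizer directly to a Spark column looks like the function below; the name
# _example_tokenize_names is hypothetical.
def _example_tokenize_names(sess, pandas_df, key):
    # Build a Spark DataFrame, apply the tokenizer to a column inside select(),
    # and collect the result back to pandas (same shape as test_truncate_date).
    tokenizer = tkn.Tokenizer(max_token_len=None, key=key)
    spark_df = sess.createDataFrame(pandas_df, schema=["name"])
    return spark_df.select(tokenizer(spark_df.name)).toPandas()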