def test_apply_complex_policies_spark():
    """Applying a complex policy through the Spark backend yields the expected frame."""
    sess = spark_lib.utils.make_session("test.policy.applyComplexPolicies")
    source = pd.DataFrame(
        {
            "name": ["bob", "alice"],
            "val-int": [30, 50],
            "val-float": [32.43424, 56.64543],
            "date": [pd.Timestamp("2018-10-15"), pd.Timestamp("2016-09-10")],
        }
    )
    expected = pd.DataFrame(
        {
            "name": [
                "db6063546d5d6c1fd3826bc0a1d8188fa0dae1a174823eac1e8e063a073bf149",
                "4ae0639267ad49c658e8d266aa1caa51c876ed1d7ca788a0749d5189248295eb",
            ],
            "val-int": [25, 56],
            "val-float": [32.4, 56.6],
            # TODO: when these are pd.Timestamp, Spark's date_trunc is causing
            # dtype erasure. We should figure out why that's happening
            "date": [datetime.date(2018, 1, 1), datetime.date(2016, 1, 1)],
        }
    )
    spark_df = sess.createDataFrame(source)
    policy = data.Policy(**yaml.load(fixtures.complex_y, Loader=yaml.FullLoader))
    result = policy_lib.apply_policy(policy, spark_df).toPandas()
    pdt.assert_frame_equal(result, expected, check_dtype=True)
def test_apply_complex_policies_pandas():
    """Applying a complex policy to a pandas frame yields the expected frame."""
    policy_spec = yaml.load(fixtures.complex_y, Loader=yaml.FullLoader)
    source = pd.DataFrame(
        {
            "name": ["bob", "alice"],
            "val-int": [30, 50],
            "val-float": [32.43424, 56.64543],
            "date": [pd.Timestamp("2018-10-15"), pd.Timestamp("2016-09-10")],
        }
    )
    expected = pd.DataFrame(
        {
            "name": [
                "db6063546d5d6c1fd3826bc0a1d8188fa0dae1a174823eac1e8e063a073bf149",
                "4ae0639267ad49c658e8d266aa1caa51c876ed1d7ca788a0749d5189248295eb",
            ],
            "val-int": [23, 58],
            "val-float": [32.4, 56.6],
            "date": [pd.Timestamp("2018-01-01"), pd.Timestamp("2016-01-01")],
        }
    )
    policy = data.Policy(**policy_spec)
    result = policy_lib.apply_policy(policy, source)
    pdt.assert_frame_equal(result, expected)
def test_reverse_helper():
    """Reversing a reversible policy and re-applying it restores the original frame."""
    parsed = yaml.load(fixtures.reversible_yaml, Loader=yaml.FullLoader)
    policy = policy_lib.parse_policy(parsed)
    original = pd.DataFrame({"name": ["bob", "alice"]})
    tokenized = policy_lib.apply_policy(policy, original)
    reversed_policy = policy_lib.reverse(policy)
    restored = policy_lib.apply_policy(reversed_policy, tokenized)
    # Every transformation in the reversed policy should be a token reverser.
    for transform in reversed_policy.transformations:
        assert transform.type == pandas_lib.transformations.TokenReverser.identifier
    pdt.assert_frame_equal(original, restored)
def test_secret_in_named_transform():
    """A policy built from the secret fixture leaves the frame unchanged."""
    policy_spec = yaml.load(fixtures.secret_yaml, Loader=yaml.FullLoader)
    source = pd.DataFrame({"name": ["bob", "alice"]})
    policy = data.Policy(**policy_spec)
    result = policy_lib.apply_policy(policy, source)
    pdt.assert_frame_equal(result, source)
def test_apply_policy_pandas():
    """A policy referencing a registered custom transformation applies to a pandas frame."""
    pandas_lib.registry.register("plusN", test_utils.PlusN)
    try:
        d = yaml.load(fixtures.y, Loader=yaml.FullLoader)
        df = pd.DataFrame(np.ones(5), columns=["test"])
        # PlusN is configured by the fixture to add 3 to every value.
        expected_df = df + 3
        p = data.Policy(**d)
        new_df = policy_lib.apply_policy(p, df)
        pdt.assert_frame_equal(new_df, expected_df)
    finally:
        # Unregister even on failure so the custom transform does not leak into
        # other tests (matches the cleanup done in test_named_transformation_spark).
        del pandas_lib.registry._registry["plusN"]
def test_named_transformation_spark():
    """A registered named transformation is applied through the Spark backend."""
    sess = spark_lib.utils.make_session("test.policy.namedTransformations")
    pd_df = pd.DataFrame(np.ones(5), columns=["test"])
    # PlusN is configured by the fixture to add 3 to every value.
    expected_df = pd_df + 3
    df = sess.createDataFrame(pd_df)
    spark_lib.registry.register(test_utils.PlusN.identifier, test_utils.PlusN)
    try:
        d = yaml.load(fixtures.named_y, Loader=yaml.FullLoader)
        p = data.Policy(**d)
        new_df = policy_lib.apply_policy(p, df).toPandas()
        pdt.assert_frame_equal(new_df, expected_df)
    finally:
        # Unregister even if the assertion fails so later tests see a clean registry.
        del spark_lib.registry._registry[test_utils.PlusN.identifier]