def test_apply_complex_policies_spark():
    """Apply the ``complex_y`` fixture policy to a Spark DataFrame.

    A two-row pandas frame is converted to a Spark DataFrame, run through
    ``policy_lib.apply_policy`` and converted back to pandas; the result must
    match the hard-coded expected frame, dtypes included.
    """
    session = spark_lib.utils.make_session("test.policy.applyComplexPolicies")

    input_pdf = pd.DataFrame(
        {
            "name": ["bob", "alice"],
            "val-int": [30, 50],
            "val-float": [32.43424, 56.64543],
            "date": [pd.Timestamp("2018-10-15"), pd.Timestamp("2016-09-10")],
        }
    )

    expected_names = [
        "db6063546d5d6c1fd3826bc0a1d8188fa0dae1a174823eac1e8e063a073bf149",
        "4ae0639267ad49c658e8d266aa1caa51c876ed1d7ca788a0749d5189248295eb",
    ]
    # TODO: when these are pd.Timestamp, Spark's date_trunc is causing
    # dtype erasure. We should figure out why that's happening
    expected_dates = [datetime.date(2018, 1, 1), datetime.date(2016, 1, 1)]
    expected_pdf = pd.DataFrame(
        {
            "name": expected_names,
            "val-int": [25, 56],
            "val-float": [32.4, 56.6],
            "date": expected_dates,
        }
    )

    spark_df = session.createDataFrame(input_pdf)

    policy_spec = yaml.load(fixtures.complex_y, Loader=yaml.FullLoader)
    policy = data.Policy(**policy_spec)

    transformed = policy_lib.apply_policy(policy, spark_df)
    result_pdf = transformed.toPandas()

    pdt.assert_frame_equal(result_pdf, expected_pdf, check_dtype=True)
def parse_policy(
    p: Union[str, Dict[Any, Any]], logger: AuditLogger = AuditLogger()
) -> data.Policy:
    """Parses a policy YAML file.

    The passed in value can either be a path to a local file, a URL pointing
    to a file or a dictionary representing the policy. If it is a URL then
    requests attempts to download it.

    Args:
        p: a path string, a URL string or a dictionary representing the
            policy.
        logger: forwarded to ``data.Policy``. The default instance is created
            once, when this function is defined, so every call that relies on
            the default shares that same object.

    Returns:
        The Policy object initialized by the YAML (or by the dictionary).

    Raises:
        requests.HTTPError: if the URL answers with a 4xx/5xx status.
    """
    # isinstance (rather than an exact type comparison) so that str
    # subclasses are still treated as a path/URL instead of a mapping.
    if not isinstance(p, str):
        return data.Policy(logger=logger, **p)

    if validators.url(p):
        response = requests.get(p)
        # Fail on HTTP errors instead of parsing an error page as a policy.
        response.raise_for_status()
        yaml_data = response.text
    else:
        with open(p) as f:
            yaml_data = f.read()

    # NOTE(review): yaml.FullLoader is applied to content that may have been
    # downloaded from an arbitrary URL; consider yaml.safe_load if policies
    # never need FullLoader-only tags.
    policy = yaml.load(yaml_data, Loader=yaml.FullLoader)

    return data.Policy(logger=logger, **policy)
def test_apply_complex_policies_pandas():
    """Apply the ``complex_y`` fixture policy to a pandas DataFrame.

    The frame returned by ``policy_lib.apply_policy`` must equal the
    hard-coded expected frame.
    """
    policy_spec = yaml.load(fixtures.complex_y, Loader=yaml.FullLoader)

    input_df = pd.DataFrame(
        {
            "name": ["bob", "alice"],
            "val-int": [30, 50],
            "val-float": [32.43424, 56.64543],
            "date": [pd.Timestamp("2018-10-15"), pd.Timestamp("2016-09-10")],
        }
    )

    expected_names = [
        "db6063546d5d6c1fd3826bc0a1d8188fa0dae1a174823eac1e8e063a073bf149",
        "4ae0639267ad49c658e8d266aa1caa51c876ed1d7ca788a0749d5189248295eb",
    ]
    expected_dates = [pd.Timestamp("2018-01-01"), pd.Timestamp("2016-01-01")]
    expected_df = pd.DataFrame(
        {
            "name": expected_names,
            "val-int": [23, 58],
            "val-float": [32.4, 56.6],
            "date": expected_dates,
        }
    )

    policy = data.Policy(**policy_spec)
    result_df = policy_lib.apply_policy(policy, input_df)

    pdt.assert_frame_equal(result_df, expected_df)
def test_secret_in_named_transform():
    """Apply the ``secret_yaml`` fixture policy to a one-column frame.

    The frame returned by ``policy_lib.apply_policy`` is compared against the
    input frame itself.
    """
    policy_spec = yaml.load(fixtures.secret_yaml, Loader=yaml.FullLoader)
    names_df = pd.DataFrame({"name": ["bob", "alice"]})

    policy = data.Policy(**policy_spec)
    result_df = policy_lib.apply_policy(policy, names_df)

    pdt.assert_frame_equal(result_df, names_df)
def test_named_transform_type_not_found():
    """Check the error raised when a named transform's type is not registered.

    ``policy_lib._get_transformation`` must raise ``NamedTransformNotFound``
    with the exact message asserted below.
    """
    raw_yaml = fixtures.named_not_found_y("plusOne", "plusOne", "plusM")
    policy_spec = yaml.load(raw_yaml, Loader=yaml.FullLoader)
    policy = data.Policy(**policy_spec)

    first_rule = policy.rules[0]
    transform = first_rule.transformations[0]

    with pytest.raises(exceptions.NamedTransformNotFound) as exc_info:
        policy_lib._get_transformation(
            policy, transform, pandas_lib.registry, pandas_lib.dtypes
        )

    # Checked after the ``with`` block: statements following the raising call
    # inside the block would never run.
    expected_message = "Could not find transform of type plusM in registry"
    assert str(exc_info.value) == expected_message
def test_apply_policy_pandas():
    """Apply the ``fixtures.y`` policy to a column of ones.

    ``test_utils.PlusN`` is registered under ``"plusN"`` in the pandas
    registry first; the policy output must equal the input plus 3.
    """
    pandas_lib.registry.register("plusN", test_utils.PlusN)

    policy_spec = yaml.load(fixtures.y, Loader=yaml.FullLoader)

    ones_df = pd.DataFrame(np.ones(5), columns=["test"])
    # Build the expectation before the policy runs on the input frame.
    expected_df = ones_df + 3

    policy = data.Policy(**policy_spec)
    result_df = policy_lib.apply_policy(policy, ones_df)

    pdt.assert_frame_equal(result_df, expected_df)
def test_named_transformation_spark():
    """Apply the ``named_y`` fixture policy to a Spark DataFrame of ones.

    ``test_utils.PlusN`` is registered in the Spark registry for the duration
    of the test; the policy output (converted back to pandas) must equal the
    input plus 3.
    """
    sess = spark_lib.utils.make_session("test.policy.namedTransformations")
    pd_df = pd.DataFrame(np.ones(5), columns=["test"])
    expected_df = pd_df + 3
    df = sess.createDataFrame(pd_df)

    spark_lib.registry.register(test_utils.PlusN.identifier, test_utils.PlusN)
    try:
        d = yaml.load(fixtures.named_y, Loader=yaml.FullLoader)
        p = data.Policy(**d)
        new_df = policy_lib.apply_policy(p, df).toPandas()

        pdt.assert_frame_equal(new_df, expected_df)
    finally:
        # Always unregister, even when the policy application or the
        # assertion fails, so the registration does not leak out of this test.
        del spark_lib.registry._registry[test_utils.PlusN.identifier]
def test_named_transform_not_found():
    """Check the error raised when a named transform is not declared.

    ``test_utils.PlusN`` is registered under ``"plusN"``, then
    ``policy_lib._get_transformation`` must raise ``NamedTransformNotFound``
    with the exact message asserted below.
    """
    pandas_lib.registry.register("plusN", test_utils.PlusN)

    raw_yaml = fixtures.named_not_found_y("plusOne", "plusOneThousand", "plusN")
    policy_spec = yaml.load(raw_yaml, Loader=yaml.FullLoader)

    ones_df = pd.DataFrame(np.ones(5), columns=["test"])
    policy = data.Policy(**policy_spec)
    transform = policy.rules[0].transformations[0]

    # NOTE(review): the DataFrame is passed as the third argument, where the
    # sibling "type not found" test passes pandas_lib.registry - confirm this
    # is intended.
    with pytest.raises(exceptions.NamedTransformNotFound) as exc_info:
        policy_lib._get_transformation(
            policy, transform, ones_df, pandas_lib.dtypes
        )

    expected_message = (
        "Could not find transform plusOneThousand in transformations block"
    )
    assert str(exc_info.value) == expected_message
def parse_policy(p: str):
    """Parses a policy yaml file.

    The passed in string can either be a path to a local file or a URL
    pointing to a file. If it is a URL then requests attempts to download it.

    Args:
        p: a path string or a URL string

    Returns:
        The Policy object initialized by the yaml.

    Raises:
        requests.HTTPError: if the URL answers with a 4xx/5xx status.
    """
    yaml_data: str

    if validators.url(p):
        response = requests.get(p)
        # Fail on HTTP errors instead of parsing an error page as a policy.
        response.raise_for_status()
        yaml_data = response.text
    else:
        with open(p) as f:
            yaml_data = f.read()

    # NOTE(review): yaml.FullLoader is applied to content that may have been
    # downloaded from an arbitrary URL; consider yaml.safe_load if policies
    # never need FullLoader-only tags.
    policy = yaml.load(yaml_data, Loader=yaml.FullLoader)
    return data.Policy(**policy)