示例#1
0
def test_apply_complex_policies_spark():
    """Apply the ``complex_y`` fixture policy to a Spark DataFrame.

    A two-row pandas frame is converted to a Spark DataFrame, the policy is
    applied, and the result (converted back to pandas) is compared against
    the expected frame, including dtypes.
    """
    session = spark_lib.utils.make_session("test.policy.applyComplexPolicies")

    source_columns = {
        "name": ["bob", "alice"],
        "val-int": [30, 50],
        "val-float": [32.43424, 56.64543],
        "date": [pd.Timestamp("2018-10-15"), pd.Timestamp("2016-09-10")],
    }
    spark_frame = session.createDataFrame(pd.DataFrame(source_columns))

    raw_policy = yaml.load(fixtures.complex_y, Loader=yaml.FullLoader)
    policy = data.Policy(**raw_policy)
    actual = policy_lib.apply_policy(policy, spark_frame).toPandas()

    expected_names = [
        "db6063546d5d6c1fd3826bc0a1d8188fa0dae1a174823eac1e8e063a073bf149",
        "4ae0639267ad49c658e8d266aa1caa51c876ed1d7ca788a0749d5189248295eb",
    ]
    # TODO: when these are pd.Timestamp, Spark's date_trunc is causing
    # dtype erasure. We should figure out why that's happening
    expected_dates = [datetime.date(2018, 1, 1), datetime.date(2016, 1, 1)]
    expected = pd.DataFrame({
        "name": expected_names,
        "val-int": [25, 56],
        "val-float": [32.4, 56.6],
        "date": expected_dates,
    })

    pdt.assert_frame_equal(actual, expected, check_dtype=True)
示例#2
0
def parse_policy(
    p: Union[str, Dict[Any, Any]], logger: AuditLogger = AuditLogger()
) -> data.Policy:
    """Parses a policy YAML file.

    The passed in value can either be a path to a local file,
    a URL pointing to a file or a dictionary representing the policy.
    If it is a URL then requests attempts to download it.

    Args:
        p: a path string, a URL string or a dictionary representing the
           policy.
        logger: forwarded to ``data.Policy`` as its ``logger`` keyword.
           The default instance is created once, when this function is
           defined, and is shared by every call that omits the argument.

    Returns:
        The Policy object initialized by the YAML (or by the dictionary).

    Raises:
        requests.HTTPError: if ``p`` is a URL and the server answers with
            an error status code.
    """
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of str.
    if not isinstance(p, str):
        # Anything that is not a string is treated as an already-parsed
        # policy mapping.
        return data.Policy(logger=logger, **p)

    if validators.url(p):
        response = requests.get(p)
        # Fail on 4xx/5xx instead of feeding an error page to the YAML parser.
        response.raise_for_status()
        yaml_data = response.text
    else:
        with open(p) as f:
            yaml_data = f.read()

    # NOTE(review): the YAML may have been downloaded from a remote URL;
    # consider yaml.safe_load if policies never rely on FullLoader-only tags.
    policy = yaml.load(yaml_data, Loader=yaml.FullLoader)

    return data.Policy(logger=logger, **policy)
示例#3
0
def test_apply_complex_policies_pandas():
    """Apply the ``complex_y`` fixture policy to a pandas DataFrame.

    The policy is applied to a two-row frame and the result is compared
    against the expected frame.
    """
    raw_policy = yaml.load(fixtures.complex_y, Loader=yaml.FullLoader)
    policy = data.Policy(**raw_policy)

    source = pd.DataFrame({
        "name": ["bob", "alice"],
        "val-int": [30, 50],
        "val-float": [32.43424, 56.64543],
        "date": [pd.Timestamp("2018-10-15"), pd.Timestamp("2016-09-10")],
    })

    actual = policy_lib.apply_policy(policy, source)

    expected_names = [
        "db6063546d5d6c1fd3826bc0a1d8188fa0dae1a174823eac1e8e063a073bf149",
        "4ae0639267ad49c658e8d266aa1caa51c876ed1d7ca788a0749d5189248295eb",
    ]
    expected_dates = [pd.Timestamp("2018-01-01"), pd.Timestamp("2016-01-01")]
    expected = pd.DataFrame({
        "name": expected_names,
        "val-int": [23, 58],
        "val-float": [32.4, 56.6],
        "date": expected_dates,
    })

    pdt.assert_frame_equal(actual, expected)
示例#4
0
def test_secret_in_named_transform():
    """Apply the ``secret_yaml`` fixture policy to a one-column frame.

    The assertion requires the frame returned by ``apply_policy`` to be
    equal to the input frame.
    """
    raw_policy = yaml.load(fixtures.secret_yaml, Loader=yaml.FullLoader)
    policy = data.Policy(**raw_policy)
    frame = pd.DataFrame({"name": ["bob", "alice"]})

    result = policy_lib.apply_policy(policy, frame)

    pdt.assert_frame_equal(result, frame)
示例#5
0
def test_named_transform_type_not_found():
    """Check the error raised when a transform type is missing from the registry.

    The policy is built from ``named_not_found_y("plusOne", "plusOne",
    "plusM")``; resolving its first transformation must raise
    ``NamedTransformNotFound`` with the expected message.
    """
    raw_policy = yaml.load(
        fixtures.named_not_found_y("plusOne", "plusOne", "plusM"),
        Loader=yaml.FullLoader,
    )
    policy = data.Policy(**raw_policy)
    first_transform = policy.rules[0].transformations[0]

    with pytest.raises(exceptions.NamedTransformNotFound) as exc_info:
        policy_lib._get_transformation(
            policy,
            first_transform,
            pandas_lib.registry,
            pandas_lib.dtypes,
        )

    expected_message = "Could not find transform of type plusM in registry"
    assert str(exc_info.value) == expected_message
示例#6
0
def test_apply_policy_pandas():
    """Apply the ``fixtures.y`` policy to a column of ones.

    ``PlusN`` is registered under the name ``"plusN"`` first; the result of
    ``apply_policy`` must equal the input frame plus 3.
    """
    pandas_lib.registry.register("plusN", test_utils.PlusN)
    raw_policy = yaml.load(fixtures.y, Loader=yaml.FullLoader)

    ones = pd.DataFrame(np.ones(5), columns=["test"])
    # Computed before the policy is applied so it cannot be affected by
    # whatever apply_policy does with its input.
    expected = ones + 3

    policy = data.Policy(**raw_policy)
    result = policy_lib.apply_policy(policy, ones)

    pdt.assert_frame_equal(result, expected)
示例#7
0
def test_named_transformation_spark():
    """Apply the ``named_y`` fixture policy to a Spark DataFrame of ones.

    ``PlusN`` is registered in the Spark registry for the duration of the
    test; the result (converted back to pandas) must equal the input frame
    plus 3. The registration is removed again even when the test fails.
    """
    sess = spark_lib.utils.make_session("test.policy.namedTransformations")
    pd_df = pd.DataFrame(np.ones(5, ), columns=["test"])
    expected_df = pd_df + 3
    df = sess.createDataFrame(pd_df)

    identifier = test_utils.PlusN.identifier
    spark_lib.registry.register(identifier, test_utils.PlusN)
    try:
        d = yaml.load(fixtures.named_y, Loader=yaml.FullLoader)
        p = data.Policy(**d)
        new_df = policy_lib.apply_policy(p, df).toPandas()

        pdt.assert_frame_equal(new_df, expected_df)
    finally:
        # Unregister in a finally block so a failing assertion (or any other
        # exception above) does not leave PlusN registered for later tests.
        del spark_lib.registry._registry[identifier]
示例#8
0
def test_named_transform_not_found():
    """Check the error raised when a named transform is not declared.

    The policy is built from ``named_not_found_y("plusOne",
    "plusOneThousand", "plusN")``; resolving its first transformation must
    raise ``NamedTransformNotFound`` with the expected message.
    """
    pandas_lib.registry.register("plusN", test_utils.PlusN)
    d = yaml.load(
        fixtures.named_not_found_y("plusOne", "plusOneThousand", "plusN"),
        Loader=yaml.FullLoader,
    )

    p = data.Policy(**d)
    tfm = p.rules[0].transformations[0]

    with pytest.raises(exceptions.NamedTransformNotFound) as e:
        # The third argument is the transformation registry, not a DataFrame.
        policy_lib._get_transformation(p, tfm, pandas_lib.registry,
                                       pandas_lib.dtypes)

    assert str(e.value) == (
        "Could not find transform plusOneThousand in transformations block")
示例#9
0
def parse_policy(p: str) -> "data.Policy":
    """Parses a policy yaml file.

    The passed in string can either be a path to a local file or
    a URL pointing to a file. If it is a URL then requests attempts to
    download it.

    Args:
        p: a path string or a URL string

    Returns:
        The Policy object initialized by the yaml.

    Raises:
        requests.HTTPError: if ``p`` is a URL and the server answers with
            an error status code.
    """
    yaml_data: str

    if validators.url(p):
        response = requests.get(p)
        # Fail on 4xx/5xx instead of feeding an error page to the YAML parser.
        response.raise_for_status()
        yaml_data = response.text
    else:
        with open(p) as f:
            yaml_data = f.read()

    # NOTE(review): the YAML may have been downloaded from a remote URL;
    # consider yaml.safe_load if policies never rely on FullLoader-only tags.
    policy = yaml.load(yaml_data, Loader=yaml.FullLoader)
    return data.Policy(**policy)