def test_should_pass_empty_df_if_there_are_no_rules(spark_session):
    """An empty DataFrame with no constraints passes through unchanged."""
    data_frame = empty_string_df(spark_session)

    validation_result = ValidateSparkDataFrame(spark_session, data_frame).execute()

    AssertValidationResult(column_name="col1", constraint_name="").check(
        actual=validation_result,
        expected_correct=data_frame,
        expected_erroneous=data_frame,
    )
def test_should_pass_df_if_there_are_no_rules(spark_session):
    """With no constraints, every row is classified as correct and none as erroneous."""
    data_frame = spark_session.createDataFrame(
        [["abc"], ["def"]], schema=single_string_column_schema
    )

    validation_result = ValidateSparkDataFrame(spark_session, data_frame).execute()

    AssertValidationResult(column_name="col1", constraint_name="").check(
        actual=validation_result,
        expected_correct=data_frame,
        expected_erroneous=empty_string_df(spark_session),
    )
def test_should_return_df_without_changes_if_all_rows_are_unique(spark_session):
    """The is_unique constraint accepts a DataFrame whose values are all distinct."""
    data_frame = spark_session.createDataFrame(
        [["abc"], ["def"], ["ghi"]], schema=single_string_column_schema
    )

    validation_result = (
        ValidateSparkDataFrame(spark_session, data_frame)
        .is_unique("col1")
        .execute()
    )

    AssertValidationResult(column_name="col1", constraint_name="unique").check(
        actual=validation_result,
        expected_correct=data_frame,
        expected_erroneous=empty_string_df(spark_session),
    )
def test_should_return_df_without_changes_if_empty_df_with_is_unique_constraint(spark_session):
    """The is_unique constraint is trivially satisfied by an empty DataFrame."""
    data_frame = empty_string_df(spark_session)

    validation_result = (
        ValidateSparkDataFrame(spark_session, data_frame)
        .is_unique("col1")
        .execute()
    )

    AssertValidationResult(column_name="col1", constraint_name="unique").check(
        actual=validation_result,
        expected_correct=data_frame,
        expected_erroneous=data_frame,
    )
def test_should_pass_empty_df_with_not_null_constraint(spark_session):
    """The is_not_null constraint is trivially satisfied by an empty DataFrame."""
    data_frame = empty_string_df(spark_session)

    validation_result = (
        ValidateSparkDataFrame(spark_session, data_frame)
        .is_not_null("col1")
        .execute()
    )

    AssertValidationResult(column_name="col1", constraint_name="not_null").check(
        actual=validation_result,
        expected_correct=data_frame,
        expected_erroneous=data_frame,
    )
def test_should_return_df_without_changes_if_regex_matches_the_text(spark_session):
    """A catch-all regex (".*") marks every row as correct."""
    data_frame = spark_session.createDataFrame(
        [["abc"], ["def"], ["ghi"]], schema=single_string_column_schema
    )

    validation_result = (
        ValidateSparkDataFrame(spark_session, data_frame)
        .text_matches_regex("col1", ".*")
        .execute()
    )

    AssertValidationResult(column_name="col1", constraint_name="regex_match").check(
        actual=validation_result,
        expected_correct=data_frame,
        expected_erroneous=empty_string_df(spark_session),
    )
def test_should_reject_all_rows_if_none_of_them_is_in_the_list(spark_session):
    """The one_of constraint rejects every row whose value is absent from the allowed list."""
    rows = [["abc"], ["a"], ["abcdefghi"]]
    data_frame = spark_session.createDataFrame(rows, schema=single_string_column_schema)
    # None of the input values appears in the allowed list, so all rows are errors.
    rejected_rows = spark_session.createDataFrame(rows, schema=single_string_column_schema)

    validation_result = (
        ValidateSparkDataFrame(spark_session, data_frame)
        .one_of("col1", ["ab", "b"])
        .execute()
    )

    AssertValidationResult(column_name="col1", constraint_name="one_of").check(
        actual=validation_result,
        expected_correct=empty_string_df(spark_session),
        expected_erroneous=rejected_rows,
    )
def test_should_reject_all_rows_if_regex_match_fails(spark_session):
    """A digits-only regex rejects every purely alphabetic row."""
    rows = [["abc"], ["a"], ["abcdefghi"]]
    data_frame = spark_session.createDataFrame(rows, schema=single_string_column_schema)
    # No value contains a digit, so every row fails the regex check.
    rejected_rows = spark_session.createDataFrame(rows, schema=single_string_column_schema)

    validation_result = (
        ValidateSparkDataFrame(spark_session, data_frame)
        .text_matches_regex("col1", "[0-9]+")
        .execute()
    )

    AssertValidationResult(column_name="col1", constraint_name="regex_match").check(
        actual=validation_result,
        expected_correct=empty_string_df(spark_session),
        expected_erroneous=rejected_rows,
    )
def test_should_return_df_without_changes_if_all_are_longer_than_lower_bound(spark_session):
    """has_length_between accepts rows whose text length lies within [lower, upper]."""
    data_frame = spark_session.createDataFrame(
        [["abcdef"], ["ghijkl"]], schema=single_string_column_schema
    )

    # Both values are 6 characters long, within the 5..20 range.
    validation_result = (
        ValidateSparkDataFrame(spark_session, data_frame)
        .has_length_between("col1", 5, 20)
        .execute()
    )

    AssertValidationResult(column_name="col1", constraint_name="text_length").check(
        actual=validation_result,
        expected_correct=data_frame,
        expected_erroneous=empty_string_df(spark_session),
    )
def test_should_reject_all_rows_if_all_are_too_short_or_too_long(spark_session):
    """has_length_between rejects rows shorter than the lower bound or longer than the upper bound."""
    rows = [["abc"], ["a"], ["abcdefghi"]]
    data_frame = spark_session.createDataFrame(rows, schema=single_string_column_schema)
    # Lengths are 3, 1, and 9 — all outside the 5..8 range, so every row is an error.
    rejected_rows = spark_session.createDataFrame(rows, schema=single_string_column_schema)

    validation_result = (
        ValidateSparkDataFrame(spark_session, data_frame)
        .has_length_between("col1", 5, 8)
        .execute()
    )

    AssertValidationResult(column_name="col1", constraint_name="text_length").check(
        actual=validation_result,
        expected_correct=empty_string_df(spark_session),
        expected_erroneous=rejected_rows,
    )
def test_should_throw_error_if_constraint_is_not_a_numeric_column(spark_session):
    """mean_column_value raises ValueError when applied to a string column."""
    validator = ValidateSparkDataFrame(spark_session, empty_string_df(spark_session))

    with pytest.raises(ValueError):
        validator.mean_column_value("col1", 10, 10).execute()
def test_should_throw_error_if_lower_bound_is_greater_than_upper_bound(spark_session):
    """has_length_between raises ValueError when lower bound exceeds upper bound."""
    validator = ValidateSparkDataFrame(spark_session, empty_string_df(spark_session))

    with pytest.raises(ValueError):
        validator.has_length_between("col1", 10, 5).execute()
def test_should_throw_error_if_there_are_duplicate_length_constraints(spark_session):
    """Registering two has_length_between constraints on the same column raises ValueError.

    NOTE(review): this test was originally named
    test_should_throw_error_if_there_are_duplicate_constraints, the same name as two
    other tests in this file; Python keeps only the last definition, so pytest never
    ran this one. Renamed to make it unique and discoverable again.
    """
    validator = ValidateSparkDataFrame(spark_session, empty_string_df(spark_session))

    with pytest.raises(ValueError):
        validator \
            .has_length_between("col1", 0, 10) \
            .has_length_between("col1", 0, 5) \
            .execute()
def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session):
    """Constraining a column absent from the DataFrame raises ValueError."""
    validator = ValidateSparkDataFrame(spark_session, empty_string_df(spark_session))

    with pytest.raises(ValueError):
        validator.has_length_between("column_that_does_not_exist", 0, 1).execute()
def test_should_throw_error_if_there_are_duplicate_unique_constraints(spark_session):
    """Registering is_unique twice on the same column raises ValueError.

    NOTE(review): this test was originally named
    test_should_throw_error_if_there_are_duplicate_constraints, colliding with two
    other tests in this file; only the last definition was collected by pytest.
    Renamed to make it unique so it actually runs.
    """
    validator = ValidateSparkDataFrame(spark_session, empty_string_df(spark_session))

    with pytest.raises(ValueError):
        validator \
            .is_unique("col1") \
            .is_unique("col1") \
            .execute()
def test_should_throw_error_if_there_are_duplicate_regex_constraints(spark_session):
    """Registering two text_matches_regex constraints on the same column raises ValueError.

    NOTE(review): two fixes. (1) The original name collided with two other tests in
    this file, so pytest collected only one of the three — renamed to be unique.
    (2) The original targeted "column_that_does_not_exist", so the ValueError could
    have come from the missing-column check rather than the duplicate-constraint
    check this test claims to cover; it now uses the real "col1" column.
    """
    validator = ValidateSparkDataFrame(spark_session, empty_string_df(spark_session))

    with pytest.raises(ValueError):
        validator \
            .text_matches_regex("col1", '.*') \
            .text_matches_regex("col1", '[a-z]*') \
            .execute()