def test_should_return_df_without_changes_if_all_are_between(spark_session):
    df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_between("col1", 5, 15) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="between") \
        .check(
            actual=result,
            expected_correct=df,
            expected_erroneous=empty_integer_df(spark_session)
        )

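# --- Shared test scaffolding (hedged sketch, not part of the original tests) ---
# The tests in this module use schema constants and empty-DataFrame factories
# that would normally be imported from a shared helpers/conftest module. The
# definitions below are an assumed reconstruction based solely on how the
# tests use them: single- and two-column schemas, plus empty frames of the
# matching shape. The real project's definitions may differ.
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

single_integer_column_schema = StructType([StructField("col1", IntegerType())])
single_string_column_schema = StructType([StructField("col1", StringType())])
two_integer_columns_schema = StructType([
    StructField("col1", IntegerType()),
    StructField("col2", IntegerType())
])
two_string_columns_schema = StructType([
    StructField("col1", StringType()),
    StructField("col2", StringType())
])


def empty_integer_df(spark_session):
    # An empty DataFrame with the single integer column layout used above.
    return spark_session.createDataFrame([], schema=single_integer_column_schema)


def empty_string_df(spark_session):
    # An empty DataFrame with the single string column layout used above.
    return spark_session.createDataFrame([], schema=single_string_column_schema)
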
def test_should_return_df_without_changes_if_empty_df_with_is_between_constraint(
        spark_session):
    df = empty_integer_df(spark_session)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_between("col1", 5, 10) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="between") \
        .check(
            actual=result,
            expected_correct=df,
            expected_erroneous=df
        )

def test_should_reject_all_rows_if_mean_is_larger_than_given_values(
        spark_session):
    df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .mean_column_value("col1", 5, 8) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="mean_between") \
        .check(
            actual=result,
            expected_correct=empty_integer_df(spark_session),
            expected_erroneous=df
        )

def test_should_return_df_without_changes_if_all_are_longer_than_lower_bound(
        spark_session):
    df = spark_session.createDataFrame([["abcdef"], ["ghijkl"]], schema=single_string_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .has_length_between("col1", 5, 20) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="text_length") \
        .check(
            actual=result,
            expected_correct=df,
            expected_erroneous=empty_string_df(spark_session)
        )

def test_should_return_both_correct_and_incorrect_rows(spark_session):
    df = spark_session.createDataFrame([["a"], ["abc"], ["defg"], ["hijkl"]],
                                       schema=single_string_column_schema)
    expected_correct = spark_session.createDataFrame([["abc"], ["defg"]],
                                                     schema=single_string_column_schema)
    expected_errors = spark_session.createDataFrame([["a"], ["hijkl"]],
                                                    schema=single_string_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .text_matches_regex("col1", "^[a-z]{3,4}$") \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="regex_match") \
        .check(
            actual=result,
            expected_correct=expected_correct,
            expected_erroneous=expected_errors
        )

def test_mean_value_of_other_columns_is_ignored(spark_session):
    df = spark_session.createDataFrame([[5, 1], [10, 2], [15, 3]], schema=two_integer_columns_schema)
    expected_errors = spark_session.createDataFrame([], schema=two_integer_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .mean_column_value("col1", 10, 10) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="mean_between") \
        .check(
            actual=result,
            expected_correct=df,
            expected_erroneous=expected_errors
        )

def test_nulls_in_other_columns_are_ignored(spark_session):
    df = spark_session.createDataFrame([["abc", "123"], [None, "456"], ["def", None]],
                                       schema=two_string_columns_schema)
    expected_correct = spark_session.createDataFrame([["abc", "123"], ["def", None]],
                                                     schema=two_string_columns_schema)
    expected_errors = spark_session.createDataFrame([[None, "456"]],
                                                    schema=two_string_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_not_null("col1") \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="not_null") \
        .check(
            actual=result,
            expected_correct=expected_correct,
            expected_erroneous=expected_errors
        )

def test_should_return_both_correct_and_incorrect_rows_not_null(spark_session):
    df = spark_session.createDataFrame([["abc"], [None]], schema=single_string_column_schema)
    expected_correct = spark_session.createDataFrame([["abc"]], schema=single_string_column_schema)
    expected_errors = spark_session.createDataFrame([[None]], schema=single_string_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_not_null("col1") \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="not_null") \
        .check(
            actual=result,
            expected_correct=expected_correct,
            expected_erroneous=expected_errors
        )

def test_one_of_of_other_columns_is_ignored(spark_session):
    df = spark_session.createDataFrame([["a", "123"], ["bcd", "45"], ["cd", "12345"]],
                                       schema=two_string_columns_schema)
    expected_correct = spark_session.createDataFrame([["cd", "12345"]],
                                                     schema=two_string_columns_schema)
    expected_errors = spark_session.createDataFrame([["a", "123"], ["bcd", "45"]],
                                                    schema=two_string_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .one_of("col1", ["cd", "123", "45"]) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="one_of") \
        .check(
            actual=result,
            expected_correct=expected_correct,
            expected_erroneous=expected_errors
        )

def test_should_return_both_correct_and_incorrect_rows_numeric_values(spark_session):
    df = spark_session.createDataFrame([[1], [2], [3], [4]], schema=single_integer_column_schema)
    expected_correct = spark_session.createDataFrame([[1], [3]], schema=single_integer_column_schema)
    expected_errors = spark_session.createDataFrame([[2], [4]], schema=single_integer_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .one_of("col1", [1, 3, 5]) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="one_of") \
        .check(
            actual=result,
            expected_correct=expected_correct,
            expected_erroneous=expected_errors
        )

def test_should_reject_all_rows_if_all_are_too_short_or_too_long(
        spark_session):
    df = spark_session.createDataFrame([["abc"], ["a"], ["abcdefghi"]],
                                       schema=single_string_column_schema)
    expected_errors = spark_session.createDataFrame([["abc"], ["a"], ["abcdefghi"]],
                                                    schema=single_string_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .has_length_between("col1", 5, 8) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="text_length") \
        .check(
            actual=result,
            expected_correct=empty_string_df(spark_session),
            expected_erroneous=expected_errors
        )

def test_should_return_both_correct_and_incorrect_rows_min(spark_session):
    df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema)
    expected_correct = spark_session.createDataFrame([[10], [15]], schema=single_integer_column_schema)
    expected_errors = spark_session.createDataFrame([[5]], schema=single_integer_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_min("col1", 10) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="min") \
        .check(
            actual=result,
            expected_correct=expected_correct,
            expected_erroneous=expected_errors
        )
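

# --- Assertion helper (hedged sketch, not part of the original tests) ---
# The spark_session fixture used throughout is typically provided by the
# pytest-spark plugin. AssertValidationResult would normally be imported from
# the project's test utilities; the sketch below guesses at its behavior and
# at the shape of the object returned by ValidateSparkDataFrame.execute()
# (correct_data / erroneous_data attributes are an assumption and may differ
# in the real project).
from collections import Counter


class AssertValidationResult:
    def __init__(self, column_name, constraint_name):
        self.column_name = column_name
        self.constraint_name = constraint_name

    def check(self, actual, expected_correct, expected_erroneous):
        # Compare multisets of rows rather than DataFrame objects: Spark
        # DataFrames define no structural equality, row order is not
        # guaranteed, and rows may contain None (which breaks sorting-based
        # comparisons). Row objects are hashable tuples, so Counter works.
        assert Counter(actual.correct_data.collect()) == Counter(expected_correct.collect())
        assert Counter(actual.erroneous_data.collect()) == Counter(expected_erroneous.collect())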