import pytest

# NOTE: ValidateSparkDataFrame and AssertValidationResult come from the project
# under test and its test utilities; their import paths are omitted here.
# spark_session is assumed to be a pytest fixture, and the data helpers
# (empty_integer_df, single_integer_column_schema) are sketched below.


def test_should_return_df_without_changes_if_empty_df_with_mean_constraint(spark_session):
    df = empty_integer_df(spark_session)

    result = ValidateSparkDataFrame(spark_session, df) \
        .mean_column_value("col1", 0, 1) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="mean_between") \
        .check(
            actual=result,
            expected_correct=df,
            expected_erroneous=df
        )
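
# A minimal sketch of the data helpers these tests rely on. The definitions are
# assumptions inferred from usage, not the suite's real code: the actual schema
# object and factory may live in a shared fixtures module.
from pyspark.sql.types import IntegerType, StructField, StructType

single_integer_column_schema = StructType([StructField("col1", IntegerType())])


def empty_integer_df(spark_session):
    # An empty DataFrame with the single integer column ("col1") used throughout.
    return spark_session.createDataFrame([], schema=single_integer_column_schema)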
def test_should_reject_all_rows_if_smaller_than_min(spark_session):
    df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_min("col1", 20) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="min") \
        .check(
            actual=result,
            expected_correct=empty_integer_df(spark_session),
            expected_erroneous=df
        )


def test_should_return_df_without_changes_if_all_are_between(spark_session):
    df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_between("col1", 5, 15) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="between") \
        .check(
            actual=result,
            expected_correct=df,
            expected_erroneous=empty_integer_df(spark_session)
        )


def test_should_reject_all_rows_if_mean_is_larger_than_given_values(spark_session):
    df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .mean_column_value("col1", 5, 8) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="mean_between") \
        .check(
            actual=result,
            expected_correct=empty_integer_df(spark_session),
            expected_erroneous=df
        )
def test_should_throw_error_if_there_are_duplicate_constraints(spark_session):
    with pytest.raises(ValueError):
        ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \
            .mean_column_value("col1", 10, 10) \
            .mean_column_value("col1", 5, 5) \
            .execute()


def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session):
    with pytest.raises(ValueError):
        ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \
            .mean_column_value("column_that_does_not_exist", 5, 5) \
            .execute()
def test_should_throw_error_if_length_constraint_is_not_on_a_text_column(spark_session):
    with pytest.raises(ValueError):
        ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \
            .has_length_between("col1", 5, 10) \
            .execute()


def test_should_throw_error_if_regex_constraint_is_not_on_a_text_column(spark_session):
    with pytest.raises(ValueError):
        ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \
            .text_matches_regex("col1", '[a-z]*') \
            .execute()
def test_should_throw_error_if_lower_bound_is_greater_than_upper_bound(spark_session):
    with pytest.raises(ValueError):
        ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \
            .is_between("col1", 10, 5) \
            .execute()
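
# Hedged example (not part of the original suite): the builder methods above
# look chainable, so several constraints should compose into one validation
# pass. The `correct_data` attribute on the result is an assumption about the
# library's API, inferred from how AssertValidationResult compares DataFrames.
def test_example_rows_satisfying_all_chained_constraints_are_kept(spark_session):
    df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_min("col1", 5) \
        .is_between("col1", 5, 15) \
        .execute()

    # Every row satisfies both constraints, so no row should be rejected.
    assert result.correct_data.count() == 3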