# These tests assume a shared conftest/test module that provides the
# `spark_session` fixture, the `two_string_columns_schema`,
# `two_integer_columns_schema`, and `single_string_column_schema` schema
# definitions, and imports `ValidateSparkDataFrame`, `ValidationError`, and
# the `AssertDf` helper.
def test_should_check_all_given_columns_separately(spark_session):
    df = spark_session.createDataFrame(
        [["a", "12"], ["abcde", "56"], ["def", "123"]],
        schema=two_string_columns_schema)

    expected_correct = spark_session.createDataFrame(
        [], schema=two_string_columns_schema)
    expected_errors = spark_session.createDataFrame(
        [["a", "12"], ["abcde", "56"], ["def", "123"]],
        schema=two_string_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .has_length_between("col1", 2, 4) \
        .has_length_between("col2", 1, 2) \
        .execute()

    AssertDf(result.correct_data, order_by_column="col1") \
        .contains_exactly(expected_correct.toPandas()) \
        .has_columns(["col1", "col2"])

    AssertDf(result.erroneous_data, order_by_column="col2") \
        .contains_exactly(expected_errors.toPandas()) \
        .has_columns(["col1", "col2"])

    assert result.errors == [
        ValidationError("col1", "text_length", 2),
        ValidationError("col2", "text_length", 1)
    ]

def test_min_should_check_all_given_columns_separately(spark_session):
    df = spark_session.createDataFrame([[5, 1], [10, 2], [15, 3]],
                                       schema=two_integer_columns_schema)
    expected_correct = spark_session.createDataFrame(
        [], schema=two_integer_columns_schema)
    expected_errors = spark_session.createDataFrame(
        [[5, 1], [10, 2], [15, 3]], schema=two_integer_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_min("col1", 20) \
        .is_min("col2", 5) \
        .execute()

    AssertDf(result.correct_data, order_by_column="col1") \
        .contains_exactly(expected_correct.toPandas()) \
        .has_columns(["col1", "col2"])

    AssertDf(result.erroneous_data, order_by_column="col2") \
        .contains_exactly(expected_errors.toPandas()) \
        .has_columns(["col1", "col2"])

    assert result.errors == [
        ValidationError("col1", "min", 3),
        ValidationError("col2", "min", 3)
    ]

def test_should_return_rows_that_pass_all_checks_and_reject_rows_that_violate_any_test(spark_session):
    not_between = [25, 1]
    max_exceeded = [3, 30]
    correct = [3, 15]
    less_than_min = [1, 15]
    both_wrong = [7, 30]

    df = spark_session.createDataFrame(
        [not_between, max_exceeded, correct, less_than_min, both_wrong],
        schema=two_integer_columns_schema)
    expected_correct = spark_session.createDataFrame(
        [correct], schema=two_integer_columns_schema)
    expected_errors = spark_session.createDataFrame(
        [not_between, max_exceeded, less_than_min, both_wrong],
        schema=two_integer_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_between("col1", 0, 5) \
        .is_min("col1", 3) \
        .is_max("col2", 20) \
        .execute()

    AssertDf(result.correct_data, order_by_column="col1") \
        .contains_exactly(expected_correct.toPandas()) \
        .has_columns(["col1", "col2"])

    AssertDf(result.erroneous_data, order_by_column="col2") \
        .contains_exactly(expected_errors.toPandas()) \
        .has_columns(["col1", "col2"])

    assert result.errors == [ValidationError("col1", "between", 2), ValidationError("col1", "min", 1), ValidationError("col2", "max", 2)]

    def check(self, *, actual: ValidationResult, expected_correct: DataFrame,
              expected_erroneous: DataFrame):
        """Assert that `actual` contains exactly the expected correct and
        erroneous rows, plus one ValidationError counting the rejected rows."""
        if expected_correct.count() == 0:
            AssertDf(actual.correct_data) \
                .is_empty() \
                .has_columns(expected_correct.columns)
        else:
            AssertDf(actual.correct_data, order_by_column=self.column_name) \
                .contains_exactly(expected_correct.toPandas()) \
                .has_columns(expected_correct.columns)

        # Count once; every .count() call triggers a separate Spark job.
        erroneous_count = expected_erroneous.count()
        if erroneous_count == 0:
            AssertDf(actual.erroneous_data) \
                .is_empty() \
                .has_columns(expected_erroneous.columns)
            assert actual.errors == []
        else:
            AssertDf(actual.erroneous_data, order_by_column=self.column_name) \
                .contains_exactly(expected_erroneous.toPandas()) \
                .has_columns(expected_erroneous.columns)
            assert actual.errors == [
                ValidationError(self.column_name, self.constraint_name,
                                erroneous_count)
            ]
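
# The `check` helper above centralizes the three assertions the individual
# tests repeat. What follows is a hypothetical usage sketch, not code from
# the original project: it assumes the helper is defined on a base class
# (called SingleColumnConstraintTest here; the name is illustrative) that
# exposes `column_name` and `constraint_name` attributes.
class MinConstraintTest(SingleColumnConstraintTest):
    column_name = "col1"
    constraint_name = "min"


def test_min_via_helper(spark_session):
    df = spark_session.createDataFrame(
        [[1, 1], [30, 2]], schema=two_integer_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_min("col1", 10) \
        .execute()

    # One row is below the minimum and one passes, so the helper expects
    # exactly ValidationError("col1", "min", 1).
    MinConstraintTest().check(
        actual=result,
        expected_correct=spark_session.createDataFrame(
            [[30, 2]], schema=two_integer_columns_schema),
        expected_erroneous=spark_session.createDataFrame(
            [[1, 1]], schema=two_integer_columns_schema))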

def test_should_reject_all_rows_if_all_are_the_same(spark_session):
    df = spark_session.createDataFrame(
        [["abc"], ["abc"], ["abc"]], schema=single_string_column_schema)
    expected_errors = spark_session.createDataFrame(
        [["abc"]], schema=single_string_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_unique("col1") \
        .execute()

    AssertDf(result.correct_data) \
        .is_empty() \
        .has_columns(["col1"])

    AssertDf(result.erroneous_data, order_by_column="col1") \
        .contains_exactly(expected_errors.toPandas()) \
        .has_columns(["col1"])

    assert result.errors == [ValidationError("col1", "unique", 3)]

def test_not_null_should_check_all_given_columns_separately_even_if_all_of_them_are_defined_at_once(spark_session):
    df = spark_session.createDataFrame(
        [["abc", None], [None, "456"], [None, None]],
        schema=two_string_columns_schema)
    expected_errors = spark_session.createDataFrame(
        [["abc", None], [None, "456"], [None, None]],
        schema=two_string_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .are_not_null(["col1", "col2"]) \
        .execute()

    AssertDf(result.correct_data) \
        .is_empty() \
        .has_columns(["col1", "col2"])

    AssertDf(result.erroneous_data, order_by_column=["col1", "col2"]) \
        .contains_exactly(expected_errors.toPandas()) \
        .has_columns(["col1", "col2"])

    assert result.errors == [ValidationError("col1", "not_null", 2), ValidationError("col2", "not_null", 2)]

def test_uniqueness_of_other_columns_is_ignored(spark_session):
    df = spark_session.createDataFrame(
        [["abc", "123"], ["abc", "456"], ["def", "123"]],
        schema=two_string_columns_schema)
    expected_correct = spark_session.createDataFrame(
        [["def", "123"]], schema=two_string_columns_schema)
    expected_errors = spark_session.createDataFrame(
        [["abc", "123"], ["abc", "456"]], schema=two_string_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_unique("col1") \
        .execute()

    AssertDf(result.correct_data, order_by_column="col1") \
        .contains_exactly(expected_correct.toPandas()) \
        .has_columns(["col1", "col2"])

    AssertDf(result.erroneous_data, order_by_column="col2") \
        .contains_exactly(expected_errors.toPandas()) \
        .has_columns(["col1", "col2"])

    assert result.errors == [ValidationError("col1", "unique", 2)]

def test_uniqueness_should_check_all_given_columns_separately_when_defining_all_columns_at_once(spark_session):
    df = spark_session.createDataFrame(
        [["abc", "123"], ["abc", "456"], ["def", "123"]],
        schema=two_string_columns_schema)
    expected_correct = spark_session.createDataFrame(
        [], schema=two_string_columns_schema)
    expected_errors = spark_session.createDataFrame(
        [["abc", "123"], ["abc", "456"], ["def", "123"]],
        schema=two_string_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .are_unique(["col1", "col2"]) \
        .execute()

    AssertDf(result.correct_data, order_by_column="col1") \
        .contains_exactly(expected_correct.toPandas()) \
        .has_columns(["col1", "col2"])

    AssertDf(result.erroneous_data, order_by_column="col2") \
        .contains_exactly(expected_errors.toPandas()) \
        .has_columns(["col1", "col2"])

    assert result.errors == [ValidationError("col1", "unique", 2), ValidationError("col2", "unique", 2)]

def test_between_ignores_the_other_column(spark_session):
    df = spark_session.createDataFrame([[5, 8], [10, 20], [15, 8]],
                                       schema=two_integer_columns_schema)
    expected_correct = spark_session.createDataFrame(
        [[5, 8], [10, 20]], schema=two_integer_columns_schema)
    expected_errors = spark_session.createDataFrame(
        [[15, 8]], schema=two_integer_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_between("col1", 5, 10) \
        .execute()

    AssertDf(result.correct_data, order_by_column="col1") \
        .contains_exactly(expected_correct.toPandas()) \
        .has_columns(["col1", "col2"])

    AssertDf(result.erroneous_data, order_by_column="col2") \
        .contains_exactly(expected_errors.toPandas()) \
        .has_columns(["col1", "col2"])

    assert result.errors == [ValidationError("col1", "between", 1)]

def test_spark_sql_operation(spark_session):
    df_schema = StructType([
        StructField("col1", StringType()),
        StructField("col2", IntegerType())
    ])

    test_list = [["v1", 1], ["v1", 2], ["v2", 3]]

    df: DataFrame = spark_session.createDataFrame(test_list, schema=df_schema)
    aggregated = df.groupby("col1").sum("col2").orderBy('col1')

    AssertDf(aggregated) \
        .contains_exactly(
            pd.DataFrame([['v1', 3], ['v2', 3]],
                         columns=['col1', 'sum(col2)'])
            .sort_values('col1')) \
        .has_columns(["col1", "sum(col2)"]) \
        .has_n_rows(2)

def test_empty_dataframe(spark_session):
    df_schema = StructType([StructField("col1", StringType())])

    df = spark_session.createDataFrame([], schema=df_schema)
    AssertDf(df).is_empty()
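
# Every test above takes a `spark_session` fixture. A minimal conftest.py
# sketch that would provide it (the builder configuration is an assumption,
# not taken from the original project):
import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark_session():
    # One local Spark session shared across the whole test run.
    session = SparkSession.builder \
        .master("local[2]") \
        .appName("validation-tests") \
        .getOrCreate()
    yield session
    session.stop()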