def assert_schema(spark_session, dataset_name, expected,  # noqa F401
                  dataframe_source, schema):
    """Assert that a dataset loaded with ``schema`` matches ``expected``.

    The dataset is fetched through ``dataframe.get`` with the given schema
    and dataframe source, then compared against ``expected`` with
    ``dfdiff.diff``. The assertion passes only when the diff is falsy.

    Args:
        spark_session: Session object handed to ``dataframe.get``.
        dataset_name: Name of the dataset to load.
        expected: Data frame the loaded result is compared against.
        dataframe_source: Source object forwarded to ``dataframe.get``.
        schema: Schema forwarded to ``dataframe.get``.

    Raises:
        AssertionError: If ``dfdiff.diff`` returns a truthy value.
    """
    loaded = dataframe.get(
        spark_session,
        dataset_name,
        schema=schema,
        dataframe_source=dataframe_source,
    )
    mismatch = dfdiff.diff(loaded, expected)
    assert not mismatch
def datasets(spark_session, expected_ds, result_ds, project_key=None):
    """Check that two data sets hold the same data.

    Both data sets are loaded with ``dataframe.get`` (expected first, then
    result) and compared with ``dfdiff.diff``. A truthy diff fails the
    assertion and is appended to the assertion message; otherwise a
    success line is printed.

    Args:
        spark_session (SparkSession): Spark session used to load data frames.
        expected_ds (str): Name of the expected data set.
        result_ds (str): Name of the result data set.
        project_key (str): Passed as ``prefix`` when loading both data sets;
            used if the data set lives in a separate dataiku project.

    Raises:
        AssertionError: If the diff between the two data frames is truthy.
    """
    # The generator is consumed left to right, so the expected data set is
    # loaded before the result data set.
    expected_frame, result_frame = (
        dataframe.get(spark_session, ds_name, prefix=project_key)
        for ds_name in (expected_ds, result_ds)
    )
    report = dfdiff.diff(expected_frame, result_frame)
    assert not report, "Dataframe diff: " + report
    print("Test successful")
def test_equal(spark_session, fixtures, expected):
    """Write and read a dataset via a mocked S3 parquet source, then diff.

    The write side is a ``MagicMock`` whose save call returns ``None``.
    The read side is a ``MagicMock`` session whose load call returns the
    ``fixtures`` data frame. The frame read back must show no diff against
    ``expected``.
    """
    name = "fixtures"
    bucket_dir = "s3://birgittatestbucket/sourcetests"
    source = S3Source(format='parquet')

    # Writing: the mocked frame stands in for the real one.
    writer_mock = MagicMock()
    writer_mock.write.format().mode().save.return_value = None
    dataframe.write(writer_mock,
                    name,
                    prefix=bucket_dir,
                    dataframe_source=source)

    # Reading: the mocked session hands back the fixtures frame.
    session_mock = MagicMock()
    session_mock.read.format().load.return_value = fixtures
    loaded = dataframe.get(session_mock,
                           name,
                           prefix=bucket_dir,
                           dataframe_source=source)

    assert not dfdiff.diff(loaded, expected)
# Verify the message dfdiff.diff returns when the second frame has fewer rows.
# `expected` is cut down to 5 rows with limit(5) and diffed against `fixtures`;
# the returned string must equal `expected_output` exactly. That literal
# reports a row count diff (10 vs 5), the rows found only in the first frame
# (f..j), an empty "only in actual result" frame, and both frames in full.
# NOTE(review): the line breaks and column padding inside the triple-quoted
# literal appear collapsed in this view - confirm against the original file
# before editing the string.
def test_error_on_short(fixtures, expected): # noqa F811 expected_short = expected.limit(5) expected_output = """Error: Row count diff Expected: 10 Actual: 5 Rows are different (max 20 rows shown) Only in expected: letter number 0 f 6 1 g 7 2 h 8 3 i 9 4 j 10 Only in actual result: Empty DataFrame Columns: [letter, number] Index: [] Expected: letter number 0 a 1 1 b 2 2 c 3 3 d 4 4 e 5 5 f 6 6 g 7 7 h 8 8 i 9 9 j 10 Actual: letter number 0 a 1 1 b 2 2 c 3 3 d 4 4 e 5""" assert dfdiff.diff(fixtures, expected_short) == expected_output
def test_equal(fixtures, expected):  # noqa F811
    """Check that ``dfdiff.diff`` reports nothing for matching frames.

    The test passes when the diff of ``fixtures`` against ``expected`` is
    falsy.
    """
    difference = dfdiff.diff(fixtures, expected)
    assert not difference
def test_error_on_val_diff(fixtures, expected): # noqa F811 expected_val_diff = expected.withColumn( 'number', F.lit(3).cast(LongType())) assert dfdiff.diff(fixtures, expected_val_diff) == """Error: Rows are different (max 20 rows shown)
def test_error_on_col_name_diff(fixtures, expected): # noqa F811 expected_col_name_diff = expected.withColumn( 'numfoo', F.col('number') ).drop('number') assert dfdiff.diff(fixtures, expected_col_name_diff) == """Error: Cols diff
def test_error_on_extra_col(fixtures, expected): # noqa F811 expected_extra_col = expected.withColumn("foo", F.lit("bar")) assert dfdiff.diff(fixtures, expected_extra_col) == """Error: Cols diff
def test_error_on_single_val_diff(fixtures, expected): # noqa F811 num_when = F.when(F.col('number') == 3, 3333).otherwise(F.col('number')) expected_single_val_diff = expected.withColumn('number', num_when) assert dfdiff.diff(fixtures, expected_single_val_diff) == """Error: Rows are different (max 20 rows shown)