示例#1
0
def assert_schema(
        spark_session,
        dataset_name,
        expected,  # noqa F401
        dataframe_source,
        schema):
    """Assert that a data set loaded with ``schema`` does not differ from ``expected``.

    The data set is fetched through ``dataframe.get`` and then compared
    with ``dfdiff.diff``; a truthy diff result fails the assertion.

    Args:
        spark_session: Session object forwarded to ``dataframe.get``.
        dataset_name: Name of the data set to load.
        expected: Data frame the loaded result is compared against.
        dataframe_source: Source forwarded to ``dataframe.get``.
        schema: Schema forwarded to ``dataframe.get``.
    """
    loaded = dataframe.get(
        spark_session,
        dataset_name,
        schema=schema,
        dataframe_source=dataframe_source,
    )
    mismatch = dfdiff.diff(loaded, expected)
    assert not mismatch
示例#2
0
def datasets(spark_session, expected_ds, result_ds, project_key=None):
    """Validate that two data sets are the same.

    Both data sets are loaded with ``dataframe.get`` and compared with
    ``dfdiff.diff``. A truthy diff result is treated as a mismatch.

    Args:
        spark_session (SparkSession): spark session used to load data frames.
        expected_ds (str): The expected data set to load.
        result_ds (str): The result data set to load.
        project_key (str): Used if data set in a separate dataiku project.

    Raises:
        AssertionError: If the diff between the two data frames is truthy.
            The diff result is included in the error message.
    """
    expected_df = dataframe.get(spark_session, expected_ds, prefix=project_key)
    result_df = dataframe.get(spark_session, result_ds, prefix=project_key)
    diff_ret = dfdiff.diff(expected_df, result_df)
    # Raise explicitly rather than via an ``assert`` statement: asserts are
    # removed when Python runs with -O, which would silently disable this
    # validation. The exception type seen by callers is unchanged.
    if diff_ret:
        # str() keeps message construction safe if the diff is not a str.
        raise AssertionError("Dataframe diff: " + str(diff_ret))
    print("Test successful")
示例#3
0
def test_equal(spark_session, fixtures, expected):
    """Write and read a data set through a mocked S3 source, then diff it.

    The write side and the read side are both ``MagicMock`` objects, so no
    real storage is touched by the mocks themselves; the data frame handed
    back by the mocked reader is ``fixtures``, which must not differ from
    ``expected``.
    """
    source = S3Source(format='parquet')
    name = "fixtures"
    bucket_dir = "s3://birgittatestbucket/sourcetests"
    # Keyword arguments shared by the write and the read call.
    io_kwargs = dict(prefix=bucket_dir, dataframe_source=source)

    # Stub the writer chain so that the final save() returns None.
    writer_df = MagicMock()
    writer_df.write.format().mode().save.return_value = None
    dataframe.write(writer_df, name, **io_kwargs)

    # Stub the reader chain so that load() hands back the fixtures frame.
    reader_session = MagicMock()
    reader_session.read.format().load.return_value = fixtures
    loaded = dataframe.get(reader_session, name, **io_kwargs)

    assert not dfdiff.diff(loaded, expected)
示例#4
0
def test_error_on_short(fixtures, expected):  # noqa F811
    """Check the diff report when the second frame is limited to 5 rows.

    ``expected`` is cut down with ``limit(5)`` and diffed against
    ``fixtures``; the returned report must match the text below exactly.
    """
    expected_output = """Error: Row count diff
Expected: 10
Actual:   5

Rows are different (max 20 rows shown)
Only in expected:
  letter  number
0      f       6
1      g       7
2      h       8
3      i       9
4      j      10
Only in actual result:
Empty DataFrame
Columns: [letter, number]
Index: []
Expected:
  letter  number
0      a       1
1      b       2
2      c       3
3      d       4
4      e       5
5      f       6
6      g       7
7      h       8
8      i       9
9      j      10
Actual:
  letter  number
0      a       1
1      b       2
2      c       3
3      d       4
4      e       5"""
    truncated = expected.limit(5)
    report = dfdiff.diff(fixtures, truncated)
    assert report == expected_output
示例#5
0
def test_equal(fixtures, expected):  # noqa F811
    """Diffing ``fixtures`` against ``expected`` must give a falsy result."""
    difference = dfdiff.diff(fixtures, expected)
    assert not difference
示例#6
0
def test_error_on_val_diff(fixtures, expected):  # noqa F811
    expected_val_diff = expected.withColumn(
        'number', F.lit(3).cast(LongType()))
    assert dfdiff.diff(fixtures, expected_val_diff) == """Error: Rows are different (max 20 rows shown)
示例#7
0
def test_error_on_col_name_diff(fixtures, expected):  # noqa F811
    expected_col_name_diff = expected.withColumn(
        'numfoo', F.col('number')
    ).drop('number')
    assert dfdiff.diff(fixtures, expected_col_name_diff) == """Error: Cols diff
示例#8
0
def test_error_on_extra_col(fixtures, expected):  # noqa F811
    expected_extra_col = expected.withColumn("foo", F.lit("bar"))
    assert dfdiff.diff(fixtures, expected_extra_col) == """Error: Cols diff
示例#9
0
def test_error_on_single_val_diff(fixtures, expected):  # noqa F811
    num_when = F.when(F.col('number') == 3, 3333).otherwise(F.col('number'))
    expected_single_val_diff = expected.withColumn('number', num_when)
    assert dfdiff.diff(fixtures, expected_single_val_diff) == """Error: Rows are different (max 20 rows shown)