Example #1

def test_assert_equal_when_data_types_do_not_match(
        spark_session: SparkSession):
    """
    Test the fairly subtle case where one DF contains an INT and the other
    contains a BIGINT, which can be an issue if we try to write a DF containing
    a BIGINT into a previously existing Hive table defined to contain an INT.
    """
    actual_df = show_output_to_df(
        """
    +------+
    |col_a |
    [bigint]
    +------+
    |1     |
    +------+
    """, spark_session)

    expected_df = show_output_to_df(
        """
    +------+
    |col_a |
    [int   ]
    +------+
    |1     |
    +------+
    """, spark_session)

    with raises(AssertionError) as exception_info:
        assert_equal(expected_df, actual_df)
    assert 'The DataFrame schemas differ.' == str(exception_info.value)
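
For reference, every snippet on this page assumes pytest, PySpark, and the library under test are importable. A minimal import block might look like the sketch below; the final import path is a placeholder, since the listing never shows which module actually provides show_output_to_df and assert_equal.

from datetime import datetime

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import StringType
from pytest import fixture, raises

# Placeholder module path -- adjust to wherever show_output_to_df and
# assert_equal live in the project these examples were taken from.
from spark_test import assert_equal, show_output_to_df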

Example #2

def test_show_output_to_df_when_data_does_not_match_default_data_type(
        spark_session: SparkSession):
    with raises(ValueError) as exception_info:
        show_output_to_df("""
        +----------+
        |int_column|
        +----------+
        |          | # Empty string is not a valid INT
        +----------+
        """,
                          spark_session,
                          default_data_type='int').collect()
    assert ("invalid literal for int() with base 10: ''" == str(
        exception_info.value))

Example #3

def test_show_output_to_df_with_multiple_data_type_declarations(
        spark_session: SparkSession):
    with raises(ValueError) as exception_info:
        show_output_to_df(
            """
        +-------------+
        |string_column|
        [string       ]
        [string       ]
        +-------------+
        |one          |
        +-------------+
        """, spark_session)
    assert ('Cannot have more than one data type declaration line.' == str(
        exception_info.value))

Example #4

def test_show_output_to_df_with_1d_arrays(spark_session: SparkSession):
    df = show_output_to_df(
        """
    +----+-------------+-------------+---------------+----------------+
    |id  |array_str_col|array_int_col|array_float_col|array_double_col|
    [int |array<string>|array<int>   |array<float>   |array<double>   ]
    +----+-------------+-------------+---------------+----------------+
    |1   |[ a ,b, c ]  |[0,  1 ,2 ]  |[0, 1, 2.0]    |[0, 1, 2.0]     |
    |2   |[  ]         |[     ]      |[]             |[]              |
    +----+-------------+-------------+---------------+----------------+
    """, spark_session)
    rows = df.collect()
    assert 2 == len(rows)
    assert 5 == len(rows[0])  # Number of columns

    row = rows[0]
    assert 1 == row['id']
    assert ['a', 'b', 'c'] == row['array_str_col']
    assert [0, 1, 2] == row['array_int_col']
    assert [0.0, 1.0, 2.0] == row['array_float_col']
    assert [0.0, 1.0, 2.0] == row['array_double_col']

    row = rows[1]
    assert 2 == row['id']
    assert [] == row['array_str_col']
    assert [] == row['array_int_col']
    assert [] == row['array_float_col']
    assert [] == row['array_double_col']

Example #5

def test_show_output_to_df_when_data_types_specified(
        spark_session: SparkSession):
    rows = show_output_to_df(
        """
    +-------------+----------+-------------+------------+-------------------+-------------------+-----------+
    |string_column|int_column|bigint_column|float_column|timestamp_column   |default_type_column|bool_column|
    [string       |int       |bigint       |float       |timestamp          |                   |boolean    ]
    +-------------+----------+-------------+------------+-------------------+-------------------+-----------+
    |one          |1         |1            |1.1         |2018-01-01 00:00:00|11                 |true       |
    |two          |2         |2            |2.2         |2018-01-02 12:34:56|22                 |false      |
    +-------------+----------+-------------+------------+-------------------+-------------------+-----------+
    """, spark_session).collect()
    assert 2 == len(rows)
    assert 7 == len(rows[0])  # Number of columns

    row = rows[0]
    assert 'one' == row['string_column']
    assert 1 == row['int_column']
    assert 1 == row['bigint_column']
    assert 1.1 == row['float_column']
    assert datetime(2018, 1, 1) == row['timestamp_column']
    assert '11' == row['default_type_column']
    assert True is row['bool_column']

    row = rows[1]
    assert 'two' == row['string_column']
    assert 2 == row['int_column']
    assert 2 == row['bigint_column']
    assert 2.2 == row['float_column']
    assert datetime(2018, 1, 2, 12, 34, 56) == row['timestamp_column']
    assert '22' == row['default_type_column']
    assert False is row['bool_column']

Example #6

def test_show_output_to_df_when_data_types_not_specified(
        spark_session: SparkSession):
    rows = show_output_to_df(
        """
    +-------------+----------+------------+-------------------+
    |string_column|int_column|float_column|timestamp_column   |
    +-------------+----------+------------+-------------------+
    # This is a comment that gets ignored.
    |one          |1         |1.1         |2018-01-01 00:00:00| # Another comment to ignore
    |two          |2         |2.2         |2018-01-02 12:34:56|
    +-------------+----------+------------+-------------------+
    """, spark_session).collect()
    assert 2 == len(rows)
    assert 4 == len(rows[0])  # Number of columns

    row = rows[0]
    assert 'one' == row['string_column']
    assert '1' == row['int_column']
    assert '1.1' == row['float_column']
    assert '2018-01-01 00:00:00' == row['timestamp_column']

    row = rows[1]
    assert 'two' == row['string_column']
    assert '2' == row['int_column']
    assert '2.2' == row['float_column']
    assert '2018-01-02 12:34:56' == row['timestamp_column']
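
Without a bracketed type declaration line, every cell comes back as a plain string ('1', '1.1', '2018-01-01 00:00:00'), whatever the column names suggest.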

Example #7

# Presumably a pytest fixture (any decorator was dropped from this excerpt);
# the assert_equal tests below receive it as their `expected_df` argument.
def expected_df(spark_session: SparkSession) -> DataFrame:
    return show_output_to_df(
        """
    +-----+-----+
    |col_a|col_b|
    +-----+-----+
    |1a   |1b   |
    |2a   |2b   |
    +-----+-----+
    """, spark_session)

Example #8

def test_show_output_to_df_with_no_data_rows_and_data_types_not_specified(
        spark_session: SparkSession):
    df = show_output_to_df(
        """
    +-------------+
    |string_column|
    +-------------+
    +-------------+
    """, spark_session)
    assert DataFrame == type(df)
    assert 1 == len(df.columns)
    assert 0 == df.count()
    assert isinstance(df.schema.fields[0].dataType, StringType)

Example #9

def test_assert_equal_when_actual_df_has_too_few_rows(
        expected_df, spark_session: SparkSession):
    actual_df = show_output_to_df(
        """
    +-----+-----+
    |col_a|col_b|
    +-----+-----+
    |1a   |1b   |
    +-----+-----+
    """, spark_session)

    with raises(AssertionError) as exception_info:
        assert_equal(expected_df, actual_df, verbose=False)
    assert 'The DataFrames differ.' == str(exception_info.value)

Example #10

def test_assert_equal_when_dfs_are_equal(expected_df,
                                         spark_session: SparkSession):
    actual_df = show_output_to_df(
        """
    +-----+-----+
    |col_a|col_b|
    +-----+-----+
    |1a   |1b   |
    |2a   |2b   |
    +-----+-----+
    """, spark_session)

    # No error or assertion failure should be thrown:
    assert_equal(expected_df, actual_df)

Example #11

def test_show_output_to_df_when_default_data_type_specified(
        spark_session: SparkSession):
    rows = show_output_to_df("""
    +----------+
    |int_column|
    +----------+
    |1         |
    +----------+
    """,
                             spark_session,
                             default_data_type='int').collect()
    assert 1 == len(rows)
    assert 1 == len(rows[0])  # Number of columns
    assert 1 == rows[0]['int_column']

Example #12

def test_assert_equal_when_dfs_are_equal_and_column_is_null(
        spark_session: SparkSession):
    actual_df = show_output_to_df(
        """
    +------+
    |col_a |
    [string]
    +------+
    |null  |
    +------+
    """, spark_session)

    expected_df = show_output_to_df(
        """
    +------+
    |col_a |
    [string]
    +------+
    |null  |
    +------+
    """, spark_session)

    # No error or assertion failure should be thrown:
    assert_equal(expected_df, actual_df)

Example #13

def test_assert_equal_when_column_order_is_different(
        expected_df, spark_session: SparkSession):
    actual_df = show_output_to_df(
        """
    +-----+-----+
    |col_b|col_a|
    +-----+-----+
    |1b   |1a   |
    |2b   |2a   |
    +-----+-----+
    """, spark_session)

    with raises(AssertionError) as exception_info:
        assert_equal(expected_df, actual_df)
    assert 'The DataFrame schemas differ.' == str(exception_info.value)

Example #14

def test_assert_equal_when_expected_df_is_none(expected_df,
                                               spark_session: SparkSession):
    actual_df = show_output_to_df(
        """
    +-----+
    |col_a|
    +-----+
    |1a   |
    +-----+
    """, spark_session)

    with raises(AssertionError) as exception_info:
        assert_equal(None, actual_df)
    assert 'The expected DataFrame is None, but the actual DataFrame is not.' \
           == str(exception_info.value)

Example #15

def test_show_output_to_df_when_values_are_null_and_data_types_not_specified(
        spark_session: SparkSession):
    rows = show_output_to_df(
        """
    +-------------+
    |string_column|
    +-------------+
    |null         |
    +-------------+
    """, spark_session).collect()
    assert 1 == len(rows)
    assert 1 == len(rows[0])  # Number of columns

    row = rows[0]
    assert None is row['string_column']

Example #16

def test_show_output_to_df_when_default_data_type_overridden(
        spark_session: SparkSession):
    rows = show_output_to_df("""
    +-------------+----------+
    |string_column|int_column|
    [string       |          ]
    +-------------+----------+
    |1            |1         |
    +-------------+----------+
    """,
                             spark_session,
                             default_data_type='int').collect()
    assert 1 == len(rows)
    assert 2 == len(rows[0])  # Number of columns
    assert '1' == rows[0]['string_column']
    assert 1 == rows[0]['int_column']
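
An explicit entry in the bracketed declaration line wins over default_data_type: string_column is declared string and stays a string, while int_column, left blank in the declaration, falls back to the int default.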

Example #17

def test_assert_equal_when_actual_df_has_duplicate_last_row(
        expected_df, spark_session: SparkSession):
    actual_df = show_output_to_df(
        """
    +-----+-----+
    |col_a|col_b|
    +-----+-----+
    |1a   |1b   |
    |2a   |2b   |
    |2a   |2b   |
    +-----+-----+
    """, spark_session)

    with raises(AssertionError) as exception_info:
        assert_equal(expected_df, actual_df)
    assert 'The DataFrames differ.' == str(exception_info.value)

Example #18

def test_show_output_to_df_when_values_are_empty_and_data_types_are_specified(
        spark_session: SparkSession):
    rows = show_output_to_df(
        """
    +-------------+-------------------+
    |string_column|default_type_column|
    [string       |                   ]
    +-------------+-------------------+
    |             |                   |
    +-------------+-------------------+
    """, spark_session).collect()
    assert 1 == len(rows)
    assert 2 == len(rows[0])  # Number of columns

    row = rows[0]
    assert '' == row['string_column']
    assert '' == row['default_type_column']

Example #19

def test_show_output_to_df_when_values_are_null_and_data_types_are_specified(
        spark_session: SparkSession):
    rows = show_output_to_df(
        """
    +-------------+----------+-------------+------------+----------------+-------------------+-----------+
    |string_column|int_column|bigint_column|float_column|timestamp_column|default_type_column|bool_column|
    [string       |int       |bigint       |float       |timestamp       |                   |boolean    ]
    +-------------+----------+-------------+------------+----------------+-------------------+-----------+
    |null         |null      |null         |null        |null            |null               |null       |
    +-------------+----------+-------------+------------+----------------+-------------------+-----------+
    """, spark_session).collect()
    assert 1 == len(rows)
    assert 7 == len(rows[0])  # Number of columns

    row = rows[0]
    assert None is row['string_column']
    assert None is row['int_column']
    assert None is row['bigint_column']
    assert None is row['float_column']
    assert None is row['timestamp_column']
    assert None is row['default_type_column']
    assert None is row['bool_column']
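
Together with the two preceding examples, this pins down the parser's convention: the literal text null becomes a SQL NULL (Python None) whatever the declared type, while a merely blank cell in a string-typed or default-typed column is kept as an empty string, and a blank cell in an int column is rejected outright (see Example #2).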

Example #20

def test_show_output_to_df_when_row_delimiters_not_present(
        spark_session: SparkSession):
    rows = show_output_to_df(
        """
    |string_column|int_column|float_column|timestamp_column   |
    |one          |1         |1.1         |2018-01-01 00:00:00|
    |two          |2         |2.2         |2018-01-02 12:34:56|
    """, spark_session).collect()
    assert 2 == len(rows)
    assert 4 == len(rows[0])  # Number of columns

    row = rows[0]
    assert 'one' == row['string_column']
    assert '1' == row['int_column']
    assert '1.1' == row['float_column']
    assert '2018-01-01 00:00:00' == row['timestamp_column']

    row = rows[1]
    assert 'two' == row['string_column']
    assert '2' == row['int_column']
    assert '2.2' == row['float_column']
    assert '2018-01-02 12:34:56' == row['timestamp_column']

Example #21

def test_show_output_to_df_with_wrapped_arrays(spark_session: SparkSession):
    df = show_output_to_df(
        """
    +---+--------------------------------------------------------+-----------------------------------------------------+-----------------------------------------------------------+-----------------------------------------------------------+
    |id |array_array_str_col                                     |array_array_int_col                                  |array_array_float_col                                      |array_array_double_col                                     |
    [int|array<array<string>>                                    |array<array<int>>                                    |array<array<float>>                                        |array<array<double>>                                       ]
    +---+--------------------------------------------------------+-----------------------------------------------------+-----------------------------------------------------------+-----------------------------------------------------------+
    |1  |[WrappedArray(a, b, c), WrappedArray(d), WrappedArray()]|[WrappedArray(1, 2), WrappedArray(1), WrappedArray()]|[WrappedArray(1.0, 2.0), WrappedArray(1.0), WrappedArray()]|[WrappedArray(1.0, 2.0), WrappedArray(1.0), WrappedArray()]|
    |2  |[WrappedArray()]                                        |[WrappedArray()]                                     |[WrappedArray()]                                           |[WrappedArray()]                                           |
    |3  |[WrappedArray()]                                        |[WrappedArray(), WrappedArray()]                     |[WrappedArray(), WrappedArray(), WrappedArray()]           |[WrappedArray()]                                           |
    +---+--------------------------------------------------------+-----------------------------------------------------+-----------------------------------------------------------+-----------------------------------------------------------+
    """, spark_session)
    rows = df.collect()
    assert 3 == len(rows)
    assert 5 == len(rows[0])  # Number of columns

    row = rows[0]
    assert 1 == row['id']
    assert [['a', 'b', 'c'], ['d'], []] == row['array_array_str_col']
    assert [[1, 2], [1], []] == row['array_array_int_col']
    assert [[1.0, 2.0], [1.0], []] == row['array_array_float_col']
    assert [[1.0, 2.0], [1.0], []] == row['array_array_double_col']

    row = rows[1]
    assert 2 == row['id']
    assert [[]] == row['array_array_str_col']
    assert [[]] == row['array_array_int_col']
    assert [[]] == row['array_array_float_col']
    assert [[]] == row['array_array_double_col']

    row = rows[2]
    assert 3 == row['id']
    assert [[]] == row['array_array_str_col']
    assert [[], []] == row['array_array_int_col']
    assert [[], [], []] == row['array_array_float_col']
    assert [[]] == row['array_array_double_col']
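
WrappedArray(...) is how Spark's df.show() has historically rendered array columns (it is the Scala sequence type backing them), so the parser also has to turn that spelling back into nested Python lists, empty WrappedArray() included.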