def test_overwrite_static_partition(self, input_df, default_params, partition, full_table_name, spark_session):
    """Load the rows of one partition and check the table's total row count.

    ``partition`` provides the default values for the first two partition
    definitions. Only the rows of ``input_df`` that match those values are
    loaded; afterwards the table is expected to hold as many rows as
    ``input_df``.
    """
    (
        default_params["partition_definitions"][0]["default_value"],
        default_params["partition_definitions"][1]["default_value"],
    ) = partition
    loader = HiveLoader(**default_params)

    # The partition query joins its conditions with ", "; a DataFrame filter
    # expression needs them joined with " and ".
    where_clause = construct_partition_query(loader.partition_definitions).replace(", ", " and ")
    df_to_load = input_df.where(where_clause)

    count_pre_total = spark_session.table(full_table_name).count()
    count_to_load = df_to_load.count()
    count_post_total = input_df.count()

    assert (
        count_post_total == count_pre_total
    ), "Something went wrong in the test setup of the input DataFrame (input_df)"
    # Reuse the count taken above instead of triggering another Spark job.
    assert count_to_load > 0, "DataFrame to load is empty!"

    loader.load(df_to_load)
    spark_session.catalog.refreshTable(full_table_name)

    assert (
        spark_session.table(full_table_name).count() == count_post_total
    ), "test partition was not successfully loaded to output hive table"
def test_add_new_static_partition_with_overwritten_partition_value(
        self, input_df, default_params, partition, full_table_name, spark_session):
    """Load all of ``input_df`` with ``clear_partition`` disabled and check row counts.

    After the load the table is expected to contain twice as many rows as
    ``input_df``, and the partition selected by ``partition`` is expected to
    have grown by exactly ``input_df``'s row count.
    """
    (
        default_params["partition_definitions"][0]["default_value"],
        default_params["partition_definitions"][1]["default_value"],
    ) = partition
    default_params["clear_partition"] = False
    loader = HiveLoader(**default_params)

    # The partition query joins its conditions with ", "; a DataFrame filter
    # expression needs them joined with " and ".
    where_clause = construct_partition_query(loader.partition_definitions).replace(", ", " and ")
    output_table = spark_session.table(full_table_name)

    # Count the input once; every expectation below is derived from it.
    count_input = input_df.count()
    count_pre_partition = output_table.where(where_clause).count()
    count_post_total = count_input * 2

    assert count_input > 0, "Dataframe to load is empty!"
    loader.load(input_df)

    assert (
        output_table.count() == count_post_total
    ), "test partition was not successfully loaded to output hive table"
    assert (
        output_table.where(where_clause).count() == count_input + count_pre_partition
    ), "test partition was not successfully loaded to output hive table"
def test_append_to_static_partition(self, input_df, default_params, partition, full_table_name, spark_session):
    """Load one partition's rows with ``clear_partition`` disabled and check the total grows.

    The table's row count after the load is expected to equal its previous
    row count plus the number of loaded rows.
    """
    (
        default_params["partition_definitions"][0]["default_value"],
        default_params["partition_definitions"][1]["default_value"],
    ) = partition
    default_params["clear_partition"] = False
    loader = HiveLoader(**default_params)

    # The partition query joins its conditions with ", "; a DataFrame filter
    # expression needs them joined with " and ".
    where_clause = construct_partition_query(loader.partition_definitions).replace(", ", " and ")
    # This assignment has to be live code: df_to_load is used below and is
    # not one of the test's parameters.
    df_to_load = input_df.where(where_clause)

    count_pre_total = spark_session.table(full_table_name).count()
    count_to_load = df_to_load.count()
    count_post_total = count_pre_total + count_to_load

    # Reuse the count taken above instead of triggering another Spark job.
    assert count_to_load > 0, "DataFrame to load is empty!"

    loader.load(df_to_load)
    spark_session.catalog.refreshTable(full_table_name)

    assert (
        spark_session.table(full_table_name).count() == count_post_total
    ), "test partition was not successfully loaded to output hive table"
def test_add_new_static_partition(self, input_df, default_params, partition, full_table_name, spark_session):
    """Drop one partition from the table, load it again and compare row counts.

    The rows of ``input_df`` are split into those matching the partition
    query and those matching its negated form. The table must hold only the
    latter after the drop and all rows of ``input_df`` after the load.
    """
    default_params["partition_definitions"][0]["default_value"] = partition
    loader = HiveLoader(**default_params)

    partition_query = construct_partition_query(default_params["partition_definitions"])
    negated_query = partition_query.replace("=", "!=")
    remaining_filter = negated_query.replace(", ", " and ")
    rows_to_load = input_df.where(partition_query)

    rows_outside_partition = input_df.where(remaining_filter).count()
    rows_in_partition = rows_to_load.count()
    rows_in_input = input_df.count()
    assert (
        rows_in_input == rows_outside_partition + rows_in_partition
    ), "Something went wrong in the test setup of the input dataframe (input_df)"

    # Remove the partition so that the following load has to add it anew.
    drop_statement = "alter table {tbl} drop partition ({part_def})".format(
        tbl=full_table_name,
        part_def=partition_query,
    )
    spark_session.sql(drop_statement)
    rows_after_drop = spark_session.table(full_table_name).count()
    assert (
        rows_after_drop == rows_outside_partition
    ), "test partition was not successfully dropped from output hive table"

    assert rows_to_load.count() > 0, "Dataframe to load is empty!"
    loader.load(rows_to_load)
    spark_session.catalog.refreshTable(full_table_name)

    rows_after_load = spark_session.table(full_table_name).count()
    assert (
        rows_after_load == rows_in_input
    ), "test partition was not successfully loaded to output hive table"
def test_default_value_is_missing(self, default_params, input_df):
    """Creating a loader and loading must raise when a definition lacks ``default_value``.

    The ``default_value`` key is removed from the second partition
    definition before the loader is built.
    """
    second_definition = default_params["partition_definitions"][1]
    second_definition.pop("default_value")

    with pytest.raises(AssertionError) as excinfo:
        HiveLoader(**default_params).load(input_df)

    expected_message = "No default partition value set for partition column"
    assert expected_message in str(excinfo.value)
def test_default_value_is_empty(self, default_value, default_params, input_df):
    """Using ``default_value`` as the first partition definition's default must raise.

    Building the loader and loading ``input_df`` is expected to fail with an
    AssertionError that mentions the missing default partition value.
    """
    # NOTE(review): this used to be a two-target tuple assignment whose
    # targets were both definition 0, so a leading ``3`` was always
    # overwritten by ``default_value``. The effective behaviour is kept here;
    # if the empty value was meant for definition 1, change the index.
    default_params["partition_definitions"][0]["default_value"] = default_value

    with pytest.raises(AssertionError) as excinfo:
        loader = HiveLoader(**default_params)
        loader.load(input_df)

    assert "No default partition value set for partition column" in str(excinfo.value)
def test_column_name_is_missing(self, default_params):
    """Constructing a HiveLoader must raise when a partition definition has no column name.

    The first definition's ``column_name`` is set to ``None`` and the
    second one's to ``"f"``.
    """
    definitions = default_params["partition_definitions"]
    definitions[0]["column_name"] = None
    definitions[1]["column_name"] = "f"

    with pytest.raises(AssertionError) as excinfo:
        HiveLoader(**default_params)

    error_text = str(excinfo.value)
    assert "No column name set!" in error_text
def test_input_is_not_a_list(self, partition_definitions, default_params):
    """Constructing a HiveLoader must raise when ``partition_definitions`` is not a list."""
    default_params.update({"partition_definitions": partition_definitions})

    with pytest.raises(AssertionError) as excinfo:
        HiveLoader(**default_params)

    expected_message = "partition_definitions has to be a list containing dicts"
    assert expected_message in str(excinfo.value)
def test_clear_partition(self, spark_session, input_df, partition, default_params, full_table_name):
    """Partition is dropped.

    After ``_clear_hive_partition`` the table must contain exactly the rows
    of ``input_df`` that do not belong to the partition given by
    ``partition``.
    """
    (
        default_params["partition_definitions"][0]["default_value"],
        default_params["partition_definitions"][1]["default_value"],
    ) = partition
    loader = HiveLoader(**default_params)

    # The partition query joins its conditions with ", "; a DataFrame filter
    # expression needs them joined with " and ".
    partition_filter = construct_partition_query(loader.partition_definitions).replace(", ", " and ")
    # Negate the conjunction as a whole. Rewriting every "=" to "!=" would
    # only keep rows that differ in *all* partition columns, which is not the
    # complement of the partition (presumably only the exact value
    # combination is cleared - confirm against HiveLoader).
    remaining_filter = "not ({})".format(partition_filter)
    expected_count = input_df.where(remaining_filter).count()

    loader._clear_hive_partition()

    actual_count = spark_session.table(full_table_name).count()
    assert actual_count == expected_count
def test_list_input_contains_non_dict_items(self, partition_definitions, default_params):
    """Constructing a HiveLoader must raise when the definitions list holds non-dict items.

    ``partition_definitions`` is wrapped in a single-element list before it
    is passed to the loader.
    """
    wrapped_definitions = [partition_definitions]
    default_params["partition_definitions"] = wrapped_definitions

    with pytest.raises(AssertionError) as excinfo:
        HiveLoader(**default_params)

    expected_message = "Items of partition_definitions have to be dictionaries"
    assert expected_message in str(excinfo.value)
def test_column_type_not_a_valid_spark_sql_type(self, data_type, default_params):
    """Constructing a HiveLoader must raise when a partition column type is not accepted.

    ``data_type`` replaces the ``column_type`` of the first partition
    definition.
    """
    first_definition = default_params["partition_definitions"][0]
    first_definition["column_type"] = data_type

    with pytest.raises(AssertionError) as excinfo:
        HiveLoader(**default_params)

    expected_message = "Not a valid (PySpark) datatype for the partition column"
    assert expected_message in str(excinfo.value)
def test_create_partitioned_table(self, input_df, default_params, partition, full_table_name, spark_session):
    """With ``auto_create_table`` enabled, loading into a missing table must create it.

    The table is dropped first. After the load it must be listed in the
    current database, contain exactly the loaded rows and report at least
    one partition.
    """
    (
        default_params["partition_definitions"][0]["default_value"],
        default_params["partition_definitions"][1]["default_value"],
    ) = partition
    default_params["auto_create_table"] = True
    loader = HiveLoader(**default_params)

    # Start from a database that does not contain the output table.
    spark_session.sql("drop table if exists " + full_table_name)
    spark_session.catalog.setCurrentDatabase(default_params["db_name"])
    assert default_params["table_name"] not in [
        tbl.name for tbl in spark_session.catalog.listTables()
    ], "Test setup of database is not clean. Table already exists!"

    # The partition query joins its conditions with ", "; a DataFrame filter
    # expression needs them joined with " and ".
    where_clause = construct_partition_query(loader.partition_definitions).replace(", ", " and ")
    df_to_load = input_df.where(where_clause)
    count_to_load = df_to_load.count()
    # Reuse the count taken above instead of triggering another Spark job.
    assert count_to_load > 0, "DataFrame to load is empty!"

    loader.load(df_to_load)
    spark_session.catalog.refreshTable(full_table_name)

    assert default_params["table_name"] in [
        tbl.name for tbl in spark_session.catalog.listTables()
    ], "Table was not created!"
    assert (
        spark_session.table(full_table_name).count() == count_to_load
    ), "test partition was not successfully loaded to automatically created output hive table"

    # Only the Spark call is guarded: a Py4JJavaError raised by it is
    # reported as the table not being partitioned. The assertion itself is
    # kept outside the try block because the handler would not catch it.
    try:
        partition_count = spark_session.sql("show partitions " + full_table_name).count()
    except Py4JJavaError as e:
        raise AssertionError("Created table is not partitioned. " + str(e))
    assert partition_count > 0
def default_loader(default_params):
    """Build a HiveLoader from ``default_params`` passed as keyword arguments."""
    loader = HiveLoader(**default_params)
    return loader