def validate_dataset(ctx, raw_df, cols_to_validate):
    # Total number of rows in the raw dataset, as recorded in its metadata
    total_row_count = ctx.get_metadata(ctx.raw_dataset["key"])["dataset_size"]
    conditions_dict = spark_util.value_check_data(ctx, raw_df, cols_to_validate)

    if len(conditions_dict) > 0:
        # Log every violated condition along with how many rows failed it, then abort
        for column, cond_count_list in conditions_dict.items():
            for condition, fail_count in cond_count_list:
                logger.error(
                    "Data validation {} has been violated in {}/{} samples".format(
                        condition, fail_count, total_row_count
                    )
                )
        raise UserException("raw column validations failed")
# Shared by the tests below; they also rely on the spark, ctx_obj, and get_context pytest fixtures.
from pyspark.sql.types import StructType, StructField, StringType, FloatType, LongType


def test_value_check_data_invalid_out_of_range(spark, ctx_obj, get_context):
    data = [("a", 2.3, None), ("b", 1.0, None), ("c", 1.1, 4)]

    schema = StructType([
        StructField("a_str", StringType()),
        StructField("b_float", FloatType()),
        StructField("c_long", LongType()),
    ])

    df = spark.createDataFrame(data, schema)

    ctx_obj["raw_features"] = {
        "a_str": {"name": "a_str", "type": "STRING_FEATURE", "required": True, "id": 1},
        "b_float": {"name": "b_float", "type": "FLOAT_FEATURE", "required": True, "id": 2},
        "c_long": {
            "name": "c_long",
            "type": "INT_FEATURE",
            "required": False,
            "max": 1,
            "min": 0,
            "id": 3,
        },
    }

    ctx = get_context(ctx_obj)

    validations = spark_util.value_check_data(ctx, df)
    assert validations == {"c_long": [("(c_long <= 1)", 1)]}
def test_value_check_data_valid(spark, ctx_obj, get_context):
    data = [("a", 0.1, None), ("b", 1.0, None), (None, 1.1, 0)]

    schema = StructType([
        StructField("a_str", StringType()),
        StructField("b_float", FloatType()),
        StructField("c_long", LongType()),
    ])

    df = spark.createDataFrame(data, schema)

    ctx_obj["raw_features"] = {
        "a_str": {
            "name": "a_str",
            "type": "STRING_FEATURE",
            "required": False,
            "values": ["a", "b"],
            "id": 1,
        },
        "b_float": {"name": "b_float", "type": "FLOAT_FEATURE", "required": True, "id": 2},
        "c_long": {
            "name": "c_long",
            "type": "INT_FEATURE",
            "required": False,
            "max": 1,
            "min": 0,
            "id": 3,
        },
    }

    ctx = get_context(ctx_obj)

    assert len(spark_util.value_check_data(ctx, df)) == 0
def test_value_check_data_invalid_null_value(spark, ctx_obj, get_context):
    data = [("a", None, None), ("b", 1.0, None), ("c", 1.1, 1)]

    schema = StructType([
        StructField("a_str", StringType()),
        StructField("b_float", FloatType()),
        StructField("c_long", LongType()),
    ])

    df = spark.createDataFrame(data, schema)

    ctx_obj["raw_columns"] = {
        "a_str": {"name": "a_str", "type": "STRING_COLUMN", "required": True, "id": 1},
        "b_float": {"name": "b_float", "type": "FLOAT_COLUMN", "required": True, "id": 2},
        "c_long": {"name": "c_long", "type": "INT_COLUMN", "max": 1, "min": 0, "id": 3},
    }

    ctx = get_context(ctx_obj)

    validations = spark_util.value_check_data(ctx, df)
    assert validations == {"b_float": [("(b_float IS NOT NULL)", 1)]}
def test_ingest_parquet_failed_requirements(
    capsys, spark, write_parquet_file, ctx_obj, get_context
):
    data = [(None, 0.1, None), ("b", 1.0, None), ("c", 1.1, 4)]

    schema = StructType([
        StructField("a_str", StringType()),
        StructField("b_float", FloatType()),
        StructField("c_long", LongType()),
    ])

    path_to_file = write_parquet_file(spark, data, schema)

    ctx_obj["environment"] = {
        "data": {
            "type": "parquet",
            "path": path_to_file,
            "schema": [
                {"column_name": "a_str", "feature_name": "a_str"},
                {"column_name": "b_float", "feature_name": "b_float"},
                {"column_name": "c_long", "feature_name": "c_long"},
            ],
        }
    }

    ctx_obj["raw_features"] = {
        "a_str": {"name": "a_str", "type": "STRING_FEATURE", "values": ["a", "b"], "id": "1"},
        "b_float": {"name": "b_float", "type": "FLOAT_FEATURE", "required": True, "id": "2"},
        "c_long": {"name": "c_long", "type": "INT_FEATURE", "required": False, "id": "3"},
    }

    ctx = get_context(ctx_obj)

    df = spark_util.ingest(ctx, spark)
    validations = spark_util.value_check_data(ctx, df)
    assert validations == {"a_str": [("(a_str IN (a, b))", 1)]}
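# Taken together, the tests above pin down the return shape that validate_dataset above and
# ingest_raw_dataset below rely on: spark_util.value_check_data returns a dict mapping a
# column name to a list of (condition_string, failing_row_count) tuples, and an empty dict
# means every configured check passed. The helper below is a hypothetical sketch (it is not
# part of spark_util) showing how that result maps onto the error messages logged in this
# section; the example values are copied from the assertions above.

def summarize_validations(conditions_dict, total_row_count):
    """Hypothetical helper: flatten value_check_data output into log messages."""
    messages = []
    for column, cond_count_list in conditions_dict.items():
        for condition, fail_count in cond_count_list:
            messages.append(
                "Data validation {} has been violated in {}/{} samples".format(
                    condition, fail_count, total_row_count
                )
            )
    return messages


# Using the result asserted in test_value_check_data_invalid_out_of_range (3 input rows):
assert summarize_validations({"c_long": [("(c_long <= 1)", 1)]}, 3) == [
    "Data validation (c_long <= 1) has been violated in 1/3 samples"
]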
def ingest_raw_dataset(spark, ctx, features_to_validate, should_ingest):
    # When ingesting, validate every raw feature; otherwise only the requested ones
    if should_ingest:
        features_to_validate = list(ctx.rf_id_map.keys())

    if len(features_to_validate) > 0:
        feature_resources_to_validate = [ctx.rf_id_map[f] for f in features_to_validate]
        ctx.upload_resource_status_start(*feature_resources_to_validate)
        try:
            if should_ingest:
                logger.info("Ingesting")
                logger.info(
                    "Ingesting {} data from {}".format(
                        ctx.app["name"], ctx.environment["data"]["path"]
                    )
                )
                ingest_df = spark_util.ingest(ctx, spark)

                full_dataset_counter = ingest_df.count()
                # Optionally drop rows containing null values before caching
                if ctx.environment["data"].get("drop_null"):
                    ingest_df = ingest_df.dropna()
                    logger.info("Dropping any rows that contain null values")
                write_dataset_counter = ingest_df.count()

                logger.info(
                    "Caching {} data (version: {})".format(ctx.app["name"], ctx.dataset_version)
                )
                spark_util.write_raw_dataset(ingest_df, ctx)

                if ctx.environment["data"].get("drop_null"):
                    logger.info(
                        "{} rows read, {} rows dropped, {} rows ingested".format(
                            full_dataset_counter,
                            full_dataset_counter - write_dataset_counter,
                            write_dataset_counter,
                        )
                    )
                else:
                    logger.info("{} rows ingested".format(full_dataset_counter))

            logger.info(
                "Reading {} data (version: {})".format(ctx.app["name"], ctx.dataset_version)
            )
            raw_df = spark_util.read_raw_dataset(ctx, spark)
            total_row_count = raw_df.count()

            # Re-validate the cached dataset and fail loudly on any violated condition
            conditions_dict = spark_util.value_check_data(ctx, raw_df, features_to_validate)
            if len(conditions_dict) > 0:
                for column, cond_count_list in conditions_dict.items():
                    for condition, fail_count in cond_count_list:
                        logger.error(
                            "Data validation {} has been violated in {}/{} samples".format(
                                condition, fail_count, total_row_count
                            )
                        )
                raise UserException("raw feature validations failed")
        except:
            # Mark the feature resources as failed, then re-raise the original error
            ctx.upload_resource_status_failed(*feature_resources_to_validate)
            raise
        ctx.upload_resource_status_success(*feature_resources_to_validate)

        logger.info("First {} samples:".format(3))
        show_df(raw_df, ctx, 3)
    else:
        # Nothing to validate: just read the cached raw dataset
        logger.info(
            "Reading {} data (version: {})".format(ctx.app["name"], ctx.dataset_version)
        )
        raw_df = spark_util.read_raw_dataset(ctx, spark)
        spark_util.value_check_data(ctx, raw_df, features_to_validate)

    return raw_df
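# ingest_raw_dataset only drops null-containing rows when the "data" section of the
# environment sets drop_null. A rough, illustrative configuration: the "type", "path",
# and "schema" keys mirror the parquet fixture in test_ingest_parquet_failed_requirements,
# while the path value here is made up.
example_environment = {
    "data": {
        "type": "parquet",
        "path": "s3://my-bucket/raw/data.parquet",  # illustrative path, not from the source
        "drop_null": True,  # triggers ingest_df.dropna() before the dataset is cached
        "schema": [
            {"column_name": "a_str", "feature_name": "a_str"},
            {"column_name": "b_float", "feature_name": "b_float"},
            {"column_name": "c_long", "feature_name": "c_long"},
        ],
    }
}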