Example #1
def validate_dataset(ctx, raw_df, cols_to_validate):
    total_row_count = ctx.get_metadata(ctx.raw_dataset["key"])["dataset_size"]
    conditions_dict = spark_util.value_check_data(ctx, raw_df,
                                                  cols_to_validate)

    if len(conditions_dict) > 0:
        for column, cond_count_list in conditions_dict.items():
            for condition, fail_count in cond_count_list:
                logger.error(
                    "Data validation {} has been violated in {}/{} samples".
                    format(condition, fail_count, total_row_count))
        raise UserException("raw column validations failed")
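For reference, spark_util.value_check_data returns a dict that maps each failing column name to a list of (condition string, failing-row count) pairs, as the assertions in the test examples below show. A minimal plain-Python sketch of the error loop above, using a sample dict copied from those assertions:

# Sample return value, copied from the assertions in Examples #2 and #4 below.
conditions_dict = {
    "c_long": [("(c_long <= 1)", 1)],
    "b_float": [("(b_float IS NOT NULL)", 1)],
}
total_row_count = 3  # assumed row count, matching the three-row test DataFrames

for column, cond_count_list in conditions_dict.items():
    for condition, fail_count in cond_count_list:
        print("Data validation {} has been violated in {}/{} samples".format(
            condition, fail_count, total_row_count))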
Example #2
def test_value_check_data_invalid_out_of_range(spark, ctx_obj, get_context):
    data = [("a", 2.3, None), ("b", 1.0, None), ("c", 1.1, 4)]

    schema = StructType([
        StructField("a_str", StringType()),
        StructField("b_float", FloatType()),
        StructField("c_long", LongType()),
    ])

    df = spark.createDataFrame(data, schema)

    ctx_obj["raw_features"] = {
        "a_str": {
            "name": "a_str",
            "type": "STRING_FEATURE",
            "required": True,
            "id": 1
        },
        "b_float": {
            "name": "b_float",
            "type": "FLOAT_FEATURE",
            "required": True,
            "id": 2
        },
        "c_long": {
            "name": "c_long",
            "type": "INT_FEATURE",
            "required": False,
            "max": 1,
            "min": 0,
            "id": 3,
        },
    }

    ctx = get_context(ctx_obj)

    validations = spark_util.value_check_data(ctx, df)
    assert validations == {"c_long": [("(c_long <= 1)", 1)]}
Example #3
def test_value_check_data_valid(spark, ctx_obj, get_context):
    data = [("a", 0.1, None), ("b", 1.0, None), (None, 1.1, 0)]

    schema = StructType([
        StructField("a_str", StringType()),
        StructField("b_float", FloatType()),
        StructField("c_long", LongType()),
    ])

    df = spark.createDataFrame(data, schema)

    ctx_obj["raw_features"] = {
        "a_str": {
            "name": "a_str",
            "type": "STRING_FEATURE",
            "required": False,
            "values": ["a", "b"],
            "id": 1,
        },
        "b_float": {
            "name": "b_float",
            "type": "FLOAT_FEATURE",
            "required": True,
            "id": 2
        },
        "c_long": {
            "name": "c_long",
            "type": "INT_FEATURE",
            "required": False,
            "max": 1,
            "min": 0,
            "id": 3,
        },
    }

    ctx = get_context(ctx_obj)

    assert len(spark_util.value_check_data(ctx, df)) == 0
Example #4
def test_value_check_data_invalid_null_value(spark, ctx_obj, get_context):
    data = [("a", None, None), ("b", 1.0, None), ("c", 1.1, 1)]

    schema = StructType([
        StructField("a_str", StringType()),
        StructField("b_float", FloatType()),
        StructField("c_long", LongType()),
    ])

    df = spark.createDataFrame(data, schema)

    ctx_obj["raw_columns"] = {
        "a_str": {
            "name": "a_str",
            "type": "STRING_COLUMN",
            "required": True,
            "id": 1
        },
        "b_float": {
            "name": "b_float",
            "type": "FLOAT_COLUMN",
            "required": True,
            "id": 2
        },
        "c_long": {
            "name": "c_long",
            "type": "INT_COLUMN",
            "max": 1,
            "min": 0,
            "id": 3
        },
    }

    ctx = get_context(ctx_obj)
    validations = spark_util.value_check_data(ctx, df)
    assert validations == {"b_float": [("(b_float IS NOT NULL)", 1)]}
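The condition strings asserted in Examples #2 and #4 correspond to a range check and a required (non-null) check. A minimal sketch, assuming only standard PySpark DataFrame filters, of how such failure counts could be computed against a df like the ones built in these tests; this is an illustration, not spark_util.value_check_data's actual implementation:

from pyspark.sql.functions import col

def count_range_failures(df, column, max_value):
    # Rows whose value exceeds max_value, mirroring a "(column <= max)" condition.
    # Nulls don't satisfy the negated predicate, so they are not counted here.
    return df.filter(~(col(column) <= max_value)).count()

def count_null_failures(df, column):
    # Rows where a required column is null, mirroring a "(column IS NOT NULL)" condition.
    return df.filter(col(column).isNull()).count()

# With the df from the test above, count_null_failures(df, "b_float") == 1,
# matching the asserted {"b_float": [("(b_float IS NOT NULL)", 1)]}.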
Example #5
def test_ingest_parquet_failed_requirements(capsys, spark, write_parquet_file,
                                            ctx_obj, get_context):
    data = [(None, 0.1, None), ("b", 1.0, None), ("c", 1.1, 4)]

    schema = StructType([
        StructField("a_str", StringType()),
        StructField("b_float", FloatType()),
        StructField("c_long", LongType()),
    ])

    path_to_file = write_parquet_file(spark, data, schema)

    ctx_obj["environment"] = {
        "data": {
            "type":
            "parquet",
            "path":
            path_to_file,
            "schema": [
                {
                    "column_name": "a_str",
                    "feature_name": "a_str"
                },
                {
                    "column_name": "b_float",
                    "feature_name": "b_float"
                },
                {
                    "column_name": "c_long",
                    "feature_name": "c_long"
                },
            ],
        }
    }

    ctx_obj["raw_features"] = {
        "a_str": {
            "name": "a_str",
            "type": "STRING_FEATURE",
            "values": ["a", "b"],
            "id": "1"
        },
        "b_float": {
            "name": "b_float",
            "type": "FLOAT_FEATURE",
            "required": True,
            "id": "2"
        },
        "c_long": {
            "name": "c_long",
            "type": "INT_FEATURE",
            "required": False,
            "id": "3"
        },
    }

    ctx = get_context(ctx_obj)
    df = spark_util.ingest(ctx, spark)

    validations = spark_util.value_check_data(ctx, df)
    assert validations == {"a_str": [("(a_str IN (a, b))", 1)]}
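write_parquet_file is a fixture that receives spark, the rows, and a schema and returns a path that the parquet ingest config then points at. A plausible sketch of such a fixture, assuming it simply writes a temporary Parquet file with standard PySpark and pytest APIs (the project's real fixture may differ):

import pytest

@pytest.fixture
def write_parquet_file(tmp_path):
    # Hypothetical implementation: write the rows to a Parquet file under pytest's
    # tmp_path directory and return its location.
    def _write(spark, data, schema):
        path_to_file = str(tmp_path / "data.parquet")
        spark.createDataFrame(data, schema).write.mode("overwrite").parquet(path_to_file)
        return path_to_file

    return _write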
Example #6
def ingest_raw_dataset(spark, ctx, features_to_validate, should_ingest):
    if should_ingest:
        features_to_validate = list(ctx.rf_id_map.keys())

    if len(features_to_validate) > 0:
        feature_resources_to_validate = [
            ctx.rf_id_map[f] for f in features_to_validate
        ]
        ctx.upload_resource_status_start(*feature_resources_to_validate)
        try:
            if should_ingest:
                logger.info("Ingesting")
                logger.info("Ingesting {} data from {}".format(
                    ctx.app["name"], ctx.environment["data"]["path"]))
                ingest_df = spark_util.ingest(ctx, spark)
                full_dataset_counter = ingest_df.count()
                if ctx.environment["data"].get("drop_null"):
                    ingest_df = ingest_df.dropna()
                    logger.info("Dropping any rows that contain null values")
                    write_dataset_counter = ingest_df.count()

                logger.info("Caching {} data (version: {})".format(
                    ctx.app["name"], ctx.dataset_version))
                spark_util.write_raw_dataset(ingest_df, ctx)

                if ctx.environment["data"].get("drop_null"):
                    logger.info(
                        "{} rows read, {} rows dropped, {} rows ingested".
                        format(
                            full_dataset_counter,
                            full_dataset_counter - write_dataset_counter,
                            write_dataset_counter,
                        ))
                else:
                    logger.info(
                        "{} rows ingested".format(full_dataset_counter))
            logger.info("Reading {} data (version: {})".format(
                ctx.app["name"], ctx.dataset_version))
            raw_df = spark_util.read_raw_dataset(ctx, spark)
            total_row_count = raw_df.count()
            conditions_dict = spark_util.value_check_data(
                ctx, raw_df, features_to_validate)

            if len(conditions_dict) > 0:
                for column, cond_count_list in conditions_dict.items():
                    for condition, fail_count in cond_count_list:
                        logger.error(
                            "Data validation {} has been violated in {}/{} samples"
                            .format(condition, fail_count, total_row_count))
                raise UserException("raw feature validations failed")
        except:
            ctx.upload_resource_status_failed(*feature_resources_to_validate)
            raise
        ctx.upload_resource_status_success(*feature_resources_to_validate)
        logger.info("First {} samples:".format(3))
        show_df(raw_df, ctx, 3)
    else:
        logger.info("Reading {} data (version: {})".format(
            ctx.app["name"], ctx.dataset_version))
        raw_df = spark_util.read_raw_dataset(ctx, spark)
        spark_util.value_check_data(ctx, raw_df, features_to_validate)

    return raw_df
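Illustrative only: ingest_raw_dataset reads ctx.environment["data"]["path"] for its log messages and checks the optional drop_null flag before caching the dataset. A sketch of an environment block combining the keys shown in Example #5 with drop_null; the full set of supported keys isn't shown in these examples:

ctx_obj["environment"] = {
    "data": {
        "type": "parquet",
        "path": "/path/to/raw.parquet",  # placeholder path
        "drop_null": True,  # rows containing any null are dropped before caching
        "schema": [
            {"column_name": "a_str", "feature_name": "a_str"},
            {"column_name": "b_float", "feature_name": "b_float"},
        ],
    }
}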