Example #1
def test_ingest_parquet_type_mismatch(spark, write_parquet_file, ctx_obj,
                                      get_context):
    data = [("a", 0.1, None), ("b", 1.0, None), ("c", 1.1, 4.0)]

    schema = StructType([
        StructField("a_str", StringType()),
        StructField("b_float", FloatType()),
        StructField("c_long", FloatType()),
    ])

    path_to_file = write_parquet_file(spark, data, schema)

    ctx_obj["environment"] = {
        "data": {
            "type":
            "parquet",
            "path":
            path_to_file,
            "schema": [
                {
                    "parquet_column_name": "a_str",
                    "raw_column": add_res_ref("a_str")
                },
                {
                    "parquet_column_name": "b_float",
                    "raw_column": add_res_ref("b_float")
                },
                {
                    "parquet_column_name": "c_long",
                    "raw_column": add_res_ref("c_long")
                },
            ],
        }
    }

    ctx_obj["raw_columns"] = {
        "a_str": {
            "name": "a_str",
            "type": "STRING_COLUMN",
            "required": True,
            "id": "1"
        },
        "b_float": {
            "name": "b_float",
            "type": "FLOAT_COLUMN",
            "required": True,
            "id": "2"
        },
        "c_long": {
            "name": "c_long",
            "type": "INT_COLUMN",
            "required": False,
            "id": "3"
        },
    }

    with pytest.raises(UserException) as exec_info:
        spark_util.ingest(get_context(ctx_obj), spark).collect()
    assert "c_long" in str(exec_info.value) and "type mismatch" in str(
        exec_info.value)
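
Note: spark, write_parquet_file, ctx_obj, and get_context are pytest fixtures defined elsewhere in this test suite, and add_res_ref builds a resource reference for a raw column name. As a rough idea only, here is a minimal sketch of what a write_parquet_file fixture could look like, assuming it just needs to persist the rows and schema to a temporary Parquet file and return the path (the tmp_path usage and the exact shape are assumptions, not the project's actual fixture):

# Hypothetical sketch, not the project's actual fixture: write the given
# rows/schema to a temporary Parquet file and return its path.
import pytest
from pyspark.sql.types import StructType


@pytest.fixture
def write_parquet_file(tmp_path):
    def _write(spark, data, schema: StructType) -> str:
        path = str(tmp_path / "data.parquet")
        spark.createDataFrame(data, schema).write.mode("overwrite").parquet(path)
        return path

    return _write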
Example #2
def test_ingest_parquet_type_mismatch(spark, write_parquet_file, ctx_obj,
                                      get_context):
    data = [("a", 0.1, None), ("b", 1.0, None), ("c", 1.1, 4.0)]

    schema = StructType([
        StructField("a_str", StringType()),
        StructField("b_float", FloatType()),
        StructField("c_long", FloatType()),
    ])

    path_to_file = write_parquet_file(spark, data, schema)

    ctx_obj["environment"] = {
        "data": {
            "type":
            "parquet",
            "path":
            path_to_file,
            "schema": [
                {
                    "column_name": "a_str",
                    "feature_name": "a_str"
                },
                {
                    "column_name": "b_float",
                    "feature_name": "b_float"
                },
                {
                    "column_name": "c_long",
                    "feature_name": "c_long"
                },
            ],
        }
    }

    ctx_obj["raw_features"] = {
        "a_str": {
            "name": "a_str",
            "type": "STRING_FEATURE",
            "required": True,
            "id": "1"
        },
        "b_float": {
            "name": "b_float",
            "type": "FLOAT_FEATURE",
            "required": True,
            "id": "2"
        },
        "c_long": {
            "name": "c_long",
            "type": "INT_FEATURE",
            "required": False,
            "id": "3"
        },
    }

    with pytest.raises(UserException):
        spark_util.ingest(get_context(ctx_obj), spark).collect()
Example #3
def test_ingest_parquet_valid(spark, write_parquet_file, ctx_obj, get_context):
    data = [("a", 0.1, None), ("b", 1.0, None), ("c", 1.1, 4)]

    schema = StructType([
        StructField("a_str", StringType()),
        StructField("b_float", FloatType()),
        StructField("c_long", LongType()),
    ])

    path_to_file = write_parquet_file(spark, data, schema)

    ctx_obj["environment"] = {
        "data": {
            "type":
            "parquet",
            "path":
            path_to_file,
            "schema": [
                {
                    "parquet_column_name": "a_str",
                    "raw_column_name": "a_str"
                },
                {
                    "parquet_column_name": "b_float",
                    "raw_column_name": "b_float"
                },
                {
                    "parquet_column_name": "c_long",
                    "raw_column_name": "c_long"
                },
            ],
        }
    }

    ctx_obj["raw_columns"] = {
        "a_str": {
            "name": "a_str",
            "type": "STRING_COLUMN",
            "required": True,
            "id": "1"
        },
        "b_float": {
            "name": "b_float",
            "type": "FLOAT_COLUMN",
            "required": True,
            "id": "2"
        },
        "c_long": {
            "name": "c_long",
            "type": "INT_COLUMN",
            "required": False,
            "id": "3"
        },
    }

    assert spark_util.ingest(get_context(ctx_obj), spark).count() == 3
Example #4
def test_read_csv_invalid_type(spark, write_csv_file, ctx_obj, get_context):
    csv_str = "\n".join(["a,0.1,", "b,b,1", "c,1.1,4"])

    path_to_file = write_csv_file(csv_str)

    ctx_obj["environment"] = {
        "data": {
            "type":
            "csv",
            "path":
            path_to_file,
            "schema": [
                add_res_ref("a_str"),
                add_res_ref("b_long"),
                add_res_ref("c_long")
            ],
        }
    }

    ctx_obj["raw_columns"] = {
        "a_str": {
            "name": "a_str",
            "type": "STRING_COLUMN",
            "required": True,
            "id": "-"
        },
        "b_long": {
            "name": "b_long",
            "type": "INT_COLUMN",
            "required": True,
            "id": "-"
        },
        "c_long": {
            "name": "c_long",
            "type": "INT_COLUMN",
            "required": False,
            "id": "-"
        },
    }

    with pytest.raises(UserException):
        spark_util.ingest(get_context(ctx_obj), spark).collect()
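
The CSV tests take a write_csv_file fixture instead; a minimal sketch, assuming it only needs to dump the raw CSV string to a temporary file and return the path (again an assumption, not the project's actual fixture):

# Hypothetical sketch, not the project's actual fixture: dump a raw CSV
# string to a temporary file and return its path.
import pytest


@pytest.fixture
def write_csv_file(tmp_path):
    def _write(csv_str: str) -> str:
        path = tmp_path / "data.csv"
        path.write_text(csv_str)
        return str(path)

    return _write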
Example #5
def ingest_raw_dataset(spark, ctx, cols_to_validate, should_ingest):
    if should_ingest:
        cols_to_validate = list(ctx.rf_id_map.keys())

    if len(cols_to_validate) == 0:
        logger.info("Reading {} data (version: {})".format(
            ctx.app["name"], ctx.dataset_version))
        return spark_util.read_raw_dataset(ctx, spark)

    col_resources_to_validate = [ctx.rf_id_map[f] for f in cols_to_validate]
    ctx.upload_resource_status_start(*col_resources_to_validate)

    try:
        if should_ingest:
            data_config = ctx.environment["data"]

            logger.info("Ingesting")
            logger.info("Ingesting {} data from {}".format(
                ctx.app["name"], data_config["path"]))
            ingest_df = spark_util.ingest(ctx, spark)

            full_dataset_size = ingest_df.count()

            if data_config.get("drop_null"):
                logger.info("Dropping any rows that contain null values")
                ingest_df = ingest_df.dropna()

            if ctx.environment.get("limit"):
                ingest_df = limit_dataset(full_dataset_size, ingest_df,
                                          ctx.environment["limit"])

            written_count = write_raw_dataset(ingest_df, ctx, spark)
            metadata = {"dataset_size": written_count}
            ctx.storage.put_json(metadata, ctx.raw_dataset["metadata_key"])
            if written_count != full_dataset_size:
                logger.info(
                    "{} rows read, {} rows dropped, {} rows ingested".format(
                        full_dataset_size, full_dataset_size - written_count,
                        written_count))
            else:
                logger.info("{} rows ingested".format(written_count))

        logger.info("Reading {} data (version: {})".format(
            ctx.app["name"], ctx.dataset_version))
        raw_df = spark_util.read_raw_dataset(ctx, spark)
        validate_dataset(ctx, raw_df, cols_to_validate)
    except:
        ctx.upload_resource_status_failed(*col_resources_to_validate)
        raise
    ctx.upload_resource_status_success(*col_resources_to_validate)
    logger.info("First {} samples:".format(3))
    show_df(raw_df, ctx, 3)

    return raw_df
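
limit_dataset is defined elsewhere; all this function relies on is that it returns a (possibly) smaller DataFrame given the original row count, the ingested DataFrame, and the limit config. A minimal sketch under that assumption (the config keys num_rows and fraction_of_rows are made up for illustration, not the project's actual schema):

# Hypothetical sketch of limit_dataset; the config keys below are
# assumptions, not the project's actual configuration format.
def limit_dataset(full_dataset_size, ingest_df, limit_config):
    if limit_config.get("num_rows") is not None:
        # Keep at most the requested number of rows.
        return ingest_df.limit(int(limit_config["num_rows"]))
    if limit_config.get("fraction_of_rows") is not None:
        # Keep an approximate fraction of rows (sampling without replacement).
        return ingest_df.sample(False, float(limit_config["fraction_of_rows"]))
    return ingest_df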
Example #6
def test_read_csv_invalid_type(spark, write_csv_file, ctx_obj, get_context):
    csv_str = "\n".join(["a,0.1,", "b,1,1", "c,1.1,4"])

    path_to_file = write_csv_file(csv_str)

    ctx_obj["environment"] = {
        "data": {
            "type": "csv",
            "path": path_to_file,
            "schema": ["a_str", "b_long", "c_long"]
        }
    }

    ctx_obj["raw_features"] = {
        "a_str": {
            "name": "a_str",
            "type": "STRING_FEATURE",
            "required": True,
            "id": "-"
        },
        "b_long": {
            "name": "b_long",
            "type": "INT_FEATURE",
            "required": True,
            "id": "-"
        },
        "c_long": {
            "name": "c_long",
            "type": "INT_FEATURE",
            "required": False,
            "id": "-"
        },
    }

    with pytest.raises(Py4JJavaError):
        spark_util.ingest(get_context(ctx_obj), spark).collect()
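
The Py4JJavaError here comes from the JVM side of Spark: the first CSV row has "0.1" in the column declared as an INT_COLUMN (b_long), and the failure only surfaces when the job actually runs (on .collect()). The same behavior can be reproduced with a plain Spark read using an explicit schema and FAILFAST mode; whether ingest uses FAILFAST internally is an assumption, the test only shows that a Py4JJavaError escapes:

# Illustration only: an explicit schema plus FAILFAST mode makes the
# malformed integer value raise a Py4JJavaError when the rows are collected.
from pyspark.sql.types import StructType, StructField, StringType, LongType

csv_schema = StructType([
    StructField("a_str", StringType()),
    StructField("b_long", LongType()),
    StructField("c_long", LongType()),
])
df = spark.read.csv(path_to_file, schema=csv_schema, mode="FAILFAST")
df.collect()  # raises Py4JJavaError wrapping the Spark-side parse failure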
Example #7
def test_read_csv_missing_column(spark, write_csv_file, ctx_obj, get_context):
    csv_str = "\n".join(["a,0.1,", "b,1,1"])

    path_to_file = write_csv_file(csv_str)

    ctx_obj["environment"] = {
        "data": {
            "type": "csv",
            "path": path_to_file,
            "schema": ["a_str", "b_long", "c_long"]
        }
    }

    ctx_obj["raw_columns"] = {
        "a_str": {
            "name": "a_str",
            "type": "STRING_COLUMN",
            "required": True,
            "id": "-"
        },
        "b_long": {
            "name": "b_long",
            "type": "INT_COLUMN",
            "required": True,
            "id": "-"
        },
        "c_long": {
            "name": "c_long",
            "type": "INT_COLUMN",
            "required": False,
            "id": "-"
        },
    }

    with pytest.raises(Py4JJavaError) as exec_info:
        spark_util.ingest(get_context(ctx_obj), spark).collect()
Example #8
def ingest_raw_dataset(spark, ctx, cols_to_validate, should_ingest):
    if should_ingest:
        cols_to_validate = list(ctx.rf_id_map.keys())

    if len(cols_to_validate) == 0:
        logger.info("Reading {} data (version: {})".format(
            ctx.app["name"], ctx.dataset_version))
        return spark_util.read_raw_dataset(ctx, spark)

    col_resources_to_validate = [ctx.rf_id_map[f] for f in cols_to_validate]
    ctx.upload_resource_status_start(*col_resources_to_validate)
    try:
        if should_ingest:
            logger.info("Ingesting")
            logger.info("Ingesting {} data from {}".format(
                ctx.app["name"], ctx.environment["data"]["path"]))
            ingest_df = spark_util.ingest(ctx, spark)

            if ctx.environment["data"].get("drop_null"):
                drop_null_and_write(ingest_df, ctx, spark)
            else:
                written_count = write_raw_dataset(ingest_df, ctx, spark)
                metadata = {"dataset_size": written_count}
                aws.upload_json_to_s3(metadata,
                                      ctx.raw_dataset["metadata_key"],
                                      ctx.bucket)
                logger.info("{} rows ingested".format(written_count))

        logger.info("Reading {} data (version: {})".format(
            ctx.app["name"], ctx.dataset_version))
        raw_df = spark_util.read_raw_dataset(ctx, spark)
        validate_dataset(ctx, raw_df, cols_to_validate)
    except:
        ctx.upload_resource_status_failed(*col_resources_to_validate)
        raise
    ctx.upload_resource_status_success(*col_resources_to_validate)
    logger.info("First {} samples:".format(3))
    show_df(raw_df, ctx, 3)

    return raw_df
Example #9
def ingest_raw_dataset(spark, ctx, cols_to_validate, should_ingest):
    if should_ingest:
        cols_to_validate = list(ctx.rf_id_map.keys())

    if len(cols_to_validate) == 0:
        logger.info("Reading {} data (version: {})".format(
            ctx.app["name"], ctx.dataset_version))
        return spark_util.read_raw_dataset(ctx, spark)

    col_resources_to_validate = [ctx.rf_id_map[f] for f in cols_to_validate]
    ctx.upload_resource_status_start(*col_resources_to_validate)

    try:
        if should_ingest:
            data_config = ctx.environment["data"]

            logger.info("Ingesting")
            logger.info("Ingesting {} data from {}".format(
                ctx.app["name"], data_config["path"]))
            ingest_df = spark_util.ingest(ctx, spark)

            input_type_map = {f.name: f.dataType for f in ingest_df.schema}
            for raw_column_name in ctx.raw_columns:
                if ctx.raw_columns[raw_column_name][
                        "type"] == consts.COLUMN_TYPE_INFERRED:
                    column_type = spark_util.SPARK_TYPE_TO_CORTEX_TYPE[
                        input_type_map[raw_column_name]]
                    ctx.write_metadata(ctx.raw_columns[raw_column_name]["id"],
                                       {"type": column_type})

            full_dataset_size = ingest_df.count()

            if data_config.get("drop_null"):
                logger.info("Dropping any rows that contain null values")
                ingest_df = ingest_df.dropna()

            if ctx.environment.get("limit"):
                ingest_df = limit_dataset(full_dataset_size, ingest_df,
                                          ctx.environment["limit"])

            written_count = write_raw_dataset(ingest_df, ctx, spark)
            ctx.write_metadata(ctx.raw_dataset["key"],
                               {"dataset_size": written_count})
            if written_count != full_dataset_size:
                logger.info(
                    "{} rows read, {} rows dropped, {} rows ingested".format(
                        full_dataset_size, full_dataset_size - written_count,
                        written_count))
            else:
                logger.info("{} rows ingested".format(written_count))

        logger.info("Reading {} data (version: {})".format(
            ctx.app["name"], ctx.dataset_version))
        raw_df = spark_util.read_raw_dataset(ctx, spark)
        validate_dataset(ctx, raw_df, cols_to_validate)
    except:
        ctx.upload_resource_status_failed(*col_resources_to_validate)
        raise
    ctx.upload_resource_status_success(*col_resources_to_validate)
    logger.info("First {} samples:".format(3))
    show_df(raw_df, ctx, 3)

    return raw_df
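
This variant resolves INFERRED_COLUMN types by looking up each ingested column's Spark type in spark_util.SPARK_TYPE_TO_CORTEX_TYPE and persisting the result as column metadata. A minimal sketch of what such a mapping could contain, based only on the column type names that appear in these tests (the real mapping may hold more or different entries):

# Hypothetical sketch of a Spark-to-Cortex type mapping; the real
# SPARK_TYPE_TO_CORTEX_TYPE may differ.
from pyspark.sql.types import (
    StringType,
    IntegerType,
    LongType,
    FloatType,
    DoubleType,
)

SPARK_TYPE_TO_CORTEX_TYPE = {
    StringType(): "STRING_COLUMN",
    IntegerType(): "INT_COLUMN",
    LongType(): "INT_COLUMN",
    FloatType(): "FLOAT_COLUMN",
    DoubleType(): "FLOAT_COLUMN",
}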
Example #10
def test_ingest_parquet_failed_requirements(capsys, spark, write_parquet_file,
                                            ctx_obj, get_context):
    data = [(None, 0.1, None), ("b", 1.0, None), ("c", 1.1, 4)]

    schema = StructType([
        StructField("a_str", StringType()),
        StructField("b_float", FloatType()),
        StructField("c_long", LongType()),
    ])

    path_to_file = write_parquet_file(spark, data, schema)

    ctx_obj["environment"] = {
        "data": {
            "type":
            "parquet",
            "path":
            path_to_file,
            "schema": [
                {
                    "column_name": "a_str",
                    "feature_name": "a_str"
                },
                {
                    "column_name": "b_float",
                    "feature_name": "b_float"
                },
                {
                    "column_name": "c_long",
                    "feature_name": "c_long"
                },
            ],
        }
    }

    ctx_obj["raw_features"] = {
        "a_str": {
            "name": "a_str",
            "type": "STRING_FEATURE",
            "values": ["a", "b"],
            "id": "1"
        },
        "b_float": {
            "name": "b_float",
            "type": "FLOAT_FEATURE",
            "required": True,
            "id": "2"
        },
        "c_long": {
            "name": "c_long",
            "type": "INT_FEATURE",
            "required": False,
            "id": "3"
        },
    }

    ctx = get_context(ctx_obj)
    df = spark_util.ingest(ctx, spark)

    validations = spark_util.value_check_data(ctx, df)
    assert validations == {"a_str": [("(a_str IN (a, b))", 1)]}
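
The expected result reports exactly one failing row for the (a_str IN (a, b)) condition: the ("c", 1.1, 4) row. The (None, 0.1, None) row is not counted because the SQL IN predicate evaluates to NULL rather than false for a null value, and a_str carries no "required" flag here. Continuing from the df above, the same count falls out of a plain filter (illustration only, not how value_check_data is implemented):

# Illustration only: count the rows for which "a_str IN ('a', 'b')" is false.
# NULLs drop out of the filter, so only the "c" row is counted.
from pyspark.sql import functions as F

assert df.filter(~F.col("a_str").isin("a", "b")).count() == 1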
Example #11
def ingest_raw_dataset(spark, ctx, features_to_validate, should_ingest):
    if should_ingest:
        features_to_validate = list(ctx.rf_id_map.keys())

    if len(features_to_validate) > 0:
        feature_resources_to_validate = [
            ctx.rf_id_map[f] for f in features_to_validate
        ]
        ctx.upload_resource_status_start(*feature_resources_to_validate)
        try:
            if should_ingest:
                logger.info("Ingesting")
                logger.info("Ingesting {} data from {}".format(
                    ctx.app["name"], ctx.environment["data"]["path"]))
                ingest_df = spark_util.ingest(ctx, spark)
                full_dataset_counter = ingest_df.count()
                if ctx.environment["data"].get("drop_null"):
                    ingest_df = ingest_df.dropna()
                    logger.info("Dropping any rows that contain null values")
                    write_dataset_counter = ingest_df.count()

                logger.info("Caching {} data (version: {})".format(
                    ctx.app["name"], ctx.dataset_version))
                spark_util.write_raw_dataset(ingest_df, ctx)

                if ctx.environment["data"].get("drop_null"):
                    logger.info(
                        "{} rows read, {} rows dropped, {} rows ingested".
                        format(
                            full_dataset_counter,
                            full_dataset_counter - write_dataset_counter,
                            write_dataset_counter,
                        ))
                else:
                    logger.info(
                        "{} rows ingested".format(full_dataset_counter))
            logger.info("Reading {} data (version: {})".format(
                ctx.app["name"], ctx.dataset_version))
            raw_df = spark_util.read_raw_dataset(ctx, spark)
            total_row_count = raw_df.count()
            conditions_dict = spark_util.value_check_data(
                ctx, raw_df, features_to_validate)

            if len(conditions_dict) > 0:
                for column, cond_count_list in conditions_dict.items():
                    for condition, fail_count in cond_count_list:
                        logger.error(
                            "Data validation {} has been violated in {}/{} samples"
                            .format(condition, fail_count, total_row_count))
                raise UserException("raw feature validations failed")
        except:
            ctx.upload_resource_status_failed(*feature_resources_to_validate)
            raise
        ctx.upload_resource_status_success(*feature_resources_to_validate)
        logger.info("First {} samples:".format(3))
        show_df(raw_df, ctx, 3)
    else:
        logger.info("Reading {} data (version: {})".format(
            ctx.app["name"], ctx.dataset_version))
        raw_df = spark_util.read_raw_dataset(ctx, spark)
        spark_util.value_check_data(ctx, raw_df, features_to_validate)

    return raw_df
Example #12
def test_read_parquet_infer_invalid(spark, write_parquet_file, ctx_obj,
                                    get_context):
    tests = [
        {
            "data": [("a", 0.1, None), ("b", 1.0, None), ("c", 1.1, 4)],
            "schema":
            StructType([
                StructField("a_str", StringType()),
                StructField("b_float", DoubleType()),
                StructField("c_long", IntegerType()),
            ]),
            "env": [
                {
                    "parquet_column_name": "a_str",
                    "raw_column": add_res_ref("a_str")
                },
                {
                    "parquet_column_name": "b_float",
                    "raw_column": add_res_ref("b_float")
                },
                {
                    "parquet_column_name": "c_long",
                    "raw_column": add_res_ref("c_long")
                },
            ],
            "raw_columns": {
                "a_str": {
                    "name": "a_str",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "1"
                },
                "b_float": {
                    "name": "b_float",
                    "type": "INT_COLUMN",
                    "required": True,
                    "id": "2"
                },
                "c_long": {
                    "name": "c_long",
                    "type": "INFERRED_COLUMN",
                    "required": False,
                    "id": "3",
                },
            },
        },
        {
            "data": [("1", 0.1, "a"), ("1", 1.0, "a"), ("1", 1.1, "a")],
            "schema":
            StructType([
                StructField("a_str", StringType()),
                StructField("b_float", DoubleType()),
                StructField("c_str", StringType()),
            ]),
            "env": [
                {
                    "parquet_column_name": "a_str",
                    "raw_column": add_res_ref("a_str")
                },
                {
                    "parquet_column_name": "b_float",
                    "raw_column": add_res_ref("b_float")
                },
                {
                    "parquet_column_name": "c_str",
                    "raw_column": add_res_ref("c_str")
                },
            ],
            "raw_columns": {
                "a_str": {
                    "name": "a_str",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "1"
                },
                "b_float": {
                    "name": "b_float",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "2",
                },
                "c_str": {
                    "name": "c_str",
                    "type": "INT_COLUMN",
                    "required": False,
                    "id": "3"
                },
            },
        },
        {
            "data": [("a", 1, None), ("b", 1, None), ("c", 1, 4)],
            "schema":
            StructType([
                StructField("a_str", StringType()),
                StructField("b_float", IntegerType()),
                StructField("c_long", IntegerType()),
            ]),
            "env": [
                {
                    "parquet_column_name": "a_str",
                    "raw_column": add_res_ref("a_str")
                },
                {
                    "parquet_column_name": "b_float",
                    "raw_column": add_res_ref("b_float")
                },
                {
                    "parquet_column_name": "c_long",
                    "raw_column": add_res_ref("c_long")
                },
            ],
            "raw_columns": {
                "a_str": {
                    "name": "a_str",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "1"
                },
                "b_float": {
                    "name": "b_float",
                    "type": "FLOAT_COLUMN",
                    "required": True,
                    "id": "2"
                },
                "c_long": {
                    "name": "c_long",
                    "type": "INFERRED_COLUMN",
                    "required": False,
                    "id": "3",
                },
            },
        },
        {
            "data": [("a", 1, None), ("b", 1, None), ("c", 1, 4)],
            "schema":
            StructType([
                StructField("a_str", StringType()),
                StructField("b_float", IntegerType()),
                StructField("c_long", IntegerType()),
            ]),
            "env": [
                {
                    "parquet_column_name": "a_str",
                    "raw_column": add_res_ref("a_str")
                },
                {
                    "parquet_column_name": "b_float",
                    "raw_column": add_res_ref("b_float")
                },
                {
                    "parquet_column_name": "c_long",
                    "raw_column": add_res_ref("c_long")
                },
            ],
            "raw_columns": {
                "a_str": {
                    "name": "a_str",
                    "type": "INT_COLUMN",
                    "required": True,
                    "id": "1"
                },
                "b_float": {
                    "name": "b_float",
                    "type": "STRING_COLUMN",
                    "required": True,
                    "id": "2",
                },
                "c_long": {
                    "name": "c_long",
                    "type": "INFERRED_COLUMN",
                    "required": False,
                    "id": "3",
                },
            },
        },
        {
            "data": [("a", [1], None), ("b", [1], None), ("c", [1], 4)],
            "schema":
            StructType([
                StructField("a_str", StringType()),
                StructField("b_float", ArrayType(IntegerType()), True),
                StructField("c_long", IntegerType()),
            ]),
            "env": [
                {
                    "parquet_column_name": "a_str",
                    "raw_column": add_res_ref("a_str")
                },
                {
                    "parquet_column_name": "b_float",
                    "raw_column": add_res_ref("b_float")
                },
                {
                    "parquet_column_name": "c_long",
                    "raw_column": add_res_ref("c_long")
                },
            ],
            "raw_columns": {
                "a_str": {
                    "name": "a_str",
                    "type": "INT_COLUMN",
                    "required": True,
                    "id": "1"
                },
                "b_float": {
                    "name": "b_float",
                    "type": "FLOAT_LIST_COLUMN",
                    "required": True,
                    "id": "2",
                },
                "c_long": {
                    "name": "c_long",
                    "type": "INFERRED_COLUMN",
                    "required": False,
                    "id": "3",
                },
            },
        },
    ]

    for test in tests:
        data = test["data"]

        schema = test["schema"]

        path_to_file = write_parquet_file(spark, data, schema)

        ctx_obj["environment"] = {
            "data": {
                "type": "parquet",
                "path": path_to_file,
                "schema": test["env"]
            }
        }

        ctx_obj["raw_columns"] = test["raw_columns"]

        with pytest.raises(UserException) as exec_info:
            spark_util.ingest(get_context(ctx_obj), spark).collect()
Example #13
def test_ingest_parquet_infer_valid(spark, write_parquet_file, ctx_obj,
                                    get_context):
    tests = [
        {
            "data": [("a", 0.1, None), ("b", 1.0, None), ("c", 1.1, 4)],
            "schema":
            StructType([
                StructField("a_str", StringType()),
                StructField("b_float", DoubleType()),
                StructField("c_long", IntegerType()),
            ]),
            "env": [
                {
                    "parquet_column_name": "a_str",
                    "raw_column": add_res_ref("a_str")
                },
                {
                    "parquet_column_name": "b_float",
                    "raw_column": add_res_ref("b_float")
                },
                {
                    "parquet_column_name": "c_long",
                    "raw_column": add_res_ref("c_long")
                },
            ],
            "raw_columns": {
                "a_str": {
                    "name": "a_str",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "1"
                },
                "b_float": {
                    "name": "b_float",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "2",
                },
                "c_long": {
                    "name": "c_long",
                    "type": "INFERRED_COLUMN",
                    "required": False,
                    "id": "3",
                },
            },
            "expected_types": [
                ("a_str", StringType()),
                ("b_float", FloatType()),
                ("c_long", LongType()),
            ],
        },
        {
            "data": [("1", 0.1, None), ("1", 1.0, None), ("1", 1.1, 4)],
            "schema":
            StructType([
                StructField("a_str", StringType()),
                StructField("b_float", DoubleType()),
                StructField("c_long", IntegerType()),
            ]),
            "env": [
                {
                    "parquet_column_name": "a_str",
                    "raw_column": add_res_ref("a_str")
                },
                {
                    "parquet_column_name": "b_float",
                    "raw_column": add_res_ref("b_float")
                },
                {
                    "parquet_column_name": "c_long",
                    "raw_column": add_res_ref("c_long")
                },
            ],
            "raw_columns": {
                "a_str": {
                    "name": "a_str",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "1"
                },
                "b_float": {
                    "name": "b_float",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "2",
                },
                "c_long": {
                    "name": "c_long",
                    "type": "INFERRED_COLUMN",
                    "required": False,
                    "id": "3",
                },
            },
            "expected_types": [
                ("a_str", StringType()),
                ("b_float", FloatType()),
                ("c_long", LongType()),
            ],
        },
        {
            "data": [
                ("1", 0.1, datetime.now()),
                ("1", 1.0, datetime.now()),
                ("1", 1.1, datetime.now()),
            ],
            "schema":
            StructType([
                StructField("a_str", StringType()),
                StructField("b_float", DoubleType()),
                StructField("c_str", TimestampType()),
            ]),
            "env": [
                {
                    "parquet_column_name": "a_str",
                    "raw_column": add_res_ref("a_str")
                },
                {
                    "parquet_column_name": "b_float",
                    "raw_column": add_res_ref("b_float")
                },
                {
                    "parquet_column_name": "c_str",
                    "raw_column": add_res_ref("c_str")
                },
            ],
            "raw_columns": {
                "a_str": {
                    "name": "a_str",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "1"
                },
                "b_float": {
                    "name": "b_float",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "2",
                },
                "c_str": {
                    "name": "c_str",
                    "type": "INFERRED_COLUMN",
                    "required": False,
                    "id": "3"
                },
            },
            "expected_types": [
                ("a_str", StringType()),
                ("b_float", FloatType()),
                ("c_str", StringType()),
            ],
        },
        {
            "data": [
                ("1", [0.1, 12.0], datetime.now()),
                ("1", [1.23, 1.0], datetime.now()),
                ("1", [12.3, 1.1], datetime.now()),
            ],
            "schema":
            StructType([
                StructField("a_str", StringType()),
                StructField("b_float", ArrayType(DoubleType()), True),
                StructField("c_str", TimestampType()),
            ]),
            "env": [
                {
                    "parquet_column_name": "a_str",
                    "raw_column": add_res_ref("a_str")
                },
                {
                    "parquet_column_name": "b_float",
                    "raw_column": add_res_ref("b_float")
                },
                {
                    "parquet_column_name": "c_str",
                    "raw_column": add_res_ref("c_str")
                },
            ],
            "raw_columns": {
                "a_str": {
                    "name": "a_str",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "1"
                },
                "b_float": {
                    "name": "b_float",
                    "type": "FLOAT_LIST_COLUMN",
                    "required": True,
                    "id": "2",
                },
                "c_str": {
                    "name": "c_str",
                    "type": "INFERRED_COLUMN",
                    "required": False,
                    "id": "3"
                },
            },
            "expected_types": [
                ("a_str", StringType()),
                ("b_float", ArrayType(FloatType(), True)),
                ("c_str", StringType()),
            ],
        },
    ]

    for test in tests:
        data = test["data"]

        schema = test["schema"]

        path_to_file = write_parquet_file(spark, data, schema)

        ctx_obj["environment"] = {
            "data": {
                "type": "parquet",
                "path": path_to_file,
                "schema": test["env"]
            }
        }

        ctx_obj["raw_columns"] = test["raw_columns"]

        df = spark_util.ingest(get_context(ctx_obj), spark)

        assert df.count() == 3
        assert (sorted([(s.name, s.dataType) for s in df.schema],
                       key=lambda x: x[0]) == test["expected_types"])
Example #14
def test_ingest_parquet_valid(spark, write_parquet_file, ctx_obj, get_context):
    data = [("a", 0.1, None), ("b", 1.0, None), ("c", 1.1, 4)]

    schema = StructType([
        StructField("a_str", StringType()),
        StructField("b_float", DoubleType()),
        StructField("c_long", IntegerType()),
    ])

    path_to_file = write_parquet_file(spark, data, schema)

    ctx_obj["environment"] = {
        "data": {
            "type":
            "parquet",
            "path":
            path_to_file,
            "schema": [
                {
                    "parquet_column_name": "a_str",
                    "raw_column": add_res_ref("a_str")
                },
                {
                    "parquet_column_name": "b_float",
                    "raw_column": add_res_ref("b_float")
                },
                {
                    "parquet_column_name": "c_long",
                    "raw_column": add_res_ref("c_long")
                },
            ],
        }
    }

    ctx_obj["raw_columns"] = {
        "a_str": {
            "name": "a_str",
            "type": "STRING_COLUMN",
            "required": True,
            "id": "1"
        },
        "b_float": {
            "name": "b_float",
            "type": "FLOAT_COLUMN",
            "required": True,
            "id": "2"
        },
        "c_long": {
            "name": "c_long",
            "type": "INT_COLUMN",
            "required": False,
            "id": "3"
        },
    }

    df = spark_util.ingest(get_context(ctx_obj), spark)

    assert df.count() == 3

    assert sorted([(s.name, s.dataType) for s in df.schema],
                  key=lambda x: x[0]) == [
                      ("a_str", StringType()),
                      ("b_float", FloatType()),
                      ("c_long", LongType()),
                  ]
Example #15
def test_read_csv_infer_invalid(spark, write_csv_file, ctx_obj, get_context):
    test_cases = [
        {
            "csv": ["a,0.1,", "a,0.1,1", "a,1.1,4"],
            "schema": [
                add_res_ref("a_int"),
                add_res_ref("b_float"),
                add_res_ref("c_long")
            ],
            "raw_columns": {
                "a_int": {
                    "name": "a_int",
                    "type": "INT_COLUMN",
                    "required": True,
                    "id": "-"
                },
                "b_float": {
                    "name": "b_float",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "-",
                },
                "c_long": {
                    "name": "c_long",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "-",
                },
            },
        },
        {
            "csv": ["a,1.1,", "a,1.1,1", "a,1.1,4"],
            "schema":
            [add_res_ref("a_str"),
             add_res_ref("b_int"),
             add_res_ref("c_int")],
            "raw_columns": {
                "a_str": {
                    "name": "a_str",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "-"
                },
                "b_int": {
                    "name": "b_int",
                    "type": "INT_COLUMN",
                    "required": True,
                    "id": "-"
                },
                "c_int": {
                    "name": "c_int",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "-"
                },
            },
        },
    ]

    for test in test_cases:
        csv_str = "\n".join(test["csv"])
        path_to_file = write_csv_file(csv_str)

        ctx_obj["environment"] = {
            "data": {
                "type": "csv",
                "path": path_to_file,
                "schema": test["schema"]
            }
        }

        ctx_obj["raw_columns"] = test["raw_columns"]

        with pytest.raises(UserException):
            spark_util.ingest(get_context(ctx_obj), spark).collect()
Example #16
def test_read_csv_infer_type(spark, write_csv_file, ctx_obj, get_context):
    test_cases = [
        {
            "csv": ["a,0.1,", "b,0.1,1", "c,1.1,4"],
            "schema": [
                add_res_ref("a_str"),
                add_res_ref("b_float"),
                add_res_ref("c_long")
            ],
            "raw_columns": {
                "a_str": {
                    "name": "a_str",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "-"
                },
                "b_float": {
                    "name": "b_float",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "-",
                },
                "c_long": {
                    "name": "c_float",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "-",
                },
            },
            "expected_types": {
                "a_str": StringType(),
                "b_float": FloatType(),
                "c_long": LongType()
            },
        },
        {
            "csv": ["1,4,4.5", "1,3,1.2", "1,5,4.7"],
            "schema": [
                add_res_ref("a_str"),
                add_res_ref("b_int"),
                add_res_ref("c_float")
            ],
            "raw_columns": {
                "a_str": {
                    "name": "a_str",
                    "type": "STRING_COLUMN",
                    "required": True,
                    "id": "-"
                },
                "b_int": {
                    "name": "b_int",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "-"
                },
                "c_float": {
                    "name": "c_float",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "-",
                },
            },
            "expected_types": {
                "a_str": StringType(),
                "b_int": LongType(),
                "c_float": FloatType()
            },
        },
        {
            "csv": ["1,4,2017-09-16", "1,3,2017-09-16", "1,5,2017-09-16"],
            "schema":
            [add_res_ref("a_str"),
             add_res_ref("b_int"),
             add_res_ref("c_str")],
            "raw_columns": {
                "a_str": {
                    "name": "a_str",
                    "type": "STRING_COLUMN",
                    "required": True,
                    "id": "-"
                },
                "b_int": {
                    "name": "b_int",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "-"
                },
                "c_str": {
                    "name": "c_str",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "-"
                },
            },
            "expected_types": {
                "a_str": StringType(),
                "b_int": LongType(),
                "c_str": StringType()
            },
        },
        {
            "csv": ["1,4,2017-09-16", "1,3,2017-09-16", "1,5,2017-09-16"],
            "schema": [
                add_res_ref("a_float"),
                add_res_ref("b_int"),
                add_res_ref("c_str")
            ],
            "raw_columns": {
                "a_float": {
                    "name": "a_float",
                    "type": "FLOAT_COLUMN",
                    "required": True,
                    "id": "-"
                },
                "b_int": {
                    "name": "b_int",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "-"
                },
                "c_str": {
                    "name": "c_str",
                    "type": "INFERRED_COLUMN",
                    "required": True,
                    "id": "-"
                },
            },
            "expected_types": {
                "a_float": FloatType(),
                "b_int": LongType(),
                "c_str": StringType()
            },
        },
    ]

    for test in test_cases:
        csv_str = "\n".join(test["csv"])
        path_to_file = write_csv_file(csv_str)

        ctx_obj["environment"] = {
            "data": {
                "type": "csv",
                "path": path_to_file,
                "schema": test["schema"]
            }
        }

        ctx_obj["raw_columns"] = test["raw_columns"]

        df = spark_util.ingest(get_context(ctx_obj), spark)
        assert df.count() == len(test["expected_types"])
        inferred_col_type_map = {c.name: c.dataType for c in df.schema}
        for column_name in test["expected_types"]:
            assert inferred_col_type_map[column_name] == test[
                "expected_types"][column_name]