# Stdlib / third-party imports these tests rely on; project-local names
# (spark_util, UserException, consts, add_res_ref, and the fixtures) come from
# the surrounding package and conftest.
from datetime import datetime

import pytest
from pyspark.sql.types import (
    ArrayType,
    DoubleType,
    FloatType,
    IntegerType,
    LongType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)


def test_ingest_parquet_type_mismatch(spark, write_parquet_file, ctx_obj, get_context):
    # The parquet file stores c_long as a float, while the raw column declares
    # INT_COLUMN, so ingestion must fail with a type mismatch.
    data = [("a", 0.1, None), ("b", 1.0, None), ("c", 1.1, 4.0)]

    schema = StructType(
        [
            StructField("a_str", StringType()),
            StructField("b_float", FloatType()),
            StructField("c_long", FloatType()),
        ]
    )

    path_to_file = write_parquet_file(spark, data, schema)

    ctx_obj["environment"] = {
        "data": {
            "type": "parquet",
            "path": path_to_file,
            "schema": [
                {"parquet_column_name": "a_str", "raw_column": add_res_ref("a_str")},
                {"parquet_column_name": "b_float", "raw_column": add_res_ref("b_float")},
                {"parquet_column_name": "c_long", "raw_column": add_res_ref("c_long")},
            ],
        }
    }

    ctx_obj["raw_columns"] = {
        "a_str": {"name": "a_str", "type": "STRING_COLUMN", "required": True, "id": "1"},
        "b_float": {"name": "b_float", "type": "FLOAT_COLUMN", "required": True, "id": "2"},
        "c_long": {"name": "c_long", "type": "INT_COLUMN", "required": False, "id": "3"},
    }

    with pytest.raises(UserException) as exec_info:
        spark_util.ingest(get_context(ctx_obj), spark).collect()

    assert "c_long" in str(exec_info.value) and "type mismatch" in str(exec_info.value)
def test_read_csv_invalid_type(spark, write_csv_file, ctx_obj, get_context):
    # The second column mixes floats and a string, so it cannot satisfy the
    # INT_COLUMN declared for b_long.
    csv_str = "\n".join(["a,0.1,", "b,b,1", "c,1.1,4"])
    path_to_file = write_csv_file(csv_str)

    ctx_obj["environment"] = {
        "data": {
            "type": "csv",
            "path": path_to_file,
            "schema": [add_res_ref("a_str"), add_res_ref("b_long"), add_res_ref("c_long")],
        }
    }

    ctx_obj["raw_columns"] = {
        "a_str": {"name": "a_str", "type": "STRING_COLUMN", "required": True, "id": "-"},
        "b_long": {"name": "b_long", "type": "INT_COLUMN", "required": True, "id": "-"},
        "c_long": {"name": "c_long", "type": "INT_COLUMN", "required": False, "id": "-"},
    }

    with pytest.raises(UserException):
        spark_util.ingest(get_context(ctx_obj), spark).collect()
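# The tests in this module assume pytest fixtures defined outside this file
# (spark, write_csv_file, write_parquet_file, ctx_obj, get_context) plus the
# add_res_ref helper. A minimal conftest-style sketch of the two file-writing
# fixtures, assuming tmpdir-backed paths -- the real fixtures may differ:
#
# import pytest
# from pyspark.sql import SparkSession
#
#
# @pytest.fixture(scope="session")
# def spark():
#     # Shared local Spark session for the test session
#     return SparkSession.builder.master("local[*]").appName("tests").getOrCreate()
#
#
# @pytest.fixture
# def write_csv_file(tmpdir):
#     # Write a raw CSV string to a temp file and return its path
#     def _write(csv_str):
#         path = tmpdir.join("data.csv")
#         path.write(csv_str)
#         return str(path)
#
#     return _write
#
#
# @pytest.fixture
# def write_parquet_file(tmpdir):
#     # Write rows with an explicit schema to a temp parquet file and return its path
#     def _write(spark, data, schema):
#         path = str(tmpdir.join("data.parquet"))
#         spark.createDataFrame(data, schema).write.mode("overwrite").parquet(path)
#         return path
#
#     return _write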
def ingest_raw_dataset(spark, ctx, cols_to_validate, should_ingest):
    if should_ingest:
        cols_to_validate = list(ctx.rf_id_map.keys())

    if len(cols_to_validate) == 0:
        logger.info(
            "Reading {} data (version: {})".format(ctx.app["name"], ctx.dataset_version)
        )
        return spark_util.read_raw_dataset(ctx, spark)

    col_resources_to_validate = [ctx.rf_id_map[f] for f in cols_to_validate]
    ctx.upload_resource_status_start(*col_resources_to_validate)

    try:
        if should_ingest:
            data_config = ctx.environment["data"]

            logger.info(
                "Ingesting {} data from {}".format(ctx.app["name"], data_config["path"])
            )
            ingest_df = spark_util.ingest(ctx, spark)

            # Resolve inferred column types from the ingested schema and persist them
            input_type_map = {f.name: f.dataType for f in ingest_df.schema}
            for raw_column_name in ctx.raw_columns:
                if ctx.raw_columns[raw_column_name]["type"] == consts.COLUMN_TYPE_INFERRED:
                    column_type = spark_util.SPARK_TYPE_TO_CORTEX_TYPE[
                        input_type_map[raw_column_name]
                    ]
                    ctx.write_metadata(
                        ctx.raw_columns[raw_column_name]["id"], {"type": column_type}
                    )

            full_dataset_size = ingest_df.count()

            if data_config.get("drop_null"):
                logger.info("Dropping any rows that contain null values")
                ingest_df = ingest_df.dropna()

            if ctx.environment.get("limit"):
                ingest_df = limit_dataset(
                    full_dataset_size, ingest_df, ctx.environment["limit"]
                )

            written_count = write_raw_dataset(ingest_df, ctx, spark)
            ctx.write_metadata(ctx.raw_dataset["key"], {"dataset_size": written_count})
            if written_count != full_dataset_size:
                logger.info(
                    "{} rows read, {} rows dropped, {} rows ingested".format(
                        full_dataset_size, full_dataset_size - written_count, written_count
                    )
                )
            else:
                logger.info("{} rows ingested".format(written_count))

        logger.info(
            "Reading {} data (version: {})".format(ctx.app["name"], ctx.dataset_version)
        )
        raw_df = spark_util.read_raw_dataset(ctx, spark)
        validate_dataset(ctx, raw_df, cols_to_validate)
    except:
        # Mark all columns as failed before re-raising so status is never left dangling
        ctx.upload_resource_status_failed(*col_resources_to_validate)
        raise
    ctx.upload_resource_status_success(*col_resources_to_validate)

    logger.info("First {} samples:".format(3))
    show_df(raw_df, ctx, 3)

    return raw_df
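# ingest_raw_dataset defers row limiting to limit_dataset, which is defined
# elsewhere. A plausible sketch under assumed config keys (num_rows, fraction,
# randomize, random_seed -- all hypothetical here), not the actual implementation:
#
# def limit_dataset(full_dataset_size, ingest_df, limit_config):
#     if limit_config.get("num_rows") is not None:
#         num_rows = min(limit_config["num_rows"], full_dataset_size)
#         if limit_config.get("randomize"):
#             # Sample slightly more than needed, then trim to the exact count
#             fraction = min(1.0, 1.2 * num_rows / float(full_dataset_size))
#             ingest_df = ingest_df.sample(
#                 fraction=fraction, seed=limit_config.get("random_seed")
#             )
#         return ingest_df.limit(num_rows)
#     if limit_config.get("fraction") is not None:
#         return ingest_df.sample(
#             fraction=limit_config["fraction"], seed=limit_config.get("random_seed")
#         )
#     return ingest_df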
def test_read_parquet_infer_invalid(spark, write_parquet_file, ctx_obj, get_context):
    tests = [
        # b_float holds doubles but is declared INT_COLUMN
        {
            "data": [("a", 0.1, None), ("b", 1.0, None), ("c", 1.1, 4)],
            "schema": StructType(
                [
                    StructField("a_str", StringType()),
                    StructField("b_float", DoubleType()),
                    StructField("c_long", IntegerType()),
                ]
            ),
            "env": [
                {"parquet_column_name": "a_str", "raw_column": add_res_ref("a_str")},
                {"parquet_column_name": "b_float", "raw_column": add_res_ref("b_float")},
                {"parquet_column_name": "c_long", "raw_column": add_res_ref("c_long")},
            ],
            "raw_columns": {
                "a_str": {"name": "a_str", "type": "INFERRED_COLUMN", "required": True, "id": "1"},
                "b_float": {"name": "b_float", "type": "INT_COLUMN", "required": True, "id": "2"},
                "c_long": {"name": "c_long", "type": "INFERRED_COLUMN", "required": False, "id": "3"},
            },
        },
        # c_str holds strings but is declared INT_COLUMN
        {
            "data": [("1", 0.1, "a"), ("1", 1.0, "a"), ("1", 1.1, "a")],
            "schema": StructType(
                [
                    StructField("a_str", StringType()),
                    StructField("b_float", DoubleType()),
                    StructField("c_str", StringType()),
                ]
            ),
            "env": [
                {"parquet_column_name": "a_str", "raw_column": add_res_ref("a_str")},
                {"parquet_column_name": "b_float", "raw_column": add_res_ref("b_float")},
                {"parquet_column_name": "c_str", "raw_column": add_res_ref("c_str")},
            ],
            "raw_columns": {
                "a_str": {"name": "a_str", "type": "INFERRED_COLUMN", "required": True, "id": "1"},
                "b_float": {"name": "b_float", "type": "INFERRED_COLUMN", "required": True, "id": "2"},
                "c_str": {"name": "c_str", "type": "INT_COLUMN", "required": False, "id": "3"},
            },
        },
        # b_float holds integers but is declared FLOAT_COLUMN
        {
            "data": [("a", 1, None), ("b", 1, None), ("c", 1, 4)],
            "schema": StructType(
                [
                    StructField("a_str", StringType()),
                    StructField("b_float", IntegerType()),
                    StructField("c_long", IntegerType()),
                ]
            ),
            "env": [
                {"parquet_column_name": "a_str", "raw_column": add_res_ref("a_str")},
                {"parquet_column_name": "b_float", "raw_column": add_res_ref("b_float")},
                {"parquet_column_name": "c_long", "raw_column": add_res_ref("c_long")},
            ],
            "raw_columns": {
                "a_str": {"name": "a_str", "type": "INFERRED_COLUMN", "required": True, "id": "1"},
                "b_float": {"name": "b_float", "type": "FLOAT_COLUMN", "required": True, "id": "2"},
                "c_long": {"name": "c_long", "type": "INFERRED_COLUMN", "required": False, "id": "3"},
            },
        },
        # a_str holds strings (declared INT_COLUMN) and b_float holds integers
        # (declared STRING_COLUMN)
        {
            "data": [("a", 1, None), ("b", 1, None), ("c", 1, 4)],
            "schema": StructType(
                [
                    StructField("a_str", StringType()),
                    StructField("b_float", IntegerType()),
                    StructField("c_long", IntegerType()),
                ]
            ),
            "env": [
                {"parquet_column_name": "a_str", "raw_column": add_res_ref("a_str")},
                {"parquet_column_name": "b_float", "raw_column": add_res_ref("b_float")},
                {"parquet_column_name": "c_long", "raw_column": add_res_ref("c_long")},
            ],
            "raw_columns": {
                "a_str": {"name": "a_str", "type": "INT_COLUMN", "required": True, "id": "1"},
                "b_float": {"name": "b_float", "type": "STRING_COLUMN", "required": True, "id": "2"},
                "c_long": {"name": "c_long", "type": "INFERRED_COLUMN", "required": False, "id": "3"},
            },
        },
        # b_float holds integer lists but is declared FLOAT_LIST_COLUMN
        {
            "data": [("a", [1], None), ("b", [1], None), ("c", [1], 4)],
            "schema": StructType(
                [
                    StructField("a_str", StringType()),
                    StructField("b_float", ArrayType(IntegerType()), True),
                    StructField("c_long", IntegerType()),
                ]
            ),
            "env": [
                {"parquet_column_name": "a_str", "raw_column": add_res_ref("a_str")},
                {"parquet_column_name": "b_float", "raw_column": add_res_ref("b_float")},
                {"parquet_column_name": "c_long", "raw_column": add_res_ref("c_long")},
            ],
            "raw_columns": {
                "a_str": {"name": "a_str", "type": "INT_COLUMN", "required": True, "id": "1"},
                "b_float": {"name": "b_float", "type": "FLOAT_LIST_COLUMN", "required": True, "id": "2"},
                "c_long": {"name": "c_long", "type": "INFERRED_COLUMN", "required": False, "id": "3"},
            },
        },
    ]

    for test in tests:
        data = test["data"]
        schema = test["schema"]
        path_to_file = write_parquet_file(spark, data, schema)

        ctx_obj["environment"] = {
            "data": {"type": "parquet", "path": path_to_file, "schema": test["env"]}
        }
        ctx_obj["raw_columns"] = test["raw_columns"]

        with pytest.raises(UserException):
            spark_util.ingest(get_context(ctx_obj), spark).collect()
def test_ingest_parquet_infer_valid(spark, write_parquet_file, ctx_obj, get_context):
    tests = [
        # Doubles infer to FLOAT_COLUMN, integers to INT_COLUMN
        {
            "data": [("a", 0.1, None), ("b", 1.0, None), ("c", 1.1, 4)],
            "schema": StructType(
                [
                    StructField("a_str", StringType()),
                    StructField("b_float", DoubleType()),
                    StructField("c_long", IntegerType()),
                ]
            ),
            "env": [
                {"parquet_column_name": "a_str", "raw_column": add_res_ref("a_str")},
                {"parquet_column_name": "b_float", "raw_column": add_res_ref("b_float")},
                {"parquet_column_name": "c_long", "raw_column": add_res_ref("c_long")},
            ],
            "raw_columns": {
                "a_str": {"name": "a_str", "type": "INFERRED_COLUMN", "required": True, "id": "1"},
                "b_float": {"name": "b_float", "type": "INFERRED_COLUMN", "required": True, "id": "2"},
                "c_long": {"name": "c_long", "type": "INFERRED_COLUMN", "required": False, "id": "3"},
            },
            "expected_types": [
                ("a_str", StringType()),
                ("b_float", FloatType()),
                ("c_long", LongType()),
            ],
        },
        # Numeric-looking strings stay STRING_COLUMN when stored as strings
        {
            "data": [("1", 0.1, None), ("1", 1.0, None), ("1", 1.1, 4)],
            "schema": StructType(
                [
                    StructField("a_str", StringType()),
                    StructField("b_float", DoubleType()),
                    StructField("c_long", IntegerType()),
                ]
            ),
            "env": [
                {"parquet_column_name": "a_str", "raw_column": add_res_ref("a_str")},
                {"parquet_column_name": "b_float", "raw_column": add_res_ref("b_float")},
                {"parquet_column_name": "c_long", "raw_column": add_res_ref("c_long")},
            ],
            "raw_columns": {
                "a_str": {"name": "a_str", "type": "INFERRED_COLUMN", "required": True, "id": "1"},
                "b_float": {"name": "b_float", "type": "INFERRED_COLUMN", "required": True, "id": "2"},
                "c_long": {"name": "c_long", "type": "INFERRED_COLUMN", "required": False, "id": "3"},
            },
            "expected_types": [
                ("a_str", StringType()),
                ("b_float", FloatType()),
                ("c_long", LongType()),
            ],
        },
        # Timestamps infer to STRING_COLUMN
        {
            "data": [
                ("1", 0.1, datetime.now()),
                ("1", 1.0, datetime.now()),
                ("1", 1.1, datetime.now()),
            ],
            "schema": StructType(
                [
                    StructField("a_str", StringType()),
                    StructField("b_float", DoubleType()),
                    StructField("c_str", TimestampType()),
                ]
            ),
            "env": [
                {"parquet_column_name": "a_str", "raw_column": add_res_ref("a_str")},
                {"parquet_column_name": "b_float", "raw_column": add_res_ref("b_float")},
                {"parquet_column_name": "c_str", "raw_column": add_res_ref("c_str")},
            ],
            "raw_columns": {
                "a_str": {"name": "a_str", "type": "INFERRED_COLUMN", "required": True, "id": "1"},
                "b_float": {"name": "b_float", "type": "INFERRED_COLUMN", "required": True, "id": "2"},
                "c_str": {"name": "c_str", "type": "INFERRED_COLUMN", "required": False, "id": "3"},
            },
            "expected_types": [
                ("a_str", StringType()),
                ("b_float", FloatType()),
                ("c_str", StringType()),
            ],
        },
        # Double lists cast to float lists for FLOAT_LIST_COLUMN
        {
            "data": [
                ("1", [0.1, 12.0], datetime.now()),
                ("1", [1.23, 1.0], datetime.now()),
                ("1", [12.3, 1.1], datetime.now()),
            ],
            "schema": StructType(
                [
                    StructField("a_str", StringType()),
                    StructField("b_float", ArrayType(DoubleType()), True),
                    StructField("c_str", TimestampType()),
                ]
            ),
            "env": [
                {"parquet_column_name": "a_str", "raw_column": add_res_ref("a_str")},
                {"parquet_column_name": "b_float", "raw_column": add_res_ref("b_float")},
                {"parquet_column_name": "c_str", "raw_column": add_res_ref("c_str")},
            ],
            "raw_columns": {
                "a_str": {"name": "a_str", "type": "INFERRED_COLUMN", "required": True, "id": "1"},
                "b_float": {"name": "b_float", "type": "FLOAT_LIST_COLUMN", "required": True, "id": "2"},
                "c_str": {"name": "c_str", "type": "INFERRED_COLUMN", "required": False, "id": "3"},
            },
            "expected_types": [
                ("a_str", StringType()),
                ("b_float", ArrayType(FloatType(), True)),
                ("c_str", StringType()),
            ],
        },
    ]

    for test in tests:
        data = test["data"]
        schema = test["schema"]
        path_to_file = write_parquet_file(spark, data, schema)

        ctx_obj["environment"] = {
            "data": {"type": "parquet", "path": path_to_file, "schema": test["env"]}
        }
        ctx_obj["raw_columns"] = test["raw_columns"]

        df = spark_util.ingest(get_context(ctx_obj), spark)

        assert df.count() == 3
        assert (
            sorted([(s.name, s.dataType) for s in df.schema], key=lambda x: x[0])
            == test["expected_types"]
        )
def test_ingest_parquet_valid(spark, write_parquet_file, ctx_obj, get_context):
    data = [("a", 0.1, None), ("b", 1.0, None), ("c", 1.1, 4)]

    schema = StructType(
        [
            StructField("a_str", StringType()),
            StructField("b_float", DoubleType()),
            StructField("c_long", IntegerType()),
        ]
    )

    path_to_file = write_parquet_file(spark, data, schema)

    ctx_obj["environment"] = {
        "data": {
            "type": "parquet",
            "path": path_to_file,
            "schema": [
                {"parquet_column_name": "a_str", "raw_column": add_res_ref("a_str")},
                {"parquet_column_name": "b_float", "raw_column": add_res_ref("b_float")},
                {"parquet_column_name": "c_long", "raw_column": add_res_ref("c_long")},
            ],
        }
    }

    ctx_obj["raw_columns"] = {
        "a_str": {"name": "a_str", "type": "STRING_COLUMN", "required": True, "id": "1"},
        "b_float": {"name": "b_float", "type": "FLOAT_COLUMN", "required": True, "id": "2"},
        "c_long": {"name": "c_long", "type": "INT_COLUMN", "required": False, "id": "3"},
    }

    df = spark_util.ingest(get_context(ctx_obj), spark)

    assert df.count() == 3
    assert sorted([(s.name, s.dataType) for s in df.schema], key=lambda x: x[0]) == [
        ("a_str", StringType()),
        ("b_float", FloatType()),
        ("c_long", LongType()),
    ]
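# The assertions above pin down the cast behavior: DoubleType -> FloatType and
# IntegerType -> LongType. A sketch of the Cortex-to-Spark mapping this implies;
# the dict name and exact entries are assumptions, not spark_util's actual table:
#
# from pyspark.sql.types import ArrayType, FloatType, LongType, StringType
#
# CORTEX_TYPE_TO_SPARK_TYPE = {
#     "INT_COLUMN": LongType(),
#     "FLOAT_COLUMN": FloatType(),
#     "STRING_COLUMN": StringType(),
#     "INT_LIST_COLUMN": ArrayType(LongType(), True),
#     "FLOAT_LIST_COLUMN": ArrayType(FloatType(), True),
#     "STRING_LIST_COLUMN": ArrayType(StringType(), True),
# }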
def test_read_csv_infer_invalid(spark, write_csv_file, ctx_obj, get_context):
    test_cases = [
        # a_int is declared INT_COLUMN but the first column holds strings
        {
            "csv": ["a,0.1,", "a,0.1,1", "a,1.1,4"],
            "schema": [add_res_ref("a_int"), add_res_ref("b_float"), add_res_ref("c_long")],
            "raw_columns": {
                "a_int": {"name": "a_int", "type": "INT_COLUMN", "required": True, "id": "-"},
                "b_float": {"name": "b_float", "type": "INFERRED_COLUMN", "required": True, "id": "-"},
                "c_long": {"name": "c_long", "type": "INFERRED_COLUMN", "required": True, "id": "-"},
            },
        },
        # b_int is declared INT_COLUMN but the second column holds floats
        {
            "csv": ["a,1.1,", "a,1.1,1", "a,1.1,4"],
            "schema": [add_res_ref("a_str"), add_res_ref("b_int"), add_res_ref("c_int")],
            "raw_columns": {
                "a_str": {"name": "a_str", "type": "INFERRED_COLUMN", "required": True, "id": "-"},
                "b_int": {"name": "b_int", "type": "INT_COLUMN", "required": True, "id": "-"},
                "c_int": {"name": "c_int", "type": "INFERRED_COLUMN", "required": True, "id": "-"},
            },
        },
    ]

    for test in test_cases:
        csv_str = "\n".join(test["csv"])
        path_to_file = write_csv_file(csv_str)

        ctx_obj["environment"] = {
            "data": {"type": "csv", "path": path_to_file, "schema": test["schema"]}
        }
        ctx_obj["raw_columns"] = test["raw_columns"]

        with pytest.raises(UserException):
            spark_util.ingest(get_context(ctx_obj), spark).collect()
def test_ingest_parquet_failed_requirements(
    capsys, spark, write_parquet_file, ctx_obj, get_context
):
    data = [(None, 0.1, None), ("b", 1.0, None), ("c", 1.1, 4)]

    schema = StructType(
        [
            StructField("a_str", StringType()),
            StructField("b_float", FloatType()),
            StructField("c_long", LongType()),
        ]
    )

    path_to_file = write_parquet_file(spark, data, schema)

    ctx_obj["environment"] = {
        "data": {
            "type": "parquet",
            "path": path_to_file,
            "schema": [
                {"parquet_column_name": "a_str", "raw_column": add_res_ref("a_str")},
                {"parquet_column_name": "b_float", "raw_column": add_res_ref("b_float")},
                {"parquet_column_name": "c_long", "raw_column": add_res_ref("c_long")},
            ],
        }
    }

    ctx_obj["raw_columns"] = {
        # a_str restricts its values to {"a", "b"}; one ingested row violates this
        "a_str": {"name": "a_str", "type": "STRING_COLUMN", "values": ["a", "b"], "id": "1"},
        "b_float": {"name": "b_float", "type": "FLOAT_COLUMN", "required": True, "id": "2"},
        "c_long": {"name": "c_long", "type": "INT_COLUMN", "required": False, "id": "3"},
    }

    ctx = get_context(ctx_obj)
    df = spark_util.ingest(ctx, spark)

    validations = spark_util.value_check_data(ctx, df)
    assert validations == {"a_str": [("(a_str IN (a, b))", 1)]}
def test_read_csv_infer_type(spark, write_csv_file, ctx_obj, get_context):
    test_cases = [
        {
            "csv": ["a,0.1,", "b,0.1,1", "c,1.1,4"],
            "schema": [add_res_ref("a_str"), add_res_ref("b_float"), add_res_ref("c_long")],
            "raw_columns": {
                "a_str": {"name": "a_str", "type": "INFERRED_COLUMN", "required": True, "id": "-"},
                "b_float": {"name": "b_float", "type": "INFERRED_COLUMN", "required": True, "id": "-"},
                "c_long": {"name": "c_long", "type": "INFERRED_COLUMN", "required": True, "id": "-"},
            },
            "expected_types": {
                "a_str": StringType(),
                "b_float": FloatType(),
                "c_long": LongType(),
            },
        },
        {
            "csv": ["1,4,4.5", "1,3,1.2", "1,5,4.7"],
            "schema": [add_res_ref("a_str"), add_res_ref("b_int"), add_res_ref("c_float")],
            "raw_columns": {
                "a_str": {"name": "a_str", "type": "STRING_COLUMN", "required": True, "id": "-"},
                "b_int": {"name": "b_int", "type": "INFERRED_COLUMN", "required": True, "id": "-"},
                "c_float": {"name": "c_float", "type": "INFERRED_COLUMN", "required": True, "id": "-"},
            },
            "expected_types": {
                "a_str": StringType(),
                "b_int": LongType(),
                "c_float": FloatType(),
            },
        },
        {
            "csv": ["1,4,2017-09-16", "1,3,2017-09-16", "1,5,2017-09-16"],
            "schema": [add_res_ref("a_str"), add_res_ref("b_int"), add_res_ref("c_str")],
            "raw_columns": {
                "a_str": {"name": "a_str", "type": "STRING_COLUMN", "required": True, "id": "-"},
                "b_int": {"name": "b_int", "type": "INFERRED_COLUMN", "required": True, "id": "-"},
                "c_str": {"name": "c_str", "type": "INFERRED_COLUMN", "required": True, "id": "-"},
            },
            "expected_types": {
                "a_str": StringType(),
                "b_int": LongType(),
                "c_str": StringType(),
            },
        },
        {
            "csv": ["1,4,2017-09-16", "1,3,2017-09-16", "1,5,2017-09-16"],
            "schema": [add_res_ref("a_float"), add_res_ref("b_int"), add_res_ref("c_str")],
            "raw_columns": {
                "a_float": {"name": "a_float", "type": "FLOAT_COLUMN", "required": True, "id": "-"},
                "b_int": {"name": "b_int", "type": "INFERRED_COLUMN", "required": True, "id": "-"},
                "c_str": {"name": "c_str", "type": "INFERRED_COLUMN", "required": True, "id": "-"},
            },
            "expected_types": {
                "a_float": FloatType(),
                "b_int": LongType(),
                "c_str": StringType(),
            },
        },
    ]

    for test in test_cases:
        csv_str = "\n".join(test["csv"])
        path_to_file = write_csv_file(csv_str)

        ctx_obj["environment"] = {
            "data": {"type": "csv", "path": path_to_file, "schema": test["schema"]}
        }
        ctx_obj["raw_columns"] = test["raw_columns"]

        df = spark_util.ingest(get_context(ctx_obj), spark)

        assert df.count() == 3  # each fixture has three rows
        inferred_col_type_map = {c.name: c.dataType for c in df.schema}
        for column_name in test["expected_types"]:
            assert inferred_col_type_map[column_name] == test["expected_types"][column_name]
def test_ingest_parquet_extra_cols(spark, write_parquet_file, ctx_obj, get_context):
    data = [("a", 0.1, None), ("b", 1.0, None), ("c", 1.1, 4)]

    schema = StructType(
        [
            StructField("a_str", StringType()),
            StructField("b_float", FloatType()),
            StructField("c_long", LongType()),
        ]
    )

    path_to_file = write_parquet_file(spark, data, schema)

    # The environment schema references an extra column (d_long) that has no
    # raw column definition; ingestion should still succeed on the others.
    ctx_obj["environment"] = {
        "data": {
            "type": "parquet",
            "path": path_to_file,
            "schema": [
                {"parquet_column_name": "a_str", "raw_column": add_res_ref("a_str")},
                {"parquet_column_name": "b_float", "raw_column": add_res_ref("b_float")},
                {"parquet_column_name": "c_long", "raw_column": add_res_ref("c_long")},
                {"parquet_column_name": "d_long", "raw_column": add_res_ref("d_long")},
            ],
        }
    }

    ctx_obj["raw_columns"] = {
        "a_str": {"name": "a_str", "type": "STRING_COLUMN", "required": True, "id": "1"},
        "b_float": {"name": "b_float", "type": "FLOAT_COLUMN", "required": True, "id": "2"},
        "c_long": {"name": "c_long", "type": "INT_COLUMN", "required": False, "id": "3"},
    }

    assert spark_util.ingest(get_context(ctx_obj), spark).count() == 3