def titanic_dataset():
    df = ge.read_csv(file_relative_path(__file__, "../test_sets/Titanic.csv"))
    batch_df = PandasDataset(df)
    return batch_df
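# Usage sketch (not part of the original source): titanic_dataset() returns a
# great_expectations PandasDataset, so expectations can be declared on it
# directly. The test name is hypothetical and the expectation only assumes the
# CSV is non-empty.
def test_titanic_dataset_smoke():
    batch = titanic_dataset()
    result = batch.expect_table_row_count_to_be_between(min_value=1)
    assert result.success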
def get_dataset(dataset_type, data, schemas=None, profiler=ColumnsExistProfiler, caching=True):
    """Utility to create datasets for json-formatted tests."""
    df = pd.DataFrame(data)

    if dataset_type == "PandasDataset":
        if schemas and "pandas" in schemas:
            schema = schemas["pandas"]
            pandas_schema = {}
            for (key, value) in schema.items():
                # Note, these are just names used in our internal schemas to build datasets *for internal tests*
                # Further, changes in pandas internals around how datetimes are created mean that, to support
                # pandas pre-0.25, we need to explicitly specify when we want a timezone.
                # We will use timestamp for timezone-aware (UTC only) dates in our tests
                if value.lower() in ["timestamp", "datetime64[ns, tz]"]:
                    df[key] = pd.to_datetime(df[key], utc=True)
                    continue
                elif value.lower() in ["datetime", "datetime64", "datetime64[ns]"]:
                    df[key] = pd.to_datetime(df[key])
                    continue
                try:
                    type_ = np.dtype(value)
                except TypeError:
                    type_ = getattr(pd.core.dtypes.dtypes, value)
                    # If this raises AttributeError it's okay: it means someone built a bad test
                pandas_schema[key] = type_
            # pandas_schema = {key: np.dtype(value) for (key, value) in schemas["pandas"].items()}
            df = df.astype(pandas_schema)
        return PandasDataset(df, profiler=profiler, caching=caching)

    elif dataset_type == "sqlite":
        from sqlalchemy import create_engine

        engine = create_engine("sqlite://")
        conn = engine.connect()
        # Add the data to the database as a new table
        sql_dtypes = {}
        if schemas and "sqlite" in schemas and isinstance(engine.dialect, sqlitetypes.dialect):
            schema = schemas["sqlite"]
            sql_dtypes = {col: SQLITE_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])
        tablename = "test_data_" + "".join(
            [random.choice(string.ascii_letters + string.digits) for n in range(8)]
        )
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)
        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename, engine=conn, profiler=profiler, caching=caching)

    elif dataset_type == "postgresql":
        from sqlalchemy import create_engine

        # Create a new database
        engine = create_engine("postgresql://postgres@localhost/test_ci")
        conn = engine.connect()
        sql_dtypes = {}
        if schemas and "postgresql" in schemas and isinstance(engine.dialect, postgresqltypes.dialect):
            schema = schemas["postgresql"]
            sql_dtypes = {col: POSTGRESQL_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])
        tablename = "test_data_" + "".join(
            [random.choice(string.ascii_letters + string.digits) for n in range(8)]
        )
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)
        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename, engine=conn, profiler=profiler, caching=caching)

    elif dataset_type == "mysql":
        from sqlalchemy import create_engine

        engine = create_engine("mysql://root@localhost/test_ci")
        conn = engine.connect()
        sql_dtypes = {}
        if schemas and "mysql" in schemas and isinstance(engine.dialect, mysqltypes.dialect):
            schema = schemas["mysql"]
            sql_dtypes = {col: MYSQL_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])
        tablename = "test_data_" + "".join(
            [random.choice(string.ascii_letters + string.digits) for n in range(8)]
        )
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)
        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename, engine=conn, profiler=profiler, caching=caching)

    elif dataset_type == "SparkDFDataset":
        from pyspark.sql import SparkSession
        import pyspark.sql.types as sparktypes

        SPARK_TYPES = {
            "StringType": sparktypes.StringType,
            "IntegerType": sparktypes.IntegerType,
            "LongType": sparktypes.LongType,
            "DateType": sparktypes.DateType,
            "TimestampType": sparktypes.TimestampType,
            "FloatType": sparktypes.FloatType,
            "DoubleType": sparktypes.DoubleType,
            "BooleanType": sparktypes.BooleanType,
            "DataType": sparktypes.DataType,
            "NullType": sparktypes.NullType,
        }
        spark = SparkSession.builder.getOrCreate()
        # We need to allow null values in some column types that do not support them natively, so we skip
        # use of df in this case.
        data_reshaped = list(zip(*[v for _, v in data.items()]))  # create a list of rows
        if schemas and "spark" in schemas:
            schema = schemas["spark"]
            # sometimes first method causes Spark to throw a TypeError
            try:
                spark_schema = sparktypes.StructType(
                    [
                        sparktypes.StructField(column, SPARK_TYPES[schema[column]](), True)
                        for column in schema
                    ]
                )
                # We create these every time, which is painful for testing
                # However nuance around null treatment as well as the desire
                # for real datetime support in tests makes this necessary
                data = copy.deepcopy(data)
                if "ts" in data:
                    print(data)
                    print(schema)
                for col in schema:
                    type_ = schema[col]
                    if type_ in ["IntegerType", "LongType"]:
                        # Ints cannot be None...but None can be valid in Spark (as Null)
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(int(val))
                        data[col] = vals
                    elif type_ in ["FloatType", "DoubleType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(float(val))
                        data[col] = vals
                    elif type_ in ["DateType", "TimestampType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(parse(val))
                        data[col] = vals
                # Do this again, now that we have done type conversion using the provided schema
                data_reshaped = list(zip(*[v for _, v in data.items()]))  # create a list of rows
                spark_df = spark.createDataFrame(data_reshaped, schema=spark_schema)
            except TypeError:
                string_schema = sparktypes.StructType(
                    [sparktypes.StructField(column, sparktypes.StringType()) for column in schema]
                )
                spark_df = spark.createDataFrame(data_reshaped, string_schema)
                for c in spark_df.columns:
                    spark_df = spark_df.withColumn(c, spark_df[c].cast(SPARK_TYPES[schema[c]]()))
        elif len(data_reshaped) == 0:
            # if we have an empty dataset and no schema, need to assign an arbitrary type
            columns = list(data.keys())
            spark_schema = sparktypes.StructType(
                [sparktypes.StructField(column, sparktypes.StringType()) for column in columns]
            )
            spark_df = spark.createDataFrame(data_reshaped, spark_schema)
        else:
            # if no schema provided, uses Spark's schema inference
            columns = list(data.keys())
            spark_df = spark.createDataFrame(data_reshaped, columns)
        return SparkDFDataset(spark_df, profiler=profiler, caching=caching)

    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
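# Usage sketch (hypothetical, not part of the original source): build a small
# PandasDataset from inline data plus a "pandas" schema, mirroring how the
# json-formatted tests are expected to call get_dataset. Column names and dtype
# strings are illustrative assumptions.
def _example_pandas_dataset():
    data = {"id": [1, 2, 3], "when": ["2020-01-01", "2020-01-02", None]}
    schemas = {"pandas": {"id": "int64", "when": "datetime64[ns]"}}
    dataset = get_dataset("PandasDataset", data, schemas=schemas)
    # The returned object is a great_expectations PandasDataset, so expectations
    # can be declared on it directly.
    dataset.expect_column_values_to_not_be_null("id")
    return dataset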
def test_that_ge_pandas_datasets_are_memory_efficient(csvpath: Path):
    df = pd.read_csv(str(csvpath))
    df_ge = PandasDataset(df)
    # Wrapping the frame in a PandasDataset should reuse the underlying NumPy
    # buffers rather than copying them.
    bequals = df.values.base == df_ge.values.base
    assert bool(bequals.all())
def get_dataset(dataset_type, data, schemas=None, profiler=ColumnsExistProfiler, caching=True):
    """Utility to create datasets for json-formatted tests."""
    df = pd.DataFrame(data)
    if dataset_type == 'PandasDataset':
        if schemas and "pandas" in schemas:
            pandas_schema = {key: np.dtype(value) for (key, value) in schemas["pandas"].items()}
            df = df.astype(pandas_schema)
        return PandasDataset(df, profiler=profiler, caching=caching)
    elif dataset_type == "sqlite":
        engine = create_engine('sqlite://')
        conn = engine.connect()
        # Add the data to the database as a new table
        sql_dtypes = {}
        if schemas and "sqlite" in schemas and isinstance(engine.dialect, sqlitetypes.dialect):
            schema = schemas["sqlite"]
            sql_dtypes = {col: SQLITE_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])
        tablename = "test_data_" + ''.join(
            [random.choice(string.ascii_letters + string.digits) for n in range(8)]
        )
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)
        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename, engine=conn, profiler=profiler, caching=caching)
    elif dataset_type == 'postgresql':
        # Create a new database
        engine = create_engine('postgresql://postgres@localhost/test_ci')
        conn = engine.connect()
        sql_dtypes = {}
        if schemas and "postgresql" in schemas and isinstance(engine.dialect, postgresqltypes.dialect):
            schema = schemas["postgresql"]
            sql_dtypes = {col: POSTGRESQL_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])
        tablename = "test_data_" + ''.join(
            [random.choice(string.ascii_letters + string.digits) for n in range(8)]
        )
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)
        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename, engine=conn, profiler=profiler, caching=caching)
    elif dataset_type == 'mysql':
        engine = create_engine('mysql://root@localhost/test_ci')
        conn = engine.connect()
        sql_dtypes = {}
        if schemas and "mysql" in schemas and isinstance(engine.dialect, mysqltypes.dialect):
            schema = schemas["mysql"]
            sql_dtypes = {col: MYSQL_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])
        tablename = "test_data_" + ''.join(
            [random.choice(string.ascii_letters + string.digits) for n in range(8)]
        )
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)
        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename, engine=conn, profiler=profiler, caching=caching)
    elif dataset_type == 'SparkDFDataset':
        spark = SparkSession.builder.getOrCreate()
        # We need to allow null values in some column types that do not support them natively, so we skip
        # use of df in this case.
        data_reshaped = list(zip(*[v for _, v in data.items()]))  # create a list of rows
        if schemas and 'spark' in schemas:
            schema = schemas['spark']
            # sometimes first method causes Spark to throw a TypeError
            try:
                spark_schema = sparktypes.StructType(
                    [
                        sparktypes.StructField(column, SPARK_TYPES[schema[column]](), True)
                        for column in schema
                    ]
                )
                # We create these every time, which is painful for testing
                # However nuance around null treatment as well as the desire
                # for real datetime support in tests makes this necessary
                data = copy.deepcopy(data)
                if "ts" in data:
                    print(data)
                    print(schema)
                for col in schema:
                    type_ = schema[col]
                    if type_ in ["IntegerType", "LongType"]:
                        # Ints cannot be None...but None can be valid in Spark (as Null)
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(int(val))
                        data[col] = vals
                    elif type_ in ["FloatType", "DoubleType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(float(val))
                        data[col] = vals
                    elif type_ in ["DateType", "TimestampType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(parse(val))
                        data[col] = vals
                # Do this again, now that we have done type conversion using the provided schema
                data_reshaped = list(zip(*[v for _, v in data.items()]))  # create a list of rows
                spark_df = spark.createDataFrame(data_reshaped, schema=spark_schema)
            except TypeError:
                string_schema = sparktypes.StructType(
                    [sparktypes.StructField(column, sparktypes.StringType()) for column in schema]
                )
                spark_df = spark.createDataFrame(data_reshaped, string_schema)
                for c in spark_df.columns:
                    spark_df = spark_df.withColumn(c, spark_df[c].cast(SPARK_TYPES[schema[c]]()))
        elif len(data_reshaped) == 0:
            # if we have an empty dataset and no schema, need to assign an arbitrary type
            columns = list(data.keys())
            spark_schema = sparktypes.StructType(
                [sparktypes.StructField(column, sparktypes.StringType()) for column in columns]
            )
            spark_df = spark.createDataFrame(data_reshaped, spark_schema)
        else:
            # if no schema provided, uses Spark's schema inference
            columns = list(data.keys())
            spark_df = spark.createDataFrame(data_reshaped, columns)
        return SparkDFDataset(spark_df, profiler=profiler, caching=caching)
    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
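# Usage sketch (hypothetical, not part of the original source): the same helper
# with a "spark" schema. Type names must be keys of the SPARK_TYPES mapping
# (e.g. "IntegerType", "StringType"); column names and values are illustrative,
# and a local Spark session is assumed to be available.
def _example_spark_dataset():
    data = {"id": [1, None, 3], "label": ["a", "b", None]}
    schemas = {"spark": {"id": "IntegerType", "label": "StringType"}}
    return get_dataset("SparkDFDataset", data, schemas=schemas)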
def test_validation_with_ge(feast_client: Client, kafka_server, pytestconfig):
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    entity, feature_table = create_schema(kafka_broker, topic_name, "validation_ge")
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations, feature_table)
    apply_validation(feast_client, feature_table, udf, validation_window_secs=1)

    job = start_job(feast_client, feature_table, pytestconfig)

    wait_retry_backoff(lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 300)

    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    invalid_idx = list(
        {idx for check in validation_result.results for idx in check.result["unexpected_index_list"]}
    )

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_ge:num", "validation_ge:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        stop_job(job, feast_client, feature_table)

    test_data["num"] = test_data["num"].astype(np.float64)
    test_data["num"].iloc[invalid_idx] = np.nan
    test_data["set"].iloc[invalid_idx] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_ge:num", "validation_ge:set"]],
        test_data[["key", "num", "set"]].rename(
            columns={"num": "validation_ge:num", "set": "validation_ge:set"}
        ),
    )
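# Condensed sketch (assumption, not from the original source) of the expectation
# pattern used by the streaming tests above: build a suite on training data,
# validate test data with result_format="COMPLETE", and collect the offending
# row indices from unexpected_index_list. The inline frames stand in for
# generate_train_data()/generate_test_data(), whose exact contents are not shown.
def _example_expectation_roundtrip():
    train = pd.DataFrame({"num": [10, 20, 30], "set": ["a", "b", "c"]})
    suite_ds = PandasDataset(train)
    suite_ds.expect_column_values_to_be_between("num", 0, 100)
    suite_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = suite_ds.get_expectation_suite()

    test = pd.DataFrame({"num": [50, 500], "set": ["a", "z"]})
    result = PandasDataset(test).validate(expectations, result_format="COMPLETE")
    invalid_idx = {
        idx
        for check in result.results
        for idx in check.result["unexpected_index_list"]
    }
    return invalid_idx  # here: {1}, so one row would be dropped before ingestion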
def test_validation_reports_metrics(
    feast_client: Client, kafka_server, statsd_server: StatsDServer, pytestconfig
):
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    entity, feature_table = create_schema(kafka_broker, topic_name, "validation_ge_metrics")
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations, feature_table)
    apply_validation(feast_client, feature_table, udf, validation_window_secs=10)

    job = start_job(feast_client, feature_table, pytestconfig)

    wait_retry_backoff(lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 300)

    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    unexpected_counts = {
        "expect_column_values_to_be_between_num_0_100": validation_result.results[0].result[
            "unexpected_count"
        ],
        "expect_column_values_to_be_in_set_set": validation_result.results[1].result[
            "unexpected_count"
        ],
    }
    invalid_idx = list(
        {idx for check in validation_result.results for idx in check.result["unexpected_index_list"]}
    )

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_ge_metrics:num", "validation_ge_metrics:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        stop_job(job, feast_client, feature_table)

    expected_metrics = [
        (
            f"feast_feature_validation_check_failed#check:{check_name},"
            f"feature_table:{feature_table.name},project:{feast_client.project}",
            value,
        )
        for check_name, value in unexpected_counts.items()
    ]
    wait_retry_backoff(
        lambda: (
            None,
            all(statsd_server.metrics.get(m) == v for m, v in expected_metrics),
        ),
        timeout_secs=30,
        timeout_msg="Expected metrics were not received: " + str(expected_metrics) + "\n"
        "Actual received metrics: " + str(statsd_server.metrics),
    )
def test_validation_with_ge(feast_client: Client, kafka_server):
    entity = Entity(name="key", description="Key", value_type=ValueType.INT64)
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="validation_test",
        entities=["key"],
        features=[Feature("num", ValueType.INT64), Feature("set", ValueType.STRING)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url="/dev/null",
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations)
    apply_validation(feast_client, feature_table, udf, validation_window_secs=1)

    job = feast_client.start_stream_to_online_ingestion(feature_table)
    wait_retry_backoff(lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120)
    wait_retry_backoff(lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120)

    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    invalid_idx = list(
        {idx for check in validation_result.results for idx in check.result["unexpected_index_list"]}
    )

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_test:num", "validation_test:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    test_data["num"] = test_data["num"].astype(np.float64)
    test_data["num"].iloc[invalid_idx] = np.nan
    test_data["set"].iloc[invalid_idx] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_test:num", "validation_test:set"]],
        test_data[["key", "num", "set"]].rename(
            columns={"num": "validation_test:num", "set": "validation_test:set"}
        ),
    )