# Imports assumed for this e2e test module (Feast 0.9-era SDK paths). Local
# helpers such as avro_schema, check_consumer_exist, create_schema,
# generate_data, generate_train_data, generate_test_data, ingest_and_retrieve
# and StatsDServer are assumed to come from the surrounding test utilities.
import os
import uuid

import numpy as np
import pandas as pd
from great_expectations.dataset import PandasDataset

from feast import Client, Entity, Feature, FeatureTable, ValueType
from feast.contrib.validation.ge import apply_validation, create_validation_udf
from feast.data_format import AvroFormat, ParquetFormat
from feast.data_source import FileSource, KafkaSource
from feast.pyspark.abc import SparkJobStatus
from feast.wait import wait_retry_backoff


def test_streaming_ingestion(
    feast_client: Client, local_staging_path: str, kafka_server, pytestconfig
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    # Register an entity and a feature table with both a batch (file) source
    # and a stream (Kafka, Avro-encoded) source.
    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "batch-storage"),
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )
    feast_client.apply(entity)
    feast_client.apply(feature_table)

    # Unless a scheduled streaming job is expected, start the stream-to-online
    # ingestion job explicitly and wait until it is running.
    if not pytestconfig.getoption("scheduled_streaming_job"):
        job = feast_client.start_stream_to_online_ingestion(feature_table)
        assert job.get_feature_table() == feature_table.name
        wait_retry_backoff(
            lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120
        )
    else:
        job = None

    # Wait until the ingestion job's consumer is subscribed to the test topic.
    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 300
    )

    test_data = generate_data()[["s2id", "unique_drivers", "event_timestamp"]]

    try:
        # Push Avro-encoded rows to Kafka, then read them back from the online store.
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=[{"s2id": s2_id} for s2_id in test_data["s2id"].tolist()],
            feature_names=["drivers_stream:unique_drivers"],
        )
    finally:
        if job:
            job.cancel()
        else:
            feast_client.delete_feature_table(feature_table.name)

    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers_stream:unique_drivers"]],
        test_data[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": "drivers_stream:unique_drivers"}
        ),
    )
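# ingest_and_retrieve is a local test helper; on the produce side it is assumed
# to Avro-encode each DataFrame row and publish it to the test topic before
# polling the online store. A minimal sketch of that produce step, using
# fastavro and kafka-python (the helper name produce_avro_rows and the choice
# of these libraries are assumptions for illustration, not the actual utility):
import io
import json

import fastavro
import pandas as pd
from kafka import KafkaProducer


def produce_avro_rows(
    df: pd.DataFrame, avro_schema_json: str, topic_name: str, kafka_broker: str
) -> None:
    parsed_schema = fastavro.parse_schema(json.loads(avro_schema_json))
    producer = KafkaProducer(bootstrap_servers=kafka_broker)
    # event_timestamp is assumed to be datetime-like so fastavro can encode it
    # as the schema's timestamp logical type.
    for record in df.to_dict(orient="records"):
        buf = io.BytesIO()
        fastavro.schemaless_writer(buf, parsed_schema, record)
        producer.send(topic_name, buf.getvalue())
    producer.flush()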
def test_validation_reports_metrics(
    feast_client: Client, kafka_server, statsd_server: StatsDServer
):
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    entity, feature_table = create_schema(
        kafka_broker, topic_name, "validation_ge_metrics"
    )
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    # Build an expectation suite from the training data and attach it to the
    # feature table as a validation UDF.
    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations, feature_table)
    apply_validation(feast_client, feature_table, udf, validation_window_secs=10)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120
    )
    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120
    )

    # Validate the test data locally to know the expected per-check failure
    # counts and which rows should be rejected during ingestion.
    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    unexpected_counts = {
        "expect_column_values_to_be_between_num_0_100": validation_result.results[
            0
        ].result["unexpected_count"],
        "expect_column_values_to_be_in_set_set": validation_result.results[1].result[
            "unexpected_count"
        ],
    }
    invalid_idx = list(
        {
            idx
            for check in validation_result.results
            for idx in check.result["unexpected_index_list"]
        }
    )

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_ge_metrics:num", "validation_ge_metrics:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    # Each failed check should be reported to StatsD as a
    # feast_feature_validation_check_failed counter tagged with the check name,
    # feature table, and project.
    expected_metrics = [
        (
            f"feast_feature_validation_check_failed#check:{check_name},"
            f"feature_table:{feature_table.name},project:{feast_client.project}",
            value,
        )
        for check_name, value in unexpected_counts.items()
    ]
    wait_retry_backoff(
        lambda: (
            None,
            all(statsd_server.metrics.get(m) == v for m, v in expected_metrics),
        ),
        timeout_secs=30,
        timeout_msg="Expected metrics were not received: "
        + str(expected_metrics)
        + "\nActual received metrics: "
        + str(statsd_server.metrics),
    )
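# generate_train_data() and generate_test_data() are local helpers whose exact
# contents are defined elsewhere in the test utilities. A plausible minimal
# version (assumed, not the real helpers) that fits the expectations above:
# training data stays inside the allowed ranges, while the test data includes
# some out-of-range rows so a known subset fails validation.
from datetime import datetime, timezone

import numpy as np
import pandas as pd


def generate_train_data_sketch(rows: int = 100) -> pd.DataFrame:
    return pd.DataFrame(
        {
            "key": np.arange(rows),
            "num": np.random.randint(0, 100, size=rows),
            "set": np.random.choice(["a", "b", "c"], size=rows),
            "event_timestamp": pd.Timestamp(datetime.now(tz=timezone.utc)),
        }
    )


def generate_test_data_sketch(rows: int = 100) -> pd.DataFrame:
    df = generate_train_data_sketch(rows)
    # Push some rows outside the expected ranges so the validation UDF drops them.
    df.loc[: rows // 10, "num"] = 150
    df.loc[rows // 10 : rows // 5, "set"] = "z"
    return df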
def test_validation_with_ge(feast_client: Client, kafka_server):
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    entity, feature_table = create_schema(kafka_broker, topic_name, "validation_ge")
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations, feature_table)
    apply_validation(feast_client, feature_table, udf, validation_window_secs=1)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120
    )
    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120
    )

    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    invalid_idx = list(
        {
            idx
            for check in validation_result.results
            for idx in check.result["unexpected_index_list"]
        }
    )

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_ge:num", "validation_ge:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    test_data["num"] = test_data["num"].astype(np.float64)
    test_data["num"].iloc[invalid_idx] = np.nan
    test_data["set"].iloc[invalid_idx] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_ge:num", "validation_ge:set"]],
        test_data[["key", "num", "set"]].rename(
            columns={"num": "validation_ge:num", "set": "validation_ge:set"}
        ),
    )
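# wait_retry_backoff comes from the Feast SDK (feast.wait); the sketch below
# only illustrates the contract the tests rely on, not the SDK implementation:
# retry_fn returns a (result, is_done) tuple and is polled with a growing delay
# until is_done is True or timeout_secs elapses.
import time


def wait_retry_backoff_sketch(retry_fn, timeout_secs, timeout_msg="timed out"):
    delay = 1
    deadline = time.time() + timeout_secs
    while time.time() < deadline:
        result, is_done = retry_fn()
        if is_done:
            return result
        time.sleep(delay)
        delay = min(delay * 2, 10)
    raise TimeoutError(timeout_msg)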
def test_validation_with_ge(feast_client: Client, kafka_server):
    entity = Entity(name="key", description="Key", value_type=ValueType.INT64)
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="validation_test",
        entities=["key"],
        features=[Feature("num", ValueType.INT64), Feature("set", ValueType.STRING)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url="/dev/null",
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations)
    apply_validation(feast_client, feature_table, udf, validation_window_secs=1)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120
    )
    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120
    )

    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    invalid_idx = list(
        {
            idx
            for check in validation_result.results
            for idx in check.result["unexpected_index_list"]
        }
    )

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_test:num", "validation_test:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    test_data["num"] = test_data["num"].astype(np.float64)
    test_data["num"].iloc[invalid_idx] = np.nan
    test_data["set"].iloc[invalid_idx] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_test:num", "validation_test:set"]],
        test_data[["key", "num", "set"]].rename(
            columns={"num": "validation_test:num", "set": "validation_test:set"}
        ),
    )
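# avro_schema() is a local test helper; for the validation tests above, a
# schema compatible with the "key"/"num"/"set"/"event_timestamp" columns could
# look like the sketch below (assumed for illustration, not the real helper).
import json


def avro_schema_sketch() -> str:
    return json.dumps(
        {
            "type": "record",
            "name": "TestMessage",
            "fields": [
                {"name": "key", "type": "long"},
                {"name": "num", "type": "long"},
                {"name": "set", "type": "string"},
                {
                    "name": "event_timestamp",
                    "type": {"type": "long", "logicalType": "timestamp-micros"},
                },
            ],
        }
    )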