def test_offline_ingestion(feast_client: Client, staging_path: str):
    """Ingest generated data offline, sync it to the online store and verify it.

    Registers the ``s2id`` entity and the ``drivers`` feature table (backed by
    a Parquet file source located under ``staging_path``), writes the generated
    frame to batch storage, runs an offline-to-online ingestion job covering
    the next 24 hours, and finally compares the ``unique_drivers`` values
    served online with the generated ones.

    Args:
        feast_client: Client used to register the schema, ingest and retrieve.
        staging_path: Base location under which the batch storage is created.
    """
    entity = Entity(name="s2id", description="S2id", value_type=ValueType.INT64)

    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            "event_timestamp",
            "event_timestamp",
            ParquetFormat(),
            os.path.join(staging_path, "batch-storage"),
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table, original)  # write to batch (offline) storage

    job = feast_client.start_offline_to_online_ingestion(
        feature_table, datetime.today(), datetime.today() + timedelta(days=1)
    )

    def poll_job_status():
        # Read the status exactly once per poll. Querying it twice (once for
        # the returned value, once for the "finished" flag) lets the job change
        # state between the two reads, so the wait could hand back IN_PROGRESS
        # for a job that has actually completed and fail the assert below.
        current = job.get_status()
        return current, current != SparkJobStatus.IN_PROGRESS

    status = wait_retry_backoff(poll_job_status, 300)

    assert status == SparkJobStatus.COMPLETED

    features = feast_client.get_online_features(
        ["drivers:unique_drivers"],
        entity_rows=[{"s2id": s2_id} for s2_id in original["s2id"].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers:unique_drivers"]],
        original[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": "drivers:unique_drivers"}
        ),
    )
def test_historical_features(
    feast_client: Client, batch_source: Union[BigQuerySource, FileSource]
):
    """Check the point-in-time join returned by historical feature retrieval.

    Registers the ``user_id`` entity and a ``transactions`` feature table with
    a two-day ``max_age`` on top of ``batch_source``, ingests generated
    transactions and requests ``transactions:daily_transactions`` for the
    generated customer rows. The expected frame pairs the ingested values with
    the first part of the customer rows and ``None`` with the remainder.

    Args:
        feast_client: Client used to register the schema, ingest and retrieve.
        batch_source: Offline source the feature table reads from.
    """
    feast_client.apply_entity(
        Entity(name="user_id", description="Customer", value_type=ValueType.INT64)
    )

    two_days = Duration()
    two_days.FromSeconds(2 * 86400)

    table_features = [
        Feature("daily_transactions", ValueType.DOUBLE),
        Feature("total_transactions", ValueType.DOUBLE),
    ]
    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=table_features,
        batch_source=batch_source,
        max_age=two_days,
    )
    feast_client.apply_feature_table(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    retrieval_job = feast_client.get_historical_features(
        ["transactions:daily_transactions"], customers_df
    )
    joined_df = read_parquet(retrieval_job.get_output_file_uri())

    # One expected feature value per ingested transaction, followed by the
    # same number of missing values.
    expected_values = transactions_df.daily_transactions.tolist()
    expected_values = expected_values + [None] * transactions_df.shape[0]
    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": customers_df.event_timestamp.tolist(),
            "user_id": customers_df.user_id.tolist(),
            "transactions__daily_transactions": expected_values,
        }
    )

    sort_keys = ["user_id", "event_timestamp"]

    def in_canonical_order(frame):
        # Row order of the job output is not relied upon; sort before comparing.
        return frame.sort_values(by=sort_keys).reset_index(drop=True)

    assert_frame_equal(
        in_canonical_order(joined_df), in_canonical_order(expected_joined_df)
    )
def test_offline_ingestion_from_bq_view(pytestconfig, bq_dataset, feast_client: Client):
    """Run offline ingestion with a BigQuery *view* as the batch source.

    Loads the generated frame into a fresh BigQuery table, creates a view that
    selects everything from that table, registers a ``bq_ingestion`` feature
    table pointing at the view, and delegates the ingestion and verification
    to ``ingest_and_verify``.

    Args:
        pytestconfig: pytest config object; supplies the ``bq_project`` option.
        bq_dataset: Name of the BigQuery dataset in which objects are created.
        feast_client: Client used to register the schema and run ingestion.
    """
    original = generate_data()

    bq_project = pytestconfig.getoption("bq_project")
    bq_client = bigquery.Client(project=bq_project)

    # The timestamp suffix uses "%S" (zero-padded seconds). The previous "%s"
    # is not a documented Python format code: it is a platform extension of
    # the C library (epoch seconds on glibc) and is unsupported elsewhere.
    source_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_source_{datetime.now():%Y%m%d%H%M%S}",
    )
    # .result() blocks until the load job has finished.
    bq_client.load_table_from_dataframe(original, source_ref).result()

    view_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_view_{datetime.now():%Y%m%d%H%M%S}",
    )
    view = bigquery.Table(view_ref)
    view.view_query = f"select * from `{source_ref.project}.{source_ref.dataset_id}.{source_ref.table_id}`"
    bq_client.create_table(view)

    entity = Entity(name="s2id", description="S2id", value_type=ValueType.INT64)
    feature_table = FeatureTable(
        name="bq_ingestion",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=BigQuerySource(
            event_timestamp_column="event_timestamp",
            table_ref=f"{view_ref.project}:{view_ref.dataset_id}.{view_ref.table_id}",
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    ingest_and_verify(feast_client, feature_table, original)
def test_offline_ingestion(
    feast_client: Client, batch_source: Union[BigQuerySource, FileSource]
):
    """Ingest generated data into ``batch_source`` and verify it end to end.

    Registers the ``s2id`` entity and the ``drivers`` feature table, writes the
    generated frame to batch storage and hands over to ``ingest_and_verify``
    for the remaining ingestion and checks.

    Args:
        feast_client: Client used to register the schema and ingest data.
        batch_source: Offline source the feature table reads from.
    """
    s2id_entity = Entity(
        name="s2id", description="S2id", value_type=ValueType.INT64
    )
    drivers_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply_entity(s2id_entity)
    feast_client.apply_feature_table(drivers_table)

    generated = generate_data()
    # Write to batch (offline) storage before the verification helper runs.
    feast_client.ingest(drivers_table, generated)

    ingest_and_verify(feast_client, drivers_table, generated)
def test_streaming_ingestion(feast_client: Client, local_staging_path: str, kafka_server):
    """Stream Avro records through Kafka and verify them in the online store.

    Registers the ``s2id`` entity and a ``drivers_stream`` feature table with a
    Kafka stream source on a freshly named topic, starts the stream-to-online
    ingestion job, waits until the job is in progress and a consumer exists on
    the topic, publishes the generated rows and polls the online store until
    no feature value is missing. The job is cancelled before the final
    comparison, whether or not publishing and polling succeeded.

    Args:
        feast_client: Client used to register the schema, start the job and
            retrieve online features.
        local_staging_path: Base location used for the batch file source.
        kafka_server: Indexable pair whose first two items are formatted as
            ``host:port`` to build the broker address.
    """
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    entity = Entity(name="s2id", description="S2id", value_type=ValueType.INT64)

    offline_source = FileSource(
        event_timestamp_column="event_timestamp",
        created_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=os.path.join(local_staging_path, "batch-storage"),
    )
    kafka_source = KafkaSource(
        "event_timestamp",
        "event_timestamp",
        kafka_broker,
        AvroFormat(avro_schema()),
        topic=topic_name,
    )
    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=offline_source,
        stream_source=kafka_source,
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    def job_is_running():
        return None, job.get_status() == SparkJobStatus.IN_PROGRESS

    def consumer_is_attached():
        return None, check_consumer_exist(kafka_broker, topic_name)

    wait_retry_backoff(job_is_running, 60)
    wait_retry_backoff(consumer_is_attached, 60)

    try:
        original = generate_data()[["s2id", "unique_drivers", "event_timestamp"]]

        for row in original.to_dict("records"):
            # Attach UTC tzinfo to the timestamp before the record is sent.
            naive_ts = row["event_timestamp"].to_pydatetime()
            row["event_timestamp"] = naive_ts.replace(tzinfo=pytz.utc)

            send_avro_record_to_kafka(
                topic_name,
                row,
                bootstrap_servers=kafka_broker,
                avro_schema_json=avro_schema(),
            )

        def fetch_online_rows():
            lookup_keys = [{"s2id": s2_id} for s2_id in original["s2id"].tolist()]
            response = feast_client.get_online_features(
                ["drivers_stream:unique_drivers"], entity_rows=lookup_keys
            ).to_dict()
            frame = pd.DataFrame.from_dict(response)
            # Finished only once every requested row carries a feature value.
            has_missing = frame["drivers_stream:unique_drivers"].isna().any()
            return frame, not has_missing

        ingested = wait_retry_backoff(fetch_online_rows, 60)
    finally:
        job.cancel()

    expected = original[["s2id", "unique_drivers"]].rename(
        columns={"unique_drivers": "drivers_stream:unique_drivers"}
    )
    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers_stream:unique_drivers"]], expected
    )
def test_historical_features(feast_client: Client, local_staging_path: str):
    """Check historical retrieval inside and outside the ``max_age`` window.

    Registers the ``user_id`` entity and a ``transactions`` feature table with
    a two-day ``max_age`` backed by a Parquet file source, ingests one
    transaction row per customer dated two days before the retrieval date, and
    requests ``transactions:daily_transactions`` for every customer twice:
    once at the retrieval date and once a day later. The expected frame holds
    the ingested value for the first request and ``None`` for the second.

    Args:
        feast_client: Client used to register the schema, ingest and retrieve.
        local_staging_path: Base location used for the batch file source.
    """
    feast_client.apply_entity(
        Entity(name="user_id", description="Customer", value_type=ValueType.INT64)
    )

    two_days = Duration()
    two_days.FromSeconds(2 * 86400)

    parquet_source = FileSource(
        "event_timestamp",
        "created_timestamp",
        ParquetFormat(),
        os.path.join(local_staging_path, "transactions"),
    )
    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=parquet_source,
        max_age=two_days,
    )
    feast_client.apply_feature_table(transactions_feature_table)

    # Naive timestamp for midnight of the current UTC day.
    retrieval_date = datetime.utcnow().replace(
        hour=0, minute=0, second=0, microsecond=0, tzinfo=None
    )
    retrieval_outside_max_age_date = retrieval_date + timedelta(1)
    event_date = retrieval_date - timedelta(2)
    creation_date = retrieval_date - timedelta(1)

    customers = [1001, 1002, 1003, 1004, 1005]
    customer_count = len(customers)
    daily_transactions = [np.random.rand() * 10 for _ in customers]
    total_transactions = [np.random.rand() * 100 for _ in customers]

    transactions_df = pd.DataFrame(
        {
            "event_timestamp": [event_date] * customer_count,
            "created_timestamp": [creation_date] * customer_count,
            "user_id": customers,
            "daily_transactions": daily_transactions,
            "total_transactions": total_transactions,
        }
    )
    feast_client.ingest(transactions_feature_table, transactions_df)

    # Every customer is requested at both timestamps.
    request_timestamps = [retrieval_date] * customer_count + [
        retrieval_outside_max_age_date
    ] * customer_count
    request_user_ids = customers + customers

    customer_df = pd.DataFrame(
        {"event_timestamp": request_timestamps, "user_id": request_user_ids}
    )

    retrieval_job = feast_client.get_historical_features(
        ["transactions:daily_transactions"], customer_df
    )
    joined_df = read_parquet(retrieval_job.get_output_file_uri())

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": request_timestamps,
            "user_id": request_user_ids,
            "transactions__daily_transactions": daily_transactions
            + [None] * customer_count,
        }
    )

    sort_keys = ["user_id", "event_timestamp"]
    actual_sorted = joined_df.sort_values(by=sort_keys).reset_index(drop=True)
    expected_sorted = expected_joined_df.sort_values(by=sort_keys).reset_index(
        drop=True
    )
    assert_frame_equal(actual_sorted, expected_sorted)
def test_validation_with_ge(feast_client: Client, kafka_server):
    """Verify streaming ingestion when an expectation-suite validation is applied.

    Builds an expectation suite from the training data (``num`` between 0 and
    100, ``set`` within ``a``/``b``/``c``), attaches it to the ``validation_ge``
    feature table as a validation UDF, starts the streaming job and publishes
    the test data. The retrieved frame is compared against the test data with
    ``num`` and ``set`` blanked out on every row that fails an expectation.

    Args:
        feast_client: Client used to register the schema, start the job and
            retrieve online features.
        kafka_server: Indexable pair whose first two items are formatted as
            ``host:port`` to build the broker address.
    """
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    entity, feature_table = create_schema(kafka_broker, topic_name, "validation_ge")
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations, feature_table)
    apply_validation(feast_client, feature_table, udf, validation_window_secs=1)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120
    )
    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120
    )

    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    # Union of the rows reported by any check; the set removes duplicates for
    # rows that fail more than one expectation.
    invalid_idx = list(
        {
            idx
            for check in validation_result.results
            for idx in check.result["unexpected_index_list"]
        }
    )

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_ge:num", "validation_ge:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    # NaN cannot be stored in an integer column, hence the cast to float first.
    test_data["num"] = test_data["num"].astype(np.float64)
    # Blank the invalid rows with one positional assignment on the frame.
    # The former chained form (test_data["num"].iloc[...] = ...) writes to an
    # intermediate Series: it raises SettingWithCopyWarning and is a silent
    # no-op under pandas copy-on-write, leaving the expected frame unmodified.
    num_pos = test_data.columns.get_loc("num")
    set_pos = test_data.columns.get_loc("set")
    test_data.iloc[invalid_idx, num_pos] = np.nan
    test_data.iloc[invalid_idx, set_pos] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_ge:num", "validation_ge:set"]],
        test_data[["key", "num", "set"]].rename(
            columns={"num": "validation_ge:num", "set": "validation_ge:set"}
        ),
    )
def test_validation_reports_metrics(feast_client: Client, kafka_server, statsd_server: StatsDServer):
    """Verify that failed validation checks are reported as StatsD metrics.

    Builds an expectation suite from the training data, attaches it to the
    ``validation_ge_metrics`` feature table as a validation UDF, starts the
    streaming job and publishes the test data. After the job is cancelled the
    test waits until ``statsd_server.metrics`` holds, for each expectation, a
    ``feast_feature_validation_check_failed`` entry equal to the number of
    unexpected values found by validating the test data locally.

    Args:
        feast_client: Client used to register the schema and start the job.
        kafka_server: Indexable pair whose first two items are formatted as
            ``host:port`` to build the broker address.
        statsd_server: Server fixture whose ``metrics`` mapping is inspected.
    """
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    entity, feature_table = create_schema(
        kafka_broker, topic_name, "validation_ge_metrics"
    )
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    training_ds = PandasDataset(generate_train_data())
    training_ds.expect_column_values_to_be_between("num", 0, 100)
    training_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = training_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations, feature_table)
    apply_validation(feast_client, feature_table, udf, validation_window_secs=10)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    def job_is_running():
        return None, job.get_status() == SparkJobStatus.IN_PROGRESS

    def consumer_is_attached():
        return None, check_consumer_exist(kafka_broker, topic_name)

    wait_retry_backoff(job_is_running, 120)
    wait_retry_backoff(consumer_is_attached, 120)

    test_data = generate_test_data()
    validation_result = PandasDataset(test_data).validate(
        expectations, result_format="COMPLETE"
    )
    check_results = validation_result.results

    # Results are read by position: the first is paired with the "between"
    # check name and the second with the "in set" check name.
    unexpected_counts = {
        "expect_column_values_to_be_between_num_0_100": check_results[0].result[
            "unexpected_count"
        ],
        "expect_column_values_to_be_in_set_set": check_results[1].result[
            "unexpected_count"
        ],
    }

    # Distinct rows that fail at least one check.
    failing_rows = set()
    for check in check_results:
        failing_rows.update(check.result["unexpected_index_list"])
    invalid_idx = list(failing_rows)

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_ge_metrics:num", "validation_ge_metrics:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    expected_metrics = []
    for check_name, value in unexpected_counts.items():
        metric_name = (
            f"feast_feature_validation_check_failed#check:{check_name},"
            f"feature_table:{feature_table.name},project:{feast_client.project}"
        )
        expected_metrics.append((metric_name, value))

    def all_metrics_received():
        received = all(
            statsd_server.metrics.get(metric) == count
            for metric, count in expected_metrics
        )
        return None, received

    # The message is built before the wait starts, so the "actual" part shows
    # the metrics as they were at this point.
    failure_message = (
        "Expected metrics were not received: "
        + str(expected_metrics)
        + "\n"
        "Actual received metrics" + str(statsd_server.metrics)
    )

    wait_retry_backoff(
        all_metrics_received,
        timeout_secs=30,
        timeout_msg=failure_message,
    )
def test_validation_with_ge(feast_client: Client, kafka_server):
    """Verify streaming ingestion when an expectation-suite validation is applied.

    Registers the ``key`` entity and a ``validation_test`` feature table with a
    Kafka stream source, builds an expectation suite from the training data
    (``num`` between 0 and 100, ``set`` within ``a``/``b``/``c``) and attaches
    it as a validation UDF. After starting the streaming job and publishing the
    test data, the retrieved frame is compared against the test data with
    ``num`` and ``set`` blanked out on every row that fails an expectation.

    Args:
        feast_client: Client used to register the schema, start the job and
            retrieve online features.
        kafka_server: Indexable pair whose first two items are formatted as
            ``host:port`` to build the broker address.
    """
    entity = Entity(name="key", description="Key", value_type=ValueType.INT64)
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="validation_test",
        entities=["key"],
        features=[
            Feature("num", ValueType.INT64),
            Feature("set", ValueType.STRING),
        ],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url="/dev/null",
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations)
    apply_validation(feast_client, feature_table, udf, validation_window_secs=1)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120
    )
    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120
    )

    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    # Union of the rows reported by any check; the set removes duplicates for
    # rows that fail more than one expectation.
    invalid_idx = list(
        {
            idx
            for check in validation_result.results
            for idx in check.result["unexpected_index_list"]
        }
    )

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_test:num", "validation_test:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    # NaN cannot be stored in an integer column, hence the cast to float first.
    test_data["num"] = test_data["num"].astype(np.float64)
    # Blank the invalid rows with one positional assignment on the frame.
    # The former chained form (test_data["num"].iloc[...] = ...) writes to an
    # intermediate Series: it raises SettingWithCopyWarning and is a silent
    # no-op under pandas copy-on-write, leaving the expected frame unmodified.
    num_pos = test_data.columns.get_loc("num")
    set_pos = test_data.columns.get_loc("set")
    test_data.iloc[invalid_idx, num_pos] = np.nan
    test_data.iloc[invalid_idx, set_pos] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_test:num", "validation_test:set"]],
        test_data[["key", "num", "set"]].rename(
            columns={"num": "validation_test:num", "set": "validation_test:set"}
        ),
    )