def stop_job(job, feast_client: Client, feature_table: FeatureTable):
    """Tear down streaming ingestion for ``feature_table``.

    Args:
        job: Handle of a directly started ingestion job, or a falsy value
            (e.g. ``None``) when no such handle is available.
        feast_client: Client used to delete the feature table when there is
            no job handle to cancel.
        feature_table: Feature table whose ingestion should be stopped.
    """
    # No handle to cancel: remove the feature table by name instead.
    if not job:
        feast_client.delete_feature_table(feature_table.name)
        return

    job.cancel()
def test_streaming_ingestion(
    feast_client: Client, local_staging_path: str, kafka_server, pytestconfig
):
    """Check that rows published to Kafka can be read back as online features.

    The test registers an ``s2id`` entity and a ``drivers_stream`` feature
    table backed by a uniquely named Kafka topic, makes sure streaming
    ingestion is running, pushes generated rows through
    ``ingest_and_retrieve`` and compares the retrieved frame with the input.

    Args:
        feast_client: Client used to register objects and control ingestion.
        local_staging_path: Base path for the (unused here) batch source.
        kafka_server: Indexable pair; element 0 and 1 are joined as
            ``host:port`` to form the broker address.
        pytestconfig: Pytest config; the ``scheduled_streaming_job`` option
            selects whether the job is started by this test or externally.
    """
    entity = Entity(name="s2id", description="S2id", value_type=ValueType.INT64,)
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    # A random topic per run keeps repeated or parallel runs isolated.
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "batch-storage"),
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    scheduled = pytestconfig.getoption("scheduled_streaming_job")
    # With the scheduled option set, this test does not start a job itself and
    # has no handle to cancel (presumably an external scheduler starts it --
    # TODO confirm).
    job = (
        None
        if scheduled
        else feast_client.start_stream_to_online_ingestion(feature_table)
    )

    # Everything after the job/table exists runs under ``finally`` so that a
    # failed readiness check or assertion cannot leave the job (or the feature
    # table, in scheduled mode) behind.
    try:
        if not scheduled:
            assert job.get_feature_table() == feature_table.name
            # NOTE(review): 120 looks like a timeout in seconds -- confirm
            # against wait_retry_backoff.
            wait_retry_backoff(
                lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120
            )
        else:
            # No job handle to poll; wait for a consumer on the topic instead.
            wait_retry_backoff(
                lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 300
            )

        test_data = generate_data()[["s2id", "unique_drivers", "event_timestamp"]]

        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=[{"s2id": s2_id} for s2_id in test_data["s2id"].tolist()],
            feature_names=["drivers_stream:unique_drivers"],
        )
    finally:
        # Shared cleanup: cancel the job if we own one, else drop the table.
        stop_job(job, feast_client, feature_table)

    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers_stream:unique_drivers"]],
        test_data[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": "drivers_stream:unique_drivers"}
        ),
    )