def get_avro_files(self, timeout_sec: int = int(defaults[CONFIG_TIMEOUT_KEY])):
    """
    Block until the job is done, then return the locations of its Avro
    result files.

    Args:
        timeout_sec (int):
            Maximum number of seconds to wait for the job to reach the done
            status. The value is handed to ``wait_retry_backoff`` as its
            timeout; exceeding it is expected to raise with the timeout
            message below.

    Returns:
        list: One ``urlparse`` result per entry of the job's ``file_uris``.

    Raises:
        Exception: If the job reports an error, or if the job's data format
            is not Avro.
    """

    def _poll_until_done():
        # Refresh the job state first, then report whether it has finished.
        self.reload()
        return None, self.status == JOB_STATUS_DONE

    wait_retry_backoff(
        retry_fn=_poll_until_done,
        timeout_secs=timeout_sec,
        timeout_msg="Timeout exceeded while waiting for result. Please retry "
        "this method or use a longer timeout value.",
    )

    # Surface a job-side failure before looking at the result format.
    if self.job_proto.error:
        raise Exception(self.job_proto.error)

    is_avro = self.job_proto.data_format == DATA_FORMAT_AVRO
    if not is_avro:
        raise Exception(
            "Feast only supports Avro data format for now. Please check "
            "your Feast Serving deployment."
        )

    return list(map(urlparse, self.job_proto.file_uris))
def test_ingest_into_bq(
    feast_client: Client,
    customer_entity: Entity,
    driver_entity: Entity,
    bq_dataframe: pd.DataFrame,
    bq_dataset: str,
    pytestconfig,
):
    """
    Ingest a dataframe into a BigQuery-backed feature table and verify that
    the rows read back from BigQuery equal the ingested dataframe.

    Args:
        feast_client: Client used to register objects and ingest data.
        customer_entity: Entity registered before the feature table.
        driver_entity: Entity registered before the feature table.
        bq_dataframe: Data to ingest; also the expected query result.
        bq_dataset: Name of the BigQuery dataset holding the staging table.
        pytestconfig: Pytest config; the "bq_project" option is read from it.
    """
    bq_project = pytestconfig.getoption("bq_project")
    # "%S" (zero-padded seconds) is the documented, portable directive.
    # Lowercase "%s" is a platform-specific extension that is not available
    # on every platform.
    bq_table_id = f"bq_staging_{datetime.now():%Y%m%d%H%M%S}"
    ft = FeatureTable(
        name="basic_featuretable",
        entities=["driver_id", "customer_id"],
        features=[
            Feature(name="dev_feature_float", dtype=ValueType.FLOAT),
            Feature(name="dev_feature_string", dtype=ValueType.STRING),
        ],
        max_age=Duration(seconds=3600),
        batch_source=BigQuerySource(
            table_ref=f"{bq_project}:{bq_dataset}.{bq_table_id}",
            event_timestamp_column="datetime",
            created_timestamp_column="timestamp",
        ),
    )

    # ApplyEntity
    feast_client.apply(customer_entity)
    feast_client.apply(driver_entity)

    # ApplyFeatureTable
    feast_client.apply(ft)

    feast_client.ingest(ft, bq_dataframe, timeout=120)

    bq_client = bigquery.Client(project=bq_project)

    # Poll BQ for table until the table has been created
    def try_get_table():
        try:
            table = bq_client.get_table(
                bigquery.TableReference(
                    bigquery.DatasetReference(bq_project, bq_dataset), bq_table_id
                )
            )
        except NotFound:
            return None, False
        else:
            return table, True

    wait_retry_backoff(
        retry_fn=try_get_table,
        timeout_secs=30,
        timeout_msg="Timed out trying to get bigquery table",
    )

    query_string = f"SELECT * FROM `{bq_project}.{bq_dataset}.{bq_table_id}`"

    job = bq_client.query(query_string)
    query_df = job.to_dataframe()

    assert_frame_equal(query_df, bq_dataframe)
def test_basic_retrieve_online_multiple_featureset(client, cust_trans_df, driver_df):
    """
    Retrieve online features from several feature sets in one request and
    wait until every returned value matches its source dataframe.
    """
    # Test retrieve with different variations of the string feature refs,
    # i.e. feature set inference for feature refs without a specified
    # feature set. Each ref is paired with the dataframe it is checked against.
    expectations = [
        ("customer_transactions:daily_transactions", cust_trans_df),
        ("driver:rating", driver_df),
        ("total_transactions", cust_trans_df),
    ]
    requested_refs = [ref for ref, _ in expectations]

    # Poll serving for feature values until the correct values are returned
    def try_get_features():
        entity_row = GetOnlineFeaturesRequest.EntityRow(
            fields={
                "customer_id": Value(int64_val=cust_trans_df.iloc[0]["customer_id"]),
                "driver_id": Value(int64_val=driver_df.iloc[0]["driver_id"]),
            }
        )
        response = client.get_online_features(
            entity_rows=[entity_row],
            feature_refs=requested_refs,
        )  # type: GetOnlineFeaturesResponse

        # Evaluate every check (no short-circuit) before combining them.
        outcomes = [
            check_online_response(ref, frame, response)
            for ref, frame in expectations
        ]
        return response, all(outcomes)

    wait_retry_backoff(
        retry_fn=try_get_features,
        timeout_secs=90,
        timeout_msg="Timed out trying to get online feature values",
    )
def test_large_volume_retrieve_online_success(client, large_volume_dataframe):
    """
    Wait until serving returns the expected values for the large-volume
    feature set.

    The polling logic lives in the ``try_get_features`` closure so that
    ``wait_retry_backoff`` can retry it. Previously the body sat in a
    ``while True`` loop that returned from the test on its first iteration,
    so nothing was verified and the retry call was unreachable.
    """
    feature_refs = [
        "daily_transactions_large",
        "total_transactions_large",
    ]

    # Poll serving for feature values until the correct values are returned
    def try_get_features():
        response = client.get_online_features(
            entity_rows=[
                GetOnlineFeaturesRequest.EntityRow(
                    fields={
                        "customer_id": Value(
                            int64_val=large_volume_dataframe.iloc[0]["customer_id"]
                        )
                    }
                )
            ],
            feature_refs=feature_refs,
        )  # type: GetOnlineFeaturesResponse
        is_ok = all(
            [
                check_online_response(ref, large_volume_dataframe, response)
                for ref in feature_refs
            ]
        )
        return response, is_ok

    wait_retry_backoff(
        retry_fn=try_get_features,
        timeout_secs=90,
        timeout_msg="Timed out trying to get online feature values",
    )
def test_basic_retrieve_online_success(client, cust_trans_df):
    """
    Wait until serving returns values matching ``cust_trans_df`` for the
    basic customer transaction features.
    """
    feature_refs = ["daily_transactions", "total_transactions", "null_values"]

    # Poll serving for feature values until the correct values are returned
    def try_get_features():
        first_customer = GetOnlineFeaturesRequest.EntityRow(
            fields={
                "customer_id": Value(int64_val=cust_trans_df.iloc[0]["customer_id"])
            }
        )
        response = client.get_online_features(
            entity_rows=[first_customer],
            feature_refs=feature_refs,
        )  # type: GetOnlineFeaturesResponse

        # Run every check before combining the results.
        outcomes = [
            check_online_response(ref, cust_trans_df, response)
            for ref in feature_refs
        ]
        return response, all(outcomes)

    wait_retry_backoff(
        retry_fn=try_get_features,
        timeout_secs=90,
        timeout_msg="Timed out trying to get online feature values",
    )
def ingest_and_verify(feast_client: Client, feature_table: FeatureTable, original: pd.DataFrame):
    """
    Run an offline-to-online ingestion job covering the timestamps in
    ``original`` and assert that online retrieval returns the same
    ``unique_drivers`` values.

    Args:
        feast_client: Client used to start the job and read online features.
        feature_table: Feature table to ingest.
        original: Source data; must expose ``event_timestamp``, ``s2id`` and
            ``unique_drivers`` columns.
    """
    # The end bound is pushed one second past the newest event timestamp.
    window_start = original.event_timestamp.min().to_pydatetime()
    window_end = original.event_timestamp.max().to_pydatetime() + timedelta(seconds=1)
    job = feast_client.start_offline_to_online_ingestion(
        feature_table, window_start, window_end
    )
    assert job.get_feature_table() == feature_table.name

    def _job_completed():
        return None, job.get_status() == SparkJobStatus.COMPLETED

    wait_retry_backoff(_job_completed, 180)

    feature_ref = f"{feature_table.name}:unique_drivers"
    entity_rows = [{"s2id": s2_id} for s2_id in original["s2id"].tolist()]
    online_values = feast_client.get_online_features(
        [feature_ref],
        entity_rows=entity_rows,
    ).to_dict()
    ingested = pd.DataFrame.from_dict(online_values)

    # Rename the source column so both frames use the feature reference.
    expected = original[["s2id", "unique_drivers"]].rename(
        columns={"unique_drivers": feature_ref}
    )
    pd.testing.assert_frame_equal(ingested[["s2id", feature_ref]], expected)
def start_job(feast_client: Client, feature_table: FeatureTable, pytestconfig):
    """
    Start a stream-to-online ingestion job and wait until it is in progress.

    Returns:
        The started job, or ``None`` when the "scheduled_streaming_job"
        pytest option is set (no job is started in that case).
    """
    job_is_scheduled = pytestconfig.getoption("scheduled_streaming_job")
    if job_is_scheduled:
        return None

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    def _job_running():
        return None, job.get_status() == SparkJobStatus.IN_PROGRESS

    wait_retry_backoff(_job_running, 120)
    return job
def test_batch_get_historical_features_with_file(client):
    """
    Ingest ten rows into "file_feature_set", write the entity rows to a local
    Avro file, and check that historical retrieval driven by that file
    returns matching feature values.
    """
    file_fs1 = client.get_feature_set(name="file_feature_set")

    N_ROWS = 10
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    row_ids = range(N_ROWS)
    features_1_df = pd.DataFrame(
        {
            "datetime": [time_offset] * N_ROWS,
            "entity_id": list(row_ids),
            "feature_value1": [str(i) for i in row_ids],
        }
    )

    # feature set may be ready (direct runner set ready right after job submitted),
    # but kafka consumer is not configured
    # give some time to warm up ingestion job
    def _feature_set_ready():
        current = client.get_feature_set(name="file_feature_set")
        return None, current.status == FeatureSetStatus.STATUS_READY

    wait_retry_backoff(
        retry_fn=_feature_set_ready,
        timeout_secs=480,
        timeout_msg="Wait for FeatureSet to be READY",
    )
    time.sleep(20)

    client.ingest(file_fs1, features_1_df, timeout=480)

    # Rename column (datetime -> event_timestamp)
    features_1_df = features_1_df.rename(columns={"datetime": "event_timestamp"})

    entity_frame = features_1_df[["event_timestamp", "entity_id"]]
    to_avro(df=entity_frame, file_path_or_buffer="file_feature_set.avro")

    time.sleep(10)

    def check():
        feature_retrieval_job = client.get_historical_features(
            entity_rows="file://file_feature_set.avro",
            feature_refs=["feature_value1"],
            project=PROJECT_NAME,
        )

        output = feature_retrieval_job.to_dataframe(timeout_sec=180)
        print(output.head())

        # Feature values were written as the string form of the entity id.
        returned_ids = output["entity_id"].to_list()
        assert returned_ids == list(map(int, output["feature_value1"].to_list()))
        clean_up_remote_files(feature_retrieval_job.get_avro_files())

    wait_for(check, timedelta(minutes=10))
def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    """
    Run an offline-to-online ingestion for a feature table whose entity and
    table names are very long, then check that the job is listed for the
    table and that online retrieval returns the ingested values.
    """
    entity = Entity(
        name="long_entity_name" * 10,
        description="S2id",
        value_type=ValueType.INT64,
    )
    feature_table = FeatureTable(
        name="just1a2featuretable3with4a5really6really7really8really9really10",
        entities=[entity.name],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    # The generated sample keys rows by "s2id"; re-key it by the long entity name.
    data_sample = generate_data().rename(columns={"s2id": entity.name})
    feast_client.ingest(feature_table, data_sample)

    window_start = data_sample.event_timestamp.min().to_pydatetime()
    window_end = data_sample.event_timestamp.max().to_pydatetime() + timedelta(seconds=1)
    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table, window_start, window_end
    )

    def _job_completed():
        return None, job.get_status() == SparkJobStatus.COMPLETED

    wait_retry_backoff(_job_completed, 180)

    listed_jobs = feast_spark_client.list_jobs(
        include_terminated=True,
        project=feast_client.project,
        table_name=feature_table.name,
    )
    all_job_ids = [listed.get_id() for listed in listed_jobs]
    assert job.get_id() in all_job_ids

    feature_ref = f"{feature_table.name}:unique_drivers"
    entity_rows = [{entity.name: key} for key in data_sample[entity.name].tolist()]
    online_values = feast_client.get_online_features(
        [feature_ref],
        entity_rows=entity_rows,
    ).to_dict()
    ingested = pd.DataFrame.from_dict(online_values)

    expected = data_sample[[entity.name, "unique_drivers"]].rename(
        columns={"unique_drivers": feature_ref}
    )
    pd.testing.assert_frame_equal(ingested[[entity.name, feature_ref]], expected)
def test_ingest(
    client: Client,
    customer_entity: Entity,
    driver_entity: Entity,
    bq_featuretable: FeatureTable,
    bq_dataset: pd.DataFrame,
    bq_table_id: str,
):
    """
    Ingest ``bq_dataset`` into a BigQuery-backed feature table and verify the
    table content equals the ingested dataframe.

    Args:
        client: Client used to register objects and ingest data.
        customer_entity: Entity registered before the feature table.
        driver_entity: Entity registered before the feature table.
        bq_featuretable: Feature table to ingest into.
        bq_dataset: Dataframe to ingest; also the expected query result.
        bq_table_id: Table reference of the form "project:dataset.table".

    The BigQuery table is deleted in a ``finally`` clause so that a polling
    timeout or a failed assertion does not leave the table behind.
    """
    gcp_project, _ = bq_table_id.split(":")
    # BigQuery client calls below use the dotted form of the table id.
    bq_table_id = bq_table_id.replace(":", ".")

    # ApplyEntity
    client.apply_entity(customer_entity)
    client.apply_entity(driver_entity)

    # ApplyFeatureTable
    client.apply_feature_table(bq_featuretable)
    client.ingest(bq_featuretable, bq_dataset, timeout=120)

    from google.api_core.exceptions import NotFound
    from google.cloud import bigquery

    bq_client = bigquery.Client(project=gcp_project)

    # Poll BQ for table until the table has been created
    def try_get_table():
        table_exist = False
        table_resp = None
        try:
            table_resp = bq_client.get_table(bq_table_id)
            if table_resp and table_resp.table_id == bq_table_id.split(".")[-1]:
                table_exist = True
        except NotFound:
            # Table not created yet; report "not ready" so the caller retries.
            pass

        return table_resp, table_exist

    try:
        wait_retry_backoff(
            retry_fn=try_get_table,
            timeout_secs=30,
            timeout_msg="Timed out trying to get bigquery table",
        )

        query_string = f"SELECT * FROM `{bq_table_id}`"

        job = bq_client.query(query_string)
        query_df = job.to_dataframe()

        assert_frame_equal(query_df, bq_dataset)
    finally:
        # not_found_ok makes cleanup safe even if the table never appeared.
        bq_client.delete_table(bq_table_id, not_found_ok=True)
def wait(self, status: IngestionJobStatus, timeout_secs: int = 300):  # type: ignore
    """
    Wait for this IngestJob to transition to the given status.
    Raises TimeoutError if the wait operation times out.

    Args:
        status: The IngestionJobStatus to wait for.
        timeout_secs: Maximum seconds to wait before timing out.
    """

    def _status_reached():
        # No payload is needed; only report whether the status matches.
        return None, self.status == status  # type: ignore

    # poll & wait for job status to transition
    wait_retry_backoff(
        retry_fn=_status_reached,
        timeout_secs=timeout_secs,
        timeout_msg="Wait for IngestJob's status to transition timed out",
    )
def ingest_and_retrieve(
    feast_client: Client,
    df: pd.DataFrame,
    topic_name: str,
    kafka_broker: str,
    avro_schema_json: str,
    entity_rows: List[Dict[str, Any]],
    feature_names: List[Any],
    expected_ingested_count: Optional[int] = None,
):
    """
    Send every row of ``df`` to Kafka as an Avro record, then poll online
    serving until enough feature values are present.

    Args:
        feast_client: Client used for online retrieval.
        df: Rows to send; each must carry an ``event_timestamp`` value.
        topic_name: Kafka topic to publish to.
        kafka_broker: Bootstrap servers of the Kafka cluster.
        avro_schema_json: Avro schema used to encode each record.
        entity_rows: Entity rows to request from online serving.
        feature_names: Feature references to request.
        expected_ingested_count: Minimum number of non-null values required
            in every requested feature column. Defaults to the number of
            rows in ``df`` when not given. An explicit ``0`` is honoured
            rather than being treated as "not given".

    Returns:
        The dataframe built from the online response that satisfied the
        expected count (as returned by ``wait_retry_backoff``).
    """
    # Use an identity check: ``0`` is a legitimate expectation (e.g. every
    # row was filtered out) and must not fall back to the row count.
    if expected_ingested_count is None:
        expected_ingested_count = df.shape[0]

    for record in df.to_dict("records"):
        # Attach UTC tzinfo to the timestamp before it is serialized.
        record["event_timestamp"] = (
            record["event_timestamp"].to_pydatetime().replace(tzinfo=pytz.utc)
        )

        send_avro_record_to_kafka(
            topic_name,
            record,
            bootstrap_servers=kafka_broker,
            avro_schema_json=avro_schema_json,
        )

    def get_online_features():
        features = feast_client.get_online_features(
            feature_names,
            entity_rows=entity_rows,
        ).to_dict()
        out_df = pd.DataFrame.from_dict(features)
        # The sparsest feature column decides whether ingestion has caught up.
        return out_df, out_df[feature_names].count().min() >= expected_ingested_count

    ingested = wait_retry_backoff(get_online_features, 120)
    return ingested
def test_offline_ingestion(feast_client: Client, local_staging_path: str):
    """
    Write generated data to a Parquet batch source, run an offline-to-online
    ingestion job, and check the online values against the source data.
    """
    entity = Entity(name="s2id", description="S2id", value_type=ValueType.INT64)

    batch_source = FileSource(
        event_timestamp_column="event_timestamp",
        created_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=os.path.join(local_staging_path, "batch-storage"),
    )
    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table, original)  # write to batch (offline) storage

    window_start = datetime.today()
    window_end = datetime.today() + timedelta(days=1)
    job = feast_client.start_offline_to_online_ingestion(
        feature_table, window_start, window_end
    )

    def _job_completed():
        return None, job.get_status() == SparkJobStatus.COMPLETED

    wait_retry_backoff(_job_completed, 60)

    entity_rows = [{"s2id": s2_id} for s2_id in original["s2id"].tolist()]
    online_values = feast_client.get_online_features(
        ["drivers:unique_drivers"],
        entity_rows=entity_rows,
    ).to_dict()
    ingested = pd.DataFrame.from_dict(online_values)

    expected = original[["s2id", "unique_drivers"]].rename(
        columns={"unique_drivers": "drivers:unique_drivers"}
    )
    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers:unique_drivers"]], expected
    )
def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    """
    Run an offline-to-online ingestion for a feature table with a very long
    name and check that the job shows up when listing jobs for that table.
    """
    entity = Entity(name="s2id", description="S2id", value_type=ValueType.INT64)
    long_table_name = "just1a2featuretable3with4a5really6really7really8really9really10really11really12long13name"
    feature_table = FeatureTable(
        name=long_table_name,
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    data_sample = generate_data()
    feast_client.ingest(feature_table, data_sample)

    # The end bound is pushed one second past the newest event timestamp.
    window_start = data_sample.event_timestamp.min().to_pydatetime()
    window_end = data_sample.event_timestamp.max().to_pydatetime() + timedelta(seconds=1)
    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table, window_start, window_end
    )

    def _job_completed():
        return None, job.get_status() == SparkJobStatus.COMPLETED

    wait_retry_backoff(_job_completed, 180)

    listed_jobs = feast_spark_client.list_jobs(
        include_terminated=True,
        project=feast_client.project,
        table_name=feature_table.name,
    )
    all_job_ids = [listed.get_id() for listed in listed_jobs]
    assert job.get_id() in all_job_ids
def test_all_types_retrieve_online_success(client, all_types_dataframe):
    """
    Request one feature of every supported type for the first user and, once
    serving returns the expected float value, verify the float-list value and
    its field status.
    """
    scalar_refs = [
        "float_feature",
        "int64_feature",
        "int32_feature",
        "double_feature",
        "string_feature",
        "bool_feature",
        "bytes_feature",
    ]
    list_refs = [
        "float_list_feature",
        "int64_list_feature",
        "int32_list_feature",
        "string_list_feature",
        "bytes_list_feature",
        "double_list_feature",
    ]
    feature_refs = scalar_refs + list_refs

    # Poll serving for feature values until the correct values are returned
    def try_get_features():
        first_user = GetOnlineFeaturesRequest.EntityRow(
            fields={
                "user_id": Value(int64_val=all_types_dataframe.iloc[0]["user_id"])
            }
        )
        response = client.get_online_features(
            entity_rows=[first_user],
            feature_refs=feature_refs,
        )  # type: GetOnlineFeaturesResponse
        # Readiness is judged on the float feature only.
        is_ok = check_online_response("float_feature", all_types_dataframe, response)
        return response, is_ok

    response = wait_retry_backoff(
        retry_fn=try_get_features,
        timeout_secs=90,
        timeout_msg="Timed out trying to get online feature values",
    )

    first_row = response.field_values[0]

    # check returned values
    returned_float_list = first_row.fields["float_list_feature"].float_list_val.val
    sent_float_list = all_types_dataframe.iloc[0]["float_list_feature"]
    assert math.isclose(
        returned_float_list[0], sent_float_list[0], abs_tol=FLOAT_TOLERANCE
    )

    # check returned metadata
    returned_status = response.field_values[0].statuses["float_list_feature"]
    assert returned_status == GetOnlineFeaturesResponse.FieldStatus.PRESENT
def test_basic_ingest_retrieval_str(client):
    """
    Ingest into a feature set referenced by its name (a string) while a
    non-default project is active, then check the online values.
    """
    # Set to another project to test ingestion based on current project context
    client.set_project(PROJECT_NAME + "_NS1")

    customer_fs = FeatureSet(
        name="cust_fs",
        features=[
            Feature(name="cust_rating", dtype=ValueType.INT64),
            Feature(name="cust_cost", dtype=ValueType.FLOAT),
        ],
        entities=[Entity("cust_id", ValueType.INT64)],
        max_age=Duration(seconds=3600),
    )
    client.apply(customer_fs)

    N_ROWS = 2
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    row_ids = range(N_ROWS)
    cust_df = pd.DataFrame(
        {
            "datetime": [time_offset] * N_ROWS,
            "cust_id": list(row_ids),
            "cust_rating": list(row_ids),
            "cust_cost": [float(i) + 0.5 for i in row_ids],
        }
    )

    client.ingest("cust_fs", cust_df, timeout=600)
    time.sleep(15)

    online_request_entity = [{"cust_id": 0}, {"cust_id": 1}]
    online_request_features = ["cust_rating", "cust_cost"]

    def try_get_features():
        # Always reports success: the first response that comes back is used.
        fetched = client.get_online_features(
            entity_rows=online_request_entity,
            feature_refs=online_request_features,
        )
        return fetched, True

    online_features_actual = wait_retry_backoff(
        retry_fn=try_get_features,
        timeout_secs=90,
        timeout_msg="Timed out trying to get online feature values",
    )

    online_features_expected = {
        "cust_id": [0, 1],
        "cust_rating": [0, 1],
        "cust_cost": [0.5, 1.5],
    }

    assert online_features_actual.to_dict() == online_features_expected
def test_validation_reports_metrics(feast_client: Client, kafka_server, statsd_server: StatsDServer):
    """
    Stream test data through a validated ingestion job and check that the
    statsd server receives one failed-check metric per expectation, carrying
    the count of unexpected values.
    """
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    entity, feature_table = create_schema(
        kafka_broker, topic_name, "validation_ge_metrics"
    )
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    # Build the expectation suite from the training data.
    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations, feature_table)
    apply_validation(feast_client, feature_table, udf, validation_window_secs=10)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    def _job_running():
        return None, job.get_status() == SparkJobStatus.IN_PROGRESS

    def _consumer_attached():
        return None, check_consumer_exist(kafka_broker, topic_name)

    wait_retry_backoff(_job_running, 120)
    wait_retry_backoff(_consumer_attached, 120)

    # Validate the test data locally to learn what the job should report.
    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    check_results = validation_result.results
    unexpected_counts = {
        "expect_column_values_to_be_between_num_0_100": check_results[0].result[
            "unexpected_count"
        ],
        "expect_column_values_to_be_in_set_set": check_results[1].result[
            "unexpected_count"
        ],
    }
    # De-duplicate row indexes flagged by more than one check.
    invalid_idx = list(
        {
            idx
            for check in validation_result.results
            for idx in check.result["unexpected_index_list"]
        }
    )

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_ge_metrics:num", "validation_ge_metrics:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    expected_metrics = [
        (
            f"feast_feature_validation_check_failed#check:{check_name},"
            f"feature_table:{feature_table.name},project:{feast_client.project}",
            value,
        )
        for check_name, value in unexpected_counts.items()
    ]

    def _metrics_received():
        return (
            None,
            all(statsd_server.metrics.get(m) == v for m, v in expected_metrics),
        )

    # The message is built before polling starts, so the "actual" part shows
    # the metrics as they were at that moment.
    failure_message = (
        "Expected metrics were not received: "
        + str(expected_metrics)
        + "\n"
        "Actual received metrics" + str(statsd_server.metrics)
    )
    wait_retry_backoff(
        _metrics_received,
        timeout_secs=30,
        timeout_msg=failure_message,
    )
def test_basic_retrieve_online_entity_listform(client, list_entity_dataframe):
    """
    Check online retrieval when the entity value is a list, passed both as a
    plain Python list and as a ``Value`` proto, and check that a list with
    mixed element types is rejected.
    """
    # Case 1: Features retrieval with entity in list format check
    feature_specs = [
        ("district_rating", ValueType.INT64),
        ("district_cost", ValueType.FLOAT),
        ("district_past_transactions_int", ValueType.INT64_LIST),
        ("district_past_transactions_double", ValueType.DOUBLE_LIST),
        ("district_past_transactions_float", ValueType.FLOAT_LIST),
        ("district_past_transactions_string", ValueType.STRING_LIST),
        ("district_past_transactions_bool", ValueType.BOOL_LIST),
    ]
    district_fs = FeatureSet(
        name="district",
        features=[Feature(name=name, dtype=dtype) for name, dtype in feature_specs],
        entities=[Entity("district_ids", dtype=ValueType.INT64_LIST)],
        max_age=Duration(seconds=3600),
    )

    client.set_project(PROJECT_NAME)
    client.apply(district_fs)

    # Re-read the feature set after applying it and ingest through that object.
    district_fs = client.get_feature_set(name="district")
    client.ingest(district_fs, list_entity_dataframe, timeout=600)
    time.sleep(15)

    online_request_entity = [
        {"district_ids": [np.int64(1), np.int64(2), np.int64(3)]}
    ]
    online_request_features = [name for name, _ in feature_specs]
    online_request_entity2 = [
        {"district_ids": Value(int64_list_val=Int64List(val=[1, 2, 3]))}
    ]

    def _make_fetcher(entity_rows):
        # Each fetcher accepts the first response unconditionally.
        def _fetch():
            fetched = client.get_online_features(
                entity_rows=entity_rows,
                feature_refs=online_request_features,
            )
            return fetched, True

        return _fetch

    online_features_actual = wait_retry_backoff(
        retry_fn=_make_fetcher(online_request_entity),
        timeout_secs=90,
        timeout_msg="Timed out trying to get online feature values",
    )

    online_features_actual2 = wait_retry_backoff(
        retry_fn=_make_fetcher(online_request_entity2),
        timeout_secs=90,
        timeout_msg="Timed out trying to get online feature values",
    )

    online_features_expected = {
        "district_ids": [[np.int64(1), np.int64(2), np.int64(3)]],
        "district_rating": [1],
        "district_cost": [1.5],
        "district_past_transactions_int": [[1, 3]],
        "district_past_transactions_double": [[1.5, 3.0]],
        "district_past_transactions_float": [[1.5, 3.0]],
        "district_past_transactions_string": [["first_1", "second_1"]],
        "district_past_transactions_bool": [[True, False]],
    }

    assert online_features_actual.to_dict() == online_features_expected
    assert online_features_actual2.to_dict() == online_features_expected

    # Case 2: Features retrieval with entity in list format check with mixed types
    with pytest.raises(ValueError) as excinfo:
        online_request_entity2 = [
            {"district_ids": [np.int64(1), np.int64(2), True]}
        ]
        online_features_actual2 = client.get_online_features(
            entity_rows=online_request_entity2,
            feature_refs=online_request_features,
        )

    expected_error = "List value type for field district_ids is inconsistent. ValueType.INT64 different from ValueType.BOOL."
    assert expected_error in str(excinfo.value)
def test_basic_retrieve_online_entity_nonlistform(client, nonlist_entity_dataframe, list_entity_dataframe):
    """
    Check online retrieval for several scalar entity rows, passed both as
    plain Python values and as ``Value`` protos, and check that entity rows
    with mixed value types are rejected.
    """
    # Case 1: Feature retrieval with multiple entities retrieval check
    feature_specs = [
        ("customer2_rating", ValueType.INT64),
        ("customer2_cost", ValueType.FLOAT),
        ("customer2_past_transactions_int", ValueType.INT64_LIST),
        ("customer2_past_transactions_double", ValueType.DOUBLE_LIST),
        ("customer2_past_transactions_float", ValueType.FLOAT_LIST),
        ("customer2_past_transactions_string", ValueType.STRING_LIST),
        ("customer2_past_transactions_bool", ValueType.BOOL_LIST),
    ]
    customer_fs = FeatureSet(
        name="customer2",
        features=[Feature(name=name, dtype=dtype) for name, dtype in feature_specs],
        entities=[Entity("customer_id2", ValueType.INT64)],
        max_age=Duration(seconds=3600),
    )

    client.set_project(PROJECT_NAME)
    client.apply(customer_fs)

    # Re-read the feature set after applying it and ingest through that object.
    customer_fs = client.get_feature_set(name="customer2")
    client.ingest(customer_fs, nonlist_entity_dataframe, timeout=600)
    time.sleep(15)

    online_request_entity = [{"customer_id2": 0}, {"customer_id2": 1}]
    online_request_features = [name for name, _ in feature_specs]
    online_request_entity2 = [
        {"customer_id2": Value(int64_val=0)},
        {"customer_id2": Value(int64_val=1)},
    ]

    def _make_fetcher(entity_rows):
        # Each fetcher accepts the first response unconditionally.
        def _fetch():
            fetched = client.get_online_features(
                entity_rows=entity_rows,
                feature_refs=online_request_features,
            )
            return fetched, True

        return _fetch

    online_features_actual1 = wait_retry_backoff(
        retry_fn=_make_fetcher(online_request_entity),
        timeout_secs=90,
        timeout_msg="Timed out trying to get online feature values",
    )

    online_features_actual2 = wait_retry_backoff(
        retry_fn=_make_fetcher(online_request_entity2),
        timeout_secs=90,
        timeout_msg="Timed out trying to get online feature values",
    )

    online_features_expected = {
        "customer_id2": [0, 1],
        "customer2_rating": [0, 1],
        "customer2_cost": [0.5, 1.5],
        "customer2_past_transactions_int": [[0, 2], [1, 3]],
        "customer2_past_transactions_double": [[0.5, 2.0], [1.5, 3.0]],
        "customer2_past_transactions_float": [[0.5, 2.0], [1.5, 3.0]],
        "customer2_past_transactions_string": [
            ["first_0", "second_0"],
            ["first_1", "second_1"],
        ],
        "customer2_past_transactions_bool": [[True, False], [True, False]],
    }

    assert online_features_actual1.to_dict() == online_features_expected
    assert online_features_actual2.to_dict() == online_features_expected

    # Case 2: Feature retrieval with multiple entities retrieval check with mixed types
    with pytest.raises(TypeError) as excinfo:
        online_request_entity2 = [
            {"customer_id": 0},
            {"customer_id": "error_pls"},
        ]
        online_features_actual2 = client.get_online_features(
            entity_rows=online_request_entity2,
            feature_refs=online_request_features,
        )

    expected_error = "Input entity customer_id has mixed types, ValueType.STRING and ValueType.INT64. That is not allowed."
    assert expected_error in str(excinfo.value)
def test_validation_with_ge(feast_client: Client, kafka_server):
    """
    Stream test data through an ingestion job guarded by a Great Expectations
    validation UDF and check that rows failing validation come back with
    empty feature values while valid rows are returned unchanged.

    Args:
        feast_client: Client used to register objects, start the job and
            read online features.
        kafka_server: Indexable pair; element 0 and 1 are joined as
            "host:port" to form the broker address.
    """
    entity = Entity(name="key", description="Key", value_type=ValueType.INT64)

    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="validation_test",
        entities=["key"],
        features=[
            Feature("num", ValueType.INT64),
            Feature("set", ValueType.STRING),
        ],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url="/dev/null",
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    # Build the expectation suite from the training data.
    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations)
    apply_validation(feast_client, feature_table, udf, validation_window_secs=1)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120
    )

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120
    )

    # Validate the test data locally to learn which rows should be rejected.
    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    # De-duplicate row indexes flagged by more than one check.
    invalid_idx = list(
        {
            idx
            for check in validation_result.results
            for idx in check.result["unexpected_index_list"]
        }
    )

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_test:num", "validation_test:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    # Blank out the rejected rows in the expected frame. A single positional
    # assignment on the frame is used instead of chained
    # ``test_data[col].iloc[...] = ...``, which does not write back to the
    # frame when pandas Copy-on-Write is active.
    test_data["num"] = test_data["num"].astype(np.float64)
    test_data.iloc[invalid_idx, test_data.columns.get_loc("num")] = np.nan
    test_data.iloc[invalid_idx, test_data.columns.get_loc("set")] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_test:num", "validation_test:set"]],
        test_data[["key", "num", "set"]].rename(
            columns={"num": "validation_test:num", "set": "validation_test:set"}
        ),
    )
def test_streaming_ingestion(feast_client: Client, local_staging_path: str, kafka_server):
    """
    Start a stream-to-online ingestion job, publish generated rows to Kafka
    as Avro records, and check that online retrieval returns the same values.
    """
    entity = Entity(name="s2id", description="S2id", value_type=ValueType.INT64)
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    batch_source = FileSource(
        event_timestamp_column="event_timestamp",
        created_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=os.path.join(local_staging_path, "batch-storage"),
    )
    stream_source = KafkaSource(
        "event_timestamp",
        "event_timestamp",
        kafka_broker,
        AvroFormat(avro_schema()),
        topic=topic_name,
    )
    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
        stream_source=stream_source,
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    def _job_running():
        return None, job.get_status() == SparkJobStatus.IN_PROGRESS

    def _consumer_attached():
        return None, check_consumer_exist(kafka_broker, topic_name)

    wait_retry_backoff(_job_running, 60)
    wait_retry_backoff(_consumer_attached, 60)

    try:
        original = generate_data()[["s2id", "unique_drivers", "event_timestamp"]]
        for record in original.to_dict("records"):
            # Attach UTC tzinfo to the timestamp before it is serialized.
            event_time = record["event_timestamp"].to_pydatetime()
            record["event_timestamp"] = event_time.replace(tzinfo=pytz.utc)

            send_avro_record_to_kafka(
                topic_name,
                record,
                bootstrap_servers=kafka_broker,
                avro_schema_json=avro_schema(),
            )

        def get_online_features():
            requested_rows = [{"s2id": s2_id} for s2_id in original["s2id"].tolist()]
            features = feast_client.get_online_features(
                ["drivers_stream:unique_drivers"],
                entity_rows=requested_rows,
            ).to_dict()
            frame = pd.DataFrame.from_dict(features)
            # Ready once no requested row is missing its feature value.
            return frame, not frame["drivers_stream:unique_drivers"].isna().any()

        ingested = wait_retry_backoff(get_online_features, 60)
    finally:
        job.cancel()

    expected = original[["s2id", "unique_drivers"]].rename(
        columns={"unique_drivers": "drivers_stream:unique_drivers"}
    )
    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers_stream:unique_drivers"]], expected
    )
def test_validation_with_ge(feast_client: Client, kafka_server):
    """
    Stream test data through an ingestion job guarded by a Great Expectations
    validation UDF and check that rows failing validation come back with
    empty feature values while valid rows are returned unchanged.

    Args:
        feast_client: Client used to register objects, start the job and
            read online features.
        kafka_server: Indexable pair; element 0 and 1 are joined as
            "host:port" to form the broker address.
    """
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    entity, feature_table = create_schema(kafka_broker, topic_name, "validation_ge")
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    # Build the expectation suite from the training data.
    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations, feature_table)
    apply_validation(feast_client, feature_table, udf, validation_window_secs=1)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120
    )

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120
    )

    # Validate the test data locally to learn which rows should be rejected.
    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    # De-duplicate row indexes flagged by more than one check.
    invalid_idx = list(
        {
            idx
            for check in validation_result.results
            for idx in check.result["unexpected_index_list"]
        }
    )

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_ge:num", "validation_ge:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    # Blank out the rejected rows in the expected frame. A single positional
    # assignment on the frame is used instead of chained
    # ``test_data[col].iloc[...] = ...``, which does not write back to the
    # frame when pandas Copy-on-Write is active.
    test_data["num"] = test_data["num"].astype(np.float64)
    test_data.iloc[invalid_idx, test_data.columns.get_loc("num")] = np.nan
    test_data.iloc[invalid_idx, test_data.columns.get_loc("set")] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_ge:num", "validation_ge:set"]],
        test_data[["key", "num", "set"]].rename(
            columns={"num": "validation_ge:num", "set": "validation_ge:set"}
        ),
    )
# Register the feature set.
client.apply(all_types_fs_expected)

# Inspect the dataframe (return values are discarded).
df.info()
df.describe()
df.head()

# Ingest the data.
client.ingest(all_types_fs_expected, df)


# Wait for data to be available
def try_get_features():
    """Fetch "float_feature" for user 1001 and report whether it matches row 0 of ``df``."""
    fetched = client.get_online_features(
        entity_rows=[{"user_id": 1001}],
        feature_refs=["float_feature"],
    )
    response_dict = fetched.to_dict()
    values_match = response_dict['float_feature'] == df.iloc[0]['float_feature']
    return response_dict, bool(values_match)


online_features_actual = wait_retry_backoff(
    retry_fn=try_get_features,
    timeout_secs=90,
    timeout_msg="Timed out trying to get online feature values",
)
def test_online_store_cleanup(environment, universal_data_sources):
    """
    Check that deleting one feature view leaves a sibling view's data intact.

    Some online store implementations (like Redis) store features from
    different feature views that share an entity together. Deleting a single
    feature view could then wipe every feature attached to that entity
    (see https://github.com/feast-dev/feast/issues/2150).

    Plan:
        1. Register two feature views with the common entity "driver"
        2. Materialize data
        3. Check that features are available (via online retrieval)
        4. Delete one feature view
        5. Check that features of the other view are still available
        6. Delete the other feature view (and create it again)
        7. Verify that the re-created view serves no values
    """
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    driver_stats_fv = construct_universal_feature_views(data_sources).driver

    driver_ids = entities.driver_vals
    frame_columns = {
        "ts_1": [environment.end_date] * len(driver_ids),
        "created_ts": [environment.end_date] * len(driver_ids),
        "driver_id": driver_ids,
        "value": np.random.random(size=len(driver_ids)),
    }
    simple_df = pd.DataFrame(frame_columns)

    ds = environment.data_source_creator.create_data_source(
        simple_df, destination_name="simple_driver_dataset"
    )
    simple_driver_fv = driver_feature_view(
        data_source=ds, name="test_universal_online_simple_driver"
    )

    fs.apply([driver(), simple_driver_fv, driver_stats_fv])

    one_day = timedelta(days=1)
    fs.materialize(environment.start_date - one_day, environment.end_date + one_day)

    expected_values = simple_df.sort_values(by="driver_id")
    features = [f"{simple_driver_fv.name}:value"]
    entity_rows = [{"driver_id": driver_id} for driver_id in sorted(driver_ids)]

    def read_online_values():
        """Return the "value" entry of the current online response."""
        response = fs.get_online_features(features=features, entity_rows=entity_rows)
        return response.to_dict()["value"]

    # Both views registered: materialized values must be served.
    served = read_online_values()
    assert np.allclose(expected_values["value"], served)

    # Drop only the driver stats view; the simple view must be unaffected.
    fs.apply(
        objects=[simple_driver_fv],
        objects_to_delete=[driver_stats_fv],
        partial=False,
    )
    served = read_online_values()
    assert np.allclose(expected_values["value"], served)

    # Drop the remaining view as well.
    fs.apply(objects=[], objects_to_delete=[simple_driver_fv], partial=False)

    def eventually_apply() -> Tuple[None, bool]:
        """Try to re-create the simple view; report failure on BotoCoreError."""
        try:
            fs.apply([simple_driver_fv])
        except BotoCoreError:
            applied = False
        else:
            applied = True
        return None, applied

    # Online store backend might have eventual consistency in schema update,
    # so recreating a table that was just deleted might need some retries.
    wait_retry_backoff(eventually_apply, timeout_secs=60)

    served = read_online_values()
    assert all(v is None for v in served)
def test_streaming_ingestion(
    feast_client: Client, local_staging_path: str, kafka_server, pytestconfig
):
    """Publish rows to Kafka and verify they can be read back online.

    Registers the "s2id" entity and the "drivers_stream" feature table with a
    Kafka stream source. Unless the "scheduled_streaming_job" option is set,
    the test starts the stream-to-online ingestion job itself and waits for
    it to report IN_PROGRESS. Once a consumer is attached to the topic,
    generated data is published and the retrieved features are compared with
    what was sent.

    Args:
        feast_client: Client used to register objects and start the job.
        local_staging_path: Base path under which the batch source's
            "batch-storage" location is placed.
        kafka_server: Indexable whose first two items are joined as
            "<item0>:<item1>" to form the broker address.
        pytestconfig: Pytest config, queried for "scheduled_streaming_job".
    """
    entity = Entity(name="s2id", description="S2id", value_type=ValueType.INT64)

    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    table_features = [Feature("unique_drivers", ValueType.INT64)]
    batch_source = FileSource(
        event_timestamp_column="event_timestamp",
        created_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=os.path.join(local_staging_path, "batch-storage"),
    )
    stream_source = KafkaSource(
        event_timestamp_column="event_timestamp",
        bootstrap_servers=kafka_broker,
        message_format=AvroFormat(avro_schema()),
        topic=topic_name,
    )
    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=table_features,
        batch_source=batch_source,
        stream_source=stream_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    def job_in_progress():
        """Report whether the started job has reached IN_PROGRESS."""
        return None, job.get_status() == SparkJobStatus.IN_PROGRESS

    def consumer_attached():
        """Report whether a consumer exists for the test topic."""
        return None, check_consumer_exist(kafka_broker, topic_name)

    if pytestconfig.getoption("scheduled_streaming_job"):
        # No job is started by the test in this mode.
        job = None
    else:
        job = feast_client.start_stream_to_online_ingestion(feature_table)
        assert job.get_feature_table() == feature_table.name
        wait_retry_backoff(job_in_progress, 120)

    wait_retry_backoff(consumer_attached, 300)

    test_data = generate_data()[["s2id", "unique_drivers", "event_timestamp"]]
    entity_rows = [{"s2id": s2_id} for s2_id in test_data["s2id"].tolist()]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["drivers_stream:unique_drivers"],
        )
    finally:
        # Clean up whichever resource this run owns: the job it started, or
        # the feature table when no job was started here.
        if not job:
            feast_client.delete_feature_table(feature_table.name)
        else:
            job.cancel()

    expected = test_data[["s2id", "unique_drivers"]].rename(
        columns={"unique_drivers": "drivers_stream:unique_drivers"}
    )
    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers_stream:unique_drivers"]],
        expected,
    )