def test_feature_set_ingest_throws_exception_if_kafka_down(
    self, dataframe, test_client, exception, mocker
):
    """Ingesting into a READY feature set must raise ``exception``.

    The feature set points at a Kafka source on ``localhost:4412``; judging
    by the test name that broker is expected to be unreachable.
    """
    test_client.set_project("project1")

    kafka_source = KafkaSource(brokers="localhost:4412", topic="test")
    feature_set = FeatureSet("driver-feature-set", source=kafka_source)
    for member in (
        Feature(name="feature_1", dtype=ValueType.FLOAT),
        Feature(name="feature_2", dtype=ValueType.STRING),
        Feature(name="feature_3", dtype=ValueType.INT64),
        Entity(name="entity_id", dtype=ValueType.INT64),
    ):
        feature_set.add(member)

    # Register with Feast core
    test_client.apply(feature_set)

    # Have the core stub hand back the feature set marked as READY.
    ready_proto = feature_set.to_proto()
    ready_proto.meta.status = FeatureSetStatusProto.STATUS_READY
    stubbed_response = GetFeatureSetResponse(feature_set=ready_proto)
    mocker.patch.object(
        test_client._core_service_stub,
        "GetFeatureSet",
        return_value=stubbed_response,
    )

    with pytest.raises(exception):
        test_client.ingest("driver-feature-set", dataframe, timeout=1)
def test_register_feature_set(self, sqlite_store):
    """Register a feature set spec with the SQLite store.

    Only registration is exercised: the feature row built below is never
    written because the upsert call is commented out.
    """
    feature_set = FeatureSet("my-feature-set")
    for member in (
        Feature(name="my-feature-1", dtype=ValueType.INT64),
        Feature(name="my-feature-2", dtype=ValueType.INT64),
        Entity(name="my-entity-1", dtype=ValueType.INT64),
    ):
        feature_set.add(member)
    feature_set._version = 1

    spec_proto = feature_set.to_proto().spec
    sqlite_store.register_feature_set(spec_proto)

    row_fields = [
        FieldProto.Field(name=field_name, value=ValueProto.Value(float_val=1.2))
        for field_name in ("feature_1", "feature_2", "feature_3")
    ]
    feature_row = FeatureRowProto.FeatureRow(
        feature_set="feature_set_1",
        event_timestamp=Timestamp(),
        fields=row_fields,
    )
    # sqlite_store.upsert_feature_row(feature_set_proto, feature_row)
    assert True
def test_feature_set_ingest_fail_if_pending(
    self, dataframe, exception, test_client, mocker
):
    """Ingesting into a feature set whose status is PENDING must raise ``exception``.

    The setup (project selection, feature set registration, stubbing of
    ``GetFeatureSet``) runs outside ``pytest.raises`` so that a failure
    there is reported as an error instead of silently satisfying the test.
    """
    test_client.set_project("project1")
    driver_fs = FeatureSet(
        "driver-feature-set",
        source=KafkaSource(brokers="kafka:9092", topic="test"),
    )
    driver_fs.add(Feature(name="feature_1", dtype=ValueType.FLOAT))
    driver_fs.add(Feature(name="feature_2", dtype=ValueType.STRING))
    driver_fs.add(Feature(name="feature_3", dtype=ValueType.INT64))
    driver_fs.add(Entity(name="entity_id", dtype=ValueType.INT64))

    # Register with Feast core
    test_client.apply(driver_fs)

    # Have the core stub report the feature set as still pending.
    driver_fs = driver_fs.to_proto()
    driver_fs.meta.status = FeatureSetStatusProto.STATUS_PENDING
    mocker.patch.object(
        test_client._core_service_stub,
        "GetFeatureSet",
        return_value=GetFeatureSetResponse(feature_set=driver_fs),
    )

    # Need to create a mock producer
    with patch("feast.client.get_producer"):
        # Only the ingest call itself is expected to raise.
        with pytest.raises(exception):
            # Ingest data into Feast
            test_client.ingest("driver-feature-set", dataframe, timeout=1)
def test_feature_table_import_export_yaml(self, batch_source):
    """A feature table must compare equal to itself after a YAML round trip."""
    kafka_stream = KafkaSource(
        field_mapping={
            "ride_distance": "ride_distance",
            "ride_duration": "ride_duration",
        },
        bootstrap_servers="localhost:9094",
        message_format=ProtoFormat(class_path="class.path"),
        topic="test_topic",
        event_timestamp_column="ts_col",
    )
    table_features = [
        Feature(name="ride_distance", dtype=ValueType.FLOAT),
        Feature(name="ride_duration", dtype=ValueType.STRING),
    ]
    original_table = FeatureTable(
        name="car_driver",
        features=table_features,
        entities=["car_driver_entity"],
        labels={"team": "matchmaking"},
        batch_source=batch_source,
        stream_source=kafka_stream,
    )

    # Serialise the table to a YAML string, then rebuild a table from it.
    yaml_text = original_table.to_yaml()
    restored_table = FeatureTable.from_yaml(yaml_text)

    # Ensure equality is upheld to original feature table
    assert original_table == restored_table
def test_ingest_into_bq(
    feast_client: Client,
    customer_entity: Entity,
    driver_entity: Entity,
    bq_dataframe: pd.DataFrame,
    bq_dataset: str,
    pytestconfig,
):
    """Ingest a dataframe into a BigQuery-backed feature table and read it back.

    The entities and the feature table are applied, the dataframe is
    ingested, and the staging table is polled until it exists. Its full
    contents must then equal ``bq_dataframe``.
    """
    bq_project = pytestconfig.getoption("bq_project")
    # Use %S (seconds). Lowercase %s is not a documented strftime directive
    # and its result depends on the platform's C library.
    bq_table_id = f"bq_staging_{datetime.now():%Y%m%d%H%M%S}"
    ft = FeatureTable(
        name="basic_featuretable",
        entities=["driver_id", "customer_id"],
        features=[
            Feature(name="dev_feature_float", dtype=ValueType.FLOAT),
            Feature(name="dev_feature_string", dtype=ValueType.STRING),
        ],
        max_age=Duration(seconds=3600),
        batch_source=BigQuerySource(
            table_ref=f"{bq_project}:{bq_dataset}.{bq_table_id}",
            event_timestamp_column="datetime",
            created_timestamp_column="timestamp",
        ),
    )

    # ApplyEntity
    feast_client.apply(customer_entity)
    feast_client.apply(driver_entity)

    # ApplyFeatureTable
    feast_client.apply(ft)
    feast_client.ingest(ft, bq_dataframe, timeout=120)

    bq_client = bigquery.Client(project=bq_project)

    # Poll BQ for table until the table has been created
    def try_get_table():
        try:
            table = bq_client.get_table(
                bigquery.TableReference(
                    bigquery.DatasetReference(bq_project, bq_dataset), bq_table_id
                )
            )
        except NotFound:
            return None, False
        else:
            return table, True

    wait_retry_backoff(
        retry_fn=try_get_table,
        timeout_secs=30,
        timeout_msg="Timed out trying to get bigquery table",
    )

    query_string = f"SELECT * FROM `{bq_project}.{bq_dataset}.{bq_table_id}`"
    job = bq_client.query(query_string)
    query_df = job.to_dataframe()

    assert_frame_equal(query_df, bq_dataframe)
def test_list_features(self, test_client, mocker):
    """``list_features_by_ref`` must return the features served by the stubbed core."""
    mocker.patch.object(
        test_client,
        "_core_service_stub",
        return_value=Core.CoreServiceStub(grpc.insecure_channel("")),
    )

    float_spec = FeatureSpecProto(
        name="feature_1", value_type=ValueProto.ValueType.FLOAT
    )
    string_spec = FeatureSpecProto(
        name="feature_2", value_type=ValueProto.ValueType.STRING
    )
    canned_response = ListFeaturesResponse(
        features={
            "driver_car:feature_1": float_spec,
            "driver_car:feature_2": string_spec,
        }
    )
    mocker.patch.object(
        test_client._core_service_stub,
        "ListFeatures",
        return_value=canned_response,
    )

    features = test_client.list_features_by_ref(project="test")
    assert len(features) == 2

    # Compare the returned values, ignoring their reference keys and order.
    listed = list(features.values())
    expected = [Feature.from_proto(float_spec), Feature.from_proto(string_spec)]
    assert sorted(listed) == sorted(expected)
def test_list_entities_and_features(client):
    """Check feature listing filtered by project, entities and labels.

    Only the feature reference keys of each result are compared against the
    expected keys.
    """
    customer_entity = Entity("customer_id", ValueType.INT64)
    driver_entity = Entity("driver_id", ValueType.INT64)

    customer_rating = Feature(
        name="rating", dtype=ValueType.FLOAT, labels={"key1": "val1"}
    )
    customer_cost = Feature(name="cost", dtype=ValueType.FLOAT)
    driver_rating = Feature(name="rating", dtype=ValueType.FLOAT)
    driver_cost = Feature(name="cost", dtype=ValueType.FLOAT, labels={"key1": "val1"})

    expected_by_entity_and_labels = {"customer:rating": customer_rating}
    expected_by_entity = {
        "driver:cost": driver_cost,
        "driver:rating": driver_rating,
    }
    expected_by_labels = {
        "customer:rating": customer_rating,
        "driver:cost": driver_cost,
    }

    customer_fs = FeatureSet(
        "customer",
        features=[customer_rating, customer_cost],
        entities=[customer_entity],
        max_age=Duration(seconds=100),
    )
    driver_fs = FeatureSet(
        "driver",
        features=[driver_rating, driver_cost],
        entities=[driver_entity],
        max_age=Duration(seconds=100),
    )

    client.set_project(PROJECT_NAME)
    client.apply(customer_fs)
    client.apply(driver_fs)

    # Test for listing of features
    # Case 1: Filter by: project, entities and labels
    actual_by_entity_and_labels = client.list_features_by_ref(
        project=PROJECT_NAME, entities=["customer_id"], labels={"key1": "val1"}
    )

    # Case 2: Filter by: project, entities
    actual_by_entity = client.list_features_by_ref(
        project=PROJECT_NAME, entities=["driver_id"]
    )

    # Case 3: Filter by: project, labels
    actual_by_labels = client.list_features_by_ref(
        project=PROJECT_NAME, labels={"key1": "val1"}
    )

    assert set(expected_by_entity_and_labels) == set(actual_by_entity_and_labels)
    assert set(expected_by_entity) == set(actual_by_entity)
    assert set(expected_by_labels) == set(actual_by_labels)
def test_apply_feature_view_integration(test_feature_store):
    """Apply a feature view, read it back via list and get, then delete it.

    The store is torn down in a ``finally`` block so that a failing
    assertion does not skip the teardown call.
    """
    expected_features = [
        ("fs1_my_feature_1", ValueType.INT64),
        ("fs1_my_feature_2", ValueType.STRING),
        ("fs1_my_feature_3", ValueType.STRING_LIST),
        ("fs1_my_feature_4", ValueType.BYTES_LIST),
    ]

    def assert_matches_applied_view(view):
        # One assert per property so a failure names the mismatching field.
        assert view.name == "my_feature_view_1"
        for position, (feature_name, feature_dtype) in enumerate(expected_features):
            assert view.features[position].name == feature_name
            assert view.features[position].dtype == feature_dtype
        assert view.entities[0] == "fs1_my_entity_1"

    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name=feature_name, dtype=feature_dtype)
            for feature_name, feature_dtype in expected_features
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    try:
        # Register Feature View
        test_feature_store.apply([fv1])

        # List Feature Views
        feature_views = test_feature_store.list_feature_views()
        assert len(feature_views) == 1
        assert_matches_applied_view(feature_views[0])

        feature_view = test_feature_store.get_feature_view("my_feature_view_1")
        assert_matches_applied_view(feature_view)

        test_feature_store.delete_feature_view("my_feature_view_1")
        feature_views = test_feature_store.list_feature_views()
        assert len(feature_views) == 0
    finally:
        test_feature_store.teardown()
def test_multiple_featureset_joins(client):
    """Batch retrieval must join features coming from two feature sets.

    Both feature sets are ingested with values derived from their entity
    ids, so each retrieved feature column can be checked against the
    matching entity column of the output.
    """
    fs1 = FeatureSet(
        "feature_set_1",
        features=[Feature("feature_value", ValueType.STRING)],
        entities=[Entity("entity_id", ValueType.INT64)],
        max_age=Duration(seconds=100),
    )
    fs2 = FeatureSet(
        "feature_set_2",
        features=[Feature("other_feature_value", ValueType.INT64)],
        entities=[Entity("other_entity_id", ValueType.INT64)],
        max_age=Duration(seconds=100),
    )

    client.apply(fs1)
    time.sleep(10)
    fs1 = client.get_feature_set(name="feature_set_1", version=1)

    client.apply(fs2)
    time.sleep(10)
    fs2 = client.get_feature_set(name="feature_set_2", version=1)

    N_ROWS = 10
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)

    features_1_df = pd.DataFrame(
        {
            "datetime": [time_offset] * N_ROWS,
            "entity_id": list(range(N_ROWS)),
            "feature_value": [str(i) for i in range(N_ROWS)],
        }
    )
    client.ingest(fs1, features_1_df)

    features_2_df = pd.DataFrame(
        {
            "datetime": [time_offset] * N_ROWS,
            "other_entity_id": list(range(N_ROWS)),
            "other_feature_value": list(range(N_ROWS)),
        }
    )
    client.ingest(fs2, features_2_df)

    # The second entity column runs in reverse order.
    entity_df = pd.DataFrame(
        {
            "datetime": [time_offset] * N_ROWS,
            "entity_id": list(range(N_ROWS)),
            "other_entity_id": list(range(N_ROWS - 1, -1, -1)),
        }
    )

    requested_features = [
        "feature_set_1:1:feature_value",
        "feature_set_2:1:other_feature_value",
    ]
    feature_retrieval_job = client.get_batch_features(
        entity_rows=entity_df, feature_ids=requested_features
    )
    output = feature_retrieval_job.to_dataframe()
    print(output.head())

    first_values = output["feature_set_1_v1_feature_value"].to_list()
    assert output["entity_id"].to_list() == [int(value) for value in first_values]

    second_values = output["feature_set_2_v1_other_feature_value"].to_list()
    assert output["other_entity_id"].to_list() == second_values
def test_apply_feature_table_success(self, test_client):
    """Apply one feature table and check that listing returns it unchanged."""
    test_client.set_project("project1")

    expected_features = [
        ("fs1-my-feature-1", ValueType.INT64),
        ("fs1-my-feature-2", ValueType.STRING),
        ("fs1-my-feature-3", ValueType.STRING_LIST),
        ("fs1-my-feature-4", ValueType.BYTES_LIST),
    ]

    # Create Feature Tables
    batch_source = FileSource(
        file_format="parquet",
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    stream_source = KafkaSource(
        bootstrap_servers="localhost:9094",
        class_path="random/path/to/class",
        topic="test_topic",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
    )

    ft1 = FeatureTable(
        name="my-feature-table-1",
        features=[
            Feature(name=feature_name, dtype=feature_dtype)
            for feature_name, feature_dtype in expected_features
        ],
        entities=["fs1-my-entity-1"],
        labels={"team": "matchmaking"},
        batch_source=batch_source,
        stream_source=stream_source,
    )

    # Register Feature Table with Core
    test_client.apply_feature_table(ft1)

    # List Feature Tables
    feature_tables = test_client.list_feature_tables()
    assert len(feature_tables) == 1

    listed_table = feature_tables[0]
    assert listed_table.name == "my-feature-table-1"
    for position, (feature_name, feature_dtype) in enumerate(expected_features):
        assert listed_table.features[position].name == feature_name
        assert listed_table.features[position].dtype == feature_dtype
    assert listed_table.entities[0] == "fs1-my-entity-1"
def test_reapply_feature_view_success(test_feature_store, dataframe_source):
    """Check how re-applying a feature view affects its materialization intervals.

    Re-applying an unchanged view must keep the recorded interval, while
    applying a view with changed features must leave none recorded. The
    store is torn down in a ``finally`` block so that a failing assertion
    does not skip the teardown call.
    """
    with prep_file_source(
        df=dataframe_source, event_timestamp_column="ts_1"
    ) as file_source:
        e = Entity(name="id", value_type=ValueType.STRING)

        # Create Feature View
        fv1 = FeatureView(
            name="my_feature_view_1",
            features=[Feature(name="string_col", dtype=ValueType.STRING)],
            entities=["id"],
            batch_source=file_source,
            ttl=timedelta(minutes=5),
        )

        try:
            # Register Feature View
            test_feature_store.apply([fv1, e])

            # Check Feature View
            fv_stored = test_feature_store.get_feature_view(fv1.name)
            assert len(fv_stored.materialization_intervals) == 0

            # Run materialization
            test_feature_store.materialize(datetime(2020, 1, 1), datetime(2021, 1, 1))

            # Check Feature View
            fv_stored = test_feature_store.get_feature_view(fv1.name)
            assert len(fv_stored.materialization_intervals) == 1

            # Apply again
            test_feature_store.apply([fv1])

            # Check Feature View
            fv_stored = test_feature_store.get_feature_view(fv1.name)
            assert len(fv_stored.materialization_intervals) == 1

            # Change and apply Feature View
            fv1 = FeatureView(
                name="my_feature_view_1",
                features=[Feature(name="int64_col", dtype=ValueType.INT64)],
                entities=["id"],
                batch_source=file_source,
                ttl=timedelta(minutes=5),
            )
            test_feature_store.apply([fv1])

            # Check Feature View
            fv_stored = test_feature_store.get_feature_view(fv1.name)
            assert len(fv_stored.materialization_intervals) == 0
        finally:
            test_feature_store.teardown()
def feature_stats_feature_set(client):
    """Build the ``feature_stats`` feature set, apply it via ``client`` and return it."""
    stat_features = [
        Feature("strings", ValueType.STRING),
        Feature("ints", ValueType.INT64),
        Feature("floats", ValueType.FLOAT),
    ]
    stats_feature_set = FeatureSet(
        "feature_stats",
        features=stat_features,
        entities=[Entity("entity_id", ValueType.INT64)],
        max_age=Duration(seconds=100),
    )
    client.apply(stats_feature_set)
    return stats_feature_set
def create_customer_daily_profile_feature_view(source):
    """Return the ``customer_profile`` feature view reading from ``source``.

    The view is keyed on ``customer_id`` and has a two-day TTL.
    """
    profile_features = [
        Feature(name="current_balance", dtype=ValueType.FLOAT),
        Feature(name="avg_passenger_count", dtype=ValueType.FLOAT),
        Feature(name="lifetime_trip_count", dtype=ValueType.INT32),
    ]
    return FeatureView(
        name="customer_profile",
        entities=["customer_id"],
        features=profile_features,
        input=source,
        ttl=timedelta(days=2),
    )
def create_driver_hourly_stats_feature_view(source):
    """Return the ``driver_stats`` feature view reading from ``source``.

    The view is keyed on ``driver_id`` and has a two-hour TTL.
    """
    stats_features = [
        Feature(name="conv_rate", dtype=ValueType.FLOAT),
        Feature(name="acc_rate", dtype=ValueType.FLOAT),
        Feature(name="avg_daily_trips", dtype=ValueType.INT32),
    ]
    return FeatureView(
        name="driver_stats",
        entities=["driver_id"],
        features=stats_features,
        input=source,
        ttl=timedelta(hours=2),
    )
def test_basic_register_feature_set_success(client):
    """Register feature sets without a project, with a project, and with labels.

    Once the client's project has been set, the remaining steps run inside
    ``try``/``finally`` so the project is reset even when an assertion
    fails and later tests do not inherit it.
    """
    # Register feature set without project
    cust_trans_fs_expected = FeatureSet.from_yaml(
        f"{DIR_PATH}/basic/cust_trans_fs.yaml"
    )
    driver_fs_expected = FeatureSet.from_yaml(f"{DIR_PATH}/basic/driver_fs.yaml")
    client.apply(cust_trans_fs_expected)
    client.apply(driver_fs_expected)
    cust_trans_fs_actual = client.get_feature_set("customer_transactions")
    assert cust_trans_fs_actual == cust_trans_fs_expected
    driver_fs_actual = client.get_feature_set("driver")
    assert driver_fs_actual == driver_fs_expected

    # Register feature set with project
    cust_trans_fs_expected = FeatureSet.from_yaml(
        f"{DIR_PATH}/basic/cust_trans_fs.yaml"
    )
    client.set_project(PROJECT_NAME)
    try:
        client.apply(cust_trans_fs_expected)
        cust_trans_fs_actual = client.get_feature_set(
            "customer_transactions", project=PROJECT_NAME
        )
        assert cust_trans_fs_actual == cust_trans_fs_expected

        # Register feature set with labels
        driver_unlabelled_fs = FeatureSet(
            "driver_unlabelled",
            features=[
                Feature("rating", ValueType.FLOAT),
                Feature("cost", ValueType.FLOAT),
            ],
            entities=[Entity("entity_id", ValueType.INT64)],
            max_age=Duration(seconds=100),
        )
        driver_labeled_fs_expected = FeatureSet(
            "driver_labeled",
            features=[
                Feature("rating", ValueType.FLOAT),
                Feature("cost", ValueType.FLOAT),
            ],
            entities=[Entity("entity_id", ValueType.INT64)],
            max_age=Duration(seconds=100),
            labels={"key1": "val1"},
        )
        client.set_project(PROJECT_NAME)
        client.apply(driver_unlabelled_fs)
        client.apply(driver_labeled_fs_expected)
        driver_fs_actual = client.list_feature_sets(
            project=PROJECT_NAME, labels={"key1": "val1"}
        )[0]
        assert driver_fs_actual == driver_labeled_fs_expected
    finally:
        # reset client's project for other tests
        client.set_project()
def bq_featuretable(bq_table_id):
    """Return the ``basic_featuretable`` feature table backed by ``bq_table_id``."""
    table_features = [
        Feature(name="dev_feature_float", dtype=ValueType.FLOAT),
        Feature(name="dev_feature_string", dtype=ValueType.STRING),
    ]
    bigquery_source = BigQuerySource(
        table_ref=bq_table_id,
        timestamp_column="datetime",
    )
    feature_table = FeatureTable(
        name="basic_featuretable",
        entities=["driver_id", "customer_id"],
        features=table_features,
        max_age=Duration(seconds=3600),
        batch_source=bigquery_source,
    )
    return feature_table
def test_apply_object_and_read(test_feature_store):
    """Apply two feature views and two entities, then read one of each back.

    The objects read back must equal the applied ones and differ from their
    siblings. The store is torn down in a ``finally`` block so that a
    failing assertion does not skip the teardown call.
    """
    assert isinstance(test_feature_store, FeatureStore)

    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
    )

    e1 = Entity(
        name="fs1_my_entity_1", value_type=ValueType.STRING, description="something"
    )
    e2 = Entity(
        name="fs1_my_entity_2", value_type=ValueType.STRING, description="something"
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    fv2 = FeatureView(
        name="my_feature_view_2",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    try:
        # Register Feature View
        test_feature_store.apply([fv1, e1, fv2, e2])

        fv1_actual = test_feature_store.get_feature_view("my_feature_view_1")
        e1_actual = test_feature_store.get_entity("fs1_my_entity_1")

        assert fv1 == fv1_actual
        assert e1 == e1_actual
        assert fv2 != fv1_actual
        assert e2 != e1_actual
    finally:
        test_feature_store.teardown()
def from_proto(cls, feature_set_proto: FeatureSetProto):
    """
    Creates a feature set from a protobuf representation of a feature set

    Args:
        feature_set_proto: A protobuf representation of a feature set

    Returns:
        Returns a FeatureSet object based on the feature set protobuf
    """
    spec = feature_set_proto.spec
    meta = feature_set_proto.meta

    feature_set = cls(
        name=spec.name,
        features=[Feature.from_proto(feature) for feature in spec.features],
        entities=[Entity.from_proto(entity) for entity in spec.entities],
        max_age=spec.max_age,
        # A source of type 0 is treated as "no source".
        source=None if spec.source.type == 0 else Source.from_proto(spec.source),
        # An empty project string means the project is unset. The previous
        # conditional returned the same value from both branches, so the
        # empty string leaked through instead of None.
        project=None if len(spec.project) == 0 else spec.project,
    )

    feature_set._version = spec.version
    feature_set._status = meta.status
    feature_set._created_timestamp = meta.created_timestamp
    return feature_set
def from_proto(cls, feature_table_proto: FeatureTableProto):
    """
    Builds a feature table out of its protobuf representation

    Args:
        feature_table_proto: A protobuf representation of a feature table

    Returns:
        Returns a feature table object (an instance of ``cls``) based on the
        feature table protobuf
    """
    spec = feature_table_proto.spec

    entity_names = [entity for entity in spec.entities]
    table_features = [Feature.from_proto(feature) for feature in spec.features]

    # A duration with zero seconds and zero nanos is mapped to None.
    if spec.max_age.seconds == 0 and spec.max_age.nanos == 0:
        max_age = None
    else:
        max_age = spec.max_age

    batch_source = DataSource.from_proto(spec.batch_source)

    # A stream source message of zero byte size is mapped to None.
    if spec.stream_source.ByteSize():
        stream_source = DataSource.from_proto(spec.stream_source)
    else:
        stream_source = None

    feature_table = cls(
        name=spec.name,
        entities=entity_names,
        features=table_features,
        labels=spec.labels,
        max_age=max_age,
        batch_source=batch_source,
        stream_source=stream_source,
    )
    feature_table._created_timestamp = feature_table_proto.meta.created_timestamp
    return feature_table
def test_order_by_creation_time(client):
    """Rows ingested later must win when event timestamps are identical.

    Two dataframes with the same entities and timestamps are ingested ten
    seconds apart; batch retrieval must return the values of the second.
    """
    proc_time_fs = FeatureSet(
        "processing_time",
        features=[Feature("feature_value", ValueType.STRING)],
        entities=[Entity("entity_id", ValueType.INT64)],
        max_age=Duration(seconds=100),
    )
    client.apply(proc_time_fs)
    time.sleep(10)
    proc_time_fs = client.get_feature_set(name="processing_time", version=1)

    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    N_ROWS = 10

    incorrect_df = pd.DataFrame(
        {
            "datetime": [time_offset] * N_ROWS,
            "entity_id": list(range(N_ROWS)),
            "feature_value": ["WRONG"] * N_ROWS,
        }
    )
    correct_df = pd.DataFrame(
        {
            "datetime": [time_offset] * N_ROWS,
            "entity_id": list(range(N_ROWS)),
            "feature_value": ["CORRECT"] * N_ROWS,
        }
    )

    client.ingest(proc_time_fs, incorrect_df)
    time.sleep(10)
    client.ingest(proc_time_fs, correct_df)

    entity_rows = incorrect_df[["datetime", "entity_id"]]
    feature_retrieval_job = client.get_batch_features(
        entity_rows=entity_rows,
        feature_ids=["processing_time:1:feature_value"],
    )
    output = feature_retrieval_job.to_dataframe()
    print(output.head())

    retrieved_values = output["processing_time_v1_feature_value"].to_list()
    assert retrieved_values == ["CORRECT"] * N_ROWS
def infer_features_from_input_source(self, config: RepoConfig):
    """
    Infers this view's features from the columns of its input source.

    Nothing happens when features are already set. Otherwise every source
    column becomes a feature, except the event and created timestamp
    columns, the entity columns, and columns whose name starts or ends with
    a double underscore. A column listed in the source's field mapping is
    registered under its mapped name.

    Args:
        config: Repo configuration passed on to the source when listing its
            column names and types.

    Raises:
        RegistryInferenceFailure: If no feature could be inferred.
    """
    if self.features:
        return

    columns_to_exclude = {
        self.input.event_timestamp_column,
        self.input.created_timestamp_column,
    } | set(self.entities)

    for col_name, col_datatype in self.input.get_table_column_names_and_types(
        config
    ):
        if col_name in columns_to_exclude:
            continue
        # double underscores often signal an internal-use column.
        # re.search is required here: re.match only matches at the start of
        # the string, so the trailing "__$" alternative never took effect.
        if re.search(r"^__|__$", col_name):
            continue

        feature_name = (
            self.input.field_mapping[col_name]
            if col_name in self.input.field_mapping
            else col_name
        )
        self.features.append(
            Feature(
                feature_name,
                self.input.source_datatype_to_feast_value_type()(col_datatype),
            )
        )

    if not self.features:
        raise RegistryInferenceFailure(
            "FeatureView",
            f"Could not infer Features for the FeatureView named {self.name}.",
        )
def from_proto(cls, feature_view_proto: FeatureViewProto):
    """
    Builds a feature view out of its protobuf representation

    Args:
        feature_view_proto: A protobuf representation of a feature view

    Returns:
        Returns a feature view object (an instance of ``cls``) based on the
        feature view protobuf
    """
    spec = feature_view_proto.spec

    entity_names = [entity for entity in spec.entities]
    view_features = [
        Feature(
            name=feature.name,
            dtype=ValueType(feature.value_type),
            labels=feature.labels,
        )
        for feature in spec.features
    ]

    # A ttl with zero seconds and zero nanos is mapped to None.
    if spec.ttl.seconds == 0 and spec.ttl.nanos == 0:
        ttl = None
    else:
        ttl = spec.ttl

    feature_view = cls(
        name=spec.name,
        entities=entity_names,
        features=view_features,
        tags=dict(spec.tags),
        online=spec.online,
        ttl=ttl,
        input=DataSource.from_proto(spec.input),
    )
    feature_view.created_timestamp = feature_view_proto.meta.created_timestamp
    return feature_view
def from_proto(cls, feature_set_proto: FeatureSetSpecProto):
    """
    Builds a feature set out of a feature set spec protobuf

    Args:
        feature_set_proto: A protobuf representation of a feature set spec

    Returns:
        Returns a FeatureSet object based on the feature set protobuf
    """
    parsed_features = [
        Feature.from_proto(feature) for feature in feature_set_proto.features
    ]
    parsed_entities = [
        Entity.from_proto(entity) for entity in feature_set_proto.entities
    ]

    # A source of type 0 is mapped to None.
    if feature_set_proto.source.type == 0:
        parsed_source = None
    else:
        parsed_source = Source.from_proto(feature_set_proto.source)

    feature_set = cls(
        name=feature_set_proto.name,
        features=parsed_features,
        entities=parsed_entities,
        max_age=feature_set_proto.max_age,
        source=parsed_source,
    )
    feature_set._version = feature_set_proto.version
    feature_set._is_dirty = False
    return feature_set
def test_basic_ingest_retrieval_str(client):
    """Ingest by feature set name and read the values back online."""
    # Set to another project to test ingestion based on current project context
    client.set_project(PROJECT_NAME + "_NS1")

    customer_features = [
        Feature(name="cust_rating", dtype=ValueType.INT64),
        Feature(name="cust_cost", dtype=ValueType.FLOAT),
    ]
    customer_fs = FeatureSet(
        name="cust_fs",
        features=customer_features,
        entities=[Entity("cust_id", ValueType.INT64)],
        max_age=Duration(seconds=3600),
    )
    client.apply(customer_fs)

    N_ROWS = 2
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    cust_df = pd.DataFrame(
        {
            "datetime": [time_offset] * N_ROWS,
            "cust_id": list(range(N_ROWS)),
            "cust_rating": list(range(N_ROWS)),
            "cust_cost": [float(i) + 0.5 for i in range(N_ROWS)],
        }
    )

    # The feature set is referenced by name rather than by object.
    client.ingest("cust_fs", cust_df, timeout=600)
    time.sleep(15)

    online_request_entity = [{"cust_id": 0}, {"cust_id": 1}]
    online_request_features = ["cust_rating", "cust_cost"]

    def try_get_features():
        # Always reports success, so the first response is the one used.
        return (
            client.get_online_features(
                entity_rows=online_request_entity,
                feature_refs=online_request_features,
            ),
            True,
        )

    online_features_actual = wait_retry_backoff(
        retry_fn=try_get_features,
        timeout_secs=90,
        timeout_msg="Timed out trying to get online feature values",
    )

    online_features_expected = {
        "cust_id": [0, 1],
        "cust_rating": [0, 1],
        "cust_cost": [0.5, 1.5],
    }

    assert online_features_actual.to_dict() == online_features_expected
def get_feature_view(data_source: DataSource) -> FeatureView:
    """Return the ``test_bq_correctness`` view using ``data_source`` as batch source."""
    correctness_view = FeatureView(
        name="test_bq_correctness",
        entities=["driver"],
        features=[Feature("value", ValueType.FLOAT)],
        ttl=timedelta(days=5),
        batch_source=data_source,
    )
    return correctness_view
def get_feature_view(data_source: Union[FileSource, BigQuerySource]) -> FeatureView:
    """Return the ``test_bq_correctness`` view using ``data_source`` as its input."""
    correctness_view = FeatureView(
        name="test_bq_correctness",
        entities=["driver"],
        features=[Feature("value", ValueType.FLOAT)],
        ttl=timedelta(days=5),
        input=data_source,
    )
    return correctness_view
def test_bigquery_query_to_datastore_correctness(self):
    """Materialize a query-based BigQuery source and read one value back online.

    Three rows are loaded into a fresh BigQuery table, a feature view reads
    them through a ``SELECT *`` query, and after materializing the last
    five minutes the online value for ``driver_id`` 1 must be 0.3.
    """
    # create dataset
    ts = pd.Timestamp.now(tz="UTC").round("ms")
    df = pd.DataFrame.from_dict(
        {
            "id": [1, 2, 1],
            "value": [0.1, 0.2, 0.3],
            "ts_1": [ts - timedelta(minutes=2), ts, ts],
            "created_ts": [ts, ts, ts],
        }
    )

    # load dataset into BigQuery
    job_config = bigquery.LoadJobConfig()
    table_id = f"{self.gcp_project}.{self.bigquery_dataset}.query_correctness_{int(time.time())}"
    query = f"SELECT * FROM `{table_id}`"
    load_job = self.client.load_table_from_dataframe(
        df, table_id, job_config=job_config
    )
    load_job.result()

    # create FeatureView
    query_source = BigQuerySource(
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        field_mapping={"ts_1": "ts", "id": "driver_id"},
        date_partition_column="",
        query=query,
    )
    fv = FeatureView(
        name="test_bq_query_correctness",
        entities=["driver_id"],
        features=[Feature("value", ValueType.FLOAT)],
        ttl=timedelta(minutes=5),
        input=query_source,
    )
    config = RepoConfig(
        metadata_store="./metadata.db",
        project=f"test_bq_query_correctness_{int(time.time())}",
        provider="gcp",
    )
    fs = FeatureStore(config=config)
    fs.apply([fv])

    # run materialize()
    window_start = datetime.utcnow() - timedelta(minutes=5)
    window_end = datetime.utcnow() - timedelta(minutes=0)
    fs.materialize([fv.name], window_start, window_end)

    # check result of materialize()
    feature_ref = f"{fv.name}:value"
    response_dict = fs.get_online_features(
        [feature_ref], [{"driver_id": 1}]
    ).to_dict()
    assert abs(response_dict[f"{fv.name}:value"][0] - 0.3) < 1e-6
def from_proto(cls, feature_view_proto: FeatureViewProto):
    """
    Builds a feature view out of its protobuf representation.

    Args:
        feature_view_proto: A protobuf representation of a feature view.

    Returns:
        A feature view object (an instance of ``cls``) based on the feature
        view protobuf.
    """
    spec = feature_view_proto.spec
    meta = feature_view_proto.meta

    batch_source = DataSource.from_proto(spec.batch_source)
    if spec.HasField("stream_source"):
        stream_source = DataSource.from_proto(spec.stream_source)
    else:
        stream_source = None

    view_features = [
        Feature(
            name=feature.name,
            dtype=ValueType(feature.value_type),
            labels=dict(feature.labels),
        )
        for feature in spec.features
    ]

    # A ttl with zero seconds and zero nanos is mapped to None.
    if spec.ttl.seconds == 0 and spec.ttl.nanos == 0:
        ttl = None
    else:
        ttl = spec.ttl

    feature_view = cls(
        name=spec.name,
        entities=[entity for entity in spec.entities],
        features=view_features,
        tags=dict(spec.tags),
        online=spec.online,
        ttl=ttl,
        batch_source=batch_source,
        stream_source=stream_source,
    )

    # Timestamps are copied only when they are set on the proto.
    if meta.HasField("created_timestamp"):
        feature_view.created_timestamp = meta.created_timestamp.ToDatetime()
    if meta.HasField("last_updated_timestamp"):
        feature_view.last_updated_timestamp = (
            meta.last_updated_timestamp.ToDatetime()
        )

    for interval in meta.materialization_intervals:
        interval_start = utils.make_tzaware(interval.start_time.ToDatetime())
        interval_end = utils.make_tzaware(interval.end_time.ToDatetime())
        feature_view.materialization_intervals.append((interval_start, interval_end))

    return feature_view
def __init__(
    self,
    name: str,
    entities: List[str],
    ttl: Optional[Union[Duration, timedelta]],
    input: Union[BigQuerySource, FileSource],
    features: Optional[List[Feature]] = None,
    tags: Optional[Dict[str, str]] = None,
    online: bool = True,
):
    """
    Creates a feature view, inferring its features when none are given.

    Args:
        name: Name of the feature view.
        entities: Names of the entities the view is keyed on.
        ttl: Time to live. A ``Duration`` is converted to a ``timedelta``
            built from its whole seconds; any other value is stored as is.
        input: Data source the view reads from.
        features: Features of the view. When omitted or empty, one feature
            is inferred per source column, skipping the timestamp columns,
            the entity columns, and columns whose name starts or ends with
            a double underscore.
        tags: Tags stored on the view; defaults to an empty dict.
        online: Stored on the view as ``self.online``.

    Raises:
        ValueError: If no features are given and none can be inferred, or
            if an entity or feature name is a key of the source's field
            mapping.
    """
    if not features:
        # Always start from a fresh list so a caller's list is not mutated.
        features = []
        columns_to_exclude = {
            input.event_timestamp_column,
            input.created_timestamp_column,
        } | set(entities)

        for col_name, col_datatype in input.get_table_column_names_and_types():
            if col_name in columns_to_exclude:
                continue
            # re.search is required here: re.match only matches at the start
            # of the string, so the trailing "__$" alternative never took
            # effect.
            if re.search(r"^__|__$", col_name):
                continue
            features.append(
                Feature(
                    col_name,
                    input.source_datatype_to_feast_value_type()(col_datatype),
                )
            )

        if not features:
            raise ValueError(
                f"Could not infer Features for the FeatureView named {name}. Please specify Features explicitly for this FeatureView."
            )

    # Entity and feature names must not be source-side keys of the mapping.
    cols = list(entities) + [feat.name for feat in features]
    for col in cols:
        if input.field_mapping is not None and col in input.field_mapping:
            raise ValueError(
                f"The field {col} is mapped to {input.field_mapping[col]} for this data source. Please either remove this field mapping or use {input.field_mapping[col]} as the Entity or Feature name."
            )

    self.name = name
    self.entities = entities
    self.features = features
    self.tags = tags if tags is not None else {}

    if isinstance(ttl, Duration):
        self.ttl = timedelta(seconds=int(ttl.seconds))
    else:
        self.ttl = ttl

    self.online = online
    self.input = input

    self.materialization_intervals = []
def test_apply_all_featuresets(client):
    """Apply seven single-feature feature sets to the test project, in order."""
    client.set_project(PROJECT_NAME)

    # (feature set name, feature name, feature dtype, entity name)
    definitions = [
        ("file_feature_set", "feature_value1", ValueType.STRING, "entity_id"),
        ("gcs_feature_set", "feature_value2", ValueType.STRING, "entity_id"),
        ("processing_time", "feature_value3", ValueType.STRING, "entity_id"),
        ("additional_columns", "feature_value4", ValueType.STRING, "entity_id"),
        ("historical", "feature_value5", ValueType.STRING, "entity_id"),
        ("feature_set_1", "feature_value6", ValueType.STRING, "entity_id"),
        ("feature_set_2", "other_feature_value7", ValueType.INT64, "other_entity_id"),
    ]

    for set_name, feature_name, feature_dtype, entity_name in definitions:
        feature_set = FeatureSet(
            set_name,
            features=[Feature(feature_name, feature_dtype)],
            entities=[Entity(entity_name, ValueType.INT64)],
            max_age=Duration(seconds=100),
        )
        client.apply(feature_set)