def get_feature_service(
    self, name: str, project: str, allow_cache: bool = False
) -> FeatureService:
    """
    Retrieves a feature service.

    Args:
        name: Name of the feature service.
        project: Feast project that this feature service belongs to.
        allow_cache: Whether to allow returning this feature service from a cached registry.

    Returns:
        The specified feature service.

    Raises:
        FeatureServiceNotFoundException: No feature service with the given name exists in the project.
    """
    registry = self._get_registry_proto(allow_cache=allow_cache)

    for feature_service_proto in registry.feature_services:
        if (
            feature_service_proto.spec.project == project
            and feature_service_proto.spec.name == name
        ):
            return FeatureService.from_proto(feature_service_proto)
    raise FeatureServiceNotFoundException(name, project=project)
def list_feature_services(
    self, project: str, allow_cache: bool = False
) -> List[FeatureService]:
    """
    Retrieves a list of feature services from the registry.

    Args:
        project: Filter feature services based on project name.
        allow_cache: Whether to allow returning feature services from a cached registry.

    Returns:
        List of feature services.
    """
    registry = self._get_registry_proto(allow_cache=allow_cache)
    feature_services = []
    for feature_service_proto in registry.feature_services:
        if feature_service_proto.spec.project == project:
            feature_services.append(FeatureService.from_proto(feature_service_proto))
    return feature_services
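# A minimal usage sketch for the two registry accessors above, assuming a
# registry object that exposes them (e.g. an applied Feast registry). The
# project name "my_project" is an illustrative placeholder, not a value the
# registry is guaranteed to contain.
def _example_registry_usage(registry) -> None:
    # Enumerate every feature service registered under the project.
    for fs in registry.list_feature_services(project="my_project"):
        print(fs.name)
    # Fetch one service by name; this raises FeatureServiceNotFoundException
    # if no such service exists in the project.
    svc = registry.get_feature_service("convrate_plus100", project="my_project")
    print(svc.name)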
def test_historical_features(environment, universal_data_sources, full_feature_names):
    store = environment.feature_store

    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    entity_df_with_request_data = datasets.entity_df.copy(deep=True)
    entity_df_with_request_data["val_to_add"] = [
        i for i in range(len(entity_df_with_request_data))
    ]
    entity_df_with_request_data["driver_age"] = [
        i + 100 for i in range(len(entity_df_with_request_data))
    ]

    feature_service = FeatureService(
        name="convrate_plus100",
        features=[feature_views.driver[["conv_rate"]], feature_views.driver_odfv],
    )
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            feature_views.location.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}
            ),
            feature_views.location.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}
            ),
        ],
    )

    store.apply(
        [
            driver(),
            customer(),
            location(),
            feature_service,
            feature_service_entity_mapping,
            *feature_views.values(),
        ]
    )

    event_timestamp = (
        DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
        if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in datasets.orders_df.columns
        else "e_ts"
    )
    full_expected_df = get_expected_training_df(
        datasets.customer_df,
        feature_views.customer,
        datasets.driver_df,
        feature_views.driver,
        datasets.orders_df,
        feature_views.order,
        datasets.location_df,
        feature_views.location,
        datasets.global_df,
        feature_views.global_fv,
        datasets.field_mapping_df,
        feature_views.field_mapping,
        entity_df_with_request_data,
        event_timestamp,
        full_feature_names,
    )

    # The shadow entity features are only needed for the FeatureService test
    expected_df = full_expected_df.drop(
        columns=["origin__temperature", "destination__temperature"],
    )

    job_from_df = store.get_historical_features(
        entity_df=entity_df_with_request_data,
        features=[
            "driver_stats:conv_rate",
            "driver_stats:avg_daily_trips",
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "conv_rate_plus_100:conv_rate_plus_100",
            "conv_rate_plus_100:conv_rate_plus_100_rounded",
            "conv_rate_plus_100:conv_rate_plus_val_to_add",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "field_mapping:feature_name",
        ],
        full_feature_names=full_feature_names,
    )

    start_time = datetime.utcnow()
    actual_df_from_df_entities = job_from_df.to_df()
    print(f"actual_df_from_df_entities shape: {actual_df_from_df_entities.shape}")
    end_time = datetime.utcnow()
    print(f"Time to execute job_from_df.to_df() = '{end_time - start_time}'\n")

    assert sorted(expected_df.columns) == sorted(actual_df_from_df_entities.columns)
    assert_frame_equal(
        expected_df,
        actual_df_from_df_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )

    assert_feature_service_correctness(
        store,
        feature_service,
        full_feature_names,
        entity_df_with_request_data,
        expected_df,
        event_timestamp,
    )
    assert_feature_service_entity_mapping_correctness(
        store,
        feature_service_entity_mapping,
        full_feature_names,
        entity_df_with_request_data,
        full_expected_df,
        event_timestamp,
    )

    table_from_df_entities: pd.DataFrame = job_from_df.to_arrow().to_pandas()
    assert_frame_equal(
        expected_df,
        table_from_df_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )
def test_historical_features(environment, universal_data_sources, full_feature_names):
    store = environment.feature_store

    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    customer_df, driver_df, location_df, orders_df, global_df, entity_df = (
        datasets["customer"],
        datasets["driver"],
        datasets["location"],
        datasets["orders"],
        datasets["global"],
        datasets["entity"],
    )

    entity_df_with_request_data = entity_df.copy(deep=True)
    entity_df_with_request_data["val_to_add"] = [
        i for i in range(len(entity_df_with_request_data))
    ]
    entity_df_with_request_data["driver_age"] = [
        i + 100 for i in range(len(entity_df_with_request_data))
    ]

    (
        customer_fv,
        driver_fv,
        driver_odfv,
        location_fv,
        order_fv,
        global_fv,
        driver_age_request_fv,
    ) = (
        feature_views["customer"],
        feature_views["driver"],
        feature_views["driver_odfv"],
        feature_views["location"],
        feature_views["order"],
        feature_views["global"],
        feature_views["driver_age_request_fv"],
    )

    feature_service = FeatureService(
        name="convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]],
            driver_odfv,
            driver_age_request_fv,
        ],
    )
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            location_fv.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}
            ),
            location_fv.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}
            ),
        ],
    )

    feast_objects = [
        customer_fv,
        driver_fv,
        driver_odfv,
        location_fv,
        order_fv,
        global_fv,
        driver_age_request_fv,
        driver(),
        customer(),
        location(),
        feature_service,
        feature_service_entity_mapping,
    ]
    store.apply(feast_objects)

    entity_df_query = None
    orders_table = table_name_from_data_source(data_sources["orders"])
    if orders_table:
        entity_df_query = f"SELECT customer_id, driver_id, order_id, origin_id, destination_id, event_timestamp FROM {orders_table}"

    event_timestamp = (
        DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
        if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in orders_df.columns
        else "e_ts"
    )
    full_expected_df = get_expected_training_df(
        customer_df,
        customer_fv,
        driver_df,
        driver_fv,
        orders_df,
        order_fv,
        location_df,
        location_fv,
        global_df,
        global_fv,
        entity_df_with_request_data,
        event_timestamp,
        full_feature_names,
    )

    # The shadow entity features are only needed for the FeatureService test
    expected_df = full_expected_df.drop(
        columns=["origin__temperature", "destination__temperature"],
    )

    if entity_df_query:
        job_from_sql = store.get_historical_features(
            entity_df=entity_df_query,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "order:order_is_success",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )

        start_time = datetime.utcnow()
        actual_df_from_sql_entities = job_from_sql.to_df()
        end_time = datetime.utcnow()
        print(f"\nTime to execute job_from_sql.to_df() = '{end_time - start_time}'")

        # The on demand transforms are not requested with an entity_df query,
        # since request data cannot be passed through SQL.
        expected_df_query = expected_df.drop(
            columns=[
                "conv_rate_plus_100",
                "conv_rate_plus_100_rounded",
                "val_to_add",
                "conv_rate_plus_val_to_add",
                "driver_age",
            ]
        )
        assert sorted(expected_df_query.columns) == sorted(
            actual_df_from_sql_entities.columns
        )

        actual_df_from_sql_entities = (
            actual_df_from_sql_entities[expected_df_query.columns]
            .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
            .drop_duplicates()
            .reset_index(drop=True)
        )
        expected_df_query = (
            expected_df_query.sort_values(
                by=[event_timestamp, "order_id", "driver_id", "customer_id"]
            )
            .drop_duplicates()
            .reset_index(drop=True)
        )
        assert_frame_equal(
            actual_df_from_sql_entities, expected_df_query, check_dtype=False,
        )

        table_from_sql_entities = job_from_sql.to_arrow()
        df_from_sql_entities = (
            table_from_sql_entities.to_pandas()[expected_df_query.columns]
            .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
            .drop_duplicates()
            .reset_index(drop=True)
        )

        for col in df_from_sql_entities.columns:
            expected_df_query[col] = expected_df_query[col].astype(
                df_from_sql_entities[col].dtype
            )
        assert_frame_equal(expected_df_query, df_from_sql_entities)

    job_from_df = store.get_historical_features(
        entity_df=entity_df_with_request_data,
        features=[
            "driver_stats:conv_rate",
            "driver_stats:avg_daily_trips",
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "conv_rate_plus_100:conv_rate_plus_100",
            "conv_rate_plus_100:conv_rate_plus_100_rounded",
            "conv_rate_plus_100:conv_rate_plus_val_to_add",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "driver_age:driver_age",
        ],
        full_feature_names=full_feature_names,
    )

    start_time = datetime.utcnow()
    actual_df_from_df_entities = job_from_df.to_df()
    print(f"actual_df_from_df_entities shape: {actual_df_from_df_entities.shape}")
    end_time = datetime.utcnow()
    print(f"Time to execute job_from_df.to_df() = '{end_time - start_time}'\n")

    assert sorted(expected_df.columns) == sorted(actual_df_from_df_entities.columns)
    expected_df: pd.DataFrame = (
        expected_df.sort_values(
            by=[event_timestamp, "order_id", "driver_id", "customer_id"]
        )
        .drop_duplicates()
        .reset_index(drop=True)
    )
    actual_df_from_df_entities = (
        actual_df_from_df_entities[expected_df.columns]
        .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
        .drop_duplicates()
        .reset_index(drop=True)
    )
    assert_frame_equal(
        expected_df, actual_df_from_df_entities, check_dtype=False,
    )

    assert_feature_service_correctness(
        store,
        feature_service,
        full_feature_names,
        entity_df_with_request_data,
        expected_df,
        event_timestamp,
    )
    assert_feature_service_entity_mapping_correctness(
        store,
        feature_service_entity_mapping,
        full_feature_names,
        entity_df_with_request_data,
        full_expected_df,
        event_timestamp,
    )

    table_from_df_entities: pd.DataFrame = job_from_df.to_arrow().to_pandas()
    columns_expected_in_table = expected_df.columns.tolist()
    table_from_df_entities = (
        table_from_df_entities[columns_expected_in_table]
        .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
        .drop_duplicates()
        .reset_index(drop=True)
    )
    assert_frame_equal(actual_df_from_df_entities, table_from_df_entities)

    # If request data needed by an on demand transform is missing, raise an error
    with pytest.raises(RequestDataNotFoundInEntityDfException):
        store.get_historical_features(
            entity_df=entity_df,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "conv_rate_plus_100:conv_rate_plus_100",
                "conv_rate_plus_100:conv_rate_plus_val_to_add",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )

    # If request data needed by a request feature view is missing, raise an error
    with pytest.raises(RequestDataNotFoundInEntityDfException):
        store.get_historical_features(
            entity_df=entity_df,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "driver_age:driver_age",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )
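# A hedged, standalone sketch of the entity-mapping pattern exercised in the
# test above: the same location feature view is joined twice against different
# entity columns via with_name/with_join_key_map. The function name and
# parameters are illustrative; entity_df is assumed to carry origin_id and
# destination_id columns, as in the test data.
def _example_entity_mapping_retrieval(store, location_fv, entity_df):
    origin = location_fv.with_name("origin").with_join_key_map(
        {"location_id": "origin_id"}
    )
    destination = location_fv.with_name("destination").with_join_key_map(
        {"location_id": "destination_id"}
    )
    # get_historical_features also accepts a FeatureService in place of a
    # list of feature references.
    service = FeatureService(
        name="entity_mapping_demo", features=[origin, destination]
    )
    return store.get_historical_features(
        entity_df=entity_df, features=service
    ).to_df()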
def test_feature_service_with_description():
    feature_service = FeatureService(
        name="my-feature-service", features=[], description="a clear description"
    )
    assert feature_service.to_proto().spec.description == "a clear description"
def test_feature_service_without_description():
    feature_service = FeatureService(name="my-feature-service", features=[])
    # Proto3 string fields default to the empty string when unset.
    assert feature_service.to_proto().spec.description == ""
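# A hedged companion check (an assumed invariant, not an existing test in the
# suite): a description should survive a to_proto/from_proto round trip.
def test_feature_service_description_round_trip():
    original = FeatureService(
        name="my-feature-service", features=[], description="a clear description"
    )
    round_tripped = FeatureService.from_proto(original.to_proto())
    assert round_tripped.description == "a clear description"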
    return df


generated_data_source = FileSource(
    path="benchmark_data.parquet",
    timestamp_field="event_timestamp",
)

entity = Entity(
    name="entity",
    value_type=ValueType.STRING,
)

benchmark_feature_views = [
    FeatureView(
        name=f"feature_view_{i}",
        entities=["entity"],
        ttl=Duration(seconds=86400),
        schema=[Field(name=f"feature_{10 * i + j}", dtype=Int64) for j in range(10)],
        online=True,
        batch_source=generated_data_source,
    )
    for i in range(25)
]

benchmark_feature_service = FeatureService(
    name="benchmark_feature_service",
    features=benchmark_feature_views,
)
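# A hedged sketch of how the benchmark objects above might be exercised
# (an assumed workflow, not part of the setup itself): apply them to a
# FeatureStore and read all 250 features back through the feature service.
# The repo path "." and the entity key "key-1" are illustrative placeholders.
def _example_run_benchmark_retrieval():
    from feast import FeatureStore

    store = FeatureStore(repo_path=".")
    store.apply([entity, *benchmark_feature_views, benchmark_feature_service])
    return store.get_online_features(
        features=benchmark_feature_service,
        entity_rows=[{"entity": "key-1"}],
    ).to_dict()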