def test_online_retrieval(environment, universal_data_sources, benchmark):
    """Benchmark online feature retrieval end-to-end.

    Applies the universal feature views plus a feature service, materializes
    the configured date window into the online store, then benchmarks
    ``FeatureStore.get_online_features`` over 10 sampled entity rows.

    Args:
        environment: Test harness exposing a ``feature_store`` and the
            materialization window (``start_date`` / ``end_date``).
        universal_data_sources: Tuple of (entities, datasets, data_sources)
            fixtures used to build the universal feature views.
        benchmark: pytest-benchmark fixture; runs the retrieval call.
    """
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    # Feature service combining a projected batch feature with an
    # on-demand feature view.
    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]], feature_views["driver_odfv"]
        ],
    )

    # Register all views, entities, and the service in one apply() call.
    feast_objects = []
    feast_objects.extend(feature_views.values())
    feast_objects.extend([driver(), customer(), location(), feature_service])
    fs.apply(feast_objects)
    fs.materialize(environment.start_date, environment.end_date)

    # Build 10 entity rows from randomly sampled driver/customer ids;
    # "val_to_add" is request data consumed by the on-demand view.
    sample_drivers = random.sample(entities["driver"], 10)
    sample_customers = random.sample(entities["customer"], 10)
    entity_rows = [{
        "driver": d,
        "customer_id": c,
        "val_to_add": 50
    } for (d, c) in zip(sample_drivers, sample_customers)]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
    ]
    # NOTE(review): unprefixed_feature_refs is computed and pruned below but
    # never read afterwards in this benchmark — likely copied from the
    # correctness test; confirm before removing.
    unprefixed_feature_refs = [
        f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f
    ]
    # Remove the on demand feature view output features, since they're not present in the source dataframe
    unprefixed_feature_refs.remove("conv_rate_plus_100")
    unprefixed_feature_refs.remove("conv_rate_plus_val_to_add")

    # Time only the online retrieval call itself.
    benchmark(
        fs.get_online_features,
        features=feature_refs,
        entity_rows=entity_rows,
    )
def test_read_pre_applied() -> None:
    """
    Read feature values from the FeatureStore using a FeatureService.
    """
    cli = CliRunner()
    with cli.local_repo(get_example_repo("example_feature_repo_1.py"), "bigquery") as store:
        # The example repo ships with exactly one pre-applied feature service.
        services = store.list_feature_services()
        assert len(services) == 1

        pre_applied = store.get_feature_service("driver_locations_service")
        assert len(pre_applied.tags) == 1
        assert pre_applied.tags["release"] == "production"

        # Register a second service built from a projection of an existing view.
        locations_view = store.get_feature_view("driver_locations")
        new_service = FeatureService(name="new_feature_service", features=[locations_view[["lon"]]])
        store.apply([new_service])

        # Both the old and the newly applied service are now retrievable.
        assert len(store.list_feature_services()) == 2
        store.get_feature_service("new_feature_service")
def test_online_retrieval(environment, universal_data_sources, full_feature_names):
    """End-to-end online retrieval correctness test.

    Applies the universal feature views plus two feature services (one with an
    on-demand view, one exercising entity-name/join-key remapping),
    materializes a padded date window, and compares online responses against
    values computed directly from the source dataframes. Also checks behavior
    for missing entities and missing request data.

    Args:
        environment: Harness exposing ``feature_store`` and the
            materialization window.
        universal_data_sources: (entities, datasets, data_sources) fixtures;
            ``datasets`` exposes the source dataframes as attributes.
        full_feature_names: Whether responses use ``view__feature`` keys.
    """
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    # Service mixing projected batch features with an on-demand view.
    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views.driver[["conv_rate"]],
            feature_views.driver_odfv,
            feature_views.customer[["current_balance"]],
        ],
    )
    # Service reusing the location view twice under different names, remapping
    # its join key to origin_id / destination_id.
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            feature_views.location.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}),
            feature_views.location.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}),
        ],
    )

    feast_objects = []
    feast_objects.extend(feature_views.values())
    feast_objects.extend([
        driver(),
        customer(),
        location(),
        feature_service,
        feature_service_entity_mapping,
    ])
    fs.apply(feast_objects)
    # Pad the window by one day on each side so boundary rows materialize.
    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )

    # Sample 10 order rows and restrict each source dataframe to the sampled
    # driver/customer ids so expected values can be recomputed locally.
    entity_sample = datasets.orders_df.sample(10)[[
        "customer_id", "driver_id", "order_id", "event_timestamp"
    ]]
    orders_df = datasets.orders_df[(
        datasets.orders_df["customer_id"].isin(entity_sample["customer_id"])
        & datasets.orders_df["driver_id"].isin(entity_sample["driver_id"]))]
    sample_drivers = entity_sample["driver_id"]
    drivers_df = datasets.driver_df[datasets.driver_df["driver_id"].isin(
        sample_drivers)]
    sample_customers = entity_sample["customer_id"]
    customers_df = datasets.customer_df[
        datasets.customer_df["customer_id"].isin(sample_customers)]
    # 10 random ordered (origin, destination) pairs for the entity-mapping
    # service; transposed so row 0 is origins and row 1 is destinations.
    location_pairs = np.array(
        list(itertools.permutations(entities.location_vals, 2)))
    sample_location_pairs = location_pairs[np.random.choice(
        len(location_pairs), 10)].T.tolist()
    origins_df = datasets.location_df[datasets.location_df["location_id"].isin(
        sample_location_pairs[0])]
    destinations_df = datasets.location_df[
        datasets.location_df["location_id"].isin(sample_location_pairs[1])]
    global_df = datasets.global_df

    # "val_to_add" is request data consumed by the on-demand view.
    entity_rows = [{
        "driver_id": d,
        "customer_id": c,
        "val_to_add": 50
    } for (d, c) in zip(sample_drivers, sample_customers)]
    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
    ]
    unprefixed_feature_refs = [
        f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f
    ]
    # Remove the on demand feature view output features, since they're not present in the source dataframe
    unprefixed_feature_refs.remove("conv_rate_plus_100")
    unprefixed_feature_refs.remove("conv_rate_plus_val_to_add")

    online_features_dict = get_online_features_dict(
        environment=environment,
        features=feature_refs,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )

    # Test that the on demand feature views compute properly even if the dependent conv_rate
    # feature isn't requested.
    online_features_no_conv_rate = get_online_features_dict(
        environment=environment,
        features=[
            ref for ref in feature_refs if ref != "driver_stats:conv_rate"
        ],
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )
    assert online_features_no_conv_rate is not None

    # Response keys must be exactly the requested features (prefixed or not,
    # depending on full_feature_names) plus the two entity-id keys.
    keys = set(online_features_dict.keys())
    expected_keys = set(
        f.replace(":", "__") if full_feature_names else f.split(":")[-1]
        for f in feature_refs) | {"customer_id", "driver_id"}
    assert (
        keys == expected_keys
    ), f"Response keys are different from expected: {keys - expected_keys} (extra) and {expected_keys - keys} (missing)"

    # Compare each response row against values recomputed from the dataframes.
    tc = unittest.TestCase()
    for i, entity_row in enumerate(entity_rows):
        df_features = get_latest_feature_values_from_dataframes(
            driver_df=drivers_df,
            customer_df=customers_df,
            orders_df=orders_df,
            global_df=global_df,
            entity_row=entity_row,
        )
        assert df_features["customer_id"] == online_features_dict[
            "customer_id"][i]
        assert df_features["driver_id"] == online_features_dict["driver_id"][i]
        # On-demand outputs: conv_rate + 100 and conv_rate + val_to_add.
        tc.assertAlmostEqual(
            online_features_dict[response_feature_name("conv_rate_plus_100",
                                                       feature_refs,
                                                       full_feature_names)][i],
            df_features["conv_rate"] + 100,
            delta=0.0001,
        )
        tc.assertAlmostEqual(
            online_features_dict[response_feature_name(
                "conv_rate_plus_val_to_add", feature_refs,
                full_feature_names)][i],
            df_features["conv_rate"] + df_features["val_to_add"],
            delta=0.0001,
        )
        for unprefixed_feature_ref in unprefixed_feature_refs:
            tc.assertAlmostEqual(
                df_features[unprefixed_feature_ref],
                online_features_dict[response_feature_name(
                    unprefixed_feature_ref, feature_refs,
                    full_feature_names)][i],
                delta=0.0001,
            )

    # Check what happens for missing values
    missing_responses_dict = get_online_features_dict(
        environment=environment,
        features=feature_refs,
        entity_rows=[{
            "driver_id": 0,
            "customer_id": 0,
            "val_to_add": 100
        }],
        full_feature_names=full_feature_names,
    )
    assert missing_responses_dict is not None
    # Global stats have no entity key, so they exist even for unknown entities.
    for unprefixed_feature_ref in unprefixed_feature_refs:
        if unprefixed_feature_ref not in {"num_rides", "avg_ride_length"}:
            tc.assertIsNone(missing_responses_dict[response_feature_name(
                unprefixed_feature_ref, feature_refs, full_feature_names)][0])

    # Check what happens for missing request data
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        get_online_features_dict(
            environment=environment,
            features=feature_refs,
            entity_rows=[{
                "driver_id": 0,
                "customer_id": 0
            }],
            full_feature_names=full_feature_names,
        )

    assert_feature_service_correctness(
        environment,
        feature_service,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        global_df,
    )

    # Re-run retrieval through the entity-mapping service using remapped keys.
    entity_rows = [{
        "origin_id": origin,
        "destination_id": destination
    } for (_driver, _customer, origin, destination
           ) in zip(sample_drivers, sample_customers, *sample_location_pairs)]
    assert_feature_service_entity_mapping_correctness(
        environment,
        feature_service_entity_mapping,
        entity_rows,
        full_feature_names,
        origins_df,
        destinations_df,
    )
)

# NOTE(review): the lone ")" above closes a definition that begins before
# this excerpt; this chunk is incomplete at its start.

# 25 feature views of 10 INT64 features each (250 features total), all backed
# by the same generated data source — a synthetic workload for benchmarking.
benchmark_feature_views = [
    FeatureView(
        name=f"feature_view_{i}",
        entities=["entity"],
        ttl=Duration(seconds=86400),
        features=[
            Feature(name=f"feature_{10 * i + j}", dtype=ValueType.INT64)
            for j in range(10)
        ],
        online=True,
        batch_source=generated_data_source,
    ) for i in range(25)
]

# Single service bundling every benchmark view.
# NOTE(review): the f-string prefix below is unnecessary (no placeholders).
benchmark_feature_service = FeatureService(
    name=f"benchmark_feature_service",
    features=benchmark_feature_views,
)

# Apply everything to the local repo and materialize up to now.
fs = FeatureStore(".")
fs.apply([
    driver_hourly_stats_view, driver, entity, benchmark_feature_service,
    *benchmark_feature_views
])

now = datetime.now()
fs.materialize(start, now)
print("Materialization finished")
# Daily-TTL online view of per-customer profile attributes.
customer_profile = FeatureView(
    name="customer_profile",
    entities=["customer"],
    ttl=timedelta(days=1),
    features=[
        Feature(name="avg_orders_day", dtype=ValueType.FLOAT),
        Feature(name="name", dtype=ValueType.STRING),
        Feature(name="age", dtype=ValueType.INT64),
    ],
    online=True,
    batch_source=customer_profile_source,
    tags={},
)

# Composite-entity view keyed on (customer, driver) pairs.
customer_driver_combined = FeatureView(
    name="customer_driver_combined",
    entities=["customer", "driver"],
    ttl=timedelta(days=1),
    features=[Feature(name="trips", dtype=ValueType.INT64)],
    online=True,
    batch_source=customer_driver_combined_source,
    tags={},
)

# Service exposing the driver_locations view (defined elsewhere in this file),
# tagged so tests can verify tag round-tripping.
all_drivers_feature_service = FeatureService(
    name="driver_locations_service",
    features=[driver_locations],
    tags={"release": "production"},
)
def test_online_retrieval(environment, universal_data_sources, full_feature_names):
    """End-to-end online retrieval correctness test (request-data variant).

    Like the other universal online retrieval tests, but additionally wires a
    request feature view (``driver_age_request_fv``) and asserts that
    full_feature_names prefixing is NOT applied to request features. Verifies
    online responses against values recomputed from the source dataframes,
    plus behavior for missing entities and missing request data.

    Args:
        environment: Harness exposing ``feature_store`` and the
            materialization window.
        universal_data_sources: (entities, datasets, data_sources) fixtures;
            ``datasets`` and ``entities`` are dict-keyed here.
        full_feature_names: Whether responses use ``view__feature`` keys.
    """
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    # Service combining a projected batch feature, an on-demand view, and a
    # request feature view.
    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]],
            feature_views["driver_odfv"],
            feature_views["driver_age_request_fv"],
        ],
    )
    # Service reusing the location view under two names with remapped join
    # keys (origin_id / destination_id).
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            feature_views["location"].with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}),
            feature_views["location"].with_name(
                "destination").with_join_key_map(
                    {"location_id": "destination_id"}),
        ],
    )

    feast_objects = []
    feast_objects.extend(feature_views.values())
    feast_objects.extend([
        driver(),
        customer(),
        location(),
        feature_service,
        feature_service_entity_mapping,
    ])
    fs.apply(feast_objects)
    # Pad the window by one day on each side so boundary rows materialize.
    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )

    # Sample 10 order rows and restrict each source dataframe to the sampled
    # ids so expected values can be recomputed locally.
    entity_sample = datasets["orders"].sample(10)[[
        "customer_id", "driver_id", "order_id", "event_timestamp"
    ]]
    orders_df = datasets["orders"][(
        datasets["orders"]["customer_id"].isin(entity_sample["customer_id"])
        & datasets["orders"]["driver_id"].isin(entity_sample["driver_id"]))]
    sample_drivers = entity_sample["driver_id"]
    drivers_df = datasets["driver"][datasets["driver"]["driver_id"].isin(
        sample_drivers)]
    sample_customers = entity_sample["customer_id"]
    customers_df = datasets["customer"][datasets["customer"]
                                        ["customer_id"].isin(sample_customers)]
    # 10 random ordered (origin, destination) pairs; transposed so row 0 is
    # origins and row 1 is destinations.
    location_pairs = np.array(
        list(itertools.permutations(entities["location"], 2)))
    sample_location_pairs = location_pairs[np.random.choice(
        len(location_pairs), 10)].T
    origins_df = datasets["location"][datasets["location"]["location_id"].isin(
        sample_location_pairs[0])]
    destinations_df = datasets["location"][
        datasets["location"]["location_id"].isin(sample_location_pairs[1])]
    global_df = datasets["global"]

    # "val_to_add" and "driver_age" are request data for the on-demand and
    # request feature views respectively.
    entity_rows = [{
        "driver": d,
        "customer_id": c,
        "val_to_add": 50,
        "driver_age": 25
    } for (d, c) in zip(sample_drivers, sample_customers)]
    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
        "driver_age:driver_age",
    ]
    unprefixed_feature_refs = [
        f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f
    ]
    # Remove the on demand feature view output features, since they're not present in the source dataframe
    unprefixed_feature_refs.remove("conv_rate_plus_100")
    unprefixed_feature_refs.remove("conv_rate_plus_val_to_add")

    online_features = fs.get_online_features(
        features=feature_refs,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )
    assert online_features is not None

    online_features_dict = online_features.to_dict()
    keys = online_features_dict.keys()
    assert (
        len(keys) == len(feature_refs) + 3
    )  # Add three for the driver id and the customer id entity keys + val_to_add request data.
    for feature in feature_refs:
        # full_feature_names does not apply to request feature views
        if full_feature_names and feature != "driver_age:driver_age":
            assert feature.replace(":", "__") in keys
        else:
            assert feature.rsplit(":", 1)[-1] in keys
    # Bare view names must never appear as response keys.
    assert ("driver_stats" not in keys and "customer_profile" not in keys
            and "order" not in keys and "global_stats" not in keys)

    # Compare each response row against values recomputed from the dataframes.
    tc = unittest.TestCase()
    for i, entity_row in enumerate(entity_rows):
        df_features = get_latest_feature_values_from_dataframes(
            driver_df=drivers_df,
            customer_df=customers_df,
            orders_df=orders_df,
            global_df=global_df,
            entity_row=entity_row,
        )
        assert df_features["customer_id"] == online_features_dict[
            "customer_id"][i]
        assert df_features["driver_id"] == online_features_dict["driver_id"][i]
        # On-demand outputs: conv_rate + 100 and conv_rate + val_to_add.
        tc.assertAlmostEqual(
            online_features_dict[response_feature_name("conv_rate_plus_100",
                                                       full_feature_names)][i],
            df_features["conv_rate"] + 100,
            delta=0.0001,
        )
        tc.assertAlmostEqual(
            online_features_dict[response_feature_name(
                "conv_rate_plus_val_to_add", full_feature_names)][i],
            df_features["conv_rate"] + df_features["val_to_add"],
            delta=0.0001,
        )
        for unprefixed_feature_ref in unprefixed_feature_refs:
            tc.assertAlmostEqual(
                df_features[unprefixed_feature_ref],
                online_features_dict[response_feature_name(
                    unprefixed_feature_ref, full_feature_names)][i],
                delta=0.0001,
            )

    # Check what happens for missing values
    missing_responses_dict = fs.get_online_features(
        features=feature_refs,
        entity_rows=[{
            "driver": 0,
            "customer_id": 0,
            "val_to_add": 100,
            "driver_age": 125
        }],
        full_feature_names=full_feature_names,
    ).to_dict()
    assert missing_responses_dict is not None
    # Global stats and the request feature still resolve for unknown entities.
    for unprefixed_feature_ref in unprefixed_feature_refs:
        if unprefixed_feature_ref not in {
                "num_rides", "avg_ride_length", "driver_age"
        }:
            tc.assertIsNone(missing_responses_dict[response_feature_name(
                unprefixed_feature_ref, full_feature_names)][0])

    # Check what happens for missing request data
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        fs.get_online_features(
            features=feature_refs,
            entity_rows=[{
                "driver": 0,
                "customer_id": 0
            }],
            full_feature_names=full_feature_names,
        ).to_dict()

    # Also with request data
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        fs.get_online_features(
            features=feature_refs,
            entity_rows=[{
                "driver": 0,
                "customer_id": 0,
                "val_to_add": 20
            }],
            full_feature_names=full_feature_names,
        ).to_dict()

    assert_feature_service_correctness(
        fs,
        feature_service,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        global_df,
    )

    # Re-run retrieval through the entity-mapping service using remapped keys.
    entity_rows = [{
        "driver": driver,
        "customer_id": customer,
        "origin_id": origin,
        "destination_id": destination,
    } for (driver, customer, origin, destination
           ) in zip(sample_drivers, sample_customers, *sample_location_pairs)]
    assert_feature_service_entity_mapping_correctness(
        fs,
        feature_service_entity_mapping,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        origins_df,
        destinations_df,
    )
def test_feature_service_with_description():
    """A description passed at construction time round-trips through to_proto()."""
    expected = "a clear description"
    service = FeatureService(
        name="my-feature-service",
        features=[],
        description=expected,
    )
    assert service.to_proto().spec.description == expected
def test_feature_service_without_description():
    """A FeatureService created without a description serializes an empty one.

    Protobuf string fields default to "", so spec.description must be the
    empty string when no description is provided.
    """
    feature_service = FeatureService(name="my-feature-service", features=[])
    # Previously this assertion was commented out, leaving the test vacuous.
    assert feature_service.to_proto().spec.description == ""
def test_online_retrieval(environment, universal_data_sources, full_feature_names):
    """End-to-end online retrieval correctness test (driver/customer only).

    Applies the universal feature views plus a feature service containing an
    on-demand view, materializes a padded date window, then verifies online
    responses against values recomputed from the source dataframes. Also
    checks behavior for missing entities and missing request data.

    Args:
        environment: Harness exposing ``feature_store`` and the
            materialization window.
        universal_data_sources: (entities, datasets, data_sources) fixtures;
            ``datasets`` is dict-keyed here.
        full_feature_names: Whether responses use ``view__feature`` keys.
    """
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    # Service mixing a projected batch feature with an on-demand view.
    feature_service = FeatureService(
        "convrate_plus100",
        features=[feature_views["driver"][["conv_rate"]], feature_views["driver_odfv"]],
    )

    feast_objects = []
    feast_objects.extend(feature_views.values())
    feast_objects.extend([driver(), customer(), feature_service])
    fs.apply(feast_objects)
    # Pad the window by one day on each side so boundary rows materialize.
    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )

    # Sample 10 order rows and restrict each source dataframe to the sampled
    # driver/customer ids so expected values can be recomputed locally.
    entity_sample = datasets["orders"].sample(10)[
        ["customer_id", "driver_id", "order_id", "event_timestamp"]
    ]
    orders_df = datasets["orders"][
        (
            datasets["orders"]["customer_id"].isin(entity_sample["customer_id"])
            & datasets["orders"]["driver_id"].isin(entity_sample["driver_id"])
        )
    ]
    sample_drivers = entity_sample["driver_id"]
    drivers_df = datasets["driver"][
        datasets["driver"]["driver_id"].isin(sample_drivers)
    ]
    sample_customers = entity_sample["customer_id"]
    customers_df = datasets["customer"][
        datasets["customer"]["customer_id"].isin(sample_customers)
    ]
    global_df = datasets["global"]

    # "val_to_add" is request data consumed by the on-demand view.
    entity_rows = [
        {"driver": d, "customer_id": c, "val_to_add": 50}
        for (d, c) in zip(sample_drivers, sample_customers)
    ]
    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
    ]
    unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f]
    # Remove the on demand feature view output features, since they're not present in the source dataframe
    unprefixed_feature_refs.remove("conv_rate_plus_100")
    unprefixed_feature_refs.remove("conv_rate_plus_val_to_add")

    online_features = fs.get_online_features(
        features=feature_refs,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )
    assert online_features is not None

    online_features_dict = online_features.to_dict()
    keys = online_features_dict.keys()
    assert (
        len(keys) == len(feature_refs) + 3
    )  # Add three for the driver id and the customer id entity keys + val_to_add request data.
    for feature in feature_refs:
        if full_feature_names:
            assert feature.replace(":", "__") in keys
        else:
            assert feature.rsplit(":", 1)[-1] in keys
    # Bare view names must never appear as response keys.
    assert (
        "driver_stats" not in keys
        and "customer_profile" not in keys
        and "order" not in keys
        and "global_stats" not in keys
    )

    # Compare each response row against values recomputed from the dataframes.
    tc = unittest.TestCase()
    for i, entity_row in enumerate(entity_rows):
        df_features = get_latest_feature_values_from_dataframes(
            drivers_df, customers_df, orders_df, global_df, entity_row
        )
        assert df_features["customer_id"] == online_features_dict["customer_id"][i]
        assert df_features["driver_id"] == online_features_dict["driver_id"][i]
        # On-demand outputs: conv_rate + 100 and conv_rate + val_to_add.
        assert (
            online_features_dict[
                response_feature_name("conv_rate_plus_100", full_feature_names)
            ][i]
            == df_features["conv_rate"] + 100
        )
        assert (
            online_features_dict[
                response_feature_name("conv_rate_plus_val_to_add", full_feature_names)
            ][i]
            == df_features["conv_rate"] + df_features["val_to_add"]
        )
        for unprefixed_feature_ref in unprefixed_feature_refs:
            tc.assertEqual(
                df_features[unprefixed_feature_ref],
                online_features_dict[
                    response_feature_name(unprefixed_feature_ref, full_feature_names)
                ][i],
            )

    # Check what happens for missing values
    missing_responses_dict = fs.get_online_features(
        features=feature_refs,
        entity_rows=[{"driver": 0, "customer_id": 0, "val_to_add": 100}],
        full_feature_names=full_feature_names,
    ).to_dict()
    assert missing_responses_dict is not None
    # Global stats have no entity key, so they exist even for unknown entities.
    for unprefixed_feature_ref in unprefixed_feature_refs:
        if unprefixed_feature_ref not in {"num_rides", "avg_ride_length"}:
            tc.assertIsNone(
                missing_responses_dict[
                    response_feature_name(unprefixed_feature_ref, full_feature_names)
                ][0]
            )

    # Check what happens for missing request data
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        fs.get_online_features(
            features=feature_refs,
            entity_rows=[{"driver": 0, "customer_id": 0}],
            full_feature_names=full_feature_names,
        ).to_dict()

    assert_feature_service_correctness(
        fs,
        feature_service,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        global_df,
    )
def feature_service(name: str, views) -> FeatureService:
    """Build a FeatureService called *name* over the given feature *views*."""
    service = FeatureService(name, views)
    return service
# NOTE(review): the assignment and `return` below are the tail of a
# transformation function whose `def` begins before this excerpt.
    # Derived boolean: was this transaction larger than the credit card due?
    df["transaction_gt_last_credit_card_due"] = (
        inputs["transaction_amt"] > inputs["credit_card_due"]
    )
    return df


# Define request feature view
transaction_request_fv = RequestFeatureView(
    name="transaction_request_fv",
    request_data_source=input_request,
)

# v1 of the credit scoring service: batch features only.
model_v1 = FeatureService(
    name="credit_score_v1",
    features=[
        credit_history[["mortgage_due", "credit_card_due", "missed_payments_1y"]],
        zipcode_features,
    ],
    tags={"owner": "*****@*****.**", "stage": "staging"},
    description="Credit scoring model",
)

# v2 adds the request feature view on top of v1's features.
model_v2 = FeatureService(
    name="credit_score_v2",
    features=[
        credit_history[["mortgage_due", "credit_card_due", "missed_payments_1y"]],
        zipcode_features,
        transaction_request_fv,
    ],
    tags={"owner": "*****@*****.**", "stage": "prod"},
    description="Credit scoring model",
)
# NOTE(review): the keyword arguments below are the tail of a data-source
# definition that begins before this excerpt.
    event_timestamp_column="event_timestamp",
    created_timestamp_column="created",
)

# Define an entity for the driver. You can think of entity as a primary key used to
# fetch features.
driver = Entity(
    name="driver_id",
    value_type=ValueType.INT64,
    description="driver id",
)

# Our parquet files contain sample data that includes a driver_id column, timestamps and
# three feature column. Here we define a Feature View that will allow us to serve this
# data to our model online.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    # 10-year TTL so all sample rows stay within range.
    ttl=Duration(seconds=86400 * 365 * 10),
    features=[
        Feature(name="conv_rate", dtype=ValueType.FLOAT),
        Feature(name="acc_rate", dtype=ValueType.FLOAT),
        Feature(name="avg_daily_trips", dtype=ValueType.INT64),
    ],
    online=True,
    batch_source=driver_hourly_stats,
    tags={},
)

# Service wrapping the hourly stats view for retrieval tests.
driver_stats_fs = FeatureService(name="test_service", features=[driver_hourly_stats_view])