def test_historical_features_with_missing_request_data(
    environment, universal_data_sources, full_feature_names
):
    """Expect an error when request data for an on demand transform is absent.

    Registers the universal entities and feature views, then requests
    historical features for ``datasets.entity_df`` and expects
    ``RequestDataNotFoundInEntityDfException`` to be raised.
    """
    _, datasets, data_sources = universal_data_sources
    feature_store = environment.feature_store
    universal_views = construct_universal_feature_views(data_sources)
    feature_store.apply(
        [driver(), customer(), location(), *universal_views.values()]
    )

    requested_features = [
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
        "field_mapping:feature_name",
    ]

    # If request data is missing that's needed for on demand transform, throw an error
    with pytest.raises(RequestDataNotFoundInEntityDfException):
        feature_store.get_historical_features(
            entity_df=datasets.entity_df,
            features=requested_features,
            full_feature_names=full_feature_names,
        )
def test_push_features_and_read(environment, universal_data_sources):
    """Push a single row via a push source and read it back online."""
    feature_store = environment.feature_store
    _, datasets, data_sources = universal_data_sources
    universal_views = construct_universal_feature_views(data_sources)
    feature_store.apply(
        [driver(), customer(), location(), *universal_views.values()]
    )

    def _utc_now_ms():
        # Current UTC time, rounded to millisecond precision.
        return pd.Timestamp(datetime.datetime.utcnow()).round("ms")

    pushed_rows = pd.DataFrame(
        {
            "location_id": [1],
            "temperature": [4],
            "event_timestamp": [_utc_now_ms()],
            "created": [_utc_now_ms()],
        }
    )
    feature_store.push("location_stats_push_source", pushed_rows)

    response = feature_store.get_online_features(
        features=["pushable_location_stats:temperature"],
        entity_rows=[{"location_id": 1}],
    )
    response_dict = response.to_dict()

    assert response_dict["location_id"] == [1]
    assert response_dict["temperature"] == [4]
def test_feature_get_online_features_types_match(online_types_test_fixtures):
    """Check the Python type of the first online value against the config.

    For list features the first value must be a ``list`` and its first
    element must have the expected scalar type; otherwise the first value
    itself must have the expected scalar type.
    """
    environment, config, data_source, _ = online_types_test_fixtures
    feature_view = create_feature_view(
        config.feature_dtype, config.feature_is_list, data_source
    )
    store = environment.feature_store
    feature_refs = [feature_view.name + ":value"]
    driver_entity = driver(value_type=ValueType.UNKNOWN)
    store.apply([feature_view, driver_entity])
    store.materialize(environment.start_date, environment.end_date)

    if config.entity_type == ValueType.STRING:
        driver_id_value = "1"
    else:
        driver_id_value = 1

    response = store.get_online_features(
        features=feature_refs,
        entity_rows=[{"driver": driver_id_value}],
    )
    online_features = response.to_dict()

    expected_type_name_by_dtype = {
        "int32": "int",
        "int64": "int",
        "float": "float",
        "string": "str",
        "bool": "bool",
    }

    first_value = online_features["value"][0]
    if config.feature_is_list:
        assert type(first_value).__name__ == "list"
        sample = first_value[0]
    else:
        sample = first_value
    # Exact type-name comparison (not isinstance), so e.g. a bool is not
    # accepted where an int is expected.
    assert type(sample).__name__ == expected_type_name_by_dtype[config.feature_dtype]
def test_feature_get_online_features_types_match(online_types_test_fixtures):
    """Check that every online value is an instance of the configured dtype.

    List features must come back as ``list`` objects whose elements are all
    instances of the expected Python type.
    """
    environment, config, data_source, _ = online_types_test_fixtures
    feature_view = create_feature_view(
        config.feature_dtype,
        config.feature_is_list,
        config.has_empty_list,
        data_source,
    )
    store = environment.feature_store
    feature_refs = [feature_view.name + ":value"]
    driver_entity = driver(value_type=ValueType.UNKNOWN)
    store.apply([feature_view, driver_entity])
    store.materialize(environment.start_date, environment.end_date)

    if config.entity_type == ValueType.STRING:
        driver_id_value = "1"
    else:
        driver_id_value = 1

    online_features = store.get_online_features(
        features=feature_refs,
        entity_rows=[{"driver": driver_id_value}],
    ).to_dict()

    python_type_by_dtype = {
        "int32": int,
        "int64": int,
        "float": float,
        "string": str,
        "bool": bool,
    }
    expected_dtype = python_type_by_dtype[config.feature_dtype]

    for value in online_features["value"]:
        if config.feature_is_list:
            assert isinstance(value, list)
            scalars = value
        else:
            scalars = [value]
        for scalar in scalars:
            assert isinstance(scalar, expected_dtype)
def test_registration_and_retrieval_from_custom_s3_endpoint(universal_data_sources):
    """Apply, materialize and read online features using the S3 file data source creator.

    The test temporarily sets placeholder AWS credentials in ``os.environ``.
    They are removed in a ``finally`` block so that a failing assertion or an
    error inside the environment does not leak them into later tests.

    Raises:
        Exception: if ``AWS_ACCESS_KEY_ID`` is already set in the environment.
    """
    import os

    config = IntegrationTestRepoConfig(
        offline_store_creator="tests.integration.feature_repos.universal.data_sources.file.S3FileDataSourceCreator"
    )

    if "AWS_ACCESS_KEY_ID" in os.environ:
        raise Exception(
            "AWS_ACCESS_KEY_ID has already been set in the environment. Setting it again may cause a conflict. "
            "It may be better to deduplicate AWS configuration or use sub-processes for isolation"
        )

    # Only the access key id is guarded above; remember any pre-existing secret
    # so it can be put back instead of being silently deleted.
    previous_secret = os.environ.get("AWS_SECRET_ACCESS_KEY")

    os.environ["AWS_ACCESS_KEY_ID"] = "AKIAIOSFODNN7EXAMPLE"
    os.environ["AWS_SECRET_ACCESS_KEY"] = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"

    try:
        with construct_test_environment(config) as environment:
            fs = environment.feature_store

            entities, datasets, data_sources = universal_data_sources
            feature_views = construct_universal_feature_views(data_sources)

            feast_objects = []
            feast_objects.extend(feature_views.values())
            feast_objects.extend([driver(), customer()])
            fs.apply(feast_objects)
            fs.materialize(environment.start_date, environment.end_date)

            out = fs.get_online_features(
                features=["driver_stats:conv_rate"], entity_rows=[{"driver": 5001}]
            ).to_dict()
            assert out["conv_rate"][0] is not None
    finally:
        # Always undo the environment changes, even when the test body fails.
        os.environ.pop("AWS_ACCESS_KEY_ID", None)
        if previous_secret is None:
            os.environ.pop("AWS_SECRET_ACCESS_KEY", None)
        else:
            os.environ["AWS_SECRET_ACCESS_KEY"] = previous_secret
def test_historical_retrieval_with_validation(environment, universal_data_sources):
    """Validate a retrieval job against a saved dataset built from an identical job.

    The test passes as long as ``to_df`` with the validation reference does
    not raise.
    """
    store = environment.feature_store
    _entities, datasets, data_sources = universal_data_sources
    universal_views = construct_universal_feature_views(data_sources)
    store.apply([driver(), customer(), location(), *universal_views.values()])

    # Create two identical retrieval jobs
    entity_df = datasets.entity_df.drop(
        columns=["order_id", "origin_id", "destination_id"]
    )
    retrieval_kwargs = {"entity_df": entity_df, "features": _features}
    reference_job = store.get_historical_features(**retrieval_kwargs)
    job = store.get_historical_features(**retrieval_kwargs)

    # Save dataset using reference job and retrieve it
    destination = (
        environment.data_source_creator.create_saved_dataset_destination()
    )
    store.create_saved_dataset(
        from_=reference_job,
        name="my_training_dataset",
        storage=destination,
    )
    saved_dataset = store.get_saved_dataset("my_training_dataset")

    # If validation pass there will be no exceptions on this point
    validation_reference = saved_dataset.as_reference(profiler=configurable_profiler)
    job.to_df(validation_reference=validation_reference)
def test_online_retrieval_with_event_timestamps(
    environment, universal_data_sources, full_feature_names
):
    """Write two rows online and check the per-feature timestamp columns.

    After ingesting two driver rows with known epoch timestamps, the response
    dataframe (requested with ``to_df(True)``) must expose, for each feature,
    a ``<feature> + TIMESTAMP_POSTFIX`` column equal to those epoch seconds.
    """
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    universal_views = construct_universal_feature_views(data_sources)
    fs.apply([driver(), universal_views.driver, universal_views.global_fv])

    driver_ids = (1, 2)
    event_epochs = (1646263500, 1646263600)

    # fake data to ingest into Online Store
    df_ingest = pd.DataFrame(
        {
            "driver_id": [1, 2],
            "conv_rate": [0.5, 0.3],
            "acc_rate": [0.6, 0.4],
            "avg_daily_trips": [4, 5],
            "event_timestamp": [
                pd.to_datetime(epoch, utc=True, unit="s") for epoch in event_epochs
            ],
            "created": [pd.to_datetime(epoch, unit="s") for epoch in event_epochs],
        }
    )

    # directly ingest data into the Online Store
    fs.write_to_online_store("driver_stats", df_ingest)

    response = fs.get_online_features(
        features=[
            "driver_stats:avg_daily_trips",
            "driver_stats:acc_rate",
            "driver_stats:conv_rate",
        ],
        entity_rows=[{"driver_id": 1}, {"driver_id": 2}],
    )
    df = response.to_df(True)

    assertpy.assert_that(len(df)).is_equal_to(2)
    for row, expected_driver_id in enumerate(driver_ids):
        assertpy.assert_that(df["driver_id"].iloc[row]).is_equal_to(expected_driver_id)

    for feature_name in ("avg_daily_trips", "acc_rate", "conv_rate"):
        for row, expected_epoch in enumerate(event_epochs):
            assertpy.assert_that(
                df[feature_name + TIMESTAMP_POSTFIX].iloc[row]
            ).is_equal_to(expected_epoch)
def test_e2e_consistency(environment, e2e_data_sources, infer_features):
    """Apply a driver feature view and run the offline/online consistency check."""
    _, data_source = e2e_data_sources
    feature_store = environment.feature_store

    feature_view = driver_feature_view(
        data_source=data_source,
        infer_features=infer_features,
    )
    driver_entity = driver()
    feature_store.apply([feature_view, driver_entity])

    run_offline_online_store_consistency_test(feature_store, feature_view)
def test_online_retrieval(environment, universal_data_sources, benchmark):
    """Benchmark ``get_online_features`` over the universal feature views.

    Applies the universal feature views, the entities and a feature service,
    materializes the environment's date range, and hands the online lookup for
    ten sampled driver/customer pairs to the ``benchmark`` fixture.
    """
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]],
            feature_views["driver_odfv"],
        ],
    )

    feast_objects = []
    feast_objects.extend(feature_views.values())
    feast_objects.extend([driver(), customer(), location(), feature_service])
    fs.apply(feast_objects)
    fs.materialize(environment.start_date, environment.end_date)

    sample_drivers = random.sample(entities["driver"], 10)
    sample_customers = random.sample(entities["customer"], 10)

    # "val_to_add" is passed in every entity row alongside the entity keys.
    entity_rows = [
        {"driver": d, "customer_id": c, "val_to_add": 50}
        for (d, c) in zip(sample_drivers, sample_customers)
    ]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
    ]

    # The benchmark only times the call; no response values are compared here.
    benchmark(
        fs.get_online_features,
        features=feature_refs,
        entity_rows=entity_rows,
    )
def test_feature_get_historical_features_types_match(offline_types_test_fixtures):
    """
    Check dataframe and arrow types returned by get_historical_features.

    Note: to make sure this test works, we need to ensure that
    get_historical_features returns at least one non-null row to make sure
    type inferral works. This can only be achieved by carefully matching
    entity_df to the data fixtures.
    """
    environment, config, data_source, _ = offline_types_test_fixtures
    fs = environment.feature_store
    driver_entity = driver()
    feature_view = create_feature_view(
        "get_historical_features_types_match",
        config.feature_dtype,
        config.feature_is_list,
        config.has_empty_list,
        data_source,
    )
    fs.apply([feature_view, driver_entity])

    if config.entity_type == ValueType.STRING:
        driver_ids = ["1", "3"]
    else:
        driver_ids = [1, 3]

    entity_df = pd.DataFrame()
    entity_df["driver_id"] = driver_ids
    now_ms = pd.Timestamp(datetime.utcnow()).round("ms")
    entity_df["ts"] = [now_ms - timedelta(hours=offset) for offset in (4, 2)]

    feature_refs = [f"{feature_view.name}:value"]
    historical_features = fs.get_historical_features(
        entity_df=entity_df,
        features=feature_refs,
    )

    # Note: Pandas doesn't play well with nan values in ints. BQ will also coerce to floats if there are NaNs
    historical_features_df = historical_features.to_df()
    print(historical_features_df)

    provider = environment.test_repo_config.provider
    if config.feature_is_list:
        assert_feature_list_types(
            provider,
            config.feature_dtype,
            historical_features_df,
        )
    else:
        assert_expected_historical_feature_types(
            config.feature_dtype, historical_features_df
        )

    assert_expected_arrow_types(
        environment.test_repo_config.provider,
        config.feature_dtype,
        config.feature_is_list,
        historical_features,
    )
def test_feature_get_online_features_types_match(online_types_test_fixtures):
    """Check that the single online value returned matches the configured dtype.

    Exactly one value is expected. List features must be ``list`` objects
    (non-empty unless ``config.has_empty_list`` is set) whose elements are
    instances of the expected Python type.
    """
    environment, config, data_source, _ = online_types_test_fixtures
    feature_view = create_feature_view(
        "get_online_features_types_match",
        config.feature_dtype,
        config.feature_is_list,
        config.has_empty_list,
        data_source,
    )
    store = environment.feature_store
    feature_refs = [feature_view.name + ":value"]
    driver_entity = driver(value_type=config.entity_type)
    store.apply([feature_view, driver_entity])

    materialize_from = environment.start_date
    # throwing out last record to make sure
    # we can successfully infer type even from all empty values
    materialize_until = environment.end_date - timedelta(hours=1)
    store.materialize(materialize_from, materialize_until)

    if config.entity_type == ValueType.STRING:
        driver_id_value = "1"
    else:
        driver_id_value = 1

    online_features = store.get_online_features(
        features=feature_refs,
        entity_rows=[{"driver_id": driver_id_value}],
    ).to_dict()

    python_type_by_dtype = {
        "int32": int,
        "int64": int,
        "float": float,
        "string": str,
        "bool": bool,
        "datetime": datetime,
    }
    expected_dtype = python_type_by_dtype[config.feature_dtype]

    assert len(online_features["value"]) == 1

    for value in online_features["value"]:
        if not config.feature_is_list:
            assert isinstance(value, expected_dtype)
            continue
        assert isinstance(value, list), "Feature value should be a list"
        assert (
            config.has_empty_list or len(value) > 0
        ), "List of values should not be empty"
        for element in value:
            assert isinstance(element, expected_dtype)
def test_feature_get_historical_features_types_match(offline_types_test_fixtures):
    """Check dataframe and arrow types returned by ``get_historical_features``.

    The entity dataframe holds two driver ids (strings or ints depending on
    ``config.entity_type``) with timestamps four and two hours before now.
    """
    environment, config, data_source, _ = offline_types_test_fixtures
    fs = environment.feature_store
    feature_view = create_feature_view(
        "get_historical_features_types_match",
        config.feature_dtype,
        config.feature_is_list,
        config.has_empty_list,
        data_source,
    )
    driver_entity = driver()
    fs.apply([feature_view, driver_entity])

    feature_refs = [f"{feature_view.name}:value"]

    entity_df = pd.DataFrame()
    string_keys = config.entity_type == ValueType.STRING
    entity_df["driver_id"] = ["1", "3"] if string_keys else [1, 3]

    current_time = datetime.utcnow()
    rounded_now = pd.Timestamp(current_time).round("ms")
    four_hours_ago = rounded_now - timedelta(hours=4)
    two_hours_ago = rounded_now - timedelta(hours=2)
    entity_df["ts"] = [four_hours_ago, two_hours_ago]

    historical_features = fs.get_historical_features(
        entity_df=entity_df,
        features=feature_refs,
    )

    # Note: Pandas doesn't play well with nan values in ints. BQ will also coerce to floats if there are NaNs
    historical_features_df = historical_features.to_df()
    print(historical_features_df)

    if not config.feature_is_list:
        assert_expected_historical_feature_types(
            config.feature_dtype, historical_features_df
        )
    else:
        assert_feature_list_types(
            environment.test_repo_config.provider,
            config.feature_dtype,
            historical_features_df,
        )

    assert_expected_arrow_types(
        environment.test_repo_config.provider,
        config.feature_dtype,
        config.feature_is_list,
        historical_features,
    )
def test_e2e_consistency(environment, e2e_data_sources, infer_features):
    """Run the offline/online consistency check with a split materialization.

    The split point is one second before the ``ts_1`` value at label 4 of the
    generated dataframe.
    """
    df, data_source = e2e_data_sources
    feature_store = environment.feature_store

    name_suffix = "with_inference" if infer_features else ""
    feature_view = driver_feature_view(
        name=f"test_consistency_{name_suffix}",
        data_source=data_source,
        infer_features=infer_features,
    )
    driver_entity = driver()
    feature_store.apply([feature_view, driver_entity])

    # materialization is run in two steps and
    # we use timestamp from generated dataframe as a split point
    split_row_timestamp = df["ts_1"][4].to_pydatetime()
    split_dt = split_row_timestamp - timedelta(seconds=1)

    run_offline_online_store_consistency_test(feature_store, feature_view, split_dt)
def test_infer_odfv_features_with_error(environment, universal_data_sources):
    """Expect ``SpecifiedFeaturesNotPresentError`` when applying the ODFV.

    The on demand feature view is declared with a single field named
    ``conv_rate_plus_200``; ``store.apply`` is expected to reject it.
    """
    _entities, _datasets, data_sources = universal_data_sources
    store = environment.feature_store

    declared_fields = [Field(name="conv_rate_plus_200", dtype=Float64)]
    hourly_stats_view = create_driver_hourly_stats_batch_feature_view(
        data_sources.driver
    )
    conv_rate_request = create_conv_rate_request_source()
    on_demand_view = conv_rate_plus_100_feature_view(
        [hourly_stats_view, conv_rate_request],
        features=declared_fields,
    )

    objects_to_apply = [hourly_stats_view, on_demand_view, driver(), customer()]

    with pytest.raises(SpecifiedFeaturesNotPresentError):
        store.apply(objects_to_apply)
def test_infer_odfv_features_with_error(environment, universal_data_sources):
    """Expect ``SpecifiedFeaturesNotPresentError`` when applying the ODFV.

    The on demand feature view is declared with a single ``conv_rate_plus_200``
    feature; ``store.apply`` is expected to reject it.
    """
    _entities, _datasets, data_sources = universal_data_sources
    store = environment.feature_store

    declared_features = [Feature("conv_rate_plus_200", ValueType.DOUBLE)]
    hourly_stats_view = create_driver_hourly_stats_feature_view(
        data_sources["driver"]
    )
    conv_rate_request = create_conv_rate_request_data_source()
    odfv_inputs = {
        "driver": hourly_stats_view,
        "input_request": conv_rate_request,
    }
    on_demand_view = conv_rate_plus_100_feature_view(
        odfv_inputs,
        features=declared_features,
    )

    objects_to_apply = [hourly_stats_view, on_demand_view, driver(), customer()]

    with pytest.raises(SpecifiedFeaturesNotPresentError):
        store.apply(objects_to_apply)
def test_infer_odfv_features(environment, universal_data_sources, infer_features):
    """Apply the ``conv_rate_plus_100`` ODFV and check it ends up with two features."""
    _entities, _datasets, data_sources = universal_data_sources
    store = environment.feature_store

    hourly_stats_view = create_driver_hourly_stats_feature_view(
        data_sources["driver"]
    )
    conv_rate_request = create_conv_rate_request_data_source()
    odfv_inputs = {
        "driver": hourly_stats_view,
        "input_request": conv_rate_request,
    }
    on_demand_view = conv_rate_plus_100_feature_view(
        odfv_inputs,
        infer_features=infer_features,
    )

    store.apply([hourly_stats_view, on_demand_view, driver(), customer()])

    registered_odfv = store.get_on_demand_feature_view("conv_rate_plus_100")
    assert len(registered_odfv.features) == 2
def setup_python_fs_client():
    """Yield a ``TestClient`` for the app built on a populated feature store.

    Builds a test environment, applies the universal feature views and
    entities, materializes the environment's date range and yields a client
    wrapping ``get_app(fs)``. On exit, both the feature store and the data
    source creator are torn down.

    Yields:
        The ``TestClient`` instance.
    """
    config = IntegrationTestRepoConfig()
    environment = construct_test_environment(config)
    fs = environment.feature_store
    try:
        entities, datasets, data_sources = construct_universal_test_data(
            environment
        )
        feature_views = construct_universal_feature_views(data_sources)
        feast_objects: List[FeastObject] = []
        feast_objects.extend(feature_views.values())
        feast_objects.extend([driver(), customer(), location()])
        fs.apply(feast_objects)
        fs.materialize(environment.start_date, environment.end_date)
        client = TestClient(get_app(fs))
        yield client
    finally:
        # Nested so the data source cleanup still runs if the feature store
        # teardown raises; the original teardown order is preserved.
        try:
            fs.teardown()
        finally:
            environment.data_source_creator.teardown()
def test_historical_retrieval_fails_on_validation(environment, universal_data_sources):
    """Expect ``ValidationFailed`` with two specific failed expectations.

    A dataset is saved from a reference job; an identical job is then
    validated against it using ``profiler_with_unrealistic_expectations``.
    """
    store = environment.feature_store
    _entities, datasets, data_sources = universal_data_sources
    universal_views = construct_universal_feature_views(data_sources)
    store.apply([driver(), customer(), location(), *universal_views.values()])

    entity_df = datasets.entity_df.drop(
        columns=["order_id", "origin_id", "destination_id"]
    )

    reference_job = store.get_historical_features(
        entity_df=entity_df,
        features=_features,
    )
    destination = (
        environment.data_source_creator.create_saved_dataset_destination()
    )
    store.create_saved_dataset(
        from_=reference_job,
        name="my_other_dataset",
        storage=destination,
    )

    job = store.get_historical_features(
        entity_df=entity_df,
        features=_features,
    )

    with pytest.raises(ValidationFailed) as exc_info:
        saved_dataset = store.get_saved_dataset("my_other_dataset")
        validation_reference = saved_dataset.as_reference(
            profiler=profiler_with_unrealistic_expectations
        )
        job.to_df(validation_reference=validation_reference)

    failed_expectations = exc_info.value.report.errors
    assert len(failed_expectations) == 2

    expected_failures = (
        ("expect_column_max_to_be_between", "current_balance"),
        ("expect_column_values_to_be_in_set", "avg_passenger_count"),
    )
    for position, (check_name, column_name) in enumerate(expected_failures):
        assert failed_expectations[position].check_name == check_name
        assert failed_expectations[position].column_name == column_name
def test_entity_inference_types_match(offline_types_test_fixtures):
    """Check the value type of entities registered with an UNKNOWN value type.

    Every entity listed after ``apply`` must carry the value type that the
    mapping below associates with ``config.entity_type``.
    """
    environment, config, data_source, fv = offline_types_test_fixtures
    store = environment.feature_store

    # Don't specify value type in entity to force inference
    untyped_driver = driver(value_type=ValueType.UNKNOWN)
    store.apply([fv, untyped_driver])

    expected_inferred_type = {
        ValueType.INT32: ValueType.INT64,
        ValueType.INT64: ValueType.INT64,
        ValueType.FLOAT: ValueType.DOUBLE,
        ValueType.STRING: ValueType.STRING,
    }

    for registered_entity in store.list_entities():
        assert registered_entity.value_type == expected_inferred_type[config.entity_type]
def test_write_to_online_store(environment, universal_data_sources):
    """Write one driver row straight to the online store and read it back."""
    fs = environment.feature_store
    _entities, _datasets, data_sources = universal_data_sources
    hourly_stats_view = create_driver_hourly_stats_feature_view(data_sources.driver)
    driver_entity = driver()

    # Register Feature View and Entity
    fs.apply([hourly_stats_view, driver_entity])

    def _utc_now_ms():
        # Current UTC time, rounded to millisecond precision.
        return pd.Timestamp(datetime.datetime.utcnow()).round("ms")

    # fake data to ingest into Online Store
    df_data = pd.DataFrame(
        {
            "driver_id": [123],
            "conv_rate": [0.85],
            "acc_rate": [0.91],
            "avg_daily_trips": [14],
            "event_timestamp": [_utc_now_ms()],
            "created": [_utc_now_ms()],
        }
    )

    # directly ingest data into the Online Store
    fs.write_to_online_store("driver_stats", df_data)

    # assert the right data is in the Online Store
    response = fs.get_online_features(
        features=[
            "driver_stats:avg_daily_trips",
            "driver_stats:acc_rate",
            "driver_stats:conv_rate",
        ],
        entity_rows=[{"driver_id": 123}],
    )
    df = response.to_df()

    assertpy.assert_that(df["avg_daily_trips"].iloc[0]).is_equal_to(14)
    assertpy.assert_that(df["acc_rate"].iloc[0]).is_close_to(0.91, 1e-6)
    assertpy.assert_that(df["conv_rate"].iloc[0]).is_close_to(0.85, 1e-6)
def construct_test_environment(
    test_repo_config: TestRepoConfig,
    create_and_apply: bool = False,
    materialize: bool = False,
) -> Environment:
    """
    Create a feature repo from the test repo config and yield an Environment for it.

    This is a generator function: it yields the constructed environment once
    and tears down both the offline data source creator and the feature store
    when the generator is resumed or closed. The user is *not* expected to
    perform any clean up actions.
    NOTE(review): presumably wrapped with ``contextlib.contextmanager`` by a
    decorator outside this view; confirm at the definition site.

    :param test_repo_config: configuration
    :param create_and_apply: when true, apply the driver and customer entities
        together with the environment's driver stats and customer feature views
    :param materialize: when true, materialize from ``environment.start_date``
        to ``environment.end_date`` before yielding
    :return: The environment (including its feature store) built using the
        supplied configuration.
    """
    df = create_dataset()

    # Short random suffix so that each constructed environment gets its own project.
    project = f"test_correctness_{uuid.uuid4().hex[:8]}"

    module_name, config_class_name = test_repo_config.offline_store_creator.rsplit(
        ".", 1
    )

    offline_creator: DataSourceCreator = importer.get_class_from_type(
        module_name, config_class_name, "DataSourceCreator"
    )(project)
    ds = offline_creator.create_data_source(
        project, df, field_mapping={"ts_1": "ts", "id": "driver_id"}
    )
    offline_store = offline_creator.create_offline_store_config()
    online_store = test_repo_config.online_store

    with tempfile.TemporaryDirectory() as repo_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=project,
            provider=test_repo_config.provider,
            offline_store=offline_store,
            online_store=online_store,
            repo_path=repo_dir_name,
        )
        fs = FeatureStore(config=config)
        environment = Environment(
            name=project,
            test_repo_config=test_repo_config,
            feature_store=fs,
            data_source=ds,
            data_source_creator=offline_creator,
        )

        fvs = []
        entities = []
        try:
            if create_and_apply:
                entities.extend([driver(), customer()])
                fvs.extend(
                    [
                        environment.driver_stats_feature_view(),
                        environment.customer_feature_view(),
                    ]
                )
                fs.apply(fvs + entities)

            if materialize:
                fs.materialize(environment.start_date, environment.end_date)

            yield environment
        finally:
            # Nested so the feature store is still torn down if the offline
            # creator teardown raises; the original teardown order is kept.
            try:
                offline_creator.teardown()
            finally:
                fs.teardown()
def test_online_store_cleanup(environment, universal_data_sources):
    """
    Some online store implementations (like Redis) keep features from
    different feature views that share entities together. Deleting one
    feature view could therefore end up deleting all features attached to the
    entity (see https://github.com/feast-dev/feast/issues/2150).

    Plan:
        1. Register two feature views with common entity "driver"
        2. Materialize data
        3. Check that features are available (via online retrieval)
        4. Delete one feature view
        5. Check that features of the other one are still available
        6. Delete the other feature view (and create it again)
        7. Verify that features for both feature views were deleted
    """
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    driver_stats_fv = construct_universal_feature_views(data_sources).driver

    driver_entities = entities.driver_vals
    row_count = len(driver_entities)
    df = pd.DataFrame(
        {
            "ts_1": [environment.end_date] * row_count,
            "created_ts": [environment.end_date] * row_count,
            "driver_id": driver_entities,
            "value": np.random.random(size=row_count),
        }
    )

    ds = environment.data_source_creator.create_data_source(
        df, destination_name="simple_driver_dataset"
    )
    simple_driver_fv = driver_feature_view(
        data_source=ds, name="test_universal_online_simple_driver"
    )

    fs.apply([driver(), simple_driver_fv, driver_stats_fv])
    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )

    expected_values = df.sort_values(by="driver_id")
    features = [f"{simple_driver_fv.name}:value"]
    entity_rows = [
        {"driver_id": driver_id} for driver_id in sorted(driver_entities)
    ]

    def read_online_values():
        # Fetch the "value" feature for every driver from the online store.
        response = fs.get_online_features(features=features, entity_rows=entity_rows)
        return response.to_dict()["value"]

    # Both feature views registered: values must match the source dataframe.
    assert np.allclose(expected_values["value"], read_online_values())

    # Drop the other feature view; the simple one must be unaffected.
    fs.apply(
        objects=[simple_driver_fv],
        objects_to_delete=[driver_stats_fv],
        partial=False,
    )
    assert np.allclose(expected_values["value"], read_online_values())

    # Drop the simple feature view as well.
    fs.apply(objects=[], objects_to_delete=[simple_driver_fv], partial=False)

    def eventually_apply() -> Tuple[None, bool]:
        try:
            fs.apply([simple_driver_fv])
        except BotoCoreError:
            return None, False
        else:
            return None, True

    # Online store backend might have eventual consistency in schema update
    # So recreating table that was just deleted might need some retries
    wait_retry_backoff(eventually_apply, timeout_secs=60)

    assert all(v is None for v in read_online_values())
def test_online_retrieval(environment, universal_data_sources, full_feature_names):
    """Check online retrieval against the latest values in the source dataframes.

    The test applies the universal feature views, entities and two feature
    services, materializes a window padded by one day on each side of the
    environment's date range, and then checks:

    * the set of response keys, with and without full feature names;
    * per-row values against ``get_latest_feature_values_from_dataframes``;
    * the two on demand outputs (``conv_rate + 100`` and
      ``conv_rate + val_to_add``);
    * that lookups for entity ids 0/0 return ``None`` for every compared
      feature except ``num_rides`` and ``avg_ride_length``;
    * that omitting ``val_to_add`` raises
      ``RequestDataNotFoundInEntityRowsException``;
    * the two feature services via the dedicated assertion helpers.
    """
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views.driver[["conv_rate"]],
            feature_views.driver_odfv,
            feature_views.customer[["current_balance"]],
        ],
    )
    # Same location feature view registered twice under different names, with
    # its "location_id" join key mapped to "origin_id" / "destination_id".
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            feature_views.location.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}),
            feature_views.location.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}),
        ],
    )

    feast_objects = []
    feast_objects.extend(feature_views.values())
    feast_objects.extend([
        driver(),
        customer(),
        location(),
        feature_service,
        feature_service_entity_mapping,
    ])
    fs.apply(feast_objects)
    # Materialize a window padded by one day on both sides.
    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )

    # Sample 10 orders; their driver/customer ids drive the online lookups and
    # are used to filter the reference dataframes below.
    entity_sample = datasets.orders_df.sample(10)[[
        "customer_id", "driver_id", "order_id", "event_timestamp"
    ]]
    orders_df = datasets.orders_df[(
        datasets.orders_df["customer_id"].isin(entity_sample["customer_id"])
        & datasets.orders_df["driver_id"].isin(entity_sample["driver_id"]))]

    sample_drivers = entity_sample["driver_id"]
    drivers_df = datasets.driver_df[datasets.driver_df["driver_id"].isin(
        sample_drivers)]

    sample_customers = entity_sample["customer_id"]
    customers_df = datasets.customer_df[
        datasets.customer_df["customer_id"].isin(sample_customers)]

    # All ordered pairs of distinct location values; 10 pairs are drawn at
    # random and transposed into [origins, destinations].
    location_pairs = np.array(
        list(itertools.permutations(entities.location_vals, 2)))
    sample_location_pairs = location_pairs[np.random.choice(
        len(location_pairs), 10)].T.tolist()
    origins_df = datasets.location_df[datasets.location_df["location_id"].isin(
        sample_location_pairs[0])]
    destinations_df = datasets.location_df[
        datasets.location_df["location_id"].isin(sample_location_pairs[1])]

    global_df = datasets.global_df

    # "val_to_add" is passed in each entity row next to the entity keys.
    entity_rows = [{
        "driver_id": d,
        "customer_id": c,
        "val_to_add": 50
    } for (d, c) in zip(sample_drivers, sample_customers)]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
    ]
    # Feature names without the "<feature view>:" prefix.
    unprefixed_feature_refs = [
        f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f
    ]
    # Remove the on demand feature view output features, since they're not present in the source dataframe
    unprefixed_feature_refs.remove("conv_rate_plus_100")
    unprefixed_feature_refs.remove("conv_rate_plus_val_to_add")

    online_features_dict = get_online_features_dict(
        environment=environment,
        features=feature_refs,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )

    # Test that the on demand feature views compute properly even if the dependent conv_rate
    # feature isn't requested.
    online_features_no_conv_rate = get_online_features_dict(
        environment=environment,
        features=[
            ref for ref in feature_refs if ref != "driver_stats:conv_rate"
        ],
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )
    assert online_features_no_conv_rate is not None

    # Expected response keys: "<view>__<feature>" with full feature names,
    # otherwise the bare feature name, plus the two entity key columns.
    keys = set(online_features_dict.keys())
    expected_keys = set(
        f.replace(":", "__") if full_feature_names else f.split(":")[-1]
        for f in feature_refs) | {"customer_id", "driver_id"}
    assert (
        keys == expected_keys
    ), f"Response keys are different from expected: {keys - expected_keys} (extra) and {expected_keys - keys} (missing)"

    # A bare TestCase instance is used only for its assert* helpers.
    tc = unittest.TestCase()
    for i, entity_row in enumerate(entity_rows):
        df_features = get_latest_feature_values_from_dataframes(
            driver_df=drivers_df,
            customer_df=customers_df,
            orders_df=orders_df,
            global_df=global_df,
            entity_row=entity_row,
        )

        assert df_features["customer_id"] == online_features_dict[
            "customer_id"][i]
        assert df_features["driver_id"] == online_features_dict["driver_id"][i]
        # On demand outputs are compared against values derived from conv_rate.
        tc.assertAlmostEqual(
            online_features_dict[response_feature_name("conv_rate_plus_100",
                                                       feature_refs,
                                                       full_feature_names)][i],
            df_features["conv_rate"] + 100,
            delta=0.0001,
        )
        tc.assertAlmostEqual(
            online_features_dict[response_feature_name(
                "conv_rate_plus_val_to_add", feature_refs,
                full_feature_names)][i],
            df_features["conv_rate"] + df_features["val_to_add"],
            delta=0.0001,
        )
        for unprefixed_feature_ref in unprefixed_feature_refs:
            tc.assertAlmostEqual(
                df_features[unprefixed_feature_ref],
                online_features_dict[response_feature_name(
                    unprefixed_feature_ref, feature_refs,
                    full_feature_names)][i],
                delta=0.0001,
            )

    # Check what happens for missing values
    missing_responses_dict = get_online_features_dict(
        environment=environment,
        features=feature_refs,
        entity_rows=[{
            "driver_id": 0,
            "customer_id": 0,
            "val_to_add": 100
        }],
        full_feature_names=full_feature_names,
    )
    assert missing_responses_dict is not None
    for unprefixed_feature_ref in unprefixed_feature_refs:
        # NOTE(review): num_rides / avg_ride_length are skipped here, presumably
        # because global_stats is not keyed by driver/customer; confirm.
        if unprefixed_feature_ref not in {"num_rides", "avg_ride_length"}:
            tc.assertIsNone(missing_responses_dict[response_feature_name(
                unprefixed_feature_ref, feature_refs, full_feature_names)][0])

    # Check what happens for missing request data
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        get_online_features_dict(
            environment=environment,
            features=feature_refs,
            entity_rows=[{
                "driver_id": 0,
                "customer_id": 0
            }],
            full_feature_names=full_feature_names,
        )

    assert_feature_service_correctness(
        environment,
        feature_service,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        global_df,
    )

    # Rebuild entity rows keyed by origin/destination for the entity-mapping
    # feature service; the driver/customer values only bound the zip length.
    entity_rows = [{
        "origin_id": origin,
        "destination_id": destination
    } for (_driver, _customer, origin, destination
           ) in zip(sample_drivers, sample_customers, *sample_location_pairs)]
    assert_feature_service_entity_mapping_correctness(
        environment,
        feature_service_entity_mapping,
        entity_rows,
        full_feature_names,
        origins_df,
        destinations_df,
    )
def test_entity_ttl_online_store(local_redis_environment, redis_universal_data_sources):
    """Check that written features are gone after the online store key TTL.

    Returns immediately when the ``FEAST_IS_LOCAL_TEST`` environment variable
    equals ``"True"``. Otherwise the key TTL is set to one second, a row is
    written and read back, and after a one second sleep the same lookup is
    expected to return ``None`` for every feature.
    """
    if os.getenv("FEAST_IS_LOCAL_TEST", "False") == "True":
        return

    fs = local_redis_environment.feature_store
    # setting ttl setting in online store to 1 second
    fs.config.online_store.key_ttl_seconds = 1

    _entities, _datasets, data_sources = redis_universal_data_sources
    hourly_stats_view = create_driver_hourly_stats_feature_view(data_sources.driver)
    driver_entity = driver()

    # Register Feature View and Entity
    fs.apply([hourly_stats_view, driver_entity])

    def _utc_now_ms():
        # Current UTC time, rounded to millisecond precision.
        return pd.Timestamp(datetime.datetime.utcnow()).round("ms")

    # fake data to ingest into Online Store
    df_ingest = pd.DataFrame(
        {
            "driver_id": [1],
            "conv_rate": [0.5],
            "acc_rate": [0.6],
            "avg_daily_trips": [4],
            "event_timestamp": [_utc_now_ms()],
            "created": [_utc_now_ms()],
        }
    )

    # directly ingest data into the Online Store
    fs.write_to_online_store("driver_stats", df_ingest)

    def read_driver_row():
        # Look up the three driver_stats features for driver 1.
        return fs.get_online_features(
            features=[
                "driver_stats:avg_daily_trips",
                "driver_stats:acc_rate",
                "driver_stats:conv_rate",
            ],
            entity_rows=[{"driver_id": 1}],
        ).to_df()

    # assert the right data is in the Online Store
    df = read_driver_row()
    assertpy.assert_that(df["avg_daily_trips"].iloc[0]).is_equal_to(4)
    assertpy.assert_that(df["acc_rate"].iloc[0]).is_close_to(0.6, 1e-6)
    assertpy.assert_that(df["conv_rate"].iloc[0]).is_close_to(0.5, 1e-6)

    # simulate time passing for testing ttl
    time.sleep(1)

    # retrieve the same entity again
    df = read_driver_row()

    # assert that the entity features expired in the online store
    for column in ("avg_daily_trips", "acc_rate", "conv_rate"):
        assertpy.assert_that(df[column].iloc[0]).is_none()
def test_online_retrieval(environment, universal_data_sources, full_feature_names):
    """Exercise ``get_online_features`` against the materialized universal views.

    Steps performed:

    * apply every universal feature view, the driver and customer entities and
      a ``convrate_plus100`` feature service, then materialize from one day
      before ``environment.start_date`` to one day after
      ``environment.end_date``;
    * fetch ``feature_refs`` for 10 sampled orders and compare each value with
      the one derived from the source dataframes by
      ``get_latest_feature_values_from_dataframes``;
    * request entity ids 0/0 and expect None for every source-backed feature
      except ``num_rides`` and ``avg_ride_length``;
    * omit ``val_to_add`` and expect
      ``RequestDataNotFoundInEntityRowsException``;
    * delegate to ``assert_feature_service_correctness`` for the service.

    Args:
        environment: fixture exposing ``feature_store``, ``start_date`` and
            ``end_date``.
        universal_data_sources: fixture unpacked as
            ``(entities, datasets, data_sources)``.
        full_feature_names: when truthy, response keys are expected in the
            ``<view>__<feature>`` form instead of the bare feature name.
    """
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]],
            feature_views["driver_odfv"],
        ],
    )

    feast_objects = []
    feast_objects.extend(feature_views.values())
    feast_objects.extend([driver(), customer(), feature_service])
    fs.apply(feast_objects)
    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )

    entity_sample = datasets["orders"].sample(10)[
        ["customer_id", "driver_id", "order_id", "event_timestamp"]
    ]
    orders_df = datasets["orders"][
        (
            datasets["orders"]["customer_id"].isin(entity_sample["customer_id"])
            & datasets["orders"]["driver_id"].isin(entity_sample["driver_id"])
        )
    ]
    sample_drivers = entity_sample["driver_id"]
    drivers_df = datasets["driver"][
        datasets["driver"]["driver_id"].isin(sample_drivers)
    ]
    sample_customers = entity_sample["customer_id"]
    customers_df = datasets["customer"][
        datasets["customer"]["customer_id"].isin(sample_customers)
    ]
    global_df = datasets["global"]

    entity_rows = [
        {"driver": d, "customer_id": c, "val_to_add": 50}
        for (d, c) in zip(sample_drivers, sample_customers)
    ]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
    ]
    unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f]
    # Remove the on demand feature view output features, since they're not
    # present in the source dataframe.
    unprefixed_feature_refs.remove("conv_rate_plus_100")
    unprefixed_feature_refs.remove("conv_rate_plus_val_to_add")

    online_features = fs.get_online_features(
        features=feature_refs,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )
    assert online_features is not None

    online_features_dict = online_features.to_dict()
    keys = online_features_dict.keys()

    # Three extra keys: the driver id and customer id entity keys plus the
    # val_to_add request data.
    assert len(keys) == len(feature_refs) + 3

    for feature in feature_refs:
        if full_feature_names:
            assert feature.replace(":", "__") in keys
        else:
            assert feature.rsplit(":", 1)[-1] in keys
            # A bare feature view name must never show up as a response key.
            assert (
                "driver_stats" not in keys
                and "customer_profile" not in keys
                and "order" not in keys
                and "global_stats" not in keys
            )

    tc = unittest.TestCase()
    for i, entity_row in enumerate(entity_rows):
        df_features = get_latest_feature_values_from_dataframes(
            drivers_df, customers_df, orders_df, global_df, entity_row
        )

        assert df_features["customer_id"] == online_features_dict["customer_id"][i]
        assert df_features["driver_id"] == online_features_dict["driver_id"][i]

        # The on demand outputs are computed floats, so compare with a
        # tolerance rather than exact equality.
        tc.assertAlmostEqual(
            online_features_dict[
                response_feature_name("conv_rate_plus_100", full_feature_names)
            ][i],
            df_features["conv_rate"] + 100,
            delta=0.0001,
        )
        tc.assertAlmostEqual(
            online_features_dict[
                response_feature_name("conv_rate_plus_val_to_add", full_feature_names)
            ][i],
            df_features["conv_rate"] + df_features["val_to_add"],
            delta=0.0001,
        )

        for unprefixed_feature_ref in unprefixed_feature_refs:
            tc.assertEqual(
                df_features[unprefixed_feature_ref],
                online_features_dict[
                    response_feature_name(unprefixed_feature_ref, full_feature_names)
                ][i],
            )

    # Check what happens for missing values
    missing_responses_dict = fs.get_online_features(
        features=feature_refs,
        entity_rows=[{"driver": 0, "customer_id": 0, "val_to_add": 100}],
        full_feature_names=full_feature_names,
    ).to_dict()
    assert missing_responses_dict is not None
    for unprefixed_feature_ref in unprefixed_feature_refs:
        # num_rides and avg_ride_length are excluded from the None check.
        if unprefixed_feature_ref not in {"num_rides", "avg_ride_length"}:
            tc.assertIsNone(
                missing_responses_dict[
                    response_feature_name(unprefixed_feature_ref, full_feature_names)
                ][0]
            )

    # Check what happens for missing request data
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        fs.get_online_features(
            features=feature_refs,
            entity_rows=[{"driver": 0, "customer_id": 0}],
            full_feature_names=full_feature_names,
        ).to_dict()

    assert_feature_service_correctness(
        fs,
        feature_service,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        global_df,
    )
def test_historical_features_persisting(environment, universal_data_sources, full_feature_names):
    """Check that a historical retrieval job can be persisted as a saved dataset.

    A retrieval job is created from the entity dataframe (minus ``order_id``,
    ``origin_id`` and ``destination_id``), stored through
    ``create_saved_dataset``, and the saved dataset's dataframe is compared
    both with the expected training dataframe and with the job's own output.
    """
    store = environment.feature_store
    (_entities, datasets, data_sources) = universal_data_sources
    views = construct_universal_feature_views(data_sources)

    store.apply([driver(), customer(), location(), *views.values()])

    trimmed_entity_df = datasets.entity_df.drop(
        columns=["order_id", "origin_id", "destination_id"]
    )

    requested_features = [
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
        "field_mapping:feature_name",
    ]
    retrieval_job = store.get_historical_features(
        entity_df=trimmed_entity_df,
        features=requested_features,
        full_feature_names=full_feature_names,
    )

    destination = environment.data_source_creator.create_saved_dataset_destination()
    persisted = store.create_saved_dataset(
        from_=retrieval_job,
        name="saved_dataset",
        storage=destination,
        tags={"env": "test"},
    )

    ts_column = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
    full_training_df = get_expected_training_df(
        datasets.customer_df,
        views.customer,
        datasets.driver_df,
        views.driver,
        datasets.orders_df,
        views.order,
        datasets.location_df,
        views.location,
        datasets.global_df,
        views.global_fv,
        datasets.field_mapping_df,
        views.field_mapping,
        trimmed_entity_df,
        ts_column,
        full_feature_names,
    )
    # Columns that were not part of the request above.
    unrequested_columns = [
        response_feature_name("conv_rate_plus_100", full_feature_names),
        response_feature_name("conv_rate_plus_100_rounded", full_feature_names),
        response_feature_name("avg_daily_trips", full_feature_names),
        response_feature_name("conv_rate", full_feature_names),
        "origin__temperature",
        "destination__temperature",
    ]
    expected = full_training_df.drop(columns=unrequested_columns)

    # The saved dataset must match the expected training dataframe ...
    persisted_df = persisted.to_df()
    assert_frame_equal(
        expected,
        persisted_df,
        keys=[ts_column, "driver_id", "customer_id"],
    )

    # ... and must also match what the job itself returns.
    job_df = retrieval_job.to_df()
    persisted_df_again = persisted.to_df()
    assert_frame_equal(
        job_df,
        persisted_df_again,
        keys=[ts_column, "driver_id", "customer_id"],
    )
def test_historical_features_with_entities_from_query(environment, universal_data_sources, full_feature_names):
    """Retrieve historical features using a SQL query as the entity dataframe.

    The test is skipped when ``table_name_from_data_source`` returns nothing
    for the orders source. Otherwise it selects the entity columns from the
    orders table (with quoted identifiers when the offline store creator is
    the Snowflake one), runs the retrieval, and compares both ``to_df()`` and
    ``to_arrow().to_pandas()`` with the expected training dataframe.

    Args:
        environment: fixture exposing ``feature_store`` and
            ``test_repo_config``.
        universal_data_sources: fixture unpacked as
            ``(entities, datasets, data_sources)``.
        full_feature_names: forwarded to the retrieval and to the helpers that
            build expected column names.
    """
    store = environment.feature_store
    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    orders_table = table_name_from_data_source(data_sources.orders)
    if not orders_table:
        # pytest.skip raises on its own; no explicit ``raise`` is needed.
        pytest.skip("Offline source is not sql-based")

    data_source_creator = environment.test_repo_config.offline_store_creator
    if data_source_creator.__name__ == SnowflakeDataSourceCreator.__name__:
        entity_df_query = f""" SELECT "customer_id", "driver_id", "order_id", "origin_id", "destination_id", "event_timestamp" FROM "{orders_table}" """
    else:
        entity_df_query = f""" SELECT customer_id, driver_id, order_id, origin_id, destination_id, event_timestamp FROM {orders_table} """

    store.apply([driver(), customer(), location(), *feature_views.values()])

    job_from_sql = store.get_historical_features(
        entity_df=entity_df_query,
        features=[
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "field_mapping:feature_name",
        ],
        full_feature_names=full_feature_names,
    )

    start_time = datetime.utcnow()
    actual_df_from_sql_entities = job_from_sql.to_df()
    end_time = datetime.utcnow()
    print(f"\nTime to execute job_from_sql.to_df() = '{(end_time - start_time)}'")

    event_timestamp = (
        DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
        if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in datasets.orders_df.columns
        else "e_ts"
    )
    full_expected_df = get_expected_training_df(
        datasets.customer_df,
        feature_views.customer,
        datasets.driver_df,
        feature_views.driver,
        datasets.orders_df,
        feature_views.order,
        datasets.location_df,
        feature_views.location,
        datasets.global_df,
        feature_views.global_fv,
        datasets.field_mapping_df,
        feature_views.field_mapping,
        datasets.entity_df,
        event_timestamp,
        full_feature_names,
    )

    # Not requesting the on demand transform with an entity_df query
    # (can't add request data in them)
    expected_df_query = full_expected_df.drop(
        columns=[
            response_feature_name("conv_rate_plus_100", full_feature_names),
            response_feature_name("conv_rate_plus_100_rounded", full_feature_names),
            response_feature_name("avg_daily_trips", full_feature_names),
            response_feature_name("conv_rate", full_feature_names),
            "origin__temperature",
            "destination__temperature",
        ]
    )
    assert_frame_equal(
        expected_df_query,
        actual_df_from_sql_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )

    table_from_sql_entities = job_from_sql.to_arrow().to_pandas()
    # Cast the expected columns to the dtypes of the Arrow-derived frame
    # before comparing.
    for col in table_from_sql_entities.columns:
        expected_df_query[col] = expected_df_query[col].astype(
            table_from_sql_entities[col].dtype
        )

    assert_frame_equal(
        expected_df_query,
        table_from_sql_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )
def test_historical_features(environment, universal_data_sources, full_feature_names):
    """Retrieve historical features from a dataframe and, if possible, from SQL.

    The entity dataframe is extended with ``val_to_add`` (0..n-1) and
    ``driver_age`` (100..n+99) request-data columns. The test then:

    * applies the feature views, entities and two feature services;
    * when ``table_name_from_data_source`` yields an orders table, runs the
      retrieval from a SQL entity query and compares ``to_df()`` and
      ``to_arrow()`` output with the expected frame (without the on demand
      and request-data columns);
    * runs the retrieval from the dataframe and compares ``to_df()`` and
      ``to_arrow()`` output with the expected frame;
    * checks both feature services through their assertion helpers;
    * expects ``RequestDataNotFoundInEntityDfException`` when ``val_to_add``
      or ``driver_age`` is absent from the entity dataframe.

    Args:
        environment: fixture exposing ``feature_store``.
        universal_data_sources: fixture unpacked as
            ``(entities, datasets, data_sources)``.
        full_feature_names: forwarded to every retrieval and helper call.
    """
    store = environment.feature_store
    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    customer_df, driver_df, location_df, orders_df, global_df, entity_df = (
        datasets["customer"],
        datasets["driver"],
        datasets["location"],
        datasets["orders"],
        datasets["global"],
        datasets["entity"],
    )

    entity_df_with_request_data = entity_df.copy(deep=True)
    row_count = len(entity_df_with_request_data)
    entity_df_with_request_data["val_to_add"] = list(range(row_count))
    entity_df_with_request_data["driver_age"] = list(range(100, row_count + 100))

    (
        customer_fv,
        driver_fv,
        driver_odfv,
        location_fv,
        order_fv,
        global_fv,
        driver_age_request_fv,
    ) = (
        feature_views["customer"],
        feature_views["driver"],
        feature_views["driver_odfv"],
        feature_views["location"],
        feature_views["order"],
        feature_views["global"],
        feature_views["driver_age_request_fv"],
    )

    feature_service = FeatureService(
        name="convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]],
            driver_odfv,
            driver_age_request_fv,
        ],
    )
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            location_fv.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}
            ),
            location_fv.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}
            ),
        ],
    )

    store.apply(
        [
            customer_fv,
            driver_fv,
            driver_odfv,
            location_fv,
            order_fv,
            global_fv,
            driver_age_request_fv,
            driver(),
            customer(),
            location(),
            feature_service,
            feature_service_entity_mapping,
        ]
    )

    entity_df_query = None
    orders_table = table_name_from_data_source(data_sources["orders"])
    if orders_table:
        entity_df_query = f"SELECT customer_id, driver_id, order_id, origin_id, destination_id, event_timestamp FROM {orders_table}"

    event_timestamp = (
        DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
        if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in orders_df.columns
        else "e_ts"
    )
    sort_keys = [event_timestamp, "order_id", "driver_id", "customer_id"]

    def _canonical(frame):
        # Sort by the entity keys, drop duplicate rows and renumber the index
        # so two frames can be compared positionally.
        return (
            frame.sort_values(by=sort_keys)
            .drop_duplicates()
            .reset_index(drop=True)
        )

    full_expected_df = get_expected_training_df(
        customer_df,
        customer_fv,
        driver_df,
        driver_fv,
        orders_df,
        order_fv,
        location_df,
        location_fv,
        global_df,
        global_fv,
        entity_df_with_request_data,
        event_timestamp,
        full_feature_names,
    )

    # Only need the shadow entities features in the FeatureService test
    expected_df = full_expected_df.drop(
        columns=["origin__temperature", "destination__temperature"],
    )

    if entity_df_query:
        job_from_sql = store.get_historical_features(
            entity_df=entity_df_query,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "order:order_is_success",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )

        start_time = datetime.utcnow()
        actual_df_from_sql_entities = job_from_sql.to_df()
        end_time = datetime.utcnow()
        print(f"\nTime to execute job_from_sql.to_df() = '{(end_time - start_time)}'")

        # Not requesting the on demand transform with an entity_df query
        # (can't add request data in them)
        expected_df_query = expected_df.drop(
            columns=[
                "conv_rate_plus_100",
                "conv_rate_plus_100_rounded",
                "val_to_add",
                "conv_rate_plus_val_to_add",
                "driver_age",
            ]
        )
        assert sorted(expected_df_query.columns) == sorted(
            actual_df_from_sql_entities.columns
        )

        actual_df_from_sql_entities = _canonical(
            actual_df_from_sql_entities[expected_df_query.columns]
        )
        expected_df_query = _canonical(expected_df_query)

        assert_frame_equal(
            actual_df_from_sql_entities,
            expected_df_query,
            check_dtype=False,
        )

        table_from_sql_entities = job_from_sql.to_arrow()
        df_from_sql_entities = _canonical(
            table_from_sql_entities.to_pandas()[expected_df_query.columns]
        )

        # Cast the expected columns to the Arrow-derived dtypes so the strict
        # comparison below checks values only.
        for col in df_from_sql_entities.columns:
            expected_df_query[col] = expected_df_query[col].astype(
                df_from_sql_entities[col].dtype
            )

        assert_frame_equal(expected_df_query, df_from_sql_entities)

    job_from_df = store.get_historical_features(
        entity_df=entity_df_with_request_data,
        features=[
            "driver_stats:conv_rate",
            "driver_stats:avg_daily_trips",
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "conv_rate_plus_100:conv_rate_plus_100",
            "conv_rate_plus_100:conv_rate_plus_100_rounded",
            "conv_rate_plus_100:conv_rate_plus_val_to_add",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "driver_age:driver_age",
        ],
        full_feature_names=full_feature_names,
    )

    start_time = datetime.utcnow()
    actual_df_from_df_entities = job_from_df.to_df()
    # Stop the clock before printing so only to_df() is timed.
    end_time = datetime.utcnow()
    print(f"actual_df_from_df_entities shape: {actual_df_from_df_entities.shape}")
    print(f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n")

    assert sorted(expected_df.columns) == sorted(actual_df_from_df_entities.columns)

    expected_df = _canonical(expected_df)
    actual_df_from_df_entities = _canonical(
        actual_df_from_df_entities[expected_df.columns]
    )

    assert_frame_equal(
        expected_df,
        actual_df_from_df_entities,
        check_dtype=False,
    )

    assert_feature_service_correctness(
        store,
        feature_service,
        full_feature_names,
        entity_df_with_request_data,
        expected_df,
        event_timestamp,
    )
    assert_feature_service_entity_mapping_correctness(
        store,
        feature_service_entity_mapping,
        full_feature_names,
        entity_df_with_request_data,
        full_expected_df,
        event_timestamp,
    )

    table_from_df_entities: pd.DataFrame = job_from_df.to_arrow().to_pandas()
    columns_expected_in_table = expected_df.columns.tolist()
    table_from_df_entities = _canonical(
        table_from_df_entities[columns_expected_in_table]
    )
    assert_frame_equal(actual_df_from_df_entities, table_from_df_entities)

    # If request data is missing that's needed for on demand transform, throw an error
    with pytest.raises(RequestDataNotFoundInEntityDfException):
        store.get_historical_features(
            entity_df=entity_df,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "conv_rate_plus_100:conv_rate_plus_100",
                "conv_rate_plus_100:conv_rate_plus_val_to_add",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )

    # If request data is missing that's needed for a request feature view, throw an error
    with pytest.raises(RequestDataNotFoundInEntityDfException):
        store.get_historical_features(
            entity_df=entity_df,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "driver_age:driver_age",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )
def test_historical_features(environment, universal_data_sources, full_feature_names):
    """Retrieve historical features from an entity dataframe with request data.

    The entity dataframe is copied and extended with ``val_to_add`` (0..n-1)
    and ``driver_age`` (100..n+99). After applying the entities, feature views
    and two feature services, the test compares the ``to_df()`` and
    ``to_arrow().to_pandas()`` output of the retrieval job with the expected
    training dataframe, and checks both feature services through their
    assertion helpers.

    Args:
        environment: fixture exposing ``feature_store``.
        universal_data_sources: fixture unpacked as
            ``(entities, datasets, data_sources)``.
        full_feature_names: forwarded to every retrieval and helper call.
    """
    store = environment.feature_store
    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    entity_df_with_request_data = datasets.entity_df.copy(deep=True)
    row_count = len(entity_df_with_request_data)
    entity_df_with_request_data["val_to_add"] = list(range(row_count))
    entity_df_with_request_data["driver_age"] = list(range(100, row_count + 100))

    feature_service = FeatureService(
        name="convrate_plus100",
        features=[
            feature_views.driver[["conv_rate"]],
            feature_views.driver_odfv,
        ],
    )
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            feature_views.location.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}
            ),
            feature_views.location.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}
            ),
        ],
    )

    store.apply(
        [
            driver(),
            customer(),
            location(),
            feature_service,
            feature_service_entity_mapping,
            *feature_views.values(),
        ]
    )

    event_timestamp = (
        DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
        if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in datasets.orders_df.columns
        else "e_ts"
    )
    full_expected_df = get_expected_training_df(
        datasets.customer_df,
        feature_views.customer,
        datasets.driver_df,
        feature_views.driver,
        datasets.orders_df,
        feature_views.order,
        datasets.location_df,
        feature_views.location,
        datasets.global_df,
        feature_views.global_fv,
        datasets.field_mapping_df,
        feature_views.field_mapping,
        entity_df_with_request_data,
        event_timestamp,
        full_feature_names,
    )

    # Only need the shadow entities features in the FeatureService test
    expected_df = full_expected_df.drop(
        columns=["origin__temperature", "destination__temperature"],
    )

    job_from_df = store.get_historical_features(
        entity_df=entity_df_with_request_data,
        features=[
            "driver_stats:conv_rate",
            "driver_stats:avg_daily_trips",
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "conv_rate_plus_100:conv_rate_plus_100",
            "conv_rate_plus_100:conv_rate_plus_100_rounded",
            "conv_rate_plus_100:conv_rate_plus_val_to_add",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "field_mapping:feature_name",
        ],
        full_feature_names=full_feature_names,
    )

    start_time = datetime.utcnow()
    actual_df_from_df_entities = job_from_df.to_df()
    # Stop the clock before printing so only to_df() is timed.
    end_time = datetime.utcnow()
    print(f"actual_df_from_df_entities shape: {actual_df_from_df_entities.shape}")
    print(f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n")

    assert sorted(expected_df.columns) == sorted(actual_df_from_df_entities.columns)
    assert_frame_equal(
        expected_df,
        actual_df_from_df_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )

    assert_feature_service_correctness(
        store,
        feature_service,
        full_feature_names,
        entity_df_with_request_data,
        expected_df,
        event_timestamp,
    )
    assert_feature_service_entity_mapping_correctness(
        store,
        feature_service_entity_mapping,
        full_feature_names,
        entity_df_with_request_data,
        full_expected_df,
        event_timestamp,
    )

    table_from_df_entities: pd.DataFrame = job_from_df.to_arrow().to_pandas()
    assert_frame_equal(
        expected_df,
        table_from_df_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )
def test_online_retrieval(environment, universal_data_sources, full_feature_names):
    """Check online retrieval for feature views, request data and feature services.

    After applying and materializing the universal feature views, the test
    fetches ``feature_refs`` for 10 sampled orders and compares every value
    (within 0.0001) with the one derived from the source dataframes. It then
    checks the response for entity ids 0/0, expects
    ``RequestDataNotFoundInEntityRowsException`` when request data is left
    out, and finally runs the two feature-service assertion helpers.
    """
    store = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    views = construct_universal_feature_views(data_sources)

    convrate_service = FeatureService(
        "convrate_plus100",
        features=[
            views["driver"][["conv_rate"]],
            views["driver_odfv"],
            views["driver_age_request_fv"],
        ],
    )
    origin_view = views["location"].with_name("origin").with_join_key_map(
        {"location_id": "origin_id"}
    )
    destination_view = views["location"].with_name("destination").with_join_key_map(
        {"location_id": "destination_id"}
    )
    entity_mapping_service = FeatureService(
        name="entity_mapping",
        features=[origin_view, destination_view],
    )

    store.apply(
        [
            *views.values(),
            driver(),
            customer(),
            location(),
            convrate_service,
            entity_mapping_service,
        ]
    )

    one_day = timedelta(days=1)
    store.materialize(environment.start_date - one_day, environment.end_date + one_day)

    # Sample 10 orders and keep only the source rows that touch them.
    entity_sample = datasets["orders"].sample(10)[
        ["customer_id", "driver_id", "order_id", "event_timestamp"]
    ]
    customer_match = datasets["orders"]["customer_id"].isin(entity_sample["customer_id"])
    driver_match = datasets["orders"]["driver_id"].isin(entity_sample["driver_id"])
    orders_df = datasets["orders"][(customer_match & driver_match)]

    sample_drivers = entity_sample["driver_id"]
    drivers_df = datasets["driver"][datasets["driver"]["driver_id"].isin(sample_drivers)]
    sample_customers = entity_sample["customer_id"]
    customers_df = datasets["customer"][
        datasets["customer"]["customer_id"].isin(sample_customers)
    ]

    # Ordered (origin, destination) pairs; pick 10 and transpose so row 0
    # holds the origins and row 1 the destinations.
    location_pairs = np.array(list(itertools.permutations(entities["location"], 2)))
    picked = np.random.choice(len(location_pairs), 10)
    sample_location_pairs = location_pairs[picked].T
    origins_df = datasets["location"][
        datasets["location"]["location_id"].isin(sample_location_pairs[0])
    ]
    destinations_df = datasets["location"][
        datasets["location"]["location_id"].isin(sample_location_pairs[1])
    ]
    global_df = datasets["global"]

    entity_rows = []
    for d, c in zip(sample_drivers, sample_customers):
        entity_rows.append(
            {"driver": d, "customer_id": c, "val_to_add": 50, "driver_age": 25}
        )

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
        "driver_age:driver_age",
    ]

    # Bare feature names, minus the on demand feature view outputs, which are
    # not present in the source dataframes.
    odfv_outputs = ("conv_rate_plus_100", "conv_rate_plus_val_to_add")
    bare_names = [ref.rsplit(":", 1)[-1] for ref in feature_refs if ":" in ref]
    unprefixed_feature_refs = [name for name in bare_names if name not in odfv_outputs]

    online_features = store.get_online_features(
        features=feature_refs,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )
    assert online_features is not None

    online_features_dict = online_features.to_dict()
    keys = online_features_dict.keys()

    # Three extra keys: the driver id and customer id entity keys plus the
    # val_to_add request data.
    assert len(keys) == len(feature_refs) + 3

    for ref in feature_refs:
        # full_feature_names does not apply to request feature views
        if full_feature_names and ref != "driver_age:driver_age":
            assert ref.replace(":", "__") in keys
            continue
        assert ref.rsplit(":", 1)[-1] in keys
        # A bare feature view name must never show up as a response key.
        assert not (
            "driver_stats" in keys
            or "customer_profile" in keys
            or "order" in keys
            or "global_stats" in keys
        )

    tc = unittest.TestCase()
    for row_index, entity_row in enumerate(entity_rows):
        df_features = get_latest_feature_values_from_dataframes(
            driver_df=drivers_df,
            customer_df=customers_df,
            orders_df=orders_df,
            global_df=global_df,
            entity_row=entity_row,
        )

        assert df_features["customer_id"] == online_features_dict["customer_id"][row_index]
        assert df_features["driver_id"] == online_features_dict["driver_id"][row_index]

        plus_100_key = response_feature_name("conv_rate_plus_100", full_feature_names)
        tc.assertAlmostEqual(
            online_features_dict[plus_100_key][row_index],
            df_features["conv_rate"] + 100,
            delta=0.0001,
        )

        plus_val_key = response_feature_name(
            "conv_rate_plus_val_to_add", full_feature_names
        )
        tc.assertAlmostEqual(
            online_features_dict[plus_val_key][row_index],
            df_features["conv_rate"] + df_features["val_to_add"],
            delta=0.0001,
        )

        for name in unprefixed_feature_refs:
            expected_value = df_features[name]
            response_key = response_feature_name(name, full_feature_names)
            tc.assertAlmostEqual(
                expected_value,
                online_features_dict[response_key][row_index],
                delta=0.0001,
            )

    # Check what happens for missing values
    missing_response = store.get_online_features(
        features=feature_refs,
        entity_rows=[
            {"driver": 0, "customer_id": 0, "val_to_add": 100, "driver_age": 125}
        ],
        full_feature_names=full_feature_names,
    )
    missing_responses_dict = missing_response.to_dict()
    assert missing_responses_dict is not None

    not_expected_none = {"num_rides", "avg_ride_length", "driver_age"}
    for name in unprefixed_feature_refs:
        if name in not_expected_none:
            continue
        tc.assertIsNone(
            missing_responses_dict[response_feature_name(name, full_feature_names)][0]
        )

    # Check what happens for missing request data
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        store.get_online_features(
            features=feature_refs,
            entity_rows=[{"driver": 0, "customer_id": 0}],
            full_feature_names=full_feature_names,
        ).to_dict()

    # Same check when val_to_add is supplied but driver_age is still absent
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        store.get_online_features(
            features=feature_refs,
            entity_rows=[{"driver": 0, "customer_id": 0, "val_to_add": 20}],
            full_feature_names=full_feature_names,
        ).to_dict()

    assert_feature_service_correctness(
        store,
        convrate_service,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        global_df,
    )

    # Rebuild the entity rows with origin/destination ids for the
    # entity-mapping feature service.
    entity_rows = []
    for d, c, origin, destination in zip(
        sample_drivers, sample_customers, *sample_location_pairs
    ):
        entity_rows.append(
            {
                "driver": d,
                "customer_id": c,
                "origin_id": origin,
                "destination_id": destination,
            }
        )

    assert_feature_service_entity_mapping_correctness(
        store,
        entity_mapping_service,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        origins_df,
        destinations_df,
    )