def test_historical_features_with_missing_request_data(
    environment, universal_data_sources, full_feature_names
):
    """Expect historical retrieval to fail when request data is missing.

    The unmodified entity dataframe is used while on demand feature view
    outputs are requested, and ``get_historical_features`` is expected to
    raise ``RequestDataNotFoundInEntityDfException``.
    """
    _, datasets, data_sources = universal_data_sources
    feature_store = environment.feature_store

    universal_views = construct_universal_feature_views(data_sources)
    feature_store.apply(
        [driver(), customer(), location(), *universal_views.values()]
    )

    requested_features = [
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
        "field_mapping:feature_name",
    ]

    # Request data needed by the on demand transform is absent, so this must
    # raise rather than silently return partial results.
    with pytest.raises(RequestDataNotFoundInEntityDfException):
        feature_store.get_historical_features(
            entity_df=datasets.entity_df,
            features=requested_features,
            full_feature_names=full_feature_names,
        )
def test_registration_and_retrieval_from_custom_s3_endpoint(universal_data_sources):
    """Apply, materialize and read one online feature using the S3 file data source creator.

    Placeholder AWS credentials are exported for the duration of the test.
    They are removed again in a ``finally`` clause so a failing test body does
    not leak them into tests that run afterwards, and a secret key that was
    already present before the test is restored instead of deleted.

    Raises:
        Exception: if ``AWS_ACCESS_KEY_ID`` is already set in the environment,
            so existing credentials are never overwritten.
    """
    config = IntegrationTestRepoConfig(
        offline_store_creator="tests.integration.feature_repos.universal.data_sources.file.S3FileDataSourceCreator"
    )
    import os

    if "AWS_ACCESS_KEY_ID" in os.environ:
        raise Exception(
            "AWS_ACCESS_KEY_ID has already been set in the environment. Setting it again may cause a conflict. "
            "It may be better to deduplicate AWS configuration or use sub-processes for isolation"
        )

    # Only the access key id is guarded above; remember any existing secret so
    # it can be put back afterwards.
    previous_secret = os.environ.get("AWS_SECRET_ACCESS_KEY")

    os.environ["AWS_ACCESS_KEY_ID"] = "AKIAIOSFODNN7EXAMPLE"
    os.environ["AWS_SECRET_ACCESS_KEY"] = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"

    try:
        with construct_test_environment(config) as environment:
            fs = environment.feature_store

            entities, datasets, data_sources = universal_data_sources
            feature_views = construct_universal_feature_views(data_sources)

            feast_objects = []
            feast_objects.extend(feature_views.values())
            feast_objects.extend([driver(), customer()])
            fs.apply(feast_objects)
            fs.materialize(environment.start_date, environment.end_date)

            out = fs.get_online_features(
                features=["driver_stats:conv_rate"], entity_rows=[{"driver": 5001}]
            ).to_dict()
            assert out["conv_rate"][0] is not None
    finally:
        # Always undo the environment changes, even when the body above fails.
        os.environ.pop("AWS_ACCESS_KEY_ID", None)
        if previous_secret is None:
            os.environ.pop("AWS_SECRET_ACCESS_KEY", None)
        else:
            os.environ["AWS_SECRET_ACCESS_KEY"] = previous_secret
def test_push_features_and_read(environment, universal_data_sources):
    """Push a single row to ``location_stats_push_source`` and read it back online."""
    feature_store = environment.feature_store
    _, _datasets, data_sources = universal_data_sources

    universal_views = construct_universal_feature_views(data_sources)
    feature_store.apply(
        [driver(), customer(), location(), *universal_views.values()]
    )

    # One row for location 1; both timestamps are taken "now" and rounded to
    # milliseconds.
    pushed_rows = pd.DataFrame(
        {
            "location_id": [1],
            "temperature": [4],
            "event_timestamp": [
                pd.Timestamp(datetime.datetime.utcnow()).round("ms")
            ],
            "created": [pd.Timestamp(datetime.datetime.utcnow()).round("ms")],
        }
    )
    feature_store.push("location_stats_push_source", pushed_rows)

    response = feature_store.get_online_features(
        features=["pushable_location_stats:temperature"],
        entity_rows=[{"location_id": 1}],
    ).to_dict()

    assert response["location_id"] == [1]
    assert response["temperature"] == [4]
def test_historical_retrieval_with_validation(environment, universal_data_sources):
    """Validate a retrieval job against a dataset saved from an identical job.

    Two jobs are created from the same entity dataframe and feature list. The
    first is persisted as ``my_training_dataset`` and turned into a validation
    reference; the second is materialized with that reference and is expected
    to complete without raising.
    """
    feature_store = environment.feature_store
    _entities, datasets, data_sources = universal_data_sources

    universal_views = construct_universal_feature_views(data_sources)
    feature_store.apply(
        [driver(), customer(), location(), *universal_views.values()]
    )

    # Both retrieval jobs share exactly the same inputs.
    trimmed_entity_df = datasets.entity_df.drop(
        columns=["order_id", "origin_id", "destination_id"]
    )
    reference_job = feature_store.get_historical_features(
        entity_df=trimmed_entity_df, features=_features
    )
    job = feature_store.get_historical_features(
        entity_df=trimmed_entity_df, features=_features
    )

    # Persist the reference job, then load it back by name.
    destination = (
        environment.data_source_creator.create_saved_dataset_destination()
    )
    feature_store.create_saved_dataset(
        from_=reference_job,
        name="my_training_dataset",
        storage=destination,
    )
    saved_dataset = feature_store.get_saved_dataset("my_training_dataset")

    # A passing validation simply means no exception is raised here.
    validation_reference = saved_dataset.as_reference(
        profiler=configurable_profiler
    )
    job.to_df(validation_reference=validation_reference)
def test_online_retrieval(environment, universal_data_sources, benchmark):
    """Benchmark ``get_online_features`` over ten sampled driver/customer rows.

    The universal feature views and a ``convrate_plus100`` feature service are
    applied and materialized, after which the ``benchmark`` callable is used
    to invoke ``fs.get_online_features`` with a fixed list of feature
    references. No assertions are made on the returned values.
    """
    fs = environment.feature_store
    entities, _datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]],
            feature_views["driver_odfv"],
        ],
    )

    feast_objects = []
    feast_objects.extend(feature_views.values())
    feast_objects.extend([driver(), customer(), location(), feature_service])
    fs.apply(feast_objects)
    fs.materialize(environment.start_date, environment.end_date)

    sample_drivers = random.sample(entities["driver"], 10)
    sample_customers = random.sample(entities["customer"], 10)

    # "val_to_add" is supplied with every row alongside the entity keys.
    entity_rows = [
        {"driver": d, "customer_id": c, "val_to_add": 50}
        for (d, c) in zip(sample_drivers, sample_customers)
    ]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
    ]

    benchmark(
        fs.get_online_features,
        features=feature_refs,
        entity_rows=entity_rows,
    )
def test_infer_odfv_features_with_error(environment, universal_data_sources):
    """Expect ``store.apply`` to reject an ODFV declared with ``conv_rate_plus_200``.

    The on demand feature view is built by ``conv_rate_plus_100_feature_view``
    but declares a single ``conv_rate_plus_200`` field; applying it must raise
    ``SpecifiedFeaturesNotPresentError``.
    """
    feature_store = environment.feature_store
    _entities, _datasets, data_sources = universal_data_sources

    declared_fields = [Field(name="conv_rate_plus_200", dtype=Float64)]
    hourly_stats_view = create_driver_hourly_stats_batch_feature_view(
        data_sources.driver
    )
    odfv_sources = [hourly_stats_view, create_conv_rate_request_source()]
    odfv = conv_rate_plus_100_feature_view(odfv_sources, features=declared_fields)

    objects_to_apply = [hourly_stats_view, odfv, driver(), customer()]
    with pytest.raises(SpecifiedFeaturesNotPresentError):
        feature_store.apply(objects_to_apply)
def test_infer_odfv_features_with_error(environment, universal_data_sources):
    """Expect ``store.apply`` to reject an ODFV declared with ``conv_rate_plus_200``.

    The on demand feature view is built by ``conv_rate_plus_100_feature_view``
    from a driver feature view and a request data source, but declares a
    single ``conv_rate_plus_200`` feature; applying it must raise
    ``SpecifiedFeaturesNotPresentError``.
    """
    feature_store = environment.feature_store
    _entities, _datasets, data_sources = universal_data_sources

    declared_features = [Feature("conv_rate_plus_200", ValueType.DOUBLE)]
    hourly_stats_view = create_driver_hourly_stats_feature_view(
        data_sources["driver"]
    )
    odfv_inputs = {
        "driver": hourly_stats_view,
        "input_request": create_conv_rate_request_data_source(),
    }
    odfv = conv_rate_plus_100_feature_view(
        odfv_inputs, features=declared_features
    )

    objects_to_apply = [hourly_stats_view, odfv, driver(), customer()]
    with pytest.raises(SpecifiedFeaturesNotPresentError):
        feature_store.apply(objects_to_apply)
def test_infer_odfv_features(environment, universal_data_sources, infer_features):
    """Check that the applied ``conv_rate_plus_100`` ODFV ends up with two features.

    ``infer_features`` is forwarded unchanged to
    ``conv_rate_plus_100_feature_view``; the view is then applied and read
    back from the store by name.
    """
    feature_store = environment.feature_store
    _entities, _datasets, data_sources = universal_data_sources

    hourly_stats_view = create_driver_hourly_stats_feature_view(
        data_sources["driver"]
    )
    odfv_inputs = {
        "driver": hourly_stats_view,
        "input_request": create_conv_rate_request_data_source(),
    }
    odfv = conv_rate_plus_100_feature_view(
        odfv_inputs, infer_features=infer_features
    )

    feature_store.apply([hourly_stats_view, odfv, driver(), customer()])

    registered_odfv = feature_store.get_on_demand_feature_view(
        "conv_rate_plus_100"
    )
    assert len(registered_odfv.features) == 2
def setup_python_fs_client():
    """Yield a ``TestClient`` for an app backed by a populated feature store.

    A default ``IntegrationTestRepoConfig`` environment is constructed, the
    universal feature views and the driver, customer and location entities
    are applied and materialized, and a client wrapping ``get_app(fs)`` is
    yielded. The feature store and the data source creator are torn down once
    the generator is resumed or closed.
    """
    environment = construct_test_environment(IntegrationTestRepoConfig())
    fs = environment.feature_store
    try:
        _entities, _datasets, data_sources = construct_universal_test_data(
            environment
        )
        universal_views = construct_universal_feature_views(data_sources)

        feast_objects: List[FeastObject] = [
            *universal_views.values(),
            driver(),
            customer(),
            location(),
        ]
        fs.apply(feast_objects)
        fs.materialize(environment.start_date, environment.end_date)

        yield TestClient(get_app(fs))
    finally:
        fs.teardown()
        environment.data_source_creator.teardown()
def test_historical_retrieval_fails_on_validation(environment, universal_data_sources):
    """Expect ``ValidationFailed`` when validating against an unrealistic profiler.

    A reference job is saved as ``my_other_dataset``. A second, identical job
    is then materialized using that dataset as validation reference with
    ``profiler_with_unrealistic_expectations``; the resulting report must list
    exactly two failed expectations, in a fixed order.
    """
    feature_store = environment.feature_store
    _entities, datasets, data_sources = universal_data_sources

    universal_views = construct_universal_feature_views(data_sources)
    feature_store.apply(
        [driver(), customer(), location(), *universal_views.values()]
    )

    trimmed_entity_df = datasets.entity_df.drop(
        columns=["order_id", "origin_id", "destination_id"]
    )

    reference_job = feature_store.get_historical_features(
        entity_df=trimmed_entity_df, features=_features
    )
    destination = (
        environment.data_source_creator.create_saved_dataset_destination()
    )
    feature_store.create_saved_dataset(
        from_=reference_job,
        name="my_other_dataset",
        storage=destination,
    )

    job = feature_store.get_historical_features(
        entity_df=trimmed_entity_df, features=_features
    )

    with pytest.raises(ValidationFailed) as exc_info:
        # The reference is built inside the block, as in the original flow.
        validation_reference = feature_store.get_saved_dataset(
            "my_other_dataset"
        ).as_reference(profiler=profiler_with_unrealistic_expectations)
        job.to_df(validation_reference=validation_reference)

    failed_expectations = exc_info.value.report.errors
    assert len(failed_expectations) == 2

    first_failure = failed_expectations[0]
    assert first_failure.check_name == "expect_column_max_to_be_between"
    assert first_failure.column_name == "current_balance"

    second_failure = failed_expectations[1]
    assert second_failure.check_name == "expect_column_values_to_be_in_set"
    assert second_failure.column_name == "avg_passenger_count"
def construct_test_environment(
    test_repo_config: TestRepoConfig,
    create_and_apply: bool = False,
    materialize: bool = False,
) -> Environment:
    """
    Create a feature repo from the test repo config and yield an environment
    wrapping the constructed feature store, which tests can then interact
    with.

    This function is a generator: it yields exactly one ``Environment`` and,
    once resumed or closed, tears down both the offline data source creator
    and the feature store. The user is *not* expected to perform any clean up
    actions.

    :param test_repo_config: configuration
    :param create_and_apply: when true, apply the driver and customer
        entities plus the environment's driver stats and customer feature
        views before yielding.
    :param materialize: when true, materialize from ``environment.start_date``
        to ``environment.end_date`` before yielding.
    :return: yields the environment built using the supplied configuration.
    """
    # NOTE(review): the annotation says Environment although the function
    # yields; presumably it is wrapped by a context-manager decorator outside
    # this view -- confirm before changing the annotation.
    df = create_dataset()

    project = f"test_correctness_{str(uuid.uuid4()).replace('-', '')[:8]}"

    # offline_store_creator is a dotted "module.ClassName" path.
    module_name, config_class_name = test_repo_config.offline_store_creator.rsplit(
        ".", 1
    )

    offline_creator: DataSourceCreator = importer.get_class_from_type(
        module_name, config_class_name, "DataSourceCreator"
    )(project)
    ds = offline_creator.create_data_source(
        project, df, field_mapping={"ts_1": "ts", "id": "driver_id"}
    )
    offline_store = offline_creator.create_offline_store_config()
    online_store = test_repo_config.online_store

    with tempfile.TemporaryDirectory() as repo_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=project,
            provider=test_repo_config.provider,
            offline_store=offline_store,
            online_store=online_store,
            repo_path=repo_dir_name,
        )
        fs = FeatureStore(config=config)
        environment = Environment(
            name=project,
            test_repo_config=test_repo_config,
            feature_store=fs,
            data_source=ds,
            data_source_creator=offline_creator,
        )

        fvs = []
        entities = []
        try:
            if create_and_apply:
                entities.extend([driver(), customer()])
                fvs.extend(
                    [
                        environment.driver_stats_feature_view(),
                        environment.customer_feature_view(),
                    ]
                )
                fs.apply(fvs + entities)

            if materialize:
                fs.materialize(environment.start_date, environment.end_date)

            yield environment
        finally:
            # Tear down the feature store even if the data source teardown
            # raises, so a failure in one cleanup step cannot skip the other.
            try:
                offline_creator.teardown()
            finally:
                fs.teardown()
def test_online_retrieval(environment, universal_data_sources, full_feature_names):
    """Compare online feature responses with values derived from the source dataframes.

    Covers, in order: a plain feature-reference request, the same request
    without ``driver_stats:conv_rate``, the set of response keys, per-row
    value checks, a lookup for entity keys ``0``, a request without
    ``val_to_add`` (expected to raise), and two feature services.
    """
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views.driver[["conv_rate"]],
            feature_views.driver_odfv,
            feature_views.customer[["current_balance"]],
        ],
    )
    # The location view is registered twice under different names, each with
    # its join key remapped.
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            feature_views.location.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}
            ),
            feature_views.location.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}
            ),
        ],
    )

    feast_objects = [
        *feature_views.values(),
        driver(),
        customer(),
        location(),
        feature_service,
        feature_service_entity_mapping,
    ]
    fs.apply(feast_objects)
    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )

    # Ten sampled orders drive the choice of drivers and customers.
    sampled_columns = ["customer_id", "driver_id", "order_id", "event_timestamp"]
    entity_sample = datasets.orders_df.sample(10)[sampled_columns]
    sample_drivers = entity_sample["driver_id"]
    sample_customers = entity_sample["customer_id"]

    all_orders = datasets.orders_df
    orders_df = all_orders[
        all_orders["customer_id"].isin(entity_sample["customer_id"])
        & all_orders["driver_id"].isin(entity_sample["driver_id"])
    ]
    drivers_df = datasets.driver_df[
        datasets.driver_df["driver_id"].isin(sample_drivers)
    ]
    customers_df = datasets.customer_df[
        datasets.customer_df["customer_id"].isin(sample_customers)
    ]

    # Ten random ordered pairs, transposed into [origins, destinations].
    location_pairs = np.array(
        list(itertools.permutations(entities.location_vals, 2))
    )
    chosen_pair_indices = np.random.choice(len(location_pairs), 10)
    sample_location_pairs = location_pairs[chosen_pair_indices].T.tolist()

    all_locations = datasets.location_df
    origins_df = all_locations[
        all_locations["location_id"].isin(sample_location_pairs[0])
    ]
    destinations_df = all_locations[
        all_locations["location_id"].isin(sample_location_pairs[1])
    ]

    global_df = datasets.global_df

    entity_rows = [
        {"driver_id": d, "customer_id": c, "val_to_add": 50}
        for d, c in zip(sample_drivers, sample_customers)
    ]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
    ]

    # The on demand feature view outputs are left out, since they're not
    # present in the source dataframe.
    odfv_outputs = ("conv_rate_plus_100", "conv_rate_plus_val_to_add")
    unprefixed_feature_refs = [
        name
        for name in (f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f)
        if name not in odfv_outputs
    ]

    def response_key(feature_name):
        # Key under which a feature is looked up in the response dictionaries.
        return response_feature_name(
            feature_name, feature_refs, full_feature_names
        )

    online_features_dict = get_online_features_dict(
        environment=environment,
        features=feature_refs,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )

    # Test that the on demand feature views compute properly even if the
    # dependent conv_rate feature isn't requested.
    refs_without_conv_rate = [
        ref for ref in feature_refs if ref != "driver_stats:conv_rate"
    ]
    online_features_no_conv_rate = get_online_features_dict(
        environment=environment,
        features=refs_without_conv_rate,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )
    assert online_features_no_conv_rate is not None

    keys = set(online_features_dict.keys())
    expected_keys = {
        f.replace(":", "__") if full_feature_names else f.split(":")[-1]
        for f in feature_refs
    } | {"customer_id", "driver_id"}
    assert (
        keys == expected_keys
    ), f"Response keys are different from expected: {keys - expected_keys} (extra) and {expected_keys - keys} (missing)"

    tc = unittest.TestCase()
    for i, entity_row in enumerate(entity_rows):
        df_features = get_latest_feature_values_from_dataframes(
            driver_df=drivers_df,
            customer_df=customers_df,
            orders_df=orders_df,
            global_df=global_df,
            entity_row=entity_row,
        )

        assert df_features["customer_id"] == online_features_dict["customer_id"][i]
        assert df_features["driver_id"] == online_features_dict["driver_id"][i]

        tc.assertAlmostEqual(
            online_features_dict[response_key("conv_rate_plus_100")][i],
            df_features["conv_rate"] + 100,
            delta=0.0001,
        )
        tc.assertAlmostEqual(
            online_features_dict[response_key("conv_rate_plus_val_to_add")][i],
            df_features["conv_rate"] + df_features["val_to_add"],
            delta=0.0001,
        )
        for unprefixed_feature_ref in unprefixed_feature_refs:
            tc.assertAlmostEqual(
                df_features[unprefixed_feature_ref],
                online_features_dict[response_key(unprefixed_feature_ref)][i],
                delta=0.0001,
            )

    # Check what happens for missing values
    missing_responses_dict = get_online_features_dict(
        environment=environment,
        features=feature_refs,
        entity_rows=[{"driver_id": 0, "customer_id": 0, "val_to_add": 100}],
        full_feature_names=full_feature_names,
    )
    assert missing_responses_dict is not None
    for unprefixed_feature_ref in unprefixed_feature_refs:
        # The two global_stats features are exempt from the None check.
        if unprefixed_feature_ref in {"num_rides", "avg_ride_length"}:
            continue
        tc.assertIsNone(
            missing_responses_dict[response_key(unprefixed_feature_ref)][0]
        )

    # Check what happens for missing request data
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        get_online_features_dict(
            environment=environment,
            features=feature_refs,
            entity_rows=[{"driver_id": 0, "customer_id": 0}],
            full_feature_names=full_feature_names,
        )

    assert_feature_service_correctness(
        environment,
        feature_service,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        global_df,
    )

    entity_rows = [
        {"origin_id": origin, "destination_id": destination}
        for (_driver, _customer, origin, destination) in zip(
            sample_drivers, sample_customers, *sample_location_pairs
        )
    ]
    assert_feature_service_entity_mapping_correctness(
        environment,
        feature_service_entity_mapping,
        entity_rows,
        full_feature_names,
        origins_df,
        destinations_df,
    )
def test_historical_features_persisting(
    environment, universal_data_sources, full_feature_names
):
    """Save a historical retrieval job as a dataset and compare it two ways.

    The saved dataset is compared both with the expected training dataframe
    (minus the columns that were not requested) and with the dataframe
    produced directly by the retrieval job.
    """
    feature_store = environment.feature_store
    _entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    feature_store.apply(
        [driver(), customer(), location(), *feature_views.values()]
    )

    entity_df = datasets.entity_df.drop(
        columns=["order_id", "origin_id", "destination_id"]
    )

    requested_features = [
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
        "field_mapping:feature_name",
    ]
    job = feature_store.get_historical_features(
        entity_df=entity_df,
        features=requested_features,
        full_feature_names=full_feature_names,
    )

    destination = (
        environment.data_source_creator.create_saved_dataset_destination()
    )
    saved_dataset = feature_store.create_saved_dataset(
        from_=job,
        name="saved_dataset",
        storage=destination,
        tags={"env": "test"},
    )

    event_timestamp = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
    full_expected_df = get_expected_training_df(
        datasets.customer_df,
        feature_views.customer,
        datasets.driver_df,
        feature_views.driver,
        datasets.orders_df,
        feature_views.order,
        datasets.location_df,
        feature_views.location,
        datasets.global_df,
        feature_views.global_fv,
        datasets.field_mapping_df,
        feature_views.field_mapping,
        entity_df,
        event_timestamp,
        full_feature_names,
    )
    # Columns for features that were not part of the request above.
    columns_not_requested = [
        response_feature_name("conv_rate_plus_100", full_feature_names),
        response_feature_name("conv_rate_plus_100_rounded", full_feature_names),
        response_feature_name("avg_daily_trips", full_feature_names),
        response_feature_name("conv_rate", full_feature_names),
        "origin__temperature",
        "destination__temperature",
    ]
    expected_df = full_expected_df.drop(columns=columns_not_requested)

    assert_frame_equal(
        expected_df,
        saved_dataset.to_df(),
        keys=[event_timestamp, "driver_id", "customer_id"],
    )
    assert_frame_equal(
        job.to_df(),
        saved_dataset.to_df(),
        keys=[event_timestamp, "driver_id", "customer_id"],
    )
def test_historical_features_with_entities_from_query(
    environment, universal_data_sources, full_feature_names
):
    """Retrieve historical features using a SQL query as the entity dataframe.

    The test is skipped when no table name can be derived from the orders
    data source. The result is checked both via ``to_df()`` and via
    ``to_arrow().to_pandas()`` against the expected training dataframe.
    """
    feature_store = environment.feature_store
    _entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    orders_table = table_name_from_data_source(data_sources.orders)
    if not orders_table:
        raise pytest.skip("Offline source is not sql-based")

    # Identifiers are double-quoted only for the Snowflake data source creator.
    data_source_creator = environment.test_repo_config.offline_store_creator
    uses_snowflake = (
        data_source_creator.__name__ == SnowflakeDataSourceCreator.__name__
    )
    if uses_snowflake:
        entity_df_query = f""" SELECT "customer_id", "driver_id", "order_id", "origin_id", "destination_id", "event_timestamp" FROM "{orders_table}" """
    else:
        entity_df_query = f""" SELECT customer_id, driver_id, order_id, origin_id, destination_id, event_timestamp FROM {orders_table} """

    feature_store.apply(
        [driver(), customer(), location(), *feature_views.values()]
    )

    requested_features = [
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
        "field_mapping:feature_name",
    ]
    job_from_sql = feature_store.get_historical_features(
        entity_df=entity_df_query,
        features=requested_features,
        full_feature_names=full_feature_names,
    )

    start_time = datetime.utcnow()
    actual_df_from_sql_entities = job_from_sql.to_df()
    end_time = datetime.utcnow()
    print(f"\nTime to execute job_from_sql.to_df() = '{(end_time - start_time)}'")

    if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in datasets.orders_df.columns:
        event_timestamp = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
    else:
        event_timestamp = "e_ts"

    full_expected_df = get_expected_training_df(
        datasets.customer_df,
        feature_views.customer,
        datasets.driver_df,
        feature_views.driver,
        datasets.orders_df,
        feature_views.order,
        datasets.location_df,
        feature_views.location,
        datasets.global_df,
        feature_views.global_fv,
        datasets.field_mapping_df,
        feature_views.field_mapping,
        datasets.entity_df,
        event_timestamp,
        full_feature_names,
    )

    # Not requesting the on demand transform with an entity_df query (can't
    # add request data in them)
    columns_not_requested = [
        response_feature_name("conv_rate_plus_100", full_feature_names),
        response_feature_name("conv_rate_plus_100_rounded", full_feature_names),
        response_feature_name("avg_daily_trips", full_feature_names),
        response_feature_name("conv_rate", full_feature_names),
        "origin__temperature",
        "destination__temperature",
    ]
    expected_df_query = full_expected_df.drop(columns=columns_not_requested)

    assert_frame_equal(
        expected_df_query,
        actual_df_from_sql_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )

    table_from_sql_entities = job_from_sql.to_arrow().to_pandas()
    # Cast each expected column to the dtype found in the arrow-derived table
    # before comparing.
    for column_name in table_from_sql_entities.columns:
        target_dtype = table_from_sql_entities[column_name].dtype
        expected_df_query[column_name] = expected_df_query[column_name].astype(
            target_dtype
        )

    assert_frame_equal(
        expected_df_query,
        table_from_sql_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )
def test_online_retrieval(environment, universal_data_sources, full_feature_names):
    """Compare online feature responses with values derived from the source dataframes.

    Covers, in order: the number and naming of response keys, per-row value
    checks, a lookup for entity keys ``0``, a request without ``val_to_add``
    (expected to raise), and the ``convrate_plus100`` feature service.
    """
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]],
            feature_views["driver_odfv"],
        ],
    )

    feast_objects = [
        *feature_views.values(),
        driver(),
        customer(),
        feature_service,
    ]
    fs.apply(feast_objects)
    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )

    # Ten sampled orders drive the choice of drivers and customers.
    sampled_columns = ["customer_id", "driver_id", "order_id", "event_timestamp"]
    entity_sample = datasets["orders"].sample(10)[sampled_columns]
    sample_drivers = entity_sample["driver_id"]
    sample_customers = entity_sample["customer_id"]

    all_orders = datasets["orders"]
    orders_df = all_orders[
        all_orders["customer_id"].isin(entity_sample["customer_id"])
        & all_orders["driver_id"].isin(entity_sample["driver_id"])
    ]
    drivers_df = datasets["driver"][
        datasets["driver"]["driver_id"].isin(sample_drivers)
    ]
    customers_df = datasets["customer"][
        datasets["customer"]["customer_id"].isin(sample_customers)
    ]
    global_df = datasets["global"]

    entity_rows = [
        {"driver": d, "customer_id": c, "val_to_add": 50}
        for d, c in zip(sample_drivers, sample_customers)
    ]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
    ]

    # The on demand feature view outputs are left out, since they're not
    # present in the source dataframe.
    odfv_outputs = ("conv_rate_plus_100", "conv_rate_plus_val_to_add")
    unprefixed_feature_refs = [
        name
        for name in (f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f)
        if name not in odfv_outputs
    ]

    def response_key(feature_name):
        # Key under which a feature is looked up in the response dictionaries.
        return response_feature_name(feature_name, full_feature_names)

    online_features = fs.get_online_features(
        features=feature_refs,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )
    assert online_features is not None

    online_features_dict = online_features.to_dict()
    keys = online_features_dict.keys()
    # Add three for the driver id and the customer id entity keys +
    # val_to_add request data.
    assert len(keys) == len(feature_refs) + 3

    for feature in feature_refs:
        if full_feature_names:
            assert feature.replace(":", "__") in keys
        else:
            assert feature.rsplit(":", 1)[-1] in keys
            # Bare feature view names must not appear as response keys.
            assert (
                "driver_stats" not in keys
                and "customer_profile" not in keys
                and "order" not in keys
                and "global_stats" not in keys
            )

    tc = unittest.TestCase()
    for i, entity_row in enumerate(entity_rows):
        df_features = get_latest_feature_values_from_dataframes(
            drivers_df, customers_df, orders_df, global_df, entity_row
        )

        assert df_features["customer_id"] == online_features_dict["customer_id"][i]
        assert df_features["driver_id"] == online_features_dict["driver_id"][i]

        plus_100 = online_features_dict[response_key("conv_rate_plus_100")][i]
        assert plus_100 == df_features["conv_rate"] + 100

        plus_val = online_features_dict[
            response_key("conv_rate_plus_val_to_add")
        ][i]
        assert plus_val == df_features["conv_rate"] + df_features["val_to_add"]

        for unprefixed_feature_ref in unprefixed_feature_refs:
            tc.assertEqual(
                df_features[unprefixed_feature_ref],
                online_features_dict[response_key(unprefixed_feature_ref)][i],
            )

    # Check what happens for missing values
    missing_responses_dict = fs.get_online_features(
        features=feature_refs,
        entity_rows=[{"driver": 0, "customer_id": 0, "val_to_add": 100}],
        full_feature_names=full_feature_names,
    ).to_dict()
    assert missing_responses_dict is not None
    for unprefixed_feature_ref in unprefixed_feature_refs:
        # The two global_stats features are exempt from the None check.
        if unprefixed_feature_ref in {"num_rides", "avg_ride_length"}:
            continue
        tc.assertIsNone(
            missing_responses_dict[response_key(unprefixed_feature_ref)][0]
        )

    # Check what happens for missing request data
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        fs.get_online_features(
            features=feature_refs,
            entity_rows=[{"driver": 0, "customer_id": 0}],
            full_feature_names=full_feature_names,
        ).to_dict()

    assert_feature_service_correctness(
        fs,
        feature_service,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        global_df,
    )
def test_historical_features(environment, universal_data_sources, full_feature_names):
    """Check historical retrieval from an entity dataframe carrying request data.

    ``val_to_add`` and ``driver_age`` columns are added to a deep copy of the
    entity dataframe. The job result is compared with the expected training
    dataframe through ``to_df()`` and ``to_arrow().to_pandas()``, and the
    ``convrate_plus100`` and ``entity_mapping`` feature services are checked
    as well.
    """
    store = environment.feature_store

    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    # Request data columns: val_to_add is 0..n-1 and driver_age is 100..n+99.
    entity_df_with_request_data = datasets.entity_df.copy(deep=True)
    row_count = len(entity_df_with_request_data)
    entity_df_with_request_data["val_to_add"] = list(range(row_count))
    entity_df_with_request_data["driver_age"] = list(range(100, row_count + 100))

    feature_service = FeatureService(
        name="convrate_plus100",
        features=[feature_views.driver[["conv_rate"]], feature_views.driver_odfv],
    )
    # The location view is registered twice under different names, each with
    # its join key remapped.
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            feature_views.location.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}
            ),
            feature_views.location.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}
            ),
        ],
    )

    store.apply(
        [
            driver(),
            customer(),
            location(),
            feature_service,
            feature_service_entity_mapping,
            *feature_views.values(),
        ]
    )

    event_timestamp = (
        DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
        if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in datasets.orders_df.columns
        else "e_ts"
    )
    full_expected_df = get_expected_training_df(
        datasets.customer_df,
        feature_views.customer,
        datasets.driver_df,
        feature_views.driver,
        datasets.orders_df,
        feature_views.order,
        datasets.location_df,
        feature_views.location,
        datasets.global_df,
        feature_views.global_fv,
        datasets.field_mapping_df,
        feature_views.field_mapping,
        entity_df_with_request_data,
        event_timestamp,
        full_feature_names,
    )

    # Only need the shadow entities features in the FeatureService test
    expected_df = full_expected_df.drop(
        columns=["origin__temperature", "destination__temperature"],
    )

    job_from_df = store.get_historical_features(
        entity_df=entity_df_with_request_data,
        features=[
            "driver_stats:conv_rate",
            "driver_stats:avg_daily_trips",
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "conv_rate_plus_100:conv_rate_plus_100",
            "conv_rate_plus_100:conv_rate_plus_100_rounded",
            "conv_rate_plus_100:conv_rate_plus_val_to_add",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "field_mapping:feature_name",
        ],
        full_feature_names=full_feature_names,
    )

    # Stop the clock right after to_df() so the reported duration does not
    # include the diagnostic print below.
    start_time = datetime.utcnow()
    actual_df_from_df_entities = job_from_df.to_df()
    end_time = datetime.utcnow()

    print(f"actual_df_from_df_entities shape: {actual_df_from_df_entities.shape}")
    print(f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n")

    assert sorted(expected_df.columns) == sorted(actual_df_from_df_entities.columns)
    assert_frame_equal(
        expected_df,
        actual_df_from_df_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )

    assert_feature_service_correctness(
        store,
        feature_service,
        full_feature_names,
        entity_df_with_request_data,
        expected_df,
        event_timestamp,
    )
    # The entity mapping check gets the full expected frame, which still has
    # the origin/destination temperature columns.
    assert_feature_service_entity_mapping_correctness(
        store,
        feature_service_entity_mapping,
        full_feature_names,
        entity_df_with_request_data,
        full_expected_df,
        event_timestamp,
    )

    table_from_df_entities: pd.DataFrame = job_from_df.to_arrow().to_pandas()
    assert_frame_equal(
        expected_df,
        table_from_df_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )
def test_online_retrieval(environment, universal_data_sources, full_feature_names):
    """Materialize the universal feature views and verify online retrieval.

    Steps, in order:
      1. Register feature views, entities and two feature services, then
         materialize a window padded by one day on each side.
      2. Sample ten orders and ten location pairs, and slice the source
         dataframes down to the sampled ids.
      3. Fetch online features for the sampled rows and compare them with the
         latest values derived from the source dataframes.
      4. Check responses for unknown entity ids and for missing request data.
      5. Run the feature-service and entity-mapping correctness helpers.

    Args:
        environment: Test environment fixture (``feature_store``,
            ``start_date`` and ``end_date`` are read).
        universal_data_sources: Fixture yielding ``(entities, datasets,
            data_sources)``.
        full_feature_names: Forwarded to ``get_online_features`` and to the
            helpers that resolve response key names.
    """
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    location_fv = feature_views["location"]
    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]],
            feature_views["driver_odfv"],
            feature_views["driver_age_request_fv"],
        ],
    )
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            location_fv.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}
            ),
            location_fv.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}
            ),
        ],
    )

    fs.apply(
        [
            *feature_views.values(),
            driver(),
            customer(),
            location(),
            feature_service,
            feature_service_entity_mapping,
        ]
    )
    one_day = timedelta(days=1)
    fs.materialize(environment.start_date - one_day, environment.end_date + one_day)

    # Draw the order sample first, then the location pairs, so the random
    # draws happen in the same sequence on every run of this test body.
    all_orders = datasets["orders"]
    entity_sample = all_orders.sample(10)[
        ["customer_id", "driver_id", "order_id", "event_timestamp"]
    ]
    sample_drivers = entity_sample["driver_id"]
    sample_customers = entity_sample["customer_id"]

    orders_df = all_orders[
        all_orders["customer_id"].isin(sample_customers)
        & all_orders["driver_id"].isin(sample_drivers)
    ]
    all_drivers = datasets["driver"]
    drivers_df = all_drivers[all_drivers["driver_id"].isin(sample_drivers)]
    all_customers = datasets["customer"]
    customers_df = all_customers[all_customers["customer_id"].isin(sample_customers)]

    # Every ordered pair of distinct locations; ten are picked (with
    # replacement) and transposed into an (origins, destinations) layout.
    location_pairs = np.array(list(itertools.permutations(entities["location"], 2)))
    picked_rows = np.random.choice(len(location_pairs), 10)
    sample_location_pairs = location_pairs[picked_rows].T
    all_locations = datasets["location"]
    origins_df = all_locations[
        all_locations["location_id"].isin(sample_location_pairs[0])
    ]
    destinations_df = all_locations[
        all_locations["location_id"].isin(sample_location_pairs[1])
    ]
    global_df = datasets["global"]

    entity_rows = [
        {"driver": driver_id, "customer_id": customer_id, "val_to_add": 50, "driver_age": 25}
        for driver_id, customer_id in zip(sample_drivers, sample_customers)
    ]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
        "driver_age:driver_age",
    ]
    # On demand feature view outputs are left out of the direct comparison,
    # since they're not present in the source dataframe.
    odfv_outputs = ("conv_rate_plus_100", "conv_rate_plus_val_to_add")
    unprefixed_feature_refs = [
        name
        for name in (ref.rsplit(":", 1)[-1] for ref in feature_refs if ":" in ref)
        if name not in odfv_outputs
    ]

    online_features = fs.get_online_features(
        features=feature_refs,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )
    assert online_features is not None

    online_features_dict = online_features.to_dict()
    keys = online_features_dict.keys()
    # Three extra keys: the driver id and customer id entity keys plus the
    # val_to_add request data.
    assert len(keys) == len(feature_refs) + 3

    for ref in feature_refs:
        # full_feature_names does not apply to request feature views
        prefixed = full_feature_names and ref != "driver_age:driver_age"
        expected_key = ref.replace(":", "__") if prefixed else ref.rsplit(":", 1)[-1]
        assert expected_key in keys

    # Bare feature view names must never show up as response keys.
    assert all(
        view_name not in keys
        for view_name in ("driver_stats", "customer_profile", "order", "global_stats")
    )

    tc = unittest.TestCase()
    for row_idx, entity_row in enumerate(entity_rows):
        df_features = get_latest_feature_values_from_dataframes(
            driver_df=drivers_df,
            customer_df=customers_df,
            orders_df=orders_df,
            global_df=global_df,
            entity_row=entity_row,
        )

        assert df_features["customer_id"] == online_features_dict["customer_id"][row_idx]
        assert df_features["driver_id"] == online_features_dict["driver_id"][row_idx]

        plus_100_key = response_feature_name("conv_rate_plus_100", full_feature_names)
        tc.assertAlmostEqual(
            online_features_dict[plus_100_key][row_idx],
            df_features["conv_rate"] + 100,
            delta=0.0001,
        )
        plus_val_key = response_feature_name(
            "conv_rate_plus_val_to_add", full_feature_names
        )
        tc.assertAlmostEqual(
            online_features_dict[plus_val_key][row_idx],
            df_features["conv_rate"] + df_features["val_to_add"],
            delta=0.0001,
        )
        for feature_name in unprefixed_feature_refs:
            response_key = response_feature_name(feature_name, full_feature_names)
            tc.assertAlmostEqual(
                df_features[feature_name],
                online_features_dict[response_key][row_idx],
                delta=0.0001,
            )

    # Entity ids that were never sampled: the features below must come back as None.
    missing_responses_dict = fs.get_online_features(
        features=feature_refs,
        entity_rows=[
            {"driver": 0, "customer_id": 0, "val_to_add": 100, "driver_age": 125}
        ],
        full_feature_names=full_feature_names,
    ).to_dict()
    assert missing_responses_dict is not None

    # These three features are exempt from the None check.
    exempt_from_none_check = {"num_rides", "avg_ride_length", "driver_age"}
    for feature_name in unprefixed_feature_refs:
        if feature_name in exempt_from_none_check:
            continue
        response_key = response_feature_name(feature_name, full_feature_names)
        tc.assertIsNone(missing_responses_dict[response_key][0])

    # No request data at all must raise.
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        fs.get_online_features(
            features=feature_refs,
            entity_rows=[{"driver": 0, "customer_id": 0}],
            full_feature_names=full_feature_names,
        ).to_dict()

    # Partial request data (val_to_add present, driver_age absent) must raise too.
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        fs.get_online_features(
            features=feature_refs,
            entity_rows=[{"driver": 0, "customer_id": 0, "val_to_add": 20}],
            full_feature_names=full_feature_names,
        ).to_dict()

    assert_feature_service_correctness(
        fs,
        feature_service,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        global_df,
    )

    # Rebuild the entity rows with origin/destination ids for the
    # entity-mapping feature service.
    entity_rows = [
        {
            "driver": driver_id,
            "customer_id": customer_id,
            "origin_id": origin_id,
            "destination_id": destination_id,
        }
        for driver_id, customer_id, origin_id, destination_id in zip(
            sample_drivers, sample_customers, *sample_location_pairs
        )
    ]
    assert_feature_service_entity_mapping_correctness(
        fs,
        feature_service_entity_mapping,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        origins_df,
        destinations_df,
    )
def _sorted_unique_rows(df, sort_keys, columns=None):
    """Return ``df`` in a canonical row order so two frames can be compared.

    Args:
        df: Dataframe to normalize.
        sort_keys: Column names passed to ``sort_values(by=...)``.
        columns: Optional column selection applied before sorting; when
            ``None`` all columns are kept in their current order.

    Returns:
        A new dataframe, sorted by ``sort_keys``, with duplicate rows dropped
        and the index reset to 0..n-1.
    """
    if columns is not None:
        df = df[columns]
    return df.sort_values(by=sort_keys).drop_duplicates().reset_index(drop=True)


def test_historical_features(environment, universal_data_sources, full_feature_names):
    """Verify historical retrieval for SQL and dataframe entity inputs.

    The test registers the universal feature views, entities and two feature
    services, builds the expected training frame locally, and then checks:

      * retrieval driven by a SQL entity query (only when the orders data
        source resolves to a table name), via both ``to_df`` and ``to_arrow``;
      * retrieval driven by an entity dataframe carrying the ``val_to_add``
        and ``driver_age`` columns, via both ``to_df`` and ``to_arrow``;
      * the feature-service correctness helpers;
      * that ``RequestDataNotFoundInEntityDfException`` is raised when the
        entity dataframe lacks the required request data.

    Args:
        environment: Test environment fixture; only ``feature_store`` is read.
        universal_data_sources: Fixture yielding a 3-tuple; the second and
            third elements (datasets, data sources) are used here.
        full_feature_names: Forwarded to ``get_historical_features`` and to
            the helpers that build and check the expected frames.

    NOTE(review): an earlier definition in this file uses the same name
    ``test_historical_features``. If both live in the same module, this one
    shadows the earlier one -- confirm which variant is intended.
    """
    store = environment.feature_store
    # The first tuple element (entities) is not needed by this test.
    (_, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    customer_df = datasets["customer"]
    driver_df = datasets["driver"]
    location_df = datasets["location"]
    orders_df = datasets["orders"]
    global_df = datasets["global"]
    entity_df = datasets["entity"]

    # Work on a deep copy so the plain entity_df stays free of request data
    # for the negative checks at the end of the test.
    entity_df_with_request_data = entity_df.copy(deep=True)
    num_rows = len(entity_df_with_request_data)
    entity_df_with_request_data["val_to_add"] = list(range(num_rows))
    entity_df_with_request_data["driver_age"] = list(range(100, num_rows + 100))

    customer_fv = feature_views["customer"]
    driver_fv = feature_views["driver"]
    driver_odfv = feature_views["driver_odfv"]
    location_fv = feature_views["location"]
    order_fv = feature_views["order"]
    global_fv = feature_views["global"]
    driver_age_request_fv = feature_views["driver_age_request_fv"]

    feature_service = FeatureService(
        name="convrate_plus100",
        features=[
            driver_fv[["conv_rate"]],
            driver_odfv,
            driver_age_request_fv,
        ],
    )
    # The same location view is registered twice under different names, each
    # with its join key remapped to a different entity column.
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            location_fv.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}
            ),
            location_fv.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}
            ),
        ],
    )

    store.apply(
        [
            customer_fv,
            driver_fv,
            driver_odfv,
            location_fv,
            order_fv,
            global_fv,
            driver_age_request_fv,
            driver(),
            customer(),
            location(),
            feature_service,
            feature_service_entity_mapping,
        ]
    )

    # A SQL entity query is only built when the orders source maps to a table.
    entity_df_query = None
    orders_table = table_name_from_data_source(data_sources["orders"])
    if orders_table:
        entity_df_query = f"SELECT customer_id, driver_id, order_id, origin_id, destination_id, event_timestamp FROM {orders_table}"

    # Fall back to "e_ts" when the orders frame does not use the default
    # event timestamp column name.
    event_timestamp = (
        DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
        if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in orders_df.columns
        else "e_ts"
    )
    sort_keys = [event_timestamp, "order_id", "driver_id", "customer_id"]

    full_expected_df = get_expected_training_df(
        customer_df,
        customer_fv,
        driver_df,
        driver_fv,
        orders_df,
        order_fv,
        location_df,
        location_fv,
        global_df,
        global_fv,
        entity_df_with_request_data,
        event_timestamp,
        full_feature_names,
    )

    # Only need the shadow entities features in the FeatureService test
    expected_df = full_expected_df.drop(
        columns=["origin__temperature", "destination__temperature"],
    )

    if entity_df_query:
        job_from_sql = store.get_historical_features(
            entity_df=entity_df_query,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "order:order_is_success",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )

        start_time = datetime.utcnow()
        actual_df_from_sql_entities = job_from_sql.to_df()
        end_time = datetime.utcnow()
        print(f"\nTime to execute job_from_sql.to_df() = '{(end_time - start_time)}'")

        # Not requesting the on demand transform with an entity_df query (can't add request data in them)
        expected_df_query = expected_df.drop(
            columns=[
                "conv_rate_plus_100",
                "conv_rate_plus_100_rounded",
                "val_to_add",
                "conv_rate_plus_val_to_add",
                "driver_age",
            ]
        )
        assert sorted(expected_df_query.columns) == sorted(
            actual_df_from_sql_entities.columns
        )

        actual_df_from_sql_entities = _sorted_unique_rows(
            actual_df_from_sql_entities, sort_keys, columns=expected_df_query.columns
        )
        expected_df_query = _sorted_unique_rows(expected_df_query, sort_keys)
        assert_frame_equal(
            actual_df_from_sql_entities, expected_df_query, check_dtype=False,
        )

        table_from_sql_entities = job_from_sql.to_arrow()
        df_from_sql_entities = _sorted_unique_rows(
            table_from_sql_entities.to_pandas(),
            sort_keys,
            columns=expected_df_query.columns,
        )
        # Align dtypes column by column so the strict comparison below only
        # fails on differing values, not on representation differences.
        for col in df_from_sql_entities.columns:
            expected_df_query[col] = expected_df_query[col].astype(
                df_from_sql_entities[col].dtype
            )
        assert_frame_equal(expected_df_query, df_from_sql_entities)

    job_from_df = store.get_historical_features(
        entity_df=entity_df_with_request_data,
        features=[
            "driver_stats:conv_rate",
            "driver_stats:avg_daily_trips",
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "conv_rate_plus_100:conv_rate_plus_100",
            "conv_rate_plus_100:conv_rate_plus_100_rounded",
            "conv_rate_plus_100:conv_rate_plus_val_to_add",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "driver_age:driver_age",
        ],
        full_feature_names=full_feature_names,
    )

    # Stop the clock immediately after to_df() so the reported duration does
    # not include the time spent printing the shape.
    start_time = datetime.utcnow()
    actual_df_from_df_entities = job_from_df.to_df()
    end_time = datetime.utcnow()
    print(f"actual_df_from_df_entities shape: {actual_df_from_df_entities.shape}")
    print(f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n")

    assert sorted(expected_df.columns) == sorted(actual_df_from_df_entities.columns)
    expected_df = _sorted_unique_rows(expected_df, sort_keys)
    actual_df_from_df_entities = _sorted_unique_rows(
        actual_df_from_df_entities, sort_keys, columns=expected_df.columns
    )
    assert_frame_equal(
        expected_df, actual_df_from_df_entities, check_dtype=False,
    )

    assert_feature_service_correctness(
        store,
        feature_service,
        full_feature_names,
        entity_df_with_request_data,
        expected_df,
        event_timestamp,
    )
    # The entity-mapping check receives the full frame, which still has the
    # origin/destination temperature columns.
    assert_feature_service_entity_mapping_correctness(
        store,
        feature_service_entity_mapping,
        full_feature_names,
        entity_df_with_request_data,
        full_expected_df,
        event_timestamp,
    )

    # The Arrow path must produce exactly what to_df() produced.
    table_from_df_entities: pd.DataFrame = job_from_df.to_arrow().to_pandas()
    table_from_df_entities = _sorted_unique_rows(
        table_from_df_entities, sort_keys, columns=expected_df.columns.tolist()
    )
    assert_frame_equal(actual_df_from_df_entities, table_from_df_entities)

    # If request data is missing that's needed for on demand transform, throw an error
    with pytest.raises(RequestDataNotFoundInEntityDfException):
        store.get_historical_features(
            entity_df=entity_df,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "conv_rate_plus_100:conv_rate_plus_100",
                "conv_rate_plus_100:conv_rate_plus_val_to_add",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )

    # If request data is missing that's needed for a request feature view, throw an error
    with pytest.raises(RequestDataNotFoundInEntityDfException):
        store.get_historical_features(
            entity_df=entity_df,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "driver_age:driver_age",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )