def check_offline_and_online_features(
    fs: FeatureStore,
    fv: FeatureView,
    driver_id: int,
    event_timestamp: datetime,
    expected_value: Optional[float],
    full_feature_names: bool,
    check_offline_store: bool = True,
) -> None:
    """Assert that the online and (optionally) offline stores agree with the
    expected feature value for a single driver.

    Args:
        fs: The feature store under test.
        fv: The feature view whose ``value`` feature is checked.
        driver_id: Entity key to look up.
        event_timestamp: Point-in-time used for the historical retrieval.
        expected_value: The value both stores should return, or ``None`` when
            the stores are expected to hold no value for this entity.
        full_feature_names: Whether responses are keyed ``<view>__value``
            (True) or plain ``value`` (False).
        check_offline_store: Skip the (slower) historical retrieval when False.

    Raises:
        AssertionError: If either store disagrees with ``expected_value``.
    """
    # Response key depends on the full_feature_names flag.
    feature_key = f"{fv.name}__value" if full_feature_names else "value"

    # --- Online store ---
    response_dict = fs.get_online_features(
        [f"{fv.name}:value"],
        [{"driver_id": driver_id}],
        full_feature_names=full_feature_names,
    ).to_dict()
    online_value = response_dict[feature_key][0]

    # BUG FIX: the original tested `if expected_value:`, which mis-routed a
    # legitimate expected value of 0.0 into the "expect None" branch because
    # 0.0 is falsy. Compare against None explicitly.
    if expected_value is not None:
        assert online_value is not None, (
            f"Response: {response_dict}, Expected: {expected_value}"
        )
        assert abs(online_value - expected_value) < 1e-6, (
            f"Response: {response_dict}, Expected: {expected_value}"
        )
    else:
        assert online_value is None

    # --- Offline store ---
    if check_offline_store:
        df = fs.get_historical_features(
            entity_df=pd.DataFrame.from_dict(
                {"driver_id": [driver_id], "event_timestamp": [event_timestamp]}
            ),
            features=[f"{fv.name}:value"],
            full_feature_names=full_feature_names,
        ).to_df()
        # Hoisted: the original re-ran df.to_dict(orient="list") up to three
        # times per branch.
        offline_values = df.to_dict(orient="list")[feature_key]
        if expected_value is not None:
            assert abs(offline_values[0] - expected_value) < 1e-6
        else:
            # An absent value may surface as an empty column or as NaN.
            assert not offline_values or math.isnan(offline_values[0])
def load_historical_features(feature_store: FeatureStore) -> FlyteSchema:
    """Fetch historical feature values for a fixed set of hospital visits.

    Builds an entity dataframe of (hospital number, event timestamp) pairs and
    performs a point-in-time join against the store's offline data.
    """
    # Most rows share the same retrieval timestamp; two visits use later ones.
    default_ts = datetime(2021, 6, 25, 16, 36, 27)
    visits = [
        ("530101", default_ts),
        ("5290409", default_ts),
        ("5291329", default_ts),
        ("530051", default_ts),
        ("529518", default_ts),
        ("530101", datetime(2021, 7, 5, 11, 36, 1)),
        ("529340", default_ts),
        ("5290409", datetime(2021, 7, 5, 11, 50, 40)),
        ("530034", default_ts),
    ]
    entity_df = pd.DataFrame(visits, columns=["Hospital Number", "event_timestamp"])
    return feature_store.get_historical_features(
        entity_df=entity_df, features=FEAST_FEATURES
    )  # noqa
def main():
    """End-to-end Feast walkthrough against Snowflake.

    Applies the repo definitions, retrieves training data via a point-in-time
    join, materializes features into the online store, and reads them back.
    """
    # Show full dataframes when printing.
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # The feature store is defined by the repo in the current directory.
    fs = FeatureStore(repo_path=".")

    print("Deploying feature store to Snowflake...")
    fs.apply([driver, driver_stats_fv])

    features = [
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
    ]

    # Three drivers at three evenly spaced timestamps over the last three days;
    # these rows get enriched with historical feature values.
    event_timestamps = [
        pd.Timestamp(dt, unit="ms", tz="UTC").round("ms")
        for dt in pd.date_range(
            start=datetime.now() - timedelta(days=3),
            end=datetime.now(),
            periods=3,
        )
    ]
    entity_df = pd.DataFrame(
        {
            "event_timestamp": event_timestamps,
            "driver_id": [1001, 1002, 1003],
        }
    )

    print("Retrieving training data...")
    # Point-in-time join of the entity dataframe against the Snowflake source.
    training_df = fs.get_historical_features(
        features=features, entity_df=entity_df
    ).to_df()
    print()
    print(training_df)
    print()

    print("Loading features into the online store...")
    fs.materialize_incremental(end_date=datetime.now())
    print()

    print("Retrieving online features...")
    # Low-latency lookup from the online store for two drivers.
    online_features = fs.get_online_features(
        features=features,
        entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}],
    ).to_dict()
    print()
    print(pd.DataFrame.from_dict(online_features))
def generate_saved_dataset():
    """Build and persist a profiled training dataset for credit_score_v1.

    Joins the loan entities against the feature service's features, then saves
    the result as a parquet-backed saved dataset with a data profile attached.
    """
    store = FeatureStore(repo_path=".")
    loan_entities = pd.read_parquet(path="data/loan_table.parquet")

    # Renamed from `fs` in an earlier draft: this is a FeatureService, not a
    # FeatureStore, and the old name invited confusion.
    feature_service = store.get_feature_service("credit_score_v1")

    retrieval_job = store.get_historical_features(
        entity_df=loan_entities,
        features=feature_service,
    )
    store.create_saved_dataset(
        from_=retrieval_job,
        name="my_training_ds",
        storage=SavedDatasetFileStorage(path="my_training_ds.parquet"),
        feature_service=feature_service,
        profiler=credit_profiler,
    )
def get_historical_features():
    """Retrieve historical features for training."""
    # Entities to pull data for (should dynamically read this from somewhere).
    project_ids = [1, 2, 3]

    # Every entity row uses midnight of the current day as its event time.
    now = datetime.now()
    midnight_today = datetime(now.year, now.month, now.day)
    entity_df = pd.DataFrame.from_dict(
        {
            "id": project_ids,
            "event_timestamp": [midnight_today] * len(project_ids),
        }
    )

    # Point-in-time join against the offline store defined under features/.
    store = FeatureStore(repo_path=Path(config.BASE_DIR, "features"))
    training_df = store.get_historical_features(
        entity_df=entity_df,
        feature_refs=["project_details:text", "project_details:tags"],
    ).to_df()

    # Store in location for training task to pick up
    print(training_df.head())
from sklearn.linear_model import LinearRegression

import helpers

# Driver order history: the entity rows and the training labels.
orders = pd.read_csv("driver_orders.csv", sep="\t")
orders["event_timestamp"] = pd.to_datetime(orders["event_timestamp"])

# Feature store defined under driver_ranking/.
fs = FeatureStore(repo_path="driver_ranking/")

# Point-in-time join of hourly driver stats onto the order events
# (offline store backed by BigQuery).
training_df = fs.get_historical_features(
    entity_df=orders,
    feature_refs=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
    ],
).to_df()

# Fit a linear model predicting trip completion from the joined features;
# the label and the join timestamp are excluded from the inputs.
target = "trip_completed"
feature_columns = training_df.columns.drop(target).drop("event_timestamp")
reg = LinearRegression()
train_X = training_df[feature_columns]
train_Y = training_df.loc[:, target]
reg.fit(train_X, train_Y)

# Persist the fitted model for the serving/ranking step.
dump(reg, "driver_model.bin")
"user_id": [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008], "event_timestamp": [ datetime(2021, 4, 21, 17, 58, 9), datetime(2021, 4, 21, 17, 58, 9), datetime(2021, 4, 21, 17, 58, 9), datetime(2021, 4, 21, 17, 58, 9), datetime(2021, 4, 21, 17, 58, 9), datetime(2021, 4, 21, 17, 58, 9), datetime(2021, 4, 21, 17, 58, 9), datetime(2021, 4, 21, 17, 58, 9), ] }) training_df = store.get_historical_features( entity_df=entity_df, feature_refs=[ 'driver_hourly_stats:daily_transactions', 'driver_hourly_stats:total_transactions', ], ).to_df() print(training_df) executionTime = (time.time() - startTime) print('Execution time in seconds: ' + str(executionTime)) startTime = time.time() entity_df = pd.DataFrame.from_dict({ "user_id": [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008], "event_timestamp": [ datetime(2021, 4, 24, 17, 58, 9), datetime(2021, 4, 24, 17, 58, 9), datetime(2021, 4, 24, 17, 58, 9), datetime(2021, 4, 24, 17, 58, 9),
entity_df = pd.DataFrame.from_dict({ "driver_id": [1001, 1002, 1003, 1004], "event_timestamp": [ datetime(2021, 4, 12, 10, 59, 42), datetime(2021, 4, 12, 8, 12, 10), datetime(2021, 4, 12, 16, 40, 26), datetime(2021, 4, 12, 15, 1 , 12) ] }) store = FeatureStore(repo_path="feast_repo") training_df = store.get_historical_features( entity_df=entity_df, feature_refs = [ 'driver_hourly_stats:conv_rate', 'driver_hourly_stats:acc_rate', 'driver_hourly_stats:avg_daily_trips' ], ).to_df() print(training_df.head()) # another feature store store = FeatureStore(repo_path="feature_transaction") customer_df = pd.read_parquet("customers.parquet") training_df = store.get_historical_features( entity_df=customer_df, feature_refs = [ 'customer_transactions:total_transactions',