def test_usage_off():
    """Verify that no usage event reaches BigQuery when FEAST_USAGE is disabled."""
    old_environ = dict(os.environ)
    test_usage_id = str(uuid.uuid4())
    os.environ["FEAST_IS_USAGE_TEST"] = "True"
    os.environ["FEAST_USAGE"] = "False"
    os.environ["FEAST_FORCE_USAGE_UUID"] = test_usage_id
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            test_feature_store = FeatureStore(
                config=RepoConfig(
                    registry=os.path.join(temp_dir, "registry.db"),
                    project="fake_project",
                    provider="local",
                    online_store=SqliteOnlineStoreConfig(
                        path=os.path.join(temp_dir, "online.db")
                    ),
                )
            )
            entity = Entity(
                name="driver_car_id",
                description="Car driver id",
                value_type=ValueType.STRING,
                labels={"team": "matchmaking"},
            )
            # apply() is the instrumented call that would emit a usage event.
            test_feature_store.apply([entity])
    finally:
        # Fix: restore the caller's environment even if store setup or apply()
        # raises, so the FEAST_* overrides do not leak into subsequent tests.
        os.environ.clear()
        os.environ.update(old_environ)
    # Give the usage pipeline time to flush before querying BigQuery.
    sleep(30)
    rows = read_bigquery_usage_id(test_usage_id)
    assert rows.total_rows == 0
def main():
    """Demo driver: deploy the repo's definitions to AWS and read features back."""
    # Make wide dataframes print fully.
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Load the feature store from the current path
    store = FeatureStore(repo_path=".")

    print("Deploying feature store to AWS...")
    store.apply([driver, driver_hourly_stats_view])

    feature_refs = [
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
    ]

    print("Loading features into the online store...")
    store.materialize_incremental(end_date=datetime.now())

    print("Retrieving online features...")
    # Read the freshly materialized rows back from the online store (DynamoDB).
    online_features = store.get_online_features(
        feature_refs=feature_refs,
        entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}],
    ).to_dict()
    print(pd.DataFrame.from_dict(online_features))
def store_offline(feature_store: FeatureStore, dataframe: FlyteSchema) -> FeatureStore:
    """Register the horse-colic entity and feature view against *dataframe*.

    The dataframe's remote path becomes the file-backed batch source; the
    configured store (with both definitions applied) is returned.
    """
    horse_colic_entity = Entity(name="Hospital Number", value_type=ValueType.STRING)

    # One Feature per column served from the parquet source.
    stat_features = [
        Feature(name="rectal temperature", dtype=ValueType.FLOAT),
        Feature(name="total protein", dtype=ValueType.FLOAT),
        Feature(name="peripheral pulse", dtype=ValueType.FLOAT),
        Feature(name="surgical lesion", dtype=ValueType.STRING),
        Feature(name="abdominal distension", dtype=ValueType.FLOAT),
        Feature(name="nasogastric tube", dtype=ValueType.STRING),
        Feature(name="outcome", dtype=ValueType.STRING),
        Feature(name="packed cell volume", dtype=ValueType.FLOAT),
        Feature(name="nasogastric reflux PH", dtype=ValueType.FLOAT),
    ]

    colic_source = FileSource(
        path=str(dataframe.remote_path),
        event_timestamp_column="timestamp",
    )

    horse_colic_feature_view = FeatureView(
        name="horse_colic_stats",
        entities=["Hospital Number"],
        features=stat_features,
        batch_source=colic_source,
        ttl=timedelta(days=1),
    )

    # Ingest the definitions into feast.
    feature_store.apply([horse_colic_entity, horse_colic_feature_view])
    return feature_store
def test_telemetry_on():
    """Verify that a telemetry event reaches BigQuery when FEAST_TELEMETRY is on."""
    old_environ = dict(os.environ)
    test_telemetry_id = str(uuid.uuid4())
    os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id
    os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
    os.environ["FEAST_TELEMETRY"] = "True"
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            test_feature_store = FeatureStore(
                config=RepoConfig(
                    registry=os.path.join(temp_dir, "registry.db"),
                    project="fake_project",
                    provider="local",
                    online_store=SqliteOnlineStoreConfig(
                        path=os.path.join(temp_dir, "online.db")
                    ),
                )
            )
            entity = Entity(
                name="driver_car_id",
                description="Car driver id",
                value_type=ValueType.STRING,
                labels={"team": "matchmaking"},
            )
            # apply() is the instrumented call that should emit the event.
            test_feature_store.apply([entity])
    finally:
        # Fix: restore the caller's environment even if store setup or apply()
        # raises, so the FEAST_* overrides do not leak into subsequent tests.
        os.environ.clear()
        os.environ.update(old_environ)
    ensure_bigquery_telemetry_id_with_retry(test_telemetry_id)
def setup_feature_store():
    """Prepares the local environment for a FeatureStore docstring test."""
    from datetime import datetime, timedelta

    from feast import Entity, Feature, FeatureStore, FeatureView, FileSource, ValueType
    from feast.repo_operations import init_repo

    # Scaffold a fresh local repo and open a store on it.
    init_repo("feature_repo", "local")
    store = FeatureStore(repo_path="feature_repo")

    driver_entity = Entity(
        name="driver_id",
        value_type=ValueType.INT64,
        description="driver id",
    )
    stats_source = FileSource(
        path="feature_repo/data/driver_stats.parquet",
        event_timestamp_column="event_timestamp",
        created_timestamp_column="created",
    )
    stats_view = FeatureView(
        name="driver_hourly_stats",
        entities=["driver_id"],
        ttl=timedelta(seconds=86400 * 1),
        features=[
            Feature(name="conv_rate", dtype=ValueType.FLOAT),
            Feature(name="acc_rate", dtype=ValueType.FLOAT),
            Feature(name="avg_daily_trips", dtype=ValueType.INT64),
        ],
        batch_source=stats_source,
    )
    store.apply([stats_view, driver_entity])

    # Load the recent window of feature values into the online store.
    store.materialize(
        start_date=datetime.utcnow() - timedelta(hours=3),
        end_date=datetime.utcnow() - timedelta(minutes=10),
    )
def main():
    """Demo driver: deploy to Snowflake, build training data, then read online."""
    # Make wide dataframes print fully.
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Load the feature store from the current path
    store = FeatureStore(repo_path=".")

    print("Deploying feature store to Snowflake...")
    store.apply([driver, driver_stats_fv])

    features = [
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate"
    ]

    # Entity dataframe: the rows that get enriched with historical features.
    # Three evenly spaced timestamps over the last three days.
    event_timestamps = [
        pd.Timestamp(dt, unit="ms", tz="UTC").round("ms")
        for dt in pd.date_range(
            start=datetime.now() - timedelta(days=3),
            end=datetime.now(),
            periods=3,
        )
    ]
    entity_df = pd.DataFrame({
        "event_timestamp": event_timestamps,
        "driver_id": [1001, 1002, 1003],
    })

    print("Retrieving training data...")
    # Point-in-time join of the entity dataframe against the Snowflake table.
    training_df = store.get_historical_features(
        features=features,
        entity_df=entity_df,
    ).to_df()
    print()
    print(training_df)
    print()

    print("Loading features into the online store...")
    store.materialize_incremental(end_date=datetime.now())
    print()

    print("Retrieving online features...")
    online_features = store.get_online_features(
        features=features,
        entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}],
    ).to_dict()
    print()
    print(pd.DataFrame.from_dict(online_features))
def test_telemetry_on():
    """Verify a telemetry event is exported (default-config store variant)."""
    old_environ = dict(os.environ)
    test_telemetry_id = str(uuid.uuid4())
    os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id
    os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
    os.environ["FEAST_TELEMETRY"] = "True"
    try:
        # Uses the default repo config from the working directory.
        test_feature_store = FeatureStore()
        entity = Entity(
            name="driver_car_id",
            description="Car driver id",
            value_type=ValueType.STRING,
            labels={"team": "matchmaking"},
        )
        # apply() is the instrumented call that should emit the event.
        test_feature_store.apply([entity])
    finally:
        # Fix: restore the caller's environment even if apply() raises, so the
        # FEAST_* overrides do not leak into subsequent tests.
        os.environ.clear()
        os.environ.update(old_environ)
    ensure_bigquery_telemetry_id_with_retry(test_telemetry_id)
def test_telemetry_off():
    """Verify that no telemetry event is exported when FEAST_TELEMETRY is off."""
    old_environ = dict(os.environ)
    test_telemetry_id = str(uuid.uuid4())
    os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
    os.environ["FEAST_TELEMETRY"] = "False"
    os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id
    try:
        # Uses the default repo config from the working directory.
        test_feature_store = FeatureStore()
        entity = Entity(
            name="driver_car_id",
            description="Car driver id",
            value_type=ValueType.STRING,
            labels={"team": "matchmaking"},
        )
        # apply() is the instrumented call that would emit a telemetry event.
        test_feature_store.apply([entity])
    finally:
        # Fix: restore the caller's environment even if apply() raises, so the
        # FEAST_* overrides do not leak into subsequent tests.
        os.environ.clear()
        os.environ.update(old_environ)
    # Give the telemetry pipeline time to flush before querying BigQuery.
    sleep(30)
    rows = read_bigquery_telemetry_id(test_telemetry_id)
    assert rows.total_rows == 0
# An entity is effectively the primary key used to fetch feature values
# for a driver.
driver = Entity(
    name="driver_id",
    value_type=ValueType.INT64,
    description="driver id",
)

# The parquet source carries driver_id, timestamps, and three feature
# columns; this view serves them to the model online.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=Duration(seconds=86400 * 365),
    features=[
        Feature(name="conv_rate", dtype=ValueType.DOUBLE),
        Feature(name="acc_rate", dtype=ValueType.FLOAT),
        Feature(name="avg_daily_trips", dtype=ValueType.INT64),
    ],
    online=True,
    batch_source=driver_hourly_stats,
    tags={},
)

# Register both definitions, then push the latest values online.
fs = FeatureStore("")
fs.apply([driver_hourly_stats_view, driver])

now = datetime.now()
fs.materialize_incremental(now)
)  # NOTE(review): closes a definition that starts outside this view — do not remove.

# 25 feature views of 10 int64 features each (250 features total), all
# reading from the shared generated parquet source.
benchmark_feature_views = [
    FeatureView(
        name=f"feature_view_{i}",
        entities=["entity"],
        ttl=Duration(seconds=86400),
        features=[
            Feature(name=f"feature_{10 * i + j}", dtype=ValueType.INT64)
            for j in range(10)
        ],
        online=True,
        batch_source=generated_data_source,
    )
    for i in range(25)
]

# One service bundling every benchmark view so all features can be
# requested together.
benchmark_feature_service = FeatureService(
    name=f"benchmark_feature_service",
    features=benchmark_feature_views,
)

# Register everything and materialize from `start` (defined elsewhere) to now.
fs = FeatureStore(".")
fs.apply([
    driver_hourly_stats_view,
    driver,
    entity,
    benchmark_feature_service,
    *benchmark_feature_views
])

now = datetime.now()
fs.materialize(start, now)
print("Materialization finished")
def construct_test_environment(
    test_repo_config: TestRepoConfig,
    create_and_apply: bool = False,
    materialize: bool = False,
) -> Environment:
    """
    This method should take in the parameters from the test repo config and created a
    feature repo, apply it, and return the constructed feature store object to callers.

    This feature store object can be interacted for the purposes of tests.
    The user is *not* expected to perform any clean up actions.

    :param test_repo_config: configuration
    :return: A feature store built using the supplied configuration.
    """
    # NOTE(review): despite the -> Environment annotation this is a generator
    # (it yields) — presumably consumed as a context manager / fixture; confirm.
    df = create_dataset()
    # Unique project name per run so parallel/repeated tests don't collide.
    project = f"test_correctness_{str(uuid.uuid4()).replace('-', '')[:8]}"

    # Resolve the DataSourceCreator class from its dotted path and instantiate it.
    module_name, config_class_name = test_repo_config.offline_store_creator.rsplit(
        ".", 1)

    offline_creator: DataSourceCreator = importer.get_class_from_type(
        module_name, config_class_name, "DataSourceCreator")(project)
    # Upload the dataset, remapping its columns to the names the tests expect.
    ds = offline_creator.create_data_source(project, df, field_mapping={
        "ts_1": "ts", "id": "driver_id"
    })
    offline_store = offline_creator.create_offline_store_config()
    online_store = test_repo_config.online_store

    with tempfile.TemporaryDirectory() as repo_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=project,
            provider=test_repo_config.provider,
            offline_store=offline_store,
            online_store=online_store,
            repo_path=repo_dir_name,
        )
        fs = FeatureStore(config=config)
        environment = Environment(
            name=project,
            test_repo_config=test_repo_config,
            feature_store=fs,
            data_source=ds,
            data_source_creator=offline_creator,
        )

        fvs = []
        entities = []
        try:
            if create_and_apply:
                entities.extend([driver(), customer()])
                fvs.extend([
                    environment.driver_stats_feature_view(),
                    environment.customer_feature_view(),
                ])
                fs.apply(fvs + entities)

            if materialize:
                fs.materialize(environment.start_date, environment.end_date)

            yield environment
        finally:
            # Tear down external resources even if the test body raised;
            # data source first, then the store itself.
            offline_creator.teardown()
            fs.teardown()
def generate_data(num_rows: int, num_features: int, destination: str) -> pd.DataFrame:
    """Write a synthetic benchmark dataset to *destination* as parquet.

    Columns: "entity" (string keys "key-1".."key-<num_rows>"),
    "event_timestamp" (a single current UTC timestamp broadcast to all rows),
    and ``num_features`` random int64 feature columns named "feature_<i>".

    :param num_rows: number of rows to generate
    :param num_features: number of feature columns to generate
    :param destination: parquet output path
    :return: the generated dataframe
    """
    features = [f"feature_{i}" for i in range(num_features)]
    columns = ["entity", "event_timestamp"] + features
    df = pd.DataFrame(0, index=np.arange(num_rows), columns=columns)
    df["event_timestamp"] = datetime.utcnow()
    for column in features:
        # Random ints in [1, num_rows) per feature column.
        df[column] = np.random.randint(1, num_rows, num_rows)
    df["entity"] = "key-" + pd.Series(np.arange(1, num_rows + 1)).astype(
        pd.StringDtype())
    df.to_parquet(destination)
    # Fix: the original fell through and returned None despite the
    # -> pd.DataFrame annotation; return the frame (callers ignoring the
    # result are unaffected).
    return df


generate_data(10**3, 250, "benchmark_data.parquet")

# Register all benchmark definitions and materialize from `start`
# (defined elsewhere) to now.
fs = FeatureStore(".")
fs.apply([
    driver_hourly_stats_view,
    transformed_conv_rate,
    driver,
    entity,
    benchmark_feature_service,
    *benchmark_feature_views,
])

now = datetime.now()
fs.materialize(start, now)
print("Materialization finished")