def test_telemetry_on():
    old_environ = dict(os.environ)
    test_telemetry_id = str(uuid.uuid4())
    os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id
    os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
    os.environ["FEAST_TELEMETRY"] = "True"

    with tempfile.TemporaryDirectory() as temp_dir:
        test_feature_store = FeatureStore(
            config=RepoConfig(
                registry=os.path.join(temp_dir, "registry.db"),
                project="fake_project",
                provider="local",
                online_store=SqliteOnlineStoreConfig(
                    path=os.path.join(temp_dir, "online.db")
                ),
            )
        )
        entity = Entity(
            name="driver_car_id",
            description="Car driver id",
            value_type=ValueType.STRING,
            labels={"team": "matchmaking"},
        )

        test_feature_store.apply([entity])

        os.environ.clear()
        os.environ.update(old_environ)
        ensure_bigquery_telemetry_id_with_retry(test_telemetry_id)
def check_offline_and_online_features(
    fs: FeatureStore,
    fv: FeatureView,
    driver_id: int,
    event_timestamp: datetime,
    expected_value: Optional[float],
    full_feature_names: bool,
    check_offline_store: bool = True,
) -> None:
    # Check online store
    response_dict = fs.get_online_features(
        [f"{fv.name}:value"],
        [{"driver_id": driver_id}],
        full_feature_names=full_feature_names,
    ).to_dict()

    if full_feature_names:
        if expected_value:
            assert (
                abs(response_dict[f"{fv.name}__value"][0] - expected_value) < 1e-6
            ), f"Response: {response_dict}, Expected: {expected_value}"
        else:
            assert response_dict[f"{fv.name}__value"][0] is None
    else:
        if expected_value:
            assert (
                abs(response_dict["value"][0] - expected_value) < 1e-6
            ), f"Response: {response_dict}, Expected: {expected_value}"
        else:
            assert response_dict["value"][0] is None

    # Check offline store
    if check_offline_store:
        df = fs.get_historical_features(
            entity_df=pd.DataFrame.from_dict(
                {"driver_id": [driver_id], "event_timestamp": [event_timestamp]}
            ),
            features=[f"{fv.name}:value"],
            full_feature_names=full_feature_names,
        ).to_df()

        if full_feature_names:
            if expected_value:
                assert (
                    abs(
                        df.to_dict(orient="list")[f"{fv.name}__value"][0]
                        - expected_value
                    )
                    < 1e-6
                )
            else:
                assert not df.to_dict(orient="list")[f"{fv.name}__value"] or math.isnan(
                    df.to_dict(orient="list")[f"{fv.name}__value"][0]
                )
        else:
            if expected_value:
                assert (
                    abs(df.to_dict(orient="list")["value"][0] - expected_value) < 1e-6
                )
            else:
                assert not df.to_dict(orient="list")["value"] or math.isnan(
                    df.to_dict(orient="list")["value"][0]
                )
def get_latest_timestamps():
    store = FeatureStore(repo_path=".")
    feature_views = store.list_feature_views()
    for fv in feature_views:
        print(
            f"Data source latest event for {fv.name} is "
            f"{fv.batch_source._meta.latest_event_timestamp}"
        )
def construct_test_environment(
    test_repo_config: IntegrationTestRepoConfig,
    test_suite_name: str = "integration_test",
) -> Environment:
    project = f"{test_suite_name}_{str(uuid.uuid4()).replace('-', '')[:8]}"

    offline_creator: DataSourceCreator = test_repo_config.offline_store_creator(project)

    offline_store_config = offline_creator.create_offline_store_config()
    online_store = test_repo_config.online_store

    with tempfile.TemporaryDirectory() as repo_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=project,
            provider=test_repo_config.provider,
            offline_store=offline_store_config,
            online_store=online_store,
            repo_path=repo_dir_name,
        )
        fs = FeatureStore(config=config)
        # We need to initialize the registry, because if nothing is applied in the test
        # before tearing down the feature store, that will cause the teardown method to
        # blow up.
        fs.registry._initialize_registry()
        environment = Environment(
            name=project,
            test_repo_config=test_repo_config,
            feature_store=fs,
            data_source_creator=offline_creator,
        )

        try:
            yield environment
        finally:
            fs.teardown()
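# Since construct_test_environment is a generator (it yields, then tears down in the
# finally block), callers presumably wrap it so it can drive a `with` block. A minimal
# usage sketch, assuming IntegrationTestRepoConfig is default-constructible for the
# local provider; the real test harness may wire this up differently.
from contextlib import contextmanager

test_environment = contextmanager(construct_test_environment)

with test_environment(IntegrationTestRepoConfig()) as environment:
    # The feature store is fully configured here and torn down automatically on exit.
    fs = environment.feature_store
    print(f"Temporary project: {fs.project}")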
def store_offline(feature_store: FeatureStore, dataframe: FlyteSchema) -> FeatureStore:
    horse_colic_entity = Entity(name="Hospital Number", value_type=ValueType.STRING)

    horse_colic_feature_view = FeatureView(
        name="horse_colic_stats",
        entities=["Hospital Number"],
        features=[
            Feature(name="rectal temperature", dtype=ValueType.FLOAT),
            Feature(name="total protein", dtype=ValueType.FLOAT),
            Feature(name="peripheral pulse", dtype=ValueType.FLOAT),
            Feature(name="surgical lesion", dtype=ValueType.STRING),
            Feature(name="abdominal distension", dtype=ValueType.FLOAT),
            Feature(name="nasogastric tube", dtype=ValueType.STRING),
            Feature(name="outcome", dtype=ValueType.STRING),
            Feature(name="packed cell volume", dtype=ValueType.FLOAT),
            Feature(name="nasogastric reflux PH", dtype=ValueType.FLOAT),
        ],
        batch_source=FileSource(
            path=str(dataframe.remote_path),
            event_timestamp_column="timestamp",
        ),
        ttl=timedelta(days=1),
    )

    # Ingest the data into feast
    feature_store.apply([horse_colic_entity, horse_colic_feature_view])

    return feature_store
def test_usage_off():
    old_environ = dict(os.environ)
    test_usage_id = str(uuid.uuid4())
    os.environ["FEAST_IS_USAGE_TEST"] = "True"
    os.environ["FEAST_USAGE"] = "False"
    os.environ["FEAST_FORCE_USAGE_UUID"] = test_usage_id

    with tempfile.TemporaryDirectory() as temp_dir:
        test_feature_store = FeatureStore(
            config=RepoConfig(
                registry=os.path.join(temp_dir, "registry.db"),
                project="fake_project",
                provider="local",
                online_store=SqliteOnlineStoreConfig(
                    path=os.path.join(temp_dir, "online.db")
                ),
            )
        )
        entity = Entity(
            name="driver_car_id",
            description="Car driver id",
            value_type=ValueType.STRING,
            labels={"team": "matchmaking"},
        )
        test_feature_store.apply([entity])

        os.environ.clear()
        os.environ.update(old_environ)

        sleep(30)
        rows = read_bigquery_usage_id(test_usage_id)
        assert rows.total_rows == 0
def run_offline_online_store_consistency_test(fs: FeatureStore, fv: FeatureView) -> None:
    now = datetime.now()

    full_feature_names = True
    check_offline_store: bool = True

    # Run materialize()
    # use both tz-naive & tz-aware timestamps to test that they're both correctly handled
    start_date = (now - timedelta(hours=5)).replace(tzinfo=utc)
    end_date = now - timedelta(hours=2)
    fs.materialize(feature_views=[fv.name], start_date=start_date, end_date=end_date)

    # check result of materialize()
    check_offline_and_online_features(
        fs=fs,
        fv=fv,
        driver_id=1,
        event_timestamp=end_date,
        expected_value=0.3,
        full_feature_names=full_feature_names,
        check_offline_store=check_offline_store,
    )

    check_offline_and_online_features(
        fs=fs,
        fv=fv,
        driver_id=2,
        event_timestamp=end_date,
        expected_value=None,
        full_feature_names=full_feature_names,
        check_offline_store=check_offline_store,
    )

    # check prior value for materialize_incremental()
    check_offline_and_online_features(
        fs=fs,
        fv=fv,
        driver_id=3,
        event_timestamp=end_date,
        expected_value=4,
        full_feature_names=full_feature_names,
        check_offline_store=check_offline_store,
    )

    # run materialize_incremental()
    fs.materialize_incremental(feature_views=[fv.name], end_date=now)

    # check result of materialize_incremental()
    check_offline_and_online_features(
        fs=fs,
        fv=fv,
        driver_id=3,
        event_timestamp=now,
        expected_value=5,
        full_feature_names=full_feature_names,
        check_offline_store=check_offline_store,
    )
def init(self, conf: ConfigTree) -> None:
    conf = conf.with_fallback(FeastExtractor.DEFAULT_CONFIG)
    self._feast_repository_path = conf.get_string(FeastExtractor.FEAST_REPOSITORY_PATH)
    self._describe_feature_views = conf.get_bool(FeastExtractor.DESCRIBE_FEATURE_VIEWS)
    self._feast = FeatureStore(repo_path=self._feast_repository_path)
    self._extract_iter: Union[None, Iterator] = None
def _assert_online_features(store: FeatureStore, driver_df: pd.DataFrame, max_date: datetime):
    """Assert that features in online store are up to date with `max_date` date."""
    # Read features back
    result = store.get_online_features(
        features=[
            "driver_hourly_stats:conv_rate",
            "driver_hourly_stats:avg_daily_trips",
            "global_daily_stats:num_rides",
            "global_daily_stats:avg_ride_length",
        ],
        entity_rows=[{"driver_id": 1001}],
        full_feature_names=True,
    ).to_dict()

    assert len(result) == 5
    assert "driver_hourly_stats__avg_daily_trips" in result
    assert "driver_hourly_stats__conv_rate" in result
    assert (
        abs(
            result["driver_hourly_stats__conv_rate"][0]
            - _get_last_feature_row(driver_df, 1001, max_date)["conv_rate"]
        )
        < 0.01
    )
    assert "global_daily_stats__num_rides" in result
    assert "global_daily_stats__avg_ride_length" in result
class DriverRankingModel:
    def __init__(self):
        # Load model
        self.model = load("driver_model.bin")

        # Set up feature store
        self.fs = FeatureStore(repo_path="driver_ranking/")

    def predict(self, driver_ids):
        # Read features from Feast
        driver_features = self.fs.get_online_features(
            entity_rows=[{"driver_id": driver_id} for driver_id in driver_ids],
            feature_refs=[
                "driver_hourly_stats:conv_rate",
                "driver_hourly_stats:acc_rate",
                "driver_hourly_stats:avg_daily_trips",
            ],
        )
        features_df = pd.DataFrame.from_dict(driver_features.to_dict())

        # Make prediction
        features_df["prediction"] = self.model.predict(features_df)

        # Choose best driver
        best_driver_id = features_df["driver_id"].iloc[
            features_df["prediction"].argmax()
        ]

        # return best driver
        return best_driver_id
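# A minimal usage sketch, assuming a trained model has been serialized to
# driver_model.bin and the driver_ranking/ repo has already been materialized.
if __name__ == "__main__":
    model = DriverRankingModel()
    # Score three candidate drivers and return the id with the highest prediction.
    best_driver = model.predict(driver_ids=[1001, 1002, 1003])
    print(f"Best driver: {best_driver}")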
def test__get_unique_entities():
    entity_values = {
        "entity_1": [Value(int64_val=1), Value(int64_val=2), Value(int64_val=1)],
        "entity_2": [
            Value(string_val="1"),
            Value(string_val="2"),
            Value(string_val="1"),
        ],
        "entity_3": [Value(int64_val=8), Value(int64_val=9), Value(int64_val=10)],
    }
    entity_name_to_join_key_map = {"entity_1": "entity_1", "entity_2": "entity_2"}

    fv = MockFeatureView(
        name="fv_1",
        entities=["entity_1", "entity_2"],
        projection=MockFeatureViewProjection(join_key_map={}),
    )

    unique_entities, indexes = FeatureStore._get_unique_entities(
        FeatureStore,
        table=fv,
        join_key_values=entity_values,
        entity_name_to_join_key_map=entity_name_to_join_key_map,
    )

    assert unique_entities == (
        {"entity_1": Value(int64_val=1), "entity_2": Value(string_val="1")},
        {"entity_1": Value(int64_val=2), "entity_2": Value(string_val="2")},
    )
    assert indexes == ([0, 2], [1])
def _assert_online_features(store: FeatureStore, driver_df: pd.DataFrame, max_date: datetime):
    """Assert that features in online store are up to date with `max_date` date."""
    # Read features back
    response = store.get_online_features(
        features=[
            "driver_hourly_stats:conv_rate",
            "driver_hourly_stats:avg_daily_trips",
            "global_daily_stats:num_rides",
            "global_daily_stats:avg_ride_length",
        ],
        entity_rows=[{"driver_id": 1001}],
        full_feature_names=True,
    )

    # Float features should still be floats from the online store...
    assert (
        response.proto.results[
            list(response.proto.metadata.feature_names.val).index(
                "driver_hourly_stats__conv_rate"
            )
        ]
        .values[0]
        .float_val
        > 0
    )

    result = response.to_dict()
    assert len(result) == 5
    assert "driver_hourly_stats__avg_daily_trips" in result
    assert "driver_hourly_stats__conv_rate" in result
    assert (
        abs(
            result["driver_hourly_stats__conv_rate"][0]
            - _get_last_feature_row(driver_df, 1001, max_date)["conv_rate"]
        )
        < 0.01
    )
    assert "global_daily_stats__num_rides" in result
    assert "global_daily_stats__avg_ride_length" in result
def run_demo():
    store = FeatureStore(repo_path=".")

    print("--- Historical features (from saved dataset) ---")
    ds = store.get_saved_dataset("my_training_ds")
    print(ds.to_df())

    print("\n--- Online features ---")
    features = store.get_online_features(
        features=store.get_feature_service("credit_score_v3"),
        entity_rows=[
            {"zipcode": 30721, "dob_ssn": "19530219_5179", "transaction_amt": 1023}
        ],
    ).to_dict()
    for key, value in sorted(features.items()):
        print(key, " : ", value)
def retrieve_online(feature_store: FeatureStore, dataset: pd.DataFrame) -> dict:
    inference_data = random.choice(dataset["Hospital Number"])
    logger.info(f"Hospital Number chosen for inference is: {inference_data}")
    entity_rows = [{"Hospital Number": inference_data}]
    return feature_store.get_online_features(FEAST_FEATURES, entity_rows)
def load_historical_features(feature_store: FeatureStore) -> FlyteSchema:
    entity_df = pd.DataFrame.from_dict(
        {
            "Hospital Number": [
                "530101",
                "5290409",
                "5291329",
                "530051",
                "529518",
                "530101",
                "529340",
                "5290409",
                "530034",
            ],
            "event_timestamp": [
                datetime(2021, 6, 25, 16, 36, 27),
                datetime(2021, 6, 25, 16, 36, 27),
                datetime(2021, 6, 25, 16, 36, 27),
                datetime(2021, 6, 25, 16, 36, 27),
                datetime(2021, 6, 25, 16, 36, 27),
                datetime(2021, 7, 5, 11, 36, 1),
                datetime(2021, 6, 25, 16, 36, 27),
                datetime(2021, 7, 5, 11, 50, 40),
                datetime(2021, 6, 25, 16, 36, 27),
            ],
        }
    )

    return feature_store.get_historical_features(
        entity_df=entity_df, features=FEAST_FEATURES
    )  # noqa
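# Both retrieve_online and load_historical_features reference a FEAST_FEATURES
# constant defined elsewhere in this workflow. Given the horse_colic_stats view
# registered in store_offline above, it presumably looks something like this sketch
# (the actual list lives elsewhere in the workflow module):
FEAST_FEATURES = [
    "horse_colic_stats:rectal temperature",
    "horse_colic_stats:total protein",
    "horse_colic_stats:peripheral pulse",
    "horse_colic_stats:surgical lesion",
    "horse_colic_stats:abdominal distension",
    "horse_colic_stats:nasogastric tube",
    "horse_colic_stats:outcome",
    "horse_colic_stats:packed cell volume",
    "horse_colic_stats:nasogastric reflux PH",
]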
def main():
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Load the feature store from the current path
    fs = FeatureStore(repo_path=".")

    # Deploy the feature store to Snowflake
    print("Deploying feature store to Snowflake...")
    fs.apply([driver, driver_stats_fv])

    # Select features
    features = ["driver_hourly_stats:conv_rate", "driver_hourly_stats:acc_rate"]

    # Create an entity dataframe. This is the dataframe that will be enriched with historical features
    entity_df = pd.DataFrame(
        {
            "event_timestamp": [
                pd.Timestamp(dt, unit="ms", tz="UTC").round("ms")
                for dt in pd.date_range(
                    start=datetime.now() - timedelta(days=3),
                    end=datetime.now(),
                    periods=3,
                )
            ],
            "driver_id": [1001, 1002, 1003],
        }
    )

    print("Retrieving training data...")

    # Retrieve historical features by joining the entity dataframe to the Snowflake table source
    training_df = fs.get_historical_features(
        features=features, entity_df=entity_df
    ).to_df()
    print()
    print(training_df)

    print()
    print("Loading features into the online store...")
    fs.materialize_incremental(end_date=datetime.now())

    print()
    print("Retrieving online features...")

    # Retrieve features from the online store
    online_features = fs.get_online_features(
        features=features,
        entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}],
    ).to_dict()
    print()
    print(pd.DataFrame.from_dict(online_features))
def get_historical_features():
    """Retrieve historical features for training."""
    # Entities to pull data for (should dynamically read this from somewhere)
    project_ids = [1, 2, 3]
    now = datetime.now()
    timestamps = [datetime(now.year, now.month, now.day)] * len(project_ids)
    entity_df = pd.DataFrame.from_dict(
        {"id": project_ids, "event_timestamp": timestamps}
    )

    # Get historical features
    store = FeatureStore(repo_path=Path(config.BASE_DIR, "features"))
    training_df = store.get_historical_features(
        entity_df=entity_df,
        feature_refs=["project_details:text", "project_details:tags"],
    ).to_df()

    # Store in location for training task to pick up
    print(training_df.head())
def main():
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Load the feature store from the current path
    fs = FeatureStore(repo_path=".")

    # Deploy the feature store to AWS
    print("Deploying feature store to AWS...")
    fs.apply([driver, driver_hourly_stats_view])

    # Select features
    feature_refs = ["driver_hourly_stats:conv_rate", "driver_hourly_stats:acc_rate"]

    print("Loading features into the online store...")
    fs.materialize_incremental(end_date=datetime.now())

    print("Retrieving online features...")

    # Retrieve features from the online store (DynamoDB)
    online_features = fs.get_online_features(
        feature_refs=feature_refs,
        entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}],
    ).to_dict()

    print(pd.DataFrame.from_dict(online_features))
def build_feature_store(s3_bucket: str, registry_path: str, online_store_path: str) -> FeatureStore:
    feature_store_config = FeatureStoreConfig(
        project="horsecolic",
        s3_bucket=s3_bucket,
        registry_path=registry_path,
        online_store_path=online_store_path,
    )
    return FeatureStore(config=feature_store_config)
def test_telemetry_on():
    old_environ = dict(os.environ)
    test_telemetry_id = str(uuid.uuid4())
    os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id
    os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
    os.environ["FEAST_TELEMETRY"] = "True"

    test_feature_store = FeatureStore()
    entity = Entity(
        name="driver_car_id",
        description="Car driver id",
        value_type=ValueType.STRING,
        labels={"team": "matchmaking"},
    )

    test_feature_store.apply([entity])

    os.environ.clear()
    os.environ.update(old_environ)
    ensure_bigquery_telemetry_id_with_retry(test_telemetry_id)
def test_telemetry_off():
    old_environ = dict(os.environ)
    test_telemetry_id = str(uuid.uuid4())
    os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
    os.environ["FEAST_TELEMETRY"] = "False"
    os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id

    test_feature_store = FeatureStore()
    entity = Entity(
        name="driver_car_id",
        description="Car driver id",
        value_type=ValueType.STRING,
        labels={"team": "matchmaking"},
    )
    test_feature_store.apply([entity])

    os.environ.clear()
    os.environ.update(old_environ)

    sleep(30)
    rows = read_bigquery_telemetry_id(test_telemetry_id)
    assert rows.total_rows == 0
def setup_feature_store():
    """Prepares the local environment for a FeatureStore docstring test."""
    from datetime import datetime, timedelta

    from feast import Entity, Feature, FeatureStore, FeatureView, FileSource, ValueType
    from feast.repo_operations import init_repo

    init_repo("feature_repo", "local")
    fs = FeatureStore(repo_path="feature_repo")
    driver = Entity(
        name="driver_id",
        value_type=ValueType.INT64,
        description="driver id",
    )
    driver_hourly_stats = FileSource(
        path="feature_repo/data/driver_stats.parquet",
        event_timestamp_column="event_timestamp",
        created_timestamp_column="created",
    )
    driver_hourly_stats_view = FeatureView(
        name="driver_hourly_stats",
        entities=["driver_id"],
        ttl=timedelta(seconds=86400 * 1),
        features=[
            Feature(name="conv_rate", dtype=ValueType.FLOAT),
            Feature(name="acc_rate", dtype=ValueType.FLOAT),
            Feature(name="avg_daily_trips", dtype=ValueType.INT64),
        ],
        batch_source=driver_hourly_stats,
    )
    fs.apply([driver_hourly_stats_view, driver])
    fs.materialize(
        start_date=datetime.utcnow() - timedelta(hours=3),
        end_date=datetime.utcnow() - timedelta(minutes=10),
    )
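# Once setup_feature_store() has run, a docstring example can read the materialized
# features back. A minimal sketch; driver id 1001 is assumed to exist in the
# driver_stats.parquet generated by the "local" repo template.
from feast import FeatureStore

setup_feature_store()
fs = FeatureStore(repo_path="feature_repo")
online = fs.get_online_features(
    features=["driver_hourly_stats:conv_rate", "driver_hourly_stats:acc_rate"],
    entity_rows=[{"driver_id": 1001}],
).to_dict()
print(online)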
def test_exception_usage_on():
    old_environ = dict(os.environ)
    test_usage_id = str(uuid.uuid4())
    os.environ["FEAST_FORCE_USAGE_UUID"] = test_usage_id
    os.environ["FEAST_IS_USAGE_TEST"] = "True"
    os.environ["FEAST_USAGE"] = "True"

    # Instantiating against a missing repo should raise; the exception itself is the
    # point of the test, so we swallow it (but not SystemExit/KeyboardInterrupt).
    try:
        test_feature_store = FeatureStore("/tmp/non_existent_directory")
    except Exception:
        pass

    os.environ.clear()
    os.environ.update(old_environ)
    ensure_bigquery_usage_id_with_retry(test_usage_id)
def test_exception_usage_off():
    old_environ = dict(os.environ)
    test_usage_id = str(uuid.uuid4())
    os.environ["FEAST_IS_USAGE_TEST"] = "True"
    os.environ["FEAST_USAGE"] = "False"
    os.environ["FEAST_FORCE_USAGE_UUID"] = test_usage_id

    # The raised exception is expected; with usage reporting off, no row should land.
    try:
        test_feature_store = FeatureStore("/tmp/non_existent_directory")
    except Exception:
        pass

    os.environ.clear()
    os.environ.update(old_environ)

    sleep(30)
    rows = read_bigquery_usage_id(test_usage_id)
    assert rows.total_rows == 0
def generate_saved_dataset():
    store = FeatureStore(repo_path=".")
    entity_df = pd.read_parquet(path="data/loan_table.parquet")

    fs = store.get_feature_service("credit_score_v1")
    job = store.get_historical_features(entity_df=entity_df, features=fs)

    store.create_saved_dataset(
        from_=job,
        name="my_training_ds",
        storage=SavedDatasetFileStorage(path="my_training_ds.parquet"),
        feature_service=fs,
        profiler=credit_profiler,
    )
def _assert_online_features(store: FeatureStore, driver_df: pd.DataFrame, max_date: datetime):
    """Assert that features in online store are up to date with `max_date` date."""
    # Read features back
    result = store.get_online_features(
        feature_refs=[
            "driver_hourly_stats:conv_rate",
            "driver_hourly_stats:avg_daily_trips",
        ],
        entity_rows=[{"driver_id": 1001}],
    )

    assert "driver_hourly_stats__avg_daily_trips" in result.to_dict()
    assert "driver_hourly_stats__conv_rate" in result.to_dict()
    assert (
        abs(
            result.to_dict()["driver_hourly_stats__conv_rate"][0]
            - _get_last_feature_row(driver_df, 1001, max_date)["conv_rate"]
        )
        < 0.01
    )
def store_online(feature_store: FeatureStore) -> FeatureStore:
    feature_store.materialize(
        start_date=datetime.utcnow() - timedelta(days=250),
        end_date=datetime.utcnow() - timedelta(minutes=10),
    )
    return feature_store
class FeastRepositorySource(Source):
    """
    This plugin extracts:

    - Entities as [`MLPrimaryKey`](https://datahubproject.io/docs/graphql/objects#mlprimarykey)
    - Features as [`MLFeature`](https://datahubproject.io/docs/graphql/objects#mlfeature)
    - Feature views and on-demand feature views as
      [`MLFeatureTable`](https://datahubproject.io/docs/graphql/objects#mlfeaturetable)
    - Batch and stream source details as
      [`Dataset`](https://datahubproject.io/docs/graphql/objects#dataset)
    - Column types associated with each entity and feature
    """

    source_config: FeastRepositorySourceConfig
    report: SourceReport
    feature_store: FeatureStore

    def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
        super().__init__(ctx)

        self.source_config = config
        self.report = SourceReport()
        self.feature_store = FeatureStore(self.source_config.path)

    def _get_field_type(self, field_type: ValueType, parent_name: str) -> str:
        """
        Maps types encountered in Feast to corresponding schema types.
        """
        ml_feature_data_type = _field_type_mapping.get(field_type)

        if ml_feature_data_type is None:
            self.report.report_warning(
                parent_name, f"unable to map type {field_type} to metadata schema"
            )
            ml_feature_data_type = MLFeatureDataType.UNKNOWN

        return ml_feature_data_type

    def _get_data_source_details(self, source: DataSource) -> Tuple[str, str]:
        """
        Get Feast batch/stream source platform and name.
        """
        platform = "unknown"
        name = "unknown"

        if isinstance(source, FileSource):
            platform = "file"
            name = source.path.replace("://", ".").replace("/", ".")

        if isinstance(source, BigQuerySource):
            platform = "bigquery"
            name = source.table

        if isinstance(source, KafkaSource):
            platform = "kafka"
            name = source.kafka_options.topic

        if isinstance(source, KinesisSource):
            platform = "kinesis"
            name = (
                f"{source.kinesis_options.region}:{source.kinesis_options.stream_name}"
            )

        if isinstance(source, RequestDataSource):
            platform = "request"
            name = source.name

        return platform, name

    def _get_data_sources(self, feature_view: FeatureView) -> List[str]:
        """
        Get data source URN list.
        """
        sources = []

        if feature_view.batch_source is not None:
            batch_source_platform, batch_source_name = self._get_data_source_details(
                feature_view.batch_source
            )
            sources.append(
                builder.make_dataset_urn(
                    batch_source_platform,
                    batch_source_name,
                    self.source_config.environment,
                )
            )

        if feature_view.stream_source is not None:
            stream_source_platform, stream_source_name = self._get_data_source_details(
                feature_view.stream_source
            )
            sources.append(
                builder.make_dataset_urn(
                    stream_source_platform,
                    stream_source_name,
                    self.source_config.environment,
                )
            )

        return sources

    def _get_entity_workunit(
        self, feature_view: FeatureView, entity: Entity
    ) -> MetadataWorkUnit:
        """
        Generate an MLPrimaryKey work unit for a Feast entity.
        """
        feature_view_name = f"{self.feature_store.project}.{feature_view.name}"

        entity_snapshot = MLPrimaryKeySnapshot(
            urn=builder.make_ml_primary_key_urn(feature_view_name, entity.name),
            aspects=[StatusClass(removed=False)],
        )

        entity_snapshot.aspects.append(
            MLPrimaryKeyPropertiesClass(
                description=entity.description,
                dataType=self._get_field_type(entity.value_type, entity.name),
                sources=self._get_data_sources(feature_view),
            )
        )

        mce = MetadataChangeEvent(proposedSnapshot=entity_snapshot)

        return MetadataWorkUnit(id=entity.name, mce=mce)

    def _get_feature_workunit(
        self,
        feature_view: Union[FeatureView, OnDemandFeatureView],
        feature: Feature,
    ) -> MetadataWorkUnit:
        """
        Generate an MLFeature work unit for a Feast feature.
        """
        feature_view_name = f"{self.feature_store.project}.{feature_view.name}"

        feature_snapshot = MLFeatureSnapshot(
            urn=builder.make_ml_feature_urn(feature_view_name, feature.name),
            aspects=[StatusClass(removed=False)],
        )

        feature_sources = []

        if isinstance(feature_view, FeatureView):
            feature_sources = self._get_data_sources(feature_view)
        elif isinstance(feature_view, OnDemandFeatureView):
            if feature_view.input_request_data_sources is not None:
                for request_source in feature_view.input_request_data_sources.values():
                    source_platform, source_name = self._get_data_source_details(
                        request_source
                    )

                    feature_sources.append(
                        builder.make_dataset_urn(
                            source_platform,
                            source_name,
                            self.source_config.environment,
                        )
                    )

            if feature_view.input_feature_view_projections is not None:
                for (
                    feature_view_projection
                ) in feature_view.input_feature_view_projections.values():
                    feature_view_source = self.feature_store.get_feature_view(
                        feature_view_projection.name
                    )

                    feature_sources.extend(self._get_data_sources(feature_view_source))

        feature_snapshot.aspects.append(
            MLFeaturePropertiesClass(
                description=feature.labels.get("description"),
                dataType=self._get_field_type(feature.dtype, feature.name),
                sources=feature_sources,
            )
        )

        mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)

        return MetadataWorkUnit(id=feature.name, mce=mce)

    def _get_feature_view_workunit(self, feature_view: FeatureView) -> MetadataWorkUnit:
        """
        Generate an MLFeatureTable work unit for a Feast feature view.
        """
        feature_view_name = f"{self.feature_store.project}.{feature_view.name}"

        feature_view_snapshot = MLFeatureTableSnapshot(
            urn=builder.make_ml_feature_table_urn("feast", feature_view_name),
            aspects=[
                BrowsePathsClass(
                    paths=[f"/feast/{self.feature_store.project}/{feature_view_name}"]
                ),
                StatusClass(removed=False),
            ],
        )

        feature_view_snapshot.aspects.append(
            MLFeatureTablePropertiesClass(
                mlFeatures=[
                    builder.make_ml_feature_urn(
                        feature_view_name,
                        feature.name,
                    )
                    for feature in feature_view.features
                ],
                mlPrimaryKeys=[
                    builder.make_ml_primary_key_urn(feature_view_name, entity_name)
                    for entity_name in feature_view.entities
                ],
            )
        )

        mce = MetadataChangeEvent(proposedSnapshot=feature_view_snapshot)

        return MetadataWorkUnit(id=feature_view_name, mce=mce)

    def _get_on_demand_feature_view_workunit(
        self, on_demand_feature_view: OnDemandFeatureView
    ) -> MetadataWorkUnit:
        """
        Generate an MLFeatureTable work unit for a Feast on-demand feature view.
        """
        on_demand_feature_view_name = (
            f"{self.feature_store.project}.{on_demand_feature_view.name}"
        )

        on_demand_feature_view_snapshot = MLFeatureTableSnapshot(
            urn=builder.make_ml_feature_table_urn("feast", on_demand_feature_view_name),
            aspects=[
                BrowsePathsClass(
                    paths=[
                        f"/feast/{self.feature_store.project}/{on_demand_feature_view_name}"
                    ]
                ),
                StatusClass(removed=False),
            ],
        )

        on_demand_feature_view_snapshot.aspects.append(
            MLFeatureTablePropertiesClass(
                mlFeatures=[
                    builder.make_ml_feature_urn(
                        on_demand_feature_view_name,
                        feature.name,
                    )
                    for feature in on_demand_feature_view.features
                ],
                mlPrimaryKeys=[],
            )
        )

        mce = MetadataChangeEvent(proposedSnapshot=on_demand_feature_view_snapshot)

        return MetadataWorkUnit(id=on_demand_feature_view_name, mce=mce)

    @classmethod
    def create(cls, config_dict, ctx):
        config = FeastRepositorySourceConfig.parse_obj(config_dict)
        return cls(config, ctx)

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        for feature_view in self.feature_store.list_feature_views():
            for entity_name in feature_view.entities:
                entity = self.feature_store.get_entity(entity_name)

                work_unit = self._get_entity_workunit(feature_view, entity)
                self.report.report_workunit(work_unit)

                yield work_unit

            for feature in feature_view.features:
                work_unit = self._get_feature_workunit(feature_view, feature)
                self.report.report_workunit(work_unit)

                yield work_unit

            work_unit = self._get_feature_view_workunit(feature_view)
            self.report.report_workunit(work_unit)

            yield work_unit

        for on_demand_feature_view in self.feature_store.list_on_demand_feature_views():
            for feature in on_demand_feature_view.features:
                work_unit = self._get_feature_workunit(on_demand_feature_view, feature)
                self.report.report_workunit(work_unit)

                yield work_unit

            work_unit = self._get_on_demand_feature_view_workunit(
                on_demand_feature_view
            )
            self.report.report_workunit(work_unit)

            yield work_unit

    def get_report(self) -> SourceReport:
        return self.report

    def close(self) -> None:
        return
# Define an entity for the driver. You can think of an entity as a primary key used to
# fetch features.
driver = Entity(
    name="driver_id",
    value_type=ValueType.INT64,
    description="driver id",
)

# Our parquet files contain sample data that includes a driver_id column, timestamps, and
# three feature columns. Here we define a Feature View that will allow us to serve this
# data to our model online.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=Duration(seconds=86400 * 365),
    features=[
        Feature(name="conv_rate", dtype=ValueType.DOUBLE),
        Feature(name="acc_rate", dtype=ValueType.FLOAT),
        Feature(name="avg_daily_trips", dtype=ValueType.INT64),
    ],
    online=True,
    batch_source=driver_hourly_stats,
    tags={},
)

fs = FeatureStore("")

fs.apply([driver_hourly_stats_view, driver])

now = datetime.now()
fs.materialize_incremental(now)
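# The snippet above references a driver_hourly_stats batch source defined earlier in
# the same repo file. A minimal sketch of what it presumably looks like; the path and
# column names are assumptions that mirror the standard Feast example repo.
from feast import FileSource

driver_hourly_stats = FileSource(
    path="data/driver_stats.parquet",
    event_timestamp_column="event_timestamp",
    created_timestamp_column="created",
)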