def get_feature_view_query_context(
    feature_refs: List[str],
    feature_views: List[FeatureView],
    registry: Registry,
    project: str,
) -> List[FeatureViewQueryContext]:
    """Build a query context containing all information required to template
    a BigQuery and Redshift point-in-time SQL query"""
    (
        feature_views_to_feature_map,
        on_demand_feature_views_to_features,
    ) = _get_requested_feature_views_to_features_dict(
        feature_refs, feature_views, registry.list_on_demand_feature_views(project)
    )

    query_context = []
    for feature_view, features in feature_views_to_feature_map.items():
        join_keys = []
        entity_selections = []
        reverse_field_mapping = {
            v: k for k, v in feature_view.input.field_mapping.items()
        }
        for entity_name in feature_view.entities:
            entity = registry.get_entity(entity_name, project)
            join_key = feature_view.projection.join_key_map.get(
                entity.join_key, entity.join_key
            )
            join_keys.append(join_key)
            entity_selections.append(f"{entity.join_key} AS {join_key}")

        if isinstance(feature_view.ttl, timedelta):
            ttl_seconds = int(feature_view.ttl.total_seconds())
        else:
            ttl_seconds = 0

        event_timestamp_column = feature_view.input.event_timestamp_column
        created_timestamp_column = feature_view.input.created_timestamp_column

        context = FeatureViewQueryContext(
            name=feature_view.projection.name_to_use(),
            ttl=ttl_seconds,
            entities=join_keys,
            features=features,
            event_timestamp_column=reverse_field_mapping.get(
                event_timestamp_column, event_timestamp_column
            ),
            created_timestamp_column=reverse_field_mapping.get(
                created_timestamp_column, created_timestamp_column
            ),
            # TODO: Make created column optional and not hardcoded
            table_subquery=feature_view.input.get_table_query_string(),
            entity_selections=entity_selections,
        )
        query_context.append(context)
    return query_context
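# The context objects built above are consumed by a Jinja-style SQL template in
# the BigQuery/Redshift offline stores. A minimal sketch of how the fields map
# into a per-view subquery, using a hypothetical context dict and
# string.Template instead of the real template engine:
from string import Template

_example_context = {
    "entity_selections": "driver_id AS driver_id",  # hypothetical values
    "event_timestamp_column": "event_timestamp",
    "table_subquery": "`project.dataset.driver_stats`",
}
_example_subquery = Template(
    "SELECT $entity_selections, $event_timestamp_column FROM $table_subquery"
).substitute(_example_context)
# -> SELECT driver_id AS driver_id, event_timestamp FROM `project.dataset.driver_stats`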
def _tag_registry_on_demand_feature_views_for_keep_delete(
    project: str, registry: Registry, repo: ParsedRepo
) -> Tuple[List[OnDemandFeatureView], List[OnDemandFeatureView]]:
    odfvs_to_keep: List[OnDemandFeatureView] = repo.on_demand_feature_views
    odfvs_to_delete: List[OnDemandFeatureView] = []
    repo_on_demand_feature_view_names = set(
        t.name for t in repo.on_demand_feature_views
    )
    for registry_odfv in registry.list_on_demand_feature_views(project=project):
        if registry_odfv.name not in repo_on_demand_feature_view_names:
            odfvs_to_delete.append(registry_odfv)
    return odfvs_to_keep, odfvs_to_delete
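# A minimal sketch of the keep/delete split above: it is a set difference on
# names, where anything in the registry but not in the repo is tagged for
# deletion (names here are hypothetical, not real objects):
_repo_names = {"odfv_a", "odfv_b"}      # parsed from the local repo
_registry_names = {"odfv_b", "odfv_c"}  # currently in the registry
assert _registry_names - _repo_names == {"odfv_c"}  # tagged for deletion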
def get_historical_features(
    config: RepoConfig,
    feature_views: List[FeatureView],
    feature_refs: List[str],
    entity_df: Union[pd.DataFrame, str],
    registry: Registry,
    project: str,
    full_feature_names: bool = False,
) -> RetrievalJob:
    if not isinstance(entity_df, pd.DataFrame):
        raise ValueError(
            f"Please provide an entity_df of type pd.DataFrame instead of type {type(entity_df)}"
        )

    entity_df_event_timestamp_col = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL  # local modifiable copy of global variable
    if entity_df_event_timestamp_col not in entity_df.columns:
        datetime_columns = entity_df.select_dtypes(
            include=["datetime", "datetimetz"]
        ).columns
        if len(datetime_columns) == 1:
            print(
                f"Using {datetime_columns[0]} as the event timestamp. To specify a column explicitly, please name it {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL}."
            )
            entity_df_event_timestamp_col = datetime_columns[0]
        else:
            raise ValueError(
                f"Please provide an entity_df with a column named {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL} representing the time of events."
            )

    (
        feature_views_to_features,
        on_demand_feature_views_to_features,
    ) = _get_requested_feature_views_to_features_dict(
        feature_refs,
        feature_views,
        registry.list_on_demand_feature_views(config.project),
    )

    # Create lazy function that is only called from the RetrievalJob object
    def evaluate_historical_retrieval():

        # Make sure all event timestamp fields are tz-aware. We default tz-naive fields to UTC
        entity_df[entity_df_event_timestamp_col] = entity_df[
            entity_df_event_timestamp_col
        ].apply(lambda x: x if x.tzinfo is not None else x.replace(tzinfo=pytz.utc))

        # Create a copy of entity_df to prevent modifying the original
        entity_df_with_features = entity_df.copy()

        # Convert event timestamp column to datetime and normalize time zone to UTC
        # This is necessary to avoid issues with pd.merge_asof
        entity_df_with_features[entity_df_event_timestamp_col] = pd.to_datetime(
            entity_df_with_features[entity_df_event_timestamp_col], utc=True
        )

        # Sort event timestamp values
        entity_df_with_features = entity_df_with_features.sort_values(
            entity_df_event_timestamp_col
        )

        # Load feature view data from sources and join them incrementally
        for feature_view, features in feature_views_to_features.items():
            event_timestamp_column = (
                feature_view.batch_source.event_timestamp_column
            )
            created_timestamp_column = (
                feature_view.batch_source.created_timestamp_column
            )

            # Read offline parquet data in pyarrow format.
            filesystem, path = FileSource.create_filesystem_and_path(
                feature_view.batch_source.path,
                feature_view.batch_source.file_options.s3_endpoint_override,
            )
            table = pyarrow.parquet.read_table(path, filesystem=filesystem)

            # Rename columns by the field mapping dictionary if it exists
            if feature_view.batch_source.field_mapping is not None:
                table = _run_field_mapping(
                    table, feature_view.batch_source.field_mapping
                )
            # Rename entity columns by the join_key_map dictionary if it exists
            if feature_view.projection.join_key_map:
                table = _run_field_mapping(
                    table, feature_view.projection.join_key_map
                )

            # Convert pyarrow table to pandas dataframe. Note, if the underlying data has missing values,
            # pandas will convert those values to np.nan if the dtypes are numerical (floats, ints, etc.) or boolean
            # If the dtype is 'object', then missing values are inferred as python `None`s.
            # More details at:
            # https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#values-considered-missing
            df_to_join = table.to_pandas()

            # Make sure all timestamp fields are tz-aware.
            # We default tz-naive fields to UTC
            df_to_join[event_timestamp_column] = df_to_join[
                event_timestamp_column
            ].apply(
                lambda x: x if x.tzinfo is not None else x.replace(tzinfo=pytz.utc)
            )
            if created_timestamp_column:
                df_to_join[created_timestamp_column] = df_to_join[
                    created_timestamp_column
                ].apply(
                    lambda x: x
                    if x.tzinfo is not None
                    else x.replace(tzinfo=pytz.utc)
                )

            # Sort dataframe by the event timestamp column
            df_to_join = df_to_join.sort_values(event_timestamp_column)

            # Build a list of all the features we should select from this source
            feature_names = []
            for feature in features:
                # Modify the separator for feature refs in column names to double underscore. We are using
                # double underscore as separator for consistency with other databases like BigQuery,
                # where there are very few characters available for use as separators
                if full_feature_names:
                    formatted_feature_name = (
                        f"{feature_view.projection.name_to_use()}__{feature}"
                    )
                else:
                    formatted_feature_name = feature
                # Add the feature name to the list of columns
                feature_names.append(formatted_feature_name)

                # Ensure that the source dataframe feature column includes the feature view name as a prefix
                df_to_join.rename(
                    columns={feature: formatted_feature_name}, inplace=True,
                )

            # Build a list of entity columns to join on (from the right table)
            join_keys = []
            for entity_name in feature_view.entities:
                entity = registry.get_entity(entity_name, project)
                join_key = feature_view.projection.join_key_map.get(
                    entity.join_key, entity.join_key
                )
                join_keys.append(join_key)
            right_entity_columns = join_keys
            right_entity_key_columns = [
                event_timestamp_column
            ] + right_entity_columns

            # Remove all duplicate entity keys (using created timestamp)
            right_entity_key_sort_columns = right_entity_key_columns
            if created_timestamp_column:
                # If created_timestamp is available, use it to dedupe deterministically
                right_entity_key_sort_columns = right_entity_key_sort_columns + [
                    created_timestamp_column
                ]

            df_to_join.sort_values(by=right_entity_key_sort_columns, inplace=True)
            df_to_join.drop_duplicates(
                right_entity_key_sort_columns,
                keep="last",
                ignore_index=True,
                inplace=True,
            )

            # Select only the columns we need to join from the feature dataframe
            df_to_join = df_to_join[right_entity_key_columns + feature_names]

            # Do point-in-time join between entity_df and feature dataframe
            entity_df_with_features = pd.merge_asof(
                entity_df_with_features,
                df_to_join,
                left_on=entity_df_event_timestamp_col,
                right_on=event_timestamp_column,
                by=right_entity_columns or None,
                tolerance=feature_view.ttl,
            )

            # Remove right (feature table/view) event_timestamp column.
            if event_timestamp_column != entity_df_event_timestamp_col:
                entity_df_with_features.drop(
                    columns=[event_timestamp_column], inplace=True
                )

            # Ensure that we delete dataframes to free up memory
            del df_to_join

        # Move "event_timestamp" column to front
        current_cols = entity_df_with_features.columns.tolist()
        current_cols.remove(entity_df_event_timestamp_col)
        entity_df_with_features = entity_df_with_features[
            [entity_df_event_timestamp_col] + current_cols
        ]

        return entity_df_with_features

    job = FileRetrievalJob(
        evaluation_function=evaluate_historical_retrieval,
        full_feature_names=full_feature_names,
        on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs(
            feature_refs, project, registry
        ),
    )
    return job
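# A standalone sketch of the point-in-time join performed above: pd.merge_asof
# matches each entity row with the most recent feature row at or before its
# event timestamp, and `tolerance` enforces the feature view TTL (data and
# column names here are hypothetical):
import pandas as pd

_entities = pd.DataFrame({
    "driver_id": [1001, 1001],
    "event_timestamp": pd.to_datetime(
        ["2021-04-12 10:00", "2021-04-12 13:00"], utc=True
    ),
})
_feats = pd.DataFrame({
    "driver_id": [1001],
    "event_timestamp": pd.to_datetime(["2021-04-12 09:30"], utc=True),
    "conv_rate": [0.8],
})
_joined = pd.merge_asof(
    _entities.sort_values("event_timestamp"),
    _feats.sort_values("event_timestamp"),
    on="event_timestamp",
    by="driver_id",
    tolerance=pd.Timedelta(hours=1),  # plays the role of feature_view.ttl
)
# Row 1 is within the 1h tolerance and gets conv_rate=0.8; row 2 is too far
# past the feature timestamp and gets NaN.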
class FeatureStore:
    """
    A FeatureStore object is used to define, create, and retrieve features.

    Args:
        repo_path (optional): Path to a `feature_store.yaml` used to configure the
            feature store.
        config (optional): Configuration object used to configure the feature store.
    """

    config: RepoConfig
    repo_path: Path
    _registry: Registry

    @log_exceptions
    def __init__(
        self, repo_path: Optional[str] = None, config: Optional[RepoConfig] = None,
    ):
        """
        Creates a FeatureStore object.

        Raises:
            ValueError: If both or neither of repo_path and config are specified.
        """
        if repo_path is not None and config is not None:
            raise ValueError("You cannot specify both repo_path and config.")
        if config is not None:
            self.repo_path = Path(os.getcwd())
            self.config = config
        elif repo_path is not None:
            self.repo_path = Path(repo_path)
            self.config = load_repo_config(Path(repo_path))
        else:
            raise ValueError("Please specify one of repo_path or config.")

        registry_config = self.config.get_registry_config()
        self._registry = Registry(registry_config, repo_path=self.repo_path)

    @log_exceptions
    def version(self) -> str:
        """Returns the version of the current Feast SDK/CLI."""
        return get_version()

    @property
    def registry(self) -> Registry:
        """Gets the registry of this feature store."""
        return self._registry

    @property
    def project(self) -> str:
        """Gets the project of this feature store."""
        return self.config.project

    def _get_provider(self) -> Provider:
        # TODO: Bake self.repo_path into self.config so that we only have one interface to paths
        return get_provider(self.config, self.repo_path)

    @log_exceptions_and_usage
    def refresh_registry(self):
        """Fetches and caches a copy of the feature registry in memory.

        Explicitly calling this method allows for direct control of the state of the
        registry cache. Every time this method is called the complete registry state
        will be retrieved from the remote registry store backend (e.g., GCS, S3), and
        the cache timer will be reset. If refresh_registry() is run before
        get_online_features() is called, then get_online_features() will use the
        cached registry instead of retrieving (and caching) the registry itself.

        Additionally, the TTL for the registry cache can be set to infinity (by
        setting it to 0), which means that refresh_registry() will become the only
        way to update the cached registry. If the TTL is set to a value greater than
        0, then once the cache becomes stale (more time than the TTL has passed), a
        new cache will be downloaded synchronously, which may increase latencies if
        the triggering method is get_online_features().
        """
        registry_config = self.config.get_registry_config()
        self._registry = Registry(registry_config, repo_path=self.repo_path)
        self._registry.refresh()

    @log_exceptions_and_usage
    def list_entities(self, allow_cache: bool = False) -> List[Entity]:
        """
        Retrieves the list of entities from the registry.

        Args:
            allow_cache: Whether to allow returning entities from a cached registry.

        Returns:
            A list of entities.
        """
        return self._list_entities(allow_cache)

    def _list_entities(
        self, allow_cache: bool = False, hide_dummy_entity: bool = True
    ) -> List[Entity]:
        all_entities = self._registry.list_entities(
            self.project, allow_cache=allow_cache
        )
        return [
            entity
            for entity in all_entities
            if entity.name != DUMMY_ENTITY_NAME or not hide_dummy_entity
        ]

    @log_exceptions_and_usage
    def list_feature_services(self) -> List[FeatureService]:
        """
        Retrieves the list of feature services from the registry.

        Returns:
            A list of feature services.
""" return self._registry.list_feature_services(self.project) @log_exceptions_and_usage def list_feature_views(self, allow_cache: bool = False) -> List[FeatureView]: """ Retrieves the list of feature views from the registry. Args: allow_cache: Whether to allow returning entities from a cached registry. Returns: A list of feature views. """ return self._list_feature_views(allow_cache) def _list_feature_views( self, allow_cache: bool = False, hide_dummy_entity: bool = True) -> List[FeatureView]: feature_views = [] for fv in self._registry.list_feature_views(self.project, allow_cache=allow_cache): if hide_dummy_entity and fv.entities[0] == DUMMY_ENTITY_NAME: fv.entities = [] feature_views.append(fv) return feature_views @log_exceptions_and_usage def list_on_demand_feature_views(self) -> List[OnDemandFeatureView]: """ Retrieves the list of on demand feature views from the registry. Returns: A list of on demand feature views. """ return self._registry.list_on_demand_feature_views(self.project) @log_exceptions_and_usage def get_entity(self, name: str) -> Entity: """ Retrieves an entity. Args: name: Name of entity. Returns: The specified entity. Raises: EntityNotFoundException: The entity could not be found. """ return self._registry.get_entity(name, self.project) @log_exceptions_and_usage def get_feature_service(self, name: str) -> FeatureService: """ Retrieves a feature service. Args: name: Name of feature service. Returns: The specified feature service. Raises: FeatureServiceNotFoundException: The feature service could not be found. """ return self._registry.get_feature_service(name, self.project) @log_exceptions_and_usage def get_feature_view(self, name: str) -> FeatureView: """ Retrieves a feature view. Args: name: Name of feature view. Returns: The specified feature view. Raises: FeatureViewNotFoundException: The feature view could not be found. """ return self._get_feature_view(name) def _get_feature_view(self, name: str, hide_dummy_entity: bool = True) -> FeatureView: feature_view = self._registry.get_feature_view(name, self.project) if hide_dummy_entity and feature_view.entities[0] == DUMMY_ENTITY_NAME: feature_view.entities = [] return feature_view @log_exceptions_and_usage def get_on_demand_feature_view(self, name: str) -> OnDemandFeatureView: """ Retrieves a feature view. Args: name: Name of feature view. Returns: The specified feature view. Raises: FeatureViewNotFoundException: The feature view could not be found. """ return self._registry.get_on_demand_feature_view(name, self.project) @log_exceptions_and_usage def delete_feature_view(self, name: str): """ Deletes a feature view. Args: name: Name of feature view. Raises: FeatureViewNotFoundException: The feature view could not be found. """ return self._registry.delete_feature_view(name, self.project) @log_exceptions_and_usage def delete_feature_service(self, name: str): """ Deletes a feature service. Args: name: Name of feature service. Raises: FeatureServiceNotFoundException: The feature view could not be found. """ return self._registry.delete_feature_service(name, self.project) def _get_features( self, features: Optional[Union[List[str], FeatureService]], feature_refs: Optional[List[str]], ) -> List[str]: _features = features or feature_refs if not _features: raise ValueError("No features specified for retrieval") _feature_refs: List[str] if isinstance(_features, FeatureService): # Get the latest value of the feature service, in case the object passed in has been updated underneath us. 
            _feature_refs = _get_feature_refs_from_feature_services(
                self.get_feature_service(_features.name)
            )
        else:
            _feature_refs = _features
        return _feature_refs

    @log_exceptions_and_usage
    def apply(
        self,
        objects: Union[
            Entity,
            FeatureView,
            OnDemandFeatureView,
            FeatureService,
            List[Union[FeatureView, OnDemandFeatureView, Entity, FeatureService]],
        ],
        commit: bool = True,
    ):
        """Register objects to metadata store and update related infrastructure.

        The apply method registers one or more definitions (e.g., Entity, FeatureView)
        and registers or updates these objects in the Feast registry. Once the registry
        has been updated, the apply method will update related infrastructure (e.g.,
        create tables in an online store) in order to reflect these new definitions.
        All operations are idempotent, meaning they can safely be rerun.

        Args:
            objects: A single object, or a list of objects that should be registered
                with the Feature Store.
            commit: whether to commit changes to the registry

        Raises:
            ValueError: The 'objects' parameter could not be parsed properly.

        Examples:
            Register an Entity and a FeatureView.

            >>> from feast import FeatureStore, Entity, FeatureView, Feature, ValueType, FileSource, RepoConfig
            >>> from datetime import timedelta
            >>> fs = FeatureStore(repo_path="feature_repo")
            >>> driver = Entity(name="driver_id", value_type=ValueType.INT64, description="driver id")
            >>> driver_hourly_stats = FileSource(
            ...     path="feature_repo/data/driver_stats.parquet",
            ...     event_timestamp_column="event_timestamp",
            ...     created_timestamp_column="created",
            ... )
            >>> driver_hourly_stats_view = FeatureView(
            ...     name="driver_hourly_stats",
            ...     entities=["driver_id"],
            ...     ttl=timedelta(seconds=86400 * 1),
            ...     batch_source=driver_hourly_stats,
            ... )
            >>> fs.apply([driver_hourly_stats_view, driver])  # register entity and feature view
        """
        # TODO: Add locking
        if not isinstance(objects, Iterable):
            objects = [objects]

        assert isinstance(objects, list)

        views_to_update = [ob for ob in objects if isinstance(ob, FeatureView)]
        odfvs_to_update = [
            ob for ob in objects if isinstance(ob, OnDemandFeatureView)
        ]
        if (
            not flags_helper.enable_on_demand_feature_views(self.config)
            and len(odfvs_to_update) > 0
        ):
            raise ExperimentalFeatureNotEnabled(flags.FLAG_ON_DEMAND_TRANSFORM_NAME)
        if len(odfvs_to_update) > 0:
            log_event(UsageEvent.APPLY_WITH_ODFV)

        _validate_feature_views(views_to_update)
        entities_to_update = [ob for ob in objects if isinstance(ob, Entity)]
        services_to_update = [
            ob for ob in objects if isinstance(ob, FeatureService)
        ]

        # Make inferences
        update_entities_with_inferred_types_from_feature_views(
            entities_to_update, views_to_update, self.config
        )
        update_data_sources_with_inferred_event_timestamp_col(
            [view.batch_source for view in views_to_update], self.config
        )
        for view in views_to_update:
            view.infer_features_from_batch_source(self.config)
        for odfv in odfvs_to_update:
            odfv.infer_features()

        if len(views_to_update) + len(entities_to_update) + len(
            services_to_update
        ) + len(odfvs_to_update) != len(objects):
            raise ValueError("Unknown object type provided as part of apply() call")

        # DUMMY_ENTITY is a placeholder entity used in entityless FeatureViews
        DUMMY_ENTITY = Entity(
            name=DUMMY_ENTITY_NAME,
            join_key=DUMMY_ENTITY_ID,
            value_type=ValueType.INT32,
        )
        entities_to_update.append(DUMMY_ENTITY)

        for view in views_to_update:
            self._registry.apply_feature_view(
                view, project=self.project, commit=False
            )
        for odfv in odfvs_to_update:
            self._registry.apply_on_demand_feature_view(
                odfv, project=self.project, commit=False
            )
        for ent in entities_to_update:
            self._registry.apply_entity(ent, project=self.project, commit=False)
        for feature_service in services_to_update:
            self._registry.apply_feature_service(
                feature_service, project=self.project
            )

        self._get_provider().update_infra(
            project=self.project,
            tables_to_delete=[],
            tables_to_keep=views_to_update,
            entities_to_delete=[],
            entities_to_keep=entities_to_update,
            partial=True,
        )

        if commit:
            self._registry.commit()

    @log_exceptions_and_usage
    def teardown(self):
        """Tears down all local and cloud resources for the feature store."""
        tables: List[Union[FeatureView, FeatureTable]] = []
        feature_views = self.list_feature_views()
        feature_tables = self._registry.list_feature_tables(self.project)

        tables.extend(feature_views)
        tables.extend(feature_tables)

        entities = self.list_entities()

        self._get_provider().teardown_infra(self.project, tables, entities)
        self._registry.teardown()

    @log_exceptions_and_usage
    def get_historical_features(
        self,
        entity_df: Union[pd.DataFrame, str],
        features: Optional[Union[List[str], FeatureService]] = None,
        feature_refs: Optional[List[str]] = None,
        full_feature_names: bool = False,
    ) -> RetrievalJob:
        """Enrich an entity dataframe with historical feature values for either training or batch scoring.

        This method joins historical feature data from one or more feature views to an entity dataframe
        by using a time travel join.

        Each feature view is joined to the entity dataframe using all entities configured for the
        respective feature view. All configured entities must be available in the entity dataframe.
        Therefore, the entity dataframe must contain all entities found in all feature views, but the
        individual feature views can have different entities.

        Time travel is based on the configured TTL for each feature view. A shorter TTL will limit the
        amount of scanning that will be done in order to find feature data for a specific entity key.
        Setting a short TTL may result in null values being returned.

        Args:
            entity_df (Union[pd.DataFrame, str]): An entity dataframe is a collection of rows containing all entity
                columns (e.g., customer_id, driver_id) on which features need to be joined, as well as an
                event_timestamp column used to ensure point-in-time correctness. Either a Pandas DataFrame can be
                provided or a string SQL query. The query must be of a format supported by the configured offline
                store (e.g., BigQuery).
            features: A list of features that should be retrieved from the offline store. Either a list of string
                feature references can be provided or a FeatureService object. Feature references are of the format
                "feature_view:feature", e.g., "customer_fv:daily_transactions".
            full_feature_names: A boolean that provides the option to add the feature view prefixes to the feature names,
                changing them from the format "feature" to "feature_view__feature" (e.g., "daily_transactions" changes to
                "customer_fv__daily_transactions"). By default, this value is set to False.

        Returns:
            RetrievalJob which can be used to materialize the results.

        Raises:
            ValueError: Both or neither of features and feature_refs are specified.

        Examples:
            Retrieve historical features from a local offline store.

            >>> from feast import FeatureStore, RepoConfig
            >>> import pandas as pd
            >>> fs = FeatureStore(repo_path="feature_repo")
            >>> entity_df = pd.DataFrame.from_dict(
            ...     {
            ...         "driver_id": [1001, 1002],
            ...         "event_timestamp": [
            ...             datetime(2021, 4, 12, 10, 59, 42),
            ...             datetime(2021, 4, 12, 8, 12, 10),
            ...         ],
            ...     }
            ... )
            >>> retrieval_job = fs.get_historical_features(
            ...     entity_df=entity_df,
            ...     features=[
"driver_hourly_stats:conv_rate", ... "driver_hourly_stats:acc_rate", ... "driver_hourly_stats:avg_daily_trips", ... ], ... ) >>> feature_data = retrieval_job.to_df() """ if (features is not None and feature_refs is not None) or (features is None and feature_refs is None): raise ValueError( "You must specify exactly one of features and feature_refs.") if feature_refs: warnings.warn( ("The argument 'feature_refs' is being deprecated. Please use 'features' " "instead. Feast 0.13 and onwards will not support the argument 'feature_refs'." ), DeprecationWarning, ) _feature_refs = self._get_features(features, feature_refs) all_feature_views = self.list_feature_views() all_on_demand_feature_views = self._registry.list_on_demand_feature_views( project=self.project) # TODO(achal): _group_feature_refs returns the on demand feature views, but it's no passed into the provider. # This is a weird interface quirk - we should revisit the `get_historical_features` to # pass in the on demand feature views as well. fvs, odfvs = _group_feature_refs(_feature_refs, all_feature_views, all_on_demand_feature_views) feature_views = list(view for view, _ in fvs) on_demand_feature_views = list(view for view, _ in odfvs) if len(on_demand_feature_views) > 0: log_event(UsageEvent.GET_HISTORICAL_FEATURES_WITH_ODFV) # Check that the right request data is present in the entity_df if type(entity_df) == pd.DataFrame: entity_pd_df = cast(pd.DataFrame, entity_df) for odfv in on_demand_feature_views: odfv_inputs = odfv.inputs.values() for odfv_input in odfv_inputs: if type(odfv_input) == RequestDataSource: request_data_source = cast(RequestDataSource, odfv_input) for feature_name in request_data_source.schema.keys(): if feature_name not in entity_pd_df.columns: raise RequestDataNotFoundInEntityDfException( feature_name=feature_name, feature_view_name=odfv.name, ) _validate_feature_refs(_feature_refs, full_feature_names) provider = self._get_provider() job = provider.get_historical_features( self.config, feature_views, _feature_refs, entity_df, self._registry, self.project, full_feature_names, ) return job @log_exceptions_and_usage def materialize_incremental( self, end_date: datetime, feature_views: Optional[List[str]] = None, ) -> None: """ Materialize incremental new data from the offline store into the online store. This method loads incremental new feature data up to the specified end time from either the specified feature views, or all feature views if none are specified, into the online store where it is available for online serving. The start time of the interval materialized is either the most recent end time of a prior materialization or (now - ttl) if no such prior materialization exists. Args: end_date (datetime): End date for time range of data to materialize into the online store feature_views (List[str]): Optional list of feature view names. If selected, will only run materialization for the specified feature views. Raises: Exception: A feature view being materialized does not have a TTL set. Examples: Materialize all features into the online store up to 5 minutes ago. >>> from feast import FeatureStore, RepoConfig >>> from datetime import datetime, timedelta >>> fs = FeatureStore(repo_path="feature_repo") >>> fs.materialize_incremental(end_date=datetime.utcnow() - timedelta(minutes=5)) Materializing... <BLANKLINE> ... 
""" feature_views_to_materialize = [] if feature_views is None: feature_views_to_materialize = self._list_feature_views( hide_dummy_entity=False) else: for name in feature_views: feature_view = self._get_feature_view(name, hide_dummy_entity=False) feature_views_to_materialize.append(feature_view) _print_materialization_log( None, end_date, len(feature_views_to_materialize), self.config.online_store.type, ) # TODO paging large loads for feature_view in feature_views_to_materialize: start_date = feature_view.most_recent_end_time if start_date is None: if feature_view.ttl is None: raise Exception( f"No start time found for feature view {feature_view.name}. materialize_incremental() requires" f" either a ttl to be set or for materialize() to have been run at least once." ) start_date = datetime.utcnow() - feature_view.ttl provider = self._get_provider() print( f"{Style.BRIGHT + Fore.GREEN}{feature_view.name}{Style.RESET_ALL}" f" from {Style.BRIGHT + Fore.GREEN}{start_date.replace(microsecond=0).astimezone()}{Style.RESET_ALL}" f" to {Style.BRIGHT + Fore.GREEN}{end_date.replace(microsecond=0).astimezone()}{Style.RESET_ALL}:" ) def tqdm_builder(length): return tqdm(total=length, ncols=100) start_date = utils.make_tzaware(start_date) end_date = utils.make_tzaware(end_date) provider.materialize_single_feature_view( config=self.config, feature_view=feature_view, start_date=start_date, end_date=end_date, registry=self._registry, project=self.project, tqdm_builder=tqdm_builder, ) self._registry.apply_materialization(feature_view, self.project, start_date, end_date) @log_exceptions_and_usage def materialize( self, start_date: datetime, end_date: datetime, feature_views: Optional[List[str]] = None, ) -> None: """ Materialize data from the offline store into the online store. This method loads feature data in the specified interval from either the specified feature views, or all feature views if none are specified, into the online store where it is available for online serving. Args: start_date (datetime): Start date for time range of data to materialize into the online store end_date (datetime): End date for time range of data to materialize into the online store feature_views (List[str]): Optional list of feature view names. If selected, will only run materialization for the specified feature views. Examples: Materialize all features into the online store over the interval from 3 hours ago to 10 minutes ago. >>> from feast import FeatureStore, RepoConfig >>> from datetime import datetime, timedelta >>> fs = FeatureStore(repo_path="feature_repo") >>> fs.materialize( ... start_date=datetime.utcnow() - timedelta(hours=3), end_date=datetime.utcnow() - timedelta(minutes=10) ... ) Materializing... <BLANKLINE> ... """ if utils.make_tzaware(start_date) > utils.make_tzaware(end_date): raise ValueError( f"The given start_date {start_date} is greater than the given end_date {end_date}." 
            )

        feature_views_to_materialize = []
        if feature_views is None:
            feature_views_to_materialize = self._list_feature_views(
                hide_dummy_entity=False
            )
        else:
            for name in feature_views:
                feature_view = self._get_feature_view(name, hide_dummy_entity=False)
                feature_views_to_materialize.append(feature_view)

        _print_materialization_log(
            start_date,
            end_date,
            len(feature_views_to_materialize),
            self.config.online_store.type,
        )
        # TODO paging large loads
        for feature_view in feature_views_to_materialize:
            provider = self._get_provider()
            print(f"{Style.BRIGHT + Fore.GREEN}{feature_view.name}{Style.RESET_ALL}:")

            def tqdm_builder(length):
                return tqdm(total=length, ncols=100)

            start_date = utils.make_tzaware(start_date)
            end_date = utils.make_tzaware(end_date)

            provider.materialize_single_feature_view(
                config=self.config,
                feature_view=feature_view,
                start_date=start_date,
                end_date=end_date,
                registry=self._registry,
                project=self.project,
                tqdm_builder=tqdm_builder,
            )

            self._registry.apply_materialization(
                feature_view, self.project, start_date, end_date
            )

    @log_exceptions_and_usage
    def get_online_features(
        self,
        features: Union[List[str], FeatureService],
        entity_rows: List[Dict[str, Any]],
        feature_refs: Optional[List[str]] = None,
        full_feature_names: bool = False,
    ) -> OnlineResponse:
        """
        Retrieves the latest online feature data.

        Note: This method will download the full feature registry the first time it is run. If you are using a
        remote registry like GCS or S3 then that may take a few seconds. The registry remains cached up to a TTL
        duration (which can be set to infinity). If the cached registry is stale (more time than the TTL has
        passed), then a new registry will be downloaded synchronously by this method. This download may
        introduce latency to online feature retrieval. In order to avoid synchronous downloads, please call
        refresh_registry() prior to the TTL being reached. Remember it is possible to set the cache TTL to
        infinity (cache forever).

        Args:
            features: List of feature references that will be returned for each entity.
                Each feature reference should have the following format:
                "feature_table:feature" where "feature_table" and "feature" refer to
                the feature table and feature names respectively.
                Only the feature name is required.
            entity_rows: A list of dictionaries where each key-value is an entity-name,
                entity-value pair.

        Returns:
            OnlineResponse containing the feature data in records.

        Raises:
            Exception: No entity with the specified name exists.

        Examples:
            Retrieve online features from the online store.

            >>> from feast import FeatureStore, RepoConfig
            >>> fs = FeatureStore(repo_path="feature_repo")
            >>> online_response = fs.get_online_features(
            ...     features=[
            ...         "driver_hourly_stats:conv_rate",
            ...         "driver_hourly_stats:acc_rate",
            ...         "driver_hourly_stats:avg_daily_trips",
            ...     ],
            ...     entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}, {"driver_id": 1003}, {"driver_id": 1004}],
            ... )
            >>> online_response_dict = online_response.to_dict()
        """
        _feature_refs = self._get_features(features, feature_refs)

        all_feature_views = self._list_feature_views(
            allow_cache=True, hide_dummy_entity=False
        )
        all_on_demand_feature_views = self._registry.list_on_demand_feature_views(
            project=self.project, allow_cache=True
        )

        _validate_feature_refs(_feature_refs, full_feature_names)
        grouped_refs, grouped_odfv_refs = _group_feature_refs(
            _feature_refs, all_feature_views, all_on_demand_feature_views
        )
        if len(grouped_odfv_refs) > 0:
            log_event(UsageEvent.GET_ONLINE_FEATURES_WITH_ODFV)

        feature_views = list(view for view, _ in grouped_refs)
        entityless_case = DUMMY_ENTITY_NAME in [
            entity_name
            for feature_view in feature_views
            for entity_name in feature_view.entities
        ]

        provider = self._get_provider()
        entities = self._list_entities(allow_cache=True, hide_dummy_entity=False)
        entity_name_to_join_key_map = {}
        for entity in entities:
            entity_name_to_join_key_map[entity.name] = entity.join_key

        needed_request_data_features = self._get_needed_request_data_features(
            grouped_odfv_refs
        )

        join_key_rows = []
        request_data_features: Dict[str, List[Any]] = {}
        # Entity rows may be either entities or request data.
        for row in entity_rows:
            join_key_row = {}
            for entity_name, entity_value in row.items():
                # Found request data
                if entity_name in needed_request_data_features:
                    if entity_name not in request_data_features:
                        request_data_features[entity_name] = []
                    request_data_features[entity_name].append(entity_value)
                    continue
                try:
                    join_key = entity_name_to_join_key_map[entity_name]
                except KeyError:
                    raise EntityNotFoundException(entity_name, self.project)
                join_key_row[join_key] = entity_value
                if entityless_case:
                    join_key_row[DUMMY_ENTITY_ID] = DUMMY_ENTITY_VAL
            if len(join_key_row) > 0:
                # May be empty if this entity row was request data
                join_key_rows.append(join_key_row)

        if len(needed_request_data_features) != len(request_data_features.keys()):
            raise RequestDataNotFoundInEntityRowsException(
                feature_names=needed_request_data_features
            )

        entity_row_proto_list = _infer_online_entity_rows(join_key_rows)

        union_of_entity_keys: List[EntityKeyProto] = []
        result_rows: List[GetOnlineFeaturesResponse.FieldValues] = []

        for entity_row_proto in entity_row_proto_list:
            # Create a list of entity keys to filter down for each feature view at lookup time.
            union_of_entity_keys.append(_entity_row_to_key(entity_row_proto))
            # Also create entity values to append to the result
            result_rows.append(_entity_row_to_field_values(entity_row_proto))

        # Add more feature values to the existing result rows for the request data features
        for feature_name, feature_values in request_data_features.items():
            for row_idx, feature_value in enumerate(feature_values):
                result_row = result_rows[row_idx]
                result_row.fields[feature_name].CopyFrom(
                    python_value_to_proto_value(feature_value)
                )
                result_row.statuses[
                    feature_name
                ] = GetOnlineFeaturesResponse.FieldStatus.PRESENT

        for table, requested_features in grouped_refs:
            self._populate_result_rows_from_feature_view(
                entity_name_to_join_key_map,
                full_feature_names,
                provider,
                requested_features,
                result_rows,
                table,
                union_of_entity_keys,
            )

        initial_response = OnlineResponse(
            GetOnlineFeaturesResponse(field_values=result_rows)
        )
        return self._augment_response_with_on_demand_transforms(
            _feature_refs, full_feature_names, initial_response, result_rows
        )

    def _populate_result_rows_from_feature_view(
        self,
        entity_name_to_join_key_map: Dict[str, str],
        full_feature_names: bool,
        provider: Provider,
        requested_features: List[str],
        result_rows: List[GetOnlineFeaturesResponse.FieldValues],
        table: FeatureView,
        union_of_entity_keys: List[EntityKeyProto],
    ):
        entity_keys = _get_table_entity_keys(
            table, union_of_entity_keys, entity_name_to_join_key_map
        )
        read_rows = provider.online_read(
            config=self.config,
            table=table,
            entity_keys=entity_keys,
            requested_features=requested_features,
        )
        # Each row is a set of features for a given entity key
        for row_idx, read_row in enumerate(read_rows):
            row_ts, feature_data = read_row
            result_row = result_rows[row_idx]

            if feature_data is None:
                for feature_name in requested_features:
                    feature_ref = (
                        f"{table.name}__{feature_name}"
                        if full_feature_names
                        else feature_name
                    )
                    result_row.statuses[
                        feature_ref
                    ] = GetOnlineFeaturesResponse.FieldStatus.NOT_FOUND
            else:
                for feature_name in feature_data:
                    feature_ref = (
                        f"{table.name}__{feature_name}"
                        if full_feature_names
                        else feature_name
                    )
                    if feature_name in requested_features:
                        result_row.fields[feature_ref].CopyFrom(
                            feature_data[feature_name]
                        )
                        result_row.statuses[
                            feature_ref
                        ] = GetOnlineFeaturesResponse.FieldStatus.PRESENT

    def _get_needed_request_data_features(self, grouped_odfv_refs) -> Set[str]:
        needed_request_data_features = set()
        for odfv_to_feature_names in grouped_odfv_refs:
            odfv, requested_feature_names = odfv_to_feature_names
            odfv_inputs = odfv.inputs.values()
            for odfv_input in odfv_inputs:
                if type(odfv_input) == RequestDataSource:
                    request_data_source = cast(RequestDataSource, odfv_input)
                    for feature_name in request_data_source.schema.keys():
                        needed_request_data_features.add(feature_name)
        return needed_request_data_features

    def _augment_response_with_on_demand_transforms(
        self,
        feature_refs: List[str],
        full_feature_names: bool,
        initial_response: OnlineResponse,
        result_rows: List[GetOnlineFeaturesResponse.FieldValues],
    ) -> OnlineResponse:
        all_on_demand_feature_views = {
            view.name: view
            for view in self._registry.list_on_demand_feature_views(
                project=self.project, allow_cache=True
            )
        }
        all_odfv_feature_names = all_on_demand_feature_views.keys()

        if len(all_on_demand_feature_views) == 0:
            return initial_response
        initial_response_df = initial_response.to_df()

        odfv_feature_refs = defaultdict(list)
        for feature_ref in feature_refs:
            view_name, feature_name = feature_ref.split(":")
            if view_name in all_odfv_feature_names:
                odfv_feature_refs[view_name].append(feature_name)

        # Apply on demand transformations
        for odfv_name, _feature_refs in odfv_feature_refs.items():
            odfv = all_on_demand_feature_views[odfv_name]
            transformed_features_df = odfv.get_transformed_features_df(
                full_feature_names, initial_response_df
            )
            for row_idx in range(len(result_rows)):
                result_row = result_rows[row_idx]

                selected_subset = [
                    f for f in transformed_features_df.columns if f in _feature_refs
                ]

                for transformed_feature in selected_subset:
                    transformed_feature_name = (
                        f"{odfv.name}__{transformed_feature}"
                        if full_feature_names
                        else transformed_feature
                    )
                    proto_value = python_value_to_proto_value(
                        transformed_features_df[transformed_feature].values[row_idx]
                    )
                    result_row.fields[transformed_feature_name].CopyFrom(proto_value)
                    result_row.statuses[
                        transformed_feature_name
                    ] = GetOnlineFeaturesResponse.FieldStatus.PRESENT

        return OnlineResponse(GetOnlineFeaturesResponse(field_values=result_rows))

    @log_exceptions_and_usage
    def serve(self, port: int) -> None:
        """Start the feature consumption server locally on a given port."""
        if not flags_helper.enable_python_feature_server(self.config):
            raise ExperimentalFeatureNotEnabled(
                flags.FLAG_PYTHON_FEATURE_SERVER_NAME
            )

        feature_server.start_server(self, port)
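# A sketch of end-to-end usage of the class above, assuming a local
# "feature_repo" containing the driver entity and driver_hourly_stats view from
# the docstring examples (doctest-style, not executed here):
#
#     >>> from datetime import datetime
#     >>> fs = FeatureStore(repo_path="feature_repo")
#     >>> fs.materialize_incremental(end_date=datetime.utcnow())  # doctest: +SKIP
#     >>> fs.get_online_features(
#     ...     features=["driver_hourly_stats:conv_rate"],
#     ...     entity_rows=[{"driver_id": 1001}],
#     ... ).to_dict()  # doctest: +SKIP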
def get_feature_view_query_context(
    feature_refs: List[str],
    feature_views: List[FeatureView],
    registry: Registry,
    project: str,
    entity_df_timestamp_range: Tuple[datetime, datetime],
) -> List[FeatureViewQueryContext]:
    """
    Build a query context containing all information required to template
    a BigQuery and Redshift point-in-time SQL query
    """
    (
        feature_views_to_feature_map,
        on_demand_feature_views_to_features,
    ) = _get_requested_feature_views_to_features_dict(
        feature_refs, feature_views, registry.list_on_demand_feature_views(project)
    )

    query_context = []
    for feature_view, features in feature_views_to_feature_map.items():
        join_keys, entity_selections = [], []
        for entity_name in feature_view.entities:
            entity = registry.get_entity(entity_name, project)
            join_key = feature_view.projection.join_key_map.get(
                entity.join_key, entity.join_key
            )
            join_keys.append(join_key)
            entity_selections.append(f"{entity.join_key} AS {join_key}")

        if isinstance(feature_view.ttl, timedelta):
            ttl_seconds = int(feature_view.ttl.total_seconds())
        else:
            ttl_seconds = 0

        reverse_field_mapping = {
            v: k for k, v in feature_view.batch_source.field_mapping.items()
        }
        features = [
            reverse_field_mapping.get(feature, feature) for feature in features
        ]
        timestamp_field = reverse_field_mapping.get(
            feature_view.batch_source.timestamp_field,
            feature_view.batch_source.timestamp_field,
        )
        created_timestamp_column = reverse_field_mapping.get(
            feature_view.batch_source.created_timestamp_column,
            feature_view.batch_source.created_timestamp_column,
        )

        max_event_timestamp = to_naive_utc(entity_df_timestamp_range[1]).isoformat()
        min_event_timestamp = None
        if feature_view.ttl:
            min_event_timestamp = to_naive_utc(
                entity_df_timestamp_range[0] - feature_view.ttl
            ).isoformat()

        context = FeatureViewQueryContext(
            name=feature_view.projection.name_to_use(),
            ttl=ttl_seconds,
            entities=join_keys,
            features=features,
            field_mapping=feature_view.batch_source.field_mapping,
            event_timestamp_column=timestamp_field,
            created_timestamp_column=created_timestamp_column,
            # TODO: Make created column optional and not hardcoded
            table_subquery=feature_view.batch_source.get_table_query_string(),
            entity_selections=entity_selections,
            min_event_timestamp=min_event_timestamp,
            max_event_timestamp=max_event_timestamp,
        )
        query_context.append(context)
    return query_context
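# A small sketch of the scan-window arithmetic above: with an entity dataframe
# spanning [t0, t1] and a view TTL, the templated query only needs feature rows
# in [t0 - ttl, t1] (the times here are arbitrary examples):
from datetime import datetime, timedelta

_t0, _t1 = datetime(2021, 4, 12, 8, 0), datetime(2021, 4, 12, 10, 0)
_ttl = timedelta(hours=1)
_min_event_timestamp = (_t0 - _ttl).isoformat()  # '2021-04-12T07:00:00'
_max_event_timestamp = _t1.isoformat()           # '2021-04-12T10:00:00'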
def get_historical_features(
    config: RepoConfig,
    feature_views: List[FeatureView],
    feature_refs: List[str],
    entity_df: Union[pd.DataFrame, str],
    registry: Registry,
    project: str,
    full_feature_names: bool = False,
) -> RetrievalJob:
    if not isinstance(entity_df, pd.DataFrame) and not isinstance(
        entity_df, dd.DataFrame
    ):
        raise ValueError(
            f"Please provide an entity_df of type pd.DataFrame or dd.DataFrame instead of type {type(entity_df)}"
        )

    entity_df_event_timestamp_col = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL  # local modifiable copy of global variable
    if entity_df_event_timestamp_col not in entity_df.columns:
        datetime_columns = entity_df.select_dtypes(
            include=["datetime", "datetimetz"]
        ).columns
        if len(datetime_columns) == 1:
            print(
                f"Using {datetime_columns[0]} as the event timestamp. To specify a column explicitly, please name it {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL}."
            )
            entity_df_event_timestamp_col = datetime_columns[0]
        else:
            raise ValueError(
                f"Please provide an entity_df with a column named {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL} representing the time of events."
            )

    (
        feature_views_to_features,
        on_demand_feature_views_to_features,
    ) = _get_requested_feature_views_to_features_dict(
        feature_refs,
        feature_views,
        registry.list_on_demand_feature_views(config.project),
    )

    entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range(
        entity_df, entity_df_event_timestamp_col
    )

    # Create lazy function that is only called from the RetrievalJob object
    def evaluate_historical_retrieval():
        # Create a copy of entity_df to prevent modifying the original
        entity_df_with_features = entity_df.copy()

        entity_df_event_timestamp_col_type = entity_df_with_features.dtypes[
            entity_df_event_timestamp_col
        ]
        if (
            not hasattr(entity_df_event_timestamp_col_type, "tz")
            or entity_df_event_timestamp_col_type.tz != pytz.UTC
        ):
            # Make sure all event timestamp fields are tz-aware.
            # We default tz-naive fields to UTC
            entity_df_with_features[
                entity_df_event_timestamp_col
            ] = entity_df_with_features[entity_df_event_timestamp_col].apply(
                lambda x: x if x.tzinfo is not None else x.replace(tzinfo=pytz.utc)
            )

        # Convert event timestamp column to datetime and normalize time zone to UTC
        # This is necessary to avoid issues with pd.merge_asof
        if isinstance(entity_df_with_features, dd.DataFrame):
            entity_df_with_features[
                entity_df_event_timestamp_col
            ] = dd.to_datetime(
                entity_df_with_features[entity_df_event_timestamp_col], utc=True
            )
        else:
            entity_df_with_features[
                entity_df_event_timestamp_col
            ] = pd.to_datetime(
                entity_df_with_features[entity_df_event_timestamp_col], utc=True
            )

        # Sort event timestamp values
        entity_df_with_features = entity_df_with_features.sort_values(
            entity_df_event_timestamp_col
        )

        join_keys = []
        all_join_keys = []

        # Load feature view data from sources and join them incrementally
        for feature_view, features in feature_views_to_features.items():
            event_timestamp_column = feature_view.batch_source.timestamp_field
            created_timestamp_column = (
                feature_view.batch_source.created_timestamp_column
            )

            # Build a list of entity columns to join on (from the right table)
            join_keys = []
            for entity_name in feature_view.entities:
                entity = registry.get_entity(entity_name, project)
                join_key = feature_view.projection.join_key_map.get(
                    entity.join_key, entity.join_key
                )
                join_keys.append(join_key)

            right_entity_key_columns = [
                event_timestamp_column,
                created_timestamp_column,
            ] + join_keys
            right_entity_key_columns = [c for c in right_entity_key_columns if c]

            all_join_keys = list(set(all_join_keys + join_keys))

            df_to_join = _read_datasource(feature_view.batch_source)

            df_to_join, event_timestamp_column = _field_mapping(
                df_to_join,
                feature_view,
                features,
                right_entity_key_columns,
                entity_df_event_timestamp_col,
                event_timestamp_column,
                full_feature_names,
            )

            df_to_join = _merge(entity_df_with_features, df_to_join, join_keys)

            df_to_join = _normalize_timestamp(
                df_to_join, event_timestamp_column, created_timestamp_column
            )

            df_to_join = _filter_ttl(
                df_to_join,
                feature_view,
                entity_df_event_timestamp_col,
                event_timestamp_column,
            )

            df_to_join = _drop_duplicates(
                df_to_join,
                all_join_keys,
                event_timestamp_column,
                created_timestamp_column,
                entity_df_event_timestamp_col,
            )

            entity_df_with_features = _drop_columns(
                df_to_join, event_timestamp_column, created_timestamp_column
            )

            # Ensure that we delete dataframes to free up memory
            del df_to_join

        return entity_df_with_features.persist()

    job = FileRetrievalJob(
        evaluation_function=evaluate_historical_retrieval,
        full_feature_names=full_feature_names,
        on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs(
            feature_refs, project, registry
        ),
        metadata=RetrievalMetadata(
            features=feature_refs,
            keys=list(set(entity_df.columns) - {entity_df_event_timestamp_col}),
            min_event_timestamp=entity_df_event_timestamp_range[0],
            max_event_timestamp=entity_df_event_timestamp_range[1],
        ),
    )
    return job
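# A sketch of supplying the entity dataframe as a dask frame, which the
# isinstance check above admits alongside pandas (hypothetical data):
import pandas as pd
import dask.dataframe as dd

_pdf = pd.DataFrame({
    "driver_id": [1001, 1002],
    "event_timestamp": pd.to_datetime(
        ["2021-04-12 10:00", "2021-04-12 11:00"], utc=True
    ),
})
_ddf = dd.from_pandas(_pdf, npartitions=2)  # passes the entity_df type check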