def _connect_core(self, skip_if_connected: bool = True):
    """
    Connect to the Feast Core gRPC API.

    Args:
        skip_if_connected: Do not attempt to connect if already connected
    """
    # Already have a live stub and caller allows reuse: nothing to do.
    if skip_if_connected and self._core_service_stub:
        return
    if not self.core_url:
        raise ValueError("Please set Feast Core URL.")

    if self.__core_channel is None:
        # TLS is used when explicitly enabled or when the URL targets :443.
        wants_tls = self.core_secure or self.core_url.endswith(":443")
        self.__core_channel = (
            grpc.secure_channel(self.core_url, grpc.ssl_channel_credentials())
            if wants_tls
            else grpc.insecure_channel(self.core_url)
        )

    connect_timeout = self._config.getint(CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY)
    try:
        grpc.channel_ready_future(self.__core_channel).result(timeout=connect_timeout)
    except grpc.FutureTimeoutError:
        raise ConnectionError(
            f"Connection timed out while attempting to connect to Feast "
            f"Core gRPC server {self.core_url} ")
    # Only reached when the channel became ready in time.
    self._core_service_stub = CoreServiceStub(self.__core_channel)
def _connect_core(self, skip_if_connected: bool = True):
    """
    Connect to Core API

    Args:
        skip_if_connected: Do not attempt to connect if already connected

    Raises:
        ValueError: If no Feast Core URL has been configured.
        ConnectionError: If the channel is not ready within the default
            gRPC connection timeout.
    """
    if skip_if_connected and self._core_service_stub:
        return

    if not self.core_url:
        raise ValueError("Please set Feast Core URL.")

    if self.__core_channel is None:
        self.__core_channel = grpc.insecure_channel(self.core_url)

    try:
        grpc.channel_ready_future(self.__core_channel).result(
            timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT)
    except grpc.FutureTimeoutError:
        # Raise instead of print + sys.exit(1): killing the host process from
        # library code is hostile to callers, and the sibling connect helpers
        # in this file signal this failure with ConnectionError.
        raise ConnectionError(
            f"Connection timed out while attempting to connect to Feast Core gRPC server {self.core_url}"
        )
    else:
        self._core_service_stub = CoreServiceStub(self.__core_channel)
def _connect_core(self):
    """Connect to core api"""
    # Lazily initialize: the channel (and its stubs) are created only on the
    # first call; later calls are no-ops while the channel exists.
    if self.__core_channel is None:
        self.__core_channel = grpc.insecure_channel(self.core_url)
        # NOTE(review): the stub assignments are assumed to sit inside this
        # branch (created together with the channel) — the original layout
        # was lost in formatting; confirm against upstream.
        self._core_service_stub = CoreServiceStub(self.__core_channel)
        self._job_service_stub = JobServiceStub(self.__core_channel)
        self._dataset_service_stub = DatasetServiceStub(
            self.__core_channel)
def _core_service(self):
    """
    Lazily create (or reuse) the gRPC Feast Core service stub.

    Returns: CoreServiceStub
    """
    # Reuse an existing stub if one was already built.
    if self._core_service_stub:
        return self._core_service_stub

    cfg = self._config
    channel = create_grpc_channel(
        url=cfg.get(CONFIG_CORE_URL_KEY),
        enable_ssl=cfg.getboolean(CONFIG_CORE_ENABLE_SSL_KEY),
        enable_auth=cfg.getboolean(CONFIG_ENABLE_AUTH_KEY),
        ssl_server_cert_path=cfg.get(CONFIG_CORE_SERVER_SSL_CERT_KEY),
        auth_metadata_plugin=self._auth_metadata,
        timeout=cfg.getint(CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
    )
    self._core_service_stub = CoreServiceStub(channel)
    return self._core_service_stub
def __connect_core(self, core_url: str, timeout: int = 5):
    """
    Connect to the Feast Core gRPC API.

    Args:
        core_url: Host and port of the Feast Core gRPC server.
        timeout: Seconds to wait for the channel to become ready.
            Defaults to 5, the value previously hard-coded here.

    Raises:
        ValueError: If core_url is empty.
        ConnectionError: If the channel is not ready within ``timeout``.
    """
    if not core_url:
        raise ValueError("Please set Feast Core URL.")

    if self.__core_channel is None:
        self.__core_channel = grpc.insecure_channel(core_url)

    try:
        grpc.channel_ready_future(self.__core_channel).result(timeout=timeout)
    except grpc.FutureTimeoutError:
        raise ConnectionError(
            "connection timed out while attempting to connect to Feast Core gRPC server "
            + core_url)
    else:
        # Channel became ready: build the stub against it.
        self._core_service_stub = CoreServiceStub(self.__core_channel)
def core_service_stub(self, core_url):
    """
    Build and return a Feast Core service stub for the given endpoint.

    A TLS channel is used when the URL targets port 443; otherwise a
    plaintext channel is created. Blocks until the channel is ready
    (up to self.GRPC_CONNECTION_TIMEOUT seconds) before returning.

    Raises:
        ConnectionError: If the channel does not become ready in time.
    """
    if core_url.endswith(":443"):
        channel = grpc.secure_channel(core_url, grpc.ssl_channel_credentials())
    else:
        channel = grpc.insecure_channel(core_url)

    try:
        grpc.channel_ready_future(channel).result(
            timeout=self.GRPC_CONNECTION_TIMEOUT)
    except grpc.FutureTimeoutError:
        raise ConnectionError(
            f"Connection timed out while attempting to connect to Feast "
            f"Core gRPC server {core_url} ")

    return CoreServiceStub(channel)
class Client:
    """
    Feast Client: Used for creating, managing, and retrieving features.
    """

    def __init__(self, options: Optional[Dict[str, str]] = None, **kwargs):
        """
        The Feast Client should be initialized with at least one service url
        Please see constants.py for configuration options. Commonly used options
        or arguments include:
            core_url: Feast Core URL. Used to manage features
            serving_url: Feast Serving URL. Used to retrieve features
            project: Sets the active project. This field is optional.
            core_secure: Use client-side SSL/TLS for Core gRPC API
            serving_secure: Use client-side SSL/TLS for Serving gRPC API
            enable_auth: Enable authentication and authorization
            auth_provider: Authentication provider – "google" or "oauth"
            if auth_provider is "oauth", the following fields are mandatory –
            oauth_grant_type, oauth_client_id, oauth_client_secret, oauth_audience, oauth_token_request_url

        Args:
            options: Configuration options to initialize client with
            **kwargs: Additional keyword arguments that will be used as
                configuration options along with "options"
        """
        if options is None:
            options = dict()
        self._config = Config(options={**options, **kwargs})

        # gRPC stubs are created lazily by the _core_service /
        # _serving_service properties below.
        self._core_service_stub: Optional[CoreServiceStub] = None
        self._serving_service_stub: Optional[ServingServiceStub] = None
        self._auth_metadata: Optional[grpc.AuthMetadataPlugin] = None

        # Configure Auth Metadata Plugin if auth is enabled
        if self._config.getboolean(CONFIG_ENABLE_AUTH_KEY):
            self._auth_metadata = feast_auth.get_auth_metadata_plugin(self._config)

    @property
    def _core_service(self):
        """
        Creates or returns the gRPC Feast Core Service Stub

        Returns: CoreServiceStub
        """
        if not self._core_service_stub:
            channel = create_grpc_channel(
                url=self._config.get(CONFIG_CORE_URL_KEY),
                enable_ssl=self._config.getboolean(CONFIG_CORE_ENABLE_SSL_KEY),
                enable_auth=self._config.getboolean(CONFIG_ENABLE_AUTH_KEY),
                ssl_server_cert_path=self._config.get(CONFIG_CORE_SERVER_SSL_CERT_KEY),
                auth_metadata_plugin=self._auth_metadata,
                timeout=self._config.getint(CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
            )
            self._core_service_stub = CoreServiceStub(channel)
        return self._core_service_stub

    @property
    def _serving_service(self):
        """
        Creates or returns the gRPC Feast Serving Service Stub

        Returns: ServingServiceStub
        """
        if not self._serving_service_stub:
            channel = create_grpc_channel(
                url=self._config.get(CONFIG_SERVING_URL_KEY),
                enable_ssl=self._config.getboolean(CONFIG_SERVING_ENABLE_SSL_KEY),
                enable_auth=self._config.getboolean(CONFIG_ENABLE_AUTH_KEY),
                ssl_server_cert_path=self._config.get(
                    CONFIG_SERVING_SERVER_SSL_CERT_KEY
                ),
                auth_metadata_plugin=self._auth_metadata,
                timeout=self._config.getint(CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
            )
            self._serving_service_stub = ServingServiceStub(channel)
        return self._serving_service_stub

    @property
    def core_url(self) -> str:
        """
        Retrieve Feast Core URL

        Returns: Feast Core URL string
        """
        return self._config.get(CONFIG_CORE_URL_KEY)

    @core_url.setter
    def core_url(self, value: str):
        """
        Set the Feast Core URL

        Args:
            value: Feast Core URL
        """
        self._config.set(CONFIG_CORE_URL_KEY, value)

    @property
    def serving_url(self) -> str:
        """
        Retrieve Serving Core URL

        Returns: Feast Serving URL string
        """
        return self._config.get(CONFIG_SERVING_URL_KEY)

    @serving_url.setter
    def serving_url(self, value: str):
        """
        Set the Feast Serving URL

        Args:
            value: Feast Serving URL
        """
        self._config.set(CONFIG_SERVING_URL_KEY, value)

    @property
    def core_secure(self) -> bool:
        """
        Retrieve Feast Core client-side SSL/TLS setting

        Returns: Whether client-side SSL/TLS is enabled
        """
        return self._config.getboolean(CONFIG_CORE_ENABLE_SSL_KEY)

    @core_secure.setter
    def core_secure(self, value: bool):
        """
        Set the Feast Core client-side SSL/TLS setting

        Args:
            value: True to enable client-side SSL/TLS
        """
        self._config.set(CONFIG_CORE_ENABLE_SSL_KEY, value)

    @property
    def serving_secure(self) -> bool:
        """
        Retrieve Feast Serving client-side SSL/TLS setting

        Returns: Whether client-side SSL/TLS is enabled
        """
        return self._config.getboolean(CONFIG_SERVING_ENABLE_SSL_KEY)

    @serving_secure.setter
    def serving_secure(self, value: bool):
        """
        Set the Feast Serving client-side SSL/TLS setting

        Args:
            value: True to enable client-side SSL/TLS
        """
        self._config.set(CONFIG_SERVING_ENABLE_SSL_KEY, value)

    def version(self):
        """
        Returns version information from Feast Core and Feast Serving
        """
        import pkg_resources

        result = {
            "sdk": {"version": pkg_resources.get_distribution("feast").version},
            "serving": "not configured",
            "core": "not configured",
        }

        if self.serving_url:
            serving_version = self._serving_service.GetFeastServingInfo(
                GetFeastServingInfoRequest(),
                timeout=self._config.getint(CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
                metadata=self._get_grpc_metadata(),
            ).version
            result["serving"] = {"url": self.serving_url, "version": serving_version}

        if self.core_url:
            core_version = self._core_service.GetFeastCoreVersion(
                GetFeastCoreVersionRequest(),
                timeout=self._config.getint(CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
                metadata=self._get_grpc_metadata(),
            ).version
            result["core"] = {"url": self.core_url, "version": core_version}

        return result

    @property
    def project(self) -> Union[str, None]:
        """
        Retrieve currently active project

        Returns: Project name

        Raises:
            ValueError: If no project has been configured.
        """
        if not self._config.get(CONFIG_PROJECT_KEY):
            raise ValueError("No project has been configured.")
        return self._config.get(CONFIG_PROJECT_KEY)

    def set_project(self, project: Optional[str] = None):
        """
        Set currently active Feast project

        Args:
            project: Project to set as active. If unset, will reset to the default project.
        """
        if project is None:
            project = FEAST_DEFAULT_OPTIONS[CONFIG_PROJECT_KEY]
        self._config.set(CONFIG_PROJECT_KEY, project)

    def list_projects(self) -> List[str]:
        """
        List all active Feast projects

        Returns: List of project names
        """
        response = self._core_service.ListProjects(
            ListProjectsRequest(),
            timeout=self._config.getint(CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
            metadata=self._get_grpc_metadata(),
        )  # type: ListProjectsResponse
        return list(response.projects)

    def create_project(self, project: str):
        """
        Creates a Feast project

        Args:
            project: Name of project
        """
        self._core_service.CreateProject(
            CreateProjectRequest(name=project),
            timeout=self._config.getint(CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
            metadata=self._get_grpc_metadata(),
        )  # type: CreateProjectResponse

    def archive_project(self, project):
        """
        Archives a project. Project will still continue to function for
        ingestion and retrieval, but will be in a read-only state. It will
        also not be visible from the Core API for management purposes.

        Args:
            project: Name of project to archive
        """
        try:
            # Use the _core_service property (not _core_service_stub directly)
            # so the stub is lazily created on first use instead of being None.
            self._core_service.ArchiveProject(
                ArchiveProjectRequest(name=project),
                timeout=self._config.getint(CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
                metadata=self._get_grpc_metadata(),
            )  # type: ArchiveProjectResponse
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())

        # Revert to the default project if the active one was archived.
        # The active project lives in self._config (there is no _project
        # attribute on this class), so read/write it through the config.
        if self._config.get(CONFIG_PROJECT_KEY) == project:
            self._config.set(
                CONFIG_PROJECT_KEY, FEAST_DEFAULT_OPTIONS[CONFIG_PROJECT_KEY]
            )

    def apply_entity(
        self, entities: Union[List[Entity], Entity], project: Optional[str] = None
    ):
        """
        Idempotently registers entities with Feast Core. Either a single
        entity or a list can be provided.

        Args:
            entities: List of entities that will be registered
            project: Feast project to register the entities under; defaults
                to the currently active project.

        Examples:
            >>> from feast import Client
            >>> from feast.entity import Entity
            >>> from feast.value_type import ValueType
            >>>
            >>> feast_client = Client(core_url="localhost:6565")
            >>> entity = Entity(
            >>>     name="driver_entity",
            >>>     description="Driver entity for car rides",
            >>>     value_type=ValueType.STRING,
            >>>     labels={
            >>>         "key": "val"
            >>>     }
            >>> )
            >>> feast_client.apply_entity(entity)
        """
        if project is None:
            project = self.project
        if not isinstance(entities, list):
            entities = [entities]
        for entity in entities:
            if isinstance(entity, Entity):
                self._apply_entity(project, entity)  # type: ignore
                continue
            raise ValueError(f"Could not determine entity type to apply {entity}")

    def _apply_entity(self, project: str, entity: Entity):
        """
        Registers a single entity with Feast

        Args:
            project: Feast project to register the entity under
            entity: Entity that will be registered
        """
        entity.is_valid()
        entity_proto = entity.to_spec_proto()

        # Convert the entity to a request and send to Feast Core
        try:
            apply_entity_response = self._core_service.ApplyEntity(
                ApplyEntityRequest(project=project, spec=entity_proto),  # type: ignore
                timeout=self._config.getint(CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
                metadata=self._get_grpc_metadata(),
            )  # type: ApplyEntityResponse
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())

        # Extract the returned entity
        applied_entity = Entity.from_proto(apply_entity_response.entity)

        # Deep copy from the returned entity to the local entity
        entity._update_from_entity(applied_entity)

    def list_entities(
        self, project: Optional[str] = None, labels: Optional[Dict[str, str]] = None
    ) -> List[Entity]:
        """
        Retrieve a list of entities from Feast Core

        Args:
            project: Filter entities based on project name
            labels: User-defined labels that these entities are associated with

        Returns:
            List of entities
        """
        if project is None:
            project = self.project
        # Avoid a mutable default argument; an empty dict means "no label filter".
        if labels is None:
            labels = dict()

        filter = ListEntitiesRequest.Filter(project=project, labels=labels)

        # Get latest entities from Feast Core
        entity_protos = self._core_service.ListEntities(
            ListEntitiesRequest(filter=filter), metadata=self._get_grpc_metadata(),
        )  # type: ListEntitiesResponse

        # Extract entities and return
        entities = []
        for entity_proto in entity_protos.entities:
            entity = Entity.from_proto(entity_proto)
            entity._client = self
            entities.append(entity)
        return entities

    def get_entity(self, name: str, project: Optional[str] = None) -> Entity:
        """
        Retrieves an entity.

        Args:
            project: Feast project that this entity belongs to
            name: Name of entity

        Returns:
            Returns either the specified entity, or raises an exception if
            none is found
        """
        if project is None:
            project = self.project

        try:
            get_entity_response = self._core_service.GetEntity(
                GetEntityRequest(project=project, name=name.strip()),
                metadata=self._get_grpc_metadata(),
            )  # type: GetEntityResponse
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())
        entity = Entity.from_proto(get_entity_response.entity)

        return entity

    def apply_feature_table(
        self,
        feature_tables: Union[List[FeatureTable], FeatureTable],
        project: Optional[str] = None,
    ):
        """
        Idempotently registers feature tables with Feast Core. Either a
        single feature table or a list can be provided.

        Args:
            feature_tables: List of feature tables that will be registered
            project: Feast project to register the feature tables under;
                defaults to the currently active project.
        """
        if project is None:
            project = self.project
        if not isinstance(feature_tables, list):
            feature_tables = [feature_tables]
        for feature_table in feature_tables:
            if isinstance(feature_table, FeatureTable):
                self._apply_feature_table(project, feature_table)  # type: ignore
                continue
            raise ValueError(
                f"Could not determine feature table type to apply {feature_table}"
            )

    def _apply_feature_table(self, project: str, feature_table: FeatureTable):
        """
        Registers a single feature table with Feast

        Args:
            project: Feast project to register the feature table under
            feature_table: Feature table that will be registered
        """
        feature_table.is_valid()
        feature_table_proto = feature_table.to_spec_proto()

        # Convert the feature table to a request and send to Feast Core
        try:
            apply_feature_table_response = self._core_service.ApplyFeatureTable(
                ApplyFeatureTableRequest(project=project, table_spec=feature_table_proto),  # type: ignore
                timeout=self._config.getint(CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
                metadata=self._get_grpc_metadata(),
            )  # type: ApplyFeatureTableResponse
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())

        # Extract the returned feature table
        applied_feature_table = FeatureTable.from_proto(
            apply_feature_table_response.table
        )

        # Deep copy from the returned feature table to the local entity
        feature_table._update_from_feature_table(applied_feature_table)

    def list_feature_tables(
        self, project: Optional[str] = None, labels: Optional[Dict[str, str]] = None
    ) -> List[FeatureTable]:
        """
        Retrieve a list of feature tables from Feast Core

        Args:
            project: Filter feature tables based on project name
            labels: User-defined labels that these feature tables are
                associated with

        Returns:
            List of feature tables
        """
        if project is None:
            project = self.project
        # Avoid a mutable default argument; an empty dict means "no label filter".
        if labels is None:
            labels = dict()

        filter = ListFeatureTablesRequest.Filter(project=project, labels=labels)

        # Get latest feature tables from Feast Core
        feature_table_protos = self._core_service.ListFeatureTables(
            ListFeatureTablesRequest(filter=filter), metadata=self._get_grpc_metadata(),
        )  # type: ListFeatureTablesResponse

        # Extract feature tables and return
        feature_tables = []
        for feature_table_proto in feature_table_protos.tables:
            feature_table = FeatureTable.from_proto(feature_table_proto)
            feature_table._client = self
            feature_tables.append(feature_table)
        return feature_tables

    def get_feature_table(
        self, name: str, project: Optional[str] = None
    ) -> FeatureTable:
        """
        Retrieves a feature table.

        Args:
            project: Feast project that this feature table belongs to
            name: Name of feature table

        Returns:
            Returns either the specified feature table, or raises an exception if
            none is found
        """
        if project is None:
            project = self.project

        try:
            get_feature_table_response = self._core_service.GetFeatureTable(
                GetFeatureTableRequest(project=project, name=name.strip()),
                metadata=self._get_grpc_metadata(),
            )  # type: GetFeatureTableResponse
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())
        return FeatureTable.from_proto(get_feature_table_response.table)

    def ingest(
        self,
        feature_table: Union[str, FeatureTable],
        source: Union[pd.DataFrame, str],
        project: Optional[str] = None,
        chunk_size: int = 10000,
        max_workers: int = max(CPU_COUNT - 1, 1),
        timeout: int = BATCH_INGESTION_PRODUCTION_TIMEOUT,
    ) -> None:
        """
        Batch load feature data into a FeatureTable.

        Args:
            feature_table (typing.Union[str, feast.feature_table.FeatureTable]):
                FeatureTable object or the string name of the feature table

            source (typing.Union[pd.DataFrame, str]):
                Either a file path or Pandas Dataframe to ingest into Feast
                Files that are currently supported:
                    * parquet
                    * csv
                    * json

            project: Feast project to locate FeatureTable

            chunk_size (int):
                Amount of rows to load and ingest at a time.

            max_workers (int):
                Number of worker processes to use to encode values.

            timeout (int):
                Timeout in seconds to wait for completion.

        Examples:
            >>> from feast import Client
            >>>
            >>> client = Client(core_url="localhost:6565")
            >>> ft_df = pd.DataFrame(
            >>>         {
            >>>            "datetime": [pd.datetime.now()],
            >>>            "driver": [1001],
            >>>            "rating": [4.3],
            >>>         }
            >>>     )
            >>> client.set_project("project1")
            >>>
            >>> driver_ft = client.get_feature_table("driver")
            >>> client.ingest(driver_ft, ft_df)
        """
        if project is None:
            project = self.project

        # Resolve the feature table name whether a FeatureTable object or a
        # plain string name was provided (previously the str case left `name`
        # unbound and raised NameError further down).
        if isinstance(feature_table, FeatureTable):
            name = feature_table.name
        else:
            name = feature_table

        # Always re-fetch the table from Core so ingestion uses the
        # currently-registered batch source.
        fetched_feature_table: Optional[FeatureTable] = self.get_feature_table(
            name, project
        )
        if fetched_feature_table is not None:
            feature_table = fetched_feature_table
        else:
            raise Exception(f"FeatureTable, {name} cannot be found.")

        # Check 1) Only parquet file format for FeatureTable batch source is supported
        if (
            feature_table.batch_source
            and issubclass(type(feature_table.batch_source), FileSource)
            and "".join(
                feature_table.batch_source.file_options.file_format.split()
            ).lower()
            != "parquet"
        ):
            raise Exception(
                f"No suitable batch source found for FeatureTable, {name}."
                f"Only BATCH_FILE source with parquet format is supported for batch ingestion."
            )

        pyarrow_table, column_names = _read_table_from_source(source)
        # Check 2) Check if FeatureTable batch source field mappings can be found in provided source table
        _check_field_mappings(
            column_names,
            name,
            feature_table.batch_source.timestamp_column,
            feature_table.batch_source.field_mapping,
        )

        dir_path = None
        with_partitions = False
        if (
            issubclass(type(feature_table.batch_source), FileSource)
            and feature_table.batch_source.date_partition_column
        ):
            with_partitions = True
            dest_path = _write_partitioned_table_from_source(
                column_names,
                pyarrow_table,
                feature_table.batch_source.date_partition_column,
                feature_table.batch_source.timestamp_column,
            )
        else:
            dir_path, dest_path = _write_non_partitioned_table_from_source(
                column_names, pyarrow_table, chunk_size, max_workers,
            )

        try:
            if issubclass(type(feature_table.batch_source), FileSource):
                file_url = feature_table.batch_source.file_options.file_url[:-1]
                _upload_to_file_source(file_url, with_partitions, dest_path)
            if issubclass(type(feature_table.batch_source), BigQuerySource):
                bq_table_ref = feature_table.batch_source.bigquery_options.table_ref
                feature_table_timestamp_column = (
                    feature_table.batch_source.timestamp_column
                )
                _upload_to_bq_source(
                    bq_table_ref, feature_table_timestamp_column, dest_path
                )
        finally:
            # Remove parquet file(s) that were created earlier
            print("Removing temporary file(s)...")
            if dir_path:
                shutil.rmtree(dir_path)

        print("Data has been successfully ingested into FeatureTable batch source.")

    def _get_grpc_metadata(self):
        """
        Returns a metadata tuple to attach to gRPC requests. This is primarily
        used when authentication is enabled but SSL/TLS is disabled.

        Returns: Tuple of metadata to attach to each gRPC call
        """
        if self._config.getboolean(CONFIG_ENABLE_AUTH_KEY) and self._auth_metadata:
            return self._auth_metadata.get_signed_meta()
        return ()
class Client: """ Feast Client: Used for creating, managing, and retrieving features. """ def __init__(self, options: Optional[Dict[str, str]] = None, **kwargs): """ The Feast Client should be initialized with at least one service url Args: core_url: Feast Core URL. Used to manage features serving_url: Feast Serving URL. Used to retrieve features project: Sets the active project. This field is optional. core_secure: Use client-side SSL/TLS for Core gRPC API serving_secure: Use client-side SSL/TLS for Serving gRPC API options: Configuration options to initialize client with **kwargs: Additional keyword arguments that will be used as configuration options along with "options" """ if options is None: options = dict() self._config = Config(options={**options, **kwargs}) self.__core_channel: grpc.Channel = None self.__serving_channel: grpc.Channel = None self._core_service_stub: CoreServiceStub = None self._serving_service_stub: ServingServiceStub = None @property def core_url(self) -> str: """ Retrieve Feast Core URL Returns: Feast Core URL string """ return self._config.get(CONFIG_CORE_URL_KEY) @core_url.setter def core_url(self, value: str): """ Set the Feast Core URL Args: value: Feast Core URL """ self._config.set(CONFIG_CORE_URL_KEY, value) @property def serving_url(self) -> str: """ Retrieve Serving Core URL Returns: Feast Serving URL string """ return self._config.get(CONFIG_SERVING_URL_KEY) @serving_url.setter def serving_url(self, value: str): """ Set the Feast Serving URL Args: value: Feast Serving URL """ self._config.set(CONFIG_SERVING_URL_KEY, value) @property def core_secure(self) -> bool: """ Retrieve Feast Core client-side SSL/TLS setting Returns: Whether client-side SSL/TLS is enabled """ return self._config.getboolean(CONFIG_CORE_SECURE_KEY) @core_secure.setter def core_secure(self, value: bool): """ Set the Feast Core client-side SSL/TLS setting Args: value: True to enable client-side SSL/TLS """ self._config.set(CONFIG_CORE_SECURE_KEY, value) 
@property def serving_secure(self) -> bool: """ Retrieve Feast Serving client-side SSL/TLS setting Returns: Whether client-side SSL/TLS is enabled """ return self._config.getboolean(CONFIG_SERVING_SECURE_KEY) @serving_secure.setter def serving_secure(self, value: bool): """ Set the Feast Serving client-side SSL/TLS setting Args: value: True to enable client-side SSL/TLS """ self._config.set(CONFIG_SERVING_SECURE_KEY, value) def version(self): """ Returns version information from Feast Core and Feast Serving """ result = {} if self.serving_url: self._connect_serving() serving_version = self._serving_service_stub.GetFeastServingInfo( GetFeastServingInfoRequest(), timeout=self._config.getint( CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY), ).version result["serving"] = { "url": self.serving_url, "version": serving_version } if self.core_url: self._connect_core() core_version = self._core_service_stub.GetFeastCoreVersion( GetFeastCoreVersionRequest(), timeout=self._config.getint( CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY), ).version result["core"] = {"url": self.core_url, "version": core_version} return result def _connect_core(self, skip_if_connected: bool = True): """ Connect to Core API Args: skip_if_connected: Do not attempt to connect if already connected """ if skip_if_connected and self._core_service_stub: return if not self.core_url: raise ValueError("Please set Feast Core URL.") if self.__core_channel is None: if self.core_secure or self.core_url.endswith(":443"): self.__core_channel = grpc.secure_channel( self.core_url, grpc.ssl_channel_credentials()) else: self.__core_channel = grpc.insecure_channel(self.core_url) try: grpc.channel_ready_future( self.__core_channel).result(timeout=self._config.getint( CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY)) except grpc.FutureTimeoutError: raise ConnectionError( f"Connection timed out while attempting to connect to Feast " f"Core gRPC server {self.core_url} ") else: self._core_service_stub = 
CoreServiceStub(self.__core_channel) def _connect_serving(self, skip_if_connected=True): """ Connect to Serving API Args: skip_if_connected: Do not attempt to connect if already connected """ if skip_if_connected and self._serving_service_stub: return if not self.serving_url: raise ValueError("Please set Feast Serving URL.") if self.__serving_channel is None: if self.serving_secure or self.serving_url.endswith(":443"): self.__serving_channel = grpc.secure_channel( self.serving_url, grpc.ssl_channel_credentials()) else: self.__serving_channel = grpc.insecure_channel( self.serving_url) try: grpc.channel_ready_future( self.__serving_channel).result(timeout=self._config.getint( CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY)) except grpc.FutureTimeoutError: raise ConnectionError( f"Connection timed out while attempting to connect to Feast " f"Serving gRPC server {self.serving_url} ") else: self._serving_service_stub = ServingServiceStub( self.__serving_channel) @property def project(self) -> Union[str, None]: """ Retrieve currently active project Returns: Project name """ return self._config.get(CONFIG_PROJECT_KEY) def set_project(self, project: str): """ Set currently active Feast project Args: project: Project to set as active """ self._config.set(CONFIG_PROJECT_KEY, project) def list_projects(self) -> List[str]: """ List all active Feast projects Returns: List of project names """ self._connect_core() response = self._core_service_stub.ListProjects( ListProjectsRequest(), timeout=self._config.getint( CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY), ) # type: ListProjectsResponse return list(response.projects) def create_project(self, project: str): """ Creates a Feast project Args: project: Name of project """ self._connect_core() self._core_service_stub.CreateProject( CreateProjectRequest(name=project), timeout=self._config.getint( CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY), ) # type: CreateProjectResponse def archive_project(self, project): """ Archives a project. 
Project will still continue to function for ingestion and retrieval, but will be in a read-only state. It will also not be visible from the Core API for management purposes. Args: project: Name of project to archive """ self._connect_core() self._core_service_stub.ArchiveProject( ArchiveProjectRequest(name=project), timeout=self._config.getint( CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY), ) # type: ArchiveProjectResponse if self._project == project: self._project = "" def apply(self, feature_sets: Union[List[FeatureSet], FeatureSet]): """ Idempotently registers feature set(s) with Feast Core. Either a single feature set or a list can be provided. Args: feature_sets: List of feature sets that will be registered """ if not isinstance(feature_sets, list): feature_sets = [feature_sets] for feature_set in feature_sets: if isinstance(feature_set, FeatureSet): self._apply_feature_set(feature_set) continue raise ValueError( f"Could not determine feature set type to apply {feature_set}") def _apply_feature_set(self, feature_set: FeatureSet): """ Registers a single feature set with Feast Args: feature_set: Feature set that will be registered """ self._connect_core() feature_set.is_valid() feature_set_proto = feature_set.to_proto() if len(feature_set_proto.spec.project) == 0: if self.project is None: raise ValueError( f"No project found in feature set {feature_set.name}. 
" f"Please set the project within the feature set or within " f"your Feast Client.") else: feature_set_proto.spec.project = self.project # Convert the feature set to a request and send to Feast Core try: apply_fs_response = self._core_service_stub.ApplyFeatureSet( ApplyFeatureSetRequest(feature_set=feature_set_proto), timeout=self._config.getint( CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY), ) # type: ApplyFeatureSetResponse except grpc.RpcError as e: raise grpc.RpcError(e.details()) # Extract the returned feature set applied_fs = FeatureSet.from_proto(apply_fs_response.feature_set) # If the feature set has changed, update the local copy if apply_fs_response.status == ApplyFeatureSetResponse.Status.CREATED: print( f'Feature set updated/created: "{applied_fs.name}:{applied_fs.version}"' ) # If no change has been applied, do nothing if apply_fs_response.status == ApplyFeatureSetResponse.Status.NO_CHANGE: print(f"No change detected or applied: {feature_set.name}") # Deep copy from the returned feature set to the local feature set feature_set._update_from_feature_set(applied_fs) def list_feature_sets(self, project: str = None, name: str = None, version: str = None) -> List[FeatureSet]: """ Retrieve a list of feature sets from Feast Core Args: project: Filter feature sets based on project name name: Filter feature sets based on feature set name version: Filter feature sets based on version numbf, Returns: List of feature sets """ self._connect_core() if project is None: if self.project is not None: project = self.project else: project = "*" if name is None: name = "*" if version is None: version = "*" filter = ListFeatureSetsRequest.Filter(project=project, feature_set_name=name, feature_set_version=version) # Get latest feature sets from Feast Core feature_set_protos = self._core_service_stub.ListFeatureSets( ListFeatureSetsRequest( filter=filter)) # type: ListFeatureSetsResponse # Extract feature sets and return feature_sets = [] for feature_set_proto in 
feature_set_protos.feature_sets: feature_set = FeatureSet.from_proto(feature_set_proto) feature_set._client = self feature_sets.append(feature_set) return feature_sets def get_feature_set(self, name: str, version: int = None, project: str = None) -> Union[FeatureSet, None]: """ Retrieves a feature set. If no version is specified then the latest version will be returned. Args: project: Feast project that this feature set belongs to name: Name of feature set version: Version of feature set Returns: Returns either the specified feature set, or raises an exception if none is found """ self._connect_core() if project is None: if self.project is not None: project = self.project else: raise ValueError("No project has been configured.") if version is None: version = 0 try: get_feature_set_response = self._core_service_stub.GetFeatureSet( GetFeatureSetRequest( project=project, name=name.strip(), version=int(version))) # type: GetFeatureSetResponse except grpc.RpcError as e: raise grpc.RpcError(e.details()) return FeatureSet.from_proto(get_feature_set_response.feature_set) def list_entities(self) -> Dict[str, Entity]: """ Returns a dictionary of entities across all feature sets Returns: Dictionary of entities, indexed by name """ entities_dict = OrderedDict() for fs in self.list_feature_sets(): for entity in fs.entities: entities_dict[entity.name] = entity return entities_dict def get_batch_features( self, feature_refs: List[str], entity_rows: Union[pd.DataFrame, str], default_project: str = None, ) -> RetrievalJob: """ Retrieves historical features from a Feast Serving deployment. Args: feature_refs (List[str]): List of feature references that will be returned for each entity. Each feature reference should have the following format "project/feature:version". entity_rows (Union[pd.DataFrame, str]): Pandas dataframe containing entities and a 'datetime' column. Each entity in a feature set must be present as a column in this dataframe. 
The datetime column must contain timestamps in datetime64 format. default_project: Default project where feature values will be found. Returns: feast.job.RetrievalJob: Returns a retrival job object that can be used to monitor retrieval progress asynchronously, and can be used to materialize the results. Examples: >>> from feast import Client >>> from datetime import datetime >>> >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566") >>> feature_refs = ["my_project/bookings_7d:1", "booking_14d"] >>> entity_rows = pd.DataFrame( >>> { >>> "datetime": [pd.datetime.now() for _ in range(3)], >>> "customer": [1001, 1002, 1003], >>> } >>> ) >>> feature_retrieval_job = feast_client.get_batch_features( >>> feature_refs, entity_rows, default_project="my_project") >>> df = feature_retrieval_job.to_dataframe() >>> print(df) """ self._connect_serving() feature_references = _build_feature_references( feature_refs=feature_refs, default_project=default_project) # Retrieve serving information to determine store type and # staging location serving_info = self._serving_service_stub.GetFeastServingInfo( GetFeastServingInfoRequest(), timeout=self._config.getint( CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY), ) # type: GetFeastServingInfoResponse if serving_info.type != FeastServingType.FEAST_SERVING_TYPE_BATCH: raise Exception( f'You are connected to a store "{self._serving_url}" which ' f"does not support batch retrieval ") if isinstance(entity_rows, pd.DataFrame): # Pandas DataFrame detected # Remove timezone from datetime column if isinstance(entity_rows["datetime"].dtype, pd.core.dtypes.dtypes.DatetimeTZDtype): entity_rows["datetime"] = pd.DatetimeIndex( entity_rows["datetime"]).tz_localize(None) elif isinstance(entity_rows, str): # String based source if not entity_rows.endswith((".avro", "*")): raise Exception( f"Only .avro and wildcard paths are accepted as entity_rows" ) else: raise Exception(f"Only pandas.DataFrame and str types are allowed" f" as 
entity_rows, but got {type(entity_rows)}.") # Export and upload entity row DataFrame to staging location # provided by Feast staged_files = export_source_to_staging_location( entity_rows, serving_info.job_staging_location) # type: List[str] request = GetBatchFeaturesRequest( features=feature_references, dataset_source=DatasetSource(file_source=DatasetSource.FileSource( file_uris=staged_files, data_format=DataFormat.DATA_FORMAT_AVRO)), ) # Retrieve Feast Job object to manage life cycle of retrieval response = self._serving_service_stub.GetBatchFeatures(request) return RetrievalJob(response.job, self._serving_service_stub) def get_online_features( self, feature_refs: List[str], entity_rows: List[GetOnlineFeaturesRequest.EntityRow], default_project: Optional[str] = None, ) -> GetOnlineFeaturesResponse: """ Retrieves the latest online feature data from Feast Serving Args: feature_refs: List of feature references in the following format [project]/[feature_name]:[version]. Only the feature name is a required component in the reference. example: ["my_project/my_feature_1:3", "my_project3/my_feature_4:1",] entity_rows: List of GetFeaturesRequest.EntityRow where each row contains entities. Timestamp should not be set for online retrieval. All entity types within a feature default_project: This project will be used if the project name is not provided in the feature reference Returns: Returns a list of maps where each item in the list contains the latest feature values for the provided entities """ self._connect_serving() return self._serving_service_stub.GetOnlineFeatures( GetOnlineFeaturesRequest( features=_build_feature_references( feature_refs=feature_refs, default_project=(default_project if not self.project else self.project), ), entity_rows=entity_rows, )) def list_ingest_jobs( self, job_id: str = None, feature_set_ref: FeatureSetRef = None, store_name: str = None, ): """ List the ingestion jobs currently registered in Feast, with optional filters. 
Provides detailed metadata about each ingestion job. Args: job_id: Select specific ingestion job with the given job_id feature_set_ref: Filter ingestion jobs by target feature set (via reference) store_name: Filter ingestion jobs by target feast store's name Returns: List of IngestJobs matching the given filters """ self._connect_core() # construct list request feature_set_ref = None list_filter = ListIngestionJobsRequest.Filter( id=job_id, feature_set_reference=feature_set_ref, store_name=store_name, ) request = ListIngestionJobsRequest(filter=list_filter) # make list request & unpack response response = self._core_service_stub.ListIngestionJobs(request) ingest_jobs = [ IngestJob(proto, self._core_service_stub) for proto in response.jobs ] return ingest_jobs def restart_ingest_job(self, job: IngestJob): """ Restart ingestion job currently registered in Feast. NOTE: Data might be lost during the restart for some job runners. Does not support stopping a job in a transitional (ie pending, suspending, aborting), terminal state (ie suspended or aborted) or unknown status Args: job: IngestJob to restart """ self._connect_core() request = RestartIngestionJobRequest(id=job.id) try: self._core_service_stub.RestartIngestionJob(request) except grpc.RpcError as e: raise grpc.RpcError(e.details()) def stop_ingest_job(self, job: IngestJob): """ Stop ingestion job currently resgistered in Feast Does nothing if the target job if already in a terminal state (ie suspended or aborted). 
Does not support stopping a job in a transitional (ie pending, suspending, aborting) or in a unknown status Args: job: IngestJob to restart """ self._connect_core() request = StopIngestionJobRequest(id=job.id) try: self._core_service_stub.StopIngestionJob(request) except grpc.RpcError as e: raise grpc.RpcError(e.details()) def ingest( self, feature_set: Union[str, FeatureSet], source: Union[pd.DataFrame, str], chunk_size: int = 10000, version: int = None, force_update: bool = False, max_workers: int = max(CPU_COUNT - 1, 1), disable_progress_bar: bool = False, timeout: int = KAFKA_CHUNK_PRODUCTION_TIMEOUT, ) -> None: """ Loads feature data into Feast for a specific feature set. Args: feature_set (typing.Union[str, feast.feature_set.FeatureSet]): Feature set object or the string name of the feature set (without a version). source (typing.Union[pd.DataFrame, str]): Either a file path or Pandas Dataframe to ingest into Feast Files that are currently supported: * parquet * csv * json chunk_size (int): Amount of rows to load and ingest at a time. version (int): Feature set version. force_update (bool): Automatically update feature set based on source data prior to ingesting. This will also register changes to Feast. max_workers (int): Number of worker processes to use to encode values. disable_progress_bar (bool): Disable printing of progress statistics. timeout (int): Timeout in seconds to wait for completion. 
Returns: None: None """ if isinstance(feature_set, FeatureSet): name = feature_set.name if version is None: version = feature_set.version elif isinstance(feature_set, str): name = feature_set else: raise Exception(f"Feature set name must be provided") # Read table and get row count dir_path, dest_path = _read_table_from_source(source, chunk_size, max_workers) pq_file = pq.ParquetFile(dest_path) row_count = pq_file.metadata.num_rows # Update the feature set based on PyArrow table of first row group if force_update: feature_set.infer_fields_from_pa( table=pq_file.read_row_group(0), discard_unused_fields=True, replace_existing_features=True, ) self.apply(feature_set) current_time = time.time() print("Waiting for feature set to be ready for ingestion...") while True: if timeout is not None and time.time() - current_time >= timeout: raise TimeoutError( "Timed out waiting for feature set to be ready") feature_set = self.get_feature_set(name, version) if (feature_set is not None and feature_set.status == FeatureSetStatus.STATUS_READY): break time.sleep(3) if timeout is not None: timeout = timeout - int(time.time() - current_time) try: # Kafka configs brokers = feature_set.get_kafka_source_brokers() topic = feature_set.get_kafka_source_topic() producer = get_producer(brokers, row_count, disable_progress_bar) # Loop optimization declarations produce = producer.produce flush = producer.flush # Transform and push data to Kafka if feature_set.source.source_type == "Kafka": for chunk in get_feature_row_chunks( file=dest_path, row_groups=list(range(pq_file.num_row_groups)), fs=feature_set, max_workers=max_workers, ): # Push FeatureRow one chunk at a time to kafka for serialized_row in chunk: produce(topic=topic, value=serialized_row) # Force a flush after each chunk flush(timeout=timeout) # Remove chunk from memory del chunk else: raise Exception( f"Could not determine source type for feature set " f'"{feature_set.name}" with source type ' f'"{feature_set.source.source_type}"') 
# Print ingestion statistics producer.print_results() finally: # Remove parquet file(s) that were created earlier print("Removing temporary file(s)...") shutil.rmtree(dir_path) return None
class Client: """ Feast Client: Used for creating, managing, and retrieving features. """ def __init__(self, options: Optional[Dict[str, str]] = None, **kwargs): """ The Feast Client should be initialized with at least one service url Please see constants.py for configuration options. Commonly used options or arguments include: core_url: Feast Core URL. Used to manage features serving_url: Feast Serving URL. Used to retrieve features project: Sets the active project. This field is optional. core_secure: Use client-side SSL/TLS for Core gRPC API serving_secure: Use client-side SSL/TLS for Serving gRPC API enable_auth: Enable authentication and authorization auth_provider: Authentication provider – "google" or "oauth" if auth_provider is "oauth", the following fields are mandatory – oauth_grant_type, oauth_client_id, oauth_client_secret, oauth_audience, oauth_token_request_url Args: options: Configuration options to initialize client with **kwargs: Additional keyword arguments that will be used as configuration options along with "options" """ if options is None: options = dict() self._config = Config(options={**options, **kwargs}) self._core_service_stub: Optional[CoreServiceStub] = None self._serving_service_stub: Optional[ServingServiceStub] = None self._job_service_stub: Optional[JobServiceStub] = None self._auth_metadata: Optional[grpc.AuthMetadataPlugin] = None # Configure Auth Metadata Plugin if auth is enabled if self._config.getboolean(opt.ENABLE_AUTH): self._auth_metadata = feast_auth.get_auth_metadata_plugin( self._config) @property def _core_service(self): """ Creates or returns the gRPC Feast Core Service Stub Returns: CoreServiceStub """ if not self._core_service_stub: channel = create_grpc_channel( url=self._config.get(opt.CORE_URL), enable_ssl=self._config.getboolean(opt.CORE_ENABLE_SSL), enable_auth=self._config.getboolean(opt.ENABLE_AUTH), ssl_server_cert_path=self._config.get( opt.CORE_SERVER_SSL_CERT), auth_metadata_plugin=self._auth_metadata, 
timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT), ) self._core_service_stub = CoreServiceStub(channel) return self._core_service_stub @property def _serving_service(self): """ Creates or returns the gRPC Feast Serving Service Stub. If both `opentracing` and `grpcio-opentracing` are installed, an opentracing interceptor will be instantiated based on the global tracer. Returns: ServingServiceStub """ if not self._serving_service_stub: channel = create_grpc_channel( url=self._config.get(opt.SERVING_URL), enable_ssl=self._config.getboolean(opt.SERVING_ENABLE_SSL), enable_auth=self._config.getboolean(opt.ENABLE_AUTH), ssl_server_cert_path=self._config.get( opt.SERVING_SERVER_SSL_CERT), auth_metadata_plugin=self._auth_metadata, timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT), ) try: import opentracing from grpc_opentracing import open_tracing_client_interceptor from grpc_opentracing.grpcext import intercept_channel interceptor = open_tracing_client_interceptor( opentracing.global_tracer()) channel = intercept_channel(channel, interceptor) except ImportError: pass self._serving_service_stub = ServingServiceStub(channel) return self._serving_service_stub @property def _use_job_service(self) -> bool: return self._config.exists(opt.JOB_SERVICE_URL) @property def _job_service(self): """ Creates or returns the gRPC Feast Job Service Stub Returns: JobServiceStub """ # Don't try to initialize job service stub if the job service is disabled if not self._use_job_service: return None if not self._job_service_stub: channel = create_grpc_channel( url=self._config.get(opt.JOB_SERVICE_URL), enable_ssl=self._config.getboolean(opt.JOB_SERVICE_ENABLE_SSL), enable_auth=self._config.getboolean(opt.ENABLE_AUTH), ssl_server_cert_path=self._config.get( opt.JOB_SERVICE_SERVER_SSL_CERT), auth_metadata_plugin=self._auth_metadata, timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT), ) self._job_service_service_stub = JobServiceStub(channel) return 
self._job_service_service_stub def _extra_grpc_params(self) -> Dict[str, Any]: return dict( timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT), metadata=self._get_grpc_metadata(), ) @property def core_url(self) -> str: """ Retrieve Feast Core URL Returns: Feast Core URL string """ return self._config.get(opt.CORE_URL) @core_url.setter def core_url(self, value: str): """ Set the Feast Core URL Args: value: Feast Core URL """ self._config.set(opt.CORE_URL, value) @property def serving_url(self) -> str: """ Retrieve Feast Serving URL Returns: Feast Serving URL string """ return self._config.get(opt.SERVING_URL) @serving_url.setter def serving_url(self, value: str): """ Set the Feast Serving URL Args: value: Feast Serving URL """ self._config.set(opt.SERVING_URL, value) @property def job_service_url(self) -> str: """ Retrieve Feast Job Service URL Returns: Feast Job Service URL string """ return self._config.get(opt.JOB_SERVICE_URL) @job_service_url.setter def job_service_url(self, value: str): """ Set the Feast Job Service URL Args: value: Feast Job Service URL """ self._config.set(opt.JOB_SERVICE_URL, value) @property def core_secure(self) -> bool: """ Retrieve Feast Core client-side SSL/TLS setting Returns: Whether client-side SSL/TLS is enabled """ return self._config.getboolean(opt.CORE_ENABLE_SSL) @core_secure.setter def core_secure(self, value: bool): """ Set the Feast Core client-side SSL/TLS setting Args: value: True to enable client-side SSL/TLS """ self._config.set(opt.CORE_ENABLE_SSL, value) @property def serving_secure(self) -> bool: """ Retrieve Feast Serving client-side SSL/TLS setting Returns: Whether client-side SSL/TLS is enabled """ return self._config.getboolean(opt.SERVING_ENABLE_SSL) @serving_secure.setter def serving_secure(self, value: bool): """ Set the Feast Serving client-side SSL/TLS setting Args: value: True to enable client-side SSL/TLS """ self._config.set(opt.SERVING_ENABLE_SSL, value) @property def job_service_secure(self) -> 
bool: """ Retrieve Feast Job Service client-side SSL/TLS setting Returns: Whether client-side SSL/TLS is enabled """ return self._config.getboolean(opt.JOB_SERVICE_ENABLE_SSL) @job_service_secure.setter def job_service_secure(self, value: bool): """ Set the Feast Job Service client-side SSL/TLS setting Args: value: True to enable client-side SSL/TLS """ self._config.set(opt.JOB_SERVICE_ENABLE_SSL, value) def version(self): """ Returns version information from Feast Core and Feast Serving """ import pkg_resources result = { "sdk": { "version": pkg_resources.get_distribution("feast").version }, "serving": "not configured", "core": "not configured", } if self.serving_url: serving_version = self._serving_service.GetFeastServingInfo( GetFeastServingInfoRequest(), timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT), metadata=self._get_grpc_metadata(), ).version result["serving"] = { "url": self.serving_url, "version": serving_version } if self.core_url: core_version = self._core_service.GetFeastCoreVersion( GetFeastCoreVersionRequest(), timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT), metadata=self._get_grpc_metadata(), ).version result["core"] = {"url": self.core_url, "version": core_version} return result @property def project(self) -> str: """ Retrieve currently active project Returns: Project name """ if not self._config.get(opt.PROJECT): raise ValueError("No project has been configured.") return self._config.get(opt.PROJECT) def set_project(self, project: Optional[str] = None): """ Set currently active Feast project Args: project: Project to set as active. If unset, will reset to the default project. 
""" if project is None: project = opt().PROJECT self._config.set(opt.PROJECT, project) def list_projects(self) -> List[str]: """ List all active Feast projects Returns: List of project names """ response = self._core_service.ListProjects( ListProjectsRequest(), timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT), metadata=self._get_grpc_metadata(), ) # type: ListProjectsResponse return list(response.projects) def create_project(self, project: str): """ Creates a Feast project Args: project: Name of project """ self._core_service.CreateProject( CreateProjectRequest(name=project), timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT), metadata=self._get_grpc_metadata(), ) # type: CreateProjectResponse def archive_project(self, project): """ Archives a project. Project will still continue to function for ingestion and retrieval, but will be in a read-only state. It will also not be visible from the Core API for management purposes. Args: project: Name of project to archive """ try: self._core_service_stub.ArchiveProject( ArchiveProjectRequest(name=project), timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT), metadata=self._get_grpc_metadata(), ) # type: ArchiveProjectResponse except grpc.RpcError as e: raise grpc.RpcError(e.details()) # revert to the default project if self._project == project: self._project = opt().PROJECT def apply( self, objects: Union[List[Union[Entity, FeatureTable]], Entity, FeatureTable], project: str = None, ): """ Idempotently registers entities and feature tables with Feast Core. Either a single entity or feature table or a list can be provided. 
Args: objects: List of entities and/or feature tables that will be registered Examples: >>> from feast import Client >>> from feast.entity import Entity >>> from feast.value_type import ValueType >>> >>> feast_client = Client(core_url="localhost:6565") >>> entity = Entity( >>> name="driver_entity", >>> description="Driver entity for car rides", >>> value_type=ValueType.STRING, >>> labels={ >>> "key": "val" >>> } >>> ) >>> feast_client.apply(entity) """ if project is None: project = self.project if not isinstance(objects, list): objects = [objects] for obj in objects: if isinstance(obj, Entity): self._apply_entity(project, obj) # type: ignore elif isinstance(obj, FeatureTable): self._apply_feature_table(project, obj) # type: ignore else: raise ValueError( f"Could not determine object type to apply {obj} with type {type(obj)}. Type must be Entity or FeatureTable." ) def apply_entity(self, entities: Union[List[Entity], Entity], project: str = None): """ Deprecated. Please see apply(). """ warnings.warn( "The method apply_entity() is being deprecated. Please use apply() instead. 
Feast 0.10 and onwards will not support apply_entity().", DeprecationWarning, ) if project is None: project = self.project if not isinstance(entities, list): entities = [entities] for entity in entities: if isinstance(entity, Entity): self._apply_entity(project, entity) # type: ignore continue raise ValueError( f"Could not determine entity type to apply {entity}") def _apply_entity(self, project: str, entity: Entity): """ Registers a single entity with Feast Args: entity: Entity that will be registered """ entity.is_valid() entity_proto = entity.to_spec_proto() # Convert the entity to a request and send to Feast Core try: apply_entity_response = self._core_service.ApplyEntity( ApplyEntityRequest(project=project, spec=entity_proto), # type: ignore timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT), metadata=self._get_grpc_metadata(), ) # type: ApplyEntityResponse except grpc.RpcError as e: raise grpc.RpcError(e.details()) # Extract the returned entity applied_entity = Entity.from_proto(apply_entity_response.entity) # Deep copy from the returned entity to the local entity entity._update_from_entity(applied_entity) def list_entities(self, project: str = None, labels: Dict[str, str] = dict()) -> List[Entity]: """ Retrieve a list of entities from Feast Core Args: project: Filter entities based on project name labels: User-defined labels that these entities are associated with Returns: List of entities """ if project is None: project = self.project filter = ListEntitiesRequest.Filter(project=project, labels=labels) # Get latest entities from Feast Core entity_protos = self._core_service.ListEntities( ListEntitiesRequest(filter=filter), metadata=self._get_grpc_metadata(), ) # type: ListEntitiesResponse # Extract entities and return entities = [] for entity_proto in entity_protos.entities: entity = Entity.from_proto(entity_proto) entity._client = self entities.append(entity) return entities def get_entity(self, name: str, project: str = None) -> Entity: """ Retrieves 
an entity. Args: project: Feast project that this entity belongs to name: Name of entity Returns: Returns either the specified entity, or raises an exception if none is found """ if project is None: project = self.project try: get_entity_response = self._core_service.GetEntity( GetEntityRequest(project=project, name=name.strip()), metadata=self._get_grpc_metadata(), ) # type: GetEntityResponse except grpc.RpcError as e: raise grpc.RpcError(e.details()) entity = Entity.from_proto(get_entity_response.entity) return entity def apply_feature_table( self, feature_tables: Union[List[FeatureTable], FeatureTable], project: str = None, ): """ Deprecated. Please see apply(). """ warnings.warn( "The method apply_feature_table() is being deprecated. Please use apply() instead. Feast 0.10 and onwards will not support apply_feature_table().", DeprecationWarning, ) if project is None: project = self.project if not isinstance(feature_tables, list): feature_tables = [feature_tables] for feature_table in feature_tables: if isinstance(feature_table, FeatureTable): self._apply_feature_table(project, feature_table) # type: ignore continue raise ValueError( f"Could not determine feature table type to apply {feature_table}" ) def _apply_feature_table(self, project: str, feature_table: FeatureTable): """ Registers a single feature table with Feast Args: feature_table: Feature table that will be registered """ feature_table.is_valid() feature_table_proto = feature_table.to_spec_proto() # Convert the feature table to a request and send to Feast Core try: apply_feature_table_response = self._core_service.ApplyFeatureTable( ApplyFeatureTableRequest( project=project, table_spec=feature_table_proto), # type: ignore timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT), metadata=self._get_grpc_metadata(), ) # type: ApplyFeatureTableResponse except grpc.RpcError as e: raise grpc.RpcError(e.details()) # Extract the returned feature table applied_feature_table = FeatureTable.from_proto( 
apply_feature_table_response.table) # Deep copy from the returned feature table to the local entity feature_table._update_from_feature_table(applied_feature_table) def list_feature_tables( self, project: str = None, labels: Dict[str, str] = dict() ) -> List[FeatureTable]: """ Retrieve a list of feature tables from Feast Core Args: project: Filter feature tables based on project name Returns: List of feature tables """ if project is None: project = self.project filter = ListFeatureTablesRequest.Filter(project=project, labels=labels) # Get latest feature tables from Feast Core feature_table_protos = self._core_service.ListFeatureTables( ListFeatureTablesRequest(filter=filter), metadata=self._get_grpc_metadata(), ) # type: ListFeatureTablesResponse # Extract feature tables and return feature_tables = [] for feature_table_proto in feature_table_protos.tables: feature_table = FeatureTable.from_proto(feature_table_proto) feature_table._client = self feature_tables.append(feature_table) return feature_tables def get_feature_table(self, name: str, project: str = None) -> FeatureTable: """ Retrieves a feature table. Args: project: Feast project that this feature table belongs to name: Name of feature table Returns: Returns either the specified feature table, or raises an exception if none is found """ if project is None: project = self.project try: get_feature_table_response = self._core_service.GetFeatureTable( GetFeatureTableRequest(project=project, name=name.strip()), metadata=self._get_grpc_metadata(), ) # type: GetFeatureTableResponse except grpc.RpcError as e: raise grpc.RpcError(e.details()) return FeatureTable.from_proto(get_feature_table_response.table) def delete_feature_table(self, name: str, project: str = None) -> None: """ Deletes a feature table. 
Args: project: Feast project that this feature table belongs to name: Name of feature table """ if project is None: project = self.project try: self._core_service.DeleteFeatureTable( DeleteFeatureTableRequest(project=project, name=name.strip()), metadata=self._get_grpc_metadata(), ) except grpc.RpcError as e: raise grpc.RpcError(e.details()) def list_features_by_ref( self, project: str = None, entities: List[str] = list(), labels: Dict[str, str] = dict(), ) -> Dict[FeatureRef, Feature]: """ Retrieve a dictionary of feature reference to feature from Feast Core based on filters provided. Args: project: Feast project that these features belongs to entities: Feast entity that these features are associated with labels: Feast labels that these features are associated with Returns: Dictionary of <feature references: features> Examples: >>> from feast import Client >>> >>> feast_client = Client(core_url="localhost:6565") >>> features = feast_client.list_features(project="test_project", entities=["driver_id"], labels={"key1":"val1","key2":"val2"}) >>> print(features) """ if project is None: project = self.project filter = ListFeaturesRequest.Filter(project=project, entities=entities, labels=labels) feature_protos = self._core_service.ListFeatures( ListFeaturesRequest(filter=filter), metadata=self._get_grpc_metadata(), ) # type: ListFeaturesResponse # Extract features and return features_dict = {} for ref_str, feature_proto in feature_protos.features.items(): feature_ref = FeatureRef.from_str(ref_str) feature = Feature.from_proto(feature_proto) features_dict[feature_ref] = feature return features_dict def ingest( self, feature_table: Union[str, FeatureTable], source: Union[pd.DataFrame, str], project: str = None, chunk_size: int = 10000, max_workers: int = max(CPU_COUNT - 1, 1), timeout: int = int(opt().BATCH_INGESTION_PRODUCTION_TIMEOUT), ) -> None: """ Batch load feature data into a FeatureTable. 
Args: feature_table (typing.Union[str, feast.feature_table.FeatureTable]): FeatureTable object or the string name of the feature table source (typing.Union[pd.DataFrame, str]): Either a file path or Pandas Dataframe to ingest into Feast Files that are currently supported: * parquet * csv * json project: Feast project to locate FeatureTable chunk_size (int): Amount of rows to load and ingest at a time. max_workers (int): Number of worker processes to use to encode values. timeout (int): Timeout in seconds to wait for completion. Examples: >>> from feast import Client >>> >>> client = Client(core_url="localhost:6565") >>> ft_df = pd.DataFrame( >>> { >>> "datetime": [pd.datetime.now()], >>> "driver": [1001], >>> "rating": [4.3], >>> } >>> ) >>> client.set_project("project1") >>> >>> driver_ft = client.get_feature_table("driver") >>> client.ingest(driver_ft, ft_df) """ if project is None: project = self.project if isinstance(feature_table, str): name = feature_table if isinstance(feature_table, FeatureTable): name = feature_table.name fetched_feature_table: Optional[FeatureTable] = self.get_feature_table( name, project) if fetched_feature_table is not None: feature_table = fetched_feature_table else: raise Exception(f"FeatureTable, {name} cannot be found.") # Check 1) Only parquet file format for FeatureTable batch source is supported if (feature_table.batch_source and issubclass(type(feature_table.batch_source), FileSource) and isinstance( type(feature_table.batch_source.file_options.file_format), ParquetFormat)): raise Exception( f"No suitable batch source found for FeatureTable, {name}." f"Only BATCH_FILE source with parquet format is supported for batch ingestion." 
) pyarrow_table, column_names = _read_table_from_source(source) # Check 2) Check if FeatureTable batch source field mappings can be found in provided source table _check_field_mappings( column_names, name, feature_table.batch_source.event_timestamp_column, feature_table.batch_source.field_mapping, ) dir_path = None with_partitions = False if (issubclass(type(feature_table.batch_source), FileSource) and feature_table.batch_source.date_partition_column): with_partitions = True dest_path = _write_partitioned_table_from_source( column_names, pyarrow_table, feature_table.batch_source.date_partition_column, feature_table.batch_source.event_timestamp_column, ) else: dir_path, dest_path = _write_non_partitioned_table_from_source( column_names, pyarrow_table, chunk_size, max_workers, ) try: if issubclass(type(feature_table.batch_source), FileSource): file_url = feature_table.batch_source.file_options.file_url.rstrip( "*") _upload_to_file_source(file_url, with_partitions, dest_path, self._config) if issubclass(type(feature_table.batch_source), BigQuerySource): bq_table_ref = feature_table.batch_source.bigquery_options.table_ref feature_table_timestamp_column = ( feature_table.batch_source.event_timestamp_column) _upload_to_bq_source(bq_table_ref, feature_table_timestamp_column, dest_path) finally: # Remove parquet file(s) that were created earlier print("Removing temporary file(s)...") if dir_path: shutil.rmtree(dir_path) print( "Data has been successfully ingested into FeatureTable batch source." ) def _get_grpc_metadata(self): """ Returns a metadata tuple to attach to gRPC requests. This is primarily used when authentication is enabled but SSL/TLS is disabled. 
Returns: Tuple of metadata to attach to each gRPC call """ if self._config.getboolean(opt.ENABLE_AUTH) and self._auth_metadata: return self._auth_metadata.get_signed_meta() return () def get_online_features( self, feature_refs: List[str], entity_rows: List[Dict[str, Any]], project: Optional[str] = None, ) -> OnlineResponse: """ Retrieves the latest online feature data from Feast Serving. Args: feature_refs: List of feature references that will be returned for each entity. Each feature reference should have the following format: "feature_table:feature" where "feature_table" & "feature" refer to the feature and feature table names respectively. Only the feature name is required. entity_rows: A list of dictionaries where each key-value is an entity-name, entity-value pair. project: Optionally specify the the project override. If specified, uses given project for retrieval. Overrides the projects specified in Feature References if also are specified. Returns: GetOnlineFeaturesResponse containing the feature data in records. Each EntityRow provided will yield one record, which contains data fields with data value and field status metadata (if included). 
        Examples:
            >>> from feast import Client
            >>>
            >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566")
            >>> feature_refs = ["sales:daily_transactions"]
            >>> entity_rows = [{"customer_id": 0},{"customer_id": 1}]
            >>>
            >>> online_response = feast_client.get_online_features(
            >>>     feature_refs, entity_rows, project="my_project")
            >>> online_response_dict = online_response.to_dict()
            >>> print(online_response_dict)
            {'sales:daily_transactions': [1.1,1.2], 'sales:customer_id': [0,1]}
        """
        try:
            response = self._serving_service.GetOnlineFeaturesV2(
                GetOnlineFeaturesRequestV2(
                    features=_build_feature_references(
                        feature_ref_strs=feature_refs),
                    entity_rows=_infer_online_entity_rows(entity_rows),
                    project=project if project is not None else self.project,
                ),
                timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT),
                metadata=self._get_grpc_metadata(),
            )
        except grpc.RpcError as e:
            # Re-raise with only the server-supplied details for a cleaner
            # error message.
            raise grpc.RpcError(e.details())

        response = OnlineResponse(response)
        return response

    def get_historical_features(
        self,
        feature_refs: List[str],
        entity_source: Union[pd.DataFrame, FileSource, BigQuerySource],
        output_location: Optional[str] = None,
    ) -> RetrievalJob:
        """
        Launch a historical feature retrieval job.

        Args:
            feature_refs: List of feature references that will be returned for each entity.
                Each feature reference should have the following format:
                "feature_table:feature" where "feature_table" & "feature" refer to
                the feature and feature table names respectively.
            entity_source (Union[pd.DataFrame, FileSource, BigQuerySource]): Source for the entity rows.
                If entity_source is a Panda DataFrame, the dataframe will be staged
                to become accessible by spark workers.
                If one of feature tables' source is in BigQuery - entities will be uploaded to BQ.
                Otherwise to remote file storage (derived from configured staging location).
                It is also assumed that the column event_timestamp is present
                in the dataframe, and is of type datetime without timezone information.

                The user needs to make sure that the source (or staging location, if entity_source is
                a Panda DataFrame) is accessible from the Spark cluster that will be used for the
                retrieval job.
            output_location: Specifies the path in a bucket to write the exported feature data files

        Returns:
            Returns a retrieval job object that can be used to monitor retrieval
            progress asynchronously, and can be used to materialize the results.

        Examples:
            >>> from feast import Client
            >>> from feast.data_format import ParquetFormat
            >>> from datetime import datetime
            >>> feast_client = Client(core_url="localhost:6565")
            >>> feature_refs = ["bookings:bookings_7d", "bookings:booking_14d"]
            >>> entity_source = FileSource("event_timestamp", ParquetFormat(), "gs://some-bucket/customer")
            >>> feature_retrieval_job = feast_client.get_historical_features(
            >>>     feature_refs, entity_source)
            >>> output_file_uri = feature_retrieval_job.get_output_file_uri()
                "gs://some-bucket/output/
        """
        feature_tables = self._get_feature_tables_from_feature_refs(
            feature_refs, self.project)

        # Historical retrieval joins on the batch sources' created timestamps,
        # so every involved source must declare one.
        assert all(
            ft.batch_source.created_timestamp_column
            for ft in feature_tables), (
                "All BatchSources attached to retrieved FeatureTables "
                "must have specified `created_timestamp_column` to be used in "
                "historical dataset generation.")

        if output_location is None:
            # Default output path: configured output location + random suffix,
            # so concurrent retrievals do not collide.
            output_location = os.path.join(
                self._config.get(opt.HISTORICAL_FEATURE_OUTPUT_LOCATION),
                str(uuid.uuid4()),
            )
        output_format = self._config.get(opt.HISTORICAL_FEATURE_OUTPUT_FORMAT)
        feature_sources = [
            feature_table.batch_source for feature_table in feature_tables
        ]

        if isinstance(entity_source, pd.DataFrame):
            if any(
                    isinstance(source, BigQuerySource)
                    for source in feature_sources):
                # Stage the entity dataframe into the same BQ project/dataset
                # as the first BigQuery-backed feature source.
                first_bq_source = [
                    source for source in feature_sources
                    if isinstance(source, BigQuerySource)
                ][0]
                source_ref = table_reference_from_string(
                    first_bq_source.bigquery_options.table_ref)
                entity_source = stage_entities_to_bq(entity_source,
                                                     source_ref.project,
                                                     source_ref.dataset_id)
            else:
                # No BQ sources: stage to the configured remote file storage.
                entity_source = stage_entities_to_fs(
                    entity_source,
                    staging_location=self._config.get(
                        opt.SPARK_STAGING_LOCATION),
                    config=self._config,
                )

        if self._use_job_service:
            # Delegate to the remote job service and wrap its response in a
            # remote job proxy.
            response = self._job_service.GetHistoricalFeatures(
                GetHistoricalFeaturesRequest(
                    feature_refs=feature_refs,
                    entity_source=entity_source.to_proto(),
                    project=self.project,
                    output_format=output_format,
                    output_location=output_location,
                ),
                **self._extra_grpc_params(),
            )
            return RemoteRetrievalJob(
                self._job_service,
                self._extra_grpc_params,
                response.id,
                output_file_uri=response.output_file_uri,
            )
        else:
            # No job service configured: launch the Spark job directly.
            return start_historical_feature_retrieval_job(
                client=self,
                project=self.project,
                entity_source=entity_source,
                feature_tables=feature_tables,
                output_format=output_format,
                output_path=output_location,
            )

    def get_historical_features_df(
        self,
        feature_refs: List[str],
        entity_source: Union[FileSource, BigQuerySource],
    ):
        """
        Launch a historical feature retrieval job.

        Args:
            feature_refs: List of feature references that will be returned for each entity.
                Each feature reference should have the following format:
                "feature_table:feature" where "feature_table" & "feature" refer to
                the feature and feature table names respectively.
            entity_source (Union[FileSource, BigQuerySource]): Source for the entity rows.
                The user needs to make sure that the source is accessible from
                the Spark cluster that will be used for the retrieval job.

        Returns:
            Returns the historical feature retrieval result in the form of Spark dataframe.

        Examples:
            >>> from feast import Client
            >>> from feast.data_format import ParquetFormat
            >>> from datetime import datetime
            >>> from pyspark.sql import SparkSession
            >>> spark = SparkSession.builder.getOrCreate()
            >>> feast_client = Client(core_url="localhost:6565")
            >>> feature_refs = ["bookings:bookings_7d", "bookings:booking_14d"]
            >>> entity_source = FileSource("event_timestamp", ParquetFormat, "gs://some-bucket/customer")
            >>> df = feast_client.get_historical_features(
            >>>     feature_refs, entity_source)
        """
        feature_tables = self._get_feature_tables_from_feature_refs(
            feature_refs, self.project)
        return start_historical_feature_retrieval_spark_session(
            client=self,
            project=self.project,
            entity_source=entity_source,
            feature_tables=feature_tables,
        )

    def _get_feature_tables_from_feature_refs(self, feature_refs: List[str],
                                              project: Optional[str]):
        # Resolve "table:feature" refs into FeatureTable objects, with each
        # table's feature list narrowed to just the requested features.
        # NOTE(review): itertools.groupby only groups *consecutive* elements,
        # so refs for the same table must be adjacent in feature_refs —
        # confirm callers order their refs accordingly.
        feature_refs_grouped_by_table = [
            (feature_table_name, list(grouped_feature_refs))
            for feature_table_name, grouped_feature_refs in groupby(
                feature_refs, lambda x: x.split(":")[0])
        ]

        feature_tables = []
        for feature_table_name, grouped_feature_refs in feature_refs_grouped_by_table:
            feature_table = self.get_feature_table(feature_table_name, project)
            feature_names = [f.split(":")[-1] for f in grouped_feature_refs]
            # Narrow the table's feature list in place to the requested subset.
            feature_table.features = [
                f for f in feature_table.features if f.name in feature_names
            ]
            feature_tables.append(feature_table)
        return feature_tables

    def start_offline_to_online_ingestion(
        self,
        feature_table: FeatureTable,
        start: datetime,
        end: datetime,
    ) -> SparkJob:
        """
        Launch Ingestion Job from Batch Source to Online Store for given featureTable

        :param feature_table: FeatureTable which will be ingested
        :param start: lower datetime boundary
        :param end: upper datetime boundary
        :return: Spark Job Proxy object
        """
        if not self._use_job_service:
            # Launch the Spark job locally.
            return start_offline_to_online_ingestion(
                client=self,
                project=self.project,
                feature_table=feature_table,
                start=start,
                end=end,
            )
        else:
            # Delegate to the remote job service.
            request = StartOfflineToOnlineIngestionJobRequest(
                project=self.project,
                table_name=feature_table.name,
            )
            request.start_date.FromDatetime(start)
            request.end_date.FromDatetime(end)
            response = self._job_service.StartOfflineToOnlineIngestionJob(
                request)
            return RemoteBatchIngestionJob(
                self._job_service,
                self._extra_grpc_params,
                response.id,
            )

    def start_stream_to_online_ingestion(
        self,
        feature_table: FeatureTable,
        extra_jars: Optional[List[str]] = None,
        project: str = None,
    ) -> SparkJob:
        """
        Launch a streaming ingestion job for the given feature table, either
        locally or via the job service when one is configured.

        :param feature_table: FeatureTable whose stream source will be ingested
        :param extra_jars: Optional additional jars for the Spark job
        :param project: Optional project override (local launch only)
        :return: Spark Job Proxy object
        """
        if not self._use_job_service:
            return start_stream_to_online_ingestion(
                client=self,
                project=project or self.project,
                feature_table=feature_table,
                extra_jars=extra_jars or [],
            )
        else:
            # NOTE(review): the job-service path ignores the `project` and
            # `extra_jars` arguments and always uses self.project — confirm
            # this asymmetry with the local path is intended.
            request = StartStreamToOnlineIngestionJobRequest(
                project=self.project,
                table_name=feature_table.name,
            )
            response = self._job_service.StartStreamToOnlineIngestionJob(
                request)
            return RemoteStreamIngestionJob(self._job_service,
                                            self._extra_grpc_params,
                                            response.id)

    def list_jobs(self, include_terminated: bool) -> List[SparkJob]:
        """
        List Spark jobs known to the client (or to the job service when
        configured), optionally including terminated jobs.
        """
        if not self._use_job_service:
            return list_jobs(include_terminated, self)
        else:
            request = ListJobsRequest(include_terminated=include_terminated)
            response = self._job_service.ListJobs(request)
            return [
                get_remote_job_from_proto(self._job_service,
                                          self._extra_grpc_params, job)
                for job in response.jobs
            ]

    def get_job_by_id(self, job_id: str) -> SparkJob:
        """Retrieve a single Spark job by its id."""
        if not self._use_job_service:
            return get_job_by_id(job_id, self)
        else:
            request = GetJobRequest(job_id=job_id)
            response = self._job_service.GetJob(request)
            return get_remote_job_from_proto(self._job_service,
                                             self._extra_grpc_params,
                                             response.job)

    def stage_dataframe(
        self,
        df: pd.DataFrame,
        event_timestamp_column: str,
    ) -> FileSource:
        """
        Stage a dataframe to the configured staging location and return it
        wrapped as a FileSource.
        """
        return stage_dataframe(df, event_timestamp_column, self._config)
class Client:
    """
    Feast Client: Used for creating, managing, and retrieving features.
    """

    def __init__(self, options: Optional[Dict[str, str]] = None, **kwargs):
        """
        The Feast Client should be initialized with at least one service url

        Please see constants.py for configuration options. Commonly used options
        or arguments include:
            core_url: Feast Core URL. Used to manage features
            serving_url: Feast Serving URL. Used to retrieve features
            project: Sets the active project. This field is optional.
            core_secure: Use client-side SSL/TLS for Core gRPC API
            serving_secure: Use client-side SSL/TLS for Serving gRPC API
            enable_auth: Enable authentication and authorization
            auth_provider: Authentication provider – "google" or "oauth"
            if auth_provider is "oauth", the following fields are mandatory –
            oauth_grant_type, oauth_client_id, oauth_client_secret, oauth_audience, oauth_token_request_url

        Args:
            options: Configuration options to initialize client with
            **kwargs: Additional keyword arguments that will be used as
                configuration options along with "options"
        """
        if options is None:
            options = dict()
        # kwargs override the options dict on key collisions.
        self._config = Config(options={**options, **kwargs})

        # Service stubs are created lazily by the _core_service /
        # _serving_service properties.
        self._core_service_stub: Optional[CoreServiceStub] = None
        self._serving_service_stub: Optional[ServingServiceStub] = None
        self._auth_metadata: Optional[grpc.AuthMetadataPlugin] = None

        # Configure Auth Metadata Plugin if auth is enabled
        if self._config.getboolean(CONFIG_ENABLE_AUTH_KEY):
            self._auth_metadata = feast_auth.get_auth_metadata_plugin(
                self._config)

    @property
    def _core_service(self):
        """
        Creates or returns the gRPC Feast Core Service Stub

        Returns: CoreServiceStub
        """
        if not self._core_service_stub:
            channel = create_grpc_channel(
                url=self._config.get(CONFIG_CORE_URL_KEY),
                enable_ssl=self._config.getboolean(CONFIG_CORE_ENABLE_SSL_KEY),
                enable_auth=self._config.getboolean(CONFIG_ENABLE_AUTH_KEY),
                ssl_server_cert_path=self._config.get(
                    CONFIG_CORE_SERVER_SSL_CERT_KEY),
                auth_metadata_plugin=self._auth_metadata,
                timeout=self._config.getint(
                    CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
            )
            self._core_service_stub = CoreServiceStub(channel)
        return self._core_service_stub

    @property
    def _serving_service(self):
        """
        Creates or returns the gRPC Feast Serving Service Stub

        Returns: ServingServiceStub
        """
        if not self._serving_service_stub:
            channel = create_grpc_channel(
                url=self._config.get(CONFIG_SERVING_URL_KEY),
                enable_ssl=self._config.getboolean(
                    CONFIG_SERVING_ENABLE_SSL_KEY),
                enable_auth=self._config.getboolean(CONFIG_ENABLE_AUTH_KEY),
                ssl_server_cert_path=self._config.get(
                    CONFIG_SERVING_SERVER_SSL_CERT_KEY),
                auth_metadata_plugin=self._auth_metadata,
                timeout=self._config.getint(
                    CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
            )
            self._serving_service_stub = ServingServiceStub(channel)
        return self._serving_service_stub

    @property
    def core_url(self) -> str:
        """
        Retrieve Feast Core URL

        Returns: Feast Core URL string
        """
        return self._config.get(CONFIG_CORE_URL_KEY)

    @core_url.setter
    def core_url(self, value: str):
        """
        Set the Feast Core URL

        Args:
            value: Feast Core URL
        """
        self._config.set(CONFIG_CORE_URL_KEY, value)

    @property
    def serving_url(self) -> str:
        """
        Retrieve Serving Core URL

        Returns: Feast Serving URL string
        """
        return self._config.get(CONFIG_SERVING_URL_KEY)

    @serving_url.setter
    def serving_url(self, value: str):
        """
        Set the Feast Serving URL

        Args:
            value: Feast Serving URL
        """
        self._config.set(CONFIG_SERVING_URL_KEY, value)

    @property
    def core_secure(self) -> bool:
        """
        Retrieve Feast Core client-side SSL/TLS setting

        Returns: Whether client-side SSL/TLS is enabled
        """
        return self._config.getboolean(CONFIG_CORE_ENABLE_SSL_KEY)

    @core_secure.setter
    def core_secure(self, value: bool):
        """
        Set the Feast Core client-side SSL/TLS setting

        Args:
            value: True to enable client-side SSL/TLS
        """
        self._config.set(CONFIG_CORE_ENABLE_SSL_KEY, value)

    @property
    def serving_secure(self) -> bool:
        """
        Retrieve Feast Serving client-side SSL/TLS setting

        Returns: Whether client-side SSL/TLS is enabled
        """
        return self._config.getboolean(CONFIG_SERVING_ENABLE_SSL_KEY)

    @serving_secure.setter
    def serving_secure(self, value: bool):
        """
        Set the Feast Serving client-side SSL/TLS setting

        Args:
            value: True to enable client-side SSL/TLS
        """
        self._config.set(CONFIG_SERVING_ENABLE_SSL_KEY, value)

    def version(self):
        """
        Returns version information from Feast Core and Feast Serving
        """
        import pkg_resources

        result = {
            "sdk": {
                "version": pkg_resources.get_distribution("feast").version
            },
            "serving": "not configured",
            "core": "not configured",
        }

        if self.serving_url:
            serving_version = self._serving_service.GetFeastServingInfo(
                GetFeastServingInfoRequest(),
                timeout=self._config.getint(
                    CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
                metadata=self._get_grpc_metadata(),
            ).version
            result["serving"] = {
                "url": self.serving_url,
                "version": serving_version
            }

        if self.core_url:
            core_version = self._core_service.GetFeastCoreVersion(
                GetFeastCoreVersionRequest(),
                timeout=self._config.getint(
                    CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
                metadata=self._get_grpc_metadata(),
            ).version
            result["core"] = {"url": self.core_url, "version": core_version}

        return result

    @property
    def project(self) -> Union[str, None]:
        """
        Retrieve currently active project

        Returns: Project name
        """
        return self._config.get(CONFIG_PROJECT_KEY)

    def set_project(self, project: Optional[str] = None):
        """
        Set currently active Feast project

        Args:
            project: Project to set as active. If unset, will reset to the default project.
        """
        if project is None:
            project = FEAST_DEFAULT_OPTIONS[CONFIG_PROJECT_KEY]
        self._config.set(CONFIG_PROJECT_KEY, project)

    def list_projects(self) -> List[str]:
        """
        List all active Feast projects

        Returns: List of project names
        """
        response = self._core_service.ListProjects(
            ListProjectsRequest(),
            timeout=self._config.getint(
                CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
            metadata=self._get_grpc_metadata(),
        )  # type: ListProjectsResponse
        return list(response.projects)

    def create_project(self, project: str):
        """
        Creates a Feast project

        Args:
            project: Name of project
        """
        self._core_service.CreateProject(
            CreateProjectRequest(name=project),
            timeout=self._config.getint(
                CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
            metadata=self._get_grpc_metadata(),
        )  # type: CreateProjectResponse

    def archive_project(self, project):
        """
        Archives a project. Project will still continue to function for
        ingestion and retrieval, but will be in a read-only state. It will
        also not be visible from the Core API for management purposes.

        Args:
            project: Name of project to archive
        """
        try:
            # BUGFIX: use the lazily-created _core_service property. The raw
            # _core_service_stub attribute is None until the property has been
            # accessed, so calling it directly crashed on a fresh client.
            self._core_service.ArchiveProject(
                ArchiveProjectRequest(name=project),
                timeout=self._config.getint(
                    CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
                metadata=self._get_grpc_metadata(),
            )  # type: ArchiveProjectResponse
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())

        # revert to the default project
        # BUGFIX: this class keeps the active project in its Config; the old
        # code referenced a `_project` attribute that is never defined here.
        if self.project == project:
            self.set_project(FEAST_DEFAULT_OPTIONS[CONFIG_PROJECT_KEY])

    def apply(self, feature_sets: Union[List[FeatureSet], FeatureSet]):
        """
        Idempotently registers feature set(s) with Feast Core. Either a single
        feature set or a list can be provided.

        Args:
            feature_sets: List of feature sets that will be registered
        """
        if not isinstance(feature_sets, list):
            feature_sets = [feature_sets]
        for feature_set in feature_sets:
            if isinstance(feature_set, FeatureSet):
                self._apply_feature_set(feature_set)
                continue
            raise ValueError(
                f"Could not determine feature set type to apply {feature_set}")

    def _apply_feature_set(self, feature_set: FeatureSet):
        """
        Registers a single feature set with Feast

        Args:
            feature_set: Feature set that will be registered
        """
        feature_set.is_valid()
        feature_set_proto = feature_set.to_proto()
        if len(feature_set_proto.spec.project) == 0:
            if self.project is not None:
                feature_set_proto.spec.project = self.project

        # Convert the feature set to a request and send to Feast Core
        try:
            apply_fs_response = self._core_service.ApplyFeatureSet(
                ApplyFeatureSetRequest(feature_set=feature_set_proto),
                timeout=self._config.getint(
                    CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
                metadata=self._get_grpc_metadata(),
            )  # type: ApplyFeatureSetResponse
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())

        # Extract the returned feature set
        applied_fs = FeatureSet.from_proto(apply_fs_response.feature_set)

        # If the feature set has changed, update the local copy
        if apply_fs_response.status == ApplyFeatureSetResponse.Status.CREATED:
            print(f'Feature set created: "{applied_fs.name}"')
        if apply_fs_response.status == ApplyFeatureSetResponse.Status.UPDATED:
            print(f'Feature set updated: "{applied_fs.name}"')

        # If no change has been applied, do nothing
        if apply_fs_response.status == ApplyFeatureSetResponse.Status.NO_CHANGE:
            print(f"No change detected or applied: {feature_set.name}")

        # Deep copy from the returned feature set to the local feature set
        feature_set._update_from_feature_set(applied_fs)

    def list_feature_sets(
            self,
            project: str = None,
            name: str = None,
            labels: Dict[str, str] = dict()) -> List[FeatureSet]:
        """
        Retrieve a list of feature sets from Feast Core

        Args:
            project: Filter feature sets based on project name
            name: Filter feature sets based on feature set name
            labels: Filter feature sets based on labels

        Returns:
            List of feature sets
        """
        if project is None:
            if self.project is not None:
                project = self.project
            else:
                project = "*"

        if name is None:
            name = "*"

        # `labels` default is never mutated, so the shared-dict default is safe.
        filter = ListFeatureSetsRequest.Filter(project=project,
                                               feature_set_name=name,
                                               labels=labels)

        # Get latest feature sets from Feast Core
        feature_set_protos = self._core_service.ListFeatureSets(
            ListFeatureSetsRequest(filter=filter),
            metadata=self._get_grpc_metadata(),
        )  # type: ListFeatureSetsResponse

        # Extract feature sets and return
        feature_sets = []
        for feature_set_proto in feature_set_protos.feature_sets:
            feature_set = FeatureSet.from_proto(feature_set_proto)
            feature_set._client = self
            feature_sets.append(feature_set)
        return feature_sets

    def get_feature_set(self,
                        name: str,
                        project: str = None) -> Union[FeatureSet, None]:
        """
        Retrieves a feature set.

        Args:
            project: Feast project that this feature set belongs to
            name: Name of feature set

        Returns:
            Returns either the specified feature set, or raises an exception if
            none is found
        """
        if project is None:
            if self.project is not None:
                project = self.project
            else:
                raise ValueError("No project has been configured.")

        try:
            get_feature_set_response = self._core_service.GetFeatureSet(
                GetFeatureSetRequest(project=project, name=name.strip()),
                metadata=self._get_grpc_metadata(),
            )  # type: GetFeatureSetResponse
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())
        return FeatureSet.from_proto(get_feature_set_response.feature_set)

    def list_features_by_ref(
        self,
        project: str = None,
        entities: List[str] = list(),
        labels: Dict[str, str] = dict(),
    ) -> Dict[FeatureRef, Feature]:
        """
        Returns a list of features based on filters provided.

        Args:
            project: Feast project that these features belongs to
            entities: Feast entity that these features are associated with
            labels: Feast labels that these features are associated with

        Returns:
            Dictionary of <feature references: features>

        Examples:
            >>> from feast import Client
            >>>
            >>> feast_client = Client(core_url="localhost:6565")
            >>> features = feast_client.list_features_by_ref(project="test_project", entities=["driver_id"], labels={"key1":"val1","key2":"val2"})
            >>> print(features)
        """
        if project is None:
            if self.project is not None:
                project = self.project
            else:
                project = "default"

        filter = ListFeaturesRequest.Filter(project=project,
                                            entities=entities,
                                            labels=labels)

        feature_protos = self._core_service.ListFeatures(
            ListFeaturesRequest(filter=filter),
            metadata=self._get_grpc_metadata(),
        )  # type: ListFeaturesResponse

        features_dict = {}
        for ref_str, feature_proto in feature_protos.features.items():
            feature_ref = FeatureRef.from_str(ref_str, ignore_project=True)
            feature = Feature.from_proto(feature_proto)
            features_dict[feature_ref] = feature

        return features_dict

    def list_entities(self) -> Dict[str, Entity]:
        """
        Returns a dictionary of entities across all feature sets

        Returns:
            Dictionary of entities, indexed by name
        """
        entities_dict = OrderedDict()
        for fs in self.list_feature_sets():
            for entity in fs.entities:
                entities_dict[entity.name] = entity
        return entities_dict

    def get_batch_features(
        self,
        feature_refs: List[str],
        entity_rows: Union[pd.DataFrame, str],
        compute_statistics: bool = False,
        project: str = None,
    ) -> RetrievalJob:
        """
        Deprecated. Please see get_historical_features.
        """
        warnings.warn(
            "The method get_batch_features() is being deprecated. Please use the identical get_historical_features(). "
            "Feast 0.7 and onwards will not support get_batch_features().",
            DeprecationWarning,
        )
        return self.get_historical_features(feature_refs, entity_rows,
                                            compute_statistics, project)

    def get_historical_features(
        self,
        feature_refs: List[str],
        entity_rows: Union[pd.DataFrame, str],
        compute_statistics: bool = False,
        project: str = None,
    ) -> RetrievalJob:
        """
        Retrieves historical features from a Feast Serving deployment.

        Args:
            feature_refs: List of feature references that will be returned for each
                entity. Each feature reference should have the following format:
                "feature_set:feature" where "feature_set" & "feature" refer to
                the feature and feature set names respectively.
                Only the feature name is required.
            entity_rows (Union[pd.DataFrame, str]):
                Pandas dataframe containing entities and a 'datetime' column.
                Each entity in a feature set must be present as a column in this
                dataframe. The datetime column must contain timestamps in
                datetime64 format.
            compute_statistics (bool):
                Indicates whether Feast should compute statistics over the retrieved dataset.
            project: Specifies the project which contain the FeatureSets
                which the requested features belong to.

        Returns:
            feast.job.RetrievalJob:
                Returns a retrieval job object that can be used to monitor retrieval
                progress asynchronously, and can be used to materialize the
                results.

        Examples:
            >>> from feast import Client
            >>> from datetime import datetime
            >>>
            >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566")
            >>> feature_refs = ["my_project/bookings_7d", "booking_14d"]
            >>> entity_rows = pd.DataFrame(
            >>>     {
            >>>         "datetime": [pd.datetime.now() for _ in range(3)],
            >>>         "customer": [1001, 1002, 1003],
            >>>     }
            >>> )
            >>> feature_retrieval_job = feast_client.get_historical_features(
            >>>     feature_refs, entity_rows, project="my_project")
            >>> df = feature_retrieval_job.to_dataframe()
            >>> print(df)
        """
        # Retrieve serving information to determine store type and
        # staging location
        serving_info = self._serving_service.GetFeastServingInfo(
            GetFeastServingInfoRequest(),
            timeout=self._config.getint(
                CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
            metadata=self._get_grpc_metadata(),
        )  # type: GetFeastServingInfoResponse

        if serving_info.type != FeastServingType.FEAST_SERVING_TYPE_BATCH:
            raise Exception(
                f'You are connected to a store "{self.serving_url}" which '
                f"does not support batch retrieval ")

        if isinstance(entity_rows, pd.DataFrame):
            # Pandas DataFrame detected
            # Remove timezone from datetime column
            if isinstance(entity_rows["datetime"].dtype,
                          pd.core.dtypes.dtypes.DatetimeTZDtype):
                entity_rows["datetime"] = pd.DatetimeIndex(
                    entity_rows["datetime"]).tz_localize(None)
        elif isinstance(entity_rows, str):
            # String based source
            if not entity_rows.endswith((".avro", "*")):
                raise Exception(
                    "Only .avro and wildcard paths are accepted as entity_rows"
                )
        else:
            raise Exception(f"Only pandas.DataFrame and str types are allowed"
                            f" as entity_rows, but got {type(entity_rows)}.")

        # Export and upload entity row DataFrame to staging location
        # provided by Feast
        staged_files = export_source_to_staging_location(
            entity_rows, serving_info.job_staging_location)  # type: List[str]

        request = GetBatchFeaturesRequest(
            features=_build_feature_references(
                feature_ref_strs=feature_refs,
                project=project if project is not None else self.project,
            ),
            dataset_source=DatasetSource(file_source=DatasetSource.FileSource(
                file_uris=staged_files,
                data_format=DataFormat.DATA_FORMAT_AVRO)),
            compute_statistics=compute_statistics,
        )

        # Retrieve Feast Job object to manage life cycle of retrieval
        try:
            response = self._serving_service.GetBatchFeatures(
                request, metadata=self._get_grpc_metadata())
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())

        return RetrievalJob(
            response.job,
            self._serving_service,
            auth_metadata_plugin=self._auth_metadata,
        )

    def get_online_features(
        self,
        feature_refs: List[str],
        entity_rows: List[Union[GetOnlineFeaturesRequest.EntityRow,
                                Dict[str, Any]]],
        project: Optional[str] = None,
        omit_entities: bool = False,
    ) -> OnlineResponse:
        """
        Retrieves the latest online feature data from Feast Serving

        Args:
            feature_refs: List of feature references that will be returned for each
                entity. Each feature reference should have the following format:
                "feature_set:feature" where "feature_set" & "feature" refer to
                the feature and feature set names respectively.
                Only the feature name is required.
            entity_rows: A list of dictionaries where each key is an entity and each value is
                feast.types.Value or Python native form.
            project: Optionally specify the the project override. If specified, uses given project for retrieval.
                Overrides the projects specified in Feature References if also are specified.
            omit_entities: If true will omit entity values in the returned feature data.

        Returns:
            GetOnlineFeaturesResponse containing the feature data in records.
            Each EntityRow provided will yield one record, which contains
            data fields with data value and field status metadata (if included).

        Examples:
            >>> from feast import Client
            >>>
            >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566")
            >>> feature_refs = ["daily_transactions"]
            >>> entity_rows = [{"customer_id": 0},{"customer_id": 1}]
            >>>
            >>> online_response = feast_client.get_online_features(
            >>>     feature_refs, entity_rows, project="my_project")
            >>> online_response_dict = online_response.to_dict()
            >>> print(online_response_dict)
            {'daily_transactions': [1.1,1.2], 'customer_id': [0,1]}
        """
        try:
            response = self._serving_service.GetOnlineFeatures(
                GetOnlineFeaturesRequest(
                    omit_entities_in_response=omit_entities,
                    features=_build_feature_references(
                        feature_ref_strs=feature_refs),
                    entity_rows=_infer_online_entity_rows(entity_rows),
                    project=project if project is not None else self.project,
                ),
                metadata=self._get_grpc_metadata(),
            )
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())

        response = OnlineResponse(response)
        return response

    def list_ingest_jobs(
        self,
        job_id: str = None,
        feature_set_ref: FeatureSetRef = None,
        store_name: str = None,
    ):
        """
        List the ingestion jobs currently registered in Feast, with optional filters.
        Provides detailed metadata about each ingestion job.

        Args:
            job_id: Select specific ingestion job with the given job_id
            feature_set_ref: Filter ingestion jobs by target feature set (via reference)
            store_name: Filter ingestion jobs by target feast store's name

        Returns:
            List of IngestJobs matching the given filters
        """
        # construct list request
        feature_set_ref_proto = None
        if feature_set_ref:
            feature_set_ref_proto = feature_set_ref.to_proto()
        list_filter = ListIngestionJobsRequest.Filter(
            id=job_id,
            feature_set_reference=feature_set_ref_proto,
            store_name=store_name,
        )
        request = ListIngestionJobsRequest(filter=list_filter)
        # make list request & unpack response
        response = self._core_service.ListIngestionJobs(
            request,
            metadata=self._get_grpc_metadata(),
        )  # type: ignore
        ingest_jobs = [
            IngestJob(proto,
                      self._core_service,
                      auth_metadata_plugin=self._auth_metadata)
            for proto in response.jobs  # type: ignore
        ]
        return ingest_jobs

    def restart_ingest_job(self, job: IngestJob):
        """
        Restart ingestion job currently registered in Feast.
        NOTE: Data might be lost during the restart for some job runners.
        Does not support stopping a job in a transitional (ie pending,
        suspending, aborting), terminal state (ie suspended or aborted) or
        unknown status

        Args:
            job: IngestJob to restart
        """
        request = RestartIngestionJobRequest(id=job.id)
        try:
            self._core_service.RestartIngestionJob(
                request,
                metadata=self._get_grpc_metadata(),
            )  # type: ignore
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())

    def stop_ingest_job(self, job: IngestJob):
        """
        Stop ingestion job currently registered in Feast
        Does nothing if the target job if already in a terminal state (ie
        suspended or aborted).
        Does not support stopping a job in a transitional (ie pending,
        suspending, aborting) or in a unknown status

        Args:
            job: IngestJob to restart
        """
        request = StopIngestionJobRequest(id=job.id)
        try:
            self._core_service.StopIngestionJob(
                request,
                metadata=self._get_grpc_metadata(),
            )  # type: ignore
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())

    def ingest(
        self,
        feature_set: Union[str, FeatureSet],
        source: Union[pd.DataFrame, str],
        chunk_size: int = 10000,
        max_workers: int = max(CPU_COUNT - 1, 1),
        disable_progress_bar: bool = False,
        timeout: int = KAFKA_CHUNK_PRODUCTION_TIMEOUT,
    ) -> str:
        """
        Loads feature data into Feast for a specific feature set.

        Args:
            feature_set (typing.Union[str, feast.feature_set.FeatureSet]):
                Feature set object or the string name of the feature set
            source (typing.Union[pd.DataFrame, str]):
                Either a file path or Pandas Dataframe to ingest into Feast
                Files that are currently supported:
                    * parquet
                    * csv
                    * json
            chunk_size (int):
                Amount of rows to load and ingest at a time.
            max_workers (int):
                Number of worker processes to use to encode values.
            disable_progress_bar (bool):
                Disable printing of progress statistics.
            timeout (int):
                Timeout in seconds to wait for completion.

        Returns:
            str:
                ingestion id for this dataset

        Examples:
            >>> from feast import Client
            >>>
            >>> client = Client(core_url="localhost:6565")
            >>> fs_df = pd.DataFrame(
            >>>     {
            >>>         "datetime": [pd.datetime.now()],
            >>>         "driver": [1001],
            >>>         "rating": [4.3],
            >>>     }
            >>> )
            >>> client.set_project("project1")
            >>> client.ingest("driver", fs_df)
            >>>
            >>> driver_fs = client.get_feature_set(name="driver", project="project1")
            >>> client.ingest(driver_fs, fs_df)
        """
        if isinstance(feature_set, FeatureSet):
            name = feature_set.name
            project = feature_set.project
        elif isinstance(feature_set, str):
            if self.project is not None:
                project = self.project
            else:
                project = "default"
            name = feature_set
        else:
            raise Exception("Feature set name must be provided")

        # Read table and get row count
        dir_path, dest_path = _read_table_from_source(source, chunk_size,
                                                      max_workers)

        pq_file = pq.ParquetFile(dest_path)

        row_count = pq_file.metadata.num_rows

        current_time = time.time()

        print("Waiting for feature set to be ready for ingestion...")
        while True:
            if timeout is not None and time.time() - current_time >= timeout:
                raise TimeoutError(
                    "Timed out waiting for feature set to be ready")
            fetched_feature_set: Optional[FeatureSet] = self.get_feature_set(
                name, project)
            if (fetched_feature_set is not None
                    and fetched_feature_set.status ==
                    FeatureSetStatus.STATUS_READY):
                feature_set = fetched_feature_set
                break
            time.sleep(3)

        if timeout is not None:
            # Remaining budget after waiting for readiness; used for the
            # Kafka flush below.
            timeout = timeout - int(time.time() - current_time)

        try:
            # Kafka configs
            brokers = feature_set.get_kafka_source_brokers()
            topic = feature_set.get_kafka_source_topic()
            producer = get_producer(brokers, row_count, disable_progress_bar)

            # Loop optimization declarations
            produce = producer.produce
            flush = producer.flush
            ingestion_id = _generate_ingestion_id(feature_set)

            # Transform and push data to Kafka
            if feature_set.source.source_type == "Kafka":
                for chunk in get_feature_row_chunks(
                        file=dest_path,
                        row_groups=list(range(pq_file.num_row_groups)),
                        fs=feature_set,
                        ingestion_id=ingestion_id,
                        max_workers=max_workers,
                ):

                    # Push FeatureRow one chunk at a time to kafka
                    for serialized_row in chunk:
                        produce(topic=topic, value=serialized_row)

                    # Force a flush after each chunk
                    flush(timeout=timeout)

                    # Remove chunk from memory
                    del chunk

            else:
                raise Exception(
                    f"Could not determine source type for feature set "
                    f'"{feature_set.name}" with source type '
                    f'"{feature_set.source.source_type}"')

            # Print ingestion statistics
            producer.print_results()
        finally:
            # Remove parquet file(s) that were created earlier
            print("Removing temporary file(s)...")
            shutil.rmtree(dir_path)

        return ingestion_id

    def get_statistics(
        self,
        feature_set_id: str,
        store: str,
        features: List[str] = [],
        ingestion_ids: Optional[List[str]] = None,
        start_date: Optional[datetime.datetime] = None,
        end_date: Optional[datetime.datetime] = None,
        force_refresh: bool = False,
        project: Optional[str] = None,
    ) -> statistics_pb2.DatasetFeatureStatisticsList:
        """
        Retrieves the feature featureStatistics computed over the data in the batch
        stores.

        Args:
            feature_set_id: Feature set id to retrieve batch featureStatistics for. If project
                is not provided, the default ("default") will be used.
            store: Name of the store to retrieve feature featureStatistics over. This
                store must be a historical store.
            features: Optional list of feature names to filter from the results.
            ingestion_ids: Optional list of dataset Ids by which to filter data
                before retrieving featureStatistics. Cannot be used with start_date
                and end_date.
                If multiple dataset ids are provided, unaggregatable featureStatistics
                will be dropped.
            start_date: Optional start date over which to filter statistical data.
                Data from this date will be included.
                Cannot be used with dataset_ids. If the provided period spans
                multiple days, unaggregatable featureStatistics will be dropped.
            end_date: Optional end date over which to filter statistical data.
                Data from this data will not be included.
                Cannot be used with dataset_ids. If the provided period spans
                multiple days, unaggregatable featureStatistics will be dropped.
            force_refresh: Setting this flag to true will force a recalculation of
                featureStatistics and overwrite results currently in the cache, if any.
            project: Manual override for default project.

        Returns:
            Returns a tensorflow DatasetFeatureStatisticsList containing TFDV featureStatistics.
        """
        if ingestion_ids is not None and (start_date is not None
                                          or end_date is not None):
            raise ValueError(
                "Only one of dataset_id or [start_date, end_date] can be provided."
            )

        # BUGFIX: `project` defaults to None, and the previous check
        # `project != ""` let None through, producing a bogus
        # "None/<feature_set_id>" identifier.
        if project and "/" not in feature_set_id:
            feature_set_id = f"{project}/{feature_set_id}"

        request = GetFeatureStatisticsRequest(
            feature_set_id=feature_set_id,
            features=features,
            store=store,
            force_refresh=force_refresh,
        )
        if ingestion_ids is not None:
            request.ingestion_ids.extend(ingestion_ids)
        else:
            if start_date is not None:
                request.start_date.CopyFrom(
                    Timestamp(seconds=int(start_date.timestamp())))
            if end_date is not None:
                request.end_date.CopyFrom(
                    Timestamp(seconds=int(end_date.timestamp())))

        # CONSISTENCY FIX: attach auth metadata like every other Core call in
        # this class; without it this call fails on auth-enabled deployments.
        return self._core_service.GetFeatureStatistics(
            request,
            metadata=self._get_grpc_metadata(),
        ).dataset_feature_statistics_list

    def _get_grpc_metadata(self):
        """
        Returns a metadata tuple to attach to gRPC requests. This is primarily
        used when authentication is enabled but SSL/TLS is disabled.

        Returns: Tuple of metadata to attach to each gRPC call
        """
        if self._config.getboolean(
                CONFIG_ENABLE_AUTH_KEY) and self._auth_metadata:
            return self._auth_metadata.get_signed_meta()
        return ()
class Client:
    """
    Feast Client: connects to Feast Core (feature set management) and Feast
    Serving (feature retrieval) over gRPC.
    """

    def __init__(self,
                 core_url: str = None,
                 serving_url: str = None,
                 verbose: bool = False):
        """
        Args:
            core_url: Feast Core URL ("host:port"). If omitted, the value of
                the environment variable named by FEAST_CORE_URL_ENV_KEY is
                used (see the core_url property).
            serving_url: Feast Serving URL ("host:port"). If omitted, the
                value of the environment variable named by
                FEAST_SERVING_URL_ENV_KEY is used.
            verbose: Enable verbose logging
        """
        self._core_url = core_url
        self._serving_url = serving_url
        self._verbose = verbose
        self.__core_channel: grpc.Channel = None
        self.__serving_channel: grpc.Channel = None
        self._core_service_stub: CoreServiceStub = None
        self._serving_service_stub: ServingServiceStub = None

    @property
    def core_url(self) -> str:
        """Feast Core URL; falls back to the environment, then ""."""
        if self._core_url is not None:
            return self._core_url
        if os.getenv(FEAST_CORE_URL_ENV_KEY) is not None:
            return os.getenv(FEAST_CORE_URL_ENV_KEY)
        return ""

    @core_url.setter
    def core_url(self, value: str):
        self._core_url = value

    @property
    def serving_url(self) -> str:
        """Feast Serving URL; falls back to the environment, then ""."""
        if self._serving_url is not None:
            return self._serving_url
        if os.getenv(FEAST_SERVING_URL_ENV_KEY) is not None:
            return os.getenv(FEAST_SERVING_URL_ENV_KEY)
        return ""

    @serving_url.setter
    def serving_url(self, value: str):
        self._serving_url = value

    def version(self):
        """
        Returns version information from Feast Core and Feast Serving

        :return: Dictionary containing Core and Serving versions and status
        """
        self._connect_core()
        self._connect_serving()

        core_version = ""
        serving_version = ""
        core_status = "not connected"
        serving_status = "not connected"

        # Each version call is attempted independently so that one side being
        # down still yields a useful report for the other.
        try:
            core_version = self._core_service_stub.GetFeastCoreVersion(
                GetFeastCoreVersionRequest(),
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT).version
            core_status = "connected"
        except grpc.RpcError as e:
            print(
                format_grpc_exception("GetFeastCoreVersion", e.code(),
                                      e.details()))

        try:
            serving_version = self._serving_service_stub.GetFeastServingInfo(
                GetFeastServingInfoRequest(),
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT).version
            serving_status = "connected"
        except grpc.RpcError as e:
            print(
                format_grpc_exception("GetFeastServingInfo", e.code(),
                                      e.details()))

        return {
            "core": {
                "url": self.core_url,
                "version": core_version,
                "status": core_status,
            },
            "serving": {
                "url": self.serving_url,
                "version": serving_version,
                "status": serving_status,
            },
        }

    def _connect_core(self, skip_if_connected: bool = True):
        """
        Connect to Core API

        Args:
            skip_if_connected: Do not attempt to connect if already connected

        Raises:
            ValueError: If no Feast Core URL is configured.
            ConnectionError: If the channel does not become ready within
                GRPC_CONNECTION_TIMEOUT_DEFAULT.
        """
        if skip_if_connected and self._core_service_stub:
            return

        if not self.core_url:
            raise ValueError("Please set Feast Core URL.")

        if self.__core_channel is None:
            self.__core_channel = grpc.insecure_channel(self.core_url)

        try:
            grpc.channel_ready_future(self.__core_channel).result(
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT)
        except grpc.FutureTimeoutError:
            # Fixed: raise instead of print() + sys.exit(1). Library code must
            # not terminate the host process on a connection failure; callers
            # can catch ConnectionError and decide what to do.
            raise ConnectionError(
                f"Connection timed out while attempting to connect to Feast "
                f"Core gRPC server {self.core_url}")
        else:
            self._core_service_stub = CoreServiceStub(self.__core_channel)

    def _connect_serving(self, skip_if_connected: bool = True):
        """
        Connect to Serving API

        Args:
            skip_if_connected: Do not attempt to connect if already connected

        Raises:
            ValueError: If no Feast Serving URL is configured.
            ConnectionError: If the channel does not become ready within
                GRPC_CONNECTION_TIMEOUT_DEFAULT.
        """
        if skip_if_connected and self._serving_service_stub:
            return

        if not self.serving_url:
            raise ValueError("Please set Feast Serving URL.")

        if self.__serving_channel is None:
            self.__serving_channel = grpc.insecure_channel(self.serving_url)

        try:
            grpc.channel_ready_future(self.__serving_channel).result(
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT)
        except grpc.FutureTimeoutError:
            # Fixed: raise instead of print() + sys.exit(1) (see
            # _connect_core).
            raise ConnectionError(
                f"Connection timed out while attempting to connect to Feast "
                f"Serving gRPC server {self.serving_url}")
        else:
            self._serving_service_stub = ServingServiceStub(
                self.__serving_channel)

    def apply(self, feature_sets: Union[List[FeatureSet], FeatureSet]):
        """
        Idempotently registers feature set(s) with Feast Core. Either a single
        feature set or a list can be provided.

        :param feature_sets: Union[List[FeatureSet], FeatureSet]
        """
        if not isinstance(feature_sets, list):
            feature_sets = [feature_sets]
        for feature_set in feature_sets:
            if isinstance(feature_set, FeatureSet):
                self._apply_feature_set(feature_set)
                continue
            raise ValueError(
                f"Could not determine feature set type to apply {feature_set}")

    def _apply_feature_set(self, feature_set: FeatureSet):
        """
        Registers a single feature set with Feast Core and synchronizes the
        local object with the server's returned (possibly versioned) copy.
        """
        self._connect_core()
        feature_set._client = self

        valid, message = feature_set.is_valid()
        if not valid:
            raise Exception(message)
        try:
            apply_fs_response = self._core_service_stub.ApplyFeatureSet(
                ApplyFeatureSetRequest(feature_set=feature_set.to_proto()),
                timeout=GRPC_CONNECTION_TIMEOUT_APPLY,
            )  # type: ApplyFeatureSetResponse
            applied_fs = FeatureSet.from_proto(apply_fs_response.feature_set)

            # Server created/updated the feature set: adopt its copy.
            if apply_fs_response.status == ApplyFeatureSetResponse.Status.CREATED:
                print(
                    f'Feature set updated/created: "{applied_fs.name}:{applied_fs.version}".'
                )
                feature_set._update_from_feature_set(applied_fs,
                                                     is_dirty=False)
                return

            # Nothing changed server-side; local object is left untouched.
            if apply_fs_response.status == ApplyFeatureSetResponse.Status.NO_CHANGE:
                print(f"No change detected in feature set {feature_set.name}")
                return
        except grpc.RpcError as e:
            print(
                format_grpc_exception("ApplyFeatureSet", e.code(),
                                      e.details()))

    def list_feature_sets(self) -> List[FeatureSet]:
        """
        Retrieve a list of feature sets from Feast Core

        :return: Returns a list of feature sets
        """
        self._connect_core()

        try:
            # Get latest feature sets from Feast Core
            feature_set_protos = self._core_service_stub.ListFeatureSets(
                ListFeatureSetsRequest())  # type: ListFeatureSetsResponse
        except grpc.RpcError as e:
            raise Exception(
                format_grpc_exception("ListFeatureSets", e.code(),
                                      e.details()))

        # Store list of feature sets
        feature_sets = []
        for feature_set_proto in feature_set_protos.feature_sets:
            feature_set = FeatureSet.from_proto(feature_set_proto)
            feature_set._client = self
            feature_sets.append(feature_set)
        return feature_sets

    def get_feature_set(self,
                        name: str,
                        version: int = None,
                        fail_if_missing: bool = False
                        ) -> Union[FeatureSet, None]:
        """
        Retrieve a single feature set from Feast Core

        :param name: (str) Name of feature set
        :param version: (int) Version of feature set
        :param fail_if_missing: (bool) Throws an exception if the feature set
            is not found
        :return: Returns a single feature set, or None on RPC failure /
            missing feature set when fail_if_missing is False
        """
        self._connect_core()
        try:
            # NOTE(review): version is serialized with str(); when version is
            # None the request carries the literal string "None" — presumably
            # the server treats that as "latest". TODO confirm.
            get_feature_set_response = self._core_service_stub.GetFeatureSet(
                GetFeatureSetRequest(
                    name=name.strip(),
                    version=str(version)))  # type: GetFeatureSetResponse
            feature_set = get_feature_set_response.feature_set
        except grpc.RpcError as e:
            print(format_grpc_exception("GetFeatureSet", e.code(),
                                        e.details()))
        else:
            if feature_set is not None:
                return FeatureSet.from_proto(feature_set)

            if fail_if_missing:
                raise Exception(
                    f'Could not find feature set with name "{name}" and '
                    f'version "{version}"')

    def list_entities(self) -> Dict[str, Entity]:
        """
        Returns a dictionary of entities across all feature sets

        :return: Dictionary of entity name to Entity
        """
        # Later feature sets win when two feature sets share an entity name.
        entities_dict = OrderedDict()
        for fs in self.list_feature_sets():
            for entity in fs.entities:
                entities_dict[entity.name] = entity
        return entities_dict

    def get_batch_features(self, feature_ids: List[str],
                           entity_rows: pd.DataFrame) -> Job:
        """
        Retrieves historical features from a Feast Serving deployment.

        Args:
            feature_ids: List of feature ids that will be returned for each
                entity. Each feature id should have the following format
                "feature_set_name:version:feature_name".
            entity_rows: Pandas dataframe containing entities and a
                'datetime' column. Each entity in a feature set must be
                present as a column in this dataframe. The datetime column
                must contain timestamps in datetime64 format

        Returns:
            Feast batch retrieval job: feast.job.Job

        Example usage:
        ============================================================
        >>> from feast import Client
        >>> from datetime import datetime
        >>>
        >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566")
        >>> feature_ids = ["customer:1:bookings_7d"]
        >>> entity_rows = pd.DataFrame(
        >>>         {
        >>>            "datetime": [pd.datetime.now() for _ in range(3)],
        >>>            "customer": [1001, 1002, 1003],
        >>>         }
        >>>     )
        >>> feature_retrieval_job = feast_client.get_batch_features(feature_ids, entity_rows)
        >>> df = feature_retrieval_job.to_dataframe()
        >>> print(df)
        """
        self._connect_serving()

        try:
            fs_request = _build_feature_set_request(feature_ids)

            # Validate entity rows based on entities in Feast Core
            self._validate_entity_rows_for_batch_retrieval(
                entity_rows, fs_request)

            # We want the timestamp column naming to be consistent with the
            # rest of Feast
            entity_rows.columns = [
                "event_timestamp" if col == "datetime" else col
                for col in entity_rows.columns
            ]

            # Remove timezone from datetime column
            if isinstance(
                    entity_rows["event_timestamp"].dtype,
                    pd.core.dtypes.dtypes.DatetimeTZDtype,
            ):
                entity_rows["event_timestamp"] = pd.DatetimeIndex(
                    entity_rows["event_timestamp"]).tz_localize(None)

            # Retrieve serving information to determine store type and
            # staging location
            serving_info = self._serving_service_stub.GetFeastServingInfo(
                GetFeastServingInfoRequest(),
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT
            )  # type: GetFeastServingInfoResponse

            if serving_info.type != FeastServingType.FEAST_SERVING_TYPE_BATCH:
                raise Exception(
                    f'You are connected to a store "{self._serving_url}" which does not support batch retrieval'
                )

            # Export and upload entity row dataframe to staging location
            # provided by Feast
            staged_file = export_dataframe_to_staging_location(
                entity_rows, serving_info.job_staging_location)  # type: str

            request = GetBatchFeaturesRequest(
                feature_sets=fs_request,
                dataset_source=DatasetSource(
                    file_source=DatasetSource.FileSource(
                        file_uris=[staged_file],
                        data_format=DataFormat.DATA_FORMAT_AVRO)),
            )

            # Retrieve Feast Job object to manage life cycle of retrieval
            response = self._serving_service_stub.GetBatchFeatures(request)
            return Job(response.job, self._serving_service_stub)
        except grpc.RpcError as e:
            print(
                format_grpc_exception("GetBatchFeatures", e.code(),
                                      e.details()))

    def _validate_entity_rows_for_batch_retrieval(self, entity_rows,
                                                  feature_sets_request):
        """
        Validate whether an entity_row dataframe contains the correct
        information for batch retrieval

        :param entity_rows: Pandas dataframe containing entities and datetime
            column. Each entity in a feature set must be present as a column
            in this dataframe.
        :param feature_sets_request: Feature sets that will be requested

        Raises:
            ValueError: If the datetime column or a required entity column is
                missing, or a referenced feature set does not exist.
        """
        # Ensure datetime column exists
        if "datetime" not in entity_rows.columns:
            raise ValueError(
                f'Entity rows does not contain "datetime" column in columns {entity_rows.columns}'
            )

        # Validate dataframe columns based on feature set entities
        for feature_set in feature_sets_request:
            fs = self.get_feature_set(name=feature_set.name,
                                      version=feature_set.version)
            if fs is None:
                raise ValueError(
                    f'Feature set "{feature_set.name}:{feature_set.version}" could not be found'
                )
            for entity_type in fs.entities:
                if entity_type.name not in entity_rows.columns:
                    raise ValueError(
                        f'Dataframe does not contain entity "{entity_type.name}" column in columns "{entity_rows.columns}"'
                    )

    def get_online_features(
            self,
            feature_ids: List[str],
            entity_rows: List[GetOnlineFeaturesRequest.EntityRow],
    ) -> GetOnlineFeaturesResponse:
        """
        Retrieves the latest online feature data from Feast Serving

        :param feature_ids: List of feature Ids in the following format
            [feature_set_name]:[version]:[feature_name]
            example: ["feature_set_1:6:my_feature_1",
                      "feature_set_1:6:my_feature_2",]
        :param entity_rows: List of GetFeaturesRequest.EntityRow where each
            row contains entities. Timestamp should not be set for online
            retrieval. All entity types within a feature set must be provided
            for each entity key.
        :return: Returns a list of maps where each item in the list contains
            the latest feature values for the provided entities; None on RPC
            failure (the error is printed).
        """
        self._connect_serving()

        try:
            response = self._serving_service_stub.GetOnlineFeatures(
                GetOnlineFeaturesRequest(
                    feature_sets=_build_feature_set_request(feature_ids),
                    entity_rows=entity_rows,
                ))  # type: GetOnlineFeaturesResponse
        except grpc.RpcError as e:
            print(
                format_grpc_exception("GetOnlineFeatures", e.code(),
                                      e.details()))
        else:
            return response

    def ingest(
            self,
            feature_set: Union[str, FeatureSet],
            dataframe: pd.DataFrame,
            version: int = None,
            force_update: bool = False,
            max_workers: int = CPU_COUNT,
            disable_progress_bar: bool = False,
            chunk_size: int = 5000,
    ):
        """
        Loads data into Feast for a specific feature set.

        :param feature_set: (str, FeatureSet) Feature set object or the
            string name of the feature set (without a version)
        :param dataframe: Pandas dataframe to load into Feast for this
            feature set
        :param version: (int) Version of the feature set for which this
            ingestion should happen
        :param force_update: (bool) Automatically update feature set based on
            data frame before ingesting data
        :param max_workers: Number of worker processes to use to encode the
            dataframe
        :param disable_progress_bar: Disable progress bar during ingestion
        :param chunk_size: Number of rows per chunk to encode before
            ingesting to Feast
        """
        # Resolve the feature set name/version from the argument form.
        if isinstance(feature_set, FeatureSet):
            name = feature_set.name
            if version is None:
                version = feature_set.version
        elif isinstance(feature_set, str):
            name = feature_set
        else:
            # Fixed: was an f-string with no placeholders.
            raise Exception("Feature set name must be provided")

        # Always fetch the authoritative copy from Feast Core.
        feature_set = self.get_feature_set(name, version,
                                           fail_if_missing=True)

        # Update the feature set based on dataframe schema
        if force_update:
            feature_set.infer_fields_from_df(dataframe,
                                             discard_unused_fields=True,
                                             replace_existing_features=True)
            self.apply(feature_set)

        if feature_set.source.source_type == "Kafka":
            ingest_kafka(
                feature_set=feature_set,
                dataframe=dataframe,
                max_workers=max_workers,
                disable_progress_bar=disable_progress_bar,
                chunk_size=chunk_size,
            )
        else:
            raise Exception(f"Could not determine source type for feature set "
                            f'"{feature_set.name}" with source type '
                            f'"{feature_set.source.source_type}"')
class Client:
    """
    Feast SDK client for the legacy Core/Serving APIs: applies resources
    (features, entities, feature groups, storage), submits import jobs,
    creates training datasets, and queries serving data.
    """

    def __init__(self, core_url=None, serving_url=None, verbose=False):
        """Create an instance of Feast client which is connected to feast
        endpoint specified in the parameter. If no url is provided, the
        client will default to the url specified in the environment variable
        FEAST_CORE_URL.

        Args:
            core_url (str, optional): feast's grpc endpoint URL
                (e.g.: "my.feast.com:8433")
            serving_url (str, optional): feast serving's grpc endpoint URL
                (e.g.: "my.feast.com:8433")
        """
        if core_url is None:
            core_url = os.getenv(FEAST_CORE_URL_ENV_KEY)
        self._core_url = core_url

        if serving_url is None:
            serving_url = os.getenv(FEAST_SERVING_URL_ENV_KEY)
        self._serving_url = serving_url

        # gRPC channels/stubs are created lazily by _connect_core /
        # _connect_serving.
        self.__core_channel = None
        self.__serving_channel = None
        self._core_service_stub = None
        self._job_service_stub = None
        self._dataset_service_stub = None
        self._serving_service_stub = None

        self._verbose = verbose
        self._table_downloader = TableDownloader()

    @property
    def core_url(self):
        """Core API URL; falls back to the environment, raises if unset."""
        if self._core_url is None:
            self._core_url = os.getenv(FEAST_CORE_URL_ENV_KEY)
            if self._core_url is None:
                raise ValueError(
                    "Core API URL not set. Either set the " +
                    "environment variable {} or set it explicitly.".format(
                        FEAST_CORE_URL_ENV_KEY))
        return self._core_url

    @core_url.setter
    def core_url(self, value):
        self._core_url = value

    @property
    def serving_url(self):
        """Serving API URL; falls back to the environment, raises if unset."""
        if self._serving_url is None:
            self._serving_url = os.getenv(FEAST_SERVING_URL_ENV_KEY)
            if self._serving_url is None:
                raise ValueError(
                    "Serving API URL not set. Either set the " +
                    "environment variable {} or set it explicitly.".format(
                        FEAST_SERVING_URL_ENV_KEY))
        return self._serving_url

    @serving_url.setter
    def serving_url(self, value):
        self._serving_url = value

    @property
    def verbose(self):
        """Whether verbose progress printing is enabled."""
        return self._verbose

    @verbose.setter
    def verbose(self, val):
        if not isinstance(val, bool):
            raise TypeError("verbose should be a boolean value")
        self._verbose = val

    def apply(self, obj):
        """Create or update one or many feast's resource
        (feature, entity, importer, storage).

        Args:
            obj (object): one or many feast's resource

        Returns:
            The applied resource id, or a list of ids when a list was given.
        """
        if isinstance(obj, list):
            ids = []
            for resource in obj:
                ids.append(self._apply(resource))
            return ids
        else:
            return self._apply(obj)

    def run(self,
            importer,
            name_override=None,
            apply_entity=False,
            apply_features=False):
        """
        Run an import job

        Args:
            importer (feast.sdk.importer.Importer): importer instance
            name_override (str, optional): Job name override
            apply_entity (bool, optional): (default: False) create/update
                entity inside importer
            apply_features (bool, optional): (default: False) create/update
                features inside importer

        Returns:
            (str) job ID of the import job
        """
        request = JobServiceTypes.SubmitImportJobRequest(
            importSpec=importer.spec)
        if name_override is not None:
            request.name = name_override

        # Optionally register the importer's entity/features before
        # submitting the job.
        if apply_entity:
            self._apply_entity(importer.entity)
        if apply_features:
            for feature in importer.features:
                self._apply_feature(importer.features[feature])

        if importer.require_staging:
            print("Staging file to remote path {}".format(
                importer.remote_path))
            importer.stage()
        print("Submitting job with spec:\n {}".format(
            spec_to_yaml(importer.spec)))
        self._connect_core()
        response = self._job_service_stub.SubmitJob(request)
        print("Submitted job with id: {}".format(response.jobId))
        return response.jobId

    def create_dataset(self,
                       feature_set,
                       start_date,
                       end_date,
                       limit=None,
                       name_prefix=None):
        """
        Create training dataset for a feature set. The training dataset
        will be bounded by event timestamp between start_date and end_date.
        Specify limit to limit number of row returned. The training dataset
        will reside in a bigquery table specified by destination.

        Args:
            feature_set (feast.sdk.resources.feature_set.FeatureSet):
                feature set representing the data wanted
            start_date (str): starting date of the training data in ISO 8601
                format (e.g.: "2018-12-31")
            end_date (str): end date of training data in ISO 8601 format
                (e.g.: "2018-12-31")
            limit (int, optional): (default: None) maximum number of row
                returned
            name_prefix (str, optional): (default: None) name prefix.
        :return: feast.resources.feature_set.DatasetInfo: DatasetInfo
            containing the information of training dataset
        """
        self._check_create_dataset_args(feature_set, start_date, end_date,
                                        limit)

        req = DatasetServiceTypes.CreateDatasetRequest(
            featureSet=feature_set.proto,
            startDate=_timestamp_from_datetime(_parse_date(start_date)),
            endDate=_timestamp_from_datetime(_parse_date(end_date)),
            limit=limit,
            namePrefix=name_prefix,
        )
        if self.verbose:
            print("creating training dataset for features: " +
                  str(feature_set.features))
        self._connect_core()
        resp = self._dataset_service_stub.CreateDataset(req)

        if self.verbose:
            print("created dataset {}: {}".format(resp.datasetInfo.name,
                                                  resp.datasetInfo.tableUrl))
        return DatasetInfo(resp.datasetInfo.name, resp.datasetInfo.tableUrl)

    def get_serving_data(self, feature_set, entity_keys, ts_range=None):
        """Get feature value from feast serving API.

        If server_url is not provided, the value stored in the environment
        variable FEAST_SERVING_URL is used to connect to the serving server
        instead.

        Args:
            feature_set (feast.sdk.resources.feature_set.FeatureSet):
                feature set representing the data wanted
            entity_keys (:obj: `list` of :obj: `str): list of entity keys
            ts_range (:obj: `list` of str, optional): size 2 list of start
                and end time, in datetime type. It will filter out any
                feature value having event timestamp outside of the ts_range.

        Returns:
            pandas.DataFrame: DataFrame of results
        """
        start = None
        end = None
        if ts_range is not None:
            if len(ts_range) != 2:
                raise ValueError("ts_range must have len 2")
            start = ts_range[0]
            end = ts_range[1]
            # Fixed: was `type(x) is not datetime`, which rejected datetime
            # subclasses such as pandas.Timestamp. isinstance accepts them
            # while still rejecting non-datetime values.
            if not isinstance(start, datetime) or not isinstance(
                    end, datetime):
                raise TypeError("start and end must be datetime type")

        request = self._build_serving_request(feature_set, entity_keys)
        self._connect_serving()
        return self._response_to_df(
            feature_set, self._serving_service_stub.QueryFeatures(request),
            start, end)

    def download_dataset(self,
                         dataset_info,
                         dest,
                         staging_location,
                         file_type=FileType.CSV):
        """
        Download training dataset as file

        Args:
            dataset_info (feast.sdk.resources.feature_set.DatasetInfo):
                dataset_info to be downloaded
            dest (str): destination's file path
            staging_location (str): url to staging_location (currently
                support a folder in GCS)
            file_type (feast.sdk.resources.feature_set.FileType): (default:
                FileType.CSV) exported file format

        Returns:
            str: path to the downloaded file
        """
        return self._table_downloader.download_table_as_file(
            dataset_info.full_table_id, dest, staging_location, file_type)

    def download_dataset_to_df(self, dataset_info, staging_location):
        """
        Download training dataset as Pandas Dataframe

        Args:
            dataset_info (feast.sdk.resources.feature_set.DatasetInfo):
                dataset_info to be downloaded
            staging_location: url to staging_location (currently
                support a folder in GCS)

        Returns:
            pandas.DataFrame: dataframe of the training dataset
        """
        return self._table_downloader.download_table_as_df(
            dataset_info.full_table_id, staging_location)

    def close(self):
        """
        Close underlying connection to Feast's core and serving end points.
        Safe to call even if no connection was ever established.
        """
        # Fixed: guard against AttributeError when close() is called before
        # any connection was made (channels are still None).
        if self.__core_channel is not None:
            self.__core_channel.close()
            self.__core_channel = None
        if self.__serving_channel is not None:
            self.__serving_channel.close()
            self.__serving_channel = None

    def _connect_core(self):
        """Connect to core api"""
        if self.__core_channel is None:
            self.__core_channel = grpc.insecure_channel(self.core_url)
            self._core_service_stub = CoreServiceStub(self.__core_channel)
            self._job_service_stub = JobServiceStub(self.__core_channel)
            self._dataset_service_stub = DatasetServiceStub(
                self.__core_channel)

    def _connect_serving(self):
        """Connect to serving api"""
        if self.__serving_channel is None:
            self.__serving_channel = grpc.insecure_channel(self.serving_url)
            self._serving_service_stub = ServingAPIStub(self.__serving_channel)

    def _build_serving_request(self, feature_set, entity_keys):
        """Helper function to build serving service request."""
        return QueryFeaturesRequest(
            entityName=feature_set.entity,
            entityId=entity_keys,
            featureId=feature_set.features,
        )

    def _response_to_df(self, feature_set, response, start=None, end=None):
        """Convert a QueryFeatures response into a pandas DataFrame,
        optionally dropping values whose event timestamp falls outside
        [start, end]."""
        is_filter_time = start is not None and end is not None
        df = pd.DataFrame(columns=[feature_set.entity] + feature_set.features)
        dtypes = {}
        for entity_id in response.entities:
            feature_map = response.entities[entity_id].features
            row = {response.entityName: entity_id}
            for feature_id in feature_map:
                v = feature_map[feature_id].value
                if is_filter_time:
                    ts = feature_map[feature_id].timestamp.ToDatetime()
                    if ts < start or ts > end:
                        continue
                feast_valuetype = v.WhichOneof("val")
                # NOTE(review): this membership test checks the *valuetype*
                # against a dict keyed by *feature_id*, so the dtype is
                # re-assigned on every row. Harmless (same value each time)
                # but looks like it was meant to be `feature_id not in
                # dtypes` — confirm before changing.
                if feast_valuetype not in dtypes:
                    dtypes[feature_id] = types.FEAST_VALUETYPE_TO_DTYPE[
                        feast_valuetype]
                v = getattr(v, v.WhichOneof("val"))
                row[feature_id] = v
            df = df.append(row, ignore_index=True)
        return df.astype(dtypes).reset_index(drop=True)

    def _apply(self, obj):
        """Applies a single object to feast core.

        Args:
            obj (object): one of
                [Feature, Entity, FeatureGroup, Storage, Importer]
        """
        if isinstance(obj, Feature):
            return self._apply_feature(obj)
        elif isinstance(obj, Entity):
            return self._apply_entity(obj)
        elif isinstance(obj, FeatureGroup):
            return self._apply_feature_group(obj)
        elif isinstance(obj, Storage):
            return self._apply_storage(obj)
        else:
            raise TypeError("Apply can only be passed one of the following \
            types: [Feature, Entity, FeatureGroup, Storage, Importer]")

    def _apply_feature(self, feature):
        """Apply the feature to the core API

        Args:
            feature (feast.sdk.resources.feature.Feature): feature to apply
        """
        self._connect_core()
        response = self._core_service_stub.ApplyFeature(feature.spec)
        if self.verbose:
            print("Successfully applied feature with id: {}\n---\n{}".format(
                response.featureId, feature))
        return response.featureId

    def _apply_entity(self, entity):
        """Apply the entity to the core API

        Args:
            entity (feast.sdk.resources.entity.Entity): entity to apply
        """
        self._connect_core()
        response = self._core_service_stub.ApplyEntity(entity.spec)
        if self.verbose:
            print("Successfully applied entity with name: {}\n---\n{}".format(
                response.entityName, entity))
        return response.entityName

    def _apply_feature_group(self, feature_group):
        """Apply the feature group to the core API

        Args:
            feature_group (feast.sdk.resources.feature_group.FeatureGroup):
                feature group to apply
        """
        self._connect_core()
        response = self._core_service_stub.ApplyFeatureGroup(
            feature_group.spec)
        if self.verbose:
            print("Successfully applied feature group with id: " +
                  "{}\n---\n{}".format(response.featureGroupId, feature_group))
        return response.featureGroupId

    def _apply_storage(self, storage):
        """Apply the storage to the core API

        Args:
            storage (feast.sdk.resources.storage.Storage): storage to apply
        """
        self._connect_core()
        response = self._core_service_stub.ApplyStorage(storage.spec)
        if self.verbose:
            print("Successfully applied storage with id: " +
                  "{}\n{}".format(response.storageId, storage))
        return response.storageId

    def _check_create_dataset_args(self, feature_set, start_date, end_date,
                                   limit):
        """Validate create_dataset arguments; raises ValueError on bad input."""
        if len(feature_set.features) < 1:
            raise ValueError("feature set is empty")

        start = _parse_date(start_date)
        end = _parse_date(end_date)
        if end < start:
            raise ValueError("end_date is before start_date")

        if limit is not None and limit < 1:
            raise ValueError("limit is not a positive integer")
class Client: """ Feast Client: Used for creating, managing, and retrieving features. """ def __init__(self, core_url: str = None, serving_url: str = None, verbose: bool = False): """ The Feast Client should be initialized with at least one service url Args: core_url: Feast Core URL. Used to manage features serving_url: Feast Serving URL. Used to retrieve features verbose: Enable verbose logging """ self._core_url = core_url self._serving_url = serving_url self._verbose = verbose self.__core_channel: grpc.Channel = None self.__serving_channel: grpc.Channel = None self._core_service_stub: CoreServiceStub = None self._serving_service_stub: ServingServiceStub = None @property def core_url(self) -> str: """ Retrieve Feast Core URL """ if self._core_url is not None: return self._core_url if os.getenv(FEAST_CORE_URL_ENV_KEY) is not None: return os.getenv(FEAST_CORE_URL_ENV_KEY) return "" @core_url.setter def core_url(self, value: str): """ Set the Feast Core URL Returns: Feast Core URL string """ self._core_url = value @property def serving_url(self) -> str: """ Retrieve Serving Core URL """ if self._serving_url is not None: return self._serving_url if os.getenv(FEAST_SERVING_URL_ENV_KEY) is not None: return os.getenv(FEAST_SERVING_URL_ENV_KEY) return "" @serving_url.setter def serving_url(self, value: str): """ Set the Feast Serving URL Returns: Feast Serving URL string """ self._serving_url = value def version(self): """ Returns version information from Feast Core and Feast Serving """ result = {} if self.serving_url: self._connect_serving() serving_version = self._serving_service_stub.GetFeastServingInfo( GetFeastServingInfoRequest(), timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT).version result["serving"] = { "url": self.serving_url, "version": serving_version } if self.core_url: self._connect_core() core_version = self._core_service_stub.GetFeastCoreVersion( GetFeastCoreVersionRequest(), timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT).version result["core"] = {"url": 
self.core_url, "version": core_version} return result def _connect_core(self, skip_if_connected: bool = True): """ Connect to Core API Args: skip_if_connected: Do not attempt to connect if already connected """ if skip_if_connected and self._core_service_stub: return if not self.core_url: raise ValueError("Please set Feast Core URL.") if self.__core_channel is None: self.__core_channel = grpc.insecure_channel(self.core_url) try: grpc.channel_ready_future(self.__core_channel).result( timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT) except grpc.FutureTimeoutError: raise ConnectionError( f"Connection timed out while attempting to connect to Feast " f"Core gRPC server {self.core_url} ") else: self._core_service_stub = CoreServiceStub(self.__core_channel) def _connect_serving(self, skip_if_connected=True): """ Connect to Serving API Args: skip_if_connected: Do not attempt to connect if already connected """ if skip_if_connected and self._serving_service_stub: return if not self.serving_url: raise ValueError("Please set Feast Serving URL.") if self.__serving_channel is None: self.__serving_channel = grpc.insecure_channel(self.serving_url) try: grpc.channel_ready_future(self.__serving_channel).result( timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT) except grpc.FutureTimeoutError: raise ConnectionError( f"Connection timed out while attempting to connect to Feast " f"Serving gRPC server {self.serving_url} ") else: self._serving_service_stub = ServingServiceStub( self.__serving_channel) def apply(self, feature_sets: Union[List[FeatureSet], FeatureSet]): """ Idempotently registers feature set(s) with Feast Core. Either a single feature set or a list can be provided. 
Args: feature_sets: List of feature sets that will be registered """ if not isinstance(feature_sets, list): feature_sets = [feature_sets] for feature_set in feature_sets: if isinstance(feature_set, FeatureSet): self._apply_feature_set(feature_set) continue raise ValueError( f"Could not determine feature set type to apply {feature_set}") def _apply_feature_set(self, feature_set: FeatureSet): """ Registers a single feature set with Feast Args: feature_set: Feature set that will be registered """ self._connect_core() feature_set._client = self feature_set.is_valid() # Convert the feature set to a request and send to Feast Core apply_fs_response = self._core_service_stub.ApplyFeatureSet( ApplyFeatureSetRequest(feature_set=feature_set.to_proto()), timeout=GRPC_CONNECTION_TIMEOUT_APPLY, ) # type: ApplyFeatureSetResponse # Extract the returned feature set applied_fs = FeatureSet.from_proto(apply_fs_response.feature_set) # If the feature set has changed, update the local copy if apply_fs_response.status == ApplyFeatureSetResponse.Status.CREATED: print( f'Feature set updated/created: "{applied_fs.name}:{applied_fs.version}"' ) # If no change has been applied, do nothing if apply_fs_response.status == ApplyFeatureSetResponse.Status.NO_CHANGE: print(f"No change detected or applied: {feature_set.name}") # Deep copy from the returned feature set to the local feature set feature_set._update_from_feature_set(applied_fs) def list_feature_sets(self) -> List[FeatureSet]: """ Retrieve a list of feature sets from Feast Core Returns: List of feature sets """ self._connect_core() # Get latest feature sets from Feast Core feature_set_protos = self._core_service_stub.ListFeatureSets( ListFeatureSetsRequest()) # type: ListFeatureSetsResponse # Extract feature sets and return feature_sets = [] for feature_set_proto in feature_set_protos.feature_sets: feature_set = FeatureSet.from_proto(feature_set_proto) feature_set._client = self feature_sets.append(feature_set) return feature_sets def 
get_feature_set(self, name: str, version: int = None) -> Union[FeatureSet, None]:
    """
    Retrieves a feature set. If no version is specified then the latest
    version will be returned.

    Args:
        name: Name of feature set
        version: Version of feature set

    Returns:
        Returns either the specified feature set, or raises an exception if
        none is found
    """
    self._connect_core()
    if version is None:
        # Version 0 presumably means "latest" to Feast Core (matches the
        # docstring) -- TODO confirm against Core's GetFeatureSet semantics.
        version = 0
    get_feature_set_response = self._core_service_stub.GetFeatureSet(
        GetFeatureSetRequest(
            name=name.strip(),
            version=int(version)))  # type: GetFeatureSetResponse
    return FeatureSet.from_proto(get_feature_set_response.feature_set)

def list_entities(self) -> Dict[str, Entity]:
    """
    Returns a dictionary of entities across all feature sets.

    Returns:
        Dictionary of entities, indexed by name
    """
    entities_dict = OrderedDict()
    # Last writer wins: an entity name appearing in multiple feature sets
    # keeps only its last occurrence in iteration order.
    for fs in self.list_feature_sets():
        for entity in fs.entities:
            entities_dict[entity.name] = entity
    return entities_dict

def get_batch_features(self, feature_ids: List[str],
                       entity_rows: Union[pd.DataFrame, str]) -> Job:
    """
    Retrieves historical features from a Feast Serving deployment.

    Args:
        feature_ids (List[str]):
            List of feature ids that will be returned for each entity.
            Each feature id should have the following format
            "feature_set_name:version:feature_name".
        entity_rows (Union[pd.DataFrame, str]):
            Pandas dataframe containing entities and a 'datetime' column.
            Each entity in a feature set must be present as a column in
            this dataframe. The datetime column must contain timestamps in
            datetime64 format.

    Returns:
        feast.job.Job:
            Returns a job object that can be used to monitor retrieval
            progress asynchronously, and can be used to materialize the
            results.

    Examples:
        >>> from feast import Client
        >>> from datetime import datetime
        >>>
        >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566")
        >>> feature_ids = ["customer:1:bookings_7d"]
        >>> entity_rows = pd.DataFrame(
        >>>     {
        >>>         "datetime": [pd.datetime.now() for _ in range(3)],
        >>>         "customer": [1001, 1002, 1003],
        >>>     }
        >>> )
        >>> feature_retrieval_job = feast_client.get_batch_features(feature_ids, entity_rows)
        >>> df = feature_retrieval_job.to_dataframe()
        >>> print(df)
    """
    self._connect_serving()

    fs_request = _build_feature_set_request(feature_ids)

    # Retrieve serving information to determine store type and
    # staging location
    serving_info = self._serving_service_stub.GetFeastServingInfo(
        GetFeastServingInfoRequest(),
        timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT
    )  # type: GetFeastServingInfoResponse

    # Batch retrieval is only available against a batch-type serving store.
    if serving_info.type != FeastServingType.FEAST_SERVING_TYPE_BATCH:
        raise Exception(
            f'You are connected to a store "{self._serving_url}" which '
            f"does not support batch retrieval ")

    if isinstance(entity_rows, pd.DataFrame):
        # Pandas DataFrame detected.
        # Validate entity rows based on entities in Feast Core.
        self._validate_dataframe_for_batch_retrieval(
            entity_rows=entity_rows,
            feature_sets_request=fs_request)

        # Remove timezone from datetime column so the staged Avro carries
        # naive timestamps.
        if isinstance(entity_rows["datetime"].dtype,
                      pd.core.dtypes.dtypes.DatetimeTZDtype):
            entity_rows["datetime"] = pd.DatetimeIndex(
                entity_rows["datetime"]).tz_localize(None)
    elif isinstance(entity_rows, str):
        # String based source: must be an .avro file or a wildcard path.
        if entity_rows.endswith((".avro", "*")):
            # Validate Avro entity rows based on entities in Feast Core.
            self._validate_avro_for_batch_retrieval(
                source=entity_rows,
                feature_sets_request=fs_request)
        else:
            raise Exception(
                f"Only .avro and wildcard paths are accepted as entity_rows")
    else:
        raise Exception(f"Only pandas.DataFrame and str types are allowed"
                        f" as entity_rows, but got {type(entity_rows)}.")

    # Export and upload entity row DataFrame to staging location
    # provided by Feast
    staged_files = export_source_to_staging_location(
        entity_rows, serving_info.job_staging_location)  # type: List[str]

    request = GetBatchFeaturesRequest(
        feature_sets=fs_request,
        dataset_source=DatasetSource(file_source=DatasetSource.FileSource(
            file_uris=staged_files,
            data_format=DataFormat.DATA_FORMAT_AVRO)),
    )

    # Retrieve Feast Job object to manage life cycle of retrieval
    response = self._serving_service_stub.GetBatchFeatures(request)
    return Job(response.job, self._serving_service_stub)

def _validate_dataframe_for_batch_retrieval(self, entity_rows: pd.DataFrame,
                                            feature_sets_request):
    """
    Validate whether the entity rows in a DataFrame contain the correct
    information for batch retrieval. A "datetime" column must be present
    in the DataFrame.

    Args:
        entity_rows (pd.DataFrame):
            Pandas DataFrame containing entities and datetime column.
            Each entity in a feature set must be present as a column in
            this DataFrame.
        feature_sets_request:
            Feature sets that will be requested.
    """
    self._validate_columns(columns=entity_rows.columns,
                           feature_sets_request=feature_sets_request,
                           datetime_field="datetime")

def _validate_avro_for_batch_retrieval(self, source: str,
                                       feature_sets_request):
    """
    Validate whether the entity rows in an Avro source file contain the
    correct information for batch retrieval. Only gs:// and local files
    (file://) uri schemes are allowed. The Avro file must have a column
    named "event_timestamp". No checks are done if a GCS path is provided.

    Args:
        source (str): File path to Avro.
        feature_sets_request: Feature sets that will be requested.
    """
    p = urlparse(source)
    if p.scheme == "gs":
        # GCS path provided (Risk is delegated to user)
        # No validation if GCS path is provided
        return
    elif p.scheme == "file" or not p.scheme:
        # Local file (file://) provided
        file_path = os.path.abspath(os.path.join(p.netloc, p.path))
    else:
        raise Exception(
            f"Unsupported uri scheme provided {p.scheme}, only "
            f"local files (file://), and gs:// schemes are "
            f"allowed")
    with open(file_path, "rb") as f:
        reader = fastavro.reader(f)
        # Column names come from the writer schema embedded in the file.
        schema = json.loads(reader.metadata["avro.schema"])
        columns = [x["name"] for x in schema["fields"]]
        self._validate_columns(columns=columns,
                               feature_sets_request=feature_sets_request,
                               datetime_field="event_timestamp")

def _validate_columns(self, columns: List[str], feature_sets_request,
                      datetime_field: str) -> None:
    """
    Check if the required columns contain the correct values for batch
    retrieval.

    Args:
        columns (List[str]):
            List of columns to validate against feature_sets_request.
        feature_sets_request:
            Feature sets that will be requested.
        datetime_field (str):
            Name of the datetime field that must be enforced and present
            as a column in the data source.

    Returns:
        None: None
    """
    # Ensure datetime column exists
    if datetime_field not in columns:
        raise ValueError(
            f'Entity rows does not contain "{datetime_field}" column in '
            f'columns {columns}')

    # Validate columns based on feature set entities: every entity of every
    # requested feature set must appear as a source column.
    for feature_set in feature_sets_request:
        fs = self.get_feature_set(name=feature_set.name,
                                  version=feature_set.version)
        if fs is None:
            raise ValueError(
                f'Feature set "{feature_set.name}:{feature_set.version}" '
                f"could not be found")
        for entity_type in fs.entities:
            if entity_type.name not in columns:
                raise ValueError(
                    f'Input does not contain entity'
                    f' "{entity_type.name}" column in columns "{columns}"')

def get_online_features(
        self,
        feature_ids: List[str],
        entity_rows: List[GetOnlineFeaturesRequest.EntityRow],
) -> GetOnlineFeaturesResponse:
    """
    Retrieves the latest online feature data from Feast Serving.

    Args:
        feature_ids:
            List of feature Ids in the following format
            [feature_set_name]:[version]:[feature_name]
            example: ["feature_set_1:6:my_feature_1",
                      "feature_set_1:6:my_feature_2"]
        entity_rows:
            List of GetFeaturesRequest.EntityRow where each row contains
            entities. Timestamp should not be set for online retrieval.

    Returns:
        Returns a list of maps where each item in the list contains the
        latest feature values for the provided entities
    """
    self._connect_serving()
    return self._serving_service_stub.GetOnlineFeatures(
        GetOnlineFeaturesRequest(
            feature_sets=_build_feature_set_request(feature_ids),
            entity_rows=entity_rows,
        ))  # type: GetOnlineFeaturesResponse

def ingest(self,
           feature_set: Union[str, FeatureSet],
           source: Union[pd.DataFrame, str],
           chunk_size: int = 10000,
           version: int = None,
           force_update: bool = False,
           max_workers: int = max(CPU_COUNT - 1, 1),
           disable_progress_bar: bool = False,
           timeout: int = KAFKA_CHUNK_PRODUCTION_TIMEOUT) -> None:
    """
    Loads feature data into Feast for a specific feature set.

    Args:
        feature_set (typing.Union[str, FeatureSet]):
            Feature set object or the string name of the feature set
            (without a version).
        source (typing.Union[pd.DataFrame, str]):
            Either a file path or Pandas Dataframe to ingest into Feast.
            Files that are currently supported:
                * parquet
                * csv
                * json
        chunk_size (int): Amount of rows to load and ingest at a time.
        version (int): Feature set version.
        force_update (bool):
            Automatically update feature set based on source data prior to
            ingesting. This will also register changes to Feast.
        max_workers (int): Number of worker processes to use to encode values.
        disable_progress_bar (bool): Disable printing of progress statistics.
        timeout (int): Timeout in seconds to wait for completion.

    Returns:
        None: None
    """
    if isinstance(feature_set, FeatureSet):
        name = feature_set.name
        if version is None:
            version = feature_set.version
    elif isinstance(feature_set, str):
        name = feature_set
    else:
        # NOTE(review): f-string without placeholders.
        raise Exception(f"Feature set name must be provided")

    # Read table and get row count
    tmp_table_name = _read_table_from_source(source, chunk_size, max_workers)
    pq_file = pq.ParquetFile(tmp_table_name)
    row_count = pq_file.metadata.num_rows

    # Update the feature set based on PyArrow table of first row group
    # NOTE(review): if feature_set was passed as a str, force_update=True
    # fails here (a str has no infer_fields_from_pa) -- confirm callers
    # always pass a FeatureSet object when forcing updates.
    if force_update:
        feature_set.infer_fields_from_pa(table=pq_file.read_row_group(0),
                                         discard_unused_fields=True,
                                         replace_existing_features=True)
        self.apply(feature_set)
    current_time = time.time()

    print("Waiting for feature set to be ready for ingestion...")
    while True:
        if timeout is not None and time.time() - current_time >= timeout:
            raise TimeoutError(
                "Timed out waiting for feature set to be ready")
        # Re-fetch until Core reports the feature set is ready.
        feature_set = self.get_feature_set(name, version)
        if (feature_set is not None and
                feature_set.status == FeatureSetStatus.STATUS_READY):
            break
        time.sleep(3)

    if timeout is not None:
        # Deduct the time already spent waiting from the remaining budget.
        timeout = timeout - int(time.time() - current_time)

    try:
        # Kafka configs
        brokers = feature_set.get_kafka_source_brokers()
        topic = feature_set.get_kafka_source_topic()
        producer = get_producer(brokers, row_count, disable_progress_bar)

        # Loop optimization declarations
        produce = producer.produce
        flush = producer.flush

        # Transform and push data to Kafka
        if feature_set.source.source_type == "Kafka":
            for chunk in get_feature_row_chunks(
                    file=tmp_table_name,
                    row_groups=list(range(pq_file.num_row_groups)),
                    fs=feature_set,
                    max_workers=max_workers):

                # Push FeatureRow one chunk at a time to kafka
                for serialized_row in chunk:
                    produce(topic=topic, value=serialized_row)

                # Force a flush after each chunk
                flush(timeout=timeout)

                # Remove chunk from memory
                del chunk

        else:
            raise Exception(
                f"Could not determine source type for feature set "
                f'"{feature_set.name}" with source type '
                f'"{feature_set.source.source_type}"')

        # Print ingestion statistics
        producer.print_results()
    finally:
        # Remove parquet file(s) that were created earlier
        print("Removing temporary file(s)...")
        os.remove(tmp_table_name)

    return None
class Client:
    """
    Feast Client: Used for creating, managing, and retrieving features.
    """

    def __init__(self, options: Optional[Dict[str, str]] = None, **kwargs):
        """
        The Feast Client should be initialized with at least one service url.
        Please see constants.py for configuration options. Commonly used
        options or arguments include:
            core_url: Feast Core URL. Used to manage features
            serving_url: Feast Serving URL. Used to retrieve features
            project: Sets the active project. This field is optional.
            core_secure: Use client-side SSL/TLS for Core gRPC API
            serving_secure: Use client-side SSL/TLS for Serving gRPC API
            enable_auth: Enable authentication and authorization
            auth_provider: Authentication provider - "google" or "oauth"
            if auth_provider is "oauth", the following fields are mandatory -
            oauth_grant_type, oauth_client_id, oauth_client_secret,
            oauth_audience, oauth_token_request_url

        Args:
            options: Configuration options to initialize client with
            **kwargs: Additional keyword arguments that will be used as
                configuration options along with "options"
        """
        if options is None:
            options = dict()
        # kwargs override entries in "options" with the same key.
        self._config = Config(options={**options, **kwargs})

        # Service stubs are created lazily by the _core_service /
        # _serving_service properties.
        self._core_service_stub: Optional[CoreServiceStub] = None
        self._serving_service_stub: Optional[ServingServiceStub] = None
        self._auth_metadata: Optional[grpc.AuthMetadataPlugin] = None

        # Configure Auth Metadata Plugin if auth is enabled
        if self._config.getboolean(opt.ENABLE_AUTH):
            self._auth_metadata = feast_auth.get_auth_metadata_plugin(
                self._config)

        self._configure_telemetry()

    @property
    def config(self) -> Config:
        return self._config

    @property
    def _core_service(self):
        """
        Creates or returns the gRPC Feast Core Service Stub.

        Returns: CoreServiceStub
        """
        if not self._core_service_stub:
            channel = create_grpc_channel(
                url=self._config.get(opt.CORE_URL),
                enable_ssl=self._config.getboolean(opt.CORE_ENABLE_SSL),
                enable_auth=self._config.getboolean(opt.ENABLE_AUTH),
                ssl_server_cert_path=self._config.get(
                    opt.CORE_SERVER_SSL_CERT),
                auth_metadata_plugin=self._auth_metadata,
                timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT),
            )
            self._core_service_stub = CoreServiceStub(channel)
        return self._core_service_stub

    @property
    def _serving_service(self):
        """
        Creates or returns the gRPC Feast Serving Service Stub. If both
        `opentracing` and `grpcio-opentracing` are installed, an opentracing
        interceptor will be instantiated based on the global tracer.

        Returns: ServingServiceStub
        """
        if not self._serving_service_stub:
            channel = create_grpc_channel(
                url=self._config.get(opt.SERVING_URL),
                enable_ssl=self._config.getboolean(opt.SERVING_ENABLE_SSL),
                enable_auth=self._config.getboolean(opt.ENABLE_AUTH),
                ssl_server_cert_path=self._config.get(
                    opt.SERVING_SERVER_SSL_CERT),
                auth_metadata_plugin=self._auth_metadata,
                timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT),
            )
            try:
                # Tracing is strictly optional; absence of the packages is
                # not an error.
                import opentracing
                from grpc_opentracing import open_tracing_client_interceptor
                from grpc_opentracing.grpcext import intercept_channel

                interceptor = open_tracing_client_interceptor(
                    opentracing.global_tracer())
                channel = intercept_channel(channel, interceptor)
            except ImportError:
                pass
            self._serving_service_stub = ServingServiceStub(channel)
        return self._serving_service_stub

    def _extra_grpc_params(self) -> Dict[str, Any]:
        """Common keyword arguments attached to ad-hoc gRPC calls."""
        return dict(
            timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT),
            metadata=self._get_grpc_metadata(),
        )

    @property
    def core_url(self) -> str:
        """
        Retrieve Feast Core URL.

        Returns:
            Feast Core URL string
        """
        return self._config.get(opt.CORE_URL)

    @core_url.setter
    def core_url(self, value: str):
        """
        Set the Feast Core URL.

        Args:
            value: Feast Core URL
        """
        self._config.set(opt.CORE_URL, value)

    @property
    def serving_url(self) -> str:
        """
        Retrieve Feast Serving URL.

        Returns:
            Feast Serving URL string
        """
        return self._config.get(opt.SERVING_URL)

    @serving_url.setter
    def serving_url(self, value: str):
        """
        Set the Feast Serving URL.

        Args:
            value: Feast Serving URL
        """
        self._config.set(opt.SERVING_URL, value)

    @property
    def job_service_url(self) -> str:
        """
        Retrieve Feast Job Service URL.

        Returns:
            Feast Job Service URL string
        """
        return self._config.get(opt.JOB_SERVICE_URL)

    @job_service_url.setter
    def job_service_url(self, value: str):
        """
        Set the Feast Job Service URL.

        Args:
            value: Feast Job Service URL
        """
        self._config.set(opt.JOB_SERVICE_URL, value)

    @property
    def core_secure(self) -> bool:
        """
        Retrieve Feast Core client-side SSL/TLS setting.

        Returns:
            Whether client-side SSL/TLS is enabled
        """
        return self._config.getboolean(opt.CORE_ENABLE_SSL)

    @core_secure.setter
    def core_secure(self, value: bool):
        """
        Set the Feast Core client-side SSL/TLS setting.

        Args:
            value: True to enable client-side SSL/TLS
        """
        self._config.set(opt.CORE_ENABLE_SSL, value)

    @property
    def serving_secure(self) -> bool:
        """
        Retrieve Feast Serving client-side SSL/TLS setting.

        Returns:
            Whether client-side SSL/TLS is enabled
        """
        return self._config.getboolean(opt.SERVING_ENABLE_SSL)

    @serving_secure.setter
    def serving_secure(self, value: bool):
        """
        Set the Feast Serving client-side SSL/TLS setting.

        Args:
            value: True to enable client-side SSL/TLS
        """
        self._config.set(opt.SERVING_ENABLE_SSL, value)

    @property
    def job_service_secure(self) -> bool:
        """
        Retrieve Feast Job Service client-side SSL/TLS setting.

        Returns:
            Whether client-side SSL/TLS is enabled
        """
        return self._config.getboolean(opt.JOB_SERVICE_ENABLE_SSL)

    @job_service_secure.setter
    def job_service_secure(self, value: bool):
        """
        Set the Feast Job Service client-side SSL/TLS setting.

        Args:
            value: True to enable client-side SSL/TLS
        """
        self._config.set(opt.JOB_SERVICE_ENABLE_SSL, value)

    def version(self, sdk_only=False):
        """
        Returns version information from Feast Core and Feast Serving.
        """
        import pkg_resources

        try:
            sdk_version = pkg_resources.get_distribution("feast").version
        except pkg_resources.DistributionNotFound:
            sdk_version = "local build"
        if sdk_only:
            return sdk_version

        result = {
            "sdk": {"version": sdk_version},
            "serving": "not configured",
            "core": "not configured",
        }

        if self.serving_url:
            serving_version = self._serving_service.GetFeastServingInfo(
                GetFeastServingInfoRequest(),
                timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT),
                metadata=self._get_grpc_metadata(),
            ).version
            result["serving"] = {
                "url": self.serving_url,
                "version": serving_version
            }

        if self.core_url:
            core_version = self._core_service.GetFeastCoreVersion(
                GetFeastCoreVersionRequest(),
                timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT),
                metadata=self._get_grpc_metadata(),
            ).version
            result["core"] = {"url": self.core_url, "version": core_version}

        return result

    def _configure_telemetry(self):
        """Enable or disable anonymized usage reporting based on config."""
        telemetry_filepath = join(expanduser("~"), ".feast", "telemetry")
        self._telemetry_enabled = (
            self._config.get(opt.TELEMETRY, "True") == "True"
        )  # written this way to turn the env var string into a boolean
        if self._telemetry_enabled:
            self._telemetry_counter = {"get_online_features": 0}
            if os.path.exists(telemetry_filepath):
                with open(telemetry_filepath, "r") as f:
                    self._telemetry_id = f.read()
            else:
                # First run: mint a persistent anonymous id and notify the
                # user once.
                self._telemetry_id = str(uuid.uuid4())
                print(
                    "Feast is an open source project that collects anonymized usage statistics. To opt out or learn more see https://docs.feast.dev/v/master/advanced/telemetry"
                )
                with open(telemetry_filepath, "w") as f:
                    f.write(self._telemetry_id)
        else:
            if os.path.exists(telemetry_filepath):
                os.remove(telemetry_filepath)

    @property
    def project(self) -> str:
        """
        Retrieve currently active project.

        Returns:
            Project name
        """
        if not self._config.get(opt.PROJECT):
            raise ValueError("No project has been configured.")
        return self._config.get(opt.PROJECT)

    def set_project(self, project: Optional[str] = None):
        """
        Set currently active Feast project.

        Args:
            project: Project to set as active. If unset, will reset to the
                default project.
        """
        if project is None:
            project = opt().PROJECT
        self._config.set(opt.PROJECT, project)

    def list_projects(self) -> List[str]:
        """
        List all active Feast projects.

        Returns:
            List of project names
        """
        response = self._core_service.ListProjects(
            ListProjectsRequest(),
            timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT),
            metadata=self._get_grpc_metadata(),
        )  # type: ListProjectsResponse
        return list(response.projects)

    def create_project(self, project: str):
        """
        Creates a Feast project.

        Args:
            project: Name of project
        """
        self._core_service.CreateProject(
            CreateProjectRequest(name=project),
            timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT),
            metadata=self._get_grpc_metadata(),
        )  # type: CreateProjectResponse

    def archive_project(self, project):
        """
        Archives a project. Project will still continue to function for
        ingestion and retrieval, but will be in a read-only state. It will
        also not be visible from the Core API for management purposes.

        Args:
            project: Name of project to archive
        """
        try:
            # BUGFIX: go through the lazy _core_service property (like every
            # other method) instead of the raw _core_service_stub attribute,
            # which is None until the channel has been created.
            self._core_service.ArchiveProject(
                ArchiveProjectRequest(name=project),
                timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT),
                metadata=self._get_grpc_metadata(),
            )  # type: ArchiveProjectResponse
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())

        # Revert to the default project if the archived one was active.
        # BUGFIX: the active project is stored in self._config (see the
        # `project` property); this class has no `_project` attribute.
        if self._config.get(opt.PROJECT) == project:
            self._config.set(opt.PROJECT, opt().PROJECT)

    def apply(
        self,
        objects: Union[List[Union[Entity, FeatureTable]], Entity,
                       FeatureTable],
        project: str = None,
    ):
        """
        Idempotently registers entities and feature tables with Feast Core.
        Either a single entity or feature table or a list can be provided.

        Args:
            objects: List of entities and/or feature tables that will be
                registered

        Examples:
            >>> from feast import Client
            >>> from feast.entity import Entity
            >>> from feast.value_type import ValueType
            >>>
            >>> feast_client = Client(core_url="localhost:6565")
            >>> entity = Entity(
            >>>     name="driver_entity",
            >>>     description="Driver entity for car rides",
            >>>     value_type=ValueType.STRING,
            >>>     labels={
            >>>         "key": "val"
            >>>     }
            >>> )
            >>> feast_client.apply(entity)
        """
        if self._telemetry_enabled:
            log_usage(
                "apply",
                self._telemetry_id,
                datetime.utcnow(),
                self.version(sdk_only=True),
            )
        if project is None:
            project = self.project
        if not isinstance(objects, list):
            objects = [objects]
        for obj in objects:
            if isinstance(obj, Entity):
                self._apply_entity(project, obj)  # type: ignore
            elif isinstance(obj, FeatureTable):
                self._apply_feature_table(project, obj)  # type: ignore
            else:
                raise ValueError(
                    f"Could not determine object type to apply {obj} with type {type(obj)}. Type must be Entity or FeatureTable."
                )

    def apply_entity(self,
                     entities: Union[List[Entity], Entity],
                     project: str = None):
        """
        Deprecated. Please see apply().
        """
        warnings.warn(
            "The method apply_entity() is being deprecated. Please use apply() instead. Feast 0.10 and onwards will not support apply_entity().",
            DeprecationWarning,
        )
        if project is None:
            project = self.project
        if not isinstance(entities, list):
            entities = [entities]
        for entity in entities:
            if isinstance(entity, Entity):
                self._apply_entity(project, entity)  # type: ignore
                continue
            raise ValueError(
                f"Could not determine entity type to apply {entity}")

    def _apply_entity(self, project: str, entity: Entity):
        """
        Registers a single entity with Feast.

        Args:
            entity: Entity that will be registered
        """
        entity.is_valid()
        entity_proto = entity.to_spec_proto()

        # Convert the entity to a request and send to Feast Core
        try:
            apply_entity_response = self._core_service.ApplyEntity(
                ApplyEntityRequest(project=project,
                                   spec=entity_proto),  # type: ignore
                timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT),
                metadata=self._get_grpc_metadata(),
            )  # type: ApplyEntityResponse
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())

        # Extract the returned entity
        applied_entity = Entity.from_proto(apply_entity_response.entity)

        # Deep copy from the returned entity to the local entity
        entity._update_from_entity(applied_entity)

    def list_entities(self,
                      project: str = None,
                      labels: Dict[str, str] = dict()) -> List[Entity]:
        """
        Retrieve a list of entities from Feast Core.

        Args:
            project: Filter entities based on project name
            labels: User-defined labels that these entities are associated with

        Returns:
            List of entities
        """
        if project is None:
            project = self.project
        filter = ListEntitiesRequest.Filter(project=project, labels=labels)

        # Get latest entities from Feast Core
        entity_protos = self._core_service.ListEntities(
            ListEntitiesRequest(filter=filter),
            metadata=self._get_grpc_metadata(),
        )  # type: ListEntitiesResponse

        # Extract entities and return
        entities = []
        for entity_proto in entity_protos.entities:
            entity = Entity.from_proto(entity_proto)
            entity._client = self
            entities.append(entity)
        return entities

    def get_entity(self, name: str, project: str = None) -> Entity:
        """
        Retrieves an entity.

        Args:
            project: Feast project that this entity belongs to
            name: Name of entity

        Returns:
            Returns either the specified entity, or raises an exception if
            none is found
        """
        if self._telemetry_enabled:
            log_usage(
                "get_entity",
                self._telemetry_id,
                datetime.utcnow(),
                self.version(sdk_only=True),
            )
        if project is None:
            project = self.project
        try:
            get_entity_response = self._core_service.GetEntity(
                GetEntityRequest(project=project, name=name.strip()),
                metadata=self._get_grpc_metadata(),
            )  # type: GetEntityResponse
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())
        entity = Entity.from_proto(get_entity_response.entity)

        return entity

    def apply_feature_table(
        self,
        feature_tables: Union[List[FeatureTable], FeatureTable],
        project: str = None,
    ):
        """
        Deprecated. Please see apply().
        """
        warnings.warn(
            "The method apply_feature_table() is being deprecated. Please use apply() instead. Feast 0.10 and onwards will not support apply_feature_table().",
            DeprecationWarning,
        )
        if project is None:
            project = self.project
        if not isinstance(feature_tables, list):
            feature_tables = [feature_tables]
        for feature_table in feature_tables:
            if isinstance(feature_table, FeatureTable):
                self._apply_feature_table(project,
                                          feature_table)  # type: ignore
                continue
            raise ValueError(
                f"Could not determine feature table type to apply {feature_table}"
            )

    def _apply_feature_table(self, project: str, feature_table: FeatureTable):
        """
        Registers a single feature table with Feast.

        Args:
            feature_table: Feature table that will be registered
        """
        feature_table.is_valid()
        feature_table_proto = feature_table.to_spec_proto()

        # Convert the feature table to a request and send to Feast Core
        try:
            apply_feature_table_response = self._core_service.ApplyFeatureTable(
                ApplyFeatureTableRequest(
                    project=project,
                    table_spec=feature_table_proto),  # type: ignore
                timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT),
                metadata=self._get_grpc_metadata(),
            )  # type: ApplyFeatureTableResponse
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())

        # Extract the returned feature table
        applied_feature_table = FeatureTable.from_proto(
            apply_feature_table_response.table)

        # Deep copy from the returned feature table to the local entity
        feature_table._update_from_feature_table(applied_feature_table)

    def list_feature_tables(
        self,
        project: str = None,
        labels: Dict[str, str] = dict()
    ) -> List[FeatureTable]:
        """
        Retrieve a list of feature tables from Feast Core.

        Args:
            project: Filter feature tables based on project name
            labels: User-defined labels that these feature tables are
                associated with

        Returns:
            List of feature tables
        """
        if project is None:
            project = self.project
        filter = ListFeatureTablesRequest.Filter(project=project,
                                                 labels=labels)

        # Get latest feature tables from Feast Core
        feature_table_protos = self._core_service.ListFeatureTables(
            ListFeatureTablesRequest(filter=filter),
            metadata=self._get_grpc_metadata(),
        )  # type: ListFeatureTablesResponse

        # Extract feature tables and return
        feature_tables = []
        for feature_table_proto in feature_table_protos.tables:
            feature_table = FeatureTable.from_proto(feature_table_proto)
            feature_table._client = self
            feature_tables.append(feature_table)
        return feature_tables

    def get_feature_table(self, name: str,
                          project: str = None) -> FeatureTable:
        """
        Retrieves a feature table.

        Args:
            project: Feast project that this feature table belongs to
            name: Name of feature table

        Returns:
            Returns either the specified feature table, or raises an
            exception if none is found
        """
        if self._telemetry_enabled:
            log_usage(
                "get_feature_table",
                self._telemetry_id,
                datetime.utcnow(),
                self.version(sdk_only=True),
            )
        if project is None:
            project = self.project
        try:
            get_feature_table_response = self._core_service.GetFeatureTable(
                GetFeatureTableRequest(project=project, name=name.strip()),
                metadata=self._get_grpc_metadata(),
            )  # type: GetFeatureTableResponse
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())
        return FeatureTable.from_proto(get_feature_table_response.table)

    def delete_feature_table(self, name: str, project: str = None) -> None:
        """
        Deletes a feature table.

        Args:
            project: Feast project that this feature table belongs to
            name: Name of feature table
        """
        if project is None:
            project = self.project
        try:
            self._core_service.DeleteFeatureTable(
                DeleteFeatureTableRequest(project=project, name=name.strip()),
                metadata=self._get_grpc_metadata(),
            )
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())

    def list_features_by_ref(
        self,
        project: str = None,
        entities: List[str] = list(),
        labels: Dict[str, str] = dict(),
    ) -> Dict[FeatureRef, Feature]:
        """
        Retrieve a dictionary of feature reference to feature from Feast Core
        based on filters provided.

        Args:
            project: Feast project that these features belongs to
            entities: Feast entity that these features are associated with
            labels: Feast labels that these features are associated with

        Returns:
            Dictionary of <feature references: features>

        Examples:
            >>> from feast import Client
            >>>
            >>> feast_client = Client(core_url="localhost:6565")
            >>> features = feast_client.list_features_by_ref(project="test_project", entities=["driver_id"], labels={"key1":"val1","key2":"val2"})
            >>> print(features)
        """
        if project is None:
            project = self.project
        filter = ListFeaturesRequest.Filter(project=project,
                                            entities=entities,
                                            labels=labels)

        feature_protos = self._core_service.ListFeatures(
            ListFeaturesRequest(filter=filter),
            metadata=self._get_grpc_metadata(),
        )  # type: ListFeaturesResponse

        # Extract features and return
        features_dict = {}
        for ref_str, feature_proto in feature_protos.features.items():
            feature_ref = FeatureRef.from_str(ref_str)
            feature = Feature.from_proto(feature_proto)
            features_dict[feature_ref] = feature
        return features_dict

    def ingest(
        self,
        feature_table: Union[str, FeatureTable],
        source: Union[pd.DataFrame, str],
        project: str = None,
        chunk_size: int = 10000,
        max_workers: int = max(CPU_COUNT - 1, 1),
        timeout: int = int(opt().BATCH_INGESTION_PRODUCTION_TIMEOUT),
    ) -> None:
        """
        Batch load feature data into a FeatureTable.

        Args:
            feature_table (typing.Union[str, feast.feature_table.FeatureTable]):
                FeatureTable object or the string name of the feature table
            source (typing.Union[pd.DataFrame, str]):
                Either a file path or Pandas Dataframe to ingest into Feast.
                Files that are currently supported:
                    * parquet
                    * csv
                    * json
            project: Feast project to locate FeatureTable
            chunk_size (int): Amount of rows to load and ingest at a time.
            max_workers (int): Number of worker processes to use to encode
                values.
            timeout (int): Timeout in seconds to wait for completion.

        Examples:
            >>> from feast import Client
            >>>
            >>> client = Client(core_url="localhost:6565")
            >>> ft_df = pd.DataFrame(
            >>>     {
            >>>         "datetime": [pd.datetime.now()],
            >>>         "driver": [1001],
            >>>         "rating": [4.3],
            >>>     }
            >>> )
            >>> client.set_project("project1")
            >>>
            >>> driver_ft = client.get_feature_table("driver")
            >>> client.ingest(driver_ft, ft_df)
        """
        if self._telemetry_enabled:
            log_usage(
                "ingest",
                self._telemetry_id,
                datetime.utcnow(),
                self.version(sdk_only=True),
            )
        if project is None:
            project = self.project
        if isinstance(feature_table, str):
            name = feature_table
        elif isinstance(feature_table, FeatureTable):
            name = feature_table.name
        else:
            # BUGFIX: previously any other type fell through and raised
            # NameError on the unbound `name`; raise a clear error instead.
            raise ValueError(
                f"feature_table must be a str or FeatureTable, got {type(feature_table)}"
            )

        # Always work with the server-side definition of the table.
        fetched_feature_table: Optional[FeatureTable] = self.get_feature_table(
            name, project)
        if fetched_feature_table is not None:
            feature_table = fetched_feature_table
        else:
            raise Exception(f"FeatureTable, {name} cannot be found.")

        # Check 1) Only parquet file format for FeatureTable batch source is supported
        # BUGFIX: the original condition raised for the *supported* case and
        # tested isinstance() against type(...), which can never be a
        # ParquetFormat instance (so the check was dead). Raise only when a
        # FileSource batch source does NOT use parquet; BigQuery sources are
        # handled below and remain unaffected.
        if (feature_table.batch_source
                and issubclass(type(feature_table.batch_source), FileSource)
                and not isinstance(
                    feature_table.batch_source.file_options.file_format,
                    ParquetFormat)):
            raise Exception(
                f"No suitable batch source found for FeatureTable, {name}."
                f"Only BATCH_FILE source with parquet format is supported for batch ingestion."
            )

        pyarrow_table, column_names = _read_table_from_source(source)
        # Check 2) Check if FeatureTable batch source field mappings can be found in provided source table
        _check_field_mappings(
            column_names,
            name,
            feature_table.batch_source.event_timestamp_column,
            feature_table.batch_source.field_mapping,
        )

        dir_path = None
        with_partitions = False
        if (issubclass(type(feature_table.batch_source), FileSource)
                and feature_table.batch_source.date_partition_column):
            with_partitions = True
            dest_path = _write_partitioned_table_from_source(
                column_names,
                pyarrow_table,
                feature_table.batch_source.date_partition_column,
                feature_table.batch_source.event_timestamp_column,
            )
        else:
            dir_path, dest_path = _write_non_partitioned_table_from_source(
                column_names,
                pyarrow_table,
                chunk_size,
                max_workers,
            )

        try:
            if issubclass(type(feature_table.batch_source), FileSource):
                file_url = feature_table.batch_source.file_options.file_url.rstrip(
                    "*")
                _upload_to_file_source(file_url, with_partitions, dest_path,
                                       self._config)
            if issubclass(type(feature_table.batch_source), BigQuerySource):
                bq_table_ref = feature_table.batch_source.bigquery_options.table_ref
                feature_table_timestamp_column = (
                    feature_table.batch_source.event_timestamp_column)
                _upload_to_bq_source(bq_table_ref,
                                     feature_table_timestamp_column,
                                     dest_path)
        finally:
            # Remove parquet file(s) that were created earlier
            print("Removing temporary file(s)...")
            if dir_path:
                shutil.rmtree(dir_path)

        print(
            "Data has been successfully ingested into FeatureTable batch source."
        )

    def _get_grpc_metadata(self):
        """
        Returns a metadata tuple to attach to gRPC requests. This is primarily
        used when authentication is enabled but SSL/TLS is disabled.

        Returns:
            Tuple of metadata to attach to each gRPC call
        """
        if self._config.getboolean(opt.ENABLE_AUTH) and self._auth_metadata:
            return self._auth_metadata.get_signed_meta()
        return ()

    def get_online_features(
        self,
        feature_refs: List[str],
        entity_rows: List[Dict[str, Any]],
        project: Optional[str] = None,
    ) -> OnlineResponse:
        """
        Retrieves the latest online feature data from Feast Serving.

        Args:
            feature_refs: List of feature references that will be returned
                for each entity. Each feature reference should have the
                following format: "feature_table:feature" where
                "feature_table" & "feature" refer to the feature and feature
                table names respectively. Only the feature name is required.
            entity_rows: A list of dictionaries where each key-value is an
                entity-name, entity-value pair.
            project: Optionally specify the project override. If specified,
                uses given project for retrieval. Overrides the projects
                specified in Feature References if also are specified.

        Returns:
            GetOnlineFeaturesResponse containing the feature data in records.
            Each EntityRow provided will yield one record, which contains
            data fields with data value and field status metadata (if
            included).

        Examples:
            >>> from feast import Client
            >>>
            >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566")
            >>> feature_refs = ["sales:daily_transactions"]
            >>> entity_rows = [{"customer_id": 0},{"customer_id": 1}]
            >>>
            >>> online_response = feast_client.get_online_features(
            >>>     feature_refs, entity_rows, project="my_project")
            >>> online_response_dict = online_response.to_dict()
            >>> print(online_response_dict)
            {'sales:daily_transactions': [1.1,1.2], 'sales:customer_id': [0,1]}
        """
        if self._telemetry_enabled:
            # Sample telemetry: log only every 1000th call.
            if self._telemetry_counter["get_online_features"] % 1000 == 0:
                log_usage(
                    "get_online_features",
                    self._telemetry_id,
                    datetime.utcnow(),
                    self.version(sdk_only=True),
                )
            self._telemetry_counter["get_online_features"] += 1
        try:
            response = self._serving_service.GetOnlineFeaturesV2(
                GetOnlineFeaturesRequestV2(
                    features=_build_feature_references(
                        feature_ref_strs=feature_refs),
                    entity_rows=_infer_online_entity_rows(entity_rows),
                    project=project if project is not None else self.project,
                ),
                timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT),
                metadata=self._get_grpc_metadata(),
            )
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())
        response = OnlineResponse(response)
        return response
class Client:
    """Feast client connected to a Feast Core (and optionally Serving)
    deployment.

    Provides resource management (features, entities, feature groups,
    storage), import-job submission, training-dataset creation and online
    feature retrieval. gRPC connections are established lazily on first use.
    """

    def __init__(self, core_url=None, serving_url=None, verbose=False):
        """Create an instance of Feast client which is connected to feast
        endpoint specified in the parameter. If no url is provided, the
        client will default to the url specified in the environment variable
        FEAST_CORE_URL.

        Args:
            core_url (str, optional): feast's grpc endpoint URL
                (e.g.: "my.feast.com:8433")
            serving_url (str, optional): feast serving's grpc endpoint URL
                (e.g.: "my.feast.com:8433")
            verbose (bool, optional): enable verbose logging
        """
        if core_url is None:
            core_url = os.getenv(FEAST_CORE_URL_ENV_KEY)
        self._core_url = core_url

        if serving_url is None:
            serving_url = os.getenv(FEAST_SERVING_URL_ENV_KEY)
        self._serving_url = serving_url

        # Channels and stubs are created lazily by _connect_core /
        # _connect_serving.
        self.__core_channel = None
        self.__serving_channel = None
        self._core_service_stub = None
        self._job_service_stub = None
        self._dataset_service_stub = None
        self._serving_service_stub = None

        self._verbose = verbose
        self._table_downloader = TableDownloader()

    @property
    def core_url(self):
        """Core API URL; falls back to the FEAST_CORE_URL env var.

        Raises:
            ValueError: if no URL was provided and the env var is unset.
        """
        if self._core_url is None:
            self._core_url = os.getenv(FEAST_CORE_URL_ENV_KEY)
            if self._core_url is None:
                raise ValueError(
                    "Core API URL not set. Either set the " +
                    "environment variable {} or set it explicitly.".format(
                        FEAST_CORE_URL_ENV_KEY))
        return self._core_url

    @core_url.setter
    def core_url(self, value):
        self._core_url = value

    @property
    def serving_url(self):
        """Serving API URL; falls back to the FEAST_SERVING_URL env var.

        Raises:
            ValueError: if no URL was provided and the env var is unset.
        """
        if self._serving_url is None:
            self._serving_url = os.getenv(FEAST_SERVING_URL_ENV_KEY)
            if self._serving_url is None:
                raise ValueError(
                    "Serving API URL not set. Either set the " +
                    "environment variable {} or set it explicitly.".format(
                        FEAST_SERVING_URL_ENV_KEY))
        return self._serving_url

    @serving_url.setter
    def serving_url(self, value):
        self._serving_url = value

    @property
    def verbose(self):
        """Whether verbose (print-based) logging is enabled."""
        return self._verbose

    @verbose.setter
    def verbose(self, val):
        if not isinstance(val, bool):
            raise TypeError("verbose should be a boolean value")
        self._verbose = val

    def apply(self, obj):
        """Create or update one or many feast's resource
        (feature, entity, importer, storage).

        Args:
            obj (object): one or many feast's resource

        Returns:
            the applied resource id, or a list of ids when a list was given
        """
        if isinstance(obj, list):
            # Apply each resource in order, preserving the input ordering.
            return [self._apply(resource) for resource in obj]
        return self._apply(obj)

    def run(self,
            importer,
            name_override=None,
            apply_entity=False,
            apply_features=False):
        """
        Run an import job

        Args:
            importer (feast.sdk.importer.Importer): importer instance
            name_override (str, optional): Job name override
            apply_entity (bool, optional): (default: False) create/update
                entity inside importer
            apply_features (bool, optional): (default: False) create/update
                features inside importer

        Returns:
            (str) job ID of the import job
        """
        request = JobServiceTypes.SubmitImportJobRequest(
            importSpec=importer.spec)
        if name_override is not None:
            request.name = name_override

        if apply_entity:
            self._apply_entity(importer.entity)
        if apply_features:
            for feature in importer.features:
                self._apply_feature(feature)

        # Stage source files remotely before submission when required by
        # the importer.
        if importer.require_staging:
            print("Staging file to remote path {}".format(
                importer.remote_path))
            importer.stage()
        print("Submitting job with spec:\n {}".format(
            spec_to_yaml(importer.spec)))
        self._connect_core()
        response = self._job_service_stub.SubmitJob(request)
        print("Submitted job with id: {}".format(response.jobId))
        return response.jobId

    def create_dataset(self,
                       feature_set,
                       start_date,
                       end_date,
                       limit=None,
                       name_prefix=None):
        """
        Create training dataset for a feature set. The training dataset
        will be bounded by event timestamp between start_date and end_date.
        Specify limit to limit number of row returned. The training dataset
        will reside in a bigquery table specified by destination.

        Args:
            feature_set (feast.sdk.resources.feature_set.FeatureSet):
                feature set representing the data wanted
            start_date (str): starting date of the training data in ISO 8601
                format (e.g.: "2018-12-31")
            end_date (str): end date of training data in ISO 8601 format
                (e.g.: "2018-12-31")
            limit (int, optional): (default: None) maximum number of row
                returned
            name_prefix (str, optional): (default: None) name prefix.

        Returns:
            feast.resources.feature_set.DatasetInfo: DatasetInfo containing
            the information of training dataset
        """
        self._check_create_dataset_args(feature_set, start_date, end_date,
                                        limit)

        req = DatasetServiceTypes.CreateDatasetRequest(
            featureSet=feature_set.proto,
            startDate=_timestamp_from_datetime(_parse_date(start_date)),
            endDate=_timestamp_from_datetime(_parse_date(end_date)),
            limit=limit,
            namePrefix=name_prefix)
        if self.verbose:
            print("creating training dataset for features: " +
                  str(feature_set.features))
        self._connect_core()
        resp = self._dataset_service_stub.CreateDataset(req)

        if self.verbose:
            print("created dataset {}: {}".format(resp.datasetInfo.name,
                                                  resp.datasetInfo.tableUrl))
        return DatasetInfo(resp.datasetInfo.name, resp.datasetInfo.tableUrl)

    def get_serving_data(self,
                         feature_set,
                         entity_keys,
                         request_type=ServingRequestType.LAST,
                         ts_range=None,
                         limit=10):
        """Get data from the feast serving layer. You can either retrieve the
        the latest value, or a list of the latest values, up to a provided
        limit.

        If server_url is not provided, the value stored in the environment
        variable FEAST_SERVING_URL is used to connect to the serving server
        instead.

        Args:
            feature_set (feast.sdk.resources.feature_set.FeatureSet):
                feature set representing the data wanted
            entity_keys (:obj: `list` of :obj: `str): list of entity keys
            request_type (feast.sdk.utils.types.ServingRequestType):
                (default: feast.sdk.utils.types.ServingRequestType.LAST) type
                of request: one of [LIST, LAST]
            ts_range (:obj: `list` of str, optional): size 2 list of start
                timestamp and end timestamp, in ISO 8601 format. Only
                required if request_type is set to LIST. Defaults to no
                timestamp range.
            limit (int, optional): (default: 10) number of values to get.
                Only required if request_type is set to LIST

        Returns:
            pandas.DataFrame: DataFrame of results
        """
        # Fix: the default used to be the mutable literal [] which is shared
        # across all calls; use None as the sentinel instead (PEP 8 / common
        # Python pitfall). Behavior for callers is unchanged.
        if ts_range is None:
            ts_range = []
        ts_range = [
            _timestamp_from_datetime(dateutil.parser.parse(dt))
            for dt in ts_range
        ]
        request = self._build_serving_request(feature_set, entity_keys,
                                              request_type, ts_range, limit)
        self._connect_serving()
        return self._response_to_df(
            feature_set, self._serving_service_stub.QueryFeatures(request))

    def download_dataset(self,
                         dataset_info,
                         dest,
                         staging_location,
                         file_type=FileType.CSV):
        """
        Download training dataset as file

        Args:
            dataset_info (feast.sdk.resources.feature_set.DatasetInfo):
                dataset_info to be downloaded
            dest (str): destination's file path
            staging_location (str): url to staging_location (currently
                support a folder in GCS)
            file_type (feast.sdk.resources.feature_set.FileType): (default:
                FileType.CSV) exported file format

        Returns:
            str: path to the downloaded file
        """
        return self._table_downloader.download_table_as_file(
            dataset_info.table_id, dest, staging_location, file_type)

    def download_dataset_to_df(self, dataset_info, staging_location):
        """
        Download training dataset as Pandas Dataframe

        Args:
            dataset_info (feast.sdk.resources.feature_set.DatasetInfo):
                dataset_info to be downloaded
            staging_location: url to staging_location (currently support a
                folder in GCS)

        Returns:
            pandas.DataFrame: dataframe of the training dataset
        """
        return self._table_downloader.download_table_as_df(
            dataset_info.table_id, staging_location)

    def close(self):
        """
        Close underlying connection to Feast's core and serving end points.

        Safe to call on a client that never connected: channels that were
        never opened are simply skipped.
        """
        # Fix: the previous implementation called .close() unconditionally,
        # raising AttributeError when a channel was still None (i.e. when
        # close() was called before any RPC was made).
        if self.__core_channel is not None:
            self.__core_channel.close()
            self.__core_channel = None
        if self.__serving_channel is not None:
            self.__serving_channel.close()
            self.__serving_channel = None

    def _connect_core(self):
        """Lazily connect to core api and build all core-side stubs."""
        if self.__core_channel is None:
            self.__core_channel = grpc.insecure_channel(self.core_url)
            self._core_service_stub = CoreServiceStub(self.__core_channel)
            self._job_service_stub = JobServiceStub(self.__core_channel)
            self._dataset_service_stub = DatasetServiceStub(
                self.__core_channel)

    def _connect_serving(self):
        """Lazily connect to serving api."""
        if self.__serving_channel is None:
            self.__serving_channel = grpc.insecure_channel(self.serving_url)
            self._serving_service_stub = ServingAPIStub(self.__serving_channel)

    def _build_serving_request(self, feature_set, entity_keys, request_type,
                               ts_range, limit):
        """Helper function to build serving service request."""
        request = QueryFeatures.Request(entityName=feature_set.entity,
                                        entityId=entity_keys)
        features = [
            RequestDetail(featureId=feat_id, type=request_type.value)
            for feat_id in feature_set.features
        ]

        # Timestamp range and per-feature limit only apply to LIST requests.
        if request_type == ServingRequestType.LIST:
            ts_range = TimestampRange(start=ts_range[0], end=ts_range[1])
            request.timestampRange.CopyFrom(ts_range)
            for feature in features:
                feature.limit = limit
        request.requestDetails.extend(features)
        return request

    def _response_to_df(self, feature_set, response):
        """Convert a QueryFeatures response into a flat pandas DataFrame.

        Builds one DataFrame per feature per entity, outer-merges the
        feature frames of each entity, then concatenates all entities.
        """
        entity_tables = []
        for entity_key in response.entities:
            feature_tables = []
            features = response.entities[entity_key].features
            for feature_name in features:
                rows = []
                v_list = features[feature_name].valueList
                # valueList is a oneof; resolve the populated variant.
                v_list = getattr(v_list, v_list.WhichOneof("valueList")).val
                for idx in range(len(v_list)):
                    row = {
                        response.entityName: entity_key,
                        feature_name: v_list[idx]
                    }
                    if features[feature_name].HasField("timestampList"):
                        ts_seconds = \
                            features[feature_name].timestampList.val[idx].seconds
                        row["timestamp"] = datetime.fromtimestamp(ts_seconds)
                    rows.append(row)
                feature_tables.append(pd.DataFrame(rows))
            entity_table = feature_tables[0]
            for idx in range(1, len(feature_tables)):
                entity_table = pd.merge(left=entity_table,
                                        right=feature_tables[idx],
                                        how='outer')
            entity_tables.append(entity_table)
        if len(entity_tables) == 0:
            # No entities returned: produce an empty frame with the expected
            # column layout so downstream code can still operate on it.
            return pd.DataFrame(columns=[feature_set.entity, "timestamp"] +
                                feature_set.features)
        df = pd.concat(entity_tables)
        return df.reset_index(drop=True)

    def _apply(self, obj):
        """Applies a single object to feast core.

        Args:
            obj (object): one of
                [Feature, Entity, FeatureGroup, Storage, Importer]
        """
        if isinstance(obj, Feature):
            return self._apply_feature(obj)
        elif isinstance(obj, Entity):
            return self._apply_entity(obj)
        elif isinstance(obj, FeatureGroup):
            return self._apply_feature_group(obj)
        elif isinstance(obj, Storage):
            return self._apply_storage(obj)
        else:
            raise TypeError('Apply can only be passed one of the following \
            types: [Feature, Entity, FeatureGroup, Storage, Importer]')

    def _apply_feature(self, feature):
        """Apply the feature to the core API

        Args:
            feature (feast.sdk.resources.feature.Feature): feature to apply
        """
        self._connect_core()
        response = self._core_service_stub.ApplyFeature(feature.spec)
        if self.verbose:
            print("Successfully applied feature with id: {}\n---\n{}".format(
                response.featureId, feature))
        return response.featureId

    def _apply_entity(self, entity):
        """Apply the entity to the core API

        Args:
            entity (feast.sdk.resources.entity.Entity): entity to apply
        """
        self._connect_core()
        response = self._core_service_stub.ApplyEntity(entity.spec)
        if self.verbose:
            print("Successfully applied entity with name: {}\n---\n{}".format(
                response.entityName, entity))
        return response.entityName

    def _apply_feature_group(self, feature_group):
        """Apply the feature group to the core API

        Args:
            feature_group (feast.sdk.resources.feature_group.FeatureGroup):
                feature group to apply
        """
        self._connect_core()
        response = self._core_service_stub.ApplyFeatureGroup(
            feature_group.spec)
        if self.verbose:
            print("Successfully applied feature group with id: " +
                  "{}\n---\n{}".format(response.featureGroupId,
                                       feature_group))
        return response.featureGroupId

    def _apply_storage(self, storage):
        """Apply the storage to the core API

        Args:
            storage (feast.sdk.resources.storage.Storage): storage to apply
        """
        self._connect_core()
        response = self._core_service_stub.ApplyStorage(storage.spec)
        if self.verbose:
            print("Successfully applied storage with id: " +
                  "{}\n{}".format(response.storageId, storage))
        return response.storageId

    def _check_create_dataset_args(self, feature_set, start_date, end_date,
                                   limit):
        """Validate create_dataset arguments.

        Raises:
            ValueError: on an empty feature set, an inverted date range, or
                a non-positive limit.
        """
        if len(feature_set.features) < 1:
            raise ValueError("feature set is empty")

        start = _parse_date(start_date)
        end = _parse_date(end_date)
        if end < start:
            raise ValueError("end_date is before start_date")

        if limit is not None and limit < 1:
            raise ValueError("limit is not a positive integer")
class Client: """ Feast Client: Used for creating, managing, and retrieving features. """ def __init__(self, core_url: str = None, serving_url: str = None, verbose: bool = False): """ The Feast Client should be initialized with at least one service url Args: core_url: Feast Core URL. Used to manage features serving_url: Feast Serving URL. Used to retrieve features verbose: Enable verbose logging """ self._core_url = core_url self._serving_url = serving_url self._verbose = verbose self.__core_channel: grpc.Channel = None self.__serving_channel: grpc.Channel = None self._core_service_stub: CoreServiceStub = None self._serving_service_stub: ServingServiceStub = None @property def core_url(self) -> str: """ Retrieve Feast Core URL """ if self._core_url is not None: return self._core_url if os.getenv(FEAST_CORE_URL_ENV_KEY) is not None: return os.getenv(FEAST_CORE_URL_ENV_KEY) return "" @core_url.setter def core_url(self, value: str): """ Set the Feast Core URL Returns: Feast Core URL string """ self._core_url = value @property def serving_url(self) -> str: """ Retrieve Serving Core URL """ if self._serving_url is not None: return self._serving_url if os.getenv(FEAST_SERVING_URL_ENV_KEY) is not None: return os.getenv(FEAST_SERVING_URL_ENV_KEY) return "" @serving_url.setter def serving_url(self, value: str): """ Set the Feast Serving URL Returns: Feast Serving URL string """ self._serving_url = value def version(self): """ Returns version information from Feast Core and Feast Serving """ result = {} if self.serving_url: self._connect_serving() serving_version = self._serving_service_stub.GetFeastServingInfo( GetFeastServingInfoRequest(), timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT).version result["serving"] = { "url": self.serving_url, "version": serving_version } if self.core_url: self._connect_core() core_version = self._core_service_stub.GetFeastCoreVersion( GetFeastCoreVersionRequest(), timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT).version result["core"] = {"url": 
self.core_url, "version": core_version} return result def _connect_core(self, skip_if_connected: bool = True): """ Connect to Core API Args: skip_if_connected: Do not attempt to connect if already connected """ if skip_if_connected and self._core_service_stub: return if not self.core_url: raise ValueError("Please set Feast Core URL.") if self.__core_channel is None: self.__core_channel = grpc.insecure_channel(self.core_url) try: grpc.channel_ready_future(self.__core_channel).result( timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT) except grpc.FutureTimeoutError: raise ConnectionError( f"Connection timed out while attempting to connect to Feast " f"Core gRPC server {self.core_url} ") else: self._core_service_stub = CoreServiceStub(self.__core_channel) def _connect_serving(self, skip_if_connected=True): """ Connect to Serving API Args: skip_if_connected: Do not attempt to connect if already connected """ if skip_if_connected and self._serving_service_stub: return if not self.serving_url: raise ValueError("Please set Feast Serving URL.") if self.__serving_channel is None: self.__serving_channel = grpc.insecure_channel(self.serving_url) try: grpc.channel_ready_future(self.__serving_channel).result( timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT) except grpc.FutureTimeoutError: raise ConnectionError( f"Connection timed out while attempting to connect to Feast " f"Serving gRPC server {self.serving_url} ") else: self._serving_service_stub = ServingServiceStub( self.__serving_channel) def apply(self, feature_sets: Union[List[FeatureSet], FeatureSet]): """ Idempotently registers feature set(s) with Feast Core. Either a single feature set or a list can be provided. 
Args: feature_sets: List of feature sets that will be registered """ if not isinstance(feature_sets, list): feature_sets = [feature_sets] for feature_set in feature_sets: if isinstance(feature_set, FeatureSet): self._apply_feature_set(feature_set) continue raise ValueError( f"Could not determine feature set type to apply {feature_set}") def _apply_feature_set(self, feature_set: FeatureSet): """ Registers a single feature set with Feast Args: feature_set: Feature set that will be registered """ self._connect_core() feature_set._client = self feature_set.is_valid() # Convert the feature set to a request and send to Feast Core apply_fs_response = self._core_service_stub.ApplyFeatureSet( ApplyFeatureSetRequest(feature_set=feature_set.to_proto()), timeout=GRPC_CONNECTION_TIMEOUT_APPLY, ) # type: ApplyFeatureSetResponse # Extract the returned feature set applied_fs = FeatureSet.from_proto(apply_fs_response.feature_set) # If the feature set has changed, update the local copy if apply_fs_response.status == ApplyFeatureSetResponse.Status.CREATED: print( f'Feature set updated/created: "{applied_fs.name}:{applied_fs.version}"' ) # If no change has been applied, do nothing if apply_fs_response.status == ApplyFeatureSetResponse.Status.NO_CHANGE: print(f"No change detected or applied: {feature_set.name}") # Deep copy from the returned feature set to the local feature set feature_set.update_from_feature_set(applied_fs) def list_feature_sets(self) -> List[FeatureSet]: """ Retrieve a list of feature sets from Feast Core Returns: List of feature sets """ self._connect_core() # Get latest feature sets from Feast Core feature_set_protos = self._core_service_stub.ListFeatureSets( ListFeatureSetsRequest()) # type: ListFeatureSetsResponse # Extract feature sets and return feature_sets = [] for feature_set_proto in feature_set_protos.feature_sets: feature_set = FeatureSet.from_proto(feature_set_proto) feature_set._client = self feature_sets.append(feature_set) return feature_sets def 
get_feature_set(self, name: str, version: int = None) -> Union[FeatureSet, None]: """ Retrieves a feature set. If no version is specified then the latest version will be returned. Args: name: Name of feature set version: Version of feature set Returns: Returns either the specified feature set, or raises an exception if none is found """ self._connect_core() if version is None: version = 0 get_feature_set_response = self._core_service_stub.GetFeatureSet( GetFeatureSetRequest( name=name.strip(), version=int(version))) # type: GetFeatureSetResponse return FeatureSet.from_proto(get_feature_set_response.feature_set) def list_entities(self) -> Dict[str, Entity]: """ Returns a dictionary of entities across all feature sets Returns: Dictionary of entities, indexed by name """ entities_dict = OrderedDict() for fs in self.list_feature_sets(): for entity in fs.entities: entities_dict[entity.name] = entity return entities_dict def get_batch_features(self, feature_ids: List[str], entity_rows: pd.DataFrame) -> Job: """ Retrieves historical features from a Feast Serving deployment. Args: feature_ids: List of feature ids that will be returned for each entity. Each feature id should have the following format "feature_set_name:version:feature_name". entity_rows: Pandas dataframe containing entities and a 'datetime' column. Each entity in a feature set must be present as a column in this dataframe. 
The datetime column must Returns: Returns a job object that can be used to monitor retrieval progress asynchronously, and can be used to materialize the results Examples: >>> from feast import Client >>> from datetime import datetime >>> >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566") >>> feature_ids = ["customer:1:bookings_7d"] >>> entity_rows = pd.DataFrame( >>> { >>> "datetime": [pd.datetime.now() for _ in range(3)], >>> "customer": [1001, 1002, 1003], >>> } >>> ) >>> feature_retrieval_job = feast_client.get_batch_features(feature_ids, entity_rows) >>> df = feature_retrieval_job.to_dataframe() >>> print(df) """ self._connect_serving() fs_request = _build_feature_set_request(feature_ids) # Validate entity rows based on entities in Feast Core self._validate_entity_rows_for_batch_retrieval(entity_rows, fs_request) # Remove timezone from datetime column if isinstance(entity_rows["datetime"].dtype, pd.core.dtypes.dtypes.DatetimeTZDtype): entity_rows["datetime"] = pd.DatetimeIndex( entity_rows["datetime"]).tz_localize(None) # Retrieve serving information to determine store type and # staging location serving_info = self._serving_service_stub.GetFeastServingInfo( GetFeastServingInfoRequest(), timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT ) # type: GetFeastServingInfoResponse if serving_info.type != FeastServingType.FEAST_SERVING_TYPE_BATCH: raise Exception( f'You are connected to a store "{self._serving_url}" which ' f"does not support batch retrieval ") # Export and upload entity row dataframe to staging location # provided by Feast staged_file = export_dataframe_to_staging_location( entity_rows, serving_info.job_staging_location) # type: str request = GetBatchFeaturesRequest( feature_sets=fs_request, dataset_source=DatasetSource(file_source=DatasetSource.FileSource( file_uris=[staged_file], data_format=DataFormat.DATA_FORMAT_AVRO)), ) # Retrieve Feast Job object to manage life cycle of retrieval response = 
self._serving_service_stub.GetBatchFeatures(request) return Job(response.job, self._serving_service_stub) def _validate_entity_rows_for_batch_retrieval(self, entity_rows, feature_sets_request): """ Validate whether an entity_row dataframe contains the correct information for batch retrieval Args: entity_rows: Pandas dataframe containing entities and datetime column. Each entity in a feature set must be present as a column in this dataframe. feature_sets_request: Feature sets that will be requested """ # Ensure datetime column exists if "datetime" not in entity_rows.columns: raise ValueError( f'Entity rows does not contain "datetime" column in columns ' f"{entity_rows.columns}") # Validate dataframe columns based on feature set entities for feature_set in feature_sets_request: fs = self.get_feature_set(name=feature_set.name, version=feature_set.version) if fs is None: raise ValueError( f'Feature set "{feature_set.name}:{feature_set.version}" ' f"could not be found") for entity_type in fs.entities: if entity_type.name not in entity_rows.columns: raise ValueError( f'Dataframe does not contain entity "{entity_type.name}"' f' column in columns "{entity_rows.columns}"') def get_online_features( self, feature_ids: List[str], entity_rows: List[GetOnlineFeaturesRequest.EntityRow], ) -> GetOnlineFeaturesResponse: """ Retrieves the latest online feature data from Feast Serving Args: feature_ids: List of feature Ids in the following format [feature_set_name]:[version]:[feature_name] example: ["feature_set_1:6:my_feature_1", "feature_set_1:6:my_feature_2",] entity_rows: List of GetFeaturesRequest.EntityRow where each row contains entities. Timestamp should not be set for online retrieval. 
All entity types within a feature Returns: Returns a list of maps where each item in the list contains the latest feature values for the provided entities """ self._connect_serving() return self._serving_service_stub.GetOnlineFeatures( GetOnlineFeaturesRequest( feature_sets=_build_feature_set_request(feature_ids), entity_rows=entity_rows, )) # type: GetOnlineFeaturesResponse def ingest( self, feature_set: Union[str, FeatureSet], source: Union[pd.DataFrame, str], version: int = None, force_update: bool = False, max_workers: int = CPU_COUNT, disable_progress_bar: bool = False, chunk_size: int = 5000, timeout: int = None, ): """ Loads feature data into Feast for a specific feature set. Args: feature_set: Name of feature set or a feature set object source: Either a file path or Pandas Dataframe to ingest into Feast Files that are currently supported: * parquet * csv * json version: Feature set version force_update: Automatically update feature set based on source data prior to ingesting. This will also register changes to Feast max_workers: Number of worker processes to use to encode values disable_progress_bar: Disable printing of progress statistics chunk_size: Maximum amount of rows to load into memory and ingest at a time timeout: Seconds to wait before ingestion times out """ if isinstance(feature_set, FeatureSet): name = feature_set.name if version is None: version = feature_set.version elif isinstance(feature_set, str): name = feature_set else: raise Exception(f"Feature set name must be provided") table = _read_table_from_source(source) # Update the feature set based on DataFrame schema if force_update: # Use a small as reference DataFrame to infer fields ref_df = table.to_batches(max_chunksize=20)[0].to_pandas() feature_set.infer_fields_from_df(ref_df, discard_unused_fields=True, replace_existing_features=True) self.apply(feature_set) feature_set = self.get_feature_set(name, version) if feature_set.source.source_type == "Kafka": ingest_table_to_kafka( 
feature_set=feature_set, table=table, max_workers=max_workers, disable_pbar=disable_progress_bar, chunk_size=chunk_size, timeout=timeout, ) else: raise Exception(f"Could not determine source type for feature set " f'"{feature_set.name}" with source type ' f'"{feature_set.source.source_type}"')