class Client: """ Feast Client: Used for creating, managing, and retrieving features. """ def __init__(self, options: Optional[Dict[str, str]] = None, **kwargs): """ The Feast Client should be initialized with at least one service URL Args: core_url: Feast Core URL. Used to manage features serving_url: Feast Serving URL. Used to retrieve features project: Sets the active project. This field is optional. core_secure: Use client-side SSL/TLS for Core gRPC API serving_secure: Use client-side SSL/TLS for Serving gRPC API options: Configuration options to initialize client with **kwargs: Additional keyword arguments that will be used as configuration options along with "options" """ if options is None: options = dict() self._config = Config(options={**options, **kwargs}) self.__core_channel: grpc.Channel = None self.__serving_channel: grpc.Channel = None self._core_service_stub: CoreServiceStub = None self._serving_service_stub: ServingServiceStub = None @property def core_url(self) -> str: """ Retrieve Feast Core URL Returns: Feast Core URL string """ return self._config.get(CONFIG_CORE_URL_KEY) @core_url.setter def core_url(self, value: str): """ Set the Feast Core URL Args: value: Feast Core URL """ self._config.set(CONFIG_CORE_URL_KEY, value) @property def serving_url(self) -> str: """ Retrieve Feast Serving URL Returns: Feast Serving URL string """ return self._config.get(CONFIG_SERVING_URL_KEY) @serving_url.setter def serving_url(self, value: str): """ Set the Feast Serving URL Args: value: Feast Serving URL """ self._config.set(CONFIG_SERVING_URL_KEY, value) @property def core_secure(self) -> bool: """ Retrieve Feast Core client-side SSL/TLS setting Returns: Whether client-side SSL/TLS is enabled """ return self._config.getboolean(CONFIG_CORE_SECURE_KEY) @core_secure.setter def core_secure(self, value: bool): """ Set the Feast Core client-side SSL/TLS setting Args: value: True to enable client-side SSL/TLS """ self._config.set(CONFIG_CORE_SECURE_KEY, value) @property def serving_secure(self) -> bool: """ Retrieve Feast Serving client-side SSL/TLS setting Returns: Whether client-side SSL/TLS is enabled """ return self._config.getboolean(CONFIG_SERVING_SECURE_KEY) @serving_secure.setter def serving_secure(self, value: bool): """ Set the Feast Serving client-side SSL/TLS setting Args: value: True to enable client-side SSL/TLS """ self._config.set(CONFIG_SERVING_SECURE_KEY, value) def version(self): """ Returns version information from Feast Core and Feast Serving """ result = {} if self.serving_url: self._connect_serving() serving_version = self._serving_service_stub.GetFeastServingInfo( GetFeastServingInfoRequest(), timeout=self._config.getint( CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY), ).version result["serving"] = { "url": self.serving_url, "version": serving_version } if self.core_url: self._connect_core() core_version = self._core_service_stub.GetFeastCoreVersion( GetFeastCoreVersionRequest(), timeout=self._config.getint( CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY), ).version result["core"] = {"url": self.core_url, "version": core_version} return result def _connect_core(self, skip_if_connected: bool = True): """ Connect to Core API Args: skip_if_connected: Do not attempt to connect if already connected """ if skip_if_connected and self._core_service_stub: return if not self.core_url: raise ValueError("Please set Feast Core URL.") if self.__core_channel is None: if self.core_secure or self.core_url.endswith(":443"): self.__core_channel = grpc.secure_channel( self.core_url,
grpc.ssl_channel_credentials()) else: self.__core_channel = grpc.insecure_channel(self.core_url) try: grpc.channel_ready_future( self.__core_channel).result(timeout=self._config.getint( CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY)) except grpc.FutureTimeoutError: raise ConnectionError( f"Connection timed out while attempting to connect to Feast " f"Core gRPC server {self.core_url} ") else: self._core_service_stub = CoreServiceStub(self.__core_channel) def _connect_serving(self, skip_if_connected=True): """ Connect to Serving API Args: skip_if_connected: Do not attempt to connect if already connected """ if skip_if_connected and self._serving_service_stub: return if not self.serving_url: raise ValueError("Please set Feast Serving URL.") if self.__serving_channel is None: if self.serving_secure or self.serving_url.endswith(":443"): self.__serving_channel = grpc.secure_channel( self.serving_url, grpc.ssl_channel_credentials()) else: self.__serving_channel = grpc.insecure_channel( self.serving_url) try: grpc.channel_ready_future( self.__serving_channel).result(timeout=self._config.getint( CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY)) except grpc.FutureTimeoutError: raise ConnectionError( f"Connection timed out while attempting to connect to Feast " f"Serving gRPC server {self.serving_url} ") else: self._serving_service_stub = ServingServiceStub( self.__serving_channel) @property def project(self) -> Union[str, None]: """ Retrieve currently active project Returns: Project name """ return self._config.get(CONFIG_PROJECT_KEY) def set_project(self, project: str): """ Set currently active Feast project Args: project: Project to set as active """ self._config.set(CONFIG_PROJECT_KEY, project) def list_projects(self) -> List[str]: """ List all active Feast projects Returns: List of project names """ self._connect_core() response = self._core_service_stub.ListProjects( ListProjectsRequest(), timeout=self._config.getint( CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY), ) # type: ListProjectsResponse return list(response.projects) def create_project(self, project: str): """ Creates a Feast project Args: project: Name of project """ self._connect_core() self._core_service_stub.CreateProject( CreateProjectRequest(name=project), timeout=self._config.getint( CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY), ) # type: CreateProjectResponse def archive_project(self, project): """ Archives a project. The project will continue to function for ingestion and retrieval, but will be in a read-only state. It will also not be visible from the Core API for management purposes. Args: project: Name of project to archive """ self._connect_core() self._core_service_stub.ArchiveProject( ArchiveProjectRequest(name=project), timeout=self._config.getint( CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY), ) # type: ArchiveProjectResponse # Reset the active project if it was the one archived if self.project == project: self._config.set(CONFIG_PROJECT_KEY, "") def apply(self, feature_sets: Union[List[FeatureSet], FeatureSet]): """ Idempotently registers feature set(s) with Feast Core. Either a single feature set or a list can be provided.
Args: feature_sets: List of feature sets that will be registered """ if not isinstance(feature_sets, list): feature_sets = [feature_sets] for feature_set in feature_sets: if isinstance(feature_set, FeatureSet): self._apply_feature_set(feature_set) continue raise ValueError( f"Could not determine feature set type to apply {feature_set}") def _apply_feature_set(self, feature_set: FeatureSet): """ Registers a single feature set with Feast Args: feature_set: Feature set that will be registered """ self._connect_core() feature_set.is_valid() feature_set_proto = feature_set.to_proto() if len(feature_set_proto.spec.project) == 0: if self.project is None: raise ValueError( f"No project found in feature set {feature_set.name}. " f"Please set the project within the feature set or within " f"your Feast Client.") else: feature_set_proto.spec.project = self.project # Convert the feature set to a request and send to Feast Core try: apply_fs_response = self._core_service_stub.ApplyFeatureSet( ApplyFeatureSetRequest(feature_set=feature_set_proto), timeout=self._config.getint( CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY), ) # type: ApplyFeatureSetResponse except grpc.RpcError as e: raise grpc.RpcError(e.details()) # Extract the returned feature set applied_fs = FeatureSet.from_proto(apply_fs_response.feature_set) # If the feature set has changed, update the local copy if apply_fs_response.status == ApplyFeatureSetResponse.Status.CREATED: print( f'Feature set updated/created: "{applied_fs.name}:{applied_fs.version}"' ) # If no change has been applied, do nothing if apply_fs_response.status == ApplyFeatureSetResponse.Status.NO_CHANGE: print(f"No change detected or applied: {feature_set.name}") # Deep copy from the returned feature set to the local feature set feature_set._update_from_feature_set(applied_fs) def list_feature_sets(self, project: str = None, name: str = None, version: str = None) -> List[FeatureSet]: """ Retrieve a list of feature sets from Feast Core Args: project: Filter feature sets based on project name name: Filter feature sets based on feature set name version: Filter feature sets based on version number Returns: List of feature sets """ self._connect_core() if project is None: if self.project is not None: project = self.project else: project = "*" if name is None: name = "*" if version is None: version = "*" filter = ListFeatureSetsRequest.Filter(project=project, feature_set_name=name, feature_set_version=version) # Get latest feature sets from Feast Core feature_set_protos = self._core_service_stub.ListFeatureSets( ListFeatureSetsRequest( filter=filter)) # type: ListFeatureSetsResponse # Extract feature sets and return feature_sets = [] for feature_set_proto in feature_set_protos.feature_sets: feature_set = FeatureSet.from_proto(feature_set_proto) feature_set._client = self feature_sets.append(feature_set) return feature_sets def get_feature_set(self, name: str, version: int = None, project: str = None) -> Union[FeatureSet, None]: """ Retrieves a feature set. If no version is specified then the latest version will be returned.
Args: project: Feast project that this feature set belongs to name: Name of feature set version: Version of feature set Returns: Returns either the specified feature set, or raises an exception if none is found """ self._connect_core() if project is None: if self.project is not None: project = self.project else: raise ValueError("No project has been configured.") if version is None: version = 0 try: get_feature_set_response = self._core_service_stub.GetFeatureSet( GetFeatureSetRequest( project=project, name=name.strip(), version=int(version))) # type: GetFeatureSetResponse except grpc.RpcError as e: raise grpc.RpcError(e.details()) return FeatureSet.from_proto(get_feature_set_response.feature_set) def list_entities(self) -> Dict[str, Entity]: """ Returns a dictionary of entities across all feature sets Returns: Dictionary of entities, indexed by name """ entities_dict = OrderedDict() for fs in self.list_feature_sets(): for entity in fs.entities: entities_dict[entity.name] = entity return entities_dict def get_batch_features( self, feature_refs: List[str], entity_rows: Union[pd.DataFrame, str], default_project: str = None, ) -> RetrievalJob: """ Retrieves historical features from a Feast Serving deployment. Args: feature_refs (List[str]): List of feature references that will be returned for each entity. Each feature reference should have the following format "project/feature:version". entity_rows (Union[pd.DataFrame, str]): Pandas dataframe containing entities and a 'datetime' column. Each entity in a feature set must be present as a column in this dataframe. The datetime column must contain timestamps in datetime64 format. default_project: Default project where feature values will be found. Returns: feast.job.RetrievalJob: Returns a retrieval job object that can be used to monitor retrieval progress asynchronously, and can be used to materialize the results.
Examples: >>> from feast import Client >>> from datetime import datetime >>> import pandas as pd >>> >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566") >>> feature_refs = ["my_project/bookings_7d:1", "booking_14d"] >>> entity_rows = pd.DataFrame( >>> { >>> "datetime": [datetime.now() for _ in range(3)], >>> "customer": [1001, 1002, 1003], >>> } >>> ) >>> feature_retrieval_job = feast_client.get_batch_features( >>> feature_refs, entity_rows, default_project="my_project") >>> df = feature_retrieval_job.to_dataframe() >>> print(df) """ self._connect_serving() feature_references = _build_feature_references( feature_refs=feature_refs, default_project=default_project) # Retrieve serving information to determine store type and # staging location serving_info = self._serving_service_stub.GetFeastServingInfo( GetFeastServingInfoRequest(), timeout=self._config.getint( CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY), ) # type: GetFeastServingInfoResponse if serving_info.type != FeastServingType.FEAST_SERVING_TYPE_BATCH: raise Exception( f'You are connected to a store "{self.serving_url}" which ' f"does not support batch retrieval ") if isinstance(entity_rows, pd.DataFrame): # Pandas DataFrame detected # Remove timezone from datetime column if isinstance(entity_rows["datetime"].dtype, pd.core.dtypes.dtypes.DatetimeTZDtype): entity_rows["datetime"] = pd.DatetimeIndex( entity_rows["datetime"]).tz_localize(None) elif isinstance(entity_rows, str): # String based source if not entity_rows.endswith((".avro", "*")): raise Exception( "Only .avro and wildcard paths are accepted as entity_rows" ) else: raise Exception(f"Only pandas.DataFrame and str types are allowed" f" as entity_rows, but got {type(entity_rows)}.") # Export and upload entity row DataFrame to staging location # provided by Feast staged_files = export_source_to_staging_location( entity_rows, serving_info.job_staging_location) # type: List[str] request = GetBatchFeaturesRequest( features=feature_references, dataset_source=DatasetSource(file_source=DatasetSource.FileSource( file_uris=staged_files, data_format=DataFormat.DATA_FORMAT_AVRO)), ) # Retrieve Feast Job object to manage life cycle of retrieval response = self._serving_service_stub.GetBatchFeatures(request) return RetrievalJob(response.job, self._serving_service_stub) def get_online_features( self, feature_refs: List[str], entity_rows: List[GetOnlineFeaturesRequest.EntityRow], default_project: Optional[str] = None, ) -> GetOnlineFeaturesResponse: """ Retrieves the latest online feature data from Feast Serving Args: feature_refs: List of feature references in the following format [project]/[feature_name]:[version]. Only the feature name is a required component in the reference. example: ["my_project/my_feature_1:3", "my_project3/my_feature_4:1",] entity_rows: List of GetFeaturesRequest.EntityRow where each row contains entities. Timestamp should not be set for online retrieval.
All entity types within a feature set must be provided for each entity key. default_project: This project will be used if the project name is not provided in the feature reference Returns: Returns a list of maps where each item in the list contains the latest feature values for the provided entities """ self._connect_serving() return self._serving_service_stub.GetOnlineFeatures( GetOnlineFeaturesRequest( features=_build_feature_references( feature_refs=feature_refs, default_project=(default_project if default_project is not None else self.project), ), entity_rows=entity_rows, )) def list_ingest_jobs( self, job_id: str = None, feature_set_ref: FeatureSetRef = None, store_name: str = None, ): """ List the ingestion jobs currently registered in Feast, with optional filters. Provides detailed metadata about each ingestion job. Args: job_id: Select specific ingestion job with the given job_id feature_set_ref: Filter ingestion jobs by target feature set (via reference) store_name: Filter ingestion jobs by target feast store's name Returns: List of IngestJobs matching the given filters """ self._connect_core() # construct list request, converting the feature set reference # to its proto form if one was given feature_set_ref_proto = None if feature_set_ref is not None: feature_set_ref_proto = feature_set_ref.to_proto() list_filter = ListIngestionJobsRequest.Filter( id=job_id, feature_set_reference=feature_set_ref_proto, store_name=store_name, ) request = ListIngestionJobsRequest(filter=list_filter) # make list request & unpack response response = self._core_service_stub.ListIngestionJobs(request) ingest_jobs = [ IngestJob(proto, self._core_service_stub) for proto in response.jobs ] return ingest_jobs def restart_ingest_job(self, job: IngestJob): """ Restart an ingestion job currently registered in Feast. NOTE: Data might be lost during the restart for some job runners. Does not support restarting a job in a transitional (i.e. pending, suspending, aborting), terminal (i.e. suspended or aborted), or unknown state Args: job: IngestJob to restart """ self._connect_core() request = RestartIngestionJobRequest(id=job.id) try: self._core_service_stub.RestartIngestionJob(request) except grpc.RpcError as e: raise grpc.RpcError(e.details()) def stop_ingest_job(self, job: IngestJob): """ Stop an ingestion job currently registered in Feast. Does nothing if the target job is already in a terminal state (i.e. suspended or aborted). Does not support stopping a job in a transitional (i.e. pending, suspending, aborting) or unknown state Args: job: IngestJob to stop """ self._connect_core() request = StopIngestionJobRequest(id=job.id) try: self._core_service_stub.StopIngestionJob(request) except grpc.RpcError as e: raise grpc.RpcError(e.details()) def ingest( self, feature_set: Union[str, FeatureSet], source: Union[pd.DataFrame, str], chunk_size: int = 10000, version: int = None, force_update: bool = False, max_workers: int = max(CPU_COUNT - 1, 1), disable_progress_bar: bool = False, timeout: int = KAFKA_CHUNK_PRODUCTION_TIMEOUT, ) -> None: """ Loads feature data into Feast for a specific feature set. Args: feature_set (typing.Union[str, feast.feature_set.FeatureSet]): Feature set object or the string name of the feature set (without a version). source (typing.Union[pd.DataFrame, str]): Either a file path or Pandas Dataframe to ingest into Feast Files that are currently supported: * parquet * csv * json chunk_size (int): Number of rows to load and ingest at a time. version (int): Feature set version. force_update (bool): Automatically update feature set based on source data prior to ingesting. This will also register changes to Feast. max_workers (int): Number of worker processes to use to encode values.
disable_progress_bar (bool): Disable printing of progress statistics. timeout (int): Timeout in seconds to wait for completion. Returns: None: None """ if isinstance(feature_set, FeatureSet): name = feature_set.name if version is None: version = feature_set.version elif isinstance(feature_set, str): name = feature_set else: raise Exception(f"Feature set name must be provided") # Read table and get row count dir_path, dest_path = _read_table_from_source(source, chunk_size, max_workers) pq_file = pq.ParquetFile(dest_path) row_count = pq_file.metadata.num_rows # Update the feature set based on PyArrow table of first row group if force_update: feature_set.infer_fields_from_pa( table=pq_file.read_row_group(0), discard_unused_fields=True, replace_existing_features=True, ) self.apply(feature_set) current_time = time.time() print("Waiting for feature set to be ready for ingestion...") while True: if timeout is not None and time.time() - current_time >= timeout: raise TimeoutError( "Timed out waiting for feature set to be ready") feature_set = self.get_feature_set(name, version) if (feature_set is not None and feature_set.status == FeatureSetStatus.STATUS_READY): break time.sleep(3) if timeout is not None: timeout = timeout - int(time.time() - current_time) try: # Kafka configs brokers = feature_set.get_kafka_source_brokers() topic = feature_set.get_kafka_source_topic() producer = get_producer(brokers, row_count, disable_progress_bar) # Loop optimization declarations produce = producer.produce flush = producer.flush # Transform and push data to Kafka if feature_set.source.source_type == "Kafka": for chunk in get_feature_row_chunks( file=dest_path, row_groups=list(range(pq_file.num_row_groups)), fs=feature_set, max_workers=max_workers, ): # Push FeatureRow one chunk at a time to kafka for serialized_row in chunk: produce(topic=topic, value=serialized_row) # Force a flush after each chunk flush(timeout=timeout) # Remove chunk from memory del chunk else: raise Exception( f"Could not determine source type for feature set " f'"{feature_set.name}" with source type ' f'"{feature_set.source.source_type}"') # Print ingestion statistics producer.print_results() finally: # Remove parquet file(s) that were created earlier print("Removing temporary file(s)...") shutil.rmtree(dir_path) return None
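# --- Usage sketch: config-based Client above (illustrative, not part of the
# class). A minimal end-to-end flow under stated assumptions: a Feast Core
# instance at localhost:6565, a batch-capable Feast Serving instance at
# localhost:6566, and an already-registered feature set. The project, feature
# set, and column names ("my_project", "customer_transactions", "customer_id",
# "daily_transactions") are hypothetical placeholders.

from datetime import datetime

import pandas as pd

from feast import Client

client = Client(core_url="localhost:6565", serving_url="localhost:6566")
client.set_project("my_project")

# Fetch the latest version of a registered feature set and ingest rows into it
customer_fs = client.get_feature_set("customer_transactions")
df = pd.DataFrame(
    {
        "datetime": [datetime.now() for _ in range(3)],
        "customer_id": [1001, 1002, 1003],
        "daily_transactions": [2.5, 0.0, 4.25],
    }
)
client.ingest(customer_fs, df)

# Batch retrieval returns a RetrievalJob that materializes asynchronously
job = client.get_batch_features(
    feature_refs=["my_project/daily_transactions"],
    entity_rows=df[["datetime", "customer_id"]],
)
print(job.to_dataframe())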
class Client: """ Feast Client: Used for creating, managing, and retrieving features. """ def __init__(self, core_url: str = None, serving_url: str = None, verbose: bool = False): """ The Feast Client should be initialized with at least one service URL Args: core_url: Feast Core URL. Used to manage features serving_url: Feast Serving URL. Used to retrieve features verbose: Enable verbose logging """ self._core_url = core_url self._serving_url = serving_url self._verbose = verbose self.__core_channel: grpc.Channel = None self.__serving_channel: grpc.Channel = None self._core_service_stub: CoreServiceStub = None self._serving_service_stub: ServingServiceStub = None @property def core_url(self) -> str: """ Retrieve Feast Core URL """ if self._core_url is not None: return self._core_url if os.getenv(FEAST_CORE_URL_ENV_KEY) is not None: return os.getenv(FEAST_CORE_URL_ENV_KEY) return "" @core_url.setter def core_url(self, value: str): """ Set the Feast Core URL Args: value: Feast Core URL """ self._core_url = value @property def serving_url(self) -> str: """ Retrieve Feast Serving URL """ if self._serving_url is not None: return self._serving_url if os.getenv(FEAST_SERVING_URL_ENV_KEY) is not None: return os.getenv(FEAST_SERVING_URL_ENV_KEY) return "" @serving_url.setter def serving_url(self, value: str): """ Set the Feast Serving URL Args: value: Feast Serving URL """ self._serving_url = value def version(self): """ Returns version information from Feast Core and Feast Serving """ result = {} if self.serving_url: self._connect_serving() serving_version = self._serving_service_stub.GetFeastServingInfo( GetFeastServingInfoRequest(), timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT).version result["serving"] = { "url": self.serving_url, "version": serving_version } if self.core_url: self._connect_core() core_version = self._core_service_stub.GetFeastCoreVersion( GetFeastCoreVersionRequest(), timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT).version result["core"] = {"url": self.core_url, "version": core_version} return result def _connect_core(self, skip_if_connected: bool = True): """ Connect to Core API Args: skip_if_connected: Do not attempt to connect if already connected """ if skip_if_connected and self._core_service_stub: return if not self.core_url: raise ValueError("Please set Feast Core URL.") if self.__core_channel is None: self.__core_channel = grpc.insecure_channel(self.core_url) try: grpc.channel_ready_future(self.__core_channel).result( timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT) except grpc.FutureTimeoutError: raise ConnectionError( f"Connection timed out while attempting to connect to Feast " f"Core gRPC server {self.core_url} ") else: self._core_service_stub = CoreServiceStub(self.__core_channel) def _connect_serving(self, skip_if_connected=True): """ Connect to Serving API Args: skip_if_connected: Do not attempt to connect if already connected """ if skip_if_connected and self._serving_service_stub: return if not self.serving_url: raise ValueError("Please set Feast Serving URL.") if self.__serving_channel is None: self.__serving_channel = grpc.insecure_channel(self.serving_url) try: grpc.channel_ready_future(self.__serving_channel).result( timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT) except grpc.FutureTimeoutError: raise ConnectionError( f"Connection timed out while attempting to connect to Feast " f"Serving gRPC server {self.serving_url} ") else: self._serving_service_stub = ServingServiceStub( self.__serving_channel) def apply(self, feature_sets: Union[List[FeatureSet], FeatureSet]): """
Idempotently registers feature set(s) with Feast Core. Either a single feature set or a list can be provided. Args: feature_sets: List of feature sets that will be registered """ if not isinstance(feature_sets, list): feature_sets = [feature_sets] for feature_set in feature_sets: if isinstance(feature_set, FeatureSet): self._apply_feature_set(feature_set) continue raise ValueError( f"Could not determine feature set type to apply {feature_set}") def _apply_feature_set(self, feature_set: FeatureSet): """ Registers a single feature set with Feast Args: feature_set: Feature set that will be registered """ self._connect_core() feature_set._client = self feature_set.is_valid() # Convert the feature set to a request and send to Feast Core apply_fs_response = self._core_service_stub.ApplyFeatureSet( ApplyFeatureSetRequest(feature_set=feature_set.to_proto()), timeout=GRPC_CONNECTION_TIMEOUT_APPLY, ) # type: ApplyFeatureSetResponse # Extract the returned feature set applied_fs = FeatureSet.from_proto(apply_fs_response.feature_set) # If the feature set has changed, update the local copy if apply_fs_response.status == ApplyFeatureSetResponse.Status.CREATED: print( f'Feature set updated/created: "{applied_fs.name}:{applied_fs.version}"' ) # If no change has been applied, do nothing if apply_fs_response.status == ApplyFeatureSetResponse.Status.NO_CHANGE: print(f"No change detected or applied: {feature_set.name}") # Deep copy from the returned feature set to the local feature set feature_set._update_from_feature_set(applied_fs) def list_feature_sets(self) -> List[FeatureSet]: """ Retrieve a list of feature sets from Feast Core Returns: List of feature sets """ self._connect_core() # Get latest feature sets from Feast Core feature_set_protos = self._core_service_stub.ListFeatureSets( ListFeatureSetsRequest()) # type: ListFeatureSetsResponse # Extract feature sets and return feature_sets = [] for feature_set_proto in feature_set_protos.feature_sets: feature_set = FeatureSet.from_proto(feature_set_proto) feature_set._client = self feature_sets.append(feature_set) return feature_sets def get_feature_set(self, name: str, version: int = None) -> Union[FeatureSet, None]: """ Retrieves a feature set. If no version is specified then the latest version will be returned. Args: name: Name of feature set version: Version of feature set Returns: Returns either the specified feature set, or raises an exception if none is found """ self._connect_core() if version is None: version = 0 get_feature_set_response = self._core_service_stub.GetFeatureSet( GetFeatureSetRequest( name=name.strip(), version=int(version))) # type: GetFeatureSetResponse return FeatureSet.from_proto(get_feature_set_response.feature_set) def list_entities(self) -> Dict[str, Entity]: """ Returns a dictionary of entities across all feature sets Returns: Dictionary of entities, indexed by name """ entities_dict = OrderedDict() for fs in self.list_feature_sets(): for entity in fs.entities: entities_dict[entity.name] = entity return entities_dict def get_batch_features(self, feature_ids: List[str], entity_rows: Union[pd.DataFrame, str]) -> Job: """ Retrieves historical features from a Feast Serving deployment. Args: feature_ids (List[str]): List of feature ids that will be returned for each entity. Each feature id should have the following format "feature_set_name:version:feature_name". entity_rows (Union[pd.DataFrame, str]): Pandas dataframe containing entities and a 'datetime' column. 
Each entity in a feature set must be present as a column in this dataframe. The datetime column must contain timestamps in datetime64 format. Returns: feast.job.Job: Returns a job object that can be used to monitor retrieval progress asynchronously, and can be used to materialize the results. Examples: >>> from feast import Client >>> from datetime import datetime >>> import pandas as pd >>> >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566") >>> feature_ids = ["customer:1:bookings_7d"] >>> entity_rows = pd.DataFrame( >>> { >>> "datetime": [datetime.now() for _ in range(3)], >>> "customer": [1001, 1002, 1003], >>> } >>> ) >>> feature_retrieval_job = feast_client.get_batch_features(feature_ids, entity_rows) >>> df = feature_retrieval_job.to_dataframe() >>> print(df) """ self._connect_serving() fs_request = _build_feature_set_request(feature_ids) # Retrieve serving information to determine store type and # staging location serving_info = self._serving_service_stub.GetFeastServingInfo( GetFeastServingInfoRequest(), timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT ) # type: GetFeastServingInfoResponse if serving_info.type != FeastServingType.FEAST_SERVING_TYPE_BATCH: raise Exception( f'You are connected to a store "{self._serving_url}" which ' f"does not support batch retrieval ") if isinstance(entity_rows, pd.DataFrame): # Pandas DataFrame detected # Validate entity rows based on entities in Feast Core self._validate_dataframe_for_batch_retrieval( entity_rows=entity_rows, feature_sets_request=fs_request) # Remove timezone from datetime column if isinstance(entity_rows["datetime"].dtype, pd.core.dtypes.dtypes.DatetimeTZDtype): entity_rows["datetime"] = pd.DatetimeIndex( entity_rows["datetime"]).tz_localize(None) elif isinstance(entity_rows, str): # String based source if entity_rows.endswith((".avro", "*")): # Validate Avro entity rows based on entities in Feast Core self._validate_avro_for_batch_retrieval( source=entity_rows, feature_sets_request=fs_request) else: raise Exception( "Only .avro and wildcard paths are accepted as entity_rows" ) else: raise Exception(f"Only pandas.DataFrame and str types are allowed" f" as entity_rows, but got {type(entity_rows)}.") # Export and upload entity row DataFrame to staging location # provided by Feast staged_files = export_source_to_staging_location( entity_rows, serving_info.job_staging_location) # type: List[str] request = GetBatchFeaturesRequest( feature_sets=fs_request, dataset_source=DatasetSource(file_source=DatasetSource.FileSource( file_uris=staged_files, data_format=DataFormat.DATA_FORMAT_AVRO)), ) # Retrieve Feast Job object to manage life cycle of retrieval response = self._serving_service_stub.GetBatchFeatures(request) return Job(response.job, self._serving_service_stub) def _validate_dataframe_for_batch_retrieval(self, entity_rows: pd.DataFrame, feature_sets_request): """ Validate whether the entity rows in a DataFrame contain the correct information for batch retrieval. A datetime column must be present in the DataFrame. Args: entity_rows (pd.DataFrame): Pandas DataFrame containing entities and datetime column. Each entity in a feature set must be present as a column in this DataFrame. feature_sets_request: Feature sets that will be requested.
""" self._validate_columns(columns=entity_rows.columns, feature_sets_request=feature_sets_request, datetime_field="datetime") def _validate_avro_for_batch_retrieval(self, source: str, feature_sets_request): """ Validate whether the entity rows in an Avro source file contains the correct information for batch retrieval. Only gs:// and local files (file://) uri schemes are allowed. Avro file must have a column named "event_timestamp". No checks will be done if a GCS path is provided. Args: source (str): File path to Avro. feature_sets_request: Feature sets that will be requested. """ p = urlparse(source) if p.scheme == "gs": # GCS path provided (Risk is delegated to user) # No validation if GCS path is provided return elif p.scheme == "file" or not p.scheme: # Local file (file://) provided file_path = os.path.abspath(os.path.join(p.netloc, p.path)) else: raise Exception( f"Unsupported uri scheme provided {p.scheme}, only " f"local files (file://), and gs:// schemes are " f"allowed") with open(file_path, "rb") as f: reader = fastavro.reader(f) schema = json.loads(reader.metadata["avro.schema"]) columns = [x["name"] for x in schema["fields"]] self._validate_columns(columns=columns, feature_sets_request=feature_sets_request, datetime_field="event_timestamp") def _validate_columns(self, columns: List[str], feature_sets_request, datetime_field: str) -> None: """ Check if the required column contains the correct values for batch retrieval. Args: columns (List[str]): List of columns to validate against feature_sets_request. feature_sets_request (): Feature sets that will be requested. datetime_field (str): Name of the datetime field that must be enforced and present as a column in the data source. Returns: None: None """ # Ensure datetime column exists if datetime_field not in columns: raise ValueError( f'Entity rows does not contain "{datetime_field}" column in ' f'columns {columns}') # Validate Avro columns based on feature set entities for feature_set in feature_sets_request: fs = self.get_feature_set(name=feature_set.name, version=feature_set.version) if fs is None: raise ValueError( f'Feature set "{feature_set.name}:{feature_set.version}" ' f"could not be found") for entity_type in fs.entities: if entity_type.name not in columns: raise ValueError( f'Input does not contain entity' f' "{entity_type.name}" column in columns "{columns}"') def get_online_features( self, feature_ids: List[str], entity_rows: List[GetOnlineFeaturesRequest.EntityRow], ) -> GetOnlineFeaturesResponse: """ Retrieves the latest online feature data from Feast Serving Args: feature_ids: List of feature Ids in the following format [feature_set_name]:[version]:[feature_name] example: ["feature_set_1:6:my_feature_1", "feature_set_1:6:my_feature_2",] entity_rows: List of GetFeaturesRequest.EntityRow where each row contains entities. Timestamp should not be set for online retrieval. 
All entity types within a feature set must be provided for each entity key. Returns: Returns a list of maps where each item in the list contains the latest feature values for the provided entities """ self._connect_serving() return self._serving_service_stub.GetOnlineFeatures( GetOnlineFeaturesRequest( feature_sets=_build_feature_set_request(feature_ids), entity_rows=entity_rows, )) # type: GetOnlineFeaturesResponse def ingest(self, feature_set: Union[str, FeatureSet], source: Union[pd.DataFrame, str], chunk_size: int = 10000, version: int = None, force_update: bool = False, max_workers: int = max(CPU_COUNT - 1, 1), disable_progress_bar: bool = False, timeout: int = KAFKA_CHUNK_PRODUCTION_TIMEOUT) -> None: """ Loads feature data into Feast for a specific feature set. Args: feature_set (typing.Union[str, FeatureSet]): Feature set object or the string name of the feature set (without a version). source (typing.Union[pd.DataFrame, str]): Either a file path or Pandas Dataframe to ingest into Feast Files that are currently supported: * parquet * csv * json chunk_size (int): Number of rows to load and ingest at a time. version (int): Feature set version. force_update (bool): Automatically update feature set based on source data prior to ingesting. This will also register changes to Feast. max_workers (int): Number of worker processes to use to encode values. disable_progress_bar (bool): Disable printing of progress statistics. timeout (int): Timeout in seconds to wait for completion. Returns: None: None """ if isinstance(feature_set, FeatureSet): name = feature_set.name if version is None: version = feature_set.version elif isinstance(feature_set, str): name = feature_set else: raise Exception("Feature set name must be provided") # Read table and get row count tmp_table_name = _read_table_from_source(source, chunk_size, max_workers) pq_file = pq.ParquetFile(tmp_table_name) row_count = pq_file.metadata.num_rows # Update the feature set based on PyArrow table of first row group if force_update: feature_set.infer_fields_from_pa(table=pq_file.read_row_group(0), discard_unused_fields=True, replace_existing_features=True) self.apply(feature_set) current_time = time.time() print("Waiting for feature set to be ready for ingestion...") while True: if timeout is not None and time.time() - current_time >= timeout: raise TimeoutError( "Timed out waiting for feature set to be ready") feature_set = self.get_feature_set(name, version) if (feature_set is not None and feature_set.status == FeatureSetStatus.STATUS_READY): break time.sleep(3) if timeout is not None: timeout = timeout - int(time.time() - current_time) try: # Kafka configs brokers = feature_set.get_kafka_source_brokers() topic = feature_set.get_kafka_source_topic() producer = get_producer(brokers, row_count, disable_progress_bar) # Loop optimization declarations produce = producer.produce flush = producer.flush # Transform and push data to Kafka if feature_set.source.source_type == "Kafka": for chunk in get_feature_row_chunks( file=tmp_table_name, row_groups=list(range(pq_file.num_row_groups)), fs=feature_set, max_workers=max_workers): # Push FeatureRow one chunk at a time to kafka for serialized_row in chunk: produce(topic=topic, value=serialized_row) # Force a flush after each chunk flush(timeout=timeout) # Remove chunk from memory del chunk else: raise Exception( f"Could not determine source type for feature set " f'"{feature_set.name}" with source type ' f'"{feature_set.source.source_type}"') # Print ingestion statistics producer.print_results() finally: # Remove parquet file(s) that
were created earlier print("Removing temporary file(s)...") os.remove(tmp_table_name) return None
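# --- Online retrieval sketch for the Client variant above (illustrative) ---
# get_online_features() takes protobuf EntityRow messages. The proto import
# paths below follow the generated modules of the Feast Python SDK of this era
# and should be treated as assumptions; the feature id uses this variant's
# "feature_set_name:version:feature_name" format with hypothetical names.

from feast import Client
from feast.serving.ServingService_pb2 import GetOnlineFeaturesRequest
from feast.types.Value_pb2 import Value

client = Client(core_url="localhost:6565", serving_url="localhost:6566")

# One EntityRow per entity key; timestamps are not set for online retrieval
entity_rows = [
    GetOnlineFeaturesRequest.EntityRow(
        fields={"customer": Value(int64_val=customer_id)}
    )
    for customer_id in (1001, 1002, 1003)
]
response = client.get_online_features(
    feature_ids=["customer_transactions:1:daily_transactions"],
    entity_rows=entity_rows,
)
print(response)  # latest feature values for each entity row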
class Client: def __init__(self, core_url: str = None, serving_url: str = None, verbose: bool = False): self._core_url = core_url self._serving_url = serving_url self._verbose = verbose self.__core_channel: grpc.Channel = None self.__serving_channel: grpc.Channel = None self._core_service_stub: CoreServiceStub = None self._serving_service_stub: ServingServiceStub = None @property def core_url(self) -> str: if self._core_url is not None: return self._core_url if os.getenv(FEAST_CORE_URL_ENV_KEY) is not None: return os.getenv(FEAST_CORE_URL_ENV_KEY) return "" @core_url.setter def core_url(self, value: str): self._core_url = value @property def serving_url(self) -> str: if self._serving_url is not None: return self._serving_url if os.getenv(FEAST_SERVING_URL_ENV_KEY) is not None: return os.getenv(FEAST_SERVING_URL_ENV_KEY) return "" @serving_url.setter def serving_url(self, value: str): self._serving_url = value def version(self): """ Returns version information from Feast Core and Feast Serving :return: Dictionary containing Core and Serving versions and status """ self._connect_core() self._connect_serving() core_version = "" serving_version = "" core_status = "not connected" serving_status = "not connected" try: core_version = self._core_service_stub.GetFeastCoreVersion( GetFeastCoreVersionRequest(), timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT).version core_status = "connected" except grpc.RpcError as e: print( format_grpc_exception("GetFeastCoreVersion", e.code(), e.details())) try: serving_version = self._serving_service_stub.GetFeastServingInfo( GetFeastServingInfoRequest(), timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT).version serving_status = "connected" except grpc.RpcError as e: print( format_grpc_exception("GetFeastServingInfo", e.code(), e.details())) return { "core": { "url": self.core_url, "version": core_version, "status": core_status, }, "serving": { "url": self.serving_url, "version": serving_version, "status": serving_status, }, } def _connect_core(self, skip_if_connected=True): """ Connect to Core API """ if skip_if_connected and self._core_service_stub: return if not self.core_url: raise ValueError("Please set Feast Core URL.") if self.__core_channel is None: self.__core_channel = grpc.insecure_channel(self.core_url) try: grpc.channel_ready_future(self.__core_channel).result( timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT) except grpc.FutureTimeoutError: print( f"Connection timed out while attempting to connect to Feast Core gRPC server {self.core_url}" ) sys.exit(1) else: self._core_service_stub = CoreServiceStub(self.__core_channel) def _connect_serving(self, skip_if_connected=True): """ Connect to Serving API """ if skip_if_connected and self._serving_service_stub: return if not self.serving_url: raise ValueError("Please set Feast Serving URL.") if self.__serving_channel is None: self.__serving_channel = grpc.insecure_channel(self.serving_url) try: grpc.channel_ready_future(self.__serving_channel).result( timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT) except grpc.FutureTimeoutError: print( f"Connection timed out while attempting to connect to Feast Serving gRPC server {self.serving_url} " ) sys.exit(1) else: self._serving_service_stub = ServingServiceStub( self.__serving_channel) def apply(self, feature_sets: Union[List[FeatureSet], FeatureSet]): """ Idempotently registers feature set(s) with Feast Core. Either a single feature set or a list can be provided. 
:param feature_sets: Union[List[FeatureSet], FeatureSet] """ if not isinstance(feature_sets, list): feature_sets = [feature_sets] for feature_set in feature_sets: if isinstance(feature_set, FeatureSet): self._apply_feature_set(feature_set) continue raise ValueError( f"Could not determine feature set type to apply {feature_set}") def _apply_feature_set(self, feature_set: FeatureSet): self._connect_core() feature_set._client = self valid, message = feature_set.is_valid() if not valid: raise Exception(message) try: apply_fs_response = self._core_service_stub.ApplyFeatureSet( ApplyFeatureSetRequest(feature_set=feature_set.to_proto()), timeout=GRPC_CONNECTION_TIMEOUT_APPLY, ) # type: ApplyFeatureSetResponse applied_fs = FeatureSet.from_proto(apply_fs_response.feature_set) if apply_fs_response.status == ApplyFeatureSetResponse.Status.CREATED: print( f'Feature set updated/created: "{applied_fs.name}:{applied_fs.version}".' ) feature_set._update_from_feature_set(applied_fs, is_dirty=False) return if apply_fs_response.status == ApplyFeatureSetResponse.Status.NO_CHANGE: print(f"No change detected in feature set {feature_set.name}") return except grpc.RpcError as e: print( format_grpc_exception("ApplyFeatureSet", e.code(), e.details())) def list_feature_sets(self) -> List[FeatureSet]: """ Retrieve a list of feature sets from Feast Core :return: Returns a list of feature sets """ self._connect_core() try: # Get latest feature sets from Feast Core feature_set_protos = self._core_service_stub.ListFeatureSets( ListFeatureSetsRequest()) # type: ListFeatureSetsResponse except grpc.RpcError as e: raise Exception( format_grpc_exception("ListFeatureSets", e.code(), e.details())) # Store list of feature sets feature_sets = [] for feature_set_proto in feature_set_protos.feature_sets: feature_set = FeatureSet.from_proto(feature_set_proto) feature_set._client = self feature_sets.append(feature_set) return feature_sets def get_feature_set( self, name: str, version: int = None, fail_if_missing: bool = False) -> Union[FeatureSet, None]: """ Retrieve a single feature set from Feast Core :param name: (str) Name of feature set :param version: (int) Version of feature set :param fail_if_missing: (bool) Throws an exception if the feature set is not found :return: Returns a single feature set """ self._connect_core() try: get_feature_set_response = self._core_service_stub.GetFeatureSet( GetFeatureSetRequest( name=name.strip(), version=str(version))) # type: GetFeatureSetResponse feature_set = get_feature_set_response.feature_set except grpc.RpcError as e: print(format_grpc_exception("GetFeatureSet", e.code(), e.details())) else: if feature_set is not None: return FeatureSet.from_proto(feature_set) if fail_if_missing: raise Exception( f'Could not find feature set with name "{name}" and ' f'version "{version}"') def list_entities(self) -> Dict[str, Entity]: """ Returns a dictionary of entities across all feature sets :return: Dictionary of entity name to Entity """ entities_dict = OrderedDict() for fs in self.list_feature_sets(): for entity in fs.entities: entities_dict[entity.name] = entity return entities_dict def get_batch_features(self, feature_ids: List[str], entity_rows: pd.DataFrame) -> Job: """ Retrieves historical features from a Feast Serving deployment. Args: feature_ids: List of feature ids that will be returned for each entity. Each feature id should have the following format "feature_set_name:version:feature_name". entity_rows: Pandas dataframe containing entities and a 'datetime' column. 
Each entity in a feature set must be present as a column in this dataframe. The datetime column must contain timestamps in datetime64 format. Returns: Feast batch retrieval job: feast.job.Job Example usage: ============================================================ >>> from feast import Client >>> from datetime import datetime >>> import pandas as pd >>> >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566") >>> feature_ids = ["customer:1:bookings_7d"] >>> entity_rows = pd.DataFrame( >>> { >>> "datetime": [datetime.now() for _ in range(3)], >>> "customer": [1001, 1002, 1003], >>> } >>> ) >>> feature_retrieval_job = feast_client.get_batch_features(feature_ids, entity_rows) >>> df = feature_retrieval_job.to_dataframe() >>> print(df) """ self._connect_serving() try: fs_request = _build_feature_set_request(feature_ids) # Validate entity rows based on entities in Feast Core self._validate_entity_rows_for_batch_retrieval( entity_rows, fs_request) # We want the timestamp column naming to be consistent with the # rest of Feast entity_rows.columns = [ "event_timestamp" if col == "datetime" else col for col in entity_rows.columns ] # Remove timezone from datetime column if isinstance( entity_rows["event_timestamp"].dtype, pd.core.dtypes.dtypes.DatetimeTZDtype, ): entity_rows["event_timestamp"] = pd.DatetimeIndex( entity_rows["event_timestamp"]).tz_localize(None) # Retrieve serving information to determine store type and staging location serving_info = self._serving_service_stub.GetFeastServingInfo( GetFeastServingInfoRequest(), timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT ) # type: GetFeastServingInfoResponse if serving_info.type != FeastServingType.FEAST_SERVING_TYPE_BATCH: raise Exception( f'You are connected to a store "{self._serving_url}" which does not support batch retrieval' ) # Export and upload entity row dataframe to staging location provided by Feast staged_file = export_dataframe_to_staging_location( entity_rows, serving_info.job_staging_location) # type: str request = GetBatchFeaturesRequest( feature_sets=fs_request, dataset_source=DatasetSource( file_source=DatasetSource.FileSource( file_uris=[staged_file], data_format=DataFormat.DATA_FORMAT_AVRO)), ) # Retrieve Feast Job object to manage life cycle of retrieval response = self._serving_service_stub.GetBatchFeatures(request) return Job(response.job, self._serving_service_stub) except grpc.RpcError as e: print( format_grpc_exception("GetBatchFeatures", e.code(), e.details())) def _validate_entity_rows_for_batch_retrieval(self, entity_rows, feature_sets_request): """ Validate whether an entity_row dataframe contains the correct information for batch retrieval :param entity_rows: Pandas dataframe containing entities and datetime column. Each entity in a feature set must be present as a column in this dataframe.
:param feature_sets_request: Feature sets that will be requested """ # Ensure datetime column exists if "datetime" not in entity_rows.columns: raise ValueError( f'Entity rows do not contain "datetime" column in columns {entity_rows.columns}' ) # Validate dataframe columns based on feature set entities for feature_set in feature_sets_request: fs = self.get_feature_set(name=feature_set.name, version=feature_set.version) if fs is None: raise ValueError( f'Feature set "{feature_set.name}:{feature_set.version}" could not be found' ) for entity_type in fs.entities: if entity_type.name not in entity_rows.columns: raise ValueError( f'Dataframe does not contain entity "{entity_type.name}" column in columns "{entity_rows.columns}"' ) def get_online_features( self, feature_ids: List[str], entity_rows: List[GetOnlineFeaturesRequest.EntityRow], ) -> GetOnlineFeaturesResponse: """ Retrieves the latest online feature data from Feast Serving :param feature_ids: List of feature Ids in the following format [feature_set_name]:[version]:[feature_name] example: ["feature_set_1:6:my_feature_1", "feature_set_1:6:my_feature_2",] :param entity_rows: List of GetFeaturesRequest.EntityRow where each row contains entities. Timestamp should not be set for online retrieval. All entity types within a feature set must be provided for each entity key. :return: Returns a list of maps where each item in the list contains the latest feature values for the provided entities """ self._connect_serving() try: response = self._serving_service_stub.GetOnlineFeatures( GetOnlineFeaturesRequest( feature_sets=_build_feature_set_request(feature_ids), entity_rows=entity_rows, )) # type: GetOnlineFeaturesResponse except grpc.RpcError as e: print( format_grpc_exception("GetOnlineFeatures", e.code(), e.details())) else: return response def ingest( self, feature_set: Union[str, FeatureSet], dataframe: pd.DataFrame, version: int = None, force_update: bool = False, max_workers: int = CPU_COUNT, disable_progress_bar: bool = False, chunk_size: int = 5000, ): """ Loads data into Feast for a specific feature set. :param feature_set: (str, FeatureSet) Feature set object or the string name of the feature set (without a version) :param dataframe: Pandas dataframe to load into Feast for this feature set :param version: (int) Version of the feature set for which this ingestion should happen :param force_update: (bool) Automatically update feature set based on data frame before ingesting data :param max_workers: Number of worker processes to use to encode the dataframe :param disable_progress_bar: Disable progress bar during ingestion :param chunk_size: Number of rows per chunk to encode before ingesting to Feast """ if isinstance(feature_set, FeatureSet): name = feature_set.name if version is None: version = feature_set.version elif isinstance(feature_set, str): name = feature_set else: raise Exception("Feature set name must be provided") feature_set = self.get_feature_set(name, version, fail_if_missing=True) # Update the feature set based on dataframe schema if force_update: feature_set.infer_fields_from_df(dataframe, discard_unused_fields=True, replace_existing_features=True) self.apply(feature_set) if feature_set.source.source_type == "Kafka": ingest_kafka( feature_set=feature_set, dataframe=dataframe, max_workers=max_workers, disable_progress_bar=disable_progress_bar, chunk_size=chunk_size, ) else: raise Exception(f"Could not determine source type for feature set " f'"{feature_set.name}" with source type ' f'"{feature_set.source.source_type}"')
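# --- Ingestion sketch for the Client variant above (illustrative only) ---
# Assumes a feature set named "customer_transactions", already registered in
# Feast Core and backed by a Kafka source; entity and feature names are
# placeholders. With force_update=True, ingest() infers fields from the
# dataframe and re-registers the feature set before producing rows to Kafka.

from datetime import datetime

import pandas as pd

from feast import Client

client = Client(core_url="localhost:6565", serving_url="localhost:6566")
df = pd.DataFrame(
    {
        "datetime": [datetime.now()],
        "customer": [1001],
        "bookings_7d": [4],
    }
)
client.ingest("customer_transactions", df, force_update=True)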
class Client: """ Feast Client: Used for creating, managing, and retrieving features. """ def __init__(self, core_url: str = None, serving_url: str = None, verbose: bool = False): """ The Feast Client should be initialized with at least one service URL Args: core_url: Feast Core URL. Used to manage features serving_url: Feast Serving URL. Used to retrieve features verbose: Enable verbose logging """ self._core_url = core_url self._serving_url = serving_url self._verbose = verbose self.__core_channel: grpc.Channel = None self.__serving_channel: grpc.Channel = None self._core_service_stub: CoreServiceStub = None self._serving_service_stub: ServingServiceStub = None @property def core_url(self) -> str: """ Retrieve Feast Core URL """ if self._core_url is not None: return self._core_url if os.getenv(FEAST_CORE_URL_ENV_KEY) is not None: return os.getenv(FEAST_CORE_URL_ENV_KEY) return "" @core_url.setter def core_url(self, value: str): """ Set the Feast Core URL Args: value: Feast Core URL """ self._core_url = value @property def serving_url(self) -> str: """ Retrieve Feast Serving URL """ if self._serving_url is not None: return self._serving_url if os.getenv(FEAST_SERVING_URL_ENV_KEY) is not None: return os.getenv(FEAST_SERVING_URL_ENV_KEY) return "" @serving_url.setter def serving_url(self, value: str): """ Set the Feast Serving URL Args: value: Feast Serving URL """ self._serving_url = value def version(self): """ Returns version information from Feast Core and Feast Serving """ result = {} if self.serving_url: self._connect_serving() serving_version = self._serving_service_stub.GetFeastServingInfo( GetFeastServingInfoRequest(), timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT).version result["serving"] = { "url": self.serving_url, "version": serving_version } if self.core_url: self._connect_core() core_version = self._core_service_stub.GetFeastCoreVersion( GetFeastCoreVersionRequest(), timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT).version result["core"] = {"url": self.core_url, "version": core_version} return result def _connect_core(self, skip_if_connected: bool = True): """ Connect to Core API Args: skip_if_connected: Do not attempt to connect if already connected """ if skip_if_connected and self._core_service_stub: return if not self.core_url: raise ValueError("Please set Feast Core URL.") if self.__core_channel is None: self.__core_channel = grpc.insecure_channel(self.core_url) try: grpc.channel_ready_future(self.__core_channel).result( timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT) except grpc.FutureTimeoutError: raise ConnectionError( f"Connection timed out while attempting to connect to Feast " f"Core gRPC server {self.core_url} ") else: self._core_service_stub = CoreServiceStub(self.__core_channel) def _connect_serving(self, skip_if_connected=True): """ Connect to Serving API Args: skip_if_connected: Do not attempt to connect if already connected """ if skip_if_connected and self._serving_service_stub: return if not self.serving_url: raise ValueError("Please set Feast Serving URL.") if self.__serving_channel is None: self.__serving_channel = grpc.insecure_channel(self.serving_url) try: grpc.channel_ready_future(self.__serving_channel).result( timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT) except grpc.FutureTimeoutError: raise ConnectionError( f"Connection timed out while attempting to connect to Feast " f"Serving gRPC server {self.serving_url} ") else: self._serving_service_stub = ServingServiceStub( self.__serving_channel) def apply(self, feature_sets: Union[List[FeatureSet], FeatureSet]): """
Idempotently registers feature set(s) with Feast Core. Either a single feature set or a list can be provided. Args: feature_sets: List of feature sets that will be registered """ if not isinstance(feature_sets, list): feature_sets = [feature_sets] for feature_set in feature_sets: if isinstance(feature_set, FeatureSet): self._apply_feature_set(feature_set) continue raise ValueError( f"Could not determine feature set type to apply {feature_set}") def _apply_feature_set(self, feature_set: FeatureSet): """ Registers a single feature set with Feast Args: feature_set: Feature set that will be registered """ self._connect_core() feature_set._client = self feature_set.is_valid() # Convert the feature set to a request and send to Feast Core apply_fs_response = self._core_service_stub.ApplyFeatureSet( ApplyFeatureSetRequest(feature_set=feature_set.to_proto()), timeout=GRPC_CONNECTION_TIMEOUT_APPLY, ) # type: ApplyFeatureSetResponse # Extract the returned feature set applied_fs = FeatureSet.from_proto(apply_fs_response.feature_set) # If the feature set has changed, update the local copy if apply_fs_response.status == ApplyFeatureSetResponse.Status.CREATED: print( f'Feature set updated/created: "{applied_fs.name}:{applied_fs.version}"' ) # If no change has been applied, do nothing if apply_fs_response.status == ApplyFeatureSetResponse.Status.NO_CHANGE: print(f"No change detected or applied: {feature_set.name}") # Deep copy from the returned feature set to the local feature set feature_set.update_from_feature_set(applied_fs) def list_feature_sets(self) -> List[FeatureSet]: """ Retrieve a list of feature sets from Feast Core Returns: List of feature sets """ self._connect_core() # Get latest feature sets from Feast Core feature_set_protos = self._core_service_stub.ListFeatureSets( ListFeatureSetsRequest()) # type: ListFeatureSetsResponse # Extract feature sets and return feature_sets = [] for feature_set_proto in feature_set_protos.feature_sets: feature_set = FeatureSet.from_proto(feature_set_proto) feature_set._client = self feature_sets.append(feature_set) return feature_sets def get_feature_set(self, name: str, version: int = None) -> Union[FeatureSet, None]: """ Retrieves a feature set. If no version is specified then the latest version will be returned. Args: name: Name of feature set version: Version of feature set Returns: Returns either the specified feature set, or raises an exception if none is found """ self._connect_core() if version is None: version = 0 get_feature_set_response = self._core_service_stub.GetFeatureSet( GetFeatureSetRequest( name=name.strip(), version=int(version))) # type: GetFeatureSetResponse return FeatureSet.from_proto(get_feature_set_response.feature_set) def list_entities(self) -> Dict[str, Entity]: """ Returns a dictionary of entities across all feature sets Returns: Dictionary of entities, indexed by name """ entities_dict = OrderedDict() for fs in self.list_feature_sets(): for entity in fs.entities: entities_dict[entity.name] = entity return entities_dict def get_batch_features(self, feature_ids: List[str], entity_rows: pd.DataFrame) -> Job: """ Retrieves historical features from a Feast Serving deployment. Args: feature_ids: List of feature ids that will be returned for each entity. Each feature id should have the following format "feature_set_name:version:feature_name". entity_rows: Pandas dataframe containing entities and a 'datetime' column. Each entity in a feature set must be present as a column in this dataframe. 
The datetime column must contain timestamps in datetime64 format. Returns: Returns a job object that can be used to monitor retrieval progress asynchronously, and can be used to materialize the results Examples: >>> from feast import Client >>> from datetime import datetime >>> import pandas as pd >>> >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566") >>> feature_ids = ["customer:1:bookings_7d"] >>> entity_rows = pd.DataFrame( >>> { >>> "datetime": [datetime.now() for _ in range(3)], >>> "customer": [1001, 1002, 1003], >>> } >>> ) >>> feature_retrieval_job = feast_client.get_batch_features(feature_ids, entity_rows) >>> df = feature_retrieval_job.to_dataframe() >>> print(df) """ self._connect_serving() fs_request = _build_feature_set_request(feature_ids) # Validate entity rows based on entities in Feast Core self._validate_entity_rows_for_batch_retrieval(entity_rows, fs_request) # Remove timezone from datetime column if isinstance(entity_rows["datetime"].dtype, pd.core.dtypes.dtypes.DatetimeTZDtype): entity_rows["datetime"] = pd.DatetimeIndex( entity_rows["datetime"]).tz_localize(None) # Retrieve serving information to determine store type and # staging location serving_info = self._serving_service_stub.GetFeastServingInfo( GetFeastServingInfoRequest(), timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT ) # type: GetFeastServingInfoResponse if serving_info.type != FeastServingType.FEAST_SERVING_TYPE_BATCH: raise Exception( f'You are connected to a store "{self._serving_url}" which ' f"does not support batch retrieval ") # Export and upload entity row dataframe to staging location # provided by Feast staged_file = export_dataframe_to_staging_location( entity_rows, serving_info.job_staging_location) # type: str request = GetBatchFeaturesRequest( feature_sets=fs_request, dataset_source=DatasetSource(file_source=DatasetSource.FileSource( file_uris=[staged_file], data_format=DataFormat.DATA_FORMAT_AVRO)), ) # Retrieve Feast Job object to manage life cycle of retrieval response = self._serving_service_stub.GetBatchFeatures(request) return Job(response.job, self._serving_service_stub) def _validate_entity_rows_for_batch_retrieval(self, entity_rows, feature_sets_request): """ Validate whether an entity_row dataframe contains the correct information for batch retrieval Args: entity_rows: Pandas dataframe containing entities and datetime column. Each entity in a feature set must be present as a column in this dataframe.
feature_sets_request: Feature sets that will be requested """ # Ensure datetime column exists if "datetime" not in entity_rows.columns: raise ValueError( f'Entity rows do not contain "datetime" column in columns ' f"{entity_rows.columns}") # Validate dataframe columns based on feature set entities for feature_set in feature_sets_request: fs = self.get_feature_set(name=feature_set.name, version=feature_set.version) if fs is None: raise ValueError( f'Feature set "{feature_set.name}:{feature_set.version}" ' f"could not be found") for entity_type in fs.entities: if entity_type.name not in entity_rows.columns: raise ValueError( f'Dataframe does not contain entity "{entity_type.name}"' f' column in columns "{entity_rows.columns}"') def get_online_features( self, feature_ids: List[str], entity_rows: List[GetOnlineFeaturesRequest.EntityRow], ) -> GetOnlineFeaturesResponse: """ Retrieves the latest online feature data from Feast Serving Args: feature_ids: List of feature Ids in the following format [feature_set_name]:[version]:[feature_name] example: ["feature_set_1:6:my_feature_1", "feature_set_1:6:my_feature_2",] entity_rows: List of GetFeaturesRequest.EntityRow where each row contains entities. Timestamp should not be set for online retrieval. All entity types within a feature set must be provided for each entity key. Returns: Returns a list of maps where each item in the list contains the latest feature values for the provided entities """ self._connect_serving() return self._serving_service_stub.GetOnlineFeatures( GetOnlineFeaturesRequest( feature_sets=_build_feature_set_request(feature_ids), entity_rows=entity_rows, )) # type: GetOnlineFeaturesResponse def ingest( self, feature_set: Union[str, FeatureSet], source: Union[pd.DataFrame, str], version: int = None, force_update: bool = False, max_workers: int = CPU_COUNT, disable_progress_bar: bool = False, chunk_size: int = 5000, timeout: int = None, ): """ Loads feature data into Feast for a specific feature set. Args: feature_set: Name of feature set or a feature set object source: Either a file path or Pandas Dataframe to ingest into Feast Files that are currently supported: * parquet * csv * json version: Feature set version force_update: Automatically update feature set based on source data prior to ingesting. This will also register changes to Feast max_workers: Number of worker processes to use to encode values disable_progress_bar: Disable printing of progress statistics chunk_size: Maximum number of rows to load into memory and ingest at a time timeout: Seconds to wait before ingestion times out """ if isinstance(feature_set, FeatureSet): name = feature_set.name if version is None: version = feature_set.version elif isinstance(feature_set, str): name = feature_set else: raise Exception("Feature set name must be provided") table = _read_table_from_source(source) # Update the feature set based on DataFrame schema if force_update: # Use a small reference DataFrame to infer fields ref_df = table.to_batches(max_chunksize=20)[0].to_pandas() feature_set.infer_fields_from_df(ref_df, discard_unused_fields=True, replace_existing_features=True) self.apply(feature_set) feature_set = self.get_feature_set(name, version) if feature_set.source.source_type == "Kafka": ingest_table_to_kafka( feature_set=feature_set, table=table, max_workers=max_workers, disable_pbar=disable_progress_bar, chunk_size=chunk_size, timeout=timeout, ) else: raise Exception(f"Could not determine source type for feature set " f'"{feature_set.name}" with source type ' f'"{feature_set.source.source_type}"')
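# --- File-based ingestion sketch for the Client variant above (illustrative) ---
# This variant's ingest() also accepts a file path as the source; the parquet
# path and feature set name are hypothetical. The file is read into an Arrow
# table and produced to the feature set's Kafka source in chunks.

from feast import Client

client = Client(core_url="localhost:6565")
fs = client.get_feature_set("customer_transactions", version=1)
client.ingest(fs, "data/customer_transactions.parquet", chunk_size=5000)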