def test_restarting_failed_jobs(feature_table):
    """Verify that a job-service client configured with retry_failed_jobs
    restarts a stream ingestion job for the expected feature table."""
    feast_client = FeastClient(
        job_service_pause_between_jobs=0,
        job_service_retry_failed_jobs=True,
        options={"whitelisted_projects": "default,ride"},
    )
    feast_client.list_projects = Mock(return_value=["default"])
    feast_client.list_feature_tables = Mock()

    # No jobs are currently running, but one table expects ingestion.
    spark_client = Client(feast_client)
    spark_client.list_jobs = Mock(return_value=[])
    spark_client.start_stream_to_online_ingestion = Mock()
    spark_client.feature_store.list_feature_tables.return_value = [feature_table]

    ensure_stream_ingestion_jobs(spark_client, all_projects=True)

    spark_client.list_jobs.assert_called_once_with(include_terminated=False)
    spark_client.start_stream_to_online_ingestion.assert_called_once_with(
        feature_table, [], project="default")
def feast_client():
    """Fixture: a Feast client with whitelisted projects and mocked listing calls."""
    client = FeastClient(
        job_service_pause_between_jobs=0,
        options={"whitelisted_projects": "default,ride"},
    )
    client.list_projects = Mock(return_value=["default", "ride", "invalid_project"])
    client.list_feature_tables = Mock()
    yield client
def ensure_stream_ingestion_jobs(client: feast.Client, all_projects: bool):
    """Ensures all required stream ingestion jobs are running and cleans up the unnecessary jobs.

    More concretely, it will determine
    - which stream ingestion jobs are running
    - which stream ingestion jobs should be running

    And it'll do 2 kinds of operations
    - Cancel all running jobs that should not be running
    - Start all non-existent jobs that should be running

    Args:
        client: Feast client used to list projects/jobs and start/cancel jobs.
        all_projects (bool): If true, runs the check for all project.
                             Otherwise only checks the client's current project.
    """
    projects = client.list_projects() if all_projects else [client.project]
    expected_job_hash_to_table_refs = _get_expected_job_hash_to_table_refs(
        client, projects)

    expected_job_hashes = set(expected_job_hash_to_table_refs.keys())

    # Index the currently-running stream ingestion jobs by their hash.
    jobs_by_hash: Dict[str, StreamIngestionJob] = {}
    for job in client.list_jobs(include_terminated=False):
        if isinstance(job, StreamIngestionJob):
            jobs_by_hash[job.get_hash()] = job

    existing_job_hashes = set(jobs_by_hash.keys())

    job_hashes_to_cancel = existing_job_hashes - expected_job_hashes
    job_hashes_to_start = expected_job_hashes - existing_job_hashes

    # Fix: sorted() accepts any iterable directly; the wrapping list() was redundant (C414).
    logging.debug(
        f"existing_job_hashes = {sorted(existing_job_hashes)} expected_job_hashes = {sorted(expected_job_hashes)}"
    )

    for job_hash in job_hashes_to_cancel:
        job = jobs_by_hash[job_hash]
        logging.info(
            f"Cancelling a stream ingestion job with job_hash={job_hash} job_id={job.get_id()} status={job.get_status()}"
        )
        try:
            job.cancel()
        except FailedPrecondition as exc:
            # Cancellation is best-effort: the job may already be in a terminal state.
            logging.warning(f"Job canceling failed with exception {exc}")

    for job_hash in job_hashes_to_start:
        # Any job that we wish to start should be among expected table refs map
        project, table_name = expected_job_hash_to_table_refs[job_hash]
        logging.info(
            f"Starting a stream ingestion job for project={project}, table_name={table_name} with job_hash={job_hash}"
        )
        feature_table = client.get_feature_table(name=table_name, project=project)
        client.start_stream_to_online_ingestion(feature_table, [], project=project)
def feast_client():
    """Fixture: a minimal Feast client (single "default" project) with mocked listing calls."""
    client = FeastClient(job_service_pause_between_jobs=0)
    client.list_projects = Mock(return_value=["default"])
    client.list_feature_tables = Mock()
    yield client
class FeastExtractor(Extractor):
    """
    Extracts feature tables from Feast Core service. Since Feast is
    a metadata store (and not the database itself), it maps the
    following attributes:

     * a database is name of feast project
     * table name is a name of the feature table
     * columns are features stored in the feature table
    """

    # Configuration keys accepted by init(); missing values fall back to DEFAULT_CONFIG.
    FEAST_SERVICE_CONFIG_KEY = "instance_name"
    FEAST_ENDPOINT_CONFIG_KEY = "endpoint"
    DESCRIBE_FEATURE_TABLES = "describe_feature_tables"
    DEFAULT_CONFIG = ConfigFactory.from_dict({
        FEAST_SERVICE_CONFIG_KEY: "main",
        DESCRIBE_FEATURE_TABLES: True
    })

    def init(self, conf: ConfigTree) -> None:
        """Read configuration and build the Feast client.

        Args:
            conf: extractor configuration; `endpoint` is required, other
                keys fall back to DEFAULT_CONFIG.
        """
        conf = conf.with_fallback(FeastExtractor.DEFAULT_CONFIG)
        self._feast_service = conf.get_string(
            FeastExtractor.FEAST_SERVICE_CONFIG_KEY)
        self._describe_feature_tables = conf.get_bool(
            FeastExtractor.DESCRIBE_FEATURE_TABLES)
        self._client = Client(
            core_url=conf.get_string(FeastExtractor.FEAST_ENDPOINT_CONFIG_KEY))
        # Lazily created by extract() on first call.
        self._extract_iter: Union[None, Iterator] = None

    def get_scope(self) -> str:
        """Return the config scope under which this extractor is configured."""
        return "extractor.feast"

    def extract(self) -> Union[TableMetadata, None]:
        """
        For every feature table from Feast, multiple objects are extracted:

        1. TableMetadata with feature table description
        2. Programmatic Description of the feature table, containing
           metadata - date of creation and labels
        3. Programmatic Description with Batch Source specification
        4. (if applicable) Programmatic Description with Stream Source
           specification

        Returns one extracted record per call, or None when exhausted.
        """
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """Iterate every feature table of every Feast project."""
        for project in self._client.list_projects():
            for feature_table in self._client.list_feature_tables(
                    project=project):
                yield from self._extract_feature_table(project, feature_table)

    def _extract_feature_table(
            self, project: str,
            feature_table: FeatureTable) -> Iterator[TableMetadata]:
        """Yield TableMetadata records for a single feature table.

        Entities come first as columns (with their Feast descriptions),
        followed by features; optional programmatic descriptions are
        appended when `describe_feature_tables` is enabled.
        """
        columns = []
        # Entities occupy the leading column indices.
        for index, entity_name in enumerate(feature_table.entities):
            entity = self._client.get_entity(entity_name, project=project)
            columns.append(
                ColumnMetadata(entity.name, entity.description,
                               entity.value_type, index))
        # Features follow, offset by the number of entities.
        for index, feature in enumerate(feature_table.features):
            columns.append(
                ColumnMetadata(
                    feature.name,
                    None,
                    feature.dtype.name,
                    len(feature_table.entities) + index,
                ))
        yield TableMetadata(
            "feast",
            self._feast_service,
            project,
            feature_table.name,
            None,
            columns,
        )
        if self._describe_feature_tables:
            # NOTE(review): assumes created_timestamp.seconds is a UTC epoch — confirm against Feast proto.
            created_at = datetime.utcfromtimestamp(
                feature_table.created_timestamp.seconds)
            description = f"* Created at **{created_at}**\n"
            if feature_table.labels:
                description += "* Labels:\n"
                for key, value in feature_table.labels.items():
                    description += f"    * {key}: **{value}**\n"
            yield TableMetadata(
                "feast",
                self._feast_service,
                project,
                feature_table.name,
                description,
                description_source="feature_table_details",
            )
            # Batch source spec rendered as a YAML code block.
            yield TableMetadata(
                "feast",
                self._feast_service,
                project,
                feature_table.name,
                f'```\n{yaml.dump(feature_table.to_dict()["spec"]["batchSource"])}```',
                description_source="batch_source",
            )
            if feature_table.stream_source:
                # Stream source spec, only when the table has one.
                yield TableMetadata(
                    "feast",
                    self._feast_service,
                    project,
                    feature_table.name,
                    f'```\n{yaml.dump(feature_table.to_dict()["spec"]["streamSource"])}```',
                    description_source="stream_source",
                )