def get_latest_timestamps():
    """Print the latest batch-source event timestamp for every feature view.

    Opens the feature store in the current working directory and reports, per
    feature view, the newest event timestamp recorded on its batch source.
    """
    feast_store = FeatureStore(repo_path=".")
    for view in feast_store.list_feature_views():
        # NOTE(review): reads the private `_meta` attribute of the batch
        # source — presumably populated by feast after materialization;
        # confirm it is set before relying on this output.
        latest = view.batch_source._meta.latest_event_timestamp
        print(f"Data source latest event for {view.name} is {latest}")
class FeastRepositorySource(Source):
    """
    This plugin extracts:

    - Entities as [`MLPrimaryKey`](https://datahubproject.io/docs/graphql/objects#mlprimarykey)
    - Features as [`MLFeature`](https://datahubproject.io/docs/graphql/objects#mlfeature)
    - Feature views and on-demand feature views as [`MLFeatureTable`](https://datahubproject.io/docs/graphql/objects#mlfeaturetable)
    - Batch and stream source details as [`Dataset`](https://datahubproject.io/docs/graphql/objects#dataset)
    - Column types associated with each entity and feature
    """

    # User-supplied configuration (repository path, environment, ...).
    source_config: FeastRepositorySourceConfig
    # Accumulates emitted work units and mapping warnings for this run.
    report: SourceReport
    # Feast SDK handle opened on the configured repository path.
    feature_store: FeatureStore

    def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
        super().__init__(ctx)
        self.source_config = config
        self.report = SourceReport()
        # Opening the store reads the Feast registry at the configured path.
        self.feature_store = FeatureStore(self.source_config.path)

    def _get_field_type(self, field_type: ValueType, parent_name: str) -> str:
        """
        Maps types encountered in Feast to corresponding schema types.

        Unmapped types are reported as a warning (keyed by ``parent_name``)
        and fall back to ``MLFeatureDataType.UNKNOWN``.
        """
        ml_feature_data_type = _field_type_mapping.get(field_type)

        if ml_feature_data_type is None:
            self.report.report_warning(
                parent_name, f"unable to map type {field_type} to metadata schema"
            )
            ml_feature_data_type = MLFeatureDataType.UNKNOWN

        return ml_feature_data_type

    def _get_data_source_details(self, source: DataSource) -> Tuple[str, str]:
        """
        Get Feast batch/stream source platform and name.

        Returns ``("unknown", "unknown")`` for source types not handled below.
        """
        platform = "unknown"
        name = "unknown"

        if isinstance(source, FileSource):
            platform = "file"
            # Normalize the file URI/path into a dotted dataset name.
            name = source.path.replace("://", ".").replace("/", ".")

        if isinstance(source, BigQuerySource):
            platform = "bigquery"
            name = source.table

        if isinstance(source, KafkaSource):
            platform = "kafka"
            name = source.kafka_options.topic

        if isinstance(source, KinesisSource):
            platform = "kinesis"
            name = (
                f"{source.kinesis_options.region}:{source.kinesis_options.stream_name}"
            )

        if isinstance(source, RequestDataSource):
            platform = "request"
            name = source.name

        return platform, name

    def _get_data_sources(self, feature_view: FeatureView) -> List[str]:
        """
        Get data source URN list.

        Emits one dataset URN for the batch source and one for the stream
        source, when each is present on the feature view.
        """
        sources = []

        if feature_view.batch_source is not None:
            batch_source_platform, batch_source_name = self._get_data_source_details(
                feature_view.batch_source
            )
            sources.append(
                builder.make_dataset_urn(
                    batch_source_platform,
                    batch_source_name,
                    self.source_config.environment,
                )
            )

        if feature_view.stream_source is not None:
            stream_source_platform, stream_source_name = self._get_data_source_details(
                feature_view.stream_source
            )
            sources.append(
                builder.make_dataset_urn(
                    stream_source_platform,
                    stream_source_name,
                    self.source_config.environment,
                )
            )

        return sources

    def _get_entity_workunit(
        self, feature_view: FeatureView, entity: Entity
    ) -> MetadataWorkUnit:
        """
        Generate an MLPrimaryKey work unit for a Feast entity.
        """
        # Names are namespaced by the Feast project to keep URNs unique.
        feature_view_name = f"{self.feature_store.project}.{feature_view.name}"

        entity_snapshot = MLPrimaryKeySnapshot(
            urn=builder.make_ml_primary_key_urn(feature_view_name, entity.name),
            aspects=[StatusClass(removed=False)],
        )

        entity_snapshot.aspects.append(
            MLPrimaryKeyPropertiesClass(
                description=entity.description,
                dataType=self._get_field_type(entity.value_type, entity.name),
                sources=self._get_data_sources(feature_view),
            )
        )

        mce = MetadataChangeEvent(proposedSnapshot=entity_snapshot)

        return MetadataWorkUnit(id=entity.name, mce=mce)

    def _get_feature_workunit(
        self,
        feature_view: Union[FeatureView, OnDemandFeatureView],
        feature: Feature,
    ) -> MetadataWorkUnit:
        """
        Generate an MLFeature work unit for a Feast feature.

        Regular feature views take their sources from batch/stream sources;
        on-demand feature views take theirs from request data sources plus
        the batch/stream sources of their input feature views.
        """
        feature_view_name = f"{self.feature_store.project}.{feature_view.name}"

        feature_snapshot = MLFeatureSnapshot(
            urn=builder.make_ml_feature_urn(feature_view_name, feature.name),
            aspects=[StatusClass(removed=False)],
        )

        feature_sources = []

        if isinstance(feature_view, FeatureView):
            feature_sources = self._get_data_sources(feature_view)
        elif isinstance(feature_view, OnDemandFeatureView):
            # Request sources feed the on-demand view directly at serving time.
            if feature_view.input_request_data_sources is not None:
                for request_source in feature_view.input_request_data_sources.values():
                    source_platform, source_name = self._get_data_source_details(
                        request_source
                    )

                    feature_sources.append(
                        builder.make_dataset_urn(
                            source_platform,
                            source_name,
                            self.source_config.environment,
                        )
                    )

            # Input feature views contribute their own (batch/stream) sources;
            # each projection is resolved back to its full feature view.
            if feature_view.input_feature_view_projections is not None:
                for (
                    feature_view_projection
                ) in feature_view.input_feature_view_projections.values():
                    feature_view_source = self.feature_store.get_feature_view(
                        feature_view_projection.name
                    )

                    feature_sources.extend(self._get_data_sources(feature_view_source))

        feature_snapshot.aspects.append(
            MLFeaturePropertiesClass(
                # NOTE(review): description is read from a "description" label
                # on the feature — presumably a repo convention; confirm.
                description=feature.labels.get("description"),
                dataType=self._get_field_type(feature.dtype, feature.name),
                sources=feature_sources,
            )
        )

        mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)

        return MetadataWorkUnit(id=feature.name, mce=mce)

    def _get_feature_view_workunit(self, feature_view: FeatureView) -> MetadataWorkUnit:
        """
        Generate an MLFeatureTable work unit for a Feast feature view.
        """
        feature_view_name = f"{self.feature_store.project}.{feature_view.name}"

        feature_view_snapshot = MLFeatureTableSnapshot(
            urn=builder.make_ml_feature_table_urn("feast", feature_view_name),
            aspects=[
                BrowsePathsClass(
                    paths=[f"/feast/{self.feature_store.project}/{feature_view_name}"]
                ),
                StatusClass(removed=False),
            ],
        )

        feature_view_snapshot.aspects.append(
            MLFeatureTablePropertiesClass(
                mlFeatures=[
                    builder.make_ml_feature_urn(
                        feature_view_name,
                        feature.name,
                    )
                    for feature in feature_view.features
                ],
                mlPrimaryKeys=[
                    builder.make_ml_primary_key_urn(feature_view_name, entity_name)
                    for entity_name in feature_view.entities
                ],
            )
        )

        mce = MetadataChangeEvent(proposedSnapshot=feature_view_snapshot)

        return MetadataWorkUnit(id=feature_view_name, mce=mce)

    def _get_on_demand_feature_view_workunit(
        self, on_demand_feature_view: OnDemandFeatureView
    ) -> MetadataWorkUnit:
        """
        Generate an MLFeatureTable work unit for a Feast on-demand feature view.

        Same shape as `_get_feature_view_workunit`, but on-demand views have
        no entities, so `mlPrimaryKeys` is always empty.
        """
        on_demand_feature_view_name = (
            f"{self.feature_store.project}.{on_demand_feature_view.name}"
        )

        on_demand_feature_view_snapshot = MLFeatureTableSnapshot(
            urn=builder.make_ml_feature_table_urn("feast", on_demand_feature_view_name),
            aspects=[
                BrowsePathsClass(
                    paths=[
                        f"/feast/{self.feature_store.project}/{on_demand_feature_view_name}"
                    ]
                ),
                StatusClass(removed=False),
            ],
        )

        on_demand_feature_view_snapshot.aspects.append(
            MLFeatureTablePropertiesClass(
                mlFeatures=[
                    builder.make_ml_feature_urn(
                        on_demand_feature_view_name,
                        feature.name,
                    )
                    for feature in on_demand_feature_view.features
                ],
                mlPrimaryKeys=[],
            )
        )

        mce = MetadataChangeEvent(proposedSnapshot=on_demand_feature_view_snapshot)

        return MetadataWorkUnit(id=on_demand_feature_view_name, mce=mce)

    @classmethod
    def create(cls, config_dict, ctx):
        # Standard DataHub source factory: parse raw config dict, build source.
        config = FeastRepositorySourceConfig.parse_obj(config_dict)
        return cls(config, ctx)

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        """Yield entity, feature, and feature-table work units, reporting each."""
        for feature_view in self.feature_store.list_feature_views():
            for entity_name in feature_view.entities:
                entity = self.feature_store.get_entity(entity_name)

                work_unit = self._get_entity_workunit(feature_view, entity)
                self.report.report_workunit(work_unit)

                yield work_unit

            for feature in feature_view.features:
                work_unit = self._get_feature_workunit(feature_view, feature)
                self.report.report_workunit(work_unit)

                yield work_unit

            work_unit = self._get_feature_view_workunit(feature_view)
            self.report.report_workunit(work_unit)

            yield work_unit

        for on_demand_feature_view in self.feature_store.list_on_demand_feature_views():
            for feature in on_demand_feature_view.features:
                work_unit = self._get_feature_workunit(on_demand_feature_view, feature)
                self.report.report_workunit(work_unit)

                yield work_unit

            work_unit = self._get_on_demand_feature_view_workunit(
                on_demand_feature_view
            )
            self.report.report_workunit(work_unit)

            yield work_unit

    def get_report(self) -> SourceReport:
        return self.report

    def close(self) -> None:
        # No resources to release; FeatureStore holds no open connections here.
        return
def test_universal_cli(test_repo_config) -> None:
    """End-to-end check of the feast CLI against a freshly generated repo.

    Applies an example repo, verifies list/describe commands succeed for
    existing objects and fail for missing ones, checks apply is idempotent,
    and confirms the registry is unchanged afterwards.
    """
    project = f"test_universal_cli_{str(uuid.uuid4()).replace('-', '')[:8]}"
    runner = CliRunner()

    with tempfile.TemporaryDirectory() as repo_dir_name:
        # Hoisted above the try so the finally-teardown can always reference it.
        repo_path = Path(repo_dir_name)
        try:
            feature_store_yaml = make_feature_store_yaml(
                project, test_repo_config, repo_dir_name
            )
            repo_config = repo_path / "feature_store.yaml"
            repo_config.write_text(dedent(feature_store_yaml))

            repo_example = repo_path / "example.py"
            repo_example.write_text(get_example_repo("example_feature_repo_1.py"))
            result = runner.run(["apply"], cwd=repo_path)
            assertpy.assert_that(result.returncode).is_equal_to(0)

            # Store registry contents, to be compared later.
            fs = FeatureStore(repo_path=str(repo_path))
            registry_dict = fs.registry.to_dict(project=project)

            # entity & feature view list commands should succeed
            result = runner.run(["entities", "list"], cwd=repo_path)
            assertpy.assert_that(result.returncode).is_equal_to(0)
            result = runner.run(["feature-views", "list"], cwd=repo_path)
            assertpy.assert_that(result.returncode).is_equal_to(0)
            result = runner.run(["feature-services", "list"], cwd=repo_path)
            assertpy.assert_that(result.returncode).is_equal_to(0)

            # entity & feature view describe commands should succeed when objects exist
            result = runner.run(["entities", "describe", "driver"], cwd=repo_path)
            assertpy.assert_that(result.returncode).is_equal_to(0)
            result = runner.run(
                ["feature-views", "describe", "driver_locations"], cwd=repo_path
            )
            assertpy.assert_that(result.returncode).is_equal_to(0)
            result = runner.run(
                ["feature-services", "describe", "driver_locations_service"],
                cwd=repo_path,
            )
            assertpy.assert_that(result.returncode).is_equal_to(0)
            assertpy.assert_that(fs.list_feature_views()).is_length(3)

            # entity & feature view describe commands should fail when objects don't exist
            result = runner.run(["entities", "describe", "foo"], cwd=repo_path)
            assertpy.assert_that(result.returncode).is_equal_to(1)
            result = runner.run(["feature-views", "describe", "foo"], cwd=repo_path)
            assertpy.assert_that(result.returncode).is_equal_to(1)
            result = runner.run(["feature-services", "describe", "foo"], cwd=repo_path)
            assertpy.assert_that(result.returncode).is_equal_to(1)

            # Doing another apply should be a no op, and should not cause errors
            result = runner.run(["apply"], cwd=repo_path)
            assertpy.assert_that(result.returncode).is_equal_to(0)

            basic_rw_test(
                FeatureStore(repo_path=str(repo_path), config=None),
                view_name="driver_locations",
            )

            # Confirm that registry contents have not changed.
            assertpy.assert_that(registry_dict).is_equal_to(
                fs.registry.to_dict(project=project)
            )

            result = runner.run(["teardown"], cwd=repo_path)
            assertpy.assert_that(result.returncode).is_equal_to(0)
        finally:
            # Fix: always tear down provisioned resources, even when an
            # assertion above fails (previously teardown was skipped on
            # failure, leaking infrastructure between runs).
            runner.run(["teardown"], cwd=repo_path)
def test_universal_cli(environment: Environment):
    """End-to-end check of the feast CLI against a freshly generated repo.

    Applies an example repo, verifies list/describe commands succeed for
    existing objects and fail for missing ones, checks that apply is
    idempotent, and confirms the registry specs are unchanged afterwards.
    Teardown always runs, even when an assertion fails.
    """
    project = f"test_universal_cli_{str(uuid.uuid4()).replace('-', '')[:8]}"
    runner = CliRunner()

    with tempfile.TemporaryDirectory() as repo_dir_name:
        try:
            repo_path = Path(repo_dir_name)

            def run_cli(args, expected_code):
                # Run one feast CLI command in the repo and check its exit code.
                completed = runner.run(args, cwd=repo_path)
                assertpy.assert_that(completed.returncode).is_equal_to(expected_code)

            def specs_of(registry_dump):
                # Keep only each object's spec, dropping volatile metadata.
                return {
                    key: [fco["spec"] if "spec" in fco else fco for fco in value]
                    for key, value in registry_dump.items()
                }

            store_yaml = make_feature_store_yaml(
                project, environment.test_repo_config, repo_path
            )
            (repo_path / "feature_store.yaml").write_text(dedent(store_yaml))
            (repo_path / "example.py").write_text(
                get_example_repo("example_feature_repo_1.py")
            )

            run_cli(["apply"], 0)

            # Snapshot the registry specs now, to be compared at the end.
            fs = FeatureStore(repo_path=str(repo_path))
            registry_specs = specs_of(fs.registry.to_dict(project=project))

            # Listing every object kind should succeed.
            for kind in ("entities", "feature-views", "feature-services", "data-sources"):
                run_cli([kind, "list"], 0)

            # Describe should succeed for objects that exist.
            run_cli(["entities", "describe", "driver"], 0)
            run_cli(["feature-views", "describe", "driver_locations"], 0)
            run_cli(["feature-services", "describe", "driver_locations_service"], 0)
            assertpy.assert_that(fs.list_feature_views()).is_length(4)
            run_cli(["data-sources", "describe", "customer_profile_source"], 0)
            assertpy.assert_that(fs.list_data_sources()).is_length(4)

            # Describe should fail for objects that don't exist.
            for kind in ("entities", "feature-views", "feature-services", "data-sources"):
                run_cli([kind, "describe", "foo"], 1)

            # A second apply must be a no-op and must not error.
            run_cli(["apply"], 0)

            basic_rw_test(
                FeatureStore(repo_path=str(repo_path), config=None),
                view_name="driver_locations",
            )

            # Confirm that registry specs have not changed.
            assertpy.assert_that(registry_specs).is_equal_to(
                specs_of(fs.registry.to_dict(project=project))
            )

            run_cli(["teardown"], 0)
        finally:
            runner.run(["teardown"], cwd=repo_path)
class FeastExtractor(Extractor):
    """
    Extracts feature tables from a Feast feature store repository.

    Since Feast is a metadata store (and not the database itself), it maps the
    following attributes:

    * a database is the name of the Feast project
    * a table name is the name of the feature view
    * columns are the entities and features stored in the feature view
    """

    # Configuration keys; defaults are supplied via DEFAULT_CONFIG below.
    FEAST_REPOSITORY_PATH = "/path/to/repository"
    DESCRIBE_FEATURE_VIEWS = "describe_feature_views"
    DEFAULT_CONFIG = ConfigFactory.from_dict(
        {FEAST_REPOSITORY_PATH: ".", DESCRIBE_FEATURE_VIEWS: True}
    )

    def init(self, conf: ConfigTree) -> None:
        """Read configuration and open the Feast feature store."""
        conf = conf.with_fallback(FeastExtractor.DEFAULT_CONFIG)
        self._feast_repository_path = conf.get_string(
            FeastExtractor.FEAST_REPOSITORY_PATH
        )
        self._describe_feature_views = conf.get_bool(
            FeastExtractor.DESCRIBE_FEATURE_VIEWS
        )
        self._feast = FeatureStore(repo_path=self._feast_repository_path)
        # Lazily created in extract(); yields one TableMetadata per call.
        self._extract_iter: Union[None, Iterator] = None

    def get_scope(self) -> str:
        return "extractor.feast"

    def extract(self) -> Union[TableMetadata, None]:
        """
        Return the next extracted record, or None when exhausted.

        For every feature view from Feast, multiple objects are extracted:

        1. TableMetadata with the feature view's columns
        2. Programmatic description of the feature view, containing
           metadata - date of creation and tags
        3. Programmatic description with the batch source specification
        4. (if applicable) programmatic description with the stream source
           specification
        """
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            # Contract of Extractor.extract: signal completion with None.
            return None

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """Iterate every feature view in the store, yielding its records."""
        for feature_view in self._feast.list_feature_views():
            yield from self._extract_feature_view(feature_view)

    def _extract_feature_view(
        self, feature_view: FeatureView
    ) -> Iterator[TableMetadata]:
        """Yield TableMetadata records for one feature view.

        Entities come first as columns, followed by features; optional
        programmatic descriptions follow when describe_feature_views is on.
        """
        columns = []
        # Entities first: their ordinal positions precede all features.
        for index, entity_name in enumerate(feature_view.entities):
            entity = self._feast.get_entity(entity_name)
            columns.append(
                ColumnMetadata(
                    entity.name, entity.description, entity.value_type.name, index
                )
            )
        for index, feature in enumerate(feature_view.features):
            columns.append(
                ColumnMetadata(
                    feature.name,
                    None,
                    feature.dtype.name,
                    len(feature_view.entities) + index,
                )
            )

        yield TableMetadata(
            "feast",
            self._feast.config.provider,
            self._feast.project,
            feature_view.name,
            None,
            columns,
        )

        if self._describe_feature_views:
            description = ""  # was str(); empty literal is the idiomatic form
            if feature_view.created_timestamp:
                # NOTE(review): utcfromtimestamp is deprecated in newer Python;
                # kept to preserve the existing naive-UTC rendering.
                created_at = datetime.utcfromtimestamp(
                    feature_view.created_timestamp.timestamp()
                )
                description = f"* Created at **{created_at}**\n"

            if feature_view.tags:
                description += "* Tags:\n"
                for key, value in feature_view.tags.items():
                    description += f"    * {key}: **{value}**\n"

            yield TableMetadata(
                "feast",
                self._feast.config.provider,
                self._feast.project,
                feature_view.name,
                description,
                description_source="feature_view_details",
            )

            yield TableMetadata(
                "feast",
                self._feast.config.provider,
                self._feast.project,
                feature_view.name,
                f"```\n{str(feature_view.batch_source.to_proto())}```",
                description_source="batch_source",
            )

            if feature_view.stream_source:
                yield TableMetadata(
                    "feast",
                    self._feast.config.provider,
                    self._feast.project,
                    feature_view.name,
                    f"```\n{str(feature_view.stream_source.to_proto())}```",
                    description_source="stream_source",
                )