def cli(core_url, output_path):
    """Dump all Feast feature tables as a list of JSON-serializable dicts.

    Connects to Feast Core at ``core_url``, resolves each feature table's
    entities, features, and batch/stream source metadata, and either writes
    the result as JSON to ``output_path`` or prints the Python repr to
    stdout when ``output_path`` is None.
    """
    client = Client(core_url=core_url)

    tables = client.list_feature_tables()

    # sort tables by name for consistent outputs
    tables = sorted(tables, key=lambda x: x.name)

    parsed_tables = []

    for table in tables:
        batch_source = None
        stream_source = None
        # platform and name for constructing URN later on
        batch_source_platform = "unknown"
        stream_source_platform = "unknown"
        batch_source_name = "unknown"
        stream_source_name = "unknown"

        # a batch source is exactly one concrete type, so elif is safe here
        if isinstance(table.batch_source, BigQuerySource):
            batch_source = "BigQuerySource"
            batch_source_platform = "bigquery"
            batch_source_name = table.batch_source.bigquery_options.table_ref
        elif isinstance(table.batch_source, FileSource):
            batch_source = "FileSource"
            batch_source_platform = "file"

            # replace slashes because the react frontend can't parse them correctly
            batch_source_name = table.batch_source.file_options.file_url.replace(
                "/", "."
            )

            # replace redundant file prefix ("file://" after slash replacement)
            if batch_source_name.startswith("file:.."):
                batch_source_name = batch_source_name[7:]

        if isinstance(table.stream_source, KafkaSource):
            stream_source = "KafkaSource"
            stream_source_platform = "kafka"
            stream_source_name = table.stream_source.kafka_options.topic
        elif isinstance(table.stream_source, KinesisSource):
            stream_source = "KinesisSource"
            stream_source_platform = "kinesis"
            stream_source_name = f"{table.stream_source.kinesis_options.region}-{table.stream_source.kinesis_options.stream_name}"

        # hoisted: to_dict() serializes the whole table spec; call it once
        table_spec = table.to_dict()["spec"]
        # currently unused in MCE outputs, but useful for debugging
        stream_source_config = table_spec.get("streamSource")
        batch_source_config = table_spec["batchSource"]

        raw_entities = [
            client.get_entity(entity_name) for entity_name in table.entities
        ]
        raw_entities = sorted(raw_entities, key=lambda x: x.name)

        source_info = {
            "batch_source": batch_source,
            "stream_source": stream_source,
            "batch_source_config": batch_source_config,
            "stream_source_config": stream_source_config,
            "batch_source_platform": batch_source_platform,
            "stream_source_platform": stream_source_platform,
            "batch_source_name": batch_source_name,
            "stream_source_name": stream_source_name,
        }

        # sort entities by name for consistent outputs
        entities = sorted(
            [
                {
                    "name": x.name,
                    "type": x.value_type.name,
                    "description": x.description,
                    **source_info,
                }
                for x in raw_entities
            ],
            key=lambda x: x["name"],
        )

        # sort features by name for consistent outputs
        features = sorted(
            [
                {"name": x.name, "type": x.dtype.name, **source_info}
                for x in table.features
            ],
            key=lambda x: x["name"],
        )

        parsed_tables.append(
            {
                "name": table.name,
                "entities": entities,
                "features": features,
            }
        )

    if output_path is not None:
        with open(output_path, "w") as f:
            json.dump(parsed_tables, f)
    else:
        # NOTE: prints the Python repr, not JSON — preserved as-is
        print(parsed_tables)
class FeastExtractor(Extractor):
    """
    Extracts feature tables from Feast Core service. Since Feast is
    a metadata store (and not the database itself), it maps the following
    attributes:

     * a database is the name of a feast project
     * a table name is the name of the feature table
     * columns are features stored in the feature table
    """

    # Config key: logical name of the Feast instance (becomes the cluster part
    # of the emitted TableMetadata).
    FEAST_SERVICE_CONFIG_KEY = "instance_name"
    # Config key: Feast Core gRPC endpoint URL.
    FEAST_ENDPOINT_CONFIG_KEY = "endpoint"
    # Config key: whether to also emit programmatic descriptions per table.
    DESCRIBE_FEATURE_TABLES = "describe_feature_tables"
    DEFAULT_CONFIG = ConfigFactory.from_dict(
        {FEAST_SERVICE_CONFIG_KEY: "main", DESCRIBE_FEATURE_TABLES: True}
    )

    def init(self, conf: ConfigTree) -> None:
        """Initialize the extractor from config; connects a Feast client."""
        conf = conf.with_fallback(FeastExtractor.DEFAULT_CONFIG)
        self._feast_service = conf.get_string(FeastExtractor.FEAST_SERVICE_CONFIG_KEY)
        self._describe_feature_tables = conf.get_bool(
            FeastExtractor.DESCRIBE_FEATURE_TABLES
        )
        self._client = Client(
            core_url=conf.get_string(FeastExtractor.FEAST_ENDPOINT_CONFIG_KEY)
        )
        self._extract_iter: Union[None, Iterator] = None

    def get_scope(self) -> str:
        return "extractor.feast"

    def extract(self) -> Union[TableMetadata, None]:
        """
        For every feature table from Feast, multiple objects are extracted:

        1. TableMetadata with feature table description
        2. Programmatic Description of the feature table, containing
           metadata - date of creation and labels
        3. Programmatic Description with Batch Source specification
        4. (if applicable) Programmatic Description with Stream Source
           specification

        Returns None when the underlying iterator is exhausted.
        """
        # lazily build the iterator on first call
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """Yield metadata for every feature table in every Feast project."""
        for project in self._client.list_projects():
            for feature_table in self._client.list_feature_tables(project=project):
                yield from self._extract_feature_table(project, feature_table)

    def _extract_feature_table(
        self, project: str, feature_table: FeatureTable
    ) -> Iterator[TableMetadata]:
        """Yield TableMetadata records for one feature table.

        Entities come first in the column list, then features; indexes are
        assigned in that combined order.
        """
        columns = []
        for index, entity_name in enumerate(feature_table.entities):
            entity = self._client.get_entity(entity_name, project=project)
            # NOTE(review): entities pass the raw value_type while features
            # below pass dtype.name — possibly inconsistent; preserved as-is.
            columns.append(
                ColumnMetadata(entity.name, entity.description, entity.value_type, index)
            )

        for index, feature in enumerate(feature_table.features):
            columns.append(
                ColumnMetadata(
                    feature.name,
                    None,
                    feature.dtype.name,
                    len(feature_table.entities) + index,
                )
            )

        yield TableMetadata(
            "feast",
            self._feast_service,
            project,
            feature_table.name,
            None,
            columns,
        )

        if self._describe_feature_tables:
            created_at = datetime.utcfromtimestamp(
                feature_table.created_timestamp.seconds
            )
            description = f"* Created at **{created_at}**\n"

            if feature_table.labels:
                description += "* Labels:\n"
                for key, value in feature_table.labels.items():
                    description += f"    * {key}: **{value}**\n"

            yield TableMetadata(
                "feast",
                self._feast_service,
                project,
                feature_table.name,
                description,
                description_source="feature_table_details",
            )

            yield TableMetadata(
                "feast",
                self._feast_service,
                project,
                feature_table.name,
                f'```\n{yaml.dump(feature_table.to_dict()["spec"]["batchSource"])}```',
                description_source="batch_source",
            )

            if feature_table.stream_source:
                yield TableMetadata(
                    "feast",
                    self._feast_service,
                    project,
                    feature_table.name,
                    f'```\n{yaml.dump(feature_table.to_dict()["spec"]["streamSource"])}```',
                    description_source="stream_source",
                )