예제 #1
0
class FeastRepositorySource(Source):
    """
    This plugin extracts:

    - Entities as [`MLPrimaryKey`](https://datahubproject.io/docs/graphql/objects#mlprimarykey)
    - Features as [`MLFeature`](https://datahubproject.io/docs/graphql/objects#mlfeature)
    - Feature views and on-demand feature views as [`MLFeatureTable`](https://datahubproject.io/docs/graphql/objects#mlfeaturetable)
    - Batch and stream source details as [`Dataset`](https://datahubproject.io/docs/graphql/objects#dataset)
    - Column types associated with each entity and feature
    """

    source_config: FeastRepositorySourceConfig
    report: SourceReport
    feature_store: FeatureStore

    def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
        """
        Initialise the source from its configuration.

        Stores the configuration, starts a fresh ``SourceReport`` and creates
        the ``FeatureStore`` from the repository path given in ``config.path``.
        """
        super().__init__(ctx)

        self.report = SourceReport()
        self.source_config = config
        # The repository location comes straight from the supplied config.
        self.feature_store = FeatureStore(config.path)

    def _get_field_type(self, field_type: ValueType, parent_name: str) -> str:
        """
        Translate a Feast value type into the matching metadata schema type.

        When ``field_type`` has no entry in ``_field_type_mapping``, a warning
        keyed by ``parent_name`` is added to the report and
        ``MLFeatureDataType.UNKNOWN`` is returned instead.
        """

        mapped_type = _field_type_mapping.get(field_type)

        if mapped_type is not None:
            return mapped_type

        # No mapping available: record the problem and fall back to UNKNOWN.
        self.report.report_warning(
            parent_name, f"unable to map type {field_type} to metadata schema"
        )

        return MLFeatureDataType.UNKNOWN

    def _get_data_source_details(self, source: DataSource) -> Tuple[str, str]:
        """
        Resolve the platform and name of a Feast data source.

        Returns a ``(platform, name)`` pair. Both elements are ``"unknown"``
        when ``source`` is not an instance of any of the handled classes.
        """

        details = ("unknown", "unknown")

        # The checks are deliberately independent (not ``elif``): should a
        # source match several classes, the last matching check wins.
        if isinstance(source, FileSource):
            # Flatten the path into a dotted name, e.g. "s3://a/b" -> "s3.a.b".
            flattened_path = source.path.replace("://", ".").replace("/", ".")
            details = ("file", flattened_path)

        if isinstance(source, BigQuerySource):
            details = ("bigquery", source.table)

        if isinstance(source, KafkaSource):
            details = ("kafka", source.kafka_options.topic)

        if isinstance(source, KinesisSource):
            stream_id = (
                f"{source.kinesis_options.region}:{source.kinesis_options.stream_name}"
            )
            details = ("kinesis", stream_id)

        if isinstance(source, RequestDataSource):
            details = ("request", source.name)

        return details

    def _get_data_sources(self, feature_view: FeatureView) -> List[str]:
        """
        Build the dataset URNs for a feature view's data sources.

        The batch source is handled first, then the stream source; a source
        that is ``None`` contributes nothing to the returned list.
        """

        urns = []

        for attribute in ("batch_source", "stream_source"):
            data_source = getattr(feature_view, attribute)

            if data_source is None:
                continue

            platform, name = self._get_data_source_details(data_source)
            urns.append(
                builder.make_dataset_urn(
                    platform,
                    name,
                    self.source_config.environment,
                )
            )

        return urns

    def _get_entity_workunit(
        self, feature_view: FeatureView, entity: Entity
    ) -> MetadataWorkUnit:
        """
        Create the MLPrimaryKey work unit for one entity of a feature view.

        The primary key URN is scoped by "<project>.<feature view name>"; the
        snapshot carries a status aspect followed by the key properties
        (description, mapped data type and upstream dataset URNs).
        """

        qualified_view_name = f"{self.feature_store.project}.{feature_view.name}"
        key_urn = builder.make_ml_primary_key_urn(qualified_view_name, entity.name)

        key_properties = MLPrimaryKeyPropertiesClass(
            description=entity.description,
            dataType=self._get_field_type(entity.value_type, entity.name),
            sources=self._get_data_sources(feature_view),
        )

        snapshot = MLPrimaryKeySnapshot(
            urn=key_urn,
            aspects=[StatusClass(removed=False), key_properties],
        )

        # NOTE(review): the work unit id is the bare entity name, so the same
        # entity used by several feature views yields repeated ids — confirm
        # this is acceptable for the report.
        return MetadataWorkUnit(
            id=entity.name,
            mce=MetadataChangeEvent(proposedSnapshot=snapshot),
        )

    def _get_feature_workunit(
        self,
        feature_view: Union[FeatureView, OnDemandFeatureView],
        feature: Feature,
    ) -> MetadataWorkUnit:
        """
        Create the MLFeature work unit for one feature of a (on-demand) view.

        For a regular feature view the upstream datasets are the view's own
        batch/stream sources. For an on-demand feature view they are its
        request data sources followed by the sources of every feature view it
        projects from.
        """
        qualified_view_name = f"{self.feature_store.project}.{feature_view.name}"
        feature_urn = builder.make_ml_feature_urn(qualified_view_name, feature.name)

        upstream_urns = []

        if isinstance(feature_view, FeatureView):
            upstream_urns = self._get_data_sources(feature_view)
        elif isinstance(feature_view, OnDemandFeatureView):
            request_sources = feature_view.input_request_data_sources
            if request_sources is not None:
                for request_source in request_sources.values():
                    platform, name = self._get_data_source_details(request_source)
                    upstream_urns.append(
                        builder.make_dataset_urn(
                            platform,
                            name,
                            self.source_config.environment,
                        )
                    )

            projections = feature_view.input_feature_view_projections
            if projections is not None:
                for projection in projections.values():
                    # Look the projected view up by name to reach its sources.
                    projected_view = self.feature_store.get_feature_view(
                        projection.name
                    )
                    upstream_urns.extend(self._get_data_sources(projected_view))

        feature_properties = MLFeaturePropertiesClass(
            description=feature.labels.get("description"),
            dataType=self._get_field_type(feature.dtype, feature.name),
            sources=upstream_urns,
        )

        snapshot = MLFeatureSnapshot(
            urn=feature_urn,
            aspects=[StatusClass(removed=False), feature_properties],
        )

        # NOTE(review): the id is the bare feature name and may repeat across
        # feature views — confirm this is acceptable for the report.
        return MetadataWorkUnit(
            id=feature.name,
            mce=MetadataChangeEvent(proposedSnapshot=snapshot),
        )

    def _get_feature_view_workunit(self, feature_view: FeatureView) -> MetadataWorkUnit:
        """
        Create the MLFeatureTable work unit for a Feast feature view.

        The snapshot holds, in order, a browse path, a status aspect and the
        table properties listing the URNs of the view's features and of its
        entities (as primary keys).
        """

        project = self.feature_store.project
        qualified_view_name = f"{project}.{feature_view.name}"

        table_urn = builder.make_ml_feature_table_urn("feast", qualified_view_name)
        browse_paths = BrowsePathsClass(
            paths=[f"/feast/{project}/{qualified_view_name}"]
        )
        status = StatusClass(removed=False)

        feature_urns = [
            builder.make_ml_feature_urn(qualified_view_name, feature.name)
            for feature in feature_view.features
        ]
        primary_key_urns = [
            builder.make_ml_primary_key_urn(qualified_view_name, entity_name)
            for entity_name in feature_view.entities
        ]

        snapshot = MLFeatureTableSnapshot(
            urn=table_urn,
            aspects=[
                browse_paths,
                status,
                MLFeatureTablePropertiesClass(
                    mlFeatures=feature_urns,
                    mlPrimaryKeys=primary_key_urns,
                ),
            ],
        )

        return MetadataWorkUnit(
            id=qualified_view_name,
            mce=MetadataChangeEvent(proposedSnapshot=snapshot),
        )

    def _get_on_demand_feature_view_workunit(
        self, on_demand_feature_view: OnDemandFeatureView
    ) -> MetadataWorkUnit:
        """
        Create the MLFeatureTable work unit for a Feast on-demand feature view.

        Same layout as for a regular feature view, except that the list of
        primary keys is always empty.
        """

        project = self.feature_store.project
        qualified_view_name = f"{project}.{on_demand_feature_view.name}"

        table_urn = builder.make_ml_feature_table_urn("feast", qualified_view_name)
        browse_paths = BrowsePathsClass(
            paths=[f"/feast/{project}/{qualified_view_name}"]
        )
        status = StatusClass(removed=False)

        feature_urns = [
            builder.make_ml_feature_urn(qualified_view_name, feature.name)
            for feature in on_demand_feature_view.features
        ]

        snapshot = MLFeatureTableSnapshot(
            urn=table_urn,
            aspects=[
                browse_paths,
                status,
                MLFeatureTablePropertiesClass(
                    mlFeatures=feature_urns,
                    mlPrimaryKeys=[],
                ),
            ],
        )

        return MetadataWorkUnit(
            id=qualified_view_name,
            mce=MetadataChangeEvent(proposedSnapshot=snapshot),
        )

    @classmethod
    def create(cls, config_dict, ctx):
        """
        Alternate constructor: parse ``config_dict`` into a
        ``FeastRepositorySourceConfig`` and build the source with ``ctx``.
        """
        return cls(FeastRepositorySourceConfig.parse_obj(config_dict), ctx)

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        """
        Lazily produce every work unit of the feature store.

        For each feature view: one unit per entity, one per feature, then the
        unit of the view itself. On-demand feature views follow, each with one
        unit per feature and then the unit of the view. Every unit is recorded
        in the report right before it is yielded.
        """

        def units_of_feature_view(view):
            # Entities first, then features, then the view itself.
            for entity_name in view.entities:
                entity = self.feature_store.get_entity(entity_name)
                yield self._get_entity_workunit(view, entity)

            for feature in view.features:
                yield self._get_feature_workunit(view, feature)

            yield self._get_feature_view_workunit(view)

        def units_of_on_demand_feature_view(view):
            # On-demand views have no entity units.
            for feature in view.features:
                yield self._get_feature_workunit(view, feature)

            yield self._get_on_demand_feature_view_workunit(view)

        for view in self.feature_store.list_feature_views():
            for unit in units_of_feature_view(view):
                self.report.report_workunit(unit)
                yield unit

        for view in self.feature_store.list_on_demand_feature_views():
            for unit in units_of_on_demand_feature_view(view):
                self.report.report_workunit(unit)
                yield unit

    def get_report(self) -> SourceReport:
        """Return the report object this source records into."""
        report = self.report
        return report

    def close(self) -> None:
        """No-op: this method performs no cleanup."""
        return None
예제 #2
0
class FeastExtractor(Extractor):
    """
    Extracts feature tables from a Feast feature repository. Since Feast is
    a metadata store (and not the database itself), it maps the
    following attributes:

     * the database is the name of the Feast project
     * the table name is the name of the feature view
     * the columns are the features stored in the feature view
    """

    FEAST_REPOSITORY_PATH = "/path/to/repository"
    DESCRIBE_FEATURE_VIEWS = "describe_feature_views"
    DEFAULT_CONFIG = ConfigFactory.from_dict({
        FEAST_REPOSITORY_PATH: ".",
        DESCRIBE_FEATURE_VIEWS: True
    })

    def init(self, conf: ConfigTree) -> None:
        """
        Configure the extractor.

        Missing settings fall back to ``DEFAULT_CONFIG`` (repository path "."
        and feature view descriptions enabled). A ``FeatureStore`` is created
        for the configured repository path, and the extraction iterator is
        left unset so that it is built on the first ``extract`` call.
        """
        merged_conf = conf.with_fallback(FeastExtractor.DEFAULT_CONFIG)

        repository_path = merged_conf.get_string(FeastExtractor.FEAST_REPOSITORY_PATH)
        describe_views = merged_conf.get_bool(FeastExtractor.DESCRIBE_FEATURE_VIEWS)

        self._feast_repository_path = repository_path
        self._describe_feature_views = describe_views
        self._feast = FeatureStore(repo_path=repository_path)
        self._extract_iter: Union[None, Iterator] = None

    def get_scope(self) -> str:
        """Return the configuration scope name of this extractor."""
        scope = "extractor.feast"
        return scope

    def extract(self) -> Union[TableMetadata, None]:
        """
        Return the next extracted record, or ``None`` once all are consumed.

        For every feature view from Feast, multiple objects are extracted:

        1. TableMetadata with the feature view's columns
        2. (if descriptions are enabled) Programmatic Description of the
           feature view, containing metadata - date of creation and tags
        3. (if descriptions are enabled) Programmatic Description with the
           Batch Source specification
        4. (if descriptions are enabled and applicable) Programmatic
           Description with the Stream Source specification
        """
        records = self._extract_iter
        if not records:
            # First call: build the iterator lazily and remember it.
            records = self._extract_iter = self._get_extract_iter()

        # The default makes next() return None instead of raising StopIteration.
        return next(records, None)

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """Yield, view by view, the records of every Feast feature view."""
        for view in self._feast.list_feature_views():
            for record in self._extract_feature_view(view):
                yield record

    def _extract_feature_view(
            self, feature_view: FeatureView) -> Iterator[TableMetadata]:
        """
        Yield the TableMetadata records describing one feature view.

        The first record lists the columns: the view's entities followed by
        its features, numbered consecutively. When feature view descriptions
        are enabled, further records follow, each carrying one description:
        creation time and tags, the batch source, and - if the view has one -
        the stream source.
        """
        entity_count = len(feature_view.entities)

        columns = []
        for position, entity_name in enumerate(feature_view.entities):
            entity = self._feast.get_entity(entity_name)
            columns.append(
                ColumnMetadata(entity.name, entity.description,
                               entity.value_type.name, position))

        # Feature columns continue the numbering after the entity columns.
        columns.extend(
            ColumnMetadata(feature.name, None, feature.dtype.name, position)
            for position, feature in enumerate(feature_view.features,
                                               start=entity_count))

        # Leading positional arguments shared by every record of this view.
        table_key = (
            "feast",
            self._feast.config.provider,
            self._feast.project,
            feature_view.name,
        )

        yield TableMetadata(*table_key, None, columns)

        if not self._describe_feature_views:
            return

        detail_lines = []
        if feature_view.created_timestamp:
            created_at = datetime.utcfromtimestamp(
                feature_view.created_timestamp.timestamp())
            detail_lines.append(f"* Created at **{created_at}**\n")

        if feature_view.tags:
            detail_lines.append("* Tags:\n")
            detail_lines.extend(
                f"    * {key}: **{value}**\n"
                for key, value in feature_view.tags.items())

        yield TableMetadata(
            *table_key,
            "".join(detail_lines),
            description_source="feature_view_details",
        )

        batch_source_text = str(feature_view.batch_source.to_proto())
        yield TableMetadata(
            *table_key,
            f"```\n{batch_source_text}```",
            description_source="batch_source",
        )

        if feature_view.stream_source:
            stream_source_text = str(feature_view.stream_source.to_proto())
            yield TableMetadata(
                *table_key,
                f"```\n{stream_source_text}```",
                description_source="stream_source",
            )