Exemplo n.º 1
0
def test_update_data_sources_with_inferred_event_timestamp_col(
        universal_data_sources):
    """Check that a cleared timestamp field is inferred as ``event_timestamp``.

    The data sources supplied by the ``universal_data_sources`` fixture are
    deep-copied, their timestamp settings are blanked out, and inference is
    run against a local ``RepoConfig``. Every source is then expected to
    report ``"event_timestamp"`` as its ``timestamp_field``.
    """
    _, _, original_sources = universal_data_sources
    sources = deepcopy(original_sources)

    # Blank out the configured timestamp columns so inference has work to do.
    for src in sources.values():
        src.timestamp_field = None
        src.event_timestamp_column = None

    update_data_sources_with_inferred_event_timestamp_col(
        sources.values(),
        RepoConfig(provider="local", project="test"),
    )

    inferred = [src.timestamp_field for src in sources.values()]
    expected = ["event_timestamp"] * len(sources.values())
    assert inferred == expected
Exemplo n.º 2
0
def test_update_data_sources_with_inferred_event_timestamp_col(
        simple_dataset_1):
    """Exercise event-timestamp inference on file and BigQuery sources.

    First pass: a file source and two BigQuery sources built from
    ``simple_dataset_1`` must all end up with ``ts_1`` as their
    ``event_timestamp_column``.

    Second pass: a copy of the dataset with an extra ``ts_2`` column (a
    duplicate of ``ts_1``) must make inference raise
    ``RegistryInferenceFailure``.
    """
    ambiguous_df = simple_dataset_1.copy(deep=True)
    ambiguous_df["ts_2"] = simple_dataset_1["ts_1"]

    with prep_file_source(df=simple_dataset_1) as file_source:
        sources = [
            file_source,
            simple_bq_source_using_table_ref_arg(simple_dataset_1),
            simple_bq_source_using_query_arg(simple_dataset_1),
        ]
        update_data_sources_with_inferred_event_timestamp_col(sources)

        inferred = [src.event_timestamp_column for src in sources]
        assert inferred == ["ts_1"] * 3

    # Two viable event_timestamp_columns: inference must refuse to choose.
    with prep_file_source(df=ambiguous_df) as file_source, \
            pytest.raises(RegistryInferenceFailure):
        update_data_sources_with_inferred_event_timestamp_col([file_source])
Exemplo n.º 3
0
def test_update_file_data_source_with_inferred_event_timestamp_col(
        simple_dataset_1):
    """Exercise timestamp-field inference on file and BigQuery sources.

    First pass: a file source and two BigQuery sources built from
    ``simple_dataset_1`` must all end up with ``ts_1`` as their
    ``timestamp_field`` after inference with a local ``RepoConfig``.

    Second pass: a copy of the dataset with an extra ``ts_2`` column (a
    duplicate of ``ts_1``) must make inference raise
    ``RegistryInferenceFailure``.
    """
    ambiguous_df = simple_dataset_1.copy(deep=True)
    ambiguous_df["ts_2"] = simple_dataset_1["ts_1"]

    with prep_file_source(df=simple_dataset_1) as file_source:
        sources = [
            file_source,
            simple_bq_source_using_table_arg(simple_dataset_1),
            simple_bq_source_using_query_arg(simple_dataset_1),
        ]
        first_config = RepoConfig(provider="local", project="test")
        update_data_sources_with_inferred_event_timestamp_col(
            sources, first_config)

        inferred = [src.timestamp_field for src in sources]
        assert inferred == ["ts_1"] * 3

    # Two viable timestamp_fields: inference must refuse to choose.
    with prep_file_source(df=ambiguous_df) as file_source, \
            pytest.raises(RegistryInferenceFailure):
        update_data_sources_with_inferred_event_timestamp_col(
            [file_source], RepoConfig(provider="local", project="test"))
Exemplo n.º 4
0
def apply_total(repo_config: RepoConfig, repo_path: Path,
                skip_source_validation: bool):
    """Apply every definition found in a feature repository.

    The function changes the working directory to ``repo_path``, parses the
    repository, runs type/timestamp/feature inference, synchronises the
    registry with the repository (registering entities, feature tables and
    feature views, and deleting feature tables/views that are no longer in
    the repository), applies feature services, and finally asks the provider
    to update the backing infrastructure.

    Args:
        repo_config: Repository configuration; supplies the project name and
            the registry configuration.
        repo_path: Path of the feature repository to parse and apply.
        skip_source_validation: When true, ``validate`` is not called on the
            batch sources of the repository's feature views.

    Note:
        If the project name fails ``is_valid_name`` a message is printed and
        the process exits with status 1.
    """
    from colorama import Fore, Style

    os.chdir(repo_path)
    registry_config = repo_config.get_registry_config()
    project = repo_config.project
    if not is_valid_name(project):
        print(
            f"{project} is not valid. Project name should only have "
            f"alphanumerical values and underscores but not start with an underscore."
        )
        sys.exit(1)
    registry = Registry(
        registry_path=registry_config.path,
        repo_path=repo_path,
        cache_ttl=timedelta(seconds=registry_config.cache_ttl_seconds),
    )
    registry._initialize_registry()

    # Suppress bytecode files while the repository is parsed and inspected.
    # The previous setting is remembered and restored even on failure, so the
    # caller's configuration (e.g. ``python -B``) is never clobbered and the
    # flag is never left switched on after an exception.
    previous_dont_write_bytecode = sys.dont_write_bytecode
    sys.dont_write_bytecode = True
    try:
        repo = parse_repo(repo_path)
        _validate_feature_views(repo.feature_views)
        data_sources = [t.batch_source for t in repo.feature_views]

        if not skip_source_validation:
            # Make sure the data source used by this feature view is supported by Feast
            for data_source in data_sources:
                data_source.validate(repo_config)

        # Make inferences
        update_entities_with_inferred_types_from_feature_views(
            repo.entities, repo.feature_views, repo_config)
        update_data_sources_with_inferred_event_timestamp_col(
            data_sources, repo_config)
        for view in repo.feature_views:
            view.infer_features_from_batch_source(repo_config)

        # Feature tables and feature views are checked against one combined
        # set of names when deciding what to delete from the registry.
        repo_table_names = {t.name for t in repo.feature_tables}
        repo_table_names.update(t.name for t in repo.feature_views)

        tables_to_delete = [
            registry_table
            for registry_table in registry.list_feature_tables(project=project)
            if registry_table.name not in repo_table_names
        ]
        views_to_delete = [
            registry_view
            for registry_view in registry.list_feature_views(project=project)
            if registry_view.name not in repo_table_names
        ]
    finally:
        sys.dont_write_bytecode = previous_dont_write_bytecode

    # All registry mutations below are staged with commit=False and written
    # by the single registry.commit() call that follows them.
    for entity in repo.entities:
        registry.apply_entity(entity, project=project, commit=False)
        click.echo(
            f"Registered entity {Style.BRIGHT + Fore.GREEN}{entity.name}{Style.RESET_ALL}"
        )

    # Delete tables that should not exist
    for registry_table in tables_to_delete:
        registry.delete_feature_table(registry_table.name,
                                      project=project,
                                      commit=False)
        click.echo(
            f"Deleted feature table {Style.BRIGHT + Fore.GREEN}{registry_table.name}{Style.RESET_ALL} from registry"
        )

    # Create tables that should exist
    for table in repo.feature_tables:
        registry.apply_feature_table(table, project, commit=False)
        click.echo(
            f"Registered feature table {Style.BRIGHT + Fore.GREEN}{table.name}{Style.RESET_ALL}"
        )

    # Delete views that should not exist
    for registry_view in views_to_delete:
        registry.delete_feature_view(registry_view.name,
                                     project=project,
                                     commit=False)
        click.echo(
            f"Deleted feature view {Style.BRIGHT + Fore.GREEN}{registry_view.name}{Style.RESET_ALL} from registry"
        )

    # Create views that should exist
    for view in repo.feature_views:
        registry.apply_feature_view(view, project, commit=False)
        click.echo(
            f"Registered feature view {Style.BRIGHT + Fore.GREEN}{view.name}{Style.RESET_ALL}"
        )
    registry.commit()

    apply_feature_services(registry, project, repo)

    infra_provider = get_provider(repo_config, repo_path)

    all_to_delete: List[Union[FeatureTable, FeatureView]] = []
    all_to_delete.extend(tables_to_delete)
    all_to_delete.extend(views_to_delete)

    all_to_keep: List[Union[FeatureTable, FeatureView]] = []
    all_to_keep.extend(repo.feature_tables)
    all_to_keep.extend(repo.feature_views)

    # Entities present in the registry but absent from the repository.
    repo_entities_names = {e.name for e in repo.entities}
    entities_to_delete: List[Entity] = [
        registry_entity
        for registry_entity in registry.list_entities(project=project)
        if registry_entity.name not in repo_entities_names
    ]

    entities_to_keep: List[Entity] = repo.entities

    for name in [table.name for table in repo.feature_tables
                 ] + [view.name for view in repo.feature_views]:
        click.echo(
            f"Deploying infrastructure for {Style.BRIGHT + Fore.GREEN}{name}{Style.RESET_ALL}"
        )
    for name in [view.name for view in views_to_delete
                 ] + [table.name for table in tables_to_delete]:
        click.echo(
            f"Removing infrastructure for {Style.BRIGHT + Fore.GREEN}{name}{Style.RESET_ALL}"
        )

    infra_provider.update_infra(
        project,
        tables_to_delete=all_to_delete,
        tables_to_keep=all_to_keep,
        entities_to_delete=entities_to_delete,
        entities_to_keep=entities_to_keep,
        partial=False,
    )
Exemplo n.º 5
0
    def apply(
        self,
        objects: Union[Entity, FeatureView, FeatureService,
                       List[Union[FeatureView, Entity, FeatureService]], ],
        commit: bool = True,
    ):
        """Register objects to metadata store and update related infrastructure.

        Accepts one definition or a list of definitions (Entity, FeatureView,
        FeatureService), runs inference on them, stages them in the Feast
        registry, and then asks the provider to update the related
        infrastructure (e.g., create tables in an online store). The registry
        is committed at the very end, and only when ``commit`` is true. All
        operations are idempotent, meaning they can safely be rerun.

        Args:
            objects: A single object, or a list of objects that should be registered with the Feature Store.
            commit: whether to commit changes to the registry

        Raises:
            ValueError: The 'objects' parameter could not be parsed properly.

        Examples:
            Register an Entity and a FeatureView.

            >>> from feast import FeatureStore, Entity, FeatureView, Feature, ValueType, FileSource, RepoConfig
            >>> from datetime import timedelta
            >>> fs = FeatureStore(repo_path="feature_repo")
            >>> driver = Entity(name="driver_id", value_type=ValueType.INT64, description="driver id")
            >>> driver_hourly_stats = FileSource(
            ...     path="feature_repo/data/driver_stats.parquet",
            ...     event_timestamp_column="event_timestamp",
            ...     created_timestamp_column="created",
            ... )
            >>> driver_hourly_stats_view = FeatureView(
            ...     name="driver_hourly_stats",
            ...     entities=["driver_id"],
            ...     ttl=timedelta(seconds=86400 * 1),
            ...     batch_source=driver_hourly_stats,
            ... )
            >>> fs.apply([driver_hourly_stats_view, driver]) # register entity and feature view
        """
        # TODO: Add locking

        # A lone, non-iterable object is wrapped so the rest works on a list.
        if not isinstance(objects, Iterable):
            objects = [objects]

        assert isinstance(objects, list)

        def _select(kind):
            """Return the members of ``objects`` that are instances of ``kind``."""
            return [candidate for candidate in objects
                    if isinstance(candidate, kind)]

        feature_views = _select(FeatureView)
        _validate_feature_views(feature_views)
        entities = _select(Entity)
        feature_services = _select(FeatureService)

        # Make inferences
        update_entities_with_inferred_types_from_feature_views(
            entities, feature_views, self.config)

        batch_sources = [fv.batch_source for fv in feature_views]
        update_data_sources_with_inferred_event_timestamp_col(
            batch_sources, self.config)

        for fv in feature_views:
            fv.infer_features_from_batch_source(self.config)

        # Anything that fell into none of the three buckets is unsupported.
        recognised = len(feature_views) + len(entities) + len(feature_services)
        if recognised != len(objects):
            raise ValueError(
                "Unknown object type provided as part of apply() call")

        for fv in feature_views:
            self._registry.apply_feature_view(
                fv, project=self.project, commit=False)
        for entity in entities:
            self._registry.apply_entity(
                entity, project=self.project, commit=False)
        for service in feature_services:
            self._registry.apply_feature_service(
                service, project=self.project)

        provider = self._get_provider()
        provider.update_infra(
            project=self.project,
            tables_to_delete=[],
            tables_to_keep=feature_views,
            entities_to_delete=[],
            entities_to_keep=entities,
            partial=True,
        )

        if commit:
            self._registry.commit()
Exemplo n.º 6
0
    def apply(
        self, objects: Union[Entity, FeatureView, List[Union[FeatureView, Entity]]]
    ):
        """Register objects to metadata store and update related infrastructure.

        Accepts one definition or a list of definitions (Entity, FeatureView),
        runs inference on them, registers or updates them in the Feast
        registry, and then asks the provider to update the related
        infrastructure (e.g., create tables in an online store). All
        operations are idempotent, meaning they can safely be rerun.

        Args:
            objects (List[Union[FeatureView, Entity]]): A list of FeatureView or Entity objects that should be
                registered

        Raises:
            ValueError: An element of ``objects`` is neither a FeatureView nor an Entity.

        Examples:
            Register a single Entity and FeatureView.

            >>> from feast.feature_store import FeatureStore
            >>> from feast import Entity, FeatureView, Feature, ValueType, FileSource
            >>> from datetime import timedelta
            >>>
            >>> fs = FeatureStore()
            >>> customer_entity = Entity(name="customer", value_type=ValueType.INT64, description="customer entity")
            >>> customer_feature_view = FeatureView(
            ...     name="customer_fv",
            ...     entities=["customer"],
            ...     features=[Feature(name="age", dtype=ValueType.INT64)],
            ...     input=FileSource(path="file.parquet", event_timestamp_column="timestamp"),
            ...     ttl=timedelta(days=1)
            ... )
            >>> fs.apply([customer_entity, customer_feature_view])
        """

        # TODO: Add locking
        # TODO: Optimize by only making a single call (read/write)

        # A lone Entity or FeatureView is wrapped so the rest works on a list.
        if isinstance(objects, (Entity, FeatureView)):
            objects = [objects]
        assert isinstance(objects, list)

        feature_views = [item for item in objects
                         if isinstance(item, FeatureView)]
        entities = [item for item in objects if isinstance(item, Entity)]

        # Make inferences
        update_entities_with_inferred_types_from_feature_views(
            entities, feature_views)
        input_sources = [fv.input for fv in feature_views]
        update_data_sources_with_inferred_event_timestamp_col(input_sources)
        for fv in feature_views:
            fv.infer_features_from_input_source()

        # Anything that is neither a view nor an entity is unsupported.
        recognised = len(feature_views) + len(entities)
        if recognised != len(objects):
            raise ValueError("Unknown object type provided as part of apply() call")

        for fv in feature_views:
            self._registry.apply_feature_view(fv, project=self.project)
        for entity in entities:
            self._registry.apply_entity(entity, project=self.project)

        provider = self._get_provider()
        provider.update_infra(
            project=self.project,
            tables_to_delete=[],
            tables_to_keep=feature_views,
            entities_to_delete=[],
            entities_to_keep=entities,
            partial=True,
        )
Exemplo n.º 7
0
    def apply(
        self,
        objects: Union[Entity, FeatureView, FeatureService,
                       List[Union[FeatureView, Entity, FeatureService]], ],
    ):
        """Register objects to metadata store and update related infrastructure.

        Accepts one definition or a list of definitions (Entity, FeatureView,
        FeatureService), runs inference on them, stages them in the Feast
        registry and commits the registry. Once the registry has been
        committed, the provider is asked to update the related infrastructure
        (e.g., create tables in an online store). All operations are
        idempotent, meaning they can safely be rerun.

        Args:
            objects: A single object, or a list of objects that should be registered with the Feature Store.

        Raises:
            ValueError: The 'objects' parameter could not be parsed properly.

        Examples:
            Register a single Entity and FeatureView.

            >>> from feast.feature_store import FeatureStore
            >>> from feast import Entity, FeatureView, Feature, ValueType, FileSource
            >>> from datetime import timedelta
            >>>
            >>> fs = FeatureStore()
            >>> customer_entity = Entity(name="customer", value_type=ValueType.INT64, description="customer entity")
            >>> customer_feature_view = FeatureView(
            ...     name="customer_fv",
            ...     entities=["customer"],
            ...     features=[Feature(name="age", dtype=ValueType.INT64)],
            ...     batch_source=FileSource(path="file.parquet", event_timestamp_column="timestamp"),
            ...     ttl=timedelta(days=1)
            ... )
            >>> fs.apply([customer_entity, customer_feature_view])
        """
        # TODO: Add locking

        # A lone, non-iterable object is wrapped so the rest works on a list.
        if not isinstance(objects, Iterable):
            objects = [objects]

        assert isinstance(objects, list)

        def _select(kind):
            """Return the members of ``objects`` that are instances of ``kind``."""
            return [candidate for candidate in objects
                    if isinstance(candidate, kind)]

        feature_views = _select(FeatureView)
        _validate_feature_views(feature_views)
        entities = _select(Entity)
        feature_services = _select(FeatureService)

        # Make inferences
        update_entities_with_inferred_types_from_feature_views(
            entities, feature_views, self.config)

        batch_sources = [fv.batch_source for fv in feature_views]
        update_data_sources_with_inferred_event_timestamp_col(
            batch_sources, self.config)

        for fv in feature_views:
            fv.infer_features_from_batch_source(self.config)

        # Anything that fell into none of the three buckets is unsupported.
        recognised = len(feature_views) + len(entities) + len(feature_services)
        if recognised != len(objects):
            raise ValueError(
                "Unknown object type provided as part of apply() call")

        for fv in feature_views:
            self._registry.apply_feature_view(
                fv, project=self.project, commit=False)
        for entity in entities:
            self._registry.apply_entity(
                entity, project=self.project, commit=False)
        for service in feature_services:
            self._registry.apply_feature_service(
                service, project=self.project)
        self._registry.commit()

        provider = self._get_provider()
        provider.update_infra(
            project=self.project,
            tables_to_delete=[],
            tables_to_keep=feature_views,
            entities_to_delete=[],
            entities_to_keep=entities,
            partial=True,
        )