Example #1
    def get_feature_service(self,
                            name: str,
                            project: str,
                            allow_cache: bool = False) -> FeatureService:
        """
        Retrieves a feature service.

        Args:
            name: Name of feature service
            project: Feast project that this feature service belongs to
            allow_cache: Whether to allow returning this feature service from a cached registry

        Returns:
            The specified feature service, if it exists; otherwise a
            FeatureServiceNotFoundException is raised
        """
        registry = self._get_registry_proto(allow_cache=allow_cache)

        for feature_service_proto in registry.feature_services:
            if (feature_service_proto.spec.project == project
                    and feature_service_proto.spec.name == name):
                return FeatureService.from_proto(feature_service_proto)
        raise FeatureServiceNotFoundException(name, project=project)
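A minimal usage sketch for the method above (hypothetical: the `registry` instance, service name, and project name are assumptions, not part of the example):

# Hypothetical usage: look up a feature service by name, preferring the cache,
# and handle the case where it has not been registered.
try:
    fs = registry.get_feature_service(
        name="convrate_plus100", project="my_project", allow_cache=True
    )
    print(fs.name)
except FeatureServiceNotFoundException:
    print("no such feature service in project 'my_project'")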
Example #2
    def list_feature_services(
            self,
            project: str,
            allow_cache: bool = False) -> List[FeatureService]:
        """
        Retrieve a list of feature services from the registry

        Args:
            allow_cache: Whether to allow returning entities from a cached registry
            project: Filter entities based on project name

        Returns:
            List of feature services
        """

        registry = self._get_registry_proto(allow_cache=allow_cache)
        feature_services = []
        for feature_service_proto in registry.feature_services:
            if feature_service_proto.spec.project == project:
                feature_services.append(
                    FeatureService.from_proto(feature_service_proto))
        return feature_services
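A matching sketch for the list variant (same hypothetical `registry` and project name as above):

# Hypothetical usage: enumerate every feature service registered under a project.
for fs in registry.list_feature_services(project="my_project", allow_cache=True):
    print(fs.name)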
Example #3
def test_historical_features(environment, universal_data_sources,
                             full_feature_names):
    store = environment.feature_store

    (entities, datasets, data_sources) = universal_data_sources

    feature_views = construct_universal_feature_views(data_sources)

    entity_df_with_request_data = datasets.entity_df.copy(deep=True)
    entity_df_with_request_data["val_to_add"] = list(
        range(len(entity_df_with_request_data))
    )
    entity_df_with_request_data["driver_age"] = [
        i + 100 for i in range(len(entity_df_with_request_data))
    ]

    feature_service = FeatureService(
        name="convrate_plus100",
        features=[
            feature_views.driver[["conv_rate"]], feature_views.driver_odfv
        ],
    )
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            feature_views.location.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}),
            feature_views.location.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}),
        ],
    )

    store.apply([
        driver(),
        customer(),
        location(),
        feature_service,
        feature_service_entity_mapping,
        *feature_views.values(),
    ])

    event_timestamp = (DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
                       if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
                       in datasets.orders_df.columns else "e_ts")
    full_expected_df = get_expected_training_df(
        datasets.customer_df,
        feature_views.customer,
        datasets.driver_df,
        feature_views.driver,
        datasets.orders_df,
        feature_views.order,
        datasets.location_df,
        feature_views.location,
        datasets.global_df,
        feature_views.global_fv,
        datasets.field_mapping_df,
        feature_views.field_mapping,
        entity_df_with_request_data,
        event_timestamp,
        full_feature_names,
    )

    # The shadow-entity features are only needed for the entity-mapping
    # FeatureService test, so drop them from the expected dataframe here.
    expected_df = full_expected_df.drop(
        columns=["origin__temperature", "destination__temperature"],
    )

    job_from_df = store.get_historical_features(
        entity_df=entity_df_with_request_data,
        features=[
            "driver_stats:conv_rate",
            "driver_stats:avg_daily_trips",
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "conv_rate_plus_100:conv_rate_plus_100",
            "conv_rate_plus_100:conv_rate_plus_100_rounded",
            "conv_rate_plus_100:conv_rate_plus_val_to_add",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "field_mapping:feature_name",
        ],
        full_feature_names=full_feature_names,
    )

    start_time = datetime.utcnow()
    actual_df_from_df_entities = job_from_df.to_df()

    print(f"actual_df_from_df_entities shape: {actual_df_from_df_entities.shape}")
    end_time = datetime.utcnow()
    print(f"Time to execute job_from_df.to_df() = '{end_time - start_time}'\n")

    assert sorted(expected_df.columns) == sorted(
        actual_df_from_df_entities.columns)
    assert_frame_equal(
        expected_df,
        actual_df_from_df_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )

    assert_feature_service_correctness(
        store,
        feature_service,
        full_feature_names,
        entity_df_with_request_data,
        expected_df,
        event_timestamp,
    )
    assert_feature_service_entity_mapping_correctness(
        store,
        feature_service_entity_mapping,
        full_feature_names,
        entity_df_with_request_data,
        full_expected_df,
        event_timestamp,
    )
    table_from_df_entities: pd.DataFrame = job_from_df.to_arrow().to_pandas()

    assert_frame_equal(
        expected_df,
        table_from_df_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )
Example #4
def test_historical_features(environment, universal_data_sources, full_feature_names):
    store = environment.feature_store

    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    customer_df, driver_df, location_df, orders_df, global_df, entity_df = (
        datasets["customer"],
        datasets["driver"],
        datasets["location"],
        datasets["orders"],
        datasets["global"],
        datasets["entity"],
    )
    entity_df_with_request_data = entity_df.copy(deep=True)
    entity_df_with_request_data["val_to_add"] = list(
        range(len(entity_df_with_request_data))
    )
    entity_df_with_request_data["driver_age"] = [
        i + 100 for i in range(len(entity_df_with_request_data))
    ]

    (
        customer_fv,
        driver_fv,
        driver_odfv,
        location_fv,
        order_fv,
        global_fv,
        driver_age_request_fv,
    ) = (
        feature_views["customer"],
        feature_views["driver"],
        feature_views["driver_odfv"],
        feature_views["location"],
        feature_views["order"],
        feature_views["global"],
        feature_views["driver_age_request_fv"],
    )

    feature_service = FeatureService(
        name="convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]],
            driver_odfv,
            driver_age_request_fv,
        ],
    )
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            location_fv.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}
            ),
            location_fv.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}
            ),
        ],
    )

    feast_objects = [
        customer_fv,
        driver_fv,
        driver_odfv,
        location_fv,
        order_fv,
        global_fv,
        driver_age_request_fv,
        driver(),
        customer(),
        location(),
        feature_service,
        feature_service_entity_mapping,
    ]
    store.apply(feast_objects)

    entity_df_query = None
    orders_table = table_name_from_data_source(data_sources["orders"])
    if orders_table:
        entity_df_query = f"SELECT customer_id, driver_id, order_id, origin_id, destination_id, event_timestamp FROM {orders_table}"

    event_timestamp = (
        DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
        if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in orders_df.columns
        else "e_ts"
    )
    full_expected_df = get_expected_training_df(
        customer_df,
        customer_fv,
        driver_df,
        driver_fv,
        orders_df,
        order_fv,
        location_df,
        location_fv,
        global_df,
        global_fv,
        entity_df_with_request_data,
        event_timestamp,
        full_feature_names,
    )

    # The shadow-entity features are only needed for the entity-mapping
    # FeatureService test, so drop them from the expected dataframe here.
    expected_df = full_expected_df.drop(
        columns=["origin__temperature", "destination__temperature"],
    )

    if entity_df_query:
        job_from_sql = store.get_historical_features(
            entity_df=entity_df_query,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "order:order_is_success",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )

        start_time = datetime.utcnow()
        actual_df_from_sql_entities = job_from_sql.to_df()
        end_time = datetime.utcnow()
        print(f"\nTime to execute job_from_sql.to_df() = '{end_time - start_time}'")

        # On demand transforms are not requested with an entity_df SQL query,
        # since request data cannot be supplied through one.
        expected_df_query = expected_df.drop(
            columns=[
                "conv_rate_plus_100",
                "conv_rate_plus_100_rounded",
                "val_to_add",
                "conv_rate_plus_val_to_add",
                "driver_age",
            ]
        )
        assert sorted(expected_df_query.columns) == sorted(
            actual_df_from_sql_entities.columns
        )

        actual_df_from_sql_entities = (
            actual_df_from_sql_entities[expected_df_query.columns]
            .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
            .drop_duplicates()
            .reset_index(drop=True)
        )
        expected_df_query = (
            expected_df_query.sort_values(
                by=[event_timestamp, "order_id", "driver_id", "customer_id"]
            )
            .drop_duplicates()
            .reset_index(drop=True)
        )

        assert_frame_equal(
            actual_df_from_sql_entities, expected_df_query, check_dtype=False
        )

        table_from_sql_entities = job_from_sql.to_arrow()
        df_from_sql_entities = (
            table_from_sql_entities.to_pandas()[expected_df_query.columns]
            .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
            .drop_duplicates()
            .reset_index(drop=True)
        )

        for col in df_from_sql_entities.columns:
            expected_df_query[col] = expected_df_query[col].astype(
                df_from_sql_entities[col].dtype
            )

        assert_frame_equal(expected_df_query, df_from_sql_entities)

    job_from_df = store.get_historical_features(
        entity_df=entity_df_with_request_data,
        features=[
            "driver_stats:conv_rate",
            "driver_stats:avg_daily_trips",
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "conv_rate_plus_100:conv_rate_plus_100",
            "conv_rate_plus_100:conv_rate_plus_100_rounded",
            "conv_rate_plus_100:conv_rate_plus_val_to_add",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "driver_age:driver_age",
        ],
        full_feature_names=full_feature_names,
    )

    start_time = datetime.utcnow()
    actual_df_from_df_entities = job_from_df.to_df()

    print(f"actual_df_from_df_entities shape: {actual_df_from_df_entities.shape}")
    end_time = datetime.utcnow()
    print(str(f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n"))

    assert sorted(expected_df.columns) == sorted(actual_df_from_df_entities.columns)
    expected_df: pd.DataFrame = (
        expected_df.sort_values(
            by=[event_timestamp, "order_id", "driver_id", "customer_id"]
        )
        .drop_duplicates()
        .reset_index(drop=True)
    )
    actual_df_from_df_entities = (
        actual_df_from_df_entities[expected_df.columns]
        .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
        .drop_duplicates()
        .reset_index(drop=True)
    )

    assert_frame_equal(
        expected_df, actual_df_from_df_entities, check_dtype=False
    )
    assert_feature_service_correctness(
        store,
        feature_service,
        full_feature_names,
        entity_df_with_request_data,
        expected_df,
        event_timestamp,
    )
    assert_feature_service_entity_mapping_correctness(
        store,
        feature_service_entity_mapping,
        full_feature_names,
        entity_df_with_request_data,
        full_expected_df,
        event_timestamp,
    )

    table_from_df_entities: pd.DataFrame = job_from_df.to_arrow().to_pandas()

    columns_expected_in_table = expected_df.columns.tolist()

    table_from_df_entities = (
        table_from_df_entities[columns_expected_in_table]
        .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
        .drop_duplicates()
        .reset_index(drop=True)
    )
    assert_frame_equal(actual_df_from_df_entities, table_from_df_entities)

    # If request data needed by the on demand transform is missing from the
    # entity dataframe, an error should be raised.
    with pytest.raises(RequestDataNotFoundInEntityDfException):
        store.get_historical_features(
            entity_df=entity_df,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "conv_rate_plus_100:conv_rate_plus_100",
                "conv_rate_plus_100:conv_rate_plus_val_to_add",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )
    # If request data needed by a request feature view is missing from the
    # entity dataframe, an error should be raised.
    with pytest.raises(RequestDataNotFoundInEntityDfException):
        store.get_historical_features(
            entity_df=entity_df,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "driver_age:driver_age",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )
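A FeatureService can also be passed directly as `features`, which is presumably what the correctness helpers above exercise internally; a sketch reusing the test's objects:

# Sketch: retrieve exactly the features grouped by the service. `store`,
# `feature_service`, and `entity_df_with_request_data` come from the test above.
job_from_service = store.get_historical_features(
    entity_df=entity_df_with_request_data,
    features=feature_service,
    full_feature_names=full_feature_names,
)
service_df = job_from_service.to_df()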
Example #5
def test_feature_service_with_description():
    feature_service = FeatureService(
        name="my-feature-service", features=[], description="a clear description"
    )
    assert feature_service.to_proto().spec.description == "a clear description"
Example #6
def test_feature_service_without_description():
    feature_service = FeatureService(name="my-feature-service", features=[])
    # With no description provided, the proto spec defaults to an empty string.
    assert feature_service.to_proto().spec.description == ""
Example #7
    return df


generated_data_source = FileSource(
    path="benchmark_data.parquet",
    timestamp_field="event_timestamp",
)

entity = Entity(
    name="entity",
    value_type=ValueType.STRING,
)

benchmark_feature_views = [
    FeatureView(
        name=f"feature_view_{i}",
        entities=["entity"],
        ttl=Duration(seconds=86400),
        schema=[
            Field(name=f"feature_{10 * i + j}", dtype=Int64) for j in range(10)
        ],
        online=True,
        batch_source=generated_data_source,
    ) for i in range(25)
]

benchmark_feature_service = FeatureService(
    name=f"benchmark_feature_service",
    features=benchmark_feature_views,
)
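A minimal sketch of registering these benchmark objects with a store; it assumes a feature_store.yaml in the current directory (the repo_path is an assumption):

from feast import FeatureStore

# Hypothetical: open a local Feast repository and apply everything at once.
store = FeatureStore(repo_path=".")
store.apply([entity, *benchmark_feature_views, benchmark_feature_service])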