Example #1
def retrieve_online(feature_store: FeatureStore,
                    dataset: pd.DataFrame) -> dict:
    # Pick a random entity from the dataset to use for inference
    inference_data = random.choice(dataset["Hospital Number"])
    logger.info(f"Hospital Number chosen for inference is: {inference_data}")
    entity_rows = [{"Hospital Number": inference_data}]

    # Fetch the latest feature values for this entity from the online store
    return feature_store.get_online_features(FEAST_FEATURES, entity_rows).to_dict()
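This helper relies on module-level names from its project that are not shown here (logger, FEAST_FEATURES). A minimal, hypothetical setup for calling it might look like the following sketch; the feature references and the CSV path are placeholders, not the original project's values:

import logging
import random

import pandas as pd
from feast import FeatureStore

logger = logging.getLogger(__name__)

# Placeholder feature references; use the refs registered in your feature repo
FEAST_FEATURES = [
    "hospital_stats:pulse",
    "hospital_stats:rectal_temperature",
]

store = FeatureStore(repo_path=".")
df = pd.read_csv("horse_colic.csv")  # assumed input with a "Hospital Number" column
print(retrieve_online(store, df))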
Example #2
def main():
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Load the feature store from the current path
    fs = FeatureStore(repo_path=".")

    # Deploy the feature store to AWS
    print("Deploying feature store to AWS...")
    fs.apply([driver, driver_hourly_stats_view])

    # Select features
    feature_refs = ["driver_hourly_stats:conv_rate", "driver_hourly_stats:acc_rate"]

    print("Loading features into the online store...")
    fs.materialize_incremental(end_date=datetime.now())

    print("Retrieving online features...")

    # Retrieve features from the online store (DynamoDB)
    online_features = fs.get_online_features(
        feature_refs=feature_refs,
        entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}],
    ).to_dict()

    print(pd.DataFrame.from_dict(online_features))
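The `driver` entity and `driver_hourly_stats_view` come from the repository's feature definitions, which are not shown here. As a rough sketch only (the AWS example presumably points at a Redshift or S3 source, and the exact constructor arguments vary across Feast releases), the definitions look roughly like this:

from datetime import timedelta

from feast import Entity, FeatureView, Field, FileSource
from feast.types import Float32

# Hypothetical definitions for illustration; not the original repo's code
driver = Entity(name="driver", join_keys=["driver_id"])

driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=[driver],
    ttl=timedelta(days=1),
    schema=[
        Field(name="conv_rate", dtype=Float32),
        Field(name="acc_rate", dtype=Float32),
    ],
    source=FileSource(
        path="data/driver_stats.parquet",
        timestamp_field="event_timestamp",
    ),
)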
Example #3
def _assert_online_features(store: FeatureStore, driver_df: pd.DataFrame,
                            max_date: datetime):
    """Assert that features in online store are up to date with `max_date` date."""
    # Read features back
    response = store.get_online_features(
        features=[
            "driver_hourly_stats:conv_rate",
            "driver_hourly_stats:avg_daily_trips",
            "global_daily_stats:num_rides",
            "global_daily_stats:avg_ride_length",
        ],
        entity_rows=[{
            "driver_id": 1001
        }],
        full_feature_names=True,
    )

    # Float features should still be floats from the online store...
    assert (response.proto.results[list(
        response.proto.metadata.feature_names.val).index(
            "driver_hourly_stats__conv_rate")].values[0].float_val > 0)

    result = response.to_dict()
    assert len(result) == 5
    assert "driver_hourly_stats__avg_daily_trips" in result
    assert "driver_hourly_stats__conv_rate" in result
    assert (abs(result["driver_hourly_stats__conv_rate"][0] -
                _get_last_feature_row(driver_df, 1001, max_date)["conv_rate"])
            < 0.01)
    assert "global_daily_stats__num_rides" in result
    assert "global_daily_stats__avg_ride_length" in result
Example #4
def _assert_online_features(store: FeatureStore, driver_df: pd.DataFrame,
                            max_date: datetime):
    """Assert that features in online store are up to date with `max_date` date."""
    # Read features back
    result = store.get_online_features(
        features=[
            "driver_hourly_stats:conv_rate",
            "driver_hourly_stats:avg_daily_trips",
            "global_daily_stats:num_rides",
            "global_daily_stats:avg_ride_length",
        ],
        entity_rows=[{
            "driver_id": 1001
        }],
        full_feature_names=True,
    ).to_dict()

    assert len(result) == 5
    assert "driver_hourly_stats__avg_daily_trips" in result
    assert "driver_hourly_stats__conv_rate" in result
    assert (abs(result["driver_hourly_stats__conv_rate"][0] -
                _get_last_feature_row(driver_df, 1001, max_date)["conv_rate"])
            < 0.01)
    assert "global_daily_stats__num_rides" in result
    assert "global_daily_stats__avg_ride_length" in result
Example #5
def check_offline_and_online_features(
    fs: FeatureStore,
    fv: FeatureView,
    driver_id: int,
    event_timestamp: datetime,
    expected_value: Optional[float],
    full_feature_names: bool,
    check_offline_store: bool = True,
) -> None:
    # Check online store
    response_dict = fs.get_online_features(
        [f"{fv.name}:value"],
        [{
            "driver_id": driver_id
        }],
        full_feature_names=full_feature_names,
    ).to_dict()

    if full_feature_names:
        if expected_value:
            assert (
                abs(response_dict[f"{fv.name}__value"][0] - expected_value) <
                1e-6), f"Response: {response_dict}, Expected: {expected_value}"
        else:
            assert response_dict[f"{fv.name}__value"][0] is None
    else:
        if expected_value:
            assert (abs(response_dict["value"][0] - expected_value) < 1e-6
                    ), f"Response: {response_dict}, Expected: {expected_value}"
        else:
            assert response_dict["value"][0] is None

    # Check offline store
    if check_offline_store:
        df = fs.get_historical_features(
            entity_df=pd.DataFrame.from_dict({
                "driver_id": [driver_id],
                "event_timestamp": [event_timestamp]
            }),
            features=[f"{fv.name}:value"],
            full_feature_names=full_feature_names,
        ).to_df()

        if full_feature_names:
            if expected_value:
                assert (abs(
                    df.to_dict(orient="list")[f"{fv.name}__value"][0] -
                    expected_value) < 1e-6)
            else:
                assert not df.to_dict(
                    orient="list")[f"{fv.name}__value"] or math.isnan(
                        df.to_dict(orient="list")[f"{fv.name}__value"][0])
        else:
            if expected_value:
                assert (abs(
                    df.to_dict(orient="list")["value"][0] - expected_value) <
                        1e-6)
            else:
                assert not df.to_dict(orient="list")["value"] or math.isnan(
                    df.to_dict(orient="list")["value"][0])
Example #6
class DriverRankingModel:
    def __init__(self):
        # Load model
        self.model = load("driver_model.bin")

        # Set up feature store
        self.fs = FeatureStore(repo_path="driver_ranking/")

    def predict(self, driver_ids):
        # Read features from Feast
        driver_features = self.fs.get_online_features(
            entity_rows=[{
                "driver_id": driver_id
            } for driver_id in driver_ids],
            feature_refs=[
                "driver_hourly_stats:conv_rate",
                "driver_hourly_stats:acc_rate",
                "driver_hourly_stats:avg_daily_trips",
            ],
        )
        features_df = pd.DataFrame.from_dict(driver_features.to_dict())

        # Make prediction
        features_df["prediction"] = self.model.predict(features_df)

        # Choose best driver
        best_driver_id = features_df["driver_id"].iloc[
            features_df["prediction"].argmax()]

        # return best driver
        return best_driver_id
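A sketch of how the class above might be wired up, assuming `load` comes from joblib and that a trained model file plus a Feast repository exist under driver_ranking/ (these are assumptions, not shown in the excerpt):

# Assumed module-level imports for DriverRankingModel
import pandas as pd
from joblib import load  # presumed source of `load`
from feast import FeatureStore

if __name__ == "__main__":
    model = DriverRankingModel()
    best_driver_id = model.predict([1001, 1002, 1003])
    print(f"Best driver: {best_driver_id}")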
Example #7
File: test.py  Project: feast-dev/feast
def main():
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Load the feature store from the current path
    fs = FeatureStore(repo_path=".")

    # Deploy the feature store to Snowflake
    print("Deploying feature store to Snowflake...")
    fs.apply([driver, driver_stats_fv])

    # Select features
    features = [
        "driver_hourly_stats:conv_rate", "driver_hourly_stats:acc_rate"
    ]

    # Create an entity dataframe. This is the dataframe that will be enriched with historical features
    entity_df = pd.DataFrame({
        "event_timestamp": [
            pd.Timestamp(dt, unit="ms", tz="UTC").round("ms")
            for dt in pd.date_range(
                start=datetime.now() - timedelta(days=3),
                end=datetime.now(),
                periods=3,
            )
        ],
        "driver_id": [1001, 1002, 1003],
    })

    print("Retrieving training data...")

    # Retrieve historical features by joining the entity dataframe to the Snowflake table source
    training_df = fs.get_historical_features(features=features,
                                             entity_df=entity_df).to_df()

    print()
    print(training_df)

    print()
    print("Loading features into the online store...")
    fs.materialize_incremental(end_date=datetime.now())

    print()
    print("Retrieving online features...")

    # Retrieve features from the online store
    online_features = fs.get_online_features(
        features=features,
        entity_rows=[{
            "driver_id": 1001
        }, {
            "driver_id": 1002
        }],
    ).to_dict()

    print()
    print(pd.DataFrame.from_dict(online_features))
Example #8
def run_demo():
    store = FeatureStore(repo_path=".")

    print("--- Historical features (from saved dataset) ---")
    ds = store.get_saved_dataset("my_training_ds")
    print(ds.to_df())

    print("\n--- Online features ---")
    features = store.get_online_features(
        features=store.get_feature_service("credit_score_v3"),
        entity_rows=[
            {"zipcode": 30721, "dob_ssn": "19530219_5179", "transaction_amt": 1023}
        ],
    ).to_dict()
    for key, value in sorted(features.items()):
        print(key, " : ", value)
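Unlike the earlier examples, which pass a list of feature reference strings, this one hands get_online_features a FeatureService fetched from the registry. A hedged sketch of how such a service might be registered (the grouped feature views are illustrative names, not the original repo's definitions):

from feast import FeatureService

# Hypothetical grouping for illustration
credit_score_v3 = FeatureService(
    name="credit_score_v3",
    features=[credit_history_fv, zipcode_features_fv],
)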
Example #9
def _assert_online_features(store: FeatureStore, driver_df: pd.DataFrame,
                            max_date: datetime):
    """Assert that features in online store are up to date with `max_date` date."""
    # Read features back
    result = store.get_online_features(
        feature_refs=[
            "driver_hourly_stats:conv_rate",
            "driver_hourly_stats:avg_daily_trips",
        ],
        entity_rows=[{
            "driver_id": 1001
        }],
    )

    assert "driver_hourly_stats__avg_daily_trips" in result.to_dict()

    assert "driver_hourly_stats__conv_rate" in result.to_dict()
    assert (abs(result.to_dict()["driver_hourly_stats__conv_rate"][0] -
                _get_last_feature_row(driver_df, 1001, max_date)["conv_rate"])
            < 0.01)
Example #10
def test_online() -> None:
    """
    Test reading from the online store in local mode.
    """
    runner = CliRunner()
    with runner.local_repo(
            get_example_repo("example_feature_repo_1.py")) as store:
        # Write some data to three feature view tables

        driver_locations_fv = store.get_feature_view(name="driver_locations")
        customer_profile_fv = store.get_feature_view(name="customer_profile")
        customer_driver_combined_fv = store.get_feature_view(
            name="customer_driver_combined")

        provider = store._get_provider()

        driver_key = EntityKeyProto(join_keys=["driver"],
                                    entity_values=[ValueProto(int64_val=1)])
        provider.online_write_batch(
            project=store.config.project,
            table=driver_locations_fv,
            data=[(
                driver_key,
                {
                    "lat": ValueProto(double_val=0.1),
                    "lon": ValueProto(string_val="1.0"),
                },
                datetime.utcnow(),
                datetime.utcnow(),
            )],
            progress=None,
        )

        customer_key = EntityKeyProto(join_keys=["customer"],
                                      entity_values=[ValueProto(int64_val=5)])
        provider.online_write_batch(
            project=store.config.project,
            table=customer_profile_fv,
            data=[(
                customer_key,
                {
                    "avg_orders_day": ValueProto(float_val=1.0),
                    "name": ValueProto(string_val="John"),
                    "age": ValueProto(int64_val=3),
                },
                datetime.utcnow(),
                datetime.utcnow(),
            )],
            progress=None,
        )

        customer_key = EntityKeyProto(
            join_keys=["customer", "driver"],
            entity_values=[ValueProto(int64_val=5),
                           ValueProto(int64_val=1)],
        )
        provider.online_write_batch(
            project=store.config.project,
            table=customer_driver_combined_fv,
            data=[(
                customer_key,
                {
                    "trips": ValueProto(int64_val=7)
                },
                datetime.utcnow(),
                datetime.utcnow(),
            )],
            progress=None,
        )

        # Retrieve four features for two entity rows (both referencing the same valid keys)
        result = store.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }, {
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()

        assert "driver_locations__lon" in result
        assert "customer_profile__avg_orders_day" in result
        assert "customer_profile__name" in result
        assert result["driver"] == [1, 1]
        assert result["customer"] == [5, 5]
        assert result["driver_locations__lon"] == ["1.0", "1.0"]
        assert result["customer_profile__avg_orders_day"] == [1.0, 1.0]
        assert result["customer_profile__name"] == ["John", "John"]
        assert result["customer_driver_combined__trips"] == [7, 7]

        # Ensure features are still in result when keys not found
        result = store.get_online_features(
            feature_refs=["customer_driver_combined:trips"],
            entity_rows=[{
                "driver": 0,
                "customer": 0
            }],
        ).to_dict()

        assert "customer_driver_combined__trips" in result

        # invalid table reference
        with pytest.raises(FeatureViewNotFoundException):
            store.get_online_features(
                feature_refs=["driver_locations_bad:lon"],
                entity_rows=[{
                    "driver": 1
                }],
            )

        # Create new FeatureStore object with fast cache invalidation
        cache_ttl = 1
        fs_fast_ttl = FeatureStore(config=RepoConfig(
            registry=RegistryConfig(path=store.config.registry,
                                    cache_ttl_seconds=cache_ttl),
            online_store=store.config.online_store,
            project=store.config.project,
            provider=store.config.provider,
        ))

        # The first lookup downloads the registry and caches it (the cache expires after cache_ttl seconds)
        result = fs_fast_ttl.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()
        assert result["driver_locations__lon"] == ["1.0"]
        assert result["customer_driver_combined__trips"] == [7]

        # Rename registry.db so that it can't be used for refreshes
        os.rename(store.config.registry, store.config.registry + "_fake")

        # Wait for registry to expire
        time.sleep(cache_ttl)

        # Will try to reload the registry because the cache has expired (it fails because the registry file was renamed away)
        with pytest.raises(FileNotFoundError):
            fs_fast_ttl.get_online_features(
                feature_refs=[
                    "driver_locations:lon",
                    "customer_profile:avg_orders_day",
                    "customer_profile:name",
                    "customer_driver_combined:trips",
                ],
                entity_rows=[{
                    "driver": 1,
                    "customer": 5
                }],
            ).to_dict()

        # Restore registry.db so that we can see if it actually reloads registry
        os.rename(store.config.registry + "_fake", store.config.registry)

        # Test if registry is actually reloaded and whether results return
        result = fs_fast_ttl.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()
        assert result["driver_locations__lon"] == ["1.0"]
        assert result["customer_driver_combined__trips"] == [7]

        # Create a registry with infinite cache (for users that want to manually refresh the registry)
        fs_infinite_ttl = FeatureStore(config=RepoConfig(
            registry=RegistryConfig(path=store.config.registry,
                                    cache_ttl_seconds=0),
            online_store=store.config.online_store,
            project=store.config.project,
            provider=store.config.provider,
        ))

        # Should return results (and fill the registry cache)
        result = fs_infinite_ttl.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()
        assert result["driver_locations__lon"] == ["1.0"]
        assert result["customer_driver_combined__trips"] == [7]

        # Wait a bit so that an arbitrary TTL would take effect
        time.sleep(2)

        # Rename registry.db so that it can't be used for refreshes
        os.rename(store.config.registry, store.config.registry + "_fake")

        # TTL is infinite so this method should use registry cache
        result = fs_infinite_ttl.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()
        assert result["driver_locations__lon"] == ["1.0"]
        assert result["customer_driver_combined__trips"] == [7]

        # Force registry reload (should fail because file is missing)
        with pytest.raises(FileNotFoundError):
            fs_infinite_ttl.refresh_registry()

        # Restore registry.db so that teardown works
        os.rename(store.config.registry + "_fake", store.config.registry)