Example #1
0
def check_offline_and_online_features(
    fs: FeatureStore,
    fv: FeatureView,
    driver_id: int,
    event_timestamp: datetime,
    expected_value: Optional[float],
    full_feature_names: bool,
    check_offline_store: bool = True,
) -> None:
    """Assert that the online store (and optionally the offline store)
    returns the expected ``value`` feature for ``driver_id``.

    Args:
        fs: Feature store to query.
        fv: Feature view whose ``value`` feature is checked.
        driver_id: Entity key to look up.
        event_timestamp: Timestamp for the point-in-time offline join.
        expected_value: Expected feature value; ``None`` means the value is
            expected to be absent (``None`` online, empty/NaN offline).
        full_feature_names: Whether response keys are prefixed with the
            feature view name (``<fv.name>__value`` vs plain ``value``).
        check_offline_store: Also verify via ``get_historical_features``.
    """
    # Response key depends on the full_feature_names flag; compute it once
    # instead of duplicating every assertion in both branches.
    feature_key = f"{fv.name}__value" if full_feature_names else "value"

    # --- Online store ---
    response_dict = fs.get_online_features(
        [f"{fv.name}:value"],
        [{"driver_id": driver_id}],
        full_feature_names=full_feature_names,
    ).to_dict()
    online_value = response_dict[feature_key][0]

    # BUGFIX: compare against None explicitly — the previous truthiness
    # check (`if expected_value:`) treated an expected value of 0.0 as
    # "expect missing", which contradicts the Optional[float] contract.
    if expected_value is not None:
        assert online_value is not None and abs(
            online_value - expected_value
        ) < 1e-6, f"Response: {response_dict}, Expected: {expected_value}"
    else:
        assert online_value is None

    # --- Offline store ---
    if check_offline_store:
        df = fs.get_historical_features(
            entity_df=pd.DataFrame.from_dict({
                "driver_id": [driver_id],
                "event_timestamp": [event_timestamp]
            }),
            features=[f"{fv.name}:value"],
            full_feature_names=full_feature_names,
        ).to_df()

        # Convert once; the original recomputed to_dict() up to three times.
        offline_values = df.to_dict(orient="list")[feature_key]
        if expected_value is not None:
            assert abs(offline_values[0] - expected_value) < 1e-6
        else:
            # A missing offline value surfaces as an empty column or NaN.
            assert not offline_values or math.isnan(offline_values[0])
Example #2
0
def load_historical_features(feature_store: FeatureStore) -> FlyteSchema:
    """Build an entity dataframe of hospital numbers and event timestamps,
    then fetch the matching historical features from the store."""
    # Most rows share one observation time; two were recorded later.
    base_ts = datetime(2021, 6, 25, 16, 36, 27)
    hospital_numbers = [
        "530101",
        "5290409",
        "5291329",
        "530051",
        "529518",
        "530101",
        "529340",
        "5290409",
        "530034",
    ]
    event_timestamps = [
        base_ts,
        base_ts,
        base_ts,
        base_ts,
        base_ts,
        datetime(2021, 7, 5, 11, 36, 1),
        base_ts,
        datetime(2021, 7, 5, 11, 50, 40),
        base_ts,
    ]
    entity_df = pd.DataFrame({
        "Hospital Number": hospital_numbers,
        "event_timestamp": event_timestamps,
    })

    return feature_store.get_historical_features(
        entity_df=entity_df, features=FEAST_FEATURES)  # noqa
Example #3
0
File: test.py  Project: feast-dev/feast
def main():
    """End-to-end Snowflake demo: apply the repo, pull training data,
    materialize to the online store, then read features back online."""
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Load the feature store from the current path
    fs = FeatureStore(repo_path=".")

    # Deploy the feature store to Snowflake
    print("Deploying feature store to Snowflake...")
    fs.apply([driver, driver_stats_fv])

    # Select features
    features = [
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
    ]

    # Entity dataframe to be enriched with historical features: three
    # evenly spaced timestamps over the last three days, one per driver.
    timestamps = [
        pd.Timestamp(dt, unit="ms", tz="UTC").round("ms")
        for dt in pd.date_range(
            start=datetime.now() - timedelta(days=3),
            end=datetime.now(),
            periods=3,
        )
    ]
    entity_df = pd.DataFrame({
        "event_timestamp": timestamps,
        "driver_id": [1001, 1002, 1003],
    })

    print("Retrieving training data...")

    # Point-in-time join of the entity dataframe against the Snowflake source.
    training_df = fs.get_historical_features(
        features=features, entity_df=entity_df
    ).to_df()

    print()
    print(training_df)

    print()
    print("Loading features into the online store...")
    fs.materialize_incremental(end_date=datetime.now())

    print()
    print("Retrieving online features...")

    # Read the latest feature values back from the online store.
    online_features = fs.get_online_features(
        features=features,
        entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}],
    ).to_dict()

    print()
    print(pd.DataFrame.from_dict(online_features))
Example #4
0
def generate_saved_dataset():
    """Create the "my_training_ds" saved dataset from the credit_score_v1
    feature service, persisted to Parquet and validated by the profiler."""
    store = FeatureStore(repo_path=".")
    loan_entities = pd.read_parquet(path="data/loan_table.parquet")

    # Resolve the feature service, run the historical retrieval, and
    # register the result as a reusable saved dataset.
    feature_service = store.get_feature_service("credit_score_v1")
    retrieval_job = store.get_historical_features(
        entity_df=loan_entities,
        features=feature_service,
    )
    store.create_saved_dataset(
        from_=retrieval_job,
        name="my_training_ds",
        storage=SavedDatasetFileStorage(path="my_training_ds.parquet"),
        feature_service=feature_service,
        profiler=credit_profiler,
    )
Example #5
0
def get_historical_features():
    """Retrieve historical features for training."""
    # Entities to pull data for (should dynamically read this from somewhere)
    project_ids = [1, 2, 3]
    today = datetime.now()
    # Truncate to midnight so every entity shares the same event timestamp.
    midnight = datetime(today.year, today.month, today.day)
    entity_df = pd.DataFrame.from_dict({
        "id": project_ids,
        "event_timestamp": [midnight] * len(project_ids),
    })

    # Get historical features
    store = FeatureStore(repo_path=Path(config.BASE_DIR, "features"))
    training_df = store.get_historical_features(
        entity_df=entity_df,
        feature_refs=["project_details:text", "project_details:tags"],
    ).to_df()

    # Store in location for training task to pick up
    print(training_df.head())
Example #6
0
from sklearn.linear_model import LinearRegression

import helpers

# Load driver order data (tab-separated) and parse timestamps.
orders = pd.read_csv("driver_orders.csv", sep="\t")
orders["event_timestamp"] = pd.to_datetime(orders["event_timestamp"])

# Set up feature store
fs = FeatureStore(repo_path="driver_ranking/")

# Retrieve training data from BigQuery
training_df = fs.get_historical_features(
    entity_df=orders,
    feature_refs=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
    ],
).to_df()

# Train a linear model: every column except the label and the timestamp
# is used as a feature.
target = "trip_completed"
feature_columns = training_df.columns.drop(target).drop("event_timestamp")
train_X = training_df[feature_columns]
train_Y = training_df.loc[:, target]

reg = LinearRegression()
reg.fit(train_X, train_Y)

# Persist the fitted model for serving.
dump(reg, "driver_model.bin")
Example #7
0
    "user_id": [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008],
    "event_timestamp": [
        datetime(2021, 4, 21, 17, 58, 9),
        datetime(2021, 4, 21, 17, 58, 9),
        datetime(2021, 4, 21, 17, 58, 9),
        datetime(2021, 4, 21, 17, 58, 9),
        datetime(2021, 4, 21, 17, 58, 9),
        datetime(2021, 4, 21, 17, 58, 9),
        datetime(2021, 4, 21, 17, 58, 9),
        datetime(2021, 4, 21, 17, 58, 9),
    ]
})
# Point-in-time join for the transaction features, then report how long
# the retrieval took.
transaction_features = [
    'driver_hourly_stats:daily_transactions',
    'driver_hourly_stats:total_transactions',
]
training_df = store.get_historical_features(
    entity_df=entity_df,
    feature_refs=transaction_features,
).to_df()
print(training_df)

executionTime = time.time() - startTime
print('Execution time in seconds: ' + str(executionTime))

startTime = time.time()
entity_df = pd.DataFrame.from_dict({
    "user_id": [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008],
    "event_timestamp": [
        datetime(2021, 4, 24, 17, 58, 9),
        datetime(2021, 4, 24, 17, 58, 9),
        datetime(2021, 4, 24, 17, 58, 9),
        datetime(2021, 4, 24, 17, 58, 9),
Example #8
0
# Entity rows: four drivers, each observed at its own event time.
driver_ids = [1001, 1002, 1003, 1004]
event_times = [
    datetime(2021, 4, 12, 10, 59, 42),
    datetime(2021, 4, 12, 8, 12, 10),
    datetime(2021, 4, 12, 16, 40, 26),
    datetime(2021, 4, 12, 15, 1, 12),
]
entity_df = pd.DataFrame.from_dict({
    "driver_id": driver_ids,
    "event_timestamp": event_times,
})

store = FeatureStore(repo_path="feast_repo")

# Point-in-time join of the hourly driver stats onto the entity rows.
training_df = store.get_historical_features(
    entity_df=entity_df,
    feature_refs=[
        'driver_hourly_stats:conv_rate',
        'driver_hourly_stats:acc_rate',
        'driver_hourly_stats:avg_daily_trips',
    ],
).to_df()

print(training_df.head())

# another feature store

store = FeatureStore(repo_path="feature_transaction")
customer_df = pd.read_parquet("customers.parquet")

training_df = store.get_historical_features(
    entity_df=customer_df, 
    feature_refs = [
        'customer_transactions:total_transactions',