Example #1
def test_push_features_and_read(environment, universal_data_sources):
    store = environment.feature_store

    (_, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    store.apply([driver(), customer(), location(), *feature_views.values()])
    data = {
        "location_id": [1],
        "temperature": [4],
        "event_timestamp":
        [pd.Timestamp(datetime.datetime.utcnow()).round("ms")],
        "created": [pd.Timestamp(datetime.datetime.utcnow()).round("ms")],
    }
    df_ingest = pd.DataFrame(data)

    store.push("location_stats_push_source", df_ingest)

    online_resp = store.get_online_features(
        features=["pushable_location_stats:temperature"],
        entity_rows=[{
            "location_id": 1
        }],
    )
    online_resp_dict = online_resp.to_dict()
    assert online_resp_dict["location_id"] == [1]
    assert online_resp_dict["temperature"] == [4]
Example #2
def test_historical_features_with_missing_request_data(environment,
                                                       universal_data_sources,
                                                       full_feature_names):
    store = environment.feature_store

    (_, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    store.apply([driver(), customer(), location(), *feature_views.values()])

    # If request data needed for an on demand transform is missing, an error should be raised
    with pytest.raises(RequestDataNotFoundInEntityDfException):
        store.get_historical_features(
            entity_df=datasets.entity_df,
            features=[
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "conv_rate_plus_100:conv_rate_plus_100",
                "conv_rate_plus_100:conv_rate_plus_val_to_add",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
                "field_mapping:feature_name",
            ],
            full_feature_names=full_feature_names,
        )
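The exception is expected because conv_rate_plus_100 is an on demand feature view depending on request data (val_to_add) that is absent from datasets.entity_df. A trimmed sketch of how such a view is plausibly declared, assuming a recent Feast API (driver_stats_fv stands in for the driver stats feature view defined elsewhere):

import pandas as pd

from feast import Field, RequestSource
from feast.on_demand_feature_view import on_demand_feature_view
from feast.types import Float64, Int64

# Request data that must appear in the entity df (or entity rows online).
val_to_add_request = RequestSource(
    name="vals_to_add",
    schema=[Field(name="val_to_add", dtype=Int64)],
)

@on_demand_feature_view(
    sources=[driver_stats_fv, val_to_add_request],  # driver_stats_fv: assumed
    schema=[
        Field(name="conv_rate_plus_100", dtype=Float64),
        Field(name="conv_rate_plus_val_to_add", dtype=Float64),
    ],
)
def conv_rate_plus_100(inputs: pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame()
    df["conv_rate_plus_100"] = inputs["conv_rate"] + 100
    df["conv_rate_plus_val_to_add"] = inputs["conv_rate"] + inputs["val_to_add"]
    return df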
Example #3
def test_historical_retrieval_with_validation(environment,
                                              universal_data_sources):
    store = environment.feature_store
    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)
    store.apply([driver(), customer(), location(), *feature_views.values()])

    # Create two identical retrieval jobs
    entity_df = datasets.entity_df.drop(
        columns=["order_id", "origin_id", "destination_id"])
    reference_job = store.get_historical_features(
        entity_df=entity_df,
        features=_features,
    )
    job = store.get_historical_features(
        entity_df=entity_df,
        features=_features,
    )

    # Save dataset using reference job and retrieve it
    store.create_saved_dataset(
        from_=reference_job,
        name="my_training_dataset",
        storage=environment.data_source_creator.create_saved_dataset_destination(),
    )
    saved_dataset = store.get_saved_dataset("my_training_dataset")

    # If validation passes, no exception is raised at this point
    reference = saved_dataset.as_reference(profiler=configurable_profiler)
    job.to_df(validation_reference=reference)
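configurable_profiler comes from the test suite; the pattern, assuming Feast's ge_profiler wrapper around great_expectations, is roughly:

from great_expectations.core import ExpectationSuite
from great_expectations.dataset import PandasDataset
from great_expectations.profile.user_configurable_profiler import (
    UserConfigurableProfiler,
)

from feast.dqm.profilers.ge_profiler import ge_profiler

@ge_profiler
def configurable_profiler(ds: PandasDataset) -> ExpectationSuite:
    # Derive loose expectations from the saved dataset itself, so an
    # identical retrieval job passes validation.
    return UserConfigurableProfiler(
        profile_dataset=ds,
        ignored_columns=["event_timestamp"],
        excluded_expectations=["expect_table_row_count_to_be_between"],
    ).build_suite()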
Example #4
def create_location_stats_feature_view(source, infer_features: bool = False):
    location_stats_feature_view = FeatureView(
        name="location_stats",
        entities=[location()],
        schema=(None if infer_features else
                [Field(name="temperature", dtype=Int32)]),
        source=source,
        ttl=timedelta(days=2),
    )
    return location_stats_feature_view
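A brief usage sketch: with infer_features=True the schema is left as None so Feast infers it from the source; otherwise the single temperature field is declared explicitly (the source argument here is a placeholder):

location_stats_fv = create_location_stats_feature_view(
    source=location_stats_push_source,  # placeholder: any batch or push source
    infer_features=True,
)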
Example #5

def test_online_retrieval(environment, universal_data_sources, benchmark):
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]],
            feature_views["driver_odfv"]
        ],
    )

    feast_objects = []
    feast_objects.extend(feature_views.values())
    feast_objects.extend([driver(), customer(), location(), feature_service])
    fs.apply(feast_objects)
    fs.materialize(environment.start_date, environment.end_date)

    sample_drivers = random.sample(entities["driver"], 10)

    sample_customers = random.sample(entities["customer"], 10)

    entity_rows = [{
        "driver": d,
        "customer_id": c,
        "val_to_add": 50
    } for (d, c) in zip(sample_drivers, sample_customers)]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
    ]
    unprefixed_feature_refs = [
        f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f
    ]
    # Remove the on demand feature view output features, since they're not present in the source dataframe
    unprefixed_feature_refs.remove("conv_rate_plus_100")
    unprefixed_feature_refs.remove("conv_rate_plus_val_to_add")

    benchmark(
        fs.get_online_features,
        features=feature_refs,
        entity_rows=entity_rows,
    )
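benchmark here is presumably pytest-benchmark's fixture: benchmark(fn, *args, **kwargs) calls fn repeatedly and records timing statistics. A rough stdlib stand-in for what it measures:

import timeit

# Time 10 repeated online retrievals with fixed arguments (a crude stand-in
# for the statistics pytest-benchmark collects).
elapsed = timeit.timeit(
    lambda: fs.get_online_features(features=feature_refs, entity_rows=entity_rows),
    number=10,
)
print(f"10 retrievals took {elapsed:.3f}s")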
Example #6
def setup_python_fs_client():
    config = IntegrationTestRepoConfig()
    environment = construct_test_environment(config)
    fs = environment.feature_store
    try:
        entities, datasets, data_sources = construct_universal_test_data(
            environment)
        feature_views = construct_universal_feature_views(data_sources)
        feast_objects: List[FeastObject] = []
        feast_objects.extend(feature_views.values())
        feast_objects.extend([driver(), customer(), location()])
        fs.apply(feast_objects)
        fs.materialize(environment.start_date, environment.end_date)
        client = TestClient(get_app(fs))
        yield client
    finally:
        fs.teardown()
        environment.data_source_creator.teardown()
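A sketch of how the generator above is typically consumed: wrapped as a pytest fixture and exercised through the feature server's /get-online-features endpoint (entity values are illustrative):

import json

import pytest

python_fs_client = pytest.fixture(setup_python_fs_client)

def test_get_online_features(python_fs_client):
    request = {
        "features": ["driver_stats:conv_rate"],
        "entities": {"driver_id": [5001]},  # illustrative entity value
    }
    response = python_fs_client.post("/get-online-features",
                                     data=json.dumps(request))
    assert response.status_code == 200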
Example #7
def test_historical_retrieval_fails_on_validation(environment,
                                                  universal_data_sources):
    store = environment.feature_store

    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    store.apply([driver(), customer(), location(), *feature_views.values()])

    entity_df = datasets.entity_df.drop(
        columns=["order_id", "origin_id", "destination_id"])

    reference_job = store.get_historical_features(
        entity_df=entity_df,
        features=_features,
    )

    store.create_saved_dataset(
        from_=reference_job,
        name="my_other_dataset",
        storage=environment.data_source_creator.create_saved_dataset_destination(),
    )

    job = store.get_historical_features(
        entity_df=entity_df,
        features=_features,
    )

    with pytest.raises(ValidationFailed) as exc_info:
        reference = store.get_saved_dataset("my_other_dataset").as_reference(
            profiler=profiler_with_unrealistic_expectations)
        job.to_df(validation_reference=reference)

    failed_expectations = exc_info.value.report.errors
    assert len(failed_expectations) == 2

    assert failed_expectations[0].check_name == "expect_column_max_to_be_between"
    assert failed_expectations[0].column_name == "current_balance"

    assert failed_expectations[1].check_name == "expect_column_values_to_be_in_set"
    assert failed_expectations[1].column_name == "avg_passenger_count"
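The two failed expectations asserted above imply a profiler along these lines: deliberately impossible constraints on current_balance and avg_passenger_count (the exact bounds are assumptions):

from great_expectations.core import ExpectationSuite
from great_expectations.dataset import PandasDataset

from feast.dqm.profilers.ge_profiler import ge_profiler

@ge_profiler
def profiler_with_unrealistic_expectations(ds: PandasDataset) -> ExpectationSuite:
    # Bounds no real balance can satisfy, so validation must fail.
    ds.expect_column_max_to_be_between("current_balance", -1000, -900)
    # No sampled passenger count is 0, so this fails as well.
    ds.expect_column_values_to_be_in_set("avg_passenger_count", value_set={0})
    return ds.get_expectation_suite()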
Example #8
def test_online_retrieval(environment, universal_data_sources,
                          full_feature_names):
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views.driver[["conv_rate"]],
            feature_views.driver_odfv,
            feature_views.customer[["current_balance"]],
        ],
    )
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            feature_views.location.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}),
            feature_views.location.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}),
        ],
    )

    feast_objects = []
    feast_objects.extend(feature_views.values())
    feast_objects.extend([
        driver(),
        customer(),
        location(),
        feature_service,
        feature_service_entity_mapping,
    ])
    fs.apply(feast_objects)
    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )

    entity_sample = datasets.orders_df.sample(10)[[
        "customer_id", "driver_id", "order_id", "event_timestamp"
    ]]
    orders_df = datasets.orders_df[(
        datasets.orders_df["customer_id"].isin(entity_sample["customer_id"])
        & datasets.orders_df["driver_id"].isin(entity_sample["driver_id"]))]

    sample_drivers = entity_sample["driver_id"]
    drivers_df = datasets.driver_df[datasets.driver_df["driver_id"].isin(
        sample_drivers)]

    sample_customers = entity_sample["customer_id"]
    customers_df = datasets.customer_df[
        datasets.customer_df["customer_id"].isin(sample_customers)]

    location_pairs = np.array(
        list(itertools.permutations(entities.location_vals, 2)))
    sample_location_pairs = location_pairs[np.random.choice(
        len(location_pairs), 10)].T.tolist()
    origins_df = datasets.location_df[datasets.location_df["location_id"].isin(
        sample_location_pairs[0])]
    destinations_df = datasets.location_df[
        datasets.location_df["location_id"].isin(sample_location_pairs[1])]

    global_df = datasets.global_df

    entity_rows = [{
        "driver_id": d,
        "customer_id": c,
        "val_to_add": 50
    } for (d, c) in zip(sample_drivers, sample_customers)]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
    ]
    unprefixed_feature_refs = [
        f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f
    ]
    # Remove the on demand feature view output features, since they're not present in the source dataframe
    unprefixed_feature_refs.remove("conv_rate_plus_100")
    unprefixed_feature_refs.remove("conv_rate_plus_val_to_add")

    online_features_dict = get_online_features_dict(
        environment=environment,
        features=feature_refs,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )

    # Test that the on demand feature views compute properly even if the dependent conv_rate
    # feature isn't requested.
    online_features_no_conv_rate = get_online_features_dict(
        environment=environment,
        features=[
            ref for ref in feature_refs if ref != "driver_stats:conv_rate"
        ],
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )

    assert online_features_no_conv_rate is not None

    keys = set(online_features_dict.keys())
    expected_keys = set(
        f.replace(":", "__") if full_feature_names else f.split(":")[-1]
        for f in feature_refs) | {"customer_id", "driver_id"}
    assert (
        keys == expected_keys
    ), f"Response keys are different from expected: {keys - expected_keys} (extra) and {expected_keys - keys} (missing)"

    tc = unittest.TestCase()
    for i, entity_row in enumerate(entity_rows):
        df_features = get_latest_feature_values_from_dataframes(
            driver_df=drivers_df,
            customer_df=customers_df,
            orders_df=orders_df,
            global_df=global_df,
            entity_row=entity_row,
        )

        assert df_features["customer_id"] == online_features_dict[
            "customer_id"][i]
        assert df_features["driver_id"] == online_features_dict["driver_id"][i]
        tc.assertAlmostEqual(
            online_features_dict[response_feature_name("conv_rate_plus_100",
                                                       feature_refs,
                                                       full_feature_names)][i],
            df_features["conv_rate"] + 100,
            delta=0.0001,
        )
        tc.assertAlmostEqual(
            online_features_dict[response_feature_name(
                "conv_rate_plus_val_to_add", feature_refs,
                full_feature_names)][i],
            df_features["conv_rate"] + df_features["val_to_add"],
            delta=0.0001,
        )
        for unprefixed_feature_ref in unprefixed_feature_refs:
            tc.assertAlmostEqual(
                df_features[unprefixed_feature_ref],
                online_features_dict[response_feature_name(
                    unprefixed_feature_ref, feature_refs,
                    full_feature_names)][i],
                delta=0.0001,
            )

    # Check what happens for missing values
    missing_responses_dict = get_online_features_dict(
        environment=environment,
        features=feature_refs,
        entity_rows=[{
            "driver_id": 0,
            "customer_id": 0,
            "val_to_add": 100
        }],
        full_feature_names=full_feature_names,
    )
    assert missing_responses_dict is not None
    for unprefixed_feature_ref in unprefixed_feature_refs:
        if unprefixed_feature_ref not in {"num_rides", "avg_ride_length"}:
            tc.assertIsNone(missing_responses_dict[response_feature_name(
                unprefixed_feature_ref, feature_refs, full_feature_names)][0])

    # Check what happens for missing request data
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        get_online_features_dict(
            environment=environment,
            features=feature_refs,
            entity_rows=[{
                "driver_id": 0,
                "customer_id": 0
            }],
            full_feature_names=full_feature_names,
        )

    assert_feature_service_correctness(
        environment,
        feature_service,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        global_df,
    )

    entity_rows = [{
        "origin_id": origin,
        "destination_id": destination
    } for (_driver, _customer, origin, destination
           ) in zip(sample_drivers, sample_customers, *sample_location_pairs)]
    assert_feature_service_entity_mapping_correctness(
        environment,
        feature_service_entity_mapping,
        entity_rows,
        full_feature_names,
        origins_df,
        destinations_df,
    )
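response_feature_name is a test helper; its behavior can be reconstructed from the expected-keys logic above (Example #12 uses a two-argument variant that omits feature_refs). A hedged sketch:

from typing import List

def response_feature_name(feature: str, feature_refs: List[str],
                          full_feature_names: bool) -> str:
    # With full_feature_names, resolve the short name against its feature
    # reference and return "<view>__<feature>"; otherwise keep the short name.
    if full_feature_names:
        for ref in feature_refs:
            view, _, name = ref.partition(":")
            if name == feature:
                return f"{view}__{name}"
    return feature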
Example #9
def test_historical_features_persisting(environment, universal_data_sources,
                                        full_feature_names):
    store = environment.feature_store

    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    store.apply([driver(), customer(), location(), *feature_views.values()])

    entity_df = datasets.entity_df.drop(
        columns=["order_id", "origin_id", "destination_id"])

    job = store.get_historical_features(
        entity_df=entity_df,
        features=[
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "field_mapping:feature_name",
        ],
        full_feature_names=full_feature_names,
    )

    saved_dataset = store.create_saved_dataset(
        from_=job,
        name="saved_dataset",
        storage=environment.data_source_creator.create_saved_dataset_destination(),
        tags={"env": "test"},
    )

    event_timestamp = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
    expected_df = get_expected_training_df(
        datasets.customer_df,
        feature_views.customer,
        datasets.driver_df,
        feature_views.driver,
        datasets.orders_df,
        feature_views.order,
        datasets.location_df,
        feature_views.location,
        datasets.global_df,
        feature_views.global_fv,
        datasets.field_mapping_df,
        feature_views.field_mapping,
        entity_df,
        event_timestamp,
        full_feature_names,
    ).drop(columns=[
        response_feature_name("conv_rate_plus_100", full_feature_names),
        response_feature_name("conv_rate_plus_100_rounded",
                              full_feature_names),
        response_feature_name("avg_daily_trips", full_feature_names),
        response_feature_name("conv_rate", full_feature_names),
        "origin__temperature",
        "destination__temperature",
    ])

    assert_frame_equal(
        expected_df,
        saved_dataset.to_df(),
        keys=[event_timestamp, "driver_id", "customer_id"],
    )

    assert_frame_equal(
        job.to_df(),
        saved_dataset.to_df(),
        keys=[event_timestamp, "driver_id", "customer_id"],
    )
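pandas' assert_frame_equal has no keys parameter, so the helper used here must be the suite's own order-insensitive wrapper. A plausible sketch, mirroring the inline sort/drop_duplicates pattern in Example #13:

from typing import List

import pandas as pd
from pandas.testing import assert_frame_equal as pd_assert_frame_equal

def assert_frame_equal(expected_df: pd.DataFrame, actual_df: pd.DataFrame,
                       keys: List[str]) -> None:
    # Sort both frames on the join keys so row order doesn't matter.
    expected_df = (expected_df.sort_values(by=keys).drop_duplicates()
                   .reset_index(drop=True))
    actual_df = (actual_df[expected_df.columns].sort_values(by=keys)
                 .drop_duplicates().reset_index(drop=True))
    pd_assert_frame_equal(expected_df, actual_df, check_dtype=False)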
Example #10
def test_historical_features_with_entities_from_query(environment,
                                                      universal_data_sources,
                                                      full_feature_names):
    store = environment.feature_store
    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    orders_table = table_name_from_data_source(data_sources.orders)
    if not orders_table:
        pytest.skip("Offline source is not sql-based")

    data_source_creator = environment.test_repo_config.offline_store_creator
    if data_source_creator.__name__ == SnowflakeDataSourceCreator.__name__:
        entity_df_query = f"""
        SELECT "customer_id", "driver_id", "order_id", "origin_id", "destination_id", "event_timestamp"
        FROM "{orders_table}"
        """
    else:
        entity_df_query = f"""
        SELECT customer_id, driver_id, order_id, origin_id, destination_id, event_timestamp
        FROM {orders_table}
        """

    store.apply([driver(), customer(), location(), *feature_views.values()])

    job_from_sql = store.get_historical_features(
        entity_df=entity_df_query,
        features=[
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "field_mapping:feature_name",
        ],
        full_feature_names=full_feature_names,
    )

    start_time = datetime.utcnow()
    actual_df_from_sql_entities = job_from_sql.to_df()
    end_time = datetime.utcnow()
    print(f"\nTime to execute job_from_sql.to_df() = '{end_time - start_time}'")

    event_timestamp = (DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
                       if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
                       in datasets.orders_df.columns else "e_ts")
    full_expected_df = get_expected_training_df(
        datasets.customer_df,
        feature_views.customer,
        datasets.driver_df,
        feature_views.driver,
        datasets.orders_df,
        feature_views.order,
        datasets.location_df,
        feature_views.location,
        datasets.global_df,
        feature_views.global_fv,
        datasets.field_mapping_df,
        feature_views.field_mapping,
        datasets.entity_df,
        event_timestamp,
        full_feature_names,
    )

    # The on demand transforms aren't requested with an entity_df query,
    # since request data can't be added to one
    expected_df_query = full_expected_df.drop(columns=[
        response_feature_name("conv_rate_plus_100", full_feature_names),
        response_feature_name("conv_rate_plus_100_rounded",
                              full_feature_names),
        response_feature_name("avg_daily_trips", full_feature_names),
        response_feature_name("conv_rate", full_feature_names),
        "origin__temperature",
        "destination__temperature",
    ])
    assert_frame_equal(
        expected_df_query,
        actual_df_from_sql_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )

    table_from_sql_entities = job_from_sql.to_arrow().to_pandas()
    for col in table_from_sql_entities.columns:
        expected_df_query[col] = expected_df_query[col].astype(
            table_from_sql_entities[col].dtype)

    assert_frame_equal(
        expected_df_query,
        table_from_sql_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )
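table_name_from_data_source is another suite helper; the skip above implies it returns a table name only for SQL-backed sources. A minimal sketch:

from typing import Optional

from feast.data_source import DataSource

def table_name_from_data_source(data_source: DataSource) -> Optional[str]:
    # SQL-backed sources (BigQuery, Redshift, Snowflake, ...) expose a table;
    # file-based sources don't, which triggers the pytest.skip above.
    if hasattr(data_source, "table_ref"):
        return data_source.table_ref
    if hasattr(data_source, "table"):
        return data_source.table
    return None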
Example #11
def test_historical_features(environment, universal_data_sources,
                             full_feature_names):
    store = environment.feature_store

    (entities, datasets, data_sources) = universal_data_sources

    feature_views = construct_universal_feature_views(data_sources)

    entity_df_with_request_data = datasets.entity_df.copy(deep=True)
    entity_df_with_request_data["val_to_add"] = [
        i for i in range(len(entity_df_with_request_data))
    ]
    entity_df_with_request_data["driver_age"] = [
        i + 100 for i in range(len(entity_df_with_request_data))
    ]

    feature_service = FeatureService(
        name="convrate_plus100",
        features=[
            feature_views.driver[["conv_rate"]], feature_views.driver_odfv
        ],
    )
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            feature_views.location.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}),
            feature_views.location.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}),
        ],
    )

    store.apply([
        driver(),
        customer(),
        location(),
        feature_service,
        feature_service_entity_mapping,
        *feature_views.values(),
    ])

    event_timestamp = (DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
                       if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
                       in datasets.orders_df.columns else "e_ts")
    full_expected_df = get_expected_training_df(
        datasets.customer_df,
        feature_views.customer,
        datasets.driver_df,
        feature_views.driver,
        datasets.orders_df,
        feature_views.order,
        datasets.location_df,
        feature_views.location,
        datasets.global_df,
        feature_views.global_fv,
        datasets.field_mapping_df,
        feature_views.field_mapping,
        entity_df_with_request_data,
        event_timestamp,
        full_feature_names,
    )

    # The shadow-entity features are only needed in the FeatureService test
    expected_df = full_expected_df.drop(
        columns=["origin__temperature", "destination__temperature"])

    job_from_df = store.get_historical_features(
        entity_df=entity_df_with_request_data,
        features=[
            "driver_stats:conv_rate",
            "driver_stats:avg_daily_trips",
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "conv_rate_plus_100:conv_rate_plus_100",
            "conv_rate_plus_100:conv_rate_plus_100_rounded",
            "conv_rate_plus_100:conv_rate_plus_val_to_add",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "field_mapping:feature_name",
        ],
        full_feature_names=full_feature_names,
    )

    start_time = datetime.utcnow()
    actual_df_from_df_entities = job_from_df.to_df()

    print(
        f"actual_df_from_df_entities shape: {actual_df_from_df_entities.shape}"
    )
    end_time = datetime.utcnow()
    print(f"Time to execute job_from_df.to_df() = '{end_time - start_time}'\n")

    assert sorted(expected_df.columns) == sorted(
        actual_df_from_df_entities.columns)
    assert_frame_equal(
        expected_df,
        actual_df_from_df_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )

    assert_feature_service_correctness(
        store,
        feature_service,
        full_feature_names,
        entity_df_with_request_data,
        expected_df,
        event_timestamp,
    )
    assert_feature_service_entity_mapping_correctness(
        store,
        feature_service_entity_mapping,
        full_feature_names,
        entity_df_with_request_data,
        full_expected_df,
        event_timestamp,
    )
    table_from_df_entities: pd.DataFrame = job_from_df.to_arrow().to_pandas()

    assert_frame_equal(
        expected_df,
        table_from_df_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )
Example #12
def test_online_retrieval(environment, universal_data_sources,
                          full_feature_names):

    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]],
            feature_views["driver_odfv"],
            feature_views["driver_age_request_fv"],
        ],
    )
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            feature_views["location"].with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}),
            feature_views["location"].with_name(
                "destination").with_join_key_map(
                    {"location_id": "destination_id"}),
        ],
    )

    feast_objects = []
    feast_objects.extend(feature_views.values())
    feast_objects.extend([
        driver(),
        customer(),
        location(),
        feature_service,
        feature_service_entity_mapping,
    ])
    fs.apply(feast_objects)
    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )

    entity_sample = datasets["orders"].sample(10)[[
        "customer_id", "driver_id", "order_id", "event_timestamp"
    ]]
    orders_df = datasets["orders"][(
        datasets["orders"]["customer_id"].isin(entity_sample["customer_id"])
        & datasets["orders"]["driver_id"].isin(entity_sample["driver_id"]))]

    sample_drivers = entity_sample["driver_id"]
    drivers_df = datasets["driver"][datasets["driver"]["driver_id"].isin(
        sample_drivers)]

    sample_customers = entity_sample["customer_id"]
    customers_df = datasets["customer"][datasets["customer"]
                                        ["customer_id"].isin(sample_customers)]

    location_pairs = np.array(
        list(itertools.permutations(entities["location"], 2)))
    sample_location_pairs = location_pairs[np.random.choice(
        len(location_pairs), 10)].T
    origins_df = datasets["location"][datasets["location"]["location_id"].isin(
        sample_location_pairs[0])]
    destinations_df = datasets["location"][
        datasets["location"]["location_id"].isin(sample_location_pairs[1])]

    global_df = datasets["global"]

    entity_rows = [{
        "driver": d,
        "customer_id": c,
        "val_to_add": 50,
        "driver_age": 25
    } for (d, c) in zip(sample_drivers, sample_customers)]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
        "driver_age:driver_age",
    ]
    unprefixed_feature_refs = [
        f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f
    ]
    # Remove the on demand feature view output features, since they're not present in the source dataframe
    unprefixed_feature_refs.remove("conv_rate_plus_100")
    unprefixed_feature_refs.remove("conv_rate_plus_val_to_add")

    online_features = fs.get_online_features(
        features=feature_refs,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )
    assert online_features is not None

    online_features_dict = online_features.to_dict()
    keys = online_features_dict.keys()
    assert (
        len(keys) == len(feature_refs) + 3
    )  # Three extra keys: the driver and customer entity keys plus the val_to_add request data.
    for feature in feature_refs:
        # full_feature_names does not apply to request feature views
        if full_feature_names and feature != "driver_age:driver_age":
            assert feature.replace(":", "__") in keys
        else:
            assert feature.rsplit(":", 1)[-1] in keys
            assert ("driver_stats" not in keys
                    and "customer_profile" not in keys and "order" not in keys
                    and "global_stats" not in keys)

    tc = unittest.TestCase()
    for i, entity_row in enumerate(entity_rows):
        df_features = get_latest_feature_values_from_dataframes(
            driver_df=drivers_df,
            customer_df=customers_df,
            orders_df=orders_df,
            global_df=global_df,
            entity_row=entity_row,
        )

        assert df_features["customer_id"] == online_features_dict[
            "customer_id"][i]
        assert df_features["driver_id"] == online_features_dict["driver_id"][i]
        tc.assertAlmostEqual(
            online_features_dict[response_feature_name("conv_rate_plus_100",
                                                       full_feature_names)][i],
            df_features["conv_rate"] + 100,
            delta=0.0001,
        )
        tc.assertAlmostEqual(
            online_features_dict[response_feature_name(
                "conv_rate_plus_val_to_add", full_feature_names)][i],
            df_features["conv_rate"] + df_features["val_to_add"],
            delta=0.0001,
        )
        for unprefixed_feature_ref in unprefixed_feature_refs:
            tc.assertAlmostEqual(
                df_features[unprefixed_feature_ref],
                online_features_dict[response_feature_name(
                    unprefixed_feature_ref, full_feature_names)][i],
                delta=0.0001,
            )

    # Check what happens for missing values
    missing_responses_dict = fs.get_online_features(
        features=feature_refs,
        entity_rows=[{
            "driver": 0,
            "customer_id": 0,
            "val_to_add": 100,
            "driver_age": 125
        }],
        full_feature_names=full_feature_names,
    ).to_dict()
    assert missing_responses_dict is not None
    for unprefixed_feature_ref in unprefixed_feature_refs:
        if unprefixed_feature_ref not in {
                "num_rides", "avg_ride_length", "driver_age"
        }:
            tc.assertIsNone(missing_responses_dict[response_feature_name(
                unprefixed_feature_ref, full_feature_names)][0])

    # Check what happens for missing request data
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        fs.get_online_features(
            features=feature_refs,
            entity_rows=[{
                "driver": 0,
                "customer_id": 0
            }],
            full_feature_names=full_feature_names,
        ).to_dict()

    # Also fails when only part of the request data is provided (val_to_add but not driver_age)
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        fs.get_online_features(
            features=feature_refs,
            entity_rows=[{
                "driver": 0,
                "customer_id": 0,
                "val_to_add": 20
            }],
            full_feature_names=full_feature_names,
        ).to_dict()

    assert_feature_service_correctness(
        fs,
        feature_service,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        global_df,
    )

    entity_rows = [{
        "driver": driver,
        "customer_id": customer,
        "origin_id": origin,
        "destination_id": destination,
    } for (driver, customer, origin, destination
           ) in zip(sample_drivers, sample_customers, *sample_location_pairs)]
    assert_feature_service_entity_mapping_correctness(
        fs,
        feature_service_entity_mapping,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        origins_df,
        destinations_df,
    )
Example #13
def test_historical_features(environment, universal_data_sources, full_feature_names):
    store = environment.feature_store

    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    customer_df, driver_df, location_df, orders_df, global_df, entity_df = (
        datasets["customer"],
        datasets["driver"],
        datasets["location"],
        datasets["orders"],
        datasets["global"],
        datasets["entity"],
    )
    entity_df_with_request_data = entity_df.copy(deep=True)
    entity_df_with_request_data["val_to_add"] = [
        i for i in range(len(entity_df_with_request_data))
    ]
    entity_df_with_request_data["driver_age"] = [
        i + 100 for i in range(len(entity_df_with_request_data))
    ]

    (
        customer_fv,
        driver_fv,
        driver_odfv,
        location_fv,
        order_fv,
        global_fv,
        driver_age_request_fv,
    ) = (
        feature_views["customer"],
        feature_views["driver"],
        feature_views["driver_odfv"],
        feature_views["location"],
        feature_views["order"],
        feature_views["global"],
        feature_views["driver_age_request_fv"],
    )

    feature_service = FeatureService(
        name="convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]],
            driver_odfv,
            driver_age_request_fv,
        ],
    )
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            location_fv.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}
            ),
            location_fv.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}
            ),
        ],
    )

    feast_objects = []
    feast_objects.extend(
        [
            customer_fv,
            driver_fv,
            driver_odfv,
            location_fv,
            order_fv,
            global_fv,
            driver_age_request_fv,
            driver(),
            customer(),
            location(),
            feature_service,
            feature_service_entity_mapping,
        ]
    )
    store.apply(feast_objects)

    entity_df_query = None
    orders_table = table_name_from_data_source(data_sources["orders"])
    if orders_table:
        entity_df_query = f"SELECT customer_id, driver_id, order_id, origin_id, destination_id, event_timestamp FROM {orders_table}"

    event_timestamp = (
        DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
        if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in orders_df.columns
        else "e_ts"
    )
    full_expected_df = get_expected_training_df(
        customer_df,
        customer_fv,
        driver_df,
        driver_fv,
        orders_df,
        order_fv,
        location_df,
        location_fv,
        global_df,
        global_fv,
        entity_df_with_request_data,
        event_timestamp,
        full_feature_names,
    )

    # The shadow-entity features are only needed in the FeatureService test
    expected_df = full_expected_df.drop(
        columns=["origin__temperature", "destination__temperature"],
    )

    if entity_df_query:
        job_from_sql = store.get_historical_features(
            entity_df=entity_df_query,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "order:order_is_success",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )

        start_time = datetime.utcnow()
        actual_df_from_sql_entities = job_from_sql.to_df()
        end_time = datetime.utcnow()
        print(f"\nTime to execute job_from_sql.to_df() = '{end_time - start_time}'")

        # The on demand transforms aren't requested with an entity_df query,
        # since request data can't be added to one
        expected_df_query = expected_df.drop(
            columns=[
                "conv_rate_plus_100",
                "conv_rate_plus_100_rounded",
                "val_to_add",
                "conv_rate_plus_val_to_add",
                "driver_age",
            ]
        )
        assert sorted(expected_df_query.columns) == sorted(
            actual_df_from_sql_entities.columns
        )

        actual_df_from_sql_entities = (
            actual_df_from_sql_entities[expected_df_query.columns]
            .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
            .drop_duplicates()
            .reset_index(drop=True)
        )
        expected_df_query = (
            expected_df_query.sort_values(
                by=[event_timestamp, "order_id", "driver_id", "customer_id"]
            )
            .drop_duplicates()
            .reset_index(drop=True)
        )

        assert_frame_equal(
            actual_df_from_sql_entities, expected_df_query, check_dtype=False,
        )

        table_from_sql_entities = job_from_sql.to_arrow()
        df_from_sql_entities = (
            table_from_sql_entities.to_pandas()[expected_df_query.columns]
            .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
            .drop_duplicates()
            .reset_index(drop=True)
        )

        for col in df_from_sql_entities.columns:
            expected_df_query[col] = expected_df_query[col].astype(
                df_from_sql_entities[col].dtype
            )

        assert_frame_equal(expected_df_query, df_from_sql_entities)

    job_from_df = store.get_historical_features(
        entity_df=entity_df_with_request_data,
        features=[
            "driver_stats:conv_rate",
            "driver_stats:avg_daily_trips",
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "conv_rate_plus_100:conv_rate_plus_100",
            "conv_rate_plus_100:conv_rate_plus_100_rounded",
            "conv_rate_plus_100:conv_rate_plus_val_to_add",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "driver_age:driver_age",
        ],
        full_feature_names=full_feature_names,
    )

    start_time = datetime.utcnow()
    actual_df_from_df_entities = job_from_df.to_df()

    print(f"actual_df_from_df_entities shape: {actual_df_from_df_entities.shape}")
    end_time = datetime.utcnow()
    print(str(f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n"))

    assert sorted(expected_df.columns) == sorted(actual_df_from_df_entities.columns)
    expected_df: pd.DataFrame = (
        expected_df.sort_values(
            by=[event_timestamp, "order_id", "driver_id", "customer_id"]
        )
        .drop_duplicates()
        .reset_index(drop=True)
    )
    actual_df_from_df_entities = (
        actual_df_from_df_entities[expected_df.columns]
        .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
        .drop_duplicates()
        .reset_index(drop=True)
    )

    assert_frame_equal(
        expected_df, actual_df_from_df_entities, check_dtype=False,
    )
    assert_feature_service_correctness(
        store,
        feature_service,
        full_feature_names,
        entity_df_with_request_data,
        expected_df,
        event_timestamp,
    )
    assert_feature_service_entity_mapping_correctness(
        store,
        feature_service_entity_mapping,
        full_feature_names,
        entity_df_with_request_data,
        full_expected_df,
        event_timestamp,
    )

    table_from_df_entities: pd.DataFrame = job_from_df.to_arrow().to_pandas()

    columns_expected_in_table = expected_df.columns.tolist()

    table_from_df_entities = (
        table_from_df_entities[columns_expected_in_table]
        .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
        .drop_duplicates()
        .reset_index(drop=True)
    )
    assert_frame_equal(actual_df_from_df_entities, table_from_df_entities)

    # If request data needed for an on demand transform is missing, an error should be raised
    with pytest.raises(RequestDataNotFoundInEntityDfException):
        store.get_historical_features(
            entity_df=entity_df,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "conv_rate_plus_100:conv_rate_plus_100",
                "conv_rate_plus_100:conv_rate_plus_val_to_add",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )
    # If request data needed for a request feature view is missing, an error should be raised
    with pytest.raises(RequestDataNotFoundInEntityDfException):
        store.get_historical_features(
            entity_df=entity_df,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "driver_age:driver_age",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )
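Examples #12 and #13 reference a driver_age_request_fv. In the older Feast releases these tests appear to target, that was plausibly a RequestFeatureView (since deprecated in favor of RequestSource plus on demand feature views); a hedged sketch under that assumption:

from feast.data_source import RequestDataSource  # older Feast API
from feast.request_feature_view import RequestFeatureView
from feast.value_type import ValueType

# Serves "driver_age" straight from request data; it must therefore be
# present in the entity df or entity rows, or the retrieval raises.
driver_age_request_fv = RequestFeatureView(
    name="driver_age",
    request_data_source=RequestDataSource(
        name="driver_age_source",
        schema={"driver_age": ValueType.INT64},
    ),
)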