Example #1
def create_feature_view(name, feature_dtype, feature_is_list, has_empty_list,
                        data_source):
    if feature_is_list is True:
        if feature_dtype == "int32":
            value_type = ValueType.INT32_LIST
        elif feature_dtype == "int64":
            value_type = ValueType.INT64_LIST
        elif feature_dtype == "float":
            value_type = ValueType.FLOAT_LIST
        elif feature_dtype == "bool":
            value_type = ValueType.BOOL_LIST
    else:
        if feature_dtype == "int32":
            value_type = ValueType.INT32
        elif feature_dtype == "int64":
            value_type = ValueType.INT64
        elif feature_dtype == "float":
            value_type = ValueType.FLOAT
        elif feature_dtype == "bool":
            value_type = ValueType.BOOL

    return driver_feature_view(
        data_source,
        name=name,
        value_type=value_type,
    )
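Every example on this page funnels into a driver_feature_view helper that the page itself never shows. As a point of reference, here is a minimal sketch of what such a helper might look like against the older Feast API; the entity name, feature name, and ttl below are assumptions for illustration, not taken from the source.
from datetime import timedelta

from feast import Feature, FeatureView, ValueType


def driver_feature_view(data_source, name="test_correctness",
                        value_type=ValueType.FLOAT):
    # Hypothetical reconstruction; the real helper lives in Feast's
    # integration-test utilities and may differ in its defaults.
    return FeatureView(
        name=name,
        entities=["driver"],
        features=[Feature("value", value_type)],
        ttl=timedelta(days=5),
        batch_source=data_source,
    )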
Example #2
def create_feature_view(name, feature_dtype, feature_is_list, has_empty_list,
                        data_source):
    if feature_is_list is True:
        if feature_dtype == "int32":
            dtype = Array(Int32)
        elif feature_dtype == "int64":
            dtype = Array(Int64)
        elif feature_dtype == "float":
            dtype = Array(Float32)
        elif feature_dtype == "bool":
            dtype = Array(Bool)
        elif feature_dtype == "datetime":
            dtype = Array(UnixTimestamp)
    else:
        if feature_dtype == "int32":
            dtype = Int32
        elif feature_dtype == "int64":
            dtype = Int64
        elif feature_dtype == "float":
            dtype = Float32
        elif feature_dtype == "bool":
            dtype = Bool
        elif feature_dtype == "datetime":
            dtype = UnixTimestamp

    return driver_feature_view(
        data_source,
        name=name,
        dtype=dtype,
    )
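Example #2 targets the newer Feast type system (feast.types), where list-valued features are expressed with Array(...) instead of the *_LIST members of ValueType. A hedged usage sketch of the helper above; the argument values and the file_source name are illustrative, not from the source.
# Illustrative call only; `file_source` stands in for any batch source
# (e.g. a FileSource) defined elsewhere in the test setup.
fv = create_feature_view(
    name="driver_float_list",
    feature_dtype="float",
    feature_is_list=True,
    has_empty_list=False,
    data_source=file_source,
)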
Example #3
def create_feature_view(feature_dtype, feature_is_list, data_source):
    return driver_feature_view(
        data_source,
        value_type=python_type_to_feast_value_type(
            feature_dtype,
            value=get_feature_values_for_dtype(feature_dtype,
                                               feature_is_list)[0],
        ),
    )
Example #4
def test_e2e_consistency(environment, e2e_data_sources, infer_features):
    fs = environment.feature_store
    df, data_source = e2e_data_sources
    fv = driver_feature_view(data_source=data_source, infer_features=infer_features)

    entity = driver()
    fs.apply([fv, entity])

    run_offline_online_store_consistency_test(fs, fv)
Example #5
def prep_bq_fs_and_fv(
    bq_source_type: str,
) -> Iterator[Tuple[FeatureStore, FeatureView]]:
    client = bigquery.Client()
    gcp_project = client.project
    bigquery_dataset = "test_ingestion"
    dataset = bigquery.Dataset(f"{gcp_project}.{bigquery_dataset}")
    client.create_dataset(dataset, exists_ok=True)
    dataset.default_table_expiration_ms = 1000 * 60 * 60 * 24 * 14  # 2 weeks in milliseconds
    client.update_dataset(dataset, ["default_table_expiration_ms"])

    df = create_dataset()

    job_config = bigquery.LoadJobConfig()
    table_ref = f"{gcp_project}.{bigquery_dataset}.{bq_source_type}_correctness_{int(time.time_ns())}"
    query = f"SELECT * FROM `{table_ref}`"
    job = client.load_table_from_dataframe(df,
                                           table_ref,
                                           job_config=job_config)
    job.result()

    bigquery_source = BigQuerySource(
        table_ref=table_ref if bq_source_type == "table" else None,
        query=query if bq_source_type == "query" else None,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        date_partition_column="",
        field_mapping={
            "ts_1": "ts",
            "id": "driver_id"
        },
    )

    fv = driver_feature_view(bigquery_source)
    e = Entity(
        name="driver",
        description="id for driver",
        join_key="driver_id",
        value_type=ValueType.INT32,
    )
    with tempfile.TemporaryDirectory() as repo_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
            provider="gcp",
            online_store=DatastoreOnlineStoreConfig(
                namespace="integration_test"),
        )
        fs = FeatureStore(config=config)
        fs.apply([fv, e])

        yield fs, fv

        fs.teardown()
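The prep_* functions on this page are generator-style fixtures: they yield a ready (FeatureStore, FeatureView) pair and tear the store down once the consumer is done with it. A hedged sketch of how the BigQuery variant above could be wrapped as a pytest fixture; the fixture and test names are illustrative, not from the source.
import pytest


@pytest.fixture(params=["table", "query"])
def bq_fs_and_fv(request):
    # Delegates setup and teardown to the generator defined above.
    yield from prep_bq_fs_and_fv(request.param)


def test_bq_offline_online_consistency(bq_fs_and_fv):
    fs, fv = bq_fs_and_fv
    run_offline_online_store_consistency_test(fs, fv)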
Example #6
def test_e2e_consistency(environment, e2e_data_sources, infer_features):
    fs = environment.feature_store
    df, data_source = e2e_data_sources
    fv = driver_feature_view(
        name=f"test_consistency_{'with_inference' if infer_features else ''}",
        data_source=data_source,
        infer_features=infer_features,
    )

    entity = driver()
    fs.apply([fv, entity])

    # Materialization is run in two steps, and we use a timestamp from the
    # generated dataframe as the split point.
    split_dt = df["ts_1"][4].to_pydatetime() - timedelta(seconds=1)

    run_offline_online_store_consistency_test(fs, fv, split_dt)
Example #7
def prep_redis_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        df = create_dataset()
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            path=f"file://{f.name}",
            event_timestamp_column="ts",
            created_timestamp_column="created_ts",
            date_partition_column="",
            field_mapping={
                "ts_1": "ts",
                "id": "driver_id"
            },
        )
        fv = driver_feature_view(file_source)
        e = Entity(
            name="driver",
            description="id for driver",
            join_key="driver_id",
            value_type=ValueType.INT32,
        )
        project = f"test_redis_correctness_{str(uuid.uuid4()).replace('-', '')}"
        print(f"Using project: {project}")
        with tempfile.TemporaryDirectory() as repo_dir_name:
            config = RepoConfig(
                registry=str(Path(repo_dir_name) / "registry.db"),
                project=project,
                provider="local",
                online_store=RedisOnlineStoreConfig(
                    type="redis",
                    redis_type=RedisType.redis,
                    connection_string="localhost:6379,db=0",
                ),
            )
            fs = FeatureStore(config=config)
            fs.apply([fv, e])

            yield fs, fv

            fs.teardown()
Example #8
def prep_redshift_fs_and_fv(
    source_type: str,
) -> Iterator[Tuple[FeatureStore, FeatureView]]:
    client = aws_utils.get_redshift_data_client("us-west-2")
    s3 = aws_utils.get_s3_resource("us-west-2")

    df = create_dataset()

    table_name = f"test_ingestion_{source_type}_correctness_{int(time.time_ns())}_{random.randint(1000, 9999)}"

    offline_store = RedshiftOfflineStoreConfig(
        cluster_id="feast-integration-tests",
        region="us-west-2",
        user="******",
        database="feast",
        s3_staging_location="s3://feast-integration-tests/redshift/tests/ingestion",
        iam_role="arn:aws:iam::402087665549:role/redshift_s3_access_role",
    )

    aws_utils.upload_df_to_redshift(
        client,
        offline_store.cluster_id,
        offline_store.database,
        offline_store.user,
        s3,
        f"{offline_store.s3_staging_location}/copy/{table_name}.parquet",
        offline_store.iam_role,
        table_name,
        df,
    )

    redshift_source = RedshiftSource(
        table=table_name if source_type == "table" else None,
        query=f"SELECT * FROM {table_name}"
        if source_type == "query" else None,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        date_partition_column="",
        field_mapping={
            "ts_1": "ts",
            "id": "driver_id"
        },
    )

    fv = driver_feature_view(redshift_source)
    e = Entity(
        name="driver",
        description="id for driver",
        join_key="driver_id",
        value_type=ValueType.INT32,
    )
    with tempfile.TemporaryDirectory() as repo_dir_name, \
            tempfile.TemporaryDirectory() as data_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
            provider="local",
            online_store=SqliteOnlineStoreConfig(
                path=str(Path(data_dir_name) / "online_store.db")),
            offline_store=offline_store,
        )
        fs = FeatureStore(config=config)
        fs.apply([fv, e])

        yield fs, fv

        fs.teardown()

    # Clean up the uploaded Redshift table
    aws_utils.execute_redshift_statement(
        client,
        offline_store.cluster_id,
        offline_store.database,
        offline_store.user,
        f"DROP TABLE {table_name}",
    )
Example #9
def test_online_store_cleanup(environment, universal_data_sources):
    """
    Some online store implementations (like Redis) keep features from different features views
    but with common entities together.
    This might end up with deletion of all features attached to the entity,
    when only one feature view was deletion target (see https://github.com/feast-dev/feast/issues/2150).

    Plan:
        1. Register two feature views with common entity "driver"
        2. Materialize data
        3. Check if features are available (via online retrieval)
        4. Delete one feature view
        5. Check that features for other are still available
        6. Delete another feature view (and create again)
        7. Verify that features for both feature view were deleted
    """
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    driver_stats_fv = construct_universal_feature_views(data_sources).driver

    driver_entities = entities.driver_vals
    df = pd.DataFrame({
        "ts_1": [environment.end_date] * len(driver_entities),
        "created_ts": [environment.end_date] * len(driver_entities),
        "driver_id": driver_entities,
        "value": np.random.random(size=len(driver_entities)),
    })

    ds = environment.data_source_creator.create_data_source(
        df, destination_name="simple_driver_dataset")

    simple_driver_fv = driver_feature_view(
        data_source=ds, name="test_universal_online_simple_driver")

    fs.apply([driver(), simple_driver_fv, driver_stats_fv])

    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )
    expected_values = df.sort_values(by="driver_id")

    features = [f"{simple_driver_fv.name}:value"]
    entity_rows = [{"driver_id": driver_id} for driver_id in sorted(driver_entities)]

    online_features = fs.get_online_features(
        features=features, entity_rows=entity_rows).to_dict()
    assert np.allclose(expected_values["value"], online_features["value"])

    fs.apply(objects=[simple_driver_fv],
             objects_to_delete=[driver_stats_fv],
             partial=False)

    online_features = fs.get_online_features(
        features=features, entity_rows=entity_rows).to_dict()
    assert np.allclose(expected_values["value"], online_features["value"])

    fs.apply(objects=[], objects_to_delete=[simple_driver_fv], partial=False)

    def eventually_apply() -> Tuple[None, bool]:
        try:
            fs.apply([simple_driver_fv])
        except BotoCoreError:
            return None, False

        return None, True

    # The online store backend might be eventually consistent for schema updates,
    # so recreating a table that was just deleted may need a few retries.
    wait_retry_backoff(eventually_apply, timeout_secs=60)

    online_features = fs.get_online_features(
        features=features, entity_rows=entity_rows).to_dict()
    assert all(v is None for v in online_features["value"])