def create_feature_view(name, feature_dtype, feature_is_list, has_empty_list, data_source):
    """Build a driver feature view over *data_source* with the requested value type.

    Args:
        name: Name for the created feature view.
        feature_dtype: One of "int32", "int64", "float", "bool".
        feature_is_list: When True, the corresponding *_LIST ValueType is used.
        has_empty_list: Unused here; kept for signature compatibility with callers
            that parametrize over it.
        data_source: Source passed through to driver_feature_view.

    Raises:
        ValueError: If feature_dtype is not one of the supported dtypes.
            (The original if/elif ladder left value_type unassigned and
            crashed with UnboundLocalError instead.)
    """
    # Dispatch tables replace the duplicated if/elif ladders.
    list_value_types = {
        "int32": ValueType.INT32_LIST,
        "int64": ValueType.INT64_LIST,
        "float": ValueType.FLOAT_LIST,
        "bool": ValueType.BOOL_LIST,
    }
    scalar_value_types = {
        "int32": ValueType.INT32,
        "int64": ValueType.INT64,
        "float": ValueType.FLOAT,
        "bool": ValueType.BOOL,
    }
    # `is True` preserved from the original: only an exact True selects list types.
    lookup = list_value_types if feature_is_list is True else scalar_value_types
    try:
        value_type = lookup[feature_dtype]
    except KeyError:
        raise ValueError(f"Unsupported feature dtype: {feature_dtype!r}")
    return driver_feature_view(
        data_source,
        name=name,
        value_type=value_type,
    )
def create_feature_view(name, feature_dtype, feature_is_list, has_empty_list, data_source):
    """Build a driver feature view over *data_source* with the requested field dtype.

    Args:
        name: Name for the created feature view.
        feature_dtype: One of "int32", "int64", "float", "bool", "datetime".
        feature_is_list: When True, the scalar dtype is wrapped in Array(...).
        has_empty_list: Unused here; kept for signature compatibility with callers
            that parametrize over it.
        data_source: Source passed through to driver_feature_view.

    Raises:
        ValueError: If feature_dtype is not one of the supported dtypes.
            (The original if/elif ladder left dtype unassigned and crashed
            with UnboundLocalError instead.)
    """
    # Single scalar map; the list branch in the original was exactly
    # Array(<scalar>) for every entry, so we derive it instead of duplicating.
    scalar_dtypes = {
        "int32": Int32,
        "int64": Int64,
        "float": Float32,
        "bool": Bool,
        "datetime": UnixTimestamp,
    }
    try:
        base = scalar_dtypes[feature_dtype]
    except KeyError:
        raise ValueError(f"Unsupported feature dtype: {feature_dtype!r}")
    # `is True` preserved from the original: only an exact True selects Array types.
    dtype = Array(base) if feature_is_list is True else base
    return driver_feature_view(
        data_source,
        name=name,
        dtype=dtype,
    )
def create_feature_view(feature_dtype, feature_is_list, data_source):
    """Create a driver feature view, inferring the Feast value type from a sample value."""
    # Pull one representative value for the dtype and let the converter
    # map it to the matching Feast ValueType.
    sample_values = get_feature_values_for_dtype(feature_dtype, feature_is_list)
    inferred_type = python_type_to_feast_value_type(
        feature_dtype,
        value=sample_values[0],
    )
    return driver_feature_view(data_source, value_type=inferred_type)
def test_e2e_consistency(environment, e2e_data_sources, infer_features):
    """Apply a driver feature view + entity, then check offline/online consistency."""
    store = environment.feature_store
    _df, source = e2e_data_sources
    view = driver_feature_view(data_source=source, infer_features=infer_features)
    store.apply([view, driver()])
    run_offline_online_store_consistency_test(store, view)
def prep_bq_fs_and_fv(
    bq_source_type: str,
) -> Iterator[Tuple[FeatureStore, FeatureView]]:
    # Fixture: builds a BigQuery-backed FeatureStore + driver FeatureView,
    # yields them to the test, and tears the store down afterwards.
    # bq_source_type selects whether the BigQuerySource is defined by a
    # table reference ("table") or a SELECT query ("query").
    client = bigquery.Client()
    gcp_project = client.project
    bigquery_dataset = "test_ingestion"
    dataset = bigquery.Dataset(f"{gcp_project}.{bigquery_dataset}")
    # exists_ok=True: the dataset is shared across test runs.
    client.create_dataset(dataset, exists_ok=True)
    dataset.default_table_expiration_ms = (1000 * 60 * 60 * 24 *
                                           14)  # 2 weeks in milliseconds
    # Auto-expire tables so repeated test runs don't accumulate storage.
    client.update_dataset(dataset, ["default_table_expiration_ms"])
    df = create_dataset()
    job_config = bigquery.LoadJobConfig()
    # time_ns() in the table name keeps concurrent/repeated runs from colliding.
    table_ref = f"{gcp_project}.{bigquery_dataset}.{bq_source_type}_correctness_{int(time.time_ns())}"
    query = f"SELECT * FROM `{table_ref}`"
    # Synchronously load the generated dataframe into the new table.
    job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
    job.result()
    # Exactly one of table_ref/query is set, depending on the source type
    # being exercised.
    bigquery_source = BigQuerySource(
        table_ref=table_ref if bq_source_type == "table" else None,
        query=query if bq_source_type == "query" else None,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        date_partition_column="",
        # Rename generated-dataset columns to what the feature view expects.
        field_mapping={
            "ts_1": "ts",
            "id": "driver_id"
        },
    )
    fv = driver_feature_view(bigquery_source)
    e = Entity(
        name="driver",
        description="id for driver",
        join_key="driver_id",
        value_type=ValueType.INT32,
    )
    # Registry lives in a throwaway temp dir; the project name is
    # uuid-suffixed so parallel runs don't share online-store state.
    with tempfile.TemporaryDirectory() as repo_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
            provider="gcp",
            online_store=DatastoreOnlineStoreConfig(
                namespace="integration_test"),
        )
        fs = FeatureStore(config=config)
        fs.apply([fv, e])
        yield fs, fv
        # Runs after the consuming test finishes; cleans up online-store data.
        fs.teardown()
def test_e2e_consistency(environment, e2e_data_sources, infer_features):
    """Apply a driver feature view + entity and run the offline/online
    consistency check, materializing in two steps around a split point."""
    store = environment.feature_store
    df, source = e2e_data_sources
    suffix = "with_inference" if infer_features else ""
    view = driver_feature_view(
        name=f"test_consistency_{suffix}",
        data_source=source,
        infer_features=infer_features,
    )
    store.apply([view, driver()])

    # materialization is run in two steps and
    # we use timestamp from generated dataframe as a split point
    split_dt = df["ts_1"][4].to_pydatetime() - timedelta(seconds=1)

    run_offline_online_store_consistency_test(store, view, split_dt)
def prep_redis_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
    """Fixture: yield a Redis-backed FeatureStore plus a parquet-file-sourced
    driver FeatureView, tearing the store down after the test finishes."""
    with tempfile.NamedTemporaryFile(suffix=".parquet") as tmp:
        dataset_df = create_dataset()
        # Close the handle so to_parquet can rewrite the file at the same path.
        tmp.close()
        dataset_df.to_parquet(tmp.name)
        source = FileSource(
            file_format=ParquetFormat(),
            path=f"file://{tmp.name}",
            event_timestamp_column="ts",
            created_timestamp_column="created_ts",
            date_partition_column="",
            field_mapping={"ts_1": "ts", "id": "driver_id"},
        )
        view = driver_feature_view(source)
        driver_entity = Entity(
            name="driver",
            description="id for driver",
            join_key="driver_id",
            value_type=ValueType.INT32,
        )
        # uuid-suffixed project name keeps runs isolated in the shared Redis.
        project = f"test_redis_correctness_{str(uuid.uuid4()).replace('-', '')}"
        print(f"Using project: {project}")
        with tempfile.TemporaryDirectory() as repo_dir_name:
            config = RepoConfig(
                registry=str(Path(repo_dir_name) / "registry.db"),
                project=project,
                provider="local",
                online_store=RedisOnlineStoreConfig(
                    type="redis",
                    redis_type=RedisType.redis,
                    connection_string="localhost:6379,db=0",
                ),
            )
            store = FeatureStore(config=config)
            store.apply([view, driver_entity])
            yield store, view
            store.teardown()
def prep_redshift_fs_and_fv(
    source_type: str,
) -> Iterator[Tuple[FeatureStore, FeatureView]]:
    # Fixture: uploads a generated dataframe to Redshift (via S3 staging),
    # builds a FeatureStore with a SQLite online store and Redshift offline
    # store, yields (store, feature_view), then tears down and drops the table.
    # source_type selects a table-backed ("table") or query-backed ("query")
    # RedshiftSource.
    client = aws_utils.get_redshift_data_client("us-west-2")
    s3 = aws_utils.get_s3_resource("us-west-2")

    df = create_dataset()

    # time_ns + random suffix keeps concurrent/repeated runs from colliding.
    table_name = f"test_ingestion_{source_type}_correctness_{int(time.time_ns())}_{random.randint(1000, 9999)}"

    offline_store = RedshiftOfflineStoreConfig(
        cluster_id="feast-integration-tests",
        region="us-west-2",
        user="******",
        database="feast",
        s3_staging_location=
        "s3://feast-integration-tests/redshift/tests/ingestion",
        iam_role="arn:aws:iam::402087665549:role/redshift_s3_access_role",
    )

    # Stage the dataframe in S3 and COPY it into the new Redshift table.
    aws_utils.upload_df_to_redshift(
        client,
        offline_store.cluster_id,
        offline_store.database,
        offline_store.user,
        s3,
        f"{offline_store.s3_staging_location}/copy/{table_name}.parquet",
        offline_store.iam_role,
        table_name,
        df,
    )

    # Exactly one of table/query is set, depending on the source type.
    redshift_source = RedshiftSource(
        table=table_name if source_type == "table" else None,
        query=f"SELECT * FROM {table_name}" if source_type == "query" else None,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        date_partition_column="",
        # Rename generated-dataset columns to what the feature view expects.
        field_mapping={
            "ts_1": "ts",
            "id": "driver_id"
        },
    )

    fv = driver_feature_view(redshift_source)
    e = Entity(
        name="driver",
        description="id for driver",
        join_key="driver_id",
        value_type=ValueType.INT32,
    )
    with tempfile.TemporaryDirectory(
    ) as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            # NOTE(review): "bq" prefix looks copy-pasted from the BigQuery
            # fixture — presumably should say "redshift"; harmless either way
            # since uniqueness comes from the uuid suffix.
            project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
            provider="local",
            online_store=SqliteOnlineStoreConfig(
                path=str(Path(data_dir_name) / "online_store.db")),
            offline_store=offline_store,
        )
        fs = FeatureStore(config=config)
        fs.apply([fv, e])

        yield fs, fv

        fs.teardown()

        # Clean up the uploaded Redshift table
        aws_utils.execute_redshift_statement(
            client,
            offline_store.cluster_id,
            offline_store.database,
            offline_store.user,
            f"DROP TABLE {table_name}",
        )
def test_online_store_cleanup(environment, universal_data_sources):
    """
    Some online store implementations (like Redis) keep features from different features views
    but with common entities together.
    This might end up with deletion of all features attached to the entity,
    when only one feature view was deletion target (see https://github.com/feast-dev/feast/issues/2150).

    Plan:
        1. Register two feature views with common entity "driver"
        2. Materialize data
        3. Check if features are available (via online retrieval)
        4. Delete one feature view
        5. Check that features for other are still available
        6. Delete another feature view (and create again)
        7. Verify that features for both feature view were deleted
    """
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    driver_stats_fv = construct_universal_feature_views(data_sources).driver

    driver_entities = entities.driver_vals
    # Build a second dataset sharing the "driver_id" join key; all rows get
    # end_date timestamps so they fall inside the materialization window.
    df = pd.DataFrame({
        "ts_1": [environment.end_date] * len(driver_entities),
        "created_ts": [environment.end_date] * len(driver_entities),
        "driver_id": driver_entities,
        "value": np.random.random(size=len(driver_entities)),
    })

    ds = environment.data_source_creator.create_data_source(
        df, destination_name="simple_driver_dataset")

    simple_driver_fv = driver_feature_view(
        data_source=ds, name="test_universal_online_simple_driver")

    # Step 1-2: register both feature views (common entity "driver") and
    # materialize a window covering all generated rows.
    fs.apply([driver(), simple_driver_fv, driver_stats_fv])
    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )
    expected_values = df.sort_values(by="driver_id")

    features = [f"{simple_driver_fv.name}:value"]
    entity_rows = [{
        "driver_id": driver_id
    } for driver_id in sorted(driver_entities)]

    # Step 3: features should be retrievable online after materialization.
    online_features = fs.get_online_features(
        features=features, entity_rows=entity_rows).to_dict()
    assert np.allclose(expected_values["value"], online_features["value"])

    # Step 4: delete only driver_stats_fv (partial=False removes objects
    # not listed in `objects` plus those in `objects_to_delete`).
    fs.apply(objects=[simple_driver_fv],
             objects_to_delete=[driver_stats_fv],
             partial=False)

    # Step 5: simple_driver_fv's features must survive the sibling deletion.
    online_features = fs.get_online_features(
        features=features,
        entity_rows=entity_rows).to_dict()
    assert np.allclose(expected_values["value"], online_features["value"])

    # Step 6: delete the remaining feature view, then re-create it.
    fs.apply(objects=[], objects_to_delete=[simple_driver_fv], partial=False)

    def eventually_apply() -> Tuple[None, bool]:
        # Retry helper for wait_retry_backoff: (result, done) tuple contract.
        try:
            fs.apply([simple_driver_fv])
        except BotoCoreError:
            return None, False

        return None, True

    # Online store backend might have eventual consistency in schema update
    # So recreating table that was just deleted might need some retries
    wait_retry_backoff(eventually_apply, timeout_secs=60)

    # Step 7: the re-created (unmaterialized) view must return no values —
    # the delete wiped the previously materialized features.
    online_features = fs.get_online_features(
        features=features, entity_rows=entity_rows).to_dict()
    assert all(v is None for v in online_features["value"])