def test_apply_conflicting_featureview_names(feature_store_with_local_registry):
    """Test applying feature views whose names are not case-insensitively unique."""
    driver_stats = FeatureView(
        name="driver_hourly_stats",
        entities=["driver_id"],
        ttl=timedelta(seconds=10),
        online=False,
        batch_source=FileSource(path="driver_stats.parquet"),
        tags={},
    )

    customer_stats = FeatureView(
        name="DRIVER_HOURLY_STATS",
        entities=["id"],
        ttl=timedelta(seconds=10),
        online=False,
        batch_source=FileSource(path="customer_stats.parquet"),
        tags={},
    )
    try:
        feature_store_with_local_registry.apply([driver_stats, customer_stats])
        error = None
    except ValueError as e:
        error = e
    assert (
        isinstance(error, ValueError)
        and "Please ensure that all feature view names are case-insensitively unique"
        in error.args[0]
    )

    feature_store_with_local_registry.teardown()
def test_apply_duplicated_featureview_names(feature_store_with_local_registry):
    """Test applying feature views with duplicated names."""
    driver_stats = FeatureView(
        name="driver_hourly_stats",
        entities=["driver_id"],
        ttl=timedelta(seconds=10),
        online=False,
        input=FileSource(path="driver_stats.parquet"),
        tags={},
    )

    customer_stats = FeatureView(
        name="driver_hourly_stats",
        entities=["id"],
        ttl=timedelta(seconds=10),
        online=False,
        input=FileSource(path="customer_stats.parquet"),
        tags={},
    )
    try:
        feature_store_with_local_registry.apply([driver_stats, customer_stats])
        error = None
    except ValueError as e:
        error = e
    assert (
        isinstance(error, ValueError)
        and "Please ensure that all feature view names are unique" in error.args[0]
    )

    feature_store_with_local_registry.teardown()
def test_infer_datasource_names_file():
    file_path = "path/to/test.csv"
    data_source = FileSource(path=file_path)
    assert data_source.name == file_path

    source_name = "my_name"
    data_source = FileSource(name=source_name, path=file_path)
    assert data_source.name == source_name
def test_apply_data_source(test_registry: Registry):
    # Create Feature Views
    batch_source = FileSource(
        name="test_source",
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[
            Field(name="fs1_my_feature_1", dtype=Int64),
            Field(name="fs1_my_feature_2", dtype=String),
            Field(name="fs1_my_feature_3", dtype=Array(String)),
            Field(name="fs1_my_feature_4", dtype=Array(Bytes)),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    project = "project"

    # Register data source and feature view
    test_registry.apply_data_source(batch_source, project, commit=False)
    test_registry.apply_feature_view(fv1, project, commit=True)

    registry_feature_views = test_registry.list_feature_views(project)
    registry_data_sources = test_registry.list_data_sources(project)
    assert len(registry_feature_views) == 1
    assert len(registry_data_sources) == 1
    registry_feature_view = registry_feature_views[0]
    assert registry_feature_view.batch_source == batch_source
    registry_data_source = registry_data_sources[0]
    assert registry_data_source == batch_source

    # Check that change to batch source propagates
    batch_source.timestamp_field = "new_ts_col"
    test_registry.apply_data_source(batch_source, project, commit=False)
    test_registry.apply_feature_view(fv1, project, commit=True)
    registry_feature_views = test_registry.list_feature_views(project)
    registry_data_sources = test_registry.list_data_sources(project)
    assert len(registry_feature_views) == 1
    assert len(registry_data_sources) == 1
    registry_feature_view = registry_feature_views[0]
    assert registry_feature_view.batch_source == batch_source
    registry_batch_source = test_registry.list_data_sources(project)[0]
    assert registry_batch_source == batch_source

    test_registry.teardown()

    # Will try to reload registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()
def store_offline(feature_store: FeatureStore, dataframe: FlyteSchema) -> FeatureStore:
    horse_colic_entity = Entity(name="Hospital Number", value_type=ValueType.STRING)

    horse_colic_feature_view = FeatureView(
        name="horse_colic_stats",
        entities=["Hospital Number"],
        features=[
            Feature(name="rectal temperature", dtype=ValueType.FLOAT),
            Feature(name="total protein", dtype=ValueType.FLOAT),
            Feature(name="peripheral pulse", dtype=ValueType.FLOAT),
            Feature(name="surgical lesion", dtype=ValueType.STRING),
            Feature(name="abdominal distension", dtype=ValueType.FLOAT),
            Feature(name="nasogastric tube", dtype=ValueType.STRING),
            Feature(name="outcome", dtype=ValueType.STRING),
            Feature(name="packed cell volume", dtype=ValueType.FLOAT),
            Feature(name="nasogastric reflux PH", dtype=ValueType.FLOAT),
        ],
        batch_source=FileSource(
            path=str(dataframe.remote_path),
            event_timestamp_column="timestamp",
        ),
        ttl=timedelta(days=1),
    )

    # Ingest the data into feast
    feature_store.apply([horse_colic_entity, horse_colic_feature_view])

    return feature_store
def setup_feature_store():
    """Prepares the local environment for a FeatureStore docstring test."""
    from datetime import datetime, timedelta

    from feast import Entity, Feature, FeatureStore, FeatureView, FileSource, ValueType
    from feast.repo_operations import init_repo

    init_repo("feature_repo", "local")
    fs = FeatureStore(repo_path="feature_repo")
    driver = Entity(
        name="driver_id",
        value_type=ValueType.INT64,
        description="driver id",
    )
    driver_hourly_stats = FileSource(
        path="feature_repo/data/driver_stats.parquet",
        event_timestamp_column="event_timestamp",
        created_timestamp_column="created",
    )
    driver_hourly_stats_view = FeatureView(
        name="driver_hourly_stats",
        entities=["driver_id"],
        ttl=timedelta(seconds=86400 * 1),
        features=[
            Feature(name="conv_rate", dtype=ValueType.FLOAT),
            Feature(name="acc_rate", dtype=ValueType.FLOAT),
            Feature(name="avg_daily_trips", dtype=ValueType.INT64),
        ],
        batch_source=driver_hourly_stats,
    )
    fs.apply([driver_hourly_stats_view, driver])
    fs.materialize(
        start_date=datetime.utcnow() - timedelta(hours=3),
        end_date=datetime.utcnow() - timedelta(minutes=10),
    )
def stage_entities_to_fs(
    entity_source: pd.DataFrame, staging_location: str, config: Config
) -> FileSource:
    """
    Dumps the given (entities) dataframe as a parquet file and stages it to remote file
    storage (a subdirectory of staging_location).

    :return: FileSource with remote destination path
    """
    entity_staging_uri = urlparse(os.path.join(staging_location, str(uuid.uuid4())))
    staging_client = get_staging_client(entity_staging_uri.scheme, config)
    with tempfile.NamedTemporaryFile() as df_export_path:
        # prevent casting ns -> ms exception inside pyarrow
        entity_source["event_timestamp"] = entity_source["event_timestamp"].dt.floor("ms")

        entity_source.to_parquet(df_export_path.name)

        with open(df_export_path.name, "rb") as f:
            staging_client.upload_fileobj(
                f, df_export_path.name, remote_uri=entity_staging_uri
            )

    # ToDo: support custom event_timestamp_column
    return FileSource(
        event_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=entity_staging_uri.geturl(),
    )
def create_data_source(
    self,
    df: pd.DataFrame,
    destination_name: str,
    timestamp_field="ts",
    created_timestamp_column="created_ts",
    field_mapping: Dict[str, str] = None,
) -> DataSource:
    destination_name = self.get_prefixed_table_name(destination_name)

    f = tempfile.NamedTemporaryFile(
        prefix=f"{self.project_name}_{destination_name}",
        suffix=".parquet",
        delete=False,
    )
    df.to_parquet(f.name)
    self.files.append(f)
    return FileSource(
        file_format=ParquetFormat(),
        path=f"{f.name}",
        timestamp_field=timestamp_field,
        created_timestamp_column=created_timestamp_column,
        field_mapping=field_mapping or {"ts_1": "ts"},
    )
def create_data_source(
    self,
    df: pd.DataFrame,
    destination_name: Optional[str] = None,
    suffix: Optional[str] = None,
    event_timestamp_column="ts",
    created_timestamp_column="created_ts",
    field_mapping: Dict[str, str] = None,
) -> DataSource:
    filename = f"{destination_name}.parquet"
    port = self.minio.get_exposed_port("9000")
    host = self.minio.get_container_host_ip()
    minio_endpoint = f"{host}:{port}"
    self._upload_parquet_file(df, filename, minio_endpoint)

    return FileSource(
        file_format=ParquetFormat(),
        path=f"s3://{self.bucket}/{filename}",
        event_timestamp_column=event_timestamp_column,
        created_timestamp_column=created_timestamp_column,
        date_partition_column="",
        field_mapping=field_mapping or {"ts_1": "ts"},
        s3_endpoint_override=f"http://{host}:{port}",
    )
def test_apply_feature_view_integration(test_feature_store):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[
            Field(name="fs1_my_feature_1", dtype=Int64),
            Field(name="fs1_my_feature_2", dtype=String),
            Field(name="fs1_my_feature_3", dtype=Array(String)),
            Field(name="fs1_my_feature_4", dtype=Array(Bytes)),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1])

    feature_views = test_feature_store.list_feature_views()

    # List Feature Views
    assert (
        len(feature_views) == 1
        and feature_views[0].name == "my_feature_view_1"
        and feature_views[0].features[0].name == "fs1_my_feature_1"
        and feature_views[0].features[0].dtype == Int64
        and feature_views[0].features[1].name == "fs1_my_feature_2"
        and feature_views[0].features[1].dtype == String
        and feature_views[0].features[2].name == "fs1_my_feature_3"
        and feature_views[0].features[2].dtype == Array(String)
        and feature_views[0].features[3].name == "fs1_my_feature_4"
        and feature_views[0].features[3].dtype == Array(Bytes)
        and feature_views[0].entities[0] == "fs1_my_entity_1"
    )

    feature_view = test_feature_store.get_feature_view("my_feature_view_1")
    assert (
        feature_view.name == "my_feature_view_1"
        and feature_view.features[0].name == "fs1_my_feature_1"
        and feature_view.features[0].dtype == Int64
        and feature_view.features[1].name == "fs1_my_feature_2"
        and feature_view.features[1].dtype == String
        and feature_view.features[2].name == "fs1_my_feature_3"
        and feature_view.features[2].dtype == Array(String)
        and feature_view.features[3].name == "fs1_my_feature_4"
        and feature_view.features[3].dtype == Array(Bytes)
        and feature_view.entities[0] == "fs1_my_entity_1"
    )

    test_feature_store.delete_feature_view("my_feature_view_1")
    feature_views = test_feature_store.list_feature_views()
    assert len(feature_views) == 0

    test_feature_store.teardown()
def test_apply_feature_view_integration(test_feature_store):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1])

    feature_views = test_feature_store.list_feature_views()

    # List Feature Views
    assert (
        len(feature_views) == 1
        and feature_views[0].name == "my_feature_view_1"
        and feature_views[0].features[0].name == "fs1_my_feature_1"
        and feature_views[0].features[0].dtype == ValueType.INT64
        and feature_views[0].features[1].name == "fs1_my_feature_2"
        and feature_views[0].features[1].dtype == ValueType.STRING
        and feature_views[0].features[2].name == "fs1_my_feature_3"
        and feature_views[0].features[2].dtype == ValueType.STRING_LIST
        and feature_views[0].features[3].name == "fs1_my_feature_4"
        and feature_views[0].features[3].dtype == ValueType.BYTES_LIST
        and feature_views[0].entities[0] == "fs1_my_entity_1"
    )

    feature_view = test_feature_store.get_feature_view("my_feature_view_1")
    assert (
        feature_view.name == "my_feature_view_1"
        and feature_view.features[0].name == "fs1_my_feature_1"
        and feature_view.features[0].dtype == ValueType.INT64
        and feature_view.features[1].name == "fs1_my_feature_2"
        and feature_view.features[1].dtype == ValueType.STRING
        and feature_view.features[2].name == "fs1_my_feature_3"
        and feature_view.features[2].dtype == ValueType.STRING_LIST
        and feature_view.features[3].name == "fs1_my_feature_4"
        and feature_view.features[3].dtype == ValueType.BYTES_LIST
        and feature_view.entities[0] == "fs1_my_entity_1"
    )

    test_feature_store.delete_feature_view("my_feature_view_1")
    feature_views = test_feature_store.list_feature_views()
    assert len(feature_views) == 0

    test_feature_store.teardown()
def stage_customer_daily_profile_parquet_source(directory, df):
    customer_profile_path = os.path.join(directory, "customer_profile.parquet")
    df.to_parquet(path=customer_profile_path, allow_truncated_timestamps=True)
    return FileSource(
        path=customer_profile_path,
        event_timestamp_column="event_timestamp",
        created_timestamp_column="created",
    )
def test_apply_object_and_read(test_feature_store):
    assert isinstance(test_feature_store, FeatureStore)

    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
    )

    e1 = Entity(
        name="fs1_my_entity_1", value_type=ValueType.STRING, description="something"
    )

    e2 = Entity(
        name="fs1_my_entity_2", value_type=ValueType.STRING, description="something"
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    fv2 = FeatureView(
        name="my_feature_view_2",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1, e1, fv2, e2])

    fv1_actual = test_feature_store.get_feature_view("my_feature_view_1")
    e1_actual = test_feature_store.get_entity("fs1_my_entity_1")

    assert fv1 == fv1_actual
    assert e1 == e1_actual
    assert fv2 != fv1_actual
    assert e2 != e1_actual

    test_feature_store.teardown()
def transactions_feature_table(spark, client):
    schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("created_timestamp", TimestampType()),
        StructField("total_transactions", DoubleType()),
        StructField("is_vip", BooleanType()),
    ])
    df_data = [
        (
            1001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            50.0,
            True,
        ),
        (
            1001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=2),
            100.0,
            True,
        ),
        (
            2001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            400.0,
            False,
        ),
        (
            1001,
            datetime(year=2020, month=9, day=2),
            datetime(year=2020, month=9, day=1),
            200.0,
            False,
        ),
        (
            1001,
            datetime(year=2020, month=9, day=4),
            datetime(year=2020, month=9, day=1),
            300.0,
            False,
        ),
    ]
    temp_dir, file_uri = create_temp_parquet_file(
        spark, "transactions", schema, df_data
    )
    file_source = FileSource(
        "event_timestamp", "created_timestamp", ParquetFormat(), file_uri
    )
    features = [
        Feature("total_transactions", ValueType.DOUBLE),
        Feature("is_vip", ValueType.BOOL),
    ]
    feature_table = FeatureTable(
        "transactions", ["customer_id"], features, batch_source=file_source
    )
    yield client.apply_feature_table(feature_table)
    shutil.rmtree(temp_dir)
def stage_driver_hourly_stats_parquet_source(directory, df):
    # Write to disk
    driver_stats_path = os.path.join(directory, "driver_stats.parquet")
    df.to_parquet(path=driver_stats_path, allow_truncated_timestamps=True)
    return FileSource(
        path=driver_stats_path,
        event_timestamp_column="event_timestamp",
        created_timestamp_column="",
    )
def prep_file_source(df, event_timestamp_column=None) -> FileSource:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            path=f.name,
            event_timestamp_column=event_timestamp_column,
        )
        yield file_source
def test_offline_ingestion(feast_client: Client, staging_path: str):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            "event_timestamp",
            "event_timestamp",
            ParquetFormat(),
            os.path.join(staging_path, "batch-storage"),
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table, original)  # write to batch (offline) storage

    job = feast_client.start_offline_to_online_ingestion(
        feature_table, datetime.today(), datetime.today() + timedelta(days=1)
    )

    status = wait_retry_backoff(
        lambda: (job.get_status(), job.get_status() != SparkJobStatus.IN_PROGRESS), 300
    )

    assert status == SparkJobStatus.COMPLETED

    features = feast_client.get_online_features(
        ["drivers:unique_drivers"],
        entity_rows=[{"s2id": s2_id} for s2_id in original["s2id"].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers:unique_drivers"]],
        original[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": "drivers:unique_drivers"}
        ),
    )
def test_schedule_batch_ingestion_jobs(
    pytestconfig, feast_client: Client, feast_spark_client: SparkClient
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="gs://example/feast/*",
        event_timestamp_column="datetime_col",
        created_timestamp_column="timestamp",
        date_partition_column="datetime",
    )
    feature_table = FeatureTable(
        name=f"schedule_{str(uuid.uuid4())}".replace("-", "_"),
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )
    feast_client.apply(entity)
    feast_client.apply(feature_table)

    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "0 0 * * *"
    )

    config.load_incluster_config()
    k8s_api = client.CustomObjectsApi()

    def get_scheduled_spark_application():
        job_hash = hashlib.md5(
            f"{feast_client.project}-{feature_table.name}".encode()
        ).hexdigest()
        resource_name = f"feast-{job_hash}"

        return k8s_api.get_namespaced_custom_object(
            group="sparkoperator.k8s.io",
            version="v1beta2",
            namespace=pytestconfig.getoption("k8s_namespace"),
            plural="scheduledsparkapplications",
            name=resource_name,
        )

    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "0 0 * * *"

    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "1 0 * * *"
    )
    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "1 0 * * *"

    feast_spark_client.unschedule_offline_to_online_ingestion(feature_table)
def evaluate_offline_job():
    filesystem, path = FileSource.create_filesystem_and_path(
        data_source.path, data_source.file_options.s3_endpoint_override
    )
    source_df = pd.read_parquet(path, filesystem=filesystem)
    # Make sure all timestamp fields are tz-aware. We default tz-naive fields to UTC
    source_df[event_timestamp_column] = source_df[event_timestamp_column].apply(
        lambda x: x if x.tzinfo is not None else x.replace(tzinfo=pytz.utc)
    )
    if created_timestamp_column:
        source_df[created_timestamp_column] = source_df[
            created_timestamp_column
        ].apply(
            lambda x: x if x.tzinfo is not None else x.replace(tzinfo=pytz.utc)
        )

    source_columns = set(source_df.columns)
    if not set(join_key_columns).issubset(source_columns):
        raise FeastJoinKeysDuringMaterialization(
            data_source.path, set(join_key_columns), source_columns
        )

    ts_columns = (
        [event_timestamp_column, created_timestamp_column]
        if created_timestamp_column
        else [event_timestamp_column]
    )

    source_df.sort_values(by=ts_columns, inplace=True)

    filtered_df = source_df[
        (source_df[event_timestamp_column] >= start_date)
        & (source_df[event_timestamp_column] < end_date)
    ]

    columns_to_extract = set(join_key_columns + feature_name_columns + ts_columns)
    if join_key_columns:
        last_values_df = filtered_df.drop_duplicates(
            join_key_columns, keep="last", ignore_index=True
        )
    else:
        last_values_df = filtered_df
        last_values_df[DUMMY_ENTITY_ID] = DUMMY_ENTITY_VAL
        columns_to_extract.add(DUMMY_ENTITY_ID)

    return last_values_df[columns_to_extract]
def persist(self, storage: SavedDatasetStorage):
    assert isinstance(storage, SavedDatasetFileStorage)

    filesystem, path = FileSource.create_filesystem_and_path(
        storage.file_options.uri,
        storage.file_options.s3_endpoint_override,
    )

    if path.endswith(".parquet"):
        pyarrow.parquet.write_table(
            self.to_arrow(), where=path, filesystem=filesystem
        )
    else:
        # otherwise assume destination is a directory
        pyarrow.parquet.write_to_dataset(
            self.to_arrow(), root_path=path, filesystem=filesystem
        )
def bookings_feature_table_with_mapping(spark, client):
    schema = StructType([
        StructField("id", IntegerType()),
        StructField("datetime", TimestampType()),
        StructField("created_datetime", TimestampType()),
        StructField("total_completed_bookings", IntegerType()),
    ])
    df_data = [
        (
            8001,
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            100,
        ),
        (
            8001,
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            150,
        ),
        (
            8002,
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            200,
        ),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "bookings", schema, df_data)

    file_source = FileSource(
        event_timestamp_column="datetime",
        created_timestamp_column="created_datetime",
        file_format=ParquetFormat(),
        file_url=file_uri,
        field_mapping={"id": "driver_id"},
    )
    features = [Feature("total_completed_bookings", ValueType.INT32)]
    max_age = Duration()
    max_age.FromSeconds(86400)
    feature_table = FeatureTable(
        "bookings", ["driver_id"], features, batch_source=file_source, max_age=max_age
    )
    yield client.apply(feature_table)
    shutil.rmtree(temp_dir)
def test_update_feature_views_with_inferred_features():
    file_source = FileSource(name="test", path="test path")
    entity1 = Entity(name="test1", join_keys=["test_column_1"])
    entity2 = Entity(name="test2", join_keys=["test_column_2"])
    feature_view_1 = FeatureView(
        name="test1",
        entities=[entity1],
        schema=[
            Field(name="feature", dtype=Float32),
            Field(name="test_column_1", dtype=String),
        ],
        source=file_source,
    )
    feature_view_2 = FeatureView(
        name="test2",
        entities=[entity1, entity2],
        schema=[
            Field(name="feature", dtype=Float32),
            Field(name="test_column_1", dtype=String),
            Field(name="test_column_2", dtype=String),
        ],
        source=file_source,
    )

    assert len(feature_view_1.schema) == 2
    assert len(feature_view_1.features) == 2

    # The entity field should be deleted from the schema and features of the feature view.
    update_feature_views_with_inferred_features(
        [feature_view_1], [entity1], RepoConfig(provider="local", project="test")
    )
    assert len(feature_view_1.schema) == 1
    assert len(feature_view_1.features) == 1

    assert len(feature_view_2.schema) == 3
    assert len(feature_view_2.features) == 3

    # The entity fields should be deleted from the schema and features of the feature view.
    update_feature_views_with_inferred_features(
        [feature_view_2],
        [entity1, entity2],
        RepoConfig(provider="local", project="test"),
    )
    assert len(feature_view_2.schema) == 1
    assert len(feature_view_2.features) == 1
def batch_source(local_staging_path: str, pytestconfig, request: FixtureRequest):
    if pytestconfig.getoption("env") == "gcloud":
        bq_project = pytestconfig.getoption("bq_project")
        bq_dataset = request.getfixturevalue("bq_dataset")
        return BigQuerySource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created_timestamp",
            table_ref=f"{bq_project}:{bq_dataset}.source_{datetime.now():%Y%m%d%H%M%s}",
        )
    else:
        return FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "transactions"),
        )
def create_data_source(
    self,
    destination: str,
    df: pd.DataFrame,
    event_timestamp_column="ts",
    created_timestamp_column="created_ts",
    field_mapping: Dict[str, str] = None,
) -> DataSource:
    self.f = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False)
    df.to_parquet(self.f.name)
    return FileSource(
        file_format=ParquetFormat(),
        path=f"file://{self.f.name}",
        event_timestamp_column=event_timestamp_column,
        created_timestamp_column=created_timestamp_column,
        date_partition_column="",
        field_mapping=field_mapping or {"ts_1": "ts"},
    )
def create_schema(kafka_broker, topic_name, feature_table_name):
    entity = Entity(name="key", description="Key", value_type=ValueType.INT64)
    feature_table = FeatureTable(
        name=feature_table_name,
        entities=["key"],
        features=[Feature("num", ValueType.INT64), Feature("set", ValueType.STRING)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url="/dev/null",
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )
    return entity, feature_table
def prep_redis_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        df = create_dataset()
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            path=f"file://{f.name}",
            event_timestamp_column="ts",
            created_timestamp_column="created_ts",
            date_partition_column="",
            field_mapping={"ts_1": "ts", "id": "driver_id"},
        )
        fv = driver_feature_view(file_source)
        e = Entity(
            name="driver",
            description="id for driver",
            join_key="driver_id",
            value_type=ValueType.INT32,
        )
        project = f"test_redis_correctness_{str(uuid.uuid4()).replace('-', '')}"
        print(f"Using project: {project}")
        with tempfile.TemporaryDirectory() as repo_dir_name:
            config = RepoConfig(
                registry=str(Path(repo_dir_name) / "registry.db"),
                project=project,
                provider="local",
                online_store=RedisOnlineStoreConfig(
                    type="redis",
                    redis_type=RedisType.redis,
                    connection_string="localhost:6379,db=0",
                ),
            )
            fs = FeatureStore(config=config)
            fs.apply([fv, e])

            yield fs, fv

            fs.teardown()
def test_historical_feature_retrieval_with_field_mappings_from_local_spark_session(
    spark,
    client,
    driver_entity,
    bookings_feature_table_with_mapping,
):
    schema = StructType([
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
    ])
    df_data = [
        (8001, datetime(year=2020, month=9, day=1, tzinfo=utc)),
        (8001, datetime(year=2020, month=9, day=2, tzinfo=utc)),
        (8002, datetime(year=2020, month=9, day=1, tzinfo=utc)),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "drivers", schema, df_data)
    entity_source = FileSource(
        event_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=file_uri,
    )
    joined_df = client.get_historical_features_df(
        ["bookings:total_completed_bookings"],
        entity_source,
    )
    expected_joined_df_schema = StructType([
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("bookings__total_completed_bookings", IntegerType()),
    ])
    expected_joined_df_data = [
        (8001, datetime(year=2020, month=9, day=1, tzinfo=utc), 100),
        (8001, datetime(year=2020, month=9, day=2, tzinfo=utc), 150),
        (8002, datetime(year=2020, month=9, day=1, tzinfo=utc), None),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_df_data),
        expected_joined_df_schema,
    )
    assert_dataframe_equal(joined_df, expected_joined_df)
    shutil.rmtree(temp_dir)
def bookings_feature_table(spark, client):
    schema = StructType([
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("created_timestamp", TimestampType()),
        StructField("total_completed_bookings", IntegerType()),
    ])
    df_data = [
        (
            8001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            100,
        ),
        (
            8001,
            datetime(year=2020, month=9, day=2),
            datetime(year=2020, month=9, day=2),
            150,
        ),
        (
            8002,
            datetime(year=2020, month=9, day=2),
            datetime(year=2020, month=9, day=2),
            200,
        ),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "bookings", schema, df_data)

    file_source = FileSource(
        "event_timestamp", "created_timestamp", "parquet", file_uri
    )
    features = [Feature("total_completed_bookings", ValueType.INT32)]
    max_age = Duration()
    max_age.FromSeconds(86400)
    feature_table = FeatureTable(
        "bookings", ["driver_id"], features, batch_source=file_source, max_age=max_age
    )
    yield client.apply_feature_table(feature_table)
    shutil.rmtree(temp_dir)
def test_apply_feature_view_integration(test_registry):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    project = "project"

    # Register Feature View
    test_registry.apply_feature_view(fv1, project)

    feature_views = test_registry.list_feature_views(project)

    # List Feature Views
    assert (
        len(feature_views) == 1
        and feature_views[0].name == "my_feature_view_1"
        and feature_views[0].features[0].name == "fs1_my_feature_1"
        and feature_views[0].features[0].dtype == ValueType.INT64
        and feature_views[0].features[1].name == "fs1_my_feature_2"
        and feature_views[0].features[1].dtype == ValueType.STRING
        and feature_views[0].features[2].name == "fs1_my_feature_3"
        and feature_views[0].features[2].dtype == ValueType.STRING_LIST
        and feature_views[0].features[3].name == "fs1_my_feature_4"
        and feature_views[0].features[3].dtype == ValueType.BYTES_LIST
        and feature_views[0].entities[0] == "fs1_my_entity_1"
    )

    feature_view = test_registry.get_feature_view("my_feature_view_1", project)
    assert (
        feature_view.name == "my_feature_view_1"
        and feature_view.features[0].name == "fs1_my_feature_1"
        and feature_view.features[0].dtype == ValueType.INT64
        and feature_view.features[1].name == "fs1_my_feature_2"
        and feature_view.features[1].dtype == ValueType.STRING
        and feature_view.features[2].name == "fs1_my_feature_3"
        and feature_view.features[2].dtype == ValueType.STRING_LIST
        and feature_view.features[3].name == "fs1_my_feature_4"
        and feature_view.features[3].dtype == ValueType.BYTES_LIST
        and feature_view.entities[0] == "fs1_my_entity_1"
    )

    test_registry.delete_feature_view("my_feature_view_1", project)
    feature_views = test_registry.list_feature_views(project)
    assert len(feature_views) == 0

    test_registry.teardown()

    # Will try to reload registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()
def evaluate_historical_retrieval():

    # Make sure all event timestamp fields are tz-aware. We default tz-naive fields to UTC
    entity_df[entity_df_event_timestamp_col] = entity_df[
        entity_df_event_timestamp_col
    ].apply(lambda x: x if x.tzinfo is not None else x.replace(tzinfo=pytz.utc))

    # Create a copy of entity_df to prevent modifying the original
    entity_df_with_features = entity_df.copy()

    # Convert event timestamp column to datetime and normalize time zone to UTC
    # This is necessary to avoid issues with pd.merge_asof
    entity_df_with_features[entity_df_event_timestamp_col] = pd.to_datetime(
        entity_df_with_features[entity_df_event_timestamp_col], utc=True
    )

    # Sort event timestamp values
    entity_df_with_features = entity_df_with_features.sort_values(
        entity_df_event_timestamp_col
    )

    # Load feature view data from sources and join them incrementally
    for feature_view, features in feature_views_to_features.items():
        event_timestamp_column = feature_view.batch_source.event_timestamp_column
        created_timestamp_column = feature_view.batch_source.created_timestamp_column

        # Read offline parquet data in pyarrow format.
        filesystem, path = FileSource.create_filesystem_and_path(
            feature_view.batch_source.path,
            feature_view.batch_source.file_options.s3_endpoint_override,
        )
        table = pyarrow.parquet.read_table(path, filesystem=filesystem)

        # Rename columns by the field mapping dictionary if it exists
        if feature_view.batch_source.field_mapping is not None:
            table = _run_field_mapping(table, feature_view.batch_source.field_mapping)
        # Rename entity columns by the join_key_map dictionary if it exists
        if feature_view.projection.join_key_map:
            table = _run_field_mapping(table, feature_view.projection.join_key_map)

        # Convert pyarrow table to pandas dataframe. Note, if the underlying data has missing values,
        # pandas will convert those values to np.nan if the dtypes are numerical (floats, ints, etc.) or boolean
        # If the dtype is 'object', then missing values are inferred as python `None`s.
        # More details at:
        # https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#values-considered-missing
        df_to_join = table.to_pandas()

        # Make sure all timestamp fields are tz-aware. We default tz-naive fields to UTC
        df_to_join[event_timestamp_column] = df_to_join[event_timestamp_column].apply(
            lambda x: x if x.tzinfo is not None else x.replace(tzinfo=pytz.utc)
        )

        if created_timestamp_column:
            df_to_join[created_timestamp_column] = df_to_join[
                created_timestamp_column
            ].apply(
                lambda x: x if x.tzinfo is not None else x.replace(tzinfo=pytz.utc)
            )

        # Sort dataframe by the event timestamp column
        df_to_join = df_to_join.sort_values(event_timestamp_column)

        # Build a list of all the features we should select from this source
        feature_names = []
        for feature in features:
            # Modify the separator for feature refs in column names to double underscore. We are using
            # double underscore as separator for consistency with other databases like BigQuery,
            # where there are very few characters available for use as separators
            if full_feature_names:
                formatted_feature_name = (
                    f"{feature_view.projection.name_to_use()}__{feature}"
                )
            else:
                formatted_feature_name = feature
            # Add the feature name to the list of columns
            feature_names.append(formatted_feature_name)

            # Ensure that the source dataframe feature column includes the feature view name as a prefix
            df_to_join.rename(
                columns={feature: formatted_feature_name},
                inplace=True,
            )

        # Build a list of entity columns to join on (from the right table)
        join_keys = []
        for entity_name in feature_view.entities:
            entity = registry.get_entity(entity_name, project)
            join_key = feature_view.projection.join_key_map.get(
                entity.join_key, entity.join_key
            )
            join_keys.append(join_key)
        right_entity_columns = join_keys
        right_entity_key_columns = [event_timestamp_column] + right_entity_columns

        # Remove all duplicate entity keys (using created timestamp)
        right_entity_key_sort_columns = right_entity_key_columns
        if created_timestamp_column:
            # If created_timestamp is available, use it to dedupe deterministically
            right_entity_key_sort_columns = right_entity_key_sort_columns + [
                created_timestamp_column
            ]

        df_to_join.sort_values(by=right_entity_key_sort_columns, inplace=True)
        df_to_join.drop_duplicates(
            right_entity_key_sort_columns,
            keep="last",
            ignore_index=True,
            inplace=True,
        )

        # Select only the columns we need to join from the feature dataframe
        df_to_join = df_to_join[right_entity_key_columns + feature_names]

        # Do point in-time-join between entity_df and feature dataframe
        entity_df_with_features = pd.merge_asof(
            entity_df_with_features,
            df_to_join,
            left_on=entity_df_event_timestamp_col,
            right_on=event_timestamp_column,
            by=right_entity_columns or None,
            tolerance=feature_view.ttl,
        )

        # Remove right (feature table/view) event_timestamp column.
        if event_timestamp_column != entity_df_event_timestamp_col:
            entity_df_with_features.drop(
                columns=[event_timestamp_column], inplace=True
            )

        # Ensure that we delete dataframes to free up memory
        del df_to_join

    # Move "event_timestamp" column to front
    current_cols = entity_df_with_features.columns.tolist()
    current_cols.remove(entity_df_event_timestamp_col)
    entity_df_with_features = entity_df_with_features[
        [entity_df_event_timestamp_col] + current_cols
    ]

    return entity_df_with_features