def alltypes_featuretable():
    batch_source = FileSource(
        file_format="parquet",
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )
    return FeatureTable(
        name="alltypes",
        entities=["alltypes_id"],
        features=[
            Feature(name="float_feature", dtype=ValueType.FLOAT),
            Feature(name="int64_feature", dtype=ValueType.INT64),
            Feature(name="int32_feature", dtype=ValueType.INT32),
            Feature(name="string_feature", dtype=ValueType.STRING),
            Feature(name="bytes_feature", dtype=ValueType.BYTES),
            Feature(name="bool_feature", dtype=ValueType.BOOL),
            Feature(name="double_feature", dtype=ValueType.DOUBLE),
            Feature(name="double_list_feature", dtype=ValueType.DOUBLE_LIST),
            Feature(name="float_list_feature", dtype=ValueType.FLOAT_LIST),
            Feature(name="int64_list_feature", dtype=ValueType.INT64_LIST),
            Feature(name="int32_list_feature", dtype=ValueType.INT32_LIST),
            Feature(name="string_list_feature", dtype=ValueType.STRING_LIST),
            Feature(name="bytes_list_feature", dtype=ValueType.BYTES_LIST),
            Feature(name="bool_list_feature", dtype=ValueType.BOOL_LIST),
        ],
        max_age=Duration(seconds=3600),
        batch_source=batch_source,
        labels={"cat": "alltypes"},
    )
def prep_dynamodb_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        df = create_dataset()
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            file_url=f"file://{f.name}",
            event_timestamp_column="ts",
            created_timestamp_column="created_ts",
            date_partition_column="",
            field_mapping={"ts_1": "ts", "id": "driver_id"},
        )
        fv = get_feature_view(file_source)
        e = Entity(
            name="driver",
            description="id for driver",
            join_key="driver_id",
            value_type=ValueType.INT32,
        )
        with tempfile.TemporaryDirectory() as repo_dir_name:
            config = RepoConfig(
                registry=str(Path(repo_dir_name) / "registry.db"),
                project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
                provider="aws",
                online_store=DynamoDBOnlineStoreConfig(region="us-west-2"),
                offline_store=FileOfflineStoreConfig(),
            )
            fs = FeatureStore(config=config)
            fs.apply([fv, e])
            yield fs, fv
def stage_entities_to_fs(
    entity_source: pd.DataFrame, staging_location: str, config: Config
) -> FileSource:
    """
    Dumps the given (entities) dataframe as a parquet file and stages it to remote
    file storage (a subdirectory of staging_location).

    :return: FileSource with the remote destination path
    """
    entity_staging_uri = urlparse(os.path.join(staging_location, str(uuid.uuid4())))
    staging_client = get_staging_client(entity_staging_uri.scheme, config)
    with tempfile.NamedTemporaryFile() as df_export_path:
        # prevent casting ns -> ms exception inside pyarrow
        entity_source["event_timestamp"] = entity_source["event_timestamp"].dt.floor("ms")

        entity_source.to_parquet(df_export_path.name)

        with open(df_export_path.name, "rb") as f:
            staging_client.upload_fileobj(
                f, df_export_path.name, remote_uri=entity_staging_uri
            )

    # ToDo: support custom event_timestamp_column
    return FileSource(
        event_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=entity_staging_uri.geturl(),
    )
def prep_local_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        df = create_dataset()
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            file_url=f"file://{f.name}",
            event_timestamp_column="ts",
            created_timestamp_column="created_ts",
            date_partition_column="",
            field_mapping={"ts_1": "ts", "id": "driver_id"},
        )
        fv = get_feature_view(file_source)

        with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name:
            config = RepoConfig(
                registry=str(Path(repo_dir_name) / "registry.db"),
                project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
                provider="local",
                online_store=OnlineStoreConfig(
                    local=LocalOnlineStoreConfig(
                        path=str(Path(data_dir_name) / "online_store.db")
                    )
                ),
            )
            fs = FeatureStore(config=config)
            fs.apply([fv])
            yield fs, fv
def stage_dataframe(df, event_timestamp_column: str, config: Config) -> FileSource:
    """
    Helper function to upload a pandas dataframe in parquet format to a temporary
    location (under SPARK_STAGING_LOCATION) and return it wrapped in a FileSource.

    Args:
        event_timestamp_column(str): the name of the timestamp column in the dataframe.
        config(Config): feast config.
    """
    staging_location = config.get(opt.SPARK_STAGING_LOCATION)
    staging_uri = urlparse(staging_location)

    with tempfile.NamedTemporaryFile() as f:
        df.to_parquet(f)

        file_url = urlunparse(
            get_staging_client(staging_uri.scheme, config).upload_fileobj(
                f,
                f.name,
                remote_path_prefix=os.path.join(staging_location, "dataframes"),
                remote_path_suffix=".parquet",
            )
        )

    return FileSource(
        event_timestamp_column=event_timestamp_column,
        file_format=ParquetFormat(),
        file_url=file_url,
    )
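# Hedged usage sketch for the stage_dataframe helper above. The dataframe contents
# and the Config import path are illustrative assumptions (the import location may
# differ across Feast SDK versions); they are not part of the original snippet.
import pandas as pd

from feast.config import Config  # assumption: Config lives here in this SDK version

entity_df = pd.DataFrame(
    {
        "driver_id": [1001, 1002],
        "event_timestamp": pd.to_datetime(
            ["2021-04-12 10:00:00", "2021-04-12 11:00:00"]
        ),
    }
)

# stage_dataframe uploads the dataframe as a parquet file under
# SPARK_STAGING_LOCATION and wraps the staged location in a FileSource.
staged_source = stage_dataframe(entity_df, "event_timestamp", Config())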
def stage_customer_daily_profile_parquet_source(directory, df):
    customer_profile_path = os.path.join(directory, "customer_profile.parquet")
    df.to_parquet(path=customer_profile_path, allow_truncated_timestamps=True)
    return FileSource(
        path=customer_profile_path,
        event_timestamp_column="datetime",
        created_timestamp_column="created",
    )
def test_apply_feature_view_integration(self, test_feature_store):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        input=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1])

    # List Feature Views
    feature_views = test_feature_store.list_feature_views()
    assert (
        len(feature_views) == 1
        and feature_views[0].name == "my_feature_view_1"
        and feature_views[0].features[0].name == "fs1_my_feature_1"
        and feature_views[0].features[0].dtype == ValueType.INT64
        and feature_views[0].features[1].name == "fs1_my_feature_2"
        and feature_views[0].features[1].dtype == ValueType.STRING
        and feature_views[0].features[2].name == "fs1_my_feature_3"
        and feature_views[0].features[2].dtype == ValueType.STRING_LIST
        and feature_views[0].features[3].name == "fs1_my_feature_4"
        and feature_views[0].features[3].dtype == ValueType.BYTES_LIST
        and feature_views[0].entities[0] == "fs1_my_entity_1"
    )

    feature_view = test_feature_store.get_feature_view("my_feature_view_1")
    assert (
        feature_view.name == "my_feature_view_1"
        and feature_view.features[0].name == "fs1_my_feature_1"
        and feature_view.features[0].dtype == ValueType.INT64
        and feature_view.features[1].name == "fs1_my_feature_2"
        and feature_view.features[1].dtype == ValueType.STRING
        and feature_view.features[2].name == "fs1_my_feature_3"
        and feature_view.features[2].dtype == ValueType.STRING_LIST
        and feature_view.features[3].name == "fs1_my_feature_4"
        and feature_view.features[3].dtype == ValueType.BYTES_LIST
        and feature_view.entities[0] == "fs1_my_entity_1"
    )

    test_feature_store.delete_feature_view("my_feature_view_1")
    feature_views = test_feature_store.list_feature_views()
    assert len(feature_views) == 0
def stage_driver_hourly_stats_parquet_source(directory, df):
    # Write to disk
    driver_stats_path = os.path.join(directory, "driver_stats.parquet")
    df.to_parquet(path=driver_stats_path, allow_truncated_timestamps=True)
    return FileSource(
        path=driver_stats_path,
        event_timestamp_column="datetime",
        created_timestamp_column="",
    )
def test_apply_object_and_read(test_feature_store):
    assert isinstance(test_feature_store, FeatureStore)

    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
    )

    e1 = Entity(
        name="fs1_my_entity_1", value_type=ValueType.STRING, description="something"
    )
    e2 = Entity(
        name="fs1_my_entity_2", value_type=ValueType.STRING, description="something"
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        input=batch_source,
        ttl=timedelta(minutes=5),
    )

    fv2 = FeatureView(
        name="my_feature_view_2",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        input=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature Views and Entities
    test_feature_store.apply([fv1, e1, fv2, e2])

    fv1_actual = test_feature_store.get_feature_view("my_feature_view_1")
    e1_actual = test_feature_store.get_entity("fs1_my_entity_1")

    assert fv1 == fv1_actual
    assert e1 == e1_actual
    assert fv2 != fv1_actual
    assert e2 != e1_actual
def prep_file_source(df, event_timestamp_column=None) -> FileSource:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            file_url=f.name,
            event_timestamp_column=event_timestamp_column,
        )
        yield file_source
def test_apply_feature_table_success(self, test_client):
    test_client.set_project("project1")

    # Create Feature Tables
    batch_source = FileSource(
        file_format="parquet",
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    stream_source = KafkaSource(
        bootstrap_servers="localhost:9094",
        class_path="random/path/to/class",
        topic="test_topic",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
    )

    ft1 = FeatureTable(
        name="my-feature-table-1",
        features=[
            Feature(name="fs1-my-feature-1", dtype=ValueType.INT64),
            Feature(name="fs1-my-feature-2", dtype=ValueType.STRING),
            Feature(name="fs1-my-feature-3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1-my-feature-4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1-my-entity-1"],
        labels={"team": "matchmaking"},
        batch_source=batch_source,
        stream_source=stream_source,
    )

    # Register Feature Table with Core
    test_client.apply_feature_table(ft1)

    # List Feature Tables
    feature_tables = test_client.list_feature_tables()
    assert (
        len(feature_tables) == 1
        and feature_tables[0].name == "my-feature-table-1"
        and feature_tables[0].features[0].name == "fs1-my-feature-1"
        and feature_tables[0].features[0].dtype == ValueType.INT64
        and feature_tables[0].features[1].name == "fs1-my-feature-2"
        and feature_tables[0].features[1].dtype == ValueType.STRING
        and feature_tables[0].features[2].name == "fs1-my-feature-3"
        and feature_tables[0].features[2].dtype == ValueType.STRING_LIST
        and feature_tables[0].features[3].name == "fs1-my-feature-4"
        and feature_tables[0].features[3].dtype == ValueType.BYTES_LIST
        and feature_tables[0].entities[0] == "fs1-my-entity-1"
    )
def batch_source(self):
    return FileSource(
        field_mapping={
            "ride_distance": "ride_distance",
            "ride_duration": "ride_duration",
        },
        file_format=ParquetFormat(),
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )
def stage_dataframe(self, df: pandas.DataFrame, event_timestamp: str) -> FileSource:
    with tempfile.NamedTemporaryFile() as f:
        df.to_parquet(f)
        file_url = _s3_upload(
            f,
            f.name,
            remote_path_prefix=os.path.join(self._staging_location, "dataframes"),
            remote_path_suffix=".parquet",
        )
    return FileSource(
        event_timestamp_column=event_timestamp,
        file_format=ParquetFormat(),
        file_url=file_url,
    )
def test_feature_table_import_export_yaml(self):
    batch_source = FileSource(
        field_mapping={
            "ride_distance": "ride_distance",
            "ride_duration": "ride_duration",
        },
        file_format="parquet",
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    stream_source = KafkaSource(
        field_mapping={
            "ride_distance": "ride_distance",
            "ride_duration": "ride_duration",
        },
        bootstrap_servers="localhost:9094",
        class_path="random/path/to/class",
        topic="test_topic",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
    )

    test_feature_table = FeatureTable(
        name="car_driver",
        features=[
            Feature(name="ride_distance", dtype=ValueType.FLOAT),
            Feature(name="ride_duration", dtype=ValueType.STRING),
        ],
        entities=["car_driver_entity"],
        labels={"team": "matchmaking"},
        batch_source=batch_source,
        stream_source=stream_source,
    )

    # Create a string YAML representation of the feature table
    string_yaml = test_feature_table.to_yaml()

    # Create a new feature table object from the YAML string
    actual_feature_table_from_string = FeatureTable.from_yaml(string_yaml)

    # Ensure equality is upheld to original feature table
    assert test_feature_table == actual_feature_table_from_string
def basic_featuretable():
    batch_source = FileSource(
        field_mapping={
            "dev_entity": "dev_entity_field",
            "dev_feature_float": "dev_feature_float_field",
            "dev_feature_string": "dev_feature_string_field",
        },
        file_format="PARQUET",
        file_url="gs://example/feast/*",
        event_timestamp_column="datetime_col",
        created_timestamp_column="timestamp",
        date_partition_column="datetime",
    )
    stream_source = KafkaSource(
        field_mapping={
            "dev_entity": "dev_entity_field",
            "dev_feature_float": "dev_feature_float_field",
            "dev_feature_string": "dev_feature_string_field",
        },
        bootstrap_servers="localhost:9094",
        class_path="random/path/to/class",
        topic="test_topic",
        event_timestamp_column="datetime_col",
        created_timestamp_column="timestamp",
    )
    return FeatureTable(
        name="basic_featuretable",
        entities=["driver_id", "customer_id"],
        features=[
            Feature(name="dev_feature_float", dtype=ValueType.FLOAT),
            Feature(name="dev_feature_string", dtype=ValueType.STRING),
        ],
        max_age=Duration(seconds=3600),
        batch_source=batch_source,
        stream_source=stream_source,
        labels={"key1": "val1", "key2": "val2"},
    )
def _create_ft(self, client: Client, features) -> None:
    entity = Entity(
        name="driver_car_id",
        description="Car driver id",
        value_type=ValueType.STRING,
        labels={"team": "matchmaking"},
    )

    # Register Entity with Core
    client.apply_entity(entity)

    # Create Feature Tables
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    stream_source = KafkaSource(
        bootstrap_servers="localhost:9094",
        message_format=ProtoFormat("class.path"),
        topic="test_topic",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
    )

    ft1 = FeatureTable(
        name=self.table_name,
        features=features,
        entities=["driver_car_id"],
        labels={"team": "matchmaking"},
        batch_source=batch_source,
        stream_source=stream_source,
    )

    # Register Feature Table with Core
    client.apply_feature_table(ft1)
def stage_entities_to_fs(entity_source: pd.DataFrame, staging_location: str) -> FileSource:
    """
    Dumps the given (entities) dataframe as a parquet file and stages it to remote
    file storage (a subdirectory of staging_location).

    :return: FileSource with the remote destination path
    """
    entity_staging_uri = urlparse(os.path.join(staging_location, str(uuid.uuid4())))
    staging_client = get_staging_client(entity_staging_uri.scheme)
    with tempfile.NamedTemporaryFile() as df_export_path:
        entity_source.to_parquet(df_export_path.name)
        bucket = (
            None if entity_staging_uri.scheme == "file" else entity_staging_uri.netloc
        )
        staging_client.upload_file(
            df_export_path.name, bucket, entity_staging_uri.path.lstrip("/")
        )

    # ToDo: support custom event_timestamp_column
    return FileSource(
        event_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=entity_staging_uri.geturl(),
    )
# This is an example feature definition file

from google.protobuf.duration_pb2 import Duration

from feast import Entity, Feature, FeatureView, ValueType
from feast.data_source import FileSource

# Read data from parquet files. Parquet is convenient for local development mode. For
# production, you can use your favorite DWH, such as BigQuery. See Feast documentation
# for more info.
wine_features_table = FileSource(
    event_timestamp_column="datetime",
    path="/Users/julesdamji/examples/py/feature_store/data/wine_features.parquet",
)

# Define an entity for the wine features. You can think of an entity as a primary key
# used to fetch features.
acidity = Entity(
    name="volatile_acidity", value_type=ValueType.DOUBLE, description="acidity"
)

# Our parquet files contain serving data with several feature columns. Here we define a
# Feature View that will allow us to serve this data to our model online.
wine_features_view = FeatureView(
    name="wine_features",
    entities=["fixed_acidity"],
    ttl=Duration(seconds=86400 * 1),
    features=[
        Feature(name="volatile_acidity", dtype=ValueType.DOUBLE),
        Feature(name="citric_acid", dtype=ValueType.DOUBLE),
def create_driver_hourly_stats_source(parquet_path):
    return FileSource(
        path=parquet_path,
        event_timestamp_column="datetime",
        created_timestamp_column="created",
    )
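# A minimal sketch of wiring the source returned by create_driver_hourly_stats_source
# into a feature view, mirroring the driver_hourly_stats definitions elsewhere in this
# section. The parquet path and the assumption that the helper above is importable are
# illustrative, not part of the original snippet.
from google.protobuf.duration_pb2 import Duration

from feast import Feature, FeatureView, ValueType

driver_stats_source = create_driver_hourly_stats_source("data/driver_stats.parquet")

driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=Duration(seconds=86400 * 1),
    features=[
        Feature(name="conv_rate", dtype=ValueType.FLOAT),
        Feature(name="acc_rate", dtype=ValueType.FLOAT),
        Feature(name="avg_daily_trips", dtype=ValueType.INT64),
    ],
    input=driver_stats_source,
)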
# This is an example feature definition file

from google.protobuf.duration_pb2 import Duration

from feast import Entity, Feature, FeatureView, ValueType
from feast.data_source import FileSource

# Read data from parquet files. Parquet is convenient for local development mode. For
# production, you can use your favorite DWH, such as BigQuery. See Feast documentation
# for more info.
driver_hourly_stats = FileSource(
    path="/home/ec2-user/SageMaker/feast_fraud_demo/famous_lemur/data/driver_stats.parquet",
    event_timestamp_column="datetime",
    created_timestamp_column="created",
)

# Define an entity for the driver. You can think of an entity as a primary key used to
# fetch features.
driver = Entity(
    name="driver_id",
    value_type=ValueType.INT64,
    description="driver id",
)

# Our parquet files contain sample data that includes a driver_id column, timestamps and
# three feature columns. Here we define a Feature View that will allow us to serve this
# data to our model online.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
# This is an example feature definition file

from google.protobuf.duration_pb2 import Duration

from feast import Entity, Feature, FeatureView, ValueType
from feast.data_source import FileSource

# Read data from parquet files. Parquet is convenient for local development mode. For
# production, you can use your favorite DWH, such as BigQuery. See Feast documentation
# for more info.
driver_hourly_stats = FileSource(
    path="/home/chapman/Documents/feast-start/feast_repo/data/driver_stats.parquet",
    event_timestamp_column="datetime",
    created_timestamp_column="created",
)

# Define an entity for the driver. You can think of an entity as a primary key used to
# fetch features.
driver = Entity(name="driver_id", value_type=ValueType.INT64, description="driver id")

# Our parquet files contain sample data that includes a driver_id column, timestamps and
# three feature columns. Here we define a Feature View that will allow us to serve this
# data to our model online.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=Duration(seconds=86400 * 1),
    features=[
        Feature(name="conv_rate", dtype=ValueType.FLOAT),
        Feature(name="acc_rate", dtype=ValueType.FLOAT),
        Feature(name="avg_daily_trips", dtype=ValueType.INT64),
# Create dummy entity since Feast demands it
entity_2 = Entity(
    name="dummy_entity_2",
    description="Dummy entity 2",
    value_type=ValueType.INT32,
    labels={"key": "val"},
)

# Commit entities
test_client.apply([entity_1, entity_2])

# Dummy file source
batch_source = FileSource(
    file_format=ParquetFormat(),
    file_url="file://feast/*",
    event_timestamp_column="ts_col",
    created_timestamp_column="timestamp",
    date_partition_column="date_partition_col",
)

# First feature table for testing, with all of Feast's datatypes
table_1 = FeatureTable(
    name="test_feature_table_all_feature_dtypes",
    features=[
        Feature(name="test_BYTES_feature", dtype=ValueType.BYTES),
        Feature(name="test_STRING_feature", dtype=ValueType.STRING),
        Feature(name="test_INT32_feature", dtype=ValueType.INT32),
        Feature(name="test_INT64_feature", dtype=ValueType.INT64),
        Feature(name="test_DOUBLE_feature", dtype=ValueType.DOUBLE),
        Feature(name="test_FLOAT_feature", dtype=ValueType.FLOAT),
        Feature(name="test_BOOL_feature", dtype=ValueType.BOOL),
def test_apply_feature_table_integration(self, test_client):
    # Create Feature Tables
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    stream_source = KafkaSource(
        bootstrap_servers="localhost:9094",
        message_format=ProtoFormat("class.path"),
        topic="test_topic",
        event_timestamp_column="ts_col",
    )

    ft1 = FeatureTable(
        name="my-feature-table-1",
        features=[
            Feature(name="fs1-my-feature-1", dtype=ValueType.INT64),
            Feature(name="fs1-my-feature-2", dtype=ValueType.STRING),
            Feature(name="fs1-my-feature-3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1-my-feature-4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1-my-entity-1"],
        labels={"team": "matchmaking"},
        batch_source=batch_source,
        stream_source=stream_source,
    )

    # Register Feature Table with Core
    test_client.apply(ft1)

    # List Feature Tables
    feature_tables = test_client.list_feature_tables()
    assert (
        len(feature_tables) == 1
        and feature_tables[0].name == "my-feature-table-1"
        and feature_tables[0].features[0].name == "fs1-my-feature-1"
        and feature_tables[0].features[0].dtype == ValueType.INT64
        and feature_tables[0].features[1].name == "fs1-my-feature-2"
        and feature_tables[0].features[1].dtype == ValueType.STRING
        and feature_tables[0].features[2].name == "fs1-my-feature-3"
        and feature_tables[0].features[2].dtype == ValueType.STRING_LIST
        and feature_tables[0].features[3].name == "fs1-my-feature-4"
        and feature_tables[0].features[3].dtype == ValueType.BYTES_LIST
        and feature_tables[0].entities[0] == "fs1-my-entity-1"
    )

    feature_table = test_client.get_feature_table("my-feature-table-1")
    assert (
        feature_table.name == "my-feature-table-1"
        and feature_table.features[0].name == "fs1-my-feature-1"
        and feature_table.features[0].dtype == ValueType.INT64
        and feature_table.features[1].name == "fs1-my-feature-2"
        and feature_table.features[1].dtype == ValueType.STRING
        and feature_table.features[2].name == "fs1-my-feature-3"
        and feature_table.features[2].dtype == ValueType.STRING_LIST
        and feature_table.features[3].name == "fs1-my-feature-4"
        and feature_table.features[3].dtype == ValueType.BYTES_LIST
        and feature_table.entities[0] == "fs1-my-entity-1"
    )

    test_client.delete_feature_table("my-feature-table-1")
    feature_tables = test_client.list_feature_tables()
    assert len(feature_tables) == 0
# This is an example feature definition file

from google.protobuf.duration_pb2 import Duration

from feast import Entity, Feature, FeatureView, ValueType
from feast.data_source import FileSource

# Read data from parquet files. Parquet is convenient for local development mode. For
# production, you can use your favorite DWH, such as BigQuery. See Feast documentation
# for more info.
driver_hourly_stats = FileSource(
    path="%PARQUET_PATH%",
    event_timestamp_column="datetime",
    created_timestamp_column="created",
)

# Define an entity for the driver. You can think of an entity as a primary key used to
# fetch features.
driver = Entity(
    name="driver_id",
    value_type=ValueType.INT64,
    description="driver id",
)

# Our parquet files contain sample data that includes a driver_id column, timestamps and
# three feature columns. Here we define a Feature View that will allow us to serve this
# data to our model online.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=Duration(seconds=86400 * 1),
# This is an example feature definition file

from google.protobuf.duration_pb2 import Duration

from feast import Entity, Feature, FeatureView, ValueType
from feast.data_source import FileSource

# Read data from parquet files. Parquet is convenient for local development mode. For
# production, you can use your favorite DWH, such as BigQuery. See Feast documentation
# for more info.
driver_hourly_stats = FileSource(
    path="/Users/julesdamji/examples/py/feature_store/feature_repo/data/driver_stats.parquet",
    event_timestamp_column="datetime",
    created_timestamp_column="created",
)

# Define an entity for the driver. You can think of an entity as a primary key used to
# fetch features.
driver = Entity(
    name="driver_id",
    value_type=ValueType.INT64,
    description="driver id",
)

# Our parquet files contain sample data that includes a driver_id column, timestamps and
# three feature columns. Here we define a Feature View that will allow us to serve this
# data to our model online.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
from google.protobuf.duration_pb2 import Duration

from feast import Entity, FeatureView, ValueType
from feast.data_source import FileSource

driver_hourly_stats = FileSource(
    path="%PARQUET_PATH%",  # placeholder to be replaced by the test
    created_timestamp_column="created",
)

driver = Entity(name="driver_id", value_type=ValueType.INT64, description="driver id")

# features are inferred from columns of data source
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=Duration(seconds=86400 * 1),
    online=True,
    input=driver_hourly_stats,
    tags={},
)
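# A hedged sketch of how the inferred-schema view above might be registered and
# inspected with the Feast 0.10+ SDK. FeatureStore(repo_path=...) and the repo
# layout are illustrative assumptions, not part of the original snippet.
from feast import FeatureStore

store = FeatureStore(repo_path=".")
store.apply([driver, driver_hourly_stats_view])

# After apply, the view's features should be populated from the parquet columns
# (excluding the entity join key and timestamp columns).
registered_view = store.get_feature_view("driver_hourly_stats")
print([feature.name for feature in registered_view.features])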
def get_historical_features(
    self,
    feature_refs: List[str],
    entity_source: Union[pd.DataFrame, FileSource, BigQuerySource],
    project: str = None,
) -> RetrievalJob:
    """
    Launch a historical feature retrieval job.

    Args:
        feature_refs: List of feature references that will be returned for each entity.
            Each feature reference should have the following format:
            "feature_table:feature" where "feature_table" & "feature" refer to the
            feature and feature table names respectively.
        entity_source (Union[pd.DataFrame, FileSource, BigQuerySource]): Source for the
            entity rows. If entity_source is a Pandas DataFrame, the dataframe will be
            exported to the staging location as a parquet file. It is also assumed that
            the column event_timestamp is present in the dataframe, and is of type
            datetime without timezone information.

            The user needs to make sure that the source (or staging location, if
            entity_source is a Pandas DataFrame) is accessible from the Spark cluster
            that will be used for the retrieval job.
        project: Specifies the project that contains the feature tables which the
            requested features belong to.

    Returns:
        Returns a retrieval job object that can be used to monitor retrieval
        progress asynchronously, and can be used to materialize the results.

    Examples:
        >>> from feast import Client
        >>> from datetime import datetime
        >>> feast_client = Client(core_url="localhost:6565")
        >>> feature_refs = ["bookings:bookings_7d", "bookings:booking_14d"]
        >>> entity_source = FileSource("event_timestamp", "parquet", "gs://some-bucket/customer")
        >>> feature_retrieval_job = feast_client.get_historical_features(
        >>>     feature_refs, entity_source, project="my_project")
        >>> output_file_uri = feature_retrieval_job.get_output_file_uri()
            "gs://some-bucket/output/"
    """
    feature_tables = self._get_feature_tables_from_feature_refs(feature_refs, project)
    output_location = os.path.join(
        self._config.get(CONFIG_SPARK_HISTORICAL_FEATURE_OUTPUT_LOCATION),
        str(uuid.uuid4()),
    )
    output_format = self._config.get(CONFIG_SPARK_HISTORICAL_FEATURE_OUTPUT_FORMAT)

    if isinstance(entity_source, pd.DataFrame):
        staging_location = self._config.get(CONFIG_SPARK_STAGING_LOCATION)
        entity_staging_uri = urlparse(
            os.path.join(staging_location, str(uuid.uuid4()))
        )
        staging_client = get_staging_client(entity_staging_uri.scheme)
        with tempfile.NamedTemporaryFile() as df_export_path:
            entity_source.to_parquet(df_export_path.name)
            bucket = (
                None
                if entity_staging_uri.scheme == "file"
                else entity_staging_uri.netloc
            )
            staging_client.upload_file(
                df_export_path.name, bucket, entity_staging_uri.path.lstrip("/")
            )
            entity_source = FileSource(
                "event_timestamp",
                "created_timestamp",
                ParquetFormat(),
                entity_staging_uri.geturl(),
            )

    return start_historical_feature_retrieval_job(
        self,
        entity_source,
        feature_tables,
        output_format,
        os.path.join(output_location, str(uuid.uuid4())),
    )
# This is an example feature definition file

from google.protobuf.duration_pb2 import Duration

from feast import Entity, Feature, FeatureView, ValueType
from feast.data_source import FileSource

# Read data from parquet files. Parquet is convenient for local development mode. For
# production, you can use your favorite DWH, such as BigQuery. See Feast documentation
# for more info.
# If it's parquet, the path can simply be a folder of parquet files (per the parquet
# format) - then you can keep appending to the folder as required.
batch_source = FileSource(
    path="/home/chapman/Documents/feast-start/feature_multi/data/events",
    event_timestamp_column="event_timestamp",
    created_timestamp_column="created_timestamp",
)

# Define an entity for the customer. You can think of an entity as a primary key used to
# fetch features.
customer = Entity(
    name="user_id",
    value_type=ValueType.INT64,
    description="customer id for transactions",
)

# Our parquet files contain sample data that includes a user_id column, timestamps and
# feature columns. Here we define a Feature View that will allow us to serve this
# data to our model online.
customer_events = FeatureView(
    name="customer_events",
# This is an example feature definition file

from google.protobuf.duration_pb2 import Duration

from feast import Entity, Feature, FeatureView, ValueType
from feast.data_source import FileSource

# Read data from parquet files. Parquet is convenient for local development mode. For
# production, you can use your favorite DWH, such as BigQuery. See Feast documentation
# for more info.
batch_source = FileSource(
    path="/home/chapman/Documents/feast-start/feature_transaction/data/transactions.parquet",
    event_timestamp_column="event_timestamp",
    created_timestamp_column="created_timestamp",
)

# Define an entity for the customer. You can think of an entity as a primary key used to
# fetch features.
customer = Entity(
    name="user_id",
    value_type=ValueType.INT64,
    description="customer id for transactions",
)

# Our parquet files contain sample data that includes a user_id column, timestamps and
# feature columns. Here we define a Feature View that will allow us to serve this
# data to our model online.
customer_transactions = FeatureView(
    name="customer_transactions",
    entities=["user_id"],
# Feature definition
from datetime import datetime
from pathlib import Path

from feast import Entity, Feature, FeatureView, ValueType
from feast.data_source import FileSource
from google.protobuf.duration_pb2 import Duration

from config import config

# Read data
START_TIME = "2020-02-17"
project_details = FileSource(
    path=str(Path(config.DATA_DIR, "features.parquet")),
    event_timestamp_column="created_on",
)

# Define an entity for the project
project = Entity(
    name="id",
    value_type=ValueType.INT64,
    description="project id",
)

# Define a Feature View for each project
# Can be used for fetching historical data and online serving
project_details_view = FeatureView(
    name="project_details",
    entities=["id"],
    ttl=Duration(seconds=(datetime.today() -