def test_partial() -> None:
    """
    Add another table to an existing repo using the partial apply API. Make sure both
    the table applied via CLI apply and the new table pass the RW test.
    """
    runner = CliRunner()
    with runner.local_repo(get_example_repo("example_feature_repo_1.py"), "bigquery") as store:
        driver_locations_source = BigQuerySource(
            table="feast-oss.public.drivers",
            timestamp_field="event_timestamp",
            created_timestamp_column="created_timestamp",
        )

        driver_locations_100 = FeatureView(
            name="driver_locations_100",
            entities=["driver"],
            ttl=timedelta(days=1),
            schema=[
                Field(name="lat", dtype=Float32),
                Field(name="lon", dtype=String),
                Field(name="name", dtype=String),
            ],
            online=True,
            batch_source=driver_locations_source,
            tags={},
        )

        store.apply([driver_locations_100])

        basic_rw_test(store, view_name="driver_locations")
        basic_rw_test(store, view_name="driver_locations_100")
def test_get_column_names_preserves_feature_ordering():
    entity = Entity("my-entity", description="My entity", value_type=ValueType.STRING)
    fv = FeatureView(
        name="my-fv",
        entities=["my-entity"],
        ttl=timedelta(days=1),
        batch_source=BigQuerySource(table="non-existent-mock"),
        schema=[
            Field(name="a", dtype=String),
            Field(name="b", dtype=String),
            Field(name="c", dtype=String),
            Field(name="d", dtype=String),
            Field(name="e", dtype=String),
            Field(name="f", dtype=String),
            Field(name="g", dtype=String),
            Field(name="h", dtype=String),
            Field(name="i", dtype=String),
            Field(name="j", dtype=String),
        ],
    )

    _, feature_list, _, _ = _get_column_names(fv, [entity])
    assert feature_list == ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]
def stage_entities_to_bq(
    entity_source: pd.DataFrame, project: str, dataset: str
) -> BigQuerySource:
    """
    Stores the given (entity) dataframe as a new table in BQ. The table name is generated
    from the current time, and the table expires in 1 day. Returns a BigQuerySource with a
    reference to the created table.
    """
    bq_client = bigquery.Client()
    destination = bigquery.TableReference(
        bigquery.DatasetReference(project, dataset),
        f"_entities_{datetime.now():%Y%m%d%H%M%s}",
    )

    # prevent casting ns -> ms exception inside pyarrow
    entity_source["event_timestamp"] = entity_source["event_timestamp"].dt.floor("ms")

    load_job: bigquery.LoadJob = bq_client.load_table_from_dataframe(
        entity_source, destination
    )
    load_job.result()  # wait until complete

    dest_table: bigquery.Table = bq_client.get_table(destination)
    dest_table.expires = datetime.now() + timedelta(days=1)
    bq_client.update_table(dest_table, fields=["expires"])

    return BigQuerySource(
        event_timestamp_column="event_timestamp",
        table_ref=f"{destination.project}:{destination.dataset_id}.{destination.table_id}",
    )
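def _example_stage_entities_usage() -> BigQuerySource:
    # Illustrative usage sketch only (not part of the original module): stage a tiny
    # entity dataframe and reuse the returned BigQuerySource for later joins. The
    # project and dataset names below are placeholders and assume valid GCP credentials.
    entity_df = pd.DataFrame(
        {
            "customer_id": [1001, 1002],
            "event_timestamp": pd.to_datetime([datetime.utcnow(), datetime.utcnow()]),
        }
    )
    return stage_entities_to_bq(
        entity_df, project="my-gcp-project", dataset="feast_staging"
    )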
def create_data_source(
    self,
    destination: str,
    df: pd.DataFrame,
    event_timestamp_column="ts",
    created_timestamp_column="created_ts",
    field_mapping: Dict[str, str] = None,
    **kwargs,
) -> DataSource:
    job_config = bigquery.LoadJobConfig()
    if self.gcp_project not in destination:
        destination = f"{self.gcp_project}.{self.project_name}.{destination}"

    job = self.client.load_table_from_dataframe(df, destination, job_config=job_config)
    job.result()

    self.tables.append(destination)
    return BigQuerySource(
        table_ref=destination,
        event_timestamp_column=event_timestamp_column,
        created_timestamp_column=created_timestamp_column,
        date_partition_column="",
        field_mapping=field_mapping or {"ts_1": "ts"},
    )
def create_data_source(
    self,
    df: pd.DataFrame,
    destination_name: Optional[str] = None,
    event_timestamp_column="ts",
    created_timestamp_column="created_ts",
    field_mapping: Dict[str, str] = None,
    **kwargs,
) -> DataSource:
    destination_name = self.get_prefixed_table_name(destination_name)

    self.create_dataset()

    if self.gcp_project not in destination_name:
        destination_name = (
            f"{self.gcp_project}.{self.project_name}.{destination_name}"
        )

    job = self.client.load_table_from_dataframe(df, destination_name)
    job.result()

    self.tables.append(destination_name)

    return BigQuerySource(
        table_ref=destination_name,
        event_timestamp_column=event_timestamp_column,
        created_timestamp_column=created_timestamp_column,
        date_partition_column="",
        field_mapping=field_mapping or {"ts_1": "ts"},
    )
def create_bq_view_of_joined_features_and_entities(
    source: BigQuerySource, entity_source: BigQuerySource, entity_names: List[str]
) -> BigQuerySource:
    """
    Creates BQ view that joins tables from `source` and `entity_source` with join key
    derived from `entity_names`. Returns BigQuerySource with reference to created view.
    """
    bq_client = bigquery.Client()

    source_ref = table_reference_from_string(source.bigquery_options.table_ref)
    entities_ref = table_reference_from_string(entity_source.bigquery_options.table_ref)

    destination_ref = bigquery.TableReference(
        bigquery.DatasetReference(source_ref.project, source_ref.dataset_id),
        f"_view_{source_ref.table_id}_{datetime.now():%Y%m%d%H%M%s}",
    )

    view = bigquery.Table(destination_ref)
    view.view_query = JOIN_TEMPLATE.format(
        entities=entities_ref,
        source=source_ref,
        entity_key=" AND ".join([f"source.{e} = entities.{e}" for e in entity_names]),
    )
    view.expires = datetime.now() + timedelta(days=1)
    bq_client.create_table(view)

    return BigQuerySource(
        event_timestamp_column=source.event_timestamp_column,
        created_timestamp_column=source.created_timestamp_column,
        table_ref=f"{view.project}:{view.dataset_id}.{view.table_id}",
        field_mapping=source.field_mapping,
        date_partition_column=source.date_partition_column,
    )
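# JOIN_TEMPLATE is referenced above but not defined in this snippet. Judging only from
# the placeholders it is formatted with ({entities}, {source}, {entity_key}), a plausible
# shape could look like the constant below; this is an assumption for illustration, not
# the original template.
_EXAMPLE_JOIN_TEMPLATE = """SELECT source.*
FROM `{source}` AS source
JOIN `{entities}` AS entities
ON {entity_key}"""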
def simple_bq_source_using_query_arg(df, event_timestamp_column=None) -> BigQuerySource:
    bq_source_using_table = simple_bq_source_using_table_arg(df, event_timestamp_column)
    return BigQuerySource(
        name=bq_source_using_table.table,
        query=f"SELECT * FROM {bq_source_using_table.table}",
        timestamp_field=event_timestamp_column,
    )
def test_basic(self) -> None:
    """
    Add another table to an existing repo using the partial apply API. Make sure both
    the table applied via CLI apply and the new table pass the RW test.
    """
    runner = CliRunner()
    with runner.local_repo(get_example_repo("example_feature_repo_1.py")) as store:
        driver_locations_source = BigQuerySource(
            table_ref="rh_prod.ride_hailing_co.drivers",
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created_timestamp",
        )

        driver_locations_100 = FeatureView(
            name="driver_locations_100",
            entities=["driver"],
            ttl=Duration(seconds=86400 * 1),
            features=[
                Feature(name="lat", dtype=ValueType.FLOAT),
                Feature(name="lon", dtype=ValueType.STRING),
                Feature(name="name", dtype=ValueType.STRING),
            ],
            online=True,
            input=driver_locations_source,
            tags={},
        )

        store.apply([driver_locations_100])

        basic_rw_test(store, view_name="driver_locations")
        basic_rw_test(store, view_name="driver_locations_100")
def simple_bq_source_using_query_arg(df, event_timestamp_column=None) -> BigQuerySource:
    bq_source_using_table_ref = simple_bq_source_using_table_ref_arg(
        df, event_timestamp_column
    )
    return BigQuerySource(
        query=f"SELECT * FROM {bq_source_using_table_ref.table_ref}",
        event_timestamp_column=event_timestamp_column,
    )
def prep_bq_fs_and_fv(
    bq_source_type: str,
) -> Iterator[Tuple[FeatureStore, FeatureView]]:
    client = bigquery.Client()
    gcp_project = client.project
    bigquery_dataset = "test_ingestion"
    dataset = bigquery.Dataset(f"{gcp_project}.{bigquery_dataset}")
    client.create_dataset(dataset, exists_ok=True)
    dataset.default_table_expiration_ms = (
        1000 * 60 * 60 * 24 * 14
    )  # 2 weeks in milliseconds
    client.update_dataset(dataset, ["default_table_expiration_ms"])

    df = create_dataset()

    job_config = bigquery.LoadJobConfig()
    table_ref = (
        f"{gcp_project}.{bigquery_dataset}.{bq_source_type}_correctness_{int(time.time_ns())}"
    )
    query = f"SELECT * FROM `{table_ref}`"
    job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
    job.result()

    bigquery_source = BigQuerySource(
        table_ref=table_ref if bq_source_type == "table" else None,
        query=query if bq_source_type == "query" else None,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        date_partition_column="",
        field_mapping={"ts_1": "ts", "id": "driver_id"},
    )

    fv = driver_feature_view(bigquery_source)
    e = Entity(
        name="driver",
        description="id for driver",
        join_key="driver_id",
        value_type=ValueType.INT32,
    )
    with tempfile.TemporaryDirectory() as repo_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
            provider="gcp",
            online_store=DatastoreOnlineStoreConfig(namespace="integration_test"),
        )
        fs = FeatureStore(config=config)
        fs.apply([fv, e])

        yield fs, fv

        fs.teardown()
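# Illustrative only: one way a test module might expose the generator above as a pytest
# fixture parameterized over both BigQuery source types. The fixture name is an
# assumption, not part of the original code.
import pytest


@pytest.fixture(params=["table", "query"])
def bq_fs_and_fv(request) -> Iterator[Tuple[FeatureStore, FeatureView]]:
    # Delegate setup and teardown to prep_bq_fs_and_fv for each source type.
    yield from prep_bq_fs_and_fv(request.param)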
def simple_bq_source_using_table_arg(df, event_timestamp_column=None) -> BigQuerySource:
    client = bigquery.Client()
    gcp_project = client.project
    bigquery_dataset = f"ds_{time.time_ns()}"
    dataset = bigquery.Dataset(f"{gcp_project}.{bigquery_dataset}")
    client.create_dataset(dataset, exists_ok=True)
    dataset.default_table_expiration_ms = (
        1000 * 60 * 60  # 60 minutes in milliseconds (seems to be minimum limit for gcloud)
    )
    client.update_dataset(dataset, ["default_table_expiration_ms"])

    table = f"{gcp_project}.{bigquery_dataset}.table_{random.randrange(100, 999)}"
    job = client.load_table_from_dataframe(df, table)
    job.result()

    return BigQuerySource(table=table, timestamp_field=event_timestamp_column)
def batch_source(local_staging_path: str, pytestconfig, request: FixtureRequest):
    if pytestconfig.getoption("env") == "gcloud":
        bq_project = pytestconfig.getoption("bq_project")
        bq_dataset = request.getfixturevalue("bq_dataset")
        return BigQuerySource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created_timestamp",
            table_ref=f"{bq_project}:{bq_dataset}.source_{datetime.now():%Y%m%d%H%M%s}",
        )
    else:
        return FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "transactions"),
        )
def test_offline_ingestion_from_bq_view(
    pytestconfig, bq_dataset, feast_client: Client, feast_spark_client: SparkClient
):
    original = generate_data()
    bq_project = pytestconfig.getoption("bq_project")

    bq_client = bigquery.Client(project=bq_project)
    source_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_source_{datetime.now():%Y%m%d%H%M%s}",
    )
    bq_client.load_table_from_dataframe(original, source_ref).result()

    view_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_view_{datetime.now():%Y%m%d%H%M%s}",
    )
    view = bigquery.Table(view_ref)
    view.view_query = (
        f"select * from `{source_ref.project}.{source_ref.dataset_id}.{source_ref.table_id}`"
    )
    bq_client.create_table(view)

    entity = Entity(name="s2id", description="S2id", value_type=ValueType.INT64)
    feature_table = FeatureTable(
        name="bq_ingestion",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=BigQuerySource(
            event_timestamp_column="event_timestamp",
            table_ref=f"{view_ref.project}:{view_ref.dataset_id}.{view_ref.table_id}",
        ),
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    ingest_and_verify(feast_client, feast_spark_client, feature_table, original)
def simple_bq_source_using_table_ref_arg(
    df, event_timestamp_column=None
) -> BigQuerySource:
    client = bigquery.Client()
    gcp_project = client.project
    bigquery_dataset = "ds"
    dataset = bigquery.Dataset(f"{gcp_project}.{bigquery_dataset}")
    client.create_dataset(dataset, exists_ok=True)
    dataset.default_table_expiration_ms = (
        1000 * 60 * 60  # 60 minutes in milliseconds (seems to be minimum limit for gcloud)
    )
    client.update_dataset(dataset, ["default_table_expiration_ms"])

    table_ref = f"{gcp_project}.{bigquery_dataset}.table_1"
    job = client.load_table_from_dataframe(
        df, table_ref, job_config=bigquery.LoadJobConfig()
    )
    job.result()

    return BigQuerySource(
        table_ref=table_ref,
        event_timestamp_column=event_timestamp_column,
    )
from datetime import timedelta

from feast import BigQuerySource, Entity, Feature, FeatureView, ValueType

driver = Entity(name="driver_id", join_key="driver_id", value_type=ValueType.INT64)

driver_stats_source = BigQuerySource(
    table_ref="feast-oss.demo_data.driver_hourly_stats",
    event_timestamp_column="datetime",
    created_timestamp_column="created",
)

driver_stats_fv = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=timedelta(weeks=52),
    features=[
        Feature(name="conv_rate", dtype=ValueType.FLOAT),
        Feature(name="acc_rate", dtype=ValueType.FLOAT),
        Feature(name="avg_daily_trips", dtype=ValueType.INT64),
    ],
    input=driver_stats_source,
    tags={"team": "driver_performance"},
)
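# A minimal retrieval sketch (not part of this repo file), assuming the objects above
# have been applied to a feature store whose repo contains this definition file.
# The entity rows and repo path below are illustrative, so this is left commented out.
#
# from datetime import datetime
# import pandas as pd
# from feast import FeatureStore
#
# store = FeatureStore(repo_path=".")
# entity_df = pd.DataFrame(
#     {"driver_id": [1001, 1002], "event_timestamp": [datetime.utcnow()] * 2}
# )
# training_df = store.get_historical_features(
#     entity_df=entity_df,
#     features=["driver_hourly_stats:conv_rate", "driver_hourly_stats:avg_daily_trips"],
# ).to_df()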
from datetime import timedelta

from feast import BigQuerySource, Entity, Feature, FeatureView, ValueType

nonexistent_source = BigQuerySource(
    table_ref="project.dataset.nonexistent_table", event_timestamp_column=""
)

driver = Entity(name="driver", value_type=ValueType.INT64, description="driver id")

nonexistent_features = FeatureView(
    name="driver_locations",
    entities=["driver"],
    ttl=timedelta(days=1),
    features=[
        Feature(name="lat", dtype=ValueType.FLOAT),
        Feature(name="lon", dtype=ValueType.STRING),
    ],
    input=nonexistent_source,
)
from datetime import timedelta

from feast import (
    BigQuerySource,
    Entity,
    Feature,
    FeatureService,
    FeatureView,
    ValueType,
)

driver_locations_source = BigQuerySource(
    table_ref="feast-oss.public.drivers",
    event_timestamp_column="event_timestamp",
    created_timestamp_column="created_timestamp",
)

customer_profile_source = BigQuerySource(
    table_ref="feast-oss.public.customers",
    event_timestamp_column="event_timestamp",
)

customer_driver_combined_source = BigQuerySource(
    table_ref="feast-oss.public.customer_driver",
    event_timestamp_column="event_timestamp",
)

driver = Entity(
    name="driver",  # The name is derived from this argument, not object name.
    value_type=ValueType.INT64,
    description="driver id",
)
name="driver", # The join keys of an entity describe the storage level field/column on which # features can be looked up. The join keys are also used to join feature # tables/views when building feature vectors join_keys=["driver_id"], # The storage level type for an entity value_type=ValueType.INT64, ) # Indicates a data source from which feature values can be retrieved. Sources are queried when building training # datasets or materializing features into an online store. driver_stats_source = BigQuerySource( # The BigQuery table where features can be found table="feast-oss.demo_data.driver_hourly_stats_2", # The event timestamp is used for point-in-time joins and for ensuring only # features within the TTL are returned timestamp_field="event_timestamp", # The (optional) created timestamp is used to ensure there are no duplicate # feature rows in the offline store or when building training datasets created_timestamp_column="created", ) # Feature views are a grouping based on how features are stored in either the # online or offline store. driver_stats_fv = FeatureView( # The unique name of this feature view. Two feature views in a single # project cannot have the same name name="driver_hourly_stats", # The list of entities specifies the keys required for joining or looking # up features from this feature view. The reference provided in this field # correspond to the name of a defined entity (or entities) entities=["driver"],
def test_historical_features_from_bigquery_sources(
    provider_type, infer_event_timestamp_col, capsys, full_feature_names
):
    start_date = datetime.now().replace(microsecond=0, second=0, minute=0)
    (
        customer_entities,
        driver_entities,
        end_date,
        orders_df,
        start_date,
    ) = generate_entities(start_date, infer_event_timestamp_col)

    bigquery_dataset = (
        f"test_hist_retrieval_{int(time.time_ns())}_{random.randint(1000, 9999)}"
    )

    with BigQueryDataSet(bigquery_dataset), TemporaryDirectory() as temp_dir:
        gcp_project = bigquery.Client().project

        # Orders Query
        table_id = f"{bigquery_dataset}.orders"
        stage_orders_bigquery(orders_df, table_id)
        entity_df_query = f"SELECT * FROM {gcp_project}.{table_id}"

        # Driver Feature View
        driver_df = driver_data.create_driver_hourly_stats_df(
            driver_entities, start_date, end_date
        )
        driver_table_id = f"{gcp_project}.{bigquery_dataset}.driver_hourly"
        stage_driver_hourly_stats_bigquery_source(driver_df, driver_table_id)
        driver_source = BigQuerySource(
            table_ref=driver_table_id,
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created",
        )
        driver_fv = create_driver_hourly_stats_feature_view(driver_source)

        # Customer Feature View
        customer_df = driver_data.create_customer_daily_profile_df(
            customer_entities, start_date, end_date
        )
        customer_table_id = f"{gcp_project}.{bigquery_dataset}.customer_profile"
        stage_customer_daily_profile_bigquery_source(customer_df, customer_table_id)
        customer_source = BigQuerySource(
            table_ref=customer_table_id,
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created",
        )
        customer_fv = create_customer_daily_profile_feature_view(customer_source)

        driver = Entity(name="driver", join_key="driver_id", value_type=ValueType.INT64)
        customer = Entity(name="customer_id", value_type=ValueType.INT64)

        if provider_type == "local":
            store = FeatureStore(
                config=RepoConfig(
                    registry=os.path.join(temp_dir, "registry.db"),
                    project="default",
                    provider="local",
                    online_store=SqliteOnlineStoreConfig(
                        path=os.path.join(temp_dir, "online_store.db"),
                    ),
                    offline_store=BigQueryOfflineStoreConfig(
                        type="bigquery", dataset=bigquery_dataset
                    ),
                )
            )
        elif provider_type == "gcp":
            store = FeatureStore(
                config=RepoConfig(
                    registry=os.path.join(temp_dir, "registry.db"),
                    project="".join(
                        random.choices(string.ascii_uppercase + string.digits, k=10)
                    ),
                    provider="gcp",
                    offline_store=BigQueryOfflineStoreConfig(
                        type="bigquery", dataset=bigquery_dataset
                    ),
                )
            )
        elif provider_type == "gcp_custom_offline_config":
            store = FeatureStore(
                config=RepoConfig(
                    registry=os.path.join(temp_dir, "registry.db"),
                    project="".join(
                        random.choices(string.ascii_uppercase + string.digits, k=10)
                    ),
                    provider="gcp",
                    offline_store=BigQueryOfflineStoreConfig(
                        type="bigquery", dataset="foo"
                    ),
                )
            )
        else:
            raise Exception("Invalid provider used as part of test configuration")

        store.apply([driver, customer, driver_fv, customer_fv])

        try:
            event_timestamp = (
                DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
                if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in orders_df.columns
                else "e_ts"
            )
            expected_df = get_expected_training_df(
                customer_df,
                customer_fv,
                driver_df,
                driver_fv,
                orders_df,
                event_timestamp,
                full_feature_names,
            )

            job_from_sql = store.get_historical_features(
                entity_df=entity_df_query,
                features=[
                    "driver_stats:conv_rate",
                    "driver_stats:avg_daily_trips",
                    "customer_profile:current_balance",
                    "customer_profile:avg_passenger_count",
                    "customer_profile:lifetime_trip_count",
                ],
                full_feature_names=full_feature_names,
            )

            start_time = datetime.utcnow()
            actual_df_from_sql_entities = job_from_sql.to_df()
            end_time = datetime.utcnow()
            with capsys.disabled():
                print(
                    f"\nTime to execute job_from_sql.to_df() = '{(end_time - start_time)}'"
                )

            assert sorted(expected_df.columns) == sorted(
                actual_df_from_sql_entities.columns
            )
            assert_frame_equal(
                expected_df.sort_values(
                    by=[event_timestamp, "order_id", "driver_id", "customer_id"]
                ).reset_index(drop=True),
                actual_df_from_sql_entities[expected_df.columns]
                .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
                .reset_index(drop=True),
                check_dtype=False,
            )

            table_from_sql_entities = job_from_sql.to_arrow()
            assert_frame_equal(
                actual_df_from_sql_entities, table_from_sql_entities.to_pandas()
            )

            timestamp_column = (
                "e_ts"
                if infer_event_timestamp_col
                else DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
            )

            entity_df_query_with_invalid_join_key = (
                f"select order_id, driver_id, customer_id as customer, "
                f"order_is_success, {timestamp_column}, FROM {gcp_project}.{table_id}"
            )
            # Rename the join key; this should now raise an error.
            assertpy.assert_that(store.get_historical_features).raises(
                errors.FeastEntityDFMissingColumnsError
            ).when_called_with(
                entity_df=entity_df_query_with_invalid_join_key,
                features=[
                    "driver_stats:conv_rate",
                    "driver_stats:avg_daily_trips",
                    "customer_profile:current_balance",
                    "customer_profile:avg_passenger_count",
                    "customer_profile:lifetime_trip_count",
                ],
            )

            job_from_df = store.get_historical_features(
                entity_df=orders_df,
                features=[
                    "driver_stats:conv_rate",
                    "driver_stats:avg_daily_trips",
                    "customer_profile:current_balance",
                    "customer_profile:avg_passenger_count",
                    "customer_profile:lifetime_trip_count",
                ],
                full_feature_names=full_feature_names,
            )

            # Rename the join key; this should now raise an error.
            orders_df_with_invalid_join_key = orders_df.rename(
                {"customer_id": "customer"}, axis="columns"
            )
            assertpy.assert_that(store.get_historical_features).raises(
                errors.FeastEntityDFMissingColumnsError
            ).when_called_with(
                entity_df=orders_df_with_invalid_join_key,
                features=[
                    "driver_stats:conv_rate",
                    "driver_stats:avg_daily_trips",
                    "customer_profile:current_balance",
                    "customer_profile:avg_passenger_count",
                    "customer_profile:lifetime_trip_count",
                ],
            )

            # Make sure that custom dataset name is being used from the offline_store config
            if provider_type == "gcp_custom_offline_config":
                assertpy.assert_that(job_from_df.query).contains("foo.feast_entity_df")
            else:
                assertpy.assert_that(job_from_df.query).contains(
                    f"{bigquery_dataset}.feast_entity_df"
                )

            start_time = datetime.utcnow()
            actual_df_from_df_entities = job_from_df.to_df()
            end_time = datetime.utcnow()
            with capsys.disabled():
                print(
                    f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n"
                )

            assert sorted(expected_df.columns) == sorted(
                actual_df_from_df_entities.columns
            )
            assert_frame_equal(
                expected_df.sort_values(
                    by=[event_timestamp, "order_id", "driver_id", "customer_id"]
                ).reset_index(drop=True),
                actual_df_from_df_entities[expected_df.columns]
                .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
                .reset_index(drop=True),
                check_dtype=False,
            )

            table_from_df_entities = job_from_df.to_arrow()
            assert_frame_equal(
                actual_df_from_df_entities, table_from_df_entities.to_pandas()
            )
        finally:
            store.teardown()
    # features can be looked up. The join key is also used to join feature
    # tables/views when building feature vectors.
    join_key="CustomerID",
    # The storage level type for an entity.
    value_type=ValueType.INT64,
)

# Indicates a data source from which feature values can be retrieved. Sources are queried
# when building training datasets or materializing features into an online store.
transaction_stats = BigQuerySource(
    # The BigQuery table where features can be found.
    table_ref="srivatsan-project.customer.transactions",
    # The event timestamp is used for point-in-time joins and for ensuring only
    # features within the TTL are returned.
    event_timestamp_column="event_timestamp",
    # The (optional) created timestamp is used to ensure there are no duplicate
    # feature rows in the offline store or when building training datasets.
    created_timestamp_column="created_timestamp",
)

# transaction_stats = FileSource(
#     path="/home/jupyter/transactions.parquet",
#     event_timestamp_column="event_timestamp",
#     created_timestamp_column="created_timestamp",
# )

# Feature views are a grouping based on how features are stored in either the
# online or offline store.
transaction_stats_fv = FeatureView(
    # The unique name of this feature view. Two feature views in a single
from datetime import timedelta

from feast import BigQuerySource, Entity, Feature, FeatureView, ValueType

driver_locations_source = BigQuerySource(
    table_ref="rh_prod.ride_hailing_co.drivers",
    event_timestamp_column="event_timestamp",
    created_timestamp_column="created_timestamp",
)

customer_profile_source = BigQuerySource(
    table_ref="rh_prod.ride_hailing_co.customers",
    event_timestamp_column="event_timestamp",
)

customer_driver_combined_source = BigQuerySource(
    table_ref="rh_prod.ride_hailing_co.customer_driver",
    event_timestamp_column="event_timestamp",
)

driver = Entity(
    name="driver",  # The name is derived from this argument, not object name.
    value_type=ValueType.INT64,
    description="driver id",
)

customer = Entity(
    name="customer",  # The name is derived from this argument, not object name.
    value_type=ValueType.STRING,
)
from datetime import timedelta

from feast import (
    BigQuerySource,
    Entity,
    FeatureService,
    FeatureView,
    Field,
    PushSource,
    ValueType,
)
from feast.types import Float32, Int64, String

driver_locations_source = BigQuerySource(
    table="feast-oss.public.drivers",
    timestamp_field="event_timestamp",
    created_timestamp_column="created_timestamp",
)

driver_locations_source_query = BigQuerySource(
    query="SELECT * from feast-oss.public.drivers",
    timestamp_field="event_timestamp",
    created_timestamp_column="created_timestamp",
)

driver_locations_source_query_2 = BigQuerySource(
    query="SELECT lat * 2 FROM feast-oss.public.drivers",
    timestamp_field="event_timestamp",
    created_timestamp_column="created_timestamp",
)
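# A minimal sketch (an assumption, not shown in this snippet) of how one of the sources
# above could back a FeatureView in this API version; the view name, entity reference,
# and fields are illustrative and mirror the style used elsewhere in these examples.
driver_locations_example_fv = FeatureView(
    name="driver_locations_example",
    entities=["driver"],
    ttl=timedelta(days=1),
    schema=[
        Field(name="lat", dtype=Float32),
        Field(name="lon", dtype=String),
    ],
    batch_source=driver_locations_source,
)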
def test_historical_features_from_bigquery_sources_containing_backfills(capsys):
    now = datetime.now().replace(microsecond=0, second=0, minute=0)
    tomorrow = now + timedelta(days=1)

    entity_dataframe = pd.DataFrame(
        data=[
            {"driver_id": 1001, "event_timestamp": now + timedelta(days=2)},
            {"driver_id": 1002, "event_timestamp": now + timedelta(days=2)},
        ]
    )

    driver_stats_df = pd.DataFrame(
        data=[
            # Duplicated rows simple case
            {
                "driver_id": 1001,
                "avg_daily_trips": 10,
                "event_timestamp": now,
                "created": tomorrow,
            },
            {
                "driver_id": 1001,
                "avg_daily_trips": 20,
                "event_timestamp": tomorrow,
                "created": tomorrow,
            },
            # Duplicated rows after a backfill
            {
                "driver_id": 1002,
                "avg_daily_trips": 30,
                "event_timestamp": now,
                "created": tomorrow,
            },
            {
                "driver_id": 1002,
                "avg_daily_trips": 40,
                "event_timestamp": tomorrow,
                "created": now,
            },
        ]
    )

    expected_df = pd.DataFrame(
        data=[
            {
                "driver_id": 1001,
                "event_timestamp": now + timedelta(days=2),
                "avg_daily_trips": 20,
            },
            {
                "driver_id": 1002,
                "event_timestamp": now + timedelta(days=2),
                "avg_daily_trips": 40,
            },
        ]
    )

    bigquery_dataset = (
        f"test_hist_retrieval_{int(time.time_ns())}_{random.randint(1000, 9999)}"
    )

    with BigQueryDataSet(bigquery_dataset), TemporaryDirectory() as temp_dir:
        gcp_project = bigquery.Client().project

        # Entity Dataframe SQL query
        table_id = f"{bigquery_dataset}.orders"
        stage_orders_bigquery(entity_dataframe, table_id)
        entity_df_query = f"SELECT * FROM {gcp_project}.{table_id}"

        # Driver Feature View
        driver_table_id = f"{gcp_project}.{bigquery_dataset}.driver_hourly"
        stage_driver_hourly_stats_bigquery_source(driver_stats_df, driver_table_id)

        store = FeatureStore(
            config=RepoConfig(
                registry=os.path.join(temp_dir, "registry.db"),
                project="".join(
                    random.choices(string.ascii_uppercase + string.digits, k=10)
                ),
                provider="gcp",
                offline_store=BigQueryOfflineStoreConfig(
                    type="bigquery", dataset=bigquery_dataset
                ),
            )
        )

        driver = Entity(name="driver", join_key="driver_id", value_type=ValueType.INT64)

        driver_fv = FeatureView(
            name="driver_stats",
            entities=["driver"],
            features=[Feature(name="avg_daily_trips", dtype=ValueType.INT32)],
            batch_source=BigQuerySource(
                table_ref=driver_table_id,
                event_timestamp_column="event_timestamp",
                created_timestamp_column="created",
            ),
            ttl=None,
        )

        store.apply([driver, driver_fv])

        try:
            job_from_sql = store.get_historical_features(
                entity_df=entity_df_query,
                features=["driver_stats:avg_daily_trips"],
                full_feature_names=False,
            )

            start_time = datetime.utcnow()
            actual_df_from_sql_entities = job_from_sql.to_df()
            end_time = datetime.utcnow()
            with capsys.disabled():
                print(
                    f"\nTime to execute job_from_sql.to_df() = '{(end_time - start_time)}'"
                )

            assert sorted(expected_df.columns) == sorted(
                actual_df_from_sql_entities.columns
            )
            assert_frame_equal(
                expected_df.sort_values(by=["driver_id"]).reset_index(drop=True),
                actual_df_from_sql_entities[expected_df.columns]
                .sort_values(by=["driver_id"])
                .reset_index(drop=True),
                check_dtype=False,
            )
        finally:
            store.teardown()