def test_offline_ingestion(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table, original)  # write to batch (offline) storage

    ingest_and_verify(feast_client, feast_spark_client, feature_table, original)
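
# `ingest_and_verify` is called above but defined in a shared helper module
# that is not part of this excerpt. A minimal sketch of what it plausibly
# does, reconstructed from the start-ingestion/poll/read-back pattern used in
# the other tests in this section (the actual shared helper may differ):
def ingest_and_verify(
    feast_client: Client,
    feast_spark_client: SparkClient,
    feature_table: FeatureTable,
    original: pd.DataFrame,
):
    # Launch a Spark job that copies the batch (offline) rows into the online
    # store, covering the sample's full event-timestamp range.
    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        original.event_timestamp.min().to_pydatetime(),
        original.event_timestamp.max().to_pydatetime() + timedelta(seconds=1),
    )
    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180
    )

    # Read the rows back from the online store and check they match the input.
    features = feast_client.get_online_features(
        [f"{feature_table.name}:unique_drivers"],
        entity_rows=[{"s2id": key} for key in original["s2id"].tolist()],
    ).to_dict()
    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[["s2id", f"{feature_table.name}:unique_drivers"]],
        original[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": f"{feature_table.name}:unique_drivers"}
        ),
    )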

def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(
        name="long_entity_name" * 10,
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="just1a2featuretable3with4a5really6really7really8really9really10",
        entities=[entity.name],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    data_sample = generate_data().rename(columns={"s2id": entity.name})
    feast_client.ingest(feature_table, data_sample)

    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        data_sample.event_timestamp.max().to_pydatetime() + timedelta(seconds=1),
    )

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180
    )

    all_job_ids = [
        retrieved_job.get_id()
        for retrieved_job in feast_spark_client.list_jobs(
            include_terminated=True,
            project=feast_client.project,
            table_name=feature_table.name,
        )
    ]
    assert job.get_id() in all_job_ids

    features = feast_client.get_online_features(
        [f"{feature_table.name}:unique_drivers"],
        entity_rows=[{entity.name: key} for key in data_sample[entity.name].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[[entity.name, f"{feature_table.name}:unique_drivers"]],
        data_sample[[entity.name, "unique_drivers"]].rename(
            columns={"unique_drivers": f"{feature_table.name}:unique_drivers"}
        ),
    )
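
# `generate_data` is imported from a shared helper module and is not shown in
# this excerpt. The ingestion tests only rely on it returning a DataFrame with
# "s2id", "unique_drivers", and "event_timestamp" columns; a plausible sketch
# (row count, value ranges, and the date window are assumptions; assumes
# numpy is imported as np, as elsewhere in these tests):
def generate_data_sketch() -> pd.DataFrame:
    n_rows = 100  # assumed sample size
    return pd.DataFrame(
        {
            "s2id": np.random.choice(999999, size=n_rows, replace=False),
            "unique_drivers": np.random.randint(0, 1000, size=n_rows),
            "event_timestamp": pd.to_datetime(
                np.random.randint(
                    int(datetime(2020, 10, 3).timestamp()),
                    int(datetime(2020, 10, 4).timestamp()),
                    size=n_rows,
                ),
                unit="s",
            ),
        }
    )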

def test_offline_ingestion(feast_client: Client, staging_path: str):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            "event_timestamp",
            "event_timestamp",
            ParquetFormat(),
            os.path.join(staging_path, "batch-storage"),
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table, original)  # write to batch (offline) storage

    job = feast_client.start_offline_to_online_ingestion(
        feature_table, datetime.today(), datetime.today() + timedelta(days=1)
    )

    status = wait_retry_backoff(
        lambda: (job.get_status(), job.get_status() != SparkJobStatus.IN_PROGRESS),
        300,
    )
    assert status == SparkJobStatus.COMPLETED

    features = feast_client.get_online_features(
        ["drivers:unique_drivers"],
        entity_rows=[{"s2id": s2_id} for s2_id in original["s2id"].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers:unique_drivers"]],
        original[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": "drivers:unique_drivers"}
        ),
    )
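
import time  # stdlib; assumed not already imported in this excerpt


# `wait_retry_backoff` comes from the Feast SDK. The tests rely on one
# contract: `retry_fn` returns a (result, is_done) tuple, and the helper polls
# until is_done is True or the timeout elapses, returning the final result --
# which is how the call above can hand back the terminal job status. A
# simplified re-implementation for reference (the real helper's exact backoff
# schedule and timeout behavior may differ):
def wait_retry_backoff_sketch(retry_fn, timeout_secs: int):
    start, delay = time.time(), 1
    while True:
        result, is_done = retry_fn()
        if is_done:
            return result
        if time.time() - start > timeout_secs:
            raise TimeoutError("retry_fn() did not succeed in time")
        time.sleep(delay)
        delay = min(delay * 2, 8)  # exponential backoff with an assumed cap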

def test_historical_features(
    feast_client: Client, batch_source: Union[BigQuerySource, FileSource]
):
    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply_entity(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )

    feast_client.apply_feature_table(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    job = feast_client.get_historical_features(feature_refs, customers_df)
    output_dir = job.get_output_file_uri()
    joined_df = read_parquet(output_dir)

    # customers_df contains each user twice: once within the table's max_age
    # and once outside it, so the second half of the expected join is None.
    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": customers_df.event_timestamp.tolist(),
            "user_id": customers_df.user_id.tolist(),
            "transactions__daily_transactions": transactions_df.daily_transactions.tolist()
            + [None] * transactions_df.shape[0],
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(drop=True),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )
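
# The `generate_data` used by the historical-feature tests returns two frames,
# (transactions_df, customers_df), and is likewise defined outside this
# excerpt. A sketch consistent with the inline data construction in the
# local-staging variant of test_historical_features below: customers_df has
# twice as many rows as transactions_df, and the second half is timestamped
# outside the table's max_age, which is why the expected output above pads
# with [None] * transactions_df.shape[0]. (Assumes numpy as np.)
def generate_historical_data_sketch():
    retrieval_date = datetime.utcnow().replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    retrieval_outside_max_age_date = retrieval_date + timedelta(1)
    event_date = retrieval_date - timedelta(2)
    creation_date = retrieval_date - timedelta(1)

    customers = [1001, 1002, 1003, 1004, 1005]
    transactions_df = pd.DataFrame(
        {
            "event_timestamp": [event_date] * len(customers),
            "created_timestamp": [creation_date] * len(customers),
            "user_id": customers,
            "daily_transactions": [np.random.rand() * 10 for _ in customers],
            "total_transactions": [np.random.rand() * 100 for _ in customers],
        }
    )
    customers_df = pd.DataFrame(
        {
            "event_timestamp": [retrieval_date] * len(customers)
            + [retrieval_outside_max_age_date] * len(customers),
            "user_id": customers + customers,
        }
    )
    return transactions_df, customers_df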

def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="just1a2featuretable3with4a5really6really7really8really9really10really11really12long13name",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    data_sample = generate_data()
    feast_client.ingest(feature_table, data_sample)

    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        data_sample.event_timestamp.max().to_pydatetime() + timedelta(seconds=1),
    )

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180
    )

    all_job_ids = [
        retrieved_job.get_id()
        for retrieved_job in feast_spark_client.list_jobs(
            include_terminated=True,
            project=feast_client.project,
            table_name=feature_table.name,
        )
    ]
    assert job.get_id() in all_job_ids

def test_historical_features(
    feast_client: Client,
    tfrecord_feast_client: Client,
    batch_source: Union[BigQuerySource, FileSource],
):
    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )

    feast_client.apply(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    # Remove microseconds because job.get_start_time() does not contain them
    job_submission_time = datetime.utcnow().replace(microsecond=0)
    job = feast_client.get_historical_features(feature_refs, customers_df)
    assert job.get_start_time() >= job_submission_time
    assert job.get_start_time() <= job_submission_time + timedelta(hours=1)

    output_dir = job.get_output_file_uri()

    # Will both be None if not using Azure Blob Storage
    account_name, account_key = _get_azure_creds(feast_client)

    joined_df = read_parquet(
        output_dir, azure_account_name=account_name, azure_account_key=account_key
    )

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": customers_df.event_timestamp.tolist(),
            "user_id": customers_df.user_id.tolist(),
            "transactions__daily_transactions": transactions_df.daily_transactions.tolist()
            + [None] * transactions_df.shape[0],
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(drop=True),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )

    job = tfrecord_feast_client.get_historical_features(feature_refs, customers_df)
    # get_output_file_uri() blocks until the job reaches a terminal state,
    # so the status assertion below runs only after the job has finished.
    job.get_output_file_uri()
    assert job.get_status() == SparkJobStatus.COMPLETED
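
# `_get_azure_creds` lives elsewhere in this test module; the test above only
# depends on its contract: it returns an (account_name, account_key) pair, or
# (None, None) when the job output is not on Azure Blob Storage. A
# hypothetical sketch (the environment variable names are assumptions, not
# the real implementation):
def _get_azure_creds_sketch(feast_client: Client):
    # feast_client is accepted to mirror the real call signature; this sketch
    # reads credentials from the environment instead of the client config.
    account_name = os.environ.get("AZURE_BLOB_ACCOUNT_NAME")  # assumed var name
    account_key = os.environ.get("AZURE_BLOB_ACCOUNT_KEY")  # assumed var name
    return account_name, account_key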
Feature(name="string_list_feature", dtype=ValueType.STRING_LIST), Feature(name="bytes_list_feature", dtype=ValueType.BYTES_LIST), ], ) client = Client(core_url=feast_core_url, serving_url=feast_online_serving_url) # Register feature set client.apply(all_types_fs_expected) df.info() df.describe() df.head() # Ingest tdata client.ingest(all_types_fs_expected, df) # Wait for data to be available def try_get_features(): online_request_entity = [{"user_id": 1001}] online_request_features = ["float_feature"] response = client.get_online_features(entity_rows=online_request_entity, feature_refs=online_request_features) response_dict = response.to_dict() if response_dict['float_feature'] == df.iloc[0]['float_feature']: return response_dict, True return response_dict, False

def test_historical_features(feast_client: Client, local_staging_path: str):
    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply_entity(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=FileSource(
            "event_timestamp",
            "created_timestamp",
            ParquetFormat(),
            os.path.join(local_staging_path, "transactions"),
        ),
        max_age=max_age,
    )

    feast_client.apply_feature_table(transactions_feature_table)

    retrieval_date = (
        datetime.utcnow()
        .replace(hour=0, minute=0, second=0, microsecond=0)
        .replace(tzinfo=None)
    )
    retrieval_outside_max_age_date = retrieval_date + timedelta(1)
    event_date = retrieval_date - timedelta(2)
    creation_date = retrieval_date - timedelta(1)

    customers = [1001, 1002, 1003, 1004, 1005]
    daily_transactions = [np.random.rand() * 10 for _ in customers]
    total_transactions = [np.random.rand() * 100 for _ in customers]

    transactions_df = pd.DataFrame(
        {
            "event_timestamp": [event_date for _ in customers],
            "created_timestamp": [creation_date for _ in customers],
            "user_id": customers,
            "daily_transactions": daily_transactions,
            "total_transactions": total_transactions,
        }
    )
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    customer_df = pd.DataFrame(
        {
            "event_timestamp": [retrieval_date for _ in customers]
            + [retrieval_outside_max_age_date for _ in customers],
            "user_id": customers + customers,
        }
    )

    job = feast_client.get_historical_features(feature_refs, customer_df)
    output_dir = job.get_output_file_uri()
    joined_df = read_parquet(output_dir)

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": [retrieval_date for _ in customers]
            + [retrieval_outside_max_age_date for _ in customers],
            "user_id": customers + customers,
            "transactions__daily_transactions": daily_transactions
            + [None] * len(customers),
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(drop=True),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )
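
from urllib.parse import urlparse  # stdlib; assumed not already imported


# `read_parquet` is a shared test helper: historical retrieval writes its
# result as parquet under the job's output URI, and the helper loads it back
# into a single DataFrame. A minimal local-filesystem sketch; the real helper
# also handles remote schemes (e.g. gs://, and per the Azure variant above,
# blob storage with account credentials):
def read_parquet_sketch(uri: str) -> pd.DataFrame:
    parsed = urlparse(uri)
    if parsed.scheme in ("", "file"):
        # pandas (with pyarrow) can read a directory of parquet part-files
        return pd.read_parquet(parsed.path)
    raise NotImplementedError(f"scheme not covered by this sketch: {parsed.scheme}")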