def start_historical_feature_retrieval_spark_session(
    client: "Client",
    project: str,
    entity_source: Union[FileSource, BigQuerySource],
    feature_tables: List[FeatureTable],
):
    """Run a historical feature retrieval job inside an in-process Spark session.

    Converts the client-side source/table objects into the plain argument
    structures expected by the Spark job, then delegates to
    ``retrieve_historical_features``.

    Args:
        client: Feast client; its config is used when serialising sources.
        project: Feast project name the feature tables belong to.
        entity_source: Source of the entity rows to join features onto.
        feature_tables: Feature tables whose batch sources supply the features.

    Returns:
        Whatever ``retrieve_historical_features`` returns for the started job.
    """
    # Imported lazily so merely importing this module does not require pyspark.
    from pyspark.sql import SparkSession
    from feast.pyspark.historical_feature_retrieval_job import (
        retrieve_historical_features,
    )

    # Reuse the active session if one exists; otherwise create a new one.
    session = SparkSession.builder.getOrCreate()

    entity_conf = _source_to_argument(entity_source, client._config)
    source_confs = [
        _source_to_argument(table.batch_source, client._config)
        for table in feature_tables
    ]
    table_confs = [
        _feature_table_to_argument(client, project, table)
        for table in feature_tables
    ]

    return retrieve_historical_features(
        spark=session,
        entity_source_conf=entity_conf,
        feature_tables_sources_conf=source_confs,
        feature_tables_conf=table_confs,
    )
def test_historical_feature_retrieval_with_mapping(spark: SparkSession):
    """Entity column remapping: the CSV's "id" column is renamed to
    "customer_id" via ``field_mapping`` and the join still succeeds."""
    data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")

    customers_src = FileSource(
        format="csv",
        path=f"file://{path.join(data_dir, 'column_mapping_test_entity.csv')}",
        event_timestamp_column="event_timestamp",
        field_mapping={"id": "customer_id"},
        options={"inferSchema": "true", "header": "true"},
    )
    bookings_src = FileSource(
        format="csv",
        path=f"file://{path.join(data_dir, 'column_mapping_test_feature.csv')}",
        event_timestamp_column="datetime",
        created_timestamp_column="created_datetime",
        options={"inferSchema": "true", "header": "true"},
    )
    bookings_table = FeatureTable(
        name="bookings",
        entities=[Field("customer_id", "int32")],
        features=[Field("total_bookings", "int32")],
    )

    actual_df = retrieve_historical_features(
        spark, customers_src, [bookings_src], [bookings_table],
    )

    want_schema = StructType(
        [
            StructField("customer_id", IntegerType()),
            StructField("event_timestamp", TimestampType()),
            StructField("bookings__total_bookings", IntegerType()),
        ]
    )
    want_rows = [
        (1001, datetime(year=2020, month=9, day=2), 200),
        (1001, datetime(year=2020, month=9, day=3), 200),
        (2001, datetime(year=2020, month=9, day=4), 600),
        (2001, datetime(year=2020, month=9, day=4), 600),
        (3001, datetime(year=2020, month=9, day=4), 700),
    ]
    want_df = spark.createDataFrame(
        spark.sparkContext.parallelize(want_rows), want_schema
    )
    assert_dataframe_equal(actual_df, want_df)
def test_historical_feature_retrieval_with_mapping(spark):
    """Dict-config variant: ``col_mapping`` renames the entity "id" column and
    the feature table's timestamp columns before the join."""
    data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")

    conf = {
        "entity": {
            "format": "csv",
            "path": f"file://{path.join(data_dir, 'column_mapping_test_entity.csv')}",
            "options": {"inferSchema": "true", "header": "true"},
            "col_mapping": {"id": "customer_id"},
            "dtypes": {"customer_id": "int"},
        },
        "tables": [
            {
                "format": "csv",
                "path": f"file://{path.join(data_dir, 'column_mapping_test_feature.csv')}",
                "name": "bookings",
                "options": {"inferSchema": "true", "header": "true"},
                "col_mapping": {
                    "datetime": "event_timestamp",
                    "created_datetime": "created_timestamp",
                },
                "dtypes": {"customer_id": "int"},
            },
        ],
        "queries": [
            {
                "table": "bookings",
                "features": ["total_bookings"],
                "join": ["customer_id"],
            }
        ],
    }

    actual_df = retrieve_historical_features(spark, conf)

    want_schema = StructType(
        [
            StructField("customer_id", IntegerType()),
            StructField("event_timestamp", TimestampType()),
            StructField("bookings__total_bookings", IntegerType()),
        ]
    )
    want_rows = [
        (1001, datetime(year=2020, month=9, day=2), 200),
        (1001, datetime(year=2020, month=9, day=3), 200),
        (2001, datetime(year=2020, month=9, day=4), 600),
        (2001, datetime(year=2020, month=9, day=4), 600),
        (3001, datetime(year=2020, month=9, day=4), 700),
    ]
    want_df = spark.createDataFrame(
        spark.sparkContext.parallelize(want_rows), want_schema
    )
    assert_dataframe_equal(actual_df, want_df)
def test_large_historical_feature_retrieval(
    spark: SparkSession, large_entity_csv_file: str, large_feature_csv_file: str
):
    """Join over a generated 1000-row dataset supplied via CSV-file fixtures;
    expected output is reconstructed arithmetically from the generation rule."""
    row_count = 1000
    first_ts = datetime(year=2020, month=8, day=31)

    # Mirrors how the fixture files are generated: one row per day per customer.
    want_rows = [
        (1000 + i, first_ts + timedelta(days=i), i * 10) for i in range(row_count)
    ]
    want_schema = StructType(
        [
            StructField("customer_id", IntegerType()),
            StructField("event_timestamp", TimestampType()),
            StructField("feature__total_bookings", IntegerType()),
        ]
    )
    want_df = spark.createDataFrame(
        spark.sparkContext.parallelize(want_rows), want_schema
    )

    entity_src = {
        "file": {
            "format": "csv",
            "path": f"file://{large_entity_csv_file}",
            "event_timestamp_column": "event_timestamp",
            "field_mapping": {"id": "customer_id"},
            "options": {"inferSchema": "true", "header": "true"},
        }
    }
    feature_src = {
        "file": {
            "format": "csv",
            "path": f"file://{large_feature_csv_file}",
            "event_timestamp_column": "event_timestamp",
            "created_timestamp_column": "created_timestamp",
            "options": {"inferSchema": "true", "header": "true"},
        }
    }
    table_conf = {
        "name": "feature",
        "entities": [{"name": "customer_id", "type": "int32"}],
        "features": [{"name": "total_bookings", "type": "int32"}],
    }

    actual_df = retrieve_historical_features(
        spark, entity_src, [feature_src], [table_conf]
    )
    assert_dataframe_equal(actual_df, want_df)
def test_large_historical_feature_retrieval(
    spark, large_entity_csv_file, large_feature_csv_file
):
    """Dict-config variant of the large (1000-row) retrieval test; expected
    output is reconstructed arithmetically from the fixture generation rule."""
    row_count = 1000
    first_ts = datetime(year=2020, month=8, day=31)

    want_rows = [
        (1000 + i, first_ts + timedelta(days=i), i * 10) for i in range(row_count)
    ]
    want_schema = StructType(
        [
            StructField("customer_id", IntegerType()),
            StructField("event_timestamp", TimestampType()),
            StructField("feature__total_bookings", IntegerType()),
        ]
    )
    want_df = spark.createDataFrame(
        spark.sparkContext.parallelize(want_rows), want_schema
    )

    conf = {
        "entity": {
            "format": "csv",
            "path": f"file://{large_entity_csv_file}",
            "options": {"inferSchema": "true", "header": "true"},
        },
        "tables": [
            {
                "format": "csv",
                "path": f"file://{large_feature_csv_file}",
                "name": "feature",
                "options": {"inferSchema": "true", "header": "true"},
            },
        ],
        "queries": [
            {
                "table": "feature",
                "features": ["total_bookings"],
                "join": ["customer_id"],
            }
        ],
    }

    actual_df = retrieve_historical_features(spark, conf)
    assert_dataframe_equal(actual_df, want_df)
def test_historical_feature_retrieval(spark):
    """Dict-config variant joining two feature tables (transactions with a
    max_age window, bookings without) onto customer/driver entity pairs."""
    data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")

    conf = {
        "entity": {
            "format": "csv",
            "path": f"file://{path.join(data_dir, 'customer_driver_pairs.csv')}",
            "options": {"inferSchema": "true", "header": "true"},
            "dtypes": {"customer_id": "int", "driver_id": "int"},
        },
        "tables": [
            {
                "format": "csv",
                "path": f"file://{path.join(data_dir, 'bookings.csv')}",
                "name": "bookings",
                "options": {"inferSchema": "true", "header": "true"},
                "dtypes": {"driver_id": "int"},
            },
            {
                "format": "csv",
                "path": f"file://{path.join(data_dir, 'transactions.csv')}",
                "name": "transactions",
                "options": {"inferSchema": "true", "header": "true"},
                "dtypes": {"customer_id": "int"},
            },
        ],
        "queries": [
            {
                "table": "transactions",
                "features": ["daily_transactions"],
                "join": ["customer_id"],
                # Presumably seconds (one day); features older than this
                # relative to the entity timestamp come back as null.
                "max_age": 86400,
            },
            {
                "table": "bookings",
                "features": ["completed_bookings"],
                "join": ["driver_id"],
            },
        ],
    }

    actual_df = retrieve_historical_features(spark, conf)

    want_schema = StructType(
        [
            StructField("customer_id", IntegerType()),
            StructField("driver_id", IntegerType()),
            StructField("event_timestamp", TimestampType()),
            StructField("transactions__daily_transactions", FloatType()),
            StructField("bookings__completed_bookings", IntegerType()),
        ]
    )
    # None values: transaction rows aged out by max_age for those timestamps.
    want_rows = [
        (1001, 8001, datetime(year=2020, month=9, day=2), 100.0, 300),
        (1001, 8002, datetime(year=2020, month=9, day=2), 100.0, 500),
        (1001, 8002, datetime(year=2020, month=9, day=3), None, 500),
        (2001, 8002, datetime(year=2020, month=9, day=3), None, 500),
        (2001, 8002, datetime(year=2020, month=9, day=4), None, 500),
    ]
    want_df = spark.createDataFrame(
        spark.sparkContext.parallelize(want_rows), want_schema
    )
    assert_dataframe_equal(actual_df, want_df)
def test_historical_feature_retrieval_with_schema_errors(spark: SparkSession):
    """Each malformed source/table configuration must raise SchemaError:
    missing timestamp columns, missing entity column, nonexistent feature,
    and an entity column declared with the wrong type."""
    data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")

    def file_src(filename, ts_col, **extra):
        # Small builder for the dict-shaped file source configs used below.
        src = {
            "format": {"json_class": "CSVFormat"},
            "path": f"file://{path.join(data_dir, filename)}",
            "event_timestamp_column": ts_col,
            "options": {"inferSchema": "true", "header": "true"},
        }
        src.update(extra)
        return {"file": src}

    entity_source = file_src("customer_driver_pairs.csv", "event_timestamp")
    # "datetime" does not exist in the entity CSV.
    entity_source_missing_timestamp = file_src("customer_driver_pairs.csv", "datetime")
    # customers.csv lacks the driver_id entity column.
    entity_source_missing_entity = file_src("customers.csv", "event_timestamp")

    booking_source = file_src(
        "bookings.csv", "event_timestamp",
        created_timestamp_column="created_timestamp",
    )
    booking_source_missing_timestamp = file_src(
        "bookings.csv", "datetime",
        created_timestamp_column="created_datetime",
    )

    booking_table = {
        "name": "bookings",
        "entities": [{"name": "driver_id", "type": "int32"}],
        "features": [{"name": "completed_bookings", "type": "int32"}],
    }
    booking_table_missing_features = {
        "name": "bookings",
        "entities": [{"name": "driver_id", "type": "int32"}],
        "features": [{"name": "nonexist_feature", "type": "int32"}],
    }
    booking_table_wrong_column_type = {
        "name": "bookings",
        "entities": [{"name": "driver_id", "type": "string"}],
        "features": [{"name": "completed_bookings", "type": "int32"}],
    }

    failing_cases = [
        (entity_source_missing_timestamp, [booking_source], [booking_table]),
        (entity_source, [booking_source_missing_timestamp], [booking_table]),
        (entity_source, [booking_source], [booking_table_missing_features]),
        (entity_source, [booking_source], [booking_table_wrong_column_type]),
        (entity_source_missing_entity, [booking_source], [booking_table]),
    ]
    for entity, sources, tables in failing_cases:
        with pytest.raises(SchemaError):
            retrieve_historical_features(spark, entity, sources, tables)
def test_historical_feature_retrieval_with_mapping(spark: SparkSession):
    """Dict-source variant: ``field_mapping`` renames the entity "id" column
    to "customer_id" before joining the bookings feature table."""
    data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")

    customers_src = {
        "file": {
            "format": {"json_class": "CSVFormat"},
            "path": f"file://{path.join(data_dir, 'column_mapping_test_entity.csv')}",
            "event_timestamp_column": "event_timestamp",
            "field_mapping": {"id": "customer_id"},
            "options": {"inferSchema": "true", "header": "true"},
        }
    }
    bookings_src = {
        "file": {
            "format": {"json_class": "CSVFormat"},
            "path": f"file://{path.join(data_dir, 'column_mapping_test_feature.csv')}",
            "event_timestamp_column": "datetime",
            "created_timestamp_column": "created_datetime",
            "options": {"inferSchema": "true", "header": "true"},
        }
    }
    bookings_table = {
        "name": "bookings",
        "entities": [{"name": "customer_id", "type": "int32"}],
        "features": [{"name": "total_bookings", "type": "int32"}],
    }

    actual_df = retrieve_historical_features(
        spark, customers_src, [bookings_src], [bookings_table],
    )

    want_schema = StructType(
        [
            StructField("customer_id", IntegerType()),
            StructField("event_timestamp", TimestampType()),
            StructField("bookings__total_bookings", IntegerType()),
        ]
    )
    want_rows = [
        (1001, datetime(year=2020, month=9, day=2), 200),
        (1001, datetime(year=2020, month=9, day=3), 200),
        (2001, datetime(year=2020, month=9, day=4), 600),
        (2001, datetime(year=2020, month=9, day=4), 600),
        (3001, datetime(year=2020, month=9, day=4), 700),
    ]
    want_df = spark.createDataFrame(
        spark.sparkContext.parallelize(want_rows), want_schema
    )
    assert_dataframe_equal(actual_df, want_df)
def test_historical_feature_retrieval(spark: SparkSession):
    """Dict-source variant joining two feature tables (transactions with a
    max_age window, bookings without) onto customer/driver entity pairs."""
    data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")

    def file_src(filename, with_created=False):
        # Builder for the dict-shaped CSV sources shared by this test.
        src = {
            "format": {"json_class": "CSVFormat"},
            "path": f"file://{path.join(data_dir, filename)}",
            "event_timestamp_column": "event_timestamp",
            "options": {"inferSchema": "true", "header": "true"},
        }
        if with_created:
            src["created_timestamp_column"] = "created_timestamp"
        return {"file": src}

    pairs_src = file_src("customer_driver_pairs.csv")
    bookings_src = file_src("bookings.csv", with_created=True)
    transactions_src = file_src("transactions.csv", with_created=True)

    bookings_table = {
        "name": "bookings",
        "entities": [{"name": "driver_id", "type": "int32"}],
        "features": [{"name": "completed_bookings", "type": "int32"}],
    }
    transactions_table = {
        "name": "transactions",
        "entities": [{"name": "customer_id", "type": "int32"}],
        "features": [{"name": "daily_transactions", "type": "double"}],
        # Presumably seconds (one day) — older feature rows join as null.
        "max_age": 86400,
    }

    actual_df = retrieve_historical_features(
        spark,
        pairs_src,
        [transactions_src, bookings_src],
        [transactions_table, bookings_table],
    )

    want_schema = StructType(
        [
            StructField("customer_id", IntegerType()),
            StructField("driver_id", IntegerType()),
            StructField("event_timestamp", TimestampType()),
            StructField("transactions__daily_transactions", FloatType()),
            StructField("bookings__completed_bookings", IntegerType()),
        ]
    )
    # None values: transaction rows aged out by max_age for those timestamps.
    want_rows = [
        (1001, 8001, datetime(year=2020, month=9, day=2), 100.0, 300),
        (1001, 8002, datetime(year=2020, month=9, day=2), 100.0, 500),
        (1001, 8002, datetime(year=2020, month=9, day=3), None, 500),
        (2001, 8002, datetime(year=2020, month=9, day=3), None, 500),
        (2001, 8002, datetime(year=2020, month=9, day=4), None, 500),
    ]
    want_df = spark.createDataFrame(
        spark.sparkContext.parallelize(want_rows), want_schema
    )
    assert_dataframe_equal(actual_df, want_df)
def test_historical_feature_retrieval_with_schema_errors(spark: SparkSession):
    """FileSource variant: every malformed source/table combination must raise
    SchemaError — missing timestamp columns, missing entity column, a
    nonexistent feature, and an entity declared with the wrong type."""
    data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")
    csv_opts = {"inferSchema": "true", "header": "true"}

    entity_source = FileSource(
        format="csv",
        path=f"file://{path.join(data_dir, 'customer_driver_pairs.csv')}",
        event_timestamp_column="event_timestamp",
        options=csv_opts,
    )
    # "datetime" does not exist in the entity CSV.
    entity_source_missing_timestamp = FileSource(
        format="csv",
        path=f"file://{path.join(data_dir, 'customer_driver_pairs.csv')}",
        event_timestamp_column="datetime",
        options=csv_opts,
    )
    # customers.csv lacks the driver_id entity column.
    entity_source_missing_entity = FileSource(
        format="csv",
        path=f"file://{path.join(data_dir, 'customers.csv')}",
        event_timestamp_column="event_timestamp",
        options=csv_opts,
    )

    booking_source = FileSource(
        format="csv",
        path=f"file://{path.join(data_dir, 'bookings.csv')}",
        event_timestamp_column="event_timestamp",
        created_timestamp_column="created_timestamp",
        options=csv_opts,
    )
    booking_source_missing_timestamp = FileSource(
        format="csv",
        path=f"file://{path.join(data_dir, 'bookings.csv')}",
        event_timestamp_column="datetime",
        created_timestamp_column="created_datetime",
        options=csv_opts,
    )

    booking_table = FeatureTable(
        name="bookings",
        entities=[Field("driver_id", "int32")],
        features=[Field("completed_bookings", "int32")],
    )
    booking_table_missing_features = FeatureTable(
        name="bookings",
        entities=[Field("driver_id", "int32")],
        features=[Field("nonexist_feature", "int32")],
    )
    booking_table_wrong_column_type = FeatureTable(
        name="bookings",
        entities=[Field("driver_id", "string")],
        features=[Field("completed_bookings", "int32")],
    )

    failing_cases = [
        (entity_source_missing_timestamp, [booking_source], [booking_table]),
        (entity_source, [booking_source_missing_timestamp], [booking_table]),
        (entity_source, [booking_source], [booking_table_missing_features]),
        (entity_source, [booking_source], [booking_table_wrong_column_type]),
        (entity_source_missing_entity, [booking_source], [booking_table]),
    ]
    for entity, sources, tables in failing_cases:
        with pytest.raises(SchemaError):
            retrieve_historical_features(spark, entity, sources, tables)
def test_historical_feature_retrieval(spark: SparkSession):
    """FileSource variant joining two feature tables (transactions with a
    max_age window, bookings without) onto customer/driver entity pairs."""
    data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")
    csv_opts = {"inferSchema": "true", "header": "true"}

    pairs_src = FileSource(
        format="csv",
        path=f"file://{path.join(data_dir, 'customer_driver_pairs.csv')}",
        event_timestamp_column="event_timestamp",
        options=csv_opts,
    )
    bookings_src = FileSource(
        format="csv",
        path=f"file://{path.join(data_dir, 'bookings.csv')}",
        event_timestamp_column="event_timestamp",
        created_timestamp_column="created_timestamp",
        options=csv_opts,
    )
    transactions_src = FileSource(
        format="csv",
        path=f"file://{path.join(data_dir, 'transactions.csv')}",
        event_timestamp_column="event_timestamp",
        created_timestamp_column="created_timestamp",
        options=csv_opts,
    )

    bookings_table = FeatureTable(
        name="bookings",
        entities=[Field("driver_id", "int32")],
        features=[Field("completed_bookings", "int32")],
    )
    transactions_table = FeatureTable(
        name="transactions",
        entities=[Field("customer_id", "int32")],
        features=[Field("daily_transactions", "double")],
        # Presumably seconds (one day) — older feature rows join as null.
        max_age=86400,
    )

    actual_df = retrieve_historical_features(
        spark,
        pairs_src,
        [transactions_src, bookings_src],
        [transactions_table, bookings_table],
    )

    want_schema = StructType(
        [
            StructField("customer_id", IntegerType()),
            StructField("driver_id", IntegerType()),
            StructField("event_timestamp", TimestampType()),
            StructField("transactions__daily_transactions", FloatType()),
            StructField("bookings__completed_bookings", IntegerType()),
        ]
    )
    # None values: transaction rows aged out by max_age for those timestamps.
    want_rows = [
        (1001, 8001, datetime(year=2020, month=9, day=2), 100.0, 300),
        (1001, 8002, datetime(year=2020, month=9, day=2), 100.0, 500),
        (1001, 8002, datetime(year=2020, month=9, day=3), None, 500),
        (2001, 8002, datetime(year=2020, month=9, day=3), None, 500),
        (2001, 8002, datetime(year=2020, month=9, day=4), None, 500),
    ]
    want_df = spark.createDataFrame(
        spark.sparkContext.parallelize(want_rows), want_schema
    )
    assert_dataframe_equal(actual_df, want_df)