Пример #1
0
def start_historical_feature_retrieval_spark_session(
    client: "Client",
    project: str,
    entity_source: Union[FileSource, BigQuerySource],
    feature_tables: List[FeatureTable],
):
    """Run a historical feature retrieval job on a local/in-process Spark session.

    Args:
        client: Feast client; its private `_config` is forwarded to the
            source-conversion helpers.
        project: Feast project name the feature tables belong to.
        entity_source: Batch source containing the entity rows to join onto.
        feature_tables: Feature tables whose batch sources are joined in.

    Returns:
        Whatever `retrieve_historical_features` returns (the joined DataFrame).
    """
    # Imported lazily so pyspark is only required when this entry point is used.
    from pyspark.sql import SparkSession

    from feast.pyspark.historical_feature_retrieval_job import (
        retrieve_historical_features,
    )

    session = SparkSession.builder.getOrCreate()

    # Convert Feast-level objects into the plain-argument form the job expects.
    entity_conf = _source_to_argument(entity_source, client._config)
    sources_conf = []
    tables_conf = []
    for table in feature_tables:
        sources_conf.append(_source_to_argument(table.batch_source, client._config))
        tables_conf.append(_feature_table_to_argument(client, project, table))

    return retrieve_historical_features(
        spark=session,
        entity_source_conf=entity_conf,
        feature_tables_sources_conf=sources_conf,
        feature_tables_conf=tables_conf,
    )
Пример #2
0
def test_historical_feature_retrieval_with_mapping(spark: SparkSession):
    """Entity-source `field_mapping` ("id" -> "customer_id") must be applied before the join."""
    data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")

    entity_source = FileSource(
        format="csv",
        path=f"file://{path.join(data_dir, 'column_mapping_test_entity.csv')}",
        event_timestamp_column="event_timestamp",
        field_mapping={"id": "customer_id"},
        options={"inferSchema": "true", "header": "true"},
    )
    booking_source = FileSource(
        format="csv",
        path=f"file://{path.join(data_dir, 'column_mapping_test_feature.csv')}",
        event_timestamp_column="datetime",
        created_timestamp_column="created_datetime",
        options={"inferSchema": "true", "header": "true"},
    )
    booking_table = FeatureTable(
        name="bookings",
        entities=[Field("customer_id", "int32")],
        features=[Field("total_bookings", "int32")],
    )

    actual_df = retrieve_historical_features(
        spark, entity_source, [booking_source], [booking_table]
    )

    # Feature columns come back prefixed with the table name ("bookings__...").
    expected_schema = StructType(
        [
            StructField("customer_id", IntegerType()),
            StructField("event_timestamp", TimestampType()),
            StructField("bookings__total_bookings", IntegerType()),
        ]
    )
    expected_rows = [
        (1001, datetime(2020, 9, 2), 200),
        (1001, datetime(2020, 9, 3), 200),
        (2001, datetime(2020, 9, 4), 600),
        (2001, datetime(2020, 9, 4), 600),
        (3001, datetime(2020, 9, 4), 700),
    ]
    expected_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_rows), expected_schema
    )

    assert_dataframe_equal(actual_df, expected_df)
Пример #3
0
def test_historical_feature_retrieval_with_mapping(spark):
    """Config-driven retrieval: `col_mapping` renames entity and table columns before joining."""
    data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")

    entity_conf = {
        "format": "csv",
        "path": f"file://{path.join(data_dir, 'column_mapping_test_entity.csv')}",
        "options": {"inferSchema": "true", "header": "true"},
        "col_mapping": {"id": "customer_id"},
        "dtypes": {"customer_id": "int"},
    }
    bookings_conf = {
        "format": "csv",
        "path": f"file://{path.join(data_dir, 'column_mapping_test_feature.csv')}",
        "name": "bookings",
        "options": {"inferSchema": "true", "header": "true"},
        # Map the raw timestamp columns onto the names the job expects.
        "col_mapping": {
            "datetime": "event_timestamp",
            "created_datetime": "created_timestamp",
        },
        "dtypes": {"customer_id": "int"},
    }
    query_conf = {
        "table": "bookings",
        "features": ["total_bookings"],
        "join": ["customer_id"],
    }
    retrieval_conf = {
        "entity": entity_conf,
        "tables": [bookings_conf],
        "queries": [query_conf],
    }

    actual_df = retrieve_historical_features(spark, retrieval_conf)

    expected_schema = StructType(
        [
            StructField("customer_id", IntegerType()),
            StructField("event_timestamp", TimestampType()),
            StructField("bookings__total_bookings", IntegerType()),
        ]
    )
    expected_rows = [
        (1001, datetime(2020, 9, 2), 200),
        (1001, datetime(2020, 9, 3), 200),
        (2001, datetime(2020, 9, 4), 600),
        (2001, datetime(2020, 9, 4), 600),
        (3001, datetime(2020, 9, 4), 700),
    ]
    expected_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_rows), expected_schema
    )

    assert_dataframe_equal(actual_df, expected_df)
Пример #4
0
def test_large_historical_feature_retrieval(
    spark: SparkSession, large_entity_csv_file: str, large_feature_csv_file: str
):
    """Join a 1000-row entity CSV against a feature CSV and verify every row."""
    row_count = 1000
    base_ts = datetime(2020, 8, 31)

    # Row i: customer 1000+i, event at base_ts + i days, feature value i*10.
    expected_rows = [
        (1000 + i, base_ts + timedelta(days=i), i * 10) for i in range(row_count)
    ]
    expected_schema = StructType(
        [
            StructField("customer_id", IntegerType()),
            StructField("event_timestamp", TimestampType()),
            StructField("feature__total_bookings", IntegerType()),
        ]
    )
    expected_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_rows), expected_schema
    )

    entity_source = {
        "file": {
            "format": "csv",
            "path": f"file://{large_entity_csv_file}",
            "event_timestamp_column": "event_timestamp",
            "field_mapping": {"id": "customer_id"},
            "options": {"inferSchema": "true", "header": "true"},
        }
    }
    feature_source = {
        "file": {
            "format": "csv",
            "path": f"file://{large_feature_csv_file}",
            "event_timestamp_column": "event_timestamp",
            "created_timestamp_column": "created_timestamp",
            "options": {"inferSchema": "true", "header": "true"},
        }
    }
    feature_table = {
        "name": "feature",
        "entities": [{"name": "customer_id", "type": "int32"}],
        "features": [{"name": "total_bookings", "type": "int32"}],
    }

    actual_df = retrieve_historical_features(
        spark, entity_source, [feature_source], [feature_table]
    )
    assert_dataframe_equal(actual_df, expected_df)
Пример #5
0
def test_large_historical_feature_retrieval(
    spark, large_entity_csv_file, large_feature_csv_file
):
    """Config-driven retrieval over a 1000-row entity CSV; every output row is checked."""
    row_count = 1000
    base_ts = datetime(2020, 8, 31)

    # Row i: customer 1000+i, event at base_ts + i days, feature value i*10.
    expected_rows = [
        (1000 + i, base_ts + timedelta(days=i), i * 10) for i in range(row_count)
    ]
    expected_schema = StructType(
        [
            StructField("customer_id", IntegerType()),
            StructField("event_timestamp", TimestampType()),
            StructField("feature__total_bookings", IntegerType()),
        ]
    )
    expected_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_rows), expected_schema
    )

    retrieval_conf = {
        "entity": {
            "format": "csv",
            "path": f"file://{large_entity_csv_file}",
            "options": {"inferSchema": "true", "header": "true"},
        },
        "tables": [
            {
                "format": "csv",
                "path": f"file://{large_feature_csv_file}",
                "name": "feature",
                "options": {"inferSchema": "true", "header": "true"},
            },
        ],
        "queries": [
            {
                "table": "feature",
                "features": ["total_bookings"],
                "join": ["customer_id"],
            }
        ],
    }

    actual_df = retrieve_historical_features(spark, retrieval_conf)
    assert_dataframe_equal(actual_df, expected_df)
Пример #6
0
def test_historical_feature_retrieval(spark):
    """Point-in-time join of two feature tables onto entity rows (transactions has a max_age)."""
    data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")

    def csv_conf(file_name, dtypes, name=None):
        # Build one CSV source config; `name` is only present for feature tables.
        conf = {
            "format": "csv",
            "path": f"file://{path.join(data_dir, file_name)}",
        }
        if name is not None:
            conf["name"] = name
        conf["options"] = {"inferSchema": "true", "header": "true"}
        conf["dtypes"] = dtypes
        return conf

    batch_retrieval_conf = {
        "entity": csv_conf(
            "customer_driver_pairs.csv", {"customer_id": "int", "driver_id": "int"}
        ),
        "tables": [
            csv_conf("bookings.csv", {"driver_id": "int"}, name="bookings"),
            csv_conf("transactions.csv", {"customer_id": "int"}, name="transactions"),
        ],
        "queries": [
            {
                "table": "transactions",
                "features": ["daily_transactions"],
                "join": ["customer_id"],
                # Transactions older than a day relative to the entity timestamp
                # must not join (hence the None values below).
                "max_age": 86400,
            },
            {
                "table": "bookings",
                "features": ["completed_bookings"],
                "join": ["driver_id"],
            },
        ],
    }

    actual_df = retrieve_historical_features(spark, batch_retrieval_conf)

    expected_schema = StructType(
        [
            StructField("customer_id", IntegerType()),
            StructField("driver_id", IntegerType()),
            StructField("event_timestamp", TimestampType()),
            StructField("transactions__daily_transactions", FloatType()),
            StructField("bookings__completed_bookings", IntegerType()),
        ]
    )
    expected_rows = [
        (1001, 8001, datetime(2020, 9, 2), 100.0, 300),
        (1001, 8002, datetime(2020, 9, 2), 100.0, 500),
        (1001, 8002, datetime(2020, 9, 3), None, 500),
        (2001, 8002, datetime(2020, 9, 3), None, 500),
        (2001, 8002, datetime(2020, 9, 4), None, 500),
    ]
    expected_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_rows), expected_schema
    )

    assert_dataframe_equal(actual_df, expected_df)
Пример #7
0
def test_historical_feature_retrieval_with_schema_errors(spark: SparkSession):
    """Each schema mismatch (missing timestamp/entity/feature, wrong type) raises SchemaError."""
    data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")

    def file_conf(file_name, event_ts, created_ts=None):
        # One CSV-backed "file" source config; created timestamp is optional.
        inner = {
            "format": {"json_class": "CSVFormat"},
            "path": f"file://{path.join(data_dir, file_name)}",
            "event_timestamp_column": event_ts,
        }
        if created_ts is not None:
            inner["created_timestamp_column"] = created_ts
        inner["options"] = {"inferSchema": "true", "header": "true"}
        return {"file": inner}

    def bookings_conf(entity_type, feature_name):
        # Feature-table config over "bookings" keyed on driver_id.
        return {
            "name": "bookings",
            "entities": [{"name": "driver_id", "type": entity_type}],
            "features": [{"name": feature_name, "type": "int32"}],
        }

    entity_source = file_conf("customer_driver_pairs.csv", "event_timestamp")
    # "datetime" does not exist in the CSV -> missing timestamp column.
    entity_source_missing_timestamp = file_conf("customer_driver_pairs.csv", "datetime")
    # customers.csv lacks driver_id -> missing entity column.
    entity_source_missing_entity = file_conf("customers.csv", "event_timestamp")

    booking_source = file_conf("bookings.csv", "event_timestamp", "created_timestamp")
    booking_source_missing_timestamp = file_conf(
        "bookings.csv", "datetime", "created_datetime"
    )

    booking_table = bookings_conf("int32", "completed_bookings")
    booking_table_missing_features = bookings_conf("int32", "nonexist_feature")
    booking_table_wrong_column_type = bookings_conf("string", "completed_bookings")

    # Every combination below violates the expected schema in exactly one way.
    failing_cases = [
        (entity_source_missing_timestamp, [booking_source], [booking_table]),
        (entity_source, [booking_source_missing_timestamp], [booking_table]),
        (entity_source, [booking_source], [booking_table_missing_features]),
        (entity_source, [booking_source], [booking_table_wrong_column_type]),
        (entity_source_missing_entity, [booking_source], [booking_table]),
    ]
    for entity_conf, source_confs, table_confs in failing_cases:
        with pytest.raises(SchemaError):
            retrieve_historical_features(spark, entity_conf, source_confs, table_confs)
Пример #8
0
def test_historical_feature_retrieval_with_mapping(spark: SparkSession):
    """Dict-config sources: `field_mapping` renames "id" to "customer_id" before the join."""
    data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")
    entity_path = path.join(data_dir, "column_mapping_test_entity.csv")
    feature_path = path.join(data_dir, "column_mapping_test_feature.csv")

    entity_source = {
        "file": {
            "format": {"json_class": "CSVFormat"},
            "path": f"file://{entity_path}",
            "event_timestamp_column": "event_timestamp",
            "field_mapping": {"id": "customer_id"},
            "options": {"inferSchema": "true", "header": "true"},
        }
    }
    booking_source = {
        "file": {
            "format": {"json_class": "CSVFormat"},
            "path": f"file://{feature_path}",
            "event_timestamp_column": "datetime",
            "created_timestamp_column": "created_datetime",
            "options": {"inferSchema": "true", "header": "true"},
        }
    }
    booking_table = {
        "name": "bookings",
        "entities": [{"name": "customer_id", "type": "int32"}],
        "features": [{"name": "total_bookings", "type": "int32"}],
    }

    actual_df = retrieve_historical_features(
        spark, entity_source, [booking_source], [booking_table]
    )

    expected_schema = StructType(
        [
            StructField("customer_id", IntegerType()),
            StructField("event_timestamp", TimestampType()),
            StructField("bookings__total_bookings", IntegerType()),
        ]
    )
    expected_rows = [
        (1001, datetime(2020, 9, 2), 200),
        (1001, datetime(2020, 9, 3), 200),
        (2001, datetime(2020, 9, 4), 600),
        (2001, datetime(2020, 9, 4), 600),
        (3001, datetime(2020, 9, 4), 700),
    ]
    expected_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_rows), expected_schema
    )

    assert_dataframe_equal(actual_df, expected_df)
Пример #9
0
def test_historical_feature_retrieval(spark: SparkSession):
    """Dict-config join of transactions (max_age=1 day) and bookings onto entity rows."""
    data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")

    def file_conf(file_name, created_ts=None):
        # One CSV-backed "file" source config; created timestamp is optional.
        inner = {
            "format": {"json_class": "CSVFormat"},
            "path": f"file://{path.join(data_dir, file_name)}",
            "event_timestamp_column": "event_timestamp",
        }
        if created_ts is not None:
            inner["created_timestamp_column"] = created_ts
        inner["options"] = {"inferSchema": "true", "header": "true"}
        return {"file": inner}

    entity_source = file_conf("customer_driver_pairs.csv")
    booking_source = file_conf("bookings.csv", created_ts="created_timestamp")
    transaction_source = file_conf("transactions.csv", created_ts="created_timestamp")

    booking_table = {
        "name": "bookings",
        "entities": [{"name": "driver_id", "type": "int32"}],
        "features": [{"name": "completed_bookings", "type": "int32"}],
    }
    transaction_table = {
        "name": "transactions",
        "entities": [{"name": "customer_id", "type": "int32"}],
        "features": [{"name": "daily_transactions", "type": "double"}],
        # Transactions older than a day relative to the entity timestamp do not join.
        "max_age": 86400,
    }

    actual_df = retrieve_historical_features(
        spark,
        entity_source,
        [transaction_source, booking_source],
        [transaction_table, booking_table],
    )

    expected_schema = StructType(
        [
            StructField("customer_id", IntegerType()),
            StructField("driver_id", IntegerType()),
            StructField("event_timestamp", TimestampType()),
            StructField("transactions__daily_transactions", FloatType()),
            StructField("bookings__completed_bookings", IntegerType()),
        ]
    )
    # None in daily_transactions marks rows where max_age excluded the feature.
    expected_rows = [
        (1001, 8001, datetime(2020, 9, 2), 100.0, 300),
        (1001, 8002, datetime(2020, 9, 2), 100.0, 500),
        (1001, 8002, datetime(2020, 9, 3), None, 500),
        (2001, 8002, datetime(2020, 9, 3), None, 500),
        (2001, 8002, datetime(2020, 9, 4), None, 500),
    ]
    expected_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_rows), expected_schema
    )

    assert_dataframe_equal(actual_df, expected_df)
Пример #10
0
def test_historical_feature_retrieval_with_schema_errors(spark: SparkSession):
    """Each schema mismatch (missing timestamp/entity/feature, wrong type) raises SchemaError."""
    data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")

    def csv_source(file_name, event_ts, created_ts=None):
        # Build one CSV FileSource; created timestamp column is optional.
        kwargs = {
            "format": "csv",
            "path": f"file://{path.join(data_dir, file_name)}",
            "event_timestamp_column": event_ts,
            "options": {"inferSchema": "true", "header": "true"},
        }
        if created_ts is not None:
            kwargs["created_timestamp_column"] = created_ts
        return FileSource(**kwargs)

    def bookings_table(entity_type, feature_name):
        # Feature table over "bookings" keyed on driver_id.
        return FeatureTable(
            name="bookings",
            entities=[Field("driver_id", entity_type)],
            features=[Field(feature_name, "int32")],
        )

    entity_source = csv_source("customer_driver_pairs.csv", "event_timestamp")
    # "datetime" does not exist in the CSV -> missing timestamp column.
    entity_source_missing_timestamp = csv_source("customer_driver_pairs.csv", "datetime")
    # customers.csv lacks driver_id -> missing entity column.
    entity_source_missing_entity = csv_source("customers.csv", "event_timestamp")

    booking_source = csv_source("bookings.csv", "event_timestamp", "created_timestamp")
    booking_source_missing_timestamp = csv_source(
        "bookings.csv", "datetime", "created_datetime"
    )

    booking_table = bookings_table("int32", "completed_bookings")
    booking_table_missing_features = bookings_table("int32", "nonexist_feature")
    booking_table_wrong_column_type = bookings_table("string", "completed_bookings")

    # Every combination below violates the expected schema in exactly one way.
    failing_cases = [
        (entity_source_missing_timestamp, [booking_source], [booking_table]),
        (entity_source, [booking_source_missing_timestamp], [booking_table]),
        (entity_source, [booking_source], [booking_table_missing_features]),
        (entity_source, [booking_source], [booking_table_wrong_column_type]),
        (entity_source_missing_entity, [booking_source], [booking_table]),
    ]
    for entity, sources, tables in failing_cases:
        with pytest.raises(SchemaError):
            retrieve_historical_features(spark, entity, sources, tables)
Пример #11
0
def test_historical_feature_retrieval(spark: SparkSession):
    """FileSource join of transactions (max_age=1 day) and bookings onto entity rows."""
    data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")

    def csv_source(file_name, created_ts=None):
        # Build one CSV FileSource; created timestamp column is optional.
        kwargs = {
            "format": "csv",
            "path": f"file://{path.join(data_dir, file_name)}",
            "event_timestamp_column": "event_timestamp",
            "options": {"inferSchema": "true", "header": "true"},
        }
        if created_ts is not None:
            kwargs["created_timestamp_column"] = created_ts
        return FileSource(**kwargs)

    entity_source = csv_source("customer_driver_pairs.csv")
    booking_source = csv_source("bookings.csv", created_ts="created_timestamp")
    transaction_source = csv_source("transactions.csv", created_ts="created_timestamp")

    booking_table = FeatureTable(
        name="bookings",
        entities=[Field("driver_id", "int32")],
        features=[Field("completed_bookings", "int32")],
    )
    transaction_table = FeatureTable(
        name="transactions",
        entities=[Field("customer_id", "int32")],
        features=[Field("daily_transactions", "double")],
        # Transactions older than a day relative to the entity timestamp do not join.
        max_age=86400,
    )

    actual_df = retrieve_historical_features(
        spark,
        entity_source,
        [transaction_source, booking_source],
        [transaction_table, booking_table],
    )

    expected_schema = StructType(
        [
            StructField("customer_id", IntegerType()),
            StructField("driver_id", IntegerType()),
            StructField("event_timestamp", TimestampType()),
            StructField("transactions__daily_transactions", FloatType()),
            StructField("bookings__completed_bookings", IntegerType()),
        ]
    )
    # None in daily_transactions marks rows where max_age excluded the feature.
    expected_rows = [
        (1001, 8001, datetime(2020, 9, 2), 100.0, 300),
        (1001, 8002, datetime(2020, 9, 2), 100.0, 500),
        (1001, 8002, datetime(2020, 9, 3), None, 500),
        (2001, 8002, datetime(2020, 9, 3), None, 500),
        (2001, 8002, datetime(2020, 9, 4), None, 500),
    ]
    expected_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_rows), expected_schema
    )

    assert_dataframe_equal(actual_df, expected_df)