def test__assert_validation_count(self, written_count, dataframe_count, threshold):
    """Call ``_assert_validation_count`` with the supplied counts.

    The writer receives ``validation_threshold=threshold`` only when
    ``threshold`` is truthy; otherwise it is built with its defaults.
    """
    # arrange
    writer_kwargs = {"validation_threshold": threshold} if threshold else {}
    writer = HistoricalFeatureStoreWriter(**writer_kwargs)

    # act and assert
    writer._assert_validation_count("table", written_count, dataframe_count)
def test_sink(input_dataframe, feature_set):
    """Flush a feature set through a Sink and compare what each writer stored.

    Both writers use mocked db configs pointing at parquet paths under
    ``test_folder``. After ``sink.flush`` the two locations are read back and
    compared, as sorted row lists, with the constructed feature set
    (historical) and with its ``filter_latest`` result (online).

    ``test_folder`` is removed in a ``finally`` clause so that a failing
    assertion does not leave files behind.
    """
    # arrange
    client = SparkClient()
    client.conn.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
    feature_set_df = feature_set.construct(input_dataframe, client)
    target_latest_df = OnlineFeatureStoreWriter.filter_latest(
        feature_set_df, id_columns=[key.name for key in feature_set.keys]
    )
    columns_sort = feature_set_df.schema.fieldNames()

    # setup historical writer
    s3config = Mock()
    s3config.mode = "overwrite"
    s3config.format_ = "parquet"
    s3config.get_options = Mock(
        return_value={"path": "test_folder/historical/entity/feature_set"}
    )
    s3config.get_path_with_partitions = Mock(
        return_value="test_folder/historical/entity/feature_set"
    )
    historical_writer = HistoricalFeatureStoreWriter(
        db_config=s3config, interval_mode=True
    )

    # setup online writer
    # TODO: Change for CassandraConfig when Cassandra for test is ready
    online_config = Mock()
    online_config.mode = "overwrite"
    online_config.format_ = "parquet"
    online_config.get_options = Mock(
        return_value={"path": "test_folder/online/entity/feature_set"}
    )
    online_writer = OnlineFeatureStoreWriter(db_config=online_config)

    writers = [historical_writer, online_writer]
    sink = Sink(writers)

    try:
        # act
        client.sql(f"CREATE DATABASE IF NOT EXISTS {historical_writer.database}")
        sink.flush(feature_set, feature_set_df, client)

        # get historical results
        historical_result_df = client.read(
            s3config.format_,
            path=s3config.get_path_with_partitions(feature_set.name, feature_set_df),
        )

        # get online results
        online_result_df = client.read(
            online_config.format_, **online_config.get_options(feature_set.name)
        )

        # assert historical results
        assert sorted(feature_set_df.select(*columns_sort).collect()) == sorted(
            historical_result_df.select(*columns_sort).collect()
        )

        # assert online results
        assert sorted(target_latest_df.select(*columns_sort).collect()) == sorted(
            online_result_df.select(*columns_sort).collect()
        )
    finally:
        # tear down: always clean up, and do not mask the real failure if the
        # folder was never created
        shutil.rmtree("test_folder", ignore_errors=True)
def test_flush(self, feature_set_dataframe, mocker):
    """Check that ``Sink.flush`` calls ``write`` exactly once on each writer."""
    # given
    spark_client = SparkClient()
    writers = [HistoricalFeatureStoreWriter(), OnlineFeatureStoreWriter()]
    for current in writers:
        # replace the real write with a stub so nothing is persisted
        current.write = mocker.stub("write")

    feature_set = mocker.stub("feature_set")
    feature_set.entity = "house"
    feature_set.name = "test"

    # when
    sink = Sink(writers=writers)
    sink.flush(
        dataframe=feature_set_dataframe,
        feature_set=feature_set,
        spark_client=spark_client,
    )

    # then
    for current in writers:
        current.write.assert_called_once()
def __init__(self):
    """Initialise the pipeline with a fixed source, feature set and sink.

    The source reads ``db.table`` under the id ``t``; the feature set
    ``first`` declares two features, one key and a timestamp; the sink
    holds a historical and an online feature store writer.
    """
    source = Source(
        readers=[TableReader(id="t", database="db", table="table")],
        # plain literal: the original f-string had no placeholders
        query="select * from t",
    )
    feature_set = FeatureSet(
        name="first",
        entity="entity",
        description="description",
        features=[
            Feature(name="feature1", description="test", dtype=DataType.FLOAT),
            Feature(
                name="feature2",
                description="another test",
                dtype=DataType.STRING,
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="identifier",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    )
    sink = Sink(
        writers=[HistoricalFeatureStoreWriter(), OnlineFeatureStoreWriter()]
    )
    super().__init__(source=source, feature_set=feature_set, sink=sink)
def test__repartition_df(self, spark_session, spark_context):
    """Check the Spark partition count before and after ``_create_partitions``.

    10000 random ISO dates between 1970-01-01 and 2020-12-31 are loaded into
    a single-partition dataframe; the result of ``_create_partitions`` is
    expected to be spread over 200 Spark partitions.

    NOTE(review): the test name mentions ``_repartition_df`` but the method
    called is ``_create_partitions`` - confirm this is intended.
    """
    # arrange
    start = datetime.datetime(year=1970, month=1, day=1)
    end = datetime.datetime(year=2020, month=12, day=31)
    # hoisted: the span is the same for every generated date
    span_seconds = int((end - start).total_seconds())
    random_dates = [
        (start + datetime.timedelta(seconds=random.randint(0, span_seconds)))  # noqa: S311
        .date()
        .isoformat()
        for _ in range(10000)
    ]
    data = [{"timestamp": date} for date in random_dates]
    input_df = spark_session.read.json(
        spark_context.parallelize(data, 1), schema="timestamp timestamp"
    )

    writer = HistoricalFeatureStoreWriter()

    # act
    result_df = writer._create_partitions(input_df)

    # assert
    # Only one partition id, meaning data is not partitioned
    assert input_df.select(spark_partition_id()).distinct().count() == 1
    # Desired number of partitions
    assert result_df.select(spark_partition_id()).distinct().count() == 200
def test_write(
    self,
    feature_set_dataframe,
    historical_feature_set_dataframe,
    mocker,
    feature_set,
):
    """Check the keyword arguments ``write`` passes to ``write_table``."""
    # given
    spark_client = mocker.stub("spark_client")
    spark_client.write_table = mocker.stub("write_table")
    writer = HistoricalFeatureStoreWriter()

    # when
    writer.write(
        feature_set=feature_set,
        dataframe=feature_set_dataframe,
        spark_client=spark_client,
    )
    # keyword arguments of the recorded write_table call
    write_kwargs = spark_client.write_table.call_args[1]
    result_df = write_kwargs["dataframe"]

    # then
    assert_dataframe_equality(historical_feature_set_dataframe, result_df)
    assert writer.db_config.format_ == write_kwargs["format_"]
    assert writer.db_config.mode == write_kwargs["mode"]
    assert writer.PARTITION_BY == write_kwargs["partition_by"]
    assert feature_set.name == write_kwargs["table_name"]
def test_feature_set_pipeline_with_execution_date(
    self,
    mocked_date_df,
    spark_session,
    fixed_windows_output_feature_set_date_dataframe,
    feature_set_pipeline,
):
    """Run the pipeline for 2016-04-12 and compare the debug-mode output.

    The expected dataframe is the fixture filtered to rows with
    ``timestamp < '2016-04-13'``.
    """
    # arrange
    table_reader_table = "b_table"
    create_temp_view(dataframe=mocked_date_df, name=table_reader_table)

    expected_rows_filter = "timestamp < '2016-04-13'"
    target_df = fixed_windows_output_feature_set_date_dataframe.filter(
        expected_rows_filter
    )

    feature_set_pipeline.sink.writers = [
        HistoricalFeatureStoreWriter(debug_mode=True)
    ]

    # act
    feature_set_pipeline.run_for_date(execution_date="2016-04-12")
    df = spark_session.sql("select * from historical_feature_store__feature_set")

    # assert
    assert_dataframe_equality(df, target_df)
def test_write_interval_mode(
    self,
    feature_set_dataframe,
    historical_feature_set_dataframe,
    mocker,
    feature_set,
):
    """Check the ``write_table`` arguments when the writer is in interval mode."""
    # given
    spark_client = SparkClient()
    spark_client.write_table = mocker.stub("write_table")
    spark_client.conn.conf.set(
        "spark.sql.sources.partitionOverwriteMode", "dynamic"
    )
    writer = HistoricalFeatureStoreWriter(interval_mode=True)

    # when
    writer.write(
        feature_set=feature_set,
        dataframe=feature_set_dataframe,
        spark_client=spark_client,
    )
    # keyword arguments of the recorded write_table call
    write_kwargs = spark_client.write_table.call_args[1]
    result_df = write_kwargs["dataframe"]

    # then
    assert_dataframe_equality(historical_feature_set_dataframe, result_df)
    assert writer.database == write_kwargs["database"]
    assert feature_set.name == write_kwargs["table_name"]
    assert writer.PARTITION_BY == write_kwargs["partition_by"]
def test__create_partitions(self, spark_session, spark_context):
    """Check that ``_create_partitions`` yields one (year, month, day) per date.

    10000 random ISO dates between 1970-01-01 and 2020-12-31 are loaded; the
    number of distinct ``year``/``month``/``day`` combinations in the result
    must equal the number of distinct generated dates.
    """
    # arrange
    start = datetime.datetime(year=1970, month=1, day=1)
    end = datetime.datetime(year=2020, month=12, day=31)
    # hoisted: the span is the same for every generated date
    span_seconds = int((end - start).total_seconds())
    random_dates = [
        (start + datetime.timedelta(seconds=random.randint(0, span_seconds)))  # noqa: S311
        .date()
        .isoformat()
        for _ in range(10000)
    ]
    data = [{"timestamp": date} for date in random_dates]
    input_df = spark_session.read.json(
        spark_context.parallelize(data, 1), schema="timestamp timestamp"
    )

    writer = HistoricalFeatureStoreWriter()

    # act
    result_df = writer._create_partitions(input_df)

    # assert
    distinct_partitions = result_df.select("year", "month", "day").distinct().count()
    assert distinct_partitions == len(set(random_dates))
def test_write_in_debug_mode_with_interval_mode(
    self,
    feature_set_dataframe,
    historical_feature_set_dataframe,
    feature_set,
    spark_session,
    mocker,
):
    """Write with debug and interval mode on, then read the debug table back."""
    # given
    spark_client = SparkClient()
    spark_client.write_dataframe = mocker.stub("write_dataframe")
    spark_client.conn.conf.set(
        "spark.sql.sources.partitionOverwriteMode", "dynamic"
    )
    writer = HistoricalFeatureStoreWriter(debug_mode=True, interval_mode=True)

    # when
    writer.write(
        feature_set=feature_set,
        dataframe=feature_set_dataframe,
        spark_client=spark_client,
    )
    debug_table = f"historical_feature_store__{feature_set.name}"
    result_df = spark_session.table(debug_table)

    # then
    assert_dataframe_equality(historical_feature_set_dataframe, result_df)
def test_run_agg_with_end_date(self, spark_session):
    """Run a pipeline of spec'd mocks with an ``end_date``.

    Asserts that source construction, feature set construction, sink flush
    and sink validate are each called exactly once.
    """
    spark_client = SparkClient()

    source = Mock(
        spec=Source,
        readers=[TableReader(id="source_a", database="db", table="table")],
        query="select * from source_a",
    )

    feature_set = Mock(
        spec=AggregatedFeatureSet,
        name="feature_set",
        entity="entity",
        description="description",
        keys=[
            KeyFeature(
                name="user_id",
                description="The user's Main ID or device ID",
                dtype=DataType.INTEGER,
            )
        ],
        timestamp=TimestampFeature(from_column="ts"),
        features=[
            Feature(
                name="listing_page_viewed__rent_per_month",
                description="Average of something.",
                transformation=AggregatedTransform(
                    functions=[
                        Function(functions.avg, DataType.FLOAT),
                        Function(functions.stddev_pop, DataType.FLOAT),
                    ],
                ),
            ),
        ],
    )

    sink = Mock(
        spec=Sink,
        writers=[HistoricalFeatureStoreWriter(db_config=None)],
    )

    test_pipeline = FeatureSetPipeline(
        spark_client=spark_client,
        source=source,
        feature_set=feature_set,
        sink=sink,
    )

    # feature_set need to return a real df for streaming validation
    sample_row = {"a": "x", "b": "y", "c": "3"}
    sample_df = spark_session.createDataFrame([sample_row])
    test_pipeline.feature_set.construct.return_value = sample_df

    test_pipeline.run(end_date="2016-04-18")

    test_pipeline.source.construct.assert_called_once()
    test_pipeline.feature_set.construct.assert_called_once()
    test_pipeline.sink.flush.assert_called_once()
    test_pipeline.sink.validate.assert_called_once()
def test_pipeline_with_hooks(self, spark_session):
    """Run a pipeline with the same ``AddHook(value=1)`` attached at five points.

    The hook is attached to the reader (post), the source (post), the
    feature set (pre and post) and the sink (pre). The feature itself is
    ``feature + 1``; the expected output row has ``feature == 6``.
    """
    # arrange
    hook1 = AddHook(value=1)

    spark_session.sql(
        "select 1 as id, timestamp('2020-01-01') as timestamp, 0 as feature"
    ).createOrReplaceTempView("test")

    target_df = spark_session.sql(
        "select 1 as id, timestamp('2020-01-01') as timestamp, 6 as feature, 2020 "
        "as year, 1 as month, 1 as day"
    )

    historical_writer = HistoricalFeatureStoreWriter(debug_mode=True)

    reader = TableReader(id="reader", table="test").add_post_hook(hook1)
    source = Source(readers=[reader], query="select * from reader").add_post_hook(
        hook1
    )

    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature",
                description="test",
                transformation=SQLExpressionTransform(expression="feature + 1"),
                dtype=DataType.INTEGER,
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.INTEGER,
            )
        ],
        timestamp=TimestampFeature(),
    )
    feature_set = feature_set.add_pre_hook(hook1).add_post_hook(hook1)

    sink = Sink(writers=[historical_writer]).add_pre_hook(hook1)

    test_pipeline = FeatureSetPipeline(
        source=source,
        feature_set=feature_set,
        sink=sink,
    )

    # act
    test_pipeline.run()
    output_df = spark_session.table("historical_feature_store__feature_set")

    # assert
    output_df.show()
    assert_dataframe_equality(output_df, target_df)
def test_apply_migration(self, feature_set, mocker):
    """Stub ``apply_migration`` on a ``CassandraMigration`` and call it once.

    NOTE(review): the method being called is itself replaced by the stub, so
    the real ``apply_migration`` implementation is not exercised here.
    """
    # given
    migration = CassandraMigration()
    migration.apply_migration = mocker.stub("apply_migration")

    # when
    writer = HistoricalFeatureStoreWriter()
    migration.apply_migration(feature_set, writer)

    # then
    migration.apply_migration.assert_called_once()
def test_validate(self, feature_set_dataframe, mocker, feature_set):
    """Check that ``validate`` reads the table exactly once.

    ``read_table`` is stubbed to return the same dataframe that is being
    validated.
    """
    # given
    read_table_stub = mocker.stub("read_table")
    read_table_stub.return_value = feature_set_dataframe

    spark_client = mocker.stub("spark_client")
    spark_client.read_table = read_table_stub

    writer = HistoricalFeatureStoreWriter()

    # when
    writer.validate(feature_set, feature_set_dataframe, spark_client)

    # then
    spark_client.read_table.assert_called_once()
def test_source_raise(self):
    """Expect a ``ValueError`` when ``source`` is a Mock without ``spec=Source``."""
    expected_message = "source must be a Source instance"
    with pytest.raises(ValueError, match=expected_message):
        # everything is built inside the context manager, as in the original
        pipeline_client = SparkClient()

        # deliberately no spec here: this is the invalid argument under test
        unspecced_source = Mock(
            spark_client=SparkClient(),
            readers=[TableReader(id="source_a", database="db", table="table")],
            query="select * from source_a",
        )

        feature_set = Mock(
            spec=FeatureSet,
            name="feature_set",
            entity="entity",
            description="description",
            keys=[
                KeyFeature(
                    name="user_id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(from_column="ts"),
            features=[
                Feature(
                    name="listing_page_viewed__rent_per_month",
                    description="Average of something.",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(functions.avg, DataType.FLOAT),
                            Function(functions.stddev_pop, DataType.FLOAT),
                        ],
                    ).with_window(
                        partition_by="user_id",
                        order_by=TIMESTAMP_COLUMN,
                        window_definition=["7 days", "2 weeks"],
                        mode="fixed_windows",
                    ),
                ),
            ],
        )

        sink = Mock(
            spec=Sink,
            writers=[HistoricalFeatureStoreWriter(db_config=None)],
        )

        FeatureSetPipeline(
            spark_client=pipeline_client,
            source=unspecced_source,
            feature_set=feature_set,
            sink=sink,
        )
def test_validate_false(self, feature_set_dataframe, mocker, feature_set):
    """Expect ``validate`` to raise ``AssertionError`` when row counts differ."""
    # given
    read_table_stub = mocker.stub("read_table")
    # limiting df to 1 row, now the counts shouldn't be the same
    read_table_stub.return_value = feature_set_dataframe.limit(1)

    spark_client = mocker.stub("spark_client")
    spark_client.read_table = read_table_stub

    writer = HistoricalFeatureStoreWriter()

    # when / then
    with pytest.raises(AssertionError):
        writer.validate(feature_set, feature_set_dataframe, spark_client)
def test_validate_interval_mode(
    self, historical_feature_set_dataframe, mocker, feature_set
):
    """Check that ``validate`` in interval mode calls ``read`` exactly once."""
    # given
    read_stub = mocker.stub("read")
    read_stub.return_value = historical_feature_set_dataframe

    spark_client = mocker.stub("spark_client")
    spark_client.read = read_stub

    writer = HistoricalFeatureStoreWriter(interval_mode=True)

    # when
    writer.validate(feature_set, historical_feature_set_dataframe, spark_client)

    # then
    spark_client.read.assert_called_once()
def feature_set_pipeline(
    spark_context, spark_session,
):
    """Build a ``FeatureSetPipeline`` that reads ``b_table`` incrementally.

    The reader uses an incremental strategy on the ``timestamp`` column, the
    feature set computes ``avg`` and ``stddev_pop`` over a one-day fixed
    window partitioned by ``id``, and the sink holds a single historical
    writer in debug mode.

    ``spark_context`` and ``spark_session`` are not used in the body; they
    are presumably requested so that Spark is initialised first - confirm
    against the fixture setup.
    """
    source = Source(
        readers=[
            TableReader(id="b_source", table="b_table").with_incremental_strategy(
                incremental_strategy=IncrementalStrategy(column="timestamp")
            ),
        ],
        # plain literal: the original f-string had no placeholders
        query="select * from b_source ",
    )

    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature",
                description="test",
                transformation=SparkFunctionTransform(
                    functions=[
                        Function(F.avg, DataType.FLOAT),
                        Function(F.stddev_pop, DataType.FLOAT),
                    ],
                ).with_window(
                    partition_by="id",
                    order_by=TIMESTAMP_COLUMN,
                    mode="fixed_windows",
                    window_definition=["1 day"],
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.INTEGER,
            )
        ],
        timestamp=TimestampFeature(),
    )

    sink = Sink(writers=[HistoricalFeatureStoreWriter(debug_mode=True)])

    return FeatureSetPipeline(source=source, feature_set=feature_set, sink=sink)
def loader(features_set_df: pyspark.sql.DataFrame) -> Sink:
    """Create the target table and return a Sink with both writers.

    Calls ``create_table`` for ``feature_store.orders_feature_master_table_``
    keyed on ``customer_id``, then returns a ``Sink`` holding a debug-mode
    historical writer and an online writer configured via ``get_config()``.
    """
    db_config = get_config()

    create_table(
        features_set_df,
        "feature_store",  # keyspace
        "orders_feature_master_table_",  # table name
        "customer_id",  # primary key
    )

    return Sink(
        writers=[
            HistoricalFeatureStoreWriter(debug_mode=True),
            OnlineFeatureStoreWriter(db_config=db_config),
        ]
    )
def test_flush_with_invalid_df(self, not_feature_set_dataframe, mocker):
    """Expect ``Sink.flush`` to raise ``ValueError`` for the invalid dataframe fixture."""
    # given
    spark_client = SparkClient()
    writers = [HistoricalFeatureStoreWriter(), OnlineFeatureStoreWriter()]

    feature_set = mocker.stub("feature_set")
    feature_set.entity = "house"
    feature_set.name = "test"

    # when
    sink = Sink(writers=writers)

    # then
    flush_kwargs = {
        "dataframe": not_feature_set_dataframe,
        "feature_set": feature_set,
        "spark_client": spark_client,
    }
    with pytest.raises(ValueError):
        sink.flush(**flush_kwargs)
def test_write_interval_mode_invalid_partition_mode(
    self,
    feature_set_dataframe,
    historical_feature_set_dataframe,
    mocker,
    feature_set,
):
    """Expect ``RuntimeError`` from an interval-mode write under ``static`` overwrite."""
    # given
    spark_client = SparkClient()
    spark_client.write_dataframe = mocker.stub("write_dataframe")
    # "static" is the setting this test treats as invalid for interval mode
    spark_client.conn.conf.set(
        "spark.sql.sources.partitionOverwriteMode", "static"
    )
    writer = HistoricalFeatureStoreWriter(interval_mode=True)

    # when / then
    with pytest.raises(RuntimeError):
        writer.write(
            feature_set=feature_set,
            dataframe=feature_set_dataframe,
            spark_client=spark_client,
        )
def test_write_in_debug_mode(
    self,
    feature_set_dataframe,
    historical_feature_set_dataframe,
    feature_set,
    spark_session,
):
    """Write in debug mode and compare the ``historical_feature_store__<name>`` table."""
    # given
    spark_client = SparkClient()
    writer = HistoricalFeatureStoreWriter(debug_mode=True)

    # when
    writer.write(
        feature_set=feature_set,
        dataframe=feature_set_dataframe,
        spark_client=spark_client,
    )
    debug_table = f"historical_feature_store__{feature_set.name}"
    result_df = spark_session.table(debug_table)

    # then
    assert_dataframe_equality(historical_feature_set_dataframe, result_df)
def test_sink_raise(self):
    """Expect a ``ValueError`` when ``sink`` is a Mock without ``spec=Sink``."""
    expected_message = "sink must be a Sink instance"
    with pytest.raises(ValueError, match=expected_message):
        # everything is built inside the context manager, as in the original
        pipeline_client = SparkClient()

        source = Mock(
            spec=Source,
            readers=[TableReader(id="source_a", database="db", table="table")],
            query="select * from source_a",
        )

        feature_set = Mock(
            spec=FeatureSet,
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="user_id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.FLOAT,
                ),
                Feature(
                    name="ts",
                    description="The timestamp feature",
                    dtype=DataType.TIMESTAMP,
                ),
            ],
            key_columns=["user_id"],
            timestamp_column="ts",
        )

        # deliberately no spec here: this is the invalid argument under test
        unspecced_sink = Mock(
            writers=[HistoricalFeatureStoreWriter(db_config=None)],
        )

        FeatureSetPipeline(
            spark_client=pipeline_client,
            source=source,
            feature_set=feature_set,
            sink=unspecced_sink,
        )
def test_validate_false(self, feature_set_dataframe, mocker):
    """Expect ``Sink.validate`` to raise ``RuntimeError`` when writers fail validation.

    Every writer's ``validate`` is stubbed to raise ``AssertionError("test")``.
    """
    # given
    spark_client = SparkClient()
    writers = [HistoricalFeatureStoreWriter(), OnlineFeatureStoreWriter()]
    for current in writers:
        failing_validate = mocker.stub("validate")
        failing_validate.side_effect = AssertionError("test")
        current.validate = failing_validate

    feature_set = mocker.stub("feature_set")

    # when
    sink = Sink(writers=writers)

    # then
    with pytest.raises(RuntimeError):
        sink.validate(
            dataframe=feature_set_dataframe,
            feature_set=feature_set,
            spark_client=spark_client,
        )
def test_validate(self, feature_set_dataframe, mocker):
    """Check that ``Sink.validate`` calls ``validate`` once on each writer."""
    # given
    spark_client = SparkClient()
    writers = [HistoricalFeatureStoreWriter(), OnlineFeatureStoreWriter()]
    for current in writers:
        current.validate = mocker.stub("validate")

    feature_set = mocker.stub("feature_set")

    # when
    sink = Sink(writers=writers)
    sink.validate(
        dataframe=feature_set_dataframe,
        feature_set=feature_set,
        spark_client=spark_client,
    )

    # then
    for current in writers:
        current.validate.assert_called_once()
def test_feature_set_args(self):
    """Build a pipeline from real components and check its exposed attributes.

    Verifies the default spark client type, the readers and query of the
    source, the name/entity/description/keys/timestamp/features/columns of
    the feature set, and the writers of the sink.
    """
    # arrange and act
    feature_prefix = "listing_page_viewed__rent_per_month__"
    out_columns = [
        "user_id",
        "timestamp",
        feature_prefix + "avg_over_7_days_fixed_windows",
        feature_prefix + "avg_over_2_weeks_fixed_windows",
        feature_prefix + "stddev_pop_over_7_days_fixed_windows",
        feature_prefix + "stddev_pop_over_2_weeks_fixed_windows",
    ]

    source = Source(
        readers=[
            TableReader(id="source_a", database="db", table="table"),
            FileReader(id="source_b", path="path", format="parquet"),
        ],
        query="select a.*, b.specific_feature "
        "from source_a left join source_b on a.id=b.id",
    )

    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        keys=[
            KeyFeature(
                name="user_id",
                description="The user's Main ID or device ID",
                dtype=DataType.INTEGER,
            )
        ],
        timestamp=TimestampFeature(from_column="ts"),
        features=[
            Feature(
                name="listing_page_viewed__rent_per_month",
                description="Average of something.",
                transformation=SparkFunctionTransform(
                    functions=[
                        Function(functions.avg, DataType.FLOAT),
                        Function(functions.stddev_pop, DataType.FLOAT),
                    ],
                ).with_window(
                    partition_by="user_id",
                    order_by=TIMESTAMP_COLUMN,
                    window_definition=["7 days", "2 weeks"],
                    mode="fixed_windows",
                ),
            ),
        ],
    )

    sink = Sink(
        writers=[
            HistoricalFeatureStoreWriter(db_config=None),
            OnlineFeatureStoreWriter(db_config=None),
        ],
    )

    pipeline = FeatureSetPipeline(source=source, feature_set=feature_set, sink=sink)

    # assert: spark client
    assert isinstance(pipeline.spark_client, SparkClient)

    # assert: source
    assert len(pipeline.source.readers) == 2
    for reader in pipeline.source.readers:
        assert isinstance(reader, Reader)
    assert isinstance(pipeline.source.query, str)

    # assert: feature set
    assert pipeline.feature_set.name == "feature_set"
    assert pipeline.feature_set.entity == "entity"
    assert pipeline.feature_set.description == "description"
    assert isinstance(pipeline.feature_set.timestamp, TimestampFeature)
    assert len(pipeline.feature_set.keys) == 1
    for key in pipeline.feature_set.keys:
        assert isinstance(key, KeyFeature)
    assert len(pipeline.feature_set.features) == 1
    for feature in pipeline.feature_set.features:
        assert isinstance(feature, Feature)
    assert pipeline.feature_set.columns == out_columns

    # assert: sink
    assert len(pipeline.sink.writers) == 2
    for writer in pipeline.sink.writers:
        assert isinstance(writer, Writer)
def test_feature_set_pipeline(
    self, mocked_df, spark_session, fixed_windows_output_feature_set_dataframe,
):
    """Run a full pipeline and compare the parquet output with the target fixture.

    The historical writer uses a mocked db config whose ``get_options``
    returns a parquet path under ``test_folder``. After ``run()`` that path
    is read back, ordered by timestamp and compared with the fixture.

    ``test_folder`` is removed in a ``finally`` clause so that a failing run
    or assertion does not leave files behind.
    """
    # arrange
    table_reader_id = "a_source"
    table_reader_table = "table"
    table_reader_db = environment.get_variable("FEATURE_STORE_HISTORICAL_DATABASE")

    create_temp_view(dataframe=mocked_df, name=table_reader_id)
    create_db_and_table(
        spark=spark_session,
        table_reader_id=table_reader_id,
        table_reader_db=table_reader_db,
        table_reader_table=table_reader_table,
    )

    dbconfig = Mock()
    dbconfig.mode = "overwrite"
    dbconfig.format_ = "parquet"
    dbconfig.get_options = Mock(
        return_value={"path": "test_folder/historical/entity/feature_set"}
    )

    historical_writer = HistoricalFeatureStoreWriter(db_config=dbconfig)

    # act
    test_pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(
                    id=table_reader_id,
                    database=table_reader_db,
                    table=table_reader_table,
                ),
            ],
            query=f"select * from {table_reader_id} ",  # noqa
        ),
        feature_set=FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.FLOAT),
                        ],
                    ).with_window(
                        partition_by="id",
                        order_by=TIMESTAMP_COLUMN,
                        mode="fixed_windows",
                        window_definition=["2 minutes", "15 minutes"],
                    ),
                ),
                Feature(
                    name="divided_feature",
                    description="unit test",
                    dtype=DataType.FLOAT,
                    transformation=CustomTransform(
                        transformer=divide, column1="feature1", column2="feature2",
                    ),
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(),
        ),
        sink=Sink(writers=[historical_writer]),
    )

    try:
        test_pipeline.run()

        # assert
        path = dbconfig.get_options("historical/entity/feature_set").get("path")
        df = spark_session.read.parquet(path).orderBy(TIMESTAMP_COLUMN)

        target_df = fixed_windows_output_feature_set_dataframe.orderBy(
            test_pipeline.feature_set.timestamp_column
        )

        assert_dataframe_equality(df, target_df)
    finally:
        # tear down: always clean up, and do not mask the real failure if the
        # folder was never created
        shutil.rmtree("test_folder", ignore_errors=True)
def test_pipeline_interval_run(
    self, mocked_date_df, pipeline_interval_run_target_dfs, spark_session
):
    """Testing pipeline's idempotent interval run feature.

    Source data:
    +-------+---+-------------------+-------------------+
    |feature| id|                 ts|          timestamp|
    +-------+---+-------------------+-------------------+
    |    200|  1|2016-04-11 11:31:11|2016-04-11 11:31:11|
    |    300|  1|2016-04-12 11:44:12|2016-04-12 11:44:12|
    |    400|  1|2016-04-13 11:46:24|2016-04-13 11:46:24|
    |    500|  1|2016-04-14 12:03:21|2016-04-14 12:03:21|
    +-------+---+-------------------+-------------------+

    The test executes 3 runs for different time intervals. The input data has 4 data
    points: 2016-04-11, 2016-04-12, 2016-04-13 and 2016-04-14. The following run
    specifications are:

    1)  Interval: from 2016-04-11 to 2016-04-13
        Target table result:
        +---+-------+---+-----+------+-------------------+----+
        |day|feature| id|month|run_id|          timestamp|year|
        +---+-------+---+-----+------+-------------------+----+
        | 11|    200|  1|    4|     1|2016-04-11 11:31:11|2016|
        | 12|    300|  1|    4|     1|2016-04-12 11:44:12|2016|
        | 13|    400|  1|    4|     1|2016-04-13 11:46:24|2016|
        +---+-------+---+-----+------+-------------------+----+

    2)  Interval: only 2016-04-14.
        Target table result:
        +---+-------+---+-----+------+-------------------+----+
        |day|feature| id|month|run_id|          timestamp|year|
        +---+-------+---+-----+------+-------------------+----+
        | 11|    200|  1|    4|     1|2016-04-11 11:31:11|2016|
        | 12|    300|  1|    4|     1|2016-04-12 11:44:12|2016|
        | 13|    400|  1|    4|     1|2016-04-13 11:46:24|2016|
        | 14|    500|  1|    4|     2|2016-04-14 12:03:21|2016|
        +---+-------+---+-----+------+-------------------+----+

    3)  Interval: only 2016-04-11.
        Target table result:
        +---+-------+---+-----+------+-------------------+----+
        |day|feature| id|month|run_id|          timestamp|year|
        +---+-------+---+-----+------+-------------------+----+
        | 11|    200|  1|    4|     3|2016-04-11 11:31:11|2016|
        | 12|    300|  1|    4|     1|2016-04-12 11:44:12|2016|
        | 13|    400|  1|    4|     1|2016-04-13 11:46:24|2016|
        | 14|    500|  1|    4|     2|2016-04-14 12:03:21|2016|
        +---+-------+---+-----+------+-------------------+----+

    The ``test_folder`` directory is removed in a ``finally`` clause so that
    a failing run or assertion does not leave files behind.
    """
    # arrange
    create_temp_view(dataframe=mocked_date_df, name="input_data")

    db = environment.get_variable("FEATURE_STORE_HISTORICAL_DATABASE")
    path = "test_folder/historical/entity/feature_set"

    spark_session.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
    spark_session.sql(f"create database if not exists {db}")
    spark_session.sql(
        f"create table if not exists {db}.feature_set_interval "
        "(id int, timestamp timestamp, feature int, "
        "run_id int, year int, month int, day int);"
    )

    dbconfig = MetastoreConfig()
    dbconfig.get_options = Mock(
        return_value={"mode": "overwrite", "format_": "parquet", "path": path}
    )

    historical_writer = HistoricalFeatureStoreWriter(
        db_config=dbconfig, interval_mode=True
    )

    first_run_hook = RunHook(id=1)
    second_run_hook = RunHook(id=2)
    third_run_hook = RunHook(id=3)

    (
        first_run_target_df,
        second_run_target_df,
        third_run_target_df,
    ) = pipeline_interval_run_target_dfs

    test_pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(id="id", table="input_data").with_incremental_strategy(
                    IncrementalStrategy("ts")
                ),
            ],
            query="select * from id ",
        ),
        feature_set=FeatureSet(
            name="feature_set_interval",
            entity="entity",
            description="",
            keys=[KeyFeature(name="id", description="", dtype=DataType.INTEGER)],
            timestamp=TimestampFeature(from_column="ts"),
            features=[
                Feature(name="feature", description="", dtype=DataType.INTEGER),
                Feature(name="run_id", description="", dtype=DataType.INTEGER),
            ],
        ),
        sink=Sink([historical_writer]),
    )

    try:
        # act and assert
        # first run: 2016-04-11 to 2016-04-13
        dbconfig.get_path_with_partitions = Mock(
            return_value=[
                "test_folder/historical/entity/feature_set/year=2016/month=4/day=11",
                "test_folder/historical/entity/feature_set/year=2016/month=4/day=12",
                "test_folder/historical/entity/feature_set/year=2016/month=4/day=13",
            ]
        )
        test_pipeline.feature_set.add_pre_hook(first_run_hook)
        test_pipeline.run(end_date="2016-04-13", start_date="2016-04-11")
        first_run_output_df = spark_session.read.parquet(path)
        assert_dataframe_equality(first_run_output_df, first_run_target_df)

        # second run: only 2016-04-14
        dbconfig.get_path_with_partitions = Mock(
            return_value=[
                "test_folder/historical/entity/feature_set/year=2016/month=4/day=14",
            ]
        )
        test_pipeline.feature_set.add_pre_hook(second_run_hook)
        test_pipeline.run_for_date("2016-04-14")
        second_run_output_df = spark_session.read.parquet(path)
        assert_dataframe_equality(second_run_output_df, second_run_target_df)

        # third run: only 2016-04-11 again
        dbconfig.get_path_with_partitions = Mock(
            return_value=[
                "test_folder/historical/entity/feature_set/year=2016/month=4/day=11",
            ]
        )
        test_pipeline.feature_set.add_pre_hook(third_run_hook)
        test_pipeline.run_for_date("2016-04-11")
        third_run_output_df = spark_session.read.parquet(path)
        assert_dataframe_equality(third_run_output_df, third_run_target_df)
    finally:
        # tear down: always clean up, and do not mask the real failure if the
        # folder was never created
        shutil.rmtree("test_folder", ignore_errors=True)