def test_feature_set():
    return AggregatedFeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=AggregatedTransform(
                    functions=[
                        Function(functions.avg, DataType.DOUBLE),
                        Function(functions.stddev_pop, DataType.DOUBLE),
                    ]
                ),
            ),
            Feature(
                name="feature2",
                description="test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.count, DataType.INTEGER)]
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    ).with_windows(definitions=["1 week", "2 days"])

def agg_feature_set():
    return AggregatedFeatureSet(
        name="name",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.avg, DataType.DOUBLE)],
                ),
            ),
            Feature(
                name="feature2",
                description="test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.avg, DataType.DOUBLE)],
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="description",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    )

def test_construct(
    self, feature_set_dataframe, fixed_windows_output_feature_set_dataframe
):
    # given
    spark_client = SparkClient()

    # arrange
    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=SparkFunctionTransform(
                    functions=[
                        Function(F.avg, DataType.FLOAT),
                        Function(F.stddev_pop, DataType.FLOAT),
                    ]
                ).with_window(
                    partition_by="id",
                    order_by=TIMESTAMP_COLUMN,
                    mode="fixed_windows",
                    window_definition=["2 minutes", "15 minutes"],
                ),
            ),
            Feature(
                name="divided_feature",
                description="unit test",
                dtype=DataType.FLOAT,
                transformation=CustomTransform(
                    transformer=divide,
                    column1="feature1",
                    column2="feature2",
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.INTEGER,
            )
        ],
        timestamp=TimestampFeature(),
    )

    output_df = (
        feature_set.construct(feature_set_dataframe, client=spark_client)
        .orderBy(feature_set.timestamp_column)
        .select(feature_set.columns)
    )
    target_df = fixed_windows_output_feature_set_dataframe.orderBy(
        feature_set.timestamp_column
    ).select(feature_set.columns)

    # assert
    assert_dataframe_equality(output_df, target_df)

def test_aggregations_with_filter_expression(self, spark_context):
    # arrange
    test_feature = Feature(
        name="feature_with_filter",
        description="unit test",
        transformation=AggregatedTransform(
            functions=[
                Function(functions.avg, DataType.DOUBLE),
                Function(functions.min, DataType.DOUBLE),
                Function(functions.max, DataType.DOUBLE),
            ],
            filter_expression="type = 'a'",
        ),
        from_column="feature",
    )
    target_aggregations = [
        agg(functions.when(functions.expr("type = 'a'"), functions.col("feature")))
        for agg in [functions.avg, functions.min, functions.max]
    ]

    # act
    output_aggregations = [
        agg.function for agg in test_feature.transformation.aggregations
    ]

    # assert
    # cast to string to compare the columns definitions because direct column
    # comparison was not working
    assert str(target_aggregations) == str(output_aggregations)

def test_get_schema(self):
    expected_schema = [
        {"column_name": "id", "type": LongType(), "primary_key": True},
        {"column_name": "timestamp", "type": TimestampType(), "primary_key": False},
        {
            "column_name": "feature1__avg_over_2_minutes_fixed_windows",
            "type": FloatType(),
            "primary_key": False,
        },
        {
            "column_name": "feature1__avg_over_15_minutes_fixed_windows",
            "type": FloatType(),
            "primary_key": False,
        },
        {
            "column_name": "feature1__stddev_pop_over_2_minutes_fixed_windows",
            "type": DoubleType(),
            "primary_key": False,
        },
        {
            "column_name": "feature1__stddev_pop_over_15_minutes_fixed_windows",
            "type": DoubleType(),
            "primary_key": False,
        },
    ]

    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=SparkFunctionTransform(
                    functions=[
                        Function(F.avg, DataType.FLOAT),
                        Function(F.stddev_pop, DataType.DOUBLE),
                    ]
                ).with_window(
                    partition_by="id",
                    order_by=TIMESTAMP_COLUMN,
                    mode="fixed_windows",
                    window_definition=["2 minutes", "15 minutes"],
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    )

    schema = feature_set.get_schema()

    assert schema == expected_schema

def feature_set():
    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=SparkFunctionTransform(
                    functions=[
                        Function(functions.avg, DataType.FLOAT),
                        Function(functions.stddev_pop, DataType.DOUBLE),
                    ]
                ).with_window(
                    partition_by="id",
                    order_by=TIMESTAMP_COLUMN,
                    mode="fixed_windows",
                    window_definition=["2 minutes", "15 minutes"],
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    )
    return feature_set

def test_run_agg_with_end_date(self, spark_session):
    test_pipeline = FeatureSetPipeline(
        spark_client=SparkClient(),
        source=Mock(
            spec=Source,
            readers=[
                TableReader(
                    id="source_a",
                    database="db",
                    table="table",
                )
            ],
            query="select * from source_a",
        ),
        feature_set=Mock(
            spec=AggregatedFeatureSet,
            name="feature_set",
            entity="entity",
            description="description",
            keys=[
                KeyFeature(
                    name="user_id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(from_column="ts"),
            features=[
                Feature(
                    name="listing_page_viewed__rent_per_month",
                    description="Average of something.",
                    transformation=AggregatedTransform(
                        functions=[
                            Function(functions.avg, DataType.FLOAT),
                            Function(functions.stddev_pop, DataType.FLOAT),
                        ],
                    ),
                ),
            ],
        ),
        sink=Mock(
            spec=Sink,
            writers=[HistoricalFeatureStoreWriter(db_config=None)],
        ),
    )

    # feature_set needs to return a real df for streaming validation
    sample_df = spark_session.createDataFrame([{"a": "x", "b": "y", "c": "3"}])
    test_pipeline.feature_set.construct.return_value = sample_df

    test_pipeline.run(end_date="2016-04-18")

    test_pipeline.source.construct.assert_called_once()
    test_pipeline.feature_set.construct.assert_called_once()
    test_pipeline.sink.flush.assert_called_once()
    test_pipeline.sink.validate.assert_called_once()

def test_source_raise(self):
    with pytest.raises(ValueError, match="source must be a Source instance"):
        FeatureSetPipeline(
            spark_client=SparkClient(),
            source=Mock(
                spark_client=SparkClient(),
                readers=[
                    TableReader(
                        id="source_a",
                        database="db",
                        table="table",
                    ),
                ],
                query="select * from source_a",
            ),
            feature_set=Mock(
                spec=FeatureSet,
                name="feature_set",
                entity="entity",
                description="description",
                keys=[
                    KeyFeature(
                        name="user_id",
                        description="The user's Main ID or device ID",
                        dtype=DataType.INTEGER,
                    )
                ],
                timestamp=TimestampFeature(from_column="ts"),
                features=[
                    Feature(
                        name="listing_page_viewed__rent_per_month",
                        description="Average of something.",
                        transformation=SparkFunctionTransform(
                            functions=[
                                Function(functions.avg, DataType.FLOAT),
                                Function(functions.stddev_pop, DataType.FLOAT),
                            ],
                        ).with_window(
                            partition_by="user_id",
                            order_by=TIMESTAMP_COLUMN,
                            window_definition=["7 days", "2 weeks"],
                            mode="fixed_windows",
                        ),
                    ),
                ],
            ),
            sink=Mock(
                spec=Sink,
                writers=[HistoricalFeatureStoreWriter(db_config=None)],
            ),
        )

def test_feature_transform(self, feature_set_dataframe, target_df_agg):
    test_feature = Feature(
        name="feature1",
        description="unit test",
        transformation=AggregatedTransform(
            functions=[
                Function(functions.avg, DataType.DOUBLE),
                Function(functions.stddev_pop, DataType.DOUBLE),
            ]
        ),
    )

    # aggregated feature transform won't run transformations
    # and depends on the feature set
    with pytest.raises(NotImplementedError):
        _ = test_feature.transform(feature_set_dataframe)

def test_feature_set_start_date(
    self,
    timestamp_c,
    feature_set_with_distinct_dataframe,
):
    fs = AggregatedFeatureSet(
        name="name",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature",
                description="test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.sum, DataType.INTEGER)]
                ),
            ),
        ],
        keys=[KeyFeature(name="h3", description="test", dtype=DataType.STRING)],
        timestamp=timestamp_c,
    ).with_windows(["10 days", "3 weeks", "90 days"])

    start_date = fs.define_start_date("2016-04-14")

    # assert
    assert start_date == "2016-01-14"

def test_feature_transform_with_distinct_empty_subset(
    self, timestamp_c, feature_set_with_distinct_dataframe
):
    spark_client = SparkClient()

    with pytest.raises(
        ValueError, match="The distinct subset param can't be empty."
    ):
        AggregatedFeatureSet(
            name="name",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature",
                    description="test",
                    transformation=AggregatedTransform(
                        functions=[Function(functions.sum, DataType.INTEGER)]
                    ),
                ),
            ],
            keys=[KeyFeature(name="h3", description="test", dtype=DataType.STRING)],
            timestamp=timestamp_c,
        ).with_windows(["3 days"]).with_distinct(subset=[], keep="first").construct(
            feature_set_with_distinct_dataframe, spark_client, end_date="2020-01-10"
        )

def test_feature_set_with_invalid_feature(self, key_id, timestamp_c, dataframe):
    spark_client = SparkClient()

    with pytest.raises(ValueError):
        AggregatedFeatureSet(
            name="name",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[Function(functions.avg, DataType.FLOAT)],
                    ).with_window(
                        partition_by="id",
                        mode="row_windows",
                        window_definition=["2 events"],
                    ),
                ),
            ],
            keys=[key_id],
            timestamp=timestamp_c,
        ).construct(dataframe, spark_client)

def test_feature_transform_with_distinct(
    self,
    timestamp_c,
    feature_set_with_distinct_dataframe,
    target_with_distinct_dataframe,
):
    spark_client = SparkClient()

    fs = (
        AggregatedFeatureSet(
            name="name",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature",
                    description="test",
                    transformation=AggregatedTransform(
                        functions=[Function(functions.sum, DataType.INTEGER)]
                    ),
                ),
            ],
            keys=[KeyFeature(name="h3", description="test", dtype=DataType.STRING)],
            timestamp=timestamp_c,
        )
        .with_windows(["3 days"])
        .with_distinct(subset=["id"], keep="last")
    )

    output_df = fs.construct(
        feature_set_with_distinct_dataframe, spark_client, end_date="2020-01-10"
    )

    # assert
    assert_dataframe_equality(output_df, target_with_distinct_dataframe)

def feature_set_pipeline(
    spark_context,
    spark_session,
):
    feature_set_pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(
                    id="b_source",
                    table="b_table",
                ).with_incremental_strategy(
                    incremental_strategy=IncrementalStrategy(column="timestamp")
                ),
            ],
            query="select * from b_source ",  # noqa
        ),
        feature_set=FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.FLOAT),
                        ],
                    ).with_window(
                        partition_by="id",
                        order_by=TIMESTAMP_COLUMN,
                        mode="fixed_windows",
                        window_definition=["1 day"],
                    ),
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(),
        ),
        sink=Sink(writers=[HistoricalFeatureStoreWriter(debug_mode=True)]),
    )
    return feature_set_pipeline

def test_unsupported_aggregation(self, feature_set_dataframe):
    with pytest.raises(TypeError):
        Feature(
            name="feature1",
            description="unit test",
            transformation=AggregatedTransform(
                functions=[Function("median", DataType.DOUBLE)]
            ),
        )

def test_blank_aggregation(self, feature_set_dataframe):
    with pytest.raises(ValueError):
        Feature(
            name="feature1",
            description="unit test",
            transformation=AggregatedTransform(
                functions=[Function(func="", data_type="")]
            ),
        )

def test_output_columns(self): test_feature = Feature( name="feature1", description="unit test", transformation=AggregatedTransform(functions=[ Function(functions.avg, DataType.DOUBLE), Function(functions.stddev_pop, DataType.DOUBLE), ]), ) df_columns = test_feature.get_output_columns() assert all([ a == b for a, b in zip( df_columns, ["feature1__avg", "feature1__stddev_pop"], ) ])
def test_feature_transform(self, feature_set_dataframe, target_df_spark):
    test_feature = Feature(
        name="feature",
        description="unit test",
        transformation=SparkFunctionTransform(
            functions=[Function(functions.cos, DataType.DOUBLE)],
        ),
        from_column="feature1",
    )

    output_df = test_feature.transform(feature_set_dataframe)

    assert_dataframe_equality(output_df, target_df_spark)

def test_negative_windows(self, feature_set_dataframe):
    with pytest.raises(KeyError):
        Feature(
            name="feature1",
            description="unit test",
            transformation=SparkFunctionTransform(
                functions=[Function(functions.avg, DataType.DOUBLE)],
            ).with_window(
                partition_by="id",
                mode="fixed_windows",
                window_definition=["-2 weeks"],
            ),
        ).transform(feature_set_dataframe)

def test_agg_feature_set_with_window(
    self, key_id, timestamp_c, dataframe, rolling_windows_agg_dataframe
):
    spark_client = SparkClient()

    fs = AggregatedFeatureSet(
        name="name",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="unit test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.avg, DataType.FLOAT)]
                ),
            ),
            Feature(
                name="feature2",
                description="unit test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.avg, DataType.FLOAT)]
                ),
            ),
        ],
        keys=[key_id],
        timestamp=timestamp_c,
    ).with_windows(definitions=["1 week"])

    # raises without an end date
    with pytest.raises(ValueError):
        _ = fs.construct(dataframe, spark_client)

    # filters with a date smaller than the mocked max
    output_df = fs.construct(dataframe, spark_client, end_date="2016-04-17")
    assert output_df.count() < rolling_windows_agg_dataframe.count()

    output_df = fs.construct(dataframe, spark_client, end_date="2016-05-01")
    assert_dataframe_equality(output_df, rolling_windows_agg_dataframe)

def test_anonymous_function(self):
    with pytest.raises(
        AttributeError,
        match="Anonymous functions are not supported on AggregatedTransform.",
    ):
        Feature(
            name="feature1",
            description="unit test",
            transformation=AggregatedTransform(
                functions=[
                    Function(func=partial(functions.count), data_type=DataType.INTEGER)
                ]
            ),
        ).get_output_columns()

def test_feature_transform_with_window(
    self, feature_set_dataframe, target_df_rows_agg
):
    test_feature = Feature(
        name="feature1",
        description="unit test",
        transformation=SparkFunctionTransform(
            functions=[Function(functions.avg, DataType.DOUBLE)],
        ).with_window(
            partition_by="id",
            mode="row_windows",
            window_definition=["2 events", "3 events"],
        ),
    )

    output_df = test_feature.transform(feature_set_dataframe)

    assert_dataframe_equality(output_df, target_df_rows_agg)

def test_feature_set_with_invalid_feature(self, key_id, timestamp_c, dataframe):
    spark_client = SparkClient()

    with pytest.raises(ValueError):
        FeatureSet(
            name="name",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=AggregatedTransform(
                        functions=[Function(F.avg, DataType.FLOAT)]
                    ),
                ),
            ],
            keys=[key_id],
            timestamp=timestamp_c,
        ).construct(dataframe, spark_client)

def feature_set_incremental():
    key_features = [
        KeyFeature(name="id", description="Description", dtype=DataType.INTEGER)
    ]
    ts_feature = TimestampFeature(from_column=TIMESTAMP_COLUMN)
    features = [
        Feature(
            name="feature",
            description="test",
            transformation=AggregatedTransform(
                functions=[Function(functions.sum, DataType.INTEGER)]
            ),
        ),
    ]
    return AggregatedFeatureSet(
        "feature_set",
        "entity",
        "description",
        keys=key_features,
        timestamp=ts_feature,
        features=features,
    )

def test_output_columns(self): test_feature = Feature( name="feature1", description="unit test", transformation=SparkFunctionTransform(functions=[ Function(functions.avg, DataType.DOUBLE) ], ).with_window( partition_by="id", mode="fixed_windows", window_definition=["7 days", "2 weeks"], ), ) df_columns = test_feature.get_output_columns() assert all([ a == b for a, b in zip( df_columns, [ "feature1__avg_over_7_days_fixed_windows", "feature1__avg_over_2_weeks_fixed_windows", ], ) ])
def test_feature_set_pipeline(
    self,
    mocked_df,
    spark_session,
    fixed_windows_output_feature_set_dataframe,
):
    # arrange
    table_reader_id = "a_source"
    table_reader_table = "table"
    table_reader_db = environment.get_variable("FEATURE_STORE_HISTORICAL_DATABASE")

    create_temp_view(dataframe=mocked_df, name=table_reader_id)
    create_db_and_table(
        spark=spark_session,
        table_reader_id=table_reader_id,
        table_reader_db=table_reader_db,
        table_reader_table=table_reader_table,
    )

    dbconfig = Mock()
    dbconfig.mode = "overwrite"
    dbconfig.format_ = "parquet"
    dbconfig.get_options = Mock(
        return_value={"path": "test_folder/historical/entity/feature_set"}
    )

    historical_writer = HistoricalFeatureStoreWriter(db_config=dbconfig)

    # act
    test_pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(
                    id=table_reader_id,
                    database=table_reader_db,
                    table=table_reader_table,
                ),
            ],
            query=f"select * from {table_reader_id} ",  # noqa
        ),
        feature_set=FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.FLOAT),
                        ],
                    ).with_window(
                        partition_by="id",
                        order_by=TIMESTAMP_COLUMN,
                        mode="fixed_windows",
                        window_definition=["2 minutes", "15 minutes"],
                    ),
                ),
                Feature(
                    name="divided_feature",
                    description="unit test",
                    dtype=DataType.FLOAT,
                    transformation=CustomTransform(
                        transformer=divide,
                        column1="feature1",
                        column2="feature2",
                    ),
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(),
        ),
        sink=Sink(writers=[historical_writer]),
    )
    test_pipeline.run()

    # assert
    path = dbconfig.get_options("historical/entity/feature_set").get("path")
    df = spark_session.read.parquet(path).orderBy(TIMESTAMP_COLUMN)

    target_df = fixed_windows_output_feature_set_dataframe.orderBy(
        test_pipeline.feature_set.timestamp_column
    )

    assert_dataframe_equality(df, target_df)

    # tear down
    shutil.rmtree("test_folder")

def test_feature_transform_with_data_type_array(self, spark_context, spark_session):
    # arrange
    input_data = [
        {"id": 1, "timestamp": "2020-04-22T00:00:00+00:00", "feature": 10},
        {"id": 1, "timestamp": "2020-04-22T00:00:00+00:00", "feature": 20},
        {"id": 1, "timestamp": "2020-04-22T00:00:00+00:00", "feature": 30},
        {"id": 2, "timestamp": "2020-04-22T00:00:00+00:00", "feature": 10},
    ]
    target_data = [
        {
            "id": 1,
            "timestamp": "2020-04-22T00:00:00+00:00",
            "feature__collect_set": [30.0, 20.0, 10.0],
        },
        {
            "id": 2,
            "timestamp": "2020-04-22T00:00:00+00:00",
            "feature__collect_set": [10.0],
        },
    ]
    input_df = create_df_from_collection(
        input_data, spark_context, spark_session
    ).withColumn("timestamp", functions.to_timestamp(functions.col("timestamp")))
    target_df = create_df_from_collection(
        target_data, spark_context, spark_session
    ).withColumn("timestamp", functions.to_timestamp(functions.col("timestamp")))

    fs = AggregatedFeatureSet(
        name="name",
        entity="entity",
        description="description",
        keys=[KeyFeature(name="id", description="test", dtype=DataType.INTEGER)],
        timestamp=TimestampFeature(),
        features=[
            Feature(
                name="feature",
                description="aggregations with array data type",
                dtype=DataType.BIGINT,
                transformation=AggregatedTransform(
                    functions=[
                        Function(functions.collect_set, DataType.ARRAY_FLOAT),
                    ],
                ),
                from_column="feature",
            ),
        ],
    )

    # act
    output_df = fs.construct(input_df, SparkClient())

    # assert
    assert_dataframe_equality(target_df, output_df)

def test_get_schema(self): expected_schema = [ { "column_name": "id", "type": LongType(), "primary_key": True }, { "column_name": "timestamp", "type": TimestampType(), "primary_key": False }, { "column_name": "feature1__avg_over_1_week_rolling_windows", "type": DoubleType(), "primary_key": False, }, { "column_name": "feature1__avg_over_2_days_rolling_windows", "type": DoubleType(), "primary_key": False, }, { "column_name": "feature1__stddev_pop_over_1_week_rolling_windows", "type": FloatType(), "primary_key": False, }, { "column_name": "feature1__stddev_pop_over_2_days_rolling_windows", "type": FloatType(), "primary_key": False, }, { "column_name": "feature2__count_over_1_week_rolling_windows", "type": ArrayType(StringType(), True), "primary_key": False, }, { "column_name": "feature2__count_over_2_days_rolling_windows", "type": ArrayType(StringType(), True), "primary_key": False, }, ] feature_set = AggregatedFeatureSet( name="feature_set", entity="entity", description="description", features=[ Feature( name="feature1", description="test", transformation=AggregatedTransform(functions=[ Function(functions.avg, DataType.DOUBLE), Function(functions.stddev_pop, DataType.FLOAT), ], ), ), Feature( name="feature2", description="test", transformation=AggregatedTransform(functions=[ Function(functions.count, DataType.ARRAY_STRING) ]), ), ], keys=[ KeyFeature( name="id", description="The user's Main ID or device ID", dtype=DataType.BIGINT, ) ], timestamp=TimestampFeature(), ).with_windows(definitions=["1 week", "2 days"]) schema = feature_set.get_schema() assert schema == expected_schema
def test_feature_transform_with_filter_expression(self, spark_context, spark_session):
    # arrange
    input_data = [
        {
            "id": 1,
            "timestamp": "2020-04-22T00:00:00+00:00",
            "feature": 10,
            "type": "a",
        },
        {
            "id": 1,
            "timestamp": "2020-04-22T00:00:00+00:00",
            "feature": 20,
            "type": "a",
        },
        {
            "id": 1,
            "timestamp": "2020-04-22T00:00:00+00:00",
            "feature": 30,
            "type": "b",
        },
        {
            "id": 2,
            "timestamp": "2020-04-22T00:00:00+00:00",
            "feature": 10,
            "type": "a",
        },
    ]
    target_data = [
        {
            "id": 1,
            "timestamp": "2020-04-22T00:00:00+00:00",
            "feature_only_type_a__avg": 15.0,
            "feature_only_type_a__min": 10,
            "feature_only_type_a__max": 20,
        },
        {
            "id": 2,
            "timestamp": "2020-04-22T00:00:00+00:00",
            "feature_only_type_a__avg": 10.0,
            "feature_only_type_a__min": 10,
            "feature_only_type_a__max": 10,
        },
    ]
    input_df = create_df_from_collection(
        input_data, spark_context, spark_session
    ).withColumn("timestamp", functions.to_timestamp(functions.col("timestamp")))
    target_df = create_df_from_collection(
        target_data, spark_context, spark_session
    ).withColumn("timestamp", functions.to_timestamp(functions.col("timestamp")))

    fs = AggregatedFeatureSet(
        name="name",
        entity="entity",
        description="description",
        keys=[KeyFeature(name="id", description="test", dtype=DataType.INTEGER)],
        timestamp=TimestampFeature(),
        features=[
            Feature(
                name="feature_only_type_a",
                description="aggregations only when type = a",
                dtype=DataType.BIGINT,
                transformation=AggregatedTransform(
                    functions=[
                        Function(functions.avg, DataType.FLOAT),
                        Function(functions.min, DataType.FLOAT),
                        Function(functions.max, DataType.FLOAT),
                    ],
                    filter_expression="type = 'a'",
                ),
                from_column="feature",
            ),
        ],
    )

    # act
    output_df = fs.construct(input_df, SparkClient())

    # assert
    assert_dataframe_equality(target_df, output_df)

def test_feature_set_args(self):
    # arrange and act
    out_columns = [
        "user_id",
        "timestamp",
        "listing_page_viewed__rent_per_month__avg_over_7_days_fixed_windows",
        "listing_page_viewed__rent_per_month__avg_over_2_weeks_fixed_windows",
        "listing_page_viewed__rent_per_month__stddev_pop_over_7_days_fixed_windows",
        "listing_page_viewed__rent_per_month__"
        "stddev_pop_over_2_weeks_fixed_windows",  # noqa
    ]
    pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(
                    id="source_a",
                    database="db",
                    table="table",
                ),
                FileReader(
                    id="source_b",
                    path="path",
                    format="parquet",
                ),
            ],
            query="select a.*, b.specific_feature "
            "from source_a left join source_b on a.id=b.id",
        ),
        feature_set=FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            keys=[
                KeyFeature(
                    name="user_id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(from_column="ts"),
            features=[
                Feature(
                    name="listing_page_viewed__rent_per_month",
                    description="Average of something.",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(functions.avg, DataType.FLOAT),
                            Function(functions.stddev_pop, DataType.FLOAT),
                        ],
                    ).with_window(
                        partition_by="user_id",
                        order_by=TIMESTAMP_COLUMN,
                        window_definition=["7 days", "2 weeks"],
                        mode="fixed_windows",
                    ),
                ),
            ],
        ),
        sink=Sink(
            writers=[
                HistoricalFeatureStoreWriter(db_config=None),
                OnlineFeatureStoreWriter(db_config=None),
            ],
        ),
    )

    assert isinstance(pipeline.spark_client, SparkClient)

    assert len(pipeline.source.readers) == 2
    assert all(isinstance(reader, Reader) for reader in pipeline.source.readers)
    assert isinstance(pipeline.source.query, str)

    assert pipeline.feature_set.name == "feature_set"
    assert pipeline.feature_set.entity == "entity"
    assert pipeline.feature_set.description == "description"
    assert isinstance(pipeline.feature_set.timestamp, TimestampFeature)
    assert len(pipeline.feature_set.keys) == 1
    assert all(isinstance(k, KeyFeature) for k in pipeline.feature_set.keys)
    assert len(pipeline.feature_set.features) == 1
    assert all(
        isinstance(feature, Feature) for feature in pipeline.feature_set.features
    )
    assert pipeline.feature_set.columns == out_columns

    assert len(pipeline.sink.writers) == 2
    assert all(isinstance(writer, Writer) for writer in pipeline.sink.writers)