def test_output_columns(self):
    # arrange
    h3_feature = Feature(
        name="new_feature",
        description="unit test",
        dtype=DataType.STRING,
        transformation=H3HashTransform(
            h3_resolutions=[6, 7, 8, 9, 10, 11, 12],
            lat_column="lat",
            lng_column="lng",
        ),
    )
    target_columns = [
        "lat_lng__h3_hash__6",
        "lat_lng__h3_hash__7",
        "lat_lng__h3_hash__8",
        "lat_lng__h3_hash__9",
        "lat_lng__h3_hash__10",
        "lat_lng__h3_hash__11",
        "lat_lng__h3_hash__12",
    ]

    # act
    output_columns = h3_feature.get_output_columns()

    # assert
    assert sorted(output_columns) == sorted(target_columns)
def agg_feature_set():
    return AggregatedFeatureSet(
        name="name",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.avg, DataType.DOUBLE)],
                ),
            ),
            Feature(
                name="feature2",
                description="test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.avg, DataType.DOUBLE)],
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="description",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    )
def test_construct(
    self, feature_set_dataframe, fixed_windows_output_feature_set_dataframe
):
    # given
    spark_client = SparkClient()

    # arrange
    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=SparkFunctionTransform(
                    functions=[
                        Function(F.avg, DataType.FLOAT),
                        Function(F.stddev_pop, DataType.FLOAT),
                    ]
                ).with_window(
                    partition_by="id",
                    order_by=TIMESTAMP_COLUMN,
                    mode="fixed_windows",
                    window_definition=["2 minutes", "15 minutes"],
                ),
            ),
            Feature(
                name="divided_feature",
                description="unit test",
                dtype=DataType.FLOAT,
                transformation=CustomTransform(
                    transformer=divide, column1="feature1", column2="feature2",
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.INTEGER,
            )
        ],
        timestamp=TimestampFeature(),
    )

    # act
    output_df = (
        feature_set.construct(feature_set_dataframe, client=spark_client)
        .orderBy(feature_set.timestamp_column)
        .select(feature_set.columns)
    )

    target_df = fixed_windows_output_feature_set_dataframe.orderBy(
        feature_set.timestamp_column
    ).select(feature_set.columns)

    # assert
    assert_dataframe_equality(output_df, target_df)
def test_feature_set():
    return AggregatedFeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=AggregatedTransform(
                    functions=[
                        Function(functions.avg, DataType.DOUBLE),
                        Function(functions.stddev_pop, DataType.DOUBLE),
                    ]
                ),
            ),
            Feature(
                name="feature2",
                description="test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.count, DataType.INTEGER)]
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    ).with_windows(definitions=["1 week", "2 days"])
def __init__(self):
    super(FirstPipeline, self).__init__(
        source=Source(
            readers=[TableReader(id="t", database="db", table="table")],
            query="select * from t",
        ),
        feature_set=FeatureSet(
            name="first",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    dtype=DataType.FLOAT,
                ),
                Feature(
                    name="feature2",
                    description="another test",
                    dtype=DataType.STRING,
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="identifier",
                    dtype=DataType.BIGINT,
                )
            ],
            timestamp=TimestampFeature(),
        ),
        sink=Sink(
            writers=[HistoricalFeatureStoreWriter(), OnlineFeatureStoreWriter()]
        ),
    )
def test_construct_rolling_windows_with_end_date(
    self,
    feature_set_dataframe,
    rolling_windows_output_feature_set_dataframe_base_date,
):
    # given
    spark_client = SparkClient()

    # arrange
    feature_set = AggregatedFeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=AggregatedTransform(
                    functions=[
                        Function(F.avg, DataType.DOUBLE),
                        Function(F.stddev_pop, DataType.DOUBLE),
                    ],
                ),
            ),
            Feature(
                name="feature2",
                description="test",
                transformation=AggregatedTransform(
                    functions=[
                        Function(F.avg, DataType.DOUBLE),
                        Function(F.stddev_pop, DataType.DOUBLE),
                    ],
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.INTEGER,
            )
        ],
        timestamp=TimestampFeature(),
    ).with_windows(definitions=["1 day", "1 week"])

    # act
    output_df = feature_set.construct(
        feature_set_dataframe, client=spark_client, end_date="2016-04-18"
    ).orderBy("timestamp")

    target_df = rolling_windows_output_feature_set_dataframe_base_date.orderBy(
        feature_set.timestamp_column
    ).select(feature_set.columns)

    # assert
    assert_dataframe_equality(output_df, target_df)
def test_feature_transform_with_dtype(self, feature_set_dataframe):
    test_feature = Feature(
        name="feature",
        description="unit test",
        dtype=DataType.TIMESTAMP,
    )

    df = test_feature.transform(feature_set_dataframe)

    assert dict(df.dtypes).get("feature") == "timestamp"
def test_feature_get_output_columns_without_transformations(self):
    test_feature = Feature(
        name="feature",
        from_column="origin",
        description="unit test",
        dtype=DataType.BIGINT,
    )

    assert test_feature.get_output_columns() == [test_feature.name]
def test_feature_transform_invalid_output(self, feature_set_dataframe):
    with pytest.raises(Exception):
        test_feature = Feature(
            name="feature1_plus_a",
            description="unit test",
            dtype=DataType.FLOAT,
            transformation=SQLExpressionTransform(expression="feature2 + a"),
        )
        test_feature.transform(feature_set_dataframe).collect()
def test_feature_transform(self, feature_set_dataframe, target_df_spark):
    test_feature = Feature(
        name="feature",
        description="unit test",
        transformation=SparkFunctionTransform(
            functions=[Function(functions.cos, DataType.DOUBLE)],
        ),
        from_column="feature1",
    )

    output_df = test_feature.transform(feature_set_dataframe)

    assert_dataframe_equality(output_df, target_df_spark)
def test_feature_transform_no_from_column(self, feature_set_dataframe):
    test_feature = Feature(
        name="feature",
        description="unit test feature without transformation",
        dtype=DataType.BIGINT,
    )

    df = test_feature.transform(feature_set_dataframe)

    assert all(
        [a == b for a, b in zip(df.columns, feature_set_dataframe.columns)]
    )
def test_feature_transform(self, feature_set_dataframe, target_df_agg):
    test_feature = Feature(
        name="feature1",
        description="unit test",
        transformation=AggregatedTransform(
            functions=[
                Function(functions.avg, DataType.DOUBLE),
                Function(functions.stddev_pop, DataType.DOUBLE),
            ]
        ),
    )

    # aggregated feature transform won't run transformations
    # and depends on the feature set
    with pytest.raises(NotImplementedError):
        _ = test_feature.transform(feature_set_dataframe)
def test_feature_transform_output(self, feature_set_dataframe):
    test_feature = Feature(
        name="feature1_over_feature2",
        description="unit test",
        dtype=DataType.FLOAT,
        transformation=SQLExpressionTransform(expression="feature1/feature2"),
    )

    df = test_feature.transform(feature_set_dataframe).collect()

    assert df[0]["feature1_over_feature2"] == 1
    assert df[1]["feature1_over_feature2"] == 1
    assert df[2]["feature1_over_feature2"] == 1
    assert df[3]["feature1_over_feature2"] == 1
def test_output_columns(self, feature_set_dataframe):
    test_feature = Feature(
        name="feature",
        description="unit test",
        dtype=DataType.BIGINT,
        transformation=CustomTransform(
            transformer=divide, column1="feature1", column2="feature2",
        ),
    )

    df_columns = test_feature.get_output_columns()

    assert isinstance(df_columns, list)
    assert df_columns == ["feature"]
def test_h3_feature_set(self, h3_input_df, h3_target_df):
    spark_client = SparkClient()

    feature_set = AggregatedFeatureSet(
        name="h3_test",
        entity="h3geolocation",
        description="Test",
        keys=[
            KeyFeature(
                name="h3_id",
                description="The h3 hash ID",
                dtype=DataType.DOUBLE,
                transformation=H3HashTransform(
                    h3_resolutions=[6, 7, 8, 9, 10, 11, 12],
                    lat_column="lat",
                    lng_column="lng",
                ).with_stack(),
            )
        ],
        timestamp=TimestampFeature(),
        features=[
            Feature(
                name="house_id",
                description="Count of house ids over a day.",
                transformation=AggregatedTransform(
                    functions=[Function(F.count, DataType.BIGINT)]
                ),
            ),
        ],
    ).with_windows(definitions=["1 day"])

    output_df = feature_set.construct(
        h3_input_df, client=spark_client, end_date="2016-04-14"
    )

    assert_dataframe_equality(output_df, h3_target_df)
def test_feature_get_output_columns_with_transformations(
    self, feature_set_dataframe
):
    some_transformation = Mock()
    some_transformation.output_columns = feature_set_dataframe.columns

    test_feature = Feature(
        name="feature",
        from_column="origin",
        description="unit test",
        transformation=some_transformation,
        dtype=DataType.BIGINT,
    )

    assert test_feature.get_output_columns() == feature_set_dataframe.columns
def test_overwriting_column(self, spark_session):
    # arrange
    input_df = spark_session.sql("select 0 as feature")

    feature_with_same_name = Feature(
        name="feature",
        description="description",
        dtype=DataType.INTEGER,
        transformation=SQLExpressionTransform(expression="feature + 1"),
    )

    target_df = spark_session.sql("select 1 as feature")

    # act
    output_df = feature_with_same_name.transform(input_df)

    # assert
    assert_dataframe_equality(output_df, target_df)
def test_aggregations_with_filter_expression(self, spark_context):
    # arrange
    test_feature = Feature(
        name="feature_with_filter",
        description="unit test",
        transformation=AggregatedTransform(
            functions=[
                Function(functions.avg, DataType.DOUBLE),
                Function(functions.min, DataType.DOUBLE),
                Function(functions.max, DataType.DOUBLE),
            ],
            filter_expression="type = 'a'",
        ),
        from_column="feature",
    )
    target_aggregations = [
        agg(functions.when(functions.expr("type = 'a'"), functions.col("feature")))
        for agg in [functions.avg, functions.min, functions.max]
    ]

    # act
    output_aggregations = [
        agg.function for agg in test_feature.transformation.aggregations
    ]

    # assert
    # cast to string to compare the columns definitions because direct column
    # comparison was not working
    assert str(target_aggregations) == str(output_aggregations)
def test_output_columns(self):
    test_feature = Feature(
        name="feature1_over_feature2",
        description="unit test",
        dtype=DataType.FLOAT,
        transformation=SQLExpressionTransform(expression="feature1/feature2"),
    )

    df_columns = test_feature.get_output_columns()

    assert all(
        [a == b for a, b in zip(df_columns, ["feature1_over_feature2"])]
    )
def test_custom_transform_output(self, feature_set_dataframe):
    test_feature = Feature(
        name="feature",
        description="unit test",
        dtype=DataType.BIGINT,
        transformation=CustomTransform(
            transformer=divide, column1="feature1", column2="feature2",
        ),
    )

    df = test_feature.transform(feature_set_dataframe).collect()

    assert df[0]["feature"] == 1
    assert df[1]["feature"] == 1
    assert df[2]["feature"] == 1
    assert df[3]["feature"] == 1
def ratio_order_amount_and_items():
    return Feature(
        name="ratio_order_amount_by_items_val",
        description="ratio order amount by items count",
        dtype=DataType.DOUBLE,
        transformation=CustomTransform(
            transformer=divide,
            column1="order_total_amount",
            column2="items_qtd",
        ),
    )
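# NOTE: the CustomTransform tests and fixtures above rely on a `divide` helper that
# is not shown in this section. The sketch below is only an assumption about its
# shape, based on CustomTransform forwarding the dataframe, the parent feature and
# the extra keyword arguments to the transformer; it is not the library's confirmed
# implementation.
from pyspark.sql import functions as F


def divide(df, parent_feature, column1, column2):
    # Write the ratio of column1 to column2 into the parent feature's output column.
    name = parent_feature.get_output_columns()[0]
    return df.withColumn(name, F.col(column1) / F.col(column2))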
def test_columns_not_in_dataframe(self, spark_context, spark_session):
    # arrange
    input_df = create_df_from_collection(
        self.input_data, spark_context, spark_session
    )

    feature = Feature(
        name="id",
        description="stack transformation",
        dtype=DataType.STRING,
        transformation=StackTransform("id_c", "id_d"),
    )

    # act and assert
    with pytest.raises(ValueError, match="Columns not found, columns in df: "):
        feature.transform(input_df)
def test_feature_transform_with_from_column(self, feature_set_dataframe):
    test_feature = Feature(
        name="new_feature",
        from_column="feature",
        description="unit test",
        dtype=DataType.BIGINT,
    )

    df = test_feature.transform(feature_set_dataframe)

    assert all(
        [
            a == b
            for a, b in zip(
                sorted(df.columns),
                sorted(["new_feature", "id", TIMESTAMP_COLUMN, "feature"]),
            )
        ]
    )
def test_get_schema(self):
    expected_schema = [
        {"column_name": "id", "type": LongType(), "primary_key": True},
        {"column_name": "timestamp", "type": TimestampType(), "primary_key": False},
        {
            "column_name": "feature1__avg_over_2_minutes_fixed_windows",
            "type": FloatType(),
            "primary_key": False,
        },
        {
            "column_name": "feature1__avg_over_15_minutes_fixed_windows",
            "type": FloatType(),
            "primary_key": False,
        },
        {
            "column_name": "feature1__stddev_pop_over_2_minutes_fixed_windows",
            "type": DoubleType(),
            "primary_key": False,
        },
        {
            "column_name": "feature1__stddev_pop_over_15_minutes_fixed_windows",
            "type": DoubleType(),
            "primary_key": False,
        },
    ]

    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=SparkFunctionTransform(
                    functions=[
                        Function(F.avg, DataType.FLOAT),
                        Function(F.stddev_pop, DataType.DOUBLE),
                    ]
                ).with_window(
                    partition_by="id",
                    order_by=TIMESTAMP_COLUMN,
                    mode="fixed_windows",
                    window_definition=["2 minutes", "15 minutes"],
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    )

    schema = feature_set.get_schema()

    assert schema == expected_schema
def test_feature_set_start_date(
    self,
    timestamp_c,
    feature_set_with_distinct_dataframe,
):
    # arrange
    fs = AggregatedFeatureSet(
        name="name",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature",
                description="test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.sum, DataType.INTEGER)]
                ),
            ),
        ],
        keys=[KeyFeature(name="h3", description="test", dtype=DataType.STRING)],
        timestamp=timestamp_c,
    ).with_windows(["10 days", "3 weeks", "90 days"])

    # act
    start_date = fs.define_start_date("2016-04-14")

    # assert
    assert start_date == "2016-01-14"
def feature_set():
    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=SparkFunctionTransform(
                    functions=[
                        Function(functions.avg, DataType.FLOAT),
                        Function(functions.stddev_pop, DataType.DOUBLE),
                    ]
                ).with_window(
                    partition_by="id",
                    order_by=TIMESTAMP_COLUMN,
                    mode="fixed_windows",
                    window_definition=["2 minutes", "15 minutes"],
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    )
    return feature_set
def test_feature_transform_with_distinct_empty_subset(
    self, timestamp_c, feature_set_with_distinct_dataframe
):
    spark_client = SparkClient()

    with pytest.raises(
        ValueError, match="The distinct subset param can't be empty."
    ):
        AggregatedFeatureSet(
            name="name",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature",
                    description="test",
                    transformation=AggregatedTransform(
                        functions=[Function(functions.sum, DataType.INTEGER)]
                    ),
                ),
            ],
            keys=[
                KeyFeature(name="h3", description="test", dtype=DataType.STRING)
            ],
            timestamp=timestamp_c,
        ).with_windows(["3 days"]).with_distinct(subset=[], keep="first").construct(
            feature_set_with_distinct_dataframe, spark_client, end_date="2020-01-10"
        )
def test_feature_transform_with_distinct(
    self,
    timestamp_c,
    feature_set_with_distinct_dataframe,
    target_with_distinct_dataframe,
):
    # arrange
    spark_client = SparkClient()

    fs = (
        AggregatedFeatureSet(
            name="name",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature",
                    description="test",
                    transformation=AggregatedTransform(
                        functions=[Function(functions.sum, DataType.INTEGER)]
                    ),
                ),
            ],
            keys=[
                KeyFeature(name="h3", description="test", dtype=DataType.STRING)
            ],
            timestamp=timestamp_c,
        )
        .with_windows(["3 days"])
        .with_distinct(subset=["id"], keep="last")
    )

    # act
    output_df = fs.construct(
        feature_set_with_distinct_dataframe, spark_client, end_date="2020-01-10"
    )

    # assert
    assert_dataframe_equality(output_df, target_with_distinct_dataframe)
def test_feature_set_with_invalid_feature(self, key_id, timestamp_c, dataframe):
    spark_client = SparkClient()

    with pytest.raises(ValueError):
        AggregatedFeatureSet(
            name="name",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[Function(functions.avg, DataType.FLOAT)],
                    ).with_window(
                        partition_by="id",
                        mode="row_windows",
                        window_definition=["2 events"],
                    ),
                ),
            ],
            keys=[key_id],
            timestamp=timestamp_c,
        ).construct(dataframe, spark_client)
def test_construct_without_window(
    self,
    feature_set_dataframe,
    target_df_without_window,
):
    # given
    spark_client = SparkClient()

    # arrange
    feature_set = AggregatedFeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                dtype=DataType.DOUBLE,
                transformation=AggregatedTransform(
                    functions=[Function(F.avg, DataType.DOUBLE)]
                ),
            ),
            Feature(
                name="feature2",
                description="test",
                dtype=DataType.FLOAT,
                transformation=AggregatedTransform(
                    functions=[Function(F.count, DataType.BIGINT)]
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.INTEGER,
            )
        ],
        timestamp=TimestampFeature(from_column="fixed_ts"),
    )

    # act
    output_df = feature_set.construct(feature_set_dataframe, client=spark_client)

    # assert
    assert_dataframe_equality(output_df, target_df_without_window)