def ratio_order_amount_and_items():
    """Feature: order total amount divided by the number of items in the order."""
    transformation = CustomTransform(
        transformer=divide,
        column1="order_total_amount",
        column2="items_qtd",
    )
    return Feature(
        name="ratio_order_amount_by_items_val",
        description="ratio order amount by items count",
        dtype=DataType.DOUBLE,
        transformation=transformation,
    )
def test_construct(
    self, feature_set_dataframe, fixed_windows_output_feature_set_dataframe
):
    """FeatureSet.construct should produce the expected fixed-windows output."""
    # given
    client = SparkClient()

    # arrange: a feature set with a windowed spark-function feature plus a
    # custom divide feature, keyed by id and timestamped
    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=SparkFunctionTransform(
                    functions=[
                        Function(F.avg, DataType.FLOAT),
                        Function(F.stddev_pop, DataType.FLOAT),
                    ]
                ).with_window(
                    partition_by="id",
                    order_by=TIMESTAMP_COLUMN,
                    mode="fixed_windows",
                    window_definition=["2 minutes", "15 minutes"],
                ),
            ),
            Feature(
                name="divided_feature",
                description="unit test",
                dtype=DataType.FLOAT,
                transformation=CustomTransform(
                    transformer=divide,
                    column1="feature1",
                    column2="feature2",
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.INTEGER,
            )
        ],
        timestamp=TimestampFeature(),
    )

    # act: construct, then normalize ordering/columns for comparison
    result_df = (
        feature_set.construct(feature_set_dataframe, client=client)
        .orderBy(feature_set.timestamp_column)
        .select(feature_set.columns)
    )
    expected_df = fixed_windows_output_feature_set_dataframe.orderBy(
        feature_set.timestamp_column
    ).select(feature_set.columns)

    # assert
    assert_dataframe_equality(result_df, expected_df)
def ratio_order_amount_and_average_ticket():
    """Feature: order total amount divided by the restaurant's average ticket."""
    transformation = CustomTransform(
        transformer=divide,
        column1="order_total_amount",
        column2="average_ticket",
    )
    return Feature(
        name="ratio_order_amount_by_average_ticket_val",
        description="ratio order amount by restaurant average ticket",
        dtype=DataType.DOUBLE,
        transformation=transformation,
    )
def test_blank_transformer(self, feature_set_dataframe):
    """A CustomTransform without a transformer callable must raise ValueError."""
    with pytest.raises(ValueError):
        Feature(
            name="feature",
            description="unit test",
            dtype=DataType.BIGINT,
            transformation=CustomTransform(transformer=None),
        )
def test_output_columns(self, feature_set_dataframe):
    """get_output_columns on a custom-transform feature yields its own name."""
    feature = Feature(
        name="feature",
        description="unit test",
        dtype=DataType.BIGINT,
        transformation=CustomTransform(
            transformer=divide,
            column1="feature1",
            column2="feature2",
        ),
    )

    output_columns = feature.get_output_columns()

    assert isinstance(output_columns, list)
    assert output_columns == ["feature"]
def test_custom_transform_output(self, feature_set_dataframe):
    """Dividing feature1 by feature2 yields 1 for every fixture row."""
    feature = Feature(
        name="feature",
        description="unit test",
        dtype=DataType.BIGINT,
        transformation=CustomTransform(
            transformer=divide,
            column1="feature1",
            column2="feature2",
        ),
    )

    rows = feature.transform(feature_set_dataframe).collect()

    # fixture columns are equal pairwise, so each quotient is exactly 1
    for index in range(4):
        assert rows[index]["feature"] == 1
def test_feature_transform(self, feature_set_dataframe):
    """transform() appends the custom feature column after the input columns.

    FIX: the original check used ``all(a == b for a, b in zip(df.columns,
    expected))``, but ``zip`` truncates at the shorter sequence, so extra or
    missing columns could pass silently. Direct list equality also verifies
    the column count and order.
    """
    test_feature = Feature(
        name="feature",
        description="unit test",
        dtype=DataType.BIGINT,
        transformation=CustomTransform(
            transformer=divide,
            column1="feature1",
            column2="feature2",
        ),
    )

    df = test_feature.transform(feature_set_dataframe)

    # exact match: same columns, same order, same count
    assert df.columns == [
        "feature1",
        "feature2",
        "id",
        TIMESTAMP_COLUMN,
        "feature",
    ]
def test_feature_set_pipeline(
    self,
    mocked_df,
    spark_session,
    fixed_windows_output_feature_set_dataframe,
):
    """End-to-end pipeline run: source -> feature set -> parquet sink.

    Registers the input dataframe as a temp table, runs the pipeline with a
    mocked writer config pointing at a local folder, then compares the
    written parquet against the expected fixture.

    FIX: the original called ``shutil.rmtree("test_folder")`` only after the
    assertion, so a failing assertion leaked the folder into later runs.
    The read/assert phase is now wrapped in try/finally.
    """
    # arrange: expose the mocked dataframe as a readable table
    table_reader_id = "a_source"
    table_reader_table = "table"
    table_reader_db = environment.get_variable(
        "FEATURE_STORE_HISTORICAL_DATABASE")
    create_temp_view(dataframe=mocked_df, name=table_reader_id)
    create_db_and_table(
        spark=spark_session,
        table_reader_id=table_reader_id,
        table_reader_db=table_reader_db,
        table_reader_table=table_reader_table,
    )

    # writer config mocked to write parquet into a local test folder
    dbconfig = Mock()
    dbconfig.mode = "overwrite"
    dbconfig.format_ = "parquet"
    dbconfig.get_options = Mock(
        return_value={"path": "test_folder/historical/entity/feature_set"})
    historical_writer = HistoricalFeatureStoreWriter(db_config=dbconfig)

    # act
    test_pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(
                    id=table_reader_id,
                    database=table_reader_db,
                    table=table_reader_table,
                ),
            ],
            query=f"select * from {table_reader_id} ",  # noqa
        ),
        feature_set=FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.FLOAT),
                        ],
                    ).with_window(
                        partition_by="id",
                        order_by=TIMESTAMP_COLUMN,
                        mode="fixed_windows",
                        window_definition=["2 minutes", "15 minutes"],
                    ),
                ),
                Feature(
                    name="divided_feature",
                    description="unit test",
                    dtype=DataType.FLOAT,
                    transformation=CustomTransform(
                        transformer=divide,
                        column1="feature1",
                        column2="feature2",
                    ),
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(),
        ),
        sink=Sink(writers=[historical_writer]),
    )
    test_pipeline.run()

    # assert: read back the written parquet and compare to the fixture,
    # always cleaning up the output folder afterwards
    try:
        path = dbconfig.get_options("historical/entity/feature_set").get(
            "path")
        df = spark_session.read.parquet(path).orderBy(TIMESTAMP_COLUMN)
        target_df = fixed_windows_output_feature_set_dataframe.orderBy(
            test_pipeline.feature_set.timestamp_column)
        assert_dataframe_equality(df, target_df)
    finally:
        # tear down
        shutil.rmtree("test_folder")
def count_items_in_order():
    """Feature: number of items contained in the order."""
    transformation = CustomTransform(
        transformer=count_items,
        column="items",
    )
    return Feature(
        name="items_qtd",
        description="count number of items in order",
        dtype=DataType.INTEGER,
        transformation=transformation,
    )
def avg_order_total_amount_from_last_1_month():
    """Feature: average order total amount over the last month."""
    transformation = CustomTransform(
        transformer=avg_last_1_month,
        column="order_total_amount",
    )
    return Feature(
        name="avg_order_amount_from_last_1_month_val",
        description="average order amount from last 1 month",
        dtype=DataType.DOUBLE,
        transformation=transformation,
    )