def test_filtering(
    self,
    filtering_dataframe,
    key_id,
    timestamp_c,
    feature1,
    feature2,
    feature3,
    output_filtering_dataframe,
):
    spark_client = Mock()

    # arrange
    feature_set = FeatureSet(
        "name",
        "entity",
        "description",
        [key_id],
        timestamp_c,
        [feature1, feature2, feature3],
    )

    # act
    result_df = (
        feature_set.construct(filtering_dataframe, spark_client)
        .orderBy("timestamp")
        .collect()
    )

    # assert
    assert (
        result_df
        == output_filtering_dataframe.orderBy("timestamp")
        .select(feature_set.columns)
        .collect()
    )
def test_construct_transformations(
    self,
    dataframe,
    feature_set_dataframe,
    key_id,
    timestamp_c,
    feature_add,
    feature_divide,
):
    spark_client = Mock()

    # arrange
    feature_set = FeatureSet(
        "name",
        "entity",
        "description",
        [key_id],
        timestamp_c,
        [feature_add, feature_divide],
    )

    # act
    result_df = feature_set.construct(dataframe, spark_client)

    # assert
    assert_dataframe_equality(result_df, feature_set_dataframe)
def test_get_schema(self):
    expected_schema = [
        {"column_name": "id", "type": LongType(), "primary_key": True},
        {"column_name": "timestamp", "type": TimestampType(), "primary_key": False},
        {
            "column_name": "feature1__avg_over_2_minutes_fixed_windows",
            "type": FloatType(),
            "primary_key": False,
        },
        {
            "column_name": "feature1__avg_over_15_minutes_fixed_windows",
            "type": FloatType(),
            "primary_key": False,
        },
        {
            "column_name": "feature1__stddev_pop_over_2_minutes_fixed_windows",
            "type": DoubleType(),
            "primary_key": False,
        },
        {
            "column_name": "feature1__stddev_pop_over_15_minutes_fixed_windows",
            "type": DoubleType(),
            "primary_key": False,
        },
    ]

    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=SparkFunctionTransform(
                    functions=[
                        Function(F.avg, DataType.FLOAT),
                        Function(F.stddev_pop, DataType.DOUBLE),
                    ]
                ).with_window(
                    partition_by="id",
                    order_by=TIMESTAMP_COLUMN,
                    mode="fixed_windows",
                    window_definition=["2 minutes", "15 minutes"],
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    )

    schema = feature_set.get_schema()

    assert schema == expected_schema
def test_construct(
    self, feature_set_dataframe, fixed_windows_output_feature_set_dataframe
):
    # given
    spark_client = SparkClient()

    # arrange
    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=SparkFunctionTransform(
                    functions=[
                        Function(F.avg, DataType.FLOAT),
                        Function(F.stddev_pop, DataType.FLOAT),
                    ]
                ).with_window(
                    partition_by="id",
                    order_by=TIMESTAMP_COLUMN,
                    mode="fixed_windows",
                    window_definition=["2 minutes", "15 minutes"],
                ),
            ),
            Feature(
                name="divided_feature",
                description="unit test",
                dtype=DataType.FLOAT,
                transformation=CustomTransform(
                    transformer=divide, column1="feature1", column2="feature2",
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.INTEGER,
            )
        ],
        timestamp=TimestampFeature(),
    )

    output_df = (
        feature_set.construct(feature_set_dataframe, client=spark_client)
        .orderBy(feature_set.timestamp_column)
        .select(feature_set.columns)
    )

    target_df = fixed_windows_output_feature_set_dataframe.orderBy(
        feature_set.timestamp_column
    ).select(feature_set.columns)

    # assert
    assert_dataframe_equality(output_df, target_df)
def test_construct(
    self,
    dataframe,
    feature_set_dataframe,
    key_id,
    timestamp_c,
    feature_add,
    feature_divide,
):
    spark_client = Mock()

    # arrange
    feature_set = FeatureSet(
        "name",
        "entity",
        "description",
        [key_id],
        timestamp_c,
        [feature_add, feature_divide],
    )

    # act
    result_df = feature_set.construct(dataframe, spark_client)
    result_columns = result_df.columns

    # assert
    assert (
        result_columns
        == key_id.get_output_columns()
        + timestamp_c.get_output_columns()
        + feature_add.get_output_columns()
        + feature_divide.get_output_columns()
    )
    assert_dataframe_equality(result_df, feature_set_dataframe)
    assert result_df.is_cached
def test_columns(self, key_id, timestamp_c, feature_add, feature_divide):
    # arrange
    name = "name"
    entity = "entity"
    description = "description"

    # act
    fs = FeatureSet(
        name,
        entity,
        description,
        [key_id],
        timestamp_c,
        [feature_add, feature_divide],
    )
    out_columns = fs.columns

    # assert
    assert (
        out_columns
        == key_id.get_output_columns()
        + timestamp_c.get_output_columns()
        + feature_add.get_output_columns()
        + feature_divide.get_output_columns()
    )
def transformer():
    # primary key
    keys = [
        KeyFeature(
            name="customer_id",
            description="Unique identifier code for customer.",
            from_column="customer_id",
            dtype=DataType.STRING,
        )
    ]

    ts_feature = TimestampFeature(from_column="order_created_at")

    # features transformations
    features = [
        # order_total_amount(),
        count_items_in_order(),
        avg_order_total_amount_from_last_1_month(),
        ratio_order_amount_and_items(),
        ratio_order_amount_and_average_ticket(),
    ]

    # joining all together
    feature_set = FeatureSet(
        name="orders_feature_master_table",
        # entity: to which "business context" this feature set belongs
        entity="orders_feature_master_table",
        description="Features describing events about ifood store.",
        keys=keys,
        timestamp=ts_feature,
        features=features,
    )
    return feature_set
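# A minimal, hypothetical sketch of one of the feature builders referenced in
# transformer() above. The real count_items_in_order() is project-specific and
# not included in this snippet; the version below only reuses the Feature and
# DataType API already shown in these examples, and the feature name and
# description are illustrative assumptions.
def count_items_in_order():
    # hypothetical: expose the order's item count as a typed feature
    return Feature(
        name="count_items_in_order",
        description="Total number of items in the order.",
        dtype=DataType.INTEGER,
    )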
def feature_set():
    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=SparkFunctionTransform(
                    functions=[
                        Function(functions.avg, DataType.FLOAT),
                        Function(functions.stddev_pop, DataType.DOUBLE),
                    ]
                ).with_window(
                    partition_by="id",
                    order_by=TIMESTAMP_COLUMN,
                    mode="fixed_windows",
                    window_definition=["2 minutes", "15 minutes"],
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    )
    return feature_set
def test_getters(self, feature_add, feature_divide, key_id, timestamp_c):
    # arrange
    name = "name"
    entity = "entity"
    description = "description"

    # act
    feature_set = FeatureSet(
        name,
        entity,
        description,
        [key_id],
        timestamp_c,
        [feature_add, feature_divide],
    )

    # assert
    assert name == feature_set.name
    assert entity == feature_set.entity
    assert description == feature_set.description
    assert [key_id] == feature_set.keys
    assert timestamp_c == feature_set.timestamp
    assert [feature_add, feature_divide] == feature_set.features
    assert "timestamp" == feature_set.timestamp_column
    assert ["id"] == feature_set.keys_columns
def __init__(self):
    super(FirstPipeline, self).__init__(
        source=Source(
            readers=[TableReader(id="t", database="db", table="table")],
            query=f"select * from t",  # noqa
        ),
        feature_set=FeatureSet(
            name="first",
            entity="entity",
            description="description",
            features=[
                Feature(name="feature1", description="test", dtype=DataType.FLOAT),
                Feature(
                    name="feature2",
                    description="another test",
                    dtype=DataType.STRING,
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="identifier",
                    dtype=DataType.BIGINT,
                )
            ],
            timestamp=TimestampFeature(),
        ),
        sink=Sink(
            writers=[HistoricalFeatureStoreWriter(), OnlineFeatureStoreWriter()]
        ),
    )
def apply_migration(
    self, feature_set: FeatureSet, writer: Writer, debug_mode: bool
) -> None:
    """Apply the migration in the respective database.

    Args:
        feature_set: the feature set.
        writer: the writer being used to load the feature set.
        debug_mode: if active, it brings up the queries generated.

    """
    logger.info(f"Migrating feature set: {feature_set.name}")

    table_name = (
        feature_set.name if not writer.write_to_entity else feature_set.entity
    )

    fs_schema = writer.db_config.translate(feature_set.get_schema())
    db_schema = self._get_schema(table_name, writer.database)

    queries = self.create_query(
        fs_schema, table_name, db_schema, writer.write_to_entity
    )

    if debug_mode:
        print(
            "#### DEBUG MODE ###\n"
            f"Feature set: {feature_set.name}\n"
            "Queries:\n"
            f"{queries}"
        )
    else:
        for q in queries:
            logger.info(f"Applying this query: {q} ...")
            self._client.sql(q)

        logger.info("Feature Set migration finished successfully.")

        # inform in drone console which feature set was migrated
        print(f"The {feature_set.name} feature set was migrated.")
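# A minimal usage sketch for apply_migration above, assuming a concrete
# migration class (called CassandraMigration here purely for illustration)
# exposes this method. The feature_set comes from one of the fixtures in these
# snippets; with debug_mode=True only the generated queries are printed and
# nothing is executed against the database.
migration = CassandraMigration()  # illustrative concrete migration class
migration.apply_migration(
    feature_set=feature_set,
    writer=OnlineFeatureStoreWriter(),
    debug_mode=True,
)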
def test_construct_invalid_df(
    self, key_id, timestamp_c, feature_add, feature_divide
):
    spark_client = Mock()

    # arrange
    feature_set = FeatureSet(
        "name",
        "entity",
        "description",
        [key_id],
        timestamp_c,
        [feature_add, feature_divide],
    )

    # act and assert
    with pytest.raises(ValueError):
        _ = feature_set.construct("not a dataframe", spark_client)
def test_pipeline_with_hooks(self, spark_session):
    # arrange
    hook1 = AddHook(value=1)

    spark_session.sql(
        "select 1 as id, timestamp('2020-01-01') as timestamp, 0 as feature"
    ).createOrReplaceTempView("test")

    target_df = spark_session.sql(
        "select 1 as id, timestamp('2020-01-01') as timestamp, 6 as feature, 2020 "
        "as year, 1 as month, 1 as day"
    )

    historical_writer = HistoricalFeatureStoreWriter(debug_mode=True)

    test_pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(
                    id="reader",
                    table="test",
                ).add_post_hook(hook1)
            ],
            query="select * from reader",
        ).add_post_hook(hook1),
        feature_set=FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature",
                    description="test",
                    transformation=SQLExpressionTransform(expression="feature + 1"),
                    dtype=DataType.INTEGER,
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(),
        )
        .add_pre_hook(hook1)
        .add_post_hook(hook1),
        sink=Sink(writers=[historical_writer]).add_pre_hook(hook1),
    )

    # act
    test_pipeline.run()
    output_df = spark_session.table("historical_feature_store__feature_set")

    # assert
    output_df.show()
    assert_dataframe_equality(output_df, target_df)
def test_construct_with_date_boundaries(
    self, feature_set_dates_dataframe, feature_set_dates_output_dataframe
):
    # given
    spark_client = SparkClient()

    # arrange
    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature",
                description="test",
                dtype=DataType.FLOAT,
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.INTEGER,
            )
        ],
        timestamp=TimestampFeature(),
    )

    output_df = (
        feature_set.construct(
            feature_set_dates_dataframe,
            client=spark_client,
            start_date="2016-04-11",
            end_date="2016-04-12",
        )
        .orderBy(feature_set.timestamp_column)
        .select(feature_set.columns)
    )

    target_df = feature_set_dates_output_dataframe.orderBy(
        feature_set.timestamp_column
    ).select(feature_set.columns)

    # assert
    assert_dataframe_equality(output_df, target_df)
def get_db_schema(self, feature_set: FeatureSet) -> List[Dict[Any, Any]]:
    """Get desired database schema.

    Args:
        feature_set: object processed with feature set metadata.

    Returns:
        Desired database schema.

    """
    db_schema = self.db_config.translate(feature_set.get_schema())
    return db_schema
def test_multiple_timestamps(self, feature_add, key_id, timestamp_c):
    # arrange
    name = "name"
    entity = "entity"
    description = "description"
    timestamp_c.get_output_columns = Mock(
        return_value=["timestamp1", "timestamp2"]
    )

    # act and assert
    with pytest.raises(ValueError):
        _ = FeatureSet(
            name, entity, description, [key_id], timestamp_c, [feature_add]
        )
def feature_set():
    key_features = [
        KeyFeature(name="id", description="Description", dtype=DataType.INTEGER)
    ]
    ts_feature = TimestampFeature(from_column=TIMESTAMP_COLUMN)
    features = [
        Feature(name="feature", description="Description", dtype=DataType.BIGINT),
    ]
    return FeatureSet(
        "feature_set",
        "entity",
        "description",
        keys=key_features,
        timestamp=ts_feature,
        features=features,
    )
def test_duplicate_features(self, feature_add, key_id, timestamp_c):
    # arrange
    name = "name"
    entity = "entity"
    description = "description"

    # act and assert
    with pytest.raises(KeyError):
        _ = FeatureSet(
            name,
            entity,
            description,
            [key_id],
            timestamp_c,
            [feature_add, feature_add],
        )
def feature_set_pipeline(
    spark_context,
    spark_session,
):
    feature_set_pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(
                    id="b_source",
                    table="b_table",
                ).with_incremental_strategy(
                    incremental_strategy=IncrementalStrategy(column="timestamp")
                ),
            ],
            query=f"select * from b_source ",  # noqa
        ),
        feature_set=FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.FLOAT),
                        ],
                    ).with_window(
                        partition_by="id",
                        order_by=TIMESTAMP_COLUMN,
                        mode="fixed_windows",
                        window_definition=["1 day"],
                    ),
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(),
        ),
        sink=Sink(writers=[HistoricalFeatureStoreWriter(debug_mode=True)]),
    )

    return feature_set_pipeline
def test_feature_without_datatype(self, key_id, timestamp_c, dataframe):
    spark_client = SparkClient()

    with pytest.raises(ValueError):
        FeatureSet(
            name="name",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=SQLExpressionTransform(expression="feature1 + a"),
                ),
            ],
            keys=[key_id],
            timestamp=timestamp_c,
        ).construct(dataframe, spark_client)
def test__get_features_columns(self):
    # arrange
    feature_1 = Feature("feature1", "description", DataType.FLOAT)
    feature_1.get_output_columns = Mock(return_value=["col_a", "col_b"])

    feature_2 = Feature("feature2", "description", DataType.FLOAT)
    feature_2.get_output_columns = Mock(return_value=["col_c"])

    feature_3 = Feature("feature3", "description", DataType.FLOAT)
    feature_3.get_output_columns = Mock(return_value=["col_d"])

    target_features_columns = ["col_a", "col_b", "col_c", "col_d"]

    # act
    result_features_columns = FeatureSet._get_features_columns(
        feature_1, feature_2, feature_3
    )

    # assert
    assert target_features_columns == result_features_columns
def test_feature_set_with_invalid_feature(self, key_id, timestamp_c, dataframe):
    spark_client = SparkClient()

    with pytest.raises(ValueError):
        FeatureSet(
            name="name",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=AggregatedTransform(
                        functions=[Function(F.avg, DataType.FLOAT)]
                    ),
                ),
            ],
            keys=[key_id],
            timestamp=timestamp_c,
        ).construct(dataframe, spark_client)
def feature_set():
    key_features = [
        KeyFeature(name="id", description="Description", dtype=DataType.INTEGER)
    ]
    ts_feature = TimestampFeature(from_column="timestamp")
    features = [
        Feature(name="feature", description="Description", dtype=DataType.FLOAT),
    ]
    return FeatureSet(
        "test_sink_feature_set",
        "test_sink_entity",
        "description",
        keys=key_features,
        timestamp=ts_feature,
        features=features,
    )
def test_feature_set_args(self):
    # arrange and act
    out_columns = [
        "user_id",
        "timestamp",
        "listing_page_viewed__rent_per_month__avg_over_7_days_fixed_windows",
        "listing_page_viewed__rent_per_month__avg_over_2_weeks_fixed_windows",
        "listing_page_viewed__rent_per_month__stddev_pop_over_7_days_fixed_windows",
        "listing_page_viewed__rent_per_month__"
        "stddev_pop_over_2_weeks_fixed_windows",  # noqa
    ]
    pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(id="source_a", database="db", table="table"),
                FileReader(id="source_b", path="path", format="parquet"),
            ],
            query="select a.*, b.specific_feature "
            "from source_a left join source_b on a.id=b.id",
        ),
        feature_set=FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            keys=[
                KeyFeature(
                    name="user_id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(from_column="ts"),
            features=[
                Feature(
                    name="listing_page_viewed__rent_per_month",
                    description="Average of something.",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(functions.avg, DataType.FLOAT),
                            Function(functions.stddev_pop, DataType.FLOAT),
                        ],
                    ).with_window(
                        partition_by="user_id",
                        order_by=TIMESTAMP_COLUMN,
                        window_definition=["7 days", "2 weeks"],
                        mode="fixed_windows",
                    ),
                ),
            ],
        ),
        sink=Sink(
            writers=[
                HistoricalFeatureStoreWriter(db_config=None),
                OnlineFeatureStoreWriter(db_config=None),
            ],
        ),
    )

    assert isinstance(pipeline.spark_client, SparkClient)

    assert len(pipeline.source.readers) == 2
    assert all(isinstance(reader, Reader) for reader in pipeline.source.readers)
    assert isinstance(pipeline.source.query, str)

    assert pipeline.feature_set.name == "feature_set"
    assert pipeline.feature_set.entity == "entity"
    assert pipeline.feature_set.description == "description"
    assert isinstance(pipeline.feature_set.timestamp, TimestampFeature)

    assert len(pipeline.feature_set.keys) == 1
    assert all(isinstance(k, KeyFeature) for k in pipeline.feature_set.keys)

    assert len(pipeline.feature_set.features) == 1
    assert all(
        isinstance(feature, Feature) for feature in pipeline.feature_set.features
    )
    assert pipeline.feature_set.columns == out_columns

    assert len(pipeline.sink.writers) == 2
    assert all(isinstance(writer, Writer) for writer in pipeline.sink.writers)
def test_feature_set_pipeline(
    self,
    mocked_df,
    spark_session,
    fixed_windows_output_feature_set_dataframe,
):
    # arrange
    table_reader_id = "a_source"
    table_reader_table = "table"
    table_reader_db = environment.get_variable("FEATURE_STORE_HISTORICAL_DATABASE")

    create_temp_view(dataframe=mocked_df, name=table_reader_id)
    create_db_and_table(
        spark=spark_session,
        table_reader_id=table_reader_id,
        table_reader_db=table_reader_db,
        table_reader_table=table_reader_table,
    )

    dbconfig = Mock()
    dbconfig.mode = "overwrite"
    dbconfig.format_ = "parquet"
    dbconfig.get_options = Mock(
        return_value={"path": "test_folder/historical/entity/feature_set"}
    )

    historical_writer = HistoricalFeatureStoreWriter(db_config=dbconfig)

    # act
    test_pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(
                    id=table_reader_id,
                    database=table_reader_db,
                    table=table_reader_table,
                ),
            ],
            query=f"select * from {table_reader_id} ",  # noqa
        ),
        feature_set=FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.FLOAT),
                        ],
                    ).with_window(
                        partition_by="id",
                        order_by=TIMESTAMP_COLUMN,
                        mode="fixed_windows",
                        window_definition=["2 minutes", "15 minutes"],
                    ),
                ),
                Feature(
                    name="divided_feature",
                    description="unit test",
                    dtype=DataType.FLOAT,
                    transformation=CustomTransform(
                        transformer=divide, column1="feature1", column2="feature2",
                    ),
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(),
        ),
        sink=Sink(writers=[historical_writer]),
    )
    test_pipeline.run()

    # assert
    path = dbconfig.get_options("historical/entity/feature_set").get("path")
    df = spark_session.read.parquet(path).orderBy(TIMESTAMP_COLUMN)

    target_df = fixed_windows_output_feature_set_dataframe.orderBy(
        test_pipeline.feature_set.timestamp_column
    )

    # assert
    assert_dataframe_equality(df, target_df)

    # tear down
    shutil.rmtree("test_folder")
def test_pipeline_interval_run(
    self, mocked_date_df, pipeline_interval_run_target_dfs, spark_session
):
    """Testing pipeline's idempotent interval run feature.

    Source data:
    +-------+---+-------------------+-------------------+
    |feature| id|                 ts|          timestamp|
    +-------+---+-------------------+-------------------+
    |    200|  1|2016-04-11 11:31:11|2016-04-11 11:31:11|
    |    300|  1|2016-04-12 11:44:12|2016-04-12 11:44:12|
    |    400|  1|2016-04-13 11:46:24|2016-04-13 11:46:24|
    |    500|  1|2016-04-14 12:03:21|2016-04-14 12:03:21|
    +-------+---+-------------------+-------------------+

    The test executes 3 runs for different time intervals. The input data has 4
    data points: 2016-04-11, 2016-04-12, 2016-04-13 and 2016-04-14. The following
    run specifications are:

    1)  Interval: from 2016-04-11 to 2016-04-13
        Target table result:
        +---+-------+---+-----+------+-------------------+----+
        |day|feature| id|month|run_id|          timestamp|year|
        +---+-------+---+-----+------+-------------------+----+
        | 11|    200|  1|    4|     1|2016-04-11 11:31:11|2016|
        | 12|    300|  1|    4|     1|2016-04-12 11:44:12|2016|
        | 13|    400|  1|    4|     1|2016-04-13 11:46:24|2016|
        +---+-------+---+-----+------+-------------------+----+

    2)  Interval: only 2016-04-14.
        Target table result:
        +---+-------+---+-----+------+-------------------+----+
        |day|feature| id|month|run_id|          timestamp|year|
        +---+-------+---+-----+------+-------------------+----+
        | 11|    200|  1|    4|     1|2016-04-11 11:31:11|2016|
        | 12|    300|  1|    4|     1|2016-04-12 11:44:12|2016|
        | 13|    400|  1|    4|     1|2016-04-13 11:46:24|2016|
        | 14|    500|  1|    4|     2|2016-04-14 12:03:21|2016|
        +---+-------+---+-----+------+-------------------+----+

    3)  Interval: only 2016-04-11.
        Target table result:
        +---+-------+---+-----+------+-------------------+----+
        |day|feature| id|month|run_id|          timestamp|year|
        +---+-------+---+-----+------+-------------------+----+
        | 11|    200|  1|    4|     3|2016-04-11 11:31:11|2016|
        | 12|    300|  1|    4|     1|2016-04-12 11:44:12|2016|
        | 13|    400|  1|    4|     1|2016-04-13 11:46:24|2016|
        | 14|    500|  1|    4|     2|2016-04-14 12:03:21|2016|
        +---+-------+---+-----+------+-------------------+----+

    """
    # arrange
    create_temp_view(dataframe=mocked_date_df, name="input_data")

    db = environment.get_variable("FEATURE_STORE_HISTORICAL_DATABASE")
    path = "test_folder/historical/entity/feature_set"

    spark_session.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
    spark_session.sql(f"create database if not exists {db}")
    spark_session.sql(
        f"create table if not exists {db}.feature_set_interval "
        f"(id int, timestamp timestamp, feature int, "
        f"run_id int, year int, month int, day int);"
    )

    dbconfig = MetastoreConfig()
    dbconfig.get_options = Mock(
        return_value={"mode": "overwrite", "format_": "parquet", "path": path}
    )

    historical_writer = HistoricalFeatureStoreWriter(
        db_config=dbconfig, interval_mode=True
    )

    first_run_hook = RunHook(id=1)
    second_run_hook = RunHook(id=2)
    third_run_hook = RunHook(id=3)

    (
        first_run_target_df,
        second_run_target_df,
        third_run_target_df,
    ) = pipeline_interval_run_target_dfs

    test_pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(
                    id="id",
                    table="input_data",
                ).with_incremental_strategy(IncrementalStrategy("ts")),
            ],
            query="select * from id ",
        ),
        feature_set=FeatureSet(
            name="feature_set_interval",
            entity="entity",
            description="",
            keys=[
                KeyFeature(
                    name="id",
                    description="",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(from_column="ts"),
            features=[
                Feature(name="feature", description="", dtype=DataType.INTEGER),
                Feature(name="run_id", description="", dtype=DataType.INTEGER),
            ],
        ),
        sink=Sink([historical_writer]),
    )

    # act and assert
    dbconfig.get_path_with_partitions = Mock(
        return_value=[
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=11",
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=12",
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=13",
        ]
    )
    test_pipeline.feature_set.add_pre_hook(first_run_hook)
    test_pipeline.run(end_date="2016-04-13", start_date="2016-04-11")
    first_run_output_df = spark_session.read.parquet(path)
    assert_dataframe_equality(first_run_output_df, first_run_target_df)

    dbconfig.get_path_with_partitions = Mock(
        return_value=[
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=14",
        ]
    )
    test_pipeline.feature_set.add_pre_hook(second_run_hook)
    test_pipeline.run_for_date("2016-04-14")
    second_run_output_df = spark_session.read.parquet(path)
    assert_dataframe_equality(second_run_output_df, second_run_target_df)

    dbconfig.get_path_with_partitions = Mock(
        return_value=[
            "test_folder/historical/entity/feature_set/year=2016/month=4/day=11",
        ]
    )
    test_pipeline.feature_set.add_pre_hook(third_run_hook)
    test_pipeline.run_for_date("2016-04-11")
    third_run_output_df = spark_session.read.parquet(path)
    assert_dataframe_equality(third_run_output_df, third_run_target_df)

    # tear down
    shutil.rmtree("test_folder")
def __init__(self):
    super(AwesomeDatasetPipeline, self).__init__(
        source=Source(
            readers=[
                FileReader(
                    id="order_events",
                    path="data/order_events/input.csv",
                    format="csv",
                    format_options={"header": True},
                ),
                FileReader(
                    id="user_chargebacks",
                    path="data/feature_store/historical/user/user_chargebacks",
                    format="parquet",
                ),
                FileReader(
                    id="user_orders",
                    path="data/feature_store/historical/user/user_orders",
                    format="parquet",
                ),
            ],
            query="""
with feature_sets_merge as(
    select
        user_orders.cpf,
        user_orders.timestamp,
        user_chargebacks.timestamp as chargeback_timestamp,
        cpf_orders__count_over_3_days_rolling_windows,
        cpf_orders__count_over_7_days_rolling_windows,
        cpf_orders__count_over_30_days_rolling_windows,
        cpf_chargebacks__count_over_3_days_rolling_windows,
        cpf_chargebacks__count_over_7_days_rolling_windows,
        cpf_chargebacks__count_over_30_days_rolling_windows,
        row_number() over (
            partition by (user_orders.cpf, user_orders.timestamp)
            order by user_chargebacks.timestamp desc
        ) as rn
    from
        user_orders
        left join user_chargebacks
            on user_orders.cpf = user_chargebacks.cpf
            and user_orders.timestamp >= user_chargebacks.timestamp
),
feature_sets_rn_filter as(
    select *
    from feature_sets_merge
    where rn = 1
),
orders_with_feature_sets as(
    select
        order_events.order_id,
        timestamp(order_events.order_timestamp) as timestamp,
        timestamp(order_events.chargeback_timestamp) as chargeback_timestamp,
        order_events.cpf,
        feature_sets_rn_filter.cpf_orders__count_over_3_days_rolling_windows,
        feature_sets_rn_filter.cpf_orders__count_over_7_days_rolling_windows,
        feature_sets_rn_filter.cpf_orders__count_over_30_days_rolling_windows,
        feature_sets_rn_filter.cpf_chargebacks__count_over_3_days_rolling_windows,
        feature_sets_rn_filter.cpf_chargebacks__count_over_7_days_rolling_windows,
        feature_sets_rn_filter.cpf_chargebacks__count_over_30_days_rolling_windows,
        row_number() over (
            partition by (order_events.cpf, order_events.order_timestamp)
            order by feature_sets_rn_filter.timestamp desc
        ) as rn
    from
        order_events
        join feature_sets_rn_filter
            on order_events.cpf = feature_sets_rn_filter.cpf
            and timestamp(order_events.order_timestamp) >=
                feature_sets_rn_filter.timestamp
)
select
    order_id,
    timestamp,
    chargeback_timestamp,
    cpf,
    cpf_orders__count_over_3_days_rolling_windows,
    cpf_orders__count_over_7_days_rolling_windows,
    cpf_orders__count_over_30_days_rolling_windows,
    coalesce(
        cpf_chargebacks__count_over_3_days_rolling_windows, 0
    ) as cpf_chargebacks__count_over_3_days_rolling_windows,
    coalesce(
        cpf_chargebacks__count_over_7_days_rolling_windows, 0
    ) as cpf_chargebacks__count_over_7_days_rolling_windows,
    coalesce(
        cpf_chargebacks__count_over_30_days_rolling_windows, 0
    ) as cpf_chargebacks__count_over_30_days_rolling_windows
from orders_with_feature_sets
where rn = 1
""",
        ),
        feature_set=FeatureSet(
            name="awesome_dataset",
            entity="user",
            description="Dataset enriching orders events with aggregated features "
            "on total of orders and chargebacks by user.",
            keys=[
                KeyFeature(
                    name="order_id",
                    description="Orders unique identifier.",
                    dtype=DataType.STRING,
                )
            ],
            timestamp=TimestampFeature(),
            features=[
                Feature(
                    name="chargeback_timestamp",
                    description="Timestamp for the order chargeback, when present.",
                    dtype=DataType.TIMESTAMP,
                ),
                Feature(
                    name="cpf",
                    description="User unique identifier, user entity key.",
                    dtype=DataType.STRING,
                ),
                Feature(
                    name="cpf_orders__count_over_3_days_rolling_windows",
                    description="Count of orders over 3 days rolling windows group "
                    "by user (identified by CPF)",
                    dtype=DataType.INTEGER,
                ),
                Feature(
                    name="cpf_orders__count_over_7_days_rolling_windows",
                    description="Count of orders over 7 days rolling windows group "
                    "by user (identified by CPF)",
                    dtype=DataType.INTEGER,
                ),
                Feature(
                    name="cpf_orders__count_over_30_days_rolling_windows",
                    description="Count of orders over 30 days rolling windows group"
                    " by user (identified by CPF)",
                    dtype=DataType.INTEGER,
                ),
                Feature(
                    name="cpf_chargebacks__count_over_3_days_rolling_windows",
                    description="Count of chargebacks over 3 days rolling windows "
                    "group by user (identified by CPF)",
                    dtype=DataType.INTEGER,
                ),
                Feature(
                    name="cpf_chargebacks__count_over_7_days_rolling_windows",
                    description="Count of chargebacks over 7 days rolling windows "
                    "group by user (identified by CPF)",
                    dtype=DataType.INTEGER,
                ),
                Feature(
                    name="cpf_chargebacks__count_over_30_days_rolling_windows",
                    description="Count of chargebacks over 30 days rolling windows "
                    "group by user (identified by CPF)",
                    dtype=DataType.INTEGER,
                ),
            ],
        ),
        sink=Sink(writers=[DatasetWriter()]),
    )
def test_cannot_instantiate(
    self, name, entity, description, keys, timestamp, features
):
    # act and assert
    with pytest.raises(ValueError):
        FeatureSet(name, entity, description, keys, timestamp, features)