示例#1
0
    def test_output_columns(self):
        # arrange
        h3_feature = Feature(
            name="new_feature",
            description="unit test",
            dtype=DataType.STRING,
            transformation=H3HashTransform(
                h3_resolutions=[6, 7, 8, 9, 10, 11, 12],
                lat_column="lat",
                lng_column="lng",
            ),
        )
        target_columns = [
            "lat_lng__h3_hash__6",
            "lat_lng__h3_hash__7",
            "lat_lng__h3_hash__8",
            "lat_lng__h3_hash__9",
            "lat_lng__h3_hash__10",
            "lat_lng__h3_hash__11",
            "lat_lng__h3_hash__12",
        ]

        # act
        output_columns = h3_feature.get_output_columns()

        # assert
        assert sorted(output_columns) == sorted(target_columns)
示例#2
0
def agg_feature_set():
    return AggregatedFeatureSet(
        name="name",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.avg, DataType.DOUBLE)], ),
            ),
            Feature(
                name="feature2",
                description="test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.avg, DataType.DOUBLE)]),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="description",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    )
示例#3
0
    def test_construct(
        self, feature_set_dataframe, fixed_windows_output_feature_set_dataframe
    ):
        # given

        spark_client = SparkClient()

        # arrange

        feature_set = FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.FLOAT),
                        ]
                    ).with_window(
                        partition_by="id",
                        order_by=TIMESTAMP_COLUMN,
                        mode="fixed_windows",
                        window_definition=["2 minutes", "15 minutes"],
                    ),
                ),
                Feature(
                    name="divided_feature",
                    description="unit test",
                    dtype=DataType.FLOAT,
                    transformation=CustomTransform(
                        transformer=divide, column1="feature1", column2="feature2",
                    ),
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(),
        )

        output_df = (
            feature_set.construct(feature_set_dataframe, client=spark_client)
            .orderBy(feature_set.timestamp_column)
            .select(feature_set.columns)
        )

        target_df = fixed_windows_output_feature_set_dataframe.orderBy(
            feature_set.timestamp_column
        ).select(feature_set.columns)

        # assert
        assert_dataframe_equality(output_df, target_df)
示例#4
0
def test_feature_set():
    return AggregatedFeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=AggregatedTransform(functions=[
                    Function(functions.avg, DataType.DOUBLE),
                    Function(functions.stddev_pop, DataType.DOUBLE),
                ]),
            ),
            Feature(
                name="feature2",
                description="test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.count, DataType.INTEGER)]),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    ).with_windows(definitions=["1 week", "2 days"])
示例#5
0
 def __init__(self):
     super(FirstPipeline, self).__init__(
         source=Source(
             readers=[TableReader(id="t", database="db", table="table",)],
             query=f"select * from t",  # noqa
         ),
         feature_set=FeatureSet(
             name="first",
             entity="entity",
             description="description",
             features=[
                 Feature(name="feature1", description="test", dtype=DataType.FLOAT,),
                 Feature(
                     name="feature2",
                     description="another test",
                     dtype=DataType.STRING,
                 ),
             ],
             keys=[
                 KeyFeature(
                     name="id", description="identifier", dtype=DataType.BIGINT,
                 )
             ],
             timestamp=TimestampFeature(),
         ),
         sink=Sink(
             writers=[HistoricalFeatureStoreWriter(), OnlineFeatureStoreWriter()]
         ),
     )
    def test_construct_rolling_windows_with_end_date(
        self,
        feature_set_dataframe,
        rolling_windows_output_feature_set_dataframe_base_date,
    ):
        # given

        spark_client = SparkClient()

        # arrange

        feature_set = AggregatedFeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=AggregatedTransform(
                        functions=[
                            Function(F.avg, DataType.DOUBLE),
                            Function(F.stddev_pop, DataType.DOUBLE),
                        ],
                    ),
                ),
                Feature(
                    name="feature2",
                    description="test",
                    transformation=AggregatedTransform(
                        functions=[
                            Function(F.avg, DataType.DOUBLE),
                            Function(F.stddev_pop, DataType.DOUBLE),
                        ],
                    ),
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(),
        ).with_windows(definitions=["1 day", "1 week"])

        # act
        output_df = feature_set.construct(
            feature_set_dataframe, client=spark_client, end_date="2016-04-18"
        ).orderBy("timestamp")

        target_df = rolling_windows_output_feature_set_dataframe_base_date.orderBy(
            feature_set.timestamp_column
        ).select(feature_set.columns)

        # assert
        assert_dataframe_equality(output_df, target_df)
示例#7
0
    def test_feature_transform_with_dtype(self, feature_set_dataframe):

        test_feature = Feature(
            name="feature",
            description="unit test",
            dtype=DataType.TIMESTAMP,
        )
        df = test_feature.transform(feature_set_dataframe)

        assert dict(df.dtypes).get("feature") == "timestamp"
示例#8
0
    def test_feature_get_output_columns_without_transformations(self):

        test_feature = Feature(
            name="feature",
            from_column="origin",
            description="unit test",
            dtype=DataType.BIGINT,
        )

        assert test_feature.get_output_columns() == [test_feature.name]
示例#9
0
    def test_feature_transform_invalid_output(self, feature_set_dataframe):
        with pytest.raises(Exception):
            test_feature = Feature(
                name="feature1_plus_a",
                description="unit test",
                dtype=DataType.FLOAT,
                transformation=SQLExpressionTransform(
                    expression="feature2 + a"),
            )

            test_feature.transform(feature_set_dataframe).collect()
    def test_feature_transform(self, feature_set_dataframe, target_df_spark):
        test_feature = Feature(
            name="feature",
            description="unit test",
            transformation=SparkFunctionTransform(
                functions=[Function(functions.cos, DataType.DOUBLE)], ),
            from_column="feature1",
        )

        output_df = test_feature.transform(feature_set_dataframe)

        assert_dataframe_equality(output_df, target_df_spark)
示例#11
0
    def test_feature_transform_no_from_column(self, feature_set_dataframe):

        test_feature = Feature(
            name="feature",
            description="unit test feature without transformation",
            dtype=DataType.BIGINT,
        )

        df = test_feature.transform(feature_set_dataframe)

        assert all([
            a == b for a, b in zip(df.columns, feature_set_dataframe.columns)
        ])
    def test_feature_transform(self, feature_set_dataframe, target_df_agg):
        test_feature = Feature(
            name="feature1",
            description="unit test",
            transformation=AggregatedTransform(functions=[
                Function(functions.avg, DataType.DOUBLE),
                Function(functions.stddev_pop, DataType.DOUBLE),
            ]),
        )

        # aggregated feature transform won't run transformations
        # and depends on the feature set
        with pytest.raises(NotImplementedError):
            _ = test_feature.transform(feature_set_dataframe)
示例#13
0
    def test_feature_transform_output(self, feature_set_dataframe):
        test_feature = Feature(
            name="feature1_over_feature2",
            description="unit test",
            dtype=DataType.FLOAT,
            transformation=SQLExpressionTransform(
                expression="feature1/feature2"),
        )

        df = test_feature.transform(feature_set_dataframe).collect()

        assert df[0]["feature1_over_feature2"] == 1
        assert df[1]["feature1_over_feature2"] == 1
        assert df[2]["feature1_over_feature2"] == 1
        assert df[3]["feature1_over_feature2"] == 1
示例#14
0
    def test_output_columns(self, feature_set_dataframe):

        test_feature = Feature(
            name="feature",
            description="unit test",
            dtype=DataType.BIGINT,
            transformation=CustomTransform(
                transformer=divide, column1="feature1", column2="feature2",
            ),
        )

        df_columns = test_feature.get_output_columns()

        assert isinstance(df_columns, list)
        assert df_columns == ["feature"]
    def test_h3_feature_set(self, h3_input_df, h3_target_df):
        spark_client = SparkClient()

        feature_set = AggregatedFeatureSet(
            name="h3_test",
            entity="h3geolocation",
            description="Test",
            keys=[
                KeyFeature(
                    name="h3_id",
                    description="The h3 hash ID",
                    dtype=DataType.DOUBLE,
                    transformation=H3HashTransform(
                        h3_resolutions=[6, 7, 8, 9, 10, 11, 12],
                        lat_column="lat",
                        lng_column="lng",
                    ).with_stack(),
                )
            ],
            timestamp=TimestampFeature(),
            features=[
                Feature(
                    name="house_id",
                    description="Count of house ids over a day.",
                    transformation=AggregatedTransform(
                        functions=[Function(F.count, DataType.BIGINT)]),
                ),
            ],
        ).with_windows(definitions=["1 day"])

        output_df = feature_set.construct(h3_input_df,
                                          client=spark_client,
                                          end_date="2016-04-14")

        assert_dataframe_equality(output_df, h3_target_df)
示例#16
0
    def test_feature_get_output_columns_with_transformations(
            self, feature_set_dataframe):

        some_transformation = Mock()
        some_transformation.output_columns = feature_set_dataframe.columns

        test_feature = Feature(
            name="feature",
            from_column="origin",
            description="unit test",
            transformation=some_transformation,
            dtype=DataType.BIGINT,
        )

        assert test_feature.get_output_columns(
        ) == feature_set_dataframe.columns
示例#17
0
    def test_overwriting_column(self, spark_session):
        # arrange
        input_df = spark_session.sql("select 0 as feature")
        feature_with_same_name = Feature(
            name="feature",
            description="description",
            dtype=DataType.INTEGER,
            transformation=SQLExpressionTransform(expression="feature + 1"),
        )
        target_df = spark_session.sql("select 1 as feature")

        # act
        output_df = feature_with_same_name.transform(input_df)

        # assert
        assert_dataframe_equality(output_df, target_df)
    def test_aggregations_with_filter_expression(self, spark_context):
        # arrange
        test_feature = Feature(
            name="feature_with_filter",
            description="unit test",
            transformation=AggregatedTransform(
                functions=[
                    Function(functions.avg, DataType.DOUBLE),
                    Function(functions.min, DataType.DOUBLE),
                    Function(functions.max, DataType.DOUBLE),
                ],
                filter_expression="type = 'a'",
            ),
            from_column="feature",
        )
        target_aggregations = [
            agg(
                functions.when(functions.expr("type = 'a'"),
                               functions.col("feature")))
            for agg in [functions.avg, functions.min, functions.max]
        ]

        # act
        output_aggregations = [
            agg.function for agg in test_feature.transformation.aggregations
        ]

        # assert

        # cast to string to compare the columns definitions because direct column
        # comparison was not working
        assert str(target_aggregations) == str(output_aggregations)
示例#19
0
    def test_output_columns(self):
        test_feature = Feature(
            name="feature1_over_feature2",
            description="unit test",
            dtype=DataType.FLOAT,
            transformation=SQLExpressionTransform(
                expression="feature1/feature2"),
        )

        df_columns = test_feature.get_output_columns()

        assert all(
            [a == b for a, b in zip(
                df_columns,
                ["feature1_over_feature2"],
            )])
示例#20
0
    def test_custom_transform_output(self, feature_set_dataframe):
        test_feature = Feature(
            name="feature",
            description="unit test",
            dtype=DataType.BIGINT,
            transformation=CustomTransform(
                transformer=divide, column1="feature1", column2="feature2",
            ),
        )

        df = test_feature.transform(feature_set_dataframe).collect()

        assert df[0]["feature"] == 1
        assert df[1]["feature"] == 1
        assert df[2]["feature"] == 1
        assert df[3]["feature"] == 1
def ratio_order_amount_and_items():
    return Feature(name="ratio_order_amount_by_items_val",
                   description="ratio order amount by items count",
                   dtype=DataType.DOUBLE,
                   transformation=CustomTransform(transformer=divide,
                                                  column1="order_total_amount",
                                                  column2="items_qtd"))
示例#22
0
    def test_columns_not_in_dataframe(self, spark_context, spark_session):
        # arrange
        input_df = create_df_from_collection(self.input_data, spark_context,
                                             spark_session)

        feature = Feature(
            name="id",
            description="stack transformation",
            dtype=DataType.STRING,
            transformation=StackTransform("id_c", "id_d"),
        )

        # act and assert
        with pytest.raises(ValueError,
                           match="Columns not found, columns in df: "):
            feature.transform(input_df)
示例#23
0
    def test_feature_transform_with_from_column(self, feature_set_dataframe):

        test_feature = Feature(
            name="new_feature",
            from_column="feature",
            description="unit test",
            dtype=DataType.BIGINT,
        )
        df = test_feature.transform(feature_set_dataframe)

        assert all([
            a == b for a, b in zip(
                sorted(df.columns),
                sorted(["new_feature", "id", TIMESTAMP_COLUMN, "feature"]),
            )
        ])
示例#24
0
    def test_get_schema(self):
        expected_schema = [
            {"column_name": "id", "type": LongType(), "primary_key": True},
            {"column_name": "timestamp", "type": TimestampType(), "primary_key": False},
            {
                "column_name": "feature1__avg_over_2_minutes_fixed_windows",
                "type": FloatType(),
                "primary_key": False,
            },
            {
                "column_name": "feature1__avg_over_15_minutes_fixed_windows",
                "type": FloatType(),
                "primary_key": False,
            },
            {
                "column_name": "feature1__stddev_pop_over_2_minutes_fixed_windows",
                "type": DoubleType(),
                "primary_key": False,
            },
            {
                "column_name": "feature1__stddev_pop_over_15_minutes_fixed_windows",
                "type": DoubleType(),
                "primary_key": False,
            },
        ]

        feature_set = FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.DOUBLE),
                        ]
                    ).with_window(
                        partition_by="id",
                        order_by=TIMESTAMP_COLUMN,
                        mode="fixed_windows",
                        window_definition=["2 minutes", "15 minutes"],
                    ),
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.BIGINT,
                )
            ],
            timestamp=TimestampFeature(),
        )

        schema = feature_set.get_schema()

        assert schema == expected_schema
    def test_feature_set_start_date(
        self,
        timestamp_c,
        feature_set_with_distinct_dataframe,
    ):
        fs = AggregatedFeatureSet(
            name="name",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature",
                    description="test",
                    transformation=AggregatedTransform(
                        functions=[Function(functions.sum, DataType.INTEGER)]),
                ),
            ],
            keys=[
                KeyFeature(name="h3",
                           description="test",
                           dtype=DataType.STRING)
            ],
            timestamp=timestamp_c,
        ).with_windows(["10 days", "3 weeks", "90 days"])

        # assert
        start_date = fs.define_start_date("2016-04-14")

        assert start_date == "2016-01-14"
示例#26
0
def feature_set():
    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=SparkFunctionTransform(functions=[
                    Function(functions.avg, DataType.FLOAT),
                    Function(functions.stddev_pop, DataType.DOUBLE),
                ]).with_window(
                    partition_by="id",
                    order_by=TIMESTAMP_COLUMN,
                    mode="fixed_windows",
                    window_definition=["2 minutes", "15 minutes"],
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    )

    return feature_set
    def test_feature_transform_with_distinct_empty_subset(
            self, timestamp_c, feature_set_with_distinct_dataframe):
        spark_client = SparkClient()

        with pytest.raises(ValueError,
                           match="The distinct subset param can't be empty."):
            AggregatedFeatureSet(
                name="name",
                entity="entity",
                description="description",
                features=[
                    Feature(
                        name="feature",
                        description="test",
                        transformation=AggregatedTransform(functions=[
                            Function(functions.sum, DataType.INTEGER)
                        ]),
                    ),
                ],
                keys=[
                    KeyFeature(name="h3",
                               description="test",
                               dtype=DataType.STRING)
                ],
                timestamp=timestamp_c,
            ).with_windows(["3 days"]).with_distinct(
                subset=[],
                keep="first").construct(feature_set_with_distinct_dataframe,
                                        spark_client,
                                        end_date="2020-01-10")
    def test_feature_transform_with_distinct(
        self,
        timestamp_c,
        feature_set_with_distinct_dataframe,
        target_with_distinct_dataframe,
    ):
        spark_client = SparkClient()

        fs = (AggregatedFeatureSet(
            name="name",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature",
                    description="test",
                    transformation=AggregatedTransform(
                        functions=[Function(functions.sum, DataType.INTEGER)]),
                ),
            ],
            keys=[
                KeyFeature(name="h3",
                           description="test",
                           dtype=DataType.STRING)
            ],
            timestamp=timestamp_c,
        ).with_windows(["3 days"]).with_distinct(subset=["id"], keep="last"))

        # assert
        output_df = fs.construct(feature_set_with_distinct_dataframe,
                                 spark_client,
                                 end_date="2020-01-10")
        assert_dataframe_equality(output_df, target_with_distinct_dataframe)
    def test_feature_set_with_invalid_feature(self, key_id, timestamp_c,
                                              dataframe):
        spark_client = SparkClient()

        with pytest.raises(ValueError):
            AggregatedFeatureSet(
                name="name",
                entity="entity",
                description="description",
                features=[
                    Feature(
                        name="feature1",
                        description="test",
                        transformation=SparkFunctionTransform(functions=[
                            Function(functions.avg, DataType.FLOAT)
                        ], ).with_window(
                            partition_by="id",
                            mode="row_windows",
                            window_definition=["2 events"],
                        ),
                    ),
                ],
                keys=[key_id],
                timestamp=timestamp_c,
            ).construct(dataframe, spark_client)
    def test_construct_without_window(
        self,
        feature_set_dataframe,
        target_df_without_window,
    ):
        # given

        spark_client = SparkClient()

        # arrange

        feature_set = AggregatedFeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    dtype=DataType.DOUBLE,
                    transformation=AggregatedTransform(
                        functions=[Function(F.avg, DataType.DOUBLE)]),
                ),
                Feature(
                    name="feature2",
                    description="test",
                    dtype=DataType.FLOAT,
                    transformation=AggregatedTransform(
                        functions=[Function(F.count, DataType.BIGINT)]),
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(from_column="fixed_ts"),
        )

        # act
        output_df = feature_set.construct(feature_set_dataframe,
                                          client=spark_client)

        # assert
        assert_dataframe_equality(output_df, target_df_without_window)