Example #1: reading a CSV file with an explicit schema and the header option
    def test_csv_file_with_schema_and_header(self):
        # given
        spark_client = SparkClient()
        schema_csv = StructType(
            [
                StructField("A", LongType()),
                StructField("B", DoubleType()),
                StructField("C", StringType()),
            ]
        )

        file = "tests/unit/butterfree/extract/readers/file-reader-test.csv"

        # when
        file_reader = FileReader(
            id="id",
            path=file,
            format="csv",
            schema=schema_csv,
            format_options={"header": True},
        )
        df = file_reader.consume(spark_client)

        # assert
        assert schema_csv == df.schema
        assert df.columns == ["A", "B", "C"]
        for value in range(3):
            assert df.first()[value] != ["A", "B", "C"][value]
Example #2: consuming a streaming JSON source without a schema (the schema is inferred from a batch read first)
    def test_consume_with_stream_without_schema(self, spark_client, target_df):
        # arrange
        path = "path/to/file.json"
        format = "json"
        schema = None
        format_options = None
        stream = True
        options = dict({"path": path})

        spark_client.read.return_value = target_df
        file_reader = FileReader(
            "test", path, format, schema, format_options, stream=stream
        )

        # act
        output_df = file_reader.consume(spark_client)

        # assert
        # first read call infers the schema from a batch read
        spark_client.read.assert_any_call(format=format, options=options)
        # second read call performs the stream read with the inferred schema
        spark_client.read.assert_called_with(
            format=format, options=options, schema=output_df.schema, stream=stream
        )
        assert target_df.collect() == output_df.collect()
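
A minimal sketch of the fixtures this style of test assumes (names taken from the test signature; the bodies are illustrative, not the project's actual conftest): spark_client is a unittest.mock.Mock whose read() return value is stubbed, and target_df can be any small DataFrame built with a spark_session fixture provided by the test setup.

    from unittest.mock import Mock

    import pytest


    @pytest.fixture()
    def spark_client():
        # stand-in for butterfree's SparkClient; only .read is exercised here
        return Mock()


    @pytest.fixture()
    def target_df(spark_session):
        # any small DataFrame works; the tests only compare collected rows
        return spark_session.createDataFrame([(1, "a"), (2, "b")], ["id", "feature"])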
Example #3: applying a filter pre-processing transformation
    def test_filter(self, feature_set_dataframe, spark_context, spark_session):
        # given
        file_reader = FileReader("test", "path/to/file", "format")

        file_reader.with_(
            transformer=filter,
            condition="test not in ('fail') and feature in (110, 120)",
        )

        # when
        result_df = file_reader._apply_transformations(feature_set_dataframe)

        target_data = [
            {
                "id": 1,
                TIMESTAMP_COLUMN: 1,
                "feature": 110,
                "test": "pass"
            },
            {
                "id": 1,
                TIMESTAMP_COLUMN: 2,
                "feature": 120,
                "test": "pass"
            },
        ]
        target_df = spark_session.read.json(
            spark_context.parallelize(target_data, 1))

        # then
        assert result_df.collect() == target_df.collect()
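
The feature_set_dataframe fixture is not shown; a hypothetical input consistent with the filter condition and the expected rows above could look like this (values are illustrative only):

    import pytest


    @pytest.fixture()
    def feature_set_dataframe(spark_context, spark_session):
        data = [
            {"id": 1, TIMESTAMP_COLUMN: 1, "feature": 110, "test": "pass"},
            {"id": 1, TIMESTAMP_COLUMN: 2, "feature": 120, "test": "pass"},
            # rows below should be dropped by the filter condition
            {"id": 1, TIMESTAMP_COLUMN: 3, "feature": 130, "test": "pass"},
            {"id": 1, TIMESTAMP_COLUMN: 4, "feature": 110, "test": "fail"},
        ]
        return spark_session.read.json(spark_context.parallelize(data, 1))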
Example #4: building readers with different incremental strategies
    def test_build_with_incremental_strategy(
        self, incremental_source_df, spark_client, spark_session
    ):
        # arrange
        readers = [
            # directly from column
            FileReader(
                id="test_1", path="path/to/file", format="format"
            ).with_incremental_strategy(
                incremental_strategy=IncrementalStrategy(column="date")
            ),
            # from milliseconds
            FileReader(
                id="test_2", path="path/to/file", format="format"
            ).with_incremental_strategy(
                incremental_strategy=IncrementalStrategy().from_milliseconds(
                    column_name="milliseconds"
                )
            ),
            # from str
            FileReader(
                id="test_3", path="path/to/file", format="format"
            ).with_incremental_strategy(
                incremental_strategy=IncrementalStrategy().from_string(
                    column_name="date_str", mask="dd/MM/yyyy"
                )
            ),
            # from year, month, day partitions
            FileReader(
                id="test_4", path="path/to/file", format="format"
            ).with_incremental_strategy(
                incremental_strategy=(
                    IncrementalStrategy().from_year_month_day_partitions()
                )
            ),
        ]

        spark_client.read.return_value = incremental_source_df
        target_df = incremental_source_df.where(
            "date >= date('2020-07-29') and date <= date('2020-07-31')"
        )

        # act
        for reader in readers:
            reader.build(
                client=spark_client, start_date="2020-07-29", end_date="2020-07-31"
            )

        output_dfs = [
            spark_session.table(f"test_{i + 1}") for i, _ in enumerate(readers)
        ]

        # assert
        for output_df in output_dfs:
            assert_dataframe_equality(output_df=output_df, target_df=target_df)
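
The incremental_source_df fixture is not shown; the strategies above imply it carries date, milliseconds, date_str and year/month/day columns. A rough, purely illustrative sketch:

    import pytest
    from pyspark.sql import functions as F


    @pytest.fixture()
    def incremental_source_df(spark_session):
        # one row inside the 2020-07-29 / 2020-07-31 window, two outside it
        data = [
            {"id": 1, "date_str": "28/07/2020", "milliseconds": 1595894400000,
             "year": 2020, "month": 7, "day": 28},
            {"id": 2, "date_str": "30/07/2020", "milliseconds": 1596067200000,
             "year": 2020, "month": 7, "day": 30},
            {"id": 3, "date_str": "01/08/2020", "milliseconds": 1596240000000,
             "year": 2020, "month": 8, "day": 1},
        ]
        return spark_session.createDataFrame(data).withColumn(
            "date", F.to_date(F.col("date_str"), "dd/MM/yyyy")
        )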
Example #5: building a reader and querying its registered temporary view
    def test_build(self, target_df, spark_client, spark_session):
        # arrange
        file_reader = FileReader("test", "path/to/file", "format")
        spark_client.read.return_value = target_df

        # act
        file_reader.build(spark_client)
        result_df = spark_session.sql("select * from test")

        # assert
        assert target_df.collect() == result_df.collect()
Example #6: registering transformations with with_()
    def test_with_(self, transformations, spark_client):
        # arrange
        file_reader = FileReader("test", "path/to/file", "format")

        # act
        for transformation in transformations:
            file_reader.with_(
                transformation["transformer"],
                *transformation["args"],
                **transformation["kwargs"],
            )

        # assert
        assert file_reader.transformations == transformations
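
The transformations fixture is not shown; given how with_() is called and compared, each entry must be a dict holding the transformer callable plus its positional and keyword arguments. A sketch (the filter import path is assumed from the other examples on this page):

    import pytest

    from butterfree.extract.pre_processing import filter


    @pytest.fixture()
    def transformations():
        return [
            {
                "transformer": filter,
                "args": (),
                "kwargs": {"condition": "test not in ('fail')"},
            },
        ]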
Example #7: building a reader with column selection and renaming
    def test_build_with_columns(self, target_df, column_target_df,
                                spark_client, spark_session):
        # arrange
        file_reader = FileReader("test", "path/to/file", "format")
        spark_client.read.return_value = target_df

        # act
        file_reader.build(
            client=spark_client,
            columns=[("col1", "new_col1"), ("col2", "new_col2")],
        )
        result_df = spark_session.sql("select * from test")

        # assert
        assert column_target_df.collect() == result_df.collect()
Example #8: consuming a file and asserting the underlying read call arguments
    def test_consume(
        self, path, format, schema, format_options, spark_client, target_df
    ):
        # arrange
        spark_client.read.return_value = target_df
        file_reader = FileReader("test", path, format, schema, format_options)

        # act
        output_df = file_reader.consume(spark_client)
        options = dict({"path": path}, **format_options if format_options else {})

        # assert
        spark_client.read.assert_called_once_with(
            format=format, options=options, schema=schema, stream=False
        )
        assert target_df.collect() == output_df.collect()
Example #9: a feature set pipeline aggregating user chargebacks over rolling windows
 def __init__(self):
     super(UserChargebacksPipeline, self).__init__(
         source=Source(
             readers=[
                 FileReader(
                     id="chargeback_events",
                     path="data/order_events/input.csv",
                     format="csv",
                     format_options={"header": True},
                 )
             ],
             query=("""
                 select
                     cpf,
                     timestamp(chargeback_timestamp) as timestamp,
                     order_id
                 from
                     chargeback_events
                 where
                     chargeback_timestamp is not null
                 """),
         ),
         feature_set=AggregatedFeatureSet(
             name="user_chargebacks",
             entity="user",
             description="Aggregates the total of chargebacks from users in "
             "different time windows.",
             keys=[
                 KeyFeature(
                     name="cpf",
                     description="User unique identifier, entity key.",
                     dtype=DataType.STRING,
                 )
             ],
             timestamp=TimestampFeature(),
             features=[
                 Feature(
                     name="cpf_chargebacks",
                     description="Total of chargebacks registered on user's CPF",
                     transformation=AggregatedTransform(
                         functions=[Function(functions.count, DataType.INTEGER)]
                     ),
                     from_column="order_id",
                 ),
             ],
         ).with_windows(
             definitions=["3 days", "7 days", "30 days"]
         ).add_post_hook(ZeroFillHook()),
         sink=Sink(writers=[
             LocalHistoricalFSWriter(),
             OnlineFeatureStoreWriter(
                 interval_mode=True,
                 check_schema_hook=NotCheckSchemaHook(),
                 debug_mode=True,
             ),
         ]),
     )
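
Once defined, the pipeline can be executed like any Butterfree FeatureSetPipeline; a minimal sketch, assuming run() with its default arguments:

    if __name__ == "__main__":
        UserChargebacksPipeline().run()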
Example #10: applying a pivot pre-processing transformation
    def test_apply_pivot_transformation(self, input_df, pivot_df):
        # arrange
        file_reader = FileReader("test", "path/to/file", "format")
        file_reader.with_(
            transformer=pivot,
            group_by_columns=["id", "ts"],
            pivot_column="pivot_column",
            agg_column="has_feature",
            aggregation=first,
        )

        # act
        result_df = file_reader._apply_transformations(input_df)

        # assert
        assert compare_dataframes(
            actual_df=result_df,
            expected_df=pivot_df,
        )
Example #11: reading a JSON file with an explicit schema
    def test_json_file_with_schema(self):
        # given
        spark_client = SparkClient()
        schema_json = StructType(
            [
                StructField("A", StringType()),
                StructField("B", DoubleType()),
                StructField("C", StringType()),
            ]
        )

        file = "tests/unit/butterfree/extract/readers/file-reader-test.json"

        # when
        file_reader = FileReader(id="id", path=file, format="json", schema=schema_json)
        df = file_reader.consume(spark_client)

        # assert
        assert schema_json == df.schema
Example #12: applying a list of transformations via _apply_transformations
    def test__apply_transformations(
        self,
        input_data,
        transformations,
        transformed_data,
        spark_context,
        spark_session,
        spark_client,
    ):
        # arrange
        file_reader = FileReader("test", "path/to/file", "format")
        file_reader.transformations = transformations
        input_df = spark_session.read.json(
            spark_context.parallelize(input_data, 1))
        target_df = spark_session.read.json(
            spark_context.parallelize(transformed_data, 1))

        # act
        result_df = file_reader._apply_transformations(input_df)

        # assert
        assert target_df.collect() == result_df.collect()
Example #13: combining a TableReader and a FileReader in a Source query
    def test_source(
        self,
        target_df_source,
        target_df_table_reader,
        spark_session,
    ):
        # given
        spark_client = SparkClient()

        table_reader_id = "a_test_source"
        table_reader_db = "db"
        table_reader_table = "table_test_source"

        create_temp_view(dataframe=target_df_table_reader,
                         name=table_reader_id)
        create_db_and_table(
            spark=spark_session,
            table_reader_id=table_reader_id,
            table_reader_db=table_reader_db,
            table_reader_table=table_reader_table,
        )

        file_reader_id = "b_test_source"
        data_sample_path = INPUT_PATH + "/data.json"

        # when
        source = Source(
            readers=[
                TableReader(
                    id=table_reader_id,
                    database=table_reader_db,
                    table=table_reader_table,
                ),
                FileReader(id=file_reader_id,
                           path=data_sample_path,
                           format="json"),
            ],
            query=f"select a.*, b.feature2 "  # noqa
            f"from {table_reader_id} a "  # noqa
            f"inner join {file_reader_id} b on a.id = b.id ",  # noqa
        )

        result_df = source.construct(client=spark_client)
        target_df = target_df_source

        # then
        assert compare_dataframes(
            actual_df=result_df,
            expected_df=target_df,
            columns_sort=result_df.columns,
        )
Example #14: the filter transformation raising TypeError on invalid conditions
    def test_filter_with_invalidations(self, feature_set_dataframe, condition,
                                       spark_context, spark_session):
        # given
        file_reader = FileReader("test", "path/to/file", "format")

        file_reader.with_(transformer=filter, condition=condition)

        # then
        with pytest.raises(TypeError):
            file_reader._apply_transformations(feature_set_dataframe)
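
The condition values come from a fixture or parametrization that is not shown; anything that is not a valid string expression is expected to make the filter transformation raise TypeError. A hypothetical parametrization:

    @pytest.mark.parametrize("condition", [None, 100, {"condition": "feature > 100"}])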
Example #15: asserting the arguments of a full FeatureSetPipeline
    def test_feature_set_args(self):
        # arrange and act
        out_columns = [
            "user_id",
            "timestamp",
            "listing_page_viewed__rent_per_month__avg_over_7_days_fixed_windows",
            "listing_page_viewed__rent_per_month__avg_over_2_weeks_fixed_windows",
            "listing_page_viewed__rent_per_month__stddev_pop_over_7_days_fixed_windows",
            "listing_page_viewed__rent_per_month__"
            "stddev_pop_over_2_weeks_fixed_windows",
            # noqa
        ]
        pipeline = FeatureSetPipeline(
            source=Source(
                readers=[
                    TableReader(
                        id="source_a",
                        database="db",
                        table="table",
                    ),
                    FileReader(
                        id="source_b",
                        path="path",
                        format="parquet",
                    ),
                ],
                query="select a.*, b.specific_feature "
                "from source_a left join source_b on a.id=b.id",
            ),
            feature_set=FeatureSet(
                name="feature_set",
                entity="entity",
                description="description",
                keys=[
                    KeyFeature(
                        name="user_id",
                        description="The user's Main ID or device ID",
                        dtype=DataType.INTEGER,
                    )
                ],
                timestamp=TimestampFeature(from_column="ts"),
                features=[
                    Feature(
                        name="listing_page_viewed__rent_per_month",
                        description="Average of something.",
                        transformation=SparkFunctionTransform(functions=[
                            Function(functions.avg, DataType.FLOAT),
                            Function(functions.stddev_pop, DataType.FLOAT),
                        ], ).with_window(
                            partition_by="user_id",
                            order_by=TIMESTAMP_COLUMN,
                            window_definition=["7 days", "2 weeks"],
                            mode="fixed_windows",
                        ),
                    ),
                ],
            ),
            sink=Sink(writers=[
                HistoricalFeatureStoreWriter(db_config=None),
                OnlineFeatureStoreWriter(db_config=None),
            ], ),
        )

        assert isinstance(pipeline.spark_client, SparkClient)
        assert len(pipeline.source.readers) == 2
        assert all(
            isinstance(reader, Reader) for reader in pipeline.source.readers)
        assert isinstance(pipeline.source.query, str)
        assert pipeline.feature_set.name == "feature_set"
        assert pipeline.feature_set.entity == "entity"
        assert pipeline.feature_set.description == "description"
        assert isinstance(pipeline.feature_set.timestamp, TimestampFeature)
        assert len(pipeline.feature_set.keys) == 1
        assert all(
            isinstance(k, KeyFeature) for k in pipeline.feature_set.keys)
        assert len(pipeline.feature_set.features) == 1
        assert all(
            isinstance(feature, Feature)
            for feature in pipeline.feature_set.features)
        assert pipeline.feature_set.columns == out_columns
        assert len(pipeline.sink.writers) == 2
        assert all(
            isinstance(writer, Writer) for writer in pipeline.sink.writers)
Example #16: FileReader raising ValueError on invalid constructor parameters
 def test_init_invalid_params(self, path, format):
     # act and assert
     with pytest.raises(ValueError):
         FileReader("id", path, format)
Example #17: a dataset pipeline joining order events with pre-computed feature sets
    def __init__(self):
        super(AwesomeDatasetPipeline, self).__init__(
            source=Source(
                readers=[
                    FileReader(
                        id="order_events",
                        path="data/order_events/input.csv",
                        format="csv",
                        format_options={"header": True},
                    ),
                    FileReader(
                        id="user_chargebacks",
                        path="data/feature_store/historical/user/user_chargebacks",
                        format="parquet",
                    ),
                    FileReader(
                        id="user_orders",
                        path="data/feature_store/historical/user/user_orders",
                        format="parquet",
                    ),
                ],
                query="""
with feature_sets_merge as(
    select
        user_orders.cpf,
        user_orders.timestamp,
        user_chargebacks.timestamp as chargeback_timestamp,
        cpf_orders__count_over_3_days_rolling_windows,
        cpf_orders__count_over_7_days_rolling_windows,
        cpf_orders__count_over_30_days_rolling_windows,
        cpf_chargebacks__count_over_3_days_rolling_windows,
        cpf_chargebacks__count_over_7_days_rolling_windows,
        cpf_chargebacks__count_over_30_days_rolling_windows,
        row_number() over (
            partition by (user_orders.cpf, user_orders.timestamp)
            order by user_chargebacks.timestamp desc
        ) as rn
    from
        user_orders
        left join user_chargebacks
            on  user_orders.cpf = user_chargebacks.cpf
            and user_orders.timestamp >= user_chargebacks.timestamp
),
feature_sets_rn_filter as(
    select
        *
    from
        feature_sets_merge
    where
        rn = 1
),
orders_with_feature_sets as(
    select
        order_events.order_id,
        timestamp(order_events.order_timestamp) as timestamp,
        timestamp(order_events.chargeback_timestamp) as chargeback_timestamp,
        order_events.cpf,
        feature_sets_rn_filter.cpf_orders__count_over_3_days_rolling_windows,
        feature_sets_rn_filter.cpf_orders__count_over_7_days_rolling_windows,
        feature_sets_rn_filter.cpf_orders__count_over_30_days_rolling_windows,
        feature_sets_rn_filter.cpf_chargebacks__count_over_3_days_rolling_windows,
        feature_sets_rn_filter.cpf_chargebacks__count_over_7_days_rolling_windows,
        feature_sets_rn_filter.cpf_chargebacks__count_over_30_days_rolling_windows,
        row_number() over (
            partition by (order_events.cpf, order_events.order_timestamp)
            order by feature_sets_rn_filter.timestamp desc
        ) as rn
    from
        order_events
        join feature_sets_rn_filter
            on order_events.cpf = feature_sets_rn_filter.cpf
            and timestamp(order_events.order_timestamp) >=
            feature_sets_rn_filter.timestamp
)
select
    order_id,
    timestamp,
    chargeback_timestamp,
    cpf,
    cpf_orders__count_over_3_days_rolling_windows,
    cpf_orders__count_over_7_days_rolling_windows,
    cpf_orders__count_over_30_days_rolling_windows,
    coalesce(
        cpf_chargebacks__count_over_3_days_rolling_windows,
    0) as cpf_chargebacks__count_over_3_days_rolling_windows,
    coalesce(
        cpf_chargebacks__count_over_7_days_rolling_windows,
    0) as cpf_chargebacks__count_over_7_days_rolling_windows,
    coalesce(
        cpf_chargebacks__count_over_30_days_rolling_windows,
    0) as cpf_chargebacks__count_over_30_days_rolling_windows
from
    orders_with_feature_sets
where
    rn = 1
                """,
            ),
            feature_set=FeatureSet(
                name="awesome_dataset",
                entity="user",
                description="Dataset enriching orders events with aggregated features "
                "on total of orders and chargebacks by user.",
                keys=[
                    KeyFeature(
                        name="order_id",
                        description="Orders unique identifier.",
                        dtype=DataType.STRING,
                    )
                ],
                timestamp=TimestampFeature(),
                features=[
                    Feature(
                        name="chargeback_timestamp",
                        description="Timestamp for the order creation.",
                        dtype=DataType.TIMESTAMP,
                    ),
                    Feature(
                        name="cpf",
                        description="User unique identifier, user entity key.",
                        dtype=DataType.STRING,
                    ),
                    Feature(
                        name="cpf_orders__count_over_3_days_rolling_windows",
                        description="Count of orders over 3 days rolling windows group "
                        "by user (identified by CPF)",
                        dtype=DataType.INTEGER,
                    ),
                    Feature(
                        name="cpf_orders__count_over_7_days_rolling_windows",
                        description="Count of orders over 7 days rolling windows group "
                        "by user (identified by CPF)",
                        dtype=DataType.INTEGER,
                    ),
                    Feature(
                        name="cpf_orders__count_over_30_days_rolling_windows",
                        description="Count of orders over 30 days rolling windows group"
                        " by user (identified by CPF)",
                        dtype=DataType.INTEGER,
                    ),
                    Feature(
                        name="cpf_chargebacks__count_over_3_days_rolling_windows",
                        description="Count of chargebacks over 3 days rolling windows "
                        "group by user (identified by CPF)",
                        dtype=DataType.INTEGER,
                    ),
                    Feature(
                        name="cpf_chargebacks__count_over_7_days_rolling_windows",
                        description="Count of chargebacks over 7 days rolling windows "
                        "group by user (identified by CPF)",
                        dtype=DataType.INTEGER,
                    ),
                    Feature(
                        name="cpf_chargebacks__count_over_30_days_rolling_windows",
                        description="Count of chargebacks over 30 days rolling windows "
                        "group by user (identified by CPF)",
                        dtype=DataType.INTEGER,
                    ),
                ],
            ),
            sink=Sink(writers=[DatasetWriter()]),
        )
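
Note that the user_chargebacks and user_orders readers above point at historical feature store paths, so the pipelines that produce those feature sets (such as UserChargebacksPipeline from Example #9; the user_orders pipeline is not shown here) would have to run before this dataset pipeline. A hypothetical driver, assuming run() with its default arguments and that the historical writer targets the paths read above:

    UserChargebacksPipeline().run()  # assumed to produce the user_chargebacks features read above
    AwesomeDatasetPipeline().run()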