Exemplo n.º 1
0
    def test_resample_millis(self):
        """Test of resampling for millisecond windows.

        Three trades for symbol ``S1`` are resampled with ``freq="ms"`` and
        ``func="mean"``. Two of them share the millisecond ``10.123`` and are
        expected to be averaged into a single row; the third stays on its own.
        """
        schema = StructType([
            StructField("symbol", StringType()),
            StructField("date", StringType()),
            StructField("event_ts", StringType()),
            StructField("trade_pr", FloatType()),
            StructField("trade_pr_2", FloatType()),
        ])

        # Schema of the expected resampled output. ``date`` is declared as a
        # double here and every expected row carries None for it.
        expected_schema_ms = StructType([
            StructField("symbol", StringType()),
            StructField("event_ts", StringType(), True),
            StructField("date", DoubleType()),
            StructField("trade_pr", DoubleType()),
            StructField("trade_pr_2", DoubleType()),
        ])

        data = [
            ["S1", "SAME_DT", "2020-08-01 00:00:10.12345", 349.21, 10.0],
            ["S1", "SAME_DT", "2020-08-01 00:00:10.123", 340.21, 9.0],
            ["S1", "SAME_DT", "2020-08-01 00:00:10.124", 353.32, 8.0],
        ]

        # First bucket: (349.21 + 340.21) / 2 = 344.71 and (10.0 + 9.0) / 2 = 9.5.
        expected_data_ms = [
            ["S1", "2020-08-01 00:00:10.123", None, 344.71, 9.5],
            ["S1", "2020-08-01 00:00:10.124", None, 353.32, 8.0],
        ]

        # construct dataframes
        df = self.buildTestDF(schema, data)
        df_expected = self.buildTestDF(expected_schema_ms, expected_data_ms)

        # convert to TSDF
        tsdf_left = TSDF(df, partition_cols=["symbol"])

        # millisecond aggregation; trade_pr is rounded so the float mean can
        # be compared against the two-decimal expected values
        resample_ms = tsdf_left.resample(freq="ms", func="mean").df.withColumn(
            "trade_pr", F.round(F.col("trade_pr"), 2))

        # Invoke interpolate at millisecond frequency on a timestamp-typed
        # copy of the input.
        # NOTE(review): the result is never asserted on; the call is kept only
        # so this path is still invoked. Consider adding an assertion.
        int_df = TSDF(
            tsdf_left.df.withColumn("event_ts",
                                    F.col("event_ts").cast("timestamp")),
            partition_cols=["symbol"],
        )
        int_df.interpolate(freq="ms", func="floor", method="ffill")

        self.assertDataFramesEqual(resample_ms, df_expected)
Exemplo n.º 2
0
    def test_interpolation_using_custom_params(self):
        """Check interpolation when optional parameters are given explicitly.

        The input TSDF has its ``event_ts`` column renamed to
        ``other_ts_col``. ``interpolate`` is then called with explicit
        ``ts_col``, ``partition_cols``, ``target_cols`` and
        ``show_interpolated=True``, and the result is compared with a
        hand-specified expected frame.
        """
        self.buildTestingDataFrame()

        ts_name = "other_ts_col"
        partition_keys = ("partition_a", "partition_b")

        # One (is_ts_interpolated, is_interpolated_value_a) pair per expected
        # row, in timestamp order.
        interpolation_flags = [
            (False, False),
            (True, True),
            (False, False),
            (False, True),
            (False, True),
            (True, True),
            (True, True),
            (False, True),
            (False, False),
            (True, True),
            (True, True),
            (False, False),
        ]

        # Rows are 30 seconds apart starting at 2020-01-01 00:00:00, and
        # value_a equals the row index (0.0 .. 11.0).
        expected_rows = [
            [
                "A",
                "A-1",
                f"2020-01-01 00:0{idx // 2}:{30 * (idx % 2):02d}",
                float(idx),
                ts_flag,
                value_flag,
            ]
            for idx, (ts_flag, value_flag) in enumerate(interpolation_flags)
        ]

        result_schema = StructType([
            StructField("partition_a", StringType()),
            StructField("partition_b", StringType()),
            StructField(ts_name, StringType(), False),
            StructField("value_a", DoubleType()),
            StructField("is_ts_interpolated", BooleanType(), False),
            StructField("is_interpolated_value_a", BooleanType(), False),
        ])

        # Expected frame keyed on the renamed timestamp column
        expected_df = self.buildTestDF(result_schema,
                                       expected_rows,
                                       ts_cols=[ts_name])

        # Input TSDF with the timestamp column renamed
        renamed_df = self.simple_input_tsdf.df.withColumnRenamed(
            "event_ts", ts_name)
        input_tsdf = TSDF(
            renamed_df,
            partition_cols=list(partition_keys),
            ts_col=ts_name,
        )

        interpolated_tsdf = input_tsdf.interpolate(
            ts_col=ts_name,
            show_interpolated=True,
            partition_cols=list(partition_keys),
            target_cols=["value_a"],
            freq="30 seconds",
            func="mean",
            method="linear",
        )
        actual_df = interpolated_tsdf.df

        assert_df_equality(expected_df, actual_df, ignore_nullable=True)