예제 #1
0
    def test_extrapolation(self, df, config, columns):
        """Check the resampled output across four configurations.

        The configurations are exercised in turn: the fixture config as
        given, the same config without ``category_imputation_method``, then
        ``extrapolation_method`` set to ``"none"`` and to ``"interpolation"``.
        The resampling params are rebuilt from ``config`` after every
        mutation so each stanza runs with the configuration it modifies.
        """
        # 1) Fixture configuration, untouched.
        params = get_resampling_params(config)
        resampler = Resampler(params)
        output_df = resampler.transform(df, columns.date)
        assert output_df.loc[7, columns.data] == 316.2
        assert math.isnan(output_df.loc[7, columns.category])

        # 2) No category imputation method. The params are recomputed here;
        # the ones built above were derived before the key was removed, so
        # reusing them would only repeat stanza 1.
        config.pop("category_imputation_method")
        params = get_resampling_params(config)
        resampler = Resampler(params)
        output_df = resampler.transform(df, columns.date)
        assert output_df.loc[7, columns.data] == 316.2
        assert math.isnan(output_df.loc[7, columns.category])

        # 3) Extrapolation disabled: every category value must be NaN.
        config["extrapolation_method"] = "none"
        params = get_resampling_params(config)
        resampler = Resampler(params)
        output_df = resampler.transform(df, columns.date)
        assert math.isnan(output_df.loc[6, columns.category])
        # Cast to float64 so np.isnan can be applied to the whole column.
        category_results = np.array(output_df[columns.category].values,
                                    dtype=np.float64)
        assert np.isnan(category_results).all()

        # 4) Extrapolation by interpolation.
        config["extrapolation_method"] = "interpolation"
        params = get_resampling_params(config)
        resampler = Resampler(params)
        output_df = resampler.transform(df, columns.date)
        assert np.round(output_df.loc[7, columns.data], 3) == 316.003
        assert math.isnan(output_df.loc[7, columns.category])
예제 #2
0
    def test_no_categorical_impute(self, df, config, columns):
        """Category column is all-null without a method and with "empty".

        NOTE(review): the date column is passed as the literal "Date" rather
        than ``columns.date`` -- presumably the same value; confirm.
        """
        # Case 1: the imputation key is absent from the config.
        config.pop("category_imputation_method")
        without_method = Resampler(get_resampling_params(config)).transform(
            df, "Date")
        assert pd.isnull(without_method[columns.category].values).all()

        # Case 2: the imputation method is explicitly "empty".
        config["category_imputation_method"] = "empty"
        with_empty = Resampler(get_resampling_params(config)).transform(
            df, "Date")
        assert pd.isnull(with_empty[columns.category].values).all()
예제 #3
0
    def test_missing_categorical(self, missing_row_df, config, columns):
        """Compare "clip" and "previous" imputation at a 12-week step."""
        config.update({
            "time_unit": "weeks",
            "time_step": 12,
            "category_imputation_method": "clip",
        })
        clipped = Resampler(get_resampling_params(config)).transform(
            missing_row_df, columns.date)
        # With "clip" every row is expected to carry "second".
        assert np.all(clipped.categorical.values == "second")

        config["category_imputation_method"] = "previous"
        filled = Resampler(get_resampling_params(config)).transform(
            missing_row_df, columns.date)
        # With "previous" the first row stays NaN, the rest are "second".
        assert math.isnan(filled.loc[0, columns.category])
        assert np.all(filled.loc[1:, columns.category].values == "second")
예제 #4
0
    def test_three_identifiers(self, long_df_3, params, config,
                               datetime_column):
        """Resample with three groupby columns and check the output dates."""
        # The column name is read from the config, as in the original test.
        date_col = config.get('datetime_column')
        resampled = Resampler(params).transform(
            long_df_3,
            date_col,
            groupby_columns=["country", "item", "store"])

        # The expected column is the same three dates repeated four times.
        dates_block = ["1959-02-01", "1959-02-15", "1959-03-01"]
        np.testing.assert_array_equal(resampled[date_col].values,
                                      pd.DatetimeIndex(dates_block * 4))
예제 #5
0
    def test_no_category_values(self, df, config, columns):
        """Check "previous" and "empty" imputation on the ``df`` fixture."""
        config["category_imputation_method"] = "previous"
        filled = Resampler(get_resampling_params(config)).transform(
            df, columns.date)
        # Five "first" values followed by three "second" values.
        expected = np.array(['first'] * 5 + ['second'] * 3)
        np.testing.assert_array_equal(filled.categorical.values, expected)

        config["category_imputation_method"] = "empty"
        left_empty = Resampler(get_resampling_params(config)).transform(
            df, columns.date)
        assert math.isnan(left_empty.loc[0, columns.category])
예제 #6
0
 def test_df_multiple_dates(self, df_multiple_dates, config, columns):
     """Row 1 of the "date2" column is null after 12-hour resampling."""
     config.update({
         "category_imputation_method": "previous",
         "time_unit": "hours",
         "time_step": 12,
     })
     resampled = Resampler(get_resampling_params(config)).transform(
         df_multiple_dates, columns.date)
     assert pd.isnull(resampled.loc[1, "date2"])
예제 #7
0
 def test_mode_filling(self, df3, config, columns):
     """With "mode" imputation at a 1-week step, all rows are "second"."""
     config.update({
         "category_imputation_method": "mode",
         "time_unit": "weeks",
         "time_step": 1,
     })
     resampled = Resampler(get_resampling_params(config)).transform(
         df3, columns.date)
     assert np.all(resampled.categorical.values == "second")
예제 #8
0
 def test_microseconds(self, config, columns):
     """Resample at a 3-microsecond step and compare the output dates."""
     config.update({"time_unit": "microseconds", "time_step": 3})
     sampler = Resampler(get_resampling_params(config))
     source = get_df_DST("U", columns)
     resampled = sampler.transform(source, columns.date)
     stamps = [
         '2019-01-31T00:59:00.000000000',
         '2019-01-31T00:59:00.000003000',
     ]
     np.testing.assert_array_equal(resampled[columns.date].values,
                                   pd.DatetimeIndex(stamps))
예제 #9
0
 def test_long_df_different_sizes(self, long_df_different_sizes, params,
                                  config, datetime_column):
     """Grouping by "country" yields a (12, 4) resampled frame."""
     # The column name is read from the config, as in the original test.
     date_col = config.get('datetime_column')
     resampled = Resampler(params).transform(
         long_df_different_sizes,
         date_col,
         groupby_columns=["country"])
     assert resampled.shape == (12, 4)
예제 #10
0
 def test_clip_filling(self, long_df, config, columns):
     """With "clip" imputation per country, row 3 holds "first"."""
     config.update({
         "category_imputation_method": "clip",
         "time_unit": "weeks",
         "time_step": 1,
     })
     resampled = Resampler(get_resampling_params(config)).transform(
         long_df, columns.date, groupby_columns=["country"])
     assert resampled.loc[3, columns.category] == "first"
예제 #11
0
 def test_empty_filling(self, df2, config, columns):
     """Check which rows keep a label and which are NaN at a 12-hour step.

     The category imputation method is whatever the ``config`` fixture
     provides; this test does not set it.
     """
     config.update({"time_unit": "hours", "time_step": 12})
     resampled = Resampler(get_resampling_params(config)).transform(
         df2, columns.date)

     # (row, expected label); None means the cell must be NaN.
     expectations = [
         (0, "first"),
         (1, None),
         (2, None),
         (6, "second"),
         (7, None),
     ]
     for row, label in expectations:
         cell = resampled.loc[row, columns.category]
         if label is None:
             assert math.isnan(cell)
         else:
             assert cell == label
예제 #12
0
    def test_year(self, config, columns):
        """Resample "Y"-frequency data with the "years" unit."""
        config["time_unit"] = "years"
        sampler = Resampler(get_resampling_params(config))
        yearly = get_df("Y", columns)
        resampled = sampler.transform(yearly, columns.date)

        # Pinned mean of the data column (exact comparison, as originally).
        assert np.mean(resampled[columns.data]) == 316.19
        stamps = [
            '1959-12-31T00:00:00.000000000',
            '1961-12-31T00:00:00.000000000',
            '1963-12-31T00:00:00.000000000',
        ]
        np.testing.assert_array_equal(resampled[columns.date].values,
                                      pd.DatetimeIndex(stamps))
예제 #13
0
 def test_next_filling_long_format(self, long_df, config, columns):
     """With "next" imputation per country: row 4 NaN, row 3 "second"."""
     config.update({
         "category_imputation_method": "next",
         "time_unit": "weeks",
         "time_step": 1,
     })
     resampled = Resampler(get_resampling_params(config)).transform(
         long_df, columns.date, groupby_columns=["country"])
     assert math.isnan(resampled.loc[4, columns.category])
     assert resampled.loc[3, columns.category] == "second"
예제 #14
0
 def test_hours_DST(self, config, columns):
     """Resample "4H"-frequency data from ``get_df_DST`` with "hours"."""
     config["time_unit"] = "hours"
     sampler = Resampler(get_resampling_params(config))
     source = get_df_DST("4H", columns)
     resampled = sampler.transform(source, columns.date)

     # Pinned mean of the data column (exact comparison, as originally).
     assert np.mean(resampled[columns.data]) == 316.33428571428567
     stamps = [
         '2019-01-31T01:00:00.000000000',
         '2019-01-31T03:00:00.000000000',
         '2019-01-31T05:00:00.000000000',
         '2019-01-31T07:00:00.000000000',
         '2019-01-31T09:00:00.000000000',
         '2019-01-31T11:00:00.000000000',
         '2019-01-31T13:00:00.000000000',
     ]
     np.testing.assert_array_equal(resampled[columns.date].values,
                                   pd.DatetimeIndex(stamps))
예제 #15
0
 def test_next_filling(self, df2, config, columns):
     """Spot-check category labels under "next" imputation at 12 hours."""
     config.update({
         "category_imputation_method": "next",
         "time_unit": "hours",
         "time_step": 12,
     })
     resampled = Resampler(get_resampling_params(config)).transform(
         df2, columns.date)

     # (row, expected label), checked in the original order.
     expectations = [
         (0, "first"),
         (1, "first"),
         (3, "first"),
         (5, "second"),
         (9, "third"),
     ]
     for row, label in expectations:
         assert resampled.loc[row, columns.category] == label
예제 #16
0
 def test_long_format(self, long_df, params, config, datetime_column):
     """Resample grouped by "country" and check the output dates."""
     # The column name is read from the config, as in the original test.
     date_col = config.get('datetime_column')
     resampled = Resampler(params).transform(
         long_df, date_col, groupby_columns=["country"])

     # The expected column is the same three dates repeated twice.
     dates_block = ["1959-02-01", "1959-02-15", "1959-03-01"]
     np.testing.assert_array_equal(resampled[date_col].values,
                                   pd.DatetimeIndex(dates_block * 2))
예제 #17
0
 def test_weeks_monday_end(self, config, columns):
     """Resample "M"-frequency data by weeks with end of week "MON"."""
     config.update({
         "time_unit": "weeks",
         "time_unit_end_of_week": "MON",
     })
     sampler = Resampler(get_resampling_params(config))
     monthly = get_df("M", columns)
     resampled = sampler.transform(monthly, columns.date)

     # Pinned mean of the data column (exact comparison, as originally).
     assert np.mean(resampled[columns.data]) == 316.36625000000004
     stamps = [
         '1959-02-02T00:00:00.000000000',
         '1959-02-16T00:00:00.000000000',
         '1959-03-02T00:00:00.000000000',
         '1959-03-16T00:00:00.000000000',
         '1959-03-30T00:00:00.000000000',
         '1959-04-13T00:00:00.000000000',
         '1959-04-27T00:00:00.000000000',
         '1959-05-11T00:00:00.000000000',
     ]
     np.testing.assert_array_equal(resampled[columns.date].values,
                                   pd.DatetimeIndex(stamps))
예제 #18
0
 def test_empty_identifiers(self, df, params, config, datetime_column):
     """An empty, omitted or None groupby all give an (8, 4) frame."""
     sampler = Resampler(params)
     # The column name is read from the config, as in the original test.
     date_col = config.get('datetime_column')

     # Same call three times, varying only the groupby argument.
     groupby_variants = (
         {"groupby_columns": []},
         {},
         {"groupby_columns": None},
     )
     for extra_kwargs in groupby_variants:
         resampled = sampler.transform(df, date_col, **extra_kwargs)
         assert resampled.shape == (8, 4)
예제 #19
0
 def test_nanoseconds(self, config, columns):
     """Resample "3N"-frequency data at a 1-nanosecond step."""
     config.update({"time_unit": "nanoseconds", "time_step": 1})
     sampler = Resampler(get_resampling_params(config))
     source = get_df_DST("3N", columns)
     resampled = sampler.transform(source, columns.date)
     stamps = [
         '2019-01-31T00:59:00.000000000',
         '2019-01-31T00:59:00.000000001',
         '2019-01-31T00:59:00.000000002',
         '2019-01-31T00:59:00.000000003',
         '2019-01-31T00:59:00.000000004',
         '2019-01-31T00:59:00.000000005',
         '2019-01-31T00:59:00.000000006',
         '2019-01-31T00:59:00.000000007',
         '2019-01-31T00:59:00.000000008',
         '2019-01-31T00:59:00.000000009',
     ]
     np.testing.assert_array_equal(resampled[columns.date].values,
                                   pd.DatetimeIndex(stamps))
예제 #20
0
 def test_mode_filling_long_format(self, long_df_mode, config, columns):
     """With "mode" imputation each country gets a single category."""
     config.update({
         "category_imputation_method": "mode",
         "time_unit": "weeks",
         "time_step": 1,
     })
     resampled = Resampler(get_resampling_params(config)).transform(
         long_df_mode, columns.date, groupby_columns=["country"])

     # (country id, category expected on every row of that country)
     for country, label in ((0, "first"), (1, "fourth")):
         in_country = resampled.country == country
         categories = resampled.loc[in_country, columns.category].values
         assert np.all(categories == label)
예제 #21
0
 def test_seconds(self, config, columns):
     """Resample "min"-frequency data at a 30-second step."""
     config.update({"time_unit": "seconds", "time_step": 30})
     sampler = Resampler(get_resampling_params(config))
     source = get_df_DST("min", columns)
     resampled = sampler.transform(source, columns.date)

     # Pinned mean of the data column (exact comparison, as originally).
     assert np.mean(resampled[columns.data]) == 316.28999999999996
     stamps = [
         '2019-01-31T00:59:00.000000000',
         '2019-01-31T00:59:30.000000000',
         '2019-01-31T01:00:00.000000000',
         '2019-01-31T01:00:30.000000000',
         '2019-01-31T01:01:00.000000000',
         '2019-01-31T01:01:30.000000000',
         '2019-01-31T01:02:00.000000000',
     ]
     np.testing.assert_array_equal(resampled[columns.date].values,
                                   pd.DatetimeIndex(stamps))
예제 #22
0
 def test_constant_value_filling(self, df2, config, columns):
     """With "constant" imputation, filled rows carry "myvalue"."""
     config.update({
         "category_imputation_method": "constant",
         "category_constant_value": "myvalue",
         "time_unit": "hours",
         "time_step": 12,
     })
     resampled = Resampler(get_resampling_params(config)).transform(
         df2, columns.date)

     # (row, expected label), checked in the original order.
     expectations = [
         (0, "first"),
         (1, "myvalue"),
         (2, "myvalue"),
         (6, "second"),
         (7, "myvalue"),
     ]
     for row, label in expectations:
         assert resampled.loc[row, columns.category] == label
예제 #23
0
 def test_bool_column(self, bool_df, config, columns):
     """Check "previous" imputation on a boolean categorical column."""
     config.update({
         "category_imputation_method": "previous",
         "time_unit": "hours",
         "time_step": 12,
     })
     resampled = Resampler(get_resampling_params(config)).transform(
         bool_df, columns.date)
     # Six True values followed by five False values.
     expected = np.array([True] * 6 + [False] * 5)
     np.testing.assert_array_equal(resampled.categorical.values, expected)
예제 #24
0
 def test_previous_filling(self, df2, config, columns):
     """Check the full category column under "previous" imputation."""
     config.update({
         "category_imputation_method": "previous",
         "time_unit": "hours",
         "time_step": 12,
     })
     resampled = Resampler(get_resampling_params(config)).transform(
         df2, columns.date)
     # Six "first", four "second", then a single "third".
     expected = np.array(['first'] * 6 + ['second'] * 4 + ['third'])
     np.testing.assert_array_equal(resampled.categorical.values, expected)
예제 #25
0
 def test_days(self, config, columns):
     """Resample "W-TUE"-frequency data with the "days" unit."""
     config["time_unit"] = "days"
     sampler = Resampler(get_resampling_params(config))
     weekly = get_df("W-TUE", columns)
     resampled = sampler.transform(weekly, columns.date)

     # Pinned mean of the data column (exact comparison, as originally).
     assert np.mean(resampled[columns.data]) == 316.3254545454545
     stamps = [
         '1959-01-06T00:00:00.000000000',
         '1959-01-08T00:00:00.000000000',
         '1959-01-10T00:00:00.000000000',
         '1959-01-12T00:00:00.000000000',
         '1959-01-14T00:00:00.000000000',
         '1959-01-16T00:00:00.000000000',
         '1959-01-18T00:00:00.000000000',
         '1959-01-20T00:00:00.000000000',
         '1959-01-22T00:00:00.000000000',
         '1959-01-24T00:00:00.000000000',
         '1959-01-26T00:00:00.000000000',
     ]
     np.testing.assert_array_equal(resampled[columns.date].values,
                                   pd.DatetimeIndex(stamps))
예제 #26
0
 def test_days_DST(self, config, columns):
     """Resample "W-WED"-frequency data from ``get_df_DST`` with "days"."""
     config["time_unit"] = "days"
     sampler = Resampler(get_resampling_params(config))
     source = get_df_DST("W-WED", columns)
     resampled = sampler.transform(source, columns.date)

     # Pinned mean of the data column (exact comparison, as originally).
     assert np.mean(resampled[columns.data]) == 316.3072727272727
     stamps = [
         '2019-02-05T23:00:00.000000000',
         '2019-02-07T23:00:00.000000000',
         '2019-02-09T23:00:00.000000000',
         '2019-02-11T23:00:00.000000000',
         '2019-02-13T23:00:00.000000000',
         '2019-02-15T23:00:00.000000000',
         '2019-02-17T23:00:00.000000000',
         '2019-02-19T23:00:00.000000000',
         '2019-02-21T23:00:00.000000000',
         '2019-02-23T23:00:00.000000000',
         '2019-02-25T23:00:00.000000000',
     ]
     np.testing.assert_array_equal(resampled[columns.date].values,
                                   pd.DatetimeIndex(stamps))
예제 #27
0
 def test_mix_identifiers(self, long_df_4, params, config, datetime_column):
     """Resample with three groupby columns and check the output dates."""
     # The column name is read from the config, as in the original test.
     date_col = config.get('datetime_column')
     resampled = Resampler(params).transform(
         long_df_4,
         date_col,
         groupby_columns=["country", "item", "store"])

     stamps = [
         '2020-02-02T00:00:00.000000000',
         '2020-02-16T00:00:00.000000000',
         '2020-03-01T00:00:00.000000000',
         '2020-02-29T00:00:00.000000000',
         '2020-01-31T00:00:00.000000000',
         '2020-02-02T00:00:00.000000000',
         '2020-02-16T00:00:00.000000000',
         '2020-03-01T00:00:00.000000000',
         '2020-02-02T00:00:00.000000000',
         '2020-02-16T00:00:00.000000000',
         '2020-03-01T00:00:00.000000000',
     ]
     np.testing.assert_array_equal(resampled[date_col].values,
                                   pd.DatetimeIndex(stamps))
예제 #28
0
    def test_month(self, config, columns):
        """Resample "Y"-frequency data with the "months" unit."""
        config["time_unit"] = "months"
        sampler = Resampler(get_resampling_params(config))
        yearly = get_df("Y", columns)
        resampled = sampler.transform(yearly, columns.date)

        # Pinned mean of the data column (exact comparison, as originally).
        assert np.mean(resampled[columns.data]) == 316.32550000000003
        stamps = [
            '1959-12-31T00:00:00.000000000',
            '1960-02-29T00:00:00.000000000',
            '1960-04-30T00:00:00.000000000',
            '1960-06-30T00:00:00.000000000',
            '1960-08-31T00:00:00.000000000',
            '1960-10-31T00:00:00.000000000',
            '1960-12-31T00:00:00.000000000',
            '1961-02-28T00:00:00.000000000',
            '1961-04-30T00:00:00.000000000',
            '1961-06-30T00:00:00.000000000',
            '1961-08-31T00:00:00.000000000',
            '1961-10-31T00:00:00.000000000',
            '1961-12-31T00:00:00.000000000',
            '1962-02-28T00:00:00.000000000',
            '1962-04-30T00:00:00.000000000',
            '1962-06-30T00:00:00.000000000',
            '1962-08-31T00:00:00.000000000',
            '1962-10-31T00:00:00.000000000',
            '1962-12-31T00:00:00.000000000',
            '1963-02-28T00:00:00.000000000',
        ]
        np.testing.assert_array_equal(resampled[columns.date].values,
                                      pd.DatetimeIndex(stamps))
예제 #29
0
 def test_previous_filling_long_format(self, long_df, config, columns):
     """Check dates and categories for "previous" imputation per country."""
     config.update({
         "category_imputation_method": "previous",
         "time_unit": "weeks",
         "time_step": 1,
     })
     resampled = Resampler(get_resampling_params(config)).transform(
         long_df, columns.date, groupby_columns=["country"])

     # The expected date column is the same five dates repeated twice.
     dates_block = [
         '1959-02-01T00:00:00.000000000',
         '1959-02-08T00:00:00.000000000',
         '1959-02-15T00:00:00.000000000',
         '1959-02-22T00:00:00.000000000',
         '1959-03-01T00:00:00.000000000',
     ]
     np.testing.assert_array_equal(resampled[columns.date].values,
                                   pd.DatetimeIndex(dates_block * 2))

     # Four "first" and one "second", then four "third" and one "fourth".
     labels = np.array(
         ['first'] * 4 + ['second'] + ['third'] * 4 + ['fourth'])
     np.testing.assert_array_equal(resampled.categorical.values, labels)
from dataiku.customrecipe import get_recipe_config

from dku_timeseries import Resampler
from io_utils import get_input_output
from recipe_config_loading import check_and_get_groupby_columns, check_time_column_parameter, check_python_version, get_resampling_params

# Recipe entry point: read the input dataset, resample it with the
# parameters taken from the recipe configuration, and write the result.
# NOTE(review): check_python_version() presumably rejects unsupported
# interpreters before any work is done -- confirm in recipe_config_loading.
check_python_version()

# --- Setup
(input_dataset, output_dataset) = get_input_output()
recipe_config = get_recipe_config()
# Column names of the input dataset, taken from its schema.
input_dataset_columns = [
    column["name"] for column in input_dataset.read_schema()
]
# Both checks receive the recipe config and the input column names.
check_time_column_parameter(recipe_config, input_dataset_columns)
groupby_columns = check_and_get_groupby_columns(recipe_config,
                                                input_dataset_columns)
datetime_column = recipe_config.get('datetime_column')
params = get_resampling_params(recipe_config)

# --- Run
df = input_dataset.get_dataframe()
resampler = Resampler(params)
output_df = resampler.transform(df,
                                datetime_column,
                                groupby_columns=groupby_columns)

# --- Write output
output_dataset.write_with_schema(output_df)