def test_weekly_params(self, config):
    """Check the resampling step built for the "weeks" time unit.

    Without an explicit end of week the step must be "2W"; once
    ``time_unit_end_of_week`` is set to "MON" the step must be "2W-MON".
    """
    # NOTE(review): the "2" presumably comes from the fixture's time_step -- confirm.
    config["time_unit"] = "weeks"
    default_params = get_resampling_params(config)
    assert default_params.resampling_step == "2W"

    config["time_unit_end_of_week"] = "MON"
    monday_params = get_resampling_params(config)
    assert monday_params.time_unit_end_of_week == "MON"
    assert monday_params.resampling_step == "2W-MON"
def test_semi_annual_params(self, config):
    """Check that the "semi_annual" unit is expressed as a step in months.

    The assertions pin a 12-month step for the fixture's initial
    ``time_step`` and a 9-month step once ``time_step`` is set to 1.5.
    """
    config["time_unit"] = "semi_annual"
    initial = get_resampling_params(config)
    assert initial.time_step == 12
    assert initial.resampling_step == "12M"

    # A fractional number of half-years must still map to whole months.
    config["time_step"] = 1.5
    fractional = get_resampling_params(config)
    assert fractional.time_step == 9
    assert fractional.resampling_step == "9M"
def test_invalid_time_step(self, config):
    """Check that a missing or zero ``time_step`` raises ``ValueError``."""
    # Case 1: the key is absent from the configuration.
    config.pop("time_step")
    with pytest.raises(ValueError) as missing_step:
        get_resampling_params(config)
    # Inspect the message after the ``with`` block so the assertion really runs.
    assert "Invalid time step" in str(missing_step.value)

    # Case 2: the key is present but the step is zero.
    config["time_step"] = 0
    with pytest.raises(ValueError) as zero_step:
        get_resampling_params(config)
    assert "Time step can not be null or negative" in str(zero_step.value)
def test_no_categorical_impute(self, df, config, columns):
    """Check that the category column is entirely null without imputation.

    Covers a config with no ``category_imputation_method`` key and a config
    where the method is explicitly set to "empty".
    """
    config.pop("category_imputation_method")
    without_method = Resampler(get_resampling_params(config)).transform(df, "Date")
    assert pd.isnull(without_method[columns.category].values).all()

    config["category_imputation_method"] = "empty"
    with_empty_method = Resampler(get_resampling_params(config)).transform(df, "Date")
    assert pd.isnull(with_empty_method[columns.category].values).all()
def test_missing_categorical(self, missing_row_df, config, columns):
    """Check "clip" and "previous" category imputation on ``missing_row_df``.

    With "clip" every output category must equal "second"; with "previous"
    the first row must be NaN and every later row must equal "second".
    """
    config["time_unit"] = "weeks"
    config["time_step"] = 12

    config["category_imputation_method"] = "clip"
    clip_df = Resampler(get_resampling_params(config)).transform(missing_row_df, columns.date)
    assert np.all(clip_df.categorical.values == "second")

    config["category_imputation_method"] = "previous"
    previous_df = Resampler(get_resampling_params(config)).transform(missing_row_df, columns.date)
    assert math.isnan(previous_df.loc[0, columns.category])
    assert np.all(previous_df.loc[1:, columns.category].values == "second")
def test_generate_date_range_month(self, config):
    """Check ``generate_date_range`` for the "months" time unit.

    Three naive start timestamps (month end, mid-month, month end with a
    time of day) must all yield the same month-end range. A CET-localized
    input must yield the expected offset-aware range, and passing
    ``(1, 0, 1)`` in the clip-start / clip-end / shift positions must drop
    the first date.
    """
    config["time_unit"] = "months"
    params = get_resampling_params(config)
    # Trailing positional arguments shared by every call below.
    step_args = (params.resampling_step, params.time_step, params.time_unit)

    end_time = pd.Timestamp("2021-06-20 00:00:00")
    expected_full = pd.DatetimeIndex(["2021-01-31", "2021-03-31", "2021-05-31", "2021-07-31"])
    naive_starts = ("2021-01-31 00:00:00", "2021-01-23 00:00:00", "2021-01-31 10:00:00")
    for start in naive_starts:
        date_range = generate_date_range(pd.Timestamp(start), end_time, 0, 0, 0, *step_args)
        np.testing.assert_array_equal(date_range, expected_full)

    # Timezone-aware bounds.
    cet_start = pd.Timestamp("2021-01-31 10:00:00").tz_localize("CET")
    cet_end = pd.Timestamp("2021-06-20 00:00:00").tz_localize("CET")
    cet_range = generate_date_range(cet_start, cet_end, 0, 0, 0, *step_args)
    expected_cet = pd.DatetimeIndex(
        [
            "2021-01-31 00:00:00+01:00",
            "2021-03-31 00:00:00+02:00",
            "2021-05-31 00:00:00+02:00",
            "2021-07-31 00:00:00+02:00",
        ]
    )
    np.testing.assert_array_equal(cet_range, expected_cet)

    # Non-zero clip/shift arguments on naive bounds.
    clipped_range = generate_date_range(
        pd.Timestamp("2021-01-31 10:00:00"),
        pd.Timestamp("2021-06-20 00:00:00"),
        1,
        0,
        1,
        *step_args,
    )
    expected_clipped = pd.DatetimeIndex(["2021-03-31", "2021-05-31", "2021-07-31"])
    np.testing.assert_array_equal(clipped_range, expected_clipped)
def test_generate_date_range_b_days(self, config):
    """Check ``generate_date_range`` for a one-business-day step.

    The bounds span 2021-01-02 to 2021-01-10. Clip values of 0 and 1 must
    give the same six business days, while a clip of 2 on both ends must
    leave only the four inner days.
    """
    config["time_unit"] = "business_days"
    config["time_step"] = 1
    start_time = pd.Timestamp("2021-01-02 00:00:00")
    end_time = pd.Timestamp("2021-01-10 00:00:00")
    params = get_resampling_params(config)
    step_args = (params.resampling_step, params.time_step, params.time_unit)

    full_range = pd.DatetimeIndex(
        ["2021-01-04", "2021-01-05", "2021-01-06", "2021-01-07", "2021-01-08", "2021-01-11"]
    )
    trimmed_range = pd.DatetimeIndex(["2021-01-05", "2021-01-06", "2021-01-07", "2021-01-08"])

    # (clip_start, clip_end, shift, expected range)
    scenarios = (
        (0, 0, 0, full_range),
        (1, 1, 0, full_range),
        (2, 2, 0, trimmed_range),
    )
    for clip_start, clip_end, shift, expected in scenarios:
        date_range = generate_date_range(
            start_time, end_time, clip_start, clip_end, shift, *step_args
        )
        np.testing.assert_array_equal(date_range, expected)
def test_generate_date_range_microseconds(self, config):
    """Check ``generate_date_range`` for a one-microsecond step on CET bounds.

    With clip_start=5, clip_end=3 and shift=2 over a 16-microsecond span,
    the expected range runs from microsecond 7 to microsecond 15.
    """
    config["time_unit"] = "microseconds"
    config["time_step"] = 1
    lower = pd.Timestamp("20190131 01:59:00").tz_localize("CET")
    upper = pd.Timestamp("2019-01-31 01:59:00.000016").tz_localize("CET")
    params = get_resampling_params(config)

    date_range = generate_date_range(
        lower,
        upper,
        5,  # clip_start
        3,  # clip_end
        2,  # shift
        params.resampling_step,
        params.time_step,
        params.time_unit,
    )

    # Microseconds 000007 .. 000015, all at the +01:00 offset.
    expected_range = pd.DatetimeIndex(
        [f"2019-01-31 01:59:00.{micro:06d}+01:00" for micro in range(7, 16)]
    )
    np.testing.assert_array_equal(date_range, expected_range)
def test_no_category_values(self, df, config, columns):
    """Check "previous" and "empty" category imputation on ``df``.

    "previous" must produce five "first" values followed by three "second"
    values; "empty" must leave the first category as NaN.
    """
    config["category_imputation_method"] = "previous"
    previous_df = Resampler(get_resampling_params(config)).transform(df, columns.date)
    expected_categories = np.array(["first"] * 5 + ["second"] * 3)
    np.testing.assert_array_equal(previous_df.categorical.values, expected_categories)

    config["category_imputation_method"] = "empty"
    empty_df = Resampler(get_resampling_params(config)).transform(df, columns.date)
    assert math.isnan(empty_df.loc[0, columns.category])
def test_df_multiple_dates(self, df_multiple_dates, config, columns):
    """Check that the secondary "date2" column is null on row 1.

    Resamples ``df_multiple_dates`` on ``columns.date`` with a 12-hour step
    and "previous" category imputation.
    """
    config.update(
        {
            "category_imputation_method": "previous",
            "time_unit": "hours",
            "time_step": 12,
        }
    )
    resampled = Resampler(get_resampling_params(config)).transform(df_multiple_dates, columns.date)
    assert pd.isnull(resampled.loc[1, "date2"])
def test_mode_filling(self, df3, config, columns):
    """Check that "mode" imputation fills every category with "second".

    Resamples ``df3`` with a one-week step.
    """
    config["category_imputation_method"] = "mode"
    config["time_unit"] = "weeks"
    config["time_step"] = 1

    weekly_resampler = Resampler(get_resampling_params(config))
    resampled = weekly_resampler.transform(df3, columns.date)

    all_second = np.all(resampled.categorical.values == "second")
    assert all_second
def test_microseconds(self, config, columns):
    """Check resampling with a three-microsecond step.

    The frame comes from ``get_df_DST("U", columns)``; the resampled date
    column must hold exactly two timestamps, three microseconds apart.
    """
    config["time_unit"] = "microseconds"
    config["time_step"] = 3
    resampler = Resampler(get_resampling_params(config))

    source_df = get_df_DST("U", columns)
    resampled = resampler.transform(source_df, columns.date)

    expected_dates = pd.DatetimeIndex(
        [
            "2019-01-31T00:59:00.000000000",
            "2019-01-31T00:59:00.000003000",
        ]
    )
    np.testing.assert_array_equal(resampled[columns.date].values, expected_dates)
def test_clip_filling(self, long_df, config, columns):
    """Check "clip" imputation on a long-format frame grouped by "country".

    After weekly resampling, the category on row 3 must be "first".
    """
    config.update(
        {
            "category_imputation_method": "clip",
            "time_unit": "weeks",
            "time_step": 1,
        }
    )
    resampler = Resampler(get_resampling_params(config))
    resampled = resampler.transform(long_df, columns.date, groupby_columns=["country"])
    assert resampled.loc[3, columns.category] == "first"
def test_next_filling_long_format(self, long_df, config, columns):
    """Check "next" imputation on a long-format frame grouped by "country".

    After weekly resampling, row 4 must be NaN and row 3 must be "second".
    """
    config.update(
        {
            "category_imputation_method": "next",
            "time_unit": "weeks",
            "time_step": 1,
        }
    )
    resampler = Resampler(get_resampling_params(config))
    resampled = resampler.transform(long_df, columns.date, groupby_columns=["country"])

    assert math.isnan(resampled.loc[4, columns.category])
    assert resampled.loc[3, columns.category] == "second"
def test_empty_filling(self, df2, config, columns):
    """Check that unfilled category rows stay NaN after 12-hour resampling.

    Rows 0 and 6 must keep their labels ("first", "second"), while rows 1,
    2 and 7 must be NaN.
    """
    # NOTE(review): the imputation method is not overridden here; it is
    # whatever the ``config`` fixture provides (presumably "empty") -- confirm.
    config["time_unit"] = "hours"
    config["time_step"] = 12
    resampled = Resampler(get_resampling_params(config)).transform(df2, columns.date)

    for row, label in ((0, "first"), (6, "second")):
        assert resampled.loc[row, columns.category] == label
    for row in (1, 2, 7):
        assert math.isnan(resampled.loc[row, columns.category])
def test_year(self, config, columns):
    """Check resampling with the "years" time unit.

    Resamples the frame from ``get_df("Y", columns)`` and verifies the mean
    of the data column and the resulting year-end dates.
    """
    config["time_unit"] = "years"
    params = get_resampling_params(config)
    resampler = Resampler(params)
    df = get_df("Y", columns)
    output_df = resampler.transform(df, columns.date)

    # Compare with a tolerance: the mean is a floating-point reduction whose
    # last bits may differ between numpy/pandas versions and platforms.
    np.testing.assert_allclose(np.mean(output_df[columns.data]), 316.19)

    expected_dates = pd.DatetimeIndex(
        [
            "1959-12-31T00:00:00.000000000",
            "1961-12-31T00:00:00.000000000",
            "1963-12-31T00:00:00.000000000",
        ]
    )
    np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
def test_next_filling(self, df2, config, columns):
    """Check "next" category imputation after 12-hour resampling of ``df2``.

    Rows 0, 1 and 3 must be "first", row 5 "second" and row 9 "third".
    """
    config.update(
        {
            "category_imputation_method": "next",
            "time_unit": "hours",
            "time_step": 12,
        }
    )
    resampled = Resampler(get_resampling_params(config)).transform(df2, columns.date)

    expected_by_row = {0: "first", 1: "first", 3: "first", 5: "second", 9: "third"}
    for row, label in expected_by_row.items():
        assert resampled.loc[row, columns.category] == label
def test_hours_DST(self, config, columns):
    """Check resampling with the "hours" time unit.

    Resamples the frame from ``get_df_DST("4H", columns)`` and verifies the
    mean of the data column and the resulting two-hourly dates.
    """
    config["time_unit"] = "hours"
    params = get_resampling_params(config)
    resampler = Resampler(params)
    df_DST = get_df_DST("4H", columns)
    output_df = resampler.transform(df_DST, columns.date)

    # Compare with a tolerance: the mean is a floating-point reduction whose
    # last bits may differ between numpy/pandas versions and platforms.
    np.testing.assert_allclose(np.mean(output_df[columns.data]), 316.33428571428567)

    expected_dates = pd.DatetimeIndex(
        [
            "2019-01-31T01:00:00.000000000",
            "2019-01-31T03:00:00.000000000",
            "2019-01-31T05:00:00.000000000",
            "2019-01-31T07:00:00.000000000",
            "2019-01-31T09:00:00.000000000",
            "2019-01-31T11:00:00.000000000",
            "2019-01-31T13:00:00.000000000",
        ]
    )
    np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
def test_seconds(self, config, columns):
    """Check resampling with a 30-second step.

    Resamples the frame from ``get_df_DST("min", columns)`` and verifies the
    mean of the data column and the resulting half-minute dates.
    """
    config["time_unit"] = "seconds"
    config["time_step"] = 30
    params = get_resampling_params(config)
    resampler = Resampler(params)
    df_DST = get_df_DST("min", columns)
    output_df = resampler.transform(df_DST, columns.date)

    # Compare with a tolerance: the mean is a floating-point reduction whose
    # last bits may differ between numpy/pandas versions and platforms.
    np.testing.assert_allclose(np.mean(output_df[columns.data]), 316.28999999999996)

    expected_dates = pd.DatetimeIndex(
        [
            "2019-01-31T00:59:00.000000000",
            "2019-01-31T00:59:30.000000000",
            "2019-01-31T01:00:00.000000000",
            "2019-01-31T01:00:30.000000000",
            "2019-01-31T01:01:00.000000000",
            "2019-01-31T01:01:30.000000000",
            "2019-01-31T01:02:00.000000000",
        ]
    )
    np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
def test_mode_filling_long_format(self, long_df_mode, config, columns):
    """Check "mode" imputation per group on a long-format frame.

    After weekly resampling grouped by "country", every category of country
    0 must be "first" and every category of country 1 must be "fourth".
    """
    config.update(
        {
            "category_imputation_method": "mode",
            "time_unit": "weeks",
            "time_step": 1,
        }
    )
    resampler = Resampler(get_resampling_params(config))
    resampled = resampler.transform(long_df_mode, columns.date, groupby_columns=["country"])

    country_0 = resampled.loc[resampled.country == 0, columns.category].values
    assert np.all(country_0 == "first")
    country_1 = resampled.loc[resampled.country == 1, columns.category].values
    assert np.all(country_1 == "fourth")
def test_bool_column(self, bool_df, config, columns):
    """Check "previous" imputation on a boolean category column.

    After 12-hour resampling of ``bool_df`` the category column must hold
    six ``True`` values followed by five ``False`` values.
    """
    config.update(
        {
            "category_imputation_method": "previous",
            "time_unit": "hours",
            "time_step": 12,
        }
    )
    resampled = Resampler(get_resampling_params(config)).transform(bool_df, columns.date)

    expected_flags = np.array([True] * 6 + [False] * 5)
    np.testing.assert_array_equal(resampled.categorical.values, expected_flags)
def test_generate_date_range_half_year(self, config):
    """Check ``generate_date_range`` for a one-step "semi_annual" unit.

    For bounds 2020-01-01 to 2021-06-18 with no clipping or shift, the
    expected range is four month-end dates spaced six months apart.
    """
    config["time_step"] = 1
    config["time_unit"] = "semi_annual"
    lower = pd.Timestamp("2020-01-01 00:00:00")
    upper = pd.Timestamp("2021-06-18 00:00:00")
    params = get_resampling_params(config)

    date_range = generate_date_range(
        lower,
        upper,
        0,
        0,
        0,
        params.resampling_step,
        params.time_step,
        params.time_unit,
    )

    expected_range = pd.DatetimeIndex(["2020-01-31", "2020-07-31", "2021-01-31", "2021-07-31"])
    np.testing.assert_array_equal(date_range, expected_range)
def test_weeks_monday_end(self, config, columns):
    """Check weekly resampling when the week ends on Monday.

    Resamples the frame from ``get_df("M", columns)`` and verifies the mean
    of the data column and the resulting fortnightly Monday dates.
    """
    config["time_unit"] = "weeks"
    config["time_unit_end_of_week"] = "MON"
    params = get_resampling_params(config)
    resampler = Resampler(params)
    df = get_df("M", columns)
    output_df = resampler.transform(df, columns.date)

    # Compare with a tolerance: the mean is a floating-point reduction whose
    # last bits may differ between numpy/pandas versions and platforms.
    np.testing.assert_allclose(np.mean(output_df[columns.data]), 316.36625000000004)

    expected_dates = pd.DatetimeIndex(
        [
            "1959-02-02T00:00:00.000000000",
            "1959-02-16T00:00:00.000000000",
            "1959-03-02T00:00:00.000000000",
            "1959-03-16T00:00:00.000000000",
            "1959-03-30T00:00:00.000000000",
            "1959-04-13T00:00:00.000000000",
            "1959-04-27T00:00:00.000000000",
            "1959-05-11T00:00:00.000000000",
        ]
    )
    np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
def test_nanoseconds(self, config, columns):
    """Check resampling with a one-nanosecond step.

    Resamples the frame from ``get_df_DST("3N", columns)``; the date column
    must contain ten consecutive nanosecond timestamps (0 through 9).
    """
    config["time_unit"] = "nanoseconds"
    config["time_step"] = 1
    resampler = Resampler(get_resampling_params(config))

    source_df = get_df_DST("3N", columns)
    resampled = resampler.transform(source_df, columns.date)

    # Nanosecond offsets 0..9 appended to a common timestamp prefix.
    expected_dates = pd.DatetimeIndex(
        [f"2019-01-31T00:59:00.00000000{nano}" for nano in range(10)]
    )
    np.testing.assert_array_equal(resampled[columns.date].values, expected_dates)
def test_previous_filling(self, df2, config, columns):
    """Check "previous" category imputation after 12-hour resampling of ``df2``.

    The category column must hold six "first", four "second" and one
    "third" value, in that order.
    """
    config.update(
        {
            "category_imputation_method": "previous",
            "time_unit": "hours",
            "time_step": 12,
        }
    )
    resampled = Resampler(get_resampling_params(config)).transform(df2, columns.date)

    expected_categories = np.array(["first"] * 6 + ["second"] * 4 + ["third"])
    np.testing.assert_array_equal(resampled.categorical.values, expected_categories)
def test_constant_value_filling(self, df2, config, columns):
    """Check "constant" category imputation with the value "myvalue".

    After 12-hour resampling of ``df2``, rows 0 and 6 must keep their
    labels while rows 1, 2 and 7 must hold the constant.
    """
    config.update(
        {
            "category_imputation_method": "constant",
            "category_constant_value": "myvalue",
            "time_unit": "hours",
            "time_step": 12,
        }
    )
    resampled = Resampler(get_resampling_params(config)).transform(df2, columns.date)

    expected_by_row = {0: "first", 1: "myvalue", 2: "myvalue", 6: "second", 7: "myvalue"}
    for row, label in expected_by_row.items():
        assert resampled.loc[row, columns.category] == label
def test_days(self, config, columns):
    """Check resampling with the "days" time unit.

    Resamples the frame from ``get_df("W-TUE", columns)`` and verifies the
    mean of the data column and the resulting every-other-day dates.
    """
    config["time_unit"] = "days"
    params = get_resampling_params(config)
    resampler = Resampler(params)
    df = get_df("W-TUE", columns)
    output_df = resampler.transform(df, columns.date)

    # Compare with a tolerance: the mean is a floating-point reduction whose
    # last bits may differ between numpy/pandas versions and platforms.
    np.testing.assert_allclose(np.mean(output_df[columns.data]), 316.3254545454545)

    expected_dates = pd.DatetimeIndex(
        [
            "1959-01-06T00:00:00.000000000",
            "1959-01-08T00:00:00.000000000",
            "1959-01-10T00:00:00.000000000",
            "1959-01-12T00:00:00.000000000",
            "1959-01-14T00:00:00.000000000",
            "1959-01-16T00:00:00.000000000",
            "1959-01-18T00:00:00.000000000",
            "1959-01-20T00:00:00.000000000",
            "1959-01-22T00:00:00.000000000",
            "1959-01-24T00:00:00.000000000",
            "1959-01-26T00:00:00.000000000",
        ]
    )
    np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
def test_days_DST(self, config, columns):
    """Check resampling with the "days" time unit on the DST helper frame.

    Resamples the frame from ``get_df_DST("W-WED", columns)`` and verifies
    the mean of the data column and the resulting dates, which fall at
    23:00 every other day.
    """
    config["time_unit"] = "days"
    params = get_resampling_params(config)
    resampler = Resampler(params)
    df_DST = get_df_DST("W-WED", columns)
    output_df = resampler.transform(df_DST, columns.date)

    # Compare with a tolerance: the mean is a floating-point reduction whose
    # last bits may differ between numpy/pandas versions and platforms.
    np.testing.assert_allclose(np.mean(output_df[columns.data]), 316.3072727272727)

    expected_dates = pd.DatetimeIndex(
        [
            "2019-02-05T23:00:00.000000000",
            "2019-02-07T23:00:00.000000000",
            "2019-02-09T23:00:00.000000000",
            "2019-02-11T23:00:00.000000000",
            "2019-02-13T23:00:00.000000000",
            "2019-02-15T23:00:00.000000000",
            "2019-02-17T23:00:00.000000000",
            "2019-02-19T23:00:00.000000000",
            "2019-02-21T23:00:00.000000000",
            "2019-02-23T23:00:00.000000000",
            "2019-02-25T23:00:00.000000000",
        ]
    )
    np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
def test_generate_date_range_nanoseconds(self, config):
    """Check ``generate_date_range`` for a one-nanosecond step.

    With clip_start=5, clip_end=3 and shift=2 over a 9-nanosecond span,
    only nanoseconds 7 and 8 are expected in the range.
    """
    config["time_unit"] = "nanoseconds"
    config["time_step"] = 1
    lower = pd.Timestamp("2019-01-31T00:59:00.000000000")
    upper = pd.Timestamp("2019-01-31T00:59:00.000000009")
    params = get_resampling_params(config)

    date_range = generate_date_range(
        lower,
        upper,
        5,  # clip_start
        3,  # clip_end
        2,  # shift
        params.resampling_step,
        params.time_step,
        params.time_unit,
    )

    expected_range = pd.DatetimeIndex(
        [
            "2019-01-31 00:59:00.000000007",
            "2019-01-31 00:59:00.000000008",
        ]
    )
    np.testing.assert_array_equal(date_range, expected_range)
def test_month(self, config, columns):
    """Check resampling with the "months" time unit.

    Resamples the frame from ``get_df("Y", columns)`` and verifies the mean
    of the data column and the resulting month-end dates, spaced two
    months apart from 1959-12-31 to 1963-02-28.
    """
    config["time_unit"] = "months"
    params = get_resampling_params(config)
    resampler = Resampler(params)
    df = get_df("Y", columns)
    output_df = resampler.transform(df, columns.date)

    # Compare with a tolerance: the mean is a floating-point reduction whose
    # last bits may differ between numpy/pandas versions and platforms.
    np.testing.assert_allclose(np.mean(output_df[columns.data]), 316.32550000000003)

    expected_dates = pd.DatetimeIndex(
        [
            "1959-12-31T00:00:00.000000000",
            "1960-02-29T00:00:00.000000000",
            "1960-04-30T00:00:00.000000000",
            "1960-06-30T00:00:00.000000000",
            "1960-08-31T00:00:00.000000000",
            "1960-10-31T00:00:00.000000000",
            "1960-12-31T00:00:00.000000000",
            "1961-02-28T00:00:00.000000000",
            "1961-04-30T00:00:00.000000000",
            "1961-06-30T00:00:00.000000000",
            "1961-08-31T00:00:00.000000000",
            "1961-10-31T00:00:00.000000000",
            "1961-12-31T00:00:00.000000000",
            "1962-02-28T00:00:00.000000000",
            "1962-04-30T00:00:00.000000000",
            "1962-06-30T00:00:00.000000000",
            "1962-08-31T00:00:00.000000000",
            "1962-10-31T00:00:00.000000000",
            "1962-12-31T00:00:00.000000000",
            "1963-02-28T00:00:00.000000000",
        ]
    )
    np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)