def test_roll_series_with_gap_different_input_types_same_result_uniform(
    rolling_series_pd,
):
    """Numeric, offset-string and mixed parameters must give the same rolling max.

    Per the original author's note, offset inputs only reproduce the numeric
    results when the data has a uniform frequency.
    """
    numeric_window, numeric_gap = 5, 2
    offset_window, offset_gap = "5d", "2d"

    # Baseline: window length and gap both given as integers.
    numeric_result = _roll_series_with_gap(
        rolling_series_pd, window_size=numeric_window, gap=numeric_gap
    ).max()

    # Both parameters as offset strings; the gap is handled per window by
    # _apply_roll_with_offset_gap.
    offset_rolling = _roll_series_with_gap(
        rolling_series_pd, window_size=offset_window, gap=offset_gap
    )
    offset_result = offset_rolling.apply(
        lambda sub_s: _apply_roll_with_offset_gap(
            sub_s, offset_gap, max, min_periods=1
        )
    )
    pd.testing.assert_series_equal(numeric_result, offset_result)

    # Mixed input types: offset window length with a numeric gap.
    mixed_result = _roll_series_with_gap(
        rolling_series_pd, window_size=offset_window, gap=numeric_gap
    ).max()
    pd.testing.assert_series_equal(numeric_result, mixed_result)
def test_roll_series_with_gap(window_length, gap, rolling_series_pd):
    """Check rolling min/max from _roll_series_with_gap row by row.

    The expected values rely on the fixture's values being an increasing
    range, so a window's min is its first position and its max is its last.
    """
    rolling_max = _roll_series_with_gap(rolling_series_pd, window_length, gap=gap).max()
    rolling_min = _roll_series_with_gap(rolling_series_pd, window_length, gap=gap).min()

    n_rows = len(rolling_series_pd)
    assert len(rolling_max) == n_rows
    assert len(rolling_min) == n_rows

    gap_num = get_number_from_offset(gap)
    window_length_num = get_number_from_offset(window_length)

    for position in range(n_rows):
        first = position - gap_num - window_length_num + 1
        # An offset-string gap is not applied by the rolling call itself, so
        # the window behaves as if the gap were 0 and ends on the row itself.
        last = position if isinstance(gap, str) else position - gap_num

        observed_min = rolling_min.iloc[position]
        observed_max = rolling_max.iloc[position]

        if first < 0 and last < 0:
            # The whole window lies before the start of the data.
            assert pd.isnull(observed_max)
            assert pd.isnull(observed_min)
        else:
            # A window that starts before the data is clipped to position 0.
            assert observed_min == max(first, 0)
            assert observed_max == last
def test_roll_series_with_gap_nullable_types_with_nans(rolling_series_pd):
    """Rolling max must agree for float64 and Int64 data that contain missing values."""
    window_length = 3
    gap = 2

    # Blank out the values 1 and 3, then derive the nullable-integer twin.
    floats_with_nans = rolling_series_pd.astype("float64").replace(
        {1: np.nan, 3: np.nan}
    )
    ints_with_nans = floats_with_nans.astype("Int64")

    int_rolling_max = _roll_series_with_gap(
        ints_with_nans, window_length, gap=gap
    ).max()
    float_rolling_max = _roll_series_with_gap(
        floats_with_nans, window_length, gap=gap
    ).max()
    pd.testing.assert_series_equal(int_rolling_max, float_rolling_max)

    # The first seven results are written out by hand; afterwards the max is
    # simply the position shifted back by the gap.
    expected_values = [
        np.nan,
        np.nan,
        0,
        0,
        2,
        2,
        4,
        *range(7 - gap, len(rolling_series_pd) - gap),
    ]
    for actual, expected in zip(float_rolling_max, expected_values):
        if pd.isnull(actual):
            assert pd.isnull(expected)
        else:
            assert actual == expected
def test_roll_series_with_gap_incorrect_types(rolling_series_pd):
    """Float window lengths and float gaps must each raise a TypeError."""
    cases = [
        (
            "Window length must be either an offset string or an integer.",
            {"window_size": 4.2, "gap": 4},
        ),
        (
            "Gap must be either an offset string or an integer.",
            {"window_size": 4, "gap": 4.2},
        ),
    ]
    for message, params in cases:
        with pytest.raises(TypeError, match=message):
            _roll_series_with_gap(rolling_series_pd, **params)
def test_roll_series_with_gap_negative_inputs(rolling_series_pd):
    """Negative window lengths and negative gaps must each raise a ValueError."""
    cases = [
        (
            "Window length must be greater than zero.",
            {"window_size": -4, "gap": 4},
        ),
        (
            "Gap must be greater than or equal to zero.",
            {"window_size": 4, "gap": -4},
        ),
    ]
    for message, params in cases:
        with pytest.raises(ValueError, match=message):
            _roll_series_with_gap(rolling_series_pd, **params)
def test_apply_roll_with_offset_data_frequency_higher_than_parameters_frequency():
    """Roll hourly data with a day-sized window and count the leading NaNs.

    With min_periods equal to a full window of hourly rows, the number of
    NaN results should be ``min_periods - 1`` plus the rows covered by the gap.
    """
    hours_per_day = 24
    window_length = "5D"  # 120 hours
    # A full window of hourly rows: 5 days * 24 hours
    min_periods = 5 * hours_per_day

    datetimes = list(pd.date_range(start="2017-01-01", freq="1H", periods=200))
    high_frequency_series = pd.Series(range(200), index=datetimes)

    def nan_count_for(gap):
        # Roll with the given offset gap and report how many results are NaN.
        rolling_obj = _roll_series_with_gap(
            high_frequency_series, window_length, min_periods=min_periods, gap=gap
        )
        rolled = rolling_obj.apply(
            lambda sub_s: _apply_roll_with_offset_gap(
                sub_s, gap, max, min_periods=min_periods
            )
        )
        return rolled.isna().sum()

    # No gap
    assert nan_count_for("0d") == (min_periods - 1) + 0
    # Small gap expressed in hours
    assert nan_count_for("3H") == (min_periods - 1) + 3
    # Large gap expressed in days, converted to hourly rows
    assert nan_count_for("2D") == (min_periods - 1) + (2 * hours_per_day)
def test_roll_series_with_gap_nullable_types(rolling_series_pd):
    """Rolling max must be identical for the Int64 and int64 casts of the same data."""
    window_length = 3
    gap = 2

    def rolling_max_as(dtype):
        # Cast the fixture to the requested dtype before rolling.
        return _roll_series_with_gap(
            rolling_series_pd.astype(dtype), window_length, gap=gap
        ).max()

    # Nullability of the dtype should have no impact on the results.
    nullable_result = rolling_max_as("Int64")
    non_nullable_result = rolling_max_as("int64")

    pd.testing.assert_series_equal(nullable_result, non_nullable_result)
def rolling_count(datetime):
    """Count the rows that fall in each rolling window over ``datetime``.

    Args:
        datetime: Values used as the index of the series that gets rolled.

    Returns:
        Array with one count per input row. For a numeric gap, the first
        ``gap`` rows (plus ``min_periods - 1`` more when min_periods is
        set) are NaN.
    """
    # Every row carries the value 1; only the index matters for counting.
    x = pd.Series(1, index=datetime)
    rolled_series = _roll_series_with_gap(
        x, self.window_length, gap=self.gap, min_periods=self.min_periods
    )
    if isinstance(self.gap, str):
        # Since _apply_roll_with_offset_gap doesn't artificially add nans before rolling,
        # it produces correct results
        additional_args = (self.gap, len, self.min_periods)
        return rolled_series.apply(
            _apply_roll_with_offset_gap, args=additional_args
        ).values

    rolling_count_series = rolled_series.count()
    # The shift made to account for gap adds NaNs to the rolled series.
    # Those values get counted towards min_periods when they shouldn't,
    # so we need to replace any of those partial values with NaNs.
    if not self.min_periods:
        # when min periods is 0 or None it's treated the same as if it's 1
        num_nans = self.gap
    else:
        num_nans = self.min_periods - 1 + self.gap
    # A slice is clamped to the series length, so an input shorter than
    # num_nans becomes all NaN instead of raising IndexError, which is what
    # a list-like positional indexer such as range(num_nans) would do.
    rolling_count_series.iloc[:num_nans] = np.nan
    return rolling_count_series.values
def test_apply_roll_with_offset_gap_non_uniform():
    """Offset-based rolling counts on data whose spacing is not uniform.

    When the data isn't uniform, the number of values in each rolling window
    varies with the spacing of the rows.
    """
    window_length = "3d"
    gap = "3d"

    # Three 7-row stretches: daily, every other day, then daily again.
    stretches = [("2017-01-01", "1d"), ("2017-02-01", "2d"), ("2017-03-01", "1d")]
    datetimes = [
        timestamp
        for start, freq in stretches
        for timestamp in pd.date_range(start=start, freq=freq, periods=7)
    ]
    no_freq_series = pd.Series(range(len(datetimes)), index=datetimes)
    assert pd.infer_freq(no_freq_series.index) is None

    daily_counts = [None, None, None, 1, 2, 3, 3]
    every_other_day_counts = [None, None, 1, 1, 1, 1, 1]
    expected_series = pd.Series(
        daily_counts + every_other_day_counts + daily_counts,
        index=datetimes,
    )

    rolling_obj = _roll_series_with_gap(no_freq_series, window_length, gap=gap)
    actual_series = rolling_obj.apply(
        lambda sub_s: _apply_roll_with_offset_gap(sub_s, gap, len, min_periods=1)
    )

    pd.testing.assert_series_equal(actual_series, expected_series)
def test_rolling_mean(min_periods, window_length, gap, rolling_series_pd):
    """RollingMean must match a numeric-parameter rolling mean on the fixture."""
    gap_num = get_number_from_offset(gap)
    window_length_num = get_number_from_offset(window_length)

    # The series is uniform, so correctness can be checked against the
    # numeric equivalents of the parameters.
    reference_rolling = _roll_series_with_gap(
        rolling_series_pd,
        window_length_num,
        gap=gap_num,
        min_periods=min_periods,
    )
    expected = pd.Series(reference_rolling.mean().values)

    primitive_func = RollingMean(
        window_length=window_length, gap=gap, min_periods=min_periods
    ).get_function()
    actual = pd.Series(
        primitive_func(rolling_series_pd.index, pd.Series(rolling_series_pd.values))
    )

    # A falsy min_periods (0 or None) is treated the same as 1.
    effective_min_periods = min_periods or 1
    assert actual.isna().sum() == gap_num + effective_min_periods - 1
    pd.testing.assert_series_equal(expected, actual)
def test_rolling_std(min_periods, window_length, gap, rolling_series_pd):
    """RollingSTD must match a numeric-parameter rolling std on the fixture."""
    gap_num = get_number_from_offset(gap)
    window_length_num = get_number_from_offset(window_length)

    # The series is uniform, so correctness can be checked against the
    # numeric equivalents of the parameters.
    reference_rolling = _roll_series_with_gap(
        rolling_series_pd,
        window_length_num,
        gap=gap_num,
        min_periods=min_periods,
    )
    expected = pd.Series(reference_rolling.std().values)

    primitive_func = RollingSTD(
        window_length=window_length, gap=gap, min_periods=min_periods
    ).get_function()
    actual = pd.Series(
        primitive_func(rolling_series_pd.index, pd.Series(rolling_series_pd.values))
    )

    if min_periods in [0, 1]:
        # One extra NaN beyond the gap: the pandas std function returns NaN
        # when a window holds only a single value.
        expected_nans = gap_num + 1
    else:
        # A falsy min_periods falls back to 2 here, for the same
        # single-value reason.
        effective_min_periods = min_periods or 2
        expected_nans = gap_num + effective_min_periods - 1

    assert actual.isna().sum() == expected_nans
    pd.testing.assert_series_equal(expected, actual)
def test_roll_series_with_gap_early_values(window_length, gap, rolling_series_pd):
    """Check how many early windows are empty, partial, or null under count()."""
    gap_num = get_number_from_offset(gap)
    window_length_num = get_number_from_offset(window_length)
    gap_is_offset = isinstance(gap, str)

    # Default min_periods is 1, so every row yields a count.
    default_counts = _roll_series_with_gap(
        rolling_series_pd, window_length, gap=gap
    ).count()
    empty_windows = (default_counts == 0).sum()
    partial_windows = (
        (default_counts != 0) & (default_counts < window_length_num)
    ).sum()

    assert partial_windows == window_length_num - 1
    # An offset gap isn't handled at this stage, so each window always holds
    # at least the row itself; a numeric gap leaves gap_num empty windows.
    assert empty_windows == (0 if gap_is_offset else gap_num)

    # Now make min_periods the size of the window.
    full_window_counts = _roll_series_with_gap(
        rolling_series_pd, window_length, gap=gap, min_periods=window_length_num
    ).count()
    null_windows = full_window_counts.isna().sum()
    partial_windows = (full_window_counts < window_length_num).sum()

    # Original author's note: because we shift, the gap shows up as NaN values
    # in the series. Count does not count NaNs as values, yet the gap rows
    # still count towards whether a window has "min periods". This differs
    # from max, for example, where NaNs do not count towards "min periods".
    assert null_windows == window_length_num - 1
    # With an offset gap (not handled here) there are never partial
    # aggregates; with a numeric gap there are gap_num of them.
    assert partial_windows == (0 if gap_is_offset else gap_num)
def rolling_mean(datetime, numeric):
    """Return the rolling mean of ``numeric`` indexed by ``datetime``, as an array."""
    series_to_roll = pd.Series(numeric.values, index=datetime.values)
    rolling_obj = _roll_series_with_gap(
        series_to_roll,
        self.window_length,
        gap=self.gap,
        min_periods=self.min_periods,
    )

    # Numeric gap: the rolling object can compute the mean directly.
    if not isinstance(self.gap, str):
        return rolling_obj.mean().values

    # Offset-string gap: the gap is handled per window by the apply helper.
    offset_args = (self.gap, np.mean, self.min_periods)
    return rolling_obj.apply(_apply_roll_with_offset_gap, args=offset_args).values
def test_apply_roll_with_offset_gap(window_length, gap, rolling_series_pd):
    """Check rolling min/max computed through _apply_roll_with_offset_gap row by row.

    The expected values rely on the fixture's values being an increasing
    range, so a window's min is its first position and its max is its last.
    """

    def roll_and_apply(reducer):
        # Roll the fixture, then reduce each window with the gap applied.
        rolling_obj = _roll_series_with_gap(rolling_series_pd, window_length, gap=gap)
        return rolling_obj.apply(
            lambda sub_s: _apply_roll_with_offset_gap(
                sub_s, gap, reducer, min_periods=1
            )
        )

    rolling_max_series = roll_and_apply(max)
    rolling_min_series = roll_and_apply(min)

    n_rows = len(rolling_series_pd)
    assert len(rolling_max_series) == n_rows
    assert len(rolling_min_series) == n_rows

    gap_num = get_number_from_offset(gap)
    window_length_num = get_number_from_offset(window_length)

    for position in range(n_rows):
        # With the apply call in place the gap is honored, so the window
        # ends gap_num rows before the current one.
        last = position - gap_num
        first = last - window_length_num + 1

        observed_min = rolling_min_series.iloc[position]
        observed_max = rolling_max_series.iloc[position]

        if first < 0 and last < 0:
            # The whole window lies before the start of the data.
            assert pd.isnull(observed_max)
            assert pd.isnull(observed_min)
        else:
            # A window that starts before the data is clipped to position 0.
            assert observed_min == max(first, 0)
            assert observed_max == last
def test_apply_roll_with_offset_data_min_periods_too_big(rolling_series_pd):
    """A min_periods larger than any window can hold must yield only NaNs."""
    window_length = "5D"
    gap = "2d"
    # Per the original note the data is daily, so a window holds at most
    # 5 rows and can never satisfy a min_periods of 6.
    min_periods = 6

    rolling_obj = _roll_series_with_gap(
        rolling_series_pd, window_length, min_periods=min_periods, gap=gap
    )
    rolled = rolling_obj.apply(
        lambda sub_s: _apply_roll_with_offset_gap(
            sub_s, gap, max, min_periods=min_periods
        )
    )

    # Every entry of the result should be NaN.
    nan_total = rolled.isna().sum()
    assert nan_total == len(rolling_series_pd)
def test_roll_series_with_non_offset_string_inputs(rolling_series_pd):
    """Invalid offset strings and an offset gap with a numeric window must raise."""
    # A gap string that is not an offset alias.
    bad_gap_message = (
        "Cannot roll series. The specified gap, test, is not a valid offset alias."
    )
    with pytest.raises(ValueError, match=bad_gap_message):
        _roll_series_with_gap(rolling_series_pd, window_size="4D", gap="test")

    # A window length string that is not an offset alias.
    bad_window_message = "Cannot roll series. The specified window length, test, is not a valid offset alias."
    with pytest.raises(ValueError, match=bad_window_message):
        _roll_series_with_gap(rolling_series_pd, window_size="test", gap="7D")

    # Mismatched types: offset gap combined with a numeric window length.
    mismatch_message = (
        "Cannot roll series with offset gap, 2d, and numeric window length, 7. "
        "If an offset alias is used for gap, the window length must also be defined as an offset alias. "
        "Please either change gap to be numeric or change window length to be an offset alias."
    )
    with pytest.raises(TypeError, match=mismatch_message):
        _roll_series_with_gap(rolling_series_pd, window_size=7, gap="2d").max()
def test_apply_roll_with_offset_gap_min_periods(min_periods, rolling_series_pd):
    """Check the number of NaN and partial windows for a given min_periods."""
    window_length, window_length_num = "5d", 5
    gap, gap_num = "3d", 3

    rolling_obj = _roll_series_with_gap(rolling_series_pd, window_length, gap=gap)
    counts = rolling_obj.apply(
        lambda sub_s: _apply_roll_with_offset_gap(
            sub_s, gap, len, min_periods=min_periods
        )
    )

    # The gap produces windows with no elements at all; those should be NaN
    # so they can be told apart from windows that only contain null values.
    empty_windows = counts.isna().sum()
    partial_windows = ((counts != 0) & (counts < window_length_num)).sum()

    assert empty_windows == min_periods - 1 + gap_num
    assert partial_windows == window_length_num - min_periods
def test_rolling_count(window_length, gap, rolling_series_pd):
    """RollingCount must match a numeric-parameter rolling count after the leading NaNs."""
    gap_num = get_number_from_offset(gap)
    window_length_num = get_number_from_offset(window_length)

    reference_rolling = _roll_series_with_gap(
        rolling_series_pd,
        window_length_num,
        gap=gap_num,
        min_periods=window_length_num,
    )
    expected = pd.Series(reference_rolling.count().values)

    primitive_func = RollingCount(
        window_length=window_length, gap=gap, min_periods=window_length_num
    ).get_function()
    actual = pd.Series(primitive_func(rolling_series_pd.index))

    leading_nans = gap_num + window_length_num - 1
    assert actual.isna().sum() == leading_nans

    # RollingCount will not match the exact _roll_series_with_gap call,
    # because it handles the min_periods difference within the primitive;
    # only the rows after the leading NaNs are compared.
    pd.testing.assert_series_equal(
        expected.iloc[leading_nans:], actual.iloc[leading_nans:]
    )
def test_roll_series_with_no_gap(window_length, rolling_series_pd):
    """Without a gap, the helper must match a plain pandas rolling mean (min_periods=1)."""
    plain_rolling = rolling_series_pd.rolling(window_length, min_periods=1)
    expected = plain_rolling.mean()

    actual = _roll_series_with_gap(rolling_series_pd, window_length).mean()

    pd.testing.assert_series_equal(actual, expected)