def test_ts_plot_format_coord(self):
    # GH7664: format_coord on a time-series axes should render the x value
    # using the axes' *current* period frequency, which later plots may change.
    def check_format_of_first_point(ax, expected_string):
        first_line = ax.get_lines()[0]
        first_x = first_line.get_xdata()[0].ordinal
        first_y = first_line.get_ydata()[0]
        try:
            assert expected_string == ax.format_coord(first_x, first_y)
        except (ValueError):
            pytest.skip("skipping test because issue forming "
                        "test comparison GH7664")

    annual = Series(1, index=date_range('2014-01-01', periods=3,
                                        freq='A-DEC'))
    _, ax = self.plt.subplots()
    annual.plot(ax=ax)
    check_format_of_first_point(ax, 't = 2014 y = 1.000000')

    # note this is added to the annual plot already in existence, and
    # changes its freq field
    daily = Series(1, index=date_range('2014-01-01', periods=3, freq='D'))
    daily.plot(ax=ax)
    check_format_of_first_point(ax, 't = 2014-01-01 y = 1.000000')
    tm.close()

    # tsplot: same expectations through the lower-level plotting entry point
    _, ax = self.plt.subplots()
    from pandas.tseries.plotting import tsplot
    tsplot(annual, self.plt.Axes.plot, ax=ax)
    check_format_of_first_point(ax, 't = 2014 y = 1.000000')
    tsplot(daily, self.plt.Axes.plot, ax=ax)
    check_format_of_first_point(ax, 't = 2014-01-01 y = 1.000000')
def test_evenly_divisible_with_no_extra_bins(self):
    # 4076
    # when the frequency is evenly divisible, sometimes extra bins
    # were produced; the resampled output must have exactly the expected bins.

    df = DataFrame(np.random.randn(9, 3),
                   index=date_range('2000-1-1', periods=9))
    result = df.resample('5D').mean()
    expected = pd.concat(
        [df.iloc[0:5].mean(), df.iloc[5:].mean()], axis=1).T
    expected.index = [Timestamp('2000-1-1'), Timestamp('2000-1-6')]
    assert_frame_equal(result, expected)

    # 28-day span duplicated for two keys, then resampled weekly
    index = date_range(start='2001-5-4', periods=28)
    df = DataFrame(
        [{'REST_KEY': 1, 'DLY_TRN_QT': 80, 'DLY_SLS_AMT': 90,
          'COOP_DLY_TRN_QT': 30, 'COOP_DLY_SLS_AMT': 20}] * 28 +
        [{'REST_KEY': 2, 'DLY_TRN_QT': 70, 'DLY_SLS_AMT': 10,
          'COOP_DLY_TRN_QT': 50, 'COOP_DLY_SLS_AMT': 20}] * 28,
        index=index.append(index)).sort_index()

    index = date_range('2001-5-4', periods=4, freq='7D')

    # each 7-day bin holds 14 rows (two keys x 7 days)
    expected = DataFrame(
        [{'REST_KEY': 14, 'DLY_TRN_QT': 14, 'DLY_SLS_AMT': 14,
          'COOP_DLY_TRN_QT': 14, 'COOP_DLY_SLS_AMT': 14}] * 4,
        index=index)
    result = df.resample('7D').count()
    assert_frame_equal(result, expected)

    expected = DataFrame(
        [{'REST_KEY': 21, 'DLY_TRN_QT': 1050, 'DLY_SLS_AMT': 700,
          'COOP_DLY_TRN_QT': 560, 'COOP_DLY_SLS_AMT': 280}] * 4,
        index=index)
    result = df.resample('7D').sum()
    assert_frame_equal(result, expected)
def test_frame_align_aware(self):
    """Aligning tz-aware objects preserves tz; mixed tzs convert to UTC."""
    idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern')
    idx2 = date_range('2001', periods=5, freq='2H', tz='US/Eastern')
    df1 = DataFrame(np.random.randn(len(idx1), 3), idx1)
    df2 = DataFrame(np.random.randn(len(idx2), 3), idx2)
    new1, new2 = df1.align(df2)
    assert df1.index.tz == new1.index.tz
    assert df2.index.tz == new2.index.tz

    # different timezones convert to UTC

    # frame with frame
    df1_central = df1.tz_convert('US/Central')
    new1, new2 = df1.align(df1_central)
    assert new1.index.tz == pytz.UTC
    assert new2.index.tz == pytz.UTC

    # frame with Series
    new1, new2 = df1.align(df1_central[0], axis=0)
    assert new1.index.tz == pytz.UTC
    assert new2.index.tz == pytz.UTC

    # Series with frame
    # BUG FIX: this align's result was previously discarded, so the
    # asserts below silently re-checked the stale new1/new2 from above.
    new1, new2 = df1[0].align(df1_central, axis=0)
    assert new1.index.tz == pytz.UTC
    assert new2.index.tz == pytz.UTC
def test_from_weekly_resampling(self):
    # Mixing weekly and monthly series on one axes: every line must be
    # re-expressed at the higher (weekly) frequency.
    idxh = date_range('1/1/1999', periods=52, freq='W')
    idxl = date_range('1/1/1999', periods=12, freq='M')
    high = Series(np.random.randn(len(idxh)), idxh)
    low = Series(np.random.randn(len(idxl)), idxl)
    _, ax = self.plt.subplots()
    low.plot(ax=ax)
    high.plot(ax=ax)

    expected_h = idxh.to_period().asi8.astype(np.float64)
    # weekly-period ordinals of the 12 monthly points
    expected_l = np.array([1514, 1519, 1523, 1527, 1531, 1536, 1540, 1544,
                           1549, 1553, 1558, 1562], dtype=np.float64)
    for l in ax.get_lines():
        assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq
        xdata = l.get_xdata(orig=False)
        if len(xdata) == 12:  # idxl lines
            tm.assert_numpy_array_equal(xdata, expected_l)
        else:
            tm.assert_numpy_array_equal(xdata, expected_h)
    tm.close()

    # tsplot: same expectations through the lower-level entry point
    from pandas.tseries.plotting import tsplot
    _, ax = self.plt.subplots()
    tsplot(low, self.plt.Axes.plot, ax=ax)
    lines = tsplot(high, self.plt.Axes.plot, ax=ax)
    for l in lines:
        assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq
        xdata = l.get_xdata(orig=False)
        if len(xdata) == 12:  # idxl lines
            tm.assert_numpy_array_equal(xdata, expected_l)
        else:
            tm.assert_numpy_array_equal(xdata, expected_h)
def test_resample_group_info(n, k): # GH10914 # use a fixed seed to always have the same uniques prng = np.random.RandomState(1234) dr = date_range(start='2015-08-27', periods=n // 10, freq='T') ts = Series(prng.randint(0, n // k, n).astype('int64'), index=prng.choice(dr, n)) left = ts.resample('30T').nunique() ix = date_range(start=ts.index.min(), end=ts.index.max(), freq='30T') vals = ts.values bins = np.searchsorted(ix.values, ts.index, side='right') sorter = np.lexsort((vals, bins)) vals, bins = vals[sorter], bins[sorter] mask = np.r_[True, vals[1:] != vals[:-1]] mask |= np.r_[True, bins[1:] != bins[:-1]] arr = np.bincount(bins[mask] - 1, minlength=len(ix)).astype('int64', copy=False) right = Series(arr, index=ix) assert_series_equal(left, right)
def test_resample_upsampling_picked_but_not_correct():
    """Resampling at the index's own daily frequency must not upsample."""
    # Test for issue #3020
    day_index = date_range('01-Jan-2014', '05-Jan-2014', freq='D')
    ones = Series(1, index=day_index)
    daily_mean = ones.resample('D').mean()
    assert daily_mean.index[0] == day_index[0]

    # GH 5955
    # incorrect deciding to upsample when the axis frequency matches the
    # resample frequency
    s = Series(np.arange(1., 6),
               index=[datetime(1975, 1, i, 12, 0) for i in range(1, 6)])
    expected = Series(np.arange(1., 6),
                      index=date_range('19750101', periods=5, freq='D'))

    counted = s.resample('D').count()
    assert_series_equal(counted, Series(1, index=expected.index))

    summed = s.resample('D').sum()
    averaged = s.resample('D').mean()
    assert_series_equal(summed, expected)
    assert_series_equal(averaged, expected)
def test_resample_base():
    """With base=2 the 5-minute bin edges fall on minutes == 2 (mod 5)."""
    second_idx = date_range('1/1/2000 00:00:00', '1/1/2000 02:00', freq='s')
    ts = Series(np.random.randn(len(second_idx)), index=second_idx)

    result = ts.resample('5min', base=2).mean()
    expected_index = date_range('12/31/1999 23:57:00', '1/1/2000 01:57',
                                freq='5min')
    tm.assert_index_equal(result.index, expected_index)
def test_mixed_freq_hf_first(self):
    """Plot the high-freq series first; all lines adopt daily frequency."""
    daily_idx = date_range('1/1/1999', periods=365, freq='D')
    monthly_idx = date_range('1/1/1999', periods=12, freq='M')
    daily = Series(np.random.randn(len(daily_idx)), daily_idx)
    monthly = Series(np.random.randn(len(monthly_idx)), monthly_idx)

    daily.plot()
    ax = monthly.plot()
    for line in ax.get_lines():
        assert PeriodIndex(data=line.get_xdata()).freq == 'D'
def test_resample_rounding():
    # GH 8371
    # odd results when rounding is needed
    # Sub-second timestamps must land in the correct bins for several
    # bin widths that do not divide the span evenly.

    data = """date,time,value
11-08-2014,00:00:01.093,1
11-08-2014,00:00:02.159,1
11-08-2014,00:00:02.667,1
11-08-2014,00:00:03.175,1
11-08-2014,00:00:07.058,1
11-08-2014,00:00:07.362,1
11-08-2014,00:00:08.324,1
11-08-2014,00:00:08.830,1
11-08-2014,00:00:08.982,1
11-08-2014,00:00:09.815,1
11-08-2014,00:00:10.540,1
11-08-2014,00:00:11.061,1
11-08-2014,00:00:11.617,1
11-08-2014,00:00:13.607,1
11-08-2014,00:00:14.535,1
11-08-2014,00:00:15.525,1
11-08-2014,00:00:17.960,1
11-08-2014,00:00:20.674,1
11-08-2014,00:00:21.191,1"""

    df = pd.read_csv(StringIO(data), parse_dates={'timestamp': [
        'date', 'time']}, index_col='timestamp')
    df.index.name = None

    result = df.resample('6s').sum()
    expected = DataFrame({'value': [
        4, 9, 4, 2
    ]}, index=date_range('2014-11-08', freq='6s', periods=4))
    assert_frame_equal(result, expected)

    result = df.resample('7s').sum()
    expected = DataFrame({'value': [
        4, 10, 4, 1
    ]}, index=date_range('2014-11-08', freq='7s', periods=4))
    assert_frame_equal(result, expected)

    result = df.resample('11s').sum()
    expected = DataFrame({'value': [
        11, 8
    ]}, index=date_range('2014-11-08', freq='11s', periods=2))
    assert_frame_equal(result, expected)

    result = df.resample('13s').sum()
    expected = DataFrame({'value': [
        13, 6
    ]}, index=date_range('2014-11-08', freq='13s', periods=2))
    assert_frame_equal(result, expected)

    result = df.resample('17s').sum()
    expected = DataFrame({'value': [
        16, 3
    ]}, index=date_range('2014-11-08', freq='17s', periods=2))
    assert_frame_equal(result, expected)
def test_downsample_across_dst():
    # GH 8531: resampling 2-hourly data to hourly across the Berlin DST
    # fall-back must fill the intermediate hours with NaN, not misalign.
    berlin = pytz.timezone('Europe/Berlin')
    start = datetime(2014, 10, 26)
    two_hourly = date_range(berlin.localize(start), periods=4, freq='2H')

    result = Series(5, index=two_hourly).resample('H').mean()

    hourly = date_range(berlin.localize(start), periods=7, freq='H')
    expected = Series([5., np.nan] * 3 + [5.], index=hourly)
    tm.assert_series_equal(result, expected)
def test_resample_extra_index_point():
    # GH#9756: downsampling business days to business-month-end must not
    # emit an extra trailing bin beyond the last month end.
    bm_index = date_range(start='20150101', end='20150331', freq='BM')
    expected = DataFrame({'A': Series([21, 41, 63], index=bm_index)})

    b_index = date_range(start='20150101', end='20150331', freq='B')
    df = DataFrame({'A': Series(range(len(b_index)), index=b_index)},
                   dtype='int64')
    result = df.resample('BM').last()
    assert_frame_equal(result, expected)
def test_tz_localize_convert_copy_inplace_mutate(self, copy, method, tz): # GH 6326 result = Series(np.arange(0, 5), index=date_range('20131027', periods=5, freq='1H', tz=tz)) getattr(result, method)('UTC', copy=copy) expected = Series(np.arange(0, 5), index=date_range('20131027', periods=5, freq='1H', tz=tz)) tm.assert_series_equal(result, expected)
def test_resample_tz_localized(self):
    # Resampling a tz-aware series must match resampling its naive copy
    # and then re-localizing.
    dr = date_range(start='2012-4-13', end='2012-5-1')
    ts = Series(lrange(len(dr)), dr)

    ts_utc = ts.tz_localize('UTC')
    ts_local = ts_utc.tz_convert('America/Los_Angeles')

    result = ts_local.resample('W').mean()

    ts_local_naive = ts_local.copy()
    ts_local_naive.index = [x.replace(tzinfo=None)
                            for x in ts_local_naive.index.to_pydatetime()]

    exp = ts_local_naive.resample(
        'W').mean().tz_localize('America/Los_Angeles')

    assert_series_equal(result, exp)

    # it works
    result = ts_local.resample('D').mean()

    # #2245: right-closed/right-labeled daily bin across midnight Sydney time
    idx = date_range('2001-09-20 15:59', '2001-09-20 16:00', freq='T',
                     tz='Australia/Sydney')
    s = Series([1, 2], index=idx)

    result = s.resample('D', closed='right', label='right').mean()
    ex_index = date_range('2001-09-21', periods=1, freq='D',
                          tz='Australia/Sydney')
    expected = Series([1.5], index=ex_index)

    assert_series_equal(result, expected)

    # for good measure
    result = s.resample('D', kind='period').mean()
    ex_index = period_range('2001-09-20', periods=1, freq='D')
    expected = Series([1.5], index=ex_index)
    assert_series_equal(result, expected)

    # GH 6397
    # comparing an offset that doesn't propagate tz's
    rng = date_range('1/1/2011', periods=20000, freq='H')
    rng = rng.tz_localize('EST')
    ts = DataFrame(index=rng)
    ts['first'] = np.random.randn(len(rng))
    ts['second'] = np.cumsum(np.random.randn(len(rng)))
    expected = DataFrame(
        {
            'first': ts.resample('A').sum()['first'],
            'second': ts.resample('A').mean()['second']},
        columns=['first', 'second'])
    result = ts.resample(
        'A').agg({'first': np.sum,
                  'second': np.mean}).reindex(columns=['first', 'second'])
    assert_frame_equal(result, expected)
def test_series_tz_convert(self):
    """tz_convert re-labels an aware index; tz-naive input must raise."""
    aware_idx = date_range('1/1/2011', periods=200, freq='D',
                           tz='US/Eastern')
    converted = Series(1, index=aware_idx).tz_convert('Europe/Berlin')
    assert converted.index.tz.zone == 'Europe/Berlin'

    # can't convert tz-naive
    naive_idx = date_range('1/1/2011', periods=200, freq='D')
    ts = Series(1, index=naive_idx)
    tm.assert_raises_regex(TypeError, "Cannot convert tz-naive",
                           ts.tz_convert, 'US/Eastern')
def test_resample_loffset_upsample():
    # GH 20744: loffset must also shift the labels on the upsampling path.
    minute_idx = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00',
                            freq='min')
    s = Series(np.random.randn(14), index=minute_idx)

    result = s.resample('5min', closed='right', label='right',
                        loffset=timedelta(minutes=1)).ffill()

    base_idx = date_range('1/1/2000', periods=4, freq='5min')
    expected = Series([s[0], s[5], s[10], s[-1]],
                      index=base_idx + timedelta(minutes=1))
    assert_series_equal(result, expected)
def test_series_tz_localize(self):
    """tz_localize attaches a tz to a naive index; aware input must raise."""
    naive_idx = date_range('1/1/2011', periods=100, freq='H')
    localized = Series(1, index=naive_idx).tz_localize('utc')
    assert localized.index.tz.zone == 'UTC'

    # Can't localize if already tz-aware
    aware_idx = date_range('1/1/2011', periods=100, freq='H', tz='utc')
    ts = Series(1, index=aware_idx)
    tm.assert_raises_regex(TypeError, 'Already tz-aware',
                           ts.tz_localize, 'US/Eastern')
def test_secondary_upsample(self):
    """Upsampled secondary-y lines share the daily freq on both axes."""
    daily_idx = date_range('1/1/1999', periods=365, freq='D')
    monthly_idx = date_range('1/1/1999', periods=12, freq='M')
    daily = Series(np.random.randn(len(daily_idx)), daily_idx)
    monthly = Series(np.random.randn(len(monthly_idx)), monthly_idx)

    monthly.plot()
    ax = daily.plot(secondary_y=True)
    for line in ax.get_lines():
        assert PeriodIndex(line.get_xdata()).freq == 'D'

    # a secondary plot hands back the primary axes, flagged via left_ax
    assert hasattr(ax, 'left_ax')
    assert not hasattr(ax, 'right_ax')
    for line in ax.left_ax.get_lines():
        assert PeriodIndex(line.get_xdata()).freq == 'D'
def test_frame_join_tzaware(self):
    """Outer join of two tz-aware frames keeps the shared timezone."""
    left = DataFrame(np.zeros((6, 3)),
                     index=date_range("2012-11-15 00:00:00", periods=6,
                                      freq="100L", tz="US/Central"))
    right = DataFrame(np.zeros((3, 3)),
                      index=date_range("2012-11-15 00:00:00", periods=3,
                                       freq="250L", tz="US/Central"),
                      columns=lrange(3, 6))

    result = left.join(right, how='outer')

    expected_index = left.index.union(right.index)
    tm.assert_index_equal(result.index, expected_index)
    assert result.index.tz.zone == 'US/Central'
def test_secondary_y_regular_ts_xlim(self):
    # GH 3490 - regular-timeseries with secondary y
    earlier = date_range(start='2000-01-01', periods=4, freq='D')
    later = date_range(start='2000-01-05', periods=4, freq='D')
    s1 = Series(1, index=earlier)
    s2 = Series(2, index=later)

    ax = s1.plot()
    left_before, right_before = ax.get_xlim()
    s2.plot(secondary_y=True, ax=ax)
    left_after, right_after = ax.get_xlim()

    # later data must extend only the right edge of the x-limits
    assert left_before == left_after
    assert right_before < right_after
def test_resample_size():
    """size() over 7-minute bins matches a manual searchsorted/bincount."""
    n = 10000
    minute_pool = date_range('2015-09-19', periods=n, freq='T')
    ts = Series(np.random.randn(n), index=np.random.choice(minute_pool, n))

    left = ts.resample('7T').size()
    ix = date_range(start=left.index.min(), end=ts.index.max(), freq='7T')

    bin_ids = np.searchsorted(ix.values, ts.index.values, side='right')
    counts = np.bincount(bin_ids,
                         minlength=len(ix) + 1)[1:].astype('int64',
                                                           copy=False)

    right = Series(counts, index=ix)
    assert_series_equal(left, right)
def test_high_freq(self):
    """Millisecond and microsecond frequency series plot without error."""
    for freq in ('ms', 'us'):
        _, ax = self.plt.subplots()
        idx = date_range('1/1/2012', periods=100000, freq=freq)
        ser = Series(np.random.randn(len(idx)), idx)
        _check_plot_works(ser.plot, ax=ax)
def test_fill_method_and_how_upsample(self): # GH2073 s = Series(np.arange(9, dtype='int64'), index=date_range('2010-01-01', periods=9, freq='Q')) last = s.resample('M').ffill() both = s.resample('M').ffill().resample('M').last().astype('int64') assert_series_equal(last, both)
def test_fake_inferred_business(self):
    """A gapped date range must not pin an inferred freq on the axes."""
    _, ax = self.plt.subplots()
    idx = date_range('2001-1-1', '2001-1-10')
    ts = Series(lrange(len(idx)), idx)
    ts = ts[:3].append(ts[5:])  # punch a hole so no freq can be inferred
    ts.plot(ax=ax)
    assert not hasattr(ax, 'freq')
def test_plot_offset_freq(self):
    """Series with offset-based frequencies (e.g. BQS) plot cleanly."""
    ser = tm.makeTimeSeries()
    _check_plot_works(ser.plot)

    quarterly_idx = date_range(ser.index[0], freq='BQS', periods=10)
    ser = Series(np.random.randn(len(quarterly_idx)), quarterly_idx)
    _check_plot_works(ser.plot)
def test_series_append_dst(self):
    """Appending tz-aware series across a DST change keeps the tz."""
    winter = date_range('1/1/2016 01:00', periods=3, freq='H',
                        tz='US/Eastern')
    summer = date_range('8/1/2016 01:00', periods=3, freq='H',
                        tz='US/Eastern')
    ser1 = Series([1, 2, 3], index=winter)
    ser2 = Series([10, 11, 12], index=summer)
    ts_result = ser1.append(ser2)

    exp_index = DatetimeIndex(['2016-01-01 01:00', '2016-01-01 02:00',
                               '2016-01-01 03:00', '2016-08-01 01:00',
                               '2016-08-01 02:00', '2016-08-01 03:00'],
                              tz='US/Eastern')
    exp = Series([1, 2, 3, 10, 11, 12], index=exp_index)
    tm.assert_series_equal(ts_result, exp)
    assert ts_result.index.tz == winter.tz
def test_nearest_upsample_with_limit():
    """nearest(limit=...) matches reindex with method='nearest'."""
    coarse_idx = date_range('1/1/2000', periods=3, freq='5t')
    ts = Series(np.random.randn(len(coarse_idx)), coarse_idx)

    result = ts.resample('t').nearest(limit=2)
    expected = ts.reindex(result.index, method='nearest', limit=2)
    assert_series_equal(result, expected)
def test_mixed_freq_shared_ax(self):
    # GH13341, using sharex=True
    # Two monthly series on shared axes must agree on freq and on the
    # x position of their first points.
    idx1 = date_range('2015-01-01', periods=3, freq='M')
    idx2 = idx1[:1].union(idx1[2:])  # same freq, middle point removed
    s1 = Series(range(len(idx1)), idx1)
    s2 = Series(range(len(idx2)), idx2)

    fig, (ax1, ax2) = self.plt.subplots(nrows=2, sharex=True)
    s1.plot(ax=ax1)
    s2.plot(ax=ax2)

    assert ax1.freq == 'M'
    assert ax2.freq == 'M'
    assert (ax1.lines[0].get_xydata()[0, 0] ==
            ax2.lines[0].get_xydata()[0, 0])

    # using twinx
    fig, ax1 = self.plt.subplots()
    ax2 = ax1.twinx()
    s1.plot(ax=ax1)
    s2.plot(ax=ax2)

    assert (ax1.lines[0].get_xydata()[0, 0] ==
            ax2.lines[0].get_xydata()[0, 0])
def test_secondary_y_ts(self):
    # secondary_y time-series plotting: axis side, data round-trip, and
    # visibility of the primary y axis.
    idx = date_range('1/1/2000', periods=10)
    ser = Series(np.random.randn(10), idx)
    ser2 = Series(np.random.randn(10), idx)
    fig, _ = self.plt.subplots()
    ax = ser.plot(secondary_y=True)

    # secondary plot returns the primary axes, flagged with left_ax
    assert hasattr(ax, 'left_ax')
    assert not hasattr(ax, 'right_ax')

    axes = fig.get_axes()
    l = ax.get_lines()[0]
    # plotted data must round-trip back to the original series
    xp = Series(l.get_ydata(), l.get_xdata()).to_timestamp()
    assert_series_equal(ser, xp)
    assert ax.get_yaxis().get_ticks_position() == 'right'
    assert not axes[0].get_yaxis().get_visible()
    self.plt.close(fig)

    _, ax2 = self.plt.subplots()
    ser2.plot(ax=ax2)
    assert (ax2.get_yaxis().get_ticks_position() ==
            self.default_tick_position)
    self.plt.close(ax2.get_figure())

    # primary y axis stays visible when a secondary series is added later
    ax = ser2.plot()
    ax2 = ser.plot(secondary_y=True)
    assert ax.get_yaxis().get_visible()
def test_tz_aware_asfreq(self, tz): dr = date_range('2011-12-01', '2012-07-20', freq='D', tz=tz) ser = Series(np.random.randn(len(dr)), index=dr) # it works! ser.asfreq('T')
def test_series_truncate_datetimeindex_tz(self):
    # GH 9243: truncate with naive datetime bounds on a tz-aware index
    idx = date_range('4/1/2005', '4/30/2005', freq='D', tz='US/Pacific')
    ser = Series(range(len(idx)), index=idx)

    result = ser.truncate(datetime(2005, 4, 2), datetime(2005, 4, 4))
    expected = Series([1, 2, 3], index=idx[1:4])
    tm.assert_series_equal(result, expected)
def _simple_date_range_series(start, end, freq="D"): rng = date_range(start, end, freq=freq) return Series(np.random.randn(len(rng)), index=rng)
from collections import OrderedDict
from datetime import datetime

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame, Series
import pandas._testing as tm
from pandas.core.indexes.datetimes import date_range

# Shared minute-frequency fixtures for the resample tests in this module.
dti = date_range(start=datetime(2005, 1, 1),
                 end=datetime(2005, 1, 10), freq="Min")

test_series = Series(np.random.rand(len(dti)), dti)
_test_frame = DataFrame({"A": test_series, "B": test_series,
                         "C": np.arange(len(dti))})


@pytest.fixture
def test_frame():
    # fresh copy per test so mutation cannot leak between tests
    return _test_frame.copy()


def test_str():
    # repr of a Resampler spells out its grouping parameters
    r = test_series.resample("H")
    assert (
        "DatetimeIndexResampler [freq=<Hour>, axis=0, closed=left, "
        "label=left, convention=start, origin=start_day]" in str(r)
    )

    # NOTE(review): this chunk appears truncated — the assertion on the
    # custom-origin repr is not visible here.
    r = test_series.resample("H", origin="2000-01-01")
def test_agg():
    # test with all three Resampler apis and TimeGrouper
    # All four equivalent groupings must produce identical aggregates.
    np.random.seed(1234)
    index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
    index.name = "date"
    df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index)
    df_col = df.reset_index()
    df_mult = df_col.copy()
    df_mult.index = pd.MultiIndex.from_arrays(
        [range(10), df.index], names=["index", "date"]
    )
    r = df.resample("2D")
    cases = [
        r,
        df_col.resample("2D", on="date"),
        df_mult.resample("2D", level="date"),
        df.groupby(pd.Grouper(freq="2D")),
    ]

    a_mean = r["A"].mean()
    a_std = r["A"].std()
    a_sum = r["A"].sum()
    b_mean = r["B"].mean()
    b_std = r["B"].std()
    b_sum = r["B"].sum()

    # list of functions -> MultiIndex columns
    expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
    expected.columns = pd.MultiIndex.from_product([["A", "B"],
                                                   ["mean", "std"]])
    for t in cases:
        result = t.aggregate([np.mean, np.std])
        tm.assert_frame_equal(result, expected)

    # dict of column -> single function
    expected = pd.concat([a_mean, b_std], axis=1)
    for t in cases:
        result = t.aggregate({"A": np.mean, "B": np.std})
        tm.assert_frame_equal(result, expected, check_like=True)

    # dict of column -> list of functions
    expected = pd.concat([a_mean, a_std], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([("A", "mean"),
                                                  ("A", "std")])
    for t in cases:
        result = t.aggregate({"A": ["mean", "std"]})
        tm.assert_frame_equal(result, expected)

    # list of functions on a single selected column
    expected = pd.concat([a_mean, a_sum], axis=1)
    expected.columns = ["mean", "sum"]
    for t in cases:
        result = t["A"].aggregate(["mean", "sum"])
        tm.assert_frame_equal(result, expected)

    # nested renaming via dicts is no longer supported
    msg = "nested renamer is not supported"
    for t in cases:
        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t.aggregate({"A": {"mean": "mean", "sum": "sum"}})

    expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
    expected.columns = pd.MultiIndex.from_tuples(
        [("A", "mean"), ("A", "sum"), ("B", "mean2"), ("B", "sum2")]
    )
    for t in cases:
        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t.aggregate(
                {
                    "A": {"mean": "mean", "sum": "sum"},
                    "B": {"mean2": "mean", "sum2": "sum"},
                }
            )

    # per-column lists of functions
    expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
    expected.columns = pd.MultiIndex.from_tuples(
        [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")]
    )
    for t in cases:
        result = t.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]})
        tm.assert_frame_equal(result, expected, check_like=True)

    expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
    expected.columns = pd.MultiIndex.from_tuples(
        [
            ("r1", "A", "mean"),
            ("r1", "A", "sum"),
            ("r2", "B", "mean"),
            ("r2", "B", "sum"),
        ]
    )
    # NOTE(review): chunk appears truncated — the case exercising the
    # ("r1", "r2") expectation above is not visible here.
def test_agg_misc():
    # test with all three Resampler apis and TimeGrouper
    # Mixed agg specs (lambdas, renamers, per-column hows) must behave
    # identically across the equivalent groupings.
    np.random.seed(1234)
    index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10),
                       freq='D')
    index.name = 'date'
    df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index)
    df_col = df.reset_index()
    df_mult = df_col.copy()
    df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
                                              names=['index', 'date'])

    r = df.resample('2D')
    cases = [
        r,
        df_col.resample('2D', on='date'),
        df_mult.resample('2D', level='date'),
        df.groupby(pd.Grouper(freq='2D'))
    ]

    # passed lambda
    for t in cases:
        result = t.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)})
        rcustom = t['B'].apply(lambda x: np.std(x, ddof=1))
        expected = pd.concat([r['A'].sum(), rcustom], axis=1)
        assert_frame_equal(result, expected, check_like=True)

    # agg with renamers (deprecated: emits FutureWarning)
    expected = pd.concat(
        [t['A'].sum(), t['B'].sum(), t['A'].mean(), t['B'].mean()],
        axis=1)
    expected.columns = pd.MultiIndex.from_tuples([('result1', 'A'),
                                                  ('result1', 'B'),
                                                  ('result2', 'A'),
                                                  ('result2', 'B')])

    for t in cases:
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = t[['A', 'B']].agg(
                OrderedDict([('result1', np.sum), ('result2', np.mean)]))
        assert_frame_equal(result, expected, check_like=True)

    # agg with different hows
    expected = pd.concat(
        [t['A'].sum(), t['A'].std(), t['B'].mean(), t['B'].std()],
        axis=1)
    expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
                                                  ('A', 'std'),
                                                  ('B', 'mean'),
                                                  ('B', 'std')])
    for t in cases:
        result = t.agg(
            OrderedDict([('A', ['sum', 'std']), ('B', ['mean', 'std'])]))
        assert_frame_equal(result, expected, check_like=True)

    # equivalent of using a selection list / or not
    for t in cases:
        result = t[['A', 'B']].agg({'A': ['sum', 'std'],
                                    'B': ['mean', 'std']})
        assert_frame_equal(result, expected, check_like=True)

    # series like aggs
    for t in cases:
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = t['A'].agg({'A': ['sum', 'std']})
        expected = pd.concat([t['A'].sum(), t['A'].std()], axis=1)
        expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
                                                      ('A', 'std')])
        assert_frame_equal(result, expected, check_like=True)

        expected = pd.concat(
            [t['A'].agg(['sum', 'std']), t['A'].agg(['mean', 'std'])],
            axis=1)
        expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
                                                      ('A', 'std'),
                                                      ('B', 'mean'),
                                                      ('B', 'std')])
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = t['A'].agg({'A': ['sum', 'std'],
                                 'B': ['mean', 'std']})
        assert_frame_equal(result, expected, check_like=True)

    # errors
    # invalid names in the agg specification
    msg = "\"Column 'B' does not exist!\""
    for t in cases:
        with pytest.raises(KeyError, match=msg):
            t[['A']].agg({'A': ['sum', 'std'],
                          'B': ['mean', 'std']})
from textwrap import dedent

import numpy as np
import pytest

import pandas.util._test_decorators as td
from pandas.util._test_decorators import async_mark

import pandas as pd
from pandas import DataFrame, Series, Timestamp, compat
import pandas._testing as tm
from pandas.core.indexes.datetimes import date_range

# FIX: `np` (used in the frame below) and `dedent` (used in the async test)
# were referenced but never imported in this module.

# Shared per-second frame used by the resample tests in this module.
test_frame = DataFrame(
    {
        "A": [1] * 20 + [2] * 12 + [3] * 8,
        "B": np.arange(40)
    },
    index=date_range("1/1/2000", freq="s", periods=40),
)


@async_mark()
@td.check_file_leaks
async def test_tab_complete_ipython6_warning(ip):
    # Tab-completing a Resampler under IPython 6+ must not emit warnings.
    from IPython.core.completer import provisionalcompleter

    code = dedent(
        """\
    import pandas._testing as tm
    s = tm.makeTimeSeries()
    rs = s.resample("D")
    """
    )
    await ip.run_code(code)
def __call__(self):
    """Return millisecond tick locations for the current view limits."""
    # if no data have been set, this will tank with a ValueError
    try:
        dmin, dmax = self.viewlim_to_dt()
    except ValueError:
        return []

    # We need to cap at the endpoints of valid datetime
    # FIXME: dont leave commented-out
    # TODO(wesm) unused?
    # if dmin > dmax:
    #     dmax, dmin = dmin, dmax
    # delta = relativedelta(dmax, dmin)
    # try:
    #     start = dmin - delta
    # except ValueError:
    #     start = _from_ordinal(1.0)
    # try:
    #     stop = dmax + delta
    # except ValueError:
    #     # The magic number!
    #     stop = _from_ordinal(3652059.9999999)

    nmax, nmin = dates.date2num((dmax, dmin))

    # span in milliseconds; pick the smallest interval giving few ticks
    num = (nmax - nmin) * 86400 * 1000
    max_millis_ticks = 6
    for interval in [1, 10, 50, 100, 200, 500]:
        if num <= interval * (max_millis_ticks - 1):
            self._interval = interval
            break
    else:
        # We went through the whole loop without breaking, default to 1
        self._interval = 1000.0

    estimate = (nmax - nmin) / (self._get_unit() * self._get_interval())

    if estimate > self.MAXTICKS * 2:
        # FIX: `estimate` is a float, so the previous f"{estimate:d}"
        # raised ValueError instead of the intended RuntimeError.
        raise RuntimeError(
            "MillisecondLocator estimated to generate "
            f"{int(estimate):d} ticks from {dmin} to {dmax}: "
            "exceeds Locator.MAXTICKS"
            f"* 2 ({self.MAXTICKS * 2:d}) "
        )

    interval = self._get_interval()
    freq = f"{interval}L"
    tz = self.tz.tzname(None)
    st = _from_ordinal(dates.date2num(dmin))  # strip tz
    ed = _from_ordinal(dates.date2num(dmax))
    all_dates = date_range(start=st, end=ed,
                           freq=freq, tz=tz).astype(object)

    try:
        if len(all_dates) > 0:
            locs = self.raise_if_exceeds(dates.date2num(all_dates))
            return locs
    except Exception:  # pragma: no cover
        pass

    # fall back to just the view limits
    lims = dates.date2num([dmin, dmax])
    return lims
def test_resample_tz_localized(self):
    # Resampling a tz-aware series must match resampling its naive copy
    # and then re-localizing.
    dr = date_range(start="2012-4-13", end="2012-5-1")
    ts = Series(range(len(dr)), index=dr)

    ts_utc = ts.tz_localize("UTC")
    ts_local = ts_utc.tz_convert("America/Los_Angeles")

    result = ts_local.resample("W").mean()

    ts_local_naive = ts_local.copy()
    ts_local_naive.index = [
        x.replace(tzinfo=None) for x in ts_local_naive.index.to_pydatetime()
    ]

    exp = ts_local_naive.resample("W").mean().tz_localize(
        "America/Los_Angeles")
    exp.index = pd.DatetimeIndex(exp.index, freq="W")

    tm.assert_series_equal(result, exp)

    # it works
    result = ts_local.resample("D").mean()

    # #2245: right-closed/right-labeled daily bin across midnight Sydney
    idx = date_range("2001-09-20 15:59", "2001-09-20 16:00", freq="T",
                     tz="Australia/Sydney")
    s = Series([1, 2], index=idx)

    result = s.resample("D", closed="right", label="right").mean()
    ex_index = date_range("2001-09-21", periods=1, freq="D",
                          tz="Australia/Sydney")
    expected = Series([1.5], index=ex_index)

    tm.assert_series_equal(result, expected)

    # for good measure
    result = s.resample("D", kind="period").mean()
    ex_index = period_range("2001-09-20", periods=1, freq="D")
    expected = Series([1.5], index=ex_index)
    tm.assert_series_equal(result, expected)

    # GH 6397
    # comparing an offset that doesn't propagate tz's
    rng = date_range("1/1/2011", periods=20000, freq="H")
    rng = rng.tz_localize("EST")
    ts = DataFrame(index=rng)
    ts["first"] = np.random.randn(len(rng))
    ts["second"] = np.cumsum(np.random.randn(len(rng)))
    expected = DataFrame(
        {
            "first": ts.resample("A").sum()["first"],
            "second": ts.resample("A").mean()["second"],
        },
        columns=["first", "second"],
    )
    result = (ts.resample("A").agg({
        "first": np.sum,
        "second": np.mean
    }).reindex(columns=["first", "second"]))
    tm.assert_frame_equal(result, expected)
from datetime import datetime
from operator import methodcaller

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame, Series, Timestamp
import pandas._testing as tm
from pandas.core.groupby.grouper import Grouper
from pandas.core.indexes.datetimes import date_range

# Shared daily series used by the TimeGrouper tests in this module.
test_series = Series(np.random.randn(1000),
                     index=date_range("1/1/2000", periods=1000))


def test_apply():
    # grouping by an annual Grouper must match grouping by year directly
    grouper = Grouper(freq="A", label="right", closed="right")

    grouped = test_series.groupby(grouper)

    def f(x):
        # the three largest values within each year
        return x.sort_values()[-3:]

    applied = grouped.apply(f)
    expected = test_series.groupby(lambda x: x.year).apply(f)

    applied.index = applied.index.droplevel(0)
    expected.index = expected.index.droplevel(0)
    tm.assert_series_equal(applied, expected)
def interval_range(
    start=None, end=None, periods=None, freq=None, name=None, closed="right"
):
    """
    Return a fixed frequency IntervalIndex.

    Parameters
    ----------
    start : numeric or datetime-like, default None
        Left bound for generating intervals.
    end : numeric or datetime-like, default None
        Right bound for generating intervals.
    periods : int, default None
        Number of periods to generate.
    freq : numeric, str, or DateOffset, default None
        The length of each interval. Must be consistent with the type of
        start and end, e.g. 2 for numeric, or '5H' for datetime-like.
        Default is 1 for numeric and 'D' for datetime-like.
    name : str, default None
        Name of the resulting IntervalIndex.
    closed : {'left', 'right', 'both', 'neither'}, default 'right'
        Whether the intervals are closed on the left-side, right-side, both
        or neither.

    Returns
    -------
    IntervalIndex

    See Also
    --------
    IntervalIndex : An Index of intervals that are all closed on the same
        side.

    Notes
    -----
    Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,
    exactly three must be specified. If ``freq`` is omitted, the resulting
    ``IntervalIndex`` will have ``periods`` linearly spaced elements between
    ``start`` and ``end``, inclusively.

    To learn more about datetime-like frequency strings, please see `this link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.

    Examples
    --------
    Numeric ``start`` and  ``end`` is supported.

    >>> pd.interval_range(start=0, end=5)
    IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]],
                  closed='right', dtype='interval[int64]')

    Additionally, datetime-like input is also supported.

    >>> pd.interval_range(start=pd.Timestamp('2017-01-01'),
    ...                   end=pd.Timestamp('2017-01-04'))
    IntervalIndex([(2017-01-01, 2017-01-02], (2017-01-02, 2017-01-03],
                   (2017-01-03, 2017-01-04]],
                  closed='right', dtype='interval[datetime64[ns]]')

    The ``freq`` parameter specifies the frequency between the left and right
    endpoints of the individual intervals within the ``IntervalIndex``.  For
    numeric ``start`` and ``end``, the frequency must also be numeric.

    >>> pd.interval_range(start=0, periods=4, freq=1.5)
    IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]],
                  closed='right', dtype='interval[float64]')

    Similarly, for datetime-like ``start`` and ``end``, the frequency must be
    convertible to a DateOffset.

    >>> pd.interval_range(start=pd.Timestamp('2017-01-01'),
    ...                   periods=3, freq='MS')
    IntervalIndex([(2017-01-01, 2017-02-01], (2017-02-01, 2017-03-01],
                   (2017-03-01, 2017-04-01]],
                  closed='right', dtype='interval[datetime64[ns]]')

    Specify ``start``, ``end``, and ``periods``; the frequency is generated
    automatically (linearly spaced).

    >>> pd.interval_range(start=0, end=6, periods=4)
    IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]],
                  closed='right', dtype='interval[float64]')

    The ``closed`` parameter specifies which endpoints of the individual
    intervals within the ``IntervalIndex`` are closed.

    >>> pd.interval_range(end=5, periods=4, closed='both')
    IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]],
                  closed='both', dtype='interval[int64]')
    """
    start = maybe_box_datetimelike(start)
    end = maybe_box_datetimelike(end)
    # either endpoint determines whether we are numeric or datetime-like
    endpoint = start if start is not None else end

    if freq is None and com.any_none(periods, start, end):
        freq = 1 if is_number(endpoint) else "D"

    if com.count_not_none(start, end, periods, freq) != 3:
        raise ValueError(
            "Of the four parameters: start, end, periods, and "
            "freq, exactly three must be specified"
        )

    if not _is_valid_endpoint(start):
        raise ValueError(f"start must be numeric or datetime-like, got {start}")
    elif not _is_valid_endpoint(end):
        raise ValueError(f"end must be numeric or datetime-like, got {end}")

    if is_float(periods):
        periods = int(periods)
    elif not is_integer(periods) and periods is not None:
        raise TypeError(f"periods must be a number, got {periods}")

    if freq is not None and not is_number(freq):
        try:
            freq = to_offset(freq)
        except ValueError as err:
            raise ValueError(
                f"freq must be numeric or convertible to DateOffset, got {freq}"
            ) from err

    # verify type compatibility
    if not all(
        [
            _is_type_compatible(start, end),
            _is_type_compatible(start, freq),
            _is_type_compatible(end, freq),
        ]
    ):
        raise TypeError("start, end, freq need to be type compatible")

    # +1 to convert interval count to breaks count (n breaks = n-1 intervals)
    if periods is not None:
        periods += 1

    if is_number(endpoint):
        # force consistency between start/end/freq (lower end if freq skips it)
        if com.all_not_none(start, end, freq):
            end -= (end - start) % freq

        # compute the period/start/end if unspecified (at most one)
        if periods is None:
            periods = int((end - start) // freq) + 1
        elif start is None:
            start = end - (periods - 1) * freq
        elif end is None:
            end = start + (periods - 1) * freq

        breaks = np.linspace(start, end, periods)
        if all(is_integer(x) for x in com.not_none(start, end, freq)):
            # np.linspace always produces float output
            breaks = maybe_downcast_numeric(breaks, np.dtype("int64"))
    else:
        # delegate to the appropriate range function
        if isinstance(endpoint, Timestamp):
            breaks = date_range(start=start, end=end, periods=periods,
                                freq=freq)
        else:
            breaks = timedelta_range(start=start, end=end, periods=periods,
                                     freq=freq)

    return IntervalIndex.from_breaks(breaks, name=name, closed=closed)
def test_agg_misc():
    """Equivalent aggregation specs across all three Resampler APIs and
    the ``Grouper``-backed groupby; every entry in ``cases`` must behave
    identically for each spec."""

    np.random.seed(1234)
    index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
    index.name = "date"
    df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index)
    df_col = df.reset_index()
    df_mult = df_col.copy()
    df_mult.index = pd.MultiIndex.from_arrays(
        [range(10), df.index], names=["index", "date"]
    )

    r = df.resample("2D")
    cases = [
        r,
        df_col.resample("2D", on="date"),
        df_mult.resample("2D", level="date"),
        df.groupby(pd.Grouper(freq="2D")),
    ]

    # passed lambda
    for t in cases:
        result = t.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)})
        rcustom = t["B"].apply(lambda x: np.std(x, ddof=1))
        # BUG FIX: previously the "A" column of the expected frame was built
        # from ``r`` while the "B" column came from the current case ``t``;
        # build both from the case under test for a like-for-like comparison.
        expected = pd.concat([t["A"].sum(), rcustom], axis=1)
        tm.assert_frame_equal(result, expected, check_like=True)

    # agg with renamers: the renamed columns do not exist in the frame, so a
    # SpecificationError is raised for every case.  (The unused expected
    # frame the original built here from a leaked loop variable is removed —
    # it was dead code, never reached a comparison.)
    msg = r"Column\(s\) \['result1', 'result2'\] do not exist"
    for t in cases:
        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t[["A", "B"]].agg(
                OrderedDict([("result1", np.sum), ("result2", np.mean)])
            )

    # agg with different hows
    # BUG FIX: ``expected`` was previously built from the loop variable ``t``
    # leaked out of the preceding loop (i.e. whatever case happened to run
    # last); derive it explicitly from the canonical resampler ``r``.
    expected = pd.concat(
        [r["A"].sum(), r["A"].std(), r["B"].mean(), r["B"].std()], axis=1
    )
    expected.columns = pd.MultiIndex.from_tuples(
        [("A", "sum"), ("A", "std"), ("B", "mean"), ("B", "std")]
    )
    for t in cases:
        result = t.agg(OrderedDict([("A", ["sum", "std"]), ("B", ["mean", "std"])]))
        tm.assert_frame_equal(result, expected, check_like=True)

    # equivalent of using a selection list / or not
    for t in cases:
        result = t[["A", "B"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]})
        tm.assert_frame_equal(result, expected, check_like=True)

    # series-like aggs: dict-of-list renaming on a single column is the
    # removed "nested renamer" API and must raise.
    msg = "nested renamer is not supported"
    for t in cases:
        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t["A"].agg({"A": ["sum", "std"]})

        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t["A"].agg({"A": ["sum", "std"], "B": ["mean", "std"]})

    # errors
    # invalid names in the agg specification: column "B" was excluded by the
    # selection, so referring to it raises KeyError.
    msg = "\"Column 'B' does not exist!\""
    for t in cases:
        with pytest.raises(KeyError, match=msg):
            t[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]})
from textwrap import dedent

import numpy as np

import pandas as pd
from pandas import DataFrame, Series, Timestamp
from pandas.core.indexes.datetimes import date_range
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal

# Shared fixture frame: 40 seconds of data, column A carrying three group
# labels (20/12/8 split) and column B a plain counter.
test_frame = DataFrame(
    {'A': [1] * 20 + [2] * 12 + [3] * 8, 'B': np.arange(40)},
    index=date_range('1/1/2000', freq='s', periods=40))


def test_tab_complete_ipython6_warning(ip):
    """Tab-completing a Resampler under IPython >= 6 must emit no warning."""
    from IPython.core.completer import provisionalcompleter

    setup = dedent("""\
    import pandas.util.testing as tm
    s = tm.makeTimeSeries()
    rs = s.resample("D")
    """)
    ip.run_code(setup)

    # Asking the completer for attributes of the resampler object must be
    # completely silent (assert_produces_warning(None) == "no warning").
    with tm.assert_produces_warning(None), provisionalcompleter('ignore'):
        list(ip.Completer.completions('rs.', 1))
def test_agg():
    """Aggregation specs (list, dict, dict-of-dict renamers) give the same
    result through all three Resampler construction APIs and TimeGrouper."""
    # test with all three Resampler apis and TimeGrouper

    np.random.seed(1234)
    index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq='D')
    index.name = 'date'
    df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index)
    # Same data exposed three ways: datetime index, datetime column,
    # and a MultiIndex with the datetimes in the 'date' level.
    df_col = df.reset_index()
    df_mult = df_col.copy()
    df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
                                              names=['index', 'date'])
    r = df.resample('2D')
    # Every case below must produce identical aggregation results.
    cases = [
        r,
        df_col.resample('2D', on='date'),
        df_mult.resample('2D', level='date'),
        df.groupby(pd.Grouper(freq='2D'))
    ]

    # Canonical per-column aggregates, computed once from ``r``.
    a_mean = r['A'].mean()
    a_std = r['A'].std()
    a_sum = r['A'].sum()
    b_mean = r['B'].mean()
    b_std = r['B'].std()
    b_sum = r['B'].sum()

    # list of functions -> (column, func-name) MultiIndex columns
    expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
    expected.columns = pd.MultiIndex.from_product([['A', 'B'],
                                                   ['mean', 'std']])
    for t in cases:
        result = t.aggregate([np.mean, np.std])
        assert_frame_equal(result, expected)

    # dict column -> single func keeps flat columns
    expected = pd.concat([a_mean, b_std], axis=1)
    for t in cases:
        result = t.aggregate({'A': np.mean,
                              'B': np.std})
        assert_frame_equal(result, expected, check_like=True)

    # dict column -> list of funcs gives MultiIndex columns
    expected = pd.concat([a_mean, a_std], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
                                                  ('A', 'std')])
    for t in cases:
        result = t.aggregate({'A': ['mean', 'std']})
        assert_frame_equal(result, expected)

    # list of funcs on a single selected column: flat func-name columns
    expected = pd.concat([a_mean, a_sum], axis=1)
    expected.columns = ['mean', 'sum']
    for t in cases:
        result = t['A'].aggregate(['mean', 'sum'])
        assert_frame_equal(result, expected)

    # nested dict renamer (deprecated API) still works but warns
    expected = pd.concat([a_mean, a_sum], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
                                                  ('A', 'sum')])
    for t in cases:
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}})
        assert_frame_equal(result, expected, check_like=True)

    # nested dict renamer over both columns, custom output names
    expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
                                                  ('A', 'sum'),
                                                  ('B', 'mean2'),
                                                  ('B', 'sum2')])
    for t in cases:
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = t.aggregate({
                'A': {
                    'mean': 'mean',
                    'sum': 'sum'
                },
                'B': {
                    'mean2': 'mean',
                    'sum2': 'sum'
                }
            })
        assert_frame_equal(result, expected, check_like=True)

    # dict column -> list of funcs over both columns
    expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A', 'std'),
                                                  ('B', 'mean'), ('B', 'std')])
    for t in cases:
        result = t.aggregate({'A': ['mean', 'std'],
                              'B': ['mean', 'std']})
        assert_frame_equal(result, expected, check_like=True)

    # NOTE(review): this final expected frame (3-level 'r1'/'r2' columns) is
    # built but never asserted against anything in this chunk — the function
    # appears truncated here; confirm the comparison exists in the full file.
    expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([('r1', 'A', 'mean'),
                                                  ('r1', 'A', 'sum'),
                                                  ('r2', 'B', 'mean'),
                                                  ('r2', 'B', 'sum')])
from datetime import datetime
from operator import methodcaller

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame, Panel, Series
from pandas.core.indexes.datetimes import date_range
from pandas.core.resample import TimeGrouper
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal

# Shared fixture: 1000 days of random data starting 2000-01-01.
test_series = Series(np.random.randn(1000),
                     index=date_range('1/1/2000', periods=1000))


def test_apply():
    """Applying a function through a yearly TimeGrouper matches a plain
    groupby on the year."""
    # pd.TimeGrouper is the deprecated public alias, hence the warning check.
    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        grouper = pd.TimeGrouper(freq='A', label='right', closed='right')

    grouped = test_series.groupby(grouper)

    def f(x):
        # Top three values within each group.
        return x.sort_values()[-3:]

    applied = grouped.apply(f)
    expected = test_series.groupby(lambda x: x.year).apply(f)

    # Drop the (differently-typed) group level so both sides share an index.
    applied.index = applied.index.droplevel(0)
    expected.index = expected.index.droplevel(0)
    # NOTE(review): no assertion comparing ``applied`` and ``expected`` is
    # visible in this chunk — the function appears truncated; confirm an
    # assert_series_equal follows in the full file.
def test_string_index_alias_tz_aware(self, tz): rng = date_range("1/1/2000", periods=10, tz=tz) ser = Series(np.random.randn(len(rng)), index=rng) result = ser["1/3/2000"] tm.assert_almost_equal(result, ser[2])