def test_resample_with_nat(): # GH 13020 index = DatetimeIndex([pd.NaT, '1970-01-01 00:00:00', pd.NaT, '1970-01-01 00:00:01', '1970-01-01 00:00:02']) frame = DataFrame([2, 3, 5, 7, 11], index=index) index_1s = DatetimeIndex(['1970-01-01 00:00:00', '1970-01-01 00:00:01', '1970-01-01 00:00:02']) frame_1s = DataFrame([3, 7, 11], index=index_1s) assert_frame_equal(frame.resample('1s').mean(), frame_1s) index_2s = DatetimeIndex(['1970-01-01 00:00:00', '1970-01-01 00:00:02']) frame_2s = DataFrame([5, 11], index=index_2s) assert_frame_equal(frame.resample('2s').mean(), frame_2s) index_3s = DatetimeIndex(['1970-01-01 00:00:00']) frame_3s = DataFrame([7], index=index_3s) assert_frame_equal(frame.resample('3s').mean(), frame_3s) assert_frame_equal(frame.resample('60s').mean(), frame_3s)
def test_resample_median_bug_1688(self):
    # GH 1688: with at most one observation per bin, resampling with a
    # callable or with 'median' should equal a plain asfreq.
    # NOTE(review): uses the pre-0.18 ``resample(..., how=...)`` API that was
    # removed from pandas; kept byte-identical as a historical regression test.
    df = DataFrame([1, 2],
                   index=[datetime(2012, 1, 1, 0, 0, 0),
                          datetime(2012, 1, 1, 0, 5, 0)])

    result = df.resample("T", how=lambda x: x.mean())
    exp = df.asfreq("T")
    tm.assert_frame_equal(result, exp)

    result = df.resample("T", how="median")
    exp = df.asfreq("T")
    tm.assert_frame_equal(result, exp)
def test_raises_on_non_datetimelike_index():
    """Resampling a frame with a plain (non-datetimelike) index must raise."""
    empty_frame = DataFrame()
    expected_msg = (
        "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex,"
        " but got an instance of 'Index'"
    )
    with pytest.raises(TypeError, match=expected_msg):
        empty_frame.resample('A').mean()
def test_evenly_divisible_with_no_extra_bins(self):
    # GH 4076: when the frequency divides the span evenly, resampling
    # sometimes produced extra (empty) trailing bins.
    # NOTE(review): expected indexes here carry no freq attribute, which
    # assert_frame_equal started checking in later pandas; historical test.
    df = DataFrame(np.random.randn(9, 3),
                   index=date_range('2000-1-1', periods=9))
    result = df.resample('5D').mean()
    expected = pd.concat(
        [df.iloc[0:5].mean(), df.iloc[5:].mean()], axis=1).T
    expected.index = [Timestamp('2000-1-1'), Timestamp('2000-1-6')]
    assert_frame_equal(result, expected)

    index = date_range(start='2001-5-4', periods=28)
    df = DataFrame(
        [{'REST_KEY': 1, 'DLY_TRN_QT': 80, 'DLY_SLS_AMT': 90,
          'COOP_DLY_TRN_QT': 30, 'COOP_DLY_SLS_AMT': 20}] * 28 +
        [{'REST_KEY': 2, 'DLY_TRN_QT': 70, 'DLY_SLS_AMT': 10,
          'COOP_DLY_TRN_QT': 50, 'COOP_DLY_SLS_AMT': 20}] * 28,
        index=index.append(index)).sort_index()

    index = date_range('2001-5-4', periods=4, freq='7D')
    # two interleaved keys, 7 days each -> 14 observations per weekly bin
    expected = DataFrame(
        [{'REST_KEY': 14, 'DLY_TRN_QT': 14, 'DLY_SLS_AMT': 14,
          'COOP_DLY_TRN_QT': 14, 'COOP_DLY_SLS_AMT': 14}] * 4,
        index=index)
    result = df.resample('7D').count()
    assert_frame_equal(result, expected)

    expected = DataFrame(
        [{'REST_KEY': 21, 'DLY_TRN_QT': 1050, 'DLY_SLS_AMT': 700,
          'COOP_DLY_TRN_QT': 560, 'COOP_DLY_SLS_AMT': 280}] * 4,
        index=index)
    result = df.resample('7D').sum()
    assert_frame_equal(result, expected)
def test_resample_weekly_bug_1726(self):
    # GH 1726: weekly resampling with closed='left'/label='left' used to raise.
    # NOTE(review): uses the removed ``DatetimeIndex(start=..., end=...)``
    # constructor and the pre-0.18 ``how=`` keyword; historical test.
    # 8/6/12 is a Monday
    ind = DatetimeIndex(start="8/6/2012", end="8/26/2012", freq="D")
    n = len(ind)
    data = [[x] * 5 for x in range(n)]
    df = DataFrame(data,
                   columns=["open", "high", "low", "close", "vol"],
                   index=ind)

    # it works!
    df.resample("W-MON", how="first", closed="left", label="left")
def test_default_left_closed_label(self):
    # Start-anchored target frequencies should default to closed='left',
    # label='left': compare the bare resample against the explicit form.
    # NOTE(review): removed ``DatetimeIndex(start=...)`` constructor and the
    # pre-0.18 behaviour of ``resample`` returning the result directly.
    others = ["MS", "AS", "QS", "D", "H"]
    others_freq = ["D", "Q", "M", "H", "T"]

    for from_freq, to_freq in zip(others_freq, others):
        idx = DatetimeIndex(start="8/15/2012", periods=100, freq=from_freq)
        df = DataFrame(np.random.randn(len(idx), 2), idx)

        resampled = df.resample(to_freq)
        assert_frame_equal(resampled,
                           df.resample(to_freq, closed="left", label="left"))
def test_resample_unequal_times(self):
    # GH 1772: an index whose final day ends at an earlier hour than the
    # start hour should still resample without error.
    # NOTE(review): pre-0.18 positional ``how`` argument; historical test.
    start = datetime(1999, 3, 1, 5)
    # end hour is less than start
    end = datetime(2012, 7, 31, 4)
    bad_ind = date_range(start, end, freq="30min")
    df = DataFrame({'close': 1}, index=bad_ind)

    # it works!
    df.resample('AS', 'sum')
def test_resample_weekly_bug_1726(self):
    # GH 1726: weekly resample with how='first' and left closed/label.
    # NOTE(review): pre-0.18 ``how=`` API and the removed
    # ``DatetimeIndex(start=...)`` constructor; single-quoted duplicate of
    # the variant elsewhere in this file.
    # 8/6/12 is a Monday
    ind = DatetimeIndex(start="8/6/2012", end="8/26/2012", freq="D")
    n = len(ind)
    data = [[x] * 5 for x in range(n)]
    df = DataFrame(data,
                   columns=['open', 'high', 'low', 'close', 'vol'],
                   index=ind)

    # it works!
    df.resample('W-MON', how='first', closed='left', label='left')
def test_default_right_closed_label(self):
    # End-anchored target frequencies should default to closed='right',
    # label='right'.
    # NOTE(review): removed ``DatetimeIndex(start=...)`` constructor and
    # pre-0.18 direct-result ``resample``; historical test.
    end_freq = ["D", "Q", "M", "D"]
    end_types = ["M", "A", "Q", "W"]

    for from_freq, to_freq in zip(end_freq, end_types):
        idx = DatetimeIndex(start="8/15/2012", periods=100, freq=from_freq)
        df = DataFrame(np.random.randn(len(idx), 2), idx)

        resampled = df.resample(to_freq)
        assert_frame_equal(resampled,
                           df.resample(to_freq, closed="right", label="right"))
def test_default_left_closed_label(self): others = ['MS', 'AS', 'QS', 'D', 'H'] others_freq = ['D', 'Q', 'M', 'H', 'T'] for from_freq, to_freq in zip(others_freq, others): idx = date_range(start='8/15/2012', periods=100, freq=from_freq) df = DataFrame(np.random.randn(len(idx), 2), idx) resampled = df.resample(to_freq).mean() assert_frame_equal(resampled, df.resample(to_freq, closed='left', label='left').mean())
def test_selection(self, index, freq, kind):
    """Resampling with on=/level= on a MultiIndexed frame is unimplemented (GH 14008)."""
    # This is a bug, these should be implemented
    counter = np.arange(len(index), dtype=np.int64)
    frame = DataFrame(
        {'date': index, 'a': counter},
        index=pd.MultiIndex.from_arrays([counter, index],
                                        names=['v', 'd']))

    with pytest.raises(NotImplementedError):
        frame.resample(freq, on='date', kind=kind)
    with pytest.raises(NotImplementedError):
        frame.resample(freq, level='d', kind=kind)
def test_default_right_closed_label(self): end_freq = ['D', 'Q', 'M', 'D'] end_types = ['M', 'A', 'Q', 'W'] for from_freq, to_freq in zip(end_freq, end_types): idx = date_range(start='8/15/2012', periods=100, freq=from_freq) df = DataFrame(np.random.randn(len(idx), 2), idx) resampled = df.resample(to_freq).mean() assert_frame_equal(resampled, df.resample(to_freq, closed='right', label='right').mean())
def test_selection(self, index, freq, kind, kwargs):
    """level=/on= selection with a PeriodIndex raises NotImplementedError (GH 14008)."""
    # This is a bug, these should be implemented
    counter = np.arange(len(index), dtype=np.int64)
    frame = DataFrame(
        {'date': index, 'a': counter},
        index=pd.MultiIndex.from_arrays([counter, index],
                                        names=['v', 'd']))

    msg = ("Resampling from level= or on= selection with a PeriodIndex is"
           r" not currently supported, use \.set_index\(\.\.\.\) to"
           " explicitly set index")
    with pytest.raises(NotImplementedError, match=msg):
        frame.resample(freq, kind=kind, **kwargs)
def test_resample_median_bug_1688(self):
    # GH 1688: with at most one observation per bin, resampling with a
    # callable or 'median' should equal asfreq, for every numeric dtype.
    # NOTE(review): pre-0.18 ``resample(..., how=...)`` API; historical test.
    for dtype in ["int64", "int32", "float64", "float32"]:
        df = DataFrame([1, 2],
                       index=[datetime(2012, 1, 1, 0, 0, 0),
                              datetime(2012, 1, 1, 0, 5, 0)],
                       dtype=dtype)

        result = df.resample("T", how=lambda x: x.mean())
        exp = df.asfreq("T")
        tm.assert_frame_equal(result, exp)

        result = df.resample("T", how="median")
        exp = df.asfreq("T")
        tm.assert_frame_equal(result, exp)
def test_resample_median_bug_1688(): for dtype in ['int64', 'int32', 'float64', 'float32']: df = DataFrame([1, 2], index=[datetime(2012, 1, 1, 0, 0, 0), datetime(2012, 1, 1, 0, 5, 0)], dtype=dtype) result = df.resample("T").apply(lambda x: x.mean()) exp = df.asfreq('T') tm.assert_frame_equal(result, exp) result = df.resample("T").median() exp = df.asfreq('T') tm.assert_frame_equal(result, exp)
def test_resample_anchored_intraday(self):
    # GH 1471, GH 1458: anchored monthly/quarterly resampling of intraday
    # data should match the period-based path converted back to timestamps.
    # NOTE(review): pre-0.18 direct-result ``resample``, file-scope helper
    # ``_simple_ts`` and the removed unittest alias ``self.assert_``;
    # historical test.
    rng = date_range('1/1/2012', '4/1/2012', freq='10min')
    df = DataFrame(rng.month, index=rng)

    result = df.resample('M')
    expected = df.resample('M', kind='period').to_timestamp()
    tm.assert_frame_equal(result, expected)

    result = df.resample('M', closed='left')
    expected = df.resample('M', kind='period',
                           closed='left').to_timestamp()
    tm.assert_frame_equal(result, expected)

    rng = date_range('1/1/2012', '4/1/2013', freq='10min')
    df = DataFrame(rng.month, index=rng)

    result = df.resample('Q')
    expected = df.resample('Q', kind='period').to_timestamp()
    tm.assert_frame_equal(result, expected)

    result = df.resample('Q', closed='left')
    expected = df.resample('Q', kind='period',
                           closed='left').to_timestamp()
    tm.assert_frame_equal(result, expected)

    # six intraday hours still collapse into exactly one monthly bin
    ts = _simple_ts('2012-04-29 23:00', '2012-04-30 5:00', freq='h')
    resampled = ts.resample('M')
    self.assert_(len(resampled) == 1)
def test_try_aggregate_non_existing_column():
    """Aggregating a column that does not exist raises KeyError (GH 16766)."""
    rows = [
        {'dt': datetime(2017, 6, 1, 0), 'x': 1.0, 'y': 2.0},
        {'dt': datetime(2017, 6, 1, 1), 'x': 2.0, 'y': 2.0},
        {'dt': datetime(2017, 6, 1, 2), 'x': 3.0, 'y': 1.5},
    ]
    frame = DataFrame(rows).set_index('dt')

    # Error as we don't have 'z' column
    with pytest.raises(KeyError):
        frame.resample('30T').agg({'x': ['mean'], 'y': ['median'],
                                   'z': ['sum']})
def slide14():
    # Demo for slide 14: calendar resampling of a weekly frame, then of a
    # PeriodIndex frame.
    # NOTE(review): Python 2 print statements and the removed
    # ``fill_method=`` / ``how=`` resample keywords; left byte-identical.
    frame = DataFrame(np.random.randn(2, 4),
                      index=pd.date_range('1/1/2000', periods=2,
                                          freq='W-WED'),
                      columns=['Colorado', 'Texas', 'New York', 'Ohio'])
    print frame[:5]
    df_daily = frame.resample('D')
    print 'daily fill_method=none'
    print df_daily
    print 'daily fill_method=ffill'
    print frame.resample('D', fill_method='ffill')
    print 'daily fill_method=ffill limit=2'
    print frame.resample('D', fill_method='ffill', limit=2)
    print frame.resample('W-THU', fill_method='ffill')
    print 'resampling with periods'
    frame = DataFrame(np.random.randn(24, 4),
                      index=pd.period_range('1-2000', '12-2001', freq='M'),
                      columns=['Colorado', 'Texas', 'New York', 'Ohio'])
    print frame[:5]
    annual_frame = frame.resample('A-DEC', how='mean')
    print annual_frame
    print 'resample Quarterly'
    print annual_frame.resample('Q-DEC', fill_method='ffill')
    print annual_frame.resample('Q-DEC', fill_method='ffill',
                                convention='start')
def test_resample_axis1(self):
    # Resampling along axis=1 (columns) should equal transposing,
    # resampling the rows, and transposing back.
    # NOTE(review): pre-0.18 direct-result ``resample``; axis=1 resampling
    # has since been removed from pandas. Historical test.
    rng = date_range("1/1/2000", "2/29/2000")
    df = DataFrame(np.random.randn(3, len(rng)),
                   columns=rng,
                   index=["a", "b", "c"])

    result = df.resample("M", axis=1)
    expected = df.T.resample("M").T
    tm.assert_frame_equal(result, expected)
def test_resample_across_dst():
    """Hourly resampling across a DST change keeps both wall-clock bins (GH 14682)."""
    # The DatetimeIndex we will start with
    # (note that DST happens at 03:00+02:00 -> 02:00+01:00)
    # 2016-10-30 02:23:00+02:00, 2016-10-30 02:23:00+01:00
    raw_in = DataFrame([1477786980, 1477790580], columns=['ts'])
    idx_in = DatetimeIndex(pd.to_datetime(raw_in.ts, unit='s')
                           .dt.tz_localize('UTC')
                           .dt.tz_convert('Europe/Madrid'))

    # The expected DatetimeIndex after resampling.
    # 2016-10-30 02:00:00+02:00, 2016-10-30 02:00:00+01:00
    raw_out = DataFrame([1477785600, 1477789200], columns=['ts'])
    idx_out = DatetimeIndex(pd.to_datetime(raw_out.ts, unit='s')
                            .dt.tz_localize('UTC')
                            .dt.tz_convert('Europe/Madrid'))

    result = DataFrame([5, 5], index=idx_in).resample(rule='H').sum()
    assert_frame_equal(result, DataFrame([5, 5], index=idx_out))
def get_date_trend(self, mode_date):
    """Aggregate the raw per-date values into a date-trend JSON structure.

    :param mode_date: date granularity to merge down to:
        0 - day, 1 - week, 2 - month, 3 - quarter (default 2).
    """
    # NOTE(review): Python 2 era code (``map`` used as a list,
    # ``iteritems``) and the removed ``resample(..., how='sum')`` API;
    # relies on self.oriDate / self.oriValues set elsewhere. Left
    # byte-identical.
    axisLabels = self.oriDate[:]
    # one {value: 1} dict per observation, so resample-sum counts occurrences
    pointVals = [{copy.deepcopy(oriValue): 1} for oriValue in self.oriValues]
    rule_mode = {'0': 'D', '1': 'W', '2': 'M', '3': 'Q'}

    df = DataFrame(pointVals, index=axisLabels)
    df = df.resample(rule_mode[str(mode_date)], how='sum')
    df = df.fillna(0)

    # Row totals (kept for reference):
    # cols_name = []
    # for name, col in df.iteritems():
    #     cols_name.append(name)
    # df['SUM'] = 0
    # for i in xrange(len(cols_name)):
    #     df['SUM'] += df[cols_name[i]]

    # Dormitory share (kept for reference):
    # df['PER_DORM'] = df['dorm']/df['SUM'] if 'dorm' in df else 0
    # (only computed when a 'dorm' value exists, otherwise 0)

    # pull the resampled date labels back out of the index
    axisLabels = map(lambda x: x.strftime('%Y-%m-%d'), df.index.tolist())
    seriesData = []
    legendLabels = []
    for colName, col in df.iteritems():
        legendLabels.append(colName)
        data = map(lambda x: 0.0 if isnan(x) else float(x), col.tolist())
        seriesData.append({'name': colName, 'data': data})
    json_dateTrend = {'axisLabels': axisLabels,
                      'legendLabels': legendLabels,
                      'seriesData': seriesData}
    return json_dateTrend
def test_annual_upsample(self):
    # Upsampling an annual PeriodIndex series to D/B/M with ffill/bfill and
    # both conventions should match asfreq on the timestamp round-trip.
    # NOTE(review): pre-0.18 ``fill_method=`` API plus file-scope helpers
    # ``_simple_pts`` and ``MONTHS``; historical test.
    targets = ["D", "B", "M"]
    for month in MONTHS:
        ts = _simple_pts("1/1/1990", "12/31/1995", freq="A-%s" % month)

        for targ, conv, meth in product(targets, ["start", "end"],
                                        ["ffill", "bfill"]):
            result = ts.resample(targ, fill_method=meth, convention=conv)
            expected = result.to_timestamp(targ, how=conv)
            expected = expected.asfreq(targ, meth).to_period()
            assert_series_equal(result, expected)

    # frame resample agrees with resampling the lone column
    df = DataFrame({"a": ts})
    rdf = df.resample("D", fill_method="ffill")
    exp = df["a"].resample("D", fill_method="ffill")
    assert_series_equal(rdf["a"], exp)

    rng = period_range("2000", "2003", freq="A-DEC")
    ts = Series([1, 2, 3, 4], index=rng)
    result = ts.resample("M", fill_method="ffill")
    ex_index = period_range("2000-01", "2003-12", freq="M")
    expected = ts.asfreq("M", how="start").reindex(ex_index,
                                                   method="ffill")
    assert_series_equal(result, expected)
def test_subset(self):
    # DataFrame.asof with a column subset should match the full-frame asof
    # whenever the subset determines the same rows.
    # NOTE(review): ``df.loc[4:8, 'A']`` positional-slices a DatetimeIndex,
    # which only worked in older pandas; historical test.
    N = 10
    rng = date_range('1/1/1990', periods=N, freq='53s')
    df = DataFrame({'A': np.arange(N), 'B': np.arange(N)}, index=rng)
    df.loc[4:8, 'A'] = np.nan
    dates = date_range('1/1/1990', periods=N * 3, freq='25s')

    # with a subset of A should be the same
    result = df.asof(dates, subset='A')
    expected = df.asof(dates)
    tm.assert_frame_equal(result, expected)

    # same with A/B
    result = df.asof(dates, subset=['A', 'B'])
    expected = df.asof(dates)
    tm.assert_frame_equal(result, expected)

    # B gives self.df.asof
    result = df.asof(dates, subset='B')
    expected = df.resample('25s', closed='right').ffill().reindex(dates)
    expected.iloc[20:] = 9
    tm.assert_frame_equal(result, expected)
def test_annual_upsample(self):
    # Annual -> D/B/M upsample via fill_method matches the asfreq timestamp
    # round-trip.
    # NOTE(review): pre-0.18 ``fill_method=`` API; single-quoted duplicate
    # of the variant elsewhere in this file.
    targets = ['D', 'B', 'M']
    for month in MONTHS:
        ts = _simple_pts('1/1/1990', '12/31/1995', freq='A-%s' % month)

        for targ, conv, meth in product(targets, ['start', 'end'],
                                        ['ffill', 'bfill']):
            result = ts.resample(targ, fill_method=meth, convention=conv)
            expected = result.to_timestamp(targ, how=conv)
            expected = expected.asfreq(targ, meth).to_period()
            assert_series_equal(result, expected)

    # frame resample agrees with resampling the lone column
    df = DataFrame({'a': ts})
    rdf = df.resample('D', fill_method='ffill')
    exp = df['a'].resample('D', fill_method='ffill')
    assert_series_equal(rdf['a'], exp)

    rng = period_range('2000', '2003', freq='A-DEC')
    ts = Series([1, 2, 3, 4], index=rng)
    result = ts.resample('M', fill_method='ffill')
    ex_index = period_range('2000-01', '2003-12', freq='M')
    expected = ts.asfreq('M', how='start').reindex(ex_index,
                                                   method='ffill')
    assert_series_equal(result, expected)
def test_resample_with_only_nat(self):
    """A PeriodIndex made up entirely of NaT resamples to an empty frame (GH 13224)."""
    all_nat = PeriodIndex([pd.NaT] * 3, freq='S')
    frame = DataFrame([2, 3, 5], index=all_nat)

    expected_index = PeriodIndex(data=[], freq=all_nat.freq)
    expected = DataFrame([], index=expected_index)

    assert_frame_equal(frame.resample('1s').mean(), expected)
def test_resample_axis1(self):
    # Column-wise (axis=1) resample equals transpose/resample/transpose.
    # NOTE(review): pre-0.18 direct-result ``resample``; axis=1 resampling
    # was later removed from pandas. Single-quoted duplicate of the variant
    # elsewhere in this file.
    rng = date_range('1/1/2000', '2/29/2000')
    df = DataFrame(np.random.randn(3, len(rng)),
                   columns=rng,
                   index=['a', 'b', 'c'])

    result = df.resample('M', axis=1)
    expected = df.T.resample('M').T
    tm.assert_frame_equal(result, expected)
def test_asfreq_bug():
    """resample().asfreq() on a TimedeltaIndex inserts NaN for empty bins."""
    frame = DataFrame(data=[1, 3],
                      index=[timedelta(), timedelta(minutes=3)])

    upsampled = frame.resample('1T').asfreq()

    expected = DataFrame(data=[1, np.nan, np.nan, 3],
                         index=timedelta_range('0 day', periods=4,
                                               freq='1T'))
    assert_frame_equal(upsampled, expected)
def save_to_file(self, fn):
    """Dump the per-minute power / location-event / consumption table to CSV ``fn``.

    NOTE(review): depends on self.power_series_apps_table and self.loc
    (project objects) and the legacy direct-result ``resample('1Min')``
    API; the channel pairing below is REDD-dataset specific. Left
    byte-identical.
    """
    gg = DataFrame(self.power_series_apps_table)
    try:
        del gg['diff1']
        del gg['diff2']
    except Exception:
        print('')
    gg['Loc Events'] = self.loc.events_apps_1min['Apps']
    apps = self.loc.metadata.get_channels()
    sd = {}
    #Initialize series with 0s
    for app in apps:
        sd[app] = Series(0, index=gg.index)
    #Count location events for each appliance
    for index, row in gg.iterrows():
        try:
            if len(row['Loc Events']) > 0:
                for app in apps:
                    n = row['Loc Events'].count(app)
                    sd[app][index] = n
        except Exception:
            continue
    if self.loc.name == 'REDD':
        # REDD merges channel pairs (3,4) and (10,20) into single appliances
        sd[(3,4)] = sd[3]
        sd[(10,20)] = sd[10]
        del sd[3]
        del sd[4]
        del sd[10]
        del sd[20]
    #Change column names and append them to gral table
    locevents = DataFrame(sd)
    locevents.columns = [(str(col) + ' locEv') for col in locevents]
    for locEv in locevents:
        gg[locEv] = locevents[locEv]
    #Get power values of each appliance and resample for 1min
    act = DataFrame(self.loc.appliances_consuming_times)
    act = act.resample('1Min')
    if self.loc.name == 'REDD':
        del act[3]
        del act[10]
        act.columns = [(3,4), 5,6,7,8,9,11,12,13,14,15,16,17,18,19,(10,20)]
    act.columns = [(str(col) + ' conEv') for col in act]
    for app in act:
        gg[app] = act[app]
    # stringify and sort columns so the CSV layout is deterministic
    gg.columns = [str(col) for col in gg]
    gg = gg[sorted(gg.columns)]
    gg.to_csv(fn)
    return
def test_resample_with_nat(self, periods, values, freq, expected_values): # GH 13224 index = PeriodIndex(periods, freq='S') frame = DataFrame(values, index=index) expected_index = period_range('1970-01-01 00:00:00', periods=len(expected_values), freq=freq) expected = DataFrame(expected_values, index=expected_index) result = frame.resample(freq).mean() assert_frame_equal(result, expected)
def test_resample_extra_index_point():
    """Business-month resample must not emit an extra trailing bin (GH 9756)."""
    expected_index = date_range(start='20150101', end='20150331', freq='BM')
    expected = DataFrame({'A': Series([21, 41, 63], index=expected_index)})

    daily_index = date_range(start='20150101', end='20150331', freq='B')
    frame = DataFrame({'A': Series(range(len(daily_index)),
                                   index=daily_index)}, dtype='int64')

    assert_frame_equal(frame.resample('BM').last(), expected)
def test_annual_upsample(self, simple_period_range_series):
    """Annual -> daily/monthly upsample with ffill matches asfreq + reindex."""
    annual = simple_period_range_series("1/1/1990", "12/31/1995", freq="A-DEC")
    frame = DataFrame({"a": annual})

    # resampling the frame agrees with resampling the lone column
    upsampled_frame = frame.resample("D").ffill()
    upsampled_col = frame["a"].resample("D").ffill()
    tm.assert_series_equal(upsampled_frame["a"], upsampled_col)

    rng = period_range("2000", "2003", freq="A-DEC")
    annual_series = Series([1, 2, 3, 4], index=rng)
    result = annual_series.resample("M").ffill()

    ex_index = period_range("2000-01", "2003-12", freq="M")
    expected = annual_series.asfreq("M", how="start").reindex(ex_index,
                                                              method="ffill")
    tm.assert_series_equal(result, expected)
def test_annual_upsample(self, simple_period_range_series):
    """Annual series upsampled daily/monthly with ffill (single-quoted variant)."""
    annual = simple_period_range_series('1/1/1990', '12/31/1995', freq='A-DEC')
    frame = DataFrame({'a': annual})

    # resampling the frame agrees with resampling the lone column
    upsampled_frame = frame.resample('D').ffill()
    upsampled_col = frame['a'].resample('D').ffill()
    assert_series_equal(upsampled_frame['a'], upsampled_col)

    rng = period_range('2000', '2003', freq='A-DEC')
    annual_series = Series([1, 2, 3, 4], index=rng)
    result = annual_series.resample('M').ffill()

    ex_index = period_range('2000-01', '2003-12', freq='M')
    expected = annual_series.asfreq('M', how='start').reindex(ex_index,
                                                              method='ffill')
    assert_series_equal(result, expected)
def test_resample_timedelta_values():
    """Timedelta dtype survives resampling that introduces NaT (GH 13119)."""
    source_index = timedelta_range("1 day", "6 day", freq="4D")
    frame = DataFrame({"time": source_index}, index=source_index)

    # the middle 2-day bin is empty, so it must come back as NaT
    dense_index = timedelta_range("1 day", "6 day", freq="2D")
    expected = Series(dense_index, index=dense_index, name="time")
    expected.iloc[1] = pd.NaT

    tm.assert_series_equal(frame.resample("2D").first()["time"], expected)
    tm.assert_series_equal(frame["time"].resample("2D").first(), expected)
def test_resample_categorical_data_with_timedeltaindex():
    """Categorical columns survive a modal aggregation over a TimedeltaIndex (GH 12169)."""
    frame = DataFrame({"Group_obj": "A"},
                      index=pd.to_timedelta(list(range(20)), unit="s"))
    frame["Group"] = frame["Group_obj"].astype("category")

    # most-frequent value per 10-second bucket
    result = frame.resample("10s").agg(lambda x: (x.value_counts().index[0]))

    expected = DataFrame(
        {"Group_obj": ["A", "A"], "Group": ["A", "A"]},
        index=pd.TimedeltaIndex([0, 10], unit="s", freq="10s"),
    )
    expected = expected.reindex(["Group_obj", "Group"], axis=1)
    expected["Group"] = expected["Group_obj"]
    tm.assert_frame_equal(result, expected)
def test_resample_timedelta_values():
    """Timedelta dtype is preserved when resampling introduces NaT (GH 13119).

    The source range must span more than one output bin: '1 day'..'6 day'
    at 4-day steps gives points at 1 and 5 days, so the middle 2-day bin is
    genuinely empty. With the previous '4 day' end-point the source held a
    single row / single bin while the expected answer had two bins, so the
    assertions could never pass.
    """
    times = timedelta_range('1 day', '6 day', freq='4D')
    df = DataFrame({'time': times}, index=times)

    times2 = timedelta_range('1 day', '6 day', freq='2D')
    exp = Series(times2, index=times2, name='time')
    exp.iloc[1] = pd.NaT  # the empty middle bin

    res = df.resample('2D').first()['time']
    tm.assert_series_equal(res, exp)
    res = df['time'].resample('2D').first()
    tm.assert_series_equal(res, exp)
def test_resample_with_timedeltas():
    """Summing 30-minute bins over a TimedeltaIndex matches a plain groupby."""
    expected = DataFrame({"A": np.arange(1480)})
    expected = expected.groupby(expected.index // 30).sum()
    expected.index = pd.timedelta_range("0 days", freq="30T", periods=50)

    frame = DataFrame({"A": np.arange(1480)},
                      index=pd.to_timedelta(np.arange(1480), unit="T"))

    frame_result = frame.resample("30T").sum()
    tm.assert_frame_equal(frame_result, expected)

    series_result = frame["A"].resample("30T").sum()
    tm.assert_series_equal(series_result, expected["A"])
def test_resample_datetime_values():
    """Datetime dtype is preserved when resampling introduces NaT (GH 13119)."""
    dates = [datetime(2016, 1, 15), datetime(2016, 1, 19)]
    frame = DataFrame({'timestamp': dates}, index=dates)

    # the empty middle 2-day bin must come back as NaT
    exp = Series([datetime(2016, 1, 15), pd.NaT, datetime(2016, 1, 19)],
                 index=date_range('2016-01-15', periods=3, freq='2D'),
                 name='timestamp')

    tm.assert_series_equal(frame.resample('2D').first()['timestamp'], exp)
    tm.assert_series_equal(frame['timestamp'].resample('2D').first(), exp)
def apply_charting_to_df(
    df: pd.DataFrame, chart_period: str, start_time: str, stop_time: str
):
    """Modifies the dataframe based on the chart_period, start dates and end dates

    Parameters
    ----------
        df: dataframe with data loaded
        chart_period: string, describes how often to sample data, default is '1Min' (1 minute)
            see https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
        start_time: datestring in YYYY-MM-DD HH:MM (ex. 2020-08-31 04:00) of
            when to begin the backtest
        stop_time: datestring of YYYY-MM-DD HH:MM when to stop the backtest

    Returns DataFrame, a sorted dataframe ready for consumption by run_backtest
    """
    # If the index is not already datetime, a 'date' column is required to
    # build one. detect_time_unit is a project helper (not visible here);
    # presumably it guesses seconds vs milliseconds -- TODO confirm.
    if df.index.dtype != "datetime64[ns]":
        headers = df.columns.values.tolist()
        headers.extend([df.index.name])
        if "date" not in headers:
            raise Exception(
                "Data does not have a date column. Headers must include date, open, high, low, close, volume."
            )
        time_unit = detect_time_unit(df.date[1])
        df.date = pd.to_datetime(df.date, unit=time_unit)
        df.set_index("date", inplace=True)
    # Normalise datetime / epoch-int bounds to the string form used for
    # label-based slicing below.
    if start_time:
        if isinstance(start_time, datetime) or type(start_time) is int:
            time_unit = detect_time_unit(start_time)
            start_time = pd.to_datetime(start_time, unit=time_unit)
            start_time = start_time.strftime("%Y-%m-%d %H:%M:%S")
    if stop_time:
        if isinstance(stop_time, datetime) or type(stop_time) is int:
            time_unit = detect_time_unit(stop_time)
            stop_time = pd.to_datetime(stop_time, unit=time_unit)
            stop_time = stop_time.strftime("%Y-%m-%d %H:%M:%S")
    # Downsample to the requested chart period, then crop to the window.
    df = df.resample(chart_period).first()
    if start_time and stop_time:
        df = df[start_time:stop_time]  # noqa
    elif start_time and not stop_time:
        df = df[start_time:]  # noqa
    elif not start_time and stop_time:
        df = df[:stop_time]
    return df
def test_annual_upsample(self):
    # Annual -> D/B/M upsample via fill_method matches the asfreq timestamp
    # round-trip; the frame resample should agree with resampling its only
    # column.
    # NOTE(review): pre-0.18 ``fill_method=`` API plus file-scope helpers
    # ``_simple_pts`` and ``MONTHS``; shorter duplicate of the variant
    # elsewhere in this file.
    targets = ['D', 'B', 'M']
    for month in MONTHS:
        ts = _simple_pts('1/1/1990', '12/31/1995', freq='A-%s' % month)

        for targ, conv, meth in product(targets, ['start', 'end'],
                                        ['ffill', 'bfill']):
            result = ts.resample(targ, fill_method=meth, convention=conv)
            expected = result.to_timestamp(targ, how=conv)
            expected = expected.asfreq(targ, meth).to_period()
            assert_series_equal(result, expected)

    df = DataFrame({'a': ts})
    rdf = df.resample('D', fill_method='ffill')
    exp = df['a'].resample('D', fill_method='ffill')
    assert_series_equal(rdf['a'], exp)
def resample_calendar(df: pd.DataFrame, offset: str) -> pd.DataFrame:
    """Resample the DataFrame by calendar offset.

    See http://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#anchored-offsets
    for compatible offsets.

    :param df: data
    :param offset: calendar offset
    :return: result DataFrame
    """
    # OHLCV aggregation: first open, extreme high/low, last close, summed volume
    how = {
        "open": "first",
        "high": "max",
        "low": "min",
        "close": "last",
        "volume": "sum",
    }
    return df.resample(offset).agg(how)
def test_asfreq_resample_set_correct_freq(self): # GH#5613 # we test if .asfreq() and .resample() set the correct value for .freq df = DataFrame( {"date": ["2012-01-01", "2012-01-02", "2012-01-03"], "col": [1, 2, 3]} ) df = df.set_index(to_datetime(df.date)) # testing the settings before calling .asfreq() and .resample() assert df.index.freq is None assert df.index.inferred_freq == "D" # does .asfreq() set .freq correctly? assert df.asfreq("D").index.freq == "D" # does .resample() set .freq correctly? assert df.resample("D").asfreq().index.freq == "D"
def create_fn_list_ror_ts(ror: pd.DataFrame, *, period: str = 'year') -> list: """ Returns a list of functions of weights. """ # Frame.weights_sum_is_one(weights) initial_inv = 1000 fn_list = [] for x in ror.resample(period): def ror_list_fn(weights, y=x): df = y[1] # select ror part of the grouped data inv_period_spread = np.asarray(weights) * initial_inv # rebalancing assets_wealth_indexes = inv_period_spread * (1 + df).cumprod() wealth_index_local = assets_wealth_indexes.sum(axis=1) ror_local = wealth_index_local.pct_change() return ror_local fn_list.append(ror_list_fn) return fn_list
def test_resample_dtype_preservation():
    """int32 values keep their dtype through resample().ffill() (GH 12202)."""
    frame = DataFrame(
        {'date': pd.date_range(start='2016-01-01', periods=4, freq='W'),
         'group': [1, 1, 2, 2],
         'val': Series([5, 6, 7, 8], dtype='int32')}
    ).set_index('date')

    upsampled = frame.resample('1D').ffill()
    assert upsampled.val.dtype == np.int32

    upsampled_by_group = frame.groupby('group').resample('1D').ffill()
    assert upsampled_by_group.val.dtype == np.int32
def test_resample_empty_dataframe(self, freq, resample_method):
    """Resampling an empty DataFrame keeps its shape, index and freq (GH 13212)."""
    empty_index = self.create_series().index[:0]
    frame = DataFrame(index=empty_index)

    # count retains dimensions too
    result = getattr(frame.resample(freq), resample_method)()

    if resample_method != 'size':
        expected = frame.copy()
    else:
        # GH14962
        expected = Series([])

    expected.index = frame.index._shallow_copy(freq=freq)
    assert_index_equal(result.index, expected.index)
    assert result.index.freq == expected.index.freq
    assert_almost_equal(result, expected, check_dtype=False)
def test_resample_quantile_timedelta():
    """Quantile over timedelta values works under resampling (GH 29485)."""
    frame = DataFrame(
        {"value": pd.to_timedelta(np.arange(4), unit="s")},
        index=pd.date_range("20200101", periods=4, tz="UTC"),
    )

    result = frame.resample("2D").quantile(0.99)

    # 99th percentile of [0s, 1s] and of [2s, 3s]
    expected_values = [
        pd.Timedelta("0 days 00:00:00.990000"),
        pd.Timedelta("0 days 00:00:02.990000"),
    ]
    expected = DataFrame(
        {"value": expected_values},
        index=pd.date_range("20200101", periods=2, tz="UTC", freq="2D"),
    )
    tm.assert_frame_equal(result, expected)
def test_asfreq_resample_set_correct_freq(self, frame_or_series): # GH#5613 # we test if .asfreq() and .resample() set the correct value for .freq dti = to_datetime(["2012-01-01", "2012-01-02", "2012-01-03"]) obj = DataFrame({"col": [1, 2, 3]}, index=dti) if frame_or_series is Series: obj = obj["col"] # testing the settings before calling .asfreq() and .resample() assert obj.index.freq is None assert obj.index.inferred_freq == "D" # does .asfreq() set .freq correctly? assert obj.asfreq("D").index.freq == "D" # does .resample() set .freq correctly? assert obj.resample("D").asfreq().index.freq == "D"
def test_metadata_propagation_indiv(self):
    """Metadata should survive a groupby reduction and a resample (project check)."""
    # groupby
    frame = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                             'foo', 'bar', 'foo', 'foo'],
                       'B': ['one', 'one', 'two', 'three',
                             'two', 'two', 'one', 'three'],
                       'C': np.random.randn(8),
                       'D': np.random.randn(8)})
    self.check_metadata(frame, frame.groupby('A').sum())

    # resample
    frame = DataFrame(np.random.randn(1000, 2),
                      index=date_range('20130101', periods=1000, freq='s'))
    self.check_metadata(frame, frame.resample('1T'))
def winter_monthly(df: pd.DataFrame) -> pd.DataFrame:
    """Compute winter monthly deaths as a %age of all winter deaths.

    :param df: daily deaths with a DatetimeIndex named 'Date' and a 'UK' column
    :return: monthly percentages for Jul 2020 - Jun 2021, padded with two null
        rows (May/Jun 2021) where the source data ends mid April 2021
    """
    df = df.query(("Date >= '1 Jul 2020' and Date <= '30 Jun 2021'"))
    df = df.resample("M").sum()
    assert df["UK"].sum() == 95234  # quality check
    # convert to monthly percentage of total
    df = df.div(df.sum()) * 100
    # data is to mid April 2021: pad remaining months to end of winter period with None
    idx = pd.to_datetime(
        [datetime(2021, 5, 31, 0, 0, 0), datetime(2021, 6, 30, 0, 0, 0)]
    )
    null_data = pd.DataFrame(columns=["UK"], data=[None, None], index=idx)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the drop-in
    # replacement for row-wise appending.
    df = pd.concat([df, null_data])
    return df
def test_resample_dup_index(self):
    # GH 4812: quarterly column-wise (axis=1) resample with a duplicated
    # row index used to raise.
    # NOTE(review): pre-0.18 direct-result ``resample``; axis=1 resampling
    # was later removed from pandas. Historical test.

    # dup columns with resample raising
    df = DataFrame(np.random.randn(4, 12),
                   index=[2000, 2000, 2000, 2000],
                   columns=[Period(year=2000, month=i + 1, freq='M')
                            for i in range(12)])
    df.iloc[3, :] = np.nan
    result = df.resample('Q', axis=1)
    # three months per quarter -> groupby on the month-to-quarter mapping
    expected = df.groupby(lambda x: int((x.month - 1) / 3), axis=1).mean()
    expected.columns = [Period(year=2000, quarter=i + 1, freq='Q')
                        for i in range(4)]
    assert_frame_equal(result, expected)
def ohlcv_fill_up_missing_data(dataframe: DataFrame, timeframe: str, pair: str) -> DataFrame:
    """
    Fills up missing data with 0 volume rows,
    using the previous close as price for "open", "high", "low" and "close",
    volume is set to 0
    """
    # freqtrade project import kept local to avoid a hard module-level
    # dependency; ``logger`` is assumed to be a module-level logger.
    from freqtrade.exchange import timeframe_to_minutes

    ohlcv_dict = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum'
    }
    timeframe_minutes = timeframe_to_minutes(timeframe)
    # Resample to create "NAN" values
    df = dataframe.resample(f'{timeframe_minutes}min', on='date').agg(ohlcv_dict)

    # Forwardfill close for missing columns
    df['close'] = df['close'].fillna(method='ffill')
    # Use close for "open, high, low"
    df.loc[:, ['open', 'high', 'low']] = df[['open', 'high', 'low']].fillna(value={
        'open': df['close'],
        'high': df['close'],
        'low': df['close'],
    })
    df.reset_index(inplace=True)
    len_before = len(dataframe)
    len_after = len(df)
    # share of rows that were added by the fill-up (0 when input was empty)
    pct_missing = (len_after - len_before) / len_before if len_before > 0 else 0
    if len_before != len_after:
        message = (
            f"Missing data fillup for {pair}: before: {len_before} - after: {len_after}"
            f" - {round(pct_missing * 100, 2)}%")
        if pct_missing > 0.01:
            logger.info(message)
        else:
            # Don't be verbose if only a small amount is missing
            logger.debug(message)
    return df
def test_apply_columns_multilevel():
    """Per-column apply with MultiIndex columns dispatches on the column name (GH 16231)."""
    cols = pd.MultiIndex.from_tuples([("A", "a", "", "one"),
                                      ("B", "b", "i", "two")])
    ind = date_range(start="2017-01-01", freq="15Min", periods=8)
    frame = DataFrame(np.array([0] * 16).reshape(8, 2),
                      index=ind, columns=cols)

    # sum the '...one' column, average the other
    agg_dict = {col: (np.sum if col[3] == "one" else np.mean)
                for col in frame.columns}
    result = frame.resample("H").apply(lambda x: agg_dict[x.name](x))

    expected = DataFrame(
        2 * [[0, 0.0]],
        index=date_range(start="2017-01-01", freq="1H", periods=2),
        columns=pd.MultiIndex.from_tuples([("A", "a", "", "one"),
                                           ("B", "b", "i", "two")]),
    )
    tm.assert_frame_equal(result, expected)
def test_apply_with_mutated_index(): # GH 15169 index = date_range("1-1-2015", "12-31-15", freq="D") df = DataFrame(data={"col1": np.random.rand(len(index))}, index=index) def f(x): s = Series([1, 2], index=["a", "b"]) return s expected = df.groupby(pd.Grouper(freq="M")).apply(f) result = df.resample("M").apply(f) tm.assert_frame_equal(result, expected) # A case for series expected = df["col1"].groupby(pd.Grouper(freq="M")).apply(f) result = df["col1"].resample("M").apply(f) tm.assert_series_equal(result, expected)
def test_resample_dtype_preservation():
    """Upsampling with ffill must not upcast int32 (GH 12202, double-quoted variant)."""
    frame = DataFrame(
        {
            "date": pd.date_range(start="2016-01-01", periods=4, freq="W"),
            "group": [1, 1, 2, 2],
            "val": Series([5, 6, 7, 8], dtype="int32"),
        }
    ).set_index("date")

    assert frame.resample("1D").ffill().val.dtype == np.int32
    assert frame.groupby("group").resample("1D").ffill().val.dtype == np.int32
def test_apply_with_mutated_index(): # GH 15169 index = pd.date_range('1-1-2015', '12-31-15', freq='D') df = DataFrame(data={'col1': np.random.rand(len(index))}, index=index) def f(x): s = Series([1, 2], index=['a', 'b']) return s expected = df.groupby(pd.Grouper(freq='M')).apply(f) result = df.resample('M').apply(f) assert_frame_equal(result, expected) # A case for series expected = df['col1'].groupby(pd.Grouper(freq='M')).apply(f) result = df['col1'].resample('M').apply(f) assert_series_equal(result, expected)
def test_agg_nested_dicts():
    """Nested-renamer agg dicts raise SpecificationError on every resampler form."""
    np.random.seed(1234)
    index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
    index.name = "date"
    df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index)
    df_col = df.reset_index()
    df_mult = df_col.copy()
    df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
                                              names=["index", "date"])

    # same aggregation reachable four ways
    cases = [
        df.resample("2D"),
        df_col.resample("2D", on="date"),
        df_mult.resample("2D", level="date"),
        df.groupby(pd.Grouper(freq="2D")),
    ]

    msg = "nested renamer is not supported"
    for t in cases:
        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t.aggregate({"r1": {"A": ["mean", "sum"]},
                         "r2": {"B": ["mean", "sum"]}})

    for t in cases:
        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t[["A", "B"]].agg({"A": {"ra": ["mean", "std"]},
                               "B": {"rb": ["mean", "std"]}})

        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t.agg({"A": {"ra": ["mean", "std"]},
                   "B": {"rb": ["mean", "std"]}})
def hyperopt_loss_function(results: DataFrame, trade_count: int,
                           min_date: datetime, max_date: datetime,
                           *args, **kwargs) -> float:
    """
    Objective function, returns smaller number for more optimal results.
    Uses Sharpe Ratio calculation.
    """
    freq = '1D'
    slippage = 0.0005
    annual_days = 365
    annual_rf = 0.0
    rf_per_day = annual_rf / annual_days

    # Charge a fixed slippage cost against every trade's profit.
    results.loc[:, 'profit_percent_after_slippage'] = \
        results['profit_percent'] - slippage

    # One bin per day between min_date and max_date, even with no trades.
    day_index = date_range(start=min_date, end=max_date,
                           freq=freq, normalize=True)
    daily = (results.resample(freq, on='close_date')
             .agg({"profit_percent_after_slippage": sum})
             .reindex(day_index)
             .fillna(0))

    excess = daily["profit_percent_after_slippage"] - rf_per_day
    mean_ret = excess.mean()
    stdev = excess.std()

    if stdev == 0:
        # Zero variance: flag a strongly non-optimal (large) loss.
        return 20.0
    # Annualized Sharpe, negated so that better results score lower.
    return -(mean_ret / stdev * math.sqrt(annual_days))
def resample_data(data: pd.DataFrame, frequency: str) -> pd.DataFrame:
    """Average readings into hourly or daily bins.

    Args:
        data: frame with a "time" column, optionally with "latitude" and
            "longitude" columns to carry through to the result.
        frequency: "daily" (case-insensitive) selects 24H bins; any other
            value selects 1H bins.

    Returns:
        DataFrame of per-bin means with a string-formatted "time" column;
        when present, latitude/longitude are re-attached from the first
        matching source row.
    """
    data = data.dropna(subset=["time"])
    data["time"] = pd.to_datetime(data["time"])
    data = data.sort_index(axis=0)

    # Snapshot the columns needed to re-attach coordinates later.
    # .copy() fixes the original bug: assigning into a slice view below
    # raised SettingWithCopyWarning and could mutate the caller's frame.
    if "latitude" in data.columns and "longitude" in data.columns:
        original_df = data[["time", "latitude", "longitude"]].copy()
    else:
        original_df = data[["time"]].copy()

    resample_value = "24H" if frequency.lower() == "daily" else "1H"
    averages = pd.DataFrame(data.resample(resample_value, on="time").mean())
    averages["time"] = averages.index
    averages["time"] = averages["time"].apply(lambda x: date_to_str(x))
    averages = averages.reset_index(drop=True)

    # Format source timestamps so they can be matched against bin labels.
    # NOTE(review): bin labels use date_to_str while source rows use
    # date_to_str_hours/date_to_str_days — confirm these helpers produce
    # matching strings, otherwise the lookups below always fall back.
    if resample_value == "1H":
        original_df["time"] = original_df["time"].apply(
            lambda x: date_to_str_hours(x))
    elif resample_value == "24H":
        original_df["time"] = original_df["time"].apply(
            lambda x: date_to_str_days(x))
    else:
        # Unreachable with the current resample_value choices; defensive.
        original_df["time"] = original_df["time"].apply(
            lambda x: date_to_str(x))

    if "latitude" in original_df.columns and "longitude" in original_df.columns:

        def reset_latitude_or_longitude(time: str, field: str):
            # First source row whose formatted time matches the bin label.
            date_row = pd.DataFrame(
                original_df.loc[original_df["time"] == time])
            if date_row.empty:
                # No matching row: fall back to the bin label itself.
                return time
            return (date_row.iloc[0]["latitude"]
                    if field == "latitude" else date_row.iloc[0]["longitude"])

        averages["latitude"] = averages.apply(
            lambda row: reset_latitude_or_longitude(row["time"], "latitude"),
            axis=1)
        averages["longitude"] = averages.apply(
            lambda row: reset_latitude_or_longitude(row["time"], "longitude"),
            axis=1)

    return averages
def make_cum_area(trans: pd.DataFrame,
                  account_id: str,
                  color_num: int = 0,
                  time_resolution: int = 0) -> go.Scatter:
    """Build a stacked cumulative-total Scatter trace for one account,
    binned by the resample keyword of the chosen time resolution."""
    resample_keyword = \
        CONST["time_res_lookup"][time_resolution]["resample_keyword"]

    # Per-bin sums, accumulated over time.
    binned = trans.set_index("date").resample(resample_keyword).sum().cumsum()
    binned["date"] = binned.index
    binned["value"] = binned["amount"]
    binned["label"] = account_id

    try:
        marker_color = disc_colors[color_num]
    except IndexError:
        # don't ever run out of colors
        marker_color = "var(--Cyan)"

    # workaround for passing variables through layers of plotly
    binned["texttemplate"] = "%{customdata}"

    return go.Scatter(
        x=binned["date"],
        y=binned["value"],
        name=account_id,
        mode="lines+markers",
        marker={"symbol": "circle", "opacity": 1, "color": marker_color},
        customdata=binned["label"],
        # TODO: pass in unit for $
        hovertemplate="%{customdata}<br>%{y:$,.0f}<br>%{x}<extra></extra>",
        line={"width": 0.5, "color": marker_color},
        hoverlabel={"namelength": 15},
        stackgroup="one",
    )
def unique_devices_per_bin_size(df: pd.DataFrame, bin_size: str) -> List:
    """
    Utility function for the Device Events dataframe.

    Counts distinct values of the "device" column per time bin.  `df` must
    have a DatetimeIndex named "time" (the reset_index below relies on
    that name to produce the "time" column).

    Returns a json-ready list of records in the following format
    [{'time': '1542488400', 'devices': 3}, ...] where 'time' is a unix
    timestamp string (for D3) and 'devices' the unique-device count.
    (Fixes the old docstring, which documented 'time_seen' /
    'number_of_devices_seen' keys that the code never produced.)
    """
    unique_clients_per_unit_time_df = (
        df.resample(bin_size)["device"].unique().to_frame()
        ["device"].str.len()  # pythonic way to count a list
        .to_frame().rename(columns={
            "device": "devices"
        }).reset_index())
    # str(int(ts.timestamp())) replaces strftime("%s"): "%s" is a glibc
    # extension, unavailable on e.g. Windows; for naive timestamps both
    # interpret the value in local time, so behavior matches on Linux.
    unique_clients_per_unit_time_df["time"] = unique_clients_per_unit_time_df[
        "time"].map(lambda x: str(int(x.timestamp())))  # unix time for D3
    data = unique_clients_per_unit_time_df.to_dict("records")
    return data
def resample_data(data: pd.DataFrame, time: str):
    """Aggregate a price frame into OHLCV bars of frequency `time`.

    Mutates `data` in place (datetime index on 'date', sorted), then per
    bin takes: first open, max high, min low, last close, summed volume.
    Empty bins are dropped and the index is restored to a column.
    """
    data['date'] = pd.to_datetime(data['date'])
    data.set_index('date', inplace=True)
    data.sort_index(inplace=True)

    # Per-column aggregation spec for the OHLC conversion.
    ohlc_spec = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
    }
    bars = data.resample(time).apply(ohlc_spec)
    bars.dropna(inplace=True)
    bars = bars.reset_index()

    print("Function resample_data done.\n")
    return bars