def test_subset(self):
    N = 10
    rng = date_range('1/1/1990', periods=N, freq='53s')
    df = DataFrame({'A': np.arange(N), 'B': np.arange(N)}, index=rng)
    df.loc[df.index[4]:df.index[8], 'A'] = np.nan
    dates = date_range('1/1/1990', periods=N * 3, freq='25s')

    # with a subset of A should be the same
    result = df.asof(dates, subset='A')
    expected = df.asof(dates)
    tm.assert_frame_equal(result, expected)

    # same with A/B
    result = df.asof(dates, subset=['A', 'B'])
    expected = df.asof(dates)
    tm.assert_frame_equal(result, expected)

    # B gives df.asof
    result = df.asof(dates, subset='B')
    expected = df.resample('25s', closed='right').ffill().reindex(dates)
    expected.iloc[20:] = 9
    tm.assert_frame_equal(result, expected)
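# For context: DataFrame.asof(where, subset=...) returns, for each entry of
# `where`, the last row whose `subset` columns are all non-NaN at or before
# that point; without `subset`, every column must be non-NaN. Hence in the
# test above, subset='A' skips the same NaN rows as the default, while the
# NaN-free subset='B' reduces to a plain forward-fill, which the test checks
# against resample('25s', closed='right').ffill().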
def test_asof_periodindex_mismatched_freq(self):
    N = 50
    rng = period_range("1/1/1990", periods=N, freq="H")
    df = DataFrame(np.random.randn(N), index=rng)

    # Mismatched freq
    msg = "Input has different freq"
    with pytest.raises(IncompatibleFrequency, match=msg):
        df.asof(rng.asfreq("D"))
def test_missing(self):
    # GH 15118
    # no match found - `where` value before earliest date in index
    N = 10
    rng = date_range('1/1/1990', periods=N, freq='53s')
    df = DataFrame({'A': np.arange(N), 'B': np.arange(N)}, index=rng)

    result = df.asof('1989-12-31')
    expected = Series(index=['A', 'B'], name=Timestamp('1989-12-31'),
                      dtype='float64')
    tm.assert_series_equal(result, expected)

    result = df.asof(to_datetime(['1989-12-31']))
    expected = DataFrame(index=to_datetime(['1989-12-31']),
                         columns=['A', 'B'], dtype='float64')
    tm.assert_frame_equal(result, expected)
def test_time_zone_aware_index(self, stamp, expected):
    # GH21194
    # Testing awareness of DataFrame index considering different
    # UTC and timezone
    df = DataFrame(data=[1, 2],
                   index=[Timestamp('2018-01-01 21:00:05.001+00:00'),
                          Timestamp('2018-01-01 22:35:10.550+00:00')])

    result = df.asof(stamp)
    tm.assert_series_equal(result, expected)
def _concat_executions(market_data: pd.DataFrame,
                       executions: Union[List, pd.DataFrame]):
    if isinstance(executions, list):
        executions_df = pd.DataFrame(executions).set_index('datetime')
    elif isinstance(executions, pd.DataFrame):
        executions_df = executions.set_index('datetime')
    else:
        raise Exception(
            f'executions only supports these formats: '
            f'{_concat_executions.__annotations__["executions"]}'
        )
    executions_df.index = pd.to_datetime(executions_df.index)
    executions_df = executions_df.sort_index()
    market_data = market_data.sort_index()

    # snap each execution onto the most recent market-data bar at or before it
    executions_df.index = market_data.asof(executions_df.index)['datetime']
    executions_df_grouped = executions_df.groupby('datetime').apply(
        lambda df: df.to_dict('records'))
    executions_df_grouped.name = 'trades'
    market_data = market_data.merge(executions_df_grouped, 'left',
                                    left_index=True, right_index=True)
    return market_data
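# A minimal usage sketch for the helper above (the data values are
# illustrative assumptions, not from the original source): market_data needs
# a sorted DatetimeIndex plus a 'datetime' column, and each execution needs a
# 'datetime' key so it can be snapped onto the most recent bar via asof.
import pandas as pd

bars = pd.date_range('2021-01-04 09:30', periods=3, freq='1min')
market_data = pd.DataFrame({'datetime': bars, 'close': [10.0, 10.5, 10.2]},
                           index=bars)
executions = [
    {'datetime': '2021-01-04 09:30:30', 'price': 10.1, 'volume': 100},
    {'datetime': '2021-01-04 09:31:45', 'price': 10.4, 'volume': 50},
]
merged = _concat_executions(market_data, executions)
# merged['trades'] now holds a list of execution records on the 09:30 and
# 09:31 bars and NaN on the 09:32 bar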
class AsOfDataFrame(object):

    goal_time = 0.2

    def setup(self):
        self.N = 10000
        self.M = 100
        self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s')
        self.dates = date_range(start='1/1/1990', periods=(self.N * 10),
                                freq='5s')
        self.ts = DataFrame(np.random.randn(self.N, self.M), index=self.rng)
        self.ts2 = self.ts.copy()
        self.ts2.iloc[250:5000] = np.nan
        self.ts3 = self.ts.copy()
        self.ts3.iloc[-5000:] = np.nan

    # test speed of pre-computing NAs.
    def time_asof(self):
        self.ts.asof(self.dates)

    # should be roughly the same as above.
    def time_asof_nan(self):
        self.ts2.asof(self.dates)

    # test speed of the code path for a scalar index
    # with pre-computing all NAs.
    def time_asof_single(self):
        self.ts.asof(self.dates[0])

    # should be roughly the same as above.
    def time_asof_nan_single(self):
        self.ts3.asof(self.dates[-1])

    # test speed of the code path for a scalar index
    # before the start. should be without the cost of
    # pre-computing all the NAs.
    def time_asof_single_early(self):
        self.ts.asof(self.dates[0] - dt.timedelta(10))
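# The class above follows the airspeed velocity (asv) benchmark convention:
# `setup` runs before timing and every `time_*` method is collected as a
# separate benchmark. One way it could be invoked (exact flags depend on the
# local asv setup; shown only as an illustration):
#
#   asv run --bench AsOfDataFrame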
def split_into_examples(df: pd.DataFrame, label: str, examples: [np.ndarray],
                        labels_of_examples: [str], time_series_length,
                        interval_in_seconds, config,
                        failure_times_of_examples: [str], failure_time,
                        window_times_of_examples: [str], y, i_dataset):
    thread_list = []

    # sample time_series_length many values from each of the intervals
    # if their length is near the configured value
    if not config.use_over_lapping_windows:
        # split case into single intervals with the configured length
        interval_list = [
            g for c, g in df.groupby(
                pd.Grouper(level='timestamp',
                           freq=str(interval_in_seconds) + 's'))
        ]

        for g in interval_list:
            g_len = (g.index[-1] - g.index[0]).total_seconds()

            # ensure time interval is long enough
            if interval_in_seconds - 0.5 <= g_len <= interval_in_seconds + 0.5:
                t = DFConverter(g, time_series_length, False)
                thread_list.append(t)
    else:
        # print("df.index[0]: ", df.index[0], "df.index[-1]: ", df.index[-1])
        start_time = df.index[0]
        end_time = df.index[-1]

        # slide over the data frame and extract windows until the window
        # would exceed the last time step
        while start_time + pd.to_timedelta(
                config.over_lapping_window_interval_in_seconds,
                unit='s') < end_time:
            # generate a list with indexes for the window
            index = pd.date_range(start_time,
                                  periods=config.time_series_length,
                                  freq=config.resample_frequency)
            # print("from: ", index[0], "to: ", index[-1])

            # with use_over_lapping_windows this does no more than convert
            # this slice of the df into a numpy array; the DFConverter thread
            # is reused here so no separate handling is needed
            t = DFConverter(df.asof(index), time_series_length, True)
            thread_list.append(t)

            # update the start time for the next window
            start_time = start_time + pd.to_timedelta(
                config.over_lapping_window_interval_in_seconds, unit='s')

    # sampling is done multi-threaded with the configured number of cores
    thread_limit = (config.max_parallel_cores
                    if len(thread_list) > config.max_parallel_cores
                    else len(thread_list))
    threads_finished = 0

    while threads_finished < len(thread_list):
        if threads_finished + thread_limit > len(thread_list):
            thread_limit = len(thread_list) - threads_finished
        r = threads_finished + thread_limit

        for i in range(threads_finished, r):
            thread_list[i].start()
        for i in range(threads_finished, r):
            thread_list[i].join()
        for i in range(threads_finished, r):
            examples.append(thread_list[i].result)
            labels_of_examples.append(label)

            if failure_time == "":
                failure_times_of_examples.append(
                    "noFailure-" + str(i_dataset) + "-" + str(y))
            else:
                failure_times_of_examples.append(str(failure_time))

            window_times_of_examples.append(
                thread_list[i].windowTimesAsString)

        threads_finished += thread_limit
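# A minimal sketch of the overlapping-window idea above (all names below are
# illustrative, not from the original project): a fixed-length timestamp grid
# is generated per window, and df.asof() samples the most recent observation
# at each grid point, so every window comes out with the same shape.
import numpy as np
import pandas as pd

ts = pd.date_range('2021-01-01', periods=100, freq='250ms')
df = pd.DataFrame({'sensor': np.random.randn(100)},
                  index=ts.rename('timestamp'))

# one window: 16 samples on a 1s grid starting at the window's start time
window_index = pd.date_range(df.index[0], periods=16, freq='1s')
window = df.asof(window_index)   # most recent value at or before each point
array = window.to_numpy()        # roughly what a DFConverter would return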