def custom_index(data, on, window=30, function='median', num=30, sort_mode=False):
    """
    Build a custom index by picking a fixed number of rows per timestamp
    data
        dataframe with symbol and timestamp columns
    on
        column on which the index is to be generated
    window
        look back window passed to the rolling calculation
    function
        function to be applied over the rolling window
    num
        number of stocks to pick each day
    sort_mode
        when truthy, keep the first ``num`` rows of each day after an
        ascending sort on the rolling value (the lowest values);
        otherwise keep the last ``num`` rows (the highest values)

    Returns the picked rows of every timestamp group as a single
    dataframe with a fresh integer index. The rolling value is stored
    in a column named ``custom_index`` and is requested with ``lag=1``.
    """
    from fastbt.datasource import DataSource

    source = DataSource(data)
    source.add_rolling(on=on, window=window, function=function,
                       lag=1, col_name='custom_index')
    by_day = source.data.groupby('timestamp')

    def _pick(day_frame):
        # Both modes share the same ascending sort; only the end of the
        # sorted frame that is kept differs.
        ranked = day_frame.sort_values(by='custom_index')
        if sort_mode:
            return ranked.head(num)
        return ranked.tail(num)

    return by_day.apply(_pick).reset_index(drop=True)
def test_rolling_zscore():
    """
    Check the 5-period rolling z-score of close against pinned values

    The input is 100 rows of seeded random OHLC data, with symbols
    cycling through A-D and 25 daily timestamps ending 2018-04-25
    repeated four times. The expected numbers depend on seed 100.
    """
    np.random.seed(100)
    frame = pd.DataFrame(
        np.random.randn(100, 4), columns=["open", "high", "low", "close"]
    )
    frame["symbol"] = list("ABCD") * 25
    frame["timestamp"] = list(pd.date_range(end="2018-04-25", periods=25)) * 4
    from fastbt.datasource import DataSource

    source = DataSource(frame)
    source.add_rolling(on="close", window=5, function="zscore")

    # (row filter, positional row within the filtered frame, expected value)
    expectations = (
        ('symbol=="A"', 8, 0.12),
        ('symbol=="B"', -7, 0.17),
        ('symbol=="C"', -6, -0.48),
    )
    for selector, position, expected in expectations:
        observed = source.data.query(selector).iloc[position]["rol_zscore_close_5"]
        assert observed.round(2) == expected
def test_rolling_zscore():
    """
    Check the 5-period rolling z-score of close against pinned values

    The input is 100 rows of seeded random OHLC data, with symbols
    cycling through A-D and 25 daily timestamps ending 2018-04-25
    repeated four times. The expected numbers depend on seed 100.
    """
    np.random.seed(100)
    frame = pd.DataFrame(np.random.randn(100, 4),
                         columns=['open', 'high', 'low', 'close'])
    frame['symbol'] = list('ABCD') * 25
    frame['timestamp'] = list(pd.date_range(end='2018-04-25', periods=25)) * 4
    from fastbt.datasource import DataSource

    source = DataSource(frame)
    source.add_rolling(on='close', window=5, function='zscore')

    # (row filter, positional row within the filtered frame, expected value)
    expectations = (
        ('symbol=="A"', 8, 0.12),
        ('symbol=="B"', -7, 0.17),
        ('symbol=="C"', -6, -0.48),
    )
    for selector, position, expected in expectations:
        observed = source.data.query(selector).iloc[position]['rol_zscore_close_5']
        assert observed.round(2) == expected
def transform(data):
    """
    Add the derived feature columns to the given data and return it
    data
        dataframe wrapped in a DataSource whose timestamp column
        is named ``date``

    Columns are added in this order
    rmax2..rmax7, rmin2..rmin7
        rolling max of high and rolling min of low, lag 1
    pret, idret, qtrd
        formulas (open/prevclose)-1, (close/open)-1 and
        (tottrdval/totaltrades)
    ret1..ret3
        percentage change of close for periods 1 to 3, lag 1
    vol2, vol3
        rolling sum of tottrdval, lag 1
    prev_tottrdval, prev_perdel, prev_qtrd
        one period lag of the respective columns
    """
    source = DataSource(data, timestamp='date')

    for span in range(2, 8):
        suffix = str(span)
        source.add_rolling(on='high', window=span, col_name='rmax' + suffix,
                           function='max', lag=1)
        source.add_rolling(on='low', window=span, col_name='rmin' + suffix,
                           function='min', lag=1)

    # qtrd must exist before the lag loop at the end refers to it
    formulas = (
        ('(open/prevclose)-1', 'pret'),
        ('(close/open)-1', 'idret'),
        ('(tottrdval/totaltrades)', 'qtrd'),
    )
    for expression, name in formulas:
        source.add_formula(expression, col_name=name)

    for period in (1, 2, 3):
        source.add_pct_change(on='close', period=period,
                              col_name='ret' + str(period), lag=1)

    for span in (2, 3):
        source.add_rolling(on='tottrdval', window=span,
                           col_name='vol' + str(span), function='sum', lag=1)

    for column in ('tottrdval', 'perdel', 'qtrd'):
        source.add_lag(on=column, period=1, col_name='prev_' + column)

    return source.data
class TestDataSource(unittest.TestCase):
    """
    Tests for DataSource run against the tests/data/sample.csv fixture

    Every expected value in the assertions is tied to the contents of
    that file. Test methods carry comments instead of docstrings so
    that unittest's verbose output keeps showing the method names.
    """

    @staticmethod
    def _load_sample():
        """
        Read the sample fixture into a fresh dataframe
        """
        return pd.read_csv('tests/data/sample.csv', parse_dates=['timestamp'])

    @staticmethod
    def _round2(value):
        """
        Round a value to two decimals before comparing it
        """
        return round(value, 2)

    def _indexed(self):
        """
        Return the current data indexed by timestamp and symbol
        """
        return self.ds.data.set_index(['timestamp', 'symbol'])

    def setUp(self):
        # Every test starts from a DataSource built on a fresh copy
        # of the fixture with default options
        self.ds = DataSource(data=self._load_sample())

    def test_data(self):
        # Spot check cells by position after default initialization
        self.assertEqual(self.ds.data.iloc[20, 1], 'five')
        self.assertEqual(self.ds.data.iloc[14, 3], 112)
        self.assertEqual(self.ds.data.iloc[24, 7], 10.54)

    def test_data_without_sort(self):
        # Spot check cells when the DataSource is built with sort=False
        self.ds = DataSource(data=self._load_sample(), sort=False)
        self.assertEqual(self.ds.data.iloc[9, 4], 999)
        self.assertEqual(self.ds.data.iloc[24, 6], 41688)
        self.assertEqual(self.ds.data.at[4, 'close'], 10.6)

    def test_initialize_case(self):
        # Upper case column names come out in lower case
        df = self._load_sample()
        df.columns = [x.upper() for x in df.columns]
        self.assertEqual(df.columns[0], 'TIMESTAMP')
        self.ds = DataSource(data=df)
        self.assertEqual(self.ds.data.columns[0], 'timestamp')

    def test_initialize_column_rename(self):
        # Custom timestamp and symbol columns are renamed to the
        # standard names
        df = self._load_sample()
        df.columns = [
            'TS', 'TRADINGSYMBOL', 'OPEN', 'HIGH', 'LOW', 'CLOSE', 'VOLUME',
            'PREVCLOSE'
        ]
        self.ds = DataSource(data=df, timestamp='TS', symbol='TRADINGSYMBOL')
        self.assertEqual(self.ds.data.columns[0], 'timestamp')
        self.assertEqual(self.ds.data.columns[1], 'symbol')

    def test_add_lag(self):
        # Two lag columns with default names are added and the
        # number of rows is unchanged
        length = len(self.ds.data)
        idx = pd.IndexSlice
        self.ds.add_lag(on='close')
        self.ds.add_lag(on='volume', period=2)
        d = self._indexed()
        self.assertEqual(d.at[idx['2018-01-04', 'one'], 'lag_close_1'], 11)
        self.assertEqual(d.at[idx['2018-01-06', 'six'], 'lag_volume_2'], 86014)
        self.assertEqual(len(self.ds.data.columns), 10)
        self.assertEqual(len(self.ds.data), length)

    def test_add_lag_column_rename(self):
        # A lag stored under a custom column name matches the
        # default named one
        idx = pd.IndexSlice
        self.ds.add_lag(on='close')
        self.ds.add_lag(on='close', col_name='some_col')
        d = self._indexed()
        self.assertEqual(d.at[idx['2018-01-04', 'one'], 'lag_close_1'], 11)
        self.assertEqual(d.at[idx['2018-01-04', 'one'], 'some_col'], 11)
        self.assertEqual(d.at[idx['2018-01-05', 'three'], 'some_col'], 109)

    def test_add_pct_change(self):
        # Percentage change with default and custom column names
        idx = pd.IndexSlice
        self.ds.add_pct_change(on='close')
        self.ds.add_pct_change(on='close', period=2)
        self.ds.add_pct_change(on='close', period=2, col_name='new_col')
        d = self._indexed()
        R = self._round2
        self.assertEqual(R(d.at[idx['2018-01-05', 'three'], 'chg_close_1']), -0.07)
        self.assertEqual(R(d.at[idx['2018-01-06', 'five'], 'chg_close_1']), 0.17)
        self.assertEqual(R(d.at[idx['2018-01-05', 'four'], 'chg_close_2']), 0.05)
        self.assertEqual(R(d.at[idx['2018-01-05', 'four'], 'new_col']), 0.05)
        self.assertEqual(R(d.at[idx['2018-01-03', 'six'], 'new_col']), -0.1)
        self.assertEqual(pd.isna(d.at[idx['2018-01-02', 'one'], 'new_col']), True)
        self.assertEqual(len(self.ds.data.columns), 11)

    def test_add_pct_change_lag(self):
        # Percentage change combined with the lag argument
        idx = pd.IndexSlice
        self.ds.add_pct_change(on='close', period=2, lag=1)
        self.ds.add_pct_change(on='close', period=1, lag=2)
        d = self._indexed()
        R = self._round2
        self.assertEqual(R(d.at[idx['2018-01-04', 'four'], 'chg_close_2']), 0.09)
        self.assertEqual(R(d.at[idx['2018-01-04', 'four'], 'chg_close_1']), 0.01)
        self.assertEqual(R(d.at[idx['2018-01-06', 'three'], 'chg_close_1']), -0.01)

    def test_add_pct_change_lag_col_name(self):
        # Lagged percentage change with default and custom column names
        idx = pd.IndexSlice
        self.ds.add_pct_change(on='high', period=2, lag=1)
        self.ds.add_pct_change(on='close', period=1, lag=2, col_name='lagged_2')
        d = self._indexed()
        R = self._round2
        self.assertEqual(R(d.at[idx['2018-01-05', 'six'], 'chg_high_2']), -0.04)
        self.assertEqual(R(d.at[idx['2018-01-04', 'four'], 'lagged_2']), 0.01)

    def test_formula_add_col_name(self):
        # Formula results are stored under the given column names
        idx = pd.IndexSlice
        self.ds.add_formula('open+close', 'new_col')
        self.ds.add_formula('volume/close', 'new_col_2')
        d = self._indexed()
        R = self._round2
        self.assertEqual(R(d.at[idx['2018-01-04', 'four'], 'new_col']), 336)
        self.assertEqual(R(d.at[idx['2018-01-06', 'one'], 'new_col_2']), 77755.77)

    def test_formula_case_insensitive(self):
        # Upper case formulas and column names give the same lower
        # case columns and values
        idx = pd.IndexSlice
        self.ds.add_formula('OPEN+CLOSE', 'new_col')
        self.ds.add_formula('volume/close', 'NEW_COL_2')
        d = self._indexed()
        R = self._round2
        self.assertEqual(R(d.at[idx['2018-01-04', 'four'], 'new_col']), 336)
        self.assertEqual(R(d.at[idx['2018-01-06', 'one'], 'new_col_2']), 77755.77)

    def test_formula_calculated_column(self):
        # A formula can refer to columns created by earlier formulas
        idx = pd.IndexSlice
        self.ds.add_formula('(open+close)*100', 'new_col_1')
        self.ds.add_formula('volume/100', 'new_col_2')
        self.ds.add_formula('new_col_1+new_col_2', 'new_col_3')
        d = self._indexed()
        R = self._round2
        self.assertEqual(R(d.at[idx['2018-01-06', 'one'], 'new_col_3']), 10190.6)
        self.assertEqual(R(d.at[idx['2018-01-05', 'two'], 'new_col_3']), 200389.97)

    def test_rolling_simple(self):
        # Compare add_rolling with a plain pandas rolling mean on
        # a single symbol
        q = 'symbol == "one"'
        df = self._load_sample().query(q)
        df['r2'] = df['close'].rolling(2).mean()
        self.ds.add_rolling(2, col_name='r2')
        df2 = self.ds.data.query(q)
        for a, b in zip(df['r2'], df2['r2']):
            # The reference value is NaN until the window fills up
            if not pd.isna(a):
                # assertEqual instead of a bare assert so that the
                # check is not stripped when running with -O
                self.assertEqual(a, b)

    def test_rolling_values(self):
        # Rolling max of volume without lag, with lag and with a
        # custom column name
        idx = pd.IndexSlice
        self.ds.add_rolling(4, on='volume', function='max')
        d = self._indexed()
        self.assertEqual(d.at[idx['2018-01-05', 'five'], 'rol_max_volume_4'], 971704)
        self.assertEqual(d.at[idx['2018-01-05', 'six'], 'rol_max_volume_4'], 195539)
        self.assertEqual(d.at[idx['2018-01-04', 'three'], 'rol_max_volume_4'], 433733)
        # Adding lag and testing
        self.ds.add_rolling(4, on='volume', function='max', lag=1)
        d = self._indexed()
        self.assertEqual(d.at[idx['2018-01-06', 'five'], 'rol_max_volume_4'], 971704)
        self.assertEqual(d.at[idx['2018-01-06', 'six'], 'rol_max_volume_4'], 195539)
        self.assertEqual(d.at[idx['2018-01-05', 'three'], 'rol_max_volume_4'], 433733)
        # Testing for 2 lags and column name
        self.ds.add_rolling(4, on='volume', function='max', lag=2, col_name='check')
        d = self._indexed()
        self.assertEqual(d.at[idx['2018-01-06', 'three'], 'check'], 433733)

    def test_batch(self):
        # Run a list of single key dictionaries through batch_process
        # and check the resulting column and row counts
        length = len(self.ds.data)
        batch = [
            {'P': {'on': 'close', 'period': 1, 'lag': 1}},
            {'L': {'on': 'volume', 'period': 1}},
            {'F': {'formula': '(open+close)/2', 'col_name': 'AvgPrice'}},
            {'I': {'indicator': 'SMA', 'period': 3, 'lag': 1, 'col_name': 'SMA3'}},
            {'F': {'formula': 'avgprice + sma3', 'col_name': 'final'}},
            {'R': {'window': 3, 'function': 'mean'}},
        ]
        d = self.ds.batch_process(batch).set_index(['timestamp', 'symbol'])
        self.assertEqual(len(d.columns), 12)
        self.assertEqual(len(self.ds.data.columns), 14)
        self.assertEqual(len(self.ds.data), length)

    def test_raise_error_if_not_dataframe(self):
        # TODO: placeholder, nothing is checked here yet
        pass
class TestDataSource(unittest.TestCase):
    """
    Tests for DataSource run against the tests/data/sample.csv fixture

    Every expected value in the assertions is tied to the contents of
    that file. Test methods carry comments instead of docstrings so
    that unittest's verbose output keeps showing the method names.
    """

    @staticmethod
    def _load_sample():
        """
        Read the sample fixture into a fresh dataframe
        """
        return pd.read_csv("tests/data/sample.csv", parse_dates=["timestamp"])

    @staticmethod
    def _round2(value):
        """
        Round a value to two decimals before comparing it
        """
        return round(value, 2)

    def _indexed(self):
        """
        Return the current data indexed by timestamp and symbol
        """
        return self.ds.data.set_index(["timestamp", "symbol"])

    def setUp(self):
        # Every test starts from a DataSource built on a fresh copy
        # of the fixture with default options
        self.ds = DataSource(data=self._load_sample())

    def test_data(self):
        # Spot check cells by position after default initialization
        self.assertEqual(self.ds.data.iloc[20, 1], "five")
        self.assertEqual(self.ds.data.iloc[14, 3], 112)
        self.assertEqual(self.ds.data.iloc[24, 7], 10.54)

    def test_data_without_sort(self):
        # Spot check cells when the DataSource is built with sort=False
        self.ds = DataSource(data=self._load_sample(), sort=False)
        self.assertEqual(self.ds.data.iloc[9, 4], 999)
        self.assertEqual(self.ds.data.iloc[24, 6], 41688)
        self.assertEqual(self.ds.data.at[4, "close"], 10.6)

    def test_initialize_case(self):
        # Upper case column names come out in lower case
        df = self._load_sample()
        df.columns = [x.upper() for x in df.columns]
        self.assertEqual(df.columns[0], "TIMESTAMP")
        self.ds = DataSource(data=df)
        self.assertEqual(self.ds.data.columns[0], "timestamp")

    def test_initialize_column_rename(self):
        # Custom timestamp and symbol columns are renamed to the
        # standard names
        df = self._load_sample()
        df.columns = [
            "TS",
            "TRADINGSYMBOL",
            "OPEN",
            "HIGH",
            "LOW",
            "CLOSE",
            "VOLUME",
            "PREVCLOSE",
        ]
        self.ds = DataSource(data=df, timestamp="TS", symbol="TRADINGSYMBOL")
        self.assertEqual(self.ds.data.columns[0], "timestamp")
        self.assertEqual(self.ds.data.columns[1], "symbol")

    def test_add_lag(self):
        # Two lag columns with default names are added and the
        # number of rows is unchanged
        length = len(self.ds.data)
        idx = pd.IndexSlice
        self.ds.add_lag(on="close")
        self.ds.add_lag(on="volume", period=2)
        d = self._indexed()
        self.assertEqual(d.at[idx["2018-01-04", "one"], "lag_close_1"], 11)
        self.assertEqual(d.at[idx["2018-01-06", "six"], "lag_volume_2"], 86014)
        self.assertEqual(len(self.ds.data.columns), 10)
        self.assertEqual(len(self.ds.data), length)

    def test_add_lag_column_rename(self):
        # A lag stored under a custom column name matches the
        # default named one
        idx = pd.IndexSlice
        self.ds.add_lag(on="close")
        self.ds.add_lag(on="close", col_name="some_col")
        d = self._indexed()
        self.assertEqual(d.at[idx["2018-01-04", "one"], "lag_close_1"], 11)
        self.assertEqual(d.at[idx["2018-01-04", "one"], "some_col"], 11)
        self.assertEqual(d.at[idx["2018-01-05", "three"], "some_col"], 109)

    def test_add_pct_change(self):
        # Percentage change with default and custom column names
        idx = pd.IndexSlice
        self.ds.add_pct_change(on="close")
        self.ds.add_pct_change(on="close", period=2)
        self.ds.add_pct_change(on="close", period=2, col_name="new_col")
        d = self._indexed()
        R = self._round2
        self.assertEqual(R(d.at[idx["2018-01-05", "three"], "chg_close_1"]), -0.07)
        self.assertEqual(R(d.at[idx["2018-01-06", "five"], "chg_close_1"]), 0.17)
        self.assertEqual(R(d.at[idx["2018-01-05", "four"], "chg_close_2"]), 0.05)
        self.assertEqual(R(d.at[idx["2018-01-05", "four"], "new_col"]), 0.05)
        self.assertEqual(R(d.at[idx["2018-01-03", "six"], "new_col"]), -0.1)
        self.assertEqual(pd.isna(d.at[idx["2018-01-02", "one"], "new_col"]), True)
        self.assertEqual(len(self.ds.data.columns), 11)

    def test_add_pct_change_lag(self):
        # Percentage change combined with the lag argument
        idx = pd.IndexSlice
        self.ds.add_pct_change(on="close", period=2, lag=1)
        self.ds.add_pct_change(on="close", period=1, lag=2)
        d = self._indexed()
        R = self._round2
        self.assertEqual(R(d.at[idx["2018-01-04", "four"], "chg_close_2"]), 0.09)
        self.assertEqual(R(d.at[idx["2018-01-04", "four"], "chg_close_1"]), 0.01)
        self.assertEqual(R(d.at[idx["2018-01-06", "three"], "chg_close_1"]), -0.01)

    def test_add_pct_change_lag_col_name(self):
        # Lagged percentage change with default and custom column names
        idx = pd.IndexSlice
        self.ds.add_pct_change(on="high", period=2, lag=1)
        self.ds.add_pct_change(on="close", period=1, lag=2, col_name="lagged_2")
        d = self._indexed()
        R = self._round2
        self.assertEqual(R(d.at[idx["2018-01-05", "six"], "chg_high_2"]), -0.04)
        self.assertEqual(R(d.at[idx["2018-01-04", "four"], "lagged_2"]), 0.01)

    def test_formula_add_col_name(self):
        # Formula results are stored under the given column names
        idx = pd.IndexSlice
        self.ds.add_formula("open+close", "new_col")
        self.ds.add_formula("volume/close", "new_col_2")
        d = self._indexed()
        R = self._round2
        self.assertEqual(R(d.at[idx["2018-01-04", "four"], "new_col"]), 336)
        self.assertEqual(R(d.at[idx["2018-01-06", "one"], "new_col_2"]), 77755.77)

    def test_formula_case_insensitive(self):
        # Upper case formulas and column names give the same lower
        # case columns and values
        idx = pd.IndexSlice
        self.ds.add_formula("OPEN+CLOSE", "new_col")
        self.ds.add_formula("volume/close", "NEW_COL_2")
        d = self._indexed()
        R = self._round2
        self.assertEqual(R(d.at[idx["2018-01-04", "four"], "new_col"]), 336)
        self.assertEqual(R(d.at[idx["2018-01-06", "one"], "new_col_2"]), 77755.77)

    def test_formula_calculated_column(self):
        # A formula can refer to columns created by earlier formulas
        idx = pd.IndexSlice
        self.ds.add_formula("(open+close)*100", "new_col_1")
        self.ds.add_formula("volume/100", "new_col_2")
        self.ds.add_formula("new_col_1+new_col_2", "new_col_3")
        d = self._indexed()
        R = self._round2
        self.assertEqual(R(d.at[idx["2018-01-06", "one"], "new_col_3"]), 10190.6)
        self.assertEqual(R(d.at[idx["2018-01-05", "two"], "new_col_3"]), 200389.97)

    def test_rolling_simple(self):
        # Compare add_rolling with a plain pandas rolling mean on
        # a single symbol
        q = 'symbol == "one"'
        df = self._load_sample().query(q)
        df["r2"] = df["close"].rolling(2).mean()
        self.ds.add_rolling(2, col_name="r2")
        df2 = self.ds.data.query(q)
        for a, b in zip(df["r2"], df2["r2"]):
            # The reference value is NaN until the window fills up
            if not pd.isna(a):
                # assertEqual instead of a bare assert so that the
                # check is not stripped when running with -O
                self.assertEqual(a, b)

    def test_rolling_values(self):
        # Rolling max of volume without lag, with lag and with a
        # custom column name
        idx = pd.IndexSlice
        self.ds.add_rolling(4, on="volume", function="max")
        d = self._indexed()
        self.assertEqual(d.at[idx["2018-01-05", "five"], "rol_max_volume_4"], 971704)
        self.assertEqual(d.at[idx["2018-01-05", "six"], "rol_max_volume_4"], 195539)
        self.assertEqual(d.at[idx["2018-01-04", "three"], "rol_max_volume_4"], 433733)
        # Adding lag and testing
        self.ds.add_rolling(4, on="volume", function="max", lag=1)
        d = self._indexed()
        self.assertEqual(d.at[idx["2018-01-06", "five"], "rol_max_volume_4"], 971704)
        self.assertEqual(d.at[idx["2018-01-06", "six"], "rol_max_volume_4"], 195539)
        self.assertEqual(d.at[idx["2018-01-05", "three"], "rol_max_volume_4"], 433733)
        # Testing for 2 lags and column name
        self.ds.add_rolling(4, on="volume", function="max", lag=2, col_name="check")
        d = self._indexed()
        self.assertEqual(d.at[idx["2018-01-06", "three"], "check"], 433733)

    def test_batch(self):
        # Run a list of single key dictionaries through batch_process
        # and check the resulting column and row counts
        length = len(self.ds.data)
        batch = [
            {"P": {"on": "close", "period": 1, "lag": 1}},
            {"L": {"on": "volume", "period": 1}},
            {"F": {"formula": "(open+close)/2", "col_name": "AvgPrice"}},
            {"I": {"indicator": "SMA", "period": 3, "lag": 1, "col_name": "SMA3"}},
            {"F": {"formula": "avgprice + sma3", "col_name": "final"}},
            {"R": {"window": 3, "function": "mean"}},
        ]
        d = self.ds.batch_process(batch).set_index(["timestamp", "symbol"])
        self.assertEqual(len(d.columns), 12)
        self.assertEqual(len(self.ds.data.columns), 14)
        self.assertEqual(len(self.ds.data), length)

    def test_raise_error_if_not_dataframe(self):
        # TODO: placeholder, nothing is checked here yet
        pass