def _read_stock(self):
    """Load the GDF-prepared train/test frames for this stock and join in
    the queue-imbalance feature from the regression data set.

    Returns:
        (df, df_test) tuple of DataFrames, or None when either data set
        could not be loaded.
    """
    gdf_filename = self.gdf_filename_pattern.format(self.stock, self.r, self.s)
    reg_filename = '{}'.format(self.stock)
    logger.debug('Will read %s and %s', gdf_filename, reg_filename)
    d = lob.load_prepared_data(
        gdf_filename, data_dir=self.data_dir, cv=False, length=self.data_length)
    if d is not None and len(d) == 2:
        df, df_test = d
    else:
        return None
    # Fix: the original unpacked the regression load unconditionally and
    # raised TypeError when lob.load_prepared_data returned None; guard it
    # the same way as the GDF load above.
    d_reg = lob.load_prepared_data(
        reg_filename, data_dir='../gaussian_filter/data', cv=False,
        length=self.data_length)
    if d_reg is None or len(d_reg) != 2:
        return None
    df_reg, df_reg_test = d_reg
    df['queue_imbalance'] = df_reg['queue_imbalance']
    df['prev_queue_imbalance'] = df['queue_imbalance'].shift()
    df.dropna(inplace=True)
    df_test['queue_imbalance'] = df_reg_test['queue_imbalance']
    df_test['prev_queue_imbalance'] = df_test['queue_imbalance'].shift()
    df_test.dropna(inplace=True)
    return df, df_test
def _read_stock(self):
    """Read the GDF train/test frames and enrich them with the datetime,
    bid/ask prices and queue-imbalance columns from the regression data,
    indexing each frame by its timestamp.

    Returns:
        (df, df_test); a pair of empty DataFrames when loading fails.
    """
    gdf_filename = self.gdf_filename_pattern.format(self.stock, self.r, self.s)
    reg_filename = '{}'.format(self.stock)
    logger.debug('Will read %s and %s', gdf_filename, reg_filename)
    loaded = lob.load_prepared_data(
        gdf_filename, data_dir=self.data_dir, length=self.data_length)
    if len(loaded) != 2:
        return pd.DataFrame(), pd.DataFrame()
    df, df_test = loaded
    df_reg, df_reg_test = lob.load_prepared_data(
        reg_filename, data_dir=self.reg_data_dir, length=self.data_length)

    def enrich(frame, source):
        # Copy metadata/price columns from the regression frame, derive the
        # lagged imbalance feature and reindex by timestamp.
        frame['datetime'] = source['Unnamed: 0']
        frame['bid_price'] = source['bid_price']
        frame['ask_price'] = source['ask_price']
        frame['queue_imbalance'] = source['queue_imbalance']
        frame['prev_queue_imbalance'] = frame['queue_imbalance'].shift()
        frame.index = pd.to_datetime(frame['datetime'])
        frame.dropna(inplace=True)

    enrich(df, df_reg)
    enrich(df_test, df_reg_test)
    return df, df_test
def _read_stocks(self):
    """Load GDF and regression train/test frames for every configured stock,
    then attach the queue-imbalance features to each GDF frame.

    Returns:
        (dfs, dfs_test) dicts keyed by stock symbol.
    """
    dfs = {}
    dfs_test = {}
    dfs_reg = {}
    dfs_reg_test = {}
    # Phase 1: load every data set before touching any of them.
    for stock in self.stocks:
        gdf_filename = self.gdf_filename_pattern.format(stock, self.r, self.s)
        reg_filename = '{}'.format(stock)
        dfs[stock], dfs_test[stock] = lob.load_prepared_data(
            gdf_filename, data_dir=self.data_dir, cv=False,
            length=self.data_length)
        dfs_reg[stock], dfs_reg_test[stock] = lob.load_prepared_data(
            reg_filename, data_dir='../gaussian_filter/data', cv=False,
            length=self.data_length)
    # Phase 2: copy the imbalance column over and derive its lag.
    for stock in self.stocks:
        train = dfs[stock]
        test = dfs_test[stock]
        train['queue_imbalance'] = dfs_reg[stock]['queue_imbalance']
        train['prev_queue_imbalance'] = train['queue_imbalance'].shift()
        train.dropna(inplace=True)
        test['queue_imbalance'] = dfs_reg_test[stock]['queue_imbalance']
        test['prev_queue_imbalance'] = test['queue_imbalance'].shift()
        test.dropna(inplace=True)
    return dfs, dfs_test
def main(r=0.02, s=0.2, n=15, K=50):
    """Fit an SVM on GDF features per stock and print train/test ROC AUC.

    Args:
        r: GDF r parameter, broadcast to all K components.
        s: GDF spread parameter, broadcast to all K components.
        n: number of gdf_* feature columns used by the classifier.
        K: number of GDF components.
    """
    print('*****************************************************')
    print('r', r, 's', s)
    gdf_columns = ['gdf_' + str(i) for i in range(0, n)]
    gdfs_r = r * np.ones(K)
    # Component centers: 0.1 apart, symmetric around zero, no zero bucket.
    gdfs_m = 0.1000 * np.hstack([np.arange(- K // 2, 0),
                                 np.arange(1, K // 2 + 1)])
    gdfs_s = s * np.ones(K)
    gdfs = np.vstack([gdfs_r, gdfs_m, gdfs_s]).T
    data_length = 5050
    # Fix: the original assigned stocks = ['9061'] and immediately
    # overwrote it; the dead assignment is removed.
    stocks = ['9062', '9063', '9064', '9065']
    # Fix: the loop variable was named `s`, shadowing the spread parameter;
    # renamed to `stock` (gdfs_s was already computed, so behavior is the same).
    for stock in stocks:
        try:
            d, d_cv, d_test = lob.load_prepared_data(
                stock, data_dir='data/', cv=True, length=data_length)
            dfs = transform_to_orders(d, n, gdfs)
            clf = svm_classification(dfs, gdf_columns)
            predictions = clf.predict(dfs.loc[:, gdf_columns])
            print('train', stock,
                  roc_auc_score(predictions, dfs['mid_price_indicator']))
            dfs_test = transform_to_orders(d_test, n, gdfs)
            predictions = clf.predict(dfs_test.loc[:, gdf_columns])
            print('test ', stock,
                  roc_auc_score(predictions, dfs_test['mid_price_indicator']))
        except Exception as e:
            # Best-effort sweep: a failing stock should not abort the rest.
            print(e)
def write_svm_gdf(self, K=None, Kn=None, rr=None, ss=None):
    """Grid-search SVM (RBF) hyperparameters over (r, s, C, gamma) for this
    stock, resuming from a previous results CSV when one exists.

    Args:
        K: number of GDF components encoded in the data filename.
        Kn: results-file discriminator.
        rr: iterable of GDF r values to sweep.
        ss: iterable of GDF s values to sweep.

    Returns:
        DataFrame with previous plus newly computed results.
    """
    results = []
    results_filename = '{}/res_gdf_svm_{}_{}.csv'.format(
        self.results_dir, self.stock, Kn)
    try:
        df_gdf_res = pd.read_csv(results_filename)
        print('Results read from file')
    except FileNotFoundError:
        print('Results file does not exist yet')
        df_gdf_res = pd.DataFrame(columns=[
            'svm', 'c', 'gamma', 'roc_cv_score', 'roc_train_score',
            'K', 'Kn', 'r', 's'
        ])
    for r in rr:
        for s in ss:
            filename = 'gdf_{}_len{}_r{}_s{}_K{}{}'.format(
                self.stock, self.data_length, r, s, K, self.suffix)
            dfs, dfs_cv, dfs_test = lob.load_prepared_data(
                filename, data_dir=self.data_dir, cv=True,
                length=None)  # we don't care about length here
            for C in [1, 10, 100, 1000, 10000]:
                for gamma in [1, 10, 100, 1000, 10000]:
                    # Skip combinations already present in the loaded results.
                    if self.is_in_results(df_gdf_res, {
                            'c': C, 'gamma': gamma, 'r': r, 's': s,
                            'K': K, 'Kn': Kn, 'svm': 'rbf'}):
                        continue
                    res = self.perform_gdf_svm(
                        dfs, dfs_cv, C=C, gamma=gamma, r=r, s=s, K=K, Kn=Kn)
                    results.append(res)
                    # Checkpoint new results after every fit.
                    pd.DataFrame(results).to_csv(
                        '{}/new_res_gdf_svm_{}_{}.csv'.format(
                            self.results_dir, self.stock, Kn))
    for result in results:
        df_gdf_res = df_gdf_res.append(result, ignore_index=True)
    # Fix: the original wrote the merged results to a hard-coded 'results/'
    # directory while reading from self.results_dir, so a resumed run never
    # saw the accumulated results; save back to the path that is read.
    df_gdf_res.to_csv(results_filename)
    return df_gdf_res
def _read_stock(self):
    """Load the prepared train/test frames for this stock, derive the
    lagged queue-imbalance feature and index both frames by timestamp.

    Returns:
        (df, df_test) tuple of DataFrames.
    """
    reg_filename = '{}'.format(self.stock)
    logger.debug('Will read %s', reg_filename)
    df, df_test = lob.load_prepared_data(
        reg_filename, data_dir=self.data_dir, length=self.data_length)
    # Identical post-processing for the train and test frames.
    for frame in (df, df_test):
        frame['datetime'] = frame['Unnamed: 0']
        frame['prev_queue_imbalance'] = frame['queue_imbalance'].shift()
        frame.index = pd.to_datetime(frame['datetime'])
        frame.dropna(inplace=True)
    return df, df_test
def main(stock):
    """Sweep the (r, s) GDF parameter grid for one stock, score each
    configuration and persist all scores (partial CSV after every step,
    final CSV at the end).

    Args:
        stock: stock symbol used in the data and result filenames.
    """
    K = 50
    length = 15000
    rr = [0.01, 0.05, 0.1, 0.5, 1.0]
    ss = [0.01, 0.05, 0.1, 0.5, 1.0]
    gdf_data_dir = 'data_gdf'
    results_dir = 'data_res_logistic'
    gdf_start = 0
    gdf_end = 50
    algorithm = 'logistic'
    results = []
    results_filename = os.path.join(
        results_dir,
        'res_log_{}_len{}_K{}-{}.csv'.format(stock, length, gdf_start,
                                             gdf_end))
    results_partial_filename = os.path.join(
        results_dir,
        'res_log_{}_len{}_K{}-{}_partial.csv'.format(stock, length, gdf_start,
                                                     gdf_end))
    # The feature columns depend only on the fixed gdf_start/gdf_end bounds.
    gdf_columns = ['gdf_' + str(i) for i in range(gdf_start, gdf_end)]
    for r in rr:
        for s in ss:
            gdf_filename = 'gdf_{}_len{}_r{}_s{}_K{}'.format(
                stock, length, r, s, K)
            dfs, dfs_test = lob.load_prepared_data(
                gdf_filename, data_dir=gdf_data_dir, cv=False, length=length)
            res = {'r': r, 's': s, 'stock': stock, 'K': K,
                   'method': algorithm}
            print('********************************************')
            print(res)
            try:
                scores = svm_classification(dfs, gdf_columns)
                print(res, scores)
                results.append({**res, **scores})
            except Exception as e:
                # Record the bare configuration so the failed cell is visible.
                print('Exception', e, res)
                results.append(res)
            pd.DataFrame(results).to_csv(results_partial_filename)
    pd.DataFrame(results).to_csv(results_filename)
def main():
    """For every configured stock, fit sigmoid, RBF and linear SVMs over
    their hyperparameter grids and score each on the CV split.

    Relies on module-level ``stocks``, ``cs``, ``gammas``, ``coef0s`` and
    ``data_length``.
    """
    for stock in stocks:
        df, df_cv, df_test = lob.load_prepared_data(
            stock, cv=True, length=data_length)
        if df is None:
            continue
        for c in cs:
            for g in gammas:
                for coef0 in coef0s:
                    model = SVMSigmoid(stock, df, c=c, coef0=coef0, gamma=g,
                                       data_length=data_length)
                    model.predict(df_cv, 'cv', check=False)
                    sleep(1)  # throttle between fits
                model = SVMRbf(stock, df, c=c, gamma=g,
                               data_length=data_length)
                model.predict(df_cv, 'cv', check=False)
                sleep(1)
            model = SVMLinear(stock, df, c=c, data_length=data_length)
            model.predict(df_cv, 'cv', check=False)
def main(stock):
    """Grid-search SVM-RBF hyperparameters over the (r, s) GDF parameter
    pairs for a single stock (feature-scaled data), writing one result CSV
    per pair and skipping pairs whose result file already exists.

    Args:
        stock: stock symbol used in the data and result filenames.
    """
    K = 50
    length = 15000
    rr = [0.01, 0.05, 0.1, 0.5, 1.0]
    ss = [0.01, 0.05, 0.1, 0.5, 1.0]
    gdf_data_dir = 'data_gdf_feature_scaling'
    results_dir = 'data_res_gdf_feature_scaling'
    gdf_start = 24
    gdf_end = 26
    algorithm = 'svm_rbf'
    for r in rr:
        for s in ss:
            results_filename = os.path.join(
                results_dir,
                'res_{}_len{}_r{}_s{}_K{}-{}.csv'.format(
                    stock, length, r, s, gdf_start, gdf_end))
            results_partial_filename = os.path.join(
                results_dir,
                'res_{}_len{}_r{}_s{}_K{}-{}_partial.csv'.format(
                    stock, length, r, s, gdf_start, gdf_end))
            gdf_filename = 'gdf_{}_r{}_s{}_K{}_feature_scaling'.format(
                stock, r, s, K)
            # Guard clause: never recompute a finished (r, s) cell.
            if os.path.exists(results_filename):
                print('Exists ', results_filename)
                continue
            print('Will create ', results_filename)
            dfs, dfs_test = lob.load_prepared_data(
                gdf_filename, data_dir=gdf_data_dir, cv=False, length=length)
            gdf_columns = ['gdf_' + str(i) for i in range(gdf_start, gdf_end)]
            results = []
            for C in [1, 10, 100, 1000, 10000]:
                for gamma in [1, 10, 100, 1000, 10000]:
                    res = {'C': C, 'gamma': gamma, 'r': r, 's': s,
                           'stock': stock, 'K': K, 'method': algorithm}
                    print('********************************************')
                    print(res)
                    try:
                        scores = svm_classification(dfs, gdf_columns,
                                                    C=C, gamma=gamma)
                        print(res, scores)
                        results.append({**res, **scores})
                    except Exception as e:
                        # Keep the failed configuration visible in the output.
                        print('Exception', e, res)
                        results.append(res)
                    pd.DataFrame(results).to_csv(results_partial_filename)
            pd.DataFrame(results).to_csv(results_filename)
def main(stock):
    """Grid-search SVM-RBF hyperparameters over the (r, s) GDF parameter
    pairs for a single stock, writing one result CSV per pair and skipping
    pairs whose result file already exists.

    Args:
        stock: stock symbol used in the data and result filenames.
    """
    K = 50
    length = 15000
    rr = [0.01, 0.05, 0.1, 0.5, 1.0]
    ss = [0.01, 0.05, 0.1, 0.5, 1.0]
    for r in rr:
        for s in ss:
            # Hoisted: the same result filename was formatted four times.
            results_filename = 'res_{}_len{}_r{}_s{}_K{}.csv'.format(
                stock, length, r, s, K)
            partial_filename = 'res_{}_len{}_r{}_s{}_K{}.csv_partial'.format(
                stock, length, r, s, K)
            if os.path.exists(results_filename):
                print('Exists ', results_filename)
                continue
            else:
                print('Will create ', results_filename)
            filename = 'gdf_{}_len{}_r{}_s{}_K{}'.format(
                stock, length, r, s, K)
            dfs, dfs_cv, dfs_test = lob.load_prepared_data(
                filename, data_dir='data_gdf_/', cv=True, length=length)
            gdf_columns = ['gdf_' + str(i) for i in range(0, 50)]
            results = []
            for C in [1, 10, 100, 1000, 10000]:
                for gamma in [1, 10, 100, 1000, 10000]:
                    res = {'C': C, 'gamma': gamma, 'r': r, 's': s,
                           'stock': stock, 'K': K}
                    print('********************************************')
                    print('C', C, 'gamma', gamma)
                    # Fix: removed the stray 'lob.mo' fragment that made the
                    # original function a syntax error.
                    clf = svm_classification(dfs, gdf_columns,
                                             C=C, gamma=gamma)
                    predictions = clf.predict(dfs.loc[:, gdf_columns])
                    try:
                        roc_train = roc_auc_score(
                            predictions, dfs['mid_price_indicator'])
                        res['roc_train'] = roc_train
                        print('train', s, roc_train)
                    except Exception as e:
                        # Checkpoint whatever has been collected so far.
                        print(e)
                        pd.DataFrame(results).to_csv(partial_filename)
                    predictions = clf.predict(dfs_cv.loc[:, gdf_columns])
                    try:
                        roc_cv = roc_auc_score(
                            predictions, dfs_cv['mid_price_indicator'])
                        res['roc_cv'] = roc_cv
                        print('test ', s, roc_cv)
                    except Exception as e:
                        print(e)
                        pd.DataFrame(results).to_csv(partial_filename)
                    results.append(res)
            pd.DataFrame(results).to_csv(results_filename)
'13113', '2290', '9269', '12059', '3879', '1229', '4695', '5836', '10484', '2890', '1694', '1080', '3107', '11038', '12417', '9266', '4320', '3022', '3388', '8080', '1431', '12255', '7843', '11714', '4575', '2028', '11946', '2813', '11869' ] i = 0 data_length = 10000 rocs_areas = {} plt.figure() for s in stocks: try: print('for', s) d, d_cv, d_test = lob.load_prepared_data( s, data_dir='../queue_imbalance/data/prepared/', cv=True, length=data_length) print('performing regressions', s) reg = lob.logistic_regression(d, 0, len(d)) print('performing predictions', s) score = lob.plot_roc(d_test, reg, stock=s) rocs_areas[s] = score print('{} (area = {})'.format(s, score)) i += 1 # if i % 10 == 0: # plt.savefig('plots_cv_{}.png'.format(i)) # plt.figure() except Exception as e: