# Grid search over scikit-learn MLPClassifier hyper-parameters on PCA-reduced
# GDF features with previous-queue-imbalance split (15000-sample windows).
import os

import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier

import gdf_pca  # project-local module


def main(stock, r=0.1, s=0.1):
    try:
        result_dir = 'res_mlp_pca_gdf_que_prev10_split_15000'
        data_length = 15000
        svm_gdf_res = gdf_pca.SvmGdfResults(
            stock, r=r, s=s, data_length=data_length,
            gdf_filename_pattern='gdf_{}_r{}_s{}_K50',
            data_dir='../gaussian_filter/data_gdf_whole/')
        results = []
        # Sweep the L2 penalty and a small set of two-layer architectures.
        for alpha in [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 2]:
            for hidden_layer_size in [(8, 16), (16, 16), (16, 8), (8, 8)]:
                activation = 'tanh'
                solver = 'adam'
                clf = MLPClassifier(
                    solver=solver, alpha=alpha, activation=activation,
                    hidden_layer_sizes=hidden_layer_size, random_state=1)
                scores = svm_gdf_res.train_clf(
                    clf, feature_name='pca_gdf_que_prev_split10', method='mlp')
                results.append({
                    **scores,
                    'alpha': alpha,
                    'solver': solver,
                    'hidden_layer_sizes': hidden_layer_size,
                })
        pd.DataFrame(results).to_csv(
            os.path.join(
                result_dir,
                'mlp_pca_gdf_{}_len{}_r{}_s{}.csv'.format(
                    stock, data_length, r, s)))
    except Exception as e:  # keep a batch run alive if a single stock fails
        print(e)
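# A minimal driver sketch (hypothetical, not part of the original script): sweep
# a few stocks in parallel. The ticker list and pool size are assumptions.
if __name__ == '__main__':
    from multiprocessing import Pool

    stocks = ['9061', '9064']  # hypothetical tickers
    with Pool(processes=2) as pool:
        pool.map(main, stocks)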
# Train the selected LSTM for one stock and persist its test-set predictions.
# Relies on project helpers (gdf_pca, get_model, matthews_correlation, auc_roc);
# hedged sketches of some of them appear alongside the other scripts below.
def train_lstm(res):
    gdf_filename_pattern = 'gdf_{}_r{}_s{}_K50'
    data_length = 24000
    # `res` is a single-stock slice of a results frame with the chosen parameters.
    r = res['r'].values[0]
    s = res['s'].values[0]
    feature = res['features'].values[0]
    n_steps = int(res['n_steps'].values[0])
    stock = str(int(res['stock'].values[0]))
    arch = res['arch'].values[0]
    print(stock, n_steps, feature)
    gdf_dfs = gdf_pca.SvmGdfResults(
        stock, r=r, s=s, data_length=data_length,
        gdf_filename_pattern=gdf_filename_pattern)
    weights = gdf_dfs.get_classes_weights()
    epochs = 50
    batch_size = 512
    filename = os.path.join(
        'res_lstm_predictions',
        f'pred_lstm_iter_{stock}_len{data_length}_r{r}_s{s}.csv')
    if os.path.exists(filename):
        print(f'Exists {filename}.')
        return None
    get_model_func = get_model(arch)
    scores, model = gdf_dfs.train_lstm(
        get_model_func, feature_name=feature, should_return_model=True,
        fit_kwargs={'epochs': epochs, 'batch_size': batch_size,
                    'verbose': 0, 'shuffle': False},
        should_validate=False,
        compile_kwargs={'loss': 'binary_crossentropy', 'optimizer': 'adam',
                        'metrics': [matthews_correlation, auc_roc]},
        class_weight=weights, n_steps=n_steps)
    # Predict on the held-out test window and store predictions next to actuals.
    test_x, test_y = gdf_dfs.get_test_set(feature_name=feature, n_steps=n_steps)
    pred = model.predict_classes(test_x)
    df_scores = pd.DataFrame()
    df_scores['pred'] = pred.ravel()
    df_scores['actual'] = test_y
    df_scores.to_csv(filename)
    return None
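# `get_model(arch)` is defined elsewhere in the project. A plausible sketch,
# under the assumption that `arch` holds a Keras architecture JSON string (the
# MLP script below saves `model.to_json()` under an 'arch' key): return a
# factory that rebuilds the network from that JSON.
from keras.models import model_from_json


def get_model(arch):
    def create_model():
        # Rebuild the architecture; weights are trained from scratch by the caller.
        return model_from_json(arch)
    return create_model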
# Keras MLP grid search over the builders registered in `models`, with resume
# support via a *_partial CSV so interrupted runs pick up where they stopped.
def main(stock, r=0.1, s=0.1):
    result_dir = 'res_mlp_pca'
    data_length = 24000
    svm_gdf_res = gdf_pca.SvmGdfResults(
        stock, r=r, s=s, data_length=data_length,
        gdf_filename_pattern='gdf_{}_r{}_s{}_K50')
    feature_name = 'pca_n_gdf_que'
    weights = svm_gdf_res.get_classes_weights()
    epochs = 50
    batch_size = 512
    filename = os.path.join(
        result_dir,
        'mlp_pca_gdf_n_{}_len{}_r{}_s{}.csv'.format(stock, data_length, r, s))
    if os.path.exists(filename):
        print(f'Exists {filename}')
        return
    filename_partial = os.path.join(
        result_dir,
        'mlp_pca_n_gdf_{}_len{}_r{}_s{}.csv_partial'.format(
            stock, data_length, r, s))
    df_partial = pd.DataFrame()
    if os.path.exists(filename_partial):
        print(f'Reading partial file {filename_partial}')
        df_partial = pd.read_csv(filename_partial)
    for k, get_m in models.items():
        # Skip architectures already present in the partial results.
        if np.any(df_partial):
            print(filename_partial)
            row = df_partial[df_partial['hidden_layer_sizes'] == k]
            if np.any(row):
                print(f'Read result for hidden layer {k} in {filename_partial}')
                continue
        print(f'Training {stock} {r} {s} {k}')
        plot_name = f'plot_mlp/{stock}_mlp_pca_gdf_n_r{r}_s{s}'
        score = svm_gdf_res.train_mlp(
            get_m, feature_name=feature_name, method='mlp',
            fit_kwargs={'epochs': epochs, 'batch_size': batch_size,
                        'verbose': 0, 'shuffle': False},
            compile_kwargs={'loss': 'binary_crossentropy', 'optimizer': 'adam',
                            'metrics': [auc_roc, matthews_correlation, 'acc']},
            plot_name=plot_name, class_weight=weights)
        score = {**score, 'r': r, 's': s,
                 'epochs': epochs, 'batch_size': batch_size}
        score = {'solver': 'adam', 'hidden_layer_sizes': k,
                 'learning_rate': 0.001, **score}
        df_partial = df_partial.append(pd.DataFrame([score]), ignore_index=True)
        df_partial.index = list(range(len(df_partial)))
        df_partial.to_csv(filename_partial)
    df_partial.to_csv(filename)
    return True
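# Hypothetical sketch of the `models` registry iterated above: keys identify the
# layer layout (they are written to the 'hidden_layer_sizes' column) and values
# are zero-argument builders returning an uncompiled Keras model. The layouts
# and the tanh activation here are assumptions, not taken from the original.
from keras.layers import Dense
from keras.models import Sequential


def _mlp_builder(layer_sizes):
    def build():
        model = Sequential()
        for size in layer_sizes:
            model.add(Dense(size, activation='tanh'))
        model.add(Dense(1, activation='sigmoid'))
        return model
    return build


models = {
    '(16,)': _mlp_builder((16,)),
    '(16, 16)': _mlp_builder((16, 16)),
}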
# Repeatedly retrain the chosen LSTM architecture (up to 30 iterations per
# stock) to measure score stability, appending each run to a *_partial CSV.
import logging

logger = logging.getLogger(__name__)


def train_lstm(res):
    data_length = 24000
    r = res['r'].values[0]
    s = res['s'].values[0]
    feature = res['features'].values[0]
    n_steps = int(res['n_steps'].values[0])
    unit = res['unit'].values[0]
    stock = str(int(res['stock'].values[0]))
    arch = res['arch'].values[0]
    gdf_dfs = gdf_pca.SvmGdfResults(
        stock, r=r, s=s, data_length=data_length,
        gdf_filename_pattern='gdf_{}_r{}_s{}_K50')
    weights = gdf_dfs.get_classes_weights()
    epochs = 50
    batch_size = 512
    filename = os.path.join(
        'res_lstm_iter', f'res_lstm_iter_{stock}_len{data_length}_r{r}_s{s}.csv')
    partial_filename = filename + '_partial'
    # if os.path.exists(filename):
    #     print(f'Exists {filename}.')
    #     return None
    df_partial = pd.DataFrame()
    if os.path.exists(partial_filename):
        df_partial = pd.read_csv(partial_filename)
    if len(df_partial) < 30:
        logger.info('Iteration %s stock %s', len(df_partial), stock)
        get_model = get_lstm_model_for_arch(arch)
        try:
            score = gdf_dfs.train_lstm(
                get_model, feature_name=feature,
                fit_kwargs={'epochs': epochs, 'batch_size': batch_size,
                            'verbose': 0, 'shuffle': False},
                compile_kwargs={'loss': 'binary_crossentropy',
                                'optimizer': 'adam',
                                'metrics': [matthews_correlation, auc_roc]},
                class_weight=weights, n_steps=n_steps)
            score = {**score, 'r': r, 's': s, 'unit': unit, 'arch': arch,
                     'epochs': epochs, 'batch_size': batch_size,
                     'n_steps': n_steps}
            df_partial = df_partial.append([score])
            df_partial.to_csv(partial_filename)
            logger.info('Done %s stock %s', len(df_partial), stock)
        except Exception as e:
            logger.error('%s: iter %s %s', stock, len(df_partial), e)
            raise Exception(stock, e)
    df_partial.to_csv(filename)
    return None
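# The matthews_correlation metric passed to compile_kwargs above is defined
# elsewhere in the project. A common Keras-backend implementation looks like
# the sketch below; the project's actual definition may differ.
from keras import backend as K


def matthews_correlation(y_true, y_pred):
    # Round probabilities to hard 0/1 predictions and count confusion cells.
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos
    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos
    tp = K.sum(y_pos * y_pred_pos)
    tn = K.sum(y_neg * y_pred_neg)
    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)
    numerator = tp * tn - fp * fn
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return numerator / (denominator + K.epsilon())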
# McNemar's test comparing the LSTM predictions against a logistic-regression
# baseline trained on queue imbalance alone.
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.contingency_tables import mcnemar


def perform_mcnemar(res):
    gdf_filename_pattern = 'gdf_{}_r{}_s{}_K50'
    data_length = 24000
    r = res['r'].values[0]
    s = res['s'].values[0]
    stock = str(int(res['stock'].values[0]))
    gdf_dfs = gdf_pca.SvmGdfResults(
        stock, r=r, s=s, data_length=data_length,
        gdf_filename_pattern=gdf_filename_pattern)
    df = gdf_dfs.df
    df_test = gdf_dfs.df_test
    df_lstm = pd.read_csv(
        f'res_lstm_predictions/pred_lstm_iter_{stock}_len{data_length}_r{r}_s{s}.csv')
    # Baseline: logistic regression on queue imbalance only.
    reg = LogisticRegression(class_weight=get_classes_weights(df))
    reg.fit(df[['queue_imbalance']], df['mid_price_indicator'])
    log_pred = reg.predict(df_test[['queue_imbalance']])
    # Align the two prediction series on the common tail of the test set
    # (the LSTM prediction series is shorter than the baseline's).
    df_all = pd.DataFrame()
    df_all['pred_log'] = log_pred[(len(log_pred) - len(df_lstm)):]
    df_all['pred_lstm'] = df_lstm['pred'].values
    df_all['actual'] = df_test['mid_price_indicator'].values[
        (len(log_pred) - len(df_lstm)):]
    df_all['correct_lstm'] = (df_all['pred_lstm'] == df_all['actual']).astype(np.int64)
    df_all['correct_log'] = (df_all['pred_log'] == df_all['actual']).astype(np.int64)
    # 2x2 contingency table of per-sample correctness:
    # rows = LSTM correct, columns = logistic regression correct.
    table = pd.crosstab(df_all['correct_lstm'], df_all['correct_log'])
    mcnemar_res = mcnemar(table, exact=False, correction=True)
    df_mcnemar = pd.DataFrame()
    df_mcnemar['pvalue'] = [mcnemar_res.pvalue]
    df_mcnemar['statistic'] = [mcnemar_res.statistic]
    # Contingency cells, labelled TN/FN/FP/TP here; indexing is table[col][row],
    # so 'TN' counts samples both models got wrong, 'TP' samples both got right.
    df_mcnemar['TN'] = [table[0][0]]
    df_mcnemar['FN'] = [table[0][1]]
    df_mcnemar['FP'] = [table[1][0]]
    df_mcnemar['TP'] = [table[1][1]]
    df_mcnemar['stock'] = stock
    df_mcnemar.to_csv(
        f'res_lstm_predictions/mcnemar_lstm_log_{stock}_len{data_length}_r{r}_s{s}.csv')
    return mcnemar_res
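# `get_classes_weights(df)` is a project helper used above. A plausible sketch,
# assuming it mirrors sklearn's class_weight='balanced' convention on the
# mid-price target column:
def get_classes_weights(df):
    y = df['mid_price_indicator']
    counts = y.value_counts()
    n_classes = len(counts)
    # Weight each class inversely to its frequency: n / (k * n_c).
    return {int(c): len(y) / (n_classes * counts[c]) for c in counts.index}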
# Keras MLP grid search over hidden-layer layouts sized relative to the number
# of PCA components, again with resume support via a *_partial CSV.
from keras import optimizers
from keras.layers import Dense
from keras.models import Sequential


def main(stock, r=0.1, s=0.1):
    result_dir = 'res_mlp_pca'
    data_length = 24000
    svm_gdf_res = gdf_pca.SvmGdfResults(
        stock, r=r, s=s, data_length=data_length,
        gdf_filename_pattern='gdf_{}_r{}_s{}_K50')
    feature_name = 'pca_n_gdf_que'
    n = svm_gdf_res.get_pca(feature_name).n_components
    hidden_layer_sizes = [(n,), (n, n), (2 * n, n), (2 * n, 2 * n),
                          (n, 2 * n), (n, n, n)]
    weights = svm_gdf_res.get_classes_weights()
    epochs = 10
    batch_size = 300
    filename = os.path.join(
        result_dir,
        'mlp_pca_gdf_{}_len{}_r{}_s{}.csv'.format(stock, data_length, r, s))
    if os.path.exists(filename):
        print(f'Exists {filename}')
        return
    filename_partial = os.path.join(
        result_dir,
        'mlp_pca_gdf_{}_len{}_r{}_s{}.csv_partial'.format(
            stock, data_length, r, s))
    df_partial = pd.DataFrame()
    if os.path.exists(filename_partial):
        print(f'Reading partial file {filename_partial}')
        df_partial = pd.read_csv(filename_partial)
    for hidden_layer_size in hidden_layer_sizes:
        for learning_rate in [0.001]:  # [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0]
            # Skip combinations already present in the partial results.
            if np.any(df_partial):
                print(filename_partial)
                row = df_partial[
                    df_partial['hidden_layer_sizes'] == hidden_layer_size]
                if np.any(row) and len(row) >= 1:
                    print(row)
                    row = df_partial[
                        df_partial['hidden_layer_sizes'] == hidden_layer_size][
                        df_partial['learning_rate'] == learning_rate]
                    print(row)
                if np.any(row):
                    print(f'Read result for hidden layer {hidden_layer_size} '
                          f'lr {learning_rate} in {filename_partial}')
                    continue
            print(f'Training {stock} {r} {s} {hidden_layer_size} {learning_rate}')
            solver = optimizers.Adam(lr=learning_rate)
            # Build a feed-forward network matching the requested layout.
            model = Sequential()
            if isinstance(hidden_layer_size, int):
                model.add(Dense(hidden_layer_size))
            else:
                for h in hidden_layer_size:
                    model.add(Dense(h))
            model.add(Dense(1, activation='sigmoid'))
            plot_name = f'plot_mlp/{stock}_mlp_pca_n_r{r}_s{s}'
            score = svm_gdf_res.train_mlp(
                model, feature_name=feature_name, method='mlp',
                fit_kwargs={'epochs': epochs, 'batch_size': batch_size,
                            'verbose': 0, 'shuffle': False},
                compile_kwargs={'loss': 'binary_crossentropy',
                                'optimizer': solver,
                                'metrics': [auc_roc, 'acc']},
                plot_name=plot_name, class_weight=weights)
            score = {**score, 'r': r, 's': s, 'arch': model.to_json(),
                     'epochs': epochs, 'batch_size': batch_size}
            # Note: `solver` is the optimizer instance; its repr lands in the CSV.
            score = {'solver': solver, 'hidden_layer_sizes': hidden_layer_size,
                     'learning_rate': learning_rate, **score}
            df_partial = df_partial.append(pd.DataFrame([score]),
                                           ignore_index=True)
            df_partial.index = list(range(len(df_partial)))
            # df_partial.drop(columns=[[c for c in df_partial.columns if 'Unnamed' in c]], inplace=True)
            df_partial.to_csv(filename_partial)
    df_partial.to_csv(filename)
    return True
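# `auc_roc` in the metrics above is defined elsewhere; the GRU script below
# builds it via as_keras_metric(tf.metrics.auc). A sketch of that wrapper,
# based on the widely-circulated TF1-era recipe (the project's version may
# differ): it turns a (value, update_op) tf.metrics pair into a streaming
# Keras metric.
import functools

import tensorflow as tf
from keras import backend as K


def as_keras_metric(method):
    @functools.wraps(method)
    def wrapper(y_true, y_pred, **kwargs):
        value, update_op = method(y_true, y_pred, **kwargs)
        K.get_session().run(tf.local_variables_initializer())
        # Ensure the running statistics update before the value is read.
        with tf.control_dependencies([update_op]):
            value = tf.identity(value)
        return value
    return wrapper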
# One-layer GRU sweep over unit counts and kernel regularisation strengths,
# with the same resume-from-partial-CSV pattern as the MLP scripts.
def train_lstm(stock, r, s, data_length, units=None, kernel_regularizations=None):
    import tensorflow as tf
    auc_roc = as_keras_metric(tf.metrics.auc)
    r = float(r)
    s = float(s)
    data_length = int(data_length)
    print('running', stock, r, s, data_length)
    gdf_filename_pattern = 'gdf_{}_r{}_s{}_K50'
    gdf_dfs = gdf_pca.SvmGdfResults(
        str(stock), r=r, s=s, data_length=data_length,
        gdf_filename_pattern=gdf_filename_pattern)
    weights = gdf_dfs.get_classes_weights()
    feature = 'pca_n_gdf_que'  # alternative: 'pca_n_gdf_que_prev'
    epochs = 50
    batch_size = 512
    n_steps = 1
    filename = os.path.join(
        'res_gru',
        f'res_gru_pca_n_one_layer_{stock}_len{data_length}_r{r}_s{s}.csv')
    # if os.path.exists(filename):
    #     logger.info('Exists %s', filename)
    #     return
    partial_filename = filename + '_partial'
    df_partial = pd.DataFrame()
    if os.path.exists(partial_filename):
        df_partial = pd.read_csv(partial_filename)
        # Older partial files predate the kernel_reg column; backfill with zeros.
        if 'kernel_reg' not in df_partial.columns:
            print('Kernel reg not in columns!')
            df_partial['kernel_reg'] = np.zeros(len(df_partial)).astype(float)
        df_partial.drop(
            columns=[c for c in df_partial.columns if 'Unnamed' in c],
            inplace=True)
    for unit in units:
        unit_str = f'({unit}: tanh, 1)'
        for kernel_reg in kernel_regularizations:
            # Skip combinations already present in the partial results.
            if np.any(df_partial):
                row = df_partial[df_partial['unit'] == unit_str][
                    df_partial['kernel_reg'] == kernel_reg][
                    df_partial['n_steps'] == n_steps]
                if np.any(row):
                    print(f'Already calculated {stock} {unit_str} {kernel_reg}')
                    continue
            print(f'Will train {stock} r{r} s{s} {unit_str} {kernel_reg}')
            pca = gdf_dfs.get_pca(feature)
            get_model = get_model_func(unit,
                                       input_shape=(n_steps, pca.n_components_))
            plot_name = (f'plot_lstm/{stock}_one_layer_u{unit}_kr{kernel_reg}'
                         f'_pca_n_r{r}_s{s}')
            score = gdf_dfs.train_lstm(
                get_model, feature_name=feature, method='gru',
                fit_kwargs={'epochs': epochs, 'batch_size': batch_size,
                            'verbose': 0, 'shuffle': False},
                compile_kwargs={'loss': 'binary_crossentropy',
                                'optimizer': 'adam',
                                'metrics': [matthews_correlation, auc_roc]},
                plot_name=plot_name, class_weight=weights, n_steps=n_steps)
            score = {**score, 'r': r, 's': s, 'unit': unit_str,
                     'kernel_reg': kernel_reg, 'epochs': epochs,
                     'batch_size': batch_size, 'n_steps': n_steps}
            df_partial = df_partial.append(pd.DataFrame([score]),
                                           ignore_index=True)
            df_partial.to_csv(partial_filename)
    df_partial.drop(columns=[c for c in df_partial.columns if 'Unnamed' in c],
                    inplace=True)
    df_partial.to_csv(filename)
    return True
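# `get_model_func` is a project helper. A plausible sketch matching the
# "(units: tanh, 1)" description recorded above: a factory returning a
# one-layer GRU followed by a sigmoid output. Note that kernel_reg is not
# passed to the factory in the visible call, so regularisation is omitted here
# as well.
from keras.layers import Dense, GRU
from keras.models import Sequential


def get_model_func(unit, input_shape=None):
    def create_model():
        model = Sequential()
        model.add(GRU(unit, activation='tanh', input_shape=input_shape))
        model.add(Dense(1, activation='sigmoid'))
        return model
    return create_model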