def run(res_auc, delete_n_last_common_features, n_common_last,
        delete_n_last_features, n_last):
    X_train, y_train, X_test_zf, y_test_zf, id_zf = generate_train_data(
        SF_impute_svd,
        delete_n_last_common_features=delete_n_last_common_features,
        n_common_last=n_common_last,
        delete_n_last_features=delete_n_last_features,
        n_last=n_last)
    test_data_gg, test_label_gg, columns, id_gg = generate_data(
        OV_impute_svd,
        delete_n_last_common_features=delete_n_last_common_features,
        n_common_last=n_common_last,
        delete_n_last_features=delete_n_last_features,
        n_last=n_last)
    test_data_xy, test_label_xy, _, id_xy = generate_data(
        CHWH_impute_svd,
        delete_n_last_common_features=delete_n_last_common_features,
        n_common_last=n_common_last,
        delete_n_last_features=delete_n_last_features,
        n_last=n_last)

    y_train = y_train.reshape(len(y_train), )
    X_test = [test_data_gg, X_test_zf, test_data_xy]
    y_test = [
        test_label_gg.reshape(len(test_label_gg), ),
        y_test_zf.reshape(len(y_test_zf), ),
        # was `len(test_data_xy)`: labels must be reshaped by their own length
        test_label_xy.reshape(len(test_label_xy), )
    ]

    n = 0
    if delete_n_last_common_features:
        n = n_common_last
    if delete_n_last_features:
        n = n_last
    # single-indexer .loc[n, col]: chained .loc[n][col] assignment may write
    # to a copy and silently drop the value
    res_auc.loc[n, 'feature_num'] = X_train.shape[1]

    weights1 = {
        'gg': np.array([0.4, 0.3, 0.3]),
        'zf': np.array([0.4, 0.3, 0.3]),
        'xy': np.array([0.6, 0.3, 0.1])
    }
    for i, hp in enumerate(['gg', 'zf', 'xy']):
        rf_results = rf(X_train, y_train, X_test[i], y_test[i])
        lrl2_results = lrl2(X_train, y_train, X_test[i], y_test[i])
        svm_results = svm(X_train, y_train, X_test[i], y_test[i])
        # voting method
        vote_results = voting(rf_results[2], lrl2_results[2], svm_results[2],
                              weights1, 'soft', hp)
        fpr, tpr, thr_ = roc_curve(y_test[i], vote_results[2].T[1],
                                   pos_label=2)
        res_auc.loc[n, hp] = auc(fpr, tpr)
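# The `voting` helper called above is defined elsewhere in the repo. A minimal
# sketch of weighted soft voting consistent with the call site, assuming each
# *_results[2] is an (n_samples, 2) class-probability array; the returned
# tuple layout (labels, None, proba) is an assumption made so that
# vote_results[2].T[1] yields the positive-class probabilities.
import numpy as np

def voting(rf_proba, lrl2_proba, svm_proba, weights, method, hp):
    if method == 'soft':
        w = weights[hp]
        # weighted average of the per-model class probabilities
        proba = w[0] * rf_proba + w[1] * lrl2_proba + w[2] * svm_proba
        labels = proba.argmax(axis=1)
        return labels, None, proba
    raise ValueError('only soft voting is sketched here')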
def generate_dataset(path):
    dataset = data_preprocess.generate_data(path)
    idf = calc_idf(dataset)
    feature_set = [
        total_idf, average_idf, discourse_marker, numeric_token,
        contains_quote, num_quote, lexicon_token, marker_token,
        context_position, candidate_length, candidate_token_length
    ]
    xs = []
    ys = []
    rs = []
    for argument in dataset:
        text, nr, r = argument
        for c in nr:
            xs += [generate_feature(argument, c, idf, feature_set)]
            ys += [0.0]
            rs += [c]
        for c in r:
            xs += [generate_feature(argument, c, idf, feature_set)]
            ys += [1.0]
            rs += [c]
    return rs, xs, ys
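# `calc_idf` is assumed elsewhere in the repo. A minimal sketch, under the
# assumption that each dataset entry is (text, non_reason_candidates,
# reason_candidates) and that each candidate is a token list; treating one
# candidate as one IDF document is an assumption, not the repo's exact choice.
import math
from collections import Counter

def calc_idf(dataset):
    df = Counter()
    n_docs = 0
    for text, nr, r in dataset:
        for candidate in list(nr) + list(r):
            n_docs += 1
            for token in set(candidate):
                df[token] += 1
    # smoothed inverse document frequency
    return {tok: math.log(n_docs / (1 + cnt)) for tok, cnt in df.items()}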
import numpy as np
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import data_preprocess
import deep_learning_models

# Choose the decimal-preprocessed data from the benchmark dataset.
# We only use level 5 limit order book data.
# Prepare the data for training.
file_path = 'data/'
data_name = '_Dst_Auction_DecPre_CF_1.txt'
data_level = 5
forecast_size = 10
look_back = 100
data_train = data_preprocess.read_benchmark_data(
    file_path + 'Train' + data_name, data_level)
data_x, data_y = data_preprocess.generate_data(data_train, forecast_size,
                                               look_back)
del data_train

# Get input and target data for training our deep learning model.
# Check the label distribution.
train_price, train_volume, train_prob = data_preprocess.benchmark_data_for_model(
    data_x, data_y)
print('positive sample ratio in train: ', np.mean(train_prob[:, 0]))
print('negative sample ratio in train: ', np.mean(train_prob[:, 1]))
print('neutral sample ratio in train: ', np.mean(train_prob[:, 2]))
del data_x, data_y

# Set batch size and learning rate, then train the deep learning model.
batch_size = 256
learning_rate = 0.001
cnn_model = deep_learning_models.cnn_classification_benchmark_mid_price_model(
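# data_preprocess.generate_data used above is not shown in this file. A
# minimal sketch of the sliding-window construction its arguments suggest,
# assuming `data` is a 2-D array of snapshots with the label in the last
# column; the column layout is an assumption, not the repo's exact code.
import numpy as np

def generate_data_sketch(data, forecast_size, look_back):
    xs, ys = [], []
    for t in range(look_back, len(data) - forecast_size):
        xs.append(data[t - look_back:t])            # past `look_back` snapshots
        ys.append(data[t + forecast_size - 1, -1])  # label `forecast_size` steps ahead
    return np.array(xs), np.array(ys)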
import numpy as np
import keras

from model import full_model
from data_preprocess import generate_data

period = 30
x, y = generate_data(period)

# time-ordered 80/20 train/test split
total = len(x)
x_train = np.array(x[:int(0.8 * total)])
y_train = np.array(y[:int(0.8 * total)])
x_test = np.array(x[int(0.8 * total):])
y_test = np.array(y[int(0.8 * total):])
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

callbacks = [
    keras.callbacks.EarlyStopping(monitor='loss', patience=25, verbose=1),
    keras.callbacks.ModelCheckpoint("Resnet_50_{epoch:03d}.hdf5",
                                    monitor='loss',
                                    verbose=1,
                                    mode='auto'),
    keras.callbacks.ReduceLROnPlateau(monitor='loss',
                                      factor=0.5,
                                      patience=5,
                                      verbose=1,
                                      mode='auto',
                                      # `epsilon` was renamed `min_delta`
                                      # in later Keras releases
                                      epsilon=0.01,
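# The training call is cut off above; a hedged sketch of how these callbacks
# would typically be wired in. The no-argument `full_model()` constructor,
# epoch count, and batch size are assumptions, not the repo's exact settings.
model = full_model()  # hypothetical constructor signature
model.fit(x_train, y_train,
          batch_size=32,
          epochs=200,
          callbacks=callbacks,
          verbose=1)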
import os
import datetime as dt

from numpy.random import seed

# fix the random seed for reproducibility
seed(2020)

SF_impute_median = './data_filter5_median_impute/SF_impute_with_median.xlsx'
SF_impute_svd = './data_filter30_svdimpute/SF.xlsx'
OV_impute_median = './data_filter5_median_impute/OV_impute_with_median.xlsx'
OV_impute_svd = './data_filter30_svdimpute/OV.xlsx'
CHWH_impute_median = './data_filter5_median_impute/CHWH_impute_with_median.xlsx'
CHWH_impute_svd = './data_filter30_svdimpute/CHWH_replace.xlsx'

X_train, y_train, X_test_zf, y_test_zf, id_zf = generate_train_data(
    SF_impute_svd, delete_n_last_features=True, n_last=17)
test_data_gg, test_label_gg, columns, id_gg = generate_data(
    OV_impute_svd, delete_n_last_features=True, n_last=17)
test_data_xy, test_label_xy, _, id_xy = generate_data(
    CHWH_impute_svd, delete_n_last_features=True, n_last=17)

# results path
path = './results/' + dt.datetime.now().strftime(
    '%Y%m%d-%H-%M') + '-impute-svd'
os.makedirs(path, exist_ok=True)
# saved-models path
save_models = './save_models'
os.makedirs(save_models, exist_ok=True)
# feature rank path
rank_path = './feature_select/' + dt.datetime.now().strftime(
    '%Y%m%d-%H-%M') + '-feature_rank'
os.makedirs(rank_path, exist_ok=True)
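# `generate_train_data` / `generate_data` live elsewhere in the repo; the
# `delete_n_last_features=True, n_last=17` pattern suggests dropping the 17
# lowest-ranked feature columns. A minimal sketch under that assumption; the
# 'id' and 'label' column names and the trailing-column convention are
# hypothetical.
import pandas as pd

def generate_data_sketch(xlsx_path, delete_n_last_features=False, n_last=0):
    df = pd.read_excel(xlsx_path)
    ids = df.pop('id')            # hypothetical id column
    y = df.pop('label').values    # hypothetical label column
    if delete_n_last_features and n_last > 0:
        df = df.iloc[:, :-n_last]  # drop the n_last right-most feature columns
    return df.values, y, df.columns, ids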
import numpy as np

import data_preprocess

# Load the order and message books. We use the level 5 limit order book data.
file_path = 'data/'
data_name = 'AAPL_2012-06-21_34200000_57600000_'
data_level = 5
data_order = np.loadtxt(file_path + data_name + 'orderbook_' +
                        str(data_level) + '.csv',
                        delimiter=',')
data_message = np.loadtxt(file_path + data_name + 'message_' +
                          str(data_level) + '.csv',
                          delimiter=',')

# Set the time window, forecast size, and look-back range.
time_window = 0.25
forecast_size = 100
look_back = 100

# Resample the data onto an evenly spaced time grid, then split the one-day
# dataset into two half-day datasets.
evenly_spaced_data = data_preprocess.rescale_data(data_order, data_message,
                                                  data_level, time_window)
data_x, data_y = data_preprocess.generate_data(evenly_spaced_data,
                                               forecast_size, look_back)
del evenly_spaced_data

# Model for long positions only.
# Set the profit threshold and generate the data for training and testing.
profit_threshold_for_model = 0.03
train_x, train_y, test_x, test_y = data_preprocess.train_test_split(
    data_x, data_y, forecast_size)
train_price, train_volume, train_prob = data_preprocess.data_for_trading_model(
    train_x, train_y, 'long', profit_threshold_for_model)
test_price, test_volume, test_prob = data_preprocess.data_for_trading_model(
    test_x, test_y, 'long', profit_threshold_for_model)
print('positive sample ratio in train: ', np.mean(train_prob))
print('positive sample ratio in test: ', np.mean(test_prob))

# Set batch size and learning rate.
# Train the model and predict the probability of making a profit by going
# long one share of stock.
batch_size = 256
learning_rate = 0.001
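# data_preprocess.rescale_data is defined elsewhere. A minimal sketch of
# resampling event-driven LOBSTER snapshots onto an evenly spaced time grid,
# assuming the message file's first column is the event timestamp in seconds;
# carrying the most recent snapshot forward at each grid point is an
# assumption about the repo's approach.
import numpy as np

def rescale_data_sketch(data_order, data_message, data_level, time_window):
    times = data_message[:, 0]
    grid = np.arange(times[0], times[-1], time_window)
    # index of the last event at or before each grid point (forward fill)
    idx = np.searchsorted(times, grid, side='right') - 1
    return data_order[idx]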
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23

from data_preprocess import generate_train_data, generate_data

path = './save_models/20200626-21-50/'
models = [
    path + i for i in ['rf.pkl', 'gbdt.pkl', 'lrl2.pkl', 'svm.pkl', 'mlp.pkl']
]

# Load the data. The x_ data already carries the feature names and is in
# DataFrame format.
X_train, y_train, X_test_zf, y_test_zf, id_zf = generate_train_data(
    './data_filter30_svdimpute/SF.xlsx',
    delete_n_last_features=False,
    over_sample=False)
x_data_gg, test_label_gg, columns, id_gg = generate_data(
    './data_filter30_svdimpute/OV.xlsx', delete_n_last_features=False)
x_data_xy, test_label_xy, _, id_xy = generate_data(
    './data_filter30_svdimpute/CHWH.xlsx', delete_n_last_features=False)

# Predict with every saved model on every test set.
for x in [X_test_zf, x_data_gg, x_data_xy]:
    for model in models:
        clf = joblib.load(model)
        result = clf.predict(x)

print('finished!')
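# The loop above discards each model's predictions. A hedged sketch of
# collecting them per (dataset, model) pair for later inspection; the
# dataset names used as keys are assumptions for illustration.
predictions = {}
for name, x in [('zf', X_test_zf), ('gg', x_data_gg), ('xy', x_data_xy)]:
    for model in models:
        clf = joblib.load(model)
        predictions[(name, model)] = clf.predict(x)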