Example #1
import numpy as np
from sklearn.metrics import roc_curve, auc
from data_preprocess import generate_train_data, generate_data


# rf, lrl2, svm, and voting are project-local training/ensembling helpers
# whose definitions are not shown in this snippet.
def run(res_auc, delete_n_last_common_features, n_common_last,
        delete_n_last_features, n_last):
    X_train, y_train, X_test_zf, y_test_zf, id_zf = generate_train_data(
        SF_impute_svd,
        delete_n_last_common_features=delete_n_last_common_features,
        n_common_last=n_common_last,
        delete_n_last_features=delete_n_last_features,
        n_last=n_last)
    test_data_gg, test_label_gg, columns, id_gg = generate_data(
        OV_impute_svd,
        delete_n_last_common_features=delete_n_last_common_features,
        n_common_last=n_common_last,
        delete_n_last_features=delete_n_last_features,
        n_last=n_last)
    test_data_xy, test_label_xy, _, id_xy = generate_data(
        CHWH_impute_svd,
        delete_n_last_common_features=delete_n_last_common_features,
        n_common_last=n_common_last,
        delete_n_last_features=delete_n_last_features,
        n_last=n_last)

    y_train = y_train.reshape(len(y_train), )
    X_test = [test_data_gg, X_test_zf, test_data_xy]
    y_test = [
        test_label_gg.reshape(len(test_label_gg), ),
        y_test_zf.reshape(len(y_test_zf), ),
        test_label_xy.reshape(len(test_label_xy), )
    ]
    # n is the res_auc row index: the number of features dropped (0 if none)
    n = 0
    if delete_n_last_common_features:
        n = n_common_last
    if delete_n_last_features:
        n = n_last
    # use .loc[row, col] instead of chained indexing, which can silently
    # fail to write back in pandas
    res_auc.loc[n, 'feature_num'] = X_train.shape[1]
    weights1 = {
        'gg': np.array([0.4, 0.3, 0.3]),
        'zf': np.array([0.4, 0.3, 0.3]),
        'xy': np.array([0.6, 0.3, 0.1])
    }
    for i, hp in enumerate(['gg', 'zf', 'xy']):
        rf_results = rf(X_train, y_train, X_test[i], y_test[i])
        lrl2_results = lrl2(X_train, y_train, X_test[i], y_test[i])
        svm_results = svm(X_train, y_train, X_test[i], y_test[i])
        # voting method
        vote_results = voting(rf_results[2], lrl2_results[2], svm_results[2],
                              weights1, 'soft', hp)
        fpr, tpr, thr_ = roc_curve(y_test[i],
                                   vote_results[2].T[1],
                                   pos_label=2)
        res_auc.loc[n, hp] = auc(fpr, tpr)
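
# For reference, a minimal sketch of what a soft-voting combiner such as the
# project's voting() helper might do (hypothetical: the real implementation is
# not shown here). It blends the three classifiers' class-probability arrays
# with the per-site weights, matching the vote_results[2] usage above.
def soft_vote_sketch(proba_rf, proba_lr, proba_svm, weights, hp):
    w = weights[hp]  # e.g. np.array([0.4, 0.3, 0.3])
    return w[0] * proba_rf + w[1] * proba_lr + w[2] * proba_svm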
Example #2
import data_preprocess


def generate_dataset(path):
    dataset = data_preprocess.generate_data(path)
    idf = calc_idf(dataset)
    # calc_idf, generate_feature, and the feature extractor functions below
    # are project-local helpers, assumed to be imported elsewhere
    feature_set = [
        total_idf, average_idf, discourse_marker, numeric_token,
        contains_quote, num_quote, lexicon_token, marker_token,
        context_position, candidate_length, candidate_token_length
    ]
    xs = []
    ys = []
    rs = []
    for argument in dataset:
        text, nr, r = argument
        for c in nr:  # candidates labeled as negatives (0.0)
            xs.append(generate_feature(argument, c, idf, feature_set))
            ys.append(0.0)
            rs.append(c)
        for c in r:  # candidates labeled as positives (1.0)
            xs.append(generate_feature(argument, c, idf, feature_set))
            ys.append(1.0)
            rs.append(c)

    return rs, xs, ys
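
# A minimal usage sketch (the corpus path is hypothetical): feed the feature
# matrix and labels into a scikit-learn classifier.
from sklearn.linear_model import LogisticRegression

rs, xs, ys = generate_dataset('data/corpus.txt')
clf = LogisticRegression(max_iter=1000)
clf.fit(xs, ys)
print('train accuracy:', clf.score(xs, ys))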
Example #3
import numpy as np
from sklearn.metrics import recall_score, f1_score
import data_preprocess
import deep_learning_models

# choose the decimal-precision (DecPre) preprocessed data from the benchmark
# dataset. We only use level 5 limit order book data.
# prepare the data for training.
file_path = 'data/'
data_name = '_Dst_Auction_DecPre_CF_1.txt'
data_level = 5
forecast_size = 10
look_back = 100
data_train = data_preprocess.read_benchmark_data(
    file_path + 'Train' + data_name, data_level)
data_x, data_y = data_preprocess.generate_data(data_train, forecast_size,
                                               look_back)
del data_train

# get input and target data for training our deep learning model.
# check the label distribution.
train_price, train_volume, train_prob = data_preprocess.benchmark_data_for_model(
    data_x, data_y)
print('positive sample ratio in train: ', np.mean(train_prob[:, 0]))
print('negative sample ratio in train: ', np.mean(train_prob[:, 1]))
print('neutral sample ratio in train: ', np.mean(train_prob[:, 2]))
del data_x, data_y

# set batch size and learning rate. Train the deep learning model.
batch_size = 256
learning_rate = 0.001
# hypothetical argument list: the factory's full signature is not shown in
# this snippet
cnn_model = deep_learning_models.cnn_classification_benchmark_mid_price_model(
    train_price, train_volume, train_prob, batch_size, learning_rate)
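
# Illustrative use of the imported metrics (hypothetical: assumes the trained
# model exposes a Keras-style predict and that train_prob is one-hot encoded).
pred = np.argmax(cnn_model.predict([train_price, train_volume]), axis=1)
true = np.argmax(train_prob, axis=1)
print('recall:', recall_score(true, pred, average='macro'))
print('f1:', f1_score(true, pred, average='macro'))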
Example #4
import numpy as np
from model import full_model
from data_preprocess import generate_data
import keras
period = 30
x, y = generate_data(period)
total = len(x)
# chronological 80/20 train/test split
x_train = np.array(x[:int(0.8 * total)])
y_train = np.array(y[:int(0.8 * total)])

x_test = np.array(x[int(0.8 * total):])
y_test = np.array(y[int(0.8 * total):])

print(x_train.shape)
print(y_train.shape)

print(x_test.shape)
print(y_test.shape)

callbacks = [
    keras.callbacks.EarlyStopping(monitor='loss', patience=25, verbose=1),
    keras.callbacks.ModelCheckpoint("Resnet_50_{epoch:03d}.hdf5",
                                    monitor='loss',
                                    verbose=1,
                                    mode='auto'),
    keras.callbacks.ReduceLROnPlateau(monitor='loss',
                                      factor=0.5,
                                      patience=5,
                                      verbose=1,
                                      mode='auto',
                                      epsilon=0.01)  # renamed min_delta in newer Keras
]
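
# A plausible continuation (hypothetical: full_model's constructor arguments,
# loss, and training hyperparameters are not shown in this snippet).
model = full_model(input_shape=x_train.shape[1:])
model.compile(optimizer=keras.optimizers.Adam(), loss='mse')
model.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          epochs=200, batch_size=64,
          callbacks=callbacks)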
Example #5
import os
import datetime as dt
from numpy.random import seed
from data_preprocess import generate_train_data, generate_data

# fix the random seed for reproducibility
seed(2020)

SF_impute_median = './data_filter5_median_impute/SF_impute_with_median.xlsx'
SF_impute_svd = './data_filter30_svdimpute/SF.xlsx'

OV_impute_median = './data_filter5_median_impute/OV_impute_with_median.xlsx'
OV_impute_svd = './data_filter30_svdimpute/OV.xlsx'

CHWH_impute_median = './data_filter5_median_impute/CHWH_impute_with_median.xlsx'
CHWH_impute_svd = './data_filter30_svdimpute/CHWH_replace.xlsx'

X_train, y_train, X_test_zf, y_test_zf, id_zf = generate_train_data(
    SF_impute_svd, delete_n_last_features=True, n_last=17)
test_data_gg, test_label_gg, columns, id_gg = generate_data(
    OV_impute_svd, delete_n_last_features=True, n_last=17)
test_data_xy, test_label_xy, _, id_xy = generate_data(
    CHWH_impute_svd, delete_n_last_features=True, n_last=17)
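
# quick sanity check (illustrative): with delete_n_last_features=True and
# n_last=17, all three splits should expose the same reduced feature count
print(X_train.shape[1], test_data_gg.shape[1], test_data_xy.shape[1])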

# results path
path = './results/' + dt.datetime.now().strftime(
    '%Y%m%d-%H-%M') + '-impute-svd'
os.makedirs(path, exist_ok=True)
# save models path
save_models = './save_models'
os.makedirs(save_models, exist_ok=True)
# feature rank path
rank_path = './feature_select/' + dt.datetime.now().strftime(
    '%Y%m%d-%H-%M') + '-feature_rank'
os.makedirs(rank_path, exist_ok=True)
Example #6
import numpy as np
import data_preprocess

# load the order book and message book. We use the level 5 limit order book data.
file_path = 'data/'
data_name = 'AAPL_2012-06-21_34200000_57600000_'
data_level = 5
data_order = np.loadtxt(file_path + data_name + 'orderbook_' + str(data_level) + '.csv', delimiter=',')
data_message = np.loadtxt(file_path + data_name + 'message_' + str(data_level) + '.csv', delimiter=',')

# set time window, forecast size, and look back range.
time_window = 0.25
forecast_size = 100
look_back = 100

# resample the data onto an evenly spaced time grid, then split the one-day
# dataset into two half-day datasets.
evenly_spaced_data = data_preprocess.rescale_data(data_order, data_message, data_level, time_window)
data_x, data_y = data_preprocess.generate_data(evenly_spaced_data, forecast_size, look_back)
del evenly_spaced_data

# model for long positions only
# set profit threshold and generate the data for training and testing.
profit_threshold_for_model = 0.03
train_x, train_y, test_x, test_y = data_preprocess.train_test_split(data_x, data_y, forecast_size)
train_price, train_volume, train_prob = data_preprocess.data_for_trading_model(train_x, train_y, 'long', profit_threshold_for_model)
test_price, test_volume, test_prob = data_preprocess.data_for_trading_model(test_x, test_y, 'long', profit_threshold_for_model)
print('positive sample ratio in train: ', np.mean(train_prob))
print('positive sample ratio in test: ', np.mean(test_prob))

# set batch size and learning rate.
# train the model and predict the probability of making a profit by going
# long one share of stock.
batch_size = 256
learning_rate = 0.001
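
# A hedged continuation sketch: the snippet stops before the training call, so
# the factory name and signature below are assumptions borrowed from the
# benchmark example above.
import deep_learning_models

trading_model = deep_learning_models.cnn_classification_benchmark_mid_price_model(
    train_price, train_volume, train_prob, batch_size, learning_rate)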
Example #7
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23
from data_preprocess import generate_train_data, generate_data

path = './save_models/20200626-21-50/'

models = [path+i for i in ['rf.pkl', 'gbdt.pkl', 'lrl2.pkl', 'svm.pkl', 'mlp.pkl']]

# load data; the X_* frames already include feature names (DataFrame format)
X_train, y_train, X_test_zf, y_test_zf, id_zf = generate_train_data('./data_filter30_svdimpute/SF.xlsx',
                                                                    delete_n_last_features=False,
                                                                    over_sample=False)
x_data_gg, test_label_gg, columns, id_gg = generate_data('./data_filter30_svdimpute/OV.xlsx',
                                                         delete_n_last_features=False)
x_data_xy, test_label_xy, _, id_xy = generate_data('./data_filter30_svdimpute/CHWH.xlsx',
                                                   delete_n_last_features=False)
# predict with every saved model on each test set and keep the outputs
predictions = {}
for name, x in zip(['zf', 'gg', 'xy'], [X_test_zf, x_data_gg, x_data_xy]):
    for model_path in models:
        clf = joblib.load(model_path)
        predictions[(name, model_path)] = clf.predict(x)

print('finished!')
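
# Illustrative scoring sketch (assumes each saved model implements predict and
# that the labels are the ones loaded above).
from sklearn.metrics import accuracy_score

rf_clf = joblib.load(path + 'rf.pkl')
print('rf accuracy on OV:', accuracy_score(test_label_gg, rf_clf.predict(x_data_gg)))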