forked from adrian-aley/sample_trading_system
-
Notifications
You must be signed in to change notification settings - Fork 0
/
train_test_split.py
64 lines (54 loc) · 2.47 KB
/
train_test_split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/python
# -*- coding: utf-8 -*-
# train_test_split.py
import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.svm import LinearSVC, SVC
from create_lagged_series import create_lagged_series
def main():
    """Fit several classifiers on lagged S&P500 returns and print the results.

    Builds the lagged data set for "^GSPC" via ``create_lagged_series``, uses
    the "Lag1" and "Lag2" columns as predictors and "Direction" as the
    response, splits the rows into training and test sets, and then, for each
    model, prints its hit rate and confusion matrix measured on the test set.
    """
    # Create a lagged series of the S&P500 US stock market index
    snpret = create_lagged_series(
        "^GSPC", datetime.datetime(2000, 1, 1),
        datetime.date.today(), lags=5
    )

    # Use the prior two days of returns as predictor
    # values, with direction as the response
    X = snpret[["Lag1", "Lag2"]]
    y = snpret["Direction"]

    # Train/test split; random_state pins the shuffle so the split is
    # repeatable between runs.
    # NOTE(review): test_size=0.8 holds out 80% of the rows for testing and
    # trains on only 20% -- confirm this is intended and not a swapped 0.2.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.8, random_state=42
    )

    # Create the (parametrised) models
    print("Hit Rates/Confusion Matrices:\n")
    models = [
        ("Logistic Regression (LR)", LogisticRegression()),
        ("Linear Discriminant Analysis (LDA)", LDA()),
        ("Quadratic Discriminant Analysis (QDA)", QDA()),
        ("Linear Support Vector Classification (LSVC)", LinearSVC()),
        ("Reduced Support Vector Machines (RSVM)", SVC(
            C=1000000.0, cache_size=200, class_weight=None,
            coef0=0.0, degree=3, gamma=0.0001, kernel='rbf',
            max_iter=-1, probability=False, random_state=None,
            shrinking=True, tol=0.001, verbose=False)
        ),
        # max_features='sqrt' is what the former 'auto' setting meant for
        # classifiers; 'auto' itself is rejected by current scikit-learn.
        ("Random Forest Classifier (RF)", RandomForestClassifier(
            n_estimators=1000, criterion='gini',
            max_depth=None, min_samples_split=2,
            min_samples_leaf=1, max_features='sqrt',
            bootstrap=True, oob_score=False, n_jobs=1,
            random_state=None, verbose=0)
        ),
    ]

    # Iterate through the models
    for name, model in models:
        # Train each of the models on the training set
        model.fit(X_train, y_train)

        # Make an array of predictions on the test set
        pred = model.predict(X_test)

        # Output the hit-rate and the confusion matrix for each model.
        # confusion_matrix takes (y_true, y_pred), so rows are the actual
        # directions and columns are the predicted ones.
        print("%s:\n%0.3f" % (name, model.score(X_test, y_test)))
        print("%s\n" % confusion_matrix(y_test, pred))


if __name__ == "__main__":
    main()