/
DT.py
73 lines (51 loc) · 2.22 KB
/
DT.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
def analysis_glass(fp="glass.csv", test_size=0.3, random_state=None):
    """Load the glass dataset and split it into train and test arrays.

    The CSV is read with pandas and split row-wise. Columns 0-8 of every
    row are returned as features and column 9 as the label.

    Args:
        fp: Path of the CSV file to read. Defaults to ``"glass.csv"``,
            resolved against the current working directory.
        test_size: Fraction of rows placed in the test split.
        random_state: Seed forwarded to ``train_test_split``. ``None``
            (the default) gives a different split on every call.

    Returns:
        A tuple ``(train_X, train_Y, test_X, test_Y)`` of NumPy arrays.
    """
    # ``sklearn.cross_validation`` was removed in scikit-learn 0.20; prefer
    # the current location and fall back for very old installations.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    df = pd.read_csv(fp)
    train, test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` is
    # available in every pandas release.
    train = train.values
    test = test.values

    train_X = train[:, 0:9]
    train_Y = train[:, 9]
    test_X = test[:, 0:9]
    test_Y = test[:, 9]
    return train_X, train_Y, test_X, test_Y
def do_gradient_boost(lr=1.0, md=1):
    """Fit a gradient-boosting classifier on the glass data and score it.

    Args:
        lr: Learning rate passed to ``GradientBoostingClassifier``.
        md: Maximum depth of the individual trees.

    Returns:
        The value of ``score`` on the held-out test split.
    """
    # The best values of lr and md have to be determined through grid search;
    # for this dataset ~ lr = 0.05, md = 3 gave 0.769 on the test set.
    from sklearn.ensemble import GradientBoostingClassifier

    X_fit, y_fit, X_eval, y_eval = analysis_glass()
    booster = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=lr,
        max_depth=md,
        random_state=0,
    )
    booster.fit(X_fit, y_fit)
    return booster.score(X_eval, y_eval)
def do_random_forests(md=None):
    """Fit a 100-tree random forest on the glass data and score it.

    Args:
        md: Maximum tree depth passed to ``RandomForestClassifier``;
            ``None`` is forwarded unchanged.

    Returns:
        The value of ``score`` on the held-out test split.
    """
    from sklearn.ensemble import RandomForestClassifier

    X_fit, y_fit, X_eval, y_eval = analysis_glass()
    forest = RandomForestClassifier(n_estimators=100, max_depth=md).fit(
        X_fit, y_fit
    )
    return forest.score(X_eval, y_eval)
# Try extremely randomized trees
def do_extra_trees(md=None):
    """Fit a 100-tree extremely-randomized-trees classifier and score it.

    Args:
        md: Maximum tree depth passed to ``ExtraTreesClassifier``;
            ``None`` is forwarded unchanged.

    Returns:
        The value of ``score`` on the held-out test split.
    """
    from sklearn.ensemble import ExtraTreesClassifier

    X_fit, y_fit, X_eval, y_eval = analysis_glass()
    extra_trees = ExtraTreesClassifier(n_estimators=100, max_depth=md).fit(
        X_fit, y_fit
    )
    return extra_trees.score(X_eval, y_eval)
# Let's try a combination of Random Trees Embedding and Naive Bayes
def do_TRT(ne=10, md=3):
    """Score Bernoulli Naive Bayes on a random-trees embedding of the data.

    The train and test features are stacked, transformed together by a
    ``RandomTreesEmbedding`` and split back into their original row groups.
    A ``BernoulliNB`` model is then fitted on the embedded training rows and
    scored on the embedded test rows.

    Args:
        ne: Number of trees in the embedding (``n_estimators``).
        md: Maximum depth of each embedding tree (``max_depth``).

    Returns:
        The value of ``BernoulliNB.score`` on the embedded test split.
    """
    from sklearn.ensemble import RandomTreesEmbedding
    from sklearn.naive_bayes import BernoulliNB

    train_X, train_Y, test_X, test_Y = analysis_glass()

    # Derive the row boundary from the data instead of hard-coding it, so
    # the split stays correct for any dataset size or test fraction.
    n_train = train_X.shape[0]

    # The embedding uses no labels; it is fitted on the stacked train and
    # test features so both splits share the same encoding.
    all_X = np.vstack((train_X, test_X))
    hasher = RandomTreesEmbedding(n_estimators=ne, random_state=0, max_depth=md)
    all_X_trans = hasher.fit_transform(all_X)

    # Slice the *transformed* matrix. Slicing the raw ``all_X`` here would
    # discard the embedding and train Naive Bayes on the raw features.
    train_X_trans = all_X_trans[:n_train]
    test_X_trans = all_X_trans[n_train:]

    nb = BernoulliNB()
    nb.fit(train_X_trans, train_Y)
    return nb.score(test_X_trans, test_Y)