-
Notifications
You must be signed in to change notification settings - Fork 0
/
models.py
100 lines (77 loc) · 2.9 KB
/
models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import numpy as np
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from features import read_features
from utils import plot_scores, plot_learning_curve, normalize_features, clean_data, impute_nan
def train_model(X, y, clf):
    """Evaluate *clf* on (X, y) with stratified 5-fold cross-validation.

    For each fold: missing values are imputed and features normalized
    (helpers take train/test splits together, so scaling statistics come
    from the training split), the classifier is fit, and the fold's
    train/test accuracy plus confusion matrix are recorded.

    Returns:
        (mean test accuracy, mean train accuracy, confusion matrices
        stacked into one ndarray — one matrix per fold).

    NOTE(review): uses the legacy ``sklearn.cross_validation`` API
    (``StratifiedKFold(y, n_folds=...)``), removed in scikit-learn 0.20;
    this module requires an older scikit-learn.
    """
    folds = cross_validation.StratifiedKFold(y, n_folds=5)
    fold_train_acc = []
    fold_test_acc = []
    confusion_matrices = []
    for train_idx, test_idx in folds:
        X_tr, y_tr = X[train_idx], y[train_idx]
        X_te, y_te = X[test_idx], y[test_idx]
        # Preprocess per fold so test data never leaks into fitting.
        X_tr, X_te = impute_nan(X_tr, X_te)
        X_tr, X_te = normalize_features(X_tr, X_te)
        clf.fit(X_tr, y_tr)
        fold_train_acc.append(clf.score(X_tr, y_tr))
        fold_test_acc.append(clf.score(X_te, y_te))
        confusion_matrices.append(confusion_matrix(y_te, clf.predict(X_te)))
    return np.mean(fold_test_acc), np.mean(fold_train_acc), np.asarray(confusion_matrices)
def find_best_k(X, y):
    """Grid-search the neighbor count for distance-weighted KNN.

    Tries k in {1, 2, 4, 8, 16, 32, 48, 64}, cross-validating each via
    ``train_model``, and prints the lists of mean test/train accuracies.
    """
    candidate_ks = [1, 2, 4, 8, 16, 32, 48, 64]
    test_scores = []
    train_scores = []
    cm_totals = []
    for k in candidate_ks:
        model = KNeighborsClassifier(n_neighbors=k, weights='distance')
        test_acc, train_acc, cms = train_model(X, y, model)
        test_scores.append(test_acc)
        train_scores.append(train_acc)
        # Aggregate the per-fold confusion matrices for this k.
        cm_totals.append(np.sum(cms, axis=0))
    print("test: ", test_scores)
    print("train: ", train_scores)
    # plot_scores(candidate_ks, test_scores, train_scores)
def svm_tuning(X, y):
    """Tune the regularization strength C of a linear-kernel SVM.

    For each candidate C, runs stratified 5-fold cross-validation via
    ``train_model`` and prints the mean train/test accuracies.
    """
    C_range = [1E6, 1E7, 1E8]
    for c in C_range:
        # BUG FIX: SVC was referenced without ever being imported, so this
        # function raised NameError on first use; the import is now added
        # at module level (from sklearn.svm import SVC).
        svm = SVC(kernel='linear', C=c)
        test_score, train_score, cms = train_model(X, y, svm)
        print("For C: %f, train_score=%f, test_score=%f" % (c, train_score, test_score))
        print()
def rf_tuning(X, y):
    """Tune the number of trees in a random forest.

    For each candidate n_estimators, runs stratified 5-fold
    cross-validation via ``train_model`` and prints the feature
    importances, mean train/test accuracies, and the confusion
    matrix summed over folds.
    """
    n_trees = [100, 500, 800]
    for n in n_trees:
        rf = RandomForestClassifier(n_estimators=n)
        test_score, train_score, cms = train_model(X, y, rf)
        # train_model fits rf once per fold, so these importances come
        # from the last CV fold's fit.
        print(rf.feature_importances_)
        # FIX: n is an integer tree count — %d instead of %f, which
        # printed e.g. "For n_tree: 100.000000".
        print("For n_tree: %d, train_score=%f, test_score=%f" % (n, train_score, test_score))
        print(np.sum(cms, axis=0))
        print()
if __name__ == "__main__":
    # Feature columns to load — presumably zero-crossing rate, RMS,
    # spectral centroid/rolloff/flux and MFCCs (inferred from the
    # abbreviations; confirm against features.read_features).
    feature_names = ['zcr', 'rms', 'sc', 'sr', 'sf', 'mfcc']
    X, y = read_features(feature_names)
    X = clean_data(X)
    # Earlier experiments, kept for reference:
    # find_best_k(X, y)
    # clf = KNeighborsClassifier(n_neighbors=8, weights='distance')
    # crossvalidation = cross_validation.StratifiedKFold(y, n_folds=5)
    # plot_learning_curve(clf, "knn_learning_curve", X, y, cv=crossvalidation, n_jobs=4)
    rf_tuning(X, y)