/
PCA.py
75 lines (55 loc) · 2.45 KB
/
PCA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# -*- coding: UTF-8 -*-
import numpy as np
from time import time
from SVM.DataHandler import DataHandler
from sklearn.cross_validation import KFold
from sklearn.decomposition import RandomizedPCA
from sklearn import svm
from sklearn import neighbors
# Default number of cross-validation folds (default for `folds_number` in pca_estimator).
DEFAULT_FOLDS_NUMBER = 5
# Default number of PCA components (default for `components_number` in pca_estimator).
DEFAULT_COMPONENTS_NUMBER = 10
def pca_estimator(data, targets, estimator, components_number=DEFAULT_COMPONENTS_NUMBER,
                  folds_number=DEFAULT_FOLDS_NUMBER):
    """Cross-validate `estimator` on PCA-reduced data and time the run.

    For every fold a new RandomizedPCA (whitening enabled) is fitted on the
    training part only; the training and test parts are both projected with
    it, then the estimator is fitted on the projected training part and
    scored on the projected test part.

    Args:
        data: Samples; indexed with the index arrays produced by KFold.
        targets: Labels indexed the same way as `data`; `len(targets)` is the
            sample count handed to KFold.
        estimator: Object exposing `fit(X, y)` and `score(X, y)`. `fit` is
            called on this same object once per fold, and `score` is called
            on whatever `fit` returns.
        components_number: `n_components` passed to RandomizedPCA.
        folds_number: `n_folds` passed to KFold.

    Returns:
        Tuple `(mean of the fold scores, twice their standard deviation,
        elapsed time of the whole fold loop as measured by time())`.
    """
    # NOTE(review): `sklearn.cross_validation.KFold` and `RandomizedPCA` belong
    # to an old scikit-learn API and are believed to be gone from current
    # releases -- confirm the scikit-learn version this script is pinned to.
    folds = KFold(len(targets), n_folds=folds_number)

    # One slot per fold, holding the value `score()` reports for that fold.
    fold_scores = np.zeros(folds_number)

    started_at = time()
    for fold_index, (train_rows, test_rows) in enumerate(folds):
        train_x, test_x = data[train_rows], data[test_rows]
        train_y, test_y = targets[train_rows], targets[test_rows]

        # The projection is learned from the training rows only and then
        # applied unchanged to the held-out rows.
        reducer = RandomizedPCA(n_components=components_number, whiten=True)
        fitted_reducer = reducer.fit(train_x)
        reduced_train = fitted_reducer.transform(train_x)
        reduced_test = fitted_reducer.transform(test_x)

        fitted_model = estimator.fit(reduced_train, train_y)
        fold_scores[fold_index] = fitted_model.score(reduced_test, test_y)
    finished_at = time()

    return fold_scores.mean(), fold_scores.std() * 2, (finished_at - started_at)
if __name__ == "__main__":
    # Load the training set and keep only its last `samples_size` entries.
    handler = DataHandler()
    all_data, all_targets = handler.get_training_data()
    samples_size = 5000
    training_data = all_data[-samples_size:]
    training_targets = all_targets[-samples_size:]

    # Estimator under evaluation; the commented lines are alternative choices
    # that can be swapped in by uncommenting.
    # estimator = svm.SVC(kernel='linear', C=1)
    estimator = svm.SVC(kernel='rbf', C=1, gamma=0.0001)
    # estimator = neighbors.KNeighborsClassifier(n_neighbors=12)

    # Best result so far as (n_components, mean, deviation, time). Starting
    # from zeros means a candidate is recorded only when its mean score is
    # strictly greater than the current best, so ties keep the earlier one.
    best = (0, 0, 0, 0)
    for n_components in range(1, 100):
        mean, standard_deviation, work_time = pca_estimator(
            training_data, training_targets, estimator, n_components)
        if mean > best[1]:
            best = (n_components, mean, standard_deviation, work_time)
        # Progress output: the component count that was just evaluated.
        print(n_components)

    best_components_number, best_mean, best_standard_deviation, best_time = best
    print("N_components: %d" % best_components_number)
    print("Accuracy: %0.2f (+/- %0.2f)" % (best_mean, best_standard_deviation))
    print("Time: %0.2f" % best_time)