main.py
import pickle
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from binarizer import binarize
from fscore import fscore
from learning_curve import plot_learning_curve
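# `binarize`, `fscore` and `plot_learning_curve` come from local helper
# modules. A minimal sketch of what `binarize` is assumed to do (one-hot
# encode a single categorical column; the real implementation is in
# binarizer.py):
#
#   def binarize(column):
#       values = np.unique(column)
#       return (column[:, None] == values[None, :]).astype(float)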
# preprocess the input data
df = pd.read_csv("polen_data.csv")
# shuffle the dataset
df = df.sample(frac=1).reset_index(drop=True)
# remove unnecessary columns
del df["Unnamed: 0"], df["TENDENCIJA"]
# output vector
y = df['KONCENTRACIJA'].to_numpy()
del df['KONCENTRACIJA']
# output the column names
print(df.columns.values.tolist())
# binarize id columns
id_cols = ["vrstaBiljke", "ID_VRSTE", "ID_LOKACIJE"]
id_columns = df[id_cols].to_numpy()
for col in id_cols:
    del df[col]
id_columns_binarized = binarize(id_columns[:, 0])
for i in range(1, len(id_cols)):
    id_columns_binarized = np.column_stack((id_columns_binarized, binarize(id_columns[:, i])))
# transform to suitable representation
X = df.to_numpy()
# testing the shapes before and after the binarization
print('before binarization X = ', X.shape)
print('id_columns_binarized = ', id_columns_binarized.shape)
X = np.column_stack((X, id_columns_binarized))
print('after binarization X = ', X.shape)
# clear from n/a's
clear_rows = ~np.isnan(X).any(axis=1)
X = X[clear_rows]
y = y[clear_rows]
print('Normal shape(X) = ', np.shape(X))
# create polynomial features
poly = PolynomialFeatures(degree=2)
X = poly.fit_transform(X)
print('Polynomial shape(X) = ', np.shape(X))
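# for scale: degree-2 PolynomialFeatures expands n input columns into
# (n + 1) * (n + 2) / 2 output columns (bias, linear terms, squares and
# pairwise products), e.g. n = 30 already yields 496 columns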
# cut off features with low variance
sel = VarianceThreshold(threshold=0.1)
X = sel.fit_transform(X)
print('Without low var features shape(X) = ', np.shape(X))
# hack: merge class 3 into class 2, since we have too few instances of both
y[y == 3] = 2
# find the size of the smallest class
under_sample = sys.maxsize
for i in range(min(y), max(y) + 1):
    temp_sum = sum(y == i)
    if temp_sum < under_sample:
        under_sample = temp_sum
for i in range(min(y), max(y) + 1):
    print('class ', i, ' samples = ', sum(y == i))
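# equivalently (a sketch, assuming the labels are small non-negative
# integers and every class from 0 to max(y) is populated):
#   under_sample = np.bincount(y).min()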
# under-sample each of the oversampled classes
X_leftover = []
y_leftover = []
for i in range(min(y), max(y) + 1):
    k = 1
    X_current_class = X[y == i]
    y_current_class = y[y == i]
    X = X[y != i]
    y = y[y != i]
    # keep class 1 in full (20x the minimum covers it)
    # and 4x the minimum for class 0
    if i == 1:
        k = 20
    elif i == 0:
        k = 4
    # under-sample the current class
    X = np.row_stack((X, X_current_class[0:k * under_sample, :]))
    y = np.concatenate((y, y_current_class[0:k * under_sample]), axis=0)
    # save the leftover training examples
    if not len(X_leftover) and not len(y_leftover):
        X_leftover = X_current_class[k * under_sample:, :]
        y_leftover = y_current_class[k * under_sample:]
    else:
        X_leftover = np.row_stack((X_leftover, X_current_class[k * under_sample:, :]))
        y_leftover = np.concatenate((y_leftover, y_current_class[k * under_sample:]), axis=0)
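# an equivalent per-class draw could use sklearn.utils.resample instead of
# slicing the pre-shuffled rows (a sketch, not what runs here):
#   from sklearn.utils import resample
#   n = min(k * under_sample, len(y_current_class))
#   X_sub, y_sub = resample(X_current_class, y_current_class,
#                           replace=False, n_samples=n, random_state=0)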
X, y = shuffle(X, y, random_state=0)
X_leftover, y_leftover = shuffle(X_leftover, y_leftover, random_state=0)
for i in range(min(y), max(y) + 1):
    print('class ', i, ' samples after undersampling = ', sum(y == i))
print('X_leftover size = ', np.shape(X_leftover))
print('y_leftover size = ', np.shape(y_leftover))
# scale the features to 0 mean and variance 1
std = StandardScaler()
X = std.fit_transform(X)
# split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
# scale the leftovers with the scaler fitted above (transform subtracts the
# mean and divides by the standard deviation, std.scale_, not std.var_)
# and add them to the test set
X_leftover = std.transform(X_leftover)
y_test = np.concatenate((y_leftover, y_test), axis=0)
X_test = np.row_stack((X_leftover, X_test))
X_test, y_test = shuffle(X_test, y_test, random_state=0)
for i in range(min(y), max(y) + 1):
    print('class ', i, ' training samples = ', sum(y_train == i))
for i in range(min(y), max(y) + 1):
    print('class ', i, ' test samples = ', sum(y_test == i))
"""
# try with blending multiple ensembles
best = 0
for i in range(0, 100):
current_score = blender(X_train, y_train, X_test, y_test)
if current_score > best:
best = current_score
"""
# RDF prediction model
clf = RandomForestClassifier(class_weight='balanced', n_estimators=500, n_jobs=-1)
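# note: class_weight='balanced' weights each class inversely proportional to
# its frequency in the training data, n_samples / (n_classes * np.bincount(y))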
# plot learning curves
cv = ShuffleSplit(n_splits=40, test_size=0.2, random_state=0)
title = "Learning Curves (Random Forests)"
plot_learning_curve(clf, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=1)
# train the model
clf.fit(X_train, y_train)
# predict class probabilities for the test set
y_score = clf.predict_proba(X_test)
# fscore plot
fscore(y_test, y_score)
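# in addition to the fscore plot, a macro-averaged F1 can be reported
# directly; a minimal sketch (the columns of y_score follow clf.classes_,
# so argmax recovers the predicted labels; imported here to keep the
# sketch self-contained)
from sklearn.metrics import f1_score
y_pred = clf.classes_[np.argmax(y_score, axis=1)]
print('macro F1 = ', f1_score(y_test, y_pred, average='macro'))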
# saving the classifier
with open('classifier.pickle', 'wb') as handle:
    pickle.dump(clf, handle)
# saving the standard scaler
with open('scaler.pickle', 'wb') as handle:
    pickle.dump(std, handle)
# saving the polynomial transformer
with open('polynomial-transformer.pickle', 'wb') as handle:
    pickle.dump(poly, handle)
# saving the variance feature selector
with open('variance-threshholder.pickle', 'wb') as handle:
    pickle.dump(sel, handle)
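# sanity check (a sketch, assuming the pickles above were just written):
# reload the classifier and verify the round-trip reproduces the predictions
with open('classifier.pickle', 'rb') as handle:
    clf_loaded = pickle.load(handle)
assert np.array_equal(clf_loaded.predict(X_test), clf.predict(X_test))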