forked from edwardhunter/soc290
-
Notifications
You must be signed in to change notification settings - Fork 0
/
template_supervised.py
415 lines (347 loc) · 14 KB
/
template_supervised.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
#!/usr/bin/env python
"""
@package css
@file css/template_supervised.py
@author Edward Hunter
@brief A template to be customized for supervised learning experiments.
"""
# Copyright and licence.
"""
Copyright (C) 2014 Edward Hunter
edward.a.hunter@gmail.com
840 24th Street
San Diego, CA 92102
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
# Import common modules and utilities.
from common import *
# Define method and models available.
# METHOD is the method label passed to make_fname() and used in the confusion
# plot titles; MODELS is the tuple of model names that train(), predict() and
# eval() accept. Both are empty placeholders in this template: with MODELS
# empty, every model name fails the 'model not in MODELS' validation.
METHOD = ''
MODELS = ()
def train(data, dataset, model, **kwargs):
    """
    Train and store feature extractor, dimension reducer and classifier.

    This is a template: the classifier (clf) and the feature extractor
    (vectorizer) must be created in the TODO section below before the
    function can run to completion.

    @param data: training and testing dataset dictionary; the 'train' and
        'train_target' entries are read here ('target_names' too when top
        feature output is enabled).
    @param dataset: dataset name string, must be a member of DATASETS.
    @param model: model name string, must be a member of MODELS.
    @param kwargs: optional settings: 'dim' (number of features kept by the
        chi2 selector), 'df_min', 'df_max' and 'fappend' (file name appendix).
    @raise ValueError: if data, dataset or model fail validation.
    """

    # Verify input parameters.
    if not isinstance(data, dict):
        raise ValueError('Invalid data dictionary.')
    if not isinstance(dataset, str) or dataset not in DATASETS:
        raise ValueError('Invalid dataset.')
    if not isinstance(model, str) or model not in MODELS:
        raise ValueError('Invalid model type parameter.')

    # Retrieve training data.
    data_train = data['train']
    data_train_target = data['train_target']

    # Retrieve options. df_min and df_max are not used below; they are
    # available for the feature extractor created in the TODO section.
    dim = kwargs.get('dim', None)
    df_min = kwargs.get('df_min', 1)
    df_max = kwargs.get('df_max', 1.0)

    # Create dimension reducer.
    fselector = None
    if dim:
        fselector = SelectKBest(chi2, k=dim)

    ############################################################
    # Create feature extractor, classifier.
    ############################################################
    # TODO: create clf, vectorizer.
    ############################################################

    # Extract features, reducing dimension if specified.
    print('Extracting text features...')
    start = time.time()
    x_train = vectorizer.fit_transform(data_train)
    if fselector is not None:
        x_train = fselector.fit_transform(x_train, data_train_target)
        # Nothing in this file sets a '__normalize' attribute on the selector,
        # so a plain attribute read raises AttributeError. Treat a missing
        # flag as "do not normalize"; a flag set by a customized template is
        # still honoured.
        if getattr(fselector, '__normalize', False):
            x_train = normalize(x_train)
    print('Extracted in %f seconds.' % (time.time() - start))
    print('Feature dimension: %i' % x_train.shape[1])
    print('Feature density: %f' % density(x_train))

    # Train classifier.
    print('Training classifier...')
    start = time.time()
    clf.fit(x_train, data_train_target)
    print('Trained in %f seconds.' % (time.time() - start))

    # Default grid search and top features output triggers.
    grid_search_output = False
    top_features_output = False

    ############################################################
    # Grid search and top feature output for SVMs.
    ############################################################
    # TODO: SVMs only.
    ############################################################

    # Print out grid search results.
    if grid_search_output:
        print('Best score: ' + str(clf.best_score_))
        print('Optimal parameters: ')
        for k, v in clf.best_params_.items():
            print('%s=%s' % (k, str(v)))

    # Print out top features results.
    # NOTE(review): svm_top is not defined in this template; the SVM-specific
    # section above has to provide it before enabling top_features_output.
    if top_features_output:
        print("Classifier shape: %s" % str(clf.coef_.shape))
        feature_names = np.asarray(vectorizer.get_feature_names())
        top = clf.coef_.toarray().argsort(axis=1)[0]
        top_pos = top[-svm_top:]
        top_neg = top[:svm_top]
        print('-' * 40)
        print('Top %s Features:' % data['target_names'][1])
        for idx in top_pos:
            print(feature_names[idx])
        print('-' * 40)
        print('Top %s Features:' % data['target_names'][0])
        for idx in top_neg:
            print(feature_names[idx])

    # Create object file names.
    fname_args = []
    if dim:
        fname_args.append(str(dim))
    fname_args.append(kwargs.get('fappend', ''))
    mdl_fname = make_fname(METHOD, model, dataset, 'mdl', 'pk', *fname_args)
    vec_fname = make_fname(METHOD, model, dataset, 'vec', 'pk', *fname_args)
    dim_fname = make_fname(METHOD, model, dataset, 'dim', 'pk', *fname_args)
    mdl_path = os.path.join(MODEL_HOME, mdl_fname)
    vec_path = os.path.join(MODEL_HOME, vec_fname)
    dim_path = os.path.join(MODEL_HOME, dim_fname)
    if not os.path.exists(MODEL_HOME):
        os.makedirs(MODEL_HOME)

    # Pickles are written in binary mode, and the 'with' blocks close the
    # files even if pickling raises.

    # Write out model.
    with open(mdl_path, 'wb') as fhandle:
        pickle.dump(clf, fhandle)
    print('Model written to file %s' % (mdl_path))

    # Write out feature extractor.
    with open(vec_path, 'wb') as fhandle:
        pickle.dump(vectorizer, fhandle)
    print('Feature extractor written to file %s' % (vec_path))

    # Write out dimension reducer.
    if dim:
        with open(dim_path, 'wb') as fhandle:
            pickle.dump(fselector, fhandle)
        print('Dimension reducer written to file %s' % (dim_path))
def predict(input_data, dataset, model, **kwargs):
    """
    Predict data categories from trained classifier.

    Loads the pickled classifier, feature extractor and (when 'dim' is
    given) dimension reducer from MODEL_HOME, using the same file naming
    scheme as train().

    @param input_data: list of input data to classify.
    @param dataset: dataset name string, must be a member of DATASETS.
    @param model: model name string, must be a member of MODELS.
    @param kwargs: optional settings: 'dim' (reduced feature dimension) and
        'fappend' (file name appendix); both select which files are loaded.
    @return pred: prediction vector for input_data.
    @raise ValueError: if input_data, dataset or model fail validation.
    """

    # Verify input parameters.
    if not isinstance(input_data, list):
        raise ValueError('Invalid input data.')
    if not isinstance(dataset, str) or dataset not in DATASETS:
        raise ValueError('Invalid dataset name.')
    if not isinstance(model, str) or model not in MODELS:
        raise ValueError('Invalid model name.')

    # Retrieve options.
    dim = kwargs.get('dim', None)
    fappend = kwargs.get('fappend', '')

    # Create object file names.
    fname_args = []
    if dim:
        fname_args.append(str(dim))
    fname_args.append(fappend)
    mdl_fname = make_fname(METHOD, model, dataset, 'mdl', 'pk', *fname_args)
    vec_fname = make_fname(METHOD, model, dataset, 'vec', 'pk', *fname_args)
    dim_fname = make_fname(METHOD, model, dataset, 'dim', 'pk', *fname_args)
    mdl_path = os.path.join(MODEL_HOME, mdl_fname)
    vec_path = os.path.join(MODEL_HOME, vec_fname)
    dim_path = os.path.join(MODEL_HOME, dim_fname)

    # NOTE: unpickling executes code stored in the file, so these files must
    # only ever be ones written by a trusted training run.
    # Pickles are read in binary mode, and the 'with' blocks close the files
    # even if unpickling raises.

    # Read in the classifier.
    with open(mdl_path, 'rb') as fhandle:
        clf = pickle.load(fhandle)
    print('Read classifer from file: %s' % mdl_path)

    # Read in the feature extractor.
    with open(vec_path, 'rb') as fhandle:
        vectorizer = pickle.load(fhandle)
    print('Read feature extractor from file: %s' % vec_path)

    # If requested, load the dimension reducer.
    fselector = None
    if dim:
        with open(dim_path, 'rb') as fhandle:
            fselector = pickle.load(fhandle)
        print('Read dimesion reducer from file %s' % (dim_path))

    # Compute features and predict.
    x_test = vectorizer.transform(input_data)
    if dim:
        x_test = fselector.transform(x_test)
        # Nothing in this file sets a '__normalize' attribute on the selector,
        # so a plain attribute read raises AttributeError. Treat a missing
        # flag as "do not normalize".
        if getattr(fselector, '__normalize', False):
            x_test = normalize(x_test)
    pred = clf.predict(x_test)

    # Return vector of predicted labels.
    return pred
def eval(data, dataset, model, **kwargs):
    """
    Evaluate a trained classifier against test data.
    Prints out F1, precision, recall and confusion.
    Writes the same report to a text file in REPORT_HOME and, when the
    'confusion' option is given, saves a png image of the confusion matrix.

    NOTE: the name shadows the builtin eval(); it is kept because callers
    use it.

    @param data: training and testing dataset dictionary; the 'test',
        'test_target' and 'target_names' entries are read here.
    @param dataset: dataset name string, must be a member of DATASETS.
    @param model: model name string, must be a member of MODELS.
    @param kwargs: optional settings: 'dim' and 'fappend' (used for report
        and figure file names, and passed on to predict()), and 'confusion'
        ('linear' or 'log') to save a confusion matrix image.
    @raise ValueError: if data, dataset or model fail validation.
    """

    # Verify input parameters.
    if not isinstance(data, dict):
        raise ValueError('Invalid data dictionary.')
    if not isinstance(dataset, str) or dataset not in DATASETS:
        raise ValueError('Invalid dataset name.')
    if not isinstance(model, str) or model not in MODELS:
        raise ValueError('Invalid model name.')

    # Retrieve options. These used to be read as module globals that exist
    # only when the file runs as a script; taking them from kwargs (the same
    # way predict() does) lets the function work when imported.
    dim = kwargs.get('dim', None)
    fappend = kwargs.get('fappend', '')

    # Extract test and target data.
    data_test = data['test']
    data_test_target = data['test_target']
    data_target_names = data['target_names']

    # Predict test data.
    pred = predict(data_test, dataset, model, **kwargs)

    # Evaluate predictions: metrics.
    class_report = metrics.classification_report(
        data_test_target, pred, target_names=data_target_names)
    conf_matrix = metrics.confusion_matrix(data_test_target, pred)

    # Set up report and figure name, path.
    fname_args = []
    if dim:
        fname_args.append(str(dim))
    fname_args.append(fappend)
    reportname = make_fname(METHOD, model, dataset, 'report', 'txt',
                            *fname_args)
    reportpath = os.path.join(REPORT_HOME, reportname)
    figfname = make_fname(METHOD, model, dataset, 'confusion', 'png',
                          *fname_args)
    figpath = os.path.join(REPORT_HOME, figfname)

    # Print evaluations.
    report = 'Report File: %s\n' % reportname
    report += '-' * 80 + '\n'
    report += 'Classification Report:\n'
    report += class_report
    report += '\n\n'
    report += '-' * 80 + '\n'
    report += 'Confusion Matrix:\n'
    n = len(data_target_names)
    conf_max = np.amax(conf_matrix)
    # Column width is the digit count of the largest cell. Counting digits
    # directly avoids float rounding in log10 at exact powers of ten
    # (math.log(1000, 10) is just under 3) and the domain error raised for
    # an all-zero matrix.
    width = len(str(int(conf_max)))
    fmtstr = '%' + str(width) + 'd '
    for j in range(n):
        row = ''.join(fmtstr % int(conf_matrix[j, i]) for i in range(n))
        report += row + '\n'
    report += '\n\n'
    print(report)

    if not os.path.exists(REPORT_HOME):
        os.makedirs(REPORT_HOME)

    # Write report; the 'with' block closes the file even if the write fails.
    with open(reportpath, 'w') as rf:
        rf.write(report)

    # Create an image of the confusion matrix (linear or log scaled).
    confusion_image_type = kwargs.get('confusion', None)
    if not confusion_image_type:
        pass
    elif confusion_image_type not in ('linear', 'log'):
        warnstr = 'WARNING: unrecognized confusion image option '
        warnstr += '"%s"' % confusion_image_type
        warnstr += '\nConfusion image not saved.'
        print(warnstr)
    else:
        if confusion_image_type == 'log':
            # The +1 keeps empty cells finite: log10(0 + 1) == 0.
            log_conf_matrix = np.log10(conf_matrix + 1)
            plt.pcolor(np.flipud(log_conf_matrix))
            title = '%s %s Log Confusion, %s' % (METHOD, model, dataset)
        elif confusion_image_type == 'linear':
            plt.pcolor(np.flipud(conf_matrix))
            title = '%s %s Confusion, %s' % (METHOD, model, dataset)
        # The matrix is flipped vertically for plotting, so the y tick
        # labels count down from n to 1.
        plt.xticks(np.arange(n) + 0.5, np.arange(1, n + 1))
        plt.yticks(np.arange(n) + 0.5, np.arange(n, 0, -1))
        plt.xlabel('Predicted Category')
        plt.ylabel('True Category')
        plt.set_cmap('hot')
        plt.colorbar()
        plt.title(title)
        plt.savefig(figpath)
if __name__ == '__main__':
    # Script entry point: parse the command line, train the requested model
    # unless its pickled files already exist (or --overwrite is given), then
    # evaluate it against the test data.

    # Load training/testing utilities.
    from data_utils import load_data, DATASETS

    # Parse command line arguments and options.
    usage = 'usage: %prog [options] model dataset'
    usage += ('\n\tmodel = %s\n\tdataset = %s') % (MODELS, DATASETS)
    description = 'Train and evaluate supervised classifiers.'
    p = optparse.OptionParser(usage=usage, description=description)
    p.add_option('-f', '--fappend', action='store', dest='fappend',
                 help='File name appendix string.')
    p.add_option('-d', '--dim', action='store', dest='dim', type='int',
                 help='Reduced feature dimension integer.')
    p.add_option('-c', '--confusion', action='store', dest='confusion',
                 help='Save confusion image. Options: linear, log')
    p.add_option('-o', '--overwrite', action='store_true', dest='overwrite',
                 help='Overwrite existing files.')
    p.add_option('--df_min', action='store', type='float', dest='df_min',
                 help='Minimum frequency (int) or proportion (float) (default=1.0).')
    p.add_option('--df_max', action='store', type='float', dest='df_max',
                 help='Maximum document frequency proportion (default=1.0).')
    p.set_defaults(fappend=None, dim=None, confusion=None, overwrite=False,
                   df_min=1.0, df_max=1.0)

    ############################################################
    # Add method specific options.
    ############################################################
    # TODO: SVMs only.
    ############################################################

    (opts, args) = p.parse_args()
    if len(args) < 2:
        p.print_usage()
        sys.exit(1)

    # Keep these as module-level names; code elsewhere in this file may read
    # them as globals.
    model = args[0]
    dataset = args[1]
    fappend = opts.fappend
    dim = opts.dim
    confusion = opts.confusion
    overwrite = opts.overwrite
    # --df_min is parsed as a float; a whole-number value is converted to an
    # int so it is passed on as a count rather than a proportion.
    if opts.df_min == int(opts.df_min):
        df_min = int(opts.df_min)
    else:
        df_min = opts.df_min
    df_max = opts.df_max

    kwargs = dict(
        dim=dim,
        fappend=fappend,
        df_min=df_min,
        df_max=df_max,
        confusion=confusion,
    )

    ############################################################
    # Extract method specific options.
    ############################################################
    # TODO: SVMs only.
    ############################################################

    # Load data.
    data = load_data(dataset)

    # Create object file names.
    fname_args = []
    if dim:
        fname_args.append(str(dim))
    fname_args.append(fappend)
    mdl_fname = make_fname(METHOD, model, dataset, 'mdl', 'pk', *fname_args)
    vec_fname = make_fname(METHOD, model, dataset, 'vec', 'pk', *fname_args)
    dim_fname = make_fname(METHOD, model, dataset, 'dim', 'pk', *fname_args)
    mdl_path = os.path.join(MODEL_HOME, mdl_fname)
    vec_path = os.path.join(MODEL_HOME, vec_fname)
    dim_path = os.path.join(MODEL_HOME, dim_fname)

    # Decide whether a usable trained model is already on disk. When a
    # reduced dimension is requested the dimension reducer pickle is needed
    # too; previously its absence was computed but ignored, so training was
    # skipped and prediction then failed opening the missing file.
    model_files_present = (os.path.isfile(mdl_path)
                           and os.path.isfile(vec_path))
    if dim:
        model_files_present = (model_files_present
                               and os.path.isfile(dim_path))

    if overwrite or not model_files_present:
        train(data, dataset, model, **kwargs)

    # Evaluate classifier.
    eval(data, dataset, model, **kwargs)