forked from mmortonson/seizure_prediction
/
optimize_model.py
executable file
·80 lines (75 loc) · 3.74 KB
/
optimize_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python
#
# Perform a grid search over several settings for training a
# classification model to optimize the AUC.
import sys
import itertools
import numpy as np
from sklearn import linear_model
from train_model import train_model
def optimize_model(features_files, submission_file,
                   classifier=linear_model.LogisticRegression,
                   feature_columns=range(2, 27),
                   min_features=1, max_features=1,
                   parameters={'C': np.logspace(-3, 1, 5),
                               'class_weight': ['auto']},
                   **kwargs):
    """
    Search for the feature columns and classifier settings that give
    the highest mean AUC, then retrain with the winning combination.

    The search is a greedy forward selection over feature columns:

    * The first pass evaluates every combination of min_features
      columns drawn from feature_columns (every single column when
      min_features is 1).
    * Each later pass, up to max_features columns, extends the best
      column set found so far by one of the remaining columns.  The
      search stops early once a pass fails to produce a better model.

    For every candidate column set, every combination of the values in
    the parameters dictionary is tried.  The keys of parameters are the
    keyword names used when initializing the classifier, and each value
    is a sequence of arguments to loop over.

    A candidate only becomes the best model if its mean AUC (second
    value returned by train_model) is strictly greater than the best
    seen so far, starting from 0.  Progress is written to stdout.

    Finally train_model is called once more with the best columns and
    parameters, together with submission_file, save_settings=True and
    plot=True.  That call is expected to update the predicted test
    probabilities in submission_file and to produce the plots; see
    train_model for what it actually does.

    Arguments:
    features_files  -- passed through unchanged to train_model.
    submission_file -- passed to the final train_model call.
    classifier      -- classifier class passed to train_model.
    feature_columns -- candidate column identifiers to select from.
    min_features    -- number of columns in the first pass.
    max_features    -- largest number of columns to try.
    parameters      -- dict mapping classifier keyword -> values to try.
    kwargs          -- additional settings forwarded to every
                       train_model call.

    Exits via sys.exit if min_features > max_features, or if no
    candidate reached a mean AUC greater than 0 (so there is no best
    model to retrain).

    NOTE(review): the default class_weight value 'auto' is, as far as I
    know, only accepted by older scikit-learn releases (newer ones use
    'balanced') -- confirm against the installed version.
    """
    if min_features > max_features:
        sys.exit('min_features must be <= max_features')

    # Expand the parameter dictionary into every combination of
    # (keyword, value) pairs.  This does not depend on the feature
    # columns, so it is built once instead of once per column set.
    parameter_grid = list(itertools.product(
        *[[(k, v) for v in parameters[k]] for k in parameters.keys()]))

    # 'columns' and 'parameters' stay None until some candidate beats
    # the initial AUC of 0, which lets the code below detect an
    # unsuccessful search instead of failing on a missing key.
    best_model = {'AUC': 0., 'columns': None, 'parameters': None}

    # vary number of features
    for n_features in range(min_features, max_features + 1):
        # vary features used in model training
        if n_features == min_features:
            if min_features == 1:
                feature_columns_grid = [[i] for i in feature_columns]
            else:
                feature_columns_grid = list(itertools.combinations(
                    feature_columns, min_features))
        elif (best_model['columns'] is None or
              len(best_model['columns']) < n_features - 1):
            # exit loop since last iteration didn't add new features
            break
        else:
            # extend the current best column set by one unused column
            remaining_features = list(feature_columns)
            for i in best_model['columns']:
                remaining_features.remove(i)
            feature_columns_grid = [list(best_model['columns']) + [i]
                                    for i in remaining_features]

        for f_cols in feature_columns_grid:
            # vary model parameters
            for model_args in parameter_grid:
                # train the classifier and compute AUC
                _, auc_mean, auc_std = train_model(features_files,
                                                   f_cols, classifier,
                                                   dict(model_args),
                                                   **kwargs)
                # '\r' rewinds to the start of the line so that each
                # candidate overwrites the previous progress report
                sys.stdout.write(
                    '\r' + ', '.join([str(fc) for fc in f_cols]) +
                    ' AUC = {0:.2f}+/-{1:.2f}'.format(auc_mean, auc_std))
                sys.stdout.flush()
                if auc_mean > best_model['AUC']:
                    # keep the improved result visible on its own line
                    sys.stdout.write('\n  {0}\n'.format(model_args))
                    best_model = {'AUC': auc_mean,
                                  'columns': f_cols,
                                  'parameters': model_args}

    # blank out the last progress line
    sys.stdout.write('\r' + 70 * ' ' + '\n\n')

    if best_model['columns'] is None:
        sys.exit('no model with AUC > 0 was found; '
                 'check feature_columns, min_features and parameters')

    # compute predictions for best model, update submission CSV,
    # and plot learning curves and ROC curve
    train_model(features_files,
                best_model['columns'], classifier,
                dict(best_model['parameters']),
                submission_file=submission_file,
                save_settings=True, plot=True,
                **kwargs)