from __future__ import division
import os
import sys
import random

# Make a locally-installed scikit-learn 0.11 egg importable if present.
_SKLEARN_EGG = '/home/blakeb/.local/lib/python2.7/scikit_learn-0.11-py2.7-linux-x86_64.egg'
if _SKLEARN_EGG not in sys.path and os.path.exists(_SKLEARN_EGG):
    sys.path.append(_SKLEARN_EGG)

import sklearn as sk
import sklearn.preprocessing  # ensures sk.preprocessing is available below
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
import utils as ut
import numpy as np
import multiprocessing
import features as fe

NCORES = multiprocessing.cpu_count()

def fit_and_test(scored, clf, norm=True):
    """
    scored: [arr_train, arr_test]
    """
    arr_train, arr_test = scored
    if not exist_pos_neg(arr_train):
        return []
    scaler = fit_clf(arr_train, clf, norm=norm)
    tested = classify(clf, arr_test, scaler)
    return tested
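
# A minimal usage sketch for fit_and_test, assuming the structured-array
# layout the rest of this module expects: the first three fields are
# 'id1', 'id2', and 'hit' (the boolean label), with feature columns after
# that. Field names and values here are hypothetical; norm=False avoids
# the fe.retype_arr/scaling path for this tiny example.
def _example_fit_and_test():
    dtype = [('id1', 'S10'), ('id2', 'S10'), ('hit', bool),
             ('score1', 'f4'), ('score2', 'f4')]
    arr_train = np.array([('a', 'b', True, 0.9, 0.2),
                          ('a', 'c', False, 0.1, 0.5)], dtype=dtype)
    arr_test = np.array([('b', 'c', True, 0.8, 0.3)], dtype=dtype)
    # Returns [(id1, id2, prob, hit), ...] sorted by prob, descending.
    return fit_and_test([arr_train, arr_test], tree(n_estimators=10),
                        norm=False)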

def exist_pos_neg(arr):
    # True only if arr['hit'] contains at least one positive and one
    # negative example; fitting a classifier needs both classes.
    hits = arr['hit']
    found_true = False
    found_false = False
    for h in hits:
        if h:
            found_true = True
        else:
            found_false = True
        if found_true and found_false:
            return True
    return False

def fit_clf(arr, clfbase, norm=True):
    if norm:
        arr = fe.retype_arr(arr)  # change f2 to f4 to prevent overflow
    X, y = arr_feats(arr), arr['hit']
    scaler = None
    if norm:
        X, scaler = normalize(X)
    print "Training classifier: %s examples, %s features" % (len(X), len(X[0]))
    clfbase.fit(X, y)  # skip this if just normalizing
    return scaler

def normalize(X):
    print "Fitting and scaling training features."
    scaler = sk.preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)
    return X, scaler
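
# A small sketch of how the returned scaler is meant to be reused: fit on
# training features only, then apply the same transform to held-out
# features (as classify() does below). The arrays here are hypothetical.
def _example_normalize():
    X_train = [[1.0, 10.0], [3.0, 30.0]]
    X_test = [[2.0, 20.0]]
    X_train_scaled, scaler = normalize(X_train)
    X_test_scaled = scaler.transform(X_test)  # same mean/std as training
    return X_train_scaled, X_test_scaled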

def classify(clf, arr, scaler=None, do_sort=True):
    """
    If the clf was trained without at least a pos and a neg, this will fail.
    """
    X = arr_feats(arr)
    if scaler:
        print "Scaling features before prediction."
        X = scaler.transform(X)
    print "Predicting: %s examples, %s features" % (len(arr), len(X[0]))
    probs = (x[1] for x in clf.predict_proba(X))
    tested = zip(arr['id1'], arr['id2'], probs, arr['hit'])
    if do_sort:
        random.shuffle(tested)  # randomize ties before the stable sort
        tested.sort(key=lambda x: x[2], reverse=True)
    return tested

def arr_feats(arr):
    # Feature columns are everything after the first three fields
    # (id1, id2, hit); return them as a list of plain row lists.
    return [[x for x in r] for r in arr[list(arr.dtype.names[3:])]]

def tree(n_estimators=200, n_jobs=NCORES - 1, bootstrap=True, **kwargs):
    return ExtraTreesClassifier(n_estimators=n_estimators, n_jobs=n_jobs,
                                bootstrap=bootstrap, **kwargs)

def tree_feats(**kwargs):
    return tree(compute_importances=True, **kwargs)

def svm(kernel='linear', cache_size=4000, **kwargs):
    return SVC(kernel=kernel, cache_size=cache_size, probability=True, **kwargs)

def linear(dual=False, **kwargs):
    return LinearSVC(dual=dual, **kwargs)
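
# The factories above are thin wrappers around sklearn estimators; a few
# hedged construction examples (keyword values are illustrative):
#     clf = tree(n_estimators=500)    # ExtraTrees, parallel across cores
#     clf = tree_feats()              # ExtraTrees with feature importances
#     clf = svm(kernel='rbf', C=10)   # SVC with probability estimates
#     clf = linear(C=0.1)             # LinearSVC; note it has no
#                                     # predict_proba, so it suits
#                                     # feature_selection, not classify()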

def feature_selection(arr, clf, printn=10, do_plot=False):
    """
    clf: ml.tree(compute_importances=True) or ml.linear()
    """
    names = arr.dtype.names[3:]
    fit_clf(arr, clf, norm=True)
    importances = (clf.coef_[0] if hasattr(clf, 'coef_') else
                   clf.feature_importances_)
    indices = np.argsort(importances)[::-1]
    ranked = [(names[index], importances[index]) for index in indices]
    print "Displaying top %s features:" % printn
    for i, (name, imp) in enumerate(ranked[:printn]):
        print "%d. %s (%f)" % (i + 1, name, imp)
    # Plot the per-tree feature importances and the forest average.
    if do_plot:
        import pylab as pl
        indnums = range(len(indices))
        pl.figure()
        pl.title("Feature importances")
        if hasattr(clf, 'estimators_'):
            for t in clf.estimators_:
                pl.plot(indnums, t.feature_importances_[indices], "r")
        pl.plot(indnums, importances[indices], "b")
        pl.show()
    feats, weights = zip(*ranked)
    return list(feats), list(weights)
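
# A usage sketch for feature_selection, assuming a structured array laid
# out as described above ('arr' here is hypothetical). tree_feats()
# exposes feature_importances_; linear() exposes coef_ instead:
#     feats, weights = feature_selection(arr, tree_feats(), printn=20)
#     feats, weights = feature_selection(arr, linear())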

if __name__ == '__main__':
    if len(sys.argv) < 6:
        sys.exit("usage: python ml.py train_test feats_f clf_type "
                 "donorm kwarg1_val1-kwarg2_val2")
    ttf = sys.argv[1]
    tt = np.load(ttf)
    feats = ut.loadpy(sys.argv[2])
    k = sys.argv[3]
    # argv values are strings; the string 'False' would otherwise be truthy.
    do_norm = sys.argv[4] in ('True', 'true', '1')
    kvs = sys.argv[5]
    kwargs = (dict([tuple(kv.split('_')) for kv in kvs.split('-')])
              if kvs else {})
    clf = tree(**kwargs) if k == 'tree' else svm(kernel=k, **kwargs)
    ts = [('%s features, %s kernel, norm: %s, %s' % (n, k, do_norm, kvs),
           fit_and_test([fe.keep_cols(t, ut.i0(feats[:n])) for t in tt],
                        clf, norm=do_norm))
          for n in (20, 30, 40, 50)]
    ut.savepy(ts, 'ts_%s_%s_%s_%s' % (k, do_norm, kvs, ttf))
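
# An illustrative invocation (file names and kwarg values are hypothetical):
#     python ml.py train_test.npy feats.py tree True n_estimators_100
# Note that kwarg values parsed from the command line remain strings, so
# estimator keywords expecting ints or floats may need explicit casting.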