-
Notifications
You must be signed in to change notification settings - Fork 1
/
randfor.py
executable file
·125 lines (110 loc) · 6 KB
/
randfor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# Given a labeled data set (arbitrary features, binary response) and the name of a file listing the features to evaluate, constructs a random forest model and compares the features by importance.
import optparse
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
def _read_feature_file(path):
    """Parse a feature file into ``(input_features, output_feature)``.

    The file must contain exactly one line starting with '-'.  Non-empty
    lines above it name the input features; the single non-empty line below
    it names the output feature.  Everything after a '#' on a line is
    ignored, and leading/trailing whitespace is stripped.

    Raises:
        ValueError: if there is not exactly one dashed separator line, or
            not exactly one output feature.
    """
    with open(path, 'r') as f:
        lines = f.readlines()

    dashed_indices = [i for i, line in enumerate(lines) if line.startswith('-')]
    if len(dashed_indices) != 1:
        raise ValueError("Feature file must have a single dashed line separating input/output features.")
    dashed_line_index = dashed_indices[0]

    def feature_names(chunk):
        # Drop trailing '#' comments and surrounding whitespace; skip blanks.
        stripped = (line.partition('#')[0].strip() for line in chunk)
        return [name for name in stripped if name]

    input_features = feature_names(lines[:dashed_line_index])
    output_features = feature_names(lines[dashed_line_index + 1:])
    if len(output_features) != 1:
        raise ValueError("Feature file must have exactly one output feature.")
    return input_features, output_features[0]


def main():
    """Train (or load) a random forest on labeled CSV data and evaluate it.

    Reads command-line options, splits the data into random training and
    test subsets, fits a RandomForestClassifier (or unpickles a previously
    saved one with --load), then on the test subset:

    * writes the predicted probabilities (second column of predict_proba)
      to a file,
    * prints a confusion matrix, a classification report and the accuracy
      obtained by thresholding those probabilities at --thresh,
    * prints the feature importances in decreasing order and saves a bar
      chart of them to 'feature_importances<seed>.png'.

    Raises:
        ValueError: if the feature file is malformed or the model has no
            input features.
    """
    p = optparse.OptionParser()
    p.add_option('--model', '-m', default='model', type=str, help='model filename prefix')
    p.add_option('--load', '-L', default=False, action='store_true', help='load model from file')
    p.add_option('--features', '-f', default='features.txt', type=str, help='feature filename')
    p.add_option('--data', '-d', default='data.csv', type=str, help='marked data filename')
    p.add_option('--verbose', '-v', default=False, action='store_true', help='verbosity flag')
    p.add_option('--thresh', '-T', default=0.5, type=float, help='probability threshold to classify True')
    p.add_option('--n_estimators', '-n', default=100, type=int, help='number of random forest estimators')
    p.add_option('--test_fraction', '-t', default=0.25, type=float, help='fraction of data to use for testing')
    p.add_option('--seed', '-s', default=None, type=int, help='random seed')
    p.add_option('--jobs', '-j', default=-1, type=int, help='number of jobs (-1 if maximum)')
    p.add_option('--probs', '-p', default=None, type=str, help='filename for output probabilities')
    opts, _ = p.parse_args()

    # The seed (when given) is appended to every default output filename.
    seed_suffix = '' if opts.seed is None else str(opts.seed)
    model_filename = opts.model + '%s.pickle' % seed_suffix
    probs_filename = ('predicted_probs%s.dat' % seed_suffix) if opts.probs is None else opts.probs

    np.random.seed(opts.seed)

    if opts.verbose:
        print("\nReading marked data from %s..." % opts.data)
    # establish data frame
    df = pd.read_csv(opts.data)
    n_lines = len(df)

    # choose test set as random test_fraction of data, leaving the remainder for training
    n_test = int(opts.test_fraction * n_lines)
    if opts.verbose:
        print("Read %d lines of data -> %d lines (training), %d lines (test)" % (n_lines, n_lines - n_test, n_test))
    # Same permutation call as before so a given seed keeps selecting the same rows.
    test_subset = np.random.permutation(range(n_lines))[:n_test]
    is_train = np.ones(n_lines, dtype=bool)
    is_train[test_subset] = False

    # establish training and test sets
    train, test = df[is_train], df[~is_train]

    if opts.load:
        # NOTE: unpickling can execute arbitrary code; only load model files you trust.
        with open(model_filename, 'rb') as model_file:
            rfc = pickle.load(model_file)
        if opts.verbose:
            print("\nLoaded model from '%s'.\n" % model_filename)
    else:
        # set the random forest instance
        rfc = RandomForestClassifier(n_estimators=opts.n_estimators, n_jobs=opts.jobs)
        # The feature names are stored on the classifier so that they are
        # pickled with it and are available again after --load.
        rfc.input_features, rfc.output_feature = _read_feature_file(opts.features)

    num_features = len(rfc.input_features)
    if num_features == 0:
        raise ValueError("Feature file must have at least one input feature.")

    if not opts.load:
        # train the forest
        X = train[rfc.input_features]
        y = train[rfc.output_feature]
        if opts.verbose:
            print("\nTraining %d random forests..." % opts.n_estimators)
        rfc.fit(X, y)
        # save off the model
        with open(model_filename, 'wb') as model_file:
            pickle.dump(rfc, model_file)
        if opts.verbose:
            print("\nSaved model to '%s'.\n" % model_filename)

    # make predictions on the test data
    actual = test[rfc.output_feature]
    probs = rfc.predict_proba(test[rfc.input_features])[:, 1]
    pd.Series(probs).to_csv(probs_filename, index=False)
    test_preds = (probs >= opts.thresh)

    conf_df = pd.crosstab(actual, test_preds, rownames=['actual'], colnames=['predicted'])
    class_report = classification_report(actual, test_preds)
    print("\nConfusion Matrix")
    print(conf_df)
    print("\nClassification Report")
    print(class_report)

    # Accuracy is computed from the labels directly instead of from the
    # crosstab diagonal: the crosstab is not 2x2 when only one class is
    # predicted (or present), and indexing [1, 1] would then fail.
    # probs holds the probability of rfc.classes_[1], so a True prediction
    # is correct exactly when the actual label equals that class.
    is_positive = np.asarray(actual == rfc.classes_[1])
    accuracy = float(np.mean(is_positive == test_preds))
    print("Accuracy = %.3f%%" % (100. * accuracy))

    print("\nFeature Importances")
    ranked = sorted(
        zip(range(num_features), rfc.input_features, rfc.feature_importances_),
        key=lambda triple: triple[2],
        reverse=True,
    )
    indices, features, importances = zip(*ranked)
    for name, importance in zip(features, importances):
        # %7.3f keeps the original column width (3 digits, '.', 3 digits)
        # while rounding correctly (no '12.1000%' artefacts).
        print("%17s %7.3f%%" % (name, 100. * importance))

    # Spread of each feature's importance across the individual trees,
    # used as error bars in the plot.
    stds = np.std([tree.feature_importances_ for tree in rfc.estimators_], axis=0)
    positions = range(num_features)
    fig = plt.figure()
    plt.title("Feature importances")
    plt.bar(positions, importances, color='r', yerr=[stds[i] for i in indices], align='center')
    plt.xticks(positions, features, rotation='vertical')
    plt.xlim([-1, num_features])
    # Leave room at the bottom for the vertical feature labels.
    fig.subplots_adjust(bottom=0.25)
    plt.savefig('feature_importances%s.png' % seed_suffix)
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()