# ActivityClassifier.py
'''
Classify activities based on sensor data. The classification is done using a
random forest. The forests are printed followed by a confusion matrix and
classification efficiency statistics
'''
#
# Copyright (C) 2017 Ezra Erb
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3 as published
# by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# I'd appreciate a note if you find this program useful or make
# updates. Please contact me through LinkedIn or GitHub (my profile also has
# a link to the code repository)
#
import sys
import csv
import random
from collections import defaultdict
import numpy as np
from RandomForest import RandomForest
from ConfusionMatrix import ConfusionMatrix
def load_sample_data(file_name):
    '''
    Load sample data from the given file, strip columns that are not useful for
    classification, and return the results as an NDarray.

    The file is read as semicolon-delimited CSV whose first line is a header.
    Columns 5 through 17 of each row are converted to floats and kept as the
    sample values; column 18 is the category label. Blank lines are skipped.

    Arguments:
    file_name -- path of the data file to read

    Returns a tuple (samples, id_to_cat). samples is an NDarray with one row
    per data row: the sample values followed by the numeric category id in
    the last column. id_to_cat maps each numeric category id back to the
    category label read from the file. A file with no data rows gives an
    empty array and an empty map.

    Raises IndexError if a non-blank row has fewer than 19 columns and
    ValueError if one of the sample values is not a number.
    '''
    # For efficient subspace sampling, need a 2D NDarray. However, this causes
    # problems handling the category, because NDArrays require all entries to
    # be the same type. The solution chosen here is to map the categories to
    # float values, and use those. Need a map of the values back to the
    # categories for output, and the reverse to translate input
    cat_to_id = {}
    id_to_cat = {}
    # Construct the final array as a list and then merge
    sample_set = []
    with open(file_name, 'rt') as fin:
        cfin = csv.reader(fin, delimiter=';')
        # First line is a header. Burn it. The default value keeps a
        # completely empty file from raising StopIteration
        next(cfin, None)
        for mrow in cfin:
            if not mrow:
                # csv.reader yields an empty list for a blank line
                continue
            category = mrow[18]
            # Direct dictionary lookup; no need to build and scan a key list
            record_id = cat_to_id.get(category)
            if record_id is None:
                record_id = float(len(cat_to_id))  # Indexed from zero
                cat_to_id[category] = record_id
                id_to_cat[record_id] = category
            # Row contains several columns containing data specific to each
            # individual measured. Since this data has perfect correlation,
            # taking multiple is useless for classification. Take the last
            # one, Body Mass Index.
            samples = [float(i) for i in mrow[5:18]]
            samples.append(record_id)  # Correct category becomes last column
            sample_set.append(samples)
    return (np.array(sample_set), id_to_cat)
def filter_sample_set(sample_set, samples_per_group=10):
    '''
    Reduce a big sample set to a small one to make testing the code easier.
    Rows are grouped by the combination of their first column (BMI) and last
    column (category), and rows are drawn at random, without replacement,
    from each group.

    Arguments:
    sample_set -- iterable of rows; each row must support indexing with 0
                  and -1, and those two values must be hashable
    samples_per_group -- maximum number of rows to keep from each group.
                         Groups holding fewer rows are kept whole.

    Returns an NDarray of the selected rows. An empty sample set gives an
    empty array.
    '''
    # Split the list into sublists based on BMI/result combos
    split_lists = defaultdict(list)
    for data in sample_set:
        split_lists[(data[0], data[-1])].append(data)
    output_samples = []
    # values() exists on both Python 2 and 3, unlike itervalues()
    for data in split_lists.values():
        # A group may hold fewer rows than requested, so cap the draw at the
        # group size; random.sample never picks the same row twice
        wanted_count = min(samples_per_group, len(data))
        output_samples.extend(random.sample(data, wanted_count))
    return np.array(output_samples)
def main():
    '''
    Main classification driver. Read in the data file named by the first
    command line argument, train a random forest on 75% of the shuffled
    samples, classify the remaining 25%, and print the forest, the confusion
    matrix, and the classification statistics.

    Exits with status 1 after printing a usage message if no data file was
    given on the command line.
    '''
    if len(sys.argv) < 2:
        # Parenthesised single-argument print is valid on Python 2 and 3
        print('USAGE: ActivityClassifier.py (path to data file)')
        sys.exit(1)
    (sample_set, id_to_cat) = load_sample_data(sys.argv[1])
    # TESTING: Reduce the number of samples
    # sample_set = filter_sample_set(sample_set)
    # Divide the samples into two, training and test. Keep 25% for test
    np.random.shuffle(sample_set)
    # Floor division: the count is used as a slice bound, so it must stay an
    # integer even where / means true division
    test_sample_count = sample_set.shape[0] // 4
    test_samples = sample_set[:test_sample_count]
    training_samples = sample_set[test_sample_count:]
    # NOTE(review): the meaning of the second argument (9) is defined by
    # RandomForest, which is not visible here - confirm before changing
    random_forest = RandomForest(training_samples, 9)
    print(random_forest)
    # Hand over the category labels as a list ordered by category id rather
    # than relying on dictionary ordering.
    # NOTE(review): presumably ConfusionMatrix indexes this list by the id
    # passed to add_result - confirm against ConfusionMatrix
    categories = [id_to_cat[cat_id] for cat_id in sorted(id_to_cat)]
    confusion_matrix = ConfusionMatrix(categories)
    for sample in test_samples:
        classified_category = random_forest.classify_activity(sample)
        # Last column of each sample holds the correct category id
        confusion_matrix.add_result(int(sample[-1]), classified_category)
    print(confusion_matrix)
    confusion_matrix.report_stats()


# Run the classifier only when executed as a script, not when imported
if __name__ == '__main__':
    main()