-
Notifications
You must be signed in to change notification settings - Fork 0
/
dimReduction.py
148 lines (142 loc) · 6.05 KB
/
dimReduction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import numpy as np
import datetime
import time
import argparse
import gc # garbage collection
# my util function
from util_weka import load_weikaFormat
# time conversion
_TIME_FORMAT = '%Y-%m-%d %H:%M:%S'


def _d(t):
    """Parse a timestamp string into a naive ``datetime.datetime``.

    Accepts 'YYYY-MM-DD HH:MM:SS' and, additionally, the same layout with a
    fractional-seconds suffix ('YYYY-MM-DD HH:MM:SS.ffffff').

    Raises:
        ValueError: if ``t`` matches neither layout.
    """
    # strptime rejects unconverted trailing text, so the fractional part has
    # to be requested explicitly; without it `.microsecond` is always 0.
    fmt = (_TIME_FORMAT + '.%f') if '.' in t else _TIME_FORMAT
    return datetime.datetime.strptime(t, fmt)


def time2number(t_datetime):
    """Convert a timestamp string to seconds since the epoch as a float.

    The string is interpreted as local time (``time.mktime``); any
    microseconds present in the input are added as a fractional part.
    """
    parsed = _d(t_datetime)  # parse once and reuse for both terms
    return time.mktime(parsed.timetuple()) + 1e-6 * parsed.microsecond
def writeCache(outputFilename, data):
    """Write an array to a text file, one row per line, values space-separated.

    Args:
        outputFilename: path of the file to create or overwrite.
        data: array-like of rows (expected to be 2-D); every element is
            converted to its string form before writing.
    """
    # Joining keeps the printed text identical under Python 2 and Python 3.
    print(' '.join(('Writing Cache: ', outputFilename)))
    # Builtin `str` replaces the `np.str` alias, which NumPy 1.24 removed.
    # Converting before opening the file avoids leaving a truncated file
    # behind if the conversion fails.
    data_str = np.asarray(data).astype(str)
    with open(outputFilename, 'w') as f1:
        f1.writelines(' '.join(row) + '\n' for row in data_str)
def writeTimestamp(outputFilename, timestampData):
    """Write timestamps to a text file, one per line.

    Args:
        outputFilename: path of the file to create or overwrite.
        timestampData: 1-D array-like of timestamps; each entry is converted
            to its string form before writing.
    """
    # Joining keeps the printed text identical under Python 2 and Python 3.
    print(' '.join(('Writing timestamp: ', outputFilename)))
    # Builtin `str` replaces the `np.str` alias, which NumPy 1.24 removed.
    stamps = np.asarray(timestampData).astype(str)
    with open(outputFilename, 'w') as f1:
        f1.writelines(stamp + '\n' for stamp in stamps)
def getIO(sensorStr, baseName):
    """Pick the sensor IDs and output file name for a sensor specification.

    Args:
        sensorStr: any combination of the letters 'S' (sound), 'P' (PIR)
            and 'L' (light), e.g. 'SPL'.
        baseName: file name that the sensor-group prefixes are put in front of.

    Returns:
        A tuple ``(sensors, wholeName_outputFile)``. Groups are always
        emitted in the fixed order sound, PIR, light, whatever the order of
        the letters in ``sensorStr``.
    """
    sound_ids = ['101S54_Sound', '100S67_Sound', '100S69_Sound', '99S67_Sound', '102S58_Sound', '101S56_Sound', '102S56_Sound', '99S61_Sound', '102S57_Sound']
    # '102S21_PIR' is deliberately left out (flagged as problematic).
    pir_ids = ['101S4_PIR', '102S12_PIR', '102S32_PIR', '102S33_PIR', '99S24_PIR', '100S3_PIR', '99S4_PIR', '102S31_PIR', '100S24_PIR', '100S14_PIR', '102S34_PIR', '102S11_PIR']
    light_ids = ['99S65_Light', '99S60_Light', '100S65_Light', '100S60_Light', '102S55_Light']

    # (flag letter, sensor IDs, file-name prefix), listed in output order.
    groups = (
        ('S', sound_ids, 'allSound_'),
        ('P', pir_ids, 'allPIR_'),
        ('L', light_ids, 'allLight_'),
    )
    chosen = [group for group in groups if group[0] in sensorStr]

    sensors = []
    for _flag, ids, _prefix in chosen:
        sensors.extend(ids)
    prefix = ''.join(tag for _flag, _ids, tag in chosen)
    return sensors, prefix + baseName
def getBasename(name):
    """Derive the date-range tag from a merge file path.

    Expected input format: ``merge-year-month-daytoyear-month-day.dat``.
    The directory part (split on '/') and everything from the first '.' on
    are dropped, then the text up to the first '-' is discarded and the
    remaining '-' characters are removed, e.g.
    'merge-2014-09-01to2014-09-07.dat' -> '20140901to20140907'.
    """
    filename = name.rpartition('/')[2]
    stem = filename.partition('.')[0]
    # Everything after the first '-' is the date range; empty if there is none.
    date_part = stem.partition('-')[2]
    return date_part.replace('-', '')
if __name__ == '__main__':
    # Command-line driver: load selected sensor columns from a weka-format
    # file, reduce them to `reducedDimension` dimensions with sparse coding
    # (SC) or PCA, and write the reduced data plus the timestamps to outputDir.
    import os  # local import: only the script entry point needs it

    argparser = argparse.ArgumentParser()
    argparser.add_argument('wekaFilePath', type=str, help='wekaFilePath, eg:merge-2014-09-01to2014-09-07.dat')
    argparser.add_argument('dimReductionType', type=str, help='types of dimensionality reduction, [SC/PCA]')
    argparser.add_argument('reducedDimension', type=int, help='reducedDimension, integer, eg: 3')
    argparser.add_argument('specifiedSensors', type=str, help='specifiedSensors, eg: SPL')
    argparser.add_argument('outputDir', type=str, help='output directory')
    args = vars(argparser.parse_args())

    print('####################################\n'
          '# word construction #\n'
          '####################################')
    ## Input: raw data; weka format
    ## Output: .PCAcache

    ####################################
    # Parameter setting
    ####################################
    wekaFilePath = args['wekaFilePath']
    reducedDimension = args['reducedDimension']
    dimReductionType = args['dimReductionType']
    # Substring matching is kept as-is, with 'SC' taking precedence over 'PCA'.
    if 'SC' in dimReductionType:
        method = 'SC'
    elif 'PCA' in dimReductionType:
        method = 'PCA'
    else:
        # Stop here: carrying on would only fail later with a NameError on
        # the undefined baseName.
        raise SystemExit('Error: No Reduction Method Specified!!!')
    baseName = getBasename(wekaFilePath) + '_{0}_Dimension{1}_weka.cache'.format(method, reducedDimension)

    # Sensor selection, e.g. "SPL"
    specifiedSensors = args['specifiedSensors']
    sensorIDs, outputFilename = getIO(specifiedSensors, baseName)

    ####################################
    # Load weka format
    ####################################
    ## loading data, return numpy array
    dataArray, t = load_weikaFormat(wekaFilePath, sensorIDs)
    ## convert the type: str to float (a '?' entry makes this raise ValueError)
    print("Warning: Please make sure there is no any '?' value in the data.")
    # Builtin float replaces the `np.float` alias, which NumPy 1.24 removed.
    dataArray = dataArray.astype(float)
    gc.collect()

    ####################################
    # Dimensionality Reduction
    ####################################
    if method == 'SC':
        ####################################
        # Sparse Coding
        ####################################
        print('Sparse Coding:')
        # normalize every column respectively
        from sklearn.preprocessing import MinMaxScaler
        normalizer = MinMaxScaler()  # feature range (0,1)
        dataArray_normalized = normalizer.fit_transform(dataArray)
        print('normalized data:')
        print(dataArray_normalized)
        # reduce to the specified dimension
        from learnDic import sparse_coding
        from sklearn.decomposition import sparse_encode
        # NOTE(review): the meaning of 0.2, 1000 and 0.0001 is defined by
        # learnDic.sparse_coding, which is not visible here -- confirm there.
        dl = sparse_coding(reducedDimension, dataArray_normalized, 0.2, 1000, 0.0001)
        data_reduced = sparse_encode(dataArray_normalized, dl.components_)
        print('Reduced data:')
        print(data_reduced)
        print('Dictionary:')
        print(dl.components_)
        print('iteration: {0}'.format(dl.n_iter_))
    else:
        ####################################
        # Principal Component Analysis
        ####################################
        # NOTE(review): matplotlib.mlab.PCA was removed in matplotlib 3.1, so
        # this branch needs an older matplotlib; it is left untouched because
        # a replacement could change the numeric output.
        from matplotlib.mlab import PCA as mlabPCA
        print('PCA:')
        myPCA = mlabPCA(dataArray)
        data_reduced = myPCA.Y[:, 0:reducedDimension]  # reduce to the specified dimension
        print('Raw data:')
        print(dataArray)
        print('Reduced data:')
        print(data_reduced)
    ####################################
    # End of Dimensionality Reduction
    ####################################
    print('data_reduced dimension: {0}'.format(data_reduced.shape))
    # os.path.join works whether or not outputDir ends with a path separator.
    writeCache(os.path.join(args['outputDir'], outputFilename), data_reduced)
    writeTimestamp(os.path.join(args['outputDir'], 'timestamp'), t)
    print('Output file: {0}'.format(outputFilename))
    print('Done')