clusterTrafficData.py
__author__ = 'Daniel'
from KMeans import *
from CentroidDetermination import *
from clusterEvaluation import *
import pika
import json
from RingBuffer import RingBuffer
import logging
from pprint import pformat, pprint
import csv
from datetime import datetime, timedelta
from customLogging import info, writeToCsv, setChannelName
from trafficDataImport import getMetaData
initialPacketsize = 100
packetsize = 10
centroidinp = [[], []]
means = False
clusterResult = []
# Only the features that are taken into account during clustering are stored here
clusterDataStore = RingBuffer(50000)
# Metadata is stored here: everything that is important for later analysis but should
# not be taken into account during clustering (e.g. location, timestamp). It must be
# ensured that this buffer stays consistent with the buffer above (i.e. the index links the data)
# TODO: Once able to retrieve individual values through CKAN API fill this with values
metaDataStore = RingBuffer(50000)
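# Illustrative sketch (not part of the original file): the alignment invariant means the
# two buffers must always be appended to together, e.g.
#   clusterDataStore.append([avgSpeed, vehicleCount])  # features at index i
#   metaDataStore.append(metaData)                      # metadata at the same index i
# so that clusterDataStore.get()[i] and metaDataStore.get()[i] describe the same reading.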
# Save the chosen k
k = 0
datetimeFormat = '%Y-%m-%dT%H:%M:%S'
# Timestamps that track when the initial centroid computation and the recalculations take place
startdate = datetime.strptime('2014-02-13T12:25:00', datetimeFormat)
datesincerecalculation = datetime.strptime('2014-02-13T12:25:00', datetimeFormat)
recalculationtime = 0
# Counter used by the (currently disabled) size-based recalculation trigger;
# callback() still increments it, so it needs a module-level initial value
recalcsize = 0
def init(name, recalctime):
    # startdate is reassigned below, so it must be declared global here to update the
    # module-level timestamp rather than a function-local variable
    global recalculationtime, startdate
    logging.basicConfig()
    # logging.basicConfig(filename="log/"+name+".log",filemode='wb')
    # logger = logging.getLogger(__name__)
    startdate = datetime.now()
    recalculationtime = recalctime
    # Set up the connection and declare the channel
    connection = pika.BlockingConnection(pika.ConnectionParameters(host="127.0.0.1"))
    channel = connection.channel()
    channel.exchange_declare(exchange='clustertraffic', type='fanout')
    result = channel.queue_declare(exclusive=True)
    queue_name = result.method.queue
    channel.queue_bind(exchange='clustertraffic', queue=queue_name)
    setChannelName(name)
    # initialize logging
    # logger.setLevel(logging.DEBUG)
    # ch = logging.StreamHandler()
    # ch.setLevel(logging.DEBUG)
    # formatter = logging.Formatter('%(asctime)s %(message)s')
    # ch.setFormatter(formatter)
    # logger.addHandler(ch)
    return channel, queue_name
# For performance reasons, the for loop in this method is also used to store the
# values in the ring buffer
def transform(arr, store=True):
    ret = []
    for i in range(len(arr[0])):
        ret.append([arr[0][i], arr[1][i]])
        if store:
            clusterDataStore.append(ret[i])
    return ret
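# Example (sketch, using the two feature lists as in the rest of this module):
# transform turns column-major input into per-reading row vectors, e.g.
#   transform([[50, 60], [10, 12]], store=False) == [[50, 10], [60, 12]]
# i.e. [avgSpeeds, vehicleCounts] becomes [[speed_0, count_0], [speed_1, count_1]].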
def recalculated_clustering(inp, k):
    # global logger
    means = recalculate_centroids(inp, k)
    kmeansinput = transform(inp, False)
    features = len(kmeansinput[0])
    weights = [1 for i in range(features)]
    result = kmeans(means, kmeansinput, weights, features, len(means))
    return result
def initial_clustering(timestamp, inp):
    # global logger
    means = predetermine_centroids(inp)
    kmeansinput = transform(inp)
    # pprint(means)
    features = len(kmeansinput[0])
    weights = [1 for i in range(features)]
    result = [kmeans(v, kmeansinput, weights, features, len(v)) for v in means]
    pprint(result)
    clustering = [c["cluster"] for c in result if not isinstance(c, float)]
    silhoutteCoefficients = [silhoutteCoefficient(timestamp, c1) for c1 in clustering]
    clustersizes = [[len(c) for c in c1] for c1 in clustering]
    # Pick the candidate clustering with the highest silhouette coefficient
    highestSil = -1
    idx = 0  # default so idx is always bound, even if every coefficient is -1
    for i, s in enumerate(silhoutteCoefficients):
        if s > highestSil:
            highestSil = s
            idx = i
    info(timestamp, "Cluster sizes after initial clustering:")
    info(timestamp, pformat(clustersizes))
    info(timestamp, "Chosen k = %i" % len(clustering[idx]))
    pprint(idx)
    return result[idx]
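# Overview of the message handler below (descriptive summary, not in the original):
# pika invokes callback() for every incoming reading, and it runs in three phases:
#   1. Warm-up: for the first 24 hours, only collect values into centroidinp.
#   2. Initial clustering: pick k via the highest silhouette coefficient.
#   3. Steady state: assign each new value incrementally with kmeans_new_value; every
#      `recalculationtime` hours, recompute the centroids from the ring buffer.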
def callback(ch, method, properties, body):
    global centroidinp, means, clusterResult, k, datetimeFormat, startdate, datesincerecalculation, recalculationtime, recalcsize
    body = json.loads(body)
    currentTimeStamp = datetime.strptime(body["data"]["TIMESTAMP"], datetimeFormat)
    report_id = body["data"]["REPORT_ID"]
    # Retry until the metadata service answers
    while True:
        try:
            metaData = getMetaData(report_id)
            break
        except Exception:
            print "metaData retrieval timed out, trying again"
    if not means:
        # if len(centroidinp[0]) < 30:
        if (currentTimeStamp - startdate) < timedelta(hours=24):
            centroidinp[0].append(body["data"]["avgSpeed"])
            centroidinp[1].append(body["data"]["vehicleCount"])
            newValue = [body["data"]["avgSpeed"], body["data"]["vehicleCount"]]
            hitBucket = "n/a"
            writeToCsv(newValue, currentTimeStamp, metaData, hitBucket)
            # print "waiting for data %i" % len(centroidinp[0])
            return
        else:
            datesincerecalculation = currentTimeStamp
            clusterResult = initial_clustering(currentTimeStamp, centroidinp)
            k = len(clusterResult["means"])
            info(currentTimeStamp, "Results after initial clustering:")
            info(currentTimeStamp, "Cluster:")
            clustersizes = [len(c) for c in clusterResult['cluster']]
            info(currentTimeStamp, pformat(clustersizes))
            means = [{'Average Speed': x[0], 'Vehicle Count': x[1]} for x in clusterResult['means']]
            info(currentTimeStamp, "Centroids:")
            info(currentTimeStamp, pformat(means))
            return
    # elif len(clusterDataStore) > recalcsize:
    elif (currentTimeStamp - datesincerecalculation) > timedelta(hours=recalculationtime):
        recalcsize += 100  # counter for the disabled size-based trigger above
        # print "clustering data %i" % len(clusterDataStore)
        # Enough time has passed to recalibrate
        datesincerecalculation = currentTimeStamp
        centroidinp = [[], []]
        newValue = [body["data"]["avgSpeed"], body["data"]["vehicleCount"]]
        clusterDataStore.append(newValue)
        metaDataStore.append(metaData)
        for x in clusterDataStore.get():
            for i, v in enumerate(x):
                centroidinp[i].append(v)
        info(currentTimeStamp, "Recalibrating Centroids...")
        # print "Recalibrating Centroids..."
        clusterResult = recalculated_clustering(centroidinp, k)
        if clusterResult == 0:
            # Some bug which can only be reproduced at random causes clusterResult to
            # become 0. Ugly hack: just retry and hope the system recovers...
            while clusterResult == 0:
                clusterResult = recalculated_clustering(centroidinp, k)
                print "recalculating centroids went wrong, here we go again"
            return
        # info(m)
        # info(lastm)
        # info("New Centroids. Last m %i, new m $i" % (lastm, m))
        means = [{'Average Speed': x[0], 'Vehicle Count': x[1]} for x in clusterResult['means']]
        info(currentTimeStamp, pformat(means))
        clustersizes = [len(c) for c in clusterResult['cluster']]
        hitBucket = "n/a"
        info(currentTimeStamp, pformat(clustersizes))
        writeToCsv(newValue, currentTimeStamp, metaData, hitBucket)
        return
    else:
        # print "clustering data %i" % len(clusterDataStore)
        newValue = [body["data"]["avgSpeed"], body["data"]["vehicleCount"]]
        clusterDataStore.append(newValue)
        metaDataStore.append(metaData)
        features = len(newValue)
        weights = [1 for i in range(features)]
        clusterResult = kmeans_new_value(clusterResult['means'], clusterResult['cluster'], weights, features, newValue)
        means = [{'Average Speed': x[0], 'Vehicle Count': x[1]} for x in clusterResult['means']]
        info(currentTimeStamp, "Centroids at time " + currentTimeStamp.strftime(datetimeFormat))
        info(currentTimeStamp, pformat(means))
        clustersizes = [len(c) for c in clusterResult['cluster']]
        hitBucket = clusterResult['hit_bucket']
        info(currentTimeStamp, pformat(clustersizes))
        writeToCsv(newValue, currentTimeStamp, metaData, hitBucket)
        return
def clusterData(channelname, recalctime):
    # global logger
    channel, queue_name = init(channelname, recalctime)
    # Redundant but harmless: the exchange was already declared in init(), and
    # redeclaring with the same parameters is idempotent
    channel.exchange_declare(exchange='clustertraffic', type='fanout')
    info(startdate, "Started main program, waiting for data...")
    # Write the CSV header once; the data rows are appended later via writeToCsv
    with open(channelname + 'Data' + startdate.strftime("%Y-%m-%d") + '.csv', 'wb') as csvfile:
        wr = csv.writer(csvfile, delimiter=',', quotechar='\"', quoting=csv.QUOTE_MINIMAL)
        wr.writerow(["Average Speed", "Vehicle Count", "Timestamp", "Street1", "City1", "Latitude1", "Longitude1",
                     "Street2", "City2", "Latitude2", "Longitude2", "Nearest Centroid"])
    channel.basic_consume(callback, queue=queue_name, no_ack=True)
    channel.start_consuming()
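# Minimal usage sketch (hypothetical entry point; the original module defines no main
# guard, and 'clusterWorker' / 24 are example values for the channel name and the
# recalculation interval in hours):
if __name__ == '__main__':
    clusterData('clusterWorker', 24)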