def makeModelCounts(splits, modelLocation, dataLocation,
                    neighborhoodLocation=None, minBehavior=0,
                    compress=2, splitLength=8):
    """Make a set of model counts for a given dataset and set of models.

    neighborhoodLocation specifies whether the models and data need to be
    preclustered.  Returns the data vector and the associated split times.
    """
    files = os.listdir(modelLocation)
    neighborhood = False
    dVector = []
    times = []

    if neighborhoodLocation:
        neighborclusters = ncluster.parse(neighborhoodLocation)
        neighborhood = True

    #Iterate over splits.
    for s in splits:
        oldSplit = datetime.datetime.strptime(s[0], "%Y-%m-%d %H:%M:%S")
        newSplit = datetime.datetime.strptime(s[1], "%Y-%m-%d %H:%M:%S")
        tmpDoc = []

        #Loop over all models.
        for f in files:
            #It is a data file.
            if f.split('.')[-1] == 'dat':
                #Open it and grab the models and sensor list.
                fn = dataio.loadData(modelLocation + str(f))
                fn.matrixToModel(fn.modelList)

                cd, td = bbdata.getdata(oldSplit, newSplit,
                                        comp=compress,
                                        sens=fn.sensors,
                                        readLocation=dataLocation)

                cd2 = cd
                if neighborhood:
                    local = neighborclusters[str(fn.sensors)]
                    cd2 = ncluster.convertNeighborhood(cd, local)
                    cd2 = numpy.array(cd2, ndmin=2)
                    cd2 = cd2.T

                sData = markov_anneal.splitLocalMax(cd2, td, splitLength)

                #If the interval holds no usable data, fall back to zero counts.
                try:
                    val, counts = analysis.ratio(sData.values(), fn.models)
                except Exception:
                    counts = [0] * len(fn.models)
                    val = [0] * len(fn.models)

                tmpDoc += counts

        if len(tmpDoc) >= minBehavior:
            dVector.append(tmpDoc)
            times.append(oldSplit)

        oldSplit = newSplit

    return dVector, times
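#Minimal usage sketch for makeModelCounts.  The directory paths are
#hypothetical placeholders; the makeSplits arguments mirror the lunch-period
#driver script later in this section.
splits = bbdata.makeSplits(40, "2008-03-17 00:00:00", "2008-03-23 23:59:59",
                           valid=[0, 2, 4],
                           splitLen=datetime.timedelta(minutes=8),
                           sPeriod="12:05:00", ePeriod="12:20:00")
dvec, tvec = makeModelCounts(splits, "../../runs/real/models_min_3/",
                             "../../data/generated/clean/",
                             neighborhoodLocation="../../data/generated/clean/neighborclusters.txt")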
def testDrawHMMCluster(fileLocation, limit = 100, cColor = (255, 255, 255)):
    """Draw the set of data associated with each hmm cluster.

    A cap of limit data patterns per cluster will be drawn to ensure that
    the images are of reasonable dimensions.
    """
    oData = dataio.loadData(fileLocation)
    oData.assignedData = [oData.sData]

    for i in range(len(oData.assignedData)):
        v = min(len(oData.assignedData[i]), limit)
        ns = oData.assignedData[i][0:v]
        tmp = [ns]

        drawHMMCluster(tmp, oData.data.shape[1], \
                       writeLocation = "../output/cluster" + str(i) + ".png", \
                       cColor = cColor)
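#Usage sketch: render the clusters stored in one saved model file.  The .dat
#path here is a hypothetical example.
testDrawHMMCluster("../../runs/clean/models/block0.dat", limit = 50)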
neighborhoodLocation = "../../data/generated/clean/neighborclusters.txt"
lsaLocation = "../../runs/real/data_min_3.lsa"
writeLocation = "../../runs/real/projected_lunch_early.data"
st = "2008-03-17 00:00:00"
et = "2008-03-23 23:59:59"
splitLength = 8
minBehavior = 0


if __name__ == "__main__":
    origList = []
    projList = []
    timeVec = []
    classList = []

    lsaData = dataio.loadData(lsaLocation)

    splits = bbdata.makeSplits(40, st, et, valid = [0, 2, 4], \
                               splitLen = datetime.timedelta(minutes = splitLength), \
                               sPeriod = "12:05:00", \
                               ePeriod = "12:20:00")

    dvec, tvec = projections.makeModelCounts(splits, modelDirectory, dataDirectory, \
                                             neighborhoodLocation, minBehavior)

    origList += dvec
    timeVec += tvec

    tmpP = analysis.projectList(dvec, lsaData.pwz)
    projList += tmpP
    classList += projections.classify(tmpP, 0)
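    #The script defines writeLocation but the fragment ends before using it.
    #A plausible final step, mirroring the dataio.saveData usage in
    #make_clusters.py below; the attribute names stored here are assumptions.
    result = dataio.loadData(lsaLocation)
    result.projList = projList
    result.classList = classList
    result.timeVec = timeVec
    dataio.saveData(writeLocation, result)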
if __name__ == "__main__":
    st = datetime.datetime.strptime(st, "%Y-%m-%d %H:%M:%S")
    et = datetime.datetime.strptime(et, "%Y-%m-%d %H:%M:%S")

    files = os.listdir(modelLocation)

    #Get the sensor blocks.
    for f in files:
        print f

        #It is a data file.
        if f.split('.')[-1] == 'dat':
            #Open it and grab the models and sensor list.
            fn = dataio.loadData(modelLocation + str(f))
            fn.matrixToModel(fn.modelList)
            print "Sensors:" + str(fn.sensors)

            cd, td = bbdata.comp(st, et, \
                                 comp = compress, \
                                 sens = fn.sensors, readLocation = dataLocation)

            sData = markov_anneal.splitLocalMax(cd, td, splitLen)
            outFile = writeLocation + str(f.split('.')[0]) + '.txt'

            #Make the file.
            detections.write_detections(sData, fn.models, fileName = outFile)
#TODO Make this into a function -- makeTDMatrix
for s in splits:
    print i
    i += 1

    oldSplit = datetime.datetime.strptime(s[0], "%Y-%m-%d %H:%M:%S")
    newSplit = datetime.datetime.strptime(s[1], "%Y-%m-%d %H:%M:%S")
    tmpDoc = []

    suppress.suppress(2)

    #Get the sensor blocks.
    for f in files:
        #It is a data file.
        if f.split('.')[-1] == 'dat':
            #Open it and grab the models and sensor list.
            fn = dataio.loadData(modelDirectory + str(f))
            fn.matrixToModel(fn.modelList)

            cd, td = bbdata.comp(oldSplit, newSplit, \
                                 comp = compress, \
                                 sens = fn.sensors, readLocation = dataDirectory)

            sData = markov_anneal.splitLocalMax(cd, td, splitLen)

            #For each split, make a document matrix and append it to the
            #ongoing tdmatrix.
            try:
                val, counts = analysis.ratio(sData.values(), fn.models)
            except Exception:
                counts = [0] * len(fn.models)
                val = [0] * len(fn.models)
suppress.suppress(2)
from ghmm import *
suppress.restore(2)

readLocation = "../../runs/clean/models/"


if __name__ == "__main__":
    files = os.listdir(readLocation)

    suppress.suppress(2)
    for f in files:
        print f

        #It is a data file.
        if f.split('.')[-1] == 'dat':
            #Open files.
            fn = dataio.loadData(readLocation + str(f))
            fn.matrixToModel(fn.modelList)
            sigma = IntegerRange(0, fn.obs)

            alldata = []
            for i in fn.assignedData:
                alldata += i

            print "hmm silhouette:" + str(hmmextra.hmmSilhoutte(alldata, fn.models, sigma))
            print "inter-model dist:" + str(markov_anneal._fitness(fn.models, fn.assignedData, sigma))
            print "Outliers:" + str(len(fn.out))
            print "Clusters per models:" + str([len(i) for i in fn.assignedData])
            print ""
    suppress.restore(2)
def apply_projection(assigned, u, startClassify = 0):
    """Project each assigned group through the basis u and tag its points
    with a class id offset by startClassify."""
    allData = []

    for l in range(len(assigned)):
        tmpClust = analysis.projectList(assigned[l], u)
        projections.classify(tmpClust, l + startClassify)
        allData += tmpClust

    return allData


if __name__ == "__main__":
    tmpd = dataio.loadData(dataDirectory)

    assigned = make_assigned(tmpd.projList)
    u = lsa_reduce(assigned)
    data = apply_projection(assigned, u)
    nd = classify_data(assigned, data, tmpd.centers)

    #tmpd2 = dataio.loadData(dataDirectory2)
    #assigned2 = make_assigned(tmpd2.projList[:40])
    #data2 = apply_projection(assigned2, u, 2)
    #nd2 = classify_data(assigned2, data2, tmpd.centers)

    #for l in range(len(nd)):
    #    nd[l] += nd2[l]

    t = visualizer.plotPoints(nd)
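    #Sketch of the second-dataset flow hinted at in the commented block above:
    #startClassify offsets class ids so the second run's points do not collide
    #with the first (run one labels 0..k-1, run two starts at len(assigned)).
    #dataDirectory2 is a hypothetical second saved run.
    #tmpd2 = dataio.loadData(dataDirectory2)
    #assigned2 = make_assigned(tmpd2.projList)
    #data2 = apply_projection(assigned2, u, startClassify = len(assigned))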
                               ePeriod = "19:00:00")

    #Iterate over splits.
    for s in splits:
        print i
        i += 1

        oldSplit = datetime.datetime.strptime(s[0], "%Y-%m-%d %H:%M:%S")
        newSplit = datetime.datetime.strptime(s[1], "%Y-%m-%d %H:%M:%S")
        tmpDoc = []

        #Loop over all models.
        for f in files:
            #It is a data file.
            if f.split('.')[-1] == 'dat':
                #Open it and grab the models and sensor list.
                fn = dataio.loadData(modelLocation + str(f))
                fn.matrixToModel(fn.modelList)

                cd, td = bbdata.getdata(oldSplit, newSplit, \
                                        comp = compress, \
                                        sens = fn.sensors, readLocation = dataLocation)

                local = neighborclusters[str(fn.sensors)]
                cd2 = ncluster.convertNeighborhood(cd, local)
                cd2 = numpy.array(cd2, ndmin = 2)
                cd2 = cd2.T

                sData = markov_anneal.splitLocalMax(cd2, td, splitLength)

                #For each split, make a document matrix and append it to the
                #ongoing tdmatrix.
    start = calc.datetonumber("2008-01-01 00:00:00")
    #datetonumber appears to pack the time of day into the last five digits
    #(86400 seconds fits below 100000), so integer division yields a day index.
    start /= 100000
    print start

    sensAll = []
    #a appears to hold the day indices built in the elided part of this
    #function, so diff is the number of days covered.
    diff = a[-1] - start
    print diff

    #Count the activations recorded on each day.
    for i in range(diff):
        tmp = db('date') == start + i
        sensAll.append(len(tmp))

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ind = numpy.arange(diff)
    print sensAll

    rect = ax.bar(ind, sensAll, 0.3, color = 'r')
    plt.show()


if __name__ == "__main__":
    data = dataio.loadData(dbLocation)
    print "Data loaded."

    #getNumActivations(data['db'])
    plotNumSensors(data['db'])
def getdata(st, et, \
            pStart = datetime.datetime.strptime("00:00:00", "%H:%M:%S"), \
            pEnd = datetime.datetime.strptime("23:59:59", "%H:%M:%S"), \
            vDays = [0, 1, 2, 3, 4, 5, 6], \
            comp = 1, \
            sens = allSensors, \
            readLocation = bLocation, compress = True):
    """Get data from a given database location.

    st = start time
    et = end time
    comp = how compressed (in seconds) the returned data list should be.
           Compression uses the compressVector method.
    compress = whether each time step should be collapsed into a single
               number representing the whole region of sensors, rather than
               kept as an array per sensor.  (Note: in this version only the
               compressed path appends to the data list.)

    Both start and end times must be datetime objects.

    For vDays -- 0 = Monday, 6 = Sunday.

    Returns a (data, time) pair of numpy arrays.
    """
    global allData
    global dataLocation

    #Open the database -- do not reopen if it is already in memory.
    if (not allData) or (not (readLocation == dataLocation)):
        allData = dataio.loadData(readLocation + "data.dat")
        dataLocation = readLocation

    positions = {}              #Position in the event array for each sensor.
    db = allData['db']
    timeList = []
    cData = []                  #Data set being built.
    ct = st
    oneSec = datetime.timedelta(seconds = 1)
    tmp = 0

    #Find the starting positions for all sensors.  The position is always the
    #index of the next sensor activation at or past the current time.
    for s in sens:
        ct = st.toordinal()

        while True:
            tmp = (db('date') == ct) & (db('sensor') == s)

            if len(tmp.records) > 0:
                positions[s] = tmp.records[0]['index']
                break
            else:
                ct += 1

                if ct > et.toordinal():
                    #No activations in range; park the sensor past the end.
                    positions[s] = 1000000000
                    break

        ct = st

        #Advance the position if it is not yet past the current time.
        current = calc.datetonumber(ct)
        try:
            while allData[s][positions[s]] < current:
                positions[s] += 1
        except (IndexError, KeyError):
            positions[s] = 1000000000

    while ct <= et:
        #Check if the time is valid.
        if ct.weekday() in vDays:
            if _validTime(ct, pStart, pEnd):
                cVec = [0] * len(sens)

                for i in range(len(sens)):
                    if positions.has_key(sens[i]):
                        #Convert ct to compressed time.
                        current = calc.datetonumber(ct)
                        t = positions[sens[i]]

                        #If the current time plus the comp window passes some
                        #real data, then update cVec and the position.
                        if t >= len(allData[sens[i]]) - 1:
                            continue

                        if current + comp >= allData[sens[i]][t]:
                            cVec[i] = 1

                            #Find the next valid position and update.
                            while((current + comp > allData[sens[i]][t]) and \
                                  (t < len(allData[sens[i]]) - 1)):
                                t += 1
                            positions[sens[i]] = t

                #Determine the compressed value for cVec.
                if compress:
                    cData.append(compressVector(cVec))
                timeList.append(ct)

        #Advance by the compression window.
        ct += oneSec * comp

    cData = numpy.array(cData)
    cData.resize(cData.shape[0], 1)
    timeList = numpy.array(timeList)
    timeList.resize(timeList.shape[0], 1)

    return cData, timeList
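#Usage sketch for getdata: pull one compressed day of data for an assumed
#sensor block.  The sensor ids and read location are hypothetical.
st = datetime.datetime.strptime("2008-03-17 00:00:00", "%Y-%m-%d %H:%M:%S")
et = datetime.datetime.strptime("2008-03-17 23:59:59", "%Y-%m-%d %H:%M:%S")
cd, td = getdata(st, et, comp = 2, sens = [1, 2, 3], \
                 readLocation = "../../data/generated/clean/")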
                          skip = datetime.timedelta(minutes = skipLength))

    #Iterate over splits.
    for s in splits:
        print i
        i += 1

        oldSplit = datetime.datetime.strptime(s[0], "%Y-%m-%d %H:%M:%S")
        newSplit = datetime.datetime.strptime(s[1], "%Y-%m-%d %H:%M:%S")
        tmpDoc = []

        #Loop over all models.
        for f in files:
            #It is a data file.
            if f.split('.')[-1] == 'dat':
                #Open it and grab the models and sensor list.
                fn = dataio.loadData(modelLocation + str(f))
                fn.matrixToModel(fn.modelList)

                cd, td = bbdata.getdata(oldSplit, newSplit, \
                                        comp = compress, \
                                        sens = fn.sensors, readLocation = dataLocation)

                local = neighborclusters[str(fn.sensors)]
                cd2 = ncluster.convertNeighborhood(cd, local)
                cd2 = numpy.array(cd2, ndmin = 2)
                cd2 = cd2.T

                sData = markov_anneal.splitLocalMax(cd2, td, splitLength)
""" make_clusters.py Author: James Howard Program used to make clusters for a set of data. Saves cluster centers and information to the same file where data was taken from. """ import pycl import pybb.data.dataio as dataio dataDirectory = "../../runs/real/projected_lunch.data" writeLocation = "../../runs/real/projected_lunch.data" if __name__ == "__main__": tmpd = dataio.loadData(dataDirectory) data = tmpd.classList data = pycl.Dataset(data) #Try kmeans kmeans = pycl.Kmeans(2) kmeans.train(data) tmpd.centers = kmeans._Kmeans__centers tmpd.clusters = kmeans._Kmeans__clusters dataio.saveData(writeLocation, tmpd)
    #Perform simple clustering, removing all data below the minData threshold
    #from potential clustering.
    sd = datetime.datetime.strptime(startDate, "%Y-%m-%d %H:%M:%S")
    sk = datetime.timedelta(seconds = skip)
    numclusters = int(math.ceil((blockmax - mindata) / (1.0 * blocksize))) + 1

    #For now just group by simple blocks from minData to 1000.
    clusters = [[] for i in range(numclusters)]

    for s in range(len(blocks)):
        ct = sd + sk * s

        #Determine cluster.
        c = min(int(math.ceil((blocks[s] - mindata) / (1.0 * blocksize))), \
                numclusters - 1)
        clusters[c].append((blocks[s], str(ct)))

    return clusters


if __name__ == "__main__":
    data = dataio.loadData(dbLocation)
    blocks = makeBlocks(data, window, skip, startDate, endDate, sensors)
    clusters = clusterBlocks(blocks, skip, startDate)
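    #Worked example of the bucketing formula above, with assumed parameters
    #mindata = 10, blocksize = 50, blockmax = 210 (giving numclusters = 5).
    #A block of 137 activations lands in bucket ceil((137 - 10) / 50.0) = 3;
    #anything at or above blockmax clamps to the last bucket, index 4.
    print min(int(math.ceil((137 - 10) / (1.0 * 50))), 5 - 1)    #prints 3
    print min(int(math.ceil((500 - 10) / (1.0 * 50))), 5 - 1)    #prints 4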
dataDirectory = "../../runs/real/data_min_3.lsa"
projDirectory = "../../runs/real/projected.data"
modelDirectory = "../../runs/real/models_min_3/"


if __name__ == "__main__":
    files = os.listdir(modelDirectory)
    modelNumber = []

    for f in files:
        print f

        #It is a data file.
        if f.split('.')[-1] == 'dat':
            #Open files.
            fn = dataio.loadData(modelDirectory + str(f))

            for i in range(len(fn.modelList)):
                modelNumber.append(str(f) + " -- " + str(i))

    data = dataio.loadData(dataDirectory)
    projected = dataio.loadData(projDirectory)

    #Dot each cluster center against every LSA topic vector.
    values = [[] for j in range(len(projected.centers))]
    for j in range(len(projected.centers)):
        for i in range(len(data.pwz)):
            values[j].append(numpy.dot(projected.centers[j], data.pwz[i]))

    for i in range(len(values[0])):
        print str(abs(values[0][i] - values[1][i])) + " : " + \