Example #1
import os
import datetime

import numpy

#Project modules (dataio, bbdata, ncluster, markov_anneal, analysis) are
#assumed importable from the surrounding codebase.


def makeModelCounts(
    splits, modelLocation, dataLocation, neighborhoodLocation=None, minBehavior=0, compress=2, splitLength=8
):
    """
    Makes a set of counts for a given dataset and models.  
    
    Neighborhood location specifies if the models and data need to be preclustered.
    
    Returns the datavector and the associated split times.
    """
    files = os.listdir(modelLocation)

    neighborhood = False
    dVector = []
    times = []

    if neighborhoodLocation:
        neighborclusters = ncluster.parse(neighborhoodLocation)
        neighborhood = True

    # Iterate over splits.
    for s in splits:
        oldSplit = datetime.datetime.strptime(s[0], "%Y-%m-%d %H:%M:%S")
        newSplit = datetime.datetime.strptime(s[1], "%Y-%m-%d %H:%M:%S")

        tmpDoc = []
        # Loop over all models
        for f in files:
            # It is a data file.
            if f.split(".")[-1] == "dat":
                # Open it and grab the models and sensor list
                fn = dataio.loadData(modelLocation + str(f))
                fn.matrixToModel(fn.modelList)

                cd, td = bbdata.getdata(oldSplit, newSplit, comp=compress, sens=fn.sensors, readLocation=dataLocation)

                cd2 = cd
                if neighborhood:
                    local = neighborclusters[str(fn.sensors)]
                    cd2 = ncluster.convertNeighborhood(cd, local)

                cd2 = numpy.array(cd2, ndmin=2)
                cd2 = cd2.T

                sData = markov_anneal.splitLocalMax(cd2, td, splitLength)

                #If the ratio computation fails, fall back to zero counts.
                try:
                    val, counts = analysis.ratio(sData.values(), fn.models)
                except Exception:
                    counts = [0] * len(fn.models)
                    val = [0] * len(fn.models)
                tmpDoc += counts

        if len(tmpDoc) >= minBehavior:
            dVector.append(tmpDoc)
            times.append(oldSplit)

        oldSplit = newSplit

    return dVector, times
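
A minimal usage sketch, assuming the project modules above are importable and the model/data directories exist (the paths here are hypothetical). Each split is a (start, end) pair of timestamp strings matching the strptime format used inside the function:

splits = [("2008-03-17 12:05:00", "2008-03-17 12:13:00"),
          ("2008-03-18 12:05:00", "2008-03-18 12:13:00")]
dVector, times = makeModelCounts(splits, "../models/", "../data/",
                                 neighborhoodLocation=None, minBehavior=0)
print(len(dVector))   #one row of model counts per split that met minBehavior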
Example #2
def testDrawHMMCluster(fileLocation, limit = 100, cColor = (255, 255, 255)):
    """Draw the set of data associated with each hmm cluster.
    
    A cap of limit data patterns per cluster will be drawn to ensure that 
    the images are of reasonable dimensions.
    """

    oData = dataio.loadData(fileLocation)

    oData.assignedData = [oData.sData]

    for i in range(len(oData.assignedData)):
        v = min(len(oData.assignedData[i]), limit)
        ns = oData.assignedData[i][0:v]
        tmp = [ns]

        drawHMMCluster(tmp, oData.data.shape[1],
                       writeLocation="../output/cluster" + str(i) + ".png",
                       cColor=cColor)
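
A usage sketch for the function above; the .dat path is hypothetical and assumed to have been written by this codebase's dataio:

testDrawHMMCluster("../../runs/clean/models/model0.dat", limit=50)
#Writes ../output/cluster0.png -- the function wraps oData.sData in a single
#cluster, so only one image is produced.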
Example #3
neighborhoodLocation = "../../data/generated/clean/neighborclusters.txt"
lsaLocation = "../../runs/real/data_min_3.lsa"
writeLocation = "../../runs/real/projected_lunch_early.data"
        
st = "2008-03-17 00:00:00"
et = "2008-03-23 23:59:59"
splitLength = 8
minBehavior = 0

if __name__ == "__main__":
    
    origList = []
    projList = []
    timeVec = []
    classList = []
    lsaData = dataio.loadData(lsaLocation)
    
    splits = bbdata.makeSplits(40, st, et, valid = [0, 2, 4], \
                    splitLen = datetime.timedelta(minutes = splitLength), \
                    sPeriod = "12:05:00", \
                    ePeriod = "12:20:00")
                    
    #modelDirectory and dataDirectory are assumed to be defined alongside the
    #paths above in the full script.
    dvec, tvec = projections.makeModelCounts(splits, modelDirectory, dataDirectory, \
                                        neighborhoodLocation, minBehavior)
    
    origList += dvec
    timeVec += tvec
    tmpP = analysis.projectList(dvec, lsaData.pwz)
    projList += tmpP
    classList += projections.classify(tmpP, 0)
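
For intuition, a self-contained sketch of the projection step. The assumption (not confirmed by the source) is that analysis.projectList maps each count vector onto the LSA topic directions in lsaData.pwz via dot products:

import numpy

pwz = numpy.array([[0.2, 0.8, 0.0],   #hypothetical topic vectors
                   [0.9, 0.0, 0.1]])
doc = numpy.array([3, 1, 0])          #one row of model counts
projected = [numpy.dot(topic, doc) for topic in pwz]
print(projected)   #[1.4, 2.7]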
    
Example #4
if __name__ == "__main__":

    #st, et and the various locations are assumed to be defined at module
    #level in the full script.
    st = datetime.datetime.strptime(st, "%Y-%m-%d %H:%M:%S")
    et = datetime.datetime.strptime(et, "%Y-%m-%d %H:%M:%S")

    files = os.listdir(modelLocation)

    #Get the sensor blocks
    for f in files:
        print f
        #It is a data file.
        if f.split('.')[-1] == 'dat':

            #Open it and grab the models and sensor list
            fn = dataio.loadData(modelLocation + str(f))
            fn.matrixToModel(fn.modelList)

            print "Sensors:" + str(fn.sensors)
            cd, td = bbdata.comp(st, et, \
                    comp = compress, \
                    sens = fn.sensors,
                    readLocation = dataLocation)

            sData = markov_anneal.splitLocalMax(cd, td, splitLen)

            outFile = writeLocation + str(f.split('.')[0]) + '.txt'

            #Make the file.
            detections.write_detections(sData, fn.models, fileName=outFile)
Example #5
    #TODO Make this into a function -- makeTDMatrix
    for s in splits:
        print i   #i is a running split counter, initialised before this loop
        i += 1
        oldSplit = datetime.datetime.strptime(s[0], "%Y-%m-%d %H:%M:%S")
        newSplit = datetime.datetime.strptime(s[1], "%Y-%m-%d %H:%M:%S")
        tmpDoc = []

        suppress.suppress(2)
        #Get the sensor blocks
        for f in files:
            #It is a data file.
            if f.split('.')[-1] == 'dat':

                #Open it and grab the models and sensor list
                fn = dataio.loadData(modelDirectory + str(f))
                fn.matrixToModel(fn.modelList)
                cd, td = bbdata.comp(oldSplit, newSplit, \
                                    comp = compress, \
                                    sens = fn.sensors,
                                    readLocation = dataDirectory)

                sData = markov_anneal.splitLocalMax(cd, td, splitLen)

                #for each split, make a document matrix and append it to the
                #ongoing tdmatrix
                try:
                    val, counts = analysis.ratio(sData.values(), fn.models)
                except Exception:
                    counts = [0] * len(fn.models)
                    val = [0] * len(fn.models)
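
The comment above describes accumulating a term-document matrix; a toy illustration of that accumulation with made-up counts:

tdmatrix = []
for counts in ([2, 0, 1], [0, 3, 0]):   #model counts from two splits
    tdmatrix.append(counts)
#tdmatrix is now [[2, 0, 1], [0, 3, 0]]: one row per split, one column per model.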
Example #6
suppress.suppress(2)
from ghmm import *
suppress.restore(2)

readLocation = "../../runs/clean/models/"

if __name__ == "__main__":
    files = os.listdir(readLocation)

    #suppress(2) presumably redirects stderr (fd 2) to hide ghmm's C-level output.
    suppress.suppress(2)
    for f in files:
        print f
        #It is a data file.
        if f.split('.')[-1] == 'dat':
        
            #Open files
            fn = dataio.loadData(readLocation + str(f))
            fn.matrixToModel(fn.modelList)
            
            sigma = IntegerRange(0, fn.obs)
            
            alldata = []
            for i in fn.assignedData:
                alldata += i
            
            print "hmm silhouette:" + str(hmmextra.hmmSilhoutte(alldata, fn.models, sigma))
            print "inter-model dist:" + str(markov_anneal._fitness(fn.models, fn.assignedData, sigma))
            print "Outliers:" + str(len(fn.out))
            print "Clusters per models:" + str([len(i) for i in fn.assignedData])
            print ""
    suppress.restore(2)
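
hmmextra.hmmSilhoutte's internals are not shown; for reference, the standard silhouette coefficient it is presumably named after scores a point from its mean intra-cluster distance a and its smallest mean distance b to another cluster:

def silhouette_point(a, b):
    #s = (b - a) / max(a, b); values near 1 mean well-separated clusters.
    return (b - a) / max(a, b)

print(silhouette_point(0.4, 1.0))   #0.6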
Example #7
def apply_projection(assigned, u, startClassify = 0):
    """Project each cluster's vectors through u and tag them with a class id."""
    allData = []

    for l in range(len(assigned)):
        tmpClust = analysis.projectList(assigned[l], u)
        projections.classify(tmpClust, l + startClassify)
        allData += tmpClust

    return allData
    


if __name__ == "__main__":
    
    tmpd = dataio.loadData(dataDirectory)
        
    assigned = make_assigned(tmpd.projList)
    u = lsa_reduce(assigned)
    data = apply_projection(assigned, u)
    nd = classify_data(assigned, data, tmpd.centers)
    
    #tmpd2 = dataio.loadData(dataDirectory2)
    #assigned2 = make_assigned(tmpd2.projList[:40])
    #data2 = apply_projection(assigned2, u, 2)    
    #nd2 = classify_data(assigned2, data2, tmpd.centers)

    #for l in range(len(nd)):
    #    nd[l] += nd2[l]

    t = visualizer.plotPoints(nd)
Example #8
                    ePeriod = "19:00:00")   #tail of the bbdata.makeSplits call

    #Iterate over splits.
    for s in splits:
        print i
        i += 1
        oldSplit = datetime.datetime.strptime(s[0], "%Y-%m-%d %H:%M:%S")
        newSplit = datetime.datetime.strptime(s[1], "%Y-%m-%d %H:%M:%S")

        tmpDoc = []
        #Loop over all models
        for f in files:
            #It is a data file.
            if f.split('.')[-1] == 'dat':
                #Open it and grab the models and sensor list
                fn = dataio.loadData(modelLocation + str(f))
                fn.matrixToModel(fn.modelList)

                cd, td = bbdata.getdata(oldSplit, newSplit, \
                                    comp = compress, \
                                    sens = fn.sensors,
                                    readLocation = dataLocation)

                local = neighborclusters[str(fn.sensors)]
                cd2 = ncluster.convertNeighborhood(cd, local)
                cd2 = numpy.array(cd2, ndmin=2)
                cd2 = cd2.T

                sData = markov_anneal.splitLocalMax(cd2, td, splitLength)

                #for each split, make a document matrix and append it to the
Example #9
    start = calc.datetonumber("2008-01-01 00:00:00")
    start /= 100000
    print start

    sensAll = []

    #a is assumed to hold the encoded dates, defined earlier in the script.
    diff = a[-1] - start

    print diff
    
    for i in range(diff):
        tmp = db('date') == start + i
        
        sensAll.append(len(tmp))
        
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ind = numpy.arange(diff)
    
    print sensAll
    
    rect = ax.bar(ind, sensAll, 0.3, color = 'r')
    
    plt.show()
    

if __name__ == "__main__":
    data = dataio.loadData(dbLocation)
    print "Data loaded."
    #getNumActivations(data['db'])
    plotNumSensors(data['db'])
Example #10
def getdata(st, et, \
            pStart = datetime.datetime.strptime("00:00:00", "%H:%M:%S"), \
            pEnd = datetime.datetime.strptime("23:59:59", "%H:%M:%S"), \
            vDays = [0, 1, 2, 3, 4, 5, 6], \
            comp = 1, \
            sens = allSensors, \
            readLocation = bLocation,
            compress = True):
    """getdata from a given database location.
    
    st = start time
    et = end time.
    comp = how compressed (in seconds) the returned data list should be.
            compression uses the compressVector method
    compress = If the returned result should be as an array for each 
                sensor or as a single number representing the data of the 
                region of sensors given.
    
    Both start and end times must be of datetime objects.
    
    returns a numpy array.
    
    For vDays -- 0 = Monday.  6 = Sunday.
    """
    global allData
    global dataLocation
    
    #Open the database -- Do not open if already in memory.
    if (not allData) or (not (readLocation == dataLocation)):
        allData = dataio.loadData(readLocation + "data.dat")
        dataLocation = readLocation
    
    positions = {} #Position in array for each sensor
    db = allData['db']
    timeList = []
    cData = []  #Data set being built.
    ct = st
    oneSec = datetime.timedelta(seconds = 1)
    tmp = 0

    #Find the starting positions for all sensors. -- Position will always be
    #the index of the next sensor location past the current time.
    for s in sens:
        ct = st.toordinal()
        while True:
            tmp = (db('date') == ct) & (db('sensor') == s)

            if len(tmp.records) > 0:
                positions[s] = tmp.records[0]['index']
                break
            else:
                ct += 1
                
            if ct > et.toordinal():
                #Set it to an upper limit.
                positions[s] = 1000000000
                break

        ct = st
        
        #Check if position isn't far enough and advance as necessary
        current = calc.datetonumber(ct)
        try:
            while allData[s][positions[s]] < current:
                positions[s] += 1
        except Exception:
            #Ran past the end of this sensor's data; park the position.
            positions[s] = 1000000000

    while ct <= et:
        #Check if the time is valid.
        if ct.weekday() in vDays:
            if _validTime(ct, pStart, pEnd):
                cVec = [0] * len(sens)
                
                for i in range(len(sens)):
                    if positions.has_key(sens[i]):
                        
                        #Convert the ct to compressed time
                        current = calc.datetonumber(ct)
                        t = positions[sens[i]]

                        #If the current time plus the comp time pass some real 
                        #data, then update cVec and position
                        if t >= len(allData[sens[i]]) - 1:
                            continue
                            
                        if current + comp >= allData[sens[i]][t]:
                            cVec[i] = 1
                            
                            #print "Sensor:" + str(sens[i]) + "   t:" + str(t) + "    len:" + str(len(allData[sens[i]])) + "     CC:" + str(current + comp) + "     end:" + str(allData[sens[i]][-1])
                            
                            #Find the next valid position and update.
                            while (current + comp > allData[sens[i]][t]) and \
                                  (t < len(allData[sens[i]]) - 1):
                                t += 1
                            positions[sens[i]] = t
                        
                #Determine the compressed value for cVec
                if compress:
                    cData.append(compressVector(cVec))
                #Note: when compress is False, cVec is not collected here.
                timeList.append(ct)

        #At the end add a second
        ct += oneSec * comp
    
    cData = numpy.array(cData)
    cData.resize(cData.shape[0], 1)
    timeList = numpy.array(timeList)
    timeList.resize(timeList.shape[0], 1)
    
    return cData, timeList
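
compressVector is referenced in the docstring but not defined here. One plausible encoding (an assumption, not this codebase's actual method) packs the binary sensor vector into a single integer, one bit per sensor:

def compress_vector(cVec):
    value = 0
    for bit in cVec:
        value = (value << 1) | bit
    return value

print(compress_vector([1, 0, 1]))   #5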
Example #11
                            skip = datetime.timedelta(minutes = skipLength))   #tail of the call that builds splits
    
    #Iterate over splits.
    for s in splits:
        print i
        i += 1
        oldSplit = datetime.datetime.strptime(s[0], "%Y-%m-%d %H:%M:%S")
        newSplit = datetime.datetime.strptime(s[1], "%Y-%m-%d %H:%M:%S")
        
        tmpDoc = []
        #Loop over all models
        for f in files:
            #It is a data file.
            if f.split('.')[-1] == 'dat':
                #Open it and grab the models and sensor list
                fn = dataio.loadData(modelLocation + str(f))
                fn.matrixToModel(fn.modelList)
            
                cd, td = bbdata.getdata(oldSplit, newSplit, \
                                    comp = compress, \
                                    sens = fn.sensors,
                                    readLocation = dataLocation)
                
                local = neighborclusters[str(fn.sensors)]
                cd2 = ncluster.convertNeighborhood(cd, local)
                cd2 = numpy.array(cd2, ndmin = 2)
                cd2 = cd2.T
                
                sData = markov_anneal.splitLocalMax(cd2, td, splitLength)
Example #12
"""
make_clusters.py

Author: James Howard

Program used to make clusters for a set of data.  Saves cluster centers and information to 
the same file where data was taken from.
"""

import pycl
import pybb.data.dataio as dataio
        
dataDirectory = "../../runs/real/projected_lunch.data"
writeLocation = "../../runs/real/projected_lunch.data"
        
if __name__ == "__main__":
    
    tmpd = dataio.loadData(dataDirectory)
    data = tmpd.classList
    data = pycl.Dataset(data)

    #Try kmeans.  The centers and clusters are stored in name-mangled private
    #attributes of pycl.Kmeans, hence the _Kmeans__ prefix.
    kmeans = pycl.Kmeans(2)
    kmeans.train(data)
    tmpd.centers = kmeans._Kmeans__centers
    tmpd.clusters = kmeans._Kmeans__clusters
    
    dataio.saveData(writeLocation, tmpd)
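
The _Kmeans__centers access above works because of Python name mangling: attributes with two leading underscores are rewritten to _ClassName__attr. A standalone demonstration:

class Kmeans(object):
    def __init__(self):
        self.__centers = [(0.0, 0.0)]   #stored as _Kmeans__centers

k = Kmeans()
print(k._Kmeans__centers)   #[(0.0, 0.0)]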
Example #13
    #Perform simple clustering, dropping from consideration all data below
    #the mindata threshold.

    sd = datetime.datetime.strptime(startDate, "%Y-%m-%d %H:%M:%S")
    sk = datetime.timedelta(seconds=skip)

    numclusters = int(math.ceil((blockmax - mindata) / (1.0 * blocksize))) + 1

    #For now just group by simple blocks from minData to 1000
    clusters = [[] for i in range(numclusters)]

    for s in range(len(blocks)):
        ct = sd + sk * s

        #Determine cluster
        c = min(int(math.ceil((blocks[s] - mindata) / (1.0 * blocksize))),
                numclusters - 1)

        clusters[c].append((blocks[s], str(ct)))

    return clusters


if __name__ == "__main__":

    data = dataio.loadData(dbLocation)

    blocks = makeBlocks(data, window, skip, startDate, endDate, sensors)
    clusters = clusterBlocks(blocks, skip, startDate)
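
A worked example of the bucketing rule in clusterBlocks, with assumed values mindata=10, blocksize=50, blockmax=210:

import math

mindata, blocksize, blockmax = 10, 50, 210
numclusters = int(math.ceil((blockmax - mindata) / (1.0 * blocksize))) + 1   #5
count = 135
c = min(int(math.ceil((count - mindata) / (1.0 * blocksize))), numclusters - 1)
print(c)   #a block of 135 activations lands in cluster 3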
Example #14
dataDirectory = "../../runs/real/data_min_3.lsa"
projDirectory = "../../runs/real/projected.data"
modelDirectory = "../../runs/real/models_min_3/"

if __name__ == "__main__":

    files = os.listdir(modelDirectory)
    modelNumber = []
    for f in files:
        print f
        #It is a data file.
        if f.split('.')[-1] == 'dat':

            #Open files
            fn = dataio.loadData(modelDirectory + str(f))

            for i in range(len(fn.modelList)):
                modelNumber.append(str(f) + " -- " + str(i))

    data = dataio.loadData(dataDirectory)
    projected = dataio.loadData(projDirectory)

    values = [[] for j in range(len(projected.centers))]

    for j in range(len(projected.centers)):
        for i in range(len(data.pwz)):
            values[j].append(numpy.dot(projected.centers[j], data.pwz[i]))

    for i in range(len(values[0])):
        print str(abs(values[0][i] - values[1][i])) + "   :   " + \