예제 #1
0
def weeklydataset_sg_ndata(filesource, label_index):
    """Load a CSV request log and split it into feature and label columns.

    Each row's column 1 is converted with ``toepoch`` and ``time.gmtime``;
    columns 5, 6 and 7 are converted to ``float`` in place. Rows are grouped
    by the value returned by ``weekday(year, month, day)`` (presumably
    ``calendar.weekday``, i.e. 0 = Monday -- confirm against the file's
    imports), emitted group after group, and ordered inside each group by
    hour and then minute. Rows with the same hour and minute keep their
    file order.

    Rows that cannot be parsed are skipped and their 1-based line number is
    printed.

    Args:
        filesource: Path of the CSV file (comma separated, ``"`` quoted).
        label_index: Collection of column positions that are labels.

    Returns:
        A ``(vectorDataSet, label)`` tuple of column-major lists.
        ``vectorDataSet[k]`` holds the values of the k-th non-label column
        for every kept row; ``label[k]`` holds the values of the k-th label
        column, counted in ascending column position.
    """
    # One bucket per weekday; entries are ((hour, minute), row) pairs so the
    # sort key does not have to be recomputed from the raw date later.
    weekindex = [[] for _ in range(7)]

    # Width of the last row that parsed successfully. Only valid rows are
    # considered so that a blank or truncated trailing line cannot shrink
    # the number of output columns.
    count = 0

    # "rb" is the mode the Python 2 csv module expects.
    with open(filesource, "rb") as source:
        trainreader = csv.reader(source, delimiter=',', quotechar='"')
        for nline, row in enumerate(trainreader, 1):
            try:
                standarddate = time.gmtime(toepoch(row[1]))
                row[5] = float(row[5])
                row[6] = float(row[6])
                row[7] = float(row[7])
                day = weekday(standarddate[0], standarddate[1], standarddate[2])
                weekindex[day].append(((standarddate[3], standarddate[4]), row))
            except Exception:
                # Best effort: report the offending line and keep going.
                print(nline)
                continue
            count = len(row)

    vectorDataSet = [[] for _ in range(count - len(label_index))]
    label = [[] for _ in range(len(label_index))]
    label_columns = set(label_index)

    for bucket in weekindex:
        # sorted() is stable, so equal (hour, minute) keys keep file order.
        for _, record in sorted(bucket, key=lambda entry: entry[0]):
            labelcounter = 0
            vectorcounter = 0
            for position, value in enumerate(record):
                if position in label_columns:
                    label[labelcounter].append(value)
                    labelcounter += 1
                else:
                    vectorDataSet[vectorcounter].append(value)
                    vectorcounter += 1

    return vectorDataSet, label
def aggregateby10mins_sg_mcmc_ndata(timestamps, numbercluster):
    """Count requests per 10-minute slot of the week in two series and plot them.

    Every timestamp is converted with ``toepoch`` and ``time.gmtime``. The
    slot is chosen from the weekday (``tm_wday``, 0 = Monday), hour and
    minute, giving 144 slots per day and 1008 per week. A timestamp whose
    day of the month is 7 or less is counted in series 0, any other in
    series 1.

    Both series are drawn as a scatter plot (series 0 blue, series 1 red)
    and the figure is saved with ``fig.savefig`` under the name
    ``aggregation_cluster_<numbercluster>`` in PNG format. The lengths of
    the plotted sequences are printed.

    Args:
        timestamps: Iterable of values accepted by ``toepoch``.
        numbercluster: Integer used in the output file name.

    Returns:
        A list of 1008 ``[series0_count, series1_count]`` pairs, ordered
        from Monday 00:00 to Sunday 23:50.
    """
    slots_per_day = 144
    week = [[[0, 0] for _ in range(slots_per_day)] for _ in range(7)]

    for line in timestamps:
        standarddate = time.gmtime(toepoch(line))
        dweek = standarddate[6]
        # Floor division keeps the index an int under true division too.
        slot = standarddate[3] * 6 + standarddate[4] // 10
        series = 0 if standarddate[2] <= 7 else 1
        week[dweek][slot][series] += 1

    # Flatten Monday..Sunday into one week-long sequence of pairs.
    target = [pair for day in week for pair in day]

    x = list(range(len(target)))
    y1 = [pair[0] for pair in target]
    y2 = [pair[1] for pair in target]

    print("len x = %f" % len(x))
    print("len y1 = %f" % len(y1))
    print("len y2 = %f" % len(y2))
    fig = figure()
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.scatter(x, y1, c='b')
    ax1.scatter(x, y2, c='r')
    # NOTE(review): x is a 10-minute slot index, not a minute count.
    ax1.set_xlabel('minute of the week')
    ax1.set_ylabel('Number of requests')
    fig.savefig("aggregation_cluster_%d" % (numbercluster), format='png')

    return target
def aggregateby10mins_sg_ndata_avg(timestamps, numbercluster):
    """Count requests per 10-minute slot of the week and plot the counts.

    Every timestamp is converted with ``toepoch`` and ``time.gmtime`` and
    counted in the slot given by its weekday (``tm_wday``, 0 = Monday),
    hour and minute: 144 slots per day, 1008 per week. Despite the name,
    no averaging is applied; the raw counts are returned.

    The counts are drawn as a scatter plot and the figure is saved with
    ``fig.savefig`` under the name ``aggregation_cluster_<numbercluster>``
    in PNG format.

    Args:
        timestamps: Iterable of values accepted by ``toepoch``.
        numbercluster: Integer used in the output file name.

    Returns:
        A ``(x, target)`` tuple: ``x`` is the list of slot indices
        ``0..1007`` and ``target`` the matching list of request counts,
        ordered from Monday 00:00 to Sunday 23:50.
    """
    slots_per_day = 144
    week = [[0] * slots_per_day for _ in range(7)]

    for line in timestamps:
        standarddate = time.gmtime(toepoch(line))
        dweek = standarddate[6]
        # Floor division keeps the index an int under true division too.
        slot = standarddate[3] * 6 + standarddate[4] // 10
        week[dweek][slot] += 1

    # Flatten Monday..Sunday into one week-long series.
    target = [requests for day in week for requests in day]
    x = list(range(len(target)))

    fig = figure()
    ax1 = fig.add_subplot(1, 1, 1)
    # Plot the flat series so x and y have the same length explicitly.
    ax1.scatter(x, target)
    ax1.axis([0, max(x) + 10, 0, max(target) + 100])
    # NOTE(review): x is a 10-minute slot index, not a minute count.
    ax1.set_xlabel('minute of the week')
    ax1.set_ylabel('Number of requests')
    fig.savefig("aggregation_cluster_%d" % (numbercluster), format='png')

    return x, target
def aggregateby30sec_sg_ndata_avg(timestamps):
    """Count requests per 30-second slot of the week and display the counts.

    Every timestamp is converted with ``toepoch`` and ``time.gmtime`` and
    counted in the slot given by its weekday (``tm_wday``, 0 = Monday),
    hour, minute and half-minute: 2880 slots per day, 20160 per week.
    Despite the name, no averaging is applied; the raw counts are returned.

    The counts are drawn as a scatter plot and shown with ``show()``.

    Args:
        timestamps: Iterable of values accepted by ``toepoch``.

    Returns:
        A ``(x, target)`` tuple: ``x`` is the list of slot indices
        ``0..20159`` and ``target`` the matching list of request counts,
        ordered from Monday 00:00:00 to Sunday 23:59:30.
    """
    slots_per_day = 2880
    week = [[0] * slots_per_day for _ in range(7)]

    for line in timestamps:
        standarddate = time.gmtime(toepoch(line))
        dweek = standarddate[6]
        # Second half of the minute goes to the odd slot. tm_sec may be 60
        # or 61 per the struct_time range, so clamp to stay in the minute.
        half_minute = min(standarddate[5] // 30, 1)
        slot = standarddate[3] * 120 + standarddate[4] * 2 + half_minute
        week[dweek][slot] += 1

    # Flatten Monday..Sunday into one week-long series.
    target = [requests for day in week for requests in day]
    x = list(range(len(target)))

    fig = figure()
    ax1 = fig.add_subplot(1, 1, 1)
    # Plot the flat series so x and y have the same length explicitly.
    ax1.scatter(x, target)
    ax1.axis([0, max(x) + 10, 0, max(target) + 100])
    # NOTE(review): x is a 30-second slot index, not a minute count.
    ax1.set_xlabel('minute of the week')
    ax1.set_ylabel('Number of requests')
    show()

    return x, target