def divide_by_cmd(filename1, filename2, position):
    """Split the rows of two datasets into one CSV file per distinct command.

    Both files are loaded with ``weeklydataset`` and their rows concatenated.
    The value found at index ``position`` of each row is used as the row's
    command; every row is appended to ``<command>_cmd.csv`` (created in the
    current working directory, ``;``-delimited).

    Parameters:
        filename1: path of the first dataset, passed to ``weeklydataset``.
        filename2: path of the second dataset, passed to ``weeklydataset``.
        position: index of the command field inside each row.

    Returns:
        None. The result is the set of CSV files written to disk; some
        diagnostic information is printed along the way.
    """
    first_rows, _ = weeklydataset(filename1, [])
    second_rows, _ = weeklydataset(filename2, [])
    rows = first_rows + second_rows

    # A list (not a lazy map object) so that it can be sliced for the preview.
    commands = [row[position] for row in rows]
    # Single-argument print(...) behaves the same under Python 2 and 3.
    print(commands[0:20])

    unique_cmd_set = set(commands)
    print(unique_cmd_set)

    unique_cmd = list(unique_cmd_set)
    print(unique_cmd)

    handles = []
    writers = {}
    try:
        for cmd in unique_cmd:
            # Binary mode is what the Python 2 csv module expects.
            handle = open(cmd + "_cmd.csv", "wb")
            handles.append(handle)
            writers[cmd] = csv.writer(handle, delimiter=";")

        # Dict lookup replaces the per-row linear search through unique_cmd.
        for row in rows:
            writers[row[position]].writerow(row)
    finally:
        # Close every file that was opened, even if writing failed midway,
        # so buffered rows are flushed and descriptors are released.
        for handle in handles:
            handle.close()
# Example no. 2
# 0
def test():
    """Run a 4-cluster k-means on the ``nlog.csv`` workload and dump the result.

    Loads the dataset with ``weeklydataset``, times a ``kmeans(X, 4)`` call,
    prints the elapsed time, the centroids and the size of each cluster, and
    then writes every cluster to
    ``/home/claudio/Workloads/WmProxyWL/cluster<i>.csv`` (``;``-delimited).
    Each output file starts with a header row ``["Cluster <i>", <size>]``
    followed by the members of that cluster.

    Returns:
        None.
    """
    n_clusters = 4
    out_dir = "/home/claudio/Workloads/WmProxyWL/"

    X, _ = weeklydataset("/home/claudio/Workloads/WmProxyWL/nlog.csv", [])

    start_time = time()
    centroids, clusters = kmeans(X, n_clusters)
    end_time = time()

    # Single-argument print(...) behaves the same under Python 2 and 3.
    print(end_time - start_time)

    print(centroids)
    for index in range(n_clusters):
        print(len(clusters[index]))

    for index in range(n_clusters):
        path = "%scluster%d.csv" % (out_dir, index)
        # The with-block closes each file as soon as its cluster is written;
        # binary mode is what the Python 2 csv module expects.
        with open(path, "wb") as handle:
            results = csv.writer(handle, delimiter=";")
            results.writerow(["Cluster %d" % index, len(clusters[index])])
            results.writerows(clusters[index])
def clusterize():
    """Cluster the combined train and test workloads into 4 groups and dump them.

    Loads ``train.csv`` and ``test.csv`` with ``weeklydataset``, concatenates
    their rows, times a ``kmeans(rows, 4)`` call, prints the elapsed time, the
    centroids and the size of each cluster, and then writes every cluster to
    ``/home/claudio/Workloads/WmProxyWL/cluster<i>.csv`` (``;``-delimited).
    Each output file starts with a header row ``["Cluster <i>", <size>]``
    followed by the members of that cluster.

    Returns:
        None.
    """
    n_clusters = 4
    out_dir = "/home/claudio/Workloads/WmProxyWL/"

    # NOTE(review): the training file lives under /home/work while the test
    # file and all outputs live under /home/claudio -- confirm this is intended.
    train_rows, _ = weeklydataset('/home/work/Workloads/WmProxyWL/train.csv', [])
    test_rows, _ = weeklydataset('/home/claudio/Workloads/WmProxyWL/test.csv', [])
    rows = train_rows + test_rows

    start_time = time()
    centroids, clusters = kmeans(rows, n_clusters)
    end_time = time()

    # Single-argument print(...) behaves the same under Python 2 and 3.
    print(end_time - start_time)

    print(centroids)
    for index in range(n_clusters):
        print(len(clusters[index]))

    for index in range(n_clusters):
        path = "%scluster%d.csv" % (out_dir, index)
        # The with-block closes each file as soon as its cluster is written;
        # binary mode is what the Python 2 csv module expects.
        with open(path, "wb") as handle:
            results = csv.writer(handle, delimiter=";")
            results.writerow(["Cluster %d" % index, len(clusters[index])])
            results.writerows(clusters[index])
'''
Created on Jul 18, 2011

@author: work
'''
#from Pycluster import clustercentroids, kcluster
from kmeans import kmeans
from numpy import matrix, float64 
from thesis.scripts.dataset.dataset import weeklydataset
import matplotlib.pyplot as plt


# Experiment script: run kmeans on the WMproxy command workload for each
# candidate cluster count in K and collect the per-k results.

# Disabled alternative loader call (weeklydataset_shogun on cpu_mod.csv).
#[X, label] = weeklydataset_shogun('/home/work/Projects/EclipseProjects/thesis/Scripts/cpu_mod.csv', [0])
# Load the dataset that is handed to kmeans below; `label` is not used in the
# lines visible here.
X, label = weeklydataset('/media/DATA/Thesis/Workloads/GenericWorkloadModeler/workloads/WMproxy/wmpcommon_cmd.csv', [])
#X = open('/home/work/Projects/EclipseProjects/thesis/Scripts/cpu.csv',)

# Candidate cluster counts; range(2,3) yields only k = 2.
K = range(2,3)

# Per-k result accumulators. In the lines visible here only `error` is
# appended to; the others are presumably leftovers of the disabled Pycluster
# code path below -- TODO confirm before removing.
labels = list()
error = list()
nfound = list()
cdata = list()
cmask = list()
#param = X[5:8]
#parameters = matrix(X)
for k in K:
    # Disabled Pycluster-based implementation (kcluster / clustercentroids).
#    tmplabels, tmperror, tmpnfound = kcluster(parameters, nclusters=k, mask=None, weight=None, transpose=1, npass=1, method='a', dist='e', initialid=None)
#    tmpcdata, tmpcmask = clustercentroids(parameters, None, tmplabels, 'a', 1)
    # NOTE(review): other callers in this file unpack kmeans() as
    # (centroids, clusters); here the first value is stored as an "error" --
    # confirm what kmeans actually returns.
    tmperror, tmp_cluster = kmeans(X, k)
#    labels.append(tmplabels)
    error.append(tmperror)