def select_Z(dataset, OPTION_NZ):
    """Return the k-means codebook computed on ``dataset.xtrain``.

    Args:
        dataset: object exposing the training inputs as ``xtrain``.
        OPTION_NZ: second argument handed to ``scipy.cluster.vq.kmeans``
            (the requested number of centroids, or an initial guess).

    Returns:
        The codebook (first element of the pair returned by ``kmeans``).

    Note:
        NumPy's global random generator is re-seeded with a fixed value on
        every call, presumably so the clustering is reproducible.
    """
    from scipy.cluster.vq import kmeans

    np.random.seed(seed=149221)
    # kmeans returns (codebook, distortion); only the codebook is needed.
    return kmeans(dataset.xtrain, OPTION_NZ)[0]
def load_data(seed, ntrain, datasetName, num_inducing):
    """Load one dataset from ``benchmarks.mat`` and build a train/test split.

    Args:
        seed: value used to seed NumPy's global random generator before the
            rows are shuffled.
        ntrain: number of shuffled rows assigned to the training set; the
            remaining rows form the test set.
        datasetName: key of the dataset inside ``benchmarks.mat``.
        num_inducing: second argument passed to ``scipy_kmeans``.

    Returns:
        dict with keys ``Xtrain``, ``Ytrain``, ``Xtest``, ``Ytest`` and ``Z``,
        where ``Z`` is the first value returned by ``scipy_kmeans`` when run
        on the standardised training inputs.
    """
    record = io.loadmat('benchmarks.mat')[datasetName][0, 0]
    inputs = record[0]
    # Labels are stored as +-1; every value other than 1 is mapped to 0.
    targets = np.where(record[1] == 1, 1, 0)

    # Shuffle the row indices reproducibly and cut them into two groups.
    np.random.seed(seed)
    order = np.random.permutation(inputs.shape[0])
    train_rows = order[:ntrain]
    test_rows = order[ntrain:]

    train_inputs = inputs[train_rows]
    test_inputs = inputs[test_rows]

    # Standardise with statistics of the training rows only; columns whose
    # standard deviation is (almost) zero are left unscaled.
    center = train_inputs.mean(0)
    spread = train_inputs.std(0)
    spread = np.where(spread > 1e-6, spread, 1.)
    train_inputs = (train_inputs - center) / spread
    test_inputs = (test_inputs - center) / spread

    centroids, _unused = scipy_kmeans(train_inputs, num_inducing)
    return {
        'Xtrain': train_inputs,
        'Ytrain': targets[train_rows],
        'Xtest': test_inputs,
        'Ytest': targets[test_rows],
        'Z': centroids,
    }
def load_data(seed, ntrain, datasetName, num_inducing):
    """Read a dataset from ``benchmarks.mat``, split it and standardise it.

    The rows are permuted after seeding NumPy's global random generator with
    ``seed``. The first ``ntrain`` permuted rows become the training set and
    the rest the test set. Inputs are standardised with the training mean and
    standard deviation, and ``scipy_kmeans`` is run on the standardised
    training inputs with ``num_inducing`` as its second argument.

    Returns a dict with the keys ``Xtrain``, ``Ytrain``, ``Xtest``, ``Ytest``
    and ``Z`` (the first value returned by ``scipy_kmeans``).
    """
    entry = io.loadmat('benchmarks.mat')[datasetName][0, 0]
    x = entry[0]
    # Stored labels are +-1; convert to 1 / 0 (anything not equal to 1 -> 0).
    y = np.where(entry[1] == 1, 1, 0)

    np.random.seed(seed)
    shuffled = np.random.permutation(x.shape[0])
    itrain = shuffled[:ntrain]
    itest = shuffled[ntrain:]

    # Statistics come from the training rows only.
    raw_train = x[itrain]
    mu = raw_train.mean(0)
    sigma = raw_train.std(0)
    # Guard against division by a (near-)zero standard deviation.
    sigma = np.where(sigma > 1e-6, sigma, 1.)

    def _standardise(rows):
        return (rows - mu) / sigma

    xtrain = _standardise(raw_train)
    xtest = _standardise(x[itest])

    Z, _ignored = scipy_kmeans(xtrain, num_inducing)

    result = dict(Xtrain=xtrain, Ytrain=y[itrain])
    result.update(Xtest=xtest, Ytest=y[itest], Z=Z)
    return result
# Close the input handle and drop the reference before clustering starts.
f.close()
del f
d2 = datetime.now()
# Print whole seconds followed by zero-padded microseconds. Without the
# padding, 1 s + 5000 us would be shown as "1.5000" instead of "1.005000".
# Days are folded into the seconds so long runs are not truncated.
load_time = d2 - d1
print("Loading time was: %d.%06d" % (
    load_time.days * 86400 + load_time.seconds, load_time.microseconds))

# -------------------------------
# Number of clusters, read from the second command-line argument.
K = int(sys.argv[2])

print('Starting clusterig method...')
d1 = datetime.now()
if len(sys.argv) > 3 and '--scipy' in sys.argv:
    print("Scipy kmeans")
    centroids, labels = scipy_kmeans(m, K, minit='points')
else:
    # NOTE(review): `centroids` is only assigned in the scipy branch above;
    # confirm that later code does not rely on it after the OpenCV path.
    print("Opencv kmeans")
    samples = cv.fromarray(m)
    # One 32-bit integer label per row of `samples`, filled in by KMeans2.
    labels = cv.CreateMat(samples.height, 1, cv.CV_32SC1)
    # Alternative termination criterion that was tried previously:
    # crit = (cv.CV_TERMCRIT_EPS + cv.CV_TERMCRIT_ITER, 10, 1.0)
    # Presumably (type, max_iter, epsilon): iteration count only -- confirm.
    crit = (cv.CV_TERMCRIT_ITER, 10, 0)
    cv.KMeans2(samples, K, labels, crit)
d2 = datetime.now()
cluster_time = d2 - d1
print("Elapsed time for %d clusters: %d.%06d" % (
    K, cluster_time.days * 86400 + cluster_time.seconds,
    cluster_time.microseconds))

print('Updating HDF file with results...')
d1 = datetime.now()
# Report the size of every cluster in `clusters`, its silhouette score and
# the time elapsed since `time_1`.
for i, cluster in enumerate(clusters):
    print("Cluster %s %s" % (i, len(cluster.data_objects)))
print("Silhouette score %s" % (cluster_evaluation.silhouette_score(clusters),))
time_2 = time.time()
print("%s seconds" % (time_2 - time_1,))
print("")

# Same report for scipy's k-means, for comparison.
time_1 = time.time()
print("Scipy kmeans")
# NOTE(review): `.ix` and `.as_matrix()` are no longer available in current
# pandas releases; kept unchanged because the intended column selection
# (label-based vs positional) cannot be determined from here.
data_objects = data_objects.ix[:, 0:7].as_matrix()
codebook, distortion = scipy_kmeans(data_objects, k)
# `distortion` is overwritten here by the second value returned from vq().
code, distortion = vq(data_objects, codebook)
for i in range(k):
    # Count the members of cluster i without relying on filter() returning
    # a list, which only holds on Python 2.
    print("Cluster %s %s" % (i, sum(1 for label in code if label == i)))
print("Silhouette score %s" % (silhouette_score(data_objects, code),))
time_2 = time.time()
print("%s seconds" % (time_2 - time_1,))