# imports assumed at module level by these examples :
import numpy as np
import numpy.random as npr
import time
import kmeans

def example_4():
    """
    compare to the scikit-learn implementation of k-means
    """
    import sklearn.cluster as skc
    ndata = 50000
    dimension = 10
    ncentroids = 1000
    data = npr.randn(ndata, dimension).astype(np.float64)
    centroids0 = data[0:ncentroids, :]
    t0 = time.time()
    kmeans.get_clustering(X=data, init=centroids0, n_clusters=ncentroids, algorithm='auto', verbose=1, n_threads=1)
    t1 = time.time()
    sklearner = skc.k_means(X=data, n_clusters=ncentroids, max_iter=1000, n_init=1, init=centroids0, precompute_distances=False, verbose=True, n_jobs=1, return_n_iter=True, tol=0.0)
    t2 = time.time()
    kmeans_time = t1 - t0
    sklearner_time = t2 - t1
    print "sklearn     : ", sklearner_time, " s"
    print "this kmeans : ", kmeans_time, " s"
def example_1(ndata=1e4, dimension=100, dtype=np.float64):
    """
    basic use : cluster random data
    """
    # cast ndata to int, as npr.randn does not accept a float size
    data = npr.randn(int(ndata), dimension).astype(dtype)
    clustering = kmeans.get_clustering(X=data, n_clusters=60, algorithm='auto', verbose=2)
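# A minimal sketch (not part of the original examples) of inspecting the
# dictionary returned by kmeans.get_clustering. The keys 'C' (final
# centroids) and 'duration' (run time) are read elsewhere in these
# examples; any other keys are assumptions and may differ by version.
def example_1_inspect():
    clustering = kmeans.get_clustering(X=npr.randn(2000, 10), n_clusters=20, algorithm='auto', verbose=0)
    print "centroids shape : ", clustering['C'].shape
    print "duration (s)    : ", clustering['duration']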
def on_standard():
    data = npr.randn(20000, 5)
    ndata, dim = data.shape
    data_validation = None
    for ncentroids in [10, 100, 1000]:
        C0 = npr.permutation(data)[0:ncentroids, :]
        print "with #centroids = ", ncentroids
        print "verbosity \ttime"
        alg = 'simple'
        for verb in [0, 1, 2]:
            t0 = time.time()
            X = kmeans.get_clustering(X=data, n_clusters=ncentroids, init=C0, algorithm=alg, n_threads=1, verbose=verb, X_validation=data_validation, validation_period=1, capture_verbose=True)
            print verb, "\t\t", time.time() - t0
def on_syinNS():
    datafile = None  # path to data : must be set before running
    data = np.loadtxt(datafile)
    ndata, dim = data.shape
    data_validation = None
    for ncentroids in [10, 100, 1000]:
        C0 = npr.permutation(data)[0:ncentroids, :]
        print "with #centroids = ", ncentroids
        print "verbosity \ttime"
        alg = 'syinSN'
        for verb in [0, 1, 2]:
            t0 = time.time()
            X = kmeans.get_clustering(X=data, n_clusters=ncentroids, init=C0, algorithm=alg, n_threads=1, verbose=verb, X_validation=data_validation, validation_period=1, capture_verbose=True)
            print verb, "\t\t", time.time() - t0
def dense_data_example():
    """
    cluster dense data points.
    """
    import random
    import sys
    npr.seed(1011)
    random.seed(1011)
    K = int(1e2)
    ndata = int(7e3)
    dimension = 5
    data = np.array(1 + npr.randn(ndata, dimension), dtype=np.float64)
    seed = 1011
    z = pyzentas.pyzen(init="kmeans++~1", K=K, metric='l2', energy='quadratic', exponent_coeff=0, max_rounds=20, max_time=1., max_itok=3.0, seed=seed, nthreads=1, patient=False, with_tests=False, algorithm="clarans", level=3)
    do_vdimap = False
    do_refinement = True
    refinement_algorithm = "yinyang"
    rf_max_rounds = 3000
    rf_max_time = 10000.
    tangerine = z.den(data, do_vdimap, do_refinement, refinement_algorithm, rf_max_rounds, rf_max_time)
    run_eakmeans = False
    if run_eakmeans:
        sys.path.append(datapaths.datapaths["eaklibdir"])
        import kmeans
        indices = random.sample(xrange(ndata), K)
        indices = np.array(indices, dtype=np.uint64)
        indices.sort()
        X = kmeans.get_clustering(X=data, n_clusters=K, init=indices, verbose=1, n_threads=1, algorithm="yin-sn")
def example_3():
    """
    compare selkNS to the standard algorithm on random data.
    selkNS is faster, but not by as much as when the data has structure.
    """
    algs = ['sta', 'selkNS']
    times = dict.fromkeys(algs)
    data = npr.randn(50000, 25).astype(np.float64)
    seed = 1011
    for alg in algs:
        clustering = kmeans.get_clustering(X=data, n_clusters=1000, algorithm=alg, verbose=1, n_threads=1, seed=seed)
        times[alg] = clustering['duration']
    print "TIMES:"
    for alg in algs:
        print alg, " : ", times[alg]
def example_2():
    """
    compare algorithms which may be good in low-d :
    ham, ann, expSN, expNS, syinSN, syinNS, yin,
    on dataset ldfpads.txt (~160000 points in 3 dimensions).
    """
    data = np.loadtxt('ldfpads.txt')
    print "Data shape : ", data.shape
    seed = npr.randint(100000)
    algs = ['ham', 'ann', 'expSN', 'expNS', 'syinSN', 'syinNS', 'yin']
    times = dict.fromkeys(algs)
    for alg in algs:
        clustering = kmeans.get_clustering(X=data, n_clusters=1000, algorithm=alg, verbose=1, n_threads=1, seed=seed)
        times[alg] = clustering['duration']
    print "TIMES:"
    for alg in algs:
        print alg, " : ", times[alg]
def go(X, K, withskl, witheak, withzen):
    """
    X : data
    K : number of clusters
    withskl, witheak, withzen : bools indicating whether to run with them.
    """
    indices_init = np.arange(K, dtype=np.uint64)
    C_init = X[indices_init]
    results = {}

    if withskl:
        results["skl"] = {}
        from sklearn.cluster import KMeans
        # run until convergence, initialise with scikit-learn's special version
        # of k-means++ (see zentas wiki entry for discussion).
        sklc = KMeans(n_clusters=K, init="k-means++", max_iter=100000000, tol=1e-20, verbose=0, n_init=1)
        tsk0 = time.time()
        sklc.fit(X)
        tsk1 = time.time()
        # mean squared distance from each point to its nearest centroid.
        sklacc = np.sum(np.min(np.sum((np.expand_dims(X, axis=1) - np.expand_dims(sklc.cluster_centers_, axis=0))**2, axis=2), axis=1)) / X.shape[0]
        results["skl"]["t"] = tsk1 - tsk0
        results["skl"]["mse"] = sklacc

    if witheak:
        results["eak"] = {}
        sys.path.append(datapaths.datapath["eaklibdir"])
        import kmeans
        teak0 = time.time()
        eak = kmeans.get_clustering(X, K, verbose=1, init="kmeans++", n_threads=4)
        teak1 = time.time()
        results["eak"]["t"] = teak1 - teak0
        results["eak"]["mse"] = eak["mse"]

    if withzen:
        results["zen"] = {}
        # run with zentas. pipeline here is (1) kmeans++ (2) clarans (3) lloyd.
        z = pyzentas.pyzen(K=K, metric='l2', energy='quadratic', max_itok=10.0, max_time=5.0, max_proposals=K**2, seed=npr.randint(1000), patient=True, nthreads=4, init="kmeans++-4", with_tests=False, capture_output=True, rooted=False)
        tzen0 = time.time()
        tangerine = z.den(X, do_vdimap=True, do_refinement=True, rf_max_rounds=10000000)
        tzen1 = time.time()
        # elapsed time (the original had the operands reversed).
        results["zen"]["t"] = tzen1 - tzen0
        results["zen"]["out"] = pyzentas.get_processed_output(tangerine['output'])
        results["zen"]["mse"] = results["zen"]["out"]["mE"][-1]

    return results
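# A minimal usage sketch for go(), assuming the module-level imports
# (time, sys, numpy as np, numpy.random as npr, pyzentas, datapaths)
# are in place. The data here is synthetic and illustrative only.
if __name__ == "__main__":
    X = npr.randn(5000, 4).astype(np.float64)
    results = go(X, K=100, withskl=True, witheak=False, withzen=True)
    for lib in results:
        print lib, " : ", results[lib]["t"], " s, mse = ", results[lib]["mse"]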
sizes = [20000, 30000, 40000]  # ,100000, 1000000, 10000000
D = 2  # number of dimensions
k = 5  # number of clusters
reps = 1
duration_dict = {}  # saves duration for every size

""" 2. iterate over sizes and do elki """
for r in range(reps):
    for i in range(len(sizes)):
        """ 2.1 load data from .txt """
        file_name = 'data/samples_' + str(sizes[i]) + '.txt'
        X = np.genfromtxt(file_name, dtype='float64')

        """ 2.2 do fast kmeans for centroids """
        st = time()
        initial_clustering = kmeans.get_clustering(X=X, n_clusters=k, algorithm='auto', verbose=0)
        initial_clustering_duration = time() - st
        centroids = initial_clustering['C']

        """ 2.3 do elki and save duration and energy in dict """
        best_labels, duration = elki.elki(X, k, centroids)
        #st = time()
        #best_labels = elki_py(X, k, sizes[i], D, centroids)
        #duration = time() - st
        energy = calculateEnergy(X, centroids, best_labels)
        anaObj = sa.analysisObj(file_name, X.shape[0], duration, initial_clustering_duration, energy)
        duration_dict[sizes[i] + r] = anaObj  # changed sizes[i] -> r

        """ 2.4 draw results (Optional) """
        #if (r == reps-1):
import kmeans
import numpy as np
import numpy.random as npr

old_seed = npr.randint(100000)
ndata = 10000
dimension = 300
n_clusters = 10
npr.seed(1012)
X = npr.randn(ndata, dimension)
C0 = 1.001 * npr.randn(n_clusters, dimension)
npr.seed(old_seed)
bla = kmeans.get_clustering(X=X, n_clusters=n_clusters, init="BF", verbose=1, seed=old_seed)
ndata = 6000
dimension = 10
data = npr.randn(ndata, dimension)
# data = mnist.read_MNIST(dataset="projected", ndata=ndata, dimension=dimension).astype(np.float64)
ncentroids = 20
C0 = data[0:ncentroids, :]
print "entering clustering"
# TODO : `change' minibatchsize to GBsize0
# reso = kmeans.get_clustering(X=data, n_clusters=ncentroids, init=C0, algorithm="simple", n_threads=1, verbose=2, minibatchsize=ndata/2, X_validation=data, validation_period=1, capture_verbose=False, max_iter=5)
reso = kmeans.get_clustering(
    X=data,
    n_clusters=ncentroids,
    init=C0,
    algorithm="gbsimple",
    n_threads=1,
    verbose=2,
    minibatchsize=ndata / 2,
    X_validation=data,
    validation_period=1,
    capture_verbose=False,
    max_iter=15,
)