Example #1
# assumes module-level imports: import kmeans; import numpy as np; import numpy.random as npr
def example_1(ndata=int(1e4), dimension=100, dtype=np.float64):
    """
    Basic use: cluster random data.
    """
    data = npr.randn(ndata, dimension).astype(dtype)
    clustering = kmeans.get_clustering(X=data,
                                       n_clusters=60,
                                       algorithm='auto',
                                       verbose=2)
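The returned object is a dict. Judging from the other examples on this page, it carries at least the centroids under 'C', the wall-clock time under 'duration', and in some snippets an 'mse' entry; those key names are inferred from the examples below, not from the library's documentation. A minimal inspection sketch:

# key names 'C' and 'duration' are inferred from the other examples on this page
print("centroids shape :", clustering['C'].shape)  # expected (60, dimension)
print("duration        :", clustering['duration'], "s")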
Example #2
def example_4():
    """
    Compare to the scikit-learn implementation of k-means.
    """

    import sklearn.cluster as skc
    import time
    import numpy as np
    import numpy.random as npr
    import kmeans

    ndata = 50000
    dimension = 10
    ncentroids = 1000
    data = npr.randn(ndata, dimension).astype(np.float64)

    centroids0 = data[0:ncentroids, :]

    t0 = time.time()
    kmeans.get_clustering(X=data,
                          init=centroids0,
                          n_clusters=ncentroids,
                          algorithm='auto',
                          verbose=1,
                          n_threads=1)
    t1 = time.time()

    # note: precompute_distances and n_jobs match an older scikit-learn
    # k_means signature; both were removed in scikit-learn 1.0
    sklearner = skc.k_means(X=data,
                            n_clusters=ncentroids,
                            max_iter=1000,
                            n_init=1,
                            init=centroids0,
                            precompute_distances=False,
                            verbose=True,
                            n_jobs=1,
                            return_n_iter=True,
                            tol=0.0)
    t2 = time.time()

    kmeans_time = t1 - t0
    sklearner_time = t2 - t1

    print "sklearn : ", sklearner_time, " s"
    print "this kmeans: ", kmeans_time, " s"
Example #3
def on_standard():
    import time
    import numpy.random as npr
    import kmeans

    data = npr.randn(20000, 5)
    ndata, dim = data.shape
    data_validation = None

    for ncentroids in [10, 100, 1000]:
        C0 = npr.permutation(data)[0:ncentroids, :]
        print("with #centroids = ", ncentroids)
        print("verbosity \ttime")
        alg = 'simple'
        for verb in [0, 1, 2]:
            t0 = time.time()
            X = kmeans.get_clustering(X=data,
                                      n_clusters=ncentroids,
                                      init=C0,
                                      algorithm=alg,
                                      n_threads=1,
                                      verbose=verb,
                                      X_validation=data_validation,
                                      validation_period=1,
                                      capture_verbose=True)
            print(verb, "\t\t", time.time() - t0)
Example #4
def on_syinNS():
    import time
    import numpy as np
    import numpy.random as npr
    import kmeans

    datafile = None  # path to data: set before running
    data = np.loadtxt(datafile)
    ndata, dim = data.shape
    data_validation = None

    for ncentroids in [10, 100, 1000]:
        C0 = npr.permutation(data)[0:ncentroids, :]
        print("with #centroids = ", ncentroids)
        print("verbosity \ttime")
        alg = 'syinSN'
        for verb in [0, 1, 2]:
            t0 = time.time()
            X = kmeans.get_clustering(X=data,
                                      n_clusters=ncentroids,
                                      init=C0,
                                      algorithm=alg,
                                      n_threads=1,
                                      verbose=verb,
                                      X_validation=data_validation,
                                      validation_period=1,
                                      capture_verbose=True)
            print(verb, "\t\t", time.time() - t0)
Example #5
def dense_data_example():
    """
    Cluster dense data points.
    """

    import random
    import numpy as np
    import numpy.random as npr
    import pyzentas
    npr.seed(1011)
    random.seed(1011)
    K = int(1e2)
    ndata = int(7e3)
    dimension = 5
    data = np.array(1 + npr.randn(ndata, dimension), dtype=np.float64)
    seed = 1011
    z = pyzentas.pyzen(init="kmeans++~1",
                       K=K,
                       metric='l2',
                       energy='quadratic',
                       exponent_coeff=0,
                       max_rounds=20,
                       max_time=1.,
                       max_itok=3.0,
                       seed=seed,
                       nthreads=1,
                       patient=False,
                       with_tests=False,
                       algorithm="clarans",
                       level=3)
    do_vdimap = False
    do_refinement = True
    refinement_algorithm = "yinyang"
    rf_max_rounds = 3000
    rf_max_time = 10000.
    tangerine = z.den(data, do_vdimap, do_refinement, refinement_algorithm,
                      rf_max_rounds, rf_max_time)

    run_eakmeans = False
    if run_eakmeans:
        import sys
        sys.path.append(datapaths.datapaths["eaklibdir"])
        import kmeans
        indices = random.sample(range(ndata), K)
        indices = np.array(indices, dtype=np.uint64)
        indices.sort()
        X = kmeans.get_clustering(X=data,
                                  n_clusters=K,
                                  init=indices,
                                  verbose=1,
                                  n_threads=1,
                                  algorithm="yin-sn")
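If the energy trace is wanted rather than just the final clustering, the go example further down suggests that the dict returned by z.den carries a captured run log under 'output', which pyzentas.get_processed_output can parse. A sketch of lines one might append inside dense_data_example after the z.den call, assuming capture_output=True is also passed to pyzen (key names inferred from that snippet, not from pyzentas documentation):

    # inferred from the go example below: parse the captured run log;
    # "mE" is assumed to be the mean-energy trace, its last entry the final value
    processed = pyzentas.get_processed_output(tangerine['output'])
    print("final mean energy :", processed["mE"][-1])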
Example #6
def example_3():
    """
    Compare selk-ns to the standard algorithm on random data. selk-ns is
    faster, but not by as much as when the data has structure.
    """
    import numpy as np
    import numpy.random as npr
    import kmeans

    algs = ['sta', 'selk-ns']
    times = dict.fromkeys(algs)
    data = npr.randn(50000, 25).astype(np.float64)
    seed = 1011
    for alg in algs:
        clustering = kmeans.get_clustering(X=data,
                                           n_clusters=1000,
                                           algorithm=alg,
                                           verbose=1,
                                           n_threads=1,
                                           seed=seed)
        times[alg] = clustering['duration']

    print "TIMES:"
    for alg in algs:
        print alg, " : ", times[alg]
Example #7
def example_2():
    """
	compare algorithms which may be good in low-d 
	ham, ann, expSN, expNS, syinSN, syinNS, yin,
	on dataset ldfpads.txt (~160000 points in 3 dimensions)
	"""
    data = np.loadtxt('ldfpads.txt')
    print "Data shape : ", data.shape
    seed = npr.randint(100000)
    algs = ['ham', 'ann', 'exp-sn', 'exp-ns', 'syin-sn', 'syin-ns', 'yin']

    times = dict.fromkeys(algs)
    for alg in algs:
        clustering = kmeans.get_clustering(X=data,
                                           n_clusters=1000,
                                           algorithm=alg,
                                           verbose=1,
                                           n_threads=1,
                                           seed=seed)
        times[alg] = clustering['duration']

    print "TIMES:"
    for alg in algs:
        print alg, " : ", times[alg]
def go(X, K, withskl, witheak, withzen):
    """
    X : data
    K : number of clusters
    withskl, witheak, withzen : bools indicating which of scikit-learn,
        eakmeans and zentas to run.
    """
    indices_init = np.arange(K, dtype=np.uint64)
    C_init = X[indices_init]

    results = {}
    if withskl:
        results["skl"] = {}
        from sklearn.cluster import KMeans
        # run until convergence, initialising with scikit-learn's version of
        # k-means++ (see the zentas wiki entry for discussion)
        sklc = KMeans(n_clusters=K,
                      init="k-means++",
                      max_iter=100000000,
                      tol=1e-20,
                      verbose=0,
                      n_init=1)
        tsk0 = time.time()
        sklc.fit(X)
        tsk1 = time.time()
        # mean squared distance from each point to its nearest center
        sklacc = np.sum(
            np.min(np.sum((np.expand_dims(X, axis=1) -
                           np.expand_dims(sklc.cluster_centers_, axis=0))**2,
                          axis=2),
                   axis=1)) / X.shape[0]
        results["skl"]["t"] = tsk1 - tsk0
        results["skl"]["mse"] = sklacc

    if witheak:
        results["eak"] = {}
        sys.path.append(datapaths.datapath["eaklibdir"])
        import kmeans
        teak0 = time.time()
        eak = kmeans.get_clustering(X,
                                    K,
                                    verbose=1,
                                    init="kmeans++",
                                    n_threads=4)
        teak1 = time.time()
        results["eak"]["t"] = teak1 - teak0
        results["eak"]['mse'] = eak["mse"]

    if withzen:
        results["zen"] = {}
        # run with zentas. pipeline here is (1) kmeans++ (2) clarans (3) lloyd.
        z = pyzentas.pyzen(K=K,
                           metric='l2',
                           energy='quadratic',
                           max_itok=10.0,
                           max_time=5.0,
                           max_proposals=K**2,
                           seed=npr.randint(1000),
                           patient=True,
                           nthreads=4,
                           init="kmeans++-4",
                           with_tests=False,
                           capture_output=True,
                           rooted=False)
        tzen0 = time.time()
        tangerine = z.den(X,
                          do_vdimap=True,
                          do_refinement=True,
                          rf_max_rounds=10000000)
        tzen1 = time.time()
        results["zen"]["t"] = tzen0 - tzen1
        results["zen"]["out"] = pyzentas.get_processed_output(
            tangerine['output'])
        results["zen"]['mse'] = results["zen"]["out"]["mE"][-1]

    return results
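A minimal usage sketch for go, with made-up random data and only the scikit-learn branch enabled; it assumes the module-level imports that go relies on (time, numpy, and for the other branches pyzentas and datapaths) are in place:

import time
import numpy as np
import numpy.random as npr

X = npr.randn(5000, 10).astype(np.float64)  # hypothetical data, ndata x dimension
results = go(X, K=50, withskl=True, witheak=False, withzen=False)
print(results["skl"]["t"], "s, mse =", results["skl"]["mse"])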
Example #9
    # excerpt from a benchmarking script: assumes the enclosing module provides
    # time (from time import time), np, kmeans, elki, sa and calculateEnergy
    sizes = [20000, 30000, 40000]  # ,100000, 1000000, 10000000
    D = 2  # number of dimensions
    k = 5  # number of clusters
    reps = 1

    duration_dict = {}  # saves duration for every size
    """ 2. iterate over sizes and do elki """
    for r in range(reps):
        for i in range(len(sizes)):
            """ 2.1 load data from .txt """
            file_name = 'data/samples_' + str(sizes[i]) + '.txt'
            X = np.genfromtxt(file_name, dtype='float64')
            """ 2.2 do fast kmeans for cenroids """
            st = time()
            initial_clustering = kmeans.get_clustering(X=X,
                                                       n_clusters=k,
                                                       algorithm='auto',
                                                       verbose=0)
            initial_clustering_duration = time() - st
            centroids = initial_clustering['C']
            """ 2.3 do elki and save duration and energy in dict """
            best_labels, duration = elki.elki(X, k, centroids)
            #st = time()
            #best_labels = elki_py(X, k, sizes[i], D, centroids)
            #duration = time() - st

            energy = calculateEnergy(X, centroids, best_labels)
            anaObj = sa.analysisObj(file_name, X.shape[0], duration,
                                    initial_clustering_duration, energy)
            duration_dict[sizes[i] + r] = anaObj  # changed sizes[i] -> r
            """ 2.4 draw results (Optional) """
            #if (r == reps-1):
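calculateEnergy is not defined in the excerpt. A plausible reconstruction, assuming it computes the usual k-means energy, i.e. the summed squared distance from each point to its assigned centroid (the name and signature come from the call above; the body is an assumption):

import numpy as np

def calculateEnergy(X, centroids, labels):
    # hypothetical helper: sum of squared distances from each point to the
    # centroid it is labelled with; the real helper may normalise differently
    diffs = X - centroids[np.asarray(labels, dtype=np.intp)]
    return float(np.sum(diffs ** 2))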
Example #10
import kmeans

import numpy as np
import numpy.random as npr

old_seed = npr.randint(100000)

ndata = 10000
dimension = 300
n_clusters = 10
npr.seed(1012)  # fixed seed, so the data below is reproducible

X = npr.randn(ndata, dimension)
C0 = 1.001 * npr.randn(n_clusters, dimension)  # unused below: init="BF" chooses its own initial centers

npr.seed(old_seed)  # re-randomise for the clustering run

bla = kmeans.get_clustering(X=X,
                            n_clusters=n_clusters,
                            init="BF",
                            verbose=1,
                            seed=old_seed)
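To start from the explicit centers C0 instead of the "BF" initialisation, pass the array as init, as the other examples on this page do:

# alternative: explicit initial centers instead of init="BF"
bla = kmeans.get_clustering(X=X,
                            n_clusters=n_clusters,
                            init=C0,
                            verbose=1,
                            seed=old_seed)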
Example #11
import numpy.random as npr
import kmeans

ndata = 6000

dimension = 10
data = npr.randn(ndata, dimension)

# data =  mnist.read_MNIST(dataset = "projected", ndata = ndata, dimension = dimension).astype(np.float64)


ncentroids = 20

C0 = data[0:ncentroids, :]
print("entering clustering")

# TODO: change 'minibatchsize' to GBsize0

# reso = kmeans.get_clustering(X = data, n_clusters = ncentroids, init = C0, algorithm = "simple", n_threads = 1, verbose = 2, minibatchsize = ndata/2, X_validation = data, validation_period = 1, capture_verbose = False, max_iter = 5)

reso = kmeans.get_clustering(
    X=data,
    n_clusters=ncentroids,
    init=C0,
    algorithm="gbsimple",
    n_threads=1,
    verbose=2,
    minibatchsize=ndata // 2,  # integer division: the minibatch size must be an int
    X_validation=data,
    validation_period=1,
    capture_verbose=False,
    max_iter=15,
)