예제 #1
    def train(self, pos, neg, k=7, nofIterations=20, encoding="utf8"):
        """Fit one set of k-means centroids per class and print training accuracy.

        Args:
            pos: Path of the text file whose lines are the positive examples.
            neg: Path of the text file whose lines are the negative examples.
            k: Number of clusters passed to ``kmeans`` for each class.
            nofIterations: Forwarded to ``kmeans`` as ``iterationCount``.
            encoding: Text encoding used to open both files.

        Side effects:
            Stores both centroid sets on the instance (individually and as the
            ``self._clf`` pair) and prints progress messages plus the fraction
            of training samples that ``self._predict`` labels correctly.
        """
        with open(pos, encoding=encoding) as pos_file:
            pos_lines = pos_file.readlines()
        with open(neg, encoding=encoding) as neg_file:
            neg_lines = neg_file.readlines()

        print("representing training data...")
        pos_vectors = np.array([self.representation(line) for line in pos_lines])
        neg_vectors = np.array([self.representation(line) for line in neg_lines])

        self.__cent_pos = kmeans(pos_vectors, k=k, iterationCount=nofIterations)
        self.__cent_neg = kmeans(neg_vectors, k=k, iterationCount=nofIterations)
        self._clf = (self.__cent_pos, self.__cent_neg)

        # A prediction of 1 on a positive sample, or -1 on a negative sample,
        # counts as correct.
        hits = sum(1 for vec in pos_vectors if self._predict(vec) == 1)
        hits += sum(1 for vec in neg_vectors if self._predict(vec) == -1)
        total = len(pos_vectors) + len(neg_vectors)

        accuracy = hits / total
        print("classifier trained")
        print(f"accuracy on training set:{accuracy}")
예제 #2
        def train_function(x, y, model, window_state):
            if not model:
                class Model:
                    centroids = None
                    k = self.k
                    sum_iterations = 0
                    sum_error = 0
                    i = 0
                model = Model()
            if model.centroids is not None and self.incremental:
                [centroids, index, i] = kmeans(x, model.k, model.centroids,
                [centroids, index, i] = kmeans(x, model.k, draw=self.draw,
            model.centroids = centroids
            self.centroids = centroids
            error = evaluate_error(x, centroids, index)

            if self.output:
                print "Error: ", error

            model.sum_iterations += i
            model.sum_error += error
            model.i += 1
            return model
예제 #3
def kmeansMain():
    kmax = 10
    clustering = []
    iterator = 1
    bests = [2, 3, 4, 5]
    for cluster in clustering:
        transposed = np.transpose(cluster)
        mins = []
        maxs = []
        for dim in transposed:
        bestObjs = []
        for k in range(1, kmax + 1):
            objs = []
            for init in range(k):
                centers = []
                for i in range(k):
                    data = []
                    for j in range(len(mins)):
                        data.append(np.random.uniform(mins[j], maxs[j]))
                _, obj = kmeans(cluster, np.array(centers))
        plt.plot(range(1, kmax + 1), bestObjs)
        plt.ylabel("Value of Objective Function")
        plt.title("Cluster " + str(iterator))

        k = bests[iterator - 1]
        centers2 = []
        for i in range(k):
            data2 = []
            for j in range(len(mins)):
                data2.append(np.random.uniform(mins[j], maxs[j]))
        resCenters, _ = kmeans(cluster, np.array(centers2))
        assignments = assign_clusters(cluster, resCenters)
        assigned = []
        assigned2 = []
        for i in range(k):
        for i in range(len(cluster)):
        for arr in assigned:
        colors = ['or', 'ok', 'ob', 'og', 'oy']
        for i in range(k):
            plt.plot(assigned2[i][:, 0], assigned2[i][:, 1], colors[i])
        plt.title("K-means Clusters")
        iterator += 1
예제 #4
파일: enkf.py 프로젝트: fatadama/estimation
def clusterConvergence2Modes(xk, activeMeans):
    """Decide whether the particles in ``xk`` are better described by one or two clusters.

    A two-cluster k-means fit is compared against a single Gaussian fitted to
    all particles.  Every particle's likelihood is evaluated under both
    models, and the model with the larger maximum per-particle likelihood is
    chosen.  AIC values are computed and printed for information only; they
    do not drive the decision.

    Args:
        xk: 2-D array with one particle per column (``xk.shape[0]`` is the
            data dimension, ``xk.shape[1]`` the number of particles).
        activeMeans: Not referenced in the body.

    Returns:
        The ``(idxk, mui)`` tuple produced by ``kmeans.kmeans``: the
        two-cluster result, or a fresh one-cluster result when the unimodal
        model has the higher maximum likelihood.  ``idxk`` is indexed per
        particle and ``mui`` holds one cluster mean per row.
    """
    # evaluate the fit of two means to the data
    testMeans = 2
    (idxk, mui) = kmeans.kmeans(xk.transpose(), testMeans)
    # dimension of the data
    p = xk.shape[0]
    # number of particles
    N = xk.shape[1]
    # NOTE: every cluster covariance is normalised by the total particle
    # count, not by the cluster's own size (kept as in the original).
    coef = 1.0 / (float(N) - 1.0)

    # evaluate the mean and covariance of each cluster
    meansk = np.zeros((p, testMeans))
    Pkkk = np.zeros((p, p, testMeans))
    for k in range(testMeans):
        # indices of all particles assigned to cluster k
        idx = np.nonzero(idxk == k)[0]
        meansk[:, k] = mui[k, :].transpose()
        Pkk = np.zeros((p, p))
        for j in idx:
            dev = xk[:, j] - meansk[:, k]
            Pkk = Pkk + coef * np.outer(dev, dev)
        Pkkk[:, :, k] = Pkk.copy()

    # evaluate the likelihood for each point, under the bimodal assumption
    L2 = np.zeros(N)
    for k in range(N):
        # Score particle k itself (previously a stale index from the
        # covariance loop was used, so all particles shared one column).
        cluster = idxk[k]
        L2[k] = gaussianNormalPdf(xk[:, k], meansk[:, cluster], Pkkk[:, :, cluster])
    # Akaike information criterion for bimodal case
    AIC2 = 2.0 * 4 - 2.0 * math.log(np.max(L2))

    # evaluate the likelihood under the monomodal assumption
    P11 = np.zeros((p, p))
    mux = np.mean(xk, axis=1)
    for k in range(N):
        P11 = P11 + coef * np.outer(xk[:, k] - mux, xk[:, k] - mux)
    L = np.zeros(N)
    for k in range(N):
        L[k] = gaussianNormalPdf(xk[:, k], mux, P11)
    # information criterion for unimodal case
    AIC = 2.0 * 2 - 2.0 * math.log(np.max(L))

    # smaller AIC value is better
    print("AIC1 = %f,AIC2 = %f, L1 = %f, L2 = %f" % (AIC, AIC2, np.max(L), np.max(L2)))
    # The decision uses the raw maximum likelihoods rather than the AIC values.
    if np.max(L) > np.max(L2):
        (idxk, mui) = kmeans.kmeans(xk.transpose(), 1)
    return (idxk, mui)
예제 #5
 def get(self):
     """Fetch the image at the requested URL and compute its dominant colours.

     Reads the request parameters ``url``, ``k`` (default 3) and ``t``
     (default 144), downloads the image, resizes it, decodes the result as
     PNG and passes the flat pixel data to ``kmeans.kmeans``.  The means and
     counts that come back are collected into an ``output`` dict and the
     response content type is set to JSON.

     NOTE(review): within the lines visible here ``output`` is never written
     to the response -- confirm the handler continues past this point.
     """
     url = self.request.get('url')
     k = int(self.request.get('k', default_value=3))
     t = int(self.request.get('t', default_value=144))
     # NOTE(review): ``url`` comes straight from the request and is fetched
     # without validation -- confirm that is acceptable for this deployment.
     result = urlopen(url)
     # Get image
     if (result.getcode() == 200):
         data = result.read()
     # NOTE(review): ``data`` is unbound here when the status code is not 200.
     img = images.Image(data)
     raw_width = img.width
     raw_height = img.height
     width = raw_width
     height = raw_height
     # Resize to max
     if raw_width > MAX and raw_height > MAX:
         if raw_width > raw_height:
             # NOTE(review): width and height are each assigned twice in this
             # branch, and there is no branch for the portrait case; the
             # second pair of assignments looks like it belonged to an
             # ``else`` -- verify against the original source.
             height = MAX
             width = width * MAX / height
             width = MAX
             height = height * MAX / width
     img = images.resize(data, width, height)
     reader = png.Reader(bytes=img)
     # Get pixels
     (width, height, pixels, meta) = reader.read_flat()
     n = PIXEL_SIZE
     (means, counts) = kmeans.kmeans(k, pixels, n, 255, t)
     # ``means`` is sliced in consecutive runs of ``n`` values, one run per
     # entry of ``counts``.
     colors = [hex_string(means[i * n : i * n + n]) for i in range(len(counts))]
     output = {}
     output['colors'] = [{'color': colors[i], 'count': counts[i]} for i in range(len(counts))]        
     output['sum'] = sum(counts)
     self.response.headers['Content-Type'] = 'application/json'
예제 #6
파일: gmm.py 프로젝트: 42niks/GMM
def _init(X, mean, diagonol=False):
	"""Derive initial covariances and mixing weights from a k-means model.

	A ``kmeans(K)`` object is created, its ``means`` attribute is set to
	``mean``, and ``_extract_from_kmeans`` is asked for the mixture
	parameters.  The covariance is passed through ``_fix_sigma`` with the
	module-level tolerance ``def_sigma_tol`` before being returned.

	``X`` and ``diagonol`` are not referenced in the body.

	Returns:
		The ``(sigma, pi)`` pair; the means returned by the extraction
		step are discarded.
	"""
	model = kmeans(K)
	model.means = mean
	_unused_means, raw_sigma, pi = _extract_from_kmeans(model)
	return _fix_sigma(raw_sigma, def_sigma_tol), pi
예제 #7
def vect_quant(input_file_folder, input_file_name, X, means_file, K):
    # print('Launching Thread vect_quant for ', input_file_folder, input_file_name, K)
    f = open(means_file + 'K' + str(K) + '.txt', 'r')
    lines = f.readlines()
    f = None
    w = []
    for point in lines:
        w.append(np.fromstring(point, dtype=float, sep=' '))
    means = np.array(w)
    w = None
    point = None
    lines = None

    friday = kmeans(K)
    friday.means = means
    means = None
    clusters = friday.clusters
    firday = None

    output_folder = input_file_folder + '/vq' + str(K)
    if not os.path.exists(output_folder):
    output_file = open(output_folder + '/' + input_file_name[:-4] + '.txt',
    for c in clusters:
        output_file.write(str(c) + ' ')
예제 #8
def main():
    """Group commuters into cabs and print the optimal total distance and route.

    The input file defaults to ``sample_inputs/<SAMPLE_INPUT_FILE>``; when
    exactly one command-line argument is given, it is used as the path
    instead.
    """
    default_path = "sample_inputs/%s" % SAMPLE_INPUT_FILE
    input_file = sys.argv[1] if len(sys.argv) == 2 else default_path

    # The file describes the commuters, the cabs and the destination.
    commuters, cabs, destination = parse_input_file(input_file)

    # One cluster of commuters per group; its centroid stands in for the
    # virtual centre of that commuter group.
    clusters = kmeans(commuters, cabs)
    groups = []
    for cluster in clusters:
        groups.append(cluster.centroid)

    # Distances between every commuter group and every cab.
    all_distances = get_group_cab_distances(groups, cabs)

    # Best overall assignment of cabs to groups.
    best = optimal_total_distance(all_distances, groups, cabs)
    optimal_distance, optimal_route = best

    # Add the leg from each group to the destination.
    optimal_distance = add_destination_distance(
        optimal_distance, groups, destination)

    print_answer(optimal_distance, optimal_route, clusters, cabs)
예제 #9
def find_k_star(data_points, threshold):
    """ Starting with 1 cluster, finds out the optimum value of k*.

    In each iteration, the value of number of clusters is doubled.
    num_clusters = 1
    prev_cohesion = None
    while True:
        clusters, cohesion = kmeans(data_points, num_clusters)
        if prev_cohesion:
            change_rate = float(abs(cohesion - prev_cohesion)) / (
                prev_cohesion * num_clusters / 2)
            if change_rate < threshold:
        prev_cohesion = cohesion
        num_clusters *= 2

    if num_clusters > len(data_points):
        print len(data_points)

    # perform binary search to find kstar. As num of clusters have already been doubled in previous for loop
    # so start corresponds to num_clusters/4 and end to num_clusters/2
    kstar = binary_search(data_points, num_clusters / 4, num_clusters / 2,

    print kstar
예제 #10
def KmeansCluster(FileName = "",Num = 3):
    """Load a comma-separated numeric file and cluster its rows with ``km.kmeans``.

    Args:
        FileName: Path of the input file.  Every non-blank line must contain
            comma-separated values that ``float`` can parse.
        Num: Number of clusters requested from ``km.kmeans``.

    Returns:
        The ``(centroids, clusterAssment)`` pair returned by ``km.kmeans``.

    Raises:
        ValueError: If a field on a non-blank line is not a valid float.
    """
    # The context manager closes the file even if parsing fails part-way.
    with open(FileName) as fileIn:
        rows = [
            [float(field) for field in line.strip().split(',')]
            for line in fileIn
            if line.strip()  # tolerate blank lines such as a trailing newline
        ]
    dataSet = mat(rows)
    centroids, clusterAssment = km.kmeans(dataSet, Num)
    return centroids,clusterAssment

# FileName = "/Users/nevin47/Desktop/项目/学术/TopTen_Python/DataMiningTopTen/KMeans/Table/FINAL.csv"
# Center,Out = KmeansCluster(FileName,3)
# # print Out
# dataSet = []
# fileIn = open(FileName)
# for line in fileIn.readlines():
#     lineArr = line.strip().split(',')
#     dataSet.append([float(i) for i in lineArr])
# dataSet = mat(dataSet)
# # C = transpose(mat(Center[1]))
# # D = dataSet[1]
# # print D*C
# Result = km.CalVectorDistance(dataSet,Out)
# # print Result[:,0,:]
# SUM = 0
# for i in range(dataSet.shape[0]):
#     SUM += km.CalVectorCoefficient(Result,i,3)
# print SUM/dataSet.shape[0]
# # print km.CalVectorCoefficient(Result,0,3)
예제 #11
def part1_q4(dataset, show_graph=False):
    print(">>> PART 1: QUESTION 4")
    clustering = kmeans(dataset, 2, initCentroids=[('i1', 3, 2), ('i2', 4, 8)], distance_type='Manhattan')
    if show_graph:
예제 #12
 def segment(self):
 	algorithm = self.algorithmComboBox.currentText()
     imgPath = str(self.userpath.text())
     index = self.featureComboBox.currentIndex()
     features = ["INTENSITY","INTENSITY+LOC","RGB","YUV","LM",
     if algorithm == "K-means":
         k = int(self.kText.text())
         iterations = int(self.iterationsText.text())
         epsilon = float(self.epsilonText.text())
         print imgPath,index,k,iterations,epsilon
         org = cv.LoadImageM(imgPath)
         im = kmeans.kmeans(imgPath,features[index],k,iterations,epsilon)
     elif algorithm == "Mean Shift":
         if index == 4:
             QtGui.QMessageBox.information(self, 'Error',
                     'LM is not supported in mean shift')
         print imgPath,features[index]
         org = cv.LoadImageM(imgPath)
         im = meanshift.meanshift(imgPath,features[index])
예제 #13
def part1_q2(dataset, show_graph=False):
    print(">>> PART 1: QUESTION 2")
    clustering = kmeans(dataset, 2, initCentroids=[('i1', 4, 6), ('i2', 5, 4)], distance_type='Euclidean')
    if show_graph:
예제 #14
def kmeans_test(request):
	"""Render the k-means demo page, clustering random 2-D data on a valid POST.

	On a POST with a valid ``kmeansNumSamplesForm``, ``num_samples`` random
	points in two dimensions are generated and clustered with either the
	basic algorithm (form method ``'Basic'``) or bisecting k-means (any
	other method).  On any other request the template is rendered with all
	result values set to ``None``.
	"""
	data = None
	form = kmeansNumSamplesForm()
	grouped_data = None
	clusters = None
	error_list = None
	# Pre-bind these so the context below can be built when there is no
	# valid POST; they were previously unbound on that path (NameError).
	k = None
	sample_size = None
	if request.method=='POST':
		form = kmeansNumSamplesForm(request.POST)
		if form.is_valid():
			sample_size = int(form.cleaned_data['num_samples'])
			k = int(form.cleaned_data['k'])
			# Generate random data
			data = numpy.random.random((sample_size, 2))
			# Calculate kmeans with the selected method; without the
			# ``else`` the basic result was always overwritten.
			if form.cleaned_data['method']=='Basic':
				grouped_data, clusters, error_list = kmeans.kmeans(data,num_clusters=k, min_error=0.01, max_iter=100)
			else:
				grouped_data, clusters, error_list = kmeans.bisecting_kmeans(data,k=k, min_error=0.01, max_iter=50)
	return render_to_response('visualization/kmeans.html', {
		'data': grouped_data,
		'clusters': clusters,
		'error_list': error_list,
		'k': k,
		'sample_size': sample_size,
		}, context_instance=RequestContext(request))
예제 #15
def main():
    """Compare the local ``kmeans`` implementation with scikit-learn on blob data.

    Generates 1000 two-feature samples around three centres, times both
    implementations, prints their final objective values, and builds two
    figures: the objective value per iteration of the local implementation,
    and a side-by-side view of true labels versus final assignments.

    NOTE(review): no ``show``/``savefig`` call is visible in this function --
    confirm the figures are displayed elsewhere.
    """
    import matplotlib.pyplot as plt
    from sklearn.datasets.samples_generator import make_blobs
    n_centers = 3
    X, y = make_blobs(n_samples=1000, centers=n_centers, n_features=2,
                    cluster_std=0.7, random_state=0)

    # Run this K-Means
    import kmeans
    # ``time`` is not imported in this function; presumably a module-level import.
    t0 = time.time()
    y_pred, centers, obj_val_seq = kmeans.kmeans(X, n_centers)
    t1 = time.time()
    print("Final obj val: {}".format(obj_val_seq[-1]))
    print("Time taken (this implementation): {}".format(t1 - t0))

    # Run scikit-learn's K-Means
    from sklearn.cluster import k_means
    t0 = time.time()
    # NOTE(review): this rebinds ``centers`` and ``y_pred``, so the "final
    # centers" and "final assignments" plotted below are scikit-learn's, not
    # the local implementation's -- confirm that is intended.
    centers, y_pred, obj_val = k_means(X, n_centers, random_state=0)
    t1 = time.time()
    print("Final obj val: {}".format(obj_val))
    print("Time taken (Scikit, 1 job): {}".format(t1 - t0))

    # Plot change in objective value over iteration (local implementation only)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(obj_val_seq, 'b-', marker='*')
    fig.suptitle("Change in K-means objective value across iterations")
    ax.set_ylabel("Objective value")

    # Plot data
    from itertools import cycle
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    fig = plt.figure(figsize=plt.figaspect(0.5))  # Make twice as wide to accommodate both plots
    ax = fig.add_subplot(121)
    ax.set_title("Data with true labels and final centers")
    # One colour per true label.
    for k, color in zip(range(n_centers), colors):
        ax.plot(X[y==k, 0], X[y==k, 1], color + '.')

    # Re-derive the initial centres for plotting.
    initial_centers = kmeans.init_centers(X, n_centers, 2) # This is valid because we always use the same random seed.
    # Plot initial centers (magenta)
    for x in initial_centers:
        ax.plot(x[0], x[1], "mo", markeredgecolor="k", markersize=8)

    # Plot final centers (cyan)
    for x in centers:
        ax.plot(x[0], x[1], "co", markeredgecolor="k", markersize=8)

    # Plot assignments; restart the colour cycle so cluster 0 is blue again
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    ax = fig.add_subplot(122)
    ax.set_title("Data with final assignments")
    for k, color in zip(range(n_centers), colors):
        ax.plot(X[y_pred==k, 0], X[y_pred==k, 1], color + '.')

예제 #16
파일: main.py 프로젝트: meshuai/kmeans
def main(filename, k, function):
    """Cluster the integer points in ``filename`` and open the result in gedit.

    Each line of the input must start with two whitespace-separated
    integers (x then y).  The groups and the second value returned by
    ``kmeans.kmeans`` are printed, every group is written to
    ``<filename>cluster`` and drawn as a scatter plot, and the value of
    ``s`` at the group's index is marked with ``'+'``.

    Args:
        filename: Path of the input file; the output path is this plus
            the suffix ``cluster``.
        k: Forwarded to ``kmeans.kmeans`` as its second argument.
        function: Forwarded to ``kmeans.kmeans`` as its third argument.
    """
    pts = []
    with open(filename) as source:
        for line in source:
            fields = line.split()
            pts.append((int(fields[0]), int(fields[1])))

    group, s = kmeans.kmeans(pts, k, function)
    print(group)
    print(s)

    cluster_path = filename + 'cluster'
    # Close (and therefore flush) the output before the editor is started,
    # so the editor never opens a partially written file.
    with open(cluster_path, 'w') as f:
        for each, members in enumerate(group):
            f.write(str(members) + '\n\n')
            cc = numpy.random.rand(10000)
            for i in members:
                matplotlib.pyplot.scatter(i[0], i[1], s=2, color=cc)
            pylab.plot(s[each][0], s[each][1], '+')

    # Pass the arguments as a list so a path containing whitespace stays a
    # single argument (the old string split broke such paths).
    Popen(['gedit', cluster_path])

예제 #17
def main():
    """Run four clusterings and save their labels under ``../resources``.

    The matrix loaded from ``BB.txt`` is clustered with spectral clustering
    and DBSCAN, while the data returned by ``prepareX`` is clustered with
    ``kmeans`` and ``kmeans2``.  Each label set is written to its own text
    file, followed by the ``id2point`` table.
    """
    # The context manager closes the input file once it is loaded; the
    # handle was previously left open.
    with open('../resources/BB.txt', 'r') as finput:
        mat = np.loadtxt(finput, delimiter=' ')

    X = prepareX()
    k = 10
    min_samples = 4
    opt_cutoff = 0.5
    threshold = 500
    reduce_threshold = 600

    est = kmeans(X, k, opt_cutoff)
    est2 = kmeans2(X, opt_cutoff, threshold, reduce_threshold)

    (cluster_label0, id2point) = buildLabels(est, X)
    # The second call rebinds id2point, so the table saved below is the one
    # built from est2.
    (cluster_label01, id2point) = buildLabels(est2, X)
    cluster_label1 = SpectralClustering(k).fit_predict(mat)
    cluster_label1 = buildLabels2(cluster_label1, X)
    cluster_label2 = DBSCAN(min_samples=min_samples).fit_predict(mat)
    cluster_label2 = buildLabels2(cluster_label2, X)

    label_outputs = (
        ('../resources/KMeans.txt', cluster_label0),
        ('../resources/KMeans2.txt', cluster_label01),
        ('../resources/SpectralClustering.txt', cluster_label1),
        ('../resources/DBSCAN.txt', cluster_label2),
    )
    for path, labels in label_outputs:
        np.savetxt(path, labels, fmt='%s', newline='\n', header='', footer='', comments='# ')

    np.savetxt('../resources/ID2Point.txt', id2point, fmt=["%s",]*3, newline='\n')
예제 #18
def clustering():
	# Create pairs for clustering (age/eng/prog/uni)
	pairs = []
	for i, val in enumerate(uni):
		if(val > 0 and prog[i] > 0):
			pairs.append((val, prog[i]))

	# Run k-means
	seeds = [pairs[0], pairs[1], pairs[2]]
	clusters = kmeans.kmeans(pairs, seeds)

	# Show diagram with coloured clusters
	for i in clusters:
		cluster = clusters[i]
		xaxis = []
		yaxis = []
		for pair in cluster:
		plt.plot(xaxis, yaxis, "o")

	plt.xlabel('University years')
	plt.ylabel('Programming skill')
예제 #19
    def __init__(self,
        self.nin = inputs.shape[1]
        self.nout = targets.shape[1]
        self.ndata = inputs.shape[0]
        self.nRBF = nRBF
        self.usekmeans = usekmeans
        self.normalise = normalise

        if usekmeans:
            self.kmeansnet = kmeans.kmeans(self.nRBF, inputs)

        self.hidden = np.zeros((self.ndata, self.nRBF + 1))

        if sigma == 0:
            d = (inputs.max(axis=0) - inputs.min(axis=0)).max()
            self.sigma = d / np.sqrt(2 * nRBF)
            self.sigma = sigma

        self.perceptron = pcn.pcn(self.hidden[:, :-1], targets)
        self.weights1 = np.zeros((self.nin, self.nRBF))
def sc(data_points: ndarray, k: int) -> (ndarray, list):
    sc(data_points, k)

    Run the spectral clustering

    data_points : ndarray
        Data points to be clustered.
    k : int
        number of clusters

    (ndarray, list)
        cc: The centroids of clusters
        aff: The affectation of each node to it's cluster
         if aff[i] = j  then the node i is in the cluster j

    logging.info("Starting Spectral Clustering")
    # Calculate the eigenvectors
    logging.info("Calculating the eigenvectors")
    _, u = la.eigh(data_points, eigvals=(0, k))
    # Run the k-means
    logging.info("Running k-means")
    # cc, aff = cl.vq.kmeans2(u, k)
    cc, aff = kmeans(u, k)
    return cc, aff
예제 #21
def questao21():
    dset = load_dataset('dataset1.csv')
    xo = dset.T[1].astype(float)  # segunda coluna
    x = dset.T[1].astype(float)  # segunda coluna
    yo = dset.T[2].astype(float)  # terceira coluna
    y = dset.T[2].astype(float)  # terceira coluna

    # a normalização com z-score ajudou na visualização e é necessária para agrupamento
    #x = [z_score(x, xi) for xi in x]
    #y = [z_score(y, yi) for yi in y]
    #centros_iniciais = [(z_score(xo, 1), z_score(yo, 2)), (z_score(xo, 4), z_score(yo, 2))]

    centros_iniciais = [(1, 2), (4, 2)]
    pontos = zip(x, y)

    clusters, iteracoes = kmeans(pontos, 2, centros_iniciais=centros_iniciais)

    cluster1 = clusters[0].pontos
    cluster2 = clusters[1].pontos
    plt.plot([xi[0] for xi in cluster1], [yi[1] for yi in cluster1], 'ro')
    plt.plot([clusters[0].centroide[0]], [clusters[0].centroide[1]], 'r*')
    plt.plot([xi[0] for xi in cluster2], [yi[1] for yi in cluster2], 'go')
    plt.plot([clusters[1].centroide[0]], [clusters[1].centroide[1]], 'g*')
    print "Novos centróides:", clusters[0].centroide, " e ", clusters[
예제 #22
파일: main.py 프로젝트: mendola/k-means
def main():
        inputFile = datFile = sys.argv[1]
        k = sys.argv[2]
    except Exception as e:
        print('Oh No! => %s' % e)
        print('Usage:\npython3 ./main.py <data.mat> <k>')

    mat = spio.loadmat(inputFile, squeeze_me=True)
    fname = os.path.splitext(inputFile)
    rawdata = mat[fname[0]]
    dataIn = rawdata[:, (0, 1)]
    mu, clusters = km.kmeans(int(k), 10, 0.00000001, dataIn)

    colors = ["r", "b", "g", "k", "c", "y", "m"]
    for i in range(0, max(clusters) + 1):
        indices = myFind(clusters, lambda x: x == i)
        x = dataIn[indices, 0]
        y = dataIn[indices, 1]
        plt.scatter(x, y, c=colors[i])
        plt.scatter(mu[i, 0], mu[i, 1], c="yellow", marker="*", s=300)
        plt.title("K-means results")
예제 #23
파일: test.py 프로젝트: filipius/colluding
def test_kmeans():
    name = "input-test-kmeans.txt"
    out = kmeans.kmeans(name)
    #print "gmis result = ", out
    #note: all the indexest of the graph above are different from gmis, as here we start at 0,
    #but gmis starts at 1, which means that we have a shift for all nodes
    expectedresult1 = [
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1
    expectedresult2 = [
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2
    failed = False
    for i in xrange(len(expectedresult1)):
        if out[i] != expectedresult1[i]:
            failed = True
    if failed:
        for i in xrange(len(expectedresult2)):
            if out[i] != expectedresult2[i]:
                failed = True
    if failed:
            "Possible problem in test_kmeans(). Might only be a different random output typical of k-means: "
            + str(out) + "\n")
예제 #24
def main():
    """Run k-means 100 times on the Fisher iris data and record each accuracy.

    Loads the features and string labels from CSV files under ``data/``,
    plots the first two feature rows, then repeatedly calls ``kmeans`` and
    scores each result with ``evaluate``.

    NOTE(review): the function ends right after selecting subplot (2, 2, 3)
    in the lines visible here -- confirm the remaining plotting code.
    """
    # Read in the dataset; the transpose puts one sample per column.
    X = np.loadtxt('data/fisher_iris_data.csv', delimiter=',').T
    Y = np.loadtxt('data/fisher_iris_labels.csv', dtype=str)

    # NOTE(review): set iteration order for strings is not stable across
    # interpreter runs, so the label-to-integer mapping below may differ
    # from run to run.
    labels = list(set(Y)) # get unique labels
    num_classes = len(labels)
    Y = [labels.index(y) for y in Y]  # convert labels to integers

    ## Plot dataset
    fig = plt.figure()
    # Both scatter calls draw onto the same current axes; no new figure is
    # created between them in the visible code.
    plt.scatter(X[0, :], X[1, :])
    plt.title('Data without labels')
    plt.scatter(X[0, :], X[1, :], c=Y)
    plt.title('Data with true labels')

    # Axes handed to kmeans via its ax1/ax2 keyword arguments.
    ax1 = plt.subplot(2,2,1)
    ax2 = plt.subplot(2,2,4)
    scatterPlot = 0  # not used in the visible lines
    num_evals = 100
    kmeans_performance = np.zeros(num_evals)
    kmeans_solutions = []  # not used in the visible lines
    for i in range(num_evals):
        print('\nK-Means Run ', i+1)
        cluster_idx = kmeans(X, num_classes, ax1=ax1, ax2=ax2)
        kmeans_performance[i] = evaluate(X, Y, cluster_idx)
        print('Accuracy: ', kmeans_performance[i])

    plt.subplot(2, 2, 3)
예제 #25
파일: mainprog.py 프로젝트: zkytony/446-hw
def sampleClustersGraph():
    data, featureNames = load_2d_data()
    k = 6
    iniCenters = orderedCenters(data, k)
    clusters, centers = kmeans(data, k,
    graphClusters(clusters, centers, data, featureNames)
예제 #26
 def train(self, X):
     Compute K-Means clustering on each class label and store your result in self.cluster_centers_
     :param X: inputs of training data, a 2D Numpy array
     :return: None
     self.cluster_centers_ = kmeans(X, self.k, self.max_iter, self.tol)
예제 #27
파일: rbf.py 프로젝트: vsrz/CS673
    def __init__(self,inputs,targets,nRBF,sigma=0,usekmeans=0,normalise=0):
        """Initialise the RBF network's sizes, Gaussian width and output perceptron.

        Args:
            inputs: 2-D array of training inputs, one sample per row.
            targets: 2-D array of training targets, one sample per row.
            nRBF: Number of radial basis functions (hidden nodes).
            sigma: Gaussian width.  ``0`` means "derive it from the data":
                the largest per-feature range divided by ``sqrt(2 * nRBF)``.
            usekmeans: Truthy to create a ``kmeans.kmeans`` helper for
                ``nRBF`` centres.
            normalise: Stored on the instance; not otherwise used here.
        """
        self.nin = shape(inputs)[1]
        self.nout = shape(targets)[1]
        self.ndata = shape(inputs)[0]
        self.nRBF = nRBF
        self.usekmeans = usekmeans
        self.normalise = normalise

        if usekmeans:
            self.kmeansnet = kmeans.kmeans(self.nRBF,inputs)

        # One extra hidden column; the perceptron below is built on all but
        # the last column.
        self.hidden = zeros((self.ndata,self.nRBF+1))

        if sigma==0:
            # Set width of Gaussians from the data.
            d = (inputs.max(axis=0)-inputs.min(axis=0)).max()
            self.sigma = d/sqrt(2*nRBF)
        else:
            # An explicit width was supplied; without this ``else`` the
            # data-derived width was overwritten with 0.
            self.sigma = sigma
        self.perceptron = pcn.pcn(self.hidden[:,:-1],targets)

        # Initialise network
        self.weights1 = zeros((self.nin,self.nRBF))
def cluster_data(inputpath, outputpath, index_column, name_column,
                 columns_to_load, num_clusters, max_iterations):

    names, input_data = read_and_normalise_csv(inputpath, index_column,
                                               name_column, columns_to_load)
    cluster_map = kmeans(input_data, num_clusters, max_iterations)

    f = None
        f = open(outputpath, 'w')
    except FileNotFoundError:
            f = open(outputpath, 'x')
            print("ERROR: could not load file")
            return (0, [], 0)

    #cluster_ary = sorted(list(zip(names, cluster_map)), key = lambda x: x[1], reverse=True)
    cluster_ary = [[] for c in range(num_clusters)]
    for i in range(len(names)):

    return cluster_ary
 def __init__(self, topic_dict, vocab_list, k = 4):
     """Build the cluster model by running k-means over the converted topics.

     ``topic_dict`` and ``vocab_list`` are converted by the private
     ``__dict2dataInfo`` helper and the result is clustered into ``k``
     groups.  The returned clusters and centroids are stored on the
     instance.  Progress is reported with ``print``.
     """
     print("create cluster model")
     self.k = k
     self.vocab_list = vocab_list
     self.tar_list = self.__dict2dataInfo(topic_dict, vocab_list)
     print("K means ...")
     found_clusters, found_centroids = kmeans.kmeans(self.tar_list, k)
     self.clusters = found_clusters
     self.centroids = found_centroids
예제 #30
def spectral(W, k):
    SPECTRUAL spectral clustering

            W: Adjacency matrix, N-by-N matrix
            k: number of clusters

            idx: data point cluster labels, n-by-1 vector.
    n, n = np.shape(W)
    idx = np.zeros((n ,1))
    D = np.zeros((n, n))
    for i in range(n):
        D[i][i] = np.sum(W[i][:])
    L = D - W
    # begin answer
    eng = np.linalg.eig(L)
    enval = eng[0]
    en = eng[1]
    sort_index = np.argsort(enval)
    topk = en[:,sort_index[:k]]
    idx = kmeans(topk, k)
    return idx
예제 #31
def KmeansCluster(FileName="", Num=3):
    """Load a comma-separated numeric file and cluster its rows with ``km.kmeans``.

    Args:
        FileName: Path of the input file.  Every non-blank line must contain
            comma-separated values that ``float`` can parse.
        Num: Number of clusters requested from ``km.kmeans``.

    Returns:
        The ``(centroids, clusterAssment)`` pair returned by ``km.kmeans``.

    Raises:
        ValueError: If a field on a non-blank line is not a valid float.
    """
    # The context manager closes the file even if parsing fails part-way.
    with open(FileName) as fileIn:
        rows = [
            [float(field) for field in line.strip().split(',')]
            for line in fileIn
            if line.strip()  # tolerate blank lines such as a trailing newline
        ]
    dataSet = mat(rows)
    centroids, clusterAssment = km.kmeans(dataSet, Num)
    return centroids, clusterAssment

# FileName = "/Users/nevin47/Desktop/项目/学术/TopTen_Python/DataMiningTopTen/KMeans/Table/FINAL.csv"
# Center,Out = KmeansCluster(FileName,3)
# # print Out
# dataSet = []
# fileIn = open(FileName)
# for line in fileIn.readlines():
#     lineArr = line.strip().split(',')
#     dataSet.append([float(i) for i in lineArr])
# dataSet = mat(dataSet)
# # C = transpose(mat(Center[1]))
# # D = dataSet[1]
# # print D*C
# Result = km.CalVectorDistance(dataSet,Out)
# # print Result[:,0,:]
# SUM = 0
# for i in range(dataSet.shape[0]):
#     SUM += km.CalVectorCoefficient(Result,i,3)
# print SUM/dataSet.shape[0]
# # print km.CalVectorCoefficient(Result,0,3)
예제 #32
    def getDepht(self, im3d, bbox):
        """Estimate the depth of the image region described by ``bbox``.

        Channel 0 of every pixel inside the inclusive box is collected.
        Two candidates are derived from those values: their mean, and the
        smaller of the two values returned by a 2-cluster ``kmeans`` call.
        Both are passed through the same linear conversion
        (``(value + 18.7579) / 0.5181`` -- presumably a sensor calibration;
        confirm) and printed.  When the candidates differ by more than 10,
        the k-means candidate is returned, otherwise the mean.

        Args:
            im3d: Image indexed as ``im3d[y][x][0]``.
            bbox: Sequence whose first four items are ``x1, y1, x2, y2``.

        Returns:
            The selected converted depth value.
        """
        x1 = int(bbox[0])
        y1 = int(bbox[1])
        x2 = int(bbox[2])
        y2 = int(bbox[3])

        total = 0.0
        num = 0
        filteredImage = {}
        # The debug string that used to be built here was never printed and
        # has been dropped.
        for y in range(y1, y2 + 1):
            for x in range(x1, x2 + 1):
                d = im3d[y][x][0]
                filteredImage[(y, x)] = d
                total += 1.0 * d
                num += 1

        c1, c2 = kmeans(filteredImage, 2, 1000, 320, 240)
        depth1 = total / num
        depth2 = min([c1, c2])
        depth1 = (depth1 + 18.7579) / 0.5181
        depth2 = (depth2 + 18.7579) / 0.5181

        print(depth1)
        print(depth2)

        # Without the ``else`` the result was always depth1 when the branch
        # ran and unbound otherwise.
        if abs(depth1 - depth2) > 10:
            depth = depth2
        else:
            depth = depth1

        return depth
예제 #33
def questao21():
    dset = load_dataset('dataset1.csv')
    xo = dset.T[1].astype(float) # segunda coluna
    x = dset.T[1].astype(float) # segunda coluna
    yo = dset.T[2].astype(float) # terceira coluna
    y = dset.T[2].astype(float) # terceira coluna

    # a normalização com z-score ajudou na visualização e é necessária para agrupamento
    #x = [z_score(x, xi) for xi in x]
    #y = [z_score(y, yi) for yi in y]
    #centros_iniciais = [(z_score(xo, 1), z_score(yo, 2)), (z_score(xo, 4), z_score(yo, 2))]

    centros_iniciais = [(1,2), (4,2)]
    pontos = zip(x, y)

    clusters, iteracoes = kmeans(pontos, 2, centros_iniciais=centros_iniciais)

    cluster1 = clusters[0].pontos
    cluster2 = clusters[1].pontos
    plt.plot([xi[0] for xi in cluster1], [yi[1] for yi in cluster1], 'ro')
    plt.plot([clusters[0].centroide[0]], [clusters[0].centroide[1]], 'r*')
    plt.plot([xi[0] for xi in cluster2], [yi[1] for yi in cluster2], 'go')
    plt.plot([clusters[1].centroide[0]], [clusters[1].centroide[1]], 'g*')
    print "Novos centróides:", clusters[0].centroide, " e ", clusters[1].centroide
예제 #34
    def __init__(self,
        self.nin = shape(inputs)[1]
        self.nout = shape(targets)[1]
        self.ndata = shape(inputs)[0]
        self.nRBF = nRBF
        self.usekmeans = usekmeans
        self.normalise = normalise

        if usekmeans:
            self.kmeansnet = kmeans.kmeans(self.nRBF, inputs)

        self.hidden = zeros((self.ndata, self.nRBF + 1))

        if sigma == 0:
            # Set width of Gaussians
            d = (inputs.max(axis=0) - inputs.min(axis=0)).max()
            self.sigma = d / sqrt(2 * nRBF)
            self.sigma = sigma

        self.perceptron = pcn.pcn(self.hidden[:, :-1], targets)

        # Initialise network
        self.weights1 = zeros((self.nin, self.nRBF))
예제 #35
def main(algorithm, data, cl_labels, min_k, max_k, max_iterations, epsilon):
    """Cluster ``data`` for every k in [min_k, max_k] and plot validation scores.

    Args:
        algorithm: one of ``'kmeans'``, ``'bisecting_kmeans'`` or
            ``'fuzzy_cmeans'``.
        data: samples to cluster; passed unchanged to the algorithm and to
            the metric functions.
        cl_labels: reference labels used by the external indices (adjusted
            Rand, homogeneity, completeness).
        min_k: smallest number of clusters tried (inclusive).
        max_k: largest number of clusters tried (inclusive).
        max_iterations: iteration limit, used by fuzzy c-means only.
        epsilon: third tuning argument of fuzzy c-means, used by it only.

    Raises:
        ValueError: if ``algorithm`` is not one of the supported names.

    The scores are handed to ``utils.plot_results``; nothing is returned.
    """
    if algorithm not in ('kmeans', 'bisecting_kmeans', 'fuzzy_cmeans'):
        # Fail early instead of scoring an empty label list further down.
        raise ValueError("unknown clustering algorithm: %r" % (algorithm,))

    # scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score;
    # use whichever spelling the installed version provides.
    ch_score = getattr(metrics, 'calinski_harabasz_score', None)
    if ch_score is None:
        ch_score = metrics.calinski_harabaz_score

    def _columns(pairs):
        # list(...) keeps this working on Python 3, where zip() is not
        # subscriptable; the result is (all k values, all scores).
        columns = list(zip(*pairs))
        return columns[0], columns[1]

    silhouette, chs, ssws, ssbs, ars, hom, comp = [], [], [], [], [], [], []

    for c in range(min_k, max_k + 1):
        if algorithm == 'kmeans':
            labels, centroids = kmeans.kmeans(data, c)
        elif algorithm == 'bisecting_kmeans':
            labels, centroids = bisecting_kmeans.bisecting_kmeans(data, c)
        else:  # 'fuzzy_cmeans'
            membership, centroids = fuzzyCmeans.execute(data, max_iterations, c, epsilon)
            labels = fuzzyCmeans.get_labels(len(data), membership)

        silhouette.append((c, metrics.silhouette_score(data, labels, metric='euclidean')))
        chs.append((c, ch_score(data, labels)))
        ssws.append((c, utils.get_ssw(data, centroids, labels)))
        ssbs.append((c, utils.get_ssb(centroids)))
        ars.append((c, metrics.adjusted_rand_score(cl_labels, labels)))
        hom.append((c, metrics.homogeneity_score(cl_labels, labels)))
        comp.append((c, metrics.completeness_score(cl_labels, labels)))

    # One entry per plot: (title, pairs, third-from-last field, colour).
    # The numeric field looks like a matplotlib subplot code -- TODO confirm.
    series = [
        ("Silhouette", silhouette, 333, "blue"),
        ("Calinski-Harabaz Index", chs, 334, "blue"),
        ("Intra cluster Variance", ssws, 331, "blue"),
        ("Inter cluster Variance", ssbs, 332, "blue"),
        ("Adjusted Rand Index", ars, 335, "orange"),
        ("Homogeneity", hom, 336, "orange"),
        ("Completeness", comp, 337, "orange"),
    ]
    results = []
    for title, pairs, position, colour in series:
        ks, scores = _columns(pairs)
        results.append((title, "", ks, "", scores, position, colour))

    utils.plot_results(results, algorithm)
예제 #36
 def __init__(self, topic_dict, vocab, k=4):
     """Build the cluster model: convert the topics and run k-means on them.

     Stores ``k``, ``vocab_info`` (the vocabulary together with its number
     of keys), ``tar_list`` (``topic_dict`` converted by the private
     ``__dict2dataInfo`` helper) and the ``clusters`` / ``centroids`` pair
     returned by ``kmeans.kmeans``.
     """
     print("create cluster model")
     self.k = k
     vocab_size = len(list(vocab.keys()))
     self.vocab_info = [vocab, vocab_size]
     self.tar_list = self.__dict2dataInfo(topic_dict)
     print("K means ...")
     self.clusters, self.centroids = kmeans.kmeans(self.tar_list, k)
예제 #37
    def initialize_population(self):
        """Seed the initial population from k-means clusters of random points.

        Random points are clustered into ``self.k`` groups; within each group
        the points are ranked by ``self.cf.fitness`` (ascending) and at most
        ``self.pop_size // self.k`` of them are selected.

        NOTE(review): this copy of the method is incomplete -- the statement
        that adds the selected points to ``initial_population`` and part of
        the padding call are missing (see the notes below).
        """

        points = self.generate_random_array()
        clusters, centroids = kmeans.kmeans(points, self.k)

        initial_population = []

        # Cluster ids are compared against 1..k here, i.e. the labels are
        # expected to be 1-based -- TODO confirm against kmeans.kmeans.
        for i in range(1, self.k + 1):

            tmp_points = points[clusters == i]
            costs = [self.cf.fitness(point) for point in tmp_points]
            # Never take more points than the cluster actually holds.
            max_idx = min(len(costs), self.pop_size // self.k)
            best_indexes = sorted(range(len(costs)),
                                  key=lambda i: costs[i])[:max_idx]
            # NOTE(review): the line below is the tail of a lost statement,
            # presumably one extending initial_population with
            # tmp_points[idx] -- restore from the original source.
                for idx in best_indexes

        if len(initial_population) < self.pop_size:
            # NOTE(review): the np.append call is truncated; its arguments
            # (presumably initial_population plus a generated block of
            # pop_size - len(initial_population) rows) are missing.
            initial_population = np.append(
                    self.pop_size - len(initial_population), self.ind_size),

        return np.array(initial_population)
예제 #38
    def test_kmeans_9(self):
        """With 9 clusters the average IoU on the test dataset is about 0.672."""
        boxes = self.__load_dataset()

        centroids = kmeans(boxes, 9)
        score = avg_iou(boxes, centroids)

        # Compared to two decimal places only.
        np.testing.assert_almost_equal(score, 0.672, decimal=2)
예제 #39
def clusterEigenvectors(k, laplacian, maxIterations):
    """Cluster ``laplacian`` with k-means and return the result.

    Thin wrapper: forwards ``k``, ``laplacian`` and ``maxIterations`` to
    ``kmeans`` in that order and returns whatever it returns.
    """
    return kmeans(k, laplacian, maxIterations)
예제 #40
def spectral(W, k):
    '''
    SPECTRAL spectral clustering

        Input:
            W: Adjacency matrix, N-by-N matrix (np.matrix or 2-D ndarray)
            k: number of clusters (currently unused: the data is always
               split into 2 groups, see below)

        Output:
            idx: data point cluster labels, as returned by kmeans.
    '''
    N = W.shape[0]
    # Degree of every node.  asarray(...).ravel() yields a flat vector for
    # np.matrix input (row sums come back as an N-by-1 matrix) as well as for
    # a plain ndarray (row sums are already 1-D).
    D = np.asarray(np.sum(W, axis=1)).ravel()
    L = np.diag(D) - W
    # Symmetric normalisation: D^-1/2 * L * D^-1/2
    D_ = np.diag(1.0 / np.sqrt(D))
    L = np.dot(np.dot(D_, L), D_)
    value, vector = np.linalg.eig(L)
    # Pair each eigenvalue with its column index and take the second smallest.
    ranked = sorted(zip(value, range(N)), key=lambda pair: pair[0])
    _, b = ranked[1]
    H = np.asarray(vector[:, b]).ravel()
    # Standardise the eigenvector before clustering it.
    H = (H - np.mean(H)) / np.std(H)
    # kmeans receives an N-by-1 column; only this single eigenvector is used,
    # which is why the number of clusters is fixed to 2.
    res = kmeans(H.reshape(-1, 1), 2)
    return res
예제 #41
 # Compute anchor boxes by k-means clustering of normalised box sizes.
 # NOTE(review): this copy is damaged -- the signature is truncated (the body
 # also reads `bboxes_in`, `clusters` and `strip_size`, presumably parameters),
 # the docstring quotes are gone, and the `bboxes.append(` call inside the
 # inner loop is missing.  Restore these from the original source.
 def _get_anchors(self,
                  input_shape=(224, 224),
         @input_shape tuple (h, w)
         @bboxes_in format: [ [[xmin,ymin, xmax, ymax, label],], ]
                     value range: x [0, w], y [0, h]
         @return anchors, format: 10 value tuple
     w = input_shape[1]
     h = input_shape[0]
     # TODO: add position to iou, not only box size
     bboxes = []
     for items in bboxes_in:
         for bbox in items:
                 # (width, height) of the box as a fraction of the input size
                 ((bbox[2] - bbox[0]) / w, (bbox[3] - bbox[1]) / h))
     bboxes = np.array(bboxes)
     self.log.i(f"bboxes num: {len(bboxes)}, first bbox: {bboxes[0]}")
     out = kmeans.kmeans(bboxes, k=clusters)
     iou = kmeans.avg_iou(bboxes, out) * 100
     self.log.i("bbox accuracy(IOU): {:.2f}%".format(iou))
     self.log.i("bound boxes: {}".format(",".join(
         "({:f},{:.2f})".format(item[0] * w, item[1] * h) for item in out)))
     # Rescale each clustered (w, h) in place: back to pixels, then divided
     # by strip_size.
     for i, wh in enumerate(out):
         out[i][0] = wh[0] * w / strip_size
         out[i][1] = wh[1] * h / strip_size
     anchors = list(out.flatten())
     self.log.i(f"anchors: {anchors}")
     ratios = np.around(out[:, 0] / out[:, 1], decimals=2).tolist()
     self.log.i("w/h ratios: {}".format(sorted(ratios)))
     return anchors
예제 #42
def train_model(k=2):
	"""Build a k-means model and report its train/validation RMSE.

	Uses the module-level ``Xtrain``, ``Xval`` and ``Xtest`` data sets.
	Prints the cluster assignments of the training data together with their
	count, then the training and validation errors.  Nothing is returned.

	k -- number of clusters passed to the model as ``n_clusters``.
	"""

	#Train k-Means on the training data
	# NOTE(review): no explicit fit/train call is visible here; confirm that
	# the model is trained by the constructor or inside predict().
	model = kmeans.kmeans(n_clusters=k)

	#Predict back the training ratings and compute the RMSE
	XtrainHat = model.predict(Xtrain,Xtrain)
	tr= model.rmse(Xtrain,XtrainHat)

	#Predict the validation ratings and compute the RMSE
	XvalHat = model.predict(Xtrain,Xval)
	val= model.rmse(Xval,XvalHat)

	#Predict the test ratings and compute the RMSE
	# (computed but not included in the report below)
	XtestHat = model.predict(Xtrain,Xtest)
	te= model.rmse(Xtest,XtestHat)

	#Get the cluster assignments for the training data
	z = model.cluster(Xtrain)
	# The original `print(z) , len(z)` silently dropped len(z) on Python 3;
	# a pre-formatted string prints both values on Python 2 and 3 alike.
	print("%s %s" % (z, len(z)))

	#Get the clusters 
	centers = model.get_centers()

	print("K=%d Errors: %.7f %.7f "%(k,tr,val))
예제 #43
def gmm_init(k, samples):
    """
    init a gauss mixture model for all samples
    using kmeans algorithm

    Returns a list of k entries ``[mean, sigma, weight]`` where ``mean`` is
    the k-means centre, ``sigma`` is the average outer product of the
    deviations from that centre, and ``weight`` is the raw number of samples
    in the cluster, so the
    weights don't sum up to 1
    """
    centers = km.kmeans(k, samples)
    clusters = km.cluster(samples, centers)

    # Shape of one covariance matrix, derived from a single sample.
    shapes = np.shape(np.outer(samples[0], samples[0]))

    #params is a list of (mean, sigma, weight)
    params = [None] * k
    for i in range(k):
        cluster, center = clusters[i], centers[i]
        num_samples = len(cluster)
        deviation = np.zeros(shapes)
        for sample in cluster:
            diff = sample - center
            deviation += np.outer(diff, diff)
        # NOTE: an empty cluster divides by zero here and yields NaN entries.
        deviation /= num_samples
        params[i] = [center, deviation, num_samples]
    return params
def ikmeans(data_points):
    """Run k-means on leading eigenvectors, growing k until a stop test passes.

    Starting with k = 3, the first k eigenvector columns are clustered from
    k random initial centroids plus one extra centroid placed at the column
    averages.  The search stops when the assignment uses exactly k distinct
    clusters, or when k reaches the number of data points.

    Parameters
    ----------
    data_points: np.ndarray
        The data points to be clustered (passed directly to ``la.eigh``, so
        presumably a symmetric matrix -- TODO confirm)

    Returns
    -------
    (np.ndarray, np.ndarray)
        2-D array of centroids
        1-D array of affectation list of data nodes
    """
    n = len(data_points)
    logging.info("Starting the modified Spectral Clustering")
    # Calculate the eigenvectors
    logging.info("Calculating the eigenvectors")
    _, u = la.eigh(data_points)
    k = 3
    cc = aff = None
    # run the iterations
    while True:
        _u = u[:, :k]
        centroids = km.get_random_initial_centroids(_u, k)
        # Extra centroid at the centre of gravity of the projected points.
        cg = np.average(_u, axis=0)
        cg.resize((1, len(cg)))
        centroids = np.concatenate((centroids, cg), axis=0)
        cc, aff = km.kmeans(_u, k, centroids)
        if len(np.unique(aff)) == k or k == n:
            # NOTE(review): the loop had no exit at all in this copy, so the
            # return below was unreachable.  Stopping on this condition is
            # the reading consistent with the `k == n` bound -- confirm
            # against the original source.
            break
        k += 1

    return cc, aff
예제 #45
def MNIST_eval_euclidean(metric_func, numbers=[1,2,3], 
                         nrange=range(10,100,10), num_avg=10):
    """Return metric evaluation on MNIST dataset using Euclidean distance
    on all the algorithms.

    Input:
    metric_func - metric being evaluated
    numbers - digits chosen in MNIST data set
    nrange - range of N's to be tested, number of data points
    num_avg - number of times we cluster the same points and take
              the average, min, and max 

    Output (one [mean, min, max] entry per value in nrange):
    kmedoids_metric - metric computed with K-medoids
    kmeans_metric - metric computed with K-means
    kmeans_sklearn_metric - metric with kmeans from sklearn

    NOTE(review): this copy of the function is incomplete, see the notes in
    the body.
    """
    digits = datasets.load_digits()
    images = digits.images
    kmedoids_metric = []
    kmeans_metric = []
    kmeans_sklearn_metric = []
    for n in nrange:
        # generate true labels
        labels = np.concatenate([[m]*n for m in numbers])

        # n images drawn (with replacement) for each chosen digit; 173 is
        # presumably a lower bound on the per-digit sample count -- TODO confirm.
        # NOTE(review): the closing `])` of this call is missing.
        data = np.concatenate([
          images[np.where(digits.target==i)][np.random.choice(range(173), n)] 
          for i in numbers
        # Flatten each image to a 64-element vector.
        data2 = data.reshape(len(data), 64)

        m1 = []; m2 = []; m3 = [];
        for i in range(num_avg):
            # our algorithms
            # NOTE(review): the kmedoids call is truncated; its remaining
            # argument(s) and closing parenthesis are missing.
            j1, _ = kmedoids.kmedoids(len(numbers),
            j2, _ = kmeans.kmeans(len(numbers), data2, distance.euclidean)
            # sklearn k-means
            km = KMeans(len(numbers))
            j3 = km.fit(data2).labels_
            a = metric_func(labels, j1)
            b = metric_func(labels, j2)
            c = metric_func(labels, j3)
            # NOTE(review): a, b, c are never stored in m1, m2, m3, so the
            # statistics below would be taken over empty lists.
        kmedoids_metric.append([np.mean(m1), np.min(m1), np.max(m1)])
        kmeans_metric.append([np.mean(m2), np.min(m2), np.max(m2)])
        kmeans_sklearn_metric.append([np.mean(m3), np.min(m3), np.max(m3)])
    return kmedoids_metric, kmeans_metric, kmeans_sklearn_metric
예제 #46
def gauss_eval(dist_matrix_kmedoids, dist_func_kmeans, metric_func, 
               nrange=range(10,100,10), num_avg=5):
    """Return metric evaluation on gaussian dataset against N.
    Compare K-medoids and K-means.

    Input:
    dist_matrix_kmedoids - function to generate the distance matrix for
                           K-medoids from the data
    dist_func_kmeans - distance function to be used in kmeans
    metric_func - metric function being evaluated
    nrange - range of N's to be tested, number of data points
    num_avg - number of times we cluster the same points and take
              the average, min, and max 

    Output (one [mean, min, max] entry per value in nrange):
    kmedoids_metric - metric computed with K-medoids
    kmeans_metric - metric computed with K-means
    kmeans_sklearn_metric - metric with kmeans from sklearn
    """
    kmedoids_metric = []
    kmeans_metric = []
    kmeans_sklearn_metric = []
    k = 3
    # we generate data with n points in each cluster and evaluate 
    # the algorithms
    for n in nrange:
        data = np.concatenate((
            np.random.multivariate_normal([0, 0], [[4,0], [0,1]], n),
            np.random.multivariate_normal([3, 5], [[1,0.8], [0.8,2]], n),
            np.random.multivariate_normal([-2, 3], [[0.5,0], [0,0.5]], n),
        ))
        # True labels: n points for each of the three gaussians, in order.
        labels = np.concatenate([[m]*n for m in range(3)])

        m1 = []
        m2 = []
        m3 = []
        for i in range(num_avg):
            j1, _ = kmedoids.kmedoids(k, dist_matrix_kmedoids(data))
            j2, _ = kmeans.kmeans(k, data, dist_func_kmeans)
            j3 = KMeans(k).fit(data).labels_

            # Record every run; without this the statistics below were
            # computed over empty lists and np.min([]) raised.
            m1.append(metric_func(labels, j1))
            m2.append(metric_func(labels, j2))
            m3.append(metric_func(labels, j3))

        kmedoids_metric.append([np.mean(m1), np.min(m1), np.max(m1)])
        kmeans_metric.append([np.mean(m2), np.min(m2), np.max(m2)])
        kmeans_sklearn_metric.append([np.mean(m3), np.min(m3), np.max(m3)])
    return kmedoids_metric, kmeans_metric, kmeans_sklearn_metric
예제 #47
def dominant_colors(path, k):
    """Return the ``k`` k-means centres of the image stored at ``path``.

    Each stage (opening the image, converting it with ``points``, running
    ``kmeans``) runs inside its own ``timer`` context.
    """
    with timer("Image loaded: {}"):
        img = Image.open(path)

    with timer("Transform points: {}"):
        pixels = points(img)

    with timer("Calculate centers: {}"):
        result = kmeans(pixels, k)

    return result
예제 #48
 def get_dominant_colors(self):
     """Return the dominant colours of ``self.wallpaper``.

     The image is shrunk to fit within 300x300, turned into points by
     ``_get_points_from_image`` and clustered into ``self.k`` groups; each
     value returned by k-means is converted with ``rgb_to_hex``.
     """
     img = Image.open(self.wallpaper)
     img.thumbnail((300, 300))  # Resize to speed up python loop.
     points = self._get_points_from_image(img)
     rgbs = kmeans.kmeans(points, self.k)
     return [self.rgb_to_hex(rgb) for rgb in rgbs]
예제 #49
파일: test.py 프로젝트: hellokang/kmeans
def test_single_point():
    """One point clustered with k=1 must come back as the only mean."""
    value = [0, 10, 20]
    # Each entry is a [coordinates, 1] pair (the 1 is presumably a weight).
    points = [
        [value, 1]
    ]
    k = 1
    means = kmeans(points, k)
    assert 1 == len(means)
    assert value == means[0]
예제 #50
파일: GMM.py 프로젝트: Alan215/ml
def GMM(K, data, stop_times):
    """Fit a K-component Gaussian mixture to ``data`` with EM and plot it.

    ``data`` is indexed as (num samples, dim).  Means and covariances are
    initialised from the clusters returned by ``kmeans.kmeans(K, data, 100)``.

    NOTE(review): this copy is incomplete -- the loop header around the E/M
    steps (presumably iterating while ``times`` is below ``stop_times``) is
    missing, and ``out`` is never filled before it is plotted.  The code is
    Python 2 (print statements).
    """
    num = data.shape[0]
    dim = data.shape[1]

    clusters = kmeans.kmeans(K, data, 100)

    # Initial parameters: per-cluster mean and covariance.
    mul = np.zeros((K, dim))
    cov = np.zeros((K, dim, dim))
    for i in range(K):
        mul[i] = np.mean(clusters[i],axis=0)
        cov[i] = np.cov(clusters[i].T)

    #latent variable
    z = np.zeros((num, K))

    times = 0
    # print data[999]
    # NOTE(review): the enclosing loop header for the block below is missing.
        p = np.zeros((num, K))
        #E step
        for i in range(num):
            for j in range(K):
                # Gaussian density of sample i under component j, without the
                # constant factor and without a mixing weight.
                # NOTE(review): under Python 2 integer division `-1/2` is -1,
                # not -0.5, unless true division is enabled -- verify.
                p[i, j] = np.exp(-1/2*(data[i]-mul[j]).dot(inv(cov[j])).dot((data[i]-mul[j]).T))*det(cov[j])**(-1/2)
                # print p[i,j]

            # Normalise so the responsibilities of sample i sum to 1.
            for j in range(K):
                z[i,j] = p[i, j]/np.sum(p[i, :])
        #M step
        # Responsibility-weighted mean of every component.
        for j in range(K):
            tmp = np.zeros((1,dim))
            for i in range(num):
                tmp += z[i,j]*data[i]
            mul[j] = tmp/np.sum(z[:,j])
        # Responsibility-weighted covariance around the updated means.
        for j in range(K):
            tmp = np.zeros((dim,dim))
            for i in range(num):
                tmp += z[i,j]*np.outer((data[i]-mul[j]),(data[i]-mul[j]))
            cov[j] = tmp/np.sum(z[:,j])
        times += 1
        # print times
    print mul, cov

    # NOTE(review): `out` stays empty here, so out[i] below cannot succeed;
    # the code computing the per-component surfaces is missing.
    out = []
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    for i in range(K):
        ax.plot_trisurf(data[:,0], data[:,1], out[i], color=np.random.rand(50))
    # ax.set_xlim(-40, 40)
    # ax.set_ylim(-40, 40)
    # ax.set_zlim(-100, 100)

 def do_analysis(self):
     """Create the starting scheme and run PhyML with per-site likelihoods on it.

     A scheme named "start_scheme" is built with one index per entry of
     ``self.cfg.user_subsets``.  For every element of that scheme the
     alignment is written, PhyML is run under GTR with ``--print_site_lnl``,
     and the resulting likelihood file is parsed.

     NOTE(review): the parsed likelihoods are overwritten on each iteration
     and never used or returned, so this copy looks truncated.  The code is
     Python 2 (print statements).
     """
     partnum = len(self.cfg.user_subsets)
     start_description = range(partnum)
     log.info("Performing subset splitting using kmeans")
     # Create the first scheme
     start_scheme = scheme.create_scheme(self.cfg, "start_scheme", start_description)
     for i in start_scheme:
         # Save the alignment path
         i.make_alignment(self.cfg, self.alignment)
         phylip_file = i.alignment_path
         print phylip_file
         # Add option to output likelihoods, *raxml version takes more 
         # modifying of the commands in the analyse function
         phyml.analyse("GTR", str(phylip_file), "./analysis/start_tree/filtered_source.phy_phyml_tree.txt", "unlinked", "--print_site_lnl")
         # File name built from the alignment path; presumably where PhyML
         # writes the site likelihoods -- TODO confirm.
         phyml_lk_file = str(phylip_file) + "_phyml_lk_GTR.txt"
         likelihood_dictionary = kmeans.phyml_likelihood_parser(phyml_lk_file)
     print start_scheme
예제 #52
def runBenchmark(N=200, M=10000, K=3, usePCA=False, n_components=2):
	"""Time k-means (optionally after PCA) on a random subsample and score it.

	N individuals are sampled without replacement from the module-level
	``indivs_`` / ``genoArr_`` and truncated to their first M SNPs.  With
	``usePCA`` the genotype array is replaced by the result of
	``PCA_nocluster.pca_transform`` with ``n_components`` components before
	clustering.  K-means with K clusters is then timed (average of 10 runs)
	and run 10 more times to measure cluster quality.

	Returns a tuple ``(kmeansObj, majFrac_avg, majFrac_std, kmeansTime,
	majPops, majFracs, pcaTime)``.  ``kmeansObj``, ``majPops`` and
	``majFracs`` come from the last quality run only; ``majFrac_avg`` and
	``majFrac_std`` are taken over the 10 per-run averages; ``pcaTime`` is 0
	when PCA is not used.
	"""

	# note that order of clusters will vary from run to run, so track which true pop with each cluster
	#	along with true frac count

	# sample N indivs without replacement, and also get first M snps for each geno
	indices = random.sample(range(len(genoArr_)), N)
	# Deep copies, so truncating geno and rewriting j leaves indivs_ untouched.
	indivs_copy = np.array([copy.deepcopy(indivs_[i]) for i in indices])
	for i in range(N):
		indivs_copy[i].geno = np.array(indivs_copy[i].geno[:M])
		indivs_copy[i].j = i 		# also update position in new indiv list

	genoArr_copy = np.array([genoArr_[i][:M] for i in indices])

	### TIMING ###
	pcaTime = 0
	def pca_i(): # zero input fxn for timeit
		return PCA_nocluster.pca_transform(indivs_copy, genoArr_copy, n_components)

	if usePCA:
		#print("timing pca...") # test 1 runs. genoArr_copy isn't changed from run-to-run
		#(indivs_copy does get changed, but shouldn't affect run since it restarts each time)
		# unlike kmeans, pca is deterministic so runtime shouldn't vary. also pca step is slower, bottleneck.
		# NOTE(review): this binds a second name to the same array (no copy is
		# made) and the name is not used again in this function.
		genoArr_copy_geno = genoArr_copy  # make a copy of genotype data first (as opposed to components)
		pcaTime = timeit.timeit(pca_i, number = 2)/2.0 

		# get components for genoArr_copy for kmeans (both timing and quality runs)
		# pca deterministic, so don't need to rerun for each quality trial
		pcaObj, genoArr_copy = PCA_nocluster.pca_transform(indivs_copy, genoArr_copy, n_components)

	# Defined after the PCA step; the closure reads genoArr_copy when called,
	# so it sees the PCA components when usePCA is set.
	def kmeans_i(): # zero input fxn for timeit
		return kmeans.kmeans(indivs_copy, genoArr_copy, K)

	# timing kmeans, run 10x per data pt, avg
	#print("timing k-means 10x...")
	kmeansTime = timeit.timeit(kmeans_i, number = 10)/10.0 

	### QUALITY ###
	majFracAvgsByRun = np.zeros(10)

	for run in range(10):
		#a = np.asarray_chkfinite(indivs_copy)
		#a = np.asarray_chkfinite(genoArr_copy)
		centers = kmeans.kmeans(indivs_copy, genoArr_copy, K, maxIter = 1000, verbose = False)
		kmeansObj = kmeans.kmeansObj(indivs_copy, centers)
		# majorityPop is given only indivs_copy and K, so it presumably reads
		# cluster assignments that kmeans stored on the individuals -- TODO confirm.
		majPops, majFracs, clusterSizes = majorityPop(indivs_copy, K)
		majFracAvgsByRun[run] = 1.0*sum(majFracs)/K  # unweighted avg across clusters

	majFrac_avg = np.mean(majFracAvgsByRun)
	majFrac_std = np.std(majFracAvgsByRun)

	# pcaTime is 0 if pca isn't used
	return kmeansObj, majFrac_avg, majFrac_std, kmeansTime, majPops, majFracs, pcaTime
예제 #53
파일: enkf.py 프로젝트: fatadama/estimation
def clusterConvergence2ModesL(xk, ym, Rk, yk):
    """Choose between a one-mode and a two-mode Gaussian fit of an ensemble.

    For every particle the product of the measurement likelihood and the
    prior density is evaluated, once with a single Gaussian fitted to all
    particles and once with one Gaussian per k-means cluster (k = 2).  The
    bimodal fit is chosen only if its best particle scores strictly higher.

    Args:
        xk: particle states, shape (p, Np) -- one column per particle.
        ym: measurement vector of length d.
        Rk: covariance passed to ``gaussianNormalPdf`` for the measurement
            residual (presumably the measurement noise covariance).
        yk: expected measurements, one column per particle.

    Returns:
        ``(1, idxk, mu1, Pxx)`` for the unimodal fit, with ``idxk`` all zeros,
        or ``(2, idxk, mux2, Pxx2)`` for the bimodal fit, where ``idxk`` holds
        the k-means cluster index of each particle, ``mux2`` is (2, p) and
        ``Pxx2`` is (2, p, p).
    """
    Np = xk.shape[1]
    p = xk.shape[0]
    d = ym.shape[0]
    # evaluate the unimodal fit
    # compute the mean
    mu1 = np.mean(xk, axis=1)
    # compute the covariance (sized from the state dimension rather than a
    # hard-coded 2x2)
    coef = 1.0 / (float(Np) - 1.0)
    Pxx = np.zeros((p, p))
    for k in range(Np):
        Pxx = Pxx + coef * np.outer(xk[:, k] - mu1, xk[:, k] - mu1)
    Ly1 = np.zeros(Np)
    for k in range(Np):
        yexp = yk[:, k]
        # compute the PDF of y given xk[:,k]
        pyx = gaussianNormalPdf(ym - yexp, np.zeros(d), Rk)
        # compute the PDF of xk[:,k]
        px = gaussianNormalPdf(xk[:, k], mu1, Pxx)
        Ly1[k] = pyx * px

    # evaluate the bimodal fit
    Ly2 = np.zeros(Np)
    Pxx2 = np.zeros((2, p, p))
    mux2 = np.zeros((2, p))
    (idxk, mui) = kmeans.kmeans(xk.transpose(), 2)
    for jk in range(2):
        idx = np.nonzero(idxk == jk)[0]
        N2 = len(idx)
        # A cluster with fewer than two particles has no sample covariance
        # (the 1/(N2-1) factor divides by zero for N2 == 1), so the bimodal
        # fit is rejected outright: zero Ly2 and stop.
        if N2 < 2:
            Ly2 = np.zeros(Np)
            break
        # compute the mean and covariance for the jkth mode
        coef = 1.0 / (float(N2) - 1.0)
        mu2 = np.mean(xk[:, idx], axis=1)
        mux2[jk, :] = mu2
        Px2 = np.zeros((p, p))
        for k in idx:
            Px2 = Px2 + coef * np.outer(xk[:, k] - mu2, xk[:, k] - mu2)
        Pxx2[jk, :, :] = Px2.copy()
        for k in idx:
            yexp = yk[:, k]
            # compute the PDF of y given xk[:,k]
            pyx = gaussianNormalPdf(ym - yexp, np.zeros(d), Rk)
            # compute the PDF of xk[:,k] under this mode.  The original
            # multiplied by the stale unimodal `px` left over from the loop
            # above instead of this value.
            px1 = gaussianNormalPdf(xk[:, k], mu2, Px2)
            Ly2[k] = pyx * px1
    print("L1 = %g, L2 = %g" % (Ly1.max(), Ly2.max()))
    if not (Ly2.max() > Ly1.max()):
        idxk = np.zeros(Np)
        return (1, idxk, mu1, Pxx)
    return (2, idxk, mux2, Pxx2)
예제 #54
파일: test.py 프로젝트: hellokang/kmeans
def test_two_points():
    """Two equally weighted points with k=1 yield their midpoint."""
    real_mean = [10, 10, 10]
    # Each entry is a [coordinates, 1] pair (the 1 is presumably a weight).
    points = [
        [(0, 0, 0), 1],
        [(20, 20, 20), 1]
    ]
    k = 1
    means = kmeans(points, k)
    assert 1 == len(means)
    assert real_mean == means[0]
예제 #55
파일: test.py 프로젝트: hellokang/kmeans
def test_two_points_with_weights():
    """With k=1 the mean is pulled towards the point with the larger weight."""
    # (0*1 + 30*2) / (1 + 2) == 20 on every axis.
    real_mean = [20, 20, 20]
    points = [
        [(0, 0, 0), 1],
        [(30, 30, 30), 2]
    ]
    k = 1
    means = kmeans(points, k)
    assert 1 == len(means)
    assert real_mean == means[0]
예제 #56
def KmeansCluster(FileName = "",Num = 3):
    """Load a comma-separated numeric file and cluster its rows with k-means.

    FileName -- path of the text file; every non-blank line is one sample
                made of comma-separated floats.
    Num      -- number of clusters.

    Returns the ``(centroids, clusterAssment)`` pair produced by
    ``km.kmeans``.
    """
    dataSet = []
    # `with` closes the file even if a line fails to parse.
    with open(FileName) as fileIn:
        for line in fileIn:
            stripped = line.strip()
            if not stripped:
                # A blank line would otherwise crash on float('').
                continue
            dataSet.append([float(i) for i in stripped.split(',')])
    dataSet = mat(dataSet)
    centroids, clusterAssment = km.kmeans(dataSet, Num)
    return centroids,clusterAssment
def clustering_map(cityMap, k):
    """
    cluster the city map into k clusters.

    Every vertex ``v`` in ``cityMap.rv`` becomes a ``kmeans.Point`` built
    from the first two components of ``cityMap.pos[v]`` and tagged with
    ``v``; the points are then clustered with a cutoff of 0.5.
    """
    # convert all the cityMap vertices into k-means structure
    points = [kmeans.Point([cityMap.pos[v][0], cityMap.pos[v][1]], v) for v in cityMap.rv]

    # Cluster those data!
    opt_cutoff = 0.5
    # NOTE(review): the result is neither returned nor used in this copy of
    # the function; the remainder of the original body appears to be missing.
    clusters = kmeans.kmeans(points, k, opt_cutoff)
예제 #58
파일: test.py 프로젝트: hellokang/kmeans
def test_two_points_two_centers():
    """Two distant points with k=2 must each end up as one of the means."""
    values = [
        [0, 10, 20],
        [-100, -400, -1600]
    ]
    # Each entry is a [coordinates, 1] pair (the 1 is presumably a weight).
    points = [
        [value, 1] for value in values
    ]
    k = 2
    means = kmeans(points, k)
    assert 2 == len(means)
    for value in values:
        assert value in means