def train(self, pos, neg, k=7, nofIterations=20, encoding="utf8"): with open(pos, encoding=encoding) as fpos: tweets_pos = fpos.readlines() with open(neg, encoding=encoding) as fneg: tweets_neg = fneg.readlines() print("representing training data...") tweets_pos = np.array( [self.representation(tweet) for tweet in tweets_pos]) tweets_neg = np.array( [self.representation(tweet) for tweet in tweets_neg]) print("fitting...") self.__cent_pos = kmeans(tweets_pos, k=k, iterationCount=nofIterations) self.__cent_neg = kmeans(tweets_neg, k=k, iterationCount=nofIterations) ntotal = len(tweets_pos) + len(tweets_neg) ncorrect = 0 self._clf = (self.__cent_pos, self.__cent_neg) self._store_clf() for t in tweets_pos: if self._predict(t) == 1: ncorrect += 1 for t in tweets_neg: if self._predict(t) == -1: ncorrect += 1 accuracy = ncorrect / ntotal print("classifier trained") print(f"accuracy on training set:{accuracy}")
def train_function(x, y, model, window_state):
    if not model:
        class Model:
            centroids = None
            k = self.k
            sum_iterations = 0
            sum_error = 0
            i = 0
        model = Model()
    if model.centroids is not None and self.incremental:
        [centroids, index, i] = kmeans(x, model.k, model.centroids,
                                       draw=self.draw, output=self.output)
    else:
        [centroids, index, i] = kmeans(x, model.k,
                                       draw=self.draw, output=self.output)
    model.centroids = centroids
    self.centroids = centroids
    error = evaluate_error(x, centroids, index)
    if self.output:
        print("Error: ", error)
    model.sum_iterations += i
    model.sum_error += error
    model.i += 1
    return model
def kmeansMain(): kmax = 10 clustering = [] clustering.append(np.load('hw2_data/kmeans/clustering1.npy')) clustering.append(np.load('hw2_data/kmeans/clustering2.npy')) clustering.append(np.load('hw2_data/kmeans/clustering3.npy')) clustering.append(np.load('hw2_data/kmeans/clustering4.npy')) iterator = 1 bests = [2, 3, 4, 5] for cluster in clustering: transposed = np.transpose(cluster) mins = [] maxs = [] for dim in transposed: mins.append(min(dim)) maxs.append(max(dim)) bestObjs = [] for k in range(1, kmax + 1): objs = [] for init in range(k): centers = [] for i in range(k): data = [] for j in range(len(mins)): data.append(np.random.uniform(mins[j], maxs[j])) centers.append(data) _, obj = kmeans(cluster, np.array(centers)) objs.append(obj) bestObjs.append(min(objs)) plt.plot(range(1, kmax + 1), bestObjs) plt.ylabel("Value of Objective Function") plt.xlabel("k") plt.title("Cluster " + str(iterator)) plt.show() k = bests[iterator - 1] centers2 = [] for i in range(k): data2 = [] for j in range(len(mins)): data2.append(np.random.uniform(mins[j], maxs[j])) centers2.append(data2) resCenters, _ = kmeans(cluster, np.array(centers2)) assignments = assign_clusters(cluster, resCenters) assigned = [] assigned2 = [] for i in range(k): assigned.append([]) for i in range(len(cluster)): assigned[assignments[i]].append(cluster[i]) for arr in assigned: assigned2.append(np.array(arr)) plt.clf() colors = ['or', 'ok', 'ob', 'og', 'oy'] for i in range(k): plt.plot(assigned2[i][:, 0], assigned2[i][:, 1], colors[i]) plt.title("K-means Clusters") plt.show() iterator += 1
def clusterConvergence2Modes(xk, activeMeans):
    # evaluate the fit of two means to the data
    testMeans = 2
    (idxk, mui) = kmeans.kmeans(xk.transpose(), testMeans)
    # dimension of the data
    p = xk.shape[0]
    # number of particles
    N = xk.shape[1]
    # evaluate the mean and covariance of each cluster
    meansk = np.zeros((p, testMeans))
    Pkkk = np.zeros((p, p, testMeans))
    for k in range(testMeans):
        # members assigned to cluster k (where idxk == k)
        idx = np.nonzero(idxk == k)
        idx = idx[0]
        meansk[:, k] = mui[k, :].transpose()
        # compute the covariance
        Pkk = np.zeros((p, p))
        coef = 1.0 / (float(N) - 1.0)
        for j in idx:
            Pkk = Pkk + coef * np.outer(xk[:, j] - meansk[:, k],
                                        xk[:, j] - meansk[:, k])
        Pkkk[:, :, k] = Pkk.copy()
    # evaluate the likelihood for each point, under the bimodal assumption
    L2 = np.zeros(N)
    for k in range(N):
        # assume the likelihood is proportional to the PDF
        pxk = gaussianNormalPdf(xk[:, k], meansk[:, idxk[k]], Pkkk[:, :, idxk[k]])
        L2[k] = pxk
    # Akaike information criterion for the bimodal case
    AIC2 = 2.0 * 4 - 2.0 * math.log(np.max(L2))
    # evaluate the likelihood under the unimodal assumption
    P11 = np.zeros((p, p))
    mux = np.mean(xk, axis=1)
    for k in range(N):
        P11 = P11 + (1.0) / (float(N) - 1.0) * np.outer(xk[:, k] - mux, xk[:, k] - mux)
    L = np.zeros(N)
    for k in range(N):
        L[k] = gaussianNormalPdf(xk[:, k], mux, P11)
    # information criterion for the unimodal case
    AIC = 2.0 * 2 - 2.0 * math.log(np.max(L))
    # a smaller AIC value is better
    print("AIC1 = %f, AIC2 = %f, L1 = %f, L2 = %f" % (AIC, AIC2, np.max(L), np.max(L2)))
    # if AIC < AIC2:
    if np.max(L) > np.max(L2):
        (idxk, mui) = kmeans.kmeans(xk.transpose(), 1)
    return (idxk, mui)
def get(self):
    url = self.request.get('url')
    k = int(self.request.get('k', default_value=3))
    t = int(self.request.get('t', default_value=144))
    result = urlopen(url)
    # Get image
    if (result.getcode() == 200):
        data = result.read()
        img = images.Image(data)
        raw_width = img.width
        raw_height = img.height
        width = raw_width
        height = raw_height
        # Resize to max, scaling proportionally to the other raw dimension
        if raw_width > MAX and raw_height > MAX:
            if raw_width > raw_height:
                height = MAX
                width = int(raw_width * MAX / raw_height)
            else:
                width = MAX
                height = int(raw_height * MAX / raw_width)
        img = images.resize(data, width, height)
        reader = png.Reader(bytes=img)
        # Get pixels
        (width, height, pixels, meta) = reader.read_flat()
        n = PIXEL_SIZE
        (means, counts) = kmeans.kmeans(k, pixels, n, 255, t)
        colors = [hex_string(means[i * n:i * n + n]) for i in range(len(counts))]
        output = {}
        output['colors'] = [{'color': colors[i], 'count': counts[i]}
                            for i in range(len(counts))]
        output['sum'] = sum(counts)
        self.response.headers['Content-Type'] = 'application/json'
        self.response.write(json.dumps(output))
def _init(X, mean, diagonal=False):
    km = kmeans(K)
    km.means = mean
    km.assign_clusters(X)
    m, sigma, pi = _extract_from_kmeans(km)
    sigma = _fix_sigma(sigma, def_sigma_tol)
    return sigma, pi
def vect_quant(input_file_folder, input_file_name, X, means_file, K):
    # print('Launching Thread vect_quant for ', input_file_folder, input_file_name, K)
    f = open(means_file + 'K' + str(K) + '.txt', 'r')
    lines = f.readlines()
    f.close()
    f = None
    w = []
    for point in lines:
        w.append(np.fromstring(point, dtype=float, sep=' '))
    means = np.array(w)
    w = None
    point = None
    lines = None
    gc.collect()
    friday = kmeans(K)
    friday.means = means
    friday.initialize_clusters(X)
    friday.assign_clusters(X)
    means = None
    gc.collect()
    clusters = friday.clusters
    friday = None
    gc.collect()
    output_folder = input_file_folder + '/vq' + str(K)
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    output_file = open(output_folder + '/' + input_file_name[:-4] + '.txt', 'w')
    for c in clusters:
        output_file.write(str(c) + ' ')
    # close the output file so the cluster indices are flushed to disk
    output_file.close()
def main(): # Input file that contains the input for # commuters, cabs and destination location input_file = "sample_inputs/%s" % SAMPLE_INPUT_FILE if len(sys.argv) == 2: input_file = sys.argv[1] # Parse input file commuters, cabs, destination = parse_input_file(input_file) # Create clusters of commuters clusters = kmeans(commuters, cabs) # Centroid of each cluster represents virtual centre of # each commuter group groups = [c.centroid for c in clusters] # Find distance between each commuter group and cab all_distances = get_group_cab_distances(groups, cabs) # Find optimal total distance and route travelled by all the cabs optimal_distance, optimal_route = optimal_total_distance( all_distances, groups, cabs) # Add total distance between group and destination to optimal distance optimal_distance = add_destination_distance(optimal_distance, groups, destination) # Print answer print_answer(optimal_distance, optimal_route, clusters, cabs)
def find_k_star(data_points, threshold):
    """
    Starting with 1 cluster, finds the optimum value of k*.
    In each iteration, the number of clusters is doubled.
    """
    num_clusters = 1
    prev_cohesion = None
    while True:
        clusters, cohesion = kmeans(data_points, num_clusters)
        if prev_cohesion:
            change_rate = float(abs(cohesion - prev_cohesion)) / (
                prev_cohesion * num_clusters / 2)
            if change_rate < threshold:
                break
        prev_cohesion = cohesion
        num_clusters *= 2
        if num_clusters > len(data_points):
            print(len(data_points))
            return
    # Perform binary search to find k*. As the number of clusters has already been
    # doubled in the loop above, start corresponds to num_clusters / 4 and end to
    # num_clusters / 2.
    kstar = binary_search(data_points, num_clusters // 4, num_clusters // 2, threshold)
    print(kstar)
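# The binary_search helper referenced above is not shown in this collection. Below is
# a minimal sketch of how that refinement step could look, assuming kmeans(data, k)
# returns (clusters, cohesion) exactly as used in find_k_star. The name
# binary_search_sketch and the stopping rule are assumptions for illustration, not
# the original implementation.
def binary_search_sketch(data_points, start, end, threshold):
    # narrow [start, end] until the relative change in cohesion between the
    # midpoint and its neighbour drops below the threshold
    while start < end:
        mid = (start + end) // 2
        _, cohesion_mid = kmeans(data_points, mid)
        _, cohesion_next = kmeans(data_points, mid + 1)
        change_rate = abs(cohesion_next - cohesion_mid) / (cohesion_mid * mid)
        if change_rate < threshold:
            end = mid          # good enough: search the lower half
        else:
            start = mid + 1    # still improving: search the upper half
    return start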
def KmeansCluster(FileName = "",Num = 3): dataSet = [] fileIn = open(FileName) for line in fileIn.readlines(): lineArr = line.strip().split(',') dataSet.append([float(i) for i in lineArr]) dataSet = mat(dataSet) k = Num centroids, clusterAssment = km.kmeans(dataSet, k) return centroids,clusterAssment ##Demo # FileName = "/Users/nevin47/Desktop/项目/学术/TopTen_Python/DataMiningTopTen/KMeans/Table/FINAL.csv" # Center,Out = KmeansCluster(FileName,3) # # print Out # # dataSet = [] # fileIn = open(FileName) # for line in fileIn.readlines(): # lineArr = line.strip().split(',') # dataSet.append([float(i) for i in lineArr]) # dataSet = mat(dataSet) # # C = transpose(mat(Center[1])) # # D = dataSet[1] # # print D*C # Result = km.CalVectorDistance(dataSet,Out) # # print Result[:,0,:] # SUM = 0 # for i in range(dataSet.shape[0]): # SUM += km.CalVectorCoefficient(Result,i,3) # print SUM/dataSet.shape[0] # # print km.CalVectorCoefficient(Result,0,3)
def part1_q4(dataset, show_graph=False):
    print(">>> PART 1: QUESTION 4")
    clustering = kmeans(dataset, 2, initCentroids=[('i1', 3, 2), ('i2', 4, 8)],
                        distance_type='Manhattan')
    printTable(clustering["centroids"])
    if show_graph:
        showClusters2D(clustering)
    print('')
def segment(self):
    algorithm = self.algorithmComboBox.currentText()
    imgPath = str(self.userpath.text())
    index = self.featureComboBox.currentIndex()
    features = ["INTENSITY", "INTENSITY+LOC", "RGB", "YUV", "LM", "ILM", "PCA"]
    if algorithm == "K-means":
        k = int(self.kText.text())
        iterations = int(self.iterationsText.text())
        epsilon = float(self.epsilonText.text())
        print(imgPath, index, k, iterations, epsilon)
        org = cv.LoadImageM(imgPath)
        im = kmeans.kmeans(imgPath, features[index], k, iterations, epsilon)
        cv.ShowImage("original", org)
        cv.ShowImage("segmented", im)
    elif algorithm == "Mean Shift":
        if index == 4:
            QtGui.QMessageBox.information(self, 'Error',
                                          'LM is not supported in mean shift')
            return
        print(imgPath, features[index])
        org = cv.LoadImageM(imgPath)
        im = meanshift.meanshift(imgPath, features[index])
        cv.ShowImage("original", org)
        cv.ShowImage("segmented", im)
def part1_q2(dataset, show_graph=False):
    print(">>> PART 1: QUESTION 2")
    clustering = kmeans(dataset, 2, initCentroids=[('i1', 4, 6), ('i2', 5, 4)],
                        distance_type='Euclidean')
    printTable(clustering["centroids"])
    if show_graph:
        showClusters2D(clustering)
    print('')
def kmeans_test(request): data = None form = kmeansNumSamplesForm() k=None sample_size=None grouped_data = None clusters = None error_list = None if request.method=='POST': form = kmeansNumSamplesForm(request.POST) if form.is_valid(): sample_size = int(form.cleaned_data['num_samples']) k = int(form.cleaned_data['k']) # Generate random data data = numpy.random.random((sample_size, 2)) # Calculate kmeans if form.cleaned_data['method']=='Basic': grouped_data, clusters, error_list = kmeans.kmeans(data,num_clusters=k, min_error=0.01, max_iter=100) else: grouped_data, clusters, error_list = kmeans.bisecting_kmeans(data,k=k, min_error=0.01, max_iter=50) return render_to_response('visualization/kmeans.html', { 'data': grouped_data, 'clusters': clusters, 'error_list': error_list, 'form':form, 'k': k, 'sample_size': sample_size, }, context_instance=RequestContext(request))
def main(): import matplotlib.pyplot as plt from sklearn.datasets.samples_generator import make_blobs n_centers = 3 X, y = make_blobs(n_samples=1000, centers=n_centers, n_features=2, cluster_std=0.7, random_state=0) # Run this K-Means import kmeans t0 = time.time() y_pred, centers, obj_val_seq = kmeans.kmeans(X, n_centers) t1 = time.time() print("Final obj val: {}".format(obj_val_seq[-1])) print("Time taken (this implementation): {}".format(t1 - t0)) # Run scikit-learn's K-Means from sklearn.cluster import k_means t0 = time.time() centers, y_pred, obj_val = k_means(X, n_centers, random_state=0) t1 = time.time() print("Final obj val: {}".format(obj_val)) print("Time taken (Scikit, 1 job): {}".format(t1 - t0)) # Plot change in objective value over iteration fig = plt.figure() ax = fig.add_subplot(111) ax.plot(obj_val_seq, 'b-', marker='*') fig.suptitle("Change in K-means objective value across iterations") ax.set_xlabel("Iteration") ax.set_ylabel("Objective value") fig.show() # Plot data from itertools import cycle colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') fig = plt.figure(figsize=plt.figaspect(0.5)) # Make twice as wide to accomodate both plots ax = fig.add_subplot(121) ax.set_title("Data with true labels and final centers") for k, color in zip(range(n_centers), colors): ax.plot(X[y==k, 0], X[y==k, 1], color + '.') initial_centers = kmeans.init_centers(X, n_centers, 2) # This is valid because we always use the same random seed. # Plot initial centers for x in initial_centers: ax.plot(x[0], x[1], "mo", markeredgecolor="k", markersize=8) # Plot final centers for x in centers: ax.plot(x[0], x[1], "co", markeredgecolor="k", markersize=8) # Plot assignments colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') ax = fig.add_subplot(122) ax.set_title("Data with final assignments") for k, color in zip(range(n_centers), colors): ax.plot(X[y_pred==k, 0], X[y_pred==k, 1], color + '.') fig.tight_layout() fig.gca() fig.show()
def main(filename, k, function):
    file = open(filename)
    pts = []
    for line in file.readlines():
        line = line.split()
        pts.append((int(line[0]), int(line[1])))
    file.close()
    group, s = kmeans.kmeans(pts, k, function)
    print(group)
    print(s)
    f = open(filename + 'cluster', 'w')
    for each in range(len(group)):
        f.write(str(group[each]) + '\n\n')
        cc = numpy.random.rand(10000)
        for i in group[each]:
            matplotlib.pyplot.scatter(i[0], i[1], s=2, color=cc)
        pylab.plot(s[each][0], s[each][1], '+')
    f.close()
    ll = filename + 'cluster'
    ll = 'gedit ' + ll
    ll = ll.split()
    process = Popen(ll)
    matplotlib.pyplot.show()
def main(): finput = open('../resources/BB.txt','r') mat = np.loadtxt(finput,delimiter=' ',) finput.close() X = prepareX() k = 10 min_samples = 4 opt_cutoff = 0.5 threshold = 500 reduce_threshold = 600 est = kmeans(X,k,opt_cutoff) est2 = kmeans2(X,opt_cutoff,threshold,reduce_threshold) (cluster_label0,id2point) = buildLabels(est,X) (cluster_label01,id2point) = buildLabels(est2,X) cluster_label1 = SpectralClustering(k).fit_predict(mat) cluster_label1 = buildLabels2(cluster_label1,X) cluster_label2 = DBSCAN(min_samples=min_samples).fit_predict(mat) cluster_label2 = buildLabels2(cluster_label2,X) np.savetxt('../resources/KMeans.txt', cluster_label0, fmt='%s', newline='\n', header='', footer='', comments='# ') np.savetxt('../resources/KMeans2.txt', cluster_label01, fmt='%s', newline='\n', header='', footer='', comments='# ') np.savetxt('../resources/SpectralClustering.txt', cluster_label1, fmt='%s', newline='\n', header='', footer='', comments='# ') np.savetxt('../resources/DBSCAN.txt', cluster_label2, fmt='%s', newline='\n', header='', footer='', comments='# ') np.savetxt('../resources/ID2Point.txt', id2point, fmt=["%s",]*3, newline='\n')
def clustering(): # Create pairs for clustering (age/eng/prog/uni) pairs = [] for i, val in enumerate(uni): if(val > 0 and prog[i] > 0): pairs.append((val, prog[i])) # Run k-means print(pairs) seeds = [pairs[0], pairs[1], pairs[2]] clusters = kmeans.kmeans(pairs, seeds) # Show diagram with coloured clusters for i in clusters: cluster = clusters[i] xaxis = [] yaxis = [] for pair in cluster: xaxis.append(pair[0]) yaxis.append(pair[1]) print(yaxis) plt.plot(xaxis, yaxis, "o") plt.axis([0,12,0,12]) plt.xlabel('University years') plt.ylabel('Programming skill') plt.show()
def __init__(self, inputs, targets, nRBF, sigma=0, usekmeans=0, normalise=0): self.nin = inputs.shape[1] self.nout = targets.shape[1] self.ndata = inputs.shape[0] self.nRBF = nRBF self.usekmeans = usekmeans self.normalise = normalise if usekmeans: self.kmeansnet = kmeans.kmeans(self.nRBF, inputs) self.hidden = np.zeros((self.ndata, self.nRBF + 1)) if sigma == 0: d = (inputs.max(axis=0) - inputs.min(axis=0)).max() self.sigma = d / np.sqrt(2 * nRBF) else: self.sigma = sigma self.perceptron = pcn.pcn(self.hidden[:, :-1], targets) self.weights1 = np.zeros((self.nin, self.nRBF))
def sc(data_points: ndarray, k: int) -> (ndarray, list): """ sc(data_points, k) Run the spectral clustering Parameters ---------- data_points : ndarray Data points to be clustered. k : int number of clusters Returns ------- (ndarray, list) cc: The centroids of clusters aff: The affectation of each node to it's cluster if aff[i] = j then the node i is in the cluster j """ logging.info("Starting Spectral Clustering") # Calculate the eigenvectors logging.info("Calculating the eigenvectors") _, u = la.eigh(data_points, eigvals=(0, k)) # Run the k-means logging.info("Running k-means") # cc, aff = cl.vq.kmeans2(u, k) cc, aff = kmeans(u, k) return cc, aff
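# A minimal usage sketch for sc() above, assuming it lives in a module where
# scipy.linalg is imported as `la` and the kmeans used above is in scope. Since sc()
# feeds its input straight into la.eigh, data_points should be a symmetric matrix
# (e.g. a graph Laplacian); the toy graph below is only an illustration.
import numpy as np

adjacency = np.array([[0, 1, 1, 0],
                      [1, 0, 1, 0],
                      [1, 1, 0, 1],
                      [0, 0, 1, 0]], dtype=float)
laplacian = np.diag(adjacency.sum(axis=1)) - adjacency  # unnormalised Laplacian
centroids, assignment = sc(laplacian, 2)                # split the graph into 2 clusters
print(assignment)                                       # cluster id for each node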
def questao21():
    dset = load_dataset('dataset1.csv')
    xo = dset.T[1].astype(float)  # second column
    x = dset.T[1].astype(float)   # second column
    yo = dset.T[2].astype(float)  # third column
    y = dset.T[2].astype(float)   # third column
    # z-score normalisation helped with visualisation and is required for clustering
    #x = [z_score(x, xi) for xi in x]
    #y = [z_score(y, yi) for yi in y]
    #centros_iniciais = [(z_score(xo, 1), z_score(yo, 2)), (z_score(xo, 4), z_score(yo, 2))]
    centros_iniciais = [(1, 2), (4, 2)]
    pontos = list(zip(x, y))
    clusters, iteracoes = kmeans(pontos, 2, centros_iniciais=centros_iniciais)
    cluster1 = clusters[0].pontos
    cluster2 = clusters[1].pontos
    plt.plot([xi[0] for xi in cluster1], [yi[1] for yi in cluster1], 'ro')
    plt.plot([clusters[0].centroide[0]], [clusters[0].centroide[1]], 'r*')
    plt.plot([xi[0] for xi in cluster2], [yi[1] for yi in cluster2], 'go')
    plt.plot([clusters[1].centroide[0]], [clusters[1].centroide[1]], 'g*')
    plt.savefig('grupo1.png')
    print("New centroids:", clusters[0].centroide, "and", clusters[1].centroide)
def main(): try: inputFile = datFile = sys.argv[1] k = sys.argv[2] except Exception as e: print('Oh No! => %s' % e) print('Usage:\npython3 ./main.py <data.mat> <k>') sys.exit(2) mat = spio.loadmat(inputFile, squeeze_me=True) fname = os.path.splitext(inputFile) rawdata = mat[fname[0]] print(fname[0]) dataIn = rawdata[:, (0, 1)] mu, clusters = km.kmeans(int(k), 10, 0.00000001, dataIn) colors = ["r", "b", "g", "k", "c", "y", "m"] for i in range(0, max(clusters) + 1): indices = myFind(clusters, lambda x: x == i) x = dataIn[indices, 0] y = dataIn[indices, 1] plt.scatter(x, y, c=colors[i]) plt.scatter(mu[i, 0], mu[i, 1], c="yellow", marker="*", s=300) plt.title("K-means results") plt.show()
def test_kmeans():
    name = "input-test-kmeans.txt"
    out = kmeans.kmeans(name)
    #print("gmis result = ", out)
    # note: all the indexes of the graph above differ from gmis, as here we start at 0,
    # but gmis starts at 1, which means that we have a shift for all nodes
    expectedresult1 = [
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
    ]
    expectedresult2 = [
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
    ]
    failed = False
    for i in range(len(expectedresult1)):
        if out[i] != expectedresult1[i]:
            failed = True
    if failed:
        # reset and check against the second (label-swapped) expected result
        failed = False
        for i in range(len(expectedresult2)):
            if out[i] != expectedresult2[i]:
                failed = True
    if failed:
        sys.stderr.write(
            "Possible problem in test_kmeans(). Might only be a different random "
            "output typical of k-means: " + str(out) + "\n")
def main(): # Read in the dataset X = np.loadtxt('data/fisher_iris_data.csv', delimiter=',').T Y = np.loadtxt('data/fisher_iris_labels.csv', dtype=str) labels = list(set(Y)) # get unique labels num_classes = len(labels) Y = [labels.index(y) for y in Y] # convert labels to integers ## Plot dataset fig = plt.figure() plt.subplot(2,2,1) plt.scatter(X[0, :], X[1, :]) plt.title('Data without labels') plt.subplot(2,2,2) plt.scatter(X[0, :], X[1, :], c=Y) plt.title('Data with true labels') ax1 = plt.subplot(2,2,1) ax2 = plt.subplot(2,2,4) scatterPlot = 0 num_evals = 100 kmeans_performance = np.zeros(num_evals) kmeans_solutions = [] for i in range(num_evals): print('\nK-Means Run ', i+1) cluster_idx = kmeans(X, num_classes, ax1=ax1, ax2=ax2) kmeans_performance[i] = evaluate(X, Y, cluster_idx) print('Accuracy: ', kmeans_performance[i]) kmeans_solutions.append(cluster_idx) plt.subplot(2, 2, 3) plt.hist(kmeans_performance)
def sampleClustersGraph():
    data, featureNames = load_2d_data()
    k = 6
    iniCenters = orderedCenters(data, k)
    clusters, centers = kmeans(data, k, initialCenters=iniCenters)
    graphClusters(clusters, centers, data, featureNames)
def train(self, X):
    """
    Compute K-Means clustering on each class label and store your result in
    self.cluster_centers_
    :param X: inputs of training data, a 2D Numpy array
    :return: None
    """
    self.cluster_centers_ = kmeans(X, self.k, self.max_iter, self.tol)
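# A possible companion method to train() above: assign each input row to its nearest
# stored center. This is only a sketch under the assumption that
# self.cluster_centers_ is a 2D array of shape (k, n_features); it is not part of the
# original class.
import numpy as np

def predict_sketch(self, X):
    # squared Euclidean distance from every row of X to every center
    dists = ((X[:, None, :] - self.cluster_centers_[None, :, :]) ** 2).sum(axis=2)
    # index of the closest center for each row
    return np.argmin(dists, axis=1)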
def __init__(self,inputs,targets,nRBF,sigma=0,usekmeans=0,normalise=0): self.nin = shape(inputs)[1] self.nout = shape(targets)[1] self.ndata = shape(inputs)[0] self.nRBF = nRBF self.usekmeans = usekmeans self.normalise = normalise #print "Initalizing RBFN with parameters: " #print "Inputs : " + str(shape(inputs)) #print "targets : " + str(shape(targets)) #print "nRBF : " + str(nRBF) # print "Sigma : " + str(sigma) # print "K-Means : " + str(usekmeans) # print "Normalise: " + str(normalise) # print if usekmeans: self.kmeansnet = kmeans.kmeans(self.nRBF,inputs) self.hidden = zeros((self.ndata,self.nRBF+1)) if sigma==0: # Set width of Gaussians d = (inputs.max(axis=0)-inputs.min(axis=0)).max() self.sigma = d/sqrt(2*nRBF) else: self.sigma = sigma self.perceptron = pcn.pcn(self.hidden[:,:-1],targets) # Initialise network self.weights1 = zeros((self.nin,self.nRBF))
def cluster_data(inputpath, outputpath, index_column, name_column, columns_to_load, num_clusters, max_iterations): names, input_data = read_and_normalise_csv(inputpath, index_column, name_column, columns_to_load) cluster_map = kmeans(input_data, num_clusters, max_iterations) f = None try: f = open(outputpath, 'w') except FileNotFoundError: try: f = open(outputpath, 'x') except: print("ERROR: could not load file") return (0, [], 0) #cluster_ary = sorted(list(zip(names, cluster_map)), key = lambda x: x[1], reverse=True) cluster_ary = [[] for c in range(num_clusters)] for i in range(len(names)): cluster_ary[cluster_map[i]].append(names[i]) f.write(json.dumps(cluster_ary)) f.close() return cluster_ary
def __init__(self, topic_dict, vocab_list, k=4):
    print("create cluster model")
    self.k = k
    self.vocab_list = vocab_list
    self.tar_list = self.__dict2dataInfo(topic_dict, vocab_list)
    print("K means ...")
    [self.clusters, self.centroids] = kmeans.kmeans(self.tar_list, k)
def spectral(W, k):
    '''
    SPECTRAL spectral clustering
    Input:
        W: Adjacency matrix, N-by-N matrix
        k: number of clusters
    Output:
        idx: data point cluster labels, n-by-1 vector.
    '''
    # YOUR CODE HERE
    n, n = np.shape(W)
    idx = np.zeros((n, 1))
    D = np.zeros((n, n))
    for i in range(n):
        D[i][i] = np.sum(W[i][:])
    L = D - W
    # begin answer
    eng = np.linalg.eig(L)
    enval = eng[0]
    en = eng[1]
    sort_index = np.argsort(enval)
    topk = en[:, sort_index[:k]]
    idx = kmeans(topk, k)
    return idx
def KmeansCluster(FileName="", Num=3): dataSet = [] fileIn = open(FileName) for line in fileIn.readlines(): lineArr = line.strip().split(',') dataSet.append([float(i) for i in lineArr]) dataSet = mat(dataSet) k = Num centroids, clusterAssment = km.kmeans(dataSet, k) return centroids, clusterAssment ##Demo # FileName = "/Users/nevin47/Desktop/项目/学术/TopTen_Python/DataMiningTopTen/KMeans/Table/FINAL.csv" # Center,Out = KmeansCluster(FileName,3) # # print Out # # dataSet = [] # fileIn = open(FileName) # for line in fileIn.readlines(): # lineArr = line.strip().split(',') # dataSet.append([float(i) for i in lineArr]) # dataSet = mat(dataSet) # # C = transpose(mat(Center[1])) # # D = dataSet[1] # # print D*C # Result = km.CalVectorDistance(dataSet,Out) # # print Result[:,0,:] # SUM = 0 # for i in range(dataSet.shape[0]): # SUM += km.CalVectorCoefficient(Result,i,3) # print SUM/dataSet.shape[0] # # print km.CalVectorCoefficient(Result,0,3)
def getDepht(self, im3d, bbox):
    x1 = int(bbox[0])
    y1 = int(bbox[1])
    x2 = int(bbox[2])
    y2 = int(bbox[3])
    average = 0.0
    strIm = ""
    num = 0
    filteredImage = {}
    for y in range(y1, y2 + 1):
        strIm += "|"
        for x in range(x1, x2 + 1):
            d = im3d[y][x][0]
            strIm += "{:>3}|".format(d)
            #if d <= 7: continue
            filteredImage[(y, x)] = d
            average += 1.0 * d
            num += 1
        strIm += "\n"
    #print(strIm)
    c1, c2 = kmeans(filteredImage, 2, 1000, 320, 240)
    depth1 = average / num
    depth2 = min([c1, c2])
    depth1 = (depth1 + 18.7579) / 0.5181
    depth2 = (depth2 + 18.7579) / 0.5181
    print(depth1)
    print(depth2)
    if abs(depth1 - depth2) > 10:
        depth = depth2
    else:
        depth = depth1
    return depth
def __init__(self, inputs, targets, nRBF, sigma=0, usekmeans=0, normalise=0): self.nin = shape(inputs)[1] self.nout = shape(targets)[1] self.ndata = shape(inputs)[0] self.nRBF = nRBF self.usekmeans = usekmeans self.normalise = normalise if usekmeans: self.kmeansnet = kmeans.kmeans(self.nRBF, inputs) self.hidden = zeros((self.ndata, self.nRBF + 1)) if sigma == 0: # Set width of Gaussians d = (inputs.max(axis=0) - inputs.min(axis=0)).max() self.sigma = d / sqrt(2 * nRBF) else: self.sigma = sigma self.perceptron = pcn.pcn(self.hidden[:, :-1], targets) # Initialise network self.weights1 = zeros((self.nin, self.nRBF))
def main(algorithm, data, cl_labels, min_k, max_k, max_iterations, epsilon): results, silhouette, chs, ssws, ssbs, ars, hom, comp = [], [], [], [], [], [], [], [] membership, centroids, labels = [], [], [] for c in range(min_k, max_k + 1): if algorithm == 'kmeans': labels, centroids = kmeans.kmeans(data, c) elif algorithm == 'bisecting_kmeans': labels, centroids = bisecting_kmeans.bisecting_kmeans(data, c) elif algorithm == 'fuzzy_cmeans': membership, centroids = fuzzyCmeans.execute(data, max_iterations, c, epsilon) labels = fuzzyCmeans.get_labels(len(data), membership) silhouette.append((c, metrics.silhouette_score(data, labels, metric='euclidean'))) chs.append((c, metrics.calinski_harabaz_score(data, labels))) ssws.append((c, utils.get_ssw(data, centroids, labels))) ssbs.append((c, utils.get_ssb(centroids))) ars.append((c, metrics.adjusted_rand_score(cl_labels, labels))) hom.append((c, metrics.homogeneity_score(cl_labels, labels))) comp.append((c, metrics.completeness_score(cl_labels, labels))) results.append(("Silhouette", "", zip(*silhouette)[0], "", zip(*silhouette)[1], 333, "blue")) results.append(("Calinski-Harabaz Index", "", zip(*chs)[0], "", zip(*chs)[1], 334, "blue")) results.append(("Intra cluster Variance", "", zip(*ssws)[0], "", zip(*ssws)[1], 331, "blue")) results.append(("Inter cluster Variance", "", zip(*ssbs)[0], "", zip(*ssbs)[1], 332, "blue")) results.append(("Adjusted Rand Index", "", zip(*ars)[0], "", zip(*ars)[1], 335, "orange")) results.append(("Homogeneity", "", zip(*hom)[0], "", zip(*hom)[1], 336, "orange")) results.append(("Completeness", "", zip(*comp)[0], "", zip(*comp)[1], 337, "orange")) print(labels) utils.plot_results(results, algorithm)
def __init__(self, topic_dict, vocab, k=4):
    print("create cluster model")
    self.k = k
    self.vocab_info = [vocab, len(list(vocab.keys()))]
    self.tar_list = self.__dict2dataInfo(topic_dict)
    print("K means ...")
    [self.clusters, self.centroids] = kmeans.kmeans(self.tar_list, k)
def initialize_population(self): points = self.generate_random_array() clusters, centroids = kmeans.kmeans(points, self.k) initial_population = [] for i in range(1, self.k + 1): tmp_points = points[clusters == i] costs = [self.cf.fitness(point) for point in tmp_points] max_idx = min(len(costs), self.pop_size // self.k) best_indexes = sorted(range(len(costs)), key=lambda i: costs[i])[:max_idx] [ initial_population.append(tmp_points[idx]) for idx in best_indexes ] if len(initial_population) < self.pop_size: initial_population = np.append( initial_population, self.generate_random_array( self.pop_size - len(initial_population), self.ind_size), axis=0) return np.array(initial_population)
def test_kmeans_9(self):
    dataset = self.__load_dataset()
    out = kmeans(dataset, 9)
    percentage = avg_iou(dataset, out)
    np.testing.assert_almost_equal(percentage, 0.672, decimal=2)
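# The kmeans/avg_iou pair exercised by the test above follows the common anchor-box
# recipe of clustering (width, height) pairs with a 1 - IoU distance rather than
# Euclidean distance. A self-contained sketch of that distance, assuming boxes are
# (w, h) pairs anchored at a shared corner; this is illustrative, not the tested code.
import numpy as np

def iou_wh(box, clusters):
    # intersection of axis-aligned boxes that share a top-left corner
    inter = np.minimum(box[0], clusters[:, 0]) * np.minimum(box[1], clusters[:, 1])
    union = box[0] * box[1] + clusters[:, 0] * clusters[:, 1] - inter
    return inter / union

box = np.array([0.4, 0.6])
clusters = np.array([[0.5, 0.5], [0.1, 0.9]])
print(1.0 - iou_wh(box, clusters))  # distances used for cluster assignment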
def clusterEigenvectors(k, laplacian, maxIterations):
    #print
    #print "---Eigenvector Clustering---"
    # Call kmeans to cluster the resulting eigenvectors
    clusters = kmeans(k, laplacian, maxIterations)
    return clusters
def spectral(W, k):
    '''
    SPECTRAL spectral clustering
    Input:
        W: Adjacency matrix, N-by-N matrix
        k: number of clusters
    Output:
        idx: data point cluster labels, n-by-1 vector.
    '''
    # YOUR CODE HERE
    # begin answer
    N = W.shape[0]
    D = (np.array(np.sum(W, axis=1)).T)[0]
    #D = np.array(np.sum(W, axis=1))
    L = np.diag(D) - W
    # normalised Laplacian: D^(-1/2) L D^(-1/2)
    D_ = np.diag(1.0 / np.sqrt(D))
    L = np.dot(np.dot(D_, L), D_)
    value, vector = np.linalg.eig(L)
    print(vector.shape)
    value = zip(value, range(N))
    value = sorted(value, key=lambda x: x[0])
    a, b = value[1]
    #H = vector[:, 1]
    H = (np.array(vector[:, b]).T)[0]
    t1 = np.mean(H)
    t2 = np.std(H)
    H = (H - t1) / t2
    H = np.array([H]).T
    res = kmeans(H, 2)
    return res
def _get_anchors(self, bboxes_in, input_shape=(224, 224), clusters=5, strip_size=32): ''' @input_shape tuple (h, w) @bboxes_in format: [ [[xmin,ymin, xmax, ymax, label],], ] value range: x [0, w], y [0, h] @return anchors, format: 10 value tuple ''' w = input_shape[1] h = input_shape[0] # TODO: add position to iou, not only box size bboxes = [] for items in bboxes_in: for bbox in items: bboxes.append( ((bbox[2] - bbox[0]) / w, (bbox[3] - bbox[1]) / h)) bboxes = np.array(bboxes) self.log.i(f"bboxes num: {len(bboxes)}, first bbox: {bboxes[0]}") out = kmeans.kmeans(bboxes, k=clusters) iou = kmeans.avg_iou(bboxes, out) * 100 self.log.i("bbox accuracy(IOU): {:.2f}%".format(iou)) self.log.i("bound boxes: {}".format(",".join( "({:f},{:.2f})".format(item[0] * w, item[1] * h) for item in out))) for i, wh in enumerate(out): out[i][0] = wh[0] * w / strip_size out[i][1] = wh[1] * h / strip_size anchors = list(out.flatten()) self.log.i(f"anchors: {anchors}") ratios = np.around(out[:, 0] / out[:, 1], decimals=2).tolist() self.log.i("w/h ratios: {}".format(sorted(ratios))) return anchors
def train_model(k=2):
    # Train k-Means on the training data
    model = kmeans.kmeans(n_clusters=k)
    model.fit(Xtrain)

    # Predict back the training ratings and compute the RMSE
    XtrainHat = model.predict(Xtrain, Xtrain)
    tr = model.rmse(Xtrain, XtrainHat)

    # Predict the validation ratings and compute the RMSE
    XvalHat = model.predict(Xtrain, Xval)
    val = model.rmse(Xval, XvalHat)

    # Predict the test ratings and compute the RMSE
    XtestHat = model.predict(Xtrain, Xtest)
    te = model.rmse(Xtest, XtestHat)

    # Get the cluster assignments for the training data
    z = model.cluster(Xtrain)
    print(z, len(z))

    # Get the cluster centers
    centers = model.get_centers()
    print(centers)

    print("K=%d Errors: %.7f %.7f " % (k, tr, val))
def gmm_init(k, samples): """ init a gauss mixture model for all samples using kmeans algorithm weights don't sum up to 1 """ centers = km.kmeans(k, samples) clusters = km.cluster(samples, centers) #params is a list of (mean, sigma, weight) # shapec = np.shape(centers[0]) shapes = np.shape(np.outer(samples[0], samples[0])) #params = [[np.zeros_like(centers[0]), np.zeros(shapes), 0]]*k params = [None] * k for i in range(k): cluster, center = clusters[i], centers[i] num_samples = len(cluster) deviation = np.zeros(shapes) for sample in cluster: diff = sample - center deviation += np.outer(diff, diff) deviation /= len(cluster) params[i] = [center, deviation, num_samples] return params
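# gmm_init() stores the raw cluster size as the third entry of each parameter triple,
# so (as its docstring notes) the weights do not sum to 1. A small sketch of how a
# caller could normalise them before running EM; the helper name is an assumption for
# illustration, not part of the original module.
def normalize_gmm_weights(params):
    # divide each raw count by the total so the mixture weights sum to 1
    total = float(sum(weight for _, _, weight in params))
    return [[mean, sigma, weight / total] for mean, sigma, weight in params]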
def ikmeans(data_points): """ Parameters ---------- data_points: np.ndarray The data points to be clustered Returns ------- (np.ndarray, np.ndarray) 2-D array of centroids 1-D array of affectation list of data nodes """ n = len(data_points) logging.info("Starting the modified Spectral Clustering") # Calculate the eigenvectors logging.info("Calculating the eigenvectors") _, u = la.eigh(data_points) k = 3 cc = aff = None # run the iterations while True: _u = u[:, :k] centroids = km.get_random_initial_centroids(_u, k) cg = np.average(_u, axis=0) cg.resize((1, len(cg))) centroids = np.concatenate((centroids, cg), axis=0) cc, aff = km.kmeans(_u, k, centroids) if len(np.unique(aff)) == k or k == n: break else: k += 1 return cc, aff
def MNIST_eval_euclidean(metric_func, numbers=[1,2,3], nrange=range(10,100,10), num_avg=10): """Return metric evaluation on MNIST dataset using Euclidean distance on all the algorithms. Input ----- metric_func - metric being evaluated numbers - digits chosen in MNIST data set nrange - range of N's to be tested, number of data points num_avg - number of times we cluster the same points and take the average, min, and max Output ------ kmedoids_metric - metric computed with K-medoids kmeans_metric - metric computed with K-means kmeans_sklearn_metric - metric with kmeans from sklearn """ digits = datasets.load_digits() images = digits.images kmedoids_metric = [] kmeans_metric = [] kmeans_sklearn_metric = [] for n in nrange: # generate true labels labels = np.concatenate([[m]*n for m in numbers]) data = np.concatenate([ images[np.where(digits.target==i)][np.random.choice(range(173), n)] for i in numbers ]) data2 = data.reshape(len(data), 64) m1 = []; m2 = []; m3 = []; for i in range(num_avg): # our algorithms j1, _ = kmedoids.kmedoids(len(numbers), distance.euclidean_matrix(data2)) j2, _ = kmeans.kmeans(len(numbers), data2, distance.euclidean) # sklearn k-means km = KMeans(len(numbers)) j3 = km.fit(data2).labels_ a = metric_func(labels, j1) b = metric_func(labels, j2) c = metric_func(labels, j3) m1.append(a) m2.append(b) m3.append(c) kmedoids_metric.append([np.mean(m1), np.min(m1), np.max(m1)]) kmeans_metric.append([np.mean(m2), np.min(m2), np.max(m2)]) kmeans_sklearn_metric.append([np.mean(m3), np.min(m3), np.max(m3)]) return kmedoids_metric, kmeans_metric, kmeans_sklearn_metric
def gauss_eval(dist_matrix_kmedoids, dist_func_kmeans, metric_func, nrange=range(10,100,10), num_avg=5): """Return metric evaluation on gaussian dataset against N. Compare K-medoids and K-means. Input ----- dist_matrix_kmedoids - function to generate the distance matrix for kmedoids dist_func_kmeans - distance function to be used in kmeans metric_func - metric function being evaluated nrange - range of N's to be tested, number of data points num_avg - number of times we cluster the same points and take the average, min, and max Output ------ kmedoids_metric - metric computed with K-medoids kmeans_metric - metric computed with K-means kmeans_sklearn_metric - metric with kmeans from sklearn """ kmedoids_metric = [] kmeans_metric = [] kmeans_sklearn_metric = [] # we generate data with n points in each cluster and evaluate # the algorithms for n in nrange: data = np.concatenate(( np.random.multivariate_normal([0, 0], [[4,0], [0,1]], n), np.random.multivariate_normal([3, 5], [[1,0.8], [0.8,2]], n), np.random.multivariate_normal([-2, 3], [[0.5,0], [0,0.5]], n)) ) labels = np.concatenate([[m]*n for m in range(3)]) m1 = [] m2 = [] m3 = [] k = 3 for i in range(num_avg): j1, _ = kmedoids.kmedoids(k, dist_matrix_kmedoids(data)) j2, _ = kmeans.kmeans(k, data, dist_func_kmeans) km = KMeans(k) r = km.fit(data) j3 = r.labels_ a = metric_func(labels, j1) b = metric_func(labels, j2) c = metric_func(labels, j3) m1.append(a) m2.append(b) m3.append(c) kmedoids_metric.append([np.mean(m1), np.min(m1), np.max(m1)]) kmeans_metric.append([np.mean(m2), np.min(m2), np.max(m2)]) kmeans_sklearn_metric.append([np.mean(m3), np.min(m3), np.max(m3)]) return kmedoids_metric, kmeans_metric, kmeans_sklearn_metric
def dominant_colors(path, k):
    with timer("Image loaded: {}"):
        image = Image.open(path)
    with timer("Transform points: {}"):
        pts = points(image)
    with timer("Calculate centers: {}"):
        centers = kmeans(pts, k)
    return centers
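# A minimal usage sketch for dominant_colors() above, assuming the k-means centers
# come back as RGB triples in the 0-255 range; the image path and the hex formatting
# are illustrative only.
if __name__ == "__main__":
    for r, g, b in dominant_colors("wallpaper.png", k=5):
        print("#{:02x}{:02x}{:02x}".format(int(r), int(g), int(b)))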
def get_dominant_colors(self):
    img = Image.open(self.wallpaper)
    img.thumbnail((300, 300))  # Resize to speed up python loop.
    width, height = img.size
    points = self._get_points_from_image(img)
    rgbs = kmeans.kmeans(points, self.k)
    #rgbs = [map(int, c.center.coords) for c in clusters]
    return [self.rgb_to_hex(rgb) for rgb in rgbs]
def test_single_point():
    value = [0, 10, 20]
    points = [
        [value, 1]
    ]
    k = 1
    means = kmeans(points, k)
    assert 1 == len(means)
    assert value == means[0]
def GMM(K, data, stop_times):
    num = data.shape[0]
    dim = data.shape[1]
    clusters = kmeans.kmeans(K, data, 100)
    mul = np.zeros((K, dim))
    cov = np.zeros((K, dim, dim))
    for i in range(K):
        mul[i] = np.mean(clusters[i], axis=0)
        cov[i] = np.cov(clusters[i].T)
    # latent variable (responsibilities)
    z = np.zeros((num, K))
    times = 0
    while (times < stop_times):
        p = np.zeros((num, K))
        # E step: evaluate the responsibilities with the current parameters
        for i in range(num):
            for j in range(K):
                p[i, j] = np.exp(-1 / 2 * (data[i] - mul[j]).dot(inv(cov[j])).dot(
                    (data[i] - mul[j]).T)) * det(cov[j]) ** (-1 / 2)
            for j in range(K):
                z[i, j] = p[i, j] / np.sum(p[i, :])
        # M step: re-estimate the means and covariances from the responsibilities
        for j in range(K):
            tmp = np.zeros((1, dim))
            for i in range(num):
                tmp += z[i, j] * data[i]
            mul[j] = tmp / np.sum(z[:, j])
        for j in range(K):
            tmp = np.zeros((dim, dim))
            for i in range(num):
                tmp += z[i, j] * np.outer((data[i] - mul[j]), (data[i] - mul[j]))
            cov[j] = tmp / np.sum(z[:, j])
        times += 1
    print(mul, cov)
    out = []
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    for i in range(K):
        out.append(np.diag(np.exp(-1 / 2 * (data - mul[i]).dot(inv(cov[i])).dot(
            (data - mul[i]).T)) / ((2 * np.pi) * det(cov[i]) ** (1 / 2))))
        ax.plot_trisurf(data[:, 0], data[:, 1], out[i], color=np.random.rand(50))
    ax.set_xlabel('X')
    # ax.set_xlim(-40, 40)
    ax.set_ylabel('Y')
    # ax.set_ylim(-40, 40)
    ax.set_zlabel('Z')
    # ax.set_zlim(-100, 100)
    plt.show()
def do_analysis(self):
    partnum = len(self.cfg.user_subsets)
    start_description = range(partnum)
    log.info("Performing subset splitting using kmeans")
    # Create the first scheme
    start_scheme = scheme.create_scheme(self.cfg, "start_scheme", start_description)
    for i in start_scheme:
        # Save the alignment path
        i.make_alignment(self.cfg, self.alignment)
        phylip_file = i.alignment_path
        print(phylip_file)
        # Add option to output likelihoods, *raxml version takes more
        # modifying of the commands in the analyse function
        phyml.analyse("GTR", str(phylip_file),
                      "./analysis/start_tree/filtered_source.phy_phyml_tree.txt",
                      "unlinked", "--print_site_lnl")
        phyml_lk_file = str(phylip_file) + "_phyml_lk_GTR.txt"
        likelihood_dictionary = kmeans.phyml_likelihood_parser(phyml_lk_file)
        kmeans.kmeans(likelihood_dictionary)
    print(start_scheme)
def runBenchmark(N=200, M=10000, K=3, usePCA=False, n_components=2): # note that order of clusters will vary from run to run, so track which true pop with each cluster # along with true frac count # sample N indivs without replacement, and also get first M snps for each geno indices = random.sample(range(len(genoArr_)), N) indivs_copy = np.array([copy.deepcopy(indivs_[i]) for i in indices]) for i in range(N): indivs_copy[i].geno = np.array(indivs_copy[i].geno[:M]) indivs_copy[i].j = i # also update position in new indiv list genoArr_copy = np.array([genoArr_[i][:M] for i in indices]) ### TIMING ### pcaTime = 0 def pca_i(): # zero input fxn for timeit return PCA_nocluster.pca_transform(indivs_copy, genoArr_copy, n_components) if usePCA: #print("timing pca...") # test 1 runs. genoArr_copy isn't changed from run-to-run #(indivs_copy does get changed, but shouldn't affect run since it restarts each time) # unlike kmeans, pca is deterministic so runtime shouldn't vary. also pca step is slower, bottleneck. genoArr_copy_geno = genoArr_copy # make a copy of genotype data first (as opposed to components) pcaTime = timeit.timeit(pca_i, number = 2)/2.0 # get components for genoArr_copy for kmeans (both timing and quality runs) # pca deterministic, so don't need to rerun for each quality trial pcaObj, genoArr_copy = PCA_nocluster.pca_transform(indivs_copy, genoArr_copy, n_components) def kmeans_i(): # zero input fxn for timeit return kmeans.kmeans(indivs_copy, genoArr_copy, K) # timing kmeans, run 10x per data pt, avg #print("timing k-means 10x...") kmeansTime = timeit.timeit(kmeans_i, number = 10)/10.0 ### QUALITY ### majFracAvgsByRun = np.zeros(10) for run in range(10): #a = np.asarray_chkfinite(indivs_copy) #a = np.asarray_chkfinite(genoArr_copy) centers = kmeans.kmeans(indivs_copy, genoArr_copy, K, maxIter = 1000, verbose = False) kmeansObj = kmeans.kmeansObj(indivs_copy, centers) majPops, majFracs, clusterSizes = majorityPop(indivs_copy, K) majFracAvgsByRun[run] = 1.0*sum(majFracs)/K # unweighted avg across clusters majFrac_avg = np.mean(majFracAvgsByRun) majFrac_std = np.std(majFracAvgsByRun) # pcaTime is 0 if pca isn't used return kmeansObj, majFrac_avg, majFrac_std, kmeansTime, majPops, majFracs, pcaTime
def clusterConvergence2ModesL(xk, ym, Rk, yk):
    Np = xk.shape[1]
    p = xk.shape[0]
    d = ym.shape[0]
    # evaluate the unimodal fit
    # compute the mean
    mu1 = np.mean(xk, axis=1)
    # compute the covariance
    coef = 1.0 / (float(Np) - 1.0)
    Pxx = np.zeros((2, 2))
    for k in range(Np):
        Pxx = Pxx + coef * np.outer(xk[:, k] - mu1, xk[:, k] - mu1)
    Ly1 = np.zeros(Np)
    for k in range(Np):
        yexp = yk[:, k]
        # compute the PDF of y given xk[:,k]
        pyx = gaussianNormalPdf(ym - yexp, np.zeros(d), Rk)
        # compute the PDF of xk[:,k]
        px = gaussianNormalPdf(xk[:, k], mu1, Pxx)
        Ly1[k] = pyx * px
    # evaluate the bimodal fit
    Ly2 = np.zeros(Np)
    Pxx2 = np.zeros((2, 2, 2))
    mux2 = np.zeros((2, 2))
    (idxk, mui) = kmeans.kmeans(xk.transpose(), 2)
    for jk in range(2):
        idx = np.nonzero(idxk == jk)
        idx = idx[0]
        # compute the covariance for the jk-th mode
        N2 = len(idx)
        # error checking to prevent single-particle clusters, which don't make sense
        # and break the covariance computation
        if N2 == 1:
            # set Ly2 to zero & break
            Ly2 = np.zeros(Np)
            break
        coef = 1.0 / (float(N2) - 1.0)
        mu2 = np.mean(xk[:, idx], axis=1)
        mux2[jk, :] = mu2
        Px2 = np.zeros((2, 2))
        for k in idx:
            Px2 = Px2 + coef * np.outer(xk[:, k] - mu2, xk[:, k] - mu2)
        Pxx2[jk, :, :] = Px2.copy()
        for k in idx:
            yexp = yk[:, k]
            # compute the PDF of y given xk[:,k]
            pyx = gaussianNormalPdf(ym - yexp, np.zeros(d), Rk)
            # compute the PDF of xk[:,k] under the jk-th mode
            px1 = gaussianNormalPdf(xk[:, k], mu2, Px2)
            Ly2[k] = pyx * px1
    print("L1 = %g, L2 = %g" % (Ly1.max(), Ly2.max()))
    if not (Ly2.max() > Ly1.max()):
        idxk = np.zeros(Np)
        # mui = np.mean(xk,axis=1).transpose()
        return (1, idxk, mu1, Pxx)
    return (2, idxk, mux2, Pxx2)
def test_two_points():
    real_mean = [10, 10, 10]
    points = [
        [(0, 0, 0), 1],
        [(20, 20, 20), 1]
    ]
    k = 1
    means = kmeans(points, k)
    assert 1 == len(means)
    assert real_mean == means[0]
def test_two_points_with_weights():
    real_mean = [20, 20, 20]
    points = [
        [(0, 0, 0), 1],
        [(30, 30, 30), 2]
    ]
    k = 1
    means = kmeans(points, k)
    assert 1 == len(means)
    assert real_mean == means[0]
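# The test above implies this kmeans accepts [value, weight] pairs, so a cluster
# centroid is a weighted mean of its members. A standalone sketch of that update,
# matching the expected [20, 20, 20] for the two weighted points; it is not the
# library's own implementation.
def weighted_mean(points):
    # points: list of [value, weight] pairs, value being a fixed-length sequence
    total_weight = sum(w for _, w in points)
    dim = len(points[0][0])
    return [sum(v[i] * w for v, w in points) / total_weight for i in range(dim)]

assert weighted_mean([[(0, 0, 0), 1], [(30, 30, 30), 2]]) == [20, 20, 20]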
def KmeansCluster(FileName = "",Num = 3): dataSet = [] fileIn = open(FileName) for line in fileIn.readlines(): lineArr = line.strip().split(',') dataSet.append([float(i) for i in lineArr]) dataSet = mat(dataSet) k = Num centroids, clusterAssment = km.kmeans(dataSet, k) return centroids,clusterAssment
def clustering_map(cityMap, k):
    """
    cluster the city map into k clusters.
    """
    # convert all the cityMap vertices into k-means structure
    points = [kmeans.Point([cityMap.pos[v][0], cityMap.pos[v][1]], v)
              for v in cityMap.rv]
    # Cluster those data!
    opt_cutoff = 0.5
    clusters = kmeans.kmeans(points, k, opt_cutoff)
    cityMap.node_clusters(clusters)
def test_two_points_two_centers():
    values = [
        [0, 10, 20],
        [-100, -400, -1600]
    ]
    points = [
        [value, 1]
        for value in values
    ]
    k = 2
    means = kmeans(points, k)
    assert 2 == len(means)
    for value in values:
        assert value in means