def main():
    # number of clusters
    k = 3
    data = pd.read_csv("data_Noah_Preprocessing.csv", encoding='utf-8')
    data = data.drop(['pitch_type'], axis=1)
    result, finalCentroids = k_means(data.values, n_clusters=k)

    # fill a dataframe with the k-means result
    cluster = pd.DataFrame(data=result, columns=['cluster#'])
    outData = pd.read_csv("data_Noah_Preprocessing.csv", encoding='utf-8')
    outData = appendCol(outData, cluster)
    outData = outData.sort_values(by=['cluster#', 'pitch_type'])
    kMeansAccuracy(outData, k)

    centroids = pd.DataFrame(data=finalCentroids,
                             columns=['x', 'y', 'speed', 'az'])
    # dummy values for visualization
    dummy = pd.DataFrame(data={'pitch_type': ['GG'] * k, 'cluster#': [k] * k})
    centroids = appendCol(centroids, dummy)
    # dummy values for visualization
    dummy2 = pd.DataFrame(
        data={'sizedummy': ['4'] * data.shape[0] + ['10'] * k})
    # pd.concat replaces the deprecated DataFrame.append
    outData = pd.concat([outData, centroids], sort=False, ignore_index=True)
    outData = appendCol(outData, dummy2)
    outData.to_csv("data_Noah_Cluster.csv", index_label=False, index=False)
def spectral_cluster(data):
    # build the Laplacian matrix
    lm = laplace_matrix(data)
    # compute eigenvalues and eigenvectors of the Laplacian
    eg_values, eg_vectors = linalg.eig(lm)
    # sort eigenvector columns by ascending eigenvalue
    idx = eg_values.argsort()
    eg_vectors = eg_vectors[:, idx]  # reorder columns (the original sliced rows by mistake)
    m = len(data)
    # eg_data is the matrix u from the tutorial
    eg_data = [[] for x in range(m)]
    for i in range(m):
        # row i of u: y[i]
        eg_data[i] = [0 for x in range(k)]
        # keep the first k eigenvector components (k is a module-level global here)
        for j in range(k):
            eg_data[i][j] = eg_vectors[i][j]
    return k_means(eg_data)
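# The snippet above calls a laplace_matrix helper that is not shown. Below is
# a minimal sketch of what such a helper typically computes for spectral
# clustering: the symmetric normalized Laplacian of a Gaussian similarity
# graph. The sigma parameter and the fully connected graph are assumptions,
# not taken from the original code.
import numpy as np

def laplace_matrix(data, sigma=1.0):
    # build a fully connected Gaussian similarity graph, then
    # L = I - D^{-1/2} W D^{-1/2}
    x = np.asarray(data, dtype=float)
    sq_dists = np.sum((x[:, None, :] - x[None, :, :]) ** 2, axis=-1)
    w = np.exp(-sq_dists / (2.0 * sigma ** 2))
    np.fill_diagonal(w, 0.0)  # no self-loops
    d_inv_sqrt = 1.0 / np.sqrt(w.sum(axis=1))
    return np.eye(len(x)) - (w * d_inv_sqrt[:, None]) * d_inv_sqrt[None, :]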
def do_experiment(train_data, test_data):
    """
    Conducts a single experiment.
    INPUT:
        train_data: (data_raw, data)
        test_data:  (data_raw, data)
    OUTPUT:
        BER
    """
    x_raw, x = train_data

    # STEP 1: find clusters
    # perform the jump method to choose the number of clusters
    mapper = k_means(MAX_K)
    jump, data = mapper.jump_method(x, NUM_ITERATIONS, True, MAX_K)
    num_clusters_predicted = np.argmax(jump)  # most likely k value
    # extract the data from the run with the most likely k value
    distortion, means, assign_train = data[num_clusters_predicted]

    # STEP 2: find the mapping
    # modes stores the bit value for each cluster mean as an integer, found
    # by taking the mode of the symbols assigned to each cluster
    modes = np.zeros(means.size, dtype=np.int8)
    # lookup table mapping xor difference -> number of bit errors
    error_values = np.array([bin(x).count('1') for x in range(MAX_K)])
    for mean in range(num_clusters_predicted):
        modes[mean] = mode(x_raw[assign_train == mean])[0][0]

    # STEP 3: evaluate the BER
    x_raw, x = test_data
    assign = np.argmin(np.abs(x[:, None] - means[None, :]), axis=1)
    x_recon = modes[assign]
    # count bit errors via the xor lookup table
    diff = x_recon ^ x_raw  # bitwise comparison
    bit_errors = np.sum(error_values[diff])
    ber = bit_errors / (NUM_SAMPLES * BITS_PER_SYMBOL)
    return distortion, means, assign_train, jump, num_clusters_predicted, ber
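# jump_method above belongs to the author's k_means class and is not shown.
# A minimal sketch of the jump statistic it presumably implements (Sugar &
# James): transform the per-k distortions by the power -p/2 and take
# successive differences; the largest jump marks the most likely k. The
# distortions argument and the p parameter are assumptions about the
# interface, not taken from the original.
import numpy as np

def jump_statistic(distortions, p):
    # distortions[k-1] = mean distortion of a k-cluster fit, k = 1..MAX_K;
    # p is the data dimension; d_0 is taken as 0 by convention
    d = np.asarray(distortions, dtype=float) ** (-p / 2.0)
    return np.diff(np.concatenate(([0.0], d)))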
def main():
    filename = 'iris2d.data'
    if len(sys.argv) > 1:
        filename = sys.argv[1]
    else:
        print("Assuming filename is 'iris2d.data'")

    interval = [0] * 2
    print("Do you want to choose a single 'k' or an interval of 'k's?")
    choice = input("type 'k' or 'i': ")
    if choice == 'k':
        k = int(input('Type a value of k: '))
        interval[0] = interval[1] = k
    elif choice == 'i':
        interval = input('Type the two numbers of the interval: ')
        interval = [int(a) for a in interval.split()]
        assert len(interval) == 2
    else:
        print('Assuming k=3')
        interval[0] = interval[1] = 3

    print('f0,f,d1,d2')
    dots = None
    for k in range(interval[0], interval[1] + 1):
        dots = parse_file(filename)
        print('k =', k)
        result = k_means(k, dots)
        print(result)
        if result[0] > result[1]:
            print('f0 is greater than the f from the table,',
                  "so we can reject the hypothesis that there is no group (it's all OK).")
        else:
            print('f0 is smaller than the f from the table; your k is likely inappropriate.')

    if len(dots[0].pos) == 2:
        choice = input('Display last result with IBL1? [y/N] ')
        if choice == 'y':
            use_ibl(dots)
import numpy as np
import pandas as pd
import os
import sys

from k_means import *

if __name__ == '__main__':
    # trim printed floats to two decimals
    np.set_printoptions(precision=2, suppress=True)
    os.chdir('data')
    LARGER_DISTANCE = sys.maxsize
    # read the data from file
    df = pd.read_csv("train_numbers.csv")
    # fill missing values with the column mean
    DATA_SET = df.fillna(df.mean()).values
    # size of the data set
    DATA_LEN = len(DATA_SET)
    # run k-means
    k_means(5, 0, DATA_SET, DATA_LEN, LARGER_DISTANCE)
    ind = temp.upper().find('-TESTING=')
    if not ind == -1:
        testingFilename = str(temp[10:])
        continue

'''
Reading the training data
'''
f = open(trainingFilename, 'r')
trainingdata = json.load(f)
traininglabels = array(trainingdata['labels'])
trainingpoints = array(trainingdata['points'])
f.close()

# get the vectors using k-means
vectors = k_means(trainingpoints, N, True)

'''
***************************** PART 1 *****************************
'''
# prepare the data for matplotlib
if printPlot:
    import matplotlib.pyplot as plt
    x = []
    y = []
    for point in trainingpoints:
        x.append(point[0])
        y.append(point[1])
from image_utils import *
from k_means import *

if __name__ == "__main__":
    file = input(
        "please enter an image filename to run the k-means clustering algorithm on: ")
    k = int(input(
        "please enter a value for k, the number of colors you would like to cluster the image into: "))
    output = input(
        "what would you like to name the file that contains the clustered image? (end name with .ppm) ")

    image = read_ppm(file)
    result = k_means(image, k)
    # result[0] holds the cluster means, result[1] the per-pixel assignments
    for mean in range(len(result[0])):
        for i in range(len(result[1])):
            # the innermost loop originally reused the name k, shadowing the
            # cluster count; renamed to j to avoid the collision
            for j in range(len(result[1][0])):
                if result[1][i][j] == mean:
                    image[i][j] = result[0][mean]
    save_ppm(output, image)
import numpy as np
from k_means import k_means

x = np.random.rand(1000, 2) * 100

kmeans = k_means(number_of_clusters=5, number_of_iteration=10, init='random')
points_with_centroid_index, centroids = kmeans.fit(x)
kmeans.visualize()

kmeans = k_means(number_of_clusters=5, number_of_iteration=10,
                 init='initial_centroids_from_points')
points_with_centroid_index, centroids = kmeans.fit(x)
kmeans.visualize()
# Angad Cheema cheem011
# Project 1, run_k_means module
# function for taking a file from the user and running k-means

from k_means import *
'''
takes file name, k and output file name as input
applies k means function, then assigns each pixel to the average
color of its respective cluster
saves the new image to the output file
'''

file = input("Input image file name: ")
k = int(input("Input number of colors: "))
output_file = input("Input output file name: ")

image = read_ppm(file)
means_list, assignments = k_means(image, k)  # run k-means
width, height = get_width_height(image)
for x in range(width):
    for y in range(height):
        # assign each pixel to its closest cluster mean
        image[x][y] = means_list[assignments[x][y]]
save_ppm(output_file, image)
print("Process completed, image saved to", output_file)
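# Several of the image snippets here assume a k_means(image, k) that returns
# (means, assignments) for a 2-D grid of (r, g, b) pixels. A minimal sketch of
# such a function under that assumption; the iteration count, random-sample
# initialization, and row/column orientation are illustrative choices, not
# taken from the originals.
import random

def k_means(image, k, iterations=10):
    pixels = [px for row in image for px in row]
    means = random.sample(pixels, k)  # initial means drawn from the data
    assignments = [[0] * len(image[0]) for _ in image]
    for _ in range(iterations):
        sums = [[0.0, 0.0, 0.0] for _ in range(k)]
        counts = [0] * k
        for i, row in enumerate(image):
            for j, px in enumerate(row):
                # assign the pixel to the nearest mean (squared distance)
                best = min(range(k), key=lambda m: sum(
                    (px[c] - means[m][c]) ** 2 for c in range(3)))
                assignments[i][j] = best
                counts[best] += 1
                for c in range(3):
                    sums[best][c] += px[c]
        # recompute each mean from its assigned pixels
        means = [tuple(s / counts[m] for s in sums[m]) if counts[m]
                 else means[m] for m in range(k)]
    return means, assignments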
gen = data_gen(center, N, K)  # data generator
dr = data_drawer(K)
ch = convex_hull()
m = MST(100, K)

#mass = gen.normal_gen()
#mass = gen.nested_data()
mass = gen.get_random_data()  # all points
#mass = gen.get_real_random_data()
#mass = gen.strip_data()

sample = mass[:sample_N]
smpl = data_gen(center, sample_N, K)
smpl.mass = sample
s_edges = smpl.get_edges()
s_edges.sort()
mst = m.get_MST(s_edges)
km_obj = k_means(mass, K)
#mass = gen.strip_data()
#convex = ch.convex_hull(result[0])
#dr.draw_MST(mst, mass)
#dr.draw_data(mass)

# get clusters from the hierarchical (MST) clustering
sample_clusters = m.get_clusters(sample)
centroids = m.get_centroids()  # cluster centers
res = 0

# nesting check
for k in range(K):
    convex = ch.convex_hull(sample_clusters[k])
    for i in centroids:
        if len(convex) != 0:
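# m.get_clusters above is not shown. The classic way an MST yields K clusters
# is to delete the K-1 heaviest tree edges and take the connected components
# of what remains; a minimal sketch under that assumption. The (weight, u, v)
# edge format and index-based points are illustrative, not from the original.
def mst_clusters(mst_edges, n_points, K):
    # drop the K-1 heaviest tree edges
    kept = sorted(mst_edges)[:-(K - 1)] if K > 1 else list(mst_edges)

    # union-find over the remaining edges to get the components
    parent = list(range(n_points))
    def find(a):
        while parent[a] != a:
            parent[a] = parent[parent[a]]  # path compression
            a = parent[a]
        return a
    for _, u, v in kept:
        parent[find(u)] = find(v)

    clusters = {}
    for p in range(n_points):
        clusters.setdefault(find(p), []).append(p)
    return list(clusters.values())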
# Ahmed Hassasn
from image_utils import *
from k_means import *

if __name__ == "__main__":
    file_name = input(
        "What is the file name of the image you'd like to change?: ")
    k = int(input("What is the K value you would like to use?: "))
    output = input(
        "What is the file name of the image you'd like to output to?: ")

    image = read_ppm(file_name)
    width, height = get_width_height(image)
    save_ppm(output, k_means(image, k))
    def k_means(self, dataList, k, iteration):
        km = k_means(k, iteration, dataList)
        cif = tranDF2list(km.getOutPut())
        plt = km.getImg()
        return cif, plt
def main(degree=4, cache_data=False, use_cached_data=False,
         use_all_data=True, plot=True, windowed=True):
    """Cluster windowed FFT coefficients with two layers of k-means."""
    total_start = time.time()
    fetched_data = get_fetched_data(cache_data=cache_data,
                                    use_cached_data=use_cached_data,
                                    use_all_data=use_all_data)
    # slice the data into windows
    data, window_size = scraper.slice_windows(fetched_data)
    fft_data, z_list = poly_fit(data, window_size)
    # freq_vecs = [fft_vec for fft_vec in fft_data]
    symbol_dict = {i: data[i][1] for i in range(len(data))}

    total_freqs_data = np.zeros((6, len(data), fft_data[0].shape[1]))
    for v_ind, fft_vec in enumerate(fft_data):
        for c_ind, coeff_vec in enumerate(fft_vec):
            total_freqs_data[c_ind, v_ind] = np.abs(fft_vec[c_ind])

    k_min = 1000
    k_max = 1050
    label_list = []
    k_list = []
    for c_ind, freq_data in enumerate(total_freqs_data):
        # print("working on coefficient {}".format(c_ind))
        cost_k_list = []
        for k in range(k_min, k_max):
            if k % 5 == 0:
                print("k is {0} of {1} for coefficient {2}".format(
                    k - k_min, k_max - k_min, c_ind))
            clusters, label, cost_list = k_means(freq_data, k)
            cost = cost_list[-1]
            cost_k_list.append(cost)
        # index i in cost_k_list corresponds to k = k_min + i
        # (the original added an extra 1 here, an off-by-one)
        opt_k = np.argmin(cost_k_list) + k_min
        plt.plot(range(k_min, k_max), cost_k_list, 'g^')
        plt.plot(opt_k, min(cost_k_list), 'rD')
        plt.title('Cost vs Number of Clusters')
        plt.savefig('plots/kmeans_{0}_ks.png'.format(c_ind), format='png')
        plt.close()

        X = freq_data
        clusters, label, cost_list = k_means(X, opt_k)
        label_list += [np.array(label)]
        k_list += [opt_k]
        # pt_cluster = clusters[label.flatten()]
        # fig = plt.figure()
        # ax = fig.add_subplot(111, projection="3d")
        # data_plot = ax.plot(X[:, 2], X[:, 50], X[:, 25], "bo", markersize=1)
        # center_plot = plt.plot(clusters[:, 0], clusters[:, 1], "g^", markersize=1)
        # plt.legend((data_plot, center_plot), ('data', 'clusters'), loc='best')
        # plt.title('Visualization with {} clusters'.format(opt_k))
        # plt.savefig('plots/kmeans_{0}_{1}.png'.format(c_ind, opt_k), format='png')
        # plt.close()

    # build a one-hot vector per window over all first-layer clusters
    label_list = np.array(label_list)
    data_list = []
    for d_ind in range(label_list.shape[1]):
        datum = label_list[:, d_ind]
        out_vec = np.zeros((sum(k_list)))
        offset = 0
        for c_ind in range(len(datum)):
            out_vec[offset + datum[c_ind]] = 1
            offset += k_list[c_ind]
        data_list += [out_vec]
    data_list = np.array(data_list)

    # second-layer clustering over the one-hot encodings
    cost_k_list = []
    k_min = 1000
    k_max = 1050
    for k in range(k_min, k_max):
        if k % 50 == 0:
            print("on {0} of {1}".format(k, k_max))
        clusters, label, cost_list = k_means(data_list, k)
        cost = cost_list[-1]
        cost_k_list.append(cost)
    opt_k = np.argmin(cost_k_list) + k_min
    # rerun on the one-hot encodings (the original reused X here by mistake)
    clusters, label, cost_list = k_means(data_list, opt_k)
    plt.plot(range(k_min, k_max), cost_k_list, 'g^')
    plt.plot(opt_k, min(cost_k_list), 'rD')
    plt.title('Cost vs Number of Clusters')
    plt.savefig('plots/kmeans_layer2_ks.png', format='png')
    plt.close()

    # group window symbols by their final cluster label
    label_symbols = []
    # print(opt_k)
    total = 0
    for i in range(opt_k):
        ind_arr = np.where(label == i)[0]
        if len(ind_arr) > 0:
            cluster_syms = [symbol_dict[ind] for ind in ind_arr]
            label_symbols += [cluster_syms]
            total += len(cluster_syms)
    print(total)
    label_symbols = np.array(label_symbols)
    print(label_symbols)
    print("================================\n Total elapsed time: {}".format(
        time.time() - total_start))
    return label_symbols
    return parameters


if __name__ == '__main__':
    param = getParameters()
    print("################# kmeans ######################")
    print("1 - load the iris data")
    print("2 - run kmeans on randomly generated data")
    print("3 - test the Elbow method on random data")
    mode = int(input("choose the mode: "))
    if mode == 1:
        points = read_iris_data()
        k_means(points, param['numberOfClusters'],
                param['maxNumberOfRepetitions'], iris=True)
        print("Check the iris_centers and iris_results files "
              "to see the results of the run.")
    elif mode == 2:
        points = generatepoints(param['numberOfDatas'],
                                param['dataDimension'],
                                param['minRandomNumber'],
                                param['maxRandomNumber'])
        k_means(points, param['numberOfClusters'],
                param['maxNumberOfRepetitions'])
        print("Check the centroids and results files "
              "to see the results of the run.")
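# The menu above offers an Elbow-method mode (mode 3) whose handler is not in
# this excerpt. A minimal sketch of what such a handler could compute: run
# k-means for k = 1..k_max and record the total within-cluster distortion; the
# bend ("elbow") where the curve flattens suggests a good k. It assumes a
# hypothetical run_kmeans(points, k, repetitions) -> (centroids, labels)
# helper; the file-writing k_means above does not return these values.
import numpy as np

def elbow(points, k_max, repetitions):
    pts = np.asarray(points, dtype=float)
    distortions = []
    for k in range(1, k_max + 1):
        centroids, labels = run_kmeans(pts, k, repetitions)
        labels = np.asarray(labels)
        # sum of squared distances of each point to its cluster center
        d = sum(np.sum((pts[labels == i] - c) ** 2)
                for i, c in enumerate(centroids))
        distortions.append(d)
    return distortions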
    def get(self):
        blog_data, blog_names = get_data()
        result = k_means(blog_data)
        named_list = elements_for_names(result, blog_names)
        return {'res': named_list}


# Fetches the first column, which is the Employee ID
from image_utils import *
from k_means import *

# inputs for the image and k
file = input("Enter image file > ")
k = int(input('Enter how many colors > '))

print('...processing...')
image = read_ppm(file)
# run the k-means algorithm to produce the new image
newImage = k_means(image, k)
print('New modified image generated')

# save the new image to the file "newImage.ppm"
save_ppm("newImage.ppm", newImage)
print('New modified image saved')