def _plot_metric(k_list, values, metric_name, filename, output_to_png):
    """Scatter-plot one E_in statistic vs. k, then save or display it.

    Parameters:
        k_list: x-axis values (the tried cluster counts).
        values: y-axis values (one statistic per k).
        metric_name: 'Average' or 'Variance' — spliced into title/ylabel.
        filename: savefig target used when output_to_png is truthy.
        output_to_png: save to file when True, otherwise show on screen.
    """
    plt.scatter(k_list, values)
    plt.title('%s of $E_{in}$ vs. $k$' % metric_name)
    plt.xlabel('$k$')
    plt.ylabel('%s of $E_{in}$' % metric_name)
    if output_to_png:
        plt.savefig(filename)
    else:
        plt.show()
    # reset the figure so the next plot starts clean
    plt.clf()


def main():
    """Run repeated k-means trials over several k and plot E_in statistics.

    Command line:
        data: path to the training data (hw4_nolabel_train.dat).
        -t/--trial: number of trials per k (default 500).
        -o/--output_to_png: save figures instead of showing them.
    """
    # parse args
    parser = argparse.ArgumentParser()
    parser.add_argument('data', help='hw4_nolabel_train.dat')
    parser.add_argument('-t', '--trial', type=int, default=500,
                        help='experiment times (default = 500)')
    parser.add_argument(
        '-o', '--output_to_png', default=False, action='store_true',
        help='Output image to files. (default is display on screen)')
    args = parser.parse_args()

    # get data
    data = get_data(args.data)

    # fit: for each k, run `trial` independent fits and record E_in stats
    k_list = [2, 4, 6, 8, 10]
    avg_list = []
    var_list = []
    for k in k_list:
        err_list = []
        k_means = KMeans(k)
        for _ in range(args.trial):
            k_means.fit(data)
            err_list.append(k_means.calc_err())
        err_list = np.array(err_list)
        avg_list.append(err_list.mean())
        var_list.append(err_list.var())

    # fix: the two plotting sections were near-identical copies; the shared
    # logic now lives in _plot_metric (same titles, labels, filenames).
    _plot_metric(k_list, avg_list, 'Average', 'q_15', args.output_to_png)
    _plot_metric(k_list, var_list, 'Variance', 'q_16', args.output_to_png)
def _initialize_params(self, data):
    """Seed the model's parameters from a preliminary k-means run.

    Sets self.dim, self.means, self.pis, self.covariances and self.gammas.
    `data` is expected to be a 2-D array of shape (n_samples, n_features)
    — presumably, judging from the shape[-1]/shape[0] usage; confirm.
    """
    clusterer = KMeans(self.k)
    clusterer.fit(data)

    # feature dimensionality of the input
    self.dim = data.shape[-1]

    # use the k-means assignment means as initial component means;
    # np.unique drops duplicate rows (may leave fewer than k rows —
    # NOTE(review): looks intentional but worth confirming upstream)
    _, assigned = clusterer.predict(data)
    self.means = np.unique(assigned, axis=0)

    # random mixing weights, normalized so they sum to 1
    raw_weights = np.random.uniform(0, 1, (self.k, ))
    self.pis = raw_weights / np.sum(raw_weights)

    # very broad identity covariances so every point has nonzero density
    self.covariances = np.array([np.eye(self.dim)] * self.k) * 100000000

    # responsibilities start at zero: one row per sample, one col per component
    self.gammas = np.zeros((data.shape[0], self.k))
def main(input_filepath, output_folder, k):
    """Train k-means on TF-IDF scores, write a report and update the plot table.

    Parameters:
        input_filepath: path to the tf-idf scores CSV (models/tf-idf-scores.csv).
        output_folder: folder receiving the report and the plot-results table.
        k: number of clusters.
    """
    logger = logging.getLogger(__name__)
    logger.info(
        'Training the K-Means clustering algorithm based on the TF-IDF scores')

    # Get the models/tf-idf-scores.csv file
    dataset = pd.read_csv(input_filepath)
    # fix: lazy %-style logging args instead of eager string concatenation
    logger.info('Loaded data file %s with %d rows', input_filepath, len(dataset))

    # Removes the first column and formats it like a list
    x = dataset.drop(dataset.columns[0], axis=1).values
    vector_dict = generate_vector_dict(dataset)

    # Number of clusters and max. number of iterations
    km = KMeans(k=k, max_iterations=500)
    km.fit(x)
    clusters = km.get_clusters(vector_dict)

    # Based on the value of K used, change the destination filename
    filepath_list = (output_folder + MODEL_REPORT_FILENAME).rsplit('.', 1)
    output_filepath = filepath_list[0] + '-' + str(k) + '.' + filepath_list[1]

    # Calculate SSE and MSC
    sse_score = km.get_sse_score()
    logger.info('SSE Score: %s', sse_score)
    msc_score = km.get_msc_avg()
    logger.info('MSC Score: %s', msc_score)

    # Generate the results report
    generate_report(clusters, sse_score, msc_score, output_filepath)
    logger.info('Created report file on %s', output_filepath)

    # Generate / Update the results table for future plots.
    # fix: both branches previously repeated the update call; only loading
    # vs. creating the table differs, so branch on that alone.
    plot_table_path = output_folder + PLOT_TABLE_FILENAME
    if os.path.isfile(plot_table_path):
        # Update the existing file
        dataset = pd.read_csv(plot_table_path)
        dataset.set_index('K Size', inplace=True)
    else:
        # Create the file
        dataset = create_plot_results_table()
    k_means_results = update_plot_results_table(dataset,
                                                (k, sse_score, msc_score))
    k_means_results.to_csv(plot_table_path, encoding='utf-8')
    logger.info('Updated report table on %s', plot_table_path)
def main():
    """Fit k-means for k = 2..10 and save an elbow plot of error vs. k."""
    data = load_data()
    results = []
    # fixed seed so the random initial centroids are reproducible
    np.random.seed(10)

    for k in range(2, 11):
        # NOTE(review): randint may draw duplicate indices, so two initial
        # centroids can coincide; np.random.choice(128, k, replace=False)
        # would avoid that but changes the random stream — confirm first.
        random_centroids = np.random.randint(0, 128, k)
        km = KMeans(k)
        results.append(km.fit(data, random_centroids))

    # fix: plt.plot(x, y) — the original passed (results, k-range), which put
    # the error values on the x-axis; an elbow plot needs k on x, error on y.
    plt.plot(list(range(2, 11)), results)
    plt.savefig('k_means.png')
def fit(self, csr):
    """Apply bisecting k-means.

    Repeatedly selects a cluster, bisects it n_iters times with 2-means,
    keeps the bisection with the lowest total SSE, and stops once self.k
    clusters exist. Returns the per-point labels from label_clusters.
    """
    # initialize k-means with k=2 for bisection
    kmeans = KMeans(k=2, pct_change=self.k_means_pct_change,
                    max_iter=self.k_means_max_iter)
    # initialize list of clusters with all points
    clusters = [range(0, csr.shape[0])]
    while len(clusters) < self.k:
        cluster = self.select_next_cluster(clusters)
        # bisect cluster iter times and select both clusters from split with lowest SSE
        lowest_sse = None
        best_split = None
        for i in range(self.n_iters):
            # fix: parenthesized print works on Python 2 and 3 alike
            print('Bisecting run # %d/%d, iter # %d/%d' %
                  (len(clusters) + 1, self.k - 1, i + 1, self.n_iters))
            # split cluster in two using k-means of 2
            bisection = kmeans.fit(csr, cluster)
            split = lambda data, l: [cluster[j] for j, d in enumerate(data) if d == l]
            x, y = split(bisection, 0), split(bisection, 1)
            # calculate total SSE of both clusters and store if lowest so far
            sse_total = self.sse(csr[x, :]) + self.sse(csr[y, :])
            # fix: test None FIRST — the original compared a number to None
            # before the guard, which raises TypeError on Python 3 and only
            # worked on Python 2 via its arbitrary None ordering.
            if lowest_sse is None or sse_total < lowest_sse:
                lowest_sse = sse_total
                best_split = (x, y)
        # add best cluster split to list
        clusters.extend(best_split)
    return self.label_clusters(csr, clusters)
import numpy as np
import matplotlib.pyplot as plt
from k_means import KMeans

# Fit 2-means on the real data (first column dropped) and scatter-plot
# the two clusters with distinct colors/markers.
k_means = KMeans(2)
X = np.loadtxt("realdata.txt")[:, 1:]
k_means.fit(X)
labels = k_means.labels_

plt.xlabel('Length')
plt.ylabel('Width')
handles = []
# fix: legend labels were misspelled "Cluter1"/"Cluter2"
s1 = plt.scatter(X[labels == 0, 0], X[labels == 0, 1], color='r',
                 label="Cluster1", marker='o')
handles.append(s1)
s2 = plt.scatter(X[labels == 1, 0], X[labels == 1, 1], color='k',
                 label="Cluster2", marker='^')
handles.append(s2)
plt.legend(handles=handles)
plt.title('K-means')
plt.show()
import numpy as np
from k_means import KMeans
from distance import euclidean
from mean import mean
import pickle

# fix: raw string — '\d', '\m', '\l' are invalid escape sequences in a plain
# literal (DeprecationWarning on modern Python); the raw form has the same value.
DATA_PATH = r'D:\datasets\mnist\large_dataset\mnist_train.csv'

print('Loading Data')
data_list = []
# fix: context manager closes the file even if parsing raises; iterating the
# handle streams lines instead of materializing them all with readlines().
with open(DATA_PATH, 'r') as f:
    for line in f:
        # drop the leading label column, scale pixel values into [0, 1]
        observation = np.asfarray(line.split(',')[1:])
        data_list.append(observation / 255)
print('Finished Loading')

print('Fitting Started')
model = KMeans()
# fit with 10 clusters using the project's euclidean distance and mean fns
clusters = model.fit(data_list, 10, euclidean, mean)
print('Fitting Finished')

print('Saving Clusters to ./clusters.pkl')
with open('./clusters.pkl', 'wb') as f:
    pickle.dump(clusters, f, protocol=pickle.HIGHEST_PROTOCOL)
# NOTE(review): this chunk is the middle of a CLI script; the `else:` below
# pairs with an `if` that lies OUTSIDE the visible source (presumably a
# "generate data vs. load saved data" branch, numbered steps 1/2) — confirm
# against the full file before reformatting.
# Visible flow: optionally save freshly generated data -> or load saved .npy
# data -> run KMeans ("KM") or KMeansPlus ("KMPP") clustering -> evaluate()
# the labels -> (KM branch only) optionally visualize_single() on screen.
print(toGreen("Done"), "\nThe data shape: {}".format(data.shape)) if args.save_name is not None: save_path = "../data/" + args.save_name + '.npy' np.save(save_path, data) print("The generated data has been stored in {}".format(save_path)) else: print(toRed("2. Loading saved data...")) file_path = "../data/" + args.use_saved_data + '.npy' data = np.load(file_path) print("Done. The data shape: {}".format(data.shape)) # Process data using the chosen algorithm(s) print(toRed("3. Starting clustering...")) if args.mode == "KM": kmeans = KMeans(args.cluster_num, data.shape[0], max_iter=100) labels, centers, it = kmeans.fit(data) print(toGreen('Done'), '\nPredicted labels:\n', labels, '\nPredicted cluster center points:\n', centers) print(toRed("4. Evaluating...")) score = evaluate(data, labels) print("score:{}".format(score)) if args.visualize == True: print(toRed("5. Visualizing...")) print(toGreen("Done. Check the figure on screen, close it to exit the program.")) visualize_single(args.cluster_num, data, labels, centers) elif args.mode == "KMPP": kmeansplus = KMeansPlus(args.cluster_num, data.shape[0], max_iter=100) labels, centers = kmeansplus.fit(data) print(toGreen('Done'), '\nPredicted labels:\n', labels, '\nPredicted cluster center points:\n', centers) print(toRed("4. Evaluating...")) score = evaluate(data, labels) print("score:{}".format(score))
# Visualize the 3-D projected images together with their k-means centroids.
# NOTE(review): `images_proj` and the `labels` read on the right-hand side
# below come from code outside this chunk — confirm against the full file.
fig = plt.figure()
# make 3D axis
ax = fig.add_subplot(111, projection='3d')

# data points: small blue markers
xs, ys, zs = images_proj[:, 0], images_proj[:, 1], images_proj[:, 2]
ax.scatter(xs, ys, zs, zdir='z', s=10, c="blue", depthshade=True)

# cluster the projection, then classify using the first 10000 known labels
n_clusters = 345
k_means = KMeans(n_clusters)
error = k_means.fit(images_proj)
labels = k_means.classify_centroids(images_proj, labels[:10000])
centroids = k_means.centroids
print(centroids)

# centroids: large red markers overlaid on the same axes
cx, cy, cz = centroids[:, 0], centroids[:, 1], centroids[:, 2]
ax.scatter(cx, cy, cz, zdir='z', s=500, c="red", depthshade=True)
plt.show()