def _init_params(self, x):
    """Initialize GMM parameters by K-means.

    :param x: (n_samples, n_features) features.
    :return: Initialized GMM parameters:
        pi: (n_components,) mixing coefficients
        mean: (n_components, n_features) means
        cov: (n_components, n_features, n_features) covariances
    """
    n_samples, n_features = x.shape
    k_means = KMeans(self.n_components)
    assigned_indices = k_means.fit_predict(x)
    mean_init = k_means.centers
    pi_init = np.zeros(self.n_components)
    cov_init = np.zeros((self.n_components, n_features, n_features))
    for k in range(self.n_components):
        cond = assigned_indices == k
        d_k = x[cond] - mean_init[k]
        pi_init[k] = np.sum(cond) / n_samples
        cov_init[k] = np.dot(d_k.T, d_k) / np.sum(cond)
    return pi_init, mean_init, cov_init
def post_kmeantrain(array: str, featurename: str, orderfeature: str):
    data = pd.read_json(array)
    columnnames = featurename.split(',')  # columnnames = ['DFA', 'violmax', 'maxpeaksqt']
    num_examples = data.shape[0]

    # Get features.
    x_train = data[[iaxis for iaxis in columnnames]].values.reshape(
        (num_examples, len(columnnames)))
    # print(x_train)

    # Set K-Means parameters.
    num_clusters = 4      # Number of clusters into which we want to split our training dataset.
    max_iterations = 50   # Maximum number of training iterations.

    # Init K-Means instance.
    k_means = KMeans(x_train, num_clusters)

    # Train K-Means instance.
    (centroids, closest_centroids_ids) = k_means.train(max_iterations)
    # print(centroids)

    data_frame = pd.DataFrame(centroids,
                              columns=[iaxis for iaxis in columnnames])
    dfsort = data_frame.sort_values(by=[orderfeature])
    L = [chr(i) for i in range(97, 97 + len(centroids))]
    dfsort['L'] = pd.Series(L, index=dfsort.index)
    dfreturn = dfsort.set_index('L', drop=True)
    # print(dfreturn.to_json(orient="index"))
    return dfreturn.to_json(orient="index")
def __init__(self):
    self.kmeans = KMeans()
    super(ClusteringGui, self).__init__()
    uic.loadUi(main_interface_file, self)

    self.browse_btn = self.findChild(QPushButton, 'browse_button')
    self.browse_btn.clicked.connect(self.on_browse_click)

    self.k_selector = self.findChild(QSpinBox, 'k_val_selector')
    self.k_selector.valueChanged.connect(self.on_update_k)

    self.repetitions_selector = self.findChild(QSpinBox, 'k_repetitions_selector')
    self.repetitions_selector.valueChanged.connect(self.on_set_repetitions)

    self.run_btn = self.findChild(QPushButton, 'run_button')
    self.run_btn.clicked.connect(self.on_run_click)
    self.run_btn.setEnabled(False)

    self.step_btn = self.findChild(QPushButton, 'step_button')
    self.step_btn.clicked.connect(self.on_step_click)
    self.step_btn.setEnabled(False)

    self.elbow_btn = self.findChild(QPushButton, 'elbow_chart_button')
    self.elbow_btn.clicked.connect(self.on_show_elbow)
    self.elbow_btn.setEnabled(False)

    self.layout = self.findChild(QVBoxLayout, 'layout')
    self.dimensions_label = self.findChild(QLabel, 'dimensions_label')
    self.dimensions_label.setText("")

    self.show()
def second_cluster_k_means(_rows, _comments, _follows, _times):
    tf = TfIDf(_rows, _comments, _follows, _times)
    tf_idf_dict = tf.tf_idf()
    tf_number = tf.get_total_keywords()
    print(sorted(tf_number.items(), key=lambda d: d[1], reverse=True))

    vsm_file_name = 'second_vsm'
    vsm = BuildVsm(_rows, tf_idf_dict)
    scores = vsm.build_vsm(vsm_file_name)
    vsm_file_path = 'vsm集合\\{}\\{}.txt'.format(vsm_file_name, vsm_file_name)

    k_cluster = K_Means(_rows, _comments, _follows, _times)
    data_set = numpy.mat(load_data_set(vsm_file_path))
    cluster_centroids, cluster_assment = k_cluster.k_means(data_set, 2)

    # Take the first column of every row of the matrix to build the label of each document.
    labels = cluster_assment[:, 0]
    labels = [int(i[0]) for i in labels.tolist()]
    classify_file1(labels, 'second_vsm结果', _rows, _follows, _comments, _times, scores)

    # Cluster again using the KMeans implementation from sklearn.
    data_set = numpy.mat(load_data_set(vsm_file_path))
    cluster = KMeans(init='k-means++', n_clusters=2)
    matrix = cluster.fit_predict(data_set)
    print(matrix)
    labels = list(matrix)
    classify_file1(labels, 'second_vsm结果1', _rows, _follows, _comments, _times, scores)
def main():
    iris_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/' \
               'iris/iris.data'
    x_col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    y_col_name = 'label'
    iris_df = pd.read_csv(iris_url, names=x_col_names + [y_col_name])
    x_data = np.array(iris_df[x_col_names])

    # perform k-means clustering
    k_means = KMeans(n_centers=3, init='k-means++',
                     random_state=np.random.RandomState(0))
    y_pred = k_means.fit_predict(x_data)
    centers = k_means.centers

    # plot
    plot_colors = ['r', 'g', 'b']
    for ci in range(k_means.n_centers):
        plt.scatter(x_data[y_pred == ci, 0], x_data[y_pred == ci, 1],
                    c=plot_colors[ci])
    plt.scatter(centers[:, 0], centers[:, 1], c='y', label='centers')
    plt.title('k-means example on the iris dataset')
    plt.xlabel(x_col_names[0])
    plt.ylabel(x_col_names[1])
    plt.legend()
    plt.show()
def squared_clustering_errors(inputs, k):
    clusterer = KMeans(k)
    clusterer.train(inputs)
    means = clusterer.means
    assignments = map(clusterer.classify, inputs)
    return sum(squared_distance(input, means[cluster])
               for input, cluster in zip(inputs, assignments))
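# A hedged usage sketch (not part of the original snippet): it assumes the same
# scratch KMeans and squared_distance used by squared_clustering_errors above,
# and the sample points below are made up. It plots total squared error against
# k, the usual "elbow" heuristic for choosing the number of clusters.
import matplotlib.pyplot as plt

inputs = [[1.0, 2.0], [1.5, 1.8], [0.5, 0.6], [8.0, 8.0], [8.2, 7.9], [9.0, 7.5]]
ks = list(range(1, len(inputs) + 1))
errors = [squared_clustering_errors(inputs, k) for k in ks]

plt.plot(ks, errors, marker='o')
plt.xlabel('k')
plt.ylabel('total squared clustering error')
plt.title('Choosing k with the elbow method')
plt.show()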
def main(): """ Main function. """ # parse args parser = argparse.ArgumentParser() parser.add_argument('data', help='hw4_nolabel_train.dat') parser.add_argument('-t', '--trial', type=int, default=500, help='experiment times (default = 500)') parser.add_argument( '-o', '--output_to_png', default=False, action='store_true', help='Output image to files. (default is display on screen)') args = parser.parse_args() # get data data = get_data(args.data) # fit k_list = [2, 4, 6, 8, 10] avg_list = [] var_list = [] for k in k_list: err_list = [] k_means = KMeans(k) for _ in range(args.trial): k_means.fit(data) err_list.append(k_means.calc_err()) err_list = np.array(err_list) avg_list.append(err_list.mean()) var_list.append(err_list.var()) # plot plt.scatter(k_list, avg_list) plt.title('Average of $E_{in}$ vs. $k$') plt.xlabel('$k$') plt.ylabel('Average of $E_{in}$') if args.output_to_png: plt.savefig('q_15') else: plt.show() plt.clf() # plot plt.scatter(k_list, var_list) plt.title('Variance of $E_{in}$ vs. $k$') plt.xlabel('$k$') plt.ylabel('Variance of $E_{in}$') if args.output_to_png: plt.savefig('q_16') else: plt.show() plt.clf()
def main():
    path = 'dog.jpeg'
    A = imread(path)
    A = A.astype(float) / 255.
    img_size = A.shape
    X = A.reshape(img_size[0] * img_size[1], img_size[2])
    for k in [2, 4, 8, 16]:
        algorithm = KMeans(k=k, picture=X)
        algorithm.run_k_means(max_iterations=10)
def cluster_paragraphs(paragraphs, num_clusters=2):
    word_lists = make_word_lists(paragraphs)
    word_set = make_word_set(word_lists)
    word_vectors = make_word_vectors(word_set, word_lists)
    paragraph_map = dict(zip(map(str, word_vectors), paragraphs))
    k_means = KMeans(num_clusters, word_vectors)
    k_means.main_loop()
    return translator(k_means.clusters, paragraph_map)
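# A hedged usage sketch for cluster_paragraphs (assumption: translator returns a
# list of paragraph groups, one per cluster, as the helpers above suggest). The
# paragraphs are made-up examples with two obvious topics.
paragraphs = [
    "Cats sleep most of the day and groom themselves constantly.",
    "Dogs love long walks and playing fetch in the park.",
    "Stock markets fell sharply after the interest rate announcement.",
    "Investors shifted money into bonds as equities declined.",
]
for i, group in enumerate(cluster_paragraphs(paragraphs, num_clusters=2)):
    print("Cluster", i + 1)
    for paragraph in group:
        print("  -", paragraph)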
def _initialize_params(self, data):
    km = KMeans(self.k)
    km.fit(data)
    self.dim = data.shape[-1]
    _, self.means = km.predict(data)
    self.means = np.unique(self.means, axis=0)
    self.pis = np.random.uniform(0, 1, (self.k, ))
    self.pis = self.pis / np.sum(self.pis)
    self.covariances = np.array([np.eye(self.dim)] * self.k) * 100000000
    self.gammas = np.zeros((data.shape[0], self.k))
def main():
    # prepare sample data
    centers = 3
    X, _ = make_blobs(
        n_samples=150,
        n_features=2,
        centers=centers,
        cluster_std=0.5,
        shuffle=True,
        random_state=0)

    # fit clusterings
    clusterings = [
        KMeans(
            n_clusters=3,
            init='random',
            n_init=10,
            max_iter=300,
            tol=1.0e-4,
            random_state=1),
        KMeans(
            n_clusters=3,
            init='k-means++',
            n_init=1,
            max_iter=300,
            tol=1.0e-4,
            random_state=1)
    ]
    names = ['k-means', 'k-means++']

    for clustering, name in zip(clusterings, names):
        # predict centroids of clusters and the label of each data point
        y_pred = clustering.fit_predict(X)

        # plot predicted labels
        for i in range(centers):
            Xi = X[y_pred == i]
            plt.scatter(
                Xi[:, 0],
                Xi[:, 1],
                marker='o',
                edgecolor='black',
                label='cluster {0}'.format(i + 1))

        # plot centroids
        plt.scatter(
            clustering.cluster_centers_[:, 0],
            clustering.cluster_centers_[:, 1],
            marker='*',
            edgecolor='black',
            label='centroids')

        # set plot area
        plt.grid()
        plt.legend()
        plt.title(name)
        plt.tight_layout()
        plt.show()

        # show attributes
        print('inertia:{0}'.format(clustering.inertia_))
        print('iteration times:{0}'.format(clustering.n_iter_))
def kmeans_segment(img,
                   n_clusters=DEFAULT_N_CLUSTERS,
                   max_iter=K_MEANS_DEFAULT_MAX_ITER,
                   include_spatial=False,
                   visualize=False):
    n = img.shape[0]
    m = img.shape[1]
    if include_spatial:
        xx = np.arange(n)
        yy = np.arange(m)
        X, Y = np.meshgrid(yy, xx)
        img = np.concatenate((Y.reshape(n, m, 1), X.reshape(n, m, 1), img),
                             axis=2)
        print("kmeans_segment(:include_spatial) img.shape = {}".format(
            img.shape))

    # Use img.shape[-1] to get the last dimension: 5 features per pixel when
    # include_spatial=True, otherwise 3 (the RGB channels).
    img = img.reshape(-1, img.shape[-1])  # 2D array (n*m, features_count)
    segmented_image = KMeans(n_clusters, max_iter).fit(img).reshape(n, m)
    if visualize:
        plt.figure(figsize=(12, 12))
        plt.axis('off')
        plt.imshow(segmented_image)
    return segmented_image
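# A hedged usage sketch for kmeans_segment: it assumes an RGB image loaded as an
# (H, W, 3) array (matplotlib's imread is used here purely for illustration, and
# 'example.jpg' is a hypothetical file), and that DEFAULT_N_CLUSTERS /
# K_MEANS_DEFAULT_MAX_ITER are defined elsewhere in this module.
import matplotlib.pyplot as plt

image = plt.imread('example.jpg')  # hypothetical input image
labels = kmeans_segment(image, n_clusters=5, include_spatial=True, visualize=True)
print(labels.shape)  # (H, W): one cluster index per pixel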
def main():
    k = 3
    data = [
        1.5, 9.5, 5.4, 1.6, 5.5, 9.3, 1.7, 9.1, 1.3, 2.0, 5.0, 7.0, 7.7, 8.0
    ]
    data = zip(data, data)
    KMeans(k, data).cluster()
def run_test(self):
    test = KMeans(self.k, self.num_of_iters)
    total_sse = []
    total_sum = 0
    for seed in range(self.max_seeds):
        test.run(self.points, seed)
        sse = test.compute_sse()
        total_sse.append(sse)
        total_sum += sse
    minimal_sse = min(total_sse)
    mean_sse = total_sum / self.max_seeds  # average over the number of seeds run
    maximal_sse = max(total_sse)
    return [
        f"0-{self.max_seeds - 1}", self.k, self.num_of_iters, minimal_sse,
        mean_sse, maximal_sse
    ]
def main():
    image = img.imread("sample_img1.png")
    colors = np.zeros((image.shape[0] * image.shape[1], image.shape[2]))
    for i in range(image.shape[0]):
        for j in range(image.shape[1]):
            colors[i * image.shape[1] + j] = image[i][j]

    termination_condition_threshold = 0.01
    k_array = [2, 4, 8, 16, 32, 64]
    for k in k_array:
        k_means = KMeans(colors, k)
        k_means.start(termination_condition_threshold)
        output = np.zeros((image.shape[0], image.shape[1], image.shape[2]))
        for i in range(image.shape[0]):
            for j in range(image.shape[1]):
                output[i][j] = k_means.clusters.get_center(
                    k_means.data.memberships[i * image.shape[1] + j])
        img.imsave(f"output{k}.png", output)
def perform_SFS_feature_selection(model, model_type, num_of_classes, data_set):
    # Create a boolean string, 1 = include feature, 0 = leave it out
    feature_set = [i for i in xrange(data_set.shape[1])]
    chosen_features = []
    chosen_clusters = []
    base_performance = float("-inf")
    # While there are still features to choose from...
    while len(feature_set) > 0:
        # Initialize performance metrics
        best_performance = float("-inf")
        best_clusters = []
        #print "best performance = %f" % best_performance
        # Pick a feature that hasn't been chosen yet and train the model
        for feature in feature_set:
            chosen_features.append(feature)
            # Train model
            if model_type == "Kmeans":
                model = KMeans(num_of_classes)
            elif model_type == "HAC":
                model = HAC(num_of_classes)
            #print "Modeling with %s" % chosen_features
            clusters = model.cluster(data_set)
            # Calculate performance via LDA-like objective function
            current_performance = model.calculate_performance()
            #print "model performance = %f" % current_performance
            # If this combo of features beats the best performance so far,
            # take note...
            if current_performance > best_performance:
                best_performance = current_performance
                best_feature = feature
                best_clusters = clusters
                #print "best performance updated to %f" % best_performance
            chosen_features.remove(feature)
        # If the best noted performance beats the best performance we've seen
        # so far, add the feature to the chosen features
        if best_performance > base_performance:
            base_performance = best_performance
            feature_set.remove(best_feature)
            chosen_features.append(best_feature)
            chosen_clusters = best_clusters
            #print "base performance = %f" % base_performance
        else:
            #print "best performance = %f" % base_performance
            break
    return chosen_features, chosen_clusters
def cluster_paragraphs(paragraphs):
    word_lists = make_word_lists(paragraphs)  # two-dimensional list
    word_lists1 = []
    for i in range(len(word_lists)):
        str1 = " ".join(word_lists[i])
        word_lists1.append(str1)
    # print "word_lists1:", word_lists1
    word_set = make_word_set(word_lists)  # set of all words
    vec_df = tfidf(word_lists1)
    word_vectors = make_word_vectors(word_set, word_lists)  # turn each document into a fixed-length vector
    # print "word_vectors:", word_vectors
    paragraph_map = dict(zip(map(str, word_vectors), paragraphs))
    optimum_k = find_optimum_k(vec_df)
    k_means = KMeans(optimum_k, word_vectors)
    k_means.main_loop()
    return translator(k_means.clusters, paragraph_map)
def run_kmeans():
    list_seed = [1, 1, 1, 12, 12, 12]
    list_k = [3, 4, 5, 3, 4, 5]
    print("seed k sl1")
    for index, value_seed in enumerate(list_seed):
        k = list_k[index]
        num_iterations = 10
        input_path = "colors_dataset_ready.txt"
        random_seed = value_seed
        if k <= 1 or num_iterations <= 0:
            print('Please provide correct parameters')
            exit(1)
        if not os.path.exists(input_path):
            print('Input file does not exist')
            exit(1)
        points = load_data(input_path)
        if k >= len(points):
            print('Please set K less than size of dataset')
            exit(1)
        runner = KMeans(k, num_iterations)
        runner.run(points, random_seed)
        print(list_seed[index], end=" ")
        print(list_k[index], end=" ")
        runner.print_results()
def run_kmeans():
    # print(len(argv))
    if len(argv) < 4:
        print(
            'Not enough arguments provided. Please provide 3 arguments: K, num_iterations, path_to_input'
        )
        exit(1)
    k = int(argv[1])
    num_iterations = int(argv[2])
    input_path = argv[3]
    if len(argv) == 5:
        random_seed = int(argv[4])
    else:
        random_seed = 0
    if k <= 1 or num_iterations <= 0:
        print('Please provide correct parameters')
        exit(1)
    if not os.path.exists(input_path):
        print('Input file does not exist')
        exit(1)
    points = load_data(input_path)
    if k >= len(points):
        print('Please set K less than size of dataset')
        exit(1)
    runner = KMeans(k, num_iterations)
    runner.run(points, random_seed)
    runner.print_results()
def main(input_filepath, output_folder, k):
    """
    Receives the location of the tf-idf scores as a command-line Path argument.
    """
    logger = logging.getLogger(__name__)
    logger.info(
        'Training the K-Means clustering algorithm based on the TF-IDF scores')

    # Get the models/tf-idf-scores.csv file
    dataset = pd.read_csv(input_filepath)
    logger.info('Loaded data file ' + input_filepath + ' with ' +
                str(len(dataset)) + ' rows')

    # Removes the first column and formats it like a list
    x = dataset.drop(dataset.columns[0], axis=1).values
    vector_dict = generate_vector_dict(dataset)

    # Number of clusters and max. number of iterations
    km = KMeans(k=k, max_iterations=500)
    km.fit(x)
    clusters = km.get_clusters(vector_dict)

    # Based on the value of K used, change the destination filename
    filepath_list = (output_folder + MODEL_REPORT_FILENAME).rsplit('.', 1)
    output_filepath = filepath_list[0] + '-' + str(k) + '.' + filepath_list[1]

    # Calculate SSE and MSC
    sse_score = km.get_sse_score()
    logger.info('SSE Score: ' + str(sse_score))
    msc_score = km.get_msc_avg()
    logger.info('MSC Score: ' + str(msc_score))

    # Generate the results report
    generate_report(clusters, sse_score, msc_score, output_filepath)
    logger.info('Created report file on ' + output_filepath)

    # Generate / update the results table for future plots
    if os.path.isfile(output_folder + PLOT_TABLE_FILENAME):
        # Update the existing file
        dataset = pd.read_csv(output_folder + PLOT_TABLE_FILENAME)
        dataset.set_index('K Size', inplace=True)
        k_means_results = update_plot_results_table(dataset,
                                                    (k, sse_score, msc_score))
    else:
        # Create and update the file
        dataset = create_plot_results_table()
        k_means_results = update_plot_results_table(dataset,
                                                    (k, sse_score, msc_score))
    k_means_results.to_csv(output_folder + PLOT_TABLE_FILENAME,
                           encoding='utf-8')
    logger.info('Updated report table on ' + output_folder +
                PLOT_TABLE_FILENAME)
def main():
    data = load_data()
    results = []
    np.random.seed(10)

    # pca_data = pca.pca(data, 2)[0]  # pca from scratch
    # pca_data = pca.pca_s(data, 2)   # pca from sk_learn library

    # code for simple run where k=2
    # k = 2
    # random_centroids = np.random.randint(0, 128, k)
    # km = KMeans(k)
    # km.fit(data, random_centroids)

    for k in range(2, 11):
        random_centroids = np.random.randint(0, 128, k)
        km = KMeans(k)
        results.append(km.fit(data, random_centroids))        # comment this for without pca
        # results.append(km.fit(pca_data, random_centroids))  # comment this for with pca

    plt.plot(results, list(range(2, 11)))
    # plt.show()
    plt.savefig('k_means.png')
def __init__(self, input_file, n_bkts, vocab):
    sents = []
    sent = []
    with open(input_file) as f:
        for line in f.readlines():
            info = line.strip().split()
            if info:
                assert (len(info) == 11), 'Illegal line: %s' % line
                word = vocab.word2id(info[1].lower())
                lemma = vocab.lemma2id(info[2].lower())
                tag = vocab.tag2id(info[4])
                head, rel = int(info[6]), vocab.rel2id(info[7])
                syn_mask = int(info[10])
                sent.append([word, lemma, tag, head, rel, syn_mask])
            else:
                sents.append(sent)
                sent = []

    len_counter = Counter()
    for sent in sents:
        len_counter[len(sent)] += 1
    self._bucket_sizes = KMeans(n_bkts, len_counter).splits
    self._buckets = [[] for i in xrange(n_bkts)]
    self._buckets_lens = [[] for i in xrange(n_bkts)]

    len2bkt = {}
    prev_size = -1
    for bkt_idx, size in enumerate(self._bucket_sizes):
        len2bkt.update(
            zip(range(prev_size + 1, size + 1), [bkt_idx] * (size - prev_size)))
        prev_size = size

    self._record = []
    for sent in sents:
        bkt_idx = len2bkt[len(sent)]
        self._buckets[bkt_idx].append(sent)
        self._buckets_lens[bkt_idx].append(len(sent))
        idx = len(self._buckets[bkt_idx]) - 1
        self._record.append((bkt_idx, idx))

    for bkt_idx, (bucket, size) in enumerate(zip(self._buckets, self._bucket_sizes)):
        self._buckets[bkt_idx] = np.zeros((size, len(bucket), 6), dtype=np.int32)
        self._buckets_lens[bkt_idx] = np.array(self._buckets_lens[bkt_idx])
        for idx, sent in enumerate(bucket):
            self._buckets[bkt_idx][:len(sent), idx, :] = np.array(sent, dtype=np.int32)
def fit(self, csr):
    """Apply bisecting k-means."""
    # initialize k-means with k=2 for bisection
    kmeans = KMeans(k=2, pct_change=self.k_means_pct_change,
                    max_iter=self.k_means_max_iter)
    # initialize list of clusters with all points
    clusters = [range(0, csr.shape[0])]
    while len(clusters) < self.k:
        cluster = self.select_next_cluster(clusters)
        # bisect cluster n_iters times and keep the split with the lowest SSE
        lowest_sse = None
        best_split = None
        for i in range(self.n_iters):
            print 'Bisecting run # %d/%d, iter # %d/%d' % (
                len(clusters) + 1, self.k - 1, i + 1, self.n_iters)
            # split cluster in two using k-means with k=2
            bisection = kmeans.fit(csr, cluster)
            split = lambda data, l: [cluster[j] for j, d in enumerate(data) if d == l]
            x, y = split(bisection, 0), split(bisection, 1)
            # calculate total SSE of both clusters and store if lowest so far
            sse_total = self.sse(csr[x, :]) + self.sse(csr[y, :])
            if lowest_sse is None or sse_total < lowest_sse:
                lowest_sse = sse_total
                best_split = (x, y)
        # add best cluster split to list
        clusters.extend(best_split)
    return self.label_clusters(csr, clusters)
def __init__(self, input_file, n_bkts, vocab):
    sents = []
    sent = [[Vocab.ROOT, Vocab.ROOT, 0, Vocab.ROOT]]
    with open(input_file) as f:
        for line in f.readlines():
            info = line.strip().split()
            if info:
                if info[0] == "#":
                    continue
                assert (len(info) == 10), 'Illegal line: %s' % line
                word, tag, head, rel = vocab.word2id(
                    info[1].lower()), vocab.tag2id(info[3]), int(
                        info[6]), vocab.rel2id(info[7])
                sent.append([word, tag, head, rel])
            else:
                sents.append(sent)
                sent = [[Vocab.ROOT, Vocab.ROOT, 0, Vocab.ROOT]]

    len_counter = Counter()
    for sent in sents:
        len_counter[len(sent)] += 1
    print("start k-means bucketing")
    self._bucket_sizes = KMeans(n_bkts, len_counter).splits
    print("k-means bucketing finished")
    self._buckets = [[] for i in xrange(n_bkts)]

    len2bkt = {}
    prev_size = -1
    for bkt_idx, size in enumerate(self._bucket_sizes):
        len2bkt.update(
            zip(range(prev_size + 1, size + 1), [bkt_idx] * (size - prev_size)))
        prev_size = size

    self._record = []
    for sent in sents:
        bkt_idx = len2bkt[len(sent)]
        self._buckets[bkt_idx].append(sent)
        idx = len(self._buckets[bkt_idx]) - 1
        self._record.append((bkt_idx, idx))

    for bkt_idx, (bucket, size) in enumerate(zip(self._buckets, self._bucket_sizes)):
        self._buckets[bkt_idx] = np.zeros((size, len(bucket), 4), dtype=np.int32)
        for idx, sent in enumerate(bucket):
            self._buckets[bkt_idx][:len(sent), idx, :] = np.array(sent, dtype=np.int32)
def post_kmeanprict(array: str, centermodel: str, featurename: str):
    testdata = pd.read_json(array, orient='index')
    centers = pd.read_json(centermodel, orient='index')
    columnnames = featurename.split(',')
    testnumber = testdata.shape[0]

    # Get features.
    test_train = testdata[[iaxis for iaxis in columnnames]].values.reshape(
        (testnumber, len(columnnames)))
    centeridmodel = centers[[iaxis for iaxis in columnnames]].values.reshape(
        (len(centers), len(columnnames)))

    closest_centroids_ids = KMeans.centroids_find_closest(
        test_train, centeridmodel)
    tag = []
    for i in closest_centroids_ids:
        tag.append(centers.index[int(i[0])])
    testdata['tag'] = pd.Series(tag, index=testdata.index)
    return testdata.to_json(orient="index")
def evaluate_model(model, model_type, num_of_classes, candidate_feature_set,
                   data_set):
    '''Cluster the data using the given feature subset and score the result
    with an LDA-like objective function.'''
    # Convert the candidate_feature_set representation from
    # f_1, ..., f_d to the list of indices where f_i = 1
    # (for example, [1 0 0 1 0] -> [0 3])
    candidate_feature_set = \
        [idx for idx in xrange(len(candidate_feature_set))
         if candidate_feature_set[idx] == 1]

    if model_type == "Kmeans":
        model = KMeans(num_of_classes)
    elif model_type == "HAC":
        model = HAC(num_of_classes)

    model.cluster(data_set[:, candidate_feature_set])
    return model.calculate_performance()
# Datasets to test
tests = [('data_sets/original/glass_data.txt', 7),
         ('data_sets/original/iris_data.txt', 3),
         ('data_sets/original/spam_data.txt', 2)]

for test in tests:
    data_instances = []
    data_file = open(test[0])
    print "Running with %s" % test[0]
    for line in data_file:
        line_split = line.split(',')
        data_instances.append(map(float, line_split))
    data_instances = np.array(data_instances)

    # Run SFS using k-means and HAC
    kmeans_model = KMeans(test[1])
    hac_model = HAC(test[1])

    # Glass dataset
    if "glass" in test[0]:
        kmeans_sfs_glass = np.array([1, 3])
        kmeans_model.cluster(data_instances[:, kmeans_sfs_glass])
        print "Kmeans SFS glass performance = %f" % kmeans_model.calculate_performance()

        kmeans_ga_glass = np.array([0, 1, 2, 3, 4, 5, 6])
        kmeans_model = KMeans(test[1])
        kmeans_model.cluster(data_instances[:, kmeans_ga_glass])
        print "Kmeans GA glass performance = %f" % kmeans_model.calculate_performance()

        hac_sfs_glass = np.array([0])
        hac_model.cluster(data_instances[:, hac_sfs_glass])
import numpy as np
import matplotlib.pyplot as plt
from k_means import KMeans

k_means = KMeans(2)
X = np.loadtxt("realdata.txt")[:, 1:]
k_means.fit(X)
labels = k_means.labels_

plt.xlabel('Length')
plt.ylabel('Width')
handles = []
s1 = plt.scatter(X[labels == 0, 0], X[labels == 0, 1], color='r',
                 label="Cluster1", marker='o')
handles.append(s1)
s2 = plt.scatter(X[labels == 1, 0], X[labels == 1, 1], color='k',
                 label="Cluster2", marker='^')
handles.append(s2)
plt.legend(handles=handles)
plt.title('K-means')
plt.show()
import numpy as np
from k_means import KMeans
from distance import euclidean
from mean import mean
import pickle

DATA_PATH = r'D:\datasets\mnist\large_dataset\mnist_train.csv'

print('Loading Data')
f = open(DATA_PATH, 'r')
data_list = []
for line in f.readlines():
    observation = np.asfarray(line.split(',')[1:])
    data_list.append(observation / 255)
f.close()
print('Finished Loading')

print('Fitting Started')
model = KMeans()
clusters = model.fit(data_list, 10, euclidean, mean)
print('Fitting Finished')

print('Saving Clusters to ./clusters.pkl')
f = open('./clusters.pkl', 'wb')
pickle.dump(clusters, f, protocol=pickle.HIGHEST_PROTOCOL)
f.close()
# GIS filters. This table should have central_area = [usable area in square meters].
# read in cell-level data
# table contains site_id, grid_id, i, j, central_area, net_pv_capacity.
with open('cell_central_pv_capacity_original.csv') as csvfile:
    data = list(csv.DictReader(csvfile))
x, y, area = np.array(
    list((r["i"], r["j"], r["central_area"]) for r in data), dtype=float).T

# data = csv_to_dict('cell_central_pv_capacity_original.csv')
# i = np.array(data["i"], dtype=float)
# j = np.array(data["j"], dtype=float)
# area = np.array(data["central_area"], dtype=float)

# cluster the cells into 150 projects (somewhat arbitrarily) instead of ~750,
# and use the cluster numbers as new site_id's.
km = KMeans(150, np.c_[x, y], size=0.0001 * area)
km.init_centers()
km.find_centers()
# km.plot()

for i in range(len(x)):
    # km.cluster_id is an array of cluster id's, same length as x and y
    data[i]["cluster_id"] = km.cluster_id[i]

# insert the modified data into the database
# note: it is reportedly faster to construct a single
# insert query with all the values using python's string
# construction operators, since executemany runs numerous
# separate inserts. However, it's considered more secure to use
# the database library's template substitution, so we do that.
executemany("""
    INSERT INTO cell_central_pv_capacity
class TestKMeans(unittest.TestCase):

    def setUp(self):
        self._kmeans = KMeans(2)  # n_clusters

    def test_get_intial_centroids(self):
        data = np.array([[1, 1], [0, 0], [-1, -1], [2, 2]])
        data = self._kmeans._get_initial_centroids(data)
        # random seed chooses the same centroids
        exp_data = np.array([[0, 0], [2, 2]])
        self.assertTrue(np.array_equal(data, exp_data))
        cent1, cent2 = [cluster.centroid for cluster in self._kmeans.clusters]
        exp_cent1 = np.array([1, 1])
        exp_cent2 = np.array([-1, -1])
        self.assertTrue(np.array_equal(cent1, exp_cent1))
        self.assertTrue(np.array_equal(cent2, exp_cent2))

    def test_choose_cluster(self):
        self._kmeans.clusters.append(
            self._kmeans.Cluster(np.array([1, 1]), initial=True))
        self._kmeans.clusters.append(
            self._kmeans.Cluster(np.array([-1, -1]), initial=True))
        self._kmeans._choose_cluster(np.array([1, 0]))
        self._kmeans._choose_cluster(np.array([-1, -2]))
        data_points1 = self._kmeans.clusters[0].data_points
        exp1 = np.array([[1, 1], [1, 0]])
        data_points2 = self._kmeans.clusters[1].data_points
        exp2 = np.array([[-1, -1], [-1, -2]])
        self.assertTrue(np.array_equal(data_points1, exp1))
        self.assertTrue(np.array_equal(data_points2, exp2))

    def test_squared_euclidian_dist(self):
        x1, y1 = (0, 0)  # 0
        x2, y2 = (np.array([1, 2, 3]), np.array([3, 2, 1]))  # 8
        x3, y3 = (np.array([[1, 2], [1, 2]])), np.array([[1, 1], [1, 1]])
        exp1 = 0
        res1 = self._kmeans._squared_euclidian_dist(x1, y1)
        exp2 = 8
        res2 = self._kmeans._squared_euclidian_dist(x2, y2)
        exp3 = 2
        res3 = self._kmeans._squared_euclidian_dist(x3, y3)
        self.assertEqual(res1, exp1)
        self.assertEqual(res2, exp2)
        self.assertEqual(res3, exp3)

    def test_is_finish_success(self):
        self._kmeans.clusters.append(self._kmeans.Cluster(np.array([1, 1])))
        self._kmeans.clusters.append(self._kmeans.Cluster(np.array([-1, -1])))
        self._kmeans.clusters[0].data_points = np.array([[1, 1], [1, 0]])
        self._kmeans.clusters[1].data_points = np.array([[1, 1], [-1, -2]])
        self._kmeans.prev_clusters = deepcopy(self._kmeans.clusters)
        res = self._kmeans._is_finish()
        exp = 1
        self.assertEqual(res, exp)

    def test_is_finish_fail(self):
        self._kmeans.clusters.append(self._kmeans.Cluster(np.array([1, 1])))
        self._kmeans.clusters.append(self._kmeans.Cluster(np.array([-1, -1])))
        self._kmeans.clusters[0].data_points = np.array([[1, 1], [1, 0]])
        self._kmeans.clusters[1].data_points = np.array([[1, 1], [-1, -2]])
        self._kmeans.prev_clusters = deepcopy(self._kmeans.clusters)
        self._kmeans.clusters[0].data_points = np.array([[2, 1], [1, 0]])
        res = self._kmeans._is_finish()
        exp = 0
        self.assertEqual(res, exp)

    def test_update_centroids_and_data(self):
        self._kmeans.clusters.append(self._kmeans.Cluster(np.array([1, 1])))
        self._kmeans.clusters.append(self._kmeans.Cluster(np.array([-1, -1])))
        self._kmeans.clusters[0].data_points = np.array([[1, 1], [1, 0]])
        self._kmeans.clusters[1].data_points = np.array([[1, 1], [-1, -2]])
        self._kmeans.prev_clusters = deepcopy(self._kmeans.clusters)
        data = self._kmeans._update_centroids_and_data()
        res_centroid1 = self._kmeans.clusters[0].centroid
        res_centroid2 = self._kmeans.clusters[1].centroid
        exp_centroid1 = np.array([1., 0.5])
        exp_centroid2 = np.array([0., -0.5])
        exp_data = np.array([[1, 1], [1, 0], [1, 1], [-1, -2]])
        self.assertTrue(np.array_equal(res_centroid1, exp_centroid1))
        self.assertTrue(np.array_equal(res_centroid2, exp_centroid2))
        self.assertTrue(np.array_equal(data, exp_data))
# graphing imports!
import matplotlib.pyplot as plt
import matplotlib.colors as colors

# clustering
from csvReader import CSVReader
from k_means import KMeans

inputFile = "microarraydata.csv"
k = 4

csvReader = CSVReader()
microarrayData = csvReader.read(inputFile)
print microarrayData

kmeans = KMeans(verbose=True)
finalClusters = kmeans.kmeans(microarrayData, k)

print "\nFinal set of gene clusters:"
for clusterIdx, cluster in enumerate(finalClusters):
    print "\tCluster %d: %s" % (clusterIdx + 1,
                                ["gene" + str(idx + 1) for gene, idx in cluster])
print ""
#!/usr/bin/env python3
import sys
sys.path.append('code')

import numpy as np
from k_means import KMeans

# Pokemon height/weight
data = np.array([[0.4, 6.0],    # Pikachu
                 [0.7, 6.9],    # Bulbasaur
                 [0.6, 8.5],    # Charmander
                 [0.5, 9.0],    # Squirtle
                 [1.2, 36.0],   # Slowpoke
                 [1.6, 78.5],   # Slowbro
                 [1.1, 90.0],   # Seel
                 [1.7, 120.0],  # Dewgong
                 [2.2, 210.0],  # Dragonite
                 [1.7, 55.4],   # Articuno
                 [1.6, 52.6],   # Zapdos
                 [2.0, 60.0]])  # Moltres

if __name__ == "__main__":
    k_means = KMeans(2)
    k_means.train(data)
    k_means.report()
# weight = np.sqrt(traj_weight/traj_weight.max())
# for i, traj in enumerate(oil_price_traj):
#     # plot each row as a separate series, with appropriate width and alpha
#     # plt.semilogy(periods, traj, 'k-', linewidth=5*traj_weight[i]/traj_weight.mean(), alpha=.1)
#     plt.semilogy(periods, traj, 'k-', linewidth=10*weight[i], alpha=weight[i])
# plt.show()

# mu, cluster_id = scipy.cluster.vq.kmeans2(
#     data=np.hstack([oil_prices.T, gas_prices.T]),
#     k=125,
#     minit='points'
# )

# get a better starting point than scipy kmeans usually provides
km = KMeans(125, np.hstack([oil_prices.T, gas_prices.T]))
km.init_centers()
# takes about 60 s for 100,000; roughly linear in #
mu, cluster_id = scipy.cluster.vq.kmeans2(
    data=np.hstack([oil_prices.T, gas_prices.T]),
    k=km.mu
)

for var in save_vars:
    f = os.path.join(pha_dir, var + '.npy')
    np.save(f, locals()[var])

# process the scenario data
oil_price_traj = mu[:, :len(periods)]   # first half of mu
gas_price_traj = mu[:, -len(periods):]  # second half of mu
traj_weight = np.bincount(cluster_id) / cluster_id.shape[0]
# print traj_weight
from k_means import KMeans
import scipy.io as sio
import numpy as np
import matplotlib.pyplot as plt

# =================== K-Means Clustering ======================
data = sio.loadmat('data\\ex7data2.mat')
K = 3
num_iters = 10
X = data['X']
initial_centroids = np.matrix([[3, 3], [6, 2], [8, 5]])

kmeans = KMeans(K, num_iters)
idx = kmeans.findClosestCentroids(X, initial_centroids)
kmeans.train_model(X, initial_centroids, True)

# ============= K-Means Clustering on Pixels ===============
data = sio.loadmat('data\\bird_small.mat')
A = data['A']
A = A / 255
m, n, _ = A.shape
X = A.reshape([-1, 3])
K = 16
print("The Final Selected Features are: (features are zero indexed) ") print("{}\n".format(selected_features)) print("The Fisher Score for the clustering is: ") print("{}\n".format(best_features["evaluation"])) pp = pprint.PrettyPrinter(indent=2, width=400) print( "For Clustered points, the key in the dictionary represents the cluster each data point belongs to. " ) print("Clustered points: ") pp.pprint(full_clusters) # KMeans experiments sys.stdout = open('results/GA-Kmeans-iris-results.txt', 'w') run_ga_kmeans_experiment("data/iris.data.txt", 3, KMeans(3)) sys.stdout = open('results/GA-Kmeans-glass-results.txt', 'w') run_ga_kmeans_experiment("data/glass.data.txt", 6, KMeans(6)) sys.stdout = open('results/GA-Kmeans-spambase-results.txt', 'w') run_ga_kmeans_experiment("data/spambase.data.txt", 2, KMeans(2), fraction_of_data_used=100) # HAC experiments sys.stdout = open('results/GA-HAC-iris-results.txt', 'w') run_hac_experiment("data/iris.data.txt", 3, HAC(3)) sys.stdout = open('results/GA-HAC-glass-results.txt', 'w')