def build_model(self):
    try:
        num_of_runs = self.Num_of_runs_val.get()
        num_of_clusters = self.Num_of_clusters_val.get()
        if num_of_runs < 1:
            tkMessageBox.showerror("K Means Clustering", 'invalid number of runs')
            return
        if num_of_clusters < 2 or num_of_clusters > 164:
            tkMessageBox.showerror(
                "K Means Clustering",
                'number of clusters has to be greater than 1 and smaller than 164 (number of records)')
            return
        model.k_means(num_of_clusters, num_of_runs)
        scatter_path = model.plot_scatter()
        map_path = model.plot_map()
        root.geometry("1600x600+200+200")
        self.scatter_img = ImageTk.PhotoImage(Image.open(scatter_path))
        # Keep a reference to the Label widget itself; grid() returns None,
        # so chaining it onto the constructor would lose the widget.
        self.scatter_img_label = tk.Label(self, image=self.scatter_img)
        self.scatter_img_label.grid(row=4, column=0, columnspan=3)
        self.map_img = ImageTk.PhotoImage(Image.open(map_path))
        self.map_img_label = tk.Label(self, image=self.map_img)
        self.map_img_label.grid(row=4, column=3, columnspan=3)
        answer = tkMessageBox.askokcancel(
            "K Means Clustering",
            'Clustering completed successfully, do you want to exit the program?')
        if answer:
            self.master.quit()
    except ValueError as ve:
        # str(ve) works on both Python 2 and 3; ve.message is Python 2 only.
        tkMessageBox.showerror("K Means Clustering", str(ve))
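# build_model above assumes a module-level `model` object exposing k_means(),
# plot_scatter(), and plot_map() that return image file paths. That module is
# not shown in this section; the sketch below is a hypothetical minimal
# interface (the class name, file paths, and sklearn stand-in are assumptions).
import matplotlib
matplotlib.use('Agg')  # render plots to files instead of a window
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans


class ClusteringModel:
    def __init__(self, data):
        self.data = data      # assumed: (n_records, 2) feature array
        self.labels = None

    def k_means(self, n_clusters, n_runs):
        # n_init=n_runs restarts k-means n_runs times and keeps the best fit.
        km = KMeans(n_clusters=n_clusters, n_init=n_runs).fit(self.data)
        self.labels = km.labels_

    def _save_scatter(self, path):
        plt.figure()
        plt.scatter(self.data[:, 0], self.data[:, 1], c=self.labels)
        plt.savefig(path)
        return path

    def plot_scatter(self):
        return self._save_scatter('scatter.png')

    def plot_map(self):
        # The real plot_map presumably draws geographic data; this
        # placeholder just reuses the scatter rendering.
        return self._save_scatter('map.png')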
def clustering():
    try:
        n_clusters = int(request.args.get('num_clusters', 4))
        assert n_clusters > 0
    except (ValueError, AssertionError):
        # abort() raises an HTTPException itself, so no `raise` is needed.
        abort(400, 'num_clusters must be a positive integer')
    try:
        max_iterations = int(request.args.get('max_iterations', 300))
        assert max_iterations > 0
    except (ValueError, AssertionError):
        abort(400, 'max_iterations must be a positive integer')
    try:
        points = parse_input(request.data.decode('ascii'))
    except ValueError:
        abort(400, 'Input data must be an MxN matrix of float numbers')
    if not points.size:
        abort(400, 'Input matrix cannot be empty')

    labels = k_means(points, n_clusters, max_iterations)
    labels += 1  # normalize labels to begin from 1
    # Append each row's label to the row; labels is a 1-D array, so the
    # scalar label is wrapped in a list rather than passed to list().
    response = make_response('[{}]'.format(';\n'.join(
        ', '.join(f'{x}' for x in list(row) + [label])
        for row, label in zip(points, labels))))
    response.headers['Content-Type'] = 'text/plain; charset=utf-8'
    return response
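# The route above delegates body parsing to parse_input(), which is not shown
# in this section. A minimal sketch, assuming the input mirrors the response
# format (rows separated by ';' or newlines, values separated by ','); the
# function name matches the call above, everything else is an assumption.
import numpy as np


def parse_input(text):
    # Raises ValueError on malformed input, which the route turns into a 400.
    rows = [r for r in text.replace('\n', ' ').strip(' []').split(';')
            if r.strip()]
    matrix = [[float(x) for x in row.split(',')] for row in rows]
    if len({len(r) for r in matrix}) > 1:
        raise ValueError('rows have inconsistent lengths')
    return np.array(matrix, dtype=float)

# Assuming the view is registered at, say, /clustering, it could be exercised as:
#   curl -X POST 'localhost:5000/clustering?num_clusters=2' --data '1,2; 3,4; 10,12'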
def test_model():
    """Compare results between our implementation and scikit-learn,
    using the same set of initial centers."""
    df = pandas.read_csv(DATASET_URL, sep='\t')
    # as_matrix() was removed in pandas 1.0; to_numpy() is the replacement.
    X = df[['Distance_Feature', 'Speeding_Feature']].to_numpy()
    np.random.shuffle(X)
    X = X[:500]
    test_centers = initial_centers(X, 4)
    our_result = k_means(X, n_clusters=4, init=test_centers,
                         convergence_threshold=1e-6)
    sample_result = KMeans(n_clusters=4, init=test_centers, n_init=1,
                           algorithm='full').fit(X)
    our_pmf = Counter(our_result.reshape(-1)).values()
    sample_pmf = Counter(sample_result.labels_.reshape(-1)).values()
    print(our_pmf, sample_pmf)
    # Compare sorted cluster sizes; a set comparison would collapse duplicate
    # sizes and could pass on genuinely different clusterings.
    return sorted(our_pmf) == sorted(sample_pmf)
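# The test seeds both implementations through initial_centers(), which is not
# shown here. A plausible minimal version (an assumption, not the original):
import numpy as np


def initial_centers(X, k):
    # Pick k distinct data points so both implementations start identically.
    idx = np.random.choice(len(X), size=k, replace=False)
    return X[idx].copy()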
try:
    with open('tweets.pkl', 'wb') as output:
        pickle.dump(tweets, output, pickle.HIGHEST_PROTOCOL)
except Exception:
    pass
try:
    with open('org_tweets.pkl', 'wb') as output:
        pickle.dump(orginal_content, output, pickle.HIGHEST_PROTOCOL)
except Exception:
    pass
print('write file completed')

# tfs = create_combined_vector(tweets, word_vec.model)
tfs = create_vector(tweets)
print(np.where(np.isnan(tfs)))

k = 50
km = k_means(tfs, k)

# Group the original tweets by their assigned cluster label.
new_json = {}
for i in set(km.labels_):
    current_cluster_bills = [
        orginal_content[x] for x in np.where(km.labels_ == i)[0]
    ]
    new_json['class_' + str(i)] = current_cluster_bills

with open('out.json', 'w') as outfile:
    json.dump(new_json, outfile)

# plt.hist(km.labels_, bins=k)
# plt.show()
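# create_vector() above is not shown in this section. A minimal sketch,
# assuming the tweets are plain strings and a TF-IDF bag-of-words
# representation (the vectorizer choice and max_features are assumptions):
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


def create_vector(tweets):
    vectorizer = TfidfVectorizer(max_features=5000)
    # Dense output so np.isnan() can be applied directly, as above.
    return vectorizer.fit_transform(tweets).toarray()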
# Compute the feature statistics
print('Feature norm l1', dtlib.feature_norm_l1(dt))
print('Feature mean', dtlib.feature_mean(dt))
print('Feature std', dtlib.feature_std(dt))

# Compute the exponential random variable
# exponential_vectorize = np.vectorize(my_exponential)
# dt_exp = exponential_vectorize(dt)
# plt.hist(dt_exp)
# plt.show()

# Reduce to 2 dimensions and plot
dt2 = dtlib.pca(dt, 2)

# Compute the centers with k-means
kmeans_centroids, kmeans_cluster_ids = model.k_means(dt2, n_clusters, 100)
# kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(dt2)

# Compare and plot
plt.scatter(dt2[:, 0], dt2[:, 1])
plt.scatter(kmeans_centroids[:, 0], kmeans_centroids[:, 1])
# plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1])
plt.show()
'''
Conclusion: despite the noise added to the dataset (NaNs replaced by the
mean), four dominant clusters are visible to the naked eye, and the k-means
algorithm identifies them without trouble.
'''

# Repeat the procedure with a dataset whose centroids are closer together
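# model.k_means above returns a (centroids, cluster_ids) pair given
# (data, n_clusters, max_iterations). A minimal Lloyd's-algorithm sketch
# matching that signature; the real module is not shown, so this is
# illustrative only:
import numpy as np


def k_means(data, n_clusters, max_iterations):
    # Start from n_clusters distinct random data points.
    centroids = data[np.random.choice(len(data), n_clusters, replace=False)]
    for _ in range(max_iterations):
        # Assign each point to its nearest centroid.
        dists = np.linalg.norm(data[:, None, :] - centroids[None, :, :], axis=2)
        cluster_ids = dists.argmin(axis=1)
        # Move each centroid to the mean of its assigned points;
        # an empty cluster keeps its previous centroid.
        new_centroids = np.array([
            data[cluster_ids == j].mean(axis=0) if np.any(cluster_ids == j)
            else centroids[j]
            for j in range(n_clusters)
        ])
        if np.allclose(new_centroids, centroids):
            break
        centroids = new_centroids
    return centroids, cluster_ids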