def accuracy(matrix, bin_cluster, bin_index, ground_truth):
    """Re-cluster the largest coarse bin and label every sample with a bin number.

    The largest of the first eight entries of ``bin_cluster`` is clustered
    again with a 60-cluster KMeans; the result is handed to
    ``k_means.kmeans_model`` and merged with ``bin_index`` to form six bins.
    Prints the KMeans inertia (SSE) as a side effect.

    Args:
        matrix: indexable collection of feature rows, one per sample.
        bin_cluster: indexable by 0..7; each entry lists row indices into
            ``matrix``.
        bin_index: indexable by cluster number; each entry is a list of
            sample indices.
        ground_truth: passed through unchanged to ``k_means.kmeans_model``.

    Returns:
        A list of ``len(matrix)`` labels; 0 for samples that fall in no bin,
        otherwise 1-6. When a sample is in several bins the highest-numbered
        bin wins.
    """
    # Size of each of the eight coarse clusters; the first largest one is refined.
    cluster_sizes = [len(bin_cluster[c]) for c in range(8)]
    max_idx = cluster_sizes.index(max(cluster_sizes))

    feature_m = [matrix[row] for row in bin_cluster[max_idx]]
    bin_index_1 = bin_index[max_idx]

    # Fine-grained clustering of the largest bin.
    kmeans = KMeans(n_clusters=60, random_state=0).fit(feature_m)
    label_1 = list(kmeans.labels_)

    # Map the fine clusters back to bins via the project's kmeans model.
    density_cluster, density_index = k_means.kmeans_model(
        ground_truth, label_1, 60, bin_index_1)

    # Sample indices belonging to labels 1..6, in label order.
    # Bin 1 comes from the refined clustering only.
    bins = [
        density_index[0],
        bin_index[1] + density_index[1],
        bin_index[2] + density_index[2],
        bin_index[3] + density_index[3],
        bin_index[4] + density_index[4],
        bin_index[5] + density_index[5],
    ]

    # Everything starts as 0 and is overwritten bin by bin.
    final_result = [0] * len(matrix)
    for label, members in enumerate(bins, start=1):
        for sample in members:
            final_result[sample] = label

    sse = kmeans.inertia_
    print("SSE value :", sse)
    return final_result
def create_data(X: dt.Frame = None) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray, List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
    """Append a KMeans cluster-id column to ``X``.

    Reads the settings ``include_columns``, ``ignore_columns``,
    ``num_clusters``, ``NUM_JOBS``, ``MIN_CLUSTERS``, ``MAX_CLUSTERS``,
    ``CLUSTER_STEP_SIZE`` and ``target`` plus the helper ``get_score`` from
    the enclosing scope.

    Feature preparation: object columns are label-encoded into
    ``<col>_enc``, columns with at least 30% missing values are dropped,
    remaining gaps are mean-imputed, and everything is min-max scaled.

    Args:
        X: input frame; ``None`` yields an empty list.

    Returns:
        ``X`` with a new ``cluster_ids_<n>`` column, or ``[]`` when ``X`` is
        ``None`` or no clustering was scored.

    Raises:
        ValueError: a setting has the wrong type, an included column is
            missing from ``X``, or no usable feature remains.
    """
    if X is None:
        return []

    # check the datatype of user-defined columns
    if not isinstance(include_columns, list):
        raise ValueError("Variable: 'include_columns' should be <list>")
    if not isinstance(ignore_columns, list):
        raise ValueError("Column: 'ignore_columns' should be <list>")
    if not isinstance(num_clusters, int):
        raise ValueError("Column: 'num_clusters' should be <int>")

    # validate user inputs and override the columns given by user
    column_names = list(X.names)
    features = column_names
    if len(include_columns) > 0:
        for name in include_columns:
            if name not in column_names:
                raise ValueError("Column: '" + str(name) + "' is not present in the dataset")
        features = include_columns

    # drop the columns the user asked to ignore
    features = [_f for _f in features if _f not in ignore_columns]

    # handle categorical columns and columns with missing values
    ignore_ = []
    encoded_cols = []
    X_df = X.to_pandas()
    for col in features:
        # label encode categorical columns
        # refer - https://github.com/h2oai/driverlessai-recipes/pull/68#discussion_r365133392
        if X_df[col].dtype == "object":
            enc_name = f"{col}_enc"
            X_df[enc_name] = LabelEncoder().fit_transform(X_df[col].to_numpy())
            encoded_cols.append(enc_name)
            ignore_.append(col)
            # The raw text column is never clustered, and a mean cannot be
            # computed for strings, so skip the imputation step for it.
            continue

        miss_percent = X_df[col].isna().sum() / X_df.shape[0]
        if miss_percent >= 0.3:
            # ignore columns having more than 30% missing values
            ignore_.append(col)
        elif miss_percent > 0.0:
            # impute by mean for other columns with missing values
            X_df[col] = X_df[col].fillna(X_df[col].mean())

    features = [f for f in features if f not in ignore_]
    # Only add the encodings created above. Scanning every column for
    # "_enc" would re-add ignored or unselected columns that merely contain
    # that substring and could duplicate a feature.
    features += [c for c in encoded_cols if c not in features]
    if len(features) == 0:
        raise ValueError("Unable to cluster: No useful features available")

    X_clust = X_df[features].values

    # Apply min max scaling
    X_clust = MinMaxScaler().fit_transform(X_clust)

    best_score = None
    best_n_clust = None
    best_clust_ids = None

    # if number of clusters is pre-defined by user, then dont find the optimal
    if num_clusters > 1:
        model = KMeans(n_clusters=num_clusters, n_jobs=NUM_JOBS).fit(X_clust)
        clust_ids = model.predict(X_clust)
        score = get_score(X_clust, clust_ids)
        best_score = score
        best_n_clust = num_clusters
        best_clust_ids = clust_ids
    else:
        # Go through possible numbers of clusters; a lower score is better.
        target_values = None
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS, CLUSTER_STEP_SIZE):
            model = KMeans(n_clusters=n_clusters, n_jobs=NUM_JOBS).fit(X_clust)
            clust_ids = model.predict(X_clust)

            if target_values is None:
                # Convert once instead of on every iteration; use the
                # untouched frame so imputation never alters the target.
                target_values = list(X.to_pandas()[target])
            score = get_score(target_values, clust_ids)

            if best_score is None or best_score > score:
                best_score = score
                best_n_clust = n_clusters
                best_clust_ids = clust_ids

    if best_score is None:
        return []

    X[:, f'cluster_ids_{best_n_clust}'] = dt.Frame(best_clust_ids)
    return X
class TestModelTypeChecking(object):
    """
    Test model type checking utilities
    """

    ##////////////////////////////////////////////////////////////////////
    ## is_estimator testing
    ##////////////////////////////////////////////////////////////////////

    def test_estimator_alias(self):
        """
        Assert isestimator aliases is_estimator
        """
        assert isestimator is is_estimator

    @pytest.mark.parametrize("model", ESTIMATORS, ids=obj_name)
    def test_is_estimator(self, model):
        """
        Test that is_estimator works for instances and classes
        """
        assert inspect.isclass(model)
        assert is_estimator(model)

        obj = model()
        assert is_estimator(obj)

    @pytest.mark.parametrize("cls", [
        list, dict, tuple, set, str, bool, int, float
    ], ids=obj_name)
    def test_not_is_estimator(self, cls):
        """
        Assert Python objects are not estimators
        """
        assert inspect.isclass(cls)
        assert not is_estimator(cls)

        obj = cls()
        assert not is_estimator(obj)

    def test_is_estimator_pipeline(self):
        """
        Test that is_estimator works for pipelines
        """
        assert is_estimator(Pipeline)
        assert is_estimator(FeatureUnion)

        model = Pipeline([
            ('reduce_dim', PCA()),
            ('linreg', LinearRegression())
        ])

        assert is_estimator(model)

    def test_is_estimator_search(self):
        """
        Test that is_estimator works for search
        """
        assert is_estimator(GridSearchCV)
        assert is_estimator(RandomizedSearchCV)

        model = GridSearchCV(SVR(), {'kernel': ['linear', 'rbf']})
        assert is_estimator(model)

    # NOTE: pytest calls an ``ids`` callable once per argument value, not
    # once per (viz, params) tuple, so indexing the value with [0] fails on
    # both the class and the dict. Explicit ids are used for the tuple
    # parametrizations in this class.
    @pytest.mark.parametrize("viz,params", [
        (Visualizer, {}),
        (ScoreVisualizer, {'model': LinearRegression()}),
        (ModelVisualizer, {'model': LogisticRegression()})
    ], ids=["Visualizer", "ScoreVisualizer", "ModelVisualizer"])
    def test_is_estimator_visualizer(self, viz, params):
        """
        Test that is_estimator works for Visualizers
        """
        assert inspect.isclass(viz)
        assert is_estimator(viz)

        obj = viz(**params)
        assert is_estimator(obj)

    ##////////////////////////////////////////////////////////////////////
    ## is_regressor testing
    ##////////////////////////////////////////////////////////////////////

    def test_regressor_alias(self):
        """
        Assert isregressor aliases is_regressor
        """
        assert isregressor is is_regressor

    @pytest.mark.parametrize("model", REGRESSORS, ids=obj_name)
    def test_is_regressor(self, model):
        """
        Test that is_regressor works for instances and classes
        """
        assert inspect.isclass(model)
        assert is_regressor(model)

        obj = model()
        assert is_regressor(obj)

    @pytest.mark.parametrize("model", CLASSIFIERS+CLUSTERERS+TRANSFORMERS+DECOMPOSITIONS, ids=obj_name)
    def test_not_is_regressor(self, model):
        """
        Test that is_regressor does not match non-regressor estimators
        """
        assert inspect.isclass(model)
        assert not is_regressor(model)

        obj = model()
        assert not is_regressor(obj)

    def test_is_regressor_pipeline(self):
        """
        Test that is_regressor works for pipelines
        """
        assert not is_regressor(Pipeline)
        assert not is_regressor(FeatureUnion)

        model = Pipeline([
            ('reduce_dim', PCA()),
            ('linreg', LinearRegression())
        ])

        assert is_regressor(model)

    @pytest.mark.xfail(reason="grid search has no _estimator_type it seems")
    def test_is_regressor_search(self):
        """
        Test that is_regressor works for search
        """
        assert is_regressor(GridSearchCV)
        assert is_regressor(RandomizedSearchCV)

        model = GridSearchCV(SVR(), {'kernel': ['linear', 'rbf']})
        assert is_regressor(model)

    @pytest.mark.parametrize("viz,params", [
        (ScoreVisualizer, {'model': LinearRegression()}),
        (ModelVisualizer, {'model': Ridge()})
    ], ids=["ScoreVisualizer", "ModelVisualizer"])
    def test_is_regressor_visualizer(self, viz, params):
        """
        Test that is_regressor works on visualizers
        """
        assert inspect.isclass(viz)
        assert not is_regressor(viz)

        obj = viz(**params)
        assert is_regressor(obj)

    ##////////////////////////////////////////////////////////////////////
    ## is_classifier testing
    ##////////////////////////////////////////////////////////////////////

    def test_classifier_alias(self):
        """
        Assert isclassifier aliases is_classifier
        """
        assert isclassifier is is_classifier

    @pytest.mark.parametrize("model", CLASSIFIERS, ids=obj_name)
    def test_is_classifier(self, model):
        """
        Test that is_classifier works for instances and classes
        """
        assert inspect.isclass(model)
        assert is_classifier(model)

        obj = model()
        assert is_classifier(obj)

    @pytest.mark.parametrize("model", REGRESSORS+CLUSTERERS+TRANSFORMERS+DECOMPOSITIONS, ids=obj_name)
    def test_not_is_classifier(self, model):
        """
        Test that is_classifier does not match non-classifier estimators
        """
        assert inspect.isclass(model)
        assert not is_classifier(model)

        obj = model()
        assert not is_classifier(obj)

    def test_classifier_pipeline(self):
        """
        Test that is_classifier works for pipelines
        """
        assert not is_classifier(Pipeline)
        assert not is_classifier(FeatureUnion)

        model = Pipeline([
            ('reduce_dim', PCA()),
            ('linreg', LogisticRegression())
        ])

        assert is_classifier(model)

    @pytest.mark.xfail(reason="grid search has no _estimator_type it seems")
    def test_is_classifier_search(self):
        """
        Test that is_classifier works for search
        """
        assert is_classifier(GridSearchCV)
        assert is_classifier(RandomizedSearchCV)

        model = GridSearchCV(SVC(), {'kernel': ['linear', 'rbf']})
        assert is_classifier(model)

    @pytest.mark.parametrize("viz,params", [
        (ScoreVisualizer, {'model': MultinomialNB()}),
        (ModelVisualizer, {'model': MLPClassifier()})
    ], ids=["ScoreVisualizer", "ModelVisualizer"])
    def test_is_classifier_visualizer(self, viz, params):
        """
        Test that is_classifier works on visualizers
        """
        assert inspect.isclass(viz)
        assert not is_classifier(viz)

        obj = viz(**params)
        assert is_classifier(obj)

    ##////////////////////////////////////////////////////////////////////
    ## is_clusterer testing
    ##////////////////////////////////////////////////////////////////////

    def test_clusterer_alias(self):
        """
        Assert isclusterer aliases is_clusterer
        """
        assert isclusterer is is_clusterer

    @pytest.mark.parametrize("model", CLUSTERERS, ids=obj_name)
    def test_is_clusterer(self, model):
        """
        Test that is_clusterer works for instances and classes
        """
        assert inspect.isclass(model)
        assert is_clusterer(model)

        obj = model()
        assert is_clusterer(obj)

    @pytest.mark.parametrize("model", REGRESSORS+CLASSIFIERS+TRANSFORMERS+DECOMPOSITIONS, ids=obj_name)
    def test_not_is_clusterer(self, model):
        """
        Test that is_clusterer does not match non-clusterer estimators
        """
        assert inspect.isclass(model)
        assert not is_clusterer(model)

        obj = model()
        assert not is_clusterer(obj)

    def test_clusterer_pipeline(self):
        """
        Test that is_clusterer works for pipelines
        """
        assert not is_clusterer(Pipeline)
        assert not is_clusterer(FeatureUnion)

        model = Pipeline([
            ('reduce_dim', PCA()),
            ('kmeans', KMeans())
        ])

        assert is_clusterer(model)

    @pytest.mark.parametrize("viz,params", [
        (ModelVisualizer, {'model': KMeans()})
    ], ids=["ModelVisualizer"])
    def test_is_clusterer_visualizer(self, viz, params):
        """
        Test that is_clusterer works on visualizers
        """
        assert inspect.isclass(viz)
        assert not is_clusterer(viz)

        obj = viz(**params)
        assert is_clusterer(obj)

    ##////////////////////////////////////////////////////////////////////
    ## is_gridsearch testing
    ##////////////////////////////////////////////////////////////////////

    def test_gridsearch_alias(self):
        """
        Assert isgridsearch aliases is_gridsearch
        """
        assert isgridsearch is is_gridsearch

    @pytest.mark.parametrize("model", SEARCH, ids=obj_name)
    def test_is_gridsearch(self, model):
        """
        Test that is_gridsearch works correctly
        """
        assert inspect.isclass(model)
        assert is_gridsearch(model)

        # Search objects wrap an estimator instance, not the class itself.
        obj = model(SVC(), {"C": [0.5, 1, 10]})
        assert is_gridsearch(obj)

    @pytest.mark.parametrize("model", [MLPRegressor, MLPClassifier, Imputer], ids=obj_name)
    def test_not_is_gridsearch(self, model):
        """
        Test that is_gridsearch does not match non grid searches
        """
        assert inspect.isclass(model)
        assert not is_gridsearch(model)

        obj = model()
        assert not is_gridsearch(obj)

    ##////////////////////////////////////////////////////////////////////
    ## is_probabilistic testing
    ##////////////////////////////////////////////////////////////////////

    def test_probabilistic_alias(self):
        """
        Assert isprobabilistic aliases is_probabilistic
        """
        assert isprobabilistic is is_probabilistic

    @pytest.mark.parametrize("model", [
        MultinomialNB, GaussianNB, LogisticRegression, SVC,
        RandomForestClassifier, GradientBoostingClassifier, MLPClassifier,
    ], ids=obj_name)
    def test_is_probabilistic(self, model):
        """
        Test that is_probabilistic works correctly
        """
        assert inspect.isclass(model)
        assert is_probabilistic(model)

        obj = model()
        assert is_probabilistic(obj)

    @pytest.mark.parametrize("model", [
        MLPRegressor, Imputer, StandardScaler, KMeans,
        RandomForestRegressor,
    ], ids=obj_name)
    def test_not_is_probabilistic(self, model):
        """
        Test that is_probabilistic does not match non probablistic estimators
        """
        assert inspect.isclass(model)
        assert not is_probabilistic(model)

        obj = model()
        assert not is_probabilistic(obj)
print(total)

# For total, let the threshold be 10. It is passed by keyword because
# Binarizer's parameters are keyword-only in current scikit-learn releases.
result = Binarizer(threshold=10)
# transformed feature
print(result.fit_transform(total))

# Assignment 5
features = np.array([
    [50, 50], [49, 50], [48, 49], [-1.83, 3.52], [-2.76, 5.55],
    [-7.57, 4.90], [-1.85, 3.51], [-7.587, 3.72], [-17, -15], [-1.78, 3.47],
    [-1.98, 4.022], [-1.97, 2.34], [-5.25, 3.30], [-2.35, 4.0], [2.42, 5.14],
    [-1.61, 4.989], [-2.18, 3.33], [-20, -18], [-20, -20], [-21, -19],
])
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])
clusterer = KMeans(n_clusters=3, random_state=0)
# fit clusterer
clusterer.fit(features)
# Predict values
dataframe["group"] = clusterer.predict(features)
# View first few observations (all 20 rows here)
result = dataframe.head(20)
print(result)

# plot: split the 20 points into x (a) and y (b) coordinate lists
a = list(features[:20, 0])
b = list(features[:20, 1])
data=all_data13, order=1,line_kws={'color': 'blue'},scatter_kws={'color': 'grey'}).set(ylim=(0, 1)) palette=sns.cubehelix_palette(5, start=2, rot=0, dark=0, light=.95, reverse=False) sns.lmplot(x='oil_price', y='share_price_scaled',hue='year', col='name',ci=None, col_wrap=3, data=all_data13, order=1,palette=palette,size=4).set(ylim=(0, 1)) #============================================================================== # Unsupervised Learning - Cluster analysis on Shell data #============================================================================== from sklearn.cluster import KMeans shell=pd.DataFrame() shell=all_data13[all_data13['name']=='RDSB.L'] # We need to scale also oil price, so clustering is not influenced by the relative size of one axis. shell['oil_price_scaled']=scaler.fit_transform(shell['oil_price'].to_frame()) shell['cluster'] = KMeans(n_clusters=6, random_state=1).fit_predict(shell[['share_price_scaled','oil_price_scaled']]) # The 954 most common RGB monitor colors https://xkcd.com/color/rgb/ colors = ['baby blue', 'amber', 'scarlet', 'grey','milk chocolate', 'windows blue'] palette=sns.xkcd_palette(colors) sns.lmplot(x='oil_price', y='share_price_scaled',ci=None,palette=palette, hue='cluster',fit_reg=0 ,data=shell) #============================================================================== # Supervised learning linear regression #============================================================================== from sklearn import linear_model # 1.- Data preparation shell15=pd.DataFrame()
def skl_clustering(cd, n_clusters=10, **kwargs):
    """Cluster the rows of ``cd`` with a scikit-learn algorithm.

    The algorithm is selected by the ``clustering`` keyword argument, read
    through ``kwa`` (presumably "kwargs value or default" -- confirm against
    its definition). It is either a name or a tuple:

    * ``('agglomerative', linkage[, affinity[, neighbors[, full_tree]]])``
    * ``('kmeans', init[, n_init])``
    * ``('mean_shift', bandwidth)``

    Anything else falls back to Ward agglomerative clustering.

    Args:
        cd: ndarray (words x disjuncts) to cluster.
        n_clusters: number of clusters for agglomerative / k-means.
        **kwargs: ``clustering``, ``cluster_criteria``, ``clustering_metric``.

    Returns:
        ``(labels, metrics, centroids)``. ``metrics`` always holds
        ``'clustering'``, ``'silhouette_index'`` and ``'variance_ratio'``
        (0.0 when a score cannot be computed), plus ``'inertia'`` for
        k-means. On any failure the best-effort result
        ``([], {'clustering': 'skl_clustering error'}, [])`` is returned.
    """
    clustering = kwa(('agglomerative', 'ward'), 'clustering', **kwargs)
    if isinstance(clustering, str):
        if clustering == 'kmeans':
            clustering = ('kmeans', 'k-means++', 10)
        elif clustering == 'agglomerative':
            clustering = ('agglomerative', 'ward')
        elif clustering == 'mean_shift':
            clustering = ('mean_shift', 'auto')
        elif clustering == 'group':  # TODO: call ILE clustering?
            print('Call ILE clustering from optimal_clusters?')
        elif clustering == 'random':  # TODO: call random clustering?
            print('Call random clustering from optimal_clusters?')
        else:
            clustering = ('agglomerative', 'ward')

    # linkage: ('ward', 'average', 'complete')
    cluster_criteria = kwa('silhouette', 'cluster_criteria', **kwargs)  # GL.0.6 legacy
    clustering_metric = kwa(('silhouette', 'euclidean'), 'clustering_metric', **kwargs)
    labels = np.asarray([[]])
    metrics = {'clustering': clustering}
    centroids = np.asarray([[]])

    try:
        if clustering[0] == 'agglomerative':
            linkage = 'ward'
            affinity = 'euclidean'
            connectivity = None
            compute_full_tree = 'auto'
            # Length guards let a bare ('agglomerative',) use the defaults
            # instead of failing on a missing tuple element.
            if len(clustering) > 1 and clustering[1] in ['average', 'complete', 'single']:
                linkage = clustering[1]
            if len(clustering) > 2:
                if clustering[2] in ['euclidean', 'cosine', 'manhattan']:
                    affinity = clustering[2]
            if len(clustering) > 3:  # connectivity
                print('skl_clustering: connectivity:', clustering[3])
                if type(clustering[3]) is int and clustering[3] > 0:
                    neighbors = clustering[3]  # TODO: int / dict
                    connectivity = kneighbors_graph(cd, neighbors, include_self=False)
                    print(f'\nconnectivity: {connectivity}\n')
            if len(clustering) > 4:  # compute_full_tree
                # isinstance, not `is bool`: the identity test only matched
                # the type object itself, so a True/False flag was ignored.
                if isinstance(clustering[4], bool):
                    compute_full_tree = clustering[4]
                print(f'compute_full_tree: {compute_full_tree}\n')

            model = AgglomerativeClustering(
                n_clusters=n_clusters, linkage=linkage, affinity=affinity,
                connectivity=connectivity, compute_full_tree=compute_full_tree)
            model.fit(cd)
            labels = model.labels_
            # TODO: centroids = ...

        elif clustering[0] in ['k-means', 'kmeans']:
            print('skl_clustering ⇒ kmeans')  # FIXME:DEL
            if len(clustering) > 1 and clustering[1] in ['k-means++']:  # 'random' - fails?
                init = clustering[1]
            else:
                init = 'k-means++'
            if len(clustering) > 2 and type(clustering[2]) is int:
                n_init = clustering[2]
            else:
                n_init = 10

            model = KMeans(init=init, n_clusters=n_clusters, n_init=n_init)
            model.fit(cd)
            labels = model.labels_
            metrics['inertia'] = model.inertia_
            centroids = np.asarray(model.cluster_centers_[:(max(labels) + 1)])

        elif clustering[0] in ['mean shift', 'mean_shift']:
            print('skl_clustering ⇒ mean shift')  # FIXME:DEL
            # Only an int is used as bandwidth; a missing or non-int value
            # (e.g. 'auto') means "let MeanShift estimate it".
            if len(clustering) > 1 and type(clustering[1]) is int:
                bandwidth = clustering[1]
            else:
                bandwidth = None  # TODO: auto ⇒ estimate_bandwidth

            model = MeanShift(bandwidth=bandwidth)
            model.fit(cd)
            labels = model.labels_
            centroids = np.asarray(model.cluster_centers_[:(max(labels) + 1)])

        else:  # TODO: random clustering?
            model = AgglomerativeClustering(linkage='ward', n_clusters=n_clusters)
            model.fit(cd)
            labels = model.labels_

        # Scores are best-effort: degenerate labelings make them undefined.
        try:
            metrics['silhouette_index'] = silhouette_score(
                cd, labels, metric=clustering_metric[1])
        except Exception:
            metrics['silhouette_index'] = 0.0
        try:
            metrics['variance_ratio'] = calinski_harabaz_score(cd, labels)
        except Exception:
            metrics['variance_ratio'] = 0.0

        return labels, metrics, centroids
    except Exception:
        # Deliberate best-effort: callers receive an error marker rather than
        # an exception. KeyboardInterrupt/SystemExit now propagate.
        return [], {'clustering': 'skl_clustering error'}, []
# Run the hand-written k-means implementation and plot its result.
km = k_means(X, k)
km.calcul()
super_scat_it(X, km.label, dim, km.centroid)


# ### 7.2.2 Exploration of the K-means Algorithm with Scikit Learn
#
# Once the algorithm has been coded, we are going to make our life easier and simply use the [Scikit Learn library](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html) (-_-). First, let's check that everything is running fine.

# In[ ]:


# Same data, same k, now with the library implementation.
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(X)
super_scat_it(X, kmeans.labels_, dim, kmeans.cluster_centers_)


# ### 7.2.3 Choosing the optimal number of clusters
#
# Until now, we knew the actual number of subpopulations (parameterized by the variable $ k $) associated with the simulated data. On the other hand, with non simulated datasets, the data is only very rarely labeled. It is therefore important to develop methodologies in order to clearly define the number of clusters required.

# **Questions 7.2**
#
# 1. Find a simple way to determine the optimal number of clusters.
# 2. Implement it.
# 3. How many clusters would you choose?

# **Questions 7.3**
#
if unique not in text_digit_vals: text_digit_vals[unique] = x x += 1 df[column] = list(map(convert_to_int,df[column])) return df df = handle_non_numeric(df) X = np.array(df.drop(['duration_ms','time_signature','Dancebility'],1).astype(float)) y = np.array(df['Manual Mood Classification']) X = preprocessing.scale(X) clf = KMeans(n_clusters = 3) clf.fit(X) correct_count = 0 for i in range(len(X)): predict_data = np.array(X[i].astype(float)) predict_data = predict_data.reshape(-1, len(predict_data)) prediction = clf.predict(predict_data) if prediction == y[i]: correct_count += 1 print(df.head()) print(correct_count/len(X))
import pickle
from sklearn.cluster import KMeans
from numpy import size

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run KMeans on training set")
    # The help text used to repeat "number of clusters" for this option.
    parser.add_argument("--dataset", type=str, default="data/train_keypoints.p",
                        help="path to the pickled training feature points")
    parser.add_argument("--clusters", type=int, default=500, help="number of clusters")
    args = parser.parse_args()
    dataset = args.dataset
    clusters = args.clusters

    print("Loading dataset")
    # NOTE: unpickling executes code embedded in the file; only point
    # --dataset at files you trust.
    with open(dataset, "rb") as dataset_file:
        train_features = pickle.load(dataset_file)
    n_features = len(train_features)
    print("Number of feature points to run clustering on: %d" % n_features)

    # Clustering with KMeans.
    print("Running KMeans clustering")
    kmeans = KMeans(init='k-means++', n_clusters=clusters, n_init=10, n_jobs=2, verbose=1)
    kmeans.fit(train_features)

    # Save trained kmeans object to file; the context manager guarantees the
    # codebook is flushed and closed even if dumping fails midway.
    with open("data/cb_%dclusters.p" % clusters, "wb") as codebook_file:
        pickle.dump(kmeans, codebook_file)
# Drop the first empty string before vectorizing.
actual_split.remove('')

# Bag-of-words counts, densified for PCA.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer()
X = cv.fit_transform(actual_split).toarray()
V = cv.vocabulary_
B = cv.get_feature_names()

# Reduce the count matrix to 50 principal components.
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
X = pca.fit_transform(X)

# Within-cluster sum of squares for k = 1..10 (elbow method).
from sklearn.cluster import KMeans
wcss = [
    KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10).fit(X).inertia_
    for i in range(1, 11)
]

plt.plot(range(1, 11), wcss)
plt.title('Elbow graph')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.savefig('Satisfied Initial Cluster WCSS.png', dpi=500)
plt.show()

# Cluster count chosen from the elbow plot.
opt = 5
kmeans = KMeans(n_clusters=opt, init='k-means++', max_iter=300, n_init=10)
y_kmeans = kmeans.fit_predict(X)
copy_actual['Cluster Level 3'] = list(y_kmeans)
def bow_kmeans(X,k):
    """Cluster ``X`` into ``k`` groups with k-means++ and return the labels.

    Uses 10 initialisations and a fixed ``random_state`` of 0.
    """
    from sklearn.cluster import KMeans

    fitted = KMeans(init='k-means++', n_clusters=k, n_init=10, random_state=0).fit(X)
    return fitted.labels_
# print len(question_dict)
# One concatenated answer text per cluster.
cluster_docs = [""] * 5
charctesToRemove = ['"', "'"]
quote_chars = ''.join(charctesToRemove)

# One document per question: ASCII-only text with quote characters removed.
# myfile = open('readd.txt', 'r')
documents = []
for line in question_dict:
    lines = question_dict[line].ques.encode('ascii', 'ignore')
    lineFile = lines.translate(None, quote_chars)
    documents.append(lineFile)
# myfile.close()

true_k = 5
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

# Term indices per centroid, heaviest term first.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

# Cluster id of every question, in question_dict iteration order.
index = model.predict(X)

# Append each question's answers to the text of that question's cluster.
k = 0
for q in question_dict:
    cluster_id = index[k]
    for answers in question_dict[q].ans:
        cluster_docs[cluster_id] = cluster_docs[cluster_id] + answers
    k = k + 1

# print("\n\n\n\nTop terms per cluster:")
# for i in range(true_k):
# print "Cluster %d:" % i,
def cluster(data):
    """Find three previously sold homes comparable to the user's input.

    A 50-cluster KMeans is fitted on the standardised rows of
    ``data/cluster_df.csv``. The user's home is assigned to a cluster, and
    the three training rows closest to that cluster's centroid are looked up
    in ``data/all_types.csv`` and enriched with Zillow links and pictures.

    Args:
        data: mapping with the keys ``'bedroom'``, ``'bathroom'``,
            ``'finished_sq_ft'``, ``'total_room'`` and ``'neighborhood'``.

    Returns:
        dict with the parallel lists ``'addresses'``, ``'prices'``,
        ``'sold_dates'``, ``'home_urls'`` and ``'pic_urls'``.
    """
    fallback_pic = 'http://source.unsplash.com/daily'
    # Order matters: it fixes the column order of the one-row user frame.
    neighborhoods = [
        'Allston', 'Back Bay', 'Bay Village', 'Beacon Hill', 'Brighton',
        'Charlestown', 'Chinatown', 'Downtown', 'Downtown Crossing',
        'East Boston', 'Fenway', 'Hyde Park', 'Jamaica Plain', 'Kenmore',
        'Leather District', 'Mattapan', 'Mission Hill', 'North Dorchester',
        'North End', 'Roslindale', 'Roxbury', 'South Boston',
        'South Dorchester', 'South End', 'West End', 'West Roxbury',
        'Winthrop',
    ]

    # Get path of app directory
    path = os.path.abspath(os.path.dirname(__file__)) + '/data/'
    cluster_csv_name = 'cluster_df.csv'
    all_csv_name = 'all_types.csv'
    cluster_csv_path = path + cluster_csv_name
    all_csv_path = path + all_csv_name

    all_df = pd.read_csv(all_csv_path)
    df = pd.read_csv(cluster_csv_path)
    df = df.drop(['Unnamed: 0'], axis=1)

    # Scale features
    scale = StandardScaler()
    scale.fit(df)
    df_std = scale.transform(df)

    kmeans = KMeans(n_clusters=50)
    kmeans = kmeans.fit(df_std)

    # Create user input features: four numeric fields plus a one-hot
    # encoding of the neighborhood.
    user_input = {
        'bedrooms': int(data['bedroom']),
        'bathrooms': float(data['bathroom']),
        'finished_SqFt': float(data['finished_sq_ft']),
        'total_rooms': float(data['total_room']),
    }
    chosen = data['neighborhood']
    user_input.update({name: 1 if chosen == name else 0 for name in neighborhoods})
    user_df = pd.DataFrame(user_input, index=[0])

    # Scale features from user input
    scaled_user_df = scale.transform(user_df)

    # Get cluster for user input
    user_cluster = kmeans.predict(scaled_user_df)

    # Distance of every training row to every centroid; rank the rows by
    # their distance to the centroid of the user's cluster and keep three.
    trans = kmeans.transform(df_std)
    closest_points = list(np.argsort(trans[:, user_cluster[0]])[:3])

    zpids = []
    addresses = []
    prices = []
    sold_dates = []

    # Collect the listing details of the three closest rows.
    for i in closest_points:
        zpids.append(all_df.loc[i, 'zpid'])
        addresses.append(all_df.loc[i, 'address'])
        prices.append('Last sold price: $' + abbrNumber(all_df.loc[i, 'price']))
        sold_dates.append('Sold on: {}'.format(all_df.loc[i, 'readable_date_sold']))

    # Get picture by zpid
    pic_urls = []
    home_urls = []
    zillow_id = app.config['ZILLOW_API_KEY']
    url = 'http://www.zillow.com/webservice/GetUpdatedPropertyDetails.htm?'

    for zpid in zpids:
        zpid_data = {'zws-id': zillow_id, 'zpid': zpid}
        query_string = url + urllib.parse.urlencode(zpid_data)
        response = requests.get(query_string)
        tree = ET.fromstring(response.content)

        code = tree.find('message/code')
        details = tree.find('response')
        # Treat a missing status code or missing <response> as a failed
        # lookup instead of dereferencing None.
        if code is not None and code.text == '0' and details is not None:
            homeInfo = details.find('links/homeInfo')
            images = details.find('images/image')
            home_url = homeInfo.text if homeInfo is not None else None
            # An <image> element without children has no picture to offer.
            if images is not None and len(images) > 0:
                pic_url = images[0].text
            else:
                pic_url = fallback_pic
            home_urls.append(home_url)
            pic_urls.append(pic_url)
        else:
            home_urls.append(None)
            pic_urls.append(fallback_pic)

    result = {
        'addresses': addresses,
        'prices': prices,
        'sold_dates': sold_dates,
        'home_urls': home_urls,
        'pic_urls': pic_urls
    }

    return result
# Plot the scatter of solutions as (r, theta) points because they are phi symmetric
plt.scatter(Rs, Thetas)
plt.show()

# #####################CLUSTERING#################
num_clusters = 4

# Convert solns to an np array of (r, theta, phi) points
solns_as_nparray = np.array([(soln.r, soln.theta, soln.phi) for soln in solns])

est = KMeans(n_clusters=num_clusters)
est.fit(solns_as_nparray)
labels = est.labels_


# Function to plot clusters that kmeans has estimated
def plot_Kmeans_clusters(ax, sample, label, k):
    """Plot the first two columns of ``sample``, one marker colour per cluster.

    Args:
        ax: matplotlib axes to draw on.
        sample: 2-D array of points; columns 0 and 1 are plotted.
        label: array of cluster ids, one per row of ``sample``.
        k: number of clusters; ids ``0..k-1`` are drawn.
    """
    colors = ['bo', 'ro', 'go', 'mo']
    for i in range(k):
        data = sample[label == i]
        # Cycle through the palette so k > len(colors) reuses colours
        # instead of raising IndexError.
        ax.plot(data[:, 0], data[:, 1], colors[i % len(colors)])


# Plot clusters
fig, ax = plt.subplots(figsize=(8, 8))
plot_Kmeans_clusters(ax, solns_as_nparray.astype(float), labels, num_clusters)
feature_2 = "exercised_stock_options" feature_3 = 'total_payments' poi = "poi" features_list = [poi, feature_1, feature_2, feature_3] data = featureFormat(data_dict, features_list ) poi, finance_features = targetFeatureSplit( data ) ### in the "clustering with 3 features" part of the mini-project, ### you'll want to change this line to ### for f1, f2, _ in finance_features: ### (as it's currently written, the line below assumes 2 features) for f1, f2, _ in finance_features: plt.scatter( f1, f2 ) plt.show() ### cluster here; create predictions of the cluster labels ### for the data and store them to a list called pred from sklearn.cluster import KMeans kmeans = KMeans(n_clusters=2) pred = kmeans.fit_predict(finance_features) ### rename the "name" parameter when you change the number of features ### so that the figure gets saved to a different file try: Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2) except NameError: print "no predictions object named pred found, no clusters to plot"
def do_k_means(X: np.ndarray, k):
    """Fit KMeans with ``k`` clusters on ``X``.

    Uses up to 1000 iterations per run and 500 initialisations.

    Returns:
        Tuple ``(inertia, labels)`` taken from the fitted model.
    """
    model = KMeans(n_clusters=k, max_iter=1000, n_init=500)
    model.fit(X)
    return model.inertia_, model.labels_
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.cluster import KMeans

global labels

# Scale every feature to [0, 1] so that no column dominates the distances.
data = preprocessing.MinMaxScaler().fit_transform(
    np.array(final_data_object['no_identifiers_data_list'])
)

# Four clusters; `labels` ends up as a plain Python list of cluster ids.
model = KMeans(n_clusters=4)
clustering = model.fit(data)
labels = clustering.labels_.tolist()
main_invest_region, risk_beta, return_on_investment_3month, return_on_investment_6month, return_on_investment_1year, return_on_investment_3year, risk_return_level, established_scale, scale, risk_standard_deviation, fee ] features.append(tmp_feature); X = np.array(features); kmeans = KMeans(n_clusters=20, random_state=0).fit(X); X_category = kmeans.labels_; cluster_centers = handle_cluster_center(kmeans.cluster_centers_, X_category); with open('../data/cluster_centers.json', 'w') as f: json.dump(cluster_centers, f, indent=1); new_fund_datas = []; for i in range(len(fund_datas)): fund_info = fund_datas[i]; fund_info['id'] = i; fund_info['cluster_id'] = int(X_category[i]); new_fund_datas.append(fund_info); with open('../data/new_fund.json', 'w') as f: json.dump(new_fund_datas, f, indent=1, ensure_ascii=False);
from model import Preprocess
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
print('only we look whole distribution')
from sklearn.manifold import LocallyLinearEmbedding
from model import gen_init_point

data, feature, corpus = Preprocess.preprocess_chinese()
n_topic = 20

# Embed the documents in two dimensions to inspect the overall distribution.
lle = LocallyLinearEmbedding(n_components=2)
data_lle = lle.fit_transform(data)

# plt.show() does not take data; draw the points first, then show.
# NOTE(review): plotting the 2-D embedding rather than the first two raw
# columns is assumed to be the intent -- confirm.
plt.scatter(data_lle[:, 0], data_lle[:, 1])
plt.show()

# The keyword is `n_clusters`; the original call had no value and did not
# parse. NOTE(review): n_topic is assumed to be the intended count -- confirm.
km = KMeans(n_clusters=n_topic)
# Set random seed for reproducibility
np.random.seed(1000)


min_nb_clusters = 2
max_nb_clusters = 20


if __name__ == '__main__':
    # Load the dataset and scale pixel values to [0, 1]
    digits = load_digits()
    X_train = digits['data'] / np.max(digits['data'])

    # Every cluster count that is evaluated; also used for the plot axis so
    # the x values always line up with the inertias.
    cluster_counts = np.arange(min_nb_clusters, max_nb_clusters + 1)

    # Compute the inertias
    inertias = np.zeros(shape=(max_nb_clusters - min_nb_clusters + 1, ))

    for i in range(min_nb_clusters, max_nb_clusters + 1):
        km = KMeans(n_clusters=i, random_state=1000)
        km.fit(X_train)
        inertias[i - min_nb_clusters] = km.inertia_

    # Plot the inertias
    sns.set()

    fig, ax = plt.subplots(figsize=(12, 7))

    # The axis previously started at a literal 2, which mismatched the data
    # whenever min_nb_clusters was changed.
    ax.plot(cluster_counts, inertias, "o-")
    ax.set_xlabel("Number of clusters", fontsize=18)
    ax.set_ylabel("Inertia", fontsize=18)
    ax.set_xticks(cluster_counts)
    ax.grid(True)

    plt.show()
def build_vocabulary(image_paths, vocab_size):
    """
    Sample HOG descriptors from the training images, cluster them with
    k-means, and return the cluster centers.

    Inputs:
        image_paths: a Python list of image path strings
        vocab_size: an integer indicating the number of words desired for the
                    bag of words vocab set

    Outputs:
        a vocab_size x (z*z*9) (see below) array which contains the cluster
        centers that result from the K Means clustering.

    Raises:
        ValueError: if image_paths is empty, so there is nothing to cluster.

    HOG features are generated with skimage.feature.hog(). Documentation:
    http://scikit-image.org/docs/dev/api/skimage.feature.html#skimage.feature.hog

    Important hog() arguments:

        cells_per_block: The hog function breaks the image into evenly-sized
            blocks, which are further broken down into cells, each made of
            pixels_per_cell pixels (see below). Setting this parameter tells
            the function how many cells to include in each block. This is a
            tuple of width and height. A SIFT implementation with a total of
            16 cells is equivalent to setting this argument to (4,4).

        pixels_per_cell: This controls the width and height of each cell
            (in pixels). Like cells_per_block, it is a tuple. In a SIFT
            implementation where each cell is 4 pixels by 4 pixels, this
            would be (4,4).

        feature_vector: A boolean which tells the function what shape it
            should use for the return array. When set to True, it returns one
            long array, which is then reshaped here.

    For each cell, HOG produces a histogram (feature vector) of length 9. We
    want one feature vector per block, obtained by appending the histograms of
    the block's cells. With cells_per_block = (z,z) the feature vector of a
    block has length z*z*9.

    With feature_vector=True, hog() returns one long np array containing every
    cell histogram concatenated end to end. It is broken up into rows of
    z*z*9 values with reshape(-1, z*z*9); the -1 lets numpy make that
    dimension as big as it needs to be to accommodate all of the data.

    The number of feature vectors that come from this reshape depends on the
    size of the image given to hog(): it fits as many blocks as it can on the
    image. Images are used at their original size here, so the number of
    descriptors can differ per image.

    Returning every feature found would give an absolutely massive vocabulary,
    which would make matching inefficient AND inaccurate, so K Means
    clustering is used to find a much smaller (vocab_size) number of
    representative points. This can take a VERY LONG TIME (upwards of ten
    minutes for large numbers of features and large max_iter), hence the low
    max_iter and the explicit "tol" below.
    """
    cells_per_block = 3
    # Length of one block descriptor: z*z cells, 9 histogram bins per cell.
    block_dim = cells_per_block * cells_per_block * 9

    # Collect the per-image descriptor matrices and stack them once at the
    # end; stacking inside the loop re-copies everything gathered so far on
    # every iteration.
    descriptors = []
    for im_path in image_paths:
        im = rgb2grey(imread(im_path))
        im_hog = hog(im, cells_per_block=(cells_per_block, cells_per_block))
        descriptors.append(im_hog.reshape(-1, block_dim))

    if not descriptors:
        raise ValueError("image_paths is empty: no HOG features to cluster")

    features = np.vstack(descriptors)

    # `n_jobs` is no longer a KMeans parameter in current scikit-learn
    # releases (passing it raises TypeError), so it is not passed here.
    clf = KMeans(n_clusters=vocab_size, max_iter=100, tol=1e-3)
    clf.fit(features)
    return clf.cluster_centers_
cust_df.info()


# In[6]:


import numpy as np
from sklearn.preprocessing import StandardScaler

# Feature matrix: every column except the first, with NaN replaced by 0
# (np.nan_to_num also maps +/-inf to large finite values).
X = cust_df.values[:, 1:]
X = np.nan_to_num(X)

# Standardise each feature (zero mean, unit variance) before clustering.
scaler = StandardScaler()
clus_dataset = scaler.fit_transform(X)
clus_dataset


# In[7]:


from sklearn.cluster import KMeans

# Fit k-means with 4 clusters, keeping the best of 12 initialisations.
clusternum = 4
k_means = KMeans(init="k-means++", n_clusters=clusternum, n_init=12).fit(clus_dataset)
lables = k_means.labels_
print(lables)


# In[8]:


# Attach the cluster id of every row to the data frame.
cust_df['Clus_km'] = lables
cust_df.head(5)


# In[10]:


# Per-cluster mean of every column.
cust_df.groupby('Clus_km').mean()


# In[14]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Synthetic 2-D data set: 500 samples generated around 4 centres.
data = make_blobs(n_samples=500, n_features=2, centers=4, cluster_std=1.8)
print(data)

# make_blobs returns (samples, generating-centre labels).
points, blob_ids = data
plt.scatter(points[:, 0], points[:, 1], c=blob_ids)
plt.show()

from sklearn.cluster import KMeans

# Elbow curve: within-cluster sum of squares for k = 1..19.
wcss = []
for i in range(1, 20):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10).fit(points)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 20), wcss)
plt.show()

# Final k-means model with 4 clusters; centres are drawn in red over the data.
kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=300, n_init=10)
pred_y = kmeans.fit_predict(points)
centres = kmeans.cluster_centers_
plt.scatter(points[:, 0], points[:, 1])
plt.scatter(centres[:, 0], centres[:, 1], s=300, c='red')
plt.show()

from pyclustering.cluster.kmedoids import kmedoids

# k-medoids, started from four fixed sample indices.
initial_medoids = [100, 200, 300, 400]
k_medoids = kmedoids(nclusters=4, data=points, initial_index_medoids=initial_medoids)
k_medoids.process()
pred_y = k_medoids.predict(points)

from pyclustering.cluster import cluster_visualizer

clusters = k_medoids.get_clusters()  # list of clusters
f2 = london['Mean TemperatureC'].values

# Two-column feature matrix built by pairing f1 with the mean temperature.
X = np.array(list(zip(f1, f2)))
#X = np.array(list(zip(f1, f2)))
#X = london.iloc[:,[8,14]].values
#pl.scatter(f1, f2, c='black', s=7)
#pl.figure()
#X = london.iloc[:,[1,7]].values
#pl.scatter(X[:,0],X[:,1], c=cluster.labels_, cmap='rainbow')

# Elbow Method: inertia (WCSS) of k-means for k = 1..10
I = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++').fit(X)
    I.append(kmeans.inertia_)

pl.figure()
pl.plot(range(1, 11), I)
pl.title('The Elbow Method')
pl.xlabel('Number of Clusters')
pl.ylabel('WCSS')
pl.show()

# K-means algorithm: final model with 3 clusters
kmeans = KMeans(n_clusters=3, init='k-means++')
y_kmeans = kmeans.fit_predict(X)
# for i in range(2,12):
#     km=KMeans(n_clusters=i,init='k-means++', max_iter=300, n_init=10, random_state=0)
#     km.fit(reduced_data)
#     wcss.append(km.inertia_)
# plt.plot(range(2,12),wcss)
# plt.title('Elbow Method')
# plt.xlabel('Number of clusters')
# plt.ylabel('wcss')
# plt.show()

# k means determine k
# Distortion = mean Euclidean distance from each sample to its nearest
# cluster centre, evaluated for k = 2..11.
distortions = []
K = range(2,12)
for k in K:
    # Fit once: KMeans(...).fit() already returns the fitted estimator, so a
    # second .fit() call would only redo the work and replace the first result.
    kmeanModel = KMeans(n_clusters=k).fit(reduced_data)
    distortions.append(sum(np.min(cdist(reduced_data, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / reduced_data.shape[0])

# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

# cluster
# Final model with 5 clusters, seeded with RAN_STATE.
clusterer = KMeans(n_clusters=5, random_state = RAN_STATE).fit(reduced_data)
preds = clusterer.predict(reduced_data)
# Scatter plot of petal length against petal width, coloured by species.
(
    sns.FacetGrid(df, hue="Species", height=6)
    .map(plt.scatter, "PetalLengthCm", "PetalWidthCm")
    .add_legend()
)
plt.show()


# Let's find the optimal number of cluster and apply K-Means Algorithm

# In[8]:


# Feature matrix: the first four columns of the data frame.
x = df.iloc[:, [0, 1, 2, 3]].values

from sklearn.cluster import KMeans

# Within-cluster sum of squares for k = 1..10.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(
        n_clusters=i,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=0,
    ).fit(x)
    wcss.append(kmeans.inertia_)


# In[9]:


plt.plot(range(1, 11), wcss, '*-')
plt.title('The elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.text(4, 200000, "optimal number of clusters = 3")
plt.show()
def initialization(self):
    """Reset the optimiser state and build the initial population.

    Resets the epoch/evaluation counters, optionally seeds both `random` and
    `np.random`, creates an empty external population, draws a random
    population of cluster assignments (optionally replacing a fraction of the
    individuals with k-means solutions), evaluates every individual with
    `self.f`, derives the reference points and finally builds the weight
    vectors and their neighbourhoods.

    All results are stored on `self`; nothing is returned.
    """
    # Number of evaluations/epochs
    self._epochs = 0
    self._evals = 0
    self._last_EP_update_eval = -1
    self._last_EP_update_epoch = -1

    # Initialise the random seed (both the stdlib and the NumPy generator,
    # since both are used below)
    if self._random_seed is not None:
        random.seed(self._random_seed)
        np.random.seed(self._random_seed)
        #print("Semilla {}".format(self._random_seed)) # DEBUG

    # External population: starts empty (zero rows)
    self._EP = np.empty(shape=(0, self._num_objectives))
    self._EP_chromosomes = np.empty(shape=(0, self._dimensionality))

    # Population of individuals: one row per individual, each gene is a
    # random integer in [0, self._num_clusts)
    self._population = np.random.randint(
        0, self._num_clusts,
        size=(self._population_size, self._dimensionality))  # VERIFIED

    if self._kmeans_init_ratio > 0.0:
        # Pick, without replacement, the individuals that are replaced by a
        # k-means labelling of self._data.
        initialized_with_kmeans = random.sample(
            range(0, self._population_size),
            floor(self._population_size * self._kmeans_init_ratio))
        for i in initialized_with_kmeans:
            # max_iter is itself drawn at random from [10, 20)
            self._population[i] = KMeans(n_clusters=self._num_clusts,
                                         max_iter=np.random.randint(
                                             10, 20)).fit_predict(
                                                 self._data)  #.labels_

    # Solution vectors (f-values) of every individual in the population
    self._FV = np.empty((self._population_size, self._num_objectives))
    for i in range(self._population_size):
        self._FV[i] = self.f(self._population[i])

    # Reference point z (best value obtained for each objective function);
    # only computed when it has not been set already
    if self._z is None:
        if self._maximization:
            self._z = np.amax(self._FV, axis=0)
        else:
            self._z = np.amin(self._FV, axis=0)

    # Reference point z-worst: the worst value obtained for each objective
    # function
    if self._maximization:
        self._z_worst = np.amin(self._FV, axis=0)
    else:
        self._z_worst = np.amax(self._FV, axis=0)

    # Lambda weight vectors
    # THEIR ELEMENTS MUST SUM TO 1
    # NOTE(review): if `normalize` is sklearn.preprocessing.normalize, its
    # default norm is L2, so each row would have unit Euclidean length rather
    # than elements summing to 1 -- confirm which `normalize` is in scope.
    self._lambdas = normalize(
        np.random.randint(low=1,
                          high=99,
                          size=(self._population_size, self._num_objectives)))

    # NEIGHBOURHOOD INITIALISATION
    # Distance matrix between the lambda vectors
    lambdas_distances = pairwise_distances(self._lambdas,
                                           Y=None,
                                           metric='euclidean')
    # Neighbourhood of each weight vector lambda-i: indices of the
    # self._lambda_neighborhood_size closest vectors, in ascending distance
    self._lambda_neighborhood = lambdas_distances.argsort(
        axis=1)[:, 0:self._lambda_neighborhood_size]  # VERIFIED
def train_net(data, params):
    """Train SpectralNet (optionally preceded by a siamese net) and print
    clustering metrics for the resulting embedding.

    Args:
        data: nested dict. Reads data['spectral']['train_and_test'],
            data['spectral']['train_unlabeled_and_labeled'] and
            data['spectral']['val_unlabeled_and_labeled']; additionally
            data['siamese']['train_and_test'] when 'siamese' occurs in
            params['affinity'].
        params: dict of settings. Keys read here: 'affinity', 'n_clusters',
            'batch_size', 'arch', 'scale_nbr', 'n_nbrs', 'spec_lr',
            'spec_drop', 'spec_patience', 'spec_ne',
            'generalization_metrics'; optional (via .get): 'spec_reg',
            'siam_reg', 'batch_size_orthonorm'; and, when the affinity is
            exactly 'siamese': 'siam_lr', 'siam_drop', 'siam_patience',
            'siam_ne', 'siam_batch_size'.

    Returns:
        The trained networks.SpectralNet instance.

    Side effects:
        Prints training/evaluation messages (accuracy via print_accuracy and
        NMI) to stdout.
    """
    #
    # UNPACK DATA
    #

    x_train, y_train, x_val, y_val, x_test, y_test = data['spectral']['train_and_test']
    x_train_unlabeled, y_train_unlabeled, x_train_labeled, y_train_labeled = data['spectral']['train_unlabeled_and_labeled']
    x_val_unlabeled, y_val_unlabeled, x_val_labeled, y_val_labeled = data['spectral']['val_unlabeled_and_labeled']

    # NOTE(review): this is a substring/membership test, while the siamese net
    # below is only built when the affinity is exactly 'siamese'.
    if 'siamese' in params['affinity']:
        pairs_train, dist_train, pairs_val, dist_val = data['siamese']['train_and_test']

    # Full data set (train + val + test), used for the final evaluation.
    x = np.concatenate((x_train, x_val, x_test), axis=0)
    y = np.concatenate((y_train, y_val, y_test), axis=0)

    if len(x_train_labeled):
        y_train_labeled_onehot = OneHotEncoder().fit_transform(y_train_labeled.reshape(-1, 1)).toarray()
    else:
        # No labeled samples: empty one-hot matrix with one column per
        # distinct label value in y.
        y_train_labeled_onehot = np.empty((0, len(np.unique(y))))

    #
    # SET UP INPUTS
    #

    # create true y placeholder (not used in unsupervised training)
    y_true = tf.placeholder(tf.float32, shape=(None, params['n_clusters']), name='y_true')

    # The orthonorm batch size falls back to the regular batch size.
    batch_sizes = {
            'Unlabeled': params['batch_size'],
            'Labeled': params['batch_size'],
            'Orthonorm': params.get('batch_size_orthonorm', params['batch_size']),
            }

    # Per-sample shape (everything but the leading sample axis of x).
    input_shape = x.shape[1:]

    # spectralnet has three inputs -- they are defined here
    inputs = {
            'Unlabeled': Input(shape=input_shape,name='UnlabeledInput'),
            'Labeled': Input(shape=input_shape,name='LabeledInput'),
            'Orthonorm': Input(shape=input_shape,name='OrthonormInput'),
            }

    #
    # DEFINE AND TRAIN SIAMESE NET
    #

    # run only if we are using a siamese network
    if params['affinity'] == 'siamese':
        siamese_net = networks.SiameseNet(inputs, params['arch'], params.get('siam_reg'), y_true)

        # `history` is not used further in this function.
        history = siamese_net.train(pairs_train, dist_train, pairs_val, dist_val,
                params['siam_lr'], params['siam_drop'], params['siam_patience'],
                params['siam_ne'], params['siam_batch_size'])

    else:
        siamese_net = None

    #
    # DEFINE AND TRAIN SPECTRALNET
    #

    spectral_net = networks.SpectralNet(inputs, params['arch'],
            params.get('spec_reg'), y_true, y_train_labeled_onehot,
            params['n_clusters'], params['affinity'], params['scale_nbr'],
            params['n_nbrs'], batch_sizes, siamese_net, x_train, len(x_train_labeled))

    spectral_net.train(
            x_train_unlabeled, x_train_labeled, x_val_unlabeled,
            params['spec_lr'], params['spec_drop'], params['spec_patience'],
            params['spec_ne'])

    print("finished training")

    #
    # EVALUATE
    #

    # get final embeddings
    x_spectralnet = spectral_net.predict(x)

    # get accuracy and nmi
    kmeans_assignments, km = get_cluster_sols(x_spectralnet, ClusterClass=KMeans, n_clusters=params['n_clusters'], init_args={'n_init':10})
    # `y_spectralnet` is not used further in this function.
    y_spectralnet, _ = get_y_preds(kmeans_assignments, y, params['n_clusters'])
    print_accuracy(kmeans_assignments, y, params['n_clusters'])
    from sklearn.metrics import normalized_mutual_info_score as nmi
    nmi_score = nmi(kmeans_assignments, y)
    print('NMI: ' + str(np.round(nmi_score, 3)))

    if params['generalization_metrics']:
        # Fit k-means on the embedding of the unlabeled training data only,
        # then assign each test embedding to its nearest training centroid.
        x_spectralnet_train = spectral_net.predict(x_train_unlabeled)
        x_spectralnet_test = spectral_net.predict(x_test)
        km_train = KMeans(n_clusters=params['n_clusters']).fit(x_spectralnet_train)
        from scipy.spatial.distance import cdist
        dist_mat = cdist(x_spectralnet_test, km_train.cluster_centers_)
        closest_cluster = np.argmin(dist_mat, axis=1)
        print_accuracy(closest_cluster, y_test, params['n_clusters'], ' generalization')
        nmi_score = nmi(closest_cluster, y_test)
        print('generalization NMI: ' + str(np.round(nmi_score, 3)))

    return spectral_net
print(index) for i in range(4): if(i not in index): newCentroid.append(centroids[i]) newHist.append(hist[i]) if(centroids != []): for (percent, color) in zip(newHist, newCentroid): print(color) if(percent>max): max = percent clr = color # return the bar chart return clr.astype("uint8").tolist() img = cv2.imread("2.jpeg") img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) img = img.reshape((img.shape[0] * img.shape[1],3)) #represent as row*column,channel number clt = KMeans(n_clusters=4) #cluster number clt.fit(img) hist = find_histogram(clt) bar = dominantColor(hist, clt.cluster_centers_) print(bar)
# Days whose grids are kept for the analysis.
day_names = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'
]  #,'Sunday']

# Select the grids of those days and crop each grid to its first 15x15 cells.
grids = grids_df.grid.loc[[_ in day_names for _ in grids_df.day_name]]
grids = np.array(list(grids))[:, :15, :15]
smooth_grids = np.array([grid_sqavg(_, 5).flatten() for _ in grids])
grids = np.array([_.flatten() for _ in grids])

# Reduce every flattened grid to 3 factors.
fa = FactorAnalysis(n_components=3).fit_transform(grids)

# AIC of a k-means model for every candidate number of clusters.
aics = []
k_vals = range(1, 25)
for k in k_vals:
    print(k)
    km = KMeans(n_clusters=k).fit(fa)
    aics.append(kmeans_AIC(km))

plt.plot(k_vals, aics)
plt.xlabel('Number of Clusters (k)', size=16)
plt.ylabel('Akaike Information Criterion', size=16)
plt.savefig('grid_aic_fa.png')
plt.close()

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# The position of the minimum in `aics` is an index, not a cluster count:
# k_vals starts at 1, so the index has to be mapped back through k_vals
# (using the raw index would select k-1, and 0 clusters when k=1 is best).
k_best = k_vals[int(np.argmin(aics))]
km_best = KMeans(n_clusters=k_best).fit(fa)
ax.scatter(fa[:, 0], fa[:, 1], fa[:, 2], c=km_best.labels_)
plt.show()