#%% Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from gap_statistic import OptimalK

#%% Importing the dataset
# The first CSV column becomes the row index; the remaining columns are
# handed to the clustering routines as a plain array.
dataset = pd.read_csv('clustering/pcs.csv', index_col=0)
X = dataset.values
names = dataset.index

#%% Using the elbow method to find the optimal number of clusters
# Fit one K-Means model per candidate cluster count (1..19) and collect its
# inertia so the curve can be inspected visually for an "elbow".
cluster_counts = range(1, 20)
wcss = []
for i in cluster_counts:
    print(i)  # progress indicator: the cluster count currently being fitted
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=14).fit(X)
    wcss.append(kmeans.inertia_)

plt.plot(cluster_counts, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

#%% Gap Statistic
# Evaluate cluster counts 1..49 with the gap statistic, keep the resulting
# table for inspection, and draw the library's built-in diagnostic plots.
optimalK = OptimalK(n_jobs=4, parallel_backend='joblib')
n_clusters = optimalK(X, cluster_array=np.arange(1, 50))
test = optimalK.gap_df
optimalK.plot_results()

#%% Training the K-Means model on the dataset
# Initial values for the training step.
# NOTE(review): 160000 looks like a starting upper bound on WCSS that later
# runs are compared against -- confirm against the rest of this cell.
best_model = []
best_wcss = 160000
# Make some test data
# Alternative inputs that were tried; swap one in to experiment:
#   X, y = make_blobs(n_samples=int(1e5), n_features=2, centers=3, random_state=25)
#   X = np.array([[100., 1.], [200., 1.], [220., 1.], [230., 1.], [500., 1.], [600., 1.]])
#   X = np.array([[100.], [200.], [300.], [400.], [500.], [600.]])
#   X = np.array([[100.], [180.], [300.], [410.], [500.], [610.]])
X = np.array([[100.], [200.], [220.], [230.], [580.], [600.]])
print(X, type(X))

# Call OptimalK to determine best number of clusters
# (candidate counts 1..5, 100 reference datasets per candidate).
print('Calculating optimal number of clusters')
n_clusters = optimalK(X, cluster_array=np.arange(1, 6), n_refs=100)
print('Optimal clusters: ', n_clusters)
print('Diff', optimalK.gap_df["diff"])
optimalK.plot_results()

# Plot some results: the gap curve, with the selected cluster count
# highlighted by a large red marker.
gap_table = optimalK.gap_df
selected_row = gap_table[gap_table.n_clusters == n_clusters]

plt.plot(gap_table.n_clusters, gap_table.gap_value, linewidth=3)
plt.scatter(selected_row.n_clusters, selected_row.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()

# Now that we have the optimal clusters, n, we build our own KMeans model...
km = KMeans(n_clusters)