def score_with_davies(k):
    kmean = KMeans(k)
    labels = kmean.fit_predict(test)
    score = davies_bouldin_score(test, labels)
    return score
def apply_algo_k_3scores(X, params=None, quiet=True):
    '''Adaptation of apply_algo_k_auto in a benchmark context.

    [IN]
        - X (np.array[N,p]): design matrix to put in input of the algorithm.
            Each line is an observation, each column is a predictor.
        - params (dict): dict with all settings. Depends on 'max_k', 'n_clusters'
        - quiet (bool): if True, all prints are skipped

    [OUT]
        - labels (np.array[N]): vector of cluster number attribution.
            BEWARE: the cluster identification numbers are random. Only borders matter.
        - n_clusters_opt (int): optimal number of clusters to be found in the data
        - classif_scores (float): value of classification score (chosen in
            params['classif_score']) for the returned classification.
    '''
    if params is None:
        params = utils.get_default_params()

    # Apply algorithm and compute scores for several numbers of clusters
    all_labels = []
    s_scores = []
    db_scores = []
    ch_scores = []
    for n_clusters in range(2, params['max_k'] + 1):
        labels = apply_algo(X, n_clusters, params=params)
        all_labels.append(labels)
        if len(np.unique(labels)) > 1:
            with np.errstate(divide='ignore', invalid='ignore'):
                # to avoid untimely warnings ("RuntimeWarning: divide by zero encountered in true_divide...")
                db_scores.append(davies_bouldin_score(X, labels))
                s_scores.append(silhouette_score(X, labels))
                ch_scores.append(calinski_harabaz_score(X, labels))
        else:
            db_scores.append(np.nan)
            s_scores.append(np.nan)
            ch_scores.append(np.nan)

    # Choose the best number of clusters
    valid = True
    if params['classif_score'] in ['silhouette', 'silh']:
        k_best = np.nanargmax(s_scores)
        if s_scores[k_best] < 0.6:
            if not quiet:
                print("Bad classification according to silhouette score (",
                      s_scores[k_best], "). BLH is thus NaN")
            valid = False
    elif params['classif_score'] in ['davies_bouldin', 'db']:
        k_best = np.nanargmin(db_scores)
        if db_scores[k_best] > 0.4:
            if not quiet:
                print("Bad classification according to Davies-Bouldin score (",
                      db_scores[k_best], "). BLH is thus NaN")
            valid = False
    else:
        k_best = np.nanargmax(ch_scores)
        if ch_scores[k_best] < 200:
            if not quiet:
                print("Bad classification according to Calinski-Harabasz score (",
                      ch_scores[k_best], "). BLH is thus NaN")
            valid = False

    if all(np.isnan(db_scores)):
        valid = False

    # Return the results
    if valid:
        result = all_labels[k_best], k_best + 2, s_scores[k_best], db_scores[k_best], ch_scores[k_best]
    else:
        result = None, np.nan, s_scores[k_best], db_scores[k_best], ch_scores[k_best]

    return result
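# --- Added illustration (not from the original source) ---
# Hypothetical usage sketch of apply_algo_k_3scores. The design matrix below is a random
# stand-in; in the real benchmark it comes from prepared lidar profiles, and the function
# relies on apply_algo / utils.get_default_params from the same package.
import numpy as np

X_demo = np.random.rand(500, 2)  # N observations, p predictors (illustrative)
labels, n_clusters_opt, s_best, db_best, ch_best = apply_algo_k_3scores(X_demo, quiet=False)
if labels is not None:
    print("optimal k:", n_clusters_opt,
          "| silhouette:", round(s_best, 3),
          "| Davies-Bouldin:", round(db_best, 3),
          "| Calinski-Harabasz:", round(ch_best, 1))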
time_start = time.process_time()  # measure CPU time
dataset = arff.loadarff(open(path + list_dataset[i], 'r'))
data = [[x[0], x[1]] for x in dataset[0]]
dbscan = cluster.DBSCAN(eps=esp[i], min_samples=min_samples[i])
y_pred = dbscan.fit_predict(data)
labels = dbscan.labels_
plt.scatter((dataset[0])['x'], (dataset[0])['y'], c=y_pred, cmap="tab20")
plt.show()
if len(np.unique(labels)) > 1:
    print("Davies-Bouldin index: " + str(metrics.davies_bouldin_score(data, labels)))
    print("Silhouette coefficient: " + str(metrics.silhouette_score(data, labels, metric='euclidean')))
time_stop = time.process_time()  # measure CPU time
print("Computation time: " + str(time_stop - time_start))

print("\n#=================== Values of esp and min_samples NOT given ===================#")
print("\n#--------------------- Varying epsilon -----------------------#")
min_samples = [5, 34, 2, 19, 11, 2]
for i in range(len(list_dataset)):
    print("\n#----------------- " + list_dataset[i] + " --------------------#")
    time_list = []
    davies_bouldin_list = []
print("\n************"+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')+"************\n") text_file.close() ### Number of clusters # ranged = [2, 3, 4, 12] for n_clusters in range(2,31): ## linkage{"ward","complete","average","single"}, optional (default="ward") model = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage=mode) predict = pd.DataFrame(model.fit_predict(df)) predict.columns=['predict'] # concatenate labels to df as a new column r = pd.concat([df,predict],axis=1) r.to_csv("csv/"+datestamp+"_Linkage_"+mode+"_3_"+str(n_clusters)+".csv") #print(r.sample(10)) # clusters silhouette_avg = silhouette_score(df.values, predict.values.ravel()) DBI_avg = davies_bouldin_score(df.values, predict.values.ravel()) text_file = open("data/"+datestamp+"_3_"+mode+"Linkage_score.txt", "a+") text_file.write("\n\nn_clusters ="+str(n_clusters)+"The average silhouette_score is :"+str(silhouette_avg)) text_file.write("\nn_clusters ="+str(n_clusters)+"The average DBI_score is :"+str(DBI_avg)) text_file.close() print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg) print("For n_clusters =", n_clusters, "The average DBI_score is :", DBI_avg)
def daviesBouldinScore(X, y_pred):
    return np.tanh(metrics.davies_bouldin_score(X, y_pred))
def methods(dataset, clustering, params, dictionary):
    cluster_plot(dataset, clustering, params)
    print(colored(f"Clustering quality for {params['name']}", 'red'))
    clustering_quality(silhouette_score(dataset, clustering.labels_), 'silhouette', dictionary)
    clustering_quality(davies_bouldin_score(dataset, clustering.labels_), 'davies_bouldin', dictionary)
    clustering_quality(calinski_harabasz_score(dataset, clustering.labels_), 'calinski_harabasz', dictionary)
def lapscore_main():
    # iterate the whole process 10 times
    for index, subsample in enumerate(X_testset):
        # construct affinity matrix
        kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn",
                    "weight_mode": "heat_kernel", "k": 5, 't': 1}
        W = construct_W.construct_W(subsample, **kwargs_W)

        # obtain the scores of features
        idx = lap_score.lap_score(subsample, mode="rank", W=W)

        # obtain the array of variables through ranking
        X_col_list = X_test_full.columns.values.tolist()
        prepare_list['lap_ranked_Xtestset' + str(index)] = get_variable_rank(idx, X_col_list)
        ranked_var_filename = 'lap_ranked_Xtestset' + str(index) + '.txt'
        f_rank = open(ranked_var_filename, 'w')
        f_rank.write(str(prepare_list['lap_ranked_Xtestset' + str(index)]))
        f_rank.close()

        # perform evaluation on a clustering task
        range_num_fea = range(10, 210, 10)  # number of selected features
        range_n_clusters = [3, 4, 5, 6, 7, 8, 9, 10]  # number of clusters

        # dynamically generate dictionaries to store results
        prepare_list['lapscore_criteria' + str(index)] = {'silhouette_score': [], 'ch_score': [], 'db_score': []}

        # decide the optimal value for num_cluster and the optimal number of selected features
        for n_cluster in range_n_clusters:
            for num_features in range_num_fea:
                # obtain the dataset on the selected features
                selected_features = subsample[:, idx[0:num_features]]

                # initialize the clusterer with n_clusters value and a random generator
                # seed of 10 for reproducibility
                clusterer = KMeans(n_clusters=n_cluster, random_state=10)
                cluster_labels = clusterer.fit_predict(selected_features)

                # the silhouette_score gives the average value for all the samples;
                # this gives a perspective into the density and separation of the formed clusters
                silhouette_avg = metrics.silhouette_score(selected_features, cluster_labels, metric='euclidean')
                # write the content into the dict
                prepare_list['lapscore_criteria' + str(index)]['silhouette_score'].append(silhouette_avg)

                # in normal usage, the Calinski-Harabasz index is applied to the results of a cluster analysis
                ch_idx = metrics.calinski_harabasz_score(selected_features, cluster_labels)
                # write the content into the dict
                prepare_list['lapscore_criteria' + str(index)]['ch_score'].append(ch_idx)

                # in normal usage, the Davies-Bouldin index is applied to the results of a cluster analysis
                db_idx = davies_bouldin_score(selected_features, cluster_labels)
                # write the content into the dict
                prepare_list['lapscore_criteria' + str(index)]['db_score'].append(db_idx)

                print("subset No.", index, ","
                      "For n_clusters =", n_cluster, ","
                      "For num_features =", num_features, ","
                      "the average silhouette_score is: ", silhouette_avg, ","
                      "the Calinski-Harabasz index is: ", ch_idx, ","
                      "the Davies-Bouldin index is: ", db_idx)

    lapscore_silhouette_score = generate_criteria_tb(dict_name='lapscore_criteria', col_name='silhouette_score')
    lapscore_Calinski_Harabasz_index = generate_criteria_tb(dict_name='lapscore_criteria', col_name='ch_score')
    lapscore_Davies_Bouldin_index = generate_criteria_tb(dict_name='lapscore_criteria', col_name='db_score')

    lapscore_silhouette_score.to_csv('lapscore_silhouette_score.csv', index=False)
    lapscore_Calinski_Harabasz_index.to_csv('lapscore_Calinski_Harabasz_index.csv', index=False)
    lapscore_Davies_Bouldin_index.to_csv('lapscore_Davies_Bouldin_index.csv', index=False)
### Number of clusters
for n_clusters in range(2, 31):
    model = KMeans(n_clusters=n_clusters).fit(data_points)
    predict = pd.DataFrame(model.fit_predict(df))
    predict.columns = ['predict']

    # concatenate labels to df as a new column
    r = pd.concat([df, predict], axis=1)
    # print(r.sample(1))
    r.to_csv("csv/" + datestamp + "_KMeans_3_" + str(n_clusters) + ".csv")

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(data_points, predict.values.ravel())
    DBI_avg = davies_bouldin_score(data_points, predict.values.ravel())

    text_file = open("data/" + datestamp + "_3_KMeans_score.txt", "a+")
    text_file.write("\n\nn_clusters =" + str(n_clusters) + "The average silhouette_score is :" + str(silhouette_avg))
    text_file.write("\nn_clusters =" + str(n_clusters) + "The average DBI_score is :" + str(DBI_avg))
    text_file.close()

    print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)
    print("For n_clusters =", n_clusters, "The average DBI_score is :", DBI_avg)
FMI = metrics.fowlkes_mallows_score(Y, labels_pred)  # external metric
print('SC, DB, FMI = ', SC, DB, FMI)

################## Graph metrics
sse = []
fmi = []
dbi = []
k_list = range(2, 15)  # silhouette and Davies-Bouldin scores need at least 2 clusters
for k in k_list:
    km = KMeans(n_clusters=k, random_state=1)
    km.fit(X_norm)
    labels_pred = km.labels_
    SC = metrics.silhouette_score(X_norm, labels_pred, metric='euclidean')
    DB = metrics.davies_bouldin_score(X_norm, labels_pred)  # internal metrics
    # print(k, SC)
    sse.append([k, SC])
    dbi.append([k, DB])
    fmi.append([k, metrics.fowlkes_mallows_score(Y, labels_pred)])

# oca_results_scale = pd.DataFrame({'Cluster': range(2, 15), 'SSE': sse})
oca_results_scale = pd.DataFrame({'Cluster': range(2, 15), 'FMI': fmi})

plt.figure(figsize=(12, 6))
plt.plot(pd.DataFrame(sse)[0], pd.DataFrame(sse)[1], marker='o')
# plt.plot(pd.DataFrame(dbi)[0], pd.DataFrame(dbi)[1], marker='o')
# plt.plot(pd.DataFrame(fmi)[0], pd.DataFrame(fmi)[1], marker='o')
plt.title('Silhouette Score versus Number of Clusters (Scaled Data)')
plt.xlabel('Number of clusters')
plt.ylabel('silhouette_score')
plt.show()
def calcDBs(preds, k):
    db = davies_bouldin_score(df, preds)
    dbs[k].append(db)
print("The Davies-Bouldin Index is used to measure better defined clusters.") print( "\nThe Davies-Bouldin score is lower when clusters more separated (e.g. better partitioned).\n" ) print("Zero is the lowest possible Davies-Bouldin score.\n") import warnings warnings.filterwarnings("ignore") range_n_clusters = [2, 3, 4, 5, 6] for n_clusters in range_n_clusters: clusterer = KMeans(n_clusters=n_clusters, random_state=10) cluster_labels = clusterer.fit_predict(peopleMatrixPcaTransform) score = metrics.davies_bouldin_score(peopleMatrixPcaTransform, cluster_labels) print("The Davies-Bouldin score for :", n_clusters, " clusters is: ", score) #Silhouette Analysis with Kmeans Clustering on the PCA transformed People Matrix # https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py range_n_clusters = [2, 3, 4, 5, 6] for n_clusters in range_n_clusters: # Create a subplot with 1 row and 2 columns fig, (ax1, ax2) = plt.subplots(1, 2) fig.set_size_inches(18, 7) # The 1st subplot is the silhouette plot # The silhouette coefficient can range from -1, 1 but in this example all # lie within [-0.1, 1]
K = range(2, 12, 1)
distortions = []
silhouette = []
db_score = []
calinski = []

for k in K:
    print("For {} clusters".format(k))
    km = KMeans(n_clusters=k, random_state=0).fit(df)
    labels = km.labels_
    # labels = km.predict(df)
    # print(labels[0])
    print("Silhouette Score: {}".format(silhouette_score(df, labels, metric='euclidean')))
    print("DB Score: {}".format(davies_bouldin_score(df, labels)))
    print("Calinski-Harabasz Index: {}".format(metrics.calinski_harabasz_score(df, labels)))

    distortions.append(km.inertia_)
    silhouette.append(silhouette_score(df, labels, metric='euclidean'))
    db_score.append(davies_bouldin_score(df, labels))
    calinski.append(metrics.calinski_harabasz_score(df, labels))

# plt.plot(K, silhouette, K, db_score)
# plt.plot(calinski)
# plt.xlabel('k')
# plt.ylabel('Distortion')
# plt.title('The Elbow Method showing the optimal k')
# plt.legend(['Silhouette Score', 'DB Score'])
# plt.xticks(K)
# plt.show()
from sklearn.metrics.pairwise import cosine_similarity
dist = cosine_similarity(feature_mat)

# Silhouette score - best if closer to 1
from sklearn.metrics import silhouette_score
silhouette_avg = silhouette_score(feature_mat, kmeans.labels_)
print("Silhouette score " + str(silhouette_avg))

# Calinski-Harabasz index - the higher the score, the better
from sklearn.metrics import calinski_harabasz_score  # named calinski_harabaz_score in older scikit-learn releases
print("Calinski score : " + str(calinski_harabasz_score(feature_mat, clusters)))

# Davies-Bouldin index - the closer to zero, the better
from sklearn.metrics import davies_bouldin_score
print("Davies-Bouldin score : " + str(davies_bouldin_score(feature_mat, clusters)))

# Recommendation
# Testing with video id
df = df.reset_index()
df = df.drop(['index'], axis=1)
vid = 'iUdgD8kYU-E'   # Eg 1: Supreme Court justice
# vid = 'tG3wqbEmb7s'  # Eg 2: Iran nuclear deal
# vid = 'Oms5r6_yJB8'  # Eg 3: Robert Mueller
feature_df['id'] = df['id']
feature_df['clusters'] = clusters
df['clusters'] = clusters

rec_obj = recommendation()
least_rel, most_rel = rec_obj.getRecommendation(vid, df, feature_df)
print("The titles of the most relevant recommendations \n")
    'cc3_miles': i}, trans_miles_avg[i - 1])

Std = StandardScaler()
X = Std.fit_transform(X.astype(float))

silhouette = []
davies = []
K = range(2, 11)
for k in K:
    clusterer = KMeans(n_clusters=k, random_state=10)
    cluster_labels = clusterer.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    silhouette.append(silhouette_avg)

L = range(2, 11)
for l in L:
    clusterer = KMeans(n_clusters=l, random_state=10)
    cluster_labels = clusterer.fit_predict(X)
    davies_bouldin = davies_bouldin_score(X, cluster_labels)
    davies.append(davies_bouldin)

plt.plot(K, silhouette, 'mo-', label='silhouette')
plt.plot(L, davies, 'bo-', label='davies')
plt.xlabel('k')
plt.ylabel('silhouette/davies')
plt.title('silhouette/davies - k', fontsize=14, fontweight='bold')
plt.legend()
plt.show()
def Davies_Bouldin_score(X, labels):
    return metrics.davies_bouldin_score(X, labels)
elif setup['algorithm'] == 2:
    algorithm = Kmeans(data, setup['clusters'], setup['class_args'])
elif setup['algorithm'] == 3:
    algorithm = Agglomerative(data, setup['clusters'], setup['class_args'])
elif setup['algorithm'] == 4:
    algorithm = DensityBased(data, setup['class_args'])
elif setup['algorithm'] == 5:
    algorithm = optics(data, setup['class_args'])

data_labels = algorithm.fit_predict()

print('Silhouette:\t\t%0.4f' % silhouette_score(data, data_labels))
print('Calinski-Harabasz:\t%0.1f' % calinski_harabasz_score(data, data_labels))
print('Davies-Bouldin:\t\t%0.4f' % davies_bouldin_score(data, data_labels))

x = data.index
y = data.iloc[:, len(data.columns) - 1]
xlabel = "Index"
ylabel = data.columns[len(data.columns) - 1]

if argument_parser.get_plot_x_axis() is not None:
    x = data.iloc[:, argument_parser.get_plot_x_axis()]
    xlabel = data.columns[argument_parser.get_plot_x_axis()]

if argument_parser.get_plot_y_axis() is not None:
    y = data.iloc[:, argument_parser.get_plot_y_axis()]
    ylabel = data.columns[argument_parser.get_plot_y_axis()]
def fit(self, X, y=None):
    self.columns = X.columns if self.columns is None else self.columns
    self.transform_cols = [x for x in X.columns if x in self.columns]

    # Evaluation
    if any([self.eval_cluster, self.eval_silhouette, self.eval_chi, self.eval_dbi]):
        n_clusters = []
        n_noises = []
        silhouettes1 = []
        silhouettes2 = []
        chis1 = []
        chis2 = []
        dbis1 = []
        dbis2 = []

        self.eval_df = pd.DataFrame()
        self.eval_df['eps'] = [x[0] for x in self.eps_samples_tuples]
        self.eval_df['min_samples'] = [x[1] for x in self.eps_samples_tuples]
        self.eval_df['centroid'] = self.eval_df['eps'].apply(lambda x: [])

        tmp_X = X[self.transform_cols].copy()
        index = 0
        for eps, min_samples in tqdm(self.eps_samples_tuples):
            model = copy.deepcopy(self.model)
            model.eps = eps
            model.min_samples = min_samples
            model.fit(tmp_X)

            # Cluster centroid
            # Exclude calculating centroid of noise cluster
            tmp_df = pd.concat([tmp_X, pd.Series(model.labels_, name='Cluster')], axis=1)
            tmp_df = tmp_df[tmp_df['Cluster'] != -1].reset_index(drop=True)
            self.eval_df.at[index, 'centroid'] = self.__calc_centroids(tmp_df[self.transform_cols], tmp_df['Cluster'])

            tmp_X2 = tmp_X.copy()
            tmp_X2 = pd.concat([tmp_X2, pd.Series(model.labels_, name='Cluster')], axis=1)
            labels2 = tmp_X2[tmp_X2['Cluster'] != -1]['Cluster'].values
            tmp_X2 = tmp_X2[tmp_X2['Cluster'] != -1].drop(columns=['Cluster']).values

            # Reference: https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html
            n_cluster = len(np.unique(model.labels_))
            n_cluster2 = len(np.unique(labels2))
            if self.eval_cluster:
                n_clusters.append(n_cluster)
                n_noises.append(np.sum(np.where(model.labels_ == -1, 1, 0)))

            # Reference: https://towardsdatascience.com/clustering-metrics-better-than-the-elbow-method-6926e1f723a6
            if self.eval_silhouette:
                kwargs = {
                    'metric': 'euclidean',
                    'sample_size': self.eval_sample_size,
                    'random_state': self.random_state
                }
                silhouettes1.append(np.nan if n_cluster <= 1 else silhouette_score(tmp_X, model.labels_, **kwargs))
                silhouettes2.append(np.nan if n_cluster2 <= 1 else silhouette_score(tmp_X2, labels2, **kwargs))

            # Reference: https://stats.stackexchange.com/questions/52838/what-is-an-acceptable-value-of-the-calinski-harabasz-ch-criterion
            if self.eval_chi:
                chis1.append(np.nan if n_cluster <= 1 else calinski_harabasz_score(tmp_X, model.labels_))
                chis2.append(np.nan if n_cluster2 <= 1 else calinski_harabasz_score(tmp_X2, labels2))

            # Reference: https://stackoverflow.com/questions/59279056/davies-bouldin-index-higher-or-lower-score-better
            if self.eval_dbi:
                dbis1.append(np.nan if n_cluster <= 1 else davies_bouldin_score(tmp_X, model.labels_))
                dbis2.append(np.nan if n_cluster2 <= 1 else davies_bouldin_score(tmp_X2, labels2))

            index += 1

        if self.eval_cluster:
            self.eval_df['n_cluster'] = n_clusters
            self.eval_df['n_noise'] = n_noises

        if self.eval_silhouette:
            self.eval_df['silhouette'] = silhouettes1
            self.eval_df['silhouette_w/o_noise'] = silhouettes2

        if self.eval_chi:
            self.eval_df['calinski_harabasz'] = chis1
            self.eval_df['calinski_harabasz_w/o_noise'] = chis2

        if self.eval_dbi:
            self.eval_df['davies_bouldin'] = dbis1
            self.eval_df['davies_bouldin_w/o_noise'] = dbis2

    # Train
    else:
        self.model.fit(X[self.transform_cols])

        # Exclude calculating centroid of noise cluster
        tmp_df = pd.concat([X[self.transform_cols], pd.Series(self.model.labels_, name='Cluster')], axis=1)
        tmp_df = tmp_df[tmp_df['Cluster'] != -1].reset_index(drop=True)

        self.centroid_df = pd.DataFrame(
            self.__calc_centroids(tmp_df[self.transform_cols], tmp_df['Cluster']),
            columns=self.transform_cols)
        self.centroid_df['Cluster'] = [f'Cluster {x}' for x in np.unique(self.model.labels_) if x != -1]
        self.centroid_df.set_index('Cluster', inplace=True)
        self.centroid_df.index.name = None

    return self
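# --- Added illustration (not from the original source) ---
# A standalone sketch of the with/without-noise scoring done in fit() above: DBSCAN labels
# noise points as -1, and internal indices such as Davies-Bouldin can be computed either
# treating noise as its own cluster or after dropping it. Dataset and DBSCAN parameters
# are illustrative.
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import davies_bouldin_score

X_demo, _ = make_blobs(n_samples=400, centers=3, cluster_std=0.8, random_state=0)
demo_labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(X_demo)
mask = demo_labels != -1  # keep only points assigned to a real cluster

if len(np.unique(demo_labels)) > 1:
    print("DB, noise kept as a cluster :", davies_bouldin_score(X_demo, demo_labels))
if len(np.unique(demo_labels[mask])) > 1:
    print("DB, noise points excluded   :", davies_bouldin_score(X_demo[mask], demo_labels[mask]))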
plt.figure()
plt.suptitle("Each row is the next iteration")
for i in range(1, iteration):
    km_kpp = KMeans(n_clusters=9, init='k-means++', max_iter=1)
    km_frandom = KMeans(n_clusters=9, init=centers_random, max_iter=1)
    km_forgy = KMeans(n_clusters=9, init='random', max_iter=1)
    km_kpp.fit(X)
    km_frandom.fit(X)
    km_forgy.fit(X)
    y_kpp = km_kpp.predict(X)
    y_frandom = km_frandom.predict(X)
    y_forgy = km_forgy.predict(X)

    dbs_kpp.append(davies_bouldin_score(X, y_kpp))
    silcoeff_kpp.append(silhouette_score(X, y_kpp))
    dbs_frandom.append(davies_bouldin_score(X, y_frandom))
    silcoeff_frandom.append(silhouette_score(X, y_frandom))
    dbs_forgy.append(davies_bouldin_score(X, y_forgy))
    silcoeff_forgy.append(silhouette_score(X, y_forgy))

    plt.subplot(iteration - 1, 3, 3 * i - 2)
    plt.title("kmeans++")
    plt.scatter(X[:, 0], X[:, 1], marker='o', c=y_kpp, s=50, cmap='viridis')
    center_kpp = km_kpp.cluster_centers_
    plt.scatter(center_kpp[:, 0], center_kpp[:, 1], c='black',
reduced_data = PCA(n_components=23).fit_transform(df)  # choosing the number of principal components as 23

silhouette = []
daviesBouldin = []
distortions = []
K = range(2, 30)
for k in K:
    kmeans = KMeans(n_clusters=k).fit(reduced_data)
    kmeans.fit(reduced_data)
    distortions.append(
        sum(np.min(cdist(reduced_data, kmeans.cluster_centers_, 'euclidean'), axis=1)) / reduced_data.shape[0])
    labels = kmeans.labels_
    sh = metrics.silhouette_score(reduced_data, labels, metric='euclidean')
    db = davies_bouldin_score(reduced_data, labels)
    silhouette.append(sh)
    daviesBouldin.append(db)

# Plot the elbow graph
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.figure()

# Calculate the silhouette index for each k
plt.plot(K, silhouette, 'bx-')
plt.xlabel('k-number')
plt.ylabel('Silhouette score')
plt.title('Silhouette scores for varying k')
plt.xlabel('First Component')
plt.ylabel('Second Component')
plt.show()

print('#################Kmeans#################')

# Plot
if not ignore:
    silhouette_scores = []
    davies_bouldin_scores = []
    calinski_harabasz_scores = []
    for k in range(2, 20):
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(df_pca)
        silhouette_scores.append(silhouette_score(df_pca, kmeans.labels_))
        davies_bouldin_scores.append(davies_bouldin_score(df_pca, kmeans.labels_))
        calinski_harabasz_scores.append(calinski_harabasz_score(df_pca, kmeans.labels_))

    # Plots: we want high silhouette, high Calinski-Harabasz and low Davies-Bouldin
    fig = plt.figure(figsize=(15, 5))
    plt.plot(range(2, 20), silhouette_scores)
    plt.grid(True)
    plt.title('Get the optimal n_clusters')
    plt.xlabel('N_clusters')
    plt.ylabel('Silhouette Score')
    plt.show()

    fig = plt.figure(figsize=(15, 5))
    plt.plot(range(2, 20), calinski_harabasz_scores)
    plt.grid(True)
    plt.title('Get the optimal n_clusters')
# In[26]:

corrected_image, r, c = correct_image(X_IPCA[0].reshape(orig_rows, orig_cols), 13)
vectorized = corrected_image.reshape((r * c), 1)
kmeans = KMeans(random_state=0, init='random', n_clusters=4)
labels = kmeans.fit_predict(vectorized)

# In[27]:

# We use the davies_bouldin_score as a clustering performance metric
from sklearn.metrics import davies_bouldin_score
score = davies_bouldin_score(vectorized, labels)
print(score)

# In[28]:

# Convert these into single-channel 8-bit pixels and finally merge them into the RGBA format,
# which represents the false-colour image
fig = plt.figure(figsize=(25, 50))
im_pil0 = Image.fromarray(segmented_images[0])
im0 = im_pil0.convert('L')
for m in models:
    predicts = m['m'].fit_predict(X)
    t = m['t']
    X_PCA = PCA(n_components=2).fit_transform(X)
    plt.scatter(X_PCA[:, 0], X_PCA[:, 1], c=predicts, s=50, cmap='rainbow', alpha=0.5)
    plt.title(t)
    plt.savefig('out/' + t + '.png')
    plt.clf()

print('\n3. Validate the clusterings using the Davies-Bouldin and Silhouette indices')
print('{}\t{}\t{}'.format('Model', 'Bouldin', 'Silhouette'))
for m in models:
    labels = m['m'].fit_predict(X)
    dbScore = davies_bouldin_score(X, labels)
    sScore = silhouette_score(X, labels, metric='euclidean')
    print('{}\t{}\t{}'.format(m['t'], dbScore, sScore))

print('\n4. Show the results of both indices for the generated clusterings')
print(
    """
    --------------------------------------------------------------------------
    Model                              Bouldin               Silhouette
    --------------------------------------------------------------------------
    AgglomerativeClustering (k = 2)    0.7330018210488929    0.5532678504628996
    KMeans (k = 2)                     0.7133822795826191    0.5687897205830247
    GaussianMixture (k = 2)            0.8604650094596924    0.3919496047300402
    AgglomerativeClustering (k = 3)    0.6041813066360704    0.5281675826566276
model = KMeans(n_clusters=k)
# use the first 5 PCA components
X_pc = PCA_components.iloc[:, :5]
# apply k-means clustering
model.fit(X_pc)
cluster_labels = model.fit_predict(X_pc)  # alternatively: agglomerative or spectral clustering
if k > 1:
    cur_silhout = silhouette_score(X_pc, cluster_labels)
    silouts.append(cur_silhout)
    cur_davies_b = davies_bouldin_score(X_pc, cluster_labels)
    davies_b.append(cur_davies_b)
# Append the inertia to the list of inertias
print(k)
inertias.append(model.inertia_)

# SI Figure 2
plt.figure()
plt.plot(ks, inertias, '-o', color='black')
plt.xlabel('Number of Clusters')
plt.ylabel('Within-Cluster Sum of Squares (Inertia)')
plt.xticks(ks)
plt.tight_layout()
ax = plt.axes()
ax.grid(False)
def score_with_davies_bouldin(X: np.array, labels: list):
    return metrics.davies_bouldin_score(X, labels)
def apply_algo_k_auto(X, init_codification=None, quiet=True, params=None):
    '''Apply the machine learning algorithm for various numbers of clusters
    and choose the best according to a certain score.

    [IN]
        - X (np.array[N,p]): design matrix to put in input of the algorithm.
            Each line is an observation, each column is a predictor.
        - init_codification (dict): dict to link initialisation strategy with
            actual algorithm inputs. See kabl.core.apply_algo
        - quiet (boolean): if True, cut down all prints
        - params (dict): dict with all settings. Depends on 'max_k', 'n_clusters'

    [OUT]
        - labels (np.array[N]): vector of cluster number attribution.
            BEWARE: the cluster identification numbers are random. Only borders matter.
        - n_clusters_opt (int): optimal number of clusters to be found in the data
        - classif_scores (float): value of classification score (chosen in
            params['classif_score']) for the returned classification.
    '''
    if params is None:
        params = utils.get_default_params()

    # Apply algorithm and compute scores for several numbers of clusters
    all_labels = []
    classif_scores = []
    for n_clusters in range(2, params['max_k']):
        labels = apply_algo(X, n_clusters, init_codification=init_codification, params=params)
        all_labels.append(labels)

        if params['classif_score'] in ['silhouette', 'silh']:
            classif_scores.append(silhouette_score(X, labels))
        elif params['classif_score'] in ['davies_bouldin', 'db']:
            with np.errstate(divide='ignore', invalid='ignore'):
                # to avoid untimely warnings ("RuntimeWarning: divide by zero encountered in true_divide...")
                classif_scores.append(davies_bouldin_score(X, labels))
        else:
            # Default because fastest
            classif_scores.append(calinski_harabaz_score(X, labels))

    # Choose the best number of clusters
    if params['classif_score'] in ['silhouette', 'silh']:
        k_best = np.argmax(classif_scores)
        if classif_scores[k_best] < 0.5:
            if not quiet:
                print("Bad classification according to silhouette score (",
                      classif_scores[k_best], "). BLH is thus NaN")
            k_best = None
    elif params['classif_score'] in ['davies_bouldin', 'db']:
        k_best = np.argmin(classif_scores)
        if classif_scores[k_best] > 0.36:
            if not quiet:
                print("Bad classification according to Davies-Bouldin score (",
                      classif_scores[k_best], "). BLH is thus NaN")
            k_best = None
    else:
        k_best = np.argmax(classif_scores)
        if classif_scores[k_best] < 200:
            if not quiet:
                print("Bad classification according to Calinski-Harabasz score (",
                      classif_scores[k_best], "). BLH is thus NaN")
            k_best = None

    # Return the results
    if k_best is not None:
        result = all_labels[k_best], k_best + 2, classif_scores[k_best]
    else:
        result = None, None, None

    return result
def DB(points, labelsPred):
    return float("%0.2f" % metrics.davies_bouldin_score(points, labelsPred))
def kabl_qualitymetrics(inputFile, outputFile=None, reference='None', rsFile='None',
                        storeResults=True, params=None):
    '''Copy of blh_estimation including calculation and storage of scores.

    [IN]
        - inputFile (str): path to the input file, as generated by raw2l1
        - outputFile (str): path to the output file. Default adds ".out" before ".nc"
        - reference (str): path to the reference file, if any.
        - rsFile (str): path to the radiosounding estimations, if any (gives the
            possibility to store them in the same netCDF)
        - storeResults (bool): if True, the field 'blh_ababl', containing the BLH
            estimation, is stored in the outputFile
        - params (dict): dict of parameters. Depends on 'n_clusters'

    [OUT]
        - errl2_blh (float): root mean squared gap between BLH from KABL and the reference
        - errl1_blh (float): mean absolute gap between BLH from KABL and the reference
        - errl0_blh (float): maximum absolute gap between BLH from KABL and the reference
        - ch_score (float): mean over the whole day of the Calinski-Harabasz score (the higher, the better)
        - db_scores (float): mean over the whole day of the Davies-Bouldin score (the lower, the better)
        - s_scores (float): mean over the whole day of the silhouette score (the higher, the better)
        - chrono (float): computation time for the full day (seconds)
        - n_invalid (int): number of BLH estimations at NaN or Inf
    '''
    t0 = time.time()  #::::::::::::::::::::::

    if params is None:
        params = utils.get_default_params()

    # 1. Extract the data
    # ---------------------
    loc, dateofday, lat, lon = utils.where_and_when(inputFile)
    t_values, z_values, rcs_1, rcs_2, blh_mnf, rr, vv, cbh = utils.extract_data(
        inputFile, to_extract=['rcs_1', 'rcs_2', 'pbl', 'rr', 'vv', 'b1'], params=params)

    blh = []
    K_values = []
    s_scores = []
    db_scores = []
    ch_scores = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write("KABL estimation (" + loc + dateofday.strftime(', %Y/%m/%d') +
                     "): [%s]" % ("." * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['

    # Loop on all profiles of the day
    for t in range(len(t_values)):

        # toolbar
        if np.mod(t, 10) == 0:
            if any(np.isnan(blh[-11:-1])):
                sys.stdout.write("!")
            else:
                sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        # ---------------------
        coords = {
            'time': dt.datetime.utcfromtimestamp(t_values[t]),
            'lat': lat,
            'lon': lon
        }
        t_back = max(t - params['n_profiles'] + 1, 0)
        X, Z = prepare_data(coords, z_values, rcs_1[t_back:t + 1, :],
                            rcs_2[t_back:t + 1, :], params=params)

        # 3. Apply the machine learning algorithm
        # ---------------------
        if isinstance(params['n_clusters'], int):
            n_clusters = params['n_clusters']
            labels = apply_algo(X, params['n_clusters'], params=params)

            # Compute classification scores
            if len(np.unique(labels)) > 1:
                with np.errstate(divide='ignore', invalid='ignore'):
                    # to avoid untimely warnings ("RuntimeWarning: divide by zero encountered in true_divide...")
                    db_score = davies_bouldin_score(X, labels)
                    s_score = silhouette_score(X, labels)
                    ch_score = calinski_harabaz_score(X, labels)
            else:
                db_score = np.nan
                s_score = np.nan
                ch_score = np.nan
        else:
            labels, n_clusters, s_score, db_score, ch_score = apply_algo_k_3scores(X, params=params)

        # 4. Derive and store the BLH
        # ---------------------
        blh.append(blh_from_labels(labels, Z))
        K_values.append(n_clusters)
        s_scores.append(s_score)
        db_scores.append(db_score)
        ch_scores.append(ch_score)

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    if outputFile is None:
        fname = inputFile.split('/')[-1]
        outputFile = "DAILY_BENCHMARK_" + fname[10:-3] + ".nc"

    mask_cloud = cbh[:] <= 3000

    if os.path.isfile(reference):
        blh_ref = np.loadtxt(reference)
    else:
        blh_ref = blh_mnf[:, 0]

    if storeResults:
        BLHS = [np.array(blh), np.array(blh_mnf[:, 0])]
        BLH_NAMES = ['BLH_KABL', 'BLH_INDUS']
        if os.path.isfile(reference):
            BLHS.append(blh_ref)
            BLH_NAMES.append('BLH_REF')

        # Cloud base height is added as if it were a BLH though it's not
        BLHS.append(cbh)
        BLH_NAMES.append("CLOUD_BASE_HEIGHT")

        msg = utils.save_qualitymetrics(outputFile, t_values, BLHS, BLH_NAMES,
                                        [s_scores, db_scores, ch_scores],
                                        ['SILH', 'DB', 'CH'], [rr, vv],
                                        ['MASK_RAIN', 'MASK_FOG'], K_values,
                                        chrono, params)

        if os.path.isfile(rsFile):
            blh_rs = utils.extract_rs(rsFile, t_values[0], t_values[-1])
        else:
            blh_rs = None

        # graphics.blhs_over_data(t_values, z_values, rcs_1, BLHS, [s[4:] for s in BLH_NAMES],
        #                         blh_rs=blh_rs, storeImages=True, showFigure=False)
        print(msg)

    errl2_blh = np.sqrt(np.nanmean((blh - blh_ref) ** 2))
    errl1_blh = np.nanmean(np.abs(blh - blh_ref))
    errl0_blh = np.nanmax(np.abs(blh - blh_ref))
    corr_blh = np.corrcoef(blh, blh_ref)[0, 1]
    n_invalid = np.sum(np.isnan(blh)) + np.sum(np.isinf(blh))

    return (errl2_blh, errl1_blh, errl0_blh, corr_blh, np.mean(ch_scores),
            np.mean(db_scores), np.mean(s_scores), chrono, n_invalid)
kmeans = KMeans(n_clusters=5)
kmeans.fit(scaled_X)

scores = [KMeans(n_clusters=i + 2).fit(scaled_X).inertia_ for i in range(20)]
sns.lineplot(np.arange(2, 22), scores)
plt.xlabel('Number of clusters')
plt.ylabel("Inertia")
plt.title("Inertia of KMeans versus number of clusters")

# Evaluation
from sklearn.metrics import silhouette_score, davies_bouldin_score

# silhouette score
print('kmeans: {}'.format(silhouette_score(scaled_X, kmeans.labels_, metric='euclidean')))
# Davies-Bouldin index
print('kmeans: {}'.format(davies_bouldin_score(scaled_X, kmeans.labels_)))

scaled_X['cluster'] = kmeans.labels_

# Use a RandomForest to interpret the clusters
from sklearn.ensemble import RandomForestClassifier

# take the noise out of the model
scaled_X_no_noise = scaled_X[scaled_X.cluster != -1].copy()
scaled_X_no_noise.drop(columns=['TotalCharges', 'MonthlyCharges'], inplace=True)

# build the RFC classifier to calculate MDI as a proxy for feature importance
y = scaled_X_no_noise.iloc[:, -1]
X = scaled_X_no_noise.iloc[:, :-1]
clf = RandomForestClassifier(n_estimators=100).fit(X, y)
# figname = create_path("fig", sys.argv[1], "KMeans", sys.argv[2], filename=("%d.png" % n_clusters)) # silhouette_analysis(X, cluster_labels, n_clusters, figname) centers = pca.transform(clusterer.cluster_centers_) figname = create_path("fig", sys.argv[1], "KMeans", sys.argv[2], filename=("%d_vis.png" % n_clusters)) visualize_cluster(X_vis, cluster_labels, n_clusters, centers, figname) ari = metrics.adjusted_rand_score(y, cluster_labels) ami = metrics.adjusted_mutual_info_score(y, cluster_labels) nmi = metrics.normalized_mutual_info_score(y, cluster_labels) fms = metrics.fowlkes_mallows_score(y, cluster_labels) sil = metrics.silhouette_score(X, cluster_labels, metric='euclidean') chi = metrics.calinski_harabaz_score(X, cluster_labels) dbi = metrics.davies_bouldin_score(X, cluster_labels) print ("Adjusted Rand index: %.6f" % ari) print ("Adjusted Mutual Information: %.6f" % ami) print ("Normalized Mutual Information: %.6f" % nmi) print ("Fowlkes-Mallows score: %.6f" % fms) print ("Silhouette Coefficient: %.6f" % sil) print ("Calinski-Harabaz Index: %.6f" % chi) print ("Davies-Bouldin Index: %.6f" % dbi) ari_score.append(ari) ami_score.append(ami) nmi_score.append(nmi) fms_score.append(fms) sil_score.append(sil) chi_score.append(chi)
# fit = bestfeatures.fit(X, y)
# dfscores = pd.DataFrame(fit.scores_)
# dfcolumns = pd.DataFrame(X.columns)
# featureScores = pd.concat([dfcolumns, dfscores], axis=1)
# featureScores.columns = ['Specs', 'Score']
# print(featureScores)
# print(featureScores.nlargest(19, 'Score'))

########## k-means on X ##########################################################
kmeans = KMeans(n_clusters=4)
kmeans.fit(X)
labels = kmeans.predict(X)
# np.savetxt("kmeans.txt", labels)
kmeans_DBI = davies_bouldin_score(X, labels)
print("KMeans DBI score :", kmeans_DBI)
kmeans_RI = rand_index_score(y, labels)  # calculate_rand_index(y, labels) / rand_index_score(y, labels)
print("KMeans RI score :", kmeans_RI)

##### Complete-Linkage Agglomerative nesting ##################################
clustering = AgglomerativeClustering(n_clusters=4, linkage="complete").fit_predict(X)
# np.savetxt("AgglomerativeClustering.txt", clustering)
agnest_DBI = davies_bouldin_score(X, clustering)
print("AgglomerativeClustering DBI score :", agnest_DBI)
agnest_RI = rand_index_score(