def evaluate_KMeans():
    """Sweep k for KMeans and chart the average silhouette score per k."""
    # data input (project-local input(), not the builtin)
    data_array, data, data_shape = input()
    samples = data_array.transpose()
    # initial guess for k: sqrt(#columns) / 2
    k_init = int(math.sqrt(data_shape[1]) / 2)
    print('Initial value of k is %d' % k_init)
    # take first 20 result into consideration:
    # score every candidate k in the sweep range
    candidate_ks = range(2, k_init * 2 - 2)
    scores = []
    for k in candidate_ks:
        labels = KMeans(n_clusters=k, random_state=10).fit_predict(samples)
        avg_score = silhouette_score(samples, labels)
        scores.append(avg_score)
        print('For n_clusters = %d, The average silhouette_score is: %f' % (
            k, avg_score))
    # draw the silhouette-vs-k line chart
    plt.plot(candidate_ks, scores, 'bx-')
    plt.title('Silhouette_score-k line-chart')
    plt.xlabel('k')
    plt.ylabel('silhouette_score')
    plt.legend()
    plt.show()
def gmm(n):
    """Fit a spherical Gaussian mixture with n components and return the
    per-sample cluster labels for the transposed data."""
    # data input (project-local input(), not the builtin)
    data_array, data, data_shape = input()
    # fit on samples-as-rows and predict in one step
    model = GMM(n_components=n, covariance_type='spherical')
    return model.fit_predict(data.transpose())
def best_kmeans():
    """Cluster with KMeans at the best-found k (2) and return the labels."""
    # data input (project-local input(), not the builtin)
    data_array, data, data_shape = input()
    samples = data_array.transpose()
    # fixed random_state keeps the assignment reproducible
    return KMeans(n_clusters=2, random_state=10).fit_predict(samples)
def best_dbscan():
    """Cluster with DBSCAN at the best-found parameters and return labels
    (noise points are labelled -1 by DBSCAN)."""
    # data input (project-local input(), not the builtin)
    data_array, data, data_shape = input()
    # eps/min_samples were tuned elsewhere for this dataset
    fitted = DBSCAN(eps=310, min_samples=4).fit(data.transpose())
    return fitted.labels_
def validate(n):
    """Cross-check GMM cluster membership against KNN neighbours.

    Picks a random vipno, runs KNN for several hash sizes and k values,
    and reports whether each returned neighbour lands in the same GMM
    cluster as the query vipno.

    NOTE(review): a second `def validate` later in this file shadows this
    one at import time — confirm which definition is intended to win.
    """
    # data input (project-local input(), not the builtin)
    data_array, data, data_shape = input()
    data_transposed = data.transpose()
    # BUG FIX: random.randint is inclusive on both ends, so the upper
    # bound must be data_shape[1] - 1 (was data_shape[1], which could
    # index one past the last vipno)
    vipno_pos = rd.randint(0, data_shape[1] - 1)
    # cluster everything with a spherical GMM
    gmm = GMM(n_components=n, covariance_type='spherical')
    cluster_labels = gmm.fit_predict(data_transposed)
    # build a vipno -> cluster-label lookup table
    labels_dic = pd.DataFrame(
        np.row_stack((data_transposed.index, cluster_labels)))
    labels_dic = labels_dic.transpose().set_index(labels_dic.transpose()[0])
    # get result of KNN, and compare
    for scale in [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]:
        for k in [1, 2, 3, 4, 5]:
            hash_size = int(data_shape[1] * scale)
            results = knn(data_array, data, hash_size, data_shape, vipno_pos, k)
            if len(results) < 1:
                print('For n_component = %d, hash_size = %d, k = %d: no result from KNN.\n' % (
                    n, hash_size, k))
            else:
                print('For n_component = %d, hash_size = %d, k = %d, vipno_input = %d:' % (
                    n, hash_size, k, results[0]))
                # cluster of the query vipno itself
                cluster = labels_dic.loc[results[0]][1]
                for result in results[1:]:
                    # BUG FIX: look up the neighbour's cluster (was
                    # labels_dic.loc[results[0]][1], which compared the
                    # query against itself and always printed 'same')
                    cluster_result = labels_dic.loc[result][1]
                    print('vipno_output: %d, result: %s' % (
                        result,
                        'same' if cluster == cluster_result else 'not same.'))
                print('')
def _gmm_agreement(reference_labels, result_gmm, total):
    """Fraction of positions where reference_labels agree with the GMM
    labels shifted by GMM's majority label.

    The shift (label - majority_label) is the original author's 2-cluster
    alignment heuristic — presumably it maps GMM's arbitrary label ids
    onto the reference labelling; TODO confirm it is valid beyond k=2.
    """
    main_cluster = np.argmax(np.bincount(result_gmm))
    count = 0
    # BUG FIX: iterate every sample; the original range stopped at
    # total - 2, skipping the last label while still dividing by total
    for i in range(total):
        if reference_labels[i] == result_gmm[i] - main_cluster:
            count += 1
    return float(count) / total


def evaluate_gmm():
    """Compare GMM labellings against DBSCAN and KMeans baselines and
    print the agreement ('accuracy') for each."""
    # data input (project-local input(), not the builtin)
    data_array, data, data_shape = input()
    # baseline labellings from the tuned sibling helpers
    result_kmeans = best_kmeans()
    result_dbscan = best_dbscan()
    # DBSCAN vs GMM
    result_gmm = gmm(2)
    print('GMM accuracy in DBSCAN is: %f' % _gmm_agreement(
        result_dbscan, result_gmm, data_shape[1]))
    # kMeans vs GMM (the original refit GMM here; kept for identical
    # stochastic behaviour)
    result_gmm = gmm(2)
    print('GMM accuracy in kMeans is: %f' % _gmm_agreement(
        result_kmeans, result_gmm, data_shape[1]))
def validate(n):
    """Cross-check KMeans cluster membership against KNN neighbours.

    Picks a random vipno, runs KNN for several hash sizes and k values,
    and reports whether each returned neighbour is predicted into the
    same KMeans cluster as the query vipno.

    NOTE(review): this redefines `validate` and shadows the earlier
    GMM-based version in this file — consider renaming one of them.
    """
    # input again (project-local input(), not the builtin)
    data_array, data, data_shape = input()
    data_array_transposed = data_array.transpose()
    # BUG FIX: random.randint is inclusive on both ends, so the upper
    # bound must be data_shape[1] - 1 (was data_shape[1], which could
    # index one past the last vipno)
    vipno_pos = rd.randint(0, data_shape[1] - 1)
    # fit KMeans once; per-vipno membership comes from predict() below
    kmeans = KMeans(n_clusters=n, random_state=10).fit(data_array_transposed)
    # get the result of KNN using best n, including vipno itself
    for scale in [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]:
        for k in [1, 2, 3, 4, 5]:
            hash_size = int(data_shape[1] * scale)
            results = knn(data_array, data, hash_size, data_shape, vipno_pos, k)
            if len(results) < 1:
                print('For n_cluster = %d, hash_size = %d, k = %d: no result from KNN.\n' % (
                    n, hash_size, k))
            else:
                print('For n_cluster = %d, hash_size = %d, k = %d, vipno_input = %d:' % (
                    n, hash_size, k, results[0]))
                # cluster of the query vipno itself
                cluster = kmeans.predict(
                    data.transpose().loc[results[0]].values.reshape(1, -1))
                # compare each neighbour's predicted cluster to the query's
                for result in results[1:]:
                    cluster_result = kmeans.predict(
                        data.transpose().loc[result].values.reshape(1, -1))
                    print('vipno_output: %d, result: %s' % (
                        result,
                        'same' if cluster == cluster_result else 'not same.'))
                print('')