def findeps(data):
    """Estimate a DBSCAN ``eps`` from the elbow of the nearest-neighbour curve.

    The data is standardised, NaNs are replaced by zero, and the distance from
    every sample to its nearest other sample is computed. Those distances are
    sorted in ascending order, plotted, and the elbow of the resulting curve
    is located with ``Rotor``.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        Samples to analyse.

    Returns
    -------
    eps : float
        The sorted nearest-neighbour distance at the elbow index. Because the
        distances are computed on the standardised data, the value is
        expressed in standardised feature units.
    """
    # The original assigned the scaled array and then immediately replaced it
    # with np.nan_to_num(data), so the scaling was silently discarded. Clean
    # the *scaled* array instead.
    d = np.nan_to_num(StandardScaler().fit_transform(data))

    neighbors = NearestNeighbors(n_neighbors=2).fit(d)
    distances, _ = neighbors.kneighbors(d)
    # Column 0 is the query point's distance to itself; column 1 is the
    # distance to its nearest other sample.
    distances = np.sort(distances, axis=0)[:, 1]

    plt.plot(distances)
    plt.show()

    # Use the position in the sorted curve as the x-coordinate. The returned
    # neighbour indices are not guaranteed to be 0..n-1 when the data
    # contains duplicate points.
    curve = np.column_stack((np.arange(len(distances)), distances))
    rotor = Rotor()
    rotor.fit_rotate(curve)
    return distances[rotor.get_elbow_index()]
def get_eps(X, neigh=2):
    """Locate the elbow of the sorted k-neighbour distance curve of ``X``.

    The distances returned by ``calculate_kn_distance`` are sorted, shown as a
    60-bin histogram, and passed to ``Rotor`` as an ``(index, distance)``
    curve. The detected elbow is plotted as well.

    Parameters
    ----------
    X : array-like
        Data forwarded to ``calculate_kn_distance``.
    neigh : int, default=2
        Forwarded to ``calculate_kn_distance`` as ``neigh``.

    Returns
    -------
    numpy.ndarray
        The row of the ``(index, distance)`` curve at the elbow index.
    """
    sorted_dist = np.sort(calculate_kn_distance(X, neigh=neigh))

    # Histogram of the distances.
    plt.hist(sorted_dist, bins=60)
    plt.ylabel('n')
    plt.xlabel('Epsilon distance')
    plt.show()

    # Build the two-column curve: position in the sorted array vs. distance.
    positions = np.arange(sorted_dist.shape[0]).reshape(-1, 1)
    values = sorted_dist.reshape(-1, 1)
    curve_xy = np.concatenate([positions, values], 1)

    elbow_finder = Rotor()
    elbow_finder.fit_rotate(curve_xy)
    elbow_finder.plot_elbow()

    return curve_xy[elbow_finder.get_elbow_index()]
# The average silhouette determines how well each object lies within its
# cluster. The location of the maximum is considered the appropriate number
# of clusters.
# NOTE(review): the +3 offset presumably maps a list position back to a
# cluster count because the candidates start at k=3 (see the commented-out
# plot range below) -- confirm against the code that fills the score lists.
k_silhouette = np.argmax(silhouette_avg_scores) + 3

# visualization
# plt.plot(range(3, 20), silhouette_avg_scores)
# plt.title('Average silhouette method')
# plt.xlabel('Number of clusters')
# plt.ylabel('Average silhouette')
# plt.show()

# Get the elbow of the WCSS curve.
rotor = Rotor()
rotor.fit_rotate(new_wcss)
k_wcss = rotor.get_elbow_index() + 3

# Take the average of the two estimates for k. The sum must be parenthesised:
# the original `k_wcss + k_silhouette / 2` halved only the silhouette
# estimate because division binds tighter than addition.
k = int(np.floor((k_wcss + k_silhouette) / 2))

# Now do the same thing with the library implementation.
kmeans = KMeans(n_clusters=k)
kmeans.fit(feature_cordinates[['X', 'Y']])
labels = kmeans.predict(feature_cordinates[['X', 'Y']])
centroids = kmeans.cluster_centers_

# Save the labelled coordinates to a new JSON file.
feature_cordinates['cluster'] = labels
feature_cordinates.to_json(output_file_name)

# visualization
# First evaluation
# Nearest neighbors to find the optimal epsilon (maximum distance)
# https://towardsdatascience.com/machine-learning-clustering-dbscan-determine-the-optimal-value-for-epsilon-eps-python-example-3100091cfbc
# Possible algorithms: ['auto', 'ball_tree', 'kd_tree', 'brute']
nbrs = NearestNeighbors(n_neighbors=5, algorithm='kd_tree').fit(xyz_nn)
distances, indices = nbrs.kneighbors(xyz_nn)  # indices of the nearest neighbors
distances = np.sort(distances, axis=0)
distances = distances[:, 4]  # distance to the farthest of the 5 neighbors
plt.plot(distances)

# Build the (x, y) curve for elbow detection.
y = np.array(distances)
# The original read len(x) before x was assigned. x must have the same length
# as y for the vstack below to work, so size it from y.
x = np.linspace(0, len(y), len(y))
xy = np.vstack((x, y)).T

rotor = Rotor()
rotor.fit_rotate(xy)
elbow_idx = rotor.get_elbow_index()
rotor.plot_elbow()
# eps is half of the sorted distance found at the elbow.
eps = distances[elbow_idx] / 2
del x, y, xy

# NOTE(review): the original comment here read "the number of samples is
# D+1=4", but min_samples is 5 -- confirm which value is intended.
clustering = DBSCAN(algorithm='kd_tree', eps=eps, min_samples=5).fit(xyz_nn)
labels = clustering.labels_
colors = [int(i % 23) for i in labels]  # 554 labels to 23 distinguished colors
v = pptk.viewer(data, colors)
v.set(point_size=0.01)

# matplotlib
core_samples_mask = np.zeros_like(clustering.labels_, dtype=bool)
core_samples_mask[clustering.core_sample_indices_] = True
# NOTE(review): this call looks like the tail of a preceding block that is not
# visible here -- confirm its intended indentation/scope.
save_labels(labels, csv_path)


def dbscan_knee(X, save_path, metric):
    """Estimate a DBSCAN ``eps`` from the knee of the nearest-neighbour curve.

    Fits a 3-neighbour search on ``X`` with the given metric, sorts the
    distance of every sample to its nearest other sample, finds the elbow of
    that curve with ``Rotor``, and saves a plot of the curve.

    Parameters
    ----------
    X : array-like
        Data passed to ``NearestNeighbors.fit`` / ``kneighbors``.
    save_path : str
        Directory in which ``knee_<metric>.jpg`` is written.
    metric : str
        Distance metric passed to ``NearestNeighbors``; also used in the
        output file name.

    Returns
    -------
    eps : float
        The sorted nearest-neighbour distance at the elbow index.
    """
    print('p1')
    neigh = NearestNeighbors(n_neighbors=3, metric=metric)
    neighbors = neigh.fit(X)
    print('p2')
    distances, _ = neighbors.kneighbors(X)
    distances = np.sort(distances, axis=0)
    # Column 0 is the distance of each point to itself; column 1 is the
    # distance to its nearest other sample.
    distances = distances[:, 1]
    print('hello before Rotor')
    rotor = Rotor()
    # Size the x-axis from the curve itself. The original used the length of
    # an undeclared global `df`, which fails (or silently depends on unrelated
    # state) whenever `df` is not the frame that X was built from.
    data = np.hstack((np.arange(len(distances)).reshape(-1, 1),
                      distances.reshape(-1, 1)))
    rotor.fit_rotate(data)
    eps = distances[rotor.get_elbow_index()]
    print('p3')
    plt.plot(distances)
    plt.title(f'eps={eps}')
    plt.savefig(f'{save_path}/knee_{metric}.jpg')
    plt.close()
    return eps


# DBSCAN (min_points = 3, 4, 5)
save_pref = [join(save_folder, sample[sample.rfind('/')+1:]) for sample in samples]
distances = ['cosine', 'jaccard']
methods = ['average', 'ward']
def test_detect_elbow(self):
    """The elbow index found on ``sample_inc`` must be within 1 of 11."""
    rotor = Rotor()
    rotor.fit_rotate(sample_inc)
    elbow_index = rotor.get_elbow_index()
    self.assertAlmostEqual(elbow_index, 11, delta=1)
def cluster_DBSCAN(data, eps=None, min_samples=None, metric="euclidean"):
    r"""Cluster data using DBSCAN.

    This function clusters the samples using the density-based clusterer
    `DBSCAN
    <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html>`_
    provided by scikit-learn.

    DBSCAN finds clusters of core samples of high density. A sample point is a
    core sample if at least `min_samples` points are within distance
    :math:`\varepsilon` of it. A cluster is defined as a set of sample points
    that are mutually density-connected and density-reachable, i.e. for any
    two points in the cluster there is a path
    :math:`\left\langle p_{1}, p_{2}, \ldots, p_{n}\right\rangle` between them
    where each :math:`p_{i+1}` is within distance :math:`\varepsilon` of
    :math:`p_{i}`.

    The values of `min_samples` and :math:`\varepsilon` determine the
    performance of this clusterer. If `min_samples` is None it is derived from
    the data shape (see below). If :math:`\varepsilon` is None, it is set to
    the value at the knee of the sorted neighbour-distance curve.

    Parameters
    ----------
    data : numpy.ndarray, shape=(n_samples, n_dims)
        Sample data to find clusters.
    eps : None or scalar, default=None
        The maximum distance between two samples for one to be considered as
        in the neighborhood of the other. This is not a maximum bound on the
        distances of points within a cluster. This is the most important
        DBSCAN parameter to choose appropriately for your data set and
        distance function. If None, it is set to the value at the knee of the
        sorted neighbour-distance curve.
    min_samples : None or scalar, default=None
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.
        If None, it is chosen from the data shape:
        ``min(n_dims, 4)`` when ``2 * n_dims > n_samples``,
        ``min(2 * n_dims, n_samples)`` when ``n_samples < 1000``, and
        ``min(5 * n_dims, n_samples)`` otherwise.
    metric : string or callable, default='euclidean'
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by `sklearn.metrics.pairwise_distances` for its
        metric parameter.

    Returns
    -------
    labels : array_like, shape=(n_samples,)
        Cluster labels for each data point.
    core_sample_indices : array_like, shape=(n_clusters,)
        Indices of core samples, grouped by cluster label. When there are no
        more samples than dimensions, all samples are placed in one cluster
        and this is an array of shape ``(1, n_samples)`` holding every index.
    """
    n_samples = len(data)
    n_dims = len(data[0])

    # Too few samples to cluster meaningfully: put everything in cluster 0.
    if n_samples <= n_dims:
        return (np.zeros(n_samples, dtype=int),
                np.arange(n_samples)[np.newaxis, :])

    # Only derive min_samples when the caller did not supply one; the original
    # overwrote the argument unconditionally, so it was silently ignored.
    if min_samples is None:
        if 2 * n_dims > n_samples:
            min_samples = min(n_dims, 4)
        elif n_samples < 1000:
            min_samples = min(2 * n_dims, n_samples)
        else:
            min_samples = min(5 * n_dims, n_samples)

    if eps is None:
        # At least 2 neighbours are needed so that column 1 (nearest other
        # sample) exists, and no more than n_samples can be requested.
        n_neighbors = min(max(2, min_samples), n_samples)
        # Use the same metric as DBSCAN so eps is measured in the distance
        # space that DBSCAN will actually apply it in.
        nearest_neighbors = NearestNeighbors(n_neighbors=n_neighbors,
                                             metric=metric)
        nearest_neighbors.fit(data)
        distances, _ = nearest_neighbors.kneighbors(data)
        # Column 0 is each point's distance to itself; column 1 is the
        # distance to its nearest other sample.
        distances = np.sort(distances, axis=0)[:, 1]
        curve = np.vstack([np.arange(len(distances)), distances]).T
        rotor = Rotor()
        rotor.fit_rotate(curve)
        eps = distances[rotor.get_elbow_index()]

    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
    dbscan.fit(data)

    # One bucket per real cluster (noise label -1 excluded). Each core sample
    # is appended to the bucket indexed by its cluster label.
    core_sample_indices = [[] for label in np.unique(dbscan.labels_)
                           if label != -1]
    for core_sample_index in dbscan.core_sample_indices_:
        core_sample_indices[dbscan.labels_[core_sample_index]].append(
            core_sample_index)
    return dbscan.labels_, core_sample_indices