def optimizeEps(group, rep, fig=None):
    """
    Special thanks to Cory Maklin
    https://towardsdatascience.com/machine-learning-clustering-dbscan-determine-the-optimal-value-for-epsilon-eps-python-example-3100091cfbc
    and also stackexchange user georg-un for the kneebow package:
    https://datascience.stackexchange.com/questions/57122/in-elbow-curve-how-to-find-the-point-from-where-the-curve-starts-to-rise
    """
    X = group[["ae1", "ae2"]].to_numpy()
    neigh = NearestNeighbors(n_neighbors=2)
    nbrs = neigh.fit(X)
    dist, idx = nbrs.kneighbors(X)
    dist = np.sort(dist, axis=0)
    d = dist[:, 1]
    # Each point's first neighbor is itself, so idx[:, 0] is just 0..n-1;
    # use it as the x-axis of the sorted k-distance curve.
    dist[:, 0] = idx[:, 0]
    # if fig is not None:
    #     ax = fig.add_subplot(10, 10, rep)
    #     ax.plot(d)
    #     plt.show()
    rotor = Rotor()
    rotor.fit_rotate(dist)
    elbow_index = rotor.get_elbow_index()
    # ax.axhline(dist[elbow_index][1])
    return dist[elbow_index][1]
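# A hypothetical usage sketch for optimizeEps (not from the original source):
# it assumes a pandas DataFrame with autoencoder coordinates in columns
# "ae1"/"ae2", plus the imports the function relies on.
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from kneebow.rotor import Rotor

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    demo = pd.DataFrame(rng.normal(size=(200, 2)), columns=["ae1", "ae2"])
    eps = optimizeEps(demo, rep=1)  # rep is only used by the optional subplot
    print(f"estimated eps: {eps:.4f}")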
def test_scaling(self):
    r = Rotor()
    r.fit_rotate(sample_inc, scale=True, theta=0)
    self.assertLessEqual(r._data[:, 0].max(), 1.00000001)
    self.assertLessEqual(r._data[:, 1].max(), 1.00000001)
    self.assertGreaterEqual(r._data[:, 0].min(), -0.00000001)
    self.assertGreaterEqual(r._data[:, 1].min(), -0.00000001)
def test_rotation(self):
    data = np.array([[0, 0], [1, 0], [0, 1]])
    r = Rotor()
    r.fit_rotate(data, scale=False, theta=np.radians(45))
    self.assertAlmostEqual(r._data[0].tolist()[0], 0, delta=0.01)
    self.assertAlmostEqual(r._data[0].tolist()[1], 0, delta=0.01)
    self.assertAlmostEqual(r._data[1].tolist()[0], 0.71, delta=0.01)
    self.assertAlmostEqual(r._data[1].tolist()[1], -0.71, delta=0.01)
    self.assertAlmostEqual(r._data[2].tolist()[0], 0.71, delta=0.01)
    self.assertAlmostEqual(r._data[2].tolist()[1], 0.71, delta=0.01)
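# The expected values above follow from kneebow rotating the data clockwise
# by theta: a clockwise rotation maps (x, y) to
# (x*cos t + y*sin t, -x*sin t + y*cos t), so (1, 0) goes to
# (cos 45, -sin 45) ~ (0.71, -0.71). A minimal check of that arithmetic:
import numpy as np

t = np.radians(45)
clockwise = np.array([[np.cos(t), np.sin(t)],
                      [-np.sin(t), np.cos(t)]])
points = np.array([[0, 0], [1, 0], [0, 1]])
print(points @ clockwise.T)  # rows ~ [0, 0], [0.71, -0.71], [0.71, 0.71]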
def cluster_DBSCAN(data, eps=None, min_samples=None, metric="euclidean"):
    """Cluster data using DBSCAN.

    Uses the density-based spatial clustering algorithm
    `sklearn.cluster.DBSCAN <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html>`_
    to cluster the data. If not provided by the user, the distance cutoff
    `eps` is determined by the 'knee method', which finds the distance at
    which the sorted k-distance curve rises sharply.

    Parameters
    ----------
    data : ndarray, shape=(n_samples, n_dims)
        Sample data to cluster.
    eps : None or scalar, default=None
        Maximum distance between two samples for one to be considered in the
        neighborhood of the other. If None, it is set at the knee of the
        k-distance plot.
    min_samples : None or scalar, default=None
        Number of samples in a neighborhood for a point to be considered a
        core point. If None, it is chosen by maximizing the silhouette score.
    metric : string or callable, default='euclidean'
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by `sklearn.metrics.pairwise_distances` for its
        metric parameter.

    Returns
    -------
    labels : array_like, shape=(n_samples,)
    core_sample_indices : array_like, shape=(n_core_samples,)
    """
    # Degenerate case: no more samples than dimensions; one cluster of everything.
    if len(data) <= len(data[0]):
        return np.array([0 for dummy in data]), np.arange(len(data))[np.newaxis, :]
    if eps is None:
        nearest_neighbors = NearestNeighbors(n_neighbors=3)
        nearest_neighbors.fit(data)
        distances, indices = nearest_neighbors.kneighbors(data)
        distances = np.sort(distances, axis=0)[:, 1]
        data_vstacked = np.vstack([np.arange(len(distances)), distances]).T
        rotor = Rotor()
        rotor.fit_rotate(data_vstacked)
        elbow_index = rotor.get_elbow_index()
        eps = distances[elbow_index]
    if min_samples is None:
        scores = []
        for n_sample in np.arange(2, len(data) - 1, 2):
            dbscan = DBSCAN(eps=eps, min_samples=n_sample, metric=metric)
            dbscan.fit(data)
            labels = dbscan.labels_
            if np.all(labels == -1):
                break
            else:
                scores.append(silhouette_score(data, labels))
        min_samples = np.arange(
            2, len(data) - 1, 2)[np.argmax(scores)]  # the highest silhouette_score
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
    dbscan.fit(data)
    return dbscan.labels_, dbscan.core_sample_indices_
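# A minimal, hypothetical usage sketch for cluster_DBSCAN above, assuming the
# imports the function needs are in scope; min_samples is fixed here only to
# skip the (slower) silhouette search:
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors
from kneebow.rotor import Rotor

blobs, _ = make_blobs(n_samples=300, centers=3, random_state=42)
labels, core_idx = cluster_DBSCAN(blobs, min_samples=5)  # eps picked at the knee
print(np.unique(labels))  # cluster labels; -1 marks noise points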
def get_elbow_index(couplings_dict, plot_elbow=False):
    """Find the index at which the coupling strengths start to drop sharply."""
    y = list(couplings_dict.values())
    x = list(range(len(y)))
    if plot_elbow:
        plt.figure()
        plt.plot(x, y)
        plt.show()
    # Rotor's elbow detection expects an increasing curve, so reverse the
    # (descending) strengths, find the elbow, and map the index back to the
    # original ordering.
    y.reverse()
    data = np.array([[xi, yi] for xi, yi in zip(x, y)])
    rotor = Rotor()
    rotor.fit_rotate(data)
    elbow_idx = rotor.get_elbow_index()
    return len(y) - elbow_idx
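# Hypothetical usage of get_elbow_index with a toy couplings dict whose
# strengths drop off sharply after the first few entries (all values made up):
couplings = {"c1": 9.0, "c2": 8.5, "c3": 8.0, "c4": 1.2, "c5": 0.9,
             "c6": 0.7, "c7": 0.5, "c8": 0.4}
cutoff = get_elbow_index(couplings)
strong = list(couplings)[:cutoff]  # keys before the drop-off
print(cutoff, strong)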
def get_elbow_no(df, column_name='E-value'):
    # Sorting E-values
    df_evalues = df.sort_values(by=[column_name])
    # Using kneebow to find knee
    y = list(df_evalues[column_name])
    x = list(range(len(y)))
    data = np.array([[xi, yi] for xi, yi in zip(x, y)])
    rotor = Rotor()
    rotor.fit_rotate(data)
    elbow_idx = rotor.get_elbow_index()
    elbow_no = list(df_evalues['No'])[elbow_idx]
    return elbow_no
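# Hypothetical usage of get_elbow_no: a toy DataFrame with hit numbers in a
# 'No' column and BLAST-style E-values (both invented for illustration); the
# column names mirror what the function expects.
import pandas as pd

hits = pd.DataFrame({
    "No": range(1, 9),
    "E-value": [1e-50, 2e-48, 5e-47, 1e-10, 3e-8, 2e-5, 1e-3, 0.5],
})
print(get_elbow_no(hits))  # the 'No' of the hit at the E-value elbow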
def findeps(data):
    d = StandardScaler().fit_transform(data)
    d = np.nan_to_num(d)
    neighbors = NearestNeighbors(n_neighbors=2).fit(d)
    distances, indices = neighbors.kneighbors(d)
    distances = np.sort(distances, axis=0)
    distances = distances[:, 1]
    plt.plot(distances)
    plt.show()
    rotor = Rotor()
    rotor.fit_rotate(np.concatenate(
        (indices[:, 0].reshape(-1, 1), distances.reshape(-1, 1)), axis=1))
    epsx = rotor.get_elbow_index()
    eps = distances[epsx]
    return eps
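# A quick, hypothetical smoke test for findeps, assuming numpy, matplotlib,
# scikit-learn (StandardScaler, NearestNeighbors), and kneebow are imported:
from sklearn.datasets import make_moons

moons, _ = make_moons(n_samples=400, noise=0.05, random_state=0)
print(findeps(moons))  # eps at the elbow of the 2-NN distance curve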
def get_eps(X, neigh=2):
    eps_dist = np.sort(calculate_kn_distance(X, neigh=neigh))
    plt.hist(eps_dist, bins=60)
    plt.ylabel('n')
    plt.xlabel('Epsilon distance')
    plt.show()
    rotor = Rotor()
    curve_xy = np.concatenate(
        [np.arange(eps_dist.shape[0]).reshape(-1, 1), eps_dist.reshape(-1, 1)], 1)
    rotor.fit_rotate(curve_xy)
    rotor.plot_elbow()
    e_idx = rotor.get_elbow_index()
    # Returns the (index, distance) pair at the elbow; the distance is column 1.
    return curve_xy[e_idx]
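# calculate_kn_distance is not shown in this snippet. A plausible
# reconstruction (an assumption, not the original helper) returning each
# point's distance to its k-th nearest neighbor:
import numpy as np
from sklearn.neighbors import NearestNeighbors


def calculate_kn_distance(X, neigh=2):
    # +1 because each point's nearest neighbor is itself at distance 0
    nbrs = NearestNeighbors(n_neighbors=neigh + 1).fit(X)
    distances, _ = nbrs.kneighbors(X)
    return distances[:, -1]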
xyz_nn = np.vstack([xn, yn, zn]).T  # first evaluation

# Nearest neighbors to find the optimal epsilon (maximum distance)
# https://towardsdatascience.com/machine-learning-clustering-dbscan-determine-the-optimal-value-for-epsilon-eps-python-example-3100091cfbc
nbrs = NearestNeighbors(n_neighbors=5, algorithm='kd_tree').fit(xyz_nn)  # ['auto', 'ball_tree', 'kd_tree', 'brute']
distances, indices = nbrs.kneighbors(xyz_nn)  # the indices of the nearest neighbors
distances = np.sort(distances, axis=0)
distances = distances[:, 4]
plt.plot(distances)

y = np.array(distances)
x = np.linspace(0, len(y), len(y))
xy = np.vstack((x, y)).T
rotor = Rotor()
rotor.fit_rotate(xy)
elbow_idx = rotor.get_elbow_index()
rotor.plot_elbow()
eps = distances[elbow_idx] / 2
del x, y, xy

clustering = DBSCAN(algorithm='kd_tree', eps=eps, min_samples=5).fit(xyz_nn)  # the number of samples is D+1=4
labels = clustering.labels_
colors = [int(i % 23) for i in labels]  # 554 labels to 23 distinguished colors
v = pptk.viewer(data, colors)
v.set(point_size=0.01)

# matplotlib
core_samples_mask = np.zeros_like(clustering.labels_, dtype=bool)
csv_path = join(folder, 'labels.csv')
save_labels(labels, csv_path)


def dbscan_knee(X, save_path, metric):
    print('p1')
    neigh = NearestNeighbors(n_neighbors=3, metric=metric)
    neighbors = neigh.fit(X)
    print('p2')
    distances, indices = neighbors.kneighbors(X)
    distances = np.sort(distances, axis=0)
    distances = distances[:, 1]
    print('hello before Rotor')
    rotor = Rotor()
    data = np.hstack((np.arange(X.shape[0]).reshape(-1, 1), distances.reshape(-1, 1)))
    rotor.fit_rotate(data)
    eps = distances[rotor.get_elbow_index()]
    print('p3')
    plt.plot(distances)
    plt.title(f'eps={eps}')
    plt.savefig(f'{save_path}/knee_{metric}.jpg')
    plt.close()
    return eps


# DBSCAN (min_points = 3, 4, 5)
save_pref = [join(save_folder, sample[sample.rfind('/') + 1:]) for sample in samples]
distances = ['cosine', 'jaccard']
def test_detect_knee(self):
    r = Rotor()
    r.fit_rotate(sample_dec)
    self.assertAlmostEqual(r.get_knee_index(), 7, delta=1)
def test_detect_elbow(self):
    r = Rotor()
    r.fit_rotate(sample_inc)
    self.assertAlmostEqual(r.get_elbow_index(), 11, delta=1)
def test_fit_rotate_params(self):
    r = Rotor()
    r.fit_rotate(sample_inc, scale=False, theta=0.7)
    self.assertFalse(r._scale)
    self.assertEqual(r._theta, 0.7)
def test_fit_rotate(self):
    r = Rotor()
    r.fit_rotate(sample_inc)
    self.assertIsNotNone(r._data)
def test_fit_rotate_default_parameter(self):
    r = Rotor()
    r.fit_rotate(sample_inc)
    self.assertTrue(r._scale)
    self.assertIsNotNone(r._theta)
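# sample_inc and sample_dec are fixtures defined elsewhere in this test suite.
# A plausible shape for them (an assumption, not the actual fixture data):
# two-column (x, y) arrays, one slowly-then-sharply increasing (elbow) and its
# mirror image decreasing (knee). The indices asserted in the tests above
# depend on the real fixture values, which are not reproduced here.
import numpy as np

sample_inc = np.array([[i, y] for i, y in enumerate(
    [1, 2, 3, 4, 5, 6, 7, 8, 10, 13, 17, 25, 40, 65, 100])])
sample_dec = sample_inc.copy()
sample_dec[:, 1] = sample_inc[::-1, 1]  # same curve, decreasing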
# plt.show()

# The silhouette score measures how well each object lies within its cluster;
# the location of the maximum is taken as the appropriate number of clusters.
k_silhouette = np.argmax(silhouette_avg_scores) + 3

# visualization
# plt.plot(range(3, 20), silhouette_avg_scores)
# plt.title('Average silhouette method')
# plt.xlabel('Number of clusters')
# plt.ylabel('Average silhouette')
# plt.show()

# get elbow of wcss
rotor = Rotor()
rotor.fit_rotate(new_wcss)
k_wcss = rotor.get_elbow_index() + 3

# let's take the average of the two estimates for k
k = int(np.floor((k_wcss + k_silhouette) / 2))

# now let's do the same thing but with the libraries
kmeans = KMeans(n_clusters=k)
kmeans.fit(feature_cordinates[['X', 'Y']])
labels = kmeans.predict(feature_cordinates[['X', 'Y']])
centroids = kmeans.cluster_centers_

# let's save to a new json
feature_cordinates['cluster'] = labels
feature_cordinates.to_json(output_file_name)
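# new_wcss above is expected to be a two-column (k, wcss) curve, with k
# starting at 3 to match the "+3" offsets. A hedged sketch of how it might be
# built (feature_cordinates and the KMeans import are assumed from the
# surrounding script):
wcss_values = []
for n_clusters in range(3, 20):
    km = KMeans(n_clusters=n_clusters).fit(feature_cordinates[['X', 'Y']])
    wcss_values.append(km.inertia_)  # within-cluster sum of squares
new_wcss = np.column_stack((range(3, 20), wcss_values))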
distances = distances[:, 1]
distancebis = savgol_filter(distances, 151, 5)
plt.figure(0)
plt.plot(distances)
plt.figure(1)
plt.plot(distancebis)

# compute second derivative
smooth_d1 = np.gradient(distancebis)
smooth_d2 = np.gradient(np.gradient(distancebis))

rotor = Rotor()
# build the (index, smoothed distance) curve for kneebow
new = np.column_stack((np.arange(len(distancebis)), distancebis))
rotor.fit_rotate(new)
elbow_index = rotor.get_elbow_index()
print(new[elbow_index])

plt.figure(2)
plt.plot(smooth_d2)

"""
infls = np.where(np.diff(np.sign(smooth_d2)))[0]
optiepsiIndex = np.where(smooth_d2 == np.amax(smooth_d2))[0]
optiepsi = distancebis[optiepsiIndex]
print(optiepsi)
"""
def cluster_DBSCAN(data, eps=None, min_samples=None, metric="euclidean"):
    r"""Cluster data using DBSCAN.

    This function clusters the samples using the density-based cluster
    `DBSCAN <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html>`_
    provided by scikit-learn. DBSCAN finds clusters of core samples of high
    density. A sample point is a core sample if at least `min_samples` points
    are within distance :math:`\varepsilon` of it. A cluster is defined as a
    set of sample points that are mutually density-connected and
    density-reachable, i.e. there is a path
    :math:`\left\langle p_{1}, p_{2}, \ldots, p_{n}\right\rangle` where each
    :math:`p_{i+1}` is within distance :math:`\varepsilon` of :math:`p_{i}`.

    The values of `min_samples` and :math:`\varepsilon` determine the
    performance of this cluster. If None, `min_samples` is chosen
    heuristically from the dimensionality and size of the data set. If
    :math:`\varepsilon` is None, it is set as the value at the knee of the
    k-distance plot.

    Parameters
    ----------
    data : numpy.ndarray, shape=(n_samples, n_dims)
        Sample data to find clusters.
    eps : None or scalar, default=None
        The maximum distance between two samples for one to be considered as
        in the neighborhood of the other. This is not a maximum bound on the
        distances of points within a cluster. This is the most important
        DBSCAN parameter to choose appropriately for your data set and
        distance function. If None, it is set as the value at the knee of the
        k-distance plot.
    min_samples : None or scalar, default=None
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself. If
        None, it is chosen heuristically from the dimensionality and size of
        the data set.
    metric : string or callable, default='euclidean'
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by `sklearn.metrics.pairwise_distances` for its
        metric parameter.

    Returns
    -------
    labels : array_like, shape=(n_samples,)
        Cluster labels for each data point.
    core_sample_indices : array_like, shape=(n_clusters,)
        Indices of core samples, grouped per cluster.
    """
    # Degenerate case: no more samples than dimensions; one cluster of everything.
    if len(data) <= len(data[0]):
        return np.array([0 for dummy in data]), np.arange(len(data))[np.newaxis, :]
    if min_samples is None:
        if 2 * len(data[0]) > len(data):
            min_samples = np.min([len(data[0]), 4])
        elif len(data) < 1000:
            min_samples = np.min([2 * len(data[0]), len(data)])
        else:
            min_samples = np.min([5 * len(data[0]), len(data)])
    if eps is None:
        nearest_neighbors = NearestNeighbors(n_neighbors=min_samples)
        nearest_neighbors.fit(data)
        distances, indices = nearest_neighbors.kneighbors(data)
        distances = np.sort(distances, axis=0)[:, 1]
        data_vstacked = np.vstack([np.arange(len(distances)), distances]).T
        rotor = Rotor()
        rotor.fit_rotate(data_vstacked)
        elbow_index = rotor.get_elbow_index()
        eps = distances[elbow_index]
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
    dbscan.fit(data)
    # Group core-sample indices by their cluster label (noise, -1, excluded).
    core_sample_indices = [[] for label in np.unique(dbscan.labels_) if label != -1]
    for core_sample_index in dbscan.core_sample_indices_:
        core_sample_indices[dbscan.labels_[core_sample_index]].append(core_sample_index)
    return dbscan.labels_, core_sample_indices
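# Hypothetical usage of this cluster_DBSCAN variant; unlike the earlier
# version, it returns per-cluster lists of core-sample indices rather than a
# flat array:
import numpy as np
from sklearn.datasets import make_blobs

points, _ = make_blobs(n_samples=500, centers=4, random_state=7)
labels, core_by_cluster = cluster_DBSCAN(points)
for cluster_id, members in enumerate(core_by_cluster):
    print(f"cluster {cluster_id}: {len(members)} core samples")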