예제 #1
0
def findeps(data):
    """Estimate a DBSCAN ``eps`` from the elbow of the nearest-neighbour curve.

    The data is standardized, non-finite values are replaced via
    ``np.nan_to_num``, and the distance from every sample to its nearest
    other sample is computed. The sorted distances are plotted, and the
    distance at the elbow found by ``Rotor`` is returned.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        Samples to analyse.

    Returns
    -------
    eps : float
        Nearest-neighbour distance at the elbow of the sorted curve,
        measured in the standardized feature space.
    """
    scaled = StandardScaler().fit_transform(data)
    # Clean the *scaled* array. The original re-read ``data`` here, which
    # silently threw away the standardization done on the line above.
    scaled = np.nan_to_num(scaled)

    neighbors = NearestNeighbors(n_neighbors=2).fit(scaled)
    distances, _ = neighbors.kneighbors(scaled)
    # Column 0 is the query point itself; column 1 is the nearest other sample.
    distances = np.sort(distances, axis=0)[:, 1]

    plt.plot(distances)
    plt.show()

    # After sorting, the distances are no longer tied to sample indices, so
    # the x-axis of the curve is simply the rank 0..n-1.
    curve = np.column_stack((np.arange(len(distances)), distances))
    rotor = Rotor()
    rotor.fit_rotate(curve)
    return distances[rotor.get_elbow_index()]
예제 #2
0
def get_eps(X, neigh=2):
    """Find the elbow of the sorted neighbour-distance curve of ``X``.

    A 60-bin histogram of the distances is shown first, followed by the
    elbow plot produced by ``Rotor``.

    Parameters
    ----------
    X : array-like
        Data forwarded to ``calculate_kn_distance``.
    neigh : int, default=2
        Forwarded to ``calculate_kn_distance`` as ``neigh``.

    Returns
    -------
    numpy.ndarray
        The row of the ``(position, distance)`` curve at the elbow index.
    """
    sorted_dist = np.sort(calculate_kn_distance(X, neigh=neigh))

    plt.hist(sorted_dist, bins=60)
    plt.ylabel('n')
    plt.xlabel('Epsilon distance')
    plt.show()

    # Two-column curve: rank of each distance next to the distance itself.
    positions = np.arange(sorted_dist.shape[0]).reshape(-1, 1)
    curve_xy = np.hstack([positions, sorted_dist.reshape(-1, 1)])

    elbow_finder = Rotor()
    elbow_finder.fit_rotate(curve_xy)
    elbow_finder.plot_elbow()

    return curve_xy[elbow_finder.get_elbow_index()]
# The silhouette score measures how well each object lies within its cluster;
# the location of its maximum is taken as the appropriate number of clusters.
# The +3 maps the array position back to a cluster count (presumably the
# scores were computed for K = 3, 4, ... -- confirm against the code above).
k_silhouette = np.argmax(silhouette_avg_scores) + 3

# visualization
# plt.plot(range(3, 20), silhouette_avg_scores)
# plt.title('Average silhouette method')
# plt.xlabel('Number of clusters')
# plt.ylabel('Average silhouette')
# plt.show()

# Get the elbow of the WCSS curve (same +3 offset as above).
rotor = Rotor()
rotor.fit_rotate(new_wcss)
k_wcss = rotor.get_elbow_index() + 3

# Take the (floored) average of the two estimates as K.
# The sum must be parenthesized: the original `k_wcss + k_silhouette/2`
# halved only the silhouette estimate instead of averaging both.
k = int(np.floor((k_wcss + k_silhouette) / 2))

# Now do the same clustering with the library implementation.
kmeans = KMeans(n_clusters=k)
kmeans.fit(feature_cordinates[['X', 'Y']])
labels = kmeans.predict(feature_cordinates[['X', 'Y']])
centroids = kmeans.cluster_centers_

# Save the cluster assignment to a new JSON file.
feature_cordinates['cluster'] = labels
feature_cordinates.to_json(output_file_name)

# visualization
예제 #4
0
# First evaluation.
# Nearest neighbors to find the optimal epsilon (maximum distance) https://towardsdatascience.com/machine-learning-clustering-dbscan-determine-the-optimal-value-for-epsilon-eps-python-example-3100091cfbc
nbrs = NearestNeighbors(n_neighbors = 5, algorithm = 'kd_tree').fit(xyz_nn) # algorithm options: ['auto', 'ball_tree', 'kd_tree', 'brute']
distances, indices = nbrs.kneighbors(xyz_nn) # distances to, and indices of, the nearest neighbors
distances = np.sort(distances, axis=0)
distances = distances[:,4]  # last of the 5 requested neighbor columns
plt.plot(distances)

# Build the (x, y) curve handed to Rotor.
y = np.array(distances)
# The original read `len(x)` here, i.e. it used `x` inside the statement that
# first defines it; the x-axis must be sized from `y`.
x = np.linspace(0,len(y),len(y))
xy = np.vstack((x,y)).T

rotor = Rotor()
rotor.fit_rotate(xy)
elbow_idx = rotor.get_elbow_index()
rotor.plot_elbow()
# eps is half the distance found at the elbow.
eps = distances[elbow_idx]/2
del x,y,xy

# NOTE(review): an earlier comment here said "the number of samples is D+1=4",
# but min_samples is 5 -- confirm which is intended.
clustering = DBSCAN( algorithm = 'kd_tree',eps=eps, min_samples=5).fit(xyz_nn)
labels = clustering.labels_

# Fold the labels onto 23 color indices (the noise label -1 maps to 22).
colors = [int(i % 23) for i in labels] # 554 labels to 23 distinguished colors

v = pptk.viewer(data,colors)
v.set(point_size=0.01)

# matplotlib
# Boolean mask that is True at DBSCAN's core samples.
core_samples_mask = np.zeros_like(clustering.labels_, dtype=bool)
core_samples_mask[clustering.core_sample_indices_] = True
예제 #5
0
            save_labels(labels, csv_path)

def dbscan_knee(X, save_path, metric):
    """Estimate ``eps`` from the knee of the sorted nearest-neighbour distances.

    Fits a 3-neighbour search on ``X`` with the given metric, sorts the
    distance of every sample to its nearest other sample, and returns the
    distance at the elbow found by ``Rotor``. The sorted curve is saved to
    ``{save_path}/knee_{metric}.jpg`` with the chosen value in the title.

    Parameters
    ----------
    X : array-like
        Samples passed to ``NearestNeighbors``.
    save_path : str
        Directory in which the knee plot is written.
    metric : str
        Distance metric for ``NearestNeighbors``; also used in the file name.

    Returns
    -------
    eps : float
        Distance at the elbow of the sorted curve.
    """
    print('p1')
    neigh = NearestNeighbors(n_neighbors=3, metric=metric)
    neighbors = neigh.fit(X)
    print('p2')
    distances, indices = neighbors.kneighbors(X)
    distances = np.sort(distances, axis=0)
    # Column 0 is the query point itself; column 1 is the nearest other sample.
    distances = distances[:, 1]

    print('hello before Rotor')
    rotor = Rotor()
    # Size the x-axis from the distances themselves. The original used the
    # global ``df.shape[0]``, which is unrelated to this function's input.
    ranks = np.arange(len(distances)).reshape(-1, 1)
    data = np.hstack((ranks, distances.reshape(-1, 1)))
    rotor.fit_rotate(data)
    eps = distances[rotor.get_elbow_index()]
    print('p3')
    plt.plot(distances)
    plt.title(f'eps={eps}')
    plt.savefig(f'{save_path}/knee_{metric}.jpg')
    plt.close()

    return eps


# DBSCAN (min_points = 3, 4, 5)

# One output prefix per sample: save_folder joined with whatever follows the
# last '/' in the sample path (the whole string when there is no '/').
save_pref = [
    join(save_folder, sample.rsplit('/', 1)[-1])
    for sample in samples
]

# Option names used further down (presumably linkage methods and distance
# metrics -- confirm against the code that consumes them).
methods = ['average', 'ward']
distances = ['cosine', 'jaccard']
예제 #6
0
 def test_detect_elbow(self):
     """The elbow of ``sample_inc`` is found at index 11, give or take 1."""
     rotor = Rotor()
     rotor.fit_rotate(sample_inc)
     elbow_index = rotor.get_elbow_index()
     self.assertAlmostEqual(elbow_index, 11, delta=1)
예제 #7
0
def cluster_DBSCAN(data, eps=None, min_samples=None, metric="euclidean"):
    r"""Cluster data using DBSCAN.

    This function clusters the samples using a density-based cluster
    `DBSCAN <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html>`_ provided by scikit.
    DBSCAN finds clusters of core samples of high density. A sample point is a core sample if at least `min_samples`
    points are within distance :math:`\varepsilon` of it. A cluster is defined as a set of sample points that are
    mutually density-connected and density-reachable, i.e. for any two points in the cluster there is a path
    :math:`\left\langle p_{1}, p_{2}, \ldots, p_{n}\right\rangle` where each :math:`p_{i+1}` is within distance
    :math:`\varepsilon` of :math:`p_{i}`. The values of `min_samples` and :math:`\varepsilon`
    determine the performance of this cluster.

    If `min_samples` is None it is chosen from the data size (see below). If :math:`\varepsilon` is None, it is set
    to the value at the elbow of the sorted nearest-neighbor distance curve.

    Parameters
    ----------
    data : numpy.ndarray, shape=(n_samples, n_dims)
        Sample data to find clusters.

    eps : None or scalar, default=None
        The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is
        not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to
        choose appropriately for your data set and distance function. If None, it is set to the value at the elbow of
        the sorted curve of distances from each sample to its nearest other sample.

    min_samples : None or scalar, default=None
        The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This
        includes the point itself. If None, it is ``min(n_dims, 4)`` when ``2 * n_dims > n_samples``,
        ``min(2 * n_dims, n_samples)`` when ``n_samples < 1000``, and ``min(5 * n_dims, n_samples)`` otherwise.

    metric : string or callable, default='euclidean'
        The metric to use when calculating distance between instances in a feature array. If metric
        is a string or callable, it must be one of the options allowed by `sklearn.metrics.pairwise_distances`
        for its metric parameter. The same metric is used for the automatic `eps` estimate.

    Returns
    -------
    labels : array_like, shape=(n_samples,)
        Cluster labels for each data point.

    core_sample_indices : list of lists, one per cluster
        Indices of the core samples of each cluster, indexed by cluster label. In the degenerate case
        ``n_samples <= n_dims`` every sample is put in cluster 0 and an array of shape ``(1, n_samples)`` holding
        all sample indices is returned instead.

    """
    n_samples = len(data)
    n_dims = len(data[0])

    # Too few samples to cluster meaningfully: a single cluster holding everything.
    if n_samples <= n_dims:
        return (np.zeros(n_samples, dtype=int),
                np.arange(n_samples)[np.newaxis, :])

    # Only derive min_samples when the caller did not supply one; previously
    # the argument was unconditionally overwritten and therefore ignored.
    if min_samples is None:
        if 2 * n_dims > n_samples:
            min_samples = min(n_dims, 4)
        elif n_samples < 1000:
            min_samples = min(2 * n_dims, n_samples)
        else:
            min_samples = min(5 * n_dims, n_samples)

    if eps is None:
        # Column 1 of the neighbor distances is read below, so at least two
        # neighbors are needed; never ask for more neighbors than samples.
        n_neighbors = max(2, min(int(min_samples), n_samples))
        # Use the caller's metric so eps is measured in the same distance as
        # the clustering itself (it was always the default metric before).
        nearest_neighbors = NearestNeighbors(n_neighbors=n_neighbors,
                                             metric=metric)
        nearest_neighbors.fit(data)
        distances, _ = nearest_neighbors.kneighbors(data)
        # Column 0 is the query point itself; column 1 is the nearest other sample.
        distances = np.sort(distances, axis=0)[:, 1]
        curve = np.vstack([np.arange(len(distances)), distances]).T
        rotor = Rotor()
        rotor.fit_rotate(curve)
        eps = distances[rotor.get_elbow_index()]

    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
    dbscan.fit(data)
    labels = dbscan.labels_

    # One bucket per real cluster (label -1 is noise); core samples are
    # grouped into the bucket of the cluster they belong to.
    core_sample_indices = [[] for label in np.unique(labels) if label != -1]
    for core_sample_index in dbscan.core_sample_indices_:
        core_sample_indices[labels[core_sample_index]].append(
            core_sample_index)
    return labels, core_sample_indices