def test_dbscan_core_samples_toy():
    X = [[0], [2], [3], [4], [6], [8], [10]]
    n_samples = len(X)

    for algorithm in ['brute', 'kd_tree', 'ball_tree']:
        # Degenerate case: every sample is a core sample, either with its own
        # cluster or including other close core samples.
        core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
                                      min_samples=1)
        assert_array_equal(core_samples, np.arange(n_samples))
        assert_array_equal(labels, [0, 1, 1, 1, 2, 3, 4])

        # With eps=1 and min_samples=2 only the 3 samples from the denser area
        # are core samples. All other points are isolated and considered noise.
        core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
                                      min_samples=2)
        assert_array_equal(core_samples, [1, 2, 3])
        assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1])

        # Only the sample in the middle of the dense area is core. Its two
        # neighbors are edge samples. Remaining samples are noise.
        core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
                                      min_samples=3)
        assert_array_equal(core_samples, [2])
        assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1])

        # It's no longer possible to extract core samples with eps=1:
        # everything is noise.
        core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
                                      min_samples=4)
        assert_array_equal(core_samples, [])
        assert_array_equal(labels, -np.ones(n_samples))
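# For reference, a minimal self-contained sketch of the dbscan calling
# convention the tests in this file rely on: the function returns a pair of
# (core sample indices, per-sample labels), with label -1 marking noise.
# The toy data below are illustrative only, not taken from the test suite.
import numpy as np
from sklearn.cluster import dbscan

points = np.array([[0.0], [0.2], [0.4], [5.0]])
core_idx, labels = dbscan(points, eps=0.5, min_samples=2)
print(core_idx)  # indices of core samples -> [0 1 2]
print(labels)    # per-sample cluster labels, -1 is noise -> [ 0  0  0 -1]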
def test_dbscan_sparse():
    core_sparse, labels_sparse = dbscan(sparse.lil_matrix(X), eps=.8,
                                        min_samples=10)
    core_dense, labels_dense = dbscan(X, eps=.8, min_samples=10)
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
def test_boundaries():
    # ensure min_samples is inclusive of core point
    core, _ = dbscan([[0], [1]], eps=2, min_samples=2)
    assert_in(0, core)
    # ensure eps is inclusive of circumference
    core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2)
    assert_in(0, core)
    core, _ = dbscan([[0], [1], [1]], eps=.99, min_samples=2)
    assert_not_in(0, core)
def test_dbscan_sparse_precomputed():
    D = pairwise_distances(X)
    core_sparse, labels_sparse = dbscan(sparse.lil_matrix(D), eps=.8,
                                        min_samples=10,
                                        metric='precomputed')
    core_dense, labels_dense = dbscan(X, eps=.8, min_samples=10)
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
def test_dbscan_sparse_precomputed():
    D = pairwise_distances(X)
    nn = NearestNeighbors(radius=0.9).fit(X)
    D_sparse = nn.radius_neighbors_graph(mode="distance")
    # Ensure it is sparse not merely on diagonals:
    assert D_sparse.nnz < D.shape[0] * (D.shape[0] - 1)
    core_sparse, labels_sparse = dbscan(D_sparse, eps=0.8, min_samples=10,
                                        metric="precomputed")
    core_dense, labels_dense = dbscan(D, eps=0.8, min_samples=10,
                                      metric="precomputed")
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
def test_dbscan_input_not_modified(use_sparse, metric):
    # test that the input is not modified by dbscan
    X = np.random.RandomState(0).rand(10, 10)
    X = sparse.csr_matrix(X) if use_sparse else X
    X_copy = X.copy()
    dbscan(X, metric=metric)

    if use_sparse:
        assert_array_equal(X.toarray(), X_copy.toarray())
    else:
        assert_array_equal(X, X_copy)
def test_dbscan_sparse_precomputed_different_eps():
    # test that precomputed neighbors graph is filtered if computed with
    # a radius larger than DBSCAN's eps.
    lower_eps = 0.2
    nn = NearestNeighbors(radius=lower_eps).fit(X)
    D_sparse = nn.radius_neighbors_graph(X, mode='distance')
    dbscan_lower = dbscan(D_sparse, eps=lower_eps, metric='precomputed')

    higher_eps = lower_eps + 0.7
    nn = NearestNeighbors(radius=higher_eps).fit(X)
    D_sparse = nn.radius_neighbors_graph(X, mode='distance')
    dbscan_higher = dbscan(D_sparse, eps=lower_eps, metric='precomputed')

    assert_array_equal(dbscan_lower[0], dbscan_higher[0])
    assert_array_equal(dbscan_lower[1], dbscan_higher[1])
def test_dbscan_balltree():
    # Tests the DBSCAN algorithm with balltree for neighbor calculation.
    eps = 0.8
    min_samples = 10

    D = pairwise_distances(X)
    core_samples, labels = dbscan(D, metric="precomputed", eps=eps,
                                  min_samples=min_samples)

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm="ball_tree")
    labels = db.fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)

    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm="kd_tree")
    labels = db.fit(X).labels_
    n_clusters_3 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_3, n_clusters)

    db = DBSCAN(p=1.0, eps=eps, min_samples=min_samples, algorithm="ball_tree")
    labels = db.fit(X).labels_
    n_clusters_4 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_4, n_clusters)

    db = DBSCAN(leaf_size=20, eps=eps, min_samples=min_samples,
                algorithm="ball_tree")
    labels = db.fit(X).labels_
    n_clusters_5 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_5, n_clusters)
def angle_predict(hits, m, i, rz_shift, eps, weights):
    aa = hits.a + m * (hits.r + 0.000005 * (hits.r**2)) / 1000 * (i / 2) / 180 * 3.141
    hits['f0'] = np.sin(aa)
    hits['f1'] = np.cos(aa)

    hits_b = hits[hits.type == 'b']['hit_id']
    hits_c = hits[hits.type == 'c']['hit_id']

    ss = StandardScaler()
    X = ss.fit_transform(np.column_stack([hits.f0.values, hits.f1.values,
                                          hits.z1.values, hits.z2.values,
                                          hits.xr.values, hits.yr.values]))

    X_b = np.multiply(np.vstack([X[ex - 1] for ex in hits_b.values]), weights[0])
    X_c = np.multiply(np.vstack([X[ex - 1] for ex in hits_c.values]), weights[1])

    Xw = np.zeros(X.shape)
    Xw[hits_b.values - 1] = X_b[range(len(hits_b.values))]
    Xw[hits_c.values - 1] = X_c[range(len(hits_c.values))]

    eps = eps + (i * 0.000005)
    _, labels = dbscan(Xw, eps=eps, min_samples=1, algorithm='auto', n_jobs=4)

    unique, reverse, count = np.unique(labels, return_counts=True,
                                       return_inverse=True)
    c = count[reverse]
    c[np.where(labels == 0)] = 0
    if abs(rz_shift) < 0.1:
        c[np.where(c > 20)] = 0
    else:
        c[np.where(c > 8)] = 0
    return (labels, c)
def get_features(sub, cluster_size=10):
    """
    Input:  dataframe with the hits of long tracks
    Output: dataframe with aggregated features for each long track
    """
    hitst = sub.copy()
    X = np.column_stack([
        hitst.x.values, hitst.y.values, hitst.z.values,
        hitst.track_id.values * 1000000
    ])
    _, hitst['labels'] = dbscan(X, eps=cluster_size, min_samples=1,
                                algorithm='ball_tree', metric='euclidean')
    gp = hitst.groupby('track_id').agg({
        'hit_id': 'count',
        'labels': 'nunique',
        'volume_id': 'min',
        'x': ['min', 'max', 'var'],
        'y': ['min', 'max', 'var'],
        'z': ['min', 'max', 'var', 'mean']
    })
    gp.columns = ["".join(t) for t in gp.columns.ravel()]
    gp = gp.rename(
        columns={
            'hit_idcount': 'nhits',
            'labelsnunique': 'nclusters',
            'volume_idmin': 'svolume'
        }).reset_index()
    gp['nhitspercluster'] = gp.nhits / gp.nclusters
    return gp
def test_dbscan_callable():
    # Tests the DBSCAN algorithm with a callable metric.
    # Parameters chosen specifically for this task.
    # Different eps to other test, because distance is not normalised.
    eps = 0.8
    min_samples = 10

    # metric is the function reference, not the string key.
    metric = distance.euclidean

    # Compute DBSCAN
    # parameters chosen for task
    core_samples, labels = dbscan(X, metric=metric, eps=eps,
                                  min_samples=min_samples,
                                  algorithm='ball_tree')

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples,
                algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
def predict(self, dfh):
    print("len(dfh) : {0}".format(len(dfh)))
    if "rt" not in dfh.columns:
        dfh["rt"] = np.sqrt(dfh['x'].values**2 + dfh['y'].values**2)
    z = dfh['z'].values
    rt = dfh["rt"].values
    r = np.sqrt(dfh['x']**2 + dfh['y']**2 + dfh['z']**2)
    a0 = np.arctan2(dfh['y'].values, dfh['x'].values)
    layer_id = dfh['layer_id'].values.astype(np.float32)

    sys.stderr.write("dbscan for each parameters\n")
    scan_labels_list = []
    for (dj, di) in tqdm(product(self.djs, self.dis),
                         total=len(self.djs) * len(self.dis)):
        ar = a0 + di * rt
        zr = (z + dj) / rt * 0.1
        if self.param_type == 0:
            params = [ar, zr]
        elif self.param_type == 1:
            params = [np.sin(ar), np.cos(ar), zr * 10, 1 / (10 * zr)]
        elif self.param_type == 2:
            params = [np.sin(ar), np.cos(ar), zr]
        else:
            raise RuntimeError("invalid param_type.")

        if self.weight is None:
            w = np.array([1.0 for _ in params])
        else:
            w = np.array(self.weight)

        ss = StandardScaler()
        data1 = ss.fit_transform(np.column_stack(params))
        data2 = w[np.newaxis, :] * data1
        _, scan_label = dbscan(data2, eps=self.eps, min_samples=1)
        scan_labels_list.append(scan_label)

    sys.stderr.write("clustering\n")
    dfh["s1"] = dfh.hit_id
    dfh["N1"] = 1
    for scan_labels in scan_labels_list:
        dfh["s2"] = scan_labels
        dfh["N2"] = dfh.groupby('s2')['s2'].transform('count')
        maxs1 = np.max(dfh.s1)
        dfh.s1 = np.where((dfh.N2 > dfh.N1) & (dfh.N2 < 20),
                          dfh.s2 + maxs1, dfh.s1)
        dfh['s1'] = dfh['s1'].astype('int64')
        dfh['N1'] = dfh.groupby('s1')['s1'].transform('count')
    labels = dfh['s1']
    return labels
def predict(self, hits, weights):
    x = hits.x.values
    y = hits.y.values
    z = self.rz_scale * hits.z.values
    r = np.sqrt(x**2 + y**2)
    d = np.sqrt(x**2 + y**2 + z**2)
    a = np.arctan2(y, x)
    zr = z / r
    dr = d / r
    hits['d'] = d

    w0, w1, w2, w3, w4 = weights
    ss = StandardScaler()

    results = []
    dzi = -0.00010
    for step in [11]:  # range(21): #0.00060/121/-60
        dz = dzi + (step * 0.00001)
        f0 = w0 * (a + (dz * z * np.sign(z)))
        f1 = w1 * (zr)
        f2 = w2 * (f0 / zr)
        f3 = w3 * (1 / zr)
        f4 = w4 * (f2 + f3)
        X = ss.fit_transform(np.column_stack([f0, f1, f2, f3, f4]))

        eps = self.eps - (abs(step - 10) * 0.000015)
        _, labels = dbscan(X, eps=eps, min_samples=1,
                           algorithm='auto', n_jobs=4)

        unique, reverse, count = np.unique(labels, return_counts=True,
                                           return_inverse=True)
        c = count[reverse]
        c[np.where(labels == 0)] = 0
        c[np.where(c > 20)] = 0
        results.append((labels, c))

    labels, counts = results[0]
    for i in range(1, len(results)):
        l, c = results[i]
        idx = np.where((c - counts > 0))[0]
        labels[idx] = l[idx] + labels.max()
        counts[idx] = c[idx]
    return labels
def find_labels(params):
    hits, dz = params
    a = hits['phi'].values
    z = hits['z'].values
    zr = hits['zr'].values
    aa = a + np.sign(z) * dz * z
    f0 = np.cos(aa)
    f1 = np.sin(aa)
    f2 = zr
    X = StandardScaler().fit_transform(np.column_stack([f0, f1, f2]))
    _, l = dbscan(X, eps=0.0045, min_samples=1, n_jobs=4)
    return l + 1
def seed_tracks(event_id, df, start_d, end_d, ax):
    seed = df.loc[df.d > start_d]
    seed = seed.loc[seed.d < end_d]
    N = len(seed)

    p = seed[['particle_id']].values.astype(np.int64)
    x, y, z, r, a, cosa, sina, phi = seed[[
        'x', 'y', 'z', 'r', 'a', 'cosa', 'sina', 'phi'
    ]].values.astype(np.float32).T

    particle_ids = np.unique(p)
    particle_ids = particle_ids[particle_ids != 0]
    num_particle_ids = len(particle_ids)

    # do dbscan here =======================================
    data = np.column_stack([a, z / r * 0.1])
    _, l = dbscan(data, eps=0.01, min_samples=1)

    #print(len(truth))
    #print(len(seed))
    #print(len(submission))
    #print(len(l))
    seed['l'] = pd.Series(l, index=seed.index)
    #print(seed)

    submission = pd.DataFrame(
        columns=['event_id', 'hit_id', 'track_id'],
        data=np.column_stack(([int(event_id)] * len(seed),
                              seed.hit_id.values, l))).astype(int)
    score = score_event_fast(seed, submission)
    print(score)

    predicted_tracks, counts = np.unique(l, return_counts=True)
    predicted_tracks = predicted_tracks[counts > 1]
    for predicted_track in predicted_tracks[::100]:
        track_hits = seed[seed.l == predicted_track]
        ax.plot(xs=track_hits.a, ys=track_hits.r, zs=track_hits.z)
def get_base_partitioning(distance_matrix, eps=0.5, min_samples=2):
    """
    Gets the base partitioning from the distance matrix using the DBSCAN
    algorithm.

    Args:
        distance_matrix: a list of lists with the distances between references.
        eps: DBSCAN neighborhood radius applied to the precomputed distances.
        min_samples: minimum number of neighbors for a point to be a core point.

    Returns:
        A tuple with a list of integers from 0 to k - 1, each one representing
        the block assigned to the reference at that index, and the number of
        clusters.
    """
    labels = dbscan(np.array(distance_matrix), metric='precomputed',
                    eps=eps, min_samples=min_samples)
    # DBSCAN marks outliers with -1; promote each of them to its own block.
    next_label = max(labels[1]) + 1
    for i in range(len(labels[1])):
        if labels[1][i] == -1:
            labels[1][i] = next_label
            next_label += 1
    return labels[1].tolist(), number_of_clusters(labels[1])
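# A hedged usage sketch for get_base_partitioning above. The distance matrix
# and parameters are made up, and number_of_clusters is a trivial stand-in for
# the project's own helper, defined here purely for illustration; it assumes
# the function above and its imports (numpy as np, sklearn.cluster.dbscan)
# are in scope.
def number_of_clusters(labels):
    # hypothetical stand-in for the project's real helper
    return len(set(int(x) for x in labels))

distances = [[0.0, 0.1, 0.9],
             [0.1, 0.0, 0.8],
             [0.9, 0.8, 0.0]]
blocks, k = get_base_partitioning(distances, eps=0.3, min_samples=2)
print(blocks, k)  # expected: [0, 0, 1] and 2 -- the outlier becomes its own block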
def test_dbscan_similarity():
    # Tests the DBSCAN algorithm with a similarity array.
    # Parameters chosen specifically for this task.
    eps = 0.15
    min_samples = 10
    # Compute similarities
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)
    # Compute DBSCAN
    core_samples, labels = dbscan(D, metric="precomputed", eps=eps,
                                  min_samples=min_samples)
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric="precomputed", eps=eps, min_samples=min_samples)
    labels = db.fit(D).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
def test_dbscan_feature():
    # Tests the DBSCAN algorithm with a feature vector array.
    # Parameters chosen specifically for this task.
    # Different eps to other test, because distance is not normalised.
    eps = 0.8
    min_samples = 10
    metric = "euclidean"
    # Compute DBSCAN
    # parameters chosen for task
    core_samples, labels = dbscan(X, metric=metric, eps=eps,
                                  min_samples=min_samples)

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples)
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
def img_association(network, propagate_loader, min_sample=4, eps=0,
                    rerank=False, k1=20, k2=6, intra_id_reinitialize=False):
    network.eval()
    print('Start Inference...')
    features = []
    global_labels = []
    all_cams = []

    with torch.no_grad():
        for c, data in enumerate(propagate_loader):
            images = data[0]
            g_label = data[3]
            cam = data[4]

            embed_feat = network(images)
            features.append(embed_feat.cpu())
            global_labels.append(g_label)
            all_cams.append(cam)

    features = torch.cat(features, dim=0).numpy()
    global_labels = torch.cat(global_labels, dim=0).numpy()
    all_cams = torch.cat(all_cams, dim=0).numpy()
    print(' features: shape= {}'.format(features.shape))

    # if needed, average camera-style transferred image features
    new_features = []
    new_cams = []
    for glab in np.unique(global_labels):
        idx = np.where(global_labels == glab)[0]
        new_features.append(np.mean(features[idx], axis=0))
        new_cams.append(all_cams[idx])

    new_features = np.array(new_features)
    new_cams = np.array(new_cams).squeeze()
    del features, all_cams

    # compute distance W
    new_features = new_features / np.linalg.norm(new_features, axis=1,
                                                 keepdims=True)  # l2-normalize
    if rerank:
        W = faiss_compute_jaccard_dist(torch.from_numpy(new_features),
                                       k1=k1, k2=k2)
    else:
        W = cdist(new_features, new_features, 'euclidean')
    print(' distance matrix: shape= {}'.format(W.shape))

    # self-similarity for association
    print(' perform image grouping...')
    _, updated_label = dbscan(W, eps=eps, min_samples=min_sample,
                              metric='precomputed', n_jobs=8)
    print(' eps in cluster: {:.3f}'.format(eps))
    print(' updated_label: num_class= {}, {}/{} images are associated.'
          .format(updated_label.max() + 1,
                  len(updated_label[updated_label >= 0]),
                  len(updated_label)))

    if intra_id_reinitialize:
        print('re-computing initialized intra-ID feature...')
        intra_id_features = []
        intra_id_labels = []
        for cc in np.unique(new_cams):
            percam_ind = np.where(new_cams == cc)[0]
            percam_feature = new_features[percam_ind, :]
            percam_label = updated_label[percam_ind]
            percam_class_num = len(np.unique(percam_label[percam_label >= 0]))
            percam_id_feature = np.zeros((percam_class_num,
                                          percam_feature.shape[1]),
                                         dtype=np.float32)
            cnt = 0
            for lbl in np.unique(percam_label):
                if lbl >= 0:
                    ind = np.where(percam_label == lbl)[0]
                    id_feat = np.mean(percam_feature[ind], axis=0)
                    percam_id_feature[cnt, :] = id_feat
                    intra_id_labels.append(lbl)
                    cnt += 1
            percam_id_feature = percam_id_feature / np.linalg.norm(
                percam_id_feature, axis=1, keepdims=True)
            intra_id_features.append(torch.from_numpy(percam_id_feature))
        return updated_label, intra_id_features
def copac(X, k=10, mu=5, eps=0.5, alpha=0.85, metric='euclidean',
          metric_params=None, algorithm='auto', leaf_size=30, p=None,
          n_jobs=1, sample_weight=None):
    """Perform COPAC clustering from vector array.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        A feature array.

    k : int, optional, default=10
        Size of local neighborhood for local correlation dimensionality.
        The paper suggests k >= 3 * n_features.

    mu : int, optional, default=5
        Minimum number of points in a cluster, with mu <= k.

    eps : float, optional, default=0.5
        Neighborhood predicate, so that neighbors are closer than `eps`.

    alpha : float in ]0,1[, optional, default=0.85
        Threshold of how much variance needs to be explained by Eigenvalues.
        Assumed to be robust in range 0.8 <= alpha <= 0.9 [see Ref.]

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by sklearn.metrics.pairwise.pairwise_distances
        for its metric parameter. If metric is "precomputed", `X` is assumed
        to be a distance matrix and must be square.

    metric_params : dict, optional
        Additional keyword arguments for the metric function.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        The algorithm to be used by the scikit-learn NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required to
        store the tree. The optimal value depends on the nature of the
        problem.

    p : float, optional
        The power of the Minkowski metric to be used to calculate distance
        between points.

    n_jobs : int, optional, default=1
        Number of parallel processes. Use all cores with n_jobs=-1.

    sample_weight : None
        Currently ignored.

    Returns
    -------
    labels : array [n_samples]
        Cluster labels for each point. Noisy samples are given the label -1.

    References
    ----------
    Achtert, E., Boehm, C., Kriegel, H.-P., Kroeger, P., Zimek, A.:
    Robust, complete, and efficient correlation clustering.
    In: Proceedings of the Seventh SIAM International Conference on Data
    Mining, April 26-28, 2007, Minneapolis, Minnesota, USA, pp. 413-418 (2007).
    """
    X = check_array(X)
    n, d = X.shape
    y = -np.ones(n, dtype=int)
    if n_jobs == -1:
        n_jobs = cpu_count()

    # Calculating M^ just once requires more memory, but saves computation
    lambda_ = np.zeros(n, dtype=int)
    M_hat = list()

    # Get nearest neighbors
    nn = NearestNeighbors(n_neighbors=k, metric=metric, algorithm=algorithm,
                          leaf_size=leaf_size, metric_params=metric_params,
                          p=p, n_jobs=n_jobs)
    nn.fit(X)
    knns = nn.kneighbors(return_distance=False)
    for P, knn in enumerate(knns):
        N_P = X[knn]
        # Correlation cluster covariance matrix
        Sigma = np.cov(N_P[:, :], rowvar=False, ddof=0)
        # Decompose spsd matrix, and sort Eigenvalues descending
        E, V = LA.eigh(Sigma)
        E = np.sort(E)[::-1]
        # Local correlation dimension
        explanation_portion = np.cumsum(E) / E.sum()
        lambda_P = np.searchsorted(explanation_portion, alpha, side='left')
        lambda_P += 1
        lambda_[P] = lambda_P
        # Correlation distance matrix
        E_hat = (np.arange(1, d + 1) > lambda_P).astype(int)
        M_hat.append(V @ np.diag(E_hat) @ V.T)

    # Group points by corr. dim.
    argsorted = np.argsort(lambda_)
    edges, _ = np.histogram(lambda_[argsorted], bins=np.arange(1, d + 2))
    Ds = np.split(argsorted, np.cumsum(edges))

    # Loop over partitions according to local corr. dim.
    max_label = 0
    used_y = np.zeros_like(y, dtype=int)
    for D in Ds:
        n_D = D.shape[0]
        cdist_P = -np.ones(n_D * (n_D - 1) // 2, dtype=float)
        cdist_Q = -np.ones((n_D, n_D), dtype=float)
        start = 0
        # Calculate triu part of distance matrix
        for i in range(0, n_D - 1):
            p = D[i]
            # Vectorized inner loop
            q = D[i + 1:n_D]
            stop = start + n_D - i - 1
            cdist_P[start:stop] = _cdist(X[p], X[q], M_hat[p])
            start = stop
        # Calculate tril part of distance matrix
        for i in range(1, n_D):
            q = D[i]
            p = D[0:i]
            cdist_Q[i, :i] = _cdist(X[q], X[p], M_hat[q])
        # Extract tril to 1D array
        # TODO simplify...
        cdist_Q = cdist_Q.T[np.triu_indices_from(cdist_Q, k=1)]
        cdist = np.block([[cdist_P], [cdist_Q]])
        # Square root of the higher value of cdist_P, cdist_Q
        cdist = np.sqrt(cdist.max(axis=0))
        # Perform DBSCAN with full distance matrix
        cdist = squareform(cdist)
        clust = dbscan(X=cdist, eps=eps, min_samples=mu,
                       metric='precomputed', n_jobs=n_jobs)
        _, labels = clust
        # Each DBSCAN run is unaware of previous ones,
        # so we need to keep track of previously assigned cluster IDs
        y_D = labels + max_label
        new_labels = np.unique(labels[labels >= 0]).size
        max_label += new_labels
        # Set cluster labels in `y`
        y[D] = y_D
        used_y[D] += 1
    assert np.all(used_y == 1), "Not all samples were handled exactly once!"
    return y
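# A hedged usage sketch of copac() above on synthetic data. The data shapes
# and parameter values are illustrative assumptions only, and the function's
# helpers (_cdist, check_array, NearestNeighbors, LA, squareform, dbscan,
# cpu_count) are expected to be in scope as in the original module.
import numpy as np

rng = np.random.RandomState(0)
# A noisy line (local correlation dimension ~1) plus a noisy plane (~2) in 3-D.
line = np.column_stack([np.linspace(0, 10, 100)] * 3) + 0.05 * rng.randn(100, 3)
plane = np.column_stack([rng.rand(100), rng.rand(100), 0.05 * rng.randn(100)])
X_demo = np.vstack([line, plane])

labels = copac(X_demo, k=30, mu=5, eps=0.5, alpha=0.85, n_jobs=1)
print(np.unique(labels))  # cluster IDs found; -1 marks noise per the docstring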
def test_weighted_dbscan():
    # ensure sample_weight is validated
    assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2])
    assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2, 3, 4])

    # ensure sample_weight has an effect
    assert_array_equal([], dbscan([[0], [1]], sample_weight=None,
                                  min_samples=6)[0])
    assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5],
                                  min_samples=6)[0])
    assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5],
                                   min_samples=6)[0])
    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 6],
                                      min_samples=6)[0])

    # points within eps of each other:
    assert_array_equal([0, 1], dbscan([[0], [1]], eps=1.5,
                                      sample_weight=[5, 1], min_samples=6)[0])
    # and effect of non-positive and non-integer sample_weight:
    assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 0],
                                  eps=1.5, min_samples=6)[0])
    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1],
                                      eps=1.5, min_samples=6)[0])
    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 0],
                                      eps=1.5, min_samples=6)[0])
    assert_array_equal([], dbscan([[0], [1]], sample_weight=[6, -1],
                                  eps=1.5, min_samples=6)[0])

    # for non-negative sample_weight, cores should be identical to repetition
    rng = np.random.RandomState(42)
    sample_weight = rng.randint(0, 5, X.shape[0])
    core1, label1 = dbscan(X, sample_weight=sample_weight)
    assert_equal(len(label1), len(X))

    X_repeated = np.repeat(X, sample_weight, axis=0)
    core_repeated, label_repeated = dbscan(X_repeated)
    core_repeated_mask = np.zeros(X_repeated.shape[0], dtype=bool)
    core_repeated_mask[core_repeated] = True
    core_mask = np.zeros(X.shape[0], dtype=bool)
    core_mask[core1] = True
    assert_array_equal(np.repeat(core_mask, sample_weight), core_repeated_mask)

    # sample_weight should work with precomputed distance matrix
    D = pairwise_distances(X)
    core3, label3 = dbscan(D, sample_weight=sample_weight,
                           metric='precomputed')
    assert_array_equal(core1, core3)
    assert_array_equal(label1, label3)

    # sample_weight should work with estimator
    est = DBSCAN().fit(X, sample_weight=sample_weight)
    core4 = est.core_sample_indices_
    label4 = est.labels_
    assert_array_equal(core1, core4)
    assert_array_equal(label1, label4)

    est = DBSCAN()
    label5 = est.fit_predict(X, sample_weight=sample_weight)
    core5 = est.core_sample_indices_
    assert_array_equal(core1, core5)
    assert_array_equal(label1, label5)
    assert_array_equal(label1, est.labels_)
def study_dbscan_for_tracklet_seeding():
    ## load an event ---
    event_id = '000001029'
    data_dir = '/root/share/project/kaggle/cern/data/__download__/train_100_events'

    #detectors = pd.read_csv('/root/share/project/kaggle/cern/data/__download__/detectors.csv')
    particles = pd.read_csv(data_dir + '/event%s-particles.csv' % event_id)
    hits = pd.read_csv(data_dir + '/event%s-hits.csv' % event_id)
    truth = pd.read_csv(data_dir + '/event%s-truth.csv' % event_id)
    #cells = pd.read_csv(data_dir + '/event%s-cells.csv'%event_id)

    truth = truth.merge(hits, on=['hit_id'], how='left')
    truth = truth.merge(particles, on=['particle_id'], how='left')

    #--------------------------------------------------------
    df = truth.copy()
    df = df.assign(r=np.sqrt(df.x**2 + df.y**2))
    df = df.assign(d=np.sqrt(df.x**2 + df.y**2 + df.z**2))
    df = df.assign(a=np.arctan2(df.y, df.x))
    df = df.assign(cosa=np.cos(df.a))
    df = df.assign(sina=np.sin(df.a))
    df = df.assign(phi=np.arctan2(df.z, df.r))
    df = df.assign(momentum=np.sqrt(df.px**2 + df.py**2 + df.pz**2))
    df.loc[df.particle_id == 0, 'momentum'] = 0

    # consider dataset subset
    #df = df.loc[df.z>500]
    #df = df.loc[df.r<50 ]                      ## 0.04397/0.04750 (0.92569)
    df = df.loc[df.z > 500]
    df = df.loc[(df.r > 50) & (df.r < 100)]     ## 0.05259/0.05808 (0.90551)
    #df = df.loc[df.z>500]
    #df = df.loc[df.r<100]                      ## 0.09417/0.10557 (0.89195)
    #df = df.loc[(df.a>0) & (df.a<0.5)]
    #df = df.loc[(df.a>0) & (df.a<1)]
    # df = df.loc[df.z>500]   # consider dataset subset
    # df = df.loc[(df.r>50) & (df.r<100)]
    #df = df.loc[(df.z>0) &(df.z<500)]
    #df = df.loc[df.r<200 ]
    #df = df.loc[(df.a>0) & (df.a<0.5)]
    #df = df.loc[(df.z>df.r)]
    #df = df.loc[(df.r>50) & (df.r<100) ]

    #-------------------------------------------------------
    N = len(df)
    layer_id = df['layer_id'].values.astype(np.float32)
    momentum = df['momentum'].values.astype(np.float32)
    p = df[['particle_id']].values.astype(np.int64)
    x, y, z, r, a, cosa, sina, phi = df[[
        'x', 'y', 'z', 'r', 'a', 'cosa', 'sina', 'phi'
    ]].values.astype(np.float32).T

    particle_ids = np.unique(p)
    particle_ids = particle_ids[particle_ids != 0]
    num_particle_ids = len(particle_ids)

    # do xxx =======================================
    #color = plt.cm.hsv( (z-z.min()) / (z.max()-z.min()))
    color = plt.cm.hsv((layer_id - layer_id.min()) /
                       (layer_id.max() + 1 - layer_id.min()))
    plot3d_particles(ax3d1, particle_ids, p, a, r, z, z)
    ax3d1.scatter(a, r, z, c=color, s=64, edgecolors='none')
    #plt.show()

    dj = 0
    di = 0
    EPS = 1e-12

    #if 1:
    candidates = []
    for dj in np.arange(-20, 20 + EPS, 10):
        for di in np.arange(-0.003, 0.003 + EPS, 0.00025):
            ar = a + di * r
            zr = (z + dj) / r * 0.1
            data2 = np.column_stack([ar, zr])
            _, l = dbscan(data2, eps=0.0025, min_samples=1)

            track_ids = np.unique(l)
            track_ids = track_ids[track_ids != 0]
            neighbour = [np.where(l == t)[0] for t in track_ids]

            unique, inverse, c = np.unique(l, return_counts=True,
                                           return_inverse=True)
            unique = unique[unique != 0]
            c = c[inverse]
            c[l == 0] = 0

            for u in unique:
                candidate = np.where(l == u)[0]
                candidates.append(candidate)

            #---
            #<todo>
            #fix angle discontinuity problem here ...

    #-----
    #sort
    count = np.array([len(candidate) for candidate in candidates])
    sort = np.argsort(-count)
    candidates = [candidates[s] for s in sort]

    #show
    max_label = 1
    label = np.zeros(N, np.int32)
    count = np.zeros(N, np.int32)
    for candidate in candidates:
        n = candidate
        L = len(n)
        #print(L)

        #---- filtering (secret sauce) ----------
        #if L<3: continue
        n = n[np.argsort(np.fabs(z[n]))]
        layer_id0 = layer_id[n[:-1]]
        layer_id1 = layer_id[n[1:]]
        ld = layer_id1 - layer_id0
        if np.any(ld > 2):
            continue

        m = count[n].max()
        if L < m:
            continue
        #---- filtering ----------------------

        count[n] = L
        label[n] = max_label
        max_label += 1

        ## show:
        if L >= 3:
            #c = np.random.uniform(0,1,3)#[0,0,0]
            c = [0, 0, 0]
            #ax3d1.clear()
            #plot_particles(ax3d1, particle_ids, p, a,r,zr, z)
            #ax3d1.scatter(ar, r, zr, c=color, s=64, edgecolors='none')
            ax3d1.plot(a[n], r[n], z[n], '.-', color=c,
                       markersize=5, linewidth=1)
            #ax3d1.plot(a[[n[0],n[-1]]],r[[n[0],n[-1]]],zr[[n[0],n[-1]]],'-', color=[1,0,0], markersize=5, linewidth=1)
            #plt.pause(0.01)
            #plt.waitforbuttonpress(-1)

    #plt.show()

    ##-###################################################################################
    submission = pd.DataFrame(
        columns=['event_id', 'hit_id', 'track_id'],
        data=np.column_stack(([int(event_id)] * len(df),
                              df.hit_id.values, label))).astype(int)
    score1 = score_event(df, submission)
    score2, results = cpmp_fast_score(df, submission)

    #print results
    max_score = df.weight.sum()
    print('max_score = df.weight.sum() = %0.5f' % max_score)
    print('score1= %0.5f (%0.5f)' % (score1 * max_score, score1))
    print('score2= %0.5f (%0.5f)' % (score2, score2 / max_score))

    plt.show()
    print('end')
    exit(0)
def study_dbscan_for_tracklet_seeding():
    ## load an event ---
    event_id = '000001029'
    path_to_train = "data/train_1"

    particles = pd.read_csv(path_to_train + '/event%s-particles.csv' % event_id)
    hits = pd.read_csv(path_to_train + '/event%s-hits.csv' % event_id)
    truth = pd.read_csv(path_to_train + '/event%s-truth.csv' % event_id)
    #cells = pd.read_csv(path_to_train + '/event%s-cells.csv'%event_id)

    truth = truth.merge(hits, on=['hit_id'], how='left')
    truth = truth.merge(particles, on=['particle_id'], how='left')

    #--------------------------------------------------------
    df = truth.copy()
    df = df.assign(r=np.sqrt(df.x**2 + df.y**2))
    df = df.assign(d=np.sqrt(df.x**2 + df.y**2 + df.z**2))
    df = df.assign(a=np.arctan2(df.y, df.x))
    df = df.assign(cosa=np.cos(df.a))
    df = df.assign(sina=np.sin(df.a))
    df = df.assign(phi=np.arctan2(df.z, df.r))
    df = df.assign(momentum=np.sqrt(df.px**2 + df.py**2 + df.pz**2))
    df.loc[df.particle_id == 0, 'momentum'] = 0

    df = df.loc[df.z > 500]  # consider dataset subset
    df = df.loc[df.r < 50]
    N = len(df)

    #-------------------------------------------------------
    momentum = df[['momentum']].values.astype(np.float32)
    p = df[['particle_id']].values.astype(np.int64)
    x, y, z, r, a, cosa, sina, phi = df[[
        'x', 'y', 'z', 'r', 'a', 'cosa', 'sina', 'phi'
    ]].values.astype(np.float32).T

    particle_ids = np.unique(p)
    particle_ids = particle_ids[particle_ids != 0]
    num_particle_ids = len(particle_ids)

    # do dbscan here =======================================
    data = np.column_stack([a, z / r * 0.1])
    _, l = dbscan(data, eps=0.01, min_samples=1)

    submission = pd.DataFrame(
        columns=['event_id', 'hit_id', 'track_id'],
        data=np.column_stack(([int(event_id)] * len(df),
                              df.hit_id.values, l))).astype(int)

    #score1 = score_event(df, submission)
    #print(df)
    #print(submission)
    score2, results = cpmp_fast_score(df, submission)

    #print results
    #max_score = df.weight.sum()
    #print('max_score = df.weight.sum() = %0.5f'%max_score)
    #print('score1= %0.5f (%0.5f)'%(score1*max_score,score1))
    #print('score2= %0.5f (%0.5f)'%(score2,score2/max_score))

    ## analyse the results here =============================
    d0, d1 = data.T
    track_ids = np.unique(l)
    track_ids = track_ids[track_ids != 0]
    num_track_ids = len(track_ids)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    fig.patch.set_facecolor('white')

    fig1 = plt.figure(figsize=(8, 8))
    ax1 = fig1.add_subplot(111)
    ax1 = Axes3D(fig1)
    fig1.patch.set_facecolor('white')

    def show_ax():
        ax1.set_xlabel('a', fontsize=16)
        ax1.set_ylabel('r', fontsize=16)
        ax1.set_zlabel('z', fontsize=16)
        ax.set_xlabel('a', fontsize=16)
        ax.set_ylabel('z/r', fontsize=16)
        # ax.grid()
        # ax.set_aspect('equal', 'box')
        plt.show()

    ## 0. show data:
    if False:
        ax.clear()
        ax1.clear()
        ax.plot(d0, d1, '.', color=[0.75, 0.75, 0.75], markersize=3, linewidth=0)
        ax1.plot(a, r, z, '.', color=[0.75, 0.75, 0.75], markersize=3, linewidth=0)
        show_ax()

    ## 1. show GT:
    if True:
        ax.clear()
        ax1.clear()
        ax.plot(d0, d1, '.', color=[0.75, 0.75, 0.75], markersize=3, linewidth=0)
        ax1.plot(a, r, z, '.', color=[0.75, 0.75, 0.75], markersize=3, linewidth=0)
        ax.set_title('Ground truth')
        ax1.set_title('Ground truth')
        ax1.set_xlabel('a', fontsize=16)
        ax1.set_ylabel('r', fontsize=16)
        ax1.set_zlabel('z', fontsize=16)
        ax.set_xlabel('a', fontsize=16)
        ax.set_ylabel('z/r', fontsize=16)

        for n in range(0, num_particle_ids, 1):
            particle_id = particle_ids[n]
            t = np.where(p == particle_id)[0]
            #if momentum[t[0]]<min_momentum: continue
            t = t[np.argsort(np.fabs(z[t]))]

            if np.fabs(a[t[0]] - a[t[-1]]) > 1:
                continue
            d = ((x[t[0]] - x[t[-1]])**2 + (y[t[0]] - y[t[-1]])**2 +
                 (z[t[0]] - z[t[-1]])**2)**0.5
            if d < 10:
                continue

            ###print(n, particle_id)
            color = np.random.uniform(0, 1, (3))
            #ax.clear()
            #ax1.clear()
            ax.plot(data[t, 0], data[t, 1], '.', color=color,
                    markersize=5, linewidth=0)
            ax1.plot(a[t], r[t], z[t], '.-', color=color,
                     markersize=5, linewidth=1)
            #ax1.plot(a[h],r[h], z[h], 'o', color=[0,0,0], markersize=8, linewidth=1, mfc='none')
            #ax1.view_init(0, (ax_n*3)%360)
            #ax_n += 1
            #fig1.savefig('/root/share/project/kaggle/cern/results/yy/%05d.png'%ax_n)
            #plt.pause(0.01)
            #plt.waitforbuttonpress(-1)
        #show_ax()

    ## 2. show dbscan prediction:
    if True:
        fig_ = plt.figure(figsize=(8, 8))
        ax_ = fig_.add_subplot(111)
        fig_.patch.set_facecolor('white')
        fig1_ = plt.figure(figsize=(8, 8))
        ax1_ = fig1_.add_subplot(111, projection='3d')
        fig1_.patch.set_facecolor('white')

        ax_.clear()
        ax1_.clear()
        ax_.plot(d0, d1, '.', color=[0.75, 0.75, 0.75], markersize=3, linewidth=0)
        ax1_.plot(a, r, z, '.', color=[0.75, 0.75, 0.75], markersize=3, linewidth=0)
        ax.set_title('DBSCAN Prediction')
        ax1.set_title('DBSCAN Prediction')
        ax1_.set_xlabel('a', fontsize=16)
        ax1_.set_ylabel('r', fontsize=16)
        ax1_.set_zlabel('z', fontsize=16)
        ax_.set_xlabel('a', fontsize=16)
        ax_.set_ylabel('z/r', fontsize=16)

        for n in range(0, num_track_ids, 1):
            track_id = track_ids[n]
            t = np.where(l == track_id)[0]
            #if momentum[t[0]]<min_momentum: continue
            t = t[np.argsort(np.fabs(z[t]))]

            if np.fabs(a[t[0]] - a[t[-1]]) > 1:
                continue
            d = ((x[t[0]] - x[t[-1]])**2 + (y[t[0]] - y[t[-1]])**2 +
                 (z[t[0]] - z[t[-1]])**2)**0.5
            if d < 10:
                continue

            ###print(n, track_id)
            color = np.random.uniform(0, 1, (3))
            #ax.clear()
            #ax1.clear()
            ax_.plot(data[t, 0], data[t, 1], '.', color=color,
                     markersize=5, linewidth=0)
            ax1_.plot(a[t], r[t], z[t], '.-', color=color,
                      markersize=5, linewidth=1)
            #ax1.plot(a[h],r[h], z[h], 'o', color=[0,0,0], markersize=8, linewidth=1, mfc='none')
            #ax1.view_init(0, (ax_n*3)%360)
            #ax_n += 1
            #fig1.savefig('/root/share/project/kaggle/cern/results/yy/%05d.png'%ax_n)
            #plt.pause(0.01)
            #plt.waitforbuttonpress(-1)
        #show_ax()
    #plt.show()

    ################################################################################################
    # analysis ...  ## <to be updated> ...
    results = results.assign(
        detected=(results.count_both > results.count_particle) &
                 (results.count_both > results.count_track))
    detected = results.loc[results.detected == True]
    missed = results.loc[(results.detected == False) &
                         (results.count_track < results.count_particle * 0.5)]
    fp = results.loc[(results.detected == False) &
                     (results.count_track > results.count_particle * 0.5)]

    detected = np.unique(detected.particle_id.values)
    missed = np.unique(missed.particle_id.values)
    fp = np.unique(fp.track_id.values)
    detected = detected[detected != 0]
    missed = missed[missed != 0]
    fp = fp[fp != 0]
    num_detected = len(detected)
    num_missed = len(missed)
    num_fp = len(fp)

    #shows detected tracks
    for (p, q) in [(p, detected), (p, missed), (l, fp)]:
        fig_ = plt.figure(figsize=(8, 8))
        ax_ = fig_.add_subplot(111)
        fig_.patch.set_facecolor('white')
        fig1_ = plt.figure(figsize=(8, 8))
        ax1_ = fig1_.add_subplot(111, projection='3d')
        fig1_.patch.set_facecolor('white')

        ax_.clear()
        ax1_.clear()
        ax_.plot(d0, d1, '.', color=[0.75, 0.75, 0.75], markersize=3, linewidth=0)
        ax1_.plot(a, r, z, '.', color=[0.75, 0.75, 0.75], markersize=3, linewidth=0)
        ax1_.set_xlabel('a', fontsize=16)
        ax1_.set_ylabel('r', fontsize=16)
        ax1_.set_zlabel('z', fontsize=16)
        ax_.set_xlabel('a', fontsize=16)
        ax_.set_ylabel('z/r', fontsize=16)

        for n in range(0, len(q), 1):
            t = np.where(p == q[n])[0]
            #if momentum[t[0]]<min_momentum: continue
            t = t[np.argsort(np.fabs(z[t]))]

            if np.fabs(a[t[0]] - a[t[-1]]) > 1:
                continue
            d = ((x[t[0]] - x[t[-1]])**2 + (y[t[0]] - y[t[-1]])**2 +
                 (z[t[0]] - z[t[-1]])**2)**0.5
            if d < 10:
                continue

            ##print(n, track_id)
            color = np.random.uniform(0, 1, (3))
            #ax.clear()
            #ax1.clear()
            ax_.plot(data[t, 0], data[t, 1], '.', color=color,
                     markersize=5, linewidth=0)
            ax1_.plot(a[t], r[t], z[t], '.-', color=color,
                      markersize=5, linewidth=1)
            #plt.pause(0.01)

        plt.show()

    zz = 0
    exit(0)
def make_data(a, zr, z, my_layer_id, p,
              # a_limit=(1.0,3.0), zr_limit=(4.0,7.0),
              a_limit=(1.0, 2.0), zr_limit=(4.0, 5.0),
              depth=6):
    a0, a1 = a_limit
    zr0, zr1 = zr_limit

    idx = np.where((a >= a0) & (a < a1) & (zr >= zr0) & (zr < zr1))[0]
    aa, zzr, zz = a[idx], zr[idx], z[idx] / 1000
    ll = my_layer_id[idx]
    pp = p[idx]
    data3 = np.column_stack((aa, zzr, zz))
    L = len(data3)

    pairs = []
    for d in range(depth - 1):
        i0 = np.where(ll == d)[0]
        i1 = np.where(ll == d + 1)[0]
        L0 = len(i0)
        L1 = len(i1)
        if L0 == 0:
            continue
        if L1 == 0:
            continue

        q0 = data3[i0]
        q1 = data3[i1]
        qq0 = np.repeat(q0.reshape(L0, 1, 3), L1, axis=1).reshape(-1, 3)
        qq1 = np.repeat(q1.reshape(1, L1, 3), L0, axis=0).reshape(-1, 3)
        ii0 = np.repeat(i0.reshape(L0, 1), L1, axis=1).reshape(-1, 1)
        ii1 = np.repeat(i1.reshape(1, L1), L0, axis=0).reshape(-1, 1)

        unit = qq1 - qq0
        unit = unit / np.sqrt((unit**2).sum(1, keepdims=True))

        ii = np.zeros((L0 * L1, 1), np.int32)
        pair = np.concatenate((ii0, ii1, ii, qq0, qq1, unit), 1)
        pairs.append(pair)

    P = len(pairs)
    M = 0
    for p in pairs:
        dM = len(p)
        p[:, 2] = np.arange(M, M + dM)
        M += dM

    distance = np.full((M, M), 100, np.float32)  # INF
    for d in range(P - 1):
        for a in pairs[d]:
            ai0, ai1, ai = a[:3].astype(np.int32)
            ap, aq, aunit = np.split(a[3:], 3)
            if (np.fabs(aunit[0]) > 0.25) | (np.fabs(aunit[1]) > 0.25):
                continue

            b = pairs[d + 1]
            i = (np.where((b[:, 0] == ai1)))[0]
            bi = (b[:, 2][i]).astype(np.int32)
            dis = np.sqrt(((b[:, -3:][i] - aunit)**2).sum(1))
            distance[ai, bi] = dis

    print('dbscan')
    _, l = dbscan(distance, eps=0.080, min_samples=1, metric='precomputed')

    cluster_id = np.unique(l + 1)
    #cluster_id = cluster_id[cluster_id!=0]
    num_cluster_id = len(cluster_id)

    ## draw clustering results -----------------------------------
    print('draw clustering results')
    pairs_flat = np.vstack(pairs)

    AX3d1.clear()
    AX3d1.scatter(aa, zzr, zz, c=plt.cm.gnuplot(ll / depth), s=16,
                  edgecolors='none')
    plot3d_particles(AX3d1, aa, zzr, zz, zz, pp, subsample=1,
                     color=[0, 0, 0], linewidth=4)

    for id in cluster_id:
        #AX3d1.clear()
        #AX3d1.scatter(aa, zzr, zz, c=plt.cm.gnuplot( ll/depth ), s=16, edgecolors='none')
        t = np.where(l == id)
        t0 = pairs_flat[t, 0].astype(np.int32).reshape(-1)
        t1 = pairs_flat[t, 1].astype(np.int32).reshape(-1)
        t = np.unique(np.concatenate((t0, t1)))
        #if len(t0)<3: continue

        color = np.random.uniform(0, 1, 3)
        #AX3d1.plot(aa[t0], zzr[t0], zz[t0],'.-', color=color, markersize=15)
        #AX3d1.plot(aa[t1], zzr[t1], zz[t1],'.-', color=color, markersize=15)
        AX3d1.plot(aa[t], zzr[t], zz[t], '.-', color=color, markersize=15)
        #plt.pause(0.01)
        #plt.waitforbuttonpress(-1)

    plt.show()
    return 0
def test_dbscan_badargs(args):
    # Test bad argument values: these should all raise ValueErrors
    with pytest.raises(ValueError):
        dbscan(X, **args)
def predict(self, dfh):
    print("size(dfh): {0}".format(len(dfh)))
    if "rt" not in dfh.columns:
        dfh["rt"] = np.sqrt(dfh['x'].values**2 + dfh['y'].values**2)
    z = dfh['z'].values
    rt = dfh["rt"].values
    a0 = np.arctan2(dfh['y'].values, dfh['x'].values)
    layer_id = dfh['layer_id'].values.astype(np.float32)

    sys.stderr.write("dbscan for each (z,a) shifting\n")
    scan_labels = []
    for (dj, di) in tqdm(product(self.djs, self.dis),
                         total=len(self.djs) * len(self.dis)):
        ar = a0 + di * rt
        zr = (z + dj) / rt * 0.1
        data2 = np.column_stack([ar, zr])
        _, scan_label = dbscan(data2, eps=0.0025, min_samples=1)
        scan_labels.append(scan_label)

    sys.stderr.write("make candidates\n")
    candidates = []
    for scan_label in tqdm(scan_labels):
        l = scan_label
        unique = np.unique(l)
        for u in unique:
            candidate = np.where(l == u)[0]
            candidates.append(candidate)
    print("# of candidates : {0}".format(len(candidates)))

    count = np.array([len(candidate) for candidate in candidates])
    sort = np.argsort(-count)
    candidates = [candidates[s] for s in sort]

    max_label = 1
    N = len(dfh)
    label = np.zeros(N, np.int32)
    count = np.zeros(N, np.int32)

    sys.stderr.write("calculate clustering label from candidates\n")
    for candidate in tqdm(candidates):
        n = candidate
        L = len(n)
        n = n[np.argsort(np.fabs(z[n]))]
        layer_id0 = layer_id[n[:-1]]
        layer_id1 = layer_id[n[1:]]
        ld = layer_id1 - layer_id0
        if np.any(ld > 2):
            continue
        m = count[n].max()
        if L < m:
            continue
        count[n] = L
        label[n] = max_label
        max_label += 1
    return label