def execute(self, namespace):
    from sklearn.cluster import dbscan
    inp = namespace[self.inputName]
    mapped = tabular.mappingFilter(inp)

    # Note that sklearn gives unclustered points a label of -1, and the first cluster label is 0.
    try:
        core_samp, dbLabels = dbscan(np.vstack([inp[k] for k in self.columns]).T,
                                     self.searchRadius, self.minClumpSize,
                                     n_jobs=self.numberOfJobs)
        multiproc = True
    except TypeError:
        # older sklearn versions do not accept the n_jobs keyword
        core_samp, dbLabels = dbscan(np.vstack([inp[k] for k in self.columns]).T,
                                     self.searchRadius, self.minClumpSize)
        multiproc = False

    if multiproc:
        logger.info('using dbscan multiproc version')
    else:
        logger.info('falling back to dbscan single-threaded version')

    # shift dbscan labels up by one to match the existing convention that a clumpID of 0 corresponds to unclumped
    mapped.addColumn('dbscanClumpID', dbLabels + 1)

    # propagate metadata, if present
    try:
        mapped.mdh = inp.mdh
    except AttributeError:
        pass

    namespace[self.outputName] = mapped
def execute(self, namespace):
    from sklearn.cluster import dbscan
    inp = namespace[self.inputName]
    mapped = tabular.MappingFilter(inp)

    # Note that sklearn gives unclustered points a label of -1, and the first cluster label is 0.
    if self.multithreaded:
        core_samp, dbLabels = dbscan(np.vstack([inp[k] for k in self.columns]).T,
                                     self.searchRadius, self.minClumpSize,
                                     n_jobs=self.numberOfJobs)
    else:
        # NB: try/except from Christian's multithreaded example removed, as failures should surface here
        core_samp, dbLabels = dbscan(np.vstack([inp[k] for k in self.columns]).T,
                                     self.searchRadius, self.minClumpSize)

    # shift dbscan labels up by one to match the existing convention that a clumpID of 0 corresponds to unclumped
    mapped.addColumn(str(self.clumpColumnName), dbLabels + 1)

    # propagate metadata, if present
    try:
        mapped.mdh = inp.mdh
    except AttributeError:
        pass

    namespace[self.outputName] = mapped
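# For reference, a minimal standalone sketch of the functional dbscan() call
# pattern the two execute() methods above wrap. The point data and parameter
# values here are made up for illustration; n_jobs parallelises the
# neighbour queries on sklearn versions that support it.
import numpy as np
from sklearn.cluster import dbscan

points = np.random.rand(1000, 2) * 100  # hypothetical 2D localisations

core_samples, labels = dbscan(points, eps=5.0, min_samples=10, n_jobs=-1)

# -1 marks noise; shifting by one gives the "0 == unclumped" convention used above
clump_ids = labels + 1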
def test_dbscan_sparse():
    core_sparse, labels_sparse = dbscan(sparse.lil_matrix(X), eps=.8,
                                        min_samples=10)
    core_dense, labels_dense = dbscan(X, eps=.8, min_samples=10)
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
def test_dbscan_core_samples_toy(algorithm):
    X = [[0], [2], [3], [4], [6], [8], [10]]
    n_samples = len(X)

    # Degenerate case: every sample is a core sample, either with its own
    # cluster or including other close core samples.
    core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=1)
    assert_array_equal(core_samples, np.arange(n_samples))
    assert_array_equal(labels, [0, 1, 1, 1, 2, 3, 4])

    # With eps=1 and min_samples=2 only the 3 samples from the denser area
    # are core samples. All other points are isolated and considered noise.
    core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=2)
    assert_array_equal(core_samples, [1, 2, 3])
    assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1])

    # Only the sample in the middle of the dense area is core. Its two
    # neighbors are edge samples. Remaining samples are noise.
    core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=3)
    assert_array_equal(core_samples, [2])
    assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1])

    # It's no longer possible to extract core samples with eps=1:
    # everything is noise.
    core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=4)
    assert_array_equal(core_samples, [])
    assert_array_equal(labels, np.full(n_samples, -1.))
def test_boundaries():
    # ensure min_samples is inclusive of core point
    core, _ = dbscan([[0], [1]], eps=2, min_samples=2)
    assert 0 in core
    # ensure eps is inclusive of circumference
    core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2)
    assert 0 in core
    core, _ = dbscan([[0], [1], [1]], eps=.99, min_samples=2)
    assert 0 not in core
def test_dbscan_input_not_modified(use_sparse, metric):
    # test that the input is not modified by dbscan
    X = np.random.RandomState(0).rand(10, 10)
    X = sparse.csr_matrix(X) if use_sparse else X
    X_copy = X.copy()
    dbscan(X, metric=metric)

    if use_sparse:
        assert_array_equal(X.toarray(), X_copy.toarray())
    else:
        assert_array_equal(X, X_copy)
def test_dbscan_sparse_precomputed(include_self):
    D = pairwise_distances(X)
    nn = NearestNeighbors(radius=0.9).fit(X)
    X_ = X if include_self else None
    D_sparse = nn.radius_neighbors_graph(X=X_, mode="distance")
    # Ensure it is sparse not merely on diagonals:
    assert D_sparse.nnz < D.shape[0] * (D.shape[0] - 1)
    core_sparse, labels_sparse = dbscan(
        D_sparse, eps=0.8, min_samples=10, metric="precomputed"
    )
    core_dense, labels_dense = dbscan(
        D, eps=0.8, min_samples=10, metric="precomputed"
    )
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
def test_dbscan_sparse_precomputed_different_eps():
    # test that precomputed neighbors graph is filtered if computed with
    # a radius larger than DBSCAN's eps.
    lower_eps = 0.2
    nn = NearestNeighbors(radius=lower_eps).fit(X)
    D_sparse = nn.radius_neighbors_graph(X, mode='distance')
    dbscan_lower = dbscan(D_sparse, eps=lower_eps, metric='precomputed')

    higher_eps = lower_eps + 0.7
    nn = NearestNeighbors(radius=higher_eps).fit(X)
    D_sparse = nn.radius_neighbors_graph(X, mode='distance')
    dbscan_higher = dbscan(D_sparse, eps=lower_eps, metric='precomputed')

    assert_array_equal(dbscan_lower[0], dbscan_higher[0])
    assert_array_equal(dbscan_lower[1], dbscan_higher[1])
def run_dbscan(geodata: pd.DataFrame, eps: int, minpts: int) -> pd.DataFrame:
    """Run the DBSCAN algorithm using the user-supplied epsilon (in metres)."""
    kms_per_radian = 6371.0088
    epsilon = (eps / 1000) / kms_per_radian  # metres -> km -> radians
    minsamples = minpts
    radians = np.radians(geodata[['x_coordinate', 'y_coordinate']])

    # DBSCAN
    preds = dbscan(radians, eps=epsilon, min_samples=minsamples,
                   algorithm='ball_tree', metric='haversine')[1]
    dbscan_coords = np.append(radians, preds.reshape(-1, 1), axis=1)
    pd.DataFrame(dbscan_coords).plot(x=1, y=0, kind="scatter", c=2,
                                     colorbar=True,
                                     title="DBSCAN (eps=%dm, min_points=%d)" % (eps, minpts),
                                     marker="+", colormap="tab20b")
    geodata['Cluster'] = pd.DataFrame(dbscan_coords)[2]
    return geodata
def get_cluster_assignments(radius_meter: float, min_measures: int,
                            coordinates: List[List[float]], weights):
    km_radian = 6371.0088  # Conversion: kilometers per radian (mean Earth radius)
    epsilon = radius_meter / 1000 / km_radian
    # return DBSCAN(metric='haversine', algorithm='ball_tree', eps=epsilon,
    #               min_samples=min_measures).fit(radians(coordinates)).labels_
    return dbscan(radians(coordinates), eps=epsilon, min_samples=min_measures,
                  metric='haversine', algorithm='ball_tree',
                  sample_weight=weights)[1]  # dbscan returns (core_samples, labels)
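# A quick sanity check of the metres -> radians conversion used above, as a
# self-contained sketch. The coordinates are invented: two tight (lat, lon)
# groups, each a couple of metres across and far from each other.
import numpy as np
from sklearn.cluster import dbscan

coords = np.array([
    [52.52000, 13.40500], [52.52001, 13.40501],  # group A (degrees)
    [48.85660, 2.35220], [48.85661, 2.35221],    # group B (degrees)
])

radius_m = 50.0
eps = radius_m / 1000 / 6371.0088  # metres -> km -> radians on the unit sphere

_, labels = dbscan(np.radians(coords), eps=eps, min_samples=2,
                   metric='haversine', algorithm='ball_tree')
print(labels)  # expect two clusters: [0 0 1 1]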
def smooth_eye_position(eye_position, threshold=2):
    x_pos, y_pos = eye_position[:, 0], eye_position[:, 1]
    X = np.stack((x_pos, y_pos, np.linspace(0, len(x_pos) / 2, len(x_pos)))).T
    clusters = cluster.dbscan(X, eps=threshold, min_samples=3,
                              metric='minkowski', p=2)
    move_events = np.where(clusters[1][1:] > clusters[1][:-1])[0] + 1
    len_chunks = [move_events[0]] + list(move_events[1:] - move_events[:-1])
    len_chunks.append(len(x_pos) - move_events[-1])
    eye_x_positions = np.split(x_pos, move_events)
    eye_y_positions = np.split(y_pos, move_events)
    mean_x_pos = np.array(list(map(np.mean, eye_x_positions)))
    mean_y_pos = np.array(list(map(np.mean, eye_y_positions)))
    x_pos_smooth = np.concatenate(
        [[x_pos] * len_chunk for x_pos, len_chunk in zip(mean_x_pos, len_chunks)])
    y_pos_smooth = np.concatenate(
        [[y_pos] * len_chunk for y_pos, len_chunk in zip(mean_y_pos, len_chunks)])
    return np.stack((x_pos_smooth, y_pos_smooth)).T
def homog_lev_series(obj, eps=eps, min_samples=min_samples):
    name = obj.name
    original = obj.copy()
    obj = obj.drop_duplicates()
    data = obj.tolist()

    def lev_metric(x, y):
        i, j = int(x[0]), int(y[0])
        return levenshtein(data[i], data[j])

    X = np.arange(len(data)).reshape(-1, 1)
    labels = dbscan(X, metric=lev_metric, eps=eps, min_samples=min_samples)[1]
    x = pd.DataFrame({'A': obj.reset_index(drop=True),
                      'B': pd.Series(labels)})
    y = x.drop_duplicates('B')
    y = y[~(y.B == -1)]
    y.columns = ['C', 'B']
    x = x.merge(y, on='B', how='left')
    x['C'] = np.where(x.C.isnull(), x.A, x.C)
    results = pd.DataFrame({'A': original})
    results = results.merge(x[['A', 'C']], on='A', how='left')
    out = results.C.rename(name)
    return out
def run_dbscan(geodata: pd.DataFrame) -> pd.DataFrame:
    kms_per_radian = 6371.0088
    epsilon = 0.015 / kms_per_radian
    minsamples = 5
    radians = np.radians(geodata[['x_coordinate', 'y_coordinate']])

    # DBSCAN
    preds = dbscan(radians, eps=epsilon, min_samples=minsamples,
                   algorithm='ball_tree', metric='haversine')[1]
    dbscan_coords = np.append(radians, preds.reshape(-1, 1), axis=1)
    pd.DataFrame(dbscan_coords).plot(x=1, y=0, kind="scatter", c=2,
                                     colorbar=True,
                                     title="DBSCAN (eps=15m, min_points=5)",
                                     marker="+", colormap="tab20b")
    geodata['Cluster'] = pd.DataFrame(dbscan_coords)[2]
    return geodata
def categoryAnalysis(count=1000, dbScanCount=100, buckets=100, startEps=100, samples=3):
    matrix, counter = loadMatrixPickle(count, buckets)
    sys.exit()  # NB: early exit left in by the author; everything below is unreachable

    # run DBSCAN
    start = time.time()
    db = dbscan(matrix.matrix[:dbScanCount], eps=startEps, algorithm='kd_tree',
                min_samples=samples, n_jobs=-1)
    print("DBScan: eps= %.3f, min_samples=%d, %d clusters generated, %.2f%% noise, %d articles"
          % (startEps, samples, len(set(db[1])),
             100.0 * list(db[1]).count(-1) / count, len(matrix.matrix)))
    print("DBScan time: %.2fs" % (time.time() - start))

    start = time.time()
    totalDb = fastCluster(dbScanCount, db, matrix.matrix)
    print("fastCluster time: %.2fs" % (time.time() - start))

    start = time.time()
    clusterCategoryCounter, totalCategoryCounter = getCategoryAppearanceRates(
        matrix, totalDb, counter)
    findHighCategoryRates(clusterCategoryCounter, totalCategoryCounter,
                          totalDb, count, counter)
    print("Category analysing time: %.2fs" % (time.time() - start))
def split_eye_events(eye_tracking, eps=2):
    """
    Split the record where the eye moves. Detection is done by clustering
    on the X position and time of the eye.

    params:
        - eye_tracking: eye tracking array of the ellipse fit, in shape
          (t, (x, y, width, height, angle))
        - eps: distance used to detect eye movements. Adjust this parameter
          if results are not satisfying

    return:
        - move_indexes, blink_indexes, noise_indexes
    """
    x_pos = np.array(eye_tracking[:, 0])
    X = np.stack((x_pos, np.linspace(0, len(x_pos), len(x_pos)) * .5)).T
    clusters = cluster.dbscan(X, eps=eps, min_samples=5,
                              metric='minkowski', p=2)
    move_indexes = np.where(clusters[1][1:] > clusters[1][:-1])[0] + 1
    noise_indexes = np.where(clusters[1] == -1)[0]
    blink_indexes = np.where(x_pos == 0)[0]
    return move_indexes, blink_indexes, noise_indexes
def post_process_y(y, eps=0.06, min_samples=2):
    # no postprocessing
    if min_samples > y.shape[1]:
        return y
    for i in range(y.shape[0]):
        row = y[i]
        row = row.reshape(-1, 1)
        _, labels = dbscan(row, eps=eps, min_samples=min_samples,
                           metric='euclidean')
        minusonecluster = sum(labels == -1)
        clusters = len(np.unique(labels))
        print(row, labels)
        # one of the unique clusters is -1;
        # these are clusters by themselves
        # and we do not touch them (we are
        # confident about the result)
        if minusonecluster > 0:
            clusters = clusters - 1
        for j in range(clusters):
            indices = (labels == j)
            # we are not sure about the
            # ranking of the elements of
            # this cluster, so we prefer to
            # assign them equal probability
            row[indices] = np.mean(row[indices])
        row = row.reshape(1, -1)
        y[i] = row
    return y
def size_hist(parts, params, eps=1.0, sp=0):
    """
    Finds clusters in the list of particles.

    Parameters
    ----------
    parts
        a list of particle objects to be clustered
    params
        a dict of configuration values
    eps
        the separation distance to use for identifying clusters
    sp
        the specie to identify clusters in
    """
    # Extract the position vectors
    D = np.zeros((len(parts), 3))
    ps = 0
    for p in range(len(parts)):
        # Check if the particle is of the desired specie
        if sp == 0 or parts[p].sp == sp:
            D[ps] = parts[p].x
            ps += 1

    # Truncate zeros if we didn't cluster everything
    D = D[:ps]

    [core, labels] = dbscan(D, eps=eps, min_samples=1)
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print('Found', n_clusters_, 'clusters')

    # Sizes of each cluster
    cluster_sizes = np.bincount(labels)
    # Number of clusters for given size
    size_hist = np.bincount(cluster_sizes)
    return size_hist[1:]
def get_dbscan_clusters_mask(mask, N, coins_hsv):
    # Generate data points for clustering
    data_points = np.empty(shape=(N, 3))
    for i in range(0, N):
        local_mask = np.where(mask == i, 255, 0)
        local_mask = local_mask.astype(np.uint8)
        # HSV gives better results than Lab here
        (m1, m2, m3, _) = cv2.mean(coins_hsv, local_mask)
        data_points[i] = [m1, m2, m3]

    core_samples, labels = dbscan(data_points, eps=16.5, min_samples=1)

    for x in range(0, mask.shape[0]):
        for y in range(0, mask.shape[1]):
            label = labels[mask[x, y]]
            if label == -1:  # noise is background
                label = 0
            mask[x, y] = label

    mask = np.where(mask == 0, 0, 255)
    mask = mask.astype(np.uint8)
    print("There are {} clusters".format(np.max(labels)))
    return mask
def dbscan_labels(pointcloud, epsilon, minpoints, rgb_weight=0,
                  algorithm="ball_tree"):
    """
    Find an array of point-labels of clusters found by the DBSCAN algorithm.

    Parameters
    ----------
    pointcloud : pcl.PointCloud
        Input pointcloud.
    epsilon : float
        Neighborhood radius for DBSCAN.
    minpoints : integer
        Minimum neighborhood density for DBSCAN.
    rgb_weight : float, optional
        If non-zero, cluster on color information as well as location;
        specifies the relative weight of the RGB components to spatial
        coordinates in distance computations. (RGB values have wildly
        different scales than spatial coordinates.)

    Returns
    -------
    labels : Sequence
        A sequence of labels per point. Label -1 indicates a point does not
        belong to any cluster; other labels indicate the cluster number a
        point belongs to.
    """
    if rgb_weight > 0:
        X = pointcloud.to_array()
        X[:, 3:] *= rgb_weight
    else:
        X = pointcloud

    _, labels = dbscan(X, eps=epsilon, min_samples=minpoints,
                       algorithm=algorithm)
    return np.asarray(labels)
def dbscan(threshold, matrix, taxa, revert=False, min_samples=1):
    """
    Compute DBSCAN cluster analysis.
    """
    if not taxa:
        taxa = list(range(1, len(matrix) + 1))

    core_samples, labels = cluster.dbscan(matrix, eps=threshold,
                                          min_samples=min_samples,
                                          metric='precomputed')

    # change to our internal cluster style
    idx = max(labels) + 1
    if idx == 0:
        idx += 1
    for i, c in enumerate(labels):
        if c == -1:
            labels[i] = idx
            idx += 1

    # check for revert
    if revert:
        return dict(zip(range(len(taxa)), labels))

    # return stuff
    clr = {}
    for i, t in enumerate(taxa):
        try:
            clr[labels[i]] += [t]
        except KeyError:
            clr[labels[i]] = [t]  # was `clusters[i]`, an undefined name
    return clr
def step(self, state, a, o):
    """
    state should be the new agent internal state after taking action a and
    observing observation o. state should contain the probability
    distribution of the next observation.
    """
    try:
        if state.type() == 'torch.FloatTensor':
            state = state.detach().numpy()
    except AttributeError:
        pass
    if self.h and len(self.s) > self.h_len:
        self.h = self.h[1:]
        self.s = self.s[1:]
        self.s_labels = self.s_labels[1:]
    self.s = np.append(self.s, [state], axis=0)
    self.h.append([a, o])
    self.s_labels = dbscan([state.flatten() for state in self.s])[1]
    self.observed = np.zeros((max(self.s_labels) + 1, self.a, self.o))
    for i in range(max(self.s_labels) + 1):
        state_indices = [j for j in range(len(self.s)) if self.s_labels[j] == i]
        clust_mean = np.mean([self.s[j] for j in state_indices], axis=0)
        self.observed[i] = clust_mean
    self.update_act()
    self.update_exp()
    self.calc_chisquare()
def test_dbscan_callable():
    # Tests the DBSCAN algorithm with a callable metric.
    # Parameters chosen specifically for this task.
    # Different eps to other test, because distance is not normalised.
    eps = 0.8
    min_samples = 10
    # metric is the function reference, not the string key.
    metric = distance.euclidean

    # Compute DBSCAN
    # parameters chosen for task
    core_samples, labels = dbscan(X, metric=metric, eps=eps,
                                  min_samples=min_samples,
                                  algorithm='ball_tree')

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_1 == n_clusters

    db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples,
                algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_2 == n_clusters
def do_clustering(self, coeffs):
    """
    Do DBSCAN clustering on the corrections.

    :param coeffs: Triplet of distance coefficients, corresponding to the
        sensitivity of the clustering to point separation along 1) the x-axis
        (time), 2) the y-axis (correction) and 3) the slope (drift rate)
    :type coeffs: tuple(float, float, float)
    :return: Results of sklearn.cluster.dbscan (refer to third-party
        documentation)
    """
    sec_per_week = 7 * 24 * 3600

    def _temporalDist2DSlope(p0, p1, coeffs):
        return math.sqrt((coeffs[0] * (p1[0] - p0[0]))**2 +
                         (coeffs[1] * sec_per_week * (p1[1] - p0[1]))**2 +
                         (coeffs[2] * sec_per_week * sec_per_week * (p1[2] - p0[2]))**2)

    data = np.column_stack((self.correction_times_clean,
                            self.corrections_clean,
                            self.corrections_slope))
    ind, ids = dbscan(data, eps=2 * sec_per_week, min_samples=7,
                      metric=lambda p0, p1: _temporalDist2DSlope(p0, p1, coeffs))
    return ind, ids
def segment_by_dbscan(binary_img: np.ndarray, eps: float = 5.0,
                      min_samples: int = 10) -> List[np.ndarray]:
    """Use DBSCAN clustering to segment binary image.

    Parameters
    ----------
    binary_img: np.ndarray
        binary image, a 2D array containing 0s and 1s (obtained by
        thresholding the original image converted to grayscale).
    eps: float
        the epsilon parameter of DBSCAN.
    min_samples: int
        minimum number of pixels each cluster (object) must contain in order
        to be considered a valid object.

    Returns
    -------
    list
        List of coordinate arrays where the n-th entry is the array of
        positions of the pixels belonging to the n-th segmented object.
    """
    indices = np.nonzero(binary_img)
    if len(indices[0]) == 0:
        return []
    xy = np.vstack((indices[1], indices[0])).T
    core, labels = cluster.dbscan(xy, eps=eps, min_samples=min_samples,
                                  metric='euclidean', algorithm='auto')
    unique_labels = set(labels)
    unique_labels.discard(-1)  # -1 is the noise label
    return [xy[labels == label] for label in sorted(unique_labels)]
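# A minimal usage sketch for segment_by_dbscan above, on a synthetic binary
# image with two well-separated square blobs (the image contents are made up
# for illustration).
import numpy as np

img = np.zeros((50, 50), dtype=np.uint8)
img[5:15, 5:15] = 1    # first 10x10 blob
img[30:40, 30:40] = 1  # second 10x10 blob

objects = segment_by_dbscan(img, eps=5.0, min_samples=10)
print(len(objects))      # expect 2 segmented objects
print(objects[0].shape)  # (100, 2): (x, y) coordinates of the first blob's pixels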
def cluster(self, x: List[T]) -> List[int]:
    # NB(nkansal96): The selection of `eps` here is arbitrary and most likely
    # wrong. Until this is fixed, you should use `AffinityCluster`.
    _, mapping = dbscan(self.get_distance_matrix(x), eps=2, min_samples=0,
                        metric="precomputed")
    return mapping
def get_clusters(X, sent_coll, eps=DEFAULT_EPS):
    db = dbscan(X, eps=eps, min_samples=3, metric='cosine',
                algorithm='brute')[1]
    d = defaultdict(list)
    for i in range(len(db)):
        d[db[i]].append(sent_coll[i])
    return d
def get_clusters(X, sent_coll, eps=DEFAULT_EPS, min_samp=3):
    from scipy.optimize import minimize_scalar
    new_eps = minimize_scalar(
        lambda x: -len(set(dbscan(X, eps=x, min_samples=min_samp,
                                  metric='cosine', algorithm='brute')[1])),
        method='bounded', bounds=[0, 1]).x
    db = dbscan(X, eps=new_eps, min_samples=min_samp, metric='cosine',
                algorithm='brute')[1]
    d = defaultdict(list)
    for i in range(len(db)):
        d[db[i]].append(sent_coll[i])
    return d, new_eps
def get_dbscan_data(mdf, eps, npts):
    np.random.seed(42)
    res = dbscan(mdf[["LatRad", "LonRad"]].values, eps=eps * 1e-5,
                 min_samples=npts, metric='haversine')
    df3 = mdf.copy()
    df3["cluster"] = res[1]
    return df3
def DBSCAN_clust(d, words, epsilon):
    if VERBOSE:
        print('Running DBSCAN!')
    core, labels = dbscan(d, eps=epsilon, metric='precomputed')
    cluster_assignments = labels
    nclust = max(cluster_assignments)
    assignments = pd.DataFrame({'word': words, 'cluster': labels})
    csizes, indices = eval_assignments(assignments, nclust, None)
    return assignments, csizes, indices
def cluster_into_spots(df, init_eps=150, levels=2, threshold=0.1):
    start_points = list()
    end_points = list()
    length = len(df)
    for i in range(length):
        start_points.append([df['start_lat'].iloc[i], df['start_lon'].iloc[i]])
        end_points.append([df['end_lat'].iloc[i], df['end_lon'].iloc[i]])

    points = np.radians(np.vstack([start_points, end_points]))
    haversine = DistanceMetric.get_metric('haversine')
    dist = haversine.pairwise(points) * R
    clusters = dbscan(dist, metric='precomputed', min_samples=1,
                      eps=init_eps)[1]
    clusters = np.array(clusters, dtype=object)

    for _ in range(levels):
        init_eps = init_eps * 0.5
        counts = dict(Counter(clusters))
        for key in counts:
            if counts[key] > threshold * length:
                idxs = np.where(clusters == key)[0]
                dist = haversine.pairwise(points[idxs]) * R
                inner_clusters = dbscan(dist, metric='precomputed',
                                        min_samples=1, eps=init_eps)[1]
                for i, idx in enumerate(idxs):
                    clusters[idx] = "{}_{}".format(clusters[idx],
                                                   inner_clusters[i])

    start_clusters = list()
    end_clusters = list()
    for i, cluster in enumerate(clusters):
        if i < length:
            start_clusters.append(clusters[i])
        else:
            end_clusters.append(clusters[i % length + length])

    df['start_cluster'] = start_clusters
    df['end_cluster'] = end_clusters
    return df
def run(self, src):
    x = self.table(src)
    settings = self.settings
    print(settings)
    # the functional dbscan() returns a (core_samples, labels) tuple and has
    # no fit_predict; the estimator class DBSCAN is needed here
    cl = DBSCAN(eps=settings.eps, min_samples=settings.min_pts,
                algorithm="brute")
    y = cl.fit_predict(x)
    clusters = {}
    for x_i, y_i in zip(x, y):
        clusters[y_i] = clusters.get(y_i, []) + [x_i]
    return clusters
def mergeKeypoints(keypoints, eps):
    if len(keypoints) < 2:
        return keypoints

    points = np.array([keypoint.pt for keypoint in keypoints])
    sizes = np.array([keypoint.size for keypoint in keypoints])

    # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.dbscan.html
    _, pointsLabels = dbscan(points, eps=eps, min_samples=1,
                             metric='euclidean')
    clustersPredicates = ((pointsLabels == label)
                          for label in range(0, max(pointsLabels) + 1))
    mergedKeypoints = [createCentroid(points[predicate], sizes[predicate], eps)
                       for predicate in clustersPredicates]
    return mergedKeypoints
def test_dbscan(self):
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)

    result = df.cluster.dbscan()
    expected = cluster.dbscan(iris.data)

    self.assertEqual(len(result), 2)
    self.assert_numpy_array_almost_equal(result[0], expected[0])

    self.assertTrue(isinstance(result[1], pdml.ModelSeries))
    self.assert_index_equal(result[1].index, df.index)
    self.assert_numpy_array_equal(result[1].values, expected[1])
def test_dbscan(self):
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)

    result = df.cluster.dbscan()
    expected = cluster.dbscan(iris.data)

    self.assertEqual(len(result), 2)
    self.assert_numpy_array_almost_equal(result[0], expected[0])

    self.assertIsInstance(result[1], pdml.ModelSeries)
    tm.assert_index_equal(result[1].index, df.index)
    tm.assert_numpy_array_equal(result[1].values, expected[1])
def dbscan_func(data_seg):
    point_cloud = data_seg[0]
    eps = data_seg[1]
    min_samples = data_seg[2]
    metric = data_seg[3]
    algorithm = data_seg[4]
    i_slice = data_seg[5]
    num_points = len(point_cloud)
    print("Starting to cluster slice {} with a total of {} points".format(
        i_slice, num_points))
    oldtime = time()
    result = dbscan(point_cloud, eps=eps, min_samples=min_samples,
                    metric=metric, algorithm=algorithm)
    print("Finished clustering slice {}. Time for slice: {} sec".format(
        i_slice, time() - oldtime))
    core_sample_indices = result[0]
    labels = result[1]
    return core_sample_indices, labels, i_slice
def Dbscan(embeddings, id_word, word_id, eps, min_size):
    coreSamples, labels = dbscan(embeddings, eps=eps, min_samples=min_size)

    # group clusters
    clusters = {}
    for i, label in enumerate(labels):
        if label not in clusters:
            clusters[label] = []
        clusters[label].append(id_word[i].encode('utf-8'))

    # output
    print(len(clusters) - 1)
    for c in clusters.keys():
        if c < 0:
            continue  # -1 is noise
        print(' '.join([str(x) for x in embeddings[int(c)]]))
    print()

    # show clusters
    for c, words in clusters.items():
        print(c, ' '.join(words))
def _cluster_core(sort_list, r, visited, final_list):
    from sklearn.cluster import dbscan
    from scipy.spatial.distance import euclidean

    pos = np.r_[[i[1] for i in sort_list]]
    if len(pos) >= 2:
        _, labels = dbscan(pos, eps=r, min_samples=2)
        pool = set()
        for i, p in enumerate(sort_list):
            if p[1] in pool:
                continue
            c = labels[i]
            if c == -1:
                continue
            sub = pos[labels == c]
            cen = p[1]
            rad = r
            Local = [p[1]]
            ini = -1
            while len(sub):
                out = []
                for q in sub:
                    if tuple(q) in pool:
                        continue
                    tmp = euclidean(q, cen)
                    if tmp <= rad:
                        Local.append(tuple(q))
                    else:
                        out.append(tuple(q))
                if len(out) == ini:
                    break
                ini = len(out)
                tmp = np.r_[Local]
                # assign centroid to a certain pixel
                cen = tuple(tmp.mean(axis=0).round().astype(int))
                rad = int(np.round(max([euclidean(cen, q) for q in Local]))) + r
                sub = np.r_[out]
            for q in Local:
                pool.add(q)
            final_list.append((p[1], cen, rad))

        visited.update(pool)
def run_cluster(complPG, qfib, qsym, cl_radius=cl_radius, min_compl=min_compl):
    """
    """
    start = time.clock()  # time this

    # use transforms module for distance
    # quatDistance = lambda x, y: xf.quat_distance(x, y, qsym)

    # use compiled module for distance
    # just to be safe, must order qsym as C-contiguous
    qsym = np.array(qsym.T, order='C').T
    quatDistance = lambda x, y: xfcapi.quat_distance(np.array(x, order='C'),
                                                     np.array(y, order='C'),
                                                     qsym)

    qfib_r = qfib[:, np.r_[complPG] > min_compl]

    print("Feeding %d orientations above %.1f%% to clustering"
          % (qfib_r.shape[1], 100*min_compl))

    if haveScikit:
        print("Using scikit...")
        pdist = pairwise_distances(qfib_r.T, metric=quatDistance, n_jobs=-1)
        core_samples, labels = dbscan(pdist, eps=d2r*cl_radius,
                                      min_samples=1, metric='precomputed')
        cl = np.array(labels, dtype=int) + 1
    else:
        print("Using fclusterdata with a tolerance of %f degrees..." % cl_radius)
        cl = cluster.hierarchy.fclusterdata(qfib_r.T, d2r*cl_radius,
                                            criterion='distance',
                                            metric=quatDistance)

    nblobs = len(np.unique(cl))
    qbar = np.zeros((4, nblobs))
    for i in range(nblobs):
        npts = sum(cl == i + 1)
        # qbar[:, i] = mutil.unitVector(
        #     np.sum(qfib_r[:, cl == i + 1].reshape(4, npts), axis=1).reshape(4, 1)).flatten()
        qbar[:, i] = rot.quatAverage(qfib_r[:, cl == i + 1].reshape(4, npts),
                                     qsym).flatten()

    elapsed = (time.clock() - start)
    print("clustering took %f seconds" % elapsed)
    return qbar, cl
def vel_hist(parts, params, eps=1.0, sp=0):
    # Extract the position vectors
    D = np.zeros((len(parts), 3))
    # Particle velocities
    V = np.zeros((len(parts), 3))
    ps = 0
    for p in range(len(parts)):
        # Check if the particle is of the desired specie
        if sp == 0 or parts[p].sp == sp:
            D[ps] = parts[p].x
            V[ps] = parts[p].v
            ps += 1

    # Truncate zeros if we didn't cluster everything
    D = D[:ps]

    # Make clusters based on position
    [core, labels] = dbscan(D, eps=eps, min_samples=1)
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    # The net velocities of each cluster
    cluster_vels = np.zeros((n_clusters_, 3))

    # Iterate all particles
    for p in range(ps):
        if labels[p] >= 0:  # Make sure it was clustered
            # Add particle velocity to its cluster's net velocity
            cluster_vels[labels[p]] = np.add(cluster_vels[labels[p]], V[p])

    # Magnitudes of cluster velocities
    normed_vels = np.apply_along_axis(np.linalg.norm, 1, cluster_vels)

    cluster_sizes = np.bincount(labels)
    # Number of clusters for given size
    size_hist = np.bincount(cluster_sizes)

    # Speed as a function of size
    speed_size = np.zeros(len(size_hist))
    # Iterate through clusters
    for c in range(n_clusters_):
        speed_size[cluster_sizes[c]] = np.add(speed_size[cluster_sizes[c]],
                                              normed_vels[c])

    # Average
    speed_size = np.divide(speed_size[1:], size_hist[1:])
    return speed_size
def run_cluster(compl, qfib, qsym, cfg,
                min_samples=None, compl_thresh=None, radius=None):
    """
    """
    algorithm = cfg.find_orientations.clustering.algorithm
    # defaults from config; may be overridden below
    cl_radius = cfg.find_orientations.clustering.radius
    min_compl = cfg.find_orientations.clustering.completeness

    # check for override on completeness threshold
    if compl_thresh is not None:
        min_compl = compl_thresh

    # check for override on radius
    if radius is not None:
        cl_radius = radius

    start = time.clock()  # time this

    num_above = sum(np.array(compl) > min_compl)
    if num_above == 0:
        # nothing to cluster
        qbar = cl = np.array([])
    elif num_above == 1:
        # short circuit
        qbar = qfib[:, np.array(compl) > min_compl]
        cl = [1]
    else:
        # use compiled module for distance
        # just to be safe, must order qsym as C-contiguous
        qsym = np.array(qsym.T, order='C').T

        def quat_distance(x, y):
            return xfcapi.quat_distance(np.array(x, order='C'),
                                        np.array(y, order='C'), qsym)

        qfib_r = qfib[:, np.array(compl) > min_compl]

        num_ors = qfib_r.shape[1]
        if num_ors > 25000:
            if algorithm == 'sph-dbscan' or algorithm == 'fclusterdata':
                logger.info("falling back to euclidean DBSCAN")
                algorithm = 'ort-dbscan'
            # raise RuntimeError(
            #     "Requested clustering of %d orientations, which would be too slow!"
            #     % qfib_r.shape[1])

        logger.info(
            "Feeding %d orientations above %.1f%% to clustering",
            num_ors, 100*min_compl
        )

        if algorithm == 'dbscan' and not have_sklearn:
            algorithm = 'fclusterdata'
            logger.warning(
                "sklearn >= 0.14 required for dbscan; using fclusterdata"
            )

        if algorithm in ('dbscan', 'ort-dbscan', 'sph-dbscan'):
            # munge min_samples according to options
            if min_samples is None \
                    or cfg.find_orientations.use_quaternion_grid is not None:
                min_samples = 1

            if algorithm == 'sph-dbscan':
                logger.info("using spherical DBSCAN")
                # compute distance matrix
                pdist = pairwise_distances(
                    qfib_r.T, metric=quat_distance, n_jobs=1
                )

                # run dbscan
                core_samples, labels = dbscan(
                    pdist,
                    eps=np.radians(cl_radius),
                    min_samples=min_samples,
                    metric='precomputed'
                )
            else:
                if algorithm == 'ort-dbscan':
                    logger.info("using euclidean orthographic DBSCAN")
                    pts = qfib_r[1:, :].T
                    eps = 0.25*np.radians(cl_radius)
                else:
                    logger.info("using euclidean DBSCAN")
                    pts = qfib_r.T
                    eps = 0.5*np.radians(cl_radius)

                # run dbscan
                core_samples, labels = dbscan(
                    pts,
                    eps=eps,
                    min_samples=min_samples,
                    metric='minkowski', p=2,
                )

            # extract cluster labels
            cl = np.array(labels, dtype=int)  # convert to array
            noise_points = cl == -1  # index for marking noise
            cl += 1  # move index to 1-based instead of 0
            cl[noise_points] = -1  # re-mark noise as -1
            logger.info("dbscan found %d noise points", sum(noise_points))
        elif algorithm == 'fclusterdata':
            logger.info("using spherical fclusterdata")
            cl = cluster.hierarchy.fclusterdata(
                qfib_r.T,
                np.radians(cl_radius),
                criterion='distance',
                metric=quat_distance
            )
        else:
            raise RuntimeError(
                "Clustering algorithm %s not recognized" % algorithm
            )

        # extract number of clusters
        if np.any(cl == -1):
            nblobs = len(np.unique(cl)) - 1
        else:
            nblobs = len(np.unique(cl))

        # perform averaging to get cluster centroids
        qbar = np.zeros((4, nblobs))
        for i in range(nblobs):
            npts = sum(cl == i + 1)
            qbar[:, i] = rot.quatAverageCluster(
                qfib_r[:, cl == i + 1].reshape(4, npts), qsym
            ).flatten()

    if (algorithm == 'dbscan' or algorithm == 'ort-dbscan') \
            and qbar.size/4 > 1:
        logger.info("\tchecking for duplicate orientations...")
        cl = cluster.hierarchy.fclusterdata(
            qbar.T,
            np.radians(cl_radius),
            criterion='distance',
            metric=quat_distance)
        nblobs_new = len(np.unique(cl))
        if nblobs_new < nblobs:
            logger.info("\tfound %d duplicates within %f degrees",
                        nblobs - nblobs_new, cl_radius)
            tmp = np.zeros((4, nblobs_new))
            for i in range(nblobs_new):
                npts = sum(cl == i + 1)
                tmp[:, i] = rot.quatAverageCluster(
                    qbar[:, cl == i + 1].reshape(4, npts), qsym
                ).flatten()
            qbar = tmp

    logger.info("clustering took %f seconds", time.clock() - start)
    logger.info(
        "Found %d orientation clusters with >=%.1f%% completeness"
        " and %2f misorientation",
        qbar.size/4, 100.*min_compl, cl_radius
    )

    return np.atleast_2d(qbar), cl
def clusterData(X):
    original = X
    X = StandardScaler().fit_transform(X)
    coreSamples, labels = dbscan(X, min_samples=5, eps=.07, p=4)
    # return original, labels
    return rearrangeLabels(original, labels)
from Levenshtein import distance as levenshtein
import numpy as np
from sklearn.cluster import dbscan

data = ["ACCTCCTAGAAG", "ACCTACTAGAAGTT", "GAATATTAGGCCGA"]

def lev_metric(x, y):
    i, j = int(x[0]), int(y[0])  # extract indices
    return levenshtein(data[i], data[j])

X = np.arange(len(data)).reshape(-1, 1)
print(X)
# array([[0],
#        [1],
#        [2]])

dbscan(X, metric=lev_metric, eps=5, min_samples=2)
# ([0, 1], array([ 0,  0, -1]))
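# The index-as-feature trick above works because the callable metric only
# ever sees row indices into `data`. An equivalent formulation, sketched
# here on the same data, precomputes the full pairwise distance matrix once
# and passes metric='precomputed'; this avoids recomputing distances for
# repeated neighbour queries.
import numpy as np
from Levenshtein import distance as levenshtein
from sklearn.cluster import dbscan

data = ["ACCTCCTAGAAG", "ACCTACTAGAAGTT", "GAATATTAGGCCGA"]
n = len(data)

# symmetric pairwise Levenshtein distance matrix
D = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        D[i, j] = D[j, i] = levenshtein(data[i], data[j])

core, labels = dbscan(D, metric='precomputed', eps=5, min_samples=2)
print(core, labels)  # same clustering as the callable-metric version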
def build_overlap_table(cfg, tol_mult=0.5):
    icfg = get_instrument_parameters(cfg)

    gt = np.loadtxt(os.path.join(cfg.analysis_dir, 'grains.out'))
    ngrains = len(gt)

    mat_list = cPickle.load(open(cfg.material.definitions, 'r'))
    mat_names = [mat_list[i].name for i in range(len(mat_list))]
    mat_dict = dict(zip(mat_names, mat_list))

    matl = mat_dict[cfg.material.active]
    pd = matl.planeData
    pd.exclusions = np.zeros(len(pd.exclusions), dtype=bool)
    pd.tThMax = np.radians(cfg.fit_grains.tth_max)
    pd.tThWidth = np.radians(cfg.fit_grains.tolerance.tth[-1])

    # for clustering...
    eps = tol_mult*np.radians(
        min(
            min(cfg.fit_grains.tolerance.eta),
            2*min(cfg.fit_grains.tolerance.omega)
        )
    )

    # merged two-theta indices
    tth_ranges_merged = pd.getMergedRanges()[0]
    pids = []
    for hklids in tth_ranges_merged:
        pids.append(
            [pd.hklDataList[hklids[i]]['hklID'] for i in range(len(hklids))]
        )

    # Make table of unit diffraction vectors
    st = []
    for i in range(ngrains):
        this_st = np.loadtxt(
            os.path.join(cfg.analysis_dir, 'spots_%05d.out' % i)
        )
        # ... do all predicted?
        valid_spt = this_st[:, 0] >= 0
        # valid_spt = np.ones(len(this_st), dtype=bool)
        angs = this_st[valid_spt, 7:10]
        dvec = xfcapi.anglesToDVec(
            angs, chi=icfg['oscillation_stage']['chi']
        )
        # [ grainID, reflID, hklID, D_s[0], D_s[1], D_s[2], tth, eta, ome ]
        st.append(
            np.hstack([
                i*np.ones((sum(valid_spt), 1)),
                this_st[valid_spt, :2],
                dvec,
                angs,
            ])
        )

    # make overlap table
    # [[range_0], [range_1], ..., [range_n]]
    # range_0 = [grainIDs, reflIDs, hklIDs] that are within tol
    overlap_table = []
    ii = 0
    for pid in pids:
        print("processing ring set %d" % ii)
        start0 = time.clock()
        tmp = []
        a = []
        b = []
        c = []
        for j in range(len(pid)):
            a.append(np.vstack(
                [st[i][st[i][:, 2] == pid[j], 3:6] for i in range(len(st))]
            ))
            b.append(np.vstack(
                [st[i][st[i][:, 2] == pid[j], 0:3] for i in range(len(st))]
            ))
            c.append(np.vstack(
                [st[i][st[i][:, 2] == pid[j], 6:9] for i in range(len(st))]
            ))
        a = np.vstack(a)  # unit diffraction vectors in sample frame
        b = np.vstack(b)  # [grainID, reflID, hklID]
        c = np.vstack(c)  # predicted angles [tth, eta, ome]
        if len(a) > 0:
            # run dbscan
            core_samples, labels = dbscan(
                a,
                eps=eps,
                min_samples=2,
                metric='minkowski', p=2,
            )
            cl, nblobs = postprocess_dbscan(labels)
            elapsed0 = time.clock() - start0
            print("\tdbscan took %.2f seconds" % elapsed0)

            print("\tcollapsing incidentals for %d candidates..." % nblobs)
            start1 = time.clock()  # time this
            for i in range(1, nblobs+1):
                # put in check on omega here
                these_angs = c[np.where(cl == i)[0], :]
                # local_cl = cluster.hierarchy.fclusterdata(
                #     these_angs[:, 1:],
                #     eps,
                #     criterion='distance',
                #     metric=adist
                # )
                # local_nblobs = len(np.unique(local_cl))
                _, local_labels = dbscan(
                    these_angs[:, 1:],
                    eps=eps,
                    min_samples=2,
                    metric=adist,
                    n_jobs=-1,
                )
                local_cl, local_nblobs = postprocess_dbscan(local_labels)
                if local_nblobs < len(these_angs):
                    for j in range(1, local_nblobs + 1):
                        npts = sum(local_cl == j)
                        if npts >= 2:
                            cl_idx = np.where(local_cl == j)[0]
                            tmp.append(
                                b[np.where(cl == i)[0][cl_idx], :]
                            )
            elapsed1 = time.clock() - start1
            print("\tomega filtering took %.2f seconds" % elapsed1)
        ii += 1
        overlap_table.append(tmp)
    return overlap_table
def run_cluster(compl, qfib, qsym, cfg):
    """
    """
    cl_radius = cfg.find_orientations.clustering.radius
    min_compl = cfg.find_orientations.clustering.completeness
    algorithm = cfg.find_orientations.clustering.algorithm

    start = time.clock()  # time this

    num_above = sum(np.array(compl) > min_compl)
    if num_above == 0:
        # nothing to cluster
        qbar = cl = np.array([])
    elif num_above == 1:
        # short circuit
        qbar = qfib[:, np.array(compl) > min_compl]
        cl = [1]
    else:
        # use compiled module for distance
        # just to be safe, must order qsym as C-contiguous
        qsym = np.array(qsym.T, order='C').T
        quat_distance = lambda x, y: xfcapi.quat_distance(
            np.array(x, order='C'), np.array(y, order='C'), qsym
        )

        qfib_r = qfib[:, np.array(compl) > min_compl]

        logger.info(
            "Feeding %d orientations above %.1f%% to clustering",
            qfib_r.shape[1], 100*min_compl
        )

        if algorithm == 'dbscan' and not have_sklearn:
            algorithm = 'fclusterdata'
            logger.warning(
                "sklearn >= 0.14 required for dbscan, using fclusterdata"
            )

        if algorithm == 'dbscan':
            pdist = pairwise_distances(
                qfib_r.T, metric=quat_distance, n_jobs=-1
            )
            core_samples, labels = dbscan(
                pdist,
                eps=np.radians(cl_radius),
                min_samples=1,
                metric='precomputed'
            )
            cl = np.array(labels, dtype=int) + 1
        elif algorithm == 'fclusterdata':
            cl = cluster.hierarchy.fclusterdata(
                qfib_r.T,
                np.radians(cl_radius),
                criterion='distance',
                metric=quat_distance
            )
        else:
            raise RuntimeError(
                "Clustering algorithm %s not recognized" % algorithm
            )

        nblobs = len(np.unique(cl))
        qbar = np.zeros((4, nblobs))
        for i in range(nblobs):
            npts = sum(cl == i + 1)
            qbar[:, i] = rot.quatAverage(
                qfib_r[:, cl == i + 1].reshape(4, npts), qsym
            ).flatten()

    logger.info("clustering took %f seconds", time.clock() - start)
    logger.info(
        "Found %d orientation clusters with >=%.1f%% completeness"
        " and %2f misorientation",
        qbar.size/4, 100.*min_compl, cl_radius
    )

    return np.atleast_2d(qbar), cl
def build_overlap_table(cfg, tol_mult=0.5):
    icfg = get_instrument_parameters(cfg)

    gt = np.loadtxt(os.path.join(cfg.analysis_dir, 'grains.out'))
    ngrains = len(gt)

    mat_list = cPickle.load(open(cfg.material.definitions, 'r'))
    mat_names = [mat_list[i].name for i in range(len(mat_list))]
    mat_dict = dict(zip(mat_names, mat_list))

    matl = mat_dict[cfg.material.active]
    pd = matl.planeData
    pd.exclusions = np.zeros(len(pd.exclusions), dtype=bool)
    pd.tThMax = np.radians(cfg.fit_grains.tth_max)
    pd.tThWidth = np.radians(cfg.fit_grains.tolerance.tth[-1])

    # for clustering...
    eps = tol_mult*np.radians(
        min(
            min(cfg.fit_grains.tolerance.eta),
            2*min(cfg.fit_grains.tolerance.omega)
        )
    )

    # merged two-theta indices
    tth_ranges_merged = pd.getMergedRanges()[0]
    pids = []
    for hklids in tth_ranges_merged:
        pids.append(
            [pd.hklDataList[hklids[i]]['hklID'] for i in range(len(hklids))]
        )

    # Make table of unit diffraction vectors
    st = []
    for i in range(ngrains):
        this_st = np.loadtxt(
            os.path.join(cfg.analysis_dir, 'spots_%05d.out' % i)
        )
        # ... do all predicted?
        valid_spt = this_st[:, 0] >= 0
        # valid_spt = np.ones(len(this_st), dtype=bool)
        angs = this_st[valid_spt, 7:10]
        dvec = xfcapi.anglesToDVec(
            angs, chi=icfg['oscillation_stage']['chi']
        )
        # [ grainID, reflID, hklID, D_s[0], D_s[1], D_s[2], tth, eta, ome ]
        st.append(
            np.hstack([
                i*np.ones((sum(valid_spt), 1)),
                this_st[valid_spt, :2],
                dvec,
                angs,
            ])
        )

    # make overlap table
    # [[range_0], [range_1], ..., [range_n]]
    # range_0 = [grainIDs, reflIDs, hklIDs] that are within tol
    overlap_table = []
    ii = 0
    for pid in pids:
        tmp = []
        a = []
        b = []
        c = []
        for j in range(len(pid)):
            a.append(np.vstack(
                [st[i][st[i][:, 2] == pid[j], 3:6] for i in range(len(st))]
            ))
            b.append(np.vstack(
                [st[i][st[i][:, 2] == pid[j], 0:3] for i in range(len(st))]
            ))
            c.append(np.vstack(
                [st[i][st[i][:, 2] == pid[j], 6:9] for i in range(len(st))]
            ))
        a = np.vstack(a)
        b = np.vstack(b)
        c = np.vstack(c)
        if len(a) > 0:
            # run dbscan
            core_samples, labels = dbscan(
                a,
                eps=eps,
                min_samples=2,
                metric='minkowski', p=2,
            )

            cl = np.array(labels, dtype=int)  # convert to array
            noise_points = cl == -1  # index for marking noise
            cl += 1  # move index to 1-based instead of 0
            cl[noise_points] = -1  # re-mark noise as -1

            # extract number of clusters
            if np.any(cl == -1):
                nblobs = len(np.unique(cl)) - 1
            else:
                nblobs = len(np.unique(cl))

            for i in range(1, nblobs+1):
                # put in check on omega here
                these_angs = c[np.where(cl == i)[0], :]
                local_cl = cluster.hierarchy.fclusterdata(
                    these_angs[:, 1:],
                    eps,
                    criterion='distance',
                    metric=adist
                )
                local_nblobs = len(np.unique(local_cl))
                if local_nblobs < len(these_angs):
                    for j in range(1, local_nblobs + 1):
                        npts = sum(local_cl == j)
                        if npts >= 2:
                            cl_idx = np.where(local_cl == j)[0]
                            tmp.append(
                                b[np.where(cl == i)[0][cl_idx], :]
                            )
        print("processing ring set %d" % ii)
        ii += 1
        overlap_table.append(tmp)
    return overlap_table
def clustering(lats, longs, timestamps, ID, timestmp, multiPDF=False):
    """
    Clusters the GPS coordinates using DBSCAN.

    :param timestmp: The timestamp
    :param ID: The ID
    :param timestamps: The timestamps of the GPS coordinates
    :param lats: The latitudes
    :param longs: The longitudes
    :return: The rounded distance
    """
    folder = "out/"
    plotDir = folder + "plots/Walking Test Analysis"

    R = 6371  # Radius of the earth in km
    cartesianX = []
    cartesianY = []
    cartesianZ = []
    for lat, long in zip(lats, longs):
        # Convert to cartesian coordinates
        x = R * cos(lat) * cos(long)
        y = R * cos(lat) * sin(long)
        z = R * sin(lat)
        cartesianX.append(x)
        cartesianY.append(y)
        cartesianZ.append(z)

    combined = np.vstack((cartesianX, cartesianY, cartesianZ)).T
    (core_samples, labels) = dbscan(combined, eps=0.5)
    grouped = list(zip(labels, core_samples))

    nonGroupedPositions = []
    for (label, core_sample) in grouped:
        if label != -1:
            lat = lats[core_sample]
            long = longs[core_sample]
            stamp = timestamps[core_sample]
            nonGroupedPositions.append((lat, long, stamp))

    if len(nonGroupedPositions) > 0:
        y = list(zip(*nonGroupedPositions))[0]  # the latitudes
        x = list(zip(*nonGroupedPositions))[1]  # the longitudes
        t = list(zip(*nonGroupedPositions))[2]  # the timestamps
        x2, y2, newx2, newy2 = smooth(y, x, t)
        plt.plot(y2, x2, label="Linear Interpolation")
        plt.plot(newy2, newx2, label="Savgol Filter", color="r")
        distance = calcDistanceWalked(newy2, newx2)

        grouped = sorted(grouped, key=itemgetter(0))
        clusters = {}
        labels = []
        for key, group in groupby(grouped, key=itemgetter(0)):
            # group the clusters based on their label
            labels.append(key)
            clusters[key] = [el[1] for el in group]

        noise = False
        colors = plt.get_cmap("Spectral")(np.linspace(0, 1, len(clusters)))
        for label in labels:
            indices = clusters[label]
            latitudes = []
            longitudes = []
            size = 10
            alpha = 0.5
            lineWidth = 0.15
            for i in indices:
                latitudes.append(lats[i])
                longitudes.append(longs[i])
            if label == -1:
                # outliers are identified with a label of -1
                plt.plot(latitudes, longitudes, "o",
                         markerfacecolor=almost_black,
                         markeredgecolor=almost_black, markersize=size,
                         alpha=alpha, linewidth=lineWidth, label="Outlier")
                noise = True
            else:
                plt.plot(latitudes, longitudes, "o",
                         markerfacecolor=colors[label],
                         markeredgecolor=almost_black, markersize=size,
                         alpha=alpha, linewidth=lineWidth,
                         label="Cluster %i" % (label + 1))

        plt.title("Timestamp: %s\n Number of clusters: %i\n Calculated distance: %i meters"
                  % (timestmp, (len(clusters) - 1) if noise else len(clusters),
                     round(distance)))
        plt.xlabel("Latitude")
        plt.ylabel("Longitude")
        fancyPlot()
        writeToPdf(ID, plotDir)
        return True, distance
    else:
        # DBSCAN gave back an empty array, so we cannot perform any
        # smoothing or distance calculation
        return False, 0
res
res[0]
res.keys()
len(res['content'])
res['content'].keys()
res['content']['statuses'].keys()
len(res['content']['statuses'])
res['content']['statuses'][0]
res['content']['statuses'][0]['text']
res['content']['statuses'][1]['text']
res['content']['statuses'][2]['text']
newtext = "\n".join(x['text'] for x in res['content']['statuses'])
len(newtext)
newtext[:100]
newtext[:600]
print(newtext[:600])
import summsnippets as summ2
tok = summ2.tokenizes(newtext)
tups = summ2.pos_tag(tok)
tups[:10]
tups[:20]
a = summ.make_sent_objs(tups)
X = summ.build_sent_matrix(a)
db = dbscan(X, eps=summ.DEFAULT_EPS, min_samples=3, metric='cosine', algorithm='brute')[1]
from sklearn.cluster import dbscan
db = dbscan(X, eps=summ2.DEFAULT_EPS, min_samples=3, metric='cosine', algorithm='brute')[1]
db
from collections import Counter
Counter(db)
import numpy as np
from sklearn import cluster

# exercise 7.3.4
k = 3
samples = np.array([
    (4, 10), (7, 10), (4, 8), (6, 8), (3, 4), (10, 5),
    (12, 6), (11, 4), (2, 2), (5, 2), (9, 3), (12, 3),
])

res = cluster.k_means(samples, k)
labels = res[1]
clus = [samples[labels == i] for i in range(k)]

print('cluster i, N, SUM, SUMSQ')
for i in range(k):
    print(i, [(clus[i]**j).sum(axis=0) for j in range(3)])
    for j in range(2):
        t = clus[i][:, j]
        print(np.var(t), np.std(t))

# density-based scan
print(cluster.dbscan(samples, eps=3, min_samples=2))
def dbscan(threshold, matrix, taxa, revert=False, min_samples=1):
    """
    Compute DBSCAN cluster analysis.

    Parameters
    ----------
    threshold : float
        The threshold for clustering you want to use.
    matrix : list
        The two-dimensional matrix passed as list or array.
    taxa : list
        The list of taxon names. If set to "False" a fake list of taxon
        names will be created, giving a positive numerical ID in increasing
        order for each column in the matrix.
    revert : bool
        If set to "False", don't return taxon names but simply the language
        identifiers and their labels as a dictionary. Otherwise returns a
        dictionary with labels as keys and lists of taxon names as values.
    min_samples : int (default=1)
        The min_samples parameter of the DBSCAN method from the
        SKLEARN package.

    Returns
    -------
    clusters : dict
        Either a dictionary of taxon identifiers and labels, or a
        dictionary of labels and taxon names.

    Notes
    -----
    This method does not work as expected, probably since it normally
    requires distances between points as input. We list it only for
    completeness here, but urge you to be careful when using the code and
    to check our implementation in the source code properly.

    Requires the scikit-learn package, downloadable from
    http://scikit-learn.org/.
    """
    if not cluster:
        raise ValueError("The package sklearn is needed to run this analysis.")
    if not taxa:
        taxa = list(range(1, len(matrix) + 1))

    core_samples, labels = cluster.dbscan(
        matrix, eps=threshold, min_samples=min_samples, metric='precomputed')

    # change to our internal cluster style
    idx = max(labels) + 1
    if idx == 0:
        idx += 1
    for i, c in enumerate(labels):
        if c == -1:
            labels[i] = idx
            idx += 1

    # check for revert
    if revert:
        return dict(zip(range(len(taxa)), labels))

    clr = defaultdict(list)
    for i, t in enumerate(taxa):
        clr[labels[i]] += [t]
    return clr
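# A usage sketch for the wrapper above. The distance matrix and taxon names
# are invented for illustration; `dbscan` here is the wrapper defined above,
# with sklearn available as `cluster`.
import numpy as np

matrix = np.array([
    [0.0, 0.1, 0.9, 0.9],
    [0.1, 0.0, 0.9, 0.9],
    [0.9, 0.9, 0.0, 0.1],
    [0.9, 0.9, 0.1, 0.0],
])  # toy symmetric distance matrix: two tight pairs
taxa = ['German', 'Dutch', 'Spanish', 'Italian']

clusters = dbscan(0.2, matrix, taxa, min_samples=1)
print(clusters)  # e.g. {0: ['German', 'Dutch'], 1: ['Spanish', 'Italian']}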