def clusterFromDistMatrix(distanceMatrix):
    clusterer = HDBSCAN(min_cluster_size=2, metric='precomputed')
    clusterer.fit(distanceMatrix)
    labels = clusterer.labels_
    probs = clusterer.probabilities_
    labels = np.array(labels)[np.newaxis]
    labels = labels.T
    probs = np.array(probs)[np.newaxis]
    probs = probs.T
    results = np.concatenate((probs, gv.medSequenceMatrix), axis=1)
    results = np.concatenate((labels, results), axis=1)
    results = np.array(sorted(results, key=lambda a_entry: a_entry[0]))
    with open('treatmentClusters.txt', 'w') as csvfile:
        csvfile.write(
            "Cluster; Probability; ID; Month 1; Month 2; Month 3; Month 4; Month 5; Month 6"
        )
        csvfile.write('\n')
        for i in range(results.shape[0]):
            csvfile.write(str(results[i, 0]))
            csvfile.write(';')
            csvfile.write(str(results[i, 1]))
            csvfile.write(';')
            csvfile.write(str(results[i, 2]))
            csvfile.write(';')
            for j in range(3, results.shape[1]):
                csvfile.write(
                    str(results[i, j]).replace('{', '').replace('}', ''))
                csvfile.write(';')
            csvfile.write('\n')
def hdbscan(X, min_cluster_size=5, gen_min_span_tree=True, **kwargs):
    """Clustering with Hierarchical DBSCAN.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    min_cluster_size : int, default: 5
        the minimum number of points necessary to generate a cluster
    gen_min_span_tree : bool
        whether to build the minimum spanning tree (the default is True).
    kwargs
        additional keyword arguments passed through to HDBSCAN

    Returns
    -------
    fitted cluster instance : hdbscan.hdbscan.HDBSCAN
    """
    try:
        from hdbscan import HDBSCAN
    except ImportError:
        raise ImportError(
            "You must have the hdbscan package installed to use this function")

    # forward the documented parameters to the underlying estimator
    model = HDBSCAN(min_cluster_size=min_cluster_size,
                    gen_min_span_tree=gen_min_span_tree,
                    **kwargs)
    model.fit(X)
    return model
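# Usage sketch for the wrapper above (an assumption-based example: numpy is
# imported as np, the hdbscan package is installed, and the random data is
# purely illustrative).
def _example_hdbscan_wrapper():
    X = np.random.rand(200, 4)
    model = hdbscan(X, min_cluster_size=10)
    # labels_ uses -1 for noise; probabilities_ gives per-point membership strength
    return model.labels_, model.probabilities_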
def return_results(self, tiers):
    # Get the full matrix from the triangular matrix, get the row sums, and sort by them
    full_mat = self.matrix - self.matrix.T
    scores = np.sum(full_mat, axis=1)
    score_args = np.argsort(scores)[::-1][:self.limit]
    labs = list(np.array(self.labels)[score_args])
    scores = scores[score_args]
    tier_names = ['God', 'S', 'A', 'B', 'C', 'D', 'E', 'F', 'Garbage']
    max_tiers = 9
    # If the user has not specified a tier number to use, use HDBSCAN to come up with the number
    if tiers is None:
        tiers = 0
    if tiers <= 0:
        clusterer = HDBSCAN(min_cluster_size=2, min_samples=1, metric='l1')
        clusterer.fit(scores.reshape(-1, 1))
        tiers = max(clusterer.labels_)
        # print(clusterer.labels_)
        tiers = min(max_tiers, tiers + 1)
    # Use KMeans to split the scores into the set number of tiers
    clusterer = KMeans(n_clusters=tiers)
    try:
        clusterer.fit(scores.reshape(-1, 1))
    except OverflowError:
        print(scores, tiers)
        raise
    l_pairs = zip(list(scores), list(clusterer.labels_), labs)
    curr_tier = -1
    # Print the tiered results.
    for scr, tier, lab in l_pairs:
        if tier != curr_tier:
            print('\n')
            print(tier_names.pop(0) + ':')
            curr_tier = tier
        print('%s %.2f' % (lab, scr))
def run(self):
    if self.isStopped():
        self.canceled.emit()
        return False

    options = self.options
    clusterer = HDBSCAN(min_cluster_size=options.min_cluster_size,
                        min_samples=options.min_samples,
                        cluster_selection_epsilon=options.cluster_selection_epsilon,
                        cluster_selection_method=options.cluster_selection_method)

    layout_data = self._widget.get_layout_data()
    isolated_nodes = layout_data['isolated_nodes']
    layout = layout_data['layout']
    mask = np.ones_like(layout, dtype=bool)
    mask[isolated_nodes] = False
    x = layout[mask].reshape(-1, 2)

    clusterer.fit(x.astype(np.float64))

    i = 0
    result = []
    for n in self._widget.scene().nodes():
        if n.index() in isolated_nodes:
            result.append("Noise")
        else:
            # HDBSCAN labels clusters from 0 and marks noise as -1
            result.append(f"Cluster {clusterer.labels_[i] + 1}"
                          if clusterer.labels_[i] >= 0 else "Noise")
            i += 1

    if not self.isStopped():
        return result
    else:
        self.canceled.emit()
def hdbscan_samples(data, min_samples, n, filename):
    hdbscan = HDBSCAN(min_samples=min_samples, metric='haversine')
    data = data[np.random.randint(low=0, high=len(data), size=n), :]
    t0 = time.time()
    hdbscan.fit(np.radians(data))
    t1 = time.time() - t0
    clusters = len(np.unique(hdbscan.labels_))
    project = os.path.realpath('.')
    csv = os.path.join(project, filename)
    if not os.path.exists(csv):
        with open(csv, mode='w') as timing:
            timing.write('min_samples,n,clusters,seconds\n')
    with open(csv, mode='a') as timing:
        timing.write('{},{},{},{}\n'.format(min_samples, n, clusters, t1))
    print('HDBSCAN: {} samples, {} clusters, {} seconds'.format(
        n, clusters, t1))
    return t1
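# Hedged usage sketch for hdbscan_samples above: assumes `data` is an (N, 2)
# array of [lat, lon] in degrees (the function converts to radians before
# fitting) and that the timing CSV name is illustrative.
def _example_hdbscan_samples():
    coords = np.column_stack([np.random.uniform(-60, 60, 5000),
                              np.random.uniform(-180, 180, 5000)])
    seconds = hdbscan_samples(coords, min_samples=15, n=1000,
                              filename='hdbscan_timing.csv')
    return seconds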
def hdbscan_clustering(self,
                       min_cluster_size=10,
                       min_cluster_portion=None,
                       min_samples=1,
                       metric='hamming',
                       cluster_selection_method='eom',
                       allow_single_cluster=True,
                       epsilon=0.2):
    if min_cluster_portion is not None:
        if min_cluster_size is None:
            min_cluster_size = 0
        min_cluster_size = max(min_cluster_size,
                               self.n_obs * min_cluster_portion)
    else:
        if min_cluster_size is None:
            raise ValueError(
                'Either min_cluster_size or min_cluster_portion should be provided'
            )

    runner = HDBSCAN(min_cluster_size=int(min_cluster_size),
                     min_samples=int(min_samples),
                     metric=metric,
                     cluster_selection_method=cluster_selection_method,
                     allow_single_cluster=allow_single_cluster)
    if self.leiden_result_df is None:
        raise ValueError(
            'Run multi_leiden_clustering first before hdbscan_clustering')
    runner.fit(self.leiden_result_df)
    self.hdbscan = runner

    self.reselect_clusters(epsilon=epsilon, min_cluster_size=min_cluster_size)
    return
def fitness(self):
    clusterer = HDBSCAN(
        algorithm=self.parametros["algorithm"],
        min_cluster_size=self.parametros["min_cluster_size"],
        min_samples=self.parametros["min_samples"],
        cluster_selection_method=self.parametros["cluster_selection_method"],
        cluster_selection_epsilon=self.parametros["cluster_selection_epsilon"])
    clusterer.fit(self.data)
    self.labels = clusterer.labels_
    silhouette_score = self.silhouette_score(self.data, self.labels)
    # balance = self.balance(clusterer.labels_)
    # percents = self.calc_percents(clusterer.labels_)
    # len_labels = self.len_labels(clusterer.labels_)
    # noise_percents = [item for item in percents if item[0] == -1][0][1]
    score = silhouette_score
    # print(percents)
    # print("\n")
    # print('\'' + str(json.dumps(self.parametros)) + '\'')
    # print("\n")
    # print(score)
    # print("---------------------\n\n")
    return score
def cluster_data_points(data_points, cluster_size=5, distance_metric_func="Fractional"):
    points = [d['encoding'] for d in data_points]
    points = np.vstack(points)

    scaler = StandardScaler()
    scaler.fit(points)
    points = scaler.transform(points)

    dist_metric = Similarity()
    if distance_metric_func == "Fractional":
        dist_metric_func = dist_metric.fractional_distance
    else:
        dist_metric_func = dist_metric.euclidean_distance

    clusterer = HDBSCAN(min_cluster_size=cluster_size,
                        metric='pyfunc',
                        func=dist_metric_func)
    clusterer.fit(points)
    logging.info("Fit complete.")

    results = {}
    labelIDs = np.unique(clusterer.labels_)
    for labelID in labelIDs:
        paths = []
        encodings = []
        idxs = np.where(clusterer.labels_ == labelID)[0]
        for i in idxs:
            data = data_points[i]
            paths.append(data['path'])
            encodings.append(data['encoding'])
        results[labelID] = {
            'paths': paths,
            'mean_encoding': np.mean(np.asarray(encodings), axis=0),
            'std_dev': np.std(encodings, axis=0),
            'sample_size': len(paths)
        }
    return results
def cluster_hdbscan(above_gps):
    sample_by_feature = above_gps.to_numpy()
    print(sample_by_feature.shape)
    clusterer = HDBSCAN()
    clusterer.fit(sample_by_feature)
    cluster_ids = list(clusterer.labels_)
    return cluster_ids
def hdbscan_cluster(df: pd.DataFrame,
                    min_cluster_size: int = 10,
                    gen_min_span_tree: bool = True):
    clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                        gen_min_span_tree=gen_min_span_tree)
    clusterer.fit(df)
    return clusterer.labels_, clusterer.probabilities_
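# Hedged usage sketch for hdbscan_cluster above: assumes pandas and numpy are
# imported and the hdbscan package provides HDBSCAN; the toy frame is made up.
def _example_hdbscan_cluster():
    df = pd.DataFrame(np.random.rand(100, 3), columns=['f1', 'f2', 'f3'])
    labels, probabilities = hdbscan_cluster(df, min_cluster_size=5)
    return pd.Series(labels).value_counts()  # cluster sizes, with -1 = noise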
def perform_hdbscan(self, min_cluster_size=15):
    hdbscan_clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                                metric="precomputed")
    hdbscan_clusterer.fit(self.distance_matrix)

    self.hdbscan_results = {
        "parameters": hdbscan_clusterer.get_params(),
        "labels": hdbscan_clusterer.labels_,
        "probabilities": hdbscan_clusterer.probabilities_,
        "n_clusters": np.unique(hdbscan_clusterer.labels_).max() + 1,
        "clusters": label_cnt_dict(hdbscan_clusterer.labels_)
    }
    print_dict(self.hdbscan_results)
def cluster(df, min_size=4, allow_single_cluster=True):
    """Use HDBSCAN (Hierarchical Density-Based Spatial Clustering of
    Applications with Noise) to find the best clusters for the meander.
    """
    # Note: the haversine metric expects [lat, lng] in radians, so coordinates
    # should be converted before fitting if they are in degrees.
    clusterer = HDBSCAN(min_cluster_size=min_size,
                        min_samples=3,
                        metric='haversine',
                        allow_single_cluster=allow_single_cluster)
    clusterer.fit(df[['lat', 'lng']])
    # noise points (label -1) index the last letter, 'N', via negative indexing
    df.loc[:, 'label'] = ['ABCDEFGHIJKLMN'[i] for i in clusterer.labels_]
    return df.sort_values('label').reset_index(drop=True)
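# Hedged usage sketch for cluster() above: assumes a DataFrame with 'lat' and
# 'lng' columns; the coordinates below are illustrative only.
def _example_meander_cluster():
    df = pd.DataFrame({'lat': np.random.uniform(40.0, 40.1, 50),
                       'lng': np.random.uniform(-74.1, -74.0, 50)})
    return cluster(df, min_size=4)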
def _run_hdbscan(affinity: np.ndarray,
                 min_cluster_size_for_hdbscan: int,
                 min_cluster_size: int,
                 max_cluster_size: int):
    assert affinity.shape[0] == affinity.shape[1]
    if affinity.shape[0] > max_cluster_size:
        allow_single_cluster = False
    else:
        allow_single_cluster = True
    db = HDBSCAN(metric='precomputed',
                 min_cluster_size=min_cluster_size_for_hdbscan,
                 min_samples=1,
                 allow_single_cluster=allow_single_cluster)
    db.fit(affinity)
    return db
def hdbscan(self, min_cluster_size=10, prediction_data=False):
    """
    DBSCAN, but allows for varying-density clusters and no longer requires
    the epsilon parameter, which is difficult to tune.

    http://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html

    Scales slightly worse than DBSCAN, but with a more intuitive parameter.
    """
    hdbscan = HDBSCAN(min_cluster_size=min_cluster_size,
                      prediction_data=prediction_data)
    if prediction_data:
        return hdbscan.fit(self._safe_dense(self.matrix))
    else:
        return hdbscan.fit(self.matrix)
class HDBSCAN:

    def __init__(self):
        self.cluster = HDBSCANBase(algorithm='best',
                                   approx_min_span_tree=True,
                                   gen_min_span_tree=False,
                                   leaf_size=40,
                                   metric='euclidean',
                                   min_cluster_size=15,
                                   min_samples=15,
                                   p=None)

    def fit(self, X, y=None):
        self.cluster.fit(X)
def clusters(cat, mask, colors):
    table = cat[mask]
    table.keep_columns(colors)
    data = table.to_pandas()
    clusterer = HDBSCAN(min_cluster_size=20)  # 100 for real
    clusterer.fit(data)
    labels = Table.Column(clusterer.labels_, name='ct')
    proba = Table.Column(clusterer.probabilities_, name='prob_ct')
    stars = Table([cat[mask]['XMMSRCID'], labels, proba])
    return stars
def cluster(self, progress=None):
    images = list()
    with open('encoding_data.pkl', 'rb') as file:
        data = pickle.load(file)
    data = np.array(data)
    encodings = [d['encoding'] for d in data]
    X = np.vstack(encodings)

    pca = PCA(n_components='mle', svd_solver='full')
    X_new = pca.fit_transform(X)

    clt = HDBSCAN(metric='euclidean', min_cluster_size=10)
    clt.fit(X_new)

    labelIDs = np.unique(clt.labels_)
    done = 0
    increment = float(100.00 / len(labelIDs))
    if progress is not None:
        progress.setValue(0)

    for labelID in labelIDs:
        faces = list()
        idxs = np.where(clt.labels_ == labelID)[0]
        idxs = np.random.choice(idxs, size=min(25, len(idxs)), replace=False)
        for i in idxs:
            image = cv2.imread(data[i]['path'])
            (h, w) = image.shape[:2]
            image = cv2.resize(image, (int(w * 0.25), int(h * 0.25)))
            (t, r, b, l) = data[i]['loc']
            face = image[t:b, l:r]
            face = cv2.resize(face, (96, 96))
            faces.append(face)
        montage = build_montages(faces, (96, 96), (5, 5))[0]
        if progress is not None:
            done += increment
            progress.setValue(done)
        title = 'Face ID #{}'.format(labelID)
        title = 'Unknown Faces' if labelID == -1 else title
        cv2.imshow(title, montage)
        key = cv2.waitKey(0) & 0xFF
        if key == ord('k'):
            idxs = np.where(clt.labels_ == labelID)[0]
            for i in idxs:
                images.append(data[i]['path'])
            cv2.destroyAllWindows()
        elif key == ord('n'):
            cv2.destroyAllWindows()
    return images
def HDBSCAN(input_data):
    if input_data.size < 1:
        return np.zeros((0)), np.zeros((0))

    hdbscan_object = Hierarchical_DBSCAN(allow_single_cluster=True)
    hdbscan_object.fit(input_data)
    labels = hdbscan_object.labels_
    uniqueLabels, Count = np.unique(labels, return_counts=True)
    if len(uniqueLabels) > 10:
        ind_sorted = np.argsort(-Count)
        lbls_sorted = uniqueLabels[ind_sorted]
        lbls_rm = lbls_sorted[10:-1]
        for i in lbls_rm:
            labels[labels == i] = -1
        labels = np.unique(labels, return_inverse=True)[1] - 1
    return labels
def HDBSCAN(self, parameters):
    # parameters: data, min_cluster_size, min_samples, alpha, cluster_selection_method
    result = {}
    default_min_cluster_size = 3
    default_min_samples = 3
    default_alpha = 0.5  # float greater than 1
    default_cluster_selection_method = "eom"  # "eom" or "leaf"

    data = np.array(parameters['data'])
    data = preprocessing.MinMaxScaler().fit_transform(data)
    if parameters.get('min_cluster_size') is not None:
        default_min_cluster_size = int(parameters['min_cluster_size'])
    if parameters.get('min_samples') is not None:
        default_min_samples = int(parameters['min_samples'])
    if parameters.get('alpha') is not None:
        default_alpha = float(parameters['alpha'])
    if parameters.get('cluster_selection_method') is not None:
        default_cluster_selection_method = str(
            parameters['cluster_selection_method'])

    model = HDBSCAN(
        min_cluster_size=default_min_cluster_size,
        min_samples=default_min_samples,
        alpha=default_alpha,
        cluster_selection_method=default_cluster_selection_method,
        allow_single_cluster=True)
    clustering = model.fit(data)
    result['clustering'] = clustering
    return result
def TrainCluster(x):
    clusterer = HDBSCAN(min_cluster_size=1250,
                        gen_min_span_tree=True,
                        prediction_data=True)  # creating a clustering object
    hdb = clusterer.fit(x)  # fitting the cluster object on training data
    hdb.prediction_data  # no-op attribute access; prediction data was already generated during fit
    del x
    return hdb
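# Hedged follow-up sketch for TrainCluster above: because prediction_data=True,
# new points can be assigned to the trained clusters with approximate_predict
# from the hdbscan package (train_x and new_x are caller-supplied arrays).
def _example_train_cluster_predict(train_x, new_x):
    from hdbscan.prediction import approximate_predict
    model = TrainCluster(train_x)
    labels, strengths = approximate_predict(model, new_x)
    return labels, strengths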
def fit(self, X):
    """
    Apply the ST-DBSCAN algorithm.

    Parameters
    ----------
    X : 2D numpy array
        The first element of each row should be the time attribute as float.
        The following positions in the row are treated as spatial coordinates.
        The structure should look like this: [[time_step1, x, y], [time_step2, x, y], ...]

        For example, a 2D dataset:
        array([[0, 0.45, 0.43],
               [0, 0.54, 0.34], ...])

    Returns
    -------
    self
    """
    # check if input is correct
    X = check_array(X)

    if not self.eps1 > 0.0 or not self.eps2 > 0.0 or not self.min_samples > 0.0:
        raise ValueError('eps1, eps2, minPts must be positive')

    n, m = X.shape

    # Compute square-form Euclidean distance matrices for the 'time' attribute
    # and the spatial attributes
    time_dist = squareform(pdist(X[:, 0].reshape(n, 1), metric=self.metric))
    euc_dist = squareform(pdist(X[:, 1:], metric=self.metric))

    # filter the euc_dist matrix using the time_dist
    dist = np.where(time_dist <= self.eps2, euc_dist, 2 * self.eps1)

    db = HDBSCAN(min_samples=self.min_samples, metric='precomputed')
    # db = DBSCAN(eps=self.eps1,
    #             min_samples=self.min_samples,
    #             metric='precomputed')
    db.fit(dist)

    self.labels = db.labels_
    return self
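# Hedged standalone sketch of the same idea as fit() above: distances between
# points far apart in time are inflated so HDBSCAN cannot link them. The eps
# values and the toy [time, x, y] data are assumptions for illustration only.
def _example_spatiotemporal_hdbscan():
    X = np.random.rand(100, 3)  # columns: [time, x, y]
    eps_spatial, eps_temporal = 0.2, 0.1
    time_dist = squareform(pdist(X[:, 0].reshape(-1, 1)))
    euc_dist = squareform(pdist(X[:, 1:]))
    # keep spatial distances only for temporally close pairs
    dist = np.where(time_dist <= eps_temporal, euc_dist, 2 * eps_spatial)
    db = HDBSCAN(min_samples=5, metric='precomputed')
    db.fit(dist)
    return db.labels_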
def cluster_data_points(self, data=None, processed=False):
    if data is None or len(data) < 1:
        return None

    if processed is True:
        with open('video_data.pkl', 'rb') as file:
            data = pickle.load(file)
        self.clusterSize = self.cs

    points = [d['encoding'] for d in data]
    points = np.vstack(points)
    points = normalize(points, norm='l2', axis=1)

    dist_metric = Similarity()
    clusterer = HDBSCAN(min_cluster_size=self.clusterSize,
                        metric='pyfunc',
                        func=dist_metric.fractional_distance)
    clusterer.fit(points)

    results = dict()
    labelIDs = np.unique(clusterer.labels_)
    for labelID in labelIDs:
        idxs = np.where(clusterer.labels_ == labelID)[0]
        encodings = list()
        for i in idxs:
            if labelID not in results:
                results[labelID] = dict()
                results[labelID]['paths'] = list()
                results[labelID]['mean_encoding'] = None
                results[labelID]['std_dev'] = None
            results[labelID]['paths'].append(data[i]['path'])
            encodings.append(data[i]['encoding'])
        results[labelID]['mean_encoding'], results[labelID][
            'std_dev'] = self._compute_statistics(encodings)

    if processed is False:
        return results
    else:
        with open('video_results.pkl', 'wb') as file:
            pickle.dump(results, file, protocol=pickle.HIGHEST_PROTOCOL)
        return results
    return None
def fit(self, data, min_cluster_size, min_samples, alpha, cluster_selection_method):
    data = np.array(data)
    data = preprocessing.MinMaxScaler().fit_transform(data)
    model = HDBSCAN(min_cluster_size=min_cluster_size,
                    min_samples=min_samples,
                    alpha=alpha,
                    cluster_selection_method=cluster_selection_method,
                    allow_single_cluster=True)
    clustering = model.fit(data)
    return clustering
def HDBSCAN_cluster(d_array, **kwargs):
    """
    :param d_array: input numpy array (usually a 3d map)
    :param kwargs: keyword arguments passed through to HDBSCAN
    :return: numpy array of the same shape with cluster labels
    """
    clusterer = HDBSCAN(**kwargs)
    in_arr = np.array(d_array.nonzero()).T
    clusterer.fit(in_arr)
    labels = clusterer.labels_
    a = np.zeros(d_array.shape)
    for clust, tup in zip(labels, in_arr):
        if clust >= 0:
            a[tuple(tup)] = clust + 1
        else:
            a[tuple(tup)] = clust
    return a
def HDBSCAN_cluster(d_array, **kwargs):
    """
    Performs density-based clustering on an input 3d map.

    :param d_array: input numpy array (usually a 3d map)
    :param kwargs: keyword arguments passed through to HDBSCAN
    :return: numpy array
    """
    clusterer = HDBSCAN(**kwargs)
    in_arr = np.array(d_array.nonzero()).T
    clusterer.fit(in_arr)
    labels = clusterer.labels_
    a = np.zeros(d_array.shape)
    for clust, tup in zip(labels, in_arr):
        if clust >= 0:
            a[tuple(tup)] = clust + 1
        else:
            a[tuple(tup)] = clust
    return a
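# Hedged usage sketch for HDBSCAN_cluster above: builds a small synthetic 3D
# map with two dense blobs of non-zero voxels; the parameters are illustrative.
def _example_hdbscan_cluster_map():
    d_array = np.zeros((20, 20, 20))
    d_array[2:5, 2:5, 2:5] = 1.0
    d_array[12:15, 12:15, 12:15] = 1.0
    labelled = HDBSCAN_cluster(d_array, min_cluster_size=5)
    return np.unique(labelled)  # 0 = background, -1 = noise, 1..k = clusters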
def cluster_data_points(self):
    with open('data_points.pkl', 'rb') as file:
        data = pickle.load(file)

    points = [d['encoding'] for d in data]
    points = np.vstack(points)
    # points = normalize(points, norm='l2', axis=1)
    scaler = StandardScaler()
    scaler.fit(points)
    points = scaler.transform(points)

    with open('standardization_data.pkl', 'wb') as file:
        std_data = {'s_mean': scaler.mean_, 's_var': scaler.var_}
        pickle.dump(std_data, file, protocol=pickle.HIGHEST_PROTOCOL)

    dist_metric = Similarity()
    clusterer = HDBSCAN(min_cluster_size=self.clusterSize,
                        metric='pyfunc',
                        func=dist_metric.fractional_distance)
    clusterer.fit(points)

    results = dict()
    labelIDs = np.unique(clusterer.labels_)
    for labelID in labelIDs:
        idxs = np.where(clusterer.labels_ == labelID)[0]
        encodings = list()
        for i in idxs:
            if labelID not in results:
                results[labelID] = dict()
                results[labelID]['paths'] = list()
                results[labelID]['mean_encoding'] = None
                results[labelID]['std_dev'] = None
            results[labelID]['paths'].append(data[i]['path'])
            encodings.append(data[i]['encoding'])
        results[labelID]['mean_encoding'], results[labelID][
            'std_dev'] = self._compute_statistics(encodings)
        results[labelID]['sample_size'] = len(results[labelID]['paths'])

    with open('results.pkl', 'wb') as file:
        pickle.dump(results, file, protocol=pickle.HIGHEST_PROTOCOL)
    return True
def hdbscan_on_points(self, min_cluster_size, min_samples, xyz=False):
    """
    Performs hdbscan on input points.

    :param min_cluster_size: [int] minimum cluster size
    :param min_samples: [int] min samples
        >> see https://hdbscan.readthedocs.io/en/latest/parameter_selection.html
    :param xyz: [bool] if True the clustering will be done over xyz, otherwise xy.
    :return: writes the points assigned with clusters to self.clustered_points
    """
    masked_points = self.raw_points[self.masks]
    start = time.time()
    if xyz:
        xy = np.array([masked_points['X'], masked_points['Y'], masked_points['Z']]).T
    else:
        xy = np.array([masked_points['X'], masked_points['Y']]).T

    xy_clusterer = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples)
    xy_clusterer.fit(xy)

    clustered_points = pd.DataFrame({'X': masked_points['X'],
                                     'Y': masked_points['Y'],
                                     'Z': masked_points['Z'],
                                     'Red': masked_points['Red'],
                                     'Green': masked_points['Green'],
                                     'Blue': masked_points['Blue'],
                                     'HAG': masked_points['HAG'],
                                     'Coplanar': masked_points['Coplanar'],
                                     'NormalX': masked_points['NormalX'],
                                     'NormalY': masked_points['NormalY'],
                                     'NormalZ': masked_points['NormalZ'],
                                     'Classification': xy_clusterer.labels_})
    # remove "noise" points
    self.clustered_points = clustered_points[clustered_points.Classification >= 0]
    end = time.time()
    print(f'found {len(np.unique(self.clustered_points.Classification))} xy_clusters')
    print(f'clustering on xy took {round(end - start, 2)} seconds')
def _summarize_multi_leiden(self):
    # Here we don't rely on the hdbscan clustering itself; we just use it to reduce
    # the pairwise distance calculation. The resulting clusters are simply
    # disconnected components based on the hamming-distance graph.
    # This results in an over-clustering, which will be merged by the supervised
    # learning step.
    hdbscan = HDBSCAN(min_cluster_size=2,
                      min_samples=1,
                      metric='hamming',
                      cluster_selection_method='eom',
                      allow_single_cluster=True)
    hdbscan.fit(self.leiden_result_df)

    clusters = {}
    cur_num = 0
    for _, sub_df in self.leiden_result_df.groupby(
            pd.Series(hdbscan.labels_, index=self.leiden_result_df.index)):
        pairwise_dist = pairwise_distances(sub_df, metric='hamming')
        # create a graph; cells within the hamming distance cutoff are connected
        rows, cols = np.where(pairwise_dist < self.consensus_rate)
        edges = zip(sub_df.index[rows].tolist(), sub_df.index[cols].tolist())
        g = nx.Graph()
        g.add_edges_from(edges)
        for comp in nx.connected_components(g):
            if len(comp) >= self.min_cluster_size:
                # each disconnected component is assigned to a cluster
                for node in comp:
                    clusters[node] = cur_num
                cur_num += 1
            else:
                for node in comp:
                    clusters[node] = -1

    clusters = pd.Series(clusters).sort_index()
    print(
        f'{(clusters != -1).sum()} cells assigned to {clusters.unique().size - 1} raw clusters'
    )
    print(f'{(clusters == -1).sum()} cells are multi-leiden outliers')
    self._multi_leiden_clusters = clusters.values
    return
def Get_cluster_labels(input_data, Doc2Vec_model):
    '''
    Input: List of results from a search query, pre-loaded Doc2Vec_model
           (to prevent re-loading each time)
    Output: List of cluster labels and membership probabilities
    ----------------------------------------------------------------------------
    '''
    Lem_words = [input_data[x]['Lemmatized'].split() for x in range(len(input_data))]
    vectors = [Doc2Vec_model.infer_vector(document) for document in Lem_words]
    distance = pairwise_distances(vectors, metric='cosine')
    clusterer = HDBSCAN(metric='precomputed', cluster_selection_method='leaf')
    db = clusterer.fit(distance.astype('float64'))
    return db.labels_, db.probabilities_
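# Hedged sketch of the precomputed-cosine-distance path used above, without the
# Doc2Vec dependency: random vectors stand in for inferred document vectors,
# purely for illustration.
def _example_precomputed_cosine_hdbscan():
    vectors = np.random.rand(50, 32)
    distance = pairwise_distances(vectors, metric='cosine')
    clusterer = HDBSCAN(metric='precomputed', cluster_selection_method='leaf')
    db = clusterer.fit(distance.astype('float64'))
    return db.labels_, db.probabilities_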
def hdbscan(X, min_cluster_size=5, gen_min_span_tree=True, **kwargs):
    """Clustering with Hierarchical DBSCAN.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    min_cluster_size : int, default: 5
        the minimum number of points necessary to generate a cluster
    gen_min_span_tree : bool
        whether to build the minimum spanning tree (the default is True).

    Returns
    -------
    model : hdbscan.HDBSCAN
        fitted HDBSCAN instance
    """
    # forward the documented parameters to the underlying estimator
    model = HDBSCAN(min_cluster_size=min_cluster_size,
                    gen_min_span_tree=gen_min_span_tree,
                    **kwargs)
    model.fit(X)
    return model