def clusterFromDistMatrix(distanceMatrix):
    clusterer = HDBSCAN(min_cluster_size=2, metric='precomputed')
    clusterer.fit(distanceMatrix)
    labels = clusterer.labels_
    probs = clusterer.probabilities_
    labels = np.asarray(labels).reshape(-1, 1)
    probs = np.asarray(probs).reshape(-1, 1)
    results = np.concatenate((probs, gv.medSequenceMatrix), axis=1)
    results = np.concatenate((labels, results), axis=1)
    results = np.array(sorted(results, key=lambda a_entry: a_entry[0]))
    with open('treatmentClusters.txt', 'w') as csvfile:
        csvfile.write(
            "Cluster; Probability; ID; Month 1; Month 2; Month 3; Month 4; Month 5; Month 6"
        )
        csvfile.write('\n')
        for i in range(results.shape[0]):
            csvfile.write(str(results[i, 0]))
            csvfile.write(';')
            csvfile.write(str(results[i, 1]))
            csvfile.write(';')
            csvfile.write(str(results[i, 2]))
            csvfile.write(';')
            for j in range(3, results.shape[1]):
                csvfile.write(
                    str(results[i, j]).replace('{', '').replace('}', ''))
                csvfile.write(';')
            csvfile.write('\n')
Example #2
def hdbscan(X, min_cluster_size=5, gen_min_span_tree=True, **kwargs):
    """Clustering with Hierarchical DBSCAN.

    Parameters
    ----------
    X : array-like
         n x k attribute data
    min_cluster_size : int, default: 5
        the minimum number of points necessary to generate a cluster
    gen_min_span_tree : bool, default: True
        whether to generate the minimum spanning tree during fitting
    kwargs
        additional keyword arguments passed to hdbscan.HDBSCAN

    Returns
    -------
    fitted cluster instance: hdbscan.hdbscan.HDBSCAN

    """
    try:
        from hdbscan import HDBSCAN
    except ImportError:
        raise ImportError(
            "You must have the hdbscan package installed to use this function")

    model = HDBSCAN(min_cluster_size=min_cluster_size,
                    gen_min_span_tree=gen_min_span_tree,
                    **kwargs)
    model.fit(X)
    return model
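A minimal usage sketch for the wrapper above, assuming the hdbscan package is installed; the make_blobs data below is invented for illustration and is not part of the original snippet.

# Hedged usage sketch: fit the wrapper on synthetic blob data.
import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=200, centers=3, random_state=0)
model = hdbscan(X, min_cluster_size=10)
print(np.unique(model.labels_))  # cluster ids, with -1 marking noise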
Example #3
 def return_results(self, tiers):
     #Get the full matrix from the triangular matrix, get the row sums, and sort by them
     full_mat = self.matrix - self.matrix.T
     scores = np.sum(full_mat, axis=1)
     score_args = np.argsort(scores)[::-1][:self.limit]
     labs = list(np.array(self.labels)[score_args])
     scores = scores[score_args]
     tier_names = ['God', 'S', 'A', 'B', 'C', 'D', 'E', 'F', 'Garbage']
     max_tiers = 9
     #If the user has not specified a tier number to use, use HDBSCAN to come up with the number
     if tiers is None:
         tiers = 0
     if tiers <= 0:
         clusterer = HDBSCAN(min_cluster_size=2, min_samples=1, metric='l1')
         clusterer.fit(scores.reshape(-1, 1))
         tiers = max(clusterer.labels_)
         # print(clusterer.labels_)
     tiers = min(max_tiers, tiers + 1)
     #Use KMeans to split the scores into the set number of tiers
     clusterer = KMeans(n_clusters=tiers)
     try:
         clusterer.fit(scores.reshape(-1, 1))
     except OverflowError:
         print(scores, tiers)
         raise
     l_pairs = zip(list(scores), list(clusterer.labels_), labs)
     curr_tier = -1
     #Create the return string.
     for scr, tier, lab in l_pairs:
         if tier != curr_tier:
             print('\n')
             print(tier_names.pop(0) + ':')
         curr_tier = tier
         print('%s %.2f' % (lab, scr))
Example #4
    def run(self):
        if self.isStopped():
            self.canceled.emit()
            return False

        options = self.options
        clusterer = HDBSCAN(min_cluster_size=options.min_cluster_size,
                            min_samples=options.min_samples,
                            cluster_selection_epsilon=options.cluster_selection_epsilon,
                            cluster_selection_method=options.cluster_selection_method)
        layout_data = self._widget.get_layout_data()
        isolated_nodes = layout_data['isolated_nodes']
        layout = layout_data['layout']
        mask = np.ones_like(layout, dtype=bool)
        mask[isolated_nodes] = False
        x = layout[mask].reshape(-1, 2)
        clusterer.fit(x.astype(np.float64))

        i = 0
        result = []
        for n in self._widget.scene().nodes():
            if n.index() in isolated_nodes:
                result.append("Noise")
            else:
                result.append(f"Cluster {clusterer.labels_[i] + 1}" if clusterer.labels_[i] > 0 else "Noise")
                i += 1

        if not self.isStopped():
            return result
        else:
            self.canceled.emit()
Example #5
def hdbscan_samples(data, min_samples, n, filename):

    hdbscan = HDBSCAN(min_samples=min_samples, metric='haversine')

    data = data[np.random.randint(low=0, high=len(data), size=n), :]

    t0 = time.time()
    hdbscan.fit(np.radians(data))
    t1 = time.time() - t0

    clusters = len(np.unique(hdbscan.labels_))

    project = os.path.realpath('.')
    csv = os.path.join(project, filename)

    if not os.path.exists(csv):
        with open(csv, mode='w') as timing:
            timing.write('min_samples,n,clusters,seconds\n')

    with open(csv, mode='a') as timing:
        timing.write('{},{},{},{}\n'.format(min_samples, n, clusters, t1))

    print('HDBSCAN: {} samples, {} clusters, {} seconds'.format(
        n, clusters, t1))

    return t1
Example #6
    def hdbscan_clustering(self,
                           min_cluster_size=10,
                           min_cluster_portion=None,
                           min_samples=1,
                           metric='hamming',
                           cluster_selection_method='eom',
                           allow_single_cluster=True,
                           epsilon=0.2):
        if min_cluster_portion is not None:
            if min_cluster_size is None:
                min_cluster_size = 0
            min_cluster_size = max(min_cluster_size,
                                   self.n_obs * min_cluster_portion)
        else:
            if min_cluster_size is None:
                raise ValueError(
                    'Either min_cluster_size or min_cluster_portion should be provided'
                )

        runner = HDBSCAN(min_cluster_size=int(min_cluster_size),
                         min_samples=int(min_samples),
                         metric=metric,
                         cluster_selection_method=cluster_selection_method,
                         allow_single_cluster=allow_single_cluster)

        if self.leiden_result_df is None:
            raise ValueError(
                'Run multi_leiden_clustering first before hdbscan_clustering')
        runner.fit(self.leiden_result_df)
        self.hdbscan = runner
        self.reselect_clusters(epsilon=epsilon,
                               min_cluster_size=min_cluster_size)
        return
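The min_cluster_portion option simply floors min_cluster_size at a fraction of the number of observations before HDBSCAN is run. A small worked sketch of that arithmetic (the numbers are invented):

# Illustrative only: with 10,000 observations and min_cluster_portion=0.01,
# the effective minimum cluster size becomes max(10, 10000 * 0.01) = 100.
n_obs = 10_000
min_cluster_size = 10
min_cluster_portion = 0.01
print(int(max(min_cluster_size, n_obs * min_cluster_portion)))  # 100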
Example #7
    def fitness(self):
        clusterer = HDBSCAN(
            algorithm=self.parametros["algorithm"],
            min_cluster_size=self.parametros["min_cluster_size"],
            min_samples=self.parametros["min_samples"],
            cluster_selection_method=self.parametros["cluster_selection_method"],
            cluster_selection_epsilon=self.parametros["cluster_selection_epsilon"])

        clusterer.fit(self.data)
        self.labels = clusterer.labels_
        silhouette_score = self.silhouette_score(self.data, self.labels)

        # balance = self.balance(clusterer.labels_)
        # percents = self.calc_percents(clusterer.labels_)
        # len_labels = self.len_labels(clusterer.labels_)
        # noise_percents = [item for item in percents if item[0] == -1][0][1]

        score = silhouette_score

        # print(percents)
        # print("\n")
        # print('\'' + str(json.dumps(self.parametros)) + '\'')
        # print("\n")
        # print(score)
        # print("---------------------\n\n")

        return score
Example #8
def cluster_data_points(data_points,
                        cluster_size=5,
                        distance_metric_func="Fractional"):
    points = [d['encoding'] for d in data_points]
    points = np.vstack(points)
    scaler = StandardScaler()
    scaler.fit(points)
    points = scaler.transform(points)
    dist_metric = Similarity()
    if distance_metric_func == "Fractional":
        dist_metric_func = dist_metric.fractional_distance
    else:
        dist_metric_func = dist_metric.euclidean_distance
    clusterer = HDBSCAN(min_cluster_size=cluster_size,
                        metric='pyfunc',
                        func=dist_metric_func)
    clusterer.fit(points)
    logging.info("Fit complete.")
    results = {}
    labelIDs = np.unique(clusterer.labels_)
    for labelID in labelIDs:
        paths = []
        encodings = []
        idxs = np.where(clusterer.labels_ == labelID)[0]
        for i in idxs:
            data = data_points[i]
            paths.append(data['path'])
            encodings.append(data['encoding'])
        results[labelID] = {
            'paths': paths,
            'mean_encoding': np.mean(np.asarray(encodings), axis=0),
            'std_dev': np.std(encodings, axis=0),
            'sample_size': len(paths)
        }
    return results
Example #9
def cluster_hdbscan(above_gps):
    sample_by_feature = above_gps.to_numpy()
    print(sample_by_feature.shape)

    clusterer = HDBSCAN()
    clusterer.fit(sample_by_feature)
    cluster_ids = list(clusterer.labels_)

    return cluster_ids
Example #10
def hdbscan_cluster(df: pd.DataFrame,
                    min_cluster_size: int = 10,
                    gen_min_span_tree: bool = True):

    clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                        gen_min_span_tree=gen_min_span_tree)
    clusterer.fit(df)

    return clusterer.labels_, clusterer.probabilities_
Example #11
    def perform_hdbscan(self, min_cluster_size=15):
        hdbscan_clusterer = HDBSCAN(min_cluster_size, metric="precomputed")
        hdbscan_clusterer.fit(self.distance_matrix)
        self.hdbscan_results = {
            "parameters": hdbscan_clusterer.get_params(),
            "labels": hdbscan_clusterer.labels_,
            "probabilities": hdbscan_clusterer.probabilities_,
            "n_clusters": np.unique(hdbscan_clusterer.labels_).max() + 1,
            'clusters': label_cnt_dict(hdbscan_clusterer.labels_)
        }

        print_dict(self.hdbscan_results)
Example #12
def cluster(df, min_size=4, allow_single_cluster=True):
    """Use HDBSCAN --
    (Hierarchical Density-Based Spatial Clustering of Applications with Noise)
    to find the best clusters for the meander.
    """
    clusterer = HDBSCAN(min_cluster_size=min_size,
                        min_samples=3,
                        metric='haversine',
                        allow_single_cluster=allow_single_cluster)
    clusterer.fit(df[['lat', 'lng']])
    df.loc[:, 'label'] = ['ABCDEFGHIJKLMN'[i] for i in clusterer.labels_]
    return df.sort_values('label').reset_index(drop=True)
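A hedged usage sketch for the meander helper above; the coordinates are invented, and note that hdbscan's haversine metric expects radians, so real callers may want to convert lat/lng first (the snippet above passes the columns as-is).

# Hypothetical input: a handful of lat/lng points (degrees).
import pandas as pd

pts = pd.DataFrame({
    'lat': [48.8566, 48.8570, 48.8575, 40.7128, 40.7130, 40.7135],
    'lng': [2.3522, 2.3530, 2.3540, -74.0060, -74.0055, -74.0050],
})
labelled = cluster(pts, min_size=2)
print(labelled[['lat', 'lng', 'label']])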
Example #13
 def _run_hdbscan(affinity: np.ndarray, min_cluster_size_for_hdbscan: int, min_cluster_size: int, max_cluster_size: int):
     assert affinity.shape[0] == affinity.shape[1]
     if affinity.shape[0] > max_cluster_size:
         allow_single_cluster = False
     else:
         allow_single_cluster = True
     db = HDBSCAN(metric='precomputed',
                  min_cluster_size=min_cluster_size_for_hdbscan,
                  min_samples=1,
                  allow_single_cluster=allow_single_cluster)
     db.fit(affinity)
     return db
Example #14
 def hdbscan(self, min_cluster_size=10, prediction_data=False):
     """ DBSCAN but allows for varying density clusters and no longer
     requires epsilon parameter, which is difficult to tune.
     http://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html
     Scales slightly worse than DBSCAN, but with a more intuitive parameter.
     """
     hdbscan = HDBSCAN(min_cluster_size=min_cluster_size,
                         prediction_data=prediction_data)
     if prediction_data:
         return hdbscan.fit(self._safe_dense(self.matrix))
     else:
         return hdbscan.fit(self.matrix)
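The prediction_data flag above enables later soft assignment of new points via hdbscan's approximate_predict. A hedged sketch, using make_blobs data as a stand-in for self.matrix:

# Hedged sketch: soft-assign points after fitting with prediction_data=True.
import numpy as np
from hdbscan import HDBSCAN, approximate_predict
from sklearn.datasets import make_blobs

matrix, _ = make_blobs(n_samples=200, centers=3, random_state=0)  # stand-in for self.matrix
clusterer = HDBSCAN(min_cluster_size=10, prediction_data=True).fit(matrix)
new_labels, strengths = approximate_predict(clusterer, matrix[:5])
print(new_labels, strengths)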
Example #15
class HDBSCAN:
    def __init__(self):
        self.cluster = HDBSCANBase(algorithm='best',
                                   approx_min_span_tree=True,
                                   gen_min_span_tree=False,
                                   leaf_size=40,
                                   metric='euclidean',
                                   min_cluster_size=15,
                                   min_samples=15,
                                   p=None)

    def fit(self, X, y=None):
        self.cluster.fit(X)
Example #16
def clusters(cat, mask, colors):

    table = cat[mask]
    table.keep_columns(colors)
    data = table.to_pandas()

    clusterer = HDBSCAN(min_cluster_size=20)  #100 for real
    clusterer.fit(data)

    labels = Table.Column(clusterer.labels_, name='ct')
    proba = Table.Column(clusterer.probabilities_, name='prob_ct')
    stars = Table([cat[mask]['XMMSRCID'], labels, proba])

    return stars
Example #17
    def cluster(self, progress=None):
        images = list()
        with open('encoding_data.pkl', 'rb') as file:
            data = pickle.load(file)
        data = np.array(data)
        encodings = [d['encoding'] for d in data]
        X = np.vstack(encodings)
        pca = PCA(n_components='mle', svd_solver='full')
        X_new = pca.fit_transform(X)
        clt = HDBSCAN(metric='euclidean', min_cluster_size=10)
        clt.fit(X_new)

        labelIDs = np.unique(clt.labels_)
        done = 0
        increment = float(100.00 / len(labelIDs))
        if progress is not None:
            progress.setValue(0)
        for labelID in labelIDs:
            faces = list()
            idxs = np.where(clt.labels_ == labelID)[0]
            idxs = np.random.choice(idxs,
                                    size=min(25, len(idxs)),
                                    replace=False)
            for i in idxs:
                image = cv2.imread(data[i]['path'])
                (h, w) = image.shape[:2]
                image = cv2.resize(image, (int(w * 0.25), int(h * 0.25)))
                (t, r, b, l) = data[i]['loc']
                face = image[t:b, l:r]
                face = cv2.resize(face, (96, 96))
                faces.append(face)

            montage = build_montages(faces, (96, 96), (5, 5))[0]
            if progress is not None:
                done += increment
                progress.setValue(done)
            title = 'Face ID #{}'.format(labelID)
            title = 'Unknown Faces' if labelID == -1 else title
            cv2.imshow(title, montage)
            key = cv2.waitKey(0) & 0xFF
            if key == ord('k'):
                idxs = np.where(clt.labels_ == labelID)[0]
                for i in idxs:
                    images.append(data[i]['path'])
                cv2.destroyAllWindows()
            elif key == ord('n'):
                cv2.destroyAllWindows()

        return images
Example #18
 def HDBSCAN(input_data):
     if input_data.size < 1:
         return np.zeros((0)), np.zeros((0))
     hdbscan_object = Hierarchical_DBSCAN(allow_single_cluster=True)
     hdbscan_object.fit(input_data)
     labels = hdbscan_object.labels_
     uniqueLabels, Count = np.unique(labels, return_counts=True)
     if len(uniqueLabels) > 10:
         ind_sorted = np.argsort(-Count)
         lbls_sorted = uniqueLabels[ind_sorted]
         lbls_rm = lbls_sorted[10:-1]
         for i in lbls_rm:
             labels[labels == i] = -1
         labels = np.unique(labels, return_inverse=True)[1] - 1
     return labels
Example #19
 def HDBSCAN(
     self, parameters
 ):  # data, min_cluster_size, min_samples, alpha, cluster_selection_method):
     result = {}
     default_min_cluster_size = 3
     default_min_samples = 3
     default_alpha = 0.5  # float greater than 1
     default_cluster_selection_method = "eom"  # "eom", "leaf"
     data = np.array(parameters['data'])
     data = preprocessing.MinMaxScaler().fit_transform(data)
     if parameters.get('min_cluster_size') is not None:
         default_min_cluster_size = int(parameters['min_cluster_size'])
     if parameters.get('min_samples') is not None:
         default_min_samples = int(parameters['min_samples'])
     if parameters.get('alpha') is not None:
         default_alpha = float(parameters['alpha'])
     if parameters.get('cluster_selection_method') is not None:
         default_cluster_selection_method = str(
             parameters['cluster_selection_method'])
     model = HDBSCAN(
         min_cluster_size=default_min_cluster_size,
         min_samples=default_min_samples,
         alpha=default_alpha,
         cluster_selection_method=default_cluster_selection_method,
         allow_single_cluster=True)
     clustering = model.fit(data)
     result['clustering'] = clustering
     return result
Example #20
def TrainCluster(x):
    clusterer = HDBSCAN(min_cluster_size=1250,
                        gen_min_span_tree=True,
                        prediction_data=True)  # creating a clustering object
    hdb = clusterer.fit(x)  # Fitting cluster object on training data
    hdb.prediction_data
    del x
    return hdb
Example #21
    def fit(self, X):
        """
        Apply the ST-DBSCAN algorithm.

        Parameters
        ----------
        X : 2D numpy array
            The first column should be the time attribute as a float; the
            remaining columns are treated as spatial coordinates, i.e. the
            structure should look like [[time_step1, x, y], [time_step2, x, y], ...].
            For example:
            array([[0, 0.45, 0.43],
                   [0, 0.54, 0.34], ...])
        Returns
        -------
        self
        """
        # check if input is correct
        X = check_array(X)

        if not self.eps1 > 0.0 or not self.eps2 > 0.0 or not self.min_samples > 0.0:
            raise ValueError('eps1, eps2, minPts must be positive')

        n, m = X.shape

        # Compute square-form distance matrices for the 'time' attribute and the spatial attributes
        time_dist = squareform(pdist(X[:, 0].reshape(n, 1),
                                     metric=self.metric))
        euc_dist = squareform(pdist(X[:, 1:], metric=self.metric))

        # filter the euc_dist matrix using the time_dist
        dist = np.where(time_dist <= self.eps2, euc_dist, 2 * self.eps1)

        db = HDBSCAN(min_samples=self.min_samples, metric='precomputed')

        # db = DBSCAN(eps=self.eps1,
        #            min_samples=self.min_samples,
        #            metric='precomputed')

        db.fit(dist)

        self.labels = db.labels_

        return self
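A hedged sketch of how this fit method might be called. Only fit() is shown above, so the class name ST_HDBSCAN and its constructor arguments below are assumptions; the input layout follows the docstring.

# Illustrative input in the documented [[time_step, x, y], ...] layout.
import numpy as np

X = np.array([
    [0, 0.45, 0.43],
    [0, 0.54, 0.34],
    [1, 0.52, 0.36],
    [5, 0.10, 0.90],
    [5, 0.12, 0.88],
    [6, 0.11, 0.89],
])
# Hypothetical constructor: eps1/eps2/min_samples/metric match the attributes used in fit().
st = ST_HDBSCAN(eps1=0.2, eps2=2, min_samples=2, metric='euclidean')
st.fit(X)
print(st.labels)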
Example #22
    def cluster_data_points(self, data=None, processed=False):
        if data is None or len(data) < 1:
            return None
        if processed is True:
            with open('video_data.pkl', 'rb') as file:
                data = pickle.load(file)
                self.clusterSize = self.cs
        points = [d['encoding'] for d in data]
        points = np.vstack(points)
        points = normalize(points, norm='l2', axis=1)
        dist_metric = Similarity()

        clusterer = HDBSCAN(min_cluster_size=self.clusterSize,
                            metric='pyfunc',
                            func=dist_metric.fractional_distance)
        clusterer.fit(points)
        results = dict()

        labelIDs = np.unique(clusterer.labels_)
        for labelID in labelIDs:
            idxs = np.where(clusterer.labels_ == labelID)[0]
            encodings = list()
            for i in idxs:
                if labelID not in results:
                    results[labelID] = dict()
                    results[labelID]['paths'] = list()
                    results[labelID]['mean_encoding'] = None
                    results[labelID]['std_dev'] = None
                results[labelID]['paths'].append(data[i]['path'])
                encodings.append(data[i]['encoding'])
            results[labelID]['mean_encoding'], results[labelID][
                'std_dev'] = self._compute_statistics(encodings)

        if processed is False:
            return results
        else:
            with open('video_results.pkl', 'wb') as file:
                pickle.dump(results, file, protocol=pickle.HIGHEST_PROTOCOL)

                return results
Example #23
 def fit(self, data, min_cluster_size, min_samples, alpha,
         cluster_selection_method):
     data = np.array(data)
     data = preprocessing.MinMaxScaler().fit_transform(data)
     model = HDBSCAN(min_cluster_size=min_cluster_size,
                     min_samples=min_samples,
                     alpha=alpha,
                     cluster_selection_method=cluster_selection_method,
                     allow_single_cluster=True)
     clustering = model.fit(data)
     return clustering
Example #24
    def HDBSCAN_cluster(d_array, **kwargs):
        """
        
        :param d_array: 
        :param min_members: 
        :return: 
        """
        clusterer = HDBSCAN(**kwargs)
        in_arr = np.array(d_array.nonzero()).T

        clusterer.fit(in_arr)
        labels = clusterer.labels_

        a = np.zeros(d_array.shape)
        for clust, tup in zip(labels, in_arr):
            if clust >= 0:
                a[tuple(tup)] = clust + 1
            else:
                a[tuple(tup)] = clust
        return a
Example #25
    def HDBSCAN_cluster(d_array, **kwargs):
        """
        Performs density-based clustering on input 3d map.
        :param d_array: input numpy array (usually 3d map)
        :param kwargs: keyword arguments passed to HDBSCAN (e.g. min_cluster_size)
        :return: numpy array
        """
        clusterer = HDBSCAN(**kwargs)
        in_arr = np.array(d_array.nonzero()).T

        clusterer.fit(in_arr)
        labels = clusterer.labels_

        a = np.zeros(d_array.shape)
        for clust, tup in zip(labels, in_arr):
            if clust >= 0:
                a[tuple(tup)] = clust + 1
            else:
                a[tuple(tup)] = clust
        return a
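A hedged usage sketch for HDBSCAN_cluster on a toy 3D map, assuming it is exposed as a standalone helper (it appears as a method above); the array contents are invented and kwargs are forwarded to HDBSCAN as in the snippet.

# Toy 3D map: two small blobs of nonzero voxels.
import numpy as np

d_array = np.zeros((10, 10, 10))
d_array[1:3, 1:3, 1:3] = 1.0
d_array[7:9, 7:9, 7:9] = 1.0
labelled = HDBSCAN_cluster(d_array, min_cluster_size=5)
print(np.unique(labelled))  # 0 for background voxels, cluster labels (or -1 for noise) elsewhere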
Example #26
    def cluster_data_points(self):
        with open('data_points.pkl', 'rb') as file:
            data = pickle.load(file)

        points = [d['encoding'] for d in data]
        points = np.vstack(points)
        # points = normalize(points, norm='l2', axis=1)
        scaler = StandardScaler()
        scaler.fit(points)
        points = scaler.transform(points)
        with open('standardization_data.pkl', 'wb') as file:
            std_data = {'s_mean': scaler.mean_, 's_var': scaler.var_}
            pickle.dump(std_data, file, protocol=pickle.HIGHEST_PROTOCOL)
        dist_metric = Similarity()

        clusterer = HDBSCAN(min_cluster_size=self.clusterSize,
                            metric='pyfunc',
                            func=dist_metric.fractional_distance)
        clusterer.fit(points)
        results = dict()

        labelIDs = np.unique(clusterer.labels_)
        for labelID in labelIDs:
            idxs = np.where(clusterer.labels_ == labelID)[0]
            encodings = list()
            for i in idxs:
                if labelID not in results:
                    results[labelID] = dict()
                    results[labelID]['paths'] = list()
                    results[labelID]['mean_encoding'] = None
                    results[labelID]['std_dev'] = None
                results[labelID]['paths'].append(data[i]['path'])
                encodings.append(data[i]['encoding'])
            results[labelID]['mean_encoding'], results[labelID][
                'std_dev'] = self._compute_statistics(encodings)
            results[labelID]['sample_size'] = len(results[labelID]['paths'])

        with open('results.pkl', 'wb') as file:
            pickle.dump(results, file, protocol=pickle.HIGHEST_PROTOCOL)

        return True
Example #27
    def hdbscan_on_points(self, min_cluster_size, min_samples, xyz=False):
        """
        Performs hdbscan on input points.
        :param min_cluster_size: [int], minimum cluster size
        :param min_samples: [int], min samples
        >> see https://hdbscan.readthedocs.io/en/latest/parameter_selection.html

        :param xyz: [bool] if True the clustering will be done over xyz otherwise xy.

        :return: writes the points assigned with clusters to self.clustered_points
        """

        masked_points = self.raw_points[self.masks]
        start = time.time()
        if xyz:
            xy = np.array([masked_points['X'], masked_points['Y'], masked_points['Z']]).T
        else:
            xy = np.array([masked_points['X'], masked_points['Y']]).T

        xy_clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                               min_samples=min_samples)
        xy_clusterer.fit(xy)

        clustered_points = pd.DataFrame({'X': masked_points['X'],
                                         'Y': masked_points['Y'],
                                         'Z': masked_points['Z'],
                                         'Red': masked_points['Red'],
                                         'Green': masked_points['Green'],
                                         'Blue': masked_points['Blue'],
                                         'HAG': masked_points['HAG'],
                                         'Coplanar': masked_points['Coplanar'],
                                         'NormalX': masked_points['NormalX'],
                                         'NormalY': masked_points['NormalY'],
                                         'NormalZ': masked_points['NormalZ'],
                                         'Classification': xy_clusterer.labels_})
        # remove "noise" points
        self.clustered_points = clustered_points[clustered_points.Classification >= 0]
        end = time.time()
        print(f'found {len(np.unique(self.clustered_points.Classification))} xy_clusters')
        print(f'clustering on xy took {round(end - start, 2)} seconds')
Example #28
    def _summarize_multi_leiden(self):
        # Here we don't rely on the hdbscan clustering itself; it is only used to reduce the pairwise distance calculation.
        # The resulting clusters are simply the connected components of a hamming-distance graph.
        # This results in an over-clustering, which will be merged by the supervised learning step.
        hdbscan = HDBSCAN(min_cluster_size=2,
                          min_samples=1,
                          metric='hamming',
                          cluster_selection_method='eom',
                          allow_single_cluster=True)
        hdbscan.fit(self.leiden_result_df)
        clusters = {}
        cur_num = 0
        for _, sub_df in self.leiden_result_df.groupby(
                pd.Series(hdbscan.labels_, index=self.leiden_result_df.index)):
            pairwise_dist = pairwise_distances(sub_df, metric='hamming')
            # create a graph, cells within hamming_dist_cutoff are connected
            rows, cols = np.where(pairwise_dist < self.consensus_rate)
            edges = zip(sub_df.index[rows].tolist(),
                        sub_df.index[cols].tolist())
            g = nx.Graph()
            g.add_edges_from(edges)
            for comp in nx.connected_components(g):
                if len(comp) >= self.min_cluster_size:
                    for node in comp:
                        # each disconnected component assigned to a cluster
                        clusters[node] = cur_num
                    cur_num += 1
                else:
                    for node in comp:
                        clusters[node] = -1

        clusters = pd.Series(clusters).sort_index()
        print(
            f'{(clusters != -1).sum()} cells assigned to {clusters.unique().size - 1} raw clusters'
        )
        print(f'{(clusters == -1).sum()} cells are multi-leiden outliers')
        self._multi_leiden_clusters = clusters.values
        return
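The comments above describe the core idea: HDBSCAN only pre-partitions cells, and the actual clusters are connected components of a graph whose edges link cells whose hamming distance falls below consensus_rate. A stripped-down sketch of that graph step on a toy 0/1 label matrix (the data and the 0.4 cutoff are invented):

# Minimal sketch of the connected-components step, with made-up data.
import numpy as np
import networkx as nx
import pandas as pd
from sklearn.metrics import pairwise_distances

leiden_df = pd.DataFrame(
    [[0, 0, 1], [0, 0, 1], [1, 1, 0], [1, 1, 0], [0, 1, 1]],
    index=['c1', 'c2', 'c3', 'c4', 'c5'])
dist = pairwise_distances(leiden_df, metric='hamming')
rows, cols = np.where(dist < 0.4)          # 0.4 plays the role of consensus_rate
g = nx.Graph()
g.add_edges_from(zip(leiden_df.index[rows], leiden_df.index[cols]))
print(list(nx.connected_components(g)))    # e.g. [{'c1', 'c2', 'c5'}, {'c3', 'c4'}]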
Example #29
def Get_cluster_labels(input_data, Doc2Vec_model):
    '''
        Input: List of Results from search query, pre-loaded Doc2Vec_model (to prevent re-loading each time)
        Output: list of cluster labels and list of cluster membership probabilities
    ----------------------------------------------------------------------------
    '''

    Lem_words = [input_data[x]['Lemmatized'].split() for x in range(len(input_data))]
    vectors = [Doc2Vec_model.infer_vector(document) for document in Lem_words]

    distance = pairwise_distances(vectors, metric='cosine')
    clusterer = HDBSCAN(metric='precomputed', cluster_selection_method='leaf')
    db = clusterer.fit(distance.astype('float64'))
    return db.labels_, db.probabilities_
Example #30
def hdbscan(X, min_cluster_size=5, gen_min_span_tree=True, **kwargs):
    """Clustering with Hierarchical DBSCAN

    Parameters
    ----------
    X : array-like
         n x k attribute data

    min_cluster_size : int, default: 5
        the minimum number of points necessary to generate a cluster

    gen_min_span_tree : bool, default: True
        whether to generate the minimum spanning tree during fitting

    Returns
    -------
    model: hdbscan HDBSCAN instance

    """

    model = HDBSCAN(min_cluster_size=min_cluster_size,
                    gen_min_span_tree=gen_min_span_tree,
                    **kwargs)
    model.fit(X)
    return model