Example #1
    def fitness(self):
        clusterer = HDBSCAN(
            algorithm=self.parametros["algorithm"],
            min_cluster_size=self.parametros["min_cluster_size"],
            min_samples=self.parametros["min_samples"],
            cluster_selection_method=self.parametros["cluster_selection_method"],
            cluster_selection_epsilon=self.parametros["cluster_selection_epsilon"])

        clusterer.fit(self.data)
        self.labels = clusterer.labels_
        silhouette_score = self.silhouette_score(self.data, self.labels)

        # balance = self.balance(clusterer.labels_)
        # percents = self.calc_percents(clusterer.labels_)
        # len_labels = self.len_labels(clusterer.labels_)
        # noise_percents = [item for item in percents if item[0] == -1][0][1]

        score = silhouette_score

        # print(percents)
        # print("\n")
        # print('\'' + str(json.dumps(self.parametros)) + '\'')
        # print("\n")
        # print(score)
        # print("---------------------\n\n")

        return score
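A minimal standalone sketch of the same scoring idea, assuming self.silhouette_score wraps sklearn.metrics.silhouette_score (the original helper is not shown); the guard for fewer than two labels is an addition, since silhouette_score requires at least two distinct labels:

import numpy as np
from hdbscan import HDBSCAN
from sklearn.metrics import silhouette_score

def hdbscan_silhouette(data, params):
    clusterer = HDBSCAN(min_cluster_size=params["min_cluster_size"],
                        min_samples=params["min_samples"]).fit(data)
    labels = clusterer.labels_
    if len(np.unique(labels)) < 2:
        return -1.0  # silhouette_score needs at least two distinct labels
    return silhouette_score(data, labels)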
Example #2
def clustering(umap_embedding_fit, umap_embedding_predict, min_cluster_size, prediction_data):
    print("clustering...")
    hdbscan = HDBSCAN(min_cluster_size=min_cluster_size, prediction_data=prediction_data).fit(umap_embedding_fit)
    clustering = hdbscan.fit_predict(umap_embedding_predict)
    labels = hdbscan.labels_

    return clustering, labels
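When prediction_data=True, new points can be labelled against the already-fitted model instead of re-fitting (note that fit_predict above re-fits the clusterer on umap_embedding_predict). A hedged sketch using hdbscan.prediction.approximate_predict:

from hdbscan import HDBSCAN
from hdbscan.prediction import approximate_predict

def clustering_with_prediction(umap_embedding_fit, umap_embedding_predict, min_cluster_size):
    clusterer = HDBSCAN(min_cluster_size=min_cluster_size, prediction_data=True).fit(umap_embedding_fit)
    # label unseen points against the fitted hierarchy without re-fitting
    predicted_labels, strengths = approximate_predict(clusterer, umap_embedding_predict)
    return predicted_labels, clusterer.labels_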
Example #3
def test_hdbscan_allow_single_cluster_with_epsilon():
    np.random.seed(0)
    no_structure = np.random.rand(150, 2)
    # without epsilon we should see many noise points as children of root.
    labels = HDBSCAN(
        min_cluster_size=5,
        cluster_selection_epsilon=0.0,
        cluster_selection_method="eom",
        allow_single_cluster=True,
    ).fit_predict(no_structure)
    unique_labels, counts = np.unique(labels, return_counts=True)
    assert len(unique_labels) == 2
    assert counts[unique_labels == -1] == 46

    # for this random seed an epsilon of 0.2 will produce exactly 2 noise
    # points at that cut in single linkage.
    labels = HDBSCAN(
        min_cluster_size=5,
        cluster_selection_epsilon=0.2,
        cluster_selection_method="eom",
        allow_single_cluster=True,
    ).fit_predict(no_structure)
    unique_labels, counts = np.unique(labels, return_counts=True)
    assert len(unique_labels) == 2
    assert counts[unique_labels == -1] == 2
Example #4
def hdbscan_samples(data, min_samples, n, filename):

    hdbscan = HDBSCAN(min_samples=min_samples, metric='haversine')

    data = data[np.random.randint(low=0, high=len(data), size=n), :]

    t0 = time.time()
    hdbscan.fit(np.radians(data))
    t1 = time.time() - t0

    clusters = len(np.unique(hdbscan.labels_))

    project = os.path.realpath('.')
    csv = os.path.join(project, filename)

    if not os.path.exists(csv):
        with open(csv, mode='w') as timing:
            timing.write('min_samples,n,clusters,seconds\n')

    with open(csv, mode='a') as timing:
        timing.write('{},{},{},{}\n'.format(min_samples, n, clusters, t1))

    print('HDBSCAN: {} samples, {} clusters, {} seconds'.format(
        n, clusters, t1))

    return t1
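A hedged usage sketch of the benchmark above; the coordinate array and the parameter grid are illustrative assumptions (random lat/lon pairs in degrees, which hdbscan_samples converts to radians internally):

import numpy as np

coords = np.random.uniform(low=(-90.0, -180.0), high=(90.0, 180.0), size=(100000, 2))
for n in (1000, 5000, 10000):
    for min_samples in (5, 15, 30):
        hdbscan_samples(coords, min_samples=min_samples, n=n, filename='hdbscan_timing.csv')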
Example #5
def test_switch_to_leaf():
    """
    Verify that when we request more clusters than 'eom' can handle,
        method switches to 'leaf' and the results match 'leaf'.
    """
    # Given the max number of clusters that can be produced by 'eom',
    #   (these are produced for epsilon=0) (??? Needs verification)
    clusterer = HDBSCAN(cluster_selection_method='eom',
                        cluster_selection_epsilon=0).fit(X)
    max_clusters = n_clusters_from_labels(clusterer.labels_)

    with warnings.catch_warnings(record=True) as w:
        # When we try flat clustering with 'eom' method for more n_clusters,
        clusterer_flat = HDBSCAN_flat(X,
                                      cluster_selection_method='eom',
                                      n_clusters=max_clusters + 2)
        # Then, a warning is raised saying 'eom' can't get this clustering,
        assert len(w) > 0
        assert issubclass(w[-1].category, UserWarning)
        assert "Cannot predict" in str(w[-1].message)

    # the resulting clusterer switches to using method 'leaf',
    assert clusterer_flat.cluster_selection_method == 'leaf', (
        "cluster selection method has not switched to 'leaf'")
    # and the resulting probabilities and labels must match
    epsilon = clusterer_flat.cluster_selection_epsilon
    clusterer_leaf = HDBSCAN(cluster_selection_method='leaf',
                             cluster_selection_epsilon=epsilon).fit(X)
    assert_array_equal(clusterer_flat.labels_, clusterer_leaf.labels_)
    assert_array_equal(clusterer_flat.probabilities_,
                       clusterer_leaf.probabilities_)
    return
Example #6
    def apply(self, fX):
        from hdbscan import HDBSCAN
        clusterer = HDBSCAN(min_cluster_size=self.min_cluster_size,
                            min_samples=self.min_samples,
                            metric='precomputed')
        distance_matrix = squareform(pdist(fX, metric=self.metric))

        # apply clustering
        cluster_labels = clusterer.fit_predict(distance_matrix)

        # cluster embedding
        n_clusters = np.max(cluster_labels) + 1

        if n_clusters < 2:
            return np.zeros(fX.shape[0], dtype=int)

        fC = l2_normalize(
            np.vstack([np.sum(fX[cluster_labels == k, :], axis=0)
                       for k in range(n_clusters)]))

        # tag each undefined embedding to closest cluster
        undefined = cluster_labels == -1
        closest_cluster = np.argmin(
            cdist(fC, fX[undefined, :], metric=self.metric), axis=0)
        cluster_labels[undefined] = closest_cluster

        return cluster_labels
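A plausible implementation of the l2_normalize helper assumed above (hypothetical; the original is not shown): it scales each summed cluster embedding to unit Euclidean norm so clusters and embeddings are comparable under the chosen metric:

import numpy as np

def l2_normalize(X):
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid division by zero for all-zero rows
    return X / norms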
Example #7
def test_flat_base_default():
    """
    Verify that the default clustering of HDBSCAN is preserved.
    """
    # Given, the base HDBSCAN with method 'eom'
    clusterer = HDBSCAN(cluster_selection_method='eom').fit(X)
    n_clusters = n_clusters_from_labels(clusterer.labels_)

    # When we ask for flat clustering with same n_clusters,
    clusterer_flat = HDBSCAN_flat(X,
                                  n_clusters=n_clusters,
                                  cluster_selection_method='eom')

    # Then, the labels and probabilities should match
    assert_array_equal(clusterer_flat.labels_, clusterer.labels_)
    assert_array_equal(clusterer_flat.probabilities_, clusterer.probabilities_)

    # Given, the base HDBSCAN with method 'leaf'
    clusterer = HDBSCAN(cluster_selection_method='leaf').fit(X)
    n_clusters = n_clusters_from_labels(clusterer.labels_)

    # When we ask for flat clustering with same n_clusters,
    clusterer_flat = HDBSCAN_flat(X,
                                  n_clusters=n_clusters,
                                  cluster_selection_method='leaf')

    # Then, the labels and probabilities should match
    assert_array_equal(clusterer_flat.labels_, clusterer.labels_)
    assert_array_equal(clusterer_flat.probabilities_, clusterer.probabilities_)
    return
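A plausible implementation of the n_clusters_from_labels helper these tests rely on (hypothetical): HDBSCAN labels clusters 0..k-1 and noise as -1, so the cluster count is the largest label plus one:

import numpy as np

def n_clusters_from_labels(labels):
    return int(np.max(labels)) + 1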
Example #8
    def hdbscan_clustering(self,
                           min_cluster_size=10,
                           min_cluster_portion=None,
                           min_samples=1,
                           metric='hamming',
                           cluster_selection_method='eom',
                           allow_single_cluster=True,
                           epsilon=0.2):
        if min_cluster_portion is not None:
            if min_cluster_size is None:
                min_cluster_size = 0
            min_cluster_size = max(min_cluster_size,
                                   self.n_obs * min_cluster_portion)
        else:
            if min_cluster_size is None:
                raise ValueError(
                    'Either min_cluster_size or min_cluster_portion should be provided'
                )

        runner = HDBSCAN(min_cluster_size=int(min_cluster_size),
                         min_samples=int(min_samples),
                         metric=metric,
                         cluster_selection_method=cluster_selection_method,
                         allow_single_cluster=allow_single_cluster)

        if self.leiden_result_df is None:
            raise ValueError(
                'Run multi_leiden_clustering first before hdbscan_clustering')
        runner.fit(self.leiden_result_df)
        self.hdbscan = runner
        self.reselect_clusters(epsilon=epsilon,
                               min_cluster_size=min_cluster_size)
        return
Example #9
    def _cluster_train(df):

        start_time = time.time()
        # Note: Issue #88 is open; prediction_data cannot be used with the Haversine metric https://github.com/scikit-learn-contrib/hdbscan/issues/88
        db = HDBSCAN(min_samples=1,
                     metric='haversine',
                     core_dist_n_jobs=-1,
                     memory='./__pycache__/',
                     prediction_data=True)

        coords = df[['latitude', 'longitude']] * np.pi / 180

        df = df.assign(cluster=db.fit_predict(coords))
        # get the number of clusters (labels run from 0..k-1, with -1 for noise)
        num_clusters = db.labels_.max() + 1
        message = 'Clustered {:,} points down to {:,} clusters, for {:.1f}% compression in {:,.2f} seconds'
        print(
            message.format(len(df), num_clusters,
                           100 * (1 - float(num_clusters) / len(df)),
                           time.time() - start_time))

        # Get the point closest to the center of each cluster
        # (assumes _get_centermost_point returns a (latitude, longitude) pair)
        cluster_centers = df[[
            'cluster', 'latitude', 'longitude'
        ]].groupby('cluster')[['latitude', 'longitude']].apply(
            lambda x: pd.Series(_get_centermost_point(x.values),
                                index=['latitude', 'longitude']))

        df = df.merge(cluster_centers,
                      left_on='cluster',
                      right_index=True,
                      how='left',
                      suffixes=('', '_cluster'))
        return db, cluster_centers, df
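A plausible implementation of the _get_centermost_point helper assumed above (hypothetical), following the common pattern of returning the point of a cluster closest to the cluster centroid by great-circle distance; it assumes each input row is a (latitude, longitude) pair:

from geopy.distance import great_circle
from shapely.geometry import MultiPoint

def _get_centermost_point(cluster):
    # centroid of the cluster's points, then the member point nearest to it
    centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
    return min(cluster, key=lambda point: great_circle(point, centroid).m)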
Example #10
File: clusterize.py  Project: metgem/metgem
    def run(self):
        if self.isStopped():
            self.canceled.emit()
            return False

        options = self.options
        clusterer = HDBSCAN(min_cluster_size=options.min_cluster_size,
                            min_samples=options.min_samples,
                            cluster_selection_epsilon=options.cluster_selection_epsilon,
                            cluster_selection_method=options.cluster_selection_method)
        layout_data = self._widget.get_layout_data()
        isolated_nodes = layout_data['isolated_nodes']
        layout = layout_data['layout']
        mask = np.ones_like(layout, dtype=bool)
        mask[isolated_nodes] = False
        x = layout[mask].reshape(-1, 2)
        clusterer.fit(x.astype(np.float64))

        i = 0
        result = []
        for n in self._widget.scene().nodes():
            if n.index() in isolated_nodes:
                result.append("Noise")
            else:
                result.append(f"Cluster {clusterer.labels_[i] + 1}" if clusterer.labels_[i] > 0 else "Noise")
                i += 1

        if not self.isStopped():
            return result
        else:
            self.canceled.emit()
Example #11
    def get_clusters(self, coordinates, original_df, coordinates_df,
                     csv_path):
        """
        It employs the HDBSCAN method to gather the supplied coordinates
        into clusters.

        Parameters
        ----------
        coordinates : numpy.array
            The array of coordinates that will be clustered. Its shape must
            fulfill the following dimensions: [M, N, 3], where M is the
            total number of models that have been sampled with PELE and
            N is the total number of atoms belonging to the residue that
            is being analyzed
        original_df : pandas.DataFrame
            Original dataframe from Analysis to be overwritten
        coordinates_df : pandas.DataFrame
            The filtered dataframe which was used to extract coordinates for
            clustering
        csv_path : str
            Directory where the CSV will be saved

        Returns
        -------
        clusters : numpy.array
            The array of cluster labels assigned to each conformer from
            the supplied array
        """
        from hdbscan import HDBSCAN
        coordinates = Clustering.fix_coordinates_shape(coordinates)
        clustering_method = HDBSCAN(cluster_selection_epsilon=self._bandwidth)
        clusters = clustering_method.fit_predict(coordinates)
        self._save_cluster_info(original_df, coordinates_df,
                                clusters, csv_path)
        return clusters
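A minimal sketch of the Clustering.fix_coordinates_shape helper assumed above (hypothetical): HDBSCAN expects a 2-D array, so the [M, N, 3] per-model atom coordinates are flattened to one row of length N * 3 per model:

import numpy as np

class Clustering:
    @staticmethod
    def fix_coordinates_shape(coordinates):
        coordinates = np.asarray(coordinates)
        # [M, N, 3] -> [M, N * 3]
        return coordinates.reshape(coordinates.shape[0], -1)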
Example #12
def cluster_data_points(data_points,
                        cluster_size=5,
                        distance_metric_func="Fractional"):
    points = [d['encoding'] for d in data_points]
    points = np.vstack(points)
    scaler = StandardScaler()
    scaler.fit(points)
    points = scaler.transform(points)
    dist_metric = Similarity()
    if distance_metric_func == "Fractional":
        dist_metric_func = dist_metric.fractional_distance
    else:
        dist_metric_func = dist_metric.euclidean_distance
    clusterer = HDBSCAN(min_cluster_size=cluster_size,
                        metric='pyfunc',
                        func=dist_metric_func)
    clusterer.fit(points)
    logging.info("Fit complete.")
    results = {}
    labelIDs = np.unique(clusterer.labels_)
    for labelID in labelIDs:
        paths = []
        encodings = []
        idxs = np.where(clusterer.labels_ == labelID)[0]
        for i in idxs:
            data = data_points[i]
            paths.append(data['path'])
            encodings.append(data['encoding'])
        results[labelID] = {
            'paths': paths,
            'mean_encoding': np.mean(np.asarray(encodings), axis=0),
            'std_dev': np.std(encodings, axis=0),
            'sample_size': len(paths)
        }
    return results
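A minimal sketch of the Similarity helper assumed above (hypothetical). The fractional distance is a Minkowski-style distance with an exponent below 1, which is sometimes preferred over Euclidean distance for high-dimensional encodings; either bound method can be handed to HDBSCAN via metric='pyfunc', func=...:

import numpy as np

class Similarity:
    def fractional_distance(self, x, y, fraction=0.5):
        return float(np.sum(np.abs(x - y) ** fraction) ** (1.0 / fraction))

    def euclidean_distance(self, x, y):
        return float(np.sqrt(np.sum((x - y) ** 2)))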
Example #13
def hdbscan(X, min_cluster_size=5, gen_min_span_tree=True, **kwargs):
    """Clustering with Hierarchical DBSCAN.

    Parameters
    ----------
    X : array-like
         n x k attribute data
    min_cluster_size : int, default: 5
        the minimum number of points necessary to generate a cluster
    gen_min_span_tree : bool, default: True
        whether to generate the minimum spanning tree during fitting
    kwargs
        additional keyword arguments passed through to hdbscan.HDBSCAN

    Returns
    -------
    fitted cluster instance: hdbscan.hdbscan.HDBSCAN

    """
    try:
        from hdbscan import HDBSCAN
    except ImportError:
        raise ImportError(
            "You must have the hdbscan package installed to use this function")

    model = HDBSCAN(min_cluster_size=min_cluster_size,
                    gen_min_span_tree=gen_min_span_tree, **kwargs)
    model.fit(X)
    return model
Example #14
    def HDBSCAN(
        self, parameters
    ):  # data, min_cluster_size, min_samples, alpha, cluster_selection_method
        result = {}
        default_min_cluster_size = 3
        default_min_samples = 3
        default_alpha = 0.5  # float greater than 1
        default_cluster_selection_method = "eom"  # "eom", "leaf"
        data = np.array(parameters['data'])
        data = preprocessing.MinMaxScaler().fit_transform(data)
        if parameters.get('min_cluster_size') is not None:
            default_min_cluster_size = int(parameters['min_cluster_size'])
        if parameters.get('min_samples') is not None:
            default_min_samples = int(parameters['min_samples'])
        if parameters.get('alpha') is not None:
            default_alpha = float(parameters['alpha'])
        if parameters.get('cluster_selection_method') is not None:
            default_cluster_selection_method = str(
                parameters['cluster_selection_method'])
        model = HDBSCAN(
            min_cluster_size=default_min_cluster_size,
            min_samples=default_min_samples,
            alpha=default_alpha,
            cluster_selection_method=default_cluster_selection_method,
            allow_single_cluster=True)
        clustering = model.fit(data)
        result['clustering'] = clustering
        return result
Example #15
def clusterFromDistMatrix(distanceMatrix):
    clusterer = HDBSCAN(min_cluster_size=2, metric='precomputed')
    clusterer.fit(distanceMatrix)
    labels = clusterer.labels_
    probs = clusterer.probabilities_
    labels = np.array((labels))[np.newaxis]
    labels = labels.T
    probs = np.array((probs))[np.newaxis]
    probs = probs.T
    results = np.concatenate((probs, gv.medSequenceMatrix), axis=1)
    results = np.concatenate((labels, results), axis=1)
    results = np.array(sorted(results, key=lambda a_entry: a_entry[0]))
    with open('treatmentClusters.txt', 'w') as csvfile:
        csvfile.write(
            "Cluster; Probability; ID; Month 1; Month 2; Month 3; Month 4; Month 5; Month 6"
        )
        csvfile.write('\n')
        for i in range(results.shape[0]):
            csvfile.write(str(results[i, 0]))
            csvfile.write(';')
            csvfile.write(str(results[i, 1]))
            csvfile.write(';')
            csvfile.write(str(results[i, 2]))
            csvfile.write(';')
            for j in range(3, results.shape[1]):
                csvfile.write(
                    str(results[i, j]).replace('{', '').replace('}', ''))
                csvfile.write(';')
            csvfile.write('\n')
Example #16
def TrainCluster(x):
    clusterer = HDBSCAN(min_cluster_size=1250,
                        gen_min_span_tree=True,
                        prediction_data=True)  # create a clustering object
    hdb = clusterer.fit(x)  # fit the clusterer on the training data
    hdb.prediction_data  # no-op access; prediction data were already generated by fit since prediction_data=True
    del x
    return hdb
Example #17
File: cluster.py  Project: reskyner/pandda
def cluster_hdbscan(above_gps):
    sample_by_feature = above_gps.to_numpy()
    print(sample_by_feature.shape)

    clusterer = HDBSCAN()
    clusterer.fit(sample_by_feature)
    cluster_ids = list(clusterer.labels_)

    return cluster_ids
Example #18
def hdbscan_cluster(df: pd.DataFrame,
                    min_cluster_size: int = 10,
                    gen_min_span_tree: bool = True):

    clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                        gen_min_span_tree=gen_min_span_tree)
    clusterer.fit(df)

    return clusterer.labels_, clusterer.probabilities_
Example #19
def test_hdbscan_min_span_tree_availability():
    clusterer = HDBSCAN().fit(X)
    tree = clusterer.minimum_spanning_tree_
    assert tree is None
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)
    clusterer = HDBSCAN(metric='precomputed').fit(D)
    tree = clusterer.minimum_spanning_tree_
    assert tree is None
Example #20
def test_hdbscan_caching():

    cachedir = mkdtemp()
    labels1 = HDBSCAN(memory=cachedir, min_samples=5).fit(X).labels_
    labels2 = HDBSCAN(memory=cachedir, min_samples=5,
                      min_cluster_size=6).fit(X).labels_
    n_clusters1 = len(set(labels1)) - int(-1 in labels1)
    n_clusters2 = len(set(labels2)) - int(-1 in labels2)
    assert n_clusters1 == n_clusters2
Example #21
def hdbscan_clustering(S, X, config):
    '''
    Computes HDBSCAN clustering on the feature matrix X; the similarity
    matrix S is accepted but not used. Returns the labels from the clustering.
    '''
    from hdbscan import HDBSCAN

    min_size = config.as_int("min_cluster_size")
    clf = HDBSCAN(min_cluster_size=min_size)
    return clf.fit_predict(X)
Example #22
def hdbscan_clustering(S, X, config):
    '''
    Computes HDBSCAN clustering on the feature matrix X; the similarity
    matrix S is accepted but not used. Returns the labels from the clustering.
    '''
    from hdbscan import HDBSCAN

    min_size = config.as_int("min_cluster_size")
    clf = HDBSCAN(min_cluster_size=min_size)
    return clf.fit_predict(X)
Example #23
def test_missing_data():
    """Tests if nan data are treated as infinite distance from all other points and assigned to -1 cluster"""
    model = HDBSCAN().fit(X_missing_data)
    assert model.labels_[0] == -1
    assert model.labels_[5] == -1
    assert model.probabilities_[0] == 0
    assert model.probabilities_[5] == 0
    clean_indices = list(range(1, 5)) + list(range(6, 200))
    clean_model = HDBSCAN().fit(X_missing_data[clean_indices])
    assert np.allclose(clean_model.labels_, model.labels_[clean_indices])
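A plausible construction of the X_missing_data fixture this test assumes (hypothetical): the two-column test data X with NaN values injected into rows 0 and 5, which HDBSCAN then treats as infinitely distant and assigns to the -1 noise cluster:

import numpy as np

X_missing_data = X.copy()
X_missing_data[0] = [np.nan, 1.0]
X_missing_data[5] = [np.nan, np.nan]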
Example #24
    def fit(self, data, min_cluster_size, min_samples, alpha,
            cluster_selection_method):
        data = np.array(data)
        data = preprocessing.MinMaxScaler().fit_transform(data)
        model = HDBSCAN(min_cluster_size=min_cluster_size,
                        min_samples=min_samples,
                        alpha=alpha,
                        cluster_selection_method=cluster_selection_method,
                        allow_single_cluster=True)
        clustering = model.fit(data)
        return clustering
Example #25
def test_hdbscan_centroids_medoids():
    centers = [(0.0, 0.0), (3.0, 3.0)]
    H, y = make_blobs(n_samples=1000, random_state=0, centers=centers, cluster_std=0.5)
    clusterer = HDBSCAN().fit(H)

    for idx, center in enumerate(centers):
        centroid = clusterer.weighted_cluster_centroid(idx)
        assert_array_almost_equal(centroid, center, decimal=1)

        medoid = clusterer.weighted_cluster_medoid(idx)
        assert_array_almost_equal(medoid, center, decimal=1)
Example #26
    def hdbscan(self, args):
        start = time.time()
        model = HDBSCAN(
            min_cluster_size=args["min_cluster_size"],
            metric=args["metric"],
            leaf_size=args["leaf_size"],
            allow_single_cluster=args["allow_single_cluster"],
        ).fit(self.data_matrix)
        # HDBSCAN has no predict(); labels for the fitted data are exposed as labels_
        labels = model.labels_
        end = time.time()
        return labels, (end - start)
Example #27
    def hdbscan(self, min_cluster_size=10, prediction_data=False):
        """ DBSCAN but allows for varying density clusters and no longer
        requires the epsilon parameter, which is difficult to tune.
        http://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html
        Scales slightly worse than DBSCAN, but with a more intuitive parameter.
        """
        hdbscan = HDBSCAN(min_cluster_size=min_cluster_size,
                          prediction_data=prediction_data)
        if prediction_data:
            return hdbscan.fit(self._safe_dense(self.matrix))
        else:
            return hdbscan.fit(self.matrix)
Example #28
    def _run_hdbscan(affinity: np.ndarray, min_cluster_size_for_hdbscan: int, min_cluster_size: int, max_cluster_size: int):
        assert affinity.shape[0] == affinity.shape[1]
        if affinity.shape[0] > max_cluster_size:
            allow_single_cluster = False
        else:
            allow_single_cluster = True
        # with metric='precomputed', HDBSCAN expects a square pairwise distance matrix
        db = HDBSCAN(metric='precomputed',
                     min_cluster_size=min_cluster_size_for_hdbscan,
                     min_samples=1,
                     allow_single_cluster=allow_single_cluster)
        db.fit(affinity)
        return db
Example #29
def cluster(df, min_size=4, allow_single_cluster=True):
    """Use HDBSCAN --
    (Hierarchical Density-Based Spatial Clustering of Applications with Noise)
    to find the best clusters for the meander.
    """
    clusterer = HDBSCAN(min_cluster_size=min_size,
                        min_samples=3,
                        metric='haversine',
                        allow_single_cluster=allow_single_cluster)
    # note: the haversine metric expects [lat, lng] in radians, e.g. np.radians(df[['lat', 'lng']])
    clusterer.fit(df[['lat', 'lng']])
    # map cluster labels to letters; noise (-1) indexes the last letter, 'N'
    df.loc[:, 'label'] = ['ABCDEFGHIJKLMN'[i] for i in clusterer.labels_]
    return df.sort_values('label').reset_index(drop=True)
Example #30
    def perform_hdbscan(self, min_cluster_size=15):
        hdbscan_clusterer = HDBSCAN(min_cluster_size=min_cluster_size, metric="precomputed")
        hdbscan_clusterer.fit(self.distance_matrix)
        self.hdbscan_results = {
            "parameters": hdbscan_clusterer.get_params(),
            "labels": hdbscan_clusterer.labels_,
            "probabilities": hdbscan_clusterer.probabilities_,
            "n_clusters": np.unique(hdbscan_clusterer.labels_).max() + 1,
            'clusters': label_cnt_dict(hdbscan_clusterer.labels_)
        }

        print_dict(self.hdbscan_results)
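Plausible implementations of the label_cnt_dict and print_dict helpers used above (hypothetical; they are not shown in the snippet):

import numpy as np

def label_cnt_dict(labels):
    unique, counts = np.unique(labels, return_counts=True)
    return dict(zip(unique, counts))

def print_dict(d):
    for key, value in d.items():
        print('{}: {}'.format(key, value))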
Example #31
class HDBSCAN:
    def __init__(self):
        self.cluster = HDBSCANBase(algorithm='best',
                                   approx_min_span_tree=True,
                                   gen_min_span_tree=False,
                                   leaf_size=40,
                                   metric='euclidean',
                                   min_cluster_size=15,
                                   min_samples=15,
                                   p=None)

    def fit(self, X, y=None):
        self.cluster.fit(X)
        return self
Example #32
File: test.py  Project: chongxi/mua
from vispy.visuals.transforms import STTransform
phy.gui.create_app()


mua = MUA(filename='S:/pcie.bin')
spk = mua.tospk()
fet = spk.tofet('pca')


# spike sort a channel centered spiking events
ch = 26
min_cluster_size = 5
leaf_size = 10

hdbcluster = HDBSCAN(min_cluster_size=min_cluster_size, 
                     leaf_size=leaf_size,
                     gen_min_span_tree=True, 
                     algorithm='boruvka_kdtree')
clu = hdbcluster.fit_predict(fet[ch])
print('get clusters', np.unique(clu))


#
from phy.gui import GUI, create_app, run_app
create_app()
gui = GUI(position=(400, 200), size=(600, 400))

scatter_view = view_scatter_3d()
scatter_view.attach(gui)
scatter_view.set_data(fet[ch], clu)

Example #33
File: inference.py  Project: slipguru/icing
    def fit(self, X, y=None, sample_weight=None):
        """X is a dataframe."""
        if self.method not in ("dbscan", "hdbscan", "spark"):
            raise ValueError("Unsupported method '%s'" % self.method)
        if not self.dbscan_params:
            self.dbscan_params = dict(
                min_samples=20, n_jobs=-1, algorithm='brute',
                metric=partial(distance_dataframe, X, **dict(
                    junction_dist=StringDistance(),
                    correct=False, tol=0)))
        if not self.hdbscan_params and self.method == 'hdbscan':
            self.hdbscan_params = dict(
                min_samples=20, n_jobs=-1,
                metric=partial(distance_dataframe, X, **dict(
                    junction_dist=StringDistance(),
                    correct=False, tol=0)))

        self.dbscan_params['eps'] = self.eps
        # new part: group by junction and v genes
        if self.method == 'hdbscan' and False:
            # no grouping; unsupported sample_weight
            groups_values = [[x] for x in np.arange(X.shape[0])]
        else:
            # list of lists
            groups_values = X.groupby(
                ["v_gene_set_str", self.model + "junc"]).groups.values()

        idxs = np.array([elem[0] for elem in groups_values])  # take one of them
        sample_weight = np.array([len(elem) for elem in groups_values])
        X_all = idxs.reshape(-1, 1)

        if self.kmeans_params.get('n_clusters', True):
            # ensure the number of clusters is higher than points
            self.kmeans_params['n_clusters'] = min(
                self.kmeans_params['n_clusters'], X_all.shape[0])
        kmeans = MiniBatchKMeans(**self.kmeans_params)

        lengths = X[self.model + 'junction_length'].values
        kmeans.fit(lengths[idxs].reshape(-1, 1))
        dbscan_labels = np.zeros_like(kmeans.labels_).ravel()

        if self.method == 'hdbscan':
            from hdbscan import HDBSCAN
            from hdbscan.prediction import all_points_membership_vectors
            dbscan_sk = HDBSCAN(**self.hdbscan_params)
        else:
            dbscan_sk = DBSCAN(**self.dbscan_params)
        if self.method == 'spark':
            from pyspark import SparkContext
            from icing.externals.pypardis import dbscan as dbpard
            sc = SparkContext.getOrCreate()
            sample_weight_map = dict(zip(idxs, sample_weight))
            # self.dbscan_params.pop('n_jobs', None)
            dbscan = dbpard.DBSCAN(
                dbscan_params=self.dbscan_params,
                **self.dbspark_params)
        # else:

        for i, label in enumerate(np.unique(kmeans.labels_)):
            idx_row = np.where(kmeans.labels_ == label)[0]

            if self.verbose:
                print("Iteration %d/%d" % (i, np.unique(kmeans.labels_).size),
                      "(%d seqs)" % idx_row.size, end='\r')

            X_idx = idxs[idx_row].reshape(-1, 1).astype('float64')
            weights = sample_weight[idx_row]

            if idx_row.size == 1:
                db_labels = np.array([0])
            elif self.method == 'spark' and idx_row.size > 5000:
                test_data = sc.parallelize(enumerate(X_idx))
                dbscan.train(test_data, sample_weight=sample_weight_map)
                db_labels = np.array(dbscan.assignments())[:, 1]
            elif self.method == 'hdbscan':
                db_labels = dbscan_sk.fit_predict(X_idx)  # unsupported weights
                # avoid noise samples
                soft_clusters = all_points_membership_vectors(dbscan_sk)
                db_labels = np.array([np.argmax(x) for x in soft_clusters])
            else:
                db_labels = dbscan_sk.fit_predict(
                    X_idx, sample_weight=weights)

            if self.method != 'hdbscan':
                # core_sample_indices_ exists only on DBSCAN, so skip this
                # noise re-assignment for the hdbscan branch
                if len(dbscan_sk.core_sample_indices_) < 1:
                    db_labels[:] = 0
                if -1 in db_labels:
                    balltree = BallTree(
                        X_idx[dbscan_sk.core_sample_indices_],
                        metric=dbscan_sk.metric)
                    noise_labels = balltree.query(
                        X_idx[db_labels == -1], k=1,
                        return_distance=False).ravel()
                    # get labels for core points, then assign to noise points
                    # based on balltree
                    dbscan_noise_labels = db_labels[
                        dbscan_sk.core_sample_indices_][noise_labels]
                    db_labels[db_labels == -1] = dbscan_noise_labels

            # hopefully, there are no noisy samples at this time
            db_labels[db_labels > -1] = db_labels[db_labels > -1] + np.max(dbscan_labels) + 1
            dbscan_labels[idx_row] = db_labels  # + np.max(dbscan_labels) + 1

        if self.method == 'spark':
            sc.stop()
        labels = dbscan_labels

        # new part: put together the labels
        labels_ext = np.zeros(X.shape[0], dtype=int)
        labels_ext[idxs] = labels
        for i, list_ in enumerate(groups_values):
            labels_ext[list_] = labels[i]
        self.labels_ = labels_ext