Example #1
def myHDBSCAN(nodes, numPartitions):
    nodesToClusters = {}
    for node in nodes.values():
        if node.user.protected:
            nodesToClusters[node.id] = -99

    for Id in nodesToClusters.keys():
        nodes.pop(Id)

    hdb = HDBSCAN(
        min_cluster_size=5,
        cluster_selection_epsilon=2, 
        cluster_selection_method="leaf",
        metric="manhattan",
    )

    for i in range(numPartitions - 1):
        if i == 0:
            clusters = hdb.fit_predict(createAdjacency(nodes))
            continue
        
        # Count cluster sizes (ignoring the -1 noise label) and pick the largest one
        frequency = {key: len(tuple(group)) for key, group in groupby(sorted(clusters)) if key != -1}
        cluster = max(frequency, key=frequency.get)  # label of the biggest cluster, to be split next

        # Split nodes that belong to 'cluster' into two clusters
        vec = hdb.fit_predict(createAdjacency(nodes, tuple(cluster == x for x in clusters)))

        # Update all new clustered elements from vec to clusters
        cnt = 0
        for index, el in enumerate(clusters):
            if el == cluster:
                clusters[index] = (clusters[index]) + (vec[cnt])
                cnt += 1

    # clusters = np.unique(n_clusters).tolist()
    # clusters.remove(-1)
    # distances = [
    #     [abs(hdb.weighted_cluster_centroid(i) - hdb.weighted_cluster_centroid(j)).mean() for j in clusters]
    #             for i in clusters]
    # edges = pd.DataFrame(distances).applymap(lambda x: x > 0.05).values.tolist()

    for Id, cluster in zip(nodes, clusters):
        nodesToClusters[Id] = cluster

    return nodesToClusters, None
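This snippet relies on imports not shown (HDBSCAN from hdbscan, groupby from itertools) and on a createAdjacency helper that is not included here. A minimal sketch of what such a helper might look like, assuming each node exposes a neighbors collection of node ids:

import numpy as np

def createAdjacency(nodes, mask=None):
    # Hypothetical helper, not from the original source: build a dense
    # adjacency matrix over the (optionally masked) nodes.
    ids = list(nodes.keys())
    if mask is not None:
        ids = [node_id for node_id, keep in zip(ids, mask) if keep]
    index = {node_id: pos for pos, node_id in enumerate(ids)}
    matrix = np.zeros((len(ids), len(ids)))
    for node_id in ids:
        for neighbor in getattr(nodes[node_id], "neighbors", ()):
            if neighbor in index:
                matrix[index[node_id], index[neighbor]] = 1
    return matrix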
Example #2
    def _cluster_train(df):

        start_time = time.time()
        # Note: Issue #88 is open; prediction_data cannot be used with the haversine metric https://github.com/scikit-learn-contrib/hdbscan/issues/88
        db = HDBSCAN(min_samples=1,
                     metric='haversine',
                     core_dist_n_jobs=-1,
                     memory='./__pycache__/',
                     prediction_data=True)

        coords = df[['latitude', 'longitude']] * np.pi / 180

        df = df.assign(cluster=db.fit_predict(coords))
        # get the number of clusters (labels start at 0, so the count is max + 1)
        num_clusters = db.labels_.max() + 1
        message = 'Clustered {:,} points down to {:,} clusters, for {:.1f}% compression in {:,.2f} seconds'
        print(
            message.format(len(df), num_clusters,
                           100 * (1 - float(num_clusters) / len(df)),
                           time.time() - start_time))

        # Get the list of the point most in the center of each clusters
        cluster_centers = df[[
            'cluster', 'latitude', 'longitude'
        ]].groupby('cluster')[['latitude', 'longitude']].agg(
            lambda x: _get_centermost_point(x.values))

        df = df.merge(cluster_centers,
                      left_on='cluster',
                      right_index=True,
                      how='left',
                      suffixes=('', '_cluster'))
        return db, cluster_centers, df
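A rough way to call _cluster_train, assuming pandas, numpy, time and HDBSCAN are imported as in the original module and a _get_centermost_point helper is in scope (hypothetical data):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({
    'latitude': np.concatenate([48.85 + 0.01 * rng.random(30), 40.71 + 0.01 * rng.random(30)]),
    'longitude': np.concatenate([2.35 + 0.01 * rng.random(30), -74.00 + 0.01 * rng.random(30)]),
})
db, cluster_centers, labeled = _cluster_train(df)
print(labeled['cluster'].value_counts())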
Example #3
    def apply(self, fX):
        from hdbscan import HDBSCAN
        clusterer = HDBSCAN(min_cluster_size=self.min_cluster_size,
                            min_samples=self.min_samples,
                            metric='precomputed')
        distance_matrix = squareform(pdist(fX, metric=self.metric))

        # apply clustering
        cluster_labels = clusterer.fit_predict(distance_matrix)

        # cluster embedding
        n_clusters = np.max(cluster_labels) + 1

        if n_clusters < 2:
            return np.zeros(fX.shape[0], dtype=int)

        fC = l2_normalize(
            np.vstack([np.sum(fX[cluster_labels == k, :], axis=0)
                       for k in range(n_clusters)]))

        # tag each undefined embedding to closest cluster
        undefined = cluster_labels == -1
        closest_cluster = np.argmin(
            cdist(fC, fX[undefined, :], metric=self.metric), axis=0)
        cluster_labels[undefined] = closest_cluster

        return cluster_labels
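The same precomputed-distance pattern can be reproduced outside the class; a minimal standalone sketch (cosine distances on random embeddings, not the original pipeline):

import numpy as np
from scipy.spatial.distance import pdist, squareform
from hdbscan import HDBSCAN

fX = np.random.RandomState(0).rand(100, 16)
distance_matrix = squareform(pdist(fX, metric='cosine'))
labels = HDBSCAN(min_cluster_size=5, metric='precomputed').fit_predict(distance_matrix)
print(np.unique(labels))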
Example #4
def clustering(umap_embedding_fit, umap_embedding_predict, min_cluster_size, prediction_data):
    print("clustering...")
    hdbscan = HDBSCAN(min_cluster_size=min_cluster_size, prediction_data=prediction_data).fit(umap_embedding_fit)
    clustering = hdbscan.fit_predict(umap_embedding_predict)
    labels = hdbscan.labels_

    return clustering, labels
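Note that fit_predict above re-fits the clusterer on umap_embedding_predict, so the earlier fit on umap_embedding_fit is discarded and labels_ refers to the second fit. If the intent is to assign new points to the clusters learned from umap_embedding_fit, hdbscan's approximate_predict can do that without re-fitting; a sketch (a hypothetical variant, not the original code):

from hdbscan import HDBSCAN, approximate_predict

def clustering_with_prediction(umap_embedding_fit, umap_embedding_predict, min_cluster_size):
    # hypothetical variant: keep the fitted model and only predict on the new embedding
    model = HDBSCAN(min_cluster_size=min_cluster_size, prediction_data=True).fit(umap_embedding_fit)
    predicted, strengths = approximate_predict(model, umap_embedding_predict)
    return predicted, model.labels_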
Example #5
    def get_clusters(self, coordinates, original_df, coordinates_df,
                     csv_path):
        """
        It employs the HDBSCAN method to gather the supplied coordinates
        into clusters.

        Parameters
        ----------
        coordinates : numpy.array
            The array of coordinates that will be clustered. Its shape must
            fulfill the following dimensions: [M, N, 3], where M is the
            total number of models that have been sampled with PELE and
            N is the total number of atoms belonging to the residue that
            is being analyzed
        original_df : pandas.DataFrame
            Original dataframe from Analysis to be overwritten
        coordinates_df : pandas.DataFrame
            The filtered dataframe which was used to extract coordinates for
            clustering
        csv_path : str
            Directory where the CSV will be saved

        Returns
        -------
        clusters : numpy.array
            The array of cluster labels assigned to each conformer from
            the supplied array
        """
        from hdbscan import HDBSCAN
        coordinates = Clustering.fix_coordinates_shape(coordinates)
        clustering_method = HDBSCAN(cluster_selection_epsilon=self._bandwidth)
        clusters = clustering_method.fit_predict(coordinates)
        self._save_cluster_info(original_df, coordinates_df,
                                clusters, csv_path)
        return clusters
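A stripped-down sketch of the core clustering step, without the dataframe bookkeeping, assuming fix_coordinates_shape simply flattens each [N, 3] conformer into a single feature vector (an assumption, since that helper is not shown):

import numpy as np
from hdbscan import HDBSCAN

coordinates = np.random.RandomState(42).rand(200, 10, 3)   # [models, atoms, xyz]
flat = coordinates.reshape(coordinates.shape[0], -1)       # assumed behaviour of fix_coordinates_shape
clusters = HDBSCAN(cluster_selection_epsilon=2.5).fit_predict(flat)
print(np.unique(clusters))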
Example #6
def hdbscan_clustering(S, X, config):
    '''
    Computes HDBSCAN clustering on the feature matrix X
    (the similarity matrix S is accepted but not used here).
    Returns the labels associated with the clustering.
    '''
    from hdbscan import HDBSCAN

    min_size = config.as_int("min_cluster_size")
    clf = HDBSCAN(min_cluster_size=min_size)
    return clf.fit_predict(X)
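The config argument only needs a configobj-style as_int accessor; a minimal stand-in for trying the function out could be:

import numpy as np

class FakeConfig(dict):
    # minimal stand-in mimicking configobj's Section.as_int
    def as_int(self, key):
        return int(self[key])

X = np.random.RandomState(1).rand(80, 4)
labels = hdbscan_clustering(S=None, X=X, config=FakeConfig(min_cluster_size=5))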
Example #7
def hdbscan_clustering(S, X, config):
    '''
    Computes HDBSCAN clustering on the feature matrix X
    (the similarity matrix S is accepted but not used here).
    Returns the labels associated with the clustering.
    '''
    from hdbscan import HDBSCAN

    min_size = config.as_int("min_cluster_size")
    clf = HDBSCAN(min_cluster_size=min_size)
    return clf.fit_predict(X)
Example #8
def clustering(df_low_dim,
               algorithm='KMeans',
               K_means_n_clusters=15,
               hdbscan_min_cluster_size=20):
    """
    Perform clustering on dimention reduced data. 
    Clustering algorithms can be KMeans or HDBSCAN.

    """
    if algorithm == 'HDBSCAN':
        clustering = HDBSCAN(min_cluster_size=hdbscan_min_cluster_size)
    else:
        clustering = KMeans(K_means_n_clusters)
    labels = clustering.fit_predict(df_low_dim)
    df_labels = pd.Series(['Cluster ' + str(x) for x in labels])
    return df_labels
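A quick way to exercise both branches, assuming pandas, numpy, KMeans and HDBSCAN are imported as in the original module:

import numpy as np
import pandas as pd

df_low_dim = pd.DataFrame(np.random.RandomState(0).rand(300, 2))
kmeans_labels = clustering(df_low_dim)                        # default KMeans branch
hdbscan_labels = clustering(df_low_dim, algorithm='HDBSCAN')  # density-based branch
print(hdbscan_labels.value_counts())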
Example #9
def cluster_space_hdb(classes,
                      vocab_vecs,
                      vocab,
                      min_cluster_size=5,
                      metric="sqeuclidean",
                      min_samples=None):
    cl = HDBSCAN(metric=metric,
                 min_cluster_size=min_cluster_size,
                 min_samples=min_samples).fit(vocab_vecs)

    c_labels = {}
    vocab_labels = {}

    for i in range(0, len(vocab)):
        vocab_labels[vocab[i].text] = cl.labels_[i]

    for l in classes.keys():
        c_labels[l] = cl.fit_predict(classes[l]["vecs"])

    return c_labels, vocab_labels
Example #10
def cluster_data_using_hdbscan(points):
    """TODO

    Args:
        points: a list of numpy arrays

    Returns an array of clusters."""
    dbscan = HDBSCAN(algorithm='best',
                     alpha=1.0,
                     approx_min_span_tree=True,
                     gen_min_span_tree=False,
                     leaf_size=40,
                     metric='euclidean',
                     min_cluster_size=5,
                     min_samples=None,
                     p=None)
    indexes = dbscan.fit_predict(points)
    number_of_clusters = len(set(dbscan.labels_)) - \
        (1 if -1 in dbscan.labels_ else 0)
    return create_clusters(number_of_clusters=number_of_clusters,
                           indexes=indexes)
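create_clusters is not shown in this snippet; a plausible stand-in, assuming it groups point indices by cluster label and drops the -1 noise label, might be:

def create_clusters(number_of_clusters, indexes):
    # hypothetical helper: one list of point indices per cluster label, noise excluded
    return [[i for i, label in enumerate(indexes) if label == k]
            for k in range(number_of_clusters)]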
Example #11
def cluster_hdbscan(orig_ys, parameters, MemoryDir=None, classes=None):
    n_components, min_samples, min_cluster_size = parameters

    # Convert to np if not already
    orig_ys = np.array(orig_ys)
    orig_ys = orig_ys.astype('float64')

    # Load memory if needed
    if MemoryDir:
        savedMemory = Memory(MemoryDir + str(n_components) + "_" +
                             str(min_samples) + "/")
    else:
        savedMemory = Memory(location=None, verbose=0)

    # PCA to desired dimensionality
    pca = PCA(n_components=n_components)
    ys = pca.fit_transform(orig_ys)

    # Cluster using hdbscan
    clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                        min_samples=min_samples,
                        core_dist_n_jobs=-2,
                        algorithm='boruvka_kdtree',
                        cluster_selection_method='eom',
                        prediction_data=True,
                        memory=savedMemory)
    cluster_labels = clusterer.fit_predict(ys)
    outlier_scores = clusterer.outlier_scores_

    # Increase outlier score of outlier points by 1
    ys_idx = np.arange(len(cluster_labels))
    outlier_idx = ys_idx[cluster_labels == -1]
    outlier_scores[outlier_idx] += 1

    # Assign cluster labels to outlier points
    soft_cluster_labels = membership_vector(clusterer, ys[outlier_idx])
    weak_cluster_labels = np.argmax(soft_cluster_labels, -1)
    cluster_labels[outlier_idx] = weak_cluster_labels

    return cluster_labels, outlier_scores
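The function above assumes numpy, joblib's Memory, sklearn's PCA, and hdbscan's HDBSCAN and membership_vector are already imported; a rough invocation on synthetic blobs might look like:

import numpy as np
from joblib import Memory
from sklearn.decomposition import PCA
from hdbscan import HDBSCAN
from hdbscan.prediction import membership_vector

rng = np.random.RandomState(0)
data = np.vstack([rng.normal(0, 0.3, (200, 16)), rng.normal(4, 0.3, (200, 16))])
labels, outlier_scores = cluster_hdbscan(data, parameters=(8, 5, 25))
print(np.unique(labels))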
Example #12
def compute_clusters(pois, alg='dbscan', min_pts=None, eps=None, n_jobs=1):
    """Computes clusters using the DBSCAN or the HDBSCAN algorithm.

    Args:
         pois (GeoDataFrame): A POI GeoDataFrame.
         alg (string): The clustering algorithm to use (dbscan or hdbscan; default: dbscan).
         min_pts (integer): The minimum number of neighbors for a dense point.
         eps (float): The neighborhood radius.
         n_jobs (integer): Number of parallel jobs to run in the algorithm (default: 1)

    Returns:
          A GeoDataFrame containing the clustered POIs and their labels. The value of parameter `eps` for each cluster
          is also returned (which varies in the case of HDBSCAN).
    """

    # Prepare list of coordinates
    poi_list = [[p.x, p.y] for p in pois['geometry']]
    data_arr = np.array(poi_list)
    del poi_list[:]

    # Compute the clusters
    t0 = time()
    if alg == 'hdbscan':
        clusterer = HDBSCAN(min_cluster_size=min_pts,
                            min_samples=min_pts,
                            core_dist_n_jobs=n_jobs)
        labels = clusterer.fit_predict(data_arr)
        num_of_clusters = len(set(labels))

        tree = clusterer.condensed_tree_.to_pandas()
        cluster_tree = tree[tree.child_size > 1]
        chosen_clusters = clusterer.condensed_tree_._select_clusters()

        eps_per_cluster = cluster_tree[cluster_tree.child.isin(chosen_clusters)].\
            drop("parent", axis=1).drop("child", axis=1).reset_index().drop("index", axis=1)
        eps_per_cluster['lambda_val'] = eps_per_cluster['lambda_val'].apply(
            lambda x: 1 / x)
        eps_per_cluster.rename(columns={
            'lambda_val': 'eps',
            'child_size': 'cluster_size'
        },
                               inplace=True)

    else:
        clusterer = DBSCAN(eps=eps, min_samples=min_pts,
                           n_jobs=n_jobs).fit(data_arr)
        labels = clusterer.labels_

        num_of_clusters = len(set(labels))
        num_of_clusters_no_noise = set(labels)
        num_of_clusters_no_noise.discard(-1)
        num_of_clusters_no_noise = len(num_of_clusters_no_noise)

        eps_per_cluster = pd.DataFrame(
            {'eps': [eps] * num_of_clusters_no_noise})
        eps_per_cluster['cluster_size'] = 0

    print("Done in %0.3fs." % (time() - t0))

    # Assign cluster labels to initial POIs
    pois['cluster_id'] = labels

    # Separate POIs that are inside clusters from those that are noise
    pois_in_clusters = pois.loc[pois['cluster_id'] > -1]
    pois_noise = pois.loc[pois['cluster_id'] == -1]

    print('Number of clusters: %d' % num_of_clusters)
    print('Number of clustered POIs: %d' % (len(pois_in_clusters)))
    print('Number of outlier POIs: %d' % (len(pois_noise)))

    return pois_in_clusters, eps_per_cluster
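A rough way to exercise compute_clusters, assuming geopandas and shapely are available and the module-level imports of the original (numpy, pandas, time, HDBSCAN, DBSCAN) are in place:

import numpy as np
import geopandas as gpd
from shapely.geometry import Point

rng = np.random.RandomState(0)
xy = np.vstack([rng.normal(0, 0.1, (100, 2)), rng.normal(5, 0.1, (100, 2))])
pois = gpd.GeoDataFrame({'geometry': [Point(x, y) for x, y in xy]})
pois_in_clusters, eps_per_cluster = compute_clusters(pois, alg='hdbscan', min_pts=10)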
                   label=f"Cluster {i+1}")
    ax.scatter(*data_tsne_2d[clst_dbscan == -1].T,
               s=20,
               color=colorcycle[-1],
               label=f"No Cluster")
    ax.legend(loc='upper right')
    ax.set_xlabel("First t-SNE dimension")
    ax.set_ylabel("Second t-SNE dimension")
    fig.savefig(
        f"figs_script/tsne_dbscan_perp_30_colored_{n_to_plot}_clique_{plot_type_name}.pdf"
    )

    # #### HDBScan

    hdbscan = HDBSCAN(min_cluster_size=5, core_dist_n_jobs=4)
    clst_hdbscan = hdbscan.fit_predict(clstData)
    print("Hdbscan clustering", pd.value_counts(clst_hdbscan), sep='\n')

    fig, ax = plt.subplots()
    for i in [0, 1]:
        ax.scatter(*data_tsne_2d[clst_hdbscan == i].T,
                   s=20,
                   color=colorcycle[i],
                   label=f"Cluster {i+1}")
    ax.scatter(*data_tsne_2d[clst_hdbscan == -1].T,
               s=20,
               color=colorcycle[-1],
               label=f"No Cluster")
    ax.legend(loc='upper right')
    ax.set_xlabel("First t-SNE dimension")
    ax.set_ylabel("Second t-SNE dimension")
Example #14
    def fit(self, X, y=None, sample_weight=None):
        """X is a dataframe."""
        if self.method not in ("dbscan", "hdbscan", "spark"):
            raise ValueError("Unsupported method '%s'" % self.method)
        if not self.dbscan_params:
            self.dbscan_params = dict(min_samples=20,
                                      n_jobs=-1,
                                      algorithm='brute',
                                      metric=partial(
                                          distance_dataframe, X,
                                          **dict(
                                              junction_dist=StringDistance(),
                                              correct=False,
                                              tol=0)))
        if not self.hdbscan_params and self.method == 'hdbscan':
            self.hdbscan_params = dict(min_samples=20,
                                       n_jobs=-1,
                                       metric=partial(
                                           distance_dataframe, X,
                                           **dict(
                                               junction_dist=StringDistance(),
                                               correct=False,
                                               tol=0)))

        self.dbscan_params['eps'] = self.eps
        # new part: group by junction and v genes
        if self.method == 'hdbscan' and False:
            # no grouping; unsupported sample_weight
            groups_values = [[x] for x in np.arange(X.shape[0])]
        else:
            # list of lists
            groups_values = X.groupby(["v_gene_set_str",
                                       self.model + "junc"]).groups.values()

        idxs = np.array([elem[0]
                         for elem in groups_values])  # take one of them
        sample_weight = np.array([len(elem) for elem in groups_values])
        X_all = idxs.reshape(-1, 1)

        if self.kmeans_params.get('n_clusters', True):
            # ensure the number of clusters does not exceed the number of points
            self.kmeans_params['n_clusters'] = min(
                self.kmeans_params['n_clusters'], X_all.shape[0])
        kmeans = MiniBatchKMeans(**self.kmeans_params)

        lengths = X[self.model + 'junction_length'].values
        kmeans.fit(lengths[idxs].reshape(-1, 1))
        dbscan_labels = np.zeros_like(kmeans.labels_).ravel()

        if self.method == 'hdbscan':
            from hdbscan import HDBSCAN
            from hdbscan.prediction import all_points_membership_vectors
            dbscan_sk = HDBSCAN(**self.hdbscan_params)
        else:
            dbscan_sk = DBSCAN(**self.dbscan_params)
        if self.method == 'spark':
            from pyspark import SparkContext
            from icing.externals.pypardis import dbscan as dbpard
            sc = SparkContext.getOrCreate()
            sample_weight_map = dict(zip(idxs, sample_weight))
            # self.dbscan_params.pop('n_jobs', None)
            dbscan = dbpard.DBSCAN(dbscan_params=self.dbscan_params,
                                   **self.dbspark_params)
        # else:

        for i, label in enumerate(np.unique(kmeans.labels_)):
            idx_row = np.where(kmeans.labels_ == label)[0]

            if self.verbose:
                print("Iteration %d/%d" % (i, np.unique(kmeans.labels_).size),
                      "(%d seqs)" % idx_row.size,
                      end='\r')

            X_idx = idxs[idx_row].reshape(-1, 1).astype('float64')
            weights = sample_weight[idx_row]

            if idx_row.size == 1:
                db_labels = np.array([0])
            elif self.method == 'spark' and idx_row.size > 5000:
                test_data = sc.parallelize(enumerate(X_idx))
                dbscan.train(test_data, sample_weight=sample_weight_map)
                db_labels = np.array(dbscan.assignments())[:, 1]
            elif self.method == 'hdbscan':
                db_labels = dbscan_sk.fit_predict(X_idx)  # unsupported weights
                # avoid noise samples
                soft_clusters = all_points_membership_vectors(dbscan_sk)
                db_labels = np.array([np.argmax(x) for x in soft_clusters])
            else:
                db_labels = dbscan_sk.fit_predict(X_idx, sample_weight=weights)

            # only a fitted DBSCAN exposes core_sample_indices_
            if hasattr(dbscan_sk, 'core_sample_indices_') and len(dbscan_sk.core_sample_indices_) < 1:
                db_labels[:] = 0
            if -1 in db_labels:
                balltree = BallTree(X_idx[dbscan_sk.core_sample_indices_],
                                    metric=dbscan_sk.metric)
                noise_labels = balltree.query(X_idx[db_labels == -1],
                                              k=1,
                                              return_distance=False).ravel()
                # get labels for core points, then assign to noise points based
                # on balltree
                dbscan_noise_labels = db_labels[
                    dbscan_sk.core_sample_indices_][noise_labels]
                db_labels[db_labels == -1] = dbscan_noise_labels

            # hopefully, there are no noisy samples at this time
            db_labels[db_labels > -1] = db_labels[db_labels > -1] + np.max(
                dbscan_labels) + 1
            dbscan_labels[idx_row] = db_labels  # + np.max(dbscan_labels) + 1

        if self.method == 'spark':
            sc.stop()
        labels = dbscan_labels

        # new part: put together the labels
        labels_ext = np.zeros(X.shape[0], dtype=int)
        labels_ext[idxs] = labels
        for i, list_ in enumerate(groups_values):
            labels_ext[list_] = labels[i]
        self.labels_ = labels_ext
Example #15
def Clustering(IL, mask, clustering_training_data):
    #ri_test = np.random.choice(range(len(IL)),size=np.int(IL.shape[0]/10))
    clusterer = HDBSCAN(min_cluster_size=1250, gen_min_span_tree=True)
    #hdb = clusterer.fit(clustering_training_data)
    IL.loc[mask, 'hdbscan_cluster'] = clusterer.fit_predict(IL)
Example #16
mua = MUA(filename='S:/pcie.bin')
spk = mua.tospk()
fet = spk.tofet('pca')


# spike sort a channel centered spiking events
ch = 26
min_cluster_size = 5
leaf_size = 10

hdbcluster = HDBSCAN(min_cluster_size=min_cluster_size, 
                     leaf_size=leaf_size,
                     gen_min_span_tree=True, 
                     algorithm='boruvka_kdtree')
clu = hdbcluster.fit_predict(fet[ch])
print('get clusters', np.unique(clu))


#
from phy.gui import GUI, create_app, run_app
create_app()
gui = GUI(position=(400, 200), size=(600, 400))

scatter_view = view_scatter_3d()
scatter_view.attach(gui)
scatter_view.set_data(fet[ch], clu)


nclu = len(np.unique(clu))
view = View(layout='grid',  shape=(3, nclu))
Example #17
    def fit_tsne(self,
                 dataset,
                 min_cluster_size=55,
                 perplexity=40,
                 n_iter=2500,
                 learning_rate=700.0):
        self.corpus, self.orig = text_preprocessing(dataset, self.msg_column,
                                                    self.min_msg_length,
                                                    self.stop_words_set)

        logger.info("corpus length: " + str(len(self.corpus)))

        fname = \
            RESOURCE_DIR + '/tsne_' + \
            str(self.max_features) + '_' + \
            str(n_iter) + '_' +  \
            str(perplexity).replace(".", "_") + '_' + \
            str(learning_rate).replace(".", "_")

        if not self.overwrite:
            fname = free_file_name(fname, 'pkl')
        else:
            fname = fname + '.pkl'

        logger.info("Using t-SNE model file: " + str(fname))

        if not os.path.isfile(fname):
            cv = CountVectorizer(max_features=self.max_features)
            self.X = cv.fit_transform(self.corpus).toarray()

            pca_all = PCA()
            pca_all.fit_transform(self.X)

            ratio = 0.0
            n_components = 0
            while ratio < 0.85:
                ratio += pca_all.explained_variance_ratio_[n_components]
                n_components += 1

            pca = PCA(n_components)
            x_pca = pca.fit_transform(self.X)

            logger.info(
                'Cumulative explained variation for principal components: {}'.
                format(np.sum(pca.explained_variance_ratio_)))

            tsne = TSNE(n_components=2,
                        verbose=1,
                        perplexity=perplexity,
                        n_iter=n_iter,
                        learning_rate=learning_rate)
            self.tsne_results = tsne.fit_transform(x_pca)
            with open(fname, 'wb') as file:
                pickle.dump((cv, pca, self.tsne_results), file)
        else:
            with open(fname, 'rb') as file:
                cv, pca, self.tsne_results = pickle.load(file)
            self.X = cv.fit_transform(self.corpus).toarray()
            self.x_pca = pca.fit_transform(self.X)

        logger.info('X rows: ' + str(len(self.X)))
        logger.info('X cols: ' + str(len(self.X[0])))

        df = pd.DataFrame(columns=['X', 'Y'])
        df['X'] = self.tsne_results[:, 0]
        df['Y'] = self.tsne_results[:, 1]

        tsne_values = df.values

        clustering = HDBSCAN(min_cluster_size=min_cluster_size)
        self.y_pred = clustering.fit_predict(tsne_values)

        next_cluster_num = get_next_cluster_num()

        self.y = []
        filtered_values = []
        for i in range(0, len(self.y_pred)):
            if self.y_pred[i] < 0:
                continue
            filtered_values.append(tsne_values[i])
            self.y.append(int(self.y_pred[i]) + int(next_cluster_num))

        self.y = np.array(self.y, dtype='int')

        logger.info('Next cluster num: ' + str(next_cluster_num))

        clusters_codes = pd.DataFrame(self.y, columns=['cl'])['cl'].unique()

        self.n_clusters = len(clusters_codes)

        filtered_values = np.array(filtered_values)

        self.index_array = np.concatenate(
            (filtered_values, np.zeros(
                (len(filtered_values), 1), dtype='float')),
            axis=1)

        logger.info('Cluster codes (' + str(self.n_clusters) + '):')
        logger.info(clusters_codes)
Example #18
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    umap = UMAP(random_state=42)
    embedding = umap.fit_transform(dist)

print(embedding[:5])

# # plt.scatter(embedding[:,0], embedding[:,1])

# # HDBSCAN

hdbscan = HDBSCAN(min_cluster_size=8)

clustering = hdbscan.fit_predict(embedding)

# # Three clusters !

# # plt.scatter(embedding[:,0], embedding[:,1], c=clustering);

# # Titles from the first cluster (Fun)

titles_cluster = get_titles_from_cluster(0)

# # Titles from the second cluster (Fan)

titles_cluster = get_titles_from_cluster(1)

# # Titles from the third cluster (cinemagoer)
Example #19
def getClusters(umap):
    hdbscan = HDBSCAN(min_cluster_size=5)
    clusters = hdbscan.fit_predict(umap)
    return clusters
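The argument is expected to be a UMAP embedding; a minimal end-to-end sketch, assuming umap-learn is installed and HDBSCAN is imported as in the original module:

import numpy as np
from umap import UMAP
from hdbscan import HDBSCAN

data = np.random.RandomState(0).rand(200, 10)
embedding = UMAP(random_state=0).fit_transform(data)
clusters = getClusters(embedding)
print(np.unique(clusters))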
Example #20
    def fit(self, X, y=None, sample_weight=None):
        """X is a dataframe."""
        if self.method not in ("dbscan", "hdbscan", "spark"):
            raise ValueError("Unsupported method '%s'" % self.method)
        if not self.dbscan_params:
            self.dbscan_params = dict(
                min_samples=20, n_jobs=-1, algorithm='brute',
                metric=partial(distance_dataframe, X, **dict(
                    junction_dist=StringDistance(),
                    correct=False, tol=0)))
        if not self.hdbscan_params and self.method == 'hdbscan':
            self.hdbscan_params = dict(
                min_samples=20, n_jobs=-1,
                metric=partial(distance_dataframe, X, **dict(
                    junction_dist=StringDistance(),
                    correct=False, tol=0)))

        self.dbscan_params['eps'] = self.eps
        # new part: group by junction and v genes
        if self.method == 'hdbscan' and False:
            # no grouping; unsupported sample_weight
            groups_values = [[x] for x in np.arange(X.shape[0])]
        else:
            # list of lists
            groups_values = X.groupby(
                ["v_gene_set_str", self.model + "junc"]).groups.values()

        idxs = np.array([elem[0] for elem in groups_values])  # take one of them
        sample_weight = np.array([len(elem) for elem in groups_values])
        X_all = idxs.reshape(-1, 1)

        if self.kmeans_params.get('n_clusters', True):
            # ensure the number of clusters does not exceed the number of points
            self.kmeans_params['n_clusters'] = min(
                self.kmeans_params['n_clusters'], X_all.shape[0])
        kmeans = MiniBatchKMeans(**self.kmeans_params)

        lengths = X[self.model + 'junction_length'].values
        kmeans.fit(lengths[idxs].reshape(-1, 1))
        dbscan_labels = np.zeros_like(kmeans.labels_).ravel()

        if self.method == 'hdbscan':
            from hdbscan import HDBSCAN
            from hdbscan.prediction import all_points_membership_vectors
            dbscan_sk = HDBSCAN(**self.hdbscan_params)
        else:
            dbscan_sk = DBSCAN(**self.dbscan_params)
        if self.method == 'spark':
            from pyspark import SparkContext
            from icing.externals.pypardis import dbscan as dbpard
            sc = SparkContext.getOrCreate()
            sample_weight_map = dict(zip(idxs, sample_weight))
            # self.dbscan_params.pop('n_jobs', None)
            dbscan = dbpard.DBSCAN(
                dbscan_params=self.dbscan_params,
                **self.dbspark_params)
        # else:

        for i, label in enumerate(np.unique(kmeans.labels_)):
            idx_row = np.where(kmeans.labels_ == label)[0]

            if self.verbose:
                print("Iteration %d/%d" % (i, np.unique(kmeans.labels_).size),
                      "(%d seqs)" % idx_row.size, end='\r')

            X_idx = idxs[idx_row].reshape(-1, 1).astype('float64')
            weights = sample_weight[idx_row]

            if idx_row.size == 1:
                db_labels = np.array([0])
            elif self.method == 'spark' and idx_row.size > 5000:
                test_data = sc.parallelize(enumerate(X_idx))
                dbscan.train(test_data, sample_weight=sample_weight_map)
                db_labels = np.array(dbscan.assignments())[:, 1]
            elif self.method == 'hdbscan':
                db_labels = dbscan_sk.fit_predict(X_idx)  # unsupported weights
                # avoid noise samples
                soft_clusters = all_points_membership_vectors(dbscan_sk)
                db_labels = np.array([np.argmax(x) for x in soft_clusters])
            else:
                db_labels = dbscan_sk.fit_predict(
                    X_idx, sample_weight=weights)

            # only a fitted DBSCAN exposes core_sample_indices_
            if hasattr(dbscan_sk, 'core_sample_indices_') and len(dbscan_sk.core_sample_indices_) < 1:
                db_labels[:] = 0
            if -1 in db_labels:
                balltree = BallTree(
                    X_idx[dbscan_sk.core_sample_indices_],
                    metric=dbscan_sk.metric)
                noise_labels = balltree.query(
                    X_idx[db_labels == -1], k=1, return_distance=False).ravel()
                # get labels for core points, then assign to noise points based
                # on balltree
                dbscan_noise_labels = db_labels[
                    dbscan_sk.core_sample_indices_][noise_labels]
                db_labels[db_labels == -1] = dbscan_noise_labels

            # hopefully, there are no noisy samples at this time
            db_labels[db_labels > -1] = db_labels[db_labels > -1] + np.max(dbscan_labels) + 1
            dbscan_labels[idx_row] = db_labels  # + np.max(dbscan_labels) + 1

        if self.method == 'spark':
            sc.stop()
        labels = dbscan_labels

        # new part: put together the labels
        labels_ext = np.zeros(X.shape[0], dtype=int)
        labels_ext[idxs] = labels
        for i, list_ in enumerate(groups_values):
            labels_ext[list_] = labels[i]
        self.labels_ = labels_ext
Example #21
def return_hdbscansvm(df, txt_col, rf = False, clust_size = 15,  samp_size = 5, svmx = False, svmc = 1000, clust_metric = 'braycurtis'):

    super_flat = pd.DataFrame(df)
    r = super_flat
    
    hdb = HDBSCAN(min_cluster_size = clust_size, min_samples= samp_size, metric = clust_metric, cluster_selection_method = 'leaf')
    n = hdb.fit_predict(r)
    f = pd.get_dummies(n)

    with_cat = pd.Series([str(i) for i in list(zip(txt_col, n))], name = 'text')
    answers = pd.concat([with_cat, f], axis = 1 )
    
    answers = [answers, n, hdb]
    if svmx == True:

        ans_ = pd.concat([pd.DataFrame(txt_col), f], axis = 1 )
        ans_ = ans_[ans_[-1] == 0]
        ans_ = ans_.drop(-1, axis = 1)
        ans_ = ans_.melt(id_vars='text', var_name='cluster3', value_name='value')
        ans_ = ans_[ans_['value'] == 1]
        ans_ = ans_.drop('value', axis = 1)
        ans_svm = ans_[ans_['cluster3'] > -1]
        for_x = pd.concat([pd.DataFrame(txt_col),pd.DataFrame(r)], axis = 1)
        authorssvm = for_x.merge(ans_svm,left_on = 'text', right_on= 'text', how = 'right')
        y = list(authorssvm['cluster3'])
        X = authorssvm.drop(['cluster3', 'text'], axis = 1)
        X = X.fillna(0)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        clf = svm.SVC(C = svmc, kernel = 'rbf', gamma = 0.7, random_state = 12)
        print('done')

        clf.fit(X_train,y_train)
        print('done')
        y_pred= clf.predict(X_test)

        print("Accuracy:",metrics.accuracy_score(y_test, y_pred))


        nsvm = clf.predict(r)
        f = pd.get_dummies(nsvm)

        with_catsvm = pd.Series([str(i) for i in list(zip(txt_col, nsvm))], name = 'text')
        answerssvm = pd.concat([with_catsvm, f], axis = 1 )
        answers = [answerssvm, nsvm, hdb]
    if rf == True:

        ans_ = pd.concat([pd.DataFrame(txt_col), f], axis = 1 )
        ans_ = ans_[ans_[-1] == 0]
        ans_ = ans_.drop(-1, axis = 1)
        ans_ = ans_.melt(id_vars='text', var_name='cluster3', value_name='value')
        ans_ = ans_[ans_['value'] == 1]
        ans_ = ans_.drop('value', axis = 1)
        ans_svm = ans_[ans_['cluster3'] > -1]
        for_x = pd.concat([pd.DataFrame(txt_col),pd.DataFrame(r)], axis = 1)
        authorssvm = for_x.merge(ans_svm,left_on = 'text', right_on= 'text', how = 'right')
        y = list(authorssvm['cluster3'])
        X = authorssvm.drop(['cluster3', 'text'], axis = 1)
        X = X.fillna(0)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        clf = RandomForestClassifier(min_samples_split =50,max_depth = 10, random_state=12)
        print('done')

        clf.fit(X_train,y_train)
        print('done')
        y_pred= clf.predict(X_test)

        print("Accuracy:",metrics.accuracy_score(y_test, y_pred))


        nsvm = clf.predict(r)
        f = pd.get_dummies(nsvm)

        with_catsvm = pd.Series([str(i) for i in list(zip(txt_col, nsvm))], name = 'text')
        answerssvm = pd.concat([with_catsvm, f], axis = 1 )
        answers = [answerssvm, nsvm, hdb]
    
    return answers
Example #22
def __run_hdbscan(dataset, eps, min_cluster_size, min_samples, algorithm):
    clusterer = HDBSCAN(cluster_selection_epsilon=eps,
                        min_cluster_size=min_cluster_size,
                        min_samples=min_samples,
                        algorithm=algorithm)
    return clusterer, clusterer.fit_predict(dataset)
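The leading double underscore suggests this is a private helper of a larger class; a direct call on hypothetical data, with HDBSCAN imported, would look like:

import numpy as np
from hdbscan import HDBSCAN

dataset = np.random.RandomState(0).rand(400, 3)
clusterer, labels = __run_hdbscan(dataset, eps=0.0, min_cluster_size=15, min_samples=5, algorithm='best')
print(len(set(labels)))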








"""
hdbscan
"""
hdbscan = HDBSCAN(min_cluster_size=1375)

best_model = hdbscan
predictions = hdbscan.fit_predict(tracks_df)
print(len(set(predictions)))
visualize_clusters(data=tracks_df, predictions=predictions, n_cluster=len(set(hdbscan.labels_)), stochastic=True)
best_k = len(set(hdbscan.labels_))

print("len of tracks_df: %d" % len(tracks_df))

"""
explain track clusters with original afs
"""
statement = "SELECT track_id, danceability, energy, speechiness, acousticness, instrumentalness, tempo, valence, liveness FROM acoustic_features"
af_df = pd.read_sql(sql=statement, con=engine).set_index("track_id")
scaler = MinMaxScaler()
af_df = pd.DataFrame(scaler.fit_transform(af_df), columns=af_df.columns, index=af_df.index)

df = tracks_df.merge(af_df, left_index=True, right_index=True)
Example #24
    def __init__(self,
                 pois,
                 alg="hdbscan",
                 min_samples=None,
                 eps=None,
                 n_jobs=-1,
                 **kwargs):
        """Computes clusters using the sklearn algorithms or HDBSCAN.
        Parameters:
            pois (GeoDataFrame): A POI GeoDataFrame.
            alg (string): The clustering algorithm to use (hdbscan, dbscan or optics; default: hdbscan).
            min_samples (float|integer): The number of samples in a neighborhood for a point
                to be considered as a core point. Expressed as an absolute number (int > 1) or
                a fraction of the number of samples (float between 0 and 1).
            eps (float): The neighborhood radius (used only in dbscan).
            n_jobs (integer): Number of parallel jobs to run in the algorithm (default: -1)
            **kwargs: Optional arguments depending on the algorithm.
        """
        if min_samples is None:
            min_samples = int(round(np.log(len(pois))))
        if alg == 'dbscan':
            assert eps is not None
        self.pois = pois
        self.alg = alg
        self.min_samples = min_samples
        self.eps = eps
        self.n_jobs = n_jobs

        # Prepare list of coordinates
        data_arr = pois.geometry.get_coordinates().values

        # Compute the clusters
        if alg == 'hdbscan':
            min_cluster_size = kwargs.pop('min_cluster_size', 50)
            core_dist_n_jobs = kwargs.pop('core_dist_n_jobs', n_jobs)
            clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                                min_samples=min_samples,
                                core_dist_n_jobs=core_dist_n_jobs,
                                **kwargs)
            labels = clusterer.fit_predict(data_arr)

            tree = clusterer.condensed_tree_.to_pandas()
            cluster_tree = tree[tree.child_size > 1]
            chosen_clusters = clusterer.condensed_tree_._select_clusters()

            eps_per_cluster = cluster_tree[cluster_tree.child.isin(chosen_clusters)].\
                drop("parent", axis=1).drop("child", axis=1).reset_index().drop("index", axis=1)
            eps_per_cluster['lambda_val'] = eps_per_cluster[
                'lambda_val'].apply(lambda x: 1 / x)
            eps_per_cluster.rename(columns={
                'lambda_val': 'eps',
                'child_size': 'cluster_size'
            },
                                   inplace=True)

        else:
            if alg == 'dbscan':
                clusterer = DBSCAN(eps=eps,
                                   min_samples=min_samples,
                                   n_jobs=n_jobs,
                                   **kwargs).fit(data_arr)
            elif alg == 'optics':
                clusterer = OPTICS(min_samples=min_samples,
                                   eps=eps,
                                   n_jobs=n_jobs,
                                   **kwargs).fit(data_arr)
            else:
                raise Exception(
                    'Implemented algorithms are hdbscan, dbscan and optics.')
            labels = clusterer.labels_

            num_of_clusters_no_noise = set(labels)
            num_of_clusters_no_noise.discard(-1)
            num_of_clusters_no_noise = len(num_of_clusters_no_noise)

            eps_per_cluster = pd.DataFrame(
                {'eps': [eps] * num_of_clusters_no_noise})
            eps_per_cluster['cluster_size'] = 0

        # Assign cluster labels to initial POIs
        pois['cluster_id'] = labels

        # Separate POIs that are inside clusters from those that are noise
        pois_in_clusters = pois[pois.cluster_id > -1]
        pois_noise = pois[pois.cluster_id == -1]

        self._num_of_clusters = len(set(labels))
        self._pois_in_clusters = pois_in_clusters
        self._eps_per_cluster = eps_per_cluster
        self._pois_noise = pois_noise
        self._shape_type = None
Example #25
X = x.loc[~missing, variables]

pd.plotting.scatter_matrix(X)

# Visualize using MDS, as we use a distance based clustering method.
# https://datascience.stackexchange.com/questions/22/k-means-clustering-for-mixed-numeric-and-categorical-data
Z = scale(X.astype(float))
scale_dist = dist(Z, "braycurtis")

mds_scale = MDS(n_components=2, dissimilarity="precomputed", max_iter=1000)
coords_scale = mds_scale.fit_transform(scale_dist)
plt.scatter(*coords_scale.T)

# Run HDBSCAN* over precomputed distances.
clusterer = HDBSCAN(min_cluster_size=50, metric="precomputed")
labels = clusterer.fit_predict(scale_dist)

# Cursory diagnostics.
Counter(labels)
clusterer.cluster_persistence_
np.mean(clusterer.probabilities_[labels != -1])

fig, axes = plt.subplots(1, 2)
ax1, ax2 = axes

scatter(*coords_scale.T, labels=labels + 1, ax=ax1)
ax1.legend()
ax1.set_title("Clusters")

ax2.hist(clusterer.probabilities_[labels != -1], bins=60, density=True)
ax2.set_title("Probability of belonging to assigned cluster")
Example #26
    def fit_som(self,
                dataset,
                som_threshold=0.5,
                som_size=100,
                som_sigma=1.0,
                som_learning_rate=0.5):
        self.corpus, self.orig = text_preprocessing(dataset, self.msg_column,
                                                    self.min_msg_length,
                                                    self.stop_words_set)

        logger.info("corpus length: " + str(len(self.corpus)))

        fname = \
            RESOURCE_DIR + '/som_' + \
            str(self.max_features) + '_' + \
            str(som_size) + '_' + \
            str(som_sigma).replace(".", "_") + '_' + \
            str(som_learning_rate).replace(".", "_")

        if not self.overwrite:
            fname = free_file_name(fname, 'pkl')
        else:
            fname = fname + '.pkl'

        logger.info("Using SOM model file: " + str(fname))

        if not os.path.isfile(fname):
            cv = CountVectorizer(max_features=self.max_features)
            self.X = cv.fit_transform(self.corpus).toarray()
            self.sc = MinMaxScaler(feature_range=(0, 1))
            self.X_scale = self.sc.fit_transform(self.X)
            self.som = MiniSom(x=som_size,
                               y=som_size,
                               input_len=self.max_features,
                               sigma=som_sigma,
                               learning_rate=som_learning_rate)
            self.som.train_batch(data=self.X_scale,
                                 num_iteration=len(self.X_scale))
            with open(fname, 'wb') as file:
                pickle.dump((cv, self.sc, self.som), file)
        else:
            with open(fname, 'rb') as file:
                cv, self.sc, self.som = pickle.load(file)
            self.X = cv.fit_transform(self.corpus).toarray()
            self.X_scale = self.sc.fit_transform(self.X)

        logger.info('X rows: ' + str(len(self.X)))
        logger.info('X cols: ' + str(len(self.X[0])))

        distance_map = self.som.distance_map()

        indexes_coords = []
        indexes_dist = []
        for i in range(0, len(distance_map)):
            for j in range(0, len(distance_map[i])):
                if distance_map[i, j] < som_threshold:
                    indexes_coords.append(i)
                    indexes_coords.append(j)
                    indexes_dist.append(distance_map[i, j])

        coord_array = np.array(indexes_coords).reshape(
            int(len(indexes_coords) / 2), 2)
        dist_array = np.array(indexes_dist).reshape(int(len(indexes_dist)), 1)

        clustering = HDBSCAN(min_cluster_size=5)
        y_pred = clustering.fit_predict(coord_array)

        next_cluster_num = get_next_cluster_num()

        self.y = []
        filtered_coords = []
        filtered_dists = []
        for i in range(0, len(y_pred)):
            if y_pred[i] < 0:
                continue
            filtered_coords.append(coord_array[i])
            filtered_dists.append(dist_array[i])
            self.y.append(int(y_pred[i]) + int(next_cluster_num))

        self.y = np.array(self.y, dtype='int')

        logger.info('Next cluster num: ' + str(next_cluster_num))

        clusters_codes = pd.DataFrame(self.y, columns=['cl'])['cl'].unique()

        self.n_clusters = len(clusters_codes)

        coord_array = np.array(filtered_coords)
        dist_array = np.array(filtered_dists)

        self.index_array = np.concatenate((coord_array, dist_array), axis=1)

        logger.info('Cluster codes (' + str(self.n_clusters) + '):')
        logger.info(clusters_codes)