예제 #1
0
    def perform_spectral_clustering(self, no_clusters):
        """Run scikit-learn's SpectralClustering on ``self.distance_matrix``.

        A summary of the fit is stored in ``self.spectral_results`` and
        printed with ``print_dict``; the affinity matrix built by the
        estimator is stored in ``self.affinity_matrix``.

        :param no_clusters: value passed to the estimator as ``n_clusters``.
        """
        # NOTE(review): the estimator is created with its default affinity,
        # so the distance matrix is handed over as plain input data --
        # confirm that this is what is wanted.
        model = SpectralClustering(n_clusters=no_clusters)
        model.fit(self.distance_matrix)

        summary = {}
        summary["parameters"] = model.get_params()
        summary["labels"] = model.labels_
        # Highest label value plus one.
        summary["n_clusters"] = np.unique(model.labels_).max() + 1
        summary["clusters"] = label_cnt_dict(model.labels_)
        self.spectral_results = summary

        print_dict(self.spectral_results)

        # Affinity matrix computed by the estimator during fit.
        self.affinity_matrix = model.affinity_matrix_
def do_spectral_clustering(target_csv, num_cluster=24):
    """Cluster the rows of a CSV file with spectral clustering and save labels.

    Reads ``<CONFIG.CSV_PATH>/<target_csv>.csv``, clusters a random half of
    the rows, clusters the remaining rows with a second fit, prints three
    clustering quality scores and writes the labels (sorted by index) to
    ``clustered_spectral_<target_csv>.csv`` in the same directory.

    :param target_csv: base name of the CSV file, without the ``.csv`` suffix.
    :param num_cluster: number of clusters requested from the estimator.
    """
    df_data = pd.read_csv(os.path.join(CONFIG.CSV_PATH, target_csv + '.csv'),
                          index_col=0,
                          header=0,
                          encoding='utf-8-sig')
    df_data.index.name = 'short_code'
    print(df_data.iloc[:100])
    print(df_data.shape)

    start_time = time.time()
    ds_data = df_data.sample(frac=0.5)
    # Rows that were not sampled. Dropping by label replaces the former
    # ``.loc[set(...)]`` lookup: sets are not accepted as indexers by
    # current pandas releases.
    ns_data = df_data.drop(index=ds_data.index)
    clustering = SpectralClustering(n_clusters=num_cluster,
                                    random_state=42,
                                    assign_labels='discretize')
    clustering.fit(ds_data)

    print("time elapsed for clustering: " + str(time.time() - start_time))
    print(clustering.get_params())
    print(clustering.labels_)
    count_percentage(clustering.labels_)

    start_time = time.time()
    result_ds = pd.DataFrame(data=clustering.labels_,
                             index=ds_data.index,
                             columns=['cluster'])
    # NOTE(review): fit_predict runs a second, independent fit on ns_data,
    # so label ids of the two halves are not guaranteed to denote the same
    # clusters -- confirm that this is acceptable for the saved result.
    ns_label = clustering.fit_predict(ns_data)
    result_ns = pd.DataFrame(data=ns_label,
                             index=ns_data.index,
                             columns=['cluster'])
    result_df = pd.concat([result_ds, result_ns])
    result_df.sort_index(inplace=True)
    print("time elapsed for prediction: " + str(time.time() - start_time))

    start_time = time.time()
    # The labels are ordered by index, so the features have to be ordered
    # the same way; otherwise row i would be scored against the label of a
    # different row whenever the CSV is not already sorted by index.
    features = df_data.sort_index()
    labels = result_df['cluster']
    print("calinski_harabasz_score: ",
          calinski_harabasz_score(features, labels))
    print("silhouette_score: ",
          silhouette_score(features, labels))
    print("davies_bouldin_score: ",
          davies_bouldin_score(features, labels))
    print("time elapsed for scoring: " + str(time.time() - start_time))
    result_df.to_csv(os.path.join(CONFIG.CSV_PATH,
                                  'clustered_spectral_' + target_csv + '.csv'),
                     encoding='utf-8-sig')
예제 #3
0
def cluster(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """
    Fits a nearest-neighbors spectral clustering model and returns its labels.

    When ``PLOTTING_MODE`` is not ``'none'``, two scatter plots are drawn
    through the module-level ``plotter``: the samples colored by cluster,
    and the samples colored by class with the clusters passed alongside.

    :param x: the x train values.
    :param y: the label values, used only for the class/cluster plot.
    :return: the clustering labels.
    """
    logger.log('Creating model...')
    clustering = SpectralClustering(affinity='nearest_neighbors',
                                    n_clusters=5,
                                    n_neighbors=250,
                                    random_state=0,
                                    n_jobs=-1)
    clustering_params = clustering.get_params()
    logger.log('Applying Spectral Clustering with params: \n{}'.format(
        clustering_params))

    logger.log('Fitting...')
    start_time = time.perf_counter()
    clustering.fit(x)
    elapsed = time.perf_counter() - start_time
    # Fixed-point format: a bare '.3' precision switches to scientific
    # notation for longer durations (e.g. '1.23e+02').
    logger.log('Model has been fit in {:.3f} seconds.'.format(elapsed))

    if PLOTTING_MODE != 'none':
        n_clusters = clustering_params['n_clusters']
        n_neighbors = clustering_params['n_neighbors']

        # Plot resulting clusters.
        plotter.subfolder = 'graphs/Spectral Clustering/clusters'
        plotter.filename = 'after_LLE_c={}-n={}'.format(n_clusters,
                                                        n_neighbors)
        plotter.xlabel = 'first feature'
        plotter.ylabel = 'second feature'
        plotter.title = 'Spectral Clustering after LLE\nClusters: {}, Neighbors: {}' \
            .format(n_clusters, n_neighbors)
        plotter.scatter(x, clustering.labels_, clustering=True)

        # Plot classes compared to clusters; only the subfolder is changed,
        # the remaining plotter settings are left as set above.
        plotter.subfolder = 'graphs/Spectral Clustering/classes'
        plotter.scatter(x,
                        y,
                        clusters=clustering.labels_,
                        class_labels=helpers.datasets.get_gene_name)

    return clustering.labels_
예제 #4
0
def _spectral_clustering(table,
                         input_cols,
                         prediction_col='prediction',
                         n_clusters=8,
                         eigen_solver=None,
                         random_state=None,
                         n_init=10,
                         gamma=1.0,
                         affinity='rbf',
                         n_neighbors=10,
                         eigen_tol=0.0,
                         assign_labels='kmeans',
                         degree=3,
                         coef0=1):
    """Fit spectral clustering on selected columns and build a report.

    :param table: input table; ``table[input_cols]`` is clustered.
    :param input_cols: names of the columns used as features.
    :param prediction_col: name of the output column holding the labels.
    :param eigen_solver: forwarded to the estimator; the string ``'None'``
        is treated as ``None``.
    :return: dict with ``'out_table'`` (a copy of ``table`` plus the label
        column) and ``'model'`` (fitted estimator, input columns and the
        rendered report).

    The remaining keyword arguments are forwarded unchanged to
    ``SpectralClustering``.
    """
    inputarr = table[input_cols]

    _eigen_solver = None if eigen_solver == 'None' else eigen_solver
    sc = SpectralClustering(n_clusters=n_clusters,
                            eigen_solver=_eigen_solver,
                            random_state=random_state,
                            n_init=n_init,
                            gamma=gamma,
                            affinity=affinity,
                            n_neighbors=n_neighbors,
                            eigen_tol=eigen_tol,
                            assign_labels=assign_labels,
                            degree=degree,
                            coef0=coef0)

    sc.fit(inputarr)

    # Estimator parameter name -> label shown in the report.
    label_name = {
        'n_clusters': 'N Clusters',
        'eigen_solver': 'Eigen Solver',
        'random_state': 'Seed',
        'n_init': 'N Init',
        'gamma': 'Gamma',
        'affinity': 'Affinity',
        'n_neighbors': 'N Neighbors',
        'eigen_tol': 'Eigen Tolerance',
        'assign_labels': 'Assign Labels',
        'degree': 'Degree',
        'coef0': 'Zero Coefficient'
    }
    get_param = sc.get_params()
    # Built with the DataFrame constructor: DataFrame.from_items no longer
    # exists in current pandas releases.
    param_table = pd.DataFrame(
        {
            'Parameter': list(label_name.values()),
            'Value': [get_param[key] for key in label_name],
        },
        columns=['Parameter', 'Value'])

    labels = sc.labels_
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)

    # A 2-D PCA projection is only produced for more than one input column.
    multi_column = len(input_cols) > 1
    if multi_column:
        pca2_model = PCA(n_components=2).fit(inputarr)
        pca2 = pca2_model.transform(inputarr)

    # Pass a limit of 100 to the samples plot for tables with more than
    # 100 rows, otherwise no limit.
    sample_limit = 100 if len(table.index) > 100 else None
    fig_samples = _spectral_clustering_samples_plot(
        labels, table, input_cols, sample_limit, n_clusters, colors)

    rb = BrtcReprBuilder()
    if multi_column:
        fig_pca = _spectral_clustering_pca_plot(labels, pca2_model, pca2,
                                                n_clusters, colors)
        rb.addMD(
            strip_margin("""
        | ## Spectral Clustering Result
        | - Samples
        | {fig_samples}
        | {fig_pca}
        | ### Parameters
        | {params}
        """.format(fig_samples=fig_samples,
                   fig_pca=fig_pca,
                   params=pandasDF2MD(param_table))))
    else:
        # Heading corrected: it previously read "Mean Shift Result".
        rb.addMD(
            strip_margin("""
        | ## Spectral Clustering Result
        | - Samples
        | {fig_samples}
        | ### Parameters
        | {params}
        """.format(fig_samples=fig_samples, params=pandasDF2MD(param_table))))

    model = _model_dict('spectral_clustering')
    model['model'] = sc
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = labels
    return {'out_table': out_table, 'model': model}
예제 #5
0
파일: eda.py 프로젝트: guptaanmol184/craved
    def perform_spectral_clustering(self,
                                    no_clusters,
                                    affinity='rbf',
                                    gamma=1.0,
                                    n_neighbors=10,
                                    pass_labels=False,
                                    n_init=10,
                                    force_manual=False):
        """Spectral clustering of ``self.data``; results go to ``self.spectral_results``.

        With ``force_manual`` the pipeline is carried out step by step
        (affinity matrix, normalised graph Laplacian, eigen-decomposition,
        K-Means on the embedding); otherwise scikit-learn's
        ``SpectralClustering`` is used. The result dict is printed with
        ``print_dict`` in both cases.

        :param no_clusters: number of clusters (also the embedding size in
            the manual branch).
        :param affinity: ``'rbf'`` or ``'nearest_neighbors'``.
        :param gamma: RBF kernel coefficient.
        :param n_neighbors: neighbours used for the kNN affinity graph.
        :param pass_labels: when true, ``self.class_labels`` is handed to
            the visualisation helpers / to ``fit`` as ``y``.
        :param n_init: number of K-Means initialisations.
        :param force_manual: select the step-by-step implementation.
        :raises ValueError: manual branch only, for an unknown ``affinity``.
        """
        if force_manual:
            if not hasattr(self, "distance_matrix"):
                self.comp_distance_matrix()

            if affinity == 'rbf':
                self.affinity_matrix = np.exp(-gamma * self.distance_matrix**2)

            elif affinity == 'nearest_neighbors':
                connectivity = kneighbors_graph(
                    self.data, n_neighbors=n_neighbors,
                    include_self=True).toarray()
                # A kNN graph is generally not symmetric, while the
                # eigen-solver used below requires a symmetric matrix.
                self.affinity_matrix = 0.5 * (connectivity + connectivity.T)

            else:
                # ValueError is still caught by ``except Exception`` handlers.
                raise ValueError("Affinity is NOT recognised as VALID ...")

            print("Computed Affinity Matrix ...")

            # Normalised graph Laplacian; dd is the diagonal term returned
            # because of return_diag=True.
            lap, dd = laplacian(self.affinity_matrix,
                                normed=True,
                                return_diag=True)
            lap *= -1
            print("Computed Graph Laplacian ...")

            lambdas, diffusion_map = np.linalg.eigh(lap)
            print("Performed Eigen-decomposition ...")

            # eigh returns eigenvalues in ascending order, so the last
            # no_clusters eigenvectors of the negated Laplacian are kept.
            embedding = diffusion_map.T[(self.n_samples - no_clusters):] * dd

            # Deterministic vector flip: give each eigenvector the sign of
            # its largest-magnitude component.
            sign = np.sign(embedding[range(embedding.shape[0]),
                                     np.argmax(np.abs(embedding), axis=1)])
            embedding = embedding.T * sign

            if no_clusters == 2:
                visualise_2D(
                    embedding.T[0], embedding.T[1],
                    (self.class_labels) if pass_labels == True else None)

            elif no_clusters == 3:
                visualise_3D(
                    embedding.T[0], embedding.T[1], embedding.T[2],
                    (self.class_labels) if pass_labels == True else None)

            print("Performing K-Means clustering in eigen-space")
            # n_init is now forwarded (it was only recorded before), and the
            # n_jobs argument is dropped because current scikit-learn
            # releases of KMeans do not accept it.
            kmeans_clusterer = KMeans(n_clusters=no_clusters, n_init=n_init)
            kmeans_clusterer.fit(embedding)

            spectral_params = {
                "affinity": affinity,
                "gamma": gamma,
                "n_neighbors": n_neighbors,
                "n_init": n_init
            }

            self.spectral_results = {
                "parameters": spectral_params,
                "labels": kmeans_clusterer.labels_,
                "n_clusters": np.unique(kmeans_clusterer.labels_).max() + 1,
                "clusters": label_cnt_dict(kmeans_clusterer.labels_)
            }

        else:
            spectral_clusterer = SpectralClustering(n_clusters=no_clusters,
                                                    gamma=gamma,
                                                    affinity=affinity,
                                                    n_neighbors=n_neighbors,
                                                    n_init=n_init)
            spectral_clusterer.fit(
                self.data,
                y=(self.class_labels if pass_labels is True else None))
            self.spectral_results = {
                "parameters": spectral_clusterer.get_params(),
                "labels": spectral_clusterer.labels_,
                "n_clusters": np.unique(spectral_clusterer.labels_).max() + 1,
                "clusters": label_cnt_dict(spectral_clusterer.labels_)
            }

        print_dict(self.spectral_results)