Example #1
def goKmeans():
    clusteringNum = request.form['clusteringNum']
    dataset = json.loads(request.form.get('dataset'))
    if (clusteringNum == '' or int(float(clusteringNum)) < 2):
        clusteringNum = 2
    dataset = np.array(dataset)
    #dataset = np.delete(dataset, 0, 1)
    new_list = list(
        list(float(a) for a in b if BN.is_number(a)) for b in dataset)
    kmeans = KMeans(n_clusters=int(float(clusteringNum)),
                    random_state=0).fit(new_list)
    new_list_as_array = np.array(new_list)
    SilhouetteVisualize = SilhouetteVisualizer(kmeans)
    SilhouetteVisualize.fit(new_list_as_array)
    # cap the elbow search at 10 clusters, or at the sample count when smaller
    k_upper_bound = min(len(new_list), 10)
    KElbowVisualize = KElbowVisualizer(KMeans(), k=k_upper_bound)
    KElbowVisualize.fit(new_list_as_array)  # Fit the data to the visualizer
    silhouette = SilhouetteVisualize.silhouette_score_
    elbow = KElbowVisualize.elbow_value_
    return jsonify({
        'inputArray': list(new_list),
        'kmeansLabels': (kmeans.labels_.tolist()),
        'elbowValue': str(elbow),
        'silhouetteValue': ('%.3f' % silhouette)
    })
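For context, the handler above is a Flask view: it reads a form-encoded POST with a clusteringNum field and a JSON-encoded 2-D dataset, and answers with cluster labels plus elbow and silhouette diagnostics. A minimal client sketch, assuming the view is registered under a hypothetical /kmeans route of a running app:

import json
import requests

# The route and host are assumptions; only the handler body is shown above.
resp = requests.post(
    'http://localhost:5000/kmeans',
    data={
        'clusteringNum': '3',
        'dataset': json.dumps([[1.0, 2.0], [1.2, 1.9], [8.0, 9.0], [8.2, 9.1]]),
    })
print(resp.json()['kmeansLabels'])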
Example #2
    def clustering(self, k):
        word_vectors = self.__model_p__.wv
        KM_model = KMeans(n_clusters=k,
                          max_iter=1000,
                          random_state=1,
                          n_init=50).fit(X=word_vectors.vectors)

        center_closest = []
        for i in range(k):
            center_closest.append([
                el[0] for el in word_vectors.similar_by_vector(
                    KM_model.cluster_centers_[i], topn=15, restrict_vocab=None)
            ])

        metric_str = 'euclidean'
        score = silhouette_score(word_vectors.vectors,
                                 KM_model.predict(word_vectors.vectors),
                                 metric=metric_str)
        print("silhouette_score:", score)

        SVmodel = SilhouetteVisualizer(KM_model, is_fitted=True)
        SVmodel.fit(word_vectors.vectors)
        SVmodel.show()
        words = pd.DataFrame(word_vectors.vocab.keys(), columns=['words'])
        words['vectors'] = words.words.apply(lambda x: word_vectors[f'{x}'])
        words['cluster'] = words.vectors.apply(
            lambda x: KM_model.predict([np.array(x)]))
        words.cluster = words.cluster.apply(lambda x: x[0])
        words['closeness_score'] = words.apply(
            lambda x: 1 / (KM_model.transform([x.vectors]).min()), axis=1)

        return KM_model, center_closest, score, words
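The returned words frame pairs every vocabulary entry with its cluster and a closeness_score (the inverse of its distance to the nearest centroid), so larger scores mean more central words. A hedged inspection sketch, assuming an instance obj wrapping a trained pre-4.0 gensim Word2Vec model (the wv.vocab attribute used above was replaced by wv.key_to_index in gensim 4.0):

# Hypothetical follow-up: print the five most central words of each cluster
KM_model, center_closest, score, words = obj.clustering(k=5)
for c, grp in words.groupby('cluster'):
    print(c, grp.nlargest(5, 'closeness_score').words.tolist())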
Example #3
def silhouettevisual(model, X, graph):
    visualizer = SilhouetteVisualizer(
        model,
        colors='yellowbrick',
        title=" Silhouette Plot of KMeans Clustering for " + graph)
    visualizer.fit(X)
    visualizer.show()
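A minimal driver for the helper above; the blob dataset and k=4 are assumptions chosen for illustration:

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=500, centers=4, random_state=0)  # assumed data
silhouettevisual(KMeans(n_clusters=4, random_state=0), X, "4 blobs")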
Example #4
def log_silhouette_chart(model, X, experiment=None, **kwargs):
    """Log Silhouette Coefficients charts for KMeans clusterer.

    Charts are computed for j = 2, 3, ..., n_clusters.

    Make sure you created an experiment by using ``neptune.create_experiment()`` before you use this method.

    Tip:
        Check `Neptune documentation <https://docs.neptune.ai/integrations/scikit_learn.html>`_ for the full example.

    Args:
        model (:obj:`KMeans`):
            | KMeans object.
        X (:obj:`ndarray`):
            | Training instances to cluster.
        experiment (:obj:`neptune.experiments.Experiment`, optional, default is ``None``):
            | Neptune ``Experiment`` object to control to which experiment you log the data.
            | If ``None``, log to the currently active (most recent) experiment.
        kwargs:
            KMeans parameters.

    Returns:
        ``None``

    Examples:
        .. code:: python3

            km = KMeans(n_init=11, max_iter=270)
            X, y = make_blobs(n_samples=579, n_features=17, centers=7, random_state=28743)

            neptune.init('my_workspace/my_project')
            neptune.create_experiment()

            log_silhouette_chart(km, X=X, n_clusters=12)
    """
    assert isinstance(model,
                      KMeans), 'Model should be sklearn KMeans instance.'
    exp = _validate_experiment(experiment)

    model.set_params(**kwargs)

    n_clusters = model.get_params()['n_clusters']

    for j in range(2, n_clusters + 1):
        model.set_params(**{'n_clusters': j})
        model.fit(X)

        try:
            fig, ax = plt.subplots()
            visualizer = SilhouetteVisualizer(model, is_fitted=True, ax=ax)
            visualizer.fit(X)
            visualizer.finalize()
            exp.log_image(
                'charts_sklearn',
                fig,
                image_name='Silhouette Coefficients for k={}'.format(j))
            plt.close(fig)
        except Exception as e:
            print('Did not log Silhouette Coefficients chart. Error {}'.format(
                e))
Example #5
def plot_cluster_silhouette(estimator, dataset, version):
    visualizer = SilhouetteVisualizer(estimator, colors='yellowbrick')
    visualizer.fit(data.DATA[dataset][version]['x_train'])
    visualizer.show(
        f'{PLOTS_FOLDER}/{dataset}/{version}/{dataset}_{version}_{estimator.__class__.__name__}_cluster_silhouettes_k{estimator.n_clusters}.png'
    )
    plt.clf()
Example #6
def kmeans_exp():

    with open('features_GMM.csv', mode='r') as feature_file:
        feature_reader = csv.reader(feature_file,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
        i = 0
        for fs in feature_reader:
            if i > 0:
                print(f"user n#{i}")
                print(fs)
                fs.pop()
                training_list, testing_dict, training_fs_list, validation_fs_dict = load_dataset(
                    i)
                training_list = training_list[0:16] + training_list[
                    21:25] + training_list[36:40]
                # DataFrame.append was removed in pandas 2.0; build with concat
                df_training = pd.concat(training_list)

                # knn = KNeighborsClassifier()
                model = KMeans(n_clusters=6, random_state=42).fit(df_training)

                for signature in testing_dict['genuine']:
                    cluster = model.predict(signature)
                    print("testing signature:")
                    occurences = Counter(cluster)
                    print(occurences)

                visualizer = SilhouetteVisualizer(model)
                visualizer.fit(df_training)
                visualizer.show()

            i += 1
Example #7
def do_kmeans(df, k):
    k_means = KMeans(init='k-means++', n_clusters=k, n_init=10, max_iter=1000, random_state=40)
    k_means.fit(df)
    wcss = k_means.inertia_
    sil = silhouette_score(df, k_means.labels_)
    
    plt.style.use('default')

    sample_silhouette_values = silhouette_samples(df, k_means.labels_)
    sizes = 200 * sample_silhouette_values

    plt.figure(figsize=(16, 10))
    plt.grid(True)

    plt.scatter(df.iloc[:, 0], df.iloc[:, 1], s=sizes, c=k_means.labels_)
    plt.scatter(k_means.cluster_centers_[:, 0], k_means.cluster_centers_[:, 1], marker='x', s=300, c="black")

    plt.title("K-Means (K={}, WCSS={:.2f}, Sil={:.2f})".format(k, wcss, sil), fontsize=20)
    plt.xlabel('Age', fontsize=22)
    plt.ylabel('Income', fontsize=22)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.show()

    visualizer = SilhouetteVisualizer(k_means)
    visualizer.fit(df)
    visualizer.poof()
    fig = visualizer.ax.get_figure()
    
    print("K={}, WCSS={:.2f}, Sil={:.2f}".format(k, wcss, sil))
Example #8
def silhouette(ax=None):
    X, y = make_blobs(centers=12, n_samples=1000, n_features=16, shuffle=True)

    viz = SilhouetteVisualizer(KMeans(9), ax=ax)
    viz.fit(X)
    viz.finalize()

    return viz
Example #9
def create_silhouette_chart(model, X, **kwargs):
    """Create silhouette coefficients charts for KMeans clusterer.

    Charts are computed for j = 2, 3, ..., n_clusters.

    Tip:
        Check Sklearn-Neptune integration
        `documentation <https://docs-beta.neptune.ai/essentials/integrations/machine-learning-frameworks/sklearn>`_
        for the full example.

    Args:
        model (:obj:`KMeans`):
            | KMeans object.
        X (:obj:`ndarray`):
            | Training instances to cluster.
        kwargs:
            KMeans parameters.

    Returns:
        ``neptune.types.FileSeries`` object that you can assign to run's ``base_namespace``.

    Examples:
        .. code:: python3

            import neptune.new.integrations.sklearn as npt_utils

            km = KMeans(n_init=11, max_iter=270)
            X, y = make_blobs(n_samples=579, n_features=17, centers=7, random_state=28743)

            run = neptune.init(project='my_workspace/my_project')
            run['kmeans/silhouette'] = npt_utils.create_silhouette_chart(km, X, n_clusters=12)
    """
    assert isinstance(model,
                      KMeans), 'Model should be sklearn KMeans instance.'

    charts = []

    model.set_params(**kwargs)

    n_clusters = model.get_params()['n_clusters']

    for j in range(2, n_clusters + 1):
        model.set_params(**{'n_clusters': j})
        model.fit(X)

        try:
            fig, ax = plt.subplots()
            visualizer = SilhouetteVisualizer(model, is_fitted=True, ax=ax)
            visualizer.fit(X)
            visualizer.finalize()
            charts.append(neptune.types.File.as_image(fig))
            plt.close(fig)
        except Exception as e:
            print('Did not log Silhouette Coefficients chart. Error {}'.format(
                e))

    return neptune.types.FileSeries(charts)
Example #10
def showSilhouette():
    # Make 8 blobs dataset
    X, y = make_blobs(centers=8)
    # Instantiate the clustering model and visualizer
    model = MiniBatchKMeans(6)
    visualizer = SilhouetteVisualizer(model)

    visualizer.fit(X)  # Fit the training data to the visualizer
    visualizer.poof()  # Draw/show/poof the data
Example #11
def silhouette(matrix, k):
    """
    This function is also not used explicitly, since it only shows whether the chosen 'k' is good or not.
    :param matrix: tf-idf matrix
    :param k: chosen k (from the elbow method)
    :return: shows a graph of each cluster's internal similarity and its separation from the other clusters.
    """
    model_kmeans = KMeans(n_clusters=k, max_iter=200)
    silhouette = SilhouetteVisualizer(model_kmeans)
    silhouette.fit(matrix)
    silhouette.poof()
Example #12
def Silhouette_plot(x, from_k, to_k):
    sil_score = []
    for k in range(from_k, to_k + 1):
        # Instantiate the clustering model and visualizer
        m = KMeans(n_clusters=k)
        visualizer = SilhouetteVisualizer(m)
        visualizer.fit(x)
        # Draw/show/poof the data
        visualizer.poof()
        sil_score.append([visualizer.silhouette_score_.round(3), k])
    return sil_score
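Because each entry is a [score, k] pair, max() compares scores first, so the best k can be read straight off the result. A short sketch with assumed blob data:

from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=4, random_state=0)  # assumed data
best_score, best_k = max(Silhouette_plot(X, 2, 6))
print(f'best k = {best_k} (silhouette = {best_score})')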
Example #13
    def silhouette_score_plot(self, directory):
        self._check_model()
        plt.figure(figsize=(10, 10))
        visualizer = SilhouetteVisualizer(
            self.best_estimator_.named_steps['clustering'],
            colors='yellowbrick',
            is_fitted=True)
        visualizer.fit(self.data_preprocessed)
        visualizer.show(directory + "/silhouette_score.png")
        plt.close()
Example #14
def silhouette_plot(text, model, cv):
    '''
    Loads in a saved model and produces a silhouette score plot
    '''
    path = 'models/{}'.format(model)
    pipe = load(path)
    kmeans = pipe.named_steps['kmeans']
    svd = pipe.named_steps['truncatedsvd']
    X = svd.fit_transform(cv)
    visualizer = SilhouetteVisualizer(kmeans, colors='sns_deep')
    visualizer.fit(X)
    visualizer.show(outpath="plots/Silhouette.png")
    plt.close()
Example #15
def silhouette(ax):
    from sklearn.cluster import KMeans
    from yellowbrick.cluster import SilhouetteVisualizer
    from sklearn.datasets import make_blobs

    kws = {
        'centers': 8,
        'n_samples': 1000,
        'n_features': 12,
        'shuffle': True,
    }

    X, y = make_blobs(**kws)
    visualizer = SilhouetteVisualizer(KMeans(6), ax=ax)
    visualizer.title = "Silhouette Clusters for K-Means (k=6) on an 8 Blob Dataset"
    visualizer.fit(X)
    return visualizer
Example #16
def investigate_cluster_nr(k_min, k_max, pca_scores):
    rows, cols = 6, 6  # REMEMBER TO ADJUST
    fig, ax = plt.subplots(rows, cols, figsize=(15, 8))
    row, col, first = 0, 0, 0
    for i in range(k_min, k_max):  # was 5, 30
        if first != 0:
            if col < cols - 1:
                col += 1
            elif col == cols - 1:
                col = 0
                row += 1
        first = 1
        kmeans_pca = KMeans(n_clusters=i, init='k-means++', random_state=42)
        kmeans_pca.fit(pca_scores)
        visualizer = SilhouetteVisualizer(kmeans_pca,
                                          colors='yellowbrick',
                                          ax=ax[row][col])
        visualizer.fit(pca_scores)
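The row/column bookkeeping above can also be expressed with enumerate and divmod; a behavior-equivalent sketch that derives the grid height from the k range instead of hard-coding it:

def investigate_cluster_nr_divmod(k_min, k_max, pca_scores, cols=6):
    rows = -(-(k_max - k_min) // cols)  # ceiling division
    fig, ax = plt.subplots(rows, cols, figsize=(15, 8), squeeze=False)
    for idx, i in enumerate(range(k_min, k_max)):
        row, col = divmod(idx, cols)
        kmeans_pca = KMeans(n_clusters=i, init='k-means++', random_state=42)
        visualizer = SilhouetteVisualizer(kmeans_pca,
                                          colors='yellowbrick',
                                          ax=ax[row][col])
        visualizer.fit(pca_scores)  # fit() also fits the wrapped model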
Example #17
def get_elbow_plot(X):

    output_text = ""
    try:
        model = KMeans(random_state=40)
        elbow_score = KElbowVisualizer(model, k=(1, 30))
        elbow_score.fit(X)
        elbow_value = elbow_score.elbow_value_
        model = KMeans(elbow_value, random_state=42)
        silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
        silhouette_visualizer.fit(X)

        output_text = """The optimal number of clusters is """ + \
                      str(silhouette_visualizer.n_clusters_) + """ and the silhouette score is """ + \
                      str(np.round(silhouette_visualizer.silhouette_score_, 2))
    except ValueError as e:
        print(e)

    return output_text
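A hedged driver for get_elbow_plot; the blob data is an assumption:

from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=500, centers=5, random_state=0)  # assumed data
print(get_elbow_plot(X))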
Example #18
def sc(f, g, krange):
    df = pd.read_table(f, index_col=0, header=0)
    if g == 'c':
        df = df.T
    min_k = krange[0]
    max_k = krange[1]
    num_fig = max_k - min_k + 1
    num_row = math.ceil(num_fig / 3.0)
    plt.figure(figsize=(12, num_row * 4))
    for k in range(min_k, max_k + 1):
        model = KMeans(k, random_state=1)
        visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
        plt.subplot(num_row, 3, k - min_k + 1)
        visualizer.fit(df)
        plt.title('k=%d silhouette score %0.2f' %
                  (k, visualizer.silhouette_score_))
        plt.grid(0)
        plt.tight_layout()
    plt.savefig(os.path.basename(f).replace('.txt', '_silhouette.pdf'))
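A hedged invocation of sc; the tab-separated file name is an assumption, and passing 'c' transposes the table so the columns are clustered:

sc('expression_matrix.txt', 'c', (2, 7))  # hypothetical input; k = 2..7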
Example #19
def main():
    xtrain1, xtest1, ytrain1, ytest1, xtrain2, xtest2, ytrain2, ytest2 = load_data(
    )
    km = KMeans(4)
    visualizer = SilhouetteVisualizer(km, colors='yellowbrick')
    visualizer.fit(xtrain1)
    visualizer.show()
    pred = km.fit_predict(xtrain1)
    print(metrics.homogeneity_score(ytrain1, pred))
    score(xtrain2, 20, ytrain2)
    elbowplot(xtrain2, 20, "distortion",
              "K Means Clustering Distortion vs Number of Clusters dat2",
              "figs/kmeans/kmeans_elbow_dat2.png")
    elbowplot(xtrain1, 100, "distortion",
              "K Means Clustering Distortion vs Number of Clusters dat1",
              "figs/kmeans/kmeans_elbow_dat1.png")
    elbowplot(xtrain2,
              40,
              "silhouette",
              "K Means Clustering Silhouette Score vs Number of Clusters dat2",
              "figs/kmeans/kmeans_silhouette_dat2.png",
              elbow=False)
    elbowplot(xtrain1,
              100,
              "silhouette",
              "K Means Clustering Silhouette Score vs Number of Clusters dat1",
              "figs/kmeans/kmeans_silhouette_dat1.png",
              elbow=False)
    elbowplot(
        xtrain2,
        20,
        "calinski_harabasz",
        "K Means Clustering Calinski Harabasz Score vs Number of Clusters dat2",
        "figs/kmeans/kmeans_calinski_dat2.png",
        elbow=False)
    elbowplot(
        xtrain1,
        100,
        "calinski_harabasz",
        "K Means Clustering Calinski Harabasz Score vs Number of Clusters dat1",
        "figs/kmeans/kmeans_calinski_dat1.png",
        elbow=False)
Example #20
def silhouette_yellowbrick(X, y, features):
    plt.switch_backend('agg')
    plt.clf()
    X_train, X_test, y_train, y_test = train_test_split(X[features],
                                                        y,
                                                        stratify=y,
                                                        test_size=0.01)
    X = pd.DataFrame(X_test, columns=features)
    y = pd.Series(y_test)
    n_clusters = y.nunique()
    model = MiniBatchKMeans(n_clusters)
    visualizer_sil = SilhouetteVisualizer(model, colors='yellowbrick')
    visualizer_sil.fit(X)
    visualizer_sil.finalize()

    return plt
Example #21
    def silhouette_plot(self, latent_data):
        """Silhouette Plots and Scores to determine optimal K in KMeans"""
        fig, ax = plt.subplots(2, 2, figsize=(15, 8))
        for i in [2, 3, 4, 5]:
            '''
            Create KMeans instance for different number of clusters
            '''
            km = KMeans(n_clusters=i, max_iter=10, random_state=0)
            q, mod = divmod(i, 2)
            '''
            Create SilhouetteVisualizer instance with KMeans instance
            Fit the visualizer
            '''
            visualizer = SilhouetteVisualizer(km,
                                              colors='yellowbrick',
                                              ax=ax[q - 1][mod])
            visualizer.fit(latent_data)

        fig.suptitle('Silhouette Plots for 2, 3, 4, 5 Cluster Centers',
                     fontsize=18)
        plt.savefig('Silhouette-Visualization.png', bbox_inches='tight')
        plt.close('all')
Example #22
def kmeans_silhouette_plots(tfidf, num_clusters=[3, 5, 7, 9, 11]):
    '''
    Vectorizer results are normalized, which makes KMeans behave as
    spherical k-means for better results. Since LSA/SVD results are
    not normalized, we have to redo the normalization.

    The best silhouette value is 1 and the worst value is -1. Values near 0
    indicate overlapping clusters. Negative values generally indicate that a
    sample has been assigned to the wrong cluster, as a different cluster is
    more similar.
    '''

    print('\nUse kmeans silhouette score to visualize Silhouette Coefficients')
    for k in num_clusters:
        start = datetime.now()

        svd = TruncatedSVD(n_components=50, n_iter=10, random_state=0)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)

        reduced = lsa.fit_transform(tfidf)

        model = KMeans(n_clusters=k, init='k-means++')

        # Instantiate the clustering model and visualizer
        visualizer = SilhouetteVisualizer(model)

        # Fit the training data to the visualizer
        visualizer.fit(reduced)
        visualizer.finalize()

        filename = r'images/silhouette_plots/kmeans_silh_plot_' + str(
            k) + '_clusters_' + str(tfidf.shape[0]) + '_docs.png'
        plt.savefig(filename)
        plt.close()

        end = datetime.now()
        print('            ' + filename)
        print("            Time taken: {}".format(end - start))
Example #23
def load_corpus(path):
    # Category names come from the subdirectory layout (this function head is
    # a reconstruction; the original excerpt began mid-function)
    categories = [c for c in os.listdir(path)
                  if os.path.isdir(os.path.join(path, c))]
    files, data, target = [], [], []

    # Load the data from the files in the corpus
    for cat in categories:
        for name in os.listdir(os.path.join(path, cat)):
            files.append(os.path.join(path, cat, name))
            target.append(cat)

            with open(os.path.join(path, cat, name), 'r') as f:
                data.append(f.read())

    # Return the data bunch for use similar to the newsgroups example
    return Bunch(
        categories=categories,
        files=files,
        data=data,
        target=target,
    )

corpus = load_corpus('hobbies')
tfidf  = TfidfVectorizer(stop_words='english')
docs   = tfidf.fit_transform(corpus.data)

# Instantiate the clustering model and visualizer
visualizer = SilhouetteVisualizer(KMeans(n_clusters=6))
visualizer.fit(docs)
visualizer.poof()

# Instantiate the clustering model and visualizer
visualizer = KElbowVisualizer(KMeans(), metric='silhouette', k=[4,10])
visualizer.fit(docs)
visualizer.poof()
Example #24
# Elbow method: mean distance from each sample to its closest centroid for a
# range of k values (the head of this excerpt was truncated; the loop below is
# reconstructed around the surviving fragment)
from scipy.spatial.distance import cdist

K = range(1, 10)
distortions = []
for k in K:
    kmeanModel = KMeans(n_clusters=k).fit(df_normalized)
    distortions.append(
        sum(np.min(cdist(df_normalized, kmeanModel.cluster_centers_,
                         'euclidean'),
                   axis=1)) / df_normalized.shape[0])
# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

# Compute the Silhouette graph for different numbers of clusters to select
# the optimal number of clusters
from sklearn.cluster import KMeans
from yellowbrick.cluster import SilhouetteVisualizer
for n_clusters in range(2, 9):
    model = SilhouetteVisualizer(KMeans(n_clusters))
    model.fit(df_normalized)
    model.poof()

# Utilize TSNE to visualize data
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import pylab as pl
num_of_clusters = 4
kmeans = KMeans(n_clusters=num_of_clusters)
kmeans.fit(df_normalized)

X = TSNE(n_components=2).fit_transform(df_normalized)

for i in range(0, X.shape[0]):
    if kmeans.labels_[i] == 0:
        c1 = pl.scatter(X[i, 0], X[i, 1], c='red')
Example #25
X = Data
distortions = []
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    distortions.append(kmeans.inertia_)
ks = range(2, 10)
fig = plt.figure(figsize=(15, 5))
plt.plot(ks, distortions)
plt.grid(True)
plt.title('Elbow curve')

# silhouette method
from sklearn.metrics import silhouette_samples, silhouette_score
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k)
    y_pred = kmeans.fit_predict(Data)
    score = silhouette_score(Data, y_pred)
    print("For k = {}, silhouette score is {})".format(k, score))

from yellowbrick.cluster import SilhouetteVisualizer
# Instantiate the clustering model and visualizer
for k in range(2, 10):
    model = KMeans(k, random_state=42)
    plt.subplot(221)
    visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    visualizer.fit(Data)  # Fit the data to the visualizer (this also fits the model)
    plt.subplot(222)
    plt.scatter(Data[:, 0], Data[:, 1], c=model.labels_, cmap='rainbow')
    visualizer.show()  # Finalize and render the figure
Example #26
# Clustering Evaluation Imports
from functools import partial

from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import make_blobs as sk_make_blobs

from yellowbrick.cluster import SilhouetteVisualizer

# Helpers for easy dataset creation
N_SAMPLES = 1000
N_FEATURES = 12
SHUFFLE = True

# Make blobs partial
make_blobs = partial(sk_make_blobs,
                     n_samples=N_SAMPLES,
                     n_features=N_FEATURES,
                     shuffle=SHUFFLE)

if __name__ == '__main__':
    # Make 8 blobs dataset
    X, y = make_blobs(centers=8)

    # Instantiate the clustering model and visualizer
    model = MiniBatchKMeans(6)
    visualizer = SilhouetteVisualizer(model)

    visualizer.fit(X)  # Fit the training data to the visualizer
    visualizer.poof(outpath="images/silhouette.png")  # Draw/show/poof the data
Example #27
def explore_KMeans_clustering(
    df,
    num_cols=None,
    n_clusters=range(3, 5),
    include_silhouette=True,
    include_PCA=True,
    random_state=None,
):
    """create, fit and plot KMeans clustering on the dataset

    Parameters
    ----------
    df : pandas.DataFrame
        the dataset, should be transformed with StandardScaler
    num_cols : list, optional
        list of numeric column names, in case of None, get all numeric columns
    n_clusters : list, optional
        list of n_clusters hyperparams, by default range(3, 5)
    include_silhouette : bool, optional
        whether Silhouette plots should be generated, by default True
    include_PCA : bool, optional
        whether PCA plots should be generated, by default True
    random_state : int, optional
        a number that determines random number generation for centroid initialization, by default None

    Returns
    -------
    dict
        a dictionary with key=type of plot, value=list of plots

    Examples
    -------
    >>> original_df = pd.read_csv("/data/menu.csv")
    >>> numeric_features = eda.get_numeric_columns(original_df)
    >>> numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
    >>> preprocessor = make_column_transformer(
    >>>     (numeric_transformer, numeric_features)
    >>> )
    >>> df = pd.DataFrame(
    >>>     data=preprocessor.fit_transform(original_df), columns=numeric_features
    >>> )
    >>> explore_KMeans_clustering(df)
    """
    if num_cols is None:
        num_cols = get_numeric_columns(df)
    else:
        _verify_numeric_cols(df, num_cols)
    x = df[num_cols]
    results = {}
    if 1 in n_clusters:
        raise Exception("n_cluster cannot be 1")

    print("------------------------")
    print("K-MEANS CLUSTERING")
    print("------------------------")

    if len(n_clusters) > 1:
        print("Generating KElbow plot for KMeans.")
        # visualize using KElbowVisualizer
        kmeans = KMeans(random_state=random_state)

        plt.clf()
        fig, ax = plt.subplots()
        elbow_visualizer = KElbowVisualizer(kmeans, k=n_clusters, ax=ax)
        elbow_visualizer.fit(x)  # Fit the data to the visualizer
        elbow_visualizer.show()
        plt.close()
        elbow_visualizer.k = elbow_visualizer.elbow_value_  # fix printing issue
        results["KElbow"] = fig
    else:
        results["KElbow"] = None

    # visualize using SilhouetteVisualizer
    print("Generating Silhouette & PCA plots")
    silhouette_plots = []
    pca_plots = []
    for k in n_clusters:
        print(f"Number of clusters: {k}")

        kmeans = KMeans(k, random_state=random_state)

        if include_silhouette:
            fig, ax = plt.subplots()
            s_visualizer = SilhouetteVisualizer(kmeans, colors="yellowbrick", ax=ax)
            s_visualizer.fit(x)  # Fit the data to the visualizer
            s_visualizer.show()
            silhouette_plots.append(fig)
            # plt.clf()
            plt.close()

        else:
            silhouette_plots.append(None)

        # PCA plots
        if include_PCA:
            labels = kmeans.fit_predict(x)
            pca_fig = plot_pca_clusters(x, labels, random_state=random_state)
            pca_plots.append(pca_fig)
        else:
            pca_plots.append(None)

    results["Silhouette"] = silhouette_plots
    results["PCA"] = pca_plots

    return results
Example #28
def sil_score(x, from_k, to_k):
    # (function head reconstructed; the original excerpt began mid-function)
    sils = []
    for k in range(from_k, to_k + 1):
        m = KMeans(n_clusters=k).fit(x)
        sils.append([silhouette_score(x, m.labels_), k])
    # Compute the silhouette score for each sample
    # sample_silhouette_value = silhouette_samples(x, m.labels_)
    # print(sample_silhouette_value)
    return sils


ss = sil_score(x, 2, 5)
print(f'score={ss}')
print(f'optimum number of clusters = {max(ss)[1]}')
# Visualize Silhouette
# Instantiate the clustering model and visualizer
model = KMeans(n_clusters=3)
visualizer = SilhouetteVisualizer(model)
# Fit the training data to the visualizer
visualizer.fit(x)
# Draw/show/poof the data
visualizer.poof()
print(visualizer.silhouette_score_)  # near 1 is good


def Silhouette_plot(x, from_k, to_k):
    sil_score = []
    for k in range(from_k, to_k + 1):
        # Instantiate the clustering model and visualizer
        m = KMeans(n_clusters=k)
        visualizer = SilhouetteVisualizer(m)
        visualizer.fit(x)
        # Draw/show/poof the data
        visualizer.poof()
        sil_score.append([visualizer.silhouette_score_.round(3), k])
    return sil_score
Example #29
plt.figure()
plt.grid(True)
plt.plot(list(silhouettes.keys()), list(silhouettes.values()))
plt.title('K-Means, Elbow Method')
plt.xlabel("Number of clusters, K")
plt.ylabel("Silhouette")
plt.savefig(
    '869Individual_AssignmentQ1_graphs/jelwery-kmeans-elbow-silhouette.png')

# In[18]:

###sklearn.metrics.davies_bouldin_score(X, k_means.labels_)

visualizer = SilhouetteVisualizer(k_means)
visualizer.fit(X)
visualizer.poof()
fig = visualizer.ax.get_figure()
fig.savefig(
    '869Individual_AssignmentQ1_graphs/jelwery-kmeans-5-silhouette.png',
    transparent=False)

# # Answer to Question [1], Part [c]

# In[19]:

###Interpreting the Clusters

###Means
k_means.cluster_centers_
Example #30
def silhouette():
    X, _ = make_blobs(centers=8)
    oz = SilhouetteVisualizer(MiniBatchKMeans(6), ax=newfig())
    oz.fit(X)
    savefig(oz, "silhouette")
Example #31
def explore_DBSCAN_clustering(
    df,
    num_cols=None,
    metric="euclidean",
    eps=[0.5],
    min_samples=[5],
    include_silhouette=True,
    include_PCA=True,
    random_state=None,
):
    """fit and plot DBSCAN clustering algorithms

    Parameters
    ----------
    df : pandas.DataFrame
        the dataset, should be transformed with StandardScaler
    num_cols : list, optional
        list of numeric column names, in case of None, get all numeric columns
    metric : str, optional
        metric, by default "euclidean"
    eps : list, optional
        list of eps hyperparams, by default [0.5]
    min_samples: list, optional
        list of min_samples hyperparams, by default [5]
    include_silhouette : bool, optional
        whether Silhouette plots should be generated, by default True
    include_PCA : bool, optional
        whether PCA plots should be generated, by default True
    random_state : int, optional
        a number that determines random number generation for centroid initialization, by default None

    Returns
    -------
    Tuple
        list
            a list of n_clusters values returned by DBSCAN models
        dict
            a dictionary with key=type of plot, value=list of plots

    Examples
    -------
    >>> original_df = pd.read_csv("/data/menu.csv")
    >>> numeric_features = eda.get_numeric_columns(original_df)
    >>> numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
    >>> preprocessor = make_column_transformer(
    >>>     (numeric_transformer, numeric_features)
    >>> )
    >>> df = pd.DataFrame(
    >>>     data=preprocessor.fit_transform(original_df), columns=numeric_features
    >>> )
    >>> n_clusters, dbscan_plots = explore_DBSCAN_clustering(df)
    """
    if num_cols is None:
        num_cols = get_numeric_columns(df)
    else:
        _verify_numeric_cols(df, num_cols)

    x = df[num_cols]

    results = {}
    n_clusters = []

    s_plots = []
    pca_plots = []

    print("------------------------")
    print("DBSCAN CLUSTERING")
    print("------------------------")

    for e in eps:
        for ms in min_samples:
            dbscan = DBSCAN(eps=e, min_samples=ms, metric=metric)
            dbscan.fit(x)
            k = len(set(dbscan.labels_)) - (1 if -1 in dbscan.labels_ else 0)  # excluding the noise label (-1)
            n_clusters.append(k)
            print(f"eps={e}, min_samples={ms}, n_cluster={k}")
            if include_silhouette and k > 0:
                # generate the Silhouette plot; SilhouetteVisualizer expects a
                # KMeans-like estimator, so give the fitted DBSCAN an n_clusters
                # attribute and a predict() that returns its training labels
                dbscan.n_clusters = k
                dbscan.predict = lambda x: dbscan.labels_
                fig, ax = plt.subplots()
                s_visualizer = SilhouetteVisualizer(dbscan, colors="yellowbrick", ax=ax)
                s_visualizer.fit(x)
                s_visualizer.show()
                s_plots.append(fig)
                # plt.clf()
                plt.close()
            else:
                s_plots.append(None)

            if include_PCA:
                # generate the PCA plot
                pca_plot = plot_pca_clusters(x, dbscan.labels_, random_state=random_state)
                pca_plots.append(pca_plot)
            else:
                pca_plots.append(None)

    results["Silhouette"] = s_plots
    results["PCA"] = pca_plots

    return n_clusters, results
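A hedged usage sketch for the DBSCAN explorer, sweeping a small hyperparameter grid over an already-scaled frame (the data below is an assumption):

import pandas as pd
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)  # assumed data
df = pd.DataFrame(X, columns=[f'f{i}' for i in range(X.shape[1])])
n_clusters, plots = explore_DBSCAN_clustering(
    df, eps=[0.3, 0.5, 0.8], min_samples=[5, 10])
print(n_clusters)  # one cluster count per (eps, min_samples) pair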