def log_silhouette_chart(model, X, experiment=None, **kwargs):
    """Log Silhouette Coefficients charts for KMeans clusterer.

    Charts are computed for j = 2, 3, ..., n_clusters.

    Make sure you created an experiment by using ``neptune.create_experiment()`` before you use this method.

    Note:
        ``model`` is modified in place: ``kwargs`` are applied with ``set_params`` and the model
        is re-fitted for every ``j``. Errors raised by ``model.fit`` propagate to the caller;
        errors raised while drawing or logging a chart are printed and that chart is skipped.

    Tip:
        Check `Neptune documentation <https://docs.neptune.ai/integrations/scikit_learn.html>`_ for the full example.

    Args:
        model (:obj:`KMeans`):
            | KMeans object.
        X (:obj:`ndarray`):
            | Training instances to cluster.
        experiment (:obj:`neptune.experiments.Experiment`, optional, default is ``None``):
            | Neptune ``Experiment`` object to control to which experiment you log the data.
            | If ``None``, log to currently active, and most recent experiment.
        kwargs:
            KMeans parameters.

    Returns:
        ``None``

    Examples:
        .. code:: python3

            km = KMeans(n_init=11, max_iter=270)
            X, y = make_blobs(n_samples=579, n_features=17, centers=7, random_state=28743)

            neptune.init('my_workspace/my_project')
            neptune.create_experiment()

            log_silhouette_chart(km, X=X, n_clusters=12)
    """
    assert isinstance(model, KMeans), 'Model should be sklearn KMeans instance.'
    exp = _validate_experiment(experiment)

    model.set_params(**kwargs)
    n_clusters = model.get_params()['n_clusters']

    for j in range(2, n_clusters + 1):
        model.set_params(n_clusters=j)
        model.fit(X)

        fig = None
        try:
            fig, ax = plt.subplots()
            visualizer = SilhouetteVisualizer(model, is_fitted=True, ax=ax)
            visualizer.fit(X)
            visualizer.finalize()
            exp.log_image(
                'charts_sklearn', fig,
                image_name='Silhouette Coefficients for k={}'.format(j))
        except Exception as e:
            print('Did not log Silhouette Coefficients chart. Error {}'.format(
                e))
        finally:
            # Close the figure on the failure path too; otherwise every
            # swallowed error leaves an open matplotlib figure behind.
            if fig is not None:
                plt.close(fig)
def silhouette(ax=None):
    """Build a finalized silhouette visualizer on synthetic blob data.

    Generates 1000 samples with 16 features around 12 centers, fits a
    9-cluster KMeans through ``SilhouetteVisualizer`` and finalizes the plot.

    Args:
        ax: Optional axes object forwarded to ``SilhouetteVisualizer``.

    Returns:
        The fitted and finalized ``SilhouetteVisualizer``.
    """
    # The generated labels are not needed for a silhouette plot.
    samples, _ = make_blobs(
        n_samples=1000, n_features=16, centers=12, shuffle=True)

    clusterer = KMeans(9)
    visualizer = SilhouetteVisualizer(clusterer, ax=ax)
    visualizer.fit(samples)
    visualizer.finalize()
    return visualizer
def create_silhouette_chart(model, X, **kwargs):
    """Create silhouette coefficients charts for KMeans clusterer.

    Charts are computed for j = 2, 3, ..., n_clusters.

    Note:
        ``model`` is modified in place: ``kwargs`` are applied with ``set_params`` and the model
        is re-fitted for every ``j``. Errors raised by ``model.fit`` propagate to the caller;
        a chart that fails to render is reported with ``print`` and left out of the result.

    Tip:
        Check Sklearn-Neptune integration
        `documentation <https://docs-beta.neptune.ai/essentials/integrations/machine-learning-frameworks/sklearn>`_
        for the full example.

    Args:
        model (:obj:`KMeans`):
            | KMeans object.
        X (:obj:`ndarray`):
            | Training instances to cluster.
        kwargs:
            KMeans parameters.

    Returns:
        ``neptune.types.FileSeries`` object that you can assign to run's ``base_namespace``.
        It contains one image per chart that was created successfully.

    Examples:
        .. code:: python3

            import neptune.new.integrations.sklearn as npt_utils

            km = KMeans(n_init=11, max_iter=270)
            X, y = make_blobs(n_samples=579, n_features=17, centers=7, random_state=28743)

            run = neptune.init(project='my_workspace/my_project')
            run['kmeans/silhouette'] = npt_utils.create_silhouette_chart(km, X, n_clusters=12)
    """
    assert isinstance(model, KMeans), 'Model should be sklearn KMeans instance.'

    charts = []

    model.set_params(**kwargs)
    n_clusters = model.get_params()['n_clusters']

    for j in range(2, n_clusters + 1):
        model.set_params(n_clusters=j)
        model.fit(X)

        fig = None
        try:
            fig, ax = plt.subplots()
            visualizer = SilhouetteVisualizer(model, is_fitted=True, ax=ax)
            visualizer.fit(X)
            visualizer.finalize()
            charts.append(neptune.types.File.as_image(fig))
        except Exception as e:
            print('Did not log Silhouette Coefficients chart. Error {}'.format(
                e))
        finally:
            # Close the figure on the failure path too; otherwise every
            # swallowed error leaves an open matplotlib figure behind.
            if fig is not None:
                plt.close(fig)

    return neptune.types.FileSeries(charts)
def shiloutte_score_plot(self, directory):
    """Render the silhouette plot of the best clustering step into ``directory``.

    Runs ``self._check_model()`` first, then draws on a fresh 10x10 figure
    using the ``'clustering'`` step of ``self.best_estimator_`` and
    ``self.data_preprocessed``. The visualizer is asked to show the plot at
    ``<directory>/shiloutte_score.png``.

    Args:
        directory: Prefix that ``"/shiloutte_score.png"`` is appended to with
            ``+`` (so it is presumably a ``str`` -- confirm against callers).
    """
    self._check_model()
    plt.figure(figsize=(10, 10))

    # The estimator was already fitted, so the visualizer must not refit it.
    clusterer = self.best_estimator_.named_steps['clustering']
    viz = SilhouetteVisualizer(
        clusterer, colors='yellowbrick', is_fitted=True)
    viz.fit(self.data_preprocessed)

    output_path = directory + "/shiloutte_score.png"
    viz.show(output_path)
    viz.finalize()
    plt.close()
def silhoutte_yellowbrick(
    X,
    y,
    features,
):
    """Draw a silhouette plot for a small stratified sample of the data.

    Switches matplotlib to the ``agg`` backend, clears the current figure,
    takes a stratified 1% split of ``X[features]`` / ``y`` and fits a
    ``MiniBatchKMeans`` with as many clusters as there are distinct labels in
    that split.

    Args:
        X: Data indexable by ``features`` (``X[features]`` is what gets split).
        y: Labels used both for stratification and for the cluster count.
        features: Column selection applied to ``X`` and reused as the column
            names of the sampled frame.

    Returns:
        The ``plt`` module, with the finalized silhouette plot on the current figure.
    """
    plt.switch_backend('agg')
    plt.clf()

    # Only the 1% "test" side of the split is kept; the rest is discarded.
    _, X_test, _, y_test = train_test_split(
        X[features], y, stratify=y, test_size=0.01)
    sample = pd.DataFrame(X_test, columns=features)
    labels = pd.Series(y_test)

    n_clusters = labels.nunique()
    model = MiniBatchKMeans(n_clusters)

    visualizer_sil = SilhouetteVisualizer(model, colors='yellowbrick')
    visualizer_sil.fit(sample)
    visualizer_sil.finalize()
    return plt
def kmeans_silhouette_plots(tfidf, num_clusters=(3, 5, 7, 9, 11)):
    """Save one KMeans silhouette plot per requested cluster count.

    Vectorizer results are normalized, which makes KMeans behave as
    spherical k-means for better results. Since LSA/SVD results are
    not normalized, we have to redo the normalization.

    The best silhouette value is 1 and the worst value is -1. Values near 0
    indicate overlapping clusters. Negative values generally indicate that a
    sample has been assigned to the wrong cluster, as a different cluster is
    more similar.

    The LSA projection (50-component ``TruncatedSVD`` followed by
    ``Normalizer``) is computed once and shared by every ``k``: it uses a
    fixed ``random_state`` and does not depend on ``k``. The reported
    "Time taken" therefore covers clustering and plotting only.

    Args:
        tfidf: Document-term matrix; ``tfidf.shape[0]`` is used as the
            document count in the output file names.
        num_clusters: Iterable of cluster counts to plot.

    Returns:
        ``None``. Plots are written to
        ``images/silhouette_plots/kmeans_silh_plot_<k>_clusters_<n>_docs.png``.
    """
    print('\nUse kmeans silhouette score to visualize Silhouette Coefficients')

    # Materialise once so generators work and an empty request skips the
    # expensive projection below.
    cluster_counts = list(num_clusters)
    if not cluster_counts:
        return

    # Loop-invariant: same input and fixed seed for every k.
    svd = TruncatedSVD(n_components=50, n_iter=10, random_state=0)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    reduced = lsa.fit_transform(tfidf)
    n_docs = tfidf.shape[0]

    for k in cluster_counts:
        start = datetime.now()

        model = KMeans(n_clusters=k, init='k-means++')
        # Instantiate the clustering model and visualizer
        visualizer = SilhouetteVisualizer(model)
        filename = r'images/silhouette_plots/kmeans_silh_plot_' + str(
            k) + '_clusters_' + str(n_docs) + '_docs.png'

        try:
            # Fit the training data to the visualizer
            visualizer.fit(reduced)
            visualizer.finalize()
            plt.savefig(filename)
        finally:
            # Release the figure even when fitting or saving fails.
            plt.close()

        end = datetime.now()
        print(' ' + filename)
        print(" Time taken: {}".format(end - start))