# Imports assumed by the snippets in this section (not present in the
# original extracts): scikit-learn, yellowbrick, pandas, numpy.
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import SilhouetteVisualizer


def clustering(self, k):
    word_vectors = self.__model_p__.wv
    # random_state expects an int seed; the original passed True (== 1).
    KM_model = KMeans(n_clusters=k, max_iter=1000, random_state=1,
                      n_init=50).fit(X=word_vectors.vectors)

    # For each cluster, collect the 15 vocabulary words closest to its center.
    center_closest = []
    for i in range(k):
        center_closest.append([
            el[0] for el in word_vectors.similar_by_vector(
                KM_model.cluster_centers_[i], topn=15, restrict_vocab=None)
        ])

    metric_str = 'euclidean'
    score = silhouette_score(word_vectors.vectors,
                             KM_model.predict(word_vectors.vectors),
                             metric=metric_str)
    print("silhouette_score:", score)

    SVmodel = SilhouetteVisualizer(KM_model, is_fitted=True)
    SVmodel.fit(word_vectors.vectors)
    SVmodel.show()

    # Note: word_vectors.vocab is the gensim < 4.0 API
    # (gensim >= 4.0 uses word_vectors.key_to_index).
    words = pd.DataFrame(word_vectors.vocab.keys(), columns=['words'])
    words['vectors'] = words.words.apply(lambda x: word_vectors[x])
    words['cluster'] = words.vectors.apply(
        lambda x: KM_model.predict([np.array(x)]))
    words.cluster = words.cluster.apply(lambda x: x[0])
    # Closeness = inverse of the distance to the nearest cluster center.
    words['closeness_score'] = words.apply(
        lambda x: 1 / (KM_model.transform([x.vectors]).min()), axis=1)
    return KM_model, center_closest, score, words
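# --- Usage sketch for clustering() (an assumption, not from the source).
# The wrapper class and toy corpus below are invented; gensim < 4.0 is
# assumed throughout, since the method above uses the pre-4.0 .vocab API.
from gensim.models import Word2Vec


class _W2VWrapper:
    clustering = clustering  # attach the function above as a method

    def __init__(self, sentences):
        # __model_p__ ends with double underscores, so it is not
        # name-mangled and matches self.__model_p__ in clustering()
        self.__model_p__ = Word2Vec(sentences, size=50, min_count=1, seed=1)


toy_corpus = [["apple", "banana", "pear"], ["car", "bus", "train"]] * 50
model, closest, sil, words_df = _W2VWrapper(toy_corpus).clustering(k=2)
print(closest[0])  # words nearest the first cluster center
print(words_df.sort_values('closeness_score', ascending=False).head())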
def silhouettevisual(model, X, graph):
    visualizer = SilhouetteVisualizer(
        model,
        colors='yellowbrick',
        title="Silhouette Plot of KMeans Clustering for " + graph)
    visualizer.fit(X)
    visualizer.show()
def kmeans_exp():
    with open('features_GMM.csv', mode='r') as feature_file:
        feature_reader = csv.reader(feature_file, delimiter=',',
                                    quotechar='"', quoting=csv.QUOTE_MINIMAL)
        i = 0
        for fs in feature_reader:
            if i > 0:
                print(f"user #{i}")
                print(fs)
                fs.pop()  # drop the trailing field of the row
                df_training = pd.DataFrame()
                training_list, testing_dict, training_fs_list, validation_fs_dict = load_dataset(i)
                training_list = (training_list[0:16] + training_list[21:25]
                                 + training_list[36:40])
                for element in training_list:
                    # DataFrame.append is deprecated in recent pandas;
                    # pd.concat is the modern equivalent
                    df_training = df_training.append(element)
                # knn = KNeighborsClassifier()
                model = KMeans(n_clusters=6, random_state=42).fit(df_training)
                for signature in testing_dict['genuine']:
                    cluster = model.predict(signature)
                    print("testing signature:")
                    occurrences = Counter(cluster)
                    print(occurrences)
                visualizer = SilhouetteVisualizer(model)
                visualizer.fit(df_training)
                visualizer.show()
            i += 1
def plot_cluster_silhouette(estimator, dataset, version):
    visualizer = SilhouetteVisualizer(estimator, colors='yellowbrick')
    visualizer.fit(data.DATA[dataset][version]['x_train'])
    visualizer.show(
        f'{PLOTS_FOLDER}/{dataset}/{version}/{dataset}_{version}_'
        f'{estimator.__class__.__name__}_cluster_silhouettes_k{estimator.n_clusters}.png'
    )
    plt.clf()
def silhouette_score_plot(self, directory):
    self._check_model()
    plt.figure(figsize=(10, 10))
    visualizer = SilhouetteVisualizer(
        self.best_estimator_.named_steps['clustering'],
        colors='yellowbrick',
        is_fitted=True)
    visualizer.fit(self.data_preprocessed)
    # show() finalizes and renders the figure, so no separate finalize()
    # call is needed
    visualizer.show(directory + "/silhouette_score.png")
    plt.close()
def silhouette_plot(text, model, cv):
    '''Load a saved model pipeline and produce a silhouette score plot.'''
    path = 'models/{}'.format(model)
    pipe = load(path)
    kmeans = pipe.named_steps['kmeans']
    svd = pipe.named_steps['truncatedsvd']
    X = svd.fit_transform(cv)  # project the count-vectorized matrix
    visualizer = SilhouetteVisualizer(kmeans, colors='sns_deep')
    visualizer.fit(X)
    visualizer.show(outpath="plots/Silhouette.png")
    plt.close()
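# --- Sketch of the pipeline that silhouette_plot() expects (an assumption:
# the source only loads it via joblib). The step names 'truncatedsvd' and
# 'kmeans' match the named_steps accessed above, because make_pipeline names
# steps after the lowercased class name; n_components and n_clusters here
# are placeholder values.
import os
from joblib import dump
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans


def build_and_save_pipeline(cv, name="kmeans_svd.joblib"):
    pipe = make_pipeline(TruncatedSVD(n_components=100),
                         KMeans(n_clusters=8))
    pipe.fit(cv)
    os.makedirs('models', exist_ok=True)
    dump(pipe, 'models/{}'.format(name))
    return pipe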
def main():
    xtrain1, xtest1, ytrain1, ytest1, xtrain2, xtest2, ytrain2, ytest2 = load_data()
    km = KMeans(4)
    visualizer = SilhouetteVisualizer(km, colors='yellowbrick')
    visualizer.fit(xtrain1)
    visualizer.show()
    # cluster labels for the training split (the original calls this ytest)
    ytest = km.fit_predict(xtrain1)
    print(metrics.homogeneity_score(ytrain1, ytest))
    score(xtrain2, 20, ytrain2)
    elbowplot(xtrain2, 20, "distortion",
              "K Means Clustering Distortion vs Number of Clusters dat2",
              "figs/kmeans/kmeans_elbow_dat2.png")
    elbowplot(xtrain1, 100, "distortion",
              "K Means Clustering Distortion vs Number of Clusters dat1",
              "figs/kmeans/kmeans_elbow_dat1.png")
    elbowplot(xtrain2, 40, "silhouette",
              "K Means Clustering Silhouette Score vs Number of Clusters dat2",
              "figs/kmeans/kmeans_silhouette_dat2.png", elbow=False)
    elbowplot(xtrain1, 100, "silhouette",
              "K Means Clustering Silhouette Score vs Number of Clusters dat1",
              "figs/kmeans/kmeans_silhouette_dat1.png", elbow=False)
    elbowplot(xtrain2, 20, "calinski_harabasz",
              "K Means Clustering Calinski Harabasz Score vs Number of Clusters dat2",
              "figs/kmeans/kmeans_calinski_dat2.png", elbow=False)
    elbowplot(xtrain1, 100, "calinski_harabasz",
              "K Means Clustering Calinski Harabasz Score vs Number of Clusters dat1",
              "figs/kmeans/kmeans_calinski_dat1.png", elbow=False)
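# --- Hypothetical sketch of the elbowplot() helper called in main() above;
# its definition is not in the source. Parameter names mirror the call
# sites, and the body assumes yellowbrick's KElbowVisualizer, whose metric
# and locate_elbow options match the metrics and elbow= flags used above.
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer


def elbowplot(X, max_k, metric, title, outpath, elbow=True):
    viz = KElbowVisualizer(KMeans(), k=(2, max_k), metric=metric,
                           locate_elbow=elbow, title=title)
    viz.fit(X)
    viz.show(outpath=outpath)
    plt.clf()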
def kMeans():
    twitterX, twitterY, twitter_dataset, scaled_features = preprocess()

    gm = GaussianMixture(covariance_type='tied', n_components=18, n_init=10)
    gm.fit(scaled_features)
    print("GM Converged", gm.converged_)
    print("GM Convergence Iterations", gm.n_iter_)
    print("GM weights", gm.weights_)
    gm.predict(scaled_features)
    gm.predict_proba(scaled_features)
    gm.score_samples(scaled_features)

    # sweep the number of mixture components to compare AIC/BIC
    aic = []
    bic = []
    for i in range(1, 11):
        gm = GaussianMixture(covariance_type='spherical', n_components=i,
                             n_init=10)
        gm.fit(scaled_features)
        aic.append(gm.aic(scaled_features))
        bic.append(gm.bic(scaled_features))
    plt.plot(aic, label="AIC")
    plt.plot(bic, label="BIC")
    plt.xlabel("Number of Clusters")
    plt.ylabel("Information Criterion")
    plt.legend()
    plt.show()

    twitter_trainingX, twitter_testingX, twitter_trainingY, twitter_testingY = \
        train_test_split(twitterX, twitterY)

    # citation: https://stackoverflow.com/questions/36566844/pca-projection-and-reconstruction-in-scikit-learn
    # reconstruction error versus the number of components (the variable is
    # named pca, but FastICA is what is fit here)
    error = []
    for i in range(1, 8):
        pca = FastICA(n_components=i)
        pca.fit(twitter_trainingX)
        U, S, VT = np.linalg.svd(twitter_trainingX - twitter_trainingX.mean(0))
        x_train_pca = pca.transform(twitter_trainingX)
        x_train_pca2 = (twitter_trainingX - pca.mean_).dot(pca.components_.T)
        x_projected = pca.inverse_transform(x_train_pca)
        x_projected2 = x_train_pca.dot(pca.components_) + pca.mean_
        loss = ((twitter_trainingX - x_projected) ** 2).mean()
        error.append(loss)
    plt.clf()
    plt.figure(figsize=(15, 15))
    plt.title("reconstruction error")
    plt.plot(error, 'r')
    plt.xticks(range(len(error)), range(1, 8), rotation='vertical')
    plt.xlim([-1, len(error)])
    plt.show()

    clf = MLPClassifier(alpha=0.001, hidden_layer_sizes=(8,), random_state=1,
                        solver='lbfgs')
    clf.fit(twitter_trainingX, twitter_trainingY)
    y_pred = clf.predict(twitter_testingX)
    print("Accuracy Score Normal", accuracy_score(twitter_testingY, y_pred))

    kmeans = KMeans(init="random", n_clusters=3, n_init=10, max_iter=300,
                    random_state=42)
    kmeans.fit(scaled_features)
    labels = kmeans.fit_predict(twitter_testingX)
    print("Accuracy Score K-Means", accuracy_score(twitter_testingY, labels))

    # cumulative explained variance for an increasing number of PCA
    # components (n_components must be >= 1, so the loop starts at 1)
    for i in range(1, 9):
        pca = PCA(n_components=i)
        pca.fit(scaled_features)
        cumsum = np.cumsum(pca.explained_variance_ratio_)
        plt.plot(cumsum, label="Explained Variance Ratio")
    plt.xlabel("Number of Dimensions")
    plt.ylabel("Explained Variance Ratio")
    plt.legend()
    plt.show()

    # incremental PCA
    num_batches = 100
    inc_pca = IncrementalPCA(n_components=5)
    for X_batch in np.array_split(scaled_features, num_batches):
        inc_pca.partial_fit(X_batch)
    X_reduced_inc = inc_pca.transform(scaled_features)

    # randomized projections
    rnd_pca = PCA(n_components=5, svd_solver="randomized")
    X_reduced_rand = rnd_pca.fit_transform(scaled_features)

    # citation: https://scikit-learn.org/stable/auto_examples/feature_selection/plot_feature_selection.html#sphx-glr-auto-examples-feature-selection-plot-feature-selection-py
    # k best
    scaler = MinMaxScaler()
    digits_indices = np.arange(twitterX.shape[-1])
    scaled_features_norm = scaler.fit_transform(scaled_features)
    k_selected = SelectKBest(f_classif, k=8)
    k_selected.fit(scaled_features_norm, twitterY)
    scores = -np.log10(k_selected.pvalues_)
    plt.bar(digits_indices - .45, scores, width=.2,
            label=r'Univariate score ($-Log(p_{value})$)')
    plt.xlabel("Features")
    plt.ylabel("F-Score")
    plt.show()

    kmeans = KMeans(init="random", n_clusters=5, n_init=10, max_iter=300,
                    random_state=42)
    kmeans.fit(scaled_features)
    labels = kmeans.fit_predict(twitter_dataset)
    # the lowest SSE value
    print("KMeans Inertia", kmeans.inertia_)
    # final locations of the centroids
    print("KMeans Cluster Centers", kmeans.cluster_centers_)
    # number of iterations required to converge
    print("KMeans Iterations Required To Converge", kmeans.n_iter_)
    # labels
    print("KMeans Labels", kmeans.labels_[:5])

    kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 300,
        "random_state": 42,
    }
    sse = []
    for k in range(1, 18):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)

    model = KMeans(n_clusters=9)
    elbow_visualizer = KElbowVisualizer(model, k=(2, 18))
    elbow_visualizer.fit(twitterX)
    elbow_visualizer.show()
    silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    silhouette_visualizer.fit(twitterX)
    silhouette_visualizer.show()
    ic_visualizer = InterclusterDistance(model)
    ic_visualizer.fit(twitterX)
    ic_visualizer.show()

    # left incomplete in the source: the column selection and scatter call
    # were never filled in
    # X = twitter_dataset[:, []]
    # plt.scatter()
# Fit a KMeans model for each candidate k (the enclosing loop over k_ranges
# is implied by the references below)
for k in k_ranges:
    model = KMeans(n_clusters=k, random_state=1)  # choose the model
    model.fit(X)  # train the model
    kmeans_models.append(model)
    inertias.append(model.inertia_)
    silhouettes.append(silhouette_score(X, model.labels_))

print('inertias:', inertias)
print('silhouettes:', silhouettes)

# k vs. silhouette-score plot
plt.plot(k_ranges[1:], silhouettes[1:], marker='o')
plt.xlabel('k')
plt.ylabel('silhouette score')
plt.show()

# silhouette score = (b - a) / max(a, b)
# a: mean distance to the other samples in the same cluster
# b: mean distance to the samples in the nearest neighboring cluster
# -1 <= ss <= 1
# The cluster count of the model with the highest silhouette score is a
# good choice for the number of clusters.
# ss = 1: samples are tightly grouped within their own cluster and far
#         from the other clusters
# ss = 0: samples sit on the boundary between clusters
# ss = -1: samples have been assigned to the wrong cluster

# silhouette diagram
# pip install yellowbrick
for model in kmeans_models[1:]:  # for each trained KMeans model
    visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    visualizer.fit(X)
    visualizer.show()
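# --- Worked example of the silhouette formula described above (an added
# illustration, not part of the original script): compute one sample's
# silhouette by hand and check it against sklearn's silhouette_samples.
import numpy as np
from sklearn.metrics import silhouette_samples

pts = np.array([[0.0], [0.2], [10.0], [10.2]])
labels_toy = np.array([0, 0, 1, 1])

# For the first point: a = mean intra-cluster distance,
#                      b = mean distance to the nearest other cluster
a = np.mean([abs(0.0 - 0.2)])                    # only other member of cluster 0
b = np.mean([abs(0.0 - 10.0), abs(0.0 - 10.2)])  # members of cluster 1
ss_manual = (b - a) / max(a, b)                  # (10.1 - 0.2) / 10.1 ~ 0.98

ss_sklearn = silhouette_samples(pts, labels_toy)[0]
assert abs(ss_manual - ss_sklearn) < 1e-6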
def explore_DBSCAN_clustering(
    df,
    num_cols=None,
    metric="euclidean",
    eps=[0.5],
    min_samples=[5],
    include_silhouette=True,
    include_PCA=True,
    random_state=None,
):
    """Fit and plot DBSCAN clustering algorithms.

    Parameters
    ----------
    df : pandas.DataFrame
        the dataset, should be transformed with StandardScaler
    num_cols : list, optional
        list of numeric column names; in case of None, get all numeric columns
    metric : str, optional
        distance metric, by default "euclidean"
    eps : list, optional
        list of eps hyperparameters, by default [0.5]
    min_samples : list, optional
        list of min_samples hyperparameters, by default [5]
    include_silhouette : bool, optional
        whether Silhouette plots should be generated, by default True
    include_PCA : bool, optional
        whether PCA plots should be generated, by default True
    random_state : int, optional
        a number that determines random number generation for centroid
        initialization, by default None

    Returns
    -------
    tuple
        list
            a list of n_clusters values returned by DBSCAN models
        dict
            a dictionary with key=type of plot, value=list of plots

    Examples
    --------
    >>> original_df = pd.read_csv("/data/menu.csv")
    >>> numeric_features = eda.get_numeric_columns(original_df)
    >>> numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
    >>> preprocessor = make_column_transformer(
    >>>     (numeric_transformer, numeric_features)
    >>> )
    >>> df = pd.DataFrame(
    >>>     data=preprocessor.fit_transform(original_df), columns=numeric_features
    >>> )
    >>> n_clusters, dbscan_plots = explore_DBSCAN_clustering(df)
    """
    if num_cols is None:
        num_cols = get_numeric_columns(df)
    else:
        _verify_numeric_cols(df, num_cols)
    x = df[num_cols]
    results = {}
    n_clusters = []
    s_plots = []
    pca_plots = []
    print("------------------------")
    print("DBSCAN CLUSTERING")
    print("------------------------")
    for e in eps:
        for ms in min_samples:
            dbscan = DBSCAN(eps=e, min_samples=ms, metric=metric)
            dbscan.fit(x)
            # count clusters, excluding the -1 noise label if present
            k = len(set(dbscan.labels_)) - (1 if -1 in dbscan.labels_ else 0)
            n_clusters.append(k)
            print(f"eps={e}, min_samples={ms}, n_clusters={k}")
            if include_silhouette and k > 0:
                # generate Silhouette plot; DBSCAN has no predict(), so
                # patch the attributes SilhouetteVisualizer expects
                dbscan.n_clusters = k
                dbscan.predict = lambda x: dbscan.labels_
                fig, ax = plt.subplots()
                s_visualizer = SilhouetteVisualizer(dbscan,
                                                    colors="yellowbrick",
                                                    ax=ax)
                s_visualizer.fit(x)
                s_visualizer.show()
                s_plots.append(fig)
                plt.close()
            else:
                s_plots.append(None)
            if include_PCA:
                # generate PCA plot
                pca_plot = plot_pca_clusters(x, dbscan.labels_,
                                             random_state=random_state)
                pca_plots.append(pca_plot)
            else:
                pca_plots.append(None)
    results["Silhouette"] = s_plots
    results["PCA"] = pca_plots
    return n_clusters, results
def explore_KMeans_clustering(
    df,
    num_cols=None,
    n_clusters=range(3, 5),
    include_silhouette=True,
    include_PCA=True,
    random_state=None,
):
    """Create, fit, and plot KMeans clustering on the dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        the dataset, should be transformed with StandardScaler
    num_cols : list, optional
        list of numeric column names; in case of None, get all numeric columns
    n_clusters : list, optional
        list of n_clusters hyperparameters, by default range(3, 5)
    include_silhouette : bool, optional
        whether Silhouette plots should be generated, by default True
    include_PCA : bool, optional
        whether PCA plots should be generated, by default True
    random_state : int, optional
        a number that determines random number generation for centroid
        initialization, by default None

    Returns
    -------
    dict
        a dictionary with key=type of plot, value=list of plots

    Examples
    --------
    >>> original_df = pd.read_csv("/data/menu.csv")
    >>> numeric_features = eda.get_numeric_columns(original_df)
    >>> numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
    >>> preprocessor = make_column_transformer(
    >>>     (numeric_transformer, numeric_features)
    >>> )
    >>> df = pd.DataFrame(
    >>>     data=preprocessor.fit_transform(original_df), columns=numeric_features
    >>> )
    >>> explore_KMeans_clustering(df)
    """
    if num_cols is None:
        num_cols = get_numeric_columns(df)
    else:
        _verify_numeric_cols(df, num_cols)
    x = df[num_cols]
    results = {}
    if 1 in n_clusters:
        raise ValueError("n_clusters cannot be 1")
    print("------------------------")
    print("K-MEANS CLUSTERING")
    print("------------------------")
    if len(n_clusters) > 1:
        print("Generating KElbow plot for KMeans.")
        # visualize using KElbowVisualizer
        kmeans = KMeans(random_state=random_state)
        plt.clf()
        fig, ax = plt.subplots()
        elbow_visualizer = KElbowVisualizer(kmeans, k=n_clusters, ax=ax)
        elbow_visualizer.fit(x)  # fit the data to the visualizer
        elbow_visualizer.show()
        plt.close()
        elbow_visualizer.k = elbow_visualizer.elbow_value_  # fix printing issue
        results["KElbow"] = fig
    else:
        results["KElbow"] = None

    # visualize using SilhouetteVisualizer
    print("Generating Silhouette & PCA plots")
    silhouette_plots = []
    pca_plots = []
    for k in n_clusters:
        print(f"Number of clusters: {k}")
        kmeans = KMeans(k, random_state=random_state)
        if include_silhouette:
            fig, ax = plt.subplots()
            s_visualizer = SilhouetteVisualizer(kmeans, colors="yellowbrick",
                                                ax=ax)
            s_visualizer.fit(x)  # fit the data to the visualizer
            s_visualizer.show()
            silhouette_plots.append(fig)
            plt.close()
        else:
            silhouette_plots.append(None)
        # PCA plots
        if include_PCA:
            labels = kmeans.fit_predict(x)
            pca_fig = plot_pca_clusters(x, labels, random_state=random_state)
            pca_plots.append(pca_fig)
        else:
            pca_plots.append(None)
    results["Silhouette"] = silhouette_plots
    results["PCA"] = pca_plots
    return results
X = Data

# elbow curve: inertia (sum of squared distances) for k = 2..9
distortions = []
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    distortions.append(kmeans.inertia_)

k = range(2, 10)
fig = plt.figure(figsize=(15, 5))
plt.plot(k, distortions)
plt.grid(True)
plt.title('Elbow curve')

# silhouette method
from sklearn.metrics import silhouette_samples, silhouette_score

for k in range(2, 10):
    kmeans = KMeans(n_clusters=k)
    y_pred = kmeans.fit_predict(Data)
    score = silhouette_score(Data, y_pred)
    print("For k = {}, silhouette score is {}".format(k, score))

from yellowbrick.cluster import SilhouetteVisualizer

# Instantiate the clustering model and visualizer
for k in range(2, 10):
    model = KMeans(k, random_state=42)
    plt.subplot(221)
    visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    visualizer.fit(Data)  # Fit the data to the visualizer
    visualizer.show()     # Finalize and render the figure
    plt.subplot(222)
    # y_pred still holds the labels from the last silhouette loop above
    plt.scatter(Data[:, 0], Data[:, 1], c=y_pred, cmap='rainbow')
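# --- Alternative sketch: the same k = 2..9 elbow scan, expressed with
# yellowbrick's KElbowVisualizer (an addition; assumes the same Data array
# as above). elbow_value_ reports the suggested k.
from yellowbrick.cluster import KElbowVisualizer

elbow = KElbowVisualizer(KMeans(random_state=42), k=(2, 10),
                         metric='distortion')
elbow.fit(Data)
elbow.show()
print("suggested k:", elbow.elbow_value_)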
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.savefig("Gap Values.png")
plt.show()

# =============================================================================
# =============================================================================
# Using the silhouette to find the optimal number of clusters
for n_clusters in range(4, 10):
    model = KMeans(n_clusters, init='k-means++')
    cluster_labels = model.fit_predict(X)
    visualizer = SilhouetteVisualizer(model)
    visualizer.fit(X)  # Fit the training data to the visualizer
    # show() draws and renders the figure (poof() is its deprecated alias)
    visualizer.show(outpath="BoW_Silhouette %d.png" % n_clusters)
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

# =============================================================================
# =============================================================================
# Clustering Using K-Means
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)

# reduce the features to 2D
reduced_features = pca.fit_transform(X)
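# --- Sketch of the gap-statistic computation behind the "Gap Values" plot
# at the top of the snippet above (the source omits it; this helper is
# hypothetical and simplified, with no standard-error term). For squared
# Euclidean distances, KMeans inertia equals Tibshirani's within-cluster
# dispersion W_k, and Gap(k) = E*[log(W_k)] under a uniform reference
# minus log(W_k) on the data.
import numpy as np
from sklearn.cluster import KMeans


def gap_statistic(X, k, n_refs=10, seed=0):
    rng = np.random.default_rng(seed)
    log_wk = np.log(KMeans(n_clusters=k, random_state=seed).fit(X).inertia_)
    ref_log_wk = []
    for _ in range(n_refs):
        # uniform reference sample over the bounding box of the data
        ref = rng.uniform(X.min(axis=0), X.max(axis=0), size=X.shape)
        ref_log_wk.append(
            np.log(KMeans(n_clusters=k, random_state=seed).fit(ref).inertia_))
    return np.mean(ref_log_wk) - log_wk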
score_lst.append(silhouette_score(Data_f_scaled, DBSCAN_1, metric=metric_str))
score_lst.append(silhouette_score(Data_f_scaled, DBSCAN_2, metric=metric_str))
score_lst.append(
    silhouette_score(Data_f_scaled, SpectralClustering_0, metric=metric_str))
score_lst.append(
    silhouette_score(Data_f_scaled, SpectralClustering_1, metric=metric_str))
score_lst.append(
    silhouette_score(Data_f_scaled, SpectralClustering_2, metric=metric_str))
print(score_lst)

# silhouette plots for each candidate k
for i in n_clust_vec:
    fig = plt.figure(figsize=(20, 40))
    model = SilhouetteVisualizer(KMeans(i, random_state=0))
    model.fit(Data_reduced_scaled)
    model.show()
    fig.savefig(path_join(directory_img, "k_m_silhouettes_{}.png".format(i)))
    plt.close()

# 2D and 3D t-SNE embeddings for visualization
Data_TSNE_2 = TSNE(n_components=2).fit_transform(Data_reduced_scaled)
Data_TSNE_3 = TSNE(n_components=3).fit_transform(Data_reduced_scaled)

fig = plt.figure(figsize=(20, 40))
Data_Video = Data_f[Data_f['Type_Video'] == 1]
ax = fig.add_subplot(2, 1, 1)
ax.scatter(Data_TSNE_3[:, 0], Data_TSNE_3[:, 1],
           c=k_means_results[0], cmap='viridis', marker='o', s=30)
# Fragment: the opening of the KMeans constructor is cut off in the source;
# n_clusters is assumed from its use below (the surrounding loop over
# n_clusters values is also implied by numClustersStart/numClustersEnd).
km = KMeans(n_clusters=n_clusters, n_init=10, max_iter=maxIter, tol=tol)
visualizer = SilhouetteVisualizer(km, colors="yellowbrick")
visualizer.fit(np.array(data))  # fits km as a side effect
km.fit(data)
distortions.append(km.inertia_)
cluster_labels = km.predict(data)
silhouette_avg = silhouette_score(data, cluster_labels)

opath = str(1) + "-1d"
path = os.path.join("./", opath)
if not os.path.exists(path):
    os.mkdir(path)
startEndPath = str(numClustersStart) + "-" + str(numClustersEnd)
visualizer.show(outpath="./" + opath + "/" + str(n_clusters) + ".png")  # TODO
plt.cla()
plt.clf()
plt.close("all")

print(
    "For n_clusters =",
    n_clusters,
    "The average silhouette_score is :",
    silhouette_avg,
)
print("For n_clusters =", n_clusters, "The distortion is :", km.inertia_)

dictionary = {}
print(n_clusters, "clusters:")
for i in range(n_clusters):
    dictionary[i] = []
for i in range(len(cluster_labels)):
def k_means_e_metodo_do_cotovelo(nome_arq_saida_todos_momentos,
                                 k_array,
                                 metodos_do_cotovelo,
                                 is_plotar=True,
                                 is_plotar_momentos_3d=False):
    # (k-means and the elbow method)
    df_momentos_familias = pd.read_csv(nome_arq_saida_todos_momentos, sep=",")
    df_momentos_familias = df_momentos_familias.apply(pd.to_numeric,
                                                      errors='coerce')
    df_momentos_familias = df_momentos_familias.dropna()
    df_momentos_familias = df_momentos_familias.reset_index(drop=True)
    np_elem_norm = df_momentos_familias.to_numpy()

    # "distorcao_km_inertia" method: sum of squared distances
    soma_das_dist_ao_quadrado = []
    arr_kmeans = []
    melhor_k = None  # only set by the "distorcao_yellowbrick" branch below
    for cada_k in k_array:
        k_means_model = analisador_k_means(df_momentos_familias, np_elem_norm,
                                           cada_k, is_plotar,
                                           is_plotar_momentos_3d)
        arr_kmeans.append(k_means_model)
        soma_das_dist_ao_quadrado.append(k_means_model.inertia_)
        if "silhueta_yellowbrick" in metodos_do_cotovelo:
            visualizer = SilhouetteVisualizer(k_means_model,
                                              colors='yellowbrick')
            visualizer.fit(np_elem_norm)  # Fit the data to the visualizer
            visualizer.fig.savefig(
                "./silhueta_yellowbrick__k_{}.png".format(cada_k))
            if is_plotar:
                visualizer.show()
        plt.close('all')

    for metodo_do_cotovelo in metodos_do_cotovelo:
        if metodo_do_cotovelo == "distorcao_km_inertia":
            plt.plot(k_array, soma_das_dist_ao_quadrado, 'bx-')
            plt.xlabel('k')
            plt.ylabel('Distortion')
            plt.title('Elbow method for finding the best k')
            plt.savefig("./{}.png".format(metodo_do_cotovelo))
            if is_plotar:
                plt.show()
        elif metodo_do_cotovelo == "distorcao_yellowbrick":
            kmeans = KMeans(random_state=0)
            visualizer = KElbowVisualizer(kmeans, k=k_array,
                                          metric='distortion')
            visualizer.fit(np_elem_norm)  # Fit the data to the visualizer
            plt.savefig("./{}.png".format(metodo_do_cotovelo))
            melhor_k = visualizer.elbow_value_
            if is_plotar:
                visualizer.show()
        elif metodo_do_cotovelo == "calinski_harabasz_yellowbrick":
            kmeans = KMeans(random_state=0)
            visualizer = KElbowVisualizer(kmeans, k=k_array,
                                          metric='calinski_harabasz')
            visualizer.fit(np_elem_norm)  # Fit the data to the visualizer
            plt.savefig("./{}.png".format(metodo_do_cotovelo))
            if is_plotar:
                visualizer.show()
        plt.close('all')

    return arr_kmeans, melhor_k
def silhouette(X):
    for i in range(2, 4):
        model = KMeans(i, random_state=42)
        visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
        visualizer.fit(X)  # Fit the data to the visualizer
        visualizer.show()  # Finalize and render the figure
def kMeans():
    # citation: https://realpython.com/k-means-clustering-python/
    digits = load_digits()
    # features: all but the last pixel column; the last pixel column is used
    # as the "label" here (note: digits.target is the usual label)
    digits_features = digits.data[:, 0:-1]
    label = digits.data[:, -1]
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(digits_features)

    # citation: hands on machine learning
    gm = GaussianMixture(covariance_type='spherical', n_components=8,
                         n_init=10)
    gm.fit(scaled_features)
    print("GM Converged", gm.converged_)
    print("GM Convergence Iterations", gm.n_iter_)
    print("GM weights", gm.weights_)
    gm.predict(scaled_features)
    gm.predict_proba(scaled_features)
    gm.score_samples(scaled_features)

    # sweep the number of mixture components to compare AIC/BIC
    aic = []
    bic = []
    for i in range(1, 21):
        gm = GaussianMixture(covariance_type='spherical', n_components=i,
                             n_init=10)
        gm.fit(scaled_features)
        aic.append(gm.aic(scaled_features))
        bic.append(gm.bic(scaled_features))
    plt.plot(aic, label="AIC")
    plt.plot(bic, label="BIC")
    plt.xlabel("Number of Clusters")
    plt.ylabel("Information Criterion")
    plt.legend()
    plt.show()

    # x_centered = digits_features - digits_features.mean(axis=0)
    # U, s, Vt = np.linalg.svd(x_centered)
    # c1 = Vt.T[:, 0]
    # c2 = Vt.T[:, 1]
    # W2 = Vt.T[:, :2]
    # X2D = x_centered.dot(W2)
    # pca = PCA()
    # pca.fit(scaled_features)
    # cumsum = np.cumsum(pca.explained_variance_ratio_)
    # d = np.argmax(cumsum >= 0.95) + 1
    # pca = PCA(n_components=0.95)
    # X_reduced = pca.fit_transform(scaled_features)

    # cumulative explained variance for an increasing number of PCA
    # components (n_components must be >= 1, so the loop starts at 1)
    explained_variance = []
    for i in range(1, 63):
        pca = PCA(n_components=i)
        pca.fit(scaled_features)
        cumsum = np.cumsum(pca.explained_variance_ratio_)
        plt.plot(cumsum, label="Explained Variance Ratio")
    plt.xlabel("Number of Dimensions")
    plt.ylabel("Explained Variance Ratio")
    plt.legend()
    plt.show()

    digits_trainingX, digits_testingX, digits_trainingY, digits_testingY = \
        train_test_split(digits_features, label)

    # reconstruction error versus the number of PCA components
    # citation: https://stackoverflow.com/questions/36566844/pca-projection-and-reconstruction-in-scikit-learn
    error = []
    for i in range(1, 50):
        pca = PCA(n_components=i)
        pca.fit(digits_trainingX)
        U, S, VT = np.linalg.svd(digits_trainingX - digits_trainingX.mean(0))
        x_train_pca = pca.transform(digits_trainingX)
        x_train_pca2 = (digits_trainingX - pca.mean_).dot(pca.components_.T)
        x_projected = pca.inverse_transform(x_train_pca)
        x_projected2 = x_train_pca.dot(pca.components_) + pca.mean_
        loss = ((digits_trainingX - x_projected) ** 2).mean()
        error.append(loss)
    plt.clf()
    plt.figure(figsize=(15, 15))
    plt.title("reconstruction error")
    plt.plot(error, 'r')
    plt.xticks(range(len(error)), range(1, 50), rotation='vertical')
    plt.xlim([-1, len(error)])
    plt.show()

    clf = MLPClassifier(alpha=0.001, hidden_layer_sizes=(8,), random_state=1,
                        solver='lbfgs')
    clf.fit(digits_trainingX, digits_trainingY)
    y_pred = clf.predict(digits_testingX)
    print("Accuracy Score Normal", accuracy_score(digits_testingY, y_pred))

    # NN accuracy on k-means cluster-distance features, for a range of k
    k_acc = []
    k_gm = []
    time_arr = []
    for k in range(1, 15):
        kmeans = KMeans(n_clusters=k)
        X_train = kmeans.fit_transform(digits_trainingX)
        X_test = kmeans.transform(digits_testingX)
        start_time = time.time()
        clf = MLPClassifier(alpha=0.001, hidden_layer_sizes=(8,),
                            random_state=1, solver='lbfgs')
        clf.fit(X_train, digits_trainingY)
        total_time = time.time() - start_time
        y_pred = clf.predict(X_test)
        score = accuracy_score(digits_testingY, y_pred)
        k_acc.append(score)
        time_arr.append(total_time)
    plt.plot(k_acc, label="K-Means")
    plt.plot(time_arr, label="Computation Time")
    plt.xlabel("k # of clusters")
    plt.ylabel("NN Accuracy")
    plt.legend()
    plt.show()

    # NN accuracy after PCA / ICA / random-projection dimensionality reduction
    acc = []
    acc_ica = []
    acc_rca = []
    for i in range(1, 40):
        pca = PCA(n_components=i)
        X_train = pca.fit_transform(digits_trainingX)
        X_test = pca.transform(digits_testingX)
        clf = MLPClassifier(alpha=0.001, hidden_layer_sizes=(8,),
                            random_state=1, solver='lbfgs')
        clf.fit(X_train, digits_trainingY)
        y_pred = clf.predict(X_test)
        score = accuracy_score(digits_testingY, y_pred)
        acc.append(score)

        ica = FastICA(n_components=i)
        x_train_i = ica.fit_transform(digits_trainingX)
        x_test_i = ica.transform(digits_testingX)
        clf.fit(x_train_i, digits_trainingY)
        y_pred_i = clf.predict(x_test_i)
        score_i = accuracy_score(digits_testingY, y_pred_i)
        acc_ica.append(score_i)

        rca = GaussianRandomProjection(n_components=i)
        x_train_r = rca.fit_transform(digits_trainingX)
        x_test_r = rca.transform(digits_testingX)
        clf.fit(x_train_r, digits_trainingY)
        y_pred_r = clf.predict(x_test_r)
        score_r = accuracy_score(digits_testingY, y_pred_r)
        acc_rca.append(score_r)
    plt.plot(acc, label="PCA")
    plt.plot(acc_ica, label="ICA")
    plt.plot(acc_rca, label="RCA")
    plt.xlabel("Components")
    plt.ylabel("NN Accuracy")
    plt.legend()
    plt.show()

    # cumsum = np.cumsum(pca.explained_variance_ratio_)
    # d = np.argmax(cumsum >= 0.95) + 1

    # randomized projections
    rnd_pca = PCA(n_components=50, svd_solver="randomized")
    X_reduced_rand = rnd_pca.fit_transform(scaled_features)

    # citation: https://scikit-learn.org/stable/auto_examples/feature_selection/plot_feature_selection.html#sphx-glr-auto-examples-feature-selection-plot-feature-selection-py
    # k best
    scaler = MinMaxScaler()
    digits_indices = np.arange(digits_features.shape[-1])
    scaled_features_norm = scaler.fit_transform(scaled_features)
    k_selected = SelectKBest(f_classif, k=50)
    k_selected.fit(scaled_features_norm, label)
    scores = -np.log10(k_selected.pvalues_)
    plt.bar(digits_indices - .45, scores, width=.2,
            label=r'Univariate score ($-Log(p_{value})$)')
    plt.xlabel("Features")
    plt.ylabel("F-Score")
    plt.show()

    # X_reduced_inc is used below but never defined in the original snippet;
    # assuming an IncrementalPCA reduction like the one in the Twitter
    # variant of this function
    inc_pca = IncrementalPCA(n_components=5)
    X_reduced_inc = inc_pca.fit_transform(scaled_features)

    gm = GaussianMixture(covariance_type='spherical', n_components=8,
                         n_init=10)
    gm.fit(X_reduced_inc)
    print("GM Converged - PCA Inc", gm.converged_)
    print("GM Convergence Iterations", gm.n_iter_)
    print("GM weights", gm.weights_)
    gm.predict(X_reduced_inc)
    gm.predict_proba(X_reduced_inc)
    gm.score_samples(X_reduced_inc)

    kmeans = KMeans(init="random", n_clusters=63, n_init=10, max_iter=300,
                    random_state=42)
    kmeans.fit(scaled_features)
    # the lowest SSE value
    print("KMeans Inertia", kmeans.inertia_)
    # final locations of the centroids
    print("KMeans Cluster Centers", kmeans.cluster_centers_)
    # number of iterations required to converge
    print("KMeans Iterations Required To Converge", kmeans.n_iter_)
    # labels
    print("KMeans Labels", kmeans.labels_[:5])

    kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 300,
        "random_state": 42,
    }
    sse = []
    for k in range(1, 63):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)
    kl = KneeLocator(range(1, 63), sse, curve="convex", direction="decreasing")
    # optimal k (number of clusters) for this dataset
    print("Elbow", kl.elbow)

    clf = MLPClassifier(alpha=0.001, hidden_layer_sizes=(8,), random_state=1,
                        solver='lbfgs')
    clf.fit(digits_trainingX, digits_trainingY)
    y_pred = clf.predict(digits_testingX)
    model = KMeans(n_clusters=5)
    kmeans.fit(scaled_features)
    labels = kmeans.fit_predict(digits_testingX)
    print("Accuracy Score Normal", accuracy_score(digits_testingY, y_pred))
    print("Accuracy Score K-Means", accuracy_score(digits_testingY, labels))

    elbow_visualizer = KElbowVisualizer(model, k=(2, 63))
    elbow_visualizer.fit(digits_features)
    elbow_visualizer.show()
    silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    silhouette_visualizer.fit(digits_features)
    silhouette_visualizer.show()
    ic_visualizer = InterclusterDistance(model)
    ic_visualizer.fit(digits_features)
    ic_visualizer.show()

    # gmm = GaussianMixture(n_components=7).fit(digits_features)
    # labels = gmm.predict(digits_features)
    # plt.scatter(digits_features[:, 0], digits_features[:, 1], c=labels, s=40, cmap='viridis')
    # plt.show()
    # digits_features_pd = pd.DataFrame(data=digits_features[1:, 1:],
    #                                   index=digits_features[1:, 0],
    #                                   columns=digits_features[0, 1:])
    # pd.plotting.scatter_matrix(digits_features_pd)
    # probs = GaussianMixture.predict_proba(digits_features)
    # print(probs[:5].round(3))

    kmeans = KMeans(init="random", n_clusters=18, n_init=10, max_iter=300,
                    random_state=42)
    kmeans.fit(X_reduced_inc)
    # the lowest SSE value
    print("KMeans Inertia", kmeans.inertia_)
    # final locations of the centroids
    print("KMeans Cluster Centers", kmeans.cluster_centers_)
    # number of iterations required to converge
    print("KMeans Iterations Required To Converge", kmeans.n_iter_)
    # labels
    print("KMeans Labels", kmeans.labels_[:5])

    kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 300,
        "random_state": 42,
    }
    sse = []
    for k in range(1, 18):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)
    kl = KneeLocator(range(1, 18), sse, curve="convex", direction="decreasing")
    # optimal k (number of clusters) for this dataset
    print("Elbow", kl.elbow)

    model = KMeans()
    elbow_visualizer = KElbowVisualizer(model, k=(2, 18))
    elbow_visualizer.fit(X_reduced_inc)
    elbow_visualizer.show()
    silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    silhouette_visualizer.fit(X_reduced_inc)
    silhouette_visualizer.show()
    ic_visualizer = InterclusterDistance(model)
    ic_visualizer.fit(X_reduced_inc)
    ic_visualizer.show()
score1_confinement = [word_vectors.similarity('confinement', el[0])
                      for el in g1]
"""
g2 = word_vectors.similar_by_vector(KM_model2.cluster_centers_[2],
                                    topn=15, restrict_vocab=None)
"""

metric_str = 'euclidean'
score = silhouette_score(word_vectors.vectors,
                         KM_model2.predict(word_vectors.vectors),
                         metric=metric_str)
print("silhouette_score:", score)

SVmodel = SilhouetteVisualizer(KM_model2, is_fitted=True)
SVmodel.fit(word_vectors.vectors)
SVmodel.show()

words = pd.DataFrame(word_vectors.vocab.keys(), columns=['words'])
words['vectors'] = words.words.apply(lambda x: word_vectors[x])
words['cluster'] = words.vectors.apply(
    lambda x: KM_model2.predict([np.array(x)]))
words.cluster = words.cluster.apply(lambda x: x[0])
# cluster 0 is treated as the positive cluster, everything else as negative
words['cluster_value'] = [1 if i == 0 else -1 for i in words.cluster]
# closeness = inverse of the distance to the nearest cluster center
words['closeness_score'] = words.apply(
    lambda x: 1 / (KM_model2.transform([x.vectors]).min()), axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

clus_time = time.time()
print("clustering time: %s seconds " % (clus_time - w2v_time))
def visualize(self):
    model = KMeans(self.k)
    visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    visualizer.fit(self.data)
    visualizer.show()
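# --- Usage sketch (an assumption: the host class is not shown in the
# source). This minimal Clusterer is invented purely to supply the self.k
# and self.data attributes that visualize() relies on.
import numpy as np


class Clusterer:
    visualize = visualize  # attach the method defined above

    def __init__(self, data, k):
        self.data = data
        self.k = k


rng = np.random.default_rng(0)
# two well-separated Gaussian blobs in 2D
blobs = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(6, 1, (50, 2))])
Clusterer(blobs, k=2).visualize()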