# Shared imports for the snippets below. The snippets are collected from
# separate files, so common aliases are declared once here; repo-local helpers
# (common_utils, nn_experiment, experiments, CustomGMM, gs_with_best_estimator)
# are assumed to come from their own projects.
import timeit
from collections import defaultdict
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.metrics import (adjusted_mutual_info_score, completeness_score,
                             f1_score, homogeneity_score,
                             normalized_mutual_info_score, silhouette_samples)
from sklearn.metrics import silhouette_score as sil_score
from sklearn.metrics import silhouette_samples as sil_samples

EM = GaussianMixture             # alias used by the run_EM/em_experiment snippets
kmeans = KMeans                  # aliases used by the class-based snippets
GMM = GaussianMixture
ami = adjusted_mutual_info_score
# from time import clock        # used by the class-based snippets; removed in
#                               # Python 3.8 -- use time.perf_counter() there


def run_kmeans(X, y, title):
    kclusters = list(np.arange(2, 50, 2))
    sil_scores = []
    f1_scores = []
    homo_scores = []
    train_times = []

    for k in kclusters:
        start_time = timeit.default_timer()
        # n_jobs was removed from KMeans in scikit-learn 0.25; drop it there
        km = KMeans(n_clusters=k, n_init=10, random_state=100, n_jobs=-1).fit(X)
        end_time = timeit.default_timer()
        train_times.append(end_time - start_time)
        sil_scores.append(sil_score(X, km.labels_))
        y_mode_vote = cluster_predictions(y, km.labels_)
        f1_scores.append(f1_score(y, y_mode_vote))

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kclusters, sil_scores)
    plt.grid(True)
    plt.xlabel('No. Clusters')
    plt.ylabel('Silhouette')
    plt.title(title + ' k-means Silhouette')
    plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kclusters, f1_scores)
    plt.grid(True)
    plt.xlabel('No. Clusters')
    plt.ylabel('F1 Score')
    plt.title('F1 Scores KMeans: ' + title)
    plt.show()
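# `cluster_predictions` is called by several snippets here but defined in none
# of them. A minimal sketch, assuming it implements majority voting: every
# point in a cluster is predicted as the most common true label among that
# cluster's members, turning cluster labels into class predictions.
def cluster_predictions(y, cluster_labels):
    y = np.asarray(y)
    cluster_labels = np.asarray(cluster_labels)
    pred = np.empty_like(y)
    for c in np.unique(cluster_labels):
        mask = cluster_labels == c
        values, counts = np.unique(y[mask], return_counts=True)
        pred[mask] = values[np.argmax(counts)]  # modal true label of cluster c
    return pred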
def em_experiment(X, y, title, folder=""):
    cluster_range = list(np.arange(2, 11, 1))
    (sil_scores, accuracy_scores, homo_scores,
     sse_scores, ami_scores, bic_scores) = ([] for i in range(6))
    completeness_scores = []

    for k in cluster_range:
        em = EM(n_components=k).fit(X)
        em_labels = em.predict(X)
        sil_scores.append(sil_score(X, em_labels))
        # GaussianMixture.score is the per-sample average log-likelihood,
        # not an SSE; the original kept the "sse" name
        sse_scores.append(em.score(X))
        homo_scores.append(homogeneity_score(y, em_labels))
        completeness_scores.append(completeness_score(y, em_labels))
        ami_scores.append(adjusted_mutual_info_score(y, em_labels))
        bic_scores.append(em.bic(X))

    plt.plot(cluster_range, sil_scores)
    plt.xlabel('No. Components')
    plt.ylabel('Avg Silhouette Score')
    plt.title('Silhouette Score for EM: ' + title)
    plt.savefig(folder + '/EMSIL.png')
    plt.close()

    plt.plot(cluster_range, homo_scores)
    plt.xlabel('No. Components')
    plt.ylabel('Homogeneity Score')
    plt.title('Homogeneity Scores EM: ' + title)
    plt.savefig(folder + '/EMHOMOGENEITY.png')
    plt.close()

    plt.plot(cluster_range, completeness_scores)
    plt.xlabel('No. Components')
    plt.ylabel('Completeness Score')
    plt.title('Completeness Score for EM: ' + title)
    plt.savefig(folder + '/EMCompleteness.png')
    plt.close()

    plt.plot(cluster_range, sse_scores)
    plt.xlabel('No. Components')
    plt.ylabel('SSE Score')
    plt.title('SSE Scores EM: ' + title)
    plt.savefig(folder + '/EMSSE.png')
    plt.close()

    plt.plot(cluster_range, ami_scores)
    plt.xlabel('No. Components')
    plt.ylabel('AMI Score')
    plt.title('Adjusted Mutual Information Scores EM: ' + title)
    plt.savefig(folder + '/EMAMI.png')
    plt.close()

    plt.plot(cluster_range, bic_scores)
    plt.xlabel('No. Components')
    plt.ylabel('BIC Score')  # was mislabeled 'AMI Score'
    plt.title('BIC Scores EM: ' + title)
    plt.savefig(folder + '/EMBIC.png')
    plt.close()
def run_kmeans(X, y, title):
    kclusters = list(np.arange(2, 50, 2))
    sil_scores = []
    f1_scores = []
    homo_scores = []
    train_times = []

    for k in kclusters:
        start_time = timeit.default_timer()
        km = KMeans(n_clusters=k, n_init=10, random_state=100, n_jobs=-1).fit(X)
        end_time = timeit.default_timer()
        train_times.append(end_time - start_time)
        sil_scores.append(sil_score(X, km.labels_))
        y_mode_vote = cluster_predictions(y, km.labels_)
        f1_scores.append(f1_score(y, y_mode_vote))
        homo_scores.append(homogeneity_score(y, km.labels_))

    # elbow curve for silhouette score
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kclusters, sil_scores)
    plt.grid(True)
    plt.xlabel('No. Clusters')
    plt.ylabel('Avg Silhouette Score')
    plt.title('Elbow Plot for KMeans: ' + title)
    plt.show()

    # plot homogeneity scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kclusters, homo_scores)
    plt.grid(True)
    plt.xlabel('No. Clusters')
    plt.ylabel('Homogeneity Score')
    plt.title('Homogeneity Scores KMeans: ' + title)
    plt.show()

    # plot f1 scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kclusters, f1_scores)
    plt.grid(True)
    plt.xlabel('No. Clusters')
    plt.ylabel('F1 Score')
    plt.title('F1 Scores KMeans: ' + title)
    plt.show()

    # plot model training time
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kclusters, train_times)
    plt.grid(True)
    plt.xlabel('No. Clusters')
    plt.ylabel('Training Time (s)')
    plt.title('KMeans Training Time: ' + title)
    plt.show()
def run_EM(X, y, title):
    kdist = list(np.arange(2, 100, 5))
    sil_scores = []
    f1_scores = []
    homo_scores = []
    train_times = []
    aic_scores = []
    bic_scores = []

    for k in kdist:
        start_time = timeit.default_timer()
        em = EM(n_components=k, covariance_type='diag', n_init=1,
                warm_start=True, random_state=100).fit(X)
        end_time = timeit.default_timer()
        train_times.append(end_time - start_time)
        labels = em.predict(X)
        sil_scores.append(sil_score(X, labels))
        y_mode_vote = cluster_predictions(y, labels)
        f1_scores.append(f1_score(y, y_mode_vote))
        homo_scores.append(homogeneity_score(y, labels))
        aic_scores.append(em.aic(X))
        bic_scores.append(em.bic(X))

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, sil_scores)
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('Silhouette')
    plt.title(title + ' Exp Max Silhouette')
    plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, f1_scores)
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('F1 Score')
    plt.title(title + ' Exp Max F1')
    plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, aic_scores, label='AIC')
    ax.plot(kdist, bic_scores, label='BIC')
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('Model Complexity Score')
    plt.title(title + ' Exp Max Model Complexity')
    plt.legend(loc="best")
    plt.show()
def kmeans_experiment(X, y, title, folder=""):
    cluster_range = list(np.arange(2, 40, 1))
    (sil_scores, accuracy_scores, homo_scores,
     sse_scores, ami_scores) = ([] for i in range(5))
    completeness_scores = []

    print(title)
    for k in cluster_range:
        print(k)
        km = KMeans(n_clusters=k).fit(X)
        km_labels = km.predict(X)
        # km.score(X) is the negative inertia; inertia_ is the SSE itself
        sse_scores.append(km.inertia_)
        sil_scores.append(sil_score(X, km_labels))
        homo_scores.append(homogeneity_score(y, km_labels))
        completeness_scores.append(completeness_score(y, km_labels))
        ami_scores.append(adjusted_mutual_info_score(y, km_labels))

    plt.plot(cluster_range, sil_scores)
    plt.xlabel('No. Clusters')
    plt.ylabel('Avg Silhouette Score')
    plt.title('Silhouette Score for KMeans: ' + title)
    plt.savefig(folder + '/KMSIL.png')
    plt.close()

    plt.plot(cluster_range, homo_scores)
    plt.xlabel('No. Clusters')
    plt.ylabel('Homogeneity Score')
    plt.title('Homogeneity Scores KMeans: ' + title)
    plt.savefig(folder + '/KMHOMOGENEITY.png')
    plt.close()

    plt.plot(cluster_range, sse_scores)
    plt.xlabel('No. Clusters')
    plt.ylabel('SSE Score')
    plt.title('SSE Scores KMeans: ' + title)
    plt.savefig(folder + '/KMSSE.png')
    plt.close()

    plt.plot(cluster_range, ami_scores)
    plt.xlabel('No. Clusters')
    plt.ylabel('AMI Score')
    plt.title('Adjusted Mutual Information Scores KMeans: ' + title)
    plt.savefig(folder + '/KMAMI.png')
    plt.close()

    plt.plot(cluster_range, completeness_scores)
    plt.xlabel('No. Clusters')
    plt.ylabel('Completeness Score')
    plt.title('Completeness Scores KMeans: ' + title)
    plt.savefig(folder + '/KMCompleteness.png')
    plt.close()
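# A hypothetical driver for the two experiment functions above, assuming a
# scaled feature matrix X and a label vector y are already loaded:
#
#   import os
#   os.makedirs('output/bank', exist_ok=True)
#   kmeans_experiment(X, y, 'Bank', folder='output/bank')
#   em_experiment(X, y, 'Bank', folder='output/bank')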
def run_EM(X, y, title):
    kdist = list(np.arange(2, 100, 5))
    sil_scores = []
    train_times = []
    aic_scores = []
    bic_scores = []

    for k in kdist:
        start_time = timeit.default_timer()
        em = EM(n_components=k, covariance_type='spherical',
                random_state=100).fit(X)
        end_time = timeit.default_timer()
        train_times.append(end_time - start_time)
        labels = em.predict(X)
        sil_scores.append(sil_score(X, labels))
        # y_mode_vote = cluster_predictions(y, labels)
        # f1_scores.append(f1_score(y, y_mode_vote))
        # homo_scores.append(homogeneity_score(y, labels))
        aic_scores.append(em.aic(X))
        bic_scores.append(em.bic(X))

    # elbow curve for silhouette score
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, sil_scores)
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Avg Silhouette Score')
    plt.title('Silhouette Analysis for EM: ' + title)
    plt.show()

    # plot model AIC and BIC
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, aic_scores, label='AIC')
    ax.plot(kdist, bic_scores, label='BIC')
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Model Complexity Score')
    plt.title('EM Model Complexity: ' + title)
    plt.legend(loc="best")
    plt.show()
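# For reference, the two model-complexity criteria plotted above are
#     AIC = 2p - 2 ln(L)        BIC = p ln(n) - 2 ln(L)
# where p is the number of free parameters, n the number of samples, and L
# the maximized likelihood; lower is better, and BIC penalizes additional
# components more heavily as n grows.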
def run_kmeans(X, y, title):
    kclusters = list(np.arange(2, 50, 2))
    sil_scores = []
    train_times = []

    for k in kclusters:
        start_time = timeit.default_timer()
        km = KMeans(n_clusters=k, n_init=10, random_state=100, n_jobs=-1).fit(X)
        end_time = timeit.default_timer()
        train_times.append(end_time - start_time)
        sil_scores.append(sil_score(X, km.labels_))

    # elbow curve for silhouette score
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kclusters, sil_scores)
    plt.grid(True)
    plt.xlabel('No. Clusters')
    plt.ylabel('Avg Silhouette Score')
    plt.title('Silhouette Analysis for KMeans: ' + title)
    plt.show()
def run_EM(X, y, title):
    # kdist = [2,3,4,5]
    # kdist = list(range(2,51))
    kdist = list(np.arange(2, 20, 2))
    sil_scores = []
    f1_scores = []
    homo_scores = []
    train_times = []
    aic_scores = []
    bic_scores = []

    for k in kdist:
        start_time = timeit.default_timer()
        em = EM(n_components=k, covariance_type='diag', n_init=1,
                warm_start=True, random_state=100).fit(X)
        end_time = timeit.default_timer()
        train_times.append(end_time - start_time)
        labels = em.predict(X)
        sil_scores.append(sil_score(X, labels))
        y_mode_vote = cluster_predictions(y, labels)
        f1_scores.append(f1_score(y, y_mode_vote))
        homo_scores.append(homogeneity_score(y, labels))
        aic_scores.append(em.aic(X))
        bic_scores.append(em.bic(X))

    # elbow curve for silhouette score
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, sil_scores)
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Avg Silhouette Score')
    plt.title('Elbow Plot for EM: ' + title)
    plt.show()

    # plot homogeneity scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, homo_scores)
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Homogeneity Score')
    plt.title('Homogeneity Scores EM: ' + title)
    plt.show()

    # plot f1 scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, f1_scores)
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('F1 Score')
    plt.title('F1 Scores EM: ' + title)
    plt.show()

    # plot model AIC and BIC
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, aic_scores, label='AIC')
    ax.plot(kdist, bic_scores, label='BIC')
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Model Complexity Score')
    plt.title('EM Model Complexity: ' + title)
    plt.legend(loc="best")
    plt.show()
def __do_perform(self, custom_out=None, main_experiment=None):
    if custom_out is not None:
        self._old_out = self._out
        self._out = custom_out
    elif self._old_out is not None:
        self._out = self._old_out

    if main_experiment is not None:
        self.log("Performing {} as part of {}".format(
            self.experiment_name(), main_experiment.experiment_name()))
    else:
        self.log("Performing {}".format(self.experiment_name()))

    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/clustering.py
    # %% Data for 1-3
    sse = defaultdict(list)
    ll = defaultdict(list)
    bic = defaultdict(list)
    sil = defaultdict(lambda: defaultdict(list))
    sil_s = np.empty(shape=(2 * len(self._clusters) * self._details.ds.training_x.shape[0], 4),
                     dtype='<U21')
    acc = defaultdict(lambda: defaultdict(float))
    adj_mi = defaultdict(lambda: defaultdict(float))

    km = kmeans(random_state=self._details.seed)
    gmm = GMM(random_state=self._details.seed)

    st = clock()
    j = 0
    for k in self._clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(self._details.ds.training_x)
        gmm.fit(self._details.ds.training_x)
        km_labels = km.predict(self._details.ds.training_x)
        gmm_labels = gmm.predict(self._details.ds.training_x)

        sil[k]['Kmeans'] = sil_score(self._details.ds.training_x, km_labels)
        sil[k]['GMM'] = sil_score(self._details.ds.training_x, gmm_labels)
        km_sil_samples = sil_samples(self._details.ds.training_x, km_labels)
        gmm_sil_samples = sil_samples(self._details.ds.training_x, gmm_labels)
        # There has got to be a better way to do this, but I can't brain right now
        for i, x in enumerate(km_sil_samples):
            sil_s[j] = [k, 'Kmeans', round(x, 6), km_labels[i]]
            j += 1
        for i, x in enumerate(gmm_sil_samples):
            sil_s[j] = [k, 'GMM', round(x, 6), gmm_labels[i]]
            j += 1

        sse[k] = [km.score(self._details.ds.training_x)]
        ll[k] = [gmm.score(self._details.ds.training_x)]
        bic[k] = [gmm.bic(self._details.ds.training_x)]
        acc[k]['Kmeans'] = cluster_acc(self._details.ds.training_y, km_labels)
        acc[k]['GMM'] = cluster_acc(self._details.ds.training_y, gmm_labels)
        adj_mi[k]['Kmeans'] = ami(self._details.ds.training_y, km_labels)
        adj_mi[k]['GMM'] = ami(self._details.ds.training_y, gmm_labels)
        self.log("Cluster: {}, time: {}".format(k, clock() - st))

    sse = (-pd.DataFrame(sse)).T
    sse.index.name = 'k'
    sse.columns = ['{} sse (left)'.format(self._details.ds_readable_name)]

    ll = pd.DataFrame(ll).T
    ll.index.name = 'k'
    ll.columns = ['{} log-likelihood'.format(self._details.ds_readable_name)]

    bic = pd.DataFrame(bic).T
    bic.index.name = 'k'
    bic.columns = ['{} BIC'.format(self._details.ds_readable_name)]

    sil = pd.DataFrame(sil).T
    sil_s = pd.DataFrame(sil_s, columns=['k', 'type', 'score', 'label']).set_index('k')
    acc = pd.DataFrame(acc).T
    adj_mi = pd.DataFrame(adj_mi).T
    sil.index.name = 'k'
    sil_s.index.name = 'k'
    acc.index.name = 'k'
    adj_mi.index.name = 'k'

    sse.to_csv(self._out.format('{}_sse.csv'.format(self._details.ds_name)))
    ll.to_csv(self._out.format('{}_logliklihood.csv'.format(self._details.ds_name)))
    bic.to_csv(self._out.format('{}_bic.csv'.format(self._details.ds_name)))
    sil.to_csv(self._out.format('{}_sil_score.csv'.format(self._details.ds_name)))
    sil_s.to_csv(self._out.format('{}_sil_samples.csv'.format(self._details.ds_name)))
    acc.to_csv(self._out.format('{}_acc.csv'.format(self._details.ds_name)))
    adj_mi.to_csv(self._out.format('{}_adj_mi.csv'.format(self._details.ds_name)))

    # %% NN fit data (2,3)
    grid = {'km__n_clusters': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True,
                        random_state=self._details.seed)
    km = kmeans(random_state=self._details.seed, n_jobs=self._details.threads)
    pipe = Pipeline([('km', km), ('NN', mlp)], memory=experiments.pipeline_memory)
    gs, _ = self.gs_with_best_estimator(pipe, grid, type='kmeans')
    self.log("KMeans grid search complete")

    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(self._out.format('{}_cluster_kmeans.csv'.format(self._details.ds_name)))

    grid = {'gmm__n_components': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True,
                        random_state=self._details.seed)
    gmm = CustomGMM(random_state=self._details.seed)
    pipe = Pipeline([('gmm', gmm), ('NN', mlp)], memory=experiments.pipeline_memory)
    gs, _ = self.gs_with_best_estimator(pipe, grid, type='gmm')
    self.log("GMM grid search complete")

    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(self._out.format('{}_cluster_GMM.csv'.format(self._details.ds_name)))

    # %% For chart 4/5
    self._details.ds.training_x2D = TSNE(verbose=10, random_state=self._details.seed).fit_transform(
        self._details.ds.training_x)

    ds_2d = pd.DataFrame(np.hstack((self._details.ds.training_x2D,
                                    np.atleast_2d(self._details.ds.training_y).T)),
                         columns=['x', 'y', 'target'])
    ds_2d.to_csv(self._out.format('{}_2D.csv'.format(self._details.ds_name)))
    self.log("Done")
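# `cluster_acc` comes from the adapted repo and is not shown here. A sketch
# consistent with how it is used above, assuming it scores majority-vote
# cluster predictions against the true labels (reusing the
# `cluster_predictions` sketch defined earlier):
from sklearn.metrics import accuracy_score

def cluster_acc(y, cluster_labels):
    return accuracy_score(y, cluster_predictions(y, cluster_labels))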
def vis(X, y, nameappendix, k):
    scaler = MinMaxScaler(feature_range=[0, 100])
    scaler.fit(X)
    X = pd.DataFrame(scaler.transform(X))

    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(15, 6)
    ax1.set_xlim([-0.1, 1])
    ax1.set_ylim([0, len(X) + (k + 1) * 10])

    print("Num of clusters: ", k)
    clusters = KMeans(n_clusters=k, random_state=10).fit(X)
    labels = clusters.labels_
    print("NMI score: %.5f" % normalized_mutual_info_score(y, labels))

    silhouette_avg = sil_score(X, labels)
    print("Silhouette score: ", silhouette_avg)
    sample_silhouette_values = silhouette_samples(X, labels)

    y_lower = 10
    for i in range(k):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = plt.get_cmap('Spectral')(float(i) / k)
        ax1.fill_betweenx(np.arange(y_lower, y_upper), 0,
                          ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for the next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("Silhouette Coefficients for Clusters")
    ax1.set_xlabel("Silhouette Coefficient Values")
    ax1.set_ylabel("Cluster Labels")

    # The vertical line for the average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])  # Clear the y-axis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd plot showing the actual clusters formed
    colors = plt.get_cmap('Spectral')(labels.astype(float) / k)
    ax2.scatter(X.values[:, 3], X.values[:, 5], marker='.', s=30, lw=0,
                alpha=0.7, c=colors, edgecolor='k')

    # Labeling the clusters: draw white circles at cluster centers
    centers = clusters.cluster_centers_
    ax2.scatter(centers[:, 3], centers[:, 5], marker='o', c="white",
                alpha=1, s=200, edgecolor='k')
    for i, c in enumerate(centers):
        ax2.scatter(c[3], c[5], marker='$%d$' % i, alpha=1, s=50, edgecolor='k')

    ax2.set_title("Clustering Visualization")
    ax2.set_xlabel("1st feature: Pressure X4")
    ax2.set_ylabel("2nd feature: Pressure X5")

    plt.suptitle("Analysis for KMeans for " + str(k) + " Clusters",
                 fontsize=14, fontweight='bold')
    # plt.savefig('img/kmeans_vis_' + nameappendix + '_' + str(k) + '.png')
    plt.show()
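# For reference, the per-sample silhouette coefficient drawn by `vis` is
#     s(i) = (b(i) - a(i)) / max(a(i), b(i))
# where a(i) is the mean distance from sample i to the other members of its
# own cluster and b(i) is the mean distance to the members of the nearest
# other cluster; s(i) ranges from -1 to 1, and higher is better.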
def __do_perform(self, custom_out=None, main_experiment=None):
    # annotated copy, e.g. custom_out = './output/ICA/clustering/{}' for ICAExperiment
    if custom_out is not None:
        self._old_out = self._out    # e.g. './output/ICA/{}'
        self._out = custom_out       # e.g. './output/ICA/clustering/{}'
    elif self._old_out is not None:
        self._out = self._old_out

    if main_experiment is not None:
        # e.g. 'clustering' as part of 'ICA'
        self.log("Performing {} as part of {}".format(
            self.experiment_name(), main_experiment.experiment_name()))
    else:
        self.log("Performing {}".format(self.experiment_name()))

    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/clustering.py
    # %% Data for 1-3
    sse = defaultdict(list)
    ll = defaultdict(list)
    bic = defaultdict(list)
    sil = defaultdict(lambda: defaultdict(list))
    sil_s = np.empty(shape=(2 * len(self._clusters) * self._details.ds.training_x.shape[0], 4),
                     dtype='<U21')
    acc = defaultdict(lambda: defaultdict(float))
    adj_mi = defaultdict(lambda: defaultdict(float))

    km = kmeans(random_state=self._details.seed)
    gmm = GMM(random_state=self._details.seed)

    st = clock()
    j = 0
    for k in self._clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        # cluster the (e.g. ICA-transformed) input features with varying k
        km.fit(self._details.ds.training_x)
        gmm.fit(self._details.ds.training_x)
        # give each input sample a cluster label
        km_labels = km.predict(self._details.ds.training_x)
        gmm_labels = gmm.predict(self._details.ds.training_x)

        # mean silhouette score over all samples
        sil[k]['Kmeans'] = sil_score(self._details.ds.training_x, km_labels)
        sil[k]['GMM'] = sil_score(self._details.ds.training_x, gmm_labels)
        # per-sample silhouette scores
        km_sil_samples = sil_samples(self._details.ds.training_x, km_labels)
        gmm_sil_samples = sil_samples(self._details.ds.training_x, gmm_labels)
        # record the silhouette score x for each sample i with its label
        for i, x in enumerate(km_sil_samples):
            sil_s[j] = [k, 'Kmeans', round(x, 6), km_labels[i]]
            j += 1
        for i, x in enumerate(gmm_sil_samples):
            sil_s[j] = [k, 'GMM', round(x, 6), gmm_labels[i]]
            j += 1

        # km.score is the negative of the k-means objective, i.e. the negative
        # sum of squared distances of samples to their closest cluster center
        sse[k] = [km.score(self._details.ds.training_x)]
        # per-sample average log-likelihood under the fitted mixture
        ll[k] = [gmm.score(self._details.ds.training_x)]
        # Bayesian information criterion on the input X: p*ln(n) - 2*ln(L);
        # lower is better
        bic[k] = [gmm.bic(self._details.ds.training_x)]
        # accuracy if each cluster predicts the majority true label of its members
        acc[k]['Kmeans'] = cluster_acc(self._details.ds.training_y, km_labels)
        acc[k]['GMM'] = cluster_acc(self._details.ds.training_y, gmm_labels)
        # adjusted mutual information between true labels and cluster labels
        # (how well the clustering matches the ground truth)
        adj_mi[k]['Kmeans'] = ami(self._details.ds.training_y, km_labels)
        adj_mi[k]['GMM'] = ami(self._details.ds.training_y, gmm_labels)
        self.log("Cluster: {}, time: {}".format(k, clock() - st))

    sse = (-pd.DataFrame(sse)).T
    sse.index.name = 'k'
    sse.columns = ['{} sse (left)'.format(self._details.ds_readable_name)]    # e.g. 'Bank sse (left)'

    ll = pd.DataFrame(ll).T
    ll.index.name = 'k'
    ll.columns = ['{} log-likelihood'.format(self._details.ds_readable_name)]

    bic = pd.DataFrame(bic).T
    bic.index.name = 'k'
    bic.columns = ['{} BIC'.format(self._details.ds_readable_name)]

    sil = pd.DataFrame(sil).T
    sil_s = pd.DataFrame(sil_s, columns=['k', 'type', 'score', 'label']).set_index('k')
    acc = pd.DataFrame(acc).T
    adj_mi = pd.DataFrame(adj_mi).T
    sil.index.name = 'k'
    sil_s.index.name = 'k'
    acc.index.name = 'k'
    adj_mi.index.name = 'k'

    # write scores to files
    sse.to_csv(self._out.format('{}_sse.csv'.format(self._details.ds_name)))
    ll.to_csv(self._out.format('{}_logliklihood.csv'.format(self._details.ds_name)))
    bic.to_csv(self._out.format('{}_bic.csv'.format(self._details.ds_name)))
    sil.to_csv(self._out.format('{}_sil_score.csv'.format(self._details.ds_name)))
    sil_s.to_csv(self._out.format('{}_sil_samples.csv'.format(self._details.ds_name)))
    acc.to_csv(self._out.format('{}_acc.csv'.format(self._details.ds_name)))
    adj_mi.to_csv(self._out.format('{}_adj_mi.csv'.format(self._details.ds_name)))

    # %% NN fit data (2,3): train a NN on the clustered data. In the pipeline
    # the KMeans step acts as a transformer, so the NN trains on the
    # cluster-distance features (distance of each sample to each center),
    # not on the raw inputs.
    grid = {'km__n_clusters': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True,
                        random_state=self._details.seed)
    km = kmeans(random_state=self._details.seed, n_jobs=self._details.threads)
    pipe = Pipeline([('km', km), ('NN', mlp)], memory=experiments.pipeline_memory)
    gs, _ = self.gs_with_best_estimator(pipe, grid, type='kmeans')  # also writes the best NN to file
    self.log("KMeans grid search complete")

    tmp = pd.DataFrame(gs.cv_results_)
    # grid search results --> e.g. bank_cluster_kmeans.csv
    tmp.to_csv(self._out.format('{}_cluster_kmeans.csv'.format(self._details.ds_name)))

    # same NN grid search, with the CustomGMM step supplying the features
    # (whatever its transform exposes, e.g. component responsibilities)
    grid = {'gmm__n_components': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True,
                        random_state=self._details.seed)
    gmm = CustomGMM(random_state=self._details.seed)
    pipe = Pipeline([('gmm', gmm), ('NN', mlp)], memory=experiments.pipeline_memory)
    gs, _ = self.gs_with_best_estimator(pipe, grid, type='gmm')
    self.log("GMM grid search complete")

    tmp = pd.DataFrame(gs.cv_results_)
    # grid search results --> e.g. bank_cluster_GMM.csv
    tmp.to_csv(self._out.format('{}_cluster_GMM.csv'.format(self._details.ds_name)))

    # %% For chart 4/5: t-SNE gives a 2-D embedding of the training data
    # for visualizing the cluster structure
    self._details.ds.training_x2D = TSNE(verbose=10, random_state=self._details.seed).fit_transform(
        self._details.ds.training_x)

    ds_2d = pd.DataFrame(np.hstack((self._details.ds.training_x2D,
                                    np.atleast_2d(self._details.ds.training_y).T)),
                         columns=['x', 'y', 'target'])
    ds_2d.to_csv(self._out.format('{}_2D.csv'.format(self._details.ds_name)))  # --> e.g. bank_2D.csv
    self.log("Done")
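# A minimal sketch (not from the source) of how the saved *_2D.csv could be
# rendered for charts 4/5: scatter the t-SNE embedding colored by the true
# target. The function name and styling are assumptions.
import pandas as pd
import matplotlib.pyplot as plt

def plot_tsne_2d(csv_path, title):
    df = pd.read_csv(csv_path)
    plt.scatter(df['x'], df['y'], c=df['target'], s=10, cmap='viridis', alpha=0.7)
    plt.xlabel('t-SNE component 1')
    plt.ylabel('t-SNE component 2')
    plt.title(title)
    plt.show()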
# Fragment: runs inside a per-k loop, with km/gmm already configured for k
# clusters and dataX/dataY the feature matrix and true labels.
print('Now processing {} data with {} using {} clusters...'.format(ds, r, k))
data_st = clock()

# fit the credit data
km.fit(dataX)
km_labels = km.predict(dataX)
gmm.fit(dataX)
gmm_labels = gmm.predict(dataX)

# save the labels
labels[k]['Kmeans'] = km_labels
labels[k]['GMM'] = gmm_labels

sil[k]['Kmeans'] = sil_score(dataX, km_labels)
sil[k]['GMM'] = sil_score(dataX, gmm_labels)
km_sil_samples = sil_samples(dataX, km_labels)
gmm_sil_samples = sil_samples(dataX, gmm_labels)
for i, x in enumerate(km_sil_samples):
    sil_samp[j] = [k, 'Kmeans', round(x, 6), km_labels[i]]
    j += 1
for i, x in enumerate(gmm_sil_samples):
    sil_samp[j] = [k, 'GMM', round(x, 6), gmm_labels[i]]
    j += 1

sse[k] = km.score(dataX)
ll[k] = gmm.score(dataX)
bic[k] = gmm.bic(dataX)
acc[k]['Kmeans'] = cluster_acc(dataY, km_labels)
acc[k]['GMM'] = cluster_acc(dataY, gmm_labels)
adj_mi[k]['Kmeans'] = ami(dataY, km_labels)
adj_mi[k]['GMM'] = ami(dataY, gmm_labels)  # completed to mirror the Kmeans line; truncated in the source
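# Assumed context for the NMF snippet below (not shown in the source): the
# documents, component count, and vectorizer settings are placeholders.
# `normalizer` is taken to be an L2 Normalizer, so the dot products below
# become cosine similarities in [0, 1].
import igraph
from copy import deepcopy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer

# docs = [...]  # raw text documents (elided in the source)
# x_tfidf = TfidfVectorizer(stop_words='english').fit_transform(docs)
nmf = NMF(n_components=10, random_state=0)
normalizer = Normalizer(norm='l2')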
nmf_array = nmf.fit_transform(x_tfidf)
# Normalize the nmf array
nmf_array = normalizer.fit_transform(nmf_array)

# NMF labels
labels = [np.argmax(x) for x in nmf_array]

# Weighted matrix of similarities
weighted_matrix = nmf_array.dot(nmf_array.T)

# ------ Silhouette coefficient of dissimilarities ---- #
dissim = np.ones(weighted_matrix.shape) - weighted_matrix
sil = sil_score(dissim, labels, metric='precomputed')

# --------------------- Modularity -------------------- #
weighted_matrix_graph = deepcopy(weighted_matrix)
for i in range(len(weighted_matrix_graph)):
    weighted_matrix_graph[i][i] = 0.00
graph = igraph.Graph.Weighted_Adjacency(list(weighted_matrix_graph),
                                        mode=igraph.ADJ_MAX)
weights = [es['weight'] for es in graph.es]
mod = graph.modularity(labels, weights=weights)

# ---------------- Weak merit factor ------------ #
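# For reference, the weighted modularity computed above is
#     Q = (1 / 2m) * sum_ij [A_ij - (k_i * k_j) / 2m] * delta(c_i, c_j)
# where A_ij is the edge weight between nodes i and j, k_i is the weighted
# degree of node i, m is the total edge weight, and delta(c_i, c_j) is 1
# when i and j are assigned the same community (here, the same NMF topic).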
def cluster(cluster_range, dataset, dir):
    global start, kmeans_accuracy_for_k, end, em_prediction_y, em_pca_accuracy_for_k
    kmeans_accuracy, em_accuracy, kmeans_timetaken, em_timetaken = {}, {}, {}, {}
    sse = defaultdict(list)
    ll = defaultdict(list)
    bic = defaultdict(list)
    sil = defaultdict(lambda: defaultdict(list))
    acc = defaultdict(lambda: defaultdict(float))
    adj_mi = defaultdict(lambda: defaultdict(float))

    for k in cluster_range:
        km = KMeans(n_clusters=k, random_state=0)
        gmm = GaussianMixture(n_components=k, random_state=0)

        # KMeans clustering
        start = datetime.now()
        kmeans_predicted_y = km.fit_predict(dataset.x)
        end = datetime.now()
        # record the KMeans time before the EM timing overwrites start/end
        # (the original read it afterwards, so both entries held the EM time)
        kmeans_timetaken[k] = (end - start).total_seconds()

        # EM clustering
        start = datetime.now()
        em_prediction_y = gmm.fit_predict(dataset.x)
        end = datetime.now()
        em_timetaken[k] = (end - start).total_seconds()

        # Accuracy
        kmeans_accuracy_for_k = common_utils.get_cluster_accuracy(
            dataset.y, kmeans_predicted_y)
        kmeans_accuracy[k] = kmeans_accuracy_for_k
        em_pca_accuracy_for_k = common_utils.get_cluster_accuracy(
            dataset.y, em_prediction_y)
        em_accuracy[k] = em_pca_accuracy_for_k

        # Plotting data
        sil[k]['Kmeans'] = sil_score(dataset.x, kmeans_predicted_y)
        sil[k]['GMM'] = sil_score(dataset.x, em_prediction_y)
        sse[k] = [km.score(dataset.x)]
        ll[k] = [gmm.score(dataset.x)]
        bic[k] = [gmm.bic(dataset.x)]
        adj_mi[k]['Kmeans'] = ami(dataset.y, kmeans_predicted_y)
        adj_mi[k]['GMM'] = ami(dataset.y, em_prediction_y)

    sse = (-pd.DataFrame(sse)).T
    sse.index.name = 'k'
    sse.columns = ['{} sse (left)'.format(dataset.dataset_name)]
    ll = pd.DataFrame(ll).T
    ll.index.name = 'k'
    ll.columns = ['{} log-likelihood'.format(dataset.dataset_name)]
    bic = pd.DataFrame(bic).T
    bic.index.name = 'k'
    bic.columns = ['{} BIC'.format(dataset.dataset_name)]
    sil = pd.DataFrame(sil).T
    adj_mi = pd.DataFrame(adj_mi).T
    sil.index.name = 'k'
    adj_mi.index.name = 'k'

    sse.to_csv(dir + '{}_sse.csv'.format(dataset.dataset_name))
    ll.to_csv(dir + '{}_logliklihood.csv'.format(dataset.dataset_name))
    bic.to_csv(dir + '{}_bic.csv'.format(dataset.dataset_name))
    sil.to_csv(dir + '{}_sil_score.csv'.format(dataset.dataset_name))
    adj_mi.to_csv(dir + '{}_adj_mi.csv'.format(dataset.dataset_name))

    neural_net_score = nn_experiment(dataset)

    common_utils.plot_clustering_accuracy(kmeans_accuracy,
                                          "k-means - clusters vs Accuracy", dir)
    common_utils.plot_clustering_time(kmeans_timetaken,
                                      "k-means - clusters vs Time", dir)
    common_utils.plot_clustering_accuracy(em_accuracy,
                                          "EM - clusters vs Accuracy", dir)
    common_utils.plot_clustering_time(em_timetaken,
                                      "EM - clusters vs Time", dir)
    common_utils.read_and_plot_sse(
        'Clustering', dir + '{}_sse.csv'.format(dataset.dataset_name), dir)
    common_utils.read_and_plot_loglikelihood(
        'Clustering', dir + '{}_logliklihood.csv'.format(dataset.dataset_name), dir)
    common_utils.read_and_plot_bic(
        'Clustering', dir + '{}_bic.csv'.format(dataset.dataset_name), dir)
    common_utils.read_and_plot_sil_score(
        'Clustering', dir + '{}_sil_score.csv'.format(dataset.dataset_name), dir)
    common_utils.read_and_plot_adj_mi(
        'Clustering', dir + '{}_adj_mi.csv'.format(dataset.dataset_name), dir)
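# Hypothetical invocation, assuming a dataset object exposing x, y and
# dataset_name, plus an existing output directory:
#
#   cluster(list(range(2, 21)), dataset, 'output/clustering/')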