def plot_km_em_clusters(x_train_scaled, x=0, y=1, z=0, dataset_name="", km_clusters=12, em_clusters=12):
    """Fit K-means and a Bayesian GMM (EM) on the scaled data, rendering a 3-D cluster plot for each.

    x, y, z select the feature indices used as plot axes.
    """
    # K-means pass
    km = KMeans(n_clusters=km_clusters, random_state=random_state)
    km_labels = km.fit_predict(x_train_scaled)
    plot_clusters_3d(x_train_scaled, km_labels, x, y, z, dataset_name=dataset_name, classifier="K-means")
    # EM (variational Bayesian GMM) pass
    em = BayesianGaussianMixture(n_components=em_clusters, random_state=random_state, reg_covar=1e-01)
    em_labels = em.fit_predict(x_train_scaled)
    plot_clusters_3d(x_train_scaled, em_labels, x, y, z, dataset_name=dataset_name, classifier="EM")
def view_gmm_graph(selected: np.ndarray, validation_type: str = 'default', df: pd.DataFrame = None, n_init: int = 5, max_k=150) -> tuple:
    """Sweep Bayesian-GMM component counts (10, 20, ..., < max_k) and plot a score curve.

    validation_type 'default' scores each model by log-likelihood on the data;
    'purity' requires *df* and scores cluster purity of the predicted labels.

    Returns (list of k values tried, list of scores).

    Raises ValueError when validation_type='purity' and no DataFrame is given
    (previously this crashed with AttributeError on ``None``).
    """
    if validation_type == 'purity' and df is None:
        raise ValueError("validation_type='purity' requires a DataFrame via df=")
    scores = []
    temp_df = df.copy() if df is not None else None
    temp_selected = selected.toarray()  # assumes a scipy sparse matrix input — TODO confirm
    ks = list(range(10, max_k, 10))
    if validation_type in ('default', 'purity'):
        for k in ks:
            model = BayesianGaussianMixture(k, covariance_type='diag', n_init=n_init,
                                            init_params='kmeans', random_state=12)
            if validation_type == 'default':
                scores.append(model.fit(temp_selected).score(temp_selected))
            else:
                # NOTE: labels are stored under the historical 'KMeans' column name.
                temp_df['KMeans'] = model.fit_predict(temp_selected)
                scores.append(cluster_purity(temp_df))
            print(f'k={k}, score={scores[-1]}')
    plt.plot(ks, scores)
    plt.xlabel('Number of clusters')
    plt.ylabel('Score')
    plt.show()
    return ks, scores
def test_bayesian_mixture_fit_predict_n_init():
    # fit_predict must agree with fit followed by predict when n_init > 1.
    rng = np.random.RandomState(0)
    X = rng.randn(50, 5)
    bgmm = BayesianGaussianMixture(n_components=5, n_init=10, random_state=0)
    labels_from_fit_predict = bgmm.fit_predict(X)
    labels_from_predict = bgmm.predict(X)
    assert_array_equal(labels_from_fit_predict, labels_from_predict)
def test_bayesian_mixture_fit_predict_n_init():
    # fit_predict must agree with fit followed by predict when n_init > 1
    # (larger-sample variant: 1000 points).
    rng = np.random.RandomState(0)
    X = rng.randn(1000, 5)
    bgmm = BayesianGaussianMixture(n_components=5, n_init=10, random_state=0)
    labels_from_fit_predict = bgmm.fit_predict(X)
    labels_from_predict = bgmm.predict(X)
    assert_array_equal(labels_from_fit_predict, labels_from_predict)
def BayesianGaussianMixture(V, **kwargs):
    """Performs clustering on *V* by using Gaussian mixture models with variational
    inference. The function uses :func:`sklearn.mixture.BayesianGaussianMixture`.
    See sklearn documents for details.

    :arg V: row-normalized eigenvectors for the purpose of clustering.
    :type V: :class:`numpy.ndarray`

    :arg n_clusters: specifies the number of clusters.
    :type n_clusters: int
    """
    try:
        # Local import; the module-level name is shadowed by this wrapper.
        from sklearn.mixture import BayesianGaussianMixture
    except ImportError:
        raise ImportError('Use of this function (BayesianGaussianMixture) requires the '
                          'installation of sklearn.')

    # Accept either sklearn's n_components or the n_clusters alias; default 1.
    n_components = kwargs.pop('n_components', None)
    if n_components is None:
        n_components = kwargs.pop('n_clusters', None)
    if n_components is None:
        n_components = 1

    n_init = kwargs.pop('n_init', 1)

    # BUG FIXES vs. the original: n_components was popped but never passed to
    # the estimator (it always clustered with the sklearn default), and the
    # model was fitted twice (fit followed by fit_predict). Fit once here.
    mixture = BayesianGaussianMixture(n_components=n_components, n_init=n_init, **kwargs)
    return mixture.fit_predict(V)
def model_dpgmm(k, x_scaled):
    """Cluster x_scaled with a Dirichlet-process GMM, store labels in the global, and rebuild the dataframe."""
    global labels
    print("\nDirichlet Process Gaussian Mixture ... [" + str(len(x_scaled))
          + ' training samples -> ' + str(k) + ' initial clusters]')
    dpgmm = BayesianGaussianMixture(n_components=k, covariance_type='full')
    labels = dpgmm.fit_predict(x_scaled)
    print('Done! Final cluster numbers = ', len(np.unique(labels)))
    do_dataframe()
def identifyTransitionsComplete(p, window_size):
    """Detect transition indices across all rollouts via windowed DP-GMM clustering.

    Features per rollout are joint angles + end-effector pose + force feedback;
    sliding windows of `window_size` timesteps are clustered with a Bayesian GMM
    and label changes (after smoothing) mark transitions.
    """
    joint_angles = p['X'][:, :, 0:7]
    joint_velocity = p['X'][:, :, 7:14]
    torque = p['U'][:, :, :]
    endeff_pose = p['EX'][:, :, 0:6]
    endeff_velocity = p['EX'][:, :, 6:12]
    force_feedback = p['F'][:, :, :]

    total_rollout = joint_velocity.shape[0]
    # Per-rollout feature matrix: angles, end-effector pose, force feedback.
    traj = np.array([
        np.hstack((joint_angles[r, :, :], endeff_pose[r, :, :], force_feedback[r, :, :]))
        for r in range(total_rollout)
    ])
    traj_time = traj.shape[1]
    dim = traj.shape[2]
    total_size = total_rollout * traj_time

    # Flatten sliding windows of each rollout into rows of a design matrix.
    demo_data_array = np.zeros((total_size - window_size, dim * window_size))
    row = 0
    for r in range(total_rollout):
        rollout = traj[r, :, :]
        for t in range(window_size, traj_time):
            demo_data_array[row, :] = rollout[t - window_size:t, :].reshape(1, dim * window_size)
            row += 1

    estimator = BayesianGaussianMixture(n_components=10, n_init=10, max_iter=300,
                                        weight_concentration_prior=1e-1,
                                        init_params='random', verbose=False)
    filtabels = smoothing(estimator.fit_predict(demo_data_array))

    # A transition occurs wherever the smoothed label sequence changes.
    transitions = []
    idx = 0
    for j in range(window_size, total_size):
        if idx == 0 or j == window_size:
            pass
        elif j == (total_size - 1):
            pass
        elif filtabels[idx - 1] != filtabels[idx]:
            transitions.append(j - window_size)
        idx += 1
    transitions.append(0)
    transitions.append(total_size - 1)
    transitions.sort()
    print("[TSC] Discovered Transitions (number): ", len(transitions))
    return transitions
def partition_data(self, j):
    """Partition the j-th data subset with a truncated Dirichlet-process mixture.

    Returns (Z, Z_count): relabelled cluster assignments and per-cluster counts.
    Side effects: records the cluster count in self.K[j] and initializes
    self.marginal_LL_k[j] to zeros.
    """
    # Truncation level grows logarithmically with the data size.
    n_comp = int(self.alpha * np.log(self.N))
    dp = BayesianGaussianMixture(n_components=n_comp,
                                 weight_concentration_prior=self.alpha,
                                 init_params='kmeans',
                                 weight_concentration_prior_type='dirichlet_process')
    raw = dp.fit_predict(self.X[self.U[j]])
    # Re-encode labels to a dense 0..K-1 range.
    Z = LE().fit_transform(raw)
    Z_count = np.bincount(Z)
    assert (Z.max() + 1 == Z_count.size)
    self.K[j] = int(Z_count.size)
    self.marginal_LL_k[j] = {k: 0 for k in range(int(self.K[j]))}
    return (Z, Z_count)
def cluster(points, clouds=None, concentration_prior=None, K=100, restarts=10, seed=0):
    """Cluster data points (optionally padded with bootstrapped cloud points) via a Bayesian GMM.

    Arguments:
        points: list of data points to cluster.
        clouds: optional bootstrapped bins appended for fitting only; their
            assignments are discarded from the returned labels.
        concentration_prior (float): weight-concentration tuning parameter —
            higher favors more clusters, lower favors fewer.
        K (int): maximum number of clusters to infer.
        restarts (int): number of GMM initializations.
        seed (int): random number generator seed for the GMM.

    Returns:
        (mus, sigmas, clusterAssignments, numPoints, numClusters): cluster
        means, covariances, per-point labels for *points* only, per-cluster
        point counts, and the number of non-empty clusters.
    """
    from sklearn.mixture import BayesianGaussianMixture
    from collections import Counter

    sp.log(msg="## Clustering with K={} and c={}...\n".format(
        K, concentration_prior), level="INFO")

    combined = list(points)
    if clouds is not None:
        combined.extend(list(clouds))
    data = np.array(combined)

    gmm = BayesianGaussianMixture(
        n_components=K,
        n_init=restarts,
        weight_concentration_prior=concentration_prior,
        max_iter=int(1e6),
        random_state=seed)
    # Keep only the assignments for the real points, not the cloud padding.
    targetAssignments = gmm.fit_predict(data)[:len(points)]

    counts = Counter(targetAssignments)
    numPoints = [counts.get(i, 0) for i in range(K)]
    return gmm.means_, gmm.covariances_, targetAssignments, numPoints, len(counts)
def add_gmm_labels(df: pd.DataFrame, selected: np.array, k: int = 60, n_init=5):
    """Fit a diagonal-covariance Bayesian GMM on *selected* and write its labels to df['GMM'].

    Returns the fitted model. *selected* is densified with .toarray(), so a
    scipy sparse matrix is expected.
    """
    model = BayesianGaussianMixture(k, covariance_type='diag', n_init=n_init,
                                    init_params='kmeans', random_state=12)
    df['GMM'] = model.fit_predict(selected.toarray())
    return model
def ms_VB(self):
    """Fit sklearn's variational Bayesian GMM (VBGMM) directly and report accuracy."""
    self.K = MAX_K
    clf = BayesianGaussianMixture(n_components=MAX_K, covariance_type="full",
                                  max_iter=200, random_state=0)
    # Reorder predicted labels to match the reference ordering.
    y = self.correct_order(clf.fit_predict(self.x), clf)
    self.mus = clf.means_
    print(y)
    self.show_scatter(y, "VBGMM")
    accuracy = np.mean(self.real_y.ravel() == y.ravel())
    print(accuracy)
def relabel(dataset, n_components=4):
    """Re-split each existing label group with a Bayesian GMM on a 6-D PCA embedding.

    Returns a new label array (1-based, -1 never survives); empty dataset
    yields an empty array.
    """
    if not len(dataset):
        return np.array([])
    new_labels = -1 * np.ones(len(dataset.labels))
    for old in np.unique(dataset.labels):
        mask = dataset.labels == old
        gmm = BayesianGaussianMixture(n_components=n_components,
                                      weight_concentration_prior=1 / (n_components * 2),
                                      max_iter=200)
        embedded = PCA(n_components=6).fit_transform(dataset.waveforms[mask])
        sub_labels = gmm.fit_predict(embedded)
        # Offset so clusters from different groups never collide.
        new_labels[mask] = sub_labels + 1 + np.max(new_labels)
    return new_labels
def identifyTransitions(traj, window_size, weight_prior, n_components):
    """Transition detection based on a DP-GMM over sliding windows.

    :param traj: trajectory with states, action, contact forces
    :param window_size: window size used to accumulate states
    :param weight_prior, n_components: parameters of the DP-GMM
    :return: transition points (indices) in the trajectory
    """
    total_size = traj.shape[0]
    dim = traj.shape[1]

    # Flatten sliding windows into rows of a design matrix.
    demo_data_array = np.zeros((total_size - window_size, dim * window_size))
    row = 0
    for t in range(window_size, total_size):
        demo_data_array[row, :] = traj[t - window_size:t, :].reshape(1, dim * window_size)
        row += 1

    estimator = BayesianGaussianMixture(
        n_components=n_components, n_init=10, max_iter=300,
        weight_concentration_prior=weight_prior,
        init_params='random', verbose=False)
    filtabels = smoothing(estimator.fit_predict(demo_data_array))

    # Smoothed label changes mark transitions.
    transitions = []
    idx = 0
    for j in range(window_size, total_size):
        if idx == 0 or j == window_size:
            pass
        elif j == (total_size - 1):
            pass
        elif filtabels[idx - 1] != filtabels[idx]:
            transitions.append(j - window_size)
        idx += 1
    transitions.append(0)
    transitions.append(total_size - 1)
    transitions.sort()
    return transitions
def execute(self, namespace):
    """Fit a Gaussian mixture to 3-D point coordinates and attach cluster labels.

    mode 'n': fixed component count; 'bic': pick the count (1..n) minimizing
    BIC; 'bayesian': variational Bayesian GMM with n components.
    """
    from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
    from PYME.IO import MetaDataHandler

    points = namespace[self.input_points]
    X = np.stack([points['x'], points['y'], points['z']], axis=1)

    if self.mode == 'n':
        model = GaussianMixture(n_components=self.n, covariance_type=self.covariance)
        predictions = model.fit_predict(X)
    elif self.mode == 'bic':
        candidates = range(1, self.n + 1)
        bic = np.zeros(len(candidates))
        for ind in range(len(candidates)):
            model = GaussianMixture(n_components=candidates[ind],
                                    covariance_type=self.covariance)
            model.fit(X)
            bic[ind] = model.bic(X)
            logger.debug('%d BIC: %f' % (candidates[ind], bic[ind]))
        best = candidates[np.argmin(bic)]
        if best == self.n or (self.n > 10 and best > 0.9 * self.n):
            logger.warning('BIC optimization selected n components near n max')
        model = GaussianMixture(n_components=best, covariance_type=self.covariance)
        predictions = model.fit_predict(X)
    elif self.mode == 'bayesian':
        model = BayesianGaussianMixture(n_components=self.n,
                                        covariance_type=self.covariance)
        predictions = model.fit_predict(X)

    out = tabular.MappingFilter(points)
    try:
        out.mdh = MetaDataHandler.DictMDHandler(points.mdh)
    except AttributeError:
        pass
    out.addColumn(self.label_key, predictions)
    namespace[self.output_labeled] = out
def get_direction(self, X, mu):
    r"""
    Generate direction vectors from a Bayesian GMM fit of the complementary ensemble.

    Parameters
    ----------
    X : array
        Array of shape ``(nwalkers//2, ndim)`` with the walker positions
        of the complementary ensemble.
    mu : float
        The value of the scale factor ``mu``.

    Returns
    -------
    (directions, tune_once) : tuple
        Direction vectors of shape ``(nwalkers//2, ndim)`` scaled by 2, and
        a flag indicating whether tuning should happen this step.
    """
    if not self.tune:
        mu = self.mu0

    n = X.shape[0]
    mixture = BayesianGaussianMixture(n_components=self.n_components)
    labels = mixture.fit_predict(X)
    means = mixture.means_
    covariances = mixture.covariances_

    # Drawing from the label array weights components by their occupancy.
    i, j = np.random.choice(labels, 2, replace=False)
    if i != j:
        # Inter-cluster move: difference of draws from the two components.
        sample_i = np.random.multivariate_normal(
            means[i], covariances[i] * self.rescale_cov, size=n)
        sample_j = np.random.multivariate_normal(
            means[j], covariances[j] * self.rescale_cov, size=n)
        directions = sample_i - sample_j
        tune_once = False
    else:
        # Intra-cluster move: zero-mean draw scaled by mu.
        directions = mu * np.random.multivariate_normal(
            np.zeros_like(means[i]), covariances[i], size=n)
        tune_once = bool(self.tune)
    return 2.0 * directions, tune_once
def identifyTransitions(traj, window_size):
    """
    Identify transitions by accumulating data points with a sliding window
    and using a DP-GMM to find clusters in a single trajectory.
    """
    total_size = traj.shape[0]
    dim = traj.shape[1]

    # Flatten sliding windows into rows of a design matrix.
    demo_data_array = np.zeros((total_size - window_size, dim * window_size))
    row = 0
    for t in range(window_size, total_size):
        demo_data_array[row, :] = traj[t - window_size:t, :].reshape(1, dim * window_size)
        row += 1

    estimator = BayesianGaussianMixture(n_components=5, n_init=10, max_iter=300,
                                        weight_concentration_prior=0.01,
                                        init_params='random', verbose=False)
    filtabels = smoothing(estimator.fit_predict(demo_data_array))

    # Smoothed label changes mark transitions.
    transitions = []
    idx = 0
    for j in range(window_size, total_size):
        if idx == 0 or j == window_size:
            pass
        elif j == (total_size - 1):
            pass
        elif filtabels[idx - 1] != filtabels[idx]:
            transitions.append(j - window_size)
        idx += 1
    transitions.append(0)
    transitions.append(total_size - 1)
    transitions.sort()
    print("[TSC] Discovered Transitions (number): ", len(transitions))
    return transitions
def identifyTransitions(traj, window_size):
    """Find transition indices in a trajectory via windowed DP-GMM clustering (silent variant)."""
    total_size = traj.shape[0]
    dim = traj.shape[1]

    # Flatten sliding windows into rows of a design matrix.
    demo_data_array = np.zeros((total_size - window_size, dim * window_size))
    row = 0
    for t in range(window_size, total_size):
        demo_data_array[row, :] = traj[t - window_size:t, :].reshape(1, dim * window_size)
        row += 1

    estimator = BayesianGaussianMixture(n_components=10, n_init=10, max_iter=300,
                                        weight_concentration_prior=1e-2,
                                        init_params='random', verbose=False)
    filtabels = smoothing(estimator.fit_predict(demo_data_array))

    # Smoothed label changes mark transitions.
    transitions = []
    idx = 0
    for j in range(window_size, total_size):
        if idx == 0 or j == window_size:
            pass
        elif j == (total_size - 1):
            pass
        elif filtabels[idx - 1] != filtabels[idx]:
            transitions.append(j - window_size)
        idx += 1
    transitions.append(0)
    transitions.append(total_size - 1)
    transitions.sort()
    return transitions
def local_gmm(global_label, levels, n=10, image=None):
    """Split each global-label region into up to *n* local blobs with a Bayesian GMM.

    Pixel coordinates of each region (label value i+1) are clustered; each
    resulting blob gets a fresh sequential id in *local_label* and its source
    level is recorded in *blob_levels*.

    Returns (image_labeled, local_label, blob_levels); image_labeled is None
    when no image is supplied.
    """
    assert (
        type(image) is np.ndarray
        and image.dtype == np.uint8
        and len(image.shape) == 2
        or image is None
    ), "The input image has to be a uint8 2D numpy array, or omitted."
    assert (
        type(global_label) is np.ndarray
        and global_label.dtype == np.uint8
        and len(global_label.shape) == 2
    ), "The input global_label has to be a uint8 2D numpy array."
    assert len(levels) == np.max(
        global_label
    ), "The number of levels should match that of the global labels."
    assert type(n) is int
    local_label = np.zeros(global_label.shape, dtype=np.uint8)
    blob_levels = []
    for i in range(len(levels)):
        lvl_ind = (global_label == i + 1).nonzero()
        data = np.transpose(lvl_ind)
        gmm = BayesianGaussianMixture(n_components=min(n, len(data)), random_state=123)
        prediction = gmm.fit_predict(data)
        for j in np.unique(prediction):
            blob_levels.append(levels[i])
            blob_pts = prediction == j
            local_label[lvl_ind[0][blob_pts], lvl_ind[1][blob_pts]] = len(blob_levels)
    # BUG FIX: the original evaluated image.shape before checking for None, so
    # the explicitly-permitted "image omitted" case crashed with AttributeError.
    if image is None:
        image_labeled = None
    else:
        label_resized = (
            local_label
            if image.shape == local_label.shape
            else cv2.resize(local_label, image.shape[::-1], interpolation=cv2.INTER_NEAREST)
        )
        image_labeled = img_as_ubyte(label2rgb(label_resized, image, bg_label=0))
    return image_labeled, local_label, blob_levels
def cluster_vbgm(aligned_maps):
    """Embed the aligned maps and cluster the embedding with a 10-component Bayesian GMM."""
    embedding = embed(aligned_maps)
    model = BayesianGaussianMixture(n_components=10)
    return model.fit_predict(embedding)
n_features=2, center_box=[-5, 5], centers=nb_centers, random_state=1000) # Train the model with concentration 1000 and 0.1 for c in (1000.0, 0.1): gm = BayesianGaussianMixture(n_components=5, weight_concentration_prior=c, max_iter=10000, random_state=1000) gm.fit(X) print('Weights: {}'.format(gm.weights_)) Y_pred = gm.fit_predict(X) print((Y_pred == 0).sum()) print((Y_pred == 1).sum()) print((Y_pred == 2).sum()) print((Y_pred == 3).sum()) print((Y_pred == 4).sum()) # Compute the parameters of the Gaussian mixture m1 = gm.means_[0] m2 = gm.means_[1] m3 = gm.means_[2] m4 = gm.means_[3] m5 = gm.means_[4] c1 = gm.covariances_[0]
# for i in range(len(nc_array)): # dp = GaussianMixture(n_components=nc_array[i], covariance_type='full', max_iter=10000, verbose=0) # dp.fit(label_data) # # dpgmm_list.append(dp) # log_acc[i] = dp.lower_bound_ - nc_array[i]/rho # # dpgmm = dpgmm_list[np.argmax(log_acc)] # print(len(dpgmm.covariances_), log_acc) dpgmm = BayesianGaussianMixture(n_components=n_components, covariance_type='full', max_iter=50000, weight_concentration_prior_type='dirichlet_process', mean_precision_prior=.8, weight_concentration_prior=gamma, n_init=1, init_params='random', reg_covar=1e-6) # gmm_labels = dpgmm.predict(label_data) gmm_labels = dpgmm.fit_predict(label_data) # print('Variational', max(gmm_labels)+1) for k, cov in enumerate(dpgmm.covariances_): c1, c2 = np.diag(cov) if c1/c2 < cov_ratio and c2/c1 < cov_ratio: em_data.extend(label_data[gmm_labels == k]) # import pdb; pdb.set_trace() if np.mod(m, 10) == 0: print('cluster {} out of {}'.format(m, len(arg_labels))) # plot_results(label_data, gmm_labels, dpgmm.means_, dpgmm.covariances_) # plt.show()
def model_dpgmm(k, x_scaled):
    """Fit a k-component full-covariance Bayesian GMM on x_scaled; store labels in the global."""
    global labels
    dpgmm = BayesianGaussianMixture(n_components=k, covariance_type='full')
    labels = dpgmm.fit_predict(x_scaled)
X_pen_scaled = X_pen_scaled.reshape(-1, 1) X_train_pen, X_test_pen, y_train_pen, y_test_pen = train_test_split( X_pen_scaled, ypen, test_size=0.20) pen_classifier.fit(X_train_pen, y_train_pen) pen_pred = pen_classifier.predict(X_test_pen) pen_error_kmean.append(1 - metrics.accuracy_score(pen_pred, y_test_pen)) #=========================================================== #===========================EM============================= from sklearn.decomposition import FastICA for i in range(1, 31): X_pen_scaled = pen_scaler.fit_transform(Xpen) pen_bgm = BayesianGaussianMixture(n_components=i) X_pen_scaled = pen_bgm.fit_predict(X_pen_scaled) X_pen_scaled = X_pen_scaled.reshape(-1, 1) X_train_pen, X_test_pen, y_train_pen, y_test_pen = train_test_split( X_pen_scaled, ypen, test_size=0.20) pen_classifier.fit(X_train_pen, y_train_pen) pen_pred = pen_classifier.predict(X_test_pen) pen_error_em.append(1 - metrics.accuracy_score(pen_pred, y_test_pen)) #=========================================================== plt.figure(figsize=(12, 6)) plt.plot(range(1, 31), pen_error, label='No Clustering', color='red',
row_ix = np.where(yhat == cluster) # create scatter of these samples plt.scatter(X[row_ix, 0], X[row_ix, 1]) # show the plot plt.show() # advanced #================================================ ddgmm = BayesianGaussianMixture( n_components=2, covariance_type='full', weight_concentration_prior=100, weight_concentration_prior_type="dirichlet_distribution", max_iter=100, random_state=1337).fit(X) yhat = ddgmm.fit_predict(X) # retrieve unique clusters clusters = np.unique(yhat) # create scatter plot for samples from each cluster for cluster in clusters: # get row indexes for samples with this cluster row_ix = np.where(yhat == cluster) # create scatter of these samples plt.scatter(X[row_ix, 0], X[row_ix, 1]) # show the plot plt.show() dpgmm = BayesianGaussianMixture( n_components=2, covariance_type='full', weight_concentration_prior=100,
print(test_mean[np.argmax(test_mean)])
print(train_mean[np.argmax(test_mean)])

# Learning curve for the MLP trained on features augmented with K-means labels.
mlp_learner = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='sgd',
                            learning_rate='adaptive', learning_rate_init=0.07)
train_sizes, train_scores, test_scores = learning_curve(
    mlp_learner, x_train_scaled_km, y_train, cv=5, n_jobs=-1,
    train_sizes=np.linspace(0.01, 1.0, 100))
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
plot_data(train_sizes, test_mean,
          title="Neural Network Learning Curve - Dataset 1 + Clustering Result",
          x_label="Training Size", y_label="Accuracy Score", color="orange",
          label='CV (+K-means Result)', linestyle='dashed')
plot_data(train_sizes, train_mean,
          title="Neural Network Learning Curve - Dataset 1 + Clustering Result",
          x_label="Training Size", y_label="Accuracy Score", color="orange",
          label='Training (+K-means Result')
print(train_sizes)
print((train_sizes[np.argmax(test_mean)]))
print(test_mean[np.argmax(test_mean)])
print(train_mean[np.argmax(test_mean)])

# Augment features with EM (Bayesian GMM) cluster labels and repeat.
gm = BayesianGaussianMixture(n_components=14, random_state=random_state, reg_covar=1e-01)
y_pred = gm.fit_predict(x_projected_pca)
x_train_scaled_em = np.column_stack((x_train_scaled, y_pred))
mlp_learner = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='sgd',
                            learning_rate='adaptive', learning_rate_init=0.07)
train_sizes, train_scores, test_scores = learning_curve(
    mlp_learner, x_train_scaled_em, y_train, cv=5, n_jobs=-1,
    train_sizes=np.linspace(0.01, 1.0, 100))
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
plot_data(train_sizes, test_mean,
          title="Neural Network Learning Curve - Dataset 1 + Clustering Result",
          x_label="Training Size", y_label="Accuracy Score", color="blue",
          label='CV (+EM Result)', linestyle='dashed')
plot_data(train_sizes, train_mean,
          title="Neural Network Learning Curve - Dataset 1 + Clustering Result",
          x_label="Training Size", y_label="Accuracy Score", color="blue",
          label='Training (+EM Result)')
print(train_sizes)
print((train_sizes[np.argmax(test_mean)]))
print(test_mean[np.argmax(test_mean)])
print(train_mean[np.argmax(test_mean)])
plt.savefig('Dataset 1 + Clustering NN learning curve.png')
# Plotly-style colour scale: position/colour pairs from purple through to red.
scl = (
    [0, "rgb(150,0,90)"], [0.125, "rgb(0, 0, 200)"], [0.25, "rgb(0, 25, 255)"],
    [0.375, "rgb(0, 152, 255)"], [0.5, "rgb(44, 255, 150)"], [0.625, "rgb(151, 255, 0)"],
    [0.75, "rgb(255, 234, 0)"], [0.875, "rgb(255, 111, 0)"], [1, "rgb(255, 0, 0)"]
)

if __name__ == '__main__':
    scoords = SitesCoords()
    sites_i = 31265
    sites_f = 12100
    nc = 50
    mutual = True

    # Cluster the direct neighbourhood of sites_i (radius 0.35).
    lsites = scoords.get_direct_neighbors(sites_i, 0.35)
    lclust = compute_clusterings(lsites, nc, mutual=mutual)
    mdist = compute_distance_matrix(lclust, mutual=mutual)
    tdata = md_scaling(mdist)

    gmm = BayesianGaussianMixture(n_components=10, covariance_type='full',
                                  max_iter=1000, n_init=10, tol=0.00001)
    labels = gmm.fit_predict(tdata)
    create_plot(data_plot(lsites, labels), str(sites_i))
# ============================1. Uncomment this section for use with plots A, B, C, D =======================================================
for i in range(20):
    from sklearn.mixture import BayesianGaussianMixture
    # BayesianGaussianMixture scored better than plain GaussianMixture here.
    bgm_sat = BayesianGaussianMixture(
        n_components=7, covariance_type='tied',
        weight_concentration_prior=params[i], max_iter=500)  # 7 categories, domain knowledge
    bgm_pen = BayesianGaussianMixture(
        n_components=10, covariance_type='full',
        weight_concentration_prior=params[i], max_iter=500)  # 10 categories, domain knowledge

    start_time = time.time()
    sat_labels_pred = bgm_sat.fit_predict(X_train_sat_og)
    # =======2. Use only for Plot C==================================
    # sat_labels_train = bgm_sat.fit_predict(X_train_sat_og)
    # sat_labels_test = bgm_sat.predict(X_test_sat_og)
    # =======================================================
    end_time = time.time()
    sat_time = end_time - start_time

    start_time = time.time()
    pen_labels_pred = bgm_pen.fit_predict(X_train_pen_og)
    # ==========3. Use only for making Plot C======================
    # pen_labels_train = bgm_sat.fit_predict(X_train_pen_og)
    # pen_labels_test = bgm_sat.predict(X_test_pen_og)
    # =====================================================
    end_time = time.time()
    pen_time = end_time - start_time
# =====================================================================================================================================
def execute(self, namespace):
    """Label 3-D points with a (Bayesian) Gaussian mixture and attach log-probability columns.

    Adds three columns: the 1-based cluster label, the per-point log
    probability, and the per-cluster mean log probability. On non-convergence
    labels are zeroed and log probabilities set to -inf.
    """
    from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
    from PYME.IO import MetaDataHandler

    points = namespace[self.input_points]
    X = np.stack([points['x'], points['y'], points['z']], axis=1)

    def _labels_and_logprob(model):
        # Fit, shift labels to PYME's 1-based scheme, zero out on failure.
        preds = model.fit_predict(X) + 1  # PYME labeling scheme
        lp = model.score_samples(X)
        if not model.converged_:
            logger.error('GMM fitting did not converge')
            preds = np.zeros(len(points), int)
            lp = -np.inf * np.ones(len(points))
        return preds, lp

    if self.mode == 'n':
        gmm = GaussianMixture(n_components=self.n, covariance_type=self.covariance,
                              max_iter=self.max_iter, init_params=self.init_params)
        predictions, log_prob = _labels_and_logprob(gmm)
    elif self.mode == 'bic':
        # Select the component count (1..n) minimizing BIC, then refit.
        n_components = range(1, self.n + 1)
        bic = np.zeros(len(n_components))
        for ind in range(len(n_components)):
            gmm = GaussianMixture(n_components=n_components[ind],
                                  covariance_type=self.covariance,
                                  max_iter=self.max_iter, init_params=self.init_params)
            gmm.fit(X)
            bic[ind] = gmm.bic(X)
            logger.debug('%d BIC: %f' % (n_components[ind], bic[ind]))
        best = n_components[np.argmin(bic)]
        if best == self.n or (self.n > 10 and best > 0.9 * self.n):
            logger.warning('BIC optimization selected n components near n max')
        gmm = GaussianMixture(n_components=best, covariance_type=self.covariance,
                              max_iter=self.max_iter, init_params=self.init_params)
        predictions, log_prob = _labels_and_logprob(gmm)
    elif self.mode == 'bayesian':
        bgm = BayesianGaussianMixture(n_components=self.n,
                                      covariance_type=self.covariance,
                                      max_iter=self.max_iter,
                                      init_params=self.init_params)
        predictions, log_prob = _labels_and_logprob(bgm)

    out = tabular.MappingFilter(points)
    try:
        out.mdh = MetaDataHandler.DictMDHandler(points.mdh)
    except AttributeError:
        pass
    out.addColumn(self.label_key, predictions)
    out.addColumn(self.label_key + '_log_prob', log_prob)
    avg_log_prob = np.empty_like(log_prob)
    for label in np.unique(predictions):
        mask = label == predictions
        avg_log_prob[mask] = np.mean(log_prob[mask])
    out.addColumn(self.label_key + '_avg_log_prob', avg_log_prob)
    namespace[self.output_labeled] = out
n_components=args.nclusters, covariance_type='diag', max_iter=1000, weight_concentration_prior_type='dirichlet_process') dimred = TSNE(n_components=2) fig2, ax2 = plt.subplots(1, 1) cmap = iter([plt.cm.tab20(x) for x in range(0, 20)]) with torch.no_grad(): diter = iter(train_loader) y, lab = diter.next() mu, lvar = model.encode(y.view(-1, n_genes)) y2 = model.reparam(mu, lvar) clustering.fit(y2.numpy()) idx = clustering.fit_predict(y2.numpy()) # idx = km.fit_predict(y2.numpy()) dr = dimred.fit_transform(y2.numpy()) mx, mn = np.max(dr), np.min(dr) ax2.set_xlim([mn, mx]) for ii in np.unique(idx): clr = np.array(next(cmap)).reshape(1, -1) ax2.scatter(dr[idx == ii, 0], dr[idx == ii, 1], c=clr, **params) fig2.tight_layout() fig2.savefig(osp.join(gdir, ''.join([tag, '_tsne.png']))) inlatent = pd.DataFrame(y2.numpy(), index=lab, columns=[str(x) for x in range(args.latent_dim)])
def train(data: np.ndarray, obs_len: int, filter_name: str, model_dir: str, result_dir: str, save_model: bool = True) -> NoReturn:
    """Fit a 3-component Bayesian GMM on driving-style features, log the fit, and persist model/results."""
    print('[Bayesian Gaussian Mixture Clustering][train] creating model...')
    bgm = BayesianGaussianMixture(n_components=3, covariance_type="full",
                                  max_iter=1000, tol=1e-5, n_init=10,
                                  random_state=7,
                                  weight_concentration_prior_type='dirichlet_process',
                                  init_params="kmeans")
    print('[Bayesian Gaussian Mixture Clustering][train] training...')
    _y = np.expand_dims(bgm.fit_predict(X=data), axis=1)
    print(f'[Bayesian Gaussian Mixture Clustering][train] converged?:{bgm.converged_}')
    print('[Bayesian Gaussian Mixture Clustering][train] params (center and covariance):')
    for i, m, c, w in zip(range(1, 4), bgm.means_, bgm.covariances_, bgm.weights_):
        print(f'\tc_{i}-> mean: {m}')
        print(f'\t\tcov: {c}')
        print(f'\t\tweight: {w}')
    print('[Bayesian Gaussian Mixture Clustering][train] results:')
    _c, _l = np.unique(_y, return_counts=True)
    for i, c in zip(_c, _l):
        print(f'\tc_{i}: {c}')
    if save_model:
        model_file = f'bgm_{obs_len}s_{filter_name}.pkl'
        print(f'[Bayesian Gaussian Mixture Clustering][train] saving model ({model_file})...')
        with open(os.path.join(model_dir, model_file), 'wb') as f:
            pickle.dump(bgm, f)
        # NOTE(review): result export assumed to sit under the save_model guard;
        # the original indentation is ambiguous — confirm against history.
        result_file = f'results_bgm_train_{obs_len}s_{filter_name}.csv'
        print(f'[Bayesian Gaussian Mixture Clustering][train] saving results ({result_file})...')
        labels = ['mean_velocity', 'mean_acceleration', 'mean_deceleration',
                  'std_lateral_jerk', 'driving_style']
        result = np.concatenate((data, _y), axis=1)
        df = pd.DataFrame(data=result, columns=labels)
        df.to_csv(os.path.join(result_dir, result_file))
        result_file = result_file.replace('results', 'params').replace('csv', 'json')
        print(f'[Bayesian Gaussian Mixture Clustering][train] saving results ({result_file})...')
        _d = {}
        _d['means'] = bgm.means_.tolist()
        _d['covariances'] = bgm.covariances_.tolist()
        _d['weights'] = bgm.weights_.tolist()
        with open(os.path.join(result_dir, result_file), 'w') as f:
            json.dump(_d, f)
def km_em(x_train_scaled, dataset_name="", true_vals=y_train, reg_covar=1e-01):
    """Sweep cluster counts 2..22 for K-means and Bayesian-GMM EM.

    Plots elbow, silhouette, likelihood, external-validation and timing curves
    (each saved to a PNG named after *dataset_name*) and returns the collected
    metric lists in a dict.
    """
    n = 22
    distortions = []
    homogeneity, completeness, mutual_info, adj_rand_score = [], [], [], []
    sil, kmeans_times = [], []
    homogeneity_em, completeness_em, mutual_info_em, adj_rand_score_em = [], [], [], []
    sil_em, em_times, em_likelihood = [], [], []

    for i in range(2, n + 1):
        # --- K-means ---
        t0 = time.time()
        kmeans = KMeans(n_clusters=i, random_state=random_state)
        kmeans.fit(x_train_scaled)
        distortions.append(kmeans.inertia_)
        y_pred = kmeans.predict(x_train_scaled)
        kmeans_times.append(time.time() - t0)
        homogeneity.append(homogeneity_score(true_vals, y_pred.tolist()))
        completeness.append(completeness_score(true_vals, y_pred.tolist()))
        mutual_info.append(adjusted_mutual_info_score(true_vals, y_pred.tolist()))
        adj_rand_score.append(adjusted_rand_score(true_vals, y_pred.tolist()))
        sil.append(silhouette_score(x_train_scaled, kmeans.labels_, metric='euclidean'))

        # --- EM (Bayesian GMM) ---
        t0 = time.time()
        gm = BayesianGaussianMixture(n_components=i, random_state=random_state,
                                     reg_covar=reg_covar)
        y_pred = gm.fit_predict(x_train_scaled)
        em_times.append(time.time() - t0)
        homogeneity_em.append(homogeneity_score(true_vals, y_pred.tolist()))
        completeness_em.append(completeness_score(true_vals, y_pred.tolist()))
        mutual_info_em.append(adjusted_mutual_info_score(true_vals, y_pred.tolist()))
        adj_rand_score_em.append(adjusted_rand_score(true_vals, y_pred.tolist()))
        if len(set(y_pred)) > 1:
            sil_em.append(silhouette_score(x_train_scaled, y_pred, metric='euclidean'))
        else:
            # Degenerate single-cluster solution: silhouette is undefined.
            sil_em.append(1)
        em_likelihood.append(gm.score(x_train_scaled))

    # --- plots ---
    plt.plot(range(2, n + 1), distortions, marker='o')
    plt.title("K-means Elbow (" + (str(dataset_name)) + ")")
    plt.xlabel('Number of clusters')
    plt.ylabel('Sum of Squared Distances')
    plt.savefig((str(dataset_name)) + ' km elbow.png')
    plt.show()

    plt.plot(range(2, n + 1), sil, marker='o')
    plt.title('K-means Silhouette Scores (' + (str(dataset_name)) + ')')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.savefig((str(dataset_name)) + ' km silho.png')
    plt.show()

    plt.plot(range(2, n + 1), em_likelihood, marker='o')
    plt.title('EM likelihood (' + (str(dataset_name)) + ')')
    plt.xlabel('Number of clusters')
    plt.ylabel('Likelihood')
    plt.savefig((str(dataset_name)) + ' em likelihood.png')
    plt.show()

    plt.plot(range(2, n + 1), sil_em, marker='o')
    plt.title('EM Silhouette Scores (' + (str(dataset_name)) + ')')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.savefig((str(dataset_name)) + ' em silho.png')
    plt.show()
    plt.close()

    plot_data(list(range(1, n)), homogeneity, title="Performance Evaluation k-means (" + (str(dataset_name)) + ")", x_label="Number of Clusters", y_label="Score", color="blue", label='Homogeneity')
    plot_data(list(range(1, n)), completeness, title="Performance Evaluation k-means (" + (str(dataset_name)) + ")", x_label="Number of Clusters", y_label="Score", color="orange", label='Completeness')
    plot_data(list(range(1, n)), mutual_info, title="Performance Evaluation k-means (" + (str(dataset_name)) + ")", x_label="Number of Clusters", y_label="Score", color="red", label='Adgusted Mutual Info')
    plot_data(list(range(1, n)), adj_rand_score, title="Performance Evaluation k-means (" + (str(dataset_name)) + ")", x_label="Number of Clusters", y_label="Score", color="green", label='Adjusted random index')
    plt.savefig((str(dataset_name)) + ' km perfo.png')
    plt.show()
    plt.close()

    plot_data(list(range(1, n)), homogeneity_em, title="Performance Evaluation EM (" + (str(dataset_name)) + ")", x_label="Number of Clusters", y_label="Score", color="blue", label='Homogeneity')
    plot_data(list(range(1, n)), completeness_em, title="Performance Evaluation EM (" + (str(dataset_name)) + ")", x_label="Number of Clusters", y_label="Score", color="orange", label='Completeness')
    plot_data(list(range(1, n)), mutual_info_em, title="Performance Evaluation EM (" + (str(dataset_name)) + ")", x_label="Number of Clusters", y_label="Score", color="red", label='Adgusted Mutual Info')
    plot_data(list(range(1, n)), adj_rand_score_em, title="Performance Evaluation EM (" + (str(dataset_name)) + ")", x_label="Number of Clusters", y_label="Score", color="green", label='Adjusted random index')
    plt.savefig((str(dataset_name)) + ' em perfo.png')
    plt.show()
    plt.close()

    plot_data(list(range(1, n)), kmeans_times, title="k-means/EM Running Time (" + (str(dataset_name)) + ")", x_label="Number of Clusters", y_label="Time", color="red", label='k-means')
    plot_data(list(range(1, n)), em_times, title="k-means/EM Running Time (" + (str(dataset_name)) + ")", x_label="Number of Clusters", y_label="Time", color="blue", label='EM')
    plt.savefig((str(dataset_name)) + ' km-em time.png')
    plt.show()

    print('kmeans_times')
    print(kmeans_times)
    print('em_times')
    print(em_times)
    return {'sil': sil, 'kmeans_times': kmeans_times, 'em_times': em_times,
            'homogeneity': homogeneity, 'completeness': completeness,
            'mutual_info': mutual_info, 'adj_rand_score': adj_rand_score,
            'homogeneity_em': homogeneity_em, 'completeness_em': completeness_em,
            'mutual_info_em': mutual_info_em, 'adj_rand_score_em': adj_rand_score_em}