import numpy as np
import pytest
from sklearn.mixture import BayesianGaussianMixture


def test_bayesian_mixture_check_is_fitted():
    rng = np.random.RandomState(0)
    n_samples, n_features = 10, 2

    # Check the raised message; NotFittedError subclasses ValueError,
    # so pytest.raises(ValueError) catches it.
    bgmm = BayesianGaussianMixture(random_state=rng)
    X = rng.rand(n_samples, n_features)
    msg = "This BayesianGaussianMixture instance is not fitted yet."
    with pytest.raises(ValueError, match=msg):
        bgmm.score(X)
import numpy as np
from sklearn.mixture import BayesianGaussianMixture


def bayesian_gaussian_mixture(vector: np.ndarray, n: int, BIC_calculate: bool = False):
    # Hold out ~30% of the rows as a validation split.
    np.random.seed(140597)
    mask = np.random.choice([False, True], len(vector), p=[0.70, 0.30])
    if BIC_calculate:
        model_train = BayesianGaussianMixture(
            n_components=n, covariance_type='full').fit(vector[~mask])
        validation_score = model_train.score(vector[mask])
        train_score = model_train.score(vector[~mask])
        return validation_score, train_score
    dpgmm = BayesianGaussianMixture(
        n_components=n, covariance_type='full',
        max_iter=900, tol=1e-4).fit(vector[~mask])
    cluster_label = dpgmm.predict(vector)
    return cluster_label
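# A minimal usage sketch for the helper above (names and data are illustrative).
# Note: despite the flag name, BayesianGaussianMixture exposes no bic() method;
# the BIC_calculate branch actually returns mean per-sample log-likelihoods.
import numpy as np

values = np.concatenate([np.random.normal(0.0, 0.1, 200),
                         np.random.normal(1.0, 0.1, 200)]).reshape(-1, 1)

val_score, train_score = bayesian_gaussian_mixture(values, n=4, BIC_calculate=True)
labels = bayesian_gaussian_mixture(values, n=4)
print(val_score, train_score, np.unique(labels))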
import numpy as np
from sklearn.metrics import calinski_harabasz_score, silhouette_score
from sklearn.mixture import BayesianGaussianMixture


def genotype(cnvays):
    result = []
    n_com = 10 if cnvays.shape[1] >= 10 else cnvays.shape[1]
    n_init = 3
    for cnvay in cnvays:
        cnv = [[x] for x in cnvay]
        dpgmm = BayesianGaussianMixture(
            n_components=n_com, n_init=n_init, max_iter=10000,
            weight_concentration_prior_type='dirichlet_process').fit(cnv)
        labels = dpgmm.predict(cnv)
        # Snap each cluster's median copy number to the nearest half-integer.
        normed_ay = np.arange(0, np.max(cnvay) + 0.5, 0.5)
        swlabels = {}
        for rawlabel in np.unique(labels):
            swlabels[rawlabel] = normed_ay[np.argmin(
                np.abs(normed_ay - np.median(cnvay[labels == rawlabel])))]
        newlabels = [swlabels[x] for x in labels]
        gtlabes = {0: 'dd', 0.5: 'Ad', 1: 'AA', 1.5: 'AB', 2: 'BB', 2.5: 'BC'}
        finalline = [gtlabes.get(x, 'M') for x in newlabels]
        if len(np.unique(finalline)) > 1:
            sc = silhouette_score(cnv, finalline, metric='euclidean')
            # calinski_harabaz_score was renamed calinski_harabasz_score in scikit-learn 0.20+
            chs = calinski_harabasz_score(cnv, labels)
        else:
            sc = np.nan
            chs = np.nan
        llh = dpgmm.score(cnv)  # mean per-sample log-likelihood of the mixture given cnv
        finalline += [sc, chs, llh]
        result.append(finalline)
    return result
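# Hypothetical usage sketch for genotype(): two loci, one with copy-number
# ratios around 1.0/2.0 and one flat around 0.5. The data are synthetic.
import numpy as np

rng = np.random.RandomState(0)
locus_a = np.concatenate([rng.normal(1.0, 0.05, 50), rng.normal(2.0, 0.05, 50)])
locus_b = rng.normal(0.5, 0.05, 100)
rows = genotype(np.array([locus_a, locus_b]))
print(rows[0][-3:])  # silhouette, Calinski-Harabasz, mean log-likelihood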
import os

import joblib
from sklearn.metrics import classification_report
from sklearn.mixture import BayesianGaussianMixture


def bayes_gauss_classifier(dir_models, ticket, x, x_test, y, y_test):
    print('getting model...BayesianGaussianMixture')
    clf = BayesianGaussianMixture(n_components=3)
    print('training...')
    # Note: the mixture is unsupervised -- fit() ignores any y argument, and
    # the predicted component indices are not guaranteed to match y_test labels.
    clf.fit(x)
    print('predicting...')
    predicted = clf.predict(x_test)
    print(classification_report(y_test, predicted))
    model_id = len(os.listdir(dir_models))
    joblib.dump(
        clf,
        os.path.join(dir_models,
                     ticket + '_bayesian_gaussian_mixture_' + str(model_id) + '.pkl'))
    # score() returns the mean per-sample log-likelihood, not classification accuracy.
    return clf.score(x_test)
import numpy as np
from sklearn.mixture import BayesianGaussianMixture


def search_optimal_cluster_size(data_set_name: str,
                                data_points: np.ndarray,
                                start: int,
                                stop: int,
                                max_data_points=2000) -> int:
    """Determine the optimal number of clusters in data_points based on the
    maximum GMM log-likelihood. It assumes the components of the data points
    are linearly independent (hence the diagonal covariance). Subsampling is
    performed to improve efficiency.

    Arguments:
        data_set_name {str} -- The name of the data set (for display purposes).
        data_points {np.ndarray} -- Of shape [num_samples, num_components].
        start {int} -- Smallest number of clusters to search from.
        stop {int} -- Largest number of clusters to search to.

    Keyword Arguments:
        max_data_points {int} -- The largest number of data points to use.
            Random subsampling is performed above this count (default: {2000}).

    Returns:
        int -- The optimal number of clusters.
    """
    if data_points.shape[0] > max_data_points:
        inds = np.arange(start=0, stop=data_points.shape[0])
        np.random.shuffle(inds)
        data_points = data_points[inds[:max_data_points], :]

    best_likelihood = -float("inf")
    best_cluster_size = start
    for i in range(start, stop + 1):
        gmm = BayesianGaussianMixture(n_components=i,
                                      covariance_type="diag",
                                      tol=1e-2,
                                      n_init=5)
        gmm.fit(X=data_points)
        if not gmm.converged_:
            continue
        likelihood = gmm.score(X=data_points)
        if likelihood > best_likelihood:
            best_likelihood = likelihood
            best_cluster_size = i
        print(data_set_name, "|cluster_size=", i,
              "|current_best=", best_cluster_size,
              "|current_best_score=", best_likelihood)
    return best_cluster_size
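# A quick sketch of the search on synthetic blobs (parameters are illustrative).
# Keep in mind the in-sample log-likelihood tends to keep rising with more
# components, so the Dirichlet prior in BayesianGaussianMixture is what keeps
# the chosen size reasonable here.
from sklearn.datasets import make_blobs

X_blobs, _ = make_blobs(n_samples=5000, centers=4, n_features=2, random_state=0)
best_k = search_optimal_cluster_size("blobs", X_blobs, start=2, stop=8)
print("best cluster size:", best_k)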
import time

import numpy as np


def evaluate_DPMM(y, y_test):
    from sklearn.mixture import BayesianGaussianMixture
    np.random.seed(300)
    d = np.shape(y)[1]

    # Fit a Dirichlet-process GMM with weakly informative priors.
    start = time.time()
    DPMM = BayesianGaussianMixture(
        n_components=30,
        covariance_type='diag',
        n_init=100,
        weight_concentration_prior_type='dirichlet_process',
        covariance_prior=np.ones(d),
        degrees_of_freedom_prior=d,
        mean_precision_prior=1,
        mean_prior=np.zeros(d))
    DPMM.fit(y)
    end = time.time()
    print('DPMM fitting time is {}'.format(end - start))

    # Mean per-sample log-likelihood on held-out data.
    test_loglik = DPMM.score(y_test)
    return test_loglik
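# Hypothetical call sketch with synthetic data standing in for y/y_test.
# Beware: the hard-coded n_init=100 makes even this tiny example slow.
import numpy as np

rng = np.random.RandomState(0)
print(evaluate_DPMM(rng.normal(size=(500, 2)), rng.normal(size=(200, 2))))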
import time

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import (adjusted_mutual_info_score, adjusted_rand_score,
                             completeness_score, homogeneity_score,
                             silhouette_score)
from sklearn.mixture import BayesianGaussianMixture


def km_em(x_train_scaled, dataset_name="", true_vals=y_train, reg_covar=1e-01):
    # Assumes module-level `y_train` and `random_state`, plus a `plot_data`
    # helper defined elsewhere in this project.
    distortions = []
    sil = []
    n = 22
    # v_measure = []
    homogeneity = []
    completeness = []
    mutual_info = []
    adj_rand_score = []
    kmeans_times = []
    homogeneity_em = []
    completeness_em = []
    mutual_info_em = []
    adj_rand_score_em = []
    sil_em = []
    em_times = []
    em_likelihood = []

    for i in range(2, n + 1):
        # k-means
        start_time = time.time()
        kmeans = KMeans(n_clusters=i, random_state=random_state)
        kmeans.fit(x_train_scaled)
        distortions.append(kmeans.inertia_)
        y_pred = kmeans.predict(x_train_scaled)
        kmeans_times.append(time.time() - start_time)
        homogeneity.append(homogeneity_score(true_vals, y_pred.tolist()))
        completeness.append(completeness_score(true_vals, y_pred.tolist()))
        mutual_info.append(adjusted_mutual_info_score(true_vals, y_pred.tolist()))
        adj_rand_score.append(adjusted_rand_score(true_vals, y_pred.tolist()))
        sil.append(silhouette_score(x_train_scaled, kmeans.labels_, metric='euclidean'))

        # EM via a Bayesian Gaussian mixture
        start_time = time.time()
        gm = BayesianGaussianMixture(n_components=i, random_state=random_state,
                                     reg_covar=reg_covar)
        y_pred = gm.fit_predict(x_train_scaled)
        em_times.append(time.time() - start_time)
        homogeneity_em.append(homogeneity_score(true_vals, y_pred.tolist()))
        completeness_em.append(completeness_score(true_vals, y_pred.tolist()))
        mutual_info_em.append(adjusted_mutual_info_score(true_vals, y_pred.tolist()))
        adj_rand_score_em.append(adjusted_rand_score(true_vals, y_pred.tolist()))
        if len(set(y_pred)) > 1:
            sil_em.append(silhouette_score(x_train_scaled, y_pred, metric='euclidean'))
        else:
            sil_em.append(1)  # silhouette is undefined for a single cluster
        em_likelihood.append(gm.score(x_train_scaled))

    # plots (x-axis matches the cluster counts actually evaluated: 2..n)
    cluster_range = list(range(2, n + 1))

    plt.plot(cluster_range, distortions, marker='o')
    plt.title("K-means Elbow (" + str(dataset_name) + ")")
    plt.xlabel('Number of clusters')
    plt.ylabel('Sum of Squared Distances')
    plt.savefig(str(dataset_name) + ' km elbow.png')
    plt.show()

    plt.plot(cluster_range, sil, marker='o')
    plt.title('K-means Silhouette Scores (' + str(dataset_name) + ')')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.savefig(str(dataset_name) + ' km silho.png')
    plt.show()

    plt.plot(cluster_range, em_likelihood, marker='o')
    plt.title('EM likelihood (' + str(dataset_name) + ')')
    plt.xlabel('Number of clusters')
    plt.ylabel('Likelihood')
    plt.savefig(str(dataset_name) + ' em likelihood.png')
    plt.show()

    plt.plot(cluster_range, sil_em, marker='o')
    plt.title('EM Silhouette Scores (' + str(dataset_name) + ')')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.savefig(str(dataset_name) + ' em silho.png')
    plt.show()
    plt.close()

    plot_data(cluster_range, homogeneity,
              title="Performance Evaluation k-means (" + str(dataset_name) + ")",
              x_label="Number of Clusters", y_label="Score", color="blue",
              label='Homogeneity')
    plot_data(cluster_range, completeness,
              title="Performance Evaluation k-means (" + str(dataset_name) + ")",
              x_label="Number of Clusters", y_label="Score", color="orange",
              label='Completeness')
    plot_data(cluster_range, mutual_info,
              title="Performance Evaluation k-means (" + str(dataset_name) + ")",
              x_label="Number of Clusters", y_label="Score", color="red",
              label='Adjusted Mutual Info')
    plot_data(cluster_range, adj_rand_score,
              title="Performance Evaluation k-means (" + str(dataset_name) + ")",
              x_label="Number of Clusters", y_label="Score", color="green",
              label='Adjusted Rand index')
    plt.savefig(str(dataset_name) + ' km perfo.png')
    plt.show()
    plt.close()

    plot_data(cluster_range, homogeneity_em,
              title="Performance Evaluation EM (" + str(dataset_name) + ")",
              x_label="Number of Clusters", y_label="Score", color="blue",
              label='Homogeneity')
    plot_data(cluster_range, completeness_em,
              title="Performance Evaluation EM (" + str(dataset_name) + ")",
              x_label="Number of Clusters", y_label="Score", color="orange",
              label='Completeness')
    plot_data(cluster_range, mutual_info_em,
              title="Performance Evaluation EM (" + str(dataset_name) + ")",
              x_label="Number of Clusters", y_label="Score", color="red",
              label='Adjusted Mutual Info')
    plot_data(cluster_range, adj_rand_score_em,
              title="Performance Evaluation EM (" + str(dataset_name) + ")",
              x_label="Number of Clusters", y_label="Score", color="green",
              label='Adjusted Rand index')
    plt.savefig(str(dataset_name) + ' em perfo.png')
    plt.show()
    plt.close()

    plot_data(cluster_range, kmeans_times,
              title="k-means/EM Running Time (" + str(dataset_name) + ")",
              x_label="Number of Clusters", y_label="Time", color="red",
              label='k-means')
    plot_data(cluster_range, em_times,
              title="k-means/EM Running Time (" + str(dataset_name) + ")",
              x_label="Number of Clusters", y_label="Time", color="blue",
              label='EM')
    plt.savefig(str(dataset_name) + ' km-em time.png')
    plt.show()

    print('kmeans_times')
    print(kmeans_times)
    print('em_times')
    print(em_times)

    return {'sil': sil, 'kmeans_times': kmeans_times, 'em_times': em_times,
            'homogeneity': homogeneity, 'completeness': completeness,
            'mutual_info': mutual_info, 'adj_rand_score': adj_rand_score,
            'homogeneity_em': homogeneity_em, 'completeness_em': completeness_em,
            'mutual_info_em': mutual_info_em, 'adj_rand_score_em': adj_rand_score_em}
import time

import numpy as np
from sklearn.mixture import BayesianGaussianMixture

# `files` and `indices` come from earlier in the script.
embeddings = []
for index in indices:
    embeddings.append(np.load(files[index]))
print("Loaded npys")

start_time = time.time()
cluster_data = np.array(embeddings)
print(cluster_data.shape)

gmm = BayesianGaussianMixture(
    n_components=10,
    covariance_type="full",
    tol=1e-4,
    max_iter=1000,
    init_params="random",
    weight_concentration_prior_type="dirichlet_process",
    weight_concentration_prior=1.0 / 10,
    warm_start=False)
gmm.fit(cluster_data)

print(gmm.means_.shape)
print(gmm.covariances_.shape)
print(gmm.weight_concentration_)
print(gmm.lower_bound_)
print(gmm.score(cluster_data))
end_time = time.time()
print("Time taken: ", end_time - start_time)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.mixture import BayesianGaussianMixture


def extract_nodules_best_gmix(segmentation_volume, max_n_components=40, plot=False):
    """Find the best Gaussian mixture for a volume of probabilities."""
    no_samples = 10000
    # Convert voxel probabilities into (approximate) sample counts.
    occurences = np.round(
        no_samples * segmentation_volume / np.sum(segmentation_volume)).astype(int)
    total_occ = np.sum(occurences)
    samples = np.zeros((total_occ, 3))
    counter = 0
    for x in range(segmentation_volume.shape[0]):
        for y in range(segmentation_volume.shape[1]):
            for z in range(segmentation_volume.shape[2]):
                for occ in range(occurences[x, y, z]):
                    samples[counter] = [x, y, z]
                    counter += 1

    best_score = -float("inf")
    best_gmix = None
    best_no_c = -1
    for no_c in range(1, max_n_components):
        gmix = BayesianGaussianMixture(n_components=no_c, covariance_type='full')
        gmix.fit(samples)
        score = gmix.score(samples)
        print('score', score)
        print('means')
        print(gmix.means_)
        print('weights')
        print(gmix.weights_)

        if plot:
            for idx, mean in enumerate(gmix.means_):
                center = np.round(mean).astype(int)
                fig = plt.figure()
                fig.suptitle('weight=' + str(gmix.weights_[idx]))
                ax1 = fig.add_subplot(3, 1, 1)
                ax1.imshow(segmentation_volume[center[0], :, :].transpose())
                circ1 = plt.Circle((center[1], center[2]), 10, color='g', fill=False)
                ax1.add_patch(circ1)
                ax2 = fig.add_subplot(3, 1, 2)
                ax2.imshow(segmentation_volume[:, center[1], :])
                circ2 = plt.Circle((center[0], center[2]), 10, color='g', fill=False)
                ax2.add_patch(circ2)
                ax3 = fig.add_subplot(3, 1, 3)
                ax3.imshow(segmentation_volume[:, :, center[2]].transpose())
                circ3 = plt.Circle((center[0], center[1]), 10, color='g', fill=False)
                ax3.add_patch(circ3)
                fig.savefig('no_c_' + str(no_c) + '_' + str(idx) + '.pdf')

        if score > best_score:
            best_score = score
            best_gmix = gmix  # was `best_gmix = best_gmix`, which never updated
            best_no_c = no_c

    print("Best gaussian mix when using", best_no_c, 'gaussians')
    return best_gmix
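# Hypothetical usage sketch: a tiny synthetic probability volume with a single
# high-probability blob; the fitted means should land near its centre.
import numpy as np

vol = np.zeros((20, 20, 20))
vol[8:12, 8:12, 8:12] = 1.0
best = extract_nodules_best_gmix(vol, max_n_components=3)
print(best.means_)  # expected near [9.5, 9.5, 9.5]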
import math
import os
import random
import time

import numpy as np
from sklearn.mixture import BayesianGaussianMixture

# n_comps, epochs, files, all_indices, embeddings, val_data,
# number_of_data_points_per_iter and output_dir are defined earlier in the script.
gmm = BayesianGaussianMixture(
    n_components=n_comps,
    covariance_type="full",
    tol=1e-4,
    max_iter=2500,
    init_params="random",
    weight_concentration_prior_type="dirichlet_distribution",
    weight_concentration_prior=1e+4,
    warm_start=True)

for epoch in range(epochs):
    random.shuffle(all_indices)
    iters = math.floor(len(files) / number_of_data_points_per_iter) - 1
    print("Epoch: " + str(epoch + 1))
    for it in range(iters):
        start_time = time.time()
        start_index = it * number_of_data_points_per_iter
        end_index = start_index + number_of_data_points_per_iter
        inds_for_this_iter = all_indices[start_index:end_index]
        cluster_data = embeddings[inds_for_this_iter]
        # warm_start=True lets each fit() resume from the previous solution,
        # giving a crude mini-batch training loop.
        gmm.fit(cluster_data)
        end_time = time.time()
        print("Likelihood: " + str(gmm.score(val_data)) +
              ", Time: " + str(end_time - start_time))
        print("Weight Concentration : ")
        print(gmm.weight_concentration_)

np.save(open(os.path.join(output_dir, "gmm_means.npy"), "wb"), gmm.means_)
np.save(open(os.path.join(output_dir, "gmm_covs.npy"), "wb"), gmm.covariances_)
np.save(open(os.path.join(output_dir, "gmm_weights.npy"), "wb"), gmm.weights_)
import json
import os
import pickle

import numpy as np
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture


class FisherVectorGMM:
    """Fisher Vector derived from a GMM.

    Attributes
    ----------
    n_kernels: int
        number of kernels in the GMM
    convars_type: str
        covariance type for the GMM
    use_bayesian: bool
        whether or not to use a Bayesian GMM
    gmm: GaussianMixture() or BayesianGaussianMixture()
        GMM instance from sklearn
    means: np.array
        means learned in the GMM
    covars: np.array
        covariances learned in the GMM
    weights: np.array
        weights learned in the GMM

    Functions
    ---------
    fit(): public
        fit raw data into the GMM
    predict(): public
        predict the FV for one video (variable frames)
    predict_alternative(): public
        predict the FV for one video (variable frames); alternative, not validated
    save(): public
        save the GMM model to an external file
    load(): public
        load the GMM model from an external file
    """

    def __init__(self, n_kernels=1, convars_type='diag', use_bayesian=False):
        # para n_kernels: number of Gaussian kernels
        # para convars_type: 'diag' or 'full' covariance
        # para use_bayesian: use BayesianGaussianMixture instead of GaussianMixture
        assert convars_type in ['diag', 'full']
        assert n_kernels >= 0  # == 0 means a dummy instance

        self.name = 'kernels%d_convars%s_bayes%d' % (n_kernels, convars_type, use_bayesian)
        self.n_kernels = n_kernels
        self.convars_type = convars_type
        self.use_bayesian = use_bayesian
        self.fitted = False
        self.config = json.load(open('./config/model.json', 'r'))['fisher_vector']
        self.save_dir = self.config['save_dir']
        self.data_dir = self.config['data_dir']
        self.means = None
        self.covars = None
        self.weights = None

        if not self.use_bayesian:
            self.gmm = GaussianMixture(n_components=self.n_kernels,
                                       covariance_type=self.convars_type,
                                       max_iter=1000,
                                       verbose=2)
        else:
            self.gmm = BayesianGaussianMixture(n_components=self.n_kernels,
                                               covariance_type=self.convars_type,
                                               max_iter=1000,
                                               verbose=2)

    def fit(self, X):
        # para X: shape [n_frames, n_features, n_feature_dim]
        # if os.path.isfile(os.path.join(self.save_dir, self.name, 'gmm.model')):
        #     print("\nmodel already trained ---", self.name)
        #     self.load()
        #     return
        # elif not os.path.isdir(os.path.join(self.save_dir, self.name)):
        #     os.mkdir(os.path.join(self.save_dir, self.name))
        self.feature_dim = X.shape[-1]
        # X = X.reshape(-1, X.shape[-1])

        print("\nfitting data into GMM with %d kernels" % self.n_kernels)
        self.gmm.fit(X)
        self.means = self.gmm.means_
        self.covars = self.gmm.covariances_
        self.weights = self.gmm.weights_
        print("\nfitting completed")

        # if cov_type is diagonal, expand covars into full diagonal matrices
        if self.convars_type == 'diag':
            cov_matrices = np.empty(
                shape=(self.n_kernels, self.covars.shape[1], self.covars.shape[1]))
            for i in range(self.n_kernels):
                cov_matrices[i, :, :] = np.diag(self.covars[i, :])
            self.covars = cov_matrices

        assert self.covars.ndim == 3
        print("\nmodel trained ---", self.name)
        # self.save()

    def score(self, X):
        return self.gmm.score(X.reshape(-1, X.shape[-1]))

    def predict(self, X, normalized=True):
        # para X: shape [n_frames, n_feature_dim]
        assert X.ndim == 2
        assert X.shape[0] >= self.n_kernels, 'n_frames should be greater than n_kernels'

        print("\ninferring fisher vectors with given GMM ...")

        X_matrix = X.reshape(-1, X.shape[-1])  # [n_frames, n_feature_dim]

        # set equal weights to predict the likelihood ratio
        self.gmm.weights_ = np.ones(self.n_kernels) / self.n_kernels
        likelihood_ratio = self.gmm.predict_proba(X_matrix).reshape(
            X.shape[0], self.n_kernels)  # [n_frames, n_kernels]

        var = np.diagonal(self.covars, axis1=1, axis2=2)  # [n_kernels, n_feature_dim]

        # computed in place to reduce memory use
        norm_dev_from_modes = np.tile(X[:, None, :], (1, self.n_kernels, 1))
        np.subtract(norm_dev_from_modes, self.means[None, :], out=norm_dev_from_modes)
        np.divide(norm_dev_from_modes, var[None, :], out=norm_dev_from_modes)
        # norm_dev_from_modes: (X - mean) / var, shape [n_frames, n_kernels, n_feature_dim]

        # mean deviation
        mean_dev = np.multiply(likelihood_ratio[:, :, None],
                               norm_dev_from_modes).mean(axis=0)  # [n_kernels, n_feature_dim]
        mean_dev = np.multiply(1 / np.sqrt(self.weights[:, None]),
                               mean_dev)  # [n_kernels, n_feature_dim]

        # covariance deviation
        cov_dev = np.multiply(likelihood_ratio[:, :, None],
                              norm_dev_from_modes ** 2 - 1).mean(axis=0)
        cov_dev = np.multiply(1 / np.sqrt(2 * self.weights[:, None]),
                              cov_dev)  # [n_kernels, n_feature_dim]

        # stack the mean and covariance deviations
        fisher_vector = np.concatenate([mean_dev, cov_dev], axis=1)

        if normalized:
            fisher_vector = np.sqrt(np.abs(fisher_vector)) * np.sign(fisher_vector)  # power normalization
            fisher_vector = fisher_vector / np.linalg.norm(fisher_vector, axis=0)  # L2 normalization
        # fisher_vector[fisher_vector < 10**-4] = 0  # threshold

        print("\ninferring completed.")

        assert fisher_vector.ndim == 2
        return fisher_vector

    def predict_alternative(self, X, normalized=True):
        X = np.atleast_2d(X)
        N = X.shape[0]

        # compute posterior probabilities
        Q = self.gmm.predict_proba(X)  # NxK

        # compute the sufficient statistics of the descriptors
        Q_sum = np.sum(Q, 0)[:, np.newaxis] / N
        Q_X = np.dot(Q.T, X) / N
        Q_XX_2 = np.dot(Q.T, X ** 2) / N

        # compute derivatives with respect to mixing weights, means and variances
        d_pi = Q_sum.squeeze() - self.gmm.weights_
        d_mu = Q_X - Q_sum * self.gmm.means_
        d_sigma = (-Q_XX_2 - Q_sum * self.gmm.means_ ** 2 +
                   Q_sum * self.gmm.covariances_ + 2 * Q_X * self.gmm.means_)

        # merge the derivatives into one vector
        fisher_vector = np.hstack((d_pi, d_mu.flatten(), d_sigma.flatten()))

        if normalized:
            fisher_vector = np.sqrt(np.abs(fisher_vector)) * np.sign(fisher_vector)  # power normalization
            fisher_vector = fisher_vector / np.linalg.norm(fisher_vector, axis=0)  # L2 norm

        return fisher_vector

    def save(self):
        with open(os.path.join(self.save_dir, self.name, 'gmm.model'), 'wb') as out_gmm:
            pickle.dump(self.gmm, out_gmm, protocol=3)
        with open(os.path.join(self.save_dir, self.name, 'covars.data'), 'wb') as out_covars:
            pickle.dump(self.covars, out_covars, protocol=3)
        print("\nmodel saved. ---", self.name)

    def load(self):
        with open(os.path.join(self.save_dir, self.name, 'gmm.model'), 'rb') as in_gmm:
            self.gmm = pickle.load(in_gmm)
        with open(os.path.join(self.save_dir, self.name, 'covars.data'), 'rb') as in_covars:
            self.covars = pickle.load(in_covars)
        if not self.use_bayesian:
            assert isinstance(self.gmm, GaussianMixture)
        else:
            assert isinstance(self.gmm, BayesianGaussianMixture)
        self.means = self.gmm.means_
        self.weights = self.gmm.weights_
        print("\nmodel loaded. ---", self.name)

    def save_vector(self, fisher_vector, partition, dynamics=False, label=False):
        if not label:
            filename = 'vector_%s_%d' % (partition, self.n_kernels) if dynamics \
                else 'fisher_vector_%s_%d' % (partition, self.n_kernels)
            np.save(os.path.join(self.data_dir, filename), fisher_vector)
        else:
            filename = 'label_%s' % partition
            np.save(os.path.join(self.data_dir, filename), fisher_vector)

    def load_vector(self, partition, dynamics=False, label=False, bic=False):
        if not label:
            if not bic:
                filename = 'vector_%s_%d.npy' % (partition, self.n_kernels) if dynamics \
                    else 'fisher_vector_%s_%d.npy' % (partition, self.n_kernels)
            else:
                filename = 'vector_%s_0.npy' % partition if dynamics \
                    else 'fisher_vector_%s_0.npy' % partition
            fisher_vector = np.load(os.path.join(self.data_dir, filename),
                                    allow_pickle=True)
            return fisher_vector
        else:
            filename = 'label_%s.npy' % partition
            label = np.load(os.path.join(self.data_dir, filename))
            return label
    else:
        anomaly_tracker.append(0)
    test_cuboids.append(test_images_PED1[i])

test_cuboids_np = np.array(test_cuboids)
test_cuboids_np = test_cuboids_np.astype('float64')
test_cuboids_np *= 255.0 / test_cuboids_np.max()
test_cuboids_t = torch.from_numpy(test_cuboids_np)
test_cuboids_t = test_cuboids_t.permute(0, 4, 1, 2, 3)

batch_size = 1
test_dataloader = torch.utils.data.DataLoader(test_cuboids_t,
                                              shuffle=False,
                                              batch_size=batch_size,
                                              num_workers=4,
                                              drop_last=True)
print('Test dataloader loaded')

model.eval()
test_inputs = []
for y in test_dataloader:
    clust_pred, d_ignore = model(y)
    for i in range(len(clust_pred)):
        test_inputs.append(clust_pred[i].detach().cpu().numpy())

# `sb` is the mixture model fitted earlier in the script.
for sample in test_inputs:
    sb_test_scores.append(sb.score(sample.reshape(1, -1)))

print('Done with testing PED1')
np.save('SB_twoStage_PED1_Unsupervised' + str(num_epochs) + '.npy',
        np.array(sb_test_scores))
print('SB results saved')
print('naive GMM with fixed k.')
start_time = time.time()
ngmm = GaussianMixture(trav.n_components, covariance_type='diag').fit(data)
record_ngmm_fix_k.iloc[t, 0] = time.time() - start_time
# record_ngmm.iloc[t, 2] = ngmm.bic(data)
record_ngmm_fix_k.iloc[t, 1] = ngmm.score(data)
record_ngmm_fix_k.iloc[t, 2] = trav.n_components

print('dpgmm.')
start_time = time.time()
dpgmm = BayesianGaussianMixture(n_components=max_cluster_num, max_iter=500).fit(data)
record_dpgmm.iloc[t, 0] = time.time() - start_time
record_dpgmm.iloc[t, 1] = dpgmm.score(data)
# Note: len(weights_) always equals n_components; an *effective* component
# count would need to threshold out the near-zero weights.
record_dpgmm.iloc[t, 2] = len(dpgmm.weights_)

db_summary = pd.concat([record_sort, record_gmm, record_gmm_fix_k,
                        record_ngmm, record_ngmm_fix_k, record_dpgmm])
db_summary['DB'] = name
db_summary['method'] = (['CITE-sort'] * record_sort.shape[0] +
                        ['GMM'] * record_gmm.shape[0] +
                        ['GMM_fixk'] * record_gmm_fix_k.shape[0] +
                        ['nGMM'] * record_ngmm.shape[0] +
                        ['nGMM_fixk'] * record_ngmm_fix_k.shape[0] +
                        ['dpgmm'] * record_dpgmm.shape[0])
db_summary.to_csv(savepath + '/record_' + name + '.csv')
record_full[name] = db_summary
from sklearn.datasets import make_blobs
from sklearn.mixture import BayesianGaussianMixture
from sklearn.preprocessing import StandardScaler

centers = [[1, 1], [-1, -1], [1, -1]]
X, Y = make_blobs(n_samples=750, centers=centers, cluster_std=0.4, random_state=0)
X = StandardScaler().fit_transform(X)

'''
Bayesian Gaussian mixture
A variant built on top of expectation-maximization (EM) that adds priors,
which requires a better understanding of the data set.
Pros: some hyperparameters are chosen automatically, low sensitivity to the
number of components, and the priors act as regularization.
Cons: somewhat slower; the hyperparameters call for cross-validation, which
adds computation; and the priors introduce hidden biases into the model.
'''
cluster = BayesianGaussianMixture(n_components=3, covariance_type='full', tol=0.001,
                                  reg_covar=1e-06, max_iter=100, n_init=1,
                                  init_params='kmeans',
                                  weight_concentration_prior_type='dirichlet_process',
                                  weight_concentration_prior=None,
                                  mean_precision_prior=None, mean_prior=None,
                                  degrees_of_freedom_prior=None, covariance_prior=None,
                                  random_state=None, warm_start=False,
                                  verbose=0, verbose_interval=10)
cluster.fit(X)
# score() takes only X; any y argument is ignored since the mixture is unsupervised.
cluster.score(X)
'''
n_components                     number of mixture components
covariance_type                  covariance type:
                                 'spherical' (one variance per component),
                                 'diag' (diagonal), 'tied' (all components share
                                 one general matrix), 'full' (full covariance)
tol                              convergence threshold
reg_covar                        regularization added to the covariance matrices
max_iter                         maximum number of EM iterations
n_init                           number of initializations (the best run is kept)
init_params                      method used to initialize the weights
weight_concentration_prior_type  string naming the weight concentration prior type
weight_concentration_prior       the Dirichlet concentration prior; arguably the
                                 most important knob -- smaller values push the
                                 model toward fewer active components
'''
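# A short sketch of the knob flagged above: smaller weight_concentration_prior
# values push the Dirichlet-process mixture toward fewer active components.
# Uses the standardized blobs X from above; the gamma values are illustrative.
import numpy as np

for gamma in (0.01, 1.0, 100.0):
    bgm = BayesianGaussianMixture(n_components=10,
                                  weight_concentration_prior=gamma,
                                  random_state=0).fit(X)
    active = np.sum(bgm.weights_ > 0.01)  # components with non-negligible weight
    print(gamma, active)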
x = X[:, j].reshape(-1, 1)
model = BayesianGaussianMixture(n_components=comps[j], covariance_type='full')
model.fit(x)
both = np.column_stack([x, model.predict(x)])

# attempt to cluster the whole feature space... may or may not be useful
# X_filt = medfilt(X)
num_clusters = np.arange(1, 20 + 1)
scores = []
for num in num_clusters:
    # model = KMeans(n_clusters=num, max_iter=500, n_init=20)
    # model = GaussianMixture(n_components=num, covariance_type='full')
    model = BayesianGaussianMixture(n_components=num, covariance_type='full')
    model.fit(X)
    scores += [-model.score(X)]  # negative mean log-likelihood as a loss

plt.plot(np.arange(1, 20 + 1), scores)
plt.xlim(0, 20)
plt.xticks(np.arange(1, 20 + 1))
plt.xlabel('number of clusters')
plt.ylabel('loss')
plt.show()

# 4, 6, 10 clusters?
# model = KMeans(n_clusters=4, max_iter=500, n_init=20)
model = BayesianGaussianMixture(n_components=10, covariance_type='full')
model.fit(X)
plt.hist(model.predict(X))
plt.show()
# print(model.cluster_centers_)  # only applies to the KMeans variant