def fit(self, data, categorical_columns=tuple(), ordinal_columns=tuple()):
    """Fit one Bayesian Gaussian mixture per continuous column of ``data``.

    Populates ``self.meta``, ``self.model``, ``self.components``,
    ``self.output_info`` and ``self.output_dim``.

    Parameters
    ----------
    data : object supporting ``data.iloc[:, i].values``
        Input table (presumably a pandas DataFrame -- confirm with callers).
    categorical_columns, ordinal_columns : sequence
        Forwarded unchanged to ``get_metadata``.
    """
    self.meta = get_metadata(data, categorical_columns, ordinal_columns)
    model = []

    self.output_info = []
    self.output_dim = 0
    self.components = []
    for id_, info in enumerate(self.meta):
        if info["type"] == "continuous":
            # n_components is passed by keyword: recent scikit-learn releases
            # make estimator constructor parameters keyword-only.
            gm = BayesianGaussianMixture(
                n_components=self.n_clusters,
                weight_concentration_prior_type="dirichlet_process",
                weight_concentration_prior=0.001,
                n_init=1,
            )
            gm.fit(data.iloc[:, id_].values.reshape([-1, 1]))
            model.append(gm)
            # Boolean mask of the components whose weight exceeds self.eps.
            comp = gm.weights_ > self.eps
            n_active = np.sum(comp)
            self.components.append(comp)
            self.output_info += [(1, "tanh"), (n_active, "softmax")]
            self.output_dim += 1 + n_active
        else:
            # Non-continuous columns get no mixture model.
            model.append(None)
            self.components.append(None)
            self.output_info += [(info["size"], "softmax")]
            self.output_dim += info["size"]

    self.model = model
def fit(self, sequence):
    """Fit a Bayesian Gaussian mixture on the training matrix built from ``sequence``.

    ``self._create_training_matrix`` produces a feature matrix and labels;
    the labels are appended as the last column and the mixture is fitted on
    the combined matrix. The fitted parameters are stored in
    ``self.gmm_parameters``.

    NOTE: ``sequence`` is used as given; no normalization is applied here.
    """
    train_matrix, labels = self._create_training_matrix(
        sequence, self.n_features)

    # Model features and labels jointly: labels become the final column.
    train_matrix_extended = np.column_stack((train_matrix, labels))

    gmm = BayesianGaussianMixture(n_components=self.n_components,
                                  covariance_type='full',
                                  max_iter=1000)
    gmm.fit(train_matrix_extended)

    self.gmm_parameters = {
        'n_components': gmm.n_components,
        'n_features': self.n_features,
        'alpha': gmm.weights_,
        'mu': gmm.means_,
        'sigma': self._get_covariances(gmm)
    }
def cluster(self, dim, method='dpgmm', max_n_clusters=80, max_iter=300, refresh=True):
    """Cluster the selected feature columns with a Dirichlet-process GMM.

    ``dim`` is the column index (or indices) of ``self.fet`` used for
    clustering. ``method`` is accepted but not consulted by this body.
    The predicted labels are written to ``self.clu.membership``, the
    clustering object is rebuilt, a 'cluster' event is emitted, and the
    labels are returned. When ``refresh`` is exactly ``True`` the data is
    re-set via ``self.set_data``.
    """
    print('clustering DPGMM')
    from sklearn.mixture import BayesianGaussianMixture as DPGMM

    selected = self.fet[:, dim]
    # init_params can be "kmeans" or "random"
    estimator = DPGMM(
        n_components=max_n_clusters,
        covariance_type='full',
        weight_concentration_prior=1e-3,
        weight_concentration_prior_type='dirichlet_process',
        init_params="kmeans",
        max_iter=max_iter,
        random_state=0,
        verbose=1,
        verbose_interval=10,
    )
    estimator.fit(selected)
    label = estimator.predict(self.fet[:, dim])

    self.clu.membership = label
    self.clu.__construct__()
    self.clu.emit('cluster')

    if refresh is True:
        self.set_data(self.fet, self.clu)
    return label
def _bgm_fit(self, x):
    """Fit a Bayesian Gaussian Mixture to ``x`` and return the fitted model.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_attributes)
        The data to be fit. It is cast to ``float32`` before fitting.

    Returns
    -------
    model : BayesianGaussianMixture from the sklearn package
        The estimator after ``fit`` has been called on the cast data.
    """
    # Hyper-parameters all come from attributes on this instance.
    settings = dict(
        n_components=self.n_components,
        tol=self.tol,
        max_iter=self.max_iter,
        n_init=self.n_init,
        covariance_type=self.cov_type,
        weight_concentration_prior_type=self.weight_concentration_prior_type,
        weight_concentration_prior=self.weight_concentration_prior,
    )
    model = BGM(**settings)
    model.fit(x.astype('float32'))
    return model
def test_bayesian_mixture_predict_predict_proba():
    """predict() must match argmax(predict_proba()) and recover the labels.

    Mirrors test_gaussian_mixture_predict_predict_proba(): for every prior
    type and covariance type, calling predict before fit must raise
    NotFittedError, and after fit the hard labels must agree with the
    soft assignments and score at least .95 adjusted Rand index.
    """
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    not_fitted_msg = ("This BayesianGaussianMixture instance"
                      " is not fitted yet. Call 'fit' with "
                      "appropriate arguments before using "
                      "this method.")
    for prior_type in PRIOR_TYPE:
        for covar_type in COVARIANCE_TYPE:
            X = rand_data.X[covar_type]
            expected_labels = rand_data.Y
            estimator = BayesianGaussianMixture(
                n_components=rand_data.n_components,
                random_state=rng,
                weight_concentration_prior_type=prior_type,
                covariance_type=covar_type)

            # Using the estimator before fitting must fail loudly.
            assert_raise_message(NotFittedError, not_fitted_msg,
                                 estimator.predict, X)

            estimator.fit(X)
            hard_labels = estimator.predict(X)
            soft_labels = estimator.predict_proba(X).argmax(axis=1)
            assert_array_equal(hard_labels, soft_labels)
            assert_greater_equal(
                adjusted_rand_score(expected_labels, hard_labels), .95)
def __init__(self, **kwargs):
    """Validate the classifier and pre-cluster the data set with a Bayesian GMM.

    Keyword Args:
        data_set: forwarded to the parent initializer (``None`` if absent).
        clf: required; object with callable ``fit`` and ``predict_proba``.
        n_components: number of mixture components, an integer in
            ``[1, n_samples]``; defaults to ``min(20, n_samples)``.

    Raises:
        ValueError: if ``clf`` is missing or ``n_components`` is out of range.
        TypeError: if ``clf`` lacks ``fit`` or ``predict_proba``.

    Sets ``self.clf_``, ``self.y_cluster_`` (cluster label per sample) and
    ``self.p_x_`` (exp of the per-sample mixture log-likelihood).
    """
    super().__init__(data_set=kwargs.pop('data_set', None), **kwargs)

    self.clf_ = kwargs.get('clf', None)
    if self.clf_ is None:
        raise ValueError("missing required keyword-only argument 'clf'")
    if not callable(getattr(self.clf_, 'fit', None)) or not callable(
            (getattr(self.clf_, 'predict_proba', None))):
        raise TypeError(
            "'clf' must be an instance with the methods 'fit' and 'predict_proba'"
        )

    n_components = int(
        kwargs.pop('n_components', np.min([20, len(self.data_set_)])))
    # The lower bound is 1, as the message states; the previous `< 0` check
    # silently accepted 0 components.
    if n_components < 1 or n_components > len(self.data_set_):
        raise ValueError(
            "'n_components' must be an integer in the interval [1, n_samples]"
        )

    # fit Gaussian mixture model for pre-clustering
    gmm = BayesianGaussianMixture(n_components=n_components,
                                  covariance_type='spherical',
                                  max_iter=1000,
                                  random_state=self.random_state_)
    gmm.fit(self.data_set_.X_)
    self.y_cluster_ = gmm.predict(self.data_set_.X_)
    self.p_x_ = np.exp(gmm.score_samples(self.data_set_.X_))
def setUp(self):
    """Load and standardize the breast-cancer data and build the CMM under test.

    The CMM wraps a 2-component BayesianGaussianMixture that is already
    fitted on the standardized features.
    """
    self.random_state = 1
    raw_features, targets = load_breast_cancer(return_X_y=True)
    self.X = raw_features
    self.y = targets
    self.X = StandardScaler().fit_transform(self.X)

    fitted_mixture = BayesianGaussianMixture(n_components=2)
    fitted_mixture.fit(self.X)
    self.clf = CMM(mixture_model=fitted_mixture)
def fit(self, X, Y):
    """Fit one Bayesian Gaussian mixture per class and estimate class priors.

    X - inputs
    Y - classes; assumed to be labelled 0 ... K-1

    Sets ``self.classes`` (number of distinct labels), ``self.gaussians``
    (one fitted mixture per class) and ``self.p_y`` (normalized class
    frequencies).
    """
    self.classes = len(set(Y))  # We assume classes are in (0...K-1)
    self.gaussians = []
    self.p_y = np.zeros(self.classes)  # p(y)
    for individual_class in range(self.classes):
        print("Fitting GMM for the %s class" % individual_class)
        X_class = X[Y == individual_class]
        self.p_y[individual_class] = len(X_class)
        # 10 is the maximum number of clusters (chosen arbitrarily, could be
        # more). It is passed by keyword because recent scikit-learn releases
        # make estimator constructor parameters keyword-only.
        mixture = BayesianGaussianMixture(n_components=10)
        # fit performs the iterative variational inference update and can
        # take a while.
        mixture.fit(X_class)
        self.gaussians.append(mixture)
        print("Finished fitting the GMM for the %s class" % individual_class)
        print("======================================================")
    self.p_y = self.p_y / self.p_y.sum()  # This normalizes p(y)
def gibbs_resampling_EM(self, iter_n=1):
    """Resample particle poses from a mixture fitted on ``self.pf_debug``.

    For each of ``iter_n`` rounds:
      1. fit a Bayesian GMM on columns 0:2 of ``self.pf_debug`` and draw a
         new (x, y) for every particle in ``self.Rot``, perturbing theta;
      2. re-score the particles via ``self.likelihood_PF()``;
      3. fit 4-cluster k-means on column 2 of ``self.pf_debug`` and assign
         each particle a theta drawn around a randomly chosen centre, with
         small Gaussian jitter added to x and y.

    NOTE(review): the loop nesting was reconstructed from a whitespace-mangled
    original -- confirm against version control.
    """
    self.itr = 0
    Np = len(self.Rot)
    for _round in range(iter_n):
        EM = BayesianGaussianMixture(n_components=10)
        EM.fit(self.pf_debug[:, 0:2])
        for i in range(Np):
            # sample() returns (points, component_labels); points has shape
            # (1, 2). The previous code stored the component label in `.y`
            # and the whole 2-vector in `.x`.
            points, _ = EM.sample()
            self.Rot[i].theta += 0.5 * np.random.randn(
            ) + 90.0 * np.random.choice(4, p=[0.8, 0.05, 0.1, 0.05])
            self.Rot[i].x = points[0, 0]
            self.Rot[i].y = points[0, 1]
        self.likelihood_PF()
        W = self.scores / np.sum(
            self.scores)  # Normalized scores for resampling
        Np = len(self.Rot)
        # NOTE(review): this score-weighted draw is overwritten below before
        # it is used; kept so the random stream is unchanged.
        index = np.random.choice(a=Np, size=Np, p=W)  # resample by score
        kmeans = KMeans(n_clusters=4,
                        init='k-means++',
                        max_iter=300,
                        n_init=10,
                        random_state=0)
        kmeans.fit(self.pf_debug[:, 2].reshape((-1, 1)))
        index = np.random.choice(a=4, size=Np)
        for i, idx in enumerate(index):
            self.Rot[i].theta = np.squeeze(
                kmeans.cluster_centers_[idx]) + 0.5 * np.random.randn()
            self.Rot[i].x += np.random.normal(0, 0.01)
            self.Rot[i].y += np.random.normal(0, 0.01)
    print('resample done')
def run(self):
    """Fit a Bayesian GMM on the corpus features and log sample scores.

    Loads ``feature.dump`` from the uniblock directory, builds the feature
    matrix for ``args.corpus_path``, keeps only the columns selected by
    ``self._infer_nonzero``, fits the mixture, and dumps ``legal.dump``,
    ``mask.dump`` and ``bgm.dump`` alongside. Per-sample scores are passed
    to ``self._log_scores`` and ``self._log_stats``.
    """
    args = self.args
    out_dir = self._get_uniblock_path()

    feature = load(os.path.join(out_dir, 'feature.dump'))
    X = feature.get_feature_matrix(args.corpus_path)

    legal, mask = self._infer_nonzero(X)
    dump(legal, os.path.join(out_dir, 'legal.dump'))
    dump(mask, os.path.join(out_dir, 'mask.dump'))
    X = X[:, legal]

    verbosity = 2 if args.verbose else 0
    bgm = BayesianGaussianMixture(
        n_components=args.k,
        covariance_type=args.cov,
        max_iter=200,
        random_state=0,
        verbose=verbosity,
        verbose_interval=1,
        tol=args.tol,
        n_init=args.n_init,
        init_params=args.init_params,
    )
    bgm.fit(X)
    dump(bgm, os.path.join(out_dir, 'bgm.dump'))

    scores = bgm.score_samples(X)
    self._log_scores(scores, args.corpus_path)
    self._log_stats(scores, args.corpus_path)
class VBEM(object):
    """Variational-Bayes Gaussian mixture fitted on a dataset at construction."""

    def __init__(self, n_components=5, dataset=None):
        """Build the mixture and fit it on ``dataset.data``.

        ``dataset`` must expose ``class_num``, ``data_num``, ``data`` and
        ``label``; despite the ``None`` default it is required.
        """
        self.model = BayesianGaussianMixture(n_components=n_components,
                                             max_iter=10000)
        self.n_components = n_components
        self.class_num = dataset.class_num
        self.data_num = dataset.data_num
        self.data = dataset.data
        self.label = dataset.label
        self.bestVBEM_k = 0
        self.model.fit(self.data)

    def draw(self):
        """Scatter-plot the predicted clusters and save the figure under ``res/``.

        Also sets ``self.bestVBEM_k`` to the largest predicted label plus one.
        The data is plotted as two columns named 'x' and 'y'.
        """
        labels = self.model.predict(self.data)
        self.bestVBEM_k = max(labels) + 1
        data_2d = pd.DataFrame(self.data, columns=['x', 'y'])
        label_2d = pd.DataFrame(labels, columns=['label'])
        label_names = np.unique(labels)
        colors = [
            plt.cm.tab10(i / float(len(label_names)))
            for i in range(len(label_names))
        ]
        tmp_2d = pd.concat([data_2d, label_2d], axis=1)
        plt.figure()
        for i, name in enumerate(label_names):
            members = tmp_2d.loc[tmp_2d.label == name]
            # `color=` takes the RGBA tuple; `cmap=` expects a Colormap and
            # is not applied when no `c` array is given.
            plt.scatter(members.x,
                        members.y,
                        s=5,
                        color=colors[i],
                        alpha=0.5)
        plt.title('Best GMM with VBEM_' + str(self.class_num) + '_' +
                  str(self.data_num))
        plt.savefig('res/GMM_VBEM_' + str(self.class_num) + '_' +
                    str(self.data_num) + '.jpg')
class VBEM(object):
    """Thin wrapper around BayesianGaussianMixture with a plotting helper."""

    def __init__(self,
                 n_components=1,
                 verbose=2,
                 verbose_interval=1,
                 data=None):
        """Create the (unfitted) mixture and select the dataset.

        When ``data`` is ``None`` a fresh ``Dataset`` is created and
        ``generate()`` is called on it; otherwise ``data`` is used as the
        dataset. ``self.data`` is the dataset's ``data`` attribute.
        """
        self.model = BayesianGaussianMixture(
            n_components=n_components,
            verbose=verbose,
            verbose_interval=verbose_interval)
        self.n_components = n_components
        # Identity test: `== None` may dispatch to a custom or element-wise
        # __eq__ on the supplied object.
        if data is None:
            self.dataset = Dataset()
            self.dataset.generate()
        else:
            self.dataset = data
        self.data = self.dataset.data

    def train(self):
        """Fit the mixture on ``self.data``."""
        self.model.fit(self.data)

    def show(self, n=None):
        """Refit, then scatter-plot the first two columns coloured by label.

        With ``n`` left as ``None`` the figure is shown interactively;
        otherwise it is saved to ``'Pro2/vbem_%d_%d' % (n, 4)``.
        """
        plt.figure()
        self.model.fit(self.data)
        labels = self.model.predict(self.data)
        plt.scatter(self.data[:, 0], self.data[:, 1], c=labels, s=10)
        if n is None:
            plt.show()
        else:
            plt.savefig('Pro2/vbem_%d_%d' % (n, 4))
def test_predict(self):
    """Check CMM.predict for several class sets, seeds and cost matrices."""
    mixture = BayesianGaussianMixture(n_components=1, random_state=0)
    mixture.fit(X=self.X)

    # Three classes, only missing labels: must be fitted before predicting.
    three_class = CMM(
        mixture_model=mixture,
        classes=['tokyo', 'paris', 'new york'],
        missing_label='nan',
        random_state=0
    )
    self.assertRaises(NotFittedError, three_class.predict, X=self.X)
    three_class.fit(X=self.X, y=self.y_nan)
    predicted = three_class.predict(self.X)
    np.testing.assert_array_equal(['paris', 'tokyo', 'tokyo'], predicted)

    # Two classes, different seed: unlabeled fit, then weighted labeled fit.
    two_class = CMM(
        mixture_model=mixture,
        classes=['tokyo', 'paris'],
        missing_label='nan',
        random_state=1
    )
    two_class.fit(X=self.X, y=self.y_nan)
    predicted = two_class.predict(self.X)
    np.testing.assert_array_equal(['tokyo', 'tokyo', 'paris'], predicted)
    two_class.fit(X=self.X, y=self.y, sample_weight=self.w)
    predicted = two_class.predict(self.X)
    np.testing.assert_array_equal(['tokyo', 'tokyo', 'tokyo'], predicted)

    # Asymmetric cost matrix: expected predictions are all 'paris'.
    cost_sensitive = CMM(
        mixture_model=mixture,
        classes=['tokyo', 'paris'],
        missing_label='nan',
        cost_matrix=[[0, 1], [10, 0]]
    )
    cost_sensitive.fit(X=self.X, y=self.y)
    predicted = cost_sensitive.predict(self.X)
    np.testing.assert_array_equal(['paris', 'paris', 'paris'], predicted)
    cost_sensitive.fit(X=self.X, y=self.y, sample_weight=self.w)
    predicted = cost_sensitive.predict(self.X)
    np.testing.assert_array_equal(['paris', 'paris', 'paris'], predicted)
def bayesian_gaussian_mixture(self, n_components, weight_concentration_prior_type,
                              weight_concentration_prior, mean_precision_prior,
                              n_init, max_iter, init_params):
    '''Bayesian Gaussian Mixture clustering algorithm.

    Fits the mixture on ``self.X``, stores the predicted labels in
    ``self.labels``, draws a bar chart of cluster sizes annotated with the
    hyper-parameters, and prints fit diagnostics.

    A low value for weight_concentration_prior will put more weight on a
    few components; a high value will allow a larger number of components
    to be active in the mixture.
    '''
    bgm = BayesianGaussianMixture(n_components=n_components,
                                  weight_concentration_prior_type=weight_concentration_prior_type,
                                  weight_concentration_prior=weight_concentration_prior,
                                  mean_precision_prior=mean_precision_prior,
                                  n_init=n_init,
                                  max_iter=max_iter,
                                  init_params=init_params)
    bgm.fit(self.X)
    self.labels = bgm.predict(self.X)

    unique, counts = np.unique(self.labels, return_counts=True)
    mydict = dict(zip(unique, counts))
    print(mydict)

    # Plot from the arrays directly: a raw dict_values view is not a
    # sequence and is not accepted by every matplotlib version.
    plt.bar(unique, counts, color = 'g')
    plt.ylabel("Number of skews")
    plt.xlabel("Cluster")
    plt.title(weight_concentration_prior_type)
    plt.gcf().text(0.05, 0.05, "Parameters initialized using: "+init_params)
    plt.gcf().text(0.05, 0.01, "Weight concentration prior: "+str(weight_concentration_prior))
    plt.gcf().text(0.7, 0.05, "Mean precision prior: "+str(mean_precision_prior))
    plt.gcf().text(0.7, 0.01, "Likelihood: "+str("%.2f"%bgm.lower_bound_))
    # The figure is left open; the caller decides whether to show or save it.

    print("Weights: "+str(bgm.weights_))
    print("Converged: "+str(bgm.converged_))
    print("Number of iterations to reach convergence: "+str(bgm.n_iter_))
    print("Lower bound value on likelihood: "+str(bgm.lower_bound_))
    print("Bayesian Gaussian mixture complete")
def pca_gmm_gen_mdl(X, n_pca=15, n_gmm=10, scree=False, w=None):
    """Build a PCA + Bayesian-GMM generative model and return a sampler.

    ``pca_mdl`` reduces ``X``; a full-covariance mixture is fitted on the
    reduced data. With ``scree=True`` the cumulative explained variance is
    plotted.

    Returns ``(gen_samples, Xr, Xm)``, where ``Xr`` and ``Xm`` come from
    ``pca_mdl`` and ``gen_samples(n)`` returns the reduced-space samples,
    their inverse PCA transform (divided by ``w[None, :]`` when ``w`` is
    given), and their log-likelihood under the mixture.
    """
    pca, Xr, Xm = pca_mdl(X, n_components=n_pca, w=w)
    density = BayesianGaussianMixture(n_components=n_gmm,
                                      covariance_type='full',
                                      max_iter=25000)
    density.fit(Xr)

    if scree:
        plt.figure()
        plt.plot(np.cumsum(pca.explained_variance_ratio_))
        plt.title('Scree plot')
        plt.xlabel('PCA Components')
        plt.ylabel('Explained Variance')

    def gen_samples(n):
        drawn, _ = density.sample(n)
        log_lik = density.score_samples(drawn)
        restored = pca.inverse_transform(drawn)
        if w is not None:
            # Undo the per-feature weighting.
            restored = restored / w[None, :]
        return drawn, restored, log_lik

    return gen_samples, Xr, Xm
def partition_data(self, args):
    """Partition minibatch ``j`` and the test points into clusters.

    Parameters
    ----------
    args : tuple ``(method, j)``
        ``method`` selects the partitioner: ``"vi"`` (one variational
        iteration of a Dirichlet-process mixture), ``"gmm"``
        (``self.uncollapsed_dp_partition_alt``), ``"kmean"`` (k-means), or
        anything else for a uniformly random assignment. ``j`` indexes
        ``self.U``.

    Returns
    -------
    (Z, Z_star) : label-encoded assignments for the minibatch and for the
        test points. For ``"vi"``, every label in ``Z_star`` is guaranteed
        to also occur in ``Z``.
    """
    method, j = args
    # One if/elif chain: previously the second chain started with a fresh
    # `if`, so "vi" fell through to the random branch and had its results
    # overwritten.
    if method == "vi":
        X_batch = self.X[self.U[j]]
        dp = BayesianGaussianMixture(n_components=self.K,
                                     weight_concentration_prior=self.alpha,
                                     max_iter=1,
                                     init_params='kmeans',
                                     weight_concentration_prior_type='dirichlet_process')
        dp.fit(X_batch)
        Z = dp.predict(X_batch).astype(int)
        Z_star = dp.predict(self.X_star).astype(int)
    elif method == "gmm":
        Z, Z_star = self.uncollapsed_dp_partition_alt(j)
    elif method == "kmean":
        km = KMeans(n_clusters=self.K)
        Z = km.fit_predict(self.X[self.U[j]]).astype(int)
        # NOTE(review): X_star is indexed with the training minibatch
        # indices here, unlike the "vi" branch -- confirm this is intended.
        Z_star = km.predict(self.X_star[self.U[j]]).astype(int)
    else:
        Z = np.random.choice(self.K, size=self.N_minibatch, replace=True)
        Z_star = np.random.choice(np.unique(Z), size=self.N_star, replace=True)

    le = LE()
    le.fit(np.hstack((Z, Z_star)))
    Z = le.transform(Z)
    Z_star = le.transform(Z_star)

    if method == "vi":
        # Reassign test points whose cluster never occurs in the minibatch.
        Z_diff = np.setdiff1d(Z_star, Z)
        if Z_diff.size > 0:
            # A list, not a generator: np.hstack needs a real sequence.
            idx = np.hstack([np.where(Z_star == k)[0] for k in Z_diff])
            unique_Z = np.unique(Z)
            # NOTE(review): unique_Z holds encoded labels but indexes the
            # columns of predict_proba, which are in the mixture's original
            # component numbering -- verify the two coincide.
            post_Z = dp.predict_proba(self.X_star[idx])[:, unique_Z]
            Z_star[idx] = [np.random.choice(unique_Z, p=post_Z_i / post_Z_i.sum())
                           for post_Z_i in post_Z]
        assert(np.setdiff1d(Z_star, Z).size == 0)
    return (Z, Z_star)
def test_bayesian_mixture_predict_predict_proba():
    """predict() must match argmax(predict_proba()) and recover the labels.

    Mirrors test_gaussian_mixture_predict_predict_proba(): for every prior
    type and covariance type, predicting before fitting must raise
    NotFittedError; afterwards hard and soft assignments must coincide and
    reach an adjusted Rand index of at least .95.
    """
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    not_fitted_msg = ("This BayesianGaussianMixture instance"
                      " is not fitted yet. Call 'fit' with "
                      "appropriate arguments before using "
                      "this estimator.")
    for prior_type in PRIOR_TYPE:
        for covar_type in COVARIANCE_TYPE:
            X = rand_data.X[covar_type]
            expected_labels = rand_data.Y
            estimator = BayesianGaussianMixture(
                n_components=rand_data.n_components,
                random_state=rng,
                weight_concentration_prior_type=prior_type,
                covariance_type=covar_type)

            # Using the estimator before fitting must fail loudly.
            assert_raise_message(NotFittedError, not_fitted_msg,
                                 estimator.predict, X)

            estimator.fit(X)
            hard_labels = estimator.predict(X)
            soft_labels = estimator.predict_proba(X).argmax(axis=1)
            assert_array_equal(hard_labels, soft_labels)
            assert adjusted_rand_score(expected_labels, hard_labels) >= .95
def load(self, phipsis):
    """Fit a mixture to ``phipsis`` and derive the matcher weight and covariances.

    A first mixture with up to 10 components is fitted only to count the
    components whose weight exceeds 0.05; a second mixture with that many
    components is the one kept as ``self.gm``.

    Sets ``self.length``, ``self.phipsis``, ``self.weight``,
    ``self.cov_dist`` and ``self.gm``. The indexing ``[:, 0, 0]`` /
    ``[:, 1, 1]`` reads the first two diagonal entries of each component's
    Cholesky precision factor.
    """
    self.length = len(phipsis)

    # Probe fit: how many components carry non-negligible weight?
    probe = GM(n_components=min(10, self.length))
    probe.fit(X=phipsis)
    n_significant = sum(probe.weights_ > 0.05)

    gm = GM(n_components=n_significant)
    gm.fit(X=phipsis)
    chol_precisions = gm.precisions_cholesky_
    self.phipsis = phipsis

    # Matcher weight: should combine how tightly the distribution clusters
    # with the posterior probability of the queried point. High clustering
    # with a point outside the distribution suggests other pressures on the
    # point, hence a lower weight; low clustering means little pressure, so
    # membership matters less.
    weight = np.mean(chol_precisions[:, 0, 0]) \
        + np.mean(chol_precisions[:, 1, 1])
    weight = weight * self.weight_scaling_factor
    self.weight = min(weight, 1)
    self.weight *= self.weight_accom_factor

    # Blend the per-component inverse covariances by each sample's
    # component membership probabilities.
    inverse_covs = np.array([np.linalg.inv(cov) for cov in gm.covariances_])
    membership = gm.predict_proba(phipsis)
    self.cov_dist = np.einsum("ijk, li->ljk", inverse_covs, membership)
    self.gm = gm
    return
def test_check_covariance_precision():
    """Covariance times precision must be the identity for every covariance type."""
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng, scale=7)
    n_components, n_features = 2 * rand_data.n_components, 2

    # A single estimator is reused; only its covariance_type changes.
    bgmm = BayesianGaussianMixture(n_components=n_components,
                                   max_iter=100, random_state=rng, tol=1e-3,
                                   reg_covar=0)
    for covar_type in COVARIANCE_TYPE:
        bgmm.covariance_type = covar_type
        bgmm.fit(rand_data.X[covar_type])

        if covar_type == 'full':
            # One matrix pair per component.
            for covar, precision in zip(bgmm.covariances_, bgmm.precisions_):
                assert_almost_equal(np.dot(covar, precision),
                                    np.eye(n_features))
            continue

        if covar_type == 'tied':
            # A single shared matrix pair.
            product = np.dot(bgmm.covariances_, bgmm.precisions_)
            expected = np.eye(n_features)
        elif covar_type == 'diag':
            # Element-wise products of the diagonals.
            product = bgmm.covariances_ * bgmm.precisions_
            expected = np.ones((n_components, n_features))
        else:
            # Remaining type: one scalar per component.
            product = bgmm.covariances_ * bgmm.precisions_
            expected = np.ones(n_components)
        assert_almost_equal(product, expected)
def test_bayesian_mixture_weights_prior_initialisation():
    """weight_concentration_prior is validated, honoured, and defaulted to 1/n_components."""
    rng = np.random.RandomState(0)
    n_samples, n_components, n_features = 10, 5, 2
    X = rng.rand(n_samples, n_features)

    # A non-positive prior must be rejected at fit time.
    bad_weight_concentration_prior_ = 0.0
    invalid_model = BayesianGaussianMixture(
        weight_concentration_prior=bad_weight_concentration_prior_,
        random_state=0)
    msg = ("The parameter 'weight_concentration_prior' should be greater "
           f"than 0., but got {bad_weight_concentration_prior_:.3f}.")
    with pytest.raises(ValueError, match=msg):
        invalid_model.fit(X)

    # An explicit prior is stored unchanged on the fitted estimator.
    weight_concentration_prior = rng.rand()
    explicit_model = BayesianGaussianMixture(
        weight_concentration_prior=weight_concentration_prior,
        random_state=rng)
    explicit_model = explicit_model.fit(X)
    assert_almost_equal(weight_concentration_prior,
                        explicit_model.weight_concentration_prior_)

    # With no prior given, the fitted value is 1 / n_components.
    default_model = BayesianGaussianMixture(n_components=n_components,
                                            random_state=rng)
    default_model = default_model.fit(X)
    assert_almost_equal(1.0 / n_components,
                        default_model.weight_concentration_prior_)
def fit(self, X, Y):
    """Fit a Bayesian Gaussian mixture per class and record class frequencies.

    Classes are assumed to be labelled 0 ... K-1. Sets ``self.K`` (number
    of distinct labels), ``self.N`` (number of samples), ``self.gauss``
    (per class: fitted model and per-feature mean) and ``self.p_y`` (per
    class: share of samples). Timing for each fit is printed.
    """
    self.K = len(set(Y))
    self.N = len(X)
    self.gauss = []
    self.p_y = []

    for k in range(self.K):
        started = datetime.now()
        X_k = X[Y == k]
        feature_means = np.mean(X_k, axis=0)

        # n_components is the maximum number of clusters.
        mixture = BayesianGaussianMixture(n_components=10)
        print('Fitting GMM', k)
        mixture.fit(X_k)
        print('elapsed time:', datetime.now() - started, '\n')

        self.gauss.append({'model': mixture, 'mean': feature_means})
        # p(Y=k) = #samples in class k / #all samples
        self.p_y.append(len(X_k) / self.N)
def do_bgm(self, n_components=6, seed=42):
    """Bayesian Gaussian Mixture.

    Fits a Dirichlet-process mixture on ``self.X`` by variational Bayesian
    estimation. The effective number of components can be smaller than
    ``n_components`` if the model drives some weights close to 0.

    Args:
        n_components (int): Number of components in GMM.
        seed (int): Random seed; also seeds NumPy's global generator.

    Returns:
        bgm_output (dict): ``'bgm_labels'`` holds the predicted labels and
            ``'bgm_prob'`` the membership probability for component 0.
    """
    np.random.seed(seed)
    settings = {
        'n_components': n_components,
        'covariance_type': 'full',
        'weight_concentration_prior': 1e-2,
        'weight_concentration_prior_type': 'dirichlet_process',
        'mean_precision_prior': 1e-2,
        'init_params': 'random',
        'max_iter': 100,
        'random_state': seed,
    }
    bgm = BayesianGaussianMixture(**settings)
    bgm.fit(self.X)

    labels = bgm.predict(self.X)
    first_component_prob = bgm.predict_proba(self.X)[:, 0]
    return {'bgm_labels': labels, 'bgm_prob': first_component_prob}
def init_parameters(self, data):
    """Initialize transition, emission, start and mean parameters from ``data``.

    Steps, in order:
      * uniform transition matrix, zero emission matrix, random means and
        unit covariances sized by ``self.num_unique_states``;
      * best-of-5 k-means followed by 40 plain EM iterations (no HMM) on
        the data as a single column; the start probabilities come from EM
        and the means from the k-means centres;
      * finally a 3-component Bayesian GMM is fitted and its flattened
        means replace ``self.means_``.

    NOTE(review): the final mixture always uses 3 components regardless of
    ``self.num_unique_states`` -- confirm this is intended.
    """
    n_states = self.num_unique_states

    # Normalized twice, exactly as before; on an all-ones matrix both steps
    # yield uniform rows.
    self.transmat_ = np.ones((n_states, n_states))
    self.transmat_ = self.transmat_ / np.sum(self.transmat_, axis=1)
    row_sums = np.sum(self.transmat_, axis=1).reshape(1, -1).T
    self.transmat_ = self.transmat_ / row_sums

    self.emission_matrix = np.zeros((n_states, self.num_observations))
    self.means_ = np.random.rand(n_states)
    self.covars_ = np.ones(n_states)

    # Run simple EM (no HMM)
    iterations = 40
    column = data.reshape(-1, 1)
    assignments, centers, _ = kmeans.kmeans_best_of_n(column,
                                                      n_states,
                                                      n_trials=5)
    unit_gaussians = [distributions.Gaussian(center.mean, np.eye(1))
                      for center in centers]
    _tau, _obs_distr, pi, _ll_train, _ll_test = em.em(
        column, unit_gaussians, assignments, n_iter=iterations)

    for state, center in enumerate(centers):
        self.means_[state] = center.mean
    self.startprob_ = pi

    # The variational mixture has the final say on the means.
    gmm = BayesianGaussianMixture(n_components=3,
                                  init_params="kmeans",
                                  max_iter=1500)
    gmm.fit(data.reshape(-1, 1))
    self.means_ = gmm.means_.flatten()
def test_check_covariance_precision():
    """Covariance times precision must be the identity for every covariance type."""
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng, scale=7)
    n_components, n_features = 2 * rand_data.n_components, 2

    # A single estimator is reused; only its covariance_type changes.
    bgmm = BayesianGaussianMixture(n_components=n_components,
                                   max_iter=100, random_state=rng, tol=1e-3,
                                   reg_covar=0)
    for covar_type in COVARIANCE_TYPE:
        bgmm.covariance_type = covar_type
        bgmm.fit(rand_data.X[covar_type])

        if covar_type == 'full':
            # One matrix pair per component.
            for covar, precision in zip(bgmm.covariances_, bgmm.precisions_):
                assert_almost_equal(np.dot(covar, precision),
                                    np.eye(n_features))
            continue

        if covar_type == 'tied':
            # A single shared matrix pair.
            product = np.dot(bgmm.covariances_, bgmm.precisions_)
            expected = np.eye(n_features)
        elif covar_type == 'diag':
            # Element-wise products of the diagonals.
            product = bgmm.covariances_ * bgmm.precisions_
            expected = np.ones((n_components, n_features))
        else:
            # Remaining type: one scalar per component.
            product = bgmm.covariances_ * bgmm.precisions_
            expected = np.ones(n_components)
        assert_almost_equal(product, expected)
def airmass_labels(z, P, T, H2O, O3, n_airmass=5, labels=None):
    """Label profiles by air mass and plot the classes in three panels.

    Four standardized features are built per profile: mean temperature
    below 3 (in the units of ``z``), mean temperature difference between
    adjacent levels below 6, and the last cumulative H2O and O3 values
    from ``mf2mol_cum``. When ``labels`` is ``None`` they are predicted by
    a full-covariance Bayesian GMM with ``n_airmass`` components;
    otherwise the supplied labels are only plotted.

    Returns the labels.
    """
    cH2O = mf2mol_cum(H2O, P, T)
    cO3 = mf2mol_cum(O3, P, T)

    T_surf = T[:, z < 3].mean(axis=1)
    T_grad = np.diff(T[:, z < 6], axis=1).mean(axis=1)
    H2O_tot = cH2O[:, -1]
    O3_tot = cO3[:, -1]

    def standardize(x):
        return (x - x.mean()) / x.std()

    features = np.vstack((standardize(T_surf), standardize(T_grad),
                          standardize(H2O_tot), standardize(O3_tot))).T

    if labels is None:
        pdf = BayesianGaussianMixture(n_components=n_airmass,
                                      covariance_type='full',
                                      max_iter=25000)
        pdf.fit(features)
        labels = pdf.predict(features)

    # (x values, y values, x label, y label) for each of the three panels.
    panels = (
        (T_surf, H2O_tot, 'Mean T (z<3km) [K]', 'Total H2O [mol]'),
        (T_surf, 1e6 * O3_tot, 'Mean T (z<3km) [K]', 'Total O3 [µmol]'),
        (H2O_tot, 1e6 * O3_tot, 'Total H2O [mol]', 'Total O3 [µmol]'),
    )
    plt.figure()
    for ii in range(n_airmass):
        ix = labels == ii
        for position, (xs, ys, x_label, y_label) in enumerate(panels, start=1):
            plt.subplot(1, 3, position)
            plt.plot(xs[ix], ys[ix], '.')
            plt.xlabel(x_label)
            plt.ylabel(y_label)

    return labels
def test_predict_freq(self):
    """Check CMM.predict_freq on the fixture data and on generated blobs."""
    # Fixture data with a single-component mixture.
    single = BayesianGaussianMixture(n_components=1)
    single.fit(X=self.X, y=self.y)
    fixture_cmm = CMM(mixture_model=single,
                      classes=['tokyo', 'paris', 'new york'],
                      missing_label='nan')
    self.assertRaises(NotFittedError, fixture_cmm.predict_freq, X=self.X)

    # Only missing labels: every frequency is zero.
    fixture_cmm.fit(X=self.X, y=self.y_nan)
    freqs = fixture_cmm.predict_freq(X=self.X)
    np.testing.assert_array_equal(np.zeros((len(self.X), 3)), freqs)

    # Weighted labels: frequencies for the first sample.
    fixture_cmm.fit(X=self.X, y=self.y, sample_weight=self.w)
    freqs = fixture_cmm.predict_freq(X=[self.X[0]])
    np.testing.assert_array_equal([[0, 1, 2]], freqs)

    # Generated two-blob data, unfitted 5-component mixture, similarity weights.
    X, y = make_blobs(n_samples=200, centers=2)
    y_nan = np.full_like(y, np.nan, dtype=float)
    blob_cmm = CMM(mixture_model=BayesianGaussianMixture(n_components=5),
                   classes=[0, 1],
                   weight_mode='similarities')
    self.assertRaises(NotFittedError, blob_cmm.predict_freq, X=self.X)

    blob_cmm.fit(X=X, y=y_nan)
    freqs = blob_cmm.predict_freq(X=X)
    np.testing.assert_array_equal(freqs.shape, [200, 2])
    self.assertEqual(freqs.sum(), 0)

    blob_cmm.fit(X=X, y=y)
    freqs = blob_cmm.predict_freq(X=X)
    self.assertTrue(freqs.sum() > 0)
def fit(self, data, categorical_columns=tuple(), ordinal_columns=tuple()):
    """Fit one Bayesian Gaussian mixture per continuous column of ``data``.

    Populates ``self.meta``, ``self.model``, ``self.components``,
    ``self.output_info`` and ``self.output_dim``.

    Parameters
    ----------
    data : array indexable as ``data[:, i]``
        Input table; column ``i`` is described by ``self.meta[i]``.
    categorical_columns, ordinal_columns : sequence
        Forwarded unchanged to ``self.get_metadata``.
    """
    self.meta = self.get_metadata(data, categorical_columns, ordinal_columns)
    model = []

    self.output_info = []
    self.output_dim = 0
    self.components = []
    for id_, info in enumerate(self.meta):
        if info['type'] == CONTINUOUS:
            # n_components is passed by keyword: recent scikit-learn releases
            # make estimator constructor parameters keyword-only.
            gm = BayesianGaussianMixture(
                n_components=self.n_clusters,
                weight_concentration_prior_type='dirichlet_process',
                weight_concentration_prior=0.001,
                n_init=1)
            gm.fit(data[:, id_].reshape([-1, 1]))
            model.append(gm)
            # Boolean mask of the components whose weight exceeds self.eps.
            comp = gm.weights_ > self.eps
            n_active = np.sum(comp)
            self.components.append(comp)
            self.output_info += [(1, 'tanh'), (n_active, 'softmax')]
            self.output_dim += 1 + n_active
        else:
            # Non-continuous columns get no mixture model.
            model.append(None)
            self.components.append(None)
            self.output_info += [(info['size'], 'softmax')]
            self.output_dim += info['size']

    self.model = model
def encode(self, x):
    """Encode each row of ``x`` as the parameters of a fitted Bayesian GMM.

    Every row is turned into a sample set by ``utils.tosample`` and fitted
    with a spherical mixture of ``self.n_components`` components. The
    components are ordered by ascending mean.

    Returns an array of shape ``(x.shape[0], 3 * n_components)`` laid out
    per row as ``[weights * n_samples | means | covariances]``.

    When ``self.visualization == 1`` each row is plotted together with its
    mixture components.
    """
    k = self.n_components

    # Sampling is completed for all rows before any fit starts, as before.
    samples = [np.array(utils.tosample(row)).reshape(-1, 1) for row in x]

    BGM45 = np.zeros((x.shape[0], 3 * k))
    for i, sample in enumerate(samples):
        BGM = BayesianGaussianMixture(n_components=k,
                                      covariance_type='spherical',
                                      weight_concentration_prior=1e-10,
                                      max_iter=500)
        BGM.fit(sample)

        means = np.reshape(BGM.means_, (-1,))
        # Sort by mean so the encoding does not depend on component order.
        permu = np.argsort(means)
        BGM45[i][k:2 * k] = means[permu]
        BGM45[i][2 * k:3 * k] = BGM.covariances_[permu]
        # Weights are scaled by the number of samples drawn for this row.
        BGM45[i][0:k] = BGM.weights_[permu] * len(sample)

        if self.visualization == 1:
            plt.plot(x[i])
            X = np.linspace(0, self.lofd, num=200, endpoint=False)
            Ys = utils.toGM(X, k, BGM45[i][k:2 * k],
                            BGM45[i][2 * k:3 * k], BGM45[i][0:k])
            for j in range(k):
                plt.plot(X, Ys[j])
            plt.ylim(0, 255)
            plt.show()
    return BGM45
def predict_cp_interval(self, n_components = 30):
    '''
    Estimates the (phenotypical) levels of observed amplitudes, regardless
    of order, and classifies each observed time point accordingly. For
    each transition from one inferred level to another, a change-point
    interval is reported.

    :param n_components: maximum number of components of the mixture
        model (default is 30)
    :return: ``(result, state_mix)`` where ``result`` is a DataFrame with
        one row per transition (``begin`` = index of the last observation
        at the old level, ``end`` = index of the first observation at the
        new level) and ``state_mix`` is the fitted mixture.
    '''
    logging.warning("Predicting CP intervals")
    # n_components is passed by keyword: recent scikit-learn releases make
    # estimator constructor parameters keyword-only.
    state_mix = BayesianGaussianMixture(
        n_components=n_components,
        n_init = 10,
        weight_concentration_prior_type = 'dirichlet_distribution',
        verbose = 1,
        max_iter = 500,
        tol=1e-12
    )
    observed_mask = ~np.isnan(self.observation)
    observed = self.observation[observed_mask].reshape(-1, 1)
    state_mix.fit(observed)

    # Replace every non-NaN observation with its predicted level; NaNs stay.
    classified = deepcopy(self.observation)
    classified[observed_mask] = state_mix.predict(observed)

    # Walk the observed points, remembering the previous observed index so a
    # transition is bracketed by (previous observation, current observation).
    # The earlier slice `classified[0:i-1]` skipped index i-1 and failed on
    # an empty slice when the level changed right after the first observation.
    segments = []
    last = None
    previous_index = None
    for i, level in enumerate(classified):
        if np.isnan(level):
            continue
        if last is not None and level != last:
            segments.append((previous_index, i))
        last = level
        previous_index = i

    # Each interval is reported by its two end points.
    result = pd.DataFrame(
        [{'begin': begin, 'end': end} for begin, end in segments])
    return result, state_mix
def fit(self, x, y):
    """Fit one Bayesian Gaussian mixture per class.

    Classes are assumed to be labelled 0 ... N-1. Sets ``self.N`` to the
    number of distinct labels and ``self.models`` to the fitted mixtures,
    in class order.
    """
    self.N = len(np.unique(y))
    self.models = []
    for n in range(self.N):
        x_n = x[y == n]
        # n_components is passed by keyword: recent scikit-learn releases
        # make estimator constructor parameters keyword-only.
        model = BayesianGaussianMixture(n_components=10)
        model.fit(x_n)
        self.models.append(model)
def fit_GMM(data, num_components):
    """Fit a mixture with ``num_components`` components and label ``data``.

    Returns ``(gmm, predicted_class, num_classes)``: the fitted model, the
    predicted label of every sample, and the number of distinct labels
    that were actually assigned.
    """
    gmm = GMM(n_components=num_components)
    gmm.fit(data)
    predicted_class = gmm.predict(data)
    # Reuse the labels instead of running a second predict pass.
    num_classes = np.unique(predicted_class).shape[0]
    return gmm, predicted_class, num_classes
def bayes_gauss(X):
    """Fit a 10-component Bayesian GMM on ``X``, print its weights and plot it.

    The weights are printed rounded to two decimals; the plot is produced
    by ``plot_gaussian_mixture`` and shown immediately.
    """
    bgm = BayesianGaussianMixture(n_components=10, n_init=10, random_state=42)
    bgm.fit(X)

    rounded_weights = np.round(bgm.weights_, 2)
    print(rounded_weights)

    plot_gaussian_mixture(bgm, X)
    plt.show()
def fit(self, X, Y):
    """Fit one Bayesian Gaussian mixture per class and estimate class priors.

    Classes are assumed to be numbered 0...K-1. Sets ``self.K``,
    ``self.gaussians`` (one fitted mixture per class) and ``self.p_y``
    (normalized class frequencies).
    """
    self.K = len(set(Y))

    self.gaussians = []
    self.p_y = np.zeros(self.K)
    for k in range(self.K):
        print("Fitting gmm", k)
        Xk = X[Y == k]
        self.p_y[k] = len(Xk)
        # n_components is passed by keyword: recent scikit-learn releases
        # make estimator constructor parameters keyword-only.
        gmm = BayesianGaussianMixture(n_components=10)
        gmm.fit(Xk)
        self.gaussians.append(gmm)
    # normalize p(y)
    self.p_y /= self.p_y.sum()
def test_monotonic_likelihood():
    """The lower bound must never decrease between variational iterations.

    Each covariance type is fitted without extra regularization, one
    iteration at a time (``warm_start=True``, ``max_iter=1``), and the
    bound is compared with the previous one until convergence.
    """
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng, scale=7)
    n_components = rand_data.n_components

    for covar_type in COVARIANCE_TYPE:
        X = rand_data.X[covar_type]
        bgmm = BayesianGaussianMixture(
            n_components=2 * n_components,
            covariance_type=covar_type,
            warm_start=True,
            max_iter=1,
            random_state=rng,
            tol=1e-4,
        )
        # np.inf rather than the alias np.infty, which NumPy 2.0 removed.
        current_lower_bound = -np.inf
        # Do one training iteration at a time so we can make sure that the
        # training log likelihood increases after each iteration.
        for _ in range(500):
            prev_lower_bound = current_lower_bound
            current_lower_bound = bgmm.fit(X).lower_bound_
            assert_greater_equal(current_lower_bound, prev_lower_bound)

            if bgmm.converged_:
                break
        assert bgmm.converged_
def gmm_entropy(points, n_est=None, n_components=None):
    """Estimate entropy from a Bayesian Gaussian mixture fitted to ``points``.

    At most ``n_est`` points (default: all) are used for the fit, chosen
    by random permutation when subsampling. ``n_components`` defaults to
    ``int(5*sqrt(d))`` for ``d`` dimensions. The entropy is the negative
    mean log-density of ``n_est`` samples drawn from the fitted mixture.

    Returns ``(H / LN2, dH / LN2)``; the uncertainty ``dH`` is always 0.
    """
    from sklearn.mixture import BayesianGaussianMixture as GMM

    n_points, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = n_points

    # Use a random subset when fewer points are requested than available.
    if n_est >= n_points:
        x = points
    else:
        x = points[permutation(n_points)[:n_est]]

    if n_components is None:
        n_components = int(5*sqrt(d))

    # Points are used unstandardized; standardizing did not appear to help.
    predictor = GMM(n_components=n_components, covariance_type='full',
                    max_iter=1000)
    predictor.fit(x)

    eval_x, _ = predictor.sample(n_est)
    log_density = predictor.score_samples(eval_x)
    H = -np.mean(log_density)
    dH = 0.

    return H / LN2, dH / LN2
def kde_entropy_sklearn_gmm(points, n_est=None, n_components=None):
    """
    Estimate entropy from a kernel density evaluated at mixture samples.

    A Bayesian Gaussian mixture is fitted to (a subset of) *points* and
    *n_est* evaluation points are drawn from it. The log-density of those
    points is computed by ``sklearn_log_density`` from the original
    subset, and the entropy is its negative mean, returned as
    ``H / LN2``.

    Fails for bimodal and dirichlet, similar to statsmodels kde.
    """
    from sklearn.mixture import BayesianGaussianMixture as GMM

    n_points, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = n_points

    # Use a random subset when fewer points are requested than available.
    if n_est >= n_points:
        x = points
    else:
        x = points[permutation(n_points)[:n_est]]

    if n_components is None:
        n_components = int(5*sqrt(d))

    predictor = GMM(n_components=n_components, covariance_type='full',
                    max_iter=1000)
    predictor.fit(x)
    evaluation_points, _ = predictor.sample(n_est)

    logp = sklearn_log_density(x, evaluation_points=evaluation_points)
    return -np.mean(logp) / LN2
def test_bayesian_mixture_fit_predict(seed, max_iter, tol):
    """fit(X).predict(X) and fit_predict(X) must give identical labels."""
    rng = np.random.RandomState(seed)
    rand_data = RandomData(rng, scale=7)
    n_components = 2 * rand_data.n_components

    for covar_type in COVARIANCE_TYPE:
        via_fit = BayesianGaussianMixture(n_components=n_components,
                                          max_iter=max_iter,
                                          random_state=rng,
                                          tol=tol,
                                          reg_covar=0)
        via_fit.covariance_type = covar_type
        # Deep copy so both estimators start from the same random state.
        via_fit_predict = copy.deepcopy(via_fit)
        X = rand_data.X[covar_type]

        labels_fit = via_fit.fit(X).predict(X)
        labels_fit_predict = via_fit_predict.fit_predict(X)
        assert_array_equal(labels_fit, labels_fit_predict)
def wnn_entropy(points, k=None, weights=True, n_est=None, gmm=None):
    r"""
    Weighted Kozachenko-Leonenko nearest-neighbour entropy calculation.

    *points* is a 2-D array; its shape is unpacked as (n, d).

    *k* is the number of neighbours to consider, with default
    $k = \max(\lfloor n^{1/3} \rfloor, 3d)$.  It is ignored when an
    explicit weight vector is given, in which case $k$ is the length of
    that vector.

    *n_est* is the number of points to use for estimating the entropy,
    with default $n_\rm{est} = n$

    *weights* is True for default weights, False for unweighted (using the
    distance to the kth neighbour only), or a vector of weights of length *k*.
    A 1-D vector, a column vector and a row vector are all accepted.

    *gmm* is the number of gaussians to use to model the distribution using
    a gaussian mixture model.  Default is None (equivalent to 0), and the
    points represent an empirical distribution.

    Returns entropy H in bits and its uncertainty.

    Berrett, T. B., Samworth, R.J., Yuan, M., 2016. Efficient multivariate
    entropy estimation via k-nearest neighbour distances.
    https://arxiv.org/abs/1606.00304
    """
    from sklearn.neighbors import NearestNeighbors

    n, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = n

    # reduce size of draw to n_est
    if n_est >= n:
        x = points
    else:
        x = points[permutation(n)[:n_est]]
        n = n_est

    # Default k based on n
    if k is None:
        # Private communication: cube root of n is a good choice for k
        # Personal observation: k should be much bigger than d
        k = max(int(n**(1/3)), 3*d)

    # If weights are given then use them (setting the appropriate k),
    # otherwise use the default weights.
    if isinstance(weights, bool):
        weights = _wnn_weights(k, d, weights)
    else:
        k = len(weights)

    # select knn algorithm ('kd_tree', 'ball_tree' and 'brute' are the
    # explicit alternatives to 'auto')
    algorithm = 'auto'

    n_components = 0 if gmm is None else gmm

    # H = 1/n sum_i=1^n sum_j=1^k w_j log E_{j,i}
    # E_{j,i} = e^-Psi(j) V_d (n-1) z_{j,i}^d = C z^d
    # logC = -Psi(j) + log(V_d) + log(n-1)
    # H = 1/n sum sum w_j logC + d/n sum sum w_j log(z)
    #   = sum w_j logC + d/n sum sum w_j log(z)
    #   = A + d/n B
    Psi = digamma(np.arange(1, k+1))
    logVd = d/2*log(pi) - gammaln(1 + d/2)
    logC = -Psi + logVd + log(n-1)

    # TODO: standardizing points doesn't work.
    # Standardizing is a u-substitution u = sigma x + mu, so the integral
    # would need to be corrected for dU = det(sigma) dx.  Since it is
    # disabled, the correction factor is the identity.
    detDU = 1.

    if n_components > 0:
        # Use Gaussian mixture to model the distribution
        from sklearn.mixture import GaussianMixture as GMM
        predictor = GMM(n_components=n_components, covariance_type='full')
        predictor.fit(x)
        eval_x, _ = predictor.sample(n_est)
        # Evaluation points are fresh samples, not members of x, so the
        # nearest neighbour is a genuine neighbour.
        skip = 0
    else:
        # Empirical distribution
        # TODO: should we use the full draw for kNN and a subset for eval points?
        eval_x = x
        # Evaluation points are in x, so each one finds itself first.
        skip = 1

    tree = NearestNeighbors(algorithm=algorithm, n_neighbors=k+skip)
    tree.fit(x)
    dist, _ind = tree.kneighbors(eval_x, n_neighbors=k+skip,
                                 return_distance=True)
    # Remove first column. Since test points are in x, the first column will
    # be a point from x with distance 0, and can be ignored.
    if skip:
        dist = dist[:, skip:]

    # Find log distances.  This can be problematic for MCMC runs where a
    # step is rejected, and therefore identical points are in the distribution.
    # Ignore them by replacing these points with nan and using nanmean.
    # TODO: need proper analysis of duplicated points in MCMC chain
    dist[dist == 0] = nan
    logdist = log(dist)
    H_unweighted = logC + d*np.nanmean(logdist, axis=0)
    # Flatten the weights so that 1-D vectors supplied by the caller work
    # as well as 2-D column weights; the previous `np.dot(...)[0]` raised
    # for 1-D weights because the product was already a scalar.
    H = np.dot(H_unweighted, np.ravel(weights))
    Hsq_k = np.nanmean((logC[-1] + d*logdist[:, -1])**2)
    # TODO: abs shouldn't be needed?
    if Hsq_k < H**2:
        print("warning: avg(H^2) < avg(H)^2")
    dH = sqrt(abs(Hsq_k - H**2)/n_est)
    return H * detDU / LN2, dH * detDU / LN2
X = np.c_[data_thr.orbit, data_thr.rate, data_thr.rateA, data_thr.rateB, data_thr.rateC, data_thr.rateCA] scaler = StandardScaler() X = scaler.fit_transform(X) # 1 corresponds to data_thr.rate and 4=5-1 to data_thr.rateC w = w / np.sqrt(scaler.var_[1:]) # w = np.exp(-np.exp(3 * w.mean(axis=1))) w = 1. / w.mean(axis=1) ** 2 Html_file = open("gmm_sklearn_files/gmm3_sklearn.html", "w") gmm = BayesianGaussianMixture(n_components=3, alpha_prior=0.1, beta_prior=1, n_init=5) gmm.fit(X) # , weights=w) not implemented in sklearn yet preds = gmm.predict(X) probs = gmm.predict_proba(X) data_thr['preds'] = pd.Series(preds).astype("category") color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink", "brown", "green", "orange"] # Spectral9 color_key = color_key[:len(set(preds))+1] covs = gmm.covariances_ means = gmm.means_ # transform cov for non-standardizeed data: covs = np.array([np.dot(np.diag(np.sqrt(scaler.var_)), np.dot(covs[j], np.diag(np.sqrt(scaler.var_))))
v = vector[0] / sp.linalg.norm(vector[0]) angle = 180* np.arctan(v[1] / v[0]) / np.pi e = Ellipse(xy=center, width=width, height=height, angle=angle, color=clrs[i], alpha=0.5, clip_box = ax.bbox) ax.add_artist(e) ax1_min, ax1_max, ax2_min, ax2_max = plt.axis() plt.xlim((x1_min, x1_max)) plt.ylim((x2_min, x2_max)) plt.title('GMM', fontsize=15) plt.grid(b=True, ls=':', color='#606060') # DPGMM dpgmm = BayesianGaussianMixture(n_components=n_components, covariance_type='full', max_iter=1000, n_init=5, weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=0.1) dpgmm.fit(x) #假定高斯分布的参数是随机变量,且服从dirichlet_process过程,weight_concentration_prior越大越考虑到先验,越小越靠近样本 centers = dpgmm.means_ covs = dpgmm.covariances_ print u'DPGMM均值 = \n', centers print u'DPGMM方差 = \n', covs y_hat = dpgmm.predict(x) print y_hat ax = plt.subplot(212) grid_hat = dpgmm.predict(grid_test) grid_hat = grid_hat.reshape(x1.shape) plt.pcolormesh(x1, x2, grid_hat, cmap=cm) plt.scatter(x[:, 0], x[:, 1], s=20, c=y, cmap=cm, marker='o', edgecolors='#202020') for i, cc in enumerate(zip(centers, covs)): if i not in y_hat:
v = vector[0] / sp.linalg.norm(vector[0]) angle = 180* np.arctan(v[1] / v[0]) / np.pi e = Ellipse(xy=center, width=width, height=height, angle=angle, color=clrs[i], alpha=0.5, clip_box = ax.bbox) ax.add_artist(e) ax1_min, ax1_max, ax2_min, ax2_max = plt.axis() plt.xlim((x1_min, x1_max)) plt.ylim((x2_min, x2_max)) plt.title('GMM', fontsize=15) plt.grid(b=True, ls=':', color='#606060') # DPGMM dpgmm = BayesianGaussianMixture(n_components=n_components, covariance_type='full', max_iter=1000, n_init=5, weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=0.1) dpgmm.fit(x) centers = dpgmm.means_ covs = dpgmm.covariances_ print('DPGMM均值 = \n', centers) print('DPGMM方差 = \n', covs) y_hat = dpgmm.predict(x) print(y_hat) ax = plt.subplot(212) grid_hat = dpgmm.predict(grid_test) grid_hat = grid_hat.reshape(x1.shape) plt.pcolormesh(x1, x2, grid_hat, cmap=cm) plt.scatter(x[:, 0], x[:, 1], s=20, c=y, cmap=cm, marker='o', edgecolors='#202020') for i, cc in enumerate(zip(centers, covs)): if i not in y_hat:
def test_bayesian_mixture_precisions_prior_initialisation():
    """Check validation and initialisation of the precision priors.

    Covers ``degrees_of_freedom_prior`` (rejected value, explicit value and
    a value equal to the number of features) and ``covariance_prior``
    (explicit value per covariance type, rejected spherical value and the
    value chosen when none is given).
    """
    rng = np.random.RandomState(0)
    n_samples, n_features = 10, 2
    X = rng.rand(n_samples, n_features)
    cov_types = ("full", "tied", "diag", "spherical")

    # --- degrees_of_freedom_prior: a value that must be rejected
    invalid_dof = n_features - 1.0
    estimator = BayesianGaussianMixture(
        degrees_of_freedom_prior=invalid_dof, random_state=rng
    )
    dof_message = (
        "The parameter 'degrees_of_freedom_prior' should be "
        "greater than %d, but got %.3f." % (n_features - 1, invalid_dof)
    )
    assert_raise_message(ValueError, dof_message, estimator.fit, X)

    # --- degrees_of_freedom_prior: an explicit valid value is kept as-is
    given_dof = rng.rand() + n_features - 1.0
    estimator = BayesianGaussianMixture(
        degrees_of_freedom_prior=given_dof, random_state=rng
    )
    estimator = estimator.fit(X)
    assert_almost_equal(given_dof, estimator.degrees_of_freedom_prior_)

    # --- degrees_of_freedom_prior: value equal to the number of features
    dof_equal_to_features = n_features
    estimator = BayesianGaussianMixture(
        degrees_of_freedom_prior=dof_equal_to_features, random_state=rng
    )
    estimator = estimator.fit(X)
    assert_almost_equal(
        dof_equal_to_features, estimator.degrees_of_freedom_prior_
    )

    # --- covariance_prior: explicit values are kept for every type
    biased_cov = np.cov(X.T, bias=1)
    given_priors = {
        "full": biased_cov + 10,
        "tied": biased_cov + 5,
        "diag": np.diag(np.atleast_2d(biased_cov)) + 3,
        "spherical": rng.rand(),
    }
    estimator = BayesianGaussianMixture(random_state=rng)
    for cov_type in cov_types:
        expected_prior = given_priors[cov_type]
        estimator.covariance_type = cov_type
        estimator.covariance_prior = expected_prior
        estimator.fit(X)
        assert_almost_equal(expected_prior, estimator.covariance_prior_)

    # --- covariance_prior: a spherical value that must be rejected
    invalid_spherical_prior = -1.0
    estimator = BayesianGaussianMixture(
        covariance_type="spherical",
        covariance_prior=invalid_spherical_prior,
        random_state=rng,
    )
    spherical_message = (
        "The parameter 'spherical covariance_prior' "
        "should be greater than 0., but got %.3f." % invalid_spherical_prior
    )
    assert_raise_message(ValueError, spherical_message, estimator.fit, X)

    # --- covariance_prior: values chosen when no prior is supplied
    unbiased_cov = np.atleast_2d(np.cov(X.T))
    feature_variance = np.var(X, axis=0, ddof=1)
    expected_defaults = {
        "full": unbiased_cov,
        "tied": unbiased_cov,
        "diag": feature_variance,
        "spherical": feature_variance.mean(),
    }
    estimator = BayesianGaussianMixture(random_state=0)
    for cov_type in cov_types:
        estimator.covariance_type = cov_type
        estimator.fit(X)
        assert_almost_equal(
            expected_defaults[cov_type], estimator.covariance_prior_
        )