Example #1
    def fit(self, data, categorical_columns=tuple(), ordinal_columns=tuple()):
        self.meta = get_metadata(data, categorical_columns, ordinal_columns)
        model = []

        self.output_info = []
        self.output_dim = 0
        self.components = []
        for id_, info in enumerate(self.meta):
            if info["type"] == "continuous":
                gm = BayesianGaussianMixture(
                    self.n_clusters,
                    weight_concentration_prior_type="dirichlet_process",
                    weight_concentration_prior=0.001,
                    n_init=1,
                )
                gm.fit(data.iloc[:, id_].values.reshape([-1, 1]))
                model.append(gm)
                comp = gm.weights_ > self.eps
                self.components.append(comp)

                self.output_info += [(1, "tanh"), (np.sum(comp), "softmax")]
                self.output_dim += 1 + np.sum(comp)
            else:
                model.append(None)
                self.components.append(None)
                self.output_info += [(info["size"], "softmax")]
                self.output_dim += info["size"]

        self.model = model
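
Below is a minimal, self-contained sketch of the component-pruning idea used above: with a Dirichlet-process prior and a small weight_concentration_prior, the weights of unneeded components collapse toward zero, which is why thresholding gm.weights_ against a small eps recovers the active modes. The synthetic data, n_components=10 and eps=0.005 are illustrative assumptions, not values from the original class.

import numpy as np
from sklearn.mixture import BayesianGaussianMixture

rng = np.random.RandomState(0)
# a 1-D column with two well-separated modes
col = np.concatenate([rng.normal(-3, 0.5, 500), rng.normal(4, 1.0, 500)])

gm = BayesianGaussianMixture(
    n_components=10,
    weight_concentration_prior_type="dirichlet_process",
    weight_concentration_prior=0.001,
    n_init=1,
    random_state=0,
)
gm.fit(col.reshape(-1, 1))

eps = 0.005
comp = gm.weights_ > eps                   # boolean mask of "active" components
print("active components:", comp.sum())    # typically 2 for this data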
Example #2
    def fit(self, sequence):

        #        # Normalize sequence
        #        self.norm_std = np.std(sequence)
        #        sequence = sequence / self.norm_std

        # Create training matrix
        train_matrix, labels = self._create_training_matrix(
            sequence, self.n_features)

        # Include labels in feature matrix
        train_matrix_extended = np.column_stack((train_matrix, labels))
        N, dims_TOT = np.shape(train_matrix_extended)

        # GMM fit
        gmm = BayesianGaussianMixture(n_components=self.n_components,
                                      covariance_type='full',
                                      max_iter=1000)
        gmm.fit(train_matrix_extended)

        # Get gmm_parameters
        alpha = gmm.weights_
        mu = gmm.means_
        sigma = self._get_covariances(gmm)
        n_components = gmm.n_components

        self.gmm_parameters = {
            'n_components': n_components,
            'n_features': self.n_features,
            'alpha': alpha,
            'mu': mu,
            'sigma': sigma
        }
Example #3
    def cluster(self,
                dim,
                method='dpgmm',
                max_n_clusters=80,
                max_iter=300,
                refresh=True):
        '''
        dim is the dim index for clustering
        '''
        print('clustering DPGMM')
        from sklearn.mixture import BayesianGaussianMixture as DPGMM
        dpgmm = DPGMM(n_components=max_n_clusters,
                      covariance_type='full',
                      weight_concentration_prior=1e-3,
                      weight_concentration_prior_type='dirichlet_process',
                      init_params="kmeans",
                      max_iter=max_iter,
                      random_state=0,
                      verbose=1,
                      verbose_interval=10)  # init can be "kmeans" or "random"
        dpgmm.fit(self.fet[:, dim])
        label = dpgmm.predict(self.fet[:, dim])
        self.clu.membership = label
        self.clu.__construct__()
        self.clu.emit('cluster')

        if refresh is True:
            self.set_data(self.fet, self.clu)
        return label
Example #4
    def _bgm_fit(self, x):
        """Fit a Bayesian Gaussian Mixture to the data given by x.

        Parameters
        ----------
        x : array-like, shape (n_samples, n_attributes)
            The data to be fit.

        Returns
        -------
        model : BayesianGaussianMixture from the sklearn package
            The BayesianGaussianMixture object that has been fit to the data.
        """
        model = BGM(n_components=self.n_components,
                    tol=self.tol,
                    max_iter=self.max_iter,
                    n_init=self.n_init,
                    covariance_type=self.cov_type,
                    weight_concentration_prior_type=self.weight_concentration_prior_type,
                    weight_concentration_prior=self.weight_concentration_prior)
        data = x.astype('float32')
        model.fit(data)

        return model
def test_bayesian_mixture_predict_predict_proba():
    # this is the same test as test_gaussian_mixture_predict_predict_proba()
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    for prior_type in PRIOR_TYPE:
        for covar_type in COVARIANCE_TYPE:
            X = rand_data.X[covar_type]
            Y = rand_data.Y
            bgmm = BayesianGaussianMixture(
                n_components=rand_data.n_components,
                random_state=rng,
                weight_concentration_prior_type=prior_type,
                covariance_type=covar_type)

            # Check a warning message arrive if we don't do fit
            assert_raise_message(NotFittedError,
                                 "This BayesianGaussianMixture instance"
                                 " is not fitted yet. Call 'fit' with "
                                 "appropriate arguments before using "
                                 "this method.", bgmm.predict, X)

            bgmm.fit(X)
            Y_pred = bgmm.predict(X)
            Y_pred_proba = bgmm.predict_proba(X).argmax(axis=1)
            assert_array_equal(Y_pred, Y_pred_proba)
            assert_greater_equal(adjusted_rand_score(Y, Y_pred), .95)
    def __init__(self, **kwargs):
        super().__init__(data_set=kwargs.pop('data_set', None), **kwargs)

        self.clf_ = kwargs.get('clf', None)
        if self.clf_ is None:
            raise ValueError("missing required keyword-only argument 'clf'")
        if not callable(getattr(self.clf_, 'fit', None)) or not callable(
                getattr(self.clf_, 'predict_proba', None)):
            raise TypeError(
                "'clf' must be an instance with the methods 'fit' and 'predict_proba'"
            )

        n_components = int(
            kwargs.pop('n_components', np.min([20, len(self.data_set_)])))
        if n_components < 1 or n_components > len(self.data_set_):
            raise ValueError(
                "'n_components' must be an integer in the interval [1, n_samples]"
            )

        # fit Gaussian mixture model for pre-clustering
        gmm = BayesianGaussianMixture(n_components=n_components,
                                      covariance_type='spherical',
                                      max_iter=1000,
                                      random_state=self.random_state_)
        gmm.fit(self.data_set_.X_)
        self.y_cluster_ = gmm.predict(self.data_set_.X_)
        self.p_x_ = np.exp(gmm.score_samples(self.data_set_.X_))
Example #7
 def setUp(self):
     self.random_state = 1
     self.X, self.y = load_breast_cancer(return_X_y=True)
     self.X = StandardScaler().fit_transform(self.X)
     mixture_model = BayesianGaussianMixture(n_components=2)
     mixture_model.fit(self.X)
     self.clf = CMM(mixture_model=mixture_model)
Example #8
    def fit(self, X, Y):
        """Function fitting the gaussian
        X - inputs
        Y - classes
        """

        self.classes = len(set(Y))  # We assume classes are in (0...K-1)
        self.gaussians = []
        self.p_y = np.zeros(self.classes)  # p(y)

        for individual_class in range(self.classes):

            print("Fitting GMM for the %s class" % individual_class)

            X_class = X[Y == individual_class]

            self.p_y[individual_class] = len(X_class)

            # Each Gaussian is a Bayesian Gaussian Mixture Object
            # The 10 argument is the maximum number of clusters (chosen arbitrarily, could be more)

            GMM = BayesianGaussianMixture(10)

            # The fit function performs the variational inference update (an iterative algorithm, so it can take a while)

            GMM.fit(X_class)

            self.gaussians.append(GMM)

            print("Finished fitting the GMM for the %s class" %
                  individual_class)
            print("======================================================")

        self.p_y = self.p_y / self.p_y.sum()  # This normalizes p(y)
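
The fit above only stores the per-class mixtures and p(y); what follows is a hedged sketch of a matching predict step (not part of the original class): score each sample under every class-conditional mixture with score_samples, add log p(y), and take the argmax.

import numpy as np

def predict(self, X):
    # log p(x | y=k) + log p(y=k), up to a constant shared by all classes
    log_post = np.zeros((len(X), self.classes))
    for k, gmm in enumerate(self.gaussians):
        log_post[:, k] = gmm.score_samples(X) + np.log(self.p_y[k])
    return np.argmax(log_post, axis=1)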
    def gibbs_resampling_EM(self, iter_n=1):
        self.itr = 0
        Np = len(self.Rot)
        for iter in range(iter_n):
            EM = BayesianGaussianMixture(n_components=10)
            EM.fit(self.pf_debug[:, 0:2])
            for i in range(Np):
                sample, _ = EM.sample()  # sample() returns (X, labels); keep the sampled point
                self.Rot[i].theta += 0.5 * np.random.randn(
                ) + 90.0 * np.random.choice(4, p=[0.8, 0.05, 0.1, 0.05])
                self.Rot[i].x = np.squeeze(sample)[0]
                self.Rot[i].y = np.squeeze(sample)[1]

            self.likelihood_PF()
            W = self.scores / np.sum(
                self.scores)  # Normalized scores for resampling
            Np = len(self.Rot)
            index = np.random.choice(a=Np, size=Np, p=W)  # resample by score
            Rot_arr = []
            Rot_arr = self.Rot  # create a new temporary array for the new samples

            kmeans = KMeans(n_clusters=4,
                            init='k-means++',
                            max_iter=300,
                            n_init=10,
                            random_state=0)
            kmeans.fit(self.pf_debug[:, 2].reshape((-1, 1)))
            index = np.random.choice(a=4, size=Np)
            for i, idx in enumerate(index):
                self.Rot[i].theta = np.squeeze(
                    kmeans.cluster_centers_[idx]) + 0.5 * np.random.randn()
                self.Rot[i].x += np.random.normal(0, 0.01)
                self.Rot[i].y += np.random.normal(0, 0.01)

            print('resample done')
Example #10
 def run(self):
     args = self.args
     uniblock_path = self._get_uniblock_path()
     feature = load(os.path.join(uniblock_path, 'feature.dump'))
     X = feature.get_feature_matrix(args.corpus_path)
     legal, mask = self._infer_nonzero(X)
     dump(legal, os.path.join(uniblock_path, 'legal.dump'))
     dump(mask, os.path.join(uniblock_path, 'mask.dump'))
     X = X[:, legal]
     bgm = BayesianGaussianMixture(
         n_components=args.k,
         covariance_type=args.cov,
         max_iter=200,
         random_state=0,
         verbose=0 if not args.verbose else 2,
         verbose_interval=1,
         tol=args.tol,
         n_init=args.n_init,
         init_params=args.init_params,
     )
     bgm.fit(X)
     dump(bgm, os.path.join(uniblock_path, 'bgm.dump'))
     scores = bgm.score_samples(X)
     self._log_scores(scores, args.corpus_path)
     self._log_stats(scores, args.corpus_path)
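
A hedged, self-contained sketch of the scoring idea in run() above, on synthetic data (the feature construction and the 5th-percentile cutoff are illustrative assumptions, not taken from uniblock): fit a BayesianGaussianMixture on per-line feature vectors, use score_samples as a quality score, and drop the lowest-scoring lines.

import numpy as np
from sklearn.mixture import BayesianGaussianMixture

rng = np.random.RandomState(0)
X_clean = rng.normal(0, 1, (500, 4))   # stand-in for features of clean lines
X_noisy = rng.normal(5, 3, (25, 4))    # stand-in for features of corrupted lines
X = np.vstack([X_clean, X_noisy])

bgm = BayesianGaussianMixture(n_components=5, covariance_type='full',
                              max_iter=200, random_state=0).fit(X)
scores = bgm.score_samples(X)
keep = scores >= np.percentile(scores, 5)   # discard the 5% least likely lines
print("kept", keep.sum(), "of", len(X), "lines")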
class VBEM(object):
    def __init__(self, n_components=5, dataset=None):
        self.model = BayesianGaussianMixture(n_components=n_components,
                                             max_iter=10000)
        self.n_components = n_components
        self.class_num = dataset.class_num
        self.data_num = dataset.data_num
        self.data = dataset.data
        self.label = dataset.label
        self.bestVBEM_k = 0
        self.model.fit(self.data)

    def draw(self):
        label = self.model.predict(self.data)
        self.bestVBEM_k = max(label) + 1
        data_2d = pd.DataFrame(self.data, columns=['x', 'y'])
        label_2d = pd.DataFrame(label, columns=['label'])
        label_names = np.unique(label)
        colors = [
            plt.cm.tab10(i / float(len(label_names)))
            for i in range(len(label_names))
        ]
        tmp_2d = pd.concat([data_2d, label_2d], axis=1)

        plt.figure()
        for i, label in enumerate(label_names):
            plt.scatter(tmp_2d.loc[tmp_2d.label == label].x,
                        tmp_2d.loc[tmp_2d.label == label].y,
                        s=5,
                        color=colors[i],
                        alpha=0.5)
        plt.title('Best GMM with VBEM_' + str(self.class_num) + '_' +
                  str(self.data_num))
        plt.savefig('res/GMM_VBEM_' + str(self.class_num) + '_' +
                    str(self.data_num) + '.jpg')
Example #12
class VBEM(object):
    def __init__(self,
                 n_components=1,
                 verbose=2,
                 verbose_interval=1,
                 data=None):
        self.model = BayesianGaussianMixture(n_components=n_components,
                                             verbose=verbose,
                                             verbose_interval=verbose_interval)
        self.n_components = n_components
        if data is None:
            self.dataset = Dataset()
            self.dataset.generate()
        else:
            self.dataset = data
        self.data = self.dataset.data

    def train(self):
        self.model.fit(self.data)

    def show(self, n=None):
        plt.figure()
        self.model.fit(self.data)
        labels = self.model.predict(self.data)
        plt.scatter(self.data[:, 0], self.data[:, 1], c=labels, s=10)
        if n is None:
            plt.show()
        else:
            plt.savefig('Pro2/vbem_%d_%d' % (n, 4))
Example #13
 def test_predict(self):
     mixture = BayesianGaussianMixture(n_components=1, random_state=0)
     mixture.fit(X=self.X)
     cmm = CMM(
         mixture_model=mixture, classes=['tokyo', 'paris', 'new york'],
         missing_label='nan', random_state=0
     )
     self.assertRaises(NotFittedError, cmm.predict, X=self.X)
     cmm.fit(X=self.X, y=self.y_nan)
     y = cmm.predict(self.X)
     np.testing.assert_array_equal(['paris', 'tokyo', 'tokyo'], y)
     cmm = CMM(
         mixture_model=mixture, classes=['tokyo', 'paris'],
         missing_label='nan', random_state=1
     )
     cmm.fit(X=self.X, y=self.y_nan)
     y = cmm.predict(self.X)
     np.testing.assert_array_equal(['tokyo', 'tokyo', 'paris'], y)
     cmm.fit(X=self.X, y=self.y, sample_weight=self.w)
     y = cmm.predict(self.X)
     np.testing.assert_array_equal(['tokyo', 'tokyo', 'tokyo'], y)
     cmm = CMM(
         mixture_model=mixture, classes=['tokyo', 'paris'],
         missing_label='nan', cost_matrix=[[0, 1], [10, 0]]
     )
     cmm.fit(X=self.X, y=self.y)
     y = cmm.predict(self.X)
     np.testing.assert_array_equal(['paris', 'paris', 'paris'], y)
     cmm.fit(X=self.X, y=self.y, sample_weight=self.w)
     y = cmm.predict(self.X)
     np.testing.assert_array_equal(['paris', 'paris', 'paris'], y)
Example #14
    def bayesian_gaussian_mixture(self, n_components, weight_concentration_prior_type, weight_concentration_prior,
                                mean_precision_prior, n_init, max_iter, init_params):
        '''Bayesian Gaussian Mixture clustering algorithm. Low value for weight_concentration_prior will put more
        weight on a few components, high value will allow a larger number of components to be active in the mixture.'''
        bgm = BayesianGaussianMixture(n_components=n_components,
                                      weight_concentration_prior_type=weight_concentration_prior_type,
                                      weight_concentration_prior=weight_concentration_prior,
                                      mean_precision_prior=mean_precision_prior,
                                      n_init=n_init,
                                      max_iter=max_iter,
                                      init_params=init_params)
        bgm.fit(self.X)
        self.labels = bgm.predict(self.X)

        unique, counts = np.unique(self.labels, return_counts=True)
        mydict = dict(zip(unique, counts))
        print(mydict)

        plt.bar(list(mydict.keys()), mydict.values(), color = 'g')
        plt.ylabel("Number of skews")
        plt.xlabel("Cluster")
        plt.title(weight_concentration_prior_type)

        plt.gcf().text(0.05, 0.05, "Parameters initialized using: "+init_params)
        plt.gcf().text(0.05, 0.01, "Weight concentration prior: "+str(weight_concentration_prior))
        plt.gcf().text(0.7, 0.05, "Mean precision prior: "+str(mean_precision_prior))
        plt.gcf().text(0.7, 0.01, "Likelihood: "+str("%.2f"%bgm.lower_bound_))
        #plt.show()

        print("Weights: "+str(bgm.weights_))
        print("Converged: "+str(bgm.converged_))
        print("Number of iterations to reach convergence: "+str(bgm.n_iter_))
        print("Lower bound value on likelihood: "+str(bgm.lower_bound_))
        print("Bayesian Gaussian mixture complete")
def pca_gmm_gen_mdl(X, n_pca=15, n_gmm=10, scree=False, w=None):
    pca, Xr, Xm = pca_mdl(X, n_components=n_pca, w=w)
    pdf = BayesianGaussianMixture(n_components=n_gmm,
                                  covariance_type='full',
                                  max_iter=25000)
    pdf.fit(Xr)
    if scree:
        plt.figure()
        plt.plot(np.cumsum(pca.explained_variance_ratio_))
        plt.title('Scree plot')
        plt.xlabel('PCA Components')
        plt.ylabel('Explained Variance')
    if w is not None:

        def gen_samples(n):
            Xr_n, _ = pdf.sample(n)
            Xr_n_ll = pdf.score_samples(Xr_n)
            return Xr_n, pca.inverse_transform(Xr_n) / w[None, :], Xr_n_ll
    else:

        def gen_samples(n):
            Xr_n, _ = pdf.sample(n)
            Xr_n_ll = pdf.score_samples(Xr_n)
            return Xr_n, pca.inverse_transform(Xr_n), Xr_n_ll

    return gen_samples, Xr, Xm
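
A hedged, self-contained sketch of the PCA + mixture generative pattern above (pca_mdl and the optional weighting by w belong to the original code and are omitted here): fit PCA, fit a BayesianGaussianMixture in the reduced space, then sample and map the draws back with inverse_transform.

import numpy as np
from sklearn.decomposition import PCA
from sklearn.mixture import BayesianGaussianMixture

rng = np.random.RandomState(0)
X = rng.normal(size=(500, 20))

pca = PCA(n_components=5).fit(X)
Xr = pca.transform(X)

pdf = BayesianGaussianMixture(n_components=4, covariance_type='full',
                              max_iter=2000, random_state=0).fit(Xr)

Xr_new, _ = pdf.sample(10)             # draw in the reduced space
X_new = pca.inverse_transform(Xr_new)  # map back to the original space
log_lik = pdf.score_samples(Xr_new)    # log-density of each draw
print(X_new.shape, log_lik.shape)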
Example #16
 def partition_data(self,args):
     method, j = args
     if method== "vi":
         dp = BayesianGaussianMixture(n_components = self.K,weight_concentration_prior = self.alpha, max_iter=1,init_params='kmeans',weight_concentration_prior_type='dirichlet_process')
         dp.fit(self.X[self.U[j]])
         Z = dp.predict(self.X[self.U[j]]).astype(int)
         Z_star = dp.predict(self.X_star).astype(int)
     if method=="gmm":
         Z,Z_star= self.uncollapsed_dp_partition_alt(j)
     elif method=="kmean":
         km = KMeans(n_clusters=self.K)
         Z = km.fit_predict(self.X[self.U[j]]).astype(int)
         Z_star = km.predict(self.X_star[self.U[j]]).astype(int)
     else:
         Z = np.random.choice(self.K,size = self.N_minibatch,replace=True)
         Z_star = np.random.choice(np.unique(Z),size = self.N_star,replace=True)
     le = LE()
     le.fit(np.hstack((Z,Z_star)))
     Z = le.transform(Z)
     Z_star = le.transform(Z_star)
     if (method=="vi"): #& (self.vi_partition):
         Z_diff = np.setdiff1d(Z_star,Z)
         if Z_diff.size > 0:
             idx = np.hstack([np.where(Z_star == k)[0] for k in Z_diff])
             unique_Z = np.unique(Z)
             post_Z = dp.predict_proba(self.X_star[idx])[:,unique_Z]
             Z_star[idx] = [np.random.choice(unique_Z,p = post_Z_i / post_Z_i.sum() ) for post_Z_i in post_Z]
             assert(np.setdiff1d(Z_star,Z).size == 0)
     return(Z,Z_star)
def test_bayesian_mixture_predict_predict_proba():
    # this is the same test as test_gaussian_mixture_predict_predict_proba()
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    for prior_type in PRIOR_TYPE:
        for covar_type in COVARIANCE_TYPE:
            X = rand_data.X[covar_type]
            Y = rand_data.Y
            bgmm = BayesianGaussianMixture(
                n_components=rand_data.n_components,
                random_state=rng,
                weight_concentration_prior_type=prior_type,
                covariance_type=covar_type)

            # Check a warning message arrive if we don't do fit
            assert_raise_message(
                NotFittedError, "This BayesianGaussianMixture instance"
                " is not fitted yet. Call 'fit' with "
                "appropriate arguments before using "
                "this estimator.", bgmm.predict, X)

            bgmm.fit(X)
            Y_pred = bgmm.predict(X)
            Y_pred_proba = bgmm.predict_proba(X).argmax(axis=1)
            assert_array_equal(Y_pred, Y_pred_proba)
            assert adjusted_rand_score(Y, Y_pred) >= .95
Example #18
    def load(self, phipsis):
        self.length = len(phipsis)

        num_component = min(10, self.length)
        gm_ = GM(n_components=num_component)
        gm_.fit(X=phipsis)
        weights = gm_.weights_
        to_keep = weights > 0.05
        num_component = sum(to_keep)

        gm = GM(n_components=num_component)
        gm.fit(X=phipsis)
        precisions = gm.precisions_cholesky_

        # self.means = gm.means_
        self.phipsis = phipsis
        weight = np.mean(precisions[:, 0, 0]) \
                 + np.mean(precisions[:, 1, 1])
        weight = weight * self.weight_scaling_factor  # for matcher weight
        self.weight = min(weight, 1)
        self.weight *= self.weight_accom_factor
        covs = gm.covariances_
        cov_invs = np.array([np.linalg.inv(cov) for cov in covs])
        cluster_dist = gm.predict_proba(phipsis)
        self.cov_dist = np.einsum("ijk, li->ljk", cov_invs, cluster_dist)
        self.gm = gm  # for matcher weight
        # matcher_weight should be a product of the precision/clustering
        # behaviour of the distribution, and the posterior probability of the
        #  queried point. So, higher clustering but point does not belong in
        # distribution => other pressures acting on queried point => should
        # assign lower weight. Lower clustering and point belong => low
        # clustering means low pressure on point, so it shouldn't matter that
        #  much.
        return
def test_check_covariance_precision():
    # We check that the dot product of the covariance and the precision
    # matrices is identity.
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng, scale=7)
    n_components, n_features = 2 * rand_data.n_components, 2

    # Computation of the full_covariance
    bgmm = BayesianGaussianMixture(n_components=n_components,
                                   max_iter=100, random_state=rng, tol=1e-3,
                                   reg_covar=0)
    for covar_type in COVARIANCE_TYPE:
        bgmm.covariance_type = covar_type
        bgmm.fit(rand_data.X[covar_type])

        if covar_type == 'full':
            for covar, precision in zip(bgmm.covariances_, bgmm.precisions_):
                assert_almost_equal(np.dot(covar, precision),
                                    np.eye(n_features))
        elif covar_type == 'tied':
            assert_almost_equal(np.dot(bgmm.covariances_, bgmm.precisions_),
                                np.eye(n_features))

        elif covar_type == 'diag':
            assert_almost_equal(bgmm.covariances_ * bgmm.precisions_,
                                np.ones((n_components, n_features)))

        else:
            assert_almost_equal(bgmm.covariances_ * bgmm.precisions_,
                                np.ones(n_components))
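
A short, hedged illustration (synthetic data) of the property this test checks, shown for the 'full' covariance type: each fitted covariance matrix times its precision matrix is the identity.

import numpy as np
from sklearn.mixture import BayesianGaussianMixture

X = np.random.RandomState(0).randn(300, 2)
bgmm = BayesianGaussianMixture(n_components=2, covariance_type='full',
                               random_state=0).fit(X)
for cov, prec in zip(bgmm.covariances_, bgmm.precisions_):
    assert np.allclose(cov @ prec, np.eye(2))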
Example #20
def test_bayesian_mixture_weights_prior_initialisation():
    rng = np.random.RandomState(0)
    n_samples, n_components, n_features = 10, 5, 2
    X = rng.rand(n_samples, n_features)

    # Check raise message for a bad value of weight_concentration_prior
    bad_weight_concentration_prior_ = 0.0
    bgmm = BayesianGaussianMixture(
        weight_concentration_prior=bad_weight_concentration_prior_,
        random_state=0)
    msg = ("The parameter 'weight_concentration_prior' should be greater "
           f"than 0., but got {bad_weight_concentration_prior_:.3f}.")
    with pytest.raises(ValueError, match=msg):
        bgmm.fit(X)

    # Check correct init for a given value of weight_concentration_prior
    weight_concentration_prior = rng.rand()
    bgmm = BayesianGaussianMixture(
        weight_concentration_prior=weight_concentration_prior,
        random_state=rng).fit(X)
    assert_almost_equal(weight_concentration_prior,
                        bgmm.weight_concentration_prior_)

    # Check correct init for the default value of weight_concentration_prior
    bgmm = BayesianGaussianMixture(n_components=n_components,
                                   random_state=rng).fit(X)
    assert_almost_equal(1.0 / n_components, bgmm.weight_concentration_prior_)
    def fit(self, X, Y):
        # number of classes = number of unique elements of Y:
        self.K = len(set(Y))
        self.N = len(X)

        # gaussians for every class:
        self.gauss = []

        # the probability of class, p(Y), for every class:
        self.p_y = []

        # assuming that classes are in [0, K-1],
        # calculate stats for every class:
        for i in range(self.K):
            t0 = datetime.now()
            Xi = X[Y == i]
            # calculate the mean per feature:
            mean_Xi = np.mean(Xi, axis=0)
            # create a GMM model:
            gmm = BayesianGaussianMixture(
                n_components=10)  # n_components = max # clusters
            # fit the data to the gmm:
            print('Fitting GMM', i)
            gmm.fit(Xi)
            print('elapsed time:', datetime.now() - t0, '\n')
            # save to the storage:
            self.gauss.append({'model': gmm, 'mean': mean_Xi})
            # the probability of class, p(Y=k) = #k_class_samples / #all_samples:
            self.p_y.append(len(Xi) / self.N)
Example #22
    def do_bgm(self, n_components=6, seed=42):
        """Bayesian Gaussian Mixture.

        Infer the effective number of components in a Gaussian Mixture Model via variational Bayesian estimation.

        n_effective_components < n_components if the model sets some weights close to 0.

        Args:
            n_components (int): Number of components in GMM.
            seed (int): Random seed.

        Returns:
            bgm_output (dict): Labels and probabilities.

        """

        np.random.seed(seed)
        bgm = BayesianGaussianMixture(n_components=n_components, covariance_type='full', weight_concentration_prior=1e-2, weight_concentration_prior_type='dirichlet_process', mean_precision_prior=1e-2, init_params='random', max_iter=100, random_state=seed)

        bgm.fit(self.X)
        bgm_labels = bgm.predict(self.X)
        bgm_prob = bgm.predict_proba(self.X)[:,0]

        bgm_output = {'bgm_labels': bgm_labels, 'bgm_prob': bgm_prob}

        return bgm_output
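
A minimal sketch (illustrative values, not from the original class) of reading the effective number of components off the fitted weights, as the docstring above describes: components the variational fit does not need end up with weights near zero.

import numpy as np
from sklearn.mixture import BayesianGaussianMixture

rng = np.random.RandomState(42)
X = np.vstack([rng.normal(0, 1, (200, 2)), rng.normal(6, 1, (200, 2))])

bgm = BayesianGaussianMixture(n_components=6,
                              weight_concentration_prior_type='dirichlet_process',
                              weight_concentration_prior=1e-2,
                              max_iter=200, random_state=42).fit(X)
n_effective = int(np.sum(bgm.weights_ > 1e-2))
print(np.round(np.sort(bgm.weights_)[::-1], 2), "effective:", n_effective)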
Example #23
    def init_parameters(self, data):
        self.transmat_ = np.ones((self.num_unique_states, self.num_unique_states))
        self.transmat_ = self.transmat_ / np.sum(self.transmat_, axis=1)
        self.transmat_ = self.transmat_ / np.sum(self.transmat_, axis=1).reshape(1, -1).T
        self.emission_matrix = np.zeros((self.num_unique_states, self.num_observations))
        self.means_ = np.random.rand(self.num_unique_states)
        self.covars_ = np.ones(self.num_unique_states)

        # main_kmeans = cluster.KMeans(n_clusters=self.n_components,
        #                              random_state=self.random_state)
        # labels = main_kmeans.fit_predict(data)
        # kmeanses = []
        # random_state = check_random_state(None)
        # for label in range(self.n_components):
        #     kmeans = cluster.KMeans(n_clusters=self.n_mix,
        #                             random_state=self.random_state)
        #     kmeans.fit(data[np.where(labels == label)])
        #     kmeanses.append(kmeans)
        # for i, kmeans in enumerate(kmeanses):
        #     self.means_[i] = kmeans.cluster_centers_

        # Run simple EM (no HMM)
        iterations = 40
        reshaped_data = data.reshape(-1, 1)
        assignments, centers, _ = kmeans.kmeans_best_of_n(reshaped_data, self.num_unique_states, n_trials=5)
        new_centers = [distributions.Gaussian(c.mean, np.eye(1)) \
                       for c in centers]
        tau, obs_distr, pi, gmm_ll_train, gmm_ll_test = \
            em.em(reshaped_data, new_centers, assignments, n_iter=iterations)
        for i in range(len(centers)):
            self.means_[i] = centers[i].mean
        self.startprob_ = pi
        gmm = BayesianGaussianMixture(n_components=3, init_params="kmeans", max_iter=1500)
        gmm.fit(data.reshape(-1, 1))
        self.means_ = gmm.means_.flatten()
def test_check_covariance_precision():
    # We check that the dot product of the covariance and the precision
    # matrices is identity.
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng, scale=7)
    n_components, n_features = 2 * rand_data.n_components, 2

    # Computation of the full_covariance
    bgmm = BayesianGaussianMixture(n_components=n_components,
                                   max_iter=100,
                                   random_state=rng,
                                   tol=1e-3,
                                   reg_covar=0)
    for covar_type in COVARIANCE_TYPE:
        bgmm.covariance_type = covar_type
        bgmm.fit(rand_data.X[covar_type])

        if covar_type == 'full':
            for covar, precision in zip(bgmm.covariances_, bgmm.precisions_):
                assert_almost_equal(np.dot(covar, precision),
                                    np.eye(n_features))
        elif covar_type == 'tied':
            assert_almost_equal(np.dot(bgmm.covariances_, bgmm.precisions_),
                                np.eye(n_features))

        elif covar_type == 'diag':
            assert_almost_equal(bgmm.covariances_ * bgmm.precisions_,
                                np.ones((n_components, n_features)))

        else:
            assert_almost_equal(bgmm.covariances_ * bgmm.precisions_,
                                np.ones(n_components))
def airmass_labels(z, P, T, H2O, O3, n_airmass=5, labels=None):
    cH2O = mf2mol_cum(H2O, P, T)
    cO3 = mf2mol_cum(O3, P, T)
    T_surf = T[:, z < 3].mean(axis=1)
    T_grad = np.diff(T[:, z < 6], axis=1).mean(axis=1)
    H2O_tot = cH2O[:, -1]
    O3_tot = cO3[:, -1]
    f = lambda x: (x - x.mean()) / x.std()
    features = np.vstack((f(T_surf), f(T_grad), f(H2O_tot), f(O3_tot))).T
    if labels is None:
        pdf = BayesianGaussianMixture(n_components=n_airmass,
                                      covariance_type='full',
                                      max_iter=25000)
        pdf.fit(features)
        labels = pdf.predict(features)
    plt.figure()
    for ii in range(n_airmass):
        ix = labels == ii
        plt.subplot(1, 3, 1)
        plt.plot(T_surf[ix], H2O_tot[ix], '.')
        plt.xlabel('Mean T (z<3km) [K]')
        plt.ylabel('Total H2O [mol]')
        plt.subplot(1, 3, 2)
        plt.plot(T_surf[ix], 1e6 * O3_tot[ix], '.')
        plt.xlabel('Mean T (z<3km) [K]')
        plt.ylabel('Total O3 [µmol]')
        plt.subplot(1, 3, 3)
        plt.plot(H2O_tot[ix], 1e6 * O3_tot[ix], '.')
        plt.xlabel('Total H2O [mol]')
        plt.ylabel('Total O3 [µmol]')
    return labels
Example #26
 def test_predict_freq(self):
     mixture = BayesianGaussianMixture(n_components=1)
     mixture.fit(X=self.X, y=self.y)
     cmm = CMM(mixture_model=mixture,
               classes=['tokyo', 'paris', 'new york'], missing_label='nan')
     self.assertRaises(NotFittedError, cmm.predict_freq, X=self.X)
     cmm.fit(X=self.X, y=self.y_nan)
     F = cmm.predict_freq(X=self.X)
     np.testing.assert_array_equal(np.zeros((len(self.X), 3)), F)
     cmm.fit(X=self.X, y=self.y, sample_weight=self.w)
     F = cmm.predict_freq(X=[self.X[0]])
     np.testing.assert_array_equal([[0, 1, 2]], F)
     X, y = make_blobs(n_samples=200, centers=2)
     y_nan = np.full_like(y, np.nan, dtype=float)
     mixture = BayesianGaussianMixture(n_components=5)
     cmm = CMM(mixture_model=mixture, classes=[0, 1],
               weight_mode='similarities')
     self.assertRaises(NotFittedError, cmm.predict_freq, X=self.X)
     cmm.fit(X=X, y=y_nan)
     F = cmm.predict_freq(X=X)
     np.testing.assert_array_equal(F.shape, [200, 2])
     self.assertEqual(F.sum(), 0)
     cmm.fit(X=X, y=y)
     F = cmm.predict_freq(X=X)
     self.assertTrue(F.sum() > 0)
Example #27
    def fit(self, data, categorical_columns=tuple(), ordinal_columns=tuple()):
        self.meta = self.get_metadata(data, categorical_columns,
                                      ordinal_columns)
        model = []

        self.output_info = []
        self.output_dim = 0
        self.components = []
        for id_, info in enumerate(self.meta):
            if info['type'] == CONTINUOUS:
                gm = BayesianGaussianMixture(
                    self.n_clusters,
                    weight_concentration_prior_type='dirichlet_process',
                    weight_concentration_prior=0.001,
                    n_init=1)
                gm.fit(data[:, id_].reshape([-1, 1]))
                model.append(gm)
                comp = gm.weights_ > self.eps
                self.components.append(comp)

                self.output_info += [(1, 'tanh'), (np.sum(comp), 'softmax')]
                self.output_dim += 1 + np.sum(comp)
            else:
                model.append(None)
                self.components.append(None)
                self.output_info += [(info['size'], 'softmax')]
                self.output_dim += info['size']

        self.model = model
Example #28
File: gmm.py Project: Adamli12/M-protein
 def encode(self,x):
     samples=list()
     for i in range(x.shape[0]):#sampling for BGM
         samples.append(np.array(utils.tosample(x[i])).reshape(-1,1))
     allmeans=[]
     allcovs=[]
     allweights=[]
     BGM45=np.zeros((x.shape[0],3*self.n_components))
     for i in range(x.shape[0]):
         #BGM=BayesianGaussianMixture(n_components=self.n_components,covariance_type='spherical',weight_concentration_prior=1e-10,max_iter=5000,tol=1e-7,n_init=5)
         BGM=BayesianGaussianMixture(n_components=self.n_components,covariance_type='spherical',weight_concentration_prior=1e-10,max_iter=500)
         BGM.fit(samples[i])
         means=np.reshape(BGM.means_,(-1,))
         permu=np.argsort(means)
         means=means[permu]
         BGM45[i][self.n_components:2*self.n_components]=means
         covs=BGM.covariances_
         covs=covs[permu]
         BGM45[i][2*self.n_components:3*self.n_components]=covs
         weights=BGM.weights_
         weights=weights[permu]
         BGM45[i][0:self.n_components]=weights*len(samples[i])
         if self.visualization==1:
             plt.plot(x[i])
             X=np.linspace(0,self.lofd,num=200,endpoint=False)
             Ys=utils.toGM(X,self.n_components,BGM45[i][self.n_components:2*self.n_components],BGM45[i][2*self.n_components:3*self.n_components],BGM45[i][0:self.n_components])
             for j in range(self.n_components):
                 plt.plot(X,Ys[j])
                 plt.ylim(0,255)
             plt.show()
     return BGM45
Example #29
 def predict_cp_interval(self, n_components = 30):
     '''
     Estimates the (phenotypical) levels of observed amplitudes, regardless of order. Consequently, each
     observed time-point is classified. Between each transition from one inferred level to another one, a
     change-point with uniform distribution is inferred.
     
     :param n_components: maximum number of components of the mixture model (default is 30)
     '''
     
     logging.warning("Predicting CP intervals")
      
     state_mix = BayesianGaussianMixture(
         n_components, 
         n_init = 10,
         weight_concentration_prior_type = 'dirichlet_distribution',
         verbose = 1,
         max_iter = 500,
         tol=1e-12
     )
     
     observed = self.observation[~np.isnan(self.observation)].reshape(-1, 1)
     
     state_mix.fit(observed)
     
     classified = deepcopy(self.observation)
     predicted = state_mix.predict(classified[~np.isnan(classified)].reshape(-1, 1))
     classified[~np.isnan(classified)] = predicted
     
     last = None
     begin = 0
     for i, c in enumerate(classified):
         if not np.isnan(c):
             last = c
             begin = i
             break
     
     segments = []
     for i in range(begin, classified.shape[0]):
         if not np.isnan(classified[i]):
             if classified[i] != last:
                 s = np.max(np.argwhere(~np.isnan(classified[0:i-1])))
                 segments.append((s, i))
             last = classified[i]
             begin = i
     
     
     # calculate uniform distribution parameters
     result = []
     for segment in segments:
         a = segment[0]
         b = segment[1]
         distro = {
             'begin': a,
             'end': b
         }
         result.append(distro)
     result = pd.DataFrame(result)
     
     return result, state_mix
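
A minimal sketch of the level-classification idea behind predict_cp_interval, on a synthetic piecewise-constant trace (all values illustrative): fit a Dirichlet-distribution BayesianGaussianMixture to the observed amplitudes, classify every time point, and read candidate change points off the label transitions.

import numpy as np
from sklearn.mixture import BayesianGaussianMixture

rng = np.random.RandomState(0)
trace = np.concatenate([rng.normal(0.0, 0.05, 200),
                        rng.normal(1.0, 0.05, 150),
                        rng.normal(0.3, 0.05, 250)])

mix = BayesianGaussianMixture(n_components=10, n_init=3,
                              weight_concentration_prior_type='dirichlet_distribution',
                              max_iter=500, random_state=0)
labels = mix.fit(trace.reshape(-1, 1)).predict(trace.reshape(-1, 1))
change_points = np.where(np.diff(labels) != 0)[0] + 1
print(len(change_points), "label transitions")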
Example #30
 def fit(self, x, y):
     self.N = len(np.unique(y))
     self.models = []
     for n in range(self.N):
         x_n = x[y == n]
         model = BayesianGaussianMixture(10)
         model.fit(x_n)
         self.models.append(model)
Example #31
def fit_GMM(data,num_components):
    gmm = GMM(n_components=num_components)
    gmm.fit(data)

    predicted_class = gmm.predict(data)
    num_classes = np.unique(gmm.predict(data)).shape[0]

    return gmm,predicted_class,num_classes
Example #32
 def bayes_gauss(X):
     bgm = BayesianGaussianMixture(n_components=10,
                                   n_init=10,
                                   random_state=42)
     bgm.fit(X)
     print(np.round(bgm.weights_, 2))
     plot_gaussian_mixture(bgm, X)
     plt.show()
  def fit(self, X, Y):
    # assume classes are numbered 0...K-1
    self.K = len(set(Y))

    self.gaussians = []
    self.p_y = np.zeros(self.K)
    for k in range(self.K):
      print("Fitting gmm", k)
      Xk = X[Y == k]
      self.p_y[k] = len(Xk)
      gmm = BayesianGaussianMixture(10)
      gmm.fit(Xk)
      self.gaussians.append(gmm)
    # normalize p(y)
    self.p_y /= self.p_y.sum()
Example #34
def test_monotonic_likelihood():
    # We check that each step of variational inference without regularization
    # monotonically improves the lower bound on the training set.
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng, scale=7)
    n_components = rand_data.n_components

    for covar_type in COVARIANCE_TYPE:
        X = rand_data.X[covar_type]
        bgmm = BayesianGaussianMixture(
            n_components=2 * n_components,
            covariance_type=covar_type,
            warm_start=True,
            max_iter=1,
            random_state=rng,
            tol=1e-4,
        )
        current_lower_bound = -np.inf
        # Do one training iteration at a time so we can make sure that the
        # training log likelihood increases after each iteration.
        for _ in range(500):
            prev_lower_bound = current_lower_bound
            current_lower_bound = bgmm.fit(X).lower_bound_
            assert_greater_equal(current_lower_bound, prev_lower_bound)

            if bgmm.converged_:
                break
        assert bgmm.converged_
Example #35
File: entropy.py Project: bumps/bumps
def gmm_entropy(points, n_est=None, n_components=None):
    #from sklearn.mixture import GaussianMixture as GMM
    from sklearn.mixture import BayesianGaussianMixture as GMM
    n, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = n

    # reduce size of draw to n_est
    if n_est >= n:
        x = points
    else:
        x = points[permutation(n)[:n_est]]
        n = n_est

    if n_components is None:
        n_components = int(5*sqrt(d))

    ## Standardization doesn't seem to help
    ## Note: sigma may be zero
    #x, mu, sigma = standardize(x)   # if standardized
    predictor = GMM(n_components=n_components, covariance_type='full',
                    #verbose=True,
                    max_iter=1000)
    predictor.fit(x)
    eval_x, _ = predictor.sample(n_est)
    weight_x = predictor.score_samples(eval_x)
    H = -np.mean(weight_x)
    #with np.errstate(divide='ignore'): H = H + np.sum(np.log(sigma))   # if standardized
    dH = 0.
    ## cross-check against own calcs
    #alt = GaussianMixture(predictor.weights_, mu=predictor.means_, sigma=predictor.covariances_)
    #print("alt", H, alt.entropy())
    #print(np.vstack((weight_x[:10], alt.logpdf(eval_x[:10]))).T)
    return H / LN2, dH / LN2
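
A hedged sanity check of the estimator above on a case with a known answer (illustrative, not part of bumps): for a d-dimensional standard normal the differential entropy is d/2 * log2(2*pi*e) bits, which the GMM-based estimate should roughly reproduce.

import numpy as np
from sklearn.mixture import BayesianGaussianMixture

rng = np.random.RandomState(0)
d = 2
x = rng.normal(size=(4000, d))

predictor = BayesianGaussianMixture(n_components=5, covariance_type='full',
                                    max_iter=1000, random_state=0).fit(x)
eval_x, _ = predictor.sample(4000)
H_bits = -np.mean(predictor.score_samples(eval_x)) / np.log(2)

H_true = 0.5 * d * np.log2(2 * np.pi * np.e)
print("estimate: %.2f bits, analytic: %.2f bits" % (H_bits, H_true))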
Example #36
File: entropy.py Project: bumps/bumps
def kde_entropy_sklearn_gmm(points, n_est=None, n_components=None):
    """
    Use sklearn.neighbors.KernelDensity pdf to estimate entropy.

    Data is standardized before kde.

    Evaluation points are drawn from a Gaussian mixture model fitted to the original points.

    Fails for bimodal and dirichlet, similar to statsmodels kde.
    """
    from sklearn.mixture import BayesianGaussianMixture as GMM
    n, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = n

    # reduce size of draw to n_est
    if n_est >= n:
        x = points
    else:
        x = points[permutation(n)[:n_est]]
        n = n_est

    if n_components is None:
        n_components = int(5*sqrt(d))

    predictor = GMM(n_components=n_components, covariance_type='full',
                    #verbose=True,
                    max_iter=1000)
    predictor.fit(x)
    evaluation_points, _ = predictor.sample(n_est)

    logp = sklearn_log_density(x, evaluation_points=evaluation_points)
    H = -np.mean(logp)
    return H / LN2
def test_bayesian_mixture_fit_predict(seed, max_iter, tol):
    rng = np.random.RandomState(seed)
    rand_data = RandomData(rng, scale=7)
    n_components = 2 * rand_data.n_components

    for covar_type in COVARIANCE_TYPE:
        bgmm1 = BayesianGaussianMixture(n_components=n_components,
                                        max_iter=max_iter, random_state=rng,
                                        tol=tol, reg_covar=0)
        bgmm1.covariance_type = covar_type
        bgmm2 = copy.deepcopy(bgmm1)
        X = rand_data.X[covar_type]

        Y_pred1 = bgmm1.fit(X).predict(X)
        Y_pred2 = bgmm2.fit_predict(X)
        assert_array_equal(Y_pred1, Y_pred2)
Example #38
File: entropy.py Project: bumps/bumps
def wnn_entropy(points, k=None, weights=True, n_est=None, gmm=None):
    r"""
    Weighted Kozachenko-Leonenko nearest-neighbour entropy calculation.

    *k* is the number of neighbours to consider, with default $k=n^{1/3}$

    *n_est* is the number of points to use for estimating the entropy,
    with default $n_\rm{est} = n$

    *weights* is True for default weights, False for unweighted (using the
    distance to the kth neighbour only), or a vector of weights of length *k*.

    *gmm* is the number of gaussians to use to model the distribution using
    a gaussian mixture model.  Default is 0, and the points represent an
    empirical distribution.

    Returns entropy H in bits and its uncertainty.

    Berrett, T. B., Samworth, R.J., Yuan, M., 2016. Efficient multivariate
    entropy estimation via k-nearest neighbour distances.
    https://arxiv.org/abs/1606.00304
    """
    from sklearn.neighbors import NearestNeighbors
    n, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = n

    # reduce size of draw to n_est
    if n_est >= n:
        x = points
    else:
        x = points[permutation(n)[:n_est]]
        n = n_est

    # Default k based on n
    if k is None:
        # Private communication: cube root of n is a good choice for k
        # Personal observation: k should be much bigger than d
        k = max(int(n**(1/3)), 3*d)

    # If weights are given then use them (setting the appropriate k),
    # otherwise use the default weights.
    if isinstance(weights, bool):
        weights = _wnn_weights(k, d, weights)
    else:
        k = len(weights)
    #print("weights", weights, sum(weights))

    # select knn algorithm
    algorithm = 'auto'
    #algorithm = 'kd_tree'
    #algorithm = 'ball_tree'
    #algorithm = 'brute'

    n_components = 0 if gmm is None else gmm

    # H = 1/n sum_i=1^n sum_j=1^k w_j log E_{j,i}
    # E_{j,i} = e^-Psi(j) V_d (n-1) z_{j,i}^d = C z^d
    # logC = -Psi(j) + log(V_d) + log(n-1)
    # H = 1/n sum sum w_j logC + d/n sum sum w_j log(z)
    #   = sum w_j logC + d/n sum sum w_j log(z)
    #   = A + d/n B
    # H^2 = 1/n sum
    Psi = digamma(np.arange(1, k+1))
    logVd = d/2*log(pi) - gammaln(1 + d/2)
    logC = -Psi + logVd + log(n-1)

    # TODO: standardizing points doesn't work.
    # Standardize the data so that distances conform.  This is equivalent to
    # a u-substitution u = sigma x + mu, so the integral needs to be corrected
    # for dU = det(sigma) dx.  Since the standardization squishes the dimensions
    # independently, sigma is a diagonal matrix, with the determinant equal to
    # the product of the diagonal elements.
    #x, mu, sigma = standardize(x)  # Note: sigma may be zero
    #detDU = np.prod(sigma)
    detDU = 1.

    if n_components > 0:
        # Use Gaussian mixture to model the distribution
        from sklearn.mixture import GaussianMixture as GMM
        predictor = GMM(n_components=gmm, covariance_type='full')
        predictor.fit(x)
        eval_x, _ = predictor.sample(n_est)
        #weight_x = predictor.score_samples(eval_x)
        skip = 0
    else:
        # Empirical distribution
        # TODO: should we use the full draw for kNN and a subset for eval points?
        # Choose a subset for evaluating the entropy estimate, if desired
        #print(n_est, n)
        #eval_x = x if n_est >= n else x[permutation(n)[:n_est]]
        eval_x = x
        #weight_x = 1
        skip = 1

    tree = NearestNeighbors(algorithm=algorithm, n_neighbors=k+skip)
    tree.fit(x)
    dist, _ind = tree.kneighbors(eval_x, n_neighbors=k+skip, return_distance=True)
    # Remove first column. Since test points are in x, the first column will
    # be a point from x with distance 0, and can be ignored.
    if skip:
        dist = dist[:, skip:]
    # Find log distances.  This can be problematic for MCMC runs where a
    # step is rejected, and therefore identical points are in the distribution.
    # Ignore them by replacing these points with nan and using nanmean.
    # TODO: need proper analysis of duplicated points in MCMC chain
    dist[dist == 0] = nan
    logdist = log(dist)
    H_unweighted = logC + d*np.nanmean(logdist, axis=0)
    H = np.dot(H_unweighted, weights)[0]
    Hsq_k = np.nanmean((logC[-1] + d*logdist[:,-1])**2)
    # TODO: abs shouldn't be needed?
    if Hsq_k < H**2:
        print("warning: avg(H^2) < avg(H)^2")
    dH = sqrt(abs(Hsq_k - H**2)/n_est)
    #print("unweighted", H_unweighted)
    #print("weighted", H, Hsq_k, H**2, dH, detDU, LN2)
    return H * detDU / LN2, dH * detDU / LN2
Example #39
X = np.c_[data_thr.orbit, data_thr.rate, data_thr.rateA, data_thr.rateB,
          data_thr.rateC, data_thr.rateCA]

scaler = StandardScaler()
X = scaler.fit_transform(X)

# 1 corresponds to data_thr.rate and 4=5-1 to data_thr.rateC
w = w / np.sqrt(scaler.var_[1:])
# w = np.exp(-np.exp(3 * w.mean(axis=1)))
w = 1. / w.mean(axis=1) ** 2

Html_file = open("gmm_sklearn_files/gmm3_sklearn.html", "w")

# Note: alpha_prior/beta_prior come from a pre-release sklearn API; the
# corresponding released parameters are weight_concentration_prior and
# mean_precision_prior.
gmm = BayesianGaussianMixture(n_components=3, weight_concentration_prior=0.1,
                              mean_precision_prior=1, n_init=5)
gmm.fit(X)  # , weights=w) not implemented in sklearn yet
preds = gmm.predict(X)
probs = gmm.predict_proba(X)

data_thr['preds'] = pd.Series(preds).astype("category")

color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink",
             "brown", "green", "orange"]  # Spectral9
color_key = color_key[:len(set(preds))+1]

covs = gmm.covariances_
means = gmm.means_

# transform cov for non-standardizeed data:
covs = np.array([np.dot(np.diag(np.sqrt(scaler.var_)),
                        np.dot(covs[j], np.diag(np.sqrt(scaler.var_))))
                 for j in range(len(covs))])
        v = vector[0] / sp.linalg.norm(vector[0])
        angle = 180* np.arctan(v[1] / v[0]) / np.pi
        e = Ellipse(xy=center, width=width, height=height,
                    angle=angle, color=clrs[i], alpha=0.5, clip_box = ax.bbox)
        ax.add_artist(e)

    ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title('GMM', fontsize=15)
    plt.grid(b=True, ls=':', color='#606060')

    # DPGMM
    dpgmm = BayesianGaussianMixture(n_components=n_components, covariance_type='full', max_iter=1000, n_init=5,
                                    weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=0.1)
    dpgmm.fit(x)  # assume the Gaussian parameters are random variables following a Dirichlet process; a larger weight_concentration_prior leans more on the prior, a smaller one follows the samples more closely
    centers = dpgmm.means_
    covs = dpgmm.covariances_
    print('DPGMM means = \n', centers)
    print('DPGMM covariances = \n', covs)
    y_hat = dpgmm.predict(x)
    print(y_hat)

    ax = plt.subplot(212)
    grid_hat = dpgmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
    plt.scatter(x[:, 0], x[:, 1], s=20, c=y, cmap=cm, marker='o', edgecolors='#202020')

    for i, cc in enumerate(zip(centers, covs)):
        if i not in y_hat:
        v = vector[0] / sp.linalg.norm(vector[0])
        angle = 180* np.arctan(v[1] / v[0]) / np.pi
        e = Ellipse(xy=center, width=width, height=height,
                    angle=angle, color=clrs[i], alpha=0.5, clip_box = ax.bbox)
        ax.add_artist(e)

    ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title('GMM', fontsize=15)
    plt.grid(b=True, ls=':', color='#606060')

    # DPGMM
    dpgmm = BayesianGaussianMixture(n_components=n_components, covariance_type='full', max_iter=1000, n_init=5,
                                    weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=0.1)
    dpgmm.fit(x)
    centers = dpgmm.means_
    covs = dpgmm.covariances_
    print('DPGMM means = \n', centers)
    print('DPGMM covariances = \n', covs)
    y_hat = dpgmm.predict(x)
    print(y_hat)

    ax = plt.subplot(212)
    grid_hat = dpgmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
    plt.scatter(x[:, 0], x[:, 1], s=20, c=y, cmap=cm, marker='o', edgecolors='#202020')

    for i, cc in enumerate(zip(centers, covs)):
        if i not in y_hat:
Example #42
def test_bayesian_mixture_precisions_prior_initialisation():
    rng = np.random.RandomState(0)
    n_samples, n_features = 10, 2
    X = rng.rand(n_samples, n_features)

    # Check raise message for a bad value of degrees_of_freedom_prior
    bad_degrees_of_freedom_prior_ = n_features - 1.0
    bgmm = BayesianGaussianMixture(degrees_of_freedom_prior=bad_degrees_of_freedom_prior_, random_state=rng)
    assert_raise_message(
        ValueError,
        "The parameter 'degrees_of_freedom_prior' should be "
        "greater than %d, but got %.3f." % (n_features - 1, bad_degrees_of_freedom_prior_),
        bgmm.fit,
        X,
    )

    # Check correct init for a given value of degrees_of_freedom_prior
    degrees_of_freedom_prior = rng.rand() + n_features - 1.0
    bgmm = BayesianGaussianMixture(degrees_of_freedom_prior=degrees_of_freedom_prior, random_state=rng).fit(X)
    assert_almost_equal(degrees_of_freedom_prior, bgmm.degrees_of_freedom_prior_)

    # Check correct init for the default value of degrees_of_freedom_prior
    degrees_of_freedom_prior_default = n_features
    bgmm = BayesianGaussianMixture(degrees_of_freedom_prior=degrees_of_freedom_prior_default, random_state=rng).fit(X)
    assert_almost_equal(degrees_of_freedom_prior_default, bgmm.degrees_of_freedom_prior_)

    # Check correct init for a given value of covariance_prior
    covariance_prior = {
        "full": np.cov(X.T, bias=1) + 10,
        "tied": np.cov(X.T, bias=1) + 5,
        "diag": np.diag(np.atleast_2d(np.cov(X.T, bias=1))) + 3,
        "spherical": rng.rand(),
    }

    bgmm = BayesianGaussianMixture(random_state=rng)
    for cov_type in ["full", "tied", "diag", "spherical"]:
        bgmm.covariance_type = cov_type
        bgmm.covariance_prior = covariance_prior[cov_type]
        bgmm.fit(X)
        assert_almost_equal(covariance_prior[cov_type], bgmm.covariance_prior_)

    # Check raise message for a bad spherical value of covariance_prior
    bad_covariance_prior_ = -1.0
    bgmm = BayesianGaussianMixture(
        covariance_type="spherical", covariance_prior=bad_covariance_prior_, random_state=rng
    )
    assert_raise_message(
        ValueError,
        "The parameter 'spherical covariance_prior' "
        "should be greater than 0., but got %.3f." % bad_covariance_prior_,
        bgmm.fit,
        X,
    )

    # Check correct init for the default value of covariance_prior
    covariance_prior_default = {
        "full": np.atleast_2d(np.cov(X.T)),
        "tied": np.atleast_2d(np.cov(X.T)),
        "diag": np.var(X, axis=0, ddof=1),
        "spherical": np.var(X, axis=0, ddof=1).mean(),
    }

    bgmm = BayesianGaussianMixture(random_state=0)
    for cov_type in ["full", "tied", "diag", "spherical"]:
        bgmm.covariance_type = cov_type
        bgmm.fit(X)
        assert_almost_equal(covariance_prior_default[cov_type], bgmm.covariance_prior_)