    def fit(self,
            data,
            max_iter=DEFAULT_MAX_ITER,
            lr_mean=DEFAULT_LR,
            conv_factor_mean=DEFAULT_CONV_FACTOR):
        """Fit a Gaussian mixture model (GMM) given the data.

        Alternates between Expectation and Maximization steps
        for some number of iterations.

        Parameters
        ----------
        data : array-like, shape=[n_samples, n_features]
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        max_iter : int
            Optional, default: 100.
            Maximum number of iterations.
        lr_mean : float
            Optional, default: 5e-2.
            Learning rate for the mean.
        conv_factor_mean : float
            Optional, default: 5e-3.
            Convergence factor for the mean.

        Returns
        -------
        means, variances, mixture_coefficients : tuple
            Components of the fitted Gaussian mixture model:
            the means, the variances and the mixture coefficients.
        """
        self._dimension = data.shape[-1]
        if self.initialisation_method == 'kmeans':
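            # K-means initialisation: cluster the data with Riemannian
            # k-means, then seed the mixture with the centroids and the
            # per-cluster variances computed below.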
            kmeans = RiemannianKMeans(metric=self.metric,
                                      n_clusters=self.n_gaussians,
                                      init='random',
                                      mean_method='frechet-poincare-ball')

            centroids = kmeans.fit(X=data, max_iter=100)
            labels = kmeans.predict(X=data)

            self.means = centroids
            self.variances = gs.zeros(self.n_gaussians)

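            # Prepend each sample's cluster label as an extra first column
            # so points can be grouped per cluster below.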
            labeled_data = gs.vstack([labels, gs.transpose(data)])
            labeled_data = gs.transpose(labeled_data)
            for label, centroid in enumerate(centroids):
                label_mask = gs.where(labeled_data[:, 0] == label)
                grouped_by_label = labeled_data[label_mask][:, 1:]
                v = variance(grouped_by_label, centroid, self.metric)
                if grouped_by_label.shape[0] == 1:
                    v += MIN_VAR_INIT
                self.variances[label] = v
        else:
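            # Random initialisation: means drawn near the origin of the
            # ball, variances drawn uniformly in [0.8, 0.9).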
            self.means = (gs.random.rand(self.n_gaussians, self._dimension) -
                          0.5) / self._dimension
            self.variances = gs.random.rand(self.n_gaussians) / 10 + 0.8

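        # Start from uniform mixture weights and uniform responsibilities.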
        self.mixture_coefficients = (
            gs.ones(self.n_gaussians) / self.n_gaussians)
        posterior_probabilities = gs.ones((data.shape[0], self.means.shape[0]))

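        # Precompute a lookup table for the normalization factor of the
        # Riemannian Gaussian over a grid of variance values.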
        (self.variances_range,
         self.normalization_factor_var,
         self.phi_inv_var) = self.metric.normalization_factor_init(
             gs.arange(ZETA_LOWER_BOUND, ZETA_UPPER_BOUND, ZETA_STEP))

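        # EM loop: stop once the responsibilities stabilise (after a
        # minimum number of epochs) or max_iter is reached.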
        for epoch in range(max_iter):
            old_posterior_probabilities = posterior_probabilities

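            # E-step: the responsibility of component k for a sample x is
            # proportional to the mixture weight times the Riemannian
            # Gaussian density exp(-d(x, mu_k)**2 / (2 * sigma_k**2)) / Z(sigma_k),
            # with d the Riemannian distance and Z the normalization
            # factor interpolated from the table above.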
            posterior_probabilities = self._expectation(data)

            condition = gs.mean(
                gs.abs(old_posterior_probabilities - posterior_probabilities))

            if condition < EM_CONV_RATE and epoch > MINIMUM_EPOCHS:
                logging.info('EM converged in %s iterations', epoch)
                return self.means, self.variances, self.mixture_coefficients

            self._maximization(data,
                               posterior_probabilities,
                               lr_means=lr_mean,
                               conv_factor_mean=conv_factor_mean)

        logging.warning('EM did not converge within max_iter iterations. '
                        'Consider increasing max_iter.')

        return self.means, self.variances, self.mixture_coefficients
Example No. 2
    def fit(self, data):
        """Fit a Gaussian mixture model (GMM) given the data.

        Alternates between Expectation and Maximization steps
        for some number of iterations.

        Parameters
        ----------
        data : array-like, shape=[n_samples, n_features]
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        means, variances, mixture_coefficients : tuple
            Components of the fitted Gaussian mixture model:
            the means, the variances and the mixture coefficients.
        """
        self._dimension = data.shape[-1]
        if self.initialisation_method == "kmeans":
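            # Same k-means seeding as in the first example; this variant
            # uses the batch Frechet-mean estimator and an initial step
            # size stored on the estimator.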
            kmeans = RiemannianKMeans(
                metric=self.metric,
                n_clusters=self.n_gaussians,
                init="random",
                init_step_size=self.init_step_size,
                mean_method="batch",
            )

            centroids = kmeans.fit(X=data)
            labels = kmeans.predict(X=data)

            self.means = centroids
            self.variances = gs.zeros(self.n_gaussians)

            labeled_data = gs.vstack([labels, gs.transpose(data)])
            labeled_data = gs.transpose(labeled_data)
            for label, centroid in enumerate(centroids):
                label_mask = gs.where(labeled_data[:, 0] == label)
                grouped_by_label = labeled_data[label_mask][:, 1:]
                v = variance(grouped_by_label, centroid, self.metric)
                if grouped_by_label.shape[0] == 1:
                    v += MIN_VAR_INIT
                self.variances[label] = v
        else:
            self.means = (gs.random.rand(self.n_gaussians, self._dimension) -
                          0.5) / self._dimension
            self.variances = gs.random.rand(self.n_gaussians) / 10 + 0.8

        self.mixture_coefficients = gs.ones(
            self.n_gaussians) / self.n_gaussians
        posterior_probabilities = gs.ones((data.shape[0], self.means.shape[0]))

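        # As above, but the normalization-factor table is now built by the
        # estimator itself rather than by the metric.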
        (
            self.variances_range,
            self.normalization_factor_var,
            self.phi_inv_var,
        ) = self.normalization_factor_init(
            gs.arange(ZETA_LOWER_BOUND, ZETA_UPPER_BOUND, ZETA_STEP))

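        # EM loop as above; max_iter and the maximization hyperparameters
        # are now read from the estimator instead of being passed to fit.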
        for epoch in range(self.max_iter):
            old_posterior_probabilities = posterior_probabilities

            posterior_probabilities = self._expectation(data)

            condition = gs.mean(
                gs.abs(old_posterior_probabilities - posterior_probabilities))

            if condition < EM_CONV_RATE and epoch > MINIMUM_EPOCHS:
                logging.info("EM converged in %s iterations", epoch)
                return self.means, self.variances, self.mixture_coefficients

            self._maximization(data, posterior_probabilities)

        logging.warning("EM did not converge within max_iter iterations. "
                        "Consider increasing max_iter.")

        return self.means, self.variances, self.mixture_coefficients
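
# --- Usage sketch (illustrative addition) ---
# A minimal driver for the fit method above, assuming the geomstats
# RiemannianEM estimator on the Poincare ball. The constructor arguments
# below are inferred from the attributes the code reads (self.metric,
# self.n_gaussians, self.initialisation_method) and are assumptions, not
# a documented signature.
import geomstats.backend as gs
from geomstats.geometry.poincare_ball import PoincareBall
from geomstats.learning.expectation_maximization import RiemannianEM

ball = PoincareBall(dim=2)
em = RiemannianEM(metric=ball.metric,          # assumed constructor arguments
                  n_gaussians=3,
                  initialisation_method='kmeans')
data = 0.5 * (gs.random.rand(100, 2) - 0.5)    # sample points inside the ball
means, variances, mixture_coefficients = em.fit(data)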