def fit(self, data, max_iter=DEFAULT_MAX_ITER,
        lr_mean=DEFAULT_LR,
        conv_factor_mean=DEFAULT_CONV_FACTOR):
    """Fit a Gaussian mixture model (GMM) given the data.

    Alternates between Expectation and Maximization steps
    for some number of iterations.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    max_iter : int
        Maximum number of iterations.
        Optional, default: 100.
    lr_mean : float
        Learning rate for the mean.
        Optional, default: 5e-2.
    conv_factor_mean : float
        Convergence factor for the mean.
        Optional, default: 5e-3.

    Returns
    -------
    means, variances, mixture_coefficients : tuple
        Components of the computed Gaussian mixture model.
    """
    self._dimension = data.shape[-1]

    if self.initialisation_method == 'kmeans':
        # Initialize means from k-means centroids and variances from
        # the within-cluster dispersion around each centroid.
        kmeans = RiemannianKMeans(
            metric=self.metric,
            n_clusters=self.n_gaussians,
            init='random',
            mean_method='frechet-poincare-ball')

        centroids = kmeans.fit(X=data, max_iter=100)
        labels = kmeans.predict(X=data)

        self.means = centroids
        self.variances = gs.zeros(self.n_gaussians)

        labeled_data = gs.vstack([labels, gs.transpose(data)])
        labeled_data = gs.transpose(labeled_data)
        for label, centroid in enumerate(centroids):
            label_mask = gs.where(labeled_data[:, 0] == label)
            grouped_by_label = labeled_data[label_mask][:, 1:]
            v = variance(grouped_by_label, centroid, self.metric)
            if grouped_by_label.shape[0] == 1:
                # A singleton cluster has zero empirical variance;
                # add a floor to keep the Gaussian non-degenerate.
                v += MIN_VAR_INIT
            self.variances[label] = v
    else:
        # Random initialization of the means and variances.
        self.means = (gs.random.rand(
            self.n_gaussians,
            self._dimension) - 0.5) / self._dimension
        self.variances = gs.random.rand(self.n_gaussians) / 10 + 0.8

    self.mixture_coefficients = \
        gs.ones(self.n_gaussians) / self.n_gaussians
    posterior_probabilities = gs.ones((data.shape[0],
                                       self.means.shape[0]))

    # Tabulate the normalization factor of the Riemannian Gaussian
    # over a grid of variances, for use in the E- and M-steps.
    self.variances_range, \
        self.normalization_factor_var, \
        self.phi_inv_var = \
        self.metric.normalization_factor_init(
            gs.arange(ZETA_LOWER_BOUND, ZETA_UPPER_BOUND, ZETA_STEP))

    for epoch in range(max_iter):
        old_posterior_probabilities = posterior_probabilities

        # E-step: update posterior probabilities of cluster membership.
        posterior_probabilities = self._expectation(data)

        condition = gs.mean(
            gs.abs(old_posterior_probabilities
                   - posterior_probabilities))
        if condition < EM_CONV_RATE and epoch > MINIMUM_EPOCHS:
            logging.info('EM converged in %s iterations', epoch)
            return self.means, self.variances, self.mixture_coefficients

        # M-step: update means, variances and mixture coefficients.
        self._maximization(data, posterior_probabilities,
                           lr_means=lr_mean,
                           conv_factor_mean=conv_factor_mean)

    logging.warning('EM did not converge after %s iterations; '
                    'consider increasing max_iter.', max_iter)

    return self.means, self.variances, self.mixture_coefficients
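
# Usage sketch for the signature above, assuming this method belongs to a
# geomstats-style RiemannianEM estimator on the Poincare ball; the exact
# constructor arguments below are illustrative assumptions, not guaranteed:
#
#     from geomstats.geometry.poincare_ball import PoincareBall
#
#     ball = PoincareBall(dim=2)
#     em = RiemannianEM(metric=ball.metric, n_gaussians=2,
#                       initialisation_method='kmeans')
#     means, variances, mixture_coefficients = em.fit(
#         data, max_iter=200, lr_mean=5e-2)
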
def fit(self, data): """Fit a Gaussian mixture model (GMM) given the data. Alternates between Expectation and Maximization steps for some number of iterations. Parameters ---------- data : array-like, shape=[n_samples, n_features] Training data, where n_samples is the number of samples and n_features is the number of features. Returns ------- self : object Return the components of the computed Gaussian mixture model: means, variances and mixture_coefficients. """ self._dimension = data.shape[-1] if self.initialisation_method == "kmeans": kmeans = RiemannianKMeans( metric=self.metric, n_clusters=self.n_gaussians, init="random", init_step_size=self.init_step_size, mean_method="batch", ) centroids = kmeans.fit(X=data) labels = kmeans.predict(X=data) self.means = centroids self.variances = gs.zeros(self.n_gaussians) labeled_data = gs.vstack([labels, gs.transpose(data)]) labeled_data = gs.transpose(labeled_data) for label, centroid in enumerate(centroids): label_mask = gs.where(labeled_data[:, 0] == label) grouped_by_label = labeled_data[label_mask][:, 1:] v = variance(grouped_by_label, centroid, self.metric) if grouped_by_label.shape[0] == 1: v += MIN_VAR_INIT self.variances[label] = v else: self.means = (gs.random.rand(self.n_gaussians, self._dimension) - 0.5) / self._dimension self.variances = gs.random.rand(self.n_gaussians) / 10 + 0.8 self.mixture_coefficients = gs.ones( self.n_gaussians) / self.n_gaussians posterior_probabilities = gs.ones((data.shape[0], self.means.shape[0])) ( self.variances_range, self.normalization_factor_var, self.phi_inv_var, ) = self.normalization_factor_init( gs.arange(ZETA_LOWER_BOUND, ZETA_UPPER_BOUND, ZETA_STEP)) for epoch in range(self.max_iter): old_posterior_probabilities = posterior_probabilities posterior_probabilities = self._expectation(data) condition = gs.mean( gs.abs(old_posterior_probabilities - posterior_probabilities)) if condition < EM_CONV_RATE and epoch > MINIMUM_EPOCHS: logging.info("EM converged in %s iterations", epoch) return self.means, self.variances, self.mixture_coefficients self._maximization(data, posterior_probabilities) logging.info("WARNING: EM did not converge \n" "Please increase MINIMUM_EPOCHS.") return self.means, self.variances, self.mixture_coefficients