def exp_domain(self, tangent_vec, base_point):
    """Compute the domain of the Euclidean exponential map.

    Compute the real interval of time where the Euclidean geodesic
    starting at point `base_point` in direction `tangent_vec` is defined.

    Parameters
    ----------
    tangent_vec : array-like, shape=[n_samples, n, n]
    base_point : array-like, shape=[n_samples, n, n]

    Returns
    -------
    exp_domain : array-like, shape=[n_samples, 2]
    """
    base_point = gs.to_ndarray(base_point, to_ndim=3)
    tangent_vec = gs.to_ndarray(tangent_vec, to_ndim=3)

    # Whiten the tangent vector: bring it to the tangent space at identity.
    inv_sqrt = gs.linalg.powerm(base_point, -.5)
    whitened = gs.matmul(gs.matmul(inv_sqrt, tangent_vec), inv_sqrt)

    eigenvalues = gs.linalg.eigvalsh(whitened)
    smallest = gs.amin(eigenvalues, axis=1)
    largest = gs.amax(eigenvalues, axis=1)

    # The geodesic leaves the space when 1 + t * eig hits zero, so the
    # domain is bounded by -1 / eig for the extreme eigenvalues.
    lower = gs.where(largest <= 0, -math.inf, -1 / largest)
    upper = gs.where(smallest >= 0, math.inf, -1 / smallest)
    lower = gs.to_ndarray(lower, to_ndim=2)
    upper = gs.to_ndarray(upper, to_ndim=2)

    return gs.concatenate((lower, upper), axis=1)
def exp_domain(tangent_vec, base_point):
    """Compute the domain of the Euclidean exponential map.

    Compute the real interval of time where the Euclidean geodesic starting
    at point `base_point` in direction `tangent_vec` is defined.

    Parameters
    ----------
    tangent_vec : array-like, shape=[..., n, n]
        Tangent vector at base point.
    base_point : array-like, shape=[..., n, n]
        Base point.

    Returns
    -------
    exp_domain : array-like, shape=[..., 2]
        Interval of time where the geodesic is defined.
    """
    invsqrt_base_point = SymmetricMatrices.powerm(base_point, -.5)
    reduced_vec = gs.matmul(invsqrt_base_point, tangent_vec)
    reduced_vec = gs.matmul(reduced_vec, invsqrt_base_point)
    eigvals = gs.linalg.eigvalsh(reduced_vec)
    # Reduce over the eigenvalue axis (the last one) so that both a single
    # matrix and a batch of matrices are handled, consistently with the
    # `...` shapes documented above.
    min_eig = gs.amin(eigvals, axis=-1)
    max_eig = gs.amax(eigvals, axis=-1)
    inf_value = gs.where(
        max_eig <= 0., gs.array(-math.inf), - 1. / max_eig)
    inf_value = gs.to_ndarray(inf_value, to_ndim=2)
    # BUG FIX: the supremum of the domain is +inf when all eigenvalues are
    # non-negative (the geodesic never leaves the cone forward in time);
    # the previous code returned -inf, yielding an empty interval.
    sup_value = gs.where(
        min_eig >= 0., gs.array(math.inf), - 1. / min_eig)
    sup_value = gs.to_ndarray(sup_value, to_ndim=2)
    domain = gs.concatenate((inf_value, sup_value), axis=1)
    return domain
def test_geodesic(self):
    """Test geodesic.

    Check that the norm of the velocity is constant.
    """
    n_steps = 100
    start = self.categorical.random_point()
    end = self.categorical.random_point()

    geod = self.metric.geodesic(initial_point=start, end_point=end)
    times = gs.linspace(0.0, 1.0, n_steps)
    points = geod(times)

    # Finite-difference velocity along the discretized geodesic.
    velocity = n_steps * (points[1:, :] - points[:-1, :])
    speeds = self.metric.norm(velocity, points[:-1, :])

    # Relative spread of the speed should vanish for a true geodesic.
    min_speed = gs.amin(speeds)
    result = 1 / min_speed * (gs.amax(speeds) - min_speed)
    expected = 0.0
    self.assertAllClose(expected, result, rtol=1.0)
def _expectation(self, data):
    """Update the posterior probabilities.

    E-step of the EM algorithm: compute, for each sample, the posterior
    probability of having been generated by each Gaussian component.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    Returns
    -------
    posterior_probabilities : array-like, shape=[n_samples, n_gaussians]
        Posterior probability of each sample under each component.
    """
    probability_distribution_function = gmm_pdf(
        data,
        self.means,
        self.variances,
        norm_func=find_normalization_factor,
        metric=self.metric,
        variances_range=self.variances_range,
        norm_func_var=self.normalization_factor_var,
    )

    # BUG FIX: missing space in the implicitly-concatenated message
    # (previously rendered as "functioncontain").
    if gs.isnan(probability_distribution_function.mean()):
        logging.warning("EXPECTATION : Probability distribution function "
                        "contain elements that are not numbers")

    num_normalized_pdf = gs.einsum("j,...j->...j",
                                   self.mixture_coefficients,
                                   probability_distribution_function)
    valid_pdf_condition = gs.amin(gs.sum(num_normalized_pdf, -1))

    # Guard against numerical underflow: rows whose total density is ~0
    # would otherwise divide by zero below.
    if valid_pdf_condition <= PDF_TOL:
        num_normalized_pdf[gs.sum(num_normalized_pdf, -1) <= PDF_TOL] = 1

    sum_pdf = gs.sum(num_normalized_pdf, -1)
    posterior_probabilities = gs.einsum("...i,...->...i",
                                        num_normalized_pdf, 1 / sum_pdf)

    # BUG FIX: `gs.any(...) is None` was always False (gs.any never
    # returns None); detect non-numeric values with an explicit NaN check.
    if gs.isnan(gs.mean(posterior_probabilities)):
        logging.warning("EXPECTATION : posterior probabilities "
                        "contain elements that are not numbers.")

    # BUG FIX: the chained comparison
    # `1 - SUM_CHECK_PDF >= x >= 1 + SUM_CHECK_PDF` could never be True,
    # so this warning was dead code. Warn when the mean row-sum deviates
    # from 1 by more than the tolerance.
    # NOTE(review): assumes SUM_CHECK_PDF is a small positive tolerance —
    # confirm against its definition.
    mean_sum = gs.mean(gs.sum(posterior_probabilities, 1))
    if not 1 - SUM_CHECK_PDF <= mean_sum <= 1 + SUM_CHECK_PDF:
        logging.warning("EXPECTATION : posterior probabilities "
                        "do not sum to 1.")

    # Re-seed components that captured no sample (precision error), so the
    # M-step does not collapse them.
    if gs.any(gs.sum(posterior_probabilities, 0) < PDF_TOL):
        logging.warning("EXPECTATION : Gaussian got no elements "
                        "(precision error) reinitialize")
        posterior_probabilities[posterior_probabilities == 0] = PDF_TOL

    return posterior_probabilities
def exp_domain(self, tangent_vec, base_point):
    """Compute the domain of the Euclidean exponential map.

    Compute the real interval of time where the Euclidean geodesic
    starting at `base_point` in direction `tangent_vec` stays defined.

    Parameters
    ----------
    tangent_vec : array-like, shape=[n_samples, n, n]
    base_point : array-like, shape=[n_samples, n, n]

    Returns
    -------
    exp_domain : array-like, shape=[n_samples, 2]
    """
    base_point = gs.to_ndarray(base_point, to_ndim=3)
    tangent_vec = gs.to_ndarray(tangent_vec, to_ndim=3)

    power = gs.linalg.powerm(base_point, -.5)
    # Conjugate the tangent vector by the inverse square root of the
    # base point (associativity makes the grouping irrelevant).
    vec_at_identity = gs.matmul(power, gs.matmul(tangent_vec, power))
    eigenvalues = gs.linalg.eigvalsh(vec_at_identity)

    lowest = gs.amin(eigenvalues, axis=1)
    highest = gs.amax(eigenvalues, axis=1)

    inf_bound = gs.where(highest <= 0, -math.inf, -1 / highest)
    sup_bound = gs.where(lowest >= 0, math.inf, -1 / lowest)
    inf_bound = gs.to_ndarray(inf_bound, to_ndim=2)
    sup_bound = gs.to_ndarray(sup_bound, to_ndim=2)

    return gs.concatenate((inf_bound, sup_bound), axis=1)
def _expectation(self, data):
    """Update the posterior probabilities.

    E-step of the EM algorithm: compute, for each sample, the posterior
    probability of each Gaussian component on the Poincare ball.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    Returns
    -------
    posterior_probabilities : array-like, shape=[n_samples, n_gaussians]
        Posterior probability of each sample under each component.
    """
    probability_distribution_function = \
        PoincareBall.gmm_pdf(
            data,
            self.means,
            self.variances,
            norm_func=self.riemannian_metric.find_normalization_factor,
            metric=self.riemannian_metric,
            variances_range=self.variances_range,
            norm_func_var=self.normalization_factor_var)

    # BUG FIX: missing space in the implicitly-concatenated message
    # (previously rendered as "functioncontain").
    if gs.isnan(probability_distribution_function.mean()):
        logging.warning('EXPECTATION : Probability distribution function '
                        'contain elements that are not numbers')

    num_normalized_pdf = gs.einsum('j,...j->...j',
                                   self.mixture_coefficients,
                                   probability_distribution_function)
    valid_pdf_condition = gs.amin(gs.sum(num_normalized_pdf, -1))

    # Guard against numerical underflow: rows whose total density is ~0
    # would otherwise divide by zero below.
    if valid_pdf_condition <= PDF_TOL:
        num_normalized_pdf[gs.sum(num_normalized_pdf, -1) <= PDF_TOL] = 1

    sum_pdf = gs.sum(num_normalized_pdf, -1)
    posterior_probabilities =\
        gs.einsum('...i,...->...i', num_normalized_pdf, 1 / sum_pdf)

    # BUG FIX: `gs.any(...) is None` was always False (gs.any never
    # returns None); detect non-numeric values with an explicit NaN check.
    if gs.isnan(gs.mean(posterior_probabilities)):
        logging.warning('EXPECTATION : posterior probabilities '
                        'contain elements that are not numbers.')

    # BUG FIX: the chained comparison
    # `1 - SUM_CHECK_PDF >= x >= 1 + SUM_CHECK_PDF` could never be True,
    # so this warning was dead code. Warn when the mean row-sum deviates
    # from 1 by more than the tolerance.
    # NOTE(review): assumes SUM_CHECK_PDF is a small positive tolerance —
    # confirm against its definition.
    mean_sum = gs.mean(gs.sum(posterior_probabilities, 1))
    if not 1 - SUM_CHECK_PDF <= mean_sum <= 1 + SUM_CHECK_PDF:
        logging.warning('EXPECTATION : posterior probabilities '
                        'do not sum to 1.')

    return posterior_probabilities
def fit(self, X):
    """Provide clusters centroids and data labels.

    Alternate between computing the mean of each cluster
    and labelling data according to the new positions of the centroids.

    Parameters
    ----------
    X : array-like, shape=[..., n_features]
        Training data, where n_samples is the number of samples and
        n_features is the number of features.

    Returns
    -------
    centroids : array-like, shape=[n_clusters, n_features]
        Copy of the fitted centroids (squeezed to one point when
        n_clusters == 1). Fitted state is also stored on `self`
        (`centroids`, `init_centroids`, `labels`, `inertia`).
    """
    n_samples = X.shape[0]
    if self.verbose > 0:
        logging.info("Initializing...")
    if self.init == "kmeans++":
        # k-means++ seeding: first centroid uniformly at random, then each
        # next centroid drawn with probability proportional to the distance
        # to the closest centroid chosen so far.
        # NOTE(review): weights are proportional to the distance, not the
        # squared distance used by canonical k-means++ — confirm intended.
        centroids = [gs.expand_dims(X[randint(0, n_samples - 1)], 0)]
        for i in range(self.n_clusters - 1):
            dists = [
                gs.to_ndarray(self.metric.dist(centroids[j], X), 2, 1)
                for j in range(i + 1)
            ]
            dists = gs.hstack(dists)
            dists_to_closest_centroid = gs.amin(dists, 1)
            indices = gs.arange(n_samples)
            weights = dists_to_closest_centroid / gs.sum(
                dists_to_closest_centroid)
            index = rv_discrete(values=(indices, weights)).rvs()
            centroids.append(gs.expand_dims(X[index], 0))
    else:
        # Default init: n_clusters points sampled uniformly (with possible
        # repetition) from the training data.
        centroids = [
            gs.expand_dims(X[randint(0, n_samples - 1)], 0)
            for i in range(self.n_clusters)
        ]
    self.centroids = gs.concatenate(centroids, axis=0)
    self.init_centroids = gs.concatenate(centroids, axis=0)

    # Initial assignment: label each sample with its nearest centroid.
    dists = [
        gs.to_ndarray(self.metric.dist(self.centroids[i], X), 2, 1)
        for i in range(self.n_clusters)
    ]
    dists = gs.hstack(dists)
    self.labels = gs.argmin(dists, 1)
    index = 0

    while index < self.max_iter:
        index += 1
        if self.verbose > 0:
            logging.info(f"Iteration {index}...")

        old_centroids = gs.copy(self.centroids)
        for i in range(self.n_clusters):
            # Update step: move each centroid to the Frechet mean of its
            # assigned samples; empty clusters are re-seeded randomly.
            fold = gs.squeeze(X[self.labels == i])
            if len(fold) > 0:
                mean = FrechetMean(
                    metric=self.metric,
                    max_iter=self.max_iter_mean,
                    point_type=self.point_type,
                    method=self.mean_method,
                    init_step_size=self.init_step_size,
                )
                mean.fit(fold)
                self.centroids[i] = mean.estimate_
            else:
                self.centroids[i] = X[randint(0, n_samples - 1)]

        # Assignment step: relabel samples against the updated centroids.
        dists = [
            gs.to_ndarray(self.metric.dist(self.centroids[i], X), 2, 1)
            for i in range(self.n_clusters)
        ]
        dists = gs.hstack(dists)
        self.labels = gs.argmin(dists, 1)
        dists_to_closest_centroid = gs.amin(dists, 1)
        self.inertia = gs.sum(dists_to_closest_centroid**2)

        # Convergence test: mean geodesic displacement of the centroids.
        centroids_distances = self.metric.dist(old_centroids, self.centroids)
        if self.verbose > 0:
            logging.info(
                f"Convergence criterion at the end of iteration {index} "
                f"is {gs.mean(centroids_distances)}.")
        if gs.mean(centroids_distances) < self.tol:
            if self.verbose > 0:
                logging.info(
                    f"Convergence reached after {index} iterations.")
            if self.n_clusters == 1:
                self.centroids = gs.squeeze(self.centroids, axis=0)
            return gs.copy(self.centroids)

    if index == self.max_iter:
        logging.warning(
            f"K-means maximum number of iterations {self.max_iter} reached. "
            "The mean may be inaccurate.")

    if self.n_clusters == 1:
        self.centroids = gs.squeeze(self.centroids, axis=0)
    return gs.copy(self.centroids)