예제 #1
0
    def exp_domain(self, tangent_vec, base_point):
        """Return the time interval on which the Euclidean geodesic exists.

        The geodesic starts at `base_point` with initial direction
        `tangent_vec`. The tangent vector is multiplied on both sides by the
        inverse square root of the base point, and the extreme eigenvalues
        of the result determine the two ends of the interval.

        Parameters
        ----------
        tangent_vec : array-like, shape=[n_samples, n, n]
            Initial direction of the geodesic.
        base_point : array-like, shape=[n_samples, n, n]
            Starting point of the geodesic.

        Returns
        -------
        exp_domain : array-like, shape=[n_samples, 2]
            Lower bound in the first column, upper bound in the second.
        """
        point = gs.to_ndarray(base_point, to_ndim=3)
        vector = gs.to_ndarray(tangent_vec, to_ndim=3)

        # Congruence by the inverse square root of the base point.
        inv_sqrt = gs.linalg.powerm(point, -.5)
        whitened = gs.matmul(gs.matmul(inv_sqrt, vector), inv_sqrt)

        spectrum = gs.linalg.eigvalsh(whitened)
        smallest = gs.amin(spectrum, axis=1)
        largest = gs.amax(spectrum, axis=1)

        # Unbounded below unless at least one eigenvalue is positive.
        lower = gs.to_ndarray(
            gs.where(largest <= 0, -math.inf, -1 / largest), to_ndim=2)
        # Unbounded above unless at least one eigenvalue is negative.
        upper = gs.to_ndarray(
            gs.where(smallest >= 0, math.inf, -1 / smallest), to_ndim=2)

        return gs.concatenate((lower, upper), axis=1)
예제 #2
0
    def exp_domain(tangent_vec, base_point):
        """Compute the domain of the Euclidean exponential map.

        Compute the real interval of time where the Euclidean geodesic starting
        at point `base_point` in direction `tangent_vec` is defined.

        Parameters
        ----------
        tangent_vec : array-like, shape=[..., n, n]
            Tangent vector at base point.
        base_point : array-like, shape=[..., n, n]
            Base point.

        Returns
        -------
        exp_domain : array-like, shape=[..., 2]
            Interval of time where the geodesic is defined: lower bound in
            the first column, upper bound in the second. A bound is infinite
            when the geodesic never leaves the domain in that time direction.
        """
        # NOTE(review): the reductions below use axis=1, so the inputs
        # appear to need a leading batch dimension -- confirm with callers.
        invsqrt_base_point = SymmetricMatrices.powerm(base_point, -.5)

        reduced_vec = gs.matmul(invsqrt_base_point, tangent_vec)
        reduced_vec = gs.matmul(reduced_vec, invsqrt_base_point)
        eigvals = gs.linalg.eigvalsh(reduced_vec)
        min_eig = gs.amin(eigvals, axis=1)
        max_eig = gs.amax(eigvals, axis=1)

        # No positive eigenvalue: the interval is unbounded below.
        inf_value = gs.where(
            max_eig <= 0., gs.array(-math.inf), - 1. / max_eig)
        inf_value = gs.to_ndarray(inf_value, to_ndim=2)

        # No negative eigenvalue: the interval is unbounded above, so the
        # supremum must be +inf (it was previously -inf, which produced an
        # upper bound below the lower bound).
        sup_value = gs.where(
            min_eig >= 0., gs.array(math.inf), - 1. / min_eig)
        sup_value = gs.to_ndarray(sup_value, to_ndim=2)
        domain = gs.concatenate((inf_value, sup_value), axis=1)

        return domain
예제 #3
0
    def test_geodesic(self):
        """Test geodesic.

        Sample the geodesic between two random points, approximate its
        velocity by finite differences, and compare the relative spread of
        the velocity norms to zero.
        """
        start = self.categorical.random_point()
        stop = self.categorical.random_point()

        n_steps = 100
        path = self.metric.geodesic(initial_point=start, end_point=stop)
        times = gs.linspace(0.0, 1.0, n_steps)
        samples = path(times)

        # Forward differences, each anchored at the earlier of the two
        # consecutive samples.
        later, earlier = samples[1:, :], samples[:-1, :]
        speeds = self.metric.norm(n_steps * (later - earlier), earlier)

        slowest = gs.amin(speeds)
        fastest = gs.amax(speeds)
        result = 1 / slowest * (fastest - slowest)
        expected = 0.0

        # NOTE(review): with expected == 0.0 and rtol=1.0 this check may be
        # very permissive, depending on which argument assertAllClose scales
        # the relative tolerance by -- confirm it still constrains `result`.
        self.assertAllClose(expected, result, rtol=1.0)
예제 #4
0
    def _expectation(self, data):
        """Update the posterior probabilities.

        Evaluate the mixture density on `data`, weight it by the mixture
        coefficients and normalise along the last axis. Warnings are logged
        when the intermediate or final values look numerically invalid.

        Parameters
        ----------
        data : array-like, shape=[n_samples, n_features]
            Training data, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        posterior_probabilities : array-like
            Weighted densities normalised along the last axis
            (presumably shape=[n_samples, n_gaussians] -- confirm).
        """
        probability_distribution_function = gmm_pdf(
            data,
            self.means,
            self.variances,
            norm_func=find_normalization_factor,
            metric=self.metric,
            variances_range=self.variances_range,
            norm_func_var=self.normalization_factor_var,
        )

        if gs.isnan(probability_distribution_function.mean()):
            # A separating space was missing between the two literals.
            logging.warning("EXPECTATION : Probability distribution function "
                            "contain elements that are not numbers")

        num_normalized_pdf = gs.einsum("j,...j->...j",
                                       self.mixture_coefficients,
                                       probability_distribution_function)
        valid_pdf_condition = gs.amin(gs.sum(num_normalized_pdf, -1))

        if valid_pdf_condition <= PDF_TOL:
            # Rows whose total weight is (almost) zero are overwritten with
            # ones so that the normalisation below does not divide by ~0.
            num_normalized_pdf[gs.sum(num_normalized_pdf, -1) <= PDF_TOL] = 1

        sum_pdf = gs.sum(num_normalized_pdf, -1)
        posterior_probabilities = gs.einsum("...i,...->...i",
                                            num_normalized_pdf, 1 / sum_pdf)

        # The previous `gs.any(...) is None` test could never be true.
        if gs.any(gs.isnan(posterior_probabilities)):
            logging.warning("EXPECTATION : posterior probabilities "
                            "contain elements that are not numbers.")

        # The previous chained comparison required the mean to be both
        # below 1 - SUM_CHECK_PDF and above 1 + SUM_CHECK_PDF at once, so it
        # never fired; warn when the mean falls outside the tolerance band.
        mean_row_sum = gs.mean(gs.sum(posterior_probabilities, 1))
        if not (1 - SUM_CHECK_PDF <= mean_row_sum <= 1 + SUM_CHECK_PDF):
            logging.warning("EXPECTATION : posterior probabilities "
                            "do not sum to 1.")

        if gs.any(gs.sum(posterior_probabilities, 0) < PDF_TOL):
            logging.warning("EXPECTATION : Gaussian got no elements "
                            "(precision error) reinitialize")
            posterior_probabilities[posterior_probabilities == 0] = PDF_TOL

        return posterior_probabilities
예제 #5
0
    def exp_domain(self, tangent_vec, base_point):
        """Compute the interval of time on which the exponential is defined.

        Both inputs are promoted to 3-D arrays, the tangent vector is
        multiplied on both sides by the inverse square root of the base
        point, and the interval ends are read off the extreme eigenvalues.

        Parameters
        ----------
        tangent_vec : array-like
            Tangent vector(s), promoted to 3 dimensions.
        base_point : array-like
            Base point(s), promoted to 3 dimensions.

        Returns
        -------
        domain : array-like
            Two columns per sample: lower bound, then upper bound.
        """
        base = gs.to_ndarray(base_point, to_ndim=3)
        direction = gs.to_ndarray(tangent_vec, to_ndim=3)

        base_invsqrt = gs.linalg.powerm(base, -.5)
        left_product = gs.matmul(base_invsqrt, direction)
        congruent = gs.matmul(left_product, base_invsqrt)

        eigenvalues = gs.linalg.eigvalsh(congruent)
        eig_low = gs.amin(eigenvalues, axis=1)
        eig_high = gs.amax(eigenvalues, axis=1)

        # Lower end: -inf when no eigenvalue is positive.
        t_min = gs.where(eig_high <= 0, -math.inf, -1 / eig_high)
        t_min = gs.to_ndarray(t_min, to_ndim=2)

        # Upper end: +inf when no eigenvalue is negative.
        t_max = gs.where(eig_low >= 0, math.inf, -1 / eig_low)
        t_max = gs.to_ndarray(t_max, to_ndim=2)

        return gs.concatenate((t_min, t_max), axis=1)
    def _expectation(self, data):
        """Update the posterior probabilities.

        Evaluate the mixture density on `data`, weight it by the mixture
        coefficients and normalise along the last axis. Warnings are logged
        when the intermediate or final values look numerically invalid.

        Parameters
        ----------
        data : array-like, shape=[n_samples, n_features]
            Training data, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        posterior_probabilities : array-like
            Weighted densities normalised along the last axis
            (presumably shape=[n_samples, n_gaussians] -- confirm).
        """
        probability_distribution_function = \
            PoincareBall.gmm_pdf(
                data, self.means, self.variances,
                norm_func=self.riemannian_metric.find_normalization_factor,
                metric=self.riemannian_metric,
                variances_range=self.variances_range,
                norm_func_var=self.normalization_factor_var)

        if gs.isnan(probability_distribution_function.mean()):
            # A separating space was missing between the two literals.
            logging.warning('EXPECTATION : Probability distribution function '
                            'contain elements that are not numbers')

        num_normalized_pdf = gs.einsum('j,...j->...j',
                                       self.mixture_coefficients,
                                       probability_distribution_function)
        valid_pdf_condition = gs.amin(gs.sum(num_normalized_pdf, -1))

        if valid_pdf_condition <= PDF_TOL:
            # Rows whose total weight is (almost) zero are overwritten with
            # ones so that the normalisation below does not divide by ~0.
            num_normalized_pdf[gs.sum(num_normalized_pdf, -1) <= PDF_TOL] = 1

        sum_pdf = gs.sum(num_normalized_pdf, -1)
        posterior_probabilities =\
            gs.einsum('...i,...->...i', num_normalized_pdf, 1 / sum_pdf)

        # The previous `gs.any(...) is None` test could never be true.
        if gs.any(gs.isnan(posterior_probabilities)):
            logging.warning('EXPECTATION : posterior probabilities '
                            'contain elements that are not numbers.')

        # The previous chained comparison required the mean to be both
        # below 1 - SUM_CHECK_PDF and above 1 + SUM_CHECK_PDF at once, so it
        # never fired; warn when the mean falls outside the tolerance band.
        mean_row_sum = gs.mean(gs.sum(posterior_probabilities, 1))
        if not (1 - SUM_CHECK_PDF <= mean_row_sum <= 1 + SUM_CHECK_PDF):
            logging.warning('EXPECTATION : posterior probabilities '
                            'do not sum to 1.')

        return posterior_probabilities
예제 #7
0
    def fit(self, X):
        """Compute cluster centroids and label the data.

        After choosing initial centroids (distance-weighted sampling when
        ``self.init == "kmeans++"``, otherwise independent random draws from
        `X`), alternate between replacing each centroid with the Frechet
        mean of its cluster and relabelling every sample by its closest
        centroid. Iteration stops when the mean displacement of the
        centroids drops below ``self.tol`` or after ``self.max_iter``
        iterations.

        Sets ``self.centroids``, ``self.init_centroids``, ``self.labels``
        and ``self.inertia``.

        Parameters
        ----------
        X : array-like, shape=[..., n_features]
            Training data; the first axis indexes the samples.

        Returns
        -------
        centroids : array-like
            Copy of ``self.centroids``. Its leading axis is squeezed out
            when ``self.n_clusters == 1``.
        """
        n_samples = X.shape[0]

        def distances_to_current_centroids():
            # One column of sample-to-centroid distances per cluster.
            columns = [
                gs.to_ndarray(self.metric.dist(self.centroids[k], X), 2, 1)
                for k in range(self.n_clusters)
            ]
            return gs.hstack(columns)

        if self.verbose > 0:
            logging.info("Initializing...")

        if self.init == "kmeans++":
            centroids = [gs.expand_dims(X[randint(0, n_samples - 1)], 0)]
            for _ in range(self.n_clusters - 1):
                # Distances from every sample to each centroid chosen so far.
                dists = gs.hstack([
                    gs.to_ndarray(self.metric.dist(chosen, X), 2, 1)
                    for chosen in centroids
                ])
                closest = gs.amin(dists, 1)
                indices = gs.arange(n_samples)
                # Samples far from all current centroids are more likely
                # to be picked next.
                weights = closest / gs.sum(closest)
                picked = rv_discrete(values=(indices, weights)).rvs()
                centroids.append(gs.expand_dims(X[picked], 0))
        else:
            centroids = [
                gs.expand_dims(X[randint(0, n_samples - 1)], 0)
                for _ in range(self.n_clusters)
            ]

        # Two separate arrays: self.centroids is updated in place below.
        self.centroids = gs.concatenate(centroids, axis=0)
        self.init_centroids = gs.concatenate(centroids, axis=0)

        self.labels = gs.argmin(distances_to_current_centroids(), 1)

        converged = False
        index = 0
        while index < self.max_iter:
            index += 1
            if self.verbose > 0:
                logging.info(f"Iteration {index}...")

            old_centroids = gs.copy(self.centroids)
            for i in range(self.n_clusters):
                fold = gs.squeeze(X[self.labels == i])

                if len(fold) == 0:
                    # Empty cluster: restart its centroid on a random sample.
                    self.centroids[i] = X[randint(0, n_samples - 1)]
                    continue

                mean = FrechetMean(
                    metric=self.metric,
                    max_iter=self.max_iter_mean,
                    point_type=self.point_type,
                    method=self.mean_method,
                    init_step_size=self.init_step_size,
                )
                mean.fit(fold)
                self.centroids[i] = mean.estimate_

            dists = distances_to_current_centroids()
            self.labels = gs.argmin(dists, 1)
            self.inertia = gs.sum(gs.amin(dists, 1)**2)

            centroids_distances = self.metric.dist(old_centroids,
                                                   self.centroids)
            if self.verbose > 0:
                logging.info(
                    f"Convergence criterion at the end of iteration {index} "
                    f"is {gs.mean(centroids_distances)}.")

            if gs.mean(centroids_distances) < self.tol:
                if self.verbose > 0:
                    logging.info(
                        f"Convergence reached after {index} iterations.")
                converged = True
                break

        # Only warn when the loop ran out of iterations without converging.
        if not converged and index == self.max_iter:
            logging.warning(
                f"K-means maximum number of iterations {self.max_iter} reached. "
                "The mean may be inaccurate.")

        if self.n_clusters == 1:
            self.centroids = gs.squeeze(self.centroids, axis=0)
        return gs.copy(self.centroids)