Example #1
def test_stable_cumsum():
    assert_array_equal(stable_cumsum([1, 2, 3]), np.cumsum([1, 2, 3]))
    r = np.random.RandomState(0).rand(100000)
    assert_warns(RuntimeWarning, stable_cumsum, r, rtol=0, atol=0)

    # test axis parameter
    A = np.random.RandomState(36).randint(1000, size=(5, 5, 5))
    assert_array_equal(stable_cumsum(A, axis=0), np.cumsum(A, axis=0))
    assert_array_equal(stable_cumsum(A, axis=1), np.cumsum(A, axis=1))
    assert_array_equal(stable_cumsum(A, axis=2), np.cumsum(A, axis=2))
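For context: stable_cumsum (importable from sklearn.utils.extmath in the scikit-learn versions exercised above) wraps np.cumsum, accumulating in float64 and checking that the last cumulative value matches an independently computed sum within rtol/atol; older releases emit a RuntimeWarning on mismatch (as tested here), newer ones raise a RuntimeError (as in Example #3). A minimal sketch of the idea, not the library implementation:

import warnings
import numpy as np

def stable_cumsum_sketch(arr, rtol=1e-05, atol=1e-08):
    # Accumulate in float64, then verify the last cumulative value against
    # a separately computed sum; complain when they disagree.
    arr = np.asarray(arr, dtype=np.float64)
    out = np.cumsum(arr)
    if not np.isclose(out[-1], np.sum(arr), rtol=rtol, atol=atol):
        warnings.warn('cumsum was found to be unstable: its last element '
                      'does not correspond to sum', RuntimeWarning)
    return out

# With rtol=0 and atol=0 even ordinary float round-off fails the check,
# which is what the tests above exploit with a long random vector.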
Example #2
def test_stable_cumsum():
    if np_version < (1, 9):
        raise SkipTest("Sum is as unstable as cumsum for numpy < 1.9")
    assert_array_equal(stable_cumsum([1, 2, 3]), np.cumsum([1, 2, 3]))
    r = np.random.RandomState(0).rand(100000)
    assert_warns(RuntimeWarning, stable_cumsum, r, rtol=0, atol=0)

    # test axis parameter
    A = np.random.RandomState(36).randint(1000, size=(5, 5, 5))
    assert_array_equal(stable_cumsum(A, axis=0), np.cumsum(A, axis=0))
    assert_array_equal(stable_cumsum(A, axis=1), np.cumsum(A, axis=1))
    assert_array_equal(stable_cumsum(A, axis=2), np.cumsum(A, axis=2))
Example #3
def test_stable_cumsum():
    if np_version < (1, 9):
        raise SkipTest("Sum is as unstable as cumsum for numpy < 1.9")
    assert_array_equal(stable_cumsum([1, 2, 3]), np.cumsum([1, 2, 3]))
    r = np.random.RandomState(0).rand(100000)
    assert_raise_message(RuntimeError,
                         'cumsum was found to be unstable: its last element '
                         'does not correspond to sum',
                         stable_cumsum, r, rtol=0, atol=0)
Example #4
def _weighted_percentile(array, sample_weight, percentile=50):
    """
    Compute the weighted ``percentile`` of ``array`` with ``sample_weight``.
    """
    sorted_idx = np.argsort(array)

    # Find the index where the cumulative weight reaches the requested percentile
    weight_cdf = stable_cumsum(sample_weight[sorted_idx])
    percentile_idx = np.searchsorted(
        weight_cdf, (percentile / 100.) * weight_cdf[-1])
    return array[sorted_idx[percentile_idx]]
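A hedged usage sketch for the helper above (assuming stable_cumsum is importable from sklearn.utils.extmath, as in the scikit-learn versions used elsewhere on this page): the weighted median should land on the value that carries most of the weight.

import numpy as np
from sklearn.utils.extmath import stable_cumsum  # needed by _weighted_percentile above

values = np.array([1., 2., 10.])
weights = np.array([1., 1., 8.])   # the value 10 carries 80% of the total weight

# weight_cdf = [1, 2, 10]; half of the total weight (5.0) is reached at index 2,
# so the weighted median is 10 rather than the unweighted median 2.
print(_weighted_percentile(values, weights, percentile=50))  # expected: 10.0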
Example #5
def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
    """Init n_clusters seeds according to k-means++
    Parameters
    ----------
    X : {ndarray, sparse matrix} of shape (n_samples, n_features)
        The data to pick seeds for. To avoid memory copy, the input data
        should be double precision (dtype=np.float64).
    n_clusters : int
        The number of seeds to choose
    x_squared_norms : ndarray of shape (n_samples,)
        Squared Euclidean norm of each data point.
    random_state : RandomState instance
        The generator used to initialize the centers.
        See :term:`Glossary <random_state>`.
    n_local_trials : int, default=None
        The number of seeding trials for each center (except the first),
        of which the one reducing inertia the most is greedily chosen.
        Set to None to make the number of trials depend logarithmically
        on the number of seeds (2+log(k)); this is the default.
    Notes
    -----
    Selects initial cluster centers for k-means clustering in a smart way
    to speed up convergence. See: Arthur, D. and Vassilvitskii, S.
    "k-means++: the advantages of careful seeding". ACM-SIAM symposium
    on Discrete algorithms. 2007
    Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip,
    which is the implementation used in the aforementioned paper.
    """
    n_samples, n_features = X.shape

    centers = np.empty((n_clusters, n_features), dtype=X.dtype)

    assert x_squared_norms is not None, 'x_squared_norms None in _k_init'

    # Set the number of local seeding trials if none is given
    if n_local_trials is None:
        # This is what Arthur/Vassilvitskii tried, but did not report
        # specific results for other than mentioning in the conclusion
        # that it helped.
        n_local_trials = 2 + int(np.log(n_clusters))

    # Pick first center randomly
    center_id = random_state.randint(n_samples)
    centers[0] = X[center_id]

    # Initialize list of closest distances and calculate current potential
    closest_dist_sq = euclidean_distances(centers[0, np.newaxis],
                                          X,
                                          Y_norm_squared=x_squared_norms,
                                          squared=True)
    current_pot = closest_dist_sq.sum()

    # Pick the remaining n_clusters-1 points
    for c in range(1, n_clusters):
        # Choose center candidates by sampling with probability proportional
        # to the squared distance to the closest existing center
        rand_vals = random_state.random_sample(n_local_trials) * current_pot
        candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq),
                                        rand_vals)
        # XXX: numerical imprecision can result in a candidate_id out of range
        np.clip(candidate_ids,
                None,
                closest_dist_sq.size - 1,
                out=candidate_ids)

        # Compute distances to center candidates
        distance_to_candidates = euclidean_distances(
            X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True)

        # update closest distances squared and potential for each candidate
        np.minimum(closest_dist_sq,
                   distance_to_candidates,
                   out=distance_to_candidates)
        candidates_pot = distance_to_candidates.sum(axis=1)

        # Decide which candidate is the best
        best_candidate = np.argmin(candidates_pot)
        current_pot = candidates_pot[best_candidate]
        closest_dist_sq = distance_to_candidates[best_candidate]
        best_candidate = candidate_ids[best_candidate]

        # Permanently add best center candidate found in local tries
        centers[c] = X[best_candidate]

    return centers
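The sampling step at the heart of the loop above is plain inverse-CDF sampling: uniform draws scaled by the current potential are located inside the cumulative sum of the squared distances, so points far from every existing center are chosen more often. A standalone, hedged sketch of just that step, assuming stable_cumsum from sklearn.utils.extmath:

import numpy as np
from sklearn.utils.extmath import stable_cumsum

rng = np.random.RandomState(0)
closest_dist_sq = rng.rand(10)                 # stand-in for squared distances
current_pot = closest_dist_sq.sum()

# Draw 3 candidate indices with probability proportional to closest_dist_sq
rand_vals = rng.random_sample(3) * current_pot
candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq), rand_vals)
# Guard against numerical imprecision pushing an index past the last element
candidate_ids = np.clip(candidate_ids, None, closest_dist_sq.size - 1)
print(candidate_ids)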
Example #6
    def _kpp_init(self, D, n_clusters, random_state_, n_local_trials=None):
        """Init n_clusters seeds with a method similar to k-means++

        Parameters
        -----------
        D : array, shape (n_samples, n_samples)
            The distance matrix we will use to select medoid indices.

        n_clusters : integer
            The number of seeds to choose

        random_state_ : RandomState
            The generator used to initialize the centers.

        n_local_trials : integer, optional
            The number of seeding trials for each center (except the first),
            of which the one reducing inertia the most is greedily chosen.
            Set to None to make the number of trials depend logarithmically
            on the number of seeds (2+log(k)); this is the default.

        Notes
        -----
        Selects initial cluster centers for k-medoids clustering in a smart way
        to speed up convergence. See: Arthur, D. and Vassilvitskii, S.
        "k-means++: the advantages of careful seeding". ACM-SIAM symposium
        on Discrete algorithms. 2007

        Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip,
        which is the implementation used in the aforementioned paper.
        """
        n_samples, _ = D.shape

        centers = np.empty(n_clusters, dtype=int)

        # Set the number of local seeding trials if none is given
        if n_local_trials is None:
            # This is what Arthur/Vassilvitskii tried, but did not report
            # specific results for other than mentioning in the conclusion
            # that it helped.
            n_local_trials = 2 + int(np.log(n_clusters))

        center_id = random_state_.randint(n_samples)
        centers[0] = center_id

        # Initialize list of closest distances and calculate current potential
        closest_dist_sq = D[centers[0], :]**2
        current_pot = closest_dist_sq.sum()

        # pick the remaining n_clusters-1 points
        for cluster_index in range(1, n_clusters):
            rand_vals = (random_state_.random_sample(n_local_trials) *
                         current_pot)
            candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq),
                                            rand_vals)

            # Compute distances to center candidates
            distance_to_candidates = D[candidate_ids, :]**2

            # Decide which candidate is the best
            best_candidate = None
            best_pot = None
            best_dist_sq = None
            for trial in range(n_local_trials):
                # Compute potential when including center candidate
                new_dist_sq = np.minimum(closest_dist_sq,
                                         distance_to_candidates[trial])
                new_pot = new_dist_sq.sum()

                # Store result if it is the best local trial so far
                if (best_candidate is None) or (new_pot < best_pot):
                    best_candidate = candidate_ids[trial]
                    best_pot = new_pot
                    best_dist_sq = new_dist_sq

            centers[cluster_index] = best_candidate
            current_pot = best_pot
            closest_dist_sq = best_dist_sq

        return centers
Example #7
    def _fit_full(self, X, n_components):
        self.accountant.check(self.epsilon, 0)

        n_samples, n_features = X.shape

        if self.centered:
            self.mean_ = np.zeros_like(np.mean(X, axis=0))
        else:
            if self.bounds is None:
                warnings.warn(
                    "Bounds parameter hasn't been specified, so falling back to determining range from the data.\n"
                    "This will result in additional privacy leakage. To ensure differential privacy with no "
                    "additional privacy loss, specify `range` for each valued returned by np.mean().",
                    PrivacyLeakWarning)

                self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

            self.bounds = check_bounds(self.bounds, n_features)
            self.mean_ = mean(X,
                              epsilon=self.epsilon / 2,
                              bounds=self.bounds,
                              axis=0,
                              accountant=BudgetAccountant())

        X -= self.mean_

        if self.data_norm is None:
            warnings.warn(
                "Data norm has not been specified and will be calculated on the data provided.  This will "
                "result in additional privacy leakage. To ensure differential privacy and no additional "
                "privacy leakage, specify `data_norm` at initialisation.",
                PrivacyLeakWarning)
            self.data_norm = np.linalg.norm(X, axis=1).max()

        X = clip_to_norm(X, self.data_norm)

        s, u = covariance_eig(
            X,
            epsilon=self.epsilon if self.centered else self.epsilon / 2,
            norm=self.data_norm,
            dims=n_components if isinstance(n_components, Integral) else None)
        u, _ = svd_flip(u, np.zeros_like(u).T)
        s = np.sqrt(s)

        components_ = u.T

        # Get variance explained by singular values
        explained_variance_ = np.sort((s**2) / (n_samples - 1))[::-1]
        total_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / total_var
        singular_values_ = s.copy()  # Store the singular values.

        # Post-process the number of components required
        if n_components == 'mle':
            # TODO: Update when sklearn requirement changes to >= 0.23, removing try...except
            try:
                n_components = sk_pca._infer_dimension(explained_variance_,
                                                       n_samples)
            except AttributeError:
                n_components = sk_pca._infer_dimension_(
                    explained_variance_, n_samples, n_features)
        elif 0 < n_components < 1.0:
            # number of components for which the cumulated explained
            # variance percentage is superior to the desired threshold
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            n_components = np.searchsorted(ratio_cumsum, n_components) + 1

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < min(n_features, n_samples):
            self.noise_variance_ = explained_variance_[n_components:].mean()
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            explained_variance_ratio_[:n_components]
        self.singular_values_ = singular_values_[:n_components]

        self.accountant.spend(self.epsilon, 0)

        return u, s[:n_components], u.T
Example #8
    def fit(self, X, y=None, sample_weight=None):
        """Train whitening spatial filters.

        Parameters
        ----------
        X : ndarray, shape (n_matrices, n_channels, n_channels)
            Set of SPD matrices.
        y : None
            Ignored as unsupervised.
        sample_weight : None | ndarray, shape (n_matrices,), default=None
            Weight of each matrix, to compute the weighted mean covariance
            matrix used for whitening and dimension reduction. If None, it uses
            equal weights.

        Returns
        -------
        self : Whitening instance
            The Whitening instance.
        """
        # weighted mean of input covariance matrices
        Xm = mean_covariance(X,
                             metric=self.metric,
                             sample_weight=sample_weight)

        # whitening without dimension reduction
        if self.dim_red is None:
            self.n_components_ = X.shape[-1]
            self.filters_ = invsqrtm(Xm)
            self.inv_filters_ = sqrtm(Xm)

        # whitening with dimension reduction
        elif isinstance(self.dim_red, dict):
            if len(self.dim_red) > 1:
                raise ValueError(
                    'Dictionary dim_red must contain only one element (Got %d)'
                    % len(self.dim_red))
            dim_red_key = next(iter(self.dim_red))
            dim_red_val = self.dim_red.get(dim_red_key)

            eigvals, eigvecs = eigh(Xm, eigvals_only=False)
            eigvals = eigvals[::-1]  # sort eigvals in descending order
            eigvecs = np.fliplr(eigvecs)  # idem for eigvecs

            if dim_red_key == 'n_components':
                if dim_red_val < 1:
                    raise ValueError(
                        'Value n_components must be superior to 1 (Got %d)' %
                        dim_red_val)
                if not isinstance(dim_red_val, numbers.Integral):
                    raise ValueError(
                        'n_components=%d must be of type int (Got %r)' %
                        (dim_red_val, type(dim_red_val)))
                self.n_components_ = min(dim_red_val, X.shape[-1])

            elif dim_red_key == 'expl_var':
                if not 0 < dim_red_val <= 1:
                    raise ValueError(
                        'Value expl_var must be included in (0, 1] (Got %d)' %
                        dim_red_val)
                cum_expl_var = stable_cumsum(eigvals / eigvals.sum())
                if self.verbose:
                    print('Cumulative explained variance: \n %r' %
                          cum_expl_var)
                self.n_components_ = np.searchsorted(
                    cum_expl_var, dim_red_val, side='right') + 1

            elif dim_red_key == 'max_cond':
                if dim_red_val <= 1:
                    raise ValueError(
                        'Value max_cond must be strictly superior to 1 '
                        '(Got %d)' % dim_red_val)
                conds = eigvals[0] / eigvals
                if self.verbose:
                    print('Condition numbers: \n %r' % conds)
                self.n_components_ = np.searchsorted(conds,
                                                     dim_red_val,
                                                     side='left')

            else:
                raise ValueError('Unknown key in parameter dim_red: %r' %
                                 dim_red_key)

            # dimension reduction
            if self.verbose:
                print('Dimension reduction of Whitening on %d components' %
                      self.n_components_)
            pca_filters = eigvecs[:, :self.n_components_]
            pca_sqrtvals = np.sqrt(eigvals[:self.n_components_])
            # whitening
            self.filters_ = pca_filters @ np.diag(1. / pca_sqrtvals)
            self.inv_filters_ = np.diag(pca_sqrtvals).T @ pca_filters.T

        else:
            raise ValueError('Unknown type for parameter dim_red: %r' %
                             type(self.dim_red))

        return self
Example #9
    def _fit_daal4py(self, X, n_components):
        n_samples, n_features = X.shape
        n_sf_min = min(n_samples, n_features)

        _validate_n_components(n_components, n_samples, n_features)

        if n_components == 'mle':
            daal_n_components = n_features
        elif n_components < 1:
            daal_n_components = n_sf_min
        else:
            daal_n_components = n_components

        fpType = getFPType(X)
        centering_algo = daal4py.normalization_zscore(
            fptype=fpType, doScale=False)
        pca_alg = daal4py.pca(
            fptype=fpType,
            method='svdDense',
            normalization=centering_algo,
            resultsToCompute='mean|variance|eigenvalue',
            isDeterministic=True,
            nComponents=daal_n_components
        )
        pca_res = pca_alg.compute(X)

        self.mean_ = pca_res.means.ravel()
        variances_ = pca_res.variances.ravel()
        components_ = pca_res.eigenvectors
        explained_variance_ = pca_res.eigenvalues.ravel()
        tot_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / tot_var

        if n_components == 'mle':
            n_components = \
                _infer_dimension_(explained_variance_, n_samples, n_features)
        elif 0 < n_components < 1.0:
            # number of components for which the cumulated explained
            # variance percentage is superior to the desired threshold
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            n_components = np.searchsorted(ratio_cumsum, n_components) + 1

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < n_sf_min:
            if explained_variance_.shape[0] == n_sf_min:
                self.noise_variance_ = explained_variance_[n_components:].mean()
            else:
                resid_var_ = variances_.sum()
                resid_var_ -= explained_variance_[:n_components].sum()
                self.noise_variance_ = resid_var_ / (n_sf_min - n_components)
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            explained_variance_ratio_[:n_components]
        self.singular_values_ = np.sqrt((n_samples - 1) * self.explained_variance_)
Example #10
    def _boost(self, iboost, X, y, sample_weight, random_state):
        """Implement a single boost for regression

        Perform a single boost according to the AdaBoost.R2 algorithm and
        return the updated sample weights.

        Parameters
        ----------
        iboost : int
            The index of the current boost iteration.

        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrix can be CSC, CSR, COO,
            DOK, or LIL. DOK and LIL are converted to CSR.

        y : array-like of shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).

        sample_weight : array-like of shape = [n_samples]
            The current sample weights.

        random_state : numpy.RandomState
            The current random number generator

        Returns
        -------
        sample_weight : array-like of shape = [n_samples] or None
            The reweighted sample weights.
            If None then boosting has terminated early.

        estimator_weight : float
            The weight for the current boost.
            If None then boosting has terminated early.

        estimator_error : float
            The regression error for the current boost.
            If None then boosting has terminated early.
        """
        estimator = self._make_estimator(random_state=random_state)

        # Weighted sampling of the training set with replacement
        # For NumPy >= 1.7.0 use np.random.choice
        cdf = stable_cumsum(sample_weight)
        cdf /= cdf[-1]
        uniform_samples = random_state.random_sample(X.shape[0])
        bootstrap_idx = cdf.searchsorted(uniform_samples, side='right')
        # searchsorted returns a scalar
        bootstrap_idx = np.array(bootstrap_idx, copy=False)

        # Fit on the bootstrapped sample and obtain a prediction
        # for all samples in the training set
        estimator.fit(X[bootstrap_idx], y[bootstrap_idx])
        y_predict = estimator.predict(X)

        error_vect = np.abs(y_predict - y)
        error_max = error_vect.max()

        if error_max != 0.:
            error_vect /= error_max

        if self.loss == 'square':
            error_vect **= 2
        elif self.loss == 'exponential':
            error_vect = 1. - np.exp(-error_vect)

        # Calculate the average loss
        estimator_error = (sample_weight * error_vect).sum()

        if estimator_error <= 0:
            # Stop if fit is perfect
            return sample_weight, 1., 0.

        elif estimator_error >= 0.5:
            # Discard current estimator only if it isn't the only one
            if len(self.estimators_) > 1:
                self.estimators_.pop(-1)
            return None, None, None

        beta = estimator_error / (1. - estimator_error)

        # Boost weight using AdaBoost.R2 alg
        estimator_weight = self.learning_rate * np.log(1. / beta)

        if not iboost == self.n_estimators - 1:
            sample_weight *= np.power(beta,
                                      (1. - error_vect) * self.learning_rate)

        return sample_weight, estimator_weight, estimator_error
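The weighted bootstrap at the top of _boost is the same inverse-CDF trick: normalize the cumulative weights into a CDF, then map uniform draws through searchsorted. Conceptually this matches np.random.choice with p=..., although the exact indices drawn are not guaranteed to coincide. A hedged sketch:

import numpy as np
from sklearn.utils.extmath import stable_cumsum

rng = np.random.RandomState(42)
sample_weight = np.array([0.1, 0.1, 0.6, 0.2])

# Manual weighted resampling with replacement, as in the code above
cdf = stable_cumsum(sample_weight)
cdf /= cdf[-1]
bootstrap_idx = cdf.searchsorted(rng.random_sample(6), side='right')
print(bootstrap_idx)            # indices drawn roughly in proportion to sample_weight

# A rough modern equivalent (not claimed to reproduce the same draws)
alt_idx = np.random.RandomState(42).choice(len(sample_weight), size=6,
                                           p=sample_weight / sample_weight.sum())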
Example #11
    def fit(self, X, y=None):
        """Fit.

        Compute and diagonalize cospectra, to estimate forward and backward
        spatial filters.

        Parameters
        ----------
        X : ndarray, shape (n_subjects, n_conditions, n_channels, n_samples) | list of n_subjects of list of n_conditions ndarray of shape (n_channels, n_samples), with same n_conditions and n_channels but different n_samples
            Signal in channel space, acquired for different subjects and under
            different experimental conditions.
        y : None
            Currently not used, here for compatibility with sklearn API.

        Returns
        -------
        self : AJDC instance
            The AJDC instance.
        """
        # definition of params for Welch's method
        cospcov = est.CospCovariances(
            window=self.window,
            overlap=self.overlap,
            fmin=self.fmin,
            fmax=self.fmax,
            fs=self.fs)
        # estimation of cospectra on subjects and conditions
        cosp = []
        for s in range(len(X)):
            cosp_ = cospcov.transform(X[s])
            if s == 0:
                n_conditions = cosp_.shape[0]
                self.n_channels_ = cosp_.shape[1]
                self.freqs_ = cospcov.freqs_
            else:
                if n_conditions != cosp_.shape[0]:
                    raise ValueError('Unequal number of conditions')
                if self.n_channels_ != cosp_.shape[1]:
                    raise ValueError('Unequal number of channels')
            cosp.append(cosp_)
        cosp = numpy.transpose(numpy.array(cosp), axes=(0, 1, 4, 2, 3))

        # trace-normalization of cospectra, Eq(3) in [2]
        cosp = normalize(cosp, "trace")
        # average of cospectra across subjects, Eq(7) in [2]
        cosp = numpy.mean(cosp, axis=0, keepdims=False)
        # concatenation of cospectra along conditions
        self._cosp_channels = numpy.concatenate(cosp, axis=0)
        # estimation of non-diagonality weights, Eq(B.1) in [1]
        weights = get_nondiag_weight(self._cosp_channels)

        # dimension reduction, computed on the weighted mean of cospectra
        # across frequencies (and conditions)
        cosp_av = numpy.average(
            self._cosp_channels,
            axis=0,
            weights=weights)
        eigvals, eigvecs = eigh(cosp_av, eigvals_only=False)
        eigvals = eigvals[::-1]         # sorted in descending order
        eigvecs = numpy.fliplr(eigvecs) # idem
        cum_expl_var = stable_cumsum(eigvals / eigvals.sum())
        self.n_sources_ = numpy.searchsorted(
            cum_expl_var, self.expl_var, side='right') + 1
        if self.verbose:
            print("Fitting AJDC to data using {} components ".format(
                self.n_sources_))
        pca_filters = eigvecs[:, :self.n_sources_]
        pca_vals = eigvals[:self.n_sources_]

        # whitening, Eq.(8) in [2]
        whit_filters = pca_filters @ numpy.diag(1. / numpy.sqrt(pca_vals))
        whit_inv_filters = pca_filters @ numpy.diag(numpy.sqrt(pca_vals))

        # apply dimension reduction and whitening on cospectra
        cosp_rw = whit_filters.T @ self._cosp_channels @ whit_filters

        # approximate joint diagonalization, currently by Pham's algorithm [3]
        diag_filters, self._cosp_sources = ajd_pham(
            cosp_rw,
            n_iter_max=100,
            sample_weight=weights)

        # computation of forward and backward filters, Eq.(9) and (10) in [2]
        self.forward_filters_ = diag_filters @ whit_filters.T
        self.backward_filters_ = whit_inv_filters @ inv(diag_filters)
        return self
Example #12
    def _fit_full(self, X, n_components):
        """Fit the model by computing full SVD on X"""
        n_samples, n_features = X.shape

        if n_components == 'mle':
            if n_samples < n_features:
                raise ValueError("n_components='mle' is only supported "
                                 "if n_samples >= n_features")
        elif n_components == 'latent_root':
            pass
        elif not 0 <= n_components <= min(n_samples, n_features):
            raise ValueError("n_components=%r must be between 0 and "
                             "min(n_samples, n_features)=%r with "
                             "svd_solver='full'" %
                             (n_components, min(n_samples, n_features)))
        elif n_components >= 1:
            if not isinstance(n_components, numbers.Integral):
                raise ValueError("n_components=%r must be of type int "
                                 "when greater than or equal to 1, "
                                 "was of type=%r" %
                                 (n_components, type(n_components)))

        # Center data
        self.mean_ = np.mean(X, axis=0)
        X -= self.mean_

        U, S, V = linalg.svd(X, full_matrices=False)
        # flip eigenvectors' sign to enforce deterministic output
        U, V = svd_flip(U, V)

        components_ = V

        # Get variance explained by singular values
        explained_variance_ = (S**2) / (n_samples - 1)
        total_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / total_var
        singular_values_ = S.copy()  # Store the singular values.

        # Postprocess the number of components required
        if n_components == 'mle':
            n_components = \
                _infer_dimension_(explained_variance_, n_samples, n_features)
        elif n_components == 'latent_root':
            n_components = (explained_variance_ > 1).sum()
        elif 0 < n_components < 1.0:
            # number of components for which the cumulated explained
            # variance percentage is superior to the desired threshold
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            n_components = np.searchsorted(ratio_cumsum, n_components) + 1

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < min(n_features, n_samples):
            self.noise_variance_ = explained_variance_[n_components:].mean()

        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            explained_variance_ratio_[:n_components]
        self.singular_values_ = singular_values_[:n_components]

        return U, S, V
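The fractional n_components branch above (and its twins in Examples #7, #9, #18, #19 and #20) reduces to one idea: keep the smallest number of leading components whose cumulative explained-variance ratio reaches the requested fraction. A small illustration, assuming stable_cumsum from sklearn.utils.extmath:

import numpy as np
from sklearn.utils.extmath import stable_cumsum

explained_variance_ratio_ = np.array([0.6, 0.25, 0.1, 0.05])
threshold = 0.9

ratio_cumsum = stable_cumsum(explained_variance_ratio_)   # [0.6, 0.85, 0.95, 1.0]
n_components = np.searchsorted(ratio_cumsum, threshold) + 1
print(n_components)   # 3: the first three components explain at least 90%

# Example #19 passes side='right', which only changes the result when the
# threshold exactly equals one of the cumulative values.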
Example #13
def _k_init_metric(X,
                   n_clusters,
                   cdist_metric,
                   random_state,
                   n_local_trials=None):
    """Init n_clusters seeds according to k-means++ with a custom distance
    metric.

    Parameters
    ----------
    X : array, shape (n_samples, n_timestamps, n_features)
        The data to pick seeds for.

    n_clusters : integer
        The number of seeds to choose

    cdist_metric : function
        Function to be called for cross-distance computations

    random_state : RandomState instance
        Generator used to initialize the centers.

    n_local_trials : integer, optional
        The number of seeding trials for each center (except the first),
        of which the one reducing inertia the most is greedily chosen.
        Set to None to make the number of trials depend logarithmically
        on the number of seeds (2+log(k)); this is the default.

    Notes
    -----
    Selects initial cluster centers for k-means clustering in a smart way
    to speed up convergence. See: Arthur, D. and Vassilvitskii, S.
    "k-means++: the advantages of careful seeding". ACM-SIAM symposium
    on Discrete algorithms. 2007

    Version adapted from scikit-learn for use with a custom metric in place of
    Euclidean distance.
    """
    n_samples, n_timestamps, n_features = X.shape

    centers = numpy.empty((n_clusters, n_timestamps, n_features),
                          dtype=X.dtype)

    # Set the number of local seeding trials if none is given
    if n_local_trials is None:
        # This is what Arthur/Vassilvitskii tried, but did not report
        # specific results for other than mentioning in the conclusion
        # that it helped.
        n_local_trials = 2 + int(numpy.log(n_clusters))

    # Pick first center randomly
    center_id = random_state.randint(n_samples)
    centers[0] = X[center_id]

    # Initialize list of closest distances and calculate current potential
    closest_dist_sq = cdist_metric(centers[0, numpy.newaxis], X)**2
    current_pot = closest_dist_sq.sum()

    # Pick the remaining n_clusters-1 points
    for c in range(1, n_clusters):
        # Choose center candidates by sampling with probability proportional
        # to the squared distance to the closest existing center
        rand_vals = random_state.random_sample(n_local_trials) * current_pot
        candidate_ids = numpy.searchsorted(stable_cumsum(closest_dist_sq),
                                           rand_vals)
        # XXX: numerical imprecision can result in a candidate_id out of range
        numpy.clip(candidate_ids,
                   None,
                   closest_dist_sq.size - 1,
                   out=candidate_ids)

        # Compute distances to center candidates
        distance_to_candidates = cdist_metric(X[candidate_ids], X)**2

        # update closest distances squared and potential for each candidate
        numpy.minimum(closest_dist_sq,
                      distance_to_candidates,
                      out=distance_to_candidates)
        candidates_pot = distance_to_candidates.sum(axis=1)

        # Decide which candidate is the best
        best_candidate = numpy.argmin(candidates_pot)
        current_pot = candidates_pot[best_candidate]
        closest_dist_sq = distance_to_candidates[best_candidate]
        best_candidate = candidate_ids[best_candidate]

        # Permanently add best center candidate found in local tries
        centers[c] = X[best_candidate]

    return centers
Example #14
def _k_init(norm, X, n_clusters, random_state, n_local_trials=None):
    """Init n_clusters seeds according to k-means++

    Parameters
    ----------
    norm : `l1` or `l2`
        Manhattan or Euclidean distance.

    X : array or sparse matrix, shape (n_samples, n_features)
        The data to pick seeds for. To avoid memory copy, the input data
        should be double precision (dtype=numpy.float64).

    n_clusters : integer
        The number of seeds to choose

    random_state : int, RandomState instance
        The generator used to initialize the centers. Use an int to make the
        randomness deterministic.
        See :term:`Glossary <random_state>`.

    n_local_trials : integer, optional
        The number of seeding trials for each center (except the first),
        of which the one reducing inertia the most is greedily chosen.
        Set to None to make the number of trials depend logarithmically
        on the number of seeds (2+log(k)); this is the default.

    Notes
    -----
    Selects initial cluster centers for k-means clustering in a smart way
    to speed up convergence. See: Arthur, D. and Vassilvitskii, S.
    "k-means++: the advantages of careful seeding". ACM-SIAM symposium
    on Discrete algorithms. 2007

    Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip,
    which is the implementation used in the aforementioned paper.
    """
    n_samples, n_features = X.shape

    centers = numpy.empty((n_clusters, n_features), dtype=X.dtype)

    # Set the number of local seeding trials if none is given
    if n_local_trials is None:
        # This is what Arthur/Vassilvitskii tried, but did not report
        # specific results for other than mentioning in the conclusion
        # that it helped.
        n_local_trials = 2 + int(numpy.log(n_clusters))

    # Pick first center randomly
    center_id = random_state.randint(n_samples)
    if issparse(X):
        centers[0] = X[center_id].toarray()
    else:
        centers[0] = X[center_id]

    # Initialize list of closest distances and calculate current potential
    if norm.lower() == 'l2':
        dist_fct = lambda x, y: euclidean_distances(x, y, squared=True)
    elif norm.lower() == 'l1':
        dist_fct = lambda x, y: manhattan_distances(x, y)
    else:
        raise NotImplementedError(
            "norm must be 'l1' or 'l2' not '{}'.".format(norm))

    closest_dist_sq = dist_fct(centers[0, numpy.newaxis], X)
    current_pot = closest_dist_sq.sum()

    # Pick the remaining n_clusters-1 points
    for c in range(1, n_clusters):
        # Choose center candidates by sampling with probability proportional
        # to the squared distance to the closest existing center
        rand_vals = random_state.random_sample(n_local_trials) * current_pot
        candidate_ids = numpy.searchsorted(stable_cumsum(closest_dist_sq),
                                           rand_vals)
        numpy.clip(candidate_ids,
                   None,
                   closest_dist_sq.size - 1,
                   out=candidate_ids)

        # Compute distances to center candidates
        distance_to_candidates = dist_fct(X[candidate_ids], X)

        # update closest distances squared and potential for each candidate
        numpy.minimum(closest_dist_sq,
                      distance_to_candidates,
                      out=distance_to_candidates)
        candidates_pot = distance_to_candidates.sum(axis=1)

        # Decide which candidate is the best
        best_candidate = numpy.argmin(candidates_pot)
        current_pot = candidates_pot[best_candidate]
        closest_dist_sq = distance_to_candidates[best_candidate]
        best_candidate = candidate_ids[best_candidate]

        # Permanently add best center candidate found in local tries
        if issparse(X):
            centers[c] = X[best_candidate].toarray()
        else:
            centers[c] = X[best_candidate]

    return centers
Example #15
def uplift_curve(y_true, uplift, treatment):
    """Compute Uplift curve.

    For computing the area under the Uplift Curve, see :func:`.uplift_auc_score`.

    Args:
        y_true (1d array-like): Correct (true) target values.
        uplift (1d array-like): Predicted uplift, as returned by a model.
        treatment (1d array-like): Treatment labels.

    Returns:
        array (shape = [>2]), array (shape = [>2]): Points on a curve.

    See also:
        :func:`.uplift_auc_score`: Compute normalized Area Under the Uplift curve from prediction scores.

        :func:`.perfect_uplift_curve`: Compute the perfect Uplift curve.

        :func:`.plot_uplift_curve`: Plot Uplift curves from predictions.

        :func:`.qini_curve`: Compute Qini curve.

    References:
        Devriendt, F., Guns, T., & Verbeke, W. (2020). Learning to rank for uplift modeling. ArXiv, abs/2002.05897.
    """

    check_consistent_length(y_true, uplift, treatment)
    check_is_binary(treatment)
    y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(
        treatment)

    desc_score_indices = np.argsort(uplift, kind="mergesort")[::-1]
    y_true, uplift, treatment = y_true[desc_score_indices], uplift[
        desc_score_indices], treatment[desc_score_indices]

    y_true_ctrl, y_true_trmnt = y_true.copy(), y_true.copy()

    y_true_ctrl[treatment == 1] = 0
    y_true_trmnt[treatment == 0] = 0

    distinct_value_indices = np.where(np.diff(uplift))[0]
    threshold_indices = np.r_[distinct_value_indices, uplift.size - 1]

    num_trmnt = stable_cumsum(treatment)[threshold_indices]
    y_trmnt = stable_cumsum(y_true_trmnt)[threshold_indices]

    num_all = threshold_indices + 1

    num_ctrl = num_all - num_trmnt
    y_ctrl = stable_cumsum(y_true_ctrl)[threshold_indices]

    curve_values = (np.divide(
        y_trmnt, num_trmnt, out=np.zeros_like(y_trmnt),
        where=num_trmnt != 0) - np.divide(
            y_ctrl, num_ctrl, out=np.zeros_like(y_ctrl),
            where=num_ctrl != 0)) * num_all

    if num_all.size == 0 or curve_values[0] != 0 or num_all[0] != 0:
        # Add an extra threshold position if necessary
        # to make sure that the curve starts at (0, 0)
        num_all = np.r_[0, num_all]
        curve_values = np.r_[0, curve_values]

    return num_all, curve_values
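A hedged usage sketch with a toy dataset, assuming the function defined above is the one shipped by scikit-uplift as sklift.metrics.uplift_curve:

import numpy as np
from sklift.metrics import uplift_curve   # assumed import path (scikit-uplift)

y_true    = np.array([1, 0, 1, 0, 1, 0])
uplift    = np.array([0.9, 0.8, 0.7, 0.4, 0.3, 0.1])   # predicted uplift scores
treatment = np.array([1, 1, 0, 1, 0, 0])               # 1 = treated, 0 = control

num_all, curve_values = uplift_curve(y_true, uplift, treatment)
# num_all counts the samples above each score threshold; both arrays start
# at 0, so the curve can be plotted from the origin (0, 0).
print(num_all, curve_values)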
Example #16
def qini_curve(y_true, uplift, treatment):
    """Compute Qini curve.

    For computing the area under the Qini Curve, see :func:`.qini_auc_score`.

    Args:
        y_true (1d array-like): Correct (true) target values.
        uplift (1d array-like): Predicted uplift, as returned by a model.
        treatment (1d array-like): Treatment labels.

    Returns:
        array (shape = [>2]), array (shape = [>2]): Points on a curve.

    See also:
        :func:`.qini_auc_score`: Compute normalized Area Under the Qini curve from prediction scores.

        :func:`.perfect_qini_curve`: Compute the perfect Qini curve.

        :func:`.plot_qini_curves`: Plot Qini curves from predictions.

        :func:`.uplift_curve`: Compute Uplift curve.

    References:
        Nicholas J Radcliffe. (2007). Using control groups to target on predicted lift:
        Building and assessing uplift models. Direct Marketing Analytics Journal, (3):14–21, 2007.

        Devriendt, F., Guns, T., & Verbeke, W. (2020). Learning to rank for uplift modeling. ArXiv, abs/2002.05897.
    """

    check_consistent_length(y_true, uplift, treatment)
    check_is_binary(treatment)
    y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(
        treatment)

    desc_score_indices = np.argsort(uplift, kind="mergesort")[::-1]

    y_true = y_true[desc_score_indices]
    treatment = treatment[desc_score_indices]
    uplift = uplift[desc_score_indices]

    y_true_ctrl, y_true_trmnt = y_true.copy(), y_true.copy()

    y_true_ctrl[treatment == 1] = 0
    y_true_trmnt[treatment == 0] = 0

    distinct_value_indices = np.where(np.diff(uplift))[0]
    threshold_indices = np.r_[distinct_value_indices, uplift.size - 1]

    num_trmnt = stable_cumsum(treatment)[threshold_indices]
    y_trmnt = stable_cumsum(y_true_trmnt)[threshold_indices]

    num_all = threshold_indices + 1

    num_ctrl = num_all - num_trmnt
    y_ctrl = stable_cumsum(y_true_ctrl)[threshold_indices]

    curve_values = y_trmnt - y_ctrl * np.divide(
        num_trmnt, num_ctrl, out=np.zeros_like(num_trmnt), where=num_ctrl != 0)
    if num_all.size == 0 or curve_values[0] != 0 or num_all[0] != 0:
        # Add an extra threshold position if necessary
        # to make sure that the curve starts at (0, 0)
        num_all = np.r_[0, num_all]
        curve_values = np.r_[0, curve_values]

    return num_all, curve_values
Example #17
  def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy',
                          min_rate=None, beta=1.):
    """Decision threshold calibration for pairwise binary classification

    Method that calibrates the decision threshold (cutoff point) of the metric
    learner. This threshold will then be used when calling the method
    `predict`. The methods for picking cutoff points make use of traditional
    binary classification evaluation statistics such as the true positive and
    true negative rates and F-scores. The threshold will be found to maximize
    the chosen score on the validation set ``(pairs_valid, y_valid)``.

    See more in the :ref:`User Guide <calibration>`.

    Parameters
    ----------
    strategy : str, optional (default='accuracy')
      The strategy to use for choosing the cutoff threshold.

      'accuracy'
          Selects a decision threshold that maximizes the accuracy.
      'f_beta'
          Selects a decision threshold that maximizes the f_beta score,
          with beta given by the parameter `beta`.
      'max_tpr'
          Selects a decision threshold that yields the highest true positive
          rate with true negative rate at least equal to the value of the
          parameter `min_rate`.
      'max_tnr'
          Selects a decision threshold that yields the highest true negative
          rate with true positive rate at least equal to the value of the
          parameter `min_rate`.

    beta : float in [0, 1], optional (default=1.)
      Beta value to be used in case strategy == 'f_beta'.

    min_rate : float in [0, 1] or None, (default=None)
      In case strategy is 'max_tpr' or 'max_tnr' this parameter must be set
      to specify the minimal value for the true negative rate or true positive
      rate respectively that needs to be achieved.

    pairs_valid : array-like, shape=(n_pairs_valid, 2, n_features)
      The validation set of pairs to use to set the threshold.

    y_valid : array-like, shape=(n_pairs_valid,)
      The labels of the pairs of the validation set to use to set the
      threshold. They must be +1 for positive pairs and -1 for negative pairs.

    References
    ----------
    .. [1] Receiver-operating characteristic (ROC) plots: a fundamental
           evaluation tool in clinical medicine, MH Zweig, G Campbell -
           Clinical chemistry, 1993

    .. [2] most of the code of this function is from scikit-learn's PR #10117

    See Also
    --------
    sklearn.calibration : scikit-learn's module for calibrating classifiers
    """

    self._validate_calibration_params(strategy, min_rate, beta)

    pairs_valid, y_valid = self._prepare_inputs(pairs_valid, y_valid,
                                                type_of_inputs='tuples')

    n_samples = pairs_valid.shape[0]
    if strategy == 'accuracy':
      scores = self.decision_function(pairs_valid)
      scores_sorted_idces = np.argsort(scores)[::-1]
      scores_sorted = scores[scores_sorted_idces]
      # true labels ordered by decision_function value: (higher first)
      y_ordered = y_valid[scores_sorted_idces]
      # we need to add a threshold that will reject all points
      scores_sorted = np.concatenate([[scores_sorted[0] + 1], scores_sorted])

      # finds the threshold that maximizes the accuracy:
      cum_tp = stable_cumsum(y_ordered == 1)  # cumulative number of true
      # positives
      # we need to add the point where all samples are rejected:
      cum_tp = np.concatenate([[0.], cum_tp])
      cum_tn_inverted = stable_cumsum(y_ordered[::-1] == -1)
      cum_tn = np.concatenate([[0.], cum_tn_inverted])[::-1]
      cum_accuracy = (cum_tp + cum_tn) / n_samples
      imax = np.argmax(cum_accuracy)
      # we set the threshold to the lowest accepted score
      # note: we are working with negative distances but we want the threshold
      # to be with respect to the actual distances so we take minus sign
      self.threshold_ = - scores_sorted[imax]
      # note: if the best is to reject all points it's already one of the
      # thresholds (scores_sorted[0])
      return self

    if strategy == 'f_beta':
      precision, recall, thresholds = precision_recall_curve(
          y_valid, self.decision_function(pairs_valid), pos_label=1)

      # here the thresholds are decreasing
      # We ignore the warnings here, in the same taste as
      # https://github.com/scikit-learn/scikit-learn/blob/62d205980446a1abc1065
      # f4332fd74eee57fcf73/sklearn/metrics/classification.py#L1284
      with np.errstate(divide='ignore', invalid='ignore'):
        f_beta = ((1 + beta**2) * (precision * recall) /
                  (beta**2 * precision + recall))
      # We need to set nans to zero otherwise they will be considered higher
      # than the others (also discussed in https://github.com/scikit-learn/
      # scikit-learn/pull/10117/files#r262115773)
      f_beta[np.isnan(f_beta)] = 0.
      imax = np.argmax(f_beta)
      # we set the threshold to the lowest accepted score
      # note: we are working with negative distances but we want the threshold
      # to be with respect to the actual distances so we take minus sign
      self.threshold_ = - thresholds[imax]
      # Note: we don't need to deal with rejecting all points (i.e. threshold =
      # max_scores + 1), since this can never happen to be optimal
      # (see a more detailed discussion in test_calibrate_threshold_extreme)
      return self

    fpr, tpr, thresholds = roc_curve(y_valid,
                                     self.decision_function(pairs_valid),
                                     pos_label=1)
    # here the thresholds are decreasing

    if strategy in ['max_tpr', 'max_tnr']:
      if strategy == 'max_tpr':
        indices = np.where(1 - fpr >= min_rate)[0]
        imax = np.argmax(tpr[indices])

      if strategy == 'max_tnr':
        indices = np.where(tpr >= min_rate)[0]
        imax = np.argmax(1 - fpr[indices])

      imax_valid = indices[imax]
      # note: we are working with negative distances but we want the threshold
      # to be with respect to the actual distances so we take minus sign
      if indices[imax] == len(thresholds):  # we want to accept everything
        self.threshold_ = - (thresholds[imax_valid] - 1)
      else:
        # thanks to roc_curve, the first point will always be max_scores
        # + 1, see: https://github.com/scikit-learn/scikit-learn/pull/13523
        self.threshold_ = - thresholds[imax_valid]
      return self
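The 'accuracy' strategy above boils down to: for every cut "accept the top k pairs", accuracy = (true positives among the accepted + true negatives among the rejected) / n_samples, with both counts taken from cumulative sums. A hedged sketch of just that bookkeeping, using plain np.cumsum:

import numpy as np

# labels ordered by decreasing decision score: +1 = positive pair, -1 = negative
y_ordered = np.array([1, 1, -1, 1, -1, -1])
n = len(y_ordered)

# k = 0 .. n accepted pairs (the leading 0 is the "reject everything" cut)
cum_tp = np.concatenate([[0.], np.cumsum(y_ordered == 1)])
cum_tn = np.concatenate([[0.], np.cumsum(y_ordered[::-1] == -1)])[::-1]
accuracy = (cum_tp + cum_tn) / n

best_k = np.argmax(accuracy)
print(best_k, accuracy[best_k])   # accept the top 2 pairs here: accuracy 5/6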
Example #18
    def _fit(self, X, base_point=None, point_type='vector'):
        """Fit the model by computing full SVD on X"""
        if point_type == 'matrix':
            raise NotImplementedError(
                'This is currently only implemented for vectors.')
        if base_point is None:
            base_point = self.metric.mean(X)

        tangent_vecs = self.metric.log(X, base_point=base_point)

        # Convert to sklearn format
        X = tangent_vecs

        X = check_array(X, dtype=[np.float64, np.float32], ensure_2d=True,
                        copy=self.copy)

        # Handle n_components==None
        if self.n_components is None:
            n_components = min(X.shape)
        else:
            n_components = self.n_components
        n_samples, n_features = X.shape

        if n_components == 'mle':
            if n_samples < n_features:
                raise ValueError("n_components='mle' is only supported "
                                 "if n_samples >= n_features")
        elif not 0 <= n_components <= min(n_samples, n_features):
            raise ValueError("n_components=%r must be between 0 and "
                             "min(n_samples, n_features)=%r with "
                             "svd_solver='full'"
                             % (n_components, min(n_samples, n_features)))
        elif n_components >= 1:
            if not isinstance(n_components, (numbers.Integral, np.integer)):
                raise ValueError("n_components=%r must be of type int "
                                 "when greater than or equal to 1, "
                                 "was of type=%r"
                                 % (n_components, type(n_components)))

        # Center data
        self.mean_ = np.mean(X, axis=0)
        X -= self.mean_

        U, S, V = linalg.svd(X, full_matrices=False)
        # flip eigenvectors' sign to enforce deterministic output
        U, V = svd_flip(U, V)

        components_ = V

        # Get variance explained by singular values
        explained_variance_ = (S ** 2) / (n_samples - 1)
        total_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / total_var
        singular_values_ = S.copy()  # Store the singular values.

        # Postprocess the number of components required
        if n_components == 'mle':
            n_components = \
                _infer_dimension_(explained_variance_, n_samples, n_features)
        elif 0 < n_components < 1.0:
            # number of components for which the cumulated explained
            # variance percentage is superior to the desired threshold
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            n_components = np.searchsorted(ratio_cumsum, n_components) + 1

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < min(n_features, n_samples):
            self.noise_variance_ = explained_variance_[n_components:].mean()
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            explained_variance_ratio_[:n_components]
        self.singular_values_ = singular_values_[:n_components]

        return U, S, V
Example #19
    def _fit_full_daal4py(self, X, n_components):
        n_samples, n_features = X.shape
        n_sf_min = min(n_samples, n_features)

        if n_components == 'mle':
            daal_n_components = n_features
        elif n_components < 1:
            daal_n_components = n_sf_min
        else:
            daal_n_components = n_components

        fpType = getFPType(X)

        covariance_algo = daal4py.covariance(
            fptype=fpType, outputMatrixType='covarianceMatrix')
        covariance_res = covariance_algo.compute(X)

        self.mean_ = covariance_res.mean.ravel()
        covariance = covariance_res.covariance
        variances_ = np.array([covariance[i, i] for i in range(n_features)])

        pca_alg = daal4py.pca(fptype=fpType,
                              method='correlationDense',
                              resultsToCompute='eigenvalue',
                              isDeterministic=True,
                              nComponents=daal_n_components)
        pca_res = pca_alg.compute(X, covariance)

        components_ = pca_res.eigenvectors
        explained_variance_ = np.maximum(pca_res.eigenvalues.ravel(), 0)
        tot_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / tot_var

        if n_components == 'mle':
            if sklearn_check_version('0.23'):
                n_components = _infer_dimension(explained_variance_, n_samples)
            else:
                n_components = _infer_dimension_(explained_variance_,
                                                 n_samples, n_features)
        elif 0 < n_components < 1.0:
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            n_components = np.searchsorted(
                ratio_cumsum, n_components, side='right') + 1

        if n_components < n_sf_min:
            if explained_variance_.shape[0] == n_sf_min:
                self.noise_variance_ = explained_variance_[n_components:].mean()
            else:
                resid_var_ = variances_.sum()
                resid_var_ -= explained_variance_[:n_components].sum()
                self.noise_variance_ = resid_var_ / (n_sf_min - n_components)
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            explained_variance_ratio_[:n_components]
        self.singular_values_ = np.sqrt(
            (n_samples - 1) * self.explained_variance_)
Example #20
    def _fit(self, X, base_point=None):
        """Fit the model by computing full SVD on X.

        Parameters
        ----------
        X : array-like, shape=[..., n_features]
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : Ignored (Compliance with scikit-learn interface)
        base_point : array-like, shape=[..., n_features]
            Point at which to perform the tangent PCA.
            Optional, default to Frechet mean if None.

        Returns
        -------
        U, S, V : array-like
            Matrices of the SVD decomposition
        """
        if base_point is None:
            mean = FrechetMean(metric=self.metric, point_type=self.point_type)
            mean.fit(X)
            base_point = mean.estimate_

        tangent_vecs = self.metric.log(X, base_point=base_point)

        if self.point_type == "matrix":
            if Matrices.is_symmetric(tangent_vecs).all():
                X = SymmetricMatrices.to_vector(tangent_vecs)
            else:
                X = gs.reshape(tangent_vecs, (len(X), -1))
        else:
            X = tangent_vecs

        if self.n_components is None:
            n_components = min(X.shape)
        else:
            n_components = self.n_components
        n_samples, n_features = X.shape

        if n_components == "mle":
            if n_samples < n_features:
                raise ValueError("n_components='mle' is only supported "
                                 "if n_samples >= n_features")
        elif not 0 <= n_components <= min(n_samples, n_features):
            raise ValueError("n_components=%r must be between 0 and "
                             "min(n_samples, n_features)=%r with "
                             "svd_solver='full'" %
                             (n_components, min(n_samples, n_features)))
        elif n_components >= 1:
            if not isinstance(n_components, numbers.Integral):
                raise ValueError("n_components=%r must be of type int "
                                 "when greater than or equal to 1, "
                                 "was of type=%r" %
                                 (n_components, type(n_components)))

        # Center data - the mean should be 0 if base_point is the Frechet mean
        self.mean_ = gs.mean(X, axis=0)
        X -= self.mean_

        U, S, V = gs.linalg.svd(X, full_matrices=False)
        # flip eigenvectors' sign to enforce deterministic output
        U, V = svd_flip(U, V)

        components_ = V

        # Get variance explained by singular values
        explained_variance_ = (S**2) / (n_samples - 1)
        total_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / total_var
        singular_values_ = gs.copy(S)  # Store the singular values.

        # Postprocess the number of components required
        if n_components == "mle":
            n_components = _infer_dimension_(explained_variance_, n_samples,
                                             n_features)
        elif 0 < n_components < 1.0:
            # number of components for which the cumulated explained
            # variance percentage is superior to the desired threshold
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            n_components = gs.searchsorted(ratio_cumsum, n_components) + 1

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < min(n_features, n_samples):
            self.noise_variance_ = explained_variance_[n_components:].mean()
        else:
            self.noise_variance_ = 0.0

        self.base_point_fit = base_point
        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = int(n_components)
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]
        self.singular_values_ = singular_values_[:n_components]

        return U, S, V
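
Once the data has been mapped to the tangent space, the bookkeeping after the SVD in this example is ordinary PCA. Below is a minimal numpy-only sketch of that part; no geomstats is needed, and the array sizes and n_components are chosen purely for illustration.

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(20, 4)            # stand-in for the flattened tangent vectors
X = X - X.mean(axis=0)         # center, as in the snippet above

U, S, Vt = np.linalg.svd(X, full_matrices=False)
n_samples = X.shape[0]

explained_variance_ = S ** 2 / (n_samples - 1)
explained_variance_ratio_ = explained_variance_ / explained_variance_.sum()

# Probabilistic-PCA noise estimate: mean variance of the discarded directions
# (cf. eq. 12.46 referenced in the snippet above).
n_components = 2
noise_variance_ = explained_variance_[n_components:].mean()
print(explained_variance_ratio_[:n_components], noise_variance_)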
Example #21
  def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy',
                          min_rate=None, beta=1.):
    """Decision threshold calibration for pairwise binary classification

    Method that calibrates the decision threshold (cutoff point) of the metric
    learner. This threshold will then be used when calling the method
    `predict`. The methods for picking cutoff points make use of traditional
    binary classification evaluation statistics such as the true positive and
    true negative rates and F-scores. The threshold will be found to maximize
    the chosen score on the validation set ``(pairs_valid, y_valid)``.

    See more in the :ref:`User Guide <calibration>`.

    Parameters
    ----------
    strategy : str, optional (default='accuracy')
      The strategy to use for choosing the cutoff threshold.

      'accuracy'
          Selects a decision threshold that maximizes the accuracy.
      'f_beta'
          Selects a decision threshold that maximizes the f_beta score,
          with beta given by the parameter `beta`.
      'max_tpr'
          Selects a decision threshold that yields the highest true positive
          rate with true negative rate at least equal to the value of the
          parameter `min_rate`.
      'max_tnr'
          Selects a decision threshold that yields the highest true negative
          rate with true positive rate at least equal to the value of the
          parameter `min_rate`.

    beta : float, optional (default=1.)
      Beta value to be used in case strategy == 'f_beta'.

    min_rate : float in [0, 1] or None, optional (default=None)
      In case strategy is 'max_tpr' or 'max_tnr', this parameter must be set
      to specify the minimal value for the true negative rate or true positive
      rate, respectively, that needs to be achieved.

    pairs_valid : array-like, shape=(n_pairs_valid, 2, n_features)
      The validation set of pairs to use to set the threshold.

    y_valid : array-like, shape=(n_pairs_valid,)
      The labels of the pairs of the validation set to use to set the
      threshold. They must be +1 for positive pairs and -1 for negative pairs.

    References
    ----------
    .. [1] Receiver-operating characteristic (ROC) plots: a fundamental
           evaluation tool in clinical medicine, MH Zweig, G Campbell -
           Clinical chemistry, 1993

    .. [2] Most of the code of this function is from scikit-learn's PR #10117

    See Also
    --------
    sklearn.calibration : scikit-learn's module for calibrating classifiers
    """

    self._validate_calibration_params(strategy, min_rate, beta)

    pairs_valid, y_valid = self._prepare_inputs(pairs_valid, y_valid,
                                                type_of_inputs='tuples')

    n_samples = pairs_valid.shape[0]
    if strategy == 'accuracy':
      scores = self.decision_function(pairs_valid)
      scores_sorted_idces = np.argsort(scores)[::-1]
      scores_sorted = scores[scores_sorted_idces]
      # true labels ordered by decision_function value: (higher first)
      y_ordered = y_valid[scores_sorted_idces]
      # we need to add a threshold that will reject all points
      scores_sorted = np.concatenate([[scores_sorted[0] + 1], scores_sorted])

      # finds the threshold that maximizes the accuracy:
      # cumulative number of true positives
      cum_tp = stable_cumsum(y_ordered == 1)
      # we need to add the point where all samples are rejected:
      cum_tp = np.concatenate([[0.], cum_tp])
      cum_tn_inverted = stable_cumsum(y_ordered[::-1] == -1)
      cum_tn = np.concatenate([[0.], cum_tn_inverted])[::-1]
      cum_accuracy = (cum_tp + cum_tn) / n_samples
      imax = np.argmax(cum_accuracy)
      # we set the threshold to the lowest accepted score
      # note: we are working with negative distances but we want the threshold
      # to be with respect to the actual distances so we take minus sign
      self.threshold_ = - scores_sorted[imax]
      # note: if the best is to reject all points it's already one of the
      # thresholds (scores_sorted[0])
      return self

    if strategy == 'f_beta':
      precision, recall, thresholds = precision_recall_curve(
          y_valid, self.decision_function(pairs_valid), pos_label=1)

      # here the thresholds are decreasing
      # We ignore the warnings here, in the same taste as
      # https://github.com/scikit-learn/scikit-learn/blob/62d205980446a1abc1065
      # f4332fd74eee57fcf73/sklearn/metrics/classification.py#L1284
      with np.errstate(divide='ignore', invalid='ignore'):
        f_beta = ((1 + beta**2) * (precision * recall) /
                  (beta**2 * precision + recall))
      # We need to set nans to zero otherwise they will be considered higher
      # than the others (also discussed in https://github.com/scikit-learn/
      # scikit-learn/pull/10117/files#r262115773)
      f_beta[np.isnan(f_beta)] = 0.
      imax = np.argmax(f_beta)
      # we set the threshold to the lowest accepted score
      # note: we are working with negative distances but we want the threshold
      # to be with respect to the actual distances so we take minus sign
      self.threshold_ = - thresholds[imax]
      # Note: we don't need to deal with rejecting all points (i.e. threshold =
      # max_scores + 1), since this can never happen to be optimal
      # (see a more detailed discussion in test_calibrate_threshold_extreme)
      return self

    fpr, tpr, thresholds = roc_curve(y_valid,
                                     self.decision_function(pairs_valid),
                                     pos_label=1)
    # here the thresholds are decreasing

    if strategy in ['max_tpr', 'max_tnr']:
      if strategy == 'max_tpr':
        indices = np.where(1 - fpr >= min_rate)[0]
        imax = np.argmax(tpr[indices])

      if strategy == 'max_tnr':
        indices = np.where(tpr >= min_rate)[0]
        imax = np.argmax(1 - fpr[indices])

      imax_valid = indices[imax]
      # note: we are working with negative distances but we want the threshold
      # to be with respect to the actual distances so we take minus sign
      if indices[imax] == len(thresholds) - 1:  # we want to accept everything
        self.threshold_ = - (thresholds[imax_valid] - 1)
      else:
        # thanks to roc_curve, the first point will always be max_scores
        # + 1, see: https://github.com/scikit-learn/scikit-learn/pull/13523
        self.threshold_ = - thresholds[imax_valid]
      return self
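
The 'accuracy' branch of calibrate_threshold is the part that relies on stable_cumsum: sort the pairs by decreasing score, count accepted true positives and rejected true negatives cumulatively, and keep the cut with the highest accuracy. The toy sketch below mirrors that logic with np.cumsum standing in for stable_cumsum; the scores and labels are made up.

import numpy as np

scores = np.array([-0.2, -1.5, -0.7, -3.0, -0.1])  # negative distances (higher = more similar)
y = np.array([1, -1, 1, -1, 1])                    # +1 similar pair, -1 dissimilar pair

order = np.argsort(scores)[::-1]
scores_sorted = scores[order]
y_ordered = y[order]
# prepend a threshold that rejects every pair
scores_sorted = np.concatenate([[scores_sorted[0] + 1], scores_sorted])

cum_tp = np.concatenate([[0.], np.cumsum(y_ordered == 1)])               # accepted positives
cum_tn = np.concatenate([[0.], np.cumsum(y_ordered[::-1] == -1)])[::-1]  # rejected negatives
accuracy = (cum_tp + cum_tn) / len(y)

threshold = -scores_sorted[np.argmax(accuracy)]  # back to distance space
print(threshold, accuracy.max())                 # 0.7 1.0 on these made-up pairs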
Example #22
def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None):
    """Calculate true and false positives per binary classification threshold.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True targets of binary classification

    y_score : array, shape = [n_samples]
        Estimated probabilities or decision function

    pos_label : int or str, default=None
        The label of the positive class

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    Returns
    -------
    fps : array, shape = [n_thresholds]
        A count of false positives, at index i being the number of negative
        samples assigned a score >= thresholds[i]. The total number of
        negative samples is equal to fps[-1] (thus true negatives are given by
        fps[-1] - fps).

    tps : array, shape = [n_thresholds <= len(np.unique(y_score))]
        An increasing count of true positives, at index i being the number
        of positive samples assigned a score >= thresholds[i]. The total
        number of positive samples is equal to tps[-1] (thus false negatives
        are given by tps[-1] - tps).

    thresholds : array, shape = [n_thresholds]
        Decreasing score values.
    """
    check_consistent_length(y_true, y_score)
    y_true = column_or_1d(y_true)
    y_score = column_or_1d(y_score)
    assert_all_finite(y_true)
    assert_all_finite(y_score)

    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)

    # ensure binary classification if pos_label is not specified
    classes = np.unique(y_true)
    if (pos_label is None and
        not (np.array_equal(classes, [0, 1]) or
             np.array_equal(classes, [-1, 1]) or
             np.array_equal(classes, [0]) or
             np.array_equal(classes, [-1]) or
             np.array_equal(classes, [1]))):
        raise ValueError("Data is not binary and pos_label is not specified")
    elif pos_label is None:
        pos_label = 1.

    # make y_true a boolean vector
    y_true = (y_true == pos_label)

    # sort scores and corresponding truth values
    desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
    y_score = y_score[desc_score_indices]
    y_true = y_true[desc_score_indices]
    if sample_weight is not None:
        weight = sample_weight[desc_score_indices]
    else:
        weight = 1.

    # y_score typically has many tied values. Here we extract
    # the indices associated with the distinct values. We also
    # concatenate a value for the end of the curve.
    distinct_value_indices = np.where(np.diff(y_score))[0]
    threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]

    # accumulate the true positives with decreasing threshold
    tps = stable_cumsum(y_true * weight)[threshold_idxs]
    if sample_weight is not None:
        fps = stable_cumsum(weight)[threshold_idxs] - tps
    else:
        fps = 1 + threshold_idxs - tps
    return fps, tps, y_score[threshold_idxs]
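
The unweighted path of this function is easy to check by hand on a handful of scores. The sketch below reproduces it with np.cumsum in place of stable_cumsum, on made-up labels and scores:

import numpy as np

y_true = np.array([1, 1, 0, 1, 0])
y_score = np.array([0.9, 0.8, 0.8, 0.3, 0.1])

desc = np.argsort(y_score, kind="mergesort")[::-1]
y_score = y_score[desc]
y_true = y_true[desc] == 1

# last index of each distinct score, plus the end of the array
distinct_value_indices = np.where(np.diff(y_score))[0]
threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]

tps = np.cumsum(y_true)[threshold_idxs]  # positives scored >= threshold
fps = 1 + threshold_idxs - tps           # negatives scored >= threshold
print(fps, tps, y_score[threshold_idxs])
# [0 1 1 2] [1 2 3 3] [0.9 0.8 0.3 0.1]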