Example #1
def test_ll():
    """Tests that the log-likelihood for generalized linear models is correctly
    calculated."""

    # identity
    y_true = np.array([1, 2, 3])
    y_pred = np.array([np.e + 1, np.e + 2, np.e + 3])
    ll = log_likelihood_glm('normal', y_true, y_pred)
    assert_almost_equal(ll, -4.5)

    # poisson
    y_true = np.array([1 / np.log(2.), 1 / np.log(3.), 1 / np.log(4.)])
    y_pred = np.array([2., 3., 4.])
    ll = log_likelihood_glm('poisson', y_true, y_pred)
    assert_almost_equal(ll, -2)

    # poisson with all zeros
    y_true = np.zeros(3)
    y_pred = np.zeros(3)
    ll = log_likelihood_glm('poisson', y_true, y_pred)
    assert_equal(ll, 0.)

    # poisson with all zeros, but predicted is not all zeros
    y_pred = np.zeros(3)
    y_true = np.array([0., 0., 1.])
    ll = log_likelihood_glm('poisson', y_true, y_pred)
    assert_equal(ll, -np.inf)
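
The asserted values above pin down the formulas being tested. The sketch below reproduces them; it is a reconstruction from the test, not necessarily the exact code of log_likelihood_glm:

import numpy as np
from scipy.special import xlogy


def ll_sketch(model, y_true, y_pred):
    # Hypothetical helper reconstructing the behaviour asserted above.
    if model == 'normal':
        # Gaussian log-likelihood profiled over the noise variance:
        # for the test data RSS / n = e**2, so -3/2 * (1 + 2) = -4.5.
        rss = np.sum((y_true - y_pred) ** 2)
        n = y_true.size
        return -n / 2 * (1 + np.log(rss / n))
    elif model == 'poisson':
        # Mean Poisson log-likelihood without the log(y!) term:
        # xlogy(0, 0) = 0 covers the all-zeros case, and a zero prediction
        # for a nonzero response gives -inf, matching the last two asserts.
        return np.mean(xlogy(y_true, y_pred) - y_pred)
    raise ValueError(model + ' is not a supported model.')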
Example #2
def empirical_bayes(X, y, y_pred, ssq_hat, beta):

    n, p = X.shape
    beta = beta.ravel()
    y = y.ravel()
    support = np.array(beta).astype(bool)

    # Paper provides closed form expression
    # Using the conditional marginal likelihood criterion
    q = np.count_nonzero(beta)

    ll = log_likelihood_glm('normal', y, y_pred)

    if q > 0:
        support = (beta != 0).astype(bool)
        Tgamma = (beta[support].T @ X[:, support].T @ X[:, support]
                  @ beta[support] / ssq_hat)

        R = -2 * (xlogy(p - q, p - q) + xlogy(q, q))

        if np.divide(Tgamma, q) > 1:

            B = q + q * np.log(Tgamma) - xlogy(q, q)

            CCML = -2 * ll + B + R

        else:
            B = Tgamma
            CCML = -2 * ll + Tgamma + R

        return CCML, B, R
    else:
        # Do not give the opportunity to select a support with 0 coefficients
        return np.inf, 0, 0
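
A hypothetical call pattern for empirical_bayes (the data, the Lasso fit, and the ssq_hat estimate below are illustrative; the function above, together with numpy, scipy's xlogy, and log_likelihood_glm, is assumed to be in scope):

import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.default_rng(0)
X = rng.standard_normal((100, 10))
beta_true = np.zeros(10)
beta_true[:3] = [1.5, -2.0, 0.5]
y = X @ beta_true + 0.1 * rng.standard_normal(100)

lasso = Lasso(alpha=0.05).fit(X, y)
y_pred = lasso.predict(X)
# crude noise-variance estimate from the residuals
ssq_hat = np.sum((y - y_pred) ** 2) / y.size

CCML, B, R = empirical_bayes(X, y, y_pred, ssq_hat, lasso.coef_)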
Example #3
def GIC(y, y_pred, model_size, penalty):

    y = y.ravel()
    y_pred = y_pred.ravel()

    ll = log_likelihood_glm('normal', y, y_pred)
    return -2 * ll + penalty * model_size
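
For reference, the penalty argument is what generalizes the criterion: penalty = 2 gives an AIC-style score and penalty = np.log(n_samples) a BIC-style score (a usage sketch assuming y, y_pred, and model_size are already defined):

import numpy as np

n_samples = y.size
aic_like = GIC(y, y_pred, model_size, penalty=2)
bic_like = GIC(y, y_pred, model_size, penalty=np.log(n_samples))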
Example #4
    def score_predictions(self, metric, fitter, X, y, support):
        """Score, according to some metric, predictions provided by a model.

        The resulting score will be negated if an information criterion is
        specified.

        Parameters
        ----------
        metric : string
            The type of score to run on the prediction. Valid options include
            'r2' (explained variance), 'BIC' (Bayesian information criterion),
            'AIC' (Akaike information criterion), and 'AICc' (corrected AIC).

        fitter : Poisson object
            The Poisson object that has been fit to the data with the
            respective hyperparameters.

        X : nd-array
            The design matrix.

        y : nd-array
            The response vector.

        support : array-like
            The indices of the non-zero features.

        Returns
        -------
        score : float
            The score.
        """
        # for Poisson, use predict_mean to calculate the "predicted" values
        y_pred = fitter.predict_mean(X[:, support])
        # calculate the log-likelihood
        ll = utils.log_likelihood_glm(model='poisson', y_true=y, y_pred=y_pred)
        if metric == 'log':
            score = ll
        # information criteria
        else:
            n_features = np.count_nonzero(support)
            if fitter.intercept_ != 0:
                n_features += 1
            n_samples = y.size
            if metric == 'BIC':
                score = utils.BIC(ll, n_features, n_samples)
            elif metric == 'AIC':
                score = utils.AIC(ll, n_features)
            elif metric == 'AICc':
                score = utils.AICc(ll, n_features, n_samples)
            else:
                raise ValueError(metric + ' is not a valid metric.')
            # negate the score since lower information criterion is
            # preferable
            score = -score

        return score
Example #5
def test_LinearRegressor_scoring_defaults():
    """Tests that the correct default train/test data are being used
    for scoring estimates in UoIAbstractLinearRegressor. Further
    tests that the scoring itself is being done correctly."""
    seed = 5

    X, y = make_regression(n_samples=100,
                           n_features=10,
                           n_informative=10,
                           random_state=seed)

    train_idxs, test_idxs = train_test_split(np.arange(X.shape[0]),
                                             test_size=0.1,
                                             random_state=seed)
    X_train = X[train_idxs]
    y_train = y[train_idxs]

    X_test = X[test_idxs]
    y_test = y[test_idxs]

    fitter = LinearRegression().fit(X_train, y_train)
    support = np.ones(X.shape[1]).astype(bool)
    # r2 - must use test data
    uoi = UoI_Lasso(estimation_score='r2')
    assert (uoi._estimation_target == 1)

    score = uoi._score_predictions('r2', fitter, X, y, support,
                                   (train_idxs, test_idxs))
    assert_equal(r2_score(y_test, fitter.predict(X_test)), score)

    ll = log_likelihood_glm('normal', y_train,
                            fitter.predict(X_train[:, support]))
    # BIC - must use train data
    uoi = UoI_Lasso(estimation_score='BIC')
    assert (uoi._estimation_target == 0)
    score = -1 * uoi._score_predictions('BIC', fitter, X, y, support,
                                        (train_idxs, test_idxs))
    assert_equal(BIC(ll, *X_train.T.shape), score)

    # AIC - must use train data
    uoi = UoI_Lasso(estimation_score='AIC')
    assert (uoi._estimation_target == 0)

    score = -1 * uoi._score_predictions('AIC', fitter, X, y, support,
                                        (train_idxs, test_idxs))
    assert_equal(AIC(ll, X_train.shape[1]), score)

    # AICc - must use train data
    uoi = UoI_Lasso(estimation_score='AICc')
    assert (uoi._estimation_target == 0)

    score = -1 * uoi._score_predictions('AICc', fitter, X, y, support,
                                        (train_idxs, test_idxs))
    assert_equal(AICc(ll, *X_train.T.shape), score)
Example #6
    def score_predictions(metric, fitter, X, y, support):
        """Score, according to some metric, predictions provided by a model.

        The resulting score will be negated if an information criterion is
        specified.

        Parameters
        ----------
        metric : string
            The type of score to run on the prediction. Valid options include
            'r2' (explained variance), 'BIC' (Bayesian information criterion),
            'AIC' (Akaike information criterion), and 'AICc' (corrected AIC).

        fitter : object
            The fitted model used to generate the predictions; must contain
            a .predict method.

        X : array-like
            The design matrix.

        y : array-like
            The true response variables.

        support : array-like
            The value of the support for the model that was used to generate
            the predictions.

        Returns
        -------
        score : float
            The score.
        """
        y_pred = fitter.predict(X[:, support])
        if metric == 'r2':
            score = r2_score(y, y_pred)
        else:
            ll = utils.log_likelihood_glm(model='normal',
                                          y_true=y,
                                          y_pred=y_pred)
            n_features = np.count_nonzero(support)
            n_samples = y.size
            if metric == 'BIC':
                score = utils.BIC(ll, n_features, n_samples)
            elif metric == 'AIC':
                score = utils.AIC(ll, n_features)
            elif metric == 'AICc':
                score = utils.AICc(ll, n_features, n_samples)
            else:
                raise ValueError(metric + ' is not a valid option.')
            # negate the score since lower information criterion is preferable
            score = -score
        return score
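
A minimal usage sketch, following the same pattern as the test in Example #5; the data and the fit are illustrative, and score_predictions is assumed to be reachable as a plain function or staticmethod:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

X, y = make_regression(n_samples=100, n_features=10, random_state=0)
fitter = LinearRegression().fit(X, y)
support = np.ones(X.shape[1], dtype=bool)

r2 = score_predictions('r2', fitter, X, y, support)
neg_bic = score_predictions('BIC', fitter, X, y, support)  # negated BIC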
Example #7
def full_bayes_factor(y, y_pred, n_features, model_size, sparsity_prior,
                      penalty):

    y = y.ravel()
    y_pred = y_pred.ravel()

    n_samples = y.size

    # Log likelihood
    ll = log_likelihood_glm('normal', y, y_pred)

    # Regularization Penalty (prior)
    p1 = 2 * penalty * model_size

    # Normal BIC penalty
    BIC = model_size * np.log(n_samples)

    # Second order Bayes factor approximation
    RSS = np.sum((y - y_pred)**2)
    BIC2 = n_samples**3 / (2 * RSS * 3)

    # Term arising from normalization
    BIC3 = model_size * np.log(2 * np.pi)

    # If provided with a list of sparsity estimates, we are specifying
    # a beta hyperprior, and need to integrate over it correspondingly
    if not np.isscalar(sparsity_prior):
        M_k = beta_binomial_model(sparsity_prior, n_features, model_size)
    else:
        if sparsity_prior == 1:
            sparsity_prior = 0.999

        # Model probability prior
        M_k = (scipy.special.binom(n_features, model_size) *
               sparsity_prior**model_size *
               (1 - sparsity_prior)**(n_features - model_size))

    # If the model probability evaluates to 0, set it to a very small but finite value to
    # avoid blowups in the log
    if M_k == 0:
        M_k = 1e-9

    P_M = 2 * np.log(M_k)

    #    bayes_factor = 2 * ll - BIC - BIC2 + BIC3 - p1 + P_M

    return ll, p1, BIC, BIC2, BIC3, M_k, P_M
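
The terms returned above combine into the approximate Bayes factor exactly as the commented-out line indicates (a sketch; the input variables are assumed to be defined by the caller):

ll, p1, BIC, BIC2, BIC3, M_k, P_M = full_bayes_factor(
    y, y_pred, n_features, model_size, sparsity_prior, penalty)
bayes_factor = 2 * ll - BIC - BIC2 + BIC3 - p1 + P_M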
Example #8
    def _score_predictions(self, metric, fitter, X, y, support, boot_idxs):
        """Score, according to some metric, predictions provided by a model.

        The resulting score will be negated if an information criterion is
        specified.

        Parameters
        ----------
        metric : string
            The type of score to run on the prediction. Valid options include
            'r2' (explained variance), 'BIC' (Bayesian information criterion),
            'AIC' (Akaike information criterion), and 'AICc' (corrected AIC).
        fitter : object
            Must contain .predict and .predict_proba methods.
        X : array-like
            The design matrix.
        y : array-like
            Response vector.
        support : array-like
            The value of the support for the model.
        boot_idxs : 2-tuple of array-like objects
            Tuple of (train_idxs, test_idxs) generated from a bootstrap
            sample. If this is specified, then the appropriate set of
            data will be used for evaluating scores: test data for r^2,
            and training data for information criteria

        Returns
        -------
        score : float
            The score.
        """

        # Select the data relevant for the estimation_score
        X = X[boot_idxs[self._estimation_target]]
        y = y[boot_idxs[self._estimation_target]]

        if y.ndim == 2:
            if y.shape[1] > 1:
                raise ValueError('y should either have shape ' +
                                 '(n_samples, ) or (n_samples, 1).')
            y = np.squeeze(y)
        elif y.ndim > 2:
            raise ValueError('y should either have shape ' +
                             '(n_samples, ) or (n_samples, 1).')

        y_pred = fitter.predict(X[:, support])
        if y.shape != y_pred.shape:
            raise ValueError('Targets and predictions are not the same shape.')

        if metric == 'r2':
            score = r2_score(y, y_pred)
        else:
            ll = utils.log_likelihood_glm(model='normal',
                                          y_true=y,
                                          y_pred=y_pred)
            n_features = np.count_nonzero(support)
            n_samples = X.shape[0]
            if metric == 'BIC':
                score = utils.BIC(ll, n_features, n_samples)
            elif metric == 'AIC':
                score = utils.AIC(ll, n_features)
            elif metric == 'AICc':
                score = utils.AICc(ll, n_features, n_samples)
            else:
                raise ValueError(metric + ' is not a valid option.')
            # negate the score since lower information criterion is preferable
            score = -score
        return score
Example #9
    def _score_predictions(self, metric, fitter, X, y, support, boot_idxs=None):
        """Score, according to some metric, predictions provided by a model.

        The resulting score will be negated if an information criterion is
        specified.

        Parameters
        ----------
        metric : string
            The type of score to run on the prediction. Valid options include
            'r2' (explained variance), 'BIC' (Bayesian information criterion),
            'AIC' (Akaike information criterion), and 'AICc' (corrected AIC).
        fitter : Poisson object
            The Poisson object that has been fit to the data with the
            respective hyperparameters.
        X : ndarray, shape (n_samples, n_features)
            The design matrix.
        y : ndarray, shape (n_samples,)
            The response vector.
        support : ndarray
            The indices of the non-zero features.
        boot_idxs : 2-tuple of array-like objects
            Tuple of (train_idxs, test_idxs) generated from a bootstrap
            sample. If this is specified, then the appropriate set of
            data will be used for evaluating scores: test data for r^2,
            and training data for information criteria

        Returns
        -------
        score : float
            The score.
        """

        # Select the train data
        if boot_idxs is not None:
            X = X[boot_idxs[self._estimation_target]]
            y = y[boot_idxs[self._estimation_target]]

        # for Poisson, use predict_mean to calculate the "predicted" values
        y_pred = fitter.predict_mean(X[:, support])
        # calculate the log-likelihood
        ll = utils.log_likelihood_glm(model='poisson', y_true=y, y_pred=y_pred)
        if metric == 'log':
            score = ll
        # information criteria
        else:
            n_features = np.count_nonzero(support)
            if fitter.intercept_ != 0:
                n_features += 1
            n_samples = X.shape[0]
            if metric == 'BIC':
                score = utils.BIC(n_samples * ll, n_features, n_samples)
            elif metric == 'AIC':
                score = utils.AIC(n_samples * ll, n_features)
            elif metric == 'AICc':
                score = utils.AICc(n_samples * ll, n_features, n_samples)
            else:
                raise ValueError(metric + ' is not a valid metric.')
            # negate the score since lower information criterion is preferable
            score = -score

        return score
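
Note the difference from Example #4: here the Poisson log-likelihood returned by log_likelihood_glm, which appears to be a per-sample average (see the Poisson test in Example #1, where the result is a mean over samples), is multiplied by n_samples before being passed to BIC, AIC, and AICc, so the criteria are computed from the total log-likelihood rather than the averaged one.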